Wednesday, September 18, 2019

Pandas basic tutorial

Exercise

Pandas Tutorial

In [1]:
import pandas as pd
In [4]:
# Check which pandas version is installed
print(pd.__version__)
0.24.2
In [3]:
pip install pandas
Requirement already satisfied: pandas in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (0.24.2)
Requirement already satisfied: numpy>=1.12.0 in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (from pandas) (1.16.2)
Requirement already satisfied: python-dateutil>=2.5.0 in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (from pandas) (2.8.0)
Requirement already satisfied: pytz>=2011k in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (from pandas) (2018.9)
Requirement already satisfied: six>=1.5 in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)
Note: you may need to restart the kernel to use updated packages.

Series: create, manipulate, query, delete

In [5]:
# Build a Series from a plain Python list; pandas supplies the default 0..n-1 index
arr = list(range(5))
s1 = pd.Series(data=arr)
s1
Out[5]:
0    0
1    1
2    2
3    3
4    4
dtype: int64
In [8]:
# Same values as before, but labelled with an explicit 1-based index
order = list(range(1, 6))
s2 = pd.Series(data=arr, index=order)
s2
Out[8]:
1    0
2    1
3    2
4    3
5    4
dtype: int64
In [10]:
import numpy as np

# Five draws from the standard normal distribution, labelled 'a'..'e'
n = np.random.randn(5)
index = list('abcde')
s2 = pd.Series(data=n, index=index)
s2
Out[10]:
a   -1.103802
b    1.078865
c    0.124427
d    0.391288
e    0.300528
dtype: float64
In [11]:
# Create a Series from a dictionary: the keys become the index, the values the data
d = dict(zip('abcde', range(1, 6)))
s3 = pd.Series(d)
s3
Out[11]:
a    1
b    2
c    3
d    4
e    5
dtype: int64
In [13]:
# Relabel s1 in place: the default integer index is replaced by 'a'..'e'
s1.index = list('abcde')
s1
Out[13]:
a    0
b    1
c    2
d    3
e    4
dtype: int64
In [25]:
# Positional slice: the elements at positions 1 and 2 (the end bound is exclusive)
s1.iloc[1:3]
Out[25]:
b    1
c    2
dtype: int64
In [27]:
# Stack s3 underneath s1. Series.append was deprecated in pandas 1.4 and removed
# in 2.0; pd.concat produces the same result on old and new versions alike.
# Index labels are kept as they are, so 'a'..'e' each appear twice in s4.
s4 = pd.concat([s1, s3])
s4
Out[27]:
a    0
b    1
c    2
d    3
e    4
a    1
b    2
c    3
d    4
e    5
dtype: int64
In [29]:
# Remove every entry labelled 'e' (the label occurs twice, so two rows go)
s4 = s4.drop(labels='e')
s4
Out[29]:
a    0
b    1
c    2
d    3
a    1
b    2
c    3
d    4
dtype: int64

Series Operations

In [32]:
# Two series of different lengths, used below to show how arithmetic aligns on the index
arr1 = list(range(8))
arr2 = [6, 9, 10, 3.3]
s5 = pd.Series(data=arr1)
s6 = pd.Series(data=arr2)
print(s5, s6, sep='\n')
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
dtype: int64
0     6.0
1     9.0
2    10.0
3     3.3
dtype: float64
In [33]:
# Element-wise sum, aligned on the index; labels 4..7 exist only in s5, so they come out as NaN
s5 + s6
Out[33]:
0     6.0
1    10.0
2    12.0
3     6.3
4     NaN
5     NaN
6     NaN
7     NaN
dtype: float64
In [34]:
# Element-wise difference, aligned on the index (NaN where s6 has no matching label)
s5 - s6
Out[34]:
0   -6.0
1   -8.0
2   -8.0
3   -0.3
4    NaN
5    NaN
6    NaN
7    NaN
dtype: float64
In [35]:
# Element-wise product, aligned on the index (NaN where s6 has no matching label)
s5 * s6
Out[35]:
0     0.0
1     9.0
2    20.0
3     9.9
4     NaN
5     NaN
6     NaN
7     NaN
dtype: float64
In [39]:
# Element-wise quotient, aligned on the index; kept in s7 for the statistics below
s7 = s5 / s6
In [37]:
# Median, maximum and minimum of s6, one value per line
print(s6.median(), s6.max(), s6.min(), sep='\n')
7.5
10.0
3.3
In [40]:
# Median, maximum and minimum of s7, one value per line (NaN entries are skipped)
print(s7.median(), s7.max(), s7.min(), sep='\n')
0.15555555555555556
0.9090909090909092
0.0

Create DataFrame

In [45]:
# Six consecutive daily timestamps starting from the current moment
dates = pd.date_range(start='today', periods=6)
columns = list('ABCD')
# One row of standard-normal draws per date, one column per label
num_arr = np.random.randn(len(dates), len(columns))

df1 = pd.DataFrame(data=num_arr, index=dates, columns=columns)
df1
Out[45]:
A B C D
2019-09-16 21:15:24.260457 0.762413 -0.318652 -1.372874 -1.823678
2019-09-17 21:15:24.260457 0.984545 1.217833 -0.281376 -0.496665
2019-09-18 21:15:24.260457 -0.784411 -0.845514 0.047044 0.469878
2019-09-19 21:15:24.260457 -0.158247 -0.780051 -0.093783 0.330817
2019-09-20 21:15:24.260457 0.582693 -1.578394 0.542424 -1.186354
2019-09-21 21:15:24.260457 0.209068 0.751279 -0.280143 -0.598862
In [47]:
# DataFrame from a dict of equal-length lists: each key becomes a column,
# and the row labels 'a'..'j' are supplied separately as the index
lables = list('abcdefghij')
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}

df2 = pd.DataFrame(data=data, index=lables)
df2
Out[47]:
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
In [49]:
# Data type of each column; the text columns are stored as object
df2.dtypes
Out[49]:
animal       object
age         float64
visits        int64
priority     object
dtype: object
In [52]:
# Keep the first three rows
df3=df2.head(3)
df3
Out[52]:
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
In [54]:
# Last row only
df2.tail(1)
Out[54]:
animal age visits priority
j dog 3.0 1 no
In [57]:
# Row labels, column labels and the underlying values array, one after another
print(df2.index, df2.columns, df2.values, sep='\n')
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
Index(['animal', 'age', 'visits', 'priority'], dtype='object')
[['cat' 2.5 1 'yes']
 ['cat' 3.0 3 'yes']
 ['snake' 0.5 2 'no']
 ['dog' nan 3 'yes']
 ['dog' 5.0 2 'no']
 ['cat' 2.0 3 'no']
 ['snake' 4.5 1 'no']
 ['cat' nan 1 'yes']
 ['dog' 7.0 2 'no']
 ['dog' 3.0 1 'no']]
In [59]:
# Summary statistics; only the numeric columns (age, visits) are summarised,
# and the count for age is 8 because its two NaN entries are left out
df2.describe()
Out[59]:
age visits
count 8.000000 10.000000
mean 3.437500 1.900000
std 2.007797 0.875595
min 0.500000 1.000000
25% 2.375000 1.000000
50% 3.000000 2.000000
75% 4.625000 2.750000
max 7.000000 3.000000
In [60]:
# Swap rows and columns
df2.transpose()
Out[60]:
a b c d e f g h i j
animal cat cat snake dog dog cat snake cat dog dog
age 2.5 3 0.5 NaN 5 2 4.5 NaN 7 3
visits 1 3 2 3 2 3 1 1 2 1
priority yes yes no yes no no no yes no no
In [61]:
# Order the rows by ascending age; rows with a missing age are placed last
df2.sort_values('age')
Out[61]:
animal age visits priority
c snake 0.5 2 no
f cat 2.0 3 no
a cat 2.5 1 yes
b cat 3.0 3 yes
j dog 3.0 1 no
g snake 4.5 1 no
e dog 5.0 2 no
i dog 7.0 2 no
d dog NaN 3 yes
h cat NaN 1 yes
In [66]:
# Slicing DataFrames: the three rows with the lowest age
df2.sort_values('age').iloc[:3]
Out[66]:
animal age visits priority
c snake 0.5 2 no
f cat 2.0 3 no
a cat 2.5 1 yes
In [68]:
# Two columns, rows at positions 1 and 2
df2.loc[:, ['age', 'visits']].iloc[1:3]
Out[68]:
age visits
b 3.0 3
c 0.5 2
In [69]:
# Rows at positions 1 and 2 (the end of the slice is exclusive), all columns
df2.iloc[1:3]
Out[69]:
animal age visits priority
b cat 3.0 3 yes
c snake 0.5 2 no
In [70]:
# Independent copy of df2, so the edits below leave the original untouched
df3 = df2.copy(deep=True)
df3
Out[70]:
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
In [72]:
# Boolean mask with True wherever a value is missing
df3.isna()
Out[72]:
animal age visits priority
a False False False False
b False False False False
c False False False False
d False True False False
e False False False False
f False False False False
g False False False False
h False True False False
i False False False False
j False False False False
In [73]:
# Overwrite a single cell: the age in row 'f'
df3.at['f', 'age'] = 1.5
df3
Out[73]:
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 1.5 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
In [75]:
# Mean age; the double brackets select a one-column DataFrame,
# so the result is a Series keyed by column name rather than a bare number
df3[['age']].mean()
Out[75]:
age    3.375
dtype: float64
In [77]:
# Column means of the numeric columns only. Older pandas dropped the text
# columns ('animal', 'priority') silently; pandas 2.0+ raises TypeError on them
# unless numeric_only=True is given, so state the intent explicitly.
df3.mean(numeric_only=True)
Out[77]:
age       3.375
visits    1.900
dtype: float64
In [80]:
# Maximum age, returned as a Series keyed by column name
df3[['age']].max()
Out[80]:
age    7.0
dtype: float64
In [85]:
# Vectorised string methods through the .str accessor; the missing entry stays NaN
string = pd.Series(
    ['A', 'C', 'D', 'Aaa', 'BaCa', np.nan, 'CBA', 'cow', 'owl']
)
print(string.str.lower(), string.str.upper(), sep='\n')
0       a
1       c
2       d
3     aaa
4    baca
5     NaN
6     cba
7     cow
8     owl
dtype: object
0       A
1       C
2       D
3     AAA
4    BACA
5     NaN
6     CBA
7     COW
8     OWL
dtype: object

Operations for DataFrame missing values

In [88]:
# Replace missing ages with the mean age, working on a copy so df3 is untouched.
# fillna returns a new object, so the result has to be assigned back; before,
# it was only displayed and df4 kept its NaN values. The fill is also limited
# to the 'age' column so a mean age can never leak into another column's gaps.
df4 = df3.copy()
meanAge = df4['age'].mean()
df4['age'] = df4['age'].fillna(meanAge)
df4
Out[88]:
animal age visits priority
a cat 2.500 1 yes
b cat 3.000 3 yes
c snake 0.500 2 no
d dog 3.375 3 yes
e dog 5.000 2 no
f cat 1.500 3 no
g snake 4.500 1 no
h cat 3.375 1 yes
i dog 7.000 2 no
j dog 3.000 1 no
In [89]:
# Drop every row that has at least one missing value and keep the result in df5.
# dropna returns a new frame and leaves df3 alone, so no defensive copy is
# needed; before, the result was displayed but discarded and df5 kept its NaN rows.
df5 = df3.dropna(how='any')
df5
Out[89]:
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
e dog 5.0 2 no
f cat 1.5 3 no
g snake 4.5 1 no
i dog 7.0 2 no
j dog 3.0 1 no

DataFrame file operations

In [90]:
# Write df3 to animal.csv in the working directory; the row labels go out as the first, unnamed column
df3.to_csv('animal.csv')
In [91]:
# Read the file back. The saved row labels return as an ordinary column named
# 'Unnamed: 0' and a fresh 0..9 index is created; pass index_col=0 to read_csv
# to restore them as the index instead.
df_animal=pd.read_csv('animal.csv')
df_animal
Out[91]:
Unnamed: 0 animal age visits priority
0 a cat 2.5 1 yes
1 b cat 3.0 3 yes
2 c snake 0.5 2 no
3 d dog NaN 3 yes
4 e dog 5.0 2 no
5 f cat 1.5 3 no
6 g snake 4.5 1 no
7 h cat NaN 1 yes
8 i dog 7.0 2 no
9 j dog 3.0 1 no
In [101]:
# Round trip through an Excel workbook: write df3 to sheet 'Sheet1', then read
# that sheet back, treating the text 'NA' as a missing value
df3.to_excel('animal.xlsx', sheet_name='Sheet1')
df_animal2 = pd.read_excel(
    'animal.xlsx',
    sheet_name='Sheet1',
    index_col=None,
    na_values=['NA'],
)
df_animal2
Out[101]:
Unnamed: 0 animal age visits priority
0 a cat 2.5 1 yes
1 b cat 3.0 3 yes
2 c snake 0.5 2 no
3 d dog NaN 3 yes
4 e dog 5.0 2 no
5 f cat 1.5 3 no
6 g snake 4.5 1 no
7 h cat NaN 1 yes
8 i dog 7.0 2 no
9 j dog 3.0 1 no

Visualization in Pandas

In [112]:
import numpy as np
# Render matplotlib figures inline, directly below the cell
%matplotlib inline
# 50 standard-normal draws on a daily index that starts today
ts=pd.Series(np.random.randn(50),index=pd.date_range('today',periods=50))
# The running total turns the independent draws into a random walk
ts=ts.cumsum()
ts.plot()
Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x21f31c6c8d0>
In [115]:
# Four random walks (columns A, B, X, Y) on the same date index as ts,
# drawn as one line per column
df = pd.DataFrame(
    data=np.random.randn(50, 4),
    index=ts.index,
    columns=list('ABXY'),
).cumsum()
df.plot()
Out[115]:
<matplotlib.axes._subplots.AxesSubplot at 0x21f32015320>
Practice example: remove consecutive repeated values using pandas
In [117]:
# Keep a row only when its value differs from the row directly above it.
# The first row is always kept, because its shifted neighbour is NaN.
df = pd.DataFrame({'A': [1, 2, 2, 2, 4, 4, 5, 5, 6, 6, 7, 8, 8]})
df.loc[df['A'].ne(df['A'].shift())]
Out[117]:
A
0 1
1 2
4 4
6 5
8 6
10 7
11 8
In [ ]:
 

No comments:

Post a Comment