Pandas Tutorial¶

import pandas as pd

# Print the installed pandas version to confirm the import worked.
print(pd.__version__)

0.24.2

# Notebook-only command (not valid in a plain .py script); the output below
# indicates pandas was already installed when this ran.
pip install pandas

Requirement already satisfied: pandas in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (0.24.2)
Requirement already satisfied: numpy>=1.12.0 in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (from pandas) (1.16.2)
Requirement already satisfied: python-dateutil>=2.5.0 in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (from pandas) (2.8.0)
Requirement already satisfied: pytz>=2011k in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (from pandas) (2018.9)
Requirement already satisfied: six>=1.5 in c:\users\v-shdash\appdata\local\continuum\anaconda3\lib\site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)
Note: you may need to restart the kernel to use updated packages.

Series create, manipulate, query, delete¶

# Build a Series from a plain Python list; the index defaults to 0..4 here.
arr = list(range(5))
s1 = pd.Series(arr)
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64

# Same values as before, but with an explicit index of 1..5.
order = list(range(1, 6))
s2 = pd.Series(arr, index=order)
s2

1    0
2    1
3    2
4    3
5    4
dtype: int64

import numpy as np

# Five random draws from np.random.randn, labelled 'a'..'e'.
n = np.random.randn(5)
index = list('abcde')
s2 = pd.Series(n, index=index)
s2

a   -1.103802
b    1.078865
c    0.124427
d    0.391288
e    0.300528
dtype: float64

# Build a Series from a dictionary: keys become the index, values the data.
d = {
    'a': 1,
    'b': 2,
    'c': 3,
    'd': 4,
    'e': 5,
}
s3 = pd.Series(d)
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

# Relabel the Series in place by assigning a new list of labels to .index.
# NOTE: the first bare `s1` is not the last statement of the cell, so the
# output shown below is only the relabelled Series.
s1
s1.index=['a','b','c','d','e']
s1

a    0
b    1
c    2
d    3
e    4
dtype: int64

# Positional slice: the rows at positions 1 and 2 (end position 3 is excluded).
s1.iloc[1:3]

b    1
c    2
dtype: int64

# Put s3 after s1 in one Series. Series.append was removed in pandas 2.0;
# pd.concat produces the same result and keeps the duplicate labels a..e.
s4 = pd.concat([s1, s3])
s4

a    0
b    1
c    2
d    3
e    4
a    1
b    2
c    3
d    4
e    5
dtype: int64

# drop() returns a new Series without label 'e'; the label occurs twice in s4
# and both entries are removed.
s4 = s4.drop(labels='e')
s4

a    0
b    1
c    2
d    3
a    1
b    2
c    3
d    4
dtype: int64

Series Operations¶

# Two Series of different lengths (8 ints and 4 floats) used by the
# arithmetic examples.
arr1 = list(range(8))
arr2 = [6, 9, 10, 3.3]
s5 = pd.Series(arr1)
s6 = pd.Series(arr2)
print(s5, s6, sep='\n')

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
dtype: int64
0     6.0
1     9.0
2    10.0
3     3.3
dtype: float64

# Index-aligned addition (operator form of s5.add(s6)); labels 4-7 exist only
# in s5, so those positions come out as NaN.
s5 + s6

0     6.0
1    10.0
2    12.0
3     6.3
4     NaN
5     NaN
6     NaN
7     NaN
dtype: float64

# Index-aligned subtraction (operator form of s5.sub(s6)); labels missing
# from s6 yield NaN.
s5 - s6

0   -6.0
1   -8.0
2   -8.0
3   -0.3
4    NaN
5    NaN
6    NaN
7    NaN
dtype: float64

# Index-aligned multiplication (operator form of s5.mul(s6)); labels missing
# from s6 yield NaN.
s5 * s6

0     0.0
1     9.0
2    20.0
3     9.9
4     NaN
5     NaN
6     NaN
7     NaN
dtype: float64

# Index-aligned division, kept in s7 (operator form of s5.div(s6)).
s7 = s5 / s6

# Median, maximum and minimum of s6, one value per line.
print(s6.median(), s6.max(), s6.min(), sep='\n')

7.5
10.0
3.3

# Median, maximum and minimum of the quotient s7, one value per line; the
# NaN entries are left out of the statistics.
print(s7.median(), s7.max(), s7.min(), sep='\n')

0.15555555555555556
0.9090909090909092
0.0

Create Dataframe¶

# 6x4 frame of random normals: six consecutive dates starting today as the
# index, columns A-D.
dates = pd.date_range('today', periods=6)
num_arr = np.random.randn(6, 4)
columns = list('ABCD')

df1 = pd.DataFrame(data=num_arr, index=dates, columns=columns)
df1

# Build a DataFrame from a dictionary of equal-length lists: each key becomes
# a column. np.nan marks the two unknown ages.
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
# Row labels used as the index.
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

df2 = pd.DataFrame(data, index=labels)
df2

# Per-column dtypes.
df2.dtypes

animal       object
age         float64
visits        int64
priority     object
dtype: object

# First three rows, kept in df3.
df3 = df2.head(n=3)
df3

# Last row only.
df2.tail(n=1)

# Row labels, column labels and the underlying values array, one per line.
print(df2.index, df2.columns, df2.values, sep='\n')

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
Index(['animal', 'age', 'visits', 'priority'], dtype='object')
[['cat' 2.5 1 'yes']
 ['cat' 3.0 3 'yes']
 ['snake' 0.5 2 'no']
 ['dog' nan 3 'yes']
 ['dog' 5.0 2 'no']
 ['cat' 2.0 3 'no']
 ['snake' 4.5 1 'no']
 ['cat' nan 1 'yes']
 ['dog' 7.0 2 'no']
 ['dog' 3.0 1 'no']]

# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df2.describe()

# Transposed frame: rows become columns and vice versa.
df2.transpose()

# Rows ordered by the 'age' column.
df2.sort_values('age')

# Slicing DataFrames
# Sort by age, then take the first three rows by position.
df2.sort_values(by='age').iloc[0:3]

# Columns 'age' and 'visits', rows at positions 1 and 2.
df2[['age', 'visits']].iloc[1:3]

# All columns, rows at positions 1 and 2.
df2.iloc[1:3, :]

# Work on a copy so the edit below does not touch df2.
df3 = df2.copy()
df3

# Boolean mask of missing values (isna is the same method as isnull).
df3.isna()

# Set the age in row 'f' to 1.5.
df3.at['f', 'age'] = 1.5
df3

# Mean of the age column, returned as a one-element Series.
df3.loc[:, ['age']].mean()

age    3.375
dtype: float64

# Column means. numeric_only=True restricts the result to the numeric columns
# (age, visits); without it pandas 2.0+ raises TypeError on the string
# columns 'animal' and 'priority'.
df3.mean(numeric_only=True)

age       3.375
visits    1.900
dtype: float64

# Largest age, returned as a one-element Series.
df3.loc[:, ['age']].max()

age    7.0
dtype: float64

# Vectorised string methods live under the .str accessor; the NaN entry is
# passed through as NaN.
string = pd.Series(['A', 'C', 'D', 'Aaa', 'BaCa', np.nan, 'CBA', 'cow', 'owl'])
print(string.str.lower(), string.str.upper(), sep='\n')

0       a
1       c
2       d
3     aaa
4    baca
5     NaN
6     cba
7     cow
8     owl
dtype: object
0       A
1       C
2       D
3     AAA
4    BACA
5     NaN
6     CBA
7     COW
8     OWL
dtype: object

Operations for DataFrame missing values¶

# Replace the missing ages with the mean age. fillna() returns a new object
# rather than changing df4, so the result is assigned back; only the 'age'
# column is targeted so the age mean cannot end up in any other column.
df4 = df3.copy()
meanAge = df4['age'].mean()
df4['age'] = df4['age'].fillna(meanAge)
df4

# Drop every row that has at least one missing value. dropna() returns a new
# frame and leaves df3 unchanged, so the result is what gets stored in df5.
df5 = df3.dropna(how='any')
df5

Dataframe file operations¶

# Round-trip through CSV. to_csv writes the row labels as the first column,
# so read that column back as the index instead of getting an extra
# "Unnamed: 0" column.
df3.to_csv('animal.csv')

df_animal = pd.read_csv('animal.csv', index_col=0)
df_animal

# Same round-trip through Excel (requires an Excel engine such as openpyxl).
df3.to_excel('animal.xlsx', sheet_name='Sheet1')
df_animal2 = pd.read_excel('animal.xlsx', sheet_name='Sheet1', index_col=0, na_values=['NA'])
df_animal2

Visualization in Pandas¶

import numpy as np
# IPython magic (notebook only, not plain Python): show matplotlib figures
# inline below the cell.
%matplotlib inline
# 50 random values indexed by 50 consecutive dates starting today.
ts=pd.Series(np.random.randn(50),index=pd.date_range('today',periods=50))
# Cumulative sum turns the independent draws into a random walk.
ts=ts.cumsum()
ts.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x21f31c6c8d0>

# Four cumulative random series sharing ts's date index, plotted together.
df = pd.DataFrame(
    np.random.randn(50, 4),
    index=ts.index,
    columns=list('ABXY'),
).cumsum()
df.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x21f32015320>

Practice example: remove consecutive repeated values using pandas

# Keep only the first row of each run of consecutive equal values in column A:
# a row is kept when its value differs from the value in the row above.
df = pd.DataFrame({'A': [1, 2, 2, 2, 4, 4, 5, 5, 6, 6, 7, 8, 8]})
starts_run = df['A'].ne(df['A'].shift())
df.loc[starts_run]

	A	B	C	D
2019-09-16 21:15:24.260457	0.762413	-0.318652	-1.372874	-1.823678
2019-09-17 21:15:24.260457	0.984545	1.217833	-0.281376	-0.496665
2019-09-18 21:15:24.260457	-0.784411	-0.845514	0.047044	0.469878
2019-09-19 21:15:24.260457	-0.158247	-0.780051	-0.093783	0.330817
2019-09-20 21:15:24.260457	0.582693	-1.578394	0.542424	-1.186354
2019-09-21 21:15:24.260457	0.209068	0.751279	-0.280143	-0.598862

	age	visits
count	8.000000	10.000000
mean	3.437500	1.900000
std	2.007797	0.875595
min	0.500000	1.000000
25%	2.375000	1.000000
50%	3.000000	2.000000
75%	4.625000	2.750000
max	7.000000	3.000000

	animal	age	visits	priority
a	False	False	False	False
b	False	False	False	False
c	False	False	False	False
d	False	True	False	False
e	False	False	False	False
f	False	False	False	False
g	False	False	False	False
h	False	True	False	False
i	False	False	False	False
j	False	False	False	False

Python

Wednesday, September 18, 2019

Pandas basic tutorial

Pandas Tutorial¶

Series create, manipulate, query, delete¶

Series Operations¶

Create Dataframe¶

Operations for DataFrame missing values¶

Dataframe file operations¶

Visualization in Pandas¶

No comments:

Post a Comment

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	yes
e	dog	5.0	2	no
f	cat	2.0	3	no
g	snake	4.5	1	no
h	cat	NaN	1	yes
i	dog	7.0	2	no
j	dog	3.0	1	no

	animal	age	visits	priority
a	cat	2.500	1	yes
b	cat	3.000	3	yes
c	snake	0.500	2	no
d	dog	3.375	3	yes
e	dog	5.000	2	no
f	cat	1.500	3	no
g	snake	4.500	1	no
h	cat	3.375	1	yes
i	dog	7.000	2	no
j	dog	3.000	1	no