Pandas Tutorial¶
In [1]:
import pandas as pd
In [4]:
# Print the version string of the pandas build imported as `pd`.
print(pd.__version__)
In [3]:
pip install pandas
Series: create, manipulate, query, delete
In [5]:
# Build a Series from a plain Python list; with no index given, pandas
# labels the entries 0..4.
arr = list(range(5))
s1 = pd.Series(data=arr)
s1
Out[5]:
In [8]:
# Same data as before, but labelled 1..5 instead of the default 0..4.
order = list(range(1, 6))
s2 = pd.Series(data=arr, index=order)
s2
Out[8]:
In [10]:
# numpy is imported in this cell; later cells use `np` as well.
import numpy as np

# Five standard-normal draws labelled 'a'..'e'. A seeded Generator replaces
# the unseeded np.random.randn so the values are identical on every re-run.
n = np.random.default_rng(0).standard_normal(5)
index = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(n, index=index)
s2
Out[10]:
In [11]:
# A dict maps directly onto a Series: keys become the index labels and
# values become the data.
d = dict(zip('abcde', range(1, 6)))
s3 = pd.Series(data=d)
s3
Out[11]:
In [13]:
# Relabel s1 in place: assigning to .index swaps the labels and leaves the
# values untouched. (A bare `s1` placed before the assignment displayed
# nothing, because only a cell's last expression is rendered, so it is gone.)
s1.index = ['a', 'b', 'c', 'd', 'e']
s1
Out[13]:
In [25]:
# Positional slice: entries at positions 1 and 2 (the end bound is exclusive).
s1.iloc[1:3]
Out[25]:
In [27]:
# Stack s1 and s3 end to end. Series.append was removed in pandas 2.0, so
# pd.concat is used instead; index labels are kept as they are, not renumbered.
s4 = pd.concat([s1, s3])
s4
Out[27]:
In [29]:
# drop() returns a new Series without the entries labelled 'e'; rebind s4 to it.
s4 = s4.drop(labels='e')
s4
Out[29]:
Series Operations¶
In [32]:
# Two Series of different lengths (8 ints vs. 4 numbers), used by the
# arithmetic cells that follow.
arr1 = list(range(8))
arr2 = [6, 9, 10, 3.3]
s5, s6 = pd.Series(arr1), pd.Series(arr2)
print(s5, s6, sep='\n')
In [33]:
# Element-wise sum aligned on the index; labels present in only one Series give NaN.
s5 + s6
Out[33]:
In [34]:
# Element-wise difference aligned on the index; unmatched labels give NaN.
s5 - s6
Out[34]:
In [35]:
# Element-wise product aligned on the index; unmatched labels give NaN.
s5 * s6
Out[35]:
In [39]:
# Element-wise quotient aligned on the index, kept as s7 for the statistics below.
s7 = s5 / s6
In [37]:
# Median, maximum and minimum of s6, one per line.
print(s6.median(), s6.max(), s6.min(), sep='\n')
In [40]:
# Median, maximum and minimum of s7, one per line (NaN entries are skipped
# by these reductions by default).
print(s7.median(), s7.max(), s7.min(), sep='\n')
Create Dataframe¶
In [45]:
# 6x4 frame of standard-normal values indexed by six consecutive daily
# timestamps starting at the current time. The numbers come from a seeded
# Generator (instead of the unseeded np.random.randn) so that re-running the
# notebook reproduces the same table.
dates = pd.date_range('today', periods=6)
num_arr = np.random.default_rng(0).standard_normal((6, 4))
columns = ['A', 'B', 'C', 'D']
df1 = pd.DataFrame(num_arr, index=dates, columns=columns)
df1
Out[45]:
In [47]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a
# column. np.nan marks the two unknown ages. Rows are labelled 'a'..'j'.
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']  # was misspelled `lables`
df2 = pd.DataFrame(data, index=labels)
df2
Out[47]:
In [49]:
# Show the dtype pandas inferred for each column of df2.
df2.dtypes
Out[49]:
In [52]:
# Keep the first three rows of df2 under the name df3 and display them.
# NOTE: df3 is rebound to a full copy of df2 in a later cell.
df3=df2.head(3)
df3
Out[52]:
In [54]:
# Show only the last row of df2.
df2.tail(1)
Out[54]:
In [57]:
# The three building blocks of a DataFrame: row labels, column labels and
# the underlying values array, printed one after another.
print(df2.index, df2.columns, df2.values, sep='\n')
In [59]:
# Summary statistics (count, mean, std, quartiles, ...); with default
# arguments describe() reports only the numeric columns of a mixed frame.
df2.describe()
Out[59]:
In [60]:
# Swap rows and columns of df2.
df2.transpose()
Out[60]:
In [61]:
# Rows ordered by ascending age; with the defaults, missing ages sort last.
df2.sort_values('age')
Out[61]:
In [66]:
# Slicing a DataFrame: sort by age, then take the first three rows by position.
df2.sort_values(by="age").iloc[:3]
Out[66]:
In [68]:
# Columns 'age' and 'visits' only, rows at positions 1 and 2.
df2.loc[:, ['age', 'visits']].iloc[1:3]
Out[68]:
In [69]:
# Rows at positions 1 and 2, all columns (the end of an iloc slice is exclusive).
df2.iloc[1:3]
Out[69]:
In [70]:
# Work on an independent copy so later edits to df3 leave df2 untouched.
df3=df2.copy()
df3
Out[70]:
In [72]:
# Boolean mask of the same shape as df3: True wherever a value is missing.
df3.isna()
Out[72]:
In [73]:
# Overwrite a single cell: the age in row 'f' becomes 1.5.
df3.at['f', 'age'] = 1.5
df3
Out[73]:
In [75]:
# Mean age. Selecting with a list keeps a DataFrame, so the result is a
# one-entry Series keyed by 'age'; missing ages are skipped by default.
df3[['age']].mean()
Out[75]:
In [77]:
# Column means of df3. numeric_only=True restricts the reduction to the
# numeric columns; without it pandas >= 2.0 raises TypeError on the string
# columns 'animal' and 'priority' instead of silently dropping them.
df3.mean(numeric_only=True)
Out[77]:
In [80]:
# Largest age, returned as a one-entry Series keyed by 'age'.
df3[['age']].max()
Out[80]:
In [85]:
# Vectorised string methods live under the .str accessor; the missing
# entry (np.nan) passes through both conversions unchanged.
string = pd.Series(
    ['A', 'C', 'D', 'Aaa', 'BaCa', np.nan, 'CBA', 'cow', 'owl']
)
print(string.str.lower(), string.str.upper(), sep='\n')
Handling missing values in a DataFrame
In [88]:
# Replace missing ages with the mean age. The fill value is keyed by column
# so the age mean can only ever land in 'age'; a bare scalar would be written
# into the gaps of every column, including the string ones.
# fillna returns a new frame for display; df4 itself is left unchanged.
df4 = df3.copy()
meanAge = df4['age'].mean()
df4.fillna({'age': meanAge})
Out[88]:
In [89]:
# Alternative to filling: discard every row that has at least one missing
# value. dropna returns a new frame for display; df5 itself is not modified.
df5=df3.copy()
df5.dropna(how='any')
Out[89]:
Dataframe file operations¶
In [90]:
# Write df3 to animal.csv in the current working directory; the row labels
# are written as the first, unnamed column.
df3.to_csv('animal.csv')
In [91]:
# Read the CSV back. index_col=0 restores the saved row labels as the index;
# without it they come back as an extra data column named 'Unnamed: 0'.
df_animal = pd.read_csv('animal.csv', index_col=0)
df_animal
Out[91]:
In [101]:
# Round-trip df3 through an Excel workbook. sheet_name is passed by keyword
# for clarity, and index_col=0 restores the saved row labels as the index
# rather than loading them as an extra 'Unnamed: 0' column. Cells containing
# the text 'NA' are read as missing values.
df3.to_excel('animal.xlsx', sheet_name='Sheet1')
df_animal2 = pd.read_excel(
    'animal.xlsx',
    sheet_name='Sheet1',
    index_col=0,
    na_values=['NA'],
)
df_animal2
Out[101]:
Visualization in Pandas¶
In [112]:
import numpy as np
%matplotlib inline
ts=pd.Series(np.random.randn(50),index=pd.date_range('today',periods=50))
ts=ts.cumsum()
ts.plot()
Out[112]:
In [115]:
# Four random walks sharing ts's date index; plot() draws one line per column.
# The steps come from a seeded Generator (instead of the unseeded
# np.random.randn) so the figure is reproducible across re-runs.
walk_steps = np.random.default_rng(1).standard_normal((50, 4))
df = pd.DataFrame(walk_steps, index=ts.index, columns=['A', 'B', 'X', 'Y']).cumsum()
df.plot()
Out[115]:
Practice example
Remove consecutive repeated values using pandas
In [117]:
# Collapse runs of repeated values: a row is kept only when its 'A' value
# differs from the row directly above it (the first row is always kept,
# because shift() puts NaN in front of it).
df = pd.DataFrame({'A': [1, 2, 2, 2, 4, 4, 5, 5, 6, 6, 7, 8, 8]})
starts_new_run = df['A'].ne(df['A'].shift())
df[starts_new_run]
Out[117]:
In [ ]:
No comments:
Post a Comment