# Pandas-Python

# Source: Deep Learning on Medium


# Author: Develop It (Medium profile)

#For Dealing with Structured Data, Pandas is the most important library.

#Open source Python Library.
#High performance, easy-to-use data structures and data analysis tools.
#Runs on top of NumPy. So NumPy is a dependency for Pandas.

#The main tabular data structure in Pandas is called a DataFrame.

#Used to make High-level Data Structures (Data Frame)
#More Streamlined Handling of Tabular Data, and rich Time Series functionality.
#Data Alignment, Missing-Data Friendly Statistics, Groupby, Merge and Join Methods.
#You can use Pandas data structures, and freely draw on Numpy and SciPy functions to manipulate them.

# Dependencies: The Python Stack
# ( Many others ) are built on top of
# (SciKit Learn, Scikit Image, etc) are built on top of
# (SciPy, Pandas, Matplotlib) are built on top of
# ( NumPy )

# pandas.pydata.org

#In Pandas, missing values are represented as NaN.
#When categories are converted to numeric codes, pandas uses -1 as the code for missing (NaN) values.

import numpy as np 
import pandas as pd

def header(msg):
    """Print a separator rule followed by ``msg`` as a section title.

    Args:
        msg: Text printed on the line directly below the separator.
    """
    # The literal must be delimited by plain ASCII quotes; typographic
    # quotes are not valid string delimiters in Python.
    print(' — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — ')
    print(msg)

#1 Load Hard-Coded data into a DataFrame
def method1():
    """Build a DataFrame from hard-coded monthly weather rows and print it.

    Each inner list is one row; ``index`` supplies the row labels and
    ``columns`` the column names, in the same order as the row values.
    """
    header("1 Load Hard-Coded data into a DataFrame")
    df = pd.DataFrame(
        [
            ['Jan', 59, 32, 78, 24, 2.95],
            ['Feb', 59, 32, 72, 24, 2.95],
            ['Mar', 59, 32, 73, 24, 2.95],
            ['Apr', 59, 32, 71, 24, 2.95],
            ['May', 59, 32, 74, 24, 2.95],
            ['Jun', 59, 32, 75, 24, 2.95],
            ['Jul', 59, 32, 77, 24, 2.95],
            ['Aug', 59, 32, 79, 24, 2.95],
            ['Sep', 59, 32, 70, 24, 2.95],
            ['Oct', 59, 32, 76, 24, 2.95],
            ['Nov', 59, 32, 89, 24, 2.95],
            ['Dec', 59, 32, 87, 24, 2.95],
        ],
        # One label per row above (12 months -> labels 0..11).
        index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        columns=['month', 'avg_high', 'avg_low', 'record_high', 'record_low', 'avg_precipitation'],
    )
    print(df)

#method1()

#2 Read Text File into a DataFrame
def method2(filename='Weather_Monthly.txt', out_filename='Weather.csv'):
    """Walk through common DataFrame operations on a monthly weather file.

    Loads ``filename`` with ``pd.read_csv`` and then prints, section by
    section (each introduced via ``header``): head/tail, dtypes, index,
    columns, values, ``describe()``, sorting, slicing, filtering, assignment,
    renaming and row iteration.  The DataFrame is modified along the way and
    finally written to ``out_filename`` as CSV.

    Args:
        filename: Path of the CSV text file to load.  The steps below need
            six columns, among them ``month``, ``avg_high``, ``avg_low``,
            ``record_high`` and ``avg_precipitation``, and a default integer
            row index (labels 2 and 8 are accessed with ``.loc``).
        out_filename: Path the final DataFrame is written to.
    """
    header("2 Read Text File into a DataFrame")
    df = pd.read_csv(filename)
    print(df)

    # 3 Print Head and Tail
    header("3. df.head()")
    print(df.head())
    header("3. df.tail(2)")
    print(df.tail(2))

    # 4 Get Data Types, Index, Columns, Values
    header("4. df.dtypes")
    print(df.dtypes)
    header("4. df.index")
    print(df.index)
    header("4. df.columns")
    print(df.columns)
    header("4. df.values")
    print(df.values)

    # 5 Statistical Summary of each column
    header("5. df.describe()")
    print(df.describe())

    # 6 Sort Records by any column
    header("6. df.sort_values('record_high', ascending = False)")
    print(df.sort_values('record_high', ascending=False))

    # 7 Slicing Records
    header("7. slicing — df.avg_low")
    print(df.avg_low)

    header("7. slicing — df['avg_low']")
    print(df['avg_low'])

    header("7. slicing — df[2:4]")
    print(df[2:4])

    header("7. slicing — df[['avg_low', 'avg_high']]")
    print(df[['avg_low', 'avg_high']])

    header("7. slicing — df.loc[:, ['avg_low', 'avg_high']]")
    print(df.loc[:, ['avg_low', 'avg_high']])

    header("7. slicing — df.loc[8, ['avg_precipitation']]")
    print(df.loc[8, ['avg_precipitation']])

    header("7. slicing — df.iloc[2:4, [2, 4]]")
    print(df.iloc[2:4, [2, 4]])

    # 8 Filtering
    # The label states the same threshold as the filter that is actually run.
    header("8. filtering df[df.avg_precipitation > 2]")
    print(df[df.avg_precipitation > 2])

    header("8. filtering df[df['month'].isin(['Jan', 'May', 'Aug'])]")
    print(df[df['month'].isin(['Jan', 'May', 'Aug'])])

    # 9 Assignment — similar to slicing
    header("9. Assignment — df.loc[2, ['avg_low']] = 40 df[2:4]")
    df.loc[2, ['avg_low']] = 40
    print(df.iloc[2:4])

    header("9. Assignment — df.loc[2, ['avg_low']] = np.nan df[2:4]")
    # avg_low may have been parsed as an integer column, which cannot hold
    # NaN.  Cast to float explicitly instead of relying on pandas silently
    # upcasting during assignment (deprecated since pandas 2.1).
    df['avg_low'] = df['avg_low'].astype('float64')
    df.loc[2, ['avg_low']] = np.nan
    print(df.iloc[2:4])
    # NaN is how pandas represents a missing value.

    header("9. Assignment — df.loc[:, ['avg_low']] = np.array([2] * len(df)) df[2:7]")
    df.loc[:, ['avg_low']] = np.array([2] * len(df))
    print(df.iloc[2:7])

    header("9. Assignment — df['avg_day'] = (df.avg_low + df.avg_high) / 2 df[:4]")
    df['avg_day'] = (df.avg_low + df.avg_high) / 2
    print(df.iloc[:4])

    # 10 Renaming
    header("10 Renaming One Column")
    # rename() takes a {old_name: new_name} dictionary.  Either pass
    # inplace=True or rebind the result: df = df.rename(...).
    df.rename(columns={'avg_low': 'on_avg_low'}, inplace=True)
    print(df.head())

    header("10 Renaming all of the Columns")
    # Seven names: the six columns read from the file plus avg_day added above.
    df.columns = ['month', 'av_hi', 'av_lo', 'rec_hi', 'rec_lo', 'avg_rain', 'avg_day']
    print(df.head())

    # 11 Iterate a df (not used that often)
    header("11 Iterate a DataFrame")
    for index, row in df.iterrows():
        print(index, row["month"], row["avg_rain"])

    # 12 Write to CSV File
    header("12 write to CSV File")
    df.to_csv(out_filename)
    # Other formats have their own writers, e.g. df.to_excel.
 #You can write to different formats using different commands like df.to_excel etc.

# Run the file-based walkthrough (sections 2-12). By default it reads
# 'Weather_Monthly.txt' and writes 'Weather.csv', both relative to the
# current working directory.
method2()