# Pandas-Python

# Source: Deep Learning on Medium

#For Dealing with Structured Data, Pandas is the most important library.

#Open source Python Library.
#High-performance, easy-to-use data structures and data analysis tools.
#Runs on top of NumPy. So NumPy is a dependency for Pandas.

#A Data Structure in Pandas is called a Data Frame.

#Used to make High-level Data Structures (Data Frame)
#More Streamlined Handling of Tabular Data, and rich Time Series functionality.
#Data Alignment, Missing-Data Friendly Statistics, Groupby, Merge and Join Methods.
#You can use Pandas data structures, and freely draw on Numpy and SciPy functions to manipulate them.

# Dependencies: The Python Stack
# ( Many others ) are built on top of
# (SciKit Learn, Scikit Image, etc) are built on top of
# (SciPy, Pandas, Matplotlib) are built on top of
# ( NumPy )

# pandas.pydata.org

#In Pandas Missing values are NaN.
#If we convert categories into integer codes (e.g. with .cat.codes), Pandas uses -1 as the code for missing (NaN) values.

import numpy as np
import pandas as pd

def header(msg):
    """Print a dashed separator line followed by the section title.

    The rest of this file calls ``header("...")`` with a single string
    before each demonstration so the console output is easy to scan.

    Args:
        msg: Title text printed on the line after the separator.

    Returns:
        None. The function only writes to standard output.
    """
    # The separator text is kept exactly as it appears in the source;
    # only the quote characters were changed to valid ASCII delimiters.
    print(' — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — — ')
    print(msg)

#1 Load Hard-Coded data into a DataFrame
def method1():
    """Build a DataFrame from hard-coded monthly weather rows and print it.

    Each row holds a month label followed by five numeric readings. The
    rows are labelled with an explicit integer index 0..11 and the six
    columns are named via ``columns``.

    Returns:
        None. The DataFrame is only printed to standard output.
    """
    df = pd.DataFrame(
        [
            ['Jan', 59, 32, 78, 24, 2.95],
            ['Feb', 59, 32, 72, 24, 2.95],
            ['Mar', 59, 32, 73, 24, 2.95],
            ['Apr', 59, 32, 71, 24, 2.95],
            ['May', 59, 32, 74, 24, 2.95],
            ['Jun', 59, 32, 75, 24, 2.95],
            ['Jul', 59, 32, 77, 24, 2.95],
            ['Aug', 59, 32, 79, 24, 2.95],
            ['Sep', 59, 32, 70, 24, 2.95],
            ['Oct', 59, 32, 76, 24, 2.95],
            ['Nov', 59, 32, 89, 24, 2.95],
            ['Dec', 59, 32, 87, 24, 2.95],
        ],
        index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        columns=[
            'month',
            'avg_high',
            'avg_low',
            'record_high',
            'record_low',
            'avg_precipitation',
        ],
    )
    print(df)

#method1()

#2 Read Text File into a DataFrame
def method2():
filename = ‘Weather_Monthly.txt’
print(df)

print(df.tail(2))

#4 Get Data Types, Index, Columns, Values
print(df.dtypes)
print(df.index)
print(df.columns)
print(df.values)

#5 Statistical Summary of each column
print(df.describe())

#6 Sort Records by any column
print(df.sort_values(‘record_high’, ascending = False))

#7 Slicing Records
print(df.avg_low)

print(df[‘avg_low’])

print(df[2:4])

print(df[[‘avg_low’, ‘avg_high’]])

header(“7. slicing — df.loc[:, [‘avg_low’, ‘avg_high’]]”)
print(df.loc[:, [‘avg_low’, ‘avg_high’]])

print(df.loc[8, [‘avg_precipitation’]])

header(“7. slicing — df.iloc[2:4, [2, 4]]”)
print(df.iloc[2:4, [2, 4]])

#8 Filtering
print(df[df.avg_precipitation > 2])

print(df[df[‘month’].isin([‘Jan’, ‘May’, ‘Aug’])])

#9 Assignment — similar to slicing
header(“9. Assignment — df.loc[2, [‘avg_low’]] = 40 df[2:4]”)
df.loc[2, [‘avg_low’]] = 40
print(df.iloc[2:4])

header(“9. Assignment — df.loc[2, [‘avg_low’]] = np.nan df[2:4]”)
df.loc[2, [‘avg_low’]] = np.nan
print(df.iloc[2:4])
#Pandas represents missing values as NaN. Values whose type differs from the rest of a column become NaN only under explicit coercion, e.g. pd.to_numeric(..., errors='coerce').

header(“9. Assignment — df.loc[:, [‘avg_low’]] = np.array( * len(df)) df[2:7]”)
df.loc[:, [‘avg_low’]] = np.array( * len(df))
print(df.iloc[2:7])

header(“9. Assignment — df[‘avg_day’] = (df.avg_low + df.avg_high) / 2 df[:4]”)
df[‘avg_day’] = (df.avg_low + df.avg_high) / 2
print(df.iloc[:4])

#10 Renaming
df.rename(columns = {‘avg_low’:’on_avg_low’}, inplace = True) #Either we can write inplace = True or we can write the statement as df = df.rename(…….)
#We pass a dictionary {} here.

header(“10 Renaming all of the Columns”)
df.columns = [‘month’, ‘av_hi’, ‘av_lo’, ‘rec_hi’, ‘rec_lo’, ‘avg_rain’, ‘avg_day’]