機器學習的特徵工程技術

Source: Deep Learning on Medium


Go to the profile of Yanwei Liu

匯入模組

import pandas as pd
import numpy as np

1.處理遺失值

# Maximum tolerated fraction of missing values per column / per row.
threshold = 0.7

# Keep only the columns whose fraction of missing values is below the threshold.
column_missing_ratio = data.isnull().mean()
data = data.loc[:, column_missing_ratio < threshold]

# Keep only the rows whose fraction of missing values is below the threshold.
row_missing_ratio = data.isnull().mean(axis=1)
data = data[row_missing_ratio < threshold]
# Fill every missing value with 0.
data = data.fillna(0)
# Fill missing values with the median of their column.
# NOTE: this is an alternative to the zero fill above; run right after it,
# there is nothing left to fill.
# numeric_only=True restricts the median to numeric columns, so a frame that
# also holds text/categorical columns does not raise on recent pandas.
data = data.fillna(data.median(numeric_only=True))
# Fill missing values of a categorical column with its most frequent value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection operates on a temporary object and is not guaranteed to
# update `data` itself.
most_frequent = data['column_name'].value_counts().idxmax()
data['column_name'] = data['column_name'].fillna(most_frequent)

2.處理離群值

# Drop outliers using the standard deviation.
factor = 3  # values `factor` or more standard deviations from the mean are outliers
column_mean = data['column'].mean()
column_std = data['column'].std()
upper_lim = column_mean + column_std * factor
lower_lim = column_mean - column_std * factor

# Keep only the rows strictly inside (lower_lim, upper_lim).
within_limits = (data['column'] > lower_lim) & (data['column'] < upper_lim)
data = data[within_limits]
# Drop outliers using percentiles: keep only the rows whose value lies
# strictly between the 5th and the 95th percentile of the column.
column_values = data['column']
upper_lim = column_values.quantile(.95)
lower_lim = column_values.quantile(.05)

within_limits = (column_values > lower_lim) & (column_values < upper_lim)
data = data[within_limits]
# Cap (rather than drop) outliers using percentiles: values above the 95th
# percentile are replaced by it, values below the 5th percentile likewise.
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)
# Index the same frame (`data`) and the same quoted column label used to
# compute the limits.
data.loc[data['column'] > upper_lim, 'column'] = upper_lim
data.loc[data['column'] < lower_lim, 'column'] = lower_lim

3.區間分箱（Binning）

# Bin numeric values into labelled ranges:
# [0, 30] -> "Low", (30, 70] -> "Mid", (70, 100] -> "High".
# include_lowest=True closes the first interval on the left, so a value of
# exactly 0 is labelled "Low" instead of silently becoming NaN.
data['bin'] = pd.cut(
    data['value'],
    bins=[0, 30, 70, 100],
    labels=["Low", "Mid", "High"],
    include_lowest=True,
)
 value bin
0 2 Low
1 45 Mid
2 7 Low
3 85 High
4 28 Low
#類別區間分化
Country
0 Spain
1 Chile
2 Australia
3 Italy
4 Brazil
# One boolean mask per country, in the same order as `choices` below.
# regex=False matches the names literally; na=False makes rows with a missing
# Country evaluate to False, so every mask is strictly boolean and those rows
# fall through to the default label.
countries = ['Spain', 'Italy', 'Chile', 'Brazil']
conditions = [
    data['Country'].str.contains(country, regex=False, na=False)
    for country in countries
]

# Continent label assigned when the mask at the same position is True.
choices = ['Europe', 'Europe', 'South America', 'South America']

# Rows matching none of the masks get 'Other'.
data['Continent'] = np.select(conditions, choices, default='Other')
 Country Continent
0 Spain Europe
1 Chile South America
2 Australia Other
3 Italy Europe
4 Brazil South America

4.對數轉換

Log(x+1)

# Example: log-transform a column that contains negative values.
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})

# log(x + 1): rows where x + 1 is negative come out as NaN.
shifted_by_one = data['value'] + 1
data['log+1'] = np.log(shifted_by_one)

# log(x - min(x) + 1): shifting by the column minimum makes the smallest
# input 1, so every row has a defined logarithm.
shifted_to_positive = data['value'] - data['value'].min() + 1
data['log'] = np.log(shifted_to_positive)
 value log(x+1) log(x-min(x)+1)
0 2 1.09861 3.25810
1 45 3.82864 4.23411
2 -23 nan 0.00000
3 85 4.45435 4.69135
4 28 3.36730 3.95124
5 2 1.09861 3.25810
6 35 3.58352 4.07754
7 -12 nan 2.48491

5.One-hot encoding

# One-hot encode 'column': build one indicator column per distinct value,
# attach them to the frame, then remove the original column.
encoded_columns = pd.get_dummies(data['column'])
with_indicators = data.join(encoded_columns)
data = with_indicators.drop(columns='column')

6.分組運算（Grouping）

# For each 'id' group, reduce every other column to its most frequent value
# (value_counts() sorts by descending frequency, so index[0] is the top one).
# NOTE(review): a group whose column holds only missing values has an empty
# value_counts() and would raise IndexError here — confirm whether that can occur.
data.groupby('id').agg(lambda x: x.value_counts().index[0])
# Pivot table: one row per 'column_to_group' value, one column per
# 'column_to_encode' value, each cell the sum of 'aggregation_column';
# combinations with no data are filled with 0.
# The aggregation is named by string ('sum') rather than passing np.sum,
# which recent pandas versions warn about.
data.pivot_table(
    index='column_to_group',
    columns='column_to_encode',
    values='aggregation_column',
    aggfunc='sum',
    fill_value=0,
)
# Aggregate numeric columns per group: sum the columns listed in `sum_cols`,
# average the columns listed in `mean_cols`, and tag each result column with
# a suffix naming the aggregation.
grouped = data.groupby('column_to_group')

group_totals = grouped[sum_cols].sum()
group_means = grouped[mean_cols].mean()
sums = group_totals.add_suffix('_sum')
avgs = group_means.add_suffix('_avg')

# Place the two aggregated frames side by side.
new_df = pd.concat((sums, avgs), axis='columns')

7.特徵分割

data.name
0 Luther N. Gonzalez
1 Charles M. Young
2 Terry Lawson
3 Kristen White
4 Thomas Logsdon
# Extract first names: split each name on spaces and take the first token.
# The .str accessor propagates missing names as NaN instead of raising the
# way a plain Python lambda indexing into a non-list value would.
data['name'].str.split(" ").str[0]
0 Luther
1 Charles
2 Terry
3 Kristen
4 Thomas
# Extract last names: split each name on spaces and take the last token.
# The .str accessor propagates missing names as NaN instead of raising the
# way a plain Python lambda indexing into a non-list value would.
data['name'].str.split(" ").str[-1]
0 Gonzalez
1 Young
2 Lawson
3 White
4 Logsdon

8.調整資料規模

# Normalization (min-max scaling): map the column linearly so that its
# minimum becomes 0 and its maximum becomes 1.
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})

value_min = data['value'].min()
value_span = data['value'].max() - data['value'].min()
data['normalized'] = (data['value'] - value_min) / value_span
value normalized
0 2 0.23
1 45 0.63
2 -23 0.00
3 85 1.00
4 28 0.47
5 2 0.23
6 35 0.54
7 -12 0.10
# Standardization (z-score): subtract the column mean and divide by the
# column standard deviation as computed by pandas' default `std()`.
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})

value_mean = data['value'].mean()
value_std = data['value'].std()
data['standardized'] = (data['value'] - value_mean) / value_std
value standardized
0 2 -0.52
1 45 0.70
2 -23 -1.23
3 85 1.84
4 28 0.22
5 2 -0.52
6 35 0.42
7 -12 -0.92

9.提取日期

from datetime import date

# Example frame of day-month-year date strings.
data = pd.DataFrame({
    'date': [
        '01-01-2017',
        '04-12-2008',
        '23-06-1988',
        '25-08-1999',
        '20-02-1993',
    ],
})

# Parse the strings into datetimes (day-month-year).
data['date'] = pd.to_datetime(data['date'], format="%d-%m-%Y")

# Read the current date once so every derived column uses the same reference
# day; separate date.today() calls could straddle a month or year boundary.
today = date.today()

# Calendar year of each date.
data['year'] = data['date'].dt.year

# Calendar month of each date.
data['month'] = data['date'].dt.month

# Difference in calendar years between today and each date.
data['passed_years'] = today.year - data['year']

# Difference in calendar months between today and each date.
data['passed_months'] = (today.year - data['year']) * 12 + today.month - data['month']

# Weekday name of each date.
data['day_name'] = data['date'].dt.day_name()
 date year month passed_years passed_months day_name
0 2017-01-01 2017 1 2 26 Sunday
1 2008-12-04 2008 12 11 123 Thursday
2 1988-06-23 1988 6 31 369 Thursday
3 1999-08-25 1999 8 20 235 Wednesday
4 1993-02-20 1993 2 26 313 Saturday