# 機器學習的特徵工程技術

Source: Deep Learning on Medium

### 匯入模組

```python
import pandas as pd
import numpy as np
```

### 1.處理遺失值

```python
threshold = 0.7

# 丟棄遺失值比例高於0.7的columns
data = data[data.columns[data.isnull().mean() < threshold]]

# 丟棄遺失值比例高於0.7的rows
data = data.loc[data.isnull().mean(axis=1) < threshold]

# 填補遺失值成0
data = data.fillna(0)

# 填補遺失值成該欄位的中位數
data = data.fillna(data.median())

# 填補類別欄位的遺失值
data['column_name'].fillna(data['column_name'].value_counts().idxmax(), inplace=True)
```

### 2.處理離群值

```python
# 用標準差來丟棄離群值
factor = 3  # 超過3個標準差視為離群值
upper_lim = data['column'].mean() + data['column'].std() * factor
lower_lim = data['column'].mean() - data['column'].std() * factor
data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# 用百分位數「丟棄」離群值
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)
data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# 用百分位數「保留」離群值（將超出範圍的值設為上下限）
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)
data.loc[(data['column'] > upper_lim), 'column'] = upper_lim
data.loc[(data['column'] < lower_lim), 'column'] = lower_lim
```

### 3.區間分化

`#數值區間分化`
`data['bin'] = pd.cut(data['value'], bins=[0,30,70,100], labels=["Low", "Mid", "High"])`
```
   value   bin
0      2   Low
1     45   Mid
2      7   Low
3     85  High
4     28   Low
```
`#類別區間分化`
```
     Country
0      Spain
1      Chile
2  Australia
3      Italy
4     Brazil
```
```python
conditions = [
    data['Country'].str.contains('Spain'),
    data['Country'].str.contains('Italy'),
    data['Country'].str.contains('Chile'),
    data['Country'].str.contains('Brazil'),
]
choices = ['Europe', 'Europe', 'South America', 'South America']
data['Continent'] = np.select(conditions, choices, default='Other')
```
```
     Country      Continent
0      Spain         Europe
1      Chile  South America
2  Australia          Other
3      Italy         Europe
4     Brazil  South America
```

### 4.對數轉換

Log(x+1)

```python
# Example
data = pd.DataFrame({'value':[2,45, -23, 85, 28, 2, 35, -12]})
data['log+1'] = (data['value']+1).transform(np.log)

# 負數直接取 log 會得到 nan，因此先減去最小值再加 1：log(x - min(x) + 1)
data['log'] = (data['value']-data['value'].min()+1).transform(np.log)
```
```
   value  log(x+1)  log(x-min(x)+1)
0      2   1.09861          3.25810
1     45   3.82864          4.23411
2    -23       nan          0.00000
3     85   4.45435          4.69135
4     28   3.36730          3.95124
5      2   1.09861          3.25810
6     35   3.58352          4.07754
7    -12       nan          2.48491
```

### 5.One-hot encoding

```python
encoded_columns = pd.get_dummies(data['column'])
data = data.join(encoded_columns).drop('column', axis=1)
```

### 6.合併群集

```python
# 選擇最高頻率標籤
data.groupby('id').agg(lambda x: x.value_counts().index[0])

# 樞紐分析
data.pivot_table(index='column_to_group', columns='column_to_encode', values='aggregation_column', aggfunc=np.sum, fill_value = 0)

# 合併數值欄位
grouped = data.groupby('column_to_group')
sums = grouped[sum_cols].sum().add_suffix('_sum')
avgs = grouped[mean_cols].mean().add_suffix('_avg')
new_df = pd.concat([sums, avgs], axis=1)
```

### 7.特徵分割

```
data.name
0    Luther N. Gonzalez
1      Charles M. Young
2          Terry Lawson
3         Kristen White
4        Thomas Logsdon
```
```
# Extracting first names
data.name.str.split(" ").map(lambda x: x[0])
0     Luther
1    Charles
2      Terry
3    Kristen
4     Thomas
```
```
# Extracting last names
data.name.str.split(" ").map(lambda x: x[-1])
0    Gonzalez
1       Young
2      Lawson
3       White
4     Logsdon
```

### 8.調整資料規模

```python
# Normalization
data = pd.DataFrame({'value':[2,45, -23, 85, 28, 2, 35, -12]})
data['normalized'] = (data['value'] - data['value'].min()) / (data['value'].max() - data['value'].min())
```
```
   value  normalized
0      2        0.23
1     45        0.63
2    -23        0.00
3     85        1.00
4     28        0.47
5      2        0.23
6     35        0.54
7    -12        0.10
```
`#Standardization`
```python
data = pd.DataFrame({'value':[2,45, -23, 85, 28, 2, 35, -12]})
data['standardized'] = (data['value'] - data['value'].mean()) / data['value'].std()
```
```
   value  standardized
0      2         -0.52
1     45          0.70
2    -23         -1.23
3     85          1.84
4     28          0.22
5      2         -0.52
6     35          0.42
7    -12         -0.92
```

### 9.提取日期

```python
from datetime import date

data = pd.DataFrame({'date':['01-01-2017','04-12-2008','23-06-1988','25-08-1999','20-02-1993',]})

# 轉換字串成時間
data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

# 提取年
data['year'] = data['date'].dt.year

# 提取月
data['month'] = data['date'].dt.month

# 提取過了多少年
data['passed_years'] = date.today().year - data['date'].dt.year

# 提取過了多少月
data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 + date.today().month - data['date'].dt.month

# 從日期轉換出當天是星期幾
data['day_name'] = data['date'].dt.day_name()
```
```
        date  year  month  passed_years  passed_months   day_name
0 2017-01-01  2017      1             2             26     Sunday
1 2008-12-04  2008     12            11            123   Thursday
2 1988-06-23  1988      6            31            369   Thursday
3 1999-08-25  1999      8            20            235  Wednesday
4 1993-02-20  1993      2            26            313   Saturday
```