Kaggle 필사 1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
%%time
# Load the Porto Seguro train/test CSVs (each zip extracted into its own folder,
# hence the nested "train.csv/train.csv" paths).
train= pd.read_csv("./porto/train.csv/train.csv")
test= pd.read_csv("./porto/test.csv/test.csv")
Wall time: 6.47 s
# train = train.sample(frac = 0.2)  # take a random sample when doing EDA
# If the data is imbalanced:
from sklearn.model_selection import StratifiedKFold
# (commented out) subsample by taking one stratified fold, which preserves
# the target class ratio:
#fold = StratifiedKFold(n_splits=10, random_state=1980, shuffle = True)
#for trn_idx, val_idx in fold.split(train, train['target']) :
#    break
#train = train.iloc[trn_idx]
train.shape
(595212, 59)
train.tail()
id | target | ps_ind_01 | ps_ind_02_cat | ps_ind_03 | ps_ind_04_cat | ps_ind_05_cat | ps_ind_06_bin | ps_ind_07_bin | ps_ind_08_bin | ps_ind_09_bin | ps_ind_10_bin | ps_ind_11_bin | ps_ind_12_bin | ps_ind_13_bin | ps_ind_14 | ps_ind_15 | ps_ind_16_bin | ps_ind_17_bin | ps_ind_18_bin | ps_reg_01 | ps_reg_02 | ps_reg_03 | ps_car_01_cat | ps_car_02_cat | ps_car_03_cat | ps_car_04_cat | ps_car_05_cat | ps_car_06_cat | ps_car_07_cat | ps_car_08_cat | ps_car_09_cat | ps_car_10_cat | ps_car_11_cat | ps_car_11 | ps_car_12 | ps_car_13 | ps_car_14 | ps_car_15 | ps_calc_01 | ps_calc_02 | ps_calc_03 | ps_calc_04 | ps_calc_05 | ps_calc_06 | ps_calc_07 | ps_calc_08 | ps_calc_09 | ps_calc_10 | ps_calc_11 | ps_calc_12 | ps_calc_13 | ps_calc_14 | ps_calc_15_bin | ps_calc_16_bin | ps_calc_17_bin | ps_calc_18_bin | ps_calc_19_bin | ps_calc_20_bin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
595207 | 1488013 | 0 | 3 | 1 | 10 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 13 | 1 | 0 | 0 | 0.5 | 0.3 | 0.692820 | 10 | 1 | -1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 31 | 3 | 0.374166 | 0.684631 | 0.385487 | 2.645751 | 0.4 | 0.5 | 0.3 | 3 | 0 | 9 | 0 | 9 | 1 | 12 | 4 | 1 | 9 | 6 | 0 | 1 | 1 | 0 | 1 | 1 |
595208 | 1488016 | 0 | 5 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | 1 | 0 | 0 | 0.9 | 0.7 | 1.382027 | 9 | 1 | -1 | 0 | -1 | 15 | 0 | 0 | 2 | 1 | 63 | 2 | 0.387298 | 0.972145 | -1.000000 | 3.605551 | 0.2 | 0.2 | 0.0 | 2 | 4 | 8 | 6 | 8 | 2 | 12 | 4 | 1 | 3 | 8 | 1 | 0 | 1 | 0 | 1 | 1 |
595209 | 1488017 | 0 | 1 | 1 | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | 1 | 0 | 0 | 0.9 | 0.2 | 0.659071 | 7 | 1 | -1 | 0 | -1 | 1 | 1 | 1 | 2 | 1 | 31 | 3 | 0.397492 | 0.596373 | 0.398748 | 1.732051 | 0.4 | 0.0 | 0.3 | 3 | 2 | 7 | 4 | 8 | 0 | 10 | 3 | 2 | 2 | 6 | 0 | 0 | 1 | 0 | 0 | 0 |
595210 | 1488021 | 0 | 5 | 2 | 3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | 1 | 0 | 0 | 0.9 | 0.4 | 0.698212 | 11 | 1 | -1 | 0 | -1 | 11 | 1 | 1 | 2 | 1 | 101 | 3 | 0.374166 | 0.764434 | 0.384968 | 3.162278 | 0.0 | 0.7 | 0.0 | 4 | 0 | 9 | 4 | 9 | 2 | 11 | 4 | 1 | 4 | 2 | 0 | 1 | 1 | 1 | 0 | 0 |
595211 | 1488027 | 0 | 0 | 1 | 8 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 1 | 0 | 0 | 0.1 | 0.2 | -1.000000 | 7 | 0 | -1 | 0 | -1 | 0 | 1 | 0 | 2 | 1 | 34 | 2 | 0.400000 | 0.932649 | 0.378021 | 3.741657 | 0.4 | 0.0 | 0.5 | 2 | 3 | 10 | 4 | 10 | 2 | 5 | 4 | 4 | 3 | 8 | 0 | 1 | 0 | 0 | 0 | 0 |
# Names of every categorical column (marked with a 'cat' suffix) in train.
cat_cols = [column for column in train.columns if 'cat' in column]
cat_cols
['ps_ind_02_cat',
'ps_ind_04_cat',
'ps_ind_05_cat',
'ps_car_01_cat',
'ps_car_02_cat',
'ps_car_03_cat',
'ps_car_04_cat',
'ps_car_05_cat',
'ps_car_06_cat',
'ps_car_07_cat',
'ps_car_08_cat',
'ps_car_09_cat',
'ps_car_10_cat',
'ps_car_11_cat']
train[cat_cols[0]].value_counts()
1 431859
2 123573
3 28186
4 11378
-1 216
Name: ps_ind_02_cat, dtype: int64
# Cardinality of each categorical column; ps_car_11_cat is the
# high-cardinality one (104 distinct values).
for col in cat_cols :
    print(col, train[col].nunique())
ps_ind_02_cat 5
ps_ind_04_cat 3
ps_ind_05_cat 8
ps_car_01_cat 13
ps_car_02_cat 3
ps_car_03_cat 3
ps_car_04_cat 10
ps_car_05_cat 3
ps_car_06_cat 18
ps_car_07_cat 3
ps_car_08_cat 2
ps_car_09_cat 6
ps_car_10_cat 3
ps_car_11_cat 104
# BUGFIX: drop_duplicates() returns a new DataFrame — the original code
# discarded the result, so duplicates were never actually removed.
# (The shape output shows there are none in this dataset, but the intent
# is now carried out.)
train = train.drop_duplicates()
train.shape
(595212, 59)
train['ps_ind_03'].dtype == 'int64'
True
# Build a per-column metadata table: role (target/id/input), measurement
# level (binary/nominal/interval/ordinal), whether to keep it, and dtype.
data = []
for f in train.columns:
    # --- role ---
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else :
        role = 'input'
    # --- level ---
    # BUGFIX: 'level' was only assigned when a branch matched, so a column
    # with an unexpected dtype would silently inherit the previous
    # iteration's value; default it explicitly instead.
    if 'bin' in f or f == 'target' :
        level = 'binary'
    elif 'cat' in f or f =='id':
        level = 'nominal'
    elif train[f].dtype == 'float' :
        level = 'interval'
    elif train[f].dtype == 'int64':
        level = 'ordinal'
    else:
        level = None  # unknown dtype — surfaces as NaN in meta
    # Keep every variable except the row identifier.
    keep = True
    if f == 'id':
        keep = False
    dtype = train[f].dtype
    # Collect the metadata for this column.
    f_dict = {
        'varname' : f,
        'role' : role,
        'level' : level,
        'keep' : keep,
        'dtype' : dtype
    }
    data.append(f_dict)
meta = pd.DataFrame(data, columns = ['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace = True)
meta
meta
role | level | keep | dtype | |
---|---|---|---|---|
varname | ||||
id | id | nominal | False | int64 |
target | target | binary | True | int64 |
ps_ind_01 | input | ordinal | True | int64 |
ps_ind_02_cat | input | nominal | True | int64 |
ps_ind_03 | input | ordinal | True | int64 |
ps_ind_04_cat | input | nominal | True | int64 |
ps_ind_05_cat | input | nominal | True | int64 |
ps_ind_06_bin | input | binary | True | int64 |
ps_ind_07_bin | input | binary | True | int64 |
ps_ind_08_bin | input | binary | True | int64 |
ps_ind_09_bin | input | binary | True | int64 |
ps_ind_10_bin | input | binary | True | int64 |
ps_ind_11_bin | input | binary | True | int64 |
ps_ind_12_bin | input | binary | True | int64 |
ps_ind_13_bin | input | binary | True | int64 |
ps_ind_14 | input | ordinal | True | int64 |
ps_ind_15 | input | ordinal | True | int64 |
ps_ind_16_bin | input | binary | True | int64 |
ps_ind_17_bin | input | binary | True | int64 |
ps_ind_18_bin | input | binary | True | int64 |
ps_reg_01 | input | interval | True | float64 |
ps_reg_02 | input | interval | True | float64 |
ps_reg_03 | input | interval | True | float64 |
ps_car_01_cat | input | nominal | True | int64 |
ps_car_02_cat | input | nominal | True | int64 |
ps_car_03_cat | input | nominal | True | int64 |
ps_car_04_cat | input | nominal | True | int64 |
ps_car_05_cat | input | nominal | True | int64 |
ps_car_06_cat | input | nominal | True | int64 |
ps_car_07_cat | input | nominal | True | int64 |
ps_car_08_cat | input | nominal | True | int64 |
ps_car_09_cat | input | nominal | True | int64 |
ps_car_10_cat | input | nominal | True | int64 |
ps_car_11_cat | input | nominal | True | int64 |
ps_car_11 | input | ordinal | True | int64 |
ps_car_12 | input | interval | True | float64 |
ps_car_13 | input | interval | True | float64 |
ps_car_14 | input | interval | True | float64 |
ps_car_15 | input | interval | True | float64 |
ps_calc_01 | input | interval | True | float64 |
ps_calc_02 | input | interval | True | float64 |
ps_calc_03 | input | interval | True | float64 |
ps_calc_04 | input | ordinal | True | int64 |
ps_calc_05 | input | ordinal | True | int64 |
ps_calc_06 | input | ordinal | True | int64 |
ps_calc_07 | input | ordinal | True | int64 |
ps_calc_08 | input | ordinal | True | int64 |
ps_calc_09 | input | ordinal | True | int64 |
ps_calc_10 | input | ordinal | True | int64 |
ps_calc_11 | input | ordinal | True | int64 |
ps_calc_12 | input | ordinal | True | int64 |
ps_calc_13 | input | ordinal | True | int64 |
ps_calc_14 | input | ordinal | True | int64 |
ps_calc_15_bin | input | binary | True | int64 |
ps_calc_16_bin | input | binary | True | int64 |
ps_calc_17_bin | input | binary | True | int64 |
ps_calc_18_bin | input | binary | True | int64 |
ps_calc_19_bin | input | binary | True | int64 |
ps_calc_20_bin | input | binary | True | int64 |
meta[(meta.level =='nominal')& (meta.keep)].index
Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
'ps_car_10_cat', 'ps_car_11_cat'],
dtype='object', name='varname')
pd.DataFrame({'count' : meta.groupby(['role','level'])['role'].size()}).reset_index()
role | level | count | |
---|---|---|---|
0 | id | nominal | 1 |
1 | input | binary | 17 |
2 | input | interval | 10 |
3 | input | nominal | 14 |
4 | input | ordinal | 16 |
5 | target | binary | 1 |
meta.groupby(['role','level'])['role'].size()
role level
id nominal 1
input binary 17
interval 10
nominal 14
ordinal 16
target binary 1
Name: role, dtype: int64
v = meta[(meta.level == 'interval') & (meta.keep)].index
train[v].describe()
ps_reg_01 | ps_reg_02 | ps_reg_03 | ps_car_12 | ps_car_13 | ps_car_14 | ps_car_15 | ps_calc_01 | ps_calc_02 | ps_calc_03 | |
---|---|---|---|---|---|---|---|---|---|---|
count | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 |
mean | 0.610991 | 0.439184 | 0.551102 | 0.379945 | 0.813265 | 0.276256 | 3.065899 | 0.449756 | 0.449589 | 0.449849 |
std | 0.287643 | 0.404264 | 0.793506 | 0.058327 | 0.224588 | 0.357154 | 0.731366 | 0.287198 | 0.286893 | 0.287153 |
min | 0.000000 | 0.000000 | -1.000000 | -1.000000 | 0.250619 | -1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.400000 | 0.200000 | 0.525000 | 0.316228 | 0.670867 | 0.333167 | 2.828427 | 0.200000 | 0.200000 | 0.200000 |
50% | 0.700000 | 0.300000 | 0.720677 | 0.374166 | 0.765811 | 0.368782 | 3.316625 | 0.500000 | 0.400000 | 0.500000 |
75% | 0.900000 | 0.600000 | 1.000000 | 0.400000 | 0.906190 | 0.396485 | 3.605551 | 0.700000 | 0.700000 | 0.700000 |
max | 0.900000 | 1.800000 | 4.037945 | 1.264911 | 3.720626 | 0.636396 | 3.741657 | 0.900000 | 0.900000 | 0.900000 |
v = meta[(meta.level == 'binary') & (meta.keep)].index
train[v].describe()
target | ps_ind_06_bin | ps_ind_07_bin | ps_ind_08_bin | ps_ind_09_bin | ps_ind_10_bin | ps_ind_11_bin | ps_ind_12_bin | ps_ind_13_bin | ps_ind_16_bin | ps_ind_17_bin | ps_ind_18_bin | ps_calc_15_bin | ps_calc_16_bin | ps_calc_17_bin | ps_calc_18_bin | ps_calc_19_bin | ps_calc_20_bin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 | 595212.000000 |
mean | 0.036448 | 0.393742 | 0.257033 | 0.163921 | 0.185304 | 0.000373 | 0.001692 | 0.009439 | 0.000948 | 0.660823 | 0.121081 | 0.153446 | 0.122427 | 0.627840 | 0.554182 | 0.287182 | 0.349024 | 0.153318 |
std | 0.187401 | 0.488579 | 0.436998 | 0.370205 | 0.388544 | 0.019309 | 0.041097 | 0.096693 | 0.030768 | 0.473430 | 0.326222 | 0.360417 | 0.327779 | 0.483381 | 0.497056 | 0.452447 | 0.476662 | 0.360295 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
Handling Imbalanced classes
- 3가지 전략
- oversampling records with target == 1
- undersampling records with target == 0
- SMOTE 사용
# Undersampling setup: make the positive class (target == 1) ~10% of the data.
# NOTE(review): 'desired_apriro1' is a typo for 'desired_apriori'; kept as-is
# because later (unseen) cells may reference the name.
desired_apriro1=0.1
idx_0 = train[train.target == 0].index
idx_1 = train[train.target == 1].index
nb_0 = len(train.loc[idx_0])
nb_1 = len(train.loc[idx_1])
# Fraction of the target==0 rows to keep so positives reach the desired rate.
undersampling_rate = ((1-desired_apriro1)*nb_1)/(nb_0*desired_apriro1)
((1-desired_apriro1)*nb_1)/(nb_0*desired_apriro1)
0.34043569687437886
# Number of target==0 records that survive the undersampling.
undersampled_nb_0 = int(undersampling_rate*nb_0)
print('Rate to undersample records with target=0: {}'.format(undersampling_rate))
print('Number of records with target=0 after undersampling: {}'.format(undersampled_nb_0))
Rate to undersample records with target=0: 0.34043569687437886
Number of records with target=0 after undersampling: 195246
# Randomly choose which target==0 rows to keep, then rebuild train from the
# kept negatives plus every positive row.
undersampled_idx = shuffle(idx_0, random_state=37, n_samples = undersampled_nb_0)
idx_list = list(undersampled_idx) + list(idx_1)
train = train.loc[idx_list].reset_index(drop = True)
#import missingno as msno
#msno.matrix(train)
# isnull() reports nothing because this dataset encodes missing values as -1,
# not NaN — see the -1 scan below.
display( train.isnull().sum(axis =0) )
id 0
target 0
ps_ind_01 0
ps_ind_02_cat 0
ps_ind_03 0
ps_ind_04_cat 0
ps_ind_05_cat 0
ps_ind_06_bin 0
ps_ind_07_bin 0
ps_ind_08_bin 0
ps_ind_09_bin 0
ps_ind_10_bin 0
ps_ind_11_bin 0
ps_ind_12_bin 0
ps_ind_13_bin 0
ps_ind_14 0
ps_ind_15 0
ps_ind_16_bin 0
ps_ind_17_bin 0
ps_ind_18_bin 0
ps_reg_01 0
ps_reg_02 0
ps_reg_03 0
ps_car_01_cat 0
ps_car_02_cat 0
ps_car_03_cat 0
ps_car_04_cat 0
ps_car_05_cat 0
ps_car_06_cat 0
ps_car_07_cat 0
ps_car_08_cat 0
ps_car_09_cat 0
ps_car_10_cat 0
ps_car_11_cat 0
ps_car_11 0
ps_car_12 0
ps_car_13 0
ps_car_14 0
ps_car_15 0
ps_calc_01 0
ps_calc_02 0
ps_calc_03 0
ps_calc_04 0
ps_calc_05 0
ps_calc_06 0
ps_calc_07 0
ps_calc_08 0
ps_calc_09 0
ps_calc_10 0
ps_calc_11 0
ps_calc_12 0
ps_calc_13 0
ps_calc_14 0
ps_calc_15_bin 0
ps_calc_16_bin 0
ps_calc_17_bin 0
ps_calc_18_bin 0
ps_calc_19_bin 0
ps_calc_20_bin 0
dtype: int64
# Missing values are encoded as -1; count and report them per column.
vars_with_missing = []
for f in train.columns:
    missings = train[train[f] == -1][f].count()
    if missings > 0:
        vars_with_missing.append(f)
        missings_perc = missings/train.shape[0]
        print('Variable {} has {} records ({:.2%}) with missing values'.format(f,missings,missings_perc))
print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))
Variable ps_ind_02_cat has 103 records (0.05%) with missing values
Variable ps_ind_04_cat has 51 records (0.02%) with missing values
Variable ps_ind_05_cat has 2256 records (1.04%) with missing values
Variable ps_reg_03 has 38580 records (17.78%) with missing values
Variable ps_car_01_cat has 62 records (0.03%) with missing values
Variable ps_car_02_cat has 2 records (0.00%) with missing values
Variable ps_car_03_cat has 148367 records (68.39%) with missing values
Variable ps_car_05_cat has 96026 records (44.26%) with missing values
Variable ps_car_07_cat has 4431 records (2.04%) with missing values
Variable ps_car_09_cat has 230 records (0.11%) with missing values
Variable ps_car_11 has 1 records (0.00%) with missing values
Variable ps_car_14 has 15726 records (7.25%) with missing values
In total, there are 12 variables with missing values
- ps_car_03_cat은 약 68%가 missing value이므로 변수 자체를 삭제
train[['ps_car_03_cat','target']].groupby('ps_car_03_cat').mean()
target | |
---|---|
ps_car_03_cat | |
-1 | 0.090654 |
0 | 0.106983 |
1 | 0.128862 |
train['ps_car_03_cat'].describe()
count 216940.000000
mean -0.492639
std 0.795291
min -1.000000
25% -1.000000
50% -1.000000
75% 0.000000
max 1.000000
Name: ps_car_03_cat, dtype: float64
train['ps_car_03_cat'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x2398fb97108>
여기서는 결측값을 평균으로 대체하지만, 좀 더 면밀하게 분석한 뒤에 할 필요가 있다.
# Drop the two columns that are mostly missing (per the scan above:
# ps_car_03_cat ~68%, ps_car_05_cat ~44%) and mark them in meta.
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
train.drop(vars_to_drop, axis =1, inplace = True)
meta.loc[(vars_to_drop), 'keep'] = False
# Mean ps_reg_03 per (ps_car_01_cat, ps_car_02_cat) group — a candidate
# group-wise imputation value for the missing ps_reg_03 entries.
temp_series = train[['ps_car_01_cat','ps_car_02_cat','ps_reg_03']].groupby(['ps_car_01_cat','ps_car_02_cat']).mean()
temp_series.reset_index(inplace= True)
# Inspect rows with missing ps_reg_03 next to their group mean
# (the merge result is only displayed, not assigned back).
train.loc[train['ps_reg_03'] == -1].merge(temp_series, on = ['ps_car_01_cat','ps_car_02_cat'] , how = 'left')
id | target | ps_ind_01 | ps_ind_02_cat | ps_ind_03 | ps_ind_04_cat | ps_ind_05_cat | ps_ind_06_bin | ps_ind_07_bin | ps_ind_08_bin | ps_ind_09_bin | ps_ind_10_bin | ps_ind_11_bin | ps_ind_12_bin | ps_ind_13_bin | ps_ind_14 | ps_ind_15 | ps_ind_16_bin | ps_ind_17_bin | ps_ind_18_bin | ps_reg_01 | ps_reg_02 | ps_reg_03_x | ps_car_01_cat | ps_car_02_cat | ps_car_04_cat | ps_car_06_cat | ps_car_07_cat | ps_car_08_cat | ps_car_09_cat | ps_car_10_cat | ps_car_11_cat | ps_car_11 | ps_car_12 | ps_car_13 | ps_car_14 | ps_car_15 | ps_calc_01 | ps_calc_02 | ps_calc_03 | ps_calc_04 | ps_calc_05 | ps_calc_06 | ps_calc_07 | ps_calc_08 | ps_calc_09 | ps_calc_10 | ps_calc_11 | ps_calc_12 | ps_calc_13 | ps_calc_14 | ps_calc_15_bin | ps_calc_16_bin | ps_calc_17_bin | ps_calc_18_bin | ps_calc_19_bin | ps_calc_20_bin | ps_reg_03_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1254786 | 0 | 7 | 1 | 9 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | 1 | 0 | 0 | 0.2 | 0.2 | -1.0 | 0 | 1 | 9 | 13 | 1 | 0 | 0 | 1 | 104 | 2 | 0.565685 | 2.108264 | 0.530094 | 3.741657 | 0.7 | 0.6 | 0.4 | 2 | 2 | 9 | 3 | 11 | 1 | 12 | 2 | 1 | 2 | 10 | 0 | 1 | 0 | 0 | 0 | 0 | 0.505736 |
1 | 1425558 | 0 | 1 | 2 | 0 | 1 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | 1 | 0 | 0 | 0.1 | 0.3 | -1.0 | 8 | 1 | 0 | 14 | 1 | 1 | 0 | 1 | 104 | 1 | 0.316070 | 0.508502 | 0.355668 | 1.732051 | 0.0 | 0.4 | 0.1 | 1 | 2 | 9 | 2 | 12 | 3 | 6 | 2 | 1 | 4 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0.515147 |
2 | 860206 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 1 | 0 | 0 | 0.4 | 0.1 | -1.0 | 7 | 1 | 0 | 1 | 1 | 1 | 2 | 1 | 64 | 1 | 0.316228 | 0.656405 | 0.361939 | 3.316625 | 0.4 | 0.8 | 0.3 | 3 | 1 | 6 | 3 | 7 | 2 | 7 | 6 | 0 | 3 | 7 | 1 | 0 | 1 | 0 | 1 | 0 | 0.333647 |
3 | 1265316 | 0 | 4 | 2 | 4 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | 1 | 0.1 | 0.3 | -1.0 | 4 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 65 | 1 | 0.316228 | 0.545795 | 0.350714 | 2.449490 | 0.6 | 0.6 | 0.5 | 1 | 1 | 7 | 1 | 12 | 6 | 6 | 6 | 2 | 4 | 10 | 0 | 1 | 1 | 1 | 0 | 0 | 0.244300 |
4 | 267652 | 0 | 2 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | 1 | 0 | 0 | 0.3 | 0.3 | -1.0 | 7 | 0 | 0 | 10 | 1 | 1 | 2 | 1 | 55 | 2 | 0.424264 | 1.116425 | 0.416533 | 3.605551 | 0.2 | 0.1 | 0.9 | 2 | 0 | 8 | 4 | 12 | 3 | 9 | 7 | 0 | 2 | 6 | 0 | 1 | 0 | 1 | 1 | 0 | 0.477232 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
38575 | 1486851 | 1 | 5 | 1 | 10 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 1 | 0 | 0.1 | 0.3 | -1.0 | 11 | 0 | 9 | 17 | 1 | 1 | 2 | 1 | 104 | 2 | 0.447214 | 1.209873 | 0.430116 | 3.316625 | 0.2 | 0.4 | 0.9 | 2 | 3 | 7 | 4 | 8 | 4 | 8 | 7 | 0 | 6 | 10 | 0 | 0 | 0 | 1 | 1 | 0 | 0.985341 |
38576 | 1487090 | 1 | 2 | 1 | 6 | 0 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | 1 | 0 | 0 | 0.1 | 0.3 | -1.0 | 7 | 1 | 0 | 0 | 1 | 1 | 2 | 1 | 37 | 2 | 0.316228 | 0.740728 | 0.311448 | 3.316625 | 0.4 | 0.7 | 0.0 | 1 | 2 | 9 | 6 | 9 | 1 | 10 | 6 | 1 | 3 | 15 | 0 | 0 | 1 | 0 | 0 | 1 | 0.333647 |
38577 | 1487406 | 1 | 3 | 1 | 8 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 1 | 0 | 0 | 0.4 | 0.0 | -1.0 | 7 | 1 | 0 | 1 | 1 | 1 | 2 | 1 | 64 | 3 | 0.316228 | 0.613586 | 0.301662 | 2.828427 | 0.9 | 0.5 | 0.5 | 3 | 1 | 10 | 2 | 9 | 1 | 3 | 5 | 0 | 3 | 4 | 0 | 1 | 0 | 0 | 1 | 0 | 0.333647 |
38578 | 1487419 | 1 | 2 | 1 | 6 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 1 | 0 | 0 | 0.3 | 0.3 | -1.0 | 4 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 34 | 2 | 0.400000 | 0.795156 | 0.378021 | 3.162278 | 0.7 | 0.1 | 0.9 | 3 | 4 | 9 | 3 | 9 | 2 | 8 | 11 | 1 | 5 | 5 | 0 | 1 | 1 | 0 | 0 | 0 | 0.244300 |
38579 | 1487566 | 1 | 1 | 1 | 5 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 1 | 0.3 | 0.4 | -1.0 | 11 | 0 | 0 | 15 | 1 | 1 | 2 | 1 | 5 | 2 | 0.424264 | 0.756979 | 0.400000 | 2.000000 | 0.3 | 0.4 | 0.6 | 1 | 1 | 8 | 2 | 9 | 5 | 9 | 9 | 2 | 1 | 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0.985341 |
38580 rows × 58 columns
# Impute the remaining -1 placeholders: column mean for the continuous
# columns, most-frequent value for the ordinal ps_car_11.
mean_imp = SimpleImputer(missing_values= -1, strategy='mean')
mode_imp = SimpleImputer(missing_values= -1, strategy ='most_frequent')
train['ps_reg_03'] = mean_imp.fit_transform(train[['ps_reg_03']]).ravel()
train['ps_car_12'] = mean_imp.fit_transform(train[['ps_car_12']]).ravel()
train['ps_car_14'] = mean_imp.fit_transform(train[['ps_car_14']]).ravel()
train['ps_car_11'] = mode_imp.fit_transform(train[['ps_car_11']]).ravel()
# Count distinct values per nominal column — the total is how many dummy
# columns a full one-hot encoding would create.
v = meta[(meta.level == 'nominal' ) & (meta.keep)].index
# NOTE(review): 'sum' shadows the built-in; left unchanged because a later
# cell displays this variable by name.
sum = 0
for f in v :
    dist_values = train[f].value_counts().shape[0]
    sum+= dist_values
    print('Variable {} has {} distinct values'.format(f, dist_values))
Variable ps_ind_02_cat has 5 distinct values
Variable ps_ind_04_cat has 3 distinct values
Variable ps_ind_05_cat has 8 distinct values
Variable ps_car_01_cat has 13 distinct values
Variable ps_car_02_cat has 3 distinct values
Variable ps_car_04_cat has 10 distinct values
Variable ps_car_06_cat has 18 distinct values
Variable ps_car_07_cat has 3 distinct values
Variable ps_car_08_cat has 2 distinct values
Variable ps_car_09_cat has 6 distinct values
Variable ps_car_10_cat has 3 distinct values
Variable ps_car_11_cat has 104 distinct values
train[f].value_counts().shape
(104,)
sum
178
def add_noise(series, noise_level):
    """Multiplicatively jitter *series*: scale each element by
    1 + noise_level * g with g drawn i.i.d. from N(0, 1)."""
    gaussian = np.random.randn(len(series))
    return series * (1.0 + noise_level * gaussian)
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed target-mean encoding for a categorical feature.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level : relative Gaussian noise applied to the encoded values
    Returns the encoded train series and test series (index preserved).
    """
    assert len(trn_series) == len(target)  # feature and target must be row-aligned
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)  # put feature and target side by side
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])  # per-category mean & count
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),  # rename the encoded column
        on=trn_series.name,  # merge on the raw category value
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)  # left join avoids dropping rows; unseen categories fall back to the prior
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)
# Target-encode the high-cardinality ps_car_11_cat (104 levels) and replace
# the raw column with its encoded version in both train and test.
train_encoded, test_encoded = target_encode(train["ps_car_11_cat"],
                                            test["ps_car_11_cat"],
                                            target=train.target,
                                            min_samples_leaf=100,
                                            smoothing=10,
                                            noise_level=0.01)
train['ps_car_11_cat_te'] = train_encoded
train.drop('ps_car_11_cat', axis=1, inplace=True)
meta.loc['ps_car_11_cat','keep'] = False  # Updating the meta
test['ps_car_11_cat_te'] = test_encoded
test.drop('ps_car_11_cat', axis=1, inplace=True)
뜯어보기
#v = meta[(meta.level == 'nominal') & (meta.keep)].index
#cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-50-b8a6304ecf30> in <module>
----> 1 cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()
C:\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
C:\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1550
1551 self._validate_read_indexer(
-> 1552 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1553 )
1554 return keyarr, indexer
C:\Anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1643 if not (self.name == "loc" and not raise_missing):
1644 not_found = list(set(key) - set(ax))
-> 1645 raise KeyError(f"{not_found} not in index")
1646
1647 # we skip the warning on Categorical/Interval
KeyError: "['ps_car_11_cat'] not in index"
#cat_perc
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-51-2ac23d9b437a> in <module>
----> 1 cat_perc
NameError: name 'cat_perc' is not defined
#cat_perc.sort_values(by='target', ascending=False, inplace = True)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-52-72bd5321c2f0> in <module>
----> 1 cat_perc.sort_values(by='target', ascending=False, inplace = True)
NameError: name 'cat_perc' is not defined
#sns.barplot(x=f, y = 'target', data=cat_perc,order = cat_perc[f])
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-53-579ec202cb69> in <module>
----> 1 sns.barplot(x=f, y = 'target', data=cat_perc,order = cat_perc[f])
NameError: name 'cat_perc' is not defined
다시 코드
# Bar plot of the target=1 rate per category, for every kept nominal column.
v = meta[(meta.level == 'nominal') & (meta.keep)].index
for f in v:
    # BUGFIX: a stray plt.figure() here created one extra empty figure per
    # iteration (the "<Figure ... with 0 Axes>" outputs); plt.subplots()
    # below already creates the figure that is drawn on.
    fig, ax = plt.subplots(figsize=(20,10))
    # Calculate the percentage of target=1 per category value
    cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()
    cat_perc.sort_values(by='target', ascending=False, inplace=True)
    # Bar plot, bars ordered descending on target mean
    sns.barplot(ax=ax, x=f, y='target', data=cat_perc, order=cat_perc[f])
    plt.ylabel('% target', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show();
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
cat_perc
ps_car_10_cat | target | |
---|---|---|
1 | 1 | 0.100029 |
2 | 2 | 0.100000 |
0 | 0 | 0.096420 |
f = 'ps_car_02_cat'
cat_perc = train[[f, 'target']].groupby([f], as_index=False).agg(['mean','count'])
cat_perc
target | ||
---|---|---|
mean | count | |
ps_car_02_cat | ||
-1 | 0.000000 | 2 |
0 | 0.131868 | 38000 |
1 | 0.093233 | 178938 |
- 결측 범주(-1)는 관측값이 2개뿐이고 둘 다 target=0이어서 평균이 0으로 나타남.
- 여기서는 결측치가 의미가 있다고 판단하여 삭제하지 않음.
# Pairwise Pearson correlations of the interval (continuous) variables.
v = meta[(meta.level == 'interval') & (meta.keep)].index
correlations = train[v].corr()
correlations
ps_reg_01 | ps_reg_02 | ps_reg_03 | ps_car_12 | ps_car_13 | ps_car_14 | ps_car_15 | ps_calc_01 | ps_calc_02 | ps_calc_03 | |
---|---|---|---|---|---|---|---|---|---|---|
ps_reg_01 | 1.000000 | 0.470953 | 0.137117 | 0.019095 | 0.025243 | -0.002536 | 0.001755 | -0.003236 | 0.001459 | -0.001371 |
ps_reg_02 | 0.470953 | 1.000000 | 0.702512 | 0.173736 | 0.193896 | 0.053149 | 0.052344 | -0.001769 | -0.000726 | -0.000992 |
ps_reg_03 | 0.137117 | 0.702512 | 1.000000 | 0.208978 | 0.241244 | 0.079541 | 0.079848 | -0.000223 | 0.000043 | -0.000357 |
ps_car_12 | 0.019095 | 0.173736 | 0.208978 | 1.000000 | 0.674298 | 0.577537 | 0.049468 | -0.000452 | -0.001070 | -0.000707 |
ps_car_13 | 0.025243 | 0.193896 | 0.241244 | 0.674298 | 1.000000 | 0.434613 | 0.526024 | 0.000266 | 0.000020 | 0.000568 |
ps_car_14 | -0.002536 | 0.053149 | 0.079541 | 0.577537 | 0.434613 | 1.000000 | 0.008472 | -0.004548 | -0.005015 | 0.000776 |
ps_car_15 | 0.001755 | 0.052344 | 0.079848 | 0.049468 | 0.526024 | 0.008472 | 1.000000 | -0.000392 | 0.003630 | 0.000586 |
ps_calc_01 | -0.003236 | -0.001769 | -0.000223 | -0.000452 | 0.000266 | -0.004548 | -0.000392 | 1.000000 | 0.002832 | -0.000212 |
ps_calc_02 | 0.001459 | -0.000726 | 0.000043 | -0.001070 | 0.000020 | -0.005015 | 0.003630 | 0.002832 | 1.000000 | 0.003130 |
ps_calc_03 | -0.001371 | -0.000992 | -0.000357 | -0.000707 | 0.000568 | 0.000776 | 0.000586 | -0.000212 | 0.003130 | 1.000000 |
# Annotated correlation heatmap; diverging palette centred on zero so the
# sign of each correlation stands out. (One-off version of the corr_heatmap
# helper defined just below.)
cmap = sns.diverging_palette(220, 10, as_cmap=True)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(correlations, cmap=cmap, vmax=1.0, center=0, fmt='.2f',
            square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .75})
plt.show();
def corr_heatmap(v):
    """Draw an annotated correlation heatmap for the train columns in *v*."""
    corr = train[v].corr()
    # Diverging palette centred on zero so positive/negative correlations
    # are visually distinct.
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(corr, cmap=palette, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True,
                cbar_kws={"shrink": .75})
    plt.show();
v = meta[(meta.level == 'interval') & (meta.keep)].index
corr_heatmap(v)
# Regression scatter plots on a 10% sample, coloured by target, for the
# variable pairs with the strongest correlations in the heatmap above.
s = train.sample(frac=0.1)
sns.lmplot(x='ps_reg_02', y='ps_reg_03', data=s, hue= 'target', palette='Set1',scatter_kws = {'alpha' : 0.3})
plt.show()
sns.lmplot(x='ps_car_12', y='ps_car_13', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()
sns.lmplot(x='ps_car_12', y='ps_car_14', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()
sns.lmplot(x='ps_car_15', y='ps_car_13', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()
# Same heatmap for the ordinal variables.
v = meta[(meta.level == 'ordinal') & (meta.keep)].index
corr_heatmap(v)
# NOTE: PCA could also be used to decide which correlated features to keep.
# One-hot encode the kept nominal variables (drop_first avoids the dummy trap).
nominal_cols = meta[(meta.level == 'nominal') & (meta.keep)].index
print('Before dummification we have {} variables in train'.format(train.shape[1]))
train = pd.get_dummies(train, columns=nominal_cols, drop_first=True)
print('After dummification we have {} variables in train'.format(train.shape[1]))
Before dummification we have 57 variables in train
After dummification we have 109 variables in train
# Degree-2 polynomial expansion of the interval variables:
# squares plus all pairwise interactions, without the constant bias column.
v = meta[(meta.level == 'interval') & (meta.keep)].index
poly = PolynomialFeatures(degree=2, include_bias=False,
                          interaction_only=False)  # up to degree 2
poly.fit_transform(train[v])
array([[0.6 , 0.6 , 0.83815273, ..., 0.09 , 0. ,
0. ],
[0.9 , 0.6 , 0.72844011, ..., 0. , 0. ,
0.36 ],
[0.9 , 0.6 , 0.86926693, ..., 0.81 , 0.09 ,
0.01 ],
...,
[0.9 , 0.3 , 0.71195154, ..., 0.16 , 0.24 ,
0.36 ],
[0.6 , 0.1 , 0.57716982, ..., 0.04 , 0.12 ,
0.36 ],
[0.6 , 0.4 , 1.09515981, ..., 0.36 , 0.18 ,
0.09 ]])
# Names of the expanded polynomial features.
# sklearn >= 1.0 renamed get_feature_names -> get_feature_names_out and the
# old name was removed in 1.2; pick whichever this sklearn version provides.
(poly.get_feature_names_out(v) if hasattr(poly, 'get_feature_names_out')
 else poly.get_feature_names(v))
['ps_reg_01',
'ps_reg_02',
'ps_reg_03',
'ps_car_12',
'ps_car_13',
'ps_car_14',
'ps_car_15',
'ps_calc_01',
'ps_calc_02',
'ps_calc_03',
'ps_reg_01^2',
'ps_reg_01 ps_reg_02',
'ps_reg_01 ps_reg_03',
'ps_reg_01 ps_car_12',
'ps_reg_01 ps_car_13',
'ps_reg_01 ps_car_14',
'ps_reg_01 ps_car_15',
'ps_reg_01 ps_calc_01',
'ps_reg_01 ps_calc_02',
'ps_reg_01 ps_calc_03',
'ps_reg_02^2',
'ps_reg_02 ps_reg_03',
'ps_reg_02 ps_car_12',
'ps_reg_02 ps_car_13',
'ps_reg_02 ps_car_14',
'ps_reg_02 ps_car_15',
'ps_reg_02 ps_calc_01',
'ps_reg_02 ps_calc_02',
'ps_reg_02 ps_calc_03',
'ps_reg_03^2',
'ps_reg_03 ps_car_12',
'ps_reg_03 ps_car_13',
'ps_reg_03 ps_car_14',
'ps_reg_03 ps_car_15',
'ps_reg_03 ps_calc_01',
'ps_reg_03 ps_calc_02',
'ps_reg_03 ps_calc_03',
'ps_car_12^2',
'ps_car_12 ps_car_13',
'ps_car_12 ps_car_14',
'ps_car_12 ps_car_15',
'ps_car_12 ps_calc_01',
'ps_car_12 ps_calc_02',
'ps_car_12 ps_calc_03',
'ps_car_13^2',
'ps_car_13 ps_car_14',
'ps_car_13 ps_car_15',
'ps_car_13 ps_calc_01',
'ps_car_13 ps_calc_02',
'ps_car_13 ps_calc_03',
'ps_car_14^2',
'ps_car_14 ps_car_15',
'ps_car_14 ps_calc_01',
'ps_car_14 ps_calc_02',
'ps_car_14 ps_calc_03',
'ps_car_15^2',
'ps_car_15 ps_calc_01',
'ps_car_15 ps_calc_02',
'ps_car_15 ps_calc_03',
'ps_calc_01^2',
'ps_calc_01 ps_calc_02',
'ps_calc_01 ps_calc_03',
'ps_calc_02^2',
'ps_calc_02 ps_calc_03',
'ps_calc_03^2']
# Inspect the expanded feature matrix as a labelled DataFrame.
# get_feature_names was removed in sklearn 1.2 in favour of
# get_feature_names_out; support both for compatibility.
poly_names = (poly.get_feature_names_out(v)
              if hasattr(poly, 'get_feature_names_out')
              else poly.get_feature_names(v))
pd.DataFrame(data=poly.fit_transform(train[v]), columns=poly_names)
ps_reg_01 | ps_reg_02 | ps_reg_03 | ps_car_12 | ps_car_13 | ps_car_14 | ps_car_15 | ps_calc_01 | ps_calc_02 | ps_calc_03 | ps_reg_01^2 | ps_reg_01 ps_reg_02 | ps_reg_01 ps_reg_03 | ps_reg_01 ps_car_12 | ps_reg_01 ps_car_13 | ps_reg_01 ps_car_14 | ps_reg_01 ps_car_15 | ps_reg_01 ps_calc_01 | ps_reg_01 ps_calc_02 | ps_reg_01 ps_calc_03 | ps_reg_02^2 | ps_reg_02 ps_reg_03 | ps_reg_02 ps_car_12 | ps_reg_02 ps_car_13 | ps_reg_02 ps_car_14 | ps_reg_02 ps_car_15 | ps_reg_02 ps_calc_01 | ps_reg_02 ps_calc_02 | ps_reg_02 ps_calc_03 | ps_reg_03^2 | ps_reg_03 ps_car_12 | ps_reg_03 ps_car_13 | ps_reg_03 ps_car_14 | ps_reg_03 ps_car_15 | ps_reg_03 ps_calc_01 | ps_reg_03 ps_calc_02 | ps_reg_03 ps_calc_03 | ps_car_12^2 | ps_car_12 ps_car_13 | ps_car_12 ps_car_14 | ps_car_12 ps_car_15 | ps_car_12 ps_calc_01 | ps_car_12 ps_calc_02 | ps_car_12 ps_calc_03 | ps_car_13^2 | ps_car_13 ps_car_14 | ps_car_13 ps_car_15 | ps_car_13 ps_calc_01 | ps_car_13 ps_calc_02 | ps_car_13 ps_calc_03 | ps_car_14^2 | ps_car_14 ps_car_15 | ps_car_14 ps_calc_01 | ps_car_14 ps_calc_02 | ps_car_14 ps_calc_03 | ps_car_15^2 | ps_car_15 ps_calc_01 | ps_car_15 ps_calc_02 | ps_car_15 ps_calc_03 | ps_calc_01^2 | ps_calc_01 ps_calc_02 | ps_calc_01 ps_calc_03 | ps_calc_02^2 | ps_calc_02 ps_calc_03 | ps_calc_03^2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.6 | 0.6 | 0.838153 | 0.368782 | 0.540603 | 0.345688 | 2.000000 | 0.9 | 0.3 | 0.0 | 0.36 | 0.36 | 0.502892 | 0.221269 | 0.324362 | 0.207413 | 1.200000 | 0.54 | 0.18 | 0.00 | 0.36 | 0.502892 | 0.221269 | 0.324362 | 0.207413 | 1.200000 | 0.54 | 0.18 | 0.00 | 0.702500 | 0.309095 | 0.453108 | 0.289739 | 1.676305 | 0.754337 | 0.251446 | 0.000000 | 0.136 | 0.199365 | 0.127483 | 0.737564 | 0.331904 | 0.110635 | 0.000000 | 0.292252 | 0.186880 | 1.081207 | 0.486543 | 0.162181 | 0.000000 | 0.1195 | 0.691375 | 0.311119 | 0.103706 | 0.000000 | 4.0 | 1.800000 | 0.600000 | 0.000000 | 0.81 | 0.27 | 0.00 | 0.09 | 0.00 | 0.00 |
1 | 0.9 | 0.6 | 0.728440 | 0.424264 | 0.382953 | 0.378814 | 0.000000 | 0.2 | 0.0 | 0.6 | 0.81 | 0.54 | 0.655596 | 0.381838 | 0.344658 | 0.340933 | 0.000000 | 0.18 | 0.00 | 0.54 | 0.36 | 0.437064 | 0.254558 | 0.229772 | 0.227288 | 0.000000 | 0.12 | 0.00 | 0.36 | 0.530625 | 0.309051 | 0.278958 | 0.275943 | 0.000000 | 0.145688 | 0.000000 | 0.437064 | 0.180 | 0.162473 | 0.160717 | 0.000000 | 0.084853 | 0.000000 | 0.254558 | 0.146653 | 0.145068 | 0.000000 | 0.076591 | 0.000000 | 0.229772 | 0.1435 | 0.000000 | 0.075763 | 0.000000 | 0.227288 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.04 | 0.00 | 0.12 | 0.00 | 0.00 | 0.36 |
2 | 0.9 | 0.6 | 0.869267 | 0.400000 | 0.814271 | 0.402368 | 3.316625 | 0.3 | 0.9 | 0.1 | 0.81 | 0.54 | 0.782340 | 0.360000 | 0.732844 | 0.362131 | 2.984962 | 0.27 | 0.81 | 0.09 | 0.36 | 0.521560 | 0.240000 | 0.488563 | 0.241421 | 1.989975 | 0.18 | 0.54 | 0.06 | 0.755625 | 0.347707 | 0.707819 | 0.349765 | 2.883032 | 0.260780 | 0.782340 | 0.086927 | 0.160 | 0.325708 | 0.160947 | 1.326650 | 0.120000 | 0.360000 | 0.040000 | 0.663037 | 0.327637 | 2.700631 | 0.244281 | 0.732844 | 0.081427 | 0.1619 | 1.334504 | 0.120710 | 0.362131 | 0.040237 | 11.0 | 0.994987 | 2.984962 | 0.331662 | 0.09 | 0.27 | 0.03 | 0.81 | 0.09 | 0.01 |
3 | 0.6 | 1.5 | 1.705872 | 0.400000 | 0.838387 | 0.378418 | 3.605551 | 0.8 | 0.4 | 0.1 | 0.36 | 0.90 | 1.023523 | 0.240000 | 0.503032 | 0.227051 | 2.163331 | 0.48 | 0.24 | 0.06 | 2.25 | 2.558808 | 0.600000 | 1.257580 | 0.567627 | 5.408327 | 1.20 | 0.60 | 0.15 | 2.910000 | 0.682349 | 1.430181 | 0.645532 | 6.150610 | 1.364698 | 0.682349 | 0.170587 | 0.160 | 0.335355 | 0.151367 | 1.442221 | 0.320000 | 0.160000 | 0.040000 | 0.702893 | 0.317260 | 3.022847 | 0.670710 | 0.335355 | 0.083839 | 0.1432 | 1.364405 | 0.302734 | 0.151367 | 0.037842 | 13.0 | 2.884441 | 1.442221 | 0.360555 | 0.64 | 0.32 | 0.08 | 0.16 | 0.04 | 0.01 |
4 | 0.8 | 0.8 | 1.086566 | 0.400000 | 0.905777 | 0.384838 | 3.605551 | 0.6 | 0.5 | 0.9 | 0.64 | 0.64 | 0.869253 | 0.320000 | 0.724622 | 0.307870 | 2.884441 | 0.48 | 0.40 | 0.72 | 0.64 | 0.869253 | 0.320000 | 0.724622 | 0.307870 | 2.884441 | 0.48 | 0.40 | 0.72 | 1.180625 | 0.434626 | 0.984186 | 0.418151 | 3.917668 | 0.651939 | 0.543283 | 0.977909 | 0.160 | 0.362311 | 0.153935 | 1.442221 | 0.240000 | 0.200000 | 0.360000 | 0.820432 | 0.348577 | 3.265825 | 0.543466 | 0.452888 | 0.815199 | 0.1481 | 1.387552 | 0.230903 | 0.192419 | 0.346354 | 13.0 | 2.163331 | 1.802776 | 3.244996 | 0.36 | 0.30 | 0.54 | 0.25 | 0.45 | 0.81 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
216935 | 0.6 | 0.4 | 1.537652 | 0.424264 | 1.269111 | 0.384708 | 3.162278 | 0.5 | 0.1 | 0.5 | 0.36 | 0.24 | 0.922591 | 0.254558 | 0.761467 | 0.230825 | 1.897367 | 0.30 | 0.06 | 0.30 | 0.16 | 0.615061 | 0.169706 | 0.507645 | 0.153883 | 1.264911 | 0.20 | 0.04 | 0.20 | 2.364375 | 0.652371 | 1.951452 | 0.591547 | 4.862484 | 0.768826 | 0.153765 | 0.768826 | 0.180 | 0.538438 | 0.163218 | 1.341641 | 0.212132 | 0.042426 | 0.212132 | 1.610644 | 0.488237 | 4.013282 | 0.634556 | 0.126911 | 0.634556 | 0.1480 | 1.216553 | 0.192354 | 0.038471 | 0.192354 | 10.0 | 1.581139 | 0.316228 | 1.581139 | 0.25 | 0.05 | 0.25 | 0.01 | 0.05 | 0.25 |
216936 | 0.3 | 0.4 | 0.898861 | 0.424264 | 0.756979 | 0.400000 | 2.000000 | 0.3 | 0.4 | 0.6 | 0.09 | 0.12 | 0.269658 | 0.127279 | 0.227094 | 0.120000 | 0.600000 | 0.09 | 0.12 | 0.18 | 0.16 | 0.359544 | 0.169706 | 0.302791 | 0.160000 | 0.800000 | 0.12 | 0.16 | 0.24 | 0.807951 | 0.381354 | 0.680419 | 0.359544 | 1.797722 | 0.269658 | 0.359544 | 0.539317 | 0.180 | 0.321159 | 0.169706 | 0.848528 | 0.127279 | 0.169706 | 0.254558 | 0.573017 | 0.302791 | 1.513957 | 0.227094 | 0.302791 | 0.454187 | 0.1600 | 0.800000 | 0.120000 | 0.160000 | 0.240000 | 4.0 | 0.600000 | 0.800000 | 1.200000 | 0.09 | 0.12 | 0.18 | 0.16 | 0.24 | 0.36 |
216937 | 0.9 | 0.3 | 0.711952 | 0.400000 | 0.970654 | 0.372424 | 3.464102 | 0.5 | 0.4 | 0.6 | 0.81 | 0.27 | 0.640756 | 0.360000 | 0.873589 | 0.335182 | 3.117691 | 0.45 | 0.36 | 0.54 | 0.09 | 0.213585 | 0.120000 | 0.291196 | 0.111727 | 1.039230 | 0.15 | 0.12 | 0.18 | 0.506875 | 0.284781 | 0.691059 | 0.265148 | 2.466272 | 0.355976 | 0.284781 | 0.427171 | 0.160 | 0.388262 | 0.148970 | 1.385641 | 0.200000 | 0.160000 | 0.240000 | 0.942169 | 0.361495 | 3.362445 | 0.485327 | 0.388262 | 0.582392 | 0.1387 | 1.290116 | 0.186212 | 0.148970 | 0.223455 | 12.0 | 1.732051 | 1.385641 | 2.078461 | 0.25 | 0.20 | 0.30 | 0.16 | 0.24 | 0.36 |
216938 | 0.6 | 0.1 | 0.577170 | 0.316228 | 0.876295 | 0.320780 | 3.741657 | 0.5 | 0.2 | 0.6 | 0.36 | 0.06 | 0.346302 | 0.189737 | 0.525777 | 0.192468 | 2.244994 | 0.30 | 0.12 | 0.36 | 0.01 | 0.057717 | 0.031623 | 0.087629 | 0.032078 | 0.374166 | 0.05 | 0.02 | 0.06 | 0.333125 | 0.182517 | 0.505771 | 0.185145 | 2.159572 | 0.288585 | 0.115434 | 0.346302 | 0.100 | 0.277109 | 0.101440 | 1.183216 | 0.158114 | 0.063246 | 0.189737 | 0.767893 | 0.281098 | 3.278795 | 0.438147 | 0.175259 | 0.525777 | 0.1029 | 1.200250 | 0.160390 | 0.064156 | 0.192468 | 14.0 | 1.870829 | 0.748331 | 2.244994 | 0.25 | 0.10 | 0.30 | 0.04 | 0.12 | 0.36 |
216939 | 0.6 | 0.4 | 1.095160 | 0.374166 | 0.752558 | 0.328634 | 3.464102 | 0.2 | 0.6 | 0.3 | 0.36 | 0.24 | 0.657096 | 0.224499 | 0.451535 | 0.197180 | 2.078461 | 0.12 | 0.36 | 0.18 | 0.16 | 0.438064 | 0.149666 | 0.301023 | 0.131453 | 1.385641 | 0.08 | 0.24 | 0.12 | 1.199375 | 0.409771 | 0.824171 | 0.359906 | 3.793745 | 0.219032 | 0.657096 | 0.328548 | 0.140 | 0.281581 | 0.122963 | 1.296148 | 0.074833 | 0.224499 | 0.112250 | 0.566343 | 0.247316 | 2.606936 | 0.150512 | 0.451535 | 0.225767 | 0.1080 | 1.138420 | 0.065727 | 0.197180 | 0.098590 | 12.0 | 0.692820 | 2.078461 | 1.039230 | 0.04 | 0.12 | 0.06 | 0.36 | 0.18 | 0.09 |
216940 rows × 65 columns
# Build the interaction/square features and append them to train, then flag
# columns with too little variance to be useful.
# (sklearn >= 1.2 removed get_feature_names; use get_feature_names_out there.)
poly_names = (poly.get_feature_names_out(v)
              if hasattr(poly, 'get_feature_names_out')
              else poly.get_feature_names(v))
interactions = pd.DataFrame(data=poly.fit_transform(train[v]),
                            columns=poly_names)
interactions.drop(v, axis=1, inplace=True)  # drop the degree-1 originals
# Concatenate column-wise (axis=1); axis=0 would stack extra rows/samples.
train = pd.concat([train, interactions], axis=1)
selector = VarianceThreshold(threshold=.01)
selector.fit(train.drop(['id', 'target'], axis=1))
# get_support() marks the columns kept; invert the mask to list the dropped.
v = train.drop(['id', 'target'], axis=1).columns[~selector.get_support()]
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))
28 variables have too low variance.
These variavels are ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_12', 'ps_car_14', 'ps_car_11_cat_te', 'ps_ind_05_cat_2', 'ps_ind_05_cat_5', 'ps_car_01_cat_1', 'ps_car_01_cat_2', 'ps_car_04_cat_3', 'ps_car_04_cat_4', 'ps_car_04_cat_5', 'ps_car_04_cat_6', 'ps_car_04_cat_7', 'ps_car_06_cat_2', 'ps_car_06_cat_5', 'ps_car_06_cat_8', 'ps_car_06_cat_12', 'ps_car_06_cat_16', 'ps_car_06_cat_17', 'ps_car_09_cat_4', 'ps_car_10_cat_1', 'ps_car_10_cat_2', 'ps_car_12^2', 'ps_car_12 ps_car_14', 'ps_car_14^2']
이유한 님 팁
- 1000개의 피쳐가 있을 때 피쳐를 20개씩 뺐다 넣다 하면서 성능을 보자!!
- 20개 모델 baseline + random choosing 20
- 40 모델 학습
- if 성능 향상 -> feature importance 상위 10%에 새로 추가된 게 생기면 향상된 걸 남기고
- 안생기면 계속 random choosing
- 반복
# Fit a random forest and print every feature ranked by importance.
X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']
feat_labels = X_train.columns
rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
importances = rf.feature_importances_
# Column indices sorted from most to least important.
indices = np.argsort(importances)[::-1]
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))
1) ps_car_11_cat_te 0.021144
2) ps_car_12 ps_car_13 0.017390
3) ps_car_13 0.017361
4) ps_car_13^2 0.017307
5) ps_reg_03 ps_car_13 0.017075
6) ps_car_13 ps_car_14 0.017067
7) ps_car_13 ps_car_15 0.016823
8) ps_reg_01 ps_car_13 0.016773
9) ps_reg_03 ps_car_14 0.016233
10) ps_reg_03 ps_car_12 0.015462
11) ps_reg_03 ps_car_15 0.015181
12) ps_car_14 ps_car_15 0.015056
13) ps_car_13 ps_calc_02 0.014764
14) ps_car_13 ps_calc_01 0.014746
15) ps_reg_02 ps_car_13 0.014710
16) ps_car_13 ps_calc_03 0.014702
17) ps_reg_01 ps_reg_03 0.014665
18) ps_reg_01 ps_car_14 0.014378
19) ps_reg_03^2 0.014235
20) ps_reg_03 0.014196
21) ps_reg_03 ps_calc_03 0.013807
22) ps_reg_03 ps_calc_02 0.013738
23) ps_reg_03 ps_calc_01 0.013705
24) ps_car_14 ps_calc_02 0.013652
25) ps_calc_10 0.013646
26) ps_car_14 ps_calc_03 0.013537
27) ps_car_14 ps_calc_01 0.013527
28) ps_calc_14 0.013388
29) ps_car_12 ps_car_14 0.012970
30) ps_ind_03 0.012921
31) ps_car_14 0.012741
32) ps_car_14^2 0.012730
33) ps_reg_02 ps_car_14 0.012697
34) ps_calc_11 0.012619
35) ps_reg_02 ps_reg_03 0.012489
36) ps_ind_15 0.012116
37) ps_car_12 ps_car_15 0.010944
38) ps_car_15 ps_calc_01 0.010857
39) ps_car_15 ps_calc_03 0.010839
40) ps_car_15 ps_calc_02 0.010837
41) ps_car_12 ps_calc_01 0.010477
42) ps_calc_13 0.010477
43) ps_car_12 ps_calc_03 0.010310
44) ps_car_12 ps_calc_02 0.010296
45) ps_reg_02 ps_car_15 0.010205
46) ps_reg_01 ps_car_15 0.010177
47) ps_calc_02 ps_calc_03 0.010077
48) ps_calc_01 ps_calc_02 0.010013
49) ps_calc_01 ps_calc_03 0.010005
50) ps_calc_08 0.009867
51) ps_calc_07 0.009857
52) ps_reg_01 ps_car_12 0.009473
53) ps_reg_02 ps_car_12 0.009319
54) ps_reg_02 ps_calc_01 0.009294
55) ps_reg_02 ps_calc_03 0.009237
56) ps_reg_02 ps_calc_02 0.009146
57) ps_calc_06 0.009092
58) ps_reg_01 ps_calc_02 0.009054
59) ps_reg_01 ps_calc_03 0.009041
60) ps_reg_01 ps_calc_01 0.009020
61) ps_calc_09 0.008794
62) ps_ind_01 0.008606
63) ps_calc_05 0.008298
64) ps_calc_04 0.008168
65) ps_calc_12 0.008015
66) ps_reg_01 ps_reg_02 0.008015
67) ps_car_15 0.006130
68) ps_car_15^2 0.006130
69) ps_calc_03 0.006001
70) ps_calc_01^2 0.005975
71) ps_calc_01 0.005964
72) ps_calc_03^2 0.005964
73) ps_calc_02 0.005950
74) ps_calc_02^2 0.005943
75) ps_car_12 0.005358
76) ps_car_12^2 0.005348
77) ps_reg_02^2 0.004993
78) ps_reg_02 0.004986
79) ps_reg_01^2 0.004140
80) ps_reg_01 0.004118
81) ps_car_11 0.003796
82) ps_ind_05_cat_0 0.003564
83) ps_ind_17_bin 0.002840
84) ps_calc_17_bin 0.002701
85) ps_calc_16_bin 0.002597
86) ps_calc_19_bin 0.002554
87) ps_calc_18_bin 0.002529
88) ps_ind_04_cat_1 0.002405
89) ps_car_01_cat_11 0.002399
90) ps_ind_16_bin 0.002393
91) ps_ind_04_cat_0 0.002378
92) ps_ind_07_bin 0.002333
93) ps_car_09_cat_2 0.002313
94) ps_ind_02_cat_1 0.002269
95) ps_car_09_cat_0 0.002100
96) ps_car_01_cat_7 0.002089
97) ps_ind_02_cat_2 0.002078
98) ps_calc_20_bin 0.002072
99) ps_ind_06_bin 0.002041
100) ps_car_06_cat_1 0.002002
101) ps_calc_15_bin 0.001996
102) ps_car_07_cat_1 0.001966
103) ps_ind_08_bin 0.001946
104) ps_car_09_cat_1 0.001828
105) ps_car_06_cat_11 0.001787
106) ps_ind_18_bin 0.001739
107) ps_ind_09_bin 0.001719
108) ps_car_01_cat_10 0.001598
109) ps_car_01_cat_9 0.001577
110) ps_car_01_cat_6 0.001549
111) ps_car_06_cat_14 0.001547
112) ps_car_01_cat_4 0.001530
113) ps_ind_05_cat_6 0.001501
114) ps_ind_02_cat_3 0.001432
115) ps_car_07_cat_0 0.001369
116) ps_car_02_cat_1 0.001337
117) ps_car_01_cat_8 0.001330
118) ps_car_08_cat_1 0.001327
119) ps_car_02_cat_0 0.001313
120) ps_car_06_cat_4 0.001225
121) ps_ind_05_cat_4 0.001216
122) ps_ind_02_cat_4 0.001156
123) ps_car_01_cat_5 0.001143
124) ps_car_06_cat_6 0.001095
125) ps_car_06_cat_10 0.001055
126) ps_car_04_cat_1 0.001036
127) ps_ind_05_cat_2 0.001030
128) ps_car_06_cat_7 0.001003
129) ps_car_04_cat_2 0.000980
130) ps_car_01_cat_3 0.000885
131) ps_car_09_cat_3 0.000883
132) ps_ind_14 0.000862
133) ps_car_01_cat_0 0.000854
134) ps_car_06_cat_15 0.000831
135) ps_car_06_cat_9 0.000785
136) ps_ind_05_cat_1 0.000755
137) ps_car_10_cat_1 0.000704
138) ps_car_06_cat_3 0.000698
139) ps_ind_05_cat_3 0.000685
140) ps_ind_12_bin 0.000671
141) ps_car_09_cat_4 0.000631
142) ps_car_01_cat_2 0.000569
143) ps_car_04_cat_8 0.000557
144) ps_car_06_cat_17 0.000513
145) ps_car_06_cat_16 0.000454
146) ps_car_04_cat_9 0.000443
147) ps_car_06_cat_12 0.000420
148) ps_car_06_cat_13 0.000396
149) ps_car_01_cat_1 0.000381
150) ps_ind_05_cat_5 0.000307
151) ps_car_06_cat_5 0.000284
152) ps_ind_11_bin 0.000217
153) ps_car_04_cat_6 0.000193
154) ps_ind_13_bin 0.000150
155) ps_car_04_cat_3 0.000141
156) ps_car_06_cat_2 0.000137
157) ps_car_04_cat_5 0.000100
158) ps_car_06_cat_8 0.000093
159) ps_car_04_cat_7 0.000083
160) ps_ind_10_bin 0.000074
161) ps_car_10_cat_2 0.000058
162) ps_car_04_cat_4 0.000042
# Keep only the features whose importance reaches the median importance
# of the already-fitted forest (prefit=True: reuse rf, no refit).
sfm = SelectFromModel(rf, threshold='median', prefit=True)
before = X_train.shape[1]
print('Number of features before selection: {}'.format(before))
reduced = sfm.transform(X_train)
n_features = reduced.shape[1]
print('Number of features after selection: {}'.format(n_features))
support_mask = sfm.get_support()
selected_vars = list(feat_labels[support_mask])
Number of features before selection: 162
Number of features after selection: 81
# Restrict train to the selected features (plus target) and z-score them.
train = train[selected_vars + ['target']]
feature_matrix = train.drop(['target'], axis=1)
scaler = StandardScaler()
scaler.fit_transform(feature_matrix)
array([[-0.45941104, -1.26665356, 1.05087653, ..., -0.72553616,
-1.01071913, -1.06173767],
[ 1.55538958, 0.95034274, -0.63847299, ..., -1.06120876,
-1.01071913, 0.27907892],
[ 1.05168943, -0.52765479, -0.92003125, ..., 1.95984463,
-0.56215309, -1.02449277],
...,
[-0.9631112 , 0.58084336, 0.48776003, ..., -0.46445747,
0.18545696, 0.27907892],
[-0.9631112 , -0.89715418, -1.48314775, ..., -0.91202093,
-0.41263108, 0.27907892],
[-0.45941104, -1.26665356, 1.61399304, ..., 0.28148164,
-0.11358706, -0.72653353]])
Comments