Kaggle 필사 1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
%%time
# Load the Porto Seguro train/test sets.
# NOTE(review): the nested "train.csv/train.csv" path suggests each zip was
# extracted into a folder named after the archive — confirm the layout.
train= pd.read_csv("./porto/train.csv/train.csv")
test= pd.read_csv("./porto/test.csv/test.csv")

Wall time: 6.47 s
# train = train.sample(frac = 0.2)  # subsample here when doing EDA on the full data
# If the data is imbalanced, a stratified split keeps the target ratio intact:
from sklearn.model_selection import StratifiedKFold
#fold = StratifiedKFold(n_splits=10, random_state=1980, shuffle = True)
#for trn_idx, val_idx in fold.split(train, train['target']) :
#    break
#train = train.iloc[trn_idx]
train.shape
(595212, 59)
train.tail()
id target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_14 ps_ind_15 ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_reg_01 ps_reg_02 ps_reg_03 ps_car_01_cat ps_car_02_cat ps_car_03_cat ps_car_04_cat ps_car_05_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04 ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08 ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
595207 1488013 0 3 1 10 0 0 0 0 0 1 0 0 0 0 0 13 1 0 0 0.5 0.3 0.692820 10 1 -1 0 1 1 1 1 0 1 31 3 0.374166 0.684631 0.385487 2.645751 0.4 0.5 0.3 3 0 9 0 9 1 12 4 1 9 6 0 1 1 0 1 1
595208 1488016 0 5 1 3 0 0 0 0 0 1 0 0 0 0 0 6 1 0 0 0.9 0.7 1.382027 9 1 -1 0 -1 15 0 0 2 1 63 2 0.387298 0.972145 -1.000000 3.605551 0.2 0.2 0.0 2 4 8 6 8 2 12 4 1 3 8 1 0 1 0 1 1
595209 1488017 0 1 1 10 0 0 1 0 0 0 0 0 0 0 0 12 1 0 0 0.9 0.2 0.659071 7 1 -1 0 -1 1 1 1 2 1 31 3 0.397492 0.596373 0.398748 1.732051 0.4 0.0 0.3 3 2 7 4 8 0 10 3 2 2 6 0 0 1 0 0 0
595210 1488021 0 5 2 3 1 0 0 0 1 0 0 0 0 0 0 12 1 0 0 0.9 0.4 0.698212 11 1 -1 0 -1 11 1 1 2 1 101 3 0.374166 0.764434 0.384968 3.162278 0.0 0.7 0.0 4 0 9 4 9 2 11 4 1 4 2 0 1 1 1 0 0
595211 1488027 0 0 1 8 0 0 1 0 0 0 0 0 0 0 0 7 1 0 0 0.1 0.2 -1.000000 7 0 -1 0 -1 0 1 0 2 1 34 2 0.400000 0.932649 0.378021 3.741657 0.4 0.0 0.5 2 3 10 4 10 2 5 4 4 3 8 0 1 0 0 0 0
# Collect every categorical feature; by convention their names contain "cat".
cat_cols = [name for name in train.columns if 'cat' in name]
cat_cols
['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']
train[cat_cols[0]].value_counts()
 1    431859
 2    123573
 3     28186
 4     11378
-1       216
Name: ps_ind_02_cat, dtype: int64
# Print the cardinality of each categorical feature.
for column in cat_cols:
    print(column, train[column].nunique())
ps_ind_02_cat 5
ps_ind_04_cat 3
ps_ind_05_cat 8
ps_car_01_cat 13
ps_car_02_cat 3
ps_car_03_cat 3
ps_car_04_cat 10
ps_car_05_cat 3
ps_car_06_cat 18
ps_car_07_cat 3
ps_car_08_cat 2
ps_car_09_cat 6
ps_car_10_cat 3
ps_car_11_cat 104
# BUG FIX: drop_duplicates() returns a new DataFrame; the original call
# discarded the result, so it was a no-op. Assign it back to actually
# deduplicate the rows.
train = train.drop_duplicates()
train.shape
(595212, 59)
train['ps_ind_03'].dtype == 'int64'
True
# Build a per-column metadata table (role, measurement level, keep flag, dtype)
# that drives the feature handling below.
data = []

for f in train.columns:
    # Role: what part the column plays in modelling.
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    # Level: measurement level, inferred from the naming convention and dtype.
    # BUG FIX: `level` used to be assigned only inside the if/elif chain, so a
    # column matching none of the cases silently reused the previous column's
    # level (or raised NameError on the first column). Default it explicitly.
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif train[f].dtype == 'float':
        level = 'interval'
    elif train[f].dtype == 'int64':
        level = 'ordinal'
    else:
        level = None  # unknown measurement level — inspect manually

    # Keep every variable except the row identifier.
    keep = f != 'id'

    # One metadata record per column.
    data.append({
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': train[f].dtype,
    })

meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)
            
meta
role level keep dtype
varname
id id nominal False int64
target target binary True int64
ps_ind_01 input ordinal True int64
ps_ind_02_cat input nominal True int64
ps_ind_03 input ordinal True int64
ps_ind_04_cat input nominal True int64
ps_ind_05_cat input nominal True int64
ps_ind_06_bin input binary True int64
ps_ind_07_bin input binary True int64
ps_ind_08_bin input binary True int64
ps_ind_09_bin input binary True int64
ps_ind_10_bin input binary True int64
ps_ind_11_bin input binary True int64
ps_ind_12_bin input binary True int64
ps_ind_13_bin input binary True int64
ps_ind_14 input ordinal True int64
ps_ind_15 input ordinal True int64
ps_ind_16_bin input binary True int64
ps_ind_17_bin input binary True int64
ps_ind_18_bin input binary True int64
ps_reg_01 input interval True float64
ps_reg_02 input interval True float64
ps_reg_03 input interval True float64
ps_car_01_cat input nominal True int64
ps_car_02_cat input nominal True int64
ps_car_03_cat input nominal True int64
ps_car_04_cat input nominal True int64
ps_car_05_cat input nominal True int64
ps_car_06_cat input nominal True int64
ps_car_07_cat input nominal True int64
ps_car_08_cat input nominal True int64
ps_car_09_cat input nominal True int64
ps_car_10_cat input nominal True int64
ps_car_11_cat input nominal True int64
ps_car_11 input ordinal True int64
ps_car_12 input interval True float64
ps_car_13 input interval True float64
ps_car_14 input interval True float64
ps_car_15 input interval True float64
ps_calc_01 input interval True float64
ps_calc_02 input interval True float64
ps_calc_03 input interval True float64
ps_calc_04 input ordinal True int64
ps_calc_05 input ordinal True int64
ps_calc_06 input ordinal True int64
ps_calc_07 input ordinal True int64
ps_calc_08 input ordinal True int64
ps_calc_09 input ordinal True int64
ps_calc_10 input ordinal True int64
ps_calc_11 input ordinal True int64
ps_calc_12 input ordinal True int64
ps_calc_13 input ordinal True int64
ps_calc_14 input ordinal True int64
ps_calc_15_bin input binary True int64
ps_calc_16_bin input binary True int64
ps_calc_17_bin input binary True int64
ps_calc_18_bin input binary True int64
ps_calc_19_bin input binary True int64
ps_calc_20_bin input binary True int64
meta[(meta.level =='nominal')& (meta.keep)].index
Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
       'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
       'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
       'ps_car_10_cat', 'ps_car_11_cat'],
      dtype='object', name='varname')
pd.DataFrame({'count' : meta.groupby(['role','level'])['role'].size()}).reset_index()
role level count
0 id nominal 1
1 input binary 17
2 input interval 10
3 input nominal 14
4 input ordinal 16
5 target binary 1
meta.groupby(['role','level'])['role'].size()
role    level   
id      nominal      1
input   binary      17
        interval    10
        nominal     14
        ordinal     16
target  binary       1
Name: role, dtype: int64
v = meta[(meta.level == 'interval') & (meta.keep)].index
train[v].describe()
ps_reg_01 ps_reg_02 ps_reg_03 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03
count 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000
mean 0.610991 0.439184 0.551102 0.379945 0.813265 0.276256 3.065899 0.449756 0.449589 0.449849
std 0.287643 0.404264 0.793506 0.058327 0.224588 0.357154 0.731366 0.287198 0.286893 0.287153
min 0.000000 0.000000 -1.000000 -1.000000 0.250619 -1.000000 0.000000 0.000000 0.000000 0.000000
25% 0.400000 0.200000 0.525000 0.316228 0.670867 0.333167 2.828427 0.200000 0.200000 0.200000
50% 0.700000 0.300000 0.720677 0.374166 0.765811 0.368782 3.316625 0.500000 0.400000 0.500000
75% 0.900000 0.600000 1.000000 0.400000 0.906190 0.396485 3.605551 0.700000 0.700000 0.700000
max 0.900000 1.800000 4.037945 1.264911 3.720626 0.636396 3.741657 0.900000 0.900000 0.900000
v = meta[(meta.level == 'binary') & (meta.keep)].index
train[v].describe()
target ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
count 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000
mean 0.036448 0.393742 0.257033 0.163921 0.185304 0.000373 0.001692 0.009439 0.000948 0.660823 0.121081 0.153446 0.122427 0.627840 0.554182 0.287182 0.349024 0.153318
std 0.187401 0.488579 0.436998 0.370205 0.388544 0.019309 0.041097 0.096693 0.030768 0.473430 0.326222 0.360417 0.327779 0.483381 0.497056 0.452447 0.476662 0.360295
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000
75% 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

Handling Imbalanced classes

  • 3가지 전략
    • oversampling records with target == 1
    • undersampling records with target == 0
    • SMOTE 사용
# Target share of positives we want after undersampling the majority class.
desired_apriori = 0.1

# Row indices per class (target is 0/1).
idx_0 = train[train.target == 0].index
idx_1 = train[train.target == 1].index

nb_0 = len(idx_0)
nb_1 = len(idx_1)
# Fraction of class-0 rows to keep so class 1 ends up at desired_apriori.
undersampling_rate = (1 - desired_apriori) * nb_1 / (nb_0 * desired_apriori)
undersampling_rate
0.34043569687437886
# Number of majority-class rows that survive the undersampling.
undersampled_nb_0 = int(undersampling_rate * nb_0)
print(f'Rate to undersample records with target=0: {undersampling_rate}')
print(f'Number of records with target=0 after undersampling: {undersampled_nb_0}')
Rate to undersample records with target=0: 0.34043569687437886
Number of records with target=0 after undersampling: 195246
# Randomly pick the class-0 rows to keep, then rebuild train from the kept
# class-0 rows plus all class-1 rows.
kept_idx_0 = shuffle(idx_0, random_state=37, n_samples=undersampled_nb_0)
train = train.loc[list(kept_idx_0) + list(idx_1)].reset_index(drop=True)
#import missingno as msno
#msno.matrix(train)
display( train.isnull().sum(axis =0) )
id                0
target            0
ps_ind_01         0
ps_ind_02_cat     0
ps_ind_03         0
ps_ind_04_cat     0
ps_ind_05_cat     0
ps_ind_06_bin     0
ps_ind_07_bin     0
ps_ind_08_bin     0
ps_ind_09_bin     0
ps_ind_10_bin     0
ps_ind_11_bin     0
ps_ind_12_bin     0
ps_ind_13_bin     0
ps_ind_14         0
ps_ind_15         0
ps_ind_16_bin     0
ps_ind_17_bin     0
ps_ind_18_bin     0
ps_reg_01         0
ps_reg_02         0
ps_reg_03         0
ps_car_01_cat     0
ps_car_02_cat     0
ps_car_03_cat     0
ps_car_04_cat     0
ps_car_05_cat     0
ps_car_06_cat     0
ps_car_07_cat     0
ps_car_08_cat     0
ps_car_09_cat     0
ps_car_10_cat     0
ps_car_11_cat     0
ps_car_11         0
ps_car_12         0
ps_car_13         0
ps_car_14         0
ps_car_15         0
ps_calc_01        0
ps_calc_02        0
ps_calc_03        0
ps_calc_04        0
ps_calc_05        0
ps_calc_06        0
ps_calc_07        0
ps_calc_08        0
ps_calc_09        0
ps_calc_10        0
ps_calc_11        0
ps_calc_12        0
ps_calc_13        0
ps_calc_14        0
ps_calc_15_bin    0
ps_calc_16_bin    0
ps_calc_17_bin    0
ps_calc_18_bin    0
ps_calc_19_bin    0
ps_calc_20_bin    0
dtype: int64
vars_with_missing = []

# In this dataset missing values are encoded as -1, not NaN.
for f in train.columns:
    n_missing = (train[f] == -1).sum()
    if n_missing > 0:
        vars_with_missing.append(f)
        pct_missing = n_missing / train.shape[0]
        print(f'Variable {f} has {n_missing} records ({pct_missing:.2%}) with missing values')

print(f'In total, there are {len(vars_with_missing)} variables with missing values')
Variable ps_ind_02_cat has 103 records (0.05%) with missing values
Variable ps_ind_04_cat has 51 records (0.02%) with missing values
Variable ps_ind_05_cat has 2256 records (1.04%) with missing values
Variable ps_reg_03 has 38580 records (17.78%) with missing values
Variable ps_car_01_cat has 62 records (0.03%) with missing values
Variable ps_car_02_cat has 2 records (0.00%) with missing values
Variable ps_car_03_cat has 148367 records (68.39%) with missing values
Variable ps_car_05_cat has 96026 records (44.26%) with missing values
Variable ps_car_07_cat has 4431 records (2.04%) with missing values
Variable ps_car_09_cat has 230 records (0.11%) with missing values
Variable ps_car_11 has 1 records (0.00%) with missing values
Variable ps_car_14 has 15726 records (7.25%) with missing values
In total, there are 12 variables with missing values

- ps_car_03_cat이 약 68% missing value이므로 전체 삭제

train[['ps_car_03_cat','target']].groupby('ps_car_03_cat').mean()
target
ps_car_03_cat
-1 0.090654
0 0.106983
1 0.128862
train['ps_car_03_cat'].describe()
count    216940.000000
mean         -0.492639
std           0.795291
min          -1.000000
25%          -1.000000
50%          -1.000000
75%           0.000000
max           1.000000
Name: ps_car_03_cat, dtype: float64
train['ps_car_03_cat'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x2398fb97108>

png

여기서는 결측값을 평균으로 대체하지만, 좀 더 면밀하게 분석한 뒤 대체 방법을 정할 필요가 있다.

# Drop the two categoricals dominated by missing values (ps_car_03_cat ~68%,
# ps_car_05_cat ~44% per the scan above) and record the decision in meta.
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']

train.drop(vars_to_drop, axis =1, inplace = True)
meta.loc[(vars_to_drop), 'keep'] = False
# Group mean of ps_reg_03 per (ps_car_01_cat, ps_car_02_cat) pair — a candidate
# group-wise imputation for the missing ps_reg_03 values.
temp_series = train[['ps_car_01_cat','ps_car_02_cat','ps_reg_03']].groupby(['ps_car_01_cat','ps_car_02_cat']).mean()
temp_series.reset_index(inplace= True)
# NOTE(review): this merge is only displayed, never assigned — the actual
# imputation below uses a global mean via SimpleImputer instead.
train.loc[train['ps_reg_03'] == -1].merge(temp_series, on = ['ps_car_01_cat','ps_car_02_cat'] , how = 'left')
id target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_14 ps_ind_15 ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_reg_01 ps_reg_02 ps_reg_03_x ps_car_01_cat ps_car_02_cat ps_car_04_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04 ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08 ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin ps_reg_03_y
0 1254786 0 7 1 9 0 0 0 1 0 0 0 0 0 0 0 12 1 0 0 0.2 0.2 -1.0 0 1 9 13 1 0 0 1 104 2 0.565685 2.108264 0.530094 3.741657 0.7 0.6 0.4 2 2 9 3 11 1 12 2 1 2 10 0 1 0 0 0 0 0.505736
1 1425558 0 1 2 0 1 4 0 0 0 1 0 0 0 0 0 8 1 0 0 0.1 0.3 -1.0 8 1 0 14 1 1 0 1 104 1 0.316070 0.508502 0.355668 1.732051 0.0 0.4 0.1 1 2 9 2 12 3 6 2 1 4 10 0 0 0 0 0 0 0.515147
2 860206 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 7 1 0 0 0.4 0.1 -1.0 7 1 0 1 1 1 2 1 64 1 0.316228 0.656405 0.361939 3.316625 0.4 0.8 0.3 3 1 6 3 7 2 7 6 0 3 7 1 0 1 0 1 0 0.333647
3 1265316 0 4 2 4 1 0 0 0 1 0 0 0 0 0 0 6 0 0 1 0.1 0.3 -1.0 4 1 0 1 1 1 0 1 65 1 0.316228 0.545795 0.350714 2.449490 0.6 0.6 0.5 1 1 7 1 12 6 6 6 2 4 10 0 1 1 1 0 0 0.244300
4 267652 0 2 1 1 0 0 1 0 0 0 0 0 0 0 0 12 1 0 0 0.3 0.3 -1.0 7 0 0 10 1 1 2 1 55 2 0.424264 1.116425 0.416533 3.605551 0.2 0.1 0.9 2 0 8 4 12 3 9 7 0 2 6 0 1 0 1 1 0 0.477232
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
38575 1486851 1 5 1 10 0 0 0 0 1 0 0 0 0 0 0 5 0 1 0 0.1 0.3 -1.0 11 0 9 17 1 1 2 1 104 2 0.447214 1.209873 0.430116 3.316625 0.2 0.4 0.9 2 3 7 4 8 4 8 7 0 6 10 0 0 0 1 1 0 0.985341
38576 1487090 1 2 1 6 0 6 1 0 0 0 0 0 0 0 0 12 1 0 0 0.1 0.3 -1.0 7 1 0 0 1 1 2 1 37 2 0.316228 0.740728 0.311448 3.316625 0.4 0.7 0.0 1 2 9 6 9 1 10 6 1 3 15 0 0 1 0 0 1 0.333647
38577 1487406 1 3 1 8 1 0 0 1 0 0 0 0 0 0 0 7 1 0 0 0.4 0.0 -1.0 7 1 0 1 1 1 2 1 64 3 0.316228 0.613586 0.301662 2.828427 0.9 0.5 0.5 3 1 10 2 9 1 3 5 0 3 4 0 1 0 0 1 0 0.333647
38578 1487419 1 2 1 6 0 0 1 0 0 0 0 0 0 0 0 9 1 0 0 0.3 0.3 -1.0 4 1 0 0 1 1 0 1 34 2 0.400000 0.795156 0.378021 3.162278 0.7 0.1 0.9 3 4 9 3 9 2 8 11 1 5 5 0 1 1 0 0 0 0.244300
38579 1487566 1 1 1 5 0 0 0 0 1 0 0 0 0 0 0 4 0 0 1 0.3 0.4 -1.0 11 0 0 15 1 1 2 1 5 2 0.424264 0.756979 0.400000 2.000000 0.3 0.4 0.6 1 1 8 2 9 5 9 9 2 1 5 0 1 0 0 0 0 0.985341

38580 rows × 58 columns

# Impute the remaining -1 placeholders: mean for the continuous features,
# most-frequent for the ordinal ps_car_11.
mean_imp = SimpleImputer(missing_values= -1, strategy='mean')
mode_imp = SimpleImputer(missing_values= -1, strategy ='most_frequent')
# .ravel() flattens the (n, 1) transformer output back to a 1-D column.
train['ps_reg_03'] = mean_imp.fit_transform(train[['ps_reg_03']]).ravel()
train['ps_car_12'] = mean_imp.fit_transform(train[['ps_car_12']]).ravel()
train['ps_car_14'] = mean_imp.fit_transform(train[['ps_car_14']]).ravel()
train['ps_car_11'] = mode_imp.fit_transform(train[['ps_car_11']]).ravel()
# Count distinct values of each remaining nominal feature (pre-dummification).
v = meta[(meta.level == 'nominal' ) & (meta.keep)].index
# NOTE(review): `sum` shadows the builtin. It is echoed again further down in
# this notebook, so it is left as-is here — consider renaming to `total`.
sum = 0

for f in v :
    dist_values = train[f].value_counts().shape[0]
    sum+= dist_values
    print('Variable {} has {} distinct values'.format(f, dist_values))
Variable ps_ind_02_cat has 5 distinct values
Variable ps_ind_04_cat has 3 distinct values
Variable ps_ind_05_cat has 8 distinct values
Variable ps_car_01_cat has 13 distinct values
Variable ps_car_02_cat has 3 distinct values
Variable ps_car_04_cat has 10 distinct values
Variable ps_car_06_cat has 18 distinct values
Variable ps_car_07_cat has 3 distinct values
Variable ps_car_08_cat has 2 distinct values
Variable ps_car_09_cat has 6 distinct values
Variable ps_car_10_cat has 3 distinct values
Variable ps_car_11_cat has 104 distinct values
train[f].value_counts().shape
(104,)
sum
178

def add_noise(series, noise_level):
    """Return *series* scaled by 1 + Gaussian noise of magnitude *noise_level*.

    With noise_level=0 this is the identity (times 1.0). Uses the global
    numpy RNG, so results depend on the current seed state.
    """
    noise = np.random.randn(len(series)) * noise_level
    return series * (1 + noise)

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed target (mean) encoding of a high-cardinality categorical feature.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior

    Returns a (train, test) pair of encoded Series, each with optional
    multiplicative Gaussian noise applied via add_noise.
    """ 
    assert len(trn_series) == len(target)  # feature and target must be row-aligned
    assert trn_series.name == tst_series.name  # train/test must encode the same column
    temp = pd.concat([trn_series, target], axis=1)  # feature and target side by side
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])  # per-category mean & count
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),  # expose the smoothed mean as 'average'
        on=trn_series.name,                                                                 # merge on the category value
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)  # how='left' keeps every row; unseen categories get NaN and fall back to the prior
    
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)
# Target-encode the highest-cardinality categorical (104 levels) rather than
# one-hot encoding it, then drop the raw column from both frames.
train_encoded, test_encoded = target_encode(train["ps_car_11_cat"], 
                             test["ps_car_11_cat"], 
                             target=train.target, 
                             min_samples_leaf=100,
                             smoothing=10,
                             noise_level=0.01)
    
train['ps_car_11_cat_te'] = train_encoded
train.drop('ps_car_11_cat', axis=1, inplace=True)
meta.loc['ps_car_11_cat','keep'] = False  # Updating the meta
test['ps_car_11_cat_te'] = test_encoded
test.drop('ps_car_11_cat', axis=1, inplace=True)

뜯어보기

#v = meta[(meta.level == 'nominal') & (meta.keep)].index

#cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

<ipython-input-50-b8a6304ecf30> in <module>
----> 1 cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()


C:\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2804             if is_iterator(key):
   2805                 key = list(key)
-> 2806             indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
   2807 
   2808         # take() does not accept boolean indexers


C:\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1550 
   1551         self._validate_read_indexer(
-> 1552             keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
   1553         )
   1554         return keyarr, indexer


C:\Anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1643             if not (self.name == "loc" and not raise_missing):
   1644                 not_found = list(set(key) - set(ax))
-> 1645                 raise KeyError(f"{not_found} not in index")
   1646 
   1647             # we skip the warning on Categorical/Interval


KeyError: "['ps_car_11_cat'] not in index"
#cat_perc
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-51-2ac23d9b437a> in <module>
----> 1 cat_perc


NameError: name 'cat_perc' is not defined
#cat_perc.sort_values(by='target', ascending=False, inplace = True)
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-52-72bd5321c2f0> in <module>
----> 1 cat_perc.sort_values(by='target', ascending=False, inplace = True)


NameError: name 'cat_perc' is not defined
#sns.barplot(x=f, y = 'target', data=cat_perc,order = cat_perc[f])

---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-53-579ec202cb69> in <module>
----> 1 sns.barplot(x=f, y = 'target', data=cat_perc,order = cat_perc[f])


NameError: name 'cat_perc' is not defined

다시 코드

# Bar-plot the target rate per category for every remaining nominal feature.
v = meta[(meta.level == 'nominal') & (meta.keep)].index

for f in v:
    # BUG FIX: the original called plt.figure() before plt.subplots(), which
    # created an extra empty figure per iteration (the blank
    # "<Figure size 432x288 with 0 Axes>" outputs). plt.subplots() already
    # creates the figure it draws on.
    fig, ax = plt.subplots(figsize=(20,10))
    # Calculate the percentage of target=1 per category value
    cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()
    cat_perc.sort_values(by='target', ascending=False, inplace=True)
    # Bar plot, with bars ordered descending on target mean
    sns.barplot(ax=ax, x=f, y='target', data=cat_perc, order=cat_perc[f])
    plt.ylabel('% target', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()
<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

cat_perc
ps_car_10_cat target
1 1 0.100029
2 2 0.100000
0 0 0.096420
# Inspect mean and count of the target per ps_car_02_cat level (including -1).
f = 'ps_car_02_cat'
cat_perc = train[[f, 'target']].groupby([f], as_index=False).agg(['mean','count'])
cat_perc
target
mean count
ps_car_02_cat
-1 0.000000 2
0 0.131868 38000
1 0.093233 178938
  • 관측값이 2개밖에 없는데 둘다 -1이여서 확률이 0으로 나타남.
  • 여기서는 결측치가 의미가 있다고 판단하여 삭제하지 않음.
# Pairwise Pearson correlations between the interval (continuous) features.
v = meta[(meta.level == 'interval') & (meta.keep)].index
correlations = train[v].corr()
correlations
ps_reg_01 ps_reg_02 ps_reg_03 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03
ps_reg_01 1.000000 0.470953 0.137117 0.019095 0.025243 -0.002536 0.001755 -0.003236 0.001459 -0.001371
ps_reg_02 0.470953 1.000000 0.702512 0.173736 0.193896 0.053149 0.052344 -0.001769 -0.000726 -0.000992
ps_reg_03 0.137117 0.702512 1.000000 0.208978 0.241244 0.079541 0.079848 -0.000223 0.000043 -0.000357
ps_car_12 0.019095 0.173736 0.208978 1.000000 0.674298 0.577537 0.049468 -0.000452 -0.001070 -0.000707
ps_car_13 0.025243 0.193896 0.241244 0.674298 1.000000 0.434613 0.526024 0.000266 0.000020 0.000568
ps_car_14 -0.002536 0.053149 0.079541 0.577537 0.434613 1.000000 0.008472 -0.004548 -0.005015 0.000776
ps_car_15 0.001755 0.052344 0.079848 0.049468 0.526024 0.008472 1.000000 -0.000392 0.003630 0.000586
ps_calc_01 -0.003236 -0.001769 -0.000223 -0.000452 0.000266 -0.004548 -0.000392 1.000000 0.002832 -0.000212
ps_calc_02 0.001459 -0.000726 0.000043 -0.001070 0.000020 -0.005015 0.003630 0.002832 1.000000 0.003130
ps_calc_03 -0.001371 -0.000992 -0.000357 -0.000707 0.000568 0.000776 0.000586 -0.000212 0.003130 1.000000
# Annotated heatmap of the correlation matrix computed above.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# BUG FIX: the three statements below were indented as if inside a block,
# which raises IndentationError at top level. Dedented to plain statements
# (this is the pre-refactor version of corr_heatmap defined below).
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(correlations, cmap=cmap, vmax=1.0, center=0, fmt='.2f',
            square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .75})
plt.show()

png

def corr_heatmap(v):
    """Draw an annotated correlation heatmap for the train columns in *v*.

    Reads the notebook-global `train` DataFrame; shows the plot as a side
    effect and returns nothing.
    """
    corr_matrix = train[v].corr()

    # Diverging palette centred on zero correlation.
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(corr_matrix, cmap=palette, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True,
                cbar_kws={"shrink": .75})
    plt.show()
    
v = meta[(meta.level == 'interval') & (meta.keep)].index
corr_heatmap(v)

png

# Regression/scatter plots are expensive on ~200k rows — plot a 10% sample.
s = train.sample(frac=0.1)
sns.lmplot(x='ps_reg_02', y='ps_reg_03', data=s, hue= 'target', palette='Set1',scatter_kws = {'alpha' : 0.3})
plt.show()

png

sns.lmplot(x='ps_car_12', y='ps_car_13', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()

png

sns.lmplot(x='ps_car_12', y='ps_car_14', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()

png

sns.lmplot(x='ps_car_15', y='ps_car_13', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()

png

v = meta[(meta.level == 'ordinal') & (meta.keep)].index
corr_heatmap(v)

png

#PCA를 이용해서 결정

# One-hot encode the remaining nominal features; drop_first removes the
# redundant reference level of each variable.
v = meta[(meta.level == 'nominal') & (meta.keep)].index
print(f'Before dummification we have {train.shape[1]} variables in train')
train = pd.get_dummies(train, columns=v, drop_first=True)
print(f'After dummification we have {train.shape[1]} variables in train')
Before dummification we have 57 variables in train
After dummification we have 109 variables in train

# Generate degree-2 polynomial and interaction terms for the interval features.
v = meta[(meta.level == 'interval') & (meta.keep)].index

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias = False) # squares and pairwise products, no bias column
# NOTE(review): the transformed array is only displayed here, not stored; the
# DataFrame wrapping happens further down. Also, get_feature_names (used
# below) was removed in scikit-learn 1.2 in favour of get_feature_names_out —
# confirm the pinned sklearn version before upgrading.
poly.fit_transform(train[v])
array([[0.6       , 0.6       , 0.83815273, ..., 0.09      , 0.        ,
        0.        ],
       [0.9       , 0.6       , 0.72844011, ..., 0.        , 0.        ,
        0.36      ],
       [0.9       , 0.6       , 0.86926693, ..., 0.81      , 0.09      ,
        0.01      ],
       ...,
       [0.9       , 0.3       , 0.71195154, ..., 0.16      , 0.24      ,
        0.36      ],
       [0.6       , 0.1       , 0.57716982, ..., 0.04      , 0.12      ,
        0.36      ],
       [0.6       , 0.4       , 1.09515981, ..., 0.36      , 0.18      ,
        0.09      ]])
poly.get_feature_names(v)
['ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15',
 'ps_calc_01',
 'ps_calc_02',
 'ps_calc_03',
 'ps_reg_01^2',
 'ps_reg_01 ps_reg_02',
 'ps_reg_01 ps_reg_03',
 'ps_reg_01 ps_car_12',
 'ps_reg_01 ps_car_13',
 'ps_reg_01 ps_car_14',
 'ps_reg_01 ps_car_15',
 'ps_reg_01 ps_calc_01',
 'ps_reg_01 ps_calc_02',
 'ps_reg_01 ps_calc_03',
 'ps_reg_02^2',
 'ps_reg_02 ps_reg_03',
 'ps_reg_02 ps_car_12',
 'ps_reg_02 ps_car_13',
 'ps_reg_02 ps_car_14',
 'ps_reg_02 ps_car_15',
 'ps_reg_02 ps_calc_01',
 'ps_reg_02 ps_calc_02',
 'ps_reg_02 ps_calc_03',
 'ps_reg_03^2',
 'ps_reg_03 ps_car_12',
 'ps_reg_03 ps_car_13',
 'ps_reg_03 ps_car_14',
 'ps_reg_03 ps_car_15',
 'ps_reg_03 ps_calc_01',
 'ps_reg_03 ps_calc_02',
 'ps_reg_03 ps_calc_03',
 'ps_car_12^2',
 'ps_car_12 ps_car_13',
 'ps_car_12 ps_car_14',
 'ps_car_12 ps_car_15',
 'ps_car_12 ps_calc_01',
 'ps_car_12 ps_calc_02',
 'ps_car_12 ps_calc_03',
 'ps_car_13^2',
 'ps_car_13 ps_car_14',
 'ps_car_13 ps_car_15',
 'ps_car_13 ps_calc_01',
 'ps_car_13 ps_calc_02',
 'ps_car_13 ps_calc_03',
 'ps_car_14^2',
 'ps_car_14 ps_car_15',
 'ps_car_14 ps_calc_01',
 'ps_car_14 ps_calc_02',
 'ps_car_14 ps_calc_03',
 'ps_car_15^2',
 'ps_car_15 ps_calc_01',
 'ps_car_15 ps_calc_02',
 'ps_car_15 ps_calc_03',
 'ps_calc_01^2',
 'ps_calc_01 ps_calc_02',
 'ps_calc_01 ps_calc_03',
 'ps_calc_02^2',
 'ps_calc_02 ps_calc_03',
 'ps_calc_03^2']
pd.DataFrame(data=poly.fit_transform(train[v]), columns=poly.get_feature_names(v))
ps_reg_01 ps_reg_02 ps_reg_03 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03 ps_reg_01^2 ps_reg_01 ps_reg_02 ps_reg_01 ps_reg_03 ps_reg_01 ps_car_12 ps_reg_01 ps_car_13 ps_reg_01 ps_car_14 ps_reg_01 ps_car_15 ps_reg_01 ps_calc_01 ps_reg_01 ps_calc_02 ps_reg_01 ps_calc_03 ps_reg_02^2 ps_reg_02 ps_reg_03 ps_reg_02 ps_car_12 ps_reg_02 ps_car_13 ps_reg_02 ps_car_14 ps_reg_02 ps_car_15 ps_reg_02 ps_calc_01 ps_reg_02 ps_calc_02 ps_reg_02 ps_calc_03 ps_reg_03^2 ps_reg_03 ps_car_12 ps_reg_03 ps_car_13 ps_reg_03 ps_car_14 ps_reg_03 ps_car_15 ps_reg_03 ps_calc_01 ps_reg_03 ps_calc_02 ps_reg_03 ps_calc_03 ps_car_12^2 ps_car_12 ps_car_13 ps_car_12 ps_car_14 ps_car_12 ps_car_15 ps_car_12 ps_calc_01 ps_car_12 ps_calc_02 ps_car_12 ps_calc_03 ps_car_13^2 ps_car_13 ps_car_14 ps_car_13 ps_car_15 ps_car_13 ps_calc_01 ps_car_13 ps_calc_02 ps_car_13 ps_calc_03 ps_car_14^2 ps_car_14 ps_car_15 ps_car_14 ps_calc_01 ps_car_14 ps_calc_02 ps_car_14 ps_calc_03 ps_car_15^2 ps_car_15 ps_calc_01 ps_car_15 ps_calc_02 ps_car_15 ps_calc_03 ps_calc_01^2 ps_calc_01 ps_calc_02 ps_calc_01 ps_calc_03 ps_calc_02^2 ps_calc_02 ps_calc_03 ps_calc_03^2
0 0.6 0.6 0.838153 0.368782 0.540603 0.345688 2.000000 0.9 0.3 0.0 0.36 0.36 0.502892 0.221269 0.324362 0.207413 1.200000 0.54 0.18 0.00 0.36 0.502892 0.221269 0.324362 0.207413 1.200000 0.54 0.18 0.00 0.702500 0.309095 0.453108 0.289739 1.676305 0.754337 0.251446 0.000000 0.136 0.199365 0.127483 0.737564 0.331904 0.110635 0.000000 0.292252 0.186880 1.081207 0.486543 0.162181 0.000000 0.1195 0.691375 0.311119 0.103706 0.000000 4.0 1.800000 0.600000 0.000000 0.81 0.27 0.00 0.09 0.00 0.00
1 0.9 0.6 0.728440 0.424264 0.382953 0.378814 0.000000 0.2 0.0 0.6 0.81 0.54 0.655596 0.381838 0.344658 0.340933 0.000000 0.18 0.00 0.54 0.36 0.437064 0.254558 0.229772 0.227288 0.000000 0.12 0.00 0.36 0.530625 0.309051 0.278958 0.275943 0.000000 0.145688 0.000000 0.437064 0.180 0.162473 0.160717 0.000000 0.084853 0.000000 0.254558 0.146653 0.145068 0.000000 0.076591 0.000000 0.229772 0.1435 0.000000 0.075763 0.000000 0.227288 0.0 0.000000 0.000000 0.000000 0.04 0.00 0.12 0.00 0.00 0.36
2 0.9 0.6 0.869267 0.400000 0.814271 0.402368 3.316625 0.3 0.9 0.1 0.81 0.54 0.782340 0.360000 0.732844 0.362131 2.984962 0.27 0.81 0.09 0.36 0.521560 0.240000 0.488563 0.241421 1.989975 0.18 0.54 0.06 0.755625 0.347707 0.707819 0.349765 2.883032 0.260780 0.782340 0.086927 0.160 0.325708 0.160947 1.326650 0.120000 0.360000 0.040000 0.663037 0.327637 2.700631 0.244281 0.732844 0.081427 0.1619 1.334504 0.120710 0.362131 0.040237 11.0 0.994987 2.984962 0.331662 0.09 0.27 0.03 0.81 0.09 0.01
3 0.6 1.5 1.705872 0.400000 0.838387 0.378418 3.605551 0.8 0.4 0.1 0.36 0.90 1.023523 0.240000 0.503032 0.227051 2.163331 0.48 0.24 0.06 2.25 2.558808 0.600000 1.257580 0.567627 5.408327 1.20 0.60 0.15 2.910000 0.682349 1.430181 0.645532 6.150610 1.364698 0.682349 0.170587 0.160 0.335355 0.151367 1.442221 0.320000 0.160000 0.040000 0.702893 0.317260 3.022847 0.670710 0.335355 0.083839 0.1432 1.364405 0.302734 0.151367 0.037842 13.0 2.884441 1.442221 0.360555 0.64 0.32 0.08 0.16 0.04 0.01
4 0.8 0.8 1.086566 0.400000 0.905777 0.384838 3.605551 0.6 0.5 0.9 0.64 0.64 0.869253 0.320000 0.724622 0.307870 2.884441 0.48 0.40 0.72 0.64 0.869253 0.320000 0.724622 0.307870 2.884441 0.48 0.40 0.72 1.180625 0.434626 0.984186 0.418151 3.917668 0.651939 0.543283 0.977909 0.160 0.362311 0.153935 1.442221 0.240000 0.200000 0.360000 0.820432 0.348577 3.265825 0.543466 0.452888 0.815199 0.1481 1.387552 0.230903 0.192419 0.346354 13.0 2.163331 1.802776 3.244996 0.36 0.30 0.54 0.25 0.45 0.81
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
216935 0.6 0.4 1.537652 0.424264 1.269111 0.384708 3.162278 0.5 0.1 0.5 0.36 0.24 0.922591 0.254558 0.761467 0.230825 1.897367 0.30 0.06 0.30 0.16 0.615061 0.169706 0.507645 0.153883 1.264911 0.20 0.04 0.20 2.364375 0.652371 1.951452 0.591547 4.862484 0.768826 0.153765 0.768826 0.180 0.538438 0.163218 1.341641 0.212132 0.042426 0.212132 1.610644 0.488237 4.013282 0.634556 0.126911 0.634556 0.1480 1.216553 0.192354 0.038471 0.192354 10.0 1.581139 0.316228 1.581139 0.25 0.05 0.25 0.01 0.05 0.25
216936 0.3 0.4 0.898861 0.424264 0.756979 0.400000 2.000000 0.3 0.4 0.6 0.09 0.12 0.269658 0.127279 0.227094 0.120000 0.600000 0.09 0.12 0.18 0.16 0.359544 0.169706 0.302791 0.160000 0.800000 0.12 0.16 0.24 0.807951 0.381354 0.680419 0.359544 1.797722 0.269658 0.359544 0.539317 0.180 0.321159 0.169706 0.848528 0.127279 0.169706 0.254558 0.573017 0.302791 1.513957 0.227094 0.302791 0.454187 0.1600 0.800000 0.120000 0.160000 0.240000 4.0 0.600000 0.800000 1.200000 0.09 0.12 0.18 0.16 0.24 0.36
216937 0.9 0.3 0.711952 0.400000 0.970654 0.372424 3.464102 0.5 0.4 0.6 0.81 0.27 0.640756 0.360000 0.873589 0.335182 3.117691 0.45 0.36 0.54 0.09 0.213585 0.120000 0.291196 0.111727 1.039230 0.15 0.12 0.18 0.506875 0.284781 0.691059 0.265148 2.466272 0.355976 0.284781 0.427171 0.160 0.388262 0.148970 1.385641 0.200000 0.160000 0.240000 0.942169 0.361495 3.362445 0.485327 0.388262 0.582392 0.1387 1.290116 0.186212 0.148970 0.223455 12.0 1.732051 1.385641 2.078461 0.25 0.20 0.30 0.16 0.24 0.36
216938 0.6 0.1 0.577170 0.316228 0.876295 0.320780 3.741657 0.5 0.2 0.6 0.36 0.06 0.346302 0.189737 0.525777 0.192468 2.244994 0.30 0.12 0.36 0.01 0.057717 0.031623 0.087629 0.032078 0.374166 0.05 0.02 0.06 0.333125 0.182517 0.505771 0.185145 2.159572 0.288585 0.115434 0.346302 0.100 0.277109 0.101440 1.183216 0.158114 0.063246 0.189737 0.767893 0.281098 3.278795 0.438147 0.175259 0.525777 0.1029 1.200250 0.160390 0.064156 0.192468 14.0 1.870829 0.748331 2.244994 0.25 0.10 0.30 0.04 0.12 0.36
216939 0.6 0.4 1.095160 0.374166 0.752558 0.328634 3.464102 0.2 0.6 0.3 0.36 0.24 0.657096 0.224499 0.451535 0.197180 2.078461 0.12 0.36 0.18 0.16 0.438064 0.149666 0.301023 0.131453 1.385641 0.08 0.24 0.12 1.199375 0.409771 0.824171 0.359906 3.793745 0.219032 0.657096 0.328548 0.140 0.281581 0.122963 1.296148 0.074833 0.224499 0.112250 0.566343 0.247316 2.606936 0.150512 0.451535 0.225767 0.1080 1.138420 0.065727 0.197180 0.098590 12.0 0.692820 2.078461 1.039230 0.04 0.12 0.06 0.36 0.18 0.09

216940 rows × 65 columns

# Build the polynomial interaction features and append them to train.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead.
interactions = pd.DataFrame(data=poly.fit_transform(train[v]),
                            columns=poly.get_feature_names(v))
# Drop the degree-1 columns (they duplicate existing train columns);
# keep only the squared/interaction terms.
interactions.drop(v, axis=1, inplace=True)
# Align indices before the column-wise concat: poly.fit_transform returns
# a fresh 0..n-1 RangeIndex, while train may carry a non-default index
# (e.g. after sampling) — without this, concat would misalign rows and
# fill NaNs instead of joining positionally.
interactions.index = train.index
train = pd.concat([train, interactions], axis=1)  # columns side by side; axis=0 would stack rows
# Flag near-constant features: columns whose variance is below 0.01.
selector = VarianceThreshold(threshold=.01)
selector.fit(train.drop(['id', 'target'], axis=1))

# Columns REJECTED by the variance threshold: selector.get_support() is
# True for the columns that are KEPT, so invert the mask with ~ (this
# replaces the roundabout np.vectorize(lambda x: not x) trick).
v = train.drop(['id', 'target'], axis=1).columns[~selector.get_support()]
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))
28 variables have too low variance.
These variavels are ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_12', 'ps_car_14', 'ps_car_11_cat_te', 'ps_ind_05_cat_2', 'ps_ind_05_cat_5', 'ps_car_01_cat_1', 'ps_car_01_cat_2', 'ps_car_04_cat_3', 'ps_car_04_cat_4', 'ps_car_04_cat_5', 'ps_car_04_cat_6', 'ps_car_04_cat_7', 'ps_car_06_cat_2', 'ps_car_06_cat_5', 'ps_car_06_cat_8', 'ps_car_06_cat_12', 'ps_car_06_cat_16', 'ps_car_06_cat_17', 'ps_car_09_cat_4', 'ps_car_10_cat_1', 'ps_car_10_cat_2', 'ps_car_12^2', 'ps_car_12 ps_car_14', 'ps_car_14^2']

이유한 님 팁

  • 1000개의 피쳐가 있을 때 피쳐를 20개씩 뺐다 넣다 하면서 성능을 보자!!
  • 20개 모델 baseline + random choosing 20
  • 40 모델 학습
  • if 성능 향상 -> feature importance 상위 10%에 새로 추가된거 생기면 향상된걸 남기고
  • 안생기면 계속 random choosing
  • 반복
# Rank every feature by random-forest importance and print the ranking.
X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']

feat_labels = X_train.columns

rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)

rf.fit(X_train, y_train)
importances = rf.feature_importances_

# Indices that sort the features by importance, descending
# (reuse the importances array instead of re-reading the attribute).
indices = np.argsort(importances)[::-1]

# Print "rank) feature_name importance", one feature per line.
# enumerate replaces range(shape[1]) + double indexing, and `rank`
# avoids shadowing any earlier single-letter names.
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))


 1) ps_car_11_cat_te               0.021144
 2) ps_car_12 ps_car_13            0.017390
 3) ps_car_13                      0.017361
 4) ps_car_13^2                    0.017307
 5) ps_reg_03 ps_car_13            0.017075
 6) ps_car_13 ps_car_14            0.017067
 7) ps_car_13 ps_car_15            0.016823
 8) ps_reg_01 ps_car_13            0.016773
 9) ps_reg_03 ps_car_14            0.016233
10) ps_reg_03 ps_car_12            0.015462
11) ps_reg_03 ps_car_15            0.015181
12) ps_car_14 ps_car_15            0.015056
13) ps_car_13 ps_calc_02           0.014764
14) ps_car_13 ps_calc_01           0.014746
15) ps_reg_02 ps_car_13            0.014710
16) ps_car_13 ps_calc_03           0.014702
17) ps_reg_01 ps_reg_03            0.014665
18) ps_reg_01 ps_car_14            0.014378
19) ps_reg_03^2                    0.014235
20) ps_reg_03                      0.014196
21) ps_reg_03 ps_calc_03           0.013807
22) ps_reg_03 ps_calc_02           0.013738
23) ps_reg_03 ps_calc_01           0.013705
24) ps_car_14 ps_calc_02           0.013652
25) ps_calc_10                     0.013646
26) ps_car_14 ps_calc_03           0.013537
27) ps_car_14 ps_calc_01           0.013527
28) ps_calc_14                     0.013388
29) ps_car_12 ps_car_14            0.012970
30) ps_ind_03                      0.012921
31) ps_car_14                      0.012741
32) ps_car_14^2                    0.012730
33) ps_reg_02 ps_car_14            0.012697
34) ps_calc_11                     0.012619
35) ps_reg_02 ps_reg_03            0.012489
36) ps_ind_15                      0.012116
37) ps_car_12 ps_car_15            0.010944
38) ps_car_15 ps_calc_01           0.010857
39) ps_car_15 ps_calc_03           0.010839
40) ps_car_15 ps_calc_02           0.010837
41) ps_car_12 ps_calc_01           0.010477
42) ps_calc_13                     0.010477
43) ps_car_12 ps_calc_03           0.010310
44) ps_car_12 ps_calc_02           0.010296
45) ps_reg_02 ps_car_15            0.010205
46) ps_reg_01 ps_car_15            0.010177
47) ps_calc_02 ps_calc_03          0.010077
48) ps_calc_01 ps_calc_02          0.010013
49) ps_calc_01 ps_calc_03          0.010005
50) ps_calc_08                     0.009867
51) ps_calc_07                     0.009857
52) ps_reg_01 ps_car_12            0.009473
53) ps_reg_02 ps_car_12            0.009319
54) ps_reg_02 ps_calc_01           0.009294
55) ps_reg_02 ps_calc_03           0.009237
56) ps_reg_02 ps_calc_02           0.009146
57) ps_calc_06                     0.009092
58) ps_reg_01 ps_calc_02           0.009054
59) ps_reg_01 ps_calc_03           0.009041
60) ps_reg_01 ps_calc_01           0.009020
61) ps_calc_09                     0.008794
62) ps_ind_01                      0.008606
63) ps_calc_05                     0.008298
64) ps_calc_04                     0.008168
65) ps_calc_12                     0.008015
66) ps_reg_01 ps_reg_02            0.008015
67) ps_car_15                      0.006130
68) ps_car_15^2                    0.006130
69) ps_calc_03                     0.006001
70) ps_calc_01^2                   0.005975
71) ps_calc_01                     0.005964
72) ps_calc_03^2                   0.005964
73) ps_calc_02                     0.005950
74) ps_calc_02^2                   0.005943
75) ps_car_12                      0.005358
76) ps_car_12^2                    0.005348
77) ps_reg_02^2                    0.004993
78) ps_reg_02                      0.004986
79) ps_reg_01^2                    0.004140
80) ps_reg_01                      0.004118
81) ps_car_11                      0.003796
82) ps_ind_05_cat_0                0.003564
83) ps_ind_17_bin                  0.002840
84) ps_calc_17_bin                 0.002701
85) ps_calc_16_bin                 0.002597
86) ps_calc_19_bin                 0.002554
87) ps_calc_18_bin                 0.002529
88) ps_ind_04_cat_1                0.002405
89) ps_car_01_cat_11               0.002399
90) ps_ind_16_bin                  0.002393
91) ps_ind_04_cat_0                0.002378
92) ps_ind_07_bin                  0.002333
93) ps_car_09_cat_2                0.002313
94) ps_ind_02_cat_1                0.002269
95) ps_car_09_cat_0                0.002100
96) ps_car_01_cat_7                0.002089
97) ps_ind_02_cat_2                0.002078
98) ps_calc_20_bin                 0.002072
99) ps_ind_06_bin                  0.002041
100) ps_car_06_cat_1                0.002002
101) ps_calc_15_bin                 0.001996
102) ps_car_07_cat_1                0.001966
103) ps_ind_08_bin                  0.001946
104) ps_car_09_cat_1                0.001828
105) ps_car_06_cat_11               0.001787
106) ps_ind_18_bin                  0.001739
107) ps_ind_09_bin                  0.001719
108) ps_car_01_cat_10               0.001598
109) ps_car_01_cat_9                0.001577
110) ps_car_01_cat_6                0.001549
111) ps_car_06_cat_14               0.001547
112) ps_car_01_cat_4                0.001530
113) ps_ind_05_cat_6                0.001501
114) ps_ind_02_cat_3                0.001432
115) ps_car_07_cat_0                0.001369
116) ps_car_02_cat_1                0.001337
117) ps_car_01_cat_8                0.001330
118) ps_car_08_cat_1                0.001327
119) ps_car_02_cat_0                0.001313
120) ps_car_06_cat_4                0.001225
121) ps_ind_05_cat_4                0.001216
122) ps_ind_02_cat_4                0.001156
123) ps_car_01_cat_5                0.001143
124) ps_car_06_cat_6                0.001095
125) ps_car_06_cat_10               0.001055
126) ps_car_04_cat_1                0.001036
127) ps_ind_05_cat_2                0.001030
128) ps_car_06_cat_7                0.001003
129) ps_car_04_cat_2                0.000980
130) ps_car_01_cat_3                0.000885
131) ps_car_09_cat_3                0.000883
132) ps_ind_14                      0.000862
133) ps_car_01_cat_0                0.000854
134) ps_car_06_cat_15               0.000831
135) ps_car_06_cat_9                0.000785
136) ps_ind_05_cat_1                0.000755
137) ps_car_10_cat_1                0.000704
138) ps_car_06_cat_3                0.000698
139) ps_ind_05_cat_3                0.000685
140) ps_ind_12_bin                  0.000671
141) ps_car_09_cat_4                0.000631
142) ps_car_01_cat_2                0.000569
143) ps_car_04_cat_8                0.000557
144) ps_car_06_cat_17               0.000513
145) ps_car_06_cat_16               0.000454
146) ps_car_04_cat_9                0.000443
147) ps_car_06_cat_12               0.000420
148) ps_car_06_cat_13               0.000396
149) ps_car_01_cat_1                0.000381
150) ps_ind_05_cat_5                0.000307
151) ps_car_06_cat_5                0.000284
152) ps_ind_11_bin                  0.000217
153) ps_car_04_cat_6                0.000193
154) ps_ind_13_bin                  0.000150
155) ps_car_04_cat_3                0.000141
156) ps_car_06_cat_2                0.000137
157) ps_car_04_cat_5                0.000100
158) ps_car_06_cat_8                0.000093
159) ps_car_04_cat_7                0.000083
160) ps_ind_10_bin                  0.000074
161) ps_car_10_cat_2                0.000058
162) ps_car_04_cat_4                0.000042
# Keep only the features whose importance is >= the median importance
# (i.e. the top half of the ranking above). prefit=True reuses the
# already-fitted forest instead of refitting.
sfm = SelectFromModel(rf, threshold='median', prefit=True)
print('Number of features before selection: {}'.format(X_train.shape[1]))
# Count the surviving features from the boolean support mask instead of
# materializing the whole transformed matrix just to read its shape.
n_features = sfm.get_support().sum()
print('Number of features after selection: {}'.format(n_features))
selected_vars = list(feat_labels[sfm.get_support()])
Number of features before selection: 162
Number of features after selection: 81
# Keep only the selected features (plus the target column).
train = train[selected_vars + ['target']]
# Standardize features to zero mean / unit variance.
# NOTE(review): the returned scaled array is not assigned anywhere — in
# the notebook this cell only displays it, and downstream code still sees
# the unscaled train. Confirm that is intentional.
scaler = StandardScaler()
scaler.fit_transform(train.drop(['target'], axis=1))
array([[-0.45941104, -1.26665356,  1.05087653, ..., -0.72553616,
        -1.01071913, -1.06173767],
       [ 1.55538958,  0.95034274, -0.63847299, ..., -1.06120876,
        -1.01071913,  0.27907892],
       [ 1.05168943, -0.52765479, -0.92003125, ...,  1.95984463,
        -0.56215309, -1.02449277],
       ...,
       [-0.9631112 ,  0.58084336,  0.48776003, ..., -0.46445747,
         0.18545696,  0.27907892],
       [-0.9631112 , -0.89715418, -1.48314775, ..., -0.91202093,
        -0.41263108,  0.27907892],
       [-0.45941104, -1.26665356,  1.61399304, ...,  0.28148164,
        -0.11358706, -0.72653353]])

Comments