Kaggle 필사 1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
%%time
# Load the Porto Seguro train/test sets.
# NOTE(review): the nested "train.csv/train.csv" path suggests each zip was
# extracted into a folder named after the archive — confirm the layout.
train= pd.read_csv("./porto/train.csv/train.csv")
test= pd.read_csv("./porto/test.csv/test.csv")

Wall time: 6.47 s
# train = train.sample(frac = 0.2)  # subsample here when doing EDA on the full data
# If the data is imbalanced, a stratified split keeps the target ratio intact:
from sklearn.model_selection import StratifiedKFold
#fold = StratifiedKFold(n_splits=10, random_state=1980, shuffle = True)
#for trn_idx, val_idx in fold.split(train, train['target']) :
#    break
#train = train.iloc[trn_idx]
train.shape
(595212, 59)
train.tail()
id target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_14 ps_ind_15 ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_reg_01 ps_reg_02 ps_reg_03 ps_car_01_cat ps_car_02_cat ps_car_03_cat ps_car_04_cat ps_car_05_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04 ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08 ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
595207 1488013 0 3 1 10 0 0 0 0 0 1 0 0 0 0 0 13 1 0 0 0.5 0.3 0.692820 10 1 -1 0 1 1 1 1 0 1 31 3 0.374166 0.684631 0.385487 2.645751 0.4 0.5 0.3 3 0 9 0 9 1 12 4 1 9 6 0 1 1 0 1 1
595208 1488016 0 5 1 3 0 0 0 0 0 1 0 0 0 0 0 6 1 0 0 0.9 0.7 1.382027 9 1 -1 0 -1 15 0 0 2 1 63 2 0.387298 0.972145 -1.000000 3.605551 0.2 0.2 0.0 2 4 8 6 8 2 12 4 1 3 8 1 0 1 0 1 1
595209 1488017 0 1 1 10 0 0 1 0 0 0 0 0 0 0 0 12 1 0 0 0.9 0.2 0.659071 7 1 -1 0 -1 1 1 1 2 1 31 3 0.397492 0.596373 0.398748 1.732051 0.4 0.0 0.3 3 2 7 4 8 0 10 3 2 2 6 0 0 1 0 0 0
595210 1488021 0 5 2 3 1 0 0 0 1 0 0 0 0 0 0 12 1 0 0 0.9 0.4 0.698212 11 1 -1 0 -1 11 1 1 2 1 101 3 0.374166 0.764434 0.384968 3.162278 0.0 0.7 0.0 4 0 9 4 9 2 11 4 1 4 2 0 1 1 1 0 0
595211 1488027 0 0 1 8 0 0 1 0 0 0 0 0 0 0 0 7 1 0 0 0.1 0.2 -1.000000 7 0 -1 0 -1 0 1 0 2 1 34 2 0.400000 0.932649 0.378021 3.741657 0.4 0.0 0.5 2 3 10 4 10 2 5 4 4 3 8 0 1 0 0 0 0
# Collect every categorical feature; by convention their names contain "cat".
cat_cols = [name for name in train.columns if 'cat' in name]
cat_cols
['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']
train[cat_cols[0]].value_counts()
 1    431859
 2    123573
 3     28186
 4     11378
-1       216
Name: ps_ind_02_cat, dtype: int64
# Print the cardinality of each categorical feature.
for column in cat_cols:
    print(column, train[column].nunique())
ps_ind_02_cat 5
ps_ind_04_cat 3
ps_ind_05_cat 8
ps_car_01_cat 13
ps_car_02_cat 3
ps_car_03_cat 3
ps_car_04_cat 10
ps_car_05_cat 3
ps_car_06_cat 18
ps_car_07_cat 3
ps_car_08_cat 2
ps_car_09_cat 6
ps_car_10_cat 3
ps_car_11_cat 104
# BUG FIX: drop_duplicates() returns a new DataFrame; the original call
# discarded the result, so it was a no-op. Assign it back to actually
# deduplicate the rows.
train = train.drop_duplicates()
train.shape
(595212, 59)
train['ps_ind_03'].dtype == 'int64'
True
# Build a per-column metadata table (role, measurement level, keep flag, dtype)
# that drives the feature handling below.
data = []

for f in train.columns:
    # Role: what part the column plays in modelling.
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    # Level: measurement level, inferred from the naming convention and dtype.
    # BUG FIX: `level` used to be assigned only inside the if/elif chain, so a
    # column matching none of the cases silently reused the previous column's
    # level (or raised NameError on the first column). Default it explicitly.
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif train[f].dtype == 'float':
        level = 'interval'
    elif train[f].dtype == 'int64':
        level = 'ordinal'
    else:
        level = None  # unknown measurement level — inspect manually

    # Keep every variable except the row identifier.
    keep = f != 'id'

    # One metadata record per column.
    data.append({
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': train[f].dtype,
    })

meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)
            
meta
role level keep dtype
varname
id id nominal False int64
target target binary True int64
ps_ind_01 input ordinal True int64
ps_ind_02_cat input nominal True int64
ps_ind_03 input ordinal True int64
ps_ind_04_cat input nominal True int64
ps_ind_05_cat input nominal True int64
ps_ind_06_bin input binary True int64
ps_ind_07_bin input binary True int64
ps_ind_08_bin input binary True int64
ps_ind_09_bin input binary True int64
ps_ind_10_bin input binary True int64
ps_ind_11_bin input binary True int64
ps_ind_12_bin input binary True int64
ps_ind_13_bin input binary True int64
ps_ind_14 input ordinal True int64
ps_ind_15 input ordinal True int64
ps_ind_16_bin input binary True int64
ps_ind_17_bin input binary True int64
ps_ind_18_bin input binary True int64
ps_reg_01 input interval True float64
ps_reg_02 input interval True float64
ps_reg_03 input interval True float64
ps_car_01_cat input nominal True int64
ps_car_02_cat input nominal True int64
ps_car_03_cat input nominal True int64
ps_car_04_cat input nominal True int64
ps_car_05_cat input nominal True int64
ps_car_06_cat input nominal True int64
ps_car_07_cat input nominal True int64
ps_car_08_cat input nominal True int64
ps_car_09_cat input nominal True int64
ps_car_10_cat input nominal True int64
ps_car_11_cat input nominal True int64
ps_car_11 input ordinal True int64
ps_car_12 input interval True float64
ps_car_13 input interval True float64
ps_car_14 input interval True float64
ps_car_15 input interval True float64
ps_calc_01 input interval True float64
ps_calc_02 input interval True float64
ps_calc_03 input interval True float64
ps_calc_04 input ordinal True int64
ps_calc_05 input ordinal True int64
ps_calc_06 input ordinal True int64
ps_calc_07 input ordinal True int64
ps_calc_08 input ordinal True int64
ps_calc_09 input ordinal True int64
ps_calc_10 input ordinal True int64
ps_calc_11 input ordinal True int64
ps_calc_12 input ordinal True int64
ps_calc_13 input ordinal True int64
ps_calc_14 input ordinal True int64
ps_calc_15_bin input binary True int64
ps_calc_16_bin input binary True int64
ps_calc_17_bin input binary True int64
ps_calc_18_bin input binary True int64
ps_calc_19_bin input binary True int64
ps_calc_20_bin input binary True int64
meta[(meta.level =='nominal')& (meta.keep)].index
Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
       'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
       'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
       'ps_car_10_cat', 'ps_car_11_cat'],
      dtype='object', name='varname')
pd.DataFrame({'count' : meta.groupby(['role','level'])['role'].size()}).reset_index()
role level count
0 id nominal 1
1 input binary 17
2 input interval 10
3 input nominal 14
4 input ordinal 16
5 target binary 1
meta.groupby(['role','level'])['role'].size()
role    level   
id      nominal      1
input   binary      17
        interval    10
        nominal     14
        ordinal     16
target  binary       1
Name: role, dtype: int64
v = meta[(meta.level == 'interval') & (meta.keep)].index
train[v].describe()
ps_reg_01 ps_reg_02 ps_reg_03 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03
count 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000
mean 0.610991 0.439184 0.551102 0.379945 0.813265 0.276256 3.065899 0.449756 0.449589 0.449849
std 0.287643 0.404264 0.793506 0.058327 0.224588 0.357154 0.731366 0.287198 0.286893 0.287153
min 0.000000 0.000000 -1.000000 -1.000000 0.250619 -1.000000 0.000000 0.000000 0.000000 0.000000
25% 0.400000 0.200000 0.525000 0.316228 0.670867 0.333167 2.828427 0.200000 0.200000 0.200000
50% 0.700000 0.300000 0.720677 0.374166 0.765811 0.368782 3.316625 0.500000 0.400000 0.500000
75% 0.900000 0.600000 1.000000 0.400000 0.906190 0.396485 3.605551 0.700000 0.700000 0.700000
max 0.900000 1.800000 4.037945 1.264911 3.720626 0.636396 3.741657 0.900000 0.900000 0.900000
v = meta[(meta.level == 'binary') & (meta.keep)].index
train[v].describe()
target ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
count 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000 595212.000000
mean 0.036448 0.393742 0.257033 0.163921 0.185304 0.000373 0.001692 0.009439 0.000948 0.660823 0.121081 0.153446 0.122427 0.627840 0.554182 0.287182 0.349024 0.153318
std 0.187401 0.488579 0.436998 0.370205 0.388544 0.019309 0.041097 0.096693 0.030768 0.473430 0.326222 0.360417 0.327779 0.483381 0.497056 0.452447 0.476662 0.360295
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000
75% 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

Handling Imbalanced classes

  • 3가지 전략
    • oversampling records with target == 1
    • undersampling records with target == 0
    • SMOTE 사용
# Target share of positives we want after undersampling the majority class.
desired_apriori = 0.1

# Row indices per class (target is 0/1).
idx_0 = train[train.target == 0].index
idx_1 = train[train.target == 1].index

nb_0 = len(idx_0)
nb_1 = len(idx_1)
# Fraction of class-0 rows to keep so class 1 ends up at desired_apriori.
undersampling_rate = (1 - desired_apriori) * nb_1 / (nb_0 * desired_apriori)
undersampling_rate
0.34043569687437886
# Number of majority-class rows that survive the undersampling.
undersampled_nb_0 = int(undersampling_rate * nb_0)
print(f'Rate to undersample records with target=0: {undersampling_rate}')
print(f'Number of records with target=0 after undersampling: {undersampled_nb_0}')
Rate to undersample records with target=0: 0.34043569687437886
Number of records with target=0 after undersampling: 195246
# Randomly pick the class-0 rows to keep, then rebuild train from the kept
# class-0 rows plus all class-1 rows.
kept_idx_0 = shuffle(idx_0, random_state=37, n_samples=undersampled_nb_0)
train = train.loc[list(kept_idx_0) + list(idx_1)].reset_index(drop=True)
#import missingno as msno
#msno.matrix(train)
display( train.isnull().sum(axis =0) )
id                0
target            0
ps_ind_01         0
ps_ind_02_cat     0
ps_ind_03         0
ps_ind_04_cat     0
ps_ind_05_cat     0
ps_ind_06_bin     0
ps_ind_07_bin     0
ps_ind_08_bin     0
ps_ind_09_bin     0
ps_ind_10_bin     0
ps_ind_11_bin     0
ps_ind_12_bin     0
ps_ind_13_bin     0
ps_ind_14         0
ps_ind_15         0
ps_ind_16_bin     0
ps_ind_17_bin     0
ps_ind_18_bin     0
ps_reg_01         0
ps_reg_02         0
ps_reg_03         0
ps_car_01_cat     0
ps_car_02_cat     0
ps_car_03_cat     0
ps_car_04_cat     0
ps_car_05_cat     0
ps_car_06_cat     0
ps_car_07_cat     0
ps_car_08_cat     0
ps_car_09_cat     0
ps_car_10_cat     0
ps_car_11_cat     0
ps_car_11         0
ps_car_12         0
ps_car_13         0
ps_car_14         0
ps_car_15         0
ps_calc_01        0
ps_calc_02        0
ps_calc_03        0
ps_calc_04        0
ps_calc_05        0
ps_calc_06        0
ps_calc_07        0
ps_calc_08        0
ps_calc_09        0
ps_calc_10        0
ps_calc_11        0
ps_calc_12        0
ps_calc_13        0
ps_calc_14        0
ps_calc_15_bin    0
ps_calc_16_bin    0
ps_calc_17_bin    0
ps_calc_18_bin    0
ps_calc_19_bin    0
ps_calc_20_bin    0
dtype: int64
vars_with_missing = []

# In this dataset missing values are encoded as -1, not NaN.
for f in train.columns:
    n_missing = (train[f] == -1).sum()
    if n_missing > 0:
        vars_with_missing.append(f)
        pct_missing = n_missing / train.shape[0]
        print(f'Variable {f} has {n_missing} records ({pct_missing:.2%}) with missing values')

print(f'In total, there are {len(vars_with_missing)} variables with missing values')
Variable ps_ind_02_cat has 103 records (0.05%) with missing values
Variable ps_ind_04_cat has 51 records (0.02%) with missing values
Variable ps_ind_05_cat has 2256 records (1.04%) with missing values
Variable ps_reg_03 has 38580 records (17.78%) with missing values
Variable ps_car_01_cat has 62 records (0.03%) with missing values
Variable ps_car_02_cat has 2 records (0.00%) with missing values
Variable ps_car_03_cat has 148367 records (68.39%) with missing values
Variable ps_car_05_cat has 96026 records (44.26%) with missing values
Variable ps_car_07_cat has 4431 records (2.04%) with missing values
Variable ps_car_09_cat has 230 records (0.11%) with missing values
Variable ps_car_11 has 1 records (0.00%) with missing values
Variable ps_car_14 has 15726 records (7.25%) with missing values
In total, there are 12 variables with missing values

- ps_car_03_cat이 약 68% missing value이므로 전체 삭제

train[['ps_car_03_cat','target']].groupby('ps_car_03_cat').mean()
target
ps_car_03_cat
-1 0.090654
0 0.106983
1 0.128862
train['ps_car_03_cat'].describe()
count    216940.000000
mean         -0.492639
std           0.795291
min          -1.000000
25%          -1.000000
50%          -1.000000
75%           0.000000
max           1.000000
Name: ps_car_03_cat, dtype: float64
train['ps_car_03_cat'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x2398fb97108>

png

여기서는 결측값을 평균으로 대체하지만, 좀 더 면밀하게 분석한 뒤 대체 방법을 정할 필요가 있다.

# Drop the two categoricals dominated by missing values (ps_car_03_cat ~68%,
# ps_car_05_cat ~44% per the scan above) and record the decision in meta.
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']

train.drop(vars_to_drop, axis =1, inplace = True)
meta.loc[(vars_to_drop), 'keep'] = False
# Group mean of ps_reg_03 per (ps_car_01_cat, ps_car_02_cat) pair — a candidate
# group-wise imputation for the missing ps_reg_03 values.
temp_series = train[['ps_car_01_cat','ps_car_02_cat','ps_reg_03']].groupby(['ps_car_01_cat','ps_car_02_cat']).mean()
temp_series.reset_index(inplace= True)
# NOTE(review): this merge is only displayed, never assigned — the actual
# imputation below uses a global mean via SimpleImputer instead.
train.loc[train['ps_reg_03'] == -1].merge(temp_series, on = ['ps_car_01_cat','ps_car_02_cat'] , how = 'left')
id target ps_ind_01 ps_ind_02_cat ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin ps_ind_14 ps_ind_15 ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_reg_01 ps_reg_02 ps_reg_03_x ps_car_01_cat ps_car_02_cat ps_car_04_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04 ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08 ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin ps_reg_03_y
0 1254786 0 7 1 9 0 0 0 1 0 0 0 0 0 0 0 12 1 0 0 0.2 0.2 -1.0 0 1 9 13 1 0 0 1 104 2 0.565685 2.108264 0.530094 3.741657 0.7 0.6 0.4 2 2 9 3 11 1 12 2 1 2 10 0 1 0 0 0 0 0.505736
1 1425558 0 1 2 0 1 4 0 0 0 1 0 0 0 0 0 8 1 0 0 0.1 0.3 -1.0 8 1 0 14 1 1 0 1 104 1 0.316070 0.508502 0.355668 1.732051 0.0 0.4 0.1 1 2 9 2 12 3 6 2 1 4 10 0 0 0 0 0 0 0.515147
2 860206 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 7 1 0 0 0.4 0.1 -1.0 7 1 0 1 1 1 2 1 64 1 0.316228 0.656405 0.361939 3.316625 0.4 0.8 0.3 3 1 6 3 7 2 7 6 0 3 7 1 0 1 0 1 0 0.333647
3 1265316 0 4 2 4 1 0 0 0 1 0 0 0 0 0 0 6 0 0 1 0.1 0.3 -1.0 4 1 0 1 1 1 0 1 65 1 0.316228 0.545795 0.350714 2.449490 0.6 0.6 0.5 1 1 7 1 12 6 6 6 2 4 10 0 1 1 1 0 0 0.244300
4 267652 0 2 1 1 0 0 1 0 0 0 0 0 0 0 0 12 1 0 0 0.3 0.3 -1.0 7 0 0 10 1 1 2 1 55 2 0.424264 1.116425 0.416533 3.605551 0.2 0.1 0.9 2 0 8 4 12 3 9 7 0 2 6 0 1 0 1 1 0 0.477232
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
38575 1486851 1 5 1 10 0 0 0 0 1 0 0 0 0 0 0 5 0 1 0 0.1 0.3 -1.0 11 0 9 17 1 1 2 1 104 2 0.447214 1.209873 0.430116 3.316625 0.2 0.4 0.9 2 3 7 4 8 4 8 7 0 6 10 0 0 0 1 1 0 0.985341
38576 1487090 1 2 1 6 0 6 1 0 0 0 0 0 0 0 0 12 1 0 0 0.1 0.3 -1.0 7 1 0 0 1 1 2 1 37 2 0.316228 0.740728 0.311448 3.316625 0.4 0.7 0.0 1 2 9 6 9 1 10 6 1 3 15 0 0 1 0 0 1 0.333647
38577 1487406 1 3 1 8 1 0 0 1 0 0 0 0 0 0 0 7 1 0 0 0.4 0.0 -1.0 7 1 0 1 1 1 2 1 64 3 0.316228 0.613586 0.301662 2.828427 0.9 0.5 0.5 3 1 10 2 9 1 3 5 0 3 4 0 1 0 0 1 0 0.333647
38578 1487419 1 2 1 6 0 0 1 0 0 0 0 0 0 0 0 9 1 0 0 0.3 0.3 -1.0 4 1 0 0 1 1 0 1 34 2 0.400000 0.795156 0.378021 3.162278 0.7 0.1 0.9 3 4 9 3 9 2 8 11 1 5 5 0 1 1 0 0 0 0.244300
38579 1487566 1 1 1 5 0 0 0 0 1 0 0 0 0 0 0 4 0 0 1 0.3 0.4 -1.0 11 0 0 15 1 1 2 1 5 2 0.424264 0.756979 0.400000 2.000000 0.3 0.4 0.6 1 1 8 2 9 5 9 9 2 1 5 0 1 0 0 0 0 0.985341

38580 rows × 58 columns

# Impute the remaining -1 placeholders: mean for the continuous features,
# most-frequent for the ordinal ps_car_11.
mean_imp = SimpleImputer(missing_values= -1, strategy='mean')
mode_imp = SimpleImputer(missing_values= -1, strategy ='most_frequent')
# .ravel() flattens the (n, 1) transformer output back to a 1-D column.
train['ps_reg_03'] = mean_imp.fit_transform(train[['ps_reg_03']]).ravel()
train['ps_car_12'] = mean_imp.fit_transform(train[['ps_car_12']]).ravel()
train['ps_car_14'] = mean_imp.fit_transform(train[['ps_car_14']]).ravel()
train['ps_car_11'] = mode_imp.fit_transform(train[['ps_car_11']]).ravel()
# Count distinct values of each remaining nominal feature (pre-dummification).
v = meta[(meta.level == 'nominal' ) & (meta.keep)].index
# NOTE(review): `sum` shadows the builtin. It is echoed again further down in
# this notebook, so it is left as-is here — consider renaming to `total`.
sum = 0

for f in v :
    dist_values = train[f].value_counts().shape[0]
    sum+= dist_values
    print('Variable {} has {} distinct values'.format(f, dist_values))
Variable ps_ind_02_cat has 5 distinct values
Variable ps_ind_04_cat has 3 distinct values
Variable ps_ind_05_cat has 8 distinct values
Variable ps_car_01_cat has 13 distinct values
Variable ps_car_02_cat has 3 distinct values
Variable ps_car_04_cat has 10 distinct values
Variable ps_car_06_cat has 18 distinct values
Variable ps_car_07_cat has 3 distinct values
Variable ps_car_08_cat has 2 distinct values
Variable ps_car_09_cat has 6 distinct values
Variable ps_car_10_cat has 3 distinct values
Variable ps_car_11_cat has 104 distinct values
train[f].value_counts().shape
(104,)
sum
178

def add_noise(series, noise_level):
    """Return *series* scaled by 1 + Gaussian noise of magnitude *noise_level*.

    With noise_level=0 this is the identity (times 1.0). Uses the global
    numpy RNG, so results depend on the current seed state.
    """
    noise = np.random.randn(len(series)) * noise_level
    return series * (1 + noise)

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed target (mean) encoding of a high-cardinality categorical feature.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior

    Returns a (train, test) pair of encoded Series, each with optional
    multiplicative Gaussian noise applied via add_noise.
    """ 
    assert len(trn_series) == len(target)  # feature and target must be row-aligned
    assert trn_series.name == tst_series.name  # train/test must encode the same column
    temp = pd.concat([trn_series, target], axis=1)  # feature and target side by side
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])  # per-category mean & count
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),  # expose the smoothed mean as 'average'
        on=trn_series.name,                                                                 # merge on the category value
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)  # how='left' keeps every row; unseen categories get NaN and fall back to the prior
    
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)
# Target-encode the highest-cardinality categorical (104 levels) rather than
# one-hot encoding it, then drop the raw column from both frames.
train_encoded, test_encoded = target_encode(train["ps_car_11_cat"], 
                             test["ps_car_11_cat"], 
                             target=train.target, 
                             min_samples_leaf=100,
                             smoothing=10,
                             noise_level=0.01)
    
train['ps_car_11_cat_te'] = train_encoded
train.drop('ps_car_11_cat', axis=1, inplace=True)
meta.loc['ps_car_11_cat','keep'] = False  # Updating the meta
test['ps_car_11_cat_te'] = test_encoded
test.drop('ps_car_11_cat', axis=1, inplace=True)

뜯어보기

#v = meta[(meta.level == 'nominal') & (meta.keep)].index

#cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

<ipython-input-50-b8a6304ecf30> in <module>
----> 1 cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()


C:\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2804             if is_iterator(key):
   2805                 key = list(key)
-> 2806             indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
   2807 
   2808         # take() does not accept boolean indexers


C:\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1550 
   1551         self._validate_read_indexer(
-> 1552             keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
   1553         )
   1554         return keyarr, indexer


C:\Anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1643             if not (self.name == "loc" and not raise_missing):
   1644                 not_found = list(set(key) - set(ax))
-> 1645                 raise KeyError(f"{not_found} not in index")
   1646 
   1647             # we skip the warning on Categorical/Interval


KeyError: "['ps_car_11_cat'] not in index"
#cat_perc
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-51-2ac23d9b437a> in <module>
----> 1 cat_perc


NameError: name 'cat_perc' is not defined
#cat_perc.sort_values(by='target', ascending=False, inplace = True)
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-52-72bd5321c2f0> in <module>
----> 1 cat_perc.sort_values(by='target', ascending=False, inplace = True)


NameError: name 'cat_perc' is not defined
#sns.barplot(x=f, y = 'target', data=cat_perc,order = cat_perc[f])

---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-53-579ec202cb69> in <module>
----> 1 sns.barplot(x=f, y = 'target', data=cat_perc,order = cat_perc[f])


NameError: name 'cat_perc' is not defined

다시 코드

# Bar-plot the target rate per category for every remaining nominal feature.
v = meta[(meta.level == 'nominal') & (meta.keep)].index

for f in v:
    # BUG FIX: the original called plt.figure() before plt.subplots(), which
    # created an extra empty figure per iteration (the blank
    # "<Figure size 432x288 with 0 Axes>" outputs). plt.subplots() already
    # creates the figure it draws on.
    fig, ax = plt.subplots(figsize=(20,10))
    # Calculate the percentage of target=1 per category value
    cat_perc = train[[f, 'target']].groupby([f],as_index=False).mean()
    cat_perc.sort_values(by='target', ascending=False, inplace=True)
    # Bar plot, with bars ordered descending on target mean
    sns.barplot(ax=ax, x=f, y='target', data=cat_perc, order=cat_perc[f])
    plt.ylabel('% target', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()
<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

<Figure size 432x288 with 0 Axes>

png

cat_perc
ps_car_10_cat target
1 1 0.100029
2 2 0.100000
0 0 0.096420
# Inspect mean and count of the target per ps_car_02_cat level (including -1).
f = 'ps_car_02_cat'
cat_perc = train[[f, 'target']].groupby([f], as_index=False).agg(['mean','count'])
cat_perc
target
mean count
ps_car_02_cat
-1 0.000000 2
0 0.131868 38000
1 0.093233 178938
  • 관측값이 2개밖에 없는데 둘다 -1이여서 확률이 0으로 나타남.
  • 여기서는 결측치가 의미가 있다고 판단하여 삭제하지 않음.
# Pairwise Pearson correlations between the interval (continuous) features.
v = meta[(meta.level == 'interval') & (meta.keep)].index
correlations = train[v].corr()
correlations
ps_reg_01 ps_reg_02 ps_reg_03 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03
ps_reg_01 1.000000 0.470953 0.137117 0.019095 0.025243 -0.002536 0.001755 -0.003236 0.001459 -0.001371
ps_reg_02 0.470953 1.000000 0.702512 0.173736 0.193896 0.053149 0.052344 -0.001769 -0.000726 -0.000992
ps_reg_03 0.137117 0.702512 1.000000 0.208978 0.241244 0.079541 0.079848 -0.000223 0.000043 -0.000357
ps_car_12 0.019095 0.173736 0.208978 1.000000 0.674298 0.577537 0.049468 -0.000452 -0.001070 -0.000707
ps_car_13 0.025243 0.193896 0.241244 0.674298 1.000000 0.434613 0.526024 0.000266 0.000020 0.000568
ps_car_14 -0.002536 0.053149 0.079541 0.577537 0.434613 1.000000 0.008472 -0.004548 -0.005015 0.000776
ps_car_15 0.001755 0.052344 0.079848 0.049468 0.526024 0.008472 1.000000 -0.000392 0.003630 0.000586
ps_calc_01 -0.003236 -0.001769 -0.000223 -0.000452 0.000266 -0.004548 -0.000392 1.000000 0.002832 -0.000212
ps_calc_02 0.001459 -0.000726 0.000043 -0.001070 0.000020 -0.005015 0.003630 0.002832 1.000000 0.003130
ps_calc_03 -0.001371 -0.000992 -0.000357 -0.000707 0.000568 0.000776 0.000586 -0.000212 0.003130 1.000000
# Annotated heatmap of the correlation matrix computed above.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# BUG FIX: the three statements below were indented as if inside a block,
# which raises IndentationError at top level. Dedented to plain statements
# (this is the pre-refactor version of corr_heatmap defined below).
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(correlations, cmap=cmap, vmax=1.0, center=0, fmt='.2f',
            square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .75})
plt.show()

png

def corr_heatmap(v):
    """Draw an annotated correlation heatmap for the train columns in *v*.

    Reads the notebook-global `train` DataFrame; shows the plot as a side
    effect and returns nothing.
    """
    corr_matrix = train[v].corr()

    # Diverging palette centred on zero correlation.
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(corr_matrix, cmap=palette, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True,
                cbar_kws={"shrink": .75})
    plt.show()
    
v = meta[(meta.level == 'interval') & (meta.keep)].index
corr_heatmap(v)

png

# Regression/scatter plots are expensive on ~200k rows — plot a 10% sample.
s = train.sample(frac=0.1)
sns.lmplot(x='ps_reg_02', y='ps_reg_03', data=s, hue= 'target', palette='Set1',scatter_kws = {'alpha' : 0.3})
plt.show()

png

sns.lmplot(x='ps_car_12', y='ps_car_13', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()

png

sns.lmplot(x='ps_car_12', y='ps_car_14', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()

png

sns.lmplot(x='ps_car_15', y='ps_car_13', data=s, hue='target', palette='Set1', scatter_kws={'alpha':0.3})
plt.show()

png

v = meta[(meta.level == 'ordinal') & (meta.keep)].index
corr_heatmap(v)

png

#PCA를 이용해서 결정

# One-hot encode the remaining nominal features; drop_first removes the
# redundant reference level of each variable.
v = meta[(meta.level == 'nominal') & (meta.keep)].index
print(f'Before dummification we have {train.shape[1]} variables in train')
train = pd.get_dummies(train, columns=v, drop_first=True)
print(f'After dummification we have {train.shape[1]} variables in train')
Before dummification we have 57 variables in train
After dummification we have 109 variables in train

# Generate degree-2 polynomial and interaction terms for the interval features.
v = meta[(meta.level == 'interval') & (meta.keep)].index

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias = False) # squares and pairwise products, no bias column
# NOTE(review): the transformed array is only displayed here, not stored; the
# DataFrame wrapping happens further down. Also, get_feature_names (used
# below) was removed in scikit-learn 1.2 in favour of get_feature_names_out —
# confirm the pinned sklearn version before upgrading.
poly.fit_transform(train[v])
array([[0.6       , 0.6       , 0.83815273, ..., 0.09      , 0.        ,
        0.        ],
       [0.9       , 0.6       , 0.72844011, ..., 0.        , 0.        ,
        0.36      ],
       [0.9       , 0.6       , 0.86926693, ..., 0.81      , 0.09      ,
        0.01      ],
       ...,
       [0.9       , 0.3       , 0.71195154, ..., 0.16      , 0.24      ,
        0.36      ],
       [0.6       , 0.1       , 0.57716982, ..., 0.04      , 0.12      ,
        0.36      ],
       [0.6       , 0.4       , 1.09515981, ..., 0.36      , 0.18      ,
        0.09      ]])
poly.get_feature_names(v)
['ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15',
 'ps_calc_01',
 'ps_calc_02',
 'ps_calc_03',
 'ps_reg_01^2',
 'ps_reg_01 ps_reg_02',
 'ps_reg_01 ps_reg_03',
 'ps_reg_01 ps_car_12',
 'ps_reg_01 ps_car_13',
 'ps_reg_01 ps_car_14',
 'ps_reg_01 ps_car_15',
 'ps_reg_01 ps_calc_01',
 'ps_reg_01 ps_calc_02',
 'ps_reg_01 ps_calc_03',
 'ps_reg_02^2',
 'ps_reg_02 ps_reg_03',
 'ps_reg_02 ps_car_12',
 'ps_reg_02 ps_car_13',
 'ps_reg_02 ps_car_14',
 'ps_reg_02 ps_car_15',
 'ps_reg_02 ps_calc_01',
 'ps_reg_02 ps_calc_02',
 'ps_reg_02 ps_calc_03',
 'ps_reg_03^2',
 'ps_reg_03 ps_car_12',
 'ps_reg_03 ps_car_13',
 'ps_reg_03 ps_car_14',
 'ps_reg_03 ps_car_15',
 'ps_reg_03 ps_calc_01',
 'ps_reg_03 ps_calc_02',
 'ps_reg_03 ps_calc_03',
 'ps_car_12^2',
 'ps_car_12 ps_car_13',
 'ps_car_12 ps_car_14',
 'ps_car_12 ps_car_15',
 'ps_car_12 ps_calc_01',
 'ps_car_12 ps_calc_02',
 'ps_car_12 ps_calc_03',
 'ps_car_13^2',
 'ps_car_13 ps_car_14',
 'ps_car_13 ps_car_15',
 'ps_car_13 ps_calc_01',
 'ps_car_13 ps_calc_02',
 'ps_car_13 ps_calc_03',
 'ps_car_14^2',
 'ps_car_14 ps_car_15',
 'ps_car_14 ps_calc_01',
 'ps_car_14 ps_calc_02',
 'ps_car_14 ps_calc_03',
 'ps_car_15^2',
 'ps_car_15 ps_calc_01',
 'ps_car_15 ps_calc_02',
 'ps_car_15 ps_calc_03',
 'ps_calc_01^2',
 'ps_calc_01 ps_calc_02',
 'ps_calc_01 ps_calc_03',
 'ps_calc_02^2',
 'ps_calc_02 ps_calc_03',
 'ps_calc_03^2']
pd.DataFrame(data=poly.fit_transform(train[v]), columns=poly.get_feature_names(v))
ps_reg_01 ps_reg_02 ps_reg_03 ps_car_12 ps_car_13 ps_car_14 ps_car_15 ps_calc_01 ps_calc_02 ps_calc_03 ps_reg_01^2 ps_reg_01 ps_reg_02 ps_reg_01 ps_reg_03 ps_reg_01 ps_car_12 ps_reg_01 ps_car_13 ps_reg_01 ps_car_14 ps_reg_01 ps_car_15 ps_reg_01 ps_calc_01 ps_reg_01 ps_calc_02 ps_reg_01 ps_calc_03 ps_reg_02^2 ps_reg_02 ps_reg_03 ps_reg_02 ps_car_12 ps_reg_02 ps_car_13 ps_reg_02 ps_car_14 ps_reg_02 ps_car_15 ps_reg_02 ps_calc_01 ps_reg_02 ps_calc_02 ps_reg_02 ps_calc_03 ps_reg_03^2 ps_reg_03 ps_car_12 ps_reg_03 ps_car_13 ps_reg_03 ps_car_14 ps_reg_03 ps_car_15 ps_reg_03 ps_calc_01 ps_reg_03 ps_calc_02 ps_reg_03 ps_calc_03 ps_car_12^2 ps_car_12 ps_car_13 ps_car_12 ps_car_14 ps_car_12 ps_car_15 ps_car_12 ps_calc_01 ps_car_12 ps_calc_02 ps_car_12 ps_calc_03 ps_car_13^2 ps_car_13 ps_car_14 ps_car_13 ps_car_15 ps_car_13 ps_calc_01 ps_car_13 ps_calc_02 ps_car_13 ps_calc_03 ps_car_14^2 ps_car_14 ps_car_15 ps_car_14 ps_calc_01 ps_car_14 ps_calc_02 ps_car_14 ps_calc_03 ps_car_15^2 ps_car_15 ps_calc_01 ps_car_15 ps_calc_02 ps_car_15 ps_calc_03 ps_calc_01^2 ps_calc_01 ps_calc_02 ps_calc_01 ps_calc_03 ps_calc_02^2 ps_calc_02 ps_calc_03 ps_calc_03^2
0 0.6 0.6 0.838153 0.368782 0.540603 0.345688 2.000000 0.9 0.3 0.0 0.36 0.36 0.502892 0.221269 0.324362 0.207413 1.200000 0.54 0.18 0.00 0.36 0.502892 0.221269 0.324362 0.207413 1.200000 0.54 0.18 0.00 0.702500 0.309095 0.453108 0.289739 1.676305 0.754337 0.251446 0.000000 0.136 0.199365 0.127483 0.737564 0.331904 0.110635 0.000000 0.292252 0.186880 1.081207 0.486543 0.162181 0.000000 0.1195 0.691375 0.311119 0.103706 0.000000 4.0 1.800000 0.600000 0.000000 0.81 0.27 0.00 0.09 0.00 0.00
1 0.9 0.6 0.728440 0.424264 0.382953 0.378814 0.000000 0.2 0.0 0.6 0.81 0.54 0.655596 0.381838 0.344658 0.340933 0.000000 0.18 0.00 0.54 0.36 0.437064 0.254558 0.229772 0.227288 0.000000 0.12 0.00 0.36 0.530625 0.309051 0.278958 0.275943 0.000000 0.145688 0.000000 0.437064 0.180 0.162473 0.160717 0.000000 0.084853 0.000000 0.254558 0.146653 0.145068 0.000000 0.076591 0.000000 0.229772 0.1435 0.000000 0.075763 0.000000 0.227288 0.0 0.000000 0.000000 0.000000 0.04 0.00 0.12 0.00 0.00 0.36
2 0.9 0.6 0.869267 0.400000 0.814271 0.402368 3.316625 0.3 0.9 0.1 0.81 0.54 0.782340 0.360000 0.732844 0.362131 2.984962 0.27 0.81 0.09 0.36 0.521560 0.240000 0.488563 0.241421 1.989975 0.18 0.54 0.06 0.755625 0.347707 0.707819 0.349765 2.883032 0.260780 0.782340 0.086927 0.160 0.325708 0.160947 1.326650 0.120000 0.360000 0.040000 0.663037 0.327637 2.700631 0.244281 0.732844 0.081427 0.1619 1.334504 0.120710 0.362131 0.040237 11.0 0.994987 2.984962 0.331662 0.09 0.27 0.03 0.81 0.09 0.01
3 0.6 1.5 1.705872 0.400000 0.838387 0.378418 3.605551 0.8 0.4 0.1 0.36 0.90 1.023523 0.240000 0.503032 0.227051 2.163331 0.48 0.24 0.06 2.25 2.558808 0.600000 1.257580 0.567627 5.408327 1.20 0.60 0.15 2.910000 0.682349 1.430181 0.645532 6.150610 1.364698 0.682349 0.170587 0.160 0.335355 0.151367 1.442221 0.320000 0.160000 0.040000 0.702893 0.317260 3.022847 0.670710 0.335355 0.083839 0.1432 1.364405 0.302734 0.151367 0.037842 13.0 2.884441 1.442221 0.360555 0.64 0.32 0.08 0.16 0.04 0.01
4 0.8 0.8 1.086566 0.400000 0.905777 0.384838 3.605551 0.6 0.5 0.9 0.64 0.64 0.869253 0.320000 0.724622 0.307870 2.884441 0.48 0.40 0.72 0.64 0.869253 0.320000 0.724622 0.307870 2.884441 0.48 0.40 0.72 1.180625 0.434626 0.984186 0.418151 3.917668 0.651939 0.543283 0.977909 0.160 0.362311 0.153935 1.442221 0.240000 0.200000 0.360000 0.820432 0.348577 3.265825 0.543466 0.452888 0.815199 0.1481 1.387552 0.230903 0.192419 0.346354 13.0 2.163331 1.802776 3.244996 0.36 0.30 0.54 0.25 0.45 0.81
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
216935 0.6 0.4 1.537652 0.424264 1.269111 0.384708 3.162278 0.5 0.1 0.5 0.36 0.24 0.922591 0.254558 0.761467 0.230825 1.897367 0.30 0.06 0.30 0.16 0.615061 0.169706 0.507645 0.153883 1.264911 0.20 0.04 0.20 2.364375 0.652371 1.951452 0.591547 4.862484 0.768826 0.153765 0.768826 0.180 0.538438 0.163218 1.341641 0.212132 0.042426 0.212132 1.610644 0.488237 4.013282 0.634556 0.126911 0.634556 0.1480 1.216553 0.192354 0.038471 0.192354 10.0 1.581139 0.316228 1.581139 0.25 0.05 0.25 0.01 0.05 0.25
216936 0.3 0.4 0.898861 0.424264 0.756979 0.400000 2.000000 0.3 0.4 0.6 0.09 0.12 0.269658 0.127279 0.227094 0.120000 0.600000 0.09 0.12 0.18 0.16 0.359544 0.169706 0.302791 0.160000 0.800000 0.12 0.16 0.24 0.807951 0.381354 0.680419 0.359544 1.797722 0.269658 0.359544 0.539317 0.180 0.321159 0.169706 0.848528 0.127279 0.169706 0.254558 0.573017 0.302791 1.513957 0.227094 0.302791 0.454187 0.1600 0.800000 0.120000 0.160000 0.240000 4.0 0.600000 0.800000 1.200000 0.09 0.12 0.18 0.16 0.24 0.36
216937 0.9 0.3 0.711952 0.400000 0.970654 0.372424 3.464102 0.5 0.4 0.6 0.81 0.27 0.640756 0.360000 0.873589 0.335182 3.117691 0.45 0.36 0.54 0.09 0.213585 0.120000 0.291196 0.111727 1.039230 0.15 0.12 0.18 0.506875 0.284781 0.691059 0.265148 2.466272 0.355976 0.284781 0.427171 0.160 0.388262 0.148970 1.385641 0.200000 0.160000 0.240000 0.942169 0.361495 3.362445 0.485327 0.388262 0.582392 0.1387 1.290116 0.186212 0.148970 0.223455 12.0 1.732051 1.385641 2.078461 0.25 0.20 0.30 0.16 0.24 0.36
216938 0.6 0.1 0.577170 0.316228 0.876295 0.320780 3.741657 0.5 0.2 0.6 0.36 0.06 0.346302 0.189737 0.525777 0.192468 2.244994 0.30 0.12 0.36 0.01 0.057717 0.031623 0.087629 0.032078 0.374166 0.05 0.02 0.06 0.333125 0.182517 0.505771 0.185145 2.159572 0.288585 0.115434 0.346302 0.100 0.277109 0.101440 1.183216 0.158114 0.063246 0.189737 0.767893 0.281098 3.278795 0.438147 0.175259 0.525777 0.1029 1.200250 0.160390 0.064156 0.192468 14.0 1.870829 0.748331 2.244994 0.25 0.10 0.30 0.04 0.12 0.36
216939 0.6 0.4 1.095160 0.374166 0.752558 0.328634 3.464102 0.2 0.6 0.3 0.36 0.24 0.657096 0.224499 0.451535 0.197180 2.078461 0.12 0.36 0.18 0.16 0.438064 0.149666 0.301023 0.131453 1.385641 0.08 0.24 0.12 1.199375 0.409771 0.824171 0.359906 3.793745 0.219032 0.657096 0.328548 0.140 0.281581 0.122963 1.296148 0.074833 0.224499 0.112250 0.566343 0.247316 2.606936 0.150512 0.451535 0.225767 0.1080 1.138420 0.065727 0.197180 0.098590 12.0 0.692820 2.078461 1.039230 0.04 0.12 0.06 0.36 0.18 0.09

216940 rows × 65 columns

# Build the polynomial interaction features and append them to train.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead.
interactions = pd.DataFrame(data=poly.fit_transform(train[v]),
                            columns=poly.get_feature_names(v))
# Drop the degree-1 columns (they duplicate existing train columns);
# keep only the squared/interaction terms.
interactions.drop(v, axis=1, inplace=True)
# Align indices before the column-wise concat: poly.fit_transform returns
# a fresh 0..n-1 RangeIndex, while train may carry a non-default index
# (e.g. after sampling) — without this, concat would misalign rows and
# fill NaNs instead of joining positionally.
interactions.index = train.index
train = pd.concat([train, interactions], axis=1)  # columns side by side; axis=0 would stack rows
# Flag near-constant features: columns whose variance is below 0.01.
selector = VarianceThreshold(threshold=.01)
selector.fit(train.drop(['id', 'target'], axis=1))

# Columns REJECTED by the variance threshold: selector.get_support() is
# True for the columns that are KEPT, so invert the mask with ~ (this
# replaces the roundabout np.vectorize(lambda x: not x) trick).
v = train.drop(['id', 'target'], axis=1).columns[~selector.get_support()]
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))
28 variables have too low variance.
These variavels are ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_12', 'ps_car_14', 'ps_car_11_cat_te', 'ps_ind_05_cat_2', 'ps_ind_05_cat_5', 'ps_car_01_cat_1', 'ps_car_01_cat_2', 'ps_car_04_cat_3', 'ps_car_04_cat_4', 'ps_car_04_cat_5', 'ps_car_04_cat_6', 'ps_car_04_cat_7', 'ps_car_06_cat_2', 'ps_car_06_cat_5', 'ps_car_06_cat_8', 'ps_car_06_cat_12', 'ps_car_06_cat_16', 'ps_car_06_cat_17', 'ps_car_09_cat_4', 'ps_car_10_cat_1', 'ps_car_10_cat_2', 'ps_car_12^2', 'ps_car_12 ps_car_14', 'ps_car_14^2']

이유한 님 팁

  • 1000개의 피쳐가 있을 때 피쳐를 20개씩 뺐다 넣다 하면서 성능을 보자!!
  • 20개 모델 baseline + random choosing 20
  • 40 모델 학습
  • if 성능 향상 -> feature importance 상위 10%에 새로 추가된거 생기면 향상된걸 남기고
  • 안생기면 계속 random choosing
  • 반복
# Rank every feature by random-forest importance and print the ranking.
X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']

feat_labels = X_train.columns

rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)

rf.fit(X_train, y_train)
importances = rf.feature_importances_

# Indices that sort the features by importance, descending
# (reuse the importances array instead of re-reading the attribute).
indices = np.argsort(importances)[::-1]

# Print "rank) feature_name importance", one feature per line.
# enumerate replaces range(shape[1]) + double indexing, and `rank`
# avoids shadowing any earlier single-letter names.
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))


 1) ps_car_11_cat_te               0.021144
 2) ps_car_12 ps_car_13            0.017390
 3) ps_car_13                      0.017361
 4) ps_car_13^2                    0.017307
 5) ps_reg_03 ps_car_13            0.017075
 6) ps_car_13 ps_car_14            0.017067
 7) ps_car_13 ps_car_15            0.016823
 8) ps_reg_01 ps_car_13            0.016773
 9) ps_reg_03 ps_car_14            0.016233
10) ps_reg_03 ps_car_12            0.015462
11) ps_reg_03 ps_car_15            0.015181
12) ps_car_14 ps_car_15            0.015056
13) ps_car_13 ps_calc_02           0.014764
14) ps_car_13 ps_calc_01           0.014746
15) ps_reg_02 ps_car_13            0.014710
16) ps_car_13 ps_calc_03           0.014702
17) ps_reg_01 ps_reg_03            0.014665
18) ps_reg_01 ps_car_14            0.014378
19) ps_reg_03^2                    0.014235
20) ps_reg_03                      0.014196
21) ps_reg_03 ps_calc_03           0.013807
22) ps_reg_03 ps_calc_02           0.013738
23) ps_reg_03 ps_calc_01           0.013705
24) ps_car_14 ps_calc_02           0.013652
25) ps_calc_10                     0.013646
26) ps_car_14 ps_calc_03           0.013537
27) ps_car_14 ps_calc_01           0.013527
28) ps_calc_14                     0.013388
29) ps_car_12 ps_car_14            0.012970
30) ps_ind_03                      0.012921
31) ps_car_14                      0.012741
32) ps_car_14^2                    0.012730
33) ps_reg_02 ps_car_14            0.012697
34) ps_calc_11                     0.012619
35) ps_reg_02 ps_reg_03            0.012489
36) ps_ind_15                      0.012116
37) ps_car_12 ps_car_15            0.010944
38) ps_car_15 ps_calc_01           0.010857
39) ps_car_15 ps_calc_03           0.010839
40) ps_car_15 ps_calc_02           0.010837
41) ps_car_12 ps_calc_01           0.010477
42) ps_calc_13                     0.010477
43) ps_car_12 ps_calc_03           0.010310
44) ps_car_12 ps_calc_02           0.010296
45) ps_reg_02 ps_car_15            0.010205
46) ps_reg_01 ps_car_15            0.010177
47) ps_calc_02 ps_calc_03          0.010077
48) ps_calc_01 ps_calc_02          0.010013
49) ps_calc_01 ps_calc_03          0.010005
50) ps_calc_08                     0.009867
51) ps_calc_07                     0.009857
52) ps_reg_01 ps_car_12            0.009473
53) ps_reg_02 ps_car_12            0.009319
54) ps_reg_02 ps_calc_01           0.009294
55) ps_reg_02 ps_calc_03           0.009237
56) ps_reg_02 ps_calc_02           0.009146
57) ps_calc_06                     0.009092
58) ps_reg_01 ps_calc_02           0.009054
59) ps_reg_01 ps_calc_03           0.009041
60) ps_reg_01 ps_calc_01           0.009020
61) ps_calc_09                     0.008794
62) ps_ind_01                      0.008606
63) ps_calc_05                     0.008298
64) ps_calc_04                     0.008168
65) ps_calc_12                     0.008015
66) ps_reg_01 ps_reg_02            0.008015
67) ps_car_15                      0.006130
68) ps_car_15^2                    0.006130
69) ps_calc_03                     0.006001
70) ps_calc_01^2                   0.005975
71) ps_calc_01                     0.005964
72) ps_calc_03^2                   0.005964
73) ps_calc_02                     0.005950
74) ps_calc_02^2                   0.005943
75) ps_car_12                      0.005358
76) ps_car_12^2                    0.005348
77) ps_reg_02^2                    0.004993
78) ps_reg_02                      0.004986
79) ps_reg_01^2                    0.004140
80) ps_reg_01                      0.004118
81) ps_car_11                      0.003796
82) ps_ind_05_cat_0                0.003564
83) ps_ind_17_bin                  0.002840
84) ps_calc_17_bin                 0.002701
85) ps_calc_16_bin                 0.002597
86) ps_calc_19_bin                 0.002554
87) ps_calc_18_bin                 0.002529
88) ps_ind_04_cat_1                0.002405
89) ps_car_01_cat_11               0.002399
90) ps_ind_16_bin                  0.002393
91) ps_ind_04_cat_0                0.002378
92) ps_ind_07_bin                  0.002333
93) ps_car_09_cat_2                0.002313
94) ps_ind_02_cat_1                0.002269
95) ps_car_09_cat_0                0.002100
96) ps_car_01_cat_7                0.002089
97) ps_ind_02_cat_2                0.002078
98) ps_calc_20_bin                 0.002072
99) ps_ind_06_bin                  0.002041
100) ps_car_06_cat_1                0.002002
101) ps_calc_15_bin                 0.001996
102) ps_car_07_cat_1                0.001966
103) ps_ind_08_bin                  0.001946
104) ps_car_09_cat_1                0.001828
105) ps_car_06_cat_11               0.001787
106) ps_ind_18_bin                  0.001739
107) ps_ind_09_bin                  0.001719
108) ps_car_01_cat_10               0.001598
109) ps_car_01_cat_9                0.001577
110) ps_car_01_cat_6                0.001549
111) ps_car_06_cat_14               0.001547
112) ps_car_01_cat_4                0.001530
113) ps_ind_05_cat_6                0.001501
114) ps_ind_02_cat_3                0.001432
115) ps_car_07_cat_0                0.001369
116) ps_car_02_cat_1                0.001337
117) ps_car_01_cat_8                0.001330
118) ps_car_08_cat_1                0.001327
119) ps_car_02_cat_0                0.001313
120) ps_car_06_cat_4                0.001225
121) ps_ind_05_cat_4                0.001216
122) ps_ind_02_cat_4                0.001156
123) ps_car_01_cat_5                0.001143
124) ps_car_06_cat_6                0.001095
125) ps_car_06_cat_10               0.001055
126) ps_car_04_cat_1                0.001036
127) ps_ind_05_cat_2                0.001030
128) ps_car_06_cat_7                0.001003
129) ps_car_04_cat_2                0.000980
130) ps_car_01_cat_3                0.000885
131) ps_car_09_cat_3                0.000883
132) ps_ind_14                      0.000862
133) ps_car_01_cat_0                0.000854
134) ps_car_06_cat_15               0.000831
135) ps_car_06_cat_9                0.000785
136) ps_ind_05_cat_1                0.000755
137) ps_car_10_cat_1                0.000704
138) ps_car_06_cat_3                0.000698
139) ps_ind_05_cat_3                0.000685
140) ps_ind_12_bin                  0.000671
141) ps_car_09_cat_4                0.000631
142) ps_car_01_cat_2                0.000569
143) ps_car_04_cat_8                0.000557
144) ps_car_06_cat_17               0.000513
145) ps_car_06_cat_16               0.000454
146) ps_car_04_cat_9                0.000443
147) ps_car_06_cat_12               0.000420
148) ps_car_06_cat_13               0.000396
149) ps_car_01_cat_1                0.000381
150) ps_ind_05_cat_5                0.000307
151) ps_car_06_cat_5                0.000284
152) ps_ind_11_bin                  0.000217
153) ps_car_04_cat_6                0.000193
154) ps_ind_13_bin                  0.000150
155) ps_car_04_cat_3                0.000141
156) ps_car_06_cat_2                0.000137
157) ps_car_04_cat_5                0.000100
158) ps_car_06_cat_8                0.000093
159) ps_car_04_cat_7                0.000083
160) ps_ind_10_bin                  0.000074
161) ps_car_10_cat_2                0.000058
162) ps_car_04_cat_4                0.000042
# Keep only the features whose importance is >= the median importance
# (i.e. the top half of the ranking above). prefit=True reuses the
# already-fitted forest instead of refitting.
sfm = SelectFromModel(rf, threshold='median', prefit=True)
print('Number of features before selection: {}'.format(X_train.shape[1]))
# Count the surviving features from the boolean support mask instead of
# materializing the whole transformed matrix just to read its shape.
n_features = sfm.get_support().sum()
print('Number of features after selection: {}'.format(n_features))
selected_vars = list(feat_labels[sfm.get_support()])
Number of features before selection: 162
Number of features after selection: 81
# Keep only the selected features (plus the target column).
train = train[selected_vars + ['target']]
# Standardize features to zero mean / unit variance.
# NOTE(review): the returned scaled array is not assigned anywhere — in
# the notebook this cell only displays it, and downstream code still sees
# the unscaled train. Confirm that is intentional.
scaler = StandardScaler()
scaler.fit_transform(train.drop(['target'], axis=1))
array([[-0.45941104, -1.26665356,  1.05087653, ..., -0.72553616,
        -1.01071913, -1.06173767],
       [ 1.55538958,  0.95034274, -0.63847299, ..., -1.06120876,
        -1.01071913,  0.27907892],
       [ 1.05168943, -0.52765479, -0.92003125, ...,  1.95984463,
        -0.56215309, -1.02449277],
       ...,
       [-0.9631112 ,  0.58084336,  0.48776003, ..., -0.46445747,
         0.18545696,  0.27907892],
       [-0.9631112 , -0.89715418, -1.48314775, ..., -0.91202093,
        -0.41263108,  0.27907892],
       [-0.45941104, -1.26665356,  1.61399304, ...,  0.28148164,
        -0.11358706, -0.72653353]])

Comments