Kaggle 분석
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show up to 500 rows and 100 columns when a DataFrame is displayed.
# The full option keys are spelled out: abbreviated keys such as 'display.max_row'
# only resolve through pattern matching and break if another option ever matches too.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
import warnings
# Suppress every warning for the rest of the session (library deprecation and
# pandas assignment warnings included).
warnings.filterwarnings("ignore")
# Load train.csv from a machine-specific absolute path.
df = pd.read_csv('C:/Users/landg/Downloads/Python_Study_GM/캐글/train.csv')
# Preview the first rows.
df.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
# Dimensions of the loaded data as (rows, columns).
df.shape
(1460, 81)
# List every column label.
df.columns
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition', 'SalePrice'],
dtype='object')
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1379.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | 46.549315 | 567.240411 | 1057.429452 | 1162.626712 | 346.992466 | 5.844521 | 1515.463699 | 0.425342 | 0.057534 | 1.565068 | 0.382877 | 2.866438 | 1.046575 | 6.517808 | 0.613014 | 1978.506164 | 1.767123 | 472.980137 | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | 161.319273 | 441.866955 | 438.705324 | 386.587738 | 436.528436 | 48.623081 | 525.480383 | 0.518911 | 0.238753 | 0.550916 | 0.502885 | 0.815778 | 0.220338 | 1.625393 | 0.644666 | 24.689725 | 0.747315 | 213.804841 | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1900.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
25% | 365.750000 | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | 0.000000 | 223.000000 | 795.750000 | 882.000000 | 0.000000 | 0.000000 | 1129.500000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 5.000000 | 0.000000 | 1961.000000 | 1.000000 | 334.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | 0.000000 | 477.500000 | 991.500000 | 1087.000000 | 0.000000 | 0.000000 | 1464.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.000000 | 1980.000000 | 2.000000 | 480.000000 | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
75% | 1095.250000 | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | 0.000000 | 808.000000 | 1298.250000 | 1391.250000 | 728.000000 | 0.000000 | 1776.750000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 7.000000 | 1.000000 | 2002.000000 | 2.000000 | 576.000000 | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | 1474.000000 | 2336.000000 | 6110.000000 | 4692.000000 | 2065.000000 | 572.000000 | 5642.000000 | 3.000000 | 2.000000 | 3.000000 | 2.000000 | 8.000000 | 3.000000 | 14.000000 | 3.000000 | 2010.000000 | 4.000000 | 1418.000000 | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
# Per-column non-null counts and dtypes, plus total memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1460 non-null int64
1 MSSubClass 1460 non-null int64
2 MSZoning 1460 non-null object
3 LotFrontage 1201 non-null float64
4 LotArea 1460 non-null int64
5 Street 1460 non-null object
6 Alley 91 non-null object
7 LotShape 1460 non-null object
8 LandContour 1460 non-null object
9 Utilities 1460 non-null object
10 LotConfig 1460 non-null object
11 LandSlope 1460 non-null object
12 Neighborhood 1460 non-null object
13 Condition1 1460 non-null object
14 Condition2 1460 non-null object
15 BldgType 1460 non-null object
16 HouseStyle 1460 non-null object
17 OverallQual 1460 non-null int64
18 OverallCond 1460 non-null int64
19 YearBuilt 1460 non-null int64
20 YearRemodAdd 1460 non-null int64
21 RoofStyle 1460 non-null object
22 RoofMatl 1460 non-null object
23 Exterior1st 1460 non-null object
24 Exterior2nd 1460 non-null object
25 MasVnrType 1452 non-null object
26 MasVnrArea 1452 non-null float64
27 ExterQual 1460 non-null object
28 ExterCond 1460 non-null object
29 Foundation 1460 non-null object
30 BsmtQual 1423 non-null object
31 BsmtCond 1423 non-null object
32 BsmtExposure 1422 non-null object
33 BsmtFinType1 1423 non-null object
34 BsmtFinSF1 1460 non-null int64
35 BsmtFinType2 1422 non-null object
36 BsmtFinSF2 1460 non-null int64
37 BsmtUnfSF 1460 non-null int64
38 TotalBsmtSF 1460 non-null int64
39 Heating 1460 non-null object
40 HeatingQC 1460 non-null object
41 CentralAir 1460 non-null object
42 Electrical 1459 non-null object
43 1stFlrSF 1460 non-null int64
44 2ndFlrSF 1460 non-null int64
45 LowQualFinSF 1460 non-null int64
46 GrLivArea 1460 non-null int64
47 BsmtFullBath 1460 non-null int64
48 BsmtHalfBath 1460 non-null int64
49 FullBath 1460 non-null int64
50 HalfBath 1460 non-null int64
51 BedroomAbvGr 1460 non-null int64
52 KitchenAbvGr 1460 non-null int64
53 KitchenQual 1460 non-null object
54 TotRmsAbvGrd 1460 non-null int64
55 Functional 1460 non-null object
56 Fireplaces 1460 non-null int64
57 FireplaceQu 770 non-null object
58 GarageType 1379 non-null object
59 GarageYrBlt 1379 non-null float64
60 GarageFinish 1379 non-null object
61 GarageCars 1460 non-null int64
62 GarageArea 1460 non-null int64
63 GarageQual 1379 non-null object
64 GarageCond 1379 non-null object
65 PavedDrive 1460 non-null object
66 WoodDeckSF 1460 non-null int64
67 OpenPorchSF 1460 non-null int64
68 EnclosedPorch 1460 non-null int64
69 3SsnPorch 1460 non-null int64
70 ScreenPorch 1460 non-null int64
71 PoolArea 1460 non-null int64
72 PoolQC 7 non-null object
73 Fence 281 non-null object
74 MiscFeature 54 non-null object
75 MiscVal 1460 non-null int64
76 MoSold 1460 non-null int64
77 YrSold 1460 non-null int64
78 SaleType 1460 non-null object
79 SaleCondition 1460 non-null object
80 SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
# Distribution of the target as a density-normalised histogram with a KDE curve.
# `sns.distplot` is deprecated; this `histplot` call is its documented replacement.
sns.histplot(df['SalePrice'], kde=True, stat='density', kde_kws=dict(cut=3));
# Same distribution with raw counts on the y-axis.
sns.histplot(df['SalePrice'], kde=True);
# Histogram of the MSSubClass codes.
sns.histplot(x='MSSubClass', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x2553e0822c8>
# Bar chart of how often each Fence category occurs. The column is passed by
# keyword (x=..., data=...) because seaborn no longer accepts the plotted vector
# as a bare positional argument; this also matches the histplot call above.
sns.countplot(x="Fence", data=df);
# Per-column non-null counts and dtypes (same inspection as earlier in the notebook).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1460 non-null int64
1 MSSubClass 1460 non-null int64
2 MSZoning 1460 non-null object
3 LotFrontage 1201 non-null float64
4 LotArea 1460 non-null int64
5 Street 1460 non-null object
6 Alley 91 non-null object
7 LotShape 1460 non-null object
8 LandContour 1460 non-null object
9 Utilities 1460 non-null object
10 LotConfig 1460 non-null object
11 LandSlope 1460 non-null object
12 Neighborhood 1460 non-null object
13 Condition1 1460 non-null object
14 Condition2 1460 non-null object
15 BldgType 1460 non-null object
16 HouseStyle 1460 non-null object
17 OverallQual 1460 non-null int64
18 OverallCond 1460 non-null int64
19 YearBuilt 1460 non-null int64
20 YearRemodAdd 1460 non-null int64
21 RoofStyle 1460 non-null object
22 RoofMatl 1460 non-null object
23 Exterior1st 1460 non-null object
24 Exterior2nd 1460 non-null object
25 MasVnrType 1452 non-null object
26 MasVnrArea 1452 non-null float64
27 ExterQual 1460 non-null object
28 ExterCond 1460 non-null object
29 Foundation 1460 non-null object
30 BsmtQual 1423 non-null object
31 BsmtCond 1423 non-null object
32 BsmtExposure 1422 non-null object
33 BsmtFinType1 1423 non-null object
34 BsmtFinSF1 1460 non-null int64
35 BsmtFinType2 1422 non-null object
36 BsmtFinSF2 1460 non-null int64
37 BsmtUnfSF 1460 non-null int64
38 TotalBsmtSF 1460 non-null int64
39 Heating 1460 non-null object
40 HeatingQC 1460 non-null object
41 CentralAir 1460 non-null object
42 Electrical 1459 non-null object
43 1stFlrSF 1460 non-null int64
44 2ndFlrSF 1460 non-null int64
45 LowQualFinSF 1460 non-null int64
46 GrLivArea 1460 non-null int64
47 BsmtFullBath 1460 non-null int64
48 BsmtHalfBath 1460 non-null int64
49 FullBath 1460 non-null int64
50 HalfBath 1460 non-null int64
51 BedroomAbvGr 1460 non-null int64
52 KitchenAbvGr 1460 non-null int64
53 KitchenQual 1460 non-null object
54 TotRmsAbvGrd 1460 non-null int64
55 Functional 1460 non-null object
56 Fireplaces 1460 non-null int64
57 FireplaceQu 770 non-null object
58 GarageType 1379 non-null object
59 GarageYrBlt 1379 non-null float64
60 GarageFinish 1379 non-null object
61 GarageCars 1460 non-null int64
62 GarageArea 1460 non-null int64
63 GarageQual 1379 non-null object
64 GarageCond 1379 non-null object
65 PavedDrive 1460 non-null object
66 WoodDeckSF 1460 non-null int64
67 OpenPorchSF 1460 non-null int64
68 EnclosedPorch 1460 non-null int64
69 3SsnPorch 1460 non-null int64
70 ScreenPorch 1460 non-null int64
71 PoolArea 1460 non-null int64
72 PoolQC 7 non-null object
73 Fence 281 non-null object
74 MiscFeature 54 non-null object
75 MiscVal 1460 non-null int64
76 MoSold 1460 non-null int64
77 YrSold 1460 non-null int64
78 SaleType 1460 non-null object
79 SaleCondition 1460 non-null object
80 SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
# List every column label again before handling missing values.
df.columns
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition', 'SalePrice'],
dtype='object')
# Dimensions as (rows, columns).
df.shape
(1460, 81)
# Print each column name followed by its number of missing values.
for column, n_missing in df.isna().sum().items():
    print(column, n_missing)
Id 0
MSSubClass 0
MSZoning 0
LotFrontage 259
LotArea 0
Street 0
Alley 1369
LotShape 0
LandContour 0
Utilities 0
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 0
Exterior2nd 0
MasVnrType 8
MasVnrArea 8
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinSF1 0
BsmtFinType2 38
BsmtFinSF2 0
BsmtUnfSF 0
TotalBsmtSF 0
Heating 0
HeatingQC 0
CentralAir 0
Electrical 1
1stFlrSF 0
2ndFlrSF 0
LowQualFinSF 0
GrLivArea 0
BsmtFullBath 0
BsmtHalfBath 0
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 0
TotRmsAbvGrd 0
Functional 0
Fireplaces 0
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageCars 0
GarageArea 0
GarageQual 81
GarageCond 81
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 1453
Fence 1179
MiscFeature 1406
MiscVal 0
MoSold 0
YrSold 0
SaleType 0
SaleCondition 0
SalePrice 0
- 결측값 1차 제거: 결측값이 많은 컬럼 삭제
# Keep only the columns that have at least one missing value.
nulls = {
    column: n_missing
    for column, n_missing in df.isna().sum().items()
    if n_missing > 0
}
# Order by missing count, largest first, and rebuild a dict from the sorted pairs.
a = dict(sorted(nulls.items(), key=lambda item: item[1], reverse=True))
a
{'PoolQC': 1453,
'MiscFeature': 1406,
'Alley': 1369,
'Fence': 1179,
'FireplaceQu': 690,
'LotFrontage': 259,
'GarageType': 81,
'GarageYrBlt': 81,
'GarageFinish': 81,
'GarageQual': 81,
'GarageCond': 81,
'BsmtExposure': 38,
'BsmtFinType2': 38,
'BsmtQual': 37,
'BsmtCond': 37,
'BsmtFinType1': 37,
'MasVnrType': 8,
'MasVnrArea': 8,
'Electrical': 1}
# Alternative: sort by missing count (descending) and return only the keys as a list.
# b = sorted(nulls, key=lambda x: nulls[x], reverse=True)
# b
# Drop the six columns with the most missing values (259 or more of 1460 rows each).
sparse_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage']
df.drop(columns=sparse_columns, inplace=True)
# Categorical (object-dtype) columns that remain in df. The list is defined once
# so the selection and the drop below cannot drift apart.
cat_cols = [
    "MSZoning", 'Street', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 'SaleType', 'SaleCondition',
]
# Explicit copy: X_cat is modified later, so it must own its data instead of
# being a selection of df.
X_cat = df[cat_cols].copy()
# Remove the categorical columns so that only numeric columns are left in df.
df.drop(columns=cat_cols, inplace=True)
# NOTE: X_nums is the same object as df (not a copy), so the drop below also
# removes SalePrice and Id from df.
X_nums = df
# Target vector, taken before the column is dropped from the features.
y = X_nums['SalePrice']
X_nums.drop(columns=['SalePrice', 'Id'], inplace=True)
X_nums.head()
MSSubClass | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60 | 8450 | 7 | 5 | 2003 | 2003 | 196.0 | 706 | 0 | 150 | 856 | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | 8 | 0 | 2003.0 | 2 | 548 | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 2008 |
1 | 20 | 9600 | 6 | 8 | 1976 | 1976 | 0.0 | 978 | 0 | 284 | 1262 | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | 6 | 1 | 1976.0 | 2 | 460 | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 |
2 | 60 | 11250 | 7 | 5 | 2001 | 2002 | 162.0 | 486 | 0 | 434 | 920 | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | 6 | 1 | 2001.0 | 2 | 608 | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 2008 |
3 | 70 | 9550 | 7 | 5 | 1915 | 1970 | 0.0 | 216 | 0 | 540 | 756 | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | 7 | 1 | 1998.0 | 3 | 642 | 0 | 35 | 272 | 0 | 0 | 0 | 0 | 2 | 2006 |
4 | 60 | 14260 | 8 | 5 | 2000 | 2000 | 350.0 | 655 | 0 | 490 | 1145 | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | 9 | 1 | 2000.0 | 3 | 836 | 192 | 84 | 0 | 0 | 0 | 0 | 0 | 12 | 2008 |
# Frequency of each LotShape category.
X_cat['LotShape'].value_counts()
Reg 925
IR1 484
IR2 41
IR3 10
Name: LotShape, dtype: int64
# Display the categorical feature frame.
X_cat
MSZoning | Street | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinType2 | Heating | HeatingQC | CentralAir | Electrical | KitchenQual | Functional | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | BrkFace | Gd | TA | PConc | Gd | TA | No | GLQ | Unf | GasA | Ex | Y | SBrkr | Gd | Typ | Attchd | RFn | TA | TA | Y | WD | Normal |
1 | RL | Pave | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | Gable | CompShg | MetalSd | MetalSd | None | TA | TA | CBlock | Gd | TA | Gd | ALQ | Unf | GasA | Ex | Y | SBrkr | TA | Typ | Attchd | RFn | TA | TA | Y | WD | Normal |
2 | RL | Pave | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | BrkFace | Gd | TA | PConc | Gd | TA | Mn | GLQ | Unf | GasA | Ex | Y | SBrkr | Gd | Typ | Attchd | RFn | TA | TA | Y | WD | Normal |
3 | RL | Pave | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | Gable | CompShg | Wd Sdng | Wd Shng | None | TA | TA | BrkTil | TA | Gd | No | ALQ | Unf | GasA | Gd | Y | SBrkr | Gd | Typ | Detchd | Unf | TA | TA | Y | WD | Abnorml |
4 | RL | Pave | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | BrkFace | Gd | TA | PConc | Gd | TA | Av | GLQ | Unf | GasA | Ex | Y | SBrkr | Gd | Typ | Attchd | RFn | TA | TA | Y | WD | Normal |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1455 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | None | TA | TA | PConc | Gd | TA | No | Unf | Unf | GasA | Ex | Y | SBrkr | TA | Typ | Attchd | RFn | TA | TA | Y | WD | Normal |
1456 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | NWAmes | Norm | Norm | 1Fam | 1Story | Gable | CompShg | Plywood | Plywood | Stone | TA | TA | CBlock | Gd | TA | No | ALQ | Rec | GasA | TA | Y | SBrkr | TA | Min1 | Attchd | Unf | TA | TA | Y | WD | Normal |
1457 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | Gable | CompShg | CemntBd | CmentBd | None | Ex | Gd | Stone | TA | Gd | No | GLQ | Unf | GasA | Ex | Y | SBrkr | Gd | Typ | Attchd | RFn | TA | TA | Y | WD | Normal |
1458 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | Hip | CompShg | MetalSd | MetalSd | None | TA | TA | CBlock | TA | TA | Mn | GLQ | Rec | GasA | Gd | Y | FuseA | Gd | Typ | Attchd | Unf | TA | TA | Y | WD | Normal |
1459 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | Edwards | Norm | Norm | 1Fam | 1Story | Gable | CompShg | HdBoard | HdBoard | None | Gd | TA | CBlock | TA | TA | No | BLQ | LwQ | GasA | Gd | Y | SBrkr | TA | Typ | Attchd | Fin | TA | TA | Y | WD | Normal |
1460 rows × 38 columns
# Print how many distinct non-missing values each categorical column holds.
for column in X_cat.columns:
    print(column, X_cat[column].nunique())
MSZoning 5
Street 2
LotShape 4
LandContour 4
Utilities 2
LotConfig 5
LandSlope 3
Neighborhood 25
Condition1 9
Condition2 8
BldgType 5
HouseStyle 8
RoofStyle 6
RoofMatl 8
Exterior1st 15
Exterior2nd 16
MasVnrType 4
ExterQual 4
ExterCond 5
Foundation 6
BsmtQual 4
BsmtCond 4
BsmtExposure 4
BsmtFinType1 6
BsmtFinType2 6
Heating 6
HeatingQC 5
CentralAir 2
Electrical 5
KitchenQual 4
Functional 7
GarageType 6
GarageFinish 3
GarageQual 5
GarageCond 5
PavedDrive 3
SaleType 9
SaleCondition 6
# Missing-value counts for the categorical columns that have any gaps.
nulls2 = {
    column: n_missing
    for column, n_missing in X_cat.isna().sum().items()
    if n_missing > 0
}
# Largest count first.
cat_nulls = dict(sorted(nulls2.items(), key=lambda item: item[1], reverse=True))
cat_nulls
{'GarageType': 81,
'GarageFinish': 81,
'GarageQual': 81,
'GarageCond': 81,
'BsmtExposure': 38,
'BsmtFinType2': 38,
'BsmtQual': 37,
'BsmtCond': 37,
'BsmtFinType1': 37,
'MasVnrType': 8,
'Electrical': 1}
# Missing-value counts for the numeric columns that have any gaps.
nulls3 = {
    column: n_missing
    for column, n_missing in X_nums.isna().sum().items()
    if n_missing > 0
}
# List of (column, count) pairs, largest count first.
nums_nulls = sorted(nulls3.items(), key=lambda item: item[1], reverse=True)
nums_nulls
[('GarageYrBlt', 81), ('MasVnrArea', 8)]
- 결측값 채우기: 범주형은 최빈값, 수치형은 MasVnrArea·GarageYrBlt를 0으로 채운 뒤 남은 결측값은 평균으로 채우기
# Frequency of each GarageType category (missing entries are not counted).
X_cat['GarageType'].value_counts()
Attchd 870
Detchd 387
BuiltIn 88
Basment 19
CarPort 9
2Types 6
Name: GarageType, dtype: int64
# Categorical gaps: fill each column with its most frequent value.
# The result is assigned back to X_cat. Calling fillna(..., inplace=True) on a
# selected column (X_cat[i]) may act on a temporary object and leave X_cat unchanged.
X_cat = X_cat.fillna(X_cat.mode().iloc[0])
# Numeric gaps: a missing veneer area is filled with 0, then stored as int64.
X_nums['MasVnrArea'] = X_nums['MasVnrArea'].fillna(0).astype('int64')
# NOTE(review): 0 is used as the placeholder for a missing garage year. Real values
# start at 1900, so the placeholder is a strong outlier once the column is scaled;
# confirm this is intended.
X_nums['GarageYrBlt'] = X_nums['GarageYrBlt'].fillna(0).astype('int64')
X_nums.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 35 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MSSubClass 1460 non-null int64
1 LotArea 1460 non-null int64
2 OverallQual 1460 non-null int64
3 OverallCond 1460 non-null int64
4 YearBuilt 1460 non-null int64
5 YearRemodAdd 1460 non-null int64
6 MasVnrArea 1460 non-null int64
7 BsmtFinSF1 1460 non-null int64
8 BsmtFinSF2 1460 non-null int64
9 BsmtUnfSF 1460 non-null int64
10 TotalBsmtSF 1460 non-null int64
11 1stFlrSF 1460 non-null int64
12 2ndFlrSF 1460 non-null int64
13 LowQualFinSF 1460 non-null int64
14 GrLivArea 1460 non-null int64
15 BsmtFullBath 1460 non-null int64
16 BsmtHalfBath 1460 non-null int64
17 FullBath 1460 non-null int64
18 HalfBath 1460 non-null int64
19 BedroomAbvGr 1460 non-null int64
20 KitchenAbvGr 1460 non-null int64
21 TotRmsAbvGrd 1460 non-null int64
22 Fireplaces 1460 non-null int64
23 GarageYrBlt 1460 non-null int64
24 GarageCars 1460 non-null int64
25 GarageArea 1460 non-null int64
26 WoodDeckSF 1460 non-null int64
27 OpenPorchSF 1460 non-null int64
28 EnclosedPorch 1460 non-null int64
29 3SsnPorch 1460 non-null int64
30 ScreenPorch 1460 non-null int64
31 PoolArea 1460 non-null int64
32 MiscVal 1460 non-null int64
33 MoSold 1460 non-null int64
34 YrSold 1460 non-null int64
dtypes: int64(35)
memory usage: 399.3 KB
# Fill any numeric gaps that are still present with the column mean.
# fillna is called on the frame itself: the previous per-column
# X_nums[i].fillna(..., inplace=True) form is chained assignment and is not
# guaranteed to write through to X_nums.
X_nums.fillna(X_nums.mean(), inplace=True)
- 범주형 변수를 정수로 변환 (label encoding)
from sklearn import preprocessing

# Replace every categorical column with integer codes, one encoder per column.
# The fitted encoders are kept so the same mapping can be applied to other data
# or reversed with inverse_transform.
label_encoders = {}
for feature in X_cat.columns:
    le = preprocessing.LabelEncoder()
    X_cat[feature] = le.fit_transform(X_cat[feature])
    label_encoders[feature] = le
# X_cat is already a DataFrame with the right index and columns, so it is not
# rebuilt through pd.DataFrame(...) here.
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features and keep the original row index and
# column labels so the result can be concatenated with the categorical frame.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed further down, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_nums),
    index=X_nums.index,
    columns=X_nums.columns,
)
X_scaled.shape
(1460, 35)
# Sanity check: (rows, columns) of the label-encoded categorical frame.
X_cat.shape
(1460, 38)
# List the column labels carried over onto the standardised numeric frame.
X_scaled.columns
Index(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
'MoSold', 'YrSold'],
dtype='object')
# Preview the first rows of the numeric features (imputed, not standardised).
X_nums.head()
MSSubClass | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60 | 8450 | 7 | 5 | 2003 | 2003 | 196 | 706 | 0 | 150 | 856 | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | 8 | 0 | 2003 | 2 | 548 | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 2008 |
1 | 20 | 9600 | 6 | 8 | 1976 | 1976 | 0 | 978 | 0 | 284 | 1262 | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | 6 | 1 | 1976 | 2 | 460 | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 |
2 | 60 | 11250 | 7 | 5 | 2001 | 2002 | 162 | 486 | 0 | 434 | 920 | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | 6 | 1 | 2001 | 2 | 608 | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 2008 |
3 | 70 | 9550 | 7 | 5 | 1915 | 1970 | 0 | 216 | 0 | 540 | 756 | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | 7 | 1 | 1998 | 3 | 642 | 0 | 35 | 272 | 0 | 0 | 0 | 0 | 2 | 2006 |
4 | 60 | 14260 | 8 | 5 | 2000 | 2000 | 350 | 655 | 0 | 490 | 1145 | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | 9 | 1 | 2000 | 3 | 836 | 192 | 84 | 0 | 0 | 0 | 0 | 0 | 12 | 2008 |
# Preview the first rows of the categorical features after label encoding.
X_cat.head()
MSZoning | Street | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinType2 | Heating | HeatingQC | CentralAir | Electrical | KitchenQual | Functional | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 1 | 3 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 1 | 1 | 12 | 13 | 1 | 2 | 4 | 2 | 2 | 3 | 3 | 2 | 5 | 1 | 0 | 1 | 4 | 2 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
1 | 3 | 1 | 3 | 3 | 0 | 2 | 0 | 24 | 1 | 2 | 0 | 2 | 1 | 1 | 8 | 8 | 2 | 3 | 4 | 1 | 2 | 3 | 1 | 0 | 5 | 1 | 0 | 1 | 4 | 3 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
2 | 3 | 1 | 0 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 1 | 1 | 12 | 13 | 1 | 2 | 4 | 2 | 2 | 3 | 2 | 2 | 5 | 1 | 0 | 1 | 4 | 2 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
3 | 3 | 1 | 0 | 3 | 0 | 0 | 0 | 6 | 2 | 2 | 0 | 5 | 1 | 1 | 13 | 15 | 2 | 3 | 4 | 0 | 3 | 1 | 3 | 0 | 5 | 1 | 2 | 1 | 4 | 2 | 6 | 5 | 2 | 4 | 4 | 2 | 8 | 0 |
4 | 3 | 1 | 0 | 3 | 0 | 2 | 0 | 15 | 2 | 2 | 0 | 5 | 1 | 1 | 12 | 13 | 1 | 2 | 4 | 2 | 2 | 3 | 0 | 2 | 5 | 1 | 0 | 1 | 4 | 2 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
# Assemble the final feature matrix: standardised numeric columns first,
# followed by the label-encoded categorical columns, aligned on the row index.
X = pd.concat(objs=[X_scaled, X_cat], axis="columns")
X
MSSubClass | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | MSZoning | Street | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinType2 | Heating | HeatingQC | CentralAir | Electrical | KitchenQual | Functional | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.073375 | -0.207142 | 0.651479 | -0.517200 | 1.050994 | 0.878668 | 0.514104 | 0.575425 | -0.288653 | -0.944591 | -0.459303 | -0.793434 | 1.161852 | -0.120242 | 0.370333 | 1.107810 | -0.241061 | 0.789741 | 1.227585 | 0.163779 | -0.211454 | 0.912210 | -0.951226 | 0.296026 | 0.311725 | 0.351000 | -0.752176 | 0.216503 | -0.359325 | -0.116339 | -0.270208 | -0.068692 | -0.087688 | -1.599111 | 0.138777 | 3 | 1 | 3 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 1 | 1 | 12 | 13 | 1 | 2 | 4 | 2 | 2 | 3 | 3 | 2 | 5 | 1 | 0 | 1 | 4 | 2 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
1 | -0.872563 | -0.091886 | -0.071836 | 2.179628 | 0.156734 | -0.429577 | -0.570750 | 1.171992 | -0.288653 | -0.641228 | 0.466465 | 0.257140 | -0.795163 | -0.120242 | -0.482512 | -0.819964 | 3.948809 | 0.789741 | -0.761621 | 0.163779 | -0.211454 | -0.318683 | 0.600495 | 0.236495 | 0.311725 | -0.060731 | 1.626195 | -0.704483 | -0.359325 | -0.116339 | -0.270208 | -0.068692 | -0.087688 | -0.489110 | -0.614439 | 3 | 1 | 3 | 3 | 0 | 2 | 0 | 24 | 1 | 2 | 0 | 2 | 1 | 1 | 8 | 8 | 2 | 3 | 4 | 1 | 2 | 3 | 1 | 0 | 5 | 1 | 0 | 1 | 4 | 3 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
2 | 0.073375 | 0.073480 | 0.651479 | -0.517200 | 0.984752 | 0.830215 | 0.325915 | 0.092907 | -0.288653 | -0.301643 | -0.313369 | -0.627826 | 1.189351 | -0.120242 | 0.515013 | 1.107810 | -0.241061 | 0.789741 | 1.227585 | 0.163779 | -0.211454 | -0.318683 | 0.600495 | 0.291616 | 0.311725 | 0.631726 | -0.752176 | -0.070361 | -0.359325 | -0.116339 | -0.270208 | -0.068692 | -0.087688 | 0.990891 | 0.138777 | 3 | 1 | 0 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 1 | 1 | 12 | 13 | 1 | 2 | 4 | 2 | 2 | 3 | 2 | 2 | 5 | 1 | 0 | 1 | 4 | 2 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
3 | 0.309859 | -0.096897 | 0.651479 | -0.517200 | -1.863632 | -0.720298 | -0.570750 | -0.499274 | -0.288653 | -0.061670 | -0.687324 | -0.521734 | 0.937276 | -0.120242 | 0.383659 | 1.107810 | -0.241061 | -1.026041 | -0.761621 | 0.163779 | -0.211454 | 0.296763 | 0.600495 | 0.285002 | 1.650307 | 0.790804 | -0.752176 | -0.176048 | 4.092524 | -0.116339 | -0.270208 | -0.068692 | -0.087688 | -1.599111 | -1.367655 | 3 | 1 | 0 | 3 | 0 | 0 | 0 | 6 | 2 | 2 | 0 | 5 | 1 | 1 | 13 | 15 | 2 | 3 | 4 | 0 | 3 | 1 | 3 | 0 | 5 | 1 | 2 | 1 | 4 | 2 | 6 | 5 | 2 | 4 | 4 | 2 | 8 | 0 |
4 | 0.073375 | 0.375148 | 1.374795 | -0.517200 | 0.951632 | 0.733308 | 1.366489 | 0.463568 | -0.288653 | -0.174865 | 0.199680 | -0.045611 | 1.617877 | -0.120242 | 1.299326 | 1.107810 | -0.241061 | 0.789741 | 1.227585 | 1.390023 | -0.211454 | 1.527656 | 0.600495 | 0.289412 | 1.650307 | 1.698485 | 0.780197 | 0.563760 | -0.359325 | -0.116339 | -0.270208 | -0.068692 | -0.087688 | 2.100892 | 0.138777 | 3 | 1 | 0 | 3 | 0 | 2 | 0 | 15 | 2 | 2 | 0 | 5 | 1 | 1 | 12 | 13 | 1 | 2 | 4 | 2 | 2 | 3 | 0 | 2 | 5 | 1 | 0 | 1 | 4 | 2 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1455 | 0.073375 | -0.260560 | -0.071836 | -0.517200 | 0.918511 | 0.733308 | -0.570750 | -0.973018 | -0.288653 | 0.873321 | -0.238122 | -0.542435 | 0.795198 | -0.120242 | 0.250402 | -0.819964 | -0.241061 | 0.789741 | 1.227585 | 0.163779 | -0.211454 | 0.296763 | 0.600495 | 0.287207 | 0.311725 | -0.060731 | -0.752176 | -0.100558 | -0.359325 | -0.116339 | -0.270208 | -0.068692 | -0.087688 | 0.620891 | -0.614439 | 3 | 1 | 3 | 3 | 0 | 4 | 0 | 8 | 2 | 2 | 0 | 5 | 1 | 1 | 12 | 13 | 2 | 3 | 4 | 2 | 2 | 3 | 3 | 5 | 5 | 1 | 0 | 1 | 4 | 3 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
1456 | -0.872563 | 0.266407 | -0.071836 | 0.381743 | 0.222975 | 0.151865 | 0.087911 | 0.759659 | 0.722112 | 0.049262 | 1.104925 | 2.355701 | -0.795163 | -0.120242 | 1.061367 | 1.107810 | -0.241061 | 0.789741 | -0.761621 | 0.163779 | -0.211454 | 0.296763 | 2.152216 | 0.240904 | 0.311725 | 0.126420 | 2.033231 | -0.704483 | -0.359325 | -0.116339 | -0.270208 | -0.068692 | -0.087688 | -1.599111 | 1.645210 | 3 | 1 | 3 | 3 | 0 | 4 | 0 | 14 | 2 | 2 | 0 | 2 | 1 | 1 | 9 | 10 | 3 | 3 | 4 | 1 | 2 | 3 | 3 | 0 | 4 | 1 | 4 | 1 | 4 | 3 | 2 | 1 | 2 | 4 | 4 | 2 | 8 | 4 |
1457 | 0.309859 | -0.147810 | 0.651479 | 3.078570 | -1.002492 | 1.024029 | -0.570750 | -0.369871 | -0.288653 | 0.701265 | 0.215641 | 0.065656 | 1.844744 | -0.120242 | 1.569647 | -0.819964 | -0.241061 | 0.789741 | -0.761621 | 1.390023 | -0.211454 | 1.527656 | 2.152216 | 0.159324 | -1.026858 | -1.033914 | -0.752176 | 0.201405 | -0.359325 | -0.116339 | -0.270208 | -0.068692 | 4.953112 | -0.489110 | 1.645210 | 3 | 1 | 3 | 3 | 0 | 4 | 0 | 6 | 2 | 2 | 0 | 5 | 1 | 1 | 5 | 5 | 2 | 0 | 2 | 4 | 3 | 1 | 3 | 2 | 5 | 1 | 0 | 1 | 4 | 2 | 6 | 1 | 1 | 4 | 4 | 2 | 8 | 4 |
1458 | -0.872563 | -0.080160 | -0.795151 | 0.381743 | -0.704406 | 0.539493 | -0.570750 | -0.865548 | 6.092188 | -1.284176 | 0.046905 | -0.218982 | -0.795163 | -0.120242 | -0.832788 | 1.107810 | -0.241061 | -1.026041 | -0.761621 | -1.062465 | -0.211454 | -0.934130 | -0.951226 | 0.179168 | -1.026858 | -1.090059 | 2.168910 | -0.704483 | 1.473789 | -0.116339 | -0.270208 | -0.068692 | -0.087688 | -0.859110 | 1.645210 | 3 | 1 | 3 | 3 | 0 | 4 | 0 | 12 | 2 | 2 | 0 | 2 | 3 | 1 | 8 | 8 | 2 | 3 | 4 | 1 | 3 | 3 | 2 | 2 | 4 | 1 | 2 | 1 | 0 | 2 | 6 | 1 | 2 | 4 | 4 | 2 | 8 | 4 |
1459 | -0.872563 | -0.058112 | -0.795151 | 0.381743 | -0.207594 | -0.962566 | -0.570750 | 0.847389 | 1.509640 | -0.976285 | 0.452784 | 0.241615 | -0.795163 | -0.120242 | -0.493934 | 1.107810 | -0.241061 | -1.026041 | 1.227585 | 0.163779 | -0.211454 | -0.318683 | -0.951226 | 0.212241 | -1.026858 | -0.921624 | 5.121921 | 0.322190 | -0.359325 | -0.116339 | -0.270208 | -0.068692 | -0.087688 | -0.119110 | 0.138777 | 3 | 1 | 3 | 3 | 0 | 4 | 0 | 7 | 2 | 2 | 0 | 2 | 1 | 1 | 6 | 6 | 2 | 2 | 4 | 1 | 3 | 3 | 3 | 1 | 3 | 1 | 2 | 1 | 4 | 3 | 6 | 1 | 0 | 4 | 4 | 2 | 8 | 4 |
1460 rows × 73 columns
# Display the target Series (SalePrice) before it is binarised.
y
0 208500
1 181500
2 223500
3 140000
4 250000
...
1455 175000
1456 210000
1457 266500
1458 142125
1459 147500
Name: SalePrice, Length: 1460, dtype: int64
# Turn the price target into a binary label: 1 when the price is at or above
# the most frequent sale price, 0 otherwise.
# Series.mode() takes no axis argument; the previous positional 0 landed in
# the `dropna` flag, so it is dropped here.
modes = y.mode()[0]

# Build the labels in one comparison instead of overwriting y in two passes.
# The two-pass version only worked while the threshold stayed above the 0
# written by the first pass, and it modified y in place.
# NOTE(review): if y was taken from the source frame, the in-place version may
# also have modified that frame - confirm nothing relies on that.
y = (y >= modes).astype('int64')
y.value_counts()
1 971
0 489
Name: SalePrice, dtype: int64
# Plot the class balance of the binary target. The vector is passed by
# keyword: current seaborn releases treat the first positional argument as
# `data`, not as the variable to count.
sns.countplot(x=y)
<matplotlib.axes._subplots.AxesSubplot at 0x2553e042ec8>
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
# Load the candidate models to train
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Display name -> estimator instance, each constructed with its library
# defaults. Key spellings are kept exactly as they were.
classifiers = {
    "Logisitic Regression": LogisticRegression(),
    "K Nearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "LightGBM Classifier": LGBMClassifier(),
}
from sklearn.model_selection import cross_val_score

# Fit every candidate on the training split and print its mean 5-fold
# cross-validation score on that same split.
for classifier in classifiers.values():
    # The explicit fit is what the predict() calls in the later cells rely on.
    classifier.fit(X_train, y_train)
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)
    # Scale to percent before rounding: round(mean, 2) * 100 can reintroduce
    # float noise (e.g. 0.57 * 100 == 56.99999999999999).
    mean_percent = float(round(cv_scores.mean() * 100))
    print(classifier.__class__.__name__, ':', mean_percent, '% accuracy')
LogisticRegression : 90.0 % accuracy
KNeighborsClassifier : 85.0 % accuracy
SVC : 89.0 % accuracy
DecisionTreeClassifier : 83.0 % accuracy
RandomForestClassifier : 91.0 % accuracy
GradientBoostingClassifier : 90.0 % accuracy
LGBMClassifier : 89.0 % accuracy
# Per-model classification report on the held-out test split
from sklearn.metrics import classification_report

for classifier in classifiers.values():
    # Each estimator was already fitted on the training split above.
    y_pred = classifier.predict(X_test)
    report = classification_report(y_true=y_test, y_pred=y_pred)
    print(type(classifier).__name__, '-------', '\n', report)
LogisticRegression -------
precision recall f1-score support
0 0.86 0.91 0.89 158
1 0.95 0.92 0.93 280
accuracy 0.92 438
macro avg 0.91 0.91 0.91 438
weighted avg 0.92 0.92 0.92 438
KNeighborsClassifier -------
precision recall f1-score support
0 0.77 0.86 0.81 158
1 0.92 0.86 0.89 280
accuracy 0.86 438
macro avg 0.84 0.86 0.85 438
weighted avg 0.86 0.86 0.86 438
SVC -------
precision recall f1-score support
0 0.86 0.89 0.87 158
1 0.93 0.92 0.93 280
accuracy 0.91 438
macro avg 0.90 0.90 0.90 438
weighted avg 0.91 0.91 0.91 438
DecisionTreeClassifier -------
precision recall f1-score support
0 0.83 0.82 0.82 158
1 0.90 0.91 0.90 280
accuracy 0.87 438
macro avg 0.86 0.86 0.86 438
weighted avg 0.87 0.87 0.87 438
RandomForestClassifier -------
precision recall f1-score support
0 0.90 0.90 0.90 158
1 0.94 0.94 0.94 280
accuracy 0.93 438
macro avg 0.92 0.92 0.92 438
weighted avg 0.93 0.93 0.93 438
GradientBoostingClassifier -------
precision recall f1-score support
0 0.88 0.87 0.88 158
1 0.93 0.93 0.93 280
accuracy 0.91 438
macro avg 0.90 0.90 0.90 438
weighted avg 0.91 0.91 0.91 438
LGBMClassifier -------
precision recall f1-score support
0 0.88 0.90 0.89 158
1 0.94 0.93 0.94 280
accuracy 0.92 438
macro avg 0.91 0.92 0.91 438
weighted avg 0.92 0.92 0.92 438
# Per-model confusion matrix on the held-out test split
from sklearn.metrics import confusion_matrix

for classifier in classifiers.values():
    # First argument is the true labels, second the predictions.
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print(type(classifier).__name__, '\n', cm, '\n')
LogisticRegression
[[144 14]
[ 23 257]]
KNeighborsClassifier
[[136 22]
[ 40 240]]
SVC
[[140 18]
[ 22 258]]
DecisionTreeClassifier
[[129 29]
[ 26 254]]
RandomForestClassifier
[[142 16]
[ 16 264]]
GradientBoostingClassifier
[[138 20]
[ 19 261]]
LGBMClassifier
[[142 16]
[ 19 261]]
Comments