Commit ·
2bff51a
1
Parent(s): 6e3f415
update housing
Browse files- main_text.py +30 -6
main_text.py
CHANGED
|
@@ -1,8 +1,32 @@
|
|
| 1 |
-
|
| 2 |
-
from
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.model_selection import train_test_split
|
| 3 |
|
| 4 |
+
# Read the training and test data, using the 'Id' column as the row index
|
| 5 |
+
X_full = pd.read_csv('../input/train.csv', index_col='Id')
|
| 6 |
+
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')
|
| 7 |
|
| 8 |
+
# Drop rows whose target (SalePrice) is missing, then separate the target from the predictors
|
| 9 |
+
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
|
| 10 |
+
y = X_full.SalePrice
|
| 11 |
+
X_full.drop(['SalePrice'], axis=1, inplace=True)
|
| 12 |
+
|
| 13 |
+
# Hold out 20% of the training data as a validation set (random_state=0 fixes the split)
|
| 14 |
+
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y,
|
| 15 |
+
train_size=0.8, test_size=0.2,
|
| 16 |
+
random_state=0)
|
| 17 |
+
|
| 18 |
+
# "Cardinality" means the number of unique values in a column
|
| 19 |
+
# Select categorical (object-dtype) columns with fewer than 10 unique values (the cutoff is convenient but arbitrary)
|
| 20 |
+
categorical_cols = [cname for cname in X_train_full.columns if
|
| 21 |
+
X_train_full[cname].nunique() < 10 and
|
| 22 |
+
X_train_full[cname].dtype == "object"]
|
| 23 |
+
|
| 24 |
+
# Select numerical columns (int64 or float64 dtype)
|
| 25 |
+
numerical_cols = [cname for cname in X_train_full.columns if
|
| 26 |
+
X_train_full[cname].dtype in ['int64', 'float64']]
|
| 27 |
+
|
| 28 |
+
# Keep only the selected columns, copying so each subset is its own DataFrame
|
| 29 |
+
my_cols = categorical_cols + numerical_cols
|
| 30 |
+
X_train = X_train_full[my_cols].copy()
|
| 31 |
+
X_valid = X_valid_full[my_cols].copy()
|
| 32 |
+
X_test = X_test_full[my_cols].copy()
|