Commit ·
eb4e2e2
1
Parent(s): 2423931
initial commit
Browse files- preprocessing.py +38 -0
preprocessing.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sklearn.compose import ColumnTransformer
|
| 2 |
+
from sklearn.pipeline import Pipeline
|
| 3 |
+
from sklearn.impute import SimpleImputer
|
| 4 |
+
from sklearn.preprocessing import OneHotEncoder
|
| 5 |
+
from sklearn.ensemble import RandomForestRegressor
|
| 6 |
+
from sklearn.metrics import mean_absolute_error
|
| 7 |
+
|
| 8 |
+
# Numeric columns: replace missing entries with a constant.
# No fill_value is passed, so the imputer's default applies (scikit-learn
# documents this as 0 for numerical data).
numerical_transformer = SimpleImputer(
    strategy="constant",
)
|
| 10 |
+
|
| 11 |
+
# Categorical columns, handled in two stages:
#   1. "imputer" - fill missing entries with the column's most frequent value;
#   2. "onehot"  - one-hot encode, with handle_unknown="ignore" so categories
#                  not seen during fit do not raise at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
|
| 16 |
+
|
| 17 |
+
# Route each group of columns to its own transformer.
# NOTE(review): numerical_cols and categorical_cols are not defined in the
# visible code; they must be in scope before this statement runs.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
)
|
| 23 |
+
|
| 24 |
+
# Estimator: a random forest regressor with 100 trees; random_state is pinned
# so repeated runs use the same seed.
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)
|
| 26 |
+
|
| 27 |
+
# Chain preprocessing and the estimator into a single pipeline so the same
# transformations are applied on both fit and predict.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)
|
| 31 |
+
|
| 32 |
+
# Fit the whole pipeline (preprocessing + model) on the training split.
# NOTE(review): X_train, y_train, X_valid and y_valid are not defined in the
# visible code; they must be in scope before this point.
clf.fit(X_train, y_train)

# Run the validation split through the fitted pipeline to get predictions.
preds = clf.predict(X_valid)

# Report the mean absolute error on the validation split.
print("MAE:", mean_absolute_error(y_true=y_valid, y_pred=preds))
|