olivebradshaw committed on
Commit
eb4e2e2
·
1 Parent(s): 2423931

initial commit

Browse files
Files changed (1) hide show
  1. preprocessing.py +38 -0
preprocessing.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.compose import ColumnTransformer
2
+ from sklearn.pipeline import Pipeline
3
+ from sklearn.impute import SimpleImputer
4
+ from sklearn.preprocessing import OneHotEncoder
5
+ from sklearn.ensemble import RandomForestRegressor
6
+ from sklearn.metrics import mean_absolute_error
7
+
8
+ # Preprocessing for numerical data: fill missing values with a constant
9
+ numerical_transformer = SimpleImputer(strategy='constant')
10
+
11
+ # Preprocessing for categorical data: impute the most frequent value, then one-hot encode
12
+ categorical_transformer = Pipeline(steps=[
13
+ ('imputer', SimpleImputer(strategy='most_frequent')),
14
+ ('onehot', OneHotEncoder(handle_unknown='ignore'))
15
+ ])
16
+
17
+ # Bundle preprocessing for numerical and categorical data (NOTE(review): numerical_cols / categorical_cols are not defined in this file)
18
+ preprocessor = ColumnTransformer(
19
+ transformers=[
20
+ ('num', numerical_transformer, numerical_cols),
21
+ ('cat', categorical_transformer, categorical_cols)
22
+ ])
23
+
24
+ # Define model
25
+ model = RandomForestRegressor(n_estimators=100, random_state=0)
26
+
27
+ # Bundle preprocessing and modeling code in a pipeline
28
+ clf = Pipeline(steps=[('preprocessor', preprocessor),
29
+ ('model', model)
30
+ ])
31
+
32
+ # Preprocess the training data and fit the model (NOTE(review): X_train / y_train are not defined in this file)
33
+ clf.fit(X_train, y_train)
34
+
35
+ # Preprocess the validation data and get predictions
36
+ preds = clf.predict(X_valid)
37
+
38
+ print('MAE:', mean_absolute_error(y_valid, preds))