from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from sklearn.preprocessing import OneHotEncoder from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_absolute_error # Preprocessing for numerical data numerical_transformer = SimpleImputer(strategy='constant') # Preprocessing for categorical data categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore')) ]) # Bundle preprocessing for numerical and categorical data preprocessor = ColumnTransformer( transformers=[ ('num', numerical_transformer, numerical_cols), ('cat', categorical_transformer, categorical_cols) ]) # Define model model = RandomForestRegressor(n_estimators=100, random_state=0) # Bundle preprocessing and modeling code in a pipeline clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model) ]) # Preprocessing of training data, fit model clf.fit(X_train, y_train) # Preprocessing of validation data, get predictions preds = clf.predict(X_valid) print('MAE:', mean_absolute_error(y_valid, preds))