Spaces:
Sleeping
Sleeping
| """ | |
| This module provides a function for hyperparameter tuning with preprocessing using scikit-learn's GridSearchCV specifically for regression models. | |
| Features: | |
| - Handles numerical and categorical preprocessing using pipelines. | |
| - Automates hyperparameter tuning for any scikit-learn regressor. | |
| - Uses GridSearchCV for cross-validation and hyperparameter search. | |
| - Applies algorithm-specific preprocessing when necessary. | |
| Functions: | |
| - regression_hyperparameter_tuning: Performs hyperparameter tuning on a given dataset and regression estimator. | |
| Example Usage: | |
| from sklearn.ensemble import RandomForestRegressor | |
| from supervised_hyperparameter_tuning import regression_hyperparameter_tuning | |
| X = ... # Your feature DataFrame | |
| y = ... # Your target variable | |
| param_grid = { | |
| 'model__n_estimators': [100, 200, 500], | |
| 'model__max_depth': [None, 10, 20] | |
| } | |
| best_model, best_params = regression_hyperparameter_tuning(X, y, RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error') | |
| """ | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler | |
| from sklearn.model_selection import GridSearchCV, KFold | |
def regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Tune a regression estimator with GridSearchCV inside a preprocessing pipeline.

    The estimator is wrapped in a two-step pipeline (``'preprocessor'`` then
    ``'model'``), so keys in ``param_grid`` must use the matching prefixes,
    e.g. ``'model__max_depth'``.

    Preprocessing:
        - Numeric columns: median imputation followed by standard scaling.
        - Categorical columns (``object`` / ``category`` dtype): missing values
          are replaced by the constant ``'Missing'``; the result is
          ordinal-encoded for the tree-based estimators listed below and
          one-hot encoded for every other estimator.
        - Columns of any other dtype are listed in neither transformer and are
          therefore left out by the ColumnTransformer's default ``remainder``.

    Args:
        X (pd.DataFrame): Features. Must support ``select_dtypes``.
        y (pd.Series): Target variable.
        estimator: The scikit-learn compatible regressor to tune
            (e.g., LinearRegression(), RandomForestRegressor()).
        param_grid (dict): Hyperparameter grid passed to GridSearchCV.
        cv (int or cross-validation generator): An int is turned into a
            shuffled ``KFold`` with ``random_state=42``; anything else is
            passed to GridSearchCV unchanged.
        scoring (str or None): Scoring metric passed to GridSearchCV.

    Returns:
        best_model (Pipeline): ``best_estimator_`` of the fitted grid search.
        best_params (dict): ``best_params_`` of the fitted grid search.

    Side effects:
        Prints the best hyperparameters, sorted by name, to stdout.
    """
    # Select every numeric dtype, not only int64/float64: narrower or unsigned
    # columns (int32, float32, uint8, ...) would otherwise match neither list
    # below and be silently left out of the model. Timedelta columns are
    # excluded explicitly so durations are never treated as numeric features.
    numerical_cols = X.select_dtypes(
        include='number', exclude=['timedelta']
    ).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Preprocessing for numerical data
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # The categorical encoding is chosen from the estimator's class name
    estimator_name = estimator.__class__.__name__
    tree_based_estimators = (
        'DecisionTreeRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor',
    )
    if estimator_name in tree_based_estimators:
        # Ordinal encoding for tree-based models; categories unseen during fit
        # are mapped to -1 instead of raising.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ])
    else:
        # One-hot encoding for all other models; categories unseen during fit
        # are ignored instead of raising.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ])

    # Route each column group through its own transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols),
        ]
    )

    # Preprocessing lives inside the pipeline so it is re-fitted on each
    # training split during cross-validation.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator),
    ])

    # An integer fold count becomes a shuffled, seeded KFold
    if isinstance(cv, int):
        cv = KFold(n_splits=cv, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params):
        print(f"{param_name}: {best_params[param_name]}")

    return best_model, best_params