# train_regression/utils/supervised_hyperparameter_tuning.py
"""
This module provides a function for hyperparameter tuning with preprocessing using scikit-learn's GridSearchCV specifically for regression models.
Features:
- Handles numerical and categorical preprocessing using pipelines.
- Automates hyperparameter tuning for any scikit-learn regressor.
- Uses GridSearchCV for cross-validation and hyperparameter search.
- Applies algorithm-specific preprocessing when necessary.
Functions:
- regression_hyperparameter_tuning: Performs hyperparameter tuning on a given dataset and estimator.
Example Usage:
from sklearn.ensemble import RandomForestRegressor
from supervised_hyperparameter_tuning import regression_hyperparameter_tuning
X = ... # Your feature DataFrame
y = ... # Your target variable
param_grid = {
'model__n_estimators': [100, 200, 500],
'model__max_depth': [None, 10, 20]
}
best_model, best_params = regression_hyperparameter_tuning(X, y, RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
"""
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, KFold
def regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Performs hyperparameter tuning for a given regression model using GridSearchCV
    with dtype-aware preprocessing.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
        estimator: The scikit-learn regressor to use (e.g., LinearRegression(), RandomForestRegressor()).
        param_grid (dict): Hyperparameter grid for GridSearchCV; keys must be
            prefixed with 'model__' to target the estimator step of the pipeline.
        cv (int or cross-validation generator): Number of cross-validation folds
            or a cross-validation generator. An int is converted to a shuffled
            KFold with a fixed random_state for reproducibility.
        scoring (str or None): Scoring metric to use. None falls back to the
            estimator's default scorer.

    Returns:
        best_model (Pipeline): Best fitted pipeline (preprocessing + model) from the grid search.
        best_params (dict): Best hyperparameters found.
    """
    # Identify numerical and categorical columns.
    # 'number' matches every numeric dtype (int32, float32, etc.), not just
    # int64/float64 — previously, non-default numeric dtypes were silently
    # dropped because ColumnTransformer defaults to remainder='drop'.
    numerical_cols = X.select_dtypes(include='number').columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Preprocessing for numerical data: median imputation (robust to outliers)
    # followed by standardization.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Conditional preprocessing for categorical data: tree-based models handle
    # integer-coded categories natively, so ordinal encoding avoids the column
    # blow-up of one-hot encoding.
    estimator_name = estimator.__class__.__name__
    if estimator_name in [
        'DecisionTreeRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor'
    ]:
        # Use Ordinal Encoding for tree-based models; categories unseen during
        # fit are mapped to -1 instead of raising.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # Use OneHotEncoder for other (e.g., linear) models; unknown categories
        # at transform time are encoded as all-zeros rather than raising.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

    # Create preprocessing pipeline.
    # NOTE: columns that are neither numeric nor object/category (e.g. datetime,
    # bool) are dropped by the default remainder='drop'.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    # Combine preprocessing and the estimator so the grid search cross-validates
    # the whole chain (prevents preprocessing leakage across folds).
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])

    # Define cross-validation strategy: a plain int becomes a reproducible,
    # shuffled KFold.
    if isinstance(cv, int):
        cv = KFold(n_splits=cv, shuffle=True, random_state=42)

    # Initialize GridSearchCV; n_jobs=-1 uses all available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1
    )

    # Perform the grid search over all parameter combinations.
    grid_search.fit(X, y)

    # Extract and report the best pipeline and its hyperparameters.
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")

    return best_model, best_params