# NOTE(review): removed web-scrape residue (page header, file size, commit hash,
# and line-number gutter) that was not part of the source and broke Python syntax.
"""
This module provides functions for hyperparameter tuning with preprocessing using scikit-learn's GridSearchCV
for both regression and classification tasks.
Features:
- Handles numerical and categorical preprocessing using pipelines.
- Automates hyperparameter tuning for any scikit-learn estimator.
- Uses GridSearchCV for cross-validation and hyperparameter search.
- Applies algorithm-specific preprocessing when necessary (e.g., ordinal encoding for tree-based models).
Functions:
- regression_hyperparameter_tuning: For regression models.
- classification_hyperparameter_tuning: For classification models.
Example Usage (Regression):
from sklearn.ensemble import RandomForestRegressor
from supervised_hyperparameter_tuning import regression_hyperparameter_tuning
X = ... # Your feature DataFrame
y = ... # Your numeric target variable
param_grid = {
'model__n_estimators': [100, 200],
'model__max_depth': [None, 10]
}
best_model, best_params = regression_hyperparameter_tuning(X, y, RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
Example Usage (Classification):
from sklearn.ensemble import RandomForestClassifier
from supervised_hyperparameter_tuning import classification_hyperparameter_tuning
X = ... # Your feature DataFrame
y = ... # Your target variable (categorical)
param_grid = {
'model__n_estimators': [100, 200],
'model__max_depth': [None, 10]
}
best_model, best_params = classification_hyperparameter_tuning(X, y, RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
"""
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
def regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Perform hyperparameter tuning for a regression model using GridSearchCV with preprocessing.

    Numerical columns are median-imputed and standardized; categorical columns are
    constant-imputed and encoded (ordinal for tree-based models, one-hot otherwise).

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Numeric target variable.
        estimator: The scikit-learn regressor to use (e.g., LinearRegression(), RandomForestRegressor()).
        param_grid (dict): Hyperparameter grid for GridSearchCV; keys must use the
            'model__' prefix to address the estimator inside the pipeline.
        cv (int or cross-validation generator): Number of cross-validation folds or a CV generator.
        scoring (str or None): Scoring metric to use (estimator default when None).

    Returns:
        best_model (Pipeline): Best fitted pipeline (preprocessing + estimator) from the search.
        best_params (dict): Best hyperparameters found.
    """
    # Identify numerical and categorical columns.
    # 'number' covers every numeric dtype (int32, float32, etc.), not just
    # int64/float64 — otherwise such columns would be silently dropped by the
    # ColumnTransformer (its default remainder is 'drop').
    numerical_cols = X.select_dtypes(include='number').columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    # Preprocessing for numerical data: impute then standardize.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    # Conditional preprocessing for categorical data: tree-based models handle
    # ordinal codes fine and avoid the column blow-up of one-hot encoding.
    estimator_name = estimator.__class__.__name__
    tree_based_regressors = [
        'DecisionTreeRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'HistGradientBoostingRegressor',
        'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor'
    ]
    if estimator_name in tree_based_regressors:
        # Ordinal encoding; unseen categories at predict time map to -1.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # One-hot encoding; unseen categories at predict time are all-zeros.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
    # Combine the per-dtype transformers into one preprocessing step.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )
    # Full pipeline: preprocessing happens inside each CV fold, avoiding leakage.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])
    # Shuffled KFold with a fixed seed for reproducible splits.
    if isinstance(cv, int):
        cv = KFold(n_splits=cv, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1
    )
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")
    return best_model, best_params
def classification_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Perform hyperparameter tuning for a classification model using GridSearchCV with preprocessing.

    Mirrors the regression variant but is adapted for classification: it handles
    binary and multi-class targets, and an integer ``cv`` is expanded to a
    StratifiedKFold so every fold preserves the class distribution.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Categorical target variable (binary or multi-class).
        estimator: The scikit-learn classifier to use (e.g., LogisticRegression(), RandomForestClassifier()).
        param_grid (dict): Hyperparameter grid for GridSearchCV; keys must use the
            'model__' prefix to address the estimator inside the pipeline.
        cv (int or cross-validation generator): Number of cross-validation folds or a CV generator.
        scoring (str or None): Scoring metric (e.g., 'accuracy', 'f1_macro', 'roc_auc').

    Returns:
        best_model (Pipeline): Best fitted pipeline (preprocessing + estimator) from the search.
        best_params (dict): Best hyperparameters found.
    """
    # Identify numerical and categorical columns.
    # 'number' covers every numeric dtype (int32, float32, etc.), not just
    # int64/float64 — otherwise such columns would be silently dropped by the
    # ColumnTransformer (its default remainder is 'drop').
    numerical_cols = X.select_dtypes(include='number').columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    # Preprocessing for numerical data: impute then standardize.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    # Tree-based models handle ordinal codes fine and avoid the column blow-up
    # of one-hot encoding; everything else gets one-hot.
    estimator_name = estimator.__class__.__name__
    tree_based_classifiers = [
        'DecisionTreeClassifier', 'RandomForestClassifier', 'ExtraTreesClassifier',
        'GradientBoostingClassifier', 'HistGradientBoostingClassifier',
        'XGBClassifier', 'LGBMClassifier', 'CatBoostClassifier'
    ]
    if estimator_name in tree_based_classifiers:
        # Ordinal encoding; unseen categories at predict time map to -1.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # One-hot encoding; unseen categories at predict time are all-zeros.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
    # Combine the per-dtype transformers into one preprocessing step.
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
    # Full pipeline: preprocessing happens inside each CV fold, avoiding leakage.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])
    # Use StratifiedKFold (not KFold) for classification: plain KFold on an
    # imbalanced or ordered target can produce folds missing entire classes,
    # which breaks metrics like roc_auc and skews the search.
    if isinstance(cv, int):
        cv = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1
    )
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")
    return best_model, best_params