File size: 8,630 Bytes
7c045bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214

"""
This module provides functions for hyperparameter tuning with preprocessing using scikit-learn's GridSearchCV
for both regression and classification tasks.

Features:
- Handles numerical and categorical preprocessing using pipelines.
- Automates hyperparameter tuning for any scikit-learn estimator.
- Uses GridSearchCV for cross-validation and hyperparameter search.
- Applies algorithm-specific preprocessing when necessary (e.g., ordinal encoding for tree-based models).

Functions:
    - regression_hyperparameter_tuning: For regression models.
    - classification_hyperparameter_tuning: For classification models.

Example Usage (Regression):
    from sklearn.ensemble import RandomForestRegressor
    from supervised_hyperparameter_tuning import regression_hyperparameter_tuning

    X = ...  # Your feature DataFrame
    y = ...  # Your numeric target variable
    param_grid = {
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10]
    }
    best_model, best_params = regression_hyperparameter_tuning(X, y, RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')

Example Usage (Classification):
    from sklearn.ensemble import RandomForestClassifier
    from supervised_hyperparameter_tuning import classification_hyperparameter_tuning

    X = ...  # Your feature DataFrame
    y = ...  # Your target variable (categorical)
    param_grid = {
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10]
    }
    best_model, best_params = classification_hyperparameter_tuning(X, y, RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
"""

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

def regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Performs hyperparameter tuning for a given regression model using GridSearchCV with preprocessing.

    Numerical columns are median-imputed and standardized; categorical columns are
    constant-imputed and encoded (ordinal encoding for tree-based models, one-hot
    otherwise). Preprocessing is fit inside each CV fold via a Pipeline, so there
    is no train/validation leakage.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
        estimator: The scikit-learn regressor to use (e.g., LinearRegression(), RandomForestRegressor()).
        param_grid (dict): Hyperparameter grid for GridSearchCV. Keys must use the
            'model__' prefix to reach the estimator inside the pipeline.
        cv (int or cross-validation generator): Number of cross-validation folds or a cross-validation generator.
        scoring (str or None): Scoring metric to use (None falls back to the estimator's default scorer).

    Returns:
        best_model (Pipeline): Best model within a pipeline from GridSearch, refit on all of X/y.
        best_params (dict): Best hyperparameters.
    """
    # Use pandas' generic 'number' selector so every numeric dtype (int32,
    # float32, etc.) is preprocessed — hard-coding ['int64', 'float64'] would
    # silently drop other numeric columns, because ColumnTransformer's default
    # remainder='drop' discards unselected columns.
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numerical preprocessing: median imputation (robust to outliers) + scaling.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Choose the categorical encoding based on the model family.
    estimator_name = estimator.__class__.__name__

    if estimator_name in [
        'DecisionTreeRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor'
    ]:
        # Tree-based models split on thresholds, so compact ordinal codes work
        # and avoid the column blow-up of one-hot encoding. Unseen categories
        # at predict time map to -1 instead of raising.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # Linear/kernel models need one-hot encoding: ordinal codes would imply
        # a spurious ordering between categories.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

    # Route each column group to its transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    # Combine preprocessing and the estimator so GridSearchCV refits everything
    # per fold.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])

    # An int cv becomes a shuffled KFold with a fixed seed for reproducibility.
    if isinstance(cv, int):
        cv = KFold(n_splits=cv, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1  # use all cores for the grid search
    )

    # Exhaustive search; best_estimator_ is refit on the full data afterwards.
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")

    return best_model, best_params

def classification_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Performs hyperparameter tuning for a given classification model using GridSearchCV with preprocessing.

    This function mirrors the regression variant but is adapted for classification:
    when ``cv`` is an integer it uses StratifiedKFold so every fold preserves the
    class proportions of ``y``. Plain KFold can yield folds missing a class
    entirely on imbalanced or small datasets, which breaks metrics such as
    'roc_auc' or 'f1'. Handles both binary and multi-class targets.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable (categorical) for classification (can be binary or multi-class).
        estimator: The scikit-learn classifier to use (e.g., LogisticRegression(), RandomForestClassifier()).
        param_grid (dict): Hyperparameter grid for GridSearchCV. Keys must use the
            'model__' prefix to reach the estimator inside the pipeline.
        cv (int or cross-validation generator): Number of cross-validation folds or a CV generator.
        scoring (str or None): Scoring metric (e.g., 'accuracy' for binary or multi-class, 'f1_macro' for multi-class).

    Returns:
        best_model (Pipeline): Best model within a pipeline from GridSearch, refit on all of X/y.
        best_params (dict): Best hyperparameters.
    """
    # Use pandas' generic 'number' selector so every numeric dtype (int32,
    # float32, etc.) is preprocessed — hard-coding ['int64', 'float64'] would
    # silently drop other numeric columns, because ColumnTransformer's default
    # remainder='drop' discards unselected columns.
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numerical preprocessing: median imputation (robust to outliers) + scaling.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Choose the categorical encoding based on the model family.
    estimator_name = estimator.__class__.__name__
    tree_based_classifiers = [
        'DecisionTreeClassifier', 'RandomForestClassifier', 'ExtraTreesClassifier',
        'GradientBoostingClassifier', 'XGBClassifier', 'LGBMClassifier', 'CatBoostClassifier'
    ]

    if estimator_name in tree_based_classifiers:
        # Tree-based models split on thresholds, so compact ordinal codes work
        # and avoid the column blow-up of one-hot encoding. Unseen categories
        # at predict time map to -1 instead of raising.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # Linear/kernel models need one-hot encoding: ordinal codes would imply
        # a spurious ordering between categories.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

    # Route each column group to its transformer.
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    # Combine preprocessing and the estimator so GridSearchCV refits everything
    # per fold (no train/validation leakage).
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])

    # Stratified splitting keeps per-fold class proportions representative;
    # fixed seed for reproducibility.
    if isinstance(cv, int):
        cv = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1  # use all cores for the grid search
    )

    # Exhaustive search; best_estimator_ is refit on the full data afterwards.
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")

    return best_model, best_params