File size: 8,630 Bytes
7c045bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214

"""
This module provides functions for hyperparameter tuning with preprocessing using scikit-learn's GridSearchCV
for both regression and classification tasks.

Features:
- Handles numerical and categorical preprocessing using pipelines.
- Automates hyperparameter tuning for any scikit-learn estimator.
- Uses GridSearchCV for cross-validation and hyperparameter search.
- Applies algorithm-specific preprocessing when necessary (e.g., ordinal encoding for tree-based models).

Functions:
    - regression_hyperparameter_tuning: For regression models.
    - classification_hyperparameter_tuning: For classification models.

Example Usage (Regression):
    from sklearn.ensemble import RandomForestRegressor
    from supervised_hyperparameter_tuning import regression_hyperparameter_tuning

    X = ...  # Your feature DataFrame
    y = ...  # Your numeric target variable
    param_grid = {
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10]
    }
    best_model, best_params = regression_hyperparameter_tuning(X, y, RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')

Example Usage (Classification):
    from sklearn.ensemble import RandomForestClassifier
    from supervised_hyperparameter_tuning import classification_hyperparameter_tuning

    X = ...  # Your feature DataFrame
    y = ...  # Your target variable (categorical)
    param_grid = {
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10]
    }
    best_model, best_params = classification_hyperparameter_tuning(X, y, RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
"""

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

def regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Performs hyperparameter tuning for a given regression model using GridSearchCV with preprocessing.

    Numerical columns are median-imputed and standardized; categorical columns are
    constant-imputed and encoded (ordinal encoding for tree-based models, one-hot
    otherwise). Preprocessing is fit inside each CV fold via a Pipeline, so there
    is no train/validation leakage.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable.
        estimator: The scikit-learn regressor to use (e.g., LinearRegression(), RandomForestRegressor()).
        param_grid (dict): Hyperparameter grid for GridSearchCV. Keys must use the
            'model__' prefix to reach the estimator inside the pipeline.
        cv (int or cross-validation generator): Number of cross-validation folds or a cross-validation generator.
        scoring (str or None): Scoring metric to use (None falls back to the estimator's default scorer).

    Returns:
        best_model (Pipeline): Best model within a pipeline from GridSearch, refit on all of X/y.
        best_params (dict): Best hyperparameters.
    """
    # Use pandas' generic 'number' selector so every numeric dtype (int32,
    # float32, etc.) is preprocessed — hard-coding ['int64', 'float64'] would
    # silently drop other numeric columns, because ColumnTransformer's default
    # remainder='drop' discards unselected columns.
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numerical preprocessing: median imputation (robust to outliers) + scaling.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Choose the categorical encoding based on the model family.
    estimator_name = estimator.__class__.__name__

    if estimator_name in [
        'DecisionTreeRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor'
    ]:
        # Tree-based models split on thresholds, so compact ordinal codes work
        # and avoid the column blow-up of one-hot encoding. Unseen categories
        # at predict time map to -1 instead of raising.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # Linear/kernel models need one-hot encoding: ordinal codes would imply
        # a spurious ordering between categories.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

    # Route each column group to its transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    # Combine preprocessing and the estimator so GridSearchCV refits everything
    # per fold.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])

    # An int cv becomes a shuffled KFold with a fixed seed for reproducibility.
    if isinstance(cv, int):
        cv = KFold(n_splits=cv, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1  # use all cores for the grid search
    )

    # Exhaustive search; best_estimator_ is refit on the full data afterwards.
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")

    return best_model, best_params

def classification_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Performs hyperparameter tuning for a given classification model using GridSearchCV with preprocessing.

    This function mirrors the regression variant but is adapted for classification:
    when ``cv`` is an integer it uses StratifiedKFold so every fold preserves the
    class proportions of ``y``. Plain KFold can yield folds missing a class
    entirely on imbalanced or small datasets, which breaks metrics such as
    'roc_auc' or 'f1'. Handles both binary and multi-class targets.

    Args:
        X (pd.DataFrame): Features.
        y (pd.Series): Target variable (categorical) for classification (can be binary or multi-class).
        estimator: The scikit-learn classifier to use (e.g., LogisticRegression(), RandomForestClassifier()).
        param_grid (dict): Hyperparameter grid for GridSearchCV. Keys must use the
            'model__' prefix to reach the estimator inside the pipeline.
        cv (int or cross-validation generator): Number of cross-validation folds or a CV generator.
        scoring (str or None): Scoring metric (e.g., 'accuracy' for binary or multi-class, 'f1_macro' for multi-class).

    Returns:
        best_model (Pipeline): Best model within a pipeline from GridSearch, refit on all of X/y.
        best_params (dict): Best hyperparameters.
    """
    # Use pandas' generic 'number' selector so every numeric dtype (int32,
    # float32, etc.) is preprocessed — hard-coding ['int64', 'float64'] would
    # silently drop other numeric columns, because ColumnTransformer's default
    # remainder='drop' discards unselected columns.
    numerical_cols = X.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numerical preprocessing: median imputation (robust to outliers) + scaling.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Choose the categorical encoding based on the model family.
    estimator_name = estimator.__class__.__name__
    tree_based_classifiers = [
        'DecisionTreeClassifier', 'RandomForestClassifier', 'ExtraTreesClassifier',
        'GradientBoostingClassifier', 'XGBClassifier', 'LGBMClassifier', 'CatBoostClassifier'
    ]

    if estimator_name in tree_based_classifiers:
        # Tree-based models split on thresholds, so compact ordinal codes work
        # and avoid the column blow-up of one-hot encoding. Unseen categories
        # at predict time map to -1 instead of raising.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # Linear/kernel models need one-hot encoding: ordinal codes would imply
        # a spurious ordering between categories.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

    # Route each column group to its transformer.
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    # Combine preprocessing and the estimator so GridSearchCV refits everything
    # per fold (no train/validation leakage).
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])

    # Stratified splitting keeps per-fold class proportions representative;
    # fixed seed for reproducibility.
    if isinstance(cv, int):
        cv = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1  # use all cores for the grid search
    )

    # Exhaustive search; best_estimator_ is refit on the full data afterwards.
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")

    return best_model, best_params