# train_classificator/utils/supervised_hyperparameter_tuning.py
"""
This module provides functions for hyperparameter tuning with preprocessing using scikit-learn's GridSearchCV
for both regression and classification tasks.
Features:
- Handles numerical and categorical preprocessing using pipelines.
- Automates hyperparameter tuning for any scikit-learn estimator.
- Uses GridSearchCV for cross-validation and hyperparameter search.
- Applies algorithm-specific preprocessing when necessary (e.g., ordinal encoding for tree-based models).
Functions:
- regression_hyperparameter_tuning: For regression models.
- classification_hyperparameter_tuning: For classification models.
Example Usage (Regression):
from sklearn.ensemble import RandomForestRegressor
from supervised_hyperparameter_tuning import regression_hyperparameter_tuning
X = ... # Your feature DataFrame
y = ... # Your numeric target variable
param_grid = {
'model__n_estimators': [100, 200],
'model__max_depth': [None, 10]
}
best_model, best_params = regression_hyperparameter_tuning(X, y, RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
Example Usage (Classification):
from sklearn.ensemble import RandomForestClassifier
from supervised_hyperparameter_tuning import classification_hyperparameter_tuning
X = ... # Your feature DataFrame
y = ... # Your target variable (categorical)
param_grid = {
'model__n_estimators': [100, 200],
'model__max_depth': [None, 10]
}
best_model, best_params = classification_hyperparameter_tuning(X, y, RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
"""
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
def regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Tune a regression estimator with GridSearchCV, wrapping it in a
    column-aware preprocessing pipeline.

    Numerical columns are median-imputed and standardized. Categorical
    columns are imputed with the constant 'Missing' and then either
    ordinal-encoded (for tree-based regressors, which handle integer
    codes natively) or one-hot-encoded (for every other regressor).

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Numeric target variable.
        estimator: scikit-learn regressor instance (e.g. RandomForestRegressor()).
        param_grid (dict): Hyperparameter grid; prefix model params with 'model__'.
        cv (int or cross-validation generator): Fold count or a ready-made splitter.
        scoring (str or None): Scoring metric passed to GridSearchCV.

    Returns:
        tuple: (best_model, best_params) — the refitted best pipeline and
        the dict of winning hyperparameters.
    """
    # Split the feature columns by dtype so each group gets its own transformer.
    num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numerical branch: impute with the median, then standardize.
    num_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Tree ensembles cope with ordinal codes directly; other models need
    # a dense one-hot expansion to avoid implying an ordering.
    tree_regressors = {
        'DecisionTreeRegressor', 'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor', 'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor',
    }
    model_name = estimator.__class__.__name__
    if model_name in tree_regressors:
        encoder_step = ('ordinal_encoder',
                        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    else:
        encoder_step = ('onehot',
                        OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    cat_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        encoder_step,
    ])

    # Route each column group through its transformer, then append the model.
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_pipe, num_features),
        ('cat', cat_pipe, cat_features),
    ])
    search_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator),
    ])

    # An integer cv becomes a shuffled KFold with a fixed seed for reproducibility.
    if isinstance(cv, int):
        cv = KFold(n_splits=cv, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        estimator=search_pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Report the winning configuration in a stable (sorted) order.
    print(f"Best Hyperparameters for {model_name}:")
    for key in sorted(best_params):
        print(f"{key}: {best_params[key]}")

    return best_model, best_params
def classification_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None):
    """
    Tune a classification estimator with GridSearchCV, wrapping it in a
    column-aware preprocessing pipeline.

    Handles binary and multi-class targets. Numerical columns are
    median-imputed and standardized; categorical columns are imputed with
    the constant 'Missing' and then either ordinal-encoded (tree-based
    classifiers) or one-hot-encoded (all other classifiers).

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Categorical target (binary or multi-class).
        estimator: scikit-learn classifier instance (e.g. LogisticRegression()).
        param_grid (dict): Hyperparameter grid; prefix model params with 'model__'.
        cv (int or cross-validation generator): Fold count or a ready-made splitter.
        scoring (str or None): Scoring metric (e.g. 'accuracy', 'f1_macro').

    Returns:
        tuple: (best_model, best_params) — the refitted best pipeline and
        the dict of winning hyperparameters.
    """
    # Identify numerical and categorical columns
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Define preprocessing for numerical data
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Determine encoding strategy based on model type (tree-based vs. others)
    estimator_name = estimator.__class__.__name__
    tree_based_classifiers = [
        'DecisionTreeClassifier', 'RandomForestClassifier', 'ExtraTreesClassifier',
        'GradientBoostingClassifier', 'XGBClassifier', 'LGBMClassifier', 'CatBoostClassifier'
    ]
    if estimator_name in tree_based_classifiers:
        # Tree models handle integer category codes; unknown categories map to -1.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])
    else:
        # Other models need a dense one-hot expansion; unknown categories are ignored.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])

    # Create preprocessing pipeline
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    # Combine preprocessing and estimator in a pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])

    # Define cross-validation strategy.
    # Use StratifiedKFold (not plain KFold) so each fold preserves the class
    # distribution of y — plain KFold can yield folds with skewed or missing
    # classes, which distorts metrics like roc_auc/f1 on imbalanced data.
    if isinstance(cv, int):
        cv = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # GridSearchCV for classification
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1
    )
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Report the winning configuration in a stable (sorted) order.
    print(f"Best Hyperparameters for {estimator_name}:")
    for param_name in sorted(best_params.keys()):
        print(f"{param_name}: {best_params[param_name]}")

    return best_model, best_params