# File: train_unsupervised/utils/unsupervised_hyperparameter_tuning.py
# Commit: first commit (4c91838), mboukabous
"""
unsupervised_hyperparameter_tuning.py
Provides a function for hyperparameter tuning of clustering models
using silhouette score as an objective.
"""
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
import copy
def _apply_grid_params(estimator, params):
    """Apply one parameter combination to ``estimator`` in place.

    Each name is resolved in this order:

    1. If the name is a parameter reported by ``estimator.get_params(deep=True)``
       (e.g. ``'model__n_clusters'`` on a pipeline that has a ``model`` step),
       it is set through ``set_params`` so nested components are reached.
    2. Otherwise, if the name has a ``<prefix>__`` part and the remainder is a
       known parameter (e.g. ``'model__n_clusters'`` given for a bare
       estimator), the prefix is dropped and the remainder is set.
    3. Otherwise the value is assigned with ``setattr`` on the segment after
       the first ``__`` (or on the whole name when there is no ``__``), so
       names that cannot be resolved are still accepted without raising.
    """
    get_params = getattr(estimator, 'get_params', None)
    for name, value in params.items():
        # Re-read on every iteration: replacing a nested component changes
        # which nested names are valid.
        known = get_params(deep=True) if get_params is not None else {}
        if name in known:
            estimator.set_params(**{name: value})
            continue

        _, sep, remainder = name.partition('__')
        if sep and remainder in known:
            estimator.set_params(**{remainder: value})
            continue

        # Unresolvable name: plain attribute assignment, no validation.
        path = name.split('__')
        setattr(estimator, path[1] if len(path) > 1 else name, value)


def clustering_hyperparameter_tuning(X, estimator, param_grid, scoring='silhouette', cv=5):
    """
    Exhaustive manual hyperparameter search for a clustering estimator,
    ranking candidates by silhouette score computed on ``X``.

    Every combination in ``param_grid`` is applied to a fresh clone of
    ``estimator``, which is fitted once on the full ``X``. Candidates whose
    labelling cannot be scored (fewer than two distinct labels, or as many
    distinct labels as samples) are skipped.

    Args:
        X (array-like): Feature data for clustering.
        estimator: A clonable estimator with .fit() and either a .labels_
            attribute after fitting or a .predict() method.
        param_grid (dict): Dictionary of hyperparams, e.g.
            {'model__n_clusters': [2, 3, 4]}. Names may address nested
            components (pipeline steps); a leading '<prefix>__' is dropped
            when only the remainder is a parameter of the estimator.
        scoring (str): Accepted for interface compatibility; the value is
            not read and silhouette score is always used.
        cv (int): Accepted for interface compatibility; the value is not
            read and each candidate is fitted exactly once.

    Returns:
        best_estimator: The fitted clone with the highest silhouette score.
            If ``param_grid`` is empty, or no candidate could be scored, the
            original ``estimator`` fitted in place on ``X``.
        best_params: Dictionary of best parameters found ({} on the two
            fallback paths above).

    Raises:
        ValueError: If a fitted candidate exposes neither ``labels_`` nor
            ``predict``.
    """
    # Imported here because the module-level imports do not provide `clone`;
    # done once per call rather than once per grid point.
    from sklearn.base import clone

    if not param_grid:
        # Nothing to search over: fit the caller's estimator as-is.
        estimator.fit(X)
        return estimator, {}

    best_score = float('-inf')
    best_params = None
    best_estimator = None

    for params in ParameterGrid(param_grid):
        # Work on a clone so candidates never share fitted state.
        current_estimator = clone(estimator)
        _apply_grid_params(current_estimator, params)

        current_estimator.fit(X)

        # Prefer labels produced during fit; fall back to predict().
        if getattr(current_estimator, 'labels_', None) is not None:
            labels = current_estimator.labels_
        elif hasattr(current_estimator, 'predict'):
            labels = current_estimator.predict(X)
        else:
            raise ValueError("No valid way to retrieve cluster labels for this estimator.")

        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
        # and raises otherwise; skip such candidates instead of aborting.
        n_labels = len(set(labels))
        if not 1 < n_labels < len(labels):
            continue

        score = silhouette_score(X, labels)
        if score > best_score:
            best_score = score
            best_params = params
            best_estimator = current_estimator

    if best_estimator is None:
        print("No valid parameter combination produced more than 1 cluster. Falling back to original estimator.")
        estimator.fit(X)
        return estimator, {}

    print(f"Best silhouette score: {best_score:.4f}")
    return best_estimator, best_params