AutoML / models /unsupervised.py
Al1Abdullah's picture
Initial commit of AutoML project
aa68823
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from utils.data_cleaner import prepare_data
import pandas as pd
import logging
# Configure logging for this module
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def train_unsupervised(df, model_name, n_clusters=3, eps=0.5, min_samples=5, n_components=2):
"""Trains an unsupervised machine learning model based on the specified model name.
Args:
df (pd.DataFrame): The input DataFrame for unsupervised learning.
model_name (str): The name of the unsupervised model to train (e.g., "KMeans", "DBSCAN", "PCA").
n_clusters (int, optional): Number of clusters for KMeans. Defaults to 3.
eps (float, optional): The maximum distance between two samples for one to be considered as in the neighborhood of the other for DBSCAN. Defaults to 0.5.
min_samples (int, optional): The number of samples (or total weight) in a neighborhood for a point to be considered as a core point for DBSCAN. Defaults to 5.
n_components (int, optional): Number of components to keep for PCA. Defaults to 2.
Returns:
tuple: A tuple containing:
- fitted_model: The trained unsupervised model object.
- result: The clustering labels or transformed data.
- error (str, optional): An error message if training fails.
"""
try:
# Prepare data for unsupervised learning (cleaning and scaling)
df_prepared, _ = prepare_data(df)
if df_prepared.empty:
logging.warning("Prepared DataFrame is empty for unsupervised training.")
return None, "Prepared data is empty."
model = None
# Initialize the selected unsupervised model
if model_name == "KMeans":
model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
elif model_name == "DBSCAN":
model = DBSCAN(eps=eps, min_samples=min_samples)
elif model_name == "PCA":
model = PCA(n_components=n_components)
else:
logging.warning(f"Unsupervised model not supported: {model_name}")
return None, "Model not supported."
# Fit the model to the prepared data
fitted_model = model.fit(df_prepared)
result = None
# Extract results based on the model type
if hasattr(fitted_model, 'labels_'):
result = fitted_model.labels_
logging.info(f"KMeans/DBSCAN trained. Clusters/labels generated.")
elif hasattr(fitted_model, 'components_'):
result = fitted_model.transform(df_prepared)
logging.info(f"PCA trained. Data transformed to {n_components} components.")
else:
logging.info(f"Unsupervised model {model_name} trained, but no specific labels or components found.")
return fitted_model, result
except Exception as e:
logging.error(f"An error occurred during unsupervised model training for {model_name}: {e}", exc_info=True)
return None, f"An error occurred during model training: {e}"