Spaces:

Al1Abdullah
/

AutoML

Sleeping

App Files Files Community

AutoML / models /unsupervised.py

Al1Abdullah

Initial commit of AutoML project

aa68823 6 months ago

raw

history blame contribute delete

3.14 kB

	from sklearn.cluster import KMeans, DBSCAN
	from sklearn.decomposition import PCA
	from utils.data_cleaner import prepare_data
	import pandas as pd
	import logging

	# Configure logging for this module
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

	def train_unsupervised(df, model_name, n_clusters=3, eps=0.5, min_samples=5, n_components=2):
	"""Trains an unsupervised machine learning model based on the specified model name.

	Args:
	df (pd.DataFrame): The input DataFrame for unsupervised learning.
	model_name (str): The name of the unsupervised model to train (e.g., "KMeans", "DBSCAN", "PCA").
	n_clusters (int, optional): Number of clusters for KMeans. Defaults to 3.
	eps (float, optional): The maximum distance between two samples for one to be considered as in the neighborhood of the other for DBSCAN. Defaults to 0.5.
	min_samples (int, optional): The number of samples (or total weight) in a neighborhood for a point to be considered as a core point for DBSCAN. Defaults to 5.
	n_components (int, optional): Number of components to keep for PCA. Defaults to 2.

	Returns:
	tuple: A tuple containing:
	- fitted_model: The trained unsupervised model object.
	- result: The clustering labels or transformed data.
	- error (str, optional): An error message if training fails.
	"""
	try:
	# Prepare data for unsupervised learning (cleaning and scaling)
	df_prepared, _ = prepare_data(df)

	if df_prepared.empty:
	logging.warning("Prepared DataFrame is empty for unsupervised training.")
	return None, "Prepared data is empty."

	model = None
	# Initialize the selected unsupervised model
	if model_name == "KMeans":
	model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
	elif model_name == "DBSCAN":
	model = DBSCAN(eps=eps, min_samples=min_samples)
	elif model_name == "PCA":
	model = PCA(n_components=n_components)
	else:
	logging.warning(f"Unsupervised model not supported: {model_name}")
	return None, "Model not supported."

	# Fit the model to the prepared data
	fitted_model = model.fit(df_prepared)

	result = None
	# Extract results based on the model type
	if hasattr(fitted_model, 'labels_'):
	result = fitted_model.labels_
	logging.info(f"KMeans/DBSCAN trained. Clusters/labels generated.")
	elif hasattr(fitted_model, 'components_'):
	result = fitted_model.transform(df_prepared)
	logging.info(f"PCA trained. Data transformed to {n_components} components.")
	else:
	logging.info(f"Unsupervised model {model_name} trained, but no specific labels or components found.")

	return fitted_model, result
	except Exception as e:
	logging.error(f"An error occurred during unsupervised model training for {model_name}: {e}", exc_info=True)
	return None, f"An error occurred during model training: {e}"