Spaces:
Sleeping
Sleeping
| from sklearn.cluster import KMeans, DBSCAN | |
| from sklearn.decomposition import PCA | |
| from utils.data_cleaner import prepare_data | |
| import pandas as pd | |
| import logging | |
| # Configure logging for this module | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
| def train_unsupervised(df, model_name, n_clusters=3, eps=0.5, min_samples=5, n_components=2): | |
| """Trains an unsupervised machine learning model based on the specified model name. | |
| Args: | |
| df (pd.DataFrame): The input DataFrame for unsupervised learning. | |
| model_name (str): The name of the unsupervised model to train (e.g., "KMeans", "DBSCAN", "PCA"). | |
| n_clusters (int, optional): Number of clusters for KMeans. Defaults to 3. | |
| eps (float, optional): The maximum distance between two samples for one to be considered as in the neighborhood of the other for DBSCAN. Defaults to 0.5. | |
| min_samples (int, optional): The number of samples (or total weight) in a neighborhood for a point to be considered as a core point for DBSCAN. Defaults to 5. | |
| n_components (int, optional): Number of components to keep for PCA. Defaults to 2. | |
| Returns: | |
| tuple: A tuple containing: | |
| - fitted_model: The trained unsupervised model object. | |
| - result: The clustering labels or transformed data. | |
| - error (str, optional): An error message if training fails. | |
| """ | |
| try: | |
| # Prepare data for unsupervised learning (cleaning and scaling) | |
| df_prepared, _ = prepare_data(df) | |
| if df_prepared.empty: | |
| logging.warning("Prepared DataFrame is empty for unsupervised training.") | |
| return None, "Prepared data is empty." | |
| model = None | |
| # Initialize the selected unsupervised model | |
| if model_name == "KMeans": | |
| model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) | |
| elif model_name == "DBSCAN": | |
| model = DBSCAN(eps=eps, min_samples=min_samples) | |
| elif model_name == "PCA": | |
| model = PCA(n_components=n_components) | |
| else: | |
| logging.warning(f"Unsupervised model not supported: {model_name}") | |
| return None, "Model not supported." | |
| # Fit the model to the prepared data | |
| fitted_model = model.fit(df_prepared) | |
| result = None | |
| # Extract results based on the model type | |
| if hasattr(fitted_model, 'labels_'): | |
| result = fitted_model.labels_ | |
| logging.info(f"KMeans/DBSCAN trained. Clusters/labels generated.") | |
| elif hasattr(fitted_model, 'components_'): | |
| result = fitted_model.transform(df_prepared) | |
| logging.info(f"PCA trained. Data transformed to {n_components} components.") | |
| else: | |
| logging.info(f"Unsupervised model {model_name} trained, but no specific labels or components found.") | |
| return fitted_model, result | |
| except Exception as e: | |
| logging.error(f"An error occurred during unsupervised model training for {model_name}: {e}", exc_info=True) | |
| return None, f"An error occurred during model training: {e}" | |