Spaces:
Sleeping
Sleeping
| # modules/clustering.py | |
| # -*- coding: utf-8 -*- | |
| # | |
| # PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform | |
| # | |
| # DESCRIPTION: Specialized module for K-Means clustering. This version is | |
| # updated to return the cluster labels for downstream profiling. | |
| import logging | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from sklearn.cluster import KMeans | |
| from sklearn.decomposition import PCA | |
| from sklearn.preprocessing import StandardScaler | |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int):
    """
    Performs K-Means clustering, generates an Elbow plot for optimal K,
    visualizes the clusters via PCA, and returns the cluster labels.

    Rows with a missing value in any of ``numeric_cols`` are dropped before
    fitting, so the returned labels cover only the remaining rows (they keep
    the original index of ``df``). Features are standardized before both
    K-Means and PCA.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numeric_cols (list): A list of numeric columns to use for clustering.
        n_clusters (int): The number of clusters (k) to create.

    Returns:
        A tuple containing:
        - fig_cluster (go.Figure): Plot of the clustered data in 2D PCA space.
        - fig_elbow (go.Figure): The Elbow Method plot for determining optimal k.
          K runs from 1 up to 10, capped at the number of usable rows.
        - summary (str): A markdown summary of the methodology.
        - labels (pd.Series): The cluster label assigned to each data point.

        When the input cannot be clustered (fewer than 2 features, fewer
        complete rows than ``n_clusters``, or fewer than 2 complete rows),
        both figures are the same empty figure, ``summary`` explains why,
        and ``labels`` is an empty Series.
    """
    if len(numeric_cols) < 2:
        empty_fig = go.Figure()
        return (
            empty_fig,
            empty_fig,
            "Clustering requires at least 2 numeric features.",
            pd.Series(dtype=int, name='Cluster_Labels'),
        )

    cluster_data = df[numeric_cols].dropna()
    n_samples = len(cluster_data)

    if n_samples < n_clusters:
        empty_fig = go.Figure()
        return (
            empty_fig,
            empty_fig,
            f"Not enough data ({len(cluster_data)}) for {n_clusters} clusters.",
            pd.Series(dtype=int, name='Cluster_Labels'),
        )

    # The 2-component PCA projection below needs at least 2 samples; bail out
    # gracefully instead of letting scikit-learn raise.
    if n_samples < 2:
        empty_fig = go.Figure()
        return (
            empty_fig,
            empty_fig,
            "Clustering requires at least 2 complete rows of data.",
            pd.Series(dtype=int, name='Cluster_Labels'),
        )

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(cluster_data)

    # --- Elbow Method Plot ---
    # K-Means cannot fit more clusters than there are samples, so the sweep
    # stops at the sample count for small datasets.
    k_values = list(range(1, min(10, n_samples) + 1))
    wcss = [
        KMeans(n_clusters=k, init='k-means++', random_state=42, n_init='auto')
        .fit(scaled_data)
        .inertia_
        for k in k_values
    ]

    fig_elbow = go.Figure()
    fig_elbow.add_trace(go.Scatter(x=k_values, y=wcss, mode='lines+markers'))
    fig_elbow.update_layout(title='<b>💡 The Elbow Method for Optimal K</b>',
                            xaxis_title='Number of Clusters (K)',
                            yaxis_title='Within-Cluster Sum of Squares (WCSS)')

    # --- K-Means Clustering & Visualization ---
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init='auto').fit(scaled_data)
    # Re-attach the surviving row index so labels can be joined back onto df.
    labels = pd.Series(kmeans.labels_, name='Cluster_Labels', index=cluster_data.index)

    pca = PCA(n_components=2)
    components = pca.fit_transform(scaled_data)

    # Create a DataFrame for plotting
    plot_df = pd.DataFrame(components, columns=['PCA1', 'PCA2'], index=cluster_data.index)
    # String labels make plotly treat clusters as discrete categories.
    plot_df['Cluster'] = labels.astype(str)

    fig_cluster = px.scatter(
        plot_df, x='PCA1', y='PCA2', color='Cluster',
        title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
        labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
        color_discrete_sequence=px.colors.qualitative.Vivid
    )

    explained_variance = pca.explained_variance_ratio_.sum() * 100
    summary = (f"**Features Used:** `{len(numeric_cols)}` | **Clusters (K):** `{n_clusters}`\n\n"
               f"PCA explains **{explained_variance:.2f}%** of variance.")

    return fig_cluster, fig_elbow, summary, labels