Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

CognitiveEDA / modules /clustering.py

mgbam

Update modules/clustering.py

23226ad verified 9 months ago

raw

history blame contribute delete

3.53 kB

	# modules/clustering.py

	# -*- coding: utf-8 -*-
	#
	# PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
	#
	# DESCRIPTION: Specialized module for K-Means clustering. This version is
	# updated to return the cluster labels for downstream profiling.

	import logging
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from sklearn.cluster import KMeans
	from sklearn.decomposition import PCA
	from sklearn.preprocessing import StandardScaler

	def _empty_clustering_result(message: str) -> tuple:
	    """Build the placeholder result returned when clustering cannot run.

	    Args:
	        message: Human-readable explanation of why clustering was skipped.

	    Returns:
	        A ``(fig_cluster, fig_elbow, summary, labels)`` tuple holding two
	        independent empty figures, ``message`` and an empty label Series.
	    """
	    # Two separate Figure objects so a caller mutating one does not silently
	    # change the other; an explicit dtype avoids the "empty Series default
	    # dtype" DeprecationWarning emitted by older pandas releases.
	    return go.Figure(), go.Figure(), message, pd.Series(dtype=int)


	def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int) -> "tuple[go.Figure, go.Figure, str, pd.Series]":
	    """
	    Performs K-Means clustering, generates an Elbow plot for optimal K,
	    visualizes the clusters via PCA, and returns the cluster labels.

	    Rows containing a missing value in any of ``numeric_cols`` are dropped
	    before scaling, so the returned labels are indexed by the surviving rows
	    only. When the input cannot be clustered (too few features, too few rows,
	    or a non-positive ``n_clusters``) the function does not raise; it returns
	    empty figures, an explanatory message and an empty Series instead.

	    Args:
	        df (pd.DataFrame): The input DataFrame.
	        numeric_cols (list): A list of numeric columns to use for clustering.
	        n_clusters (int): The number of clusters (k) to create.

	    Returns:
	        A tuple containing:
	        - fig_cluster (go.Figure): Plot of the clustered data in 2D PCA space.
	        - fig_elbow (go.Figure): The Elbow Method plot for determining optimal k.
	        - summary (str): A markdown summary of the methodology.
	        - labels (pd.Series): The cluster label assigned to each data point.
	    """
	    if len(numeric_cols) < 2:
	        return _empty_clustering_result("Clustering requires at least 2 numeric features.")

	    if n_clusters < 1:
	        # KMeans rejects non-positive k; report it the same way as the other
	        # invalid inputs rather than crashing inside scikit-learn.
	        return _empty_clustering_result("Number of clusters must be at least 1.")

	    cluster_data = df[numeric_cols].dropna()
	    n_rows = len(cluster_data)
	    if n_rows < n_clusters:
	        return _empty_clustering_result(f"Not enough data ({n_rows}) for {n_clusters} clusters.")
	    if n_rows < 2:
	        # The 2-component PCA projection below needs at least two samples.
	        return _empty_clustering_result(f"Not enough data ({n_rows}) for a 2D PCA projection.")

	    scaled_data = StandardScaler().fit_transform(cluster_data)

	    # --- Elbow Method Plot ---
	    # KMeans cannot fit more clusters than there are samples, so the sweep
	    # stops at the row count when fewer than 10 rows are available.
	    k_values = list(range(1, min(10, n_rows) + 1))
	    wcss = [
	        KMeans(n_clusters=k, init='k-means++', random_state=42, n_init='auto')
	        .fit(scaled_data)
	        .inertia_
	        for k in k_values
	    ]

	    fig_elbow = go.Figure()
	    fig_elbow.add_trace(go.Scatter(x=k_values, y=wcss, mode='lines+markers'))
	    fig_elbow.update_layout(
	        title='<b>💡 The Elbow Method for Optimal K</b>',
	        xaxis_title='Number of Clusters (K)',
	        yaxis_title='Within-Cluster Sum of Squares (WCSS)',
	    )

	    # --- K-Means Clustering & Visualization ---
	    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init='auto').fit(scaled_data)
	    # Re-use the filtered frame's index so labels line up with the original rows.
	    labels = pd.Series(kmeans.labels_, name='Cluster_Labels', index=cluster_data.index)

	    pca = PCA(n_components=2)
	    components = pca.fit_transform(scaled_data)

	    # Create a DataFrame for plotting; labels are cast to str so the colour
	    # mapping below uses the discrete palette it is given.
	    plot_df = pd.DataFrame(components, columns=['PCA1', 'PCA2'], index=cluster_data.index)
	    plot_df['Cluster'] = labels.astype(str)

	    fig_cluster = px.scatter(
	        plot_df, x='PCA1', y='PCA2', color='Cluster',
	        title=f"<b>K-Means Clustering Visualization (K={n_clusters})</b>",
	        labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
	        color_discrete_sequence=px.colors.qualitative.Vivid
	    )

	    explained_variance = pca.explained_variance_ratio_.sum() * 100
	    # "\\|" emits the same backslash-pipe characters as before, without relying
	    # on the invalid escape sequence "\|" (a SyntaxWarning on Python 3.12+).
	    summary = (f"Features Used: `{len(numeric_cols)}` \\| Clusters (K): `{n_clusters}`\n\n"
	               f"PCA explains {explained_variance:.2f}% of variance.")

	    return fig_cluster, fig_elbow, summary, labels