# clustering.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

# Shared K-Means settings so the elbow analysis and the final run agree.
_RANDOM_STATE = 42
_N_INIT = 10


def get_preprocessor(df_subset):
    """Build an sklearn preprocessor for a DataFrame with mixed column types.

    Numeric columns are median-imputed and then standardized; all other
    columns are one-hot encoded (dense output, unknown categories ignored).
    For data without missing numeric values the imputer is a no-op, so the
    output matches plain standard scaling.

    Args:
        df_subset: DataFrame holding only the columns to be transformed.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    num_cols = df_subset.select_dtypes(include=np.number).columns
    cat_cols = df_subset.select_dtypes(exclude=np.number).columns

    transformers = []
    if len(num_cols) > 0:
        # StandardScaler passes NaN through and KMeans rejects NaN input,
        # so missing numeric values must be filled before scaling.
        numeric_steps = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='median')),
            ('scale', StandardScaler()),
        ])
        transformers.append(('num', numeric_steps, num_cols))
    if len(cat_cols) > 0:
        transformers.append((
            'cat',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            cat_cols,
        ))
    return ColumnTransformer(transformers=transformers)


def _fit_kmeans(X_processed, k):
    """Fit K-Means with the module-wide settings; return (model, labels)."""
    model = KMeans(n_clusters=k, random_state=_RANDOM_STATE, n_init=_N_INIT)
    labels = model.fit_predict(X_processed)
    return model, labels


def _project_to_2d(X_processed):
    """Return an (n_samples, 2) array of plotting coordinates.

    PCA is used whenever two components can be extracted. With a single
    processed column PCA(n_components=2) would raise, so that column is
    plotted on the first axis against a constant zero second axis.
    """
    n_samples, n_features = X_processed.shape
    if min(n_samples, n_features) >= 2:
        pca = PCA(n_components=2, random_state=_RANDOM_STATE)
        return pca.fit_transform(X_processed)

    coords = np.zeros((n_samples, 2))
    if n_features >= 1:
        coords[:, 0] = X_processed[:, 0]
    return coords


def _render_elbow_analysis(X_raw, k_range):
    """Plot inertia and silhouette score for every K in ``k_range``."""
    with st.spinner("Calculating optimal K..."):
        X_processed = get_preprocessor(X_raw).fit_transform(X_raw)
        n_samples = X_processed.shape[0]

        # KMeans cannot fit more clusters than there are rows.
        k_max = min(k_range[1], n_samples)
        k_values = list(range(k_range[0], k_max + 1))
        if not k_values:
            st.warning(
                f"Not enough rows ({n_samples}) to evaluate K >= {k_range[0]}."
            )
            return
        if k_max < k_range[1]:
            st.info(f"K range capped at {k_max} because the data has only {n_samples} rows.")

        inertias = []
        sil_scores = []
        for k in k_values:
            model, labels = _fit_kmeans(X_processed, k)
            inertias.append(model.inertia_)
            # silhouette_score requires 2 <= n_labels <= n_samples - 1;
            # duplicate rows can also yield fewer distinct labels than k.
            n_labels = np.unique(labels).size
            if 1 < n_labels < n_samples:
                sil_scores.append(silhouette_score(X_processed, labels))
            else:
                sil_scores.append(np.nan)

    # Plotting
    col1, col2 = st.columns(2)

    # Inertia Plot
    fig_elbow = px.line(
        x=k_values, y=inertias, markers=True,
        labels={'x': 'K', 'y': 'Inertia'},
        title="Elbow Curve (Inertia)",
    )
    col1.plotly_chart(fig_elbow, use_container_width=True)

    # Silhouette Plot
    fig_sil = px.line(
        x=k_values, y=sil_scores, markers=True,
        labels={'x': 'K', 'y': 'Silhouette Score'},
        title="Silhouette Score (Higher is better)",
    )
    col2.plotly_chart(fig_sil, use_container_width=True)


def _render_final_clustering(df, X_raw, n_clusters):
    """Fit the final K-Means model, show labelled rows and a 2D scatter."""
    with st.spinner("Clustering..."):
        # 1. Preprocess
        X_processed = get_preprocessor(X_raw).fit_transform(X_raw)
        n_samples, n_features = X_processed.shape
        if n_samples < n_clusters:
            st.error(
                f"Cannot form {n_clusters} clusters from {n_samples} row(s). "
                "Choose a smaller K or load more data."
            )
            return

        # 2. Fit Model
        _, labels = _fit_kmeans(X_processed, n_clusters)
        cluster_names = labels.astype(str)

        # 3. Add to a local copy for display; session data is left untouched.
        df_display = df.copy()
        df_display["Cluster"] = cluster_names

    st.success("Clustering Complete!")
    st.dataframe(df_display.head())

    # 4. Visualization
    st.subheader("Cluster Visualization")
    if n_features > 2:
        st.info("Applying PCA to visualize high-dimensional data in 2D.")
        title = f"PCA Projection of Clusters (K={n_clusters})"
        x_label, y_label = "PC1", "PC2"
        template = "plotly_white"
    else:
        title = f"Cluster Visualization (K={n_clusters})"
        x_label, y_label = "Dim 1", "Dim 2"
        template = None

    coords = _project_to_2d(X_processed)
    # A positional frame avoids any index alignment between the projected
    # coordinates (plain arrays) and labels taken from the source DataFrame.
    plot_df = pd.DataFrame({
        x_label: coords[:, 0],
        y_label: coords[:, 1],
        "Cluster": cluster_names,
    })
    fig = px.scatter(
        plot_df, x=x_label, y=y_label, color="Cluster",
        title=title, template=template,
    )
    st.plotly_chart(fig, use_container_width=True)


def run_clustering():
    """Render the clustering tab: K selection, elbow analysis, final K-Means.

    Reads ``processed_df`` and ``feature_cols`` from ``st.session_state``.
    If either is missing or unusable, a message is shown and nothing else
    is rendered.
    """
    st.header("🧊 Advanced Clustering Lab")

    state = st.session_state
    df = state["processed_df"] if "processed_df" in state else None
    features = state["feature_cols"] if "feature_cols" in state else None

    if df is None:
        st.warning("⚠️ Please load and process a dataset first.")
        return
    if not features:
        st.warning("⚠️ Please select features in the EDA tab first.")
        return

    # The feature selection may be stale if the dataset changed since.
    missing = [col for col in features if col not in df.columns]
    if missing:
        st.error(
            "Selected features not found in the dataset: "
            + ", ".join(map(str, missing))
        )
        return

    # Prepare Data
    X_raw = df[features].copy()
    if X_raw.empty:
        st.warning("⚠️ The dataset has no rows to cluster.")
        return

    n_missing = int(X_raw.select_dtypes(include=np.number).isna().sum().sum())
    if n_missing:
        st.info(
            f"{n_missing} missing numeric value(s) will be filled with the "
            "column median before clustering."
        )

    # ---------------- Configuration ----------------
    c1, c2 = st.columns(2)
    with c1:
        k_range = st.slider("Select K Range for Elbow Method", 2, 15, (2, 8))
    with c2:
        n_clusters = st.slider("Choose Final K", 2, 15, 3)

    # ---------------- Elbow Method ----------------
    if st.checkbox("Show Elbow Method & Silhouette Analysis"):
        _render_elbow_analysis(X_raw, k_range)

    # ---------------- Final Clustering ----------------
    if st.button("Run K-Means Clustering"):
        _render_final_clustering(df, X_raw, n_clusters)