# Spaces: asenturisk / Benchmark-Kit-26 (Running)
# File size: 5,033 Bytes

# clustering.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

def get_preprocessor(df_subset):
    """Return an unfitted ColumnTransformer for a frame of mixed dtypes.

    Columns picked by ``select_dtypes(include=np.number)`` are routed to a
    ``StandardScaler``; all remaining columns are routed to a dense
    ``OneHotEncoder`` configured with ``handle_unknown='ignore'``. A step is
    registered only when at least one column of that kind is present.

    Args:
        df_subset: DataFrame holding just the feature columns to transform.

    Returns:
        A ``ColumnTransformer`` with up to two steps, named ``'num'`` and
        ``'cat'``, in that order.
    """
    numeric_names = df_subset.select_dtypes(include=np.number).columns
    other_names = df_subset.select_dtypes(exclude=np.number).columns

    steps = []
    if len(numeric_names):
        steps.append(('num', StandardScaler(), numeric_names))
    if len(other_names):
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        steps.append(('cat', encoder, other_names))

    return ColumnTransformer(transformers=steps)

def _render_k_selection_plots(X_processed, k_range):
    """Sweep K over ``k_range`` and plot inertia and silhouette side by side.

    Args:
        X_processed: 2-D array of preprocessed features (rows are samples).
        k_range: ``(low, high)`` inclusive bounds taken from the range slider.
    """
    n_samples = X_processed.shape[0]

    # KMeans raises when n_clusters exceeds the number of samples, so the
    # sweep is capped at the row count instead of crashing on small data.
    upper_k = min(k_range[1], n_samples)
    k_values = list(range(k_range[0], upper_k + 1))
    if not k_values:
        st.warning(
            f"⚠️ Not enough rows ({n_samples}) to evaluate K >= {k_range[0]}."
        )
        return
    if upper_k < k_range[1]:
        st.info(f"K range capped at {upper_k} because only {n_samples} rows are available.")

    inertias = []
    sil_scores = []
    for k in k_values:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X_processed)
        inertias.append(km.inertia_)

        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1;
        # outside that range record NaN so the line plot shows a gap.
        n_labels = len(np.unique(labels))
        if 2 <= n_labels <= n_samples - 1:
            sil_scores.append(silhouette_score(X_processed, labels))
        else:
            sil_scores.append(float("nan"))

    col1, col2 = st.columns(2)

    # Inertia Plot
    fig_elbow = px.line(
        x=k_values, y=inertias, markers=True,
        labels={'x': 'K', 'y': 'Inertia'}, title="Elbow Curve (Inertia)",
    )
    col1.plotly_chart(fig_elbow, use_container_width=True)

    # Silhouette Plot
    fig_sil = px.line(
        x=k_values, y=sil_scores, markers=True,
        labels={'x': 'K', 'y': 'Silhouette Score'},
        title="Silhouette Score (Higher is better)",
    )
    col2.plotly_chart(fig_sil, use_container_width=True)


def _build_cluster_figure(X_processed, cluster_labels, n_clusters):
    """Return a 2-D scatter figure of the clustered samples.

    Args:
        X_processed: 2-D array of preprocessed features (rows are samples).
        cluster_labels: Per-row cluster labels used to colour the points.
        n_clusters: The K used for the fit; only shown in the title.

    Returns:
        A Plotly figure. With a single processed feature the values are drawn
        along one axis; otherwise the points are projected with a 2-component
        PCA.
    """
    n_features = X_processed.shape[1]

    if n_features < 2:
        # PCA(n_components=2) rejects inputs with fewer than two features,
        # so draw the single dimension against a constant baseline instead.
        return px.scatter(
            x=X_processed[:, 0], y=np.zeros(X_processed.shape[0]),
            color=cluster_labels,
            title=f"Cluster Visualization (K={n_clusters})",
            labels={'x': 'Dim 1', 'y': '(single feature)'},
        )

    if n_features > 2:
        st.info("Applying PCA to visualize high-dimensional data in 2D.")

    # Seeded like the KMeans fits so reruns yield the same projection.
    coords = PCA(n_components=2, random_state=42).fit_transform(X_processed)

    if n_features > 2:
        return px.scatter(
            x=coords[:, 0], y=coords[:, 1],
            color=cluster_labels,
            title=f"PCA Projection of Clusters (K={n_clusters})",
            labels={'x': 'PC1', 'y': 'PC2'},
            template="plotly_white",
        )

    # Exactly two processed dimensions: PCA is still applied so the plot does
    # not depend on recovering column names from the preprocessor.
    return px.scatter(
        x=coords[:, 0], y=coords[:, 1],
        color=cluster_labels,
        title=f"Cluster Visualization (K={n_clusters})",
        labels={'x': 'Dim 1', 'y': 'Dim 2'},
    )


def run_clustering():
    """Render the clustering tab: K selection diagnostics and a K-Means fit.

    Reads ``processed_df`` and ``feature_cols`` from ``st.session_state``.
    When the checkbox is ticked, inertia and silhouette curves are drawn for
    the chosen K range. When the button is pressed, K-Means is fitted with the
    chosen K, the first rows of the labelled frame are shown, and a 2-D
    scatter of the clusters is drawn.

    Rows with a missing value in any numeric feature are left out of the
    analysis (with a warning), because K-Means cannot be fitted on NaN input.
    """
    st.header("🧊 Advanced Clustering Lab")

    df = st.session_state.processed_df
    features = st.session_state.feature_cols

    if not features:
        st.warning("⚠️ Please select features in the EDA tab first.")
        return

    # Prepare Data
    X_raw = df[features].copy()

    # Positional boolean mask of rows whose numeric features are all present;
    # it is reused below so the displayed frame stays aligned with the labels.
    numeric_part = X_raw.select_dtypes(include=np.number)
    row_ok = numeric_part.notna().all(axis=1).to_numpy()
    if not row_ok.all():
        st.warning(
            f"⚠️ Ignoring {int((~row_ok).sum())} row(s) with missing numeric feature values."
        )
        X_raw = X_raw.loc[row_ok]

    if X_raw.empty:
        st.warning("⚠️ No rows available for clustering.")
        return

    # ---------------- Configuration ----------------
    c1, c2 = st.columns(2)
    with c1:
        k_range = st.slider("Select K Range for Elbow Method", 2, 15, (2, 8))
    with c2:
        n_clusters = st.slider("Choose Final K", 2, 15, 3)

    # Preprocessing is fitted lazily and shared between both sections so it
    # is not repeated when the checkbox and the button are active together.
    X_processed = None

    # ---------------- Elbow Method ----------------
    if st.checkbox("Show Elbow Method & Silhouette Analysis"):
        with st.spinner("Calculating optimal K..."):
            X_processed = get_preprocessor(X_raw).fit_transform(X_raw)
            _render_k_selection_plots(X_processed, k_range)

    # ---------------- Final Clustering ----------------
    if st.button("Run K-Means Clustering"):
        with st.spinner("Clustering..."):
            # 1. Preprocess (reuse the matrix from the elbow section if present)
            if X_processed is None:
                X_processed = get_preprocessor(X_raw).fit_transform(X_raw)

            n_samples = X_processed.shape[0]
            if n_clusters > n_samples:
                # KMeans raises when asked for more clusters than samples.
                st.error(
                    f"Cannot form {n_clusters} clusters from only {n_samples} row(s). "
                    "Choose a smaller K."
                )
                return

            # 2. Fit Model
            model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            clusters = model.fit_predict(X_processed)

            # 3. Add to a local copy for display (session data stays untouched)
            df_display = df.loc[row_ok].copy()
            df_display["Cluster"] = clusters.astype(str)

            st.success("Clustering Complete!")
            st.dataframe(df_display.head())

            # 4. Visualization
            st.subheader("Cluster Visualization")
            fig = _build_cluster_figure(X_processed, df_display["Cluster"], n_clusters)
            st.plotly_chart(fig, use_container_width=True)