dwmk committed on
Commit
a21bdec
·
verified ·
1 Parent(s): e8a3427

Update src/clustering.py

Browse files
Files changed (1) hide show
  1. src/clustering.py +106 -28
src/clustering.py CHANGED
@@ -1,46 +1,124 @@
1
  # clustering.py
2
  import streamlit as st
 
3
  import numpy as np
4
- import seaborn as sns
5
- import matplotlib.pyplot as plt
6
  from sklearn.cluster import KMeans
7
- from sklearn.preprocessing import LabelEncoder, StandardScaler
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
def run_clustering():
    """Render the basic K-Means clustering tab.

    Reads the working DataFrame and the selected feature columns from
    ``st.session_state`` (``processed_df`` and ``feature_cols``), label-encodes
    non-numeric features, optionally standardises them, and, when the user
    presses the button, fits K-Means and shows the labelled table plus a
    scatter plot of the first two selected features.

    The cluster labels are attached to a copy of the DataFrame, so the frame
    stored in the session state is never modified.
    """
    st.header("🧊 Clustering Lab")

    df = st.session_state.processed_df
    features = st.session_state.feature_cols

    if not features:
        st.warning("Select features in EDA")
        return

    X = df[features].copy()

    # K-Means needs numeric input: map every non-numeric column to integer codes.
    for c in X.select_dtypes(exclude=np.number):
        X[c] = LabelEncoder().fit_transform(X[c].astype(str))

    if st.checkbox("Apply scaling"):
        X = StandardScaler().fit_transform(X)

    k = st.slider("Clusters (k)", 2, 10, 3)

    if st.button("Run K-Means"):
        # KMeans raises when asked for more clusters than there are rows.
        if k > len(df):
            st.error(f"Cannot form {k} clusters from {len(df)} row(s).")
            return

        # n_init is pinned so results do not depend on the installed
        # scikit-learn version's default.
        model = KMeans(n_clusters=k, random_state=42, n_init=10)
        clusters = model.fit_predict(X)

        # Work on a copy: `df` is the object stored in the session state, and
        # adding a column to it would leak "Cluster" into every later rerun.
        labelled = df.copy()
        labelled["Cluster"] = clusters
        st.dataframe(labelled)

        if len(features) >= 2:
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.scatterplot(
                x=labelled[features[0]],
                y=labelled[features[1]],
                hue=clusters,
                palette="viridis",
                ax=ax,
            )
            st.pyplot(fig)
            # Release the figure so repeated reruns do not accumulate open figures.
            plt.close(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # clustering.py
2
  import streamlit as st
3
+ import pandas as pd
4
  import numpy as np
5
+ import plotly.express as px
 
6
  from sklearn.cluster import KMeans
7
+ from sklearn.decomposition import PCA
8
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
9
+ from sklearn.compose import ColumnTransformer
10
+ from sklearn.pipeline import Pipeline
11
+ from sklearn.metrics import silhouette_score
12
+
13
def get_preprocessor(df_subset):
    """Create an unfitted ``ColumnTransformer`` for a frame with mixed dtypes.

    Columns matched by ``select_dtypes(include=np.number)`` are standardised
    with ``StandardScaler``; every other column is one-hot encoded with dense
    output and with categories unseen during fitting ignored at transform
    time. A group that matches no columns is left out of the transformer.

    Args:
        df_subset: DataFrame holding only the columns to be transformed.

    Returns:
        The configured, not yet fitted, ``ColumnTransformer``.
    """
    numeric_columns = df_subset.select_dtypes(include=np.number).columns
    categorical_columns = df_subset.select_dtypes(exclude=np.number).columns

    candidate_steps = [
        ('num', StandardScaler(), numeric_columns),
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            categorical_columns,
        ),
    ]
    # Keep only the steps that actually have columns to work on.
    transformers = [step for step in candidate_steps if len(step[2]) > 0]

    return ColumnTransformer(transformers=transformers)
25
 
26
def run_clustering():
    """Render the clustering tab: K selection diagnostics and final K-Means.

    Reads the working DataFrame and the selected feature columns from
    ``st.session_state`` (``processed_df`` and ``feature_cols``). The features
    are preprocessed with ``get_preprocessor``. Optionally an elbow curve and
    a silhouette curve are drawn for a range of K, and on button press the
    final K-Means model is fitted, the labelled rows are previewed and the
    clusters are plotted in two dimensions.

    Rows with a missing value in a numeric feature are left out (with a
    warning) because K-Means cannot handle NaN. The DataFrame stored in the
    session state is never modified.
    """
    st.header("🧊 Advanced Clustering Lab")

    df = st.session_state.processed_df
    features = st.session_state.feature_cols

    if not features:
        st.warning("⚠️ Please select features in the EDA tab first.")
        return

    # Prepare Data
    X_raw = df[features].copy()

    # StandardScaler passes NaN through, which would make KMeans raise, so
    # rows with a missing numeric feature are excluded up front. A plain
    # boolean array is used so the same mask can be applied to `df` later
    # regardless of how its index looks.
    usable_rows = (
        X_raw.select_dtypes(include=np.number).notna().all(axis=1).to_numpy()
    )
    n_dropped = int((~usable_rows).sum())
    if n_dropped:
        st.warning(
            f"Ignoring {n_dropped} row(s) with missing values in numeric features."
        )
        X_raw = X_raw.loc[usable_rows]

    if X_raw.empty:
        st.warning("No rows available for clustering.")
        return

    # ---------------- Configuration ----------------
    c1, c2 = st.columns(2)
    with c1:
        k_range = st.slider("Select K Range for Elbow Method", 2, 15, (2, 8))
    with c2:
        n_clusters = st.slider("Choose Final K", 2, 15, 3)

    # ---------------- Elbow Method ----------------
    if st.checkbox("Show Elbow Method & Silhouette Analysis"):
        with st.spinner("Calculating optimal K..."):
            X_processed = get_preprocessor(X_raw).fit_transform(X_raw)
            _render_elbow_analysis(X_processed, k_range[0], k_range[1])

    # ---------------- Final Clustering ----------------
    if st.button("Run K-Means Clustering"):
        n_samples = len(X_raw)
        # KMeans raises when asked for more clusters than there are rows.
        if n_clusters > n_samples:
            st.error(f"Cannot form {n_clusters} clusters from {n_samples} row(s).")
            return

        with st.spinner("Clustering..."):
            # 1. Preprocess
            X_processed = get_preprocessor(X_raw).fit_transform(X_raw)

            # 2. Fit model
            clusters, _ = _fit_kmeans(X_processed, n_clusters)
            cluster_names = clusters.astype(str)

            # 3. Attach labels to a local copy for display only
            df_display = df.loc[usable_rows].copy()
            df_display["Cluster"] = cluster_names

            st.success("Clustering Complete!")
            st.dataframe(df_display.head())

            # 4. Visualization
            st.subheader("Cluster Visualization")

            n_dims = X_processed.shape[1]
            if n_dims > 2:
                st.info("Applying PCA to visualize high-dimensional data in 2D.")
                title = f"PCA Projection of Clusters (K={n_clusters})"
                x_label, y_label = "PC1", "PC2"
                template = "plotly_white"
            else:
                title = f"Cluster Visualization (K={n_clusters})"
                x_label, y_label = "Dim 1", "Dim 2"
                template = None

            coords = _project_to_2d(X_processed)
            # A purely positional frame avoids any index alignment between the
            # projected coordinates and the (possibly filtered) DataFrame.
            plot_df = pd.DataFrame({
                x_label: coords[:, 0],
                y_label: coords[:, 1],
                "Cluster": cluster_names,
            })
            fig = px.scatter(
                plot_df,
                x=x_label,
                y=y_label,
                color="Cluster",
                title=title,
                template=template,
            )

            st.plotly_chart(fig, use_container_width=True)


def _fit_kmeans(X_processed, k):
    """Fit K-Means with a fixed seed and return ``(labels, inertia)``."""
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_processed)
    return labels, km.inertia_


def _render_elbow_analysis(X_processed, k_min, k_max):
    """Plot inertia and silhouette score for every K in ``[k_min, k_max]``."""
    n_samples = X_processed.shape[0]

    # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1,
    # so larger K values cannot be evaluated on this data.
    largest_k = min(k_max, n_samples - 1)
    if largest_k < k_min:
        st.warning("Not enough rows to evaluate the selected K range.")
        return
    if largest_k < k_max:
        st.info(
            f"K range capped at {largest_k} because only {n_samples} rows are available."
        )

    k_values = list(range(k_min, largest_k + 1))
    inertias = []
    sil_scores = []
    for k in k_values:
        labels, inertia = _fit_kmeans(X_processed, k)
        inertias.append(inertia)
        # With duplicated points K-Means can return fewer distinct labels than
        # requested; record a gap instead of letting silhouette_score raise.
        n_found = len(np.unique(labels))
        if 2 <= n_found <= n_samples - 1:
            sil_scores.append(silhouette_score(X_processed, labels))
        else:
            sil_scores.append(np.nan)

    # Plotting
    col1, col2 = st.columns(2)

    # Inertia Plot
    fig_elbow = px.line(x=k_values, y=inertias, markers=True,
                        labels={'x':'K', 'y':'Inertia'}, title="Elbow Curve (Inertia)")
    col1.plotly_chart(fig_elbow, use_container_width=True)

    # Silhouette Plot
    fig_sil = px.line(x=k_values, y=sil_scores, markers=True,
                      labels={'x':'K', 'y':'Silhouette Score'}, title="Silhouette Score (Higher is better)")
    col2.plotly_chart(fig_sil, use_container_width=True)


def _project_to_2d(X_processed):
    """Return an ``(n_samples, 2)`` array of plotting coordinates."""
    if X_processed.shape[1] >= 2:
        # PCA is used even for exactly two columns so the plot is built the
        # same way whatever the feature mix is.
        return PCA(n_components=2, random_state=42).fit_transform(X_processed)

    # A single column cannot go through a 2-component PCA (it would raise);
    # draw the points along one axis instead.
    only_dim = X_processed[:, 0]
    return np.column_stack([only_dim, np.zeros_like(only_dim)])