| import pandas as pd |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.cluster import KMeans |
| from sklearn.metrics import silhouette_score |
|
|
|
|
def get_clusters(df):
    """Cluster the rows of *df* with KMeans and summarize the result.

    Pipeline: drop id columns, drop all-NaN columns, impute missing
    values, encode categorical columns, standardize, choose k in
    [2, 10] by silhouette score, then refit and summarize.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is never modified; all work happens on a copy.

    Returns
    -------
    dict
        On success: ``status``, ``best_k``, ``silhouette_score``,
        ``encoded_columns`` (log of categorical handling),
        ``cluster_summary`` (per-cluster feature means) and
        ``cluster_sizes``. When clustering is not feasible or not
        meaningful: ``{"status": "skipped", "reason": ...}``.
    """
    df_processed = df.copy()

    # Identifier columns carry no clustering signal.
    id_columns = [col for col in df_processed.columns if col.lower() == "id"]
    df_processed = df_processed.drop(columns=id_columns)

    # Columns that are entirely NaN cannot be imputed (mean of all-NaN
    # is NaN, mode of all-NaN is empty) and would either crash the
    # mode lookup or poison the scaler with NaNs — drop them up front.
    df_processed = df_processed.dropna(axis=1, how="all")

    df_processed = _impute_missing(df_processed)
    df_processed, encoded_columns = _encode_categoricals(df_processed)

    # Only numeric columns can feed KMeans; anything else is ignored.
    df_numeric = df_processed.select_dtypes(include="number")

    if df_numeric.shape[1] < 2:
        return {
            "status": "skipped",
            "reason": "Not enough usable columns for clustering after encoding"
        }

    if df_numeric.var().mean() < 1e-3:
        return {
            "status": "skipped",
            "reason": "Data has very low variance (no meaningful clusters)"
        }

    # Standardize so large-scale features don't dominate the distances.
    scaled = StandardScaler().fit_transform(df_numeric)

    # Candidate k values are 2 .. min(10, n_samples - 1); silhouette
    # score itself requires 2 <= k <= n_samples - 1.
    max_k = min(11, len(df_numeric))
    if max_k < 3:
        return {
            "status": "skipped",
            "reason": "Not enough data points for meaningful clustering"
        }

    best_k, best_score = _best_k_by_silhouette(scaled, max_k)

    # A silhouette below 0.15 means the "clusters" barely separate.
    if best_score < 0.15:
        return {
            "status": "skipped",
            "reason": "No strong cluster separation found"
        }

    # Refit with the winning k and summarize each cluster.
    km = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    km.fit(scaled)
    df_numeric = df_numeric.copy()
    df_numeric["cluster"] = km.labels_
    cluster_summary = df_numeric.groupby("cluster").mean()
    cluster_sizes = {
        int(label): int(count)
        for label, count in df_numeric["cluster"].value_counts().items()
    }

    return {
        "status": "success",
        "best_k": best_k,
        "silhouette_score": round(best_score, 4),
        "encoded_columns": encoded_columns,
        "cluster_summary": cluster_summary.to_dict(),
        "cluster_sizes": cluster_sizes
    }


def _impute_missing(df):
    """Fill NaNs column by column: mean for numeric, mode for the rest.

    Assumes no column is entirely NaN (the caller drops those), so the
    mode is always non-empty. Mutates *df* in place and returns it.
    """
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna(df[col].mode()[0])
    return df


def _encode_categoricals(df):
    """Make object columns numeric; return ``(df, log_of_actions)``.

    Strategy by cardinality: <= 10 unique values -> one-hot dummies,
    <= 50 -> frequency encoding, otherwise the column is dropped
    (likely free text or an identifier in disguise).
    """
    encoded_columns = []
    for col in df.select_dtypes(include="object").columns:
        n_unique = df[col].nunique()

        if n_unique <= 10:
            dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
            df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
            encoded_columns.append({"column": col, "method": "one-hot", "new_columns": n_unique})

        elif n_unique <= 50:
            # Replace each category with its relative frequency in the column.
            freq_map = df[col].value_counts(normalize=True)
            df[col] = df[col].map(freq_map).astype(float)
            encoded_columns.append({"column": col, "method": "frequency", "unique_values": n_unique})

        else:
            df = df.drop(columns=[col])
            encoded_columns.append({"column": col, "method": "dropped", "reason": "too many unique values"})
    return df, encoded_columns


def _best_k_by_silhouette(scaled, max_k):
    """Return ``(best_k, best_score)`` over k in ``range(2, max_k)``.

    Ties break toward the smallest k (first occurrence of the maximum),
    matching the original ``list.index(max(...))`` selection.
    """
    best_k, best_score = 2, float("-inf")
    for k in range(2, max_k):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(scaled)
        score = silhouette_score(scaled, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k, best_score
|
|