import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def get_clusters(df):
    df_processed = df.copy()

    # --- Step 1: Drop ID columns ---
    id_columns = [col for col in df_processed.columns if col.lower() == "id"]
    df_processed = df_processed.drop(columns=id_columns)

    # --- Step 2: Fill missing values ---
    for col in df_processed.columns:
        if pd.api.types.is_numeric_dtype(df_processed[col]):
            df_processed[col] = df_processed[col].fillna(df_processed[col].mean())
        else:
            df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])

    # --- Step 3: Encode categorical columns (tiered strategy) ---
    # One-Hot for low cardinality (safe, equal distances)
    # Frequency for medium cardinality (no dimension explosion)
    # Drop for high cardinality (likely ID-like, not useful)
    encoded_columns = []
    for col in df_processed.select_dtypes(include="object").columns:
        n_unique = df_processed[col].nunique()
        if n_unique <= 10:
            # One-Hot: each category becomes a binary column
            # All categories have equal distance (√2) from each other
            dummies = pd.get_dummies(df_processed[col], prefix=col, dtype=int)
            df_processed = pd.concat([df_processed.drop(columns=[col]), dummies], axis=1)
            encoded_columns.append({"column": col, "method": "one-hot", "new_columns": n_unique})
        elif n_unique <= 50:
            # Frequency: replace category with how often it appears (proportion)
            # No fake ordinal relationship, only 1 column
            freq_map = df_processed[col].value_counts(normalize=True)
            df_processed[col] = df_processed[col].map(freq_map).astype(float)
            encoded_columns.append({"column": col, "method": "frequency", "unique_values": n_unique})
        else:
            # Drop: too many unique values (likely names, emails, IDs)
            df_processed = df_processed.drop(columns=[col])
            encoded_columns.append({"column": col, "method": "dropped", "reason": "too many unique values"})

    # --- Step 4: Select numeric columns ---
    df_numeric = df_processed.select_dtypes(include="number")
    if df_numeric.shape[1] < 2:
        return {
            "status": "skipped",
            "reason": "Not enough usable columns for clustering after encoding"
        }
    if df_numeric.var().mean() < 1e-3:
        return {
            "status": "skipped",
            "reason": "Data has very low variance (no meaningful clusters)"
        }

    # --- Step 5: Scale features ---
    # StandardScaler: mean=0, std=1 for each feature
    # Ensures all features contribute equally to distance calculations
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df_numeric)

    # --- Step 6: Find optimal k using Silhouette Score ---
    # Silhouette measures how well each point fits in its cluster vs nearest other cluster
    # Score ranges from -1 (wrong cluster) to +1 (perfect cluster)
    # Pick the k with the highest average silhouette score
    max_k = min(11, len(df_numeric))  # can't have more clusters than data points
    if max_k < 3:
        return {
            "status": "skipped",
            "reason": "Not enough data points for meaningful clustering"
        }
    silhouette_scores = []
    for k in range(2, max_k):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(scaled)
        score = silhouette_score(scaled, labels)
        silhouette_scores.append(score)
    best_k = silhouette_scores.index(max(silhouette_scores)) + 2
    if max(silhouette_scores) < 0.15:
        return {
            "status": "skipped",
            "reason": "No strong cluster separation found"
        }

    # --- Step 7: Final clustering with best k ---
    km = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    km.fit(scaled)
    df_numeric = df_numeric.copy()
    df_numeric["cluster"] = km.labels_
    cluster_summary = df_numeric.groupby("cluster").mean()
    cluster_sizes = {
        int(k): int(v) for k, v in df_numeric["cluster"].value_counts().items()
    }

    return {
        "status": "success",
        "best_k": best_k,
        "silhouette_score": round(max(silhouette_scores), 4),
        "encoded_columns": encoded_columns,
        "cluster_summary": cluster_summary.to_dict(),
        "cluster_sizes": cluster_sizes
    }
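

# --- Usage sketch (illustrative only) ---
# A minimal demo of calling get_clusters. The DataFrame below is made-up
# data added for illustration; its column names and values are assumptions,
# not part of the pipeline above. Two well-separated age/income groups
# should yield best_k == 2 with the "plan" column one-hot encoded.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(0)
    n = 200
    demo = pd.DataFrame({
        "id": range(n),  # dropped in Step 1
        "age": np.concatenate([rng.normal(30, 3, n // 2),
                               rng.normal(55, 3, n - n // 2)]),
        "income": np.concatenate([rng.normal(40_000, 5_000, n // 2),
                                  rng.normal(90_000, 8_000, n - n // 2)]),
        "plan": rng.choice(["basic", "pro", "enterprise"], size=n),  # low cardinality -> one-hot
    })

    result = get_clusters(demo)
    print(result["status"])
    if result["status"] == "success":
        print("best_k:", result["best_k"])
        print("cluster_sizes:", result["cluster_sizes"])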