File size: 4,259 Bytes
1067825
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def get_clusters(df: pd.DataFrame) -> dict:
    """Cluster the rows of *df* with K-Means and summarise the clusters.

    The input is copied, never modified. Processing steps:

    1. Drop columns whose label is ``"id"`` (case-insensitive).
    2. Drop columns with no observed values, then fill remaining gaps with
       the column mean (numeric) or the most frequent value (otherwise).
    3. Encode object-dtype columns: one-hot for at most 10 distinct values,
       relative frequency for at most 50, dropped above that.
    4. Standardise the numeric columns and try every ``k`` from 2 up to
       ``min(10, n_rows - 1)``, keeping the ``k`` with the highest
       silhouette score.

    Args:
        df: Table to cluster. Column labels may be of any type.

    Returns:
        On success, a dict with ``status == "success"`` and the keys
        ``best_k``, ``silhouette_score`` (rounded to 4 decimals),
        ``encoded_columns`` (one entry per encoded/dropped object column),
        ``cluster_summary`` (per-cluster column means as
        ``{column: {cluster: mean}}``) and ``cluster_sizes``
        (``{cluster: row_count}``).

        Otherwise a dict with ``status == "skipped"`` and a ``reason``
        string: too few usable columns, near-zero variance, too few rows,
        or a best silhouette score below 0.15.
    """
    df_processed = df.copy()

    # --- Step 1: Drop ID columns ---
    # str() keeps this working for non-string labels (e.g. integer columns).
    id_columns = [
        col for col in df_processed.columns if str(col).lower() == "id"
    ]
    df_processed = df_processed.drop(columns=id_columns)

    # --- Step 2: Fill missing values ---
    # A column without a single observed value has no mean and no mode, so it
    # cannot be imputed; left in place it would either break the mode lookup
    # or carry NaN into the clustering step. Drop such columns up front.
    df_processed = df_processed.dropna(axis=1, how="all")
    for col in df_processed.columns:
        column = df_processed[col]
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.mean()
        else:
            fill_value = column.mode().iloc[0]
        df_processed[col] = column.fillna(fill_value)

    # --- Step 3: Encode categorical columns (tiered strategy) ---
    # One-hot for low cardinality, frequency for medium cardinality (keeps a
    # single column), drop for high cardinality (likely names/emails/IDs).
    encoded_columns = []
    for col in df_processed.select_dtypes(include="object").columns:
        n_unique = df_processed[col].nunique()

        if n_unique <= 10:
            dummies = pd.get_dummies(df_processed[col], prefix=col, dtype=int)
            df_processed = pd.concat(
                [df_processed.drop(columns=[col]), dummies], axis=1
            )
            encoded_columns.append(
                {"column": col, "method": "one-hot", "new_columns": n_unique}
            )
        elif n_unique <= 50:
            # Replace each category by the proportion of rows holding it.
            freq_map = df_processed[col].value_counts(normalize=True)
            df_processed[col] = df_processed[col].map(freq_map).astype(float)
            encoded_columns.append(
                {"column": col, "method": "frequency", "unique_values": n_unique}
            )
        else:
            df_processed = df_processed.drop(columns=[col])
            encoded_columns.append(
                {
                    "column": col,
                    "method": "dropped",
                    "reason": "too many unique values",
                }
            )

    # --- Step 4: Select numeric columns and run the cheap sanity checks ---
    df_numeric = df_processed.select_dtypes(include="number")

    if df_numeric.shape[1] < 2:
        return {
            "status": "skipped",
            "reason": "Not enough usable columns for clustering after encoding",
        }

    if df_numeric.var().mean() < 1e-3:
        return {
            "status": "skipped",
            "reason": "Data has very low variance (no meaningful clusters)",
        }

    # k is searched over range(2, max_k); k must stay below the row count for
    # the silhouette score to be defined. Checked before scaling so that tiny
    # frames are rejected without touching the scaler.
    max_k = min(11, len(df_numeric))
    if max_k < 3:
        return {
            "status": "skipped",
            "reason": "Not enough data points for meaningful clustering",
        }

    # --- Step 5: Scale features ---
    # Zero mean / unit variance so every feature weighs equally in distances.
    scaled = StandardScaler().fit_transform(df_numeric)

    # --- Step 6: Find optimal k using the silhouette score ---
    # The labels of the best run are kept so the winning model does not have
    # to be fitted a second time (same data, same random_state, same n_init).
    best_k = None
    best_score = None
    best_labels = None
    for k in range(2, max_k):
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(
            scaled
        )
        score = silhouette_score(scaled, labels)
        # Strict ">" keeps the smallest k on ties.
        if best_labels is None or score > best_score:
            best_k, best_score, best_labels = k, score, labels

    if best_score < 0.15:
        return {
            "status": "skipped",
            "reason": "No strong cluster separation found",
        }

    # --- Step 7: Summarise the clustering obtained with the best k ---
    df_numeric = df_numeric.copy()
    df_numeric["cluster"] = best_labels
    cluster_summary = df_numeric.groupby("cluster").mean()
    cluster_sizes = {
        int(label): int(count)
        for label, count in df_numeric["cluster"].value_counts().items()
    }

    return {
        "status": "success",
        "best_k": best_k,
        "silhouette_score": round(float(best_score), 4),
        "encoded_columns": encoded_columns,
        "cluster_summary": cluster_summary.to_dict(),
        "cluster_sizes": cluster_sizes,
    }