# DataDecoder / clusterService.py
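"""Clustering service for DataDecoder.

Takes a pandas DataFrame, cleans and encodes it, then runs KMeans with the
number of clusters chosen by the highest average silhouette score. Returns a
dict describing either the clustering result or the reason it was skipped.
"""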
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def get_clusters(df):
    df_processed = df.copy()

    # --- Step 1: Drop ID columns ---
    id_columns = [col for col in df_processed.columns if col.lower() == "id"]
    df_processed = df_processed.drop(columns=id_columns)

    # --- Step 2: Fill missing values ---
    # Numeric columns get the column mean, other columns get the most common value
    for col in df_processed.columns:
        if pd.api.types.is_numeric_dtype(df_processed[col]):
            df_processed[col] = df_processed[col].fillna(df_processed[col].mean())
        else:
            # mode() is empty when a column is entirely missing, so guard the fill
            mode_values = df_processed[col].mode()
            if not mode_values.empty:
                df_processed[col] = df_processed[col].fillna(mode_values[0])
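    # Example (hypothetical values): ages [25, NaN, 35] become [25, 30, 35];
    # colors ["red", None, "red"] become ["red", "red", "red"].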

    # --- Step 3: Encode categorical columns (tiered strategy) ---
    # One-Hot for low cardinality (safe, equal distances)
    # Frequency for medium cardinality (no dimension explosion)
    # Drop for high cardinality (likely ID-like, not useful)
    encoded_columns = []
    for col in df_processed.select_dtypes(include="object").columns:
        n_unique = df_processed[col].nunique()
        if n_unique <= 10:
            # One-Hot: each category becomes a binary column
            # All categories have equal distance (√2) from each other
            dummies = pd.get_dummies(df_processed[col], prefix=col, dtype=int)
            df_processed = pd.concat([df_processed.drop(columns=[col]), dummies], axis=1)
            encoded_columns.append({"column": col, "method": "one-hot", "new_columns": n_unique})
        elif n_unique <= 50:
            # Frequency: replace category with how often it appears (proportion)
            # No fake ordinal relationship, only 1 column
            freq_map = df_processed[col].value_counts(normalize=True)
            df_processed[col] = df_processed[col].map(freq_map).astype(float)
            encoded_columns.append({"column": col, "method": "frequency", "unique_values": n_unique})
        else:
            # Drop: too many unique values (likely names, emails, IDs)
            df_processed = df_processed.drop(columns=[col])
            encoded_columns.append({"column": col, "method": "dropped", "reason": "too many unique values"})
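    # Example (hypothetical columns): a "color" column with 3 categories becomes
    # color_red / color_green / color_blue dummies; a "city" column with 30
    # categories becomes its per-value frequency; an "email" column with
    # hundreds of distinct values is dropped.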

    # --- Step 4: Select numeric columns ---
    df_numeric = df_processed.select_dtypes(include="number")
    if df_numeric.shape[1] < 2:
        return {
            "status": "skipped",
            "reason": "Not enough usable columns for clustering after encoding"
        }
    if df_numeric.var().mean() < 1e-3:
        return {
            "status": "skipped",
            "reason": "Data has very low variance (no meaningful clusters)"
        }

    # --- Step 5: Scale features ---
    # StandardScaler: mean=0, std=1 for each feature
    # Ensures all features contribute equally to distance calculations
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df_numeric)
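    # Example (hypothetical numbers): a feature with mean 50 and std 10 maps the
    # raw value 60 to (60 - 50) / 10 = 1.0 after scaling.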

    # --- Step 6: Find optimal k using Silhouette Score ---
    # Silhouette measures how well each point fits in its cluster vs nearest other cluster
    # Score ranges from -1 (wrong cluster) to +1 (perfect cluster)
    # Pick the k with the highest average silhouette score
    max_k = min(11, len(df_numeric))  # try k = 2..10, capped by the number of data points
    if max_k < 3:
        return {
            "status": "skipped",
            "reason": "Not enough data points for meaningful clustering"
        }
    silhouette_scores = []
    for k in range(2, max_k):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(scaled)
        score = silhouette_score(scaled, labels)
        silhouette_scores.append(score)
    # k values start at 2, so best_k is the index of the highest score plus 2
    best_k = silhouette_scores.index(max(silhouette_scores)) + 2
    if max(silhouette_scores) < 0.15:
        return {
            "status": "skipped",
            "reason": "No strong cluster separation found"
        }
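    # Example (hypothetical scores): if k = 2..5 score [0.41, 0.55, 0.38, 0.30],
    # then best_k = 3 and clustering proceeds because 0.55 >= 0.15.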

    # --- Step 7: Final clustering with best k ---
    km = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    km.fit(scaled)

    df_numeric = df_numeric.copy()
    df_numeric["cluster"] = km.labels_
    cluster_summary = df_numeric.groupby("cluster").mean()
    cluster_sizes = {
        int(k): int(v)
        for k, v in df_numeric["cluster"].value_counts().items()
    }

    return {
        "status": "success",
        "best_k": best_k,
        "silhouette_score": round(max(silhouette_scores), 4),
        "encoded_columns": encoded_columns,
        "cluster_summary": cluster_summary.to_dict(),
        "cluster_sizes": cluster_sizes
    }
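

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the original service: builds a small
# synthetic DataFrame with two well-separated groups and runs get_clusters.
# The column names and values below are hypothetical, purely for illustration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        "id": range(60),                            # dropped in Step 1
        "age": np.concatenate([rng.normal(25, 2, 30), rng.normal(60, 2, 30)]),
        "income": np.concatenate([rng.normal(30_000, 1_000, 30), rng.normal(90_000, 2_000, 30)]),
        "plan": ["basic"] * 30 + ["premium"] * 30,  # low cardinality -> one-hot
    })

    result = get_clusters(demo)
    print(result["status"], result.get("best_k"), result.get("cluster_sizes"))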