Spaces:

sdbrgo
/

PERCEUL

Sleeping

App Files Files Community

sdbrgo committed on Dec 19, 2025

Commit

1fc0ac4

verified ·

1 Parent(s): 0435f86

Update cluster_ops/clustering.py

Browse files

Files changed (1) hide show

cluster_ops/clustering.py +76 -15

cluster_ops/clustering.py CHANGED Viewed

@@ -6,14 +6,64 @@ import traceback
 from cluster_ops.cluster_utils import *
-from sklearn.cluster import KMeans
 from sklearn.manifold import TSNE
 __all__ = [
     "explore_clusters",
     "final_clustering"
 ]
 def format_deviations_as_columns(drivers):
     headers = []
     cells = []
@@ -35,39 +85,50 @@ def format_deviations_as_columns(drivers):
     return table
 #========== CLUSTER EXPLORATION ==========
-def explore_clusters(file, perp, learn_rate):
     df = pd.read_csv(file)
     exploration_pipeline = joblib.load("preprocessing/exploration_pipeline.pkl")
     X_exp = exploration_pipeline.fit_transform(df)
-    tsne = TSNE(
-        n_components=2,
-        perplexity=perp,
-        learning_rate=learn_rate,
-        init='pca',
-        random_state=42
-    )
-    df_tsne = tsne.fit_transform(X_exp)
-    # Create figure explicitly
     fig, ax = plt.subplots(figsize=(10, 8))
     ax.scatter(
-        df_tsne[:, 0],
-        df_tsne[:, 1],
         s=15,
         alpha=0.8
     )
-    ax.set_title("Cluster Exploration of Worker Profiles (t-SNE)")
     ax.set_xlabel("Dimension 1")
     ax.set_ylabel("Dimension 2")
     fig.tight_layout()
-    return fig
 #========== FINAL CLUSTERING ==========
 def final_clustering(file, top_features):

 from cluster_ops.cluster_utils import *
 from sklearn.manifold import TSNE
+from umap import UMAP
+from sklearn.cluster import KMeans
 __all__ = [
     "explore_clusters",
     "final_clustering"
 ]
+#========== HELPER FUNCTIONS ==========
def choose_umap_params(n_samples, n_features):
    """Pick UMAP ``n_neighbors`` and ``min_dist`` from the data dimensions.

    The heuristic is engineered to emphasize local structure: the neighbour
    count is the smaller of ``sqrt(n_samples) - 1`` and
    ``log2(n_features) - 1`` (each truncated to an int), but never below 2.

    Args:
        n_samples: Number of rows in the matrix that will be embedded.
        n_features: Number of columns in the matrix that will be embedded.

    Returns:
        A ``(n_neighbors, min_dist)`` tuple. ``min_dist`` is ``0.1`` when
        there are fewer than 20 features and ``0.0`` otherwise.
    """
    floor = 2

    # Clamp before taking sqrt/log2: log2(0) is -inf and int(-inf) raises
    # OverflowError, so an empty feature set would otherwise crash here
    # instead of falling back to the floor.
    sample_based = int(np.sqrt(max(n_samples, 0)) - 1)
    feature_based = int(np.log2(max(n_features, 1)) - 1)

    n_neighbors = max(floor, min(sample_based, feature_based))
    min_dist = 0.1 if n_features < 20 else 0.0
    return n_neighbors, min_dist
def build_umap(X):
    """Return an unfitted 2-component UMAP reducer sized for ``X``.

    Args:
        X: Any object exposing a two-element ``shape`` of
            ``(n_samples, n_features)``.

    Returns:
        A ``UMAP`` instance whose ``n_neighbors`` and ``min_dist`` come from
        ``choose_umap_params`` and whose ``random_state`` is fixed at 42.
    """
    rows, cols = X.shape
    n_neighbors, min_dist = choose_umap_params(rows, cols)

    umap_kwargs = {
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "n_components": 2,
        "random_state": 42,
    }
    return UMAP(**umap_kwargs)
def build_hdbscan(X):
    """Return an unfitted HDBSCAN clusterer sized for ``X``.

    ``min_cluster_size`` is 1% of the sample count (at least 2) and
    ``min_samples`` is half of ``min_cluster_size`` (at least 2).

    Args:
        X: Any object whose ``shape[0]`` is the number of samples that will
            be clustered.

    Returns:
        A configured ``hdbscan.HDBSCAN`` instance.
    """
    # NOTE(review): `hdbscan` is not imported in the visible part of this
    # file; presumably it is re-exported by `cluster_ops.cluster_utils` --
    # confirm.
    n_samples = X.shape[0]
    min_cluster_size = int(max(2, 0.01 * n_samples))
    min_samples = max(2, int(0.5 * min_cluster_size))

    # No `random_state` is passed: HDBSCAN has no such parameter, and unknown
    # keyword arguments are forwarded to the distance metric.
    return hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_method="eom",  # library default, spelled out
        prediction_data=True,  # NOT the default (False)
    )
def format_outliers(n_outliers):
    """Render the outlier count as a short Markdown status line.

    Args:
        n_outliers: Number of samples that were not assigned to any cluster.

    Returns:
        A success message when there are no outliers, otherwise a warning
        with the count in bold, using singular wording for exactly one.
    """
    if n_outliers == 0:
        return "✅ No outliers detected."
    if n_outliers == 1:
        # Avoid the ungrammatical "1 workers do not ...".
        return "⚠️ **1 worker** does not strongly belong to any cluster."
    return f"⚠️ **{n_outliers} workers** do not strongly belong to any cluster."
 def format_deviations_as_columns(drivers):
     headers = []
     cells = []
     return table
 #========== CLUSTER EXPLORATION ==========
def explore_clusters(file):
    """Embed worker profiles with UMAP, cluster them with HDBSCAN, and plot.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A 3-tuple ``(fig, cluster_summary, outlier_message)``:

        * ``fig`` -- Matplotlib figure with the 2-D embedding coloured by
          cluster label.
        * ``cluster_summary`` -- dict mapping ``"Cluster <label>"`` to the
          number of samples with that label, in ascending label order,
          excluding the ``-1`` label.
        * ``outlier_message`` -- ``format_outliers`` applied to the number of
          samples labelled ``-1``.
    """
    from collections import Counter

    df = pd.read_csv(file)

    # NOTE: joblib.load unpickles the file; it must only ever point at a
    # trusted project artifact.
    exploration_pipeline = joblib.load("preprocessing/exploration_pipeline.pkl")
    # The loaded pipeline is re-fitted on the uploaded data.
    X_exp = exploration_pipeline.fit_transform(df)

    # Size the UMAP heuristic from the matrix that is actually embedded.
    # The raw frame's column count can differ from the transformed one.
    umap_model = build_umap(X_exp)
    X_umap = umap_model.fit_transform(X_exp)

    # Likewise size HDBSCAN from the embedding it is fitted on.
    hdb = build_hdbscan(X_umap)
    labels_hdb = hdb.fit_predict(X_umap)

    # --- cluster statistics ---
    label_counts = Counter(labels_hdb)
    # Label -1 is counted as "outliers" and kept out of the summary.
    n_outliers = label_counts.pop(-1, 0)
    cluster_summary = {
        f"Cluster {label}": count
        for label, count in sorted(label_counts.items())
    }

    # --- visualization ---
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(
        X_umap[:, 0],
        X_umap[:, 1],
        c=labels_hdb,
        s=15,
        alpha=0.8,
    )
    ax.set_title("Cluster Exploration of Worker Profiles")
    ax.set_xlabel("Dimension 1")
    ax.set_ylabel("Dimension 2")
    fig.tight_layout()

    return fig, cluster_summary, format_outliers(n_outliers)
 #========== FINAL CLUSTERING ==========
 def final_clustering(file, top_features):