sdbrgo committed on
Commit
1fc0ac4
·
verified ·
1 Parent(s): 0435f86

Update cluster_ops/clustering.py

Browse files
Files changed (1) hide show
  1. cluster_ops/clustering.py +76 -15
cluster_ops/clustering.py CHANGED
@@ -6,14 +6,64 @@ import traceback
6
 
7
  from cluster_ops.cluster_utils import *
8
 
9
- from sklearn.cluster import KMeans
10
  from sklearn.manifold import TSNE
 
 
 
11
 
12
  __all__ = [
13
  "explore_clusters",
14
  "final_clustering"
15
  ]
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def format_deviations_as_columns(drivers):
18
  headers = []
19
  cells = []
@@ -35,39 +85,50 @@ def format_deviations_as_columns(drivers):
35
  return table
36
 
37
  #========== CLUSTER EXPLORATION ==========
38
- def explore_clusters(file, perp, learn_rate):
 
39
  df = pd.read_csv(file)
40
  exploration_pipeline = joblib.load("preprocessing/exploration_pipeline.pkl")
41
 
42
  X_exp = exploration_pipeline.fit_transform(df)
43
 
44
- tsne = TSNE(
45
- n_components=2,
46
- perplexity=perp,
47
- learning_rate=learn_rate,
48
- init='pca',
49
- random_state=42
50
- )
51
 
52
- df_tsne = tsne.fit_transform(X_exp)
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- # Create figure explicitly
55
  fig, ax = plt.subplots(figsize=(10, 8))
56
 
57
  ax.scatter(
58
- df_tsne[:, 0],
59
- df_tsne[:, 1],
 
60
  s=15,
61
  alpha=0.8
62
  )
63
 
64
- ax.set_title("Cluster Exploration of Worker Profiles (t-SNE)")
65
  ax.set_xlabel("Dimension 1")
66
  ax.set_ylabel("Dimension 2")
67
 
68
  fig.tight_layout()
69
 
70
- return fig
71
 
72
  #========== FINAL CLUSTERING ==========
73
  def final_clustering(file, top_features):
 
6
 
7
  from cluster_ops.cluster_utils import *
8
 
 
9
  from sklearn.manifold import TSNE
10
+ from umap import UMAP
11
+ from sklearn.cluster import KMeans
12
+
13
 
14
  __all__ = [
15
  "explore_clusters",
16
  "final_clustering"
17
  ]
18
 
19
+ #========== HELPER FUNCTIONS ==========
20
+
21
def choose_umap_params(n_samples, n_features):  # engineered to emphasize local structure
    """Pick UMAP ``n_neighbors`` and ``min_dist`` from the data dimensions.

    ``n_neighbors`` is the smaller of ``int(sqrt(n_samples) - 1)`` and
    ``int(log2(n_features) - 1)``, never going below 2. ``min_dist`` is 0.1
    when there are fewer than 20 features and 0.0 otherwise.

    Args:
        n_samples: Number of rows in the matrix to embed.
        n_features: Number of columns in the matrix to embed.

    Returns:
        A ``(n_neighbors, min_dist)`` tuple.
    """
    floor = 2

    # Clamp before the math: log2(0) is -inf and sqrt of a negative is NaN,
    # and neither can be converted to int. Degenerate shapes then simply
    # fall through to ``floor``.
    sample_based = int(np.sqrt(max(n_samples, 0)) - 1)
    feature_based = int(np.log2(max(n_features, 1)) - 1)

    # The more conservative of the two heuristics wins.
    min_n_neighbors = min(sample_based, feature_based)

    best_min_dist = 0.1 if n_features < 20 else 0.0

    return max(floor, min_n_neighbors), best_min_dist
33
+
34
def build_umap(X):
    """Return an unfitted 2-component UMAP model tuned to the shape of ``X``.

    Args:
        X: Two-dimensional array-like exposing ``.shape`` as
            ``(n_samples, n_features)``.

    Returns:
        A ``UMAP`` instance whose ``n_neighbors`` and ``min_dist`` come from
        ``choose_umap_params`` and whose seed is fixed at 42.
    """
    rows, cols = X.shape
    neighbors, min_dist = choose_umap_params(rows, cols)

    reducer = UMAP(
        n_components=2,
        n_neighbors=neighbors,
        min_dist=min_dist,
        random_state=42,
    )
    return reducer
46
+
47
def build_hdbscan(X):
    """Build an unfitted HDBSCAN clusterer sized to the number of rows in ``X``.

    ``min_cluster_size`` is 1% of the sample count (at least 2) and
    ``min_samples`` is half of ``min_cluster_size`` (at least 2).

    Args:
        X: Array-like exposing ``.shape``; only ``shape[0]`` is read.

    Returns:
        An ``hdbscan.HDBSCAN`` instance.
    """
    n_samples = X.shape[0]

    min_cluster_size = int(max(2, 0.01 * n_samples))
    min_samples = max(2, int(0.5 * min_cluster_size))

    # No ``random_state`` is passed: HDBSCAN does not define that parameter,
    # and its constructor forwards unrecognised keyword arguments to the
    # distance metric rather than using them as a seed.
    return hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_method="eom",  # library default, spelled out
        # NOTE(review): per the hdbscan docs this is off by default; it is
        # enabled here explicitly.
        prediction_data=True,
    )
60
+
61
def format_outliers(n_outliers):
    """Render the outlier count as a short Markdown status line.

    Args:
        n_outliers: Number of points that were not assigned to a cluster.

    Returns:
        A success message when the count is zero, otherwise a warning that
        states the count with singular or plural wording as appropriate.
    """
    if n_outliers == 0:
        return "✅ No outliers detected."

    # Avoid "1 workers do not ..." when exactly one point is an outlier.
    if n_outliers == 1:
        noun, verb = "worker", "does"
    else:
        noun, verb = "workers", "do"

    return f"⚠️ **{n_outliers} {noun}** {verb} not strongly belong to any cluster."
65
+
66
+
67
  def format_deviations_as_columns(drivers):
68
  headers = []
69
  cells = []
 
85
  return table
86
 
87
  #========== CLUSTER EXPLORATION ==========
88
+
89
def explore_clusters(file):
    """Embed worker profiles with UMAP, cluster the embedding and plot it.

    The CSV is read, passed through the saved exploration pipeline, reduced
    to two dimensions with UMAP and clustered with HDBSCAN. Points labelled
    ``-1`` are counted as outliers and reported separately.

    Args:
        file: Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns:
        A ``(fig, cluster_summary, outlier_message)`` tuple: the Matplotlib
        figure of the embedding coloured by cluster label, a dict mapping
        ``"Cluster <label>"`` to its member count, and the string produced by
        ``format_outliers``.
    """
    from collections import Counter

    df = pd.read_csv(file)
    # NOTE(review): joblib.load unpickles the file; make sure this artifact
    # only ever comes from a trusted source.
    exploration_pipeline = joblib.load("preprocessing/exploration_pipeline.pkl")

    X_exp = exploration_pipeline.fit_transform(df)

    # dynamic UMAP constructor
    # Size the heuristics from the matrix UMAP is actually fitted on: the
    # pipeline output can have a different number of columns than the raw CSV.
    umap_model = build_umap(X_exp)
    X_umap = umap_model.fit_transform(X_exp)

    # dynamic HDBSCAN constructor, sized from the embedding it will cluster
    hdb = build_hdbscan(X_umap)
    labels_hdb = hdb.fit_predict(X_umap)

    # --- cluster statistics ---
    label_counts = Counter(labels_hdb)

    # Label -1 is pulled out so it is reported as outliers, not as a cluster.
    n_outliers = label_counts.pop(-1, 0)

    cluster_summary = {
        f"Cluster {key}": value
        for key, value in sorted(label_counts.items())
    }

    # --- visualization ---
    fig, ax = plt.subplots(figsize=(10, 8))

    ax.scatter(
        X_umap[:, 0],
        X_umap[:, 1],
        c=labels_hdb,
        s=15,
        alpha=0.8
    )

    ax.set_title("Cluster Exploration of Worker Profiles")
    ax.set_xlabel("Dimension 1")
    ax.set_ylabel("Dimension 2")

    fig.tight_layout()

    return fig, cluster_summary, format_outliers(n_outliers)
132
 
133
  #========== FINAL CLUSTERING ==========
134
  def final_clustering(file, top_features):