sdbrgo committed on
Commit
28c34ea
·
verified ·
1 Parent(s): ec476d9

Create cluster_utils.py

Browse files

Contains choose_k(), compute_cluster_stats(), identify_extreme_features()

Files changed (1) hide show
  1. cluster_utils.py +65 -0
cluster_utils.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
3
+
4
+ #========== Before Final Clustering ==========
5
def choose_k(X_pca, k_range=(2, 12)):
    """Pick the KMeans cluster count with the highest silhouette score.

    Args:
        X_pca: Samples to cluster, one row per sample.
        k_range: ``(start, stop)`` pair of candidate cluster counts. As with
            ``range``, ``stop`` is exclusive, so the default tries k = 2..11.

    Returns:
        The candidate k with the best silhouette score, or 2 when no
        candidate could be scored (e.g. too few samples).
    """
    n_samples = X_pca.shape[0] if hasattr(X_pca, "shape") else len(X_pca)

    # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1,
    # so clamp the candidates instead of letting scikit-learn raise.
    first_k = max(k_range[0], 2)
    stop_k = min(k_range[1], n_samples)

    best_k = 2
    best_score = -1

    for k in range(first_k, stop_k):
        km = KMeans(n_clusters=k, random_state=42)
        labels = km.fit_predict(X_pca)

        # Duplicate points can make KMeans return fewer distinct labels than
        # requested; a single label cannot be scored.
        if len(set(labels.tolist())) < 2:
            continue

        score = silhouette_score(X_pca, labels)

        if score > best_score:
            best_score = score
            best_k = k

    return best_k
19
+
20
+ #========== During Cluster Analysis ==========
21
+ # function to compute per-cluster summary statistics
22
def compute_cluster_stats(X_processed, labels, feature_names):
    """Compute per-cluster summary statistics for every feature.

    Args:
        X_processed: Feature matrix, one row per sample.
        labels: Cluster label of each row, in row order.
        feature_names: Column names for ``X_processed``.

    Returns:
        A dict keyed by cluster id (in sorted order). Each value is a dict
        with ``"count"`` (number of rows in the cluster) and ``"mean"``,
        ``"median"``, ``"std"``, ``"min"``, ``"max"`` and ``"range"``, each
        mapping feature name to the statistic. ``"std"`` uses the pandas
        default (sample standard deviation), so it is NaN for a cluster with
        a single row.
    """
    features = pd.DataFrame(X_processed, columns=feature_names)
    # Keep the labels outside the feature frame so that a feature that happens
    # to be named "cluster" is neither overwritten nor dropped.
    cluster_labels = pd.Series(list(labels), index=features.index)

    stats = {}

    for cluster_id in sorted(cluster_labels.unique()):
        cluster_data = features.loc[cluster_labels == cluster_id]
        lowest = cluster_data.min()
        highest = cluster_data.max()

        stats[cluster_id] = {
            "count": len(cluster_data),
            "mean": cluster_data.mean().to_dict(),
            "median": cluster_data.median().to_dict(),
            "std": cluster_data.std().to_dict(),
            "min": lowest.to_dict(),
            "max": highest.to_dict(),
            "range": (highest - lowest).to_dict(),
        }

    return stats
42
+
43
+ # function to identify core features of the clusters
44
def identify_extreme_features(X_processed, labels, feature_names, threshold=1.0):
    """Find the features whose cluster mean deviates most from the global mean.

    For each cluster, the absolute z-score of every feature is computed as
    ``|cluster_mean - global_mean| / global_std``. Features whose score
    exceeds ``threshold`` are reported, largest score first.

    Args:
        X_processed: Feature matrix, one row per sample.
        labels: Cluster label of each row, in row order.
        feature_names: Column names for ``X_processed``.
        threshold: Minimum absolute z-score (exclusive) for a feature to be
            reported.

    Returns:
        A dict keyed by cluster id (in sorted order). Each value has
        ``"features"`` (list of feature names) and ``"z_scores"`` (feature
        name -> absolute z-score). A feature with zero or undefined global
        standard deviation yields a NaN score and is never reported.
    """
    features = pd.DataFrame(X_processed, columns=feature_names)
    # Keep the labels outside the feature frame so that a feature that happens
    # to be named "cluster" is neither overwritten nor dropped.
    cluster_labels = pd.Series(list(labels), index=features.index)

    global_mean = features.mean()
    global_std = features.std()

    extremes = {}

    for cluster_id in sorted(cluster_labels.unique()):
        cluster_mean = features.loc[cluster_labels == cluster_id].mean()

        z_scores = ((cluster_mean - global_mean) / global_std).abs()

        extreme_features = z_scores[z_scores > threshold].sort_values(ascending=False)

        extremes[cluster_id] = {
            "features": extreme_features.index.tolist(),
            "z_scores": extreme_features.to_dict(),
        }

    return extremes