sdbrgo committed on
Commit
d8e5ee4
·
verified ·
1 Parent(s): 64292e5

Update cluster_utils.py

Browse files

added more utils for back-projection

Files changed (1) hide show
  1. cluster_utils.py +36 -21
cluster_utils.py CHANGED
@@ -18,9 +18,29 @@ def choose_k(X_pca, k_range=(2, 12)):
18
  return best_k
19
 
20
  #========== During Cluster Analysis ==========
21
- # function to save cluster stats
22
- def compute_cluster_stats(X_processed, labels, feature_names):
23
- df = pd.DataFrame(X_processed, columns=feature_names)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  df['cluster'] = labels
25
 
26
  stats = {}
@@ -35,31 +55,26 @@ def compute_cluster_stats(X_processed, labels, feature_names):
35
  "std": cluster_data.std().to_dict(),
36
  "min": cluster_data.min().to_dict(),
37
  "max": cluster_data.max().to_dict(),
38
- "range": (cluster_data.max() - cluster_data.min()).to_dict(),
39
  }
40
 
41
  return stats
42
 
43
- # function to identify core features of the clusters
44
- def identify_extreme_features(X_processed, labels, feature_names, threshold=1.0):
45
- df = pd.DataFrame(X_processed, columns=feature_names)
46
- df['cluster'] = labels
47
-
48
- global_mean = df.drop(columns=['cluster']).mean()
49
- global_std = df.drop(columns=['cluster']).std()
50
 
51
- extremes = {}
52
 
53
- for cluster_id in sorted(df['cluster'].unique()):
54
- cluster_mean = df[df['cluster'] == cluster_id].drop(columns=['cluster']).mean()
55
-
56
- z_scores = ((cluster_mean - global_mean) / global_std).abs()
57
 
58
- extreme_features = z_scores[z_scores > threshold].sort_values(ascending=False)
59
 
60
- extremes[cluster_id] = {
61
- "features": extreme_features.index.tolist(),
62
- "z_scores": extreme_features.to_dict()
63
  }
64
 
65
- return extremes
 
18
  return best_k
19
 
20
  #========== During Cluster Analysis ==========
21
def compute_cluster_centroids_pca(df_pca, labels):
    """Compute the centroid of each cluster in PCA space.

    Parameters
    ----------
    df_pca : array-like or DataFrame
        Rows are samples in PCA-component coordinates.
    labels : sequence
        Cluster label for each row of ``df_pca``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster id (the group index), columns are the
        per-component means of that cluster's members.
    """
    frame = pd.DataFrame(df_pca).assign(cluster=labels)
    return frame.groupby('cluster').mean()
26
+
27
# maps PCA-space centroids back to original feature space
def inverse_project_centroids(pca_centroids, pca_model, scaler_model, original_feature_names):
    """Back-project PCA-space centroids into the original feature space.

    Applies the inverse of the forward pipeline (scale -> PCA) in
    reverse order: first undo the PCA projection, then undo the scaling.

    Parameters
    ----------
    pca_centroids : pandas.DataFrame
        Cluster centroids in PCA coordinates (index = cluster ids).
    pca_model : fitted object with ``inverse_transform`` (e.g. sklearn PCA).
    scaler_model : fitted object with ``inverse_transform`` (e.g. StandardScaler).
    original_feature_names : sequence of str
        Column labels for the reconstructed feature space.

    Returns
    -------
    pandas.DataFrame
        Centroids in original feature units, indexed like ``pca_centroids``.
    """
    unscaled = scaler_model.inverse_transform(
        pca_model.inverse_transform(pca_centroids.values)
    )
    return pd.DataFrame(
        unscaled,
        columns=original_feature_names,
        index=pca_centroids.index,
    )
40
+
41
+ # function to compute and save cluster stats
42
+ def compute_cluster_stats(df_pca, labels, feature_names):
43
+ df = pd.DataFrame(df_pca, columns=feature_names)
44
  df['cluster'] = labels
45
 
46
  stats = {}
 
55
  "std": cluster_data.std().to_dict(),
56
  "min": cluster_data.min().to_dict(),
57
  "max": cluster_data.max().to_dict(),
58
+ "range": (cluster_data.max() - cluster_data.min()).to_dict()
59
  }
60
 
61
  return stats
62
 
63
# ranks top drivers based on `top_n`
def identify_top_drivers(original_space_centroids, top_n):
    """Rank, for each cluster, the features that deviate most from the global mean.

    Parameters
    ----------
    original_space_centroids : pandas.DataFrame
        Cluster centroids in original feature space (index = cluster ids,
        columns = feature names).
    top_n : int
        Number of highest-|deviation| features to report per cluster.

    Returns
    -------
    dict
        cluster_id -> {"top_features": [names...],
                       "deviations": {name: signed deviation from global mean}}.
    """
    overall = original_space_centroids.mean()
    result = {}

    for cid, centroid in original_space_centroids.iterrows():
        delta = centroid - overall
        # Sort by magnitude, keep the signed values for reporting.
        leading = delta.abs().sort_values(ascending=False).iloc[:top_n].index.tolist()
        result[cid] = {
            "top_features": leading,
            "deviations": delta[leading].to_dict(),
        }

    return result