Update cluster_utils.py
Browse files — added more utils for back-projection
- cluster_utils.py +36 -21
cluster_utils.py
CHANGED
|
@@ -18,9 +18,29 @@ def choose_k(X_pca, k_range=(2, 12)):
|
|
| 18 |
return best_k
|
| 19 |
|
| 20 |
#========== During Cluster Analysis ==========
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
df =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
df['cluster'] = labels
|
| 25 |
|
| 26 |
stats = {}
|
|
@@ -35,31 +55,26 @@ def compute_cluster_stats(X_processed, labels, feature_names):
|
|
| 35 |
"std": cluster_data.std().to_dict(),
|
| 36 |
"min": cluster_data.min().to_dict(),
|
| 37 |
"max": cluster_data.max().to_dict(),
|
| 38 |
-
"range": (cluster_data.max() - cluster_data.min()).to_dict()
|
| 39 |
}
|
| 40 |
|
| 41 |
return stats
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
def
|
| 45 |
-
|
| 46 |
-
df['cluster'] = labels
|
| 47 |
-
|
| 48 |
-
global_mean = df.drop(columns=['cluster']).mean()
|
| 49 |
-
global_std = df.drop(columns=['cluster']).std()
|
| 50 |
|
| 51 |
-
|
| 52 |
|
| 53 |
-
for cluster_id in
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
z_scores = ((cluster_mean - global_mean) / global_std).abs()
|
| 57 |
|
| 58 |
-
|
| 59 |
|
| 60 |
-
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
}
|
| 64 |
|
| 65 |
-
return
|
|
|
|
| 18 |
return best_k
|
| 19 |
|
| 20 |
#========== During Cluster Analysis ==========
|
| 21 |
+
def compute_cluster_centroids_pca(df_pca, labels):
    """Compute the mean point of each cluster in PCA space.

    Parameters
    ----------
    df_pca : array-like or DataFrame
        Samples expressed in PCA coordinates, one row per sample.
    labels : sequence
        Cluster label for each sample; same length as ``df_pca``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster id, holding the per-component mean.
    """
    frame = pd.DataFrame(df_pca)
    frame['cluster'] = labels
    # groupby-mean collapses each cluster to its centroid
    return frame.groupby('cluster').mean()
| 26 |
+
|
| 27 |
+
# maps PCA-space centroids back to original feature space
def inverse_project_centroids(pca_centroids, pca_model, scaler_model, original_feature_names):
    """Back-project PCA-space centroids into the original feature space.

    Parameters
    ----------
    pca_centroids : pandas.DataFrame
        Centroids in PCA coordinates (e.g. output of
        ``compute_cluster_centroids_pca``); the index is preserved.
    pca_model : object
        Fitted model exposing ``inverse_transform`` (PCA -> scaled space).
    scaler_model : object
        Fitted scaler exposing ``inverse_transform`` (scaled -> raw space).
    original_feature_names : sequence of str
        Column names for the returned frame.

    Returns
    -------
    pandas.DataFrame
        Centroids expressed in the original (unscaled) feature space.
    """
    # undo the PCA projection first, then undo the feature scaling
    unscaled = scaler_model.inverse_transform(
        pca_model.inverse_transform(pca_centroids.values)
    )
    return pd.DataFrame(
        unscaled,
        columns=original_feature_names,
        index=pca_centroids.index,
    )
|
| 40 |
+
|
| 41 |
+
# function to compute and save cluster stats
|
| 42 |
+
def compute_cluster_stats(df_pca, labels, feature_names):
|
| 43 |
+
df = pd.DataFrame(df_pca, columns=feature_names)
|
| 44 |
df['cluster'] = labels
|
| 45 |
|
| 46 |
stats = {}
|
|
|
|
| 55 |
"std": cluster_data.std().to_dict(),
|
| 56 |
"min": cluster_data.min().to_dict(),
|
| 57 |
"max": cluster_data.max().to_dict(),
|
| 58 |
+
"range": (cluster_data.max() - cluster_data.min()).to_dict()
|
| 59 |
}
|
| 60 |
|
| 61 |
return stats
|
| 62 |
|
| 63 |
+
# ranks top drivers based on `top_n`
def identify_top_drivers(original_space_centroids, top_n):
    """Identify, per cluster, the features that deviate most from the global mean.

    Parameters
    ----------
    original_space_centroids : pandas.DataFrame
        Cluster centroids in original feature space, one row per cluster.
    top_n : int
        Number of highest-|deviation| features to report per cluster.

    Returns
    -------
    dict
        Maps each cluster id to ``{"top_features": [...], "deviations": {...}}``
        where deviations are signed (centroid minus global mean).
    """
    # unweighted mean across centroids serves as the global reference point
    overall = original_space_centroids.mean()

    result = {}
    for cid, centroid in original_space_centroids.iterrows():
        delta = centroid - overall
        # rank by magnitude, keep the signed values for reporting
        leading = delta.abs().sort_values(ascending=False).head(top_n).index.tolist()
        result[cid] = {
            "top_features": leading,
            "deviations": delta[leading].to_dict(),
        }
    return result
|