# Clustering helper utilities (k selection, centroid back-projection, cluster stats).
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
__all__ = [
"choose_k",
"compute_cluster_centroids_pca",
"inverse_project_centroids",
"compute_cluster_stats",
"identify_top_drivers"
]
#========== Before Final Clustering ==========
def choose_k(X_pca, max_k=12, random_state=42):
    """Choose the number of KMeans clusters by maximizing the silhouette score.

    Tries every k in [2, min(max_k, n_samples) - 1] (silhouette_score requires
    2 <= k <= n_samples - 1) and returns the k with the highest score.

    Parameters
    ----------
    X_pca : array-like of shape (n_samples, n_features)
        PCA-projected data to cluster.
    max_k : int, default 12
        Exclusive upper bound on the candidate k values (previously hard-coded).
    random_state : int, default 42
        Seed passed to KMeans for reproducibility.

    Returns
    -------
    int
        The best-scoring number of clusters.

    Raises
    ------
    ValueError
        If fewer than 3 samples are provided; no valid k exists for the
        silhouette criterion in that case (previously this silently returned 2).
    """
    n_samples = X_pca.shape[0]
    if n_samples < 3:
        raise ValueError(
            f"choose_k requires at least 3 samples for silhouette scoring; got {n_samples}"
        )
    best_k = 2
    best_score = -1.0
    # range() is exclusive at the top, so k never exceeds n_samples - 1.
    for k in range(2, min(max_k, n_samples)):
        # n_init='auto' avoids the sklearn FutureWarning about the n_init default.
        km = KMeans(n_clusters=k, random_state=random_state, n_init='auto')
        labels = km.fit_predict(X_pca)
        score = silhouette_score(X_pca, labels)
        if score > best_score:
            best_score = score
            best_k = k
    print(f"Executing choose_k()... Best Score: {best_score}")
    return best_k
#========== During Cluster Analysis ==========
def compute_cluster_centroids_pca(df_pca, labels):
    """Return the per-cluster mean position in PCA space.

    Parameters
    ----------
    df_pca : array-like of shape (n_samples, n_components)
        PCA-projected data.
    labels : array-like of shape (n_samples,)
        Cluster label for each row.

    Returns
    -------
    pandas.DataFrame
        One row per cluster (index named 'cluster'), holding the mean of
        each PCA component within that cluster.
    """
    frame = pd.DataFrame(df_pca).assign(cluster=labels)
    return frame.groupby('cluster').mean()
# maps PCA-space centroids back to original feature space
def inverse_project_centroids(pca_centroids, pca_model, scaler_model, original_feature_names):
    """Map PCA-space centroids back into the original (unscaled) feature space.

    Applies the two inverse transforms in order: PCA space -> scaled feature
    space (pca_model.inverse_transform), then scaled -> original units
    (scaler_model.inverse_transform).

    Parameters
    ----------
    pca_centroids : pandas.DataFrame
        Centroid coordinates in PCA space, indexed by cluster id.
    pca_model : fitted PCA-like object exposing ``inverse_transform``.
    scaler_model : fitted scaler-like object exposing ``inverse_transform``.
    original_feature_names : sequence of str
        Column names for the returned frame.

    Returns
    -------
    pandas.DataFrame
        Centroids in original feature units, same index as ``pca_centroids``.
    """
    # Undo the PCA projection first, then undo the feature scaling.
    in_scaled_space = pca_model.inverse_transform(pca_centroids.values)
    in_original_space = scaler_model.inverse_transform(in_scaled_space)
    return pd.DataFrame(
        in_original_space,
        columns=original_feature_names,
        index=pca_centroids.index,
    )
# function to compute and save cluster stats
def compute_cluster_stats(df_pca, labels, feature_names):
    """Compute descriptive statistics per cluster.

    Parameters
    ----------
    df_pca : array-like of shape (n_samples, n_features)
        Data whose columns correspond to ``feature_names``.
    labels : array-like of shape (n_samples,)
        Cluster label for each row.
    feature_names : sequence of str
        Names for the data columns.

    Returns
    -------
    dict
        Maps each cluster id (in sorted order) to a dict with keys
        'count', 'mean', 'median', 'std', 'min', 'max', and 'range'
        (max - min); each statistic is a feature-name -> value dict.
        Note: 'std' uses pandas' default ddof=1, so a single-member
        cluster yields NaN.
    """
    frame = pd.DataFrame(df_pca, columns=feature_names)
    frame['cluster'] = labels
    stats = {}
    # groupby iterates keys in sorted order, matching sorted(unique()).
    for cluster_id, group in frame.groupby('cluster'):
        features = group.drop(columns=['cluster'])
        col_min = features.min()
        col_max = features.max()
        stats[cluster_id] = {
            "count": len(features),
            "mean": features.mean().to_dict(),
            "median": features.median().to_dict(),
            "std": features.std().to_dict(),
            "min": col_min.to_dict(),
            "max": col_max.to_dict(),
            "range": (col_max - col_min).to_dict(),
        }
    return stats
# ranks top drivers based on `top_n`
def identify_top_drivers(original_space_centroids, top_n):
    """Rank, per cluster, the features that deviate most from the global mean.

    For each cluster centroid, computes its deviation from the mean of all
    centroids, then keeps the ``top_n`` features with the largest absolute
    deviation (signed deviations are reported, ordered most- to
    least-extreme).

    Parameters
    ----------
    original_space_centroids : pandas.DataFrame
        Cluster centroids in original feature space; index is the cluster id,
        columns are feature names.
    top_n : int
        Number of top features to report per cluster.

    Returns
    -------
    dict
        Maps each cluster id to {"deviations": {feature: signed_deviation}}.
    """
    # Column-wise mean across all centroids serves as the global reference.
    global_mean = original_space_centroids.mean()
    drivers = {}
    for cluster_id, row in original_space_centroids.iterrows():
        deviation = row - global_mean
        # Rank by magnitude so large negative deviations count as drivers too.
        ranked = deviation.abs().sort_values(ascending=False)
        top_features = ranked.head(top_n).index.tolist()
        drivers[cluster_id] = {
            "deviations": deviation[top_features].to_dict()
        }
    return drivers