Simrandhiman committed on
Commit
8775e4c
·
verified ·
1 Parent(s): cebf383

Create evaluate.py

Browse files
Files changed (1) hide show
  1. evaluate.py +55 -0
evaluate.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluate.py
2
+ # Purpose: small evaluation and visualization for clusters
3
+
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn.metrics import silhouette_score
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+
11
+
12
+
13
+
14
def silhouette(embs, labels):
    """Return the silhouette score of the non-noise points, or ``None``.

    Points with a negative label are treated as noise and excluded before
    scoring.

    Args:
        embs: Embeddings, one row per sample; must support boolean-mask
            indexing (e.g. a NumPy array).
        labels: Array-like of cluster labels aligned with ``embs``.

    Returns:
        The value returned by ``silhouette_score`` for the retained points,
        or ``None`` when the score is undefined: fewer than two retained
        points, fewer than two distinct clusters, or as many clusters as
        retained points.
    """
    # Accept plain lists as well as arrays; ``list >= 0`` would raise.
    labels = np.asarray(labels)
    mask = labels >= 0
    n_kept = int(mask.sum())
    if n_kept <= 1:
        return None

    kept_labels = labels[mask]
    n_clusters = np.unique(kept_labels).size
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises
    # ValueError otherwise; report "undefined" the same way as above instead.
    if n_clusters < 2 or n_clusters >= n_kept:
        return None

    return silhouette_score(embs[mask], kept_labels)
20
+
21
+
22
+
23
+
24
def cluster_stats(df_original, labels):
    """Summarise each cluster with a row count and median income/spend.

    Args:
        df_original: DataFrame containing the ``customer_id``,
            ``annual_income`` and ``spend_score`` columns. It is not
            modified; the labels are attached to a copy.
        labels: Cluster label for each row of ``df_original``.

    Returns:
        DataFrame indexed by ``cluster`` holding the count of
        ``customer_id`` and the medians of ``annual_income`` and
        ``spend_score``.
    """
    aggregations = {
        'customer_id': 'count',
        'annual_income': 'median',
        'spend_score': 'median',
    }
    labelled = df_original.copy()
    labelled['cluster'] = labels
    per_cluster = labelled.groupby('cluster')
    return per_cluster.agg(aggregations)
29
+
30
+
31
+
32
+
33
if __name__ == '__main__':
    import argparse

    # Command-line entry point: load the feature table, embeddings and
    # cluster labels, then print the silhouette score and per-cluster stats.
    parser = argparse.ArgumentParser()
    parser.add_argument('--features', default='data/features.parquet')
    parser.add_argument('--emb', default='data/embeddings.npy')
    parser.add_argument('--labels', default='data/cluster_labels.npy')
    args = parser.parse_args()

    df = pd.read_parquet(args.features)
    embs = np.load(args.emb)
    labels = np.load(args.labels)

    s = silhouette(embs, labels)
    print('Silhouette score (ignoring noise labels -1):', s)

    # Descriptive stats are best-effort: report the failure and carry on,
    # but only blame missing columns when that is actually the cause.
    try:
        stats = cluster_stats(df, labels)
    except KeyError as err:
        print('Could not compute descriptive stats (missing columns).')
        print('Details:', err)
    except Exception as err:
        print('Could not compute descriptive stats:', repr(err))
    else:
        print(stats)