Simrandhiman committed on
Commit
cebf383
·
verified ·
1 Parent(s): decfa0a

Create clustering.py

Browse files
Files changed (1) hide show
  1. clustering.py +59 -0
clustering.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # clustering.py
2
+ # Purpose: run dimensionality reduction + clustering (KMeans + HDBSCAN optional) and save cluster labels
3
+
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn.decomposition import PCA
8
+ from sklearn.cluster import KMeans
9
+ from sklearn.preprocessing import StandardScaler
10
+ import joblib
11
+
12
+
13
+ try:
14
+ import hdbscan
15
+ except Exception:
16
+ hdbscan = None
17
+
18
+
19
+
20
+
21
def reduce_and_cluster(embs, n_components=50, k=8, use_hdbscan=False):
    """Standardize embeddings, reduce them with PCA, and assign cluster labels.

    Args:
        embs: Embedding matrix passed directly to ``StandardScaler``.
            Assumed to be 2-D, ``(n_samples, n_features)`` -- TODO confirm
            against the producer of the embeddings file.
        n_components: Upper bound on the number of PCA components. The value
            actually used is clamped to ``min(n_samples, n_features)``.
        k: Number of clusters for KMeans (ignored when HDBSCAN is used).
        use_hdbscan: When true and the optional ``hdbscan`` package was
            imported successfully, cluster with HDBSCAN instead of KMeans.
            If ``hdbscan`` is unavailable a ``RuntimeWarning`` is emitted and
            KMeans is used.

    Returns:
        A ``(labels, artifacts)`` tuple: ``labels`` is the output of the
        clusterer's ``fit_predict`` on the PCA-reduced data, and ``artifacts``
        is ``{'scaler': <fitted StandardScaler>, 'pca': <fitted PCA>}``.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(embs)

    # PCA cannot extract more components than min(n_samples, n_features);
    # clamping on the feature count alone fails for small sample sets.
    n_samples, n_features = scaled.shape
    pca = PCA(n_components=min(n_components, n_samples, n_features))
    reduced = pca.fit_transform(scaled)

    hdbscan_available = hdbscan is not None
    if use_hdbscan and not hdbscan_available:
        # Keep the best-effort fallback, but make it visible to the caller.
        import warnings

        warnings.warn(
            "use_hdbscan=True but the hdbscan package could not be imported; "
            "falling back to KMeans",
            RuntimeWarning,
            stacklevel=2,
        )

    if use_hdbscan and hdbscan_available:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
    else:
        # Fixed seed so repeated runs on the same data give the same labels.
        clusterer = KMeans(n_clusters=k, random_state=42)
    labels = clusterer.fit_predict(reduced)

    return labels, {'scaler': scaler, 'pca': pca}
41
+
42
+
43
+
44
+
45
if __name__ == '__main__':
    # CLI entry point: load embeddings, cluster them, and persist the labels
    # together with the fitted preprocessing artifacts.
    import argparse
    import os

    parser = argparse.ArgumentParser(
        description='Run dimensionality reduction + clustering and save cluster labels.'
    )
    parser.add_argument('--emb', default='data/embeddings.npy')
    parser.add_argument('--out_labels', default='data/cluster_labels.npy')
    # Previously hard-coded; the default keeps the original location.
    parser.add_argument('--out_artifacts', default='data/cluster_artifacts.joblib')
    parser.add_argument('--k', type=int, default=8)
    # Default mirrors reduce_and_cluster's own n_components default.
    parser.add_argument('--n_components', type=int, default=50)
    parser.add_argument('--use_hdbscan', action='store_true')
    args = parser.parse_args()

    embs = np.load(args.emb)
    labels, artifacts = reduce_and_cluster(
        embs,
        n_components=args.n_components,
        k=args.k,
        use_hdbscan=args.use_hdbscan,
    )

    # Make sure the destination directories exist before writing.
    for out_path in (args.out_labels, args.out_artifacts):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    np.save(args.out_labels, labels)
    joblib.dump(artifacts, args.out_artifacts)
    print('Saved labels to', args.out_labels)
    print('Saved artifacts to', args.out_artifacts)