File size: 1,449 Bytes
cebf383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# clustering.py
# Purpose: run dimensionality reduction + clustering (KMeans + HDBSCAN optional) and save cluster labels


import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import joblib


# hdbscan is an optional dependency: when it is absent (or fails to load,
# e.g. a binary-incompat error from its compiled extension), fall back to
# KMeans-only clustering in reduce_and_cluster. Broad Exception is
# deliberate here — import of a C-extension package can raise more than
# ImportError.
try:
    import hdbscan
except Exception:
    hdbscan = None




def reduce_and_cluster(embs, n_components=50, k=8, use_hdbscan=False):
    """Standardize embeddings, reduce with PCA, then cluster.

    Parameters
    ----------
    embs : array-like of shape (n_samples, n_features)
        Embedding vectors to cluster.
    n_components : int
        Target PCA dimensionality; capped at min(n_samples, n_features)
        so PCA never receives an invalid component count.
    k : int
        Number of clusters for KMeans (ignored when HDBSCAN is used).
    use_hdbscan : bool
        Use HDBSCAN if the optional dependency is installed; otherwise
        silently falls back to KMeans.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster label per sample (HDBSCAN may emit -1 for noise points).
    artifacts : dict
        Fitted 'scaler' and 'pca' transformers, for reuse on new data.
    """
    scaler = StandardScaler()
    Xs = scaler.fit_transform(embs)

    # PCA requires n_components <= min(n_samples, n_features); cap against
    # both axes so small inputs don't raise.
    pca = PCA(n_components=min(n_components, Xs.shape[0], Xs.shape[1]))
    Xp = pca.fit_transform(Xs)

    if use_hdbscan and hdbscan is not None:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
        labels = clusterer.fit_predict(Xp)
    else:
        # Fixed seed so cluster assignments are reproducible across runs.
        km = KMeans(n_clusters=k, random_state=42)
        labels = km.fit_predict(Xp)

    return labels, {'scaler': scaler, 'pca': pca}




if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--emb', default='data/embeddings.npy')
    parser.add_argument('--out_labels', default='data/cluster_labels.npy')
    # New flag; default matches the previously hard-coded path, so existing
    # invocations behave identically.
    parser.add_argument('--out_artifacts',
                        default='data/cluster_artifacts.joblib')
    parser.add_argument('--k', type=int, default=8)
    parser.add_argument('--use_hdbscan', action='store_true')
    args = parser.parse_args()

    # Load embeddings, cluster, and persist both the labels and the fitted
    # scaler/PCA artifacts for transforming new data later.
    embs = np.load(args.emb)
    labels, artifacts = reduce_and_cluster(
        embs, k=args.k, use_hdbscan=args.use_hdbscan)
    np.save(args.out_labels, labels)
    joblib.dump(artifacts, args.out_artifacts)
    print('Saved labels to', args.out_labels)