Spaces:
Runtime error
Runtime error
| # clustering.py | |
| # Purpose: run dimensionality reduction + clustering (KMeans + HDBSCAN optional) and save cluster labels | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.decomposition import PCA | |
| from sklearn.cluster import KMeans | |
| from sklearn.preprocessing import StandardScaler | |
| import joblib | |
| try: | |
| import hdbscan | |
| except Exception: | |
| hdbscan = None | |
def reduce_and_cluster(embs, n_components=50, k=8, use_hdbscan=False):
    """Standardize embeddings, reduce them with PCA, and cluster the result.

    Args:
        embs: 2-D array-like of embeddings, one row per sample.
        n_components: Requested number of PCA components. It is capped to
            what the data supports (see the comment in the body).
        k: Number of clusters for KMeans. Unused when HDBSCAN runs.
        use_hdbscan: If true and the optional ``hdbscan`` package imported
            successfully, cluster with HDBSCAN; otherwise KMeans is used.

    Returns:
        A ``(labels, artifacts)`` tuple. ``labels`` is the output of the
        clusterer's ``fit_predict``; ``artifacts`` is a dict holding the
        fitted ``'scaler'`` and ``'pca'`` objects.
    """
    import warnings

    # Put every feature on a comparable scale before PCA.
    scaler = StandardScaler()
    Xs = scaler.fit_transform(embs)

    # PCA can extract at most min(n_samples, n_features) components; capping
    # by the feature count alone fails on inputs with fewer rows than
    # n_components.
    n_samples, n_features = Xs.shape
    pca = PCA(n_components=min(n_components, n_samples, n_features))
    Xp = pca.fit_transform(Xs)

    if use_hdbscan and hdbscan is None:
        # Keep the best-effort fallback, but do not hide it from the caller.
        warnings.warn(
            'use_hdbscan=True but the hdbscan package could not be imported; '
            'falling back to KMeans',
            RuntimeWarning,
            stacklevel=2,
        )

    if use_hdbscan and hdbscan is not None:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
        labels = clusterer.fit_predict(Xp)
    else:
        # Fixed seed so repeated runs give the same assignment.
        km = KMeans(n_clusters=k, random_state=42)
        labels = km.fit_predict(Xp)

    return labels, {'scaler': scaler, 'pca': pca}
if __name__ == '__main__':
    # CLI entry point: load embeddings, cluster them, and persist the labels
    # together with the fitted preprocessing objects.
    import argparse
    from pathlib import Path

    parser = argparse.ArgumentParser(
        description='Reduce embeddings with PCA and cluster them.'
    )
    parser.add_argument('--emb', default='data/embeddings.npy',
                        help='path to the .npy file holding the embeddings')
    parser.add_argument('--out_labels', default='data/cluster_labels.npy',
                        help='where to save the cluster labels')
    parser.add_argument('--out_artifacts',
                        default='data/cluster_artifacts.joblib',
                        help='where to save the fitted scaler and PCA')
    parser.add_argument('--n_components', type=int, default=50,
                        help='requested number of PCA components')
    parser.add_argument('--k', type=int, default=8,
                        help='number of KMeans clusters')
    parser.add_argument('--use_hdbscan', action='store_true',
                        help='cluster with HDBSCAN when it is installed')
    args = parser.parse_args()

    embs = np.load(args.emb)
    labels, artifacts = reduce_and_cluster(
        embs,
        n_components=args.n_components,
        k=args.k,
        use_hdbscan=args.use_hdbscan,
    )

    # Create the output directories up front so a custom path in a new
    # directory does not fail at save time.
    for target in (args.out_labels, args.out_artifacts):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    np.save(args.out_labels, labels)
    joblib.dump(artifacts, args.out_artifacts)
    print('Saved labels to', args.out_labels)