# clustering.py
# Purpose: run dimensionality reduction + clustering (KMeans + HDBSCAN optional)
# and save cluster labels
import argparse
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# hdbscan is an optional dependency. Any failure while importing it (not only
# ImportError) is treated as "not available" so the KMeans path keeps working.
try:
    import hdbscan
except Exception:
    hdbscan = None


def reduce_and_cluster(embs, n_components=50, k=8, use_hdbscan=False):
    """Standardize, PCA-reduce and cluster a matrix of embeddings.

    The input is scaled with ``StandardScaler``, projected with ``PCA`` and
    then clustered with HDBSCAN (``min_cluster_size=15``) when requested and
    available, otherwise with ``KMeans(n_clusters=k, random_state=42)``.

    Args:
        embs: 2-D array-like of embeddings, one row per sample.
        n_components: Upper bound on the number of PCA components. The value
            actually used is capped by both dimensions of the input.
        k: Number of clusters for KMeans. Not used on the HDBSCAN path.
        use_hdbscan: Cluster with HDBSCAN instead of KMeans. If the
            ``hdbscan`` package could not be imported, a ``RuntimeWarning``
            is emitted and KMeans is used instead.

    Returns:
        A ``(labels, artifacts)`` tuple: ``labels`` is the per-row cluster
        assignment returned by the clusterer's ``fit_predict``, and
        ``artifacts`` is ``{'scaler': <fitted StandardScaler>,
        'pca': <fitted PCA>}``.
    """
    scaler = StandardScaler()
    Xs = scaler.fit_transform(embs)

    # PCA cannot extract more components than min(n_samples, n_features);
    # capping by the feature count alone fails on inputs with few rows.
    n_samples, n_features = Xs.shape
    pca = PCA(n_components=min(n_components, n_samples, n_features))
    Xp = pca.fit_transform(Xs)

    if use_hdbscan and hdbscan is None:
        # Keep the best-effort fallback, but do not switch algorithms silently.
        warnings.warn(
            'use_hdbscan=True but the hdbscan package is not available; '
            'falling back to KMeans',
            RuntimeWarning,
            stacklevel=2,
        )

    if use_hdbscan and hdbscan is not None:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
        labels = clusterer.fit_predict(Xp)
    else:
        km = KMeans(n_clusters=k, random_state=42)
        labels = km.fit_predict(Xp)

    return labels, {'scaler': scaler, 'pca': pca}


def main():
    """Command-line entry point: load embeddings, cluster, save the outputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--emb', default='data/embeddings.npy')
    parser.add_argument('--out_labels', default='data/cluster_labels.npy')
    # Deliberately not named "--out_*": argparse accepts unambiguous prefixes,
    # so a second "--out..." option would break callers that pass "--out".
    parser.add_argument('--artifacts_out', default='data/cluster_artifacts.joblib')
    parser.add_argument('--k', type=int, default=8)
    parser.add_argument('--use_hdbscan', action='store_true')
    args = parser.parse_args()

    embs = np.load(args.emb)
    labels, artifacts = reduce_and_cluster(
        embs, k=args.k, use_hdbscan=args.use_hdbscan
    )
    np.save(args.out_labels, labels)
    joblib.dump(artifacts, args.artifacts_out)
    print('Saved labels to', args.out_labels)


if __name__ == '__main__':
    main()