| """Clustering helpers for grouping similar reviews.""" | |
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| import numpy as np | |
| from sklearn.cluster import KMeans | |
| from sklearn.decomposition import PCA | |
@dataclass
class ClusteringEngine:
    """Cluster embedding vectors with optional PCA reduction followed by k-means.

    Attributes:
        n_clusters: Requested number of k-means clusters. The effective
            count is capped at the number of input rows.
        random_state: Seed passed to both ``PCA`` and ``KMeans``.
        use_pca: Whether to reduce dimensionality before clustering.
        pca_components: Target dimensionality for PCA. ``None`` or ``0``
            disables the reduction.
    """

    n_clusters: int = 20
    random_state: int = 42
    use_pca: bool = True
    pca_components: Optional[int] = 50

    def fit_predict(self, embeddings: np.ndarray) -> np.ndarray:
        """Assign a cluster label to every row of *embeddings*.

        Args:
            embeddings: 2-D array-like with one row per sample and one
                column per embedding dimension.

        Returns:
            1-D array holding one cluster label per input row. An input
            with zero rows yields an empty label array.

        Raises:
            ValueError: If *embeddings* is not 2-dimensional.
        """
        matrix = np.asarray(embeddings)
        if matrix.ndim != 2:
            raise ValueError(
                f"embeddings must be a 2-D array, got {matrix.ndim} dimension(s)"
            )

        n_samples, n_features = matrix.shape
        if n_samples == 0:
            # Nothing to cluster; KMeans would raise on an empty matrix.
            return np.empty(0, dtype=np.int32)

        # Reduce only when the input is wider than the PCA target.
        if self.use_pca and self.pca_components and n_features > self.pca_components:
            # PCA cannot extract more components than there are samples.
            n_components = min(self.pca_components, n_samples)
            reducer = PCA(n_components=n_components, random_state=self.random_state)
            matrix = reducer.fit_transform(matrix)

        # KMeans rejects n_clusters > n_samples, so cap it for small batches.
        model = KMeans(
            n_clusters=min(self.n_clusters, n_samples),
            random_state=self.random_state,
            n_init="auto",
        )
        return model.fit_predict(matrix)
# Names exported by `from <module> import *`: only the clustering engine.
__all__ = ["ClusteringEngine"]