# File size: 924 Bytes
"""Clustering helpers for grouping similar reviews."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Optional
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
@dataclass
class ClusteringEngine:
    """Cluster embedding vectors with optional PCA reduction followed by K-Means.

    The configured ``n_clusters`` and ``pca_components`` are upper bounds:
    both are capped by the number of samples actually supplied so that small
    batches are clustered instead of raising inside scikit-learn.
    """

    # Requested number of K-Means clusters (capped at the number of samples).
    n_clusters: int = 20
    # Seed forwarded to both PCA and KMeans.
    random_state: int = 42
    # Whether to attempt PCA reduction before clustering.
    use_pca: bool = True
    # Target dimensionality for PCA; a falsy value (None or 0) disables PCA.
    pca_components: Optional[int] = 50

    def fit_predict(self, embeddings: np.ndarray) -> np.ndarray:
        """Assign a cluster label to every row of ``embeddings``.

        Args:
            embeddings: 2-D array-like of shape ``(n_samples, n_features)``.

        Returns:
            1-D array with one integer label per input row. An input with
            zero rows yields an empty label array.

        Raises:
            ValueError: If ``embeddings`` is not 2-dimensional.
        """
        matrix = np.asarray(embeddings)
        if matrix.ndim != 2:
            raise ValueError(
                "embeddings must be a 2-D array of shape (n_samples, n_features); "
                f"got {matrix.ndim} dimension(s)"
            )

        n_samples, n_features = matrix.shape
        if n_samples == 0:
            # Nothing to cluster; avoid handing an empty matrix to scikit-learn.
            return np.empty(0, dtype=np.int32)

        if (
            self.use_pca
            and self.pca_components
            and n_features > self.pca_components
            # A single sample has no variance to project onto, so skip PCA.
            and n_samples > 1
        ):
            # PCA cannot extract more components than there are samples.
            n_components = min(self.pca_components, n_samples)
            reducer = PCA(n_components=n_components, random_state=self.random_state)
            matrix = reducer.fit_transform(matrix)

        # KMeans requires n_samples >= n_clusters; cap the request accordingly.
        n_clusters = min(self.n_clusters, n_samples)
        model = KMeans(n_clusters=n_clusters, random_state=self.random_state, n_init="auto")
        return model.fit_predict(matrix)
# Names exported by ``from <module> import *``.
__all__ = ["ClusteringEngine"]
|