File size: 924 Bytes
0116d50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""Clustering helpers for grouping similar reviews."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Optional

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


@dataclass
class ClusteringEngine:
    """Cluster embedding vectors with KMeans, optionally reducing them with PCA first.

    The instance only holds configuration; ``fit_predict`` builds fresh PCA and
    KMeans estimators on every call and keeps no fitted state.
    """

    # Requested number of clusters; capped at the number of samples at fit time.
    n_clusters: int = 20
    # Seed forwarded to both the PCA reducer and the KMeans model.
    random_state: int = 42
    # Whether to attempt a PCA reduction before clustering.
    use_pca: bool = True
    # Target dimensionality for PCA; ``None`` or ``0`` disables the reduction.
    pca_components: Optional[int] = 50

    def fit_predict(self, embeddings: np.ndarray) -> np.ndarray:
        """Assign a cluster label to every row of *embeddings*.

        Args:
            embeddings: 2-D array-like of shape ``(n_samples, n_features)``.

        Returns:
            A 1-D integer array with one cluster label per input row. An input
            with zero rows yields an empty array.

        Raises:
            ValueError: If *embeddings* is not two-dimensional.
        """
        matrix = np.asarray(embeddings)
        if matrix.ndim != 2:
            raise ValueError(
                f"embeddings must be a 2-D array, got {matrix.ndim} dimension(s)"
            )

        n_samples, n_features = matrix.shape
        if n_samples == 0:
            # Nothing to cluster; the estimators would reject an empty matrix.
            return np.empty(0, dtype=np.int32)

        # Reduce only when the data is wider than the target. A single sample
        # has no variance to project onto, so PCA is skipped in that case.
        if (
            self.use_pca
            and self.pca_components
            and n_features > self.pca_components
            and n_samples > 1
        ):
            # PCA cannot extract more components than there are samples.
            n_components = min(self.pca_components, n_samples)
            reducer = PCA(n_components=n_components, random_state=self.random_state)
            matrix = reducer.fit_transform(matrix)

        # KMeans requires n_clusters <= n_samples; cap it so small batches
        # still get clustered instead of raising.
        n_clusters = min(self.n_clusters, n_samples)
        model = KMeans(n_clusters=n_clusters, random_state=self.random_state, n_init="auto")
        return model.fit_predict(matrix)


# Names exported by ``from <module> import *``.
__all__ = ["ClusteringEngine"]