Richard CHEAM
Deploy customer feedback intelligence demo
73b0303
"""Theme discovery utilities based on review embeddings."""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from sklearn.cluster import KMeans
@dataclass(slots=True)
class ThemeDiscoveryResult:
"""Cluster assignments and human-readable theme labels."""
assignments: list[int]
top_terms_by_cluster: dict[int, list[str]]
representative_row_index_by_cluster: dict[int, int]
def discover_themes(
embeddings: np.ndarray,
tfidf_matrix,
feature_names: np.ndarray,
n_clusters: int,
top_terms_per_cluster: int,
seed: int,
) -> ThemeDiscoveryResult:
"""Cluster review embeddings and derive representative terms per cluster."""
if len(embeddings) == 0:
return ThemeDiscoveryResult(
assignments=[],
top_terms_by_cluster={},
representative_row_index_by_cluster={},
)
cluster_count = max(1, min(n_clusters, len(embeddings)))
model = KMeans(n_clusters=cluster_count, random_state=seed, n_init="auto")
assignments = model.fit_predict(embeddings)
top_terms_by_cluster: dict[int, list[str]] = {}
representative_row_index_by_cluster: dict[int, int] = {}
for cluster_id in range(cluster_count):
cluster_rows = np.where(assignments == cluster_id)[0]
cluster_matrix = tfidf_matrix[cluster_rows]
weights = np.asarray(cluster_matrix.mean(axis=0)).ravel()
ranked_indices = weights.argsort()[::-1]
terms = [
str(feature_names[index])
for index in ranked_indices[:top_terms_per_cluster]
if weights[index] > 0
]
top_terms_by_cluster[cluster_id] = terms or ["misc"]
cluster_vectors = embeddings[cluster_rows]
centroid = model.cluster_centers_[cluster_id]
distances = np.linalg.norm(cluster_vectors - centroid, axis=1)
representative_row_index_by_cluster[cluster_id] = int(
cluster_rows[int(np.argmin(distances))]
)
return ThemeDiscoveryResult(
assignments=assignments.tolist(),
top_terms_by_cluster=top_terms_by_cluster,
representative_row_index_by_cluster=representative_row_index_by_cluster,
)