| """Embedding backends for clustering and analysis.""" |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass |
| from typing import Any |
|
|
| import numpy as np |
| from sklearn.decomposition import TruncatedSVD |
| from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
|
| @dataclass(slots=True) |
| class EmbeddingResult: |
| """Dense review embeddings with metadata useful for theme extraction.""" |
|
|
| vectors: np.ndarray |
| vectorizer: TfidfVectorizer |
| tfidf_matrix: Any |
|
|
|
|
def build_embeddings(
    texts: list[str],
    backend: str,
    seed: int,
    max_features: int,
    dimensions: int,
) -> EmbeddingResult:
    """Turn review texts into dense vectors via TF-IDF followed by SVD.

    Args:
        texts: Documents to embed.
        backend: Embedding backend name; only ``"tfidf_svd"`` is accepted.
        seed: Passed to ``TruncatedSVD`` as ``random_state``.
        max_features: Vocabulary cap forwarded to ``TfidfVectorizer``.
        dimensions: Upper bound on the number of SVD components.

    Returns:
        An ``EmbeddingResult`` holding the vectors, the fitted vectorizer
        and the TF-IDF matrix. When fewer than two SVD components are
        usable, ``vectors`` is the TF-IDF matrix converted with
        ``.toarray()`` instead of an SVD projection.

    Raises:
        ValueError: If ``backend`` is anything other than ``"tfidf_svd"``.
    """
    if backend != "tfidf_svd":
        raise ValueError(
            f"Unsupported embedding backend: {backend}. "
            "Only 'tfidf_svd' is currently implemented in the local workflow."
        )

    # With fewer than 20 texts, keep terms that occur in a single document;
    # otherwise require a term to appear in at least two.
    min_doc_freq = 1 if len(texts) < 20 else 2
    tfidf = TfidfVectorizer(
        stop_words="english",
        max_features=max_features,
        ngram_range=(1, 2),
        min_df=min_doc_freq,
        sublinear_tf=True,
    )
    term_matrix = tfidf.fit_transform(texts)
    n_docs, n_terms = term_matrix.shape

    # Component count is capped by the request and by one less than each
    # matrix dimension; a vocabulary of at most one term skips SVD outright.
    rank = min(dimensions, n_docs - 1, n_terms - 1) if n_terms > 1 else 0

    if rank >= 2:
        svd = TruncatedSVD(n_components=rank, random_state=seed)
        dense = svd.fit_transform(term_matrix)
    else:
        # Too little data to reduce: fall back to the dense TF-IDF matrix.
        dense = term_matrix.toarray()

    return EmbeddingResult(vectors=dense, vectorizer=tfidf, tfidf_matrix=term_matrix)
|
|