Spaces:
Sleeping
Sleeping
| """Feature pipelines: TF-IDF baseline and sentence-transformer embeddings.""" | |
| from __future__ import annotations | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from src.config import EMBED_MODEL_NAME | |
def build_tfidf_vectorizer() -> TfidfVectorizer:
    """Create the unfitted TF-IDF vectorizer used for the baseline features.

    The configuration extracts unigrams and bigrams, drops terms seen in fewer
    than 2 documents or in more than 95% of documents, applies sublinear
    term-frequency scaling, caps the vocabulary at 80,000 features, and strips
    accents using Unicode normalization.

    Returns:
        A new ``TfidfVectorizer`` instance; the caller is responsible for
        fitting it.
    """
    # Keep every tuning knob in one mapping so the configuration reads as data.
    settings = {
        "ngram_range": (1, 2),
        "min_df": 2,
        "max_df": 0.95,
        "sublinear_tf": True,
        "max_features": 80_000,
        "strip_accents": "unicode",
    }
    return TfidfVectorizer(**settings)
class EmbeddingEncoder:
    """Thin wrapper around sentence-transformers with consistent settings.

    Attributes:
        device: Device string the model was loaded on.
        model_name: Model identifier that was passed to ``SentenceTransformer``.
        model: The loaded ``SentenceTransformer`` instance.
    """

    def __init__(self, model_name: str = EMBED_MODEL_NAME, device: str | None = None):
        """Load the embedding model, picking a device when none is given.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
            device: Explicit device string. When ``None``, the first available
                of ``"mps"``, ``"cuda"`` and ``"cpu"`` is used, in that order.
        """
        # Local import keeps sentence-transformers out of module import time.
        from sentence_transformers import SentenceTransformer

        if device is None:
            device = self._default_device()
        self.device = device
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, device=device)

    @staticmethod
    def _default_device() -> str:
        """Return the preferred available device: ``mps``, then ``cuda``, then ``cpu``."""
        import torch

        # Older PyTorch builds have no ``torch.backends.mps`` attribute at all,
        # so look it up defensively instead of assuming it exists.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return "mps"
        if torch.cuda.is_available():
            return "cuda"
        return "cpu"

    def encode(self, texts: list[str], batch_size: int = 64, show_progress: bool = True) -> np.ndarray:
        """Embed texts as normalized ``float32`` vectors.

        Args:
            texts: Strings to embed. Any iterable of strings is accepted; a
                bare ``str`` is treated as a single text rather than being
                split into characters.
            batch_size: Batch size forwarded to the model's ``encode``.
            show_progress: Whether the model should display a progress bar.

        Returns:
            A ``float32`` array with one row per input text. Empty input
            yields an array of shape ``(0, dim)`` without invoking the model.
        """
        # ``list("abc")`` would yield ["a", "b", "c"]; wrap a lone string so it
        # is embedded as one text.
        sentences = [texts] if isinstance(texts, str) else list(texts)

        if not sentences:
            # Keep the result two-dimensional so callers can rely on shape[1]
            # and on stacking with non-empty batches. The dimension lookup may
            # return None for some models, hence the fallback to 0.
            dim = self.model.get_sentence_embedding_dimension() or 0
            return np.empty((0, dim), dtype=np.float32)

        embeddings = self.model.encode(
            sentences,
            batch_size=batch_size,
            show_progress_bar=show_progress,
            normalize_embeddings=True,
            convert_to_numpy=True,
        )
        return np.asarray(embeddings, dtype=np.float32)