"""Feature pipelines: TF-IDF baseline and sentence-transformer embeddings.""" from __future__ import annotations import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from src.config import EMBED_MODEL_NAME def build_tfidf_vectorizer() -> TfidfVectorizer: return TfidfVectorizer( ngram_range=(1, 2), min_df=2, max_df=0.95, sublinear_tf=True, max_features=80_000, strip_accents="unicode", ) class EmbeddingEncoder: """Thin wrapper around sentence-transformers with consistent settings.""" def __init__(self, model_name: str = EMBED_MODEL_NAME, device: str | None = None): from sentence_transformers import SentenceTransformer import torch if device is None: if torch.backends.mps.is_available(): device = "mps" elif torch.cuda.is_available(): device = "cuda" else: device = "cpu" self.device = device self.model_name = model_name self.model = SentenceTransformer(model_name, device=device) def encode(self, texts: list[str], batch_size: int = 64, show_progress: bool = True) -> np.ndarray: return np.asarray( self.model.encode( list(texts), batch_size=batch_size, show_progress_bar=show_progress, normalize_embeddings=True, convert_to_numpy=True, ), dtype=np.float32, )