# Scraped page header: Spaces: Sleeping | File size: 1,535 Bytes | 5212b8e
"""Feature pipelines: TF-IDF baseline and sentence-transformer embeddings."""
from __future__ import annotations
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from src.config import EMBED_MODEL_NAME
def build_tfidf_vectorizer() -> TfidfVectorizer:
    """Create the unfitted TF-IDF vectorizer used as the baseline featurizer.

    The configuration is fixed here so every caller gets the same settings:
    n-gram range (1, 2), ``min_df=2``, ``max_df=0.95``, sublinear term
    frequency, at most 80,000 features, and unicode accent stripping.

    Returns:
        A new ``TfidfVectorizer`` instance; the caller is responsible for
        fitting it.
    """
    settings = {
        "ngram_range": (1, 2),
        "min_df": 2,
        "max_df": 0.95,
        "sublinear_tf": True,
        "max_features": 80_000,
        "strip_accents": "unicode",
    }
    vectorizer = TfidfVectorizer(**settings)
    return vectorizer
class EmbeddingEncoder:
    """Thin wrapper around sentence-transformers with consistent settings.

    After construction the instance exposes:
      * ``device``     - the torch device string the model was loaded on
      * ``model_name`` - the model identifier passed to ``SentenceTransformer``
      * ``model``      - the loaded ``SentenceTransformer`` instance
    """

    def __init__(self, model_name: str = EMBED_MODEL_NAME, device: str | None = None):
        """Load ``model_name`` on ``device``.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
            device: Torch device string. When ``None`` the device is chosen
                automatically: ``"mps"`` if available, otherwise ``"cuda"``
                if available, otherwise ``"cpu"``.
        """
        # Imported here rather than at module level, so these packages are
        # only loaded when an encoder is actually constructed.
        from sentence_transformers import SentenceTransformer
        import torch

        if device is None:
            if torch.backends.mps.is_available():
                device = "mps"
            elif torch.cuda.is_available():
                device = "cuda"
            else:
                device = "cpu"

        self.device = device
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, device=device)

    def encode(self, texts: list[str], batch_size: int = 64, show_progress: bool = True) -> np.ndarray:
        """Embed ``texts`` and return the result as a float32 NumPy array.

        Args:
            texts: Texts to embed. A single ``str`` is treated as one text
                rather than being split into characters.
            batch_size: Batch size forwarded to the model's ``encode``.
            show_progress: Forwarded as ``show_progress_bar``.

        Returns:
            A ``float32`` array of the embeddings produced by the model with
            ``normalize_embeddings=True``. For empty input, an array of shape
            ``(0, dim)`` where ``dim`` is the model's reported embedding
            dimension (``0`` if the model does not report one).
        """
        # A bare string is iterable: list("abc") would yield ["a", "b", "c"]
        # and each character would be embedded as its own document.
        if isinstance(texts, str):
            texts = [texts]
        batch = list(texts)

        # Skip the model call for empty input and return a 2-D empty array so
        # callers can still stack / matrix-multiply the result.
        if not batch:
            dim = self.model.get_sentence_embedding_dimension() or 0
            return np.empty((0, dim), dtype=np.float32)

        embeddings = self.model.encode(
            batch,
            batch_size=batch_size,
            show_progress_bar=show_progress,
            normalize_embeddings=True,
            convert_to_numpy=True,
        )
        return np.asarray(embeddings, dtype=np.float32)
|