maralzar
Initial commit: EPCC clause classifier Streamlit demo for HF Spaces
5212b8e
Raw
History Blame Contribute Delete
1.54 kB
"""Feature pipelines: TF-IDF baseline and sentence-transformer embeddings."""
from __future__ import annotations
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from src.config import EMBED_MODEL_NAME
def build_tfidf_vectorizer() -> TfidfVectorizer:
    """Create the unfitted TF-IDF vectorizer used for the baseline features.

    Returns:
        A fresh ``TfidfVectorizer`` over unigrams and bigrams; fitting it is
        left to the caller.
    """
    settings = {
        "ngram_range": (1, 2),  # unigrams and bigrams
        "min_df": 2,  # integer: minimum number of documents a term must appear in
        "max_df": 0.95,  # float: maximum fraction of documents a term may appear in
        "sublinear_tf": True,  # dampen raw term counts logarithmically
        "max_features": 80_000,  # upper bound on vocabulary size
        "strip_accents": "unicode",
    }
    return TfidfVectorizer(**settings)
class EmbeddingEncoder:
    """Thin wrapper around sentence-transformers with consistent settings.

    Instance attributes (all assigned in ``__init__``):
        device: Device string handed to ``SentenceTransformer``.
        model_name: Name the model was loaded from.
        model: The underlying ``SentenceTransformer`` instance.
    """

    def __init__(self, model_name: str = EMBED_MODEL_NAME, device: str | None = None):
        """Load the sentence-transformer model, picking a device if none is given.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            device: Explicit device string. When ``None``, ``"mps"`` is used if
                available, otherwise ``"cuda"`` if available, otherwise ``"cpu"``.
        """
        # Function-scope imports: these packages are only required once an
        # encoder is actually constructed.
        from sentence_transformers import SentenceTransformer
        import torch

        if device is None:
            # ``torch.backends.mps`` does not exist on older torch builds, so
            # look it up defensively instead of raising AttributeError.
            mps_backend = getattr(torch.backends, "mps", None)
            if mps_backend is not None and mps_backend.is_available():
                device = "mps"
            elif torch.cuda.is_available():
                device = "cuda"
            else:
                device = "cpu"

        self.device = device
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, device=device)

    def encode(self, texts: list[str], batch_size: int = 64, show_progress: bool = True) -> np.ndarray:
        """Embed ``texts`` and return the result as a float32 NumPy array.

        Args:
            texts: Strings to embed. Any iterable of strings is accepted; a
                single bare ``str`` is treated as one text rather than being
                iterated character by character.
            batch_size: Forwarded to the model's ``encode`` as ``batch_size``.
            show_progress: Forwarded to the model's ``encode`` as
                ``show_progress_bar``.

        Returns:
            The model's embeddings (requested with ``normalize_embeddings=True``)
            converted to ``float32``. For empty input, an empty array of shape
            ``(0, dim)`` where ``dim`` is the model's reported embedding
            dimension (0 if the model does not report one).
        """
        # ``list("abc")`` would yield ["a", "b", "c"]; wrap a lone string so it
        # is embedded as a single text.
        if isinstance(texts, str):
            texts = [texts]
        items = list(texts)

        if not items:
            # Skip the model call and return an explicitly two-dimensional
            # empty result so callers can rely on the array's rank.
            dim = self.model.get_sentence_embedding_dimension() or 0
            return np.empty((0, dim), dtype=np.float32)

        vectors = self.model.encode(
            items,
            batch_size=batch_size,
            show_progress_bar=show_progress,
            normalize_embeddings=True,
            convert_to_numpy=True,
        )
        return np.asarray(vectors, dtype=np.float32)