""" src/retrieval/embedder.py -------------------------- Extended embedder supporting multiple models for benchmarking. Compatible with lab_s6 Embedder API (encode / encode_one). """ from typing import Literal import numpy as np SUPPORTED_MODELS = { "multilingual-e5-small": "intfloat/multilingual-e5-small", "bge-small-en": "BAAI/bge-small-en-v1.5", "minilm-l6": "sentence-transformers/all-MiniLM-L6-v2", } class Embedder: def __init__(self, model_key: str = "multilingual-e5-small"): from sentence_transformers import SentenceTransformer model_name = SUPPORTED_MODELS.get(model_key, model_key) self.model_key = model_key self.model_name = model_name self.model = SentenceTransformer(model_name) self.dim = self.model.get_sentence_embedding_dimension() print(f"✅ Loaded embedder: {model_name} (dim={self.dim})") def encode(self, texts: list[str], show_progress: bool = True) -> np.ndarray: return self.model.encode( texts, show_progress_bar=show_progress, normalize_embeddings=True, batch_size=32, ) def encode_one(self, text: str) -> list[float]: v = self.model.encode([text], normalize_embeddings=True) return v[0].tolist() def __repr__(self): return f"Embedder(model={self.model_name}, dim={self.dim})" def benchmark_embedders(texts: list[str], model_keys: list[str] = None) -> dict: import time if model_keys is None: model_keys = list(SUPPORTED_MODELS.keys()) results = {} for key in model_keys: emb = Embedder(key) t0 = time.time() emb.encode(texts, show_progress=False) elapsed = time.time() - t0 results[key] = { "model_name": emb.model_name, "dim": emb.dim, "time_s": round(elapsed, 3), "n_texts": len(texts), "texts_per_sec": round(len(texts) / elapsed, 1), } return results