Spaces:
Sleeping
Sleeping
"""
src/retrieval/embedder.py
--------------------------
Extended embedder supporting multiple models for benchmarking.
Compatible with lab_s6 Embedder API (encode / encode_one).
"""
| from typing import Literal | |
| import numpy as np | |
# Short benchmark keys mapped to the model identifiers handed to
# SentenceTransformer. Keys not listed here are passed through unchanged
# by Embedder.__init__.
SUPPORTED_MODELS = {
    "multilingual-e5-small": "intfloat/multilingual-e5-small",
    "bge-small-en": "BAAI/bge-small-en-v1.5",
    "minilm-l6": "sentence-transformers/all-MiniLM-L6-v2",
}
class Embedder:
    """Wrapper around a ``SentenceTransformer`` model exposing encode / encode_one.

    Attributes set by the constructor:
        model_key:  the key (or raw model name) the caller asked for.
        model_name: the identifier actually passed to ``SentenceTransformer``.
        model:      the loaded ``SentenceTransformer`` instance.
        dim:        value of ``model.get_sentence_embedding_dimension()``.
    """

    def __init__(self, model_key: str = "multilingual-e5-small"):
        """Load the model registered under ``model_key``.

        A key that is not present in ``SUPPORTED_MODELS`` is used verbatim as
        the model name.
        """
        # Imported here rather than at module level, so the dependency is only
        # required once an Embedder is actually constructed.
        from sentence_transformers import SentenceTransformer

        self.model_key = model_key
        self.model_name = SUPPORTED_MODELS.get(model_key, model_key)
        self.model = SentenceTransformer(self.model_name)
        self.dim = self.model.get_sentence_embedding_dimension()
        print(f"✅ Loaded embedder: {self.model_name} (dim={self.dim})")

    def encode(self, texts: list[str], show_progress: bool = True) -> np.ndarray:
        """Embed ``texts`` in batches of 32 with ``normalize_embeddings=True``.

        ``show_progress`` is forwarded as the model's ``show_progress_bar``
        option. Returns whatever the underlying ``model.encode`` call returns.
        """
        encode_options = {
            "show_progress_bar": show_progress,
            "normalize_embeddings": True,
            "batch_size": 32,
        }
        return self.model.encode(texts, **encode_options)

    def encode_one(self, text: str) -> list[float]:
        """Embed a single string and return its vector as a plain list."""
        # The model is given a one-element batch; only the first row is kept.
        batch = self.model.encode([text], normalize_embeddings=True)
        first_row = batch[0]
        return first_row.tolist()

    def __repr__(self):
        """Return a short description showing the model name and dimension."""
        return f"Embedder(model={self.model_name}, dim={self.dim})"
# The annotation is quoted so the ``|`` union is never evaluated at import
# time on interpreters older than 3.10.
def benchmark_embedders(texts: list[str], model_keys: "list[str] | None" = None) -> dict:
    """Measure how long each embedding model takes to encode ``texts``.

    Every model is loaded through ``Embedder``; only the ``encode`` call is
    timed, so model loading is excluded from the reported figures.

    Args:
        texts: Strings to embed with every model.
        model_keys: Keys to benchmark, each passed to ``Embedder``. ``None``
            benchmarks every key in ``SUPPORTED_MODELS``.

    Returns:
        A dict mapping each model key to a dict with:
        ``model_name``, ``dim``, ``time_s`` (seconds, rounded to 3 decimals),
        ``n_texts`` and ``texts_per_sec`` (rounded to 1 decimal; ``inf`` when
        the elapsed time was too small to measure).
    """
    import time

    if model_keys is None:
        model_keys = list(SUPPORTED_MODELS)

    n_texts = len(texts)
    results = {}
    for key in model_keys:
        emb = Embedder(key)

        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump with system clock adjustments.
        t0 = time.perf_counter()
        emb.encode(texts, show_progress=False)
        elapsed = time.perf_counter() - t0

        # Guard against a zero interval so a very fast run cannot raise
        # ZeroDivisionError.
        throughput = n_texts / elapsed if elapsed > 0 else float("inf")

        results[key] = {
            "model_name": emb.model_name,
            "dim": emb.dim,
            "time_s": round(elapsed, 3),
            "n_texts": n_texts,
            "texts_per_sec": round(throughput, 1),
        }
    return results