genAI-Project / src /retrieval /embedder.py
OGB2000's picture
Initial clean deployment
bf77be6
Raw
History Blame Contribute Delete
1.96 kB
"""
src/retrieval/embedder.py
--------------------------
Extended embedder supporting multiple models for benchmarking.
Compatible with lab_s6 Embedder API (encode / encode_one).
"""
from typing import Literal
import numpy as np
# Short benchmark aliases mapped to the full model names that Embedder passes
# to SentenceTransformer. Embedder uses a key that is not listed here
# unchanged as the model name, and benchmark_embedders falls back to all of
# these keys when it is not given an explicit list.
SUPPORTED_MODELS = {
"multilingual-e5-small": "intfloat/multilingual-e5-small",
"bge-small-en": "BAAI/bge-small-en-v1.5",
"minilm-l6": "sentence-transformers/all-MiniLM-L6-v2",
}
class Embedder:
    """Thin wrapper around a SentenceTransformer exposing encode / encode_one.

    Attributes:
        model_key: The value given to the constructor (alias or raw name).
        model_name: The name actually passed to SentenceTransformer.
        model: The loaded SentenceTransformer instance.
        dim: Result of the model's get_sentence_embedding_dimension().
    """

    def __init__(self, model_key: str = "multilingual-e5-small"):
        """Load the model selected by *model_key* and print a confirmation.

        A key present in SUPPORTED_MODELS is translated to its full model
        name; any other value is used unchanged as the model name.
        """
        # Function-scope import: sentence_transformers is only loaded when an
        # Embedder is actually constructed, not when this module is imported.
        from sentence_transformers import SentenceTransformer

        if model_key in SUPPORTED_MODELS:
            model_name = SUPPORTED_MODELS[model_key]
        else:
            model_name = model_key

        self.model_key = model_key
        self.model_name = model_name

        transformer = SentenceTransformer(model_name)
        self.model = transformer
        self.dim = transformer.get_sentence_embedding_dimension()

        print(f"✅ Loaded embedder: {model_name} (dim={self.dim})")

    def encode(self, texts: list[str], show_progress: bool = True) -> np.ndarray:
        """Encode *texts* in batches of 32 with normalize_embeddings=True.

        Args:
            texts: The texts to embed.
            show_progress: Forwarded to the model as ``show_progress_bar``.

        Returns:
            Whatever the underlying model's ``encode`` returns for *texts*.
        """
        encode_options = {
            "show_progress_bar": show_progress,
            "normalize_embeddings": True,
            "batch_size": 32,
        }
        return self.model.encode(texts, **encode_options)

    def encode_one(self, text: str) -> list[float]:
        """Encode a single *text* and return its embedding as a plain list."""
        # The model is always called with a one-element batch; the first (and
        # only) row is then converted with ``tolist()``.
        vectors = self.model.encode([text], normalize_embeddings=True)
        first_vector = vectors[0]
        return first_vector.tolist()

    def __repr__(self):
        """Return a short description showing the model name and dimension."""
        description = f"Embedder(model={self.model_name}, dim={self.dim})"
        return description
def benchmark_embedders(texts: list[str], model_keys: "list[str] | None" = None) -> dict:
    """Time how long each embedding model takes to encode *texts*.

    One ``Embedder`` is constructed per key and its ``encode`` method is run
    once over the whole input with the progress bar disabled. Only the
    ``encode`` call is timed; constructing the ``Embedder`` is not.

    Args:
        texts: Texts to encode with every model.
        model_keys: Keys handed to ``Embedder``. ``None`` (the default)
            benchmarks every key in ``SUPPORTED_MODELS``.

    Returns:
        A dict keyed by model key. Each value is a dict with ``model_name``,
        ``dim``, ``time_s`` (seconds, rounded to 3 decimals), ``n_texts`` and
        ``texts_per_sec`` (rounded to 1 decimal). If the measured interval is
        zero, ``texts_per_sec`` is ``0.0`` for empty input and ``inf``
        otherwise.
    """
    import time

    if model_keys is None:
        model_keys = list(SUPPORTED_MODELS)

    n_texts = len(texts)
    results = {}
    for key in model_keys:
        emb = Embedder(key)

        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() follows the wall clock, which can
        # be adjusted mid-run and is coarse on some platforms.
        t0 = time.perf_counter()
        emb.encode(texts, show_progress=False)
        elapsed = time.perf_counter() - t0

        if elapsed > 0:
            texts_per_sec = round(n_texts / elapsed, 1)
        else:
            # A zero-length interval would otherwise raise ZeroDivisionError.
            texts_per_sec = float("inf") if n_texts else 0.0

        results[key] = {
            "model_name": emb.model_name,
            "dim": emb.dim,
            "time_s": round(elapsed, 3),
            "n_texts": n_texts,
            "texts_per_sec": texts_per_sec,
        }
    return results