# backend/app/services/embedder.py
# Dual-mode embedder.
# - local (ENVIRONMENT != prod): lazy-loads SentenceTransformer in-process on first call.
# - prod: calls the HuggingFace personabot-embedder Space via async HTTP.
# API Space stays at <256MB — no model weights ever loaded there.

from typing import Any, Optional

# Module-level cache for the local model. Loaded on first call, reused after.
# This avoids loading 90MB of weights at import time in tests.
_local_model: Optional[Any] = None


def _get_local_model() -> Any:
    """Return the process-wide SentenceTransformer, loading it on first use."""
    global _local_model  # noqa: PLW0603
    if _local_model is None:
        # Deferred import: keeps sentence_transformers (and its weights) out
        # of environments that never take the local path.
        from sentence_transformers import SentenceTransformer

        # BGE normalises embeddings by default; no manual L2 step needed.
        _local_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cpu")
    return _local_model


# BGE asymmetric query instruction — prepended locally when is_query=True and
# environment is local. In prod the HF Space accepts is_query and prepends itself.
_BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "


class Embedder:
    """Encodes text to L2-normalised 384-dim BGE embeddings.

    Remote mode is active only when environment == "prod" AND a non-empty
    remote_url is supplied; every other combination uses the in-process model.
    """

    def __init__(self, remote_url: str = "", environment: str = "local") -> None:
        # NOTE(review): prod with an empty remote_url silently falls back to
        # the local model, which would pull 90MB of weights into the <256MB
        # API Space — confirm callers always pass remote_url in prod.
        self._remote = environment == "prod" and bool(remote_url)
        # Normalise trailing slash so f"{self._url}/embed" never doubles it.
        self._url = remote_url.rstrip("/") if self._remote else ""

    async def embed(self, texts: list[str], is_query: bool = False) -> list[list[float]]:
        """
        Encodes texts, returns List of L2-normalised 384-dim float vectors.

        is_query=True: prepend BGE asymmetric query instruction (queries only).
        is_query=False: encode as-is (document/ingestion embeddings).
        See BGE paper: 2-4% NDCG gain from using the correct prefix on queries.
        """
        if not texts:
            return []
        if self._remote:
            # Deferred import mirrors _get_local_model: httpx is only needed
            # on the remote path, so local/test environments don't require it.
            import httpx

            # HF Space handles the prefix server-side when is_query=True.
            # A fresh client per call is fine at current volumes; switch to a
            # pooled AsyncClient if embed() ever becomes hot.
            async with httpx.AsyncClient(timeout=30.0) as client:
                resp = await client.post(
                    f"{self._url}/embed",
                    json={"texts": texts, "is_query": is_query},
                )
                resp.raise_for_status()
                return resp.json()["embeddings"]
        model = _get_local_model()
        if is_query:
            texts = [_BGE_QUERY_PREFIX + t for t in texts]
        vectors = model.encode(
            texts, batch_size=32, normalize_embeddings=True, show_progress_bar=False
        )
        return vectors.tolist()

    async def embed_one(self, text: str, is_query: bool = False) -> list[float]:
        """Convenience wrapper for a single string."""
        results = await self.embed([text], is_query=is_query)
        return results[0]