Spaces:
Running
Running
| # backend/app/services/embedder.py | |
| # Dual-mode embedder. | |
| # - local (ENVIRONMENT != prod): lazy-loads SentenceTransformer in-process on first call. | |
| # - prod: calls the HuggingFace personabot-embedder Space via async HTTP. | |
| # API Space stays at <256MB — no model weights ever loaded there. | |
| from typing import Any, Optional | |
| import httpx | |
# Module-level cache for the local model: loaded lazily on first use and
# reused for the life of the process. Keeps the 90MB of weights out of
# import time (important for tests).
_local_model: Optional[Any] = None


def _get_local_model() -> Any:
    """Return the process-wide SentenceTransformer, loading it on first call."""
    global _local_model  # noqa: PLW0603
    if _local_model is not None:
        return _local_model
    # Deferred import so the package is only required when local mode is used.
    from sentence_transformers import SentenceTransformer

    # BGE normalises embeddings by default; no manual L2 step needed.
    _local_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cpu")
    return _local_model
# BGE asymmetric-retrieval query instruction. Prepended in-process when
# is_query=True and the embedder runs in local mode; in prod the HF Space
# accepts the is_query flag and prepends the instruction server-side.
# Documents/passages are always encoded without it.
_BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
class Embedder:
    """Dual-mode embedder.

    Local mode (default): encodes in-process with a lazily loaded
    SentenceTransformer. Prod mode (environment == "prod" AND a remote URL
    is configured): delegates to the HF personabot-embedder Space over
    async HTTP so no model weights are loaded in the API Space.
    """

    def __init__(self, remote_url: str = "", environment: str = "local") -> None:
        # Remote only when running in prod AND a Space URL was supplied;
        # any other combination falls back to the in-process model.
        use_remote = environment == "prod" and bool(remote_url)
        self._remote = use_remote
        self._url = remote_url.rstrip("/") if use_remote else ""

    async def embed(self, texts: list[str], is_query: bool = False) -> list[list[float]]:
        """
        Encode *texts*, returning L2-normalised 384-dim float vectors.

        is_query=True: prepend the BGE asymmetric query instruction
        (queries only). is_query=False: encode as-is (document/ingestion
        embeddings). See BGE paper: 2-4% NDCG gain from using the correct
        prefix on queries.
        """
        if not texts:
            return []

        if self._remote:
            # The HF Space applies the query prefix server-side.
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.post(
                    f"{self._url}/embed",
                    json={"texts": texts, "is_query": is_query},
                )
                response.raise_for_status()
                return response.json()["embeddings"]

        # Local path: prefix queries here, then encode in-process.
        batch = [_BGE_QUERY_PREFIX + t for t in texts] if is_query else texts
        embeddings = _get_local_model().encode(
            batch,
            batch_size=32,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return embeddings.tolist()

    async def embed_one(self, text: str, is_query: bool = False) -> list[float]:
        """Convenience wrapper: embed a single string, return one vector."""
        vectors = await self.embed([text], is_query=is_query)
        return vectors[0]