# backend/app/services/embedder.py
# Dual-mode embedder.
# - local (ENVIRONMENT != prod): lazy-loads SentenceTransformer in-process on first call.
# - prod: calls the HuggingFace personabot-embedder Space via async HTTP.
# API Space stays at <256MB — no model weights ever loaded there.
from typing import Any, Optional
import httpx
# Module-level cache for the local model, shared by all Embedder instances.
# Populated lazily by _get_local_model() on first use and reused afterwards;
# this avoids loading ~90MB of weights at import time (keeps tests fast).
_local_model: Optional[Any] = None
def _get_local_model() -> Any:
    """Return the process-wide SentenceTransformer, loading it on first call.

    The import of sentence_transformers is deferred to this function so the
    module can be imported (e.g. by tests) without pulling in model weights.
    """
    global _local_model  # noqa: PLW0603
    if _local_model is not None:
        return _local_model
    from sentence_transformers import SentenceTransformer

    # BGE normalises embeddings by default; no manual L2 step needed.
    _local_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cpu")
    return _local_model
# BGE asymmetric query instruction — prepended locally when is_query=True and
# the environment is local. In prod the HF Space accepts is_query and prepends
# the same prefix server-side, so both modes produce matching query vectors.
# Documents/passages are never prefixed (asymmetric retrieval scheme).
_BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
class Embedder:
    """Dual-mode text embedder.

    - remote (environment == "prod" and a remote_url is given): POSTs to the
      HF Space's /embed endpoint over async HTTP; no weights loaded locally.
    - local (anything else): lazily loads SentenceTransformer in-process.
    """

    def __init__(
        self,
        remote_url: str = "",
        environment: str = "local",
        timeout: float = 30.0,
    ) -> None:
        """
        Args:
            remote_url: Base URL of the embedding Space; trailing slashes are
                stripped. Ignored unless environment == "prod".
            environment: "prod" enables remote mode; any other value (e.g.
                "local") uses the in-process model.
            timeout: Per-request HTTP timeout in seconds for remote calls.
                Defaults to 30.0 (the previous hard-coded value).
        """
        self._remote = environment == "prod" and bool(remote_url)
        self._url = remote_url.rstrip("/") if self._remote else ""
        self._timeout = timeout

    async def embed(self, texts: list[str], is_query: bool = False) -> list[list[float]]:
        """
        Encode texts; returns a list of L2-normalised 384-dim float vectors.

        is_query=True: prepend the BGE asymmetric query instruction (queries only).
        is_query=False: encode as-is (document/ingestion embeddings).
        See BGE paper: 2-4% NDCG gain from using the correct prefix on queries.

        Returns [] for empty input without touching the network or the model.

        Raises:
            httpx.HTTPStatusError: in remote mode, if the Space returns a
                non-2xx response (via raise_for_status).
        """
        if not texts:
            return []
        if self._remote:
            # HF Space handles the prefix server-side when is_query=True.
            async with httpx.AsyncClient(timeout=self._timeout) as client:
                resp = await client.post(
                    f"{self._url}/embed",
                    json={"texts": texts, "is_query": is_query},
                )
                resp.raise_for_status()
                return resp.json()["embeddings"]
        model = _get_local_model()
        if is_query:
            texts = [_BGE_QUERY_PREFIX + t for t in texts]
        # normalize_embeddings=True gives unit-length vectors (cosine-ready).
        vectors = model.encode(
            texts, batch_size=32, normalize_embeddings=True, show_progress_bar=False
        )
        return vectors.tolist()

    async def embed_one(self, text: str, is_query: bool = False) -> list[float]:
        """Convenience wrapper for a single string."""
        results = await self.embed([text], is_query=is_query)
        return results[0]
|