File size: 2,705 Bytes
bbe01fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7c9ee6
 
 
 
 
bbe01fe
 
 
 
 
e7c9ee6
 
 
 
 
 
 
 
bbe01fe
 
 
e7c9ee6
bbe01fe
e7c9ee6
 
 
 
bbe01fe
 
 
e7c9ee6
 
bbe01fe
 
 
e7c9ee6
bbe01fe
e7c9ee6
bbe01fe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# backend/app/services/embedder.py
# Dual-mode embedder.
# - local (ENVIRONMENT != prod): lazy-loads SentenceTransformer in-process on first call.
# - prod: calls the HuggingFace personabot-embedder Space via async HTTP.
#   API Space stays at <256MB — no model weights ever loaded there.

from typing import Any, Optional

import httpx


# Module-level cache for the local model. Loaded on first call, reused after.
# This avoids loading 90MB of weights at import time in tests.
# Typed Any (not SentenceTransformer) so the heavy import stays inside
# _get_local_model(); None means "not loaded yet".
_local_model: Optional[Any] = None


def _get_local_model() -> Any:
    """Return the process-wide SentenceTransformer, loading it on first use."""
    global _local_model  # noqa: PLW0603
    if _local_model is not None:
        return _local_model
    # Deferred import: keeps the heavy sentence_transformers dependency out
    # of module import time (tests never pay for it unless they embed).
    from sentence_transformers import SentenceTransformer

    # BGE normalises embeddings by default; no manual L2 step needed.
    _local_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cpu")
    return _local_model


# BGE asymmetric query instruction — prepended locally when is_query=True and
# environment is local. In prod the HF Space accepts is_query and prepends itself.
# Only queries get the prefix; documents are encoded as-is (asymmetric retrieval).
_BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "


class Embedder:
    """Dual-mode embedding client.

    Local (non-prod, or no remote URL): runs a lazily loaded in-process
    SentenceTransformer. Prod with a remote URL: POSTs to the HuggingFace
    personabot-embedder Space so no model weights load in this process.
    """

    def __init__(self, remote_url: str = "", environment: str = "local") -> None:
        # Remote mode requires both a prod environment and a non-empty URL;
        # anything else falls back to the in-process model.
        use_remote = environment == "prod" and bool(remote_url)
        self._remote = use_remote
        self._url = remote_url.rstrip("/") if use_remote else ""

    async def embed(self, texts: list[str], is_query: bool = False) -> list[list[float]]:
        """
        Encode *texts* into L2-normalised 384-dim float vectors.

        is_query=True prepends the BGE asymmetric query instruction (queries
        only); is_query=False encodes as-is (document/ingestion embeddings).
        See BGE paper: 2-4% NDCG gain from using the correct prefix on queries.
        """
        if not texts:
            return []
        if not self._remote:
            # Local path: prefix queries here; documents go through untouched.
            inputs = [_BGE_QUERY_PREFIX + t for t in texts] if is_query else texts
            encoded = _get_local_model().encode(
                inputs,
                batch_size=32,
                normalize_embeddings=True,
                show_progress_bar=False,
            )
            return encoded.tolist()
        # Remote path: the HF Space applies the prefix server-side when
        # is_query=True. A fresh client per call keeps lifecycle simple.
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{self._url}/embed",
                json={"texts": texts, "is_query": is_query},
            )
            response.raise_for_status()
            return response.json()["embeddings"]

    async def embed_one(self, text: str, is_query: bool = False) -> list[float]:
        """Convenience wrapper: embed a single string, return its vector."""
        vectors = await self.embed([text], is_query=is_query)
        return vectors[0]