Spaces:
Running
Running
| # backend/app/services/embedder.py | |
| # Dual-mode embedder. | |
| # - local (ENVIRONMENT != prod): lazy-loads SentenceTransformer in-process on first call. | |
| # - prod: calls the HuggingFace personabot-embedder Space via async HTTP. | |
| # API Space stays at <256MB — no model weights ever loaded there. | |
| from typing import Any, Optional | |
| import httpx | |
# Module-level cache for the local model: loaded lazily on first use and
# reused for the life of the process. Keeps the 90MB of weights out of
# import time (important for tests).
_local_model: Optional[Any] = None


def _get_local_model() -> Any:
    """Return the process-wide SentenceTransformer, loading it on first call."""
    global _local_model  # noqa: PLW0603
    if _local_model is not None:
        return _local_model
    # Deferred import so the package is only required when local mode is used.
    from sentence_transformers import SentenceTransformer

    # BGE normalises embeddings by default; no manual L2 step needed.
    _local_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cpu")
    return _local_model
# BGE asymmetric-retrieval query instruction. Prepended in-process when
# is_query=True and the embedder runs in local mode; in prod the HF Space
# accepts the is_query flag and prepends the instruction server-side.
# Documents/passages are always encoded without it.
_BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
class Embedder:
    """Dual-mode embedder.

    Local mode (default): encodes in-process with a lazily loaded
    SentenceTransformer. Prod mode (environment == "prod" AND a remote URL
    is configured): delegates to the HF personabot-embedder Space over
    async HTTP so no model weights are loaded in the API Space.
    """

    def __init__(self, remote_url: str = "", environment: str = "local") -> None:
        # Remote only when running in prod AND a Space URL was supplied;
        # any other combination falls back to the in-process model.
        use_remote = environment == "prod" and bool(remote_url)
        self._remote = use_remote
        self._url = remote_url.rstrip("/") if use_remote else ""

    async def embed(self, texts: list[str], is_query: bool = False) -> list[list[float]]:
        """
        Encode *texts*, returning L2-normalised 384-dim float vectors.

        is_query=True: prepend the BGE asymmetric query instruction
        (queries only). is_query=False: encode as-is (document/ingestion
        embeddings). See BGE paper: 2-4% NDCG gain from using the correct
        prefix on queries.
        """
        if not texts:
            return []

        if self._remote:
            # The HF Space applies the query prefix server-side.
            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.post(
                    f"{self._url}/embed",
                    json={"texts": texts, "is_query": is_query},
                )
                response.raise_for_status()
                return response.json()["embeddings"]

        # Local path: prefix queries here, then encode in-process.
        batch = [_BGE_QUERY_PREFIX + t for t in texts] if is_query else texts
        embeddings = _get_local_model().encode(
            batch,
            batch_size=32,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return embeddings.tolist()

    async def embed_one(self, text: str, is_query: bool = False) -> list[float]:
        """Convenience wrapper: embed a single string, return one vector."""
        vectors = await self.embed([text], is_query=is_query)
        return vectors[0]