# infra/hf_spaces/embedder/app.py
#
# Serves BAAI/bge-small-en-v1.5 embeddings over HTTP.
# Model is loaded from /app/model_cache (baked into the Docker image at build time).

from contextlib import asynccontextmanager
from typing import Annotated, Any

from fastapi import FastAPI
from pydantic import BaseModel, Field
from sentence_transformers import SentenceTransformer

# 64 texts * 2000 chars = 128KB max payload — keeps the free-tier Space under
# its 16GB RAM limit even with the largest expected retrieval batch (top-20).
_MAX_TEXTS = 64
_MAX_TEXT_LEN = 2000

# BGE model card specifies this prefix for query embeddings in asymmetric retrieval.
# Document embeddings must NOT use this prefix — only query-time calls set is_query=True.
# Paper shows 2-4% NDCG improvement over no-prefix symmetric mode.
_BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "


class EmbedRequest(BaseModel):
    """Request body for POST /embed."""

    # Bounded per-text and per-batch; see the payload-size comment above.
    texts: list[Annotated[str, Field(max_length=_MAX_TEXT_LEN)]] = Field(
        ..., max_length=_MAX_TEXTS
    )
    is_query: bool = False  # True → prepend BGE asymmetric query instruction


class EmbedResponse(BaseModel):
    """Response body: one embedding vector per input text, in input order."""

    embeddings: list[list[float]]


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the model once at startup; drop the reference at shutdown."""
    # Load from baked-in cache path — no network call at startup.
    # BGE normalises embeddings by default; no manual L2 step needed.
    app.state.model = SentenceTransformer(
        "BAAI/bge-small-en-v1.5",
        cache_folder="/app/model_cache",
    )
    app.state.model.eval()
    yield
    app.state.model = None


app = FastAPI(
    title="PersonaBot Embedder",
    lifespan=lifespan,
    docs_url=None,
    redoc_url=None,
    openapi_url=None,
)


@app.get("/health")
async def health() -> dict[str, str]:
    """Liveness/readiness probe.

    Uses ``getattr``: before lifespan startup completes, ``app.state`` has
    no ``model`` attribute at all, and plain attribute access would raise
    AttributeError (a 500) instead of reporting ``{"status": "loading"}``.
    """
    if getattr(app.state, "model", None) is None:
        return {"status": "loading"}
    return {"status": "ok"}


@app.post("/embed", response_model=EmbedResponse)
def embed(request: EmbedRequest) -> EmbedResponse:
    """Embed a batch of texts; set ``is_query=True`` for retrieval queries.

    NOTE: deliberately a *sync* ``def`` — ``model.encode`` is a blocking,
    CPU-bound forward pass, and FastAPI runs sync endpoints in its
    threadpool. The previous ``async def`` version executed encode directly
    on the event loop, stalling every concurrent request (including
    /health) for the duration of the inference call.
    """
    if not request.texts:
        return EmbedResponse(embeddings=[])

    texts = (
        [_BGE_QUERY_PREFIX + t for t in request.texts]
        if request.is_query
        else request.texts
    )

    # encode with batch_size=32, returns numpy array shape (N, 384)
    vectors: Any = app.state.model.encode(
        texts,
        batch_size=32,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    return EmbedResponse(embeddings=vectors.tolist())