# GitHub Actions deploy metadata: d8ad462 / a99846a
# infra/hf_spaces/embedder/app.py
# Serves BAAI/bge-small-en-v1.5 embeddings over HTTP.
# Model is loaded from /app/model_cache (baked into the Docker image at build time).
from contextlib import asynccontextmanager
from typing import Annotated, Any
from fastapi import FastAPI
from pydantic import BaseModel, Field
from sentence_transformers import SentenceTransformer
# Payload caps enforced by pydantic on /embed requests.
# 64 texts * 2000 chars = 128KB max payload — keeps the free-tier Space under
# its 16GB RAM limit even with the largest expected retrieval batch (top-20).
_MAX_TEXTS: int = 64  # maximum number of texts per /embed call
_MAX_TEXT_LEN: int = 2000  # maximum characters per individual text
# BGE model card specifies this prefix for query embeddings in asymmetric retrieval.
# Document embeddings must NOT use this prefix — only query-time calls set is_query=True.
# Paper shows 2-4% NDCG improvement over no-prefix symmetric mode.
_BGE_QUERY_PREFIX: str = "Represent this sentence for searching relevant passages: "
class EmbedRequest(BaseModel):
    """Request body for POST /embed."""

    # Batch of texts to embed. The inner Annotated/Field caps each string at
    # _MAX_TEXT_LEN characters; the outer Field caps the list at _MAX_TEXTS
    # items. `...` marks the field as required.
    texts: list[Annotated[str, Field(max_length=_MAX_TEXT_LEN)]] = Field(
        ..., max_length=_MAX_TEXTS
    )
    is_query: bool = False # True → prepend BGE asymmetric query instruction
class EmbedResponse(BaseModel):
    """Response body for POST /embed: one embedding vector per input text,
    in the same order as the request's ``texts``."""

    embeddings: list[list[float]]
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the embedding model once at startup; drop it at shutdown.

    Weights come from the cache directory baked into the Docker image, so
    startup performs no network calls. BGE normalises its embeddings by
    default, so no manual L2 step is needed downstream.
    """
    model = SentenceTransformer(
        "BAAI/bge-small-en-v1.5",
        cache_folder="/app/model_cache",
    )
    model.eval()  # inference mode (disables dropout etc.)
    app.state.model = model
    yield
    # Shutdown: release the reference so the health probe reports "loading".
    app.state.model = None
# Application instance. Interactive docs and the OpenAPI schema are disabled
# (docs_url/redoc_url/openapi_url all None): this service only exposes the two
# internal endpoints below and should not advertise a public API surface.
app = FastAPI(
    title="PersonaBot Embedder",
    lifespan=lifespan,
    docs_url=None,
    redoc_url=None,
    openapi_url=None,
)
@app.get("/health")
async def health() -> dict[str, str]:
    """Readiness probe: "ok" once the model is loaded, "loading" otherwise.

    Uses getattr because ``app.state.model`` does not exist at all until the
    lifespan startup has run (and in deployments/tests where lifespan is not
    executed) — a plain attribute read would raise AttributeError and turn
    the probe into a 500 instead of reporting "loading".
    """
    if getattr(app.state, "model", None) is None:
        return {"status": "loading"}
    return {"status": "ok"}
@app.post("/embed", response_model=EmbedResponse)
def embed(request: EmbedRequest) -> EmbedResponse:
    """Embed a batch of texts with bge-small-en-v1.5.

    Declared as a plain ``def`` (not ``async def``) deliberately:
    ``model.encode`` is blocking, CPU-bound work, and an async handler would
    stall the event loop — including /health — for the entire inference.
    Starlette runs sync path operations in its threadpool, so the loop stays
    responsive. The HTTP interface is unchanged.
    """
    if not request.texts:
        # Empty batch: skip the model call entirely.
        return EmbedResponse(embeddings=[])
    # BGE asymmetric retrieval: query-side texts get the instruction prefix,
    # document-side texts are embedded verbatim.
    texts = (
        [_BGE_QUERY_PREFIX + t for t in request.texts]
        if request.is_query
        else request.texts
    )
    # encode with batch_size=32 returns a numpy array of shape (N, 384) for
    # this model; vectors are already L2-normalised (normalize_embeddings=True).
    vectors: Any = app.state.model.encode(
        texts,
        batch_size=32,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    return EmbedResponse(embeddings=vectors.tolist())