File size: 2,451 Bytes
61bd74f
 
 
 
 
e82be7a
61bd74f
 
e82be7a
61bd74f
 
e82be7a
 
 
 
 
a99846a
 
 
 
 
61bd74f
 
e82be7a
 
 
a99846a
61bd74f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a99846a
 
 
 
 
61bd74f
 
a99846a
61bd74f
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# infra/hf_spaces/embedder/app.py
# Serves BAAI/bge-small-en-v1.5 embeddings over HTTP.
# Model is loaded from /app/model_cache (baked into the Docker image at build time).

from contextlib import asynccontextmanager
from typing import Annotated, Any

from fastapi import FastAPI
from pydantic import BaseModel, Field
from sentence_transformers import SentenceTransformer

# 64 texts * 2000 chars = 128KB max payload — keeps the free-tier Space under
# its 16GB RAM limit even with the largest expected retrieval batch (top-20).
_MAX_TEXTS = 64
_MAX_TEXT_LEN = 2000

# BGE model card specifies this prefix for query embeddings in asymmetric retrieval.
# Document embeddings must NOT use this prefix — only query-time calls set is_query=True.
# Paper shows 2-4% NDCG improvement over no-prefix symmetric mode.
_BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "


class EmbedRequest(BaseModel):
    """Payload for POST /embed.

    At most _MAX_TEXTS strings of at most _MAX_TEXT_LEN characters each;
    Pydantic rejects oversized payloads with a 422 before the model runs.
    """

    # Outer Annotated bounds the list length, inner one bounds each string —
    # same constraints as Field(..., max_length=...) defaults, v2 Annotated form.
    texts: Annotated[
        list[Annotated[str, Field(max_length=_MAX_TEXT_LEN)]],
        Field(max_length=_MAX_TEXTS),
    ]
    # When True, each text gets the BGE asymmetric-retrieval query prefix.
    is_query: bool = False


class EmbedResponse(BaseModel):
    """Response for POST /embed: one embedding vector per input text, in order."""

    # Plain nested lists of floats (serialized from the model's numpy output).
    embeddings: list[list[float]]


@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load the embedding model once at startup.

    Reads the model from the image-local cache directory, so startup makes
    no network calls. The loaded model is published on ``app.state.model``
    and cleared again at shutdown.
    """
    # Build fully before publishing to app.state.
    model = SentenceTransformer(
        "BAAI/bge-small-en-v1.5",
        cache_folder="/app/model_cache",
    )
    # Inference-only service — switch off training-mode layers.
    model.eval()
    app.state.model = model
    yield
    # Shutdown: drop the reference so probes report "loading" again.
    app.state.model = None


# Application instance. The interactive docs and the OpenAPI schema are all
# disabled (docs_url/redoc_url/openapi_url=None) — this Space only serves
# /health and /embed and should not advertise its schema publicly.
app = FastAPI(
    title="PersonaBot Embedder",
    lifespan=lifespan,
    docs_url=None,
    redoc_url=None,
    openapi_url=None,
)


@app.get("/health")
async def health() -> dict[str, str]:
    """Readiness probe: "ok" once the model is loaded, "loading" otherwise.

    Uses getattr because ``app.state.model`` does not exist at all until the
    lifespan startup has run (e.g. when the app object is imported or probed
    without the lifespan context active) — a bare attribute access would
    raise AttributeError and turn the probe into an HTTP 500 instead of a
    "loading" response. After shutdown the attribute is None, which takes
    the same branch.
    """
    if getattr(app.state, "model", None) is None:
        return {"status": "loading"}
    return {"status": "ok"}


@app.post("/embed", response_model=EmbedResponse)
def embed(request: EmbedRequest) -> EmbedResponse:
    """Embed a batch of texts with bge-small-en-v1.5.

    Declared as a plain ``def`` (not ``async def``) on purpose:
    ``model.encode`` is CPU-bound and synchronous, and inside an async
    endpoint it would block the event loop for the entire encode, starving
    /health and every other request. FastAPI runs sync endpoints in its
    threadpool, so the loop stays responsive while an encode is in flight.

    Returns one embedding per input text, in input order; an empty ``texts``
    list short-circuits to an empty response without touching the model.
    """
    if not request.texts:
        return EmbedResponse(embeddings=[])
    if request.is_query:
        # BGE asymmetric retrieval: only query-side texts get the
        # instruction prefix; document texts are embedded as-is.
        texts = [_BGE_QUERY_PREFIX + t for t in request.texts]
    else:
        texts = request.texts
    # encode() returns a numpy array, shape (N, 384) for bge-small.
    # normalize_embeddings=True yields unit-length vectors, so cosine
    # similarity downstream reduces to a dot product.
    vectors: Any = app.state.model.encode(
        texts,
        batch_size=32,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    return EmbedResponse(embeddings=vectors.tolist())