# (removed scrape artifact: Hugging Face "Spaces: Running" status banner)
# infra/hf_spaces/embedder/app.py
# Serves BAAI/bge-small-en-v1.5 embeddings over HTTP.
# Model is loaded from /app/model_cache (baked into the Docker image at build time).
from contextlib import asynccontextmanager
from typing import Annotated, Any

from fastapi import FastAPI
from pydantic import BaseModel, Field
from sentence_transformers import SentenceTransformer
| # 64 texts * 2000 chars = 128KB max payload β keeps the free-tier Space under | |
| # its 16GB RAM limit even with the largest expected retrieval batch (top-20). | |
| _MAX_TEXTS = 64 | |
| _MAX_TEXT_LEN = 2000 | |
| # BGE model card specifies this prefix for query embeddings in asymmetric retrieval. | |
| # Document embeddings must NOT use this prefix β only query-time calls set is_query=True. | |
| # Paper shows 2-4% NDCG improvement over no-prefix symmetric mode. | |
| _BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: " | |
class EmbedRequest(BaseModel):
    """Batch embedding request.

    `texts` is capped at _MAX_TEXTS items of at most _MAX_TEXT_LEN chars
    each; set `is_query=True` to prepend the BGE asymmetric query
    instruction before encoding.
    """

    # Annotated form is equivalent to `= Field(..., max_length=_MAX_TEXTS)`:
    # a required list field with an item-count constraint.
    texts: Annotated[
        list[Annotated[str, Field(max_length=_MAX_TEXT_LEN)]],
        Field(max_length=_MAX_TEXTS),
    ]
    is_query: bool = False  # True -> prepend BGE asymmetric query instruction
class EmbedResponse(BaseModel):
    """One embedding vector (list of floats) per input text."""

    embeddings: list[list[float]]
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the embedding model once at startup; release it at shutdown.

    FastAPI's `lifespan=` parameter expects an async context manager
    factory, so the `@asynccontextmanager` decorator (imported at the top
    of the file but previously unused) is required — without it the app
    fails when the server starts.
    """
    # Load from the cache path baked into the Docker image — no network
    # call at startup. BGE normalises embeddings by default; no manual
    # L2-normalisation step is needed downstream.
    app.state.model = SentenceTransformer(
        "BAAI/bge-small-en-v1.5",
        cache_folder="/app/model_cache",
    )
    app.state.model.eval()  # inference mode (disables dropout etc.)
    yield
    # Drop the reference at shutdown so the model can be reclaimed.
    app.state.model = None
# Public Space: interactive docs and the OpenAPI schema are deliberately
# disabled — this service is only called programmatically.
app = FastAPI(
    lifespan=lifespan,
    title="PersonaBot Embedder",
    openapi_url=None,
    docs_url=None,
    redoc_url=None,
)
@app.get("/health")
async def health() -> dict[str, str]:
    """Liveness/readiness probe.

    Returns {"status": "loading"} until the lifespan startup has installed
    the model, {"status": "ok"} afterwards.

    NOTE(review): the original function was never registered as a route
    (no decorator anywhere in the file) — restored as GET /health; confirm
    this is the path the Space's health check expects.
    """
    # getattr guard: before lifespan startup completes, `app.state` has no
    # `model` attribute and a bare access would raise AttributeError.
    if getattr(app.state, "model", None) is None:
        return {"status": "loading"}
    return {"status": "ok"}
@app.post("/embed")
async def embed(request: EmbedRequest) -> EmbedResponse:
    """Embed a batch of texts; returns one vector per input text.

    When `request.is_query` is true, each text gets the BGE asymmetric
    query instruction prepended; document-side texts are encoded as-is.

    NOTE(review): the original function was never registered as a route
    (no decorator anywhere in the file) — restored as POST /embed; confirm
    the path against the client code.
    """
    # Empty batch short-circuit: avoid calling encode() on an empty list.
    if not request.texts:
        return EmbedResponse(embeddings=[])
    texts = (
        [_BGE_QUERY_PREFIX + t for t in request.texts]
        if request.is_query
        else request.texts
    )
    # encode() returns a numpy array of shape (N, 384) (per the original
    # comment), L2-normalized via normalize_embeddings=True.
    # NOTE(review): encode() is CPU-bound and blocks the event loop for the
    # whole batch — tolerable for a single-worker Space, but consider
    # fastapi.concurrency.run_in_threadpool under real concurrency.
    vectors: Any = app.state.model.encode(
        texts,
        batch_size=32,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    return EmbedResponse(embeddings=vectors.tolist())