# GitHub Actions deploy metadata: d8ad462 / a99846a
# infra/hf_spaces/embedder/app.py
# Serves BAAI/bge-small-en-v1.5 embeddings over HTTP.
# Model is loaded from /app/model_cache (baked into the Docker image at build time).
from contextlib import asynccontextmanager
from typing import Annotated, Any
from fastapi import FastAPI
from pydantic import BaseModel, Field
from sentence_transformers import SentenceTransformer
# Payload caps enforced by pydantic on /embed requests.
# 64 texts * 2000 chars = 128KB max payload — keeps the free-tier Space under
# its 16GB RAM limit even with the largest expected retrieval batch (top-20).
_MAX_TEXTS: int = 64  # maximum number of texts per /embed call
_MAX_TEXT_LEN: int = 2000  # maximum characters per individual text
# BGE model card specifies this prefix for query embeddings in asymmetric retrieval.
# Document embeddings must NOT use this prefix — only query-time calls set is_query=True.
# Paper shows 2-4% NDCG improvement over no-prefix symmetric mode.
_BGE_QUERY_PREFIX: str = "Represent this sentence for searching relevant passages: "
class EmbedRequest(BaseModel):
    """Request body for POST /embed."""

    # Batch of texts to embed. The inner Annotated/Field caps each string at
    # _MAX_TEXT_LEN characters; the outer Field caps the list at _MAX_TEXTS
    # items. `...` marks the field as required.
    texts: list[Annotated[str, Field(max_length=_MAX_TEXT_LEN)]] = Field(
        ..., max_length=_MAX_TEXTS
    )
    is_query: bool = False # True → prepend BGE asymmetric query instruction
class EmbedResponse(BaseModel):
    """Response body for POST /embed: one embedding vector per input text,
    in the same order as the request's ``texts``."""

    embeddings: list[list[float]]
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the embedding model once at startup; drop it at shutdown.

    Weights come from the cache directory baked into the Docker image, so
    startup performs no network calls. BGE normalises its embeddings by
    default, so no manual L2 step is needed downstream.
    """
    model = SentenceTransformer(
        "BAAI/bge-small-en-v1.5",
        cache_folder="/app/model_cache",
    )
    model.eval()  # inference mode (disables dropout etc.)
    app.state.model = model
    yield
    # Shutdown: release the reference so the health probe reports "loading".
    app.state.model = None
# Application instance. Interactive docs and the OpenAPI schema are disabled
# (docs_url/redoc_url/openapi_url all None): this service only exposes the two
# internal endpoints below and should not advertise a public API surface.
app = FastAPI(
    title="PersonaBot Embedder",
    lifespan=lifespan,
    docs_url=None,
    redoc_url=None,
    openapi_url=None,
)
@app.get("/health")
async def health() -> dict[str, str]:
    """Readiness probe: "ok" once the model is loaded, "loading" otherwise.

    Uses getattr because ``app.state.model`` does not exist at all until the
    lifespan startup has run (and in deployments/tests where lifespan is not
    executed) — a plain attribute read would raise AttributeError and turn
    the probe into a 500 instead of reporting "loading".
    """
    if getattr(app.state, "model", None) is None:
        return {"status": "loading"}
    return {"status": "ok"}
@app.post("/embed", response_model=EmbedResponse)
def embed(request: EmbedRequest) -> EmbedResponse:
    """Embed a batch of texts with bge-small-en-v1.5.

    Declared as a plain ``def`` (not ``async def``) deliberately:
    ``model.encode`` is blocking, CPU-bound work, and an async handler would
    stall the event loop — including /health — for the entire inference.
    Starlette runs sync path operations in its threadpool, so the loop stays
    responsive. The HTTP interface is unchanged.
    """
    if not request.texts:
        # Empty batch: skip the model call entirely.
        return EmbedResponse(embeddings=[])
    # BGE asymmetric retrieval: query-side texts get the instruction prefix,
    # document-side texts are embedded verbatim.
    texts = (
        [_BGE_QUERY_PREFIX + t for t in request.texts]
        if request.is_query
        else request.texts
    )
    # encode with batch_size=32 returns a numpy array of shape (N, 384) for
    # this model; vectors are already L2-normalised (normalize_embeddings=True).
    vectors: Any = app.state.model.encode(
        texts,
        batch_size=32,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    return EmbedResponse(embeddings=vectors.tolist())