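"""FastAPI embeddings sidecar.

Exposes four endpoints, three of them backed by `fastembed`:
  GET  /health               -> liveness check reporting the configured model names
  POST /embed/dense          -> single vectors (default: sentence-transformers/all-MiniLM-L6-v2)
  POST /embed/colbert        -> per-token matrices (default: colbert-ir/colbertv2.0)
  POST /embed/colbert/query  -> per-token matrices produced with `query_embed`

The model names can be overridden with the RAG_DENSE_MODEL and RAG_LATE_MODEL
environment variables. Both models are loaded once during application startup
(by the lifespan hook) and reused for the lifetime of the process. The Next.js
app calls this service via plain HTTP.
"""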
|
|
| from __future__ import annotations |
|
|
| import os |
| from contextlib import asynccontextmanager |
| from pathlib import Path |
| from typing import List |
|
|
| from dotenv import load_dotenv |
|
|
| |
| |
# Load settings from the parent directory's .env.local when that file exists.
# This happens before the model names are read from os.environ further down.
_PARENT_ENV = Path(__file__).resolve().parent.parent.joinpath(".env.local")
if _PARENT_ENV.is_file():
    load_dotenv(dotenv_path=_PARENT_ENV)
|
|
| from fastapi import FastAPI |
| from pydantic import BaseModel |
| from fastembed import TextEmbedding, LateInteractionTextEmbedding |
|
|
# Model identifiers; each default can be overridden via its environment variable.
DENSE_MODEL = os.getenv("RAG_DENSE_MODEL", default="sentence-transformers/all-MiniLM-L6-v2")
LATE_MODEL = os.getenv("RAG_LATE_MODEL", default="colbert-ir/colbertv2.0")


# Process-wide cache of constructed embedding models, keyed by "dense" / "colbert".
_models: dict[str, object] = dict()
|
|
|
|
def _dense() -> TextEmbedding:
    """Return the shared dense embedder, constructing it on first use.

    The instance is built from ``DENSE_MODEL`` and stored in ``_models`` under
    the ``"dense"`` key, so subsequent calls return the same object.
    """
    if "dense" in _models:
        return _models["dense"]
    embedder = TextEmbedding(model_name=DENSE_MODEL)
    _models["dense"] = embedder
    return embedder
|
|
|
|
def _colbert() -> LateInteractionTextEmbedding:
    """Return the shared late-interaction embedder, constructing it on first use.

    The instance is built from ``LATE_MODEL`` and stored in ``_models`` under
    the ``"colbert"`` key, so subsequent calls return the same object.
    """
    if "colbert" in _models:
        return _models["colbert"]
    embedder = LateInteractionTextEmbedding(model_name=LATE_MODEL)
    _models["colbert"] = embedder
    return embedder
|
|
|
|
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Load both embedding models before the application starts serving.

    The dense model is loaded first, then the late-interaction model.
    Nothing runs after the ``yield``, so there is no shutdown work.
    """
    for load_model in (_dense, _colbert):
        load_model()
    yield


# The application object; its lifespan hook is the model loader defined above.
app = FastAPI(lifespan=lifespan)
|
|
|
|
class EmbedRequest(BaseModel):
    # Request body accepted by every POST /embed/* endpoint.
    # texts: the strings to embed; the handlers pass the whole list to the
    # model in a single call.
    texts: List[str]
|
|
|
|
class DenseResponse(BaseModel):
    # Response body for POST /embed/dense.
    # vectors: one flat list of floats per embedding yielded by the model.
    # model: the dense model identifier (DENSE_MODEL) the handler reports.
    vectors: List[List[float]]
    model: str
|
|
|
|
class ColbertResponse(BaseModel):
    # Response body for POST /embed/colbert and POST /embed/colbert/query.
    # vectors: one list of float rows per embedding yielded by the model
    #          (the module docstring describes these as per-token matrices).
    # model: the late-interaction model identifier (LATE_MODEL) the handler reports.
    vectors: List[List[List[float]]]
    model: str
|
|
|
|
@app.get("/health")
def health():
    # Liveness probe: reports the configured model names without calling
    # either model loader.
    status = {"ok": True}
    status["dense_model"] = DENSE_MODEL
    status["late_model"] = LATE_MODEL
    return status
|
|
|
|
@app.post("/embed/dense", response_model=DenseResponse)
def embed_dense(req: EmbedRequest):
    # Run the dense model over all request texts, then turn each result into
    # a plain list via .tolist() so it fits the response schema.
    embedder = _dense()
    as_lists = [embedding.tolist() for embedding in embedder.embed(req.texts)]
    return {"vectors": as_lists, "model": DENSE_MODEL}
|
|
|
|
@app.post("/embed/colbert", response_model=ColbertResponse)
def embed_colbert(req: EmbedRequest):
    # Run the late-interaction model's `embed` over all request texts, then
    # turn each result into nested plain lists via .tolist().
    embedder = _colbert()
    as_lists = [embedding.tolist() for embedding in embedder.embed(req.texts)]
    return {"vectors": as_lists, "model": LATE_MODEL}
|
|
|
|
@app.post("/embed/colbert/query", response_model=ColbertResponse)
def embed_colbert_query(req: EmbedRequest):
    # Same as /embed/colbert, except the texts go through the model's
    # `query_embed` method instead of `embed`.
    embedder = _colbert()
    as_lists = [embedding.tolist() for embedding in embedder.query_embed(req.texts)]
    return {"vectors": as_lists, "model": LATE_MODEL}
|
|