Ryan Ballantyne committed on
Commit ·
983d8eb
1
Parent(s): a1323b8
Initial sidecar deploy
Browse files- .dockerignore +6 -0
- Dockerfile +49 -0
- README.hf-space.md +22 -0
- README.md +32 -12
- main.py +92 -0
- requirements.txt +5 -0
.dockerignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv
|
| 2 |
+
__pycache__
|
| 3 |
+
*.pyc
|
| 4 |
+
.env
|
| 5 |
+
.env.*
|
| 6 |
+
README.md
|
Dockerfile
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Embeddings sidecar — FastAPI + fastembed.
|
| 2 |
+
# Builds a small image that exposes /embed/dense, /embed/colbert,
|
| 3 |
+
# /embed/colbert/query, /health.
|
| 4 |
+
#
|
| 5 |
+
# Designed to run anywhere a Dockerfile is accepted:
|
| 6 |
+
# - Hugging Face Spaces (Docker SDK) — easiest, free tier, weights cached
|
| 7 |
+
# - Fly.io — `fly launch` then `fly deploy`
|
| 8 |
+
# - Railway / Render / Koyeb — auto-detects Dockerfile
|
| 9 |
+
#
|
| 10 |
+
# The runtime port is taken from $PORT (HF Spaces, Railway, Render set this);
|
| 11 |
+
# defaults to 7860 (HF Spaces convention).
|
| 12 |
+
|
| 13 |
+
FROM python:3.11-slim
|
| 14 |
+
|
| 15 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 16 |
+
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 17 |
+
PIP_NO_CACHE_DIR=1 \
|
| 18 |
+
# Cache fastembed model downloads in a writable location (HF Spaces uses
|
| 19 |
+
# /data for persistent storage on paid tiers; falls back to /tmp on free).
|
| 20 |
+
FASTEMBED_CACHE_PATH=/tmp/fastembed_cache \
|
| 21 |
+
HF_HOME=/tmp/huggingface \
|
| 22 |
+
PORT=7860
|
| 23 |
+
|
| 24 |
+
WORKDIR /app
|
| 25 |
+
|
| 26 |
+
# Install build essentials needed for some onnxruntime / tokenizers wheels.
|
| 27 |
+
RUN apt-get update \
|
| 28 |
+
&& apt-get install -y --no-install-recommends \
|
| 29 |
+
build-essential \
|
| 30 |
+
libgomp1 \
|
| 31 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 32 |
+
|
| 33 |
+
COPY requirements.txt .
|
| 34 |
+
RUN pip install -r requirements.txt
|
| 35 |
+
|
| 36 |
+
COPY main.py .
|
| 37 |
+
|
| 38 |
+
# Pre-warm the model cache at build time so the first request is fast.
|
| 39 |
+
# Skipped if HF_TOKEN is required for a gated model (set as a secret at
|
| 40 |
+
# runtime; the models are then downloaded when the app starts).
|
| 41 |
+
RUN python -c "from fastembed import TextEmbedding, LateInteractionTextEmbedding; \
|
| 42 |
+
TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2'); \
|
| 43 |
+
LateInteractionTextEmbedding(model_name='colbert-ir/colbertv2.0')" \
|
| 44 |
+
|| echo "Model pre-warm skipped — will download on first request."
|
| 45 |
+
|
| 46 |
+
EXPOSE 7860
|
| 47 |
+
|
| 48 |
+
# Use a shell so $PORT is interpolated.
|
| 49 |
+
CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
|
README.hf-space.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Research Agent Embeddings
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: violet
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Embeddings sidecar for the Research Agent
|
| 12 |
+
|
| 13 |
+
FastAPI service exposing dense + ColBERT late-interaction embeddings via `fastembed`.
|
| 14 |
+
|
| 15 |
+
Endpoints:
|
| 16 |
+
|
| 17 |
+
- `POST /embed/dense` — `sentence-transformers/all-MiniLM-L6-v2` (384-dim)
|
| 18 |
+
- `POST /embed/colbert` — `colbert-ir/colbertv2.0` (per-token multi-vector)
|
| 19 |
+
- `POST /embed/colbert/query` — query-side ColBERT
|
| 20 |
+
- `GET /health`
|
| 21 |
+
|
| 22 |
+
Set `EMBEDDINGS_URL=https://<your-username>-<space-name>.hf.space` in the parent app.
|
README.md
CHANGED
|
@@ -1,12 +1,32 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Embeddings sidecar
|
| 2 |
+
|
| 3 |
+
Tiny FastAPI service that wraps `fastembed` and exposes:
|
| 4 |
+
|
| 5 |
+
- `POST /embed/dense` — dense vectors via `sentence-transformers/all-MiniLM-L6-v2` (384-dim)
|
| 6 |
+
- `POST /embed/colbert` — late-interaction multi-vectors via `colbert-ir/colbertv2.0` (per-token, 128-dim)
|
| 7 |
+
- `POST /embed/colbert/query` — query-side ColBERT embeddings
|
| 8 |
+
- `GET /health`
|
| 9 |
+
|
| 10 |
+
The Next.js app calls this service over HTTP. It exists because Node's
|
| 11 |
+
`fastembed-js` has spotty coverage for ColBERT/late-interaction; Python
|
| 12 |
+
`fastembed` handles both models cleanly.
|
| 13 |
+
|
| 14 |
+
## Run
|
| 15 |
+
|
| 16 |
+
```bash
|
| 17 |
+
cd embeddings
|
| 18 |
+
python3 -m venv .venv
|
| 19 |
+
source .venv/bin/activate
|
| 20 |
+
pip install -r requirements.txt
|
| 21 |
+
uvicorn main:app --port 7860
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
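The first startup downloads the model weights, because both models are loaded when the app starts. They are cached in fastembed's cache directory (set `FASTEMBED_CACHE_PATH` to choose the location).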
|
| 25 |
+
|
| 26 |
+
## Smoke test
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
curl -X POST localhost:7860/embed/dense \
|
| 30 |
+
-H 'content-type: application/json' \
|
| 31 |
+
-d '{"texts":["hello world"]}' | jq '.vectors[0] | length' # -> 384
|
| 32 |
+
```
|
main.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI embeddings sidecar.
|
| 2 |
+
|
| 3 |
+
Exposes these embedding endpoints backed by `fastembed` (plus `POST /embed/colbert/query` and `GET /health`):
|
| 4 |
+
POST /embed/dense -> single vectors (sentence-transformers/all-MiniLM-L6-v2)
|
| 5 |
+
POST /embed/colbert -> per-token matrices (colbert-ir/colbertv2.0)
|
| 6 |
+
|
| 7 |
+
Models are loaded once at startup (see `lifespan`) and reused for the lifetime of the
|
| 8 |
+
process. The Next.js app calls this service via plain HTTP.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
from contextlib import asynccontextmanager
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import List
|
| 17 |
+
|
| 18 |
+
from dotenv import load_dotenv
|
| 19 |
+
|
| 20 |
+
# Pull in the parent project's `.env.local` (one directory above this file's
# directory) so settings such as HF_TOKEN and RAG_*_MODEL reach this process
# without the user exporting them by hand. This must run before the RAG_*
# variables are read further down the module.
_PARENT_ENV = Path(__file__).resolve().parent.parent.joinpath(".env.local")
if _PARENT_ENV.is_file():
    # A missing file is not an error: the process environment is used as-is.
    load_dotenv(_PARENT_ENV)
|
| 25 |
+
|
| 26 |
+
from fastapi import FastAPI
|
| 27 |
+
from pydantic import BaseModel
|
| 28 |
+
from fastembed import TextEmbedding, LateInteractionTextEmbedding
|
| 29 |
+
|
| 30 |
+
# Model identifiers, overridable through the environment. An unset *or empty*
# variable falls back to the default, so a blank `RAG_DENSE_MODEL=` line in an
# env file cannot produce an empty model name.
DENSE_MODEL = os.environ.get("RAG_DENSE_MODEL") or "sentence-transformers/all-MiniLM-L6-v2"
LATE_MODEL = os.environ.get("RAG_LATE_MODEL") or "colbert-ir/colbertv2.0"

# Process-wide cache of loaded model objects, keyed by role ("dense", "colbert").
_models: dict[str, object] = {}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _dense() -> TextEmbedding:
    """Return the shared dense embedder, building it on first use.

    The instance is stored in `_models` under the key "dense", so later calls
    reuse the same object instead of constructing a new `TextEmbedding`.
    """
    try:
        return _models["dense"]  # type: ignore[return-value]
    except KeyError:
        embedder = TextEmbedding(model_name=DENSE_MODEL)
        _models["dense"] = embedder
        return embedder
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _colbert() -> LateInteractionTextEmbedding:
    """Return the shared late-interaction embedder, building it on first use.

    The instance is stored in `_models` under the key "colbert", so later
    calls reuse the same object instead of constructing a new one.
    """
    try:
        return _models["colbert"]  # type: ignore[return-value]
    except KeyError:
        embedder = LateInteractionTextEmbedding(model_name=LATE_MODEL)
        _models["colbert"] = embedder
        return embedder
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Load both embedding models before the application starts serving.

    Everything before the `yield` runs at startup. Nothing follows the
    `yield`, so no shutdown work is performed.
    """
    # Dense first, then ColBERT; each loader caches its instance in `_models`.
    for load_model in (_dense, _colbert):
        load_model()
    yield


# The ASGI application; the lifespan hook above preloads the models.
app = FastAPI(lifespan=lifespan)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# Request body accepted by every /embed/* route. Documented with comments
# rather than a docstring so the generated schema description is unchanged.
class EmbedRequest(BaseModel):
    # Strings to embed.
    texts: list[str]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Response body of /embed/dense. Documented with comments rather than a
# docstring so the generated schema description is unchanged.
class DenseResponse(BaseModel):
    # One flat list of floats per embedding produced by the dense model.
    vectors: list[list[float]]
    # Name of the model that produced the vectors.
    model: str
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# Response body of the ColBERT routes. Documented with comments rather than a
# docstring so the generated schema description is unchanged.
class ColbertResponse(BaseModel):
    # One matrix (list of float lists) per embedding from the late-interaction model.
    vectors: list[list[list[float]]]
    # Name of the model that produced the vectors.
    model: str
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@app.get("/health")
def health():
    # Static payload: always reports ok and echoes the configured model names.
    # It does not inspect the loaded models themselves.
    return dict(ok=True, dense_model=DENSE_MODEL, late_model=LATE_MODEL)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@app.post("/embed/dense", response_model=DenseResponse)
def embed_dense(req: EmbedRequest):
    # Embed the request texts with the dense model and return them as lists.
    embeddings = _dense().embed(req.texts)
    return {
        # `.tolist()` turns each embedding (presumably a NumPy array) into
        # plain Python lists so it can be serialized as JSON.
        "vectors": [embedding.tolist() for embedding in embeddings],
        "model": DENSE_MODEL,
    }
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@app.post("/embed/colbert", response_model=ColbertResponse)
def embed_colbert(req: EmbedRequest):
    # Embed the request texts with the late-interaction model's `embed` method.
    embeddings = _colbert().embed(req.texts)
    return {
        # `.tolist()` turns each embedding (presumably a NumPy array) into
        # nested Python lists so it can be serialized as JSON.
        "vectors": [embedding.tolist() for embedding in embeddings],
        "model": LATE_MODEL,
    }
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
@app.post("/embed/colbert/query", response_model=ColbertResponse)
def embed_colbert_query(req: EmbedRequest):
    # Same as /embed/colbert, but uses the model's `query_embed` method.
    embeddings = _colbert().query_embed(req.texts)
    return {
        # `.tolist()` turns each embedding (presumably a NumPy array) into
        # nested Python lists so it can be serialized as JSON.
        "vectors": [embedding.tolist() for embedding in embeddings],
        "model": LATE_MODEL,
    }
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.115
|
| 2 |
+
uvicorn[standard]>=0.32
|
| 3 |
+
fastembed>=0.4.2
|
| 4 |
+
pydantic>=2.9
|
| 5 |
+
python-dotenv>=1.0
|