Ryan Ballantyne committed on
Commit
983d8eb
·
1 Parent(s): a1323b8

Initial sidecar deploy

Browse files
Files changed (6) hide show
  1. .dockerignore +6 -0
  2. Dockerfile +49 -0
  3. README.hf-space.md +22 -0
  4. README.md +32 -12
  5. main.py +92 -0
  6. requirements.txt +5 -0
.dockerignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .venv
2
+ __pycache__
3
+ *.pyc
4
+ .env
5
+ .env.*
6
+ README.md
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Embeddings sidecar — FastAPI + fastembed.
2
+ # Builds a small image that exposes /embed/dense, /embed/colbert,
3
+ # /embed/colbert/query, /health.
4
+ #
5
+ # Designed to run anywhere a Dockerfile is accepted:
6
+ # - Hugging Face Spaces (Docker SDK) — easiest, free tier, weights cached
7
+ # - Fly.io — `fly launch` then `fly deploy`
8
+ # - Railway / Render / Koyeb — auto-detects Dockerfile
9
+ #
10
+ # The runtime port is taken from $PORT (HF Spaces, Railway, Render set this);
11
+ # defaults to 7860 (HF Spaces convention).
12
+
13
+ FROM python:3.11-slim
14
+
15
+ ENV PYTHONUNBUFFERED=1 \
16
+ PIP_DISABLE_PIP_VERSION_CHECK=1 \
17
+ PIP_NO_CACHE_DIR=1 \
18
+ # Cache fastembed model downloads in a writable location (HF Spaces uses
19
+ # /data for persistent storage on paid tiers; this image defaults to /tmp).
20
+ FASTEMBED_CACHE_PATH=/tmp/fastembed_cache \
21
+ HF_HOME=/tmp/huggingface \
22
+ PORT=7860
23
+
24
+ WORKDIR /app
25
+
26
+ # Install build essentials needed for some onnxruntime / tokenizers wheels.
27
+ RUN apt-get update \
28
+ && apt-get install -y --no-install-recommends \
29
+ build-essential \
30
+ libgomp1 \
31
+ && rm -rf /var/lib/apt/lists/*
32
+
33
+ COPY requirements.txt .
34
+ RUN pip install -r requirements.txt
35
+
36
+ COPY main.py .
37
+
38
+ # Pre-warm the model cache at build time so the first request is fast.
39
+ # Skipped if HF_TOKEN is required for a gated model (set as a secret at
40
+ # runtime; the models are then downloaded when the app starts).
41
+ RUN python -c "from fastembed import TextEmbedding, LateInteractionTextEmbedding; \
42
+ TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2'); \
43
+ LateInteractionTextEmbedding(model_name='colbert-ir/colbertv2.0')" \
44
+ || echo "Model pre-warm skipped — will download on first request."
45
+
46
+ EXPOSE 7860
47
+
48
+ # Use a shell so $PORT is interpolated.
49
+ CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
README.hf-space.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Research Agent Embeddings
3
+ emoji: 🔍
4
+ colorFrom: indigo
5
+ colorTo: violet
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ # Embeddings sidecar for the Research Agent
12
+
13
+ FastAPI service exposing dense + ColBERT late-interaction embeddings via `fastembed`.
14
+
15
+ Endpoints:
16
+
17
+ - `POST /embed/dense` — `sentence-transformers/all-MiniLM-L6-v2` (384-dim)
18
+ - `POST /embed/colbert` — `colbert-ir/colbertv2.0` (per-token multi-vector)
19
+ - `POST /embed/colbert/query` — query-side ColBERT
20
+ - `GET /health`
21
+
22
+ Set `EMBEDDINGS_URL=https://<your-username>-<space-name>.hf.space` in the parent app.
README.md CHANGED
@@ -1,12 +1,32 @@
1
- ---
2
- title: Embedding
3
- emoji: 👁
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- license: apache-2.0
9
- short_description: all-MiniLM-L6-v2 and Colbertv2.0 Hybird FastEmbed Server
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Embeddings sidecar
2
+
3
+ Tiny FastAPI service that wraps `fastembed` and exposes:
4
+
5
+ - `POST /embed/dense` — dense vectors via `sentence-transformers/all-MiniLM-L6-v2` (384-dim)
6
+ - `POST /embed/colbert` — late-interaction multi-vectors via `colbert-ir/colbertv2.0` (per-token, 128-dim)
7
+ - `POST /embed/colbert/query` — query-side ColBERT embeddings
8
+ - `GET /health`
9
+
10
+ The Next.js app calls this service over HTTP. It exists because Node's
11
+ `fastembed-js` has spotty coverage for ColBERT/late-interaction; Python
12
+ `fastembed` handles both models cleanly.
13
+
14
+ ## Run
15
+
16
+ ```bash
17
+ cd embeddings
18
+ python3 -m venv .venv
19
+ source .venv/bin/activate
20
+ pip install -r requirements.txt
21
+ uvicorn main:app --port 7860
22
+ ```
23
+
24
+ The first startup will download the model weights (cached under `~/.cache/fastembed`).
25
+
26
+ ## Smoke test
27
+
28
+ ```bash
29
+ curl -X POST localhost:7860/embed/dense \
30
+ -H 'content-type: application/json' \
31
+ -d '{"texts":["hello world"]}' | jq '.vectors[0] | length' # -> 384
32
+ ```
main.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI embeddings sidecar.
2
+
3
+ Exposes these endpoints backed by `fastembed`, plus POST /embed/colbert/query (query-side ColBERT) and GET /health:
4
+ POST /embed/dense -> single vectors (sentence-transformers/all-MiniLM-L6-v2)
5
+ POST /embed/colbert -> per-token matrices (colbert-ir/colbertv2.0)
6
+
7
+ Models are created on first use (the lifespan hook warms both at startup) and reused for the lifetime of the
8
+ process. The Next.js app calls this service via plain HTTP.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ from contextlib import asynccontextmanager
15
+ from pathlib import Path
16
+ from typing import List
17
+
18
+ from dotenv import load_dotenv
19
+
20
# Load shared env from the parent project so HF_TOKEN, RAG_*_MODEL, etc. flow
# through without requiring the user to export them by hand.
#
# The file is optional: when `.env.local` does not exist two directories above
# this module's resolved path, nothing is loaded and the process environment
# is used as-is.
# NOTE(review): this runs before the fastapi/fastembed imports that follow —
# presumably so the loaded variables are already set when those packages are
# imported; confirm before reordering.
_PARENT_ENV = Path(__file__).resolve().parent.parent / ".env.local"
if _PARENT_ENV.is_file():
    load_dotenv(_PARENT_ENV)
25
+
26
+ from fastapi import FastAPI
27
+ from pydantic import BaseModel
28
+ from fastembed import TextEmbedding, LateInteractionTextEmbedding
29
+
30
# Model identifiers. Each one can be overridden through the environment;
# the defaults are used when the variable is unset.
DENSE_MODEL = os.getenv("RAG_DENSE_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
LATE_MODEL = os.getenv("RAG_LATE_MODEL", "colbert-ir/colbertv2.0")

# Process-wide cache of instantiated models, keyed by "dense" / "colbert".
_models: dict[str, object] = {}
34
+
35
+
36
def _dense() -> TextEmbedding:
    """Return the process-wide dense embedding model.

    The model is built from ``DENSE_MODEL`` the first time this is called and
    stored in ``_models`` under the key ``"dense"``; later calls return that
    same instance.
    """
    if "dense" in _models:
        return _models["dense"]  # type: ignore[return-value]
    model = TextEmbedding(model_name=DENSE_MODEL)
    _models["dense"] = model
    return model
40
+
41
+
42
def _colbert() -> LateInteractionTextEmbedding:
    """Return the process-wide late-interaction (ColBERT) embedding model.

    The model is built from ``LATE_MODEL`` the first time this is called and
    stored in ``_models`` under the key ``"colbert"``; later calls return that
    same instance.
    """
    if "colbert" in _models:
        return _models["colbert"]  # type: ignore[return-value]
    model = LateInteractionTextEmbedding(model_name=LATE_MODEL)
    _models["colbert"] = model
    return model
46
+
47
+
48
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Instantiate both embedding models before yielding control to the app.

    The dense model is created first, then the ColBERT model. Nothing is torn
    down after the ``yield``.
    """
    for warm_up in (_dense, _colbert):
        warm_up()
    yield
53
+
54
+
55
# ASGI application object served by uvicorn (`main:app`); `lifespan` is passed
# as the app's lifespan handler so both models are instantiated by it.
app = FastAPI(lifespan=lifespan)
56
+
57
+
58
class EmbedRequest(BaseModel):
    # Request body shared by the three POST /embed/* endpoints.
    # texts: the input strings handed to the embedding model.
    texts: List[str]
60
+
61
+
62
class DenseResponse(BaseModel):
    # Response body of POST /embed/dense.
    # vectors: one flat list of floats per embedding returned by the dense model.
    vectors: List[List[float]]
    # model: name of the model that produced the vectors (DENSE_MODEL).
    model: str
65
+
66
+
67
class ColbertResponse(BaseModel):
    # Response body of POST /embed/colbert and POST /embed/colbert/query.
    # vectors: for each embedding, a list of float vectors (a matrix as nested
    # lists) returned by the late-interaction model.
    vectors: List[List[List[float]]]
    # model: name of the model that produced the vectors (LATE_MODEL).
    model: str
70
+
71
+
72
@app.get("/health")
def health():
    # Health probe: reports the configured model names without calling the
    # model accessors.
    status = {"ok": True}
    status["dense_model"] = DENSE_MODEL
    status["late_model"] = LATE_MODEL
    return status
75
+
76
+
77
@app.post("/embed/dense", response_model=DenseResponse)
def embed_dense(req: EmbedRequest):
    # Embed every text with the dense model and convert each result to a
    # plain list via .tolist() so it can be serialised in the response.
    embeddings = _dense().embed(req.texts)
    as_lists = [embedding.tolist() for embedding in embeddings]
    return {"vectors": as_lists, "model": DENSE_MODEL}
81
+
82
+
83
@app.post("/embed/colbert", response_model=ColbertResponse)
def embed_colbert(req: EmbedRequest):
    # Document-side ColBERT embedding: each result from `embed` is converted
    # to nested lists via .tolist() for the response.
    matrices = []
    for matrix in _colbert().embed(req.texts):
        matrices.append(matrix.tolist())
    return {"vectors": matrices, "model": LATE_MODEL}
87
+
88
+
89
@app.post("/embed/colbert/query", response_model=ColbertResponse)
def embed_colbert_query(req: EmbedRequest):
    # Query-side ColBERT embedding: same response shape as /embed/colbert,
    # but the texts go through the model's `query_embed` method.
    model = _colbert()
    matrices = [matrix.tolist() for matrix in model.query_embed(req.texts)]
    return {"vectors": matrices, "model": LATE_MODEL}
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi>=0.115
2
+ uvicorn[standard]>=0.32
3
+ fastembed>=0.4.2
4
+ pydantic>=2.9
5
+ python-dotenv>=1.0