Spaces:

cng420
/

embedding

Running

App Files Files Community

Ryan Ballantyne committed on 25 days ago

Commit

983d8eb

1 Parent(s): a1323b8

Initial sidecar deploy

Browse files

Files changed (6) hide show

.dockerignore +6 -0
Dockerfile +49 -0
README.hf-space.md +22 -0
README.md +32 -12
main.py +92 -0
requirements.txt +5 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,6 @@

+.venv
+__pycache__
+*.pyc
+.env
+.env.*
+README.md

Dockerfile ADDED Viewed

	@@ -0,0 +1,49 @@

+# Embeddings sidecar — FastAPI + fastembed.
+# Builds a small image that exposes /embed/dense, /embed/colbert,
+# /embed/colbert/query, /health.
+#
+# Designed to run anywhere a Dockerfile is accepted:
+#   - Hugging Face Spaces (Docker SDK)  — easiest, free tier, weights cached
+#   - Fly.io                            — `fly launch` then `fly deploy`
+#   - Railway / Render / Koyeb          — auto-detects Dockerfile
+#
+# The runtime port is taken from $PORT (HF Spaces, Railway, Render set this);
+# defaults to 7860 (HF Spaces convention).
+FROM python:3.11-slim
+ENV PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    # Keep fastembed / Hugging Face model downloads under /tmp so the cache
+    # location is writable. The paths are fixed here; nothing in this file
+    # switches to /data (HF Spaces persistent storage on paid tiers).
+    FASTEMBED_CACHE_PATH=/tmp/fastembed_cache \
+    HF_HOME=/tmp/huggingface \
+    PORT=7860
+WORKDIR /app
+# Install a compiler toolchain in case a dependency (e.g. onnxruntime or
+# tokenizers) has to build from source, plus libgomp1 (the GNU OpenMP runtime).
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       build-essential \
+       libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements.txt on its own first so the pip layer is rebuilt only when
+# the dependency list changes, not on every edit to main.py.
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+COPY main.py .
+# Pre-warm the model cache at build time so the weights are already in the
+# image when the app loads them. The `|| echo` keeps the build going if the
+# download fails (for example a gated model that needs HF_TOKEN, which is only
+# set as a secret at runtime); the weights are then downloaded at runtime.
+# NOTE: the model names below are hard-coded defaults; models selected via
+# RAG_DENSE_MODEL / RAG_LATE_MODEL at runtime are not covered by this step.
+RUN python -c "from fastembed import TextEmbedding, LateInteractionTextEmbedding; \
+    TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2'); \
+    LateInteractionTextEmbedding(model_name='colbert-ir/colbertv2.0')" \
+    || echo "Model pre-warm skipped — will download on first request."
+# Documents the default port only; the port actually bound comes from $PORT.
+EXPOSE 7860
+# Use a shell so $PORT is interpolated.
+CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]

README.hf-space.md ADDED Viewed

	@@ -0,0 +1,22 @@

+---
+title: Research Agent Embeddings
+emoji: 🔍
+colorFrom: indigo
+colorTo: violet
+sdk: docker
+app_port: 7860
+pinned: false
+---
+# Embeddings sidecar for the Research Agent
+FastAPI service exposing dense + ColBERT late-interaction embeddings via `fastembed`.
+Endpoints:
+- `POST /embed/dense` — `sentence-transformers/all-MiniLM-L6-v2` (384-dim)
+- `POST /embed/colbert` — `colbert-ir/colbertv2.0` (per-token multi-vector)
+- `POST /embed/colbert/query` — query-side ColBERT
+- `GET /health`
+Set `EMBEDDINGS_URL=https://<your-username>-<space-name>.hf.space` in the parent app.

README.md CHANGED Viewed

@@ -1,12 +1,32 @@
----
-title: Embedding
-emoji: 👁
-colorFrom: red
-colorTo: red
-sdk: docker
-pinned: false
-license: apache-2.0
-short_description: all-MiniLM-L6-v2 and Colbertv2.0 Hybird FastEmbed Server
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Embeddings sidecar
+Tiny FastAPI service that wraps `fastembed` and exposes:
+- `POST /embed/dense` — dense vectors via `sentence-transformers/all-MiniLM-L6-v2` (384-dim)
+- `POST /embed/colbert` — late-interaction multi-vectors via `colbert-ir/colbertv2.0` (per-token, 128-dim)
+- `POST /embed/colbert/query` — query-side ColBERT embeddings
+- `GET /health`
+The Next.js app calls this service over HTTP. It exists because Node's
+`fastembed-js` has spotty coverage for ColBERT/late-interaction; Python
+`fastembed` handles both models cleanly.
+## Run
+```bash
+cd embeddings
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+uvicorn main:app --port 7860
+```
+The first startup downloads the model weights (cached under `$FASTEMBED_CACHE_PATH` if set, otherwise in fastembed's default cache directory).
+## Smoke test
+```bash
+curl -X POST localhost:7860/embed/dense \
+  -H 'content-type: application/json' \
+  -d '{"texts":["hello world"]}' | jq '.vectors[0] | length'   # -> 384
+```

main.py ADDED Viewed

	@@ -0,0 +1,92 @@

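+"""FastAPI embeddings sidecar.
+Exposes four endpoints; the three POST routes are backed by `fastembed`:
+  POST /embed/dense         -> single vectors (default: sentence-transformers/all-MiniLM-L6-v2)
+  POST /embed/colbert       -> per-token matrices (default: colbert-ir/colbertv2.0)
+  POST /embed/colbert/query -> query-side per-token matrices (same ColBERT model)
+  GET  /health              -> status flag plus the configured model names
+Models are loaded once at application startup and reused for the lifetime of
+the process. The Next.js app calls this service via plain HTTP.
+"""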
+from __future__ import annotations
+import os
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import List
+from dotenv import load_dotenv
+# Load shared env from the parent project so HF_TOKEN, RAG_*_MODEL, etc. flow
+# through without requiring the user to export them by hand.
+_PARENT_ENV = Path(__file__).resolve().parent.parent / ".env.local"
+if _PARENT_ENV.is_file():
+    load_dotenv(_PARENT_ENV)
+from fastapi import FastAPI
+from pydantic import BaseModel
+from fastembed import TextEmbedding, LateInteractionTextEmbedding
+DENSE_MODEL = os.environ.get("RAG_DENSE_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+LATE_MODEL = os.environ.get("RAG_LATE_MODEL", "colbert-ir/colbertv2.0")
+_models: dict[str, object] = {}
+def _dense() -> TextEmbedding:
+    if "dense" not in _models:
+        _models["dense"] = TextEmbedding(model_name=DENSE_MODEL)
+    return _models["dense"]  # type: ignore[return-value]
+def _colbert() -> LateInteractionTextEmbedding:
+    if "colbert" not in _models:
+        _models["colbert"] = LateInteractionTextEmbedding(model_name=LATE_MODEL)
+    return _models["colbert"]  # type: ignore[return-value]
+@asynccontextmanager
+async def lifespan(_app: FastAPI):
+    _dense()
+    _colbert()
+    yield
+app = FastAPI(lifespan=lifespan)
+# Request body shared by the three POST /embed/* routes.
+class EmbedRequest(BaseModel):
+    # Input strings to embed; passed as-is to the embedder.
+    texts: List[str]
+# Response body of POST /embed/dense.
+class DenseResponse(BaseModel):
+    # One flat list of floats per embedding yielded by the dense model.
+    vectors: List[List[float]]
+    # Name of the dense model that produced the vectors.
+    model: str
+# Response body of POST /embed/colbert and POST /embed/colbert/query.
+class ColbertResponse(BaseModel):
+    # One matrix (list of float lists) per embedding yielded by the
+    # late-interaction model; per the module docstring these are per-token.
+    vectors: List[List[List[float]]]
+    # Name of the late-interaction model that produced the matrices.
+    model: str
+@app.get("/health")
+def health():
+    return {"ok": True, "dense_model": DENSE_MODEL, "late_model": LATE_MODEL}
+@app.post("/embed/dense", response_model=DenseResponse)
+def embed_dense(req: EmbedRequest):
+    vectors = [v.tolist() for v in _dense().embed(req.texts)]
+    return {"vectors": vectors, "model": DENSE_MODEL}
+@app.post("/embed/colbert", response_model=ColbertResponse)
+def embed_colbert(req: EmbedRequest):
+    vectors = [v.tolist() for v in _colbert().embed(req.texts)]
+    return {"vectors": vectors, "model": LATE_MODEL}
+@app.post("/embed/colbert/query", response_model=ColbertResponse)
+def embed_colbert_query(req: EmbedRequest):
+    vectors = [v.tolist() for v in _colbert().query_embed(req.texts)]
+    return {"vectors": vectors, "model": LATE_MODEL}

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+fastapi>=0.115
+uvicorn[standard]>=0.32
+fastembed>=0.4.2
+pydantic>=2.9
+python-dotenv>=1.0