# Embeddings sidecar — FastAPI + fastembed.
# Builds a small image that exposes /embed/dense, /embed/colbert,
# /embed/colbert/query, /health.
#
# Designed to run anywhere a Dockerfile is accepted:
#   - Hugging Face Spaces (Docker SDK) — easiest, free tier, weights cached
#   - Fly.io — `fly launch` then `fly deploy`
#   - Railway / Render / Koyeb — auto-detects Dockerfile
#
# The runtime port is taken from $PORT (HF Spaces, Railway, Render set this);
# defaults to 7860 (HF Spaces convention).

FROM python:3.11-slim

# fastembed / Hugging Face downloads are cached under /tmp, which is writable
# on every target platform. HF Spaces offers persistent /data storage on paid
# tiers; to use it, override FASTEMBED_CACHE_PATH and HF_HOME at runtime.
ENV PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    FASTEMBED_CACHE_PATH=/tmp/fastembed_cache \
    HF_HOME=/tmp/huggingface \
    PORT=7860

WORKDIR /app

# Install build essentials needed for some onnxruntime / tokenizers wheels.
# libgomp1 is the OpenMP runtime and must stay in the final image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Unprivileged runtime user. UID 1000 matches the user HF Spaces runs Docker
# containers as, so files created at build time stay writable at runtime.
# /app is handed over so the process can still write to its working directory.
RUN useradd --create-home --uid 1000 appuser \
    && chown appuser:appuser /app

# Dependencies are installed as root into the system site-packages, and before
# the application code so this layer is reused when only main.py changes.
COPY requirements.txt .
RUN pip install -r requirements.txt

COPY --chown=appuser:appuser main.py .

# Drop privileges before anything touches the model cache, so the cache
# directories created below are owned by the runtime user.
USER appuser

# Pre-warm the model cache at build time so the first request is fast.
# Best-effort: if the download fails (e.g. a gated model needs HF_TOKEN, which
# is only available as a runtime secret), the build continues and the first
# request downloads the weights instead.
RUN python -c "from fastembed import TextEmbedding, LateInteractionTextEmbedding; \
TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2'); \
LateInteractionTextEmbedding(model_name='colbert-ir/colbertv2.0')" \
    || echo "Model pre-warm skipped — will download on first request."

EXPOSE 7860

# A shell is needed so $PORT is interpolated; `exec` replaces that shell with
# uvicorn so it runs as PID 1 and receives SIGTERM directly on shutdown.
CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]