# Embeddings sidecar — FastAPI + fastembed.
# Builds a small image that exposes /embed/dense, /embed/colbert,
# /embed/colbert/query, /health.
#
# Designed to run anywhere a Dockerfile is accepted:
#   - Hugging Face Spaces (Docker SDK) — easiest, free tier, weights cached
#   - Fly.io — `fly launch` then `fly deploy`
#   - Railway / Render / Koyeb — auto-detect the Dockerfile
#
# The runtime port is taken from $PORT (HF Spaces, Railway, Render set this);
# defaults to 7860 (HF Spaces convention).
# Base image: slim variant of the official Python 3.11 image.
FROM python:3.11-slim

# Interpreter / pip behaviour: unbuffered output, no pip version check,
# no pip download cache.
ENV PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1

# Keep fastembed and Hugging Face model downloads in a writable location
# (HF Spaces uses /data for persistent storage on paid tiers; the free tier
# falls back to /tmp).
ENV FASTEMBED_CACHE_PATH=/tmp/fastembed_cache \
    HF_HOME=/tmp/huggingface

# Default listening port (HF Spaces convention).
ENV PORT=7860

# All following relative paths resolve against /app.
WORKDIR /app
# Compiler toolchain and the GNU OpenMP runtime, needed for some
# onnxruntime / tokenizers wheels. The apt package lists are removed in the
# same layer so they do not end up in the image.
RUN set -e; \
    apt-get update; \
    apt-get install --yes --no-install-recommends build-essential libgomp1; \
    rm -rf /var/lib/apt/lists/*
# Install the Python dependencies from the manifest alone, so this layer is
# rebuilt only when requirements.txt itself changes.
COPY requirements.txt ./
RUN python -m pip install --requirement requirements.txt
# Pre-warm the model cache at build time so the first request is fast.
# Best-effort: if the download fails (for example a gated model that needs
# HF_TOKEN, which is only set as a secret at runtime), the build continues
# and the first request warms the cache instead.
RUN python -c "from fastembed import TextEmbedding, LateInteractionTextEmbedding; \
    TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2'); \
    LateInteractionTextEmbedding(model_name='colbert-ir/colbertv2.0')" \
    || echo "Model pre-warm skipped — will download on first request."

# Copy the application code after the pre-warm step: the pre-warm only imports
# fastembed, so edits to main.py no longer invalidate the cached model layer
# and force the weights to be downloaded again.
COPY main.py .
# Documents the default port only; the server binds to whatever $PORT holds.
EXPOSE 7860

# A shell is required so that $PORT is interpolated (exec-form CMD does no
# variable expansion). `exec` replaces that shell with uvicorn, so uvicorn runs
# as PID 1 and receives SIGTERM directly when the container is stopped.
CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]