# embedding / Dockerfile
# Ryan Ballantyne
# Initial sidecar deploy
# 983d8eb
# Embeddings sidecar — FastAPI + fastembed.
# Builds a small image that exposes /embed/dense, /embed/colbert,
# /embed/colbert/query, /health.
#
# Designed to run anywhere a Dockerfile is accepted:
# - Hugging Face Spaces (Docker SDK) — easiest, free tier, weights cached
# - Fly.io β€” `fly launch` then `fly deploy`
# - Railway / Render / Koyeb β€” auto-detects Dockerfile
#
# The runtime port is taken from $PORT (HF Spaces, Railway, Render set this);
# defaults to 7860 (HF Spaces convention).
# Base image: slim variant of the official Python 3.11 image.
FROM python:3.11-slim

# Interpreter / pip behaviour: unbuffered output so logs show up immediately,
# no pip self-version check, and no pip download cache left in image layers.
ENV PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1

# Cache fastembed model downloads in a writable location (HF Spaces uses
# /data for persistent storage on paid tiers; falls back to /tmp on free).
# PORT is the default listen port; platforms that inject $PORT override it.
ENV FASTEMBED_CACHE_PATH=/tmp/fastembed_cache \
    HF_HOME=/tmp/huggingface \
    PORT=7860

# All following COPY/RUN/CMD instructions are relative to /app.
WORKDIR /app
# System packages: a compiler toolchain (needed by some onnxruntime /
# tokenizers wheels) and the GNU OpenMP runtime. The apt package index is
# removed in the same layer so it never lands in the image.
RUN set -e; \
    apt-get update; \
    apt-get install --yes --no-install-recommends build-essential libgomp1; \
    rm -rf /var/lib/apt/lists/*
# Copy the dependency manifest by itself first so the install layer below is
# rebuilt only when requirements.txt changes, not on every source edit.
COPY requirements.txt ./
RUN python -m pip install --requirement requirements.txt

# Application module served by uvicorn.
COPY main.py ./
# Pre-warm the model cache at build time so the first request is fast.
# Best effort: if the download fails (e.g. HF_TOKEN is required for a gated
# model and is only provided as a runtime secret), the build continues and
# the first request warms the cache instead.
#
# The cache directories are created up front and made writable for any user
# in this same layer. This step runs as root, but some platforms start the
# container under a different UID (Hugging Face Spaces documents UID 1000),
# which could otherwise not write to the root-owned cache at runtime.
RUN mkdir -p "$FASTEMBED_CACHE_PATH" "$HF_HOME" \
    && { python -c "from fastembed import TextEmbedding, LateInteractionTextEmbedding; \
            TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2'); \
            LateInteractionTextEmbedding(model_name='colbert-ir/colbertv2.0')" \
         || echo "Model pre-warm skipped — will download on first request."; } \
    && chmod -R a+rwX "$FASTEMBED_CACHE_PATH" "$HF_HOME"
# Documents the default listen port; platforms that set $PORT may use another.
EXPOSE 7860

# Exec-form CMD wrapping a shell so $PORT is interpolated at container start.
# `exec` replaces the shell with uvicorn, making uvicorn PID 1 so it receives
# SIGTERM directly on stop and can shut down cleanly instead of being killed
# after the grace period.
CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port \"${PORT:-7860}\""]