# Embeddings sidecar — FastAPI + fastembed.
# Builds a small image that exposes /embed/dense, /embed/colbert,
# /embed/colbert/query, /health.
#
# Designed to run anywhere a Dockerfile is accepted:
#   - Hugging Face Spaces (Docker SDK)  — easiest, free tier, weights cached
#   - Fly.io                            — `fly launch` then `fly deploy`
#   - Railway / Render / Koyeb          — auto-detects Dockerfile
#
# The runtime port is taken from $PORT (HF Spaces, Railway, Render set this);
# defaults to 7860 (HF Spaces convention).

# Base image: slim CPython 3.11 (Debian-based; apt-get is used further down).
FROM python:3.11-slim

# Interpreter / pip behaviour:
#   PYTHONUNBUFFERED=1               stdout/stderr are not buffered
#   PIP_DISABLE_PIP_VERSION_CHECK=1  pip does not check for a newer pip
#   PIP_NO_CACHE_DIR=1               pip keeps no download cache
ENV PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1

# Model caches for fastembed and the Hugging Face libraries. Both point at
# /tmp so they live in a location that is expected to be writable. (HF Spaces
# offers /data as persistent storage on paid tiers only; this image always
# uses /tmp.)
ENV FASTEMBED_CACHE_PATH=/tmp/fastembed_cache \
    HF_HOME=/tmp/huggingface

# Default listen port (HF Spaces convention). Platforms that inject their own
# $PORT at runtime override this value.
ENV PORT=7860

# All following relative paths (COPY targets, the uvicorn module path) resolve
# against /app.
WORKDIR /app

# Install the Python dependencies.
#
# build-essential is only needed while pip runs (fallback for packages such as
# onnxruntime / tokenizers when pip has to compile instead of using a prebuilt
# wheel). It is therefore installed, used and purged inside ONE layer:
# removing it in a later layer would not shrink the image.
#
# libgomp1 (the GNU OpenMP runtime) is a shared library, not a build tool. It
# is requested explicitly, so `--auto-remove` leaves it in the image.
#
# Trade-off: a change to requirements.txt now re-runs the apt step as well.
COPY requirements.txt .
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
       build-essential \
       libgomp1 \
    && pip install -r requirements.txt \
    && apt-get purge -y --auto-remove build-essential \
    && rm -rf /var/lib/apt/lists/*

# Pre-warm the model cache at build time so the first request is fast.
#
# This step only needs the installed packages, not the application code, so
# it runs BEFORE main.py is copied: editing main.py then reuses this cached
# layer instead of downloading both models again on every build.
#
# Best-effort: if the Python command fails for any reason (for example a gated
# model that needs HF_TOKEN, which is only supplied as a runtime secret), the
# build continues and prints the message below.
#
# NOTE(review): the cache is written by the build user. Platforms that run the
# container under a different UID may be unable to write to it — confirm.
RUN python -c "from fastembed import TextEmbedding, LateInteractionTextEmbedding; \
    TextEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2'); \
    LateInteractionTextEmbedding(model_name='colbert-ir/colbertv2.0')" \
    || echo "Model pre-warm skipped — will download on first request."

# Application code goes last: it changes most often, so only this cheap layer
# is rebuilt on a code edit.
COPY main.py .

# Documents the default port only; the server listens on whatever $PORT is at
# runtime.
EXPOSE 7860

# A shell is required so $PORT is expanded (exec-form CMD performs no variable
# substitution by itself). `exec` makes uvicorn replace that shell, so uvicorn
# receives SIGTERM from `docker stop` directly and can shut down cleanly,
# regardless of how the image's `sh` handles its last command.
CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]