# G.U.I.D.E. — Hugging Face Spaces Dockerfile
#
# Build args:
#   HF_MODEL_REPO  (required) HF Model Hub repo containing trained weights;
#                  the build aborts with an error if it is not set
#                  e.g. --build-arg HF_MODEL_REPO=myuser/guide-models
#
# Runtime secrets (set via HF Spaces Secrets UI — never bake into image):
#   ANTHROPIC_API_KEY

FROM python:3.11-slim

# OS-level packages. The apt package index is removed in the same RUN so it
# never ends up in the layer.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        git \
        tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python requirements from the manifest alone, ahead of the full source
# copy, so this layer stays cached until requirements.txt itself changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir --requirement requirements.txt

# spaCy language model (~750 MB) — required by Presidio privacy layer.
# `spacy download` installs the model through pip and forwards any trailing
# arguments to it; --no-cache-dir keeps pip's download cache for this large
# package out of the image layer (same policy as the requirements install).
RUN python -m spacy download en_core_web_lg --no-cache-dir

# Application source goes in BEFORE the model download.
# Model weights are gitignored, so this COPY brings none of them along — it
# does, however, create the /app/models/ directory entry.  Fetching the
# weights only AFTER this step means the models/ files live in the top-most
# overlay layer instead of being hidden behind an opaque whiteout that a
# later COPY would create.
COPY . ./

# Download trained model weights from HF Model Hub at build time
# (avoids cold-start delay for visitors; weights land at /app/models/).
ARG HF_MODEL_REPO

# Fail fast, with the message on stderr, when the build arg was not supplied.
RUN test -n "${HF_MODEL_REPO}" \
    || { echo "ERROR: HF_MODEL_REPO build arg is not set" >&2; exit 1; }

# Docker exposes ARG values to RUN as environment variables, so Python reads
# the repo id from os.environ instead of having it spliced into the source
# string — a quote or backslash in the value can no longer break or alter the
# snippet.  The trailing listing (minus the hub's .cache bookkeeping) is there
# so the build log shows which files were fetched.
RUN python -c "import os; from huggingface_hub import snapshot_download; snapshot_download(os.environ['HF_MODEL_REPO'], repo_type='model', local_dir='models')" \
    && echo "=== models dir ===" \
    && find /app/models -not -path '*/.cache/*' | sort

# Port 7860 is the one HF Spaces exposes (Gradio); FastAPI listens internally on 8000
EXPOSE 7860/tcp

# Launch both servers; the weights are already in the image, so training is skipped
CMD ["python", "start.py", "--download-models", "--no-train"]