# G.U.I.D.E. — Hugging Face Spaces Dockerfile
#
# Build args:
#   HF_MODEL_REPO   HF Model Hub repo containing trained weights
#                   e.g. --build-arg HF_MODEL_REPO=myuser/guide-models
#
# Runtime secrets (set via HF Spaces Secrets UI — never bake into image):
#   ANTHROPIC_API_KEY
#
# Layer order (least -> most frequently changing):
#   OS packages -> Python deps -> spaCy model -> app source -> model weights

FROM python:3.11-slim

# System dependencies (installed as root; apt lists removed in the same layer
# so they do not persist in the image)
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Hugging Face Spaces runs Docker containers as UID 1000 (see the Spaces
# Docker documentation). Create that user up front and hand it /app so the
# application and the downloaded weights are writable at runtime instead of
# being owned by root.
RUN useradd --create-home --uid 1000 user \
    && chown user:user /app
ENV HOME=/home/user

# Python dependencies — installed before copying full source to leverage cache.
# Still running as root here, so packages land in the system site-packages.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# spaCy language model (~750 MB) — required by Presidio privacy layer
RUN python -m spacy download en_core_web_lg

# Copy application source before downloading models.
# Models are gitignored so COPY does not include them — but it does create
# the /app/models/ directory entry. Downloading AFTER COPY ensures the
# overlay filesystem sees models/ files in the top-most layer and does not
# hide them behind an opaque whiteout created by a later COPY.
COPY --chown=user:user . .

# Everything from here on (model download and the running app) is unprivileged.
# Switching before the download means the weights are created user-owned,
# avoiding a separate `chown -R` layer that would duplicate them.
USER user

# Download trained model weights from HF Model Hub at build time
# (avoids cold-start delay for visitors; weights land at /app/models/)
ARG HF_MODEL_REPO

# Fail fast with a readable message when the build arg was not supplied.
RUN test -n "${HF_MODEL_REPO}" || (echo "ERROR: HF_MODEL_REPO build arg is not set" && exit 1)

# The repo id is handed to Python as an argument (sys.argv[1]) rather than
# being spliced into the Python source, so quotes or backslashes in the value
# cannot break or alter the script. The trailing `find` only lists what was
# downloaded (minus the hub's .cache bookkeeping) for the build log.
RUN python -c "import sys; from huggingface_hub import snapshot_download; snapshot_download(sys.argv[1], repo_type='model', local_dir='models')" "${HF_MODEL_REPO}" \
    && echo "=== models dir ===" \
    && find /app/models -not -path '*/.cache/*' | sort

# HF Spaces exposes port 7860 (Gradio); FastAPI runs internally on 8000
EXPOSE 7860

# Start both servers; models are already present so training is skipped.
# NOTE(review): --download-models is still passed even though the weights are
# fetched at build time — presumably start.py treats it as a no-op when
# models/ is already populated; confirm against start.py.
CMD ["python", "start.py", "--download-models", "--no-train"]