secureagentrag-api / Dockerfile
LeomordKaly's picture
deploy: rename Dockerfile.hf -> Dockerfile on the Space side
994f8e8 verified
# =============================================================================
# Dockerfile.hf — SecureAgentRAG backend for Hugging Face Spaces (CPU Basic).
# =============================================================================
# Two-stage build keeps the runtime image lean. The HF Space free tier is
# CPU-only with 16 GB RAM and ~50 GB ephemeral disk, so we target a tight
# memory footprint:
#
# - Python 3.11-slim base (~150 MB)
# - Only [api, embeddings-local] extras (no PII/Presidio, no OCR, no Phoenix,
# no Postgres, no Redis, no MCP) -- those modules are present in the source
# but their dependencies are not installed
# - cross-encoder reranker downloaded on first request (auto-cached under
# /home/user/.cache/huggingface). Skips the 2.3 GB fine-tuned checkpoint
# for the initial deploy; phase 3.2 can swap to fine_tuned once the
# reranker repo is published on HF Hub.
#
# The Space-side README.md is uploaded separately by scripts/deploy_hf_space.py
# with a YAML frontmatter declaring sdk=docker + app_port=7860.
# =============================================================================
# --- builder ----------------------------------------------------------------
FROM python:3.11-slim AS builder
WORKDIR /app
# uv drives the virtualenv creation and dependency install further down.
RUN python -m pip install --no-cache-dir uv
# Only the project manifest and the README enter the builder: pyproject.toml
# describes the package for the editable install, and README.md is referenced
# from it as the long_description.
COPY pyproject.toml README.md ./
# Create an empty stub for every package directory hatchling treats as the
# wheel root. Only the directory tree with its __init__.py files has to exist
# at build time so hatchling can scan it; the real code arrives in the
# runtime stage.
RUN set -e; \
    for pkg in config core inference retrieval interfaces ingestion utils evaluation app; do \
        mkdir -p "$pkg"; \
        touch "$pkg/__init__.py"; \
    done
# The [pii] extra is deliberately left out. The regex patterns in utils/pii.py
# already cover every BYOK key shape (Groq / OpenAI / Anthropic / HF / Vercel
# / Qdrant JWT / Qdrant management), whereas Presidio would pull in spaCy
# en_core_web_lg (~770 MB), which auto-downloads at runtime and crashes the
# container on the CPU Basic Space when the package installer is absent.
#
# The install is editable, so the virtualenv presumably refers back to /app
# instead of holding a copy of the package; the runtime stage therefore has to
# place the real source at that same path.
RUN uv venv /app/.venv && \
    uv pip install --python /app/.venv/bin/python --editable '.[api,embeddings-local]'
# --- runtime ----------------------------------------------------------------
FROM python:3.11-slim AS runtime
WORKDIR /app
# HF Spaces convention: the container runs as uid 1000 and expects a writeable
# /home/user, so create that account together with its home directory.
RUN useradd --create-home --uid 1000 user
# System libraries for PDF / image processing only -- no OCR / paddle. curl is
# installed for the container HEALTHCHECK probe.
# Newer Debian releases (12 "bookworm" / 13 "trixie") ship libgl1 in place of
# the old libgl1-mesa-glx name, and only the runtime library libxrender1 is
# needed here, not libxrender-dev.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender1 && \
    rm -rf /var/lib/apt/lists/*
# Reuse the virtualenv assembled in the builder stage and put its bin/
# directory first on PATH so python / uvicorn resolve to it.
COPY --from=builder /app/.venv /app/.venv
ENV PATH="/app/.venv/bin:${PATH}"
# Application source, owned by the unprivileged user. Anything matched by
# .dockerignore never enters the build context and is therefore not copied.
COPY --chown=user:user . /app
# Everything from here on (build steps and the running service) is non-root.
USER user
# Pre-populate the HF cache so the cross-encoder lives on disk before the
# first request. Defensive: never fails the build -- if HF Hub is unreachable
# during build (offline mirrors etc.) the trailing `|| echo` swallows the
# non-zero exit and the cache is populated on first query instead.
#
# The Python program must stay a sequence of simple statements: the
# backslash continuations collapse it into ONE logical line, and a compound
# statement such as `try:` cannot follow `;` on the same line (SyntaxError).
# Error handling is therefore done by the shell, not by an inline try/except;
# on failure Python prints its traceback to stderr and exits non-zero.
#
# NOTE(review): SAR_RERANKER_TYPE is set to `none` further down, so this layer
# only pays off once the reranker is re-enabled; the model weights add to the
# image size -- confirm the prefetch is still wanted.
RUN python -c "from huggingface_hub import snapshot_download; \
snapshot_download(repo_id='BAAI/bge-reranker-v2-m3', cache_dir='/home/user/.cache/huggingface/hub'); \
print('reranker cached')" \
    || echo "build-time reranker download failed -- will lazy-load on first request"
# --- BYOK production env ---------------------------------------------------
# Only mode flags and safe defaults are baked in below. Real secrets (Qdrant
# URL + API key, Groq key) ride the same SAR_* env-var protocol but are
# injected through the HF Space secrets panel and are NOT part of the image.
ENV SAR_BYOK_MODE=true
# Owner-key quota per client IP and hour. The cap protects the daily Groq
# budget of 14 400 requests against any single abuser. It was raised from 3
# to 10 because 3 was too tight: visitors were blocked after their third
# query from the same IP. With ~100 distinct visitor IPs / day the daily Groq
# cap would still be defended; visitors who exceed the quota are nudged by
# the UI 429 banner to paste their own BYOK key.
ENV SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR=10
# HF Spaces puts exactly one trusted reverse proxy in front of the container,
# and that proxy *appends* the peer it saw to X-Forwarded-For. The throttle is
# told to take the IP one hop from the right (spoof-resistant) rather than the
# leftmost token, which an attacker can set freely -- otherwise a visitor
# could mint a fresh owner-key bucket per request by forging XFF.
# See interfaces/byok.py::client_ip_from_request.
ENV SAR_BYOK_XFF_TRUSTED_HOPS=1
# Session-collection TTL (hours) and the browser origins allowed by CORS.
ENV SAR_SESSION_COLLECTION_TTL_HOURS=24 \
    SAR_CORS_ALLOW_ORIGINS='["https://secureagentrag-web.vercel.app","https://secureagentrag.vercel.app"]'
# Cloud LLM defaults. Groq llama-3.1-8b-instant is the cheapest fast option on
# the free tier; a visitor's BYOK key overrides these per request.
ENV SAR_DEFAULT_PROVIDER=groq \
    SAR_CLOUD_PROVIDER=groq \
    SAR_LLM_MODEL=llama-3.1-8b-instant
# Embedding stack: local BGE-M3 through sentence-transformers on CPU, which
# avoids Ollama entirely.
ENV SAR_EMBEDDING_BACKEND=local \
    SAR_LOCAL_EMBEDDING_MODEL=BAAI/bge-m3 \
    SAR_EMBEDDING_MODEL=bge-m3 \
    SAR_EMBEDDING_DIM=1024
# The reranker is switched off for the BYOK demo. The cross-encoder costs a
# ~600 MB model plus 4-5 s of cold-load latency, and with a 10-doc corpus and
# 1-5 visitor uploads its top-5 cut routinely drops the visitor's own chunk
# before it reaches the grader. Plain dense+sparse RRF ordering is good enough
# on the demo corpus; the bench shows the reranker only helps materially past
# 200+ documents per query. For production deploys re-enable it with
# SAR_RERANKER_TYPE=cross_encoder or fine_tuned.
ENV SAR_RERANKER_TYPE=none \
    SAR_RERANKER_CHECKPOINT=BAAI/bge-reranker-v2-m3
# Sparse retrieval: BM25 keeps the cold path zero-dep. SPLADE would add an
# extra ~600 MB model and is skipped on the free CPU Basic tier.
ENV SAR_SPARSE_BACKEND=bm25
# Persistence paths. Application state (audit logs, conversations, the
# checkpoint database and the BM25 index) is kept under /tmp, which the
# unprivileged Space user can write to.
ENV SAR_AUDIT_LOG_DIR=/tmp/secureagentrag/audit_logs \
    SAR_CONVERSATION_DIR=/tmp/secureagentrag/conversations \
    SAR_CHECKPOINT_DB_PATH=/tmp/secureagentrag/checkpoints.sqlite \
    SAR_BM25_INDEX_PATH=/tmp/secureagentrag/bm25_index.pkl
# With multi-tenant collections on, each BYOK session is routed to its own
# documents_sess_<sid> collection.
ENV SAR_MULTI_TENANT_COLLECTIONS=true
# Pipeline safety. BYOK uploads can grow the candidate set to 20 chunks
# (top_k=10 from the base collection + top_k=10 from the session), and the
# grader scores them one Groq call at a time. 180 s leaves headroom for the
# reranker cold load plus Groq free-tier rate-limit waits on a fresh boot.
ENV SAR_REQUEST_TIMEOUT_S=180
# Grader thresholds are looser than the defaults for the BYOK demo, so that
# short user-uploaded docs are not aggressively filtered out by the LLM judge.
# The default 0.7 / 0.5 produced too many "all docs irrelevant" refusals on
# this small corpus; 0.55 / 0.3 keeps the corrective-RAG retry loop active
# while stopping the demo from refusing on edge-case wording.
ENV SAR_RELEVANCE_THRESHOLD=0.55 \
    SAR_RELEVANCE_RETRY_THRESHOLD=0.3
# Cap corrective-RAG retries: further rewrites stack Groq calls without
# meaningfully improving recall on a 10-doc corpus and just chew through the
# SLO.
# NOTE(review): this comment previously described two refines while the value
# is 1 -- confirm which of the two is intended.
ENV SAR_MAX_RETRIES=1
# RAG fusion spends 1 extra Groq call per chat to generate N query
# reformulations, followed by N parallel Qdrant searches. That is useless on a
# 10-doc demo corpus where the original query already retrieves the right
# chunks, so it is disabled here to cut the Groq call count.
ENV SAR_RAG_FUSION_ENABLED=false
# Pin the Groq model to llama-3.1-8b-instant for the demo. The default
# 70b-versatile model hits the 30 RPM cap faster (heavier generation = slower
# throughput), while the 8b model finishes in ~1 s on prompts under 4k tokens
# with comparable answer quality on this small corpus.
ENV SAR_GROQ_MODEL=llama-3.1-8b-instant
# Cap synthesis completion tokens to ease the Groq free-tier TPM limit
# (6,000 tokens/min). A long answer on top of a 10-chunk prompt could
# otherwise approach the per-minute token ceiling within a single chat and
# 429 mid-stream. 1024 is ample for the demo corpus; the streaming client also
# retries a transient 429 blip.
ENV SAR_SYNTH_MAX_TOKENS=1024
# With the reranker disabled and the grader bypassed, the synth context cap
# (rerank_top_k) doubles as the document budget for the synth prompt. It is
# raised from 5 to 10 so that all retrieved chunks reach the LLM;
# llama-3.1-8b has a 131k context, so 10 chunks × 600 chars is trivial. A
# bigger context is the easiest quality lever now that the reranker is off.
ENV SAR_RERANK_TOP_K=10
# The faithfulness gate is off on the public BYOK demo because it makes one
# Groq call per cited sentence (typically 5-10 extra calls per answer). On the
# free-tier 30 RPM budget a single chatty answer can exhaust the bucket, and
# the next query then 429s with an empty completion. The synthesizer's own
# citation discipline (mandatory inline [N] markers + sources-only prompt) is
# strong enough for the demo. Re-enable for production deploys that have a
# paid Groq tier or local Ollama.
ENV SAR_FAITHFULNESS_GATE_ENABLED=false \
    SAR_FAITHFULNESS_GATE_MODE=flag \
    SAR_FAITHFULNESS_THRESHOLD=0.7
# The security node's LLM-based semantic injection check false-positives on
# non-English (Arabic) queries and blocks retrieval, so it is disabled for the
# demo. The guardrails node (regex) and the security node's regex jailbreak
# patterns still run, so injection defence stays in place while multilingual
# questions work.
ENV SAR_SECURITY_SEMANTIC_CHECK_ENABLED=false
# Cloud LLM is the only inference path on the HF Space (no Ollama), so
# HIGH-sensitivity content is unlocked for cloud synthesis. The frontend warns
# the visitor with a "sensitive: routed to cloud" badge.
ENV SAR_ALLOW_CLOUD_FOR_HIGH=true
# Public audit export: caps the /byok/audit response so the panel stays
# usable and never floods the page on a long-running Space.
ENV SAR_BYOK_AUDIT_MAX_ENTRIES=50
# Force UTF-8 everywhere. The HF Spaces base image can default to a C/POSIX
# (ASCII) locale, which mangles non-ASCII request text: Arabic queries arrived
# as "????" and embedded to garbage, so retrieval never matched the Arabic
# corpus. PYTHONUTF8=1 together with a UTF-8 locale makes Python handle
# Arabic/RTL text correctly.
ENV PYTHONUTF8=1 \
    PYTHONIOENCODING=utf-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
# Logging
ENV SAR_LOG_LEVEL=INFO
# The HF cache lives under the user's home directory.
# NOTE(review): whether this tree survives Space restarts on CPU Basic depends
# on the Space's storage configuration -- confirm.
# NOTE(review): newer transformers releases appear to treat TRANSFORMERS_CACHE
# as deprecated in favour of HF_HOME -- verify against the installed version
# before relying on it.
ENV HF_HOME=/home/user/.cache/huggingface \
    TRANSFORMERS_CACHE=/home/user/.cache/huggingface/hub
# Port the API listens on (matches the --port passed to uvicorn below).
EXPOSE 7860
# Liveness probe: curl exits non-zero on an HTTP error status (-f), stays
# quiet on success (-s) and still prints the error when it fails (-S).
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -fsS http://localhost:7860/healthz || exit 1
# uvicorn with a single worker -- on CPU Basic two workers thrash the memory.
CMD ["uvicorn", "interfaces.api:app", \
     "--host", "0.0.0.0", "--port", "7860", \
     "--workers", "1", "--timeout-keep-alive", "30", "--no-access-log"]