# =============================================================================
# Dockerfile.hf — SecureAgentRAG backend for Hugging Face Spaces (CPU Basic).
# =============================================================================
# Two-stage build keeps the runtime image lean. The HF Space free tier is
# CPU-only with 16 GB RAM and ~50 GB ephemeral disk, so we target a tight
# memory footprint:
#
#   - Python 3.11-slim base (~150 MB)
#   - Only the [api, embeddings-local] extras are installed (no PII/Presidio,
#     no OCR, no Phoenix, no Postgres, no Redis, no MCP) -- those modules are
#     present in the source but their dependencies are not installed
#   - cross-encoder reranker cached under /home/user/.cache/huggingface
#     (a build step tries to pre-fetch it; otherwise it is downloaded on
#     first request). Skips the 2.3 GB fine-tuned checkpoint for the initial
#     deploy; phase 3.2 can swap to fine_tuned once the reranker repo is
#     published on HF Hub.
#
# The Space-side README.md is uploaded separately by scripts/deploy_hf_space.py
# with a YAML frontmatter declaring sdk=docker + app_port=7860.
# =============================================================================

# --- builder ----------------------------------------------------------------
FROM python:3.11-slim AS builder

WORKDIR /app

# uv creates the virtualenv and resolves the project dependencies below.
RUN pip install --no-cache-dir uv

# uv needs pyproject.toml plus a source tree to build the editable install;
# README.md is referenced by the project metadata as the long_description.
COPY pyproject.toml README.md ./

# Create the package directories that hatchling treats as the wheel root.
# Only the directory tree has to exist at build time so hatchling can scan
# for __init__.py files; the real code is copied in the runtime stage.
RUN set -eu; \
    for pkg in config core inference retrieval interfaces ingestion utils evaluation app; do \
        mkdir -p "$pkg"; \
        touch "$pkg/__init__.py"; \
    done

# The [pii] extra is skipped on purpose: the regex patterns in utils/pii.py
# already cover every BYOK key shape (Groq / OpenAI / Anthropic / HF / Vercel
# / Qdrant JWT / Qdrant management). Adding Presidio would pull spaCy
# en_core_web_lg (~770 MB), which auto-downloads at runtime and crashes the
# container on the CPU Basic Space when the package installer is absent.
RUN uv venv /app/.venv \
    && uv pip install \
        --python /app/.venv/bin/python \
        -e ".[api,embeddings-local]"

# --- runtime ----------------------------------------------------------------
FROM python:3.11-slim AS runtime

WORKDIR /app

# HF Spaces convention: run as uid 1000 with a writeable /home/user.
RUN useradd -m -u 1000 user

# System libraries for PDF / image processing only -- no OCR / paddle.
# Debian 12+ (trixie) renamed libgl1-mesa-glx -> libgl1, and libxrender-dev
# is no longer needed at runtime (the runtime library is libxrender1).
# curl is used by the container HEALTHCHECK.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender1 \
    && rm -rf /var/lib/apt/lists/*

# Bring the virtualenv over from the builder stage and put it first on PATH.
COPY --from=builder /app/.venv /app/.venv
ENV PATH="/app/.venv/bin:$PATH"

# Copy the application source; anything matched by .dockerignore is left out.
COPY --chown=user:user . /app

USER user

# Pre-populate the HF cache so the cross-encoder lives on disk before the
# first request. Defensive: never fails the build -- if HF Hub is unreachable
# during build (offline mirrors etc.) the cache is populated on first query.
# NOTE: the inline Python must stay a chain of *simple* statements. Dockerfile
# line continuations join the whole program into ONE physical line, and a
# compound statement such as try/except after a `;` is a SyntaxError there --
# the interpreter would exit before downloading anything and the `||`
# fallback would hide it. Failure handling is therefore left entirely to the
# shell-level `||` below, which keeps the build from failing.
# NOTE(review): this file also sets SAR_RERANKER_TYPE=none, so a successful
# prefetch enlarges the image for a model the demo does not load by default --
# confirm the prefetch is still wanted.
RUN python -c "from huggingface_hub import snapshot_download; \
snapshot_download(repo_id='BAAI/bge-reranker-v2-m3', cache_dir='/home/user/.cache/huggingface/hub'); \
print('reranker cached')" \
    || echo "build-time reranker download failed -- will lazy-load on first request"

# --- BYOK production env ---------------------------------------------------
# Real secrets (Qdrant URL + API key, Groq key) are injected via HF Space
# secrets panel -- they ride the same SAR_* env-var protocol but are NOT
# baked into the image. Only mode flags and safe defaults live here.
ENV SAR_BYOK_MODE=true

# Per-IP owner-key quota / hour. Cap protects the daily Groq 14 400-req
# budget against any single abuser. Raised from 3 -> 10 because the
# previous 3 was too tight: visitors blocked after their third query of
# the same IP. With ~100 distinct visitor IPs / day the daily Groq cap
# would still be defended; visitors who exceed the cap are nudged to
# paste their own BYOK key via the UI 429 banner.
ENV SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR=10

# HF Spaces fronts the container with exactly one trusted reverse proxy that
# *appends* the peer it saw to X-Forwarded-For. Tell the throttle to read the
# IP one hop from the right (spoof-resistant) instead of the attacker-appendable
# leftmost token, so a visitor can't mint a fresh owner-key bucket per request
# by forging XFF. See interfaces/byok.py::client_ip_from_request.
ENV SAR_BYOK_XFF_TRUSTED_HOPS=1

ENV SAR_SESSION_COLLECTION_TTL_HOURS=24
ENV SAR_CORS_ALLOW_ORIGINS='["https://secureagentrag-web.vercel.app","https://secureagentrag.vercel.app"]'

# Cloud LLM defaults -- Groq llama-3.1-8b-instant is the cheapest fast option
# on the free tier. Visitor BYOK overrides this per request.
ENV SAR_DEFAULT_PROVIDER=groq \
    SAR_CLOUD_PROVIDER=groq \
    SAR_LLM_MODEL=llama-3.1-8b-instant

# Embedding stack -- local BGE-M3 via sentence-transformers (CPU). Avoids
# Ollama entirely.
ENV SAR_EMBEDDING_BACKEND=local \
    SAR_LOCAL_EMBEDDING_MODEL=BAAI/bge-m3 \
    SAR_EMBEDDING_MODEL=bge-m3 \
    SAR_EMBEDDING_DIM=1024

# Reranker disabled for the BYOK demo. The cross-encoder adds a ~600 MB
# model + 4-5 s cold-load latency, and on a 10-doc corpus + 1-5 visitor
# uploads its top-5 cut routinely drops the visitor's own chunk before
# it reaches the grader. Pure dense+sparse RRF order is good enough on
# the demo corpus; bench shows the reranker only helps materially past
# 200+ documents per query. Re-enable for production deploys with
# SAR_RERANKER_TYPE=cross_encoder or fine_tuned.
ENV SAR_RERANKER_TYPE=none \
    SAR_RERANKER_CHECKPOINT=BAAI/bge-reranker-v2-m3

# Sparse retrieval -- BM25 keeps the cold path zero-dep; SPLADE adds an
# extra ~600 MB model and is skipped on free CPU Basic.
ENV SAR_SPARSE_BACKEND=bm25

# Persistence paths -- /tmp is the only writable area on HF Spaces.
ENV SAR_AUDIT_LOG_DIR=/tmp/secureagentrag/audit_logs \
    SAR_CONVERSATION_DIR=/tmp/secureagentrag/conversations \
    SAR_CHECKPOINT_DB_PATH=/tmp/secureagentrag/checkpoints.sqlite \
    SAR_BM25_INDEX_PATH=/tmp/secureagentrag/bm25_index.pkl

# Multi-tenant collections route BYOK session -> documents_sess_.
ENV SAR_MULTI_TENANT_COLLECTIONS=true

# Pipeline safety -- BYOK uploads can push the candidate set up to 20
# chunks (top_k=10 from base + top_k=10 from session) which the grader
# scores one Groq call at a time. 180 s leaves headroom for the
# reranker cold load + Groq free-tier rate-limit waits on a fresh boot.
ENV SAR_REQUEST_TIMEOUT_S=180

# Grader thresholds tuned looser for the BYOK demo so short user-uploaded
# docs are not aggressively filtered out by the LLM judge. The default
# 0.7 / 0.5 produced too many "all docs irrelevant" refusals on this
# small corpus; 0.55 / 0.3 keeps the corrective-RAG retry loop active
# but stops the demo from refusing on edge-case wording.
ENV SAR_RELEVANCE_THRESHOLD=0.55 \
    SAR_RELEVANCE_RETRY_THRESHOLD=0.3

# Cap corrective-RAG retries -- two refines is enough; further rewrites
# stack Groq calls without meaningfully improving recall on a 10-doc
# corpus and just chew through the SLO.
# NOTE(review): the text above says "two refines" but the value is 1 --
# confirm which one is intended.
ENV SAR_MAX_RETRIES=1

# RAG fusion fires 1 extra Groq call per chat to generate N query
# reformulations + N parallel Qdrant searches. Useless on a 10-doc
# demo corpus where the original query already retrieves the right
# chunks; disabled here to cut Groq call count.
ENV SAR_RAG_FUSION_ENABLED=false

# Pin the Groq model to llama-3.1-8b-instant for the demo. The
# default 70b-versatile model hits the 30 RPM cap faster (heavier
# generation = slower throughput), and the 8b model finishes in
# ~1 s on prompts under 4k tokens with comparable answer quality
# on this small corpus.
ENV SAR_GROQ_MODEL=llama-3.1-8b-instant

# Cap synth completion tokens to ease Groq free-tier TPM (6,000 tokens/min).
# A long answer + 10-chunk prompt could otherwise approach the per-minute
# token ceiling in a single chat and 429 mid-stream. 1024 is ample for the
# demo corpus; the streaming client also now retries a transient 429 blip.
ENV SAR_SYNTH_MAX_TOKENS=1024

# With the reranker disabled + grader bypassed, the synth context cap
# (rerank_top_k) doubles as the doc budget into the synth prompt. Raise
# from 5 -> 10 so all retrieved chunks reach the LLM; llama-3.1-8b has
# a 131k context so 10 chunks × 600 chars = trivial. Bigger context here
# is the easiest quality lever now that reranker is off.
ENV SAR_RERANK_TOP_K=10

# Faithfulness gate disabled on the public BYOK demo: it makes one Groq
# call per cited sentence (typically 5-10 extra calls per answer). On
# the free-tier 30 RPM budget a single chatty answer can exhaust the
# bucket and the next query 429s with an empty completion. The
# synthesizer's own citation discipline (mandatory inline [N] markers
# + sources-only prompt) is strong enough for the demo. Re-enable for
# production deploys with a paid Groq tier or local Ollama.
ENV SAR_FAITHFULNESS_GATE_ENABLED=false \
    SAR_FAITHFULNESS_GATE_MODE=flag \
    SAR_FAITHFULNESS_THRESHOLD=0.7

# The security node's LLM semantic injection check false-positives on
# non-English (Arabic) queries and blocks retrieval. Disable it for the demo:
# the guardrails node (regex) + the security node's regex jailbreak patterns
# still run, so injection defence stays while multilingual questions work.
ENV SAR_SECURITY_SEMANTIC_CHECK_ENABLED=false

# Cloud LLM is the only inference path on the HF Space (no Ollama). Unlock
# HIGH-sensitivity content for cloud synthesis -- the frontend warns the
# visitor with a "sensitive: routed to cloud" badge.
ENV SAR_ALLOW_CLOUD_FOR_HIGH=true

# Public audit export -- caps the /byok/audit response so the panel is
# usable but never floods the page on a long-running Space.
ENV SAR_BYOK_AUDIT_MAX_ENTRIES=50

# Force UTF-8 everywhere. HF Spaces' base image can default to a C/POSIX
# (ASCII) locale, which mangles non-ASCII request text — Arabic queries arrived
# as "????" and embedded to garbage, so retrieval never matched the Arabic
# corpus. PYTHONUTF8=1 + a UTF-8 locale make Python handle Arabic/RTL correctly.
ENV PYTHONUTF8=1 \
    PYTHONIOENCODING=utf-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Logging
ENV SAR_LOG_LEVEL=INFO

# HF cache lives under the user home which is the only persistent writable
# tree across Space restarts on CPU Basic.
ENV HF_HOME=/home/user/.cache/huggingface \
    TRANSFORMERS_CACHE=/home/user/.cache/huggingface/hub

# Port the API listens on (matches the --port passed to uvicorn below).
EXPOSE 7860

# Probe the /healthz endpoint with curl; a non-zero exit marks the probe failed.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl --fail --silent --show-error http://localhost:7860/healthz || exit 1

# uvicorn with 1 worker -- on CPU Basic two workers thrash the memory.
CMD ["uvicorn", "interfaces.api:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "30", "--no-access-log"]