# =============================================================================
# Dockerfile.hf — SecureAgentRAG backend for Hugging Face Spaces (CPU Basic).
# =============================================================================
# Two-stage build keeps the runtime image lean. The HF Space free tier is
# CPU-only with 16 GB RAM and ~50 GB ephemeral disk, so we target a tight
# memory footprint:
#
# - Python 3.11-slim base (~150 MB)
# - Only [api, embeddings-local] extras (no pii/Presidio, no OCR, no Phoenix,
# no Postgres, no Redis, no MCP) -- those modules are present in the source
# but their dependencies are not installed
# - cross-encoder reranker downloaded on first request (auto-cached under
# /home/user/.cache/huggingface). Skips the 2.3 GB fine-tuned checkpoint
# for the initial deploy; phase 3.2 can swap to fine_tuned once the
# reranker repo is published on HF Hub.
#
# The Space-side README.md is uploaded separately by scripts/deploy_hf_space.py
# with a YAML frontmatter declaring sdk=docker + app_port=7860.
# =============================================================================
# --- builder ----------------------------------------------------------------
FROM python:3.11-slim AS builder
WORKDIR /app
# uv creates the virtualenv and installs the dependencies later in this stage.
RUN python -m pip install --no-cache-dir uv
# Only project metadata is copied into the builder: pyproject.toml drives the
# editable install and README.md is referenced as the long_description. The
# package directories themselves are stubbed out in the next step.
COPY pyproject.toml README.md ./
# Create an empty skeleton of every package directory that hatchling treats as
# the wheel root. At build time hatchling only needs the directory tree and the
# __init__.py markers to exist; the real code is copied in the runtime stage.
RUN for pkg in config core inference retrieval interfaces ingestion utils evaluation app; do \
        mkdir -p "$pkg" && touch "$pkg/__init__.py" || exit 1; \
    done
# The [pii] extra is left out on purpose. The regex patterns in utils/pii.py
# already cover every BYOK key shape (Groq / OpenAI / Anthropic / HF / Vercel
# / Qdrant JWT / Qdrant management). Pulling in Presidio would bring spaCy
# en_core_web_lg (~770 MB), which auto-downloads at runtime and crashes the
# container on the CPU Basic Space when the package installer is absent.
RUN uv venv /app/.venv \
    && uv pip install \
        --python /app/.venv/bin/python \
        --editable ".[api,embeddings-local]"
# --- runtime ----------------------------------------------------------------
FROM python:3.11-slim AS runtime
WORKDIR /app
# HF Spaces convention: the container runs as uid 1000 and expects a writeable
# /home/user, so create that account together with its home directory.
RUN useradd --create-home --uid 1000 user
# Shared libraries for PDF / image processing only -- no OCR / paddle. curl is
# installed for the container HEALTHCHECK probe.
# Debian 13 (trixie) replaced libgl1-mesa-glx with libgl1, and libxrender-dev
# is not needed at runtime (the runtime library is libxrender1).
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender1 \
    && rm -rf /var/lib/apt/lists/*
# Reuse the virtualenv produced by the builder stage and put its bin directory
# first on PATH so `python` and `uvicorn` resolve to it.
COPY --from=builder /app/.venv /app/.venv
ENV PATH="/app/.venv/bin:${PATH}"
# Application source, owned by the unprivileged user. Anything matched by
# .dockerignore is filtered out of the copy.
COPY --chown=user:user . /app/
# Every instruction from here on runs as the non-root account.
USER user
# Best-effort warm-up of the Hugging Face cache so the cross-encoder reranker
# is on disk before the first request.
#
# The Python command is deliberately one line of simple statements: Docker
# joins backslash-continued lines before the shell sees them, so a compound
# statement such as try/except after `;` is a SyntaxError inside `python -c`.
# Any failure (HF Hub unreachable, offline mirror, import error) makes python
# exit non-zero with a traceback on stderr, and the `|| echo` fallback keeps
# the build from failing; the model is then fetched lazily on first use.
#
# NOTE(review): SAR_RERANKER_TYPE=none further down disables the reranker by
# default, so this layer only pays off once the reranker is re-enabled --
# confirm the extra image size is wanted.
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='BAAI/bge-reranker-v2-m3', cache_dir='/home/user/.cache/huggingface/hub'); print('reranker cached')" \
    || echo "build-time reranker download failed -- will lazy-load on first request"
# --- BYOK production env ---------------------------------------------------
# Real secrets (Qdrant URL + API key, Groq key) are injected through the HF
# Space secrets panel. They use the same SAR_* env-var protocol but are NOT
# baked into the image; only mode flags and safe defaults are set here.
#
# SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR is the per-IP owner-key quota per hour.
# The cap protects the daily Groq 14 400-req budget against any single abuser.
# It was raised from 3 to 10 because 3 was too tight: visitors were blocked
# after their third query from the same IP. With ~100 distinct visitor IPs /
# day the daily Groq cap is still defended, and visitors who exceed the quota
# are nudged by the UI 429 banner to paste their own BYOK key.
ENV SAR_BYOK_MODE=true \
    SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR=10
# HF Spaces fronts the container with exactly one trusted reverse proxy that
# *appends* the peer it saw to X-Forwarded-For. SAR_BYOK_XFF_TRUSTED_HOPS=1
# tells the throttle to read the IP one hop from the right (spoof-resistant)
# rather than the attacker-appendable leftmost token, so a visitor cannot mint
# a fresh owner-key bucket per request by forging XFF.
# See interfaces/byok.py::client_ip_from_request.
ENV SAR_BYOK_XFF_TRUSTED_HOPS=1 \
    SAR_SESSION_COLLECTION_TTL_HOURS=24
# JSON array of the frontend origins; the single quotes keep the inner double
# quotes intact in the stored value.
ENV SAR_CORS_ALLOW_ORIGINS='["https://secureagentrag-web.vercel.app","https://secureagentrag.vercel.app"]'
# Cloud LLM defaults: Groq llama-3.1-8b-instant is the cheapest fast option on
# the free tier. A visitor's BYOK key overrides this per request.
ENV SAR_DEFAULT_PROVIDER=groq \
    SAR_CLOUD_PROVIDER=groq \
    SAR_LLM_MODEL=llama-3.1-8b-instant
# Embedding stack: BGE-M3 runs locally through sentence-transformers on CPU,
# which avoids Ollama entirely.
ENV SAR_EMBEDDING_BACKEND=local \
    SAR_LOCAL_EMBEDDING_MODEL=BAAI/bge-m3 \
    SAR_EMBEDDING_MODEL=bge-m3 \
    SAR_EMBEDDING_DIM=1024
# The reranker is switched off for the BYOK demo. The cross-encoder adds a
# ~600 MB model plus 4-5 s of cold-load latency, and on a 10-doc corpus with
# 1-5 visitor uploads its top-5 cut routinely drops the visitor's own chunk
# before it reaches the grader. Pure dense+sparse RRF order is good enough on
# the demo corpus; bench shows the reranker only helps materially past 200+
# documents per query. Production deploys can re-enable it with
# SAR_RERANKER_TYPE=cross_encoder or fine_tuned.
ENV SAR_RERANKER_TYPE=none \
    SAR_RERANKER_CHECKPOINT=BAAI/bge-reranker-v2-m3
# Sparse retrieval: BM25 keeps the cold path zero-dep. SPLADE would add an
# extra ~600 MB model and is skipped on free CPU Basic.
ENV SAR_SPARSE_BACKEND=bm25
# Persistence paths all live under /tmp/secureagentrag; /tmp is described here
# as the only writable area on HF Spaces.
# NOTE(review): the HF cache comment near the end of this file calls
# /home/user the persistent writable tree -- confirm which statement holds.
ENV SAR_AUDIT_LOG_DIR=/tmp/secureagentrag/audit_logs \
    SAR_CONVERSATION_DIR=/tmp/secureagentrag/conversations \
    SAR_CHECKPOINT_DB_PATH=/tmp/secureagentrag/checkpoints.sqlite \
    SAR_BM25_INDEX_PATH=/tmp/secureagentrag/bm25_index.pkl
# Multi-tenant collections route a BYOK session to documents_sess_<sid>.
ENV SAR_MULTI_TENANT_COLLECTIONS=true
# Pipeline safety: BYOK uploads can push the candidate set up to 20 chunks
# (top_k=10 from base + top_k=10 from session), and the grader scores them one
# Groq call at a time. 180 s leaves headroom for the reranker cold load plus
# Groq free-tier rate-limit waits on a fresh boot.
ENV SAR_REQUEST_TIMEOUT_S=180
# Grader thresholds are tuned looser for the BYOK demo so short user-uploaded
# docs are not aggressively filtered out by the LLM judge. The default
# 0.7 / 0.5 produced too many "all docs irrelevant" refusals on this small
# corpus; 0.55 / 0.3 keeps the corrective-RAG retry loop active but stops the
# demo from refusing on edge-case wording.
ENV SAR_RELEVANCE_THRESHOLD=0.55 \
    SAR_RELEVANCE_RETRY_THRESHOLD=0.3
# Cap corrective-RAG retries: further rewrites stack Groq calls without
# meaningfully improving recall on a 10-doc corpus and just chew through the
# SLO.
# NOTE(review): this comment used to say "two refines is enough" while the
# value is 1 -- confirm whether 1 is the intended cap.
ENV SAR_MAX_RETRIES=1
# RAG fusion costs 1 extra Groq call per chat to generate N query
# reformulations, plus N parallel Qdrant searches. That is useless on a 10-doc
# demo corpus where the original query already retrieves the right chunks, so
# it is disabled here to cut the Groq call count.
ENV SAR_RAG_FUSION_ENABLED=false
# The Groq model is pinned to llama-3.1-8b-instant for the demo. The default
# 70b-versatile model hits the 30 RPM cap faster (heavier generation = slower
# throughput), while the 8b model finishes in ~1 s on prompts under 4k tokens
# with comparable answer quality on this small corpus.
ENV SAR_GROQ_MODEL="llama-3.1-8b-instant"
# Synth completion tokens are capped to ease the Groq free-tier TPM limit
# (6,000 tokens/min). A long answer plus a 10-chunk prompt could otherwise
# approach the per-minute token ceiling in a single chat and 429 mid-stream.
# 1024 is ample for the demo corpus; the streaming client also now retries a
# transient 429 blip.
ENV SAR_SYNTH_MAX_TOKENS="1024"
# With the reranker disabled + grader bypassed, the synth context cap
# (rerank_top_k) doubles as the doc budget into the synth prompt. It is raised
# from 5 to 10 so all retrieved chunks reach the LLM; llama-3.1-8b has a 131k
# context, so 10 chunks × 600 chars is trivial. A bigger context here is the
# easiest quality lever now that the reranker is off.
ENV SAR_RERANK_TOP_K="10"
# The faithfulness gate is off on the public BYOK demo because it makes one
# Groq call per cited sentence (typically 5-10 extra calls per answer). On the
# free-tier 30 RPM budget a single chatty answer can exhaust the bucket, and
# the next query then 429s with an empty completion. The synthesizer's own
# citation discipline (mandatory inline [N] markers + sources-only prompt) is
# strong enough for the demo. Re-enable for production deploys with a paid
# Groq tier or local Ollama.
ENV SAR_FAITHFULNESS_GATE_ENABLED=false \
    SAR_FAITHFULNESS_GATE_MODE=flag \
    SAR_FAITHFULNESS_THRESHOLD=0.7
# The security node's LLM semantic injection check false-positives on
# non-English (Arabic) queries and blocks retrieval, so it is disabled for the
# demo. The guardrails node (regex) and the security node's regex jailbreak
# patterns still run, which keeps injection defence in place while
# multilingual questions work.
ENV SAR_SECURITY_SEMANTIC_CHECK_ENABLED=false
# Cloud LLM is the only inference path on the HF Space (no Ollama), so
# HIGH-sensitivity content is unlocked for cloud synthesis. The frontend warns
# the visitor with a "sensitive: routed to cloud" badge.
ENV SAR_ALLOW_CLOUD_FOR_HIGH=true
# Public audit export: caps the /byok/audit response so the panel stays usable
# and never floods the page on a long-running Space.
ENV SAR_BYOK_AUDIT_MAX_ENTRIES=50
# Force UTF-8 everywhere. The HF Spaces base image can default to a C/POSIX
# (ASCII) locale, which mangles non-ASCII request text: Arabic queries arrived
# as "????" and embedded to garbage, so retrieval never matched the Arabic
# corpus. PYTHONUTF8=1 plus a UTF-8 locale make Python handle Arabic/RTL
# correctly.
ENV PYTHONUTF8=1 \
    PYTHONIOENCODING=utf-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
# Logging
ENV SAR_LOG_LEVEL=INFO
# The HF cache lives under the user home, described here as the only
# persistent writable tree across Space restarts on CPU Basic.
# NOTE(review): the persistence-path comment above says /tmp is the only
# writable area -- confirm which statement holds.
# NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE in
# favour of HF_HOME -- confirm the second variable is still needed.
ENV HF_HOME=/home/user/.cache/huggingface \
    TRANSFORMERS_CACHE=/home/user/.cache/huggingface/hub
EXPOSE 7860
# Probe the /healthz endpoint on the app port every 30 s. The first 60 s are
# a start-up grace period, and three consecutive failures mark the container
# unhealthy.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -fsS http://localhost:7860/healthz || exit 1
# Start the API with a single uvicorn worker -- on CPU Basic two workers
# thrash the memory. Exec (JSON-array) form makes uvicorn the container's main
# process; it listens on all interfaces on port 7860, the port EXPOSEd and
# probed by the HEALTHCHECK. Access logging is turned off.
CMD ["uvicorn", "interfaces.api:app", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--workers", "1", \
     "--timeout-keep-alive", "30", \
     "--no-access-log"]