Spaces:

LeomordKaly
/

secureagentrag-api

Running

App Files Files Community

secureagentrag-api / Dockerfile.hf

LeomordKaly

deploy: phase 3 BYOK backend (Dockerfile.hf, FastAPI on 7860)

7992ecd verified 9 days ago

raw

history blame contribute delete

11.7 kB

	# =============================================================================
	# Dockerfile.hf — SecureAgentRAG backend for Hugging Face Spaces (CPU Basic).
	# =============================================================================
	# Two-stage build keeps the runtime image lean. The HF Space free tier is
	# CPU-only with 16 GB RAM and ~50 GB ephemeral disk, so we target a tight
	# memory footprint:
	#
	# - Python 3.11-slim base (~150 MB)
	# - Only [api, embeddings-local] extras (no PII/Presidio, no OCR, no Phoenix,
	# no Postgres, no Redis, no MCP) -- those modules are present in the source
	# but their dependencies are not installed
	# - cross-encoder reranker downloaded on first request (auto-cached under
	# /home/user/.cache/huggingface). Skips the 2.3 GB fine-tuned checkpoint
	# for the initial deploy; phase 3.2 can swap to fine_tuned once the
	# reranker repo is published on HF Hub.
	#
	# The Space-side README.md is uploaded separately by scripts/deploy_hf_space.py
	# with a YAML frontmatter declaring sdk=docker + app_port=7860.
	# =============================================================================

	# --- builder ----------------------------------------------------------------
	# Stage 1: resolve the Python dependencies into /app/.venv. Only the
	# virtualenv is carried over to the runtime stage.
	FROM python:3.11-slim AS builder

	WORKDIR /app

	# uv creates the virtualenv and performs the dependency install below.
	RUN pip install --no-cache-dir uv

	# Project metadata only: pyproject.toml drives the editable install and
	# README.md is referenced from it as the long_description.
	COPY pyproject.toml README.md ./

	# hatchling treats these directories as the wheel root and scans them for
	# __init__.py files, so an empty stub per package is enough at build time.
	# The real sources are copied in by the runtime stage.
	RUN for pkg in config core inference retrieval interfaces ingestion utils evaluation app; do \
	mkdir -p "$pkg" && touch "$pkg/__init__.py" || exit 1; \
	done

	# The [pii] extra is deliberately left out. The regex patterns in
	# utils/pii.py already cover every BYOK key shape (Groq / OpenAI / Anthropic
	# / HF / Vercel / Qdrant JWT / Qdrant management). Presidio would pull in
	# spaCy en_core_web_lg (~770 MB), which auto-downloads at runtime and
	# crashes the container on the CPU Basic Space when the package installer
	# is absent.
	RUN uv venv /app/.venv \
	&& uv pip install --python /app/.venv/bin/python --editable ".[api,embeddings-local]"

	# --- runtime ----------------------------------------------------------------
	# Stage 2: the image that actually runs on the Space.
	FROM python:3.11-slim AS runtime

	WORKDIR /app

	# HF Spaces convention: the container runs as uid 1000 and needs a
	# writeable home directory at /home/user.
	RUN useradd --create-home --uid 1000 user

	# Shared libraries for PDF / image processing only -- no OCR / paddle.
	# curl is needed by the HEALTHCHECK probe further down.
	# Debian 13 (trixie) replaced libgl1-mesa-glx with libgl1, and only the
	# runtime library libxrender1 is required (not libxrender-dev).
	RUN apt-get update \
	&& apt-get install -y --no-install-recommends \
	curl \
	libgl1 \
	libglib2.0-0 \
	libsm6 \
	libxext6 \
	libxrender1 \
	&& rm -rf /var/lib/apt/lists/*

	# Reuse the virtualenv built in the builder stage and put its bin/ first
	# on PATH so `python` and `uvicorn` resolve to the venv copies.
	COPY --from=builder /app/.venv /app/.venv
	ENV PATH="/app/.venv/bin:$PATH"

	# Drop root before anything is written under /home/user.
	USER user

	# Pre-populate the HF cache so the cross-encoder lives on disk before the
	# first request. Best-effort: the shell fallback means an unreachable HF Hub
	# (offline mirrors etc.) never fails the build -- the model is then fetched
	# lazily on first use.
	#
	# The snippet is kept to simple statements on purpose: `python -c` receives
	# it as a single line, and a compound statement such as `try:` is a
	# SyntaxError after `;`, which previously turned this step into a no-op.
	#
	# NOTE: the image default SAR_RERANKER_TYPE=none does not load this model;
	# remove this step if image size matters more than a warm reranker cache.
	RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='BAAI/bge-reranker-v2-m3', cache_dir='/home/user/.cache/huggingface/hub'); print('reranker cached')" \
	|| echo "build-time reranker download failed -- will lazy-load on first request"

	# Copy application source last so that editing code does not invalidate the
	# model-download layer above. Files that match .dockerignore are filtered
	# out.
	COPY --chown=user:user . /app

	# --- BYOK production env ---------------------------------------------------
	# Only mode flags and safe defaults are baked into the image. The real
	# secrets (Qdrant URL + API key, Groq key) use the same SAR_* env-var
	# protocol but are injected through the HF Space secrets panel.
	ENV SAR_BYOK_MODE=true
	# Owner-key quota per IP per hour. The cap shields the daily Groq
	# 14 400-request budget from any single abuser. It was raised from 3 to 10
	# because 3 blocked visitors after their third query from the same IP.
	# With ~100 distinct visitor IPs / day the daily Groq cap is still
	# defended; visitors who go over are nudged by the UI 429 banner to paste
	# their own BYOK key.
	ENV SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR=10
	# HF Spaces puts exactly one trusted reverse proxy in front of the
	# container, and that proxy appends the peer it saw to X-Forwarded-For.
	# Reading the IP one hop from the right is spoof-resistant, unlike the
	# attacker-appendable leftmost token, so a visitor cannot mint a fresh
	# owner-key bucket per request by forging XFF.
	# See interfaces/byok.py::client_ip_from_request.
	ENV SAR_BYOK_XFF_TRUSTED_HOPS=1 \
	SAR_SESSION_COLLECTION_TTL_HOURS=24
	# Browser origins allowed by CORS.
	ENV SAR_CORS_ALLOW_ORIGINS='["https://secureagentrag-web.vercel.app","https://secureagentrag.vercel.app"]'

	# Cloud LLM defaults. Groq llama-3.1-8b-instant is the cheapest fast option
	# on the free tier; a visitor's BYOK key overrides this per request.
	ENV SAR_DEFAULT_PROVIDER=groq \
	SAR_CLOUD_PROVIDER=groq \
	SAR_LLM_MODEL=llama-3.1-8b-instant

	# Embeddings: local BGE-M3 through sentence-transformers on CPU, so Ollama
	# is not needed at all.
	ENV SAR_EMBEDDING_BACKEND=local \
	SAR_LOCAL_EMBEDDING_MODEL=BAAI/bge-m3 \
	SAR_EMBEDDING_MODEL=bge-m3 \
	SAR_EMBEDDING_DIM=1024

	# Reranker is off for the BYOK demo. The cross-encoder costs a ~600 MB
	# model plus 4-5 s of cold-load latency, and on a 10-doc corpus with 1-5
	# visitor uploads its top-5 cut routinely drops the visitor's own chunk
	# before it reaches the grader. Plain dense+sparse RRF ordering is good
	# enough here; the bench shows the reranker only helps materially past
	# 200+ documents per query. For production deploys re-enable it with
	# SAR_RERANKER_TYPE=cross_encoder or fine_tuned.
	ENV SAR_RERANKER_TYPE=none \
	SAR_RERANKER_CHECKPOINT=BAAI/bge-reranker-v2-m3

	# Sparse retrieval: BM25 keeps the cold path dependency-free. SPLADE would
	# add another ~600 MB model and is skipped on free CPU Basic.
	ENV SAR_SPARSE_BACKEND=bm25

	# Persistence paths, all under /tmp (described by the original note as the
	# only writable area on HF Spaces).
	ENV SAR_AUDIT_LOG_DIR=/tmp/secureagentrag/audit_logs \
	SAR_CONVERSATION_DIR=/tmp/secureagentrag/conversations \
	SAR_CHECKPOINT_DB_PATH=/tmp/secureagentrag/checkpoints.sqlite \
	SAR_BM25_INDEX_PATH=/tmp/secureagentrag/bm25_index.pkl

	# Multi-tenant collections: each BYOK session is routed to
	# documents_sess_<sid>.
	ENV SAR_MULTI_TENANT_COLLECTIONS=true

	# Request timeout. BYOK uploads can grow the candidate set to 20 chunks
	# (top_k=10 from the base corpus + top_k=10 from the session), and the
	# grader scores them one Groq call at a time. 180 s leaves headroom for
	# the reranker cold load and Groq free-tier rate-limit waits on a fresh
	# boot.
	ENV SAR_REQUEST_TIMEOUT_S=180

	# Grader thresholds, loosened for the BYOK demo so the LLM judge does not
	# aggressively filter short user-uploaded docs. The defaults (0.7 / 0.5)
	# produced too many "all docs irrelevant" refusals on this small corpus;
	# 0.55 / 0.3 keeps the corrective-RAG retry loop active without refusing
	# on edge-case wording.
	ENV SAR_RELEVANCE_THRESHOLD=0.55 \
	SAR_RELEVANCE_RETRY_THRESHOLD=0.3
	# Cap on corrective-RAG retries. Extra rewrites stack Groq calls without
	# meaningfully improving recall on a 10-doc corpus and just eat into the
	# SLO. NOTE(review): the earlier comment spoke of "two refines" while the
	# value is 1 -- confirm how the retry loop counts attempts.
	ENV SAR_MAX_RETRIES=1
	# RAG fusion costs one extra Groq call per chat (to generate N query
	# reformulations) plus N parallel Qdrant searches. On a 10-doc demo corpus
	# the original query already retrieves the right chunks, so it is turned
	# off to reduce the Groq call count.
	ENV SAR_RAG_FUSION_ENABLED=false
	# Pin the Groq model to llama-3.1-8b-instant for the demo. The default
	# 70b-versatile model reaches the 30 RPM cap sooner (heavier generation,
	# slower throughput), while the 8b model finishes in ~1 s on prompts under
	# 4k tokens with comparable answer quality on this small corpus.
	ENV SAR_GROQ_MODEL=llama-3.1-8b-instant
	# Limit synth completion tokens to ease the Groq free-tier TPM budget
	# (6,000 tokens/min). A long answer on top of a 10-chunk prompt could
	# otherwise approach the per-minute ceiling within a single chat and 429
	# mid-stream. 1024 is ample for the demo corpus; the streaming client also
	# retries a transient 429.
	ENV SAR_SYNTH_MAX_TOKENS=1024
	# With the reranker disabled and the grader bypassed, rerank_top_k doubles
	# as the document budget for the synth prompt. Raised from 5 to 10 so every
	# retrieved chunk reaches the LLM: llama-3.1-8b has a 131k context, so
	# 10 chunks x 600 chars is trivial. A larger context is the easiest quality
	# lever while the reranker is off.
	ENV SAR_RERANK_TOP_K=10
	# Faithfulness gate is off on the public BYOK demo. It issues one Groq call
	# per cited sentence (typically 5-10 extra calls per answer); on the
	# free-tier 30 RPM budget one chatty answer can drain the bucket, and the
	# next query 429s with an empty completion. The synthesizer's own citation
	# discipline (mandatory inline [N] markers + sources-only prompt) is strong
	# enough for the demo. Re-enable for production deploys with a paid Groq
	# tier or local Ollama.
	ENV SAR_FAITHFULNESS_GATE_ENABLED=false \
	SAR_FAITHFULNESS_GATE_MODE=flag \
	SAR_FAITHFULNESS_THRESHOLD=0.7

	# The security node's LLM-based semantic injection check false-positives on
	# non-English (Arabic) queries and blocks retrieval, so it is disabled for
	# the demo. The regex guardrails node and the security node's regex
	# jailbreak patterns still run, which keeps injection defence in place
	# while multilingual questions work.
	ENV SAR_SECURITY_SEMANTIC_CHECK_ENABLED=false

	# A cloud LLM is the only inference path on the HF Space (no Ollama), so
	# HIGH-sensitivity content is allowed through to cloud synthesis. The
	# frontend warns the visitor with a "sensitive: routed to cloud" badge.
	ENV SAR_ALLOW_CLOUD_FOR_HIGH=true

	# Public audit export: bounds the /byok/audit response so the panel stays
	# usable and never floods the page on a long-running Space.
	ENV SAR_BYOK_AUDIT_MAX_ENTRIES=50

	# UTF-8 everywhere. The HF Spaces base image can default to a C/POSIX
	# (ASCII) locale, which mangles non-ASCII request text: Arabic queries
	# arrived as "????" and embedded to garbage, so retrieval never matched the
	# Arabic corpus. PYTHONUTF8=1 together with a UTF-8 locale lets Python
	# handle Arabic/RTL text correctly.
	ENV PYTHONUTF8=1 \
	PYTHONIOENCODING=utf-8 \
	LANG=C.UTF-8 \
	LC_ALL=C.UTF-8

	# Logging
	ENV SAR_LOG_LEVEL=INFO

	# Hugging Face cache location: under the user's home directory, the same
	# tree the build-time reranker download writes to.
	# NOTE(review): the earlier comment called this the only writable tree that
	# persists across Space restarts on CPU Basic, while the file header calls
	# the disk ephemeral -- confirm which holds.
	# NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE
	# in favour of HF_HOME -- check the pinned version before dropping it.
	ENV HF_HOME=/home/user/.cache/huggingface \
	TRANSFORMERS_CACHE=/home/user/.cache/huggingface/hub

	# The API listens on 7860 (the port uvicorn binds to below).
	EXPOSE 7860

	# Liveness probe against /healthz. The trailing `|| exit 1` must be a real
	# shell OR: written as `\|\|`, curl receives "||", "exit" and "1" as extra
	# URLs and the probe can never succeed. It also normalises any curl failure
	# code to 1, the status Docker reads as "unhealthy".
	HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
	CMD curl --fail --silent --show-error http://localhost:7860/healthz || exit 1

	# uvicorn with 1 worker -- on CPU Basic two workers thrash the memory.
	# Exec (JSON) form makes uvicorn the container's main process so it
	# receives stop signals directly.
	CMD ["uvicorn", "interfaces.api:app", \
	"--host", "0.0.0.0", \
	"--port", "7860", \
	"--workers", "1", \
	"--timeout-keep-alive", "30", \
	"--no-access-log"]