File size: 11,736 Bytes
5b5c717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2a2d08
 
 
 
 
5b5c717
 
b2a2d08
5b5c717
 
 
 
 
 
 
 
 
 
6d06a95
 
5b5c717
 
6d06a95
5b5c717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
925162d
 
 
 
 
 
 
7f61c4a
 
 
 
 
 
48f9ef6
15478c1
5b5c717
 
 
 
 
 
 
 
 
 
 
 
 
 
4efdd9b
 
 
 
 
 
 
 
5b5c717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be8b74b
 
 
 
 
e8b462a
 
 
 
 
 
 
 
 
 
 
 
996c682
 
 
 
 
 
 
 
 
 
 
48f9ef6
 
 
 
 
9327692
 
 
 
 
 
6eab718
 
 
 
 
 
 
 
5b5c717
 
 
dae6420
 
 
 
 
 
15478c1
 
 
 
 
 
 
 
 
994f8e8
 
 
 
 
 
 
 
 
5b5c717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# =============================================================================
# Dockerfile.hf — SecureAgentRAG backend for Hugging Face Spaces (CPU Basic).
# =============================================================================
# Two-stage build keeps the runtime image lean. The HF Space free tier is
# CPU-only with 16 GB RAM and ~50 GB ephemeral disk, so we target a tight
# memory footprint:
#
#   - Python 3.11-slim base (~150 MB)
#   - Only [api, embeddings-local] extras (no PII/Presidio, no OCR, no Phoenix,
#     no Postgres, no Redis, no MCP) -- those modules are present in the source
#     but their dependencies are not installed
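#   - cross-encoder reranker is disabled by default (SAR_RERANKER_TYPE=none).
#     When re-enabled, its weights are cached under
#     /home/user/.cache/huggingface -- prefetched best-effort at build time,
#     otherwise downloaded on first request. Skips the 2.3 GB fine-tuned
#     checkpoint for the initial deploy; phase 3.2 can swap to fine_tuned once
#     the reranker repo is published on HF Hub.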
#
# The Space-side README.md is uploaded separately by scripts/deploy_hf_space.py
# with a YAML frontmatter declaring sdk=docker + app_port=7860.
# =============================================================================

# --- builder ----------------------------------------------------------------
# Installs the Python dependencies into /app/.venv; only that virtualenv is
# copied into the runtime stage.
FROM python:3.11-slim AS builder

WORKDIR /app

RUN pip install --no-cache-dir uv

# Project metadata needed for the editable install: pyproject.toml declares
# the package and README.md is referenced as its long_description. The real
# source tree is not copied into this stage -- placeholder packages are
# created in the next step instead.
COPY pyproject.toml README.md ./

# hatchling treats these directories as the wheel root and scans them for
# __init__.py files, so an empty skeleton is all that is needed at build
# time. The actual code lands in the runtime stage.
RUN set -e; \
    for pkg in config core inference retrieval interfaces ingestion utils evaluation app; do \
        mkdir -p "$pkg"; \
        touch "$pkg/__init__.py"; \
    done

# The [pii] extra is deliberately left out: the regex patterns in
# utils/pii.py already cover every BYOK key shape (Groq / OpenAI / Anthropic /
# HF / Vercel / Qdrant JWT / Qdrant management). Presidio would pull in spaCy
# en_core_web_lg (~770 MB), which auto-downloads at runtime and crashes the
# container on the CPU Basic Space when the package installer is absent.
RUN uv venv /app/.venv \
    && uv pip install \
        --python /app/.venv/bin/python \
        --editable ".[api,embeddings-local]"

# --- runtime ----------------------------------------------------------------
FROM python:3.11-slim AS runtime

WORKDIR /app

# HF Spaces convention: the container runs as uid 1000 with a writable
# /home/user.
RUN useradd --create-home --uid 1000 user

# Shared libraries for PDF / image processing only -- no OCR / paddle. curl is
# what the HEALTHCHECK probe invokes.
# Debian 13 (trixie) renamed libgl1-mesa-glx -> libgl1, and only the runtime
# library libxrender1 is needed here (not libxrender-dev).
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender1 \
    && rm -rf /var/lib/apt/lists/*

# Reuse the virtualenv produced by the builder stage and put it first on PATH.
COPY --from=builder /app/.venv /app/.venv
ENV PATH="/app/.venv/bin:$PATH"

# Application source, owned by the runtime user. Anything matched by
# .dockerignore never reaches the image.
COPY --chown=user:user . /app

USER user

# Pre-populate the HF cache so the cross-encoder lives on disk before the
# first request. Best-effort: this step never fails the build -- if HF Hub is
# unreachable during build (offline mirrors etc.) the model is fetched on
# first use instead.
#
# The Python payload must stay a single line of simple statements. Docker
# joins backslash-continued lines before the shell sees them, and Python
# rejects a compound statement (try/except) after ';'. The earlier multi-line
# try/except form therefore always died with a SyntaxError that the
# `|| echo` fallback silently swallowed, so nothing was ever cached. Download
# errors are handled by the shell fallback alone.
#
# NOTE(review): SAR_RERANKER_TYPE=none is set later in this file, so these
# weights are only used once the reranker is re-enabled -- confirm the extra
# image size and build time are wanted.
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='BAAI/bge-reranker-v2-m3', cache_dir='/home/user/.cache/huggingface/hub'); print('reranker cached')" \
    || echo "build-time reranker download failed -- will lazy-load on first request"

# --- BYOK production env ---------------------------------------------------
# Only mode flags and safe defaults are baked into the image. Real secrets
# (Qdrant URL + API key, Groq key) use the same SAR_* env-var protocol but are
# injected through the HF Space secrets panel, never stored in a layer.
#
# SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR
#   Per-IP owner-key quota per hour. The cap protects the daily Groq
#   14 400-request budget against any single abuser. Raised from 3 to 10
#   because 3 was too tight (visitors were blocked after their third query
#   from the same IP). With ~100 distinct visitor IPs per day the daily Groq
#   cap is still defended; visitors who exceed the quota are nudged by the UI
#   429 banner to paste their own BYOK key.
#
# SAR_BYOK_XFF_TRUSTED_HOPS
#   HF Spaces fronts the container with exactly one trusted reverse proxy,
#   which *appends* the peer it saw to X-Forwarded-For. Reading the IP one hop
#   from the right is spoof-resistant, unlike the attacker-appendable leftmost
#   token, so a visitor cannot mint a fresh owner-key bucket per request by
#   forging XFF. See interfaces/byok.py::client_ip_from_request.
ENV SAR_BYOK_MODE=true \
    SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR=10 \
    SAR_BYOK_XFF_TRUSTED_HOPS=1 \
    SAR_SESSION_COLLECTION_TTL_HOURS=24 \
    SAR_CORS_ALLOW_ORIGINS='["https://secureagentrag-web.vercel.app","https://secureagentrag.vercel.app"]'

# Cloud LLM defaults. Groq llama-3.1-8b-instant is the cheapest fast option on
# the free tier; a visitor's BYOK key overrides this per request.
ENV SAR_DEFAULT_PROVIDER=groq \
    SAR_CLOUD_PROVIDER=groq \
    SAR_LLM_MODEL=llama-3.1-8b-instant

# Embedding stack: BGE-M3 run locally on CPU through sentence-transformers,
# which avoids Ollama entirely.
ENV SAR_EMBEDDING_BACKEND=local \
    SAR_LOCAL_EMBEDDING_MODEL=BAAI/bge-m3 \
    SAR_EMBEDDING_MODEL=bge-m3 \
    SAR_EMBEDDING_DIM=1024

# Reranker: switched off for the BYOK demo. The cross-encoder adds a ~600 MB
# model plus 4-5 s of cold-load latency, and on a 10-doc corpus with 1-5
# visitor uploads its top-5 cut routinely drops the visitor's own chunk
# before it reaches the grader. Plain dense+sparse RRF ordering is good enough
# on the demo corpus; the bench shows the reranker only helps materially past
# 200+ documents per query. For production deploys re-enable it with
# SAR_RERANKER_TYPE=cross_encoder or fine_tuned.
#
# Sparse retrieval: BM25 keeps the cold path zero-dep. SPLADE would add an
# extra ~600 MB model and is skipped on free CPU Basic.
ENV SAR_RERANKER_TYPE=none \
    SAR_RERANKER_CHECKPOINT=BAAI/bge-reranker-v2-m3 \
    SAR_SPARSE_BACKEND=bm25

# Persistence paths: all runtime state is written under /tmp/secureagentrag.
# NOTE(review): this was originally justified as "/tmp is the only writable
# area on HF Spaces", yet the image also gives uid 1000 a writable /home/user
# -- confirm which statement holds.
#
# SAR_MULTI_TENANT_COLLECTIONS routes each BYOK session to its own collection,
# documents_sess_<sid>.
ENV SAR_AUDIT_LOG_DIR=/tmp/secureagentrag/audit_logs \
    SAR_CONVERSATION_DIR=/tmp/secureagentrag/conversations \
    SAR_CHECKPOINT_DB_PATH=/tmp/secureagentrag/checkpoints.sqlite \
    SAR_BM25_INDEX_PATH=/tmp/secureagentrag/bm25_index.pkl \
    SAR_MULTI_TENANT_COLLECTIONS=true

# --- Retrieval / grading budget ---------------------------------------------
# SAR_REQUEST_TIMEOUT_S
#   BYOK uploads can push the candidate set up to 20 chunks (top_k=10 from the
#   base collection + top_k=10 from the session), and the grader scores them
#   one Groq call at a time. 180 s leaves headroom for the reranker cold load
#   and Groq free-tier rate-limit waits on a fresh boot.
# SAR_RELEVANCE_THRESHOLD / SAR_RELEVANCE_RETRY_THRESHOLD
#   Tuned looser for the BYOK demo so short user-uploaded docs are not
#   aggressively filtered out by the LLM judge. The default 0.7 / 0.5 produced
#   too many "all docs irrelevant" refusals on this small corpus; 0.55 / 0.3
#   keeps the corrective-RAG retry loop active without refusing on edge-case
#   wording.
# SAR_MAX_RETRIES
#   Caps corrective-RAG retries; further rewrites stack Groq calls without
#   meaningfully improving recall on a 10-doc corpus and just chew through the
#   SLO. NOTE(review): the earlier comment said "two refines is enough" while
#   the value is 1 -- confirm which is intended.
# SAR_RAG_FUSION_ENABLED
#   RAG fusion costs 1 extra Groq call per chat (to generate N query
#   reformulations) plus N parallel Qdrant searches. That is useless on a
#   10-doc demo corpus where the original query already retrieves the right
#   chunks, so it is off to cut the Groq call count.
ENV SAR_REQUEST_TIMEOUT_S=180 \
    SAR_RELEVANCE_THRESHOLD=0.55 \
    SAR_RELEVANCE_RETRY_THRESHOLD=0.3 \
    SAR_MAX_RETRIES=1 \
    SAR_RAG_FUSION_ENABLED=false

# --- Generation budget ------------------------------------------------------
# SAR_GROQ_MODEL
#   Pinned to llama-3.1-8b-instant for the demo. The default 70b-versatile
#   model hits the 30 RPM cap faster (heavier generation = slower throughput),
#   while the 8b model finishes in ~1 s on prompts under 4k tokens with
#   comparable answer quality on this small corpus.
# SAR_SYNTH_MAX_TOKENS
#   Caps synth completion tokens to ease the Groq free-tier TPM limit (6,000
#   tokens/min). A long answer plus a 10-chunk prompt could otherwise approach
#   the per-minute ceiling in a single chat and 429 mid-stream. 1024 is ample
#   for the demo corpus; the streaming client also retries a transient 429.
# SAR_RERANK_TOP_K
#   With the reranker disabled and the grader bypassed, the synth context cap
#   (rerank_top_k) doubles as the doc budget for the synth prompt. Raised from
#   5 to 10 so every retrieved chunk reaches the LLM; llama-3.1-8b has a 131k
#   context, so 10 chunks x 600 chars is trivial. A bigger context is the
#   easiest quality lever now that the reranker is off.
ENV SAR_GROQ_MODEL=llama-3.1-8b-instant \
    SAR_SYNTH_MAX_TOKENS=1024 \
    SAR_RERANK_TOP_K=10

# --- Faithfulness gate ------------------------------------------------------
# Off on the public BYOK demo: the gate makes one Groq call per cited sentence
# (typically 5-10 extra calls per answer). On the free-tier 30 RPM budget a
# single chatty answer can exhaust the bucket, and the next query then 429s
# with an empty completion. The synthesizer's own citation discipline
# (mandatory inline [N] markers + sources-only prompt) is strong enough for
# the demo. Re-enable for production deploys with a paid Groq tier or local
# Ollama.
ENV SAR_FAITHFULNESS_GATE_ENABLED=false \
    SAR_FAITHFULNESS_GATE_MODE=flag \
    SAR_FAITHFULNESS_THRESHOLD=0.7

# SAR_SECURITY_SEMANTIC_CHECK_ENABLED
#   The security node's LLM-based semantic injection check false-positives on
#   non-English (Arabic) queries and blocks retrieval, so it is off for the
#   demo. The guardrails node (regex) and the security node's regex jailbreak
#   patterns still run, so injection defence stays while multilingual
#   questions work.
# SAR_ALLOW_CLOUD_FOR_HIGH
#   Cloud LLM is the only inference path on the HF Space (no Ollama), so
#   HIGH-sensitivity content is unlocked for cloud synthesis. The frontend
#   warns the visitor with a "sensitive: routed to cloud" badge.
# SAR_BYOK_AUDIT_MAX_ENTRIES
#   Caps the public /byok/audit response so the panel stays usable but never
#   floods the page on a long-running Space.
ENV SAR_SECURITY_SEMANTIC_CHECK_ENABLED=false \
    SAR_ALLOW_CLOUD_FOR_HIGH=true \
    SAR_BYOK_AUDIT_MAX_ENTRIES=50

# UTF-8 everywhere. The HF Spaces base image can default to a C/POSIX (ASCII)
# locale, which mangles non-ASCII request text: Arabic queries arrived as
# "????" and embedded to garbage, so retrieval never matched the Arabic
# corpus. PYTHONUTF8=1 together with a UTF-8 locale makes Python handle
# Arabic/RTL text correctly.
ENV PYTHONUTF8=1 \
    PYTHONIOENCODING=utf-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Logging verbosity.
ENV SAR_LOG_LEVEL=INFO

# Hugging Face cache location, under the runtime user's home directory.
# NOTE(review): this was originally described as "the only persistent
# writable tree across Space restarts on CPU Basic" -- confirm against the
# Space's actual storage behaviour.
ENV HF_HOME=/home/user/.cache/huggingface \
    TRANSFORMERS_CACHE=/home/user/.cache/huggingface/hub

# Port the API listens on inside the container.
EXPOSE 7860

# Probe the /healthz endpoint with curl; a non-2xx response or connection
# error marks the check as failed.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -fsS http://localhost:7860/healthz || exit 1

# A single uvicorn worker: on CPU Basic two workers thrash the memory.
CMD ["uvicorn", "interfaces.api:app", \
     "--host", "0.0.0.0", "--port", "7860", \
     "--workers", "1", "--timeout-keep-alive", "30", "--no-access-log"]