Spaces:
Running
Running
deploy: phase 3 BYOK backend (Dockerfile.hf, FastAPI on 7860)
Browse files- config/settings.py +7 -0
- ingestion/contextual.py +9 -2
- ingestion/pipeline.py +3 -0
- interfaces/api.py +4 -0
- retrieval/qdrant_client.py +7 -0
- retrieval/session_purge.py +80 -3
- utils/audit.py +13 -3
config/settings.py
CHANGED
|
@@ -173,6 +173,13 @@ class Settings(BaseSettings):
|
|
| 173 |
# deps — so it is safe to leave on everywhere.
|
| 174 |
audit_verify_enabled: bool = True
|
| 175 |
audit_verify_interval_hours: int = 6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
# ── Citation Faithfulness Gate (NLI) ─────────────────────────────────────────
|
| 178 |
# After synthesis, run a per-sentence NLI check: for each sentence that
|
|
|
|
| 173 |
# deps — so it is safe to leave on everywhere.
|
| 174 |
audit_verify_enabled: bool = True
|
| 175 |
audit_verify_interval_hours: int = 6
|
| 176 |
+
# Optional HMAC key for the audit hash chain. When unset (default) entries
|
| 177 |
+
# are SHA-256 hashed — tamper-*evident* (any edit breaks the chain, but an
|
| 178 |
+
# attacker with file access can recompute the whole chain). When set, each
|
| 179 |
+
# entry hash is an HMAC-SHA256 keyed by this secret, making the chain
|
| 180 |
+
# tamper-*resistant* (an attacker cannot forge a valid chain without the
|
| 181 |
+
# key). Keep the key out of the audit host's filesystem (env/secret store).
|
| 182 |
+
audit_hmac_key: str | None = None
|
| 183 |
|
| 184 |
# ── Citation Faithfulness Gate (NLI) ─────────────────────────────────────────
|
| 185 |
# After synthesis, run a per-sentence NLI check: for each sentence that
|
ingestion/contextual.py
CHANGED
|
@@ -41,6 +41,7 @@ async def _generate_one(
|
|
| 41 |
semaphore: asyncio.Semaphore,
|
| 42 |
prefer_cloud: bool,
|
| 43 |
max_doc_chars: int,
|
|
|
|
| 44 |
) -> str:
|
| 45 |
"""Generate a single chunk's context summary.
|
| 46 |
|
|
@@ -63,7 +64,9 @@ async def _generate_one(
|
|
| 63 |
ctx = await call_llm_async(
|
| 64 |
prompt,
|
| 65 |
system_prompt="You generate short retrieval context summaries.",
|
| 66 |
-
|
|
|
|
|
|
|
| 67 |
prefer_cloud=prefer_cloud,
|
| 68 |
)
|
| 69 |
return ctx.strip()
|
|
@@ -79,6 +82,7 @@ async def generate_chunk_contexts(
|
|
| 79 |
prefer_cloud: bool = False,
|
| 80 |
max_concurrent: int = 8,
|
| 81 |
max_doc_chars: int = 50_000,
|
|
|
|
| 82 |
) -> list[str]:
|
| 83 |
"""Generate contexts for every chunk concurrently.
|
| 84 |
|
|
@@ -96,7 +100,10 @@ async def generate_chunk_contexts(
|
|
| 96 |
if not chunks:
|
| 97 |
return []
|
| 98 |
sem = asyncio.Semaphore(max_concurrent)
|
| 99 |
-
tasks = [
|
|
|
|
|
|
|
|
|
|
| 100 |
contexts = await asyncio.gather(*tasks, return_exceptions=False)
|
| 101 |
logger.info(
|
| 102 |
"contextual_retrieval_generated",
|
|
|
|
| 41 |
semaphore: asyncio.Semaphore,
|
| 42 |
prefer_cloud: bool,
|
| 43 |
max_doc_chars: int,
|
| 44 |
+
sensitivity_level: str = "low",
|
| 45 |
) -> str:
|
| 46 |
"""Generate a single chunk's context summary.
|
| 47 |
|
|
|
|
| 64 |
ctx = await call_llm_async(
|
| 65 |
prompt,
|
| 66 |
system_prompt="You generate short retrieval context summaries.",
|
| 67 |
+
# Use the document's real sensitivity so HIGH content is summarised
|
| 68 |
+
# on local inference and never sent to a cloud provider at ingest.
|
| 69 |
+
sensitivity_level=sensitivity_level,
|
| 70 |
prefer_cloud=prefer_cloud,
|
| 71 |
)
|
| 72 |
return ctx.strip()
|
|
|
|
| 82 |
prefer_cloud: bool = False,
|
| 83 |
max_concurrent: int = 8,
|
| 84 |
max_doc_chars: int = 50_000,
|
| 85 |
+
sensitivity_level: str = "low",
|
| 86 |
) -> list[str]:
|
| 87 |
"""Generate contexts for every chunk concurrently.
|
| 88 |
|
|
|
|
| 100 |
if not chunks:
|
| 101 |
return []
|
| 102 |
sem = asyncio.Semaphore(max_concurrent)
|
| 103 |
+
tasks = [
|
| 104 |
+
_generate_one(document_text, c, sem, prefer_cloud, max_doc_chars, sensitivity_level)
|
| 105 |
+
for c in chunks
|
| 106 |
+
]
|
| 107 |
contexts = await asyncio.gather(*tasks, return_exceptions=False)
|
| 108 |
logger.info(
|
| 109 |
"contextual_retrieval_generated",
|
ingestion/pipeline.py
CHANGED
|
@@ -241,6 +241,9 @@ class IngestionPipeline:
|
|
| 241 |
full_doc,
|
| 242 |
chunk_texts,
|
| 243 |
prefer_cloud=False,
|
|
|
|
|
|
|
|
|
|
| 244 |
)
|
| 245 |
embed_inputs = merge_chunks(chunk_texts, contexts)
|
| 246 |
logger.info(
|
|
|
|
| 241 |
full_doc,
|
| 242 |
chunk_texts,
|
| 243 |
prefer_cloud=False,
|
| 244 |
+
# HIGH docs keep their context-summary LLM call on local
|
| 245 |
+
# inference — never route sensitive content to cloud at ingest.
|
| 246 |
+
sensitivity_level=request.sensitivity_level.value,
|
| 247 |
)
|
| 248 |
embed_inputs = merge_chunks(chunk_texts, contexts)
|
| 249 |
logger.info(
|
interfaces/api.py
CHANGED
|
@@ -809,6 +809,10 @@ if _FASTAPI_AVAILABLE:
|
|
| 809 |
)
|
| 810 |
for p in points:
|
| 811 |
payload = p.payload or {}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
src = payload.get("source_file") or ""
|
| 813 |
# Reduce absolute paths down to a basename + sha so the
|
| 814 |
# visitor sees the filename they uploaded, not the
|
|
|
|
| 809 |
)
|
| 810 |
for p in points:
|
| 811 |
payload = p.payload or {}
|
| 812 |
+
# Skip the purge sentinel point — it carries no source_file
|
| 813 |
+
# and must never appear as a phantom upload row.
|
| 814 |
+
if payload.get("__sentinel__"):
|
| 815 |
+
continue
|
| 816 |
src = payload.get("source_file") or ""
|
| 817 |
# Reduce absolute paths down to a basename + sha so the
|
| 818 |
# visitor sees the filename they uploaded, not the
|
retrieval/qdrant_client.py
CHANGED
|
@@ -191,6 +191,13 @@ class QdrantManager:
|
|
| 191 |
field_name=field,
|
| 192 |
field_schema=schema,
|
| 193 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
self._tenant_cache[sess_collection] = mgr
|
| 195 |
logger.info(
|
| 196 |
"byok_session_collection_cached",
|
|
|
|
| 191 |
field_name=field,
|
| 192 |
field_schema=schema,
|
| 193 |
)
|
| 194 |
+
# Stamp the collection with a creation timestamp so the 24h purge cron
|
| 195 |
+
# can actually find and drop it later (Qdrant has no writable collection
|
| 196 |
+
# metadata slot — see retrieval/session_purge.write_session_sentinel).
|
| 197 |
+
with contextlib.suppress(Exception):
|
| 198 |
+
from retrieval.session_purge import write_session_sentinel
|
| 199 |
+
|
| 200 |
+
write_session_sentinel(mgr._client, sess_collection, settings.embedding_dim)
|
| 201 |
self._tenant_cache[sess_collection] = mgr
|
| 202 |
logger.info(
|
| 203 |
"byok_session_collection_cached",
|
retrieval/session_purge.py
CHANGED
|
@@ -23,6 +23,7 @@ See ``launch-plan/03-backend-byok.md`` § Session purge cron.
|
|
| 23 |
|
| 24 |
from __future__ import annotations
|
| 25 |
|
|
|
|
| 26 |
from datetime import UTC, datetime, timedelta
|
| 27 |
from typing import TYPE_CHECKING, Any
|
| 28 |
|
|
@@ -35,6 +36,84 @@ if TYPE_CHECKING:
|
|
| 35 |
logger = get_logger(__name__)
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
SESSION_COLLECTION_PREFIX = "_sess_"
|
| 39 |
"""Suffix introduced into the collection name by ``get_collection_name`` when
|
| 40 |
``byok_mode`` is on and a ``session_id`` is supplied. Used here to filter the
|
|
@@ -108,9 +187,7 @@ def purge_expired_sessions(
|
|
| 108 |
continue
|
| 109 |
inspected += 1
|
| 110 |
try:
|
| 111 |
-
|
| 112 |
-
meta = getattr(info.config.params, "metadata", None) or {}
|
| 113 |
-
created = _parse_created_at(meta)
|
| 114 |
if created is None:
|
| 115 |
# Undated -> skip; we don't delete what we can't time-stamp.
|
| 116 |
skipped += 1
|
|
|
|
| 23 |
|
| 24 |
from __future__ import annotations
|
| 25 |
|
| 26 |
+
import uuid
|
| 27 |
from datetime import UTC, datetime, timedelta
|
| 28 |
from typing import TYPE_CHECKING, Any
|
| 29 |
|
|
|
|
| 36 |
logger = get_logger(__name__)
|
| 37 |
|
| 38 |
|
| 39 |
+
# Deterministic namespace so the sentinel point id is stable across processes.
|
| 40 |
+
_SENTINEL_NS = uuid.UUID("5a1d0000-0000-4000-8000-000000000001")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def session_sentinel_id(collection_name: str) -> str:
|
| 44 |
+
"""Stable UUID for a session collection's creation-timestamp sentinel point.
|
| 45 |
+
|
| 46 |
+
A fixed namespace UUID5 of the collection name, so the writer (at
|
| 47 |
+
collection-creation time) and the purge sweep agree on the id without any
|
| 48 |
+
shared state.
|
| 49 |
+
"""
|
| 50 |
+
return str(uuid.uuid5(_SENTINEL_NS, collection_name))
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def write_session_sentinel(client: QdrantClient, collection_name: str, dim: int) -> None:
|
| 54 |
+
"""Stamp a session collection with a creation timestamp the purge can read.
|
| 55 |
+
|
| 56 |
+
Qdrant ``CollectionInfo.config.params`` has no writable metadata slot, so the
|
| 57 |
+
original purge (which read ``config.params.metadata.created_at``) could never
|
| 58 |
+
find a timestamp and skipped every collection forever — they accumulated
|
| 59 |
+
until the 1 GB free tier filled. Instead we upsert one tiny sentinel point
|
| 60 |
+
carrying ``created_at``. It can never surface in retrieval: it has no
|
| 61 |
+
``org_id`` / ``roles`` / ``sensitivity_level_int`` payload, so the RBAC
|
| 62 |
+
must-filter excludes it on every query.
|
| 63 |
+
|
| 64 |
+
Best-effort: a failure here only means the collection won't be auto-purged
|
| 65 |
+
(it still bounds itself by visitor inactivity), never a hard error.
|
| 66 |
+
"""
|
| 67 |
+
from qdrant_client.http.models import PointStruct
|
| 68 |
+
|
| 69 |
+
sid = session_sentinel_id(collection_name)
|
| 70 |
+
try:
|
| 71 |
+
client.upsert(
|
| 72 |
+
collection_name=collection_name,
|
| 73 |
+
points=[
|
| 74 |
+
PointStruct(
|
| 75 |
+
id=sid,
|
| 76 |
+
vector=[0.0] * int(dim),
|
| 77 |
+
payload={
|
| 78 |
+
"__sentinel__": True,
|
| 79 |
+
"created_at": datetime.now(UTC).isoformat(),
|
| 80 |
+
},
|
| 81 |
+
)
|
| 82 |
+
],
|
| 83 |
+
)
|
| 84 |
+
except Exception as exc: # pragma: no cover - defensive
|
| 85 |
+
logger.warning("session_sentinel_write_failed", collection=collection_name, error=str(exc))
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _read_created_at(client: QdrantClient, name: str) -> datetime | None:
|
| 89 |
+
"""Resolve a session collection's creation time.
|
| 90 |
+
|
| 91 |
+
Production path: read the sentinel point's ``created_at`` payload. Legacy
|
| 92 |
+
fallback: the old ``config.params.metadata.created_at`` slot (kept so
|
| 93 |
+
pre-sentinel deployments + existing unit mocks still resolve).
|
| 94 |
+
"""
|
| 95 |
+
# 1. Sentinel point (the real, durable path).
|
| 96 |
+
try:
|
| 97 |
+
points = client.retrieve(
|
| 98 |
+
collection_name=name,
|
| 99 |
+
ids=[session_sentinel_id(name)],
|
| 100 |
+
with_payload=True,
|
| 101 |
+
)
|
| 102 |
+
for p in points:
|
| 103 |
+
dt = _parse_created_at(getattr(p, "payload", None))
|
| 104 |
+
if dt is not None:
|
| 105 |
+
return dt
|
| 106 |
+
except Exception:
|
| 107 |
+
pass
|
| 108 |
+
# 2. Legacy collection-config metadata.
|
| 109 |
+
try:
|
| 110 |
+
info = client.get_collection(name)
|
| 111 |
+
meta = getattr(info.config.params, "metadata", None) or {}
|
| 112 |
+
return _parse_created_at(meta)
|
| 113 |
+
except Exception:
|
| 114 |
+
return None
|
| 115 |
+
|
| 116 |
+
|
| 117 |
SESSION_COLLECTION_PREFIX = "_sess_"
|
| 118 |
"""Suffix introduced into the collection name by ``get_collection_name`` when
|
| 119 |
``byok_mode`` is on and a ``session_id`` is supplied. Used here to filter the
|
|
|
|
| 187 |
continue
|
| 188 |
inspected += 1
|
| 189 |
try:
|
| 190 |
+
created = _read_created_at(client, name)
|
|
|
|
|
|
|
| 191 |
if created is None:
|
| 192 |
# Undated -> skip; we don't delete what we can't time-stamp.
|
| 193 |
skipped += 1
|
utils/audit.py
CHANGED
|
@@ -13,6 +13,7 @@ re-ordering of past entries breaks the chain and is detected by
|
|
| 13 |
from __future__ import annotations
|
| 14 |
|
| 15 |
import hashlib
|
|
|
|
| 16 |
import json
|
| 17 |
import threading
|
| 18 |
from datetime import UTC, date, datetime
|
|
@@ -60,10 +61,19 @@ class AuditEntry(BaseModel):
|
|
| 60 |
entry_hash: str = ""
|
| 61 |
|
| 62 |
def compute_hash(self) -> str:
|
| 63 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
payload = self.model_dump(mode="json", exclude={"entry_hash"})
|
| 65 |
-
canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
class AuditLogger:
|
|
|
|
| 13 |
from __future__ import annotations
|
| 14 |
|
| 15 |
import hashlib
|
| 16 |
+
import hmac
|
| 17 |
import json
|
| 18 |
import threading
|
| 19 |
from datetime import UTC, date, datetime
|
|
|
|
| 61 |
entry_hash: str = ""
|
| 62 |
|
| 63 |
def compute_hash(self) -> str:
|
| 64 |
+
"""Hash the canonical JSON of this entry (excluding ``entry_hash``).
|
| 65 |
+
|
| 66 |
+
SHA-256 by default (tamper-evident). When ``settings.audit_hmac_key`` is
|
| 67 |
+
set the digest is HMAC-SHA256 keyed by that secret (tamper-resistant) —
|
| 68 |
+
``verify_chain`` recomputes the same way, so flipping the key on a fresh
|
| 69 |
+
chain upgrades the integrity guarantee with no other code change.
|
| 70 |
+
"""
|
| 71 |
payload = self.model_dump(mode="json", exclude={"entry_hash"})
|
| 72 |
+
canonical = json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8")
|
| 73 |
+
key = settings.audit_hmac_key
|
| 74 |
+
if key:
|
| 75 |
+
return hmac.new(key.encode("utf-8"), canonical, hashlib.sha256).hexdigest()
|
| 76 |
+
return hashlib.sha256(canonical).hexdigest()
|
| 77 |
|
| 78 |
|
| 79 |
class AuditLogger:
|