LeomordKaly commited on
Commit
b07536a
·
verified ·
1 Parent(s): 7f61c4a

deploy: phase 3 BYOK backend (Dockerfile.hf, FastAPI on 7860)

Browse files
config/settings.py CHANGED
@@ -173,6 +173,13 @@ class Settings(BaseSettings):
173
  # deps — so it is safe to leave on everywhere.
174
  audit_verify_enabled: bool = True
175
  audit_verify_interval_hours: int = 6
 
 
 
 
 
 
 
176
 
177
  # ── Citation Faithfulness Gate (NLI) ─────────────────────────────────────────
178
  # After synthesis, run a per-sentence NLI check: for each sentence that
 
173
  # deps — so it is safe to leave on everywhere.
174
  audit_verify_enabled: bool = True
175
  audit_verify_interval_hours: int = 6
176
+ # Optional HMAC key for the audit hash chain. When unset (default) entries
177
+ # are SHA-256 hashed — tamper-*evident* (any edit breaks the chain, but an
178
+ # attacker with file access can recompute the whole chain). When set, each
179
+ # entry hash is an HMAC-SHA256 keyed by this secret, making the chain
180
+ # tamper-*resistant* (an attacker cannot forge a valid chain without the
181
+ # key). Keep the key out of the audit host's filesystem (env/secret store).
182
+ audit_hmac_key: str | None = None
183
 
184
  # ── Citation Faithfulness Gate (NLI) ─────────────────────────────────────────
185
  # After synthesis, run a per-sentence NLI check: for each sentence that
ingestion/contextual.py CHANGED
@@ -41,6 +41,7 @@ async def _generate_one(
41
  semaphore: asyncio.Semaphore,
42
  prefer_cloud: bool,
43
  max_doc_chars: int,
 
44
  ) -> str:
45
  """Generate a single chunk's context summary.
46
 
@@ -63,7 +64,9 @@ async def _generate_one(
63
  ctx = await call_llm_async(
64
  prompt,
65
  system_prompt="You generate short retrieval context summaries.",
66
- sensitivity_level="low",
 
 
67
  prefer_cloud=prefer_cloud,
68
  )
69
  return ctx.strip()
@@ -79,6 +82,7 @@ async def generate_chunk_contexts(
79
  prefer_cloud: bool = False,
80
  max_concurrent: int = 8,
81
  max_doc_chars: int = 50_000,
 
82
  ) -> list[str]:
83
  """Generate contexts for every chunk concurrently.
84
 
@@ -96,7 +100,10 @@ async def generate_chunk_contexts(
96
  if not chunks:
97
  return []
98
  sem = asyncio.Semaphore(max_concurrent)
99
- tasks = [_generate_one(document_text, c, sem, prefer_cloud, max_doc_chars) for c in chunks]
 
 
 
100
  contexts = await asyncio.gather(*tasks, return_exceptions=False)
101
  logger.info(
102
  "contextual_retrieval_generated",
 
41
  semaphore: asyncio.Semaphore,
42
  prefer_cloud: bool,
43
  max_doc_chars: int,
44
+ sensitivity_level: str = "low",
45
  ) -> str:
46
  """Generate a single chunk's context summary.
47
 
 
64
  ctx = await call_llm_async(
65
  prompt,
66
  system_prompt="You generate short retrieval context summaries.",
67
+ # Use the document's real sensitivity so HIGH content is summarised
68
+ # on local inference and never sent to a cloud provider at ingest.
69
+ sensitivity_level=sensitivity_level,
70
  prefer_cloud=prefer_cloud,
71
  )
72
  return ctx.strip()
 
82
  prefer_cloud: bool = False,
83
  max_concurrent: int = 8,
84
  max_doc_chars: int = 50_000,
85
+ sensitivity_level: str = "low",
86
  ) -> list[str]:
87
  """Generate contexts for every chunk concurrently.
88
 
 
100
  if not chunks:
101
  return []
102
  sem = asyncio.Semaphore(max_concurrent)
103
+ tasks = [
104
+ _generate_one(document_text, c, sem, prefer_cloud, max_doc_chars, sensitivity_level)
105
+ for c in chunks
106
+ ]
107
  contexts = await asyncio.gather(*tasks, return_exceptions=False)
108
  logger.info(
109
  "contextual_retrieval_generated",
ingestion/pipeline.py CHANGED
@@ -241,6 +241,9 @@ class IngestionPipeline:
241
  full_doc,
242
  chunk_texts,
243
  prefer_cloud=False,
 
 
 
244
  )
245
  embed_inputs = merge_chunks(chunk_texts, contexts)
246
  logger.info(
 
241
  full_doc,
242
  chunk_texts,
243
  prefer_cloud=False,
244
+ # HIGH docs keep their context-summary LLM call on local
245
+ # inference — never route sensitive content to cloud at ingest.
246
+ sensitivity_level=request.sensitivity_level.value,
247
  )
248
  embed_inputs = merge_chunks(chunk_texts, contexts)
249
  logger.info(
interfaces/api.py CHANGED
@@ -809,6 +809,10 @@ if _FASTAPI_AVAILABLE:
809
  )
810
  for p in points:
811
  payload = p.payload or {}
 
 
 
 
812
  src = payload.get("source_file") or ""
813
  # Reduce absolute paths down to a basename + sha so the
814
  # visitor sees the filename they uploaded, not the
 
809
  )
810
  for p in points:
811
  payload = p.payload or {}
812
+ # Skip the purge sentinel point — it carries no source_file
813
+ # and must never appear as a phantom upload row.
814
+ if payload.get("__sentinel__"):
815
+ continue
816
  src = payload.get("source_file") or ""
817
  # Reduce absolute paths down to a basename + sha so the
818
  # visitor sees the filename they uploaded, not the
retrieval/qdrant_client.py CHANGED
@@ -191,6 +191,13 @@ class QdrantManager:
191
  field_name=field,
192
  field_schema=schema,
193
  )
 
 
 
 
 
 
 
194
  self._tenant_cache[sess_collection] = mgr
195
  logger.info(
196
  "byok_session_collection_cached",
 
191
  field_name=field,
192
  field_schema=schema,
193
  )
194
+ # Stamp the collection with a creation timestamp so the 24h purge cron
195
+ # can actually find and drop it later (Qdrant has no writable collection
196
+ # metadata slot — see retrieval/session_purge.write_session_sentinel).
197
+ with contextlib.suppress(Exception):
198
+ from retrieval.session_purge import write_session_sentinel
199
+
200
+ write_session_sentinel(mgr._client, sess_collection, settings.embedding_dim)
201
  self._tenant_cache[sess_collection] = mgr
202
  logger.info(
203
  "byok_session_collection_cached",
retrieval/session_purge.py CHANGED
@@ -23,6 +23,7 @@ See ``launch-plan/03-backend-byok.md`` § Session purge cron.
23
 
24
  from __future__ import annotations
25
 
 
26
  from datetime import UTC, datetime, timedelta
27
  from typing import TYPE_CHECKING, Any
28
 
@@ -35,6 +36,84 @@ if TYPE_CHECKING:
35
  logger = get_logger(__name__)
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  SESSION_COLLECTION_PREFIX = "_sess_"
39
  """Suffix introduced into the collection name by ``get_collection_name`` when
40
  ``byok_mode`` is on and a ``session_id`` is supplied. Used here to filter the
@@ -108,9 +187,7 @@ def purge_expired_sessions(
108
  continue
109
  inspected += 1
110
  try:
111
- info = client.get_collection(name)
112
- meta = getattr(info.config.params, "metadata", None) or {}
113
- created = _parse_created_at(meta)
114
  if created is None:
115
  # Undated -> skip; we don't delete what we can't time-stamp.
116
  skipped += 1
 
23
 
24
  from __future__ import annotations
25
 
26
+ import uuid
27
  from datetime import UTC, datetime, timedelta
28
  from typing import TYPE_CHECKING, Any
29
 
 
36
  logger = get_logger(__name__)
37
 
38
 
39
+ # Deterministic namespace so the sentinel point id is stable across processes.
40
+ _SENTINEL_NS = uuid.UUID("5a1d0000-0000-4000-8000-000000000001")
41
+
42
+
43
+ def session_sentinel_id(collection_name: str) -> str:
44
+ """Stable UUID for a session collection's creation-timestamp sentinel point.
45
+
46
+ A fixed namespace UUID5 of the collection name, so the writer (at
47
+ collection-creation time) and the purge sweep agree on the id without any
48
+ shared state.
49
+ """
50
+ return str(uuid.uuid5(_SENTINEL_NS, collection_name))
51
+
52
+
53
+ def write_session_sentinel(client: QdrantClient, collection_name: str, dim: int) -> None:
54
+ """Stamp a session collection with a creation timestamp the purge can read.
55
+
56
+ Qdrant ``CollectionInfo.config.params`` has no writable metadata slot, so the
57
+ original purge (which read ``config.params.metadata.created_at``) could never
58
+ find a timestamp and skipped every collection forever — they accumulated
59
+ until the 1 GB free tier filled. Instead we upsert one tiny sentinel point
60
+ carrying ``created_at``. It can never surface in retrieval: it has no
61
+ ``org_id`` / ``roles`` / ``sensitivity_level_int`` payload, so the RBAC
62
+ must-filter excludes it on every query.
63
+
64
+ Best-effort: a failure here only means the collection won't be auto-purged
65
+ (it still bounds itself by visitor inactivity), never a hard error.
66
+ """
67
+ from qdrant_client.http.models import PointStruct
68
+
69
+ sid = session_sentinel_id(collection_name)
70
+ try:
71
+ client.upsert(
72
+ collection_name=collection_name,
73
+ points=[
74
+ PointStruct(
75
+ id=sid,
76
+ vector=[0.0] * int(dim),
77
+ payload={
78
+ "__sentinel__": True,
79
+ "created_at": datetime.now(UTC).isoformat(),
80
+ },
81
+ )
82
+ ],
83
+ )
84
+ except Exception as exc: # pragma: no cover - defensive
85
+ logger.warning("session_sentinel_write_failed", collection=collection_name, error=str(exc))
86
+
87
+
88
+ def _read_created_at(client: QdrantClient, name: str) -> datetime | None:
89
+ """Resolve a session collection's creation time.
90
+
91
+ Production path: read the sentinel point's ``created_at`` payload. Legacy
92
+ fallback: the old ``config.params.metadata.created_at`` slot (kept so
93
+ pre-sentinel deployments + existing unit mocks still resolve).
94
+ """
95
+ # 1. Sentinel point (the real, durable path).
96
+ try:
97
+ points = client.retrieve(
98
+ collection_name=name,
99
+ ids=[session_sentinel_id(name)],
100
+ with_payload=True,
101
+ )
102
+ for p in points:
103
+ dt = _parse_created_at(getattr(p, "payload", None))
104
+ if dt is not None:
105
+ return dt
106
+ except Exception:
107
+ pass
108
+ # 2. Legacy collection-config metadata.
109
+ try:
110
+ info = client.get_collection(name)
111
+ meta = getattr(info.config.params, "metadata", None) or {}
112
+ return _parse_created_at(meta)
113
+ except Exception:
114
+ return None
115
+
116
+
117
  SESSION_COLLECTION_PREFIX = "_sess_"
118
  """Suffix introduced into the collection name by ``get_collection_name`` when
119
  ``byok_mode`` is on and a ``session_id`` is supplied. Used here to filter the
 
187
  continue
188
  inspected += 1
189
  try:
190
+ created = _read_created_at(client, name)
 
 
191
  if created is None:
192
  # Undated -> skip; we don't delete what we can't time-stamp.
193
  skipped += 1
utils/audit.py CHANGED
@@ -13,6 +13,7 @@ re-ordering of past entries breaks the chain and is detected by
13
  from __future__ import annotations
14
 
15
  import hashlib
 
16
  import json
17
  import threading
18
  from datetime import UTC, date, datetime
@@ -60,10 +61,19 @@ class AuditEntry(BaseModel):
60
  entry_hash: str = ""
61
 
62
  def compute_hash(self) -> str:
63
- """Compute the SHA-256 over canonical JSON of this entry except ``entry_hash``."""
 
 
 
 
 
 
64
  payload = self.model_dump(mode="json", exclude={"entry_hash"})
65
- canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
66
- return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
 
 
 
67
 
68
 
69
  class AuditLogger:
 
13
  from __future__ import annotations
14
 
15
  import hashlib
16
+ import hmac
17
  import json
18
  import threading
19
  from datetime import UTC, date, datetime
 
61
  entry_hash: str = ""
62
 
63
  def compute_hash(self) -> str:
64
+ """Hash the canonical JSON of this entry (excluding ``entry_hash``).
65
+
66
+ SHA-256 by default (tamper-evident). When ``settings.audit_hmac_key`` is
67
+ set the digest is HMAC-SHA256 keyed by that secret (tamper-resistant) —
68
+ ``verify_chain`` recomputes the same way, so flipping the key on a fresh
69
+ chain upgrades the integrity guarantee with no other code change.
70
+ """
71
  payload = self.model_dump(mode="json", exclude={"entry_hash"})
72
+ canonical = json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8")
73
+ key = settings.audit_hmac_key
74
+ if key:
75
+ return hmac.new(key.encode("utf-8"), canonical, hashlib.sha256).hexdigest()
76
+ return hashlib.sha256(canonical).hexdigest()
77
 
78
 
79
  class AuditLogger: