Spaces:

LeomordKaly
/

secureagentrag-api

Running

App Files Files Community

LeomordKaly committed on 9 days ago

Commit

a382192

verified ·

1 Parent(s): c385f4b

deploy: phase 3 BYOK backend (Dockerfile.hf, FastAPI on 7860)

Browse files

Files changed (6) hide show

Dockerfile.hf +6 -0
inference/byok_context.py +74 -0
inference/router.py +73 -16
interfaces/api.py +51 -28
interfaces/byok.py +21 -2
utils/query_cache.py +289 -277

Dockerfile.hf CHANGED Viewed

@@ -94,6 +94,12 @@ ENV SAR_BYOK_MODE=true
 # would still be defended; visitors who exceed the cap are nudged to
 # paste their own BYOK key via the UI 429 banner.
 ENV SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR=10
 ENV SAR_SESSION_COLLECTION_TTL_HOURS=24
 ENV SAR_CORS_ALLOW_ORIGINS='["https://secureagentrag-web.vercel.app","https://secureagentrag.vercel.app"]'

 # would still be defended; visitors who exceed the cap are nudged to
 # paste their own BYOK key via the UI 429 banner.
 ENV SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR=10
+# HF Spaces fronts the container with exactly one trusted reverse proxy that
+# *appends* the peer it saw to X-Forwarded-For. Tell the throttle to read the
+# IP one hop from the right (spoof-resistant) instead of the attacker-appendable
+# leftmost token, so a visitor can't mint a fresh owner-key bucket per request
+# by forging XFF. See interfaces/byok.py::client_ip_from_request.
+ENV SAR_BYOK_XFF_TRUSTED_HOPS=1
 ENV SAR_SESSION_COLLECTION_TTL_HOURS=24
 ENV SAR_CORS_ALLOW_ORIGINS='["https://secureagentrag-web.vercel.app","https://secureagentrag.vercel.app"]'

inference/byok_context.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""Per-request BYOK credentials carried through the pipeline via a ContextVar.
+The graph nodes (router → … → synthesizer) do not thread credentials through
+their signatures — they call ``call_llm_*`` which builds an ``InferenceRouter``
+with no per-request key. To make a visitor's *own* LLM key actually power their
+request (the whole point of "Bring Your Own Key"), we stash the credentials in a
+``contextvars.ContextVar`` at the top of ``run_rag_pipeline[_stream]``.
+``ContextVar`` propagates across ``asyncio`` task boundaries (``gather``,
+``astream``), so every node — and every parallel LLM call inside a node — sees
+the same per-request creds without any signature plumbing. The token is reset in
+a ``finally`` so the value never leaks between requests on a reused worker.
+When no BYOK key is present the ContextVar holds ``None`` and the router falls
+back to the owner's cached clients exactly as before.
+"""
+from __future__ import annotations
+import contextvars
+from dataclasses import dataclass
+# Providers whose BYOK client is built from a bearer/API key.
+_KEY_PROVIDERS = frozenset({"groq", "openai", "anthropic"})
+@dataclass(frozen=True)
+class ByokRuntime:
+    """Per-request BYOK credentials resolved from the visitor's request headers.
+    Instances are immutable (``frozen=True``). ``repr()`` masks ``user_key`` so
+    the visitor's secret is not written out when the object is logged or shows
+    up in a traceback.
+    Attributes:
+        provider: Visitor's chosen provider ("groq" / "openai" / "anthropic" /
+            "ollama"), already allow-list validated. None = no BYOK.
+        user_key: Visitor's API key for a key-based provider. None for Ollama.
+        ollama_url: Visitor's Ollama instance URL (only for provider="ollama").
+    """
+    provider: str | None = None
+    user_key: str | None = None
+    ollama_url: str | None = None
+    def __repr__(self) -> str:
+        """Return a debug representation that never contains the raw API key."""
+        # A __repr__ defined in the class body takes precedence over the one
+        # @dataclass would generate, which prints every field verbatim.
+        # None / "" are passed through unchanged so "no key" stays visible.
+        masked_key = "***" if self.user_key else self.user_key
+        return (
+            f"{type(self).__name__}(provider={self.provider!r}, "
+            f"user_key={masked_key!r}, ollama_url={self.ollama_url!r})"
+        )
+    def is_active(self) -> bool:
+        """True when these creds can actually drive a per-request LLM client.
+        A key-based provider needs a non-blank key; Ollama needs a non-blank
+        URL. Anything else (missing provider, key without a provider, ollama
+        without a URL) is *not* active — the router falls back to the owner's
+        clients.
+        """
+        # Provider comparison is case-insensitive; None is treated as "".
+        prov = (self.provider or "").lower()
+        if prov in _KEY_PROVIDERS:
+            return bool(self.user_key and self.user_key.strip())
+        if prov == "ollama":
+            return bool(self.ollama_url and self.ollama_url.strip())
+        return False
+_byok_ctx: contextvars.ContextVar[ByokRuntime | None] = contextvars.ContextVar(
+    "byok_runtime", default=None
+)
+def set_byok_runtime(runtime: ByokRuntime | None) -> contextvars.Token:
+    """Bind ``runtime`` as the BYOK creds of the current context.
+    Returns:
+        The token produced by ``ContextVar.set``; hand it back to
+        ``reset_byok_runtime`` to restore the previous value.
+    """
+    reset_token = _byok_ctx.set(runtime)
+    return reset_token
+def get_byok_runtime() -> ByokRuntime | None:
+    """Look up the BYOK creds bound to the current context.
+    Returns:
+        The bound ``ByokRuntime``, or None (the ContextVar default) when no
+        creds have been bound.
+    """
+    current = _byok_ctx.get()
+    return current
+def reset_byok_runtime(token: contextvars.Token) -> None:
+    """Restore the previous ContextVar value (call in a ``finally``).
+    Args:
+        token: The token returned by ``set_byok_runtime``.
+    If the token cannot be applied in the current Context, the variable is
+    cleared to None instead, so the cleanup path never raises and never leaves
+    BYOK creds bound.
+    """
+    try:
+        _byok_ctx.reset(token)
+    except ValueError:
+        # ContextVar.reset raises ValueError for a token created in a different
+        # Context. That can happen when the ``finally`` of an async generator
+        # runs in another task than the one that called set (e.g. the
+        # generator is finalised after the client went away). Clearing the
+        # value in *this* context keeps the "never leaks" guarantee.
+        _byok_ctx.set(None)

inference/router.py CHANGED Viewed

@@ -2,11 +2,13 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
 from pydantic import BaseModel
 from config.settings import settings
 from inference.llm_factory import LLMResponse, get_llm
 from ingestion.metadata import SensitivityLevel
 from utils.logging import get_logger
@@ -84,15 +86,25 @@ class InferenceRouter:
         if isinstance(sensitivity_level, str):
             sensitivity_level = SensitivityLevel(sensitivity_level.lower())
-        # 1. Admin override — honoured for LOW/MEDIUM, but NEVER allowed to move
-        # HIGH-sensitivity work off local inference on a self-hosted deploy.
-        # Without this guard the override short-circuits the HIGH→local branch
-        # below (order-of-checks footgun). The override is still respected when
-        # the deploy explicitly opts into cloud-for-HIGH (the GPU-less public
-        # demo, SAR_ALLOW_CLOUD_FOR_HIGH=true) or when it targets local Ollama.
-        # NOTE: override_provider is currently not wired into the pipeline path
-        # (call_llm_* / synthesizer call route() without it); this guard is
-        # defence-in-depth so the privacy guarantee holds even if it ever is.
         if override_provider:
             high_must_stay_local = (
                 sensitivity_level == SensitivityLevel.HIGH
@@ -222,8 +234,8 @@ class InferenceRouter:
         import time
         start = time.perf_counter()
         try:
-            client = get_llm(provider=decision.provider, model=decision.model)
             response = await client.generate(prompt=prompt, system_prompt=system_prompt, **kwargs)
             elapsed_ms = (time.perf_counter() - start) * 1000
             response.latency_ms = elapsed_ms
@@ -262,6 +274,13 @@ class InferenceRouter:
             )
             response.latency_ms = (time.perf_counter() - start) * 1000
             return response, fallback_decision
     @staticmethod
     def _normalised_sensitivity(level: SensitivityLevel | str) -> SensitivityLevel:
@@ -300,7 +319,7 @@ class InferenceRouter:
             forced_local=decision.forced_local,
         )
-        client = get_llm(provider=decision.provider, model=decision.model)
         try:
             import time
@@ -310,8 +329,11 @@ class InferenceRouter:
             response.latency_ms = elapsed_ms
             return response, decision
         finally:
-            # Clients are cached — do NOT close per-request
-            pass
     async def generate_stream_with_routing(
         self,
@@ -346,7 +368,7 @@ class InferenceRouter:
             forced_local=decision.forced_local,
         )
-        client = get_llm(provider=decision.provider, model=decision.model)
         try:
             if hasattr(client, "generate_stream"):
                 async for token in client.generate_stream(
@@ -360,8 +382,11 @@ class InferenceRouter:
                 )
                 yield response.text
         finally:
-            # Clients are cached — do NOT close per-request
-            pass
     def get_available_providers(self) -> list[str]:
         """Return a list of currently configured and available providers.
@@ -418,3 +443,35 @@ class InferenceRouter:
             "anthropic": settings.anthropic_model,
         }
         return model_defaults.get(provider, settings.llm_model)

 from __future__ import annotations
+import contextlib
 from typing import TYPE_CHECKING
 from pydantic import BaseModel
 from config.settings import settings
+from inference.byok_context import get_byok_runtime
 from inference.llm_factory import LLMResponse, get_llm
 from ingestion.metadata import SensitivityLevel
 from utils.logging import get_logger
         if isinstance(sensitivity_level, str):
             sensitivity_level = SensitivityLevel(sensitivity_level.lower())
+        # BYOK: when a visitor brought their own usable key/provider for THIS
+        # request (carried via the ContextVar set in run_rag_pipeline[_stream]),
+        # treat their provider like an explicit override so their key actually
+        # powers the answer. The same HIGH-sensitivity guard below still applies,
+        # so a visitor cloud key cannot move HIGH off local on a self-hosted
+        # deploy (SAR_ALLOW_CLOUD_FOR_HIGH=false). The matching per-request
+        # client is built in ``_client_for`` from the same ContextVar.
+        if override_provider is None:
+            _rt = get_byok_runtime()
+            if _rt is not None and _rt.is_active():
+                override_provider = (_rt.provider or "").lower()
+        # 1. Admin / BYOK override — honoured for LOW/MEDIUM, but NEVER allowed
+        # to move HIGH-sensitivity work off local inference on a self-hosted
+        # deploy. Without this guard the override short-circuits the HIGH→local
+        # branch below (order-of-checks footgun). The override is still respected
+        # when the deploy explicitly opts into cloud-for-HIGH (the GPU-less
+        # public demo, SAR_ALLOW_CLOUD_FOR_HIGH=true) or when it targets local
+        # Ollama.
         if override_provider:
             high_must_stay_local = (
                 sensitivity_level == SensitivityLevel.HIGH
         import time
         start = time.perf_counter()
+        client, ephemeral = self._client_for(decision.provider, decision.model)
         try:
             response = await client.generate(prompt=prompt, system_prompt=system_prompt, **kwargs)
             elapsed_ms = (time.perf_counter() - start) * 1000
             response.latency_ms = elapsed_ms
             )
             response.latency_ms = (time.perf_counter() - start) * 1000
             return response, fallback_decision
+        finally:
+            # Per-request BYOK clients are fresh and unshared — close them so the
+            # visitor's httpx connection pool is released. Owner clients are
+            # cached and must never be closed here.
+            if ephemeral:
+                with contextlib.suppress(Exception):
+                    await client.close()
     @staticmethod
     def _normalised_sensitivity(level: SensitivityLevel | str) -> SensitivityLevel:
             forced_local=decision.forced_local,
         )
+        client, ephemeral = self._client_for(decision.provider, decision.model)
         try:
             import time
             response.latency_ms = elapsed_ms
             return response, decision
         finally:
+            # Owner clients are cached — never closed here. Per-request BYOK
+            # clients are fresh and must be closed to release their pool.
+            if ephemeral:
+                with contextlib.suppress(Exception):
+                    await client.close()
     async def generate_stream_with_routing(
         self,
             forced_local=decision.forced_local,
         )
+        client, ephemeral = self._client_for(decision.provider, decision.model)
         try:
             if hasattr(client, "generate_stream"):
                 async for token in client.generate_stream(
                 )
                 yield response.text
         finally:
+            # Owner clients are cached — never closed. Per-request BYOK clients
+            # are fresh; close after the stream is exhausted.
+            if ephemeral:
+                with contextlib.suppress(Exception):
+                    await client.close()
     def get_available_providers(self) -> list[str]:
         """Return a list of currently configured and available providers.
             "anthropic": settings.anthropic_model,
         }
         return model_defaults.get(provider, settings.llm_model)
+    @staticmethod
+    def _client_for(provider: str, model: str):
+        """Pick the LLM client for ``provider``, preferring per-request BYOK creds.
+        If the current context carries active BYOK creds whose provider equals
+        ``provider`` (case-insensitive), a new client is built from the
+        visitor's key (cloud providers) or URL (Ollama). Otherwise the client
+        comes from ``get_llm``.
+        Args:
+            provider: Provider name chosen by the routing decision.
+            model: Model name forwarded to the client factory.
+        Returns:
+            ``(client, ephemeral)`` — ``ephemeral`` is True for a freshly built
+            BYOK client, which the caller must ``await client.close()``; False
+            for the client returned by ``get_llm``, which must not be closed
+            per request.
+        """
+        runtime = get_byok_runtime()
+        # No usable visitor creds -> owner's client.
+        if runtime is None or not runtime.is_active():
+            return get_llm(provider=provider, model=model), False
+        wanted = provider.lower()
+        # Visitor creds are for a different provider than the one routed to.
+        if (runtime.provider or "").lower() != wanted:
+            return get_llm(provider=provider, model=model), False
+        if wanted == "ollama":
+            from inference.ollama_client import make_byok_ollama_client
+            byok_client = make_byok_ollama_client(base_url=runtime.ollama_url or "", model=model)
+            return byok_client, True
+        from inference.cloud_clients import make_byok_cloud_client
+        byok_client = make_byok_cloud_client(
+            provider=wanted, user_key=runtime.user_key or "", model=model
+        )
+        return byok_client, True

interfaces/api.py CHANGED Viewed

@@ -296,9 +296,29 @@ if _FASTAPI_AVAILABLE:
     # uses per-request BYOK credentials instead. Isolation is enforced via
     # session-scoped Qdrant collections, not JWT identity.
     if settings.byok_mode:
         from interfaces.byok import ByokCreds, client_ip_from_request, extract_byok
         from utils.rate_limiter import get_owner_key_throttle
         # All demo personas share ``org_id="demo"`` so they query the same
         # ingested corpus. RBAC differentiation is enforced via clearance
         # level + roles at the payload-filter layer -- exactly the production
@@ -495,7 +515,11 @@ if _FASTAPI_AVAILABLE:
             filter still runs end-to-end — same code path as authenticated
             queries, just with demo identities.
             """
-            if not creds.has_user_key():
                 throttle = get_owner_key_throttle()
                 client_ip = client_ip_from_request(request)
                 ok, meta = throttle.allow(client_ip)
@@ -516,16 +540,24 @@ if _FASTAPI_AVAILABLE:
             import time as _t
             _t0 = _t.perf_counter()
-            state = await run_rag_pipeline(
-                query=body.query,
-                user_context=user_ctx,
-                thread_id=f"byok-{creds.session_id}",
-                prefer_cloud=body.prefer_cloud,
-                # Visitor's chosen provider when present; falls back to env.
-                override_provider=creds.safe_provider(),
-                persona_style=_persona_style(creds),
-                byok_session_id=creds.session_id,
-            )
             elapsed_ms = (_t.perf_counter() - _t0) * 1000
             response = QueryResponse.from_state(state)
             # Persist a single audit-log row so /byok/audit can surface the
@@ -591,7 +623,7 @@ if _FASTAPI_AVAILABLE:
             CORS is already mounted on the app when ``byok_mode`` is on.
             """
-            if not creds.has_user_key():
                 throttle = get_owner_key_throttle()
                 client_ip = client_ip_from_request(request)
                 ok, meta = throttle.allow(client_ip)
@@ -614,6 +646,9 @@ if _FASTAPI_AVAILABLE:
                 import time as _t
                 _t0 = _t.perf_counter()
                 # Replay the session_id up front so the client can stitch
                 # token deltas to a known turn without waiting for `final`.
                 yield (
@@ -674,6 +709,10 @@ if _FASTAPI_AVAILABLE:
                 except Exception as exc:  # pragma: no cover -- defensive
                     logger.exception("byok_stream_failed", error=str(exc))
                     yield (f"event: error\ndata: {json.dumps({'message': 'stream_failed'})}\n\n")
                 # Persist audit row at the end of the stream so /byok/audit
                 # surfaces the session's history even when the visitor
                 # disconnects before the final frame.
@@ -1174,22 +1213,6 @@ if _FASTAPI_AVAILABLE:
             token_type="bearer",
             expires_in=body.ttl_seconds or settings.jwt_ttl_seconds,
         )
-        try:
-            token = issue_token(
-                user_id=body.user_id,
-                org_id=body.org_id,
-                roles=body.roles,
-                clearance_level=body.clearance_level,
-                ttl_seconds=body.ttl_seconds,
-            )
-        except AuthError as exc:
-            raise HTTPException(
-                status.HTTP_500_INTERNAL_SERVER_ERROR, f"token_issue_{exc.reason}: {exc}"
-            ) from exc
-        return _TokenResponse(
-            access_token=token,
-            expires_in=body.ttl_seconds or settings.jwt_ttl_seconds,
-        )
 else:  # pragma: no cover
     app = None  # type: ignore[assignment]

     # uses per-request BYOK credentials instead. Isolation is enforced via
     # session-scoped Qdrant collections, not JWT identity.
     if settings.byok_mode:
+        from inference.byok_context import (
+            ByokRuntime,
+            reset_byok_runtime,
+            set_byok_runtime,
+        )
         from interfaces.byok import ByokCreds, client_ip_from_request, extract_byok
         from utils.rate_limiter import get_owner_key_throttle
+        def _byok_runtime_for(creds: ByokCreds) -> ByokRuntime | None:
+            """Translate request creds into a ``ByokRuntime``, or None.
+            A runtime is produced only when ``creds.byok_active()`` is true;
+            it carries the provider from ``creds.safe_provider()`` plus the
+            visitor's key and Ollama URL. For inactive creds the result is
+            None, i.e. no per-request BYOK binding.
+            """
+            if creds.byok_active():
+                return ByokRuntime(
+                    provider=creds.safe_provider(),
+                    user_key=creds.user_key,
+                    ollama_url=creds.ollama_url,
+                )
+            return None
         # All demo personas share ``org_id="demo"`` so they query the same
         # ingested corpus. RBAC differentiation is enforced via clearance
         # level + roles at the payload-filter layer -- exactly the production
             filter still runs end-to-end — same code path as authenticated
             queries, just with demo identities.
             """
+            # Only a visitor with *usable* BYOK creds bypasses the throttle —
+            # and that same key now actually powers the call (see the BYOK
+            # runtime below). A bare/junk key with no usable provider no longer
+            # skips the throttle while spending the owner key.
+            if not creds.byok_active():
                 throttle = get_owner_key_throttle()
                 client_ip = client_ip_from_request(request)
                 ok, meta = throttle.allow(client_ip)
             import time as _t
             _t0 = _t.perf_counter()
+            # Bind the visitor's key/provider for THIS request so the inference
+            # router builds a per-request client from it. The ContextVar
+            # propagates into run_rag_pipeline and every LangGraph node/LLM call;
+            # reset in finally so it never leaks to the next request.
+            _byok_tok = set_byok_runtime(_byok_runtime_for(creds))
+            try:
+                state = await run_rag_pipeline(
+                    query=body.query,
+                    user_context=user_ctx,
+                    thread_id=f"byok-{creds.session_id}",
+                    prefer_cloud=body.prefer_cloud,
+                    # Visitor's chosen provider when present; falls back to env.
+                    override_provider=creds.safe_provider(),
+                    persona_style=_persona_style(creds),
+                    byok_session_id=creds.session_id,
+                )
+            finally:
+                reset_byok_runtime(_byok_tok)
             elapsed_ms = (_t.perf_counter() - _t0) * 1000
             response = QueryResponse.from_state(state)
             # Persist a single audit-log row so /byok/audit can surface the
             CORS is already mounted on the app when ``byok_mode`` is on.
             """
+            if not creds.byok_active():
                 throttle = get_owner_key_throttle()
                 client_ip = client_ip_from_request(request)
                 ok, meta = throttle.allow(client_ip)
                 import time as _t
                 _t0 = _t.perf_counter()
+                # Bind the visitor's key/provider for the lifetime of this
+                # stream so the synthesizer's streaming LLM call uses it.
+                _byok_tok = set_byok_runtime(_byok_runtime_for(creds))
                 # Replay the session_id up front so the client can stitch
                 # token deltas to a known turn without waiting for `final`.
                 yield (
                 except Exception as exc:  # pragma: no cover -- defensive
                     logger.exception("byok_stream_failed", error=str(exc))
                     yield (f"event: error\ndata: {json.dumps({'message': 'stream_failed'})}\n\n")
+                finally:
+                    # Always clear the per-request BYOK runtime so it never
+                    # leaks into the next request handled by this worker.
+                    reset_byok_runtime(_byok_tok)
                 # Persist audit row at the end of the stream so /byok/audit
                 # surfaces the session's history even when the visitor
                 # disconnects before the final frame.
             token_type="bearer",
             expires_in=body.ttl_seconds or settings.jwt_ttl_seconds,
         )
 else:  # pragma: no cover
     app = None  # type: ignore[assignment]

interfaces/byok.py CHANGED Viewed

@@ -80,6 +80,23 @@ class ByokCreds(BaseModel):
             return self.provider.lower()
         return None
 def client_ip_from_request(request: Request) -> str:
     """Resolve the visitor IP for throttling, honouring ``X-Forwarded-For``.
@@ -135,8 +152,10 @@ def _derive_session_id(client_host: str | None) -> str:
     """
     host = (client_host or "anon").strip() or "anon"
     digest = hashlib.sha256(host.encode("utf-8")).hexdigest()[:8]
-    random = uuid.uuid4().hex[:8]
-    return f"{digest}-{random}"
 def build_creds(

             return self.provider.lower()
         return None
+    def byok_active(self) -> bool:
+        """Tell whether these creds are complete enough for a per-request LLM call.
+        Stricter than :meth:`has_user_key`: the provider returned by
+        :meth:`safe_provider` must be usable as well.
+        * ``ollama`` — requires a non-blank ``ollama_url`` (the URL is not
+          contacted here, only checked for content).
+        * ``groq`` / ``openai`` / ``anthropic`` — requires :meth:`has_user_key`.
+        * anything else, including no provider — not active.
+        The chat endpoints use this as the gate both for skipping the owner-key
+        throttle and for binding the per-request client, so a bare
+        ``X-User-LLM-Key`` without a usable provider does not skip the throttle.
+        """
+        provider = self.safe_provider()
+        if provider == "ollama":
+            return bool(self.ollama_url and self.ollama_url.strip())
+        return provider in ("groq", "openai", "anthropic") and self.has_user_key()
 def client_ip_from_request(request: Request) -> str:
     """Resolve the visitor IP for throttling, honouring ``X-Forwarded-For``.
     """
     host = (client_host or "anon").strip() or "anon"
     digest = hashlib.sha256(host.encode("utf-8")).hexdigest()[:8]
+    # Full UUID4 (122 bits) for the random component — the session id guards one
+    # visitor's session-scoped uploads / audit from another, so it must be hard
+    # to guess. The host digest only adds reconnect stickiness within a worker.
+    return f"{digest}-{uuid.uuid4().hex}"
 def build_creds(

utils/query_cache.py CHANGED Viewed

@@ -1,277 +1,289 @@
-"""Query result caching with Redis fallback to in-memory.
-Caches RAG pipeline results to avoid redundant LLM calls and retrieval
-for identical queries from the same user. Uses Redis when available for
-distributed caching across multiple app instances.
-"""
-from __future__ import annotations
-import hashlib
-import json
-import time
-from typing import Any
-from config.settings import settings
-from utils.logging import get_logger
-from utils.pii import redact_dict
-logger = get_logger(__name__)
-# In-memory fallback cache
-_memory_cache: dict[str, tuple[dict[str, Any], float]] = {}
-_memory_cache_ttl_seconds: float = 300.0  # 5 minutes default
-# Redis singleton
-_redis_client = None
-# Cache metrics counters
-_cache_hits: int = 0
-_cache_misses: int = 0
-def get_cache_metrics() -> dict[str, int]:
-    """Return cache hit/miss counters."""
-    total = _cache_hits + _cache_misses
-    return {
-        "hits": _cache_hits,
-        "misses": _cache_misses,
-        "total": total,
-        "hit_rate": round(_cache_hits / total, 4) if total > 0 else 0.0,
-    }
-def reset_cache_metrics() -> None:
-    """Reset cache hit/miss counters."""
-    global _cache_hits, _cache_misses
-    _cache_hits = 0
-    _cache_misses = 0
-def _get_redis_client():
-    """Lazy-initialize Redis client for query caching.
-    Returns:
-        Redis client instance or None if unavailable.
-    """
-    global _redis_client
-    if _redis_client is not None:
-        return _redis_client
-    if not settings.redis_url:
-        return None
-    try:
-        import redis
-        _redis_client = redis.from_url(settings.redis_url, decode_responses=True)
-        _redis_client.ping()
-        logger.info("query_cache_redis_connected")
-        return _redis_client
-    except ImportError:
-        logger.debug("redis_not_installed_for_query_cache")
-    except Exception as exc:
-        logger.warning("query_cache_redis_connection_failed", error=str(exc))
-    _redis_client = False  # Mark as unavailable
-    return None
-def _build_cache_key(user_id: str, query: str, context_hash: str = "") -> str:
-    """Build a deterministic cache key from user + query.
-    Args:
-        user_id: The user's identifier.
-        query: The query text.
-        context_hash: Optional hash of additional context (model, filters, etc.).
-    Returns:
-        A hash string suitable for use as a cache key.
-    """
-    key_data = f"{user_id}:{query.lower().strip()}:{context_hash}"
-    return hashlib.sha256(key_data.encode()).hexdigest()[:32]
-def get_cached_result(
-    user_id: str,
-    query: str,
-    context_hash: str = "",
-    ttl_seconds: float | None = None,
-) -> dict[str, Any] | None:
-    """Retrieve a cached query result if available and not expired.
-    Args:
-        user_id: The user's identifier.
-        query: The query text.
-        context_hash: Optional hash of additional context.
-        ttl_seconds: Cache TTL. Defaults to settings or 300s.
-    Returns:
-        Cached result dict, or None if not found or expired.
-    """
-    cache_key = _build_cache_key(user_id, query, context_hash)
-    _ = ttl_seconds or _memory_cache_ttl_seconds
-    global _cache_hits, _cache_misses
-    # Try Redis first
-    redis_client = _get_redis_client()
-    if redis_client:
-        try:
-            cached = redis_client.get(f"rag:query:{cache_key}")
-            if cached:
-                result = json.loads(cached)
-                _cache_hits += 1
-                logger.info("query_cache_hit", source="redis", user_id=user_id)
-                return result
-        except Exception as exc:
-            logger.debug("query_cache_redis_read_failed", error=str(exc))
-    # Fallback to in-memory
-    if cache_key in _memory_cache:
-        result, expiry = _memory_cache[cache_key]
-        if time.time() < expiry:
-            _cache_hits += 1
-            logger.info("query_cache_hit", source="memory", user_id=user_id)
-            return result
-        # Expired — clean up
-        del _memory_cache[cache_key]
-    _cache_misses += 1
-    return None
-def set_cached_result(
-    user_id: str,
-    query: str,
-    result: dict[str, Any],
-    context_hash: str = "",
-    ttl_seconds: float | None = None,
-) -> None:
-    """Store a query result in the cache.
-    Args:
-        user_id: The user's identifier.
-        query: The query text.
-        result: The result dict to cache.
-        context_hash: Optional hash of additional context.
-        ttl_seconds: Cache TTL. Defaults to settings or 300s.
-    """
-    cache_key = _build_cache_key(user_id, query, context_hash)
-    ttl = ttl_seconds or _memory_cache_ttl_seconds
-    # Serialize result (exclude non-serializable fields) + redact PII before
-    # persistence so disk/Redis never sees emails, phones, card numbers, etc.
-    serializable_result = redact_dict(_make_serializable(result))
-    # Try Redis first
-    redis_client = _get_redis_client()
-    if redis_client:
-        try:
-            redis_client.setex(
-                f"rag:query:{cache_key}",
-                int(ttl),
-                json.dumps(serializable_result),
-            )
-            logger.info("query_cache_stored", source="redis", user_id=user_id)
-            return
-        except Exception as exc:
-            logger.debug("query_cache_redis_write_failed", error=str(exc))
-    # Fallback to in-memory
-    _memory_cache[cache_key] = (serializable_result, time.time() + ttl)
-    logger.info("query_cache_stored", source="memory", user_id=user_id)
-    # Prune memory cache if too large
-    if len(_memory_cache) > 1000:
-        _prune_memory_cache()
-def _make_serializable(obj: Any) -> Any:
-    """Convert an object to a JSON-serializable form.
-    Args:
-        obj: Object to serialize.
-    Returns:
-        JSON-serializable representation.
-    """
-    if isinstance(obj, dict):
-        return {k: _make_serializable(v) for k, v in obj.items()}
-    if isinstance(obj, list):
-        return [_make_serializable(v) for v in obj]
-    if isinstance(obj, (str, int, float, bool, type(None))):
-        return obj
-    return str(obj)
-def _prune_memory_cache() -> None:
-    """Remove expired entries from the in-memory cache."""
-    now = time.time()
-    expired_keys = [k for k, (_, expiry) in _memory_cache.items() if expiry < now]
-    for k in expired_keys:
-        del _memory_cache[k]
-    # If still too large, remove oldest
-    if len(_memory_cache) > 1000:
-        sorted_items = sorted(_memory_cache.items(), key=lambda x: x[1][1])
-        for k, _ in sorted_items[:100]:
-            del _memory_cache[k]
-def invalidate_user_cache(user_id: str) -> int:
-    """Invalidate all cached queries for a specific user.
-    Args:
-        user_id: The user's identifier.
-    Returns:
-        Number of entries invalidated.
-    """
-    count = 0
-    # In-memory
-    prefix = hashlib.sha256(f"{user_id}:".encode()).hexdigest()[:16]
-    keys_to_remove = [k for k in _memory_cache if k.startswith(prefix)]
-    for k in keys_to_remove:
-        del _memory_cache[k]
-        count += 1
-    # Redis — scan for user-specific keys
-    redis_client = _get_redis_client()
-    if redis_client:
-        try:
-            pattern = "rag:query:*"
-            for key in redis_client.scan_iter(match=pattern, count=100):
-                # Best-effort: we can't easily decode the key back to user_id
-                # So we just clear all query cache entries
-                redis_client.delete(key)
-                count += 1
-        except Exception as exc:
-            logger.debug("query_cache_redis_invalidate_failed", error=str(exc))
-    logger.info("query_cache_invalidated", user_id=user_id, count=count)
-    return count
-def clear_all_cache() -> int:
-    """Clear all query caches (memory + Redis).
-    Returns:
-        Number of entries cleared.
-    """
-    count = len(_memory_cache)
-    _memory_cache.clear()
-    redis_client = _get_redis_client()
-    if redis_client:
-        try:
-            pattern = "rag:query:*"
-            for key in redis_client.scan_iter(match=pattern, count=100):
-                redis_client.delete(key)
-                count += 1
-        except Exception as exc:
-            logger.debug("query_cache_redis_clear_failed", error=str(exc))
-    logger.info("query_cache_cleared_all", count=count)
-    return count

+"""Query result caching with Redis fallback to in-memory.
+Caches RAG pipeline results to avoid redundant LLM calls and retrieval
+for identical queries from the same user. Uses Redis when available for
+distributed caching across multiple app instances.
+"""
+from __future__ import annotations
+import hashlib
+import json
+import time
+from typing import Any
+from config.settings import settings
+from utils.logging import get_logger
+from utils.pii import redact_dict
+logger = get_logger(__name__)
+# In-memory fallback cache: maps cache key -> (stored result, absolute expiry
+# timestamp on the time.time() clock). Written when Redis is unavailable or a
+# Redis write fails.
+_memory_cache: dict[str, tuple[dict[str, Any], float]] = {}
+_memory_cache_ttl_seconds: float = 300.0  # 5 minutes default
+# Redis singleton. Tri-state: None = no connection established yet,
+# False = a connection attempt failed (remembered so it is not retried),
+# anything else = the connected client.
+_redis_client = None
+# Cache metrics counters (module-level, updated by get_cached_result)
+_cache_hits: int = 0
+_cache_misses: int = 0
+def get_cache_metrics() -> dict[str, int | float]:
+    """Return a snapshot of the cache hit/miss counters.
+    Returns:
+        Dict with integer ``hits``, ``misses`` and ``total`` counts plus a
+        float ``hit_rate`` (hits / total rounded to 4 places, or ``0.0`` when
+        no lookup has been recorded yet).
+    """
+    hits, misses = _cache_hits, _cache_misses
+    total = hits + misses
+    # Guard the division: total is 0 until the first lookup is counted.
+    hit_rate = round(hits / total, 4) if total > 0 else 0.0
+    return {
+        "hits": hits,
+        "misses": misses,
+        "total": total,
+        "hit_rate": hit_rate,
+    }
+def reset_cache_metrics() -> None:
+    """Zero the module-level cache hit and miss counters."""
+    global _cache_hits, _cache_misses
+    _cache_hits = _cache_misses = 0
+def _get_redis_client():
+    """Lazy-initialize the shared Redis client used for query caching.
+    The module-level ``_redis_client`` is tri-state: ``None`` means no
+    connection has been established yet, ``False`` means an attempt failed
+    (remembered so the connection is not retried on every call), and anything
+    else is the connected client.
+    Returns:
+        Redis client instance, or None if Redis is unconfigured, not
+        installed, or unreachable.
+    """
+    global _redis_client
+    # A remembered failure must surface as None (the documented contract),
+    # never as the internal False sentinel.
+    if _redis_client is False:
+        return None
+    if _redis_client is not None:
+        return _redis_client
+    if not settings.redis_url:
+        return None
+    try:
+        import redis
+        # Bind to a local first so the module global only ever holds a client
+        # that has answered ping().
+        client = redis.from_url(settings.redis_url, decode_responses=True)
+        client.ping()
+        _redis_client = client
+        logger.info("query_cache_redis_connected")
+        return _redis_client
+    except ImportError:
+        logger.debug("redis_not_installed_for_query_cache")
+    except Exception as exc:
+        logger.warning("query_cache_redis_connection_failed", error=str(exc))
+    _redis_client = False  # Mark as unavailable
+    return None
+def _user_prefix(user_id: str) -> str:
+    """Return the fixed-length key prefix shared by all of one user's entries.
+    The prefix is the first 12 hex characters of the SHA-256 digest of
+    ``user_id``; ``invalidate_user_cache`` matches on it to find a user's keys.
+    """
+    digest = hashlib.sha256(user_id.encode()).hexdigest()
+    return digest[:12]
+def _build_cache_key(user_id: str, query: str, context_hash: str = "") -> str:
+    """Derive the deterministic cache key for a (user, query, context) triple.
+    The key is laid out as ``<user_prefix><body_hash>``. The leading part
+    depends only on the user, so all of one user's entries start with the same
+    prefix and ``invalidate_user_cache`` can select them with a prefix match.
+    The query is lower-cased and stripped before hashing, so letter case and
+    surrounding whitespace do not produce distinct keys.
+    Args:
+        user_id: The user's identifier.
+        query: The query text.
+        context_hash: Optional hash of additional context (model, filters, etc.).
+    Returns:
+        A hex string: the user prefix followed by the first 20 hex characters
+        of the SHA-256 digest of ``"<normalized query>:<context_hash>"``.
+    """
+    normalized_query = query.lower().strip()
+    digest = hashlib.sha256(f"{normalized_query}:{context_hash}".encode()).hexdigest()
+    return _user_prefix(user_id) + digest[:20]
+def get_cached_result(
+    user_id: str,
+    query: str,
+    context_hash: str = "",
+    ttl_seconds: float | None = None,
+) -> dict[str, Any] | None:
+    """Retrieve a cached query result if available and not expired.
+    Redis is consulted first; the in-memory cache is the fallback when Redis
+    is unavailable, has no entry, or the read fails. Each call increments the
+    module-level hit or miss counter.
+    Args:
+        user_id: The user's identifier.
+        query: The query text.
+        context_hash: Optional hash of additional context.
+        ttl_seconds: Accepted for signature symmetry with
+            ``set_cached_result`` but not used on reads; an entry's expiry is
+            fixed when it is stored.
+    Returns:
+        Cached result dict, or None if not found or expired.
+    """
+    global _cache_hits, _cache_misses
+    cache_key = _build_cache_key(user_id, query, context_hash)
+    # Try Redis first
+    redis_client = _get_redis_client()
+    if redis_client:
+        try:
+            cached = redis_client.get(f"rag:query:{cache_key}")
+            if cached:
+                result = json.loads(cached)
+                _cache_hits += 1
+                logger.info("query_cache_hit", source="redis", user_id=user_id)
+                return result
+        except Exception as exc:
+            # Best-effort: a failed Redis read falls through to memory.
+            logger.debug("query_cache_redis_read_failed", error=str(exc))
+    # Fallback to in-memory (single lookup instead of ``in`` + index)
+    entry = _memory_cache.get(cache_key)
+    if entry is not None:
+        result, expiry = entry
+        if time.time() < expiry:
+            _cache_hits += 1
+            logger.info("query_cache_hit", source="memory", user_id=user_id)
+            return result
+        # Expired — clean up
+        del _memory_cache[cache_key]
+    _cache_misses += 1
+    return None
+def set_cached_result(
+    user_id: str,
+    query: str,
+    result: dict[str, Any],
+    context_hash: str = "",
+    ttl_seconds: float | None = None,
+) -> None:
+    """Cache a query result under the key derived from user, query and context.
+    The result is converted to a JSON-serializable form and passed through
+    ``redact_dict`` before it is stored. Redis is tried first; when it is
+    unavailable or the write fails, the entry goes to the in-memory cache,
+    which is pruned once it holds more than 1000 entries.
+    Args:
+        user_id: The user's identifier.
+        query: The query text.
+        result: The result dict to cache.
+        context_hash: Optional hash of additional context.
+        ttl_seconds: Cache TTL in seconds. A falsy value (None or 0) selects
+            the module default of 300s.
+    """
+    key = _build_cache_key(user_id, query, context_hash)
+    lifetime = ttl_seconds or _memory_cache_ttl_seconds
+    # Sanitize before persistence: only the serializable, redacted payload is
+    # ever handed to Redis or kept in process memory.
+    payload = redact_dict(_make_serializable(result))
+    client = _get_redis_client()
+    if client:
+        try:
+            # SETEX takes whole seconds, hence the int() truncation.
+            client.setex(f"rag:query:{key}", int(lifetime), json.dumps(payload))
+            logger.info("query_cache_stored", source="redis", user_id=user_id)
+            return
+        except Exception as exc:
+            # Best-effort: fall through to the in-memory cache below.
+            logger.debug("query_cache_redis_write_failed", error=str(exc))
+    # In-memory fallback stores the absolute expiry alongside the payload.
+    _memory_cache[key] = (payload, time.time() + lifetime)
+    logger.info("query_cache_stored", source="memory", user_id=user_id)
+    # Keep the fallback cache bounded.
+    if len(_memory_cache) > 1000:
+        _prune_memory_cache()
+def _make_serializable(obj: Any) -> Any:
+    """Convert an object to a JSON-serializable form.
+    Dicts, lists and tuples are converted recursively (tuples become lists,
+    which is how JSON represents them); JSON primitives pass through
+    unchanged; any other object is replaced by ``str(obj)``. Dict keys that
+    ``json.dumps`` would reject are stringified too, so the result can always
+    be dumped.
+    Args:
+        obj: Object to serialize.
+    Returns:
+        JSON-serializable representation.
+    """
+    primitives = (str, int, float, bool, type(None))
+    if isinstance(obj, primitives):
+        return obj
+    if isinstance(obj, dict):
+        # json.dumps raises TypeError for keys that are not
+        # str/int/float/bool/None, so coerce those keys to str.
+        return {
+            (k if isinstance(k, primitives) else str(k)): _make_serializable(v)
+            for k, v in obj.items()
+        }
+    if isinstance(obj, (list, tuple)):
+        return [_make_serializable(v) for v in obj]
+    return str(obj)
+def _prune_memory_cache() -> None:
+    """Evict expired entries from the in-memory cache, then cap its size.
+    If more than 1000 entries remain after the expired ones are dropped, the
+    100 entries with the earliest expiry are evicted as well.
+    """
+    cutoff = time.time()
+    # Collect first, delete after: the dict cannot change size mid-iteration.
+    stale = [key for key, entry in _memory_cache.items() if entry[1] < cutoff]
+    for key in stale:
+        del _memory_cache[key]
+    if len(_memory_cache) <= 1000:
+        return
+    # Still over the cap: evict the entries closest to expiring.
+    by_expiry = sorted(_memory_cache, key=lambda key: _memory_cache[key][1])
+    for key in by_expiry[:100]:
+        del _memory_cache[key]
+def invalidate_user_cache(user_id: str) -> int:
+    """Invalidate all cached queries for a specific user.
+    Cache keys start with the user's prefix (see ``_build_cache_key``), so
+    both the in-memory scan and the Redis ``SCAN`` match on that prefix and
+    leave other users' entries untouched.
+    Args:
+        user_id: The user's identifier.
+    Returns:
+        Number of entries invalidated (memory + Redis). If Redis fails
+        part-way, the count covers what was deleted before the failure.
+    """
+    prefix = _user_prefix(user_id)
+    # In-memory — snapshot the matching keys first; the dict cannot be
+    # mutated while it is being iterated.
+    keys_to_remove = [k for k in _memory_cache if k.startswith(prefix)]
+    for k in keys_to_remove:
+        del _memory_cache[k]
+    count = len(keys_to_remove)
+    # Redis — scan only this user's keys. The prefix is plain hex, so it
+    # contains no glob metacharacters that would need escaping in MATCH.
+    redis_client = _get_redis_client()
+    if redis_client:
+        try:
+            pattern = f"rag:query:{prefix}*"
+            for key in redis_client.scan_iter(match=pattern, count=100):
+                redis_client.delete(key)
+                count += 1
+        except Exception as exc:
+            # Best-effort: Redis entries that survive still expire via TTL.
+            logger.debug("query_cache_redis_invalidate_failed", error=str(exc))
+    logger.info("query_cache_invalidated", user_id=user_id, count=count)
+    return count
+def clear_all_cache() -> int:
+    """Drop every query-cache entry from memory and from Redis.
+    Returns:
+        Number of entries cleared: the in-memory entries plus each Redis key
+        under ``rag:query:`` deleted before any Redis error occurred.
+    """
+    cleared = len(_memory_cache)
+    _memory_cache.clear()
+    client = _get_redis_client()
+    if client:
+        try:
+            for redis_key in client.scan_iter(match="rag:query:*", count=100):
+                client.delete(redis_key)
+                cleared += 1
+        except Exception as exc:
+            # Best-effort: report what was cleared so far.
+            logger.debug("query_cache_redis_clear_failed", error=str(exc))
+    logger.info("query_cache_cleared_all", count=cleared)
+    return cleared