Spaces:

LeomordKaly
/

secureagentrag-api

Running

App Files Files Community

LeomordKaly commited on 14 days ago

Commit

f4ef3b8

verified ·

1 Parent(s): b2a2d08

deploy: phase 3 BYOK backend (Dockerfile.hf, FastAPI on 7860)

Browse files

Files changed (11) hide show

Dockerfile.hf +151 -151
config/settings.py +316 -316
inference/cloud_clients.py +577 -577
inference/ollama_client.py +334 -334
interfaces/api.py +432 -425
interfaces/byok.py +166 -166
retrieval/multitenancy.py +43 -43
retrieval/session_purge.py +185 -185
utils/observability.py +252 -252
utils/pii.py +146 -146
utils/rate_limiter.py +524 -524

Dockerfile.hf CHANGED Viewed

@@ -1,151 +1,151 @@
-# =============================================================================
-# Dockerfile.hf — SecureAgentRAG backend for Hugging Face Spaces (CPU Basic).
-# =============================================================================
-# Two-stage build keeps the runtime image lean. The HF Space free tier is
-# CPU-only with 16 GB RAM and ~50 GB ephemeral disk, so we target a tight
-# memory footprint:
-#
-#   - Python 3.11-slim base (~150 MB)
-#   - Only [api, embeddings-local, pii] extras (no OCR, no Phoenix, no Postgres,
-#     no Redis, no MCP) -- those modules are present in the source but their
-#     dependencies are not installed
-#   - cross-encoder reranker downloaded on first request (auto-cached under
-#     /home/user/.cache/huggingface). Skips the 2.3 GB fine-tuned checkpoint
-#     for the initial deploy; phase 3.2 can swap to fine_tuned once the
-#     reranker repo is published on HF Hub.
-#
-# The Space-side README.md is uploaded separately by scripts/deploy_hf_space.py
-# with a YAML frontmatter declaring sdk=docker + app_port=7860.
-# =============================================================================
-# --- builder ----------------------------------------------------------------
-FROM python:3.11-slim AS builder
-WORKDIR /app
-RUN pip install --no-cache-dir uv
-# pyproject.toml + a copy of the source are required for uv to build the
-# editable install. README.md is referenced as the long_description.
-COPY pyproject.toml ./
-COPY README.md ./
-# Touch the package directories that hatchling treats as the wheel root --
-# we only need the directory tree to exist at build time so hatchling can
-# scan for __init__.py files. The actual code lands in the runtime stage.
-RUN mkdir -p config core inference retrieval interfaces ingestion utils evaluation app \
-    && touch config/__init__.py core/__init__.py inference/__init__.py \
-    && touch retrieval/__init__.py interfaces/__init__.py ingestion/__init__.py \
-    && touch utils/__init__.py evaluation/__init__.py app/__init__.py
-# Intentionally skip [pii] extras -- the regex patterns in utils/pii.py
-# already cover every BYOK key shape (Groq / OpenAI / Anthropic / HF / Vercel
-# / Qdrant JWT / Qdrant management). Adding Presidio would pull spaCy
-# en_core_web_lg (~770 MB) which auto-downloads at runtime and crashes the
-# container on the CPU Basic Space when the package installer is absent.
-RUN uv venv /app/.venv \
-    && uv pip install --python /app/.venv/bin/python \
-        -e ".[api,embeddings-local]"
-# --- runtime ----------------------------------------------------------------
-FROM python:3.11-slim AS runtime
-WORKDIR /app
-# HF Spaces convention: run as uid 1000 with a writeable /home/user.
-RUN useradd -m -u 1000 user
-# System deps for PDF / image processing only -- no OCR / paddle.
-# Debian 12+ (trixie) renamed libgl1-mesa-glx -> libgl1 and libxrender-dev
-# is no longer needed at runtime (runtime is libxrender1).
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-        libglib2.0-0 libsm6 libxext6 libxrender1 libgl1 curl \
-    && rm -rf /var/lib/apt/lists/*
-# Bring the virtualenv from the builder stage.
-COPY --from=builder /app/.venv /app/.venv
-ENV PATH="/app/.venv/bin:$PATH"
-# Copy application source. Files that match .dockerignore are filtered out.
-COPY --chown=user:user . /app
-USER user
-# Pre-populate the HF cache so the cross-encoder lives on disk before the
-# first request. Defensive: never fails the build -- if HF Hub is unreachable
-# during build (offline mirrors etc.) the cache is populated on first query.
-RUN python -c "import os; \
-from huggingface_hub import snapshot_download; \
-import sys; \
-try: snapshot_download(repo_id='BAAI/bge-reranker-v2-m3', cache_dir='/home/user/.cache/huggingface/hub'); print('reranker cached') \
-except Exception as e: print(f'reranker cache skipped: {e!r}', file=sys.stderr)" \
-    || echo "build-time reranker download failed -- will lazy-load on first request"
-# --- BYOK production env ---------------------------------------------------
-# Real secrets (Qdrant URL + API key, Groq key) are injected via HF Space
-# secrets panel -- they ride the same SAR_* env-var protocol but are NOT
-# baked into the image. Only mode flags and safe defaults live here.
-ENV SAR_BYOK_MODE=true
-ENV SAR_BYOK_OWNER_QUOTA=3
-ENV SAR_SESSION_TTL_HOURS=24
-ENV SAR_CORS_ALLOW_ORIGINS='["https://app.eilm.live","https://secureagentrag-web.vercel.app","https://secureagentrag.vercel.app"]'
-# Cloud LLM defaults -- Groq llama-3.1-8b-instant is the cheapest fast option
-# on the free tier. Visitor BYOK overrides this per request.
-ENV SAR_DEFAULT_PROVIDER=groq
-ENV SAR_CLOUD_PROVIDER=groq
-ENV SAR_LLM_MODEL=llama-3.1-8b-instant
-# Embedding stack -- local BGE-M3 via sentence-transformers (CPU). Avoids
-# Ollama entirely.
-ENV SAR_EMBEDDING_BACKEND=local
-ENV SAR_LOCAL_EMBEDDING_MODEL=BAAI/bge-m3
-ENV SAR_EMBEDDING_MODEL=bge-m3
-ENV SAR_EMBEDDING_DIM=1024
-# Cross-encoder reranker -- balances quality with build size. Swap to
-# fine_tuned + SAR_FINETUNED_RERANKER_PATH after phase 3.2 ships the
-# 2.3 GB checkpoint to LeomordKaly/secureagentrag-reranker-v1.
-ENV SAR_RERANKER_TYPE=cross_encoder
-ENV SAR_RERANKER_CHECKPOINT=BAAI/bge-reranker-v2-m3
-# Sparse retrieval -- BM25 keeps the cold path zero-dep; SPLADE adds an
-# extra ~600 MB model and is skipped on free CPU Basic.
-ENV SAR_SPARSE_BACKEND=bm25
-# Persistence paths -- /tmp is the only writable area on HF Spaces.
-ENV SAR_AUDIT_LOG_DIR=/tmp/secureagentrag/audit_logs
-ENV SAR_CONVERSATION_DIR=/tmp/secureagentrag/conversations
-ENV SAR_CHECKPOINT_DB_PATH=/tmp/secureagentrag/checkpoints.sqlite
-ENV SAR_BM25_INDEX_PATH=/tmp/secureagentrag/bm25_index.pkl
-# Multi-tenant collections route BYOK session -> documents_sess_<sid>.
-ENV SAR_MULTI_TENANT_COLLECTIONS=true
-# Pipeline safety
-ENV SAR_REQUEST_TIMEOUT_S=120
-ENV SAR_FAITHFULNESS_GATE_ENABLED=true
-ENV SAR_FAITHFULNESS_GATE_MODE=flag
-ENV SAR_FAITHFULNESS_THRESHOLD=0.7
-# Logging
-ENV SAR_LOG_LEVEL=INFO
-# HF cache lives under the user home which is the only persistent writable
-# tree across Space restarts on CPU Basic.
-ENV HF_HOME=/home/user/.cache/huggingface
-ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface/hub
-EXPOSE 7860
-HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
-    CMD curl --fail --silent --show-error http://localhost:7860/healthz || exit 1
-# uvicorn with 1 worker -- on CPU Basic two workers thrash the memory.
-CMD ["uvicorn", "interfaces.api:app", \
-     "--host", "0.0.0.0", \
-     "--port", "7860", \
-     "--workers", "1", \
-     "--timeout-keep-alive", "30", \
-     "--no-access-log"]

+# =============================================================================
+# Dockerfile.hf — SecureAgentRAG backend for Hugging Face Spaces (CPU Basic).
+# =============================================================================
+# Two-stage build keeps the runtime image lean. The HF Space free tier is
+# CPU-only with 16 GB RAM and ~50 GB ephemeral disk, so we target a tight
+# memory footprint:
+#
+#   - Python 3.11-slim base (~150 MB)
+#   - Only [api, embeddings-local] extras (no pii/Presidio, no OCR, no Phoenix,
+#     no Postgres, no Redis, no MCP) -- those modules are present in the source
+#     but their dependencies are not installed
+#   - cross-encoder reranker pre-fetched at build time on a best-effort basis,
+#     otherwise downloaded on first request (cached under
+#     /home/user/.cache/huggingface). Skips the 2.3 GB fine-tuned checkpoint
+#     for the initial deploy; phase 3.2 can swap to fine_tuned once the
+#     reranker repo is published on HF Hub.
+#
+# The Space-side README.md is uploaded separately by scripts/deploy_hf_space.py
+# with a YAML frontmatter declaring sdk=docker + app_port=7860.
+# =============================================================================
+# --- builder ----------------------------------------------------------------
+FROM python:3.11-slim AS builder
+WORKDIR /app
+RUN pip install --no-cache-dir uv
+# pyproject.toml is what uv needs to resolve and build the editable install;
+# README.md is referenced as the long_description. The package directories
+# themselves are only stubbed out in the next step.
+COPY pyproject.toml ./
+COPY README.md ./
+# Touch the package directories that hatchling treats as the wheel root --
+# we only need the directory tree to exist at build time so hatchling can
+# scan for __init__.py files. The actual code lands in the runtime stage.
+RUN mkdir -p config core inference retrieval interfaces ingestion utils evaluation app \
+    && touch config/__init__.py core/__init__.py inference/__init__.py \
+    && touch retrieval/__init__.py interfaces/__init__.py ingestion/__init__.py \
+    && touch utils/__init__.py evaluation/__init__.py app/__init__.py
+# Intentionally skip [pii] extras -- the regex patterns in utils/pii.py
+# already cover every BYOK key shape (Groq / OpenAI / Anthropic / HF / Vercel
+# / Qdrant JWT / Qdrant management). Adding Presidio would pull spaCy
+# en_core_web_lg (~770 MB) which auto-downloads at runtime and crashes the
+# container on the CPU Basic Space when the package installer is absent.
+RUN uv venv /app/.venv \
+    && uv pip install --python /app/.venv/bin/python \
+        -e ".[api,embeddings-local]"
+# --- runtime ----------------------------------------------------------------
+FROM python:3.11-slim AS runtime
+WORKDIR /app
+# HF Spaces convention: run as uid 1000 with a writeable /home/user.
+RUN useradd -m -u 1000 user
+# System deps for PDF / image processing only -- no OCR / paddle.
+# Debian 13 (trixie) renamed libgl1-mesa-glx -> libgl1 and libxrender-dev
+# is no longer needed at runtime (runtime is libxrender1).
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        libglib2.0-0 libsm6 libxext6 libxrender1 libgl1 curl \
+    && rm -rf /var/lib/apt/lists/*
+# Bring the virtualenv from the builder stage.
+COPY --from=builder /app/.venv /app/.venv
+ENV PATH="/app/.venv/bin:$PATH"
+# Copy application source. Files that match .dockerignore are filtered out.
+COPY --chown=user:user . /app
+USER user
+# Pre-populate the HF cache so the cross-encoder lives on disk before the
+# first request. Defensive: never fails the build -- if HF Hub is unreachable
+# during build (offline mirrors etc.) the cache is populated on first query.
+# NOTE: the backslash continuations collapse this script onto one physical
+# line, so it must consist of simple statements only -- a compound statement
+# such as try/except after ';' is a SyntaxError. Failure handling is therefore
+# left to the shell: any non-zero exit (HF Hub unreachable, import error) falls
+# through to the echo and the build carries on.
+RUN python -c "from huggingface_hub import snapshot_download; \
+snapshot_download(repo_id='BAAI/bge-reranker-v2-m3', cache_dir='/home/user/.cache/huggingface/hub'); \
+print('reranker cached')" \
+    || echo "build-time reranker download failed -- will lazy-load on first request"
+# --- BYOK production env ---------------------------------------------------
+# Real secrets (Qdrant URL + API key, Groq key) are injected via HF Space
+# secrets panel -- they ride the same SAR_* env-var protocol but are NOT
+# baked into the image. Only mode flags and safe defaults live here.
+ENV SAR_BYOK_MODE=true
+# NOTE(review): config/settings.py declares these two values as the fields
+# byok_owner_key_quota_per_hour and session_collection_ttl_hours, so with
+# env_prefix="SAR_" it reads SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR and
+# SAR_SESSION_COLLECTION_TTL_HOURS. The short names are kept in case another
+# module reads them directly -- confirm and drop whichever pair is unused.
+ENV SAR_BYOK_OWNER_QUOTA=3
+ENV SAR_BYOK_OWNER_KEY_QUOTA_PER_HOUR=3
+ENV SAR_SESSION_TTL_HOURS=24
+ENV SAR_SESSION_COLLECTION_TTL_HOURS=24
+ENV SAR_CORS_ALLOW_ORIGINS='["https://app.eilm.live","https://secureagentrag-web.vercel.app","https://secureagentrag.vercel.app"]'
+# Cloud LLM defaults -- Groq llama-3.1-8b-instant is the cheapest fast option
+# on the free tier. Visitor BYOK overrides this per request.
+ENV SAR_DEFAULT_PROVIDER=groq
+ENV SAR_CLOUD_PROVIDER=groq
+ENV SAR_LLM_MODEL=llama-3.1-8b-instant
+# Embedding stack -- local BGE-M3 via sentence-transformers (CPU). Avoids
+# Ollama entirely.
+ENV SAR_EMBEDDING_BACKEND=local
+ENV SAR_LOCAL_EMBEDDING_MODEL=BAAI/bge-m3
+ENV SAR_EMBEDDING_MODEL=bge-m3
+ENV SAR_EMBEDDING_DIM=1024
+# Cross-encoder reranker -- balances quality with build size. Swap to
+# fine_tuned + SAR_FINETUNED_RERANKER_PATH after phase 3.2 ships the
+# 2.3 GB checkpoint to LeomordKaly/secureagentrag-reranker-v1.
+ENV SAR_RERANKER_TYPE=cross_encoder
+ENV SAR_RERANKER_CHECKPOINT=BAAI/bge-reranker-v2-m3
+# Sparse retrieval -- BM25 keeps the cold path zero-dep; SPLADE adds an
+# extra ~600 MB model and is skipped on free CPU Basic.
+ENV SAR_SPARSE_BACKEND=bm25
+# Runtime state paths -- kept under /tmp, which is writable by the non-root user (ephemeral disk).
+ENV SAR_AUDIT_LOG_DIR=/tmp/secureagentrag/audit_logs
+ENV SAR_CONVERSATION_DIR=/tmp/secureagentrag/conversations
+ENV SAR_CHECKPOINT_DB_PATH=/tmp/secureagentrag/checkpoints.sqlite
+ENV SAR_BM25_INDEX_PATH=/tmp/secureagentrag/bm25_index.pkl
+# Multi-tenant collections route BYOK session -> documents_sess_<sid>.
+ENV SAR_MULTI_TENANT_COLLECTIONS=true
+# Pipeline safety
+ENV SAR_REQUEST_TIMEOUT_S=120
+ENV SAR_FAITHFULNESS_GATE_ENABLED=true
+ENV SAR_FAITHFULNESS_GATE_MODE=flag
+ENV SAR_FAITHFULNESS_THRESHOLD=0.7
+# Logging
+ENV SAR_LOG_LEVEL=INFO
+# HF cache lives under the user's home directory, which is writable by uid 1000
+# and is where the build-time reranker download is stored.
+ENV HF_HOME=/home/user/.cache/huggingface
+ENV TRANSFORMERS_CACHE=/home/user/.cache/huggingface/hub
+EXPOSE 7860
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl --fail --silent --show-error http://localhost:7860/healthz || exit 1
+# uvicorn with 1 worker -- on CPU Basic two workers thrash the memory.
+CMD ["uvicorn", "interfaces.api:app", \
+     "--host", "0.0.0.0", \
+     "--port", "7860", \
+     "--workers", "1", \
+     "--timeout-keep-alive", "30", \
+     "--no-access-log"]

config/settings.py CHANGED Viewed

@@ -1,316 +1,316 @@
-"""Application settings managed via pydantic-settings with environment variable support."""
-from __future__ import annotations
-import contextlib
-import json
-import os
-from pathlib import Path
-from pydantic_settings import BaseSettings, SettingsConfigDict
-class Settings(BaseSettings):
-    """Central configuration for SecureAgentRAG.
-    All settings can be overridden via environment variables prefixed with ``SAR_``.
-    For example, ``SAR_DEBUG=true`` sets ``debug`` to True.
-    """
-    model_config = SettingsConfigDict(
-        env_file=".env",
-        env_prefix="SAR_",
-        env_file_encoding="utf-8",
-        case_sensitive=False,
-        extra="ignore",
-    )
-    # ── Application ──────────────────────────────────────────────────────────────
-    app_name: str = "SecureAgentRAG"
-    debug: bool = False
-    log_level: str = "INFO"
-    # ── Qdrant Vector Store ──────────────────────────────────────────────────────
-    qdrant_url: str = "http://localhost:6333"
-    qdrant_collection: str = "documents"
-    qdrant_api_key: str | None = None
-    # ── Ollama / LLM ─────────────────────────────────────────────────────────────
-    ollama_url: str = "http://localhost:11434"
-    llm_model: str = "qwen3:8b"
-    embedding_model: str = "bge-m3"
-    embedding_dim: int = 1024
-    embedding_backend: str = "ollama"  # "ollama" or "local" (sentence-transformers)
-    local_embedding_model: str = "BAAI/bge-m3"
-    # How long Ollama keeps models resident in VRAM between requests.
-    # On consumer hardware the LLM (qwen3:8b ~5.5GB) and embedding (bge-m3 ~1.2GB)
-    # need to swap if VRAM is tight. Long keep-alive avoids ~5-10s reload per swap.
-    ollama_keep_alive: str = "30m"
-    # ── Chunking ─────────────────────────────────────────────────────────────────
-    chunk_size: int = 1000
-    chunk_overlap: int = 200
-    # ── Retrieval ────────────────────────────────────────────────────────────────
-    top_k: int = 10
-    rerank_top_k: int = 5
-    relevance_threshold: float = 0.7
-    # RAG Fusion: generate N query reformulations, retrieve in parallel,
-    # fuse the ranked lists via RRF. Boosts recall on under-specified
-    # queries. Cost: N-1 extra LLM calls + N parallel Qdrant searches.
-    # Set to 1 to disable.
-    rag_fusion_n_queries: int = 3
-    rag_fusion_enabled: bool = True
-    # ── Reranker ─────────────────────────────────────────────────────────────────
-    # Re-score retrieved documents for higher precision.
-    # Options: "none" (disabled), "cross_encoder" (BGE-Reranker-v2-M3),
-    # "colbert" (ColBERTv2 late-interaction, requires colbert-ai package).
-    # The cross-encoder downloads ~600MB from HuggingFace on first use.
-    # The ColBERT checkpoint is ~400MB. Disabled by default so the first
-    # query does not silently hang on download. Pre-download explicitly.
-    reranker_type: str = "none"
-    reranker_checkpoint: str = "BAAI/bge-reranker-v2-m3"
-    colbert_checkpoint: str = "colbert-ir/colbertv2.0"
-    # Path to a locally fine-tuned cross-encoder checkpoint produced by
-    # scripts/train_reranker.py. Used when reranker_type == "fine_tuned".
-    finetuned_reranker_path: str = "data/checkpoints/reranker-domain-v1"
-    # ── Inference Providers ──────────────────────────────────────────────────────
-    default_provider: str = "ollama"
-    cloud_provider: str | None = None
-    groq_api_key: str | None = None
-    openai_api_key: str | None = None
-    anthropic_api_key: str | None = None
-    groq_api_base: str = "https://api.groq.com/openai/v1"
-    openai_api_base: str = "https://api.openai.com/v1"
-    anthropic_api_base: str = "https://api.anthropic.com/v1"
-    # ── RAG Pipeline Thresholds ───────────────────────────────────────────────────
-    relevance_retry_threshold: float = 0.5
-    confidence_threshold: float = 0.6
-    max_retries: int = 2
-    # ── JSON Citations ────────────────────────────────────────────────────────────
-    # When enabled, the synthesizer requests structured JSON output from the LLM
-    # with `answer` and `citations` fields instead of relying on regex extraction.
-    json_citations_enabled: bool = False
-    # ── Embedding Batch Size ──────────────────────────────────────────────────────
-    embedding_batch_size: int = 32  # Max texts per embedding API call
-    embedding_max_concurrent_batches: int = 4  # Max concurrent batch requests
-    # ── RBAC ─────────────────────────────────────────────────────────────────────
-    enable_rbac: bool = True
-    # ── Observability (Phoenix) ──────────────────────────────────────────────────
-    phoenix_endpoint: str | None = None
-    # ── Sparse Vectors (Qdrant native, replaces rank_bm25 pickle) ────────────────
-    sparse_backend: str = "bm25"  # "bm25" | "splade"
-    sparse_vector_name: str = "sparse"
-    sparse_model: str = "naver/splade-cocondenser-ensembledistil"
-    # ── Audit + Conversation Storage ──────────────────────────────────────────────
-    audit_log_dir: str = "audit_logs"
-    conversation_dir: str = "conversations"
-    checkpoint_db_path: str = "data/checkpoints.sqlite"
-    # Opt-in: enable persistent (SQLite/Postgres) LangGraph checkpointing.
-    # Default off because pytest-asyncio creates per-test event loops which
-    # collide with aiosqlite's loop-bound connection. For production single-
-    # process Streamlit / FastAPI deployments, set SAR_USE_PERSISTENT_CHECKPOINTER=true.
-    use_persistent_checkpointer: bool = False
-    # ── PostgreSQL (for LangGraph checkpointing) ─────────────────────────────────
-    postgres_url: str = "postgresql://sar_user:sar_password@localhost:5433/secureagentrag"
-    # ── Pipeline SLO ─────────────────────────────────────────────────────────────
-    # Hard wall-clock budget for a single RAG pipeline run (rewrite loop +
-    # retrieval + grading + synthesis + evaluation). On timeout the caller
-    # gets a graceful refusal + audit entry; nothing partial is rendered as
-    # if the answer succeeded. 0 disables the deadline.
-    request_timeout_s: float = 60.0
-    # ── Authentication ───────────────────────────────────────────────────────────
-    # When ``jwt_secret`` is set the FastAPI / MCP layers verify HS256-signed
-    # JWTs and derive UserContext from validated claims. When unset, callers
-    # fall back to the dev-mode base64(json(UserContext)) token shape so
-    # existing tests and smoke scripts keep working — but a runtime warning is
-    # emitted on every request. Production deployments MUST set this.
-    #
-    # ``jwt_issuer`` / ``jwt_audience`` are checked against ``iss`` / ``aud``
-    # claims when present. Leave empty to disable that check (default).
-    # ``jwt_ttl_seconds`` is the lifetime of tokens minted via the local
-    # ``/token`` dev endpoint; real IdPs (Keycloak/Auth0) set their own.
-    jwt_secret: str | None = None
-    jwt_issuer: str = "secureagentrag"
-    jwt_audience: str = "secureagentrag-api"
-    jwt_ttl_seconds: int = 3600
-    jwt_algorithm: str = "HS256"
-    # JWKS endpoint for RS256 verification (e.g. Keycloak, Auth0).
-    # When set and jwt_algorithm == "RS256", tokens are verified against
-    # the cached JWKS instead of jwt_secret.
-    jwks_url: str | None = None
-    jwks_cache_ttl_seconds: int = 300
-    # ── Citation Faithfulness Gate (NLI) ─────────────────────────────────────────
-    # After synthesis, run a per-sentence NLI check: for each sentence that
-    # carries an inline `[N]` citation, ask a yes/no entailment question
-    # against the cited chunk's text. Sentences that fail are either marked
-    # `[unsupported]` (soft mode) or dropped from the answer (strict mode).
-    # The check uses the same local LLM as the rest of the graph — no extra
-    # model download. Cost: one LLM call per cited sentence (parallel).
-    faithfulness_gate_enabled: bool = False
-    faithfulness_gate_mode: str = "flag"  # "flag" | "drop"
-    faithfulness_threshold: float = 0.7  # min entailment ratio to consider answer faithful
-    faithfulness_max_concurrent: int = 4  # parallel NLI checks
-    # ── Redis (for distributed rate limiting / caching) ──────────────────────────
-    redis_url: str = "redis://localhost:6379/0"
-    use_redis_rate_limiter: bool = False
-    # ── PII Redaction ────────────────────────────────────────────────────────────
-    # Scrub email, phone, SSN, credit-card, IBAN, IP address before persisting
-    # to audit log / query cache. Defense against accidental PII leakage into
-    # secondary stores. Regex-based by default; if Microsoft Presidio is
-    # installed it is used automatically for higher recall.
-    pii_redaction_enabled: bool = True
-    # ── Prompt-Injection Guardrails ──────────────────────────────────────────────
-    # Run a regex + heuristic check on the user query before retrieval. Blocks
-    # obvious jailbreak / system-prompt-override attempts. Logged via the audit
-    # logger as ``security_block`` events.
-    guardrails_enabled: bool = True
-    # Strict mode: after the fast regex gate, escalate ambiguous or all queries
-    # to a local LLM-based classifier for a second opinion. Adds one LLM call
-    # per query but catches adversarial inputs that evade regex patterns.
-    guardrails_strict: bool = False
-    # Escalation backend used in strict mode. Options:
-    #   "llm"        — legacy SAFE/UNSAFE prompt on the synth-grade model
-    #                  (core.agents.guardrails_llm). Default for backward
-    #                  compatibility.
-    #   "llamaguard" — Meta's LlamaGuard 3 8B via Ollama. Use with
-    #                  ``ollama pull llama-guard3:8b``. More accurate on
-    #                  the standard S1-S14 taxonomy.
-    guardrails_backend: str = "llm"
-    llamaguard_model: str = "llama-guard3:8b"
-    # ── Contextual Retrieval (Anthropic 2024 technique) ──────────────────────────
-    # Prepend a short LLM-generated context summary to each chunk before
-    # embedding. Adds 1 cheap LLM call per chunk at ingestion time but
-    # measurably improves retrieval recall (Anthropic reported ~35-49%
-    # failure reduction). Local Qwen3-8B is fine for the summary.
-    contextual_retrieval_enabled: bool = False
-    # ── VLM OCR (Primary OCR via vision-language model) ───────────────────────────
-    # Use a VLM (Qwen2.5-VL / Qwen3-VL, LLaVA, etc.) via Ollama as the primary OCR path.
-    # Superior to PaddleOCR on complex layouts, tables, and mixed-language
-    # documents. Falls back to PaddleOCR when the VLM is unavailable.
-    vlm_ocr_enabled: bool = False
-    vlm_ocr_model: str = "qwen2.5-vl"
-    # ── Multi-Tenancy ────────────────────────────────────────────────────────────
-    # When true, each organization gets its own Qdrant collection
-    # (documents_{org_id}). This provides stronger isolation than payload-level
-    # RBAC filtering but requires creating collections per org on first use.
-    # When false, all docs share a single collection with RBAC at payload level.
-    multi_tenant_collections: bool = False
-    # ── BYOK demo mode (P6 production launch, see launch-plan/03-backend-byok.md)
-    # In BYOK mode the FastAPI surface accepts per-request LLM keys from visitor
-    # headers, scopes Qdrant writes to per-session collections, and disables
-    # Phoenix instrumentation. Off in dev/staging, on in the Hugging Face Space
-    # production image (SAR_BYOK_MODE=true via Space secrets).
-    byok_mode: bool = False
-    # When BYOK is on and a visitor did NOT bring their own LLM key, the owner
-    # key in .env is used but throttled to this many requests per IP per hour.
-    # The cap is intentionally tight so the Groq free-tier 30 RPM / 14400 RPD
-    # is never exhausted by a single visitor.
-    byok_owner_key_quota_per_hour: int = 3
-    # Per-session Qdrant collections (documents_sess_<session_id>) are auto
-    # purged after this many hours by retrieval/session_purge.py.
-    session_collection_ttl_hours: int = 24
-    # CORS allowlist consulted by the FastAPI middleware when byok_mode=true.
-    # Empty list = no CORS middleware mounted (dev default).
-    cors_allow_origins: list[str] = []
-    # ── Multi-Modal RAG ──────────────────────────────────────────────────────────
-    # When ingesting images, also generate a rich text description using a VLM.
-    # The description is embedded as a separate chunk, enabling retrieval for
-    # queries like "what does the diagram show?" without requiring CLIP or
-    # other multi-modal embedding models.
-    multimodal_descriptions_enabled: bool = False
-    # ── Self-Query Retrieval ─────────────────────────────────────────────────────
-    # Extract structured metadata filters (source_file, date_range,
-    # sensitivity_level, roles) from the natural language query using a small
-    # local LLM prompt. The filters are merged with the RBAC filter and passed
-    # to Qdrant, scoping retrieval before embedding search runs.
-    self_query_enabled: bool = False
-    # ── HyDE (Hypothetical Document Embeddings) ──────────────────────────────────
-    # Generate a hypothetical answer to the query, embed *that* instead of the
-    # raw query. Boosts recall when query vocabulary differs from doc
-    # vocabulary (questions vs declarative sentences). Adds one LLM call per
-    # query — skip for simple keyword lookups; enable for complex questions.
-    hyde_enabled: bool = False
-    # ── Pricing for cost dashboard (USD per 1M tokens) ───────────────────────────
-    # Used by evaluation/cost.py to convert recorded usage into $/query.
-    price_groq_input_per_1m: float = 0.59
-    price_groq_output_per_1m: float = 0.79
-    price_openai_input_per_1m: float = 2.50
-    price_openai_output_per_1m: float = 10.00
-    price_anthropic_input_per_1m: float = 3.00
-    price_anthropic_output_per_1m: float = 15.00
-    # Local inference: estimated electricity cost only (consumer hardware).
-    # 200W GPU @ $0.15/kWh ≈ $0.03/hour ≈ $0.000008/sec
-    price_local_per_second: float = 0.000008
-def _apply_calibration(settings_obj: Settings) -> None:
-    """Override threshold defaults from ``evaluation/calibration.json`` when present.
-    The calibration script (``scripts/calibrate_thresholds.py``) writes the
-    chosen confidence + faithfulness cutoffs against a labelled gold set. Loading
-    them here means deployments inherit the latest tuned values automatically,
-    while an explicit ``SAR_CONFIDENCE_THRESHOLD`` / ``SAR_FAITHFULNESS_THRESHOLD``
-    env var still wins so operators can override per environment.
-    Silently no-ops when the file is missing, malformed, or the relevant keys
-    are absent — never blocks startup.
-    """
-    calib_path = Path(__file__).resolve().parent.parent / "evaluation" / "calibration.json"
-    if not calib_path.exists():
-        return
-    try:
-        data = json.loads(calib_path.read_text(encoding="utf-8"))
-    except (OSError, json.JSONDecodeError):
-        return
-    # Reject degenerate sweeps (no negatives or no positives -> the chosen
-    # threshold has no statistical meaning). Keeping the original default in
-    # that case is safer than letting a 0.0 cut-off escape into production.
-    def _sane(block: dict) -> bool:
-        try:
-            return (
-                int(block.get("n_pos", 0)) > 0
-                and int(block.get("n_neg", 0)) > 0
-                and float(block.get("chosen_threshold", 0.0)) > 0.0
-            )
-        except (TypeError, ValueError):
-            return False
-    conf_block = data.get("confidence", {})
-    if _sane(conf_block) and os.environ.get("SAR_CONFIDENCE_THRESHOLD") is None:
-        with contextlib.suppress(TypeError, ValueError):
-            settings_obj.confidence_threshold = float(conf_block["chosen_threshold"])
-    faith_block = data.get("faithfulness", {})
-    if _sane(faith_block) and os.environ.get("SAR_FAITHFULNESS_THRESHOLD") is None:
-        with contextlib.suppress(TypeError, ValueError):
-            settings_obj.faithfulness_threshold = float(faith_block["chosen_threshold"])
-# Singleton instance — import this throughout the application
-settings = Settings()
-_apply_calibration(settings)

+"""Application settings managed via pydantic-settings with environment variable support."""
+from __future__ import annotations
+import contextlib
+import json
+import os
+from pathlib import Path
+from pydantic_settings import BaseSettings, SettingsConfigDict
+class Settings(BaseSettings):
+    """Central configuration for SecureAgentRAG.
+    All settings can be overridden via environment variables prefixed with ``SAR_``.
+    For example, ``SAR_DEBUG=true`` sets ``debug`` to True.
+    """
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_prefix="SAR_",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore",
+    )
+    # ── Application ──────────────────────────────────────────────────────────────
+    app_name: str = "SecureAgentRAG"
+    debug: bool = False
+    log_level: str = "INFO"
+    # ── Qdrant Vector Store ───────────────────────────────────────────────────────
+    qdrant_url: str = "http://localhost:6333"
+    qdrant_collection: str = "documents"
+    qdrant_api_key: str | None = None
+    # ── Ollama / LLM ─────────────────────────────────────────────────────────────
+    ollama_url: str = "http://localhost:11434"
+    llm_model: str = "qwen3:8b"
+    embedding_model: str = "bge-m3"
+    embedding_dim: int = 1024
+    embedding_backend: str = "ollama"  # "ollama" or "local" (sentence-transformers)
+    local_embedding_model: str = "BAAI/bge-m3"
+    # How long Ollama keeps models resident in VRAM between requests.
+    # On consumer hardware the LLM (qwen3:8b ~5.5GB) and embedding (bge-m3 ~1.2GB)
+    # need to swap if VRAM is tight. Long keep-alive avoids ~5-10s reload per swap.
+    ollama_keep_alive: str = "30m"
+    # ── Chunking ─────────────────────────────────────────────────────────────────
+    chunk_size: int = 1000
+    chunk_overlap: int = 200
+    # ── Retrieval ────────────────────────────────────────────────────────────────
+    top_k: int = 10
+    rerank_top_k: int = 5
+    relevance_threshold: float = 0.7
+    # RAG Fusion: generate N query reformulations, retrieve in parallel,
+    # fuse the ranked lists via RRF. Boosts recall on under-specified
+    # queries. Cost: N-1 extra LLM calls + N parallel Qdrant searches.
+    # Set to 1 to disable.
+    rag_fusion_n_queries: int = 3
+    rag_fusion_enabled: bool = True
+    # ── Reranker ─────────────────────────────────────────────────────────────────
+    # Re-score retrieved documents for higher precision.
+    # Options: "none" (disabled), "cross_encoder" (BGE-Reranker-v2-M3),
+    # "colbert" (ColBERTv2 late-interaction, requires colbert-ai package).
+    # The cross-encoder downloads ~600MB from HuggingFace on first use.
+    # The ColBERT checkpoint is ~400MB. Disabled by default so the first
+    # query does not silently hang on download. Pre-download explicitly.
+    reranker_type: str = "none"
+    reranker_checkpoint: str = "BAAI/bge-reranker-v2-m3"
+    colbert_checkpoint: str = "colbert-ir/colbertv2.0"
+    # Path to a locally fine-tuned cross-encoder checkpoint produced by
+    # scripts/train_reranker.py. Used when reranker_type == "fine_tuned".
+    finetuned_reranker_path: str = "data/checkpoints/reranker-domain-v1"
+    # ── Inference Providers ──────────────────────────────────────────────────────
+    default_provider: str = "ollama"
+    cloud_provider: str | None = None
+    groq_api_key: str | None = None
+    openai_api_key: str | None = None
+    anthropic_api_key: str | None = None
+    groq_api_base: str = "https://api.groq.com/openai/v1"
+    openai_api_base: str = "https://api.openai.com/v1"
+    anthropic_api_base: str = "https://api.anthropic.com/v1"
+    # ── RAG Pipeline Thresholds ───────────────────────────────────────────────────
+    relevance_retry_threshold: float = 0.5
+    confidence_threshold: float = 0.6
+    max_retries: int = 2
+    # ── JSON Citations ────────────────────────────────────────────────────────────
+    # When enabled, the synthesizer requests structured JSON output from the LLM
+    # with `answer` and `citations` fields instead of relying on regex extraction.
+    json_citations_enabled: bool = False
+    # ── Embedding Batch Size ──────────────────────────────────────────────────────
+    embedding_batch_size: int = 32  # Max texts per embedding API call
+    embedding_max_concurrent_batches: int = 4  # Max concurrent batch requests
+    # ── RBAC ─────────────────────────────────────────────────────────────────────
+    enable_rbac: bool = True
+    # ── Observability (Phoenix) ──────────────────────────────────────────────────
+    phoenix_endpoint: str | None = None
+    # ── Sparse Vectors (Qdrant native, replaces rank_bm25 pickle) ��───────────────
+    sparse_backend: str = "bm25"  # "bm25" | "splade"
+    sparse_vector_name: str = "sparse"
+    sparse_model: str = "naver/splade-cocondenser-ensembledistil"
+    # ── Audit + Conversation Storage ──────────────────────────────────────────────
+    audit_log_dir: str = "audit_logs"
+    conversation_dir: str = "conversations"
+    checkpoint_db_path: str = "data/checkpoints.sqlite"
+    # Opt-in: enable persistent (SQLite/Postgres) LangGraph checkpointing.
+    # Default off because pytest-asyncio creates per-test event loops which
+    # collide with aiosqlite's loop-bound connection. For production single-
+    # process Streamlit / FastAPI deployments, set SAR_USE_PERSISTENT_CHECKPOINTER=true.
+    use_persistent_checkpointer: bool = False
+    # ── PostgreSQL (for LangGraph checkpointing) ─────────────────────────────────
+    postgres_url: str = "postgresql://sar_user:sar_password@localhost:5433/secureagentrag"
+    # ── Pipeline SLO ─────────────────────────────────────────────────────────────
+    # Hard wall-clock budget for a single RAG pipeline run (rewrite loop +
+    # retrieval + grading + synthesis + evaluation). On timeout the caller
+    # gets a graceful refusal + audit entry; nothing partial is rendered as
+    # if the answer succeeded. 0 disables the deadline.
+    request_timeout_s: float = 60.0
+    # ── Authentication ───────────────────────────────────────────────────────────
+    # When ``jwt_secret`` is set the FastAPI / MCP layers verify HS256-signed
+    # JWTs and derive UserContext from validated claims. When unset, callers
+    # fall back to the dev-mode base64(json(UserContext)) token shape so
+    # existing tests and smoke scripts keep working — but a runtime warning is
+    # emitted on every request. Production deployments MUST set this.
+    #
+    # ``jwt_issuer`` / ``jwt_audience`` are checked against ``iss`` / ``aud``
+    # claims when present. Leave empty to disable that check (default).
+    # ``jwt_ttl_seconds`` is the lifetime of tokens minted via the local
+    # ``/token`` dev endpoint; real IdPs (Keycloak/Auth0) set their own.
+    jwt_secret: str | None = None
+    jwt_issuer: str = "secureagentrag"
+    jwt_audience: str = "secureagentrag-api"
+    jwt_ttl_seconds: int = 3600
+    jwt_algorithm: str = "HS256"
+    # JWKS endpoint for RS256 verification (e.g. Keycloak, Auth0).
+    # When set and jwt_algorithm == "RS256", tokens are verified against
+    # the cached JWKS instead of jwt_secret.
+    jwks_url: str | None = None
+    jwks_cache_ttl_seconds: int = 300
+    # ── Citation Faithfulness Gate (NLI) ─────────────────────────────────────────
+    # After synthesis, run a per-sentence NLI check: for each sentence that
+    # carries an inline `[N]` citation, ask a yes/no entailment question
+    # against the cited chunk's text. Sentences that fail are either marked
+    # `[unsupported]` (soft mode) or dropped from the answer (strict mode).
+    # The check uses the same local LLM as the rest of the graph — no extra
+    # model download. Cost: one LLM call per cited sentence (parallel).
+    faithfulness_gate_enabled: bool = False
+    faithfulness_gate_mode: str = "flag"  # "flag" | "drop"
+    faithfulness_threshold: float = 0.7  # min entailment ratio to consider answer faithful
+    faithfulness_max_concurrent: int = 4  # parallel NLI checks
+    # ── Redis (for distributed rate limiting / caching) ──────────────────────────
+    redis_url: str = "redis://localhost:6379/0"
+    use_redis_rate_limiter: bool = False
+    # ── PII Redaction ────────────────────────────────────────────────────────────
+    # Scrub email, phone, SSN, credit-card, IBAN, IP address before persisting
+    # to audit log / query cache. Defense against accidental PII leakage into
+    # secondary stores. Regex-based by default; if Microsoft Presidio is
+    # installed it is used automatically for higher recall.
+    pii_redaction_enabled: bool = True
+    # ── Prompt-Injection Guardrails ──────────────────────────────────────────────
+    # Run a regex + heuristic check on the user query before retrieval. Blocks
+    # obvious jailbreak / system-prompt-override attempts. Logged via the audit
+    # logger as ``security_block`` events.
+    guardrails_enabled: bool = True
+    # Strict mode: after the fast regex gate, escalate ambiguous or all queries
+    # to a local LLM-based classifier for a second opinion. Adds one LLM call
+    # per query but catches adversarial inputs that evade regex patterns.
+    guardrails_strict: bool = False
+    # Escalation backend used in strict mode. Options:
+    #   "llm"        — legacy SAFE/UNSAFE prompt on the synth-grade model
+    #                  (core.agents.guardrails_llm). Default for backward
+    #                  compatibility.
+    #   "llamaguard" — Meta's LlamaGuard 3 8B via Ollama. Use with
+    #                  ``ollama pull llama-guard3:8b``. More accurate on
+    #                  the standard S1-S14 taxonomy.
+    guardrails_backend: str = "llm"
+    llamaguard_model: str = "llama-guard3:8b"
+    # ── Contextual Retrieval (Anthropic 2024 technique) ──────────────────────────
+    # Prepend a short LLM-generated context summary to each chunk before
+    # embedding. Adds 1 cheap LLM call per chunk at ingestion time but
+    # measurably improves retrieval recall (Anthropic reported ~35-49%
+    # failure reduction). Local Qwen3-8B is fine for the summary.
+    contextual_retrieval_enabled: bool = False
+    # ── VLM OCR (Primary OCR via vision-language model) ───────────────────────────
+    # Use a VLM (Qwen2.5-VL / Qwen3-VL, LLaVA, etc.) via Ollama as the primary OCR path.
+    # Superior to PaddleOCR on complex layouts, tables, and mixed-language
+    # documents. Falls back to PaddleOCR when the VLM is unavailable.
+    vlm_ocr_enabled: bool = False
+    vlm_ocr_model: str = "qwen2.5-vl"
+    # ── Multi-Tenancy ────────────────────────────────────────────────────────────
+    # When true, each organization gets its own Qdrant collection
+    # (documents_{org_id}). This provides stronger isolation than payload-level
+    # RBAC filtering but requires creating collections per org on first use.
+    # When false, all docs share a single collection with RBAC at payload level.
+    multi_tenant_collections: bool = False
+    # ── BYOK demo mode (P6 production launch, see launch-plan/03-backend-byok.md)
+    # In BYOK mode the FastAPI surface accepts per-request LLM keys from visitor
+    # headers, scopes Qdrant writes to per-session collections, and disables
+    # Phoenix instrumentation. Off in dev/staging, on in the Hugging Face Space
+    # production image (SAR_BYOK_MODE=true via Space secrets).
+    byok_mode: bool = False
+    # When BYOK is on and a visitor did NOT bring their own LLM key, the owner
+    # key in .env is used but throttled to this many requests per IP per hour.
+    # The cap is intentionally tight so the Groq free-tier 30 RPM / 14400 RPD
+    # is never exhausted by a single visitor.
+    byok_owner_key_quota_per_hour: int = 3
+    # Per-session Qdrant collections (documents_sess_<session_id>) are auto
+    # purged after this many hours by retrieval/session_purge.py.
+    session_collection_ttl_hours: int = 24
+    # CORS allowlist consulted by the FastAPI middleware when byok_mode=true.
+    # Empty list = no CORS middleware mounted (dev default).
+    cors_allow_origins: list[str] = []
+    # ── Multi-Modal RAG ──────────────────────────────────────────────────────────
+    # When ingesting images, also generate a rich text description using a VLM.
+    # The description is embedded as a separate chunk, enabling retrieval for
+    # queries like "what does the diagram show?" without requiring CLIP or
+    # other multi-modal embedding models.
+    multimodal_descriptions_enabled: bool = False
+    # ── Self-Query Retrieval ─────────────────────────────────────────────────────
+    # Extract structured metadata filters (source_file, date_range,
+    # sensitivity_level, roles) from the natural language query using a small
+    # local LLM prompt. The filters are merged with the RBAC filter and passed
+    # to Qdrant, scoping retrieval before embedding search runs.
+    self_query_enabled: bool = False
+    # ── HyDE (Hypothetical Document Embeddings) ──────────────────────────────────
+    # Generate a hypothetical answer to the query, embed *that* instead of the
+    # raw query. Boosts recall when query vocabulary differs from doc
+    # vocabulary (questions vs declarative sentences). Adds one LLM call per
+    # query — skip for simple keyword lookups; enable for complex questions.
+    hyde_enabled: bool = False
+    # ── Pricing for cost dashboard (USD per 1M tokens) ───────────────────────────
+    # Used by evaluation/cost.py to convert recorded usage into $/query.
+    price_groq_input_per_1m: float = 0.59
+    price_groq_output_per_1m: float = 0.79
+    price_openai_input_per_1m: float = 2.50
+    price_openai_output_per_1m: float = 10.00
+    price_anthropic_input_per_1m: float = 3.00
+    price_anthropic_output_per_1m: float = 15.00
+    # Local inference: estimated electricity cost only (consumer hardware).
+    # 200W GPU @ $0.15/kWh ≈ $0.03/hour ≈ $0.000008/sec
+    price_local_per_second: float = 0.000008
+def _apply_calibration(settings_obj: Settings) -> None:
+    """Override threshold defaults from ``evaluation/calibration.json`` when present.
+    The calibration script (``scripts/calibrate_thresholds.py``) writes the
+    chosen confidence + faithfulness cutoffs against a labelled gold set. Loading
+    them here means deployments inherit the latest tuned values automatically,
+    while an explicit ``SAR_CONFIDENCE_THRESHOLD`` / ``SAR_FAITHFULNESS_THRESHOLD``
+    env var still wins so operators can override per environment.
+    Silently no-ops when the file is missing, unreadable, not valid UTF-8 JSON,
+    not a JSON object, or when a block is absent / not an object / degenerate —
+    never blocks startup.
+    Args:
+        settings_obj: Settings instance whose ``confidence_threshold`` and
+            ``faithfulness_threshold`` attributes are overwritten in place.
+    NOTE(review): the override check looks only at ``os.environ``; a threshold
+    supplied solely through the ``.env`` file would presumably still be replaced
+    by the calibrated value — confirm whether that is intended.
+    """
+    calib_path = Path(__file__).resolve().parent.parent / "evaluation" / "calibration.json"
+    try:
+        # EAFP: a missing file raises FileNotFoundError (an OSError). ValueError
+        # covers both json.JSONDecodeError and UnicodeDecodeError.
+        data = json.loads(calib_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        return
+    if not isinstance(data, dict):
+        # Valid JSON that is not an object (e.g. ``[]``) carries no blocks.
+        return
+    def _calibrated(key: str) -> float | None:
+        """Return the usable threshold stored under ``key``, or None."""
+        block = data.get(key)
+        if not isinstance(block, dict):
+            return None
+        # Reject degenerate sweeps (no negatives or no positives -> the chosen
+        # threshold has no statistical meaning). Keeping the original default in
+        # that case is safer than letting a 0.0 cut-off escape into production.
+        try:
+            if int(block.get("n_pos", 0)) <= 0 or int(block.get("n_neg", 0)) <= 0:
+                return None
+            threshold = float(block.get("chosen_threshold", 0.0))
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError: int(float("inf")) when the JSON holds ``Infinity``.
+            return None
+        # NaN compares False here and is rejected along with non-positive values.
+        return threshold if threshold > 0.0 else None
+    overrides = (
+        ("confidence", "confidence_threshold", "SAR_CONFIDENCE_THRESHOLD"),
+        ("faithfulness", "faithfulness_threshold", "SAR_FAITHFULNESS_THRESHOLD"),
+    )
+    for key, attr, env_var in overrides:
+        value = _calibrated(key)
+        # An explicitly exported env var always wins over the calibrated value.
+        if value is None or os.environ.get(env_var) is not None:
+            continue
+        with contextlib.suppress(TypeError, ValueError):
+            setattr(settings_obj, attr, value)
+# Singleton instance — import this throughout the application.
+# It is built once at import time, then passed through _apply_calibration so
+# that calibrated thresholds (when available) replace the class defaults.
+settings = Settings()
+_apply_calibration(settings)

inference/cloud_clients.py CHANGED Viewed

@@ -1,577 +1,577 @@
-"""Cloud LLM provider clients (Groq, OpenAI, Anthropic Claude)."""
-from __future__ import annotations
-import json
-import time
-from abc import ABC, abstractmethod
-from enum import StrEnum
-from typing import TYPE_CHECKING, Any
-if TYPE_CHECKING:
-    from collections.abc import AsyncGenerator
-import httpx
-from tenacity import (
-    retry,
-    retry_if_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-)
-from config.settings import settings
-from inference.llm_factory import LLMResponse
-from utils.logging import get_logger
-logger = get_logger(__name__)
-# Retry decorator for transient connection failures only
-_retry_on_connection = retry(
-    retry=retry_if_exception_type((httpx.ConnectError, httpx.TimeoutException)),
-    stop=stop_after_attempt(3),
-    wait=wait_exponential(multiplier=1, min=1, max=10),
-    reraise=True,
-)
-class LLMProvider(StrEnum):
-    """Supported LLM provider identifiers."""
-    OLLAMA = "ollama"
-    GROQ = "groq"
-    OPENAI = "openai"
-    ANTHROPIC = "anthropic"
-class BaseCloudClient(ABC):
-    """Abstract base class for cloud LLM provider clients.
-    Args:
-        api_key: Provider API key for authentication.
-        model: Default model identifier.
-        timeout: Request timeout in seconds.
-    """
-    def __init__(self, api_key: str, model: str, timeout: float = 60.0) -> None:
-        self.api_key = api_key
-        self.model = model
-        self.timeout = timeout
-        self._client = httpx.AsyncClient(timeout=httpx.Timeout(timeout))
-    @abstractmethod
-    async def generate(
-        self,
-        prompt: str,
-        system_prompt: str = "",
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-        json_mode: bool = False,
-    ) -> LLMResponse:
-        """Generate a completion from the provider.
-        Args:
-            prompt: The user prompt text.
-            system_prompt: Optional system context.
-            temperature: Sampling temperature.
-            max_tokens: Maximum tokens to generate.
-            json_mode: When True, request JSON-formatted output.
-        Returns:
-            LLMResponse with generated text and metadata.
-        """
-    @abstractmethod
-    async def chat(
-        self,
-        messages: list[dict],
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-    ) -> LLMResponse:
-        """Send a chat conversation to the provider.
-        Args:
-            messages: List of message dicts with 'role' and 'content' keys.
-            temperature: Sampling temperature.
-            max_tokens: Maximum tokens to generate.
-        Returns:
-            LLMResponse with generated text and metadata.
-        """
-    @abstractmethod
-    async def generate_stream(
-        self,
-        prompt: str,
-        system_prompt: str = "",
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-    ) -> AsyncGenerator[str, None]:
-        """Stream a completion from the provider, yielding tokens as they arrive.
-        Args:
-            prompt: The user prompt text.
-            system_prompt: Optional system context.
-            temperature: Sampling temperature.
-            max_tokens: Maximum tokens to generate.
-        Yields:
-            Token strings as they are generated.
-        """
-    @abstractmethod
-    async def health_check(self) -> bool:
-        """Check if the provider API is reachable.
-        Returns:
-            True if the API responds successfully.
-        """
-    async def close(self) -> None:
-        """Close the underlying HTTP client."""
-        await self._client.aclose()
-    async def __aenter__(self) -> BaseCloudClient:
-        """Enter async context manager."""
-        return self
-    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
-        """Exit async context manager, closing the client."""
-        await self.close()
-def make_byok_cloud_client(
-    *,
-    provider: str,
-    user_key: str,
-    model: str | None = None,
-    timeout: float = 60.0,
-) -> BaseCloudClient:
-    """Build a per-request cloud LLM client that uses the visitor's API key.
-    Each call returns a **fresh client instance** holding the supplied key
-    in its own ``self.api_key`` slot. The visitor's key never lands on any
-    module-level singleton, never mixes into the owner-key client, and is
-    discarded when the FastAPI request scope ends.
-    Args:
-        provider: One of ``"groq"`` / ``"openai"`` / ``"anthropic"``.
-        user_key: The visitor-supplied API key from ``X-User-LLM-Key``.
-        model: Override the provider's default model.
-        timeout: Per-request HTTP timeout in seconds.
-    Returns:
-        A new ``BaseCloudClient`` subclass instance bound to the visitor key.
-    Raises:
-        ValueError: ``provider`` is not in the BYOK allowlist or ``user_key``
-            is missing.
-    """
-    if not user_key or not user_key.strip():
-        raise ValueError("make_byok_cloud_client called without a user key")
-    prov = (provider or "").lower()
-    if prov == "groq":
-        return GroqClient(
-            api_key=user_key.strip(), model=model or "llama-3.1-8b-instant", timeout=timeout
-        )
-    if prov == "openai":
-        return OpenAIClient(api_key=user_key.strip(), model=model or "gpt-4o-mini", timeout=timeout)
-    if prov == "anthropic":
-        return AnthropicClient(
-            api_key=user_key.strip(),
-            model=model or "claude-sonnet-4-20250514",
-            timeout=timeout,
-        )
-    raise ValueError(f"BYOK provider not supported: {provider!r}")
-class OpenAICompatibleClient(BaseCloudClient):
-    """Shared client for OpenAI Chat Completions-compatible APIs.
-    Both Groq and OpenAI implement the same wire format
-    (``POST /chat/completions`` + SSE streaming). Subclasses supply only
-    the ``api_base`` URL and the ``provider`` tag — every method on
-    ``BaseCloudClient`` is implemented once, here, and inherited.
-    """
-    #: Subclasses override these two class attrs.
-    api_base: str = ""
-    provider_name: str = ""
-    def _headers(self) -> dict[str, str]:
-        return {
-            "Authorization": f"Bearer {self.api_key}",
-            "Content-Type": "application/json",
-        }
-    @staticmethod
-    def _messages(prompt: str, system_prompt: str) -> list[dict[str, str]]:
-        out: list[dict[str, str]] = []
-        if system_prompt:
-            out.append({"role": "system", "content": system_prompt})
-        out.append({"role": "user", "content": prompt})
-        return out
-    @_retry_on_connection
-    async def generate(
-        self,
-        prompt: str,
-        system_prompt: str = "",
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-        json_mode: bool = False,
-    ) -> LLMResponse:
-        return await self.chat(
-            messages=self._messages(prompt, system_prompt),
-            temperature=temperature,
-            max_tokens=max_tokens,
-            json_mode=json_mode,
-        )
-    @_retry_on_connection
-    async def chat(
-        self,
-        messages: list[dict],
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-        json_mode: bool = False,
-    ) -> LLMResponse:
-        payload: dict[str, Any] = {
-            "model": self.model,
-            "messages": messages,
-            "temperature": temperature,
-            "max_tokens": max_tokens,
-        }
-        if json_mode:
-            payload["response_format"] = {"type": "json_object"}
-        start = time.perf_counter()
-        response = await self._client.post(
-            f"{self.api_base}/chat/completions",
-            headers=self._headers(),
-            json=payload,
-        )
-        elapsed_ms = (time.perf_counter() - start) * 1000
-        response.raise_for_status()
-        data = response.json()
-        choice = data.get("choices", [{}])[0]
-        message = choice.get("message", {})
-        usage = data.get("usage", {})
-        return LLMResponse(
-            text=message.get("content", ""),
-            model=data.get("model", self.model),
-            provider=self.provider_name,
-            usage={
-                "prompt_tokens": usage.get("prompt_tokens", 0),
-                "completion_tokens": usage.get("completion_tokens", 0),
-                "total_tokens": usage.get("total_tokens", 0),
-            },
-            latency_ms=elapsed_ms,
-        )
-    @_retry_on_connection
-    async def generate_stream(
-        self,
-        prompt: str,
-        system_prompt: str = "",
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-    ) -> AsyncGenerator[str, None]:
-        payload: dict[str, Any] = {
-            "model": self.model,
-            "messages": self._messages(prompt, system_prompt),
-            "temperature": temperature,
-            "max_tokens": max_tokens,
-            "stream": True,
-        }
-        async with self._client.stream(
-            "POST",
-            f"{self.api_base}/chat/completions",
-            headers={**self._headers(), "Accept": "text/event-stream"},
-            json=payload,
-        ) as resp:
-            resp.raise_for_status()
-            async for line in resp.aiter_lines():
-                line = line.strip()
-                if not line.startswith("data: "):
-                    continue
-                data_str = line[6:]
-                if data_str == "[DONE]":
-                    break
-                try:
-                    data = json.loads(data_str)
-                except json.JSONDecodeError:
-                    continue
-                choice = data.get("choices", [{}])[0]
-                token = choice.get("delta", {}).get("content", "")
-                if token:
-                    yield token
-    @_retry_on_connection
-    async def health_check(self) -> bool:
-        try:
-            response = await self._client.get(f"{self.api_base}/models", headers=self._headers())
-            return response.status_code in (200, 401)
-        except (httpx.ConnectError, httpx.TimeoutException):
-            return False
-class GroqClient(OpenAICompatibleClient):
-    """Groq cloud LLM client (OpenAI-compatible API at api.groq.com)."""
-    provider_name = "groq"
-    def __init__(
-        self,
-        api_key: str,
-        model: str = "llama-3.3-70b-versatile",
-        timeout: float = 60.0,
-    ) -> None:
-        super().__init__(api_key=api_key, model=model, timeout=timeout)
-        self.api_base = settings.groq_api_base
-class OpenAIClient(OpenAICompatibleClient):
-    """OpenAI cloud LLM client (Chat Completions API at api.openai.com)."""
-    provider_name = "openai"
-    def __init__(
-        self,
-        api_key: str,
-        model: str = "gpt-4o-mini",
-        timeout: float = 60.0,
-    ) -> None:
-        super().__init__(api_key=api_key, model=model, timeout=timeout)
-        self.api_base = settings.openai_api_base
-class AnthropicClient(BaseCloudClient):
-    """Anthropic Claude cloud LLM client using the Messages API.
-    Args:
-        api_key: Anthropic API key.
-        model: Model identifier. Defaults to "claude-sonnet-4-20250514".
-        timeout: Request timeout in seconds.
-    """
-    def __init__(
-        self,
-        api_key: str,
-        model: str = "claude-sonnet-4-20250514",
-        timeout: float = 60.0,
-    ) -> None:
-        super().__init__(api_key=api_key, model=model, timeout=timeout)
-        self._api_base = settings.anthropic_api_base
-    def _headers(self) -> dict[str, str]:
-        """Build request headers with Anthropic-specific authentication."""
-        return {
-            "x-api-key": self.api_key,
-            "anthropic-version": "2023-06-01",
-            "Content-Type": "application/json",
-        }
-    @_retry_on_connection
-    async def generate(
-        self,
-        prompt: str,
-        system_prompt: str = "",
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-        json_mode: bool = False,
-    ) -> LLMResponse:
-        """Generate a completion via Anthropic's Messages API.
-        Args:
-            prompt: The user prompt text.
-            system_prompt: Optional system context.
-            temperature: Sampling temperature.
-            max_tokens: Maximum tokens to generate.
-            json_mode: Anthropic does not support native JSON mode; ignored.
-        Returns:
-            LLMResponse with generated text and metadata.
-        """
-        messages: list[dict[str, str]] = [{"role": "user", "content": prompt}]
-        return await self._send_messages(
-            messages=messages,
-            system_prompt=system_prompt,
-            temperature=temperature,
-            max_tokens=max_tokens,
-        )
-    @_retry_on_connection
-    async def chat(
-        self,
-        messages: list[dict],
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-    ) -> LLMResponse:
-        """Send a chat request to Anthropic's Messages API.
-        Anthropic uses a separate 'system' parameter instead of a system message
-        in the messages list. This method extracts any system message and handles
-        the format conversion.
-        Args:
-            messages: List of message dicts with 'role' and 'content' keys.
-            temperature: Sampling temperature.
-            max_tokens: Maximum tokens to generate.
-        Returns:
-            LLMResponse with generated text and metadata.
-        """
-        # Extract system message if present
-        system_prompt = ""
-        anthropic_messages: list[dict[str, str]] = []
-        for msg in messages:
-            if msg.get("role") == "system":
-                system_prompt = msg.get("content", "")
-            else:
-                anthropic_messages.append(msg)
-        return await self._send_messages(
-            messages=anthropic_messages,
-            system_prompt=system_prompt,
-            temperature=temperature,
-            max_tokens=max_tokens,
-        )
-    async def _send_messages(
-        self,
-        messages: list[dict],
-        system_prompt: str = "",
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-    ) -> LLMResponse:
-        """Internal method to send messages to Anthropic's API.
-        Args:
-            messages: Anthropic-formatted messages (no system role).
-            system_prompt: System prompt passed as top-level parameter.
-            temperature: Sampling temperature.
-            max_tokens: Maximum tokens to generate.
-        Returns:
-            LLMResponse with generated text and metadata.
-        """
-        payload: dict[str, Any] = {
-            "model": self.model,
-            "messages": messages,
-            "temperature": temperature,
-            "max_tokens": max_tokens,
-        }
-        if system_prompt:
-            payload["system"] = system_prompt
-        start = time.perf_counter()
-        response = await self._client.post(
-            f"{self._api_base}/messages",
-            headers=self._headers(),
-            json=payload,
-        )
-        elapsed_ms = (time.perf_counter() - start) * 1000
-        response.raise_for_status()
-        data = response.json()
-        # Anthropic returns content as a list of content blocks
-        content_blocks = data.get("content", [])
-        text = ""
-        for block in content_blocks:
-            if block.get("type") == "text":
-                text += block.get("text", "")
-        usage = data.get("usage", {})
-        return LLMResponse(
-            text=text,
-            model=data.get("model", self.model),
-            provider="anthropic",
-            usage={
-                "prompt_tokens": usage.get("input_tokens", 0),
-                "completion_tokens": usage.get("output_tokens", 0),
-                "total_tokens": (usage.get("input_tokens", 0) + usage.get("output_tokens", 0)),
-            },
-            latency_ms=elapsed_ms,
-        )
-    async def generate_stream(
-        self,
-        prompt: str,
-        system_prompt: str = "",
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-    ) -> AsyncGenerator[str, None]:
-        """Stream a completion via Anthropic's Messages API.
-        Anthropic supports streaming via SSE. Yields text content blocks
-        as they arrive.
-        Args:
-            prompt: The user prompt text.
-            system_prompt: Optional system context.
-            temperature: Sampling temperature.
-            max_tokens: Maximum tokens to generate.
-        Yields:
-            Token strings as they are generated.
-        """
-        payload: dict[str, Any] = {
-            "model": self.model,
-            "messages": [{"role": "user", "content": prompt}],
-            "temperature": temperature,
-            "max_tokens": max_tokens,
-            "stream": True,
-        }
-        if system_prompt:
-            payload["system"] = system_prompt
-        async with self._client.stream(
-            "POST",
-            f"{self._api_base}/messages",
-            headers={**self._headers(), "Accept": "text/event-stream"},
-            json=payload,
-        ) as resp:
-            resp.raise_for_status()
-            async for line in resp.aiter_lines():
-                line = line.strip()
-                if line.startswith("data: "):
-                    data_str = line[6:]
-                    if data_str == "[DONE]":
-                        break
-                    try:
-                        data = json.loads(data_str)
-                        event_type = data.get("type", "")
-                        if event_type == "content_block_delta":
-                            delta = data.get("delta", {})
-                            token = delta.get("text", "")
-                            if token:
-                                yield token
-                        elif event_type == "message_stop":
-                            break
-                    except json.JSONDecodeError:
-                        continue
-    @_retry_on_connection
-    async def health_check(self) -> bool:
-        """Check if the Anthropic API is reachable.
-        Returns:
-            True if the API responds.
-        """
-        try:
-            # Anthropic doesn't have a simple health endpoint; try a minimal request
-            response = await self._client.post(
-                f"{self._api_base}/messages",
-                headers=self._headers(),
-                json={
-                    "model": self.model,
-                    "messages": [{"role": "user", "content": "hi"}],
-                    "max_tokens": 1,
-                },
-            )
-            # Any response (even 401) means the service is reachable
-            return response.status_code in (200, 401, 400)
-        except (httpx.ConnectError, httpx.TimeoutException):
-            return False

+"""Cloud LLM provider clients (Groq, OpenAI, Anthropic Claude)."""
+from __future__ import annotations
+import json
+import time
+from abc import ABC, abstractmethod
+from enum import StrEnum
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+import httpx
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+from config.settings import settings
+from inference.llm_factory import LLMResponse
+from utils.logging import get_logger
+logger = get_logger(__name__)
+# Retry decorator for transient connection failures only
+_retry_on_connection = retry(
+    retry=retry_if_exception_type((httpx.ConnectError, httpx.TimeoutException)),
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+    reraise=True,
+)
+class LLMProvider(StrEnum):
+    """String-valued identifiers for the LLM providers named in this package.
+    The base is ``StrEnum``, so every member is also a ``str`` equal to its
+    value (for example ``LLMProvider.GROQ == "groq"``).
+    """
+    # Locally hosted inference server.
+    OLLAMA = "ollama"
+    # Cloud providers implemented by the client classes in this module.
+    GROQ = "groq"
+    OPENAI = "openai"
+    ANTHROPIC = "anthropic"
+class BaseCloudClient(ABC):
+    """Abstract base class for cloud LLM provider clients.
+    Args:
+        api_key: Provider API key for authentication.
+        model: Default model identifier.
+        timeout: Request timeout in seconds.
+    """
+    def __init__(self, api_key: str, model: str, timeout: float = 60.0) -> None:
+        self.api_key = api_key
+        self.model = model
+        self.timeout = timeout
+        self._client = httpx.AsyncClient(timeout=httpx.Timeout(timeout))
+    @abstractmethod
+    async def generate(
+        self,
+        prompt: str,
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+        json_mode: bool = False,
+    ) -> LLMResponse:
+        """Generate a completion from the provider.
+        Args:
+            prompt: The user prompt text.
+            system_prompt: Optional system context.
+            temperature: Sampling temperature.
+            max_tokens: Maximum tokens to generate.
+            json_mode: When True, request JSON-formatted output.
+        Returns:
+            LLMResponse with generated text and metadata.
+        """
+    @abstractmethod
+    async def chat(
+        self,
+        messages: list[dict],
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+    ) -> LLMResponse:
+        """Send a chat conversation to the provider.
+        Args:
+            messages: List of message dicts with 'role' and 'content' keys.
+            temperature: Sampling temperature.
+            max_tokens: Maximum tokens to generate.
+        Returns:
+            LLMResponse with generated text and metadata.
+        """
+    @abstractmethod
+    async def generate_stream(
+        self,
+        prompt: str,
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+    ) -> AsyncGenerator[str, None]:
+        """Stream a completion from the provider, yielding tokens as they arrive.
+        Args:
+            prompt: The user prompt text.
+            system_prompt: Optional system context.
+            temperature: Sampling temperature.
+            max_tokens: Maximum tokens to generate.
+        Yields:
+            Token strings as they are generated.
+        """
+    @abstractmethod
+    async def health_check(self) -> bool:
+        """Check if the provider API is reachable.
+        Returns:
+            True if the API responds successfully.
+        """
+    async def close(self) -> None:
+        """Close the underlying HTTP client."""
+        await self._client.aclose()
+    async def __aenter__(self) -> BaseCloudClient:
+        """Enter async context manager."""
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit async context manager, closing the client."""
+        await self.close()
+def make_byok_cloud_client(
+    *,
+    provider: str,
+    user_key: str,
+    model: str | None = None,
+    timeout: float = 60.0,
+) -> BaseCloudClient:
+    """Build a per-request cloud LLM client that uses the visitor's API key.
+    Each call returns a **fresh client instance** holding the supplied key
+    in its own ``self.api_key`` slot. The visitor's key never lands on any
+    module-level singleton, never mixes into the owner-key client, and is
+    discarded when the FastAPI request scope ends.
+    Args:
+        provider: One of ``"groq"`` / ``"openai"`` / ``"anthropic"``.
+        user_key: The visitor-supplied API key from ``X-User-LLM-Key``.
+        model: Override the provider's default model.
+        timeout: Per-request HTTP timeout in seconds.
+    Returns:
+        A new ``BaseCloudClient`` subclass instance bound to the visitor key.
+    Raises:
+        ValueError: ``provider`` is not in the BYOK allowlist or ``user_key``
+            is missing.
+    """
+    if not user_key or not user_key.strip():
+        raise ValueError("make_byok_cloud_client called without a user key")
+    prov = (provider or "").lower()
+    if prov == "groq":
+        return GroqClient(
+            api_key=user_key.strip(), model=model or "llama-3.1-8b-instant", timeout=timeout
+        )
+    if prov == "openai":
+        return OpenAIClient(api_key=user_key.strip(), model=model or "gpt-4o-mini", timeout=timeout)
+    if prov == "anthropic":
+        return AnthropicClient(
+            api_key=user_key.strip(),
+            model=model or "claude-sonnet-4-20250514",
+            timeout=timeout,
+        )
+    raise ValueError(f"BYOK provider not supported: {provider!r}")
+class OpenAICompatibleClient(BaseCloudClient):
+    """Shared client for OpenAI Chat Completions-compatible APIs.
+    Both Groq and OpenAI implement the same wire format
+    (``POST /chat/completions`` + SSE streaming). Subclasses supply only
+    the ``api_base`` URL and the ``provider`` tag — every method on
+    ``BaseCloudClient`` is implemented once, here, and inherited.
+    """
+    #: Subclasses override these two class attrs.
+    api_base: str = ""
+    provider_name: str = ""
+    def _headers(self) -> dict[str, str]:
+        return {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+    @staticmethod
+    def _messages(prompt: str, system_prompt: str) -> list[dict[str, str]]:
+        out: list[dict[str, str]] = []
+        if system_prompt:
+            out.append({"role": "system", "content": system_prompt})
+        out.append({"role": "user", "content": prompt})
+        return out
+    @_retry_on_connection
+    async def generate(
+        self,
+        prompt: str,
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+        json_mode: bool = False,
+    ) -> LLMResponse:
+        return await self.chat(
+            messages=self._messages(prompt, system_prompt),
+            temperature=temperature,
+            max_tokens=max_tokens,
+            json_mode=json_mode,
+        )
+    @_retry_on_connection
+    async def chat(
+        self,
+        messages: list[dict],
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+        json_mode: bool = False,
+    ) -> LLMResponse:
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+        if json_mode:
+            payload["response_format"] = {"type": "json_object"}
+        start = time.perf_counter()
+        response = await self._client.post(
+            f"{self.api_base}/chat/completions",
+            headers=self._headers(),
+            json=payload,
+        )
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        response.raise_for_status()
+        data = response.json()
+        choice = data.get("choices", [{}])[0]
+        message = choice.get("message", {})
+        usage = data.get("usage", {})
+        return LLMResponse(
+            text=message.get("content", ""),
+            model=data.get("model", self.model),
+            provider=self.provider_name,
+            usage={
+                "prompt_tokens": usage.get("prompt_tokens", 0),
+                "completion_tokens": usage.get("completion_tokens", 0),
+                "total_tokens": usage.get("total_tokens", 0),
+            },
+            latency_ms=elapsed_ms,
+        )
+    @_retry_on_connection
+    async def generate_stream(
+        self,
+        prompt: str,
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+    ) -> AsyncGenerator[str, None]:
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "messages": self._messages(prompt, system_prompt),
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+            "stream": True,
+        }
+        async with self._client.stream(
+            "POST",
+            f"{self.api_base}/chat/completions",
+            headers={**self._headers(), "Accept": "text/event-stream"},
+            json=payload,
+        ) as resp:
+            resp.raise_for_status()
+            async for line in resp.aiter_lines():
+                line = line.strip()
+                if not line.startswith("data: "):
+                    continue
+                data_str = line[6:]
+                if data_str == "[DONE]":
+                    break
+                try:
+                    data = json.loads(data_str)
+                except json.JSONDecodeError:
+                    continue
+                choice = data.get("choices", [{}])[0]
+                token = choice.get("delta", {}).get("content", "")
+                if token:
+                    yield token
+    @_retry_on_connection
+    async def health_check(self) -> bool:
+        try:
+            response = await self._client.get(f"{self.api_base}/models", headers=self._headers())
+            return response.status_code in (200, 401)
+        except (httpx.ConnectError, httpx.TimeoutException):
+            return False
+class GroqClient(OpenAICompatibleClient):
+    """Groq cloud LLM client (OpenAI-compatible API at api.groq.com)."""
+    provider_name = "groq"
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "llama-3.3-70b-versatile",
+        timeout: float = 60.0,
+    ) -> None:
+        super().__init__(api_key=api_key, model=model, timeout=timeout)
+        self.api_base = settings.groq_api_base
+class OpenAIClient(OpenAICompatibleClient):
+    """OpenAI cloud LLM client (Chat Completions API at api.openai.com)."""
+    provider_name = "openai"
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "gpt-4o-mini",
+        timeout: float = 60.0,
+    ) -> None:
+        super().__init__(api_key=api_key, model=model, timeout=timeout)
+        self.api_base = settings.openai_api_base
+class AnthropicClient(BaseCloudClient):
+    """Anthropic Claude cloud LLM client using the Messages API.
+    Args:
+        api_key: Anthropic API key.
+        model: Model identifier. Defaults to "claude-sonnet-4-20250514".
+        timeout: Request timeout in seconds.
+    """
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "claude-sonnet-4-20250514",
+        timeout: float = 60.0,
+    ) -> None:
+        super().__init__(api_key=api_key, model=model, timeout=timeout)
+        self._api_base = settings.anthropic_api_base
+    def _headers(self) -> dict[str, str]:
+        """Build request headers with Anthropic-specific authentication."""
+        return {
+            "x-api-key": self.api_key,
+            "anthropic-version": "2023-06-01",
+            "Content-Type": "application/json",
+        }
+    @_retry_on_connection
+    async def generate(
+        self,
+        prompt: str,
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+        json_mode: bool = False,
+    ) -> LLMResponse:
+        """Generate a completion via Anthropic's Messages API.
+        Args:
+            prompt: The user prompt text.
+            system_prompt: Optional system context.
+            temperature: Sampling temperature.
+            max_tokens: Maximum tokens to generate.
+            json_mode: Anthropic does not support native JSON mode; ignored.
+        Returns:
+            LLMResponse with generated text and metadata.
+        """
+        messages: list[dict[str, str]] = [{"role": "user", "content": prompt}]
+        return await self._send_messages(
+            messages=messages,
+            system_prompt=system_prompt,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+    @_retry_on_connection
+    async def chat(
+        self,
+        messages: list[dict],
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+    ) -> LLMResponse:
+        """Send a chat request to Anthropic's Messages API.
+        Anthropic uses a separate 'system' parameter instead of a system message
+        in the messages list. This method extracts any system message and handles
+        the format conversion.
+        Args:
+            messages: List of message dicts with 'role' and 'content' keys.
+            temperature: Sampling temperature.
+            max_tokens: Maximum tokens to generate.
+        Returns:
+            LLMResponse with generated text and metadata.
+        """
+        # Extract system message if present
+        system_prompt = ""
+        anthropic_messages: list[dict[str, str]] = []
+        for msg in messages:
+            if msg.get("role") == "system":
+                system_prompt = msg.get("content", "")
+            else:
+                anthropic_messages.append(msg)
+        return await self._send_messages(
+            messages=anthropic_messages,
+            system_prompt=system_prompt,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+    async def _send_messages(
+        self,
+        messages: list[dict],
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+    ) -> LLMResponse:
+        """Internal method to send messages to Anthropic's API.
+        Args:
+            messages: Anthropic-formatted messages (no system role).
+            system_prompt: System prompt passed as top-level parameter.
+            temperature: Sampling temperature.
+            max_tokens: Maximum tokens to generate.
+        Returns:
+            LLMResponse with generated text and metadata.
+        """
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+        if system_prompt:
+            payload["system"] = system_prompt
+        start = time.perf_counter()
+        response = await self._client.post(
+            f"{self._api_base}/messages",
+            headers=self._headers(),
+            json=payload,
+        )
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        response.raise_for_status()
+        data = response.json()
+        # Anthropic returns content as a list of content blocks
+        content_blocks = data.get("content", [])
+        text = ""
+        for block in content_blocks:
+            if block.get("type") == "text":
+                text += block.get("text", "")
+        usage = data.get("usage", {})
+        return LLMResponse(
+            text=text,
+            model=data.get("model", self.model),
+            provider="anthropic",
+            usage={
+                "prompt_tokens": usage.get("input_tokens", 0),
+                "completion_tokens": usage.get("output_tokens", 0),
+                "total_tokens": (usage.get("input_tokens", 0) + usage.get("output_tokens", 0)),
+            },
+            latency_ms=elapsed_ms,
+        )
+    async def generate_stream(
+        self,
+        prompt: str,
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+    ) -> AsyncGenerator[str, None]:
+        """Stream a completion via Anthropic's Messages API.
+        Anthropic supports streaming via SSE. Yields text content blocks
+        as they arrive.
+        Args:
+            prompt: The user prompt text.
+            system_prompt: Optional system context.
+            temperature: Sampling temperature.
+            max_tokens: Maximum tokens to generate.
+        Yields:
+            Token strings as they are generated.
+        """
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "messages": [{"role": "user", "content": prompt}],
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+            "stream": True,
+        }
+        if system_prompt:
+            payload["system"] = system_prompt
+        async with self._client.stream(
+            "POST",
+            f"{self._api_base}/messages",
+            headers={**self._headers(), "Accept": "text/event-stream"},
+            json=payload,
+        ) as resp:
+            resp.raise_for_status()
+            async for line in resp.aiter_lines():
+                line = line.strip()
+                if line.startswith("data: "):
+                    data_str = line[6:]
+                    if data_str == "[DONE]":
+                        break
+                    try:
+                        data = json.loads(data_str)
+                        event_type = data.get("type", "")
+                        if event_type == "content_block_delta":
+                            delta = data.get("delta", {})
+                            token = delta.get("text", "")
+                            if token:
+                                yield token
+                        elif event_type == "message_stop":
+                            break
+                    except json.JSONDecodeError:
+                        continue
+    @_retry_on_connection
+    async def health_check(self) -> bool:
+        """Check if the Anthropic API is reachable.
+        Returns:
+            True if the API responds.
+        """
+        try:
+            # Anthropic doesn't have a simple health endpoint; try a minimal request
+            response = await self._client.post(
+                f"{self._api_base}/messages",
+                headers=self._headers(),
+                json={
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": "hi"}],
+                    "max_tokens": 1,
+                },
+            )
+            # Any response (even 401) means the service is reachable
+            return response.status_code in (200, 401, 400)
+        except (httpx.ConnectError, httpx.TimeoutException):
+            return False

inference/ollama_client.py CHANGED Viewed

@@ -1,334 +1,334 @@
-"""Async Ollama client wrapper with streaming support and health checks."""
-from __future__ import annotations
-import time
-from typing import TYPE_CHECKING, Any
-import httpx
-if TYPE_CHECKING:
-    from collections.abc import AsyncGenerator
-from tenacity import (
-    retry,
-    retry_if_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-)
-from config.settings import settings
-from inference.llm_factory import LLMResponse
-from utils.logging import get_logger
-logger = get_logger(__name__)
-# Retry decorator for transient connection failures only
-_retry_on_connection = retry(
-    retry=retry_if_exception_type((httpx.ConnectError, httpx.TimeoutException)),
-    stop=stop_after_attempt(3),
-    wait=wait_exponential(multiplier=1, min=1, max=10),
-    reraise=True,
-)
-def make_byok_ollama_client(
-    *,
-    base_url: str,
-    model: str | None = None,
-    timeout: float = 60.0,
-) -> OllamaClient:
-    """Build a per-request Ollama client bound to the visitor's instance URL.
-    Visitors running their own local Ollama can paste the public URL of
-    that instance into the frontend. Each call returns a **fresh client**
-    so the visitor's URL never replaces the owner default at module scope.
-    Args:
-        base_url: URL of the visitor's Ollama server (HTTPS preferred).
-        model: Override the default model. Falls back to the owner's
-            configured ``SAR_LLM_MODEL`` if the visitor's Ollama does not
-            advertise its own.
-        timeout: Per-request HTTP timeout in seconds.
-    Returns:
-        A new ``OllamaClient`` bound to ``base_url``.
-    Raises:
-        ValueError: ``base_url`` is empty or whitespace.
-    """
-    if not base_url or not base_url.strip():
-        raise ValueError("make_byok_ollama_client called without a base_url")
-    return OllamaClient(base_url=base_url.strip(), model=model, timeout=timeout)
-class OllamaClient:
-    """Async client for the Ollama local LLM inference server.
-    Supports generate (completion), chat, streaming, health checks,
-    and model listing via the Ollama HTTP API.
-    Args:
-        base_url: Ollama server base URL. Defaults to settings.ollama_url.
-        model: Default model name. Defaults to settings.llm_model.
-        timeout: Request timeout in seconds.
-    """
-    def __init__(
-        self,
-        base_url: str | None = None,
-        model: str | None = None,
-        timeout: float = 120.0,
-    ) -> None:
-        self.base_url = (base_url if base_url is not None else settings.ollama_url).rstrip("/")
-        self.model = model if model is not None else settings.llm_model
-        self.timeout = timeout
-        self._client = httpx.AsyncClient(
-            base_url=self.base_url,
-            timeout=httpx.Timeout(timeout),
-        )
-    @_retry_on_connection
-    async def generate(
-        self,
-        prompt: str,
-        system_prompt: str = "",
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-        json_mode: bool = False,
-    ) -> LLMResponse:
-        """Generate a completion from the Ollama API.
-        Args:
-            prompt: The user prompt text.
-            system_prompt: Optional system context.
-            temperature: Sampling temperature (0.0-1.0).
-            max_tokens: Maximum tokens to generate.
-            json_mode: When True, request JSON-formatted output.
-        Returns:
-            LLMResponse with generated text and metadata.
-        """
-        payload: dict[str, Any] = {
-            "model": self.model,
-            "prompt": prompt,
-            "stream": False,
-            "options": {
-                "temperature": temperature,
-                "num_predict": max_tokens,
-            },
-            "keep_alive": settings.ollama_keep_alive,
-        }
-        if system_prompt:
-            payload["system"] = system_prompt
-        if json_mode:
-            payload["format"] = "json"
-        start = time.perf_counter()
-        response = await self._client.post("/api/generate", json=payload)
-        elapsed_ms = (time.perf_counter() - start) * 1000
-        response.raise_for_status()
-        data = response.json()
-        return LLMResponse(
-            text=data.get("response", ""),
-            model=data.get("model", self.model),
-            provider="ollama",
-            usage={
-                "prompt_tokens": data.get("prompt_eval_count", 0),
-                "completion_tokens": data.get("eval_count", 0),
-                "total_tokens": (data.get("prompt_eval_count", 0) + data.get("eval_count", 0)),
-            },
-            latency_ms=elapsed_ms,
-            metadata={
-                "total_duration": data.get("total_duration"),
-                "load_duration": data.get("load_duration"),
-            },
-        )
-    @_retry_on_connection
-    async def chat(
-        self,
-        messages: list[dict],
-        temperature: float = 0.7,
-        max_tokens: int = 2048,
-    ) -> LLMResponse:
-        """Send a chat conversation to the Ollama API.
-        Args:
-            messages: List of message dicts with 'role' and 'content' keys.
-                Roles: "system", "user", "assistant".
-            temperature: Sampling temperature (0.0-1.0).
-            max_tokens: Maximum tokens to generate.
-        Returns:
-            LLMResponse with generated text and metadata.
-        """
-        payload: dict[str, Any] = {
-            "model": self.model,
-            "messages": messages,
-            "stream": False,
-            "options": {
-                "temperature": temperature,
-                "num_predict": max_tokens,
-            },
-            "keep_alive": settings.ollama_keep_alive,
-        }
-        start = time.perf_counter()
-        response = await self._client.post("/api/chat", json=payload)
-        elapsed_ms = (time.perf_counter() - start) * 1000
-        response.raise_for_status()
-        data = response.json()
-        message = data.get("message", {})
-        return LLMResponse(
-            text=message.get("content", ""),
-            model=data.get("model", self.model),
-            provider="ollama",
-            usage={
-                "prompt_tokens": data.get("prompt_eval_count", 0),
-                "completion_tokens": data.get("eval_count", 0),
-                "total_tokens": (data.get("prompt_eval_count", 0) + data.get("eval_count", 0)),
-            },
-            latency_ms=elapsed_ms,
-            metadata={
-                "total_duration": data.get("total_duration"),
-                "load_duration": data.get("load_duration"),
-            },
-        )
-    async def generate_stream(
-        self,
-        prompt: str,
-        system_prompt: str = "",
-        temperature: float = 0.7,
-    ) -> AsyncGenerator[str, None]:
-        """Stream a completion from the Ollama API, yielding tokens as they arrive.
-        Args:
-            prompt: The user prompt text.
-            system_prompt: Optional system context.
-            temperature: Sampling temperature (0.0-1.0).
-        Yields:
-            Token strings as they are generated.
-        """
-        payload: dict[str, Any] = {
-            "model": self.model,
-            "prompt": prompt,
-            "stream": True,
-            "options": {
-                "temperature": temperature,
-            },
-            "keep_alive": settings.ollama_keep_alive,
-        }
-        if system_prompt:
-            payload["system"] = system_prompt
-        async with self._client.stream("POST", "/api/generate", json=payload) as resp:
-            resp.raise_for_status()
-            async for line in resp.aiter_lines():
-                if line:
-                    import json
-                    data = json.loads(line)
-                    token = data.get("response", "")
-                    if token:
-                        yield token
-                    if data.get("done", False):
-                        break
-    async def chat_stream(
-        self,
-        messages: list[dict],
-        temperature: float = 0.7,
-    ) -> AsyncGenerator[str, None]:
-        """Stream a chat completion from the Ollama API, yielding tokens as they arrive.
-        Args:
-            messages: List of message dicts with 'role' and 'content' keys.
-            temperature: Sampling temperature (0.0-1.0).
-        Yields:
-            Token strings as they are generated.
-        """
-        payload: dict[str, Any] = {
-            "model": self.model,
-            "messages": messages,
-            "stream": True,
-            "options": {
-                "temperature": temperature,
-            },
-            "keep_alive": settings.ollama_keep_alive,
-        }
-        async with self._client.stream("POST", "/api/chat", json=payload) as resp:
-            resp.raise_for_status()
-            async for line in resp.aiter_lines():
-                if line:
-                    import json
-                    data = json.loads(line)
-                    message = data.get("message", {})
-                    token = message.get("content", "")
-                    if token:
-                        yield token
-                    if data.get("done", False):
-                        break
-    @_retry_on_connection
-    async def health_check(self) -> bool:
-        """Check if the Ollama server is reachable and responding.
-        Returns:
-            True if the server responds with HTTP 200, False otherwise.
-        """
-        try:
-            response = await self._client.get("/api/tags")
-            return response.status_code == 200
-        except (httpx.ConnectError, httpx.TimeoutException):
-            return False
-    @_retry_on_connection
-    async def list_models(self) -> list[str]:
-        """List all models available on the Ollama server.
-        Returns:
-            List of model name strings.
-        """
-        response = await self._client.get("/api/tags")
-        response.raise_for_status()
-        data = response.json()
-        models = data.get("models", [])
-        return [m.get("name", "") for m in models]
-    @_retry_on_connection
-    async def get_model_info(self, model: str | None = None) -> dict | None:
-        """Get detailed information about a specific model.
-        Args:
-            model: Model name to query. Defaults to the client's configured model.
-        Returns:
-            Dict with model info, or None if model not found.
-        """
-        target_model = model or self.model
-        try:
-            response = await self._client.post("/api/show", json={"name": target_model})
-            if response.status_code == 200:
-                return response.json()
-            return None
-        except httpx.HTTPStatusError:
-            return None
-    async def close(self) -> None:
-        """Close the underlying HTTP client."""
-        await self._client.aclose()
-    async def __aenter__(self) -> OllamaClient:
-        """Enter async context manager."""
-        return self
-    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
-        """Exit async context manager, closing the client."""
-        await self.close()

+"""Async Ollama client wrapper with streaming support and health checks."""
+from __future__ import annotations
+import time
+from typing import TYPE_CHECKING, Any
+import httpx
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+from config.settings import settings
+from inference.llm_factory import LLMResponse
+from utils.logging import get_logger
+logger = get_logger(__name__)
+# Retry decorator for transient connection failures only: it retries on
+# ``httpx.ConnectError`` / ``httpx.TimeoutException`` and lets every other
+# exception (including HTTP status errors) propagate immediately. A call gets
+# at most 3 attempts with exponential backoff bounded to 1-10 seconds, and
+# ``reraise=True`` surfaces the original httpx exception rather than
+# tenacity's ``RetryError`` once the attempts are used up.
+_retry_on_connection = retry(
+    retry=retry_if_exception_type((httpx.ConnectError, httpx.TimeoutException)),
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+    reraise=True,
+)
+def make_byok_ollama_client(
+    *,
+    base_url: str,
+    model: str | None = None,
+    timeout: float = 60.0,
+) -> OllamaClient:
+    """Build a per-request Ollama client bound to the visitor's instance URL.
+    Visitors running their own local Ollama can paste the public URL of
+    that instance into the frontend. Each call returns a **fresh client**
+    so the visitor's URL never replaces the owner default at module scope.
+    The URL is visitor-controlled, so its shape is validated up front instead
+    of letting a scheme-less or non-HTTP value fail later on first request.
+    NOTE(review): only the URL shape is checked here; private / loopback
+    hosts are not blocked -- confirm the caller applies an SSRF allowlist.
+    Args:
+        base_url: URL of the visitor's Ollama server (HTTPS preferred).
+        model: Override the default model. Falls back to the owner's
+            configured ``SAR_LLM_MODEL`` if the visitor's Ollama does not
+            advertise its own.
+        timeout: Per-request HTTP timeout in seconds.
+    Returns:
+        A new ``OllamaClient`` bound to ``base_url``.
+    Raises:
+        ValueError: ``base_url`` is empty or whitespace, or is not an
+            absolute ``http``/``https`` URL with a host.
+    """
+    from urllib.parse import urlsplit
+    if not base_url or not base_url.strip():
+        raise ValueError("make_byok_ollama_client called without a base_url")
+    cleaned = base_url.strip()
+    # urlsplit itself raises ValueError on e.g. a broken IPv6 literal, which
+    # matches the documented exception type.
+    parts = urlsplit(cleaned)
+    if parts.scheme not in ("http", "https") or not parts.hostname:
+        raise ValueError("make_byok_ollama_client requires an absolute http(s) base_url")
+    return OllamaClient(base_url=cleaned, model=model, timeout=timeout)
+class OllamaClient:
+    """Async client for the Ollama local LLM inference server.
+    Supports generate (completion), chat, streaming, health checks,
+    and model listing via the Ollama HTTP API. Use it as an async context
+    manager (or call ``close``) so the underlying HTTP client is released.
+    Args:
+        base_url: Ollama server base URL. Defaults to settings.ollama_url.
+        model: Default model name. Defaults to settings.llm_model.
+        timeout: Request timeout in seconds.
+    """
+    def __init__(
+        self,
+        base_url: str | None = None,
+        model: str | None = None,
+        timeout: float = 120.0,
+    ) -> None:
+        # Explicit ``is not None`` checks: only a missing argument falls back
+        # to the configured defaults. Trailing slashes are normalised away.
+        self.base_url = (base_url if base_url is not None else settings.ollama_url).rstrip("/")
+        self.model = model if model is not None else settings.llm_model
+        self.timeout = timeout
+        self._client = httpx.AsyncClient(
+            base_url=self.base_url,
+            timeout=httpx.Timeout(timeout),
+        )
+    def _build_response(self, data: dict[str, Any], text: str, elapsed_ms: float) -> LLMResponse:
+        """Wrap a decoded non-streaming Ollama reply in an ``LLMResponse``.
+        Args:
+            data: Decoded JSON body returned by Ollama.
+            text: Generated text already extracted from ``data``.
+            elapsed_ms: Wall-clock duration of the HTTP call in milliseconds.
+        Returns:
+            LLMResponse carrying the text, token usage and timing metadata.
+        """
+        # ``or 0`` covers both a missing counter and an explicit JSON null,
+        # either of which would otherwise break the total_tokens addition.
+        prompt_tokens = data.get("prompt_eval_count") or 0
+        completion_tokens = data.get("eval_count") or 0
+        return LLMResponse(
+            text=text,
+            model=data.get("model", self.model),
+            provider="ollama",
+            usage={
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
+            },
+            latency_ms=elapsed_ms,
+            metadata={
+                "total_duration": data.get("total_duration"),
+                "load_duration": data.get("load_duration"),
+            },
+        )
+    @_retry_on_connection
+    async def generate(
+        self,
+        prompt: str,
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+        json_mode: bool = False,
+    ) -> LLMResponse:
+        """Generate a completion from the Ollama API.
+        Connection and timeout failures are retried by ``_retry_on_connection``.
+        Args:
+            prompt: The user prompt text.
+            system_prompt: Optional system context.
+            temperature: Sampling temperature (0.0-1.0).
+            max_tokens: Maximum tokens to generate.
+            json_mode: When True, request JSON-formatted output.
+        Returns:
+            LLMResponse with generated text and metadata.
+        Raises:
+            httpx.HTTPStatusError: The server answered with an error status.
+        """
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "prompt": prompt,
+            "stream": False,
+            "options": {
+                "temperature": temperature,
+                "num_predict": max_tokens,
+            },
+            "keep_alive": settings.ollama_keep_alive,
+        }
+        if system_prompt:
+            payload["system"] = system_prompt
+        if json_mode:
+            payload["format"] = "json"
+        # Latency covers only the HTTP round trip, not JSON decoding.
+        start = time.perf_counter()
+        response = await self._client.post("/api/generate", json=payload)
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        response.raise_for_status()
+        data = response.json()
+        return self._build_response(data, data.get("response") or "", elapsed_ms)
+    @_retry_on_connection
+    async def chat(
+        self,
+        messages: list[dict],
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+    ) -> LLMResponse:
+        """Send a chat conversation to the Ollama API.
+        Connection and timeout failures are retried by ``_retry_on_connection``.
+        Args:
+            messages: List of message dicts with 'role' and 'content' keys.
+                Roles: "system", "user", "assistant".
+            temperature: Sampling temperature (0.0-1.0).
+            max_tokens: Maximum tokens to generate.
+        Returns:
+            LLMResponse with generated text and metadata.
+        Raises:
+            httpx.HTTPStatusError: The server answered with an error status.
+        """
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "messages": messages,
+            "stream": False,
+            "options": {
+                "temperature": temperature,
+                "num_predict": max_tokens,
+            },
+            "keep_alive": settings.ollama_keep_alive,
+        }
+        start = time.perf_counter()
+        response = await self._client.post("/api/chat", json=payload)
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        response.raise_for_status()
+        data = response.json()
+        # Tolerate a missing or null "message" object.
+        message = data.get("message") or {}
+        return self._build_response(data, message.get("content") or "", elapsed_ms)
+    async def generate_stream(
+        self,
+        prompt: str,
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        max_tokens: int | None = None,
+    ) -> AsyncGenerator[str, None]:
+        """Stream a completion from the Ollama API, yielding tokens as they arrive.
+        Not retried: a stream that has started yielding cannot be replayed.
+        Args:
+            prompt: The user prompt text.
+            system_prompt: Optional system context.
+            temperature: Sampling temperature (0.0-1.0).
+            max_tokens: Optional cap on generated tokens. ``None`` (default)
+                sends no ``num_predict`` option, as before.
+        Yields:
+            Token strings as they are generated.
+        Raises:
+            httpx.HTTPStatusError: The server answered with an error status.
+        """
+        # Imported once per call rather than once per streamed line.
+        import json
+        options: dict[str, Any] = {"temperature": temperature}
+        if max_tokens is not None:
+            options["num_predict"] = max_tokens
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "prompt": prompt,
+            "stream": True,
+            "options": options,
+            "keep_alive": settings.ollama_keep_alive,
+        }
+        if system_prompt:
+            payload["system"] = system_prompt
+        async with self._client.stream("POST", "/api/generate", json=payload) as resp:
+            resp.raise_for_status()
+            # The body is newline-delimited JSON: one object per line.
+            async for line in resp.aiter_lines():
+                if not line:
+                    continue
+                data = json.loads(line)
+                token = data.get("response", "")
+                if token:
+                    yield token
+                if data.get("done", False):
+                    break
+    async def chat_stream(
+        self,
+        messages: list[dict],
+        temperature: float = 0.7,
+        max_tokens: int | None = None,
+    ) -> AsyncGenerator[str, None]:
+        """Stream a chat completion from the Ollama API, yielding tokens as they arrive.
+        Not retried: a stream that has started yielding cannot be replayed.
+        Args:
+            messages: List of message dicts with 'role' and 'content' keys.
+            temperature: Sampling temperature (0.0-1.0).
+            max_tokens: Optional cap on generated tokens. ``None`` (default)
+                sends no ``num_predict`` option, as before.
+        Yields:
+            Token strings as they are generated.
+        Raises:
+            httpx.HTTPStatusError: The server answered with an error status.
+        """
+        # Imported once per call rather than once per streamed line.
+        import json
+        options: dict[str, Any] = {"temperature": temperature}
+        if max_tokens is not None:
+            options["num_predict"] = max_tokens
+        payload: dict[str, Any] = {
+            "model": self.model,
+            "messages": messages,
+            "stream": True,
+            "options": options,
+            "keep_alive": settings.ollama_keep_alive,
+        }
+        async with self._client.stream("POST", "/api/chat", json=payload) as resp:
+            resp.raise_for_status()
+            # The body is newline-delimited JSON: one object per line.
+            async for line in resp.aiter_lines():
+                if not line:
+                    continue
+                data = json.loads(line)
+                message = data.get("message") or {}
+                token = message.get("content", "")
+                if token:
+                    yield token
+                if data.get("done", False):
+                    break
+    async def health_check(self) -> bool:
+        """Check if the Ollama server is reachable and responding.
+        Deliberately not wrapped in ``_retry_on_connection``: this method
+        converts transport failures into ``False`` itself, so a retry
+        decorator would never see an exception to retry on.
+        Returns:
+            True if the server responds with HTTP 200, False otherwise
+            (including any httpx transport failure).
+        """
+        try:
+            response = await self._client.get("/api/tags")
+        except httpx.HTTPError:
+            # Base class of ConnectError / TimeoutException and the other
+            # transport errors; any of them means "not healthy".
+            return False
+        return response.status_code == 200
+    @_retry_on_connection
+    async def list_models(self) -> list[str]:
+        """List all models available on the Ollama server.
+        Returns:
+            List of model name strings.
+        Raises:
+            httpx.HTTPStatusError: The server answered with an error status.
+        """
+        response = await self._client.get("/api/tags")
+        response.raise_for_status()
+        data = response.json()
+        models = data.get("models", [])
+        return [m.get("name", "") for m in models]
+    @_retry_on_connection
+    async def get_model_info(self, model: str | None = None) -> dict | None:
+        """Get detailed information about a specific model.
+        Args:
+            model: Model name to query. Defaults to the client's configured model.
+        Returns:
+            Dict with model info, or None when the server answers with any
+            status other than 200 (e.g. the model is not found).
+        """
+        target_model = model or self.model
+        response = await self._client.post("/api/show", json={"name": target_model})
+        # No raise_for_status() here: a non-200 answer is reported as None
+        # rather than as an exception.
+        if response.status_code != 200:
+            return None
+        return response.json()
+    async def close(self) -> None:
+        """Close the underlying HTTP client."""
+        await self._client.aclose()
+    async def __aenter__(self) -> OllamaClient:
+        """Enter async context manager."""
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit async context manager, closing the client."""
+        await self.close()

interfaces/api.py CHANGED Viewed

@@ -1,425 +1,432 @@
-"""FastAPI surface for SecureAgentRAG.
-Run with::
-    uv run uvicorn interfaces.api:app --host 0.0.0.0 --port 8080
-Endpoints
----------
-- ``GET  /healthz``     — liveness probe (no auth).
-- ``GET  /readyz``      — readiness — pings Qdrant + Ollama.
-- ``POST /query``       — run the RAG pipeline; returns ``QueryResponse``.
-- ``POST /ingest``      — ingest a local file; requires ``user`` role.
-- ``GET  /audit``       — read paginated audit entries; requires ``admin``.
-- ``POST /audit/verify``— verify the hash-chain; requires ``admin``.
-Auth uses a stateless bearer token. The token payload is a base64-encoded JSON
-``UserContext`` so the API has no session store — caller provides identity on
-every request. Production deployments should swap this for Keycloak/Auth0 JWT
-verification (left as a hook in ``_resolve_user``).
-"""
-from __future__ import annotations
-import base64
-import json
-from datetime import date
-from typing import Annotated
-from config.settings import settings
-from utils.auth import AuthError, issue_token, verify_token
-from utils.logging import get_logger
-logger = get_logger(__name__)
-try:
-    from fastapi import Depends, FastAPI, Header, HTTPException, status
-    from fastapi.responses import JSONResponse
-    _FASTAPI_AVAILABLE = True
-except ImportError:  # pragma: no cover
-    _FASTAPI_AVAILABLE = False
-    Depends = Header = FastAPI = HTTPException = JSONResponse = status = None  # type: ignore[assignment]
-if _FASTAPI_AVAILABLE:
-    from core.graph import run_rag_pipeline
-    from core.schemas import (
-        IngestRequestModel,
-        IngestResponseModel,
-        QueryRequest,
-        QueryResponse,
-    )
-    from ingestion.metadata import IngestRequest, SensitivityLevel, UserContext
-    from utils.audit import audit_logger
-    from utils.health import run_health_checks
-    from utils.rate_limiter import RateLimiter
-    rate_limiter = RateLimiter()  # uses default token-bucket config
-    _AUTH_ERROR_STATUS: dict[str, int] = {
-        "missing": status.HTTP_401_UNAUTHORIZED,
-        "malformed": status.HTTP_401_UNAUTHORIZED,
-        "expired": status.HTTP_401_UNAUTHORIZED,
-        "bad_signature": status.HTTP_401_UNAUTHORIZED,
-        "bad_claims": status.HTTP_403_FORBIDDEN,
-    }
-    def _resolve_user_full(
-        authorization: Annotated[str | None, Header()] = None,
-    ) -> tuple[UserContext, dict]:
-        """Verify the bearer token and return (UserContext, claims).
-        Delegates to :func:`utils.auth.verify_token`, which uses HS256 JWT
-        when ``SAR_JWT_SECRET`` is set and falls back to the legacy unsigned
-        base64 token otherwise (with a runtime warning).
-        """
-        if not authorization or not authorization.lower().startswith("bearer "):
-            raise HTTPException(status.HTTP_401_UNAUTHORIZED, "missing bearer token")
-        token = authorization.split(" ", 1)[1]
-        try:
-            return verify_token(token)
-        except AuthError as exc:
-            code = _AUTH_ERROR_STATUS.get(exc.reason, status.HTTP_401_UNAUTHORIZED)
-            raise HTTPException(code, f"auth_{exc.reason}: {exc}") from exc
-    def _resolve_user(authorization: Annotated[str | None, Header()] = None) -> UserContext:
-        """Backward-compatible dependency returning only the UserContext."""
-        ctx, _claims = _resolve_user_full(authorization=authorization)
-        return ctx
-    def _require_role(required: str):
-        def _dep(user: Annotated[UserContext, Depends(_resolve_user)]) -> UserContext:
-            if required not in user.roles and "admin" not in user.roles:
-                raise HTTPException(status.HTTP_403_FORBIDDEN, f"role '{required}' required")
-            return user
-        return _dep
-    app = FastAPI(
-        title="SecureAgentRAG API",
-        version="0.1.0",
-        description="Privacy-first multi-agent RAG with RBAC, guardrails, and audit chain.",
-    )
-    # Initialize Phoenix tracing if configured.
-    # When ``settings.byok_mode`` is on, ``setup_tracing`` short-circuits to
-    # False regardless of phoenix_endpoint (see utils/observability.py).
-    from utils.observability import setup_tracing
-    _tracing_enabled = setup_tracing()
-    if _tracing_enabled:
-        logger.info("phoenix_tracing_active_in_api")
-    # ── BYOK CORS middleware ─────────────────────────────────────────────
-    # Only mount CORS when:
-    #   1) BYOK mode is on (public demo path), AND
-    #   2) an explicit allowlist is configured via SAR_CORS_ALLOW_ORIGINS.
-    # Empty allowlist + BYOK = wildcard would be a footgun (CSRF surface).
-    # Empty allowlist + dev = no CORS needed (local same-origin).
-    if settings.byok_mode and settings.cors_allow_origins:
-        from fastapi.middleware.cors import CORSMiddleware
-        app.add_middleware(
-            CORSMiddleware,
-            allow_origins=list(settings.cors_allow_origins),
-            allow_credentials=False,  # BYOK never uses cookies
-            allow_methods=["GET", "POST", "OPTIONS"],
-            allow_headers=["*"],
-        )
-        logger.info("byok_cors_enabled", origins=list(settings.cors_allow_origins))
-    @app.get("/healthz", tags=["ops"])
-    async def healthz() -> dict[str, str]:
-        return {"status": "ok"}
-    @app.get("/readyz", tags=["ops"])
-    async def readyz() -> JSONResponse:
-        report = await run_health_checks()
-        code = 200 if report.overall_healthy else 503
-        return JSONResponse(report.to_dict(), status_code=code)
-    # ── BYOK demo endpoint ───────────────────────────────────────────────
-    # Mounted only when ``settings.byok_mode`` is on. Bypasses JWT auth and
-    # uses per-request BYOK credentials instead. Isolation is enforced via
-    # session-scoped Qdrant collections, not JWT identity.
-    if settings.byok_mode:
-        from interfaces.byok import ByokCreds, extract_byok
-        from utils.rate_limiter import get_owner_key_throttle
-        _DEMO_PERSONAS: dict[str, dict] = {
-            "engineer": {
-                "org_id": "demo-engineering",
-                "clearance_level": 2,
-                "roles": ["engineering"],
-            },
-            "compliance": {
-                "org_id": "demo-compliance",
-                "clearance_level": 4,
-                "roles": ["compliance", "legal"],
-            },
-            "executive": {
-                "org_id": "demo-executive",
-                "clearance_level": 5,
-                "roles": ["executive", "compliance"],
-            },
-        }
-        def _persona_to_user_ctx(creds: ByokCreds) -> UserContext:
-            """Translate ``creds.demo_persona`` into a synthetic UserContext.
-            Unknown / missing persona → minimal read-only profile so the demo
-            still answers but cannot escalate beyond the lowest clearance.
-            """
-            preset = _DEMO_PERSONAS.get((creds.demo_persona or "").lower())
-            if preset is None:
-                preset = {"org_id": "demo-anon", "clearance_level": 1, "roles": ["viewer"]}
-            return UserContext(
-                user_id=f"demo-{creds.session_id}",
-                org_id=preset["org_id"],
-                clearance_level=preset["clearance_level"],
-                roles=preset["roles"],
-            )
-        from pydantic import BaseModel as _ByokBaseModel
-        class _ByokChatBody(_ByokBaseModel):
-            """Public-demo chat payload — no auth fields, only the question text."""
-            query: str
-            prefer_cloud: bool = True
-        # Runtime import — FastAPI dependency injection reads the annotation
-        # at request time, so this must NOT be a TYPE_CHECKING-only import.
-        from fastapi import Request as _FastApiRequest  # noqa: TC002
-        @app.post("/byok/chat", tags=["byok"])
-        async def byok_chat_endpoint(
-            request: _FastApiRequest,
-            body: _ByokChatBody,
-            creds: Annotated[ByokCreds, Depends(extract_byok)],
-        ) -> dict:
-            """Public-demo chat endpoint backed by BYOK credentials.
-            Routing:
-            - Visitor brought a key (``creds.has_user_key()``): pipeline uses
-              the visitor's provider + key. No throttle.
-            - Visitor did NOT bring a key: pipeline falls back to the owner's
-              configured cloud provider key, gated by ``OwnerKeyHourThrottle``.
-              When exhausted, returns 429 with copy nudging BYOK.
-            Persona maps to a synthetic ``UserContext`` so the existing RBAC
-            filter still runs end-to-end — same code path as authenticated
-            queries, just with demo identities.
-            """
-            if not creds.has_user_key():
-                throttle = get_owner_key_throttle()
-                client_ip = (request.client.host if request.client else None) or "anon"
-                ok, meta = throttle.allow(client_ip)
-                if not ok:
-                    raise HTTPException(
-                        status.HTTP_429_TOO_MANY_REQUESTS,
-                        detail={
-                            "reason": meta["reason"],
-                            "retry_after_seconds": meta["retry_after"],
-                            "hint": (
-                                "Owner-key fallback exhausted for this IP. "
-                                "Paste your own LLM key to continue — your key "
-                                "is never stored server-side."
-                            ),
-                        },
-                    )
-            user_ctx = _persona_to_user_ctx(creds)
-            state = await run_rag_pipeline(
-                query=body.query,
-                user_context=user_ctx,
-                thread_id=f"byok-{creds.session_id}",
-                prefer_cloud=body.prefer_cloud,
-                # Visitor's chosen provider when present; falls back to env.
-                override_provider=creds.safe_provider(),
-            )
-            response = QueryResponse.from_state(state)
-            return {
-                "session_id": creds.session_id,
-                "persona": creds.demo_persona or "anonymous",
-                "byok_used": creds.has_user_key(),
-                "response": response.model_dump(mode="json"),
-            }
-    @app.post("/query", response_model=QueryResponse, tags=["rag"])
-    async def query_endpoint(
-        body: QueryRequest,
-        auth: Annotated[tuple[UserContext, dict], Depends(_resolve_user_full)],
-    ) -> QueryResponse:
-        user, claims = auth
-        if not rate_limiter.is_allowed(f"{user.user_id}:query"):
-            raise HTTPException(status.HTTP_429_TOO_MANY_REQUESTS, "rate limit exceeded")
-        # Caller-supplied user_id must match the bearer-token identity.
-        if body.user_id != user.user_id:
-            raise HTTPException(status.HTTP_403_FORBIDDEN, "user_id mismatch")
-        # Use the JWT id so the audit trail can correlate a query with the
-        # exact token that authorised it; useful for revocation forensics.
-        jti = claims.get("jti", "unsigned")
-        state = await run_rag_pipeline(
-            query=body.query,
-            user_context=user,
-            thread_id=f"api-{user.user_id}-{jti}",
-            prefer_cloud=body.prefer_cloud,
-            override_provider=body.override_provider,
-        )
-        return QueryResponse.from_state(state)
-    @app.post("/ingest", response_model=IngestResponseModel, tags=["rag"])
-    async def ingest_endpoint(
-        body: IngestRequestModel,
-        user: Annotated[UserContext, Depends(_require_role("user"))],
-    ) -> IngestResponseModel:
-        if body.user_id != user.user_id:
-            raise HTTPException(status.HTTP_403_FORBIDDEN, "user_id mismatch")
-        from core.agents.retriever import _get_hybrid_searcher
-        from ingestion.pipeline import IngestionPipeline
-        searcher = _get_hybrid_searcher()
-        pipeline = IngestionPipeline(
-            qdrant_manager=searcher._qdrant,  # type: ignore[attr-defined]
-            embedding_service=searcher._embeddings,  # type: ignore[attr-defined]
-            sparse_service=searcher._sparse,  # type: ignore[attr-defined]
-        )
-        req = IngestRequest(
-            file_path=body.file_path,
-            user_id=body.user_id,
-            org_id=body.org_id,
-            sensitivity_level=SensitivityLevel(body.sensitivity_level),
-            roles=body.roles,
-        )
-        result = await pipeline.ingest_document(req)
-        return IngestResponseModel(
-            file_path=result.file_path,
-            status=result.status,
-            num_chunks=result.num_chunks,
-            point_ids=result.point_ids,
-            errors=result.errors,
-            processing_time_seconds=result.processing_time_seconds,
-        )
-    @app.get("/audit", tags=["audit"])
-    async def audit_list(
-        user: Annotated[UserContext, Depends(_require_role("admin"))],
-        start: str | None = None,
-        end: str | None = None,
-        limit: int = 100,
-    ) -> dict:
-        today = date.today().isoformat()
-        entries = audit_logger.get_entries(
-            start_date=start or today,
-            end_date=end or today,
-            user_id=None,
-            action=None,
-        )
-        return {
-            "total": len(entries),
-            "items": [e.model_dump(mode="json") for e in entries[:limit]],
-        }
-    @app.post("/audit/verify", tags=["audit"])
-    async def audit_verify(
-        user: Annotated[UserContext, Depends(_require_role("admin"))],
-        start: str | None = None,
-        end: str | None = None,
-    ) -> dict:
-        result = audit_logger.verify_chain(start_date=start, end_date=end)
-        return result
-    from pydantic import BaseModel as _PydBM
-    class _TokenRequest(_PydBM):
-        """Identity payload accepted by the dev ``/token`` endpoint."""
-        user_id: str
-        org_id: str = ""
-        roles: list[str] = []
-        clearance_level: int = 1
-        ttl_seconds: int | None = None
-    class _TokenResponse(_PydBM):
-        access_token: str
-        token_type: str = "bearer"
-        expires_in: int
-    @app.post("/token", response_model=_TokenResponse, tags=["auth"])
-    async def issue_dev_token(body: _TokenRequest) -> _TokenResponse:
-        """Mint a signed JWT for local testing.
-        In production the IdP (Keycloak / Auth0 / Microsoft Entra) issues the
-        token externally and this endpoint is removed via the
-        ``SAR_DISABLE_DEV_TOKEN`` flag — kept here so the e2e smoke script
-        and the Streamlit demo can mint a real token rather than the
-        unsigned base64 fallback.
-        """
-        if settings.jwt_algorithm.upper() == "RS256":
-            raise HTTPException(
-                status.HTTP_404_NOT_FOUND,
-                "Dev token endpoint disabled in RS256 mode — use the external IdP",
-            )
-        if not settings.jwt_secret:
-            raise HTTPException(
-                status.HTTP_503_SERVICE_UNAVAILABLE,
-                "SAR_JWT_SECRET is not configured; token endpoint disabled",
-            )
-        try:
-            token = issue_token(
-                user_id=body.user_id,
-                org_id=body.org_id,
-                roles=body.roles,
-                clearance_level=body.clearance_level,
-                ttl_seconds=body.ttl_seconds,
-            )
-        except AuthError as exc:
-            raise HTTPException(
-                status.HTTP_500_INTERNAL_SERVER_ERROR, f"token_issue_{exc.reason}: {exc}"
-            ) from exc
-        return _TokenResponse(
-            access_token=token,
-            token_type="bearer",
-            expires_in=body.ttl_seconds or settings.jwt_ttl_seconds,
-        )
-        try:
-            token = issue_token(
-                user_id=body.user_id,
-                org_id=body.org_id,
-                roles=body.roles,
-                clearance_level=body.clearance_level,
-                ttl_seconds=body.ttl_seconds,
-            )
-        except AuthError as exc:
-            raise HTTPException(
-                status.HTTP_500_INTERNAL_SERVER_ERROR, f"token_issue_{exc.reason}: {exc}"
-            ) from exc
-        return _TokenResponse(
-            access_token=token,
-            expires_in=body.ttl_seconds or settings.jwt_ttl_seconds,
-        )
-else:  # pragma: no cover
-    app = None  # type: ignore[assignment]
-def mint_dev_token(user: dict) -> str:
-    """Convenience for local testing — build a bearer token for a UserContext dict.
-    When ``SAR_JWT_SECRET`` is configured this mints a real signed JWT; with
-    no secret it falls back to the legacy unsigned base64 shape so existing
-    test fixtures keep working.
-    """
-    if settings.jwt_secret:
-        try:
-            return issue_token(
-                user_id=user.get("user_id", ""),
-                org_id=user.get("org_id", ""),
-                roles=list(user.get("roles", [])),
-                clearance_level=int(user.get("clearance_level", 1)),
-            )
-        except AuthError:
-            # Fall through to legacy shape on issuer error.
-            pass
-    payload = json.dumps(user).encode("utf-8")
-    return base64.b64encode(payload).decode("ascii")

+"""FastAPI surface for SecureAgentRAG.
+Run with::
+    uv run uvicorn interfaces.api:app --host 0.0.0.0 --port 8080
+Endpoints
+---------
+- ``GET  /healthz``     — liveness probe (no auth).
+- ``GET  /readyz``      — readiness — pings Qdrant + Ollama.
+- ``POST /query``       — run the RAG pipeline; returns ``QueryResponse``.
+- ``POST /ingest``      — ingest a local file; requires ``user`` role.
+- ``GET  /audit``       — read paginated audit entries; requires ``admin``.
+- ``POST /audit/verify``— verify the hash-chain; requires ``admin``.
+Auth uses a stateless bearer token, so the API has no session store — the
+caller provides identity on every request. Tokens are checked by
+``utils.auth.verify_token``: an HS256 JWT when ``SAR_JWT_SECRET`` is set,
+otherwise the legacy unsigned base64-encoded JSON ``UserContext`` (with a
+runtime warning). Production deployments should swap this for Keycloak/Auth0
+JWT verification (left as a hook in ``_resolve_user``).
+"""
+from __future__ import annotations
+import base64
+import json
+from datetime import date
+from typing import Annotated
+from config.settings import settings
+from utils.auth import AuthError, issue_token, verify_token
+from utils.logging import get_logger
+logger = get_logger(__name__)
+try:
+    from fastapi import Depends, FastAPI, Header, HTTPException, status
+    from fastapi.responses import JSONResponse
+    _FASTAPI_AVAILABLE = True
+except ImportError:  # pragma: no cover
+    _FASTAPI_AVAILABLE = False
+    Depends = Header = FastAPI = HTTPException = JSONResponse = status = None  # type: ignore[assignment]
+if _FASTAPI_AVAILABLE:
+    from core.graph import run_rag_pipeline
+    from core.schemas import (
+        IngestRequestModel,
+        IngestResponseModel,
+        QueryRequest,
+        QueryResponse,
+    )
+    from ingestion.metadata import IngestRequest, SensitivityLevel, UserContext
+    from utils.audit import audit_logger
+    from utils.health import run_health_checks
+    from utils.rate_limiter import RateLimiter
+    rate_limiter = RateLimiter()  # uses default token-bucket config
+    _AUTH_ERROR_STATUS: dict[str, int] = {
+        "missing": status.HTTP_401_UNAUTHORIZED,
+        "malformed": status.HTTP_401_UNAUTHORIZED,
+        "expired": status.HTTP_401_UNAUTHORIZED,
+        "bad_signature": status.HTTP_401_UNAUTHORIZED,
+        "bad_claims": status.HTTP_403_FORBIDDEN,
+    }
+    def _resolve_user_full(
+        authorization: Annotated[str | None, Header()] = None,
+    ) -> tuple[UserContext, dict]:
+        """Verify the bearer token and return (UserContext, claims).
+        Delegates to :func:`utils.auth.verify_token`, which uses HS256 JWT
+        when ``SAR_JWT_SECRET`` is set and falls back to the legacy unsigned
+        base64 token otherwise (with a runtime warning).
+        Args:
+            authorization: Raw ``Authorization`` header value, if any.
+        Returns:
+            The ``(UserContext, claims)`` pair produced by ``verify_token``.
+        Raises:
+            HTTPException: 401 when the header is absent, is not a ``Bearer``
+                credential, or carries an empty token; otherwise the status
+                mapped from the ``AuthError`` reason via ``_AUTH_ERROR_STATUS``
+                (401 for unknown reasons).
+        """
+        # Split "<scheme> <token>" once; extra padding around the token is
+        # stripped so "Bearer   abc" is treated the same as "Bearer abc".
+        scheme, _, token = (authorization or "").partition(" ")
+        token = token.strip()
+        if scheme.lower() != "bearer" or not token:
+            raise HTTPException(status.HTTP_401_UNAUTHORIZED, "missing bearer token")
+        try:
+            return verify_token(token)
+        except AuthError as exc:
+            code = _AUTH_ERROR_STATUS.get(exc.reason, status.HTTP_401_UNAUTHORIZED)
+            raise HTTPException(code, f"auth_{exc.reason}: {exc}") from exc
+    def _resolve_user(authorization: Annotated[str | None, Header()] = None) -> UserContext:
+        """Dependency yielding only the ``UserContext`` half of ``_resolve_user_full``.
+        Kept for callers that do not need the token claims.
+        """
+        resolved = _resolve_user_full(authorization=authorization)
+        user_ctx, _ = resolved
+        return user_ctx
+    def _require_role(required: str):
+        """Build a dependency that admits callers holding ``required`` or ``admin``.
+        The returned dependency resolves the caller through ``_resolve_user``
+        and raises a 403 ``HTTPException`` when neither role is present.
+        """
+        def _dep(user: Annotated[UserContext, Depends(_resolve_user)]) -> UserContext:
+            granted = user.roles
+            if required in granted or "admin" in granted:
+                return user
+            raise HTTPException(status.HTTP_403_FORBIDDEN, f"role '{required}' required")
+        return _dep
+    # ASGI application object; every route below is registered on it.
+    app = FastAPI(
+        title="SecureAgentRAG API",
+        version="0.1.0",
+        description="Privacy-first multi-agent RAG with RBAC, guardrails, and audit chain.",
+    )
+    # Initialize Phoenix tracing if configured.
+    # When ``settings.byok_mode`` is on, ``setup_tracing`` short-circuits to
+    # False regardless of phoenix_endpoint (see utils/observability.py).
+    from utils.observability import setup_tracing
+    _tracing_enabled = setup_tracing()
+    if _tracing_enabled:
+        logger.info("phoenix_tracing_active_in_api")
+    # ── BYOK CORS middleware ─────────────────────────────────────────────
+    # Only mount CORS when:
+    #   1) BYOK mode is on (public demo path), AND
+    #   2) an explicit allowlist is configured via SAR_CORS_ALLOW_ORIGINS.
+    # Empty allowlist + BYOK = wildcard would be a footgun (CSRF surface).
+    # Empty allowlist + dev = no CORS needed (local same-origin).
+    # Consequence of the condition below: BYOK mode with an empty allowlist
+    # mounts no CORS middleware at all.
+    if settings.byok_mode and settings.cors_allow_origins:
+        from fastapi.middleware.cors import CORSMiddleware
+        app.add_middleware(
+            CORSMiddleware,
+            allow_origins=list(settings.cors_allow_origins),
+            allow_credentials=False,  # BYOK never uses cookies
+            allow_methods=["GET", "POST", "OPTIONS"],
+            # Wildcard headers: presumably so the custom BYOK request headers
+            # pass preflight — TODO confirm an explicit list is not preferred.
+            allow_headers=["*"],
+        )
+        logger.info("byok_cors_enabled", origins=list(settings.cors_allow_origins))
+    @app.get("/healthz", tags=["ops"])
+    async def healthz() -> dict[str, str]:
+        """Liveness probe: returns a constant ``ok`` payload and checks nothing."""
+        liveness: dict[str, str] = {"status": "ok"}
+        return liveness
+    @app.get("/readyz", tags=["ops"])
+    async def readyz() -> JSONResponse:
+        """Readiness probe: the health report as JSON, 200 if healthy, else 503."""
+        health = await run_health_checks()
+        if health.overall_healthy:
+            http_code = 200
+        else:
+            http_code = 503
+        return JSONResponse(health.to_dict(), status_code=http_code)
+    # ── BYOK demo endpoint ───────────────────────────────────────────────
+    # Mounted only when ``settings.byok_mode`` is on. Bypasses JWT auth and
+    # uses per-request BYOK credentials instead. Isolation is enforced via
+    # session-scoped Qdrant collections, not JWT identity.
+    if settings.byok_mode:
+        from interfaces.byok import ByokCreds, extract_byok
+        from utils.rate_limiter import get_owner_key_throttle
+        # All demo personas share ``org_id="demo"`` so they query the same
+        # ingested corpus. RBAC differentiation is enforced via clearance
+        # level + roles at the payload-filter layer -- exactly the production
+        # invariant we want to demonstrate.
+        _DEMO_ORG_ID = "demo"
+        # Sensitivity levels are LOW=1, MEDIUM=2, HIGH=3 (see
+        # ``ingestion/metadata.py::sensitivity_to_int``). Clearance levels must
+        # be in the same range so the Qdrant range filter passes the right
+        # chunks. Engineer < Compliance == Executive, but executive carries
+        # a wider role set (sees both engineering + compliance content).
+        _DEMO_PERSONAS: dict[str, dict] = {
+            "engineer": {
+                "clearance_level": 2,
+                "roles": ["engineering"],
+            },
+            "compliance": {
+                "clearance_level": 3,
+                "roles": ["compliance", "legal"],
+            },
+            "executive": {
+                "clearance_level": 3,
+                "roles": ["executive", "compliance", "engineering"],
+            },
+        }
+        def _persona_to_user_ctx(creds: ByokCreds) -> UserContext:
+            """Build the synthetic demo ``UserContext`` for ``creds.demo_persona``.
+            The persona name is matched case-insensitively against
+            ``_DEMO_PERSONAS``. A missing or unrecognised persona gets
+            clearance 1 with the single ``viewer`` role, so the demo still
+            answers but only at the lowest clearance.
+            """
+            persona_key = (creds.demo_persona or "").lower()
+            if persona_key in _DEMO_PERSONAS:
+                profile = _DEMO_PERSONAS[persona_key]
+            else:
+                profile = {"clearance_level": 1, "roles": ["viewer"]}
+            return UserContext(
+                user_id=f"demo-{creds.session_id}",
+                org_id=_DEMO_ORG_ID,
+                clearance_level=profile["clearance_level"],
+                roles=profile["roles"],
+            )
+        from pydantic import BaseModel as _ByokBaseModel
+        class _ByokChatBody(_ByokBaseModel):
+            """Public-demo chat payload — no auth fields, only the question text."""
+            # Question text; passed to ``run_rag_pipeline`` as ``query``.
+            query: str
+            # Forwarded unchanged as the pipeline's ``prefer_cloud`` argument.
+            prefer_cloud: bool = True
+        # Runtime import — FastAPI dependency injection reads the annotation
+        # at request time, so this must NOT be a TYPE_CHECKING-only import.
+        from fastapi import Request as _FastApiRequest  # noqa: TC002
+        @app.post("/byok/chat", tags=["byok"])
+        async def byok_chat_endpoint(
+            request: _FastApiRequest,
+            body: _ByokChatBody,
+            creds: Annotated[ByokCreds, Depends(extract_byok)],
+        ) -> dict:
+            """Public-demo chat endpoint backed by BYOK credentials.
+            Routing:
+            - Visitor brought a key (``creds.has_user_key()``): pipeline uses
+              the visitor's provider + key. No throttle.
+            - Visitor did NOT bring a key: pipeline falls back to the owner's
+              configured cloud provider key, gated by ``OwnerKeyHourThrottle``.
+              When exhausted, returns 429 with copy nudging BYOK.
+            Persona maps to a synthetic ``UserContext`` so the existing RBAC
+            filter still runs end-to-end — same code path as authenticated
+            queries, just with demo identities.
+            Returns:
+                A dict with ``session_id``, ``persona`` (``"anonymous"`` when
+                none was sent), ``byok_used`` and the JSON-dumped
+                ``QueryResponse`` under ``response``.
+            Raises:
+                HTTPException: 429 when the owner-key throttle rejects the
+                    caller's address.
+            """
+            # Only keyless visitors consume the owner's quota, so only they
+            # pass through the throttle.
+            if not creds.has_user_key():
+                throttle = get_owner_key_throttle()
+                # NOTE(review): ``request.client.host`` is the direct peer. Behind
+                # a reverse proxy this is presumably the proxy's address, which
+                # would put every visitor in one bucket — confirm for the
+                # deployment target.
+                client_ip = (request.client.host if request.client else None) or "anon"
+                ok, meta = throttle.allow(client_ip)
+                if not ok:
+                    raise HTTPException(
+                        status.HTTP_429_TOO_MANY_REQUESTS,
+                        detail={
+                            "reason": meta["reason"],
+                            "retry_after_seconds": meta["retry_after"],
+                            "hint": (
+                                "Owner-key fallback exhausted for this IP. "
+                                "Paste your own LLM key to continue — your key "
+                                "is never stored server-side."
+                            ),
+                        },
+                    )
+            user_ctx = _persona_to_user_ctx(creds)
+            # NOTE(review): only the provider name is forwarded in this call;
+            # ``creds.user_key`` is not passed. Confirm the pipeline receives the
+            # visitor's key by some other route — otherwise keyed requests would
+            # run on the owner key while skipping the throttle above.
+            state = await run_rag_pipeline(
+                query=body.query,
+                user_context=user_ctx,
+                thread_id=f"byok-{creds.session_id}",
+                prefer_cloud=body.prefer_cloud,
+                # Visitor's chosen provider when present; falls back to env.
+                override_provider=creds.safe_provider(),
+            )
+            response = QueryResponse.from_state(state)
+            return {
+                "session_id": creds.session_id,
+                "persona": creds.demo_persona or "anonymous",
+                "byok_used": creds.has_user_key(),
+                "response": response.model_dump(mode="json"),
+            }
+    @app.post("/query", response_model=QueryResponse, tags=["rag"])
+    async def query_endpoint(
+        body: QueryRequest,
+        auth: Annotated[tuple[UserContext, dict], Depends(_resolve_user_full)],
+    ) -> QueryResponse:
+        """Run the RAG pipeline for an authenticated caller.
+        The per-user rate limit is checked first, then the body's ``user_id``
+        is compared with the token identity.
+        Raises:
+            HTTPException: 429 when the caller is rate limited, 403 when
+                ``body.user_id`` differs from the authenticated user.
+        """
+        caller, token_claims = auth
+        allowed = rate_limiter.is_allowed(f"{caller.user_id}:query")
+        if not allowed:
+            raise HTTPException(status.HTTP_429_TOO_MANY_REQUESTS, "rate limit exceeded")
+        # The identity in the body has to be the one the bearer token proves.
+        if body.user_id != caller.user_id:
+            raise HTTPException(status.HTTP_403_FORBIDDEN, "user_id mismatch")
+        # The token id goes into the thread id so a query can be traced back to
+        # the token that authorised it; tokens without a ``jti`` claim fall back
+        # to "unsigned".
+        token_id = token_claims.get("jti", "unsigned")
+        pipeline_state = await run_rag_pipeline(
+            query=body.query,
+            user_context=caller,
+            thread_id=f"api-{caller.user_id}-{token_id}",
+            prefer_cloud=body.prefer_cloud,
+            override_provider=body.override_provider,
+        )
+        return QueryResponse.from_state(pipeline_state)
+    @app.post("/ingest", response_model=IngestResponseModel, tags=["rag"])
+    async def ingest_endpoint(
+        body: IngestRequestModel,
+        user: Annotated[UserContext, Depends(_require_role("user"))],
+    ) -> IngestResponseModel:
+        """Ingest one document on behalf of the authenticated caller.
+        Requires the ``user`` (or ``admin``) role. The ingestion pipeline is
+        assembled from the services held by the shared hybrid searcher.
+        Raises:
+            HTTPException: 403 when ``body.user_id`` differs from the
+                authenticated user.
+        """
+        if body.user_id != user.user_id:
+            raise HTTPException(status.HTTP_403_FORBIDDEN, "user_id mismatch")
+        # Imported lazily inside the handler rather than at module import time.
+        from core.agents.retriever import _get_hybrid_searcher
+        from ingestion.pipeline import IngestionPipeline
+        searcher = _get_hybrid_searcher()
+        # Reaches into the searcher's private attributes to reuse its services.
+        pipeline = IngestionPipeline(
+            qdrant_manager=searcher._qdrant,  # type: ignore[attr-defined]
+            embedding_service=searcher._embeddings,  # type: ignore[attr-defined]
+            sparse_service=searcher._sparse,  # type: ignore[attr-defined]
+        )
+        # NOTE(review): ``org_id``, ``roles`` and ``sensitivity_level`` are taken
+        # from the request body as-is; only ``user_id`` is checked against the
+        # token. Confirm tenant scoping is enforced downstream.
+        # NOTE(review): ``file_path`` is caller-supplied — confirm the pipeline
+        # restricts which paths it will read.
+        req = IngestRequest(
+            file_path=body.file_path,
+            user_id=body.user_id,
+            org_id=body.org_id,
+            sensitivity_level=SensitivityLevel(body.sensitivity_level),
+            roles=body.roles,
+        )
+        result = await pipeline.ingest_document(req)
+        return IngestResponseModel(
+            file_path=result.file_path,
+            status=result.status,
+            num_chunks=result.num_chunks,
+            point_ids=result.point_ids,
+            errors=result.errors,
+            processing_time_seconds=result.processing_time_seconds,
+        )
+    @app.get("/audit", tags=["audit"])
+    async def audit_list(
+        user: Annotated[UserContext, Depends(_require_role("admin"))],
+        start: str | None = None,
+        end: str | None = None,
+        limit: int = 100,
+    ) -> dict:
+        """List audit entries for a date range (admin only).
+        Args:
+            user: Authenticated admin; present only to enforce the role check.
+            start: Start date passed to the audit logger; defaults to today.
+            end: End date passed to the audit logger; defaults to today.
+            limit: Maximum number of entries returned. Values below zero are
+                treated as zero.
+        Returns:
+            ``{"total": <entries in range>, "items": [<first ``limit`` entries>]}``.
+        """
+        today = date.today().isoformat()
+        entries = audit_logger.get_entries(
+            start_date=start or today,
+            end_date=end or today,
+            user_id=None,
+            action=None,
+        )
+        # A negative limit would slice from the end (``entries[:-1]`` is almost
+        # the whole list), so clamp it at zero.
+        page = entries[: max(limit, 0)]
+        return {
+            "total": len(entries),
+            "items": [e.model_dump(mode="json") for e in page],
+        }
+    @app.post("/audit/verify", tags=["audit"])
+    async def audit_verify(
+        user: Annotated[UserContext, Depends(_require_role("admin"))],
+        start: str | None = None,
+        end: str | None = None,
+    ) -> dict:
+        """Verify the audit chain for the given date range (admin only).
+        Returns whatever ``audit_logger.verify_chain`` reports, unchanged.
+        """
+        return audit_logger.verify_chain(start_date=start, end_date=end)
+    from pydantic import BaseModel as _PydBM
+    class _TokenRequest(_PydBM):
+        """Identity payload accepted by the dev ``/token`` endpoint."""
+        user_id: str
+        org_id: str = ""
+        roles: list[str] = []
+        clearance_level: int = 1
+        # None (or 0) makes the response report ``settings.jwt_ttl_seconds``.
+        ttl_seconds: int | None = None
+    class _TokenResponse(_PydBM):
+        """Bearer-token envelope returned by the dev ``/token`` endpoint."""
+        access_token: str
+        token_type: str = "bearer"
+        # Token lifetime in seconds as reported by ``issue_dev_token``.
+        expires_in: int
+    @app.post("/token", response_model=_TokenResponse, tags=["auth"])
+    async def issue_dev_token(body: _TokenRequest) -> _TokenResponse:
+        """Mint a signed JWT for local testing.
+        In production the IdP (Keycloak / Auth0 / Microsoft Entra) issues the
+        token externally and this endpoint is removed via the
+        ``SAR_DISABLE_DEV_TOKEN`` flag — kept here so the e2e smoke script
+        and the Streamlit demo can mint a real token rather than the
+        unsigned base64 fallback.
+        NOTE(review): this handler does not check any such flag, and the route
+        takes no auth dependency — confirm the flag is enforced where the
+        route is registered before exposing it publicly.
+        Raises:
+            HTTPException: 404 in RS256 mode, 503 when no JWT secret is
+                configured, 500 when ``issue_token`` raises ``AuthError``.
+        """
+        if settings.jwt_algorithm.upper() == "RS256":
+            raise HTTPException(
+                status.HTTP_404_NOT_FOUND,
+                "Dev token endpoint disabled in RS256 mode — use the external IdP",
+            )
+        if not settings.jwt_secret:
+            raise HTTPException(
+                status.HTTP_503_SERVICE_UNAVAILABLE,
+                "SAR_JWT_SECRET is not configured; token endpoint disabled",
+            )
+        try:
+            token = issue_token(
+                user_id=body.user_id,
+                org_id=body.org_id,
+                roles=body.roles,
+                clearance_level=body.clearance_level,
+                ttl_seconds=body.ttl_seconds,
+            )
+        except AuthError as exc:
+            raise HTTPException(
+                status.HTTP_500_INTERNAL_SERVER_ERROR, f"token_issue_{exc.reason}: {exc}"
+            ) from exc
+        # A second, unreachable copy of the block above used to follow this
+        # return; it has been removed.
+        return _TokenResponse(
+            access_token=token,
+            token_type="bearer",
+            expires_in=body.ttl_seconds or settings.jwt_ttl_seconds,
+        )
+else:  # pragma: no cover
+    app = None  # type: ignore[assignment]
+def mint_dev_token(user: dict) -> str:
+    """Build a bearer token from a UserContext-shaped dict for local testing.
+    With ``settings.jwt_secret`` set, a signed token is requested from
+    ``issue_token``. Without a secret, or when the issuer raises
+    ``AuthError``, the result is the legacy token: the dict serialised as
+    JSON and base64-encoded.
+    """
+    def _legacy_token() -> str:
+        raw = json.dumps(user).encode("utf-8")
+        return base64.b64encode(raw).decode("ascii")
+    if not settings.jwt_secret:
+        return _legacy_token()
+    try:
+        return issue_token(
+            user_id=user.get("user_id", ""),
+            org_id=user.get("org_id", ""),
+            roles=list(user.get("roles", [])),
+            clearance_level=int(user.get("clearance_level", 1)),
+        )
+    except AuthError:
+        # Deliberate best-effort: an issuer error downgrades to the legacy shape.
+        return _legacy_token()

interfaces/byok.py CHANGED Viewed

@@ -1,166 +1,166 @@
-"""BYOK (Bring Your Own Key) request extraction for the public demo.
-Mounted on the FastAPI surface only when ``settings.byok_mode=True`` (production
-HF Space image). Extracts per-request LLM credentials and session identity from
-HTTP headers so the RAG pipeline can route to the visitor's own LLM provider
-and Qdrant collection.
-The extracted ``ByokCreds`` is **never persisted**:
-- API keys live only in the request scope (FastAPI dep dies after response)
-- ``utils.pii.redact`` strips key-shaped substrings from audit log entries
-- The frontend stores the key in ``localStorage`` and forwards it as a header;
-  cookies are forbidden (CSRF surface).
-See ``launch-plan/03-backend-byok.md`` and ``launch-plan/11-security-checklist.md``.
-"""
-from __future__ import annotations
-import hashlib
-import uuid
-from typing import TYPE_CHECKING
-from pydantic import BaseModel, ConfigDict, Field
-if TYPE_CHECKING:
-    from fastapi import Request
-# Header names the frontend sends.
-HDR_USER_KEY = "X-User-LLM-Key"
-HDR_USER_PROVIDER = "X-User-Provider"
-HDR_USER_OLLAMA_URL = "X-User-Ollama-URL"
-HDR_SESSION_ID = "X-Session-ID"
-HDR_DEMO_PERSONA = "X-Demo-Persona"
-# Supported provider literals carried in X-User-Provider.
-SUPPORTED_PROVIDERS: frozenset[str] = frozenset({"groq", "openai", "anthropic", "ollama"})
-class ByokCreds(BaseModel):
-    """Per-request BYOK credentials and session identity.
-    Attributes:
-        user_key: Visitor's own LLM provider API key. None means owner-key
-            fallback (subject to ``OwnerKeyHourThrottle``).
-        provider: Which LLM provider the ``user_key`` is for. Validated
-            against ``SUPPORTED_PROVIDERS``. None defaults to the platform
-            owner's configured ``cloud_provider``.
-        ollama_url: Visitor's Ollama instance URL when provider == "ollama".
-            Ignored otherwise.
-        session_id: Per-visitor session identifier. Drives the per-session
-            Qdrant collection name. Generated server-side when the visitor
-            does not provide one (first request of a session).
-        demo_persona: Optional preset RBAC profile for the public demo —
-            ``engineer`` / ``compliance`` / ``executive``. Translated to
-            ``UserContext`` downstream.
-    """
-    model_config = ConfigDict(frozen=True, str_strip_whitespace=True)
-    user_key: str | None = None
-    provider: str | None = None
-    ollama_url: str | None = None
-    session_id: str = Field(..., min_length=1, max_length=128)
-    demo_persona: str | None = None
-    def has_user_key(self) -> bool:
-        """True when the visitor brought their own LLM key.
-        Owner-key fallback (False) goes through the per-IP throttle; visitor
-        BYOK (True) bypasses it. Callers MUST consult this before deciding to
-        consume the owner-key quota.
-        """
-        return bool(self.user_key and self.user_key.strip())
-    def safe_provider(self) -> str | None:
-        """Return ``provider`` if it is in the allowlist, else None."""
-        if self.provider and self.provider.lower() in SUPPORTED_PROVIDERS:
-            return self.provider.lower()
-        return None
-def _derive_session_id(client_host: str | None) -> str:
-    """Generate a deterministic-but-non-identifying session ID.
-    Falls back to a short hash of the client host + a random UUID. The hash
-    keeps the same session sticky if the visitor reconnects within the same
-    UVicorn worker; the random UUID ensures cross-worker / cross-restart
-    isolation. The full UUID flavour stays server-side — we never expose
-    raw IP addresses in the collection name.
-    """
-    host = (client_host or "anon").strip() or "anon"
-    digest = hashlib.sha256(host.encode("utf-8")).hexdigest()[:8]
-    random = uuid.uuid4().hex[:8]
-    return f"{digest}-{random}"
-def build_creds(
-    *,
-    user_key: str | None,
-    provider: str | None,
-    ollama_url: str | None,
-    session_id: str | None,
-    demo_persona: str | None,
-    client_host: str | None,
-) -> ByokCreds:
-    """Pure factory — builds ``ByokCreds`` from raw header values.
-    Separated from the FastAPI dependency so it is unit-testable without
-    spinning up a Request object. Whitespace-trims every input; generates
-    ``session_id`` server-side when the client omitted it.
-    """
-    return ByokCreds(
-        user_key=(user_key or None),
-        provider=(provider or None),
-        ollama_url=(ollama_url or None),
-        session_id=(session_id or "").strip() or _derive_session_id(client_host),
-        demo_persona=(demo_persona or None),
-    )
-# ── FastAPI integration ──────────────────────────────────────────────────────
-# Header annotations live in this branch so the module can be imported in
-# environments where fastapi is not installed (e.g. lightweight unit tests).
-try:
-    # Runtime imports — FastAPI dependency injection reads annotations at
-    # request time, so these must NOT live in a TYPE_CHECKING-only block.
-    from fastapi import Header, Request  # noqa: TC002
-    _FASTAPI_AVAILABLE = True
-except ImportError:  # pragma: no cover
-    _FASTAPI_AVAILABLE = False
-    def Header(*_a: object, **_kw: object) -> None:  # type: ignore[no-redef]  # noqa: N802 — keep FastAPI's name
-        """No-op shim when FastAPI is not installed (lint-only env)."""
-        return None
-if _FASTAPI_AVAILABLE:
-    from typing import Annotated
-    def extract_byok(
-        request: Request,
-        x_user_llm_key: Annotated[str | None, Header()] = None,
-        x_user_provider: Annotated[str | None, Header()] = None,
-        x_user_ollama_url: Annotated[str | None, Header()] = None,
-        x_session_id: Annotated[str | None, Header()] = None,
-        x_demo_persona: Annotated[str | None, Header()] = None,
-    ) -> ByokCreds:
-        """FastAPI dependency: extract per-request BYOK credentials.
-        Pure data extraction — authentication, throttling, and routing
-        decisions happen downstream so they can be unit-tested independently
-        of FastAPI's request lifecycle.
-        """
-        host = request.client.host if request.client else None
-        return build_creds(
-            user_key=x_user_llm_key,
-            provider=x_user_provider,
-            ollama_url=x_user_ollama_url,
-            session_id=x_session_id,
-            demo_persona=x_demo_persona,
-            client_host=host,
-        )

+"""BYOK (Bring Your Own Key) request extraction for the public demo.
+Mounted on the FastAPI surface only when ``settings.byok_mode=True`` (production
+HF Space image). Extracts per-request LLM credentials and session identity from
+HTTP headers so the RAG pipeline can route to the visitor's own LLM provider
+and Qdrant collection.
+The extracted ``ByokCreds`` is **never persisted**:
+- API keys live only in the request scope (FastAPI dep dies after response)
+- ``utils.pii.redact`` strips key-shaped substrings from audit log entries
+- The frontend stores the key in ``localStorage`` and forwards it as a header;
+  cookies are forbidden (CSRF surface).
+See ``launch-plan/03-backend-byok.md`` and ``launch-plan/11-security-checklist.md``.
+"""
+from __future__ import annotations
+import hashlib
+import uuid
+from typing import TYPE_CHECKING
+from pydantic import BaseModel, ConfigDict, Field
+if TYPE_CHECKING:
+    from fastapi import Request
+# Header names the frontend sends.
+HDR_USER_KEY = "X-User-LLM-Key"
+HDR_USER_PROVIDER = "X-User-Provider"
+HDR_USER_OLLAMA_URL = "X-User-Ollama-URL"
+HDR_SESSION_ID = "X-Session-ID"
+HDR_DEMO_PERSONA = "X-Demo-Persona"
+# Supported provider literals carried in X-User-Provider.
+SUPPORTED_PROVIDERS: frozenset[str] = frozenset({"groq", "openai", "anthropic", "ollama"})
+class ByokCreds(BaseModel):
+    """Immutable per-request BYOK credentials plus session identity.
+    String fields are whitespace-stripped by the model configuration.
+    Attributes:
+        user_key: The visitor's own LLM API key; None selects the owner-key
+            fallback (subject to ``OwnerKeyHourThrottle``).
+        provider: Provider the key belongs to. Read it through
+            :meth:`safe_provider`, which applies ``SUPPORTED_PROVIDERS``.
+        ollama_url: Visitor's Ollama instance URL, relevant when the
+            provider is ``ollama``.
+        session_id: Per-visitor session identifier, 1 to 128 characters.
+        demo_persona: Optional preset RBAC profile name for the public demo
+            (``engineer`` / ``compliance`` / ``executive``).
+    """
+    model_config = ConfigDict(frozen=True, str_strip_whitespace=True)
+    user_key: str | None = None
+    provider: str | None = None
+    ollama_url: str | None = None
+    session_id: str = Field(..., min_length=1, max_length=128)
+    demo_persona: str | None = None
+    def has_user_key(self) -> bool:
+        """Report whether the visitor supplied a non-blank LLM key.
+        Callers use this to decide between the visitor's key (True) and the
+        throttled owner-key fallback (False).
+        """
+        key = self.user_key
+        return key is not None and key.strip() != ""
+    def safe_provider(self) -> str | None:
+        """Return the lower-cased provider when allowlisted, otherwise None."""
+        if not self.provider:
+            return None
+        normalized = self.provider.lower()
+        return normalized if normalized in SUPPORTED_PROVIDERS else None
+def _derive_session_id(client_host: str | None) -> str:
+    """Generate a fresh session id that does not expose the client address.
+    The id is ``<host tag>-<nonce>``: the host tag is the first 8 hex digits
+    of the SHA-256 of the trimmed host (``"anon"`` when the host is missing
+    or blank), and the nonce is 8 hex digits from a new random UUID. Only
+    the host tag repeats between calls for the same host; the nonce is new
+    on every call.
+    """
+    normalized_host = (client_host or "").strip()
+    if not normalized_host:
+        normalized_host = "anon"
+    host_tag = hashlib.sha256(normalized_host.encode("utf-8")).hexdigest()[:8]
+    nonce = uuid.uuid4().hex[:8]
+    return "-".join((host_tag, nonce))
+def build_creds(
+    *,
+    user_key: str | None,
+    provider: str | None,
+    ollama_url: str | None,
+    session_id: str | None,
+    demo_persona: str | None,
+    client_host: str | None,
+) -> ByokCreds:
+    """Assemble ``ByokCreds`` from raw header values without needing a Request.
+    Empty strings are normalised to None for the optional fields. A missing
+    or blank ``session_id`` is replaced by a server-generated one derived
+    from ``client_host``.
+    """
+    resolved_session = (session_id or "").strip()
+    if not resolved_session:
+        resolved_session = _derive_session_id(client_host)
+    return ByokCreds(
+        user_key=user_key if user_key else None,
+        provider=provider if provider else None,
+        ollama_url=ollama_url if ollama_url else None,
+        session_id=resolved_session,
+        demo_persona=demo_persona if demo_persona else None,
+    )
+# ── FastAPI integration ──────────────────────────────────────────────────────
+# Header annotations live in this branch so the module can be imported in
+# environments where fastapi is not installed (e.g. lightweight unit tests).
+try:
+    # Runtime imports — FastAPI dependency injection reads annotations at
+    # request time, so these must NOT live in a TYPE_CHECKING-only block.
+    from fastapi import Header, Request  # noqa: TC002
+    _FASTAPI_AVAILABLE = True
+except ImportError:  # pragma: no cover
+    _FASTAPI_AVAILABLE = False
+    def Header(*_a: object, **_kw: object) -> None:  # type: ignore[no-redef]  # noqa: N802 — keep FastAPI's name
+        """No-op shim when FastAPI is not installed (lint-only env)."""
+        return None
+if _FASTAPI_AVAILABLE:
+    from typing import Annotated
+    def extract_byok(
+        request: Request,
+        x_user_llm_key: Annotated[str | None, Header()] = None,
+        x_user_provider: Annotated[str | None, Header()] = None,
+        x_user_ollama_url: Annotated[str | None, Header()] = None,
+        x_session_id: Annotated[str | None, Header()] = None,
+        x_demo_persona: Annotated[str | None, Header()] = None,
+    ) -> ByokCreds:
+        """FastAPI dependency that turns the BYOK request headers into ``ByokCreds``.
+        It only gathers data: the header values and the peer address (when
+        the request has one) are passed straight to ``build_creds``.
+        """
+        peer = request.client
+        peer_host = peer.host if peer else None
+        return build_creds(
+            user_key=x_user_llm_key,
+            provider=x_user_provider,
+            ollama_url=x_user_ollama_url,
+            session_id=x_session_id,
+            demo_persona=x_demo_persona,
+            client_host=peer_host,
+        )

retrieval/multitenancy.py CHANGED Viewed

@@ -1,43 +1,43 @@
-"""Multi-tenancy utilities for Qdrant collection naming."""
-from __future__ import annotations
-from config.settings import settings
-def _sanitize(s: str) -> str:
-    """Coerce ``s`` to a Qdrant-safe identifier (alnum + underscore only)."""
-    return "".join(c if c.isalnum() else "_" for c in s)
-def get_collection_name(
-    org_id: str | None = None,
-    *,
-    session_id: str | None = None,
-) -> str:
-    """Return the Qdrant collection name for a given org or BYOK session.
-    Resolution order:
-    1. **BYOK mode** (``settings.byok_mode=True``) with ``session_id`` →
-       returns ``"{base}_sess_{sanitized_session}"``. Session-scoped
-       collections isolate each visitor's uploads.
-    2. **Multi-tenant** (``settings.multi_tenant_collections=True``) with
-       ``org_id`` → returns ``"{base}_{sanitized_org}"``.
-    3. **Single-tenant** (default) → returns ``settings.qdrant_collection``.
-    Args:
-        org_id: Organisation identifier (multi-tenant mode).
-        session_id: Per-visitor session UUID (BYOK mode). Takes priority over
-            ``org_id`` when both are set and BYOK is on, because BYOK is the
-            stricter isolation boundary.
-    Returns:
-        Collection name string suitable for QdrantManager.
-    """
-    base = settings.qdrant_collection
-    if settings.byok_mode and session_id:
-        return f"{base}_sess_{_sanitize(session_id)}"
-    if not settings.multi_tenant_collections or not org_id:
-        return base
-    return f"{base}_{_sanitize(org_id)}"

+"""Multi-tenancy utilities for Qdrant collection naming."""
+from __future__ import annotations
+from config.settings import settings
+def _sanitize(s: str) -> str:
+    """Replace every character of ``s`` that fails ``str.isalnum`` with ``_``."""
+    safe_chars = [ch if ch.isalnum() else "_" for ch in s]
+    return "".join(safe_chars)
+def get_collection_name(
+    org_id: str | None = None,
+    *,
+    session_id: str | None = None,
+) -> str:
+    """Pick the Qdrant collection name for an org or a BYOK session.
+    Checked in this order:
+    1. ``settings.byok_mode`` on and ``session_id`` given →
+       ``"{base}_sess_{sanitized_session}"``.
+    2. ``settings.multi_tenant_collections`` on and ``org_id`` given →
+       ``"{base}_{sanitized_org}"``.
+    3. Otherwise → ``settings.qdrant_collection`` unchanged.
+    Args:
+        org_id: Organisation identifier (multi-tenant mode).
+        session_id: Per-visitor session id (BYOK mode); wins over ``org_id``
+            when BYOK is on and both are supplied.
+    Returns:
+        The collection name.
+    """
+    base = settings.qdrant_collection
+    use_session = bool(settings.byok_mode and session_id)
+    if use_session:
+        return f"{base}_sess_{_sanitize(session_id)}"
+    if settings.multi_tenant_collections and org_id:
+        return f"{base}_{_sanitize(org_id)}"
+    return base

retrieval/session_purge.py CHANGED Viewed

@@ -1,185 +1,185 @@
-"""Per-session Qdrant collection purge for BYOK mode.
-In BYOK mode each visitor's uploads land in a collection named
-``documents_sess_<sanitized_session_id>``. Without a cleanup pass these
-collections accumulate until the 1 GB Qdrant Cloud free tier fills up.
-This module provides:
-- :func:`purge_expired_sessions` — synchronous, idempotent sweep that
-  deletes collections whose creation timestamp is older than
-  ``settings.session_collection_ttl_hours``.
-- :func:`schedule_session_purge` — APScheduler hook the FastAPI lifespan
-  calls so the sweep runs every 6 hours inside the same process. No
-  separate cron container required.
-The creation timestamp is read from Qdrant's
-``CollectionInfo.config.params.metadata`` (set at create-time by the
-ingestion pipeline). Collections without a creation timestamp are treated
-as legacy and **skipped** — we never delete data we can't date.
-See ``launch-plan/03-backend-byok.md`` § Session purge cron.
-"""
-from __future__ import annotations
-from datetime import UTC, datetime, timedelta
-from typing import TYPE_CHECKING, Any
-from config.settings import settings
-from utils.logging import get_logger
-if TYPE_CHECKING:
-    from qdrant_client import QdrantClient
-logger = get_logger(__name__)
-SESSION_COLLECTION_PREFIX = "_sess_"
-"""Suffix introduced into the collection name by ``get_collection_name`` when
-``byok_mode`` is on and a ``session_id`` is supplied. Used here to filter the
-purge sweep to BYOK collections only — multi-tenant org collections are NOT
-touched."""
-def _session_collection_prefix() -> str:
-    """Concrete prefix for the current base collection (e.g. ``documents_sess_``)."""
-    return f"{settings.qdrant_collection}{SESSION_COLLECTION_PREFIX}"
-def _is_session_collection(name: str) -> bool:
-    """True iff ``name`` was emitted by ``get_collection_name`` with a session_id."""
-    return name.startswith(_session_collection_prefix())
-def _parse_created_at(meta: dict[str, Any] | None) -> datetime | None:
-    """Return the collection's recorded creation datetime, or None if missing.
-    The ingestion pipeline writes ``created_at`` as an ISO-8601 UTC string into
-    the collection's metadata payload when first creating a session
-    collection. Older collections lack the field — those are intentionally
-    skipped to avoid deleting data we cannot date.
-    """
-    if not meta:
-        return None
-    raw = meta.get("created_at")
-    if not raw:
-        return None
-    try:
-        # Accept both ``2026-05-26T13:00:00+00:00`` and trailing ``Z`` forms.
-        return datetime.fromisoformat(str(raw).replace("Z", "+00:00"))
-    except (TypeError, ValueError):
-        logger.warning("session_purge_bad_timestamp", value=str(raw))
-        return None
-def purge_expired_sessions(
-    client: QdrantClient,
-    *,
-    ttl_hours: int | None = None,
-    now: datetime | None = None,
-) -> dict[str, Any]:
-    """Delete BYOK session collections older than the TTL.
-    Args:
-        client: Live ``QdrantClient`` (cloud or local).
-        ttl_hours: Override ``settings.session_collection_ttl_hours``. Tests
-            pass small values; production uses the default 24.
-        now: Override the clock for deterministic tests.
-    Returns:
-        Summary dict with counts (``inspected``, ``deleted``, ``skipped``,
-        ``errors``) suitable for emission to the audit log.
-    """
-    ttl = ttl_hours if ttl_hours is not None else settings.session_collection_ttl_hours
-    horizon = (now or datetime.now(UTC)) - timedelta(hours=ttl)
-    inspected = deleted = skipped = errors = 0
-    deleted_names: list[str] = []
-    try:
-        collections = client.get_collections().collections
-    except Exception as exc:
-        logger.error("session_purge_list_failed", error=str(exc))
-        return {"inspected": 0, "deleted": 0, "skipped": 0, "errors": 1}
-    for col in collections:
-        name = col.name
-        if not _is_session_collection(name):
-            continue
-        inspected += 1
-        try:
-            info = client.get_collection(name)
-            meta = getattr(info.config.params, "metadata", None) or {}
-            created = _parse_created_at(meta)
-            if created is None:
-                # Undated -> skip; we don't delete what we can't time-stamp.
-                skipped += 1
-                continue
-            if created < horizon:
-                client.delete_collection(name)
-                deleted += 1
-                deleted_names.append(name)
-                logger.info(
-                    "session_purge_deleted",
-                    collection=name,
-                    created_at=created.isoformat(),
-                    age_hours=round((horizon - created).total_seconds() / 3600.0 + ttl, 1),
-                )
-            else:
-                skipped += 1
-        except Exception as exc:
-            errors += 1
-            logger.warning("session_purge_collection_failed", collection=name, error=str(exc))
-    summary = {
-        "inspected": inspected,
-        "deleted": deleted,
-        "skipped": skipped,
-        "errors": errors,
-        "deleted_names": deleted_names,
-        "ttl_hours": ttl,
-    }
-    logger.info(
-        "session_purge_summary", **{k: v for k, v in summary.items() if k != "deleted_names"}
-    )
-    return summary
-# ── FastAPI lifespan hook ────────────────────────────────────────────────────
-def schedule_session_purge(client: QdrantClient, *, interval_hours: int = 6) -> Any | None:
-    """Start an APScheduler job that runs :func:`purge_expired_sessions` periodically.
-    Called from the FastAPI ``lifespan`` context manager. Returns the
-    ``AsyncIOScheduler`` instance (or None when APScheduler is not
-    installed — we then run as a single-shot at startup so at least one
-    sweep happens per restart).
-    """
-    if not settings.byok_mode:
-        logger.debug("session_purge_not_scheduled", reason="byok_mode is off")
-        return None
-    try:
-        from apscheduler.schedulers.asyncio import (
-            AsyncIOScheduler,  # type: ignore[import-not-found]
-        )
-    except ImportError:
-        # Optional dep absent: at least sweep once so the Space does not
-        # accumulate indefinitely on long uptimes.
-        logger.warning("apscheduler_missing", action="single-shot purge instead")
-        purge_expired_sessions(client)
-        return None
-    scheduler = AsyncIOScheduler()
-    scheduler.add_job(
-        purge_expired_sessions,
-        "interval",
-        hours=interval_hours,
-        args=[client],
-        id="byok-session-purge",
-        replace_existing=True,
-    )
-    scheduler.start()
-    logger.info("session_purge_scheduled", every_hours=interval_hours)
-    return scheduler

+"""Per-session Qdrant collection purge for BYOK mode.
+In BYOK mode each visitor's uploads land in a collection named
+``documents_sess_<sanitized_session_id>``. Without a cleanup pass these
+collections accumulate until the 1 GB Qdrant Cloud free tier fills up.
+This module provides:
+- :func:`purge_expired_sessions` — synchronous, idempotent sweep that
+  deletes collections whose creation timestamp is older than
+  ``settings.session_collection_ttl_hours``.
+- :func:`schedule_session_purge` — APScheduler hook the FastAPI lifespan
+  calls so the sweep runs every 6 hours inside the same process. No
+  separate cron container required.
+The creation timestamp is read from Qdrant's
+``CollectionInfo.config.params.metadata`` (set at create-time by the
+ingestion pipeline). Collections without a creation timestamp are treated
+as legacy and **skipped** — we never delete data we can't date.
+See ``launch-plan/03-backend-byok.md`` § Session purge cron.
+"""
+from __future__ import annotations
+from datetime import UTC, datetime, timedelta
+from typing import TYPE_CHECKING, Any
+from config.settings import settings
+from utils.logging import get_logger
+if TYPE_CHECKING:
+    from qdrant_client import QdrantClient
+logger = get_logger(__name__)
+# Marker placed between the base collection name and the sanitized session id
+# (``<base>_sess_<session>``). Despite the constant's name it ends up in the
+# middle of the final collection name; ``_session_collection_prefix`` below
+# builds the real prefix from it. The same literal is hard-coded in the
+# f-string of ``get_collection_name`` (retrieval/multitenancy.py), so the two
+# must be changed together.
+SESSION_COLLECTION_PREFIX = "_sess_"
+"""Suffix introduced into the collection name by ``get_collection_name`` when
+``byok_mode`` is on and a ``session_id`` is supplied. Used here to filter the
+purge sweep to BYOK collections only — multi-tenant org collections are NOT
+touched."""
+def _session_collection_prefix() -> str:
+    """Build the name prefix shared by all session collections.
+    The prefix is ``settings.qdrant_collection`` followed by
+    ``SESSION_COLLECTION_PREFIX`` (for a base of ``documents`` that is
+    ``documents_sess_``). It is recomputed from settings on every call.
+    """
+    base = settings.qdrant_collection
+    return f"{base}{SESSION_COLLECTION_PREFIX}"
+def _is_session_collection(name: str) -> bool:
+    """Tell whether ``name`` starts with the session-collection prefix.
+    This is a pure prefix test against ``_session_collection_prefix()``.
+    NOTE(review): any other collection whose name happens to begin with the
+    same prefix would also match — confirm no such names are produced.
+    """
+    prefix = _session_collection_prefix()
+    return name.startswith(prefix)
+def _parse_created_at(meta: dict[str, Any] | None) -> datetime | None:
+    """Extract the ``created_at`` timestamp from collection metadata.
+    The value under ``created_at`` is converted with ``str`` and parsed by
+    ``datetime.fromisoformat``; a ``Z`` in the text is rewritten to
+    ``+00:00`` first so both spellings of UTC are accepted. The returned
+    datetime is timezone-aware only if the stored text carried an offset.
+    Args:
+        meta: Metadata mapping of a collection, or None.
+    Returns:
+        The parsed datetime, or None when ``meta`` is empty, the key is
+        missing or falsy, or the text cannot be parsed (the last case is
+        also logged as ``session_purge_bad_timestamp``).
+    """
+    raw = (meta or {}).get("created_at")
+    if not raw:
+        return None
+    text = str(raw)
+    try:
+        # Normalise the ``Z`` spelling of UTC to an explicit offset.
+        parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
+    except (TypeError, ValueError):
+        logger.warning("session_purge_bad_timestamp", value=text)
+        return None
+    return parsed
+def purge_expired_sessions(
+    client: QdrantClient,
+    *,
+    ttl_hours: int | None = None,
+    now: datetime | None = None,
+) -> dict[str, Any]:
+    """Delete BYOK session collections older than the TTL.
+    Only collections whose name passes ``_is_session_collection`` are
+    inspected. A collection is deleted when the ``created_at`` value in its
+    metadata is strictly older than ``now - ttl``. Collections without a
+    parseable ``created_at`` are counted as skipped and left in place.
+    Naive datetimes (``now`` or a stored ``created_at`` without a UTC offset)
+    are interpreted as UTC so they can be compared with aware ones instead of
+    raising ``TypeError``.
+    Failures never propagate: a failed listing returns a summary with
+    ``errors=1``; a failure on a single collection is counted, logged, and
+    the sweep moves on to the next collection.
+    Args:
+        client: Live ``QdrantClient`` (cloud or local).
+        ttl_hours: Override ``settings.session_collection_ttl_hours``.
+        now: Override the clock for deterministic tests.
+    Returns:
+        Summary dict with the keys ``inspected``, ``deleted``, ``skipped``,
+        ``errors``, ``deleted_names`` and ``ttl_hours``. The same keys are
+        present on every return path.
+    """
+    ttl = ttl_hours if ttl_hours is not None else settings.session_collection_ttl_hours
+    reference = now or datetime.now(UTC)
+    if reference.tzinfo is None:
+        # Mixing naive and aware datetimes raises TypeError; pin to UTC.
+        reference = reference.replace(tzinfo=UTC)
+    horizon = reference - timedelta(hours=ttl)
+    inspected = deleted = skipped = errors = 0
+    deleted_names: list[str] = []
+    try:
+        collections = client.get_collections().collections
+    except Exception as exc:
+        logger.error("session_purge_list_failed", error=str(exc))
+        # Same shape as the regular summary so callers can index it blindly.
+        return {
+            "inspected": 0,
+            "deleted": 0,
+            "skipped": 0,
+            "errors": 1,
+            "deleted_names": deleted_names,
+            "ttl_hours": ttl,
+        }
+    for col in collections:
+        name = col.name
+        if not _is_session_collection(name):
+            continue
+        inspected += 1
+        try:
+            info = client.get_collection(name)
+            meta = getattr(info.config.params, "metadata", None) or {}
+            created = _parse_created_at(meta)
+            if created is None:
+                # Undated -> skip; we don't delete what we can't time-stamp.
+                skipped += 1
+                continue
+            if created.tzinfo is None:
+                # Offset-less timestamps are treated as UTC (see docstring).
+                created = created.replace(tzinfo=UTC)
+            if created >= horizon:
+                # Still inside the TTL window.
+                skipped += 1
+                continue
+            client.delete_collection(name)
+            deleted += 1
+            deleted_names.append(name)
+            logger.info(
+                "session_purge_deleted",
+                collection=name,
+                created_at=created.isoformat(),
+                age_hours=round((reference - created).total_seconds() / 3600.0, 1),
+            )
+        except Exception as exc:
+            # Best effort: one bad collection must not abort the sweep.
+            errors += 1
+            logger.warning("session_purge_collection_failed", collection=name, error=str(exc))
+    summary = {
+        "inspected": inspected,
+        "deleted": deleted,
+        "skipped": skipped,
+        "errors": errors,
+        "deleted_names": deleted_names,
+        "ttl_hours": ttl,
+    }
+    # The name list can be long, so it is kept out of the summary log line.
+    logger.info(
+        "session_purge_summary", **{k: v for k, v in summary.items() if k != "deleted_names"}
+    )
+    return summary
+# ── FastAPI lifespan hook ────────────────────────────────────────────────────
+def schedule_session_purge(client: QdrantClient, *, interval_hours: int = 6) -> Any | None:
+    """Register the periodic BYOK session sweep on an ``AsyncIOScheduler``.
+    Outcome by environment:
+    - ``settings.byok_mode`` is off: nothing is scheduled and None is returned.
+    - APScheduler cannot be imported: :func:`purge_expired_sessions` is run
+      once, synchronously, and None is returned.
+    - Otherwise: an interval job with id ``byok-session-purge`` is added
+      (replacing any job with that id), the scheduler is started and
+      returned. This function never shuts the scheduler down.
+    Args:
+        client: Qdrant client handed to every sweep.
+        interval_hours: Hours between two sweeps.
+    Returns:
+        The started scheduler, or None when no recurring job was created.
+    """
+    if not settings.byok_mode:
+        logger.debug("session_purge_not_scheduled", reason="byok_mode is off")
+        return None
+    try:
+        from apscheduler.schedulers.asyncio import (
+            AsyncIOScheduler,  # type: ignore[import-not-found]
+        )
+    except ImportError:
+        # Without the optional dependency, fall back to one sweep per start-up.
+        logger.warning("apscheduler_missing", action="single-shot purge instead")
+        purge_expired_sessions(client)
+        return None
+    job_options = {
+        "hours": interval_hours,
+        "args": [client],
+        "id": "byok-session-purge",
+        "replace_existing": True,
+    }
+    sweeper = AsyncIOScheduler()
+    sweeper.add_job(purge_expired_sessions, "interval", **job_options)
+    sweeper.start()
+    logger.info("session_purge_scheduled", every_hours=interval_hours)
+    return sweeper

utils/observability.py CHANGED Viewed

@@ -1,252 +1,252 @@
-"""Observability setup using Arize Phoenix for LLM tracing.
-Provides OpenTelemetry-compatible distributed tracing for LLM calls,
-retrieval operations, and LangGraph execution. Gracefully degrades
-when Phoenix is not installed or configured.
-Usage:
-    Call setup_tracing() once at application startup (e.g., in app/main.py).
-    All trace_* functions will automatically emit spans when tracing is enabled.
-"""
-from __future__ import annotations
-from config.settings import settings
-from utils.logging import get_logger
-_log = get_logger(__name__)
-# Module-level state
-_tracer = None
-_phoenix_configured = False
-_phoenix_project_name: str = settings.app_name
-def setup_tracing() -> bool:
-    """Initialize Phoenix tracing if ``settings.phoenix_endpoint`` is set.
-    This function is safe to call unconditionally at startup — it will
-    log a message and return immediately if Phoenix is not configured.
-    Tracing failures never crash the application.
-    Returns:
-        True if tracing was successfully enabled, False otherwise.
-    """
-    global _tracer, _phoenix_configured, _phoenix_project_name
-    # BYOK mode mandates: no third-party telemetry sees a request. Phoenix
-    # spans capture LLM prompts and completions, which would include the
-    # visitor's keys-in-context and any private text they uploaded. Hard
-    # disable in BYOK regardless of phoenix_endpoint configuration.
-    if settings.byok_mode:
-        _log.info("phoenix_tracing_disabled", reason="BYOK mode forbids external telemetry")
-        return False
-    if not settings.phoenix_endpoint:
-        _log.info("phoenix_tracing_disabled", reason="No phoenix_endpoint configured")
-        return False
-    try:
-        from phoenix.otel import register
-        tracer_provider = register(
-            project_name=settings.app_name,
-            endpoint=settings.phoenix_endpoint,
-        )
-        # Attempt to instrument LLM and retrieval calls
-        _instrument_providers()
-        _phoenix_configured = True
-        _phoenix_project_name = settings.app_name
-        _log.info(
-            "phoenix_tracing_enabled",
-            endpoint=settings.phoenix_endpoint,
-            project=settings.app_name,
-            tracer_provider=str(tracer_provider),
-        )
-        return True
-    except ImportError:
-        _log.warning(
-            "phoenix_import_failed",
-            msg=(
-                "arize-phoenix not installed; tracing unavailable. "
-                "Install with: pip install 'arize-phoenix-otel'"
-            ),
-        )
-        return False
-    except Exception as exc:
-        _log.error(
-            "phoenix_tracing_init_error",
-            error=str(exc),
-            endpoint=settings.phoenix_endpoint,
-        )
-        return False
-def _instrument_providers() -> None:
-    """Instrument LLM and retrieval providers with OpenTelemetry.
-    Attempts to auto-instrument supported providers. Failures are
-    logged but never raised — partial instrumentation is acceptable.
-    """
-    # Instrument LangChain/LangGraph if available
-    try:
-        from openinference.instrumentation.langchain import LangChainInstrumentor
-        LangChainInstrumentor().instrument()
-        _log.info("instrumented_langchain")
-    except ImportError:
-        _log.debug(
-            "langchain_instrumentation_skipped",
-            reason="openinference-instrumentation-langchain not installed",
-        )
-    except Exception as exc:
-        _log.debug("langchain_instrumentation_error", reason=str(exc))
-    # Instrument OpenAI-compatible calls if available
-    try:
-        from openinference.instrumentation.openai import OpenAIInstrumentor
-        OpenAIInstrumentor().instrument()
-        _log.info("instrumented_openai")
-    except ImportError:
-        _log.debug(
-            "openai_instrumentation_skipped",
-            reason="openinference-instrumentation-openai not installed",
-        )
-    except Exception as exc:
-        _log.debug("openai_instrumentation_error", reason=str(exc))
-def trace_llm_call(
-    provider: str,
-    model: str,
-    prompt: str,
-    response: str,
-    latency_ms: float,
-    tokens: dict[str, int] | None = None,
-) -> None:
-    """Record a manual trace span for an LLM call.
-    Can be used as an explicit trace point when auto-instrumentation
-    is unavailable or for custom tracking.
-    Args:
-        provider: LLM provider name (e.g., "ollama", "groq").
-        model: Model identifier used for generation.
-        prompt: The input prompt text.
-        response: The generated response text.
-        latency_ms: Response latency in milliseconds.
-        tokens: Optional token usage dict with keys like
-            "prompt_tokens", "completion_tokens", "total_tokens".
-    """
-    if not _phoenix_configured:
-        return
-    try:
-        from opentelemetry import trace
-        tracer = trace.get_tracer("secureagentrag.llm")
-        with tracer.start_as_current_span("llm_call") as span:
-            span.set_attribute("llm.provider", provider)
-            span.set_attribute("llm.model", model)
-            span.set_attribute("llm.prompt_length", len(prompt))
-            span.set_attribute("llm.response_length", len(response))
-            span.set_attribute("llm.latency_ms", latency_ms)
-            if tokens:
-                for key, value in tokens.items():
-                    span.set_attribute(f"llm.tokens.{key}", value)
-    except Exception as exc:
-        _log.debug("trace_llm_call_failed", error=str(exc))
-def trace_retrieval(
-    query: str,
-    num_results: int,
-    latency_ms: float,
-    method: str = "hybrid",
-) -> None:
-    """Record a manual trace span for a retrieval operation.
-    Args:
-        query: The search query string.
-        num_results: Number of results returned.
-        latency_ms: Retrieval latency in milliseconds.
-        method: Retrieval method used ("hybrid", "dense", "bm25").
-    """
-    if not _phoenix_configured:
-        return
-    try:
-        from opentelemetry import trace
-        tracer = trace.get_tracer("secureagentrag.retrieval")
-        with tracer.start_as_current_span("retrieval") as span:
-            span.set_attribute("retrieval.query_length", len(query))
-            span.set_attribute("retrieval.num_results", num_results)
-            span.set_attribute("retrieval.latency_ms", latency_ms)
-            span.set_attribute("retrieval.method", method)
-    except Exception as exc:
-        _log.debug("trace_retrieval_failed", error=str(exc))
-def trace_graph_execution(
-    query: str,
-    nodes_executed: list[str],
-    total_latency_ms: float,
-    final_confidence: float,
-    retries: int = 0,
-) -> None:
-    """Record a manual trace span for LangGraph pipeline execution.
-    Args:
-        query: The original user query.
-        nodes_executed: List of graph node names that were executed.
-        total_latency_ms: Total pipeline execution time in milliseconds.
-        final_confidence: Final confidence score of the generated answer.
-        retries: Number of corrective retrieval retries performed.
-    """
-    if not _phoenix_configured:
-        return
-    try:
-        from opentelemetry import trace
-        tracer = trace.get_tracer("secureagentrag.graph")
-        with tracer.start_as_current_span("graph_execution") as span:
-            span.set_attribute("graph.query_length", len(query))
-            span.set_attribute("graph.nodes_executed", ",".join(nodes_executed))
-            span.set_attribute("graph.total_latency_ms", total_latency_ms)
-            span.set_attribute("graph.confidence", final_confidence)
-            span.set_attribute("graph.retries", retries)
-    except Exception as exc:
-        _log.debug("trace_graph_execution_failed", error=str(exc))
-def get_trace_url() -> str | None:
-    """Return the Phoenix dashboard URL if tracing is configured.
-    Returns:
-        Phoenix UI URL string, or None if Phoenix is not configured.
-    """
-    if not _phoenix_configured or not settings.phoenix_endpoint:
-        return None
-    # Phoenix UI typically runs on the same host
-    endpoint = settings.phoenix_endpoint.rstrip("/")
-    # Replace gRPC/collector port with UI port if needed
-    if ":4317" in endpoint:
-        return endpoint.replace(":4317", ":6006")
-    if ":6006" in endpoint:
-        return endpoint
-    return endpoint
-def is_tracing_enabled() -> bool:
-    """Check if Phoenix tracing is currently active.
-    Returns:
-        True if tracing was successfully configured, False otherwise.
-    """
-    return _phoenix_configured

+"""Observability setup using Arize Phoenix for LLM tracing.
+Provides OpenTelemetry-compatible distributed tracing for LLM calls,
+retrieval operations, and LangGraph execution. Gracefully degrades
+when Phoenix is not installed or configured.
+Usage:
+    Call setup_tracing() once at application startup (e.g., in app/main.py).
+    All trace_* functions will automatically emit spans when tracing is enabled.
+"""
+from __future__ import annotations
+from config.settings import settings
+from utils.logging import get_logger
+_log = get_logger(__name__)
+# Module-level state
+# Declared ``global`` in ``setup_tracing`` but never assigned in the code
+# visible here, so it stays None. NOTE(review): possibly a leftover.
+_tracer = None
+# Set to True by ``setup_tracing`` after registration and instrumentation ran;
+# the trace_* helpers and ``get_trace_url`` return early while it is False.
+_phoenix_configured = False
+# Project name used for registration; starts as ``settings.app_name`` and is
+# rewritten by ``setup_tracing``. Not read elsewhere in the visible code.
+_phoenix_project_name: str = settings.app_name
+def setup_tracing() -> bool:
+    """Turn on Phoenix tracing when the configuration allows it.
+    Tracing stays off, with an info log naming the reason, when
+    ``settings.byok_mode`` is on or ``settings.phoenix_endpoint`` is empty.
+    Otherwise the Phoenix tracer provider is registered, the provider
+    instrumentors are attempted, and the module flag ``_phoenix_configured``
+    is set. Import errors and any other exception are logged and reported as
+    a False return value; nothing is raised to the caller.
+    Returns:
+        True if tracing was enabled by this call, False otherwise.
+    """
+    global _tracer, _phoenix_configured, _phoenix_project_name
+    # BYOK takes precedence over an endpoint being configured: in that mode no
+    # request data may reach third-party telemetry (stated rationale: spans
+    # could carry visitor prompts, keys-in-context and uploaded text).
+    disabled_reason: str | None = None
+    if settings.byok_mode:
+        disabled_reason = "BYOK mode forbids external telemetry"
+    elif not settings.phoenix_endpoint:
+        disabled_reason = "No phoenix_endpoint configured"
+    if disabled_reason is not None:
+        _log.info("phoenix_tracing_disabled", reason=disabled_reason)
+        return False
+    try:
+        from phoenix.otel import register
+        provider = register(
+            project_name=settings.app_name,
+            endpoint=settings.phoenix_endpoint,
+        )
+        # Best-effort auto-instrumentation; it logs its own failures.
+        _instrument_providers()
+        _phoenix_configured = True
+        _phoenix_project_name = settings.app_name
+        _log.info(
+            "phoenix_tracing_enabled",
+            endpoint=settings.phoenix_endpoint,
+            project=settings.app_name,
+            tracer_provider=str(provider),
+        )
+        return True
+    except ImportError:
+        install_hint = (
+            "arize-phoenix not installed; tracing unavailable. "
+            "Install with: pip install 'arize-phoenix-otel'"
+        )
+        _log.warning("phoenix_import_failed", msg=install_hint)
+        return False
+    except Exception as exc:
+        _log.error(
+            "phoenix_tracing_init_error",
+            error=str(exc),
+            endpoint=settings.phoenix_endpoint,
+        )
+        return False
+def _instrument_providers() -> None:
+    """Try to switch on the OpenInference instrumentors, one after another.
+    LangChain is attempted first, then OpenAI. Each attempt is independent:
+    a missing package is logged at debug level as ``*_skipped``, any other
+    failure as ``*_error``, and the next instrumentor is still tried.
+    Success is logged at info level.
+    """
+    def _enable_langchain() -> None:
+        from openinference.instrumentation.langchain import LangChainInstrumentor
+        LangChainInstrumentor().instrument()
+    def _enable_openai() -> None:
+        from openinference.instrumentation.openai import OpenAIInstrumentor
+        OpenAIInstrumentor().instrument()
+    # (enable, success event, skipped event, skipped reason, error event)
+    plan = (
+        (
+            _enable_langchain,
+            "instrumented_langchain",
+            "langchain_instrumentation_skipped",
+            "openinference-instrumentation-langchain not installed",
+            "langchain_instrumentation_error",
+        ),
+        (
+            _enable_openai,
+            "instrumented_openai",
+            "openai_instrumentation_skipped",
+            "openinference-instrumentation-openai not installed",
+            "openai_instrumentation_error",
+        ),
+    )
+    for enable, ok_event, skipped_event, skipped_reason, error_event in plan:
+        try:
+            enable()
+            _log.info(ok_event)
+        except ImportError:
+            _log.debug(skipped_event, reason=skipped_reason)
+        except Exception as exc:
+            _log.debug(error_event, reason=str(exc))
+def trace_llm_call(
+    provider: str,
+    model: str,
+    prompt: str,
+    response: str,
+    latency_ms: float,
+    tokens: dict[str, int] | None = None,
+) -> None:
+    """Emit one ``llm_call`` span describing a finished LLM request.
+    Returns immediately while ``_phoenix_configured`` is False. Only the
+    lengths of ``prompt`` and ``response`` are attached to the span, never
+    their text. Any exception raised while tracing is logged at debug level
+    and swallowed.
+    Args:
+        provider: LLM provider name (e.g., "ollama", "groq").
+        model: Identifier of the model that produced the response.
+        prompt: Prompt text; only its length is recorded.
+        response: Generated text; only its length is recorded.
+        latency_ms: Response latency in milliseconds.
+        tokens: Optional usage counters; each entry becomes an
+            ``llm.tokens.<key>`` attribute.
+    """
+    if not _phoenix_configured:
+        return
+    try:
+        from opentelemetry import trace
+        llm_tracer = trace.get_tracer("secureagentrag.llm")
+        with llm_tracer.start_as_current_span("llm_call") as llm_span:
+            llm_span.set_attribute("llm.provider", provider)
+            llm_span.set_attribute("llm.model", model)
+            llm_span.set_attribute("llm.prompt_length", len(prompt))
+            llm_span.set_attribute("llm.response_length", len(response))
+            llm_span.set_attribute("llm.latency_ms", latency_ms)
+            for usage_key, usage_count in (tokens or {}).items():
+                llm_span.set_attribute(f"llm.tokens.{usage_key}", usage_count)
+    except Exception as exc:
+        _log.debug("trace_llm_call_failed", error=str(exc))
+def trace_retrieval(
+    query: str,
+    num_results: int,
+    latency_ms: float,
+    method: str = "hybrid",
+) -> None:
+    """Emit one ``retrieval`` span describing a finished search.
+    Returns immediately while ``_phoenix_configured`` is False. The query
+    text itself is not attached to the span, only its length. Any exception
+    raised while tracing is logged at debug level and swallowed.
+    Args:
+        query: Search query; only its length is recorded.
+        num_results: Number of results returned.
+        latency_ms: Retrieval latency in milliseconds.
+        method: Retrieval method label ("hybrid", "dense", "bm25").
+    """
+    if not _phoenix_configured:
+        return
+    try:
+        from opentelemetry import trace
+        retrieval_tracer = trace.get_tracer("secureagentrag.retrieval")
+        with retrieval_tracer.start_as_current_span("retrieval") as retrieval_span:
+            retrieval_span.set_attribute("retrieval.query_length", len(query))
+            retrieval_span.set_attribute("retrieval.num_results", num_results)
+            retrieval_span.set_attribute("retrieval.latency_ms", latency_ms)
+            retrieval_span.set_attribute("retrieval.method", method)
+    except Exception as exc:
+        _log.debug("trace_retrieval_failed", error=str(exc))
+def trace_graph_execution(
+    query: str,
+    nodes_executed: list[str],
+    total_latency_ms: float,
+    final_confidence: float,
+    retries: int = 0,
+) -> None:
+    """Emit one ``graph_execution`` span summarising a pipeline run.
+    Returns immediately while ``_phoenix_configured`` is False. The query is
+    recorded by length only, and the node names are joined into a single
+    comma-separated attribute. Any exception raised while tracing is logged
+    at debug level and swallowed.
+    Args:
+        query: Original user query; only its length is recorded.
+        nodes_executed: Names of the graph nodes that ran.
+        total_latency_ms: Total pipeline execution time in milliseconds.
+        final_confidence: Confidence score of the generated answer.
+        retries: Number of corrective retrieval retries performed.
+    """
+    if not _phoenix_configured:
+        return
+    try:
+        from opentelemetry import trace
+        graph_tracer = trace.get_tracer("secureagentrag.graph")
+        with graph_tracer.start_as_current_span("graph_execution") as graph_span:
+            graph_span.set_attribute("graph.query_length", len(query))
+            node_list = ",".join(nodes_executed)
+            graph_span.set_attribute("graph.nodes_executed", node_list)
+            graph_span.set_attribute("graph.total_latency_ms", total_latency_ms)
+            graph_span.set_attribute("graph.confidence", final_confidence)
+            graph_span.set_attribute("graph.retries", retries)
+    except Exception as exc:
+        _log.debug("trace_graph_execution_failed", error=str(exc))
+def get_trace_url() -> str | None:
+    """Derive a Phoenix dashboard URL from the configured endpoint.
+    Trailing slashes are stripped from ``settings.phoenix_endpoint`` and every
+    occurrence of ``:4317`` is replaced by ``:6006``. Endpoints without
+    ``:4317`` (including ones already on ``:6006``) are returned as stripped.
+    NOTE(review): any path component of the endpoint is kept as-is — confirm
+    the result is a valid UI address for the deployments in use.
+    Returns:
+        The derived URL, or None while tracing is not configured or no
+        endpoint is set.
+    """
+    if not (_phoenix_configured and settings.phoenix_endpoint):
+        return None
+    base_url = settings.phoenix_endpoint.rstrip("/")
+    # ``replace`` leaves the string unchanged when ``:4317`` does not occur.
+    return base_url.replace(":4317", ":6006")
+def is_tracing_enabled() -> bool:
+    """Report the current value of the module flag ``_phoenix_configured``.
+    Returns:
+        True once ``setup_tracing`` has enabled tracing, False before that
+        or when setup was skipped or failed.
+    """
+    return _phoenix_configured

utils/pii.py CHANGED Viewed

@@ -1,146 +1,146 @@
-"""PII redaction for secondary stores (audit log, query cache, conversation history).
-Two strategies:
-- Regex-based (always available) — covers email, phone, SSN, credit card,
-  IBAN, IPv4, URL with credentials.
-- Microsoft Presidio (optional dependency) — invoked when installed; higher
-  recall and language-aware NER for names, locations, organizations.
-This module never sees plaintext from the LLM — it operates only on text that
-is about to be persisted to disk. Live prompts and retrieved contexts remain
-unmodified so model quality is not affected.
-"""
-from __future__ import annotations
-import re
-from typing import Any
-from config.settings import settings
-from utils.logging import get_logger
-logger = get_logger(__name__)
-# Order matters — most specific patterns first so they win against the
-# broader phone regex. Provider-specific API-key shapes (added 2026-05-26
-# for BYOK mode) live ABOVE the generic ``[API_KEY]`` rule because their
-# prefixes are not catchable by the legacy ``(sk|pk|api|key)`` alternation.
-_REGEX_PATTERNS: list[tuple[re.Pattern[str], str]] = [
-    (re.compile(r"https?://[^\s/]+:[^\s/]+@[^\s]+"), "[URL_WITH_CREDS]"),
-    (re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), "[EMAIL]"),
-    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[SSN]"),
-    (re.compile(r"\b(?:\d[ -]*?){13,19}\b"), "[CC]"),  # Luhn-validated below
-    (re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{10,30}\b"), "[IBAN]"),
-    (re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"), "[IP]"),
-    # ── BYOK key shapes (P6 production launch) ──────────────────────────
-    # Anthropic must come BEFORE OpenAI because ``sk-ant-...`` also matches
-    # the generic ``sk-...`` rule below.
-    (re.compile(r"\bsk-ant-[A-Za-z0-9_-]{20,}\b"), "[API_KEY]"),
-    (re.compile(r"\bsk-(?:proj|svcacct)-[A-Za-z0-9_-]{20,}\b"), "[API_KEY]"),
-    (re.compile(r"\bgsk_[A-Za-z0-9]{40,}\b"), "[API_KEY]"),
-    (re.compile(r"\bhf_[A-Za-z0-9]{30,}\b"), "[API_KEY]"),
-    (re.compile(r"\bvcp_[A-Za-z0-9]{20,}\b"), "[API_KEY]"),
-    # JWT-format database API keys (Qdrant Cloud auth v2). Three dot-separated
-    # base64url segments — the middle one is always ``eyJ...`` start.
-    (re.compile(r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b"), "[API_KEY]"),
-    # Qdrant Cloud management keys: ``<uuid>|<token>``.
-    (
-        re.compile(
-            r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\|[A-Za-z0-9_-]{20,}\b"
-        ),
-        "[API_KEY]",
-    ),
-    # Legacy generic — keeps catching ``sk-...`` and ``api_...`` shapes from
-    # older docs and tests.
-    (re.compile(r"\b(?:sk|pk|api|key)[-_][A-Za-z0-9_-]{16,}\b", re.IGNORECASE), "[API_KEY]"),
-    (re.compile(r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{3,4}\b"), "[PHONE]"),
-]
-# Try Presidio for richer detection (names, locations, etc.)
-try:
-    from presidio_analyzer import AnalyzerEngine  # type: ignore[import-not-found]
-    from presidio_anonymizer import AnonymizerEngine  # type: ignore[import-not-found]
-    _PRESIDIO_AVAILABLE = True
-    _analyzer: Any = AnalyzerEngine()
-    _anonymizer: Any = AnonymizerEngine()
-except Exception:
-    _PRESIDIO_AVAILABLE = False
-    _analyzer = None
-    _anonymizer = None
-def _luhn_valid(num: str) -> bool:
-    """Luhn checksum to filter false-positive credit-card matches."""
-    digits = [int(c) for c in num if c.isdigit()]
-    if not (13 <= len(digits) <= 19):
-        return False
-    s = 0
-    for i, d in enumerate(reversed(digits)):
-        if i % 2 == 1:
-            d *= 2
-            if d > 9:
-                d -= 9
-        s += d
-    return s % 10 == 0
-def redact(text: str) -> str:
-    """Return ``text`` with PII tokens masked.
-    Args:
-        text: Arbitrary string that may contain PII.
-    Returns:
-        Redacted copy of the text. If redaction is disabled via settings
-        the original string is returned unchanged.
-    """
-    if not settings.pii_redaction_enabled or not text:
-        return text
-    out = text
-    for pattern, token in _REGEX_PATTERNS:
-        if token == "[CC]":
-            # Apply Luhn to avoid over-masking phone numbers / arbitrary digits.
-            out = pattern.sub(lambda m: "[CC]" if _luhn_valid(m.group(0)) else m.group(0), out)
-        else:
-            out = pattern.sub(token, out)
-    if _PRESIDIO_AVAILABLE and _analyzer is not None and _anonymizer is not None:
-        try:
-            results = _analyzer.analyze(text=out, language="en")
-            if results:
-                out = _anonymizer.anonymize(text=out, analyzer_results=results).text
-        except Exception as exc:
-            logger.debug("presidio_redact_failed", error=str(exc))
-    return out
-def redact_dict(data: dict[str, Any], fields: tuple[str, ...] | None = None) -> dict[str, Any]:
-    """Recursively redact string values in a dict.
-    Args:
-        data: Dict (possibly nested) to redact.
-        fields: If given, only redact these top-level keys. Otherwise redact
-            every string in the structure.
-    Returns:
-        Deep-redacted copy.
-    """
-    if not settings.pii_redaction_enabled:
-        return data
-    def _walk(value: Any, *, force: bool) -> Any:
-        if isinstance(value, str):
-            return redact(value) if force else value
-        if isinstance(value, dict):
-            return {
-                k: _walk(v, force=force or (fields is not None and k in fields))
-                for k, v in value.items()
-            }
-        if isinstance(value, list):
-            return [_walk(v, force=force) for v in value]
-        return value
-    return _walk(data, force=fields is None)

+"""PII redaction for secondary stores (audit log, query cache, conversation history).
+Two strategies:
+- Regex-based (always available) — covers email, phone, SSN, credit card,
+  IBAN, IPv4, URL with credentials.
+- Microsoft Presidio (optional dependency) — invoked when installed; higher
+  recall and language-aware NER for names, locations, organizations.
+This module never sees plaintext from the LLM — it operates only on text that
+is about to be persisted to disk. Live prompts and retrieved contexts remain
+unmodified so model quality is not affected.
+"""
+from __future__ import annotations
+import re
+from typing import Any
+from config.settings import settings
+from utils.logging import get_logger
+logger = get_logger(__name__)
+# Ordered (pattern, placeholder) pairs applied one after another by
+# ``redact``; each pattern runs on the output of the previous one.
+# Order matters — most specific patterns first so they win against the
+# broader phone regex. Provider-specific API-key shapes (added 2026-05-26
+# for BYOK mode) live ABOVE the generic ``[API_KEY]`` rule because their
+# prefixes are not catchable by the legacy ``(sk|pk|api|key)`` alternation.
+_REGEX_PATTERNS: list[tuple[re.Pattern[str], str]] = [
+    # http(s) URLs that embed ``user:password@`` before the host.
+    (re.compile(r"https?://[^\s/]+:[^\s/]+@[^\s]+"), "[URL_WITH_CREDS]"),
+    # Email addresses (local part, ``@``, domain with a 2+ letter TLD).
+    (re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), "[EMAIL]"),
+    # Hyphenated 3-2-4 digit groups.
+    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[SSN]"),
+    # 13–19 digits, optionally separated by spaces or hyphens. ``redact``
+    # only masks a match when it also passes the Luhn checksum.
+    (re.compile(r"\b(?:\d[ -]*?){13,19}\b"), "[CC]"),  # Luhn-validated below
+    # Two uppercase letters, two digits, then 10–30 uppercase alphanumerics.
+    (re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{10,30}\b"), "[IBAN]"),
+    # Dotted quad of 1–3 digit groups; the 0–255 octet range is not checked.
+    (re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"), "[IP]"),
+    # ── BYOK key shapes (P6 production launch) ──────────────────────────
+    # Anthropic must come BEFORE OpenAI because ``sk-ant-...`` also matches
+    # the generic ``sk-...`` rule below.
+    (re.compile(r"\bsk-ant-[A-Za-z0-9_-]{20,}\b"), "[API_KEY]"),
+    (re.compile(r"\bsk-(?:proj|svcacct)-[A-Za-z0-9_-]{20,}\b"), "[API_KEY]"),
+    # Prefix-tagged provider keys (``gsk_``, ``hf_``, ``vcp_``) followed by a
+    # long alphanumeric body. NOTE(review): provider names are not stated in
+    # this file — presumably Groq / Hugging Face / other; confirm.
+    (re.compile(r"\bgsk_[A-Za-z0-9]{40,}\b"), "[API_KEY]"),
+    (re.compile(r"\bhf_[A-Za-z0-9]{30,}\b"), "[API_KEY]"),
+    (re.compile(r"\bvcp_[A-Za-z0-9]{20,}\b"), "[API_KEY]"),
+    # JWT-format database API keys (Qdrant Cloud auth v2). Three dot-separated
+    # base64url segments — the pattern requires the first two segments to
+    # start with ``eyJ``.
+    (re.compile(r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b"), "[API_KEY]"),
+    # Qdrant Cloud management keys: ``<uuid>|<token>``.
+    (
+        re.compile(
+            r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\|[A-Za-z0-9_-]{20,}\b"
+        ),
+        "[API_KEY]",
+    ),
+    # Legacy generic — keeps catching ``sk-...`` and ``api_...`` shapes from
+    # older docs and tests.
+    (re.compile(r"\b(?:sk|pk|api|key)[-_][A-Za-z0-9_-]{16,}\b", re.IGNORECASE), "[API_KEY]"),
+    # Broad phone shape: optional country code, 2–4 digit group (optionally
+    # parenthesised), then two 3–4 digit groups. Kept last because it is the
+    # least specific pattern in the table.
+    (re.compile(r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{3,4}\b"), "[PHONE]"),
+]
+# Try Presidio for richer detection (names, locations, etc.)
+# Both engines are constructed once, at import time. ``redact`` only runs
+# the Presidio pass when ``_PRESIDIO_AVAILABLE`` is true and both engine
+# objects are set, so a failure here leaves the regex table as the only
+# redaction strategy.
+try:
+    from presidio_analyzer import AnalyzerEngine  # type: ignore[import-not-found]
+    from presidio_anonymizer import AnonymizerEngine  # type: ignore[import-not-found]
+    _PRESIDIO_AVAILABLE = True
+    _analyzer: Any = AnalyzerEngine()
+    _anonymizer: Any = AnonymizerEngine()
+except Exception:
+    # Deliberately broad: covers a missing package as well as any error
+    # raised while building the engines. The flag is reset here because it
+    # may already have been set to True before an engine constructor failed.
+    _PRESIDIO_AVAILABLE = False
+    _analyzer = None
+    _anonymizer = None
+def _luhn_valid(num: str) -> bool:
+    """Check a candidate card number against the Luhn checksum.
+    Args:
+        num: Matched text; every non-digit character is ignored.
+    Returns:
+        True when the text holds 13–19 digits whose Luhn sum is a multiple
+        of 10, False otherwise.
+    """
+    digits = [int(ch) for ch in num if ch.isdigit()]
+    if len(digits) < 13 or len(digits) > 19:
+        return False
+    rightmost_first = digits[::-1]
+    # Digits at even offsets from the right count as-is.
+    untouched = sum(rightmost_first[0::2])
+    # Digits at odd offsets are doubled; a two-digit result (digit >= 5)
+    # is folded back to one digit by subtracting 9.
+    doubled = sum(d * 2 - 9 if d > 4 else d * 2 for d in rightmost_first[1::2])
+    return (untouched + doubled) % 10 == 0
+def redact(text: str) -> str:
+    """Mask PII in ``text`` using the regex table and, if loaded, Presidio.
+    Args:
+        text: Arbitrary string that may contain PII.
+    Returns:
+        The masked string. The input is returned unchanged when it is
+        empty or when redaction is switched off in settings.
+    """
+    if not (settings.pii_redaction_enabled and text):
+        return text
+    def _mask_card(match: re.Match[str]) -> str:
+        # Only digit runs that pass Luhn are treated as card numbers, so
+        # phone numbers and arbitrary digit strings are left for later rules.
+        candidate = match.group(0)
+        return "[CC]" if _luhn_valid(candidate) else candidate
+    masked = text
+    for regex, placeholder in _REGEX_PATTERNS:
+        replacement = _mask_card if placeholder == "[CC]" else placeholder
+        masked = regex.sub(replacement, masked)
+    presidio_ready = _PRESIDIO_AVAILABLE and _analyzer is not None and _anonymizer is not None
+    if not presidio_ready:
+        return masked
+    try:
+        findings = _analyzer.analyze(text=masked, language="en")
+        if findings:
+            masked = _anonymizer.anonymize(text=masked, analyzer_results=findings).text
+    except Exception as exc:
+        # Best effort: a Presidio failure keeps the regex-masked text.
+        logger.debug("presidio_redact_failed", error=str(exc))
+    return masked
+def redact_dict(data: dict[str, Any], fields: tuple[str, ...] | None = None) -> dict[str, Any]:
+    """Redact string values found anywhere inside a nested structure.
+    Dicts, lists and tuples are traversed recursively; values of any other
+    type are passed through untouched.
+    Args:
+        data: Dict (possibly nested) to redact.
+        fields: Key names whose values should be redacted. A key is matched
+            at any nesting depth, and everything nested below a matching
+            key is redacted too. When None, every string in the structure
+            is redacted.
+    Returns:
+        A copy of ``data`` with rebuilt containers and redacted strings.
+        When redaction is disabled via settings, ``data`` itself is
+        returned unchanged.
+    """
+    if not settings.pii_redaction_enabled:
+        return data
+    def _walk(value: Any, *, force: bool) -> Any:
+        if isinstance(value, str):
+            return redact(value) if force else value
+        if isinstance(value, dict):
+            return {
+                # Once a key matches, ``force`` stays on for its whole subtree.
+                k: _walk(v, force=force or (fields is not None and k in fields))
+                for k, v in value.items()
+            }
+        if isinstance(value, list):
+            return [_walk(v, force=force) for v in value]
+        if isinstance(value, tuple):
+            # Tuples can carry strings too; passing them through untouched
+            # would let PII reach the persisted record.
+            items = [_walk(v, force=force) for v in value]
+            # namedtuples are rebuilt via ``_make`` to keep their type;
+            # any other tuple comes back as a plain tuple.
+            return type(value)._make(items) if hasattr(value, "_fields") else tuple(items)
+        return value
+    return _walk(data, force=fields is None)

utils/rate_limiter.py CHANGED Viewed

@@ -1,524 +1,524 @@
-"""Token-bucket rate limiter for API request throttling.
-Provides per-user and per-endpoint rate limiting to prevent abuse and
-ensure fair resource allocation. Uses an in-memory token bucket algorithm
-with optional Redis backend for distributed deployments.
-"""
-from __future__ import annotations
-import time
-from dataclasses import dataclass, field
-from typing import Any
-from utils.logging import get_logger
-logger = get_logger(__name__)
-@dataclass
-class RateLimitConfig:
-    """Configuration for a rate limit bucket.
-    Attributes:
-        requests_per_minute: Maximum requests allowed per minute.
-        burst_size: Maximum burst capacity (bucket size).
-        cooldown_seconds: Seconds to wait after being rate limited.
-    """
-    requests_per_minute: int = 60
-    burst_size: int = 10
-    cooldown_seconds: float = 1.0
-@dataclass
-class TokenBucket:
-    """In-memory token bucket for rate limiting.
-    Attributes:
-        tokens: Current available tokens.
-        last_update: Timestamp of last token refill.
-        config: Rate limit configuration.
-        blocked_until: Timestamp when the bucket is unblocked.
-    """
-    tokens: float = field(default=0.0)
-    last_update: float = field(default_factory=time.time)
-    config: RateLimitConfig = field(default_factory=RateLimitConfig)
-    blocked_until: float = field(default=0.0)
-    def _refill(self) -> None:
-        """Refill tokens based on elapsed time since last update."""
-        now = time.time()
-        elapsed = now - self.last_update
-        # Refill rate: tokens per second
-        refill_rate = self.config.requests_per_minute / 60.0
-        self.tokens = min(self.config.burst_size, self.tokens + elapsed * refill_rate)
-        self.last_update = now
-    def consume(self, tokens: float = 1.0) -> tuple[bool, dict[str, Any]]:
-        """Attempt to consume tokens from the bucket.
-        Args:
-            tokens: Number of tokens to consume (default 1 per request).
-        Returns:
-            Tuple of (allowed, metadata) where metadata contains
-            remaining tokens, retry_after, etc.
-        """
-        now = time.time()
-        # Check if currently blocked
-        if now < self.blocked_until:
-            retry_after = int(self.blocked_until - now) + 1
-            return False, {
-                "allowed": False,
-                "remaining": 0,
-                "retry_after": retry_after,
-                "reason": "cooldown_active",
-            }
-        self._refill()
-        if self.tokens >= tokens:
-            self.tokens -= tokens
-            remaining = int(self.tokens)
-            return True, {
-                "allowed": True,
-                "remaining": remaining,
-                "retry_after": 0,
-                "reason": None,
-            }
-        # Rate limit exceeded — enter cooldown
-        self.blocked_until = now + self.config.cooldown_seconds
-        retry_after = int(self.config.cooldown_seconds) + 1
-        return False, {
-            "allowed": False,
-            "remaining": 0,
-            "retry_after": retry_after,
-            "reason": "rate_limit_exceeded",
-        }
-class RateLimiter:
-    """Multi-key rate limiter with per-user and per-endpoint tracking.
-    Uses in-memory token buckets. For distributed deployments, wrap
-    with a Redis-backed implementation.
-    Args:
-        default_config: Default rate limit configuration.
-    """
-    def __init__(self, default_config: RateLimitConfig | None = None) -> None:
-        """Initialize the rate limiter.
-        Args:
-            default_config: Default configuration for new buckets.
-        """
-        self._default_config = default_config or RateLimitConfig()
-        self._buckets: dict[str, TokenBucket] = {}
-    def _get_bucket(self, key: str, config: RateLimitConfig | None = None) -> TokenBucket:
-        """Get or create a token bucket for the given key.
-        Args:
-            key: Unique identifier for the bucket (e.g., user_id + endpoint).
-            config: Optional custom configuration.
-        Returns:
-            The token bucket for the key.
-        """
-        if key not in self._buckets:
-            self._buckets[key] = TokenBucket(
-                tokens=config.burst_size if config else self._default_config.burst_size,
-                config=config or self._default_config,
-            )
-        return self._buckets[key]
-    def check_rate_limit(
-        self,
-        key: str,
-        tokens: float = 1.0,
-        config: RateLimitConfig | None = None,
-    ) -> tuple[bool, dict[str, Any]]:
-        """Check if a request is within the rate limit.
-        Args:
-            key: Rate limit bucket key (e.g., "user_123:query").
-            tokens: Tokens to consume.
-            config: Optional custom config for this key.
-        Returns:
-            Tuple of (allowed, metadata).
-        """
-        bucket = self._get_bucket(key, config)
-        allowed, metadata = bucket.consume(tokens)
-        if not allowed:
-            logger.warning(
-                "rate_limit_exceeded",
-                key=key,
-                retry_after=metadata["retry_after"],
-                reason=metadata["reason"],
-            )
-        else:
-            logger.debug(
-                "rate_limit_allowed",
-                key=key,
-                remaining=metadata["remaining"],
-            )
-        return allowed, metadata
-    def is_allowed(self, key: str, tokens: float = 1.0) -> bool:
-        """Simple check — returns True if request is allowed.
-        Args:
-            key: Rate limit bucket key.
-            tokens: Tokens to consume.
-        Returns:
-            True if within rate limit, False otherwise.
-        """
-        allowed, _ = self.check_rate_limit(key, tokens)
-        return allowed
-    def get_status(self, key: str) -> dict[str, Any]:
-        """Get current rate limit status for a key.
-        Args:
-            key: Rate limit bucket key.
-        Returns:
-            Dict with remaining tokens, reset time, etc.
-        """
-        bucket = self._buckets.get(key)
-        if not bucket:
-            return {
-                "remaining": self._default_config.burst_size,
-                "limit": self._default_config.requests_per_minute,
-                "reset": 0,
-            }
-        bucket._refill()
-        return {
-            "remaining": int(bucket.tokens),
-            "limit": bucket.config.requests_per_minute,
-            "reset": int(max(0, bucket.blocked_until - time.time())),
-        }
-    def reset(self, key: str) -> None:
-        """Reset a specific rate limit bucket.
-        Args:
-            key: Bucket key to reset.
-        """
-        if key in self._buckets:
-            del self._buckets[key]
-            logger.info("rate_limit_reset", key=key)
-class OwnerKeyHourThrottle:
-    """Per-IP hourly throttle for the BYOK owner-key fallback.
-    Distinct from the request-level :class:`RateLimiter` because the BYOK
-    semantics are different:
-    - Visitors who bring their own LLM key (``ByokCreds.has_user_key()``)
-      bypass this throttle entirely — they are paying for their own tokens.
-    - Visitors who do NOT bring a key fall back to the platform owner's
-      Groq key. This throttle exists to stop a single recruiter or curious
-      visitor from burning the free-tier 30 RPM / 14,400 RPD budget.
-    Bucket window is rolling one hour from the first allowed request in
-    the window. Sliding-window precision is not needed — three requests an
-    hour is already conservative. We keep timestamps in a tiny list per IP
-    and prune entries older than 3600 seconds on each check.
-    """
-    __slots__ = ("_buckets", "_quota_per_hour")
-    def __init__(self, quota_per_hour: int) -> None:
-        if quota_per_hour < 0:
-            raise ValueError("quota_per_hour must be non-negative")
-        self._quota_per_hour = quota_per_hour
-        self._buckets: dict[str, list[float]] = {}
-    def allow(self, ip: str, *, now: float | None = None) -> tuple[bool, dict[str, Any]]:
-        """Return whether ``ip`` may consume one owner-key request.
-        Args:
-            ip: Client IP address (use ``"anon"`` when unavailable so the
-                fallback path still throttles instead of leaking quota).
-            now: Optional monotonic clock override for tests.
-        Returns:
-            ``(allowed, meta)`` where ``meta`` carries ``remaining`` and
-            ``retry_after`` seconds, ready for an HTTP 429 response.
-        """
-        t = now if now is not None else time.monotonic()
-        # Prune entries older than 1h, then count.
-        bucket = [ts for ts in self._buckets.get(ip, []) if t - ts < 3600.0]
-        if len(bucket) >= self._quota_per_hour:
-            # ``retry_after`` defaults to a full window when quota_per_hour=0
-            # (kill switch) — there is no "oldest entry" to expire.
-            retry_after = max(1, int(3600.0 - (t - bucket[0])) + 1) if bucket else 3600
-            self._buckets[ip] = bucket  # write pruned list back
-            return False, {
-                "allowed": False,
-                "remaining": 0,
-                "retry_after": retry_after,
-                "reason": "owner_key_hourly_quota_exhausted",
-            }
-        bucket.append(t)
-        self._buckets[ip] = bucket
-        return True, {
-            "allowed": True,
-            "remaining": self._quota_per_hour - len(bucket),
-            "retry_after": 0,
-            "reason": None,
-        }
-    def reset(self, ip: str) -> None:
-        """Drop all timestamps for ``ip`` (test/cleanup helper)."""
-        self._buckets.pop(ip, None)
-    def reset_all(self) -> None:
-        """Drop every bucket — used between test cases to avoid leakage."""
-        self._buckets.clear()
-# Module-level singleton — lazy-initialised from settings on first use so
-# unit tests that monkey-patch SAR_BYOK_OWNER_QUOTA see the right value.
-_owner_key_throttle: OwnerKeyHourThrottle | None = None
-def get_owner_key_throttle() -> OwnerKeyHourThrottle:
-    """Return the process-wide owner-key throttle, creating it lazily.
-    Reads ``settings.byok_owner_key_quota_per_hour`` at first call. Tests
-    that need a different quota value should call :func:`reset_owner_key_throttle`
-    after the monkey-patch.
-    """
-    global _owner_key_throttle
-    if _owner_key_throttle is None:
-        from config.settings import settings  # local import to avoid cycle
-        _owner_key_throttle = OwnerKeyHourThrottle(
-            quota_per_hour=settings.byok_owner_key_quota_per_hour,
-        )
-    return _owner_key_throttle
-def reset_owner_key_throttle() -> None:
-    """Force the next :func:`get_owner_key_throttle` call to rebuild from settings.
-    Test-only hook; production code never calls this.
-    """
-    global _owner_key_throttle
-    _owner_key_throttle = None
-class RedisRateLimiter:
-    """Distributed rate limiter backed by Redis.
-    Uses Redis sorted sets with sliding window algorithm for accurate
-    per-user rate limiting across multiple application instances.
-    Args:
-        redis_url: Redis connection URL.
-        default_config: Default rate limit configuration.
-    """
-    def __init__(
-        self,
-        redis_url: str | None = None,
-        default_config: RateLimitConfig | None = None,
-    ) -> None:
-        """Initialize the Redis rate limiter.
-        Args:
-            redis_url: Redis connection URL. Falls back to settings.
-            default_config: Default configuration for new keys.
-        """
-        import redis
-        from config.settings import settings
-        self._redis = redis.from_url(redis_url or settings.redis_url)
-        self._default_config = default_config or RateLimitConfig()
-    def check_rate_limit(
-        self,
-        key: str,
-        tokens: float = 1.0,
-        config: RateLimitConfig | None = None,
-    ) -> tuple[bool, dict[str, Any]]:
-        """Check if a request is within the rate limit using Redis.
-        Uses a sliding window algorithm based on Redis sorted sets.
-        Args:
-            key: Rate limit bucket key.
-            tokens: Tokens to consume.
-            config: Optional custom config.
-        Returns:
-            Tuple of (allowed, metadata).
-        """
-        cfg = config or self._default_config
-        now = time.time()
-        window_start = now - 60.0  # 1-minute window
-        redis_key = f"ratelimit:{key}"
-        # Remove old entries outside the window
-        self._redis.zremrangebyscore(redis_key, 0, window_start)
-        # Count current requests in window
-        current_count = self._redis.zcard(redis_key)
-        # Check burst limit
-        if current_count >= cfg.burst_size:
-            retry_after = int(cfg.cooldown_seconds) + 1
-            return False, {
-                "allowed": False,
-                "remaining": 0,
-                "retry_after": retry_after,
-                "reason": "rate_limit_exceeded",
-            }
-        # Check per-minute rate
-        rpm_limit = cfg.requests_per_minute
-        if current_count >= rpm_limit:
-            retry_after = int(60 - (now % 60)) + 1
-            return False, {
-                "allowed": False,
-                "remaining": 0,
-                "retry_after": retry_after,
-                "reason": "rate_limit_exceeded",
-            }
-        # Record this request
-        self._redis.zadd(redis_key, {str(now): now})
-        # Set expiry on the key
-        self._redis.expire(redis_key, 120)
-        remaining = min(cfg.burst_size, rpm_limit) - current_count - 1
-        return True, {
-            "allowed": True,
-            "remaining": max(0, remaining),
-            "retry_after": 0,
-            "reason": None,
-        }
-    def is_allowed(self, key: str, tokens: float = 1.0) -> bool:
-        """Simple check — returns True if request is allowed.
-        Args:
-            key: Rate limit bucket key.
-            tokens: Tokens to consume.
-        Returns:
-            True if within rate limit, False otherwise.
-        """
-        allowed, _ = self.check_rate_limit(key, tokens)
-        return allowed
-    def get_status(self, key: str) -> dict[str, Any]:
-        """Get current rate limit status for a key.
-        Args:
-            key: Rate limit bucket key.
-        Returns:
-            Dict with remaining tokens, limit, and reset time.
-        """
-        redis_key = f"ratelimit:{key}"
-        now = time.time()
-        window_start = now - 60.0
-        self._redis.zremrangebyscore(redis_key, 0, window_start)
-        current_count = self._redis.zcard(redis_key)
-        remaining = max(0, self._default_config.burst_size - current_count)
-        return {
-            "remaining": remaining,
-            "limit": self._default_config.requests_per_minute,
-            "reset": int(60 - (now % 60)),
-        }
-    def reset(self, key: str) -> None:
-        """Reset a specific rate limit bucket.
-        Args:
-            key: Bucket key to reset.
-        """
-        self._redis.delete(f"ratelimit:{key}")
-        logger.info("redis_rate_limit_reset", key=key)
-def _get_rate_limiter() -> RateLimiter | RedisRateLimiter:
-    """Get the appropriate rate limiter based on configuration.
-    Returns:
-        RateLimiter (in-memory) or RedisRateLimiter (distributed).
-    """
-    from config.settings import settings
-    if settings.use_redis_rate_limiter:
-        try:
-            return RedisRateLimiter()
-        except Exception as exc:
-            logger.warning("redis_rate_limiter_failed", error=str(exc), fallback="memory")
-    return RateLimiter(default_config=RATE_LIMIT_PROFILES["default"])
-# Pre-configured rate limit profiles
-RATE_LIMIT_PROFILES: dict[str, RateLimitConfig] = {
-    "default": RateLimitConfig(requests_per_minute=60, burst_size=10),
-    "strict": RateLimitConfig(requests_per_minute=10, burst_size=3, cooldown_seconds=5.0),
-    "generous": RateLimitConfig(requests_per_minute=300, burst_size=50),
-    "upload": RateLimitConfig(requests_per_minute=5, burst_size=2, cooldown_seconds=10.0),
-    "query": RateLimitConfig(requests_per_minute=30, burst_size=5, cooldown_seconds=2.0),
-}
-# Module-level singleton (lazy initialization)
-_rate_limiter_instance: RateLimiter | RedisRateLimiter | None = None
-def _get_limiter() -> RateLimiter | RedisRateLimiter:
-    """Get the singleton rate limiter instance.
-    Returns:
-        The configured rate limiter.
-    """
-    global _rate_limiter_instance
-    if _rate_limiter_instance is None:
-        _rate_limiter_instance = _get_rate_limiter()
-    return _rate_limiter_instance
-def check_query_rate_limit(user_id: str) -> tuple[bool, dict[str, Any]]:
-    """Check rate limit for a user query.
-    Args:
-        user_id: The user making the query.
-    Returns:
-        Tuple of (allowed, metadata).
-    """
-    key = f"{user_id}:query"
-    return _get_limiter().check_rate_limit(key, config=RATE_LIMIT_PROFILES["query"])
-def check_upload_rate_limit(user_id: str) -> tuple[bool, dict[str, Any]]:
-    """Check rate limit for a document upload.
-    Args:
-        user_id: The user uploading.
-    Returns:
-        Tuple of (allowed, metadata).
-    """
-    key = f"{user_id}:upload"
-    return _get_limiter().check_rate_limit(key, config=RATE_LIMIT_PROFILES["upload"])

+"""Token-bucket rate limiter for API request throttling.
+Provides per-user and per-endpoint rate limiting to prevent abuse and
+ensure fair resource allocation. Uses an in-memory token bucket algorithm
+with optional Redis backend for distributed deployments.
+"""
+from __future__ import annotations
+import time
+from dataclasses import dataclass, field
+from typing import Any
+from utils.logging import get_logger
+logger = get_logger(__name__)
+@dataclass
+class RateLimitConfig:
+    """Configuration for a rate limit bucket.
+    Attributes:
+        requests_per_minute: Maximum requests allowed per minute.
+        burst_size: Maximum burst capacity (bucket size).
+        cooldown_seconds: Seconds to wait after being rate limited.
+    """
+    # TokenBucket refills at requests_per_minute / 60 tokens per second.
+    requests_per_minute: int = 60
+    # Upper bound on the tokens a TokenBucket can hold after a refill.
+    burst_size: int = 10
+    # Length of the block TokenBucket applies once a request is rejected.
+    cooldown_seconds: float = 1.0
+@dataclass
+class TokenBucket:
+    """In-memory token bucket for rate limiting.
+    All timestamps come from ``time.time()`` (wall clock).
+    Attributes:
+        tokens: Current available tokens.
+        last_update: Timestamp of last token refill.
+        config: Rate limit configuration.
+        blocked_until: Timestamp when the bucket is unblocked.
+    """
+    tokens: float = field(default=0.0)
+    last_update: float = field(default_factory=time.time)
+    config: RateLimitConfig = field(default_factory=RateLimitConfig)
+    blocked_until: float = field(default=0.0)
+    def _refill(self) -> None:
+        """Add the tokens earned since ``last_update``, capped at ``burst_size``."""
+        now = time.time()
+        # The wall clock can be set backwards; a negative interval must not
+        # take tokens away, so it is treated as zero elapsed time.
+        elapsed = max(0.0, now - self.last_update)
+        # Refill rate: tokens per second
+        refill_rate = self.config.requests_per_minute / 60.0
+        self.tokens = min(self.config.burst_size, self.tokens + elapsed * refill_rate)
+        self.last_update = now
+    def consume(self, tokens: float = 1.0) -> tuple[bool, dict[str, Any]]:
+        """Attempt to consume tokens from the bucket.
+        Args:
+            tokens: Number of tokens to consume (default 1 per request).
+        Returns:
+            Tuple of (allowed, metadata). ``metadata`` carries ``allowed``,
+            ``remaining`` (whole tokens left), ``retry_after`` (seconds) and
+            ``reason`` (None, ``"cooldown_active"`` or
+            ``"rate_limit_exceeded"``).
+        """
+        now = time.time()
+        # While a cooldown is running the request is rejected without
+        # refilling or touching the token count.
+        if now < self.blocked_until:
+            retry_after = int(self.blocked_until - now) + 1
+            return False, {
+                "allowed": False,
+                "remaining": 0,
+                "retry_after": retry_after,
+                "reason": "cooldown_active",
+            }
+        self._refill()
+        if self.tokens >= tokens:
+            self.tokens -= tokens
+            remaining = int(self.tokens)
+            return True, {
+                "allowed": True,
+                "remaining": remaining,
+                "retry_after": 0,
+                "reason": None,
+            }
+        # Rate limit exceeded — enter cooldown
+        self.blocked_until = now + self.config.cooldown_seconds
+        retry_after = int(self.config.cooldown_seconds) + 1
+        return False, {
+            "allowed": False,
+            "remaining": 0,
+            "retry_after": retry_after,
+            "reason": "rate_limit_exceeded",
+        }
+class RateLimiter:
+    """In-memory rate limiter that keeps one token bucket per key.
+    Keys are arbitrary strings (the module's helpers use
+    ``"<user_id>:<endpoint>"``). State lives in this process only; for
+    distributed deployments use a Redis-backed implementation instead.
+    Args:
+        default_config: Default rate limit configuration.
+    """
+    def __init__(self, default_config: RateLimitConfig | None = None) -> None:
+        """Create an empty limiter.
+        Args:
+            default_config: Configuration applied to buckets created
+                without an explicit one; a fresh ``RateLimitConfig()`` is
+                used when omitted.
+        """
+        self._default_config = default_config or RateLimitConfig()
+        self._buckets: dict[str, TokenBucket] = {}
+    def _get_bucket(self, key: str, config: RateLimitConfig | None = None) -> TokenBucket:
+        """Return the bucket for ``key``, creating it full on first use.
+        Args:
+            key: Unique identifier for the bucket (e.g., user_id + endpoint).
+            config: Configuration for a newly created bucket. It has no
+                effect when the bucket already exists.
+        Returns:
+            The token bucket for the key.
+        """
+        existing = self._buckets.get(key)
+        if existing is not None:
+            return existing
+        effective = config or self._default_config
+        # A new bucket starts with its full burst allowance.
+        created = TokenBucket(tokens=effective.burst_size, config=effective)
+        self._buckets[key] = created
+        return created
+    def check_rate_limit(
+        self,
+        key: str,
+        tokens: float = 1.0,
+        config: RateLimitConfig | None = None,
+    ) -> tuple[bool, dict[str, Any]]:
+        """Consume ``tokens`` from the bucket for ``key`` and log the outcome.
+        Args:
+            key: Rate limit bucket key (e.g., "user_123:query").
+            tokens: Tokens to consume.
+            config: Optional custom config, used only if the bucket is new.
+        Returns:
+            Tuple of (allowed, metadata) as produced by the bucket.
+        """
+        allowed, metadata = self._get_bucket(key, config).consume(tokens)
+        if allowed:
+            logger.debug(
+                "rate_limit_allowed",
+                key=key,
+                remaining=metadata["remaining"],
+            )
+        else:
+            logger.warning(
+                "rate_limit_exceeded",
+                key=key,
+                retry_after=metadata["retry_after"],
+                reason=metadata["reason"],
+            )
+        return allowed, metadata
+    def is_allowed(self, key: str, tokens: float = 1.0) -> bool:
+        """Boolean-only variant of :meth:`check_rate_limit`.
+        Args:
+            key: Rate limit bucket key.
+            tokens: Tokens to consume.
+        Returns:
+            True if within rate limit, False otherwise.
+        """
+        return self.check_rate_limit(key, tokens)[0]
+    def get_status(self, key: str) -> dict[str, Any]:
+        """Report the current state of ``key`` without consuming tokens.
+        Args:
+            key: Rate limit bucket key.
+        Returns:
+            Dict with ``remaining`` (whole tokens), ``limit`` (requests per
+            minute) and ``reset`` (seconds until a cooldown ends, 0 if
+            none). Unknown keys report the default configuration.
+        """
+        bucket = self._buckets.get(key)
+        if bucket:
+            # Bring the token count up to date before reporting it.
+            bucket._refill()
+            seconds_blocked = max(0, bucket.blocked_until - time.time())
+            return {
+                "remaining": int(bucket.tokens),
+                "limit": bucket.config.requests_per_minute,
+                "reset": int(seconds_blocked),
+            }
+        defaults = self._default_config
+        return {
+            "remaining": defaults.burst_size,
+            "limit": defaults.requests_per_minute,
+            "reset": 0,
+        }
+    def reset(self, key: str) -> None:
+        """Forget the bucket for ``key``; unknown keys are ignored silently.
+        Args:
+            key: Bucket key to reset.
+        """
+        try:
+            del self._buckets[key]
+        except KeyError:
+            return
+        logger.info("rate_limit_reset", key=key)
+class OwnerKeyHourThrottle:
+    """Per-IP hourly throttle for the BYOK owner-key fallback.
+    Distinct from the request-level :class:`RateLimiter` because the BYOK
+    semantics are different:
+    - Visitors who bring their own LLM key (``ByokCreds.has_user_key()``)
+      bypass this throttle entirely — they are paying for their own tokens.
+    - Visitors who do NOT bring a key fall back to the platform owner's
+      Groq key. This throttle exists to stop a single recruiter or curious
+      visitor from burning the free-tier 30 RPM / 14,400 RPD budget.
+    Implementation: a small list of request timestamps is kept per IP.
+    Each check drops timestamps that are 3600 seconds old or older and
+    allows the request only while fewer than ``quota_per_hour`` remain.
+    IPs with no live timestamps are not stored, so a zero quota (kill
+    switch) does not accumulate one empty entry per visitor.
+    Args:
+        quota_per_hour: Allowed owner-key requests per IP per hour; ``0``
+            rejects every request.
+    Raises:
+        ValueError: If ``quota_per_hour`` is negative.
+    """
+    __slots__ = ("_buckets", "_quota_per_hour")
+    # Length of the throttling window in seconds.
+    _WINDOW_SECONDS = 3600.0
+    def __init__(self, quota_per_hour: int) -> None:
+        if quota_per_hour < 0:
+            raise ValueError("quota_per_hour must be non-negative")
+        self._quota_per_hour = quota_per_hour
+        self._buckets: dict[str, list[float]] = {}
+    def allow(self, ip: str, *, now: float | None = None) -> tuple[bool, dict[str, Any]]:
+        """Return whether ``ip`` may consume one owner-key request.
+        Args:
+            ip: Client IP address (use ``"anon"`` when unavailable so the
+                fallback path still throttles instead of leaking quota).
+            now: Optional monotonic clock override for tests.
+        Returns:
+            ``(allowed, meta)`` where ``meta`` carries ``remaining`` and
+            ``retry_after`` seconds, ready for an HTTP 429 response.
+        """
+        window = self._WINDOW_SECONDS
+        t = now if now is not None else time.monotonic()
+        # Prune entries older than 1h, then count.
+        bucket = [ts for ts in self._buckets.get(ip, ()) if t - ts < window]
+        if len(bucket) >= self._quota_per_hour:
+            if bucket:
+                # The oldest timestamp is the next one to leave the window.
+                retry_after = max(1, int(window - (t - bucket[0])) + 1)
+                self._buckets[ip] = bucket  # write pruned list back
+            else:
+                # Only reachable with quota_per_hour=0 (kill switch): there
+                # is no "oldest entry" to expire, so report a full window
+                # and keep no empty entry for this IP.
+                retry_after = int(window)
+                self._buckets.pop(ip, None)
+            return False, {
+                "allowed": False,
+                "remaining": 0,
+                "retry_after": retry_after,
+                "reason": "owner_key_hourly_quota_exhausted",
+            }
+        bucket.append(t)
+        self._buckets[ip] = bucket
+        return True, {
+            "allowed": True,
+            "remaining": self._quota_per_hour - len(bucket),
+            "retry_after": 0,
+            "reason": None,
+        }
+    def reset(self, ip: str) -> None:
+        """Drop all timestamps for ``ip`` (test/cleanup helper)."""
+        self._buckets.pop(ip, None)
+    def reset_all(self) -> None:
+        """Drop every bucket — used between test cases to avoid leakage."""
+        self._buckets.clear()
+# Module-level singleton — lazy-initialised from settings on first use so
+# unit tests that monkey-patch SAR_BYOK_OWNER_QUOTA see the right value.
+# ``None`` means "not built yet": get_owner_key_throttle() fills it in and
+# reset_owner_key_throttle() sets it back to ``None``.
+_owner_key_throttle: OwnerKeyHourThrottle | None = None
+def get_owner_key_throttle() -> OwnerKeyHourThrottle:
+    """Return the shared owner-key throttle, building it on first use.
+    The quota is read from ``settings.byok_owner_key_quota_per_hour`` only
+    when the instance is built. Tests that change the quota should call
+    :func:`reset_owner_key_throttle` after the monkey-patch so the next
+    call rebuilds it.
+    """
+    global _owner_key_throttle
+    if _owner_key_throttle is not None:
+        return _owner_key_throttle
+    from config.settings import settings  # local import to avoid cycle
+    hourly_quota = settings.byok_owner_key_quota_per_hour
+    _owner_key_throttle = OwnerKeyHourThrottle(quota_per_hour=hourly_quota)
+    return _owner_key_throttle
+def reset_owner_key_throttle() -> None:
+    """Discard the cached owner-key throttle.
+    The next :func:`get_owner_key_throttle` call then rebuilds it from the
+    current settings. Intended as a test hook; production code never
+    calls this.
+    """
+    global _owner_key_throttle
+    _owner_key_throttle = None
+class RedisRateLimiter:
+    """Distributed rate limiter backed by Redis.
+    Each key maps to a Redis sorted set (``ratelimit:<key>``) holding one
+    member per accepted request, scored by its ``time.time()`` timestamp.
+    A request is accepted while fewer than
+    ``min(burst_size, requests_per_minute)`` members fall inside the
+    trailing 60-second window.
+    NOTE(review): prune, count and add are issued as separate commands, so
+    concurrent callers may both pass the check before either records its
+    request. A Lua script or MULTI/EXEC would be needed for a strict limit.
+    Args:
+        redis_url: Redis connection URL.
+        default_config: Default rate limit configuration.
+    """
+    # Length of the sliding window, in seconds.
+    _WINDOW_SECONDS = 60.0
+    # TTL refreshed on every accepted request so idle keys expire.
+    _KEY_TTL_SECONDS = 120
+    def __init__(
+        self,
+        redis_url: str | None = None,
+        default_config: RateLimitConfig | None = None,
+    ) -> None:
+        """Initialize the Redis rate limiter.
+        Args:
+            redis_url: Redis connection URL. Falls back to settings.
+            default_config: Default configuration for new keys.
+        """
+        import redis
+        from config.settings import settings
+        self._redis = redis.from_url(redis_url or settings.redis_url)
+        self._default_config = default_config or RateLimitConfig()
+    def _live_count(self, redis_key: str, now: float) -> int:
+        """Drop entries older than the window and return how many remain."""
+        self._redis.zremrangebyscore(redis_key, 0, now - self._WINDOW_SECONDS)
+        return self._redis.zcard(redis_key)
+    def _seconds_until_slot_frees(self, redis_key: str, now: float) -> int:
+        """Return whole seconds until the oldest entry leaves the window.
+        Falls back to a full window when the set is empty, which only
+        happens when the configured limit is zero.
+        """
+        oldest = self._redis.zrange(redis_key, 0, 0, withscores=True)
+        if not oldest:
+            return int(self._WINDOW_SECONDS)
+        _member, score = oldest[0]
+        return max(1, int(score + self._WINDOW_SECONDS - now) + 1)
+    def check_rate_limit(
+        self,
+        key: str,
+        tokens: float = 1.0,
+        config: RateLimitConfig | None = None,
+    ) -> tuple[bool, dict[str, Any]]:
+        """Check if a request is within the rate limit using Redis.
+        Uses a sliding window algorithm based on Redis sorted sets.
+        Args:
+            key: Rate limit bucket key.
+            tokens: Accepted for interface parity with
+                :class:`RateLimiter`; every call counts as one request.
+            config: Optional custom config.
+        Returns:
+            Tuple of (allowed, metadata). On rejection ``retry_after`` is
+            the time until the oldest recorded request leaves the window,
+            i.e. the earliest moment a retry can succeed.
+        """
+        import uuid
+        cfg = config or self._default_config
+        now = time.time()
+        redis_key = f"ratelimit:{key}"
+        current_count = self._live_count(redis_key, now)
+        # Both limits are enforced over the same window, so the smaller wins.
+        limit = min(cfg.burst_size, cfg.requests_per_minute)
+        if current_count >= limit:
+            return False, {
+                "allowed": False,
+                "remaining": 0,
+                "retry_after": self._seconds_until_slot_frees(redis_key, now),
+                "reason": "rate_limit_exceeded",
+            }
+        # Record this request. The random suffix keeps two requests with the
+        # same timestamp from collapsing into a single sorted-set member.
+        self._redis.zadd(redis_key, {f"{now}:{uuid.uuid4().hex}": now})
+        # Set expiry on the key
+        self._redis.expire(redis_key, self._KEY_TTL_SECONDS)
+        return True, {
+            "allowed": True,
+            "remaining": max(0, limit - current_count - 1),
+            "retry_after": 0,
+            "reason": None,
+        }
+    def is_allowed(self, key: str, tokens: float = 1.0) -> bool:
+        """Simple check — returns True if request is allowed.
+        Args:
+            key: Rate limit bucket key.
+            tokens: Tokens to consume.
+        Returns:
+            True if within rate limit, False otherwise.
+        """
+        allowed, _ = self.check_rate_limit(key, tokens)
+        return allowed
+    def get_status(self, key: str) -> dict[str, Any]:
+        """Get current rate limit status for a key.
+        Evaluated against the default configuration, because per-call
+        configs are not stored. Expired entries are pruned as a side effect.
+        Args:
+            key: Rate limit bucket key.
+        Returns:
+            Dict with ``remaining`` (requests still allowed in the window),
+            ``limit`` (requests per minute) and ``reset`` (seconds until a
+            request can succeed again; 0 when one is allowed now).
+        """
+        redis_key = f"ratelimit:{key}"
+        now = time.time()
+        current_count = self._live_count(redis_key, now)
+        cfg = self._default_config
+        # Same effective limit as check_rate_limit so the two agree.
+        limit = min(cfg.burst_size, cfg.requests_per_minute)
+        remaining = max(0, limit - current_count)
+        reset = 0 if remaining else self._seconds_until_slot_frees(redis_key, now)
+        return {
+            "remaining": remaining,
+            "limit": cfg.requests_per_minute,
+            "reset": reset,
+        }
+    def reset(self, key: str) -> None:
+        """Reset a specific rate limit bucket.
+        Args:
+            key: Bucket key to reset.
+        """
+        self._redis.delete(f"ratelimit:{key}")
+        logger.info("redis_rate_limit_reset", key=key)
+def _get_rate_limiter() -> RateLimiter | RedisRateLimiter:
+    """Build the rate limiter selected by configuration.
+    When ``settings.use_redis_rate_limiter`` is set, a
+    :class:`RedisRateLimiter` is built and the server is pinged once. Any
+    failure (missing ``redis`` package, bad URL, unreachable server) is
+    logged and the in-memory limiter is used instead.
+    Returns:
+        RateLimiter (in-memory) or RedisRateLimiter (distributed).
+    """
+    from config.settings import settings
+    if settings.use_redis_rate_limiter:
+        try:
+            limiter = RedisRateLimiter()
+            # redis.from_url() does not open a connection until the first
+            # command, so ping here; otherwise an unreachable server would
+            # slip past this fallback and fail on every request instead.
+            limiter._redis.ping()
+        except Exception as exc:  # boundary: degrade to in-memory on any failure
+            logger.warning("redis_rate_limiter_failed", error=str(exc), fallback="memory")
+        else:
+            return limiter
+    return RateLimiter(default_config=RATE_LIMIT_PROFILES["default"])
+# Pre-configured rate limit profiles, looked up by name.
+# ``requests_per_minute`` sets the token refill rate (rpm / 60 per second)
+# and ``burst_size`` caps how many tokens a bucket can hold. Profiles that
+# omit ``cooldown_seconds`` use RateLimitConfig's own default.
+RATE_LIMIT_PROFILES: dict[str, RateLimitConfig] = {
+    # Used for the in-memory limiter's default buckets.
+    "default": RateLimitConfig(requests_per_minute=60, burst_size=10),
+    "strict": RateLimitConfig(requests_per_minute=10, burst_size=3, cooldown_seconds=5.0),
+    "generous": RateLimitConfig(requests_per_minute=300, burst_size=50),
+    # Applied by check_upload_rate_limit().
+    "upload": RateLimitConfig(requests_per_minute=5, burst_size=2, cooldown_seconds=10.0),
+    # Applied by check_query_rate_limit().
+    "query": RateLimitConfig(requests_per_minute=30, burst_size=5, cooldown_seconds=2.0),
+}
+# Module-level singleton (lazy initialization): stays ``None`` until
+# _get_limiter() is first called, which stores the chosen limiter here.
+_rate_limiter_instance: RateLimiter | RedisRateLimiter | None = None
+def _get_limiter() -> RateLimiter | RedisRateLimiter:
+    """Return the shared rate limiter, building it on the first call.
+    Returns:
+        The limiter chosen by :func:`_get_rate_limiter`, cached for the
+        lifetime of the process.
+    """
+    global _rate_limiter_instance
+    limiter = _rate_limiter_instance
+    if limiter is None:
+        # First use: build once and remember it for later calls.
+        limiter = _rate_limiter_instance = _get_rate_limiter()
+    return limiter
+def check_query_rate_limit(user_id: str) -> tuple[bool, dict[str, Any]]:
+    """Apply the ``"query"`` profile to one query request from ``user_id``.
+    Args:
+        user_id: The user making the query.
+    Returns:
+        Tuple of (allowed, metadata) from the shared limiter, using the
+        bucket key ``"<user_id>:query"``.
+    """
+    limiter = _get_limiter()
+    profile = RATE_LIMIT_PROFILES["query"]
+    return limiter.check_rate_limit(f"{user_id}:query", config=profile)
+def check_upload_rate_limit(user_id: str) -> tuple[bool, dict[str, Any]]:
+    """Apply the ``"upload"`` profile to one document upload from ``user_id``.
+    Args:
+        user_id: The user uploading.
+    Returns:
+        Tuple of (allowed, metadata) from the shared limiter, using the
+        bucket key ``"<user_id>:upload"``.
+    """
+    limiter = _get_limiter()
+    profile = RATE_LIMIT_PROFILES["upload"]
+    return limiter.check_rate_limit(f"{user_id}:upload", config=profile)