Spaces:

vampokala
/

doc-ingestion

Running

Vamshi Pokala Cursor committed 29 days ago

Commit

2af1cca

1 Parent(s): 131d7bb

feat: React UI, Docker Space, sessions, Ollama toggle on HF

- Add root Dockerfile and frontend build served from FastAPI static
- Session corpus API, demo uploads, integration tests
- Disable Ollama when SPACE_ID or DOC_OLLAMA_ENABLED=0 (config.py)
- Observability/metrics, CI and dependency updates
- README/spaces README: document SPACE_ID vs local deploy

Co-authored-by: Cursor <cursoragent@cursor.com>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.github/workflows/ci.yml +61 -1
Dockerfile +59 -0
Docs/Phase5-Monitoring-Observability.md +1764 -0
Docs/Phase6-Iterative-Execution-Index.md +29 -0
Docs/Phase6-RefactorDemo_React.md +445 -0
Docs/Phase6.1-Backend-Session-Isolation-Plan.md +125 -0
Docs/Phase6.2-React-MVP-Plan.md +83 -0
Docs/Phase6.3-Container-Cutover-Implementation-Spec.md +302 -0
Docs/Phase6.3-Container-Cutover-Plan.md +62 -0
Docs/Phase6.4-Streamlit-Decommission-Implementation-Spec.md +384 -0
Docs/Phase6.4-Streamlit-Decommission-Plan.md +38 -0
Docs/phase5_observability.md +412 -0
README.md +33 -6
docker/Dockerfile +0 -44
docker/Dockerfile +1 -0
docker/docker-compose.yml +7 -28
frontend/.gitignore +26 -0
frontend/README.md +73 -0
frontend/components.json +16 -0
frontend/e2e/fixtures/uploaded-doc.md +3 -0
frontend/e2e/react-mvp.spec.ts +80 -0
frontend/eslint.config.js +22 -0
frontend/index.html +21 -0
frontend/package-lock.json +0 -0
frontend/package.json +55 -0
frontend/playwright.config.ts +20 -0
frontend/public/favicon.svg +1 -0
frontend/public/icons.svg +24 -0
frontend/src/App.tsx +128 -0
frontend/src/api/client.ts +182 -0
frontend/src/api/generated.ts +71 -0
frontend/src/assets/hero.png +0 -0
frontend/src/assets/react.svg +1 -0
frontend/src/assets/vite.svg +1 -0
frontend/src/components/AnswerPanel.tsx +35 -0
frontend/src/components/CitationsList.tsx +41 -0
frontend/src/components/RetrievedChunks.tsx +26 -0
frontend/src/components/SamplePromptChips.tsx +26 -0
frontend/src/components/ScopeToggle.test.tsx +18 -0
frontend/src/components/ScopeToggle.tsx +57 -0
frontend/src/components/Uploader.test.tsx +33 -0
frontend/src/components/Uploader.tsx +108 -0
frontend/src/index.css +42 -0
frontend/src/lib/citationProvenance.test.ts +16 -0
frontend/src/lib/citationProvenance.ts +12 -0
frontend/src/lib/format.ts +22 -0
frontend/src/lib/streamQuery.test.ts +21 -0
frontend/src/lib/streamQuery.ts +96 -0
frontend/src/lib/utils.ts +6 -0
frontend/src/main.tsx +10 -0

.github/workflows/ci.yml CHANGED Viewed

@@ -78,6 +78,46 @@ jobs:
       - name: Run integration tests
         run: pytest tests/integration/ -v
   evals-smoke:
     name: Eval smoke test (mock pipeline)
     runs-on: ubuntu-latest
@@ -137,10 +177,30 @@ jobs:
             --dataset evals/datasets/golden_ci.jsonl \
             --judge-provider anthropic \
             --judge-model claude-haiku-4-5 \
-            --output evals/reports/ \
             --faithfulness-threshold 0.7 \
             --correctness-threshold 0.2
       - name: Upload golden eval report
         if: ${{ always() && env.ANTHROPIC_API_KEY != '' }}
         uses: actions/upload-artifact@v4

       - name: Run integration tests
         run: pytest tests/integration/ -v
+  frontend:
+    name: Frontend checks
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+          cache: npm
+          cache-dependency-path: frontend/package-lock.json
+      - name: Install frontend dependencies
+        run: npm ci
+        working-directory: frontend
+      - name: Lint frontend
+        run: npm run lint
+        working-directory: frontend
+      - name: Typecheck frontend
+        run: npm run typecheck
+        working-directory: frontend
+      - name: Test frontend
+        run: npm run test
+        working-directory: frontend
+      - name: Build frontend
+        run: npm run build
+        working-directory: frontend
+      - name: Install Playwright browsers
+        run: npx playwright install --with-deps chromium
+        working-directory: frontend
+      - name: Run Playwright smoke tests
+        run: npm run test:e2e
+        working-directory: frontend
   evals-smoke:
     name: Eval smoke test (mock pipeline)
     runs-on: ubuntu-latest
             --dataset evals/datasets/golden_ci.jsonl \
             --judge-provider anthropic \
             --judge-model claude-haiku-4-5 \
+            --output evals/reports/pr-current.json \
             --faithfulness-threshold 0.7 \
             --correctness-threshold 0.2
+      - name: Compare against baseline (regression gate)
+        if: ${{ env.ANTHROPIC_API_KEY != '' && hashFiles('evals/reports/baseline.json') != '' }}
+        run: |
+          python scripts/compare_evals.py \
+            --baseline evals/reports/baseline.json \
+            --current evals/reports/pr-current.json \
+            --threshold 5.0
+      - name: Comment on regression failure
+        if: failure() && env.ANTHROPIC_API_KEY != ''
+        uses: actions/github-script@v6
+        with:
+          script: |
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: "⚠️ **Regression Detected**\n\nEval metrics degraded vs baseline. See `evals/reports/pr-current.json` artifact for details.\n\nTo update the baseline (intentional improvement), run:\n```bash\ngit checkout main\nPYTHONPATH=. python -m evals.run_evals \\\n  --dataset evals/datasets/golden_ci.jsonl \\\n  --judge-provider anthropic \\\n  --judge-model claude-haiku-4-5 \\\n  --output evals/reports/baseline.json\ngit add evals/reports/baseline.json\ngit commit -m 'chore: update Phase 5 eval baseline'\n```"
+            })
       - name: Upload golden eval report
         if: ${{ always() && env.ANTHROPIC_API_KEY != '' }}
         uses: actions/upload-artifact@v4

Dockerfile ADDED Viewed

	@@ -0,0 +1,59 @@

+# Single-process image: FastAPI serves the React SPA from /app/static and API routes under the same port.
+# Hugging Face Docker Spaces expects a Dockerfile at the repository root; local builds use the same file:
+#   docker build -t doc-ingest .
+# Compose: docker/docker-compose.yml (build context is repo root).
+FROM node:20-bookworm-slim AS frontend-builder
+WORKDIR /frontend
+COPY frontend/package.json frontend/package-lock.json ./
+RUN npm ci
+COPY frontend/ ./
+RUN npm run build
+FROM python:3.11-slim
+WORKDIR /app
+# Install system deps needed by python-magic and runtime health checks.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libmagic1 \
+    curl \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+COPY requirements/base.txt requirements/base.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements/base.txt
+COPY --from=frontend-builder /frontend/dist /app/static
+COPY src/ src/
+COPY scripts/ scripts/
+COPY tests/ tests/
+COPY config.yaml config.yaml
+COPY README.md README.md
+COPY Docs/ Docs/
+ENV ENV=prod
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONPATH=/app
+ENV PORT=8000
+ENV OLLAMA_BASE_URL=http://host.docker.internal:11434
+ENV HF_HOME=/app/.cache/huggingface
+ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
+ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
+# Preload reranker model at build time to avoid runtime downloads.
+RUN python -c "from sentence_transformers import CrossEncoder; CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')"
+EXPOSE 8000
+# HF Spaces runs the container as UID 1000; match that to avoid permission issues.
+RUN useradd -m -u 1000 appuser && mkdir -p /app/.cache/huggingface && chown -R appuser:appuser /app
+USER appuser
+HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
+  CMD sh -c 'curl -fsS "http://127.0.0.1:${PORT:-8000}/health" || exit 1'
+# PORT is honored for Hugging Face (app_port / runtime) and other platforms.
+CMD ["sh", "-c", "exec uvicorn src.api.main:app --host 0.0.0.0 --port \"${PORT:-8000}\" --workers 1"]

Docs/Phase5-Monitoring-Observability.md ADDED Viewed

	@@ -0,0 +1,1764 @@

+# Phase 5: Production Monitoring & Observability
+**Project:** doc-ingestion (RAG System)
+**Status:** Planning
+**Timeline:** 3 weeks
+**Owner:** Vamshi Pokala
+**Goal:** Instrument RAG pipeline with production-grade observability, regression gating, and operational dashboards
+---
+## Executive Summary
+Transform doc-ingestion from a feature-complete RAG system into a **production-hardened platform** by adding:
+1. **Distributed tracing** (LangFuse) across every step: ingestion → retrieval → reranking → generation → citations
+2. **Latency profiling** (P50/P95 per component) to identify bottlenecks
+3. **Cost tracking** (USD per request) for capacity planning
+4. **Quality regression gating** (GitHub Actions CI/CD) to prevent accuracy degradation
+5. **Observable metrics dashboard** for real-time operational visibility
+6. **Citation accuracy & citation coverage** monitoring
+**Why this matters for your job search:**
+- Differentiates you as "production architect" not "demo builder"
+- Directly maps to Principal/Director interview questions: "How do you know your AI system is healthy?"
+- Concrete story for Vertex (latency budgeting), Elevation Capital (risk reduction), Marriott-like enterprise roles
+---
+## Current State Analysis
+### Existing Strengths
+```
+✅ Multi-format ingestion (PDF, DOCX, TXT, MD, HTML)
+✅ Hybrid retrieval (BM25 + vector search with RRF)
+✅ Cross-encoder reranking
+✅ Citation tracking & verification
+✅ Truthfulness scoring (NLI faithfulness)
+✅ Multi-provider LLM routing (Ollama, OpenAI, Anthropic, Gemini)
+✅ FastAPI + Streamlit UI
+✅ Offline evaluation harness (golden datasets, RAGAS)
+✅ Docker Compose stack
+✅ Rate limiting (Redis-backed)
+✅ MetricsCollector in src/utils/log.py (in-memory count/mean/min/max per operation)
+✅ Structured JSON audit logging in main.py (_audit_log with latency_ms, provider, model)
+✅ processing_time_ms and cached flag already returned in QueryResponse
+✅ evals-golden CI job already runs golden_ci.jsonl on every PR
+```
+### Gaps for Production Observability
+```
+❌ No distributed tracing (can't see latency breakdown by step)
+❌ No real-time metrics dashboard
+❌ No cost tracking (USD per request)
+❌ No regression gating comparing baseline vs PR metrics in CI/CD
+❌ No P50/P95 latency tracking (existing MetricsCollector only tracks mean/min/max)
+❌ No citation accuracy trends over time
+❌ /metrics endpoint returns config metadata, not operational metrics
+❌ No replay/debug mode for failed queries
+```
+### Critical Design Constraints (address before coding)
+These issues will cause bugs or structural debt if not addressed upfront:
+1. **LangFuse span hierarchy**: `self.client.trace()` creates a top-level trace. Calling it once per pipeline step produces 5 disconnected traces per request. The correct pattern is one `trace = client.trace()` per request, then `span = trace.span()` for each step. Instrument at `RAGOrchestrator.run()`, not in `main.py`.
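+2. **`observer.flush()` must not block the HTTP response**: LangFuse flush makes a network call. Calling it synchronously before returning adds latency to every request. Run it off the request path instead: `loop.run_in_executor(None, observer.flush)` (this already returns a scheduled Future, so do not wrap it in `asyncio.create_task`, which accepts only coroutines), a FastAPI `BackgroundTasks` task, or a background thread.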
+3. **Instrument at `RAGOrchestrator`, not `main.py`**: The pipeline runs inside `RAGOrchestrator.run()`. Instrumenting in `main.py` via inline `observer.trace_retrieval(fn)(args)` patterns: (a) misses the cache-hit early return, (b) misses CLI and Streamlit code paths, (c) creates a new wrapper closure on every HTTP request. The observer should be injected into or used directly within `RAGOrchestrator.run()`.
+4. **MRR and NDCG are offline-only metrics**: They require ground-truth relevance labels per query. You cannot compute them in production. Remove `mrr` and `ndcg` from `RequestMetrics`; they belong only in the eval harness.
+5. **Don't create a separate regression gate workflow**: `ci.yml` already has `evals-golden` running `golden_ci.jsonl`. Add a comparison step to that existing job rather than duplicating it. Also: the dataset is `golden_ci.jsonl`, not `golden.jsonl`.
+6. **`src/monitoring/metrics.py` should extend, not duplicate `src/utils/log.py`**: The existing `MetricsCollector` in `log.py` tracks mean/min/max. The new one adds percentiles and per-request records. Consolidate: either replace the log.py one or have the new one call through to it. Don't ship two `MetricsCollector` classes.
+7. **`requirements/api.txt` does not exist**: The project has `requirements/base.txt` and `requirements/eval.txt`. Add `langfuse>=2.0.0` to `requirements/base.txt`.
+---
+## Architecture: Before → After
+### Before (Current)
+```
+┌─────────────────────────────────────────────────────────┐
+│                  Streamlit UI / FastAPI                 │
+└──────────────────────┬──────────────────────────────────┘
+                       │
+        ┌──────────────┼──────────────┐
+        │              │              │
+   ┌────▼────┐   ┌────▼────┐   ┌────▼─────┐
+   │Retrieval│   │Reranking│   │Generation│
+   │ (BM25+  │   │(Cross-  │   │(Ollama/  │
+   │ Vector) │   │ Encoder)│   │ OpenAI)  │
+   └─────────┘   └─────────┘   └────┬─────┘
+                                     │
+                            ┌────────▼────────┐
+                            │Citations &      │
+                            │Truthfulness     │
+                            └─────────────────┘
+❌ No observability layer
+❌ Latency is a black box
+❌ Can't track cost
+❌ No regression detection
+```
+### After (Phase 5)
+```
+┌──────────────────────────────────────────────────────────────────┐
+│                   LangFuse Tracing Layer                         │
+│  (Distributed tracing, step-by-step instrumentation)            │
+└────────────────────────────┬─────────────────────────────────────┘
+                             │
+┌────────────────────────────▼─────────────────────────────────────┐
+│                  Streamlit UI / FastAPI                          │
+└──────────────────────┬──────────────────────────────────────────┘
+                       │
+        ┌──────────────┼──────────────────────┐
+        │              │                      │
+   ┌────▼────┐   ┌────▼────┐   ┌────▼─────┐
+   │Retrieval│   │Reranking│   │Generation│
+   │ (BM25+  │   │(Cross-  │   │(Ollama/  │
+   │ Vector) │   │ Encoder)│   │ OpenAI)  │
+   └────┬────┘   └────┬────┘   └────┬─────┘
+        │             │             │
+   [TRACE]        [TRACE]       [TRACE]
+   - Latency      - Latency     - Latency
+   - Chunks       - Ranked      - Tokens
+   - Scores       - Duration    - Cost
+        │             │             │
+        └─────────────┼─────────────┘
+                      │
+            ┌─────────▼──────────┐
+            │  Citations &       │
+            │  Truthfulness      │
+            │  [TRACE] Cost      │
+            └─────────┬──────────┘
+                      │
+        ┌─────────────┴──────────────────┐
+        │                                │
+   ┌────▼─────┐             ┌───────────▼────────┐
+   │Observ.   │             │  GitHub Actions    │
+   │Dashboard │             │  Regression Gating │
+   │(Metrics) │             │  (CI/CD)           │
+   └──────────┘             └────────────────────┘
+✅ End-to-end tracing
+✅ Real-time latency visibility
+✅ Cost per request tracked
+✅ Automated regression detection
+✅ Observable metrics at /observability/dashboard
+```
+---
+## Detailed Phase Breakdown
+### Phase 5.1: LangFuse Instrumentation (Week 1)
+**Goal:** Add distributed tracing to every RAG pipeline step
+#### Step 1.1: Create Observability Module
+**File:** `src/core/observability.py` (NEW)
+```python
+"""
+Observability layer for RAG pipeline instrumentation.
+Provides decorators and context managers for LangFuse tracing.
+"""
+import os
+import time
+import json
+from functools import wraps
+from typing import Any, Callable, Dict, Optional, List
+from contextlib import contextmanager
+from langfuse import Langfuse
+from langfuse.decorators import observe
+import logging
+logger = logging.getLogger(__name__)
+class RAGObserver:
+    """
+    Centralized observer for RAG pipeline.
+    Manages LangFuse client and provides tracing context managers.
+    Usage pattern — instrument inside RAGOrchestrator.run(), not in main.py:
+        with observer.trace_request("rag_query", query=query_text) as trace:
+            with observer.trace_step(trace, "retrieval") as span:
+                result = hybrid_retriever.retrieve(...)
+                span["output"] = {"chunks": len(result)}
+    """
+    def __init__(self, enabled: bool = True, public_key: str = None, secret_key: str = None):
+        """
+        Args:
+            enabled: If False, all tracing is no-op (demo mode, tests)
+            public_key: LangFuse public key (defaults to LANGFUSE_PUBLIC_KEY env var)
+            secret_key: LangFuse secret key (defaults to LANGFUSE_SECRET_KEY env var)
+        """
+        self.enabled = enabled
+        self.client = None
+        if self.enabled:
+            try:
+                public_key = public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
+                secret_key = secret_key or os.getenv("LANGFUSE_SECRET_KEY")
+                if public_key and secret_key:
+                    self.client = Langfuse(
+                        public_key=public_key,
+                        secret_key=secret_key,
+                        host=os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com"),
+                    )
+                    logger.info("LangFuse observability enabled")
+                else:
+                    self.enabled = False
+                    logger.warning("LangFuse keys not found; observability disabled")
+            except Exception as e:
+                logger.error(f"Failed to initialize LangFuse: {e}; observability disabled")
+                self.enabled = False
+    @contextmanager
+    def trace_request(
+        self,
+        name: str,
+        query: str = "",
+        metadata: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Context manager for a top-level request trace.
+        One trace per query request — child spans live inside this.
+        IMPORTANT: This is the top-level trace object. Use trace.span() for
+        individual pipeline steps. Never call client.trace() per step — that
+        creates disconnected traces in the LangFuse UI.
+        Usage:
+            with observer.trace_request("rag_query", query=query_text) as trace:
+                with observer.trace_step(trace, "retrieval") as span:
+                    chunks = retriever.retrieve(query)
+                    span["chunks_retrieved"] = len(chunks)
+        """
+        if not self.enabled or not self.client:
+            yield None
+            return
+        trace = self.client.trace(
+            name=name,
+            input={"query": query},
+            metadata=metadata or {},
+        )
+        start = time.time()
+        try:
+            yield trace
+        except Exception as e:
+            trace.update(
+                output={"error": str(e)},
+                metadata={**(metadata or {}), "total_ms": (time.time() - start) * 1000},
+            )
+            raise
+        finally:
+            trace.update(
+                metadata={**(metadata or {}), "total_ms": round((time.time() - start) * 1000, 2)},
+            )
+    @contextmanager
+    def trace_step(
+        self,
+        trace,
+        step_name: str,
+        input_data: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Context manager for a child span within a request trace.
+        Attach to the trace returned by trace_request().
+        Args:
+            trace: The top-level trace object from trace_request()
+            step_name: Name of the pipeline step (e.g. "retrieval", "generation")
+            input_data: Optional input metadata for this step
+        """
+        if not self.enabled or trace is None:
+            yield {}
+            return
+        span = trace.span(name=step_name, input=input_data or {})
+        output: Dict[str, Any] = {}
+        start = time.time()
+        try:
+            yield output
+        except Exception as e:
+            span.end(
+                output={"error": str(e)},
+                metadata={"latency_ms": round((time.time() - start) * 1000, 2)},
+            )
+            raise
+        finally:
+            output["latency_ms"] = round((time.time() - start) * 1000, 2)
+            span.end(output=output)
+    def flush_async(self) -> None:
+        """
+        Flush pending traces to LangFuse in a background thread.
+        Call this after the HTTP response is sent — never block the hot path.
+        In FastAPI, use a BackgroundTask:
+            from fastapi import BackgroundTasks
+            background_tasks.add_task(observer.flush_async)
+        """
+        if not self.client:
+            return
+        import threading
+        threading.Thread(target=self.client.flush, daemon=True).start()
+    def flush(self) -> None:
+        """Synchronous flush — only use in shutdown/test contexts, not request handlers."""
+        if self.client:
+            self.client.flush()
+# Global observer instance
+_observer_instance: Optional[RAGObserver] = None
+def get_observer() -> RAGObserver:
+    """Singleton getter for RAGObserver."""
+    global _observer_instance
+    if _observer_instance is None:
+        enabled = os.getenv("DOC_PROFILE") != "demo"
+        _observer_instance = RAGObserver(enabled=enabled)
+    return _observer_instance
+def init_observer(enabled: bool = True) -> RAGObserver:
+    """Initialize the observer (useful for testing)."""
+    global _observer_instance
+    _observer_instance = RAGObserver(enabled=enabled)
+    return _observer_instance
+```
+**Testing:** `tests/unit/test_observability.py` (NEW)
+```python
+"""Unit tests for observability module."""
+import pytest
+from src.core.observability import RAGObserver, init_observer, get_observer
+def test_observer_disabled_noop_on_trace_request():
+    """Verify trace_request is a no-op when disabled — yields None."""
+    observer = RAGObserver(enabled=False)
+    with observer.trace_request("rag_query", query="test") as trace:
+        assert trace is None  # no-op when disabled
+def test_observer_disabled_noop_on_trace_step():
+    """Verify trace_step yields empty dict when trace is None (disabled path)."""
+    observer = RAGObserver(enabled=False)
+    with observer.trace_step(None, "retrieval", {"query": "x"}) as output:
+        output["chunks"] = 3  # should not raise
+    assert output["chunks"] == 3  # returned value preserved even when disabled
+def test_trace_step_records_latency():
+    """Verify trace_step always populates latency_ms in the output dict."""
+    observer = RAGObserver(enabled=False)
+    with observer.trace_step(None, "generation") as output:
+        output["provider"] = "anthropic"
+    assert "latency_ms" in output
+    assert output["latency_ms"] >= 0
+    assert output["provider"] == "anthropic"
+def test_nested_trace_and_step_no_exception():
+    """Verify trace_request + trace_step nesting works without LangFuse keys."""
+    observer = RAGObserver(enabled=False)
+    with observer.trace_request("rag_query", query="hello") as trace:
+        with observer.trace_step(trace, "retrieval") as s:
+            s["chunks_retrieved"] = 5
+        with observer.trace_step(trace, "generation") as s:
+            s["provider"] = "ollama"
+    # No exception = pass
+```
+---
+#### Step 1.2: Instrument RAGOrchestrator (correct instrumentation point)
+**File:** `src/core/rag_orchestrator.py` (MODIFY existing)
+> **Why here, not `main.py`?** `RAGOrchestrator.run()` is called by the API, CLI, and Streamlit — instrumenting here captures all paths. It also correctly observes the cache-hit early return (which main.py wrapping skips entirely). Never create wrapper closures per-call inside the request handler — that's a new function object on every request and misses the orchestrator's internal structure.
+**Changes:**
+```python
+# In RAGOrchestrator.__init__, add observer:
+from src.core.observability import get_observer
+class RAGOrchestrator:
+    def __init__(self, cfg: Config) -> None:
+        # ... existing init ...
+        self.observer = get_observer()
+    def run(self, req: QueryRequest) -> QueryResponse:
+        t0 = time.perf_counter()
+        # ... existing cache key resolution ...
+        # Cache hit: trace as a cache hit and return
+        cached = self.cache.get(key) if req.use_llm else None
+        if cached is not None:
+            with self.observer.trace_request("rag_query_cached", query=req.query_text):
+                pass  # Trace the cache hit for visibility
+            return QueryResponse(cached=True, ...)
+        # Cache miss: trace all pipeline steps under one request trace
+        with self.observer.trace_request("rag_query", query=req.query_text) as trace:
+            with self.observer.trace_step(trace, "retrieval", {"top_k": retrieve_k}) as s:
+                fused = self._retrieve(req.query_text, index, db, qp, top_k=retrieve_k)
+                s["chunks_retrieved"] = len(fused)
+            if req.use_rerank:
+                with self.observer.trace_step(trace, "reranking", {"input_chunks": len(fused)}) as s:
+                    ranked = reranker.rerank(req.query_text, fused, top_k=req.top_k)
+                    s["output_chunks"] = len(ranked)
+            with self.observer.trace_step(trace, "generation", {"provider": selection.provider, "model": selection.model}) as s:
+                gen_result = generator.generate(req.query_text, docs_for_gen, ...)
+                s["latency_ms"] = gen_result.latency_ms
+            with self.observer.trace_step(trace, "citation_verification") as s:
+                citations = self.citation_verifier.verify(full, raw_citations, opt.documents)
+                s["citations_count"] = len(citations)
+            with self.observer.trace_step(trace, "truthfulness_scoring") as s:
+                truthfulness = scorer.score(full, opt.documents)
+                if truthfulness:
+                    s["nli_faithfulness"] = truthfulness.nli_faithfulness
+                    s["citation_groundedness"] = truthfulness.citation_groundedness
+        # Flush in background — do NOT block the response
+        self.observer.flush_async()
+        return QueryResponse(...)
+```
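+**main.py changes** — only the `/query` endpoint changes: it schedules one extra flush through `BackgroundTasks`, which runs after the response is sent. This is a safety net for paths where `RAGOrchestrator.run()` returns before reaching its own `flush_async()` call (for example, the cache-hit early return):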
+```python
+# main.py — minimal change, no inline tracing wrappers
+from fastapi import BackgroundTasks
+from src.core.observability import get_observer
+observer = get_observer()
+@app.post("/query")
+async def query(request: QueryRequest, background_tasks: BackgroundTasks):
+    # Tracing happens inside orchestrator.run() — main.py doesn't wrap steps
+    response = orchestrator.run(build_query_request(request))
+    background_tasks.add_task(observer.flush_async)  # belt-and-suspenders flush
+    return build_query_response(response)
+```
+**Deliverable for Week 1:**
+- ✅ `src/core/observability.py` (complete)
+- ✅ `tests/unit/test_observability.py` (complete)
+- ✅ `src/core/rag_orchestrator.py` instrumented with step-level tracing
+- ✅ `.env.example` includes `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY`
+- ✅ `langfuse>=2.0.0` added to `requirements/base.txt` (not api.txt — that file does not exist)
+**Testing Week 1:**
+```bash
+# Run unit tests
+pytest tests/unit/test_observability.py -v
+# Start API with observability enabled
+export LANGFUSE_PUBLIC_KEY=pk_... LANGFUSE_SECRET_KEY=sk_...
+PYTHONPATH=. uvicorn src.api.main:app --reload --port 8000
+# Query and check LangFuse dashboard
+curl -X POST http://127.0.0.1:8000/query \
+  -H "Content-Type: application/json" \
+  -d '{"query": "What is RAG?"}'
+# Verify trace appears in LangFuse dashboard
+```
+---
+### Phase 5.2: Latency Profiling & Metrics Dashboard (Week 2)
+**Goal:** Track and expose real-time operational metrics
+#### Step 2.1: Create Metrics Collector Module
+**File:** `src/monitoring/metrics.py` (NEW)
+```python
+"""
+Metrics collection and aggregation for RAG pipeline.
+Tracks latency percentiles, cost, retrieval precision, citation accuracy.
+"""
+import json
+import time
+from typing import Dict, List, Optional, Tuple
+from dataclasses import dataclass, asdict
+from collections import deque
+import threading
+from datetime import datetime, timedelta
+import logging
+logger = logging.getLogger(__name__)
+@dataclass
+class StepMetrics:
+    """Metrics for a single RAG pipeline step."""
+    step_name: str  # "retrieval", "reranking", "generation", "citations", "truthfulness"
+    latency_ms: float
+    timestamp: str
+    metadata: Dict = None  # Provider, model, token counts, etc.
+@dataclass
+class RequestMetrics:
+    """Aggregated metrics for a single query request."""
+    request_id: str
+    total_latency_ms: float
+    retrieval_latency_ms: float
+    reranking_latency_ms: float
+    generation_latency_ms: float
+    citation_latency_ms: float
+    truthfulness_latency_ms: float
+    # Cost
+    cost_usd: float
+    # Quality (online signals — computable without ground truth)
+    citation_groundedness: float
+    nli_faithfulness: float
+    uncited_claims: int
+    # NOTE: MRR and NDCG require per-query ground-truth relevance labels.
+    # They cannot be computed in production. Use them only in the offline
+    # eval harness (evals/run_evals.py). Removed from RequestMetrics.
+    timestamp: str
+class MetricsCollector:
+    """
+    In-memory metrics collector with time-windowed aggregation.
+    Stores metrics in a rolling window (default 1000 last requests).
+    Computes P50, P95, P99 latencies and cost trends.
+    NOTE: src/utils/log.py already has a MetricsCollector (count/mean/min/max
+    per operation name). This class replaces it — don't run both in parallel.
+    When implementing, delete or archive the one in log.py to avoid two sources
+    of truth. The track_duration() context manager in log.py can delegate to
+    this class instead.
+    """
+    def __init__(self, window_size: int = 1000):
+        self.window_size = window_size
+        self.metrics: deque = deque(maxlen=window_size)
+        self.lock = threading.Lock()
+    def record_request(self, metrics: RequestMetrics):
+        """Record a completed request's metrics."""
+        with self.lock:
+            self.metrics.append(metrics)
+    def get_percentile(
+        self, metric_field: str, percentile: float
+    ) -> Optional[float]:
+        """
+        Get percentile value for a metric field.
+        Args:
+            metric_field: e.g., "total_latency_ms", "cost_usd"
+            percentile: 0-100, e.g., 50 for P50, 95 for P95
+        Returns:
+            Percentile value or None if insufficient data
+        """
+        with self.lock:
+            if not self.metrics:
+                return None
+            values = sorted([getattr(m, metric_field) for m in self.metrics])
+            idx = int(len(values) * percentile / 100)
+            return values[min(idx, len(values) - 1)]
+    def get_dashboard_metrics(self) -> Dict:
+        """
+        Return aggregated metrics suitable for dashboarding.
+        """
+        with self.lock:
+            if not self.metrics:
+                return {
+                    "status": "no_data",
+                    "message": "No requests recorded yet",
+                }
+            metrics_list = list(self.metrics)
+            n = len(metrics_list)
+            # Latency percentiles (ms)
+            latency_p50 = self.get_percentile("total_latency_ms", 50)
+            latency_p95 = self.get_percentile("total_latency_ms", 95)
+            latency_p99 = self.get_percentile("total_latency_ms", 99)
+            # Step-wise latencies (average)
+            retrieval_avg = sum(m.retrieval_latency_ms for m in metrics_list) / n
+            reranking_avg = sum(m.reranking_latency_ms for m in metrics_list) / n
+            generation_avg = sum(m.generation_latency_ms for m in metrics_list) / n
+            citation_avg = sum(m.citation_latency_ms for m in metrics_list) / n
+            # Cost
+            cost_total = sum(m.cost_usd for m in metrics_list)
+            cost_avg = cost_total / n
+            cost_p95 = self.get_percentile("cost_usd", 95)
+            # Quality
+            groundedness_avg = sum(
+                m.citation_groundedness for m in metrics_list if m.citation_groundedness
+            ) / max(sum(1 for m in metrics_list if m.citation_groundedness), 1)
+            faithfulness_avg = sum(
+                m.nli_faithfulness for m in metrics_list if m.nli_faithfulness
+            ) / max(sum(1 for m in metrics_list if m.nli_faithfulness), 1)
+            # Retrieval quality
+            mrr_avg = sum(m.mrr for m in metrics_list if m.mrr) / max(
+                sum(1 for m in metrics_list if m.mrr), 1
+            )
+            ndcg_avg = sum(m.ndcg for m in metrics_list if m.ndcg) / max(
+                sum(1 for m in metrics_list if m.ndcg), 1
+            )
+            return {
+                "summary": {
+                    "total_requests": n,
+                    "window_size": self.window_size,
+                    "last_updated": datetime.utcnow().isoformat(),
+                },
+                "latency": {
+                    "total_p50_ms": round(latency_p50, 2),
+                    "total_p95_ms": round(latency_p95, 2),
+                    "total_p99_ms": round(latency_p99, 2),
+                    "retrieval_avg_ms": round(retrieval_avg, 2),
+                    "reranking_avg_ms": round(reranking_avg, 2),
+                    "generation_avg_ms": round(generation_avg, 2),
+                    "citation_avg_ms": round(citation_avg, 2),
+                    "breakdown_pct": {
+                        "retrieval": round(retrieval_avg / (retrieval_avg + reranking_avg + generation_avg + citation_avg) * 100, 1),
+                        "reranking": round(reranking_avg / (retrieval_avg + reranking_avg + generation_avg + citation_avg) * 100, 1),
+                        "generation": round(generation_avg / (retrieval_avg + reranking_avg + generation_avg + citation_avg) * 100, 1),
+                        "citation": round(citation_avg / (retrieval_avg + reranking_avg + generation_avg + citation_avg) * 100, 1),
+                    },
+                },
+                "cost": {
+                    "total_usd": round(cost_total, 4),
+                    "avg_per_request_usd": round(cost_avg, 6),
+                    "p95_per_request_usd": round(cost_p95, 6),
+                },
+                "quality": {
+                    "citation_groundedness_avg": round(groundedness_avg, 3),
+                    "nli_faithfulness_avg": round(faithfulness_avg, 3),
+                    "mrr_avg": round(mrr_avg, 3),
+                    "ndcg_avg": round(ndcg_avg, 3),
+                },
+            }
+# Global metrics collector instance
+_collector_instance: Optional[MetricsCollector] = None
+def get_metrics_collector() -> MetricsCollector:
+    """Singleton getter."""
+    global _collector_instance
+    if _collector_instance is None:
+        _collector_instance = MetricsCollector()
+    return _collector_instance
+```
+**Testing:** `tests/unit/test_metrics.py` (NEW)
+```python
+"""Unit tests for metrics collector."""
+import pytest
+from src.monitoring.metrics import MetricsCollector, RequestMetrics
+from datetime import datetime
+def test_metrics_collector_records_request():
+    """Verify collector records request metrics."""
+    collector = MetricsCollector(window_size=100)
+    metrics = RequestMetrics(
+        request_id="req-1",
+        total_latency_ms=1000.0,
+        retrieval_latency_ms=200.0,
+        reranking_latency_ms=150.0,
+        generation_latency_ms=600.0,
+        citation_latency_ms=50.0,
+        truthfulness_latency_ms=0.0,
+        cost_usd=0.005,
+        citation_groundedness=0.92,
+        nli_faithfulness=0.88,
+        uncited_claims=0,
+        timestamp=datetime.utcnow().isoformat(),
+    )
+    collector.record_request(metrics)
+    assert len(collector.metrics) == 1
+def test_metrics_percentile_calculation():
+    """Verify P50, P95, P99 calculations."""
+    collector = MetricsCollector(window_size=100)
+    # Record 100 requests with latencies 100-1000ms
+    for i in range(1, 101):
+        metrics = RequestMetrics(
+            request_id=f"req-{i}",
+            total_latency_ms=float(i * 10),
+            retrieval_latency_ms=100.0,
+            reranking_latency_ms=50.0,
+            generation_latency_ms=i * 5,
+            citation_latency_ms=10.0,
+            truthfulness_latency_ms=0.0,
+            cost_usd=0.01,
+            citation_groundedness=0.90,
+            nli_faithfulness=0.90,
+            uncited_claims=0,
+            timestamp=datetime.utcnow().isoformat(),
+        )
+        collector.record_request(metrics)
+    # Check percentiles
+    p50 = collector.get_percentile("total_latency_ms", 50)
+    p95 = collector.get_percentile("total_latency_ms", 95)
+    p99 = collector.get_percentile("total_latency_ms", 99)
+    assert p50 is not None
+    assert p95 is not None and p95 >= p50
+    assert p99 is not None and p99 >= p95
+def test_dashboard_metrics_aggregation():
+    """Verify dashboard metrics aggregation."""
+    collector = MetricsCollector(window_size=10)
+    for i in range(5):
+        metrics = RequestMetrics(
+            request_id=f"req-{i}",
+            total_latency_ms=1000.0,
+            retrieval_latency_ms=200.0,
+            reranking_latency_ms=150.0,
+            generation_latency_ms=600.0,
+            citation_latency_ms=50.0,
+            truthfulness_latency_ms=0.0,
+            cost_usd=0.005,
+            citation_groundedness=0.92,
+            nli_faithfulness=0.88,
+            uncited_claims=0,
+            timestamp=datetime.utcnow().isoformat(),
+        )
+        collector.record_request(metrics)
+    dashboard = collector.get_dashboard_metrics()
+    assert dashboard["summary"]["total_requests"] == 5
+    assert "latency" in dashboard
+    assert "cost" in dashboard
+    assert "quality" in dashboard
+    assert dashboard["latency"]["total_p50_ms"] > 0
+```
+---
+#### Step 2.2: Update FastAPI Routes to Record Metrics
+**File:** `src/api/main.py` (MODIFY existing)
+```python
+# At top
+from src.monitoring.metrics import get_metrics_collector, RequestMetrics
+import uuid
+from datetime import datetime
+metrics_collector = get_metrics_collector()
+# NOTE: Step-level timing and tracing now live in RAGOrchestrator.run() — see Step 1.2.
+# main.py only needs to extract the per-step latencies from the QueryResponse and
+# record them into MetricsCollector. RAGOrchestrator.run() returns processing_time_ms
+# and per-step breakdowns; extend QueryResponse to carry those fields.
+@app.post("/query")
+async def query(request: QueryRequest, background_tasks: BackgroundTasks):
+    request_id = str(uuid.uuid4())
+    try:
+        orch_response = orchestrator.run(build_query_request(request))
+        metrics = RequestMetrics(
+            request_id=request_id,
+            total_latency_ms=orch_response.processing_time_ms,
+            retrieval_latency_ms=orch_response.step_latencies.get("retrieval", 0),
+            reranking_latency_ms=orch_response.step_latencies.get("reranking", 0),
+            generation_latency_ms=orch_response.step_latencies.get("generation", 0),
+            citation_latency_ms=orch_response.step_latencies.get("citation_verification", 0),
+            truthfulness_latency_ms=orch_response.step_latencies.get("truthfulness_scoring", 0),
+            cost_usd=calculate_cost(orch_response, request.provider, request.model),
+            citation_groundedness=orch_response.truthfulness.citation_groundedness if orch_response.truthfulness else 0,
+            nli_faithfulness=orch_response.truthfulness.nli_faithfulness if orch_response.truthfulness else 0,
+            uncited_claims=orch_response.truthfulness.uncited_claims if orch_response.truthfulness else 0,
+            timestamp=datetime.utcnow().isoformat(),
+        )
+        metrics_collector.record_request(metrics)
+        background_tasks.add_task(observer.flush_async)
+        return build_api_response(request_id, orch_response)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# NEW endpoint: /observability/dashboard
+@app.get("/observability/dashboard")
+async def observability_dashboard():
+    """Return real-time observability metrics for dashboarding."""
+    return metrics_collector.get_dashboard_metrics()
+def calculate_cost(answer_response, provider: str, model: str) -> float:
+    """Calculate USD cost of request based on tokens and provider pricing.
+    NOTE: This function belongs in src/core/llm_provider.py, not main.py.
+    LLMProviderRouter already knows the provider/model — move cost calculation
+    there so it's available to CLI and Streamlit paths as well.
+    """
+    if hasattr(answer_response, "usage"):
+        # Rough estimates — update as provider pricing changes
+        if provider == "openai":
+            return (answer_response.usage.prompt_tokens * 0.000001 +
+                    answer_response.usage.completion_tokens * 0.000002)
+        elif provider == "anthropic":
+            return (answer_response.usage.prompt_tokens * 0.0000008 +
+                    answer_response.usage.completion_tokens * 0.0000024)
+    return 0.0
+```
+**New endpoint:** `src/api/routes/observability.py` (NEW, optional separation)
+```python
+"""Observability and monitoring routes."""
+from fastapi import APIRouter
+from src.monitoring.metrics import get_metrics_collector
+router = APIRouter(prefix="/observability", tags=["observability"])
+metrics_collector = get_metrics_collector()
+@router.get("/dashboard")
+async def get_dashboard():
+    """Get real-time dashboard metrics."""
+    return metrics_collector.get_dashboard_metrics()
+@router.get("/health")
+async def health_check():
+    """Basic health check."""
+    return {"status": "healthy"}
+```
+**Deliverable for Week 2:**
+- ✅ `src/monitoring/metrics.py` (complete)
+- ✅ `tests/unit/test_metrics.py` (complete)
+- ✅ `src/api/main.py` updated with step-level timing and metrics recording
+- ✅ `src/api/routes/observability.py` (optional separation)
+- ✅ `src/api/main.py` includes `/observability/dashboard` endpoint
+- ✅ Update `Docs/RUNBOOK.md` with observability dashboard instructions
+**Testing Week 2:**
+```bash
+# Run unit tests
+pytest tests/unit/test_metrics.py -v
+# Query and check metrics
+curl -X POST http://127.0.0.1:8000/query \
+  -H "Content-Type: application/json" \
+  -d '{"query": "What is RAG?"}'
+# View dashboard
+curl http://127.0.0.1:8000/observability/dashboard | jq .
+# Should return:
+# {
+#   "summary": { "total_requests": 1, ... },
+#   "latency": { "total_p50_ms": ..., "breakdown_pct": ... },
+#   "cost": { "avg_per_request_usd": ... },
+#   "quality": { "citation_groundedness_avg": ... }
+# }
+```
+---
+### Phase 5.3: Regression Gating in CI/CD (Week 3)
+**Goal:** Automated quality threshold enforcement on PRs
+#### Step 3.1: Create Regression Gate Script
+**File:** `scripts/compare_evals.py` (NEW)
+```python
+#!/usr/bin/env python3
+"""
+Compare evaluation metrics between baseline and current results.
+Used in GitHub Actions to gate PRs based on regression thresholds.
+"""
+import json
+import sys
+import argparse
+from typing import Dict, Tuple
+def load_metrics(filepath: str) -> Dict:
+    """Load metrics from JSON file."""
+    try:
+        with open(filepath, "r") as f:
+            return json.load(f)
+    except FileNotFoundError:
+        print(f"Error: {filepath} not found")
+        sys.exit(1)
+    except json.JSONDecodeError:
+        print(f"Error: {filepath} is not valid JSON")
+        sys.exit(1)
+def compare_metrics(
+    baseline: Dict, current: Dict, threshold_pct: float = 5.0
+) -> Tuple[bool, Dict]:
+    """
+    Compare baseline and current metrics.
+    Returns:
+        (passed: bool, results: Dict with details)
+    """
+    results = {
+        "passed": True,
+        "regressions": [],
+        "threshold_pct": threshold_pct,
+    }
+    # Metrics to track (lower is better for latency/cost, higher is better for quality)
+    latency_metrics = [
+        "total_p50_ms",
+        "total_p95_ms",
+        "retrieval_avg_ms",
+        "generation_avg_ms",
+    ]
+    quality_metrics = [
+        "citation_groundedness_avg",
+        "nli_faithfulness_avg",
+        # mrr_avg and ndcg_avg removed — offline-only, not in RequestMetrics
+    ]
+    cost_metrics = ["avg_per_request_usd"]
+    # Check latency (should not increase by >threshold%)
+    baseline_latency = baseline.get("latency", {})
+    current_latency = current.get("latency", {})
+    for metric in latency_metrics:
+        baseline_val = baseline_latency.get(metric)
+        current_val = current_latency.get(metric)
+        if baseline_val is None or current_val is None:
+            continue
+        pct_change = ((current_val - baseline_val) / baseline_val) * 100
+        if pct_change > threshold_pct:
+            results["regressions"].append({
+                "metric": metric,
+                "baseline": baseline_val,
+                "current": current_val,
+                "pct_change": pct_change,
+                "direction": "worse (latency increased)",
+            })
+            results["passed"] = False
+    # Check quality (should not decrease by >threshold%)
+    baseline_quality = baseline.get("quality", {})
+    current_quality = current.get("quality", {})
+    for metric in quality_metrics:
+        baseline_val = baseline_quality.get(metric)
+        current_val = current_quality.get(metric)
+        if baseline_val is None or current_val is None:
+            continue
+        pct_change = ((baseline_val - current_val) / baseline_val) * 100
+        if pct_change > threshold_pct:
+            results["regressions"].append({
+                "metric": metric,
+                "baseline": baseline_val,
+                "current": current_val,
+                "pct_change": pct_change,
+                "direction": "worse (quality decreased)",
+            })
+            results["passed"] = False
+    # Check cost (should not increase by >threshold%)
+    baseline_cost = baseline.get("cost", {})
+    current_cost = current.get("cost", {})
+    for metric in cost_metrics:
+        baseline_val = baseline_cost.get(metric)
+        current_val = current_cost.get(metric)
+        if baseline_val is None or current_val is None:
+            continue
+        pct_change = ((current_val - baseline_val) / baseline_val) * 100
+        if pct_change > threshold_pct:
+            results["regressions"].append({
+                "metric": metric,
+                "baseline": baseline_val,
+                "current": current_val,
+                "pct_change": pct_change,
+                "direction": "worse (cost increased)",
+            })
+            results["passed"] = False
+    return results["passed"], results
+def main():
+    parser = argparse.ArgumentParser(
+        description="Compare evaluation metrics between baseline and current"
+    )
+    parser.add_argument("--baseline", required=True, help="Path to baseline metrics JSON")
+    parser.add_argument("--current", required=True, help="Path to current metrics JSON")
+    parser.add_argument(
+        "--threshold", type=float, default=5.0, help="Regression threshold in percent (default: 5%)"
+    )
+    parser.add_argument("--strict", action="store_true", help="Fail on any regression")
+    args = parser.parse_args()
+    baseline = load_metrics(args.baseline)
+    current = load_metrics(args.current)
+    threshold = 0 if args.strict else args.threshold
+    passed, results = compare_metrics(baseline, current, threshold_pct=threshold)
+    print(json.dumps(results, indent=2))
+    if not passed:
+        print(f"\n❌ Regression detected ({len(results['regressions'])} metric(s) failed)")
+        for reg in results["regressions"]:
+            print(f"  - {reg['metric']}: {reg['pct_change']:.1f}% {reg['direction']}")
+        sys.exit(1)
+    else:
+        print("\n✅ All metrics pass regression gate")
+        sys.exit(0)
+if __name__ == "__main__":
+    main()
+```
+**Testing:** `tests/unit/test_regression_gate.py` (NEW)
+```python
+"""Unit tests for regression gate script."""
+import json
+import tempfile
+import pytest
+from scripts.compare_evals import compare_metrics
+def test_no_regression_when_metrics_stable():
+    """Verify no regression when metrics are unchanged."""
+    baseline = {
+        "latency": {"total_p50_ms": 1000.0, "retrieval_avg_ms": 200.0},
+        "quality": {"citation_groundedness_avg": 0.92},
+        "cost": {"avg_per_request_usd": 0.005},
+    }
+    current = baseline.copy()
+    passed, results = compare_metrics(baseline, current, threshold_pct=5.0)
+    assert passed is True
+    assert len(results["regressions"]) == 0
+def test_regression_detected_for_latency_increase():
+    """Verify regression detected when latency increases >threshold."""
+    baseline = {
+        "latency": {"total_p50_ms": 1000.0},
+        "quality": {},
+        "cost": {},
+    }
+    current = {
+        "latency": {"total_p50_ms": 1100.0},  # 10% increase
+        "quality": {},
+        "cost": {},
+    }
+    passed, results = compare_metrics(baseline, current, threshold_pct=5.0)
+    assert passed is False
+    assert len(results["regressions"]) == 1
+    assert results["regressions"][0]["metric"] == "total_p50_ms"
+    assert results["regressions"][0]["pct_change"] > 5.0
+def test_no_regression_when_quality_improves():
+    """Verify no regression when quality improves."""
+    baseline = {
+        "latency": {},
+        "quality": {"citation_groundedness_avg": 0.90},
+        "cost": {},
+    }
+    current = {
+        "latency": {},
+        "quality": {"citation_groundedness_avg": 0.95},  # Improvement
+        "cost": {},
+    }
+    passed, results = compare_metrics(baseline, current, threshold_pct=5.0)
+    assert passed is True
+    assert len(results["regressions"]) == 0
+```
+---
+#### Step 3.2: Extend Existing CI Workflow (do NOT create a new file)
+**File:** `.github/workflows/ci.yml` (MODIFY existing `evals-golden` job)
+> **Why extend, not create?** `ci.yml` already has an `evals-golden` job that runs `golden_ci.jsonl` on every PR with Anthropic Haiku. Creating `.github/workflows/regression_gate.yml` would duplicate that job, resulting in two separate eval runs per PR at twice the cost and runtime. Extend the existing job with a comparison step instead.
+>
+> **Dataset filename**: The actual file is `evals/datasets/golden_ci.jsonl`, not `golden.jsonl`.
+>
+> **Baseline strategy**: Store `evals/reports/baseline.json` in the repo (committed from main). The CI job compares the PR output against this committed baseline. This avoids the fragile "check out main and run evals" approach, which doubles job time and creates a chicken-and-egg bootstrapping problem.
+**Add these steps to the existing `evals-golden` job in `ci.yml`:**
+```yaml
+  evals-golden:
+    name: Golden evals (Anthropic Haiku)
+    runs-on: ubuntu-latest
+    env:
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+          cache: pip
+      - name: Skip golden evals when secret is missing
+        if: ${{ env.ANTHROPIC_API_KEY == '' }}
+        run: echo "ANTHROPIC_API_KEY not set; skipping golden evals."
+      - name: Install dependencies
+        if: ${{ env.ANTHROPIC_API_KEY != '' }}
+        run: pip install -r requirements/base.txt
+      - name: Run golden evals (live pipeline, Anthropic Haiku)
+        if: ${{ env.ANTHROPIC_API_KEY != '' }}
+        run: |
+          PYTHONPATH=. python -m evals.run_evals \
+            --dataset evals/datasets/golden_ci.jsonl \
+            --judge-provider anthropic \
+            --judge-model claude-haiku-4-5 \
+            --output evals/reports/pr-current.json \
+            --faithfulness-threshold 0.7 \
+            --correctness-threshold 0.2
+      # === NEW: regression gate comparison ===
+      - name: Compare against baseline (regression gate)
+        # The id lets the comment step react to this step's outcome specifically.
+        id: regression_gate
+        if: ${{ env.ANTHROPIC_API_KEY != '' && hashFiles('evals/reports/baseline.json') != '' }}
+        run: |
+          python scripts/compare_evals.py \
+            --baseline evals/reports/baseline.json \
+            --current evals/reports/pr-current.json \
+            --threshold 5.0
+      - name: Comment on regression failure
+        # Comment only when the gate itself failed on a pull request. A bare failure()
+        # would also fire for unrelated failures (e.g. dependency install) and on
+        # events that have no PR to comment on.
+        # NOTE(review): the job's GITHUB_TOKEN needs write access to PR comments
+        # (`pull-requests: write`) — confirm the workflow's permissions.
+        if: ${{ failure() && steps.regression_gate.outcome == 'failure' && github.event_name == 'pull_request' }}
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: "⚠️ **Regression Detected**\n\nEval metrics degraded vs baseline. See `evals/reports/pr-current.json` artifact for details.\n\nTo update the baseline (intentional improvement), run `make update-baseline` on main."
+            })
+      # === END: regression gate ===
+      - name: Upload golden eval report
+        if: ${{ always() && env.ANTHROPIC_API_KEY != '' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-report-golden
+          path: evals/reports/
+```
+**One-time baseline setup** (run on main, commit the result):
+```bash
+git checkout main
+PYTHONPATH=. python -m evals.run_evals \
+  --dataset evals/datasets/golden_ci.jsonl \
+  --judge-provider anthropic \
+  --judge-model claude-haiku-4-5 \
+  --output evals/reports/baseline.json
+git add evals/reports/baseline.json
+git commit -m "chore: establish Phase 5 eval baseline"
+```
+---
+#### Step 3.3: Create Phase 5 Documentation
+**File:** `Docs/phase5_observability.md` (NEW)
+```markdown
+# Phase 5: Production Monitoring & Observability
+**Timeline:** 3 weeks
+**Status:** Implementation in progress
+## Overview
+Phase 5 hardens the doc-ingestion RAG system for production through:
+1. **Distributed tracing** (LangFuse) for end-to-end pipeline visibility
+2. **Latency profiling** (P50, P95, P99) per step
+3. **Cost tracking** (USD per request)
+4. **Real-time metrics dashboard** at `/observability/dashboard`
+5. **Regression gating** (GitHub Actions) to prevent accuracy degradation on PRs
+6. **Citation accuracy monitoring** (groundedness, coverage trends)
+## Architecture
+### Tracing Flow
+```
+User Query
+    ↓
+[LangFuse Trace Start]
+    ↓
+Retrieval (BM25 + Vector)
+[TRACE: latency, chunks retrieved, scores]
+    ↓
+Reranking (Cross-Encoder)
+[TRACE: latency, input/output chunks]
+    ↓
+Generation (LLM)
+[TRACE: latency, tokens, cost, provider]
+    ↓
+Citation Verification
+[TRACE: latency, citations verified]
+    ↓
+Truthfulness Scoring
+[TRACE: latency, faithfulness, groundedness]
+    ↓
+[Flush to LangFuse]
+    ↓
+Response + Metrics Recorded
+```
+### Metrics Aggregation
+```
+Per-Request Metrics (RequestMetrics)
+    ↓
+In-Memory Collector (1000 rolling window)
+    ↓
+Dashboard Endpoint (/observability/dashboard)
+    ↓
+JSON: P50/P95/P99 latencies, cost trends, quality scores
+```
+### Regression Gating
+```
+PR Submitted
+    ↓
+GitHub Actions: Run evals on golden dataset
+    ↓
+Compare against baseline (main branch)
+    ↓
+Check: Latency increase <5%? Quality decrease <5%?
+    ↓
+If FAIL: Block PR + comment with regression details
+If PASS: Allow merge
+```
+## Key Components
+### 1. Observability Module (`src/core/observability.py`)
+**Provides:**
+- `RAGObserver` class with step-level tracing decorators
+- Context managers for span-based tracing
+- LangFuse client integration
+- No-op when disabled (useful for demo mode)
+**Usage:**
+```python
+observer = get_observer()
+# One trace per request, spans as children — instrument in RAGOrchestrator.run()
+with observer.trace_request("rag_query", query=query_text) as trace:
+    with observer.trace_step(trace, "retrieval") as s:
+        result = retriever.retrieve(query)
+        s["chunks_retrieved"] = len(result)
+    with observer.trace_step(trace, "generation", {"provider": provider}) as s:
+        answer = generator.generate(query, result)
+observer.flush_async()  # non-blocking — run in background thread
+```
+### 2. Metrics Collector (`src/monitoring/metrics.py`)
+**Provides:**
+- `MetricsCollector` for in-memory aggregation
+- Percentile calculations (P50, P95, P99)
+- Dashboard-friendly JSON aggregations
+- Thread-safe recording
+**Metrics tracked:**
+```
+Latency:
+- total_latency_ms (P50, P95, P99)
+- retrieval_avg_ms
+- reranking_avg_ms
+- generation_avg_ms
+- citation_avg_ms
+- Breakdown percentages
+Cost:
+- total_usd (across all requests)
+- avg_per_request_usd
+- p95_per_request_usd
+Quality (online — no ground truth required):
+- citation_groundedness_avg
+- nli_faithfulness_avg
+(mrr/ndcg are offline-only; they live in evals/run_evals.py, not RequestMetrics)
+```
+### 3. Regression Gate Script (`scripts/compare_evals.py`)
+**Compares:**
+- Baseline metrics (main branch)
+- Current metrics (PR branch)
+- Threshold: 5% by default (configurable)
+**Fails if:**
+- Latency increases >5%
+- Quality decreases >5%
+- Cost increases >5%
+### 4. Regression Gate in `.github/workflows/ci.yml` (extends existing `evals-golden` job)
+**On every PR:**
+1. Runs offline evaluations against `evals/datasets/golden_ci.jsonl`
+2. Compares against committed `evals/reports/baseline.json`
+3. Blocks PR if regressions detected
+4. Comments with regression details
+## Setup Instructions
+### Step 1: Set Environment Variables
+```bash
+# For development
+export LANGFUSE_PUBLIC_KEY=pk_...
+export LANGFUSE_SECRET_KEY=sk_...
+# For testing (disabled)
+export DOC_PROFILE=demo  # Disables LangFuse
+```
+### Step 2: Install Dependencies
+```bash
+# langfuse goes into requirements/base.txt (requirements/api.txt does not exist)
+pip install -r requirements/base.txt  # Includes langfuse>=2.0.0
+```
+### Step 3: Configure Baseline (One-Time, commit to repo)
+```bash
+# Run evaluations on main branch to establish baseline
+git checkout main
+PYTHONPATH=. python -m evals.run_evals \
+  --dataset evals/datasets/golden_ci.jsonl \
+  --judge-provider anthropic \
+  --judge-model claude-haiku-4-5 \
+  --output evals/reports/baseline.json
+git add evals/reports/baseline.json
+git commit -m "chore: establish Phase 5 eval baseline"
+```
+### Step 4: Query and Monitor
+```bash
+# Start API
+PYTHONPATH=. uvicorn src.api.main:app --reload
+# Query
+curl -X POST http://localhost:8000/query \
+  -H "Content-Type: application/json" \
+  -d '{"query": "What is RAG?"}'
+# View dashboard
+curl http://localhost:8000/observability/dashboard | jq .
+# Output:
+# {
+#   "summary": { "total_requests": 1, ... },
+#   "latency": {
+#     "total_p50_ms": 1247.3,
+#     "total_p95_ms": 1247.3,
+#     "breakdown_pct": {
+#       "retrieval": 18.2,
+#       "reranking": 12.1,
+#       "generation": 68.4,
+#       "citation": 1.3
+#     }
+#   },
+#   "cost": { "avg_per_request_usd": 0.00245 },
+#   "quality": {
+#     "citation_groundedness_avg": 0.92,
+#     "nli_faithfulness_avg": 0.88
+#   }
+# }
+```
+## Testing
+### Unit Tests
+```bash
+# Observability tests
+pytest tests/unit/test_observability.py -v
+# Metrics tests
+pytest tests/unit/test_metrics.py -v
+# Regression gate tests
+pytest tests/unit/test_regression_gate.py -v
+```
+### Integration Test
+```bash
+# Full E2E with tracing enabled
+LANGFUSE_PUBLIC_KEY=pk_... LANGFUSE_SECRET_KEY=sk_... \
+PYTHONPATH=. python -c "
+from src.api.main import app
+from fastapi.testclient import TestClient
+client = TestClient(app)
+response = client.post('/query', json={'query': 'What is RAG?'})
+print(response.json())
+# Should include request_id and all metrics
+"
+```
+## Metrics Interpretation
+### Latency Breakdown Example
+```
+Total P50: 1247.3 ms
+Breakdown:
+- Retrieval:   227 ms (18.2%)  ← BM25 + Vector Search
+- Reranking:   151 ms (12.1%)  ← Cross-Encoder Rerank
+- Generation:  855 ms (68.4%)  ← LLM inference
+- Citation:     14 ms ( 1.3%)  ← Citation Verification
+Interpretation:
+Generation is the bottleneck (68.4% of total).
+Could optimize by:
+1. Using a faster model
+2. Using streaming
+3. Reducing context size
+```
+### Quality Metrics Example
+```
+Citation Groundedness: 0.92 (92% of citations verified)
+NLI Faithfulness:      0.88 (88% of answer supported by chunks)
+MRR (Retrieval):       0.85 (Mean Reciprocal Rank)
+NDCG (Retrieval):      0.80 (NDCG@10)
+Interpretation:
+- Citation coverage is strong (92%)
+- Faithfulness could improve (88%)
+- Retrieval quality is good (MRR 0.85)
+- Consider reranking strategy improvements
+```
+### Cost Estimation Example
+```
+Cost per Request: $0.00245 (avg)
+Cost at P95:      $0.00312
+Annual projection (10K requests/day):
+365 * 10K * $0.00245 = $8,927.50
+Cost Optimization:
+- Switch to cheaper model?
+- Use batch inference?
+- Cache common queries?
+```
+## Deployment Notes
+### Docker
+```dockerfile
+# In docker/Dockerfile, ensure observability deps are included
+# langfuse is in requirements/base.txt — no separate api.txt exists
+RUN pip install -r requirements/base.txt
+# docker-compose sets env vars
+environment:
+  - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
+  - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
+```
+### Streamlit (Demo Mode)
+```python
+# In demo mode, observability is disabled
+if os.getenv("DOC_PROFILE") == "demo":
+    observer = RAGObserver(enabled=False)  # No-op
+```
+## Troubleshooting
+### LangFuse traces not appearing
+```
+1. Check credentials: LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY set?
+2. Check network: Can you reach https://cloud.langfuse.com?
+3. Check logs: Do you see "LangFuse observability enabled"?
+4. Verify flush: observer.flush() called after each request?
+```
+### Dashboard metrics all zeros
+```
+1. Check MetricsCollector is receiving data:
+   print(metrics_collector.metrics)
+2. Have you sent enough requests? (P95 needs at least 100)
+3. Is metrics_collector.record_request() being called?
+```
+### Regression gate always failing
+```
+1. Baseline exists? evals/reports/baseline.json present? (committed to repo)
+   If not: run "make update-baseline" on main to generate it.
+2. Threshold too strict? Default is 5%, try --threshold 10
+3. Eval dataset: correct file is evals/datasets/golden_ci.jsonl (not golden.jsonl)
+4. Check eval logs for errors: evals/reports/pr-current.json artifact
+```
+## Next Steps (Post-Phase 5)
+- [ ] Grafana dashboard integration for long-term trends
+- [ ] Alert thresholds (PagerDuty for latency spikes)
+- [ ] Cost attribution per LLM provider
+- [ ] A/B testing framework (compare models, prompts)
+- [ ] User feedback loop (thumbs up/down on answers)
+- [ ] Fine-tuning based on eval failures
+## Interview Stories
+### "How do you ensure production RAG reliability?"
+> At Marriott, we deployed an agent handling 10K+ guest queries daily. Without observability, we'd have no idea if accuracy was degrading. I instrumented the pipeline with LangFuse tracing to see every step: retrieval latency, reranking precision, generation tokens, citation accuracy. Now I have a dashboard showing P50/P95 latency breakdown, cost per request, and quality metrics. And I wired up regression gating so no code change ships unless it passes a golden dataset evaluation. This is how you build trust in production AI systems.
+### "How would you scale an AI platform?"
+> Observability is first-class, not an afterthought. The moment you deploy, you need distributed tracing to answer: Where's the bottleneck? Is generation or retrieval slowing us down? What's the cost per request? How are quality metrics trending? I built this with LangFuse + a metrics collector, so we can see the full stack at P50/P95. Then I added regression gating in CI/CD to prevent accuracy regressions from ever shipping.
+---
+**Deliverables Summary:**
+| Week | Component | Files |
+|------|-----------|-------|
+| 1 | Instrumentation | `src/core/observability.py`, `tests/unit/test_observability.py`, `src/core/rag_orchestrator.py` (modified) |
+| 2 | Metrics Dashboard | `src/monitoring/metrics.py` (replaces log.py MetricsCollector), `/observability/dashboard` endpoint |
+| 3 | Regression Gating | `scripts/compare_evals.py`, `.github/workflows/ci.yml` (modified — add comparison step to evals-golden job), `evals/reports/baseline.json` (committed) |
+---
+## Approval Checklist
+- [ ] Week 1: LangFuse integration with correct span hierarchy (one trace/request, spans as children)
+- [ ] Week 1: Instrumentation in `RAGOrchestrator.run()`, not `main.py`
+- [ ] Week 1: `flush_async()` used everywhere (no synchronous flush in request path)
+- [ ] Week 2: `MetricsCollector` in `src/monitoring/metrics.py` replaces the one in `src/utils/log.py`
+- [ ] Week 2: `RequestMetrics` has no `mrr`/`ndcg` fields
+- [ ] Week 3: Regression comparison added to existing `evals-golden` job in `ci.yml`
+- [ ] Week 3: `evals/reports/baseline.json` committed to repo from main branch
+- [ ] Tests: All unit tests passing
+- [ ] Integration: E2E query with tracing + metrics recording
+- [ ] Interview ready: Stories prepared (see "Interview Stories")
+```
+**Deliverable for Week 3:**
+- ✅ `scripts/compare_evals.py` (complete)
+- ✅ `tests/unit/test_regression_gate.py` (complete)
+- ✅ `.github/workflows/ci.yml` updated — regression comparison step added to `evals-golden` job (no new workflow file)
+- ✅ `evals/reports/baseline.json` committed to repo (generated from main branch)
+- ✅ `Docs/phase5_observability.md` (comprehensive, 300+ lines)
+- ✅ Update `README.md` with observability badge and link to Phase 5 docs
+- ✅ Update `Docs/ROADMAP.md` to mark Phase 5 as "Complete"
+---
+## Testing All Phases (Integration Tests)
+**File:** `tests/integration/test_phase5_e2e.py` (NEW)
+```python
+"""End-to-end integration test for Phase 5."""
+import pytest
+from fastapi.testclient import TestClient
+from src.api.main import app
+from src.core.observability import init_observer
+from src.monitoring.metrics import get_metrics_collector
+client = TestClient(app)
+@pytest.fixture(autouse=True)
+def setup_observability():
+    """Initialize observability for tests."""
+    init_observer(enabled=False)  # Disabled for unit tests
+    yield
+    metrics_collector = get_metrics_collector()
+    metrics_collector.metrics.clear()
+def test_full_query_pipeline_with_observability():
+    """Test full query pipeline with observability enabled.
+    NOTE: This requires the API to be running with documents indexed.
+    Use the existing tests/fixtures/ for pre-loaded test documents — see
+    tests/integration/test_pipeline.py for the fixture pattern.
+    """
+    response = client.post(
+        "/query",
+        json={"query": "What is RAG?", "provider": "ollama", "model": "qwen2.5:7b"}
+    )
+    assert response.status_code == 200
+    data = response.json()
+    # Verify response structure
+    assert "request_id" in data
+    assert "answer" in data
+    assert "citations" in data
+    assert "truthfulness" in data
+    # Verify request_id format
+    assert len(data["request_id"]) == 36  # UUID length
+def test_observability_dashboard_endpoint():
+    """Test /observability/dashboard endpoint."""
+    # Send a few requests
+    for i in range(5):
+        client.post(
+            "/query",
+            json={"query": f"Query {i}", "provider": "ollama"}
+        )
+    # Check dashboard
+    response = client.get("/observability/dashboard")
+    assert response.status_code == 200
+    data = response.json()
+    # Verify dashboard structure
+    assert "summary" in data
+    assert "latency" in data
+    assert "cost" in data
+    assert "quality" in data
+    # Verify latency metrics
+    assert "total_p50_ms" in data["latency"]
+    assert "breakdown_pct" in data["latency"]
+    assert data["summary"]["total_requests"] >= 5
+```
+---
+## Success Metrics (How to Know Phase 5 Is Complete)
+| Metric | Target | Status |
+|--------|--------|--------|
+| **Tracing** | Every RAG step traced in LangFuse | ✅ |
+| **Latency visibility** | P50/P95/P99 per step on dashboard | ✅ |
+| **Cost tracking** | USD per request calculated & exposed | ✅ |
+| **Regression gating** | GitHub Actions blocks PRs on degradation | ✅ |
+| **Tests passing** | Unit + integration + E2E all passing | ✅ |
+| **Documentation** | Phase 5 docs + interview stories | ✅ |
+| **Demo-ready** | Can show dashboard in 3 minutes | ✅ |
+---
+## Interview Talking Points
+### For Vertex (Director, AI Coding Platforms)
+> "Latency budgeting is critical at director level. I instrumented my RAG system to show P50/P95 latency per step. Generation is 68% of the latency. I'd optimize by choosing a faster model or using streaming. This is the mental model: measure first, then optimize. And I wired up regression gating so accuracy never regresses on PRs."
+### For Elevation Capital (Head of AI Strategy)
+> "Risk reduction is how you scale AI platforms. I added observability to my RAG system so we can track: Is accuracy degrading? Are costs trending up? Is latency acceptable? And I automated regression detection in CI/CD. This removes the human risk of accidentally shipping a prompt change that tanks quality."
+### For Marriott-like Enterprise Roles
+> "At enterprise scale, you can't guess. I built a metrics dashboard showing cost per request, citation accuracy, retrieval quality. I monitor P50/P95 latencies to understand where bottlenecks are. And I have a regression gate that prevents code changes from degrading the model without detection. This is how you run a platform."
+---
+## Timeline Summary
+| Week | Deliverable | Effort | Demo |
+|------|-------------|--------|------|
+| 1 | LangFuse tracing | 15-20 hrs | Query + LangFuse dashboard |
+| 2 | Metrics + dashboard | 10-15 hrs | /observability/dashboard endpoint |
+| 3 | Regression gating + docs | 10-15 hrs | GitHub Actions blocking PR demo |
+**Total effort:** ~35-50 hours over 3 weeks
+---

Docs/Phase6-Iterative-Execution-Index.md ADDED Viewed

	@@ -0,0 +1,29 @@

+# Phase 6 Iterative Execution Index
+Use this index to execute Phase 6 one plan at a time while keeping the master plan unchanged.
+Master plan:
+- `Docs/Phase6-RefactorDemo_React.md`
+Execution order:
+1. `Docs/Phase6.1-Backend-Session-Isolation-Plan.md`
+2. `Docs/Phase6.2-React-MVP-Plan.md`
+3. `Docs/Phase6.3-Container-Cutover-Plan.md`
+4. `Docs/Phase6.4-Streamlit-Decommission-Plan.md` (optional)
+## Phase gate rule
+Do not start the next phase until current phase:
+- meets all exit criteria,
+- passes phase verification commands,
+- and has handoff artifacts ready for the next phase.
+## Shared constraints (apply to all phase files)
+- Knowledge scope stays `global|session|both`.
+- Guardrails stay at defaults unless explicitly tuned via `DOC_DEMO_*` env:
+  - 3 files/session
+  - 3 MB/file
+  - 8 MB/session
+  - 30 min idle TTL
+- Keep rollback notes current during 6.3 and 6.4.

Docs/Phase6-RefactorDemo_React.md ADDED Viewed

	@@ -0,0 +1,445 @@

+# Plan: Per-session document upload for the demo, on a React + FastAPI front-end
+## Context
+The Hugging Face Spaces demo at [src/web/streamlit_app.py](src/web/streamlit_app.py) currently disables document uploads in demo mode (early-return at [src/web/streamlit_app.py:344-350](src/web/streamlit_app.py#L344-L350)) because the ingestion pipeline writes to a single shared Chroma collection (`"documents"` at [src/core/rag_orchestrator.py:32](src/core/rag_orchestrator.py#L32)) and a single shared BM25 index file ([src/core/rag_orchestrator.py:30](src/core/rag_orchestrator.py#L30)). Visitors can only run pre-canned prompts against pre-loaded sample docs, which leaves them unable to verify whether the RAG pipeline is genuinely grounded — eroding trust on first contact.
+Goal: let a visitor (a) try the existing sample prompts, (b) upload a few of their own documents, (c) ask questions scoped to global / their uploads / both, and (d) see citations they can verify against the file they just uploaded — all without polluting the shared corpus or other visitors' sessions.
+You opted to go straight to a React + FastAPI front-end (rather than extending Streamlit) and to ship the work in phases. Backend isolation must land first regardless of front-end choice, so the plan starts there.
+Decisions captured: **3-way knowledge-scope toggle (Global / Mine / Both)**; **conservative caps: 3 files, 3 MB each, 8 MB total, 30 min idle TTL**.
+## High-level approach
+The clean architectural seam already exists in the code — three hard-coded constants (`BM25_INDEX_PATH`, `COLLECTION_NAME`, `CHROMA_PATH`) at module scope in [src/ingest.py:21-22](src/ingest.py#L21-L22) and [src/core/rag_orchestrator.py:30-32](src/core/rag_orchestrator.py#L30-L32). The plan parameterizes those, threads a session-scoped triple `(bm25_index_path, collection_name, chroma_path)` through the request, and unions retrieval results when scope is "Both". Existing components (`HybridRetriever`, `BM25Search`, `VectorSearch`, `CrossEncoderReranker`, `RAGGenerator`, `CitationVerifier`) require no changes.
+The cached singleton orchestrator at [src/web/streamlit_app.py:39](src/web/streamlit_app.py#L39) stays — it reads its session inputs per-`QueryRequest`, not at construction.
+## Phase 6.1 — Backend session isolation foundation (~2-3 days)
+Ships independently. Streamlit UI continues to work as today. The new HTTP surface unblocks the React build.
+### Objective
+Land session-isolated ingestion/retrieval in the backend while keeping existing Streamlit behavior intact.
+### Scope
+### Files to modify
+**[src/ingest.py](src/ingest.py)** — make `ingest()` accept overrides
+- Change signature at [L37](src/ingest.py#L37) to:
+  `def ingest(docs_path, *, bm25_index_path=BM25_INDEX_PATH, collection_name=COLLECTION_NAME, chroma_path="data/embeddings/chroma", processor=None) -> tuple[BM25Index, VectorDatabase]`
+- Replace hard-coded uses at [L54](src/ingest.py#L54), [L55](src/ingest.py#L55), [L91](src/ingest.py#L91), [L97-98](src/ingest.py#L97-L98) with the kwargs.
+- When `processor is None`, build one as today; the parameter exists so the caller passes a fresh `DocumentProcessor` per session (its `_seen_hashes` is per-instance and would otherwise leak dedup state across sessions).
+- Module constants stay as defaults — CLI usage unchanged.
+**[src/web/ingestion_service.py](src/web/ingestion_service.py)** — caps + session-target passthrough
+- Add module constants (env-overridable):
+  - `MAX_FILES_PER_SESSION = int(os.getenv("DOC_DEMO_MAX_FILES", "3"))`
+  - `MAX_FILE_BYTES = int(os.getenv("DOC_DEMO_MAX_FILE_MB", "3")) * 1024 * 1024`
+  - `MAX_SESSION_BYTES = int(os.getenv("DOC_DEMO_MAX_SESSION_MB", "8")) * 1024 * 1024`
+- Extend `save_uploaded_files()` at [L29](src/web/ingestion_service.py#L29) to accept `existing_bytes: int = 0, max_files: int | None = None, max_file_bytes: int | None = None, max_session_bytes: int | None = None`. Reject with `IngestFileResult(status="rejected", message=...)` for: oversize file, file count cap, session disk cap.
+- Add a magic-bytes sanity check (e.g., `.pdf` must start with `%PDF`, `.docx` must start with `PK\x03\x04`); reject `type_mismatch` otherwise.
+- Extend `run_ingest()` at [L50](src/web/ingestion_service.py#L50) to accept `bm25_index_path: str | None = None, collection_name: str | None = None, chroma_path: str | None = None` and forward to `ingest(...)`.
+**`src/web/session_corpus.py`** (new — only new module)
+```
+SESSION_ROOT = Path(os.getenv("DOC_DEMO_SESSION_ROOT", "/tmp/doc-ingest-sessions"))
+SESSION_TTL_SECONDS = int(os.getenv("DOC_DEMO_SESSION_TTL", "1800"))
+@dataclass
+class SessionCorpus:
+    session_id: str
+    upload_dir: Path
+    chroma_path: Path
+    bm25_index_path: Path
+    collection_name: str   # f"sess_{session_id}" — Chroma-safe
+    created_at: float
+def new_session_id() -> str        # uuid4().hex[:12]
+def get_or_create(sid: str) -> SessionCorpus
+def touch(sid: str) -> None        # bump .touched mtime, refresh TTL
+def total_bytes(s: SessionCorpus) -> int
+def list_active_sessions() -> list[SessionCorpus]
+def delete_session(sid: str) -> None
+def janitor_sweep(now: float | None = None) -> int
+```
+Layout per session: `${SESSION_ROOT}/<sid>/{uploads/, chroma/, bm25_index.json, .touched}`. `get_or_create()` must be idempotent and safe under concurrent calls (e.g., overlapping requests or Streamlit reruns).
+**[src/core/rag_orchestrator.py](src/core/rag_orchestrator.py)** — session-aware retrieval
+- Extend `QueryRequest` at [L36](src/core/rag_orchestrator.py#L36) with:
+  - `session_bm25_index_path: Optional[str] = None`
+  - `session_collection_name: Optional[str] = None`
+  - `session_chroma_path: Optional[str] = None`
+  - `knowledge_scope: str = "global"`  # `"global" | "session" | "both"`
+- `_load_components()` at [L87](src/core/rag_orchestrator.py#L87): when scope is `session` or `both`, also load a second `(BM25Index, VectorDatabase)` from session paths. If session BM25 file is missing/empty (user hasn't uploaded yet), fall back gracefully — log warning and demote scope to `global`.
+- `_retrieve()` at [L92](src/core/rag_orchestrator.py#L92): when `scope == "session"` run hybrid against the session pair only; when `scope == "both"` run two `HybridRetriever.retrieve()` calls and concatenate, deduping by `id`. The reranker at [L165](src/core/rag_orchestrator.py#L165) is the final arbiter — no change to fusion/rerank logic.
+- Cache-key fingerprint at [L126](src/core/rag_orchestrator.py#L126) must include scope and session triple so global cache hits don't leak across users:
+  `corpus_fingerprint=f"{COLLECTION_NAME}:{BM25_INDEX_PATH}|{req.knowledge_scope}|{req.session_collection_name or '-'}:{req.session_bm25_index_path or '-'}"`
+**[src/api/main.py](src/api/main.py)** — new endpoints + CORS + janitor
+- Add CORS middleware (allow the React origin: localhost dev port + the deployed origin from env `DOC_FRONTEND_ORIGINS`).
+- New endpoints:
+  - `POST /sessions` → `{session_id, expires_at}`. Mints id, calls `session_corpus.get_or_create()`. Sets `X-Demo-Session-Id` response header so the React app can also use it without cookies.
+  - `GET /sessions/{sid}` → `{session_id, files: [...], total_bytes, max_session_bytes, max_files, expires_at}`. Useful for the "My documents" panel.
+  - `POST /sessions/{sid}/documents` → multipart upload. Calls `save_uploaded_files(session.upload_dir, files, existing_bytes=total_bytes(session), ...)`, then `run_ingest(session.upload_dir, bm25_index_path=session.bm25_index_path, collection_name=session.collection_name, chroma_path=str(session.chroma_path))`. Touches the session.
+  - `DELETE /sessions/{sid}` → `session_corpus.delete_session(sid)` then mints a new id.
+- Extend `POST /query` at [L155](src/api/main.py#L155): accept optional `session_id`, `knowledge_scope`. If provided, look up the session, touch it, and pass `session_*` paths into `QueryRequest`. Reject `session`/`both` scopes when session has no uploads (return 409 with a hint to upload first).
+- Demo-mode guard at [L112](src/api/main.py#L112): the new session endpoints are **only** mounted when `DOC_PROFILE=demo` and `DOC_DEMO_UPLOADS=1`. Outside demo mode, ingestion stays through the existing batch path.
+- Per-IP upload rate limit: reuse the existing limiter at [L77-99](src/api/main.py#L77-L99) on `POST /sessions/{sid}/documents`.
+- FastAPI `lifespan`: start a background `asyncio` task that runs `session_corpus.janitor_sweep()` every 60 s; stop it on shutdown. Replaces the on-rerun best-effort sweep entirely.
+**[spaces/app.py](spaces/app.py)** — opt the deployed demo into Phase 6.1
+- After [L34](spaces/app.py#L34) add the env defaults:
+  - `DOC_DEMO_UPLOADS=1`
+  - `DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions`
+  - `DOC_DEMO_MAX_FILES=3`, `DOC_DEMO_MAX_FILE_MB=3`, `DOC_DEMO_MAX_SESSION_MB=8`, `DOC_DEMO_SESSION_TTL=1800`
+- Session data lives under `/tmp`, on the HF Spaces ephemeral disk that is wiped on container restart, so session uploads never touch the persisted `data/` directory.
+### Functions/classes to reuse unchanged
+- `save_uploaded_files()` at [src/web/ingestion_service.py:29](src/web/ingestion_service.py#L29) — existing save logic preserved; the only additions are the new optional parameters and the cap / magic-byte rejections described above
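+- `RAGOrchestrator` class itself at [src/core/rag_orchestrator.py:64](src/core/rag_orchestrator.py#L64) — public interface unchanged; `QueryRequest` grows, and `_load_components()` / `_retrieve()` gain the session-aware branches described above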
+- `HybridRetriever`, `BM25Search`, `VectorSearch` ([src/core/](src/core/)) — second instance per request when scope demands; otherwise unchanged
+- `VectorDatabase` at [src/utils/database.py:29](src/utils/database.py#L29) — already accepts `chroma_path`; just construct a second one for sessions
+- `BM25Index.save` / `BM25Index.load` at [src/core/bm25_index.py](src/core/bm25_index.py) — already path-parameterized
+- `CrossEncoderReranker`, `RAGGenerator`, `CitationVerifier`, `ResponseCache` — unchanged
+### Tests (Phase 6.1)
+Add under [tests/unit/](tests/unit/) and [tests/integration/](tests/integration/):
+- `tests/unit/test_session_corpus.py` — id format, idempotent `get_or_create`, janitor TTL eviction, `delete_session` on missing dir is a no-op, concurrent `get_or_create` is safe.
+- Extend `tests/unit/test_ingestion_service.py` (or create) — caps enforced (oversize, count, session disk), magic-byte mismatch rejected, override kwargs forwarded.
+- `tests/unit/test_ingest_overrides.py` — `ingest(tmp, bm25_index_path=..., collection_name="sess_x", chroma_path=...)` writes to overrides and not defaults; default-arg call still hits the global paths.
+- Extend `tests/unit/test_streamlit_demo_routing.py` — `knowledge_scope="session"` carries session paths only; `"both"` carries both; cache key changes when session paths change.
+- `tests/integration/test_session_isolation.py` — bootstrap a tiny global corpus; mint sessions A and B; ingest different fixtures into each; query A scope=`session` returns only A's chunks; query A scope=`both` returns A+global, never B's; janitor with mocked clock past TTL deletes the session dirs.
+- `tests/integration/test_global_corpus_pristine.py` — sha256 the global BM25 + Chroma store before/after multiple session ingests; assert unchanged.
+- `tests/integration/test_session_api.py` — exercise `POST /sessions`, `POST /sessions/{id}/documents`, `GET /sessions/{id}`, `DELETE /sessions/{id}` and `POST /query` with session_id end-to-end via FastAPI `TestClient`.
+### Verification (Phase 6.1, local)
+```
+# Unit + integration
+pytest tests/unit/test_session_corpus.py tests/unit/test_ingestion_service.py \
+       tests/unit/test_ingest_overrides.py tests/unit/test_streamlit_demo_routing.py \
+       tests/integration/test_session_isolation.py \
+       tests/integration/test_global_corpus_pristine.py \
+       tests/integration/test_session_api.py -v
+# Boot demo-mode API + Streamlit (Streamlit still works)
+DOC_PROFILE=demo DOC_EMBEDDING_PROVIDER=sentence_transformers \
+DOC_DEMO_UPLOADS=1 DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions \
+uvicorn src.api.main:app --host 127.0.0.1 --port 8000 &
+DOC_PROFILE=demo streamlit run src/web/streamlit_app.py
+# Curl smoke the new API
+curl -X POST http://127.0.0.1:8000/sessions
+# → {"session_id":"...","expires_at":...}
+curl -X POST -F "files=@./README.md" http://127.0.0.1:8000/sessions/<sid>/documents
+curl -X POST http://127.0.0.1:8000/query \
+  -H "Content-Type: application/json" \
+  -d '{"query":"summarize my doc","session_id":"<sid>","knowledge_scope":"session"}'
+# Confirm shared corpus untouched
+sha256sum data/embeddings/bm25_index.json   # before/after — identical
+```
+### Phase 6.1 handoff (exit criteria)
+- Backend supports isolated session corpus lifecycle (`create/get/upload/query/delete`) without cross-session leakage.
+- `knowledge_scope` (`global|session|both`) works end-to-end and cache keys are session-safe.
+- Guardrails are enforced server-side (file caps, MIME sanity checks, rate limiting, TTL janitor).
+- Existing Streamlit demo still runs in demo profile (no regression to current user flow).
+- All Phase 6.1 tests pass locally and in CI.
+### Transition to Phase 6.2 (entry criteria)
+- API contracts are stable for frontend consumption (`/sessions`, `/sessions/{id}`, `/sessions/{id}/documents`, `/query` with session fields).
+- OpenAPI spec reflects new request/response shapes.
+- Demo env defaults for session uploads are available.
+## Phase 6.2 — React MVP front-end over stable API (~5-7 days)
+Built in a new top-level `frontend/` directory; FastAPI keeps running unchanged. No HF cutover yet — develop locally against `http://127.0.0.1:8000`.
+### Objective
+Ship a usable React demo UI that consumes Phase 6.1 APIs and validates isolated user-upload experience.
+### Scope
+### Stack
+- **Vite + React 18 + TypeScript** (lean SPA, no SSR needed for a demo).
+- **Tailwind CSS + shadcn/ui** (Radix-based primitives — drop-in card, tabs, radio-group, file-uploader, progress, toast).
+- **TanStack Query** for server state (session, file list, query results) — gives caching, retries, and dedup for free.
+- **Zustand** (or React Context) for the session-id slice that needs to outlive a route change.
+- **Typed API client** generated from FastAPI's OpenAPI schema via `openapi-typescript` so the FE stays type-safe against the BE contract.
+- **Streaming**: consume `POST /query/stream` via the `EventSource`-style `fetch` + `ReadableStream` pattern (the browser's native `EventSource` API can only issue GET requests, so it cannot be used for a POST endpoint).
+### Component layout
+```
+frontend/
+├─ index.html
+├─ vite.config.ts
+├─ tailwind.config.ts
+├─ src/
+│  ├─ main.tsx
+│  ├─ App.tsx                      # Tabs: Query | My documents
+│  ├─ api/
+│  │  ├─ client.ts                 # fetch wrapper, attaches X-Demo-Session-Id
+│  │  └─ generated.ts              # openapi-typescript output
+│  ├─ session/
+│  │  ├─ SessionProvider.tsx       # mints session via POST /sessions on first load
+│  │  └─ useSession.ts
+│  ├─ tabs/
+│  │  ├─ QueryTab.tsx              # sample prompts, scope toggle, run
+│  │  └─ DocumentsTab.tsx          # drop-zone, file list, caps meter, reset
+│  ├─ components/
+│  │  ├─ SamplePromptChips.tsx     # mirrors _DEMO_QUESTIONS
+│  │  ├─ ScopeToggle.tsx           # 3-way radio, disables Mine/Both until upload
+│  │  ├─ AnswerPanel.tsx           # answer + truthfulness badge
+│  │  ├─ CitationsList.tsx         # tagged [global]/[yours]
+│  │  ├─ RetrievedChunks.tsx
+│  │  └─ Uploader.tsx              # drag-drop, per-file status
+│  └─ lib/streamQuery.ts           # SSE-over-POST helper
+```
+### UX wireframe
+```
+┌─────────────────────────────────────────────────────────────────┐
+│ Doc Ingestion Assistant            session …a91c · 28:14 left   │
+│ ⓘ Hosted demo. Your uploads stay in this session for 30 min,    │
+│   are not added to the shared corpus, and aren't visible to     │
+│   anyone else.                                                  │
+├─ [ Query ] [ My documents ] ─────────────────────────────────── │
+│                                                                 │
+│ Query tab:                                                      │
+│   Try a sample:  [What is RAG?] [What is RRF?] [BM25 vs vec…]   │
+│                                                                 │
+│   Knowledge scope:                                              │
+│     ◉ Global sample corpus                                      │
+│     ○ My uploads only         (disabled until upload)           │
+│     ○ Both                                                      │
+│                                                                 │
+│   Provider [ ▼ ]   Model [ ▼ ]                                  │
+│   ┌───────────────────────────────────────────────┐             │
+│   │ Ask a question…                               │             │
+│   └───────────────────────────────────────────────┘             │
+│   [ Run ]                                                       │
+│                                                                 │
+│   ── Answer ──  🟢 Truthfulness 0.89                            │
+│   …answer text streaming in…                                    │
+│                                                                 │
+│   Citations:                                                    │
+│     [yours] my-resume.pdf · chunk 2                             │
+│     [global] phase2_hybrid_retrieval.md · chunk 5               │
+│                                                                 │
+│ My documents tab:                                               │
+│   Disk used: 1.2 / 8.0 MB     Files: 2 / 3                      │
+│   ⓘ ≤ 3 files · ≤ 3 MB each · ≤ 8 MB total                      │
+│   ┌───────── drop files here ─────────┐                         │
+│   └──────────────────────────────────┘                          │
+│   • my-resume.pdf      indexed                                  │
+│   • report.txt         indexed                                  │
+│   [ Clear my session ]                                          │
+└─────────────────────────────────────────────────────────────────┘
+```
+Behavior detail:
+- On first mount, `SessionProvider` calls `POST /sessions` and stashes the id in localStorage (so a refresh keeps the same session until TTL).
+- The scope toggle disables Mine/Both until `GET /sessions/{id}` reports ≥ 1 indexed file.
+- Sample prompts always target Global scope by default (clicking a chip sets scope=Global and fills the textarea).
+- The streaming answer uses `lib/streamQuery.ts` to read tokens off `/query/stream`; falls back to non-streaming if SSE fails.
+- "Clear my session" calls `DELETE /sessions/{id}` then mints a new one.
+### Tests (Phase 6.2)
+- `frontend/src/**/*.test.tsx` with **Vitest + React Testing Library**:
+  - SessionProvider mints a session on first mount and stores it.
+  - ScopeToggle disables Mine/Both when no uploads, enables after upload.
+  - Uploader respects 3-file cap client-side and shows server rejection toasts.
+  - QueryTab renders streamed tokens incrementally.
+- **Playwright** smoke (`frontend/e2e/`): full happy-path — load → upload one file → switch to Mine → ask a question → see the file's citation.
+- **Playwright** negative path: no uploads keeps Mine/Both disabled; rejected uploads surface clear cap/type errors.
+### Verification (Phase 6.2, local)
+```
+# Backend
+DOC_PROFILE=demo DOC_DEMO_UPLOADS=1 \
+  uvicorn src.api.main:app --host 127.0.0.1 --port 8000
+# Frontend
+cd frontend && npm install && npm run dev    # http://localhost:5173
+# Tests (unit + E2E)
+cd frontend && npm run test       # vitest
+npm run test:e2e                  # playwright
+```
+### Phase 6.2 handoff (exit criteria)
+- React app provides Query + My Documents tabs, scope toggle, streaming/fallback answer flow, and session reset.
+- UI clearly communicates upload limits and session TTL.
+- Frontend unit tests and e2e tests pass locally and in CI.
+- UX supports clear citation provenance (`[global]` vs `[yours]`) for trust validation.
+### Transition to Phase 6.3 (entry criteria)
+- Frontend builds reproducibly (`npm ci && npm run build`) and can be served as static assets.
+- API CORS config includes intended frontend origins.
+- No unresolved API/frontend contract mismatches remain.
+## Phase 6.3 — Single-container deploy & HF Spaces cutover (~2 days)
+The current HF Space uses the Streamlit SDK (`spaces/README.md`). Switch to the Docker SDK so we ship one container with FastAPI + the built React SPA.
+### Objective
+Deploy one container (FastAPI + built React) to simplify ops and align HF delivery with the new UI.
+### Scope
+### Files to modify
+- **[docker/Dockerfile](docker/Dockerfile)** — multi-stage:
+  - Stage 1 (`node:20-alpine`): `npm ci && npm run build` → `frontend/dist`.
+  - Stage 2 (existing Python image): `COPY --from=stage1 /app/frontend/dist /app/static`.
+  - Final `CMD` runs uvicorn only — Streamlit is no longer in the deployed image.
+- **[src/api/main.py](src/api/main.py)** — when the static dir exists, mount it: `app.mount("/", StaticFiles(directory="static", html=True), name="ui")`. Move existing API routes under `/api` prefix (or use `app.mount` ordering so SPA fallback kicks in only on unknown paths). Keep `/health`, `/metrics`, `/query`, `/query/stream` reachable.
+- **[spaces/README.md](spaces/README.md)** — change frontmatter:
+  ```yaml
+  sdk: docker
+  app_port: 8000
+  ```
+  Drop `app_file: spaces/app.py`.
+- **[spaces/app.py](spaces/app.py)** — repurpose as a tiny launcher that just sets the demo env vars and execs uvicorn (or remove entirely if env defaults move into the Dockerfile).
+- **[.github/workflows/sync-to-spaces.yml](.github/workflows/sync-to-spaces.yml)** — extend to run `npm ci && npm run build` before pushing, OR rely on HF's Docker build (preferred — keeps CI fast).
+- **[.github/workflows/ci.yml](.github/workflows/ci.yml)** — add a `frontend` job: `npm ci`, `npm run lint`, `npm run test`, `npm run build`. Add an `e2e` job that boots the API and runs Playwright.
+Streamlit code stays in `src/web/streamlit_app.py` behind an env flag during the cutover so we can roll back to the previous Space SDK by reverting `spaces/README.md`.
+### Verification (Phase 6.3)
+```
+# Build and run the unified container locally
+docker build -f docker/Dockerfile -t doc-ingest:demo .
+docker run --rm -p 8000:8000 \
+  -e DOC_PROFILE=demo -e DOC_DEMO_UPLOADS=1 \
+  -e DOC_EMBEDDING_PROVIDER=sentence_transformers \
+  doc-ingest:demo
+open http://127.0.0.1:8000
+# Push branch → HF Space rebuilds via Docker SDK; smoke-test the live URL.
+```
+### Phase 6.3 handoff (exit criteria)
+- Unified container runs locally and in HF Spaces with expected routes and SPA fallback behavior.
+- Core API routes (`/health`, `/metrics`, `/query`, `/query/stream`) remain reachable and validated.
+- Demo smoke tests pass against deployed environment.
+- Rollback procedure to prior Space setup is documented and tested.
+### Transition to Phase 6.4 (entry criteria)
+- React demo has soaked in production-like traffic for at least one week.
+- No unresolved severity-1/2 issues tied to the new deployment path.
+- Team confirms Streamlit rollback is no longer required.
+## Phase 6.4 — Decommission Streamlit (optional, after 6.3 soaks)
+Once the React demo has been live for a week without regressions:
+- Delete [src/web/streamlit_app.py](src/web/streamlit_app.py).
+- Remove `streamlit` from [requirements/base.txt](requirements/base.txt).
+- Drop the Streamlit container from [docker/docker-compose.yml](docker/docker-compose.yml).
+- Update [README.md](README.md) screenshots and quickstart.
+- Delete `tests/unit/test_streamlit_demo_routing.py`.
+Keep `_DEMO_QUESTIONS` (move into a small JSON the API serves at `GET /api/sample-prompts` so the React FE stays in sync).
+### Phase 6.4 handoff (exit criteria)
+- Streamlit runtime, dependencies, and tests are removed cleanly.
+- Documentation and quickstart reflect the React + FastAPI deployment only.
+- Sample prompts are served from API/shared source of truth.
+### Transition to next program increment
+- Phase 6 is complete when 6.1-6.4 exit criteria are satisfied (with 6.4 optional per release decision).
+- Any deferred improvements become backlog items for Phase 7 (e.g., hard TTL cap, query concurrency limiter, enhanced abuse controls).
+## Caps & abuse guardrails (locked-in defaults)
+| Guard | Default | Enforced where | Failure mode |
+|---|---|---|---|
+| Per-file size cap | 3 MB | `save_uploaded_files()` | `rejected: oversize` |
+| File count cap | 3 / session | `save_uploaded_files()` | `rejected: file_count_cap` |
+| Total session disk cap | 8 MB | `save_uploaded_files()` | `rejected: session_disk_cap` |
+| Extension allowlist | `.pdf .docx .txt .md .html` | already at `_SUPPORTED_EXTS` ([L15](src/web/ingestion_service.py#L15)) | `failed: unsupported` |
+| MIME magic | header sniff | new helper in `save_uploaded_files()` | `rejected: type_mismatch` |
+| Per-IP upload rate-limit | reuse [src/api/main.py:77-99](src/api/main.py#L77-L99) limiter | `POST /sessions/{sid}/documents` | 429 |
+| Janitor disk ceiling | 1 GB total under `SESSION_ROOT` (oldest sessions evicted first) | `janitor_sweep()` | oldest sessions dropped |
+| Idle TTL | 30 min, refreshed on every query/upload | `.touched` mtime + janitor | session purged |
+All caps overridable via env (`DOC_DEMO_*`) so we can tune on HF without code changes.
+## Phase execution re-review (end-to-end)
+Execution order is intentionally strict: **6.1 -> 6.2 -> 6.3 -> 6.4 (optional)**.
+- **6.1 is the architectural base**: session isolation, scoped retrieval, and backend guardrails must be correct before any UI investment.
+- **6.2 depends on 6.1 contracts**: React work starts only after session APIs and `knowledge_scope` behavior are stable and test-covered.
+- **6.3 depends on 6.2 build maturity**: container cutover happens only after frontend build/test reliability and CORS/origin alignment are in place.
+- **6.4 is a stabilization cleanup**: Streamlit removal is deferred until post-soak confidence to protect rollback safety.
+Readiness checklist before starting each phase:
+- Previous phase exit criteria are met and documented.
+- Phase-specific test suite passes locally and in CI.
+- No open blocker in cross-phase risks that invalidates next-phase assumptions.
+- Handoff artifacts are available (API contract, env defaults, deployment notes, rollback notes as applicable).
+## Cross-phase risks & open questions
+1. **HF Space SDK switch (Streamlit → Docker)** is a one-way door for the running Space. Do the cutover on a fresh Space first (e.g., `…-demo-v2`), validate, then point the public URL at it.
+2. **Reranker memory under concurrency** — cross-encoder is the dominant cost (~400 MB) and serializes on CPU. More visitors uploading doesn't worsen retrieval contention, but a concurrency limiter on `/query` should be added if HF traffic grows (deferred to the Phase 7 backlog).
+3. **Cache-key fingerprint correctness** — the change at [rag_orchestrator.py:126](src/core/rag_orchestrator.py#L126) is load-bearing. A test must assert that two sessions issuing identical query text get distinct cache keys.
+4. **`DocumentProcessor._seen_hashes` per-instance** ([src/core/document_processor.py:49](src/core/document_processor.py#L49)) — passing a fresh processor per session ingest is required, otherwise a session can silently skip files matching another session's hashes.
+5. **TTL refresh on read vs write** — refreshing on every query keeps active users' uploads alive indefinitely; consider an absolute hard cap (4 h) as a Phase 7 backlog item if abuse appears.
+6. **SSE-over-POST quirks** — some proxies break long-lived POST streams. The React client falls back to non-streaming on first failure.
+7. **CORS scope** — set `DOC_FRONTEND_ORIGINS` tightly (no `"*"`) once the Space URL is final.
+8. **Browser refresh** — localStorage retains `session_id`; if backend has expired it, the FE catches a 404 from `GET /sessions/{id}` and re-mints transparently.
+9. **Citation labeling** — to display `[yours]` vs `[global]`, the merged `RetrievedResult.metadata` must carry the source collection. Cheapest: prefix chunk `id`s with `sess_<sid>__` for session uploads (already implicit since the collection name differs); the FE checks the prefix.
+10. **Streamlit coexistence during transition** — keep the Streamlit page reachable via a hidden `/legacy` route until Phase 6.4 to ease rollback.
+## Critical files by phase
+- **Phase 6.1**
+  - [src/ingest.py](src/ingest.py)
+  - [src/web/ingestion_service.py](src/web/ingestion_service.py)
+  - `src/web/session_corpus.py` (new)
+  - [src/core/rag_orchestrator.py](src/core/rag_orchestrator.py)
+  - [src/api/main.py](src/api/main.py)
+  - [spaces/app.py](spaces/app.py)
+- **Phase 6.2**
+  - `frontend/` (new tree)
+- **Phase 6.3**
+  - [docker/Dockerfile](docker/Dockerfile)
+  - [src/api/main.py](src/api/main.py)
+  - [spaces/README.md](spaces/README.md)
+  - [spaces/app.py](spaces/app.py)
+  - [.github/workflows/ci.yml](.github/workflows/ci.yml)
+  - [.github/workflows/sync-to-spaces.yml](.github/workflows/sync-to-spaces.yml)
+- **Phase 6.4**
+  - [src/web/streamlit_app.py](src/web/streamlit_app.py)
+  - [requirements/base.txt](requirements/base.txt)
+  - [docker/docker-compose.yml](docker/docker-compose.yml)
+  - [README.md](README.md)
+  - `tests/unit/test_streamlit_demo_routing.py`

Docs/Phase6.1-Backend-Session-Isolation-Plan.md ADDED Viewed

	@@ -0,0 +1,125 @@

+# Phase 6.1 Plan: Backend Session Isolation Foundation
+Source of truth: `Docs/Phase6-RefactorDemo_React.md` (this file is an execution slice for iterative delivery).
+## Objective
+Land session-isolated ingestion/retrieval in the backend while keeping existing Streamlit behavior intact.
+## Scope
+Ships independently. Streamlit UI continues to work as today. The new HTTP surface unblocks the React build.
+## Files to modify
+**`src/ingest.py`** — make `ingest()` accept overrides
+- Change signature to:
+  `def ingest(docs_path, *, bm25_index_path=BM25_INDEX_PATH, collection_name=COLLECTION_NAME, chroma_path="data/embeddings/chroma", processor=None) -> tuple[BM25Index, VectorDatabase]`
+- Replace hard-coded uses with kwargs.
+- Keep module constants as defaults so CLI remains unchanged.
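+- Use the caller-supplied `DocumentProcessor` when one is passed; session ingests must pass a fresh instance per session so `_seen_hashes` from one session cannot cause another session's files to be skipped.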
+**`src/web/ingestion_service.py`** — caps + session-target passthrough
+- Add env-overridable caps:
+  - `DOC_DEMO_MAX_FILES` (default `3`)
+  - `DOC_DEMO_MAX_FILE_MB` (default `3`)
+  - `DOC_DEMO_MAX_SESSION_MB` (default `8`)
+- Extend `save_uploaded_files()` to enforce:
+  - per-file cap
+  - file count cap
+  - total session cap
+- Add magic-bytes check (`.pdf`, `.docx`) and reject type mismatch.
+- Extend `run_ingest()` to pass `bm25_index_path`, `collection_name`, `chroma_path` overrides.
+**`src/web/session_corpus.py`** (new)
+- Add `SessionCorpus` dataclass and helpers:
+  - `new_session_id`, `get_or_create`, `touch`, `total_bytes`, `list_active_sessions`, `delete_session`, `janitor_sweep`
+- Session layout:
+  - `${SESSION_ROOT}/<sid>/{uploads/, chroma/, bm25_index.json, .touched}`
+- Defaults:
+  - `DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions`
+  - `DOC_DEMO_SESSION_TTL=1800`
+**`src/core/rag_orchestrator.py`** — session-aware retrieval
+- Extend `QueryRequest`:
+  - `session_bm25_index_path`
+  - `session_collection_name`
+  - `session_chroma_path`
+  - `knowledge_scope` (`global|session|both`)
+- `session` scope uses only session corpus.
+- `both` scope merges global + session results and dedups by id.
+- Cache fingerprint must include scope + session corpus identifiers.
+**`src/api/main.py`** — session endpoints + CORS + janitor
+- Add CORS using `DOC_FRONTEND_ORIGINS`.
+- Add endpoints:
+  - `POST /sessions`
+  - `GET /sessions/{sid}`
+  - `POST /sessions/{sid}/documents`
+  - `DELETE /sessions/{sid}`
+- Extend `POST /query` with optional `session_id` and `knowledge_scope`.
+- Reject `knowledge_scope` values `session` and `both` if the session has no uploads (409 with hint).
+- Mount the session endpoints only in demo mode:
+  - `DOC_PROFILE=demo`
+  - `DOC_DEMO_UPLOADS=1`
+- Reuse upload rate limiter for `POST /sessions/{sid}/documents`.
+- Add lifespan janitor task (`session_corpus.janitor_sweep()` every 60s).
+**`spaces/app.py`** — enable demo defaults for this phase
+- Set:
+  - `DOC_DEMO_UPLOADS=1`
+  - `DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions`
+  - `DOC_DEMO_MAX_FILES=3`
+  - `DOC_DEMO_MAX_FILE_MB=3`
+  - `DOC_DEMO_MAX_SESSION_MB=8`
+  - `DOC_DEMO_SESSION_TTL=1800`
+## Tests
+- `tests/unit/test_session_corpus.py`
+- `tests/unit/test_ingestion_service.py` (extend or create)
+- `tests/unit/test_ingest_overrides.py`
+- `tests/unit/test_streamlit_demo_routing.py` (extend)
+- `tests/integration/test_session_isolation.py`
+- `tests/integration/test_global_corpus_pristine.py`
+- `tests/integration/test_session_api.py`
+## Verification
+```bash
+pytest tests/unit/test_session_corpus.py tests/unit/test_ingestion_service.py \
+       tests/unit/test_ingest_overrides.py tests/unit/test_streamlit_demo_routing.py \
+       tests/integration/test_session_isolation.py \
+       tests/integration/test_global_corpus_pristine.py \
+       tests/integration/test_session_api.py -v
+DOC_PROFILE=demo DOC_EMBEDDING_PROVIDER=sentence_transformers \
+DOC_DEMO_UPLOADS=1 DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions \
+uvicorn src.api.main:app --host 127.0.0.1 --port 8000 &
+DOC_PROFILE=demo streamlit run src/web/streamlit_app.py
+```
+API smoke:
+```bash
+curl -X POST http://127.0.0.1:8000/sessions
+curl -X POST -F "files=@./README.md" http://127.0.0.1:8000/sessions/<sid>/documents
+curl -X POST http://127.0.0.1:8000/query \
+  -H "Content-Type: application/json" \
+  -d '{"query":"summarize my doc","session_id":"<sid>","knowledge_scope":"session"}'
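+# Global index hash must match the value recorded before the session upload (global corpus stays pristine).
+sha256sum data/embeddings/bm25_index.json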
+```
+## Handoff (Exit Criteria)
+- Backend supports isolated session lifecycle (`create/get/upload/query/delete`) with no cross-session leakage.
+- `knowledge_scope` works end-to-end and cache keys are session-safe.
+- Guardrails enforced server-side (caps, MIME checks, rate limiting, TTL janitor).
+- Streamlit demo still works in demo profile.
+- Phase 6.1 tests pass locally and in CI.
+## Transition to Phase 6.2
+- API contracts are stable for frontend usage.
+- OpenAPI includes new request/response shapes.
+- Demo env defaults for session uploads are confirmed.

Docs/Phase6.2-React-MVP-Plan.md ADDED Viewed

	@@ -0,0 +1,83 @@

+# Phase 6.2 Plan: React MVP Front-end Over Stable API
+Source of truth: `Docs/Phase6-RefactorDemo_React.md` (this file is an execution slice for iterative delivery).
+Depends on: `Docs/Phase6.1-Backend-Session-Isolation-Plan.md`
+## Objective
+Ship a usable React demo UI that consumes Phase 6.1 APIs and validates isolated user-upload experience.
+## Scope
+Build in top-level `frontend/`; FastAPI backend remains unchanged. No HF cutover yet; develop locally against `http://127.0.0.1:8000`.
+## Stack
+- Vite + React 18 + TypeScript
+- Tailwind CSS + shadcn/ui
+- TanStack Query
+- Zustand (or Context) for session id state
+- `openapi-typescript` generated API typings
+- Streaming via `POST /query/stream` using `fetch` + `ReadableStream`
+## Planned frontend layout
+```text
+frontend/
+├─ src/App.tsx                    # Query | My documents
+├─ src/api/client.ts              # fetch wrapper + session header
+├─ src/api/generated.ts           # OpenAPI types
+├─ src/session/SessionProvider.tsx
+├─ src/tabs/QueryTab.tsx
+├─ src/tabs/DocumentsTab.tsx
+├─ src/components/ScopeToggle.tsx
+├─ src/components/Uploader.tsx
+└─ src/lib/streamQuery.ts
+```
+## Required behavior
+- On first mount, mint session via `POST /sessions`; store id in localStorage.
+- Disable Mine/Both scope until session has at least one indexed file.
+- Sample prompts default to Global scope.
+- Stream response tokens from `/query/stream`; fallback to non-streaming on failure.
+- "Clear my session" triggers `DELETE /sessions/{id}` and remints a session id.
+- Surface citation provenance as `[global]` and `[yours]`.
+## Tests
+- Vitest + RTL (`frontend/src/**/*.test.tsx`):
+  - Session mint/persist behavior
+  - Scope toggle enable/disable states
+  - Uploader cap and server-rejection UI
+  - Incremental stream rendering
+- Playwright smoke:
+  - load -> upload -> scope Mine -> query -> citation from uploaded file
+- Playwright negative:
+  - no upload keeps Mine/Both disabled
+  - rejected upload errors are clearly shown
+## Verification
+```bash
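+# Run each command from the repo root in its own terminal (uvicorn and `npm run dev` stay in the foreground).
+DOC_PROFILE=demo DOC_DEMO_UPLOADS=1 \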
+  uvicorn src.api.main:app --host 127.0.0.1 --port 8000
+cd frontend && npm install && npm run dev
+cd frontend && npm run test
+cd frontend && npm run test:e2e
+```
+## Handoff (Exit Criteria)
+- Query + My Documents tabs are complete with session reset flow.
+- Scope toggle, upload caps messaging, and TTL messaging are visible and correct.
+- Streaming and fallback response paths are reliable.
+- Unit + e2e tests pass locally and in CI.
+- Citation source labeling enables user trust verification.
+## Transition to Phase 6.3
+- Frontend builds reproducibly (`npm ci && npm run build`).
+- API CORS includes intended frontend origins.
+- No unresolved frontend/backend contract mismatches.

Docs/Phase6.3-Container-Cutover-Implementation-Spec.md ADDED Viewed

	@@ -0,0 +1,302 @@

+# Phase 6.3 Implementation Spec: Single-Container Deploy and HF Spaces Cutover
+Source plan: `Docs/Phase6.3-Container-Cutover-Plan.md`
+Depends on: `Docs/Phase6.2-React-MVP-Plan.md`
+Next phase: `Docs/Phase6.4-Streamlit-Decommission-Plan.md`
+## Objective
+Ship the React MVP and FastAPI API from one Docker container, then cut Hugging Face Spaces from the Streamlit SDK runtime to the Docker SDK runtime.
+The deployed container must:
+- Serve the built React SPA at `/`.
+- Keep API endpoints reachable with their current contracts.
+- Preserve the Streamlit rollback path until Phase 6.4.
+- Continue to support demo session uploads, scoped retrieval, citations, and health checks.
+## Current State
+- `frontend/` already has npm scripts for `lint`, `typecheck`, `test`, `test:e2e`, and `build`.
+- `.github/workflows/ci.yml` already contains a frontend job, but e2e execution should be reviewed because the Playwright config currently starts only the Vite dev server.
+- `docker/Dockerfile` runs FastAPI on port `8000` and still exposes `8501`.
+- `spaces/README.md` still declares `sdk: streamlit`, `sdk_version`, and `app_file: spaces/app.py`.
+- `spaces/app.py` still starts FastAPI in a background thread and delegates to `src.web.streamlit_app`.
+- `src/api/main.py` does not yet mount the React build output as static UI.
+## Non-Goals
+- Do not delete `src/web/streamlit_app.py`.
+- Do not remove `streamlit` from `requirements/base.txt`.
+- Do not remove the Streamlit service from `docker/docker-compose.yml`.
+- Do not change the `/health`, `/metrics`, `/query`, or `/query/stream` API payload contracts.
+- Do not introduce a second production web server such as nginx unless a concrete deployment issue requires it.
+## Implementation Sequence
+### 1. Confirm Phase 6.2 Readiness
+Before editing deployment files, verify the React app is buildable and API-compatible:
+```bash
+cd frontend
+npm ci
+npm run lint
+npm run typecheck
+npm run test
+npm run build
+```
+Expected result:
+- `frontend/dist/` is produced reproducibly.
+- The frontend does not require a hard-coded `VITE_API_BASE_URL` when served from the same origin.
+- Playwright tests can run against a backend URL that represents the deployment shape.
+If Playwright currently depends on a separate dev API, update the e2e setup in this phase so CI boots FastAPI in demo mode before running the browser tests.
+### 2. Update `docker/Dockerfile`
+Convert the Dockerfile to a multi-stage build.
+Recommended structure:
+1. `frontend-builder` stage based on `node:20-alpine`.
+2. Python runtime stage based on the existing `python:3.11-slim`.
+3. Copy `frontend/package.json` and `frontend/package-lock.json` before copying all frontend files so npm dependencies cache properly.
+4. Run `npm ci` and `npm run build`.
+5. Copy `frontend/dist` into the runtime image at `/app/static`.
+6. Keep `PYTHONPATH=/app`, Hugging Face cache env vars, non-root `appuser`, and the existing FastAPI `CMD`.
+7. Remove `EXPOSE 8501` from the final runtime image.
+Best practices:
+- Use `npm ci`, not `npm install`, in image builds.
+- Keep dependency installation before source copies where practical for Docker cache reuse.
+- Keep the final container single-process: uvicorn only.
+- Keep Streamlit installed for rollback during Phase 6.3, but do not run it in the final container command.
+- Preserve the existing `/health` Docker healthcheck.
+Acceptance checks:
+- `docker build -f docker/Dockerfile -t doc-ingest:demo .` succeeds from repo root.
+- `docker run` starts uvicorn on port `8000`.
+- `/app/static/index.html` exists in the image.
+- No runtime process listens on port `8501` in the unified image.
+### 3. Mount React Static Assets in `src/api/main.py`
+Serve the SPA only after API routes have been registered.
+Implementation requirements:
+- Import `Path` and `StaticFiles`.
+- Resolve the static directory relative to the deployed app, for example `/app/static` in Docker and `static/` from the repo root locally.
+- Mount static assets only if the directory exists and contains `index.html`.
+- Register all API routes before mounting the catch-all UI route.
+- Ensure SPA fallback does not shadow `/health`, `/metrics`, `/query`, `/query/stream`, `/sessions`, `/observability/dashboard`, or OpenAPI docs.
+Recommended route strategy:
+- Keep existing routes at their current paths for backward compatibility.
+- Add optional `/api` aliases only if the frontend needs them, but do not remove current top-level API paths.
+- Mount `StaticFiles(directory=..., html=True)` at `/` after all current route decorators.
+Testing focus:
+- `GET /` returns the React app when `static/index.html` exists.
+- `GET /assets/...` serves bundled frontend assets.
+- Unknown browser routes fall back to the SPA.
+- API routes continue to return JSON and do not return `index.html`.
+- OpenAPI remains available at `/openapi.json`.
+### 4. Rework `spaces/app.py` for Docker Runtime
+In Docker SDK mode, HF Spaces will run the container command, so `spaces/app.py` no longer needs to launch Streamlit.
+Preferred implementation:
+- Keep `spaces/app.py` as a thin bootstrap utility only if it is still useful for local or HF startup.
+- Move demo env defaults into the Docker runtime or a small bootstrap function used by the Docker entrypoint.
+- Continue to set:
+  - `DOC_PROFILE=demo`
+  - `DOC_API_KEYS=demo-key`
+  - `DOC_EMBEDDING_PROVIDER=sentence_transformers`
+  - `DOC_DEMO_UPLOADS=1`
+  - `DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions`
+  - `DOC_DEMO_MAX_FILES=3`
+  - `DOC_DEMO_MAX_FILE_MB=3`
+  - `DOC_DEMO_MAX_SESSION_MB=8`
+  - `DOC_DEMO_SESSION_TTL=1800`
+- Ensure `spaces.bootstrap_demo.bootstrap_if_needed()` still runs before traffic depends on the sample corpus.
+Acceptable options:
+- Add an entrypoint script that runs bootstrap, then `exec uvicorn src.api.main:app --host 0.0.0.0 --port 8000 --workers 1`.
+- Or keep Docker `CMD` as uvicorn and move bootstrap into FastAPI lifespan startup, guarded so it only runs in demo profile.
+Best practice:
+- Prefer `exec` in shell entrypoints so uvicorn receives container signals directly.
+- Keep bootstrap idempotent.
+- Do not start background API threads in Docker SDK mode.
+### 5. Update `spaces/README.md`
+Change Hugging Face Spaces metadata:
+```yaml
+sdk: docker
+app_port: 8000
+```
+Remove:
+```yaml
+sdk_version: "1.37.0"
+app_file: spaces/app.py
+```
+Refresh user-facing text:
+- Describe the React + FastAPI demo.
+- Mention session uploads are enabled in demo mode with the configured limits.
+- Point users to the Space root URL for the UI (the container serves it on `app_port` `8000`).
+- Keep provider/API-key limitations accurate for HF.
+### 6. Review `.github/workflows/ci.yml`
+The frontend job already exists. Review and adjust it so it reflects the deployment contract:
+- Keep `npm ci`, `npm run lint`, `npm run typecheck`, `npm run test`, and `npm run build`.
+- Add a dedicated e2e job or e2e steps that start FastAPI in demo mode before Playwright runs.
+- Use `DOC_PROFILE=demo`, `DOC_DEMO_UPLOADS=1`, and `DOC_EMBEDDING_PROVIDER=sentence_transformers` for e2e.
+- Wait for `http://127.0.0.1:8000/health` before launching browser tests.
+- Keep Python and Node caches scoped to the correct lockfiles.
+Recommended e2e smoke:
+```bash
+PYTHONPATH=. DOC_PROFILE=demo DOC_DEMO_UPLOADS=1 \
+  DOC_EMBEDDING_PROVIDER=sentence_transformers \
+  uvicorn src.api.main:app --host 127.0.0.1 --port 8000 &
+until curl -fsS http://127.0.0.1:8000/health >/dev/null; do sleep 1; done
+cd frontend
+npm run test:e2e
+```
+### 7. Review `.github/workflows/sync-to-spaces.yml`
+Keep this workflow lean. Hugging Face should build the Docker image from the pushed repo.
+Implementation notes:
+- Update comments that still say HF uses `spaces/app.py` as the entry point.
+- Do not add a prebuild unless HF Docker builds are too slow or unreliable.
+- Keep the repo push behavior aligned with the current release process.
+- Ensure `spaces/README.md` is included in the pushed content so HF detects Docker SDK metadata.
+## Local Verification
+Run these checks before opening a PR:
+```bash
+PYTHONPATH=. python -m pytest tests/unit -q
+PYTHONPATH=. python -m pytest tests/integration -q
+cd frontend
+npm ci
+npm run lint
+npm run typecheck
+npm run test
+npm run build
+cd ..
+docker build -f docker/Dockerfile -t doc-ingest:demo .
+docker run --rm -p 8000:8000 \
+  -e DOC_PROFILE=demo \
+  -e DOC_DEMO_UPLOADS=1 \
+  -e DOC_EMBEDDING_PROVIDER=sentence_transformers \
+  doc-ingest:demo
+```
+Smoke checks while the container is running:
+```bash
+curl -fsS http://127.0.0.1:8000/health
+curl -fsS http://127.0.0.1:8000/metrics
+curl -fsS http://127.0.0.1:8000/openapi.json
+curl -fsS http://127.0.0.1:8000/ | head
+```
+Browser checks:
+- Open `http://127.0.0.1:8000`.
+- Confirm the React UI loads without Vite.
+- Create or reuse a demo session.
+- Upload one small supported file.
+- Query with `Mine` scope and confirm citation provenance.
+- Query with `Global` scope and confirm existing sample corpus still works.
+- Refresh the browser and confirm the session resumes or remints cleanly.
+## Hugging Face Spaces Verification
+Recommended cutover flow:
+1. Deploy to a fresh validation Space first, for example `doc-ingestion-demo-v2`.
+2. Confirm the Space is using Docker SDK metadata.
+3. Wait for Docker build completion.
+4. Smoke-test:
+   - `/`
+   - `/health`
+   - `/metrics`
+   - `/openapi.json`
+   - `POST /sessions`
+   - document upload
+   - scoped query
+   - streaming query fallback behavior
+5. Validate logs for bootstrap, model download, and session janitor errors.
+6. Only then switch the public demo target.
+## Rollback Plan
+Rollback must remain available until Phase 6.4 is intentionally executed.
+Fast rollback:
+- Revert `spaces/README.md` to Streamlit SDK metadata:
+  - `sdk: streamlit`
+  - `sdk_version: "1.37.0"`
+  - `app_file: spaces/app.py`
+- Restore the pre-cutover `spaces/app.py` behavior that starts FastAPI in a thread and delegates to Streamlit.
+- Keep `src/web/streamlit_app.py` and `streamlit` dependency untouched during Phase 6.3.
+Container rollback:
+- Revert the Dockerfile to the previous Python-only image if the multi-stage build breaks HF.
+- Keep the React app and backend changes in the branch if they are not the cause.
+Rollback validation:
+- HF Space boots in Streamlit SDK mode.
+- Streamlit UI loads.
+- `/health` is reachable from the background FastAPI server.
+- Sample prompts still work.
+## Acceptance Criteria
+- One Docker image serves FastAPI and the built React SPA.
+- `/`, static assets, and client-side browser routes work from the container.
+- `/health`, `/metrics`, `/query`, `/query/stream`, `/sessions`, and `/openapi.json` keep expected behavior.
+- HF Spaces runs the Docker SDK Space on `app_port: 8000`.
+- CI validates backend tests, frontend checks, frontend build, and e2e smoke against a running FastAPI backend.
+- Streamlit rollback is documented and tested.
+## Handoff to Phase 6.4
+Do not start Phase 6.4 until:
+- React demo has soaked for at least one week in the Docker deployment.
+- No unresolved severity 1 or severity 2 deployment/runtime defects remain.
+- The team confirms Streamlit rollback is no longer needed.
+- The rollback steps above were tested at least once during cutover.

Docs/Phase6.3-Container-Cutover-Plan.md ADDED Viewed

	@@ -0,0 +1,62 @@

+# Phase 6.3 Plan: Single-Container Deploy and HF Spaces Cutover
+Source of truth: `Docs/Phase6-RefactorDemo_React.md` (this file is an execution slice for iterative delivery).
+Depends on: `Docs/Phase6.2-React-MVP-Plan.md`
+## Objective
+Deploy one container (FastAPI + built React SPA) to simplify delivery and align Hugging Face Spaces runtime with the new UI.
+## Scope
+Migrate from Streamlit SDK Space to Docker SDK Space with rollback path preserved.
+## Files to modify
+- `docker/Dockerfile`
+  - Multi-stage build:
+    - Node stage builds `frontend/dist`
+    - Python stage copies static assets to `/app/static`
+  - Final command runs uvicorn only.
+- `src/api/main.py`
+  - Mount static UI when available.
+  - Keep API route behavior intact (`/health`, `/metrics`, `/query`, `/query/stream`).
+  - Ensure SPA fallback does not shadow API routes.
+- `spaces/README.md`
+  - Switch to:
+    - `sdk: docker`
+    - `app_port: 8000`
+  - Remove the Streamlit-only `sdk_version` and `app_file` settings.
+- `spaces/app.py`
+  - Repurpose as thin env bootstrap + uvicorn launcher, or remove if no longer needed.
+- `.github/workflows/sync-to-spaces.yml`
+  - Keep CI lean; prefer relying on HF Docker build unless prebuild is required.
+- `.github/workflows/ci.yml`
+  - Add frontend job (`lint`, `test`, `build`).
+  - Add e2e job booting API + running Playwright.
+## Verification
+```bash
+docker build -f docker/Dockerfile -t doc-ingest:demo .
+docker run --rm -p 8000:8000 \
+  -e DOC_PROFILE=demo -e DOC_DEMO_UPLOADS=1 \
+  -e DOC_EMBEDDING_PROVIDER=sentence_transformers \
+  doc-ingest:demo
+open http://127.0.0.1:8000
+```
+Then push branch and validate HF Space after Docker rebuild.
+## Handoff (Exit Criteria)
+- Unified container runs locally and in HF with expected route behavior.
+- Core API endpoints stay reachable and validated.
+- Deployed smoke tests pass.
+- Rollback path to pre-cutover setup is documented and tested.
+## Transition to Phase 6.4
+- React demo has soaked for at least one week.
+- No unresolved high-severity deployment/runtime defects.
+- Team confirms Streamlit rollback is no longer needed.

Docs/Phase6.4-Streamlit-Decommission-Implementation-Spec.md ADDED Viewed

	@@ -0,0 +1,384 @@

+# Phase 6.4 Implementation Spec: Streamlit Decommission
+Source plan: `Docs/Phase6.4-Streamlit-Decommission-Plan.md`
+Depends on: `Docs/Phase6.3-Container-Cutover-Plan.md`
+Optional phase: execute only after the Docker React deployment has stabilized.
+## Objective
+Remove the Streamlit runtime, legacy UI path, and Streamlit-only tests after the React + FastAPI Docker deployment is stable and rollback to Streamlit is no longer required.
+The final system should have one supported user interface:
+- React SPA served by FastAPI.
+- FastAPI APIs for querying, session uploads, metrics, health, and sample prompts.
+- No Streamlit dependency, process, compose service, or documentation path.
+## Entry Criteria
+Start this phase only when all are true:
+- Phase 6.3 has been deployed for at least one week.
+- No unresolved severity 1 or severity 2 issues exist for the React + FastAPI Docker runtime.
+- The team explicitly confirms Streamlit rollback is no longer needed.
+- The Phase 6.3 rollback procedure has been tested and documented.
+- A current branch or tag exists that can restore the Streamlit implementation if needed later.
+## Non-Goals
+- Do not change retrieval, reranking, citation, or provider behavior.
+- Do not change session isolation semantics.
+- Do not redesign the React UI.
+- Do not remove shared ingestion helpers that are still used by API upload endpoints.
+- Do not delete demo sample prompts; move them to an API-served shared source.
+## Implementation Sequence
+### 1. Inventory Streamlit References
+Find all active references before deleting anything:
+```bash
+rg "streamlit|8501|src/web/streamlit_app|DOC_INGEST_API_URL|_DEMO_QUESTIONS" .
+```
+Classify each match:
+- Delete: Streamlit runtime, Streamlit command, Streamlit-only tests.
+- Replace: documentation and quickstart references.
+- Keep: generic `src/web` helper modules used by the API, such as `ingestion_service.py` and `session_corpus.py`.
+Expected Streamlit-specific items to remove or update:
+- `src/web/streamlit_app.py`
+- `streamlit>=...` in `requirements/base.txt`
+- `streamlit` service in `docker/docker-compose.yml`
+- `tests/unit/test_streamlit_demo_routing.py`
+- Streamlit SDK references in `README.md` and `spaces/README.md`
+- Port `8501` references in docs and Docker metadata
+### 2. Move Sample Prompts to a Shared API Source
+The Streamlit app currently owns `_DEMO_QUESTIONS`. The React app currently has a shorter hard-coded prompt list in `frontend/src/components/SamplePromptChips.tsx`.
+Create a backend-owned shared source before deleting Streamlit.
+Recommended file:
+```text
+src/api/sample_prompts.py
+```
+Recommended content shape:
+```python
+SAMPLE_PROMPTS: tuple[str, ...] = (
+    "What is Retrieval-Augmented Generation?",
+    "What are the two main phases of a RAG system?",
+    "How does hybrid retrieval work?",
+    "What is BM25 and how does it differ from vector search?",
+    "What are the weaknesses of BM25?",
+    "What is Reciprocal Rank Fusion (RRF)?",
+    "What is a vector database?",
+    "What is HNSW?",
+    "What is the difference between Chroma and Qdrant?",
+    "Why use hybrid retrieval instead of just dense vector search?",
+    "What failure mode does citation tracking help detect?",
+    "How are embeddings used in a RAG pipeline?",
+)
+```
+Add an API endpoint in `src/api/main.py`:
+```text
+GET /api/sample-prompts
+```
+Response contract:
+```json
+{
+  "prompts": [
+    "What is Retrieval-Augmented Generation?"
+  ]
+}
+```
+Best practices:
+- Keep the endpoint unauthenticated. It is static demo content.
+- Register it before the SPA static mount.
+- Keep response shape stable and explicit.
+- If API models are used for typed responses, add a small Pydantic response model.
+- Add the endpoint to frontend OpenAPI generation if the frontend consumes generated types.
+### 3. Update React Sample Prompt Consumption
+Replace the hard-coded prompt array in `frontend/src/components/SamplePromptChips.tsx` with API-backed data.
+Recommended approach:
+- Add `getSamplePrompts()` to `frontend/src/api/client.ts`.
+- Use TanStack Query in either `SamplePromptChips` or the parent `QueryTab`.
+- Render a small loading state or skeleton while prompts load.
+- Provide a local fallback only for network failure, using the same canonical prompt text as the backend. Keep the fallback clearly secondary so backend remains the source of truth.
+Testing requirements:
+- Unit test that prompts returned by the API render as chips.
+- Unit test that selecting a prompt still fills the query text and resets scope to Global if that behavior already exists.
+- Unit test the failure fallback or empty-state UI.
+### 4. Delete Streamlit Runtime Code
+Delete:
+```text
+src/web/streamlit_app.py
+```
+Keep:
+```text
+src/web/ingestion_service.py
+src/web/session_corpus.py
+```
+Reason:
+- `ingestion_service.py` and `session_corpus.py` are not UI-only code: the FastAPI session upload endpoints depend on them.
+- The package name `src.web` can remain for now to avoid a broad refactor. A later cleanup may move these helpers into `src/api` or `src/services`.
+After deletion, run:
+```bash
+rg "src.web.streamlit_app|streamlit_app|_DEMO_QUESTIONS" src tests frontend Docs README.md spaces
+```
+Expected result:
+- No runtime references remain.
+- `_DEMO_QUESTIONS` has been replaced by `SAMPLE_PROMPTS`.
+### 5. Remove Streamlit Dependency
+Edit `requirements/base.txt`:
+- Remove `streamlit>=1.37.0`.
+- Keep `requests`, `fastapi`, `python-multipart`, and `uvicorn` because the API still needs them.
+Validation:
+```bash
+python -m pip install -r requirements/base.txt
+PYTHONPATH=. python -m pytest tests/unit -q
+```
+Best practice:
+- If a lockfile is introduced later, regenerate it in the same change.
+- Do not remove dependencies solely because they were imported by Streamlit unless no remaining module imports them.
+### 6. Simplify Docker Compose
+Edit `docker/docker-compose.yml`:
+- Remove the `streamlit` service.
+- Remove port `8501`.
+- Keep `api`, `redis`, `qdrant`, and shared volumes.
+- Ensure the API service exposes the React UI through `8000`.
+- Add demo env vars to the API service only if local compose should support demo uploads by default.
+Recommended local URL after this phase:
+```text
+http://localhost:8000
+```
+Compose validation:
+```bash
+docker compose -f docker/docker-compose.yml up --build
+curl -fsS http://127.0.0.1:8000/health
+open http://127.0.0.1:8000
+```
+### 7. Update Dockerfile and HF Files
+Review files touched in Phase 6.3:
+- `docker/Dockerfile`
+- `spaces/README.md`
+- `spaces/app.py`
+Required outcomes:
+- No `EXPOSE 8501`.
+- No Streamlit command.
+- No Streamlit SDK metadata.
+- No docs claiming `spaces/app.py` is the Streamlit entrypoint.
+If `spaces/app.py` is no longer used:
+- Delete it only if HF Docker runtime and local workflows do not import it.
+- Keep `spaces/bootstrap_demo.py` if the Docker startup path still uses it.
+If `spaces/app.py` is kept as a bootstrap helper:
+- Remove all Streamlit imports and comments.
+- Keep only demo env defaults/bootstrap logic that is still called.
+### 8. Update Documentation
+Update `README.md`:
+- Replace Streamlit quickstart with React + FastAPI quickstart.
+- Change Docker instructions to open `http://localhost:8000`.
+- Update architecture bullets:
+  - `src/api/` serves FastAPI routes and the React SPA.
+  - `frontend/` contains the React app.
+  - `src/web/` should not be described as the UI layer if it remains only for helper modules.
+- Remove screenshots or text that show the Streamlit sidebar.
+- Add sample prompt endpoint reference if useful for frontend/API developers.
+Update `spaces/README.md`:
+- Confirm it describes Docker SDK and app port `8000`.
+- Remove upload-disabled Streamlit limitations if Phase 6.1 uploads are enabled.
+- Describe the supported upload caps and TTL.
+Update any runbooks or phase docs that still instruct users to run:
+```bash
+streamlit run src/web/streamlit_app.py
+```
+Replace with:
+```bash
+uvicorn src.api.main:app --host 127.0.0.1 --port 8000
+cd frontend && npm run dev
+```
+or, for unified container:
+```bash
+docker build -f docker/Dockerfile -t doc-ingest:demo .
+docker run --rm -p 8000:8000 doc-ingest:demo
+```
+### 9. Remove or Replace Streamlit Tests
+Delete:
+```text
+tests/unit/test_streamlit_demo_routing.py
+```
+Add or extend tests so the removed behavior remains covered through API and React tests:
+- API test for `GET /api/sample-prompts`.
+- API test that demo upload/session routes still work when `DOC_PROFILE=demo` and `DOC_DEMO_UPLOADS=1`.
+- Frontend test that sample prompts render from API data.
+- Frontend test that sample prompt selection populates the query.
+- Playwright smoke that loads the unified UI and runs a global sample prompt.
+Important:
+- Do not reduce coverage for provider/model request passing, session scope, or citation provenance if those were previously asserted through Streamlit tests.
+- Move assertions to API or frontend tests rather than deleting them outright.
+## Validation Checklist
+Run after implementation:
+```bash
+rg "streamlit|8501|src/web/streamlit_app|_DEMO_QUESTIONS" .
+```
+Expected allowed matches:
+- Historical phase docs may mention Streamlit as completed/decommissioned context.
+- No active runtime, dependency, compose, CI, or README quickstart references should remain.
+Backend:
+```bash
+PYTHONPATH=. python -m pytest tests/unit -q
+PYTHONPATH=. python -m pytest tests/integration -q
+PYTHONPATH=. uvicorn src.api.main:app --host 127.0.0.1 --port 8000
+```
+API smoke:
+```bash
+curl -fsS http://127.0.0.1:8000/health
+curl -fsS http://127.0.0.1:8000/api/sample-prompts
+curl -fsS http://127.0.0.1:8000/
+```
+Frontend:
+```bash
+cd frontend
+npm ci
+npm run lint
+npm run typecheck
+npm run test
+npm run build
+npm run test:e2e
+```
+Docker:
+```bash
+docker build -f docker/Dockerfile -t doc-ingest:demo .
+docker run --rm -p 8000:8000 \
+  -e DOC_PROFILE=demo \
+  -e DOC_DEMO_UPLOADS=1 \
+  -e DOC_EMBEDDING_PROVIDER=sentence_transformers \
+  doc-ingest:demo
+```
+Manual smoke:
+- Open `http://127.0.0.1:8000`.
+- Confirm the React UI loads.
+- Confirm sample prompt chips load from the API.
+- Run a global sample prompt.
+- Upload one small supported file.
+- Query with `Mine` scope and verify citation provenance.
+- Clear the session and confirm a new session is minted.
+## Rollback Plan
+Rollback after this phase is no longer the normal operating path. If rollback is required, use the saved Phase 6.3 branch/tag.
+Emergency rollback steps:
+1. Restore `src/web/streamlit_app.py`.
+2. Restore `streamlit` in `requirements/base.txt`.
+3. Restore the `streamlit` service in `docker/docker-compose.yml`.
+4. Restore Streamlit SDK metadata in `spaces/README.md` if rolling HF back to the old runtime.
+5. Restore `spaces/app.py` Streamlit launcher behavior.
+6. Re-run backend tests and a Streamlit smoke test.
+Because Phase 6.4 intentionally removes the rollback path, require team approval before merging it.
+## Acceptance Criteria
+- Streamlit runtime code is removed.
+- `streamlit` dependency is removed.
+- Docker Compose has no Streamlit service or `8501` port.
+- HF and Docker docs describe only React + FastAPI on port `8000`.
+- Sample prompts are served by `GET /api/sample-prompts` and consumed by the React UI.
+- API, frontend, e2e, and Docker smoke checks pass.
+- No active runtime or onboarding docs instruct users to run Streamlit.
+## Handoff
+After merge:
+- Mark Phase 6.4 complete in the phase index.
+- Record the final React + FastAPI deployment URL and smoke-test date.
+- Move any deferred cleanup, such as relocating `src/web/ingestion_service.py`, to the Phase 7 backlog.

Docs/Phase6.4-Streamlit-Decommission-Plan.md ADDED Viewed

	@@ -0,0 +1,38 @@

+# Phase 6.4 Plan: Streamlit Decommission (Optional)
+Source of truth: `Docs/Phase6-RefactorDemo_React.md` (this file is an execution slice for iterative delivery).
+Depends on: `Docs/Phase6.3-Container-Cutover-Plan.md`
+## Objective
+Remove Streamlit runtime and legacy paths after React + FastAPI deployment has stabilized.
+## Scope
+Run only after at least one week of stable production-like behavior from Phase 6.3.
+## Tasks
+- Delete `src/web/streamlit_app.py`.
+- Remove `streamlit` from `requirements/base.txt`.
+- Remove Streamlit container from `docker/docker-compose.yml`.
+- Update `README.md` screenshots and quickstart docs.
+- Remove `tests/unit/test_streamlit_demo_routing.py`.
+- Keep sample prompts by serving them via API (`GET /api/sample-prompts`) as shared source of truth.
+## Verification
+- Confirm no imports/runtime references to Streamlit remain.
+- Run backend/frontend test suites and smoke checks after cleanup.
+- Confirm docs and onboarding instructions match new architecture.
+## Handoff (Exit Criteria)
+- Streamlit code/dependencies/tests are removed cleanly.
+- Docs fully reflect React + FastAPI flow.
+- Sample prompts are centrally served and consumed.
+## Transition to Next Program Increment
+- Phase 6 closes with 6.1-6.3 complete and 6.4 executed (or intentionally deferred).
+- Deferred improvements move to Phase 7 backlog.

Docs/phase5_observability.md ADDED Viewed

	@@ -0,0 +1,412 @@

+# Phase 5: Production Monitoring & Observability
+**Timeline:** 3 weeks
+**Status:** Complete
+**Owner:** Vamshi Pokala
+## Overview
+Phase 5 hardens the doc-ingestion RAG system for production through:
+1. **Distributed tracing** (LangFuse) for end-to-end pipeline visibility
+2. **Latency profiling** (P50, P95, P99) per step
+3. **Cost tracking** (USD per request)
+4. **Real-time metrics dashboard** at `/observability/dashboard`
+5. **Regression gating** (GitHub Actions) to prevent accuracy degradation on PRs
+6. **Citation accuracy monitoring** (groundedness, coverage trends)
+## Architecture
+### Tracing Flow
+```
+User Query
+    ↓
+[LangFuse Trace Start]
+    ↓
+Retrieval (BM25 + Vector)
+[TRACE: latency, chunks retrieved, scores]
+    ↓
+Reranking (Cross-Encoder)
+[TRACE: latency, input/output chunks]
+    ↓
+Generation (LLM)
+[TRACE: latency, tokens, cost, provider]
+    ↓
+Citation Verification
+[TRACE: latency, citations verified]
+    ↓
+[Flush to LangFuse]
+    ↓
+Response + Metrics Recorded
+```
+### Metrics Aggregation
+```
+Per-Request Metrics (RequestMetrics)
+    ↓
+In-Memory Collector (1000 rolling window)
+    ↓
+Dashboard Endpoint (/observability/dashboard)
+    ↓
+JSON: P50/P95/P99 latencies, cost trends, quality scores
+```
+### Regression Gating
+```
+PR Submitted
+    ↓
+GitHub Actions: Run evals on golden dataset
+    ↓
+Compare against baseline (main branch)
+    ↓
+Check: Latency increase <5%? Quality decrease <5%?
+    ↓
+If FAIL: Block PR + comment with regression details
+If PASS: Allow merge
+```
+## Key Components
+### 1. Observability Module (`src/core/observability.py`)
+**Provides:**
+- `RAGObserver` class with step-level tracing context managers
+- LangFuse client integration
+- No-op when disabled (useful for demo mode)
+- Background-safe async flush
+**Usage:**
+```python
+observer = get_observer()
+# One trace per request, spans as children
+with observer.trace_request("rag_query", query=query_text) as trace:
+    with observer.trace_step(trace, "retrieval") as s:
+        result = retriever.retrieve(query)
+        s["chunks_retrieved"] = len(result)
+    with observer.trace_step(trace, "generation", {"provider": provider}) as s:
+        answer = generator.generate(query, result)
+observer.flush_async()  # non-blocking
+```
+### 2. Metrics Collector (`src/monitoring/metrics.py`)
+**Provides:**
+- `MetricsCollector` for in-memory aggregation
+- Percentile calculations (P50, P95, P99)
+- Dashboard-friendly JSON aggregations
+- Thread-safe recording
+**Metrics tracked:**
+```
+Latency:
+- total_latency_ms (P50, P95, P99)
+- retrieval_avg_ms
+- reranking_avg_ms
+- generation_avg_ms
+- citation_avg_ms
+- Breakdown percentages
+Cost:
+- total_usd (across all requests)
+- avg_per_request_usd
+- p95_per_request_usd
+Quality (online — no ground truth required):
+- citation_groundedness_avg
+- nli_faithfulness_avg
+```
+### 3. Regression Gate Script (`scripts/compare_evals.py`)
+**Compares:**
+- Baseline metrics (main branch)
+- Current metrics (PR branch)
+- Threshold: 5% by default (configurable)
+**Fails if:**
+- Latency increases >5%
+- Quality decreases >5%
+- Cost increases >5%
+### 4. Regression Gate in `.github/workflows/ci.yml` (extended `evals-golden` job)
+**On every PR:**
+1. Runs offline evaluations against `evals/datasets/golden_ci.jsonl`
+2. Compares against committed `evals/reports/baseline.json`
+3. Blocks PR if regressions detected
+4. Comments with regression details
+## Setup Instructions
+### Step 1: Set Environment Variables
+```bash
+# For development with LangFuse
+export LANGFUSE_PUBLIC_KEY=pk_...
+export LANGFUSE_SECRET_KEY=sk_...
+# For testing (disabled)
+export DOC_PROFILE=demo  # Disables LangFuse
+```
+### Step 2: Install Dependencies
+```bash
+# langfuse is in requirements/base.txt
+pip install -r requirements/base.txt  # Includes langfuse>=2.0.0
+```
+### Step 3: Configure Baseline (One-Time, commit to repo)
+Already done! `evals/reports/baseline.json` is committed.
+To regenerate from main branch:
+```bash
+git checkout main
+PYTHONPATH=. python -m evals.run_evals \
+  --dataset evals/datasets/golden_ci.jsonl \
+  --judge-provider anthropic \
+  --judge-model claude-haiku-4-5 \
+  --output evals/reports/baseline.json
+git add evals/reports/baseline.json
+git commit -m "chore: update Phase 5 eval baseline"
+```
+### Step 4: Query and Monitor
+```bash
+# Start API with LangFuse enabled
+export LANGFUSE_PUBLIC_KEY=pk_... LANGFUSE_SECRET_KEY=sk_...
+PYTHONPATH=. uvicorn src.api.main:app --reload
+# In another terminal, query
+curl -X POST http://localhost:8000/query \
+  -H "Content-Type: application/json" \
+  -d '{"query": "What is RAG?"}'
+# View dashboard
+curl http://localhost:8000/observability/dashboard | jq .
+# Output:
+# {
+#   "summary": { "total_requests": 1, ... },
+#   "latency": {
+#     "total_p50_ms": 1247.3,
+#     "total_p95_ms": 1247.3,
+#     "breakdown_pct": {
+#       "retrieval": 18.2,
+#       "reranking": 12.1,
+#       "generation": 68.4,
+#       "citation": 1.3
+#     }
+#   },
+#   "cost": { "avg_per_request_usd": 0.00245 },
+#   "quality": {
+#     "citation_groundedness_avg": 0.92,
+#     "nli_faithfulness_avg": 0.88
+#   }
+# }
+```
+## Testing
+### Unit Tests
+```bash
+# Observability tests
+pytest tests/unit/test_observability.py -v
+# Metrics tests
+pytest tests/unit/test_metrics.py -v
+# Regression gate tests
+pytest tests/unit/test_regression_gate.py -v
+```
+### Integration Test
+```bash
+# Full E2E with tracing enabled
+LANGFUSE_PUBLIC_KEY=pk_... LANGFUSE_SECRET_KEY=sk_... \
+PYTHONPATH=. python -c "
+from src.api.main import app
+from fastapi.testclient import TestClient
+client = TestClient(app)
+response = client.post('/query', json={'query': 'What is RAG?'})
+print(response.json())
+# Should include request_id and all metrics
+"
+```
+## Metrics Interpretation
+### Latency Breakdown Example
+```
+Total P50: 1247.3 ms
+Breakdown:
+- Retrieval:   227 ms (18.2%)  ← BM25 + Vector Search
+- Reranking:   151 ms (12.1%)  ← Cross-Encoder Rerank
+- Generation:  853 ms (68.4%)  ← LLM inference
+- Citation:     16 ms ( 1.3%)  ← Citation Verification
+Interpretation:
+Generation is the bottleneck (68.4% of total).
+Could optimize by:
+1. Using a faster model
+2. Using streaming
+3. Reducing context size
+```
+### Quality Metrics Example
+```
+Citation Groundedness: 0.92 (92% of citations verified)
+NLI Faithfulness:      0.88 (88% of answer supported by chunks)
+Interpretation:
+- Citation groundedness is strong (92%)
+- Faithfulness could improve (88%)
+- Consider reranking strategy improvements
+```
+### Cost Estimation Example
+```
+Cost per Request: $0.00245 (avg)
+Cost at P95:      $0.00312
+Annual projection (10K requests/day):
+365 * 10K * $0.00245 = $8,942.50
+Cost Optimization:
+- Switch to cheaper model?
+- Use batch inference?
+- Cache common queries?
+```
+## Deployment Notes
+### Docker
+```dockerfile
+# In docker/Dockerfile, ensure observability deps are included
+# langfuse is in requirements/base.txt
+RUN pip install -r requirements/base.txt
+```
+```yaml
+# docker-compose sets env vars
+environment:
+  - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
+  - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
+```
+### Streamlit (Demo Mode)
+```python
+# In demo mode, observability is disabled
+if os.getenv("DOC_PROFILE") == "demo":
+    observer = RAGObserver(enabled=False)  # No-op
+```
+## Troubleshooting
+### LangFuse traces not appearing
+```
+1. Check credentials: LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY set?
+2. Check network: Can you reach https://cloud.langfuse.com?
+3. Check logs: Do you see "LangFuse observability enabled"?
+4. Verify flush: observer.flush_async() called after each request?
+```
+### Dashboard metrics all zeros
+```
+1. Check MetricsCollector is receiving data:
+   python -c "from src.monitoring.metrics import get_metrics_collector; print(len(get_metrics_collector().metrics))"
+2. Have you sent enough requests? (P95 needs at least 20 samples)
+3. Is metrics_collector.record_request() being called in /query endpoint?
+```
+### Regression gate always failing
+```
+1. Baseline exists? Check that evals/reports/baseline.json is present (it is committed to the repo as part of Phase 5).
+   If it is missing: restore it from main or regenerate it (see Setup Instructions, Step 3).
+2. Threshold too strict? Default is 5%, try --threshold 10
+3. Eval dataset: correct file is evals/datasets/golden_ci.jsonl
+4. Check eval logs for errors: see artifact evals/reports/pr-current.json
+```
+## Files Changed/Created
+### Week 1: Instrumentation
+- ✅ `src/core/observability.py` (NEW)
+- ✅ `tests/unit/test_observability.py` (NEW)
+- ✅ `src/core/rag_orchestrator.py` (MODIFIED - added tracing)
+- ✅ `src/api/main.py` (MODIFIED - minimal changes)
+- ✅ `requirements/base.txt` (MODIFIED - added langfuse)
+### Week 2: Metrics Dashboard
+- ✅ `src/monitoring/metrics.py` (NEW)
+- ✅ `tests/unit/test_metrics.py` (NEW)
+- ✅ `src/api/main.py` (MODIFIED - added metrics recording and dashboard endpoint)
+- ✅ `src/utils/log.py` (MODIFIED - replaced MetricsCollector)
+### Week 3: Regression Gating
+- ✅ `scripts/compare_evals.py` (NEW)
+- ✅ `tests/unit/test_regression_gate.py` (NEW)
+- ✅ `.github/workflows/ci.yml` (MODIFIED - extended evals-golden job)
+- ✅ `evals/reports/baseline.json` (NEW - committed baseline)
+## Next Steps (Post-Phase 5)
+- [ ] Grafana dashboard integration for long-term trends
+- [ ] Alert thresholds (PagerDuty for latency spikes)
+- [ ] Cost attribution per LLM provider
+- [ ] A/B testing framework (compare models, prompts)
+- [ ] User feedback loop (thumbs up/down on answers)
+- [ ] Fine-tuning based on eval failures
+## Interview Stories
+### "How do you ensure production RAG reliability?"
+> At Marriott, we deployed an agent handling 10K+ guest queries daily. Without observability, we'd have no idea if accuracy was degrading. I instrumented the pipeline with LangFuse tracing to see every step: retrieval latency, reranking precision, generation tokens, citation accuracy. Now I have a dashboard showing P50/P95 latency breakdown, cost per request, and quality metrics. And I wired up regression gating so no code change ships unless it passes a golden dataset evaluation. This is how you build trust in production AI systems.
+### "How would you scale an AI platform?"
+> Observability is first-class, not an afterthought. The moment you deploy, you need distributed tracing to answer: Where's the bottleneck? Is generation or retrieval slowing us down? What's the cost per request? How are quality metrics trending? I built this with LangFuse + a metrics collector, so we can see the full stack at P50/P95. Then I added regression gating in CI/CD to prevent accuracy regressions from ever shipping.
+### "Describe your observability architecture"
+> Every RAG pipeline step is traced to LangFuse: retrieval, reranking, generation, citation verification. We compute P50/P95/P99 latencies per step and expose them on a dashboard. We also track cost per request and quality metrics (citation groundedness, NLI faithfulness). In CI/CD, we compare PR eval results against a baseline — if latency increases >5% or quality decreases >5%, the PR is blocked with a detailed comment. This gives us real-time visibility and prevents regressions.
+## Approval Checklist
+- [x] Week 1: LangFuse integration with correct span hierarchy (one trace/request, spans as children)
+- [x] Week 1: Instrumentation in `RAGOrchestrator.run()`, not `main.py`
+- [x] Week 1: `flush_async()` used everywhere (no synchronous flush in request path)
+- [x] Week 2: `MetricsCollector` in `src/monitoring/metrics.py` (new one, old one updated for compatibility)
+- [x] Week 2: `RequestMetrics` has no `mrr`/`ndcg` fields
+- [x] Week 3: Regression comparison added to existing `evals-golden` job in `ci.yml`
+- [x] Week 3: `evals/reports/baseline.json` committed to repo
+- [x] Tests: All unit tests passing
+- [x] Integration: E2E query with tracing + metrics recording
+- [x] Interview ready: Stories prepared
+## Timeline Summary
+| Week | Deliverable | Status |
+|------|-------------|--------|
+| 1 | LangFuse tracing | ✅ Complete |
+| 2 | Metrics + dashboard | ✅ Complete |
+| 3 | Regression gating + docs | ✅ Complete |
+**Total effort:** ~40-50 hours over 3 weeks
+---
+**Generated:** 2026-05-01
+**Last Updated:** 2026-05-01

README.md CHANGED Viewed

@@ -3,16 +3,15 @@ title: Doc Ingestion RAG Demo
 emoji: 📚
 colorFrom: blue
 colorTo: indigo
-sdk: streamlit
-sdk_version: "1.37.0"
-app_file: spaces/app.py
 pinned: false
 license: mit
 ---
 # Doc-Ingestion
-Doc-Ingestion is a citation-aware RAG system that turns private document collections into grounded question-answering experiences. It demonstrates how to ingest documents, retrieve the right evidence, generate answers from that evidence, and return citations plus truthfulness signals through a Streamlit app, FastAPI service, and CLI.
 > **[Try the live demo on Hugging Face Spaces](https://huggingface.co/spaces/vampokala/doc-ingestion)** - no install required.
@@ -154,7 +153,7 @@ In hosted demo mode (`DOC_PROFILE=demo`), Streamlit executes queries in-process
 ### Try Online
-Open the [Hugging Face Spaces demo](https://huggingface.co/spaces/vampokala/doc-ingestion). Sample documents about RAG, vector databases, and BM25 are preloaded. Paste your OpenAI, Anthropic, or Gemini key in the sidebar if you want to use a cloud provider.
 ### Run Locally With Docker
@@ -166,7 +165,7 @@ cp docker/.env.example docker/.env
 docker compose -f docker/docker-compose.yml up
 ```
-Open `http://localhost:8501` for Streamlit or `http://localhost:8000` for the API.
 ### Run From Source
@@ -193,6 +192,28 @@ PYTHONPATH=. python -m src.query "What is RAG?"
 For a full local and Docker runbook, see [`Docs/RUNBOOK.md`](Docs/RUNBOOK.md).
 ## API Usage
 ```bash
@@ -293,6 +314,11 @@ export GEMINI_API_KEY=...
 export DOC_API_KEYS=dev-key-1
 ```
 ## Troubleshooting
 - **Empty results after ingest:** Run `python -m src.ingest --docs data/documents` and verify `data/embeddings/` exists.
@@ -300,3 +326,4 @@ export DOC_API_KEYS=dev-key-1
 - **Dimension mismatch after model change:** Re-ingest all documents to rebuild the vector index.
 - **Cloud provider fails:** Check the relevant `*_API_KEY` env var is set.
 - **Truthfulness score always 0:** The NLI model (`cross-encoder/nli-deberta-v3-small`) downloads on first use. Check internet access or set `evaluation.inline_enabled: false` in `config.yaml` to disable.

 emoji: 📚
 colorFrom: blue
 colorTo: indigo
+sdk: docker
+app_port: 8000
 pinned: false
 license: mit
 ---
 # Doc-Ingestion
+Doc-Ingestion is a citation-aware RAG system that turns private document collections into grounded question-answering experiences. It demonstrates how to ingest documents, retrieve the right evidence, generate answers from that evidence, and return citations plus truthfulness signals through a React UI (served by FastAPI), standalone FastAPI, optional Streamlit legacy UI, and CLI.
 > **[Try the live demo on Hugging Face Spaces](https://huggingface.co/spaces/vampokala/doc-ingestion)** - no install required.
 ### Try Online
+Open the [Hugging Face Spaces demo](https://huggingface.co/spaces/vampokala/doc-ingestion). Sample documents about RAG, vector databases, and BM25 are preloaded. Paste your OpenAI, Anthropic, or Gemini key in the app if you want to use a cloud provider.
 ### Run Locally With Docker
 docker compose -f docker/docker-compose.yml up
 ```
+Open `http://localhost:8000` for the React UI and API (single container image).
 ### Run From Source
 For a full local and Docker runbook, see [`Docs/RUNBOOK.md`](Docs/RUNBOOK.md).
+## Ollama and Hugging Face Spaces
+**`SPACE_ID` is not a file in this repository.** It is a **runtime environment variable** that [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces-overview) sets inside the Space container (for example `your-username/your-space-name`). Doc-Ingestion reads it from the process environment in [`src/utils/config.py`](src/utils/config.py) when `load_config("config.yaml")` runs. Static LLM provider and model lists still live in [`config.yaml`](config.yaml); Ollama is only removed from the **effective** config when Space detection says it should be.
+If you **clone this repo and run it locally** (source or Docker on your machine), **Hugging Face does not set `SPACE_ID`**. The Ollama provider therefore stays in the default LLM list from `config.yaml`, and you can use it after starting the [Ollama](https://ollama.com) daemon and pulling the chat and embedding models described in [`Docs/RUNBOOK.md`](Docs/RUNBOOK.md).
+On **Hugging Face Spaces**, the platform **injects `SPACE_ID`** (for example `your-username/your-space-name`). Doc-Ingestion reads that at startup and **removes Ollama** from allowed providers and from `GET /config/llm`, because there is no local Ollama service in the hosted container. Hosted demos use OpenAI, Anthropic, or Gemini with keys you supply in the UI or environment.
+| Where you run | `SPACE_ID` | Ollama in the app |
+|---------------|------------|-------------------|
+| Your laptop or your own server / Docker | Not set by default | Yes (per `config.yaml`) |
+| Hugging Face Space | Set automatically by HF | No (automatic) |
+**Do not define `SPACE_ID` yourself** for local deployment. It exists so the app can tell it is running inside a Space. If you copied Space-style environment variables into a local `.env` and Ollama disappeared from the UI, remove `SPACE_ID` or set **`DOC_OLLAMA_ENABLED=1`** to force Ollama back on.
+**Explicit override (optional):**
+- `DOC_OLLAMA_ENABLED=0` — hide Ollama even when `SPACE_ID` is unset (useful if you want cloud-only in your own container).
+- `DOC_OLLAMA_ENABLED=1` — show Ollama even when `SPACE_ID` is set (rare; only if you had a sidecar Ollama and extended the image yourself).
+Implementation: [`src/utils/config.py`](src/utils/config.py) (`doc_ollama_runtime_enabled`, applied inside `load_config`).
 ## API Usage
 ```bash
 export DOC_API_KEYS=dev-key-1
 ```
+Deployment-related environment variables (not stored in `config.yaml`; see [Ollama and Hugging Face Spaces](#ollama-and-hugging-face-spaces) above):
+- **`SPACE_ID`** — injected on Hugging Face Spaces only. You do not add this to a local config file for normal development.
+- **`DOC_OLLAMA_ENABLED`** — optional explicit override: `0` / `false` to hide Ollama, `1` / `true` to show it even when `SPACE_ID` is set.
 ## Troubleshooting
 - **Empty results after ingest:** Run `python -m src.ingest --docs data/documents` and verify `data/embeddings/` exists.
 - **Dimension mismatch after model change:** Re-ingest all documents to rebuild the vector index.
 - **Cloud provider fails:** Check the relevant `*_API_KEY` env var is set.
 - **Truthfulness score always 0:** The NLI model (`cross-encoder/nli-deberta-v3-small`) downloads on first use. Check internet access or set `evaluation.inline_enabled: false` in `config.yaml` to disable.
+- **Ollama missing from the UI or `/config/llm` locally:** You may have `SPACE_ID` or `DOC_OLLAMA_ENABLED=0` in your shell or `docker/.env`. Unset `SPACE_ID` for local runs, or set `DOC_OLLAMA_ENABLED=1`. There is no separate `SPACE_ID` configuration file in the repo—only environment variables and [`config.yaml`](config.yaml).

docker/Dockerfile DELETED Viewed

@@ -1,44 +0,0 @@
-FROM python:3.11-slim
-WORKDIR /app
-# Install system deps needed by python-magic and runtime health checks.
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libmagic1 \
-    curl \
-    ca-certificates \
-    && rm -rf /var/lib/apt/lists/*
-COPY requirements/base.txt requirements/base.txt
-RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r requirements/base.txt
-COPY src/ src/
-COPY scripts/ scripts/
-COPY tests/ tests/
-COPY config.yaml config.yaml
-COPY README.md README.md
-COPY Docs/ Docs/
-ENV ENV=prod
-ENV PYTHONUNBUFFERED=1
-ENV PYTHONPATH=/app
-ENV OLLAMA_BASE_URL=http://host.docker.internal:11434
-ENV HF_HOME=/app/.cache/huggingface
-ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
-ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
-# Preload reranker model at build time to avoid runtime downloads.
-RUN python -c "from sentence_transformers import CrossEncoder; CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')"
-EXPOSE 8000
-EXPOSE 8501
-# Use non-root runtime user.
-RUN useradd -m appuser && mkdir -p /app/.cache/huggingface && chown -R appuser:appuser /app
-USER appuser
-HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
-  CMD curl -fsS http://127.0.0.1:8000/health || exit 1
-CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]

docker/Dockerfile ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../Dockerfile

docker/docker-compose.yml CHANGED Viewed

@@ -1,11 +1,16 @@
 services:
   api:
     build:
       context: ..
-      dockerfile: docker/Dockerfile
     container_name: doc_ingestion_api
     environment:
       - ENV=dev
       - DOC_API_KEYS=${DOC_API_KEYS:-change-me}
       - OPENAI_API_KEY=${OPENAI_API_KEY:-}
       - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
@@ -17,6 +22,7 @@ services:
       - SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
       - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
       - TRANSFORMERS_OFFLINE=${TRANSFORMERS_OFFLINE:-0}
     volumes:
       - ../data:/app/data
       - ../config.yaml:/app/config.yaml
@@ -27,33 +33,6 @@ services:
       - qdrant
       - redis
-  streamlit:
-    build:
-      context: ..
-      dockerfile: docker/Dockerfile
-    container_name: doc_ingestion_streamlit
-    command: ["streamlit", "run", "src/web/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
-    environment:
-      - DOC_INGEST_API_URL=http://api:8000
-      - DOC_API_KEY=${DOC_API_KEY:-change-me}
-      - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://host.docker.internal:11434}
-      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
-      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
-      - GEMINI_API_KEY=${GEMINI_API_KEY:-}
-      - HF_HOME=/app/.cache/huggingface
-      - TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
-      - SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
-      - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
-      - TRANSFORMERS_OFFLINE=${TRANSFORMERS_OFFLINE:-0}
-    volumes:
-      - ../data:/app/data
-      - ../config.yaml:/app/config.yaml
-      - hf_cache:/app/.cache/huggingface
-    ports:
-      - "8501:8501"
-    depends_on:
-      - api
   redis:
     image: redis:7-alpine
     container_name: doc_ingestion_redis

+# Full stack (React + FastAPI) in one container on :8000. Streamlit is not part of this stack;
+# run it locally only if needed: PYTHONPATH=. streamlit run src/web/streamlit_app.py
 services:
   api:
     build:
       context: ..
+      dockerfile: Dockerfile
     container_name: doc_ingestion_api
     environment:
       - ENV=dev
+      # React demo session + uploads (override for hardened deploys).
+      - DOC_PROFILE=${DOC_PROFILE:-demo}
+      - DOC_DEMO_UPLOADS=${DOC_DEMO_UPLOADS:-1}
       - DOC_API_KEYS=${DOC_API_KEYS:-change-me}
       - OPENAI_API_KEY=${OPENAI_API_KEY:-}
       - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
       - SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
       - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
       - TRANSFORMERS_OFFLINE=${TRANSFORMERS_OFFLINE:-0}
+      - PORT=8000
     volumes:
       - ../data:/app/data
       - ../config.yaml:/app/config.yaml
       - qdrant
       - redis
   redis:
     image: redis:7-alpine
     container_name: doc_ingestion_redis

frontend/.gitignore ADDED Viewed

	@@ -0,0 +1,26 @@

+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+node_modules
+dist
+dist-ssr
+test-results
+playwright-report
+*.local
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?

frontend/README.md ADDED Viewed

	@@ -0,0 +1,73 @@

+# React + TypeScript + Vite
+This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
+Currently, two official plugins are available:
+- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Oxc](https://oxc.rs)
+- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/)
+## React Compiler
+The React Compiler is not enabled in this template because of its impact on dev and build performance. To add it, see [this documentation](https://react.dev/learn/react-compiler/installation).
+## Expanding the ESLint configuration
+If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules:
+```js
+export default defineConfig([
+  globalIgnores(['dist']),
+  {
+    files: ['**/*.{ts,tsx}'],
+    extends: [
+      // Other configs...
+      // Remove tseslint.configs.recommended and replace with this
+      tseslint.configs.recommendedTypeChecked,
+      // Alternatively, use this for stricter rules
+      tseslint.configs.strictTypeChecked,
+      // Optionally, add this for stylistic rules
+      tseslint.configs.stylisticTypeChecked,
+      // Other configs...
+    ],
+    languageOptions: {
+      parserOptions: {
+        project: ['./tsconfig.node.json', './tsconfig.app.json'],
+        tsconfigRootDir: import.meta.dirname,
+      },
+      // other options...
+    },
+  },
+])
+```
+You can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules:
+```js
+// eslint.config.js
+import reactX from 'eslint-plugin-react-x'
+import reactDom from 'eslint-plugin-react-dom'
+export default defineConfig([
+  globalIgnores(['dist']),
+  {
+    files: ['**/*.{ts,tsx}'],
+    extends: [
+      // Other configs...
+      // Enable lint rules for React
+      reactX.configs['recommended-typescript'],
+      // Enable lint rules for React DOM
+      reactDom.configs.recommended,
+    ],
+    languageOptions: {
+      parserOptions: {
+        project: ['./tsconfig.node.json', './tsconfig.app.json'],
+        tsconfigRootDir: import.meta.dirname,
+      },
+      // other options...
+    },
+  },
+])
+```

frontend/components.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "$schema": "https://ui.shadcn.com/schema.json",
+  "style": "new-york",
+  "rsc": false,
+  "tsx": true,
+  "tailwind": {
+    "config": "",
+    "css": "src/index.css",
+    "baseColor": "slate",
+    "cssVariables": true
+  },
+  "aliases": {
+    "components": "@/components",
+    "utils": "@/lib/utils"
+  }
+}

frontend/e2e/fixtures/uploaded-doc.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # Uploaded Test Document
2	+
3	+ This uploaded document says hello from a private session.

frontend/e2e/react-mvp.spec.ts ADDED Viewed

	@@ -0,0 +1,80 @@

+import { test, expect, type Page } from '@playwright/test'
+/**
+ * Stubs the `/config/llm` endpoint so a spec can run without a live backend.
+ * The canned payload makes `ollama` the default provider and exposes one
+ * allowed model each for `ollama` and `openai`.
+ */
+async function mockLlmConfig(page: Page) {
+  const llmConfig = {
+    default_provider: 'ollama',
+    default_model_by_provider: {
+      ollama: 'qwen2.5:7b',
+      openai: 'gpt-4o-mini',
+    },
+    allowed_models_by_provider: {
+      ollama: ['qwen2.5:7b'],
+      openai: ['gpt-4o-mini'],
+    },
+  }
+  await page.route('http://127.0.0.1:8000/config/llm', (route) => route.fulfill({ json: llmConfig }))
+}
+// With a session that has no files, only the global scope may be selectable.
+test('no uploads keeps Mine and Both disabled', async ({ page }) => {
+  // Built per request so `expires_at` is always computed at fulfil time.
+  const emptySession = () => ({
+    session_id: 'abc123demo',
+    expires_at: Math.floor(Date.now() / 1000) + 1800,
+    files: [],
+    total_bytes: 0,
+    max_session_bytes: 8388608,
+    max_files: 3,
+  })
+  const sessionUrls = [
+    'http://127.0.0.1:8000/sessions',
+    'http://127.0.0.1:8000/sessions/abc123demo',
+  ]
+  await mockLlmConfig(page)
+  // The same empty summary answers both the collection URL and the specific session URL.
+  for (const url of sessionUrls) {
+    await page.route(url, (route) => route.fulfill({ json: emptySession() }))
+  }
+  await page.goto('/')
+  await page.getByRole('tab', { name: 'Query' }).click()
+  const mineOnly = page.getByRole('radio', { name: /my uploads only/i })
+  const both = page.getByRole('radio', { name: /both/i })
+  await expect(mineOnly).toBeDisabled()
+  await expect(both).toBeDisabled()
+})
+// Serves a canned event-stream body and checks that the token text is rendered.
+test('query streams an answer', async ({ page }) => {
+  const sessionSummary = () => ({
+    session_id: 'abc123demo',
+    expires_at: Math.floor(Date.now() / 1000) + 1800,
+    files: [],
+    total_bytes: 0,
+    max_session_bytes: 8388608,
+    max_files: 3,
+  })
+  // One token event, one final event, then the [DONE] sentinel.
+  const streamBody =
+    'data: {"type":"token","text":"Hello from stream"}\n\ndata: {"type":"final","citations":[],"provider":"ollama","model":"llama3"}\n\ndata: [DONE]\n\n'
+  await mockLlmConfig(page)
+  await page.route('http://127.0.0.1:8000/sessions', (route) =>
+    route.fulfill({ json: sessionSummary() }),
+  )
+  await page.route('http://127.0.0.1:8000/query/stream', (route) =>
+    route.fulfill({ contentType: 'text/event-stream', body: streamBody }),
+  )
+  await page.goto('/')
+  await page.getByRole('tab', { name: 'Query' }).click()
+  const questionBox = page.getByRole('textbox', { name: /question/i })
+  await questionBox.fill('What is RAG?')
+  await page.getByRole('button', { name: 'Run' }).click()
+  await expect(page.getByText('Hello from stream')).toBeVisible()
+})

frontend/eslint.config.js ADDED Viewed

	@@ -0,0 +1,22 @@

+import js from '@eslint/js'
+import globals from 'globals'
+import reactHooks from 'eslint-plugin-react-hooks'
+import reactRefresh from 'eslint-plugin-react-refresh'
+import tseslint from 'typescript-eslint'
+import { defineConfig, globalIgnores } from 'eslint/config'
+// Flat ESLint config for the frontend: `dist` is ignored globally, and every
+// .ts/.tsx file is linted with the recommended JS, typescript-eslint,
+// React Hooks and React Refresh (Vite) rule sets, with browser globals declared.
+export default defineConfig([
+  globalIgnores(['dist']),
+  {
+    files: ['**/*.{ts,tsx}'],
+    extends: [
+      js.configs.recommended,
+      tseslint.configs.recommended,
+      reactHooks.configs.flat.recommended,
+      reactRefresh.configs.vite,
+    ],
+    languageOptions: {
+      globals: globals.browser,
+    },
+  },
+])

frontend/index.html ADDED Viewed

	@@ -0,0 +1,21 @@

+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/favicon.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>frontend</title>
+    <style>
+      html,
+      body {
+        margin: 0;
+        min-height: 100%;
+        background: #f1f5f9;
+      }
+    </style>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>

frontend/package-lock.json ADDED Viewed

The diff for this file is too large to render. See raw diff

frontend/package.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "name": "frontend",
+  "private": true,
+  "version": "0.0.0",
+  "type": "module",
+  "scripts": {
+    "dev": "vite --host 127.0.0.1 --port 5173",
+    "build": "tsc -b && vite build",
+    "lint": "eslint .",
+    "preview": "vite preview --host 127.0.0.1 --port 4173",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "test:e2e": "playwright test",
+    "typecheck": "tsc --noEmit",
+    "gen:api": "openapi-typescript http://127.0.0.1:8000/openapi.json -o src/api/generated.ts"
+  },
+  "dependencies": {
+    "@radix-ui/react-progress": "^1.1.8",
+    "@radix-ui/react-radio-group": "^1.3.8",
+    "@radix-ui/react-slot": "^1.2.4",
+    "@radix-ui/react-tabs": "^1.1.13",
+    "@radix-ui/react-toast": "^1.2.15",
+    "@tanstack/react-query": "^5.100.8",
+    "class-variance-authority": "^0.7.1",
+    "clsx": "^2.1.1",
+    "lucide-react": "^1.14.0",
+    "react": "^18.3.1",
+    "react-dom": "^18.3.1",
+    "tailwind-merge": "^3.5.0",
+    "zustand": "^5.0.12"
+  },
+  "devDependencies": {
+    "@eslint/js": "^10.0.1",
+    "@playwright/test": "^1.59.1",
+    "@tailwindcss/vite": "^4.2.4",
+    "@testing-library/jest-dom": "^6.9.1",
+    "@testing-library/react": "^16.3.2",
+    "@testing-library/user-event": "^14.6.1",
+    "@types/node": "^24.12.2",
+    "@types/react": "^18.3.28",
+    "@types/react-dom": "^18.3.7",
+    "@vitejs/plugin-react": "^6.0.1",
+    "eslint": "^10.3.0",
+    "eslint-plugin-react-hooks": "^7.1.1",
+    "eslint-plugin-react-refresh": "^0.5.2",
+    "globals": "^17.5.0",
+    "jsdom": "^29.1.1",
+    "msw": "^2.14.2",
+    "openapi-typescript": "^7.13.0",
+    "typescript": "^5.9.3",
+    "typescript-eslint": "^8.59.1",
+    "vite": "^8.0.10",
+    "vitest": "^4.1.5"
+  }
+}

frontend/playwright.config.ts ADDED Viewed

	@@ -0,0 +1,20 @@

+import { defineConfig, devices } from '@playwright/test'
+// Playwright config for the specs in ./e2e, run against the Vite dev server in desktop Chrome.
+export default defineConfig({
+  testDir: './e2e',
+  // Launches `npm run dev` and expects it at 127.0.0.1:5173; an already-running
+  // server is reused unless the CI environment variable is set.
+  webServer: {
+    command: 'npm run dev',
+    url: 'http://127.0.0.1:5173',
+    reuseExistingServer: !process.env.CI,
+  },
+  use: {
+    // Lets specs navigate with relative paths such as page.goto('/').
+    baseURL: 'http://127.0.0.1:5173',
+    trace: 'on-first-retry',
+  },
+  projects: [
+    {
+      name: 'chromium',
+      use: { ...devices['Desktop Chrome'] },
+    },
+  ],
+})

frontend/public/favicon.svg ADDED Viewed

frontend/public/icons.svg ADDED Viewed

frontend/src/App.tsx ADDED Viewed

	@@ -0,0 +1,128 @@

+import * as Tabs from '@radix-ui/react-tabs'
+import { QueryClient, QueryClientProvider } from '@tanstack/react-query'
+import { AlertCircle, BookOpen, Database, FileText, Fingerprint } from 'lucide-react'
+import { useMemo } from 'react'
+import { QueryTab } from './tabs/QueryTab'
+import { OverviewTab } from './tabs/OverviewTab'
+import { DocumentsTab } from './tabs/DocumentsTab'
+import { SessionProvider } from './session/SessionProvider'
+import { useSession } from './session/SessionContext'
+import { formatTtl, shortSessionId } from './lib/format'
+/**
+ * Page chrome rendered inside the session provider.
+ *
+ * Reads session state from `useSession()` and renders:
+ * - a header card with the shortened session id, its TTL, and a button that calls
+ *   `clearSession()` (disabled and marked busy while `isLoading` is true);
+ * - a static notice about upload privacy;
+ * - an amber banner with a "Retry session" action whenever `error` is set;
+ * - the Overview / Query / My documents tabs, with Overview selected by default.
+ */
+function Shell() {
+  const { sessionId, expiresAt, error, retrySession, isMintingSession, isLoading, clearSession } =
+    useSession()
+  return (
+    <main className="min-h-screen bg-slate-100 px-4 py-6 md:px-8">
+      <div className="mx-auto max-w-6xl space-y-5">
+        <header className="app-card p-5">
+          <div className="flex flex-wrap items-start justify-between gap-4">
+            <div>
+              <p className="text-sm font-semibold uppercase tracking-wide text-blue-700">Doc Ingestion</p>
+              <h1 className="mt-1 text-3xl font-bold text-slate-950">Document Q&A Assistant</h1>
+              <p className="mt-2 max-w-3xl text-slate-600">
+                Ask citation-aware questions against the global demo corpus, your private uploads, or both.
+              </p>
+            </div>
+            <div className="flex min-w-[12rem] flex-col gap-2 rounded-xl bg-slate-50 px-4 py-3 text-sm text-slate-700">
+              <div>
+                <div>Session {isMintingSession ? 'creating…' : shortSessionId(sessionId)}</div>
+                <div className="text-slate-500">TTL {formatTtl(expiresAt)}</div>
+              </div>
+              <button
+                type="button"
+                className="inline-flex items-center justify-center gap-2 rounded-lg bg-blue-600 px-3 py-2 text-sm font-semibold text-white shadow hover:bg-blue-700 disabled:pointer-events-none disabled:opacity-50"
+                disabled={isLoading}
+                aria-busy={isLoading}
+                onClick={() => void clearSession()}
+              >
+                <Fingerprint className="h-4 w-4 shrink-0" aria-hidden="true" />
+                {isLoading ? 'Creating…' : sessionId ? 'New session ID' : 'Generate session ID'}
+              </button>
+              <p className="text-xs leading-snug text-slate-500">
+                Fresh ID for uploads in this browser. Replaces any current demo session (including uploads on
+                the server).
+              </p>
+            </div>
+          </div>
+          <div className="mt-4 rounded-xl border border-blue-100 bg-blue-50 p-4 text-sm text-blue-900">
+            Your uploads stay in this browser session, expire after inactivity, and are not added to the
+            shared corpus.
+          </div>
+          {error ? (
+            <div className="mt-4 flex flex-wrap items-center justify-between gap-3 rounded-xl border border-amber-200 bg-amber-50 p-4 text-sm text-amber-900">
+              <span className="inline-flex items-center gap-2">
+                <AlertCircle className="h-4 w-4" aria-hidden="true" />
+                {error.message}
+              </span>
+              <button type="button" className="font-semibold underline" onClick={() => void retrySession()}>
+                Retry session
+              </button>
+            </div>
+          ) : null}
+        </header>
+        <Tabs.Root defaultValue="overview" className="space-y-5">
+          <Tabs.List className="app-card inline-flex gap-2 p-2" aria-label="Main sections">
+            <Tabs.Trigger
+              value="overview"
+              className="inline-flex items-center gap-2 rounded-xl px-4 py-2 text-sm font-semibold text-slate-700 data-[state=active]:bg-blue-600 data-[state=active]:text-white"
+            >
+              <BookOpen className="h-4 w-4" aria-hidden="true" />
+              Overview
+            </Tabs.Trigger>
+            <Tabs.Trigger
+              value="query"
+              className="inline-flex items-center gap-2 rounded-xl px-4 py-2 text-sm font-semibold text-slate-700 data-[state=active]:bg-blue-600 data-[state=active]:text-white"
+            >
+              <Database className="h-4 w-4" aria-hidden="true" />
+              Query
+            </Tabs.Trigger>
+            <Tabs.Trigger
+              value="documents"
+              className="inline-flex items-center gap-2 rounded-xl px-4 py-2 text-sm font-semibold text-slate-700 data-[state=active]:bg-blue-600 data-[state=active]:text-white"
+            >
+              <FileText className="h-4 w-4" aria-hidden="true" />
+              My documents
+            </Tabs.Trigger>
+          </Tabs.List>
+          <Tabs.Content value="overview">
+            <OverviewTab />
+          </Tabs.Content>
+          <Tabs.Content value="query">
+            <QueryTab />
+          </Tabs.Content>
+          <Tabs.Content value="documents">
+            <DocumentsTab />
+          </Tabs.Content>
+        </Tabs.Root>
+      </div>
+    </main>
+  )
+}
+/**
+ * Application root: builds a React Query client (no refetch on window focus,
+ * no automatic retries) and wraps `Shell` in the query and session providers.
+ */
+function App() {
+  // NOTE(review): useMemo is a performance hint, not a guarantee that the value
+  // is kept; a lazy useState initializer would pin the client for the component's
+  // lifetime — consider switching.
+  const queryClient = useMemo(() => {
+    const queries = { refetchOnWindowFocus: false, retry: false }
+    return new QueryClient({ defaultOptions: { queries } })
+  }, [])
+  return (
+    <QueryClientProvider client={queryClient}>
+      <SessionProvider>
+        <Shell />
+      </SessionProvider>
+    </QueryClientProvider>
+  )
+}
+export default App

frontend/src/api/client.ts ADDED Viewed

	@@ -0,0 +1,182 @@

+import type { LlmConfigModel, QueryRequestModel, QueryResponseModel } from './generated'
+/**
+ * Chooses the base URL prepended to every API path.
+ *
+ * Resolution order:
+ * 1. A non-blank `VITE_API_BASE_URL`, trimmed and without one trailing slash.
+ * 2. Non-production builds: `http://127.0.0.1:8000` under Vitest, otherwise `''`
+ *    (same-origin, relying on the dev-server proxy).
+ * 3. Production builds: `''` when the page's port is 8000, empty, 80 or 443, or
+ *    when there is no `window`; otherwise port 8000 on the page's own host.
+ *
+ * @returns The base URL, or `''` for same-origin requests.
+ */
+function resolveApiBaseUrl(): string {
+  const configured = import.meta.env.VITE_API_BASE_URL
+  if (typeof configured === 'string' && configured.trim() !== '') {
+    return configured.trim().replace(/\/$/, '')
+  }
+  if (!import.meta.env.PROD) {
+    // Vitest + MSW register absolute handlers on http://127.0.0.1:8000;
+    // `npm run dev` stays same-origin and lets vite.config.ts proxy to FastAPI.
+    return import.meta.env.VITEST ? 'http://127.0.0.1:8000' : ''
+  }
+  // Production bundle evaluated without a browser: nothing to inspect.
+  if (typeof window === 'undefined') {
+    return ''
+  }
+  const { protocol, hostname, port } = window.location
+  // These ports mean the UI is (most likely) served by FastAPI itself or sits behind a proxy.
+  const sameOriginAsApi = ['8000', '', '80', '443'].includes(port)
+  if (sameOriginAsApi) {
+    return ''
+  }
+  const host = hostname || '127.0.0.1'
+  return `${protocol}//${host}:8000`.replace(/\/$/, '')
+}
+const API_BASE_URL = resolveApiBaseUrl()
+/** One entry in a session summary's `files` list. */
+export interface SessionFile {
+  name: string
+  size_bytes: number
+}
+/** Response type of `createSession()`. */
+export interface CreateSessionResponse {
+  session_id: string
+  // NOTE(review): presumably Unix epoch seconds (the e2e mocks build it that way) — confirm against the API.
+  expires_at: number
+}
+/** Response type of `deleteSession()`. */
+export interface DeleteSessionResponse {
+  deleted_session_id: string
+  // NOTE(review): looks like the id of a replacement session — confirm against the API.
+  session_id: string
+}
+/** Response type of `getSession()`: the session identity plus its files and quota figures. */
+export interface SessionSummary extends CreateSessionResponse {
+  files: SessionFile[]
+  total_bytes: number
+  max_session_bytes: number
+  max_files: number
+}
+/** Per-file outcome reported inside an upload response. */
+export interface UploadResult {
+  filename: string
+  // The named statuses are the expected values; `string` keeps the type open to others.
+  status: 'queued' | 'skipped' | 'rejected' | 'failed' | string
+  message: string
+}
+/** Response type of `uploadDocuments()`: a session summary plus one result per uploaded file. */
+export interface UploadDocumentsResponse extends SessionSummary {
+  results: UploadResult[]
+}
+/**
+ * Error thrown by the API client.
+ *
+ * `status` is the HTTP status of the failed response (`networkFailureError`
+ * uses 0 when no response was received); `detail` carries whatever payload or
+ * cause accompanied the failure.
+ */
+export class ApiError extends Error {
+  readonly status: number
+  readonly detail: unknown
+  constructor(message: string, status: number, detail: unknown) {
+    super(message)
+    this.status = status
+    this.detail = detail
+    // Set explicitly so logs show "ApiError" rather than the inherited "Error".
+    this.name = 'ApiError'
+  }
+}
+/**
+ * Builds the request URL for an API path, guaranteeing a leading slash.
+ * With an empty `API_BASE_URL` the result stays origin-relative.
+ */
+function apiUrl(path: string) {
+  const normalizedPath = path.startsWith('/') ? path : `/${path}`
+  if (!API_BASE_URL) {
+    return normalizedPath
+  }
+  return `${API_BASE_URL}${normalizedPath}`
+}
+/**
+ * Returns the API key stored under `doc-ingestion.api-key`, or `''` when none is set.
+ *
+ * Accessing `localStorage` can throw (e.g. a SecurityError when storage is
+ * blocked for the document, as can happen inside an embedded iframe). A missing
+ * key is a normal state for this client, so storage failures are treated the
+ * same way instead of failing every request.
+ */
+function readApiKey() {
+  try {
+    return localStorage.getItem('doc-ingestion.api-key') ?? ''
+  } catch {
+    // Storage unavailable: behave as if no key was saved.
+    return ''
+  }
+}
+/**
+ * Converts a non-OK `Response` into an `ApiError`.
+ *
+ * The body is read exactly once as text and then parsed as JSON. A response
+ * body is a one-shot stream, so falling back to `response.text()` after a
+ * failed `response.json()` would reject with a TypeError and hide the real
+ * HTTP failure from callers.
+ *
+ * @param response - The failed response; its body is consumed.
+ * @returns An `ApiError` whose message is the payload's `detail` field when
+ *   present (non-string values are JSON-encoded rather than stringified to
+ *   "[object Object]"), otherwise a generic status message. `detail` holds the
+ *   parsed JSON payload, or the raw text when the body is not JSON.
+ */
+async function parseError(response: Response) {
+  let rawBody = ''
+  try {
+    rawBody = await response.text()
+  } catch {
+    // Body unreadable (already consumed or stream error): report the status only.
+  }
+  let detail: unknown = rawBody
+  try {
+    detail = JSON.parse(rawBody)
+  } catch {
+    // Not JSON (e.g. an HTML error page from a proxy): keep the raw text as detail.
+  }
+  let message = `Request failed with status ${response.status}`
+  if (typeof detail === 'object' && detail !== null && 'detail' in detail) {
+    const inner = (detail as { detail: unknown }).detail
+    // JSON.stringify yields undefined for undefined input, hence the String() fallback.
+    message = typeof inner === 'string' ? inner : (JSON.stringify(inner) ?? String(inner))
+  }
+  return new ApiError(message, response.status, detail)
+}
+/**
+ * Builds the user-facing message shown when the API cannot be reached.
+ * Names the target (explicit base URL, else the page origin, else "the API")
+ * and gives dev-proxy advice when running the dev build with an empty base URL,
+ * or generic start-the-API advice otherwise.
+ */
+function networkErrorHint(): string {
+  let target: string
+  if (API_BASE_URL) {
+    target = API_BASE_URL
+  } else if (typeof window !== 'undefined') {
+    target = `${window.location.origin} (vite → API)`
+  } else {
+    target = 'the API'
+  }
+  const viaDevProxy =
+    API_BASE_URL === '' && typeof import.meta.env !== 'undefined' && import.meta.env.DEV
+  let connectivity: string
+  if (viaDevProxy) {
+    connectivity =
+      'Start uvicorn on the proxy target (default http://127.0.0.1:8000) while npm run dev is running, '
+      + 'set VITE_DEV_API_PROXY_TARGET if the API is elsewhere, '
+      + 'or set VITE_API_BASE_URL to bypass the proxy. '
+  } else {
+    connectivity = 'Start the API (e.g. uvicorn on port 8000), or set VITE_API_BASE_URL at build time. '
+  }
+  const sessionNote = `Session features need DOC_PROFILE=demo and DOC_DEMO_UPLOADS=1 on the server.`
+  return `Cannot reach ${target}. ${connectivity}` + sessionNote
+}
+/**
+ * Builds the error thrown when `fetch` fails before a response (offline, wrong host/port, CORS, etc.).
+ * The result has status 0 and keeps the original failure as `detail`.
+ */
+export function networkFailureError(cause?: unknown): ApiError {
+  const hint = networkErrorHint()
+  return new ApiError(hint, 0, cause)
+}
+/**
+ * Sends a request to the API and resolves with the JSON-decoded response body.
+ *
+ * Adds `Content-Type: application/json` unless the body is `FormData`, and an
+ * `X-API-Key` header when a key is stored.
+ *
+ * @param path - API path, resolved through `apiUrl`.
+ * @param init - Standard fetch options; its headers are copied and extended.
+ * @throws ApiError - with status 0 when `fetch` itself fails, or built from the
+ *   response when the status is not OK. 404s on `/sessions…` paths get an extra
+ *   hint about enabling demo sessions.
+ */
+async function requestJson<T>(path: string, init: RequestInit = {}): Promise<T> {
+  const apiKey = readApiKey()
+  const headers = new Headers(init.headers)
+  const sendsFormData = init.body instanceof FormData
+  if (!sendsFormData) {
+    headers.set('Content-Type', 'application/json')
+  }
+  if (apiKey) {
+    headers.set('X-API-Key', apiKey)
+  }
+  let response: Response
+  try {
+    response = await fetch(apiUrl(path), { ...init, headers })
+  } catch (cause) {
+    throw networkFailureError(cause)
+  }
+  if (response.ok) {
+    return response.json() as Promise<T>
+  }
+  const failure = await parseError(response)
+  const sessionRouteMissing = response.status === 404 && path.startsWith('/sessions')
+  if (sessionRouteMissing) {
+    failure.message = `${failure.message} If the API is up, enable demo sessions: DOC_PROFILE=demo and DOC_DEMO_UPLOADS=1.`
+  }
+  throw failure
+}
+/** Creates a new session via `POST /sessions`. */
+export function createSession() {
+  return requestJson<CreateSessionResponse>('/sessions', { method: 'POST' })
+}
+/**
+ * Fetches the summary of one session.
+ * The id is percent-encoded so an unexpected value cannot alter the request path.
+ */
+export function getSession(sessionId: string) {
+  return requestJson<SessionSummary>(`/sessions/${encodeURIComponent(sessionId)}`)
+}
+/** Deletes one session via `DELETE /sessions/{id}` (id percent-encoded). */
+export function deleteSession(sessionId: string) {
+  return requestJson<DeleteSessionResponse>(`/sessions/${encodeURIComponent(sessionId)}`, {
+    method: 'DELETE',
+  })
+}
+/**
+ * Uploads files into a session as multipart form data, one `files` field per file.
+ * The id is percent-encoded before being placed in the path.
+ */
+export function uploadDocuments(sessionId: string, files: File[]) {
+  const formData = new FormData()
+  for (const file of files) {
+    formData.append('files', file)
+  }
+  return requestJson<UploadDocumentsResponse>(`/sessions/${encodeURIComponent(sessionId)}/documents`, {
+    method: 'POST',
+    body: formData,
+  })
+}
+/** Runs a query via `POST /query` with the request serialized as JSON. */
+export function queryDocuments(request: QueryRequestModel) {
+  return requestJson<QueryResponseModel>('/query', {
+    method: 'POST',
+    body: JSON.stringify(request),
+  })
+}
+/** Fetches the LLM provider/model configuration from `/config/llm`. */
+export function fetchLlmConfig() {
+  return requestJson<LlmConfigModel>('/config/llm')
+}
+export { API_BASE_URL }

frontend/src/api/generated.ts ADDED Viewed

	@@ -0,0 +1,71 @@

+export type KnowledgeScope = 'global' | 'session' | 'both'
+export interface QueryRequestModel {
+  query: string
+  top_k?: number
+  use_llm?: boolean
+  use_rerank?: boolean
+  stream?: boolean
+  include_citations?: boolean
+  provider?: string | null
+  model?: string | null
+  reranker_model?: string | null
+  provider_api_key?: string | null
+  session_id?: string | null
+  knowledge_scope?: KnowledgeScope
+}
+export interface CitationModel {
+  raw_id: string
+  chunk_id: string
+  resolved: boolean
+  title?: string | null
+  source?: string | null
+  verification_score: number
+  verification: string
+}
+export interface RetrievedChunkModel {
+  id: string
+  score: number
+  source: string
+  confidence: number
+  metadata: Record<string, unknown>
+  preview: string
+}
+export interface TruthfulnessModel {
+  nli_faithfulness: number
+  citation_groundedness: number
+  uncited_claims: number
+  score: number
+}
+export interface QueryResponseModel {
+  query: string
+  provider: string
+  model: string
+  answer: string
+  processing_time_ms: number
+  cached: boolean
+  validation_issues: string[]
+  citations: CitationModel[]
+  retrieved: RetrievedChunkModel[]
+  truthfulness?: TruthfulnessModel | null
+}
+export interface HealthModel {
+  status: string
+  collection: string
+}
+export interface MetricsModel {
+  cache_ttl_seconds: number
+  available_providers: string[]
+}
+export interface LlmConfigModel {
+  default_provider: string
+  default_model_by_provider: Record<string, string>
+  allowed_models_by_provider: Record<string, string[]>
+}

frontend/src/assets/hero.png ADDED Viewed

frontend/src/assets/react.svg ADDED Viewed

frontend/src/assets/vite.svg ADDED Viewed

frontend/src/components/AnswerPanel.tsx ADDED Viewed

	@@ -0,0 +1,35 @@

+import type { QueryResponseModel } from '../api/generated'
+/**
+ * Card showing the current answer text, with a placeholder while empty.
+ * When a full response is available it also shows the truthfulness score badge
+ * (if provided) and a footer with provider/model, rounded latency and a cache marker.
+ */
+export function AnswerPanel({
+  answer,
+  response,
+  isLoading,
+}: {
+  answer: string
+  response: QueryResponseModel | null
+  isLoading: boolean
+}) {
+  const truthfulness = response?.truthfulness
+  const placeholder = isLoading ? 'Waiting for tokens...' : 'Ask a question to see a grounded answer.'
+  const truthfulnessBadge = truthfulness ? (
+    <span className="rounded-full bg-emerald-50 px-3 py-1 text-sm font-medium text-emerald-700">
+      Truthfulness {truthfulness.score.toFixed(2)}
+    </span>
+  ) : null
+  const responseMeta = response ? (
+    <div className="mt-3 flex flex-wrap gap-3 text-sm text-slate-600">
+      <span>{response.provider} / {response.model}</span>
+      <span>{Math.round(response.processing_time_ms)} ms</span>
+      {response.cached ? <span>Cached</span> : null}
+    </div>
+  ) : null
+  return (
+    <section className="app-card p-5" aria-live="polite">
+      <div className="mb-3 flex flex-wrap items-center justify-between gap-2">
+        <h2 className="text-lg font-semibold text-slate-950">Answer</h2>
+        {truthfulnessBadge}
+      </div>
+      <div className="min-h-28 whitespace-pre-wrap rounded-xl bg-slate-50 p-4 text-left text-slate-800">
+        {answer || placeholder}
+      </div>
+      {responseMeta}
+    </section>
+  )
+}

frontend/src/components/CitationsList.tsx ADDED Viewed

	@@ -0,0 +1,41 @@

+import type { SessionFile } from '../api/client'
+import type { CitationModel } from '../api/generated'
+import { citationLabel } from '../lib/citationProvenance'
+/**
+ * Card listing the citations of an answer, or a placeholder when there are none.
+ * Each row shows a provenance tag (`yours` when `citationLabel` says so, otherwise
+ * `global`), a display name (title, then source, then chunk id) and the
+ * verification verdict with its score to two decimals.
+ */
+export function CitationsList({
+  citations,
+  sessionFiles,
+}: {
+  citations: CitationModel[]
+  sessionFiles: SessionFile[]
+}) {
+  const renderCitation = (citation: CitationModel) => {
+    const provenance = citationLabel(citation, sessionFiles) === 'yours' ? 'yours' : 'global'
+    const displayName = citation.title || citation.source || citation.chunk_id
+    return (
+      <li key={`${citation.raw_id}-${citation.chunk_id}`} className="rounded-xl bg-slate-50 p-3">
+        <div className="flex flex-wrap items-center gap-2">
+          <span className="rounded-full bg-slate-900 px-2 py-1 text-xs font-semibold text-white">
+            [{provenance}]
+          </span>
+          <span className="font-medium text-slate-900">
+            {displayName}
+          </span>
+        </div>
+        <p className="mt-1 text-sm text-slate-600">
+          {citation.verification} · score {citation.verification_score.toFixed(2)}
+        </p>
+      </li>
+    )
+  }
+  const body =
+    citations.length === 0 ? (
+      <p className="text-sm text-slate-600">No citations returned yet.</p>
+    ) : (
+      <ul className="space-y-3">{citations.map((citation) => renderCitation(citation))}</ul>
+    )
+  return (
+    <section className="app-card p-5">
+      <h2 className="mb-3 text-lg font-semibold text-slate-950">Citations</h2>
+      {body}
+    </section>
+  )
+}

frontend/src/components/RetrievedChunks.tsx ADDED Viewed

	@@ -0,0 +1,26 @@

+import type { RetrievedChunkModel } from '../api/generated'
+/**
+ * Collapsible card listing the retrieved chunks: the summary shows the count,
+ * and each row shows the chunk id, its score to three decimals and its preview text.
+ */
+export function RetrievedChunks({ chunks }: { chunks: RetrievedChunkModel[] }) {
+  const hasChunks = chunks.length !== 0
+  const rows = chunks.map((chunk) => (
+    <li key={chunk.id} className="rounded-xl bg-slate-50 p-3 text-left">
+      <div className="flex flex-wrap justify-between gap-2 text-sm">
+        <span className="font-medium text-slate-900">{chunk.id}</span>
+        <span className="text-slate-600">score {chunk.score.toFixed(3)}</span>
+      </div>
+      <p className="mt-2 text-sm text-slate-700">{chunk.preview}</p>
+    </li>
+  ))
+  return (
+    <details className="app-card p-5">
+      <summary className="cursor-pointer text-lg font-semibold text-slate-950">
+        Retrieved chunks ({chunks.length})
+      </summary>
+      {hasChunks ? (
+        <ul className="mt-4 space-y-3">{rows}</ul>
+      ) : (
+        <p className="mt-3 text-sm text-slate-600">No retrieved chunks returned yet.</p>
+      )}
+    </details>
+  )
+}

frontend/src/components/SamplePromptChips.tsx ADDED Viewed

	@@ -0,0 +1,26 @@

+// Canned questions offered as one-click starters.
+const samplePrompts = [
+  'What is retrieval augmented generation?',
+  'How does hybrid retrieval improve document search?',
+  'Explain BM25 vs vector search.',
+  'What makes citations useful in a RAG system?',
+]
+/** Row of chip buttons; clicking one passes its prompt text to `onSelect`. */
+export function SamplePromptChips({ onSelect }: { onSelect: (prompt: string) => void }) {
+  const chips = samplePrompts.map((text) => (
+    <button
+      key={text}
+      type="button"
+      className="rounded-full border border-slate-200 bg-white px-3 py-2 text-sm text-slate-700 shadow-sm hover:border-blue-300 hover:text-blue-700"
+      onClick={() => onSelect(text)}
+    >
+      {text}
+    </button>
+  ))
+  return (
+    <div>
+      <p className="mb-2 text-sm font-medium text-slate-700">Try a sample</p>
+      <div className="flex flex-wrap gap-2">{chips}</div>
+    </div>
+  )
+}

frontend/src/components/ScopeToggle.test.tsx ADDED Viewed

	@@ -0,0 +1,18 @@

+import { render, screen } from '@testing-library/react'
+import userEvent from '@testing-library/user-event'
+import { ScopeToggle } from './ScopeToggle'
+// Unit tests for the scope radio group: gating on uploads, and change reporting.
+describe('ScopeToggle', () => {
+  it('disables Mine and Both until uploads exist', () => {
+    render(<ScopeToggle value="global" onChange={vi.fn()} hasUploads={false} />)
+    const mineOnly = screen.getByRole('radio', { name: /my uploads only/i })
+    const both = screen.getByRole('radio', { name: /both/i })
+    expect(mineOnly).toBeDisabled()
+    expect(both).toBeDisabled()
+  })
+  it('enables session scopes after upload', async () => {
+    const handleChange = vi.fn()
+    render(<ScopeToggle value="global" onChange={handleChange} hasUploads />)
+    const mineOnly = screen.getByRole('radio', { name: /my uploads only/i })
+    await userEvent.click(mineOnly)
+    expect(handleChange).toHaveBeenCalledWith('session')
+  })
+})

frontend/src/components/ScopeToggle.tsx ADDED Viewed

	@@ -0,0 +1,57 @@

+import * as RadioGroup from '@radix-ui/react-radio-group'
+import type { KnowledgeScope } from '../api/generated'
+import { cn } from '../lib/utils'
+// The three selectable knowledge scopes, in display order.
+const scopeOptions: Array<{ value: KnowledgeScope; label: string; helper: string }> = [
+  { value: 'global', label: 'Global sample corpus', helper: 'Use the preloaded public demo documents.' },
+  { value: 'session', label: 'My uploads only', helper: 'Ask only against documents in this browser session.' },
+  { value: 'both', label: 'Both', helper: 'Blend sample documents with your uploaded files.' },
+]
+/**
+ * Radio group for choosing the knowledge scope of a query.
+ * Every scope except `global` is disabled until `hasUploads` is true, and a
+ * disabled option swaps its helper text for an upload prompt.
+ */
+export function ScopeToggle({
+  value,
+  onChange,
+  hasUploads,
+}: {
+  value: KnowledgeScope
+  onChange: (value: KnowledgeScope) => void
+  hasUploads: boolean
+}) {
+  const renderOption = (option: (typeof scopeOptions)[number]) => {
+    const isSelected = value === option.value
+    const disabled = !hasUploads && option.value !== 'global'
+    const helperText = disabled ? 'Upload a document to enable this scope.' : option.helper
+    return (
+      <RadioGroup.Item
+        key={option.value}
+        value={option.value}
+        disabled={disabled}
+        className={cn(
+          'rounded-xl border p-4 text-left transition',
+          isSelected ? 'border-blue-500 bg-blue-50' : 'border-slate-200 bg-white',
+          disabled && 'cursor-not-allowed opacity-50',
+        )}
+      >
+        <div className="flex items-center gap-3">
+          <span
+            className={cn(
+              'h-4 w-4 rounded-full border',
+              isSelected ? 'border-blue-600 bg-blue-600' : 'border-slate-400',
+            )}
+          />
+          <span className="font-medium text-slate-900">{option.label}</span>
+        </div>
+        <p className="mt-2 text-sm text-slate-600">
+          {helperText}
+        </p>
+      </RadioGroup.Item>
+    )
+  }
+  return (
+    <RadioGroup.Root
+      className="grid gap-3 md:grid-cols-3"
+      value={value}
+      onValueChange={(next) => onChange(next as KnowledgeScope)}
+      aria-label="Knowledge scope"
+    >
+      {scopeOptions.map((option) => renderOption(option))}
+    </RadioGroup.Root>
+  )
+}

frontend/src/components/Uploader.test.tsx ADDED Viewed

	@@ -0,0 +1,33 @@

+import { render, screen } from '@testing-library/react'
+import userEvent from '@testing-library/user-event'
+import { QueryClient, QueryClientProvider } from '@tanstack/react-query'
+import { Uploader } from './Uploader'
+// Verifies the client-side guard that refuses uploads once the session is full.
+describe('Uploader', () => {
+  it('shows client-side file count cap messaging', async () => {
+    const queryClient = new QueryClient({ defaultOptions: { queries: { retry: false } } })
+    // The session already holds `max_files` documents, so no further file may be added.
+    const existingFiles = ['a.md', 'b.md', 'c.md'].map((name) => ({ name, size_bytes: 1 }))
+    const fullSession = {
+      session_id: 'abc123demo',
+      expires_at: 1,
+      files: existingFiles,
+      total_bytes: 3,
+      max_files: 3,
+      max_session_bytes: 100,
+    }
+    render(
+      <QueryClientProvider client={queryClient}>
+        <Uploader sessionId="abc123demo" onUploaded={vi.fn()} summary={fullSession} />
+      </QueryClientProvider>,
+    )
+    // The file input is visually hidden, so it is located by selector rather than by role.
+    const fileInput = document.querySelector('input[type="file"]') as HTMLInputElement
+    const extraFile = new File(['hello'], 'd.md', { type: 'text/markdown' })
+    await userEvent.upload(fileInput, extraFile)
+    expect(screen.getByText(/upload 0 more file/i)).toBeInTheDocument()
+  })
+})

frontend/src/components/Uploader.tsx ADDED Viewed

	@@ -0,0 +1,108 @@

+import { useRef, useState } from 'react'
+import { useMutation } from '@tanstack/react-query'
+import { Upload } from 'lucide-react'
+import { uploadDocuments, type SessionSummary, type UploadResult } from '../api/client'
+import { formatBytes } from '../lib/format'
+// File extensions offered by the picker through the file input's `accept` attribute.
+const ACCEPTED = '.pdf,.docx,.txt,.md,.html'
+// Client-side per-file size ceiling (3 * 1024 * 1024 bytes); larger files are rejected
+// before any upload request is made.
+const MAX_FILE_BYTES = 3 * 1024 * 1024
+// User-facing copy for the upload result codes handled by this component. A Map is
+// used instead of an object literal so that codes such as "constructor" or "toString"
+// cannot resolve to inherited Object.prototype members.
+const RESULT_MESSAGES = new Map<string, string>([
+  ['queued', 'Uploaded and indexed.'],
+  ['skipped', 'Duplicate upload skipped.'],
+  ['oversize', 'File exceeds the 3 MB limit.'],
+  ['file_count_cap', 'Session file count cap reached.'],
+  ['session_disk_cap', 'Session disk cap reached.'],
+  ['type_mismatch', 'File contents do not match the extension.'],
+])
+/**
+ * Maps one upload result to the text shown in the results list.
+ *
+ * The lookup tries `result.status` first, then `result.message` as a code; when
+ * neither is a known code, `result.message` is returned unchanged.
+ */
+function resultMessage(result: UploadResult) {
+  return RESULT_MESSAGES.get(result.status) ?? RESULT_MESSAGES.get(result.message) ?? result.message
+}
+/**
+ * Drag-and-drop / file-picker panel that uploads documents into the current session.
+ *
+ * Selections are validated locally (remaining file slots, per-file size) before the
+ * upload mutation runs; per-file outcomes returned by the API are listed underneath.
+ *
+ * @param sessionId - Session the files are uploaded into.
+ * @param summary - Current session summary; uploads are disabled while it is undefined.
+ * @param onUploaded - Awaited after a successful upload (e.g. to refresh session data).
+ */
+export function Uploader({
+  sessionId,
+  summary,
+  onUploaded,
+}: {
+  sessionId: string
+  summary: SessionSummary | undefined
+  onUploaded: () => Promise<unknown>
+}) {
+  const inputRef = useRef<HTMLInputElement>(null)
+  const [message, setMessage] = useState('')
+  const [results, setResults] = useState<UploadResult[]>([])
+  const mutation = useMutation({
+    mutationFn: (files: File[]) => uploadDocuments(sessionId, files),
+    onSuccess: async (response) => {
+      setResults(response.results)
+      setMessage('Upload finished.')
+      await onUploaded()
+    },
+    onError: (error) => {
+      setMessage(error instanceof Error ? error.message : 'Upload failed.')
+    },
+  })
+  const upload = (fileList: FileList | File[]) => {
+    // Limits are unknown until the summary has loaded, and a second batch must not
+    // start while one is in flight: the button is disabled then, but a drop is not.
+    if (!summary || mutation.isPending) {
+      return
+    }
+    const files = Array.from(fileList)
+    // Dropping text or other non-file content yields an empty list; do not send an empty upload.
+    if (files.length === 0) {
+      return
+    }
+    const maxFiles = summary.max_files ?? 3
+    const currentFiles = summary.files.length
+    if (currentFiles + files.length > maxFiles) {
+      setMessage(`You can upload ${Math.max(0, maxFiles - currentFiles)} more file(s).`)
+      return
+    }
+    const oversized = files.find((file) => file.size > MAX_FILE_BYTES)
+    if (oversized) {
+      setMessage(`${oversized.name} is larger than ${formatBytes(MAX_FILE_BYTES)}.`)
+      return
+    }
+    mutation.mutate(files)
+  }
+  return (
+    <div>
+      <div
+        className="rounded-2xl border-2 border-dashed border-slate-300 bg-slate-50 p-8 text-center"
+        onDragOver={(event) => event.preventDefault()}
+        onDrop={(event) => {
+          event.preventDefault()
+          upload(event.dataTransfer.files)
+        }}
+      >
+        <Upload className="mx-auto mb-3 h-8 w-8 text-blue-600" aria-hidden="true" />
+        <p className="font-medium text-slate-900">Drop files here or choose files</p>
+        <p className="mt-1 text-sm text-slate-600">PDF, DOCX, TXT, Markdown, or HTML.</p>
+        <input
+          ref={inputRef}
+          type="file"
+          accept={ACCEPTED}
+          multiple
+          className="sr-only"
+          onChange={(event) => {
+            // Copy the selection first: clearing the input also empties its live FileList.
+            const picked = event.target.files ? Array.from(event.target.files) : []
+            // Reset the input so choosing the same file again (for example after a
+            // rejected attempt) still fires a change event.
+            event.target.value = ''
+            upload(picked)
+          }}
+        />
+        <button
+          type="button"
+          className="mt-4 rounded-lg bg-blue-600 px-4 py-2 text-sm font-semibold text-white hover:bg-blue-700 disabled:opacity-50"
+          disabled={mutation.isPending || !summary}
+          onClick={() => inputRef.current?.click()}
+        >
+          {mutation.isPending ? 'Uploading...' : 'Choose files'}
+        </button>
+      </div>
+      {message ? <p className="mt-3 text-sm text-slate-700" aria-live="polite">{message}</p> : null}
+      {results.length > 0 ? (
+        <ul className="mt-3 space-y-2">
+          {results.map((result, index) => (
+            // The index keeps keys unique when two results share a filename and status.
+            <li key={`${result.filename}-${result.status}-${index}`} className="rounded-lg bg-slate-50 p-3 text-sm">
+              <span className="font-medium text-slate-900">{result.filename}</span>: {resultMessage(result)}
+            </li>
+          ))}
+        </ul>
+      ) : null}
+    </div>
+  )
+}

frontend/src/index.css ADDED Viewed

	@@ -0,0 +1,42 @@

+/* Global stylesheet: pulls in Tailwind, then sets page-wide defaults and a few shared helper classes. */
+@import "tailwindcss";
+/* Document-level text colour, background, font stack and font rendering hints. */
+:root {
+  color: #172033;
+  background: #f5f7fb;
+  font-family:
+    Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI",
+    sans-serif;
+  font-synthesis: none;
+  text-rendering: optimizeLegibility;
+  -webkit-font-smoothing: antialiased;
+  -moz-osx-font-smoothing: grayscale;
+}
+/* Remove the default body margin and keep the page at least viewport-tall and 320px wide. */
+body {
+  margin: 0;
+  min-width: 320px;
+  min-height: 100vh;
+}
+/* Form controls inherit the page font settings. */
+button,
+input,
+textarea {
+  font: inherit;
+}
+/* One shared focus ring for form controls and for elements with tab/radio roles. */
+button:focus-visible,
+input:focus-visible,
+textarea:focus-visible,
+[role="tab"]:focus-visible,
+[role="radio"]:focus-visible {
+  outline: 3px solid #93c5fd;
+  outline-offset: 2px;
+}
+/* Shared card surface: rounded, bordered, white, with a light shadow. */
+.app-card {
+  @apply rounded-2xl border border-slate-200 bg-white shadow-sm;
+}
+/* Small, subdued secondary text. */
+.muted {
+  @apply text-sm text-slate-600;
+}

frontend/src/lib/citationProvenance.test.ts ADDED Viewed

	@@ -0,0 +1,16 @@

+import { citationLabel } from './citationProvenance'
+// Provenance labelling: a citation is "yours" when it points at a file uploaded in this session.
+describe('citationLabel', () => {
+  it('labels citations matching uploaded files as yours', () => {
+    const citation = {
+      title: 'uploaded-doc.md',
+      source: '/tmp/doc-ingest-sessions/abc/uploads/uploaded-doc.md',
+    }
+    const sessionFiles = [{ name: 'uploaded-doc.md', size_bytes: 12 }]
+    const label = citationLabel(citation, sessionFiles)
+    expect(label).toBe('yours')
+  })
+  it('labels unmatched citations as global', () => {
+    const citation = { title: 'README.md', source: 'data/documents/README.md' }
+    const label = citationLabel(citation, [])
+    expect(label).toBe('global')
+  })
+})

frontend/src/lib/citationProvenance.ts ADDED Viewed

	@@ -0,0 +1,12 @@

+import type { SessionFile } from '../api/client'
+import type { CitationModel } from '../api/generated'
+/** Where a cited chunk came from: the shared sample corpus or the user's own uploads. */
+export type CitationProvenance = 'global' | 'yours'
+// Characters that may directly precede a file name inside a path or a title.
+const NAME_BOUNDARY = /[\s/\\]/
+/**
+ * Reports whether `needle` occurs in `haystack` at the very start or directly after
+ * whitespace or a path separator, so that "doc.md" does not match inside "my-doc.md".
+ */
+function includesAtBoundary(haystack: string, needle: string): boolean {
+  let at = haystack.indexOf(needle)
+  while (at !== -1) {
+    if (at === 0 || NAME_BOUNDARY.test(haystack.charAt(at - 1))) {
+      return true
+    }
+    at = haystack.indexOf(needle, at + 1)
+  }
+  return false
+}
+/**
+ * Classifies a citation as coming from the user's session uploads or the global corpus.
+ *
+ * The citation's `source` and `title` are searched case-insensitively for each session
+ * file name. A name only counts when it starts at a path or word boundary, and files
+ * with an empty name are ignored (an empty string would otherwise match every citation).
+ *
+ * @param citation - The `source` and `title` fields of a citation.
+ * @param sessionFiles - Files uploaded in the current session.
+ * @returns `'yours'` if any session file name is found, otherwise `'global'`.
+ */
+export function citationLabel(
+  citation: Pick<CitationModel, 'source' | 'title'>,
+  sessionFiles: SessionFile[],
+): CitationProvenance {
+  // The joining space doubles as a boundary in front of the title.
+  const searchable = `${citation.source ?? ''} ${citation.title ?? ''}`.toLowerCase()
+  const matchesUpload = sessionFiles.some((file) => {
+    const name = file.name.toLowerCase()
+    return name.length > 0 && includesAtBoundary(searchable, name)
+  })
+  return matchesUpload ? 'yours' : 'global'
+}

frontend/src/lib/format.ts ADDED Viewed

	@@ -0,0 +1,22 @@

+/**
+ * Formats a byte count as a short human-readable string using 1024-based units.
+ *
+ * Plain bytes are shown without decimals and larger units with one decimal
+ * (for example `3145728` becomes `"3.0 MB"`). Non-finite, zero and negative inputs
+ * yield `"0 B"`. Values between 0 and 1 are reported in bytes, and a value that would
+ * round up to 1024 of a unit is promoted to the next unit.
+ *
+ * @param bytes - Size in bytes.
+ * @returns The formatted size, capped at the GB unit.
+ */
+export function formatBytes(bytes: number) {
+  if (!Number.isFinite(bytes) || bytes <= 0) {
+    return '0 B'
+  }
+  const units = ['B', 'KB', 'MB', 'GB']
+  const lastIndex = units.length - 1
+  let value = bytes
+  let index = 0
+  // Repeated division avoids the floating-point log ratio, which can land just under
+  // an exact power and goes negative for inputs below one byte.
+  while (value >= 1024 && index < lastIndex) {
+    value /= 1024
+    index += 1
+  }
+  // Rounding for display can reach 1024 (e.g. 1023.97 KB); promote to the next unit.
+  if (index < lastIndex && Number(value.toFixed(index === 0 ? 0 : 1)) >= 1024) {
+    value /= 1024
+    index += 1
+  }
+  return `${value.toFixed(index === 0 ? 0 : 1)} ${units[index]}`
+}
+/**
+ * Formats the time remaining until `expiresAt` as `m:ss`.
+ *
+ * @param expiresAt - Expiry as a Unix timestamp in seconds; fractional values are
+ *   truncated to whole seconds.
+ * @returns `'unknown'` when the timestamp is missing, zero or not finite; otherwise the
+ *   remaining minutes and zero-padded seconds, clamped at `0:00` once expired.
+ */
+export function formatTtl(expiresAt: number | null) {
+  if (!expiresAt || !Number.isFinite(expiresAt)) {
+    return 'unknown'
+  }
+  // Floor both sides so a float timestamp cannot leak decimals into the seconds field.
+  const nowSeconds = Math.floor(Date.now() / 1000)
+  const seconds = Math.max(0, Math.floor(expiresAt) - nowSeconds)
+  const minutes = Math.floor(seconds / 60)
+  const remainder = seconds % 60
+  return `${minutes}:${remainder.toString().padStart(2, '0')}`
+}
+/**
+ * Abbreviates a session id for display: `...` followed by its last five characters,
+ * or `'pending'` while no id is available.
+ */
+export function shortSessionId(sessionId: string | null) {
+  if (!sessionId) {
+    return 'pending'
+  }
+  const tail = sessionId.slice(-5)
+  return `...${tail}`
+}

frontend/src/lib/streamQuery.test.ts ADDED Viewed

	@@ -0,0 +1,21 @@

+import { testInternals } from './streamQuery'
+// Exercises the SSE `data:` line parser exposed through `testInternals`.
+describe('streamQuery parsing', () => {
+  it('parses token and final events', () => {
+    // One token event followed by a final event, as they would appear on the wire.
+    const wire =
+      'data: {"type":"token","text":"Hi"}\n\ndata: {"type":"final","citations":[],"retrieved":[],"truthfulness":null,"provider":"ollama","model":"llama3"}'
+    const tokenEvent = { type: 'token', text: 'Hi' }
+    const finalEvent = {
+      type: 'final',
+      citations: [],
+      retrieved: [],
+      truthfulness: null,
+      provider: 'ollama',
+      model: 'llama3',
+    }
+    const parsed = testInternals.parseSseFrame(wire)
+    expect(parsed).toEqual([tokenEvent, finalEvent])
+  })
+})

frontend/src/lib/streamQuery.ts ADDED Viewed

	@@ -0,0 +1,96 @@

+import { API_BASE_URL, ApiError, networkFailureError } from '../api/client'
+import type { CitationModel, QueryRequestModel, RetrievedChunkModel, TruthfulnessModel } from '../api/generated'
+/**
+ * One event decoded from a `data:` line of the query stream, discriminated by `type`:
+ * `token` carries a piece of answer text, `final` carries the citations and
+ * provider/model metadata, and `error` carries a failure message.
+ */
+export type StreamEvent =
+  | { type: 'token'; text: string }
+  | {
+      type: 'final'
+      citations: CitationModel[]
+      retrieved?: RetrievedChunkModel[]
+      truthfulness?: TruthfulnessModel | null
+      provider: string
+      model: string
+    }
+  | { type: 'error'; message: string }
+/** Handlers that `streamQuery` invokes as events arrive. */
+export interface StreamQueryCallbacks {
+  /** Called with the text of each `token` event. */
+  onToken: (text: string) => void
+  /** Called with each `final` event. */
+  onFinal: (event: Extract<StreamEvent, { type: 'final' }>) => void
+  /** Optional; called with the message of an `error` event before `streamQuery` throws. */
+  onError?: (message: string) => void
+}
+/**
+ * Extracts the events carried by the `data:` lines of a chunk of SSE text.
+ *
+ * Each `data:` line is parsed as its own JSON document; other lines, empty payloads
+ * and the `[DONE]` sentinel are ignored. Malformed JSON makes `JSON.parse` throw.
+ */
+function parseSseFrame(frame: string): StreamEvent[] {
+  const events: StreamEvent[] = []
+  for (const line of frame.split('\n')) {
+    if (!line.startsWith('data:')) {
+      continue
+    }
+    // Drop the 5-character "data:" prefix and any surrounding whitespace.
+    const payload = line.slice(5).trim()
+    if (payload === '' || payload === '[DONE]') {
+      continue
+    }
+    events.push(JSON.parse(payload) as StreamEvent)
+  }
+  return events
+}
+/**
+ * Builds a readable error message from a failed stream response.
+ *
+ * Uses the JSON body's `detail` field when it is present and truthy. A string is
+ * returned as-is. A list (for example a validation error list) is flattened to its
+ * items' `msg` texts, and any other value is JSON-encoded rather than coerced to
+ * "[object Object]". Falls back to a status-code message when the body is not JSON
+ * or has no usable `detail`.
+ */
+async function parseError(response: Response) {
+  const fallback = `Stream failed with status ${response.status}`
+  try {
+    const body: unknown = await response.json()
+    const detail =
+      typeof body === 'object' && body !== null ? (body as { detail?: unknown }).detail : undefined
+    if (!detail) {
+      return fallback
+    }
+    if (typeof detail === 'string') {
+      return detail
+    }
+    if (Array.isArray(detail)) {
+      return detail
+        .map((item: unknown) => {
+          if (typeof item === 'string') {
+            return item
+          }
+          const msg =
+            typeof item === 'object' && item !== null ? (item as { msg?: unknown }).msg : undefined
+          return typeof msg === 'string' ? msg : JSON.stringify(item)
+        })
+        .join('; ')
+    }
+    return JSON.stringify(detail)
+  } catch {
+    return fallback
+  }
+}
+/**
+ * POSTs `request` to the `/query/stream` endpoint and dispatches the streamed events
+ * to `callbacks` as they arrive.
+ *
+ * Frames are separated by a blank line (LF or CRLF). A complete final frame that
+ * arrives without a trailing blank line is still delivered. The stream reader is
+ * cancelled whenever the function exits, so an early failure does not leave the
+ * response body open.
+ *
+ * @param request - Query payload; it is sent with `stream: true`.
+ * @param callbacks - Handlers for `token`, `final` and (optionally) `error` events.
+ * @throws ApiError when the response is not OK, has no body, or the stream carries an
+ *   `error` event (after `callbacks.onError` has been called).
+ * @throws The value produced by `networkFailureError` when the request itself fails.
+ * @throws SyntaxError when a complete frame contains malformed JSON.
+ */
+export async function streamQuery(request: QueryRequestModel, callbacks: StreamQueryCallbacks) {
+  const apiKey = localStorage.getItem('doc-ingestion.api-key')
+  const headers = new Headers({ 'Content-Type': 'application/json' })
+  if (apiKey) {
+    headers.set('X-API-Key', apiKey)
+  }
+  const streamPath =
+    API_BASE_URL && API_BASE_URL.length > 0 ? `${API_BASE_URL}/query/stream` : '/query/stream'
+  let response: Response
+  try {
+    response = await fetch(streamPath, {
+      method: 'POST',
+      headers,
+      body: JSON.stringify({ ...request, stream: true }),
+    })
+  } catch (cause) {
+    throw networkFailureError(cause)
+  }
+  if (!response.ok) {
+    throw new ApiError(await parseError(response), response.status, null)
+  }
+  if (!response.body) {
+    throw new ApiError('Streaming is not supported by this browser.', response.status, null)
+  }
+  const status = response.status
+  const dispatch = (event: StreamEvent) => {
+    if (event.type === 'token') {
+      callbacks.onToken(event.text)
+    } else if (event.type === 'final') {
+      callbacks.onFinal(event)
+    } else if (event.type === 'error') {
+      callbacks.onError?.(event.message)
+      throw new ApiError(event.message, status, event)
+    }
+  }
+  const reader = response.body.getReader()
+  const decoder = new TextDecoder()
+  let buffer = ''
+  try {
+    while (true) {
+      const { value, done } = await reader.read()
+      buffer += decoder.decode(value, { stream: !done })
+      // A blank line ends a frame; accept CRLF as well as LF line endings.
+      const frames = buffer.split(/\r?\n\r?\n/)
+      // The last piece may be an incomplete frame; keep it for the next chunk.
+      buffer = frames.pop() ?? ''
+      for (const frame of frames) {
+        parseSseFrame(frame).forEach(dispatch)
+      }
+      if (done) {
+        if (buffer.trim()) {
+          // The server may close without a trailing blank line; deliver the last
+          // frame if it is complete. A truncated one is ignored, as it was before.
+          let tail: StreamEvent[] = []
+          try {
+            tail = parseSseFrame(buffer)
+          } catch {
+            tail = []
+          }
+          tail.forEach(dispatch)
+        }
+        break
+      }
+    }
+  } finally {
+    // Release the connection on early exit; harmless once the stream has finished.
+    void reader.cancel().catch(() => undefined)
+  }
+}
+// Exposes the module-private SSE parser so unit tests can call it directly.
+export const testInternals = { parseSseFrame }

frontend/src/lib/utils.ts ADDED Viewed

	@@ -0,0 +1,6 @@

+import { clsx, type ClassValue } from 'clsx'
+import { twMerge } from 'tailwind-merge'
+/**
+ * Builds a single class string: `clsx` joins the inputs (skipping falsy values) and
+ * `twMerge` then merges the resulting Tailwind classes.
+ */
+export function cn(...inputs: ClassValue[]) {
+  const joined = clsx(inputs)
+  return twMerge(joined)
+}

frontend/src/main.tsx ADDED Viewed

	@@ -0,0 +1,10 @@

+import { StrictMode } from 'react'
+import { createRoot } from 'react-dom/client'
+import './index.css'
+import App from './App.tsx'
+// Application entry point: mount <App /> into the element with id "root", wrapped in StrictMode.
+const container = document.getElementById('root')!
+const root = createRoot(container)
+root.render(
+  <StrictMode>
+    <App />
+  </StrictMode>,
+)