Vamshi Pokala Cursor committed on
Commit
2af1cca
·
1 Parent(s): 131d7bb

feat: React UI, Docker Space, sessions, Ollama toggle on HF

Browse files

- Add root Dockerfile and frontend build served from FastAPI static
- Session corpus API, demo uploads, integration tests
- Disable Ollama when SPACE_ID or DOC_OLLAMA_ENABLED=0 (config.py)
- Observability/metrics, CI and dependency updates
- README/spaces README: document SPACE_ID vs local deploy

Co-authored-by: Cursor <cursoragent@cursor.com>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .github/workflows/ci.yml +61 -1
  2. Dockerfile +59 -0
  3. Docs/Phase5-Monitoring-Observability.md +1764 -0
  4. Docs/Phase6-Iterative-Execution-Index.md +29 -0
  5. Docs/Phase6-RefactorDemo_React.md +445 -0
  6. Docs/Phase6.1-Backend-Session-Isolation-Plan.md +125 -0
  7. Docs/Phase6.2-React-MVP-Plan.md +83 -0
  8. Docs/Phase6.3-Container-Cutover-Implementation-Spec.md +302 -0
  9. Docs/Phase6.3-Container-Cutover-Plan.md +62 -0
  10. Docs/Phase6.4-Streamlit-Decommission-Implementation-Spec.md +384 -0
  11. Docs/Phase6.4-Streamlit-Decommission-Plan.md +38 -0
  12. Docs/phase5_observability.md +412 -0
  13. README.md +33 -6
  14. docker/Dockerfile +0 -44
  15. docker/Dockerfile +1 -0
  16. docker/docker-compose.yml +7 -28
  17. frontend/.gitignore +26 -0
  18. frontend/README.md +73 -0
  19. frontend/components.json +16 -0
  20. frontend/e2e/fixtures/uploaded-doc.md +3 -0
  21. frontend/e2e/react-mvp.spec.ts +80 -0
  22. frontend/eslint.config.js +22 -0
  23. frontend/index.html +21 -0
  24. frontend/package-lock.json +0 -0
  25. frontend/package.json +55 -0
  26. frontend/playwright.config.ts +20 -0
  27. frontend/public/favicon.svg +1 -0
  28. frontend/public/icons.svg +24 -0
  29. frontend/src/App.tsx +128 -0
  30. frontend/src/api/client.ts +182 -0
  31. frontend/src/api/generated.ts +71 -0
  32. frontend/src/assets/hero.png +0 -0
  33. frontend/src/assets/react.svg +1 -0
  34. frontend/src/assets/vite.svg +1 -0
  35. frontend/src/components/AnswerPanel.tsx +35 -0
  36. frontend/src/components/CitationsList.tsx +41 -0
  37. frontend/src/components/RetrievedChunks.tsx +26 -0
  38. frontend/src/components/SamplePromptChips.tsx +26 -0
  39. frontend/src/components/ScopeToggle.test.tsx +18 -0
  40. frontend/src/components/ScopeToggle.tsx +57 -0
  41. frontend/src/components/Uploader.test.tsx +33 -0
  42. frontend/src/components/Uploader.tsx +108 -0
  43. frontend/src/index.css +42 -0
  44. frontend/src/lib/citationProvenance.test.ts +16 -0
  45. frontend/src/lib/citationProvenance.ts +12 -0
  46. frontend/src/lib/format.ts +22 -0
  47. frontend/src/lib/streamQuery.test.ts +21 -0
  48. frontend/src/lib/streamQuery.ts +96 -0
  49. frontend/src/lib/utils.ts +6 -0
  50. frontend/src/main.tsx +10 -0
.github/workflows/ci.yml CHANGED
@@ -78,6 +78,46 @@ jobs:
78
  - name: Run integration tests
79
  run: pytest tests/integration/ -v
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  evals-smoke:
82
  name: Eval smoke test (mock pipeline)
83
  runs-on: ubuntu-latest
@@ -137,10 +177,30 @@ jobs:
137
  --dataset evals/datasets/golden_ci.jsonl \
138
  --judge-provider anthropic \
139
  --judge-model claude-haiku-4-5 \
140
- --output evals/reports/ \
141
  --faithfulness-threshold 0.7 \
142
  --correctness-threshold 0.2
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  - name: Upload golden eval report
145
  if: ${{ always() && env.ANTHROPIC_API_KEY != '' }}
146
  uses: actions/upload-artifact@v4
 
78
  - name: Run integration tests
79
  run: pytest tests/integration/ -v
80
 
81
+ frontend:
82
+ name: Frontend checks
83
+ runs-on: ubuntu-latest
84
+ steps:
85
+ - uses: actions/checkout@v4
86
+
87
+ - uses: actions/setup-node@v4
88
+ with:
89
+ node-version: "20"
90
+ cache: npm
91
+ cache-dependency-path: frontend/package-lock.json
92
+
93
+ - name: Install frontend dependencies
94
+ run: npm ci
95
+ working-directory: frontend
96
+
97
+ - name: Lint frontend
98
+ run: npm run lint
99
+ working-directory: frontend
100
+
101
+ - name: Typecheck frontend
102
+ run: npm run typecheck
103
+ working-directory: frontend
104
+
105
+ - name: Test frontend
106
+ run: npm run test
107
+ working-directory: frontend
108
+
109
+ - name: Build frontend
110
+ run: npm run build
111
+ working-directory: frontend
112
+
113
+ - name: Install Playwright browsers
114
+ run: npx playwright install --with-deps chromium
115
+ working-directory: frontend
116
+
117
+ - name: Run Playwright smoke tests
118
+ run: npm run test:e2e
119
+ working-directory: frontend
120
+
121
  evals-smoke:
122
  name: Eval smoke test (mock pipeline)
123
  runs-on: ubuntu-latest
 
177
  --dataset evals/datasets/golden_ci.jsonl \
178
  --judge-provider anthropic \
179
  --judge-model claude-haiku-4-5 \
180
+ --output evals/reports/pr-current.json \
181
  --faithfulness-threshold 0.7 \
182
  --correctness-threshold 0.2
183
 
184
+ - name: Compare against baseline (regression gate)
185
+ if: ${{ env.ANTHROPIC_API_KEY != '' && hashFiles('evals/reports/baseline.json') != '' }}
186
+ run: |
187
+ python scripts/compare_evals.py \
188
+ --baseline evals/reports/baseline.json \
189
+ --current evals/reports/pr-current.json \
190
+ --threshold 5.0
191
+
192
+ - name: Comment on regression failure
193
+ if: failure() && env.ANTHROPIC_API_KEY != ''
194
+ uses: actions/github-script@v6
195
+ with:
196
+ script: |
197
+ github.rest.issues.createComment({
198
+ issue_number: context.issue.number,
199
+ owner: context.repo.owner,
200
+ repo: context.repo.repo,
201
+ body: "⚠️ **Regression Detected**\n\nEval metrics degraded vs baseline. See `evals/reports/pr-current.json` artifact for details.\n\nTo update the baseline (intentional improvement), run:\n```bash\ngit checkout main\nPYTHONPATH=. python -m evals.run_evals \\\n --dataset evals/datasets/golden_ci.jsonl \\\n --judge-provider anthropic \\\n --judge-model claude-haiku-4-5 \\\n --output evals/reports/baseline.json\ngit add evals/reports/baseline.json\ngit commit -m 'chore: update Phase 5 eval baseline'\n```"
202
+ })
203
+
204
  - name: Upload golden eval report
205
  if: ${{ always() && env.ANTHROPIC_API_KEY != '' }}
206
  uses: actions/upload-artifact@v4
Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Single-process image: FastAPI serves the React SPA from /app/static and API routes under the same port.
2
+ # Hugging Face Docker Spaces expects a Dockerfile at the repository root; local builds use the same file:
3
+ # docker build -t doc-ingest .
4
+ # Compose: docker/docker-compose.yml (build context is repo root).
5
+
6
+ FROM node:20-bookworm-slim AS frontend-builder
7
+ WORKDIR /frontend
8
+ COPY frontend/package.json frontend/package-lock.json ./
9
+ RUN npm ci
10
+ COPY frontend/ ./
11
+ RUN npm run build
12
+
13
+ FROM python:3.11-slim
14
+
15
+ WORKDIR /app
16
+
17
+ # Install system deps needed by python-magic and runtime health checks.
18
+ RUN apt-get update && apt-get install -y --no-install-recommends \
19
+ libmagic1 \
20
+ curl \
21
+ ca-certificates \
22
+ && rm -rf /var/lib/apt/lists/*
23
+
24
+ COPY requirements/base.txt requirements/base.txt
25
+ RUN pip install --no-cache-dir --upgrade pip && \
26
+ pip install --no-cache-dir -r requirements/base.txt
27
+
28
+ COPY --from=frontend-builder /frontend/dist /app/static
29
+
30
+ COPY src/ src/
31
+ COPY scripts/ scripts/
32
+ COPY tests/ tests/
33
+ COPY config.yaml config.yaml
34
+ COPY README.md README.md
35
+ COPY Docs/ Docs/
36
+
37
+ ENV ENV=prod
38
+ ENV PYTHONUNBUFFERED=1
39
+ ENV PYTHONPATH=/app
40
+ ENV PORT=8000
41
+ ENV OLLAMA_BASE_URL=http://host.docker.internal:11434
42
+ ENV HF_HOME=/app/.cache/huggingface
43
+ ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
44
+ ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
45
+
46
+ # Preload reranker model at build time to avoid runtime downloads.
47
+ RUN python -c "from sentence_transformers import CrossEncoder; CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')"
48
+
49
+ EXPOSE 8000
50
+
51
+ # HF Spaces runs the container as UID 1000; match that to avoid permission issues.
52
+ RUN useradd -m -u 1000 appuser && mkdir -p /app/.cache/huggingface && chown -R appuser:appuser /app
53
+ USER appuser
54
+
55
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
56
+ CMD sh -c 'curl -fsS "http://127.0.0.1:${PORT:-8000}/health" || exit 1'
57
+
58
+ # PORT is honored for Hugging Face (app_port / runtime) and other platforms.
59
+ CMD ["sh", "-c", "exec uvicorn src.api.main:app --host 0.0.0.0 --port \"${PORT:-8000}\" --workers 1"]
Docs/Phase5-Monitoring-Observability.md ADDED
@@ -0,0 +1,1764 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 5: Production Monitoring & Observability
2
+
3
+ **Project:** doc-ingestion (RAG System)
4
+ **Status:** Planning
5
+ **Timeline:** 3 weeks
6
+ **Owner:** Vamshi Pokala
7
+ **Goal:** Instrument RAG pipeline with production-grade observability, regression gating, and operational dashboards
8
+
9
+ ---
10
+
11
+ ## Executive Summary
12
+
13
+ Transform doc-ingestion from a feature-complete RAG system into a **production-hardened platform** by adding:
14
+
15
+ 1. **Distributed tracing** (LangFuse) across every step: ingestion → retrieval → reranking → generation → citations
16
+ 2. **Latency profiling** (P50/P95 per component) to identify bottlenecks
17
+ 3. **Cost tracking** (USD per request) for capacity planning
18
+ 4. **Quality regression gating** (GitHub Actions CI/CD) to prevent accuracy degradation
19
+ 5. **Observable metrics dashboard** for real-time operational visibility
20
+ 6. **Citation accuracy & citation coverage** monitoring
21
+
22
+ **Why this matters for your job search:**
23
+ - Differentiates you as "production architect" not "demo builder"
24
+ - Directly maps to Principal/Director interview questions: "How do you know your AI system is healthy?"
25
+ - Concrete story for Vertex (latency budgeting), Elevation Capital (risk reduction), Marriott-like enterprise roles
26
+
27
+ ---
28
+
29
+ ## Current State Analysis
30
+
31
+ ### Existing Strengths
32
+ ```
33
+ ✅ Multi-format ingestion (PDF, DOCX, TXT, MD, HTML)
34
+ ✅ Hybrid retrieval (BM25 + vector search with RRF)
35
+ ✅ Cross-encoder reranking
36
+ ✅ Citation tracking & verification
37
+ ✅ Truthfulness scoring (NLI faithfulness)
38
+ ✅ Multi-provider LLM routing (Ollama, OpenAI, Anthropic, Gemini)
39
+ ✅ FastAPI + Streamlit UI
40
+ ✅ Offline evaluation harness (golden datasets, RAGAS)
41
+ ✅ Docker Compose stack
42
+ ✅ Rate limiting (Redis-backed)
43
+ ✅ MetricsCollector in src/utils/log.py (in-memory count/mean/min/max per operation)
44
+ ✅ Structured JSON audit logging in main.py (_audit_log with latency_ms, provider, model)
45
+ ✅ processing_time_ms and cached flag already returned in QueryResponse
46
+ ✅ evals-golden CI job already runs golden_ci.jsonl on every PR
47
+ ```
48
+
49
+ ### Gaps for Production Observability
50
+ ```
51
+ ❌ No distributed tracing (can't see latency breakdown by step)
52
+ ❌ No real-time metrics dashboard
53
+ ❌ No cost tracking (USD per request)
54
+ ❌ No regression gating comparing baseline vs PR metrics in CI/CD
55
+ ❌ No P50/P95 latency tracking (existing MetricsCollector only tracks mean/min/max)
56
+ ❌ No citation accuracy trends over time
57
+ ❌ /metrics endpoint returns config metadata, not operational metrics
58
+ ❌ No replay/debug mode for failed queries
59
+ ```
60
+
61
+ ### Critical Design Constraints (address before coding)
62
+
63
+ These issues will cause bugs or structural debt if not addressed upfront:
64
+
65
+ 1. **LangFuse span hierarchy**: `self.client.trace()` creates a top-level trace. Calling it once per pipeline step produces 5 disconnected traces per request. The correct pattern is one `trace = client.trace()` per request, then `span = trace.span()` for each step. Instrument at `RAGOrchestrator.run()`, not in `main.py`.
66
+
67
+ 2. **`observer.flush()` must not block the HTTP response**: LangFuse flush makes a network call. Calling it synchronously before returning adds latency to every request. Offload it with `loop.run_in_executor(None, observer.flush)` (the returned future does not need to be awaited; do not wrap it in `asyncio.create_task`, which only accepts coroutines) or use a background thread.
68
+
69
+ 3. **Instrument at `RAGOrchestrator`, not `main.py`**: The pipeline runs inside `RAGOrchestrator.run()`. Instrumenting in `main.py` via inline `observer.trace_retrieval(fn)(args)` patterns: (a) misses the cache-hit early return, (b) misses CLI and Streamlit code paths, (c) creates a new wrapper closure on every HTTP request. The observer should be injected into or used directly within `RAGOrchestrator.run()`.
70
+
71
+ 4. **MRR and NDCG are offline-only metrics**: They require ground-truth relevance labels per query. You cannot compute them in production. Remove `mrr` and `ndcg` from `RequestMetrics`; they belong only in the eval harness.
72
+
73
+ 5. **Don't create a separate regression gate workflow**: `ci.yml` already has `evals-golden` running `golden_ci.jsonl`. Add a comparison step to that existing job rather than duplicating it. Also: the dataset is `golden_ci.jsonl`, not `golden.jsonl`.
74
+
75
+ 6. **`src/monitoring/metrics.py` should extend, not duplicate `src/utils/log.py`**: The existing `MetricsCollector` in `log.py` tracks mean/min/max. The new one adds percentiles and per-request records. Consolidate: either replace the log.py one or have the new one call through to it. Don't ship two `MetricsCollector` classes.
76
+
77
+ 7. **`requirements/api.txt` does not exist**: The project has `requirements/base.txt` and `requirements/eval.txt`. Add `langfuse>=2.0.0` to `requirements/base.txt`.
78
+
79
+ ---
80
+
81
+ ## Architecture: Before → After
82
+
83
+ ### Before (Current)
84
+ ```
85
+ ┌─────────────────────────────────────────────────────────┐
86
+ │ Streamlit UI / FastAPI │
87
+ └──────────────────────┬──────────────────────────────────┘
88
+
89
+ ┌──────────────┼──────────────┐
90
+ │ │ │
91
+ ┌────▼────┐ ┌────▼────┐ ┌────▼────┐
92
+ │Retrieval│ │Reranking│ │Generation
93
+ │ (BM25+ │ │(Cross- │ │(Ollama/ │
94
+ │ Vector) │ │ Encoder)│ │ OpenAI) │
95
+ └─────────┘ └─────────┘ └────┬────┘
96
+
97
+ ┌────────▼────────┐
98
+ │Citations & │
99
+ │Truthfulness │
100
+ └─────────────────┘
101
+
102
+ ❌ No observability layer
103
+ ❌ Latency is a black box
104
+ ❌ Can't track cost
105
+ ❌ No regression detection
106
+ ```
107
+
108
+ ### After (Phase 5)
109
+ ```
110
+ ┌──────────────────────────────────────────────────────────────────┐
111
+ │ LangFuse Tracing Layer │
112
+ │ (Distributed tracing, step-by-step instrumentation) │
113
+ └────────────────────────────┬─────────────────────────────────────┘
114
+
115
+ ┌────────────────────────────▼─────────────────────────────────────┐
116
+ │ Streamlit UI / FastAPI │
117
+ └──────────────────────┬──────────────────────────────────────────┘
118
+
119
+ ┌──────────────┼──────────────────────┐
120
+ │ │ │
121
+ ┌────▼────┐ ┌────▼────┐ ┌────▼────┐
122
+ │Retrieval│ │Reranking│ │Generation
123
+ │ (BM25+ │ │(Cross- │ │(Ollama/ │
124
+ │ Vector) │ │ Encoder)│ │ OpenAI) │
125
+ └────┬────┘ └────┬────┘ └────┬────┘
126
+ │ │ │
127
+ [TRACE] [TRACE] [TRACE]
128
+ - Latency - Latency - Latency
129
+ - Chunks - Ranked - Tokens
130
+ - Scores - Duration - Cost
131
+ │ │ │
132
+ └─────────────┼─────────────┘
133
+
134
+ ┌─────────▼──────────┐
135
+ │ Citations & │
136
+ │ Truthfulness │
137
+ │ [TRACE] Cost │
138
+ └─────────┬──────────┘
139
+
140
+ ┌─────────────┴──────────────────┐
141
+ │ │
142
+ ┌────▼─────┐ ┌───────────▼────────┐
143
+ │Observ. │ │ GitHub Actions │
144
+ │Dashboard │ │ Regression Gating │
145
+ │(Metrics) │ │ (CI/CD) │
146
+ └──────────┘ └────────────────────┘
147
+
148
+ ✅ End-to-end tracing
149
+ ✅ Real-time latency visibility
150
+ ✅ Cost per request tracked
151
+ ✅ Automated regression detection
152
+ ✅ Observable metrics at /observability/dashboard
153
+ ```
154
+
155
+ ---
156
+
157
+ ## Detailed Phase Breakdown
158
+
159
+ ### Phase 5.1: LangFuse Instrumentation (Week 1)
160
+
161
+ **Goal:** Add distributed tracing to every RAG pipeline step
162
+
163
+ #### Step 1.1: Create Observability Module
164
+ **File:** `src/core/observability.py` (NEW)
165
+
166
+ ```python
167
+ """
168
+ Observability layer for RAG pipeline instrumentation.
169
+ Provides decorators and context managers for LangFuse tracing.
170
+ """
171
+
172
+ import os
173
+ import time
174
+ import json
175
+ from functools import wraps
176
+ from typing import Any, Callable, Dict, Optional, List
177
+ from contextlib import contextmanager
178
+
179
+ from langfuse import Langfuse
180
+ from langfuse.decorators import observe
181
+ import logging
182
+
183
+ logger = logging.getLogger(__name__)
184
+
185
+
186
+ class RAGObserver:
187
+ """
188
+ Centralized observer for RAG pipeline.
189
+ Manages LangFuse client and provides tracing context managers.
190
+
191
+ Usage pattern — instrument inside RAGOrchestrator.run(), not in main.py:
192
+ with observer.trace_request("rag_query", query=query_text) as trace:
193
+ with observer.trace_step(trace, "retrieval") as span:
194
+ result = hybrid_retriever.retrieve(...)
195
+ span["output"] = {"chunks": len(result)}
196
+ """
197
+
198
+ def __init__(self, enabled: bool = True, public_key: str = None, secret_key: str = None):
199
+ """
200
+ Args:
201
+ enabled: If False, all tracing is no-op (demo mode, tests)
202
+ public_key: LangFuse public key (defaults to LANGFUSE_PUBLIC_KEY env var)
203
+ secret_key: LangFuse secret key (defaults to LANGFUSE_SECRET_KEY env var)
204
+ """
205
+ self.enabled = enabled
206
+ self.client = None
207
+
208
+ if self.enabled:
209
+ try:
210
+ public_key = public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
211
+ secret_key = secret_key or os.getenv("LANGFUSE_SECRET_KEY")
212
+
213
+ if public_key and secret_key:
214
+ self.client = Langfuse(
215
+ public_key=public_key,
216
+ secret_key=secret_key,
217
+ host=os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com"),
218
+ )
219
+ logger.info("LangFuse observability enabled")
220
+ else:
221
+ self.enabled = False
222
+ logger.warning("LangFuse keys not found; observability disabled")
223
+ except Exception as e:
224
+ logger.error(f"Failed to initialize LangFuse: {e}; observability disabled")
225
+ self.enabled = False
226
+
227
+ @contextmanager
228
+ def trace_request(
229
+ self,
230
+ name: str,
231
+ query: str = "",
232
+ metadata: Optional[Dict[str, Any]] = None,
233
+ ):
234
+ """
235
+ Context manager for a top-level request trace.
236
+ One trace per query request — child spans live inside this.
237
+
238
+ IMPORTANT: This is the top-level trace object. Use trace.span() for
239
+ individual pipeline steps. Never call client.trace() per step — that
240
+ creates disconnected traces in the LangFuse UI.
241
+
242
+ Usage:
243
+ with observer.trace_request("rag_query", query=query_text) as trace:
244
+ with observer.trace_step(trace, "retrieval") as span:
245
+ chunks = retriever.retrieve(query)
246
+ span["chunks_retrieved"] = len(chunks)
247
+ """
248
+ if not self.enabled or not self.client:
249
+ yield None
250
+ return
251
+
252
+ trace = self.client.trace(
253
+ name=name,
254
+ input={"query": query},
255
+ metadata=metadata or {},
256
+ )
257
+ start = time.time()
258
+ try:
259
+ yield trace
260
+ except Exception as e:
261
+ trace.update(
262
+ output={"error": str(e)},
263
+ metadata={**(metadata or {}), "total_ms": (time.time() - start) * 1000},
264
+ )
265
+ raise
266
+ finally:
267
+ trace.update(
268
+ metadata={**(metadata or {}), "total_ms": round((time.time() - start) * 1000, 2)},
269
+ )
270
+
271
+ @contextmanager
272
+ def trace_step(
273
+ self,
274
+ trace,
275
+ step_name: str,
276
+ input_data: Optional[Dict[str, Any]] = None,
277
+ ):
278
+ """
279
+ Context manager for a child span within a request trace.
280
+ Attach to the trace returned by trace_request().
281
+
282
+ Args:
283
+ trace: The top-level trace object from trace_request()
284
+ step_name: Name of the pipeline step (e.g. "retrieval", "generation")
285
+ input_data: Optional input metadata for this step
286
+ """
287
+ if not self.enabled or trace is None:
288
+ yield {}
289
+ return
290
+
291
+ span = trace.span(name=step_name, input=input_data or {})
292
+ output: Dict[str, Any] = {}
293
+ start = time.time()
294
+ try:
295
+ yield output
296
+ except Exception as e:
297
+ span.end(
298
+ output={"error": str(e)},
299
+ metadata={"latency_ms": round((time.time() - start) * 1000, 2)},
300
+ )
301
+ raise
302
+ finally:
303
+ output["latency_ms"] = round((time.time() - start) * 1000, 2)
304
+ span.end(output=output)
305
+
306
+ def flush_async(self) -> None:
307
+ """
308
+ Flush pending traces to LangFuse in a background thread.
309
+ Call this after the HTTP response is sent — never block the hot path.
310
+
311
+ In FastAPI, use a BackgroundTask:
312
+ from fastapi import BackgroundTasks
313
+ background_tasks.add_task(observer.flush_async)
314
+ """
315
+ if not self.client:
316
+ return
317
+ import threading
318
+ threading.Thread(target=self.client.flush, daemon=True).start()
319
+
320
+ def flush(self) -> None:
321
+ """Synchronous flush — only use in shutdown/test contexts, not request handlers."""
322
+ if self.client:
323
+ self.client.flush()
324
+
325
+
326
+ # Global observer instance
327
+ _observer_instance: Optional[RAGObserver] = None
328
+
329
+
330
+ def get_observer() -> RAGObserver:
331
+ """Singleton getter for RAGObserver."""
332
+ global _observer_instance
333
+ if _observer_instance is None:
334
+ enabled = os.getenv("DOC_PROFILE") != "demo"
335
+ _observer_instance = RAGObserver(enabled=enabled)
336
+ return _observer_instance
337
+
338
+
339
+ def init_observer(enabled: bool = True) -> RAGObserver:
340
+ """Initialize the observer (useful for testing)."""
341
+ global _observer_instance
342
+ _observer_instance = RAGObserver(enabled=enabled)
343
+ return _observer_instance
344
+ ```
345
+
346
+ **Testing:** `tests/unit/test_observability.py` (NEW)
347
+
348
+ ```python
349
+ """Unit tests for observability module."""
350
+
351
+ import pytest
352
+ from src.core.observability import RAGObserver, init_observer, get_observer
353
+
354
+
355
+ def test_observer_disabled_noop_on_trace_request():
356
+ """Verify trace_request is a no-op when disabled — yields None."""
357
+ observer = RAGObserver(enabled=False)
358
+
359
+ with observer.trace_request("rag_query", query="test") as trace:
360
+ assert trace is None # no-op when disabled
361
+
362
+
363
+ def test_observer_disabled_noop_on_trace_step():
364
+ """Verify trace_step yields empty dict when trace is None (disabled path)."""
365
+ observer = RAGObserver(enabled=False)
366
+
367
+ with observer.trace_step(None, "retrieval", {"query": "x"}) as output:
368
+ output["chunks"] = 3 # should not raise
369
+ assert output["chunks"] == 3 # returned value preserved even when disabled
370
+
371
+
372
+ def test_trace_step_records_latency():
373
+ """Verify trace_step always populates latency_ms in the output dict."""
374
+ observer = RAGObserver(enabled=False)
375
+
376
+ with observer.trace_step(None, "generation") as output:
377
+ output["provider"] = "anthropic"
378
+
379
+ assert "latency_ms" in output
380
+ assert output["latency_ms"] >= 0
381
+ assert output["provider"] == "anthropic"
382
+
383
+
384
+ def test_nested_trace_and_step_no_exception():
385
+ """Verify trace_request + trace_step nesting works without LangFuse keys."""
386
+ observer = RAGObserver(enabled=False)
387
+
388
+ with observer.trace_request("rag_query", query="hello") as trace:
389
+ with observer.trace_step(trace, "retrieval") as s:
390
+ s["chunks_retrieved"] = 5
391
+ with observer.trace_step(trace, "generation") as s:
392
+ s["provider"] = "ollama"
393
+ # No exception = pass
394
+ ```
395
+
396
+ ---
397
+
398
+ #### Step 1.2: Instrument RAGOrchestrator (correct instrumentation point)
399
+ **File:** `src/core/rag_orchestrator.py` (MODIFY existing)
400
+
401
+ > **Why here, not `main.py`?** `RAGOrchestrator.run()` is called by the API, CLI, and Streamlit — instrumenting here captures all paths. It also correctly observes the cache-hit early return (which main.py wrapping skips entirely). Never create wrapper closures per-call inside the request handler — that's a new function object on every request and misses the orchestrator's internal structure.
402
+
403
+ **Changes:**
404
+ ```python
405
+ # In RAGOrchestrator.__init__, add observer:
406
+ from src.core.observability import get_observer
407
+
408
+ class RAGOrchestrator:
409
+ def __init__(self, cfg: Config) -> None:
410
+ # ... existing init ...
411
+ self.observer = get_observer()
412
+
413
+ def run(self, req: QueryRequest) -> QueryResponse:
414
+ t0 = time.perf_counter()
415
+ # ... existing cache key resolution ...
416
+
417
+ # Cache hit: trace as a cache hit and return
418
+ cached = self.cache.get(key) if req.use_llm else None
419
+ if cached is not None:
420
+ with self.observer.trace_request("rag_query_cached", query=req.query_text):
421
+ pass # Trace the cache hit for visibility
422
+ return QueryResponse(cached=True, ...)
423
+
424
+ # Cache miss: trace all pipeline steps under one request trace
425
+ with self.observer.trace_request("rag_query", query=req.query_text) as trace:
426
+ with self.observer.trace_step(trace, "retrieval", {"top_k": retrieve_k}) as s:
427
+ fused = self._retrieve(req.query_text, index, db, qp, top_k=retrieve_k)
428
+ s["chunks_retrieved"] = len(fused)
429
+
430
+ if req.use_rerank:
431
+ with self.observer.trace_step(trace, "reranking", {"input_chunks": len(fused)}) as s:
432
+ ranked = reranker.rerank(req.query_text, fused, top_k=req.top_k)
433
+ s["output_chunks"] = len(ranked)
434
+
435
+ with self.observer.trace_step(trace, "generation", {"provider": selection.provider, "model": selection.model}) as s:
436
+ gen_result = generator.generate(req.query_text, docs_for_gen, ...)
437
+ s["latency_ms"] = gen_result.latency_ms
438
+
439
+ with self.observer.trace_step(trace, "citation_verification") as s:
440
+ citations = self.citation_verifier.verify(full, raw_citations, opt.documents)
441
+ s["citations_count"] = len(citations)
442
+
443
+ with self.observer.trace_step(trace, "truthfulness_scoring") as s:
444
+ truthfulness = scorer.score(full, opt.documents)
445
+ if truthfulness:
446
+ s["nli_faithfulness"] = truthfulness.nli_faithfulness
447
+ s["citation_groundedness"] = truthfulness.citation_groundedness
448
+
449
+ # Flush in background — do NOT block the response
450
+ self.observer.flush_async()
451
+ return QueryResponse(...)
452
+ ```
453
+
454
+ **main.py changes** — only the `/query` endpoint changes: it accepts `BackgroundTasks` and schedules one extra `observer.flush_async` after the response is sent, as a safety net on top of the flush already triggered inside `RAGOrchestrator.run()`:
455
+
456
+ ```python
457
+ # main.py — minimal change, no inline tracing wrappers
458
+ from fastapi import BackgroundTasks
459
+ from src.core.observability import get_observer
460
+
461
+ observer = get_observer()
462
+
463
+ @app.post("/query")
464
+ async def query(request: QueryRequest, background_tasks: BackgroundTasks):
465
+ # Tracing happens inside orchestrator.run() — main.py doesn't wrap steps
466
+ response = orchestrator.run(build_query_request(request))
467
+ background_tasks.add_task(observer.flush_async) # belt-and-suspenders flush
468
+ return build_query_response(response)
469
+ ```
470
+
471
+ **Deliverable for Week 1:**
472
+ - ✅ `src/core/observability.py` (complete)
473
+ - ✅ `tests/unit/test_observability.py` (complete)
474
+ - ✅ `src/core/rag_orchestrator.py` instrumented with step-level tracing
475
+ - ✅ `.env.example` includes `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY`
476
+ - ✅ `langfuse>=2.0.0` added to `requirements/base.txt` (not api.txt — that file does not exist)
477
+
478
+ **Testing Week 1:**
479
+ ```bash
480
+ # Run unit tests
481
+ pytest tests/unit/test_observability.py -v
482
+
483
+ # Start API with observability enabled
484
+ export LANGFUSE_PUBLIC_KEY=pk_... LANGFUSE_SECRET_KEY=sk_...
485
+ PYTHONPATH=. uvicorn src.api.main:app --reload --port 8000
486
+
487
+ # Query and check LangFuse dashboard
488
+ curl -X POST http://127.0.0.1:8000/query \
489
+ -H "Content-Type: application/json" \
490
+ -d '{"query": "What is RAG?"}'
491
+
492
+ # Verify trace appears in LangFuse dashboard
493
+ ```
494
+
495
+ ---
496
+
497
+ ### Phase 5.2: Latency Profiling & Metrics Dashboard (Week 2)
498
+
499
+ **Goal:** Track and expose real-time operational metrics
500
+
501
+ #### Step 2.1: Create Metrics Collector Module
502
+ **File:** `src/monitoring/metrics.py` (NEW)
503
+
504
+ ```python
505
+ """
506
+ Metrics collection and aggregation for RAG pipeline.
507
+ Tracks latency percentiles, cost, citation groundedness, and NLI faithfulness.
508
+ """
509
+
510
+ import json
511
+ import time
512
+ from typing import Dict, List, Optional, Tuple
513
+ from dataclasses import dataclass, asdict
514
+ from collections import deque
515
+ import threading
516
+ from datetime import datetime, timedelta
517
+ import logging
518
+
519
+ logger = logging.getLogger(__name__)
520
+
521
+
522
+ @dataclass
523
+ class StepMetrics:
524
+ """Metrics for a single RAG pipeline step."""
525
+ step_name: str # "retrieval", "reranking", "generation", "citations", "truthfulness"
526
+ latency_ms: float
527
+ timestamp: str
528
+ metadata: Dict = None # Provider, model, token counts, etc.
529
+
530
+
531
+ @dataclass
532
+ class RequestMetrics:
533
+ """Aggregated metrics for a single query request."""
534
+ request_id: str
535
+ total_latency_ms: float
536
+ retrieval_latency_ms: float
537
+ reranking_latency_ms: float
538
+ generation_latency_ms: float
539
+ citation_latency_ms: float
540
+ truthfulness_latency_ms: float
541
+
542
+ # Cost
543
+ cost_usd: float
544
+
545
+ # Quality (online signals — computable without ground truth)
546
+ citation_groundedness: float
547
+ nli_faithfulness: float
548
+ uncited_claims: int
549
+ # NOTE: MRR and NDCG require per-query ground-truth relevance labels.
550
+ # They cannot be computed in production. Use them only in the offline
551
+ # eval harness (evals/run_evals.py). Removed from RequestMetrics.
552
+
553
+ timestamp: str
554
+
555
+
556
+ class MetricsCollector:
557
+ """
558
+ In-memory metrics collector with time-windowed aggregation.
559
+
560
+ Stores metrics in a rolling window (default: the last 1000 requests).
561
+ Computes P50, P95, P99 latencies and cost trends.
562
+
563
+ NOTE: src/utils/log.py already has a MetricsCollector (count/mean/min/max
564
+ per operation name). This class replaces it — don't run both in parallel.
565
+ When implementing, delete or archive the one in log.py to avoid two sources
566
+ of truth. The track_duration() context manager in log.py can delegate to
567
+ this class instead.
568
+ """
569
+
570
+ def __init__(self, window_size: int = 1000):
571
+ self.window_size = window_size
572
+ self.metrics: deque = deque(maxlen=window_size)
573
+ self.lock = threading.Lock()
574
+
575
+ def record_request(self, metrics: RequestMetrics):
576
+ """Record a completed request's metrics."""
577
+ with self.lock:
578
+ self.metrics.append(metrics)
579
+
580
+ def get_percentile(
581
+ self, metric_field: str, percentile: float
582
+ ) -> Optional[float]:
583
+ """
584
+ Get percentile value for a metric field.
585
+
586
+ Args:
587
+ metric_field: e.g., "total_latency_ms", "cost_usd"
588
+ percentile: 0-100, e.g., 50 for P50, 95 for P95
589
+
590
+ Returns:
591
+ Percentile value or None if insufficient data
592
+ """
593
+ with self.lock:
594
+ if not self.metrics:
595
+ return None
596
+
597
+ values = sorted([getattr(m, metric_field) for m in self.metrics])
598
+ idx = int(len(values) * percentile / 100)
599
+ return values[min(idx, len(values) - 1)]
600
+
601
+ def get_dashboard_metrics(self) -> Dict:
602
+ """
603
+ Return aggregated metrics suitable for dashboarding.
604
+ """
605
+ with self.lock:
606
+ if not self.metrics:
607
+ return {
608
+ "status": "no_data",
609
+ "message": "No requests recorded yet",
610
+ }
611
+
612
+ metrics_list = list(self.metrics)
613
+ n = len(metrics_list)
614
+
615
+ # Latency percentiles (ms)
616
+ latency_p50 = self.get_percentile("total_latency_ms", 50)
617
+ latency_p95 = self.get_percentile("total_latency_ms", 95)
618
+ latency_p99 = self.get_percentile("total_latency_ms", 99)
619
+
620
+ # Step-wise latencies (average)
621
+ retrieval_avg = sum(m.retrieval_latency_ms for m in metrics_list) / n
622
+ reranking_avg = sum(m.reranking_latency_ms for m in metrics_list) / n
623
+ generation_avg = sum(m.generation_latency_ms for m in metrics_list) / n
624
+ citation_avg = sum(m.citation_latency_ms for m in metrics_list) / n
625
+
626
+ # Cost
627
+ cost_total = sum(m.cost_usd for m in metrics_list)
628
+ cost_avg = cost_total / n
629
+ cost_p95 = self.get_percentile("cost_usd", 95)
630
+
631
+ # Quality
632
+ groundedness_avg = sum(
633
+ m.citation_groundedness for m in metrics_list if m.citation_groundedness
634
+ ) / max(sum(1 for m in metrics_list if m.citation_groundedness), 1)
635
+
636
+ faithfulness_avg = sum(
637
+ m.nli_faithfulness for m in metrics_list if m.nli_faithfulness
638
+ ) / max(sum(1 for m in metrics_list if m.nli_faithfulness), 1)
639
+
640
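+ # Retrieval quality — NOTE(review): RequestMetrics no longer defines mrr/ndcg (offline-only, see the NOTE in RequestMetrics), so the mrr_avg/ndcg_avg computation below and the matching "quality" keys would raise AttributeError; remove them when implementing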
641
+ mrr_avg = sum(m.mrr for m in metrics_list if m.mrr) / max(
642
+ sum(1 for m in metrics_list if m.mrr), 1
643
+ )
644
+ ndcg_avg = sum(m.ndcg for m in metrics_list if m.ndcg) / max(
645
+ sum(1 for m in metrics_list if m.ndcg), 1
646
+ )
647
+
648
+ return {
649
+ "summary": {
650
+ "total_requests": n,
651
+ "window_size": self.window_size,
652
+ "last_updated": datetime.utcnow().isoformat(),
653
+ },
654
+ "latency": {
655
+ "total_p50_ms": round(latency_p50, 2),
656
+ "total_p95_ms": round(latency_p95, 2),
657
+ "total_p99_ms": round(latency_p99, 2),
658
+ "retrieval_avg_ms": round(retrieval_avg, 2),
659
+ "reranking_avg_ms": round(reranking_avg, 2),
660
+ "generation_avg_ms": round(generation_avg, 2),
661
+ "citation_avg_ms": round(citation_avg, 2),
662
+ "breakdown_pct": {
663
+ "retrieval": round(retrieval_avg / (retrieval_avg + reranking_avg + generation_avg + citation_avg) * 100, 1),
664
+ "reranking": round(reranking_avg / (retrieval_avg + reranking_avg + generation_avg + citation_avg) * 100, 1),
665
+ "generation": round(generation_avg / (retrieval_avg + reranking_avg + generation_avg + citation_avg) * 100, 1),
666
+ "citation": round(citation_avg / (retrieval_avg + reranking_avg + generation_avg + citation_avg) * 100, 1),
667
+ },
668
+ },
669
+ "cost": {
670
+ "total_usd": round(cost_total, 4),
671
+ "avg_per_request_usd": round(cost_avg, 6),
672
+ "p95_per_request_usd": round(cost_p95, 6),
673
+ },
674
+ "quality": {
675
+ "citation_groundedness_avg": round(groundedness_avg, 3),
676
+ "nli_faithfulness_avg": round(faithfulness_avg, 3),
677
+ "mrr_avg": round(mrr_avg, 3),
678
+ "ndcg_avg": round(ndcg_avg, 3),
679
+ },
680
+ }
681
+
682
+
683
+ # Global metrics collector instance
684
+ _collector_instance: Optional[MetricsCollector] = None
685
+
686
+
687
+ def get_metrics_collector() -> MetricsCollector:
688
+ """Singleton getter."""
689
+ global _collector_instance
690
+ if _collector_instance is None:
691
+ _collector_instance = MetricsCollector()
692
+ return _collector_instance
693
+ ```
694
+
695
+ **Testing:** `tests/unit/test_metrics.py` (NEW)
696
+
697
+ ```python
698
+ """Unit tests for metrics collector."""
699
+
700
+ import pytest
701
+ from src.monitoring.metrics import MetricsCollector, RequestMetrics
702
+ from datetime import datetime
703
+
704
+
705
+ def test_metrics_collector_records_request():
706
+ """Verify collector records request metrics."""
707
+ collector = MetricsCollector(window_size=100)
708
+
709
+ metrics = RequestMetrics(
710
+ request_id="req-1",
711
+ total_latency_ms=1000.0,
712
+ retrieval_latency_ms=200.0,
713
+ reranking_latency_ms=150.0,
714
+ generation_latency_ms=600.0,
715
+ citation_latency_ms=50.0,
716
+ truthfulness_latency_ms=0.0,
717
+ cost_usd=0.005,
718
+ citation_groundedness=0.92,
719
+ nli_faithfulness=0.88,
720
+ uncited_claims=0,
721
+ timestamp=datetime.utcnow().isoformat(),
722
+ )
723
+
724
+ collector.record_request(metrics)
725
+ assert len(collector.metrics) == 1
726
+
727
+
728
+ def test_metrics_percentile_calculation():
729
+ """Verify P50, P95, P99 calculations."""
730
+ collector = MetricsCollector(window_size=100)
731
+
732
+ # Record 100 requests with latencies 10-1000ms
733
+ for i in range(1, 101):
734
+ metrics = RequestMetrics(
735
+ request_id=f"req-{i}",
736
+ total_latency_ms=float(i * 10),
737
+ retrieval_latency_ms=100.0,
738
+ reranking_latency_ms=50.0,
739
+ generation_latency_ms=i * 5,
740
+ citation_latency_ms=10.0,
741
+ truthfulness_latency_ms=0.0,
742
+ cost_usd=0.01,
743
+ citation_groundedness=0.90,
744
+ nli_faithfulness=0.90,
745
+ uncited_claims=0,
746
+ timestamp=datetime.utcnow().isoformat(),
747
+ )
748
+ collector.record_request(metrics)
749
+
750
+ # Check percentiles
751
+ p50 = collector.get_percentile("total_latency_ms", 50)
752
+ p95 = collector.get_percentile("total_latency_ms", 95)
753
+ p99 = collector.get_percentile("total_latency_ms", 99)
754
+
755
+ assert p50 is not None
756
+ assert p95 is not None and p95 >= p50
757
+ assert p99 is not None and p99 >= p95
758
+
759
+
760
+ def test_dashboard_metrics_aggregation():
761
+ """Verify dashboard metrics aggregation."""
762
+ collector = MetricsCollector(window_size=10)
763
+
764
+ for i in range(5):
765
+ metrics = RequestMetrics(
766
+ request_id=f"req-{i}",
767
+ total_latency_ms=1000.0,
768
+ retrieval_latency_ms=200.0,
769
+ reranking_latency_ms=150.0,
770
+ generation_latency_ms=600.0,
771
+ citation_latency_ms=50.0,
772
+ truthfulness_latency_ms=0.0,
773
+ cost_usd=0.005,
774
+ citation_groundedness=0.92,
775
+ nli_faithfulness=0.88,
776
+ uncited_claims=0,
777
+ timestamp=datetime.utcnow().isoformat(),
778
+ )
779
+ collector.record_request(metrics)
780
+
781
+ dashboard = collector.get_dashboard_metrics()
782
+
783
+ assert dashboard["summary"]["total_requests"] == 5
784
+ assert "latency" in dashboard
785
+ assert "cost" in dashboard
786
+ assert "quality" in dashboard
787
+ assert dashboard["latency"]["total_p50_ms"] > 0
788
+ ```
789
+
790
+ ---
791
+
792
+ #### Step 2.2: Update FastAPI Routes to Record Metrics
793
+ **File:** `src/api/main.py` (MODIFY existing)
794
+
795
+ ```python
796
+ # At top
797
+ from src.monitoring.metrics import get_metrics_collector, RequestMetrics
798
+ import uuid
799
+ from datetime import datetime
800
+
801
+ metrics_collector = get_metrics_collector()
802
+
803
+ # NOTE: Step-level timing and tracing now live in RAGOrchestrator.run() — see Step 1.2.
804
+ # main.py only needs to extract the per-step latencies from the QueryResponse and
805
+ # record them into MetricsCollector. RAGOrchestrator.run() returns processing_time_ms
806
+ # and per-step breakdowns; extend QueryResponse to carry those fields.
807
+
808
+ @app.post("/query")
809
+ async def query(request: QueryRequest, background_tasks: BackgroundTasks):
810
+ request_id = str(uuid.uuid4())
811
+
812
+ try:
813
+ orch_response = orchestrator.run(build_query_request(request))
814
+
815
+ metrics = RequestMetrics(
816
+ request_id=request_id,
817
+ total_latency_ms=orch_response.processing_time_ms,
818
+ retrieval_latency_ms=orch_response.step_latencies.get("retrieval", 0),
819
+ reranking_latency_ms=orch_response.step_latencies.get("reranking", 0),
820
+ generation_latency_ms=orch_response.step_latencies.get("generation", 0),
821
+ citation_latency_ms=orch_response.step_latencies.get("citation_verification", 0),
822
+ truthfulness_latency_ms=orch_response.step_latencies.get("truthfulness_scoring", 0),
823
+ cost_usd=calculate_cost(orch_response, request.provider, request.model),
824
+ citation_groundedness=orch_response.truthfulness.citation_groundedness if orch_response.truthfulness else 0,
825
+ nli_faithfulness=orch_response.truthfulness.nli_faithfulness if orch_response.truthfulness else 0,
826
+ uncited_claims=orch_response.truthfulness.uncited_claims if orch_response.truthfulness else 0,
827
+ timestamp=datetime.utcnow().isoformat(),
828
+ )
829
+ metrics_collector.record_request(metrics)
830
+ background_tasks.add_task(observer.flush_async)
831
+
832
+ return build_api_response(request_id, orch_response)
833
+ except Exception as e:
834
+ raise HTTPException(status_code=500, detail=str(e))
835
+
836
+
837
+ # NEW endpoint: /observability/dashboard
838
+ @app.get("/observability/dashboard")
839
+ async def observability_dashboard():
840
+ """Return real-time observability metrics for dashboarding."""
841
+ return metrics_collector.get_dashboard_metrics()
842
+
843
+
844
+ def calculate_cost(answer_response, provider: str, model: str) -> float:
845
+ """Calculate USD cost of request based on tokens and provider pricing.
846
+
847
+ NOTE: This function belongs in src/core/llm_provider.py, not main.py.
848
+ LLMProviderRouter already knows the provider/model — move cost calculation
849
+ there so it's available to CLI and Streamlit paths as well.
850
+ """
851
+ if hasattr(answer_response, "usage"):
852
+ # Rough estimates — update as provider pricing changes
853
+ if provider == "openai":
854
+ return (answer_response.usage.prompt_tokens * 0.000001 +
855
+ answer_response.usage.completion_tokens * 0.000002)
856
+ elif provider == "anthropic":
857
+ return (answer_response.usage.prompt_tokens * 0.0000008 +
858
+ answer_response.usage.completion_tokens * 0.0000024)
859
+ return 0.0
860
+ ```
861
+
862
+ **New endpoint:** `src/api/routes/observability.py` (NEW, optional separation)
863
+
864
+ ```python
865
+ """Observability and monitoring routes."""
866
+
867
+ from fastapi import APIRouter
868
+ from src.monitoring.metrics import get_metrics_collector
869
+
870
+ router = APIRouter(prefix="/observability", tags=["observability"])
871
+ metrics_collector = get_metrics_collector()
872
+
873
+
874
+ @router.get("/dashboard")
875
+ async def get_dashboard():
876
+ """Get real-time dashboard metrics."""
877
+ return metrics_collector.get_dashboard_metrics()
878
+
879
+
880
+ @router.get("/health")
881
+ async def health_check():
882
+ """Basic health check."""
883
+ return {"status": "healthy"}
884
+ ```
885
+
886
+ **Deliverable for Week 2:**
887
+ - ✅ `src/monitoring/metrics.py` (complete)
888
+ - ✅ `tests/unit/test_metrics.py` (complete)
889
+ - ✅ `src/api/main.py` updated to record per-request metrics (step-level timing itself lives in `RAGOrchestrator.run()`)
890
+ - ✅ `src/api/routes/observability.py` (optional separation)
891
+ - ✅ `src/api/main.py` includes `/observability/dashboard` endpoint
892
+ - ✅ Update `Docs/RUNBOOK.md` with observability dashboard instructions
893
+
894
+ **Testing Week 2:**
895
+ ```bash
896
+ # Run unit tests
897
+ pytest tests/unit/test_metrics.py -v
898
+
899
+ # Query and check metrics
900
+ curl -X POST http://127.0.0.1:8000/query \
901
+ -H "Content-Type: application/json" \
902
+ -d '{"query": "What is RAG?"}'
903
+
904
+ # View dashboard
905
+ curl http://127.0.0.1:8000/observability/dashboard | jq .
906
+
907
+ # Should return:
908
+ # {
909
+ # "summary": { "total_requests": 1, ... },
910
+ # "latency": { "total_p50_ms": ..., "breakdown_pct": ... },
911
+ # "cost": { "avg_per_request_usd": ... },
912
+ # "quality": { "citation_groundedness_avg": ... }
913
+ # }
914
+ ```
915
+
916
+ ---
917
+
918
+ ### Phase 5.3: Regression Gating in CI/CD (Week 3)
919
+
920
+ **Goal:** Automated quality threshold enforcement on PRs
921
+
922
+ #### Step 3.1: Create Regression Gate Script
923
+ **File:** `scripts/compare_evals.py` (NEW)
924
+
925
+ ```python
926
+ #!/usr/bin/env python3
927
+ """
928
+ Compare evaluation metrics between baseline and current results.
929
+ Used in GitHub Actions to gate PRs based on regression thresholds.
930
+ """
931
+
932
+ import json
933
+ import sys
934
+ import argparse
935
+ from typing import Dict, Tuple
936
+
937
+
938
+ def load_metrics(filepath: str) -> Dict:
939
+ """Load metrics from JSON file."""
940
+ try:
941
+ with open(filepath, "r") as f:
942
+ return json.load(f)
943
+ except FileNotFoundError:
944
+ print(f"Error: {filepath} not found")
945
+ sys.exit(1)
946
+ except json.JSONDecodeError:
947
+ print(f"Error: {filepath} is not valid JSON")
948
+ sys.exit(1)
949
+
950
+
951
+ def compare_metrics(
952
+ baseline: Dict, current: Dict, threshold_pct: float = 5.0
953
+ ) -> Tuple[bool, Dict]:
954
+ """
955
+ Compare baseline and current metrics.
956
+
957
+ Returns:
958
+ (passed: bool, results: Dict with details)
959
+ """
960
+ results = {
961
+ "passed": True,
962
+ "regressions": [],
963
+ "threshold_pct": threshold_pct,
964
+ }
965
+
966
+ # Metrics to track (lower is better for latency/cost, higher is better for quality)
967
+ latency_metrics = [
968
+ "total_p50_ms",
969
+ "total_p95_ms",
970
+ "retrieval_avg_ms",
971
+ "generation_avg_ms",
972
+ ]
973
+ quality_metrics = [
974
+ "citation_groundedness_avg",
975
+ "nli_faithfulness_avg",
976
+ # mrr_avg and ndcg_avg removed — offline-only, not in RequestMetrics
977
+ ]
978
+ cost_metrics = ["avg_per_request_usd"]
979
+
980
+ # Check latency (should not increase by >threshold%)
981
+ baseline_latency = baseline.get("latency", {})
982
+ current_latency = current.get("latency", {})
983
+
984
+ for metric in latency_metrics:
985
+ baseline_val = baseline_latency.get(metric)
986
+ current_val = current_latency.get(metric)
987
+
988
+ if baseline_val is None or current_val is None:
989
+ continue
990
+
991
+ pct_change = ((current_val - baseline_val) / baseline_val) * 100
992
+
993
+ if pct_change > threshold_pct:
994
+ results["regressions"].append({
995
+ "metric": metric,
996
+ "baseline": baseline_val,
997
+ "current": current_val,
998
+ "pct_change": pct_change,
999
+ "direction": "worse (latency increased)",
1000
+ })
1001
+ results["passed"] = False
1002
+
1003
+ # Check quality (should not decrease by >threshold%)
1004
+ baseline_quality = baseline.get("quality", {})
1005
+ current_quality = current.get("quality", {})
1006
+
1007
+ for metric in quality_metrics:
1008
+ baseline_val = baseline_quality.get(metric)
1009
+ current_val = current_quality.get(metric)
1010
+
1011
+ if baseline_val is None or current_val is None:
1012
+ continue
1013
+
1014
+ pct_change = ((baseline_val - current_val) / baseline_val) * 100
1015
+
1016
+ if pct_change > threshold_pct:
1017
+ results["regressions"].append({
1018
+ "metric": metric,
1019
+ "baseline": baseline_val,
1020
+ "current": current_val,
1021
+ "pct_change": pct_change,
1022
+ "direction": "worse (quality decreased)",
1023
+ })
1024
+ results["passed"] = False
1025
+
1026
+ # Check cost (should not increase by >threshold%)
1027
+ baseline_cost = baseline.get("cost", {})
1028
+ current_cost = current.get("cost", {})
1029
+
1030
+ for metric in cost_metrics:
1031
+ baseline_val = baseline_cost.get(metric)
1032
+ current_val = current_cost.get(metric)
1033
+
1034
+ if baseline_val is None or current_val is None:
1035
+ continue
1036
+
1037
+ pct_change = ((current_val - baseline_val) / baseline_val) * 100
1038
+
1039
+ if pct_change > threshold_pct:
1040
+ results["regressions"].append({
1041
+ "metric": metric,
1042
+ "baseline": baseline_val,
1043
+ "current": current_val,
1044
+ "pct_change": pct_change,
1045
+ "direction": "worse (cost increased)",
1046
+ })
1047
+ results["passed"] = False
1048
+
1049
+ return results["passed"], results
1050
+
1051
+
1052
+ def main():
1053
+ parser = argparse.ArgumentParser(
1054
+ description="Compare evaluation metrics between baseline and current"
1055
+ )
1056
+ parser.add_argument("--baseline", required=True, help="Path to baseline metrics JSON")
1057
+ parser.add_argument("--current", required=True, help="Path to current metrics JSON")
1058
+ parser.add_argument(
1059
+ "--threshold", type=float, default=5.0, help="Regression threshold in percent (default: 5%%)"
1060
+ )
1061
+ parser.add_argument("--strict", action="store_true", help="Fail on any regression")
1062
+
1063
+ args = parser.parse_args()
1064
+
1065
+ baseline = load_metrics(args.baseline)
1066
+ current = load_metrics(args.current)
1067
+
1068
+ threshold = 0 if args.strict else args.threshold
1069
+ passed, results = compare_metrics(baseline, current, threshold_pct=threshold)
1070
+
1071
+ print(json.dumps(results, indent=2))
1072
+
1073
+ if not passed:
1074
+ print(f"\n❌ Regression detected ({len(results['regressions'])} metric(s) failed)")
1075
+ for reg in results["regressions"]:
1076
+ print(f" - {reg['metric']}: {reg['pct_change']:.1f}% {reg['direction']}")
1077
+ sys.exit(1)
1078
+ else:
1079
+ print("\n✅ All metrics pass regression gate")
1080
+ sys.exit(0)
1081
+
1082
+
1083
+ if __name__ == "__main__":
1084
+ main()
1085
+ ```
1086
+
1087
+ **Testing:** `tests/unit/test_regression_gate.py` (NEW)
1088
+
1089
+ ```python
1090
+ """Unit tests for regression gate script."""
1091
+
1092
+ import json
1093
+ import tempfile
1094
+ import pytest
1095
+ from scripts.compare_evals import compare_metrics
1096
+
1097
+
1098
+ def test_no_regression_when_metrics_stable():
1099
+ """Verify no regression when metrics are unchanged."""
1100
+ baseline = {
1101
+ "latency": {"total_p50_ms": 1000.0, "retrieval_avg_ms": 200.0},
1102
+ "quality": {"citation_groundedness_avg": 0.92},
1103
+ "cost": {"avg_per_request_usd": 0.005},
1104
+ }
1105
+ current = baseline.copy()
1106
+
1107
+ passed, results = compare_metrics(baseline, current, threshold_pct=5.0)
1108
+
1109
+ assert passed is True
1110
+ assert len(results["regressions"]) == 0
1111
+
1112
+
1113
+ def test_regression_detected_for_latency_increase():
1114
+ """Verify regression detected when latency increases >threshold."""
1115
+ baseline = {
1116
+ "latency": {"total_p50_ms": 1000.0},
1117
+ "quality": {},
1118
+ "cost": {},
1119
+ }
1120
+ current = {
1121
+ "latency": {"total_p50_ms": 1100.0}, # 10% increase
1122
+ "quality": {},
1123
+ "cost": {},
1124
+ }
1125
+
1126
+ passed, results = compare_metrics(baseline, current, threshold_pct=5.0)
1127
+
1128
+ assert passed is False
1129
+ assert len(results["regressions"]) == 1
1130
+ assert results["regressions"][0]["metric"] == "total_p50_ms"
1131
+ assert results["regressions"][0]["pct_change"] > 5.0
1132
+
1133
+
1134
+ def test_no_regression_when_quality_improves():
1135
+ """Verify no regression when quality improves."""
1136
+ baseline = {
1137
+ "latency": {},
1138
+ "quality": {"citation_groundedness_avg": 0.90},
1139
+ "cost": {},
1140
+ }
1141
+ current = {
1142
+ "latency": {},
1143
+ "quality": {"citation_groundedness_avg": 0.95}, # Improvement
1144
+ "cost": {},
1145
+ }
1146
+
1147
+ passed, results = compare_metrics(baseline, current, threshold_pct=5.0)
1148
+
1149
+ assert passed is True
1150
+ assert len(results["regressions"]) == 0
1151
+ ```
1152
+
1153
+ ---
1154
+
1155
+ #### Step 3.2: Extend Existing CI Workflow (do NOT create a new file)
1156
+ **File:** `.github/workflows/ci.yml` (MODIFY existing `evals-golden` job)
1157
+
1158
+ > **Why extend, not create?** `ci.yml` already has an `evals-golden` job that runs `golden_ci.jsonl` on every PR with Anthropic Haiku. Creating `.github/workflows/regression_gate.yml` would duplicate that job, resulting in two separate eval runs per PR at twice the cost and runtime. Extend the existing job with a comparison step instead.
1159
+ >
1160
+ > **Dataset filename**: The actual file is `evals/datasets/golden_ci.jsonl`, not `golden.jsonl`.
1161
+ >
1162
+ > **Baseline strategy**: Store `evals/reports/baseline.json` in the repo (committed from main). The CI job compares the PR output against this committed baseline. This avoids the fragile "check out main and run evals" approach, which doubles job time and creates a chicken-and-egg bootstrapping problem.
1163
+
1164
+ **Add these steps to the existing `evals-golden` job in `ci.yml`:**
1165
+
1166
+ ```yaml
1167
+ evals-golden:
1168
+ name: Golden evals (Anthropic Haiku)
1169
+ runs-on: ubuntu-latest
1170
+ env:
1171
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
1172
+ steps:
1173
+ - uses: actions/checkout@v4
1174
+ - uses: actions/setup-python@v5
1175
+ with:
1176
+ python-version: "3.13"
1177
+ cache: pip
1178
+
1179
+ - name: Skip golden evals when secret is missing
1180
+ if: ${{ env.ANTHROPIC_API_KEY == '' }}
1181
+ run: echo "ANTHROPIC_API_KEY not set; skipping golden evals."
1182
+
1183
+ - name: Install dependencies
1184
+ if: ${{ env.ANTHROPIC_API_KEY != '' }}
1185
+ run: pip install -r requirements/base.txt
1186
+
1187
+ - name: Run golden evals (live pipeline, Anthropic Haiku)
1188
+ if: ${{ env.ANTHROPIC_API_KEY != '' }}
1189
+ run: |
1190
+ PYTHONPATH=. python -m evals.run_evals \
1191
+ --dataset evals/datasets/golden_ci.jsonl \
1192
+ --judge-provider anthropic \
1193
+ --judge-model claude-haiku-4-5 \
1194
+ --output evals/reports/pr-current.json \
1195
+ --faithfulness-threshold 0.7 \
1196
+ --correctness-threshold 0.2
1197
+
1198
+ # === NEW: regression gate comparison ===
1199
+ - name: Compare against baseline (regression gate)
1200
+ if: ${{ env.ANTHROPIC_API_KEY != '' && hashFiles('evals/reports/baseline.json') != '' }}
1201
+ run: |
1202
+ python scripts/compare_evals.py \
1203
+ --baseline evals/reports/baseline.json \
1204
+ --current evals/reports/pr-current.json \
1205
+ --threshold 5.0
1206
+
1207
+ - name: Comment on regression failure
1208
+ if: failure()
1209
+ uses: actions/github-script@v6
1210
+ with:
1211
+ script: |
1212
+ github.rest.issues.createComment({
1213
+ issue_number: context.issue.number,
1214
+ owner: context.repo.owner,
1215
+ repo: context.repo.repo,
1216
+ body: "⚠️ **Regression Detected**\n\nEval metrics degraded vs baseline. See `evals/reports/pr-current.json` artifact for details.\n\nTo update the baseline (intentional improvement), run `make update-baseline` on main."
1217
+ })
1218
+ # === END: regression gate ===
1219
+
1220
+ - name: Upload golden eval report
1221
+ if: ${{ always() && env.ANTHROPIC_API_KEY != '' }}
1222
+ uses: actions/upload-artifact@v4
1223
+ with:
1224
+ name: eval-report-golden
1225
+ path: evals/reports/
1226
+ ```
1227
+
1228
+ **One-time baseline setup** (run on main, commit the result):
1229
+ ```bash
1230
+ git checkout main
1231
+ PYTHONPATH=. python -m evals.run_evals \
1232
+ --dataset evals/datasets/golden_ci.jsonl \
1233
+ --judge-provider anthropic \
1234
+ --judge-model claude-haiku-4-5 \
1235
+ --output evals/reports/baseline.json
1236
+ git add evals/reports/baseline.json
1237
+ git commit -m "chore: establish Phase 5 eval baseline"
1238
+ ```
1239
+
1240
+ ---
1241
+
1242
+ #### Step 3.3: Create Phase 5 Documentation
1243
+ **File:** `Docs/phase5_observability.md` (NEW)
1244
+
1245
+ ```markdown
1246
+ # Phase 5: Production Monitoring & Observability
1247
+
1248
+ **Timeline:** 3 weeks
1249
+ **Status:** Implementation in progress
1250
+
1251
+ ## Overview
1252
+
1253
+ Phase 5 hardens the doc-ingestion RAG system for production through:
1254
+
1255
+ 1. **Distributed tracing** (LangFuse) for end-to-end pipeline visibility
1256
+ 2. **Latency profiling** (P50, P95, P99) per step
1257
+ 3. **Cost tracking** (USD per request)
1258
+ 4. **Real-time metrics dashboard** at `/observability/dashboard`
1259
+ 5. **Regression gating** (GitHub Actions) to prevent accuracy degradation on PRs
1260
+ 6. **Citation accuracy monitoring** (groundedness, coverage trends)
1261
+
1262
+ ## Architecture
1263
+
1264
+ ### Tracing Flow
1265
+ ```
1266
+ User Query
1267
+
1268
+ [LangFuse Trace Start]
1269
+
1270
+ Retrieval (BM25 + Vector)
1271
+ [TRACE: latency, chunks retrieved, scores]
1272
+
1273
+ Reranking (Cross-Encoder)
1274
+ [TRACE: latency, input/output chunks]
1275
+
1276
+ Generation (LLM)
1277
+ [TRACE: latency, tokens, cost, provider]
1278
+
1279
+ Citation Verification
1280
+ [TRACE: latency, citations verified]
1281
+
1282
+ Truthfulness Scoring
1283
+ [TRACE: latency, faithfulness, groundedness]
1284
+
1285
+ [Flush to LangFuse]
1286
+
1287
+ Response + Metrics Recorded
1288
+ ```
1289
+
1290
+ ### Metrics Aggregation
1291
+ ```
1292
+ Per-Request Metrics (RequestMetrics)
1293
+
1294
+ In-Memory Collector (1000 rolling window)
1295
+
1296
+ Dashboard Endpoint (/observability/dashboard)
1297
+
1298
+ JSON: P50/P95/P99 latencies, cost trends, quality scores
1299
+ ```
1300
+
1301
+ ### Regression Gating
1302
+ ```
1303
+ PR Submitted
1304
+
1305
+ GitHub Actions: Run evals on golden dataset
1306
+
1307
+ Compare against committed baseline (evals/reports/baseline.json, generated on main)
1308
+
1309
+ Check: Latency increase <5%? Quality decrease <5%?
1310
+
1311
+ If FAIL: Block PR + comment with regression details
1312
+ If PASS: Allow merge
1313
+ ```
1314
+
1315
+ ## Key Components
1316
+
1317
+ ### 1. Observability Module (`src/core/observability.py`)
1318
+
1319
+ **Provides:**
1320
+ - `RAGObserver` class with step-level tracing decorators
1321
+ - Context managers for span-based tracing
1322
+ - LangFuse client integration
1323
+ - No-op when disabled (useful for demo mode)
1324
+
1325
+ **Usage:**
1326
+ ```python
1327
+ observer = get_observer()
1328
+
1329
+ # One trace per request, spans as children — instrument in RAGOrchestrator.run()
1330
+ with observer.trace_request("rag_query", query=query_text) as trace:
1331
+ with observer.trace_step(trace, "retrieval") as s:
1332
+ result = retriever.retrieve(query)
1333
+ s["chunks_retrieved"] = len(result)
1334
+ with observer.trace_step(trace, "generation", {"provider": provider}) as s:
1335
+ answer = generator.generate(query, result)
1336
+
1337
+ observer.flush_async() # non-blocking — run in background thread
1338
+ ```
1339
+
1340
+ ### 2. Metrics Collector (`src/monitoring/metrics.py`)
1341
+
1342
+ **Provides:**
1343
+ - `MetricsCollector` for in-memory aggregation
1344
+ - Percentile calculations (P50, P95, P99)
1345
+ - Dashboard-friendly JSON aggregations
1346
+ - Thread-safe recording
1347
+
1348
+ **Metrics tracked:**
1349
+ ```
1350
+ Latency:
1351
+ - total_latency_ms (P50, P95, P99)
1352
+ - retrieval_avg_ms
1353
+ - reranking_avg_ms
1354
+ - generation_avg_ms
1355
+ - citation_avg_ms
1356
+ - Breakdown percentages
1357
+
1358
+ Cost:
1359
+ - total_usd (across all requests)
1360
+ - avg_per_request_usd
1361
+ - p95_per_request_usd
1362
+
1363
+ Quality (online — no ground truth required):
1364
+ - citation_groundedness_avg
1365
+ - nli_faithfulness_avg
1366
+ (mrr/ndcg are offline-only; they live in evals/run_evals.py, not RequestMetrics)
1367
+ ```
1368
+
1369
+ ### 3. Regression Gate Script (`scripts/compare_evals.py`)
1370
+
1371
+ **Compares:**
1372
+ - Baseline metrics (main branch)
1373
+ - Current metrics (PR branch)
1374
+ - Threshold: 5% by default (configurable)
1375
+
1376
+ **Fails if:**
1377
+ - Latency increases >5%
1378
+ - Quality decreases >5%
1379
+ - Cost increases >5%
1380
+
1381
+ ### 4. Regression Gate in `.github/workflows/ci.yml` (extends existing `evals-golden` job)
1382
+
1383
+ **On every PR:**
1384
+ 1. Runs offline evaluations against `evals/datasets/golden_ci.jsonl`
1385
+ 2. Compares against committed `evals/reports/baseline.json`
1386
+ 3. Blocks PR if regressions detected
1387
+ 4. Comments with regression details
1388
+
1389
+ ## Setup Instructions
1390
+
1391
+ ### Step 1: Set Environment Variables
1392
+
1393
+ ```bash
1394
+ # For development
1395
+ export LANGFUSE_PUBLIC_KEY=pk_...
1396
+ export LANGFUSE_SECRET_KEY=sk_...
1397
+
1398
+ # For testing (disabled)
1399
+ export DOC_PROFILE=demo # Disables LangFuse
1400
+ ```
1401
+
1402
+ ### Step 2: Install Dependencies
1403
+
1404
+ ```bash
1405
+ # langfuse goes into requirements/base.txt (requirements/api.txt does not exist)
1406
+ pip install -r requirements/base.txt # Includes langfuse>=2.0.0
1407
+ ```
1408
+
1409
+ ### Step 3: Configure Baseline (One-Time, commit to repo)
1410
+
1411
+ ```bash
1412
+ # Run evaluations on main branch to establish baseline
1413
+ git checkout main
1414
+ PYTHONPATH=. python -m evals.run_evals \
1415
+ --dataset evals/datasets/golden_ci.jsonl \
1416
+ --judge-provider anthropic \
1417
+ --judge-model claude-haiku-4-5 \
1418
+ --output evals/reports/baseline.json
1419
+ git add evals/reports/baseline.json
1420
+ git commit -m "chore: establish Phase 5 eval baseline"
1421
+ ```
1422
+
1423
+ ### Step 4: Query and Monitor
1424
+
1425
+ ```bash
1426
+ # Start API
1427
+ PYTHONPATH=. uvicorn src.api.main:app --reload
1428
+
1429
+ # Query
1430
+ curl -X POST http://localhost:8000/query \
1431
+ -H "Content-Type: application/json" \
1432
+ -d '{"query": "What is RAG?"}'
1433
+
1434
+ # View dashboard
1435
+ curl http://localhost:8000/observability/dashboard | jq .
1436
+
1437
+ # Output:
1438
+ # {
1439
+ # "summary": { "total_requests": 1, ... },
1440
+ # "latency": {
1441
+ # "total_p50_ms": 1247.3,
1442
+ # "total_p95_ms": 1247.3,
1443
+ # "breakdown_pct": {
1444
+ # "retrieval": 18.2,
1445
+ # "reranking": 12.1,
1446
+ # "generation": 68.4,
1447
+ # "citation": 1.3
1448
+ # }
1449
+ # },
1450
+ # "cost": { "avg_per_request_usd": 0.00245 },
1451
+ # "quality": {
1452
+ # "citation_groundedness_avg": 0.92,
1453
+ # "nli_faithfulness_avg": 0.88
1454
+ # }
1455
+ # }
1456
+ ```
1457
+
1458
+ ## Testing
1459
+
1460
+ ### Unit Tests
1461
+
1462
+ ```bash
1463
+ # Observability tests
1464
+ pytest tests/unit/test_observability.py -v
1465
+
1466
+ # Metrics tests
1467
+ pytest tests/unit/test_metrics.py -v
1468
+
1469
+ # Regression gate tests
1470
+ pytest tests/unit/test_regression_gate.py -v
1471
+ ```
1472
+
1473
+ ### Integration Test
1474
+
1475
+ ```bash
1476
+ # Full E2E with tracing enabled
1477
+ LANGFUSE_PUBLIC_KEY=pk_... LANGFUSE_SECRET_KEY=sk_... \
1478
+ PYTHONPATH=. python -c "
1479
+ from src.api.main import app
1480
+ from fastapi.testclient import TestClient
1481
+
1482
+ client = TestClient(app)
1483
+ response = client.post('/query', json={'query': 'What is RAG?'})
1484
+ print(response.json())
1485
+ # Should include request_id and all metrics
1486
+ "
1487
+ ```
1488
+
1489
+ ## Metrics Interpretation
1490
+
1491
+ ### Latency Breakdown Example
1492
+ ```
1493
+ Total P50: 1247.3 ms
1494
+
1495
+ Breakdown:
1496
+ - Retrieval: 227 ms (18.2%) ← BM25 + Vector Search
1497
+ - Reranking: 151 ms (12.1%) ← Cross-Encoder Rerank
1498
+ - Generation: 855 ms (68.4%) ← LLM inference
1499
+ - Citation: 14 ms ( 1.3%) ← Citation Verification
1500
+
1501
+ Interpretation:
1502
+ Generation is the bottleneck (68.4% of total).
1503
+ Could optimize by:
1504
+ 1. Using a faster model
1505
+ 2. Using streaming
1506
+ 3. Reducing context size
1507
+ ```
1508
+
1509
+ ### Quality Metrics Example
1510
+ ```
1511
+ Citation Groundedness: 0.92 (92% of citations verified)
1512
+ NLI Faithfulness: 0.88 (88% of answer supported by chunks)
1513
+ MRR (Retrieval): 0.85 (Mean Reciprocal Rank; offline evals only)
1514
+ NDCG (Retrieval): 0.80 (NDCG@10; offline evals only)
1515
+
1516
+ Interpretation:
1517
+ - Citation coverage is strong (92%)
1518
+ - Faithfulness could improve (88%)
1519
+ - Retrieval quality is good (MRR 0.85)
1520
+ - Consider reranking strategy improvements
1521
+ ```
1522
+
1523
+ ### Cost Estimation Example
1524
+ ```
1525
+ Cost per Request: $0.00245 (avg)
1526
+ Cost at P95: $0.00312
1527
+
1528
+ Annual projection (10K requests/day):
1529
+ 365 * 10K * $0.00245 = $8,942.50
1530
+
1531
+ Cost Optimization:
1532
+ - Switch to cheaper model?
1533
+ - Use batch inference?
1534
+ - Cache common queries?
1535
+ ```
1536
+
1537
+ ## Deployment Notes
1538
+
1539
+ ### Docker
1540
+
1541
+ ```dockerfile
1542
+ # In docker/Dockerfile, ensure observability deps are included
1543
+ # langfuse is in requirements/base.txt — no separate api.txt exists
1544
+ RUN pip install -r requirements/base.txt
1545
+
1546
+ # docker-compose sets env vars
1547
+ environment:
1548
+ - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
1549
+ - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
1550
+ ```
1551
+
1552
+ ### Streamlit (Demo Mode)
1553
+
1554
+ ```python
1555
+ # In demo mode, observability is disabled
1556
+ if os.getenv("DOC_PROFILE") == "demo":
1557
+ observer = RAGObserver(enabled=False) # No-op
1558
+ ```
1559
+
1560
+ ## Troubleshooting
1561
+
1562
+ ### LangFuse traces not appearing
1563
+
1564
+ ```
1565
+ 1. Check credentials: LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY set?
1566
+ 2. Check network: Can you reach https://cloud.langfuse.com?
1567
+ 3. Check logs: Do you see "LangFuse observability enabled"?
1568
+ 4. Verify flush: observer.flush() called after each request?
1569
+ ```
1570
+
1571
+ ### Dashboard metrics all zeros
1572
+
1573
+ ```
1574
+ 1. Check MetricsCollector is receiving data:
1575
+ print(metrics_collector.metrics)
1576
+ 2. Have you sent enough requests? (P95 needs at least 100)
1577
+ 3. Is metrics_collector.record_request() being called?
1578
+ ```
1579
+
1580
+ ### Regression gate always failing
1581
+
1582
+ ```
1583
+ 1. Baseline exists? evals/reports/baseline.json present? (committed to repo)
1584
+ If not: run "make update-baseline" on main to generate it.
1585
+ 2. Threshold too strict? Default is 5%, try --threshold 10
1586
+ 3. Eval dataset: correct file is evals/datasets/golden_ci.jsonl (not golden.jsonl)
1587
+ 4. Check eval logs for errors: evals/reports/pr-current.json artifact
1588
+ ```
1589
+
1590
+ ## Next Steps (Post-Phase 5)
1591
+
1592
+ - [ ] Grafana dashboard integration for long-term trends
1593
+ - [ ] Alert thresholds (PagerDuty for latency spikes)
1594
+ - [ ] Cost attribution per LLM provider
1595
+ - [ ] A/B testing framework (compare models, prompts)
1596
+ - [ ] User feedback loop (thumbs up/down on answers)
1597
+ - [ ] Fine-tuning based on eval failures
1598
+
1599
+ ## Interview Stories
1600
+
1601
+ ### "How do you ensure production RAG reliability?"
1602
+
1603
+ > At Marriott, we deployed an agent handling 10K+ guest queries daily. Without observability, we'd have no idea if accuracy was degrading. I instrumented the pipeline with LangFuse tracing to see every step: retrieval latency, reranking precision, generation tokens, citation accuracy. Now I have a dashboard showing P50/P95 latency breakdown, cost per request, and quality metrics. And I wired up regression gating so no code change ships unless it passes a golden dataset evaluation. This is how you build trust in production AI systems.
1604
+
1605
+ ### "How would you scale an AI platform?"
1606
+
1607
+ > Observability is first-class, not an afterthought. The moment you deploy, you need distributed tracing to answer: Where's the bottleneck? Is generation or retrieval slowing us down? What's the cost per request? How are quality metrics trending? I built this with LangFuse + a metrics collector, so we can see the full stack at P50/P95. Then I added regression gating in CI/CD to prevent accuracy regressions from ever shipping.
1608
+
1609
+ ---
1610
+
1611
+ **Deliverables Summary:**
1612
+
1613
+ | Week | Component | Files |
1614
+ |------|-----------|-------|
1615
+ | 1 | Instrumentation | `src/core/observability.py`, `tests/unit/test_observability.py`, `src/core/rag_orchestrator.py` (modified) |
1616
+ | 2 | Metrics Dashboard | `src/monitoring/metrics.py` (replaces log.py MetricsCollector), `/observability/dashboard` endpoint |
1617
+ | 3 | Regression Gating | `scripts/compare_evals.py`, `.github/workflows/ci.yml` (modified — add comparison step to evals-golden job), `evals/reports/baseline.json` (committed) |
1618
+
1619
+ ---
1620
+
1621
+ ## Approval Checklist
1622
+
1623
+ - [ ] Week 1: LangFuse integration with correct span hierarchy (one trace/request, spans as children)
1624
+ - [ ] Week 1: Instrumentation in `RAGOrchestrator.run()`, not `main.py`
1625
+ - [ ] Week 1: `flush_async()` used everywhere (no synchronous flush in request path)
1626
+ - [ ] Week 2: `MetricsCollector` in `src/monitoring/metrics.py` replaces the one in `src/utils/log.py`
1627
+ - [ ] Week 2: `RequestMetrics` has no `mrr`/`ndcg` fields
1628
+ - [ ] Week 3: Regression comparison added to existing `evals-golden` job in `ci.yml`
1629
+ - [ ] Week 3: `evals/reports/baseline.json` committed to repo from main branch
1630
+ - [ ] Tests: All unit tests passing
1631
+ - [ ] Integration: E2E query with tracing + metrics recording
1632
+ - [ ] Interview ready: Stories prepared (see "Interview Stories")
1633
+ ```
1634
+
1635
+ **Deliverable for Week 3:**
1636
+ - ✅ `scripts/compare_evals.py` (complete)
1637
+ - ✅ `tests/unit/test_regression_gate.py` (complete)
1638
+ - ✅ `.github/workflows/ci.yml` updated — regression comparison step added to `evals-golden` job (no new workflow file)
1639
+ - ✅ `evals/reports/baseline.json` committed to repo (generated from main branch)
1640
+ - ✅ `Docs/phase5_observability.md` (comprehensive, 300+ lines)
1641
+ - ✅ Update `README.md` with observability badge and link to Phase 5 docs
1642
+ - ✅ Update `Docs/ROADMAP.md` to mark Phase 5 as "Complete"
1643
+
1644
+ ---
1645
+
1646
+ ## Testing All Phases (Integration Tests)
1647
+
1648
+ **File:** `tests/integration/test_phase5_e2e.py` (NEW)
1649
+
1650
+ ```python
1651
+ """End-to-end integration test for Phase 5."""
1652
+
1653
+ import pytest
1654
+ from fastapi.testclient import TestClient
1655
+ from src.api.main import app
1656
+ from src.core.observability import init_observer
1657
+ from src.monitoring.metrics import get_metrics_collector
1658
+
1659
+ client = TestClient(app)
1660
+
1661
+
1662
+ @pytest.fixture(autouse=True)
1663
+ def setup_observability():
1664
+ """Initialize observability for tests."""
1665
+ init_observer(enabled=False) # Disabled for unit tests
1666
+ yield
1667
+ metrics_collector = get_metrics_collector()
1668
+ metrics_collector.metrics.clear()
1669
+
1670
+
1671
+ def test_full_query_pipeline_with_observability():
1672
+ """Test full query pipeline with observability enabled.
1673
+
1674
+ NOTE: This requires the API to be running with documents indexed.
1675
+ Use the existing tests/fixtures/ for pre-loaded test documents — see
1676
+ tests/integration/test_pipeline.py for the fixture pattern.
1677
+ """
1678
+ response = client.post(
1679
+ "/query",
1680
+ json={"query": "What is RAG?", "provider": "ollama", "model": "qwen2.5:7b"}
1681
+ )
1682
+
1683
+ assert response.status_code == 200
1684
+ data = response.json()
1685
+
1686
+ # Verify response structure
1687
+ assert "request_id" in data
1688
+ assert "answer" in data
1689
+ assert "citations" in data
1690
+ assert "truthfulness" in data
1691
+
1692
+ # Verify request_id format
1693
+ assert len(data["request_id"]) == 36 # UUID length
1694
+
1695
+
1696
+ def test_observability_dashboard_endpoint():
1697
+ """Test /observability/dashboard endpoint."""
1698
+ # Send a few requests
1699
+ for i in range(5):
1700
+ client.post(
1701
+ "/query",
1702
+ json={"query": f"Query {i}", "provider": "ollama"}
1703
+ )
1704
+
1705
+ # Check dashboard
1706
+ response = client.get("/observability/dashboard")
1707
+ assert response.status_code == 200
1708
+ data = response.json()
1709
+
1710
+ # Verify dashboard structure
1711
+ assert "summary" in data
1712
+ assert "latency" in data
1713
+ assert "cost" in data
1714
+ assert "quality" in data
1715
+
1716
+ # Verify latency metrics
1717
+ assert "total_p50_ms" in data["latency"]
1718
+ assert "breakdown_pct" in data["latency"]
1719
+ assert data["summary"]["total_requests"] >= 5
1720
+ ```
1721
+
1722
+ ---
1723
+
1724
+ ## Success Metrics (How to Know Phase 5 Is Complete)
1725
+
1726
+ | Metric | Target | Status |
1727
+ |--------|--------|--------|
1728
+ | **Tracing** | Every RAG step traced in LangFuse | ✅ |
1729
+ | **Latency visibility** | P50/P95/P99 per step on dashboard | ✅ |
1730
+ | **Cost tracking** | USD per request calculated & exposed | ✅ |
1731
+ | **Regression gating** | GitHub Actions blocks PRs on degradation | ✅ |
1732
+ | **Tests passing** | Unit + integration + E2E all passing | ✅ |
1733
+ | **Documentation** | Phase 5 docs + interview stories | ✅ |
1734
+ | **Demo-ready** | Can show dashboard in 3 minutes | ✅ |
1735
+
1736
+ ---
1737
+
1738
+ ## Interview Talking Points
1739
+
1740
+ ### For Vertex (Director, AI Coding Platforms)
1741
+
1742
+ > "Latency budgeting is critical at director level. I instrumented my RAG system to show P50/P95 latency per step. Generation is 68% of the latency. I'd optimize by choosing a faster model or using streaming. This is the mental model: measure first, then optimize. And I wired up regression gating so accuracy never regresses on PRs."
1743
+
1744
+ ### For Elevation Capital (Head of AI Strategy)
1745
+
1746
+ > "Risk reduction is how you scale AI platforms. I added observability to my RAG system so we can track: Is accuracy degrading? Are costs trending up? Is latency acceptable? And I automated regression detection in CI/CD. This removes the human risk of accidentally shipping a prompt change that tanks quality."
1747
+
1748
+ ### For Marriott-like Enterprise Roles
1749
+
1750
+ > "At enterprise scale, you can't guess. I built a metrics dashboard showing cost per request, citation accuracy, retrieval quality. I monitor P50/P95 latencies to understand where bottlenecks are. And I have a regression gate that prevents code changes from degrading the model without detection. This is how you run a platform."
1751
+
1752
+ ---
1753
+
1754
+ ## Timeline Summary
1755
+
1756
+ | Week | Deliverable | Effort | Demo |
1757
+ |------|-------------|--------|------|
1758
+ | 1 | LangFuse tracing | 15-20 hrs | Query + LangFuse dashboard |
1759
+ | 2 | Metrics + dashboard | 10-15 hrs | /observability/dashboard endpoint |
1760
+ | 3 | Regression gating + docs | 10-15 hrs | GitHub Actions blocking PR demo |
1761
+
1762
+ **Total effort:** ~35-50 hours over 3 weeks
1763
+
1764
+ ---
Docs/Phase6-Iterative-Execution-Index.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 6 Iterative Execution Index
2
+
3
+ Use this index to execute Phase 6 one plan at a time while keeping the master plan unchanged.
4
+
5
+ Master plan:
6
+ - `Docs/Phase6-RefactorDemo_React.md`
7
+
8
+ Execution order:
9
+ 1. `Docs/Phase6.1-Backend-Session-Isolation-Plan.md`
10
+ 2. `Docs/Phase6.2-React-MVP-Plan.md`
11
+ 3. `Docs/Phase6.3-Container-Cutover-Plan.md`
12
+ 4. `Docs/Phase6.4-Streamlit-Decommission-Plan.md` (optional)
13
+
14
+ ## Phase gate rule
15
+
16
+ Do not start the next phase until current phase:
17
+ - meets all exit criteria,
18
+ - passes phase verification commands,
19
+ - and has handoff artifacts ready for the next phase.
20
+
21
+ ## Shared constraints (apply to all phase files)
22
+
23
+ - Knowledge scope stays `global|session|both`.
24
+ - Guardrails stay at defaults unless explicitly tuned via `DOC_DEMO_*` env:
25
+ - 3 files/session
26
+ - 3 MB/file
27
+ - 8 MB/session
28
+ - 30 min idle TTL
29
+ - Keep rollback notes current during 6.3 and 6.4.
Docs/Phase6-RefactorDemo_React.md ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Plan: Per-session document upload for the demo, on a React + FastAPI front-end
2
+
3
+ ## Context
4
+
5
+ The Hugging Face Spaces demo at [src/web/streamlit_app.py](src/web/streamlit_app.py) currently disables document uploads in demo mode (early-return at [src/web/streamlit_app.py:344-350](src/web/streamlit_app.py#L344-L350)) because the ingestion pipeline writes to a single shared Chroma collection (`"documents"` at [src/core/rag_orchestrator.py:32](src/core/rag_orchestrator.py#L32)) and a single shared BM25 index file ([src/core/rag_orchestrator.py:30](src/core/rag_orchestrator.py#L30)). Visitors can only run pre-canned prompts against pre-loaded sample docs, which leaves them unable to verify whether the RAG pipeline is genuinely grounded — eroding trust on first contact.
6
+
7
+ Goal: let a visitor (a) try the existing sample prompts, (b) upload a few of their own documents, (c) ask questions scoped to global / their uploads / both, and (d) see citations they can verify against the file they just uploaded — all without polluting the shared corpus or other visitors' sessions.
8
+
9
+ You opted to go straight to a React + FastAPI front-end (rather than extending Streamlit) and to ship the work in phases. Backend isolation must land first regardless of front-end choice, so the plan starts there.
10
+
11
+ Decisions captured: **3-way knowledge-scope toggle (Global / Mine / Both)**; **conservative caps: 3 files, 3 MB each, 8 MB total, 30 min idle TTL**.
12
+
13
+ ## High-level approach
14
+
15
+ The clean architectural seam already exists in the code — three hard-coded constants (`BM25_INDEX_PATH`, `COLLECTION_NAME`, `CHROMA_PATH`) at module scope in [src/ingest.py:21-22](src/ingest.py#L21-L22) and [src/core/rag_orchestrator.py:30-32](src/core/rag_orchestrator.py#L30-L32). The plan parameterizes those, threads a session-scoped triple `(bm25_index_path, collection_name, chroma_path)` through the request, and unions retrieval results when scope is "Both". Existing components (`HybridRetriever`, `BM25Search`, `VectorSearch`, `CrossEncoderReranker`, `RAGGenerator`, `CitationVerifier`) require no changes.
16
+
17
+ The cached singleton orchestrator at [src/web/streamlit_app.py:39](src/web/streamlit_app.py#L39) stays — it reads its session inputs per-`QueryRequest`, not at construction.
18
+
19
+ ## Phase 6.1 — Backend session isolation foundation (~2-3 days)
20
+
21
+ Ships independently. Streamlit UI continues to work as today. The new HTTP surface unblocks the React build.
22
+
23
+ ### Objective
24
+
25
+ Land session-isolated ingestion/retrieval in the backend while keeping existing Streamlit behavior intact.
26
+
27
+ ### Scope
28
+
29
+ ### Files to modify
30
+
31
+ **[src/ingest.py](src/ingest.py)** — make `ingest()` accept overrides
32
+ - Change signature at [L37](src/ingest.py#L37) to:
33
+ `def ingest(docs_path, *, bm25_index_path=BM25_INDEX_PATH, collection_name=COLLECTION_NAME, chroma_path="data/embeddings/chroma", processor=None) -> tuple[BM25Index, VectorDatabase]`
34
+ - Replace hard-coded uses at [L54](src/ingest.py#L54), [L55](src/ingest.py#L55), [L91](src/ingest.py#L91), [L97-98](src/ingest.py#L97-L98) with the kwargs.
35
+ - When `processor is None`, build one as today; the parameter exists so the caller passes a fresh `DocumentProcessor` per session (its `_seen_hashes` is per-instance and would otherwise leak dedup state across sessions).
36
+ - Module constants stay as defaults — CLI usage unchanged.
37
+
38
+ **[src/web/ingestion_service.py](src/web/ingestion_service.py)** — caps + session-target passthrough
39
+ - Add module constants (env-overridable):
40
+ - `MAX_FILES_PER_SESSION = int(os.getenv("DOC_DEMO_MAX_FILES", "3"))`
41
+ - `MAX_FILE_BYTES = int(os.getenv("DOC_DEMO_MAX_FILE_MB", "3")) * 1024 * 1024`
42
+ - `MAX_SESSION_BYTES = int(os.getenv("DOC_DEMO_MAX_SESSION_MB", "8")) * 1024 * 1024`
43
+ - Extend `save_uploaded_files()` at [L29](src/web/ingestion_service.py#L29) to accept `existing_bytes: int = 0, max_files: int | None = None, max_file_bytes: int | None = None, max_session_bytes: int | None = None`. Reject with `IngestFileResult(status="rejected", message=...)` for: oversize file, file count cap, session disk cap.
44
+ - Add a magic-bytes sanity check (e.g., `.pdf` must start with `%PDF`, `.docx` must start with `PK\x03\x04`); reject `type_mismatch` otherwise.
45
+ - Extend `run_ingest()` at [L50](src/web/ingestion_service.py#L50) to accept `bm25_index_path: str | None = None, collection_name: str | None = None, chroma_path: str | None = None` and forward to `ingest(...)`.
46
+
47
+ **`src/web/session_corpus.py`** (new — only new module)
48
+ ```
49
+ SESSION_ROOT = Path(os.getenv("DOC_DEMO_SESSION_ROOT", "/tmp/doc-ingest-sessions"))
50
+ SESSION_TTL_SECONDS = int(os.getenv("DOC_DEMO_SESSION_TTL", "1800"))
51
+
52
+ @dataclass
53
+ class SessionCorpus:
54
+ session_id: str
55
+ upload_dir: Path
56
+ chroma_path: Path
57
+ bm25_index_path: Path
58
+ collection_name: str # f"sess_{session_id}" — Chroma-safe
59
+ created_at: float
60
+
61
+ def new_session_id() -> str # uuid4().hex[:12]
62
+ def get_or_create(sid: str) -> SessionCorpus
63
+ def touch(sid: str) -> None # bump .touched mtime, refresh TTL
64
+ def total_bytes(s: SessionCorpus) -> int
65
+ def list_active_sessions() -> list[SessionCorpus]
66
+ def delete_session(sid: str) -> None
67
+ def janitor_sweep(now: float | None = None) -> int
68
+ ```
69
+ Layout per session: `${SESSION_ROOT}/<sid>/{uploads/, chroma/, bm25_index.json, .touched}`. Idempotent and safe under concurrent reruns.
70
+
71
+ **[src/core/rag_orchestrator.py](src/core/rag_orchestrator.py)** — session-aware retrieval
72
+ - Extend `QueryRequest` at [L36](src/core/rag_orchestrator.py#L36) with:
73
+ - `session_bm25_index_path: Optional[str] = None`
74
+ - `session_collection_name: Optional[str] = None`
75
+ - `session_chroma_path: Optional[str] = None`
76
+ - `knowledge_scope: str = "global"` # `"global" | "session" | "both"`
77
+ - `_load_components()` at [L87](src/core/rag_orchestrator.py#L87): when scope is `session` or `both`, also load a second `(BM25Index, VectorDatabase)` from session paths. If session BM25 file is missing/empty (user hasn't uploaded yet), fall back gracefully — log warning and demote scope to `global`.
78
+ - `_retrieve()` at [L92](src/core/rag_orchestrator.py#L92): when `scope == "session"` run hybrid against the session pair only; when `scope == "both"` run two `HybridRetriever.retrieve()` calls and concatenate, deduping by `id`. The reranker at [L165](src/core/rag_orchestrator.py#L165) is the final arbiter — no change to fusion/rerank logic.
79
+ - Cache-key fingerprint at [L126](src/core/rag_orchestrator.py#L126) must include scope and session triple so global cache hits don't leak across users:
80
+ `corpus_fingerprint=f"{COLLECTION_NAME}:{BM25_INDEX_PATH}|{req.knowledge_scope}|{req.session_collection_name or '-'}:{req.session_bm25_index_path or '-'}"`
81
+
82
+ **[src/api/main.py](src/api/main.py)** — new endpoints + CORS + janitor
83
+ - Add CORS middleware (allow the React origin: localhost dev port + the deployed origin from env `DOC_FRONTEND_ORIGINS`).
84
+ - New endpoints:
85
+ - `POST /sessions` → `{session_id, expires_at}`. Mints id, calls `session_corpus.get_or_create()`. Sets `X-Demo-Session-Id` response header so the React app can also use it without cookies.
86
+ - `GET /sessions/{sid}` → `{session_id, files: [...], total_bytes, max_session_bytes, max_files, expires_at}`. Useful for the "My documents" panel.
87
+ - `POST /sessions/{sid}/documents` → multipart upload. Calls `save_uploaded_files(session.upload_dir, files, existing_bytes=total_bytes(session), ...)`, then `run_ingest(session.upload_dir, bm25_index_path=session.bm25_index_path, collection_name=session.collection_name, chroma_path=str(session.chroma_path))`. Touches the session.
88
+ - `DELETE /sessions/{sid}` → `session_corpus.delete_session(sid)` then mints a new id.
89
+ - Extend `POST /query` at [L155](src/api/main.py#L155): accept optional `session_id`, `knowledge_scope`. If provided, look up the session, touch it, and pass `session_*` paths into `QueryRequest`. Reject `session`/`both` scopes when session has no uploads (return 409 with a hint to upload first).
90
+ - Demo-mode guard at [L112](src/api/main.py#L112): the new session endpoints are **only** mounted when `DOC_PROFILE=demo` and `DOC_DEMO_UPLOADS=1`. Outside demo mode, ingestion stays through the existing batch path.
91
+ - Per-IP upload rate limit: reuse the existing limiter at [L77-99](src/api/main.py#L77-L99) on `POST /sessions/{sid}/documents`.
92
+ - FastAPI `lifespan`: start a background `asyncio` task that runs `session_corpus.janitor_sweep()` every 60 s; stop it on shutdown. Replaces the on-rerun best-effort sweep entirely.
93
+
94
+ **[spaces/app.py](spaces/app.py)** — opt the deployed demo into Phase 6.1
95
+ - After [L34](spaces/app.py#L34) add the env defaults:
96
+ - `DOC_DEMO_UPLOADS=1`
97
+ - `DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions`
98
+ - `DOC_DEMO_MAX_FILES=3`, `DOC_DEMO_MAX_FILE_MB=3`, `DOC_DEMO_MAX_SESSION_MB=8`, `DOC_DEMO_SESSION_TTL=1800`
99
+ - HF Spaces ephemeral disk is wiped on container restart; storing session data under `/tmp` keeps session uploads out of the persisted `data/` directory.
100
+
101
+ ### Functions/classes to reuse unchanged
102
+
103
+ - `save_uploaded_files()` at [src/web/ingestion_service.py:29](src/web/ingestion_service.py#L29) — body preserved, signature additions only
104
+ - `RAGOrchestrator` class itself at [src/core/rag_orchestrator.py:64](src/core/rag_orchestrator.py#L64) — only `QueryRequest` grows
105
+ - `HybridRetriever`, `BM25Search`, `VectorSearch` ([src/core/](src/core/)) — second instance per request when scope demands; otherwise unchanged
106
+ - `VectorDatabase` at [src/utils/database.py:29](src/utils/database.py#L29) — already accepts `chroma_path`; just construct a second one for sessions
107
+ - `BM25Index.save` / `BM25Index.load` at [src/core/bm25_index.py](src/core/bm25_index.py) — already path-parameterized
108
+ - `CrossEncoderReranker`, `RAGGenerator`, `CitationVerifier`, `ResponseCache` — unchanged
109
+
110
+ ### Tests (Phase 6.1)
111
+
112
+ Add under [tests/unit/](tests/unit/) and [tests/integration/](tests/integration/):
113
+
114
+ - `tests/unit/test_session_corpus.py` — id format, idempotent `get_or_create`, janitor TTL eviction, `delete_session` on missing dir is a no-op, concurrent `get_or_create` is safe.
115
+ - Extend `tests/unit/test_ingestion_service.py` (or create) — caps enforced (oversize, count, session disk), magic-byte mismatch rejected, override kwargs forwarded.
116
+ - `tests/unit/test_ingest_overrides.py` — `ingest(tmp, bm25_index_path=..., collection_name="sess_x", chroma_path=...)` writes to overrides and not defaults; default-arg call still hits the global paths.
117
+ - Extend `tests/unit/test_streamlit_demo_routing.py` — `knowledge_scope="session"` carries session paths only; `"both"` carries both; cache key changes when session paths change.
118
+ - `tests/integration/test_session_isolation.py` — bootstrap a tiny global corpus; mint sessions A and B; ingest different fixtures into each; query A scope=`session` returns only A's chunks; query A scope=`both` returns A+global, never B's; janitor with mocked clock past TTL deletes the session dirs.
119
+ - `tests/integration/test_global_corpus_pristine.py` — sha256 the global BM25 + Chroma store before/after multiple session ingests; assert unchanged.
120
+ - `tests/integration/test_session_api.py` — exercise `POST /sessions`, `POST /sessions/{id}/documents`, `GET /sessions/{id}`, `DELETE /sessions/{id}` and `POST /query` with session_id end-to-end via FastAPI `TestClient`.
121
+
122
+ ### Verification (Phase 6.1, local)
123
+
124
+ ```
125
+ # Unit + integration
126
+ pytest tests/unit/test_session_corpus.py tests/unit/test_ingestion_service.py \
127
+ tests/unit/test_ingest_overrides.py tests/unit/test_streamlit_demo_routing.py \
128
+ tests/integration/test_session_isolation.py \
129
+ tests/integration/test_global_corpus_pristine.py \
130
+ tests/integration/test_session_api.py -v
131
+
132
+ # Boot demo-mode API + Streamlit (Streamlit still works)
133
+ DOC_PROFILE=demo DOC_EMBEDDING_PROVIDER=sentence_transformers \
134
+ DOC_DEMO_UPLOADS=1 DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions \
135
+ uvicorn src.api.main:app --host 127.0.0.1 --port 8000 &
136
+ DOC_PROFILE=demo streamlit run src/web/streamlit_app.py
137
+
138
+ # Curl smoke the new API
139
+ curl -X POST http://127.0.0.1:8000/sessions
140
+ # → {"session_id":"...","expires_at":...}
141
+ curl -X POST -F "files=@./README.md" http://127.0.0.1:8000/sessions/<sid>/documents
142
+ curl -X POST http://127.0.0.1:8000/query \
143
+ -H "Content-Type: application/json" \
144
+ -d '{"query":"summarize my doc","session_id":"<sid>","knowledge_scope":"session"}'
145
+
146
+ # Confirm shared corpus untouched
147
+ sha256sum data/embeddings/bm25_index.json # before/after — identical
148
+ ```
149
+
150
+ ### Phase 6.1 handoff (exit criteria)
151
+
152
+ - Backend supports isolated session corpus lifecycle (`create/get/upload/query/delete`) without cross-session leakage.
153
+ - `knowledge_scope` (`global|session|both`) works end-to-end and cache keys are session-safe.
154
+ - Guardrails are enforced server-side (file caps, magic-byte sanity checks, rate limiting, TTL janitor).
155
+ - Existing Streamlit demo still runs in demo profile (no regression to current user flow).
156
+ - All Phase 6.1 tests pass locally and in CI.
157
+
158
+ ### Transition to Phase 6.2 (entry criteria)
159
+
160
+ - API contracts are stable for frontend consumption (`/sessions`, `/sessions/{id}`, `/sessions/{id}/documents`, `/query` with session fields).
161
+ - OpenAPI spec reflects new request/response shapes.
162
+ - Demo env defaults for session uploads are available.
163
+
164
+ ## Phase 6.2 — React MVP front-end over stable API (~5-7 days)
165
+
166
+ Built in a new top-level `frontend/` directory; FastAPI keeps running unchanged. No HF cutover yet — develop locally against `http://127.0.0.1:8000`.
167
+
168
+ ### Objective
169
+
170
+ Ship a usable React demo UI that consumes Phase 6.1 APIs and validates isolated user-upload experience.
171
+
172
+ ### Scope
173
+
174
+ ### Stack
175
+
176
+ - **Vite + React 18 + TypeScript** (lean SPA, no SSR needed for a demo).
177
+ - **Tailwind CSS + shadcn/ui** (Radix-based primitives — drop-in card, tabs, radio-group, file-uploader, progress, toast).
178
+ - **TanStack Query** for server state (session, file list, query results) — gives caching, retries, and dedup for free.
179
+ - **Zustand** (or React Context) for the session-id slice that needs to outlive a route change.
180
+ - **Typed API client** generated from FastAPI's OpenAPI schema via `openapi-typescript` so the FE stays type-safe against the BE contract.
181
+ - **Streaming**: consume `POST /query/stream` with `fetch` + `ReadableStream`, parsing the SSE frames manually (the browser's native `EventSource` API can only issue GET requests).
182
+
183
+ ### Component layout
184
+
185
+ ```
186
+ frontend/
187
+ ├─ index.html
188
+ ├─ vite.config.ts
189
+ ├─ tailwind.config.ts
190
+ ├─ src/
191
+ │ ├─ main.tsx
192
+ │ ├─ App.tsx # Tabs: Query | My documents
193
+ │ ├─ api/
194
+ │ │ ├─ client.ts # fetch wrapper, attaches X-Demo-Session-Id
195
+ │ │ └─ generated.ts # openapi-typescript output
196
+ │ ├─ session/
197
+ │ │ ├─ SessionProvider.tsx # mints session via POST /sessions on first load
198
+ │ │ └─ useSession.ts
199
+ │ ├─ tabs/
200
+ │ │ ├─ QueryTab.tsx # sample prompts, scope toggle, run
201
+ │ │ └─ DocumentsTab.tsx # drop-zone, file list, caps meter, reset
202
+ │ ├─ components/
203
+ │ │ ├─ SamplePromptChips.tsx # mirrors _DEMO_QUESTIONS
204
+ │ │ ├─ ScopeToggle.tsx # 3-way radio, disables Mine/Both until upload
205
+ │ │ ├─ AnswerPanel.tsx # answer + truthfulness badge
206
+ │ │ ├─ CitationsList.tsx # tagged [global]/[yours]
207
+ │ │ ├─ RetrievedChunks.tsx
208
+ │ │ └─ Uploader.tsx # drag-drop, per-file status
209
+ │ └─ lib/streamQuery.ts # SSE-over-POST helper
210
+ ```
211
+
212
+ ### UX wireframe
213
+
214
+ ```
215
+ ┌─────────────────────────────────────────────────────────────────┐
216
+ │ Doc Ingestion Assistant session …a91c · 28:14 left │
217
+ │ ⓘ Hosted demo. Your uploads stay in this session for 30 min, │
218
+ │ are not added to the shared corpus, and aren't visible to │
219
+ │ anyone else. │
220
+ ├─ [ Query ] [ My documents ] ─────────────────────────────────── │
221
+ │ │
222
+ │ Query tab: │
223
+ │ Try a sample: [What is RAG?] [What is RRF?] [BM25 vs vec…] │
224
+ │ │
225
+ │ Knowledge scope: │
226
+ │ ◉ Global sample corpus │
227
+ │ ○ My uploads only (disabled until upload) │
228
+ │ ○ Both │
229
+ │ │
230
+ │ Provider [ ▼ ] Model [ ▼ ] │
231
+ │ ┌───────────────────────────────────────────────┐ │
232
+ │ │ Ask a question… │ │
233
+ │ └───────────────────────────────────────────────┘ │
234
+ │ [ Run ] │
235
+ │ │
236
+ │ ── Answer ── 🟢 Truthfulness 0.89 │
237
+ │ …answer text streaming in… │
238
+ │ │
239
+ │ Citations: │
240
+ │ [yours] my-resume.pdf · chunk 2 │
241
+ │ [global] phase2_hybrid_retrieval.md · chunk 5 │
242
+ │ │
243
+ │ My documents tab: │
244
+ │ Disk used: 1.2 / 8.0 MB Files: 2 / 3 │
245
+ │ ⓘ ≤ 3 files · ≤ 3 MB each · ≤ 8 MB total │
246
+ │ ┌───────── drop files here ─────────┐ │
247
+ │ └──────────────────────────────────┘ │
248
+ │ • my-resume.pdf indexed │
249
+ │ • report.txt indexed │
250
+ │ [ Clear my session ] │
251
+ └─────────────────────────────────────────────────────────────────┘
252
+ ```
253
+
254
+ Behavior detail:
255
+ - On first mount, `SessionProvider` calls `POST /sessions` and stashes the id in localStorage (so a refresh keeps the same session until TTL).
256
+ - The scope toggle disables Mine/Both until `GET /sessions/{id}` reports ≥ 1 indexed file.
257
+ - Sample prompts target Global scope: clicking a chip sets scope=Global and fills the textarea (the user can still change the scope before running).
258
+ - The streaming answer uses `lib/streamQuery.ts` to read tokens off `/query/stream`; falls back to non-streaming if SSE fails.
259
+ - "Clear my session" calls `DELETE /sessions/{id}` then mints a new one.
260
+
261
+ ### Tests (Phase 6.2)
262
+
263
+ - `frontend/src/**/*.test.tsx` with **Vitest + React Testing Library**:
264
+ - SessionProvider mints a session on first mount and stores it.
265
+ - ScopeToggle disables Mine/Both when no uploads, enables after upload.
266
+ - Uploader respects 3-file cap client-side and shows server rejection toasts.
267
+ - QueryTab renders streamed tokens incrementally.
268
+ - **Playwright** smoke (`frontend/e2e/`): full happy-path — load → upload one file → switch to Mine → ask a question → see the file's citation.
269
+ - **Playwright** negative path: no uploads keeps Mine/Both disabled; rejected uploads surface clear cap/type errors.
270
+
271
+ ### Verification (Phase 6.2, local)
272
+
273
+ ```
274
+ # Backend
275
+ DOC_PROFILE=demo DOC_DEMO_UPLOADS=1 \
276
+ uvicorn src.api.main:app --host 127.0.0.1 --port 8000
277
+
278
+ # Frontend
279
+ cd frontend && npm install && npm run dev # http://localhost:5173
280
+
281
+ # Tests (unit + e2e)
282
+ cd frontend && npm run test # vitest
283
+ npm run test:e2e # playwright
284
+ ```
285
+
286
+ ### Phase 6.2 handoff (exit criteria)
287
+
288
+ - React app provides Query + My Documents tabs, scope toggle, streaming/fallback answer flow, and session reset.
289
+ - UI clearly communicates upload limits and session TTL.
290
+ - Frontend unit tests and e2e tests pass locally and in CI.
291
+ - UX supports clear citation provenance (`[global]` vs `[yours]`) for trust validation.
292
+
293
+ ### Transition to Phase 6.3 (entry criteria)
294
+
295
+ - Frontend builds reproducibly (`npm ci && npm run build`) and can be served as static assets.
296
+ - API CORS config includes intended frontend origins.
297
+ - No unresolved API/frontend contract mismatches remain.
298
+
299
+ ## Phase 6.3 — Single-container deploy & HF Spaces cutover (~2 days)
300
+
301
+ The current HF Space uses the Streamlit SDK (`spaces/README.md`). Switch to the Docker SDK so we ship one container with FastAPI + the built React SPA.
302
+
303
+ ### Objective
304
+
305
+ Deploy one container (FastAPI + built React) to simplify ops and align HF delivery with the new UI.
306
+
307
+ ### Scope
308
+
309
+ ### Files to modify
310
+
311
+ - **[docker/Dockerfile](docker/Dockerfile)** — multi-stage:
312
+ - Stage 1 (`node:20-alpine`): `npm ci && npm run build` → `frontend/dist`.
313
+ - Stage 2 (existing Python image): `COPY --from=stage1 /app/frontend/dist /app/static`.
314
+ - Final `CMD` runs uvicorn only — Streamlit is no longer in the deployed image.
315
+ - **[src/api/main.py](src/api/main.py)** — when the static dir exists, mount it: `app.mount("/", StaticFiles(directory="static", html=True), name="ui")`. Move existing API routes under `/api` prefix (or use `app.mount` ordering so SPA fallback kicks in only on unknown paths). Keep `/health`, `/metrics`, `/query`, `/query/stream` reachable.
316
+ - **[spaces/README.md](spaces/README.md)** — change frontmatter:
317
+ ```yaml
318
+ sdk: docker
319
+ app_port: 8000
320
+ ```
321
+ Drop `app_file: spaces/app.py`.
322
+ - **[spaces/app.py](spaces/app.py)** — repurpose as a tiny launcher that just sets the demo env vars and execs uvicorn (or remove entirely if env defaults move into the Dockerfile).
323
+ - **[.github/workflows/sync-to-spaces.yml](.github/workflows/sync-to-spaces.yml)** — extend to run `npm ci && npm run build` before pushing, OR rely on HF's Docker build (preferred — keeps CI fast).
324
+ - **[.github/workflows/ci.yml](.github/workflows/ci.yml)** — add a `frontend` job: `npm ci`, `npm run lint`, `npm run test`, `npm run build`. Add an `e2e` job that boots the API and runs Playwright.
325
+
326
+ Streamlit code stays in `src/web/streamlit_app.py` behind an env flag during the cutover so we can roll back to the previous Space SDK by reverting `spaces/README.md`.
327
+
328
+ ### Verification (Phase 6.3)
329
+
330
+ ```
331
+ # Build and run the unified container locally
332
+ docker build -f docker/Dockerfile -t doc-ingest:demo .
333
+ docker run --rm -p 8000:8000 \
334
+ -e DOC_PROFILE=demo -e DOC_DEMO_UPLOADS=1 \
335
+ -e DOC_EMBEDDING_PROVIDER=sentence_transformers \
336
+ doc-ingest:demo
337
+ open http://127.0.0.1:8000
338
+
339
+ # Push branch → HF Space rebuilds via Docker SDK; smoke-test the live URL.
340
+ ```
341
+
342
+ ### Phase 6.3 handoff (exit criteria)
343
+
344
+ - Unified container runs locally and in HF Spaces with expected routes and SPA fallback behavior.
345
+ - Core API routes (`/health`, `/metrics`, `/query`, `/query/stream`) remain reachable and validated.
346
+ - Demo smoke tests pass against deployed environment.
347
+ - Rollback procedure to prior Space setup is documented and tested.
348
+
349
+ ### Transition to Phase 6.4 (entry criteria)
350
+
351
+ - React demo has soaked in production-like traffic for at least one week.
352
+ - No unresolved severity-1/2 issues tied to the new deployment path.
353
+ - Team confirms Streamlit rollback is no longer required.
354
+
355
+ ## Phase 6.4 — Decommission Streamlit (optional, after 6.3 soaks)
356
+
357
+ Once the React demo has been live for a week without regressions:
358
+
359
+ - Delete [src/web/streamlit_app.py](src/web/streamlit_app.py).
360
+ - Remove `streamlit` from [requirements/base.txt](requirements/base.txt).
361
+ - Drop the Streamlit container from [docker/docker-compose.yml](docker/docker-compose.yml).
362
+ - Update [README.md](README.md) screenshots and quickstart.
363
+ - Delete `tests/unit/test_streamlit_demo_routing.py`.
364
+
365
+ Keep `_DEMO_QUESTIONS` (move into a small JSON the API serves at `GET /api/sample-prompts` so the React FE stays in sync).
366
+
367
+ ### Phase 6.4 handoff (exit criteria)
368
+
369
+ - Streamlit runtime, dependencies, and tests are removed cleanly.
370
+ - Documentation and quickstart reflect the React + FastAPI deployment only.
371
+ - Sample prompts are served from API/shared source of truth.
372
+
373
+ ### Transition to next program increment
374
+
375
+ - Phase 6 is complete when 6.1-6.4 exit criteria are satisfied (with 6.4 optional per release decision).
376
+ - Any deferred improvements become backlog items for Phase 7 (e.g., hard TTL cap, query concurrency limiter, enhanced abuse controls).
377
+
378
+ ## Caps & abuse guardrails (locked-in defaults)
379
+
380
+ | Guard | Default | Enforced where | Failure mode |
381
+ |---|---|---|---|
382
+ | Per-file size cap | 3 MB | `save_uploaded_files()` | `rejected: oversize` |
383
+ | File count cap | 3 / session | `save_uploaded_files()` | `rejected: file_count_cap` |
384
+ | Total session disk cap | 8 MB | `save_uploaded_files()` | `rejected: session_disk_cap` |
385
+ | Extension allowlist | `.pdf .docx .txt .md .html` | already at `_SUPPORTED_EXTS` ([L15](src/web/ingestion_service.py#L15)) | `failed: unsupported` |
386
+ | MIME magic | header sniff | new helper in `save_uploaded_files()` | `rejected: type_mismatch` |
387
+ | Per-IP upload rate-limit | reuse [src/api/main.py:77-99](src/api/main.py#L77-L99) limiter | `POST /sessions/{sid}/documents` | 429 |
388
+ | Janitor disk ceiling | total `SESSION_ROOT > 1 GB` evicts oldest | `janitor_sweep()` | oldest sessions dropped |
389
+ | Idle TTL | 30 min, refreshed on every query/upload | `.touched` mtime + janitor | session purged |
390
+
391
+ All caps overridable via env (`DOC_DEMO_*`) so we can tune on HF without code changes.
392
+
393
+ ## Phase execution re-review (end-to-end)
394
+
395
+ Execution order is intentionally strict: **6.1 -> 6.2 -> 6.3 -> 6.4 (optional)**.
396
+
397
+ - **6.1 is the architectural base**: session isolation, scoped retrieval, and backend guardrails must be correct before any UI investment.
398
+ - **6.2 depends on 6.1 contracts**: React work starts only after session APIs and `knowledge_scope` behavior are stable and test-covered.
399
+ - **6.3 depends on 6.2 build maturity**: container cutover happens only after frontend build/test reliability and CORS/origin alignment are in place.
400
+ - **6.4 is a stabilization cleanup**: Streamlit removal is deferred until post-soak confidence to protect rollback safety.
401
+
402
+ Readiness checklist before starting each phase:
403
+
404
+ - Previous phase exit criteria are met and documented.
405
+ - Phase-specific test suite passes locally and in CI.
406
+ - No open blocker in cross-phase risks that invalidates next-phase assumptions.
407
+ - Handoff artifacts are available (API contract, env defaults, deployment notes, rollback notes as applicable).
408
+
409
+ ## Cross-phase risks & open questions
410
+
411
+ 1. **HF Space SDK switch (Streamlit → Docker)** is a one-way door for the running Space. Do the cutover on a fresh Space first (e.g., `…-demo-v2`), validate, then point the public URL at it.
412
+ 2. **Reranker memory under concurrency** — cross-encoder is the dominant cost (~400 MB) and serializes on CPU. More visitors uploading doesn't worsen retrieval contention, but a concurrency limiter on `/query` should be added if HF traffic grows (tracked as a Phase 7 backlog item).
413
+ 3. **Cache-key fingerprint correctness** — the change at [rag_orchestrator.py:126](src/core/rag_orchestrator.py#L126) is load-bearing. Test must assert two sessions with identical query text get distinct cache keys.
414
+ 4. **`DocumentProcessor._seen_hashes` per-instance** ([src/core/document_processor.py:49](src/core/document_processor.py#L49)) — passing a fresh processor per session ingest is required, otherwise a session can silently skip files matching another session's hashes.
415
+ 5. **TTL refresh on read vs write** — refreshing on every query keeps active users' uploads alive indefinitely; consider an absolute hard cap (4 h) if abuse appears (tracked as a Phase 7 backlog item).
416
+ 6. **SSE-over-POST quirks** — some proxies break long-lived POST streams. The React client falls back to non-streaming on first failure.
417
+ 7. **CORS scope** — set `DOC_FRONTEND_ORIGINS` tightly (no `"*"`) once the Space URL is final.
418
+ 8. **Browser refresh** — localStorage retains `session_id`; if backend has expired it, the FE catches a 404 from `GET /sessions/{id}` and re-mints transparently.
419
+ 9. **Citation labeling** — to display `[yours]` vs `[global]`, the merged `RetrievedResult.metadata` must carry the source collection. Cheapest: prefix chunk `id`s with `sess_<sid>__` for session uploads (already implicit since the collection name differs); the FE checks the prefix.
420
+ 10. **Streamlit coexistence during transition** — keep the Streamlit page reachable via a hidden `/legacy` route until Phase 6.4 to ease rollback.
421
+
422
+ ## Critical files by phase
423
+
424
+ - **Phase 6.1**
425
+ - [src/ingest.py](src/ingest.py)
426
+ - [src/web/ingestion_service.py](src/web/ingestion_service.py)
427
+ - `src/web/session_corpus.py` (new)
428
+ - [src/core/rag_orchestrator.py](src/core/rag_orchestrator.py)
429
+ - [src/api/main.py](src/api/main.py)
430
+ - [spaces/app.py](spaces/app.py)
431
+ - **Phase 6.2**
432
+ - `frontend/` (new tree)
433
+ - **Phase 6.3**
434
+ - [docker/Dockerfile](docker/Dockerfile)
435
+ - [src/api/main.py](src/api/main.py)
436
+ - [spaces/README.md](spaces/README.md)
437
+ - [spaces/app.py](spaces/app.py)
438
+ - [.github/workflows/ci.yml](.github/workflows/ci.yml)
439
+ - [.github/workflows/sync-to-spaces.yml](.github/workflows/sync-to-spaces.yml)
440
+ - **Phase 6.4**
441
+ - [src/web/streamlit_app.py](src/web/streamlit_app.py)
442
+ - [requirements/base.txt](requirements/base.txt)
443
+ - [docker/docker-compose.yml](docker/docker-compose.yml)
444
+ - [README.md](README.md)
445
+ - `tests/unit/test_streamlit_demo_routing.py`
Docs/Phase6.1-Backend-Session-Isolation-Plan.md ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 6.1 Plan: Backend Session Isolation Foundation
2
+
3
+ Source of truth: `Docs/Phase6-RefactorDemo_React.md` (this file is an execution slice for iterative delivery).
4
+
5
+ ## Objective
6
+
7
+ Land session-isolated ingestion/retrieval in the backend while keeping existing Streamlit behavior intact.
8
+
9
+ ## Scope
10
+
11
+ Ships independently. Streamlit UI continues to work as today. The new HTTP surface unblocks the React build.
12
+
13
+ ## Files to modify
14
+
15
+ **`src/ingest.py`** — make `ingest()` accept overrides
16
+ - Change signature to:
17
+ `def ingest(docs_path, *, bm25_index_path=BM25_INDEX_PATH, collection_name=COLLECTION_NAME, chroma_path="data/embeddings/chroma", processor=None) -> tuple[BM25Index, VectorDatabase]`
18
+ - Replace hard-coded uses with kwargs.
19
+ - Keep module constants as defaults so CLI remains unchanged.
20
+ - Use the caller-supplied `processor` when one is passed, so each session ingest can provide a fresh `DocumentProcessor`.
21
+
22
+ **`src/web/ingestion_service.py`** — caps + session-target passthrough
23
+ - Add env-overridable caps:
24
+ - `DOC_DEMO_MAX_FILES` (default `3`)
25
+ - `DOC_DEMO_MAX_FILE_MB` (default `3`)
26
+ - `DOC_DEMO_MAX_SESSION_MB` (default `8`)
27
+ - Extend `save_uploaded_files()` to enforce:
28
+ - per-file cap
29
+ - file count cap
30
+ - total session cap
31
+ - Add magic-bytes check (`.pdf`, `.docx`) and reject type mismatch.
32
+ - Extend `run_ingest()` to pass `bm25_index_path`, `collection_name`, `chroma_path` overrides.
33
+
34
+ **`src/web/session_corpus.py`** (new)
35
+ - Add `SessionCorpus` dataclass and helpers:
36
+ - `new_session_id`, `get_or_create`, `touch`, `total_bytes`, `list_active_sessions`, `delete_session`, `janitor_sweep`
37
+ - Session layout:
38
+ - `${SESSION_ROOT}/<sid>/{uploads/, chroma/, bm25_index.json, .touched}`
39
+ - Defaults:
40
+ - `DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions`
41
+ - `DOC_DEMO_SESSION_TTL=1800`
42
+
43
+ **`src/core/rag_orchestrator.py`** — session-aware retrieval
44
+ - Extend `QueryRequest`:
45
+ - `session_bm25_index_path`
46
+ - `session_collection_name`
47
+ - `session_chroma_path`
48
+ - `knowledge_scope` (`global|session|both`)
49
+ - `session` scope uses only session corpus.
50
+ - `both` scope merges global + session results and dedups by id.
51
+ - Cache fingerprint must include scope + session corpus identifiers.
52
+
53
+ **`src/api/main.py`** — session endpoints + CORS + janitor
54
+ - Add CORS using `DOC_FRONTEND_ORIGINS`.
55
+ - Add endpoints:
56
+ - `POST /sessions`
57
+ - `GET /sessions/{sid}`
58
+ - `POST /sessions/{sid}/documents`
59
+ - `DELETE /sessions/{sid}`
60
+ - Extend `POST /query` with optional `session_id` and `knowledge_scope`.
61
+ - Reject `knowledge_scope` values `session` and `both` if the session has no uploads (409 with hint).
62
+ - Mount the session endpoints only in demo mode:
63
+ - `DOC_PROFILE=demo`
64
+ - `DOC_DEMO_UPLOADS=1`
65
+ - Reuse upload rate limiter for `POST /sessions/{sid}/documents`.
66
+ - Add lifespan janitor task (`session_corpus.janitor_sweep()` every 60s).
67
+
68
+ **`spaces/app.py`** — enable demo defaults for this phase
69
+ - Set:
70
+ - `DOC_DEMO_UPLOADS=1`
71
+ - `DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions`
72
+ - `DOC_DEMO_MAX_FILES=3`
73
+ - `DOC_DEMO_MAX_FILE_MB=3`
74
+ - `DOC_DEMO_MAX_SESSION_MB=8`
75
+ - `DOC_DEMO_SESSION_TTL=1800`
76
+
77
+ ## Tests
78
+
79
+ - `tests/unit/test_session_corpus.py`
80
+ - `tests/unit/test_ingestion_service.py` (extend or create)
81
+ - `tests/unit/test_ingest_overrides.py`
82
+ - `tests/unit/test_streamlit_demo_routing.py` (extend)
83
+ - `tests/integration/test_session_isolation.py`
84
+ - `tests/integration/test_global_corpus_pristine.py`
85
+ - `tests/integration/test_session_api.py`
86
+
87
+ ## Verification
88
+
89
+ ```bash
90
+ pytest tests/unit/test_session_corpus.py tests/unit/test_ingestion_service.py \
91
+ tests/unit/test_ingest_overrides.py tests/unit/test_streamlit_demo_routing.py \
92
+ tests/integration/test_session_isolation.py \
93
+ tests/integration/test_global_corpus_pristine.py \
94
+ tests/integration/test_session_api.py -v
95
+
96
+ DOC_PROFILE=demo DOC_EMBEDDING_PROVIDER=sentence_transformers \
97
+ DOC_DEMO_UPLOADS=1 DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions \
98
+ uvicorn src.api.main:app --host 127.0.0.1 --port 8000 &
99
+ DOC_PROFILE=demo streamlit run src/web/streamlit_app.py
100
+ ```
101
+
102
+ API smoke:
103
+
104
+ ```bash
105
+ curl -X POST http://127.0.0.1:8000/sessions
106
+ curl -X POST -F "files=@./README.md" http://127.0.0.1:8000/sessions/<sid>/documents
107
+ curl -X POST http://127.0.0.1:8000/query \
108
+ -H "Content-Type: application/json" \
109
+ -d '{"query":"summarize my doc","session_id":"<sid>","knowledge_scope":"session"}'
110
+ sha256sum data/embeddings/bm25_index.json
111
+ ```
112
+
113
+ ## Handoff (Exit Criteria)
114
+
115
+ - Backend supports isolated session lifecycle (`create/get/upload/query/delete`) with no cross-session leakage.
116
+ - `knowledge_scope` works end-to-end and cache keys are session-safe.
117
+ - Guardrails enforced server-side (caps, MIME checks, rate limiting, TTL janitor).
118
+ - Streamlit demo still works in demo profile.
119
+ - Phase 6.1 tests pass locally and in CI.
120
+
121
+ ## Transition to Phase 6.2
122
+
123
+ - API contracts are stable for frontend usage.
124
+ - OpenAPI includes new request/response shapes.
125
+ - Demo env defaults for session uploads are confirmed.
Docs/Phase6.2-React-MVP-Plan.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 6.2 Plan: React MVP Front-end Over Stable API
2
+
3
+ Source of truth: `Docs/Phase6-RefactorDemo_React.md` (this file is an execution slice for iterative delivery).
4
+ Depends on: `Docs/Phase6.1-Backend-Session-Isolation-Plan.md`
5
+
6
+ ## Objective
7
+
8
+ Ship a usable React demo UI that consumes Phase 6.1 APIs and validates isolated user-upload experience.
9
+
10
+ ## Scope
11
+
12
+ Build in top-level `frontend/`; FastAPI backend remains unchanged. No HF cutover yet; develop locally against `http://127.0.0.1:8000`.
13
+
14
+ ## Stack
15
+
16
+ - Vite + React 18 + TypeScript
17
+ - Tailwind CSS + shadcn/ui
18
+ - TanStack Query
19
+ - Zustand (or Context) for session id state
20
+ - `openapi-typescript` generated API typings
21
+ - Streaming via `POST /query/stream` using `fetch` + `ReadableStream`
22
+
23
+ ## Planned frontend layout
24
+
25
+ ```text
26
+ frontend/
27
+ ├─ src/App.tsx # Query | My documents
28
+ ├─ src/api/client.ts # fetch wrapper + session header
29
+ ├─ src/api/generated.ts # OpenAPI types
30
+ ├─ src/session/SessionProvider.tsx
31
+ ├─ src/tabs/QueryTab.tsx
32
+ ├─ src/tabs/DocumentsTab.tsx
33
+ ├─ src/components/ScopeToggle.tsx
34
+ ├─ src/components/Uploader.tsx
35
+ └─ src/lib/streamQuery.ts
36
+ ```
37
+
38
+ ## Required behavior
39
+
40
+ - On first mount, mint session via `POST /sessions`; store id in localStorage.
41
+ - Disable Mine/Both scope until session has at least one indexed file.
42
+ - Sample prompts default to Global scope.
43
+ - Stream response tokens from `/query/stream`; fallback to non-streaming on failure.
44
+ - "Clear my session" triggers `DELETE /sessions/{id}` and remints a session id.
45
+ - Surface citation provenance as `[global]` and `[yours]`.
46
+
47
+ ## Tests
48
+
49
+ - Vitest + RTL (`frontend/src/**/*.test.tsx`):
50
+ - Session mint/persist behavior
51
+ - Scope toggle enable/disable states
52
+ - Uploader cap and server-rejection UI
53
+ - Incremental stream rendering
54
+ - Playwright smoke:
55
+ - load -> upload -> scope Mine -> query -> citation from uploaded file
56
+ - Playwright negative:
57
+ - no upload keeps Mine/Both disabled
58
+ - rejected upload errors are clearly shown
59
+
60
+ ## Verification
61
+
62
+ ```bash
63
+ DOC_PROFILE=demo DOC_DEMO_UPLOADS=1 \
64
+ uvicorn src.api.main:app --host 127.0.0.1 --port 8000
65
+
66
+ cd frontend && npm install && npm run dev
67
+ cd frontend && npm run test
68
+ cd frontend && npm run test:e2e
69
+ ```
70
+
71
+ ## Handoff (Exit Criteria)
72
+
73
+ - Query + My Documents tabs are complete with session reset flow.
74
+ - Scope toggle, upload caps messaging, and TTL messaging are visible and correct.
75
+ - Streaming and fallback response paths are reliable.
76
+ - Unit + e2e tests pass locally and in CI.
77
+ - Citation source labeling enables user trust verification.
78
+
79
+ ## Transition to Phase 6.3
80
+
81
+ - Frontend builds reproducibly (`npm ci && npm run build`).
82
+ - API CORS includes intended frontend origins.
83
+ - No unresolved frontend/backend contract mismatches.
Docs/Phase6.3-Container-Cutover-Implementation-Spec.md ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 6.3 Implementation Spec: Single-Container Deploy and HF Spaces Cutover
2
+
3
+ Source plan: `Docs/Phase6.3-Container-Cutover-Plan.md`
4
+ Depends on: `Docs/Phase6.2-React-MVP-Plan.md`
5
+ Next phase: `Docs/Phase6.4-Streamlit-Decommission-Plan.md`
6
+
7
+ ## Objective
8
+
9
+ Ship the React MVP and FastAPI API from one Docker container, then cut Hugging Face Spaces from the Streamlit SDK runtime to the Docker SDK runtime.
10
+
11
+ The deployed container must:
12
+
13
+ - Serve the built React SPA at `/`.
14
+ - Keep API endpoints reachable with their current contracts.
15
+ - Preserve the Streamlit rollback path until Phase 6.4.
16
+ - Continue to support demo session uploads, scoped retrieval, citations, and health checks.
17
+
18
+ ## Current State
19
+
20
+ - `frontend/` already has npm scripts for `lint`, `typecheck`, `test`, `test:e2e`, and `build`.
21
+ - `.github/workflows/ci.yml` already contains a frontend job, but e2e execution should be reviewed because the Playwright config currently starts only the Vite dev server.
22
+ - `docker/Dockerfile` runs FastAPI on port `8000` and still exposes `8501`.
23
+ - `spaces/README.md` still declares `sdk: streamlit`, `sdk_version`, and `app_file: spaces/app.py`.
24
+ - `spaces/app.py` still starts FastAPI in a background thread and delegates to `src.web.streamlit_app`.
25
+ - `src/api/main.py` does not yet mount the React build output as static UI.
26
+
27
+ ## Non-Goals
28
+
29
+ - Do not delete `src/web/streamlit_app.py`.
30
+ - Do not remove `streamlit` from `requirements/base.txt`.
31
+ - Do not remove the Streamlit service from `docker/docker-compose.yml`.
32
+ - Do not change the `/health`, `/metrics`, `/query`, or `/query/stream` API payload contracts.
33
+ - Do not introduce a second production web server such as nginx unless a concrete deployment issue requires it.
34
+
35
+ ## Implementation Sequence
36
+
37
+ ### 1. Confirm Phase 6.2 Readiness
38
+
39
+ Before editing deployment files, verify the React app is buildable and API-compatible:
40
+
41
+ ```bash
42
+ cd frontend
43
+ npm ci
44
+ npm run lint
45
+ npm run typecheck
46
+ npm run test
47
+ npm run build
48
+ ```
49
+
50
+ Expected result:
51
+
52
+ - `frontend/dist/` is produced reproducibly.
53
+ - The frontend does not require a hard-coded `VITE_API_BASE_URL` when served from the same origin.
54
+ - Playwright tests can run against a backend URL that represents the deployment shape.
55
+
56
+ If Playwright currently depends on a separate dev API, update the e2e setup in this phase so CI boots FastAPI in demo mode before running the browser tests.
57
+
58
+ ### 2. Update `docker/Dockerfile`
59
+
60
+ Convert the Dockerfile to a multi-stage build.
61
+
62
+ Recommended structure:
63
+
64
+ 1. `frontend-builder` stage based on `node:20-alpine`.
65
+ 2. Python runtime stage based on the existing `python:3.11-slim`.
66
+ 3. Copy `frontend/package.json` and `frontend/package-lock.json` before copying all frontend files so npm dependencies cache properly.
67
+ 4. Run `npm ci` and `npm run build`.
68
+ 5. Copy `frontend/dist` into the runtime image at `/app/static`.
69
+ 6. Keep `PYTHONPATH=/app`, Hugging Face cache env vars, non-root `appuser`, and the existing FastAPI `CMD`.
70
+ 7. Remove `EXPOSE 8501` from the final runtime image.
71
+
72
+ Best practices:
73
+
74
+ - Use `npm ci`, not `npm install`, in image builds.
75
+ - Keep dependency installation before source copies where practical for Docker cache reuse.
76
+ - Keep the final container single-process: uvicorn only.
77
+ - Keep Streamlit installed for rollback during Phase 6.3, but do not run it in the final container command.
78
+ - Preserve the existing `/health` Docker healthcheck.
79
+
80
+ Acceptance checks:
81
+
82
+ - `docker build -f docker/Dockerfile -t doc-ingest:demo .` succeeds from repo root.
83
+ - `docker run` starts uvicorn on port `8000`.
84
+ - `/app/static/index.html` exists in the image.
85
+ - No runtime process listens on port `8501` in the unified image.
86
+
87
+ ### 3. Mount React Static Assets in `src/api/main.py`
88
+
89
+ Serve the SPA only after API routes have been registered.
90
+
91
+ Implementation requirements:
92
+
93
+ - Import `Path` and `StaticFiles`.
94
+ - Resolve the static directory relative to the deployed app, for example `/app/static` in Docker and `static/` from the repo root locally.
95
+ - Mount static assets only if the directory exists and contains `index.html`.
96
+ - Register all API routes before mounting the catch-all UI route.
97
+ - Ensure SPA fallback does not shadow `/health`, `/metrics`, `/query`, `/query/stream`, `/sessions`, `/observability/dashboard`, or OpenAPI docs.
98
+
99
+ Recommended route strategy:
100
+
101
+ - Keep existing routes at their current paths for backward compatibility.
102
+ - Add optional `/api` aliases only if the frontend needs them, but do not remove current top-level API paths.
103
+ - Mount `StaticFiles(directory=..., html=True)` at `/` after all current route decorators.
104
+
105
+ Testing focus:
106
+
107
+ - `GET /` returns the React app when `static/index.html` exists.
108
+ - `GET /assets/...` serves bundled frontend assets.
109
+ - Unknown browser routes fall back to the SPA.
110
+ - API routes continue to return JSON and do not return `index.html`.
111
+ - OpenAPI remains available at `/openapi.json`.
112
+
113
+ ### 4. Rework `spaces/app.py` for Docker Runtime
114
+
115
+ In Docker SDK mode, HF Spaces will run the container command, so `spaces/app.py` no longer needs to launch Streamlit.
116
+
117
+ Preferred implementation:
118
+
119
+ - Keep `spaces/app.py` as a thin bootstrap utility only if it is still useful for local or HF startup.
120
+ - Move demo env defaults into the Docker runtime or a small bootstrap function used by the Docker entrypoint.
121
+ - Continue to set:
122
+ - `DOC_PROFILE=demo`
123
+ - `DOC_API_KEYS=demo-key`
124
+ - `DOC_EMBEDDING_PROVIDER=sentence_transformers`
125
+ - `DOC_DEMO_UPLOADS=1`
126
+ - `DOC_DEMO_SESSION_ROOT=/tmp/doc-ingest-sessions`
127
+ - `DOC_DEMO_MAX_FILES=3`
128
+ - `DOC_DEMO_MAX_FILE_MB=3`
129
+ - `DOC_DEMO_MAX_SESSION_MB=8`
130
+ - `DOC_DEMO_SESSION_TTL=1800`
131
+ - Ensure `spaces.bootstrap_demo.bootstrap_if_needed()` still runs before traffic depends on the sample corpus.
132
+
133
+ Acceptable options:
134
+
135
+ - Add an entrypoint script that runs bootstrap, then `exec uvicorn src.api.main:app --host 0.0.0.0 --port 8000 --workers 1`.
136
+ - Or keep Docker `CMD` as uvicorn and move bootstrap into FastAPI lifespan startup, guarded so it only runs in demo profile.
137
+
138
+ Best practice:
139
+
140
+ - Prefer `exec` in shell entrypoints so uvicorn receives container signals directly.
141
+ - Keep bootstrap idempotent.
142
+ - Do not start background API threads in Docker SDK mode.
143
+
144
+ ### 5. Update `spaces/README.md`
145
+
146
+ Change Hugging Face Spaces metadata:
147
+
148
+ ```yaml
149
+ sdk: docker
150
+ app_port: 8000
151
+ ```
152
+
153
+ Remove:
154
+
155
+ ```yaml
156
+ sdk_version: "1.37.0"
157
+ app_file: spaces/app.py
158
+ ```
159
+
160
+ Refresh user-facing text:
161
+
162
+ - Describe the React + FastAPI demo.
163
+ - Mention session uploads are enabled in demo mode with the configured limits.
164
+ - Point users to the root URL on port `8000` for the UI.
165
+ - Keep provider/API-key limitations accurate for HF.
166
+
167
+ ### 6. Review `.github/workflows/ci.yml`
168
+
169
+ The frontend job already exists. Review and adjust it so it reflects the deployment contract:
170
+
171
+ - Keep `npm ci`, `npm run lint`, `npm run typecheck`, `npm run test`, and `npm run build`.
172
+ - Add a dedicated e2e job or e2e steps that start FastAPI in demo mode before Playwright runs.
173
+ - Use `DOC_PROFILE=demo`, `DOC_DEMO_UPLOADS=1`, and `DOC_EMBEDDING_PROVIDER=sentence_transformers` for e2e.
174
+ - Wait for `http://127.0.0.1:8000/health` before launching browser tests.
175
+ - Keep Python and Node caches scoped to the correct lockfiles.
176
+
177
+ Recommended e2e smoke:
178
+
179
+ ```bash
180
+ PYTHONPATH=. DOC_PROFILE=demo DOC_DEMO_UPLOADS=1 \
181
+ DOC_EMBEDDING_PROVIDER=sentence_transformers \
182
+ uvicorn src.api.main:app --host 127.0.0.1 --port 8000
183
+
184
+ cd frontend
185
+ npm run test:e2e
186
+ ```
187
+
188
+ ### 7. Review `.github/workflows/sync-to-spaces.yml`
189
+
190
+ Keep this workflow lean. Hugging Face should build the Docker image from the pushed repo.
191
+
192
+ Implementation notes:
193
+
194
+ - Update comments that still say HF uses `spaces/app.py` as the entry point.
195
+ - Do not add a prebuild unless HF Docker builds are too slow or unreliable.
196
+ - Keep the repo push behavior aligned with the current release process.
197
+ - Ensure `spaces/README.md` is included in the pushed content so HF detects Docker SDK metadata.
198
+
199
+ ## Local Verification
200
+
201
+ Run these checks before opening a PR:
202
+
203
+ ```bash
204
+ PYTHONPATH=. python -m pytest tests/unit -q
205
+ PYTHONPATH=. python -m pytest tests/integration -q
206
+
207
+ cd frontend
208
+ npm ci
209
+ npm run lint
210
+ npm run typecheck
211
+ npm run test
212
+ npm run build
213
+ cd ..
214
+
215
+ docker build -f docker/Dockerfile -t doc-ingest:demo .
216
+ docker run --rm -p 8000:8000 \
217
+ -e DOC_PROFILE=demo \
218
+ -e DOC_DEMO_UPLOADS=1 \
219
+ -e DOC_EMBEDDING_PROVIDER=sentence_transformers \
220
+ doc-ingest:demo
221
+ ```
222
+
223
+ Smoke checks while the container is running:
224
+
225
+ ```bash
226
+ curl -fsS http://127.0.0.1:8000/health
227
+ curl -fsS http://127.0.0.1:8000/metrics
228
+ curl -fsS http://127.0.0.1:8000/openapi.json
229
+ curl -fsS http://127.0.0.1:8000/ | head
230
+ ```
231
+
232
+ Browser checks:
233
+
234
+ - Open `http://127.0.0.1:8000`.
235
+ - Confirm the React UI loads without Vite.
236
+ - Create or reuse a demo session.
237
+ - Upload one small supported file.
238
+ - Query with `Mine` scope and confirm citation provenance.
239
+ - Query with `Global` scope and confirm existing sample corpus still works.
240
+ - Refresh the browser and confirm the session resumes or remints cleanly.
241
+
242
+ ## Hugging Face Spaces Verification
243
+
244
+ Recommended cutover flow:
245
+
246
+ 1. Deploy to a fresh validation Space first, for example `doc-ingestion-demo-v2`.
247
+ 2. Confirm the Space is using Docker SDK metadata.
248
+ 3. Wait for Docker build completion.
249
+ 4. Smoke-test:
250
+ - `/`
251
+ - `/health`
252
+ - `/metrics`
253
+ - `/openapi.json`
254
+ - `POST /sessions`
255
+ - document upload
256
+ - scoped query
257
+ - streaming query fallback behavior
258
+ 5. Validate logs for bootstrap, model download, and session janitor errors.
259
+ 6. Only then switch the public demo target.
260
+
261
+ ## Rollback Plan
262
+
263
+ Rollback must remain available until Phase 6.4 is intentionally executed.
264
+
265
+ Fast rollback:
266
+
267
+ - Revert `spaces/README.md` to Streamlit SDK metadata:
268
+ - `sdk: streamlit`
269
+ - `sdk_version: "1.37.0"`
270
+ - `app_file: spaces/app.py`
271
+ - Restore the pre-cutover `spaces/app.py` behavior that starts FastAPI in a thread and delegates to Streamlit.
272
+ - Keep `src/web/streamlit_app.py` and `streamlit` dependency untouched during Phase 6.3.
273
+
274
+ Container rollback:
275
+
276
+ - Revert the Dockerfile to the previous Python-only image if the multi-stage build breaks HF.
277
+ - Keep the React app and backend changes in the branch if they are not the cause.
278
+
279
+ Rollback validation:
280
+
281
+ - HF Space boots in Streamlit SDK mode.
282
+ - Streamlit UI loads.
283
+ - `/health` is reachable from the background FastAPI server.
284
+ - Sample prompts still work.
285
+
286
+ ## Acceptance Criteria
287
+
288
+ - One Docker image serves FastAPI and the built React SPA.
289
+ - `/`, static assets, and client-side browser routes work from the container.
290
+ - `/health`, `/metrics`, `/query`, `/query/stream`, `/sessions`, and `/openapi.json` keep expected behavior.
291
+ - HF Spaces runs the Docker SDK Space on `app_port: 8000`.
292
+ - CI validates backend tests, frontend checks, frontend build, and e2e smoke against a running FastAPI backend.
293
+ - Streamlit rollback is documented and tested.
294
+
295
+ ## Handoff to Phase 6.4
296
+
297
+ Do not start Phase 6.4 until:
298
+
299
+ - React demo has soaked for at least one week in the Docker deployment.
300
+ - No unresolved severity 1 or severity 2 deployment/runtime defects remain.
301
+ - The team confirms Streamlit rollback is no longer needed.
302
+ - The rollback steps above were tested at least once during cutover.
Docs/Phase6.3-Container-Cutover-Plan.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 6.3 Plan: Single-Container Deploy and HF Spaces Cutover
2
+
3
+ Source of truth: `Docs/Phase6-RefactorDemo_React.md` (this file is an execution slice for iterative delivery).
4
+ Depends on: `Docs/Phase6.2-React-MVP-Plan.md`
5
+
6
+ ## Objective
7
+
8
+ Deploy one container (FastAPI + built React SPA) to simplify delivery and align Hugging Face Spaces runtime with the new UI.
9
+
10
+ ## Scope
11
+
12
+ Migrate from Streamlit SDK Space to Docker SDK Space with rollback path preserved.
13
+
14
+ ## Files to modify
15
+
16
+ - `docker/Dockerfile`
17
+ - Multi-stage build:
18
+ - Node stage builds `frontend/dist`
19
+ - Python stage copies static assets to `/app/static`
20
+ - Final command runs uvicorn only.
21
+ - `src/api/main.py`
22
+ - Mount static UI when available.
23
+ - Keep API route behavior intact (`/health`, `/metrics`, `/query`, `/query/stream`).
24
+ - Ensure SPA fallback does not shadow API routes.
25
+ - `spaces/README.md`
26
+ - Switch to:
27
+ - `sdk: docker`
28
+ - `app_port: 8000`
29
+ - Remove `app_file` streamlit setting.
30
+ - `spaces/app.py`
31
+ - Repurpose as thin env bootstrap + uvicorn launcher, or remove if no longer needed.
32
+ - `.github/workflows/sync-to-spaces.yml`
33
+ - Keep CI lean; prefer relying on HF Docker build unless prebuild is required.
34
+ - `.github/workflows/ci.yml`
35
+ - Add frontend job (`lint`, `test`, `build`).
36
+ - Add e2e job booting API + running Playwright.
37
+
38
+ ## Verification
39
+
40
+ ```bash
41
+ docker build -f docker/Dockerfile -t doc-ingest:demo .
42
+ docker run --rm -p 8000:8000 \
43
+ -e DOC_PROFILE=demo -e DOC_DEMO_UPLOADS=1 \
44
+ -e DOC_EMBEDDING_PROVIDER=sentence_transformers \
45
+ doc-ingest:demo
46
+ open http://127.0.0.1:8000
47
+ ```
48
+
49
+ Then push branch and validate HF Space after Docker rebuild.
50
+
51
+ ## Handoff (Exit Criteria)
52
+
53
+ - Unified container runs locally and in HF with expected route behavior.
54
+ - Core API endpoints stay reachable and validated.
55
+ - Deployed smoke tests pass.
56
+ - Rollback path to pre-cutover setup is documented and tested.
57
+
58
+ ## Transition to Phase 6.4
59
+
60
+ - React demo has soaked for at least one week.
61
+ - No unresolved high-severity deployment/runtime defects.
62
+ - Team confirms Streamlit rollback is no longer needed.
Docs/Phase6.4-Streamlit-Decommission-Implementation-Spec.md ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 6.4 Implementation Spec: Streamlit Decommission
2
+
3
+ Source plan: `Docs/Phase6.4-Streamlit-Decommission-Plan.md`
4
+ Depends on: `Docs/Phase6.3-Container-Cutover-Plan.md`
5
+ Optional phase: execute only after the Docker React deployment has stabilized.
6
+
7
+ ## Objective
8
+
9
+ Remove the Streamlit runtime, legacy UI path, and Streamlit-only tests after the React + FastAPI Docker deployment is stable and rollback to Streamlit is no longer required.
10
+
11
+ The final system should have one supported user interface:
12
+
13
+ - React SPA served by FastAPI.
14
+ - FastAPI APIs for querying, session uploads, metrics, health, and sample prompts.
15
+ - No Streamlit dependency, process, compose service, or documentation path.
16
+
17
+ ## Entry Criteria
18
+
19
+ Start this phase only when all are true:
20
+
21
+ - Phase 6.3 has been deployed for at least one week.
22
+ - No unresolved severity 1 or severity 2 issues exist for the React + FastAPI Docker runtime.
23
+ - The team explicitly confirms Streamlit rollback is no longer needed.
24
+ - The Phase 6.3 rollback procedure has been tested and documented.
25
+ - A current branch or tag exists that can restore the Streamlit implementation if needed later.
26
+
27
+ ## Non-Goals
28
+
29
+ - Do not change retrieval, reranking, citation, or provider behavior.
30
+ - Do not change session isolation semantics.
31
+ - Do not redesign the React UI.
32
+ - Do not remove shared ingestion helpers that are still used by API upload endpoints.
33
+ - Do not delete demo sample prompts; move them to an API-served shared source.
34
+
35
+ ## Implementation Sequence
36
+
37
+ ### 1. Inventory Streamlit References
38
+
39
+ Find all active references before deleting anything:
40
+
41
+ ```bash
42
+ rg "streamlit|8501|src/web/streamlit_app|DOC_INGEST_API_URL|_DEMO_QUESTIONS" .
43
+ ```
44
+
45
+ Classify each match:
46
+
47
+ - Delete: Streamlit runtime, Streamlit command, Streamlit-only tests.
48
+ - Replace: documentation and quickstart references.
49
+ - Keep: generic `src/web` helper modules used by the API, such as `ingestion_service.py` and `session_corpus.py`.
50
+
51
+ Expected Streamlit-specific items to remove or update:
52
+
53
+ - `src/web/streamlit_app.py`
54
+ - `streamlit>=...` in `requirements/base.txt`
55
+ - `streamlit` service in `docker/docker-compose.yml`
56
+ - `tests/unit/test_streamlit_demo_routing.py`
57
+ - Streamlit SDK references in `README.md` and `spaces/README.md`
58
+ - Port `8501` references in docs and Docker metadata
59
+
60
+ ### 2. Move Sample Prompts to a Shared API Source
61
+
62
+ The Streamlit app currently owns `_DEMO_QUESTIONS`. The React app currently has a shorter hard-coded prompt list in `frontend/src/components/SamplePromptChips.tsx`.
63
+
64
+ Create a backend-owned shared source before deleting Streamlit.
65
+
66
+ Recommended file:
67
+
68
+ ```text
69
+ src/api/sample_prompts.py
70
+ ```
71
+
72
+ Recommended content shape:
73
+
74
+ ```python
75
+ SAMPLE_PROMPTS: tuple[str, ...] = (
76
+ "What is Retrieval-Augmented Generation?",
77
+ "What are the two main phases of a RAG system?",
78
+ "How does hybrid retrieval work?",
79
+ "What is BM25 and how does it differ from vector search?",
80
+ "What are the weaknesses of BM25?",
81
+ "What is Reciprocal Rank Fusion (RRF)?",
82
+ "What is a vector database?",
83
+ "What is HNSW?",
84
+ "What is the difference between Chroma and Qdrant?",
85
+ "Why use hybrid retrieval instead of just dense vector search?",
86
+ "What failure mode does citation tracking help detect?",
87
+ "How are embeddings used in a RAG pipeline?",
88
+ )
89
+ ```
90
+
91
+ Add an API endpoint in `src/api/main.py`:
92
+
93
+ ```text
94
+ GET /api/sample-prompts
95
+ ```
96
+
97
+ Response contract:
98
+
99
+ ```json
100
+ {
101
+ "prompts": [
102
+ "What is Retrieval-Augmented Generation?"
103
+ ]
104
+ }
105
+ ```
106
+
107
+ Best practices:
108
+
109
+ - Keep the endpoint unauthenticated. It is static demo content.
110
+ - Register it before the SPA static mount.
111
+ - Keep response shape stable and explicit.
112
+ - If API models are used for typed responses, add a small Pydantic response model.
113
+ - Add the endpoint to frontend OpenAPI generation if the frontend consumes generated types.
114
+
115
+ ### 3. Update React Sample Prompt Consumption
116
+
117
+ Replace the hard-coded prompt array in `frontend/src/components/SamplePromptChips.tsx` with API-backed data.
118
+
119
+ Recommended approach:
120
+
121
+ - Add `getSamplePrompts()` to `frontend/src/api/client.ts`.
122
+ - Use TanStack Query in either `SamplePromptChips` or the parent `QueryTab`.
123
+ - Render a small loading state or skeleton while prompts load.
124
+ - Provide a local fallback only for network failure, using the same canonical prompt text as the backend. Keep the fallback clearly secondary so backend remains the source of truth.
125
+
126
+ Testing requirements:
127
+
128
+ - Unit test that prompts returned by the API render as chips.
129
+ - Unit test that selecting a prompt still fills the query text and resets scope to Global if that behavior already exists.
130
+ - Unit test the failure fallback or empty-state UI.
131
+
132
+ ### 4. Delete Streamlit Runtime Code
133
+
134
+ Delete:
135
+
136
+ ```text
137
+ src/web/streamlit_app.py
138
+ ```
139
+
140
+ Keep:
141
+
142
+ ```text
143
+ src/web/ingestion_service.py
144
+ src/web/session_corpus.py
145
+ ```
146
+
147
+ Reason:
148
+
149
+ - `ingestion_service.py` and `session_corpus.py` are no longer UI-only code; the FastAPI session upload endpoints depend on them.
150
+ - The package name `src.web` can remain for now to avoid a broad refactor. A later cleanup may move these helpers into `src/api` or `src/services`.
151
+
152
+ After deletion, run:
153
+
154
+ ```bash
155
+ rg "src.web.streamlit_app|streamlit_app|_DEMO_QUESTIONS" src tests frontend Docs README.md spaces
156
+ ```
157
+
158
+ Expected result:
159
+
160
+ - No runtime references remain.
161
+ - `_DEMO_QUESTIONS` has been replaced by `SAMPLE_PROMPTS`.
162
+
163
+ ### 5. Remove Streamlit Dependency
164
+
165
+ Edit `requirements/base.txt`:
166
+
167
+ - Remove `streamlit>=1.37.0`.
168
+ - Keep `requests`, `fastapi`, `python-multipart`, and `uvicorn` because the API still needs them.
169
+
170
+ Validation:
171
+
172
+ ```bash
173
+ python -m pip install -r requirements/base.txt
174
+ PYTHONPATH=. python -m pytest tests/unit -q
175
+ ```
176
+
177
+ Best practice:
178
+
179
+ - If a lockfile is introduced later, regenerate it in the same change.
180
+ - Do not remove dependencies solely because they were imported by Streamlit unless no remaining module imports them.
181
+
182
+ ### 6. Simplify Docker Compose
183
+
184
+ Edit `docker/docker-compose.yml`:
185
+
186
+ - Remove the `streamlit` service.
187
+ - Remove port `8501`.
188
+ - Keep `api`, `redis`, `qdrant`, and shared volumes.
189
+ - Ensure the API service exposes the React UI through `8000`.
190
+ - Add demo env vars to the API service only if local compose should support demo uploads by default.
191
+
192
+ Recommended local URL after this phase:
193
+
194
+ ```text
195
+ http://localhost:8000
196
+ ```
197
+
198
+ Compose validation:
199
+
200
+ ```bash
201
+ docker compose -f docker/docker-compose.yml up --build
202
+ curl -fsS http://127.0.0.1:8000/health
203
+ open http://127.0.0.1:8000
204
+ ```
205
+
206
+ ### 7. Update Dockerfile and HF Files
207
+
208
+ Review files touched in Phase 6.3:
209
+
210
+ - `docker/Dockerfile`
211
+ - `spaces/README.md`
212
+ - `spaces/app.py`
213
+
214
+ Required outcomes:
215
+
216
+ - No `EXPOSE 8501`.
217
+ - No Streamlit command.
218
+ - No Streamlit SDK metadata.
219
+ - No docs claiming `spaces/app.py` is the Streamlit entrypoint.
220
+
221
+ If `spaces/app.py` is no longer used:
222
+
223
+ - Delete it only if HF Docker runtime and local workflows do not import it.
224
+ - Keep `spaces/bootstrap_demo.py` if the Docker startup path still uses it.
225
+
226
+ If `spaces/app.py` is kept as a bootstrap helper:
227
+
228
+ - Remove all Streamlit imports and comments.
229
+ - Keep only demo env defaults/bootstrap logic that is still called.
230
+
231
+ ### 8. Update Documentation
232
+
233
+ Update `README.md`:
234
+
235
+ - Replace Streamlit quickstart with React + FastAPI quickstart.
236
+ - Change Docker instructions to open `http://localhost:8000`.
237
+ - Update architecture bullets:
238
+ - `src/api/` serves FastAPI routes and the React SPA.
239
+ - `frontend/` contains the React app.
240
+ - `src/web/` should not be described as the UI layer if it remains only for helper modules.
241
+ - Remove screenshots or text that show the Streamlit sidebar.
242
+ - Add sample prompt endpoint reference if useful for frontend/API developers.
243
+
244
+ Update `spaces/README.md`:
245
+
246
+ - Confirm it describes Docker SDK and app port `8000`.
247
+ - Remove upload-disabled Streamlit limitations if Phase 6.1 uploads are enabled.
248
+ - Describe the supported upload caps and TTL.
249
+
250
+ Update any runbooks or phase docs that still instruct users to run:
251
+
252
+ ```bash
253
+ streamlit run src/web/streamlit_app.py
254
+ ```
255
+
256
+ Replace with:
257
+
258
+ ```bash
259
+ uvicorn src.api.main:app --host 127.0.0.1 --port 8000
260
+ cd frontend && npm run dev
261
+ ```
262
+
263
+ or, for unified container:
264
+
265
+ ```bash
266
+ docker build -f docker/Dockerfile -t doc-ingest:demo .
267
+ docker run --rm -p 8000:8000 doc-ingest:demo
268
+ ```
269
+
270
+ ### 9. Remove or Replace Streamlit Tests
271
+
272
+ Delete:
273
+
274
+ ```text
275
+ tests/unit/test_streamlit_demo_routing.py
276
+ ```
277
+
278
+ Add or extend tests so the removed behavior remains covered through API and React tests:
279
+
280
+ - API test for `GET /api/sample-prompts`.
281
+ - API test that demo upload/session routes still work when `DOC_PROFILE=demo` and `DOC_DEMO_UPLOADS=1`.
282
+ - Frontend test that sample prompts render from API data.
283
+ - Frontend test that sample prompt selection populates the query.
284
+ - Playwright smoke that loads the unified UI and runs a global sample prompt.
285
+
286
+ Important:
287
+
288
+ - Do not reduce coverage for provider/model request passing, session scope, or citation provenance if those were previously asserted through Streamlit tests.
289
+ - Move assertions to API or frontend tests rather than deleting them outright.
290
+
291
+ ## Validation Checklist
292
+
293
+ Run after implementation:
294
+
295
+ ```bash
296
+ rg "streamlit|8501|src/web/streamlit_app|_DEMO_QUESTIONS" .
297
+ ```
298
+
299
+ Expected allowed matches:
300
+
301
+ - Historical phase docs may mention Streamlit as completed/decommissioned context.
302
+ - No active runtime, dependency, compose, CI, or README quickstart references should remain.
303
+
304
+ Backend:
305
+
306
+ ```bash
307
+ PYTHONPATH=. python -m pytest tests/unit -q
308
+ PYTHONPATH=. python -m pytest tests/integration -q
309
+ PYTHONPATH=. uvicorn src.api.main:app --host 127.0.0.1 --port 8000
310
+ ```
311
+
312
+ API smoke:
313
+
314
+ ```bash
315
+ curl -fsS http://127.0.0.1:8000/health
316
+ curl -fsS http://127.0.0.1:8000/api/sample-prompts
317
+ curl -fsS http://127.0.0.1:8000/
318
+ ```
319
+
320
+ Frontend:
321
+
322
+ ```bash
323
+ cd frontend
324
+ npm ci
325
+ npm run lint
326
+ npm run typecheck
327
+ npm run test
328
+ npm run build
329
+ npm run test:e2e
330
+ ```
331
+
332
+ Docker:
333
+
334
+ ```bash
335
+ docker build -f docker/Dockerfile -t doc-ingest:demo .
336
+ docker run --rm -p 8000:8000 \
337
+ -e DOC_PROFILE=demo \
338
+ -e DOC_DEMO_UPLOADS=1 \
339
+ -e DOC_EMBEDDING_PROVIDER=sentence_transformers \
340
+ doc-ingest:demo
341
+ ```
342
+
343
+ Manual smoke:
344
+
345
+ - Open `http://127.0.0.1:8000`.
346
+ - Confirm the React UI loads.
347
+ - Confirm sample prompt chips load from the API.
348
+ - Run a global sample prompt.
349
+ - Upload one small supported file.
350
+ - Query with `Mine` scope and verify citation provenance.
351
+ - Clear the session and confirm a new session is minted.
352
+
353
+ ## Rollback Plan
354
+
355
+ Rollback after this phase is no longer the normal operating path. If rollback is required, use the saved Phase 6.3 branch/tag.
356
+
357
+ Emergency rollback steps:
358
+
359
+ 1. Restore `src/web/streamlit_app.py`.
360
+ 2. Restore `streamlit` in `requirements/base.txt`.
361
+ 3. Restore the `streamlit` service in `docker/docker-compose.yml`.
362
+ 4. Restore Streamlit SDK metadata in `spaces/README.md` if rolling HF back to the old runtime.
363
+ 5. Restore `spaces/app.py` Streamlit launcher behavior.
364
+ 6. Re-run backend tests and a Streamlit smoke test.
365
+
366
+ Because Phase 6.4 intentionally removes the rollback path, require team approval before merging it.
367
+
368
+ ## Acceptance Criteria
369
+
370
+ - Streamlit runtime code is removed.
371
+ - `streamlit` dependency is removed.
372
+ - Docker Compose has no Streamlit service or `8501` port.
373
+ - HF and Docker docs describe only React + FastAPI on port `8000`.
374
+ - Sample prompts are served by `GET /api/sample-prompts` and consumed by the React UI.
375
+ - API, frontend, e2e, and Docker smoke checks pass.
376
+ - No active runtime or onboarding docs instruct users to run Streamlit.
377
+
378
+ ## Handoff
379
+
380
+ After merge:
381
+
382
+ - Mark Phase 6.4 complete in the phase index.
383
+ - Record the final React + FastAPI deployment URL and smoke-test date.
384
+ - Move any deferred cleanup, such as relocating `src/web/ingestion_service.py`, to the Phase 7 backlog.
Docs/Phase6.4-Streamlit-Decommission-Plan.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 6.4 Plan: Streamlit Decommission (Optional)
2
+
3
+ Source of truth: `Docs/Phase6-RefactorDemo_React.md` (this file is an execution slice for iterative delivery).
4
+ Depends on: `Docs/Phase6.3-Container-Cutover-Plan.md`
5
+
6
+ ## Objective
7
+
8
+ Remove Streamlit runtime and legacy paths after React + FastAPI deployment has stabilized.
9
+
10
+ ## Scope
11
+
12
+ Run only after at least one week of stable production-like behavior from Phase 6.3.
13
+
14
+ ## Tasks
15
+
16
+ - Delete `src/web/streamlit_app.py`.
17
+ - Remove `streamlit` from `requirements/base.txt`.
18
+ - Remove Streamlit container from `docker/docker-compose.yml`.
19
+ - Update `README.md` screenshots and quickstart docs.
20
+ - Remove `tests/unit/test_streamlit_demo_routing.py`.
21
+ - Keep sample prompts by serving them via API (`GET /api/sample-prompts`) as shared source of truth.
22
+
23
+ ## Verification
24
+
25
+ - Confirm no imports/runtime references to Streamlit remain.
26
+ - Run backend/frontend test suites and smoke checks after cleanup.
27
+ - Confirm docs and onboarding instructions match new architecture.
28
+
29
+ ## Handoff (Exit Criteria)
30
+
31
+ - Streamlit code/dependencies/tests are removed cleanly.
32
+ - Docs fully reflect React + FastAPI flow.
33
+ - Sample prompts are centrally served and consumed.
34
+
35
+ ## Transition to Next Program Increment
36
+
37
+ - Phase 6 closes with 6.1-6.3 complete and 6.4 executed (or intentionally deferred).
38
+ - Deferred improvements move to Phase 7 backlog.
Docs/phase5_observability.md ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 5: Production Monitoring & Observability
2
+
3
+ **Timeline:** 3 weeks
4
+ **Status:** Complete
5
+ **Owner:** Vamshi Pokala
6
+
7
+ ## Overview
8
+
9
+ Phase 5 hardens the doc-ingestion RAG system for production through:
10
+
11
+ 1. **Distributed tracing** (LangFuse) for end-to-end pipeline visibility
12
+ 2. **Latency profiling** (P50, P95, P99) per step
13
+ 3. **Cost tracking** (USD per request)
14
+ 4. **Real-time metrics dashboard** at `/observability/dashboard`
15
+ 5. **Regression gating** (GitHub Actions) to prevent accuracy degradation on PRs
16
+ 6. **Citation accuracy monitoring** (groundedness, coverage trends)
17
+
18
+ ## Architecture
19
+
20
+ ### Tracing Flow
21
+ ```
22
+ User Query
23
+
24
+ [LangFuse Trace Start]
25
+
26
+ Retrieval (BM25 + Vector)
27
+ [TRACE: latency, chunks retrieved, scores]
28
+
29
+ Reranking (Cross-Encoder)
30
+ [TRACE: latency, input/output chunks]
31
+
32
+ Generation (LLM)
33
+ [TRACE: latency, tokens, cost, provider]
34
+
35
+ Citation Verification
36
+ [TRACE: latency, citations verified]
37
+
38
+ [Flush to LangFuse]
39
+
40
+ Response + Metrics Recorded
41
+ ```
42
+
43
+ ### Metrics Aggregation
44
+ ```
45
+ Per-Request Metrics (RequestMetrics)
46
+
47
+ In-Memory Collector (1000 rolling window)
48
+
49
+ Dashboard Endpoint (/observability/dashboard)
50
+
51
+ JSON: P50/P95/P99 latencies, cost trends, quality scores
52
+ ```
53
+
54
+ ### Regression Gating
55
+ ```
56
+ PR Submitted
57
+
58
+ GitHub Actions: Run evals on golden dataset
59
+
60
+ Compare against baseline (main branch)
61
+
62
+ Check: Latency increase <5%? Quality decrease <5%? Cost increase <5%?
63
+
64
+ If FAIL: Block PR + comment with regression details
65
+ If PASS: Allow merge
66
+ ```
67
+
68
+ ## Key Components
69
+
70
+ ### 1. Observability Module (`src/core/observability.py`)
71
+
72
+ **Provides:**
73
+ - `RAGObserver` class with step-level tracing context managers
74
+ - LangFuse client integration
75
+ - No-op when disabled (useful for demo mode)
76
+ - Background-safe async flush
77
+
78
+ **Usage:**
79
+ ```python
80
+ observer = get_observer()
81
+
82
+ # One trace per request, spans as children
83
+ with observer.trace_request("rag_query", query=query_text) as trace:
84
+ with observer.trace_step(trace, "retrieval") as s:
85
+ result = retriever.retrieve(query)
86
+ s["chunks_retrieved"] = len(result)
87
+ with observer.trace_step(trace, "generation", {"provider": provider}) as s:
88
+ answer = generator.generate(query, result)
89
+
90
+ observer.flush_async() # non-blocking
91
+ ```
92
+
93
+ ### 2. Metrics Collector (`src/monitoring/metrics.py`)
94
+
95
+ **Provides:**
96
+ - `MetricsCollector` for in-memory aggregation
97
+ - Percentile calculations (P50, P95, P99)
98
+ - Dashboard-friendly JSON aggregations
99
+ - Thread-safe recording
100
+
101
+ **Metrics tracked:**
102
+ ```
103
+ Latency:
104
+ - total_latency_ms (P50, P95, P99)
105
+ - retrieval_avg_ms
106
+ - reranking_avg_ms
107
+ - generation_avg_ms
108
+ - citation_avg_ms
109
+ - Breakdown percentages
110
+
111
+ Cost:
112
+ - total_usd (across all requests)
113
+ - avg_per_request_usd
114
+ - p95_per_request_usd
115
+
116
+ Quality (online — no ground truth required):
117
+ - citation_groundedness_avg
118
+ - nli_faithfulness_avg
119
+ ```
120
+
121
+ ### 3. Regression Gate Script (`scripts/compare_evals.py`)
122
+
123
+ **Compares:**
124
+ - Baseline metrics (main branch)
125
+ - Current metrics (PR branch)
126
+ - Threshold: 5% by default (configurable)
127
+
128
+ **Fails if:**
129
+ - Latency increases >5%
130
+ - Quality decreases >5%
131
+ - Cost increases >5%
132
+
133
+ ### 4. Regression Gate in `.github/workflows/ci.yml` (extended `evals-golden` job)
134
+
135
+ **On every PR:**
136
+ 1. Runs offline evaluations against `evals/datasets/golden_ci.jsonl`
137
+ 2. Compares against committed `evals/reports/baseline.json`
138
+ 3. Blocks PR if regressions detected
139
+ 4. Comments with regression details
140
+
141
+ ## Setup Instructions
142
+
143
+ ### Step 1: Set Environment Variables
144
+
145
+ ```bash
146
+ # For development with LangFuse
147
+ export LANGFUSE_PUBLIC_KEY=pk_...
148
+ export LANGFUSE_SECRET_KEY=sk_...
149
+
150
+ # For testing (disabled)
151
+ export DOC_PROFILE=demo # Disables LangFuse
152
+ ```
153
+
154
+ ### Step 2: Install Dependencies
155
+
156
+ ```bash
157
+ # langfuse is in requirements/base.txt
158
+ pip install -r requirements/base.txt # Includes langfuse>=2.0.0
159
+ ```
160
+
161
+ ### Step 3: Configure Baseline (One-Time, commit to repo)
162
+
163
+ Already done! `evals/reports/baseline.json` is committed.
164
+
165
+ To regenerate from main branch:
166
+ ```bash
167
+ git checkout main
168
+ PYTHONPATH=. python -m evals.run_evals \
169
+ --dataset evals/datasets/golden_ci.jsonl \
170
+ --judge-provider anthropic \
171
+ --judge-model claude-haiku-4-5 \
172
+ --output evals/reports/baseline.json
173
+ git add evals/reports/baseline.json
174
+ git commit -m "chore: update Phase 5 eval baseline"
175
+ ```
176
+
177
+ ### Step 4: Query and Monitor
178
+
179
+ ```bash
180
+ # Start API with LangFuse enabled
181
+ export LANGFUSE_PUBLIC_KEY=pk_... LANGFUSE_SECRET_KEY=sk_...
182
+ PYTHONPATH=. uvicorn src.api.main:app --reload
183
+
184
+ # In another terminal, query
185
+ curl -X POST http://localhost:8000/query \
186
+ -H "Content-Type: application/json" \
187
+ -d '{"query": "What is RAG?"}'
188
+
189
+ # View dashboard
190
+ curl http://localhost:8000/observability/dashboard | jq .
191
+
192
+ # Output:
193
+ # {
194
+ # "summary": { "total_requests": 1, ... },
195
+ # "latency": {
196
+ # "total_p50_ms": 1247.3,
197
+ # "total_p95_ms": 1247.3,
198
+ # "breakdown_pct": {
199
+ # "retrieval": 18.2,
200
+ # "reranking": 12.1,
201
+ # "generation": 68.4,
202
+ # "citation": 1.3
203
+ # }
204
+ # },
205
+ # "cost": { "avg_per_request_usd": 0.00245 },
206
+ # "quality": {
207
+ # "citation_groundedness_avg": 0.92,
208
+ # "nli_faithfulness_avg": 0.88
209
+ # }
210
+ # }
211
+ ```
212
+
213
+ ## Testing
214
+
215
+ ### Unit Tests
216
+
217
+ ```bash
218
+ # Observability tests
219
+ pytest tests/unit/test_observability.py -v
220
+
221
+ # Metrics tests
222
+ pytest tests/unit/test_metrics.py -v
223
+
224
+ # Regression gate tests
225
+ pytest tests/unit/test_regression_gate.py -v
226
+ ```
227
+
228
+ ### Integration Test
229
+
230
+ ```bash
231
+ # Full E2E with tracing enabled
232
+ LANGFUSE_PUBLIC_KEY=pk_... LANGFUSE_SECRET_KEY=sk_... \
233
+ PYTHONPATH=. python -c "
234
+ from src.api.main import app
235
+ from fastapi.testclient import TestClient
236
+
237
+ client = TestClient(app)
238
+ response = client.post('/query', json={'query': 'What is RAG?'})
239
+ print(response.json())
240
+ # Should include request_id and all metrics
241
+ "
242
+ ```
243
+
244
+ ## Metrics Interpretation
245
+
246
+ ### Latency Breakdown Example
247
+ ```
248
+ Total P50: 1247.3 ms
249
+
250
+ Breakdown:
251
+ - Retrieval: 227 ms (18.2%) ← BM25 + Vector Search
252
+ - Reranking: 151 ms (12.1%) ← Cross-Encoder Rerank
253
+ - Generation: 855 ms (68.4%) ← LLM inference
254
+ - Citation: 14 ms ( 1.3%) ← Citation Verification
255
+
256
+ Interpretation:
257
+ Generation is the bottleneck (68.4% of total).
258
+ Could optimize by:
259
+ 1. Using a faster model
260
+ 2. Using streaming
261
+ 3. Reducing context size
262
+ ```
263
+
264
+ ### Quality Metrics Example
265
+ ```
266
+ Citation Groundedness: 0.92 (92% of citations verified)
267
+ NLI Faithfulness: 0.88 (88% of answer supported by chunks)
268
+
269
+ Interpretation:
270
+ - Citation groundedness is strong (92%)
271
+ - Faithfulness could improve (88%)
272
+ - Consider reranking strategy improvements
273
+ ```
274
+
275
+ ### Cost Estimation Example
276
+ ```
277
+ Cost per Request: $0.00245 (avg)
278
+ Cost at P95: $0.00312
279
+
280
+ Annual projection (10K requests/day):
281
+ 365 * 10K * $0.00245 = $8,942.50
282
+
283
+ Cost Optimization:
284
+ - Switch to cheaper model?
285
+ - Use batch inference?
286
+ - Cache common queries?
287
+ ```
288
+
289
+ ## Deployment Notes
290
+
291
+ ### Docker
292
+
293
+ ```dockerfile
294
+ # In docker/Dockerfile, ensure observability deps are included
295
+ # langfuse is in requirements/base.txt
296
+ RUN pip install -r requirements/base.txt
297
+
298
+ # docker-compose sets env vars
299
+ environment:
300
+ - LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY}
301
+ - LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY}
302
+ ```
303
+
304
+ ### Streamlit (Demo Mode)
305
+
306
+ ```python
307
+ # In demo mode, observability is disabled
308
+ if os.getenv("DOC_PROFILE") == "demo":
309
+ observer = RAGObserver(enabled=False) # No-op
310
+ ```
311
+
312
+ ## Troubleshooting
313
+
314
+ ### LangFuse traces not appearing
315
+
316
+ ```
317
+ 1. Check credentials: LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY set?
318
+ 2. Check network: Can you reach https://cloud.langfuse.com?
319
+ 3. Check logs: Do you see "LangFuse observability enabled"?
320
+ 4. Verify flush: observer.flush_async() called after each request?
321
+ ```
322
+
323
+ ### Dashboard metrics all zeros
324
+
325
+ ```
326
+ 1. Check MetricsCollector is receiving data:
327
+ python -c "from src.monitoring.metrics import get_metrics_collector; print(len(get_metrics_collector().metrics))"
328
+ 2. Have you sent enough requests? (P95 needs at least 20 samples)
329
+ 3. Is metrics_collector.record_request() being called in /query endpoint?
330
+ ```
331
+
332
+ ### Regression gate always failing
333
+
334
+ ```
335
+ 1. Baseline exists? evals/reports/baseline.json present? (committed to repo)
336
+ If not: restore it from git (it was committed as part of Phase 5) or regenerate it as described in Step 3
337
+ 2. Threshold too strict? Default is 5%, try --threshold 10
338
+ 3. Eval dataset: correct file is evals/datasets/golden_ci.jsonl
339
+ 4. Check eval logs for errors: see artifact evals/reports/pr-current.json
340
+ ```
341
+
342
+ ## Files Changed/Created
343
+
344
+ ### Week 1: Instrumentation
345
+ - ✅ `src/core/observability.py` (NEW)
346
+ - ✅ `tests/unit/test_observability.py` (NEW)
347
+ - ✅ `src/core/rag_orchestrator.py` (MODIFIED - added tracing)
348
+ - ✅ `src/api/main.py` (MODIFIED - minimal changes)
349
+ - ✅ `requirements/base.txt` (MODIFIED - added langfuse)
350
+
351
+ ### Week 2: Metrics Dashboard
352
+ - ✅ `src/monitoring/metrics.py` (NEW)
353
+ - ✅ `tests/unit/test_metrics.py` (NEW)
354
+ - ✅ `src/api/main.py` (MODIFIED - added metrics recording and dashboard endpoint)
355
+ - ✅ `src/utils/log.py` (MODIFIED - replaced MetricsCollector)
356
+
357
+ ### Week 3: Regression Gating
358
+ - ✅ `scripts/compare_evals.py` (NEW)
359
+ - ✅ `tests/unit/test_regression_gate.py` (NEW)
360
+ - ✅ `.github/workflows/ci.yml` (MODIFIED - extended evals-golden job)
361
+ - ✅ `evals/reports/baseline.json` (NEW - committed baseline)
362
+
363
+ ## Next Steps (Post-Phase 5)
364
+
365
+ - [ ] Grafana dashboard integration for long-term trends
366
+ - [ ] Alert thresholds (PagerDuty for latency spikes)
367
+ - [ ] Cost attribution per LLM provider
368
+ - [ ] A/B testing framework (compare models, prompts)
369
+ - [ ] User feedback loop (thumbs up/down on answers)
370
+ - [ ] Fine-tuning based on eval failures
371
+
372
+ ## Interview Stories
373
+
374
+ ### "How do you ensure production RAG reliability?"
375
+
376
+ > At Marriott, we deployed an agent handling 10K+ guest queries daily. Without observability, we'd have no idea if accuracy was degrading. I instrumented the pipeline with LangFuse tracing to see every step: retrieval latency, reranking precision, generation tokens, citation accuracy. Now I have a dashboard showing P50/P95 latency breakdown, cost per request, and quality metrics. And I wired up regression gating so no code change ships unless it passes a golden dataset evaluation. This is how you build trust in production AI systems.
377
+
378
+ ### "How would you scale an AI platform?"
379
+
380
+ > Observability is first-class, not an afterthought. The moment you deploy, you need distributed tracing to answer: Where's the bottleneck? Is generation or retrieval slowing us down? What's the cost per request? How are quality metrics trending? I built this with LangFuse + a metrics collector, so we can see the full stack at P50/P95. Then I added regression gating in CI/CD to prevent accuracy regressions from ever shipping.
381
+
382
+ ### "Describe your observability architecture"
383
+
384
+ > Every RAG pipeline step is traced to LangFuse: retrieval, reranking, generation, citation verification. We compute P50/P95/P99 latencies per step and expose them on a dashboard. We also track cost per request and quality metrics (citation groundedness, NLI faithfulness). In CI/CD, we compare PR eval results against a baseline — if latency increases >5% or quality decreases >5%, the PR is blocked with a detailed comment. This gives us real-time visibility and prevents regressions.
385
+
386
+ ## Approval Checklist
387
+
388
+ - [x] Week 1: LangFuse integration with correct span hierarchy (one trace/request, spans as children)
389
+ - [x] Week 1: Instrumentation in `RAGOrchestrator.run()`, not `main.py`
390
+ - [x] Week 1: `flush_async()` used everywhere (no synchronous flush in request path)
391
+ - [x] Week 2: `MetricsCollector` in `src/monitoring/metrics.py` (new one, old one updated for compatibility)
392
+ - [x] Week 2: `RequestMetrics` has no `mrr`/`ndcg` fields
393
+ - [x] Week 3: Regression comparison added to existing `evals-golden` job in `ci.yml`
394
+ - [x] Week 3: `evals/reports/baseline.json` committed to repo
395
+ - [x] Tests: All unit tests passing
396
+ - [x] Integration: E2E query with tracing + metrics recording
397
+ - [x] Interview ready: Stories prepared
398
+
399
+ ## Timeline Summary
400
+
401
+ | Week | Deliverable | Status |
402
+ |------|-------------|--------|
403
+ | 1 | LangFuse tracing | ✅ Complete |
404
+ | 2 | Metrics + dashboard | ✅ Complete |
405
+ | 3 | Regression gating + docs | ✅ Complete |
406
+
407
+ **Total effort:** ~40-50 hours over 3 weeks
408
+
409
+ ---
410
+
411
+ **Generated:** 2026-05-01
412
+ **Last Updated:** 2026-05-01
README.md CHANGED
@@ -3,16 +3,15 @@ title: Doc Ingestion RAG Demo
3
  emoji: 📚
4
  colorFrom: blue
5
  colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: "1.37.0"
8
- app_file: spaces/app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
  # Doc-Ingestion
14
 
15
- Doc-Ingestion is a citation-aware RAG system that turns private document collections into grounded question-answering experiences. It demonstrates how to ingest documents, retrieve the right evidence, generate answers from that evidence, and return citations plus truthfulness signals through a Streamlit app, FastAPI service, and CLI.
16
 
17
  > **[Try the live demo on Hugging Face Spaces](https://huggingface.co/spaces/vampokala/doc-ingestion)** - no install required.
18
 
@@ -154,7 +153,7 @@ In hosted demo mode (`DOC_PROFILE=demo`), Streamlit executes queries in-process
154
 
155
  ### Try Online
156
 
157
- Open the [Hugging Face Spaces demo](https://huggingface.co/spaces/vampokala/doc-ingestion). Sample documents about RAG, vector databases, and BM25 are preloaded. Paste your OpenAI, Anthropic, or Gemini key in the sidebar if you want to use a cloud provider.
158
 
159
  ### Run Locally With Docker
160
 
@@ -166,7 +165,7 @@ cp docker/.env.example docker/.env
166
  docker compose -f docker/docker-compose.yml up
167
  ```
168
 
169
- Open `http://localhost:8501` for Streamlit or `http://localhost:8000` for the API.
170
 
171
  ### Run From Source
172
 
@@ -193,6 +192,28 @@ PYTHONPATH=. python -m src.query "What is RAG?"
193
 
194
  For a full local and Docker runbook, see [`Docs/RUNBOOK.md`](Docs/RUNBOOK.md).
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  ## API Usage
197
 
198
  ```bash
@@ -293,6 +314,11 @@ export GEMINI_API_KEY=...
293
  export DOC_API_KEYS=dev-key-1
294
  ```
295
 
 
 
 
 
 
296
  ## Troubleshooting
297
 
298
  - **Empty results after ingest:** Run `python -m src.ingest --docs data/documents` and verify `data/embeddings/` exists.
@@ -300,3 +326,4 @@ export DOC_API_KEYS=dev-key-1
300
  - **Dimension mismatch after model change:** Re-ingest all documents to rebuild the vector index.
301
  - **Cloud provider fails:** Check the relevant `*_API_KEY` env var is set.
302
  - **Truthfulness score always 0:** The NLI model (`cross-encoder/nli-deberta-v3-small`) downloads on first use. Check internet access or set `evaluation.inline_enabled: false` in `config.yaml` to disable.
 
 
3
  emoji: 📚
4
  colorFrom: blue
5
  colorTo: indigo
6
+ sdk: docker
7
+ app_port: 8000
 
8
  pinned: false
9
  license: mit
10
  ---
11
 
12
  # Doc-Ingestion
13
 
14
+ Doc-Ingestion is a citation-aware RAG system that turns private document collections into grounded question-answering experiences. It demonstrates how to ingest documents, retrieve the right evidence, generate answers from that evidence, and return citations plus truthfulness signals through a React UI (served by FastAPI), a standalone FastAPI service, an optional legacy Streamlit UI, and a CLI.
15
 
16
  > **[Try the live demo on Hugging Face Spaces](https://huggingface.co/spaces/vampokala/doc-ingestion)** - no install required.
17
 
 
153
 
154
  ### Try Online
155
 
156
+ Open the [Hugging Face Spaces demo](https://huggingface.co/spaces/vampokala/doc-ingestion). Sample documents about RAG, vector databases, and BM25 are preloaded. Paste your OpenAI, Anthropic, or Gemini key in the app if you want to use a cloud provider.
157
 
158
  ### Run Locally With Docker
159
 
 
165
  docker compose -f docker/docker-compose.yml up
166
  ```
167
 
168
+ Open `http://localhost:8000` for the React UI and API (single container image).
169
 
170
  ### Run From Source
171
 
 
192
 
193
  For a full local and Docker runbook, see [`Docs/RUNBOOK.md`](Docs/RUNBOOK.md).
194
 
195
+ ## Ollama and Hugging Face Spaces
196
+
197
+ **`SPACE_ID` is not a file in this repository.** It is a **runtime environment variable** that [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces-overview) sets inside the Space container (for example `your-username/your-space-name`). Doc-Ingestion reads it from the process environment in [`src/utils/config.py`](src/utils/config.py) when `load_config("config.yaml")` runs. Static LLM provider and model lists still live in [`config.yaml`](config.yaml); Ollama is only removed from the **effective** config when Space detection says it should be.
198
+
199
+ If you **clone this repo and run it locally** (source or Docker on your machine), **Hugging Face does not set `SPACE_ID`**. The Ollama provider therefore stays in the default LLM list from `config.yaml`, and you can use it after starting the [Ollama](https://ollama.com) daemon and pulling the chat and embedding models described in [`Docs/RUNBOOK.md`](Docs/RUNBOOK.md).
200
+
201
+ On **Hugging Face Spaces**, the platform **injects `SPACE_ID`** (for example `your-username/your-space-name`). Doc-Ingestion reads that at startup and **removes Ollama** from allowed providers and from `GET /config/llm`, because there is no local Ollama service in the hosted container. Hosted demos use OpenAI, Anthropic, or Gemini with keys you supply in the UI or environment.
202
+
203
+ | Where you run | `SPACE_ID` | Ollama in the app |
204
+ |---------------|------------|-------------------|
205
+ | Your laptop or your own server / Docker | Not set by default | Yes (per `config.yaml`) |
206
+ | Hugging Face Space | Set automatically by HF | No (automatic) |
207
+
208
+ **Do not define `SPACE_ID` yourself** for local deployment. It exists so the app can tell it is running inside a Space. If you copied Space-style environment variables into a local `.env` and Ollama disappeared from the UI, remove `SPACE_ID` or set **`DOC_OLLAMA_ENABLED=1`** to force Ollama back on.
209
+
210
+ **Explicit override (optional):**
211
+
212
+ - `DOC_OLLAMA_ENABLED=0` — hide Ollama even when `SPACE_ID` is unset (useful if you want cloud-only in your own container).
213
+ - `DOC_OLLAMA_ENABLED=1` — show Ollama even when `SPACE_ID` is set (rare; only useful if you run a sidecar Ollama and have extended the image yourself).
214
+
215
+ Implementation: [`src/utils/config.py`](src/utils/config.py) (`doc_ollama_runtime_enabled`, applied inside `load_config`).
216
+
217
  ## API Usage
218
 
219
  ```bash
 
314
  export DOC_API_KEYS=dev-key-1
315
  ```
316
 
317
+ Deployment-related environment variables (not stored in `config.yaml`; see [Ollama and Hugging Face Spaces](#ollama-and-hugging-face-spaces) above):
318
+
319
+ - **`SPACE_ID`** — injected on Hugging Face Spaces only. You do not add this to a local config file for normal development.
320
+ - **`DOC_OLLAMA_ENABLED`** — optional explicit override: `0` / `false` to hide Ollama, `1` / `true` to show it even when `SPACE_ID` is set.
321
+
322
  ## Troubleshooting
323
 
324
  - **Empty results after ingest:** Run `python -m src.ingest --docs data/documents` and verify `data/embeddings/` exists.
 
326
  - **Dimension mismatch after model change:** Re-ingest all documents to rebuild the vector index.
327
  - **Cloud provider fails:** Check the relevant `*_API_KEY` env var is set.
328
  - **Truthfulness score always 0:** The NLI model (`cross-encoder/nli-deberta-v3-small`) downloads on first use. Check internet access or set `evaluation.inline_enabled: false` in `config.yaml` to disable.
329
+ - **Ollama missing from the UI or `/config/llm` locally:** You may have `SPACE_ID` or `DOC_OLLAMA_ENABLED=0` in your shell or `docker/.env`. Unset `SPACE_ID` for local runs, or set `DOC_OLLAMA_ENABLED=1`. There is no separate `SPACE_ID` configuration file in the repo—only environment variables and [`config.yaml`](config.yaml).
docker/Dockerfile DELETED
@@ -1,44 +0,0 @@
1
- FROM python:3.11-slim
2
-
3
- WORKDIR /app
4
-
5
- # Install system deps needed by python-magic and runtime health checks.
6
- RUN apt-get update && apt-get install -y --no-install-recommends \
7
- libmagic1 \
8
- curl \
9
- ca-certificates \
10
- && rm -rf /var/lib/apt/lists/*
11
-
12
- COPY requirements/base.txt requirements/base.txt
13
- RUN pip install --no-cache-dir --upgrade pip && \
14
- pip install --no-cache-dir -r requirements/base.txt
15
-
16
- COPY src/ src/
17
- COPY scripts/ scripts/
18
- COPY tests/ tests/
19
- COPY config.yaml config.yaml
20
- COPY README.md README.md
21
- COPY Docs/ Docs/
22
-
23
- ENV ENV=prod
24
- ENV PYTHONUNBUFFERED=1
25
- ENV PYTHONPATH=/app
26
- ENV OLLAMA_BASE_URL=http://host.docker.internal:11434
27
- ENV HF_HOME=/app/.cache/huggingface
28
- ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
29
- ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
30
-
31
- # Preload reranker model at build time to avoid runtime downloads.
32
- RUN python -c "from sentence_transformers import CrossEncoder; CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')"
33
-
34
- EXPOSE 8000
35
- EXPOSE 8501
36
-
37
- # Use non-root runtime user.
38
- RUN useradd -m appuser && mkdir -p /app/.cache/huggingface && chown -R appuser:appuser /app
39
- USER appuser
40
-
41
- HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
42
- CMD curl -fsS http://127.0.0.1:8000/health || exit 1
43
-
44
- CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docker/Dockerfile ADDED
@@ -0,0 +1 @@
 
 
1
+ ../Dockerfile
docker/docker-compose.yml CHANGED
@@ -1,11 +1,16 @@
 
 
1
  services:
2
  api:
3
  build:
4
  context: ..
5
- dockerfile: docker/Dockerfile
6
  container_name: doc_ingestion_api
7
  environment:
8
  - ENV=dev
 
 
 
9
  - DOC_API_KEYS=${DOC_API_KEYS:-change-me}
10
  - OPENAI_API_KEY=${OPENAI_API_KEY:-}
11
  - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
@@ -17,6 +22,7 @@ services:
17
  - SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
18
  - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
19
  - TRANSFORMERS_OFFLINE=${TRANSFORMERS_OFFLINE:-0}
 
20
  volumes:
21
  - ../data:/app/data
22
  - ../config.yaml:/app/config.yaml
@@ -27,33 +33,6 @@ services:
27
  - qdrant
28
  - redis
29
 
30
- streamlit:
31
- build:
32
- context: ..
33
- dockerfile: docker/Dockerfile
34
- container_name: doc_ingestion_streamlit
35
- command: ["streamlit", "run", "src/web/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
36
- environment:
37
- - DOC_INGEST_API_URL=http://api:8000
38
- - DOC_API_KEY=${DOC_API_KEY:-change-me}
39
- - OLLAMA_BASE_URL=${OLLAMA_BASE_URL:-http://host.docker.internal:11434}
40
- - OPENAI_API_KEY=${OPENAI_API_KEY:-}
41
- - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
42
- - GEMINI_API_KEY=${GEMINI_API_KEY:-}
43
- - HF_HOME=/app/.cache/huggingface
44
- - TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
45
- - SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
46
- - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
47
- - TRANSFORMERS_OFFLINE=${TRANSFORMERS_OFFLINE:-0}
48
- volumes:
49
- - ../data:/app/data
50
- - ../config.yaml:/app/config.yaml
51
- - hf_cache:/app/.cache/huggingface
52
- ports:
53
- - "8501:8501"
54
- depends_on:
55
- - api
56
-
57
  redis:
58
  image: redis:7-alpine
59
  container_name: doc_ingestion_redis
 
1
+ # Full stack (React + FastAPI) in one container on :8000. Streamlit is not part of this stack;
2
+ # run it locally only if needed: PYTHONPATH=. streamlit run src/web/streamlit_app.py
3
  services:
4
  api:
5
  build:
6
  context: ..
7
+ dockerfile: Dockerfile
8
  container_name: doc_ingestion_api
9
  environment:
10
  - ENV=dev
11
+ # React demo session + uploads (override for hardened deploys).
12
+ - DOC_PROFILE=${DOC_PROFILE:-demo}
13
+ - DOC_DEMO_UPLOADS=${DOC_DEMO_UPLOADS:-1}
14
  - DOC_API_KEYS=${DOC_API_KEYS:-change-me}
15
  - OPENAI_API_KEY=${OPENAI_API_KEY:-}
16
  - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
 
22
  - SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface/sentence_transformers
23
  - HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-0}
24
  - TRANSFORMERS_OFFLINE=${TRANSFORMERS_OFFLINE:-0}
25
+ - PORT=8000
26
  volumes:
27
  - ../data:/app/data
28
  - ../config.yaml:/app/config.yaml
 
33
  - qdrant
34
  - redis
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  redis:
37
  image: redis:7-alpine
38
  container_name: doc_ingestion_redis
frontend/.gitignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ test-results
14
+ playwright-report
15
+ *.local
16
+
17
+ # Editor directories and files
18
+ .vscode/*
19
+ !.vscode/extensions.json
20
+ .idea
21
+ .DS_Store
22
+ *.suo
23
+ *.ntvs*
24
+ *.njsproj
25
+ *.sln
26
+ *.sw?
frontend/README.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # React + TypeScript + Vite
2
+
3
+ This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
4
+
5
+ Currently, two official plugins are available:
6
+
7
+ - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Oxc](https://oxc.rs)
8
+ - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/)
9
+
10
+ ## React Compiler
11
+
12
+ The React Compiler is not enabled on this template because of its impact on dev & build performance. To add it, see [this documentation](https://react.dev/learn/react-compiler/installation).
13
+
14
+ ## Expanding the ESLint configuration
15
+
16
+ If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules:
17
+
18
+ ```js
19
+ export default defineConfig([
20
+ globalIgnores(['dist']),
21
+ {
22
+ files: ['**/*.{ts,tsx}'],
23
+ extends: [
24
+ // Other configs...
25
+
26
+ // Remove tseslint.configs.recommended and replace with this
27
+ tseslint.configs.recommendedTypeChecked,
28
+ // Alternatively, use this for stricter rules
29
+ tseslint.configs.strictTypeChecked,
30
+ // Optionally, add this for stylistic rules
31
+ tseslint.configs.stylisticTypeChecked,
32
+
33
+ // Other configs...
34
+ ],
35
+ languageOptions: {
36
+ parserOptions: {
37
+ project: ['./tsconfig.node.json', './tsconfig.app.json'],
38
+ tsconfigRootDir: import.meta.dirname,
39
+ },
40
+ // other options...
41
+ },
42
+ },
43
+ ])
44
+ ```
45
+
46
+ You can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules:
47
+
48
+ ```js
49
+ // eslint.config.js
50
+ import reactX from 'eslint-plugin-react-x'
51
+ import reactDom from 'eslint-plugin-react-dom'
52
+
53
+ export default defineConfig([
54
+ globalIgnores(['dist']),
55
+ {
56
+ files: ['**/*.{ts,tsx}'],
57
+ extends: [
58
+ // Other configs...
59
+ // Enable lint rules for React
60
+ reactX.configs['recommended-typescript'],
61
+ // Enable lint rules for React DOM
62
+ reactDom.configs.recommended,
63
+ ],
64
+ languageOptions: {
65
+ parserOptions: {
66
+ project: ['./tsconfig.node.json', './tsconfig.app.json'],
67
+ tsconfigRootDir: import.meta.dirname,
68
+ },
69
+ // other options...
70
+ },
71
+ },
72
+ ])
73
+ ```
frontend/components.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://ui.shadcn.com/schema.json",
3
+ "style": "new-york",
4
+ "rsc": false,
5
+ "tsx": true,
6
+ "tailwind": {
7
+ "config": "",
8
+ "css": "src/index.css",
9
+ "baseColor": "slate",
10
+ "cssVariables": true
11
+ },
12
+ "aliases": {
13
+ "components": "@/components",
14
+ "utils": "@/lib/utils"
15
+ }
16
+ }
frontend/e2e/fixtures/uploaded-doc.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Uploaded Test Document
2
+
3
+ This uploaded document says hello from a private session.
frontend/e2e/react-mvp.spec.ts ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { test, expect, type Page } from '@playwright/test'
2
+
3
/**
 * Stubs the `/config/llm` endpoint with a fixed two-provider configuration
 * (Ollama as default, plus OpenAI).
 *
 * The route is registered as a glob (`**` prefix) so the stub matches the
 * request regardless of which origin the app sends it to (same-origin through
 * the dev server, or an absolute API host such as http://127.0.0.1:8000).
 *
 * @param page Playwright page whose network traffic should be intercepted.
 */
async function mockLlmConfig(page: Page) {
  await page.route('**/config/llm', async (route) => {
    await route.fulfill({
      json: {
        default_provider: 'ollama',
        default_model_by_provider: {
          ollama: 'qwen2.5:7b',
          openai: 'gpt-4o-mini',
        },
        allowed_models_by_provider: {
          ollama: ['qwen2.5:7b'],
          openai: ['gpt-4o-mini'],
        },
      },
    })
  })
}
20
+
21
/**
 * Payload for a demo session with no uploads. `expires_at` is computed on each
 * call so it is always 1800 seconds (30 minutes) ahead of the request time.
 */
function emptySessionPayload() {
  return {
    session_id: 'abc123demo',
    expires_at: Math.floor(Date.now() / 1000) + 1800,
    files: [],
    total_bytes: 0,
    max_session_bytes: 8388608,
    max_files: 3,
  }
}

/**
 * Fulfils every request matching `urlGlob` with a fresh empty-session payload.
 * Callers pass origin-agnostic globs so the stub applies whether the app talks
 * to the API same-origin or through an absolute base URL.
 */
async function mockEmptySession(page: Page, urlGlob: string) {
  await page.route(urlGlob, async (route) => {
    await route.fulfill({ json: emptySessionPayload() })
  })
}

test('no uploads keeps Mine and Both disabled', async ({ page }) => {
  await mockLlmConfig(page)
  await mockEmptySession(page, '**/sessions')
  await mockEmptySession(page, '**/sessions/abc123demo')

  await page.goto('/')
  await page.getByRole('tab', { name: 'Query' }).click()
  await expect(page.getByRole('radio', { name: /my uploads only/i })).toBeDisabled()
  await expect(page.getByRole('radio', { name: /both/i })).toBeDisabled()
})

test('query streams an answer', async ({ page }) => {
  await mockLlmConfig(page)
  await mockEmptySession(page, '**/sessions')
  // Server-sent events: one token frame, one final frame, then the [DONE] sentinel.
  await page.route('**/query/stream', async (route) => {
    await route.fulfill({
      contentType: 'text/event-stream',
      body: 'data: {"type":"token","text":"Hello from stream"}\n\ndata: {"type":"final","citations":[],"provider":"ollama","model":"llama3"}\n\ndata: [DONE]\n\n',
    })
  })

  await page.goto('/')
  await page.getByRole('tab', { name: 'Query' }).click()
  await page.getByRole('textbox', { name: /question/i }).fill('What is RAG?')
  await page.getByRole('button', { name: 'Run' }).click()
  await expect(page.getByText('Hello from stream')).toBeVisible()
})
frontend/eslint.config.js ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import js from '@eslint/js'
2
+ import globals from 'globals'
3
+ import reactHooks from 'eslint-plugin-react-hooks'
4
+ import reactRefresh from 'eslint-plugin-react-refresh'
5
+ import tseslint from 'typescript-eslint'
6
+ import { defineConfig, globalIgnores } from 'eslint/config'
7
+
8
+ export default defineConfig([
9
+ globalIgnores(['dist']),
10
+ {
11
+ files: ['**/*.{ts,tsx}'],
12
+ extends: [
13
+ js.configs.recommended,
14
+ tseslint.configs.recommended,
15
+ reactHooks.configs.flat.recommended,
16
+ reactRefresh.configs.vite,
17
+ ],
18
+ languageOptions: {
19
+ globals: globals.browser,
20
+ },
21
+ },
22
+ ])
frontend/index.html ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <link rel="icon" type="image/svg+xml" href="/favicon.svg" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>frontend</title>
8
+ <style>
9
+ html,
10
+ body {
11
+ margin: 0;
12
+ min-height: 100%;
13
+ background: #f1f5f9;
14
+ }
15
+ </style>
16
+ </head>
17
+ <body>
18
+ <div id="root"></div>
19
+ <script type="module" src="/src/main.tsx"></script>
20
+ </body>
21
+ </html>
frontend/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
frontend/package.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "frontend",
3
+ "private": true,
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite --host 127.0.0.1 --port 5173",
8
+ "build": "tsc -b && vite build",
9
+ "lint": "eslint .",
10
+ "preview": "vite preview --host 127.0.0.1 --port 4173",
11
+ "test": "vitest run",
12
+ "test:watch": "vitest",
13
+ "test:e2e": "playwright test",
14
+ "typecheck": "tsc --noEmit",
15
+ "gen:api": "openapi-typescript http://127.0.0.1:8000/openapi.json -o src/api/generated.ts"
16
+ },
17
+ "dependencies": {
18
+ "@radix-ui/react-progress": "^1.1.8",
19
+ "@radix-ui/react-radio-group": "^1.3.8",
20
+ "@radix-ui/react-slot": "^1.2.4",
21
+ "@radix-ui/react-tabs": "^1.1.13",
22
+ "@radix-ui/react-toast": "^1.2.15",
23
+ "@tanstack/react-query": "^5.100.8",
24
+ "class-variance-authority": "^0.7.1",
25
+ "clsx": "^2.1.1",
26
+ "lucide-react": "^1.14.0",
27
+ "react": "^18.3.1",
28
+ "react-dom": "^18.3.1",
29
+ "tailwind-merge": "^3.5.0",
30
+ "zustand": "^5.0.12"
31
+ },
32
+ "devDependencies": {
33
+ "@eslint/js": "^10.0.1",
34
+ "@playwright/test": "^1.59.1",
35
+ "@tailwindcss/vite": "^4.2.4",
36
+ "@testing-library/jest-dom": "^6.9.1",
37
+ "@testing-library/react": "^16.3.2",
38
+ "@testing-library/user-event": "^14.6.1",
39
+ "@types/node": "^24.12.2",
40
+ "@types/react": "^18.3.28",
41
+ "@types/react-dom": "^18.3.7",
42
+ "@vitejs/plugin-react": "^6.0.1",
43
+ "eslint": "^10.3.0",
44
+ "eslint-plugin-react-hooks": "^7.1.1",
45
+ "eslint-plugin-react-refresh": "^0.5.2",
46
+ "globals": "^17.5.0",
47
+ "jsdom": "^29.1.1",
48
+ "msw": "^2.14.2",
49
+ "openapi-typescript": "^7.13.0",
50
+ "typescript": "^5.9.3",
51
+ "typescript-eslint": "^8.59.1",
52
+ "vite": "^8.0.10",
53
+ "vitest": "^4.1.5"
54
+ }
55
+ }
frontend/playwright.config.ts ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { defineConfig, devices } from '@playwright/test'
2
+
3
+ export default defineConfig({
4
+ testDir: './e2e',
5
+ webServer: {
6
+ command: 'npm run dev',
7
+ url: 'http://127.0.0.1:5173',
8
+ reuseExistingServer: !process.env.CI,
9
+ },
10
+ use: {
11
+ baseURL: 'http://127.0.0.1:5173',
12
+ trace: 'on-first-retry',
13
+ },
14
+ projects: [
15
+ {
16
+ name: 'chromium',
17
+ use: { ...devices['Desktop Chrome'] },
18
+ },
19
+ ],
20
+ })
frontend/public/favicon.svg ADDED
frontend/public/icons.svg ADDED
frontend/src/App.tsx ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import * as Tabs from '@radix-ui/react-tabs'
2
+ import { QueryClient, QueryClientProvider } from '@tanstack/react-query'
3
+ import { AlertCircle, BookOpen, Database, FileText, Fingerprint } from 'lucide-react'
4
+ import { useMemo } from 'react'
5
+ import { QueryTab } from './tabs/QueryTab'
6
+ import { OverviewTab } from './tabs/OverviewTab'
7
+ import { DocumentsTab } from './tabs/DocumentsTab'
8
+ import { SessionProvider } from './session/SessionProvider'
9
+ import { useSession } from './session/SessionContext'
10
+ import { formatTtl, shortSessionId } from './lib/format'
11
+
12
/**
 * Application shell: header (title, session status, session button, privacy
 * notice, optional error banner) followed by the Overview / Query /
 * My documents tabs. All session state comes from `useSession()`.
 */
function Shell() {
  const { sessionId, expiresAt, isMintingSession, isLoading, error, clearSession, retrySession } =
    useSession()

  // Shared styling for the three tab triggers.
  const tabTriggerClass =
    'inline-flex items-center gap-2 rounded-xl px-4 py-2 text-sm font-semibold text-slate-700 data-[state=active]:bg-blue-600 data-[state=active]:text-white'

  const sessionLabel = isMintingSession ? 'creating…' : shortSessionId(sessionId)

  let sessionButtonLabel: string
  if (isLoading) {
    sessionButtonLabel = 'Creating…'
  } else if (sessionId) {
    sessionButtonLabel = 'New session ID'
  } else {
    sessionButtonLabel = 'Generate session ID'
  }

  // Only rendered while the session context reports an error.
  const errorBanner = error ? (
    <div className="mt-4 flex flex-wrap items-center justify-between gap-3 rounded-xl border border-amber-200 bg-amber-50 p-4 text-sm text-amber-900">
      <span className="inline-flex items-center gap-2">
        <AlertCircle className="h-4 w-4" aria-hidden="true" />
        {error.message}
      </span>
      <button type="button" className="font-semibold underline" onClick={() => void retrySession()}>
        Retry session
      </button>
    </div>
  ) : null

  return (
    <main className="min-h-screen bg-slate-100 px-4 py-6 md:px-8">
      <div className="mx-auto max-w-6xl space-y-5">
        <header className="app-card p-5">
          <div className="flex flex-wrap items-start justify-between gap-4">
            <div>
              <p className="text-sm font-semibold uppercase tracking-wide text-blue-700">Doc Ingestion</p>
              <h1 className="mt-1 text-3xl font-bold text-slate-950">Document Q&A Assistant</h1>
              <p className="mt-2 max-w-3xl text-slate-600">
                Ask citation-aware questions against the global demo corpus, your private uploads, or both.
              </p>
            </div>
            <div className="flex min-w-[12rem] flex-col gap-2 rounded-xl bg-slate-50 px-4 py-3 text-sm text-slate-700">
              <div>
                <div>Session {sessionLabel}</div>
                <div className="text-slate-500">TTL {formatTtl(expiresAt)}</div>
              </div>
              <button
                type="button"
                className="inline-flex items-center justify-center gap-2 rounded-lg bg-blue-600 px-3 py-2 text-sm font-semibold text-white shadow hover:bg-blue-700 disabled:pointer-events-none disabled:opacity-50"
                disabled={isLoading}
                aria-busy={isLoading}
                onClick={() => void clearSession()}
              >
                <Fingerprint className="h-4 w-4 shrink-0" aria-hidden="true" />
                {sessionButtonLabel}
              </button>
              <p className="text-xs leading-snug text-slate-500">
                Fresh ID for uploads in this browser. Replaces any current demo session (including uploads on
                the server).
              </p>
            </div>
          </div>
          <div className="mt-4 rounded-xl border border-blue-100 bg-blue-50 p-4 text-sm text-blue-900">
            Your uploads stay in this browser session, expire after inactivity, and are not added to the
            shared corpus.
          </div>
          {errorBanner}
        </header>

        <Tabs.Root defaultValue="overview" className="space-y-5">
          <Tabs.List className="app-card inline-flex gap-2 p-2" aria-label="Main sections">
            <Tabs.Trigger value="overview" className={tabTriggerClass}>
              <BookOpen className="h-4 w-4" aria-hidden="true" />
              Overview
            </Tabs.Trigger>
            <Tabs.Trigger value="query" className={tabTriggerClass}>
              <Database className="h-4 w-4" aria-hidden="true" />
              Query
            </Tabs.Trigger>
            <Tabs.Trigger value="documents" className={tabTriggerClass}>
              <FileText className="h-4 w-4" aria-hidden="true" />
              My documents
            </Tabs.Trigger>
          </Tabs.List>
          <Tabs.Content value="overview">
            <OverviewTab />
          </Tabs.Content>
          <Tabs.Content value="query">
            <QueryTab />
          </Tabs.Content>
          <Tabs.Content value="documents">
            <DocumentsTab />
          </Tabs.Content>
        </Tabs.Root>
      </div>
    </main>
  )
}
104
+
105
/**
 * Root component: creates the React Query client and wraps `Shell` in the
 * query and session providers.
 */
function App() {
  // Queries neither refetch on window focus nor retry after a failure.
  // NOTE(review): React documents useMemo as a performance hint rather than a
  // guarantee; if the client must never be recreated, a lazy useState
  // initializer may be a better fit.
  const queryClient = useMemo(() => {
    const defaultOptions = {
      queries: {
        refetchOnWindowFocus: false,
        retry: false,
      },
    }
    return new QueryClient({ defaultOptions })
  }, [])

  return (
    <QueryClientProvider client={queryClient}>
      <SessionProvider>
        <Shell />
      </SessionProvider>
    </QueryClientProvider>
  )
}
127
+
128
+ export default App
frontend/src/api/client.ts ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { LlmConfigModel, QueryRequestModel, QueryResponseModel } from './generated'
2
+
3
/**
 * Picks the base URL that every API request is prefixed with.
 *
 * Resolution order:
 * 1. A non-blank `VITE_API_BASE_URL`, trimmed and with one trailing slash removed.
 * 2. Outside production builds: `http://127.0.0.1:8000` under Vitest, otherwise
 *    an empty string so requests stay same-origin.
 * 3. In a production build running in a browser: an empty string when the page
 *    port is `8000`, empty, `80` or `443`; otherwise port 8000 on the page's
 *    protocol and hostname.
 * 4. An empty string when no `window` exists.
 *
 * @returns The base URL without a trailing slash, or `''` for same-origin.
 */
function resolveApiBaseUrl(): string {
  const configured = import.meta.env.VITE_API_BASE_URL
  if (typeof configured === 'string' && configured.trim() !== '') {
    return configured.trim().replace(/\/$/, '')
  }

  if (!import.meta.env.PROD) {
    // Vitest + MSW use absolute handlers on http://127.0.0.1:8000.
    // npm run dev: empty base → same-origin; vite.config.ts proxies to FastAPI.
    return import.meta.env.VITEST ? 'http://127.0.0.1:8000' : ''
  }

  if (typeof window === 'undefined') {
    return ''
  }

  // Production bundle: same-origin when UI is served by FastAPI (typical Docker) on :8000.
  const sameOriginPorts = ['8000', '', '80', '443']
  if (sameOriginPorts.includes(window.location.port)) {
    return ''
  }

  const { protocol, hostname } = window.location
  const host = hostname || '127.0.0.1'
  return `${protocol}//${host}:8000`.replace(/\/$/, '')
}
30
+
31
+ const API_BASE_URL = resolveApiBaseUrl()
32
+
33
+ export interface SessionFile {
34
+ name: string
35
+ size_bytes: number
36
+ }
37
+
38
+ export interface CreateSessionResponse {
39
+ session_id: string
40
+ expires_at: number
41
+ }
42
+
43
+ export interface DeleteSessionResponse {
44
+ deleted_session_id: string
45
+ session_id: string
46
+ }
47
+
48
+ export interface SessionSummary extends CreateSessionResponse {
49
+ files: SessionFile[]
50
+ total_bytes: number
51
+ max_session_bytes: number
52
+ max_files: number
53
+ }
54
+
55
+ export interface UploadResult {
56
+ filename: string
57
+ status: 'queued' | 'skipped' | 'rejected' | 'failed' | string
58
+ message: string
59
+ }
60
+
61
+ export interface UploadDocumentsResponse extends SessionSummary {
62
+ results: UploadResult[]
63
+ }
64
+
65
/**
 * Error type used for every failed API call.
 *
 * `status` holds the numeric status supplied by the caller and `detail` keeps
 * whatever payload accompanied the failure, unmodified.
 */
export class ApiError extends Error {
  /** Status code supplied when the error was created. */
  readonly status: number
  /** Payload or cause supplied when the error was created. */
  readonly detail: unknown

  constructor(message: string, status: number, detail: unknown) {
    super(message)
    this.name = 'ApiError'
    this.status = status
    this.detail = detail
  }
}
80
+
81
/**
 * Joins `path` onto `API_BASE_URL`, adding a leading slash when `path` lacks
 * one. With an empty base the slash-prefixed path is returned on its own.
 */
function apiUrl(path: string) {
  const normalizedPath = path.startsWith('/') ? path : `/${path}`
  if (!API_BASE_URL) {
    return normalizedPath
  }
  return `${API_BASE_URL}${normalizedPath}`
}
85
+
86
/**
 * Reads the API key stored in `localStorage` under `doc-ingestion.api-key`.
 *
 * @returns The stored key, or an empty string when nothing is stored.
 */
function readApiKey() {
  const storedKey = localStorage.getItem('doc-ingestion.api-key')
  return storedKey ?? ''
}
89
+
90
/**
 * Converts a non-OK `Response` into an `ApiError`.
 *
 * The body is read exactly once as text and then parsed as JSON when possible.
 * A response body can only be consumed a single time, so falling back to
 * `text()` after a failed `json()` call would itself reject and mask the
 * original HTTP failure.
 *
 * Message selection:
 * - JSON object with a `detail` key: the string itself, or its JSON
 *   serialization when it is structured (for example a list of validation
 *   errors) so the message never degrades to `[object Object]`.
 * - anything else: `Request failed with status <status>`.
 *
 * @param response The failed response.
 * @returns An `ApiError` carrying the status and the parsed (or raw text) body.
 */
async function parseError(response: Response): Promise<ApiError> {
  let bodyText = ''
  try {
    bodyText = await response.text()
  } catch {
    // Body could not be read; still report the status with an empty detail.
  }

  let detail: unknown = bodyText
  try {
    detail = JSON.parse(bodyText)
  } catch {
    // Not JSON: keep the raw text as the detail.
  }

  let message = `Request failed with status ${response.status}`
  if (typeof detail === 'object' && detail !== null && 'detail' in detail) {
    const inner = (detail as { detail: unknown }).detail
    message = typeof inner === 'string' ? inner : JSON.stringify(inner)
  }
  return new ApiError(message, response.status, detail)
}
103
+
104
/**
 * Builds the human-readable message shown when the API cannot be reached.
 *
 * The message names the target (the configured base URL, else the page origin,
 * else "the API"), gives start-up advice that depends on whether the dev
 * build is running with an empty base URL, and ends with the environment
 * variables the session features need.
 */
function networkErrorHint(): string {
  let target: string
  if (API_BASE_URL) {
    target = API_BASE_URL
  } else if (typeof window !== 'undefined') {
    target = `${window.location.origin} (vite → API)`
  } else {
    target = 'the API'
  }

  // Empty base in a dev build means requests go through the dev-server proxy.
  const viaDevProxy =
    API_BASE_URL === '' && typeof import.meta.env !== 'undefined' && import.meta.env.DEV

  const connectivity = viaDevProxy
    ? 'Start uvicorn on the proxy target (default http://127.0.0.1:8000) while npm run dev is running, '
      + 'set VITE_DEV_API_PROXY_TARGET if the API is elsewhere, '
      + 'or set VITE_API_BASE_URL to bypass the proxy. '
    : 'Start the API (e.g. uvicorn on port 8000), or set VITE_API_BASE_URL at build time. '

  const reachability = `Cannot reach ${target}. ${connectivity}`
  return reachability + `Session features need DOC_PROFILE=demo and DOC_DEMO_UPLOADS=1 on the server.`
}
119
+
120
/**
 * Creates the error thrown when `fetch` fails before a response arrives
 * (offline, wrong host/port, CORS, etc.).
 *
 * @param cause The value `fetch` rejected with; stored as the error's `detail`.
 * @returns An `ApiError` with status `0` and the connectivity hint as message.
 */
export function networkFailureError(cause?: unknown): ApiError {
  const hint = networkErrorHint()
  return new ApiError(hint, 0, cause)
}
124
+
125
/**
 * Sends a request to the API and resolves with the JSON body.
 *
 * Adds `Content-Type: application/json` unless the body is `FormData`, and an
 * `X-API-Key` header when a key is stored.
 *
 * @param path API path, resolved through `apiUrl`.
 * @param init Standard fetch options; `headers` are merged with the ones above.
 * @throws ApiError When `fetch` itself fails, or when the response is not OK.
 *   A 404 on a `/sessions` path gets an extra hint about enabling demo sessions.
 */
async function requestJson<T>(path: string, init: RequestInit = {}): Promise<T> {
  const apiKey = readApiKey()
  const headers = new Headers(init.headers)
  const isFormUpload = init.body instanceof FormData
  if (!isFormUpload) {
    headers.set('Content-Type', 'application/json')
  }
  if (apiKey) {
    headers.set('X-API-Key', apiKey)
  }

  let response: Response
  try {
    response = await fetch(apiUrl(path), { ...init, headers })
  } catch (cause) {
    throw networkFailureError(cause)
  }

  if (response.ok) {
    return response.json() as Promise<T>
  }

  const failure = await parseError(response)
  const isMissingSessionRoute = response.status === 404 && path.startsWith('/sessions')
  if (isMissingSessionRoute) {
    failure.message = `${failure.message} If the API is up, enable demo sessions: DOC_PROFILE=demo and DOC_DEMO_UPLOADS=1.`
  }
  throw failure
}
149
+
150
/**
 * Builds a `/sessions/{id}` path (plus optional suffix). The id is
 * percent-encoded so characters such as `/`, `?` or `#` cannot change which
 * route the request hits.
 */
function sessionPath(sessionId: string, suffix = ''): string {
  return `/sessions/${encodeURIComponent(sessionId)}${suffix}`
}

/** Creates a new session (`POST /sessions`). */
export function createSession() {
  return requestJson<CreateSessionResponse>('/sessions', { method: 'POST' })
}

/** Fetches the summary of an existing session (`GET /sessions/{id}`). */
export function getSession(sessionId: string) {
  return requestJson<SessionSummary>(sessionPath(sessionId))
}

/** Deletes a session (`DELETE /sessions/{id}`). */
export function deleteSession(sessionId: string) {
  return requestJson<DeleteSessionResponse>(sessionPath(sessionId), { method: 'DELETE' })
}

/**
 * Uploads files to a session (`POST /sessions/{id}/documents`) as multipart
 * form data, one `files` part per file.
 */
export function uploadDocuments(sessionId: string, files: File[]) {
  const formData = new FormData()
  for (const file of files) {
    formData.append('files', file)
  }
  return requestJson<UploadDocumentsResponse>(sessionPath(sessionId, '/documents'), {
    method: 'POST',
    body: formData,
  })
}

/** Runs a query (`POST /query`) with the request serialized as JSON. */
export function queryDocuments(request: QueryRequestModel) {
  return requestJson<QueryResponseModel>('/query', {
    method: 'POST',
    body: JSON.stringify(request),
  })
}

/** Fetches the LLM provider/model configuration (`GET /config/llm`). */
export function fetchLlmConfig() {
  return requestJson<LlmConfigModel>('/config/llm')
}
181
+
182
+ export { API_BASE_URL }
frontend/src/api/generated.ts ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export type KnowledgeScope = 'global' | 'session' | 'both'
2
+
3
+ export interface QueryRequestModel {
4
+ query: string
5
+ top_k?: number
6
+ use_llm?: boolean
7
+ use_rerank?: boolean
8
+ stream?: boolean
9
+ include_citations?: boolean
10
+ provider?: string | null
11
+ model?: string | null
12
+ reranker_model?: string | null
13
+ provider_api_key?: string | null
14
+ session_id?: string | null
15
+ knowledge_scope?: KnowledgeScope
16
+ }
17
+
18
+ export interface CitationModel {
19
+ raw_id: string
20
+ chunk_id: string
21
+ resolved: boolean
22
+ title?: string | null
23
+ source?: string | null
24
+ verification_score: number
25
+ verification: string
26
+ }
27
+
28
+ export interface RetrievedChunkModel {
29
+ id: string
30
+ score: number
31
+ source: string
32
+ confidence: number
33
+ metadata: Record<string, unknown>
34
+ preview: string
35
+ }
36
+
37
+ export interface TruthfulnessModel {
38
+ nli_faithfulness: number
39
+ citation_groundedness: number
40
+ uncited_claims: number
41
+ score: number
42
+ }
43
+
44
+ export interface QueryResponseModel {
45
+ query: string
46
+ provider: string
47
+ model: string
48
+ answer: string
49
+ processing_time_ms: number
50
+ cached: boolean
51
+ validation_issues: string[]
52
+ citations: CitationModel[]
53
+ retrieved: RetrievedChunkModel[]
54
+ truthfulness?: TruthfulnessModel | null
55
+ }
56
+
57
+ export interface HealthModel {
58
+ status: string
59
+ collection: string
60
+ }
61
+
62
+ export interface MetricsModel {
63
+ cache_ttl_seconds: number
64
+ available_providers: string[]
65
+ }
66
+
67
+ export interface LlmConfigModel {
68
+ default_provider: string
69
+ default_model_by_provider: Record<string, string>
70
+ allowed_models_by_provider: Record<string, string[]>
71
+ }
frontend/src/assets/hero.png ADDED
frontend/src/assets/react.svg ADDED
frontend/src/assets/vite.svg ADDED
frontend/src/components/AnswerPanel.tsx ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { QueryResponseModel } from '../api/generated'
2
+
3
/**
 * Card that shows the answer text and, once a response object is available,
 * its metadata.
 *
 * - `answer` is rendered as-is; while it is empty a placeholder is shown,
 *   worded according to `isLoading`.
 * - A truthfulness badge appears when `response.truthfulness` is set.
 * - Provider/model, rounded processing time and a "Cached" marker appear
 *   when `response` is non-null.
 */
export function AnswerPanel(props: {
  answer: string
  response: QueryResponseModel | null
  isLoading: boolean
}) {
  const { answer, response, isLoading } = props
  const truthfulness = response?.truthfulness
  const placeholder = isLoading
    ? 'Waiting for tokens...'
    : 'Ask a question to see a grounded answer.'

  return (
    <section className="app-card p-5" aria-live="polite">
      <div className="mb-3 flex flex-wrap items-center justify-between gap-2">
        <h2 className="text-lg font-semibold text-slate-950">Answer</h2>
        {truthfulness && (
          <span className="rounded-full bg-emerald-50 px-3 py-1 text-sm font-medium text-emerald-700">
            Truthfulness {truthfulness.score.toFixed(2)}
          </span>
        )}
      </div>
      <div className="min-h-28 whitespace-pre-wrap rounded-xl bg-slate-50 p-4 text-left text-slate-800">
        {answer || placeholder}
      </div>
      {response && (
        <div className="mt-3 flex flex-wrap gap-3 text-sm text-slate-600">
          <span>{response.provider} / {response.model}</span>
          <span>{Math.round(response.processing_time_ms)} ms</span>
          {response.cached ? <span>Cached</span> : null}
        </div>
      )}
    </section>
  )
}
frontend/src/components/CitationsList.tsx ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { SessionFile } from '../api/client'
2
+ import type { CitationModel } from '../api/generated'
3
+ import { citationLabel } from '../lib/citationProvenance'
4
+
5
/**
 * Card listing the citations returned for the current answer.
 *
 * Each entry shows a provenance badge (`[yours]` or `[global]`, decided by
 * `citationLabel` against the session's files), a heading taken from the
 * first non-empty of title, source and chunk id, and the verification text
 * with its score to two decimals. An empty list renders a short notice.
 */
export function CitationsList(props: {
  citations: CitationModel[]
  sessionFiles: SessionFile[]
}) {
  const { citations, sessionFiles } = props
  const hasCitations = citations.length > 0

  return (
    <section className="app-card p-5">
      <h2 className="mb-3 text-lg font-semibold text-slate-950">Citations</h2>
      {hasCitations ? (
        <ul className="space-y-3">
          {citations.map((citation) => {
            // citationLabel only ever yields 'yours' or 'global'.
            const provenance = citationLabel(citation, sessionFiles)
            const heading = citation.title || citation.source || citation.chunk_id
            const itemKey = `${citation.raw_id}-${citation.chunk_id}`
            return (
              <li key={itemKey} className="rounded-xl bg-slate-50 p-3">
                <div className="flex flex-wrap items-center gap-2">
                  <span className="rounded-full bg-slate-900 px-2 py-1 text-xs font-semibold text-white">
                    [{provenance}]
                  </span>
                  <span className="font-medium text-slate-900">
                    {heading}
                  </span>
                </div>
                <p className="mt-1 text-sm text-slate-600">
                  {citation.verification} · score {citation.verification_score.toFixed(2)}
                </p>
              </li>
            )
          })}
        </ul>
      ) : (
        <p className="text-sm text-slate-600">No citations returned yet.</p>
      )}
    </section>
  )
}
frontend/src/components/RetrievedChunks.tsx ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { RetrievedChunkModel } from '../api/generated'
2
+
3
/**
 * Collapsible card listing the retrieved chunks.
 *
 * The summary line shows the chunk count; each entry shows the chunk id,
 * its score to three decimals and its preview text. An empty list renders a
 * short notice instead.
 */
export function RetrievedChunks(props: { chunks: RetrievedChunkModel[] }) {
  const { chunks } = props
  const isEmpty = chunks.length === 0

  return (
    <details className="app-card p-5">
      <summary className="cursor-pointer text-lg font-semibold text-slate-950">
        Retrieved chunks ({chunks.length})
      </summary>
      {isEmpty ? (
        <p className="mt-3 text-sm text-slate-600">No retrieved chunks returned yet.</p>
      ) : (
        <ul className="mt-4 space-y-3">
          {chunks.map(({ id, score, preview }) => (
            <li key={id} className="rounded-xl bg-slate-50 p-3 text-left">
              <div className="flex flex-wrap justify-between gap-2 text-sm">
                <span className="font-medium text-slate-900">{id}</span>
                <span className="text-slate-600">score {score.toFixed(3)}</span>
              </div>
              <p className="mt-2 text-sm text-slate-700">{preview}</p>
            </li>
          ))}
        </ul>
      )}
    </details>
  )
}
frontend/src/components/SamplePromptChips.tsx ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/** Example questions offered as one-click chips. */
const SAMPLE_PROMPTS = [
  'What is retrieval augmented generation?',
  'How does hybrid retrieval improve document search?',
  'Explain BM25 vs vector search.',
  'What makes citations useful in a RAG system?',
]

/**
 * Row of buttons, one per sample prompt.
 *
 * @param onSelect - Called with the prompt text when its chip is clicked.
 */
export function SamplePromptChips(props: { onSelect: (prompt: string) => void }) {
  const { onSelect } = props

  const chips = SAMPLE_PROMPTS.map((text) => (
    <button
      key={text}
      type="button"
      className="rounded-full border border-slate-200 bg-white px-3 py-2 text-sm text-slate-700 shadow-sm hover:border-blue-300 hover:text-blue-700"
      onClick={() => onSelect(text)}
    >
      {text}
    </button>
  ))

  return (
    <div>
      <p className="mb-2 text-sm font-medium text-slate-700">Try a sample</p>
      <div className="flex flex-wrap gap-2">
        {chips}
      </div>
    </div>
  )
}
frontend/src/components/ScopeToggle.test.tsx ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { render, screen } from '@testing-library/react'
2
+ import userEvent from '@testing-library/user-event'
3
+ import { ScopeToggle } from './ScopeToggle'
4
+
5
+ describe('ScopeToggle', () => {
6
+ it('disables Mine and Both until uploads exist', () => {
7
+ render(<ScopeToggle value="global" onChange={vi.fn()} hasUploads={false} />)
8
+ expect(screen.getByRole('radio', { name: /my uploads only/i })).toBeDisabled()
9
+ expect(screen.getByRole('radio', { name: /both/i })).toBeDisabled()
10
+ })
11
+
12
+ it('enables session scopes after upload', async () => {
13
+ const onChange = vi.fn()
14
+ render(<ScopeToggle value="global" onChange={onChange} hasUploads />)
15
+ await userEvent.click(screen.getByRole('radio', { name: /my uploads only/i }))
16
+ expect(onChange).toHaveBeenCalledWith('session')
17
+ })
18
+ })
frontend/src/components/ScopeToggle.tsx ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import * as RadioGroup from '@radix-ui/react-radio-group'
2
+ import type { KnowledgeScope } from '../api/generated'
3
+ import { cn } from '../lib/utils'
4
+
5
/** The selectable knowledge scopes with their label and helper text. */
const SCOPE_OPTIONS: Array<{ value: KnowledgeScope; label: string; helper: string }> = [
  { value: 'global', label: 'Global sample corpus', helper: 'Use the preloaded public demo documents.' },
  { value: 'session', label: 'My uploads only', helper: 'Ask only against documents in this browser session.' },
  { value: 'both', label: 'Both', helper: 'Blend sample documents with your uploaded files.' },
]

/**
 * Radio group for choosing which corpus a query runs against.
 *
 * Every option other than `global` is disabled while `hasUploads` is false,
 * and its helper text is replaced by a prompt to upload a document.
 *
 * @param value - Currently selected scope.
 * @param onChange - Called with the newly selected scope.
 * @param hasUploads - Whether the session has any uploaded documents.
 */
export function ScopeToggle(props: {
  value: KnowledgeScope
  onChange: (value: KnowledgeScope) => void
  hasUploads: boolean
}) {
  const { value, onChange, hasUploads } = props

  return (
    <RadioGroup.Root
      className="grid gap-3 md:grid-cols-3"
      value={value}
      onValueChange={(next) => onChange(next as KnowledgeScope)}
      aria-label="Knowledge scope"
    >
      {SCOPE_OPTIONS.map(({ value: scope, label, helper }) => {
        const selected = scope === value
        const disabled = !hasUploads && scope !== 'global'
        const helperText = disabled ? 'Upload a document to enable this scope.' : helper
        return (
          <RadioGroup.Item
            key={scope}
            value={scope}
            disabled={disabled}
            className={cn(
              'rounded-xl border p-4 text-left transition',
              selected ? 'border-blue-500 bg-blue-50' : 'border-slate-200 bg-white',
              disabled && 'cursor-not-allowed opacity-50',
            )}
          >
            <div className="flex items-center gap-3">
              <span
                className={cn(
                  'h-4 w-4 rounded-full border',
                  selected ? 'border-blue-600 bg-blue-600' : 'border-slate-400',
                )}
              />
              <span className="font-medium text-slate-900">{label}</span>
            </div>
            <p className="mt-2 text-sm text-slate-600">
              {helperText}
            </p>
          </RadioGroup.Item>
        )
      })}
    </RadioGroup.Root>
  )
}
frontend/src/components/Uploader.test.tsx ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { render, screen } from '@testing-library/react'
2
+ import userEvent from '@testing-library/user-event'
3
+ import { QueryClient, QueryClientProvider } from '@tanstack/react-query'
4
+ import { Uploader } from './Uploader'
5
+
6
+ describe('Uploader', () => {
7
+ it('shows client-side file count cap messaging', async () => {
8
+ const client = new QueryClient({ defaultOptions: { queries: { retry: false } } })
9
+ render(
10
+ <QueryClientProvider client={client}>
11
+ <Uploader
12
+ sessionId="abc123demo"
13
+ onUploaded={vi.fn()}
14
+ summary={{
15
+ session_id: 'abc123demo',
16
+ expires_at: 1,
17
+ files: [
18
+ { name: 'a.md', size_bytes: 1 },
19
+ { name: 'b.md', size_bytes: 1 },
20
+ { name: 'c.md', size_bytes: 1 },
21
+ ],
22
+ total_bytes: 3,
23
+ max_files: 3,
24
+ max_session_bytes: 100,
25
+ }}
26
+ />
27
+ </QueryClientProvider>,
28
+ )
29
+ const input = document.querySelector('input[type="file"]') as HTMLInputElement
30
+ await userEvent.upload(input, new File(['hello'], 'd.md', { type: 'text/markdown' }))
31
+ expect(screen.getByText(/upload 0 more file/i)).toBeInTheDocument()
32
+ })
33
+ })
frontend/src/components/Uploader.tsx ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useRef, useState } from 'react'
2
+ import { useMutation } from '@tanstack/react-query'
3
+ import { Upload } from 'lucide-react'
4
+ import { uploadDocuments, type SessionSummary, type UploadResult } from '../api/client'
5
+ import { formatBytes } from '../lib/format'
6
+
7
+ const ACCEPTED = '.pdf,.docx,.txt,.md,.html'
8
+ const MAX_FILE_BYTES = 3 * 1024 * 1024
9
+
10
/**
 * Turns one upload result into user-facing text.
 *
 * The lookup order is: a known `status` code, then a known code carried in
 * `message`, and finally the raw `message` string.
 */
function resultMessage(result: UploadResult) {
  const known: Record<string, string> = {
    queued: 'Uploaded and indexed.',
    skipped: 'Duplicate upload skipped.',
    oversize: 'File exceeds the 3 MB limit.',
    file_count_cap: 'Session file count cap reached.',
    session_disk_cap: 'Session disk cap reached.',
    type_mismatch: 'File contents do not match the extension.',
  }

  const byStatus = known[result.status]
  if (byStatus !== undefined && byStatus !== null) {
    return byStatus
  }
  const byMessage = known[result.message]
  if (byMessage !== undefined && byMessage !== null) {
    return byMessage
  }
  return result.message
}
21
+
22
/**
 * Drag-and-drop / file-picker uploader for one session.
 *
 * Client-side checks run before any request is sent:
 * - nothing happens until `summary` has loaded, while an upload is already
 *   in flight, or when the selection contains no files;
 * - the selection must fit within `summary.max_files`;
 * - every file must be at most `MAX_FILE_BYTES`.
 *
 * After a successful upload the per-file results are listed and
 * `onUploaded` is awaited.
 *
 * @param sessionId - Session that receives the files.
 * @param summary - Current session summary; `undefined` while loading.
 * @param onUploaded - Invoked after a successful upload.
 */
export function Uploader({
  sessionId,
  summary,
  onUploaded,
}: {
  sessionId: string
  summary: SessionSummary | undefined
  onUploaded: () => Promise<unknown>
}) {
  const inputRef = useRef<HTMLInputElement>(null)
  const [message, setMessage] = useState('')
  const [results, setResults] = useState<UploadResult[]>([])

  const mutation = useMutation({
    mutationFn: (files: File[]) => uploadDocuments(sessionId, files),
    onSuccess: async (response) => {
      setResults(response.results)
      setMessage('Upload finished.')
      await onUploaded()
    },
    onError: (error) => {
      setMessage(error instanceof Error ? error.message : 'Upload failed.')
    },
  })

  const upload = (fileList: FileList | File[]) => {
    // The button is disabled in both of these states; apply the same rule to
    // drag-and-drop so a drop cannot start a second, concurrent upload.
    if (!summary || mutation.isPending) {
      return
    }
    const files = Array.from(fileList)
    if (files.length === 0) {
      // e.g. dropped text or a cancelled picker: nothing to send.
      return
    }
    const maxFiles = summary.max_files ?? 3
    const currentFiles = summary.files.length
    if (currentFiles + files.length > maxFiles) {
      setMessage(`You can upload ${Math.max(0, maxFiles - currentFiles)} more file(s).`)
      return
    }
    const oversized = files.find((file) => file.size > MAX_FILE_BYTES)
    if (oversized) {
      setMessage(`${oversized.name} is larger than ${formatBytes(MAX_FILE_BYTES)}.`)
      return
    }
    mutation.mutate(files)
  }

  return (
    <div>
      <div
        className="rounded-2xl border-2 border-dashed border-slate-300 bg-slate-50 p-8 text-center"
        onDragOver={(event) => event.preventDefault()}
        onDrop={(event) => {
          event.preventDefault()
          upload(event.dataTransfer.files)
        }}
      >
        <Upload className="mx-auto mb-3 h-8 w-8 text-blue-600" aria-hidden="true" />
        <p className="font-medium text-slate-900">Drop files here or choose files</p>
        <p className="mt-1 text-sm text-slate-600">PDF, DOCX, TXT, Markdown, or HTML.</p>
        <input
          ref={inputRef}
          type="file"
          accept={ACCEPTED}
          multiple
          className="sr-only"
          onChange={(event) => {
            if (event.target.files) {
              // upload() copies the FileList synchronously, so clearing the
              // input afterwards does not lose the selection.
              upload(event.target.files)
            }
            // Reset so that picking the same file again (for example after a
            // rejected attempt) still fires a change event.
            event.target.value = ''
          }}
        />
        <button
          type="button"
          className="mt-4 rounded-lg bg-blue-600 px-4 py-2 text-sm font-semibold text-white hover:bg-blue-700 disabled:opacity-50"
          disabled={mutation.isPending || !summary}
          onClick={() => inputRef.current?.click()}
        >
          {mutation.isPending ? 'Uploading...' : 'Choose files'}
        </button>
      </div>
      {message ? <p className="mt-3 text-sm text-slate-700" aria-live="polite">{message}</p> : null}
      {results.length > 0 ? (
        <ul className="mt-3 space-y-2">
          {results.map((result, index) => (
            // The index keeps keys unique when two results share a filename and status.
            <li key={`${result.filename}-${result.status}-${index}`} className="rounded-lg bg-slate-50 p-3 text-sm">
              <span className="font-medium text-slate-900">{result.filename}</span>: {resultMessage(result)}
            </li>
          ))}
        </ul>
      ) : null}
    </div>
  )
}
frontend/src/index.css ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @import "tailwindcss";
2
+
3
+ :root {
4
+ color: #172033;
5
+ background: #f5f7fb;
6
+ font-family:
7
+ Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI",
8
+ sans-serif;
9
+ font-synthesis: none;
10
+ text-rendering: optimizeLegibility;
11
+ -webkit-font-smoothing: antialiased;
12
+ -moz-osx-font-smoothing: grayscale;
13
+ }
14
+
15
+ body {
16
+ margin: 0;
17
+ min-width: 320px;
18
+ min-height: 100vh;
19
+ }
20
+
21
+ button,
22
+ input,
23
+ textarea {
24
+ font: inherit;
25
+ }
26
+
27
+ button:focus-visible,
28
+ input:focus-visible,
29
+ textarea:focus-visible,
30
+ [role="tab"]:focus-visible,
31
+ [role="radio"]:focus-visible {
32
+ outline: 3px solid #93c5fd;
33
+ outline-offset: 2px;
34
+ }
35
+
36
+ .app-card {
37
+ @apply rounded-2xl border border-slate-200 bg-white shadow-sm;
38
+ }
39
+
40
+ .muted {
41
+ @apply text-sm text-slate-600;
42
+ }
frontend/src/lib/citationProvenance.test.ts ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { citationLabel } from './citationProvenance'
2
+
3
+ describe('citationLabel', () => {
4
+ it('labels citations matching uploaded files as yours', () => {
5
+ expect(
6
+ citationLabel(
7
+ { title: 'uploaded-doc.md', source: '/tmp/doc-ingest-sessions/abc/uploads/uploaded-doc.md' },
8
+ [{ name: 'uploaded-doc.md', size_bytes: 12 }],
9
+ ),
10
+ ).toBe('yours')
11
+ })
12
+
13
+ it('labels unmatched citations as global', () => {
14
+ expect(citationLabel({ title: 'README.md', source: 'data/documents/README.md' }, [])).toBe('global')
15
+ })
16
+ })
frontend/src/lib/citationProvenance.ts ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { SessionFile } from '../api/client'
2
+ import type { CitationModel } from '../api/generated'
3
+
4
+ export type CitationProvenance = 'global' | 'yours'
5
+
6
/** True when `ch` is a letter or digit (any script). */
function isAlphanumeric(ch: string | undefined): boolean {
  return ch !== undefined && /[\p{L}\p{N}]/u.test(ch)
}

/**
 * True when `needle` occurs in `haystack` as a whole token, i.e. at least one
 * occurrence is not directly preceded or followed by a letter or digit.
 */
function containsFileName(haystack: string, needle: string): boolean {
  let at = haystack.indexOf(needle)
  while (at !== -1) {
    const before = haystack[at - 1]
    const after = haystack[at + needle.length]
    if (!isAlphanumeric(before) && !isAlphanumeric(after)) {
      return true
    }
    at = haystack.indexOf(needle, at + 1)
  }
  return false
}

/**
 * Classifies a citation as coming from the user's uploads (`'yours'`) or from
 * the shared corpus (`'global'`).
 *
 * A citation is `'yours'` when its source or title contains the name of a
 * session file, compared case-insensitively. The name has to appear as a
 * whole token: a file called `data.md` does not claim a citation of
 * `metadata.md`. Files with an empty name never match.
 *
 * @param citation - Only `source` and `title` are inspected.
 * @param sessionFiles - Files uploaded in the current session.
 */
export function citationLabel(
  citation: Pick<CitationModel, 'source' | 'title'>,
  sessionFiles: SessionFile[],
): CitationProvenance {
  const searchable = `${citation.source ?? ''} ${citation.title ?? ''}`.toLowerCase()
  const matched = sessionFiles.some((file) => {
    const name = file.name.toLowerCase()
    return name.length > 0 && containsFileName(searchable, name)
  })
  return matched ? 'yours' : 'global'
}
frontend/src/lib/format.ts ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Formats a byte count with a binary unit (B, KB, MB, GB).
 *
 * Non-finite, zero and negative inputs render as `'0 B'`. Plain bytes are
 * shown without decimals, larger units with one decimal. Values beyond the
 * GB range stay in GB.
 *
 * @param bytes - Size in bytes.
 */
export function formatBytes(bytes: number) {
  if (!Number.isFinite(bytes) || bytes <= 0) {
    return '0 B'
  }
  const units = ['B', 'KB', 'MB', 'GB']
  const exponent = Math.floor(Math.log(bytes) / Math.log(1024))
  // Clamp at both ends: values in (0, 1) have a negative exponent, which
  // would otherwise index before the start of `units`.
  const index = Math.min(Math.max(exponent, 0), units.length - 1)
  return `${(bytes / 1024 ** index).toFixed(index === 0 ? 0 : 1)} ${units[index]}`
}
9
+
10
/**
 * Formats the time left until `expiresAt` as `m:ss`.
 *
 * `expiresAt` is compared against `Date.now() / 1000`, so it is treated as a
 * Unix timestamp in seconds. Missing, zero or non-finite values render as
 * `'unknown'`; timestamps in the past render as `'0:00'`. Fractional
 * timestamps are floored to whole seconds so the output never carries
 * decimals.
 *
 * @param expiresAt - Expiry timestamp in seconds, or `null` when unknown.
 */
export function formatTtl(expiresAt: number | null) {
  if (!expiresAt || !Number.isFinite(expiresAt)) {
    return 'unknown'
  }
  const nowSeconds = Math.floor(Date.now() / 1000)
  const seconds = Math.max(0, Math.floor(expiresAt - nowSeconds))
  const minutes = Math.floor(seconds / 60)
  const remainder = seconds % 60
  return `${minutes}:${remainder.toString().padStart(2, '0')}`
}
19
+
20
/**
 * Abbreviates a session id for display: `...` followed by its last five
 * characters, or `'pending'` when there is no id yet (null or empty).
 */
export function shortSessionId(sessionId: string | null) {
  if (!sessionId) {
    return 'pending'
  }
  const tail = sessionId.slice(-5)
  return `...${tail}`
}
frontend/src/lib/streamQuery.test.ts ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { testInternals } from './streamQuery'
2
+
3
+ describe('streamQuery parsing', () => {
4
+ it('parses token and final events', () => {
5
+ expect(
6
+ testInternals.parseSseFrame(
7
+ 'data: {"type":"token","text":"Hi"}\n\ndata: {"type":"final","citations":[],"retrieved":[],"truthfulness":null,"provider":"ollama","model":"llama3"}',
8
+ ),
9
+ ).toEqual([
10
+ { type: 'token', text: 'Hi' },
11
+ {
12
+ type: 'final',
13
+ citations: [],
14
+ retrieved: [],
15
+ truthfulness: null,
16
+ provider: 'ollama',
17
+ model: 'llama3',
18
+ },
19
+ ])
20
+ })
21
+ })
frontend/src/lib/streamQuery.ts ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { API_BASE_URL, ApiError, networkFailureError } from '../api/client'
2
+ import type { CitationModel, QueryRequestModel, RetrievedChunkModel, TruthfulnessModel } from '../api/generated'
3
+
4
+ export type StreamEvent =
5
+ | { type: 'token'; text: string }
6
+ | {
7
+ type: 'final'
8
+ citations: CitationModel[]
9
+ retrieved?: RetrievedChunkModel[]
10
+ truthfulness?: TruthfulnessModel | null
11
+ provider: string
12
+ model: string
13
+ }
14
+ | { type: 'error'; message: string }
15
+
16
+ export interface StreamQueryCallbacks {
17
+ onToken: (text: string) => void
18
+ onFinal: (event: Extract<StreamEvent, { type: 'final' }>) => void
19
+ onError?: (message: string) => void
20
+ }
21
+
22
/**
 * Extracts the JSON events carried by the `data:` lines of an SSE chunk.
 *
 * Lines that do not start with `data:` are ignored, as are empty payloads
 * and the `[DONE]` sentinel. Every remaining payload is parsed on its own;
 * malformed JSON makes `JSON.parse` throw.
 *
 * @param frame - Raw text of one or more SSE frames.
 * @returns The parsed events in their original order.
 */
function parseSseFrame(frame: string): StreamEvent[] {
  const events: StreamEvent[] = []
  for (const line of frame.split('\n')) {
    if (!line.startsWith('data:')) {
      continue
    }
    const payload = line.slice(5).trim()
    if (payload === '' || payload === '[DONE]') {
      continue
    }
    events.push(JSON.parse(payload) as StreamEvent)
  }
  return events
}
30
+
31
/**
 * Builds a readable message for a failed stream response.
 *
 * The body is read as JSON and its `detail` field is used when present:
 * - a string is returned unchanged;
 * - an array is flattened, using each item's `msg` string when it has one
 *   and its JSON form otherwise, joined with `'; '`;
 * - any other value is returned in its JSON form, so objects do not
 *   collapse into `[object Object]`.
 *
 * A body that is not JSON, has no usable `detail`, or yields an empty
 * message falls back to a generic message that includes the HTTP status.
 *
 * @param response - The non-OK response to describe.
 */
async function parseError(response: Response) {
  const fallback = `Stream failed with status ${response.status}`

  let body: unknown
  try {
    body = await response.json()
  } catch {
    return fallback
  }

  const detail: unknown =
    typeof body === 'object' && body !== null ? (body as { detail?: unknown }).detail : undefined
  if (!detail) {
    return fallback
  }
  if (typeof detail === 'string') {
    return detail
  }
  if (Array.isArray(detail)) {
    const parts = detail.map((item: unknown) => {
      if (typeof item === 'string') {
        return item
      }
      const msg = typeof item === 'object' && item !== null ? (item as { msg?: unknown }).msg : undefined
      return typeof msg === 'string' ? msg : JSON.stringify(item)
    })
    return parts.join('; ') || fallback
  }
  return JSON.stringify(detail)
}
39
+
40
/**
 * Posts a query to `/query/stream` and dispatches the server-sent events of
 * the response to `callbacks` as they arrive.
 *
 * - `token` events go to `onToken`, `final` events to `onFinal`.
 * - An `error` event is reported to `onError` (when provided) and then
 *   thrown as an `ApiError`, which ends the stream.
 * - Frames are separated by a blank line; both LF and CRLF line endings are
 *   accepted.
 * - A last frame that arrives without a trailing blank line is still
 *   dispatched. If that trailing text is not valid JSON (the stream was cut
 *   mid-frame) it is ignored.
 * - The reader is cancelled on every exit path so an aborted stream does not
 *   keep the response body locked.
 *
 * @param request - Query payload; `stream: true` is forced onto it.
 * @param callbacks - Handlers for the streamed events.
 * @throws ApiError on a non-OK response, a missing body, or an `error` event.
 * @throws The error built by `networkFailureError` when `fetch` rejects.
 */
export async function streamQuery(request: QueryRequestModel, callbacks: StreamQueryCallbacks) {
  const apiKey = localStorage.getItem('doc-ingestion.api-key')
  const headers = new Headers({ 'Content-Type': 'application/json' })
  if (apiKey) {
    headers.set('X-API-Key', apiKey)
  }

  const streamPath =
    API_BASE_URL && API_BASE_URL.length > 0 ? `${API_BASE_URL}/query/stream` : '/query/stream'
  let response: Response
  try {
    response = await fetch(streamPath, {
      method: 'POST',
      headers,
      body: JSON.stringify({ ...request, stream: true }),
    })
  } catch (cause) {
    throw networkFailureError(cause)
  }

  if (!response.ok) {
    throw new ApiError(await parseError(response), response.status, null)
  }
  if (!response.body) {
    throw new ApiError('Streaming is not supported by this browser.', response.status, null)
  }

  const status = response.status
  const dispatch = (events: StreamEvent[]) => {
    for (const event of events) {
      if (event.type === 'token') {
        callbacks.onToken(event.text)
      } else if (event.type === 'final') {
        callbacks.onFinal(event)
      } else if (event.type === 'error') {
        callbacks.onError?.(event.message)
        throw new ApiError(event.message, status, event)
      }
    }
  }

  const reader = response.body.getReader()
  const decoder = new TextDecoder()
  let buffer = ''

  try {
    while (true) {
      const { value, done } = await reader.read()
      // Normalising the whole buffer also repairs a CRLF pair that was split
      // across two chunks: the stray '\r' stays in the unfinished remainder
      // until its '\n' arrives.
      buffer = (buffer + decoder.decode(value, { stream: !done })).replace(/\r\n/g, '\n')
      const frames = buffer.split('\n\n')
      buffer = frames.pop() ?? ''

      for (const frame of frames) {
        dispatch(parseSseFrame(frame))
      }

      if (done) {
        if (buffer.trim()) {
          // The server closed the stream without a final blank line.
          let trailing: StreamEvent[] = []
          try {
            trailing = parseSseFrame(buffer)
          } catch {
            // Truncated frame: nothing usable to dispatch.
          }
          dispatch(trailing)
        }
        break
      }
    }
  } finally {
    // Best effort: cancel() rejects when the stream has already errored.
    await reader.cancel().catch(() => undefined)
  }
}
95
+
96
+ export const testInternals = { parseSseFrame }
frontend/src/lib/utils.ts ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import { clsx, type ClassValue } from 'clsx'
2
+ import { twMerge } from 'tailwind-merge'
3
+
4
/**
 * Builds a class string from the given class values by passing them through
 * `clsx` and then through `twMerge`.
 *
 * @param inputs - Class values accepted by `clsx`.
 */
export function cn(...inputs: ClassValue[]) {
  const joined = clsx(inputs)
  return twMerge(joined)
}
frontend/src/main.tsx ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import { StrictMode } from 'react'
2
+ import { createRoot } from 'react-dom/client'
3
+ import './index.css'
4
+ import App from './App.tsx'
5
+
6
+ createRoot(document.getElementById('root')!).render(
7
+ <StrictMode>
8
+ <App />
9
+ </StrictMode>,
10
+ )