BrejBala commited on
Commit
e63c592
·
1 Parent(s): c02c9d3

Deploy backend Docker app

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +12 -0
  2. Dockerfile +23 -0
  3. LICENSE +21 -0
  4. backend/.env.example +38 -0
  5. backend/README.md +441 -0
  6. backend/app/__pycache__/main.cpython-313.pyc +0 -0
  7. backend/app/core/__pycache__/cache.cpython-313.pyc +0 -0
  8. backend/app/core/__pycache__/config.cpython-313.pyc +0 -0
  9. backend/app/core/__pycache__/errors.cpython-313.pyc +0 -0
  10. backend/app/core/__pycache__/logging.cpython-313.pyc +0 -0
  11. backend/app/core/__pycache__/metrics.cpython-313.pyc +0 -0
  12. backend/app/core/__pycache__/rate_limit.cpython-313.pyc +0 -0
  13. backend/app/core/__pycache__/runtime.cpython-313.pyc +0 -0
  14. backend/app/core/__pycache__/security.cpython-313.pyc +0 -0
  15. backend/app/core/__pycache__/tracing.cpython-313.pyc +0 -0
  16. backend/app/core/cache.py +162 -0
  17. backend/app/core/config.py +118 -0
  18. backend/app/core/errors.py +83 -0
  19. backend/app/core/logging.py +19 -0
  20. backend/app/core/metrics.py +129 -0
  21. backend/app/core/rate_limit.py +58 -0
  22. backend/app/core/runtime.py +31 -0
  23. backend/app/core/security.py +116 -0
  24. backend/app/core/tracing.py +60 -0
  25. backend/app/main.py +61 -0
  26. backend/app/routers/__pycache__/chat.cpython-313.pyc +0 -0
  27. backend/app/routers/__pycache__/documents.cpython-313.pyc +0 -0
  28. backend/app/routers/__pycache__/health.cpython-313.pyc +0 -0
  29. backend/app/routers/__pycache__/ingest.cpython-313.pyc +0 -0
  30. backend/app/routers/__pycache__/metrics.cpython-313.pyc +0 -0
  31. backend/app/routers/__pycache__/search.cpython-313.pyc +0 -0
  32. backend/app/routers/chat.py +280 -0
  33. backend/app/routers/documents.py +100 -0
  34. backend/app/routers/health.py +19 -0
  35. backend/app/routers/ingest.py +194 -0
  36. backend/app/routers/metrics.py +18 -0
  37. backend/app/routers/search.py +90 -0
  38. backend/app/schemas/__pycache__/chat.cpython-313.pyc +0 -0
  39. backend/app/schemas/__pycache__/documents.cpython-313.pyc +0 -0
  40. backend/app/schemas/__pycache__/ingest.cpython-313.pyc +0 -0
  41. backend/app/schemas/__pycache__/search.cpython-313.pyc +0 -0
  42. backend/app/schemas/chat.py +128 -0
  43. backend/app/schemas/documents.py +34 -0
  44. backend/app/schemas/ingest.py +56 -0
  45. backend/app/schemas/search.py +33 -0
  46. backend/app/services/__pycache__/chunking.cpython-313.pyc +0 -0
  47. backend/app/services/__pycache__/dedupe.cpython-313.pyc +0 -0
  48. backend/app/services/__pycache__/normalize.cpython-313.pyc +0 -0
  49. backend/app/services/__pycache__/pinecone_store.cpython-313.pyc +0 -0
  50. backend/app/services/chat/__pycache__/graph.cpython-313.pyc +0 -0
.dockerignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **/__pycache__/
2
+ **/*.pyc
3
+ **/.pytest_cache/
4
+ **/.mypy_cache/
5
+ **/.ruff_cache/
6
+ **/.venv/
7
+ **/venv/
8
+ **/.env
9
+ **/.env.*
10
+ .git/
11
+ .gitignore
12
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Don't write .pyc files; flush stdout/stderr immediately so container logs stream.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Install dependencies
# Copy requirements first so the pip layer stays cached across code-only changes.
COPY backend/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

# Copy application code
COPY backend /app

# Hugging Face Spaces typically exposes port 7860 and sets PORT dynamically.
EXPOSE 7860

# Runtime defaults; overridable via Space secrets or `docker run -e`.
ENV PINECONE_NAMESPACE=dev
ENV LOG_LEVEL=INFO

# Use the PORT environment variable if provided (e.g. on Hugging Face Spaces),
# otherwise default to 7860. Shell form allows env substitution.
CMD uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-7860}
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Brejesh Balakrishnan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
backend/.env.example ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PINECONE_API_KEY=your-pinecone-api-key
2
+ PINECONE_INDEX_NAME=your-index-name
3
+ PINECONE_HOST=https://your-index-host.pinecone.io
4
+ PINECONE_NAMESPACE=dev
5
+ PINECONE_TEXT_FIELD=chunk_text
6
+
7
+ # Groq (LLM for /chat)
8
+ GROQ_API_KEY=your-groq-api-key
9
+ GROQ_BASE_URL=https://api.groq.com/openai/v1
10
+ GROQ_MODEL=llama-3.1-8b-instant
11
+
12
+ # Tavily (optional web search fallback for /chat)
13
+ TAVILY_API_KEY=your-tavily-api-key
14
+
15
+ # Optional: LangSmith / LangChain tracing
16
+ LANGCHAIN_TRACING_V2=false
17
+ LANGCHAIN_API_KEY=your-langsmith-api-key
18
+ LANGCHAIN_PROJECT=rag-agent-workbench
19
+
20
+ # Optional: basic API protection
21
+ # When set, /ingest/*, /documents/*, /search, and /chat* require header X-API-Key
22
+ API_KEY=your-backend-api-key
23
+
24
+ # Optional: CORS
25
+ # Comma-separated list of allowed origins, e.g.
26
+ # ALLOWED_ORIGINS=http://localhost:8501,https://your-streamlit-app.streamlit.app
27
+ # When unset, defaults to "*".
28
+ ALLOWED_ORIGINS=
29
+
30
+ # Optional: rate limiting and caching toggles
31
+ # Set to "false" to disable.
32
+ RATE_LIMIT_ENABLED=true
33
+ CACHE_ENABLED=true
34
+
35
+ # Optional: override the default Wikimedia User-Agent string used for Wikipedia requests.
36
+ # If not set, a descriptive default is used to comply with Wikimedia API policy.
37
+ WIKIMEDIA_USER_AGENT="rag-agent-workbench/0.1 (+https://github.com/..; contact: ..)"
38
+ LOG_LEVEL=INFO
backend/README.md ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG Agent Workbench – Backend
2
+
3
+ Lightweight FastAPI backend for ingesting documents into Pinecone (with integrated embeddings), searching over them, and serving a production-style RAG chat endpoint.
4
+
5
+ ## Prerequisites
6
+
7
+ - Python 3.11+
8
+ - A Pinecone account and an index configured with **integrated embeddings**
9
+ - A Groq account and API key for chat
10
+ - (Optional) Tavily API key for web search fallback
11
+ - (Optional) LangSmith account + API key for tracing
12
+ - Environment variables set (see `.env.example`)
13
+
14
+ ## Setup
15
+
16
+ ```bash
17
+ cd backend
18
+ python -m venv .venv
19
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
20
+ pip install -r requirements.txt
21
+ cp .env.example .env # then edit with your Pinecone, Groq, and optional Tavily/LangSmith credentials
22
+ ```
23
+
24
+ Required `.env` values:
25
+
26
+ - `PINECONE_API_KEY` – your Pinecone API key
27
+ - `PINECONE_INDEX_NAME` – the index name (used for configuration checks)
28
+ - `PINECONE_HOST` – the index host URL (use host targeting for production)
29
+ - `PINECONE_NAMESPACE` – default namespace (e.g. `dev`)
30
+ - `PINECONE_TEXT_FIELD` – text field name used by the integrated embedding index (e.g. `chunk_text` or `content`)
31
+ - `LOG_LEVEL` – e.g. `INFO`, `DEBUG`
32
+
33
+ Required for `/chat`:
34
+
35
+ - `GROQ_API_KEY` – your Groq API key
36
+ - `GROQ_BASE_URL` – Groq OpenAI-compatible endpoint (default `https://api.groq.com/openai/v1`)
37
+ - `GROQ_MODEL` – Groq chat model name (default `llama-3.1-8b-instant`)
38
+
39
+ Optional for web search fallback:
40
+
41
+ - `TAVILY_API_KEY` – Tavily API key (enables web search in `/chat` when retrieval is weak)
42
+
43
+ Optional for LangSmith tracing:
44
+
45
+ - `LANGCHAIN_TRACING_V2` – set to `true` to enable tracing
46
+ - `LANGCHAIN_API_KEY` – your LangSmith API key
47
+ - `LANGCHAIN_PROJECT` – project name for traces (e.g. `rag-agent-workbench`)
48
+
49
+ Optional for basic API protection:
50
+
51
+ - `API_KEY` – when set, `/ingest/*`, `/documents/*`, `/search`, and `/chat*` require `X-API-Key` header.
52
+
53
+ Optional for CORS:
54
+
55
+ - `ALLOWED_ORIGINS` – comma-separated list of allowed origins.
56
+ - If unset, defaults to `"*"` (useful for local dev and quick demos).
57
+
58
+ Optional for rate limiting and caching:
59
+
60
+ - `RATE_LIMIT_ENABLED` – defaults to `true`. Set to `false` to disable SlowAPI limits.
61
+ - `CACHE_ENABLED` – defaults to `true`. Set to `false` to disable in-memory TTL caches.
62
+
63
+ Your Pinecone index **must** be configured for integrated embeddings (e.g. via `create_index_for_model` or `configure_index(embed=...)`), with a field mapping that includes the configured `PINECONE_TEXT_FIELD`.
64
+
65
+ ## Run locally
66
+
67
+ ```bash
68
+ cd backend
69
+ uvicorn app.main:app --reload --port 8000
70
+ ```
71
+
72
+ The API will be available at `http://localhost:8000`.
73
+
74
+ ## Sample endpoints
75
+
76
+ ### Health
77
+
78
+ ```bash
79
+ curl http://localhost:8000/health
80
+ ```
81
+
82
+ ### Ingest from arXiv
83
+
84
+ ```bash
85
+ curl -X POST "http://localhost:8000/ingest/arxiv" \
86
+ -H "Content-Type: application/json" \
87
+ -d '{
88
+ "query": "retrieval augmented generation",
89
+ "max_docs": 5,
90
+ "namespace": "dev",
91
+ "category": "papers"
92
+ }'
93
+ ```
94
+
95
+ ### Ingest from OpenAlex
96
+
97
+ ```bash
98
+ curl -X POST "http://localhost:8000/ingest/openalex" \
99
+ -H "Content-Type: application/json" \
100
+ -d '{
101
+ "query": "retrieval augmented generation",
102
+ "max_docs": 5,
103
+ "namespace": "dev",
104
+ "mailto": "you@example.com"
105
+ }'
106
+ ```
107
+
108
+ ### Ingest from Wikipedia
109
+
110
+ ```bash
111
+ curl -X POST "http://localhost:8000/ingest/wiki" \
112
+ -H "Content-Type: application/json" \
113
+ -d '{
114
+ "titles": ["Retrieval-augmented generation", "Vector database"],
115
+ "namespace": "dev"
116
+ }'
117
+ ```
118
+
119
+ ### Manual text upload
120
+
121
+ ```bash
122
+ curl -X POST "http://localhost:8000/documents/upload-text" \
123
+ -H "Content-Type: application/json" \
124
+ -d '{
125
+ "title": "My manual note",
126
+ "source": "manual",
127
+ "text": "This is some example text describing RAG pipelines...",
128
+ "namespace": "dev",
129
+ "metadata": {
130
+ "url": "https://example.com/my-note"
131
+ }
132
+ }'
133
+ ```
134
+
135
+ ### Search
136
+
137
+ ```bash
138
+ curl -X POST "http://localhost:8000/search" \
139
+ -H "Content-Type: application/json" \
140
+ -H "X-API-Key: $API_KEY" \ # only if API_KEY is enabled
141
+ -d '{
142
+ "query": "what is RAG",
143
+ "top_k": 5,
144
+ "namespace": "dev",
145
+ "filters": {"source": "arxiv"}
146
+ }'
147
+ ```
148
+
149
+ ### Document stats
150
+
151
+ ```bash
152
+ curl "http://localhost:8000/documents/stats?namespace=dev"
153
+ ```
154
+
155
+ ### Chat (non-streaming)
156
+
157
+ ```bash
158
+ curl -X POST "http://localhost:8000/chat" \
159
+ -H "Content-Type: application/json" \
160
+ -H "X-API-Key: $API_KEY" \ # only if API_KEY is enabled
161
+ -d '{
162
+ "query": "What is retrieval-augmented generation?",
163
+ "namespace": "dev",
164
+ "top_k": 5,
165
+ "use_web_fallback": true,
166
+ "min_score": 0.25,
167
+ "max_web_results": 5,
168
+ "chat_history": [
169
+ {"role": "user", "content": "You are helping me understand RAG."}
170
+ ]
171
+ }'
172
+ ```
173
+
174
+ Example JSON response:
175
+
176
+ ```json
177
+ {
178
+ "answer": "...",
179
+ "sources": [
180
+ {
181
+ "source": "wiki",
182
+ "title": "Retrieval-augmented generation",
183
+ "url": "https://en.wikipedia.org/wiki/...",
184
+ "score": 0.91,
185
+ "chunk_text": "..."
186
+ }
187
+ ],
188
+ "timings": {
189
+ "retrieve_ms": 35.2,
190
+ "web_ms": 0.0,
191
+ "generate_ms": 420.7,
192
+ "total_ms": 470.1
193
+ },
194
+ "trace": {
195
+ "langsmith_project": "rag-agent-workbench",
196
+ "trace_enabled": true
197
+ }
198
+ }
199
+ ```
200
+
201
+ ### Chat (SSE streaming)
202
+
203
+ ```bash
204
+ curl -N -X POST "http://localhost:8000/chat/stream" \
205
+ -H "Content-Type: application/json" \
206
+ -H "X-API-Key: $API_KEY" \ # only if API_KEY is enabled
207
+ -d '{
208
+ "query": "Summarise retrieval-augmented generation.",
209
+ "namespace": "dev",
210
+ "top_k": 5,
211
+ "use_web_fallback": true
212
+ }'
213
+ ```
214
+
215
+ - The response will be `text/event-stream`.
216
+ - Individual SSE events stream tokens (space-delimited).
217
+ - The final event (`event: end`) includes the full JSON payload as in `/chat`.
218
+
219
+ ### Metrics
220
+
221
+ ```bash
222
+ curl "http://localhost:8000/metrics"
223
+ ```
224
+
225
+ Returns JSON with:
226
+
227
+ - `requests_by_path` and `errors_by_path`
228
+ - `timings` (average and p50/p95 for `retrieve_ms`, `web_ms`, `generate_ms`, `total_ms`)
229
+ - `cache` stats
230
+ - Last 20 timing samples for chat.
231
+
232
+ ## Seeding data
233
+
234
+ A helper script is provided to seed the index with multiple arXiv and OpenAlex queries:
235
+
236
+ ```bash
237
+ python ../scripts/seed_ingest.py --base-url http://localhost:8000 --namespace dev --mailto you@example.com
238
+ ```
239
+
240
+ ## Docling integration (external script)
241
+
242
+ Docling is used via a separate script so the backend container stays small. To convert a local PDF and upload it as text:
243
+
244
+ ```bash
245
+ cd scripts
246
+ pip install docling
247
+ python docling_convert_and_upload.py \
248
+ --pdf-path /path/to/file.pdf \
249
+ --backend-url http://localhost:8000 \
250
+ --namespace dev \
251
+ --title "My PDF via Docling" \
252
+ --source docling
253
+ ```
254
+
255
+ ## Deploy Backend on Hugging Face Spaces (Docker)
256
+
257
+ 1. **Create a new Space**
258
+ - Go to Hugging Face → *New Space*.
259
+ - Choose:
260
+ - **SDK**: Docker
261
+ - **Space name**: e.g. `your-name/rag-agent-workbench-backend`.
262
+ - Point the Space to this repository and configure it to use the `backend/` subdirectory (or copy `backend/Dockerfile` to the root if you prefer).
263
+
264
+ 2. **Environment variables / secrets**
265
+
266
+ In the Space settings, configure the following (as “Secrets” where appropriate):
267
+
268
+ Required:
269
+
270
+ - `PINECONE_API_KEY`
271
+ - `PINECONE_HOST`
272
+ - `PINECONE_INDEX_NAME`
273
+ - `PINECONE_NAMESPACE`
274
+ - `PINECONE_TEXT_FIELD=content` (or your actual text field)
275
+ - `GROQ_API_KEY`
276
+ - `GROQ_BASE_URL` (optional, defaults to `https://api.groq.com/openai/v1`)
277
+ - `GROQ_MODEL` (optional, defaults to `llama-3.1-8b-instant`)
278
+
279
+ Optional:
280
+
281
+ - `TAVILY_API_KEY` (web search fallback for `/chat`)
282
+ - `LANGCHAIN_TRACING_V2`
283
+ - `LANGCHAIN_API_KEY`
284
+ - `LANGCHAIN_PROJECT`
285
+ - `API_KEY` (to protect `/ingest/*`, `/documents/*`, `/search`, `/chat*`)
286
+ - `ALLOWED_ORIGINS` (e.g. your Streamlit frontend origin)
287
+ - `RATE_LIMIT_ENABLED` and `CACHE_ENABLED` (rarely need to change from defaults)
288
+
289
+ 3. **Ports and startup**
290
+
291
+ - The Docker image exposes port **7860** by default.
292
+ - Hugging Face Spaces sets the `PORT` environment variable; the `CMD` honours it:
293
+ - `uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-7860}`
294
+ - On successful startup, logs include:
295
+ - `Starting on port=<port> hf_spaces_mode=<bool>`
296
+
297
+ 4. **Verify**
298
+
299
+ - Open your Space URL:
300
+ - `https://<your-space>.hf.space/docs` – interactive API docs.
301
+ - `https://<your-space>.hf.space/health` – health check.
302
+ - If `API_KEY` is set, test protected endpoints using `X-API-Key`.
303
+
304
+ ## Deploy Frontend on Streamlit Community Cloud
305
+
306
+ 1. **Prepare the repo**
307
+
308
+ - The minimal Streamlit frontend lives under `frontend/app.py`.
309
+ - Root `requirements.txt` includes:
310
+ - `streamlit`
311
+ - `httpx`
312
+
313
+ 2. **Create Streamlit app**
314
+
315
+ - Go to Streamlit Community Cloud and create a new app.
316
+ - Point it at this repository.
317
+ - Set the main file to `frontend/app.py`.
318
+
319
+ 3. **Configure Streamlit secrets**
320
+
321
+ - In the Streamlit app settings, configure *Secrets* (TOML, the format of `secrets.toml`):
322
+
323
+ ```toml
324
+ BACKEND_BASE_URL = "https://<your-backend-space>.hf.space"
325
+ API_KEY = "your-backend-api-key" # only if backend API_KEY is set
326
+ ```
327
+
328
+ - **Do not** commit secrets into the repo.
329
+
330
+ 4. **Verify connectivity**
331
+
332
+ - Open the Streamlit app.
333
+ - In the sidebar “Connectivity” panel:
334
+ - Confirm the backend URL is correct.
335
+ - Click “Ping /health” to verify backend connectivity.
336
+ - Use the chat panel to send a question:
337
+ - The app will call `/chat` on the backend and display answer, timings, and sources.
338
+
339
+ ## Local Test Checklist – Work Package C
340
+
341
+ 1. **Configure environment**
342
+
343
+ - Set `PINECONE_*` variables for an integrated embeddings index.
344
+ - Set `GROQ_API_KEY` (and optionally override `GROQ_BASE_URL`, `GROQ_MODEL`).
345
+ - Optionally set `TAVILY_API_KEY` for web fallback.
346
+ - Optionally enable LangSmith:
347
+ - `LANGCHAIN_TRACING_V2=true`
348
+ - `LANGCHAIN_API_KEY=...`
349
+ - `LANGCHAIN_PROJECT=rag-agent-workbench`
350
+ - Optionally set:
351
+ - `API_KEY` for basic protection.
352
+ - `ALLOWED_ORIGINS` if you are calling from a browser origin.
353
+ - `RATE_LIMIT_ENABLED` / `CACHE_ENABLED` for tuning.
354
+
355
+ 2. **Start the backend**
356
+
357
+ ```bash
358
+ cd backend
359
+ uvicorn app.main:app --reload --port 8000
360
+ ```
361
+
362
+ 3. **Ingest data**
363
+
364
+ - Quick Wikipedia smoke test (also see `scripts/smoke_chat.py`):
365
+
366
+ ```bash
367
+ python ../scripts/smoke_chat.py --backend-url http://localhost:8000 --namespace dev
368
+ ```
369
+
370
+ 4. **Test `/search`**
371
+
372
+ ```bash
373
+ curl -X POST "http://localhost:8000/search" \
374
+ -H "Content-Type: application/json" \
375
+ -H "X-API-Key: $API_KEY" \ # only if API_KEY is enabled
376
+ -d '{"query": "what is RAG", "namespace": "dev", "top_k": 5}'
377
+ ```
378
+
379
+ 5. **Test `/chat`**
380
+
381
+ - Use the curl example above or run:
382
+
383
+ ```bash
384
+ curl -X POST "http://localhost:8000/chat" \
385
+ -H "Content-Type: application/json" \
386
+ -H "X-API-Key: $API_KEY" \ # only if API_KEY is enabled
387
+ -d '{"query": "What is retrieval-augmented generation?", "namespace": "dev"}'
388
+ ```
389
+
390
+ 6. **Test `/chat` with web fallback**
391
+
392
+ - Requires `TAVILY_API_KEY`:
393
+
394
+ ```bash
395
+ python ../scripts/smoke_chat_web.py --backend-url http://localhost:8000 --namespace dev
396
+ ```
397
+
398
+ 7. **Inspect `/metrics`**
399
+
400
+ ```bash
401
+ curl "http://localhost:8000/metrics"
402
+ ```
403
+
404
+ - Confirm:
405
+ - Request counts are increasing.
406
+ - Timing stats (`average_ms`, `p50_ms`, `p95_ms`) are populated after several `/chat` calls.
407
+ - Cache hit/miss counters change when repeating identical `/search` or `/chat` requests.
408
+
409
+ 8. **Run the benchmark script**
410
+
411
+ - From the repo root:
412
+
413
+ ```bash
414
+ python scripts/bench_local.py \
415
+ --backend-url http://localhost:8000 \
416
+ --namespace dev \
417
+ --concurrency 10 \
418
+ --requests 50 \
419
+ --api-key "$API_KEY"
420
+ ```
421
+
422
+ - Review reported:
423
+ - Average latency.
424
+ - p50 / p95 latency.
425
+ - Error rate.
426
+
427
+ 9. **Optional: Test Streamlit frontend locally**
428
+
429
+ - Install root requirements:
430
+
431
+ ```bash
432
+ pip install -r requirements.txt
433
+ ```
434
+
435
+ - Run:
436
+
437
+ ```bash
438
+ streamlit run frontend/app.py
439
+ ```
440
+
441
+ - Configure `BACKEND_BASE_URL` and `API_KEY` via environment or `.streamlit/secrets.toml`, and verify chat works end-to-end.
backend/app/__pycache__/main.cpython-313.pyc ADDED
Binary file (2.45 kB). View file
 
backend/app/core/__pycache__/cache.cpython-313.pyc ADDED
Binary file (5.12 kB). View file
 
backend/app/core/__pycache__/config.cpython-313.pyc ADDED
Binary file (4.09 kB). View file
 
backend/app/core/__pycache__/errors.cpython-313.pyc ADDED
Binary file (4.89 kB). View file
 
backend/app/core/__pycache__/logging.cpython-313.pyc ADDED
Binary file (1.11 kB). View file
 
backend/app/core/__pycache__/metrics.cpython-313.pyc ADDED
Binary file (5.86 kB). View file
 
backend/app/core/__pycache__/rate_limit.cpython-313.pyc ADDED
Binary file (2.53 kB). View file
 
backend/app/core/__pycache__/runtime.cpython-313.pyc ADDED
Binary file (1.35 kB). View file
 
backend/app/core/__pycache__/security.cpython-313.pyc ADDED
Binary file (5.04 kB). View file
 
backend/app/core/__pycache__/tracing.cpython-313.pyc ADDED
Binary file (2.75 kB). View file
 
backend/app/core/cache.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from threading import Lock
from typing import Any, Dict, Hashable, Optional, Tuple

from cachetools import TTLCache

from app.core.config import get_settings
from app.core.logging import get_logger

logger = get_logger(__name__)

# Resolved once at import time; getattr() tolerates Settings versions that
# predate the CACHE_ENABLED field by defaulting to enabled.
_settings = get_settings()
_CACHE_ENABLED: bool = getattr(_settings, "CACHE_ENABLED", True)

# TTLs are intentionally short and in-code defaults; no env required.
_SEARCH_TTL_SECONDS = 60
_CHAT_TTL_SECONDS = 60

# Bounded TTL caches: entries expire after the TTL; least-recently-used
# entries are evicted once maxsize is reached.
_search_cache: TTLCache = TTLCache(maxsize=1024, ttl=_SEARCH_TTL_SECONDS)
_chat_cache: TTLCache = TTLCache(maxsize=512, ttl=_CHAT_TTL_SECONDS)

# Single lock guarding both caches and the hit/miss counters below.
_lock = Lock()

# Hit/miss counters, surfaced via get_cache_stats() for the /metrics endpoint.
_search_hits: int = 0
_search_misses: int = 0
_chat_hits: int = 0
_chat_misses: int = 0
28
+
29
+
30
def cache_enabled() -> bool:
    """Return True when in-memory TTL caching is active (CACHE_ENABLED setting)."""
    return _CACHE_ENABLED
32
+
33
+
34
+ def _make_search_key(
35
+ namespace: str,
36
+ query: str,
37
+ top_k: int,
38
+ filters: Optional[Dict[str, Any]],
39
+ ) -> Hashable:
40
+ filters_json = (
41
+ json.dumps(filters, sort_keys=True, separators=(",", ":"))
42
+ if filters is not None
43
+ else ""
44
+ )
45
+ return (namespace, query, int(top_k), filters_json)
46
+
47
+
48
+ def _make_chat_key(
49
+ namespace: str,
50
+ query: str,
51
+ top_k: int,
52
+ min_score: float,
53
+ use_web_fallback: bool,
54
+ ) -> Hashable:
55
+ return (namespace, query, int(top_k), float(min_score), bool(use_web_fallback))
56
+
57
+
58
def get_search_cached(
    namespace: str,
    query: str,
    top_k: int,
    filters: Optional[Dict[str, Any]],
) -> Optional[Any]:
    """Look up a cached /search result; None signals a miss or caching disabled."""
    global _search_hits, _search_misses
    if not _CACHE_ENABLED:
        return None

    key = _make_search_key(namespace, query, top_k, filters)
    with _lock:
        # EAFP: a single lookup; TTLCache raises KeyError for absent/expired keys.
        try:
            value = _search_cache[key]
        except KeyError:
            _search_misses += 1
            logger.info(
                "Search cache miss namespace='%s' query='%s' top_k=%d",
                namespace,
                query,
                top_k,
            )
            return None
        _search_hits += 1
        logger.info(
            "Search cache hit namespace='%s' query='%s' top_k=%d",
            namespace,
            query,
            top_k,
        )
        return value
88
+
89
+
90
def set_search_cached(
    namespace: str,
    query: str,
    top_k: int,
    filters: Optional[Dict[str, Any]],
    value: Any,
) -> None:
    """Store a /search result under its derived key; no-op when caching is off."""
    if not _CACHE_ENABLED:
        return
    entry_key = _make_search_key(namespace, query, top_k, filters)
    with _lock:
        _search_cache[entry_key] = value
102
+
103
+
104
def get_chat_cached(
    namespace: str,
    query: str,
    top_k: int,
    min_score: float,
    use_web_fallback: bool,
) -> Optional[Any]:
    """Look up a cached /chat response; None signals a miss or caching disabled.

    Only used when chat_history is empty.
    """
    global _chat_hits, _chat_misses
    if not _CACHE_ENABLED:
        return None

    key = _make_chat_key(namespace, query, top_k, min_score, use_web_fallback)
    with _lock:
        # EAFP: a single lookup; TTLCache raises KeyError for absent/expired keys.
        try:
            value = _chat_cache[key]
        except KeyError:
            _chat_misses += 1
            logger.info(
                "Chat cache miss namespace='%s' query='%s' top_k=%d",
                namespace,
                query,
                top_k,
            )
            return None
        _chat_hits += 1
        logger.info(
            "Chat cache hit namespace='%s' query='%s' top_k=%d",
            namespace,
            query,
            top_k,
        )
        return value
138
+
139
+
140
def set_chat_cached(
    namespace: str,
    query: str,
    top_k: int,
    min_score: float,
    use_web_fallback: bool,
    value: Any,
) -> None:
    """Store a /chat response under its derived key; no-op when caching is off."""
    if not _CACHE_ENABLED:
        return
    entry_key = _make_chat_key(namespace, query, top_k, min_score, use_web_fallback)
    with _lock:
        _chat_cache[entry_key] = value
153
+
154
+
155
def get_cache_stats() -> Dict[str, int]:
    """Return a consistent snapshot of the cache hit/miss counters.

    The counters are mutated under ``_lock`` by the get_*_cached functions,
    so the read is taken under the same lock to guarantee the four values
    come from a single point in time (the original read them unlocked and
    could return a torn snapshot under concurrent updates).
    """
    with _lock:
        return {
            "search_hits": _search_hits,
            "search_misses": _search_misses,
            "chat_hits": _chat_hits,
            "chat_misses": _chat_misses,
        }
backend/app/core/config.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from functools import lru_cache
3
+ from typing import Optional
4
+
5
+ from dotenv import load_dotenv
6
+ from pydantic import Field
7
+ from pydantic_settings import BaseSettings, SettingsConfigDict
8
+
9
+ # Load environment variables from a local .env file if present
10
+ load_dotenv()
11
+
12
+
13
class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Fields declared with ``Field(...)`` (Ellipsis) are required: constructing
    Settings without them raises a pydantic ValidationError at startup.
    Unknown environment variables are ignored (``extra="ignore"``).
    """

    # App
    APP_NAME: str = Field(default="rag-agent-workbench")
    APP_VERSION: str = Field(default="0.1.0")

    # Pinecone — API key, index name and host are required; the host is used
    # for data-plane targeting while the index name is used for config checks.
    PINECONE_API_KEY: str = Field(..., description="Pinecone API key")
    PINECONE_INDEX_NAME: str = Field(
        ..., description="Name of the Pinecone index (used for configuration checks)"
    )
    PINECONE_HOST: str = Field(
        ..., description="Pinecone index host URL for data-plane operations"
    )
    PINECONE_NAMESPACE: str = Field(
        default="dev", description="Default Pinecone namespace"
    )
    PINECONE_TEXT_FIELD: str = Field(
        default="chunk_text",
        description=(
            "Text field name used by the Pinecone integrated embedding index. "
            "For example, set to 'content' if your index field_map uses that name."
        ),
    )

    # Logging
    LOG_LEVEL: str = Field(default="INFO", description="Application log level")

    # HTTP client defaults
    HTTP_TIMEOUT_SECONDS: float = Field(
        default=10.0, description="Default timeout for outbound HTTP requests"
    )
    HTTP_MAX_RETRIES: int = Field(
        default=3, description="Max retries for outbound HTTP requests"
    )

    # Groq / LLM — optional at import time; /chat endpoints need GROQ_API_KEY.
    GROQ_API_KEY: Optional[str] = Field(
        default=None,
        description="Groq API key (required for /chat endpoints)",
    )
    GROQ_BASE_URL: str = Field(
        default="https://api.groq.com/openai/v1",
        description="Groq OpenAI-compatible base URL",
    )
    GROQ_MODEL: str = Field(
        default="llama-3.1-8b-instant",
        description="Default Groq chat model used for /chat",
    )

    # Web search / Tavily — optional; absence disables the web fallback.
    TAVILY_API_KEY: Optional[str] = Field(
        default=None,
        description="Tavily API key for web search fallback (optional)",
    )

    # RAG defaults — ge/le bounds are validated by pydantic on load.
    RAG_DEFAULT_TOP_K: int = Field(
        default=5,
        ge=1,
        le=100,
        description="Default number of documents to retrieve for RAG",
    )
    RAG_MIN_SCORE: float = Field(
        default=0.25,
        ge=0.0,
        le=1.0,
        description="Default minimum relevance score to trust retrieval without web fallback",
    )
    RAG_MAX_WEB_RESULTS: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Maximum number of web search results to fetch when using Tavily",
    )

    # Operational toggles
    RATE_LIMIT_ENABLED: bool = Field(
        default=True,
        description="Enable SlowAPI rate limiting middleware when true.",
    )
    CACHE_ENABLED: bool = Field(
        default=True,
        description="Enable in-memory TTL caching for /search and /chat when true.",
    )

    # Read values from .env as well as the process environment; silently
    # ignore variables that have no matching field.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )
105
+
106
+
107
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return a cached Settings instance.

    lru_cache(maxsize=1) makes this a process-wide singleton: the environment
    and .env file are read only once per process.
    """
    return Settings()  # type: ignore[call-arg]
111
+
112
+
113
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse an environment variable as a boolean flag.

    "1", "true", "yes" and "on" (case-insensitive, whitespace-tolerant) count
    as True; any other value is False. A missing variable yields *default*.
    """
    value = os.getenv(name)
    if value is None:
        return default
    normalized = value.strip().lower()
    return normalized in {"1", "true", "yes", "on"}
backend/app/core/errors.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Any
3
+
4
+ from fastapi import FastAPI, HTTPException, Request
5
+ from fastapi.exceptions import RequestValidationError
6
+ from fastapi.responses import JSONResponse
7
+ from starlette import status
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class PineconeIndexConfigError(RuntimeError):
    """Raised when the Pinecone index is not configured for integrated embeddings.

    Mapped to an HTTP 500 response by the handler registered in
    setup_exception_handlers below.
    """
14
+
15
+
16
class UpstreamServiceError(RuntimeError):
    """Raised when an upstream dependency (LLM, web search, etc.) fails.

    Mapped to an HTTP 502 response by the handler registered in
    setup_exception_handlers below.
    """

    def __init__(self, service: str, message: str) -> None:
        # Keep the failing service's name so handlers/logs can attribute the error.
        self.service = service
        super().__init__(message)
22
+
23
+
24
def setup_exception_handlers(app: FastAPI) -> None:
    """Register global exception handlers on the FastAPI app.

    Handlers map domain errors to HTTP statuses (config error -> 500,
    upstream failure -> 502), mirror FastAPI's defaults for validation and
    HTTPException, and add a catch-all 500 that hides internals from the
    client. Every response body uses the same ``{"detail": ...}`` shape.
    """

    @app.exception_handler(PineconeIndexConfigError)
    async def pinecone_index_config_error_handler(
        request: Request, exc: PineconeIndexConfigError
    ) -> JSONResponse:
        # A misconfigured index is an operator problem, not a client one -> 500.
        logger.error("Pinecone index configuration error: %s", exc)
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={"detail": str(exc)},
        )

    @app.exception_handler(UpstreamServiceError)
    async def upstream_service_error_handler(
        request: Request, exc: UpstreamServiceError
    ) -> JSONResponse:
        # Failures of dependencies (LLM, web search, ...) surface as 502.
        logger.error("Upstream service error from %s: %s", exc.service, exc)
        return JSONResponse(
            status_code=status.HTTP_502_BAD_GATEWAY,
            content={"detail": str(exc)},
        )

    @app.exception_handler(RequestValidationError)
    async def validation_exception_handler(
        request: Request, exc: RequestValidationError
    ) -> JSONResponse:
        # Same 422 payload shape FastAPI emits by default, plus our logging.
        logger.warning("Request validation error: %s", exc)
        return JSONResponse(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            content={"detail": exc.errors()},
        )

    @app.exception_handler(HTTPException)
    async def http_exception_handler(
        request: Request, exc: HTTPException
    ) -> JSONResponse:
        # Let FastAPI-style HTTPException pass through with its status and detail.
        logger.warning(
            "HTTPException raised: status=%s detail=%s",
            exc.status_code,
            exc.detail,
        )
        return JSONResponse(
            status_code=exc.status_code,
            content={"detail": exc.detail},
        )

    @app.exception_handler(Exception)
    async def generic_exception_handler(
        request: Request, exc: Exception
    ) -> JSONResponse:
        # Last resort: log the full traceback but return a generic message
        # so internals never leak to clients.
        logger.exception("Unhandled error", exc_info=exc)
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={"detail": "Internal server error"},
        )
81
+
82
+
83
# Public API of this module. UpstreamServiceError was previously missing here
# even though it is a public exception class defined (and handled) above.
__all__ = [
    "PineconeIndexConfigError",
    "UpstreamServiceError",
    "setup_exception_handlers",
]
backend/app/core/logging.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional
3
+
4
+
5
def configure_logging(log_level: str) -> None:
    """Configure root logging for the application.

    The level name is case-insensitive; unrecognised names fall back to
    INFO. Configuration is kept minimal while ensuring every module logs
    with the same format.
    """
    resolved_level = getattr(logging, log_level.upper(), logging.INFO)
    log_format = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
    logging.basicConfig(level=resolved_level, format=log_format)
15
+
16
+
17
def get_logger(name: Optional[str] = None) -> logging.Logger:
    """Return the named logger, falling back to the app-wide "app" logger."""
    if name:
        return logging.getLogger(name)
    return logging.getLogger("app")
backend/app/core/metrics.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict, deque
2
+ from threading import Lock
3
+ from time import perf_counter
4
+ from typing import Deque, Dict, List, Mapping, MutableMapping
5
+
6
+ from fastapi import FastAPI, Request
7
+ from fastapi.responses import JSONResponse
8
+
9
+ from app.core.cache import get_cache_stats
10
+ from app.core.logging import get_logger
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
# Request and error counters by path. Lifetime totals; never trimmed.
_request_counts: MutableMapping[str, int] = defaultdict(int)
_error_counts: MutableMapping[str, int] = defaultdict(int)

# Timing samples for chat requests: last N samples.
# Percentiles in get_metrics_snapshot() are computed over this rolling
# window only, while averages use the lifetime sums below.
_TIMING_FIELDS = ["retrieve_ms", "web_ms", "generate_ms", "total_ms"]
_TIMING_BUFFER_SIZE = 20
_timing_samples: Deque[Dict[str, float]] = deque(maxlen=_TIMING_BUFFER_SIZE)

# Aggregated sums and counts for averages.
_timing_sums: Dict[str, float] = {f: 0.0 for f in _TIMING_FIELDS}
_timing_count: int = 0

# Guards all the mutable state above (counters, samples, sums, count).
_lock = Lock()
29
+
30
+
31
async def metrics_middleware(request: Request, call_next):
    """Count every request (and every error) per path, and log the timing.

    A path is counted as an error both when the downstream handler raises
    and when it returns a 4xx/5xx status code.
    """
    route_path = request.url.path or "/"
    started = perf_counter()
    try:
        response = await call_next(request)
    except Exception:  # noqa: BLE001
        duration_ms = (perf_counter() - started) * 1000.0
        with _lock:
            _request_counts[route_path] += 1
            _error_counts[route_path] += 1
        logger.exception(
            "Unhandled error for path=%s elapsed_ms=%.2f", route_path, duration_ms
        )
        raise

    duration_ms = (perf_counter() - started) * 1000.0
    status_code = response.status_code
    with _lock:
        _request_counts[route_path] += 1
        if status_code >= 400:
            _error_counts[route_path] += 1

    logger.debug(
        "Request path=%s status=%s elapsed_ms=%.2f",
        route_path,
        status_code,
        duration_ms,
    )
    return response
56
+
57
+
58
def record_chat_timings(timings: Mapping[str, float]) -> None:
    """Record one chat request's timing sample into the shared buffers.

    Expects a mapping with keys retrieve_ms, web_ms, generate_ms and
    total_ms; any missing key is treated as 0.0.
    """
    global _timing_count
    normalized: Dict[str, float] = {}
    for field in _TIMING_FIELDS:
        normalized[field] = float(timings.get(field, 0.0))
    with _lock:
        _timing_samples.append(normalized)
        for field in _TIMING_FIELDS:
            _timing_sums[field] += normalized[field]
        _timing_count += 1
70
+
71
+
72
+ def _percentile(values: List[float], p: float) -> float:
73
+ if not values:
74
+ return 0.0
75
+ values_sorted = sorted(values)
76
+ k = max(0, min(len(values_sorted) - 1, int(round((p / 100.0) * (len(values_sorted) - 1)))))
77
+ return values_sorted[k]
78
+
79
+
80
def get_metrics_snapshot() -> Dict[str, object]:
    """Return a stable snapshot of metrics suitable for /metrics responses.

    All shared state is copied under the lock first, so the derived values
    below are computed from a single consistent point-in-time view.
    """
    with _lock:
        requests_by_path = dict(_request_counts)
        errors_by_path = dict(_error_counts)
        samples = list(_timing_samples)
        sums = dict(_timing_sums)
        count = int(_timing_count)

    # Lifetime averages over every recorded sample (not just the buffer).
    averages: Dict[str, float] = {}
    if count > 0:
        for field in _TIMING_FIELDS:
            averages[field] = sums.get(field, 0.0) / count
    else:
        for field in _TIMING_FIELDS:
            averages[field] = 0.0

    # Compute percentiles over the last N samples.
    p50: Dict[str, float] = {}
    p95: Dict[str, float] = {}
    if samples:
        for field in _TIMING_FIELDS:
            values = [s.get(field, 0.0) for s in samples]
            p50[field] = _percentile(values, 50.0)
            p95[field] = _percentile(values, 95.0)
    else:
        for field in _TIMING_FIELDS:
            p50[field] = 0.0
            p95[field] = 0.0

    # NOTE(review): fetched outside the lock, so cache stats may be slightly
    # newer than the counters above — acceptable for a metrics endpoint.
    cache_stats = get_cache_stats()

    return {
        "requests_by_path": requests_by_path,
        "errors_by_path": errors_by_path,
        "timings": {
            "average_ms": averages,
            "p50_ms": p50,
            "p95_ms": p95,
        },
        "cache": cache_stats,
        "sample_count": count,
        "samples": samples,
    }
124
+
125
+
126
def setup_metrics(app: FastAPI) -> None:
    """Register the request/error counting middleware on *app*."""
    logger.info("Metrics middleware enabled.")
    register = app.middleware("http")
    register(metrics_middleware)
backend/app/core/rate_limit.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ from fastapi import FastAPI, Request
4
+ from fastapi.responses import JSONResponse
5
+ from slowapi import Limiter
6
+ from slowapi.errors import RateLimitExceeded
7
+ from slowapi.middleware import SlowAPIMiddleware
8
+ from slowapi.util import get_remote_address
9
+
10
+ from app.core.config import get_settings
11
+ from app.core.logging import get_logger
12
+
13
+ logger = get_logger(__name__)
14
+
15
+ # Global limiter instance used for decorators.
16
+ limiter = Limiter(key_func=get_remote_address)
17
+
18
+
19
+ def setup_rate_limiter(app: FastAPI) -> None:
20
+ """Configure SlowAPI rate limiting middleware and handlers.
21
+
22
+ Limits are enabled/disabled via Settings.RATE_LIMIT_ENABLED.
23
+ """
24
+ settings = get_settings()
25
+ if not getattr(settings, "RATE_LIMIT_ENABLED", True):
26
+ logger.info("Rate limiting is disabled via settings.")
27
+ return
28
+
29
+ logger.info("Rate limiting enabled with SlowAPI.")
30
+
31
+ app.state.limiter = limiter # type: ignore[attr-defined]
32
+
33
+ @app.exception_handler(RateLimitExceeded)
34
+ async def rate_limit_exceeded_handler( # type: ignore[no-redef]
35
+ request: Request,
36
+ exc: RateLimitExceeded,
37
+ ) -> JSONResponse:
38
+ retry_after: str | None = None
39
+ try:
40
+ retry_after = exc.headers.get("Retry-After") # type: ignore[assignment]
41
+ except Exception: # noqa: BLE001
42
+ retry_after = None
43
+
44
+ logger.warning(
45
+ "Rate limit exceeded path=%s client=%s limit=%s",
46
+ request.url.path,
47
+ get_remote_address(request),
48
+ exc.detail,
49
+ )
50
+ content: dict[str, Any] = {
51
+ "detail": "Rate limit exceeded. Please slow down your requests.",
52
+ }
53
+ if retry_after is not None:
54
+ content["retry_after"] = retry_after
55
+ return JSONResponse(status_code=429, content=content)
56
+
57
+ # Attach SlowAPI middleware
58
+ app.add_middleware(SlowAPIMiddleware)
backend/app/core/runtime.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from app.core.logging import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
+ def get_port(default: int = 7860) -> int:
9
+ """Return the port the application should bind to.
10
+
11
+ - Uses the PORT environment variable when set (e.g. on Hugging Face Spaces).
12
+ - Falls back to the provided default (7860 for Spaces compatibility).
13
+ - Logs a message indicating the chosen port and whether we appear to be
14
+ running inside a Hugging Face Spaces environment.
15
+ """
16
+ raw = os.getenv("PORT")
17
+ try:
18
+ port = int(raw) if raw else default
19
+ except (TypeError, ValueError):
20
+ port = default
21
+
22
+ # Heuristic to detect HF Spaces: SPACE_ID or SPACE_REPO_ID are usually set.
23
+ hf_spaces_mode = bool(os.getenv("SPACE_ID") or os.getenv("SPACE_REPO_ID"))
24
+
25
+ logger.info(
26
+ "Starting on port=%d hf_spaces_mode=%s",
27
+ port,
28
+ hf_spaces_mode,
29
+ )
30
+
31
+ return port
backend/app/core/security.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Iterable, List, Optional
3
+
4
+ from fastapi import FastAPI, Request
5
+ from fastapi.responses import JSONResponse
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from starlette.middleware.base import BaseHTTPMiddleware
8
+
9
+ from app.core.logging import get_logger
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
14
+ def _get_allowed_origins() -> List[str]:
15
+ raw = os.getenv("ALLOWED_ORIGINS")
16
+ if not raw:
17
+ # Default: permissive for local development and simple frontends.
18
+ origins = ["*"]
19
+ else:
20
+ origins = [item.strip() for item in raw.split(",") if item.strip()]
21
+ if not origins:
22
+ origins = ["*"]
23
+ return origins
24
+
25
+
26
class APIKeyMiddleware(BaseHTTPMiddleware):
    """Optional API key protection for selected endpoints.

    When the API_KEY environment variable is set, this middleware enforces the
    presence of an `X-API-Key` header with a matching value for:

    - /ingest/*
    - /documents/*
    - /chat*
    - /search

    The following paths remain public regardless of API_KEY:

    - /health
    - /docs
    - /openapi.json
    - /redoc
    - /metrics

    When API_KEY is not set, the middleware is not installed and the API is open.
    """

    def __init__(self, app: FastAPI, api_key: str) -> None:  # type: ignore[override]
        super().__init__(app)
        # Shared secret clients must present in the X-API-Key header.
        self.api_key = api_key

        # Prefixes that require the API key.
        self._protected_prefixes: List[str] = [
            "/ingest",
            "/documents",
            "/chat",
            "/search",
        ]
        # Prefixes that stay public even when a key is configured.
        self._public_prefixes: List[str] = [
            "/health",
            "/docs",
            "/openapi.json",
            "/redoc",
            "/metrics",
        ]

    async def dispatch(self, request: Request, call_next):  # type: ignore[override]
        import hmac  # stdlib; imported here to keep module imports unchanged

        path = request.url.path or "/"

        # Public endpoints stay open.
        if any(path.startswith(prefix) for prefix in self._public_prefixes):
            return await call_next(request)

        # Only enforce for protected prefixes.
        if not any(path.startswith(prefix) for prefix in self._protected_prefixes):
            return await call_next(request)

        header_key: Optional[str] = request.headers.get("X-API-Key")
        # hmac.compare_digest performs a constant-time comparison; a plain
        # `!=` leaks key prefixes through response-timing differences.
        if not header_key or not hmac.compare_digest(
            header_key.encode("utf-8"), self.api_key.encode("utf-8")
        ):
            logger.warning("Rejected request with missing/invalid API key path=%s", path)
            return JSONResponse(
                status_code=401,
                content={
                    "detail": (
                        "Missing or invalid API key. Provide X-API-Key header with "
                        "a valid key to access this endpoint."
                    )
                },
            )

        return await call_next(request)
91
+
92
+
93
def configure_security(app: FastAPI) -> None:
    """Configure CORS and optional API key protection on the FastAPI app."""
    # CORS first: allowed origins come from the ALLOWED_ORIGINS env var.
    allowed = _get_allowed_origins()
    app.add_middleware(
        CORSMiddleware,
        allow_origins=allowed,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    logger.info("CORS configured allow_origins=%s", allowed)

    # Optional API key middleware: only installed when API_KEY is set.
    configured_key = os.getenv("API_KEY")
    if configured_key:
        logger.info("API key protection enabled for ingest, documents, search, and chat.")
        app.add_middleware(APIKeyMiddleware, api_key=configured_key)
        return

    logger.warning(
        "API key disabled; protected endpoints are open. "
        "Set API_KEY environment variable to enable X-API-Key protection."
    )
backend/app/core/tracing.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from functools import lru_cache
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from app.core.config import get_env_bool
6
+ from app.core.logging import get_logger
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
def is_tracing_enabled() -> bool:
    """Return True if LangSmith / LangChain tracing is enabled via environment.

    Requires both a truthy LANGCHAIN_TRACING_V2 flag and a non-empty
    LANGCHAIN_API_KEY.
    """
    if not get_env_bool("LANGCHAIN_TRACING_V2", False):
        return False
    return bool(os.getenv("LANGCHAIN_API_KEY"))
16
+
17
+
18
def get_langsmith_project() -> Optional[str]:
    """Return the configured LangSmith project name, or None when unset."""
    return os.environ.get("LANGCHAIN_PROJECT")
21
+
22
+
23
@lru_cache(maxsize=1)
def get_tracing_callbacks() -> List[Any]:
    """Return LangChain callback handlers for tracing, if available.

    When LANGCHAIN_TRACING_V2=true and LANGCHAIN_API_KEY is set, this will
    attempt to create a LangChainTracer instance. If tracing is not enabled
    or the tracer is unavailable, an empty list is returned.

    NOTE(review): lru_cache pins whatever was decided on the first call for
    the life of the process — environment changes made afterwards are
    ignored until restart.
    """
    if not is_tracing_enabled():
        logger.info(
            "LangSmith tracing disabled (set LANGCHAIN_TRACING_V2=true and "
            "LANGCHAIN_API_KEY to enable)."
        )
        return []

    # Imported lazily so the langchain dependency stays optional.
    try:
        from langchain_core.tracers import LangChainTracer  # type: ignore[import]
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "LangSmith tracing requested but LangChainTracer is unavailable: %s", exc
        )
        return []

    project = get_langsmith_project()
    tracer = LangChainTracer(project_name=project)
    logger.info(
        "LangSmith tracing enabled for project='%s'",
        project or "(default)",
    )
    return [tracer]
53
+
54
+
55
def get_tracing_response_metadata() -> Dict[str, Any]:
    """Return trace metadata suitable for API responses."""
    metadata: Dict[str, Any] = {}
    metadata["langsmith_project"] = get_langsmith_project()
    metadata["trace_enabled"] = is_tracing_enabled()
    return metadata
backend/app/main.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.responses import ORJSONResponse
3
+
4
+ from app.core.config import get_settings
5
+ from app.core.errors import PineconeIndexConfigError, setup_exception_handlers
6
+ from app.core.logging import configure_logging, get_logger
7
+ from app.core.metrics import setup_metrics
8
+ from app.core.rate_limit import setup_rate_limiter
9
+ from app.core.runtime import get_port
10
+ from app.core.security import configure_security
11
+ from app.routers.documents import router as documents_router
12
+ from app.routers.health import router as health_router
13
+ from app.routers.ingest import router as ingest_router
14
+ from app.routers.search import router as search_router
15
+ from app.routers.chat import router as chat_router
16
+ from app.routers.metrics import router as metrics_router
17
+ from app.services.pinecone_store import init_pinecone
18
+
19
settings = get_settings()
configure_logging(settings.LOG_LEVEL)
logger = get_logger(__name__)

# Log runtime port / environment context at import time for easier diagnostics.
get_port()

app = FastAPI(
    title="RAG Agent Workbench API",
    version=settings.APP_VERSION,
    default_response_class=ORJSONResponse,
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json",
)

# Core app configuration: security (CORS / API key) first, then rate
# limiting, then the metrics middleware.
configure_security(app)
setup_rate_limiter(app)
setup_metrics(app)

# Register routers with tags and ensure they are included in the schema
app.include_router(health_router, tags=["health"])
app.include_router(ingest_router, tags=["ingest"])
app.include_router(search_router, tags=["search"])
app.include_router(documents_router, tags=["documents"])
app.include_router(chat_router, tags=["chat"])
app.include_router(metrics_router, tags=["metrics"])

# Register exception handlers
setup_exception_handlers(app)
50
+
51
+
52
@app.on_event("startup")
async def startup_event() -> None:
    """Application startup hook: initialise the Pinecone index connection.

    NOTE(review): @app.on_event is deprecated in recent FastAPI releases in
    favour of lifespan handlers; consider migrating when convenient.
    """
    try:
        init_pinecone(settings)
        logger.info("Pinecone initialisation completed")
    except PineconeIndexConfigError:
        # Let the exception handler and FastAPI/uvicorn deal with the error.
        # Re-raise to fail fast on misconfiguration.
        raise
backend/app/routers/__pycache__/chat.cpython-313.pyc ADDED
Binary file (11.4 kB). View file
 
backend/app/routers/__pycache__/documents.cpython-313.pyc ADDED
Binary file (4.17 kB). View file
 
backend/app/routers/__pycache__/health.cpython-313.pyc ADDED
Binary file (792 Bytes). View file
 
backend/app/routers/__pycache__/ingest.cpython-313.pyc ADDED
Binary file (8.27 kB). View file
 
backend/app/routers/__pycache__/metrics.cpython-313.pyc ADDED
Binary file (789 Bytes). View file
 
backend/app/routers/__pycache__/search.cpython-313.pyc ADDED
Binary file (3.32 kB). View file
 
backend/app/routers/chat.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from time import perf_counter
3
+ from typing import AsyncGenerator, Dict, List, Optional
4
+
5
+ from fastapi import APIRouter, Request
6
+ from fastapi.concurrency import run_in_threadpool
7
+ from fastapi.responses import StreamingResponse
8
+
9
+ from app.core.cache import cache_enabled, get_chat_cached, set_chat_cached
10
+ from app.core.config import get_settings
11
+ from app.core.logging import get_logger
12
+ from app.core.metrics import record_chat_timings
13
+ from app.core.rate_limit import limiter
14
+ from app.core.tracing import (
15
+ get_tracing_callbacks,
16
+ get_tracing_response_metadata,
17
+ )
18
+ from app.schemas.chat import (
19
+ ChatRequest,
20
+ ChatResponse,
21
+ ChatTimings,
22
+ ChatTraceMetadata,
23
+ SourceHit,
24
+ )
25
+ from app.services.chat.graph import get_chat_graph
26
+
27
+ logger = get_logger(__name__)
28
+
29
+ router = APIRouter(tags=["chat"])
30
+
31
+
32
def _build_chat_response(state: Dict) -> ChatResponse:
    """Convert graph state into a ChatResponse model.

    Missing or falsy fields default to empty strings / 0.0 so the response
    model is always fully populated. Retrieved chunks are listed before web
    results.
    """
    raw_timings = state.get("timings") or {}

    def _ms(key: str) -> float:
        return float(raw_timings.get(key) or 0.0)

    timings = ChatTimings(
        retrieve_ms=_ms("retrieve_ms"),
        web_ms=_ms("web_ms"),
        generate_ms=_ms("generate_ms"),
        total_ms=_ms("total_ms"),
    )

    combined: List[Dict] = []
    combined.extend(state.get("retrieved") or [])
    combined.extend(state.get("web_results") or [])

    sources: List[SourceHit] = []
    for hit in combined:
        sources.append(
            SourceHit(
                source=str(hit.get("source") or "unknown"),
                title=str(hit.get("title") or ""),
                url=str(hit.get("url") or ""),
                score=float(hit.get("score") or 0.0),
                chunk_text=str(hit.get("chunk_text") or ""),
            )
        )

    return ChatResponse(
        answer=str(state.get("answer") or ""),
        sources=sources,
        timings=timings,
        trace=ChatTraceMetadata(**get_tracing_response_metadata()),
    )
64
+
65
+
66
@router.post(
    "/chat",
    response_model=ChatResponse,
    summary="Production-style RAG chat endpoint",
    description=(
        "Runs an agentic RAG flow using Pinecone retrieval, optional Tavily web "
        "fallback, and a Groq-backed LLM to generate an answer. "
        "Returns the answer, source snippets, timings, and LangSmith trace metadata."
    ),
)
@limiter.limit("30/minute")
async def chat(request: Request, payload: ChatRequest) -> ChatResponse:  # noqa: ARG001
    """Answer a chat query via the RAG graph, with caching and metrics.

    `request` is unused directly but required by slowapi's limiter decorator.
    Responses are cached (and served from cache) only when the request has no
    chat history, since history changes the generated answer.
    """
    settings = get_settings()
    namespace = payload.namespace or settings.PINECONE_NAMESPACE

    logger.info(
        "Received /chat request namespace='%s' top_k=%d use_web_fallback=%s",
        namespace,
        payload.top_k,
        payload.use_web_fallback,
    )

    # Cache key covers namespace/query/top_k/min_score/web-fallback flag.
    use_cache = cache_enabled() and not payload.chat_history
    cached_response: Optional[ChatResponse] = None
    if use_cache:
        cached = get_chat_cached(
            namespace=namespace,
            query=payload.query,
            top_k=payload.top_k,
            min_score=payload.min_score,
            use_web_fallback=payload.use_web_fallback,
        )
        if cached is not None:
            logger.info(
                "Serving /chat response from cache namespace='%s' query='%s'",
                namespace,
                payload.query,
            )
            cached_response = cached

    if cached_response is not None:
        # Still record timings and metrics based on the cached response.
        record_chat_timings(
            {
                "retrieve_ms": cached_response.timings.retrieve_ms,
                "web_ms": cached_response.timings.web_ms,
                "generate_ms": cached_response.timings.generate_ms,
                "total_ms": cached_response.timings.total_ms,
            }
        )
        return cached_response

    graph = get_chat_graph()
    callbacks = get_tracing_callbacks()
    config: Dict = {}
    if callbacks:
        config["callbacks"] = callbacks

    # Initial graph state mirrors the request payload plus resolved namespace.
    initial_state = {
        "query": payload.query,
        "namespace": namespace,
        "top_k": payload.top_k,
        "use_web_fallback": payload.use_web_fallback,
        "min_score": payload.min_score,
        "max_web_results": payload.max_web_results,
        "chat_history": [
            {"role": msg.role, "content": msg.content}
            for msg in (payload.chat_history or [])
        ],
    }

    start_total = perf_counter()

    def _invoke_graph() -> Dict:
        # Synchronous graph invocation, run off the event loop below.
        return graph.invoke(initial_state, config=config)

    # Exceptions (including UpstreamServiceError) are handled by global handlers.
    state = await run_in_threadpool(_invoke_graph)

    # Overwrite total_ms with the wall-clock time measured here, which also
    # includes threadpool scheduling overhead.
    total_ms = (perf_counter() - start_total) * 1000.0
    timings = state.get("timings") or {}
    timings["total_ms"] = total_ms
    state["timings"] = timings

    web_used = bool(state.get("web_fallback_used"))
    top_score = float(state.get("top_score") or 0.0)
    logger.info(
        "Chat request completed namespace='%s' web_fallback_used=%s "
        "retrieve_ms=%.2f web_ms=%.2f generate_ms=%.2f total_ms=%.2f top_score=%.4f",
        namespace,
        web_used,
        float(timings.get("retrieve_ms") or 0.0),
        float(timings.get("web_ms") or 0.0),
        float(timings.get("generate_ms") or 0.0),
        float(timings.get("total_ms") or 0.0),
        top_score,
    )

    response_model = _build_chat_response(state)

    # Record metrics based on this response.
    record_chat_timings(
        {
            "retrieve_ms": response_model.timings.retrieve_ms,
            "web_ms": response_model.timings.web_ms,
            "generate_ms": response_model.timings.generate_ms,
            "total_ms": response_model.timings.total_ms,
        }
    )

    # Cache only when chat_history is empty.
    if use_cache:
        set_chat_cached(
            namespace=namespace,
            query=payload.query,
            top_k=payload.top_k,
            min_score=payload.min_score,
            use_web_fallback=payload.use_web_fallback,
            value=response_model,
        )

    return response_model
188
+
189
+
190
@router.post(
    "/chat/stream",
    summary="Streaming RAG chat endpoint (SSE)",
    description=(
        "Same behaviour as /chat but streams the answer over Server-Sent Events "
        "(SSE). The final event includes the full JSON payload with answer, sources, "
        "timings, and trace metadata."
    ),
)
@limiter.limit("30/minute")
async def chat_stream(request: Request, payload: ChatRequest) -> StreamingResponse:  # noqa: ARG001
    """Pseudo-streaming variant of /chat.

    NOTE(review): the full answer is computed before any bytes are sent; the
    SSE stream then replays it word-by-word. Whitespace between words is
    collapsed by the split, so clients should rely on the final `end` event
    for the exact answer text. Unlike /chat, this endpoint does not consult
    or populate the response cache. `request` is unused directly but
    required by slowapi's limiter decorator.
    """
    settings = get_settings()
    namespace = payload.namespace or settings.PINECONE_NAMESPACE

    logger.info(
        "Received /chat/stream request namespace='%s' top_k=%d use_web_fallback=%s",
        namespace,
        payload.top_k,
        payload.use_web_fallback,
    )

    graph = get_chat_graph()
    callbacks = get_tracing_callbacks()
    config: Dict = {}
    if callbacks:
        config["callbacks"] = callbacks

    # Initial graph state mirrors the request payload plus resolved namespace.
    initial_state = {
        "query": payload.query,
        "namespace": namespace,
        "top_k": payload.top_k,
        "use_web_fallback": payload.use_web_fallback,
        "min_score": payload.min_score,
        "max_web_results": payload.max_web_results,
        "chat_history": [
            {"role": msg.role, "content": msg.content}
            for msg in (payload.chat_history or [])
        ],
    }

    start_total = perf_counter()

    def _invoke_graph() -> Dict:
        # Synchronous graph invocation, run off the event loop below.
        return graph.invoke(initial_state, config=config)

    # Exceptions (including UpstreamServiceError) are handled by global handlers.
    state = await run_in_threadpool(_invoke_graph)

    # Overwrite total_ms with the wall-clock time measured here.
    total_ms = (perf_counter() - start_total) * 1000.0
    timings = state.get("timings") or {}
    timings["total_ms"] = total_ms
    state["timings"] = timings

    web_used = bool(state.get("web_fallback_used"))
    top_score = float(state.get("top_score") or 0.0)
    logger.info(
        "Streaming chat completed namespace='%s' web_fallback_used=%s "
        "retrieve_ms=%.2f web_ms=%.2f generate_ms=%.2f total_ms=%.2f top_score=%.4f",
        namespace,
        web_used,
        float(timings.get("retrieve_ms") or 0.0),
        float(timings.get("web_ms") or 0.0),
        float(timings.get("generate_ms") or 0.0),
        float(timings.get("total_ms") or 0.0),
        top_score,
    )

    response_model = _build_chat_response(state)
    answer_text = response_model.answer

    # Record metrics based on this response as well.
    record_chat_timings(
        {
            "retrieve_ms": response_model.timings.retrieve_ms,
            "web_ms": response_model.timings.web_ms,
            "generate_ms": response_model.timings.generate_ms,
            "total_ms": response_model.timings.total_ms,
        }
    )

    async def event_generator() -> AsyncGenerator[str, None]:
        # Stream the answer token-by-token (space-delimited) as simple SSE events.
        for token in answer_text.split():
            yield f"data: {token}\n\n"

        # Send a final event containing the full JSON payload for clients that
        # want metadata and sources.
        final_payload = response_model.model_dump()
        yield f"event: end\ndata: {json.dumps(final_payload)}\n\n"

    return StreamingResponse(event_generator(), media_type="text/event-stream")
backend/app/routers/documents.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List
2
+
3
+ from fastapi import APIRouter, Query
4
+ from fastapi.concurrency import run_in_threadpool
5
+ from langchain_core.documents import Document
6
+
7
+ from app.core.config import get_settings
8
+ from app.core.logging import get_logger
9
+ from app.schemas.documents import (
10
+ DocumentsStatsResponse,
11
+ NamespaceStat,
12
+ UploadTextRequest,
13
+ UploadTextResponse,
14
+ )
15
+ from app.services import chunking as chunking_service
16
+ from app.services import dedupe as dedupe_service
17
+ from app.services.normalize import make_doc_id, normalize_text, is_valid_document
18
+ from app.services.pinecone_store import describe_index_stats, upsert_records
19
+
20
+ logger = get_logger(__name__)
21
+
22
+ router = APIRouter(prefix="/documents", tags=["documents"])
23
+
24
+
25
@router.post(
    "/upload-text",
    response_model=UploadTextResponse,
    summary="Upload raw text or Docling output",
    description=(
        "Accepts manual text uploads or Docling-converted content, normalizes and "
        "chunks the text, and upserts it into Pinecone."
    ),
)
async def upload_text(payload: UploadTextRequest) -> UploadTextResponse:
    """Normalise, chunk, dedupe and upsert a single uploaded document.

    Documents that are too short after normalisation are skipped and
    reported with zero ingested counts rather than raising an error.
    """
    settings = get_settings()
    namespace = payload.namespace or settings.PINECONE_NAMESPACE

    normalized = normalize_text(payload.text)
    if not is_valid_document(normalized):
        logger.info(
            "Skipping manual upload for title='%s' due to insufficient length (len=%d)",
            payload.title,
            len(normalized),
        )
        return UploadTextResponse(
            namespace=namespace,
            source=payload.source,
            ingested_documents=0,
            ingested_chunks=0,
        )

    # Copy caller-supplied metadata so the payload object is not mutated.
    metadata: Dict[str, Any] = payload.metadata.copy() if payload.metadata else {}
    url = metadata.get("url", "")
    published = metadata.get("published", "")

    # Stable document id derived from source/title/url for dedupe purposes.
    doc_id = make_doc_id(source=payload.source, title=payload.title, url=url)
    metadata.update(
        {
            "title": payload.title,
            "source": payload.source,
            "url": url,
            "published": published,
            "doc_id": doc_id,
        }
    )

    document = Document(page_content=normalized, metadata=metadata)
    records = chunking_service.documents_to_records([document])
    records = dedupe_service.dedupe_records(records)

    # Pinecone upsert is synchronous; run it off the event loop.
    total_upserted = await run_in_threadpool(upsert_records, namespace, records)

    return UploadTextResponse(
        namespace=namespace,
        source=payload.source,
        ingested_documents=1,
        ingested_chunks=total_upserted,
    )
79
+
80
+
81
@router.get(
    "/stats",
    response_model=DocumentsStatsResponse,
    summary="Get document statistics",
    description="Returns vector counts per namespace from the configured Pinecone index.",
)
async def documents_stats(
    namespace: str | None = Query(
        default=None,
        description="Optional namespace filter; if omitted, stats for all namespaces are returned",
    ),
) -> DocumentsStatsResponse:
    """Fetch per-namespace vector counts from Pinecone off the event loop."""
    raw_stats = await run_in_threadpool(describe_index_stats, namespace)

    namespaces: Dict[str, NamespaceStat] = {}
    for name, info in raw_stats.items():
        namespaces[name] = NamespaceStat(vector_count=info.get("vector_count", 0))

    return DocumentsStatsResponse(namespaces=namespaces)
backend/app/routers/health.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ from app.core.config import get_settings
4
+
5
+ router = APIRouter(tags=["health"])
6
+
7
+
8
@router.get(
    "/health",
    summary="Health check",
    description="Returns service status, name, and version.",
)
async def health() -> dict:
    """Report liveness plus the configured service name and version."""
    cfg = get_settings()
    payload = {
        "status": "ok",
        "service": cfg.APP_NAME,
        "version": cfg.APP_VERSION,
    }
    return payload
backend/app/routers/ingest.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List
2
+ from collections import Counter
3
+
4
+ import httpx
5
+ from fastapi import APIRouter, HTTPException, Request
6
+ from fastapi.concurrency import run_in_threadpool
7
+ from langchain_core.documents import Document
8
+
9
+ from app.core.config import get_settings
10
+ from app.core.logging import get_logger
11
+ from app.core.rate_limit import limiter
12
+ from app.schemas.ingest import (
13
+ ArxivIngestRequest,
14
+ IngestResponse,
15
+ OpenAlexIngestRequest,
16
+ WikiIngestRequest,
17
+ )
18
+ from app.services import dedupe as dedupe_service
19
+ from app.services import chunking as chunking_service
20
+ from app.services.ingestors.arxiv import fetch_arxiv_documents
21
+ from app.services.ingestors.openalex import fetch_openalex_documents
22
+ from app.services.ingestors.wiki import fetch_wiki_documents
23
+ from app.services.pinecone_store import upsert_records
24
+
25
+ logger = get_logger(__name__)
26
+
27
+ router = APIRouter(prefix="/ingest", tags=["ingest"])
28
+
29
+
30
async def _process_and_upsert(
    documents: List[Document],
    namespace: str,
    source: str,
    details: dict | None = None,
) -> IngestResponse:
    """Shared helper to chunk, dedupe and upsert documents."""
    # No documents after upstream filtering: report an empty (but explicit) result.
    if not documents:
        return IngestResponse(
            namespace=namespace,
            source=source,
            ingested_documents=0,
            ingested_chunks=0,
            skipped_documents=0,
            details=details or {"reason": "no_documents_after_filtering"},
        )

    chunk_records = chunking_service.documents_to_records(documents)
    chunk_records = dedupe_service.dedupe_records(chunk_records)

    # Pinecone upsert is blocking; run it off the event loop.
    upserted_count = await run_in_threadpool(upsert_records, namespace, chunk_records)

    return IngestResponse(
        namespace=namespace,
        source=source,
        ingested_documents=len(documents),
        ingested_chunks=upserted_count,
        skipped_documents=0,
        details=details,
    )
60
+
61
+
62
@router.post(
    "/arxiv",
    response_model=IngestResponse,
    summary="Ingest documents from arXiv",
    description="Fetches recent arXiv entries for a query and upserts them into Pinecone.",
)
@limiter.limit("10/minute")
async def ingest_arxiv(request: Request, payload: ArxivIngestRequest) -> IngestResponse:  # noqa: ARG001
    """Fetch arXiv entries for ``payload.query`` and upsert them into Pinecone.

    Raises:
        HTTPException: 502 when the upstream arXiv request fails.
    """
    settings = get_settings()
    namespace = payload.namespace or settings.PINECONE_NAMESPACE
    # Defensive cap in addition to the schema-level `le=20` constraint.
    max_docs = min(payload.max_docs, 20)

    logger.info(
        "Starting arXiv ingestion query='%s' max_docs=%d namespace='%s'",
        payload.query,
        max_docs,
        namespace,
    )

    try:
        documents = await fetch_arxiv_documents(
            query=payload.query,
            max_results=max_docs,
            category=payload.category,
        )
    except (httpx.HTTPStatusError, httpx.RequestError) as exc:
        status = None
        reason = ""
        url = None
        if isinstance(exc, httpx.HTTPStatusError) and exc.response is not None:
            status = exc.response.status_code
            reason = exc.response.reason_phrase
            url = str(exc.response.url)
        if url is None:
            # BUGFIX: httpx exposes ``exc.request`` as a property that raises
            # RuntimeError (not AttributeError) when no request is attached,
            # so ``hasattr``/``getattr`` would let that escape the handler and
            # mask the intended 502. Guard the whole lookup instead.
            try:
                url = str(exc.request.url)
            except Exception:  # noqa: BLE001
                url = None

        logger.error(
            "Upstream arXiv error (url=%s): %s",
            url or "unknown",
            exc,
        )
        status_display = status if status is not None else "unknown"
        detail = f"Upstream arXiv error: {status_display} {reason}".strip()
        raise HTTPException(
            status_code=502,
            detail=detail,
        ) from exc

    return await _process_and_upsert(documents, namespace=namespace, source="arxiv")
115
+
116
+
117
@router.post(
    "/openalex",
    response_model=IngestResponse,
    summary="Ingest documents from OpenAlex",
    description="Fetches works from OpenAlex for a query and upserts them into Pinecone.",
)
@limiter.limit("10/minute")
async def ingest_openalex(request: Request, payload: OpenAlexIngestRequest) -> IngestResponse:  # noqa: ARG001
    """Fetch OpenAlex works for the query and upsert them into Pinecone."""
    cfg = get_settings()
    target_namespace = payload.namespace or cfg.PINECONE_NAMESPACE
    # Defensive cap in addition to the schema-level limit.
    doc_limit = min(payload.max_docs, 20)

    logger.info(
        "Starting OpenAlex ingestion query='%s' max_docs=%d namespace='%s'",
        payload.query,
        doc_limit,
        target_namespace,
    )

    try:
        documents = await fetch_openalex_documents(
            query=payload.query,
            max_results=doc_limit,
            mailto=payload.mailto,
        )
    except (httpx.HTTPStatusError, httpx.RequestError) as exc:
        # Surface upstream failures as a 502 rather than a raw 500.
        logger.error("Upstream OpenAlex error: %s", exc)
        raise HTTPException(
            status_code=502,
            detail="Upstream OpenAlex error: unable to retrieve content. "
            "Try again later.",
        ) from exc

    return await _process_and_upsert(
        documents, namespace=target_namespace, source="openalex"
    )
151
+
152
+
153
@router.post(
    "/wiki",
    response_model=IngestResponse,
    summary="Ingest documents from Wikipedia",
    description=(
        "Fetches articles from Wikipedia using the REST API with Action API fallback "
        "and upserts them into Pinecone."
    ),
)
@limiter.limit("10/minute")
async def ingest_wiki(request: Request, payload: WikiIngestRequest) -> IngestResponse:  # noqa: ARG001
    """Fetch up to 20 Wikipedia pages by title and upsert them into Pinecone."""
    cfg = get_settings()
    target_namespace = payload.namespace or cfg.PINECONE_NAMESPACE

    # Only the first 20 titles are honoured, matching the schema's documentation.
    page_titles = payload.titles[:20]
    logger.info(
        "Starting Wikipedia ingestion titles=%d namespace='%s'",
        len(page_titles),
        target_namespace,
    )

    try:
        documents = await fetch_wiki_documents(titles=page_titles)
    except (httpx.HTTPStatusError, httpx.RequestError) as exc:
        logger.error("Upstream Wikimedia error: %s", exc)
        raise HTTPException(
            status_code=502,
            detail=(
                "Upstream Wikimedia error: unable to retrieve content. "
                "Try again later or use Action API fallback."
            ),
        ) from exc

    # Summarise which backend was used (REST vs Action API) for debugging.
    backend_tally: Dict[str, int] = Counter(
        doc.metadata.get("wikimedia_backend", "unknown") for doc in documents
    )
    extra_details: Dict[str, Any] = {"wikimedia_backend_counts": dict(backend_tally)}

    return await _process_and_upsert(
        documents, namespace=target_namespace, source="wiki", details=extra_details
    )
backend/app/routers/metrics.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ from app.core.metrics import get_metrics_snapshot
4
+
5
+ router = APIRouter(tags=["metrics"])
6
+
7
+
8
@router.get(
    "/metrics",
    summary="In-memory metrics snapshot",
    description=(
        "Returns request and error counts by path, timing statistics for chat "
        "requests (average and p50/p95), cache hit/miss counters, and the last "
        "20 timing samples."
    ),
)
async def metrics() -> dict:
    """Return the current in-process metrics snapshot as a plain dict."""
    snapshot = get_metrics_snapshot()
    return snapshot
backend/app/routers/search.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List
2
+
3
+ from fastapi import APIRouter, Request
4
+ from fastapi.concurrency import run_in_threadpool
5
+
6
+ from app.core.cache import get_search_cached, set_search_cached
7
+ from app.core.config import get_settings
8
+ from app.core.logging import get_logger
9
+ from app.core.rate_limit import limiter
10
+ from app.schemas.search import SearchHit, SearchRequest, SearchResponse
11
+ from app.services.pinecone_store import search as pinecone_search
12
+
13
+ logger = get_logger(__name__)
14
+
15
+ router = APIRouter(tags=["search"])
16
+
17
+
18
@router.post(
    "/search",
    response_model=SearchResponse,
    summary="Semantic search over ingested documents",
    description=(
        "Performs integrated embedding search over documents stored in Pinecone and "
        "returns the top matching chunks."
    ),
)
@limiter.limit("60/minute")
async def search(request: Request, payload: SearchRequest) -> SearchResponse:  # noqa: ARG001
    """Run a cached semantic search against Pinecone and normalise the hits."""
    settings = get_settings()
    namespace = payload.namespace or settings.PINECONE_NAMESPACE
    text_field = settings.PINECONE_TEXT_FIELD

    logger.info(
        "Received search request namespace='%s' top_k=%d",
        namespace,
        payload.top_k,
    )

    # Try the in-memory cache first; on a miss, query Pinecone (blocking call,
    # so it runs in a threadpool) and populate the cache for next time.
    hits_raw: List[Dict[str, Any]] | None = get_search_cached(
        namespace=namespace,
        query=payload.query,
        top_k=payload.top_k,
        filters=payload.filters,
    )
    if hits_raw is None:
        hits_raw = await run_in_threadpool(
            pinecone_search,
            namespace,
            payload.query,
            payload.top_k,
            payload.filters,
            None,
        )
        set_search_cached(
            namespace=namespace,
            query=payload.query,
            top_k=payload.top_k,
            filters=payload.filters,
            value=hits_raw,
        )

    hits: List[SearchHit] = []
    for raw_hit in hits_raw:
        hit_id = raw_hit.get("_id") or raw_hit.get("id") or ""
        score = float(raw_hit.get("_score") or raw_hit.get("score") or 0.0)
        raw_fields: Dict[str, Any] = raw_hit.get("fields") or {}

        # Map the configured Pinecone text field back to a stable 'chunk_text' key
        text_value = raw_fields.get(text_field, "")
        normalized_fields: Dict[str, Any] = dict(raw_fields)
        if text_field != "chunk_text":
            normalized_fields.pop(text_field, None)
        normalized_fields["chunk_text"] = text_value

        hits.append(SearchHit(id=hit_id, score=score, fields=normalized_fields))

    return SearchResponse(
        namespace=namespace,
        query=payload.query,
        top_k=payload.top_k,
        hits=hits,
    )
backend/app/schemas/__pycache__/chat.cpython-313.pyc ADDED
Binary file (4.89 kB). View file
 
backend/app/schemas/__pycache__/documents.cpython-313.pyc ADDED
Binary file (1.97 kB). View file
 
backend/app/schemas/__pycache__/ingest.cpython-313.pyc ADDED
Binary file (2.57 kB). View file
 
backend/app/schemas/__pycache__/search.cpython-313.pyc ADDED
Binary file (1.7 kB). View file
 
backend/app/schemas/chat.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Literal, Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class ChatMessage(BaseModel):
    """A single turn of prior conversation history."""

    role: Literal["user", "assistant"] = Field(
        ...,
        description="Role of the message author (user or assistant).",
    )
    content: str = Field(..., description="Message text content.")


class ChatRequest(BaseModel):
    """Request body for the chat endpoint: the query plus retrieval and fallback options."""

    query: str = Field(..., description="User query to be answered.")
    namespace: Optional[str] = Field(
        default=None,
        description=(
            "Target Pinecone namespace. Defaults to the configured "
            "PINECONE_NAMESPACE when omitted."
        ),
    )
    top_k: int = Field(
        default=5,
        ge=1,
        le=100,
        description="Maximum number of retrieved document chunks.",
    )
    use_web_fallback: bool = Field(
        default=True,
        description=(
            "Whether to fall back to web search when retrieval is weak. "
            "Requires a configured Tavily API key."
        ),
    )
    min_score: float = Field(
        default=0.25,
        ge=0.0,
        le=1.0,
        description=(
            "If the top retrieval score is below this threshold and "
            "use_web_fallback is true, a web search will be attempted."
        ),
    )
    max_web_results: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Maximum number of web search results to fetch when enabled.",
    )
    chat_history: Optional[List[ChatMessage]] = Field(
        default=None,
        description=(
            "Optional prior conversation history. "
            "Messages with role='user' or 'assistant' are supported."
        ),
    )


class SourceHit(BaseModel):
    """One retrieved snippet (vector-store chunk or web result) used as context."""

    source: str = Field(
        ...,
        description="Origin of the snippet (e.g. wiki, openalex, arxiv, web).",
    )
    title: str = Field(
        ...,
        description="Title of the underlying document or web page.",
    )
    url: str = Field(
        "",
        description="URL associated with the source, when available.",
    )
    score: float = Field(
        0.0,
        description=(
            "Relevance score from the vector store or a synthetic score for web search."
        ),
    )
    chunk_text: str = Field(
        ...,
        description="Text content of the retrieved chunk or web snippet.",
    )


class ChatTimings(BaseModel):
    """Wall-clock timings (milliseconds) for the main phases of a chat request."""

    retrieve_ms: float = Field(
        0.0,
        description="Time spent retrieving from Pinecone, in milliseconds.",
    )
    web_ms: float = Field(
        0.0,
        description="Time spent calling web search tools, in milliseconds.",
    )
    generate_ms: float = Field(
        0.0,
        description="Time spent generating the answer with the LLM, in milliseconds.",
    )
    total_ms: float = Field(
        0.0,
        description="End-to-end time from request receipt to response, in milliseconds.",
    )


class ChatTraceMetadata(BaseModel):
    """Observability metadata describing whether/where this call was traced."""

    langsmith_project: Optional[str] = Field(
        default=None,
        description="LangSmith project name associated with this trace, if any.",
    )
    trace_enabled: bool = Field(
        default=False,
        description="Whether LangSmith / LangChain tracing was enabled for this call.",
    )


class ChatResponse(BaseModel):
    """Response body for the chat endpoint: answer, supporting sources, and telemetry."""

    answer: str = Field(..., description="Generated answer text.")
    sources: List[SourceHit] = Field(
        default_factory=list,
        description="List of document or web snippets used as context.",
    )
    timings: ChatTimings = Field(
        default_factory=ChatTimings,
        description="Timing information for key phases of the pipeline.",
    )
    trace: ChatTraceMetadata = Field(
        ...,
        description="Tracing configuration metadata for observability.",
    )
backend/app/schemas/documents.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class UploadTextRequest(BaseModel):
    """Payload for uploading a single raw-text document for ingestion."""

    title: str = Field(..., description="Document title")
    source: str = Field(
        default="manual",
        description="Source label for the document (e.g. manual, docling)",
    )
    text: str = Field(..., description="Full text content of the document")
    namespace: Optional[str] = Field(
        default=None, description="Target Pinecone namespace (defaults to env)"
    )
    metadata: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Additional metadata fields to store alongside the document",
    )


class UploadTextResponse(BaseModel):
    """Summary returned after a text upload has been chunked and upserted."""

    # Namespace the chunks were written to.
    namespace: str
    # Source label echoed back from the request.
    source: str
    # Number of input documents processed.
    ingested_documents: int
    # Number of chunks upserted into the vector store.
    ingested_chunks: int


class NamespaceStat(BaseModel):
    """Vector count for a single Pinecone namespace."""

    vector_count: int


class DocumentsStatsResponse(BaseModel):
    """Per-namespace index statistics, keyed by namespace name."""

    namespaces: Dict[str, NamespaceStat]
backend/app/schemas/ingest.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class ArxivIngestRequest(BaseModel):
    """Request body for the /ingest/arxiv endpoint."""

    query: str = Field(..., description="Search query for arXiv")
    max_docs: int = Field(
        default=10,
        ge=1,
        le=20,
        description="Maximum number of documents to fetch (capped at 20)",
    )
    namespace: Optional[str] = Field(
        default=None, description="Target Pinecone namespace (defaults to env)"
    )
    category: Optional[str] = Field(
        default=None,
        description="Optional category label for ingested papers",
    )


class OpenAlexIngestRequest(BaseModel):
    """Request body for the /ingest/openalex endpoint."""

    query: str = Field(..., description="Search query for OpenAlex works")
    max_docs: int = Field(
        default=10,
        ge=1,
        le=20,
        description="Maximum number of documents to fetch (capped at 20)",
    )
    namespace: Optional[str] = Field(
        default=None, description="Target Pinecone namespace (defaults to env)"
    )
    mailto: str = Field(
        ...,
        description="Contact email passed to OpenAlex via the mailto query parameter",
    )


class WikiIngestRequest(BaseModel):
    """Request body for the /ingest/wiki endpoint."""

    titles: List[str] = Field(
        ...,
        description="List of Wikipedia page titles (first 20 will be used)",
    )
    namespace: Optional[str] = Field(
        default=None, description="Target Pinecone namespace (defaults to env)"
    )


class IngestResponse(BaseModel):
    """Common summary returned by every ingestion endpoint."""

    # Namespace the chunks were written to.
    namespace: str
    # Which ingestor produced the documents (arxiv, openalex, wiki, ...).
    source: str
    # Number of source documents processed.
    ingested_documents: int
    # Number of chunks upserted into the vector store.
    ingested_chunks: int
    # Number of documents skipped during processing.
    skipped_documents: int
    # Optional extra diagnostics (e.g. per-backend counts, skip reasons).
    details: Optional[Dict[str, Any]] = None
backend/app/schemas/search.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class SearchRequest(BaseModel):
    """Request body for the /search endpoint."""

    query: str = Field(..., description="User query text")
    top_k: int = Field(
        default=5,
        ge=1,
        le=100,
        description="Number of results to return",
    )
    namespace: Optional[str] = Field(
        default=None, description="Target Pinecone namespace (defaults to env)"
    )
    filters: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional metadata filters passed directly to Pinecone search",
    )


class SearchHit(BaseModel):
    """One matching chunk returned from the vector store."""

    # Vector/record identifier from Pinecone.
    id: str
    # Relevance score of the match.
    score: float
    # Stored fields for the chunk (text is normalised under 'chunk_text').
    fields: Dict[str, Any]


class SearchResponse(BaseModel):
    """Response body for the /search endpoint: the echoed query plus hits."""

    namespace: str
    query: str
    top_k: int
    hits: List[SearchHit]
backend/app/services/__pycache__/chunking.cpython-313.pyc ADDED
Binary file (3.1 kB). View file
 
backend/app/services/__pycache__/dedupe.cpython-313.pyc ADDED
Binary file (1.16 kB). View file
 
backend/app/services/__pycache__/normalize.cpython-313.pyc ADDED
Binary file (1.48 kB). View file
 
backend/app/services/__pycache__/pinecone_store.cpython-313.pyc ADDED
Binary file (7.45 kB). View file
 
backend/app/services/chat/__pycache__/graph.cpython-313.pyc ADDED
Binary file (12.7 kB). View file