aankitdas commited on
Commit
939a9f4
·
0 Parent(s):

initial clean commit

Browse files
.dockerignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ *.pyd
7
+
8
+ # Virtual environments
9
+ .venv/
10
+ env/
11
+ venv/
12
+
13
+ # Local data, logs, temp
14
+ *.log
15
+ *.sqlite3
16
+ *.db
17
+ tmp/
18
+ temp/
19
+ .cache/
20
+ .huggingface/
21
+ models/
22
+ papers/
23
+
24
+ # Jupyter notebooks and outputs (if not needed in prod)
25
+ .ipynb_checkpoints/
26
+ *.ipynb
27
+ notebooks/
28
+
29
+ # Git and version control
30
+ .git/
31
+ .gitignore
32
+
33
+ # OS generated files
34
+ .DS_Store
35
+ Thumbs.db
.gitignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv/
11
+ .env
12
+ notebooks/.chromadb_test/
13
+ .chromadb/
14
+ __pycache__/
15
+ *.pyc
16
+ .DS_Store
17
+ .pytest_cache/
18
+ *.egg-info/
19
+ dist/
20
+ build/
21
+ .pytest_cache
22
+ .coverage
23
+ htmlcov/
24
+ node_modules/
25
+ .streamlit/
26
+ papers/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
Dockerfile ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install minimal system deps
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ curl \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
+ # Copy requirements
11
+ COPY requirements-railway.txt ./
12
+
13
+ # Install CPU-only torch first to avoid downloading CUDA deps
14
+ RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
15
+
16
+ # Install dependencies with aggressive caching cleanup
17
+ RUN pip install --no-cache-dir -r requirements-railway.txt && \
18
+ pip cache purge && \
19
+ find /usr/local -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true && \
20
+ find /usr/local -type f -name '*.pyc' -delete && \
21
+ rm -rf /tmp/* /var/tmp/* /root/.cache
22
+
23
+ # Copy application code
24
+ COPY src ./src
25
+ COPY frontend ./frontend
26
+
27
+ # Set environment variables
28
+ ENV PYTHONUNBUFFERED=1
29
+ ENV PYTHONDONTWRITEBYTECODE=1
30
+ ENV HF_HOME=/tmp/huggingface
31
+ ENV TRANSFORMERS_CACHE=/tmp/huggingface
32
+ ENV TORCH_HOME=/tmp/torch
33
+ ENV EMBEDDING_BACKEND=sentence-transformers
34
+
35
+ EXPOSE 7860
36
+
37
+ CMD ["python", "-m", "uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Document Intelligence RAG System
2
+
3
+ Production-grade Retrieval-Augmented Generation (RAG) system for analyzing research papers and documents with AI.
4
+ Ask questions about your PDFs. Get answers grounded in your documents with source attribution.
5
+
6
+ ## Features
7
+
8
+ - PDF Ingestion: Extract text from PDFs using PDFProcessor
9
+ - Document Chunking: Split documents into smaller chunks for better context
10
+ - Embedding: Convert text chunks into vector embeddings using Ollama
11
+ - Vector Storage: Store embeddings in ChromaDB for efficient retrieval
12
+ - LLM Integration: Use Groq LLM for generating answers
13
+ - Source Attribution: Track document origins for citation
14
+ - FastAPI Integration: Build a REST API for easy access
15
+ - Docker Support: Containerize the system for easy deployment
16
+ - PDF Processing: Extract text from PDFs using PDFProcessor
17
+ - Document Chunking: Split documents into smaller chunks for better context
18
+ - Embedding: Convert text chunks into vector embeddings using Ollama
19
+ - Vector Storage: Store embeddings in ChromaDB for efficient retrieval
20
+ - LLM Integration: Use Groq LLM for generating answers
21
+ - Source Attribution: Track document origins for citation
22
+ - FastAPI Integration: Build a REST API for easy access
23
+ - Docker Support: Containerize the system for easy deployment
24
+
25
+ ## Quickstart
26
+
27
+ ### Prerequisites
28
+
29
+ - Python 3.12
30
+ - Ollama
31
+ - Groq API Key
32
+ - ChromaDB
33
+ - FastAPI
34
+ - Uvicorn
35
+ - PDFProcessor
36
+ - Embeddings
37
+ - LLM
38
+ - Vector Store
39
+
40
+ 1. Setup environment variables
41
+ ```bash
42
+ # Clone repository
43
+ git clone https://github.com/aankitdas/document-intelligence-rag.git
44
+ cd document-intelligence-rag
45
+
46
+ # Install Ollama (one-time setup)
47
+ # Download from https://ollama.ai
48
+ ollama pull nomic-embed-text
49
+
50
+ # Start Ollama server (in background)
51
+ ollama serve
52
+
53
+ # Create Python environment
54
+ uv venv
55
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
56
+
57
+ # Install dependencies
58
+ uv sync
59
+
60
+ # Set API keys
61
+ export GROQ_API_KEY="gsk_..." # Get from https://console.groq.com
62
+ ```
63
+
64
+ 2. Prepare Documents
65
+ ```bash
66
+ # Create a folder for documents
67
+ # Create papers folder
68
+ mkdir papers
69
+
70
+ # Add your PDFs to papers/
71
+ # Example: papers/research_paper.pdf
72
+ ```
73
+ 3. Run API
74
+ ```bash
75
+ # Run API
76
+ uvicorn src.main:app --reload  # app module path matches the Dockerfile CMD
77
+ ```
78
+ 4. Query API
79
+ ```bash
80
+ # Query API
81
+ curl http://localhost:8000/query -X POST -H "Content-Type: application/json" -d '{"query": "What is the main contribution of this paper?", "top_k": 3}'
82
+ ```
83
+
84
+ ## Tech Stack
85
+
86
+ | Component | Technology | Why |
87
+ |------------------|-------------------------------|---------------------------------------------------------------------|
88
+ | Embeddings | Ollama (`nomic-embed-text`) | Local, free, 768-dimensional embeddings |
89
+ | Vector Database | Chroma | Persistent storage, fast similarity search, completely free |
90
+ | LLM | Groq (Llama 3.1) | Free API tier, very fast inference |
91
+ | Backend | FastAPI | Production-grade, async, automatic API docs |
92
+ | Frontend | HTML / CSS / JavaScript | Simple setup, no build tooling required |
93
+ | Package Manager | UV | Fast dependency resolution, deterministic environments |
frontend/index.html ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Document Intelligence RAG</title>
8
+ <style>
9
+ * {
10
+ margin: 0;
11
+ padding: 0;
12
+ box-sizing: border-box;
13
+ }
14
+
15
+ body {
16
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
17
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
18
+ min-height: 100vh;
19
+ padding: 20px;
20
+ }
21
+
22
+ .container {
23
+ max-width: 1000px;
24
+ margin: 0 auto;
25
+ }
26
+
27
+ header {
28
+ text-align: center;
29
+ color: white;
30
+ margin-bottom: 40px;
31
+ }
32
+
33
+ header h1 {
34
+ font-size: 2.5em;
35
+ margin-bottom: 10px;
36
+ text-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
37
+ }
38
+
39
+ header p {
40
+ font-size: 1.1em;
41
+ opacity: 0.9;
42
+ }
43
+
44
+ .main-grid {
45
+ display: grid;
46
+ grid-template-columns: 1fr 1fr;
47
+ gap: 20px;
48
+ margin-bottom: 20px;
49
+ }
50
+
51
+ .card {
52
+ background: white;
53
+ border-radius: 12px;
54
+ padding: 25px;
55
+ box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
56
+ }
57
+
58
+ .card h2 {
59
+ color: #333;
60
+ margin-bottom: 15px;
61
+ font-size: 1.3em;
62
+ }
63
+
64
+ .upload-area {
65
+ border: 2px dashed #667eea;
66
+ border-radius: 8px;
67
+ padding: 30px;
68
+ text-align: center;
69
+ cursor: pointer;
70
+ transition: all 0.3s;
71
+ }
72
+
73
+ .upload-area:hover {
74
+ border-color: #764ba2;
75
+ background: #f8f9ff;
76
+ }
77
+
78
+ .upload-area.dragover {
79
+ border-color: #764ba2;
80
+ background: #f0f2ff;
81
+ }
82
+
83
+ .upload-area input {
84
+ display: none;
85
+ }
86
+
87
+ .upload-area p {
88
+ color: #666;
89
+ margin-bottom: 10px;
90
+ }
91
+
92
+ .btn {
93
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
94
+ color: white;
95
+ border: none;
96
+ padding: 12px 24px;
97
+ border-radius: 8px;
98
+ cursor: pointer;
99
+ font-size: 1em;
100
+ font-weight: 600;
101
+ transition: transform 0.2s, box-shadow 0.2s;
102
+ }
103
+
104
+ .btn:hover {
105
+ transform: translateY(-2px);
106
+ box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
107
+ }
108
+
109
+ .btn:active {
110
+ transform: translateY(0);
111
+ }
112
+
113
+ .btn-secondary {
114
+ background: #f0f0f0;
115
+ color: #333;
116
+ }
117
+
118
+ .btn-secondary:hover {
119
+ background: #e0e0e0;
120
+ box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
121
+ }
122
+
123
+ .query-input {
124
+ display: flex;
125
+ gap: 10px;
126
+ margin-bottom: 20px;
127
+ }
128
+
129
+ .query-input input {
130
+ flex: 1;
131
+ padding: 12px;
132
+ border: 2px solid #e0e0e0;
133
+ border-radius: 8px;
134
+ font-size: 1em;
135
+ transition: border-color 0.3s;
136
+ }
137
+
138
+ .query-input input:focus {
139
+ outline: none;
140
+ border-color: #667eea;
141
+ }
142
+
143
+ .status {
144
+ padding: 15px;
145
+ border-radius: 8px;
146
+ margin-bottom: 15px;
147
+ font-size: 0.95em;
148
+ }
149
+
150
+ .status.success {
151
+ background: #d4edda;
152
+ color: #155724;
153
+ border-left: 4px solid #28a745;
154
+ }
155
+
156
+ .status.error {
157
+ background: #f8d7da;
158
+ color: #721c24;
159
+ border-left: 4px solid #f5c6cb;
160
+ }
161
+
162
+ .status.loading {
163
+ background: #e7f3ff;
164
+ color: #004085;
165
+ border-left: 4px solid #0c5ff4;
166
+ }
167
+
168
+ .answer-box {
169
+ background: #f8f9fa;
170
+ border-left: 4px solid #667eea;
171
+ padding: 15px;
172
+ border-radius: 8px;
173
+ margin-bottom: 20px;
174
+ }
175
+
176
+ .answer-box h3 {
177
+ color: #333;
178
+ margin-bottom: 10px;
179
+ }
180
+
181
+ .answer-box p {
182
+ color: #555;
183
+ line-height: 1.6;
184
+ margin-bottom: 15px;
185
+ }
186
+
187
+ .sources {
188
+ background: white;
189
+ border-radius: 8px;
190
+ padding: 15px;
191
+ margin-bottom: 15px;
192
+ }
193
+
194
+ .sources h4 {
195
+ color: #333;
196
+ margin-bottom: 12px;
197
+ font-size: 0.95em;
198
+ }
199
+
200
+ .source-item {
201
+ padding: 10px;
202
+ background: #f8f9fa;
203
+ border-radius: 6px;
204
+ margin-bottom: 8px;
205
+ border-left: 3px solid #667eea;
206
+ font-size: 0.9em;
207
+ }
208
+
209
+ .source-item .relevance {
210
+ color: #667eea;
211
+ font-weight: 600;
212
+ margin-bottom: 5px;
213
+ }
214
+
215
+ .source-item .text {
216
+ color: #555;
217
+ font-style: italic;
218
+ }
219
+
220
+ .stats {
221
+ display: grid;
222
+ grid-template-columns: repeat(2, 1fr);
223
+ gap: 10px;
224
+ margin-bottom: 20px;
225
+ }
226
+
227
+ .stat-box {
228
+ background: #f8f9fa;
229
+ padding: 12px;
230
+ border-radius: 6px;
231
+ text-align: center;
232
+ }
233
+
234
+ .stat-box .number {
235
+ font-size: 1.5em;
236
+ font-weight: bold;
237
+ color: #667eea;
238
+ }
239
+
240
+ .stat-box .label {
241
+ font-size: 0.85em;
242
+ color: #666;
243
+ margin-top: 5px;
244
+ }
245
+
246
+ .status-grid {
247
+ display: grid;
248
+ grid-template-columns: repeat(4, 1fr);
249
+ gap: 10px;
250
+ }
251
+
252
+ .loading-spinner {
253
+ display: inline-block;
254
+ width: 20px;
255
+ height: 20px;
256
+ border: 3px solid #f3f3f3;
257
+ border-top: 3px solid #667eea;
258
+ border-radius: 50%;
259
+ animation: spin 1s linear infinite;
260
+ margin-right: 10px;
261
+ vertical-align: middle;
262
+ }
263
+
264
+ @keyframes spin {
265
+ 0% {
266
+ transform: rotate(0deg);
267
+ }
268
+
269
+ 100% {
270
+ transform: rotate(360deg);
271
+ }
272
+ }
273
+
274
+ .full-width {
275
+ grid-column: 1 / -1;
276
+ }
277
+
278
+ @media (max-width: 768px) {
279
+ .main-grid {
280
+ grid-template-columns: 1fr;
281
+ }
282
+
283
+ header h1 {
284
+ font-size: 1.8em;
285
+ }
286
+
287
+ .stats {
288
+ grid-template-columns: 1fr;
289
+ }
290
+
291
+ .status-grid {
292
+ grid-template-columns: repeat(2, 1fr);
293
+ }
294
+ }
295
+
296
+ .hidden {
297
+ display: none;
298
+ }
299
+ </style>
300
+ </head>
301
+
302
+ <body>
303
+ <div class="container">
304
+ <header>
305
+ <h1>📚 Document Intelligence RAG</h1>
306
+ <p>Ask questions about your research papers</p>
307
+ </header>
308
+
309
+ <div class="main-grid">
310
+ <!-- Upload Section -->
311
+ <div class="card">
312
+ <h2>📤 Upload Documents</h2>
313
+
314
+ <div class="upload-area" id="uploadArea">
315
+ <p>📁 Drag & drop PDFs here or click to browse</p>
316
+ <input type="file" id="fileInput" multiple accept=".pdf">
317
+ <button class="btn" onclick="document.getElementById('fileInput').click()">
318
+ Choose Files
319
+ </button>
320
+ </div>
321
+
322
+ <div id="uploadStatus" class="status hidden"></div>
323
+
324
+ <div id="stats" class="stats">
325
+ <div class="stat-box">
326
+ <div class="number" id="totalChunks">0</div>
327
+ <div class="label">Total Chunks</div>
328
+ </div>
329
+ <div class="stat-box">
330
+ <div class="number" id="docCount">0</div>
331
+ <div class="label">Documents</div>
332
+ </div>
333
+ </div>
334
+
335
+ <button class="btn btn-secondary" onclick="loadStats()">
336
+ 🔄 Refresh Stats
337
+ </button>
338
+
339
+ <button class="btn btn-secondary" style="background: #ff6b6b; color: white; margin-top: 10px;"
340
+ onclick="resetSystem()">
341
+ 🗑️ Delete All Documents
342
+ </button>
343
+
344
+ <p style="font-size: 0.85em; color: #999; margin-top: 10px;">
345
+ 💾 Documents are stored persistently. They remain after restart.
346
+ </p>
347
+ </div>
348
+
349
+ <!-- Query Section -->
350
+ <div class="card">
351
+ <h2>❓ Ask Questions</h2>
352
+
353
+ <div class="query-input">
354
+ <input type="text" id="queryInput" placeholder="What would you like to know about your documents?"
355
+ onkeypress="if(event.key==='Enter') submitQuery()">
356
+ <button class="btn" onclick="submitQuery()">Search</button>
357
+ </div>
358
+
359
+ <div id="queryStatus" class="status hidden"></div>
360
+
361
+ <div id="answerContainer" class="hidden">
362
+ <div class="answer-box">
363
+ <h3>Answer</h3>
364
+ <p id="answerText"></p>
365
+ </div>
366
+
367
+ <div class="sources" id="sourcesBox">
368
+ <h4>📖 Sources Used</h4>
369
+ <div id="sourcesList"></div>
370
+ </div>
371
+ </div>
372
+ </div>
373
+ </div>
374
+
375
+ <!-- Status Indicators -->
376
+ <div class="card full-width">
377
+ <h2>🔧 System Status</h2>
378
+ <div id="healthStatus" class="status-grid">Loading...</div>
379
+ </div>
380
+ </div>
381
+
382
+ <script>
383
+ const API_URL = 'http://localhost:8000';
384
+
385
+ // Upload handlers
386
+ const uploadArea = document.getElementById('uploadArea');
387
+ const fileInput = document.getElementById('fileInput');
388
+
389
+ uploadArea.addEventListener('click', () => fileInput.click());
390
+ uploadArea.addEventListener('dragover', (e) => {
391
+ e.preventDefault();
392
+ uploadArea.classList.add('dragover');
393
+ });
394
+ uploadArea.addEventListener('dragleave', () => {
395
+ uploadArea.classList.remove('dragover');
396
+ });
397
+ uploadArea.addEventListener('drop', (e) => {
398
+ e.preventDefault();
399
+ uploadArea.classList.remove('dragover');
400
+ handleFiles(e.dataTransfer.files);
401
+ });
402
+
403
+ fileInput.addEventListener('change', (e) => {
404
+ handleFiles(e.target.files);
405
+ });
406
+
407
+ async function handleFiles(files) {
408
+ const statusDiv = document.getElementById('uploadStatus');
409
+
410
+ for (const file of files) {
411
+ if (!file.name.endsWith('.pdf')) {
412
+ showStatus(statusDiv, `Skipping ${file.name} - only PDFs supported`, 'error');
413
+ continue;
414
+ }
415
+
416
+ showStatus(statusDiv, `Uploading ${file.name}...`, 'loading');
417
+
418
+ const formData = new FormData();
419
+ formData.append('file', file);
420
+
421
+ try {
422
+ const response = await fetch(`${API_URL}/ingest`, {
423
+ method: 'POST',
424
+ body: formData
425
+ });
426
+
427
+ if (response.ok) {
428
+ const data = await response.json();
429
+ showStatus(
430
+ statusDiv,
431
+ `✓ ${file.name}: ${data.chunks_embedded} chunks ingested`,
432
+ 'success'
433
+ );
434
+ loadStats();
435
+ } else {
436
+ const error = await response.json();
437
+ showStatus(statusDiv, `✗ ${file.name}: ${error.detail}`, 'error');
438
+ }
439
+ } catch (error) {
440
+ showStatus(statusDiv, `✗ Upload failed: ${error.message}`, 'error');
441
+ }
442
+ }
443
+
444
+ fileInput.value = '';
445
+ }
446
+
447
+ async function submitQuery() {
448
+ const query = document.getElementById('queryInput').value.trim();
449
+ if (!query) {
450
+ showStatus(
451
+ document.getElementById('queryStatus'),
452
+ 'Please enter a question',
453
+ 'error'
454
+ );
455
+ return;
456
+ }
457
+
458
+ const statusDiv = document.getElementById('queryStatus');
459
+ showStatus(statusDiv, 'Searching your documents...', 'loading');
460
+
461
+ try {
462
+ const response = await fetch(`${API_URL}/query`, {
463
+ method: 'POST',
464
+ headers: { 'Content-Type': 'application/json' },
465
+ body: JSON.stringify({ query, top_k: 3 })
466
+ });
467
+
468
+ if (response.ok) {
469
+ const data = await response.json();
470
+ displayAnswer(data);
471
+ statusDiv.classList.add('hidden');
472
+ } else {
473
+ const error = await response.json();
474
+ showStatus(statusDiv, error.error || 'Query failed', 'error');
475
+ }
476
+ } catch (error) {
477
+ showStatus(statusDiv, `Error: ${error.message}`, 'error');
478
+ }
479
+ }
480
+
481
+ function displayAnswer(data) {
482
+ document.getElementById('answerText').textContent = data.answer;
483
+
484
+ const sourcesList = document.getElementById('sourcesList');
485
+ sourcesList.innerHTML = data.sources.map(source => `
486
+ <div class="source-item">
487
+ <div class="relevance">📌 Relevance: ${(source.similarity * 100).toFixed(0)}%</div>
488
+ <div class="text">${source.preview}</div>
489
+ </div>
490
+ `).join('');
491
+
492
+ document.getElementById('answerContainer').classList.remove('hidden');
493
+ }
494
+
495
+ async function loadStats() {
496
+ try {
497
+ const response = await fetch(`${API_URL}/stats`);
498
+ if (response.ok) {
499
+ const data = await response.json();
500
+ document.getElementById('totalChunks').textContent = data.total_chunks;
501
+ }
502
+ } catch (error) {
503
+ console.error('Failed to load stats:', error);
504
+ }
505
+ }
506
+
507
+ async function loadHealth() {
508
+ try {
509
+ const response = await fetch(`${API_URL}/health`);
510
+ if (response.ok) {
511
+ const data = await response.json();
512
+
513
+ // Get embedding backend name
514
+ let embeddingName = data.embedding_backend || 'Unknown';
515
+ // Format nicely
516
+ if (embeddingName === 'sentence-transformers') {
517
+ embeddingName = 'Sentence-Transformers';
518
+ } else if (embeddingName === 'ollama') {
519
+ embeddingName = 'Ollama';
520
+ }
521
+
522
+ const healthHtml = `
523
+ <div class="stat-box">
524
+ <div class="number">${data.embedding_backend ? '✓' : '✗'}</div>
525
+ <div class="label">${embeddingName} (Embeddings)</div>
526
+ </div>
527
+ <div class="stat-box">
528
+ <div class="number">${data.groq === '✓' ? '✓' : '✗'}</div>
529
+ <div class="label">Groq (LLM)</div>
530
+ </div>
531
+ <div class="stat-box">
532
+ <div class="number">${data.chroma.status === '✓' ? '✓' : '✗'}</div>
533
+ <div class="label">Chroma (Vector DB)</div>
534
+ </div>
535
+ <div class="stat-box">
536
+ <div class="number">${data.status === 'healthy' ? '✓' : '⚠'}</div>
537
+ <div class="label">Overall Status</div>
538
+ </div>
539
+ `;
540
+ document.getElementById('healthStatus').innerHTML = healthHtml;
541
+ }
542
+ } catch (error) {
543
+ document.getElementById('healthStatus').innerHTML =
544
+ `<div style="grid-column: 1/-1; padding: 15px; background: #f8d7da; color: #721c24; border-radius: 8px;">Cannot connect to API at ${API_URL}</div>`;
545
+ }
546
+ }
547
+
548
+ async function resetSystem() {
549
+ if (!confirm('⚠️ Delete ALL documents and embeddings? This cannot be undone!')) {
550
+ return;
551
+ }
552
+
553
+ const statusDiv = document.getElementById('uploadStatus');
554
+ showStatus(statusDiv, 'Resetting system...', 'loading');
555
+
556
+ try {
557
+ const response = await fetch(`${API_URL}/reset`, {
558
+ method: 'POST',
559
+ headers: {
560
+ 'Content-Type': 'application/json'
561
+ }
562
+ });
563
+
564
+ if (response.ok) {
565
+ const data = await response.json();
566
+ showStatus(statusDiv, '✓ All documents deleted!', 'success');
567
+ loadStats();
568
+ } else {
569
+ const error = await response.json();
570
+ showStatus(statusDiv, `Reset failed: ${error.detail || 'Unknown error'}`, 'error');
571
+ }
572
+ } catch (error) {
573
+ showStatus(statusDiv, `Error: ${error.message}`, 'error');
574
+ }
575
+ }
576
+
577
+ function showStatus(element, message, type) {
578
+ element.textContent = message;
579
+ element.className = `status ${type}`;
580
+ element.classList.remove('hidden');
581
+ }
582
+
583
+ // Load stats and health on page load
584
+ window.addEventListener('load', () => {
585
+ loadStats();
586
+ loadHealth();
587
+ setInterval(loadHealth, 30000); // Refresh every 30s
588
+ });
589
+ </script>
590
+ </body>
591
+
592
+ </html>
main.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
def main() -> None:
    """Entry point: print a greeting identifying the project."""
    greeting = "Hello from doc-intelligence-rag!"
    print(greeting)


if __name__ == "__main__":
    main()
notebooks/01_rag_notebook.ipynb ADDED
@@ -0,0 +1,993 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 7,
6
+ "id": "b0f26b66",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "d:\\projects\\doc-intelligence-rag\\.venv\\Scripts\\python.exe\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import sys\n",
19
+ "print(sys.executable)"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 9,
25
+ "id": "d39fb45d",
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "name": "stdout",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "============================================================\n",
33
+ "PART 0: RAG Pipeline Overview\n",
34
+ "============================================================\n",
35
+ "\n",
36
+ "Input: User asks \"What is machine learning?\"\n",
37
+ "\n",
38
+ " ↓\n",
39
+ "\n",
40
+ "1. RETRIEVE: Search similar docs\n",
41
+ " Question embedding → Find top 3 most similar chunks\n",
42
+ "\n",
43
+ "2. AUGMENT: Build context\n",
44
+ " Combine retrieved chunks into a context string\n",
45
+ "\n",
46
+ "3. GENERATE: LLM answers\n",
47
+ " Pass (context + question) → LLM → Answer\n",
48
+ "\n",
49
+ "Output: \"Machine learning is...\"\n",
50
+ "\n",
51
+ "\n",
52
+ "============================================================\n",
53
+ "PART 1: Text Chunking (Why?)\n",
54
+ "============================================================\n",
55
+ "\n",
56
+ "WHY chunk text?\n",
57
+ " • Embeddings work better on ~500 token chunks (not 50k token documents)\n",
58
+ " • Allows granular retrieval (retrieve only relevant section)\n",
59
+ " • Reduces embedding costs\n",
60
+ "\n",
61
+ "CHALLENGE: Chunking strategy matters!\n",
62
+ " • Too small: Lose context (100 tokens)\n",
63
+ " • Too large: Lose precision (5000 tokens)\n",
64
+ " • SOLUTION: Use overlap (e.g., 500 token chunk with 50 token overlap)\n",
65
+ "\n",
66
+ "WHY overlap?\n",
67
+ " • Important info might be at chunk boundary\n",
68
+ " • Overlap ensures semantic continuity\n",
69
+ " • Example: \"The study shows A. B supports A.\" might split badly without overlap\n",
70
+ "\n"
71
+ ]
72
+ }
73
+ ],
74
+ "source": [
75
+ "# RAG System from First Principles\n",
76
+ "# =====================================\n",
77
+ "# This notebook builds a RAG system step-by-step\n",
78
+ "# We'll understand WHY before we code\n",
79
+ "#\n",
80
+ "# Prerequisites with UV:\n",
81
+ "# uv init rag-learning\n",
82
+ "# uv add jupyter numpy pandas requests pydantic groq\n",
83
+ "# uv venv && source .venv/bin/activate\n",
84
+ "#\n",
85
+ "# Free APIs Used:\n",
86
+ "# - Groq (LLM): https://console.groq.com (free API key)\n",
87
+ "# - Ollama (Embeddings): https://ollama.ai (local, completely free)\n",
88
+ "\n",
89
+ "# ========== PART 0: THE PROBLEM ==========\n",
90
+ "# \n",
91
+ "# Problem: How do we make an LLM answer questions about OUR documents?\n",
92
+ "# \n",
93
+ "# Naive approach: Put entire document into prompt\n",
94
+ "# ❌ Problem: Context window is limited (4k, 8k, 128k tokens)\n",
95
+ "# ❌ Problem: Costs scale with document size\n",
96
+ "# ❌ Problem: LLM gets confused with irrelevant information\n",
97
+ "#\n",
98
+ "# Better approach: RAG (Retrieval-Augmented Generation)\n",
99
+ "# ✓ Only pass relevant chunks to LLM\n",
100
+ "# ✓ Reduces costs\n",
101
+ "# ✓ Improves accuracy\n",
102
+ "#\n",
103
+ "# RAG Pipeline:\n",
104
+ "# 1. Split document into chunks\n",
105
+ "# 2. Convert chunks to embeddings (vectors)\n",
106
+ "# 3. Store embeddings in vector database\n",
107
+ "# 4. When user asks question:\n",
108
+ "# a) Convert question to embedding\n",
109
+ "# b) Find most similar chunks (similarity search)\n",
110
+ "# c) Pass those chunks + question to LLM\n",
111
+ "# d) LLM answers based on those chunks\n",
112
+ "\n",
113
+ "print(\"=\" * 60)\n",
114
+ "print(\"PART 0: RAG Pipeline Overview\")\n",
115
+ "print(\"=\" * 60)\n",
116
+ "print(\"\"\"\n",
117
+ "Input: User asks \"What is machine learning?\"\n",
118
+ " \n",
119
+ " ↓\n",
120
+ " \n",
121
+ "1. RETRIEVE: Search similar docs\n",
122
+ " Question embedding → Find top 3 most similar chunks\n",
123
+ " \n",
124
+ "2. AUGMENT: Build context\n",
125
+ " Combine retrieved chunks into a context string\n",
126
+ " \n",
127
+ "3. GENERATE: LLM answers\n",
128
+ " Pass (context + question) → LLM → Answer\n",
129
+ "\n",
130
+ "Output: \"Machine learning is...\"\n",
131
+ "\"\"\")\n",
132
+ "\n",
133
+ "# ========== PART 1: TEXT CHUNKING ==========\n",
134
+ "print(\"\\n\" + \"=\" * 60)\n",
135
+ "print(\"PART 1: Text Chunking (Why?)\")\n",
136
+ "print(\"=\" * 60)\n",
137
+ "\n",
138
+ "print(\"\"\"\n",
139
+ "WHY chunk text?\n",
140
+ " • Embeddings work better on ~500 token chunks (not 50k token documents)\n",
141
+ " • Allows granular retrieval (retrieve only relevant section)\n",
142
+ " • Reduces embedding costs\n",
143
+ "\n",
144
+ "CHALLENGE: Chunking strategy matters!\n",
145
+ " • Too small: Lose context (100 tokens)\n",
146
+ " • Too large: Lose precision (5000 tokens)\n",
147
+ " • SOLUTION: Use overlap (e.g., 500 token chunk with 50 token overlap)\n",
148
+ "\n",
149
+ "WHY overlap?\n",
150
+ " • Important info might be at chunk boundary\n",
151
+ " • Overlap ensures semantic continuity\n",
152
+ " • Example: \"The study shows A. B supports A.\" might split badly without overlap\n",
153
+ "\"\"\")\n"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 13,
159
+ "id": "0523d02c",
160
+ "metadata": {},
161
+ "outputs": [
162
+ {
163
+ "name": "stdout",
164
+ "output_type": "stream",
165
+ "text": [
166
+ "\n",
167
+ "✓ Split into 5 chunks:\n",
168
+ " Chunk 0: 20 words | Machine learning is a subset of artificial intelligence. It ...\n",
169
+ " Chunk 1: 20 words | on algorithms that learn from data. Deep learning uses neura...\n",
170
+ " Chunk 2: 20 words | networks with multiple layers. Transformers are the backbone...\n",
171
+ " Chunk 3: 12 words | NLP systems. The attention mechanism allows models to focus ...\n",
172
+ " Chunk 4: 2 words | relevant parts....\n"
173
+ ]
174
+ }
175
+ ],
176
+ "source": [
177
+ "\n",
178
+ "# Implement chunking\n",
179
+ "def chunk_text(text, chunk_size=500, overlap=50):\n",
180
+ " \"\"\"\n",
181
+ " Split text into overlapping chunks.\n",
182
+ " \n",
183
+ " Args:\n",
184
+ " text: Raw text to chunk\n",
185
+ " chunk_size: Tokens per chunk (roughly words * 0.75)\n",
186
+ " overlap: Tokens overlap between chunks\n",
187
+ " \n",
188
+ " Returns:\n",
189
+ " List of chunk dicts with text and metadata\n",
190
+ " \"\"\"\n",
191
+ " words = text.split()\n",
192
+ " chunks = []\n",
193
+ " \n",
194
+ " # Calculate stride (how many words to move forward each iteration)\n",
195
+ " # If overlap=50 and chunk_size=500, stride=450\n",
196
+ " stride = chunk_size - overlap\n",
197
+ " \n",
198
+ " for i in range(0, len(words), stride):\n",
199
+ " chunk_words = words[i:i + chunk_size]\n",
200
+ " chunk_text = \" \".join(chunk_words)\n",
201
+ " \n",
202
+ " if chunk_text.strip(): # Skip empty chunks\n",
203
+ " chunks.append({\n",
204
+ " \"text\": chunk_text,\n",
205
+ " \"start_idx\": i,\n",
206
+ " \"word_count\": len(chunk_words),\n",
207
+ " \"chunk_id\": len(chunks) # Simple ID\n",
208
+ " })\n",
209
+ " \n",
210
+ " return chunks\n",
211
+ "\n",
212
+ "# Test chunking\n",
213
+ "sample_text = \"\"\"\n",
214
+ "Machine learning is a subset of artificial intelligence. \n",
215
+ "It focuses on algorithms that learn from data. \n",
216
+ "Deep learning uses neural networks with multiple layers. \n",
217
+ "Transformers are the backbone of modern NLP systems. \n",
218
+ "The attention mechanism allows models to focus on relevant parts.\n",
219
+ "\"\"\"\n",
220
+ "\n",
221
+ "chunks = chunk_text(sample_text, chunk_size=20, overlap=10)\n",
222
+ "print(f\"\\n✓ Split into {len(chunks)} chunks:\")\n",
223
+ "for i, chunk in enumerate(chunks):\n",
224
+ " print(f\" Chunk {i}: {chunk['word_count']} words | {chunk['text'][:60]}...\")\n"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 14,
230
+ "id": "3464f9c9",
231
+ "metadata": {},
232
+ "outputs": [
233
+ {
234
+ "name": "stdout",
235
+ "output_type": "stream",
236
+ "text": [
237
+ "\n",
238
+ "============================================================\n",
239
+ "PART 2: Embeddings (Why?)\n",
240
+ "============================================================\n",
241
+ "\n",
242
+ "PROBLEM: How do we compare text for similarity?\n",
243
+ " • Can't just use string matching (\"cat\" ≠ \"feline\")\n",
244
+ " • Need semantic understanding\n",
245
+ "\n",
246
+ "SOLUTION: Embeddings (vectors that capture meaning)\n",
247
+ " • Convert text → vector (list of numbers)\n",
248
+ " • Similar texts have similar vectors\n",
249
+ " • We can use math to compare them!\n",
250
+ "\n",
251
+ "EXAMPLE:\n",
252
+ " \"The cat sat on the mat\" → [0.2, -0.5, 0.8, 0.1, ...] (384 dims)\n",
253
+ " \"A feline sat on rug\" → [0.21, -0.48, 0.79, 0.12, ...] (384 dims)\n",
254
+ "\n",
255
+ " Notice: Very similar vectors = similar meaning!\n",
256
+ "\n",
257
+ "HOW DO WE GET EMBEDDINGS?\n",
258
+ " Option 1: Use OpenAI API (costs money)\n",
259
+ " Option 2: Use local Ollama (free, slower)\n",
260
+ "\n",
261
+ " We'll use Ollama because:\n",
262
+ " ✓ Free (no API costs)\n",
263
+ " ✓ Privacy (stays on your machine)\n",
264
+ " ✓ Fast for this use case\n",
265
+ " ✓ Good enough for RAG\n",
266
+ "\n",
267
+ "EMBEDDING MODELS:\n",
268
+ " • nomic-embed-text: 768 dimensions (good for RAG)\n",
269
+ " • all-minilm-l6-v2: 384 dimensions (lighter)\n",
270
+ " • openai embedding: 1536 dimensions (more expensive, higher quality)\n",
271
+ "\n"
272
+ ]
273
+ }
274
+ ],
275
+ "source": [
276
+ "\n",
277
+ "# ========== PART 2: EMBEDDINGS ==========\n",
278
+ "print(\"\\n\" + \"=\" * 60)\n",
279
+ "print(\"PART 2: Embeddings (Why?)\")\n",
280
+ "print(\"=\" * 60)\n",
281
+ "\n",
282
+ "print(\"\"\"\n",
283
+ "PROBLEM: How do we compare text for similarity?\n",
284
+ " • Can't just use string matching (\"cat\" ≠ \"feline\")\n",
285
+ " • Need semantic understanding\n",
286
+ "\n",
287
+ "SOLUTION: Embeddings (vectors that capture meaning)\n",
288
+ " • Convert text → vector (list of numbers)\n",
289
+ " • Similar texts have similar vectors\n",
290
+ " • We can use math to compare them!\n",
291
+ "\n",
292
+ "EXAMPLE:\n",
293
+ " \"The cat sat on the mat\" → [0.2, -0.5, 0.8, 0.1, ...] (384 dims)\n",
294
+ " \"A feline sat on rug\" → [0.21, -0.48, 0.79, 0.12, ...] (384 dims)\n",
295
+ " \n",
296
+ " Notice: Very similar vectors = similar meaning!\n",
297
+ "\n",
298
+ "HOW DO WE GET EMBEDDINGS?\n",
299
+ " Option 1: Use OpenAI API (costs money)\n",
300
+ " Option 2: Use local Ollama (free, slower)\n",
301
+ " \n",
302
+ " We'll use Ollama because:\n",
303
+ " ✓ Free (no API costs)\n",
304
+ " ✓ Privacy (stays on your machine)\n",
305
+ " ✓ Fast for this use case\n",
306
+ " ✓ Good enough for RAG\n",
307
+ "\n",
308
+ "EMBEDDING MODELS:\n",
309
+ " • nomic-embed-text: 768 dimensions (good for RAG)\n",
310
+ " • all-minilm-l6-v2: 384 dimensions (lighter)\n",
311
+ " • openai embedding: 1536 dimensions (more expensive, higher quality)\n",
312
+ "\"\"\")\n"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 22,
318
+ "id": "5eb96a3e",
319
+ "metadata": {},
320
+ "outputs": [
321
+ {
322
+ "name": "stdout",
323
+ "output_type": "stream",
324
+ "text": [
325
+ "\n",
326
+ "✓ Embedding shape: 384 dimensions\n",
327
+ " Sample values: [-0.019382589036581188, -0.005570373852637493, -0.05710695701024572, -0.04601154504073093, 0.08037614551187885] ...\n"
328
+ ]
329
+ }
330
+ ],
331
+ "source": [
332
+ "\n",
333
+ "import numpy as np\n",
334
+ "\n",
335
+ "def simulate_embedding(text, dims=384):\n",
336
+ " \"\"\"\n",
337
+ " Simulate what an embedding looks like.\n",
338
+ " \n",
339
+ " In reality, Ollama would use a neural network to create this.\n",
340
+ " For learning, we'll just use a hash-based deterministic vector.\n",
341
+ " \"\"\"\n",
342
+ " # Hash the text to get consistent numbers\n",
343
+ " hash_val = hash(text) % 10000\n",
344
+ " np.random.seed(hash_val)\n",
345
+ " \n",
346
+ " # Create random but consistent vector\n",
347
+ " embedding = np.random.randn(dims)\n",
348
+ " \n",
349
+ " # Normalize to unit length (important for cosine similarity)\n",
350
+ " embedding = embedding / np.linalg.norm(embedding)\n",
351
+ " \n",
352
+ " return embedding.tolist()\n",
353
+ "\n",
354
+ "# Demonstrate\n",
355
+ "text1 = \"Machine learning is AI\"\n",
356
+ "text2 = \"Deep learning uses neural networks\"\n",
357
+ "text3 = \"Cooking pasta is delicious\"\n",
358
+ "\n",
359
+ "emb1 = simulate_embedding(text1)\n",
360
+ "emb2 = simulate_embedding(text2)\n",
361
+ "emb3 = simulate_embedding(text3)\n",
362
+ "\n",
363
+ "print(f\"\\n✓ Embedding shape: {len(emb1)} dimensions\")\n",
364
+ "print(f\" Sample values: {emb1[:5]} ...\")\n"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "code",
369
+ "execution_count": 28,
370
+ "id": "e15b8066",
371
+ "metadata": {},
372
+ "outputs": [
373
+ {
374
+ "name": "stdout",
375
+ "output_type": "stream",
376
+ "text": [
377
+ "\n",
378
+ "============================================================\n",
379
+ "PART 3: Similarity Search (Why?)\n",
380
+ "============================================================\n",
381
+ "\n",
382
+ "GOAL: Find which chunks are most relevant to a query\n",
383
+ "\n",
384
+ "METHOD: Cosine Similarity\n",
385
+ " • Measure angle between vectors\n",
386
+ " • Values from -1 to 1 (1 = identical direction = same meaning)\n",
387
+ " • Formula: similarity = (A · B) / (|A| * |B|)\n",
388
+ "\n",
389
+ "EXAMPLE:\n",
390
+ " Query: \"What is deep learning?\"\n",
391
+ " Chunk 1: \"Deep learning uses neural networks\" → similarity = 0.92 ✓ Relevant!\n",
392
+ " Chunk 2: \"Cooking pasta...\" → similarity = 0.15 ✗ Not relevant\n",
393
+ " Chunk 3: \"Neural networks have many layers\" → similarity = 0.85 ✓ Relevant!\n",
394
+ "\n",
395
+ " Return top 2: Chunks 1 and 3\n",
396
+ "\n",
397
+ "\n",
398
+ "✓ Query: 'neural networks'\n",
399
+ " Results (sorted by relevance):\n",
400
+ " -0.005 | Machine learning is AI\n",
401
+ " -0.021 | Cooking pasta is delicious\n",
402
+ " -0.077 | Deep learning uses neural networks\n"
403
+ ]
404
+ }
405
+ ],
406
+ "source": [
407
+ "\n",
408
+ "# ========== PART 3: SIMILARITY SEARCH ==========\n",
409
+ "print(\"\\n\" + \"=\" * 60)\n",
410
+ "print(\"PART 3: Similarity Search (Why?)\")\n",
411
+ "print(\"=\" * 60)\n",
412
+ "\n",
413
+ "print(\"\"\"\n",
414
+ "GOAL: Find which chunks are most relevant to a query\n",
415
+ "\n",
416
+ "METHOD: Cosine Similarity\n",
417
+ " • Measure angle between vectors\n",
418
+ " • Values from -1 to 1 (1 = identical direction = same meaning)\n",
419
+ " • Formula: similarity = (A · B) / (|A| * |B|)\n",
420
+ " \n",
421
+ "EXAMPLE:\n",
422
+ " Query: \"What is deep learning?\"\n",
423
+ " Chunk 1: \"Deep learning uses neural networks\" → similarity = 0.92 ✓ Relevant!\n",
424
+ " Chunk 2: \"Cooking pasta...\" → similarity = 0.15 ✗ Not relevant\n",
425
+ " Chunk 3: \"Neural networks have many layers\" → similarity = 0.85 ✓ Relevant!\n",
426
+ " \n",
427
+ " Return top 2: Chunks 1 and 3\n",
428
+ "\"\"\")\n",
429
+ "\n",
430
+ "def cosine_similarity(vec_a, vec_b):\n",
431
+ " \"\"\"\n",
432
+ " Calculate cosine similarity between two vectors.\n",
433
+ " \n",
434
+ " Returns value between -1 and 1 (higher = more similar)\n",
435
+ " \"\"\"\n",
436
+ " a = np.array(vec_a)\n",
437
+ " b = np.array(vec_b)\n",
438
+ " \n",
439
+ " dot_product = np.dot(a, b)\n",
440
+ " norm_a = np.linalg.norm(a)\n",
441
+ " norm_b = np.linalg.norm(b)\n",
442
+ " \n",
443
+ " if norm_a == 0 or norm_b == 0:\n",
444
+ " return 0.0\n",
445
+ " \n",
446
+ " return float(dot_product / (norm_a * norm_b))\n",
447
+ "\n",
448
+ "# Test similarity\n",
449
+ "query = \"neural networks\"\n",
450
+ "query_emb = simulate_embedding(query)\n",
451
+ "\n",
452
+ "similarities = [\n",
453
+ " (\"Deep learning uses neural networks\", cosine_similarity(query_emb, emb2)),\n",
454
+ " (\"Machine learning is AI\", cosine_similarity(query_emb, emb1)),\n",
455
+ " (\"Cooking pasta is delicious\", cosine_similarity(query_emb, emb3)),\n",
456
+ "]\n",
457
+ "\n",
458
+ "# Sort by similarity\n",
459
+ "similarities.sort(key=lambda x: x[1], reverse=True)\n",
460
+ "\n",
461
+ "print(f\"\\n✓ Query: '{query}'\")\n",
462
+ "print(f\" Results (sorted by relevance):\")\n",
463
+ "for text, score in similarities:\n",
464
+ " bar = \"█\" * int(score * 20)\n",
465
+ " print(f\" {bar} {score:.3f} | {text}\")\n"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 29,
471
+ "id": "33b76510",
472
+ "metadata": {},
473
+ "outputs": [
474
+ {
475
+ "name": "stdout",
476
+ "output_type": "stream",
477
+ "text": [
478
+ "\n",
479
+ "============================================================\n",
480
+ "PART 4: Vector Store (Why?)\n",
481
+ "============================================================\n",
482
+ "\n",
483
+ "PROBLEM: How do we store and retrieve embeddings efficiently?\n",
484
+ "\n",
485
+ "SIMPLE APPROACH: In-memory dictionary\n",
486
+ " ✓ Easy to understand and implement\n",
487
+ " ✓ Fast for small documents (< 10k chunks)\n",
488
+ " ✗ Loses data when program stops\n",
489
+ " ✗ Doesn't scale to millions of vectors\n",
490
+ "\n",
491
+ "PRODUCTION APPROACH: Vector databases\n",
492
+ " • Pinecone, Weaviate, Milvus, Chroma, Qdrant\n",
493
+ " • Optimized for similarity search\n",
494
+ " • Persistent storage\n",
495
+ " • Scales to billions of vectors\n",
496
+ "\n",
497
+ "FOR NOW: We'll use in-memory (simple) but structure it to be replaceable\n",
498
+ "\n",
499
+ "\n",
500
+ "✓ Query: 'transformers attention mechanism'\n",
501
+ " Retrieved 2 chunks:\n",
502
+ " [0.005] on algorithms that learn from data. Deep learning uses neural networks...\n",
503
+ " [-0.012] NLP systems. The attention mechanism allows models to focus on relevan...\n"
504
+ ]
505
+ }
506
+ ],
507
+ "source": [
508
+ "\n",
509
+ "# ========== PART 4: VECTOR STORE ==========\n",
510
+ "print(\"\\n\" + \"=\" * 60)\n",
511
+ "print(\"PART 4: Vector Store (Why?)\")\n",
512
+ "print(\"=\" * 60)\n",
513
+ "\n",
514
+ "print(\"\"\"\n",
515
+ "PROBLEM: How do we store and retrieve embeddings efficiently?\n",
516
+ "\n",
517
+ "SIMPLE APPROACH: In-memory dictionary\n",
518
+ " ✓ Easy to understand and implement\n",
519
+ " ✓ Fast for small documents (< 10k chunks)\n",
520
+ " ✗ Loses data when program stops\n",
521
+ " ✗ Doesn't scale to millions of vectors\n",
522
+ "\n",
523
+ "PRODUCTION APPROACH: Vector databases\n",
524
+ " • Pinecone, Weaviate, Milvus, Chroma, Qdrant\n",
525
+ " • Optimized for similarity search\n",
526
+ " • Persistent storage\n",
527
+ " • Scales to billions of vectors\n",
528
+ "\n",
529
+ "FOR NOW: We'll use in-memory (simple) but structure it to be replaceable\n",
530
+ "\"\"\")\n",
531
+ "\n",
532
+ "class SimpleVectorStore:\n",
533
+ " \"\"\"\n",
534
+ " In-memory vector store.\n",
535
+ " Structure: Can easily swap for Pinecone/Weaviate later.\n",
536
+ " \"\"\"\n",
537
+ " \n",
538
+ " def __init__(self):\n",
539
+ " self.vectors = {} # chunk_id -> embedding vector\n",
540
+ " self.metadata = {} # chunk_id -> {text, source, etc}\n",
541
+ " \n",
542
+ " def add(self, chunk_id, text, embedding):\n",
543
+ " \"\"\"Store a chunk with its embedding.\"\"\"\n",
544
+ " self.vectors[chunk_id] = embedding\n",
545
+ " self.metadata[chunk_id] = {\"text\": text, \"length\": len(text)}\n",
546
+ " \n",
547
+ " def search(self, query_embedding, top_k=3):\n",
548
+ " \"\"\"Find most similar chunks.\"\"\"\n",
549
+ " if not self.vectors:\n",
550
+ " return []\n",
551
+ " \n",
552
+ " results = []\n",
553
+ " for chunk_id, vector in self.vectors.items():\n",
554
+ " similarity = cosine_similarity(query_embedding, vector)\n",
555
+ " results.append({\n",
556
+ " \"chunk_id\": chunk_id,\n",
557
+ " \"similarity\": similarity,\n",
558
+ " \"text\": self.metadata[chunk_id][\"text\"]\n",
559
+ " })\n",
560
+ " \n",
561
+ " # Sort by similarity descending\n",
562
+ " results.sort(key=lambda x: x[\"similarity\"], reverse=True)\n",
563
+ " return results[:top_k]\n",
564
+ "\n",
565
+ "# Test vector store\n",
566
+ "store = SimpleVectorStore()\n",
567
+ "\n",
568
+ "# Add chunks with embeddings\n",
569
+ "for chunk in chunks:\n",
570
+ " emb = simulate_embedding(chunk[\"text\"])\n",
571
+ " store.add(chunk[\"chunk_id\"], chunk[\"text\"], emb)\n",
572
+ "\n",
573
+ "# Search\n",
574
+ "query = \"transformers attention mechanism\"\n",
575
+ "query_emb = simulate_embedding(query)\n",
576
+ "results = store.search(query_emb, top_k=2)\n",
577
+ "\n",
578
+ "print(f\"\\n✓ Query: '{query}'\")\n",
579
+ "print(f\" Retrieved {len(results)} chunks:\")\n",
580
+ "for r in results:\n",
581
+ " print(f\" [{r['similarity']:.3f}] {r['text'][:70]}...\")\n"
582
+ ]
583
+ },
584
+ {
585
+ "cell_type": "code",
586
+ "execution_count": 31,
587
+ "id": "94ca95e9",
588
+ "metadata": {},
589
+ "outputs": [
590
+ {
591
+ "name": "stdout",
592
+ "output_type": "stream",
593
+ "text": [
594
+ "\n",
595
+ "============================================================\n",
596
+ "PART 5: LLM Integration (Why?)\n",
597
+ "============================================================\n",
598
+ "\n",
599
+ "PROBLEM: How do we actually answer the user's question?\n",
600
+ "\n",
601
+ "SOLUTION: Pass context + question to LLM\n",
602
+ " 1. Retrieve relevant chunks (from vector store)\n",
603
+ " 2. Combine into \"context\" string\n",
604
+ " 3. Create prompt: \"Context: [chunks] Question: [query]\"\n",
605
+ " 4. Send to LLM (Groq, OpenAI, etc)\n",
606
+ " 5. LLM answers based on context\n",
607
+ "\n",
608
+ "WHY THIS WORKS:\n",
609
+ " • LLM has knowledge from training\n",
610
+ " • Context grounds answer in YOUR documents\n",
611
+ " • LLM won't hallucinate (ideally) because answer must match context\n",
612
+ "\n",
613
+ "EXAMPLE PROMPT:\n",
614
+ " ---\n",
615
+ " Context:\n",
616
+ " Deep learning uses neural networks with many layers.\n",
617
+ " The attention mechanism focuses on relevant parts.\n",
618
+ "\n",
619
+ " Question: How does deep learning work?\n",
620
+ "\n",
621
+ " Answer: (LLM fills this in)\n",
622
+ " ---\n",
623
+ "\n",
624
+ "\n",
625
+ "✓ Built context from 2 chunks:\n",
626
+ " Length: 271 characters\n",
627
+ " Preview:\n",
628
+ " [Chunk 1 - Relevance: 0.5%]\n",
629
+ "on algorithms that learn from data. Deep learning uses neural networks with multiple layers. Transformers are the backbone of modern\n",
630
+ "\n",
631
+ "[Chunk 2 - Relevance: -1.2%]\n",
632
+ "NLP syste...\n"
633
+ ]
634
+ }
635
+ ],
636
+ "source": [
637
+ "\n",
638
+ "# ========== PART 5: LLM INTEGRATION ==========\n",
639
+ "print(\"\\n\" + \"=\" * 60)\n",
640
+ "print(\"PART 5: LLM Integration (Why?)\")\n",
641
+ "print(\"=\" * 60)\n",
642
+ "\n",
643
+ "print(\"\"\"\n",
644
+ "PROBLEM: How do we actually answer the user's question?\n",
645
+ "\n",
646
+ "SOLUTION: Pass context + question to LLM\n",
647
+ " 1. Retrieve relevant chunks (from vector store)\n",
648
+ " 2. Combine into \"context\" string\n",
649
+ " 3. Create prompt: \"Context: [chunks] Question: [query]\"\n",
650
+ " 4. Send to LLM (Groq, OpenAI, etc)\n",
651
+ " 5. LLM answers based on context\n",
652
+ "\n",
653
+ "WHY THIS WORKS:\n",
654
+ " • LLM has knowledge from training\n",
655
+ " • Context grounds answer in YOUR documents\n",
656
+ " • LLM won't hallucinate (ideally) because answer must match context\n",
657
+ "\n",
658
+ "EXAMPLE PROMPT:\n",
659
+ " ---\n",
660
+ " Context:\n",
661
+ " Deep learning uses neural networks with many layers.\n",
662
+ " The attention mechanism focuses on relevant parts.\n",
663
+ " \n",
664
+ " Question: How does deep learning work?\n",
665
+ " \n",
666
+ " Answer: (LLM fills this in)\n",
667
+ " ---\n",
668
+ "\"\"\")\n",
669
+ "\n",
670
+ "def build_context(retrieved_chunks):\n",
671
+ " \"\"\"\n",
672
+ " Combine retrieved chunks into a context string for the LLM.\n",
673
+ " \n",
674
+ " This is where you control what the LLM sees.\n",
675
+ " \"\"\"\n",
676
+ " context = \"\"\n",
677
+ " for i, chunk in enumerate(retrieved_chunks):\n",
678
+ " score = chunk.get(\"similarity\", 0)\n",
679
+ " context += f\"[Chunk {i+1} - Relevance: {score:.1%}]\\n\"\n",
680
+ " context += chunk[\"text\"] + \"\\n\\n\"\n",
681
+ " \n",
682
+ " return context\n",
683
+ "\n",
684
+ "# Demo\n",
685
+ "context = build_context(results)\n",
686
+ "print(f\"\\n✓ Built context from {len(results)} chunks:\")\n",
687
+ "print(f\" Length: {len(context)} characters\")\n",
688
+ "print(f\" Preview:\\n {context[:200]}...\")\n",
689
+ "\n",
690
+ "\n"
691
+ ]
692
+ },
693
+ {
694
+ "cell_type": "code",
695
+ "execution_count": 34,
696
+ "id": "2b841f01",
697
+ "metadata": {},
698
+ "outputs": [
699
+ {
700
+ "name": "stdout",
701
+ "output_type": "stream",
702
+ "text": [
703
+ "\n",
704
+ "============================================================\n",
705
+ "PART 6: Complete RAG Pipeline\n",
706
+ "============================================================\n",
707
+ "\n",
708
+ "[STEP 1] Chunking document...\n",
709
+ " → Split into 1 chunks\n",
710
+ "\n",
711
+ "[STEP 2] Creating embeddings...\n",
712
+ " → Created 1 embeddings\n",
713
+ "\n",
714
+ "[STEP 3] Embedding query...\n",
715
+ " → Query embedding ready\n",
716
+ "\n",
717
+ "[STEP 4] Searching similar chunks...\n",
718
+ " → Retrieved 1 chunks\n",
719
+ "\n",
720
+ "[STEP 5] Building prompt...\n",
721
+ " → Prompt ready (518 chars)\n",
722
+ "\n",
723
+ "============================================================\n",
724
+ "RESULT SUMMARY\n",
725
+ "============================================================\n",
726
+ "Query: How do transformers work?\n",
727
+ "Chunks created: 1\n",
728
+ "Chunks retrieved: 1\n",
729
+ "\n",
730
+ "Top retrieved chunks:\n",
731
+ " [6.46%] Machine learning is a subset of AI that learns from data. De...\n",
732
+ "\n",
733
+ "Final prompt (to be sent to LLM):\n",
734
+ "You are a helpful assistant. Answer the question based ONLY on the provided context.\n",
735
+ "If the context doesn't contain the answer, say so explicitly.\n",
736
+ "\n",
737
+ "Context:\n",
738
+ "[Chunk 1 - Relevance: 6.5%]\n",
739
+ "Machine learning is a subset of AI that learns from data. Deep learning uses neural networks with many layers. Transformers use attention mechanisms for NLP tasks. The attention mechanism allows models to focus on important parts. Large language models are trained on massive datasets.\n",
740
+ "\n",
741
+ "\n",
742
+ "\n",
743
+ "Question: How do transformers work?\n",
744
+ "\n",
745
+ "Answer:\n",
746
+ "\n",
747
+ "============================================================\n",
748
+ "✓ You now understand RAG!\n",
749
+ "============================================================\n",
750
+ "\n",
751
+ "Next steps:\n",
752
+ " 1. Replace simulate_embedding() with real Ollama calls\n",
753
+ " 2. Replace our Vector Store with Pinecone/Chroma\n",
754
+ " 3. Replace the final prompt with real Groq LLM calls\n",
755
+ " 4. Wrap in FastAPI for production\n",
756
+ "\n",
757
+ "But the LOGIC stays the same!\n",
758
+ "\n"
759
+ ]
760
+ }
761
+ ],
762
+ "source": [
763
+ "def build_prompt(context, query):\n",
764
+ " \"\"\"Build final prompt for LLM.\"\"\"\n",
765
+ " prompt = f\"\"\"You are a helpful assistant. Answer the question based ONLY on the provided context.\n",
766
+ "If the context doesn't contain the answer, say so explicitly.\n",
767
+ "\n",
768
+ "Context:\n",
769
+ "{context}\n",
770
+ "\n",
771
+ "Question: {query}\n",
772
+ "\n",
773
+ "Answer:\"\"\"\n",
774
+ " return prompt\n",
775
+ " \n",
776
+ "# ========== PART 6: PUTTING IT TOGETHER ==========\n",
777
+ "print(\"\\n\" + \"=\" * 60)\n",
778
+ "print(\"PART 6: Complete RAG Pipeline\")\n",
779
+ "print(\"=\" * 60)\n",
780
+ "\n",
781
+ "def rag_pipeline(query, document_text, chunk_size=500, overlap=50, top_k=3):\n",
782
+ " \"\"\"\n",
783
+ " Complete RAG pipeline:\n",
784
+ " 1. Chunk document\n",
785
+ " 2. Create embeddings\n",
786
+ " 3. Build vector store\n",
787
+ " 4. Retrieve relevant chunks\n",
788
+ " 5. Build prompt\n",
789
+ " 6. Return to user (in real system, send to LLM)\n",
790
+ " \"\"\"\n",
791
+ " \n",
792
+ " print(f\"\\n[STEP 1] Chunking document...\")\n",
793
+ " chunks = chunk_text(document_text, chunk_size, overlap)\n",
794
+ " print(f\" → Split into {len(chunks)} chunks\")\n",
795
+ " \n",
796
+ " print(f\"\\n[STEP 2] Creating embeddings...\")\n",
797
+ " store = SimpleVectorStore()\n",
798
+ " for chunk in chunks:\n",
799
+ " emb = simulate_embedding(chunk[\"text\"])\n",
800
+ " store.add(chunk[\"chunk_id\"], chunk[\"text\"], emb)\n",
801
+ " print(f\" → Created {len(chunks)} embeddings\")\n",
802
+ " \n",
803
+ " print(f\"\\n[STEP 3] Embedding query...\")\n",
804
+ " query_emb = simulate_embedding(query)\n",
805
+ " print(f\" → Query embedding ready\")\n",
806
+ " \n",
807
+ " print(f\"\\n[STEP 4] Searching similar chunks...\")\n",
808
+ " retrieved = store.search(query_emb, top_k)\n",
809
+ " print(f\" → Retrieved {len(retrieved)} chunks\")\n",
810
+ " \n",
811
+ " print(f\"\\n[STEP 5] Building prompt...\")\n",
812
+ " context = build_context(retrieved)\n",
813
+ " prompt = build_prompt(context, query)\n",
814
+ " print(f\" → Prompt ready ({len(prompt)} chars)\")\n",
815
+ " \n",
816
+ " return {\n",
817
+ " \"query\": query,\n",
818
+ " \"chunks_created\": len(chunks),\n",
819
+ " \"chunks_retrieved\": len(retrieved),\n",
820
+ " \"context\": context,\n",
821
+ " \"prompt\": prompt,\n",
822
+ " \"retrieved_chunks\": retrieved\n",
823
+ " }\n",
824
+ "\n",
825
+ "# Test full pipeline\n",
826
+ "doc = \"\"\"\n",
827
+ "Machine learning is a subset of AI that learns from data.\n",
828
+ "Deep learning uses neural networks with many layers.\n",
829
+ "Transformers use attention mechanisms for NLP tasks.\n",
830
+ "The attention mechanism allows models to focus on important parts.\n",
831
+ "Large language models are trained on massive datasets.\n",
832
+ "\"\"\"\n",
833
+ "\n",
834
+ "result = rag_pipeline(\"How do transformers work?\", doc, chunk_size=500, top_k=2)\n",
835
+ "\n",
836
+ "print(f\"\\n\" + \"=\" * 60)\n",
837
+ "print(\"RESULT SUMMARY\")\n",
838
+ "print(\"=\" * 60)\n",
839
+ "print(f\"Query: {result['query']}\")\n",
840
+ "print(f\"Chunks created: {result['chunks_created']}\")\n",
841
+ "print(f\"Chunks retrieved: {result['chunks_retrieved']}\")\n",
842
+ "print(f\"\\nTop retrieved chunks:\")\n",
843
+ "for chunk in result['retrieved_chunks']:\n",
844
+ " print(f\" [{chunk['similarity']:.2%}] {chunk['text'][:60]}...\")\n",
845
+ "print(f\"\\nFinal prompt (to be sent to LLM):\")\n",
846
+ "print(result['prompt'])\n",
847
+ "\n",
848
+ "print(\"\\n\" + \"=\" * 60)\n",
849
+ "print(\"✓ You now understand RAG!\")\n",
850
+ "print(\"=\" * 60)\n",
851
+ "print(\"\"\"\n",
852
+ "Next steps:\n",
853
+ " 1. Replace simulate_embedding() with real Ollama calls\n",
854
+ " 2. Replace our Vector Store with Pinecone/Chroma\n",
855
+ " 3. Replace the final prompt with real Groq LLM calls\n",
856
+ " 4. Wrap in FastAPI for production\n",
857
+ " \n",
858
+ "But the LOGIC stays the same!\n",
859
+ "\"\"\")"
860
+ ]
861
+ },
862
+ {
863
+ "cell_type": "code",
864
+ "execution_count": 38,
865
+ "id": "33af125c",
866
+ "metadata": {},
867
+ "outputs": [
868
+ {
869
+ "name": "stdout",
870
+ "output_type": "stream",
871
+ "text": [
872
+ "llama-3.1-8b-instant\n",
873
+ "groq/compound-mini\n",
874
+ "whisper-large-v3\n",
875
+ "moonshotai/kimi-k2-instruct-0905\n",
876
+ "openai/gpt-oss-20b\n",
877
+ "playai-tts-arabic\n",
878
+ "groq/compound\n",
879
+ "whisper-large-v3-turbo\n",
880
+ "openai/gpt-oss-120b\n",
881
+ "meta-llama/llama-prompt-guard-2-86m\n",
882
+ "meta-llama/llama-prompt-guard-2-22m\n",
883
+ "openai/gpt-oss-safeguard-20b\n",
884
+ "moonshotai/kimi-k2-instruct\n",
885
+ "qwen/qwen3-32b\n",
886
+ "meta-llama/llama-4-maverick-17b-128e-instruct\n",
887
+ "allam-2-7b\n",
888
+ "meta-llama/llama-guard-4-12b\n",
889
+ "playai-tts\n",
890
+ "meta-llama/llama-4-scout-17b-16e-instruct\n",
891
+ "llama-3.3-70b-versatile\n"
892
+ ]
893
+ }
894
+ ],
895
+ "source": [
896
+ "from groq import Groq\n",
897
+ "import os\n",
898
+ "\n",
899
+ "client = Groq(api_key=os.getenv(\"GROQ_API_KEY\"))\n",
900
+ "\n",
901
+ "models = client.models.list()\n",
902
+ "for m in models.data:\n",
903
+ " print(m.id)"
904
+ ]
905
+ },
906
+ {
907
+ "cell_type": "code",
908
+ "execution_count": 39,
909
+ "id": "704478dc",
910
+ "metadata": {},
911
+ "outputs": [
912
+ {
913
+ "name": "stdout",
914
+ "output_type": "stream",
915
+ "text": [
916
+ "\n",
917
+ "🤖 LLM ANSWER:\n",
918
+ "According to the provided context, transformers use attention mechanisms for NLP tasks.\n"
919
+ ]
920
+ }
921
+ ],
922
+ "source": [
923
+ "from dotenv import load_dotenv\n",
924
+ "import os\n",
925
+ "from groq import Groq\n",
926
+ "\n",
927
+ "# Load from .env file\n",
928
+ "load_dotenv()\n",
929
+ "\n",
930
+ "# Get API key safely\n",
931
+ "api_key = os.getenv(\"GROQ_API_KEY\")\n",
932
+ "\n",
933
+ "if not api_key:\n",
934
+ " raise ValueError(\"GROQ_API_KEY not found in environment or .env file\")\n",
935
+ "\n",
936
+ "def query_groq(context, query):\n",
937
+ " \"\"\"Get real answer from Groq LLM.\"\"\"\n",
938
+ " client = Groq(api_key=api_key)\n",
939
+ " \n",
940
+ " message = client.chat.completions.create(\n",
941
+ " model=\"llama-3.1-8b-instant\",\n",
942
+ " max_tokens=1024,\n",
943
+ " messages=[{\n",
944
+ " \"role\": \"user\",\n",
945
+ " \"content\": f\"\"\"Based on this context, answer the question.\n",
946
+ "Only use the context provided.\n",
947
+ "\n",
948
+ "Context:\n",
949
+ "{context}\n",
950
+ "\n",
951
+ "Question: {query}\n",
952
+ "\n",
953
+ "Answer:\"\"\"\n",
954
+ " }]\n",
955
+ " )\n",
956
+ " return message.choices[0].message.content\n",
957
+ "\n",
958
+ "# Test it\n",
959
+ "answer = query_groq(result[\"context\"], result[\"query\"])\n",
960
+ "print(f\"\\n🤖 LLM ANSWER:\\n{answer}\")"
961
+ ]
962
+ },
963
+ {
964
+ "cell_type": "code",
965
+ "execution_count": null,
966
+ "id": "8866fa43",
967
+ "metadata": {},
968
+ "outputs": [],
969
+ "source": []
970
+ }
971
+ ],
972
+ "metadata": {
973
+ "kernelspec": {
974
+ "display_name": ".venv",
975
+ "language": "python",
976
+ "name": "python3"
977
+ },
978
+ "language_info": {
979
+ "codemirror_mode": {
980
+ "name": "ipython",
981
+ "version": 3
982
+ },
983
+ "file_extension": ".py",
984
+ "mimetype": "text/x-python",
985
+ "name": "python",
986
+ "nbconvert_exporter": "python",
987
+ "pygments_lexer": "ipython3",
988
+ "version": "3.12.5"
989
+ }
990
+ },
991
+ "nbformat": 4,
992
+ "nbformat_minor": 5
993
+ }
notebooks/02_test_modules.ipynb ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "6cade155",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "d:\\projects\\doc-intelligence-rag\\.venv\\Scripts\\python.exe\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import sys\n",
19
+ "sys.path.append(\"../\")\n",
20
+ "print(sys.executable)"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 3,
26
+ "id": "8b183487",
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "name": "stdout",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "Loaded .env from: d:\\projects\\doc-intelligence-rag\\notebooks\\..\\src\\rag\\../..\\.env\n",
34
+ "✓ Chunker works: 6 chunks\n",
35
+ "Split into 6 chunks:\n",
36
+ " Chunk 0: 12 words | Machine Learning is a subset of artificial intelligence that\n",
37
+ " Chunk 1: 12 words | training models to make predictions or decisions based on da\n",
38
+ " Chunk 2: 12 words | It is a powerful tool for solving a wide range of problems,\n",
39
+ " Chunk 3: 12 words | of problems, from image recognition to natural language proc\n",
40
+ " Chunk 4: 12 words | this article, we will explore the basics of machine learning\n",
41
+ " Chunk 5: 10 words | and how it can be used to solve real-world problems.\n"
42
+ ]
43
+ }
44
+ ],
45
+ "source": [
46
+ "# Test chunker\n",
47
+ "from src.rag import chunk_text\n",
48
+ "\n",
49
+ "text = \"Machine Learning is a subset of artificial intelligence that involves training models to make predictions or decisions based on data. It is a powerful tool for solving a wide range of problems, from image recognition to natural language processing. In this article, we will explore the basics of machine learning and how it can be used to solve real-world problems.\"\n",
50
+ "chunks = chunk_text(text, chunk_size=12, overlap=2)\n",
51
+ "print(f\"✓ Chunker works: {len(chunks)} chunks\")\n",
52
+ "print(f\"Split into {len(chunks)} chunks:\")\n",
53
+ "for chunk in chunks:\n",
54
+ " print(f\" Chunk {chunk.chunk_id}: {chunk.word_count} words | {chunk.text[:60]}\")"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 4,
60
+ "id": "f4c201be",
61
+ "metadata": {},
62
+ "outputs": [
63
+ {
64
+ "name": "stderr",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "INFO:src.rag.embeddings:✓ Connected to Ollama at http://localhost:11434\n"
68
+ ]
69
+ },
70
+ {
71
+ "name": "stdout",
72
+ "output_type": "stream",
73
+ "text": [
74
+ "✓ Embeddings work: 768 dimensions\n"
75
+ ]
76
+ }
77
+ ],
78
+ "source": [
79
+ "# check embeddings\n",
80
+ "\n",
81
+ "from src.rag import OllamaEmbeddingClient\n",
82
+ "\n",
83
+ "client = OllamaEmbeddingClient()\n",
84
+ "embedding = client.embed(text)\n",
85
+ "print(f\"✓ Embeddings work: {len(embedding)} dimensions\")"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 5,
91
+ "id": "04144e58",
92
+ "metadata": {},
93
+ "outputs": [
94
+ {
95
+ "name": "stderr",
96
+ "output_type": "stream",
97
+ "text": [
98
+ "INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
99
+ "INFO:src.rag.vector_store:✓ Initialized Chroma vector store at .chromadb_test (collection: rag)\n",
100
+ "INFO:src.rag.vector_store:Cleared vector store\n"
101
+ ]
102
+ },
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "Chunk ID: chunk1, Similarity: 1.00, Text: ml\n"
108
+ ]
109
+ }
110
+ ],
111
+ "source": [
112
+ "# check vector store\n",
113
+ "\n",
114
+ "from src.rag import ChromaVectorStore\n",
115
+ "\n",
116
+ "store = ChromaVectorStore(persist_directory=\".chromadb_test\")\n",
117
+ "store.clear()\n",
118
+ "# Add chunks\n",
119
+ "store.add(\"chunk1\", \"ml\", embedding, metadata={\"source\": \"test\"})\n",
120
+ "\n",
121
+ "# Retrieve\n",
122
+ "results = store.retrieve(embedding, top_k=1)\n",
123
+ "for r in results:\n",
124
+ " print(f\"Chunk ID: {r.chunk_id}, Similarity: {r.similarity:.2f}, Text: {r.text}\")\n"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 6,
130
+ "id": "80ccd988",
131
+ "metadata": {},
132
+ "outputs": [
133
+ {
134
+ "name": "stderr",
135
+ "output_type": "stream",
136
+ "text": [
137
+ "INFO:src.rag.llm:Groq LLM client initialized with model: llama-3.1-8b-instant\n",
138
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
139
+ ]
140
+ },
141
+ {
142
+ "name": "stdout",
143
+ "output_type": "stream",
144
+ "text": [
145
+ "✓ LLM works: I am a helpful assistant....\n"
146
+ ]
147
+ }
148
+ ],
149
+ "source": [
150
+ "# test groq llm\n",
151
+ "from src.rag import GroqLLMClient\n",
152
+ "import os\n",
153
+ "from dotenv import load_dotenv\n",
154
+ "load_dotenv()\n",
155
+ "api_key = os.getenv(\"GROQ_API_KEY\")\n",
156
+ "\n",
157
+ "llm = GroqLLMClient(api_key=api_key)\n",
158
+ "answer = llm.query(\"Context: Hello\", \"Query: Who are you?\")\n",
159
+ "print(f\"✓ LLM works: {answer[:50]}...\")"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": 7,
165
+ "id": "6c95cec2",
166
+ "metadata": {},
167
+ "outputs": [
168
+ {
169
+ "name": "stderr",
170
+ "output_type": "stream",
171
+ "text": [
172
+ "INFO:src.rag.pipeline:Initializing RAG Pipeline...\n",
173
+ "INFO:src.rag.embeddings:✓ Connected to Ollama at http://localhost:11434\n",
174
+ "INFO:src.rag.pipeline:✓ Embeddings client ready\n",
175
+ "INFO:src.rag.llm:Groq LLM client initialized with model: llama-3.1-8b-instant\n",
176
+ "INFO:src.rag.pipeline:✓ LLM client ready\n",
177
+ "INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n",
178
+ "INFO:src.rag.vector_store:✓ Initialized Chroma vector store at .chromadb (collection: rag)\n",
179
+ "INFO:src.rag.pipeline:✓ Vector store ready\n",
180
+ "INFO:src.rag.pipeline:✓ RAG Pipeline initialized\n",
181
+ "INFO:src.rag.pipeline:Ingesting document: doc1\n",
182
+ "INFO:src.rag.pipeline:✓ Chunks created: 1\n",
183
+ "INFO:src.rag.pipeline:✓ Embedded 1/1 chunks\n",
184
+ "INFO:src.rag.pipeline:Querying: What will you explore?\n",
185
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
186
+ "INFO:src.rag.pipeline:Query complete: success\n"
187
+ ]
188
+ },
189
+ {
190
+ "name": "stdout",
191
+ "output_type": "stream",
192
+ "text": [
193
+ "✓ Pipeline works!\n",
194
+ "Answer: Based on the provided context, it's not explicitly stated what will be explored.\n"
195
+ ]
196
+ }
197
+ ],
198
+ "source": [
199
+ "#test full pipeline\n",
200
+ "from src.rag import RAGPipeline\n",
201
+ "\n",
202
+ "pipeline = RAGPipeline()\n",
203
+ "pipeline.ingest(\"doc1\", text)\n",
204
+ "result = pipeline.query(\"What will you explore?\")\n",
205
+ "print(f\"✓ Pipeline works!\")\n",
206
+ "print(f\"Answer: {result['answer']}\")"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": 6,
212
+ "id": "b6aa9c72",
213
+ "metadata": {},
214
+ "outputs": [
215
+ {
216
+ "name": "stderr",
217
+ "output_type": "stream",
218
+ "text": [
219
+ "INFO:src.rag.pipeline:Initializing RAG Pipeline...\n",
220
+ "INFO:src.rag.embeddings:✓ Connected to Ollama at http://localhost:11434\n",
221
+ "INFO:src.rag.pipeline:✓ Embeddings client ready\n",
222
+ "INFO:src.rag.llm:Groq LLM client initialized with model: llama-3.1-8b-instant\n",
223
+ "INFO:src.rag.pipeline:✓ LLM client ready\n",
224
+ "INFO:src.rag.vector_store:✓ Initialized Chroma vector store at .chromadb (collection: rag)\n",
225
+ "INFO:src.rag.pipeline:✓ Vector store ready\n",
226
+ "INFO:src.rag.pipeline:✓ RAG Pipeline initialized\n",
227
+ "INFO:src.rag.pdf_processor:Processing folder: d:\\projects\\doc-intelligence-rag\\papers\n",
228
+ "INFO:src.rag.pdf_processor:Found 3 PDF files\n",
229
+ "INFO:src.rag.pdf_processor:Processing PDF: CMBFSCNN.pdf\n"
230
+ ]
231
+ },
232
+ {
233
+ "name": "stdout",
234
+ "output_type": "stream",
235
+ "text": [
236
+ "[1] Ingesting all PDFs from 'papers' folder...\n"
237
+ ]
238
+ },
239
+ {
240
+ "name": "stderr",
241
+ "output_type": "stream",
242
+ "text": [
243
+ "INFO:src.rag.pdf_processor:✓ Extracted 26 pages, 90013 chars\n",
244
+ "INFO:src.rag.pdf_processor:Processing PDF: CVPR Version 16723_CMB_ML_A_Cosmic_Microwav.pdf\n",
245
+ "INFO:src.rag.pdf_processor:✓ Extracted 11 pages, 53068 chars\n",
246
+ "INFO:src.rag.pdf_processor:Processing PDF: Petroff20 - Cleaning CMB with ML.pdf\n",
247
+ "INFO:src.rag.pdf_processor:✓ Extracted 11 pages, 43264 chars\n",
248
+ "INFO:src.rag.pdf_processor:✓ Processed 3 PDFs successfully\n",
249
+ "INFO:src.rag.pipeline:Ingesting document: CMBFSCNN\n",
250
+ "INFO:src.rag.pipeline:✓ Chunks created: 33\n",
251
+ "INFO:src.rag.pipeline:✓ Embedded 33/33 chunks\n",
252
+ "INFO:src.rag.pipeline:Ingesting document: CVPR Version 16723_CMB_ML_A_Cosmic_Microwav\n",
253
+ "INFO:src.rag.pipeline:✓ Chunks created: 19\n",
254
+ "INFO:src.rag.pipeline:✓ Embedded 19/19 chunks\n",
255
+ "INFO:src.rag.pipeline:Ingesting document: Petroff20 - Cleaning CMB with ML\n",
256
+ "INFO:src.rag.pipeline:✓ Chunks created: 16\n",
257
+ "INFO:src.rag.pipeline:✓ Embedded 16/16 chunks\n",
258
+ "INFO:src.rag.pipeline:Querying: What are the main findings?\n"
259
+ ]
260
+ },
261
+ {
262
+ "name": "stdout",
263
+ "output_type": "stream",
264
+ "text": [
265
+ "\n",
266
+ "[2] Ingestion Summary:\n",
267
+ " CMBFSCNN: 33 chunks\n",
268
+ " CVPR Version 16723_CMB_ML_A_Cosmic_Microwav: 19 chunks\n",
269
+ " Petroff20 - Cleaning CMB with ML: 16 chunks\n",
270
+ " Total: 68 chunks\n",
271
+ "\n",
272
+ "[3] Querying...\n"
273
+ ]
274
+ },
275
+ {
276
+ "name": "stderr",
277
+ "output_type": "stream",
278
+ "text": [
279
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
280
+ "INFO:src.rag.pipeline:Query complete: success\n"
281
+ ]
282
+ },
283
+ {
284
+ "name": "stdout",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "\n",
288
+ "📝 Question: What are the main findings?\n",
289
+ "\n",
290
+ "🤖 Answer:\n",
291
+ "The provided context does not explicitly contain information about the main findings. It appears to be a collection of scientific texts discussing the Cosmic Microwave Background (CMB) and its analysis. The main topics covered include the CMB's power spectrum, contamination by foreground radiations, and the process of component separation.\n",
292
+ "\n",
293
+ "📚 Sources (3 chunks):\n",
294
+ " - [57.3%] in- frared background (CIB) is a different diffuse extragalactic source. 0 200 4...\n",
295
+ " - [57.1%] sion [1] and LiteBird [26], are expected to probe some of the 084 deepest myster...\n",
296
+ " - [57.0%] primordial B mode originates from the primordial gravitational waves predicted b...\n"
297
+ ]
298
+ }
299
+ ],
300
+ "source": [
301
+ "# test pdf processor\n",
302
+ "\n",
303
+ "from src.rag import RAGPipeline\n",
304
+ "import logging\n",
305
+ "import os\n",
306
+ "\n",
307
+ "project_root = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
308
+ "papers_path = os.path.join(project_root, \"papers\")\n",
309
+ "\n",
310
+ "logging.basicConfig(level=logging.INFO)\n",
311
+ "\n",
312
+ "# Initialize pipeline\n",
313
+ "pipeline = RAGPipeline()\n",
314
+ "\n",
315
+ "# Option A: Ingest a single PDF\n",
316
+ "# result = pipeline.ingest_pdf(\"papers/your_paper.pdf\")\n",
317
+ "# print(f\"✓ Ingested {result['chunks_embedded']} chunks\")\n",
318
+ "\n",
319
+ "# Option B: Ingest all PDFs from folder (RECOMMENDED)\n",
320
+ "if os.path.exists(papers_path):\n",
321
+ " print(\"[1] Ingesting all PDFs from 'papers' folder...\")\n",
322
+ " results = pipeline.ingest_folder(papers_path)\n",
323
+ " \n",
324
+ " print(f\"\\n[2] Ingestion Summary:\")\n",
325
+ " total_chunks = 0\n",
326
+ " for doc_id, result in results.items():\n",
327
+ " print(f\" {doc_id}: {result['chunks_embedded']} chunks\")\n",
328
+ " total_chunks += result['chunks_embedded']\n",
329
+ " print(f\" Total: {total_chunks} chunks\")\n",
330
+ " \n",
331
+ " # Now query!\n",
332
+ " print(f\"\\n[3] Querying...\")\n",
333
+ " query = \"What are the main findings?\" # Change this to your question\n",
334
+ " result = pipeline.query(query)\n",
335
+ " \n",
336
+ " print(f\"\\n📝 Question: {result['query']}\")\n",
337
+ " print(f\"\\n🤖 Answer:\\n{result['answer']}\")\n",
338
+ " print(f\"\\n📚 Sources ({result['chunks_used']} chunks):\")\n",
339
+ " for source in result['sources']:\n",
340
+ " print(f\" - [{source['similarity']:.1%}] {source['preview'][:80]}...\")\n",
341
+ "else:\n",
342
+ " print(\"❌ No 'papers' folder found. Create one and add PDFs!\")"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 7,
348
+ "id": "441624d7",
349
+ "metadata": {},
350
+ "outputs": [
351
+ {
352
+ "name": "stderr",
353
+ "output_type": "stream",
354
+ "text": [
355
+ "INFO:src.rag.pipeline:Querying: What is the Cosmic Microwave Background?\n"
356
+ ]
357
+ },
358
+ {
359
+ "name": "stdout",
360
+ "output_type": "stream",
361
+ "text": [
362
+ "\n",
363
+ "============================================================\n",
364
+ "Q: What is the Cosmic Microwave Background?\n",
365
+ "============================================================\n"
366
+ ]
367
+ },
368
+ {
369
+ "name": "stderr",
370
+ "output_type": "stream",
371
+ "text": [
372
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
373
+ "INFO:src.rag.pipeline:Query complete: success\n",
374
+ "INFO:src.rag.pipeline:Querying: How is machine learning used to analyze CMB data?\n"
375
+ ]
376
+ },
377
+ {
378
+ "name": "stdout",
379
+ "output_type": "stream",
380
+ "text": [
381
+ "\n",
382
+ "A: The Cosmic Microwave Background (CMB) has characteristic modes which are visible as consistently sized lumps and has a first peak at ℓ≈200, which corresponds roughly to 1 deg, about the size of the largest lumps visible in the CMB.\n",
383
+ "\n",
384
+ "Sources (3 chunks):\n",
385
+ " [60.1%] Machine learning is AI. Deep learning uses networks....\n",
386
+ " [46.4%] in- frared background (CIB) is a different diffuse extragala...\n",
387
+ "\n",
388
+ "============================================================\n",
389
+ "Q: How is machine learning used to analyze CMB data?\n",
390
+ "============================================================\n"
391
+ ]
392
+ },
393
+ {
394
+ "name": "stderr",
395
+ "output_type": "stream",
396
+ "text": [
397
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
398
+ "INFO:src.rag.pipeline:Query complete: success\n",
399
+ "INFO:src.rag.pipeline:Querying: What are foreground contaminations in CMB?\n"
400
+ ]
401
+ },
402
+ {
403
+ "name": "stdout",
404
+ "output_type": "stream",
405
+ "text": [
406
+ "\n",
407
+ "A: Based on the provided context, it appears that machine learning is mentioned as relevant to the analysis of CMB data in Chunk 1, but no specific information on how it is used is provided. However, in Chunk 2, it is mentioned that CMB-ML stands out distinctly because it includes Monte-Carlo simulations, which are not included in other software such as PySM3 and PSM.\n",
408
+ "\n",
409
+ "In Chunk 3, there are several references to machine learning and its applications in cosmology, including the use of deep learning for deblurring and the use of convolutional neural networks for component separation in CMB analysis. Specifically, reference [57] discusses the use of machine learning for foreground cleaning in CMB data, and reference [58] presents a method for CMB component separation using convolutional neural networks.\n",
410
+ "\n",
411
+ "Therefore, while the provided context does not provide a comprehensive answer, it suggests that machine learning is used in various ways to analyze CMB data, including simulation, deblurring, and component separation.\n",
412
+ "\n",
413
+ "Sources (3 chunks):\n",
414
+ " [62.6%] Machine learning is AI. Deep learning uses networks....\n",
415
+ " [59.9%] experiment [17, 60]. 131 Many other examples exist, but all ...\n",
416
+ "\n",
417
+ "============================================================\n",
418
+ "Q: What are foreground contaminations in CMB?\n",
419
+ "============================================================\n"
420
+ ]
421
+ },
422
+ {
423
+ "name": "stderr",
424
+ "output_type": "stream",
425
+ "text": [
426
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
427
+ "INFO:src.rag.pipeline:Query complete: success\n",
428
+ "INFO:src.rag.pipeline:Querying: What component separation methods are discussed?\n"
429
+ ]
430
+ },
431
+ {
432
+ "name": "stdout",
433
+ "output_type": "stream",
434
+ "text": [
435
+ "\n",
436
+ "A: Foreground contaminations in CMB refer to the various sources of radiation that contaminate the Cosmic Microwave Background (CMB) signal, making it difficult to accurately separate the primordial CMB signal from the foregrounds. These contaminants include Galactic polarized radiation, which tends to be brighter than the primordial B-mode signal, as well as other astrophysical sources such as point sources, diffuse sources, and extragalactic signals.\n",
437
+ "\n",
438
+ "Sources (3 chunks):\n",
439
+ " [63.4%] in- frared background (CIB) is a different diffuse extragala...\n",
440
+ " [59.7%] primordial B mode originates from the primordial gravitation...\n",
441
+ "\n",
442
+ "============================================================\n",
443
+ "Q: What component separation methods are discussed?\n",
444
+ "============================================================\n"
445
+ ]
446
+ },
447
+ {
448
+ "name": "stderr",
449
+ "output_type": "stream",
450
+ "text": [
451
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 429 Too Many Requests\"\n",
452
+ "INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 25.000000 seconds\n",
453
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
454
+ "INFO:src.rag.pipeline:Query complete: success\n",
455
+ "INFO:src.rag.pipeline:Querying: What deep learning architectures are used?\n"
456
+ ]
457
+ },
458
+ {
459
+ "name": "stdout",
460
+ "output_type": "stream",
461
+ "text": [
462
+ "\n",
463
+ "A: The following component separation methods are discussed:\n",
464
+ "\n",
465
+ "1. Internal Linear Combination (ILC)\n",
466
+ "2. Needlet (NILC)\n",
467
+ "3. Scale Discretized (SILC)\n",
468
+ "4. Hierarchical Morphological Component Analysis (HGMCA)\n",
469
+ "5. Convolutional Neural Network (CNN)-based methods (CMBFSCNN is specifically mentioned)\n",
470
+ "\n",
471
+ "Sources (3 chunks):\n",
472
+ " [55.4%] 497 [4] Yashar Akrami, M Ashdown, Jonathan Aumont, Carlo Bac...\n",
473
+ " [55.0%] in several next generation CMB experiments, such as the CMB-...\n",
474
+ "\n",
475
+ "============================================================\n",
476
+ "Q: What deep learning architectures are used?\n",
477
+ "============================================================\n"
478
+ ]
479
+ },
480
+ {
481
+ "name": "stderr",
482
+ "output_type": "stream",
483
+ "text": [
484
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 429 Too Many Requests\"\n",
485
+ "INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 23.000000 seconds\n",
486
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
487
+ "INFO:src.rag.pipeline:Query complete: success\n",
488
+ "INFO:src.rag.pipeline:Querying: What datasets are mentioned?\n"
489
+ ]
490
+ },
491
+ {
492
+ "name": "stdout",
493
+ "output_type": "stream",
494
+ "text": [
495
+ "\n",
496
+ "A: Based on the provided context, the following deep learning architectures are mentioned:\n",
497
+ "\n",
498
+ "1. Convolutional Neural Network (CNN) - mentioned in several references, including [51], [56], and [61].\n",
499
+ "2. Multi-scale Convolutional Neural Network - mentioned in [53].\n",
500
+ "3. Spherical Convolutional Neural Network - mentioned in [56] (DeepSphere).\n",
501
+ "4. U-Net - mentioned in [61].\n",
502
+ "\n",
503
+ "Note that PyTorch is also mentioned, but it is a deep learning library rather than a specific architecture.\n",
504
+ "\n",
505
+ "Sources (3 chunks):\n",
506
+ " [63.9%] Machine learning is AI. Deep learning uses networks....\n",
507
+ " [63.3%] Advances in Neural Information Processing Systems 30, ed. I....\n",
508
+ "\n",
509
+ "============================================================\n",
510
+ "Q: What datasets are mentioned?\n",
511
+ "============================================================\n"
512
+ ]
513
+ },
514
+ {
515
+ "name": "stderr",
516
+ "output_type": "stream",
517
+ "text": [
518
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 429 Too Many Requests\"\n",
519
+ "INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 14.000000 seconds\n",
520
+ "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
521
+ "INFO:src.rag.pipeline:Query complete: success\n"
522
+ ]
523
+ },
524
+ {
525
+ "name": "stdout",
526
+ "output_type": "stream",
527
+ "text": [
528
+ "\n",
529
+ "A: Based on the provided context, the following datasets are mentioned:\n",
530
+ "\n",
531
+ "1. The CMB-ML dataset, which is a unified framework for dataset generation and model evaluation.\n",
532
+ "2. The CMB-S4 science mission dataset.\n",
533
+ "3. The LiteBird dataset.\n",
534
+ "4. The COBE (Cosmic Background Explorer) dataset.\n",
535
+ "5. The Planck Mission dataset.\n",
536
+ "6. The Cosmic Infrared Background (CIB) dataset.\n",
537
+ "\n",
538
+ "Sources (3 chunks):\n",
539
+ " [62.0%] REVIEW COPY. DO NOT DISTRIBUTE. physics methods, each of the...\n",
540
+ " [58.2%] in- frared background (CIB) is a different diffuse extragala...\n"
541
+ ]
542
+ }
543
+ ],
544
+ "source": [
545
+ "queries = [\n",
546
+ " \"What is the Cosmic Microwave Background?\",\n",
547
+ " \"How is machine learning used to analyze CMB data?\",\n",
548
+ " \"What are foreground contaminations in CMB?\",\n",
549
+ " \"What component separation methods are discussed?\",\n",
550
+ " \"What deep learning architectures are used?\",\n",
551
+ " \"What datasets are mentioned?\",\n",
552
+ "]\n",
553
+ "\n",
554
+ "for query in queries:\n",
555
+ " print(f\"\\n{'='*60}\")\n",
556
+ " print(f\"Q: {query}\")\n",
557
+ " print('='*60)\n",
558
+ " \n",
559
+ " result = pipeline.query(query)\n",
560
+ " print(f\"\\nA: {result['answer']}\")\n",
561
+ " print(f\"\\nSources ({result['chunks_used']} chunks):\")\n",
562
+ " for source in result['sources'][:2]: # Show top 2\n",
563
+ " print(f\" [{source['similarity']:.1%}] {source['preview'][:60]}...\")"
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "code",
568
+ "execution_count": null,
569
+ "id": "76a5da4b",
570
+ "metadata": {},
571
+ "outputs": [],
572
+ "source": []
573
+ }
574
+ ],
575
+ "metadata": {
576
+ "kernelspec": {
577
+ "display_name": ".venv",
578
+ "language": "python",
579
+ "name": "python3"
580
+ },
581
+ "language_info": {
582
+ "codemirror_mode": {
583
+ "name": "ipython",
584
+ "version": 3
585
+ },
586
+ "file_extension": ".py",
587
+ "mimetype": "text/x-python",
588
+ "name": "python",
589
+ "nbconvert_exporter": "python",
590
+ "pygments_lexer": "ipython3",
591
+ "version": "3.12.5"
592
+ }
593
+ },
594
+ "nbformat": 4,
595
+ "nbformat_minor": 5
596
+ }
pyproject-local.toml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "doc-intelligence-rag"
3
+ version = "0.1.0"
4
+ description = "Document intelligence RAG API (FastAPI, ChromaDB, Groq)"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "chromadb>=1.3.7",
9
+ "fastapi>=0.124.4",
10
+ "groq>=0.37.1",
11
+ "jupyter>=1.1.1",
12
+ "numpy>=2.3.5",
13
+ "ollama>=0.6.1",
14
+ "pandas>=2.3.3",
15
+ "pdfplumber>=0.11.8",
16
+ "pydantic>=2.12.5",
17
+ "pypdf2>=3.0.1",
18
+ "python-dotenv>=1.2.1",
19
+ "python-multipart>=0.0.20",
20
+ "requests>=2.32.5",
21
+ "sentence-transformers>=5.2.0",
22
+ "uvicorn[standard]>=0.38.0",
23
+ ]
pyproject.toml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "doc-intelligence-rag"
3
+ version = "0.1.0"
4
+ description = "Document intelligence RAG API (FastAPI, ChromaDB, Groq)"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "chromadb[cpu]>=1.3.7",
9
+ "fastapi>=0.124.4",
10
+ "groq>=0.37.1",
11
+ "numpy>=2.3.5",
12
+ "pandas>=2.3.3",
13
+ "pdfplumber>=0.11.8",
14
+ "pydantic>=2.12.5",
15
+ "pypdf2>=3.0.1",
16
+ "python-dotenv>=1.2.1",
17
+ "python-multipart>=0.0.20",
18
+ "requests>=2.32.5",
19
+ "sentence-transformers>=5.2.0",
20
+ "uvicorn[standard]>=0.38.0"
21
+ ]
requirements-railway.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.124.4
2
+ uvicorn[standard]>=0.38.0
3
+ pydantic>=2.12.5
4
+ python-multipart>=0.0.20
5
+ python-dotenv>=1.2.1
6
+ requests>=2.32.5
7
+ chromadb[cpu]>=1.3.7
8
+ groq>=0.37.1
9
+ pdfplumber>=0.11.8
10
+ pypdf2>=3.0.1
11
+ numpy>=2.3.5
12
+ pandas>=2.3.3
13
+ # sentence-transformers is pinned here; its CPU-only torch dependency is installed separately in the Dockerfile
14
+ sentence-transformers>=5.2.0
src/main.py ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from fastapi.responses import JSONResponse, FileResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from fastapi.staticfiles import StaticFiles
5
+ from pydantic import BaseModel
6
+ import logging
7
+ import os
8
+ from typing import List, Optional
9
+ from datetime import datetime
10
+ import tempfile
11
+ from pathlib import Path
12
+
13
+ from src.rag import RAGPipeline, RAGConfig
14
+
15
# ==================== Setup ====================

# Configure root logging once for the whole process; module loggers
# (src.rag.*) inherit this format and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize FastAPI app (interactive docs at /docs, ReDoc at /redoc)
app = FastAPI(
    title="Document Intelligence RAG",
    description="RAG system for analyzing documents with LLM",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Add CORS middleware.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers for credentialed requests — confirm whether cookies /
# credentials are actually required here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve frontend static files, but only when a local "frontend" directory
# exists (it may be absent in API-only deployments).
if os.path.exists("frontend"):
    app.mount("/static", StaticFiles(directory="frontend"), name="static")

# Global pipeline instance; populated by the startup handler below and
# checked for None by every endpoint.
pipeline: Optional[RAGPipeline] = None
48
+
49
+
50
+ # ==================== Pydantic Models ====================
51
+
52
class QueryRequest(BaseModel):
    """Request body for query endpoint."""
    query: str      # natural-language question to answer from ingested docs
    top_k: int = 3  # number of chunks to retrieve as context
56
+
57
+
58
class QueryResponse(BaseModel):
    """Response for query."""
    query: str            # the question as received
    answer: str           # LLM-generated answer
    sources: List[dict]   # retrieved chunks (similarity, preview, ...)
    chunks_used: int      # how many chunks were passed to the LLM
    response_time: float  # end-to-end latency in seconds (rounded)
    status: str           # pipeline status string, e.g. "success"
66
+
67
+
68
class IngestResponse(BaseModel):
    """Response for ingestion."""
    doc_id: str           # identifier assigned by the pipeline
    filename: str         # original uploaded filename
    chunks_created: int   # chunks produced by the text splitter
    chunks_embedded: int  # chunks successfully embedded and stored
    status: str           # pipeline status string
    timestamp: str        # ISO-8601 server time of the response
76
+
77
+
78
class IngestFolderResponse(BaseModel):
    """Response for folder ingestion."""
    total_documents: int   # number of PDFs ingested
    total_chunks: int      # sum of embedded chunks across all documents
    documents: List[dict]  # per-document {"doc_id", "chunks"} summaries
    timestamp: str         # ISO-8601 server time of the response
84
+
85
+
86
class HealthResponse(BaseModel):
    """Response for health check."""
    status: str             # "healthy" or "degraded"
    embedding_backend: str  # configured embeddings backend name
    groq: str               # "✓"/"✗" marker for the LLM client
    chroma: dict            # {"status": "✓"/"✗", "chunks": <count>}
    timestamp: str          # ISO-8601 server time of the response
93
+
94
+
95
class StatsResponse(BaseModel):
    """Response for stats."""
    total_chunks: int  # chunks currently held in the vector store
    config: dict       # pipeline configuration snapshot
    timestamp: str     # ISO-8601 server time of the response
100
+
101
+
102
+ # ==================== Startup/Shutdown ====================
103
+
104
@app.on_event("startup")
async def startup_event():
    """Create the module-level RAGPipeline once when the server boots.

    Leaves ``pipeline`` set on success; re-raises on failure so the server
    refuses to start with a broken pipeline.

    NOTE(review): ``@app.on_event`` is deprecated in recent FastAPI releases
    in favor of lifespan handlers — consider migrating.
    """
    global pipeline

    banner = "=" * 60
    logger.info(banner)
    logger.info("Starting Document Intelligence RAG API")
    logger.info(banner)

    try:
        # RAGConfig reads EMBEDDING_BACKEND from the environment.
        cfg = RAGConfig(chunk_size=500, chunk_overlap=50, top_k=3)
        # The pipeline wires up embeddings, LLM and vector store internally.
        pipeline = RAGPipeline(config=cfg)
        logger.info("✓ Pipeline initialized successfully")
        logger.info(f"✓ Embedding backend: {cfg.embedding_backend}")
        logger.info("✓ API ready at http://localhost:8000")
        logger.info("✓ Interactive docs at http://localhost:8000/docs")
    except Exception as exc:
        logger.error(f"Failed to initialize pipeline: {exc}")
        raise
132
+
133
+
134
@app.on_event("shutdown")
async def shutdown_event():
    """Log a final message as the API process shuts down."""
    logger.info("Shutting down Document Intelligence RAG API")
138
+
139
+
140
+ # ==================== Health & Status ====================
141
+
142
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """
    Check system health.

    Returns:
        Health status of all components (embeddings, LLM, vector store).

    Raises:
        HTTPException: 503 before startup finishes, 500 if a component
        probe itself fails (e.g. the Chroma collection is unreachable).
    """
    if not pipeline:
        raise HTTPException(status_code=503, detail="Pipeline not initialized")

    try:
        # Real presence checks (the original compared size() >= 0, which is
        # always true, and called size() twice per request).
        embeddings_ready = pipeline.embeddings is not None
        llm_ready = pipeline.llm is not None
        # A successful size() call is itself the Chroma liveness probe;
        # failures fall through to the except below.
        chunk_count = pipeline.vector_store.size()

        return HealthResponse(
            status="healthy" if (embeddings_ready and llm_ready) else "degraded",
            embedding_backend=pipeline.config.embedding_backend,
            groq="✓" if llm_ready else "✗",
            chroma={
                "status": "✓",
                "chunks": chunk_count
            },
            timestamp=datetime.now().isoformat()
        )

    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
173
+
174
+
175
@app.get("/stats", response_model=StatsResponse)
async def get_stats():
    """
    Get pipeline statistics.

    Returns:
        Current totals and configuration as reported by the pipeline.

    Raises:
        HTTPException: 503 before startup, 500 on any retrieval failure.
    """
    if not pipeline:
        raise HTTPException(status_code=503, detail="Pipeline not initialized")

    try:
        snapshot = pipeline.get_stats()
        return StatsResponse(
            total_chunks=snapshot['total_chunks'],
            config=snapshot['config'],
            timestamp=datetime.now().isoformat(),
        )
    except Exception as e:
        logger.error(f"Stats retrieval failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
198
+
199
+
200
+ # ==================== Ingestion Endpoints ====================
201
+
202
@app.post("/ingest", response_model=IngestResponse)
async def ingest_pdf(file: UploadFile = File(...)):
    """
    Upload and ingest a single PDF file.

    Args:
        file: PDF file to upload (any case variant of the .pdf extension).

    Returns:
        Ingestion result with doc_id and chunk count.

    Raises:
        HTTPException: 503 before startup, 400 for non-PDF uploads,
        500 when ingestion itself fails.

    Example:
        curl -X POST "http://localhost:8000/ingest" \
            -F "file=@research_paper.pdf"
    """
    if not pipeline:
        raise HTTPException(status_code=503, detail="Pipeline not initialized")

    # Guard against a missing filename (possible with raw multipart clients)
    # and accept ".PDF"/".Pdf" etc. — the original check was case-sensitive
    # and crashed with AttributeError when filename was None.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    tmp_path = None
    try:
        # Save uploaded file to a temp location the PDF processor can read.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            contents = await file.read()
            tmp_file.write(contents)
            tmp_path = tmp_file.name

        logger.info(f"Processing uploaded PDF: {file.filename}")

        result = pipeline.ingest_pdf(tmp_path)

        return IngestResponse(
            doc_id=result['doc_id'],
            filename=file.filename,
            chunks_created=result['chunks_created'],
            chunks_embedded=result['chunks_embedded'],
            status=result['status'],
            timestamp=datetime.now().isoformat()
        )

    except Exception as e:
        logger.error(f"PDF ingestion failed: {e}")
        raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}")
    finally:
        # Always remove the temp file — the original only cleaned up on the
        # success path, leaking a file per failed ingestion.
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)
250
+
251
+
252
@app.post("/ingest-folder", response_model=IngestFolderResponse)
async def ingest_folder(folder_path: str):
    """
    Ingest every PDF found in a folder on the server's filesystem.

    Args:
        folder_path: Path to the folder containing PDFs (query parameter).

    Returns:
        Summary of all ingested documents and total chunk count.

    Raises:
        HTTPException: 503 before startup, 400 for a missing folder or a
        folder with no PDFs, 500 for any other ingestion failure.

    Example:
        curl -X POST "http://localhost:8000/ingest-folder?folder_path=./papers"
    """
    if not pipeline:
        raise HTTPException(status_code=503, detail="Pipeline not initialized")

    try:
        # Validate the path before handing it to the pipeline.
        if not os.path.exists(folder_path):
            raise HTTPException(status_code=400, detail=f"Folder not found: {folder_path}")

        logger.info(f"Ingesting folder: {folder_path}")

        ingestion = pipeline.ingest_folder(folder_path)
        if not ingestion:
            raise HTTPException(status_code=400, detail="No PDFs found in folder")

        # Per-document summaries, then aggregate the chunk total from them.
        doc_summaries = [
            {"doc_id": name, "chunks": info['chunks_embedded']}
            for name, info in ingestion.items()
        ]
        chunk_total = sum(entry["chunks"] for entry in doc_summaries)

        return IngestFolderResponse(
            total_documents=len(ingestion),
            total_chunks=chunk_total,
            documents=doc_summaries,
            timestamp=datetime.now().isoformat(),
        )

    except HTTPException:
        # Let our own 400s through untouched.
        raise
    except Exception as e:
        logger.error(f"Folder ingestion failed: {e}")
        raise HTTPException(status_code=500, detail=f"Ingestion failed: {str(e)}")
306
+
307
+
308
+ # ==================== Query Endpoint ====================
309
+
310
@app.post("/query", response_model=QueryResponse)
async def query(request: QueryRequest):
    """
    Answer a question against the ingested documents.

    Args:
        request: QueryRequest carrying 'query' and optional 'top_k'.

    Returns:
        QueryResponse with the answer, its sources, and timing metadata.

    Raises:
        HTTPException 503: pipeline not initialized.
        HTTPException 400: nothing has been ingested yet.
        HTTPException 500: the pipeline raised while answering.

    Example:
        curl -X POST "http://localhost:8000/query" \
             -H "Content-Type: application/json" \
             -d '{"query": "What is machine learning?", "top_k": 3}'
    """
    if not pipeline:
        raise HTTPException(status_code=503, detail="Pipeline not initialized")

    if pipeline.vector_store.size() == 0:
        raise HTTPException(
            status_code=400,
            detail="No documents ingested yet. Use /ingest endpoint first."
        )

    try:
        import time

        started_at = time.time()
        logger.info(f"Query: {request.query}")

        # Run retrieval + generation through the pipeline.
        answer_payload = pipeline.query(request.query, return_sources=True)
        elapsed = time.time() - started_at

        return QueryResponse(
            query=answer_payload['query'],
            answer=answer_payload['answer'],
            sources=answer_payload['sources'],
            chunks_used=answer_payload['chunks_used'],
            response_time=round(elapsed, 3),
            status=answer_payload['status']
        )

    except Exception as e:
        logger.error(f"Query failed: {e}")
        raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
358
+
359
+
360
+ # ==================== Document Management ====================
361
+
362
@app.get("/documents")
async def list_documents():
    """
    Report how many chunks are stored and whether the system is queryable.

    Returns:
        Dict with total_chunks, a ready/empty status, and a timestamp.

    Raises:
        HTTPException 503: pipeline not initialized.
        HTTPException 500: the vector store could not be inspected.
    """
    if not pipeline:
        raise HTTPException(status_code=503, detail="Pipeline not initialized")

    try:
        chunk_count = pipeline.vector_store.size()
        state = "ready" if chunk_count > 0 else "empty"

        return {
            "total_chunks": chunk_count,
            "status": state,
            "timestamp": datetime.now().isoformat(),
        }

    except Exception as e:
        logger.error(f"Failed to list documents: {e}")
        raise HTTPException(status_code=500, detail=str(e))
385
+
386
+
387
@app.delete("/documents/{doc_id}")
async def delete_document(doc_id: str):
    """
    Acknowledge a deletion request for a document.

    NOTE: chunks are not actually removed yet — this endpoint only logs the
    request and reports it as queued; a production implementation would
    track each document's chunks and delete them.

    Args:
        doc_id: Document ID to delete.

    Returns:
        Dict describing the (queued) deletion.
    """
    if not pipeline:
        raise HTTPException(status_code=503, detail="Pipeline not initialized")

    try:
        logger.info(f"Deleting document: {doc_id}")

        return {
            "status": "success",
            "doc_id": doc_id,
            "message": "Document deletion queued",
            "timestamp": datetime.now().isoformat(),
        }

    except Exception as e:
        logger.error(f"Failed to delete document: {e}")
        raise HTTPException(status_code=500, detail=str(e))
416
+
417
+
418
@app.post("/reset")
async def reset_system():
    """
    Clear every stored document and embedding.

    WARNING: destructive — all stored embeddings are deleted.

    Returns:
        Dict confirming the reset.
    """
    global pipeline

    if not pipeline:
        raise HTTPException(status_code=503, detail="Pipeline not initialized")

    try:
        logger.warning("RESET: Clearing all documents and embeddings")

        # Drop everything held by the vector store.
        pipeline.vector_store.clear()

        logger.info("✓ System reset complete")

        return {
            "status": "success",
            "message": "All documents and embeddings cleared",
            "chunks_remaining": 0,
            "timestamp": datetime.now().isoformat(),
        }

    except Exception as e:
        logger.error(f"Reset failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
451
+
452
+
453
+ # ==================== Error Handlers ====================
454
+
455
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):
    """Render HTTPExceptions as the API's uniform JSON error envelope."""
    body = {
        "error": exc.detail,
        "status": "error",
        "timestamp": datetime.now().isoformat(),
    }
    return JSONResponse(status_code=exc.status_code, content=body)
466
+
467
+
468
@app.exception_handler(Exception)
async def general_exception_handler(request, exc):
    """Catch-all handler: log the error, return an opaque 500 envelope."""
    logger.error(f"Unhandled exception: {exc}")
    payload = {
        "error": "Internal server error",
        "status": "error",
        "timestamp": datetime.now().isoformat(),
    }
    return JSONResponse(status_code=500, content=payload)
480
+
481
+
482
+ # ==================== Root Endpoint ====================
483
+
484
@app.get("/", response_class=FileResponse)
async def root():
    """
    Root endpoint — serve the web UI when bundled, else return API info.

    Fix: the no-frontend fallback is now wrapped in an explicit
    JSONResponse. This route declares ``response_class=FileResponse``, so
    returning a bare dict would make FastAPI try to build a FileResponse
    from it and fail; returning a Response instance bypasses
    ``response_class`` and serves the JSON correctly.
    """
    frontend_path = "frontend/index.html"
    if os.path.exists(frontend_path):
        return FileResponse(frontend_path)

    # If no frontend, return API info
    return JSONResponse(content={
        "name": "Document Intelligence RAG",
        "version": "1.0.0",
        "description": "RAG system for analyzing documents with LLM",
        "docs": "http://localhost:8000/docs",
        "health": "http://localhost:8000/health",
        "embedding_backend": pipeline.config.embedding_backend if pipeline else "initializing",
        "timestamp": datetime.now().isoformat()
    })
501
+
502
# Local development entry point; production deployments should launch
# uvicorn directly (e.g. `uvicorn app:app --host 0.0.0.0 --port 8000`).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
src/rag/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
RAG Package
===========

Modular Retrieval-Augmented Generation system with Ollama + Groq
"""

from .chunker import chunk_text, Chunk, chunk_documents
from .embeddings import OllamaEmbeddingClient, cosine_similarity
from .vector_store import ChromaVectorStore, RetrievalResult
from .llm import GroqLLMClient, build_context_string
from .pdf_processor import PDFProcessor
from .pipeline import RAGPipeline, RAGConfig

# Public API of the package.
# Fix: removed "SimpleVectorStore" — it was listed here but never imported
# above, so `from rag import *` raised AttributeError on that name.
__all__ = [
    # Chunking
    "chunk_text",
    "Chunk",
    "chunk_documents",
    # Embeddings
    "OllamaEmbeddingClient",
    "cosine_similarity",
    # Vector Store
    "ChromaVectorStore",
    "RetrievalResult",
    # LLM
    "GroqLLMClient",
    "build_context_string",
    # PDF Processing
    "PDFProcessor",
    # Pipeline
    "RAGPipeline",
    "RAGConfig",
]

__version__ = "0.1.0"
src/rag/chunker.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chunker module
3
+ --------------
4
+ Purpose: Split text into smaller chunks.
5
+ """
6
+
7
+ from typing import List, Dict
8
+ from dataclasses import dataclass
9
+
10
@dataclass
class Chunk:
    # A contiguous window of words cut from a source document.
    text: str        # the chunk's content, words re-joined with single spaces
    chunk_id: int    # sequential id within the source text, starting at 0
    start_idx: int   # word offset of the chunk's first word in the source
    word_count: int  # number of words in this chunk (<= chunk_size)


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[Chunk]:
    """
    Split text into overlapping word-window chunks.

    Args:
        text: The text to split into chunks.
        chunk_size: Number of words per chunk (the last chunk may be shorter).
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of Chunk objects covering the whole text in order.

    Raises:
        ValueError: if chunk_size <= 0, or overlap >= chunk_size.
            Fix: previously overlap == chunk_size raised an opaque
            ``range() arg 3 must not be zero`` and overlap > chunk_size
            silently returned [] (all text dropped); both are now rejected
            explicitly up front.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    if not words:
        return []

    # Step between chunk starts; positive by the guard above.
    stride = chunk_size - overlap

    chunks = []
    next_id = 0
    for start in range(0, len(words), stride):
        window = words[start:start + chunk_size]
        joined = ' '.join(window)

        # Skip windows that would produce an empty chunk.
        if not joined.strip():
            continue

        chunks.append(Chunk(
            text=joined,
            chunk_id=next_id,
            start_idx=start,
            word_count=len(window)
        ))
        next_id += 1

    return chunks
56
+
57
+
58
def chunk_documents(
    documents: Dict[str, str],
    chunk_size: int = 500,
    overlap: int = 50
) -> Dict[str, List[Chunk]]:
    """
    Chunk a collection of documents, one entry per document.

    Args:
        documents: Dict of {doc_id: text}
        chunk_size: Words per chunk
        overlap: Word overlap between consecutive chunks

    Returns:
        Dict of {doc_id: [chunks]}

    Example:
        >>> docs = {"doc1": "Text 1", "doc2": "Text 2"}
        >>> chunked = chunk_documents(docs)
        >>> "doc1" in chunked
        True
    """
    # Delegate the actual splitting to chunk_text, keyed by document id.
    return {
        doc_id: chunk_text(body, chunk_size, overlap)
        for doc_id, body in documents.items()
    }
87
+
88
# Demo/smoke test: chunk a short paragraph with a small window so that
# several overlapping chunks are produced and printed.
if __name__ == "__main__":
    text = """
    Machine Learning is a subset of artificial intelligence that involves training models to make predictions or decisions based on data. It is a powerful tool for solving a wide range of problems, from image recognition to natural language processing. In this article, we will explore the basics of machine learning and how it can be used to solve real-world problems.
    """

    chunks = chunk_text(text, chunk_size=50, overlap=10)
    print(f"Split into {len(chunks)} chunks:")
    for chunk in chunks:
        print(f" Chunk {chunk.chunk_id}: {chunk.word_count} words | {chunk.text[:60]}...")
97
+
src/rag/embeddings.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Embeddings module
3
+ ----------------
4
+ Purpose: Convert text to vector embeddings using local Ollama or Sentence-Transformers
5
+ """
6
+ import requests
7
+ import numpy as np
8
+ from typing import List
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class OllamaEmbeddingClient:
    """
    Client for the Ollama embedding HTTP service.

    Requires: `ollama serve` running (default: localhost:11434).
    Model: nomic-embed-text by default.
    NOTE(review): nomic-embed-text produces 768-dim vectors, not 384 as
    previously documented here — confirm against the installed model.
    """

    def __init__(
        self,
        base_url: str = "http://localhost:11434",
        model: str = "nomic-embed-text",
        timeout: int = 30
    ):
        """
        Initialize the Ollama embedding client and verify connectivity.

        Args:
            base_url: Ollama server URL
            model: Embedding model name
            timeout: Per-request timeout in seconds (the connection probe
                uses its own fixed 5s timeout)

        Raises:
            ConnectionError: if the Ollama server is unreachable.
        """
        self.base_url = base_url
        self.model = model
        self.timeout = timeout

        # Fail fast at construction time rather than on the first embed call.
        self._test_connection()

    def _test_connection(self) -> None:
        """Probe GET /api/tags to confirm Ollama is up; raise ConnectionError otherwise."""
        try:
            response = requests.get(
                f"{self.base_url}/api/tags",
                timeout=5
            )
            if response.status_code != 200:
                raise ConnectionError(f"Ollama returned {response.status_code}")

            logger.info(f"✓ Connected to Ollama at {self.base_url}")
        except requests.exceptions.ConnectionError:
            raise ConnectionError(
                f"Cannot connect to Ollama at {self.base_url}. "
                "Start it with: ollama serve"
            )

    def embed(self, text: str) -> List[float]:
        """
        Get the embedding vector for a single text.

        Args:
            text: Text to embed

        Returns:
            List of floats; the length is model-dependent (768 for
            nomic-embed-text — confirm for other models).

        Raises:
            RuntimeError: non-200 response from Ollama
            TimeoutError: request exceeded self.timeout
            ConnectionError: connection lost mid-request
            ValueError: response JSON missing the expected "embeddings" key
        """
        try:
            # /api/embed is Ollama's batch endpoint: "input" may be a string
            # or a list, and the response wraps results in an "embeddings"
            # array — hence the [0] below for our single input.
            response = requests.post(
                f"{self.base_url}/api/embed",
                json={
                    "model": self.model,
                    "input": text
                },
                timeout=self.timeout
            )

            if response.status_code != 200:
                raise RuntimeError(
                    f"Ollama error {response.status_code}: {response.text}"
                )

            # Extract embedding from response
            embedding = response.json()["embeddings"][0]
            return embedding

        except requests.exceptions.Timeout:
            raise TimeoutError(
                f"Ollama request timed out after {self.timeout}s"
            )
        except requests.exceptions.ConnectionError:
            raise ConnectionError(
                f"Lost connection to Ollama at {self.base_url}"
            )
        except KeyError as e:
            raise ValueError(f"Unexpected Ollama response format: {e}")

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Embed multiple texts by looping over embed().

        Args:
            texts: List of texts to embed

        Returns:
            List of embeddings, one per input text (order preserved).

        Note: issues one HTTP request per text. /api/embed accepts a list,
        so true server-side batching is possible if this becomes a
        bottleneck.
        """
        embeddings = []
        for text in texts:
            try:
                emb = self.embed(text)
                embeddings.append(emb)
            except Exception as e:
                # Fail the whole batch on the first error so callers never
                # receive a partial, misaligned list of vectors.
                logger.error(f"Failed to embed text: {e}")
                raise

        return embeddings
+
132
+
133
class SentenceTransformerEmbeddingClient:
    """
    Local embedding client backed by sentence-transformers (no service needed).

    Default model: all-mpnet-base-v2 (768-dim vectors).
    NOTE(review): earlier docs here referenced all-MiniLM-L6-v2 / 384 dims,
    but the actual default below is all-mpnet-base-v2 — keep docs, config,
    and any stored vector dimensions in sync.

    Install with: pip install sentence-transformers
    """

    def __init__(self, model_name: str = "all-mpnet-base-v2"):
        """
        Load the sentence-transformers model.

        Args:
            model_name: HuggingFace model name
                Default: all-mpnet-base-v2 (768 dims)

        Raises:
            ImportError: sentence-transformers is not installed.

        Note: the first initialization downloads the model weights.
        """
        logger.info(f"Initializing Sentence-Transformers (model: {model_name})")

        try:
            # Imported lazily so the module is usable without this optional dep.
            from sentence_transformers import SentenceTransformer
            self.model = SentenceTransformer(model_name)
            logger.info(f"✓ Loaded Sentence-Transformer model: {model_name}")
        except ImportError:
            raise ImportError(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            )
        except Exception as e:
            logger.error(f"Failed to load Sentence-Transformer model: {e}")
            raise

    def embed(self, text: str) -> List[float]:
        """
        Get the embedding for a single text.

        Args:
            text: Text to embed

        Returns:
            List of floats (768 dims for the default all-mpnet-base-v2).
        """
        try:
            embedding = self.model.encode(text, convert_to_numpy=True)
            return embedding.tolist()
        except Exception as e:
            logger.error(f"Failed to embed text: {e}")
            raise

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Embed multiple texts in one model call (more efficient than calling
        embed() per text).

        Args:
            texts: List of texts to embed

        Returns:
            List of embeddings, one per input text (order preserved).
        """
        try:
            embeddings = self.model.encode(texts, convert_to_numpy=True)
            return [emb.tolist() for emb in embeddings]
        except Exception as e:
            logger.error(f"Failed to embed batch: {e}")
            raise
+ raise
207
+
208
+
209
def cosine_similarity(vec_a: List[float], vec_b: List[float]) -> float:
    """
    Cosine similarity between two vectors.

    Args:
        vec_a: First vector
        vec_b: Second vector

    Returns:
        Score in [-1, 1] (1 = identical direction); 0.0 when either
        vector has zero magnitude.

    Example:
        >>> cosine_similarity([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
        1.0
    """
    u = np.asarray(vec_a, dtype=float)
    v = np.asarray(vec_b, dtype=float)

    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)

    # A zero vector has no direction; define similarity as 0 in that case.
    if norm_u == 0 or norm_v == 0:
        return 0.0

    return float(np.dot(u, v) / (norm_u * norm_v))
239
+
240
+
241
+ # ============ TESTS ============
242
+
243
def test_cosine_similarity():
    """Identical vectors score ~1.0; orthogonal vectors score ~0.0."""
    x_axis = [1.0, 0.0, 0.0]
    assert abs(cosine_similarity(x_axis, [1.0, 0.0, 0.0]) - 1.0) < 0.01

    y_axis = [0.0, 1.0, 0.0]
    assert abs(cosine_similarity([1.0, 0.0, 0.0], y_axis) - 0.0) < 0.01
+ assert abs(cosine_similarity(vec3, vec4) - 0.0) < 0.01
254
+
255
+
256
def test_cosine_similarity_normalized():
    """Explicitly pre-normalized vectors still compare to ~1.0."""
    raw_a = np.array([1.0, 0.0, 0.0])
    raw_b = np.array([1.0, 0.0, 0.0])

    unit_a = raw_a / np.linalg.norm(raw_a)
    unit_b = raw_b / np.linalg.norm(raw_b)

    score = cosine_similarity(unit_a.tolist(), unit_b.tolist())
    assert abs(score - 1.0) < 0.01
267
+
268
+
269
# Manual smoke test: pick the backend from the same env var the pipeline
# uses, embed two texts, and print their similarity. Best-effort: failures
# are reported with a hint instead of a traceback.
if __name__ == "__main__":
    import os

    # Defaults to the dependency-local backend when EMBEDDING_BACKEND unset.
    backend = os.getenv("EMBEDDING_BACKEND", "sentence-transformers").lower()

    try:
        if backend == "ollama":
            print("Testing Ollama embeddings...")
            client = OllamaEmbeddingClient()
        else:
            print("Testing Sentence-Transformers embeddings...")
            client = SentenceTransformerEmbeddingClient()

        # Test single embedding
        text = "Machine learning is AI"
        embedding = client.embed(text)

        print(f"✓ Embedding created: {len(embedding)} dimensions")
        print(f" Sample values: {embedding[:5]}")

        # Test similarity between two related sentences
        text2 = "Deep learning uses networks"
        embedding2 = client.embed(text2)

        sim = cosine_similarity(embedding, embedding2)
        print(f" Similarity between texts: {sim:.3f}")

    except Exception as e:
        print(f"✗ Error: {e}")
        if backend == "ollama":
            print(" Start Ollama with: ollama serve")
        else:
            print(" Install sentence-transformers with: pip install sentence-transformers")
src/rag/llm.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM Module
3
+ ----------
4
+ Purpose: Query Groq LLM with context for RAG answers
5
+ """
6
+ from groq import Groq
7
+ from typing import List
8
+ import os
9
+ import logging
10
+ logging.basicConfig(level=logging.INFO)
11
+ from dotenv import load_dotenv
12
+
13
# Candidate .env locations, searched in order; the first existing file wins.
env_paths = [
    os.path.join(os.path.dirname(__file__), '../..', '.env'),  # Project root
    os.path.join(os.path.dirname(__file__), '.env'),  # Script directory
]

# Import-time side effect: load environment variables before GroqLLMClient
# reads GROQ_API_KEY. NOTE(review): uses print() rather than the module
# logger (which is defined just below this block) — consider logging instead.
for env_path in env_paths:
    if os.path.exists(env_path):
        load_dotenv(env_path)
        print(f"Loaded .env from: {env_path}")
        break
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class GroqLLMClient:
    """
    Client for querying the Groq LLM with retrieved context (RAG answers).

    Requires: a Groq API key (argument or GROQ_API_KEY env var).
    Model: llama-3.1-8b-instant -> check available models using client.models.list()
    """

    def __init__(
        self,
        api_key: str = None,
        model_name: str = "llama-3.1-8b-instant",
        max_tokens: int = 1024,
        temperature: float = 0.7,
    ):
        """
        Initialize Groq LLM client.

        Args:
            api_key: Groq API key. Fix: now optional — the body always fell
                back to the GROQ_API_KEY env var (``api_key or os.getenv``),
                but the old signature made the parameter mandatory anyway.
                Existing positional callers are unaffected.
            model_name: Groq model name
            max_tokens: Maximum number of tokens to generate
            temperature: 0-1; higher values give more varied/creative output

        Raises:
            ValueError: no key supplied and GROQ_API_KEY is unset.
        """
        self.api_key = api_key or os.getenv("GROQ_API_KEY")

        if not self.api_key:
            raise ValueError("GROQ_API_KEY not found in environment variables")

        self.client = Groq(api_key=self.api_key)
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.temperature = temperature

        logger.info(f"Groq LLM client initialized with model: {self.model_name}")

    def _build_prompt(
        self,
        context: str,
        question: str,
    ) -> str:
        """
        Build the grounded-answer prompt sent to the LLM.

        Args:
            context: Retrieved chunks, already formatted as a single string
            question: Question to ask

        Returns:
            Prompt instructing the model to answer only from the context.
        """
        prompt = f"""You are a helpful assistant. Answer the question based ONLY on the provided context.
If the context doesn't contain enough information to answer, say so explicitly.
Do not make up information.

Context: {context}

Question: {question}

Answer:"""
        return prompt

    def query(
        self,
        context: str,
        query: str,
    ) -> str:
        """
        Query the Groq LLM with context.

        Args:
            context: Retrieved context from the vector store
            query: User's question

        Returns:
            LLM's answer as a string.

        Raises:
            RuntimeError: wraps any Groq API failure.
        """
        try:
            prompt = self._build_prompt(context, query)
            logger.debug(f"Querying Groq with {len(context)} chars context")

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                max_tokens=self.max_tokens,
                temperature=self.temperature,
            )
            # Single-choice completion: take the first (only) choice.
            answer = response.choices[0].message.content
            logger.debug(f"Groq API response: {answer}")
            return answer
        except Exception as e:
            logger.error(f"Groq query failed: {e}")
            raise RuntimeError(f"LLM query failed: {e}")

    def query_with_sources(
        self,
        context: str,
        query: str,
        sources: List[str] = None
    ) -> dict:
        """
        Query the LLM and return the answer with source attribution.

        Args:
            context: Retrieved context
            query: User's question
            sources: Optional list of source identifiers (chunk IDs, URLs,
                etc.); passed through untouched, defaulting to [].

        Returns:
            Dict with 'answer' and 'sources' keys.

        Example:
            >>> result = client.query_with_sources(
            ...     context="...",
            ...     query="What is ML?",
            ...     sources=["doc1_chunk_0", "doc1_chunk_2"]
            ... )
            >>> print(result["answer"])
            >>> print(result["sources"])
        """
        answer = self.query(context, query)

        return {
            "answer": answer,
            "sources": sources or []
        }
152
+
153
def build_context_string(
    retrieved_results: List,
    include_scores: bool = True
) -> str:
    """
    Format retrieved chunks into one context block for the LLM prompt.

    Args:
        retrieved_results: Retrieval hits; each must expose ``.text`` and
            (when scores are shown) ``.similarity``.
        include_scores: Whether to include relevance percentages.

    Returns:
        Chunks rendered as "[Chunk N ...]" sections separated by blank lines.
    """
    formatted = []

    for position, hit in enumerate(retrieved_results, 1):
        if include_scores:
            header = f"[Chunk {position} - Relevance: {hit.similarity:.1%}]"
        else:
            header = f"[Chunk {position}]"
        formatted.append(f"{header}\n{hit.text}")

    return "\n\n".join(formatted)
176
+
177
+ # ============ TESTS ============
178
+
179
def test_build_context_string():
    """The rendered context contains every chunk's text and formatted score."""
    from .vector_store import RetrievalResult

    hits = [
        RetrievalResult("chunk1", "Text 1", 0.95),
        RetrievalResult("chunk2", "Text 2", 0.87)
    ]

    rendered = build_context_string(hits)

    assert "Text 1" in rendered
    assert "Text 2" in rendered
    assert "95.0%" in rendered
193
+
194
+
195
# Manual smoke test: requires a valid GROQ_API_KEY; builds a tiny context
# from two fake retrieval results and asks the live model one question.
if __name__ == "__main__":
    try:
        # Test Groq client
        client = GroqLLMClient(api_key=os.getenv("GROQ_API_KEY"))

        # Build a context string from hand-made retrieval results.
        from .vector_store import RetrievalResult

        results = [
            RetrievalResult("chunk1", "Machine learning is AI", 0.95),
            RetrievalResult("chunk2", "Deep learning uses neural networks", 0.87)
        ]

        context = build_context_string(results)

        # Query the live API with the assembled context.
        answer = client.query(
            context=context,
            query="What is machine learning?"
        )

        print("✓ Groq query successful!")
        print(f"Answer: {answer[:200]}...")

    except Exception as e:
        print(f"✗ Error: {e}")
src/rag/pdf_processor.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Processor
3
+ -------------
4
+ Purpose: Process PDF files and extract text.
5
+ """
6
+
7
+ import os
8
+ from pathlib import Path
9
+ from typing import List, Dict, Tuple
10
+ import logging
11
+ import PyPDF2
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
def extract_text_pypdf2(pdf_path: str) -> Tuple[str, Dict]:
    """
    Extract text from a PDF using PyPDF2.

    Fix: removed a stray ``@staticmethod`` decorator — this is a
    module-level function, and ``@staticmethod`` outside a class wraps it
    in a descriptor that is not callable before Python 3.10 and is
    misleading afterwards.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Tuple of (text, metadata); metadata includes num_pages, title and
        author (when present), a per-page text dict, and the source filename.

    Raises:
        Exception: re-raises any PyPDF2/IO failure after logging it.

    Note: PyPDF2 works okay for text-based PDFs.
    For scanned PDFs, consider using OCR tools.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            # Extract metadata (may be None for some PDFs).
            metadata = pdf_reader.metadata or {}
            num_pages = len(pdf_reader.pages)

            # Extract text from all pages, keeping a per-page copy too.
            text = ""
            page_texts = {}

            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                page_texts[page_num + 1] = page_text

            result_metadata = {
                "num_pages": num_pages,
                "title": metadata.get('/Title', 'Unknown'),
                "author": metadata.get('/Author', 'Unknown'),
                "page_texts": page_texts,
                "source_file": os.path.basename(pdf_path)
            }

            return text, result_metadata

    except Exception as e:
        logger.error(f"Failed to extract text from {pdf_path}: {e}")
        raise
60
+
61
+
62
def extract_text_pdfplumber(pdf_path: str) -> Tuple[str, Dict]:
    """
    Extract text from a PDF using pdfplumber (better layout handling).

    Args:
        pdf_path: Path to PDF file

    Returns:
        Tuple of (text, metadata) in the same shape as extract_text_pypdf2,
        so callers can use either interchangeably.

    Note: requires ``pip install pdfplumber``; when it is not installed this
    function silently falls back to PyPDF2 extraction.
    """
    try:
        import pdfplumber
    except ImportError:
        # Graceful degradation: same return contract, lower-quality extractor.
        logger.warning("pdfplumber not installed, falling back to PyPDF2")
        return extract_text_pypdf2(pdf_path)

    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = ""
            page_texts = {}

            # Concatenate all pages, keeping a per-page copy as well.
            for page_num, page in enumerate(pdf.pages):
                page_text = page.extract_text()
                text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                page_texts[page_num + 1] = page_text

            result_metadata = {
                "num_pages": len(pdf.pages),
                "title": pdf.metadata.get('Title', 'Unknown') if pdf.metadata else 'Unknown',
                "author": pdf.metadata.get('Author', 'Unknown') if pdf.metadata else 'Unknown',
                "page_texts": page_texts,
                "source_file": os.path.basename(pdf_path)
            }

            return text, result_metadata

    except Exception as e:
        logger.error(f"Failed to extract text from {pdf_path}: {e}")
        raise
104
+
105
class PDFProcessor:
    """
    Extract text from PDF files for RAG ingestion.

    Thin wrapper over the module-level extraction helpers that selects
    PyPDF2 or pdfplumber based on configuration and availability.
    """

    def __init__(self, use_pdfplumber: bool = False):
        """
        Configure the processor.

        Args:
            use_pdfplumber: prefer pdfplumber (better extraction) when it is
                installed; silently falls back to PyPDF2 otherwise.
        """
        self.use_pdfplumber = use_pdfplumber

        if use_pdfplumber:
            # Probe the optional dependency up front and downgrade if absent.
            try:
                import pdfplumber
                logger.info("Using pdfplumber for PDF extraction")
            except ImportError:
                logger.warning("pdfplumber not installed, using PyPDF2")
                self.use_pdfplumber = False

    def process_pdf(self, pdf_path: str) -> Tuple[str, Dict]:
        """
        Extract text and metadata from a single PDF.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Tuple of (extracted_text, metadata).

        Raises:
            FileNotFoundError: path does not exist.
            ValueError: path does not end in ``.pdf``.

        Example:
            >>> processor = PDFProcessor()
            >>> text, meta = processor.process_pdf("paper.pdf")
            >>> print(f"Extracted {meta['num_pages']} pages")
        """
        path = str(pdf_path)

        if not os.path.exists(path):
            raise FileNotFoundError(f"PDF not found: {path}")

        if not path.lower().endswith('.pdf'):
            raise ValueError(f"Not a PDF file: {path}")

        logger.info(f"Processing PDF: {os.path.basename(path)}")

        extractor = extract_text_pdfplumber if self.use_pdfplumber else extract_text_pypdf2
        text, metadata = extractor(path)

        logger.info(
            f"✓ Extracted {metadata['num_pages']} pages, "
            f"{len(text)} chars"
        )

        return text, metadata

    def process_folder(
        self,
        folder_path: str,
        pattern: str = "*.pdf"
    ) -> Dict[str, Tuple[str, Dict]]:
        """
        Extract every matching PDF under a folder.

        Args:
            folder_path: Path to folder containing PDFs.
            pattern: Glob pattern to match (default: "*.pdf").

        Returns:
            Dict of {filename_stem: (text, metadata)}; individual failures
            are logged and skipped rather than aborting the whole batch.

        Raises:
            FileNotFoundError: the folder does not exist.
        """
        root = Path(folder_path)

        if not root.exists():
            raise FileNotFoundError(f"Folder not found: {root}")

        logger.info(f"Processing folder: {root}")

        matches = list(root.glob(pattern))
        logger.info(f"Found {len(matches)} PDF files")

        extracted = {}
        errors = []

        for candidate in matches:
            try:
                body, meta = self.process_pdf(str(candidate))
                # Key on the filename without its extension.
                extracted[candidate.stem] = (body, meta)
            except Exception as e:
                logger.error(f"Failed to process {candidate.name}: {e}")
                errors.append((candidate.name, str(e)))

        if errors:
            logger.warning(f"Failed to process {len(errors)} files:")
            for name, reason in errors:
                # NOTE(review): "(unknown)" looks like a templating artifact;
                # the filename was probably intended here.
                logger.warning(f" - (unknown): {reason}")

        logger.info(f"✓ Processed {len(extracted)} PDFs successfully")

        return extracted

    def clean_text(self, text: str) -> str:
        """
        Normalize extracted text: collapse blank lines and strip
        non-printable characters (newlines and tabs are preserved).

        Args:
            text: Raw extracted text.

        Returns:
            Cleaned text.
        """
        # Drop empty lines and surrounding whitespace on each line.
        kept_lines = [line.strip() for line in text.split('\n') if line.strip()]
        collapsed = '\n'.join(kept_lines)

        # Filter control characters while keeping newline/tab structure.
        return ''.join(ch for ch in collapsed if ch.isprintable() or ch in '\n\t')
232
+
233
+
234
+ # ============ TESTS ============
235
+
236
def test_pdf_processor_missing_file():
    """process_pdf must raise FileNotFoundError for a nonexistent path."""
    processor = PDFProcessor()

    raised = False
    try:
        processor.process_pdf("nonexistent.pdf")
    except FileNotFoundError:
        raised = True
        print("✓ Correctly raises FileNotFoundError for missing file")
    assert raised, "Should raise FileNotFoundError"
245
+
246
+
247
# Manual smoke test: enable logging and construct the default
# (PyPDF2-backed) processor; no extraction is performed here.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Example usage
    processor = PDFProcessor(use_pdfplumber=False)
src/rag/pipeline.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG Pipeline
3
+ ------------
4
+ Purpose: Unify chunking, embedding, retrieval, and answer generation into a single RAG pipeline
5
+ """
6
+
7
+ from typing import List, Dict, Any
8
+ from dataclasses import dataclass
9
+ import logging
10
+ import os
11
+ from dotenv import load_dotenv
12
+ from pathlib import Path
13
+
14
+ from .chunker import chunk_text
15
+ from .vector_store import ChromaVectorStore
16
+ from .llm import GroqLLMClient, build_context_string
17
+ from .pdf_processor import PDFProcessor
18
+
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
def load_env():
    """Load environment variables from the first .env file found.

    Checks the project root first, then this module's directory.

    Returns:
        The path of the .env file that was loaded, or None if none exists.
    """
    here = os.path.dirname(__file__)
    candidates = [
        os.path.join(here, '../..', '.env'),
        os.path.join(here, '.env'),
    ]

    for candidate in candidates:
        if not os.path.exists(candidate):
            continue
        load_dotenv(candidate)
        logger.debug(f"Loaded .env from: {candidate}")
        return candidate

    logger.warning("No .env file found")
    return None
37
+
38
+
39
def get_embeddings_client():
    """
    Build an embeddings client selected by the EMBEDDING_BACKEND env var.

    Environment Variables:
        EMBEDDING_BACKEND: "ollama" or "sentence-transformers" (default)
        OLLAMA_BASE_URL: URL for Ollama (default: http://localhost:11434)

    Returns:
        Embeddings client instance
    """
    backend = os.getenv("EMBEDDING_BACKEND", "sentence-transformers").lower()

    if backend != "ollama":
        # sentence-transformers: the default — free, local, works everywhere.
        logger.info("Using Sentence-Transformers embeddings (local)")
        from .embeddings import SentenceTransformerEmbeddingClient
        return SentenceTransformerEmbeddingClient()

    logger.info("Using Ollama embeddings")
    from .embeddings import OllamaEmbeddingClient
    return OllamaEmbeddingClient(
        base_url=os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"),
        model="nomic-embed-text",
    )
65
+
66
+
67
+ @dataclass
68
+ class RAGConfig:
69
+ """Configuration for RAG pipeline."""
70
+ chunk_size: int = 500
71
+ chunk_overlap: int = 50
72
+ top_k: int = 3
73
+ embedding_backend: str = None # Will use env var if None
74
+ groq_api_key: str = None
75
+
76
+ def __post_init__(self):
77
+ """Set embedding_backend from env if not provided."""
78
+ if self.embedding_backend is None:
79
+ self.embedding_backend = os.getenv("EMBEDDING_BACKEND", "sentence-transformers")
80
+
81
+
82
class RAGPipeline:
    """
    End-to-end RAG pipeline.

    Workflow:
        1. Initialize: create embeddings, LLM, and vector-store components
        2. Ingest: chunk and embed documents
        3. Query: retrieve relevant chunks and generate an answer
    """

    def __init__(
        self,
        config: RAGConfig = None,
        embeddings=None,
        llm=None
    ):
        """
        Initialize RAG pipeline with all components.

        Args:
            config: RAGConfig object with settings (defaults used if None)
            embeddings: Optional embeddings client (for dependency injection)
            llm: Optional LLM client (for dependency injection)

        Raises:
            ValueError: If no LLM is injected and no Groq API key is available.
        """
        load_env()
        self.config = config or RAGConfig()
        logger.info("Initializing RAG Pipeline...")

        # Embeddings: an injected client wins; otherwise build from env config.
        if embeddings:
            self.embeddings = embeddings
            logger.info("✓ Using provided embeddings client")
        else:
            try:
                self.embeddings = get_embeddings_client()
                logger.info("✓ Embeddings client ready")
            except Exception as e:
                logger.error(f"Failed to initialize embeddings: {e}")
                raise

        # LLM: an injected client wins; otherwise a Groq API key is required.
        if llm:
            self.llm = llm
            logger.info("✓ Using provided LLM client")
        else:
            try:
                api_key = self.config.groq_api_key or os.getenv("GROQ_API_KEY")
                if not api_key:
                    raise ValueError(
                        "GROQ_API_KEY not provided. Pass it in RAGConfig or set GROQ_API_KEY environment variable."
                    )
                self.llm = GroqLLMClient(api_key=api_key)
                logger.info("✓ LLM client ready")
            except Exception as e:
                logger.error(f"Failed to initialize LLM: {e}")
                raise

        self.vector_store = ChromaVectorStore()
        logger.info("✓ Vector store ready")

        logger.info("✓ RAG Pipeline initialized")

    def ingest_pdf(
        self,
        pdf_path: str
    ) -> Dict[str, Any]:
        """
        Ingest a PDF file: extract text, chunk, and embed.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Ingestion stats (see ingest()), plus a "pdf_metadata" key.

        Example:
            >>> pipeline = RAGPipeline()
            >>> result = pipeline.ingest_pdf("research_paper.pdf")
            >>> print(f"Ingested {result['chunks_embedded']} chunks")
        """
        # Extract text and document metadata from the PDF.
        processor = PDFProcessor(use_pdfplumber=False)
        text, metadata = processor.process_pdf(pdf_path)

        # Filename (without extension) serves as the document id.
        doc_id = Path(pdf_path).stem

        result = self.ingest(doc_id, text)
        result["pdf_metadata"] = metadata
        return result

    def ingest_folder(
        self,
        folder_path: str
    ) -> Dict[str, Dict[str, Any]]:
        """
        Ingest all PDFs from a folder.

        Args:
            folder_path: Path to folder containing PDFs

        Returns:
            Dict of {doc_id: ingestion_result}

        Example:
            >>> pipeline = RAGPipeline()
            >>> results = pipeline.ingest_folder("./papers")
            >>> for doc_id, result in results.items():
            ...     print(f"{doc_id}: {result['chunks_embedded']} chunks")
        """
        processor = PDFProcessor(use_pdfplumber=False)
        documents = processor.process_folder(folder_path)

        results = {}
        for doc_id, (text, metadata) in documents.items():
            result = self.ingest(doc_id, text)
            result["pdf_metadata"] = metadata
            results[doc_id] = result

        return results

    def ingest(
        self,
        doc_id: str,
        text: str
    ) -> Dict[str, Any]:
        """
        Ingest a document: chunk it and embed each chunk.

        Args:
            doc_id: Unique document identifier
            text: Document text

        Returns:
            Ingestion stats dict; always contains "doc_id", "chunks_created"
            and "chunks_embedded" (callers such as ingest_pdf rely on the
            latter, so the too-short path includes it as well).

        Example:
            >>> pipeline = RAGPipeline()
            >>> result = pipeline.ingest(
            ...     "doc1",
            ...     "Machine learning is AI. Deep learning uses networks."
            ... )
            >>> print(f"Ingested {result['chunks_created']} chunks")
        """
        logger.info(f"Ingesting document: {doc_id}")

        # Step 1: split the document into overlapping chunks.
        chunks = chunk_text(text, self.config.chunk_size, self.config.chunk_overlap)
        logger.info(f"✓ Chunks created: {len(chunks)}")

        if not chunks:
            logger.warning("No chunks created. Document may be too short.")
            # Include "chunks_embedded" so callers indexing it don't KeyError.
            return {
                "doc_id": doc_id,
                "chunks_created": 0,
                "chunks_embedded": 0,
                "time_taken": 0,
                "error": "Document too short"
            }

        # Step 2: embed each chunk and store it; failures skip that chunk only.
        chunks_embedded = 0
        for chunk in chunks:
            # Bind the id outside the try so the error log can always use it.
            chunk_id = f"{doc_id}_chunk_{chunk.chunk_id}"
            try:
                embedding = self.embeddings.embed(chunk.text)
                self.vector_store.add(
                    chunk_id=chunk_id,
                    text=chunk.text,
                    embedding=embedding,
                    metadata={
                        "doc_id": doc_id,
                        "chunk_num": chunk.chunk_id,
                        "word_count": chunk.word_count
                    }
                )
                chunks_embedded += 1
            except Exception as e:
                logger.error(f"Failed to embed chunk {chunk_id}: {e}")
                continue

        logger.info(f"✓ Embedded {chunks_embedded}/{len(chunks)} chunks")
        return {
            "doc_id": doc_id,
            "chunks_created": len(chunks),
            "chunks_embedded": chunks_embedded,
            "status": "success" if chunks_embedded > 0 else "partial"
        }

    def query(
        self,
        query: str,
        return_sources: bool = True
    ) -> Dict[str, Any]:
        """
        Query the RAG system: retrieve relevant chunks and generate answer.

        Args:
            query: User's question
            return_sources: Include source chunks in response

        Returns:
            Dictionary with 'query', 'answer', 'sources', etc.

        Raises:
            ValueError: If vector store is empty

        Example:
            >>> pipeline = RAGPipeline()
            >>> pipeline.ingest("doc1", "Machine learning is...")
            >>> result = pipeline.query("What is ML?")
            >>> print(result["answer"])
        """
        logger.info(f"Querying: {query}")

        # Refuse to answer with nothing ingested.
        if self.vector_store.size() == 0:
            raise ValueError("No documents in vector store")

        # Step 1: embed the query with the same backend used at ingest time.
        query_embedding = self.embeddings.embed(query)
        logger.debug(" → Query embedded")

        # Step 2: retrieve the top-k most similar chunks.
        retrieved_chunks = self.vector_store.retrieve(
            query_embedding,
            top_k=self.config.top_k
        )
        logger.debug(f" → Retrieved {len(retrieved_chunks)} chunks")
        if not retrieved_chunks:
            return {
                "query": query,
                "answer": "No relevant documents found.",
                "sources": [],
                "status": "no_results"
            }

        # Step 3: assemble the retrieved chunks into an LLM context string.
        context = build_context_string(retrieved_chunks)
        logger.debug(f" → Built context ({len(context)} chars)")

        # Step 4: ask the LLM, grounded on the retrieved context.
        try:
            answer = self.llm.query(context=context, query=query)
            logger.debug(f" → LLM responded ({len(answer)} chars)")
        except Exception as e:
            logger.error(f"LLM query failed: {e}")
            raise

        # Step 5: format the response, optionally with source previews.
        sources = [
            {
                "chunk_id": r.chunk_id,
                "similarity": round(r.similarity, 3),
                "preview": r.text[:100] + "..." if len(r.text) > 100 else r.text
            }
            for r in retrieved_chunks
        ] if return_sources else []

        result = {
            "query": query,
            "answer": answer,
            "sources": sources,
            "chunks_used": len(retrieved_chunks),
            "status": "success"
        }

        logger.info(f"Query complete: {result['status']}")
        return result

    def get_stats(self) -> Dict[str, Any]:
        """Get pipeline statistics (store size and active config)."""
        return {
            "total_chunks": self.vector_store.size(),
            "config": {
                "chunk_size": self.config.chunk_size,
                "chunk_overlap": self.config.chunk_overlap,
                "top_k": self.config.top_k
            }
        }
src/rag/vector_store.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector Store Module
3
+ ===================
4
+
5
+ Purpose: Store embeddings and retrieve similar ones
6
+
7
+ This module uses Chroma for persistent, efficient vector storage.
8
+ Chroma is free, local, and production-ready.
9
+
10
+ Key Concepts:
11
+ • Vector storage: Persistent storage mapping chunk_id → embedding
12
+ • Metadata: Source info, text preview, etc.
13
+ • Retrieval: Find top-k most similar vectors using cosine similarity
14
+ • Persistence: Data survives application restarts
15
+ """
16
+
17
+ from typing import List, Dict, Any
18
+ from dataclasses import dataclass, field
19
+ import logging
20
+ import chromadb
21
+ import os
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
@dataclass
class RetrievalResult:
    """One chunk returned by a similarity search."""
    # Unique identifier of the stored chunk.
    chunk_id: str
    # The chunk's original text content.
    text: str
    # Cosine similarity to the query (higher is more similar).
    similarity: float
    # Arbitrary metadata stored alongside the chunk.
    metadata: Dict[str, Any] = field(default_factory=dict)
33
+
34
+
35
class ChromaVectorStore:
    """
    Vector store using Chroma (persistent, free, production-ready).

    Chroma is a modern vector database that:
    • Stores embeddings persistently on disk
    • Provides similarity search
    • Is completely free and open source
    • Works locally (no API calls)

    This is the recommended implementation for production RAG systems.
    """

    def __init__(self, persist_directory: str = ".chromadb", collection_name: str = "rag"):
        """
        Initialize Chroma vector store.

        Args:
            persist_directory: Where to store vectors on disk
            collection_name: Name of the collection (namespace)

        Raises:
            Exception: Propagates any chromadb initialization failure.

        Example:
            >>> store = ChromaVectorStore(persist_directory="./data/vectors")
        """
        self.persist_directory = persist_directory
        self.collection_name = collection_name

        # Ensure persist directory exists before chromadb touches it.
        os.makedirs(persist_directory, exist_ok=True)

        try:
            # Persistent client: writes collections to disk automatically.
            self.client = chromadb.PersistentClient(path=persist_directory)

            # Get or create collection; cosine distance matches our
            # similarity conversion in retrieve().
            self.collection = self.client.get_or_create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"}
            )

            logger.info(
                f"✓ Initialized Chroma vector store at {persist_directory} "
                f"(collection: {collection_name})"
            )
        except Exception as e:
            logger.error(f"Failed to initialize Chroma: {e}")
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # NOTE(review): chromadb.PersistentClient persists on every write and
        # exposes no persist()/shutdown() methods; the previous body called
        # those nonexistent methods inside a blanket try/except, making it a
        # silent no-op. Made the no-op explicit instead.
        return None

    def add(
        self,
        chunk_id: str,
        text: str,
        embedding: List[float],
        metadata: Dict[str, Any] = None
    ) -> None:
        """
        Add a chunk with its embedding to the store.

        Args:
            chunk_id: Unique identifier for chunk
            text: Original text content
            embedding: Vector representation (list of floats)
            metadata: Optional metadata (source, page number, etc.)

        Raises:
            Exception: Propagates any chromadb failure after logging it.

        Example:
            >>> store.add(
            ...     "doc1_chunk_0",
            ...     "Machine learning is AI",
            ...     [0.1, 0.2, ..., 0.384],
            ...     metadata={"doc_id": "doc1", "page": 1}
            ... )
        """
        try:
            self.collection.add(
                ids=[chunk_id],
                documents=[text],
                embeddings=[embedding],
                metadatas=[metadata or {}]
            )
            logger.debug(f"Added chunk {chunk_id} ({len(text)} chars)")
        except Exception as e:
            logger.error(f"Failed to add chunk {chunk_id}: {e}")
            raise

    def retrieve(
        self,
        query_embedding: List[float],
        top_k: int = 5
    ) -> List[RetrievalResult]:
        """
        Find most similar chunks to query.

        Args:
            query_embedding: Query vector
            top_k: Number of results to return

        Returns:
            List of RetrievalResult objects, sorted by similarity (highest
            first); empty list when the store is empty or nothing matches.

        Example:
            >>> results = store.retrieve(query_embedding, top_k=3)
            >>> for r in results:
            ...     print(f"{r.similarity:.3f} | {r.text[:60]}")
        """
        try:
            if self.collection.count() == 0:
                logger.warning("Vector store is empty")
                return []

            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=top_k
            )

            if not results["ids"] or not results["ids"][0]:
                logger.debug("No results found for query")
                return []

            # Chroma returns cosine *distances*; convert to similarity
            # via 1 - distance.
            retrieval_results = []
            for i, chunk_id in enumerate(results["ids"][0]):
                distance = results["distances"][0][i]
                similarity = 1 - distance

                result = RetrievalResult(
                    chunk_id=chunk_id,
                    text=results["documents"][0][i],
                    similarity=similarity,
                    metadata=results["metadatas"][0][i]
                )
                retrieval_results.append(result)

            logger.debug(f"Retrieved {len(retrieval_results)} chunks")
            return retrieval_results

        except Exception as e:
            logger.error(f"Retrieval failed: {e}")
            raise

    def size(self) -> int:
        """Return number of chunks in store (0 on error)."""
        try:
            return self.collection.count()
        except Exception as e:
            logger.error(f"Failed to get store size: {e}")
            return 0

    def delete(self, chunk_id: str) -> bool:
        """
        Delete a chunk from the store.

        Args:
            chunk_id: ID of chunk to delete

        Returns:
            True if the delete call succeeded, False on error.
        """
        try:
            self.collection.delete(ids=[chunk_id])
            logger.debug(f"Deleted chunk {chunk_id}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete chunk {chunk_id}: {e}")
            return False

    def clear(self) -> None:
        """Clear all vectors from store.

        Raises:
            Exception: Propagates any chromadb failure after logging it.
        """
        try:
            # Chroma has no truncate; fetch all ids and delete them.
            all_data = self.collection.get()
            if all_data["ids"]:
                self.collection.delete(ids=all_data["ids"])
            logger.info("Cleared vector store")
        except Exception as e:
            logger.error(f"Failed to clear store: {e}")
            raise
225
+
226
+
227
+
228
+
229
+
230
+ # ============ TESTS ============
231
+
232
+ import tempfile
233
+ import shutil
234
+ import time
235
+
236
def test_chroma_vector_store():
    """Smoke-test add/retrieve against a throwaway Chroma directory."""
    temp_dir = tempfile.mkdtemp()

    store = ChromaVectorStore(persist_directory=temp_dir)

    try:
        # Three vectors: two near each other, one orthogonal.
        fixtures = [
            ("chunk1", "Machine learning", [1.0, 0.0, 0.0]),
            ("chunk2", "Deep learning networks", [0.9, 0.1, 0.0]),
            ("chunk3", "Cooking recipes", [0.0, 1.0, 0.0]),
        ]
        for cid, text, vec in fixtures:
            store.add(cid, text, vec, metadata={"source": "test"})

        # Querying with chunk1's own vector must rank it first.
        hits = store.retrieve([1.0, 0.0, 0.0], top_k=2)
        assert len(hits) == 2
        assert hits[0].chunk_id == "chunk1"
        print("✓ Chroma test passed!")

    finally:
        # Release Chroma's file handles before removing the directory.
        try:
            if hasattr(store, "client"):
                store.client.close()
            del store.client
            del store.collection
        except Exception as exc:
            logger.warning(f"Error closing Chroma client: {exc}")

        # Windows may keep the handles briefly; give it a moment.
        time.sleep(1.0)

        # Deletion can fail transiently on Windows — retry a few times.
        for attempt in range(1, 6):
            try:
                shutil.rmtree(temp_dir)
                break
            except PermissionError:
                if attempt < 5:
                    time.sleep(0.5)
                else:
                    logger.warning(f"Could not delete temp directory {temp_dir}, skipping")
+
285
+
286
+
287
+ if __name__ == "__main__":
288
+ logging.basicConfig(level=logging.INFO)
289
+
290
+ # Test Chroma
291
+ try:
292
+ test_chroma_vector_store()
293
+ except ImportError:
294
+ print("Chroma not installed, skipping test")
295
+
296
+ # Test SimpleVectorStore
297
+ test_simple_vector_store()
uv.lock ADDED
The diff for this file is too large to render. See raw diff