puji4ml commited on
Commit
6e42cf5
·
verified ·
1 Parent(s): 2b22a59

Upload 9 files

Browse files
Files changed (9) hide show
  1. .dockerignore +78 -0
  2. .gitattributes +19 -0
  3. .gitignore +0 -0
  4. Dockerfile +49 -0
  5. README.md +639 -0
  6. docker-compose.yml +94 -0
  7. init_project.py +43 -0
  8. requirements.txt +46 -0
  9. results.xlsx +3 -0
.dockerignore ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # Virtual environments
9
+ env/
10
+ venv/
11
+ ENV/
12
+ env.bak/
13
+ venv.bak/
14
+ .venv/
15
+
16
+ # IDE
17
+ .vscode/
18
+ .idea/
19
+ *.swp
20
+ *.swo
21
+ .DS_Store
22
+
23
+ # Logs
24
+ *.log
25
+ logs/
26
+ *.log.*
27
+
28
+ # Git
29
+ .git/
30
+ .gitignore
31
+ .gitattributes
32
+
33
+ # Documentation
34
+ README.md
35
+ docs/
36
+ *.pdf
37
+
38
+ # Tests
39
+ tests/
40
+ test_*.py
41
+ *_test.py
42
+
43
+ # Temporary files
44
+ tmp/
45
+ temp/
46
+ *.tmp
47
+ .cache/
48
+
49
+ # Jupyter notebooks
50
+ *.ipynb
51
+ .ipynb_checkpoints/
52
+
53
+ # Database temporary files (keep main DB, ignore temp files)
54
+ *.db-shm
55
+ *.db-wal
56
+
57
+ # Environment files (will be injected via docker-compose)
58
+ .env
59
+ .env.*
60
+
61
+ # Large corpus files (will handle separately)
62
+ # Uncomment if not including in Docker image
63
+ # chroma_db/
64
+ # data/wikipedia_corpus.parquet
65
+
66
+ # Build artifacts
67
+ dist/
68
+ build/
69
+ *.egg-info/
70
+
71
+ # Docker files (no need to copy into image)
72
+ Dockerfile
73
+ docker-compose.yml
74
+ .dockerignore
75
+
76
+ # GitHub/CI files
77
+ .github/
78
+ .gitmodules
.gitattributes ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git LFS Configuration for Large Files
2
+ # ChromaDB vector stores (entire directory)
3
+ chroma_db/** filter=lfs diff=lfs merge=lfs -text
4
+ data/vector_stores/** filter=lfs diff=lfs merge=lfs -text
5
+ # SQLite databases (all variants)
6
+ *.db filter=lfs diff=lfs merge=lfs -text
7
+ *.sqlite filter=lfs diff=lfs merge=lfs -text
8
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
9
+ # Binary index files
10
+ *.bin filter=lfs diff=lfs merge=lfs -text
11
+ # Parquet data files
12
+ *.parquet filter=lfs diff=lfs merge=lfs -text
13
+ # JSONL corpus files
14
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
15
+ # Model files (if any)
16
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
17
+ *.gguf filter=lfs diff=lfs merge=lfs -text
18
+ data/vector_stores/**/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
19
+ results.xlsx filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
Binary file (999 Bytes). View file
 
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 5: Production Dockerfile for RAG Pipeline Optimizer
2
+ FROM python:3.12-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies
8
+ RUN apt-get update && apt-get install -y \
9
+ build-essential \
10
+ curl \
11
+ git \
12
+ git-lfs \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Initialize Git LFS
16
+ RUN git lfs install
17
+
18
+ # Copy requirements first (for Docker layer caching)
19
+ COPY requirements.txt .
20
+
21
+ # Install Python dependencies
22
+ RUN pip install --no-cache-dir --upgrade pip && \
23
+ pip install --no-cache-dir -r requirements.txt
24
+
25
+ # Copy project files
26
+ COPY config/ ./config/
27
+ COPY core/ ./core/
28
+ COPY utils/ ./utils/
29
+ COPY scripts/ ./scripts/
30
+ COPY app/ ./app/
31
+ COPY data/ ./data/
32
+ COPY chroma_db/ ./chroma_db/
33
+
34
+ # Create necessary directories
35
+ RUN mkdir -p logs
36
+
37
+ # Expose Streamlit port
38
+ EXPOSE 8501
39
+
40
+ # Health check
41
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
42
+ CMD curl --fail http://localhost:8501/_stcore/health || exit 1
43
+
44
+ # Run Streamlit dashboard
45
+ CMD ["streamlit", "run", "app/dashboard.py", \
46
+ "--server.port=8501", \
47
+ "--server.address=0.0.0.0", \
48
+ "--server.headless=true", \
49
+ "--browser.gatherUsageStats=false"]
README.md ADDED
@@ -0,0 +1,639 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG Pipeline Optimizer - Phase 1 Complete ✅
2
+
3
+ An MLOps platform for evaluating and optimizing RAG (Retrieval-Augmented Generation) pipelines across multiple models and configurations.
4
+
5
+ ---
6
+
7
+ ## 🎯 Project Overview
8
+
9
+ **The Problem**: Every company has a RAG system, but almost no one knows if their RAG is good. Is chunk_size=512 better than 1024? Is Cohere a better embedder than OpenAI for their data? They're just guessing.
10
+
11
+ **The Solution**: A full-stack RAG evaluation platform that runs multiple pipeline configurations in parallel, scores them using AI evaluation, and shows you which configuration works best for YOUR data.
12
+
13
+ ---
14
+
15
+ ## ✅ Phase 1: Complete
16
+
17
+ ### What's Built
18
+
19
+ - ✅ **Project structure** with clean separation of concerns
20
+ - ✅ **6 diverse RAG pipelines** leveraging different strategies:
21
+ - Pipeline A: Speed-Optimized (Azure GPT-5)
22
+ - Pipeline B: Accuracy-Optimized (Azure GPT-5 + Reranking)
23
+ - Pipeline C: Balanced (Azure Cohere)
24
+ - Pipeline D: Reasoning (Anthropic Claude)
25
+ - Pipeline E: Cost-Optimized (Azure DeepSeek)
26
+ - Pipeline F: Experimental (Groq Llama)
27
+ - ✅ **Configuration management** with environment variables
28
+ - ✅ **Cost estimation** for each pipeline
29
+ - ✅ **Comprehensive tests** to validate configurations
30
+
31
+ ### Technology Stack
32
+
33
+ | Component | Technology | Purpose |
34
+ |-----------|-----------|---------|
35
+ | **LLM Providers** | Azure OpenAI, Cohere, DeepSeek, Anthropic, xAI | Diverse model comparison |
36
+ | **Embeddings** | OpenAI, Sentence-Transformers | Vector representations |
37
+ | **Vector DB** | ChromaDB | Local vector storage |
38
+ | **Framework** | LangChain | RAG orchestration |
39
+ | **Storage** | SQLite | Results & metadata |
40
+ | **Backend** (Phase 2) | FastAPI | REST API |
41
+ | **Frontend** (Phase 3) | Streamlit | User interface |
42
+ | **Deployment** (Phase 4) | Hugging Face Spaces | Cloud hosting |
43
+
44
+ ---
45
+
46
+ ## 📁 Project Structure
47
+ rag_optimizer/
48
+ ├── config/
49
+ │ ├── init.py
50
+ │ └── pipeline_configs.py # 6 pipeline configurations
51
+ ├── core/ # [Phase 2] Document processing
52
+ │ ├── init.py
53
+ │ ├── document_loader.py # [Coming next]
54
+ │ ├── chunker.py # [Coming next]
55
+ │ ├── embedder.py # [Coming next]
56
+ │ ├── vector_store.py # [Coming next]
57
+ │ ├── retriever.py # [Coming next]
58
+ │ ├── generator.py # [Coming next]
59
+ │ └── pipeline.py # [Coming next]
60
+ ├── data/
61
+ │ ├── uploads/ # User-uploaded documents
62
+ │ ├── vector_stores/ # ChromaDB storage
63
+ │ └── results.db # SQLite evaluation results
64
+ ├── utils/
65
+ │ ├── init.py
66
+ │ └── database.py # [Phase 3]
67
+ ├── tests/
68
+ │ ├── init.py
69
+ │ └── test_pipeline_config.py # ✅ Tests pass
70
+ ├── .env # Your API keys (DO NOT COMMIT)
71
+ ├── .env.example # Template for .env
72
+ ├── requirements.txt # Python dependencies
73
+ └── README.md # This file
74
+ ------------------------------------------------------------------------------------------------------------------------------
75
+ 🚀 Quick Start
76
+ ------------------------------------------------------------------------------------------------------------------------------
77
+ Open the "rag_optimizer" directory in VS Code
78
+ 1. Installation
79
+ # navigate to project
80
+ python init_project.py
81
+
82
+ # Create virtual environment
83
+ python -m venv venv
84
+
85
+ # Activate virtual environment
86
+ # On Windows:
87
+ .\venv\Scripts\activate
88
+
89
+ # Install dependencies
90
+ pip install -r requirements.txt
91
+
92
+ # Configure keys inside .env.example (I am using Azure AI Foundry for OpenAI, Cohere, DeepSeek)
93
+ a) Configure ENDPOINT, API_KEY, and DEPLOYMENT_NAME for the models served via Azure; for the rest (ANTHROPIC, GROQ) use the API_KEY directly
94
+ b)cp .env.example .env
95
+
96
+ 2. Verify Setup
97
+ # View pipeline comparison
98
+ python config/pipeline_configs.py
99
+
100
+ # Run tests
101
+ python tests/test_pipeline_config.py
102
+
103
+ Last Updated: January 14, 2026
104
+ Project: RAG Pipeline Optimizer
105
+ Phase: 1 of 5
106
+
107
+ # RAG Pipeline Optimizer - Phase 2 Complete ✅
108
+ Phase 2: Core RAG Components
109
+
110
+ Successfully implemented and tested all core components for document processing, embedding generation, and vector storage using LangChain framework.
111
+
112
+ 🎯 Phase 2 Deliverables
113
+ ✅ Document Loader - Multi-format document parsing (PDF, DOCX, TXT, MD, PPTX, XLSX)
114
+ ✅ Text Chunker - LangChain-based chunking with multiple strategies
115
+ ✅ Embedder - Local + Azure OpenAI embeddings
116
+ ✅ Vector Store - ChromaDB with LangChain integration
117
+
118
+ 📁 Files Created
119
+ rag_optimizer/
120
+ ├── core/
121
+ │ ├── __init__.py
122
+ │ ├── document_loader.py ✅ Multi-format document loading
123
+ │ ├── chunker.py ✅ LangChain text splitting
124
+ │ ├── embedder.py ✅ Embedding generation
125
+ │ └── vector_store.py ✅ ChromaDB vector storage
126
+
127
+ ├── data/
128
+ │ ├── uploads/ 📂 User uploaded documents
129
+ │ └── vector_stores/ 📂 Persisted vector databases
130
+
131
+ └── requirements.txt ✅ Updated with LangChain packages
132
+
133
+ 🔧 Components Overview
134
+ 1. Document Loader (core/document_loader.py)
135
+ Purpose: Load and parse documents in multiple formats
136
+
137
+ Supported Formats:
138
+
139
+ PDF (.pdf) - Extracts text with page numbers
140
+ Word (.docx) - Paragraphs and formatting
141
+ Text (.txt) - Plain text files
142
+ Markdown (.md) - Converts to plain text
143
+ PowerPoint (.pptx) - Slide content
144
+ Excel (.xlsx) - Sheet data
145
+
146
+ Key Features:
147
+ Automatic format detection
148
+ Metadata extraction (file size, page count)
149
+ Error handling for corrupted files
150
+ Batch document loading
151
+
152
+ 2. Text Chunker (core/chunker.py)
153
+ Purpose: Split documents into semantic chunks for embedding
154
+
155
+ Framework: LangChain Text Splitters
156
+
157
+ Chunking Strategies:
158
+ | Strategy | Description | Use Case | Quality |
159
+ | ----------- | -------------------------------- | ----------------------------- | ------- |
160
+ | recursive ✅ | Tries \\n\\n → \\n → . → | RECOMMENDED for all pipelines | A+ |
161
+ | character | Simple character-based splitting | Basic documents | B |
162
+ | token | Token-aware splitting | Token-limited models | B |
163
+ | sentence | Sentence boundary detection | Short documents | C |
164
+
165
+ Key Features:
166
+
167
+ Configurable chunk size (tokens)
168
+ Overlap for context preservation
169
+ Clean semantic boundaries
170
+ No fragment generation
171
+
172
+ 3. Embedder (core/embedder.py)
173
+ Purpose: Generate vector embeddings for text chunks
174
+
175
+ Framework: LangChain Embeddings
176
+
177
+ Supported Providers:
178
+ | Provider | Model | Dimension | Cost | Speed | Use Case |
179
+ | --------------------- | ---------------------- | --------- | -------- | ------ | ------------------- |
180
+ | sentence-transformers | all-MiniLM-L6-v2 | 384D | FREE ✅ | Fast | Development/Testing |
181
+ | sentence-transformers | all-mpnet-base-v2 | 768D | FREE ✅ | Medium | Better quality |
182
+ | azure-openai | text-embedding-3-small | 1536D | $0.02/1M | Fast | Production |
183
+ | azure-openai | text-embedding-3-large | 3072D | $0.13/1M | Medium | Highest accuracy |
184
+
185
+ Key Features:
186
+
187
+ Automatic batching for efficiency
188
+ Cosine similarity calculation
189
+ Normalized embeddings
190
+ Local caching (future)
191
+
192
+ 4. Vector Store (core/vector_store.py)
193
+ Purpose: Store and retrieve document chunks using vector similarity
194
+
195
+ Framework: LangChain + ChromaDB
196
+
197
+ Key Features:
198
+ Local persistent storage (no external DB needed)
199
+ Fast similarity search (cosine distance)
200
+ Metadata filtering
201
+ LangChain retriever integration
202
+ Collection management
203
+
204
+ Storage Structure:
205
+ data/vector_stores/
206
+ └── {collection_name}/
207
+ ├── chroma.sqlite3 # Metadata
208
+ └── {uuid}/ # Vector data
209
+ └── data_level0.bin
210
+
211
+ ------------------------------------------------------------------------------------------------------------------------------
212
+ 🚀 Quick Start
213
+ ------------------------------------------------------------------------------------------------------------------------------
214
+ 1).\venv\Scripts\activate
215
+ 2)pip install -r requirements.txt
216
+ 3)python core/document_loader.py
217
+ 4)python core/chunker.py
218
+ 5)python core/embedder.py
219
+ 6)python core/vector_store.py
220
+
221
+ Last Updated: January 14, 2026, 8:38 PM EST
222
+ Project: RAG Pipeline Optimizer
223
+ Phase: 2 of 5
224
+
225
+ #📘 Phase 3 README: Pipeline Orchestration & Parallel Evaluation
226
+ Phase 3 Roadmap (Step-by-Step)
227
+ Step 1: Generator Module ⬅️ START HERE
228
+ Build LLM interface for all 6 pipelines (Azure OpenAI, Cohere, DeepSeek, Claude, Groq)
229
+ Step 2: Retriever Module
230
+ Combine VectorStore + optional reranking (Pipeline B uses Cohere rerank)
231
+ Step 3: Pipeline Orchestrator
232
+ Connect all components: Document → Chunks → Embeddings → Retrieval → Generation
233
+ Step 4: Dataset Integration
234
+ Download wiki_dpr + Natural Questions, load into vector stores
235
+ Step 5: Parallel Execution
236
+ Run all 6 pipelines on same query simultaneously
237
+ Step 6: Evaluation & Results Storage
238
+ SQLite database to store query results, costs, metrics
239
+
240
+ 🎯 Phase 3 Overview
241
+ Phase 3 integrated all core RAG components into a fully functional multi-pipeline evaluation system capable of running 6 different RAG configurations in parallel, comparing their performance, and storing results for analysis.
242
+
243
+ What We Built
244
+ ✅ LLM Generator (core/generator.py) - Multi-provider response generation
245
+ ✅ Smart Retriever (core/retriever.py) - Context retrieval with optional reranking
246
+ ✅ Pipeline Orchestrator (core/pipeline.py) - End-to-end RAG workflow
247
+ ✅ Parallel Evaluator (scripts/run_parallel_evaluation.py) - Simultaneous pipeline execution
248
+ ✅ Analysis Dashboard (scripts/analyze_results.py) - Performance comparison tools
249
+ ✅ Database Schema (data/evaluation_results.db) - SQLite storage for metrics
250
+ ✅ Dataset Integration (scripts/dataset_loader.py) - NQ-Open evaluation dataset
251
+ ✅ Corpus Ingestion (scripts/ingest_corpus.py) - Wikipedia knowledge base
252
+
253
+ 🏗️ Architecture Overview
254
+ ┌─────────────────────────────────────────────────────────────────┐
255
+ │ USER QUERY INPUT │
256
+ └────────────────────┬────────────────────────────────────────────┘
257
+
258
+
259
+ ┌─────────────────────────────────────────────────────────────────┐
260
+ │ PARALLEL PIPELINE EXECUTION (6 Pipelines) │
261
+ │ ┌──────────┬──────────┬──────────┬──────────┬──────────┬─────┐ │
262
+ │ │Pipeline A│Pipeline B│Pipeline C│Pipeline D│Pipeline E│Pipe │ │
263
+ │ │ (Speed) │(Accuracy)│(Balanced)│(Reasoning│ (Cost) │ F │ │
264
+ │ └─────┬────┴────┬─────┴─────┬────┴─────┬────┴─────┬────┴──┬──┘ │
265
+ └────────┼─────────┼───────────┼──────────┼──────────┼───────┼────┘
266
+ │ │ │ │ │ │
267
+ ▼ ▼ ▼ ▼ ▼ ▼
268
+ ┌────────────────────────────────────────────────────────────┐
269
+ │ VECTOR STORE (ChromaDB) │
270
+ │ Retrieves top-k relevant chunks for each pipeline │
271
+ └───────────────────┬────────────────────────────────────────┘
272
+
273
+
274
+ ┌────────────────────────────────────────────────────────────┐
275
+ │ RETRIEVER (with optional reranking) │
276
+ │ • Pipeline B: Cohere reranking (accuracy boost) │
277
+ │ • Others: Direct similarity search │
278
+ └───────────────────┬────────────────────────────────────────┘
279
+
280
+
281
+ ┌────────────────────────────────────────────────────────────┐
282
+ │ GENERATOR │
283
+ │ • Pipeline A: Azure GPT-5 (fast) │
284
+ │ • Pipeline B: Azure GPT-5 (high quality) │
285
+ │ • Pipeline C: Azure Cohere Command │
286
+ │ • Pipeline D: Anthropic Claude (reasoning) │
287
+ │ • Pipeline E: DeepSeek V3.2 (cost-optimized) │
288
+ │ • Pipeline F: Groq Llama (experimental) │
289
+ └───────────────────┬────────────────────────────────────────┘
290
+
291
+
292
+ ┌────────────────────────────────────────────────────────────┐
293
+ │ EVALUATION & METRICS COLLECTION │
294
+ │ • Answer correctness (exact match + fuzzy) │
295
+ │ • Latency tracking (retrieval + generation) │
296
+ │ • Cost calculation (per query) │
297
+ │ • Token usage monitoring │
298
+ └───────────────────┬────────────────────────────────────────┘
299
+
300
+
301
+ ┌────────────────────────────────────────────────────────────┐
302
+ │ SQLite DATABASE (evaluation_results.db) │
303
+ │ Stores: Queries, Answers, Metrics, Timestamps │
304
+ └────────────────────────────────────────────────────────────┘
305
+
306
+
307
+ ┌────────────────────────────────────────────────────────────┐
308
+ │ ANALYSIS DASHBOARD (analyze_results.py) │
309
+ │ • Pipeline comparison │
310
+ │ • Cost efficiency analysis │
311
+ │ • Question difficulty breakdown │
312
+ │ • Excel export for deeper analysis │
313
+ └────────────────────────────────────────────────────────────┘
314
+ 📦 Components Built in Phase 3
315
+ 1. Generator (core/generator.py)
316
+ Purpose: Interface to all LLM providers with unified response handling.
317
+
318
+ Features:
319
+
320
+ ✅ Multi-provider support (Azure OpenAI, Cohere, Claude, DeepSeek, Groq)
321
+
322
+ ✅ Prompt template management
323
+ ✅ Automatic cost calculation
324
+ ✅ Token usage tracking
325
+ ✅ Error handling & retries
326
+ ✅ Response parsing with strict format validation
327
+
328
+ Supported Models:
329
+ AZURE_GPT5 = "gpt-5-chat" # Fast, high quality
330
+ AZURE_COHERE = "cohere-command-a" # Balanced performance
331
+ AZURE_DEEPSEEK = "DeepSeek-V3.2" # Ultra cost-efficient
332
+ ANTHROPIC_CLAUDE= "claude-3-5-sonnet" # Advanced reasoning
333
+ GROQ_LLAMA = "llama-3.3-70b" # Experimental, fast inference
334
+
335
+ 2. Retriever (core/retriever.py)
336
+ Purpose: Fetch relevant context chunks with optional reranking.
337
+
338
+ Features:
339
+
340
+ ✅ Semantic similarity search (ChromaDB)
341
+ ✅ Cohere reranking for Pipeline B (accuracy boost)
342
+ ✅ Configurable top-k retrieval
343
+ ✅ Score normalization
344
+ ✅ Metadata filtering
345
+ ✅ Performance timing
346
+
347
+ Retrieval Strategies:
348
+ | Pipeline | Strategy | Chunks | Reranking | Use Case |
349
+ | -------- | -------- | ------ | --------- | --------------- |
350
+ | A | Fast | 3 | ❌ | Speed-critical |
351
+ | B | Accuracy | 10 | ✅ Cohere | Maximum quality |
352
+ | C-F | Standard | 5-10 | ❌ | General use |
353
+
354
+ 3. Pipeline Orchestrator (core/pipeline.py)
355
+ Purpose: End-to-end RAG workflow coordinator.
356
+
357
+ Features:
358
+
359
+ ✅ Component integration (Embedder → VectorStore → Retriever → Generator)
360
+ ✅ Stage-wise timing (retrieval_time_ms, generation_time_ms, total_time_ms)
361
+ ✅ Cost accumulation
362
+ ✅ Metadata tracking
363
+ ✅ Error recovery
364
+
365
+ Pipeline Flow:
366
+ User Query → Embedding → Vector Search → Rerank (optional) → LLM Generation → Response
367
+ ↓ ↓ ↓ ↓ ↓ ↓
368
+ Timing Timing Timing Timing Timing Total
369
+
370
+
371
+ 4. Parallel Evaluator (scripts/run_parallel_evaluation.py)
372
+ Purpose: Run all 6 pipelines simultaneously on evaluation dataset.
373
+
374
+ Features:
375
+
376
+ ✅ Concurrent execution (ThreadPoolExecutor)
377
+ ✅ Progress tracking (tqdm)
378
+ ✅ Automatic database insertion
379
+ ✅ Error isolation (one pipeline failure doesn't stop others)
380
+ ✅ Answer validation (exact match + fuzzy matching)
381
+ ✅ Run ID tracking for experiment management
382
+
383
+ Performance Metrics Tracked:
384
+
385
+ ✅ Accuracy (answer_found: 0 or 1)
386
+ ✅ Latency (retrieval_time_ms, generation_time_ms, total_time_ms)
387
+ ✅ Cost (generation_cost_usd, total_cost_usd)
388
+ ✅ Token usage (prompt_tokens, completion_tokens, total_tokens)
389
+ ✅ Retrieval quality (num_chunks_retrieved, retrieval_scores)
390
+
391
+ 5. Analysis Dashboard (scripts/analyze_results.py)
392
+ Purpose: Comprehensive evaluation results analysis.
393
+
394
+ Features:
395
+
396
+ ✅ Pipeline performance summary (accuracy, cost, speed)
397
+ ✅ Cost efficiency analysis (cost per correct answer)
398
+ ✅ Time breakdown (retrieval vs generation)
399
+ ✅ Token usage statistics
400
+ ✅ Retrieval quality metrics
401
+ ✅ Difficult questions identification (0% accuracy)
402
+ ✅ Easy questions identification (>66% accuracy)
403
+ ✅ Question-by-question comparison
404
+ ✅ Excel export with 8 detailed sheets
405
+ Usage:
406
+ # View dashboard in terminal
407
+ python scripts/analyze_results.py
408
+
409
+ # Export to Excel
410
+ python scripts/analyze_results.py --export results.xlsx
411
+
412
+ # List all runs
413
+ python scripts/analyze_results.py --list-runs
414
+
415
+
416
+ 6. Database Schema (data/evaluation_results.db)
417
+ Table: evaluation_results
418
+ | Column | Type | Description |
419
+ | -------------------- | ------------------- | --------------------------------------------------- |
420
+ | id | INTEGER PRIMARY KEY | Auto-increment ID |
421
+ | run_id | TEXT | Evaluation run identifier (e.g., "20260117_182253") |
422
+ | pipeline_id | TEXT | Pipeline identifier |
423
+ | pipeline_name | TEXT | Human-readable pipeline name |
424
+ | question_id | TEXT | Question identifier from dataset |
425
+ | query | TEXT | Input question |
426
+ | ground_truth_answers | TEXT | JSON array of correct answers |
427
+ | retrieved_chunks | TEXT | JSON array of context chunks |
428
+ | retrieval_scores | TEXT | JSON array of similarity scores |
429
+ | num_chunks_retrieved | INTEGER | Number of chunks retrieved |
430
+ | retrieval_time_ms | REAL | Time spent on retrieval |
431
+ | reranking_time_ms | REAL | Time spent on reranking (if applicable) |
432
+ | reranked | INTEGER | Whether reranking was used (0 or 1) |
433
+ | generated_answer | TEXT | Model's generated answer |
434
+ | generation_time_ms | REAL | Time spent on generation |
435
+ | prompt_tokens | INTEGER | Input tokens used |
436
+ | completion_tokens | INTEGER | Output tokens generated |
437
+ | total_tokens | INTEGER | Total tokens (prompt + completion) |
438
+ | generation_cost_usd | REAL | Cost of generation |
439
+ | total_cost_usd | REAL | Total query cost |
440
+ | total_time_ms | REAL | End-to-end latency |
441
+ | has_answer | INTEGER | Whether answer is present (1 or 0) |
442
+ | answer_found | INTEGER | Whether answer is correct (1 or 0) |
443
+ | timestamp | TEXT | ISO 8601 timestamp |
444
+
445
+ ------------------------------------------------------------------------------------------------------------------------------
446
+ 🚀 Quick Start
447
+ ------------------------------------------------------------------------------------------------------------------------------
448
+ Prerequisites:
449
+ # Ensure Phase 1 & 2 are complete
450
+ ✅ 6 pipeline configurations defined
451
+ ✅ All API keys in .env
452
+ ✅ ChromaDB vector store populated
453
+ ✅ Wikipedia corpus ingested
454
+
455
+ 1)core/generator.py #LLM response generation
456
+ 2)core/retriever.py #Context retrieval + reranking
457
+ 3)core/pipeline.py #End-to-end orchestration
458
+ 4)utils/dataset_loader.py #Load Natural Questions + Wikipedia Dataset
459
+ 5)scripts/ingest_corpus_selective_pipeline.py (see below) #Ingest Wikipedia Corpus into All 6 Pipelines
460
+ 6)python scripts/run_generic_evaluation.py --num-questions 60 --pipelines A,B,C,D,E,F #Parallel RAG Pipeline Evaluation
461
+ 7)scripts/analyze_results.py #Results dashboard - different types of runs generate different output
462
+
463
+ for big scale dataset:
464
+ 5a)python scripts/ingest_corpus_selective_pipeline.py --pipelines A,C,D,E,F --passages 500000 --batch-size 5000
465
+ 5b)python scripts/ingest_corpus_selective_pipeline.py --pipelines B --passages 500000 --batch-size 1000
466
+
467
+ Last Updated: January 17, 2026, 7:38 PM EST
468
+ Project: RAG Pipeline Optimizer
469
+ Phase: 3 of 5
470
+
471
+ # 📊Phase 4: Advanced Evaluation & Interactive Dashboard
472
+ 🎯 Overview
473
+ Phase 4 delivers a two-part system for advanced RAG pipeline evaluation:
474
+
475
+ Phase 4A: LLM-as-a-Judge evaluation system using GPT-4o to score answer quality across 6 dimensions
476
+ Phase 4B: Full-stack interactive Streamlit dashboard for visualizing and comparing results
477
+
478
+ Together, these provide objective quality scoring and interactive exploration of pipeline performance beyond basic metrics like speed and cost.
479
+
480
+ 📦 Phase 4 Components
481
+ Phase 4A: LLM Judge Evaluation System
482
+ Automated answer quality scoring using GPT-4o as an AI judge
483
+
484
+ Phase 4B: Interactive Dashboard
485
+ 9-page Streamlit application for data exploration and real-time testing
486
+
487
+ 🔬 Phase 4A: LLM Judge Evaluation
488
+ Overview
489
+ Phase 4A adds multi-dimensional quality scoring to existing evaluation results using GPT-4o as an objective judge. Each answer is scored across 6 quality dimensions, providing insights beyond operational metrics.
490
+
491
+ ✨ Features
492
+ 6-Dimensional Quality Scoring
493
+ Correctness (0-10) - Factual accuracy compared to ground truth
494
+ Relevance (0-10) - How well the answer addresses the question
495
+ Completeness (0-10) - Coverage of important information
496
+ Clarity (0-10) - Clear, understandable language
497
+ Conciseness (0-10) - Brevity without sacrificing information
498
+ Overall (0-10) - Weighted average of all dimensions
499
+
500
+ Automated Evaluation
501
+ Evaluates existing Phase 3 results retroactively
502
+ No need to re-run pipelines
503
+ Batch processing with progress tracking
504
+ Results stored in separate database table
505
+
506
+ Cost-Efficient
507
+ Only evaluates answers, not entire pipeline re-runs
508
+ Uses GPT-4o-mini for cost efficiency
509
+ Batches requests to minimize API calls
510
+
511
+ 🏗️ Architecture
512
+ rag_optimizer/
513
+ ├── core/
514
+ │ └── evaluator.py # LLM Judge implementation
515
+ ├── utils/
516
+ │ └── database.py # Database utilities for score storage
517
+ ├── scripts/
518
+ │ └── evaluate_with_judge.py # CLI tool for running evaluations
519
+ └── data/
520
+ └── evaluation_results.db # SQLite (updated schema)
521
+
522
+ 🗄️ Database Schema (Phase 4A Extension)
523
+ New Table: evaluation_scores
524
+ Stores LLM judge quality scores for each evaluation result.
525
+ | Column | Type | Description |
526
+ | -------------------- | ------- | ----------------------------------- |
527
+ | id | INTEGER | Primary key |
528
+ | evaluation_result_id | INTEGER | Foreign key → evaluation_results.id |
529
+ | correctness_score | REAL | Factual accuracy (0-10) |
530
+ | relevance_score | REAL | Question relevance (0-10) |
531
+ | completeness_score | REAL | Information coverage (0-10) |
532
+ | clarity_score | REAL | Language clarity (0-10) |
533
+ | conciseness_score | REAL | Brevity (0-10) |
534
+ | overall_score | REAL | Weighted average (0-10) |
535
+ | judge_reasoning | TEXT | LLM's explanation for scores |
536
+ | timestamp | TEXT | ISO timestamp |
537
+
538
+ Indexes:
539
+
540
+ idx_eval_result on evaluation_result_id
541
+ idx_overall_score on overall_score
542
+
543
+ ------------------------------------------------------------------------------------------------------------------------------
544
+ 🚀 Quick Start
545
+ ------------------------------------------------------------------------------------------------------------------------------
546
+ core/evaluator.py
547
+ utils/database.py
548
+ python scripts/evaluate_with_judge.py --latest --limit 5
549
+
550
+
551
+ 🖥️ Phase 4B: Interactive Dashboard
552
+ Overview
553
+ Full-stack Streamlit dashboard with 9 pages for exploring evaluation results and testing pipelines in real-time.
554
+
555
+ ✨ Features
556
+ 🏠 Home Page
557
+ Project overview and capabilities
558
+ Quick stats (6 pipelines, 5 LLM providers, 500K+ corpus)
559
+ Pipeline configuration cards
560
+ Modern dark theme UI
561
+
562
+ 📊 Pipeline Comparison
563
+ Side-by-side performance metrics
564
+ Quality scores from LLM judge (correctness, relevance, completeness, clarity, conciseness)
565
+ Interactive comparison tables
566
+ Filter by evaluation run
567
+ Sort by accuracy, speed, cost, or quality score
568
+ Multi-dimensional scoring
569
+
570
+ 🔍 Question Explorer
571
+ Browse all evaluated questions
572
+ See how each pipeline answered
573
+ View quality scores per answer
574
+ Compare answers across pipelines
575
+ View retrieved context chunks
576
+ Ground truth validation
577
+
578
+ 💰 Cost Analysis
579
+ Token usage breakdown
580
+ Cost per query analysis
581
+ Cost efficiency rankings
582
+ Cost per quality point (cost divided by overall score)
583
+
584
+ ⚡ Performance Metrics
585
+ Latency analysis (retrieval vs generation)
586
+ Time breakdown by pipeline stage
587
+ Speed comparisons
588
+ Quality-adjusted speed (speed vs quality trade-offs)
589
+
590
+ 🔬 Performance Insights
591
+ Analyze pipeline performance across question types, categories, and difficulty
592
+ Performance by Question Type
593
+ Performance by pipeline
594
+
595
+ 🧪 Live Testing
596
+ Real-time pipeline testing
597
+ Category-based question suggestions
598
+ Multi-pipeline comparison
599
+ Live progress tracking
600
+ Answer quality comparison
601
+ Instant quality scoring (optional)
602
+
603
+ 📦 Batch Evaluation
604
+ Run comprehensive evaluations (5-100 questions)
605
+ Multi-pipeline testing
606
+ Parallel execution (1-6 workers)
607
+ Real-time progress monitoring
608
+ Option to run LLM judge automatically
609
+
610
+ 🏆 Leaderboard
611
+ Overall pipeline rankings
612
+ Quality-weighted rankings
613
+ Multiple sorting options (accuracy, speed, cost, quality)
614
+ Performance badges
615
+
616
+ ## Architecture
617
+ app/
618
+ ├── dashboard.py (main file above)
619
+ ├── pages/
620
+ │ ├── __init__.py
621
+ │ ├── home.py
622
+ │ ├── comparison.py
623
+ │ ├── explorer.py
624
+ │ ├── cost.py
625
+ │ ├── performance.py
626
+ │ ├── testing.py
627
+ │ └── leaderboard.py
628
+ │ └── batch_evaluation.py
629
+ │ └── insights.py
630
+
631
+ app/.streamlit/config.toml
632
+
633
+ ## Run application
634
+ streamlit run app/dashboard.py
635
+
636
+
637
+ Last Updated: January 24, 2026, 7:00 PM EST
638
+ Project: RAG Pipeline Optimizer
639
+ Phase: 4 of 5
docker-compose.yml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ rag-optimizer:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ container_name: rag-optimizer-dashboard
9
+ ports:
10
+ - "8501:8501"
11
+ volumes:
12
+ # Mount data directories for persistence
13
+ - ./data:/app/data
14
+ - ./chroma_db:/app/chroma_db
15
+ - ./logs:/app/logs
16
+ environment:
17
+ # =====================
18
+ # AZURE AI FOUNDRY (Main OpenAI)
19
+ # =====================
20
+ - AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT}
21
+ - AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY}
22
+ - AZURE_OPENAI_DEPLOYMENT_NAME=${AZURE_OPENAI_DEPLOYMENT_NAME}
23
+
24
+ # =====================
25
+ # Azure OpenAI Embeddings
26
+ # =====================
27
+ - AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=${AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME}
28
+ - AZURE_OPENAI_EMBEDDING_MODEL_NAME=${AZURE_OPENAI_EMBEDDING_MODEL_NAME}
29
+ - AZURE_OPENAI_EMBEDDING_ENDPOINT=${AZURE_OPENAI_EMBEDDING_ENDPOINT}
30
+ - AZURE_OPENAI_EMBEDDING_API_KEY=${AZURE_OPENAI_EMBEDDING_API_KEY}
31
+
32
+ # =====================
33
+ # Cohere via Azure AI Foundry
34
+ # =====================
35
+ - AZURE_COHERE_ENDPOINT=${AZURE_COHERE_ENDPOINT}
36
+ - AZURE_COHERE_API_KEY=${AZURE_COHERE_API_KEY}
37
+ - AZURE_COHERE_DEPLOYMENT_NAME=${AZURE_COHERE_DEPLOYMENT_NAME}
38
+
39
+ # =====================
40
+ # Azure Cohere Rerank (for retrieval)
41
+ # =====================
42
+ - AZURE_COHERE_RERANK_MODEL_NAME=${AZURE_COHERE_RERANK_MODEL_NAME}
43
+ - AZURE_COHERE_RERANK_ENDPOINT=${AZURE_COHERE_RERANK_ENDPOINT}
44
+ - AZURE_COHERE_RERANK_KEY=${AZURE_COHERE_RERANK_KEY}
45
+
46
+ # =====================
47
+ # DeepSeek via Azure AI Foundry
48
+ # =====================
49
+ - AZURE_DEEPSEEK_ENDPOINT=${AZURE_DEEPSEEK_ENDPOINT}
50
+ - AZURE_DEEPSEEK_API_KEY=${AZURE_DEEPSEEK_API_KEY}
51
+ - AZURE_DEEPSEEK_DEPLOYMENT_NAME=${AZURE_DEEPSEEK_DEPLOYMENT_NAME}
52
+
53
+ # =====================
54
+ # Anthropic (Direct API)
55
+ # =====================
56
+ - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
57
+
58
+ # =====================
59
+ # Grok (xAI - Direct API; key supplied via GROK_API_KEY)
60
+ # =====================
61
+ - GROK_API_KEY=${GROK_API_KEY}
62
+
63
+ # =====================
64
+ # Database Configuration
65
+ # =====================
66
+ - DATABASE_URL=${DATABASE_URL:-sqlite:///./data/results.db}
67
+
68
+ # =====================
69
+ # ChromaDB Configuration
70
+ # =====================
71
+ - CHROMA_PERSIST_DIR=${CHROMA_PERSIST_DIR:-./data/vector_stores}
72
+
73
+ # =====================
74
+ # Streamlit Configuration
75
+ # =====================
76
+ - STREAMLIT_SERVER_PORT=8501
77
+ - STREAMLIT_SERVER_ADDRESS=0.0.0.0
78
+ - STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
79
+
80
+ restart: unless-stopped
81
+
82
+ healthcheck:
83
+ test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
84
+ interval: 30s
85
+ timeout: 10s
86
+ retries: 3
87
+ start_period: 40s
88
+
89
+ networks:
90
+ - rag-network
91
+
92
+ networks:
93
+ rag-network:
94
+ driver: bridge
init_project.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ PROJECT_ROOT = "."
4
+
5
+ DIRS = [
6
+ f"{PROJECT_ROOT}/core",
7
+ f"{PROJECT_ROOT}/config",
8
+ f"{PROJECT_ROOT}/data/uploads",
9
+ f"{PROJECT_ROOT}/data/vector_stores",
10
+ f"{PROJECT_ROOT}/utils",
11
+ f"{PROJECT_ROOT}/tests",
12
+ ]
13
+
14
+ TEST_PIPELINE = """\
15
+ import pytest
16
+ from config.pipeline_configs import ALL_PIPELINES
17
+
18
+ def test_pipeline_registry():
19
+ assert len(ALL_PIPELINES) >= 4
20
+ for key, cfg in ALL_PIPELINES.items():
21
+ assert cfg.chunk_size > 0
22
+ assert cfg.top_k > 0
23
+ """
24
+
25
+ if __name__ == "__main__":
26
+ for d in DIRS:
27
+ os.makedirs(d, exist_ok=True)
28
+ # __init__.py for packages
29
+ if "data" not in d:
30
+ init_path = os.path.join(d, "__init__.py")
31
+ if not os.path.exists(init_path):
32
+ open(init_path, "w").close()
33
+
34
+ with open(f"{PROJECT_ROOT}/tests/test_pipeline.py", "w") as f:
35
+ f.write(TEST_PIPELINE)
36
+
37
+ print("✅ Phase 1 project skeleton created.")
38
+ print("Next steps:")
39
+ print("1. cd rag_optimizer")
40
+ print("2. python -m venv venv")
41
+ print("3. source venv/bin/activate # or venv\\\\Scripts\\\\activate on Windows")
42
+ print("4. pip install -r requirements.txt")
43
+ print("5. cp .env.example .env && fill your keys")
requirements.txt ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Framework
2
+ langchain
3
+ langchain-huggingface
4
+ langchain-openai
5
+ langchain-cohere
6
+ langchain-text-splitters
7
+ langchain-chroma
8
+
9
+ datasets
10
+ hf_xet
11
+ # Vector Database
12
+ chromadb
13
+
14
+ # Embeddings
15
+ sentence-transformers
16
+ openai
17
+ cohere
18
+
19
+ # LLM
20
+ anthropic
21
+
22
+ # Document Loading
23
+ pypdf
24
+ python-docx
25
+ python-pptx
26
+ openpyxl
27
+ markdown
28
+ beautifulsoup4
29
+
30
+ # Utils
31
+ python-dotenv
32
+ pydantic
33
+ tqdm
34
+ pandas
35
+
36
+ # Storage
37
+ sqlalchemy
38
+
39
+ # API (Phase 2)
40
+ fastapi
41
+ uvicorn
42
+ python-multipart
43
+
44
+ # Frontend (Phase 2)
45
+ streamlit
46
+ plotly
results.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4342adb5aa2740a5e25d2d03abb96d1ab7becdd9c74698c7276f1f7ce8dcd3fd
3
+ size 101826