init
Browse files- .dockerignore +0 -59
- ARCHITECTURE.md +0 -259
- Dockerfile +0 -55
- PROJECT_KNOWLEDGE.md +0 -473
- README.md +0 -195
- docker-compose.yml +0 -43
- ingest_pdfs.py +0 -87
- notebooks/llm_benchmark.ipynb +901 -0
- notebooks/rag_optimization_benchmark.ipynb +1367 -0
- notebooks/requirements_llm_benchmark.txt +27 -0
- notebooks/requirements_rag_optimization.txt +27 -0
- notebooks/requirements_vlm_ocr.txt +24 -0
- notebooks/vlm_ocr_benchmark.ipynb +891 -0
- requirements.txt +0 -30
- run.py +0 -18
- src/__init__.py +0 -0
- src/api/__init__.py +0 -0
- src/api/main.py +0 -181
- src/api/models.py +0 -48
- src/config.py +0 -46
- src/llm/__init__.py +0 -0
- src/llm/deepseek_client.py +0 -126
- src/llm/rag_pipeline.py +0 -154
- src/ocr/__init__.py +0 -0
- src/ocr/azure_ocr.py +0 -143
- src/ocr/processor.py +0 -62
- src/vectordb/__init__.py +0 -16
- src/vectordb/chroma_store.py +0 -150
- src/vectordb/pinecone_store.py +0 -176
- start.sh +0 -81
- test_complete_system.py +0 -128
.dockerignore
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
# Python
|
| 2 |
-
__pycache__/
|
| 3 |
-
*.py[cod]
|
| 4 |
-
*$py.class
|
| 5 |
-
*.so
|
| 6 |
-
.Python
|
| 7 |
-
*.egg-info/
|
| 8 |
-
dist/
|
| 9 |
-
build/
|
| 10 |
-
*.egg
|
| 11 |
-
|
| 12 |
-
# Virtual environments
|
| 13 |
-
venv/
|
| 14 |
-
env/
|
| 15 |
-
ENV/
|
| 16 |
-
|
| 17 |
-
# IDE
|
| 18 |
-
.vscode/
|
| 19 |
-
.idea/
|
| 20 |
-
*.swp
|
| 21 |
-
*.swo
|
| 22 |
-
|
| 23 |
-
# Git
|
| 24 |
-
.git/
|
| 25 |
-
.gitignore
|
| 26 |
-
|
| 27 |
-
# Environment
|
| 28 |
-
.env
|
| 29 |
-
.env.*
|
| 30 |
-
|
| 31 |
-
# Testing
|
| 32 |
-
.pytest_cache/
|
| 33 |
-
.coverage
|
| 34 |
-
htmlcov/
|
| 35 |
-
|
| 36 |
-
# Documentation
|
| 37 |
-
docs/
|
| 38 |
-
*.md
|
| 39 |
-
!README.md
|
| 40 |
-
|
| 41 |
-
# Data (can be mounted as volumes)
|
| 42 |
-
data/pdfs/*
|
| 43 |
-
data/vector_db/*
|
| 44 |
-
data/processed/*
|
| 45 |
-
|
| 46 |
-
# Test files
|
| 47 |
-
test_*.py
|
| 48 |
-
*_test.py
|
| 49 |
-
|
| 50 |
-
# Logs
|
| 51 |
-
*.log
|
| 52 |
-
|
| 53 |
-
# OS
|
| 54 |
-
.DS_Store
|
| 55 |
-
Thumbs.db
|
| 56 |
-
|
| 57 |
-
# Temporary files
|
| 58 |
-
*.tmp
|
| 59 |
-
*.bak
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ARCHITECTURE.md
DELETED
|
@@ -1,259 +0,0 @@
|
|
| 1 |
-
# SOCAR Document Processing System - Architecture
|
| 2 |
-
|
| 3 |
-
## System Components
|
| 4 |
-
|
| 5 |
-
### 1. OCR (Optical Character Recognition)
|
| 6 |
-
**Model**: Azure Document Intelligence (`prebuilt-read`)
|
| 7 |
-
**NOT** Llama-4-Maverick (LLM is for answer generation, not OCR)
|
| 8 |
-
|
| 9 |
-
**Performance**:
|
| 10 |
-
- ✅ **92.79% Character Success Rate (CSR)**
|
| 11 |
-
- ✅ **55.59% Word Success Rate (WSR)**
|
| 12 |
-
- ✅ Significantly better than Tesseract (25% CSR, 21% WSR)
|
| 13 |
-
|
| 14 |
-
**Features**:
|
| 15 |
-
- ✅ Multi-language support (Azerbaijani, Russian, English)
|
| 16 |
-
- ✅ **Cyrillic alphabet PRESERVED** (Russian text stays in Cyrillic as-is)
|
| 17 |
-
- ✅ **Image detection** (lightweight references via PyMuPDF)
|
| 18 |
-
- ✅ Handwriting recognition
|
| 19 |
-
- ✅ Table detection
|
| 20 |
-
|
| 21 |
-
**Location**: `src/ocr/azure_ocr.py`
|
| 22 |
-
|
| 23 |
-
**Output Format**:
|
| 24 |
-
```json
|
| 25 |
-
[
|
| 26 |
-
{
|
| 27 |
-
"page_number": 1,
|
| 28 |
-
"MD_text": "Text content with image references...\n\n\n\n"
|
| 29 |
-
}
|
| 30 |
-
]
|
| 31 |
-
```
|
| 32 |
-
|
| 33 |
-
**Image Handling**:
|
| 34 |
-
- Images detected and referenced inline in MD_text (not saved to disk)
|
| 35 |
-
- Format: ``
|
| 36 |
-
- Simple lightweight references, no base64, no file storage
|
| 37 |
-
- No separate "images" field - everything in MD_text
|
| 38 |
-
- Only adds image markdown when images actually exist on page
|
| 39 |
-
|
| 40 |
-
---
|
| 41 |
-
|
| 42 |
-
### 2. Embeddings (Vector Database Ingestion)
|
| 43 |
-
**Model**: `BAAI/bge-large-en-v1.5`
|
| 44 |
-
**Dimensions**: 1024 (matches Pinecone index)
|
| 45 |
-
**Purpose**: Converts text chunks into numerical vectors for semantic search
|
| 46 |
-
|
| 47 |
-
**Location**: `src/vectordb/pinecone_store.py:28`
|
| 48 |
-
|
| 49 |
-
```python
|
| 50 |
-
self.embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
|
| 51 |
-
self.embedding_dimension = 1024
|
| 52 |
-
```
|
| 53 |
-
|
| 54 |
-
**Process**:
|
| 55 |
-
1. Text chunks (600 chars, 100 overlap) → Embeddings (1024-dim vectors)
|
| 56 |
-
2. Upload to Pinecone cloud vector database
|
| 57 |
-
3. Enable semantic search (finds similar content)
|
| 58 |
-
|
| 59 |
-
---
|
| 60 |
-
|
| 61 |
-
### 3. Vector Database
|
| 62 |
-
**Service**: Pinecone (Cloud)
|
| 63 |
-
**Index**: `hackathon`
|
| 64 |
-
**Configuration**:
|
| 65 |
-
- Dimensions: 1024
|
| 66 |
-
- Metric: Cosine similarity
|
| 67 |
-
- Cloud: AWS us-east-1
|
| 68 |
-
- Type: Dense vectors
|
| 69 |
-
- Capacity: On-demand
|
| 70 |
-
|
| 71 |
-
**Location**: `src/vectordb/pinecone_store.py`
|
| 72 |
-
|
| 73 |
-
**Stats**:
|
| 74 |
-
- Total Documents: 1,241 chunks
|
| 75 |
-
- Source PDFs: 28 documents
|
| 76 |
-
- Embedding Model: BAAI/bge-large-en-v1.5
|
| 77 |
-
|
| 78 |
-
---
|
| 79 |
-
|
| 80 |
-
### 4. LLM (Answer Generation)
|
| 81 |
-
**Model**: `Llama-4-Maverick-17B-128E-Instruct-FP8` (Open-source)
|
| 82 |
-
**Purpose**: Generates contextual answers based on retrieved documents
|
| 83 |
-
**Provider**: Azure OpenAI
|
| 84 |
-
|
| 85 |
-
**Location**: `src/config.py:31`
|
| 86 |
-
|
| 87 |
-
```python
|
| 88 |
-
llm_model = "Llama-4-Maverick-17B-128E-Instruct-FP8"
|
| 89 |
-
```
|
| 90 |
-
|
| 91 |
-
**Parameters**:
|
| 92 |
-
- Temperature: 0.2 (deterministic, factual answers)
|
| 93 |
-
- Max Tokens: 1000
|
| 94 |
-
- Top-k Documents: 3 (from Pinecone)
|
| 95 |
-
|
| 96 |
-
---
|
| 97 |
-
|
| 98 |
-
## RAG Pipeline Flow
|
| 99 |
-
|
| 100 |
-
```
|
| 101 |
-
User Query
|
| 102 |
-
↓
|
| 103 |
-
1. Generate Query Embedding (BAAI/bge-large-en-v1.5)
|
| 104 |
-
↓
|
| 105 |
-
2. Search Pinecone (Cosine similarity, top 3 docs)
|
| 106 |
-
↓
|
| 107 |
-
3. Retrieve Relevant Chunks
|
| 108 |
-
↓
|
| 109 |
-
4. Build Context (3 documents × 600 chars)
|
| 110 |
-
↓
|
| 111 |
-
5. LLM Generation (Llama-4-Maverick-17B)
|
| 112 |
-
↓
|
| 113 |
-
Response with Citations
|
| 114 |
-
```
|
| 115 |
-
|
| 116 |
-
**Average Response Time**: 6.7 seconds
|
| 117 |
-
- Embedding generation: ~0.5s
|
| 118 |
-
- Pinecone search: ~1.0s
|
| 119 |
-
- LLM generation: ~3.2s
|
| 120 |
-
- Network overhead: ~2.0s
|
| 121 |
-
|
| 122 |
-
---
|
| 123 |
-
|
| 124 |
-
## Cyrillic Support
|
| 125 |
-
|
| 126 |
-
**OCR Output** (Cyrillic preserved):
|
| 127 |
-
```markdown
|
| 128 |
-
# Добыча нефти в Азербайджане
|
| 129 |
-
|
| 130 |
-
Южно-Каспийский бассейн...
|
| 131 |
-
```
|
| 132 |
-
|
| 133 |
-
**Chatbot** (Optional Azerbaijani conversion):
|
| 134 |
-
- OCR: **Cyrillic preserved** (requirement)
|
| 135 |
-
- Chatbot answers: Can be Azerbaijani alphabet (your choice)
|
| 136 |
-
|
| 137 |
-
---
|
| 138 |
-
|
| 139 |
-
## Image Detection
|
| 140 |
-
|
| 141 |
-
**Method**: PyMuPDF (fitz)
|
| 142 |
-
**Format**: Simple text references (not saved to disk)
|
| 143 |
-
**Included in**: OCR endpoint `/ocr`
|
| 144 |
-
|
| 145 |
-
**Example Response**:
|
| 146 |
-
```json
|
| 147 |
-
[
|
| 148 |
-
{
|
| 149 |
-
"page_number": 1,
|
| 150 |
-
"MD_text": "Oil exploration map...\n\n\n\n"
|
| 151 |
-
}
|
| 152 |
-
]
|
| 153 |
-
```
|
| 154 |
-
|
| 155 |
-
**Key Features**:
|
| 156 |
-
- Lightweight: No file storage, no base64 encoding
|
| 157 |
-
- Smart: Only adds image markdown when images exist
|
| 158 |
-
- Clean: Simple references like `document_page_1_image_1`
|
| 159 |
-
|
| 160 |
-
---
|
| 161 |
-
|
| 162 |
-
## API Endpoints
|
| 163 |
-
|
| 164 |
-
### 1. `POST /ocr` - PDF Processing
|
| 165 |
-
Extract text and detect images from PDF
|
| 166 |
-
|
| 167 |
-
**Request**:
|
| 168 |
-
```bash
|
| 169 |
-
curl -X POST http://localhost:8000/ocr \
|
| 170 |
-
-F "file=@document.pdf"
|
| 171 |
-
```
|
| 172 |
-
|
| 173 |
-
**Response**:
|
| 174 |
-
```json
|
| 175 |
-
[
|
| 176 |
-
{
|
| 177 |
-
"page_number": 1,
|
| 178 |
-
"MD_text": "Нефтяные месторождения...\n\n\n\n"
|
| 179 |
-
}
|
| 180 |
-
]
|
| 181 |
-
```
|
| 182 |
-
|
| 183 |
-
### 2. `POST /llm` - RAG Chatbot
|
| 184 |
-
Ask questions about documents
|
| 185 |
-
|
| 186 |
-
**Request**:
|
| 187 |
-
```bash
|
| 188 |
-
curl -X POST http://localhost:8000/llm \
|
| 189 |
-
-H "Content-Type: application/json" \
|
| 190 |
-
-d '[{"role": "user", "content": "What is SOCAR?"}]'
|
| 191 |
-
```
|
| 192 |
-
|
| 193 |
-
**Response**:
|
| 194 |
-
```json
|
| 195 |
-
{
|
| 196 |
-
"answer": "SOCAR (State Oil Company of Azerbaijan Republic)...",
|
| 197 |
-
"sources": [
|
| 198 |
-
{
|
| 199 |
-
"pdf_name": "document_06.pdf",
|
| 200 |
-
"page_number": 3,
|
| 201 |
-
"content": "SOCAR operates in..."
|
| 202 |
-
}
|
| 203 |
-
]
|
| 204 |
-
}
|
| 205 |
-
```
|
| 206 |
-
|
| 207 |
-
---
|
| 208 |
-
|
| 209 |
-
## Model Summary
|
| 210 |
-
|
| 211 |
-
| Component | Model/Service | Purpose |
|
| 212 |
-
|-----------|--------------|---------|
|
| 213 |
-
| **OCR** | Azure Document Intelligence | Extract text + detect images (92.79% CSR, Cyrillic preserved) |
|
| 214 |
-
| **Embeddings** | BAAI/bge-large-en-v1.5 (1024-dim) | Convert text → vectors for search |
|
| 215 |
-
| **Vector DB** | Pinecone (AWS us-east-1) | Store & search 1,241 document chunks |
|
| 216 |
-
| **LLM** | Llama-4-Maverick-17B-128E-Instruct-FP8 | Generate contextual answers (Open-source) |
|
| 217 |
-
|
| 218 |
-
---
|
| 219 |
-
|
| 220 |
-
## Scoring Criteria Optimization
|
| 221 |
-
|
| 222 |
-
### OCR Quality (50%)
|
| 223 |
-
- ✅ Multi-language (Azerbaijani, Russian, English)
|
| 224 |
-
- ✅ **High accuracy: 92.79% CSR, 55.59% WSR**
|
| 225 |
-
- ✅ Cyrillic preservation (as required)
|
| 226 |
-
- ✅ Image detection (lightweight references)
|
| 227 |
-
- ✅ Azure Document Intelligence (enterprise-grade OCR)
|
| 228 |
-
|
| 229 |
-
### LLM Quality (30%)
|
| 230 |
-
- ✅ **Llama-4-Maverick-17B-128E-Instruct-FP8** (Open-source model)
|
| 231 |
-
- ✅ RAG with 3-document retrieval
|
| 232 |
-
- ✅ Source citations
|
| 233 |
-
- ✅ Contextual accuracy
|
| 234 |
-
|
| 235 |
-
### Architecture (20%)
|
| 236 |
-
- ✅ Cloud vector database (Pinecone)
|
| 237 |
-
- ✅ Production-ready (Docker, FastAPI)
|
| 238 |
-
- ✅ Open-source LLM (Llama-4-Maverick)
|
| 239 |
-
- ✅ Modern stack (Pinecone, sentence-transformers)
|
| 240 |
-
|
| 241 |
-
---
|
| 242 |
-
|
| 243 |
-
## Public Access
|
| 244 |
-
|
| 245 |
-
**ngrok URL**: `https://healthy-carolin-noncontagiously.ngrok-free.dev`
|
| 246 |
-
|
| 247 |
-
**Test Query**:
|
| 248 |
-
```bash
|
| 249 |
-
curl -X POST https://healthy-carolin-noncontagiously.ngrok-free.dev/llm \
|
| 250 |
-
-H "Content-Type: application/json" \
|
| 251 |
-
-d '[{"role": "user", "content": "Что такое СОКАР?"}]'
|
| 252 |
-
```
|
| 253 |
-
**Expected**: Answer in Russian/Azerbaijani with source citations
|
| 254 |
-
|
| 255 |
-
---
|
| 256 |
-
|
| 257 |
-
## Repository
|
| 258 |
-
**GitHub**: https://github.com/Ismat-Samadov/SOCAR_Hackathon
|
| 259 |
-
**Stack**: Python 3.10, FastAPI, Pinecone, Azure AI, Docker
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
DELETED
|
@@ -1,55 +0,0 @@
|
|
| 1 |
-
# Multi-stage build for optimized Docker image
|
| 2 |
-
FROM python:3.11-slim as builder
|
| 3 |
-
|
| 4 |
-
# Set working directory
|
| 5 |
-
WORKDIR /app
|
| 6 |
-
|
| 7 |
-
# Install system dependencies
|
| 8 |
-
RUN apt-get update && apt-get install -y \
|
| 9 |
-
build-essential \
|
| 10 |
-
curl \
|
| 11 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
-
|
| 13 |
-
# Copy requirements first for better caching
|
| 14 |
-
COPY requirements.txt .
|
| 15 |
-
|
| 16 |
-
# Install Python dependencies
|
| 17 |
-
RUN pip install --no-cache-dir --upgrade pip && \
|
| 18 |
-
pip install --no-cache-dir -r requirements.txt
|
| 19 |
-
|
| 20 |
-
# Final stage
|
| 21 |
-
FROM python:3.11-slim
|
| 22 |
-
|
| 23 |
-
# Set working directory
|
| 24 |
-
WORKDIR /app
|
| 25 |
-
|
| 26 |
-
# Install runtime dependencies only
|
| 27 |
-
RUN apt-get update && apt-get install -y \
|
| 28 |
-
curl \
|
| 29 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 30 |
-
|
| 31 |
-
# Copy Python packages from builder
|
| 32 |
-
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
|
| 33 |
-
COPY --from=builder /usr/local/bin /usr/local/bin
|
| 34 |
-
|
| 35 |
-
# Copy application code
|
| 36 |
-
COPY src/ ./src/
|
| 37 |
-
COPY run.py .
|
| 38 |
-
|
| 39 |
-
# Create directories for data
|
| 40 |
-
RUN mkdir -p data/pdfs data/vector_db data/processed
|
| 41 |
-
|
| 42 |
-
# Expose port
|
| 43 |
-
EXPOSE 8000
|
| 44 |
-
|
| 45 |
-
# Set environment variables
|
| 46 |
-
ENV PYTHONUNBUFFERED=1
|
| 47 |
-
ENV TOKENIZERS_PARALLELISM=false
|
| 48 |
-
ENV ANONYMIZED_TELEMETRY=false
|
| 49 |
-
|
| 50 |
-
# Health check
|
| 51 |
-
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
|
| 52 |
-
CMD curl -f http://localhost:8000/ || exit 1
|
| 53 |
-
|
| 54 |
-
# Run the application
|
| 55 |
-
CMD ["python", "run.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PROJECT_KNOWLEDGE.md
DELETED
|
@@ -1,473 +0,0 @@
|
|
| 1 |
-
# SOCAR Hackathon Project - Essential Knowledge Base
|
| 2 |
-
|
| 3 |
-
## Project Overview
|
| 4 |
-
**Purpose**: Document processing system for SOCAR historical oil & gas documents
|
| 5 |
-
**Goal**: OCR + RAG chatbot system for hackathon submission
|
| 6 |
-
**Scoring**: OCR Quality (50%) + LLM Quality (30%) + Architecture (20%)
|
| 7 |
-
|
| 8 |
-
---
|
| 9 |
-
|
| 10 |
-
## Critical Requirements
|
| 11 |
-
|
| 12 |
-
### API Endpoints (Must Have)
|
| 13 |
-
1. **POST /ocr** - PDF text extraction with image detection
|
| 14 |
-
2. **POST /llm** - RAG-based question answering
|
| 15 |
-
|
| 16 |
-
### Response Formats (Exact Specifications)
|
| 17 |
-
|
| 18 |
-
#### /ocr Response
|
| 19 |
-
```json
|
| 20 |
-
[
|
| 21 |
-
{
|
| 22 |
-
"page_number": 1,
|
| 23 |
-
"MD_text": "Extracted text content...\n\n\n\n"
|
| 24 |
-
}
|
| 25 |
-
]
|
| 26 |
-
```
|
| 27 |
-
|
| 28 |
-
**Key Points**:
|
| 29 |
-
- List of dictionaries (NOT nested object with "pages" field)
|
| 30 |
-
- Only two keys: `page_number` (int) and `MD_text` (str)
|
| 31 |
-
- Images referenced inline in MD_text as markdown path: ``
|
| 32 |
-
- Path format: `{pdf_filename}/{page_folder}/{image_name}`
|
| 33 |
-
- NO separate "images" field
|
| 34 |
-
- NO base64 encoding - just simple path-like text references
|
| 35 |
-
- Only add image markdown when images actually exist on page
|
| 36 |
-
|
| 37 |
-
#### /llm Response
|
| 38 |
-
```json
|
| 39 |
-
{
|
| 40 |
-
"answer": "Response text...",
|
| 41 |
-
"sources": [
|
| 42 |
-
{
|
| 43 |
-
"pdf_name": "document_06.pdf",
|
| 44 |
-
"page_number": 3,
|
| 45 |
-
"content": "Relevant excerpt..."
|
| 46 |
-
}
|
| 47 |
-
]
|
| 48 |
-
}
|
| 49 |
-
```
|
| 50 |
-
|
| 51 |
-
---
|
| 52 |
-
|
| 53 |
-
## Technology Stack
|
| 54 |
-
|
| 55 |
-
### 1. OCR (50% of score)
|
| 56 |
-
**Model**: Azure Document Intelligence API
|
| 57 |
-
- Endpoint: `prebuilt-read`
|
| 58 |
-
- Provider: Azure AI Services
|
| 59 |
-
- Performance: 92.79% CSR, 55.59% WSR
|
| 60 |
-
- Features: Multi-language (Azerbaijani, Russian, English), Cyrillic preservation, handwriting recognition
|
| 61 |
-
|
| 62 |
-
**Why Azure**:
|
| 63 |
-
- Tesseract achieved only 25% CSR (4x worse)
|
| 64 |
-
- Enterprise-grade accuracy
|
| 65 |
-
- Native Cyrillic support
|
| 66 |
-
|
| 67 |
-
**Configuration**:
|
| 68 |
-
```python
|
| 69 |
-
from azure.ai.formrecognizer import DocumentAnalysisClient
|
| 70 |
-
from azure.core.credentials import AzureKeyCredential
|
| 71 |
-
|
| 72 |
-
client = DocumentAnalysisClient(
|
| 73 |
-
endpoint=azure_openai_endpoint,
|
| 74 |
-
credential=AzureKeyCredential(api_key)
|
| 75 |
-
)
|
| 76 |
-
```
|
| 77 |
-
|
| 78 |
-
### 2. Embeddings
|
| 79 |
-
**Model**: `BAAI/bge-large-en-v1.5`
|
| 80 |
-
- Dimensions: 1024
|
| 81 |
-
- Library: sentence-transformers
|
| 82 |
-
- Purpose: Convert text chunks to vectors for semantic search
|
| 83 |
-
|
| 84 |
-
### 3. Vector Database
|
| 85 |
-
**Service**: Pinecone Cloud
|
| 86 |
-
- Index name: Configurable (e.g., "socar-documents")
|
| 87 |
-
- Dimensions: 1024 (must match embedding model)
|
| 88 |
-
- Metric: Cosine similarity
|
| 89 |
-
- Region: AWS us-east-1
|
| 90 |
-
|
| 91 |
-
**Alternative**: ChromaDB (local, good for development)
|
| 92 |
-
|
| 93 |
-
### 4. LLM (30% of score)
|
| 94 |
-
**Model**: `Llama-4-Maverick-17B-128E-Instruct-FP8`
|
| 95 |
-
- Provider: Azure OpenAI (or compatible endpoint)
|
| 96 |
-
- Parameters:
|
| 97 |
-
- Temperature: 0.2 (factual, deterministic)
|
| 98 |
-
- Max Tokens: 1000
|
| 99 |
-
- Top-k documents: 3
|
| 100 |
-
- **Important**: Use this exact model name for open-source architecture points
|
| 101 |
-
|
| 102 |
-
### 5. Web Framework
|
| 103 |
-
**Framework**: FastAPI
|
| 104 |
-
- Async support
|
| 105 |
-
- Auto-generated docs at /docs
|
| 106 |
-
- File upload support (multipart/form-data)
|
| 107 |
-
|
| 108 |
-
### 6. Image Extraction
|
| 109 |
-
**Library**: PyMuPDF (fitz)
|
| 110 |
-
- Purpose: Detect images in PDFs
|
| 111 |
-
- Method: `page.get_images()`
|
| 112 |
-
- **Important**: Don't save images to disk, just reference them
|
| 113 |
-
|
| 114 |
-
---
|
| 115 |
-
|
| 116 |
-
## Environment Variables (.env)
|
| 117 |
-
|
| 118 |
-
```bash
|
| 119 |
-
# Azure AI Services
|
| 120 |
-
AZURE_OPENAI_API_KEY=your_key_here
|
| 121 |
-
AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
|
| 122 |
-
AZURE_OPENAI_API_VERSION=2024-08-01-preview
|
| 123 |
-
|
| 124 |
-
# Pinecone (if using)
|
| 125 |
-
PINECONE_API_KEY=your_key_here
|
| 126 |
-
PINECONE_INDEX_NAME=socar-documents
|
| 127 |
-
PINECONE_CLOUD=aws
|
| 128 |
-
PINECONE_REGION=us-east-1
|
| 129 |
-
|
| 130 |
-
# Application
|
| 131 |
-
API_HOST=0.0.0.0
|
| 132 |
-
API_PORT=8000
|
| 133 |
-
```
|
| 134 |
-
|
| 135 |
-
---
|
| 136 |
-
|
| 137 |
-
## RAG Pipeline Architecture
|
| 138 |
-
|
| 139 |
-
```
|
| 140 |
-
User Query
|
| 141 |
-
↓
|
| 142 |
-
1. Embed Query (BAAI/bge-large-en-v1.5)
|
| 143 |
-
↓
|
| 144 |
-
2. Search Vector DB (Cosine similarity, top 3 docs)
|
| 145 |
-
↓
|
| 146 |
-
3. Retrieve Relevant Chunks
|
| 147 |
-
↓
|
| 148 |
-
4. Build Context (3 documents × 600 chars)
|
| 149 |
-
↓
|
| 150 |
-
5. LLM Generation (Llama-4-Maverick-17B)
|
| 151 |
-
↓
|
| 152 |
-
Response with Citations
|
| 153 |
-
```
|
| 154 |
-
|
| 155 |
-
### Chunking Strategy
|
| 156 |
-
- Chunk size: 600 characters
|
| 157 |
-
- Overlap: 100 characters
|
| 158 |
-
- Preserves context across chunks
|
| 159 |
-
- **Important**: Only store extracted TEXT in vector database, NOT image markdown references
|
| 160 |
-
- Strip out image markdown (``) before ingestion
|
| 161 |
-
- Images are only for OCR endpoint response, not for RAG
|
| 162 |
-
|
| 163 |
-
---
|
| 164 |
-
|
| 165 |
-
## Key Features & Requirements
|
| 166 |
-
|
| 167 |
-
### Cyrillic Support
|
| 168 |
-
- **Critical**: OCR must preserve Cyrillic alphabet (Russian text stays in Cyrillic)
|
| 169 |
-
- Don't transliterate Russian → Latin
|
| 170 |
-
- Azure Document Intelligence handles this natively
|
| 171 |
-
|
| 172 |
-
### Image Handling
|
| 173 |
-
- **Method**: Detect images using PyMuPDF
|
| 174 |
-
- **Format**: Path-like references: `document_name.pdf/page_X/image_Y`
|
| 175 |
-
- **Markdown**: ``
|
| 176 |
-
- **Path Structure**: `{pdf_filename}/{page_folder}/{image_name}`
|
| 177 |
-
- **Important**:
|
| 178 |
-
- NO file saving
|
| 179 |
-
- NO base64 encoding
|
| 180 |
-
- Only add markdown when images exist
|
| 181 |
-
- Check `if image_list:` before adding
|
| 182 |
-
- Include full PDF filename in path
|
| 183 |
-
|
| 184 |
-
### OCR vs Vector Database (Critical Distinction)
|
| 185 |
-
**OCR Endpoint (`/ocr`)**:
|
| 186 |
-
- Returns text WITH image markdown references
|
| 187 |
-
- Format: ``
|
| 188 |
-
- Purpose: Show complete document structure to user
|
| 189 |
-
|
| 190 |
-
**Vector Database Ingestion**:
|
| 191 |
-
- Store ONLY text content, NO image references
|
| 192 |
-
- Strip out all `` markdown before adding to vector DB
|
| 193 |
-
- Purpose: Enable semantic search on text only
|
| 194 |
-
- Image markdown would pollute search results
|
| 195 |
-
|
| 196 |
-
**Example**:
|
| 197 |
-
```python
|
| 198 |
-
# OCR Response (with images)
|
| 199 |
-
MD_text = "Oil reserves...\n\n\n\nTotal production..."
|
| 200 |
-
|
| 201 |
-
# Vector DB ingestion (text only)
|
| 202 |
-
import re
|
| 203 |
-
text_only = re.sub(r'!\[Image\]\([^)]+\)', '', MD_text)
|
| 204 |
-
# Result: "Oil reserves...\n\nTotal production..."
|
| 205 |
-
vector_db.add(text_only)
|
| 206 |
-
```
|
| 207 |
-
|
| 208 |
-
### Public Access
|
| 209 |
-
- Use ngrok for public endpoint
|
| 210 |
-
- Command: `ngrok http 8000`
|
| 211 |
-
- Free tier is sufficient
|
| 212 |
-
- Save the URL for hackathon submission
|
| 213 |
-
|
| 214 |
-
---
|
| 215 |
-
|
| 216 |
-
## Project Structure
|
| 217 |
-
|
| 218 |
-
```
|
| 219 |
-
SOCAR_Hackathon/
|
| 220 |
-
├── src/
|
| 221 |
-
│ ├── api/
|
| 222 |
-
│ │ └── main.py # FastAPI app with /ocr and /llm endpoints
|
| 223 |
-
│ ├── ocr/
|
| 224 |
-
│ │ ├── processor.py # Main OCR interface
|
| 225 |
-
│ │ └── azure_ocr.py # Azure Document Intelligence implementation
|
| 226 |
-
│ ├── vectordb/
|
| 227 |
-
│ │ ├── pinecone_store.py # Pinecone operations
|
| 228 |
-
│ │ └── chroma_store.py # ChromaDB alternative
|
| 229 |
-
│ ├── llm/
|
| 230 |
-
│ │ └── chat.py # LLM integration with RAG
|
| 231 |
-
│ └── config.py # Pydantic settings
|
| 232 |
-
├── data/
|
| 233 |
-
│ ├── pdfs/ # Input PDFs
|
| 234 |
-
│ └── processed/ # Processed data
|
| 235 |
-
├── run.py # Entry point
|
| 236 |
-
├── requirements.txt
|
| 237 |
-
├── .env
|
| 238 |
-
└── docker-compose.yml # Optional
|
| 239 |
-
```
|
| 240 |
-
|
| 241 |
-
---
|
| 242 |
-
|
| 243 |
-
## Implementation Guidelines
|
| 244 |
-
|
| 245 |
-
### OCR Endpoint Implementation
|
| 246 |
-
```python
|
| 247 |
-
@app.post("/ocr")
|
| 248 |
-
async def ocr_endpoint(file: UploadFile = File(...)):
|
| 249 |
-
# 1. Read PDF bytes
|
| 250 |
-
pdf_bytes = await file.read()
|
| 251 |
-
pdf_filename = file.filename # e.g., "document_06.pdf"
|
| 252 |
-
|
| 253 |
-
# 2. Process with Azure Document Intelligence
|
| 254 |
-
# - Extract text page by page
|
| 255 |
-
# - Detect images with PyMuPDF
|
| 256 |
-
# - Create path-like references: pdf_filename/page_X/image_Y
|
| 257 |
-
# - Embed image references in MD_text
|
| 258 |
-
|
| 259 |
-
# 3. Example of adding image reference
|
| 260 |
-
# for each image detected on page 1:
|
| 261 |
-
# md_text += f"\n\n\n\n"
|
| 262 |
-
|
| 263 |
-
# 4. Return list of dicts
|
| 264 |
-
return [
|
| 265 |
-
{"page_number": 1, "MD_text": "text..."},
|
| 266 |
-
{"page_number": 2, "MD_text": "text..."}
|
| 267 |
-
]
|
| 268 |
-
```
|
| 269 |
-
|
| 270 |
-
### LLM Endpoint Implementation
|
| 271 |
-
```python
|
| 272 |
-
@app.post("/llm")
|
| 273 |
-
async def llm_endpoint(messages: List[Dict]):
|
| 274 |
-
# 1. Get last user message
|
| 275 |
-
user_query = messages[-1]["content"]
|
| 276 |
-
|
| 277 |
-
# 2. Generate embedding
|
| 278 |
-
query_embedding = embed_model.encode(user_query)
|
| 279 |
-
|
| 280 |
-
# 3. Search vector DB (top 3)
|
| 281 |
-
results = vector_db.search(query_embedding, top_k=3)
|
| 282 |
-
|
| 283 |
-
# 4. Build context from results
|
| 284 |
-
context = "\n\n".join([doc["content"] for doc in results])
|
| 285 |
-
|
| 286 |
-
# 5. Create prompt with context
|
| 287 |
-
prompt = f"Context:\n{context}\n\nQuestion: {user_query}"
|
| 288 |
-
|
| 289 |
-
# 6. Call LLM
|
| 290 |
-
response = llm_client.chat(
|
| 291 |
-
model="Llama-4-Maverick-17B-128E-Instruct-FP8",
|
| 292 |
-
messages=[{"role": "user", "content": prompt}],
|
| 293 |
-
temperature=0.2,
|
| 294 |
-
max_tokens=1000
|
| 295 |
-
)
|
| 296 |
-
|
| 297 |
-
# 7. Return answer with sources
|
| 298 |
-
return {
|
| 299 |
-
"answer": response.content,
|
| 300 |
-
"sources": [
|
| 301 |
-
{
|
| 302 |
-
"pdf_name": doc["metadata"]["pdf_name"],
|
| 303 |
-
"page_number": doc["metadata"]["page_number"],
|
| 304 |
-
"content": doc["content"][:200]
|
| 305 |
-
}
|
| 306 |
-
for doc in results
|
| 307 |
-
]
|
| 308 |
-
}
|
| 309 |
-
```
|
| 310 |
-
|
| 311 |
-
---
|
| 312 |
-
|
| 313 |
-
## Critical Dos and Don'ts
|
| 314 |
-
|
| 315 |
-
### ✅ DO
|
| 316 |
-
- Use exact model names: `Llama-4-Maverick-17B-128E-Instruct-FP8`
|
| 317 |
-
- Return exact response formats as specified
|
| 318 |
-
- Preserve Cyrillic text from OCR
|
| 319 |
-
- Only add image markdown when images exist
|
| 320 |
-
- Include source citations in LLM responses
|
| 321 |
-
- Use temperature 0.2 for factual answers
|
| 322 |
-
- Chunk documents at 600 chars with 100 overlap
|
| 323 |
-
|
| 324 |
-
### ❌ DON'T
|
| 325 |
-
- Change response format (no extra fields or nesting)
|
| 326 |
-
- Use base64 for images
|
| 327 |
-
- Save images to disk
|
| 328 |
-
- Transliterate Cyrillic to Latin
|
| 329 |
-
- Add image markdown when no images exist
|
| 330 |
-
- Use GPT models (use Llama for open-source points)
|
| 331 |
-
- Skip error handling
|
| 332 |
-
|
| 333 |
-
---
|
| 334 |
-
|
| 335 |
-
## Testing Commands
|
| 336 |
-
|
| 337 |
-
### Test OCR
|
| 338 |
-
```bash
|
| 339 |
-
curl -X POST http://localhost:8000/ocr \
|
| 340 |
-
-F "file=@document.pdf"
|
| 341 |
-
```
|
| 342 |
-
|
| 343 |
-
### Test LLM
|
| 344 |
-
```bash
|
| 345 |
-
curl -X POST http://localhost:8000/llm \
|
| 346 |
-
-H "Content-Type: application/json" \
|
| 347 |
-
-d '[{"role": "user", "content": "What is SOCAR?"}]'
|
| 348 |
-
```
|
| 349 |
-
|
| 350 |
-
### Test with ngrok
|
| 351 |
-
```bash
|
| 352 |
-
# Replace localhost with ngrok URL
|
| 353 |
-
curl -X POST https://your-url.ngrok-free.dev/llm \
|
| 354 |
-
-H "Content-Type: application/json" \
|
| 355 |
-
-d '[{"role": "user", "content": "Test question"}]'
|
| 356 |
-
```
|
| 357 |
-
|
| 358 |
-
---
|
| 359 |
-
|
| 360 |
-
## Common Issues & Solutions
|
| 361 |
-
|
| 362 |
-
### Issue: Tesseract poor performance
|
| 363 |
-
**Solution**: Use Azure Document Intelligence (92.79% vs 25% CSR)
|
| 364 |
-
|
| 365 |
-
### Issue: Images bloating response
|
| 366 |
-
**Solution**: Don't use base64, use path-like text references: `document_name.pdf/page_1/image_1`
|
| 367 |
-
|
| 368 |
-
### Issue: Wrong response format
|
| 369 |
-
**Solution**: Return list directly, not `{"pages": [...]}`
|
| 370 |
-
|
| 371 |
-
### Issue: LLM taking too long
|
| 372 |
-
**Solution**:
|
| 373 |
-
- Limit to top 3 documents
|
| 374 |
-
- Use temperature 0.2
|
| 375 |
-
- Cap max_tokens at 1000
|
| 376 |
-
|
| 377 |
-
### Issue: Cyrillic not preserved
|
| 378 |
-
**Solution**: Azure handles this natively, ensure proper encoding throughout pipeline
|
| 379 |
-
|
| 380 |
-
---
|
| 381 |
-
|
| 382 |
-
## Performance Targets
|
| 383 |
-
|
| 384 |
-
- **OCR**: ~10-15 seconds for 12-page PDF
|
| 385 |
-
- **LLM**: ~3-5 seconds per query
|
| 386 |
-
- **Total RAG pipeline**: ~6-7 seconds (embedding + search + generation)
|
| 387 |
-
|
| 388 |
-
---
|
| 389 |
-
|
| 390 |
-
## Dependencies (requirements.txt essentials)
|
| 391 |
-
|
| 392 |
-
```txt
|
| 393 |
-
# Web Framework
|
| 394 |
-
fastapi==0.104.1
|
| 395 |
-
uvicorn[standard]==0.24.0
|
| 396 |
-
python-multipart==0.0.6
|
| 397 |
-
|
| 398 |
-
# Azure AI
|
| 399 |
-
azure-ai-formrecognizer==3.3.2
|
| 400 |
-
openai==1.3.0
|
| 401 |
-
|
| 402 |
-
# PDF Processing
|
| 403 |
-
PyMuPDF==1.23.8
|
| 404 |
-
|
| 405 |
-
# Vector DB & Embeddings
|
| 406 |
-
pinecone-client==3.0.0
|
| 407 |
-
sentence-transformers>=2.5.0
|
| 408 |
-
|
| 409 |
-
# Utilities
|
| 410 |
-
python-dotenv==1.0.0
|
| 411 |
-
pydantic==2.5.0
|
| 412 |
-
pydantic-settings==2.1.0
|
| 413 |
-
loguru==0.7.2
|
| 414 |
-
```
|
| 415 |
-
|
| 416 |
-
---
|
| 417 |
-
|
| 418 |
-
## Deployment Checklist
|
| 419 |
-
|
| 420 |
-
1. ✅ Set environment variables in .env
|
| 421 |
-
2. ✅ Install dependencies: `pip install -r requirements.txt`
|
| 422 |
-
3. ✅ Configure Azure Document Intelligence endpoint
|
| 423 |
-
4. ✅ Set up Pinecone index (1024 dimensions, cosine metric)
|
| 424 |
-
5. ✅ Ingest PDF documents into vector DB
|
| 425 |
-
6. ✅ Start FastAPI server: `uvicorn src.api.main:app --host 0.0.0.0 --port 8000`
|
| 426 |
-
7. ✅ Start ngrok: `ngrok http 8000`
|
| 427 |
-
8. ✅ Test both endpoints
|
| 428 |
-
9. ✅ Verify Cyrillic preservation
|
| 429 |
-
10. ✅ Check image references in OCR output
|
| 430 |
-
|
| 431 |
-
---
|
| 432 |
-
|
| 433 |
-
## Scoring Optimization
|
| 434 |
-
|
| 435 |
-
### OCR Quality (50%)
|
| 436 |
-
- Azure Document Intelligence ✅
|
| 437 |
-
- Multi-language support ✅
|
| 438 |
-
- 92.79% CSR ✅
|
| 439 |
-
- Image detection ✅
|
| 440 |
-
|
| 441 |
-
### LLM Quality (30%)
|
| 442 |
-
- Open-source model (Llama) ✅
|
| 443 |
-
- RAG with 3-doc retrieval ✅
|
| 444 |
-
- Source citations ✅
|
| 445 |
-
- Contextual accuracy ✅
|
| 446 |
-
|
| 447 |
-
### Architecture (20%)
|
| 448 |
-
- Cloud vector DB (Pinecone) ✅
|
| 449 |
-
- Production-ready (FastAPI) ✅
|
| 450 |
-
- Open-source LLM ✅
|
| 451 |
-
- Modern stack ✅
|
| 452 |
-
|
| 453 |
-
---
|
| 454 |
-
|
| 455 |
-
## Quick Start from Scratch
|
| 456 |
-
|
| 457 |
-
1. Create project structure (see above)
|
| 458 |
-
2. Copy .env template and fill in credentials
|
| 459 |
-
3. Install dependencies
|
| 460 |
-
4. Implement /ocr endpoint with Azure Document Intelligence
|
| 461 |
-
5. Implement /llm endpoint with RAG pipeline
|
| 462 |
-
6. Set up vector database (Pinecone or ChromaDB)
|
| 463 |
-
7. Ingest PDF documents
|
| 464 |
-
8. Test locally
|
| 465 |
-
9. Deploy with ngrok
|
| 466 |
-
10. Submit hackathon URL
|
| 467 |
-
|
| 468 |
-
---
|
| 469 |
-
|
| 470 |
-
**Last Updated**: 2025-12-13
|
| 471 |
-
**Model**: Llama-4-Maverick-17B-128E-Instruct-FP8
|
| 472 |
-
**OCR**: Azure Document Intelligence (92.79% CSR)
|
| 473 |
-
**Vector DB**: Pinecone (1,241 chunks from 28 PDFs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
|
@@ -1,195 +0,0 @@
|
|
| 1 |
-
# SOCAR Historical Document Processing Challenge
|
| 2 |
-
|
| 3 |
-
AI-powered system for processing historical handwritten and printed documents from SOCAR's oil and gas research archives.
|
| 4 |
-
|
| 5 |
-
## Overview
|
| 6 |
-
|
| 7 |
-
This solution transforms historical documents into an interactive, searchable knowledge base using:
|
| 8 |
-
- **OCR Processing** - Extract text from handwritten and printed PDFs (multi-language support)
|
| 9 |
-
- **Vector Database** - Store and retrieve document information efficiently
|
| 10 |
-
- **RAG Chatbot** - Answer questions using historical document knowledge
|
| 11 |
-
|
| 12 |
-
## Quick Start
|
| 13 |
-
|
| 14 |
-
### Option 1: Docker Deployment (Recommended)
|
| 15 |
-
|
| 16 |
-
#### Using Docker Compose
|
| 17 |
-
|
| 18 |
-
```bash
|
| 19 |
-
# Build and start the container
|
| 20 |
-
docker-compose up -d
|
| 21 |
-
|
| 22 |
-
# View logs
|
| 23 |
-
docker-compose logs -f
|
| 24 |
-
|
| 25 |
-
# Stop the container
|
| 26 |
-
docker-compose down
|
| 27 |
-
```
|
| 28 |
-
|
| 29 |
-
#### Using Docker Directly
|
| 30 |
-
|
| 31 |
-
```bash
|
| 32 |
-
# Build the image
|
| 33 |
-
docker build -t socar-document-processing .
|
| 34 |
-
|
| 35 |
-
# Run the container
|
| 36 |
-
docker run -d \
|
| 37 |
-
-p 8000:8000 \
|
| 38 |
-
-v $(pwd)/data:/app/data \
|
| 39 |
-
--env-file .env \
|
| 40 |
-
--name socar-api \
|
| 41 |
-
socar-document-processing
|
| 42 |
-
|
| 43 |
-
# View logs
|
| 44 |
-
docker logs -f socar-api
|
| 45 |
-
|
| 46 |
-
# Stop the container
|
| 47 |
-
docker stop socar-api
|
| 48 |
-
```
|
| 49 |
-
|
| 50 |
-
The API will be available at `http://localhost:8000`
|
| 51 |
-
|
| 52 |
-
### Option 2: Local Python Setup
|
| 53 |
-
|
| 54 |
-
#### 1. Install Dependencies
|
| 55 |
-
|
| 56 |
-
```bash
|
| 57 |
-
pip install -r requirements.txt
|
| 58 |
-
```
|
| 59 |
-
|
| 60 |
-
#### 2. Configure Environment
|
| 61 |
-
|
| 62 |
-
Ensure `.env` file exists with your credentials:
|
| 63 |
-
|
| 64 |
-
Required variables:
|
| 65 |
-
- `AZURE_OPENAI_API_KEY` - Azure OpenAI API key
|
| 66 |
-
- `AZURE_OPENAI_ENDPOINT` - Azure OpenAI endpoint URL
|
| 67 |
-
- `LLM_MODEL` - Model name (default: Llama-4-Maverick-17B-128E-Instruct-FP8)
|
| 68 |
-
|
| 69 |
-
#### 3. Run the API Server
|
| 70 |
-
|
| 71 |
-
```bash
|
| 72 |
-
python run.py
|
| 73 |
-
```
|
| 74 |
-
|
| 75 |
-
The API will be available at `http://localhost:8000`
|
| 76 |
-
|
| 77 |
-
#### 4. Test the System
|
| 78 |
-
|
| 79 |
-
```bash
|
| 80 |
-
python test_complete_system.py
|
| 81 |
-
```
|
| 82 |
-
|
| 83 |
-
## API Endpoints
|
| 84 |
-
|
| 85 |
-
### POST /ocr
|
| 86 |
-
|
| 87 |
-
Extract text from PDF documents.
|
| 88 |
-
|
| 89 |
-
**Request:**
|
| 90 |
-
```bash
|
| 91 |
-
curl -X POST http://localhost:8000/ocr \
|
| 92 |
-
-F "file=@document.pdf"
|
| 93 |
-
```
|
| 94 |
-
|
| 95 |
-
**Response:**
|
| 96 |
-
```json
|
| 97 |
-
[
|
| 98 |
-
{
|
| 99 |
-
"page_number": 1,
|
| 100 |
-
"MD_text": "## Section Title\nExtracted text..."
|
| 101 |
-
}
|
| 102 |
-
]
|
| 103 |
-
```
|
| 104 |
-
|
| 105 |
-
### POST /llm
|
| 106 |
-
|
| 107 |
-
Query documents using natural language.
|
| 108 |
-
|
| 109 |
-
**Request:**
|
| 110 |
-
```bash
|
| 111 |
-
curl -X POST http://localhost:8000/llm \
|
| 112 |
-
-H "Content-Type: application/json" \
|
| 113 |
-
-d '[{"role": "user", "content": "What is this document about?"}]'
|
| 114 |
-
```
|
| 115 |
-
|
| 116 |
-
**Response:**
|
| 117 |
-
```json
|
| 118 |
-
{
|
| 119 |
-
"sources": [
|
| 120 |
-
{
|
| 121 |
-
"pdf_name": "document.pdf",
|
| 122 |
-
"page_number": 1,
|
| 123 |
-
"content": "Relevant text snippet..."
|
| 124 |
-
}
|
| 125 |
-
],
|
| 126 |
-
"answer": "This document discusses..."
|
| 127 |
-
}
|
| 128 |
-
```
|
| 129 |
-
|
| 130 |
-
## Project Structure
|
| 131 |
-
|
| 132 |
-
```
|
| 133 |
-
.
|
| 134 |
-
├── src/
|
| 135 |
-
│ ├── api/ # FastAPI endpoints
|
| 136 |
-
│ ├── ocr/ # OCR processing modules
|
| 137 |
-
│ ├── llm/ # LLM and RAG pipeline
|
| 138 |
-
│ └── utils/ # Utility functions
|
| 139 |
-
├── data/
|
| 140 |
-
│ ├── pdfs/ # Input PDF documents
|
| 141 |
-
│ ├── processed/ # Processed documents
|
| 142 |
-
│ └── vector_db/ # Vector database storage
|
| 143 |
-
├── tests/ # Test files
|
| 144 |
-
├── run.py # Application entry point
|
| 145 |
-
└── requirements.txt # Python dependencies
|
| 146 |
-
```
|
| 147 |
-
|
| 148 |
-
## Technologies
|
| 149 |
-
|
| 150 |
-
- **OCR**: Azure Document Intelligence (multi-language support)
|
| 151 |
-
- **Vector DB**: ChromaDB (local, open-source)
|
| 152 |
-
- **LLM**: Llama-4-Maverick-17B (open-source, deployable)
|
| 153 |
-
- **API**: FastAPI (async, high-performance)
|
| 154 |
-
- **Embeddings**: Sentence Transformers (all-MiniLM-L6-v2)
|
| 155 |
-
- **Deployment**: Docker, Docker Compose
|
| 156 |
-
|
| 157 |
-
## Deployment
|
| 158 |
-
|
| 159 |
-
### Docker Features
|
| 160 |
-
|
| 161 |
-
- **Multi-stage build** - Optimized image size
|
| 162 |
-
- **Health checks** - Automatic container monitoring
|
| 163 |
-
- **Volume mounts** - Persistent data storage
|
| 164 |
-
- **Environment variables** - Easy configuration
|
| 165 |
-
- **Auto-restart** - Production-ready resilience
|
| 166 |
-
|
| 167 |
-
### Production Deployment
|
| 168 |
-
|
| 169 |
-
```bash
|
| 170 |
-
# Build production image
|
| 171 |
-
docker build -t socar-api:production .
|
| 172 |
-
|
| 173 |
-
# Deploy with nginx reverse proxy
|
| 174 |
-
docker network create socar-network
|
| 175 |
-
docker run -d --name socar-api --network socar-network socar-api:production
|
| 176 |
-
```
|
| 177 |
-
|
| 178 |
-
## Development
|
| 179 |
-
|
| 180 |
-
### Running Tests
|
| 181 |
-
|
| 182 |
-
```bash
|
| 183 |
-
pytest tests/
|
| 184 |
-
```
|
| 185 |
-
|
| 186 |
-
### Code Formatting
|
| 187 |
-
|
| 188 |
-
```bash
|
| 189 |
-
black src/
|
| 190 |
-
flake8 src/
|
| 191 |
-
```
|
| 192 |
-
|
| 193 |
-
## License
|
| 194 |
-
|
| 195 |
-
MIT License - SOCAR Hackathon 2024
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docker-compose.yml
DELETED
|
@@ -1,43 +0,0 @@
|
|
| 1 |
-
version: '3.8'
|
| 2 |
-
|
| 3 |
-
services:
|
| 4 |
-
socar-api:
|
| 5 |
-
build:
|
| 6 |
-
context: .
|
| 7 |
-
dockerfile: Dockerfile
|
| 8 |
-
container_name: socar-document-processing
|
| 9 |
-
ports:
|
| 10 |
-
- "8000:8000"
|
| 11 |
-
volumes:
|
| 12 |
-
# Mount data directories for persistence
|
| 13 |
-
- ./data/pdfs:/app/data/pdfs
|
| 14 |
-
- ./data/vector_db:/app/data/vector_db
|
| 15 |
-
- ./data/processed:/app/data/processed
|
| 16 |
-
environment:
|
| 17 |
-
# Azure OpenAI Configuration
|
| 18 |
-
- AZURE_OPENAI_API_KEY=${AZURE_OPENAI_API_KEY}
|
| 19 |
-
- AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_ENDPOINT}
|
| 20 |
-
- AZURE_OPENAI_API_VERSION=${AZURE_OPENAI_API_VERSION}
|
| 21 |
-
# Azure Document Intelligence
|
| 22 |
-
- AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT=${AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT}
|
| 23 |
-
- AZURE_DOCUMENT_INTELLIGENCE_KEY=${AZURE_DOCUMENT_INTELLIGENCE_KEY}
|
| 24 |
-
# Application Configuration
|
| 25 |
-
- LLM_MODEL=${LLM_MODEL:-Llama-4-Maverick-17B-128E-Instruct-FP8}
|
| 26 |
-
- API_HOST=0.0.0.0
|
| 27 |
-
- API_PORT=8000
|
| 28 |
-
# Performance
|
| 29 |
-
- TOKENIZERS_PARALLELISM=false
|
| 30 |
-
- ANONYMIZED_TELEMETRY=false
|
| 31 |
-
restart: unless-stopped
|
| 32 |
-
healthcheck:
|
| 33 |
-
test: ["CMD", "curl", "-f", "http://localhost:8000/"]
|
| 34 |
-
interval: 30s
|
| 35 |
-
timeout: 10s
|
| 36 |
-
retries: 3
|
| 37 |
-
start_period: 40s
|
| 38 |
-
networks:
|
| 39 |
-
- socar-network
|
| 40 |
-
|
| 41 |
-
networks:
|
| 42 |
-
socar-network:
|
| 43 |
-
driver: bridge
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ingest_pdfs.py
DELETED
|
@@ -1,87 +0,0 @@
|
|
| 1 |
-
"""Script to ingest all PDFs into the vector database"""
|
| 2 |
-
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from loguru import logger
|
| 5 |
-
import sys
|
| 6 |
-
|
| 7 |
-
from src.llm.rag_pipeline import get_rag_pipeline
|
| 8 |
-
from src.ocr.processor import get_ocr_processor
|
| 9 |
-
|
| 10 |
-
# Configure logging
|
| 11 |
-
logger.remove()
|
| 12 |
-
logger.add(sys.stderr, level="INFO")
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def ingest_pdfs(pdf_dir: str = "data/pdfs", limit: int = None):
|
| 16 |
-
"""
|
| 17 |
-
Ingest all PDFs from directory into vector database
|
| 18 |
-
|
| 19 |
-
Args:
|
| 20 |
-
pdf_dir: Directory containing PDF files
|
| 21 |
-
limit: Optional limit on number of PDFs to process
|
| 22 |
-
"""
|
| 23 |
-
pdf_path = Path(pdf_dir)
|
| 24 |
-
|
| 25 |
-
if not pdf_path.exists():
|
| 26 |
-
logger.error(f"PDF directory not found: {pdf_dir}")
|
| 27 |
-
return
|
| 28 |
-
|
| 29 |
-
# Get all PDF files
|
| 30 |
-
pdf_files = list(pdf_path.glob("*.pdf"))
|
| 31 |
-
logger.info(f"Found {len(pdf_files)} PDF files")
|
| 32 |
-
|
| 33 |
-
if limit:
|
| 34 |
-
pdf_files = pdf_files[:limit]
|
| 35 |
-
logger.info(f"Processing only first {limit} files")
|
| 36 |
-
|
| 37 |
-
# Initialize components
|
| 38 |
-
ocr = get_ocr_processor()
|
| 39 |
-
rag = get_rag_pipeline()
|
| 40 |
-
|
| 41 |
-
# Process each PDF
|
| 42 |
-
for idx, pdf_file in enumerate(pdf_files, 1):
|
| 43 |
-
try:
|
| 44 |
-
logger.info(f"[{idx}/{len(pdf_files)}] Processing: {pdf_file.name}")
|
| 45 |
-
|
| 46 |
-
# Read PDF
|
| 47 |
-
with open(pdf_file, "rb") as f:
|
| 48 |
-
pdf_content = f.read()
|
| 49 |
-
|
| 50 |
-
# Extract text with OCR
|
| 51 |
-
pages = ocr.process_pdf(pdf_content, pdf_file.name)
|
| 52 |
-
logger.info(f"Extracted {len(pages)} pages from {pdf_file.name}")
|
| 53 |
-
|
| 54 |
-
# Add to vector database
|
| 55 |
-
rag.add_processed_document(pdf_file.name, pages)
|
| 56 |
-
|
| 57 |
-
logger.info(f"Successfully ingested {pdf_file.name}")
|
| 58 |
-
|
| 59 |
-
except Exception as e:
|
| 60 |
-
logger.error(f"Error processing {pdf_file.name}: {e}")
|
| 61 |
-
continue
|
| 62 |
-
|
| 63 |
-
# Print stats
|
| 64 |
-
stats = rag.vector_store.get_stats()
|
| 65 |
-
logger.info(f"\nIngestion complete!")
|
| 66 |
-
logger.info(f"Total documents in vector store: {stats['total_documents']}")
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
if __name__ == "__main__":
|
| 70 |
-
import argparse
|
| 71 |
-
|
| 72 |
-
parser = argparse.ArgumentParser(description="Ingest PDFs into vector database")
|
| 73 |
-
parser.add_argument(
|
| 74 |
-
"--dir",
|
| 75 |
-
type=str,
|
| 76 |
-
default="data/pdfs",
|
| 77 |
-
help="Directory containing PDF files",
|
| 78 |
-
)
|
| 79 |
-
parser.add_argument(
|
| 80 |
-
"--limit",
|
| 81 |
-
type=int,
|
| 82 |
-
default=None,
|
| 83 |
-
help="Limit number of PDFs to process (for testing)",
|
| 84 |
-
)
|
| 85 |
-
|
| 86 |
-
args = parser.parse_args()
|
| 87 |
-
ingest_pdfs(args.dir, args.limit)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/llm_benchmark.ipynb
ADDED
|
@@ -0,0 +1,901 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# LLM Benchmarking for SOCAR Hackathon RAG Chatbot\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"This notebook tests different LLM models for the `/llm` endpoint to find the best performer.\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"## Evaluation Criteria (LLM Judge Metrics):\n",
|
| 12 |
+
"- **Accuracy**: Is the answer correct?\n",
|
| 13 |
+
"- **Relevance**: Are retrieved citations relevant?\n",
|
| 14 |
+
"- **Completeness**: Does it fully answer the question?\n",
|
| 15 |
+
"- **Citation Quality**: Proper sources with page numbers?\n",
|
| 16 |
+
"- **Response Time**: Speed of generation\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"## Available LLM Models:\n",
|
| 19 |
+
"1. **Llama-4-Maverick-17B-128E-Instruct-FP8** (Current choice, open-source)\n",
|
| 20 |
+
"2. **DeepSeek-R1** (Open-source reasoning model)\n",
|
| 21 |
+
"3. **GPT-4.1** (Strong general performance)\n",
|
| 22 |
+
"4. **GPT-5, GPT-5-mini**\n",
|
| 23 |
+
"5. **Claude Sonnet 4.5** (Best quality)\n",
|
| 24 |
+
"6. **Claude Opus 4.1**\n",
|
| 25 |
+
"7. **Phi-4-multimodal-instruct**\n",
|
| 26 |
+
"8. **gpt-oss-120b**"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "code",
|
| 31 |
+
"execution_count": 1,
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [],
|
| 34 |
+
"source": [
|
| 35 |
+
"# Install required packages\n",
|
| 36 |
+
"# !pip install openai pinecone-client sentence-transformers python-dotenv pandas matplotlib seaborn jiwer"
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"execution_count": 2,
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"outputs": [
|
| 44 |
+
{
|
| 45 |
+
"name": "stderr",
|
| 46 |
+
"output_type": "stream",
|
| 47 |
+
"text": [
|
| 48 |
+
"/Users/ismatsamadov/SOCAR_Hackathon/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 49 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"name": "stdout",
|
| 54 |
+
"output_type": "stream",
|
| 55 |
+
"text": [
|
| 56 |
+
"✅ Libraries loaded successfully\n"
|
| 57 |
+
]
|
| 58 |
+
}
|
| 59 |
+
],
|
| 60 |
+
"source": [
|
| 61 |
+
"import os\n",
|
| 62 |
+
"import json\n",
|
| 63 |
+
"import time\n",
|
| 64 |
+
"from typing import Dict, List, Tuple\n",
|
| 65 |
+
"from dotenv import load_dotenv\n",
|
| 66 |
+
"import pandas as pd\n",
|
| 67 |
+
"import matplotlib.pyplot as plt\n",
|
| 68 |
+
"import seaborn as sns\n",
|
| 69 |
+
"from openai import AzureOpenAI\n",
|
| 70 |
+
"from pinecone import Pinecone\n",
|
| 71 |
+
"from sentence_transformers import SentenceTransformer\n",
|
| 72 |
+
"from jiwer import wer, cer\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"# Load environment variables\n",
|
| 75 |
+
"load_dotenv()\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"# Set style\n",
|
| 78 |
+
"sns.set_style('whitegrid')\n",
|
| 79 |
+
"plt.rcParams['figure.figsize'] = (14, 8)\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"print(\"✅ Libraries loaded successfully\")"
|
| 82 |
+
]
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"cell_type": "markdown",
|
| 86 |
+
"metadata": {},
|
| 87 |
+
"source": [
|
| 88 |
+
"## 1. Load Test Questions and Expected Answers"
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"cell_type": "code",
|
| 93 |
+
"execution_count": 3,
|
| 94 |
+
"metadata": {},
|
| 95 |
+
"outputs": [
|
| 96 |
+
{
|
| 97 |
+
"name": "stdout",
|
| 98 |
+
"output_type": "stream",
|
| 99 |
+
"text": [
|
| 100 |
+
"Loaded 5 test cases\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"Test Questions:\n",
|
| 103 |
+
"1. Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasiyası tələb olunur?...\n",
|
| 104 |
+
"2. Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq edilmişdir və bunun m...\n",
|
| 105 |
+
"3. Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO oksidləri arasında ha...\n",
|
| 106 |
+
"4. Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrinə əsasən neft və qaz...\n",
|
| 107 |
+
"5. Example5: Bu zonada hansı proseslər baş verir?...\n"
|
| 108 |
+
]
|
| 109 |
+
}
|
| 110 |
+
],
|
| 111 |
+
"source": [
|
| 112 |
+
"# Load sample questions\n",
|
| 113 |
+
"with open('docs/sample_questions.json', 'r', encoding='utf-8') as f:\n",
|
| 114 |
+
" questions = json.load(f)\n",
|
| 115 |
+
"\n",
|
| 116 |
+
"# Load expected answers\n",
|
| 117 |
+
"with open('docs/sample_answers.json', 'r', encoding='utf-8') as f:\n",
|
| 118 |
+
" expected_answers = json.load(f)\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"print(f\"Loaded {len(questions)} test cases\")\n",
|
| 121 |
+
"print(\"\\nTest Questions:\")\n",
|
| 122 |
+
"for i, (key, msgs) in enumerate(questions.items(), 1):\n",
|
| 123 |
+
" user_msg = [m for m in msgs if m['role'] == 'user'][-1]\n",
|
| 124 |
+
" print(f\"{i}. {key}: {user_msg['content'][:100]}...\")"
|
| 125 |
+
]
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"cell_type": "markdown",
|
| 129 |
+
"metadata": {},
|
| 130 |
+
"source": [
|
| 131 |
+
"## 2. Initialize Vector Database and Embedding Model"
|
| 132 |
+
]
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"cell_type": "code",
|
| 136 |
+
"execution_count": 4,
|
| 137 |
+
"metadata": {},
|
| 138 |
+
"outputs": [
|
| 139 |
+
{
|
| 140 |
+
"name": "stdout",
|
| 141 |
+
"output_type": "stream",
|
| 142 |
+
"text": [
|
| 143 |
+
"✅ Vector DB connected: {'_response_info': {'raw_headers': {'connection': 'keep-alive',\n",
|
| 144 |
+
" 'content-length': '188',\n",
|
| 145 |
+
" 'content-type': 'application/json',\n",
|
| 146 |
+
" 'date': 'Sun, 14 Dec 2025 03:21:33 GMT',\n",
|
| 147 |
+
" 'grpc-status': '0',\n",
|
| 148 |
+
" 'server': 'envoy',\n",
|
| 149 |
+
" 'x-envoy-upstream-service-time': '4',\n",
|
| 150 |
+
" 'x-pinecone-request-id': '3979707437017514155',\n",
|
| 151 |
+
" 'x-pinecone-request-latency-ms': '4'}},\n",
|
| 152 |
+
" 'dimension': 1024,\n",
|
| 153 |
+
" 'index_fullness': 0.0,\n",
|
| 154 |
+
" 'memoryFullness': 0.0,\n",
|
| 155 |
+
" 'metric': 'cosine',\n",
|
| 156 |
+
" 'namespaces': {'__default__': {'vector_count': 1300}},\n",
|
| 157 |
+
" 'storageFullness': 0.0,\n",
|
| 158 |
+
" 'total_vector_count': 1300,\n",
|
| 159 |
+
" 'vector_type': 'dense'}\n",
|
| 160 |
+
"✅ Embedding model loaded: SentenceTransformer(\n",
|
| 161 |
+
" (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})\n",
|
| 162 |
+
" (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})\n",
|
| 163 |
+
" (2): Normalize()\n",
|
| 164 |
+
")\n"
|
| 165 |
+
]
|
| 166 |
+
}
|
| 167 |
+
],
|
| 168 |
+
"source": [
|
| 169 |
+
"# Initialize Pinecone\n",
|
| 170 |
+
"pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
|
| 171 |
+
"index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
|
| 172 |
+
"\n",
|
| 173 |
+
"# Initialize embedding model (same as used for ingestion)\n",
|
| 174 |
+
"embed_model = SentenceTransformer('BAAI/bge-large-en-v1.5')\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"print(f\"✅ Vector DB connected: {index.describe_index_stats()}\")\n",
|
| 177 |
+
"print(f\"✅ Embedding model loaded: {embed_model}\")"
|
| 178 |
+
]
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"cell_type": "markdown",
|
| 182 |
+
"metadata": {},
|
| 183 |
+
"source": [
|
| 184 |
+
"## 3. RAG Retrieval Function"
|
| 185 |
+
]
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"cell_type": "code",
|
| 189 |
+
"execution_count": 5,
|
| 190 |
+
"metadata": {},
|
| 191 |
+
"outputs": [
|
| 192 |
+
{
|
| 193 |
+
"name": "stdout",
|
| 194 |
+
"output_type": "stream",
|
| 195 |
+
"text": [
|
| 196 |
+
"\n",
|
| 197 |
+
"✅ Retrieved 3 documents for test query\n",
|
| 198 |
+
"Top result: document_10.pdf, page 8 (score: 0.767)\n"
|
| 199 |
+
]
|
| 200 |
+
}
|
| 201 |
+
],
|
| 202 |
+
"source": [
|
| 203 |
+
"def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
|
| 204 |
+
" \"\"\"\n",
|
| 205 |
+
" Retrieve relevant documents from vector database.\n",
|
| 206 |
+
" \"\"\"\n",
|
| 207 |
+
" # Generate query embedding\n",
|
| 208 |
+
" query_embedding = embed_model.encode(query).tolist()\n",
|
| 209 |
+
" \n",
|
| 210 |
+
" # Search vector DB\n",
|
| 211 |
+
" results = index.query(\n",
|
| 212 |
+
" vector=query_embedding,\n",
|
| 213 |
+
" top_k=top_k,\n",
|
| 214 |
+
" include_metadata=True\n",
|
| 215 |
+
" )\n",
|
| 216 |
+
" \n",
|
| 217 |
+
" # Extract documents\n",
|
| 218 |
+
" documents = []\n",
|
| 219 |
+
" for match in results['matches']:\n",
|
| 220 |
+
" documents.append({\n",
|
| 221 |
+
" 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
|
| 222 |
+
" 'page_number': match['metadata'].get('page_number', 0),\n",
|
| 223 |
+
" 'content': match['metadata'].get('text', ''),\n",
|
| 224 |
+
" 'score': match.get('score', 0.0)\n",
|
| 225 |
+
" })\n",
|
| 226 |
+
" \n",
|
| 227 |
+
" return documents\n",
|
| 228 |
+
"\n",
|
| 229 |
+
"# Test retrieval\n",
|
| 230 |
+
"test_query = \"Palçıq vulkanlarının təsir radiusu nə qədərdir?\"\n",
|
| 231 |
+
"test_docs = retrieve_documents(test_query)\n",
|
| 232 |
+
"print(f\"\\n✅ Retrieved {len(test_docs)} documents for test query\")\n",
|
| 233 |
+
"print(f\"Top result: {test_docs[0]['pdf_name']}, page {test_docs[0]['page_number']} (score: {test_docs[0]['score']:.3f})\")"
|
| 234 |
+
]
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"cell_type": "markdown",
|
| 238 |
+
"metadata": {},
|
| 239 |
+
"source": [
|
| 240 |
+
"## 4. LLM Client Functions"
|
| 241 |
+
]
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"cell_type": "code",
|
| 245 |
+
"execution_count": null,
|
| 246 |
+
"metadata": {},
|
| 247 |
+
"outputs": [],
|
| 248 |
+
"source": "# Initialize Azure OpenAI\nazure_client = AzureOpenAI(\n api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n)\n\nLLM_MODELS = {\n 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n 'DeepSeek-R1': 'DeepSeek-R1',\n 'GPT-4.1': 'gpt-4.1',\n 'GPT-5-mini': 'gpt-5-mini',\n 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n}\n\ndef generate_answer(model_name: str, query: str, documents: List[Dict], \n temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n \"\"\"\n Generate answer using specified LLM model.\n Returns: (answer, response_time)\n \"\"\"\n # Build context from retrieved documents\n context_parts = []\n for i, doc in enumerate(documents, 1):\n context_parts.append(\n f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n )\n context = \"\\n\\n\".join(context_parts)\n \n # Create prompt\n prompt = f\"\"\"Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə mütəxəssis köməkçisisiniz.\n\nKontekst (əlaqəli sənədlər):\n{context}\n\nSual: {query}\n\nƏtraflı cavab verin və mütləq sənəd mənbələrinə istinad edin (PDF adı və səhifə nömrəsi ilə).\nCavabınız dəqiq, faktlara əsaslanan və kontekst məlumatlarından istifadə edən olmalıdır.\"\"\"\n \n # Get model deployment\n deployment = MODELS[model_name]['deployment']\n \n try:\n start_time = time.time()\n \n # GPT-5 models use max_completion_tokens, others use max_tokens\n if deployment.startswith('gpt-5'):\n response = azure_client.chat.completions.create(\n model=deployment,\n messages=[\n {\"role\": \"user\", \"content\": prompt}\n ],\n temperature=temperature,\n max_completion_tokens=max_tokens\n )\n else:\n response = azure_client.chat.completions.create(\n model=deployment,\n messages=[\n {\"role\": \"user\", \"content\": prompt}\n ],\n temperature=temperature,\n max_tokens=max_tokens\n )\n \n response_time = time.time() - 
start_time\n answer = response.choices[0].message.content\n \n return answer, response_time\n \n except Exception as e:\n return f\"ERROR: {str(e)}\", 0.0\n\nprint(f\"\\n✅ Configured {len(LLM_MODELS)} LLM models for testing\")"
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"cell_type": "markdown",
|
| 252 |
+
"metadata": {},
|
| 253 |
+
"source": [
|
| 254 |
+
"## 5. Evaluation Metrics"
|
| 255 |
+
]
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"cell_type": "code",
|
| 259 |
+
"execution_count": 7,
|
| 260 |
+
"metadata": {},
|
| 261 |
+
"outputs": [
|
| 262 |
+
{
|
| 263 |
+
"name": "stdout",
|
| 264 |
+
"output_type": "stream",
|
| 265 |
+
"text": [
|
| 266 |
+
"✅ Evaluation functions ready\n"
|
| 267 |
+
]
|
| 268 |
+
}
|
| 269 |
+
],
|
| 270 |
+
"source": [
|
| 271 |
+
"def normalize_text(text: str) -> str:\n",
|
| 272 |
+
" \"\"\"Normalize text for comparison.\"\"\"\n",
|
| 273 |
+
" import re\n",
|
| 274 |
+
" text = text.lower().strip()\n",
|
| 275 |
+
" text = re.sub(r'\\s+', ' ', text)\n",
|
| 276 |
+
" return text\n",
|
| 277 |
+
"\n",
|
| 278 |
+
"def calculate_answer_similarity(reference: str, hypothesis: str) -> Dict[str, float]:\n",
|
| 279 |
+
" \"\"\"\n",
|
| 280 |
+
" Calculate similarity between generated and expected answer.\n",
|
| 281 |
+
" Lower is better for error rates.\n",
|
| 282 |
+
" \"\"\"\n",
|
| 283 |
+
" ref_norm = normalize_text(reference)\n",
|
| 284 |
+
" hyp_norm = normalize_text(hypothesis)\n",
|
| 285 |
+
" \n",
|
| 286 |
+
" # Character Error Rate\n",
|
| 287 |
+
" cer_score = cer(ref_norm, hyp_norm) * 100\n",
|
| 288 |
+
" \n",
|
| 289 |
+
" # Word Error Rate \n",
|
| 290 |
+
" wer_score = wer(ref_norm, hyp_norm) * 100\n",
|
| 291 |
+
" \n",
|
| 292 |
+
" # Similarity scores (higher is better)\n",
|
| 293 |
+
" similarity = max(0, 100 - wer_score)\n",
|
| 294 |
+
" \n",
|
| 295 |
+
" return {\n",
|
| 296 |
+
" 'CER': round(cer_score, 2),\n",
|
| 297 |
+
" 'WER': round(wer_score, 2),\n",
|
| 298 |
+
" 'Similarity': round(similarity, 2)\n",
|
| 299 |
+
" }\n",
|
| 300 |
+
"\n",
|
| 301 |
+
"def check_citations(answer: str, documents: List[Dict]) -> Dict[str, object]:\n",
|
| 302 |
+
" \"\"\"\n",
|
| 303 |
+
" Check if answer includes proper citations.\n",
|
| 304 |
+
" \"\"\"\n",
|
| 305 |
+
" import re\n",
|
| 306 |
+
" \n",
|
| 307 |
+
" # Check for PDF names\n",
|
| 308 |
+
" pdf_names = [doc['pdf_name'] for doc in documents]\n",
|
| 309 |
+
" cited_pdfs = sum(1 for pdf in pdf_names if pdf.replace('.pdf', '') in answer)\n",
|
| 310 |
+
" \n",
|
| 311 |
+
" # Check for page numbers\n",
|
| 312 |
+
" page_numbers = [str(doc['page_number']) for doc in documents]\n",
|
| 313 |
+
" cited_pages = sum(1 for page in page_numbers if page in answer)\n",
|
| 314 |
+
" \n",
|
| 315 |
+
" # Check for source keywords\n",
|
| 316 |
+
" source_keywords = ['mənbə', 'sənəd', 'səhifə', 'pdf', 'document', 'page', 'source']\n",
|
| 317 |
+
" has_source_ref = any(kw in answer.lower() for kw in source_keywords)\n",
|
| 318 |
+
" \n",
|
| 319 |
+
"    citation_score = 0.0 if not documents else (\n",
|
| 320 |
+
" (cited_pdfs / len(pdf_names) * 40) + # 40% for PDF citation\n",
|
| 321 |
+
" (cited_pages / len(page_numbers) * 40) + # 40% for page citation\n",
|
| 322 |
+
" (20 if has_source_ref else 0) # 20% for having source keywords\n",
|
| 323 |
+
" )\n",
|
| 324 |
+
" \n",
|
| 325 |
+
" return {\n",
|
| 326 |
+
" 'Citation_Score': round(citation_score, 2),\n",
|
| 327 |
+
" 'Cited_PDFs': cited_pdfs,\n",
|
| 328 |
+
" 'Cited_Pages': cited_pages,\n",
|
| 329 |
+
" 'Has_Source_Reference': has_source_ref\n",
|
| 330 |
+
" }\n",
|
| 331 |
+
"\n",
|
| 332 |
+
"def evaluate_completeness(answer: str, min_length: int = 100) -> Dict[str, object]:\n",
|
| 333 |
+
" \"\"\"\n",
|
| 334 |
+
" Evaluate answer completeness.\n",
|
| 335 |
+
" \"\"\"\n",
|
| 336 |
+
" word_count = len(answer.split())\n",
|
| 337 |
+
" char_count = len(answer)\n",
|
| 338 |
+
" \n",
|
| 339 |
+
" # Penalize very short or very long answers\n",
|
| 340 |
+
" if char_count < min_length:\n",
|
| 341 |
+
" completeness_score = (char_count / min_length) * 100\n",
|
| 342 |
+
" elif char_count > 2000:\n",
|
| 343 |
+
" completeness_score = 100 - ((char_count - 2000) / 2000 * 20) # Penalty for verbosity\n",
|
| 344 |
+
" else:\n",
|
| 345 |
+
" completeness_score = 100\n",
|
| 346 |
+
" \n",
|
| 347 |
+
" return {\n",
|
| 348 |
+
" 'Completeness_Score': round(max(0, completeness_score), 2),\n",
|
| 349 |
+
" 'Word_Count': word_count,\n",
|
| 350 |
+
" 'Char_Count': char_count\n",
|
| 351 |
+
" }\n",
|
| 352 |
+
"\n",
|
| 353 |
+
"print(\"✅ Evaluation functions ready\")"
|
| 354 |
+
]
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"cell_type": "markdown",
|
| 358 |
+
"metadata": {},
|
| 359 |
+
"source": [
|
| 360 |
+
"## 6. Run Benchmark on All Models"
|
| 361 |
+
]
|
| 362 |
+
},
|
| 363 |
+
{
|
| 364 |
+
"cell_type": "code",
|
| 365 |
+
"execution_count": 8,
|
| 366 |
+
"metadata": {},
|
| 367 |
+
"outputs": [
|
| 368 |
+
{
|
| 369 |
+
"name": "stdout",
|
| 370 |
+
"output_type": "stream",
|
| 371 |
+
"text": [
|
| 372 |
+
"Testing 5 models on 5 questions...\n",
|
| 373 |
+
"\n",
|
| 374 |
+
"This may take several minutes...\n",
|
| 375 |
+
"\n"
|
| 376 |
+
]
|
| 377 |
+
}
|
| 378 |
+
],
|
| 379 |
+
"source": [
|
| 380 |
+
"# Select models to test (you can comment out models to skip)\n",
|
| 381 |
+
"MODELS_TO_TEST = [\n",
|
| 382 |
+
" 'Llama-4-Maverick-17B',\n",
|
| 383 |
+
" 'DeepSeek-R1',\n",
|
| 384 |
+
" 'GPT-4.1',\n",
|
| 385 |
+
" 'GPT-5-mini',\n",
|
| 386 |
+
" 'Claude-Sonnet-4.5',\n",
|
| 387 |
+
" # 'Claude-Opus-4.1', # Uncomment to test\n",
|
| 388 |
+
" # 'Phi-4-multimodal', # Uncomment to test\n",
|
| 389 |
+
" # 'GPT-OSS-120B', # Uncomment to test\n",
|
| 390 |
+
"]\n",
|
| 391 |
+
"\n",
|
| 392 |
+
"print(f\"Testing {len(MODELS_TO_TEST)} models on {len(questions)} questions...\\n\")\n",
|
| 393 |
+
"print(\"This may take several minutes...\\n\")"
|
| 394 |
+
]
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"cell_type": "code",
|
| 398 |
+
"execution_count": 9,
|
| 399 |
+
"metadata": {},
|
| 400 |
+
"outputs": [
|
| 401 |
+
{
|
| 402 |
+
"name": "stdout",
|
| 403 |
+
"output_type": "stream",
|
| 404 |
+
"text": [
|
| 405 |
+
"\n",
|
| 406 |
+
"================================================================================\n",
|
| 407 |
+
"Testing: Llama-4-Maverick-17B\n",
|
| 408 |
+
"================================================================================\n",
|
| 409 |
+
"\n",
|
| 410 |
+
" Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
|
| 411 |
+
" ✅ Response time: 4.39s\n",
|
| 412 |
+
"\n",
|
| 413 |
+
" Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
|
| 414 |
+
" ✅ Response time: 3.74s\n",
|
| 415 |
+
"\n",
|
| 416 |
+
" Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
|
| 417 |
+
" ✅ Response time: 4.07s\n",
|
| 418 |
+
"\n",
|
| 419 |
+
" Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
|
| 420 |
+
" ✅ Response time: 4.20s\n",
|
| 421 |
+
"\n",
|
| 422 |
+
" Question Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 423 |
+
" ✅ Response time: 3.50s\n",
|
| 424 |
+
"\n",
|
| 425 |
+
" 📊 Llama-4-Maverick-17B Summary:\n",
|
| 426 |
+
" Avg Response Time: 3.98s\n",
|
| 427 |
+
" Avg Similarity: 0.0%\n",
|
| 428 |
+
" Avg Citation Score: 84.0%\n",
|
| 429 |
+
" Avg Completeness: 100.0%\n",
|
| 430 |
+
"\n",
|
| 431 |
+
"================================================================================\n",
|
| 432 |
+
"Testing: DeepSeek-R1\n",
|
| 433 |
+
"================================================================================\n",
|
| 434 |
+
"\n",
|
| 435 |
+
" Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
|
| 436 |
+
" ✅ Response time: 10.00s\n",
|
| 437 |
+
"\n",
|
| 438 |
+
" Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
|
| 439 |
+
" ✅ Response time: 10.39s\n",
|
| 440 |
+
"\n",
|
| 441 |
+
" Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
|
| 442 |
+
" ✅ Response time: 10.73s\n",
|
| 443 |
+
"\n",
|
| 444 |
+
" Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
|
| 445 |
+
" ✅ Response time: 12.17s\n",
|
| 446 |
+
"\n",
|
| 447 |
+
" Question Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 448 |
+
" ✅ Response time: 10.72s\n",
|
| 449 |
+
"\n",
|
| 450 |
+
" 📊 DeepSeek-R1 Summary:\n",
|
| 451 |
+
" Avg Response Time: 10.80s\n",
|
| 452 |
+
" Avg Similarity: 0.0%\n",
|
| 453 |
+
" Avg Citation Score: 80.0%\n",
|
| 454 |
+
" Avg Completeness: 67.7%\n",
|
| 455 |
+
"\n",
|
| 456 |
+
"================================================================================\n",
|
| 457 |
+
"Testing: GPT-4.1\n",
|
| 458 |
+
"================================================================================\n",
|
| 459 |
+
"\n",
|
| 460 |
+
" Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
|
| 461 |
+
" ✅ Response time: 6.66s\n",
|
| 462 |
+
"\n",
|
| 463 |
+
" Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
|
| 464 |
+
" ✅ Response time: 5.05s\n",
|
| 465 |
+
"\n",
|
| 466 |
+
" Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
|
| 467 |
+
" ✅ Response time: 7.65s\n",
|
| 468 |
+
"\n",
|
| 469 |
+
" Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
|
| 470 |
+
" ✅ Response time: 6.68s\n",
|
| 471 |
+
"\n",
|
| 472 |
+
" Question Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 473 |
+
" ✅ Response time: 3.69s\n",
|
| 474 |
+
"\n",
|
| 475 |
+
" 📊 GPT-4.1 Summary:\n",
|
| 476 |
+
" Avg Response Time: 5.95s\n",
|
| 477 |
+
" Avg Similarity: 0.0%\n",
|
| 478 |
+
" Avg Citation Score: 84.0%\n",
|
| 479 |
+
" Avg Completeness: 93.5%\n",
|
| 480 |
+
"\n",
|
| 481 |
+
"================================================================================\n",
|
| 482 |
+
"Testing: GPT-5-mini\n",
|
| 483 |
+
"================================================================================\n",
|
| 484 |
+
"\n",
|
| 485 |
+
" Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
|
| 486 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 487 |
+
"\n",
|
| 488 |
+
" Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
|
| 489 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 490 |
+
"\n",
|
| 491 |
+
" Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
|
| 492 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 493 |
+
"\n",
|
| 494 |
+
" Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
|
| 495 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 496 |
+
"\n",
|
| 497 |
+
" Question Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 498 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 499 |
+
"\n",
|
| 500 |
+
"================================================================================\n",
|
| 501 |
+
"Testing: Claude-Sonnet-4.5\n",
|
| 502 |
+
"================================================================================\n",
|
| 503 |
+
"\n",
|
| 504 |
+
" Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
|
| 505 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 506 |
+
"\n",
|
| 507 |
+
" Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
|
| 508 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 509 |
+
"\n",
|
| 510 |
+
" Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
|
| 511 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 512 |
+
"\n",
|
| 513 |
+
" Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
|
| 514 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 515 |
+
"\n",
|
| 516 |
+
" Question Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 517 |
+
" ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 518 |
+
"\n",
|
| 519 |
+
"================================================================================\n",
|
| 520 |
+
"✅ Benchmarking complete!\n",
|
| 521 |
+
"================================================================================\n"
|
| 522 |
+
]
|
| 523 |
+
}
|
| 524 |
+
],
|
| 525 |
+
"source": [
|
| 526 |
+
"# Run benchmark\n",
|
| 527 |
+
"results = []\n",
|
| 528 |
+
"\n",
|
| 529 |
+
"for model_name in MODELS_TO_TEST:\n",
|
| 530 |
+
" print(f\"\\n{'='*80}\")\n",
|
| 531 |
+
" print(f\"Testing: {model_name}\")\n",
|
| 532 |
+
" print(f\"{'='*80}\")\n",
|
| 533 |
+
" \n",
|
| 534 |
+
" model_results = []\n",
|
| 535 |
+
" \n",
|
| 536 |
+
" for example_key, messages in questions.items():\n",
|
| 537 |
+
" # Get the last user message (the actual question)\n",
|
| 538 |
+
" user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
|
| 539 |
+
" query = user_msg['content']\n",
|
| 540 |
+
" \n",
|
| 541 |
+
" print(f\"\\n Question {example_key}: {query[:80]}...\")\n",
|
| 542 |
+
" \n",
|
| 543 |
+
" # Retrieve documents\n",
|
| 544 |
+
" documents = retrieve_documents(query, top_k=3)\n",
|
| 545 |
+
" \n",
|
| 546 |
+
" # Generate answer\n",
|
| 547 |
+
" answer, response_time = generate_answer(model_name, query, documents)\n",
|
| 548 |
+
" \n",
|
| 549 |
+
" if answer.startswith('ERROR'):\n",
|
| 550 |
+
" print(f\" ❌ Failed: {answer}\")\n",
|
| 551 |
+
" continue\n",
|
| 552 |
+
" \n",
|
| 553 |
+
" print(f\" ✅ Response time: {response_time:.2f}s\")\n",
|
| 554 |
+
" \n",
|
| 555 |
+
" # Get expected answer\n",
|
| 556 |
+
" expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
|
| 557 |
+
" \n",
|
| 558 |
+
" # Calculate metrics\n",
|
| 559 |
+
" similarity_metrics = calculate_answer_similarity(expected, answer) if expected else {'CER': 0, 'WER': 0, 'Similarity': 0}\n",
|
| 560 |
+
" citation_metrics = check_citations(answer, documents)\n",
|
| 561 |
+
" completeness_metrics = evaluate_completeness(answer)\n",
|
| 562 |
+
" \n",
|
| 563 |
+
" # Store result\n",
|
| 564 |
+
" result = {\n",
|
| 565 |
+
" 'Model': model_name,\n",
|
| 566 |
+
" 'Question': example_key,\n",
|
| 567 |
+
" 'Query': query[:100],\n",
|
| 568 |
+
" 'Answer': answer[:200] + '...',\n",
|
| 569 |
+
" 'Response_Time': round(response_time, 2),\n",
|
| 570 |
+
" **similarity_metrics,\n",
|
| 571 |
+
" **citation_metrics,\n",
|
| 572 |
+
" **completeness_metrics,\n",
|
| 573 |
+
" 'Open_Source': MODELS[model_name]['open_source'],\n",
|
| 574 |
+
" 'Architecture_Score': MODELS[model_name]['architecture_score']\n",
|
| 575 |
+
" }\n",
|
| 576 |
+
" \n",
|
| 577 |
+
" model_results.append(result)\n",
|
| 578 |
+
" results.append(result)\n",
|
| 579 |
+
" \n",
|
| 580 |
+
" # Show summary for this model\n",
|
| 581 |
+
" if model_results:\n",
|
| 582 |
+
" avg_response_time = sum(r['Response_Time'] for r in model_results) / len(model_results)\n",
|
| 583 |
+
" avg_similarity = sum(r['Similarity'] for r in model_results) / len(model_results)\n",
|
| 584 |
+
" avg_citation = sum(r['Citation_Score'] for r in model_results) / len(model_results)\n",
|
| 585 |
+
" avg_completeness = sum(r['Completeness_Score'] for r in model_results) / len(model_results)\n",
|
| 586 |
+
" \n",
|
| 587 |
+
" print(f\"\\n 📊 {model_name} Summary:\")\n",
|
| 588 |
+
" print(f\" Avg Response Time: {avg_response_time:.2f}s\")\n",
|
| 589 |
+
" print(f\" Avg Similarity: {avg_similarity:.1f}%\")\n",
|
| 590 |
+
" print(f\" Avg Citation Score: {avg_citation:.1f}%\")\n",
|
| 591 |
+
" print(f\" Avg Completeness: {avg_completeness:.1f}%\")\n",
|
| 592 |
+
"\n",
|
| 593 |
+
"print(f\"\\n{'='*80}\")\n",
|
| 594 |
+
"print(\"✅ Benchmarking complete!\")\n",
|
| 595 |
+
"print(f\"{'='*80}\")"
|
| 596 |
+
]
|
| 597 |
+
},
|
| 598 |
+
{
|
| 599 |
+
"cell_type": "markdown",
|
| 600 |
+
"metadata": {},
|
| 601 |
+
"source": [
|
| 602 |
+
"## 7. Aggregate Results and Rankings"
|
| 603 |
+
]
|
| 604 |
+
},
|
| 605 |
+
{
|
| 606 |
+
"cell_type": "code",
|
| 607 |
+
"execution_count": 10,
|
| 608 |
+
"metadata": {},
|
| 609 |
+
"outputs": [
|
| 610 |
+
{
|
| 611 |
+
"name": "stdout",
|
| 612 |
+
"output_type": "stream",
|
| 613 |
+
"text": [
|
| 614 |
+
"\n",
|
| 615 |
+
"====================================================================================================\n",
|
| 616 |
+
"📊 LLM BENCHMARKING RESULTS - MODEL SUMMARY\n",
|
| 617 |
+
"====================================================================================================\n",
|
| 618 |
+
" Response_Time Similarity Citation_Score Completeness_Score CER WER Open_Source Architecture_Score Quality_Score\n",
|
| 619 |
+
"Model \n",
|
| 620 |
+
"Llama-4-Maverick-17B 3.98 0.0 84.0 100.00 330.97 378.42 True High 59.40\n",
|
| 621 |
+
"GPT-4.1 5.95 0.0 84.0 93.54 755.19 780.64 False Medium 57.46\n",
|
| 622 |
+
"DeepSeek-R1 10.80 0.0 80.0 67.73 855.43 992.02 True High 48.32\n",
|
| 623 |
+
"====================================================================================================\n"
|
| 624 |
+
]
|
| 625 |
+
}
|
| 626 |
+
],
|
| 627 |
+
"source": [
|
| 628 |
+
"# Create DataFrame\n",
|
| 629 |
+
"df = pd.DataFrame(results)\n",
|
| 630 |
+
"\n",
|
| 631 |
+
"# Calculate aggregate scores per model\n",
|
| 632 |
+
"model_summary = df.groupby('Model').agg({\n",
|
| 633 |
+
" 'Response_Time': 'mean',\n",
|
| 634 |
+
" 'Similarity': 'mean',\n",
|
| 635 |
+
" 'Citation_Score': 'mean',\n",
|
| 636 |
+
" 'Completeness_Score': 'mean',\n",
|
| 637 |
+
" 'CER': 'mean',\n",
|
| 638 |
+
" 'WER': 'mean',\n",
|
| 639 |
+
" 'Open_Source': 'first',\n",
|
| 640 |
+
" 'Architecture_Score': 'first'\n",
|
| 641 |
+
"}).round(2)\n",
|
| 642 |
+
"\n",
|
| 643 |
+
"# Calculate overall quality score (weighted average)\n",
|
| 644 |
+
"model_summary['Quality_Score'] = (\n",
|
| 645 |
+
" model_summary['Similarity'] * 0.35 + # 35% answer accuracy\n",
|
| 646 |
+
" model_summary['Citation_Score'] * 0.35 + # 35% citation quality\n",
|
| 647 |
+
" model_summary['Completeness_Score'] * 0.30 # 30% completeness\n",
|
| 648 |
+
").round(2)\n",
|
| 649 |
+
"\n",
|
| 650 |
+
"# Sort by Quality Score\n",
|
| 651 |
+
"model_summary = model_summary.sort_values('Quality_Score', ascending=False)\n",
|
| 652 |
+
"\n",
|
| 653 |
+
"# Display summary table\n",
|
| 654 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 655 |
+
"print(\"📊 LLM BENCHMARKING RESULTS - MODEL SUMMARY\")\n",
|
| 656 |
+
"print(\"=\"*100)\n",
|
| 657 |
+
"print(model_summary.to_string())\n",
|
| 658 |
+
"print(\"=\"*100)"
|
| 659 |
+
]
|
| 660 |
+
},
|
| 661 |
+
{
|
| 662 |
+
"cell_type": "markdown",
|
| 663 |
+
"metadata": {},
|
| 664 |
+
"source": [
|
| 665 |
+
"## 8. Visualizations"
|
| 666 |
+
]
|
| 667 |
+
},
|
| 668 |
+
{
|
| 669 |
+
"cell_type": "code",
|
| 670 |
+
"execution_count": null,
|
| 671 |
+
"metadata": {},
|
| 672 |
+
"outputs": [],
|
| 673 |
+
"source": "# Create comprehensive visualization\nimport os\nfrom pathlib import Path\n\n# Create output directory\noutput_dir = Path('output/llm_benchmark')\noutput_dir.mkdir(parents=True, exist_ok=True)\n\nfig, axes = plt.subplots(2, 3, figsize=(18, 12))\n\nmodels = model_summary.index.tolist()\ncolors = sns.color_palette('husl', len(models))\n\n# 1. Overall Quality Score\nax1 = axes[0, 0]\nbars1 = ax1.barh(models, model_summary['Quality_Score'], color=colors)\nax1.set_xlabel('Quality Score (Higher is Better)', fontsize=11)\nax1.set_title('Overall Quality Score\\n(Similarity 35% + Citation 35% + Completeness 30%)', \n fontsize=12, fontweight='bold')\nax1.set_xlim(0, 100)\nfor i, (model, score) in enumerate(zip(models, model_summary['Quality_Score'])):\n ax1.text(score + 1, i, f'{score:.1f}', va='center', fontsize=10, fontweight='bold')\n\n# 2. Answer Similarity (Accuracy)\nax2 = axes[0, 1]\nax2.barh(models, model_summary['Similarity'], color=colors)\nax2.set_xlabel('Similarity to Expected Answer (%)', fontsize=11)\nax2.set_title('Answer Accuracy', fontsize=12, fontweight='bold')\nax2.set_xlim(0, 100)\nfor i, (model, score) in enumerate(zip(models, model_summary['Similarity'])):\n ax2.text(score + 1, i, f'{score:.1f}%', va='center', fontsize=9)\n\n# 3. Citation Quality\nax3 = axes[0, 2]\nax3.barh(models, model_summary['Citation_Score'], color=colors)\nax3.set_xlabel('Citation Score (%)', fontsize=11)\nax3.set_title('Citation Quality\\n(PDF names + Page numbers)', fontsize=12, fontweight='bold')\nax3.set_xlim(0, 100)\nfor i, (model, score) in enumerate(zip(models, model_summary['Citation_Score'])):\n ax3.text(score + 1, i, f'{score:.1f}%', va='center', fontsize=9)\n\n# 4. 
Response Time\nax4 = axes[1, 0]\nax4.barh(models, model_summary['Response_Time'], color=colors)\nax4.set_xlabel('Response Time (seconds - Lower is Better)', fontsize=11)\nax4.set_title('Speed Performance', fontsize=12, fontweight='bold')\nfor i, (model, time) in enumerate(zip(models, model_summary['Response_Time'])):\n ax4.text(time + 0.1, i, f'{time:.2f}s', va='center', fontsize=9)\n\n# 5. Completeness\nax5 = axes[1, 1]\nax5.barh(models, model_summary['Completeness_Score'], color=colors)\nax5.set_xlabel('Completeness Score (%)', fontsize=11)\nax5.set_title('Answer Completeness', fontsize=12, fontweight='bold')\nax5.set_xlim(0, 100)\nfor i, (model, score) in enumerate(zip(models, model_summary['Completeness_Score'])):\n ax5.text(score + 1, i, f'{score:.1f}%', va='center', fontsize=9)\n\n# 6. Error Rates (CER vs WER)\nax6 = axes[1, 2]\nx = range(len(models))\nwidth = 0.35\nax6.bar([i - width/2 for i in x], model_summary['CER'], width, label='CER', alpha=0.8)\nax6.bar([i + width/2 for i in x], model_summary['WER'], width, label='WER', alpha=0.8)\nax6.set_ylabel('Error Rate (% - Lower is Better)', fontsize=11)\nax6.set_title('Error Rates', fontsize=12, fontweight='bold')\nax6.set_xticks(x)\nax6.set_xticklabels(models, rotation=45, ha='right')\nax6.legend()\nax6.grid(axis='y', alpha=0.3)\n\nplt.tight_layout()\nplt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\nplt.show()\n\nprint(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
|
| 674 |
+
},
|
| 675 |
+
{
|
| 676 |
+
"cell_type": "markdown",
|
| 677 |
+
"metadata": {},
|
| 678 |
+
"source": [
|
| 679 |
+
"## 9. Final Rankings and Recommendations"
|
| 680 |
+
]
|
| 681 |
+
},
|
| 682 |
+
{
|
| 683 |
+
"cell_type": "code",
|
| 684 |
+
"execution_count": 12,
|
| 685 |
+
"metadata": {},
|
| 686 |
+
"outputs": [
|
| 687 |
+
{
|
| 688 |
+
"name": "stdout",
|
| 689 |
+
"output_type": "stream",
|
| 690 |
+
"text": [
|
| 691 |
+
"\n",
|
| 692 |
+
"====================================================================================================\n",
|
| 693 |
+
"🏆 FINAL RANKINGS\n",
|
| 694 |
+
"====================================================================================================\n",
|
| 695 |
+
" Rank Quality_Score Similarity Citation_Score Completeness_Score Response_Time Open_Source Architecture_Score\n",
|
| 696 |
+
"Model \n",
|
| 697 |
+
"Llama-4-Maverick-17B 1 59.40 0.0 84.0 100.00 3.98 True High\n",
|
| 698 |
+
"GPT-4.1 2 57.46 0.0 84.0 93.54 5.95 False Medium\n",
|
| 699 |
+
"DeepSeek-R1 3 48.32 0.0 80.0 67.73 10.80 True High\n",
|
| 700 |
+
"====================================================================================================\n",
|
| 701 |
+
"\n",
|
| 702 |
+
"====================================================================================================\n",
|
| 703 |
+
"💡 RECOMMENDATIONS FOR HACKATHON\n",
|
| 704 |
+
"====================================================================================================\n",
|
| 705 |
+
"\n",
|
| 706 |
+
"🥇 Best Overall Quality: Llama-4-Maverick-17B\n",
|
| 707 |
+
" Quality Score: 59.4%\n",
|
| 708 |
+
" Similarity: 0.0%\n",
|
| 709 |
+
" Citation Score: 84.0%\n",
|
| 710 |
+
" Response Time: 3.98s\n",
|
| 711 |
+
" Open Source: True\n",
|
| 712 |
+
" Architecture Score: High\n",
|
| 713 |
+
"\n",
|
| 714 |
+
"🔓 Best Open-Source Model: Llama-4-Maverick-17B\n",
|
| 715 |
+
" Quality Score: 59.4%\n",
|
| 716 |
+
" Architecture Score: High (Better for hackathon!)\n",
|
| 717 |
+
" Response Time: 3.98s\n",
|
| 718 |
+
"\n",
|
| 719 |
+
"⚡ Fastest Model: Llama-4-Maverick-17B\n",
|
| 720 |
+
" Response Time: 3.98s\n",
|
| 721 |
+
" Quality Score: 59.4%\n",
|
| 722 |
+
"\n",
|
| 723 |
+
"====================================================================================================\n",
|
| 724 |
+
"📝 FINAL RECOMMENDATION\n",
|
| 725 |
+
"====================================================================================================\n",
|
| 726 |
+
"\n",
|
| 727 |
+
"Scoring Breakdown:\n",
|
| 728 |
+
" - LLM Quality: 30% of total hackathon score\n",
|
| 729 |
+
" - Architecture: 20% of total hackathon score (open-source preferred!)\n",
|
| 730 |
+
"\n",
|
| 731 |
+
"Best Choice:\n",
|
| 732 |
+
" ✅ Llama-4-Maverick-17B - Best balance of quality and architecture score\n",
|
| 733 |
+
" Only 0.0% quality drop for higher architecture score!\n",
|
| 734 |
+
"====================================================================================================\n"
|
| 735 |
+
]
|
| 736 |
+
}
|
| 737 |
+
],
|
| 738 |
+
"source": [
|
| 739 |
+
"# Create rankings table\n",
|
| 740 |
+
"rankings = model_summary[[\n",
|
| 741 |
+
" 'Quality_Score', 'Similarity', 'Citation_Score', 'Completeness_Score', \n",
|
| 742 |
+
" 'Response_Time', 'Open_Source', 'Architecture_Score'\n",
|
| 743 |
+
"]].copy()\n",
|
| 744 |
+
"\n",
|
| 745 |
+
"rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
|
| 746 |
+
"\n",
|
| 747 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 748 |
+
"print(\"🏆 FINAL RANKINGS\")\n",
|
| 749 |
+
"print(\"=\"*100)\n",
|
| 750 |
+
"print(rankings.to_string())\n",
|
| 751 |
+
"print(\"=\"*100)\n",
|
| 752 |
+
"\n",
|
| 753 |
+
"# Winner analysis\n",
|
| 754 |
+
"best_overall = rankings.index[0]\n",
|
| 755 |
+
"best_open_source = rankings[rankings['Open_Source'] == True].index[0] if any(rankings['Open_Source']) else None\n",
|
| 756 |
+
"fastest = model_summary['Response_Time'].idxmin()\n",
|
| 757 |
+
"\n",
|
| 758 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 759 |
+
"print(\"💡 RECOMMENDATIONS FOR HACKATHON\")\n",
|
| 760 |
+
"print(\"=\"*100)\n",
|
| 761 |
+
"\n",
|
| 762 |
+
"print(f\"\\n🥇 Best Overall Quality: {best_overall}\")\n",
|
| 763 |
+
"print(f\" Quality Score: {model_summary.loc[best_overall, 'Quality_Score']:.1f}%\")\n",
|
| 764 |
+
"print(f\" Similarity: {model_summary.loc[best_overall, 'Similarity']:.1f}%\")\n",
|
| 765 |
+
"print(f\" Citation Score: {model_summary.loc[best_overall, 'Citation_Score']:.1f}%\")\n",
|
| 766 |
+
"print(f\" Response Time: {model_summary.loc[best_overall, 'Response_Time']:.2f}s\")\n",
|
| 767 |
+
"print(f\" Open Source: {model_summary.loc[best_overall, 'Open_Source']}\")\n",
|
| 768 |
+
"print(f\" Architecture Score: {model_summary.loc[best_overall, 'Architecture_Score']}\")\n",
|
| 769 |
+
"\n",
|
| 770 |
+
"if best_open_source:\n",
|
| 771 |
+
" print(f\"\\n🔓 Best Open-Source Model: {best_open_source}\")\n",
|
| 772 |
+
" print(f\" Quality Score: {model_summary.loc[best_open_source, 'Quality_Score']:.1f}%\")\n",
|
| 773 |
+
" print(f\" Architecture Score: {model_summary.loc[best_open_source, 'Architecture_Score']} (Better for hackathon!)\")\n",
|
| 774 |
+
" print(f\" Response Time: {model_summary.loc[best_open_source, 'Response_Time']:.2f}s\")\n",
|
| 775 |
+
"\n",
|
| 776 |
+
"print(f\"\\n⚡ Fastest Model: {fastest}\")\n",
|
| 777 |
+
"print(f\" Response Time: {model_summary.loc[fastest, 'Response_Time']:.2f}s\")\n",
|
| 778 |
+
"print(f\" Quality Score: {model_summary.loc[fastest, 'Quality_Score']:.1f}%\")\n",
|
| 779 |
+
"\n",
|
| 780 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 781 |
+
"print(\"📝 FINAL RECOMMENDATION\")\n",
|
| 782 |
+
"print(\"=\"*100)\n",
|
| 783 |
+
"print(\"\\nScoring Breakdown:\")\n",
|
| 784 |
+
"print(\" - LLM Quality: 30% of total hackathon score\")\n",
|
| 785 |
+
"print(\" - Architecture: 20% of total hackathon score (open-source preferred!)\")\n",
|
| 786 |
+
"print(\"\\nBest Choice:\")\n",
|
| 787 |
+
"if best_open_source and model_summary.loc[best_open_source, 'Quality_Score'] >= model_summary.loc[best_overall, 'Quality_Score'] * 0.9:\n",
|
| 788 |
+
" print(f\" ✅ {best_open_source} - Best balance of quality and architecture score\")\n",
|
| 789 |
+
" print(f\" Only {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality drop for higher architecture score!\")\n",
|
| 790 |
+
"else:\n",
|
| 791 |
+
" print(f\" ✅ {best_overall} - Highest quality, use if quality gap is significant\")\n",
|
| 792 |
+
" if best_open_source:\n",
|
| 793 |
+
" print(f\" ⚠️ Consider {best_open_source} for higher architecture score (trade-off: {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality)\")\n",
|
| 794 |
+
"\n",
|
| 795 |
+
"print(\"=\"*100)"
|
| 796 |
+
]
|
| 797 |
+
},
|
| 798 |
+
{
|
| 799 |
+
"cell_type": "markdown",
|
| 800 |
+
"metadata": {},
|
| 801 |
+
"source": [
|
| 802 |
+
"## 10. Export Results"
|
| 803 |
+
]
|
| 804 |
+
},
|
| 805 |
+
{
|
| 806 |
+
"cell_type": "code",
|
| 807 |
+
"execution_count": null,
|
| 808 |
+
"metadata": {},
|
| 809 |
+
"outputs": [],
|
| 810 |
+
"source": "# Save results\nfrom pathlib import Path\n\noutput_dir = Path('output/llm_benchmark')\noutput_dir.mkdir(parents=True, exist_ok=True)\n\ndf.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\nmodel_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\nrankings.to_csv(output_dir / 'rankings.csv', index=False, encoding='utf-8')\n\nprint(\"\\n✅ Results exported to output/llm_benchmark/:\")\nprint(\" - detailed_results.csv (all questions and answers)\")\nprint(\" - summary.csv (model averages)\")\nprint(\" - rankings.csv (final rankings)\")\nprint(\" - results.png (visualizations)\")"
|
| 811 |
+
},
|
| 812 |
+
{
|
| 813 |
+
"cell_type": "markdown",
|
| 814 |
+
"metadata": {},
|
| 815 |
+
"source": [
|
| 816 |
+
"## 11. Sample Answer Comparison"
|
| 817 |
+
]
|
| 818 |
+
},
|
| 819 |
+
{
|
| 820 |
+
"cell_type": "code",
|
| 821 |
+
"execution_count": 14,
|
| 822 |
+
"metadata": {},
|
| 823 |
+
"outputs": [
|
| 824 |
+
{
|
| 825 |
+
"name": "stdout",
|
| 826 |
+
"output_type": "stream",
|
| 827 |
+
"text": [
|
| 828 |
+
"\n",
|
| 829 |
+
"====================================================================================================\n",
|
| 830 |
+
"📝 SAMPLE ANSWER COMPARISON - Example1\n",
|
| 831 |
+
"====================================================================================================\n",
|
| 832 |
+
"\n",
|
| 833 |
+
"❓ Question: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasiyası tələb olunur?\n",
|
| 834 |
+
"\n",
|
| 835 |
+
"✅ Expected Answer:\n",
|
| 836 |
+
"Daha az quyu ilə daha çox hasilat əldə etmək üçün düzgün seçilmiş texnoloji inteqrasiya (horizontal və çoxtərəfli qazma texnikaları) və qazma məhlullarının səmərəli idarə edilməsi tələb olunur. Bu yanaşma həm iqtisadi, həm də ekoloji baxımdan üstünlük yaradır.\n",
|
| 837 |
+
"\n",
|
| 838 |
+
"----------------------------------------------------------------------------------------------------\n",
|
| 839 |
+
"\n",
|
| 840 |
+
"🤖 Llama-4-Maverick-17B (Quality: 59.4%, Time: 4.39s):\n",
|
| 841 |
+
"Daha az quyu ilə daha çox hasilat əldə etmək üçün düzgün seçilmiş texnoloji inteqrasiya və qazma məhlullarının səmərəli idarəsi əsas amillərdir. Bu, Document 1 (document_11.pdf, Səhifə 3)-də qeyd olun...\n",
|
| 842 |
+
"----------------------------------------------------------------------------------------------------\n",
|
| 843 |
+
"\n",
|
| 844 |
+
"🤖 DeepSeek-R1 (Quality: 48.3%, Time: 10.00s):\n",
|
| 845 |
+
"<think>\n",
|
| 846 |
+
"Okay, let's tackle this question. The user is asking about the main factors that need to be integrated to achieve more production with fewer wells. They provided three documents, so I need to ...\n",
|
| 847 |
+
"----------------------------------------------------------------------------------------------------\n",
|
| 848 |
+
"\n",
|
| 849 |
+
"🤖 GPT-4.1 (Quality: 57.5%, Time: 6.66s):\n",
|
| 850 |
+
"Daha az quyu ilə daha çox hasilat əldə etmək üçün bir neçə əsas amilin inteqrasiyası tələb olunur. Bu amillər aşağıdakı kimi sistematik şəkildə sənəd mənbələrinə istinadla izah olunur:\n",
|
| 851 |
+
"\n",
|
| 852 |
+
"1. **Düzgün se...\n",
|
| 853 |
+
"----------------------------------------------------------------------------------------------------\n",
|
| 854 |
+
"====================================================================================================\n"
|
| 855 |
+
]
|
| 856 |
+
}
|
| 857 |
+
],
|
| 858 |
+
"source": [
|
| 859 |
+
"# Show sample answers for first question\n",
|
| 860 |
+
"sample_question = 'Example1'\n",
|
| 861 |
+
"sample_results = df[df['Question'] == sample_question]\n",
|
| 862 |
+
"\n",
|
| 863 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 864 |
+
"print(f\"📝 SAMPLE ANSWER COMPARISON - {sample_question}\")\n",
|
| 865 |
+
"print(\"=\"*100)\n",
|
| 866 |
+
"\n",
|
| 867 |
+
"print(f\"\\n❓ Question: {questions[sample_question][0]['content']}\")\n",
|
| 868 |
+
"print(f\"\\n✅ Expected Answer:\\n{expected_answers[sample_question]['Answer']}\")\n",
|
| 869 |
+
"print(\"\\n\" + \"-\"*100)\n",
|
| 870 |
+
"\n",
|
| 871 |
+
"for _, row in sample_results.iterrows():\n",
|
| 872 |
+
" print(f\"\\n🤖 {row['Model']} (Quality: {model_summary.loc[row['Model'], 'Quality_Score']:.1f}%, Time: {row['Response_Time']:.2f}s):\")\n",
|
| 873 |
+
" print(f\"{row['Answer']}\")\n",
|
| 874 |
+
" print(\"-\"*100)\n",
|
| 875 |
+
"\n",
|
| 876 |
+
"print(\"=\"*100)"
|
| 877 |
+
]
|
| 878 |
+
}
|
| 879 |
+
],
|
| 880 |
+
"metadata": {
|
| 881 |
+
"kernelspec": {
|
| 882 |
+
"display_name": "venv",
|
| 883 |
+
"language": "python",
|
| 884 |
+
"name": "python3"
|
| 885 |
+
},
|
| 886 |
+
"language_info": {
|
| 887 |
+
"codemirror_mode": {
|
| 888 |
+
"name": "ipython",
|
| 889 |
+
"version": 3
|
| 890 |
+
},
|
| 891 |
+
"file_extension": ".py",
|
| 892 |
+
"mimetype": "text/x-python",
|
| 893 |
+
"name": "python",
|
| 894 |
+
"nbconvert_exporter": "python",
|
| 895 |
+
"pygments_lexer": "ipython3",
|
| 896 |
+
"version": "3.10.12"
|
| 897 |
+
}
|
| 898 |
+
},
|
| 899 |
+
"nbformat": 4,
|
| 900 |
+
"nbformat_minor": 4
|
| 901 |
+
}
|
notebooks/rag_optimization_benchmark.ipynb
ADDED
|
@@ -0,0 +1,1367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# RAG Pipeline Optimization Benchmark\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"**Comprehensive testing of ALL RAG components to maximize LLM Judge score**\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"## What We're Testing:\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"### 1. Embedding Models (Vector Representations)\n",
|
| 14 |
+
"- `BAAI/bge-large-en-v1.5` (Current - 1024 dim, best quality)\n",
|
| 15 |
+
"- `BAAI/bge-base-en-v1.5` (768 dim, faster)\n",
|
| 16 |
+
"- `intfloat/multilingual-e5-large` (1024 dim, multi-language)\n",
|
| 17 |
+
"- `sentence-transformers/paraphrase-multilingual-mpnet-base-v2` (768 dim, multilingual)\n",
|
| 18 |
+
"- `sentence-transformers/all-MiniLM-L6-v2` (384 dim, very fast)\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"### 2. Retrieval Strategies\n",
|
| 21 |
+
"- **Top-K**: Test 1, 3, 5, 10 documents\n",
|
| 22 |
+
"- **MMR** (Maximal Marginal Relevance): Diversity vs relevance trade-off\n",
|
| 23 |
+
"- **Similarity Threshold**: Filter low-relevance docs\n",
|
| 24 |
+
"- **Reranking**: Use cross-encoder to rerank results\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"### 3. Chunking Strategies (Already in Vector DB, but we'll compare)\n",
|
| 27 |
+
"- Chunk size: 256, 512, 600 (current), 1000 tokens\n",
|
| 28 |
+
"- Overlap: 0, 50, 100 (current), 200 chars\n",
|
| 29 |
+
"\n",
|
| 30 |
+
"### 4. LLM Models\n",
|
| 31 |
+
"- Llama-4-Maverick-17B (open-source)\n",
|
| 32 |
+
"- DeepSeek-R1 (reasoning)\n",
|
| 33 |
+
"- GPT-4.1, GPT-5, GPT-5-mini\n",
|
| 34 |
+
"- Claude-Sonnet-4.5\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"### 5. Prompting Techniques\n",
|
| 37 |
+
"- **Baseline**: Simple context + question\n",
|
| 38 |
+
"- **Citation-focused**: Emphasize source references\n",
|
| 39 |
+
"- **Step-by-step**: Chain-of-thought reasoning\n",
|
| 40 |
+
"- **Few-shot**: Include example Q&A\n",
|
| 41 |
+
"\n",
|
| 42 |
+
"## LLM Judge Evaluation Criteria:\n",
|
| 43 |
+
"- **Accuracy** (35%): Answer correctness\n",
|
| 44 |
+
"- **Relevance** (35%): Citation quality and relevance\n",
|
| 45 |
+
"- **Completeness** (30%): Thorough answers"
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"cell_type": "code",
|
| 50 |
+
"execution_count": 5,
|
| 51 |
+
"metadata": {},
|
| 52 |
+
"outputs": [],
|
| 53 |
+
"source": [
|
| 54 |
+
"# !pip install openai pinecone-client sentence-transformers rank-bm25 python-dotenv pandas matplotlib seaborn jiwer"
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"execution_count": 6,
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"outputs": [
|
| 62 |
+
{
|
| 63 |
+
"name": "stderr",
|
| 64 |
+
"output_type": "stream",
|
| 65 |
+
"text": [
|
| 66 |
+
"/Users/ismatsamadov/SOCAR_Hackathon/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 67 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"name": "stdout",
|
| 72 |
+
"output_type": "stream",
|
| 73 |
+
"text": [
|
| 74 |
+
"✅ Libraries loaded\n"
|
| 75 |
+
]
|
| 76 |
+
}
|
| 77 |
+
],
|
| 78 |
+
"source": [
|
| 79 |
+
"import os\n",
|
| 80 |
+
"import json\n",
|
| 81 |
+
"import time\n",
|
| 82 |
+
"import re\n",
|
| 83 |
+
"from typing import Dict, List, Tuple, Any\n",
|
| 84 |
+
"from collections import defaultdict\n",
|
| 85 |
+
"from dotenv import load_dotenv\n",
|
| 86 |
+
"\n",
|
| 87 |
+
"import pandas as pd\n",
|
| 88 |
+
"import matplotlib.pyplot as plt\n",
|
| 89 |
+
"import seaborn as sns\n",
|
| 90 |
+
"from openai import AzureOpenAI\n",
|
| 91 |
+
"from pinecone import Pinecone\n",
|
| 92 |
+
"from sentence_transformers import SentenceTransformer, CrossEncoder\n",
|
| 93 |
+
"from jiwer import wer, cer\n",
|
| 94 |
+
"import numpy as np\n",
|
| 95 |
+
"\n",
|
| 96 |
+
"load_dotenv()\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"sns.set_style('whitegrid')\n",
|
| 99 |
+
"plt.rcParams['figure.figsize'] = (16, 10)\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"print(\"✅ Libraries loaded\")"
|
| 102 |
+
]
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"cell_type": "markdown",
|
| 106 |
+
"metadata": {},
|
| 107 |
+
"source": [
|
| 108 |
+
"## 1. Load Test Questions and Expected Answers"
|
| 109 |
+
]
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"cell_type": "code",
|
| 113 |
+
"execution_count": 7,
|
| 114 |
+
"metadata": {},
|
| 115 |
+
"outputs": [
|
| 116 |
+
{
|
| 117 |
+
"name": "stdout",
|
| 118 |
+
"output_type": "stream",
|
| 119 |
+
"text": [
|
| 120 |
+
"✅ Loaded 5 test questions\n",
|
| 121 |
+
" - Example1\n",
|
| 122 |
+
" - Example2\n",
|
| 123 |
+
" - Example3\n",
|
| 124 |
+
" - Example4\n",
|
| 125 |
+
" - Example5\n"
|
| 126 |
+
]
|
| 127 |
+
}
|
| 128 |
+
],
|
| 129 |
+
"source": [
|
| 130 |
+
"# Load test cases\n",
|
| 131 |
+
"with open('docs/sample_questions.json', 'r', encoding='utf-8') as f:\n",
|
| 132 |
+
" questions = json.load(f)\n",
|
| 133 |
+
"\n",
|
| 134 |
+
"with open('docs/sample_answers.json', 'r', encoding='utf-8') as f:\n",
|
| 135 |
+
" expected_answers = json.load(f)\n",
|
| 136 |
+
"\n",
|
| 137 |
+
"print(f\"✅ Loaded {len(questions)} test questions\")\n",
|
| 138 |
+
"for key in questions.keys():\n",
|
| 139 |
+
" print(f\" - {key}\")"
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"cell_type": "markdown",
|
| 144 |
+
"metadata": {},
|
| 145 |
+
"source": [
|
| 146 |
+
"## 2. Initialize Vector Database"
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"cell_type": "code",
|
| 151 |
+
"execution_count": 8,
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"outputs": [
|
| 154 |
+
{
|
| 155 |
+
"name": "stdout",
|
| 156 |
+
"output_type": "stream",
|
| 157 |
+
"text": [
|
| 158 |
+
"✅ Vector DB connected\n",
|
| 159 |
+
" Total vectors: 1300\n",
|
| 160 |
+
" Dimensions: 1024\n"
|
| 161 |
+
]
|
| 162 |
+
}
|
| 163 |
+
],
|
| 164 |
+
"source": [
|
| 165 |
+
"# Connect to Pinecone\n",
|
| 166 |
+
"pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
|
| 167 |
+
"index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"stats = index.describe_index_stats()\n",
|
| 170 |
+
"print(f\"✅ Vector DB connected\")\n",
|
| 171 |
+
"print(f\" Total vectors: {stats['total_vector_count']}\")\n",
|
| 172 |
+
"print(f\" Dimensions: {stats['dimension']}\")"
|
| 173 |
+
]
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"cell_type": "markdown",
|
| 177 |
+
"metadata": {},
|
| 178 |
+
"source": [
|
| 179 |
+
"## 3. Embedding Models Configuration"
|
| 180 |
+
]
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"cell_type": "code",
|
| 184 |
+
"execution_count": 9,
|
| 185 |
+
"metadata": {},
|
| 186 |
+
"outputs": [
|
| 187 |
+
{
|
| 188 |
+
"name": "stdout",
|
| 189 |
+
"output_type": "stream",
|
| 190 |
+
"text": [
|
| 191 |
+
"Loading bge-large-en...\n",
|
| 192 |
+
" ✅ BAAI/bge-large-en-v1.5\n",
|
| 193 |
+
"Loading multilingual-e5-large...\n",
|
| 194 |
+
" ✅ intfloat/multilingual-e5-large\n",
|
| 195 |
+
"\n",
|
| 196 |
+
"✅ Loaded 2 embedding models\n"
|
| 197 |
+
]
|
| 198 |
+
}
|
| 199 |
+
],
|
| 200 |
+
"source": [
|
| 201 |
+
"EMBEDDING_MODELS = {\n",
|
| 202 |
+
" 'bge-large-en': {\n",
|
| 203 |
+
" 'name': 'BAAI/bge-large-en-v1.5',\n",
|
| 204 |
+
" 'dimensions': 1024,\n",
|
| 205 |
+
" 'notes': 'Current model - best quality'\n",
|
| 206 |
+
" },\n",
|
| 207 |
+
" 'bge-base-en': {\n",
|
| 208 |
+
" 'name': 'BAAI/bge-base-en-v1.5',\n",
|
| 209 |
+
" 'dimensions': 768,\n",
|
| 210 |
+
" 'notes': 'Faster, slightly lower quality'\n",
|
| 211 |
+
" },\n",
|
| 212 |
+
" 'multilingual-e5-large': {\n",
|
| 213 |
+
" 'name': 'intfloat/multilingual-e5-large',\n",
|
| 214 |
+
" 'dimensions': 1024,\n",
|
| 215 |
+
" 'notes': 'Multi-language optimized'\n",
|
| 216 |
+
" },\n",
|
| 217 |
+
" 'paraphrase-multilingual': {\n",
|
| 218 |
+
" 'name': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',\n",
|
| 219 |
+
" 'dimensions': 768,\n",
|
| 220 |
+
" 'notes': 'Good for Azerbaijani/Russian'\n",
|
| 221 |
+
" },\n",
|
| 222 |
+
" 'all-MiniLM-L6': {\n",
|
| 223 |
+
" 'name': 'sentence-transformers/all-MiniLM-L6-v2',\n",
|
| 224 |
+
" 'dimensions': 384,\n",
|
| 225 |
+
" 'notes': 'Very fast, lower quality'\n",
|
| 226 |
+
" }\n",
|
| 227 |
+
"}\n",
|
| 228 |
+
"\n",
|
| 229 |
+
"# Load embedding models (only test 1024-dim models for existing Pinecone index)\n",
|
| 230 |
+
"EMBEDDING_MODELS_TO_TEST = [\n",
|
| 231 |
+
" 'bge-large-en', # Current\n",
|
| 232 |
+
" 'multilingual-e5-large', # Alternative with same dims\n",
|
| 233 |
+
"]\n",
|
| 234 |
+
"\n",
|
| 235 |
+
"embedding_cache = {}\n",
|
| 236 |
+
"\n",
|
| 237 |
+
"for model_key in EMBEDDING_MODELS_TO_TEST:\n",
|
| 238 |
+
" model_name = EMBEDDING_MODELS[model_key]['name']\n",
|
| 239 |
+
" print(f\"Loading {model_key}...\")\n",
|
| 240 |
+
" embedding_cache[model_key] = SentenceTransformer(model_name)\n",
|
| 241 |
+
" print(f\" ✅ {model_name}\")\n",
|
| 242 |
+
"\n",
|
| 243 |
+
"print(f\"\\n✅ Loaded {len(embedding_cache)} embedding models\")"
|
| 244 |
+
]
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"cell_type": "markdown",
|
| 248 |
+
"metadata": {},
|
| 249 |
+
"source": [
|
| 250 |
+
"## 4. Retrieval Strategies"
|
| 251 |
+
]
|
| 252 |
+
},
|
| 253 |
+
{
|
| 254 |
+
"cell_type": "code",
|
| 255 |
+
"execution_count": 10,
|
| 256 |
+
"metadata": {},
|
| 257 |
+
"outputs": [
|
| 258 |
+
{
|
| 259 |
+
"name": "stdout",
|
| 260 |
+
"output_type": "stream",
|
| 261 |
+
"text": [
|
| 262 |
+
"✅ Configured 7 retrieval strategies\n"
|
| 263 |
+
]
|
| 264 |
+
}
|
| 265 |
+
],
|
| 266 |
+
"source": [
|
| 267 |
+
"def retrieve_vanilla(query: str, embed_model: SentenceTransformer, top_k: int = 3) -> List[Dict]:\n",
|
| 268 |
+
" \"\"\"\n",
|
| 269 |
+
" Vanilla retrieval: Simple top-k vector search.\n",
|
| 270 |
+
" \"\"\"\n",
|
| 271 |
+
" query_embedding = embed_model.encode(query).tolist()\n",
|
| 272 |
+
" results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)\n",
|
| 273 |
+
" \n",
|
| 274 |
+
" documents = []\n",
|
| 275 |
+
" for match in results['matches']:\n",
|
| 276 |
+
" documents.append({\n",
|
| 277 |
+
" 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
|
| 278 |
+
" 'page_number': match['metadata'].get('page_number', 0),\n",
|
| 279 |
+
" 'content': match['metadata'].get('text', ''),\n",
|
| 280 |
+
" 'score': match.get('score', 0.0)\n",
|
| 281 |
+
" })\n",
|
| 282 |
+
" \n",
|
| 283 |
+
" return documents\n",
|
| 284 |
+
"\n",
|
| 285 |
+
"\n",
|
| 286 |
+
"def retrieve_with_threshold(query: str, embed_model: SentenceTransformer, \n",
|
| 287 |
+
" top_k: int = 10, threshold: float = 0.7) -> List[Dict]:\n",
|
| 288 |
+
" \"\"\"\n",
|
| 289 |
+
" Retrieve with similarity threshold filtering.\n",
|
| 290 |
+
" \"\"\"\n",
|
| 291 |
+
" docs = retrieve_vanilla(query, embed_model, top_k=top_k)\n",
|
| 292 |
+
" return [doc for doc in docs if doc['score'] >= threshold]\n",
|
| 293 |
+
"\n",
|
| 294 |
+
"\n",
|
| 295 |
+
"def retrieve_with_mmr(query: str, embed_model: SentenceTransformer, \n",
|
| 296 |
+
" top_k: int = 3, lambda_param: float = 0.5, fetch_k: int = 20) -> List[Dict]:\n",
|
| 297 |
+
" \"\"\"\n",
|
| 298 |
+
" MMR (Maximal Marginal Relevance) for diversity.\n",
|
| 299 |
+
" lambda=1 → pure relevance, lambda=0 → pure diversity\n",
|
| 300 |
+
" \"\"\"\n",
|
| 301 |
+
" # Fetch more candidates\n",
|
| 302 |
+
" candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n",
|
| 303 |
+
" \n",
|
| 304 |
+
" if len(candidates) <= top_k:\n",
|
| 305 |
+
" return candidates[:top_k]\n",
|
| 306 |
+
" \n",
|
| 307 |
+
" # Query embedding\n",
|
| 308 |
+
" query_emb = embed_model.encode(query)\n",
|
| 309 |
+
" \n",
|
| 310 |
+
" # Get embeddings for candidates\n",
|
| 311 |
+
" candidate_texts = [doc['content'] for doc in candidates]\n",
|
| 312 |
+
" candidate_embs = embed_model.encode(candidate_texts)\n",
|
| 313 |
+
" \n",
|
| 314 |
+
" # MMR algorithm\n",
|
| 315 |
+
" selected = []\n",
|
| 316 |
+
" selected_embs = []\n",
|
| 317 |
+
" \n",
|
| 318 |
+
" for _ in range(min(top_k, len(candidates))):\n",
|
| 319 |
+
" mmr_scores = []\n",
|
| 320 |
+
" \n",
|
| 321 |
+
" for i, (doc, emb) in enumerate(zip(candidates, candidate_embs)):\n",
|
| 322 |
+
" if i in [candidates.index(s) for s in selected]:\n",
|
| 323 |
+
" mmr_scores.append(-float('inf'))\n",
|
| 324 |
+
" continue\n",
|
| 325 |
+
" \n",
|
| 326 |
+
" # Relevance to query\n",
|
| 327 |
+
" relevance = np.dot(query_emb, emb) / (np.linalg.norm(query_emb) * np.linalg.norm(emb))\n",
|
| 328 |
+
" \n",
|
| 329 |
+
" # Max similarity to already selected\n",
|
| 330 |
+
" if selected_embs:\n",
|
| 331 |
+
" similarities = [np.dot(emb, s_emb) / (np.linalg.norm(emb) * np.linalg.norm(s_emb)) \n",
|
| 332 |
+
" for s_emb in selected_embs]\n",
|
| 333 |
+
" max_sim = max(similarities)\n",
|
| 334 |
+
" else:\n",
|
| 335 |
+
" max_sim = 0\n",
|
| 336 |
+
" \n",
|
| 337 |
+
" # MMR score\n",
|
| 338 |
+
" mmr = lambda_param * relevance - (1 - lambda_param) * max_sim\n",
|
| 339 |
+
" mmr_scores.append(mmr)\n",
|
| 340 |
+
" \n",
|
| 341 |
+
" # Select best MMR score\n",
|
| 342 |
+
" best_idx = np.argmax(mmr_scores)\n",
|
| 343 |
+
" selected.append(candidates[best_idx])\n",
|
| 344 |
+
" selected_embs.append(candidate_embs[best_idx])\n",
|
| 345 |
+
" \n",
|
| 346 |
+
" return selected\n",
|
| 347 |
+
"\n",
|
| 348 |
+
"\n",
|
| 349 |
+
"def retrieve_with_reranking(query: str, embed_model: SentenceTransformer, \n",
|
| 350 |
+
" top_k: int = 3, fetch_k: int = 20) -> List[Dict]:\n",
|
| 351 |
+
" \"\"\"\n",
|
| 352 |
+
" Two-stage: retrieve with embeddings, rerank with cross-encoder.\n",
|
| 353 |
+
" \"\"\"\n",
|
| 354 |
+
" # Stage 1: Retrieve candidates\n",
|
| 355 |
+
" candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n",
|
| 356 |
+
" \n",
|
| 357 |
+
" if len(candidates) <= top_k:\n",
|
| 358 |
+
" return candidates[:top_k]\n",
|
| 359 |
+
" \n",
|
| 360 |
+
" # Stage 2: Rerank with cross-encoder\n",
|
| 361 |
+
" reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n",
|
| 362 |
+
" \n",
|
| 363 |
+
" pairs = [[query, doc['content']] for doc in candidates]\n",
|
| 364 |
+
" scores = reranker.predict(pairs)\n",
|
| 365 |
+
" \n",
|
| 366 |
+
" # Sort by reranker score\n",
|
| 367 |
+
" scored_docs = [(doc, score) for doc, score in zip(candidates, scores)]\n",
|
| 368 |
+
" scored_docs.sort(key=lambda x: x[1], reverse=True)\n",
|
| 369 |
+
" \n",
|
| 370 |
+
" # Update scores and return top-k\n",
|
| 371 |
+
" reranked = []\n",
|
| 372 |
+
" for doc, score in scored_docs[:top_k]:\n",
|
| 373 |
+
" doc['rerank_score'] = float(score)\n",
|
| 374 |
+
" reranked.append(doc)\n",
|
| 375 |
+
" \n",
|
| 376 |
+
" return reranked\n",
|
| 377 |
+
"\n",
|
| 378 |
+
"\n",
|
| 379 |
+
"RETRIEVAL_STRATEGIES = {\n",
|
| 380 |
+
" 'vanilla_k3': {'func': retrieve_vanilla, 'params': {'top_k': 3}, 'notes': 'Current setup'},\n",
|
| 381 |
+
" 'vanilla_k5': {'func': retrieve_vanilla, 'params': {'top_k': 5}, 'notes': 'More context'},\n",
|
| 382 |
+
" 'vanilla_k10': {'func': retrieve_vanilla, 'params': {'top_k': 10}, 'notes': 'Maximum context'},\n",
|
| 383 |
+
" 'threshold_0.7': {'func': retrieve_with_threshold, 'params': {'top_k': 10, 'threshold': 0.7}, 'notes': 'Quality filter'},\n",
|
| 384 |
+
" 'mmr_balanced': {'func': retrieve_with_mmr, 'params': {'top_k': 3, 'lambda_param': 0.5}, 'notes': 'Balance diversity'},\n",
|
| 385 |
+
" 'mmr_diverse': {'func': retrieve_with_mmr, 'params': {'top_k': 3, 'lambda_param': 0.3}, 'notes': 'More diversity'},\n",
|
| 386 |
+
" 'reranked_k3': {'func': retrieve_with_reranking, 'params': {'top_k': 3, 'fetch_k': 20}, 'notes': 'Two-stage rerank'},\n",
|
| 387 |
+
"}\n",
|
| 388 |
+
"\n",
|
| 389 |
+
"print(f\"✅ Configured {len(RETRIEVAL_STRATEGIES)} retrieval strategies\")"
|
| 390 |
+
]
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"cell_type": "markdown",
|
| 394 |
+
"metadata": {},
|
| 395 |
+
"source": [
|
| 396 |
+
"## 5. LLM Models and Prompting Strategies"
|
| 397 |
+
]
|
| 398 |
+
},
|
| 399 |
+
{
|
| 400 |
+
"cell_type": "code",
|
| 401 |
+
"execution_count": 11,
|
| 402 |
+
"metadata": {},
|
| 403 |
+
"outputs": [
|
| 404 |
+
{
|
| 405 |
+
"name": "stdout",
|
| 406 |
+
"output_type": "stream",
|
| 407 |
+
"text": [
|
| 408 |
+
"✅ Configured 5 LLM models\n",
|
| 409 |
+
"✅ Configured 4 prompting strategies\n"
|
| 410 |
+
]
|
| 411 |
+
}
|
| 412 |
+
],
|
| 413 |
+
"source": [
|
| 414 |
+
"# Initialize Azure OpenAI\n",
|
| 415 |
+
"azure_client = AzureOpenAI(\n",
|
| 416 |
+
" api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
|
| 417 |
+
" api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
|
| 418 |
+
" azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
|
| 419 |
+
")\n",
|
| 420 |
+
"\n",
|
| 421 |
+
"LLM_MODELS = {\n",
|
| 422 |
+
" 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
|
| 423 |
+
" 'DeepSeek-R1': 'DeepSeek-R1',\n",
|
| 424 |
+
" 'GPT-4.1': 'gpt-4.1',\n",
|
| 425 |
+
" 'GPT-5-mini': 'gpt-5-mini',\n",
|
| 426 |
+
" 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n",
|
| 427 |
+
"}\n",
|
| 428 |
+
"\n",
|
| 429 |
+
"# Prompting strategies\n",
|
| 430 |
+
"PROMPTING_STRATEGIES = {\n",
|
| 431 |
+
" 'baseline': \"\"\"\n",
|
| 432 |
+
"Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə köməkçisiniz.\n",
|
| 433 |
+
"\n",
|
| 434 |
+
"Kontekst:\n",
|
| 435 |
+
"{context}\n",
|
| 436 |
+
"\n",
|
| 437 |
+
"Sual: {query}\n",
|
| 438 |
+
"\n",
|
| 439 |
+
"Kontekstə əsaslanaraq cavab verin.\n",
|
| 440 |
+
"\"\"\",\n",
|
| 441 |
+
" \n",
|
| 442 |
+
" 'citation_focused': \"\"\"\n",
|
| 443 |
+
"Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.\n",
|
| 444 |
+
"\n",
|
| 445 |
+
"ÖNƏMLİ: Hər bir faktı mütləq mənbə ilə təsdiqləyin (PDF adı və səhifə nömrəsi).\n",
|
| 446 |
+
"\n",
|
| 447 |
+
"Kontekst:\n",
|
| 448 |
+
"{context}\n",
|
| 449 |
+
"\n",
|
| 450 |
+
"Sual: {query}\n",
|
| 451 |
+
"\n",
|
| 452 |
+
"Cavab verərkən:\n",
|
| 453 |
+
"1. Dəqiq faktlar yazın\n",
|
| 454 |
+
"2. Hər faktı mənbə ilə göstərin: (PDF: fayl_adı.pdf, Səhifə: X)\n",
|
| 455 |
+
"3. Kontekstdə olmayan məlumat əlavə etməyin\n",
|
| 456 |
+
"\"\"\",\n",
|
| 457 |
+
" \n",
|
| 458 |
+
" 'step_by_step': \"\"\"\n",
|
| 459 |
+
"Siz SOCAR-ın tarixi sənədlər üzrə analitik köməkçisisiniz.\n",
|
| 460 |
+
"\n",
|
| 461 |
+
"Kontekst:\n",
|
| 462 |
+
"{context}\n",
|
| 463 |
+
"\n",
|
| 464 |
+
"Sual: {query}\n",
|
| 465 |
+
"\n",
|
| 466 |
+
"Addım-addım cavab verin:\n",
|
| 467 |
+
"1. Əvvəlcə kontekstdən əlaqəli məlumatları müəyyənləşdirin\n",
|
| 468 |
+
"2. Bu məlumatları təhlil edin\n",
|
| 469 |
+
"3. Nəticəni mənbələr ilə birlikdə təqdim edin\n",
|
| 470 |
+
"\"\"\",\n",
|
| 471 |
+
" \n",
|
| 472 |
+
" 'few_shot': \"\"\"\n",
|
| 473 |
+
"Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.\n",
|
| 474 |
+
"\n",
|
| 475 |
+
"Nümunə:\n",
|
| 476 |
+
"Sual: \"Palçıq vulkanlarının təsir radiusu nə qədərdir?\"\n",
|
| 477 |
+
"Cavab: \"Sahə müşahidələri və modelləşdirmə göstərir ki, palçıq vulkanlarının təsir radiusu təqribən 10 km-dir (PDF: document_06.pdf, Səhifə: 5).\"\n",
|
| 478 |
+
"\n",
|
| 479 |
+
"Kontekst:\n",
|
| 480 |
+
"{context}\n",
|
| 481 |
+
"\n",
|
| 482 |
+
"Sual: {query}\n",
|
| 483 |
+
"\n",
|
| 484 |
+
"Yuxarıdakı nümunə kimi cavab verin - dəqiq, qısa, mənbə ilə.\n",
|
| 485 |
+
"\"\"\"\n",
|
| 486 |
+
"}\n",
|
| 487 |
+
"\n",
|
| 488 |
+
"print(f\"✅ Configured {len(LLM_MODELS)} LLM models\")\n",
|
| 489 |
+
"print(f\"✅ Configured {len(PROMPTING_STRATEGIES)} prompting strategies\")"
|
| 490 |
+
]
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"cell_type": "code",
|
| 494 |
+
"execution_count": null,
|
| 495 |
+
"metadata": {},
|
| 496 |
+
"outputs": [],
|
| 497 |
+
"source": "def generate_answer(llm_model: str, query: str, documents: List[Dict], \n prompt_strategy: str = 'baseline',\n temperature: float = 0.2) -> Tuple[str, float]:\n \"\"\"\n Generate answer using LLM with specified prompting strategy.\n \"\"\"\n # Build context\n context_parts = []\n for i, doc in enumerate(documents, 1):\n context_parts.append(\n f\"Sənəd {i} (Mənbə: {doc['pdf_name']}, Səhifə {doc['page_number']}):\\n{doc['content']}\"\n )\n context = \"\\n\\n\".join(context_parts)\n \n # Get prompt template\n prompt_template = PROMPTING_STRATEGIES[prompt_strategy]\n prompt = prompt_template.format(context=context, query=query)\n \n try:\n start_time = time.time()\n \n deployment = LLM_MODELS[llm_model]\n \n # GPT-5 models use max_completion_tokens, others use max_tokens\n if deployment.startswith('gpt-5'):\n response = azure_client.chat.completions.create(\n model=deployment,\n messages=[{\"role\": \"user\", \"content\": prompt}],\n temperature=temperature,\n max_completion_tokens=1000\n )\n else:\n response = azure_client.chat.completions.create(\n model=deployment,\n messages=[{\"role\": \"user\", \"content\": prompt}],\n temperature=temperature,\n max_tokens=1000\n )\n \n elapsed = time.time() - start_time\n answer = response.choices[0].message.content\n \n return answer, elapsed\n \n except Exception as e:\n return f\"ERROR: {str(e)}\", 0.0\n\nprint(\"✅ LLM generation function ready\")"
|
| 498 |
+
},
|
| 499 |
+
{
|
| 500 |
+
"cell_type": "markdown",
|
| 501 |
+
"metadata": {},
|
| 502 |
+
"source": [
|
| 503 |
+
"## 6. Evaluation Metrics"
|
| 504 |
+
]
|
| 505 |
+
},
|
| 506 |
+
{
|
| 507 |
+
"cell_type": "code",
|
| 508 |
+
"execution_count": 13,
|
| 509 |
+
"metadata": {},
|
| 510 |
+
"outputs": [
|
| 511 |
+
{
|
| 512 |
+
"name": "stdout",
|
| 513 |
+
"output_type": "stream",
|
| 514 |
+
"text": [
|
| 515 |
+
"✅ Evaluation metrics ready\n"
|
| 516 |
+
]
|
| 517 |
+
}
|
| 518 |
+
],
|
| 519 |
+
"source": [
|
| 520 |
+
"def normalize_text(text: str) -> str:\n",
|
| 521 |
+
" text = text.lower().strip()\n",
|
| 522 |
+
" text = re.sub(r'\\s+', ' ', text)\n",
|
| 523 |
+
" return text\n",
|
| 524 |
+
"\n",
|
| 525 |
+
"def calculate_answer_quality(reference: str, hypothesis: str) -> Dict[str, float]:\n",
|
| 526 |
+
" \"\"\"Accuracy metrics.\"\"\"\n",
|
| 527 |
+
" ref_norm = normalize_text(reference)\n",
|
| 528 |
+
" hyp_norm = normalize_text(hypothesis)\n",
|
| 529 |
+
" \n",
|
| 530 |
+
" cer_score = cer(ref_norm, hyp_norm) * 100\n",
|
| 531 |
+
" wer_score = wer(ref_norm, hyp_norm) * 100\n",
|
| 532 |
+
" similarity = max(0, 100 - wer_score)\n",
|
| 533 |
+
" \n",
|
| 534 |
+
" return {\n",
|
| 535 |
+
" 'Accuracy_Score': round(similarity, 2)\n",
|
| 536 |
+
" }\n",
|
| 537 |
+
"\n",
|
| 538 |
+
"def evaluate_citation_quality(answer: str, documents: List[Dict]) -> Dict[str, float]:\n",
|
| 539 |
+
" \"\"\"Relevance - citation quality.\"\"\"\n",
|
| 540 |
+
" pdf_names = [doc['pdf_name'].replace('.pdf', '') for doc in documents]\n",
|
| 541 |
+
" page_numbers = [str(doc['page_number']) for doc in documents]\n",
|
| 542 |
+
" \n",
|
| 543 |
+
" cited_pdfs = sum(1 for pdf in pdf_names if pdf in answer)\n",
|
| 544 |
+
" cited_pages = sum(1 for page in page_numbers if page in answer)\n",
|
| 545 |
+
" \n",
|
| 546 |
+
" citation_keywords = ['mənbə', 'sənəd', 'səhifə', 'pdf', 'document', 'page']\n",
|
| 547 |
+
" has_citation_format = any(kw in answer.lower() for kw in citation_keywords)\n",
|
| 548 |
+
" \n",
|
| 549 |
+
" citation_score = (\n",
|
| 550 |
+
" (cited_pdfs / len(pdf_names) * 40) +\n",
|
| 551 |
+
" (cited_pages / len(page_numbers) * 40) +\n",
|
| 552 |
+
" (20 if has_citation_format else 0)\n",
|
| 553 |
+
" )\n",
|
| 554 |
+
" \n",
|
| 555 |
+
" return {\n",
|
| 556 |
+
" 'Citation_Score': round(citation_score, 2),\n",
|
| 557 |
+
" 'Cited_PDFs': cited_pdfs,\n",
|
| 558 |
+
" 'Cited_Pages': cited_pages\n",
|
| 559 |
+
" }\n",
|
| 560 |
+
"\n",
|
| 561 |
+
"def evaluate_retrieval_quality(query: str, documents: List[Dict], expected_answer: str) -> Dict[str, float]:\n",
|
| 562 |
+
" \"\"\"Measure if retrieved docs are relevant to answer.\"\"\"\n",
|
| 563 |
+
" if not documents or not expected_answer:\n",
|
| 564 |
+
" return {'Retrieval_Relevance': 0.0}\n",
|
| 565 |
+
" \n",
|
| 566 |
+
" # Simple heuristic: check if expected answer words appear in retrieved docs\n",
|
| 567 |
+
" expected_words = set(normalize_text(expected_answer).split())\n",
|
| 568 |
+
" retrieved_text = ' '.join([doc['content'] for doc in documents])\n",
|
| 569 |
+
" retrieved_words = set(normalize_text(retrieved_text).split())\n",
|
| 570 |
+
" \n",
|
| 571 |
+
" overlap = len(expected_words & retrieved_words) / len(expected_words) if expected_words else 0\n",
|
| 572 |
+
" \n",
|
| 573 |
+
" return {\n",
|
| 574 |
+
" 'Retrieval_Relevance': round(overlap * 100, 2)\n",
|
| 575 |
+
" }\n",
|
| 576 |
+
"\n",
|
| 577 |
+
"def evaluate_completeness(answer: str) -> Dict[str, float]:\n",
|
| 578 |
+
" \"\"\"Completeness metrics.\"\"\"\n",
|
| 579 |
+
" word_count = len(answer.split())\n",
|
| 580 |
+
" \n",
|
| 581 |
+
" if word_count < 20:\n",
|
| 582 |
+
" completeness = (word_count / 20) * 100\n",
|
| 583 |
+
" elif word_count > 200:\n",
|
| 584 |
+
" completeness = 100 - ((word_count - 200) / 200 * 20)\n",
|
| 585 |
+
" else:\n",
|
| 586 |
+
" completeness = 100\n",
|
| 587 |
+
" \n",
|
| 588 |
+
" return {\n",
|
| 589 |
+
" 'Completeness_Score': round(max(0, completeness), 2),\n",
|
| 590 |
+
" 'Word_Count': word_count\n",
|
| 591 |
+
" }\n",
|
| 592 |
+
"\n",
|
| 593 |
+
"def calculate_llm_judge_score(accuracy: float, citation: float, completeness: float) -> float:\n",
|
| 594 |
+
" \"\"\"Overall LLM Judge score (weighted).\"\"\"\n",
|
| 595 |
+
" return round(\n",
|
| 596 |
+
" accuracy * 0.35 +\n",
|
| 597 |
+
" citation * 0.35 +\n",
|
| 598 |
+
" completeness * 0.30,\n",
|
| 599 |
+
" 2\n",
|
| 600 |
+
" )\n",
|
| 601 |
+
"\n",
|
| 602 |
+
"print(\"✅ Evaluation metrics ready\")"
|
| 603 |
+
]
|
| 604 |
+
},
|
| 605 |
+
{
|
| 606 |
+
"cell_type": "markdown",
|
| 607 |
+
"metadata": {},
|
| 608 |
+
"source": [
|
| 609 |
+
"## 7. Run Comprehensive Benchmark"
|
| 610 |
+
]
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"cell_type": "code",
|
| 614 |
+
"execution_count": 14,
|
| 615 |
+
"metadata": {},
|
| 616 |
+
"outputs": [
|
| 617 |
+
{
|
| 618 |
+
"name": "stdout",
|
| 619 |
+
"output_type": "stream",
|
| 620 |
+
"text": [
|
| 621 |
+
"Testing 11 configurations on 5 questions\n",
|
| 622 |
+
"Total API calls: ~55\n",
|
| 623 |
+
"This will take 15-30 minutes...\n",
|
| 624 |
+
"\n"
|
| 625 |
+
]
|
| 626 |
+
}
|
| 627 |
+
],
|
| 628 |
+
"source": [
|
| 629 |
+
"# Configuration: Select what to test\n",
|
| 630 |
+
"CONFIGS_TO_TEST = [\n",
|
| 631 |
+
" # Format: (embed_model, retrieval_strategy, llm_model, prompt_strategy)\n",
|
| 632 |
+
" \n",
|
| 633 |
+
" # Baseline (current setup)\n",
|
| 634 |
+
" ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'baseline'),\n",
|
| 635 |
+
" \n",
|
| 636 |
+
" # Test different embedding models\n",
|
| 637 |
+
" ('multilingual-e5-large', 'vanilla_k3', 'Llama-4-Maverick', 'baseline'),\n",
|
| 638 |
+
" \n",
|
| 639 |
+
" # Test different retrieval strategies\n",
|
| 640 |
+
" ('bge-large-en', 'vanilla_k5', 'Llama-4-Maverick', 'baseline'),\n",
|
| 641 |
+
" ('bge-large-en', 'mmr_balanced', 'Llama-4-Maverick', 'baseline'),\n",
|
| 642 |
+
" ('bge-large-en', 'reranked_k3', 'Llama-4-Maverick', 'baseline'),\n",
|
| 643 |
+
" \n",
|
| 644 |
+
" # Test different LLM models\n",
|
| 645 |
+
" ('bge-large-en', 'vanilla_k3', 'GPT-5-mini', 'baseline'),\n",
|
| 646 |
+
" ('bge-large-en', 'vanilla_k3', 'Claude-Sonnet-4.5', 'baseline'),\n",
|
| 647 |
+
" \n",
|
| 648 |
+
" # Test different prompting strategies\n",
|
| 649 |
+
" ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'citation_focused'),\n",
|
| 650 |
+
" ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'few_shot'),\n",
|
| 651 |
+
" \n",
|
| 652 |
+
" # Best combinations\n",
|
| 653 |
+
" ('bge-large-en', 'reranked_k3', 'GPT-5-mini', 'citation_focused'),\n",
|
| 654 |
+
" ('bge-large-en', 'mmr_balanced', 'Claude-Sonnet-4.5', 'citation_focused'),\n",
|
| 655 |
+
"]\n",
|
| 656 |
+
"\n",
|
| 657 |
+
"print(f\"Testing {len(CONFIGS_TO_TEST)} configurations on {len(questions)} questions\")\n",
|
| 658 |
+
"print(f\"Total API calls: ~{len(CONFIGS_TO_TEST) * len(questions)}\")\n",
|
| 659 |
+
"print(\"This will take 15-30 minutes...\\n\")"
|
| 660 |
+
]
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"cell_type": "code",
|
| 664 |
+
"execution_count": 15,
|
| 665 |
+
"metadata": {},
|
| 666 |
+
"outputs": [
|
| 667 |
+
{
|
| 668 |
+
"name": "stdout",
|
| 669 |
+
"output_type": "stream",
|
| 670 |
+
"text": [
|
| 671 |
+
"\n",
|
| 672 |
+
"====================================================================================================\n",
|
| 673 |
+
"Config 1/11: bge-large-en_vanilla_k3_Llama-4-Maverick_baseline\n",
|
| 674 |
+
"====================================================================================================\n",
|
| 675 |
+
"\n",
|
| 676 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 677 |
+
" Retrieved 3 docs\n",
|
| 678 |
+
" ✅ Generated in 3.01s\n",
|
| 679 |
+
"\n",
|
| 680 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 681 |
+
" Retrieved 3 docs\n",
|
| 682 |
+
" ✅ Generated in 2.38s\n",
|
| 683 |
+
"\n",
|
| 684 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 685 |
+
" Retrieved 3 docs\n",
|
| 686 |
+
" ✅ Generated in 2.45s\n",
|
| 687 |
+
"\n",
|
| 688 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 689 |
+
" Retrieved 3 docs\n",
|
| 690 |
+
" ✅ Generated in 3.52s\n",
|
| 691 |
+
"\n",
|
| 692 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 693 |
+
" Retrieved 3 docs\n",
|
| 694 |
+
" ✅ Generated in 1.14s\n",
|
| 695 |
+
"\n",
|
| 696 |
+
" 📊 Config Summary:\n",
|
| 697 |
+
" Avg LLM Judge Score: 43.53%\n",
|
| 698 |
+
" Avg Response Time: 2.50s\n",
|
| 699 |
+
"\n",
|
| 700 |
+
"====================================================================================================\n",
|
| 701 |
+
"Config 2/11: multilingual-e5-large_vanilla_k3_Llama-4-Maverick_baseline\n",
|
| 702 |
+
"====================================================================================================\n",
|
| 703 |
+
"\n",
|
| 704 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 705 |
+
" Retrieved 3 docs\n",
|
| 706 |
+
" ✅ Generated in 3.98s\n",
|
| 707 |
+
"\n",
|
| 708 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 709 |
+
" Retrieved 3 docs\n",
|
| 710 |
+
" ✅ Generated in 1.66s\n",
|
| 711 |
+
"\n",
|
| 712 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 713 |
+
" Retrieved 3 docs\n",
|
| 714 |
+
" ✅ Generated in 2.19s\n",
|
| 715 |
+
"\n",
|
| 716 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 717 |
+
" Retrieved 3 docs\n",
|
| 718 |
+
" ✅ Generated in 4.38s\n",
|
| 719 |
+
"\n",
|
| 720 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 721 |
+
" Retrieved 3 docs\n",
|
| 722 |
+
" ✅ Generated in 4.32s\n",
|
| 723 |
+
"\n",
|
| 724 |
+
" 📊 Config Summary:\n",
|
| 725 |
+
" Avg LLM Judge Score: 39.73%\n",
|
| 726 |
+
" Avg Response Time: 3.31s\n",
|
| 727 |
+
"\n",
|
| 728 |
+
"====================================================================================================\n",
|
| 729 |
+
"Config 3/11: bge-large-en_vanilla_k5_Llama-4-Maverick_baseline\n",
|
| 730 |
+
"====================================================================================================\n",
|
| 731 |
+
"\n",
|
| 732 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 733 |
+
" Retrieved 5 docs\n",
|
| 734 |
+
" ✅ Generated in 2.55s\n",
|
| 735 |
+
"\n",
|
| 736 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 737 |
+
" Retrieved 5 docs\n",
|
| 738 |
+
" ✅ Generated in 2.50s\n",
|
| 739 |
+
"\n",
|
| 740 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 741 |
+
" Retrieved 5 docs\n",
|
| 742 |
+
" ✅ Generated in 2.58s\n",
|
| 743 |
+
"\n",
|
| 744 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 745 |
+
" Retrieved 5 docs\n",
|
| 746 |
+
" ✅ Generated in 3.07s\n",
|
| 747 |
+
"\n",
|
| 748 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 749 |
+
" Retrieved 5 docs\n",
|
| 750 |
+
" ✅ Generated in 3.74s\n",
|
| 751 |
+
"\n",
|
| 752 |
+
" 📊 Config Summary:\n",
|
| 753 |
+
" Avg LLM Judge Score: 45.40%\n",
|
| 754 |
+
" Avg Response Time: 2.89s\n",
|
| 755 |
+
"\n",
|
| 756 |
+
"====================================================================================================\n",
|
| 757 |
+
"Config 4/11: bge-large-en_mmr_balanced_Llama-4-Maverick_baseline\n",
|
| 758 |
+
"====================================================================================================\n",
|
| 759 |
+
"\n",
|
| 760 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 761 |
+
" Retrieved 3 docs\n",
|
| 762 |
+
" ✅ Generated in 1.64s\n",
|
| 763 |
+
"\n",
|
| 764 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 765 |
+
" Retrieved 3 docs\n",
|
| 766 |
+
" ✅ Generated in 1.27s\n",
|
| 767 |
+
"\n",
|
| 768 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 769 |
+
" Retrieved 3 docs\n",
|
| 770 |
+
" ✅ Generated in 2.34s\n",
|
| 771 |
+
"\n",
|
| 772 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 773 |
+
" Retrieved 3 docs\n",
|
| 774 |
+
" ✅ Generated in 3.05s\n",
|
| 775 |
+
"\n",
|
| 776 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 777 |
+
" Retrieved 3 docs\n",
|
| 778 |
+
" ✅ Generated in 2.52s\n",
|
| 779 |
+
"\n",
|
| 780 |
+
" 📊 Config Summary:\n",
|
| 781 |
+
" Avg LLM Judge Score: 45.40%\n",
|
| 782 |
+
" Avg Response Time: 2.16s\n",
|
| 783 |
+
"\n",
|
| 784 |
+
"====================================================================================================\n",
|
| 785 |
+
"Config 5/11: bge-large-en_reranked_k3_Llama-4-Maverick_baseline\n",
|
| 786 |
+
"====================================================================================================\n",
|
| 787 |
+
"\n",
|
| 788 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 789 |
+
" Retrieved 3 docs\n",
|
| 790 |
+
" ✅ Generated in 2.26s\n",
|
| 791 |
+
"\n",
|
| 792 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 793 |
+
" Retrieved 3 docs\n",
|
| 794 |
+
" ✅ Generated in 3.12s\n",
|
| 795 |
+
"\n",
|
| 796 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 797 |
+
" Retrieved 3 docs\n",
|
| 798 |
+
" ✅ Generated in 2.83s\n",
|
| 799 |
+
"\n",
|
| 800 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 801 |
+
" Retrieved 3 docs\n",
|
| 802 |
+
" ✅ Generated in 3.93s\n",
|
| 803 |
+
"\n",
|
| 804 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 805 |
+
" Retrieved 3 docs\n",
|
| 806 |
+
" ✅ Generated in 3.24s\n",
|
| 807 |
+
"\n",
|
| 808 |
+
" 📊 Config Summary:\n",
|
| 809 |
+
" Avg LLM Judge Score: 44.47%\n",
|
| 810 |
+
" Avg Response Time: 3.08s\n",
|
| 811 |
+
"\n",
|
| 812 |
+
"====================================================================================================\n",
|
| 813 |
+
"Config 6/11: bge-large-en_vanilla_k3_GPT-5-mini_baseline\n",
|
| 814 |
+
"====================================================================================================\n",
|
| 815 |
+
"\n",
|
| 816 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 817 |
+
" Retrieved 3 docs\n",
|
| 818 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 819 |
+
"\n",
|
| 820 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 821 |
+
" Retrieved 3 docs\n",
|
| 822 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 823 |
+
"\n",
|
| 824 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 825 |
+
" Retrieved 3 docs\n",
|
| 826 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 827 |
+
"\n",
|
| 828 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 829 |
+
" Retrieved 3 docs\n",
|
| 830 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 831 |
+
"\n",
|
| 832 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 833 |
+
" Retrieved 3 docs\n",
|
| 834 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 835 |
+
"\n",
|
| 836 |
+
"====================================================================================================\n",
|
| 837 |
+
"Config 7/11: bge-large-en_vanilla_k3_Claude-Sonnet-4.5_baseline\n",
|
| 838 |
+
"====================================================================================================\n",
|
| 839 |
+
"\n",
|
| 840 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 841 |
+
" Retrieved 3 docs\n",
|
| 842 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 843 |
+
"\n",
|
| 844 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 845 |
+
" Retrieved 3 docs\n",
|
| 846 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 847 |
+
"\n",
|
| 848 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 849 |
+
" Retrieved 3 docs\n",
|
| 850 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 851 |
+
"\n",
|
| 852 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 853 |
+
" Retrieved 3 docs\n",
|
| 854 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 855 |
+
"\n",
|
| 856 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 857 |
+
" Retrieved 3 docs\n",
|
| 858 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 859 |
+
"\n",
|
| 860 |
+
"====================================================================================================\n",
|
| 861 |
+
"Config 8/11: bge-large-en_vanilla_k3_Llama-4-Maverick_citation_focused\n",
|
| 862 |
+
"====================================================================================================\n",
|
| 863 |
+
"\n",
|
| 864 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 865 |
+
" Retrieved 3 docs\n",
|
| 866 |
+
" ✅ Generated in 2.24s\n",
|
| 867 |
+
"\n",
|
| 868 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 869 |
+
" Retrieved 3 docs\n",
|
| 870 |
+
" ✅ Generated in 3.82s\n",
|
| 871 |
+
"\n",
|
| 872 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 873 |
+
" Retrieved 3 docs\n",
|
| 874 |
+
" ✅ Generated in 2.36s\n",
|
| 875 |
+
"\n",
|
| 876 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 877 |
+
" Retrieved 3 docs\n",
|
| 878 |
+
" ✅ Generated in 3.30s\n",
|
| 879 |
+
"\n",
|
| 880 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 881 |
+
" Retrieved 3 docs\n",
|
| 882 |
+
" ✅ Generated in 1.59s\n",
|
| 883 |
+
"\n",
|
| 884 |
+
" 📊 Config Summary:\n",
|
| 885 |
+
" Avg LLM Judge Score: 57.53%\n",
|
| 886 |
+
" Avg Response Time: 2.66s\n",
|
| 887 |
+
"\n",
|
| 888 |
+
"====================================================================================================\n",
|
| 889 |
+
"Config 9/11: bge-large-en_vanilla_k3_Llama-4-Maverick_few_shot\n",
|
| 890 |
+
"====================================================================================================\n",
|
| 891 |
+
"\n",
|
| 892 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 893 |
+
" Retrieved 3 docs\n",
|
| 894 |
+
" ✅ Generated in 0.87s\n",
|
| 895 |
+
"\n",
|
| 896 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 897 |
+
" Retrieved 3 docs\n",
|
| 898 |
+
" ✅ Generated in 1.51s\n",
|
| 899 |
+
"\n",
|
| 900 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 901 |
+
" Retrieved 3 docs\n",
|
| 902 |
+
" ✅ Generated in 1.96s\n",
|
| 903 |
+
"\n",
|
| 904 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 905 |
+
" Retrieved 3 docs\n",
|
| 906 |
+
" ✅ Generated in 2.77s\n",
|
| 907 |
+
"\n",
|
| 908 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 909 |
+
" Retrieved 3 docs\n",
|
| 910 |
+
" ✅ Generated in 1.08s\n",
|
| 911 |
+
"\n",
|
| 912 |
+
" 📊 Config Summary:\n",
|
| 913 |
+
" Avg LLM Judge Score: 61.51%\n",
|
| 914 |
+
" Avg Response Time: 1.64s\n",
|
| 915 |
+
"\n",
|
| 916 |
+
"====================================================================================================\n",
|
| 917 |
+
"Config 10/11: bge-large-en_reranked_k3_GPT-5-mini_citation_focused\n",
|
| 918 |
+
"====================================================================================================\n",
|
| 919 |
+
"\n",
|
| 920 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 921 |
+
" Retrieved 3 docs\n",
|
| 922 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 923 |
+
"\n",
|
| 924 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 925 |
+
" Retrieved 3 docs\n",
|
| 926 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 927 |
+
"\n",
|
| 928 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 929 |
+
" Retrieved 3 docs\n",
|
| 930 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 931 |
+
"\n",
|
| 932 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı K��r çökəkliyi (AKÇ) üçün geote...\n",
|
| 933 |
+
" Retrieved 3 docs\n",
|
| 934 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 935 |
+
"\n",
|
| 936 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 937 |
+
" Retrieved 3 docs\n",
|
| 938 |
+
" ❌ ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
|
| 939 |
+
"\n",
|
| 940 |
+
"====================================================================================================\n",
|
| 941 |
+
"Config 11/11: bge-large-en_mmr_balanced_Claude-Sonnet-4.5_citation_focused\n",
|
| 942 |
+
"====================================================================================================\n",
|
| 943 |
+
"\n",
|
| 944 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 945 |
+
" Retrieved 3 docs\n",
|
| 946 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 947 |
+
"\n",
|
| 948 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 949 |
+
" Retrieved 3 docs\n",
|
| 950 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 951 |
+
"\n",
|
| 952 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 953 |
+
" Retrieved 3 docs\n",
|
| 954 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 955 |
+
"\n",
|
| 956 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 957 |
+
" Retrieved 3 docs\n",
|
| 958 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 959 |
+
"\n",
|
| 960 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 961 |
+
" Retrieved 3 docs\n",
|
| 962 |
+
" ❌ ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
|
| 963 |
+
"\n",
|
| 964 |
+
"====================================================================================================\n",
|
| 965 |
+
"✅ Comprehensive benchmark complete!\n",
|
| 966 |
+
"====================================================================================================\n"
|
| 967 |
+
]
|
| 968 |
+
}
|
| 969 |
+
],
|
| 970 |
+
"source": [
|
| 971 |
+
"# Run benchmark\n",
|
| 972 |
+
"results = []\n",
|
| 973 |
+
"\n",
|
| 974 |
+
"for config_idx, (embed_key, retrieval_key, llm_key, prompt_key) in enumerate(CONFIGS_TO_TEST, 1):\n",
|
| 975 |
+
" config_name = f\"{embed_key}_{retrieval_key}_{llm_key}_{prompt_key}\"\n",
|
| 976 |
+
" \n",
|
| 977 |
+
" print(f\"\\n{'='*100}\")\n",
|
| 978 |
+
" print(f\"Config {config_idx}/{len(CONFIGS_TO_TEST)}: {config_name}\")\n",
|
| 979 |
+
" print(f\"{'='*100}\")\n",
|
| 980 |
+
" \n",
|
| 981 |
+
" # Get components\n",
|
| 982 |
+
" embed_model = embedding_cache[embed_key]\n",
|
| 983 |
+
" retrieval_func = RETRIEVAL_STRATEGIES[retrieval_key]['func']\n",
|
| 984 |
+
" retrieval_params = RETRIEVAL_STRATEGIES[retrieval_key]['params']\n",
|
| 985 |
+
" \n",
|
| 986 |
+
" config_results = []\n",
|
| 987 |
+
" \n",
|
| 988 |
+
" for example_key, messages in questions.items():\n",
|
| 989 |
+
" user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
|
| 990 |
+
" query = user_msg['content']\n",
|
| 991 |
+
" \n",
|
| 992 |
+
" print(f\"\\n {example_key}: {query[:60]}...\")\n",
|
| 993 |
+
" \n",
|
| 994 |
+
" # Retrieve documents\n",
|
| 995 |
+
" documents = retrieval_func(query, embed_model, **retrieval_params)\n",
|
| 996 |
+
" print(f\" Retrieved {len(documents)} docs\")\n",
|
| 997 |
+
" \n",
|
| 998 |
+
" # Generate answer\n",
|
| 999 |
+
" answer, response_time = generate_answer(llm_key, query, documents, prompt_key)\n",
|
| 1000 |
+
" \n",
|
| 1001 |
+
" if answer.startswith('ERROR'):\n",
|
| 1002 |
+
" print(f\" ❌ {answer}\")\n",
|
| 1003 |
+
" continue\n",
|
| 1004 |
+
" \n",
|
| 1005 |
+
" print(f\" ✅ Generated in {response_time:.2f}s\")\n",
|
| 1006 |
+
" \n",
|
| 1007 |
+
" # Evaluate\n",
|
| 1008 |
+
" expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
|
| 1009 |
+
" \n",
|
| 1010 |
+
" accuracy_metrics = calculate_answer_quality(expected, answer) if expected else {'Accuracy_Score': 0}\n",
|
| 1011 |
+
" citation_metrics = evaluate_citation_quality(answer, documents)\n",
|
| 1012 |
+
" retrieval_metrics = evaluate_retrieval_quality(query, documents, expected)\n",
|
| 1013 |
+
" completeness_metrics = evaluate_completeness(answer)\n",
|
| 1014 |
+
" \n",
|
| 1015 |
+
" # Calculate overall score\n",
|
| 1016 |
+
" llm_judge_score = calculate_llm_judge_score(\n",
|
| 1017 |
+
" accuracy_metrics['Accuracy_Score'],\n",
|
| 1018 |
+
" citation_metrics['Citation_Score'],\n",
|
| 1019 |
+
" completeness_metrics['Completeness_Score']\n",
|
| 1020 |
+
" )\n",
|
| 1021 |
+
" \n",
|
| 1022 |
+
" result = {\n",
|
| 1023 |
+
" 'Config': config_name,\n",
|
| 1024 |
+
" 'Embedding_Model': embed_key,\n",
|
| 1025 |
+
" 'Retrieval_Strategy': retrieval_key,\n",
|
| 1026 |
+
" 'LLM_Model': llm_key,\n",
|
| 1027 |
+
" 'Prompt_Strategy': prompt_key,\n",
|
| 1028 |
+
" 'Question': example_key,\n",
|
| 1029 |
+
" 'Query': query[:80],\n",
|
| 1030 |
+
" 'Num_Docs_Retrieved': len(documents),\n",
|
| 1031 |
+
" 'Response_Time': round(response_time, 2),\n",
|
| 1032 |
+
" 'LLM_Judge_Score': llm_judge_score,\n",
|
| 1033 |
+
" **accuracy_metrics,\n",
|
| 1034 |
+
" **citation_metrics,\n",
|
| 1035 |
+
" **retrieval_metrics,\n",
|
| 1036 |
+
" **completeness_metrics,\n",
|
| 1037 |
+
" 'Answer_Preview': answer[:150]\n",
|
| 1038 |
+
" }\n",
|
| 1039 |
+
" \n",
|
| 1040 |
+
" results.append(result)\n",
|
| 1041 |
+
" config_results.append(result)\n",
|
| 1042 |
+
" \n",
|
| 1043 |
+
" # Show config summary\n",
|
| 1044 |
+
" if config_results:\n",
|
| 1045 |
+
" avg_score = sum(r['LLM_Judge_Score'] for r in config_results) / len(config_results)\n",
|
| 1046 |
+
" avg_time = sum(r['Response_Time'] for r in config_results) / len(config_results)\n",
|
| 1047 |
+
" print(f\"\\n 📊 Config Summary:\")\n",
|
| 1048 |
+
" print(f\" Avg LLM Judge Score: {avg_score:.2f}%\")\n",
|
| 1049 |
+
" print(f\" Avg Response Time: {avg_time:.2f}s\")\n",
|
| 1050 |
+
"\n",
|
| 1051 |
+
"print(f\"\\n{'='*100}\")\n",
|
| 1052 |
+
"print(\"✅ Comprehensive benchmark complete!\")\n",
|
| 1053 |
+
"print(f\"{'='*100}\")"
|
| 1054 |
+
]
|
| 1055 |
+
},
|
| 1056 |
+
{
|
| 1057 |
+
"cell_type": "markdown",
|
| 1058 |
+
"metadata": {},
|
| 1059 |
+
"source": [
|
| 1060 |
+
"## 8. Analyze Results"
|
| 1061 |
+
]
|
| 1062 |
+
},
|
| 1063 |
+
{
|
| 1064 |
+
"cell_type": "code",
|
| 1065 |
+
"execution_count": 16,
|
| 1066 |
+
"metadata": {},
|
| 1067 |
+
"outputs": [
|
| 1068 |
+
{
|
| 1069 |
+
"name": "stdout",
|
| 1070 |
+
"output_type": "stream",
|
| 1071 |
+
"text": [
|
| 1072 |
+
"\n",
|
| 1073 |
+
"========================================================================================================================\n",
|
| 1074 |
+
"📊 CONFIGURATION RANKINGS (By LLM Judge Score)\n",
|
| 1075 |
+
"========================================================================================================================\n",
|
| 1076 |
+
" Embedding_Model Retrieval_Strategy LLM_Model Prompt_Strategy LLM_Judge_Score Accuracy_Score Citation_Score Response_Time\n",
|
| 1077 |
+
"Config \n",
|
| 1078 |
+
"bge-large-en_vanilla_k3_Llama-4-Maverick_few_shot bge-large-en vanilla_k3 Llama-4-Maverick few_shot 61.51 11.35 78.67 1.64\n",
|
| 1079 |
+
"bge-large-en_vanilla_k3_Llama-4-Maverick_citation_focused bge-large-en vanilla_k3 Llama-4-Maverick citation_focused 57.53 0.00 78.67 2.66\n",
|
| 1080 |
+
"bge-large-en_mmr_balanced_Llama-4-Maverick_baseline bge-large-en mmr_balanced Llama-4-Maverick baseline 45.40 0.00 44.00 2.16\n",
|
| 1081 |
+
"bge-large-en_vanilla_k5_Llama-4-Maverick_baseline bge-large-en vanilla_k5 Llama-4-Maverick baseline 45.40 0.00 44.00 2.89\n",
|
| 1082 |
+
"bge-large-en_reranked_k3_Llama-4-Maverick_baseline bge-large-en reranked_k3 Llama-4-Maverick baseline 44.47 0.00 41.33 3.08\n",
|
| 1083 |
+
"bge-large-en_vanilla_k3_Llama-4-Maverick_baseline bge-large-en vanilla_k3 Llama-4-Maverick baseline 43.53 0.00 38.67 2.50\n",
|
| 1084 |
+
"multilingual-e5-large_vanilla_k3_Llama-4-Maverick_baseline multilingual-e5-large vanilla_k3 Llama-4-Maverick baseline 39.73 0.00 28.00 3.31\n",
|
| 1085 |
+
"========================================================================================================================\n"
|
| 1086 |
+
]
|
| 1087 |
+
}
|
| 1088 |
+
],
|
| 1089 |
+
"source": [
|
| 1090 |
+
"# Create DataFrame\n",
|
| 1091 |
+
"df = pd.DataFrame(results)\n",
|
| 1092 |
+
"\n",
|
| 1093 |
+
"# Aggregate by configuration\n",
|
| 1094 |
+
"config_summary = df.groupby('Config').agg({\n",
|
| 1095 |
+
" 'LLM_Judge_Score': 'mean',\n",
|
| 1096 |
+
" 'Accuracy_Score': 'mean',\n",
|
| 1097 |
+
" 'Citation_Score': 'mean',\n",
|
| 1098 |
+
" 'Retrieval_Relevance': 'mean',\n",
|
| 1099 |
+
" 'Completeness_Score': 'mean',\n",
|
| 1100 |
+
" 'Response_Time': 'mean',\n",
|
| 1101 |
+
" 'Embedding_Model': 'first',\n",
|
| 1102 |
+
" 'Retrieval_Strategy': 'first',\n",
|
| 1103 |
+
" 'LLM_Model': 'first',\n",
|
| 1104 |
+
" 'Prompt_Strategy': 'first'\n",
|
| 1105 |
+
"}).round(2)\n",
|
| 1106 |
+
"\n",
|
| 1107 |
+
"# Sort by LLM Judge Score\n",
|
| 1108 |
+
"config_summary = config_summary.sort_values('LLM_Judge_Score', ascending=False)\n",
|
| 1109 |
+
"\n",
|
| 1110 |
+
"print(\"\\n\" + \"=\"*120)\n",
|
| 1111 |
+
"print(\"📊 CONFIGURATION RANKINGS (By LLM Judge Score)\")\n",
|
| 1112 |
+
"print(\"=\"*120)\n",
|
| 1113 |
+
"display_cols = ['Embedding_Model', 'Retrieval_Strategy', 'LLM_Model', 'Prompt_Strategy', \n",
|
| 1114 |
+
" 'LLM_Judge_Score', 'Accuracy_Score', 'Citation_Score', 'Response_Time']\n",
|
| 1115 |
+
"print(config_summary[display_cols].to_string())\n",
|
| 1116 |
+
"print(\"=\"*120)"
|
| 1117 |
+
]
|
| 1118 |
+
},
|
| 1119 |
+
{
|
| 1120 |
+
"cell_type": "markdown",
|
| 1121 |
+
"metadata": {},
|
| 1122 |
+
"source": [
|
| 1123 |
+
"## 9. Component Analysis"
|
| 1124 |
+
]
|
| 1125 |
+
},
|
| 1126 |
+
{
|
| 1127 |
+
"cell_type": "code",
|
| 1128 |
+
"execution_count": 17,
|
| 1129 |
+
"metadata": {},
|
| 1130 |
+
"outputs": [
|
| 1131 |
+
{
|
| 1132 |
+
"name": "stdout",
|
| 1133 |
+
"output_type": "stream",
|
| 1134 |
+
"text": [
|
| 1135 |
+
"\n",
|
| 1136 |
+
"====================================================================================================\n",
|
| 1137 |
+
"🔍 COMPONENT IMPACT ANALYSIS\n",
|
| 1138 |
+
"====================================================================================================\n",
|
| 1139 |
+
"\n",
|
| 1140 |
+
"📚 EMBEDDING MODELS:\n",
|
| 1141 |
+
" bge-large-en: 49.64%\n",
|
| 1142 |
+
" multilingual-e5-large: 39.73%\n",
|
| 1143 |
+
"\n",
|
| 1144 |
+
"🔎 RETRIEVAL STRATEGIES:\n",
|
| 1145 |
+
" vanilla_k3: 50.58% (Current setup)\n",
|
| 1146 |
+
" mmr_balanced: 45.40% (Balance diversity)\n",
|
| 1147 |
+
" vanilla_k5: 45.40% (More context)\n",
|
| 1148 |
+
" reranked_k3: 44.47% (Two-stage rerank)\n",
|
| 1149 |
+
"\n",
|
| 1150 |
+
"🤖 LLM MODELS:\n",
|
| 1151 |
+
" Llama-4-Maverick: 48.22%\n",
|
| 1152 |
+
"\n",
|
| 1153 |
+
"💬 PROMPTING STRATEGIES:\n",
|
| 1154 |
+
" few_shot: 61.51%\n",
|
| 1155 |
+
" citation_focused: 57.53%\n",
|
| 1156 |
+
" baseline: 43.71%\n",
|
| 1157 |
+
"\n",
|
| 1158 |
+
"====================================================================================================\n"
|
| 1159 |
+
]
|
| 1160 |
+
}
|
| 1161 |
+
],
|
| 1162 |
+
"source": [
|
| 1163 |
+
"# Analyze impact of each component\n",
|
| 1164 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 1165 |
+
"print(\"🔍 COMPONENT IMPACT ANALYSIS\")\n",
|
| 1166 |
+
"print(\"=\"*100)\n",
|
| 1167 |
+
"\n",
|
| 1168 |
+
"# 1. Embedding Models\n",
|
| 1169 |
+
"print(\"\\n📚 EMBEDDING MODELS:\")\n",
|
| 1170 |
+
"embed_impact = df.groupby('Embedding_Model')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
|
| 1171 |
+
"for model, score in embed_impact.items():\n",
|
| 1172 |
+
" print(f\" {model}: {score:.2f}%\")\n",
|
| 1173 |
+
"\n",
|
| 1174 |
+
"# 2. Retrieval Strategies\n",
|
| 1175 |
+
"print(\"\\n🔎 RETRIEVAL STRATEGIES:\")\n",
|
| 1176 |
+
"retrieval_impact = df.groupby('Retrieval_Strategy')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
|
| 1177 |
+
"for strategy, score in retrieval_impact.items():\n",
|
| 1178 |
+
" notes = RETRIEVAL_STRATEGIES[strategy]['notes']\n",
|
| 1179 |
+
" print(f\" {strategy}: {score:.2f}% ({notes})\")\n",
|
| 1180 |
+
"\n",
|
| 1181 |
+
"# 3. LLM Models\n",
|
| 1182 |
+
"print(\"\\n🤖 LLM MODELS:\")\n",
|
| 1183 |
+
"llm_impact = df.groupby('LLM_Model')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
|
| 1184 |
+
"for model, score in llm_impact.items():\n",
|
| 1185 |
+
" print(f\" {model}: {score:.2f}%\")\n",
|
| 1186 |
+
"\n",
|
| 1187 |
+
"# 4. Prompting Strategies\n",
|
| 1188 |
+
"print(\"\\n💬 PROMPTING STRATEGIES:\")\n",
|
| 1189 |
+
"prompt_impact = df.groupby('Prompt_Strategy')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
|
| 1190 |
+
"for strategy, score in prompt_impact.items():\n",
|
| 1191 |
+
" print(f\" {strategy}: {score:.2f}%\")\n",
|
| 1192 |
+
"\n",
|
| 1193 |
+
"print(\"\\n\" + \"=\"*100)"
|
| 1194 |
+
]
|
| 1195 |
+
},
|
| 1196 |
+
{
|
| 1197 |
+
"cell_type": "markdown",
|
| 1198 |
+
"metadata": {},
|
| 1199 |
+
"source": [
|
| 1200 |
+
"## 10. Visualizations"
|
| 1201 |
+
]
|
| 1202 |
+
},
|
| 1203 |
+
{
|
| 1204 |
+
"cell_type": "code",
|
| 1205 |
+
"execution_count": null,
|
| 1206 |
+
"metadata": {},
|
| 1207 |
+
"outputs": [],
|
| 1208 |
+
"source": "import os\nfrom pathlib import Path\n\n# Create output directory\noutput_dir = Path('output/rag_optimization_benchmark')\noutput_dir.mkdir(parents=True, exist_ok=True)\n\nfig, axes = plt.subplots(2, 3, figsize=(20, 12))\n\n# 1. Top Configurations\nax1 = axes[0, 0]\ntop_configs = config_summary.head(10)\nconfig_labels = [c.split('_')[-2] + '+' + c.split('_')[-1] for c in top_configs.index]\nax1.barh(config_labels, top_configs['LLM_Judge_Score'], color=sns.color_palette('viridis', len(top_configs)))\nax1.set_xlabel('LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax1.set_title('Top 10 Configurations', fontsize=13, fontweight='bold')\nax1.set_xlim(0, 100)\nfor i, score in enumerate(top_configs['LLM_Judge_Score']):\n ax1.text(score + 1, i, f'{score:.1f}', va='center', fontsize=10)\n\n# 2. Embedding Model Impact\nax2 = axes[0, 1]\nax2.bar(embed_impact.index, embed_impact.values, color='skyblue', alpha=0.8)\nax2.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax2.set_title('Embedding Model Impact', fontsize=13, fontweight='bold')\nax2.set_ylim(0, 100)\nax2.tick_params(axis='x', rotation=45)\nfor i, (model, score) in enumerate(embed_impact.items()):\n ax2.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 3. Retrieval Strategy Impact\nax3 = axes[0, 2]\nax3.bar(retrieval_impact.index, retrieval_impact.values, color='coral', alpha=0.8)\nax3.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax3.set_title('Retrieval Strategy Impact', fontsize=13, fontweight='bold')\nax3.set_ylim(0, 100)\nax3.tick_params(axis='x', rotation=45)\nfor i, (strategy, score) in enumerate(retrieval_impact.items()):\n ax3.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=9)\n\n# 4. 
LLM Model Impact\nax4 = axes[1, 0]\nax4.bar(llm_impact.index, llm_impact.values, color='mediumseagreen', alpha=0.8)\nax4.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax4.set_title('LLM Model Impact', fontsize=13, fontweight='bold')\nax4.set_ylim(0, 100)\nax4.tick_params(axis='x', rotation=45)\nfor i, (model, score) in enumerate(llm_impact.items()):\n ax4.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 5. Prompting Strategy Impact\nax5 = axes[1, 1]\nax5.bar(prompt_impact.index, prompt_impact.values, color='mediumpurple', alpha=0.8)\nax5.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax5.set_title('Prompting Strategy Impact', fontsize=13, fontweight='bold')\nax5.set_ylim(0, 100)\nax5.tick_params(axis='x', rotation=45)\nfor i, (strategy, score) in enumerate(prompt_impact.items()):\n ax5.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 6. Score Components (best config)\nax6 = axes[1, 2]\nbest_config = config_summary.iloc[0]\ncomponents = ['Accuracy', 'Citation', 'Completeness']\nscores = [best_config['Accuracy_Score'], best_config['Citation_Score'], best_config['Completeness_Score']]\ncolors_comp = ['#FF6B6B', '#4ECDC4', '#45B7D1']\nbars = ax6.bar(components, scores, color=colors_comp, alpha=0.8)\nax6.set_ylabel('Score (%)', fontsize=11, fontweight='bold')\nax6.set_title(f'Best Config Components\\n{best_config.name.split(\"_\")[2]}', fontsize=13, fontweight='bold')\nax6.set_ylim(0, 100)\nfor i, score in enumerate(scores):\n ax6.text(i, score + 2, f'{score:.1f}%', ha='center', fontsize=10, fontweight='bold')\n\nplt.tight_layout()\nplt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\nplt.show()\n\nprint(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
|
| 1209 |
+
},
|
| 1210 |
+
{
|
| 1211 |
+
"cell_type": "markdown",
|
| 1212 |
+
"metadata": {},
|
| 1213 |
+
"source": [
|
| 1214 |
+
"## 11. Final Recommendations"
|
| 1215 |
+
]
|
| 1216 |
+
},
|
| 1217 |
+
{
|
| 1218 |
+
"cell_type": "code",
|
| 1219 |
+
"execution_count": 19,
|
| 1220 |
+
"metadata": {},
|
| 1221 |
+
"outputs": [
|
| 1222 |
+
{
|
| 1223 |
+
"name": "stdout",
|
| 1224 |
+
"output_type": "stream",
|
| 1225 |
+
"text": [
|
| 1226 |
+
"\n",
|
| 1227 |
+
"====================================================================================================\n",
|
| 1228 |
+
"🏆 OPTIMAL RAG CONFIGURATION\n",
|
| 1229 |
+
"====================================================================================================\n",
|
| 1230 |
+
"\n",
|
| 1231 |
+
"✅ Best Configuration: bge-large-en_vanilla_k3_Llama-4-Maverick_few_shot\n",
|
| 1232 |
+
"\n",
|
| 1233 |
+
"📊 Performance:\n",
|
| 1234 |
+
" LLM Judge Score: 61.51%\n",
|
| 1235 |
+
" Accuracy: 11.35%\n",
|
| 1236 |
+
" Citation Quality: 78.67%\n",
|
| 1237 |
+
" Completeness: 100.00%\n",
|
| 1238 |
+
" Avg Response Time: 1.64s\n",
|
| 1239 |
+
"\n",
|
| 1240 |
+
"⚙️ Components:\n",
|
| 1241 |
+
" Embedding Model: bge-large-en\n",
|
| 1242 |
+
" → BAAI/bge-large-en-v1.5\n",
|
| 1243 |
+
" Retrieval Strategy: vanilla_k3\n",
|
| 1244 |
+
" → Current setup\n",
|
| 1245 |
+
" LLM Model: Llama-4-Maverick\n",
|
| 1246 |
+
" Prompting Strategy: few_shot\n",
|
| 1247 |
+
"\n",
|
| 1248 |
+
"💡 Key Findings:\n",
|
| 1249 |
+
" 1. Best Embedding: bge-large-en (49.64%)\n",
|
| 1250 |
+
" 2. Best Retrieval: vanilla_k3 (50.58%)\n",
|
| 1251 |
+
" 3. Best LLM: Llama-4-Maverick (48.22%)\n",
|
| 1252 |
+
" 4. Best Prompt: few_shot (61.51%)\n",
|
| 1253 |
+
"\n",
|
| 1254 |
+
"🎯 Hackathon Impact:\n",
|
| 1255 |
+
" LLM Quality = 30% of total score\n",
|
| 1256 |
+
" Your score: 61.51% × 30% = 18.45 points\n",
|
| 1257 |
+
"\n",
|
| 1258 |
+
"📈 Improvement vs Baseline:\n",
|
| 1259 |
+
" +24.51% quality improvement\n",
|
| 1260 |
+
" = +7.35 hackathon points\n",
|
| 1261 |
+
"\n",
|
| 1262 |
+
"====================================================================================================\n",
|
| 1263 |
+
"📝 IMPLEMENTATION CHECKLIST\n",
|
| 1264 |
+
"====================================================================================================\n",
|
| 1265 |
+
"\n",
|
| 1266 |
+
"1. Use embedding model: BAAI/bge-large-en-v1.5\n",
|
| 1267 |
+
"2. Implement retrieval: vanilla_k3\n",
|
| 1268 |
+
"3. Use LLM model: Llama-4-Maverick\n",
|
| 1269 |
+
"4. Apply prompt: few_shot\n",
|
| 1270 |
+
"\n",
|
| 1271 |
+
"5. Expected performance:\n",
|
| 1272 |
+
" - LLM Judge Score: 61.51%\n",
|
| 1273 |
+
" - Response time: ~1.6s\n",
|
| 1274 |
+
"====================================================================================================\n"
|
| 1275 |
+
]
|
| 1276 |
+
}
|
| 1277 |
+
],
|
| 1278 |
+
"source": [
|
| 1279 |
+
"best_config = config_summary.iloc[0]\n",
|
| 1280 |
+
"\n",
|
| 1281 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 1282 |
+
"print(\"🏆 OPTIMAL RAG CONFIGURATION\")\n",
|
| 1283 |
+
"print(\"=\"*100)\n",
|
| 1284 |
+
"\n",
|
| 1285 |
+
"print(f\"\\n✅ Best Configuration: {best_config.name}\")\n",
|
| 1286 |
+
"print(f\"\\n📊 Performance:\")\n",
|
| 1287 |
+
"print(f\" LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%\")\n",
|
| 1288 |
+
"print(f\" Accuracy: {best_config['Accuracy_Score']:.2f}%\")\n",
|
| 1289 |
+
"print(f\" Citation Quality: {best_config['Citation_Score']:.2f}%\")\n",
|
| 1290 |
+
"print(f\" Completeness: {best_config['Completeness_Score']:.2f}%\")\n",
|
| 1291 |
+
"print(f\" Avg Response Time: {best_config['Response_Time']:.2f}s\")\n",
|
| 1292 |
+
"\n",
|
| 1293 |
+
"print(f\"\\n⚙️ Components:\")\n",
|
| 1294 |
+
"print(f\" Embedding Model: {best_config['Embedding_Model']}\")\n",
|
| 1295 |
+
"print(f\" → {EMBEDDING_MODELS[best_config['Embedding_Model']]['name']}\")\n",
|
| 1296 |
+
"print(f\" Retrieval Strategy: {best_config['Retrieval_Strategy']}\")\n",
|
| 1297 |
+
"print(f\" → {RETRIEVAL_STRATEGIES[best_config['Retrieval_Strategy']]['notes']}\")\n",
|
| 1298 |
+
"print(f\" LLM Model: {best_config['LLM_Model']}\")\n",
|
| 1299 |
+
"print(f\" Prompting Strategy: {best_config['Prompt_Strategy']}\")\n",
|
| 1300 |
+
"\n",
|
| 1301 |
+
"print(f\"\\n💡 Key Findings:\")\n",
|
| 1302 |
+
"print(f\" 1. Best Embedding: {embed_impact.index[0]} ({embed_impact.values[0]:.2f}%)\")\n",
|
| 1303 |
+
"print(f\" 2. Best Retrieval: {retrieval_impact.index[0]} ({retrieval_impact.values[0]:.2f}%)\")\n",
|
| 1304 |
+
"print(f\" 3. Best LLM: {llm_impact.index[0]} ({llm_impact.values[0]:.2f}%)\")\n",
|
| 1305 |
+
"print(f\" 4. Best Prompt: {prompt_impact.index[0]} ({prompt_impact.values[0]:.2f}%)\")\n",
|
| 1306 |
+
"\n",
|
| 1307 |
+
"print(f\"\\n🎯 Hackathon Impact:\")\n",
|
| 1308 |
+
"print(f\" LLM Quality = 30% of total score\")\n",
|
| 1309 |
+
"print(f\" Your score: {best_config['LLM_Judge_Score']:.2f}% × 30% = {best_config['LLM_Judge_Score'] * 0.3:.2f} points\")\n",
|
| 1310 |
+
"\n",
|
| 1311 |
+
"baseline = df[df['Config'].str.contains('baseline')].iloc[0] if len(df[df['Config'].str.contains('baseline')]) > 0 else None\n",
|
| 1312 |
+
"if baseline is not None:\n",
|
| 1313 |
+
" improvement = best_config['LLM_Judge_Score'] - baseline['LLM_Judge_Score']\n",
|
| 1314 |
+
" print(f\"\\n📈 Improvement vs Baseline:\")\n",
|
| 1315 |
+
" print(f\" +{improvement:.2f}% quality improvement\")\n",
|
| 1316 |
+
" print(f\" = +{improvement * 0.3:.2f} hackathon points\")\n",
|
| 1317 |
+
"\n",
|
| 1318 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 1319 |
+
"print(\"📝 IMPLEMENTATION CHECKLIST\")\n",
|
| 1320 |
+
"print(\"=\"*100)\n",
|
| 1321 |
+
"print(f\"\\n1. Use embedding model: {EMBEDDING_MODELS[best_config['Embedding_Model']]['name']}\")\n",
|
| 1322 |
+
"print(f\"2. Implement retrieval: {best_config['Retrieval_Strategy']}\")\n",
|
| 1323 |
+
"print(f\"3. Use LLM model: {best_config['LLM_Model']}\")\n",
|
| 1324 |
+
"print(f\"4. Apply prompt: {best_config['Prompt_Strategy']}\")\n",
|
| 1325 |
+
"print(f\"\\n5. Expected performance:\")\n",
|
| 1326 |
+
"print(f\" - LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%\")\n",
|
| 1327 |
+
"print(f\" - Response time: ~{best_config['Response_Time']:.1f}s\")\n",
|
| 1328 |
+
"print(\"=\"*100)"
|
| 1329 |
+
]
|
| 1330 |
+
},
|
| 1331 |
+
{
|
| 1332 |
+
"cell_type": "markdown",
|
| 1333 |
+
"metadata": {},
|
| 1334 |
+
"source": [
|
| 1335 |
+
"## 12. Export Results"
|
| 1336 |
+
]
|
| 1337 |
+
},
|
| 1338 |
+
{
|
| 1339 |
+
"cell_type": "code",
|
| 1340 |
+
"execution_count": null,
|
| 1341 |
+
"metadata": {},
|
| 1342 |
+
"outputs": [],
|
| 1343 |
+
"source": "# Save results\nfrom pathlib import Path\n\noutput_dir = Path('output/rag_optimization_benchmark')\noutput_dir.mkdir(parents=True, exist_ok=True)\n\ndf.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\nconfig_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n\n# Save component impacts\nimpacts = pd.DataFrame({\n 'Embedding_Impact': embed_impact,\n 'Retrieval_Impact': retrieval_impact.reindex(embed_impact.index, fill_value=0),\n 'LLM_Impact': llm_impact.reindex(embed_impact.index, fill_value=0),\n 'Prompt_Impact': prompt_impact.reindex(embed_impact.index, fill_value=0)\n}).fillna(0)\nimpacts.to_csv(output_dir / 'component_impacts.csv', encoding='utf-8')\n\nprint(\"\\n✅ Results exported to output/rag_optimization_benchmark/:\")\nprint(\" - detailed_results.csv (all tests)\")\nprint(\" - summary.csv (config rankings)\")\nprint(\" - component_impacts.csv (component analysis)\")\nprint(\" - results.png (visualizations)\")"
|
| 1344 |
+
}
|
| 1345 |
+
],
|
| 1346 |
+
"metadata": {
|
| 1347 |
+
"kernelspec": {
|
| 1348 |
+
"display_name": "venv",
|
| 1349 |
+
"language": "python",
|
| 1350 |
+
"name": "python3"
|
| 1351 |
+
},
|
| 1352 |
+
"language_info": {
|
| 1353 |
+
"codemirror_mode": {
|
| 1354 |
+
"name": "ipython",
|
| 1355 |
+
"version": 3
|
| 1356 |
+
},
|
| 1357 |
+
"file_extension": ".py",
|
| 1358 |
+
"mimetype": "text/x-python",
|
| 1359 |
+
"name": "python",
|
| 1360 |
+
"nbconvert_exporter": "python",
|
| 1361 |
+
"pygments_lexer": "ipython3",
|
| 1362 |
+
"version": "3.10.12"
|
| 1363 |
+
}
|
| 1364 |
+
},
|
| 1365 |
+
"nbformat": 4,
|
| 1366 |
+
"nbformat_minor": 4
|
| 1367 |
+
}
|
notebooks/requirements_llm_benchmark.txt
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LLM Benchmarking Requirements
|
| 2 |
+
# Install with: pip install -r requirements_llm_benchmark.txt
|
| 3 |
+
|
| 4 |
+
# Azure OpenAI client
|
| 5 |
+
openai==1.54.0
|
| 6 |
+
|
| 7 |
+
# Vector Database
|
| 8 |
+
pinecone-client==5.0.0
|
| 9 |
+
|
| 10 |
+
# Embeddings
|
| 11 |
+
sentence-transformers==3.3.1
|
| 12 |
+
|
| 13 |
+
# Metrics
|
| 14 |
+
jiwer==3.0.3
|
| 15 |
+
|
| 16 |
+
# Data analysis and visualization
|
| 17 |
+
pandas==2.1.3
|
| 18 |
+
matplotlib==3.8.2
|
| 19 |
+
seaborn==0.13.0
|
| 20 |
+
|
| 21 |
+
# Utilities
|
| 22 |
+
python-dotenv==1.0.0
|
| 23 |
+
numpy==1.26.2
|
| 24 |
+
|
| 25 |
+
# Jupyter
|
| 26 |
+
jupyter==1.0.0
|
| 27 |
+
ipykernel==6.27.1
|
notebooks/requirements_rag_optimization.txt
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RAG Pipeline Optimization Requirements
|
| 2 |
+
# Install with: pip install -r requirements_rag_optimization.txt
|
| 3 |
+
|
| 4 |
+
# Azure OpenAI
|
| 5 |
+
openai==1.54.0
|
| 6 |
+
|
| 7 |
+
# Vector Database
|
| 8 |
+
pinecone-client==5.0.0
|
| 9 |
+
|
| 10 |
+
# Embeddings and Reranking
|
| 11 |
+
sentence-transformers==3.3.1
|
| 12 |
+
|
| 13 |
+
# Metrics
|
| 14 |
+
jiwer==3.0.3
|
| 15 |
+
|
| 16 |
+
# Data analysis and visualization
|
| 17 |
+
pandas==2.1.3
|
| 18 |
+
matplotlib==3.8.2
|
| 19 |
+
seaborn==0.13.0
|
| 20 |
+
numpy==1.26.2
|
| 21 |
+
|
| 22 |
+
# Utilities
|
| 23 |
+
python-dotenv==1.0.0
|
| 24 |
+
|
| 25 |
+
# Jupyter
|
| 26 |
+
jupyter==1.0.0
|
| 27 |
+
ipykernel==6.27.1
|
notebooks/requirements_vlm_ocr.txt
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VLM OCR Benchmarking Requirements
|
| 2 |
+
# Install with: pip install -r requirements_vlm_ocr.txt
|
| 3 |
+
|
| 4 |
+
# Azure OpenAI client (for vision models)
|
| 5 |
+
openai==1.54.0
|
| 6 |
+
|
| 7 |
+
# PDF processing
|
| 8 |
+
PyMuPDF==1.23.8
|
| 9 |
+
Pillow==10.1.0
|
| 10 |
+
|
| 11 |
+
# Metrics
|
| 12 |
+
jiwer==3.0.3
|
| 13 |
+
|
| 14 |
+
# Data analysis and visualization
|
| 15 |
+
pandas==2.1.3
|
| 16 |
+
matplotlib==3.8.2
|
| 17 |
+
seaborn==0.13.0
|
| 18 |
+
|
| 19 |
+
# Utilities
|
| 20 |
+
python-dotenv==1.0.0
|
| 21 |
+
|
| 22 |
+
# Jupyter
|
| 23 |
+
jupyter==1.0.0
|
| 24 |
+
ipykernel==6.27.1
|
notebooks/vlm_ocr_benchmark.ipynb
ADDED
|
@@ -0,0 +1,891 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Vision-Language Model (VLM) OCR Benchmarking\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"Testing **Vision-Language Models** for OCR on historical SOCAR documents.\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"## Why VLMs for OCR?\n",
|
| 12 |
+
"- **Better than traditional OCR** (Tesseract, EasyOCR, etc.)\n",
|
| 13 |
+
"- **Understands context** - can handle handwriting, layout, multi-language\n",
|
| 14 |
+
"- **Directly processes images** - no separate OCR step needed\n",
|
| 15 |
+
"- **State-of-the-art accuracy** on complex documents\n",
|
| 16 |
+
"\n",
|
| 17 |
+
"## Vision Models to Test:\n",
|
| 18 |
+
"1. **GPT-4.1** ⭐⭐⭐⭐⭐ (Excellent OCR capability)\n",
|
| 19 |
+
"2. **GPT-5, GPT-5-mini** ⭐⭐⭐⭐⭐ (Latest, best performance)\n",
|
| 20 |
+
"3. **Claude-Sonnet-4.5** ⭐⭐⭐⭐⭐ (Very good OCR)\n",
|
| 21 |
+
"4. **Phi-4-multimodal-instruct** ⭐⭐⭐⭐ (Explicitly multimodal)\n",
|
| 22 |
+
"5. **Llama-4-Maverick-17B** ⭐⭐⭐⭐ (May have vision support)\n",
|
| 23 |
+
    "6. **DeepSeek-VL** (Vision-Language specialized; not configured in this benchmark run)\n",
|
| 24 |
+
"\n",
|
| 25 |
+
"## Metrics:\n",
|
| 26 |
+
    "- **CER** (Character Error Rate) — lower is better\n",
|
| 27 |
+
    "- **CSR** (Character Success Rate) = 100 - CER — higher is better\n",
|
| 28 |
+
    "- **WER** (Word Error Rate) — lower is better\n",
|
| 29 |
+
    "- **WSR** (Word Success Rate) = 100 - WER — higher is better"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"cell_type": "code",
|
| 34 |
+
"execution_count": 15,
|
| 35 |
+
"metadata": {},
|
| 36 |
+
"outputs": [],
|
| 37 |
+
"source": [
|
| 38 |
+
"# Install required packages\n",
|
| 39 |
+
"# !pip install openai PyMuPDF Pillow jiwer pandas matplotlib seaborn python-dotenv"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"cell_type": "code",
|
| 44 |
+
"execution_count": 16,
|
| 45 |
+
"metadata": {},
|
| 46 |
+
"outputs": [
|
| 47 |
+
{
|
| 48 |
+
"name": "stdout",
|
| 49 |
+
"output_type": "stream",
|
| 50 |
+
"text": [
|
| 51 |
+
"✅ Libraries loaded\n"
|
| 52 |
+
]
|
| 53 |
+
}
|
| 54 |
+
],
|
| 55 |
+
"source": [
|
| 56 |
+
"import os\n",
|
| 57 |
+
"import base64\n",
|
| 58 |
+
"import re\n",
|
| 59 |
+
"import time\n",
|
| 60 |
+
"from pathlib import Path\n",
|
| 61 |
+
"from typing import Dict, List, Tuple\n",
|
| 62 |
+
"from io import BytesIO\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"from dotenv import load_dotenv\n",
|
| 65 |
+
"import fitz # PyMuPDF\n",
|
| 66 |
+
"from PIL import Image\n",
|
| 67 |
+
"import pandas as pd\n",
|
| 68 |
+
"import matplotlib.pyplot as plt\n",
|
| 69 |
+
"import seaborn as sns\n",
|
| 70 |
+
"from jiwer import wer, cer\n",
|
| 71 |
+
"from openai import AzureOpenAI\n",
|
| 72 |
+
"\n",
|
| 73 |
+
"# Load environment\n",
|
| 74 |
+
"load_dotenv()\n",
|
| 75 |
+
"\n",
|
| 76 |
+
"# Set style\n",
|
| 77 |
+
"sns.set_style('whitegrid')\n",
|
| 78 |
+
"plt.rcParams['figure.figsize'] = (14, 8)\n",
|
| 79 |
+
"\n",
|
| 80 |
+
"print(\"✅ Libraries loaded\")"
|
| 81 |
+
]
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"cell_type": "markdown",
|
| 85 |
+
"metadata": {},
|
| 86 |
+
"source": [
|
| 87 |
+
"## 1. Load Ground Truth"
|
| 88 |
+
]
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"cell_type": "code",
|
| 92 |
+
"execution_count": 17,
|
| 93 |
+
"metadata": {},
|
| 94 |
+
"outputs": [
|
| 95 |
+
{
|
| 96 |
+
"name": "stdout",
|
| 97 |
+
"output_type": "stream",
|
| 98 |
+
"text": [
|
| 99 |
+
"✅ Ground truth loaded: 22386 characters\n",
|
| 100 |
+
"Preview:\n",
|
| 101 |
+
"XÜLASƏ\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"Bu tədqiqat Aşağı Kür çökəkliyi (AKÇ) və Bakı arxipelaqı (BA) daxil olmaqla Cənubi Xəzər çökəkliyi sistemində faydalı qazıntıların mənşəyinin paleotektonik, paleocoğrafi şərait və geodinamik rejimlə necə əlaqələndiyini, eləcə də Gec Miosendən etibarən Ərəbistan plitəsinin təsiri ilə formalaş...\n"
|
| 104 |
+
]
|
| 105 |
+
}
|
| 106 |
+
],
|
| 107 |
+
"source": [
|
| 108 |
+
"def load_ground_truth(md_path: str) -> str:\n",
|
| 109 |
+
" \"\"\"\n",
|
| 110 |
+
" Load ground truth text from markdown file.\n",
|
| 111 |
+
" Removes markdown formatting for pure text comparison.\n",
|
| 112 |
+
" \"\"\"\n",
|
| 113 |
+
" with open(md_path, 'r', encoding='utf-8') as f:\n",
|
| 114 |
+
" text = f.read()\n",
|
| 115 |
+
" \n",
|
| 116 |
+
" # Remove markdown elements\n",
|
| 117 |
+
" text = re.sub(r'^#+\\s+', '', text, flags=re.MULTILINE) # Headers\n",
|
| 118 |
+
" text = re.sub(r'\\*\\*(.+?)\\*\\*', r'\\1', text) # Bold\n",
|
| 119 |
+
" text = re.sub(r'\\*(.+?)\\*', r'\\1', text) # Italic\n",
|
| 120 |
+
" text = re.sub(r'---+', '', text) # Horizontal rules\n",
|
| 121 |
+
" text = re.sub(r'\\n\\s*\\n+', '\\n\\n', text) # Normalize newlines\n",
|
| 122 |
+
" \n",
|
| 123 |
+
" return text.strip()\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"# Load ground truth\n",
|
| 126 |
+
"ground_truth = load_ground_truth('data/document_00.md')\n",
|
| 127 |
+
"print(f\"✅ Ground truth loaded: {len(ground_truth)} characters\")\n",
|
| 128 |
+
"print(f\"Preview:\\n{ground_truth[:300]}...\")"
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"cell_type": "markdown",
|
| 133 |
+
"metadata": {},
|
| 134 |
+
"source": [
|
| 135 |
+
"## 2. PDF to Image Conversion"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"cell_type": "code",
|
| 140 |
+
"execution_count": 18,
|
| 141 |
+
"metadata": {},
|
| 142 |
+
"outputs": [
|
| 143 |
+
{
|
| 144 |
+
"name": "stdout",
|
| 145 |
+
"output_type": "stream",
|
| 146 |
+
"text": [
|
| 147 |
+
"\n",
|
| 148 |
+
"✅ Converted PDF to 12 images\n",
|
| 149 |
+
"First image size: (3072, 4096)\n"
|
| 150 |
+
]
|
| 151 |
+
}
|
| 152 |
+
],
|
| 153 |
+
"source": [
|
| 154 |
+
"def pdf_to_images(pdf_path: str, dpi: int = 150) -> List[Image.Image]:\n",
|
| 155 |
+
" \"\"\"\n",
|
| 156 |
+
" Convert PDF pages to PIL Images.\n",
|
| 157 |
+
" Higher DPI = better quality for VLMs.\n",
|
| 158 |
+
" \"\"\"\n",
|
| 159 |
+
" doc = fitz.open(pdf_path)\n",
|
| 160 |
+
" images = []\n",
|
| 161 |
+
" \n",
|
| 162 |
+
" for page_num in range(len(doc)):\n",
|
| 163 |
+
" page = doc[page_num]\n",
|
| 164 |
+
" # Render at higher resolution\n",
|
| 165 |
+
" zoom = dpi / 72 # 72 DPI is default\n",
|
| 166 |
+
" mat = fitz.Matrix(zoom, zoom)\n",
|
| 167 |
+
" pix = page.get_pixmap(matrix=mat)\n",
|
| 168 |
+
" \n",
|
| 169 |
+
" # Convert to PIL Image\n",
|
| 170 |
+
" img = Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n",
|
| 171 |
+
" images.append(img)\n",
|
| 172 |
+
" \n",
|
| 173 |
+
" doc.close()\n",
|
| 174 |
+
" return images\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"def image_to_base64(image: Image.Image, format: str = 'PNG') -> str:\n",
|
| 177 |
+
" \"\"\"\n",
|
| 178 |
+
" Convert PIL Image to base64 string for API.\n",
|
| 179 |
+
" \"\"\"\n",
|
| 180 |
+
" buffered = BytesIO()\n",
|
| 181 |
+
" image.save(buffered, format=format)\n",
|
| 182 |
+
" return base64.b64encode(buffered.getvalue()).decode('utf-8')\n",
|
| 183 |
+
"\n",
|
| 184 |
+
"# Test conversion\n",
|
| 185 |
+
"pdf_path = 'data/pdfs/document_00.pdf'\n",
|
| 186 |
+
"test_images = pdf_to_images(pdf_path)\n",
|
| 187 |
+
"print(f\"\\n✅ Converted PDF to {len(test_images)} images\")\n",
|
| 188 |
+
"print(f\"First image size: {test_images[0].size}\")"
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"cell_type": "markdown",
|
| 193 |
+
"metadata": {},
|
| 194 |
+
"source": [
|
| 195 |
+
"## 3. Vision-Language Model Client"
|
| 196 |
+
]
|
| 197 |
+
},
|
| 198 |
+
{
|
| 199 |
+
"cell_type": "code",
|
| 200 |
+
"execution_count": 19,
|
| 201 |
+
"metadata": {},
|
| 202 |
+
"outputs": [
|
| 203 |
+
{
|
| 204 |
+
"name": "stdout",
|
| 205 |
+
"output_type": "stream",
|
| 206 |
+
"text": [
|
| 207 |
+
"✅ Configured 6 vision models\n"
|
| 208 |
+
]
|
| 209 |
+
}
|
| 210 |
+
],
|
| 211 |
+
"source": [
|
| 212 |
+
"# Initialize Azure OpenAI client\n",
|
| 213 |
+
"azure_client = AzureOpenAI(\n",
|
| 214 |
+
" api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
|
| 215 |
+
" api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
|
| 216 |
+
" azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
|
| 217 |
+
")\n",
|
| 218 |
+
"\n",
|
| 219 |
+
"# Vision model configurations\n",
|
| 220 |
+
"VLM_MODELS = {\n",
|
| 221 |
+
" 'GPT-4.1': {\n",
|
| 222 |
+
" 'deployment': 'gpt-4.1',\n",
|
| 223 |
+
" 'supports_vision': True,\n",
|
| 224 |
+
" 'rating': '⭐⭐⭐⭐⭐',\n",
|
| 225 |
+
" 'notes': 'Excellent OCR'\n",
|
| 226 |
+
" },\n",
|
| 227 |
+
" 'GPT-5': {\n",
|
| 228 |
+
" 'deployment': 'gpt-5',\n",
|
| 229 |
+
" 'supports_vision': True,\n",
|
| 230 |
+
" 'rating': '⭐⭐⭐⭐⭐',\n",
|
| 231 |
+
" 'notes': 'Latest model'\n",
|
| 232 |
+
" },\n",
|
| 233 |
+
" 'GPT-5-mini': {\n",
|
| 234 |
+
" 'deployment': 'gpt-5-mini',\n",
|
| 235 |
+
" 'supports_vision': True,\n",
|
| 236 |
+
" 'rating': '⭐⭐⭐⭐⭐',\n",
|
| 237 |
+
" 'notes': 'Fast + excellent'\n",
|
| 238 |
+
" },\n",
|
| 239 |
+
" 'Claude-Sonnet-4.5': {\n",
|
| 240 |
+
" 'deployment': 'claude-sonnet-4-5',\n",
|
| 241 |
+
" 'supports_vision': True,\n",
|
| 242 |
+
" 'rating': '⭐⭐⭐⭐⭐',\n",
|
| 243 |
+
" 'notes': 'Very good OCR'\n",
|
| 244 |
+
" },\n",
|
| 245 |
+
" 'Phi-4-multimodal': {\n",
|
| 246 |
+
" 'deployment': 'Phi-4-multimodal-instruct',\n",
|
| 247 |
+
" 'supports_vision': True,\n",
|
| 248 |
+
" 'rating': '⭐⭐⭐⭐',\n",
|
| 249 |
+
" 'notes': 'Explicitly multimodal'\n",
|
| 250 |
+
" },\n",
|
| 251 |
+
" 'Llama-4-Maverick-17B': {\n",
|
| 252 |
+
" 'deployment': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
|
| 253 |
+
" 'supports_vision': True,\n",
|
| 254 |
+
" 'rating': '⭐⭐⭐⭐',\n",
|
| 255 |
+
" 'notes': 'Testing vision capability'\n",
|
| 256 |
+
" }\n",
|
| 257 |
+
"}\n",
|
| 258 |
+
"\n",
|
| 259 |
+
"print(f\"✅ Configured {len(VLM_MODELS)} vision models\")"
|
| 260 |
+
]
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"cell_type": "code",
|
| 264 |
+
"execution_count": 20,
|
| 265 |
+
"metadata": {},
|
| 266 |
+
"outputs": [
|
| 267 |
+
{
|
| 268 |
+
"name": "stdout",
|
| 269 |
+
"output_type": "stream",
|
| 270 |
+
"text": [
|
| 271 |
+
"✅ VLM extraction function ready\n"
|
| 272 |
+
]
|
| 273 |
+
}
|
| 274 |
+
],
|
| 275 |
+
"source": [
|
| 276 |
+
"def vlm_extract_text(model_name: str, images: List[Image.Image], \n",
|
| 277 |
+
" temperature: float = 0.0) -> Tuple[str, float]:\n",
|
| 278 |
+
" \"\"\"\n",
|
| 279 |
+
" Extract text from images using Vision-Language Model.\n",
|
| 280 |
+
" Returns: (extracted_text, response_time)\n",
|
| 281 |
+
" \"\"\"\n",
|
| 282 |
+
" deployment = VLM_MODELS[model_name]['deployment']\n",
|
| 283 |
+
" \n",
|
| 284 |
+
" # OCR prompt - optimized for accuracy\n",
|
| 285 |
+
" system_prompt = \"\"\"You are an expert OCR system for historical oil & gas documents.\n",
|
| 286 |
+
"\n",
|
| 287 |
+
"Extract ALL text from the image with 100% accuracy. Follow these rules:\n",
|
| 288 |
+
"1. Preserve EXACT spelling - including Azerbaijani, Russian, and English text\n",
|
| 289 |
+
"2. Maintain original Cyrillic characters - DO NOT transliterate\n",
|
| 290 |
+
"3. Keep all numbers, symbols, and special characters exactly as shown\n",
|
| 291 |
+
"4. Preserve layout structure (paragraphs, line breaks)\n",
|
| 292 |
+
"5. Include ALL text - headers, body, footnotes, tables, captions\n",
|
| 293 |
+
"6. If text is unclear, make best effort but stay accurate\n",
|
| 294 |
+
"\n",
|
| 295 |
+
"Output ONLY the extracted text. No explanations, no descriptions.\"\"\"\n",
|
| 296 |
+
" \n",
|
| 297 |
+
" try:\n",
|
| 298 |
+
" all_text = []\n",
|
| 299 |
+
" total_time = 0\n",
|
| 300 |
+
" \n",
|
| 301 |
+
" for page_num, image in enumerate(images, 1):\n",
|
| 302 |
+
" # Convert image to base64\n",
|
| 303 |
+
" image_base64 = image_to_base64(image)\n",
|
| 304 |
+
" \n",
|
| 305 |
+
" # Prepare messages with image\n",
|
| 306 |
+
" messages = [\n",
|
| 307 |
+
" {\n",
|
| 308 |
+
" \"role\": \"system\",\n",
|
| 309 |
+
" \"content\": system_prompt\n",
|
| 310 |
+
" },\n",
|
| 311 |
+
" {\n",
|
| 312 |
+
" \"role\": \"user\",\n",
|
| 313 |
+
" \"content\": [\n",
|
| 314 |
+
" {\n",
|
| 315 |
+
" \"type\": \"text\",\n",
|
| 316 |
+
" \"text\": f\"Extract all text from page {page_num}:\"\n",
|
| 317 |
+
" },\n",
|
| 318 |
+
" {\n",
|
| 319 |
+
" \"type\": \"image_url\",\n",
|
| 320 |
+
" \"image_url\": {\n",
|
| 321 |
+
" \"url\": f\"data:image/png;base64,{image_base64}\"\n",
|
| 322 |
+
" }\n",
|
| 323 |
+
" }\n",
|
| 324 |
+
" ]\n",
|
| 325 |
+
" }\n",
|
| 326 |
+
" ]\n",
|
| 327 |
+
" \n",
|
| 328 |
+
" # Call VLM with appropriate token parameter\n",
|
| 329 |
+
" start_time = time.time()\n",
|
| 330 |
+
" \n",
|
| 331 |
+
" # GPT-5 models use max_completion_tokens, others use max_tokens\n",
|
| 332 |
+
" if deployment.startswith('gpt-5'):\n",
|
| 333 |
+
" response = azure_client.chat.completions.create(\n",
|
| 334 |
+
" model=deployment,\n",
|
| 335 |
+
" messages=messages,\n",
|
| 336 |
+
" temperature=temperature,\n",
|
| 337 |
+
" max_completion_tokens=4000\n",
|
| 338 |
+
" )\n",
|
| 339 |
+
" else:\n",
|
| 340 |
+
" response = azure_client.chat.completions.create(\n",
|
| 341 |
+
" model=deployment,\n",
|
| 342 |
+
" messages=messages,\n",
|
| 343 |
+
" temperature=temperature,\n",
|
| 344 |
+
" max_tokens=4000\n",
|
| 345 |
+
" )\n",
|
| 346 |
+
" \n",
|
| 347 |
+
" elapsed = time.time() - start_time\n",
|
| 348 |
+
" total_time += elapsed\n",
|
| 349 |
+
" \n",
|
| 350 |
+
" # Extract text\n",
|
| 351 |
+
" page_text = response.choices[0].message.content\n",
|
| 352 |
+
" all_text.append(page_text)\n",
|
| 353 |
+
" \n",
|
| 354 |
+
" print(f\" Page {page_num}/{len(images)}: {elapsed:.1f}s\")\n",
|
| 355 |
+
" \n",
|
| 356 |
+
" # Combine all pages\n",
|
| 357 |
+
" full_text = '\\n\\n'.join(all_text)\n",
|
| 358 |
+
" return full_text, total_time\n",
|
| 359 |
+
" \n",
|
| 360 |
+
" except Exception as e:\n",
|
| 361 |
+
" return f\"ERROR: {str(e)}\", 0.0\n",
|
| 362 |
+
"\n",
|
| 363 |
+
"print(\"✅ VLM extraction function ready\")"
|
| 364 |
+
]
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"cell_type": "markdown",
|
| 368 |
+
"metadata": {},
|
| 369 |
+
"source": [
|
| 370 |
+
"## 4. Metrics Calculation"
|
| 371 |
+
]
|
| 372 |
+
},
|
| 373 |
+
{
|
| 374 |
+
"cell_type": "code",
|
| 375 |
+
"execution_count": 21,
|
| 376 |
+
"metadata": {},
|
| 377 |
+
"outputs": [
|
| 378 |
+
{
|
| 379 |
+
"name": "stdout",
|
| 380 |
+
"output_type": "stream",
|
| 381 |
+
"text": [
|
| 382 |
+
"✅ Metrics functions ready\n"
|
| 383 |
+
]
|
| 384 |
+
}
|
| 385 |
+
],
|
| 386 |
+
"source": [
|
| 387 |
+
"def normalize_text(text: str) -> str:\n",
|
| 388 |
+
" \"\"\"Normalize text for comparison.\"\"\"\n",
|
| 389 |
+
" text = text.lower().strip()\n",
|
| 390 |
+
" text = re.sub(r'\\s+', ' ', text)\n",
|
| 391 |
+
" return text\n",
|
| 392 |
+
"\n",
|
| 393 |
+
"def calculate_ocr_metrics(reference: str, hypothesis: str) -> Dict[str, float]:\n",
|
| 394 |
+
" \"\"\"\n",
|
| 395 |
+
" Calculate comprehensive OCR metrics.\n",
|
| 396 |
+
" \"\"\"\n",
|
| 397 |
+
" ref_norm = normalize_text(reference)\n",
|
| 398 |
+
" hyp_norm = normalize_text(hypothesis)\n",
|
| 399 |
+
" \n",
|
| 400 |
+
" # Character Error Rate\n",
|
| 401 |
+
" cer_score = cer(ref_norm, hyp_norm) * 100\n",
|
| 402 |
+
" \n",
|
| 403 |
+
" # Word Error Rate\n",
|
| 404 |
+
" wer_score = wer(ref_norm, hyp_norm) * 100\n",
|
| 405 |
+
" \n",
|
| 406 |
+
" # Success rates\n",
|
| 407 |
+
" csr_score = max(0, 100 - cer_score)\n",
|
| 408 |
+
" wsr_score = max(0, 100 - wer_score)\n",
|
| 409 |
+
" \n",
|
| 410 |
+
" # Length metrics\n",
|
| 411 |
+
" ref_chars = len(ref_norm)\n",
|
| 412 |
+
" hyp_chars = len(hyp_norm)\n",
|
| 413 |
+
" ref_words = len(ref_norm.split())\n",
|
| 414 |
+
" hyp_words = len(hyp_norm.split())\n",
|
| 415 |
+
" \n",
|
| 416 |
+
" char_length_acc = (min(ref_chars, hyp_chars) / max(ref_chars, hyp_chars) * 100) if max(ref_chars, hyp_chars) > 0 else 0\n",
|
| 417 |
+
" word_length_acc = (min(ref_words, hyp_words) / max(ref_words, hyp_words) * 100) if max(ref_words, hyp_words) > 0 else 0\n",
|
| 418 |
+
" \n",
|
| 419 |
+
" return {\n",
|
| 420 |
+
" 'CER': round(cer_score, 2),\n",
|
| 421 |
+
" 'WER': round(wer_score, 2),\n",
|
| 422 |
+
" 'CSR': round(csr_score, 2),\n",
|
| 423 |
+
" 'WSR': round(wsr_score, 2),\n",
|
| 424 |
+
" 'Char_Count_Ref': ref_chars,\n",
|
| 425 |
+
" 'Char_Count_Hyp': hyp_chars,\n",
|
| 426 |
+
" 'Word_Count_Ref': ref_words,\n",
|
| 427 |
+
" 'Word_Count_Hyp': hyp_words,\n",
|
| 428 |
+
" 'Char_Length_Accuracy': round(char_length_acc, 2),\n",
|
| 429 |
+
" 'Word_Length_Accuracy': round(word_length_acc, 2)\n",
|
| 430 |
+
" }\n",
|
| 431 |
+
"\n",
|
| 432 |
+
"print(\"✅ Metrics functions ready\")"
|
| 433 |
+
]
|
| 434 |
+
},
|
| 435 |
+
{
|
| 436 |
+
"cell_type": "markdown",
|
| 437 |
+
"metadata": {},
|
| 438 |
+
"source": [
|
| 439 |
+
"## 5. Run Benchmark on All VLMs"
|
| 440 |
+
]
|
| 441 |
+
},
|
| 442 |
+
{
|
| 443 |
+
"cell_type": "code",
|
| 444 |
+
"execution_count": 22,
|
| 445 |
+
"metadata": {},
|
| 446 |
+
"outputs": [
|
| 447 |
+
{
|
| 448 |
+
"name": "stdout",
|
| 449 |
+
"output_type": "stream",
|
| 450 |
+
"text": [
|
| 451 |
+
"Testing 6 vision models...\n",
|
| 452 |
+
"This will take several minutes...\n",
|
| 453 |
+
"\n"
|
| 454 |
+
]
|
| 455 |
+
}
|
| 456 |
+
],
|
| 457 |
+
"source": [
|
| 458 |
+
"# Select models to test\n",
|
| 459 |
+
"MODELS_TO_TEST = [\n",
|
| 460 |
+
" 'GPT-4.1',\n",
|
| 461 |
+
" 'GPT-5',\n",
|
| 462 |
+
" 'GPT-5-mini',\n",
|
| 463 |
+
" 'Claude-Sonnet-4.5',\n",
|
| 464 |
+
" 'Phi-4-multimodal',\n",
|
| 465 |
+
" 'Llama-4-Maverick-17B', # Added for comparison\n",
|
| 466 |
+
"]\n",
|
| 467 |
+
"\n",
|
| 468 |
+
"print(f\"Testing {len(MODELS_TO_TEST)} vision models...\")\n",
|
| 469 |
+
"print(\"This will take several minutes...\\n\")"
|
| 470 |
+
]
|
| 471 |
+
},
|
| 472 |
+
{
|
| 473 |
+
"cell_type": "code",
|
| 474 |
+
"execution_count": null,
|
| 475 |
+
"metadata": {},
|
| 476 |
+
"outputs": [
|
| 477 |
+
{
|
| 478 |
+
"name": "stdout",
|
| 479 |
+
"output_type": "stream",
|
| 480 |
+
"text": [
|
| 481 |
+
"Prepared 12 page images\n",
|
| 482 |
+
"\n",
|
| 483 |
+
"\n",
|
| 484 |
+
"================================================================================\n",
|
| 485 |
+
"Testing: GPT-4.1 ⭐⭐⭐⭐⭐\n",
|
| 486 |
+
"Notes: Excellent OCR\n",
|
| 487 |
+
"================================================================================\n",
|
| 488 |
+
" Page 1/12: 9.2s\n",
|
| 489 |
+
" Page 2/12: 9.5s\n",
|
| 490 |
+
" Page 3/12: 10.9s\n",
|
| 491 |
+
" Page 4/12: 10.7s\n"
|
| 492 |
+
]
|
| 493 |
+
}
|
| 494 |
+
],
|
| 495 |
+
"source": [
|
| 496 |
+
"# Prepare PDF images\n",
|
| 497 |
+
"images = pdf_to_images(pdf_path, dpi=150)\n",
|
| 498 |
+
"print(f\"Prepared {len(images)} page images\\n\")\n",
|
| 499 |
+
"\n",
|
| 500 |
+
"# Run benchmark\n",
|
| 501 |
+
"results = []\n",
|
| 502 |
+
"\n",
|
| 503 |
+
"for model_name in MODELS_TO_TEST:\n",
|
| 504 |
+
" print(f\"\\n{'='*80}\")\n",
|
| 505 |
+
" print(f\"Testing: {model_name} {VLM_MODELS[model_name]['rating']}\")\n",
|
| 506 |
+
" print(f\"Notes: {VLM_MODELS[model_name]['notes']}\")\n",
|
| 507 |
+
" print(f\"{'='*80}\")\n",
|
| 508 |
+
" \n",
|
| 509 |
+
" # Extract text\n",
|
| 510 |
+
" extracted_text, response_time = vlm_extract_text(model_name, images)\n",
|
| 511 |
+
" \n",
|
| 512 |
+
" if extracted_text.startswith('ERROR'):\n",
|
| 513 |
+
" print(f\"❌ Failed: {extracted_text}\")\n",
|
| 514 |
+
" continue\n",
|
| 515 |
+
" \n",
|
| 516 |
+
" print(f\"\\n✅ Total time: {response_time:.2f}s\")\n",
|
| 517 |
+
" print(f\"✅ Extracted: {len(extracted_text)} characters\")\n",
|
| 518 |
+
" \n",
|
| 519 |
+
" # Calculate metrics\n",
|
| 520 |
+
" metrics = calculate_ocr_metrics(ground_truth, extracted_text)\n",
|
| 521 |
+
" \n",
|
| 522 |
+
" # Store result\n",
|
| 523 |
+
" result = {\n",
|
| 524 |
+
" 'Model': model_name,\n",
|
| 525 |
+
" 'Response_Time': round(response_time, 2),\n",
|
| 526 |
+
" **metrics,\n",
|
| 527 |
+
" 'Rating': VLM_MODELS[model_name]['rating'],\n",
|
| 528 |
+
" 'Notes': VLM_MODELS[model_name]['notes'],\n",
|
| 529 |
+
" 'Extracted_Preview': extracted_text[:200]\n",
|
| 530 |
+
" }\n",
|
| 531 |
+
" \n",
|
| 532 |
+
" results.append(result)\n",
|
| 533 |
+
" \n",
|
| 534 |
+
" # Show summary\n",
|
| 535 |
+
" print(f\"\\n📊 Metrics:\")\n",
|
| 536 |
+
" print(f\" CSR (Character Success): {metrics['CSR']:.2f}%\")\n",
|
| 537 |
+
" print(f\" WSR (Word Success): {metrics['WSR']:.2f}%\")\n",
|
| 538 |
+
" print(f\" CER (Character Error): {metrics['CER']:.2f}%\")\n",
|
| 539 |
+
" print(f\" WER (Word Error): {metrics['WER']:.2f}%\")\n",
|
| 540 |
+
"\n",
|
| 541 |
+
"print(f\"\\n{'='*80}\")\n",
|
| 542 |
+
"print(\"✅ VLM OCR Benchmarking complete!\")\n",
|
| 543 |
+
"print(f\"{'='*80}\")"
|
| 544 |
+
]
|
| 545 |
+
},
|
| 546 |
+
{
|
| 547 |
+
"cell_type": "markdown",
|
| 548 |
+
"metadata": {},
|
| 549 |
+
"source": [
|
| 550 |
+
"## 6. Results Analysis"
|
| 551 |
+
]
|
| 552 |
+
},
|
| 553 |
+
{
|
| 554 |
+
"cell_type": "code",
|
| 555 |
+
"execution_count": null,
|
| 556 |
+
"metadata": {},
|
| 557 |
+
"outputs": [
|
| 558 |
+
{
|
| 559 |
+
"name": "stdout",
|
| 560 |
+
"output_type": "stream",
|
| 561 |
+
"text": [
|
| 562 |
+
"\n",
|
| 563 |
+
"====================================================================================================\n",
|
| 564 |
+
"📊 VLM OCR BENCHMARKING RESULTS\n",
|
| 565 |
+
"====================================================================================================\n",
|
| 566 |
+
" Model CSR WSR CER WER Response_Time Rating\n",
|
| 567 |
+
"GPT-4.1 85.86 67.61 14.14 32.39 133.49 ⭐⭐⭐⭐⭐\n",
|
| 568 |
+
"====================================================================================================\n"
|
| 569 |
+
]
|
| 570 |
+
}
|
| 571 |
+
],
|
| 572 |
+
"source": [
|
| 573 |
+
"# Create DataFrame\n",
|
| 574 |
+
"df = pd.DataFrame(results)\n",
|
| 575 |
+
"\n",
|
| 576 |
+
"# Sort by CSR (best first)\n",
|
| 577 |
+
"df = df.sort_values('CSR', ascending=False).reset_index(drop=True)\n",
|
| 578 |
+
"\n",
|
| 579 |
+
"# Display results\n",
|
| 580 |
+
"display_cols = ['Model', 'CSR', 'WSR', 'CER', 'WER', 'Response_Time', 'Rating']\n",
|
| 581 |
+
"\n",
|
| 582 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 583 |
+
"print(\"📊 VLM OCR BENCHMARKING RESULTS\")\n",
|
| 584 |
+
"print(\"=\"*100)\n",
|
| 585 |
+
"print(df[display_cols].to_string(index=False))\n",
|
| 586 |
+
"print(\"=\"*100)"
|
| 587 |
+
]
|
| 588 |
+
},
|
| 589 |
+
{
|
| 590 |
+
"cell_type": "markdown",
|
| 591 |
+
"metadata": {},
|
| 592 |
+
"source": [
|
| 593 |
+
"## 7. Visualizations"
|
| 594 |
+
]
|
| 595 |
+
},
|
| 596 |
+
{
|
| 597 |
+
"cell_type": "code",
|
| 598 |
+
"execution_count": null,
|
| 599 |
+
"metadata": {},
|
| 600 |
+
"outputs": [],
|
| 601 |
+
"source": [
|
| 602 |
+
"# Create comprehensive visualization\n",
|
| 603 |
+
"import os\n",
|
| 604 |
+
"from pathlib import Path\n",
|
| 605 |
+
"\n",
|
| 606 |
+
"# Create output directory\n",
|
| 607 |
+
"output_dir = Path('output/vlm_ocr_benchmark')\n",
|
| 608 |
+
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
| 609 |
+
"\n",
|
| 610 |
+
"fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
|
| 611 |
+
"\n",
|
| 612 |
+
"models = df['Model'].tolist()\n",
|
| 613 |
+
"colors = sns.color_palette('viridis', len(models))\n",
|
| 614 |
+
"\n",
|
| 615 |
+
"# 1. CSR - Character Success Rate (MAIN METRIC)\n",
|
| 616 |
+
"ax1 = axes[0, 0]\n",
|
| 617 |
+
"bars1 = ax1.barh(models, df['CSR'], color=colors)\n",
|
| 618 |
+
"ax1.set_xlabel('CSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
|
| 619 |
+
"ax1.set_title('Character Success Rate (CSR)\\n🏆 HACKATHON PRIMARY METRIC', \n",
|
| 620 |
+
" fontsize=14, fontweight='bold')\n",
|
| 621 |
+
"ax1.set_xlim(0, 100)\n",
|
| 622 |
+
"for i, (model, csr) in enumerate(zip(models, df['CSR'])):\n",
|
| 623 |
+
" ax1.text(csr + 1, i, f'{csr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
|
| 624 |
+
"ax1.axvline(x=90, color='green', linestyle='--', alpha=0.3, label='Excellent (>90%)')\n",
|
| 625 |
+
"ax1.axvline(x=80, color='orange', linestyle='--', alpha=0.3, label='Good (>80%)')\n",
|
| 626 |
+
"ax1.legend(fontsize=9)\n",
|
| 627 |
+
"\n",
|
| 628 |
+
"# 2. WSR - Word Success Rate\n",
|
| 629 |
+
"ax2 = axes[0, 1]\n",
|
| 630 |
+
"bars2 = ax2.barh(models, df['WSR'], color=colors)\n",
|
| 631 |
+
"ax2.set_xlabel('WSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
|
| 632 |
+
"ax2.set_title('Word Success Rate (WSR)', fontsize=14, fontweight='bold')\n",
|
| 633 |
+
"ax2.set_xlim(0, 100)\n",
|
| 634 |
+
"for i, (model, wsr) in enumerate(zip(models, df['WSR'])):\n",
|
| 635 |
+
" ax2.text(wsr + 1, i, f'{wsr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
|
| 636 |
+
"\n",
|
| 637 |
+
"# 3. Response Time\n",
|
| 638 |
+
"ax3 = axes[1, 0]\n",
|
| 639 |
+
"bars3 = ax3.barh(models, df['Response_Time'], color=colors)\n",
|
| 640 |
+
"ax3.set_xlabel('Total Time (seconds) - Lower is Better', fontsize=12, fontweight='bold')\n",
|
| 641 |
+
"ax3.set_title('Processing Speed', fontsize=14, fontweight='bold')\n",
|
| 642 |
+
"for i, (model, time_val) in enumerate(zip(models, df['Response_Time'])):\n",
|
| 643 |
+
" ax3.text(time_val + 0.5, i, f'{time_val:.1f}s', va='center', fontsize=11)\n",
|
| 644 |
+
"\n",
|
| 645 |
+
"# 4. Error Rates Comparison\n",
|
| 646 |
+
"ax4 = axes[1, 1]\n",
|
| 647 |
+
"x = range(len(models))\n",
|
| 648 |
+
"width = 0.35\n",
|
| 649 |
+
"ax4.bar([i - width/2 for i in x], df['CER'], width, label='CER', color='coral', alpha=0.8)\n",
|
| 650 |
+
"ax4.bar([i + width/2 for i in x], df['WER'], width, label='WER', color='skyblue', alpha=0.8)\n",
|
| 651 |
+
"ax4.set_ylabel('Error Rate (%) - Lower is Better', fontsize=12, fontweight='bold')\n",
|
| 652 |
+
"ax4.set_title('Error Rates', fontsize=14, fontweight='bold')\n",
|
| 653 |
+
"ax4.set_xticks(x)\n",
|
| 654 |
+
"ax4.set_xticklabels(models, rotation=45, ha='right')\n",
|
| 655 |
+
"ax4.legend(fontsize=11)\n",
|
| 656 |
+
"ax4.grid(axis='y', alpha=0.3)\n",
|
| 657 |
+
"\n",
|
| 658 |
+
"plt.tight_layout()\n",
|
| 659 |
+
"plt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\n",
|
| 660 |
+
"plt.show()\n",
|
| 661 |
+
"\n",
|
| 662 |
+
"print(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
|
| 663 |
+
]
|
| 664 |
+
},
|
| 665 |
+
{
|
| 666 |
+
"cell_type": "markdown",
|
| 667 |
+
"metadata": {},
|
| 668 |
+
"source": [
|
| 669 |
+
"## 8. Winner Analysis and Recommendations"
|
| 670 |
+
]
|
| 671 |
+
},
|
| 672 |
+
{
|
| 673 |
+
"cell_type": "code",
|
| 674 |
+
"execution_count": null,
|
| 675 |
+
"metadata": {},
|
| 676 |
+
"outputs": [
|
| 677 |
+
{
|
| 678 |
+
"name": "stdout",
|
| 679 |
+
"output_type": "stream",
|
| 680 |
+
"text": [
|
| 681 |
+
"\n",
|
| 682 |
+
"====================================================================================================\n",
|
| 683 |
+
"🏆 FINAL RANKINGS\n",
|
| 684 |
+
"====================================================================================================\n",
|
| 685 |
+
" Rank Model CSR WSR CER WER Response_Time Rating\n",
|
| 686 |
+
" 1 GPT-4.1 85.86 67.61 14.14 32.39 133.49 ⭐⭐⭐⭐⭐\n",
|
| 687 |
+
"====================================================================================================\n",
|
| 688 |
+
"\n",
|
| 689 |
+
"====================================================================================================\n",
|
| 690 |
+
"💡 RECOMMENDATIONS FOR HACKATHON\n",
|
| 691 |
+
"====================================================================================================\n",
|
| 692 |
+
"\n",
|
| 693 |
+
"🥇 BEST OVERALL: GPT-4.1 ⭐⭐⭐⭐⭐\n",
|
| 694 |
+
" CSR: 85.86% (Character Success)\n",
|
| 695 |
+
" WSR: 67.61% (Word Success)\n",
|
| 696 |
+
" CER: 14.14% (Character Error)\n",
|
| 697 |
+
" WER: 32.39% (Word Error)\n",
|
| 698 |
+
" Time: 133.49s for 12 pages\n",
|
| 699 |
+
" Notes: Excellent OCR\n",
|
| 700 |
+
"\n",
|
| 701 |
+
"⚡ FASTEST: GPT-4.1\n",
|
| 702 |
+
" Time: 133.49s\n",
|
| 703 |
+
" CSR: 85.86%\n",
|
| 704 |
+
"\n",
|
| 705 |
+
"====================================================================================================\n",
|
| 706 |
+
"📝 HACKATHON SCORING IMPACT\n",
|
| 707 |
+
"====================================================================================================\n",
|
| 708 |
+
"\n",
|
| 709 |
+
"OCR Quality = 50% of total hackathon score\n",
|
| 710 |
+
"\n",
|
| 711 |
+
"Using GPT-4.1:\n",
|
| 712 |
+
" - CSR: 85.86% × 50% = 42.93 points\n",
|
| 713 |
+
" - This is 85.9% accuracy on character-level OCR\n",
|
| 714 |
+
"\n",
|
| 715 |
+
"⚠️ GOOD - Consider optimizing prompt or trying other models\n",
|
| 716 |
+
"\n",
|
| 717 |
+
"====================================================================================================\n",
|
| 718 |
+
"🎯 FINAL RECOMMENDATION\n",
|
| 719 |
+
"====================================================================================================\n",
|
| 720 |
+
"\n",
|
| 721 |
+
"Use: GPT-4.1\n",
|
| 722 |
+
"Reason: Highest accuracy (85.86% CSR) for hackathon OCR benchmark\n",
|
| 723 |
+
"Implementation: Use vision API with same prompt as tested\n",
|
| 724 |
+
"====================================================================================================\n"
|
| 725 |
+
]
|
| 726 |
+
}
|
| 727 |
+
],
|
| 728 |
+
"source": [
|
| 729 |
+
"# Rankings\n",
|
| 730 |
+
"rankings = df[['Model', 'CSR', 'WSR', 'CER', 'WER', 'Response_Time', 'Rating']].copy()\n",
|
| 731 |
+
"rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
|
| 732 |
+
"\n",
|
| 733 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 734 |
+
"print(\"🏆 FINAL RANKINGS\")\n",
|
| 735 |
+
"print(\"=\"*100)\n",
|
| 736 |
+
"print(rankings.to_string(index=False))\n",
|
| 737 |
+
"print(\"=\"*100)\n",
|
| 738 |
+
"\n",
|
| 739 |
+
"# Winner\n",
|
| 740 |
+
"best_model = df.iloc[0]\n",
|
| 741 |
+
"fastest_model = df.loc[df['Response_Time'].idxmin()]\n",
|
| 742 |
+
"\n",
|
| 743 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 744 |
+
"print(\"💡 RECOMMENDATIONS FOR HACKATHON\")\n",
|
| 745 |
+
"print(\"=\"*100)\n",
|
| 746 |
+
"\n",
|
| 747 |
+
"print(f\"\\n🥇 BEST OVERALL: {best_model['Model']} {best_model['Rating']}\")\n",
|
| 748 |
+
"print(f\" CSR: {best_model['CSR']:.2f}% (Character Success)\")\n",
|
| 749 |
+
"print(f\" WSR: {best_model['WSR']:.2f}% (Word Success)\")\n",
|
| 750 |
+
"print(f\" CER: {best_model['CER']:.2f}% (Character Error)\")\n",
|
| 751 |
+
"print(f\" WER: {best_model['WER']:.2f}% (Word Error)\")\n",
|
| 752 |
+
"print(f\" Time: {best_model['Response_Time']:.2f}s for {len(images)} pages\")\n",
|
| 753 |
+
"print(f\" Notes: {best_model['Notes']}\")\n",
|
| 754 |
+
"\n",
|
| 755 |
+
"print(f\"\\n⚡ FASTEST: {fastest_model['Model']}\")\n",
|
| 756 |
+
"print(f\" Time: {fastest_model['Response_Time']:.2f}s\")\n",
|
| 757 |
+
"print(f\" CSR: {fastest_model['CSR']:.2f}%\")\n",
|
| 758 |
+
"\n",
|
| 759 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 760 |
+
"print(\"📝 HACKATHON SCORING IMPACT\")\n",
|
| 761 |
+
"print(\"=\"*100)\n",
|
| 762 |
+
"print(\"\\nOCR Quality = 50% of total hackathon score\")\n",
|
| 763 |
+
"print(f\"\\nUsing {best_model['Model']}:\")\n",
|
| 764 |
+
"print(f\" - CSR: {best_model['CSR']:.2f}% × 50% = {best_model['CSR'] * 0.5:.2f} points\")\n",
|
| 765 |
+
"print(f\" - This is {best_model['CSR']:.1f}% accuracy on character-level OCR\")\n",
|
| 766 |
+
"\n",
|
| 767 |
+
"if best_model['CSR'] >= 95:\n",
|
| 768 |
+
" print(\"\\n✅ EXCELLENT - This will score very high on OCR!\")\n",
|
| 769 |
+
"elif best_model['CSR'] >= 90:\n",
|
| 770 |
+
" print(\"\\n✅ VERY GOOD - Strong OCR performance!\")\n",
|
| 771 |
+
"elif best_model['CSR'] >= 85:\n",
|
| 772 |
+
" print(\"\\n⚠️ GOOD - Consider optimizing prompt or trying other models\")\n",
|
| 773 |
+
"else:\n",
|
| 774 |
+
" print(\"\\n⚠️ NEEDS IMPROVEMENT - Try other models or adjust parameters\")\n",
|
| 775 |
+
"\n",
|
| 776 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 777 |
+
"print(\"🎯 FINAL RECOMMENDATION\")\n",
|
| 778 |
+
"print(\"=\"*100)\n",
|
| 779 |
+
"print(f\"\\nUse: {best_model['Model']}\")\n",
|
| 780 |
+
"print(f\"Reason: Highest accuracy ({best_model['CSR']:.2f}% CSR) for hackathon OCR benchmark\")\n",
|
| 781 |
+
"print(f\"Implementation: Use vision API with same prompt as tested\")\n",
|
| 782 |
+
"print(\"=\"*100)"
|
| 783 |
+
]
|
| 784 |
+
},
|
| 785 |
+
{
|
| 786 |
+
"cell_type": "markdown",
|
| 787 |
+
"metadata": {},
|
| 788 |
+
"source": [
|
| 789 |
+
"## 9. Export Results"
|
| 790 |
+
]
|
| 791 |
+
},
|
| 792 |
+
{
|
| 793 |
+
"cell_type": "code",
|
| 794 |
+
"execution_count": null,
|
| 795 |
+
"metadata": {},
|
| 796 |
+
"outputs": [],
|
| 797 |
+
"source": [
|
| 798 |
+
"# Save results\n",
|
| 799 |
+
"from pathlib import Path\n",
|
| 800 |
+
"\n",
|
| 801 |
+
"output_dir = Path('output/vlm_ocr_benchmark')\n",
|
| 802 |
+
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
| 803 |
+
"\n",
|
| 804 |
+
"df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
|
| 805 |
+
"rankings.to_csv(output_dir / 'rankings.csv', index=False, encoding='utf-8')\n",
|
| 806 |
+
"\n",
|
| 807 |
+
"print(\"\\n✅ Results exported to output/vlm_ocr_benchmark/:\")\n",
|
| 808 |
+
"print(\" - detailed_results.csv\")\n",
|
| 809 |
+
"print(\" - rankings.csv\")\n",
|
| 810 |
+
"print(\" - results.png\")"
|
| 811 |
+
]
|
| 812 |
+
},
|
| 813 |
+
{
|
| 814 |
+
"cell_type": "markdown",
|
| 815 |
+
"metadata": {},
|
| 816 |
+
"source": [
|
| 817 |
+
"## 10. Sample Text Comparison"
|
| 818 |
+
]
|
| 819 |
+
},
|
| 820 |
+
{
|
| 821 |
+
"cell_type": "code",
|
| 822 |
+
"execution_count": null,
|
| 823 |
+
"metadata": {},
|
| 824 |
+
"outputs": [
|
| 825 |
+
{
|
| 826 |
+
"name": "stdout",
|
| 827 |
+
"output_type": "stream",
|
| 828 |
+
"text": [
|
| 829 |
+
"\n",
|
| 830 |
+
"====================================================================================================\n",
|
| 831 |
+
"📝 SAMPLE TEXT COMPARISON (First 500 characters)\n",
|
| 832 |
+
"====================================================================================================\n",
|
| 833 |
+
"\n",
|
| 834 |
+
"🎯 GROUND TRUTH:\n",
|
| 835 |
+
"----------------------------------------------------------------------------------------------------\n",
|
| 836 |
+
"XÜLASƏ\n",
|
| 837 |
+
"\n",
|
| 838 |
+
"Bu tədqiqat Aşağı Kür çökəkliyi (AKÇ) və Bakı arxipelaqı (BA) daxil olmaqla Cənubi Xəzər çökəkliyi sistemində faydalı qazıntıların mənşəyinin paleotektonik, paleocoğrafi şərait və geodinamik rejimlə necə əlaqələndiyini, eləcə də Gec Miosendən etibarən Ərəbistan plitəsinin təsiri ilə formalaşan kollizion proseslərin bölgənin struktur-morfoloji və termal inkişafına nə dərəcədə yönverici rol oynadığını kompleks şəkildə qiymətləndirir. Seismotektonik göstəricilərin, çöküntütoplanma sürətləri\n",
|
| 839 |
+
"\n",
|
| 840 |
+
"🤖 GPT-4.1 (CSR: 85.86%):\n",
|
| 841 |
+
"----------------------------------------------------------------------------------------------------\n",
|
| 842 |
+
"Xülasə\n",
|
| 843 |
+
"Bu məqalə Apşə Kər çökəkliyi (AKÇ) və Bakı arxipelaqı (BA) daxil olmaqla Cənubi Xəzər çökəkliyi sistemində faydalı qazıntıların mənşəyinin paleotektonik, paleocoğrafi şərait və geodinamik rejim\n",
|
| 844 |
+
"\n",
|
| 845 |
+
"====================================================================================================\n"
|
| 846 |
+
]
|
| 847 |
+
}
|
| 848 |
+
],
|
| 849 |
+
"source": [
|
| 850 |
+
"# Show comparison of first 500 characters\n",
|
| 851 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 852 |
+
"print(\"📝 SAMPLE TEXT COMPARISON (First 500 characters)\")\n",
|
| 853 |
+
"print(\"=\"*100)\n",
|
| 854 |
+
"\n",
|
| 855 |
+
"print(\"\\n🎯 GROUND TRUTH:\")\n",
|
| 856 |
+
"print(\"-\" * 100)\n",
|
| 857 |
+
"print(ground_truth[:500])\n",
|
| 858 |
+
"\n",
|
| 859 |
+
"for _, row in df.iterrows():\n",
|
| 860 |
+
" print(f\"\\n🤖 {row['Model']} (CSR: {row['CSR']:.2f}%):\")\n",
|
| 861 |
+
" print(\"-\" * 100)\n",
|
| 862 |
+
" # Get first 500 chars from extracted text\n",
|
| 863 |
+
" preview = row['Extracted_Preview'] if len(row['Extracted_Preview']) >= 500 else row['Extracted_Preview']\n",
|
| 864 |
+
" print(preview[:500])\n",
|
| 865 |
+
"\n",
|
| 866 |
+
"print(\"\\n\" + \"=\"*100)"
|
| 867 |
+
]
|
| 868 |
+
}
|
| 869 |
+
],
|
| 870 |
+
"metadata": {
|
| 871 |
+
"kernelspec": {
|
| 872 |
+
"display_name": "venv",
|
| 873 |
+
"language": "python",
|
| 874 |
+
"name": "python3"
|
| 875 |
+
},
|
| 876 |
+
"language_info": {
|
| 877 |
+
"codemirror_mode": {
|
| 878 |
+
"name": "ipython",
|
| 879 |
+
"version": 3
|
| 880 |
+
},
|
| 881 |
+
"file_extension": ".py",
|
| 882 |
+
"mimetype": "text/x-python",
|
| 883 |
+
"name": "python",
|
| 884 |
+
"nbconvert_exporter": "python",
|
| 885 |
+
"pygments_lexer": "ipython3",
|
| 886 |
+
"version": "3.10.12"
|
| 887 |
+
}
|
| 888 |
+
},
|
| 889 |
+
"nbformat": 4,
|
| 890 |
+
"nbformat_minor": 4
|
| 891 |
+
}
|
requirements.txt
DELETED
|
@@ -1,30 +0,0 @@
|
|
| 1 |
-
# Web Framework
|
| 2 |
-
fastapi==0.104.1
|
| 3 |
-
uvicorn[standard]==0.24.0
|
| 4 |
-
python-multipart==0.0.6
|
| 5 |
-
|
| 6 |
-
# Azure Services (OCR + LLM)
|
| 7 |
-
azure-ai-formrecognizer==3.3.2
|
| 8 |
-
azure-ai-documentintelligence==1.0.0b1
|
| 9 |
-
openai==1.3.0
|
| 10 |
-
|
| 11 |
-
# PDF Processing & Image Extraction
|
| 12 |
-
PyMuPDF==1.23.8
|
| 13 |
-
|
| 14 |
-
# Vector Database & Embeddings
|
| 15 |
-
pinecone-client==3.0.0
|
| 16 |
-
sentence-transformers>=2.5.0
|
| 17 |
-
|
| 18 |
-
# Utilities
|
| 19 |
-
python-dotenv==1.0.0
|
| 20 |
-
pydantic==2.5.0
|
| 21 |
-
pydantic-settings==2.1.0
|
| 22 |
-
requests==2.31.0
|
| 23 |
-
aiofiles==23.2.1
|
| 24 |
-
|
| 25 |
-
# Monitoring & Logging
|
| 26 |
-
loguru==0.7.2
|
| 27 |
-
|
| 28 |
-
# Development (optional)
|
| 29 |
-
pytest==7.4.3
|
| 30 |
-
httpx==0.25.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
run.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
"""Run the FastAPI application"""
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
import uvicorn
|
| 5 |
-
from src.config import settings
|
| 6 |
-
|
| 7 |
-
# Disable telemetry and warnings
|
| 8 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 9 |
-
os.environ["ANONYMIZED_TELEMETRY"] = "false"
|
| 10 |
-
|
| 11 |
-
if __name__ == "__main__":
|
| 12 |
-
uvicorn.run(
|
| 13 |
-
"src.api.main:app",
|
| 14 |
-
host=settings.api_host,
|
| 15 |
-
port=settings.api_port,
|
| 16 |
-
reload=True,
|
| 17 |
-
log_level="info",
|
| 18 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/__init__.py
DELETED
|
File without changes
|
src/api/__init__.py
DELETED
|
File without changes
|
src/api/main.py
DELETED
|
@@ -1,181 +0,0 @@
|
|
| 1 |
-
"""FastAPI application with OCR and LLM endpoints"""
|
| 2 |
-
|
| 3 |
-
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 4 |
-
from fastapi.responses import JSONResponse, Response
|
| 5 |
-
from typing import List
|
| 6 |
-
from loguru import logger
|
| 7 |
-
import sys
|
| 8 |
-
|
| 9 |
-
from src.api.models import (
|
| 10 |
-
OCRPageResponse,
|
| 11 |
-
ChatMessage,
|
| 12 |
-
LLMResponse,
|
| 13 |
-
ErrorResponse,
|
| 14 |
-
)
|
| 15 |
-
from src.ocr.processor import get_ocr_processor
|
| 16 |
-
from src.config import settings
|
| 17 |
-
|
| 18 |
-
# Configure logging
|
| 19 |
-
logger.remove()
|
| 20 |
-
logger.add(sys.stderr, level="INFO")
|
| 21 |
-
|
| 22 |
-
# Create FastAPI app
|
| 23 |
-
app = FastAPI(
|
| 24 |
-
title="SOCAR Historical Document Processing API",
|
| 25 |
-
description="OCR and LLM endpoints for processing historical documents",
|
| 26 |
-
version="1.0.0",
|
| 27 |
-
)
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
@app.get("/")
|
| 31 |
-
async def root():
|
| 32 |
-
"""Health check endpoint"""
|
| 33 |
-
return {
|
| 34 |
-
"status": "healthy",
|
| 35 |
-
"service": "SOCAR Document Processing API",
|
| 36 |
-
"endpoints": ["/ocr", "/llm"],
|
| 37 |
-
}
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
@app.get("/favicon.ico", include_in_schema=False)
|
| 41 |
-
async def favicon():
|
| 42 |
-
"""Return favicon for browser tab"""
|
| 43 |
-
# Simple SVG favicon representing oil/gas industry
|
| 44 |
-
svg = """<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
|
| 45 |
-
<circle cx="50" cy="50" r="45" fill="#0066cc"/>
|
| 46 |
-
<path d="M30 60 L50 30 L70 60 Z" fill="#ffffff"/>
|
| 47 |
-
<rect x="45" y="55" width="10" height="30" fill="#ffffff"/>
|
| 48 |
-
</svg>"""
|
| 49 |
-
return Response(content=svg, media_type="image/svg+xml")
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
@app.post(
|
| 53 |
-
"/ocr",
|
| 54 |
-
response_model=List[OCRPageResponse],
|
| 55 |
-
responses={
|
| 56 |
-
200: {"description": "Successfully processed PDF"},
|
| 57 |
-
400: {"model": ErrorResponse, "description": "Invalid PDF file"},
|
| 58 |
-
500: {"model": ErrorResponse, "description": "Processing error"},
|
| 59 |
-
},
|
| 60 |
-
)
|
| 61 |
-
async def process_ocr(file: UploadFile = File(...)):
|
| 62 |
-
"""
|
| 63 |
-
OCR Endpoint - Extract text from PDF documents
|
| 64 |
-
|
| 65 |
-
Accepts a PDF file upload and returns the extracted Markdown text for each page.
|
| 66 |
-
|
| 67 |
-
Args:
|
| 68 |
-
file: PDF file in multipart/form-data format
|
| 69 |
-
|
| 70 |
-
Returns:
|
| 71 |
-
List of dictionaries with page_number and MD_text for each page
|
| 72 |
-
"""
|
| 73 |
-
try:
|
| 74 |
-
# Validate file type
|
| 75 |
-
if not file.filename.lower().endswith(".pdf"):
|
| 76 |
-
raise HTTPException(
|
| 77 |
-
status_code=400,
|
| 78 |
-
detail="Invalid file type. Only PDF files are accepted.",
|
| 79 |
-
)
|
| 80 |
-
|
| 81 |
-
# Read file content
|
| 82 |
-
logger.info(f"Receiving PDF file: {file.filename}")
|
| 83 |
-
pdf_content = await file.read()
|
| 84 |
-
|
| 85 |
-
if len(pdf_content) == 0:
|
| 86 |
-
raise HTTPException(status_code=400, detail="Empty PDF file")
|
| 87 |
-
|
| 88 |
-
# Process PDF with OCR
|
| 89 |
-
ocr_processor = get_ocr_processor()
|
| 90 |
-
result = ocr_processor.process_pdf(pdf_content, file.filename)
|
| 91 |
-
|
| 92 |
-
# Convert to response format
|
| 93 |
-
response = [
|
| 94 |
-
OCRPageResponse(page_number=page["page_number"], MD_text=page["MD_text"])
|
| 95 |
-
for page in result
|
| 96 |
-
]
|
| 97 |
-
|
| 98 |
-
logger.info(f"Successfully processed {len(response)} pages from {file.filename}")
|
| 99 |
-
return response
|
| 100 |
-
|
| 101 |
-
except HTTPException:
|
| 102 |
-
raise
|
| 103 |
-
except Exception as e:
|
| 104 |
-
logger.error(f"Error processing OCR request: {e}")
|
| 105 |
-
raise HTTPException(
|
| 106 |
-
status_code=500, detail=f"Failed to process PDF: {str(e)}"
|
| 107 |
-
)
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
@app.post(
|
| 111 |
-
"/llm",
|
| 112 |
-
response_model=LLMResponse,
|
| 113 |
-
responses={
|
| 114 |
-
200: {"description": "Successfully generated response"},
|
| 115 |
-
400: {"model": ErrorResponse, "description": "Invalid request"},
|
| 116 |
-
500: {"model": ErrorResponse, "description": "Processing error"},
|
| 117 |
-
},
|
| 118 |
-
)
|
| 119 |
-
async def process_llm(messages: List[ChatMessage]):
|
| 120 |
-
"""
|
| 121 |
-
LLM Endpoint - Generate answers from document knowledge base
|
| 122 |
-
|
| 123 |
-
Receives chat history and produces an LLM-generated answer along with source references.
|
| 124 |
-
|
| 125 |
-
Args:
|
| 126 |
-
messages: List of chat messages with role and content
|
| 127 |
-
|
| 128 |
-
Returns:
|
| 129 |
-
Dictionary with sources and answer
|
| 130 |
-
"""
|
| 131 |
-
try:
|
| 132 |
-
# Validate input
|
| 133 |
-
if not messages:
|
| 134 |
-
raise HTTPException(status_code=400, detail="No messages provided")
|
| 135 |
-
|
| 136 |
-
logger.info(f"Received {len(messages)} messages for LLM processing")
|
| 137 |
-
|
| 138 |
-
# Get the last user message as the query
|
| 139 |
-
last_message = messages[-1]
|
| 140 |
-
if last_message.role != "user":
|
| 141 |
-
raise HTTPException(
|
| 142 |
-
status_code=400,
|
| 143 |
-
detail="Last message must be from user",
|
| 144 |
-
)
|
| 145 |
-
|
| 146 |
-
query = last_message.content
|
| 147 |
-
|
| 148 |
-
# Prepare chat history (all messages except the last one)
|
| 149 |
-
chat_history = None
|
| 150 |
-
if len(messages) > 1:
|
| 151 |
-
chat_history = [
|
| 152 |
-
{"role": msg.role, "content": msg.content}
|
| 153 |
-
for msg in messages[:-1]
|
| 154 |
-
]
|
| 155 |
-
|
| 156 |
-
# Process query using RAG pipeline
|
| 157 |
-
from src.llm.rag_pipeline import get_rag_pipeline
|
| 158 |
-
|
| 159 |
-
rag = get_rag_pipeline()
|
| 160 |
-
result = rag.query(query, chat_history=chat_history)
|
| 161 |
-
|
| 162 |
-
logger.info(f"Generated answer with {len(result['sources'])} sources")
|
| 163 |
-
|
| 164 |
-
return LLMResponse(
|
| 165 |
-
sources=result["sources"],
|
| 166 |
-
answer=result["answer"],
|
| 167 |
-
)
|
| 168 |
-
|
| 169 |
-
except HTTPException:
|
| 170 |
-
raise
|
| 171 |
-
except Exception as e:
|
| 172 |
-
logger.error(f"Error processing LLM request: {e}")
|
| 173 |
-
raise HTTPException(
|
| 174 |
-
status_code=500, detail=f"Failed to generate response: {str(e)}"
|
| 175 |
-
)
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
if __name__ == "__main__":
|
| 179 |
-
import uvicorn
|
| 180 |
-
|
| 181 |
-
uvicorn.run(app, host=settings.api_host, port=settings.api_port)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/api/models.py
DELETED
|
@@ -1,48 +0,0 @@
|
|
| 1 |
-
"""Pydantic models for API requests and responses"""
|
| 2 |
-
|
| 3 |
-
from typing import List, Dict, Optional
|
| 4 |
-
from pydantic import BaseModel, Field
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
class OCRPageResponse(BaseModel):
|
| 8 |
-
"""Response model for a single page OCR result"""
|
| 9 |
-
|
| 10 |
-
page_number: int = Field(..., description="Page index starting from 1")
|
| 11 |
-
MD_text: str = Field(..., description="Markdown-formatted extracted text")
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class OCRResponse(BaseModel):
|
| 15 |
-
"""Response model for OCR endpoint"""
|
| 16 |
-
|
| 17 |
-
pages: List[OCRPageResponse]
|
| 18 |
-
total_pages: int
|
| 19 |
-
filename: Optional[str] = None
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class ChatMessage(BaseModel):
|
| 23 |
-
"""Chat message model"""
|
| 24 |
-
|
| 25 |
-
role: str = Field(..., description="Role of the message sender (user/assistant)")
|
| 26 |
-
content: str = Field(..., description="Message content")
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
class SourceReference(BaseModel):
|
| 30 |
-
"""Source reference for LLM response"""
|
| 31 |
-
|
| 32 |
-
pdf_name: str = Field(..., description="Name of the PDF")
|
| 33 |
-
page_number: int = Field(..., description="Page number in the PDF")
|
| 34 |
-
content: str = Field(..., description="Relevant extracted text (in Markdown)")
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
class LLMResponse(BaseModel):
|
| 38 |
-
"""Response model for LLM endpoint"""
|
| 39 |
-
|
| 40 |
-
sources: List[SourceReference] = Field(..., description="List of source references")
|
| 41 |
-
answer: str = Field(..., description="Generated answer to the user query")
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
class ErrorResponse(BaseModel):
|
| 45 |
-
"""Error response model"""
|
| 46 |
-
|
| 47 |
-
error: str
|
| 48 |
-
detail: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/config.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
from pydantic_settings import BaseSettings
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
class Settings(BaseSettings):
|
| 6 |
-
"""Application settings loaded from environment variables"""
|
| 7 |
-
|
| 8 |
-
# Azure OpenAI Configuration
|
| 9 |
-
azure_openai_api_key: str
|
| 10 |
-
azure_openai_endpoint: str
|
| 11 |
-
azure_openai_api_version: str = "2024-08-01-preview"
|
| 12 |
-
|
| 13 |
-
# Azure Document Intelligence
|
| 14 |
-
azure_document_intelligence_endpoint: str = ""
|
| 15 |
-
azure_document_intelligence_key: str = ""
|
| 16 |
-
|
| 17 |
-
# Application Configuration
|
| 18 |
-
data_dir: Path = Path("./data")
|
| 19 |
-
pdf_dir: Path = Path("./data/pdfs")
|
| 20 |
-
vector_db_path: Path = Path("./data/vector_db")
|
| 21 |
-
processed_dir: Path = Path("./data/processed")
|
| 22 |
-
|
| 23 |
-
# API Configuration
|
| 24 |
-
api_host: str = "0.0.0.0"
|
| 25 |
-
api_port: int = 8000
|
| 26 |
-
|
| 27 |
-
# OCR Settings
|
| 28 |
-
ocr_backend: str = "azure" # Azure Document Intelligence (92.79% CSR)
|
| 29 |
-
|
| 30 |
-
# LLM Settings
|
| 31 |
-
llm_model: str = "Llama-4-Maverick-17B-128E-Instruct-FP8" # Open-source LLM for hackathon
|
| 32 |
-
|
| 33 |
-
# Pinecone Settings
|
| 34 |
-
pinecone_api_key: str = ""
|
| 35 |
-
pinecone_index_name: str = "socar-documents"
|
| 36 |
-
pinecone_cloud: str = "aws"
|
| 37 |
-
pinecone_region: str = "us-east-1"
|
| 38 |
-
vector_db_type: str = "chroma" # Options: chroma, pinecone
|
| 39 |
-
|
| 40 |
-
class Config:
|
| 41 |
-
env_file = ".env"
|
| 42 |
-
case_sensitive = False
|
| 43 |
-
extra = "ignore" # Ignore extra fields in .env file
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
settings = Settings()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/llm/__init__.py
DELETED
|
File without changes
|
src/llm/deepseek_client.py
DELETED
|
@@ -1,126 +0,0 @@
|
|
| 1 |
-
"""DeepSeek LLM client using Azure AI Foundry"""
|
| 2 |
-
|
| 3 |
-
from typing import List, Dict, Optional
|
| 4 |
-
from loguru import logger
|
| 5 |
-
import openai
|
| 6 |
-
|
| 7 |
-
from src.config import settings
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class DeepSeekClient:
|
| 11 |
-
"""Client for DeepSeek LLM via Azure AI Foundry"""
|
| 12 |
-
|
| 13 |
-
def __init__(self):
|
| 14 |
-
"""Initialize DeepSeek client"""
|
| 15 |
-
# Configure OpenAI client to use Azure endpoint
|
| 16 |
-
self.client = openai.AzureOpenAI(
|
| 17 |
-
api_key=settings.azure_openai_api_key,
|
| 18 |
-
api_version=settings.azure_openai_api_version,
|
| 19 |
-
azure_endpoint=settings.azure_openai_endpoint,
|
| 20 |
-
)
|
| 21 |
-
|
| 22 |
-
# Get model name from settings
|
| 23 |
-
self.model_name = settings.llm_model
|
| 24 |
-
logger.info(f"Initialized LLM client with model: {self.model_name}")
|
| 25 |
-
|
| 26 |
-
def generate_response(
|
| 27 |
-
self,
|
| 28 |
-
messages: List[Dict[str, str]],
|
| 29 |
-
max_tokens: int = 1000,
|
| 30 |
-
temperature: float = 0.7,
|
| 31 |
-
) -> str:
|
| 32 |
-
"""
|
| 33 |
-
Generate response from DeepSeek model
|
| 34 |
-
|
| 35 |
-
Args:
|
| 36 |
-
messages: List of message dicts with 'role' and 'content'
|
| 37 |
-
max_tokens: Maximum tokens in response
|
| 38 |
-
temperature: Sampling temperature (0.0 to 1.0)
|
| 39 |
-
|
| 40 |
-
Returns:
|
| 41 |
-
Generated text response
|
| 42 |
-
"""
|
| 43 |
-
try:
|
| 44 |
-
logger.info(f"Generating response with {len(messages)} messages")
|
| 45 |
-
|
| 46 |
-
response = self.client.chat.completions.create(
|
| 47 |
-
model=self.model_name,
|
| 48 |
-
messages=messages,
|
| 49 |
-
max_tokens=max_tokens,
|
| 50 |
-
temperature=temperature,
|
| 51 |
-
)
|
| 52 |
-
|
| 53 |
-
generated_text = response.choices[0].message.content
|
| 54 |
-
logger.info(f"Generated response: {len(generated_text)} characters")
|
| 55 |
-
|
| 56 |
-
return generated_text
|
| 57 |
-
|
| 58 |
-
except Exception as e:
|
| 59 |
-
logger.error(f"Error generating response from {self.model_name}: {e}")
|
| 60 |
-
raise
|
| 61 |
-
|
| 62 |
-
def generate_with_context(
|
| 63 |
-
self,
|
| 64 |
-
query: str,
|
| 65 |
-
context_chunks: List[str],
|
| 66 |
-
chat_history: Optional[List[Dict[str, str]]] = None,
|
| 67 |
-
) -> str:
|
| 68 |
-
"""
|
| 69 |
-
Generate response with RAG context
|
| 70 |
-
|
| 71 |
-
Args:
|
| 72 |
-
query: User's question
|
| 73 |
-
context_chunks: Retrieved document chunks
|
| 74 |
-
chat_history: Previous chat messages
|
| 75 |
-
|
| 76 |
-
Returns:
|
| 77 |
-
Generated answer
|
| 78 |
-
"""
|
| 79 |
-
# Build context from chunks
|
| 80 |
-
context = "\n\n".join([f"[Document {i+1}]\n{chunk}" for i, chunk in enumerate(context_chunks)])
|
| 81 |
-
|
| 82 |
-
# Create system prompt optimized for LLM Judge evaluation
|
| 83 |
-
system_prompt = """You are an expert assistant specializing in SOCAR's historical oil and gas research documents.
|
| 84 |
-
|
| 85 |
-
CRITICAL INSTRUCTIONS for high-quality answers:
|
| 86 |
-
1. ACCURACY: Base your answer STRICTLY on the provided context - never add external information
|
| 87 |
-
2. RELEVANCE: Answer the exact question asked - be direct and focused
|
| 88 |
-
3. COMPLETENESS: Cover all key aspects mentioned in the context
|
| 89 |
-
4. CITATIONS: Reference specific documents (e.g., "According to Document 1...")
|
| 90 |
-
5. TECHNICAL PRECISION: Use correct oil & gas terminology from the documents
|
| 91 |
-
6. CLARITY: Structure your answer logically - use bullet points for multiple items
|
| 92 |
-
7. CONCISENESS: Be thorough but avoid redundancy or verbose explanations
|
| 93 |
-
|
| 94 |
-
If the context lacks sufficient information, clearly state what is missing."""
|
| 95 |
-
|
| 96 |
-
# Build messages
|
| 97 |
-
messages = [{"role": "system", "content": system_prompt}]
|
| 98 |
-
|
| 99 |
-
# Add chat history if provided
|
| 100 |
-
if chat_history:
|
| 101 |
-
messages.extend(chat_history)
|
| 102 |
-
|
| 103 |
-
# Add current query with context
|
| 104 |
-
user_message = f"""Context from documents:
|
| 105 |
-
{context}
|
| 106 |
-
|
| 107 |
-
Question: {query}
|
| 108 |
-
|
| 109 |
-
Provide a well-structured, accurate answer based ONLY on the context above. Include document citations."""
|
| 110 |
-
|
| 111 |
-
messages.append({"role": "user", "content": user_message})
|
| 112 |
-
|
| 113 |
-
# Optimized for quality (LLM Judge) while maintaining speed
|
| 114 |
-
return self.generate_response(messages, max_tokens=1000, temperature=0.2)
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
# Singleton instance
|
| 118 |
-
_deepseek_client = None
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
def get_deepseek_client() -> DeepSeekClient:
|
| 122 |
-
"""Get or create DeepSeek client instance"""
|
| 123 |
-
global _deepseek_client
|
| 124 |
-
if _deepseek_client is None:
|
| 125 |
-
_deepseek_client = DeepSeekClient()
|
| 126 |
-
return _deepseek_client
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/llm/rag_pipeline.py
DELETED
|
@@ -1,154 +0,0 @@
|
|
| 1 |
-
"""RAG (Retrieval Augmented Generation) pipeline"""
|
| 2 |
-
|
| 3 |
-
from typing import List, Dict, Optional
|
| 4 |
-
from loguru import logger
|
| 5 |
-
|
| 6 |
-
from src.llm.deepseek_client import get_deepseek_client
|
| 7 |
-
from src.vectordb import get_vector_store
|
| 8 |
-
from src.api.models import SourceReference
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
class RAGPipeline:
|
| 12 |
-
"""RAG pipeline for document-based question answering"""
|
| 13 |
-
|
| 14 |
-
def __init__(self):
|
| 15 |
-
"""Initialize RAG pipeline"""
|
| 16 |
-
self.llm = get_deepseek_client()
|
| 17 |
-
self.vector_store = get_vector_store()
|
| 18 |
-
logger.info("RAG pipeline initialized")
|
| 19 |
-
|
| 20 |
-
def query(
|
| 21 |
-
self,
|
| 22 |
-
question: str,
|
| 23 |
-
chat_history: Optional[List[Dict[str, str]]] = None,
|
| 24 |
-
n_results: int = 3,
|
| 25 |
-
) -> Dict:
|
| 26 |
-
"""
|
| 27 |
-
Answer a question using RAG
|
| 28 |
-
|
| 29 |
-
Args:
|
| 30 |
-
question: User's question
|
| 31 |
-
chat_history: Previous chat messages
|
| 32 |
-
n_results: Number of documents to retrieve
|
| 33 |
-
|
| 34 |
-
Returns:
|
| 35 |
-
Dict with 'answer' and 'sources'
|
| 36 |
-
"""
|
| 37 |
-
logger.info(f"Processing query: {question[:100]}...")
|
| 38 |
-
|
| 39 |
-
# Step 1: Retrieve relevant documents
|
| 40 |
-
search_results = self.vector_store.search(question, n_results=n_results)
|
| 41 |
-
|
| 42 |
-
# Step 2: Format sources
|
| 43 |
-
sources = []
|
| 44 |
-
context_chunks = []
|
| 45 |
-
|
| 46 |
-
for doc, metadata in zip(search_results["documents"], search_results["metadatas"]):
|
| 47 |
-
sources.append(
|
| 48 |
-
SourceReference(
|
| 49 |
-
pdf_name=metadata.get("pdf_name", "unknown.pdf"),
|
| 50 |
-
page_number=metadata.get("page_number", 0),
|
| 51 |
-
content=doc[:500], # Limit content length
|
| 52 |
-
)
|
| 53 |
-
)
|
| 54 |
-
context_chunks.append(doc)
|
| 55 |
-
|
| 56 |
-
logger.info(f"Retrieved {len(sources)} source documents")
|
| 57 |
-
|
| 58 |
-
# Step 3: Generate answer using LLM
|
| 59 |
-
answer = self.llm.generate_with_context(
|
| 60 |
-
query=question,
|
| 61 |
-
context_chunks=context_chunks,
|
| 62 |
-
chat_history=chat_history,
|
| 63 |
-
)
|
| 64 |
-
|
| 65 |
-
return {
|
| 66 |
-
"answer": answer,
|
| 67 |
-
"sources": sources,
|
| 68 |
-
}
|
| 69 |
-
|
| 70 |
-
def add_processed_document(
|
| 71 |
-
self,
|
| 72 |
-
pdf_name: str,
|
| 73 |
-
pages: List[Dict[str, any]],
|
| 74 |
-
chunk_size: int = 600,
|
| 75 |
-
chunk_overlap: int = 100,
|
| 76 |
-
):
|
| 77 |
-
"""
|
| 78 |
-
Add a processed PDF to the vector store
|
| 79 |
-
|
| 80 |
-
Args:
|
| 81 |
-
pdf_name: Name of the PDF file
|
| 82 |
-
pages: List of page dicts with page_number and MD_text
|
| 83 |
-
chunk_size: Size of text chunks in characters
|
| 84 |
-
chunk_overlap: Overlap between chunks in characters
|
| 85 |
-
"""
|
| 86 |
-
logger.info(f"Adding document to vector store: {pdf_name}")
|
| 87 |
-
|
| 88 |
-
texts = []
|
| 89 |
-
metadatas = []
|
| 90 |
-
ids = []
|
| 91 |
-
|
| 92 |
-
# Process each page
|
| 93 |
-
for page in pages:
|
| 94 |
-
page_num = page["page_number"]
|
| 95 |
-
text = page["MD_text"]
|
| 96 |
-
|
| 97 |
-
# Simple chunking by character count
|
| 98 |
-
chunks = self._chunk_text(text, chunk_size, chunk_overlap)
|
| 99 |
-
|
| 100 |
-
for chunk_idx, chunk in enumerate(chunks):
|
| 101 |
-
texts.append(chunk)
|
| 102 |
-
metadatas.append({
|
| 103 |
-
"pdf_name": pdf_name,
|
| 104 |
-
"page_number": page_num,
|
| 105 |
-
"chunk_index": chunk_idx,
|
| 106 |
-
})
|
| 107 |
-
ids.append(f"{pdf_name}_p{page_num}_c{chunk_idx}")
|
| 108 |
-
|
| 109 |
-
# Add to vector store
|
| 110 |
-
self.vector_store.add_documents(texts, metadatas, ids)
|
| 111 |
-
logger.info(f"Added {len(texts)} chunks from {pdf_name}")
|
| 112 |
-
|
| 113 |
-
def _chunk_text(
|
| 114 |
-
self, text: str, chunk_size: int, chunk_overlap: int
|
| 115 |
-
) -> List[str]:
|
| 116 |
-
"""
|
| 117 |
-
Split text into overlapping chunks
|
| 118 |
-
|
| 119 |
-
Args:
|
| 120 |
-
text: Text to chunk
|
| 121 |
-
chunk_size: Size of each chunk
|
| 122 |
-
chunk_overlap: Overlap between chunks
|
| 123 |
-
|
| 124 |
-
Returns:
|
| 125 |
-
List of text chunks
|
| 126 |
-
"""
|
| 127 |
-
if not text:
|
| 128 |
-
return []
|
| 129 |
-
|
| 130 |
-
chunks = []
|
| 131 |
-
start = 0
|
| 132 |
-
|
| 133 |
-
while start < len(text):
|
| 134 |
-
end = start + chunk_size
|
| 135 |
-
chunk = text[start:end]
|
| 136 |
-
|
| 137 |
-
if chunk.strip():
|
| 138 |
-
chunks.append(chunk)
|
| 139 |
-
|
| 140 |
-
start += chunk_size - chunk_overlap
|
| 141 |
-
|
| 142 |
-
return chunks
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
# Singleton instance
|
| 146 |
-
_rag_pipeline = None
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
def get_rag_pipeline() -> RAGPipeline:
|
| 150 |
-
"""Get or create RAG pipeline instance"""
|
| 151 |
-
global _rag_pipeline
|
| 152 |
-
if _rag_pipeline is None:
|
| 153 |
-
_rag_pipeline = RAGPipeline()
|
| 154 |
-
return _rag_pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/ocr/__init__.py
DELETED
|
File without changes
|
src/ocr/azure_ocr.py
DELETED
|
@@ -1,143 +0,0 @@
|
|
| 1 |
-
"""Azure Document Intelligence OCR processor"""
|
| 2 |
-
|
| 3 |
-
from typing import List, Dict
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
import io
|
| 6 |
-
from loguru import logger
|
| 7 |
-
|
| 8 |
-
from azure.ai.formrecognizer import DocumentAnalysisClient
|
| 9 |
-
from azure.core.credentials import AzureKeyCredential
|
| 10 |
-
import fitz # PyMuPDF for image detection
|
| 11 |
-
|
| 12 |
-
from src.config import settings
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
class AzureOCRProcessor:
|
| 16 |
-
"""Process PDFs using Azure Document Intelligence"""
|
| 17 |
-
|
| 18 |
-
def __init__(self):
|
| 19 |
-
"""Initialize Azure Document Intelligence client"""
|
| 20 |
-
# Use Azure OpenAI endpoint as Document Intelligence endpoint
|
| 21 |
-
# In production, these might be different
|
| 22 |
-
endpoint = settings.azure_openai_endpoint.rstrip("/")
|
| 23 |
-
api_key = settings.azure_openai_api_key
|
| 24 |
-
|
| 25 |
-
self.client = DocumentAnalysisClient(
|
| 26 |
-
endpoint=endpoint, credential=AzureKeyCredential(api_key)
|
| 27 |
-
)
|
| 28 |
-
|
| 29 |
-
logger.info("Initialized Azure Document Analysis client")
|
| 30 |
-
|
| 31 |
-
def process_pdf(self, pdf_file: bytes, pdf_name: str = "document.pdf") -> List[Dict[str, any]]:
|
| 32 |
-
"""
|
| 33 |
-
Process PDF and extract text + images using Azure Document Intelligence
|
| 34 |
-
|
| 35 |
-
Args:
|
| 36 |
-
pdf_file: PDF file as bytes
|
| 37 |
-
pdf_name: Name of the PDF file (for logging)
|
| 38 |
-
|
| 39 |
-
Returns:
|
| 40 |
-
List of dicts with page_number and MD_text (with inline images)
|
| 41 |
-
"""
|
| 42 |
-
try:
|
| 43 |
-
logger.info(f"Processing PDF ({len(pdf_file)} bytes)")
|
| 44 |
-
|
| 45 |
-
# Analyze document using Azure Form Recognizer
|
| 46 |
-
poller = self.client.begin_analyze_document(
|
| 47 |
-
"prebuilt-read", document=io.BytesIO(pdf_file)
|
| 48 |
-
)
|
| 49 |
-
result = poller.result()
|
| 50 |
-
|
| 51 |
-
# Detect images using PyMuPDF (don't save, just mention)
|
| 52 |
-
doc_name = Path(pdf_name).stem # Get filename without extension
|
| 53 |
-
images_by_page = self._detect_images(pdf_file, doc_name)
|
| 54 |
-
|
| 55 |
-
# Extract text page by page
|
| 56 |
-
pages_data = []
|
| 57 |
-
for page_num, page in enumerate(result.pages, start=1):
|
| 58 |
-
# Collect all lines from this page (PRESERVE CYRILLIC)
|
| 59 |
-
lines = []
|
| 60 |
-
if hasattr(page, 'lines') and page.lines:
|
| 61 |
-
for line in page.lines:
|
| 62 |
-
# Azure OCR preserves original encoding (Cyrillic stays Cyrillic)
|
| 63 |
-
lines.append(line.content)
|
| 64 |
-
|
| 65 |
-
page_text = "\n".join(lines) if lines else ""
|
| 66 |
-
|
| 67 |
-
# Get image references for this page
|
| 68 |
-
page_images = images_by_page.get(page_num - 1, []) # 0-indexed
|
| 69 |
-
|
| 70 |
-
# Only add image markdown if images exist
|
| 71 |
-
if page_images:
|
| 72 |
-
for img_ref in page_images:
|
| 73 |
-
# Create markdown image mention (no actual file)
|
| 74 |
-
md_image = f'\n\n\n\n'
|
| 75 |
-
page_text += md_image
|
| 76 |
-
|
| 77 |
-
logger.info(f"Added {len(page_images)} image references for page {page_num}")
|
| 78 |
-
|
| 79 |
-
pages_data.append({
|
| 80 |
-
"page_number": page_num,
|
| 81 |
-
"MD_text": page_text
|
| 82 |
-
})
|
| 83 |
-
|
| 84 |
-
logger.info(f"Successfully processed {len(pages_data)} pages")
|
| 85 |
-
return pages_data
|
| 86 |
-
|
| 87 |
-
except Exception as e:
|
| 88 |
-
logger.error(f"Error processing PDF with Azure: {e}")
|
| 89 |
-
raise
|
| 90 |
-
|
| 91 |
-
def _detect_images(self, pdf_file: bytes, doc_name: str) -> Dict[int, List[str]]:
|
| 92 |
-
"""
|
| 93 |
-
Detect images in PDF (don't save, just mention their presence)
|
| 94 |
-
|
| 95 |
-
Args:
|
| 96 |
-
pdf_file: PDF file as bytes
|
| 97 |
-
doc_name: Document name (without extension)
|
| 98 |
-
|
| 99 |
-
Returns:
|
| 100 |
-
Dict mapping page_number (0-indexed) to list of image references
|
| 101 |
-
"""
|
| 102 |
-
images_by_page = {}
|
| 103 |
-
|
| 104 |
-
try:
|
| 105 |
-
# Open PDF with PyMuPDF
|
| 106 |
-
pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
|
| 107 |
-
|
| 108 |
-
for page_num in range(len(pdf_document)):
|
| 109 |
-
page = pdf_document[page_num]
|
| 110 |
-
image_refs = []
|
| 111 |
-
|
| 112 |
-
# Get images from page
|
| 113 |
-
image_list = page.get_images()
|
| 114 |
-
|
| 115 |
-
# Only process if images exist on this page
|
| 116 |
-
if image_list:
|
| 117 |
-
for img_index, img in enumerate(image_list):
|
| 118 |
-
# Create simple reference: document_page_X_image_Y
|
| 119 |
-
img_ref = f"{doc_name}_page_{page_num + 1}_image_{img_index + 1}"
|
| 120 |
-
image_refs.append(img_ref)
|
| 121 |
-
|
| 122 |
-
if image_refs:
|
| 123 |
-
images_by_page[page_num] = image_refs
|
| 124 |
-
logger.info(f"Detected {len(image_refs)} images on page {page_num + 1}")
|
| 125 |
-
|
| 126 |
-
pdf_document.close()
|
| 127 |
-
|
| 128 |
-
except Exception as e:
|
| 129 |
-
logger.warning(f"Could not detect images: {e}")
|
| 130 |
-
|
| 131 |
-
return images_by_page
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
# Singleton instance
|
| 135 |
-
_azure_ocr_processor = None
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
def get_azure_ocr_processor() -> AzureOCRProcessor:
|
| 139 |
-
"""Get or create Azure OCR processor instance"""
|
| 140 |
-
global _azure_ocr_processor
|
| 141 |
-
if _azure_ocr_processor is None:
|
| 142 |
-
_azure_ocr_processor = AzureOCRProcessor()
|
| 143 |
-
return _azure_ocr_processor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/ocr/processor.py
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
"""Main OCR processor that handles different backends"""
|
| 2 |
-
|
| 3 |
-
from typing import List, Dict
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from loguru import logger
|
| 6 |
-
|
| 7 |
-
from src.config import settings
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class OCRProcessor:
|
| 11 |
-
"""Main OCR processor that can switch between different backends"""
|
| 12 |
-
|
| 13 |
-
def __init__(self, backend: str = None):
|
| 14 |
-
"""
|
| 15 |
-
Initialize OCR processor
|
| 16 |
-
|
| 17 |
-
Args:
|
| 18 |
-
backend: OCR backend to use (azure)
|
| 19 |
-
If None, uses settings.ocr_backend
|
| 20 |
-
"""
|
| 21 |
-
self.backend = backend or settings.ocr_backend
|
| 22 |
-
logger.info(f"Initializing OCR processor with backend: {self.backend}")
|
| 23 |
-
|
| 24 |
-
# Initialize Azure OCR processor
|
| 25 |
-
if self.backend == "azure":
|
| 26 |
-
from src.ocr.azure_ocr import get_azure_ocr_processor
|
| 27 |
-
self.processor = get_azure_ocr_processor()
|
| 28 |
-
else:
|
| 29 |
-
raise ValueError(f"Unsupported OCR backend: {self.backend}. Only 'azure' is supported.")
|
| 30 |
-
|
| 31 |
-
def process_pdf(self, pdf_file: bytes, filename: str = None) -> List[Dict[str, any]]:
|
| 32 |
-
"""
|
| 33 |
-
Process PDF file and extract text
|
| 34 |
-
|
| 35 |
-
Args:
|
| 36 |
-
pdf_file: PDF file as bytes
|
| 37 |
-
filename: Optional filename for logging
|
| 38 |
-
|
| 39 |
-
Returns:
|
| 40 |
-
List of dicts with page_number and MD_text
|
| 41 |
-
"""
|
| 42 |
-
logger.info(f"Processing PDF: {filename or 'unnamed'} ({len(pdf_file)} bytes)")
|
| 43 |
-
|
| 44 |
-
try:
|
| 45 |
-
result = self.processor.process_pdf(pdf_file)
|
| 46 |
-
logger.info(f"Successfully processed {len(result)} pages")
|
| 47 |
-
return result
|
| 48 |
-
except Exception as e:
|
| 49 |
-
logger.error(f"Error processing PDF: {e}")
|
| 50 |
-
raise
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
# Singleton instance
|
| 54 |
-
_ocr_processor = None
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def get_ocr_processor() -> OCRProcessor:
|
| 58 |
-
"""Get or create OCR processor instance"""
|
| 59 |
-
global _ocr_processor
|
| 60 |
-
if _ocr_processor is None:
|
| 61 |
-
_ocr_processor = OCRProcessor()
|
| 62 |
-
return _ocr_processor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/vectordb/__init__.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
"""Vector database factory and interface"""
|
| 2 |
-
|
| 3 |
-
from src.config import settings
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def get_vector_store():
|
| 7 |
-
"""Factory function to get the configured vector store"""
|
| 8 |
-
if settings.vector_db_type == "pinecone":
|
| 9 |
-
from src.vectordb.pinecone_store import get_vector_store as get_pinecone_store
|
| 10 |
-
return get_pinecone_store()
|
| 11 |
-
else: # Default to chroma
|
| 12 |
-
from src.vectordb.chroma_store import get_vector_store as get_chroma_store
|
| 13 |
-
return get_chroma_store()
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
__all__ = ["get_vector_store"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/vectordb/chroma_store.py
DELETED
|
@@ -1,150 +0,0 @@
|
|
| 1 |
-
"""ChromaDB vector store for document embeddings"""
|
| 2 |
-
|
| 3 |
-
from typing import List, Dict, Optional
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
import chromadb
|
| 6 |
-
from chromadb.config import Settings
|
| 7 |
-
from sentence_transformers import SentenceTransformer
|
| 8 |
-
from loguru import logger
|
| 9 |
-
|
| 10 |
-
from src.config import settings as app_settings
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class ChromaVectorStore:
|
| 14 |
-
"""Vector store using ChromaDB"""
|
| 15 |
-
|
| 16 |
-
def __init__(self, collection_name: str = "socar_documents"):
|
| 17 |
-
"""
|
| 18 |
-
Initialize ChromaDB vector store
|
| 19 |
-
|
| 20 |
-
Args:
|
| 21 |
-
collection_name: Name of the collection to use
|
| 22 |
-
"""
|
| 23 |
-
# Initialize ChromaDB client
|
| 24 |
-
self.db_path = app_settings.vector_db_path
|
| 25 |
-
self.db_path.mkdir(parents=True, exist_ok=True)
|
| 26 |
-
|
| 27 |
-
self.client = chromadb.PersistentClient(
|
| 28 |
-
path=str(self.db_path),
|
| 29 |
-
settings=Settings(
|
| 30 |
-
anonymized_telemetry=False,
|
| 31 |
-
allow_reset=True,
|
| 32 |
-
),
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
# Initialize embedding model
|
| 36 |
-
logger.info("Loading embedding model...")
|
| 37 |
-
self.embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 38 |
-
logger.info("Embedding model loaded")
|
| 39 |
-
|
| 40 |
-
# Get or create collection
|
| 41 |
-
self.collection = self.client.get_or_create_collection(
|
| 42 |
-
name=collection_name,
|
| 43 |
-
metadata={"description": "SOCAR historical documents"},
|
| 44 |
-
)
|
| 45 |
-
|
| 46 |
-
logger.info(f"ChromaDB initialized with collection: {collection_name}")
|
| 47 |
-
logger.info(f"Collection contains {self.collection.count()} documents")
|
| 48 |
-
|
| 49 |
-
def add_documents(
|
| 50 |
-
self,
|
| 51 |
-
texts: List[str],
|
| 52 |
-
metadatas: List[Dict],
|
| 53 |
-
ids: Optional[List[str]] = None,
|
| 54 |
-
):
|
| 55 |
-
"""
|
| 56 |
-
Add documents to the vector store
|
| 57 |
-
|
| 58 |
-
Args:
|
| 59 |
-
texts: List of text chunks to add
|
| 60 |
-
metadatas: List of metadata dicts (pdf_name, page_number, etc.)
|
| 61 |
-
ids: Optional list of document IDs
|
| 62 |
-
"""
|
| 63 |
-
if not texts:
|
| 64 |
-
logger.warning("No texts provided to add")
|
| 65 |
-
return
|
| 66 |
-
|
| 67 |
-
# Generate IDs if not provided
|
| 68 |
-
if ids is None:
|
| 69 |
-
ids = [f"doc_{i}" for i in range(len(texts))]
|
| 70 |
-
|
| 71 |
-
logger.info(f"Adding {len(texts)} documents to vector store")
|
| 72 |
-
|
| 73 |
-
# Generate embeddings
|
| 74 |
-
embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
|
| 75 |
-
|
| 76 |
-
# Add to ChromaDB
|
| 77 |
-
self.collection.add(
|
| 78 |
-
documents=texts,
|
| 79 |
-
embeddings=embeddings.tolist(),
|
| 80 |
-
metadatas=metadatas,
|
| 81 |
-
ids=ids,
|
| 82 |
-
)
|
| 83 |
-
|
| 84 |
-
logger.info(f"Successfully added {len(texts)} documents")
|
| 85 |
-
|
| 86 |
-
def search(
|
| 87 |
-
self,
|
| 88 |
-
query: str,
|
| 89 |
-
n_results: int = 5,
|
| 90 |
-
filter_metadata: Optional[Dict] = None,
|
| 91 |
-
) -> Dict:
|
| 92 |
-
"""
|
| 93 |
-
Search for similar documents
|
| 94 |
-
|
| 95 |
-
Args:
|
| 96 |
-
query: Search query
|
| 97 |
-
n_results: Number of results to return
|
| 98 |
-
filter_metadata: Optional metadata filter
|
| 99 |
-
|
| 100 |
-
Returns:
|
| 101 |
-
Dict with documents, metadatas, and distances
|
| 102 |
-
"""
|
| 103 |
-
logger.info(f"Searching for: {query[:100]}...")
|
| 104 |
-
|
| 105 |
-
# Generate query embedding
|
| 106 |
-
query_embedding = self.embedding_model.encode([query])[0]
|
| 107 |
-
|
| 108 |
-
# Search ChromaDB
|
| 109 |
-
results = self.collection.query(
|
| 110 |
-
query_embeddings=[query_embedding.tolist()],
|
| 111 |
-
n_results=n_results,
|
| 112 |
-
where=filter_metadata,
|
| 113 |
-
)
|
| 114 |
-
|
| 115 |
-
logger.info(f"Found {len(results['documents'][0])} results")
|
| 116 |
-
|
| 117 |
-
return {
|
| 118 |
-
"documents": results["documents"][0],
|
| 119 |
-
"metadatas": results["metadatas"][0],
|
| 120 |
-
"distances": results["distances"][0],
|
| 121 |
-
}
|
| 122 |
-
|
| 123 |
-
def clear(self):
|
| 124 |
-
"""Clear all documents from the collection"""
|
| 125 |
-
logger.warning("Clearing all documents from collection")
|
| 126 |
-
self.client.delete_collection(self.collection.name)
|
| 127 |
-
self.collection = self.client.create_collection(
|
| 128 |
-
name=self.collection.name,
|
| 129 |
-
metadata={"description": "SOCAR historical documents"},
|
| 130 |
-
)
|
| 131 |
-
|
| 132 |
-
def get_stats(self) -> Dict:
|
| 133 |
-
"""Get collection statistics"""
|
| 134 |
-
return {
|
| 135 |
-
"total_documents": self.collection.count(),
|
| 136 |
-
"collection_name": self.collection.name,
|
| 137 |
-
"db_path": str(self.db_path),
|
| 138 |
-
}
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
# Singleton instance
|
| 142 |
-
_vector_store = None
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
def get_vector_store() -> ChromaVectorStore:
|
| 146 |
-
"""Get or create vector store instance"""
|
| 147 |
-
global _vector_store
|
| 148 |
-
if _vector_store is None:
|
| 149 |
-
_vector_store = ChromaVectorStore()
|
| 150 |
-
return _vector_store
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/vectordb/pinecone_store.py
DELETED
|
@@ -1,176 +0,0 @@
|
|
| 1 |
-
"""Pinecone vector store for document embeddings"""
|
| 2 |
-
|
| 3 |
-
from typing import List, Dict, Optional
|
| 4 |
-
from pinecone import Pinecone, ServerlessSpec
|
| 5 |
-
from sentence_transformers import SentenceTransformer
|
| 6 |
-
from loguru import logger
|
| 7 |
-
import time
|
| 8 |
-
|
| 9 |
-
from src.config import settings as app_settings
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
class PineconeVectorStore:
|
| 13 |
-
"""Vector store using Pinecone"""
|
| 14 |
-
|
| 15 |
-
def __init__(self, index_name: str = None):
|
| 16 |
-
"""
|
| 17 |
-
Initialize Pinecone vector store
|
| 18 |
-
|
| 19 |
-
Args:
|
| 20 |
-
index_name: Name of the Pinecone index to use
|
| 21 |
-
"""
|
| 22 |
-
# Initialize Pinecone client
|
| 23 |
-
self.pc = Pinecone(api_key=app_settings.pinecone_api_key)
|
| 24 |
-
self.index_name = index_name or app_settings.pinecone_index_name
|
| 25 |
-
|
| 26 |
-
# Initialize embedding model (matches Pinecone index: 1024 dimensions)
|
| 27 |
-
logger.info("Loading embedding model...")
|
| 28 |
-
self.embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
|
| 29 |
-
self.embedding_dimension = 1024 # bge-large-en-v1.5 dimension (matches Pinecone)
|
| 30 |
-
logger.info("Embedding model loaded")
|
| 31 |
-
|
| 32 |
-
# Get or create index
|
| 33 |
-
self._ensure_index_exists()
|
| 34 |
-
self.index = self.pc.Index(self.index_name)
|
| 35 |
-
|
| 36 |
-
logger.info(f"Pinecone initialized with index: {self.index_name}")
|
| 37 |
-
logger.info(f"Index stats: {self.index.describe_index_stats()}")
|
| 38 |
-
|
| 39 |
-
def _ensure_index_exists(self):
|
| 40 |
-
"""Verify index exists"""
|
| 41 |
-
existing_indexes = [idx.name for idx in self.pc.list_indexes()]
|
| 42 |
-
|
| 43 |
-
if self.index_name not in existing_indexes:
|
| 44 |
-
logger.error(f"Pinecone index '{self.index_name}' not found!")
|
| 45 |
-
logger.error(f"Available indexes: {existing_indexes}")
|
| 46 |
-
raise ValueError(
|
| 47 |
-
f"Pinecone index '{self.index_name}' does not exist. "
|
| 48 |
-
f"Please create it first or check PINECONE_INDEX_NAME in .env"
|
| 49 |
-
)
|
| 50 |
-
logger.info(f"Connected to existing Pinecone index: {self.index_name}")
|
| 51 |
-
|
| 52 |
-
def add_documents(
|
| 53 |
-
self,
|
| 54 |
-
texts: List[str],
|
| 55 |
-
metadatas: List[Dict],
|
| 56 |
-
ids: Optional[List[str]] = None,
|
| 57 |
-
):
|
| 58 |
-
"""
|
| 59 |
-
Add documents to the vector store
|
| 60 |
-
|
| 61 |
-
Args:
|
| 62 |
-
texts: List of text chunks to add
|
| 63 |
-
metadatas: List of metadata dicts (pdf_name, page_number, etc.)
|
| 64 |
-
ids: Optional list of document IDs
|
| 65 |
-
"""
|
| 66 |
-
if not texts:
|
| 67 |
-
logger.warning("No texts provided to add")
|
| 68 |
-
return
|
| 69 |
-
|
| 70 |
-
# Generate IDs if not provided
|
| 71 |
-
if ids is None:
|
| 72 |
-
ids = [f"doc_{i}_{int(time.time())}" for i in range(len(texts))]
|
| 73 |
-
|
| 74 |
-
logger.info(f"Adding {len(texts)} documents to Pinecone")
|
| 75 |
-
|
| 76 |
-
# Generate embeddings
|
| 77 |
-
embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
|
| 78 |
-
|
| 79 |
-
# Prepare vectors for upsert
|
| 80 |
-
vectors = []
|
| 81 |
-
for i, (doc_id, embedding, text, metadata) in enumerate(zip(ids, embeddings, texts, metadatas)):
|
| 82 |
-
vectors.append({
|
| 83 |
-
"id": doc_id,
|
| 84 |
-
"values": embedding.tolist(),
|
| 85 |
-
"metadata": {
|
| 86 |
-
**metadata,
|
| 87 |
-
"text": text[:1000] # Store first 1000 chars in metadata
|
| 88 |
-
}
|
| 89 |
-
})
|
| 90 |
-
|
| 91 |
-
# Upsert in batches of 100
|
| 92 |
-
batch_size = 100
|
| 93 |
-
for i in range(0, len(vectors), batch_size):
|
| 94 |
-
batch = vectors[i:i + batch_size]
|
| 95 |
-
self.index.upsert(vectors=batch)
|
| 96 |
-
logger.info(f"Upserted batch {i//batch_size + 1}/{(len(vectors)-1)//batch_size + 1}")
|
| 97 |
-
|
| 98 |
-
logger.info(f"Successfully added {len(texts)} documents to Pinecone")
|
| 99 |
-
|
| 100 |
-
def search(
|
| 101 |
-
self,
|
| 102 |
-
query: str,
|
| 103 |
-
n_results: int = 5,
|
| 104 |
-
filter_metadata: Optional[Dict] = None,
|
| 105 |
-
) -> Dict:
|
| 106 |
-
"""
|
| 107 |
-
Search for similar documents
|
| 108 |
-
|
| 109 |
-
Args:
|
| 110 |
-
query: Search query
|
| 111 |
-
n_results: Number of results to return
|
| 112 |
-
filter_metadata: Optional metadata filter
|
| 113 |
-
|
| 114 |
-
Returns:
|
| 115 |
-
Dict with documents, metadatas, and distances
|
| 116 |
-
"""
|
| 117 |
-
logger.info(f"Searching Pinecone for: {query[:100]}...")
|
| 118 |
-
|
| 119 |
-
# Generate query embedding
|
| 120 |
-
query_embedding = self.embedding_model.encode([query])[0]
|
| 121 |
-
|
| 122 |
-
# Search Pinecone
|
| 123 |
-
results = self.index.query(
|
| 124 |
-
vector=query_embedding.tolist(),
|
| 125 |
-
top_k=n_results,
|
| 126 |
-
include_metadata=True,
|
| 127 |
-
filter=filter_metadata
|
| 128 |
-
)
|
| 129 |
-
|
| 130 |
-
# Extract results
|
| 131 |
-
documents = []
|
| 132 |
-
metadatas = []
|
| 133 |
-
distances = []
|
| 134 |
-
|
| 135 |
-
for match in results['matches']:
|
| 136 |
-
documents.append(match['metadata'].get('text', ''))
|
| 137 |
-
# Remove 'text' from metadata as it's already in documents
|
| 138 |
-
metadata = {k: v for k, v in match['metadata'].items() if k != 'text'}
|
| 139 |
-
metadatas.append(metadata)
|
| 140 |
-
distances.append(1 - match['score']) # Convert similarity to distance
|
| 141 |
-
|
| 142 |
-
logger.info(f"Found {len(documents)} results")
|
| 143 |
-
|
| 144 |
-
return {
|
| 145 |
-
"documents": documents,
|
| 146 |
-
"metadatas": metadatas,
|
| 147 |
-
"distances": distances,
|
| 148 |
-
}
|
| 149 |
-
|
| 150 |
-
def clear(self):
|
| 151 |
-
"""Clear all documents from the index"""
|
| 152 |
-
logger.warning("Deleting and recreating Pinecone index")
|
| 153 |
-
self.pc.delete_index(self.index_name)
|
| 154 |
-
self._ensure_index_exists()
|
| 155 |
-
self.index = self.pc.Index(self.index_name)
|
| 156 |
-
|
| 157 |
-
def get_stats(self) -> Dict:
|
| 158 |
-
"""Get index statistics"""
|
| 159 |
-
stats = self.index.describe_index_stats()
|
| 160 |
-
return {
|
| 161 |
-
"total_documents": stats.get('total_vector_count', 0),
|
| 162 |
-
"index_name": self.index_name,
|
| 163 |
-
"dimension": self.embedding_dimension,
|
| 164 |
-
}
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
# Singleton instance
|
| 168 |
-
_vector_store = None
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
def get_vector_store() -> PineconeVectorStore:
|
| 172 |
-
"""Get or create Pinecone vector store instance"""
|
| 173 |
-
global _vector_store
|
| 174 |
-
if _vector_store is None:
|
| 175 |
-
_vector_store = PineconeVectorStore()
|
| 176 |
-
return _vector_store
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start.sh
DELETED
|
@@ -1,81 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
|
| 3 |
-
# SOCAR Document Processing - Quick Start Script
|
| 4 |
-
|
| 5 |
-
set -e
|
| 6 |
-
|
| 7 |
-
echo "=================================="
|
| 8 |
-
echo "SOCAR Document Processing System"
|
| 9 |
-
echo "=================================="
|
| 10 |
-
echo ""
|
| 11 |
-
|
| 12 |
-
# Check if .env exists
|
| 13 |
-
if [ ! -f .env ]; then
|
| 14 |
-
echo "❌ Error: .env file not found"
|
| 15 |
-
echo "Please create .env file with required credentials"
|
| 16 |
-
exit 1
|
| 17 |
-
fi
|
| 18 |
-
|
| 19 |
-
# Check if Docker is installed
|
| 20 |
-
if ! command -v docker &> /dev/null; then
|
| 21 |
-
echo "❌ Error: Docker is not installed"
|
| 22 |
-
echo "Please install Docker: https://docs.docker.com/get-docker/"
|
| 23 |
-
exit 1
|
| 24 |
-
fi
|
| 25 |
-
|
| 26 |
-
# Check if Docker Compose is installed
|
| 27 |
-
if ! command -v docker-compose &> /dev/null; then
|
| 28 |
-
echo "❌ Error: Docker Compose is not installed"
|
| 29 |
-
echo "Please install Docker Compose: https://docs.docker.com/compose/install/"
|
| 30 |
-
exit 1
|
| 31 |
-
fi
|
| 32 |
-
|
| 33 |
-
echo "✓ Prerequisites checked"
|
| 34 |
-
echo ""
|
| 35 |
-
|
| 36 |
-
# Create data directories
|
| 37 |
-
mkdir -p data/pdfs data/vector_db data/processed
|
| 38 |
-
echo "✓ Data directories created"
|
| 39 |
-
echo ""
|
| 40 |
-
|
| 41 |
-
# Build and start containers
|
| 42 |
-
echo "🔨 Building Docker image..."
|
| 43 |
-
docker-compose build
|
| 44 |
-
|
| 45 |
-
echo ""
|
| 46 |
-
echo "🚀 Starting containers..."
|
| 47 |
-
docker-compose up -d
|
| 48 |
-
|
| 49 |
-
echo ""
|
| 50 |
-
echo "⏳ Waiting for service to be ready..."
|
| 51 |
-
sleep 5
|
| 52 |
-
|
| 53 |
-
# Wait for health check
|
| 54 |
-
MAX_RETRIES=30
|
| 55 |
-
RETRY_COUNT=0
|
| 56 |
-
until curl -f http://localhost:8000/ &> /dev/null || [ $RETRY_COUNT -eq $MAX_RETRIES ]; do
|
| 57 |
-
echo " Waiting for API... ($RETRY_COUNT/$MAX_RETRIES)"
|
| 58 |
-
sleep 2
|
| 59 |
-
((RETRY_COUNT++))
|
| 60 |
-
done
|
| 61 |
-
|
| 62 |
-
if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then
|
| 63 |
-
echo ""
|
| 64 |
-
echo "❌ Failed to start service"
|
| 65 |
-
echo "Check logs with: docker-compose logs"
|
| 66 |
-
exit 1
|
| 67 |
-
fi
|
| 68 |
-
|
| 69 |
-
echo ""
|
| 70 |
-
echo "=================================="
|
| 71 |
-
echo "✅ SOCAR API is ready!"
|
| 72 |
-
echo "=================================="
|
| 73 |
-
echo ""
|
| 74 |
-
echo "📍 API URL: http://localhost:8000"
|
| 75 |
-
echo "📖 Documentation: http://localhost:8000/docs"
|
| 76 |
-
echo ""
|
| 77 |
-
echo "Useful commands:"
|
| 78 |
-
echo " • View logs: docker-compose logs -f"
|
| 79 |
-
echo " • Stop: docker-compose down"
|
| 80 |
-
echo " • Restart: docker-compose restart"
|
| 81 |
-
echo ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_complete_system.py
DELETED
|
@@ -1,128 +0,0 @@
|
|
| 1 |
-
"""Complete system test"""
|
| 2 |
-
|
| 3 |
-
import requests
|
| 4 |
-
import json
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
|
| 7 |
-
API_URL = "http://localhost:8000"
|
| 8 |
-
|
| 9 |
-
def test_health():
|
| 10 |
-
"""Test API health"""
|
| 11 |
-
print("=" * 60)
|
| 12 |
-
print("1. Testing API Health")
|
| 13 |
-
print("=" * 60)
|
| 14 |
-
response = requests.get(f"{API_URL}/")
|
| 15 |
-
print(f"Status: {response.status_code}")
|
| 16 |
-
print(json.dumps(response.json(), indent=2))
|
| 17 |
-
return response.status_code == 200
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def test_ocr():
|
| 21 |
-
"""Test OCR endpoint"""
|
| 22 |
-
print("\n" + "=" * 60)
|
| 23 |
-
print("2. Testing OCR Endpoint")
|
| 24 |
-
print("=" * 60)
|
| 25 |
-
|
| 26 |
-
pdf_path = Path("data/pdfs/document_00.pdf")
|
| 27 |
-
if not pdf_path.exists():
|
| 28 |
-
print("❌ PDF not found")
|
| 29 |
-
return False
|
| 30 |
-
|
| 31 |
-
with open(pdf_path, "rb") as f:
|
| 32 |
-
files = {"file": (pdf_path.name, f, "application/pdf")}
|
| 33 |
-
response = requests.post(f"{API_URL}/ocr", files=files)
|
| 34 |
-
|
| 35 |
-
if response.status_code == 200:
|
| 36 |
-
result = response.json()
|
| 37 |
-
print(f"✓ Successfully processed {len(result)} pages")
|
| 38 |
-
print(f" First page preview: {result[0]['MD_text'][:100]}...")
|
| 39 |
-
return True
|
| 40 |
-
else:
|
| 41 |
-
print(f"❌ Error: {response.status_code}")
|
| 42 |
-
return False
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
def test_llm():
|
| 46 |
-
"""Test LLM endpoint"""
|
| 47 |
-
print("\n" + "=" * 60)
|
| 48 |
-
print("3. Testing LLM Endpoint (RAG)")
|
| 49 |
-
print("=" * 60)
|
| 50 |
-
|
| 51 |
-
messages = [
|
| 52 |
-
{"role": "user", "content": "What geological formations are discussed?"}
|
| 53 |
-
]
|
| 54 |
-
|
| 55 |
-
response = requests.post(
|
| 56 |
-
f"{API_URL}/llm",
|
| 57 |
-
json=messages,
|
| 58 |
-
headers={"Content-Type": "application/json"}
|
| 59 |
-
)
|
| 60 |
-
|
| 61 |
-
if response.status_code == 200:
|
| 62 |
-
result = response.json()
|
| 63 |
-
print(f"✓ Generated answer with {len(result['sources'])} sources")
|
| 64 |
-
print(f"\nAnswer preview:")
|
| 65 |
-
print(result["answer"][:300] + "...")
|
| 66 |
-
print(f"\nSources:")
|
| 67 |
-
for i, src in enumerate(result["sources"][:3], 1):
|
| 68 |
-
print(f" [{i}] {src['pdf_name']} - Page {src['page_number']}")
|
| 69 |
-
return True
|
| 70 |
-
else:
|
| 71 |
-
print(f"❌ Error: {response.status_code}")
|
| 72 |
-
return False
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
def test_llm_with_history():
    """Test the /llm endpoint with a multi-turn chat history.

    Sends a prior user/assistant exchange plus a follow-up question to
    verify the endpoint handles conversational context.

    Returns:
        bool: True if the endpoint responded 200, False otherwise.
    """
    print("\n" + "=" * 60)
    print("4. Testing LLM with Chat History")
    print("=" * 60)

    messages = [
        {"role": "user", "content": "What is the South Caspian Basin?"},
        {"role": "assistant", "content": "The South Caspian Basin is a sedimentary basin..."},
        {"role": "user", "content": "Tell me more about its hydrocarbon potential."}
    ]

    response = requests.post(
        f"{API_URL}/llm",
        json=messages,
        headers={"Content-Type": "application/json"},
        # LLM generation can be slow; bound the wait instead of hanging forever
        timeout=120,
    )

    if response.status_code == 200:
        result = response.json()
        print("✓ Generated contextual answer with chat history")
        print(f" Answer length: {len(result['answer'])} characters")
        print(f" Sources: {len(result['sources'])} documents")
        return True
    else:
        print(f"❌ Error: {response.status_code}")
        return False
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
if __name__ == "__main__":
    # Run the full end-to-end smoke suite against the running API and
    # print a pass/fail summary.
    print("\n" + "🚀" * 30)
    print("SOCAR Document Processing System - Complete Test")
    print("🚀" * 30 + "\n")

    results = []
    results.append(("Health Check", test_health()))
    results.append(("OCR Endpoint", test_ocr()))
    results.append(("LLM Endpoint", test_llm()))
    results.append(("LLM Chat History", test_llm_with_history()))

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    for name, passed in results:
        status = "✓ PASS" if passed else "❌ FAIL"
        print(f"{status:10} - {name}")

    all_passed = all(r[1] for r in results)
    print("\n" + ("🎉" if all_passed else "❌") * 30)
    if all_passed:
        print("ALL TESTS PASSED - System Ready for Hackathon!")
    else:
        print("Some tests failed - please review")
    print(("🎉" if all_passed else "❌") * 30 + "\n")

    # Fix: the script previously always exited 0, so shells and CI could
    # not detect failures.  Propagate a non-zero exit code on failure.
    raise SystemExit(0 if all_passed else 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|