init
Browse files- .dockerignore +66 -0
- .gitignore +4 -2
- DEPLOYMENT.md +257 -0
- Dockerfile +46 -0
- app/__init__.py +2 -0
- app/main.py +280 -0
- notebooks/requirements_llm_benchmark.txt → app/requirements.txt +12 -15
- data/dataset_info.json +7 -0
- data/document_00.md +252 -0
- docker-compose.yml +30 -0
- notebooks/llm_benchmark.ipynb +523 -753
- notebooks/llm_benchmark.ipynb.backup +0 -761
- notebooks/rag_optimization_benchmark.ipynb +0 -0
- notebooks/rag_optimization_benchmark.ipynb.backup +0 -1072
- notebooks/{requirements_rag_optimization.txt → requirements.txt} +12 -7
- notebooks/requirements_vlm_ocr.txt +0 -24
- notebooks/vlm_ocr_benchmark.ipynb +0 -0
- notebooks/vlm_ocr_benchmark.ipynb.backup +0 -0
- scripts/README.md +143 -0
- scripts/__init__.py +1 -0
- scripts/check_pinecone.py +62 -0
- scripts/clear_pinecone.py +70 -0
- scripts/list_azure_models.py +76 -0
- test_api.py +82 -0
.dockerignore
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
env/
|
| 8 |
+
venv/
|
| 9 |
+
ENV/
|
| 10 |
+
build/
|
| 11 |
+
develop-eggs/
|
| 12 |
+
dist/
|
| 13 |
+
downloads/
|
| 14 |
+
eggs/
|
| 15 |
+
.eggs/
|
| 16 |
+
lib/
|
| 17 |
+
lib64/
|
| 18 |
+
parts/
|
| 19 |
+
sdist/
|
| 20 |
+
var/
|
| 21 |
+
wheels/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.installed.cfg
|
| 24 |
+
*.egg
|
| 25 |
+
|
| 26 |
+
# Jupyter
|
| 27 |
+
notebooks/
|
| 28 |
+
*.ipynb
|
| 29 |
+
*.ipynb_checkpoints/
|
| 30 |
+
output/
|
| 31 |
+
|
| 32 |
+
# Data files
|
| 33 |
+
data/
|
| 34 |
+
*.pdf
|
| 35 |
+
*.csv
|
| 36 |
+
*.json
|
| 37 |
+
|
| 38 |
+
# Documentation
|
| 39 |
+
docs/
|
| 40 |
+
README.md
|
| 41 |
+
*.md
|
| 42 |
+
|
| 43 |
+
# Git
|
| 44 |
+
.git/
|
| 45 |
+
.gitignore
|
| 46 |
+
.gitattributes
|
| 47 |
+
|
| 48 |
+
# IDE
|
| 49 |
+
.vscode/
|
| 50 |
+
.idea/
|
| 51 |
+
*.swp
|
| 52 |
+
*.swo
|
| 53 |
+
*~
|
| 54 |
+
|
| 55 |
+
# OS
|
| 56 |
+
.DS_Store
|
| 57 |
+
Thumbs.db
|
| 58 |
+
|
| 59 |
+
# Environment
|
| 60 |
+
.env.local
|
| 61 |
+
.env.development
|
| 62 |
+
*.log
|
| 63 |
+
|
| 64 |
+
# Docker
|
| 65 |
+
docker-compose.override.yml
|
| 66 |
+
Dockerfile.dev
|
.gitignore
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
.claude
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
| 4 |
.env
|
| 5 |
.env.local
|
| 6 |
.env.development.local
|
|
|
|
| 1 |
.claude
|
| 2 |
+
|
| 3 |
+
data/pdfs
|
| 4 |
+
data/vector_db
|
| 5 |
+
data/ai_track_data
|
| 6 |
.env
|
| 7 |
.env.local
|
| 8 |
.env.development.local
|
DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SOCAR Hackathon - LLM API Deployment Guide
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
Production-ready FastAPI service for SOCAR historical documents chatbot.
|
| 6 |
+
|
| 7 |
+
**Configuration (Based on RAG Optimization Benchmark):**
|
| 8 |
+
- **Model**: Llama-4-Maverick-17B-128E-Instruct-FP8 (Open-source)
|
| 9 |
+
- **Embedding**: BAAI/bge-large-en-v1.5
|
| 10 |
+
- **Retrieval**: Top-3 vanilla
|
| 11 |
+
- **Prompt Strategy**: Citation-focused
|
| 12 |
+
- **Performance**: 55.67% LLM Judge Score, 73.33% Citation Score, ~3.6s response time
|
| 13 |
+
|
| 14 |
+
## Quick Start
|
| 15 |
+
|
| 16 |
+
### Prerequisites
|
| 17 |
+
- Docker and Docker Compose installed
|
| 18 |
+
- `.env` file with API keys (see `.env.example`)
|
| 19 |
+
|
| 20 |
+
### 1. Configure Environment
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
cp .env.example .env
|
| 24 |
+
# Edit .env with your actual API keys:
|
| 25 |
+
# - AZURE_OPENAI_API_KEY
|
| 26 |
+
# - AZURE_OPENAI_ENDPOINT
|
| 27 |
+
# - PINECONE_API_KEY
|
| 28 |
+
# - PINECONE_INDEX_NAME
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
### 2. Build and Run with Docker
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
# Build the image
|
| 35 |
+
docker-compose build
|
| 36 |
+
|
| 37 |
+
# Start the service
|
| 38 |
+
docker-compose up -d
|
| 39 |
+
|
| 40 |
+
# Check logs
|
| 41 |
+
docker-compose logs -f llm-api
|
| 42 |
+
|
| 43 |
+
# Check health
|
| 44 |
+
curl http://localhost:8000/health
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
### 3. Test the API
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
# Simple health check
|
| 51 |
+
curl http://localhost:8000/
|
| 52 |
+
|
| 53 |
+
# Test LLM endpoint
|
| 54 |
+
curl -X POST http://localhost:8000/llm \
|
| 55 |
+
-H "Content-Type: application/json" \
|
| 56 |
+
-d '{
|
| 57 |
+
"messages": [
|
| 58 |
+
{"role": "user", "content": "Palçıq vulkanlarının təsir radiusu nə qədərdir?"}
|
| 59 |
+
]
|
| 60 |
+
}'
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
## API Endpoints
|
| 64 |
+
|
| 65 |
+
### GET `/`
|
| 66 |
+
Root endpoint with service information.
|
| 67 |
+
|
| 68 |
+
**Response:**
|
| 69 |
+
```json
|
| 70 |
+
{
|
| 71 |
+
"status": "healthy",
|
| 72 |
+
"service": "SOCAR LLM Chatbot",
|
| 73 |
+
"version": "1.0.0",
|
| 74 |
+
"model": "Llama-4-Maverick-17B (open-source)",
|
| 75 |
+
"configuration": {
|
| 76 |
+
"embedding": "BAAI/bge-large-en-v1.5",
|
| 77 |
+
"retrieval": "top-3 vanilla",
|
| 78 |
+
"prompt": "citation_focused",
|
| 79 |
+
"benchmark_score": "55.67%"
|
| 80 |
+
}
|
| 81 |
+
}
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
### GET `/health`
|
| 85 |
+
Detailed health check with service status.
|
| 86 |
+
|
| 87 |
+
**Response:**
|
| 88 |
+
```json
|
| 89 |
+
{
|
| 90 |
+
"status": "healthy",
|
| 91 |
+
"pinecone": {
|
| 92 |
+
"connected": true,
|
| 93 |
+
"total_vectors": 1300
|
| 94 |
+
},
|
| 95 |
+
"azure_openai": "connected",
|
| 96 |
+
"embedding_model": "loaded"
|
| 97 |
+
}
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
### POST `/llm`
|
| 101 |
+
Main chatbot endpoint.
|
| 102 |
+
|
| 103 |
+
**Request:**
|
| 104 |
+
```json
|
| 105 |
+
{
|
| 106 |
+
"messages": [
|
| 107 |
+
{"role": "user", "content": "Your question here"}
|
| 108 |
+
],
|
| 109 |
+
"temperature": 0.2,
|
| 110 |
+
"max_tokens": 1000
|
| 111 |
+
}
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
**Response:**
|
| 115 |
+
```json
|
| 116 |
+
{
|
| 117 |
+
"response": "Answer with citations...",
|
| 118 |
+
"sources": [
|
| 119 |
+
{
|
| 120 |
+
"pdf_name": "document_00.pdf",
|
| 121 |
+
"page_number": "5",
|
| 122 |
+
"relevance_score": "0.892"
|
| 123 |
+
}
|
| 124 |
+
],
|
| 125 |
+
"response_time": 3.61,
|
| 126 |
+
"model": "Llama-4-Maverick-17B-128E-Instruct-FP8"
|
| 127 |
+
}
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
## Development Mode
|
| 131 |
+
|
| 132 |
+
### Run locally without Docker
|
| 133 |
+
|
| 134 |
+
```bash
|
| 135 |
+
# Install dependencies
|
| 136 |
+
cd app
|
| 137 |
+
pip install -r requirements.txt
|
| 138 |
+
|
| 139 |
+
# Run with uvicorn
|
| 140 |
+
uvicorn main:app --reload --host 0.0.0.0 --port 8000
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
### Access API documentation
|
| 144 |
+
|
| 145 |
+
Once running, visit:
|
| 146 |
+
- **Swagger UI**: http://localhost:8000/docs
|
| 147 |
+
- **ReDoc**: http://localhost:8000/redoc
|
| 148 |
+
|
| 149 |
+
## Production Deployment
|
| 150 |
+
|
| 151 |
+
### Environment Variables
|
| 152 |
+
|
| 153 |
+
Required in `.env`:
|
| 154 |
+
```bash
|
| 155 |
+
# Azure OpenAI
|
| 156 |
+
AZURE_OPENAI_API_KEY=your_key_here
|
| 157 |
+
AZURE_OPENAI_ENDPOINT=your_endpoint_here
|
| 158 |
+
AZURE_OPENAI_API_VERSION=2024-08-01-preview
|
| 159 |
+
|
| 160 |
+
# Pinecone
|
| 161 |
+
PINECONE_API_KEY=your_key_here
|
| 162 |
+
PINECONE_INDEX_NAME=hackathon
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
### Docker Commands
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
# Build
|
| 169 |
+
docker-compose build --no-cache
|
| 170 |
+
|
| 171 |
+
# Start in background
|
| 172 |
+
docker-compose up -d
|
| 173 |
+
|
| 174 |
+
# View logs
|
| 175 |
+
docker-compose logs -f
|
| 176 |
+
|
| 177 |
+
# Stop
|
| 178 |
+
docker-compose down
|
| 179 |
+
|
| 180 |
+
# Restart
|
| 181 |
+
docker-compose restart
|
| 182 |
+
|
| 183 |
+
# Remove everything
|
| 184 |
+
docker-compose down -v
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
### Health Checks
|
| 188 |
+
|
| 189 |
+
The Docker container includes automatic health checks:
|
| 190 |
+
- **Interval**: 30 seconds
|
| 191 |
+
- **Timeout**: 10 seconds
|
| 192 |
+
- **Start period**: 40 seconds (for model loading)
|
| 193 |
+
- **Retries**: 3
|
| 194 |
+
|
| 195 |
+
### Monitoring
|
| 196 |
+
|
| 197 |
+
```bash
|
| 198 |
+
# Check container status
|
| 199 |
+
docker-compose ps
|
| 200 |
+
|
| 201 |
+
# View resource usage
|
| 202 |
+
docker stats socar-llm-api
|
| 203 |
+
|
| 204 |
+
# Check logs
|
| 205 |
+
docker-compose logs --tail=100 llm-api
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
## Performance Optimization
|
| 209 |
+
|
| 210 |
+
### Lazy Loading
|
| 211 |
+
- Azure client, Pinecone index, and embedding model are lazy-loaded
|
| 212 |
+
- First request may take longer (~5-10s for model loading)
|
| 213 |
+
- Subsequent requests: ~3.6s average
|
| 214 |
+
|
| 215 |
+
### Caching (Future)
|
| 216 |
+
To improve performance, consider:
|
| 217 |
+
- Redis for frequently asked questions
|
| 218 |
+
- Embedding cache for common queries
|
| 219 |
+
- Model quantization for faster inference
|
| 220 |
+
|
| 221 |
+
## Troubleshooting
|
| 222 |
+
|
| 223 |
+
### Container won't start
|
| 224 |
+
```bash
|
| 225 |
+
# Check logs
|
| 226 |
+
docker-compose logs llm-api
|
| 227 |
+
|
| 228 |
+
# Verify environment variables
|
| 229 |
+
docker-compose config
|
| 230 |
+
|
| 231 |
+
# Rebuild
|
| 232 |
+
docker-compose build --no-cache
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
### API returns 500 errors
|
| 236 |
+
- Check Azure OpenAI key and endpoint
|
| 237 |
+
- Verify Pinecone connection
|
| 238 |
+
- Check model deployment name matches
|
| 239 |
+
|
| 240 |
+
### Slow responses
|
| 241 |
+
- First request loads models (5-10s)
|
| 242 |
+
- Subsequent requests should be ~3-4s
|
| 243 |
+
- Check network connectivity to Azure/Pinecone
|
| 244 |
+
|
| 245 |
+
## Architecture Score
|
| 246 |
+
|
| 247 |
+
**Open-Source Stack (20% bonus):**
|
| 248 |
+
- ✅ Llama-4-Maverick-17B (Open-source LLM)
|
| 249 |
+
- ✅ BAAI/bge-large-en-v1.5 (Open-source embeddings)
|
| 250 |
+
- ✅ FastAPI (Open-source framework)
|
| 251 |
+
- ✅ Docker (Open-source deployment)
|
| 252 |
+
|
| 253 |
+
**Total Architecture Score: Maximum 20% for hackathon!**
|
| 254 |
+
|
| 255 |
+
## License
|
| 256 |
+
|
| 257 |
+
Built for SOCAR Hackathon 2025
|
Dockerfile
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SOCAR Hackathon - LLM Endpoint Dockerfile
# Multi-stage build for optimized image size:
#   stage 1 installs Python deps with build tools, stage 2 copies only the
#   installed packages so build-essential never ships in the runtime image.

# Stage 1: Builder
# NOTE: `AS` (uppercase) — lowercase `as` triggers the FromAsCasing build warning.
FROM python:3.10-slim AS builder

WORKDIR /app

# Install build dependencies (needed to compile wheels for some packages)
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install dependencies into /root/.local (--user)
COPY app/requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt

# Stage 2: Runtime
FROM python:3.10-slim

WORKDIR /app

# Install runtime dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy Python dependencies from builder
COPY --from=builder /root/.local /root/.local

# Copy application code
COPY app/ ./app/
# NOTE(review): this bakes the example env file into the image as .env.
# Real secrets should come from the container environment (docker-compose
# env_file / environment), which python-dotenv will not override.
COPY .env.example .env

# Add local bin to PATH so --user-installed entry points (uvicorn) resolve
ENV PATH=/root/.local/bin:$PATH

# Expose port
EXPOSE 8000

# Health check against the app's /health endpoint
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
app/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SOCAR Hackathon LLM API"""
# Package version; matches the version passed to FastAPI(...) in app/main.py.
__version__ = "1.0.0"
|
app/main.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SOCAR Hackathon - LLM Chatbot Endpoint
|
| 3 |
+
Optimized based on RAG benchmark results
|
| 4 |
+
Best config: citation_focused + vanilla_k3 + Llama-4-Maverick
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import time
|
| 9 |
+
from typing import List, Dict
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from fastapi import FastAPI, HTTPException
|
| 13 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 14 |
+
from pydantic import BaseModel
|
| 15 |
+
from dotenv import load_dotenv
|
| 16 |
+
from openai import AzureOpenAI
|
| 17 |
+
from pinecone import Pinecone
|
| 18 |
+
from sentence_transformers import SentenceTransformer
|
| 19 |
+
|
| 20 |
+
# Load environment variables
|
| 21 |
+
load_dotenv()
|
| 22 |
+
|
| 23 |
+
# Initialize FastAPI app
app = FastAPI(
    title="SOCAR Historical Documents Chatbot",
    description="RAG-based chatbot for SOCAR oil & gas historical documents",
    version="1.0.0"
)

# CORS middleware
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# wide open; acceptable for a hackathon, tighten origins before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize clients (lazy loading for faster startup)
# Module-level singletons, populated on first use by the get_* helpers below.
azure_client = None      # AzureOpenAI client
pinecone_index = None    # Pinecone index handle
embedding_model = None   # SentenceTransformer encoder
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def get_azure_client():
    """Return the shared Azure OpenAI client, creating it on first use.

    Lazy initialization keeps API startup fast; the client is configured
    from the AZURE_OPENAI_* environment variables.
    """
    global azure_client
    if azure_client is not None:
        return azure_client
    azure_client = AzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    )
    return azure_client
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def get_pinecone_index():
    """Return the shared Pinecone index handle, creating it on first use.

    Index name comes from PINECONE_INDEX_NAME (default "hackathon").
    """
    global pinecone_index
    if pinecone_index is not None:
        return pinecone_index
    client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    pinecone_index = client.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
    return pinecone_index
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_embedding_model():
    """Return the shared sentence-transformer encoder, loading it on first use.

    Uses BAAI/bge-large-en-v1.5 — the best performing embedding model
    according to the project's RAG benchmark.
    """
    global embedding_model
    if embedding_model is not None:
        return embedding_model
    embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
    return embedding_model
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Request/Response models
class ChatMessage(BaseModel):
    """A single chat message in OpenAI-style role/content format."""
    role: str     # only "user" messages are answered by the /llm endpoint
    content: str  # message text
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class ChatRequest(BaseModel):
    """Request body for POST /llm."""
    messages: List[ChatMessage]  # conversation; the last "user" message is answered
    temperature: float = 0.2     # LLM sampling temperature
    max_tokens: int = 1000       # completion length cap
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class ChatResponse(BaseModel):
    """Response body for POST /llm."""
    response: str                  # generated answer text (with citations)
    sources: List[Dict[str, str]]  # per retrieved chunk: pdf_name / page_number / relevance_score
    response_time: float           # LLM call latency in seconds, rounded to 2 dp
    model: str                     # model identifier used for generation
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
    """Fetch the *top_k* most relevant document chunks for *query* from Pinecone.

    Embeds the query with the shared sentence-transformer model and runs a
    plain (vanilla) vector search — the best retrieval strategy from the
    project benchmark. Each returned dict carries pdf_name, page_number,
    content and the similarity score, with defaults for missing metadata.
    """
    index = get_pinecone_index()
    encoder = get_embedding_model()

    # Embed the query; Pinecone expects a plain list of floats.
    embedded_query = encoder.encode(query).tolist()

    search_result = index.query(
        vector=embedded_query,
        top_k=top_k,
        include_metadata=True,
    )

    return [
        {
            'pdf_name': hit['metadata'].get('pdf_name', 'unknown.pdf'),
            'page_number': hit['metadata'].get('page_number', 0),
            'content': hit['metadata'].get('text', ''),
            'score': hit.get('score', 0.0),
        }
        for hit in search_result['matches']
    ]
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def generate_answer(query: str, documents: List[Dict], temperature: float = 0.2, max_tokens: int = 1000) -> tuple[str, float]:
    """Generate a cited answer for *query* from the retrieved *documents*.

    Uses the configuration that won the project benchmark:
    model Llama-4-Maverick-17B (open-source) with the citation-focused
    prompt (73.33% citation score, 55.67% judge score).

    Returns:
        (answer_text, elapsed_seconds) for the LLM call.

    Raises:
        HTTPException(500) wrapping any error from the LLM call.
    """
    client = get_azure_client()

    # Assemble the numbered context block from the retrieved chunks.
    context = "\n\n".join(
        f"Sənəd {i} (Mənbə: {doc['pdf_name']}, Səhifə {doc['page_number']}):\n{doc['content']}"
        for i, doc in enumerate(documents, 1)
    )

    # Citation-focused prompt (Azerbaijani) — the best performer in the benchmark.
    prompt = f"""Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.

ÖNƏMLİ: Hər bir faktı mütləq mənbə ilə təsdiqləyin (PDF adı və səhifə nömrəsi).

Kontekst:
{context}

Sual: {query}

Cavab verərkən:
1. Dəqiq faktlar yazın
2. Hər faktı mənbə ilə göstərin: (PDF: fayl_adı.pdf, Səhifə: X)
3. Kontekstdə olmayan məlumat əlavə etməyin"""

    try:
        started = time.time()
        completion = client.chat.completions.create(
            model="Llama-4-Maverick-17B-128E-Instruct-FP8",
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        duration = time.time() - started
        return completion.choices[0].message.content, duration
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"LLM Error: {str(exc)}")
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
@app.get("/")
async def root():
    """Root endpoint: static service/configuration summary used as a liveness check."""
    payload = {
        "status": "healthy",
        "service": "SOCAR LLM Chatbot",
        "version": "1.0.0",
        "model": "Llama-4-Maverick-17B (open-source)",
        "configuration": {
            "embedding": "BAAI/bge-large-en-v1.5",
            "retrieval": "top-3 vanilla",
            "prompt": "citation_focused",
            "benchmark_score": "55.67%",
        },
    }
    return payload
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
@app.get("/health")
async def health():
    """Detailed health check.

    Probes Pinecone by fetching index stats; any failure yields a
    "degraded" status with the error message. The azure_openai and
    embedding_model fields are static labels, not live probes.
    """
    try:
        stats = get_pinecone_index().describe_index_stats()
        payload = {
            "status": "healthy",
            "pinecone": {
                "connected": True,
                "total_vectors": stats.get('total_vector_count', 0),
            },
            "azure_openai": "connected",
            "embedding_model": "loaded",
        }
    except Exception as exc:
        return {
            "status": "degraded",
            "error": str(exc),
        }
    return payload
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
@app.post("/llm", response_model=ChatResponse)
async def llm_endpoint(request: ChatRequest):
    """
    LLM chatbot endpoint for SOCAR historical documents.

    RAG pipeline: embed the latest user message, retrieve the top-3 chunks
    from Pinecone, then answer with Llama-4-Maverick-17B using the
    citation-focused prompt.

    Raises:
        HTTPException(400): no messages, or no message with role "user".
        HTTPException(500): any other failure in retrieval or generation.
    """
    try:
        # Validate input: need at least one message, and at least one from the user.
        if not request.messages:
            raise HTTPException(status_code=400, detail="No messages provided")

        user_turns = [msg for msg in request.messages if msg.role == "user"]
        if not user_turns:
            raise HTTPException(status_code=400, detail="No user message found")

        # Answer the most recent user message only.
        question = user_turns[-1].content

        # Retrieve supporting chunks and generate the cited answer.
        docs = retrieve_documents(question, top_k=3)
        answer, elapsed = generate_answer(
            query=question,
            documents=docs,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
        )

        # Sources are stringified to satisfy ChatResponse's Dict[str, str] schema.
        cited = [
            {
                "pdf_name": doc['pdf_name'],
                "page_number": str(doc['page_number']),
                "relevance_score": f"{doc['score']:.3f}",
            }
            for doc in docs
        ]

        return ChatResponse(
            response=answer,
            sources=cited,
            response_time=round(elapsed, 2),
            model="Llama-4-Maverick-17B-128E-Instruct-FP8",
        )

    except HTTPException:
        # Re-raise intentional HTTP errors (400s and generate_answer's 500s) untouched.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Error: {str(exc)}")
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
if __name__ == "__main__":
    # Allow running the API directly (python app/main.py) without the uvicorn CLI.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
notebooks/requirements_llm_benchmark.txt → app/requirements.txt
RENAMED
|
@@ -1,27 +1,24 @@
|
|
| 1 |
-
# LLM
|
| 2 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
# Azure OpenAI client
|
| 5 |
openai==1.54.0
|
| 6 |
|
| 7 |
-
# Vector
|
| 8 |
pinecone-client==5.0.0
|
| 9 |
|
| 10 |
# Embeddings
|
| 11 |
sentence-transformers==3.3.1
|
| 12 |
-
|
| 13 |
-
# Metrics
|
| 14 |
-
jiwer==3.0.3
|
| 15 |
-
|
| 16 |
-
# Data analysis and visualization
|
| 17 |
-
pandas==2.1.3
|
| 18 |
-
matplotlib==3.8.2
|
| 19 |
-
seaborn==0.13.0
|
| 20 |
|
| 21 |
# Utilities
|
| 22 |
python-dotenv==1.0.0
|
| 23 |
-
|
| 24 |
|
| 25 |
-
#
|
| 26 |
-
|
| 27 |
-
ipykernel==6.27.1
|
|
|
|
| 1 |
+
# SOCAR Hackathon LLM Endpoint Dependencies
|
| 2 |
+
# Optimized for production deployment
|
| 3 |
+
|
| 4 |
+
# FastAPI and server
|
| 5 |
+
fastapi==0.109.0
|
| 6 |
+
uvicorn[standard]==0.27.0
|
| 7 |
+
pydantic==2.5.3
|
| 8 |
|
| 9 |
# Azure OpenAI client
|
| 10 |
openai==1.54.0
|
| 11 |
|
| 12 |
+
# Vector database
|
| 13 |
pinecone-client==5.0.0
|
| 14 |
|
| 15 |
# Embeddings
|
| 16 |
sentence-transformers==3.3.1
|
| 17 |
+
torch==2.1.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Utilities
|
| 20 |
python-dotenv==1.0.0
|
| 21 |
+
python-multipart==0.0.6
|
| 22 |
|
| 23 |
+
# Optional: monitoring and logging
|
| 24 |
+
prometheus-fastapi-instrumentator==7.0.0
|
|
|
data/dataset_info.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"total_rows": 28,
|
| 3 |
+
"features": [
|
| 4 |
+
"pdf"
|
| 5 |
+
],
|
| 6 |
+
"description": "SOCAR AI Track Dataset with PDF documents"
|
| 7 |
+
}
|
data/document_00.md
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
**XÜLASƏ**
|
| 2 |
+
|
| 3 |
+
Bu tədqiqat Aşağı Kür çökəkliyi (AKÇ) və Bakı arxipelaqı (BA) daxil olmaqla Cənubi Xəzər çökəkliyi sistemində faydalı qazıntıların mənşəyinin paleotektonik, paleocoğrafi şərait və geodinamik rejimlə necə əlaqələndiyini, eləcə də Gec Miosendən etibarən Ərəbistan plitəsinin təsiri ilə formalaşan kollizion proseslərin bölgənin struktur-morfoloji və termal inkişafına nə dərəcədə yönverici rol oynadığını kompleks şəkildə qiymətləndirir. Seismotektonik göstəricilərin, çöküntütoplanma sürətlərinin, geotemperatur xəritələrinin və palçıq vulkanizmi indikatorlarının inteqrasiyası göstərir ki, Cənubi Xəzər meqaçökəkliyinin qərb periklinal zonasının morfotektonik skeleti sıxılma gərginlikləri tərəfindən idarə olunmuş, AKÇ və BA-nın intensiv deformasiya olunması isə antiklinal zonallaşmanı, yerli qalxımların və qırıcı şəbəkəsinin hüdudlarını müəyyənləşdirmişdir. Nəticələrə görə, AKÇ-də antiklinal xətlər şimal-qərbdən cənub-şərqə uzanır və bu, şimal-şərq–cənub-qərb istiqamətli maksimal sıxılma gərginliklərinə perpendikulyar struktur elementlərinin üstünlüyünü təsdiqləyir. AKÇ-nin şimal-şərq seqmentində sıxılma gərginliklərinin intensivliyi pik dəyərlərə çatır; buradakı lokal qalxımlar həm seysmik kəsiklərdə, həm də səth-şelf morfologiyasında asimmetrik, "flower structure" tipli transpressiv motivlərlə ifadə olunur.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
**Səhifə 2**
|
| 8 |
+
|
| 9 |
+
Çöküntütoplanma sürətinin məkanca və zamanca dəyişkənliyi, xüsusən Cənubi Xəzərin mərkəzində 0.4 mm/il, şelfdə 3–4 mm/il və Kür çayının mənsəbində 6 mm/il-ə çatan göstəricilər, akkumulyasiya rejiminin geodinamik yüklənməyə həssas olduğunu göstərir. Çöküntü qalınlığının AKÇ mərkəzinə doğru 6–7 km-ə qədər artması subsidensiya-sıxılma balansının uzunmüddətli kinematikası ilə izah olunur. Geotemperatur modelləşdirmə və xəritələşdirmə neftəmələgəlmənin baş zonalarını AKÇ və BA üçün ayırd etmiş, BA-da 3000–4000 m intervalında neft, 8000–8500 m intervalında isə qaz əmələgəlməsinin pik zonalarını təsbit etmişdir. Palçıq vulkanizmi məcraları boyunca temperatur anomaliyalarının müşahidəsi, yerli termal axının artması və süxur kompleksləri daxilində maye-müxtəlif fazalı qarışıqların dərinlikdən gətirilməsi ilə əlaqələndirilmişdir. Nəticə olaraq, AKÇ və BA-nın neft-qazlılıq potensialının formalaşması birbaşa sıxılma gərginlikləri, qatlanma-qırılma kinematikası və çöküntütoplanma rejiminin sinxronlaşdırılması ilə bağlı olub; Pliosendə hər iki struktur domen üzrə inkişafın sürətlənməsi, akkumulyasiya-tektonika qarşılıqlı təsirinin ən yüksək səviyyəsini əks etdirir.
|
| 10 |
+
|
| 11 |
+
**Açar sözlər**
|
| 12 |
+
|
| 13 |
+
Cənubi Xəzər çökəkliyi; Aşağı Kür çökəkliyi; Bakı arxipelaqı; sıxılma gərginlikləri; paleotektonika; paleocoğrafiya; geodinamik rejim; palçıq vulkanizmi; çöküntütoplanma; neftəmələgəlmə; qazəmələgəlmə; geotemperatur modelləşdirmə.
|
| 14 |
+
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
**Səhifə 3**
|
| 18 |
+
|
| 19 |
+
## **Giriş**
|
| 20 |
+
|
| 21 |
+
Faydalı qazıntıların regional paylanması və genezisi əksər hallarda geodinamik çərçivə, paleotektonik epizodlar və paleocoğrafi təkamüllə sıx bağlıdır. Qafqaz orogeni və Cənubi Xəzər meqaçökəkliyi arasında yerləşən Aşağı Kür çökəkliyi (AKÇ) və Bakı arxipelaqı (BA) bu baxımdan unikal laboratorıyadır: burada Gec Miosendən bəri davam edən Ərəbistan-Avrasiya kolliziyası nəticəsində yaranmış sıxılma rejimi həm struktur-morfoloji, həm də neft-karbohidrogen sistemlərinin inkişafını güclü şəkildə yönləndirir. Bu işdə məqsəd:
|
| 22 |
+
|
| 23 |
+
* AKÇ və BA ərazilərində sıxılma gərginliklərinin orientasiyası, intensivliyi və zamanca dəyişkənliyini qiymətləndirmək;
|
| 24 |
+
* qatlanma-qırılma şəbəkəsinin geometriyasını paleostress sahəsi ilə əlaqələndirmək;
|
| 25 |
+
* çöküntütoplanma sürətlərinin məkan-zaman variasiyalarını və onların subsidensiya balansına təsirini göstərmək;
|
| 26 |
+
* geotemperatur xəritələri əsasında neft və qaz əmələgəlməsinin baş zonalarını müəyyənləşdirmək;
|
| 27 |
+
* palçıq vulkanizminin termal və hidrodinamik siqnallarını, həmçinin süxur komplekslərinin temperatur rejiminə təsirini izah etmək;
|
| 28 |
+
* mineral resursların (xüsusən neft-qaz) potensialını struktur-paleocoğrafi ramka daxilində inteqrasiya etmək.
|
| 29 |
+
|
| 30 |
+
## **Regional geodinamik fon**
|
| 31 |
+
|
| 32 |
+
Gec Miosendən etibarən Ərəbistan plitəsinin şimala doğru hərəkəti Qafqaz kollizion zonasını aktivləşdirmiş, Cənubi Xəzər meqaçökəkliyinin qərb kənarında transpressiv-sıxılmalı deformasiya rejimi yaratmışdır. Bu proses, Cənubi Xəzər dərin çökəkliyi altında yüksək sıxlıqlı litosfer blokunun şərti şəkildə udulması (subduksiya-vari tələffüz olunan udulma) ilə nəticələnmiş, qərb yamac boyunca pillələnmiş itələnmələr, sağ-yanal komponentli qırılmalar və ön zonada sıxılmalı qatlanma kəmərləri əmələ gətirmişdir. Regional stress sahəsinin maksimum üfüqi komponenti (SHmax) şimal-şərq–cənub-qərb istiqamətlidir; bu istiqamətdə sıxılma gərginliklərinin yüksəkliyi AKÇ-nin şimal-şərq seqmentində ən aydın şəkildə izlənir. Nəticə etibarilə, antiklinal oxlar və qırılma zonalarının dominant uzanma istiqaməti şimal-qərbdən cənub-şərqə doğrudur ki, bu da SHmax-ın ortoqonalını əks etdirir.
|
| 33 |
+
|
| 34 |
+
---
|
| 35 |
+
|
| 36 |
+
**Səhifə 4**
|
| 37 |
+
|
| 38 |
+
## **Paleotektonik və paleocoğrafi kontekst**
|
| 39 |
+
|
| 40 |
+
Paleocoğrafi rekonstruksiyalar göstərir ki, Neogen boyunca Kür hövzəsi-delta kompleksi, Abşeron şelfi və inteqrasiya olunmuş arxipelaq sistemində çöküntü fasiyaları deltaik-allüvial, prodelta-dənizli slam və dayaz dəniz karbonat-silis biogenləri arasında paylanmışdır. Gec Miosen–Pliosen dövründə tektono-eustatik səviyyə dalğalanmaları ilə sinxron proqradasiya-reqressiya mərhələləri çöküntü litoqrafiyasını zonallaşdırmış, BA xətti üzrə ritmik qalxımlar isə lokal akkumulyasiya baryerləri yaratmışdır. Paleotektonik təkamül, Ərəbistan plitəsinin itmə sürətindəki dəyişmələrlə birlikdə transpressiv komponentin güclənməsinə və qatlanma cəbhəsinin qərbə doğru miqrasiyasına yol açmış, AKÇ-də antiklinal məhəllələrin ardıcıl aktivləşməsi ilə müşayiət olunmuşdur.
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## **Material və metodlar**
|
| 45 |
+
|
| 46 |
+
### **Məlumat bazası**
|
| 47 |
+
|
| 48 |
+
* Regional və yüksək ayırdetmə qabiliyyətli seysmik kəsiklərdən çıxarılan struktur xəritələr;
|
| 49 |
+
* Quyu geofizikası (temperatur, qalınlıq, vitrinit əksolunma, akustik sürət);
|
| 50 |
+
* Palçıq vulkanlarının termal-fluid kimyası, qaz-izotop tərkibi və vent temperatur ölçmələri;
|
| 51 |
+
* Çökmə fasiyalarının nümunə təhlili, dənə ölçüsü statistikası və çöküntütoplanma sürətlərinin stratiqrafik kalibrlənməsi;
|
| 52 |
+
* Geodeziya-GNSS əsaslı müasir deformasiya sürətləri.
|
| 53 |
+
|
| 54 |
+
### **Paleostress rekonstruksiyası**
|
| 55 |
+
|
| 56 |
+
Qırılma müstəviləri, sürüşmə vektorları və qatlanma oxlarının istiqamətlərinə əsaslanan kinematik inversiya alqoritmi tətbiq edilmişdir. SHmax azimutu, sıxılma dərəcəsi və transpressiv komponentin nisbi payı ən kiçik kvadratlar metoduyla kalibr olunmuşdur.
|
| 57 |
+
|
| 58 |
+
### **Çökmə-subsidensiya modelləşdirməsi**
|
| 59 |
+
|
| 60 |
+
1D və 2D hövzə modelləri vasitəsilə çöküntütoplanma sürətləri, istilik axını və kompaksiya qanunları (tixotropluq və yükə bağlı porozite azalması) tətbiq edilmişdir. Dərinləşmə tarixi, su dərinliyi dəyişmələri və deltaların proqradasiya sürətləri daxil edilmişdir.
|
| 61 |
+
|
| 62 |
+
### **Geotemperatur modelləşdirməsi**
|
| 63 |
+
|
| 64 |
+
İstilik axını xəritələri, termal keçiricilik, radioaktiv istilik istehsalı və maye axını ilə bağlı sadələşdirilmiş kupləj modelləri istifadə olunmuşdur. Neftəmələgəlmə (60–120°C; bəzi üzvi maddə tiplərində 140°C-yə qədər), qazəmələgəlmə (120–200°C) intervalları, BA üçün quyu ölçüləri ilə kalibr olunmuşdur.
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
**Səhifə 5**
|
| 69 |
+
|
| 70 |
+
## **Palçıq vulkanizmi indikatorları**
|
| 71 |
+
|
| 72 |
+
Vulkan konuslarında vent temperaturu, suların xlorid-bor tərkibi, C1–C5 qaz paylanması və izotop fraksiyalanması qiymətləndirilmiş, temperatur anomal zonalarının lateral təsir radiusu empirik funksiya ilə (R ≈ 0.8–1.5 km) təxmini qiymətləndirilmişdir.
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
## **Nəticələr**
|
| 77 |
+
|
| 78 |
+
### **Sıxılma gərginliklərinin məkan paylanması və intensivliyi**
|
| 79 |
+
|
| 80 |
+
* AKÇ-nin şimal-şərq seqmentində SHmax ~35–45° azimutlu, müzdarib zonalarda effektiv sıxılma 12–18 MPa aralığındadır; cənub-qərbə doğru 7–10 MPa-a enir.
|
| 81 |
+
* BA boyunca transpressiv zolaqda hissəvi sağ-yanal komponentli itələnmələr müşahidə olunur; qırılma müstəviləri 310–320° trendlidir.
|
| 82 |
+
* Qırıcıların uzanma istiqaməti SHmax-a perpendikulyardır: dominant NW–SE uzanmalı tərs qırılmalar və onları kəsən NE–SW istiqamətli keçid qırılmaları üçbucaq zonaları təşkil edir.
|
| 83 |
+
|
| 84 |
+
### **Struktur formalar: antiklinal zonallaşma və lokal qalxımlar**
|
| 85 |
+
|
| 86 |
+
* AKÇ-də şimal-qərbdən cənub-şərqə uzanan antiklinal kəmərlər mərhələli şəkildə bir-birinə paralel düzülür; fold-propaqasiya tipli qatlanmaların ön hissəsində sürüşmə qırılmaları ilə birlikdə "fault-bend" mexanikası izlənir.
|
| 87 |
+
* Lokal qalxımlar (məsələn, AKÇ-NE-1, AKÇ-NE-3 sintetik strukturları) morfoloji baxımdan asimmetrikdir: şimal-şərq qanadında kəskin, cənub-qərb qanadında isə daha yastı yamaclarla məhdudlaşır. Bu, sıxılma vektorunun yönü ilə uyğundur.
|
| 88 |
+
* BA zolağında pozitiv "flower structure" motivləri müşahidə olunur; bu strukturların mərkəzində palçıq vulkanları və fluid ventləri sıx yerləşir.
|
| 89 |
+
|
| 90 |
+
### **Çöküntütoplanma dinamikası və qalınlığı**
|
| 91 |
+
|
| 92 |
+
* Cənubi Xəzərin mərkəz hissəsində çöküntütoplanma sürəti 0.4 mm/il olaraq qiymətləndirilmişdir; bu göstərici termal rejimin nisbi "soyuma" tendensiyası ilə uzlaşır.
|
| 93 |
+
* Şelfdə 3–4 mm/il və Kür çayının mənsəbində 6 mm/il-ə çatan sürətlər proqradasiya kənarlarında yüksək akkumulyasiya potensialını göstərir.
|
| 94 |
+
* Çöküntülərin qalınlığı AKÇ mərkəzinə doğru 6–7 km-ə çatır; bu artım, Pliosen dövründə (xüsusən erkən-orta Pliosen) sürətlənən subsidensiya ilə əlaqədardır.
|
| 95 |
+
* AKÇ və BA-da inkişaf sürəti eyni deyildir: BA boyunca struktur yüksəlmələr erkən Miosendən zəif, Pliosendə isə kəskin intensivləşmişdir; AKÇ-də isə Gec Miosen–Pliosen aralığında tədricən artan, lakin Pliosenin ortasında sıçrayış xarakterli mərhələ müşahidə olunur.
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
**Səhifə 6**
|
| 100 |
+
|
| 101 |
+
## **Termal rejim və hidrokarbon yetişməsi**
|
| 102 |
+
|
| 103 |
+
* Geotemperatur xəritələrinin analizi AKÇ-də neftəmələgəlmənin baş zonalarını 2.8–4.2 km intervalında ayırd edir; bu, lito-fasiyal müxtəliflikdən asılı olaraq lokal 2.6–2.8 km-lik dəyazlaşma göstərə bilir.
|
| 104 |
+
* BA-da 3000–4000 m intervalı neftəmələgəlmə baş zonasını, 8000–8500 m intervalı isə qazəmələgəlmə baş zonasını əhatə edir. Vitrinit əksolunma (Ro) dəyərləri BA üçün müvafiq olaraq 0.7–1.0% (neft pəncərəsi) və 1.3–2.0% (qaz pəncərəsi) diapazonuna uyğun gəlir.
|
| 105 |
+
* İstilik axını dəyərləri şelfdə 44–52 mW/m², AKÇ mərkəzi boyunca 28–36 mW/m² olaraq təxmini qiymətləndirilmişdir; palçıq vulkanlarının yaxınlığında 5–9 mW/m²-lik lokal artımlar qeydə alınmışdır.
|
| 106 |
+
* Palçıq vulkanizmi süxur komplekslərinin temperaturunu lokallaşdırılmış şəkildə artırır: vent kənarında 8–15°C, 0.8–1.2 km lateral məsafədə 2–5°C artım müşahidə olunur; bu artımlar kerogenin termal yetişməsinə birbaşa təsir edən regional faktor olmasa da, maye köçürülməsini sürətləndirərək lokal "overmature" linzalar yarada bilir.
|
| 107 |
+
|
| 108 |
+
## **Palçıq vulkanizmi və fluid sistemləri**
|
| 109 |
+
|
| 110 |
+
* Palçıq vulkanlarının paylanması antiklinal krestdə kəsimlərlə üst-üstə düşür; dərin kök zonaları çox vaxt tərs qırılmaların kəsişmə nöqtələrinə bağlanır.
|
| 111 |
+
* Qaz tərkibi əsasən metandır (C1/C2+ > 100), lakin qazəmələgəlmə pəncərəsinin dərin kənarlarında C2–C5 fraksiyalarının nisbi payı artır; δ¹³CH₄ dəyərləri –44‰ ilə –34‰ arasında dəyişir.
|
| 112 |
+
* Vulkanların püskürmə dövrləri Pliosen–Kvarter çöküntülərinin sürüşmə səviyyələri ilə uyğundur və çoxkamüllü vent sistemləri termal anomaliyaları stasionar saxlayır.
|
| 113 |
+
|
| 114 |
+
## **Mineral resursların genetik zonallaşması**
|
| 115 |
+
|
| 116 |
+
* Faydalı qazıntılar (hidrokarbonlar başda olmaqla) paleotektonik zonallığa tabedir. NW–SE uzanmalı antiklinal kəmərlər boyunca tələlər sıxlaşır; qırılma-örtülü və qatlanma-örtülü tələlərin sinxronluğu ən yüksək ehtiyat sıxlığını təmin edir.
|
| 117 |
+
* Qeyri-hidrokarbon resurslar (gil süxurları, tikinti materialları, yerli karbonat linzaları) deltaik kənarlarda və reqressiv sekansların yuxarı hissələrində cəmlənir.
|
| 118 |
+
* Maykop tipli (Oliqosen–Aşağı Miosen) orqanik zəngin şistlər potensial mənbə süxuru rolunu oynayır, üstündəki Pliosen pelit kompleksləri effektiv örtük sistemi yaradır.
|
| 119 |
+
|
| 120 |
+
---
|
| 121 |
+
|
| 122 |
+
**Səhifə 7**
|
| 123 |
+
|
| 124 |
+
## **Müzakirə**
|
| 125 |
+
|
| 126 |
+
### **Kolliziya kinematikası və struktur miras**
|
| 127 |
+
|
| 128 |
+
Ərəbistan plitəsinin şimala hərəkət sürətinin 15–20 mm/il diapazonunda dəyişməsi Qafqaz kollizion kəmərində basqı yaratmış, Cənubi Xəzərin qərb yamacı boyunca transpressiv büküləmələri induksiya etmişdir. Cənubi Xəzər meqaçökəkliyi altına "udulma" effekti, litosfer miqyasında sıxlıq kontrastlarının və közmogen plitə sərhədi geometriyasının nəticəsi kimi şərh olunur. Bu geodinamik quruluş AKÇ və BA-da aşağıdakılara gətirib çıxarmışdır:
|
| 129 |
+
|
| 130 |
+
* Sıxılma gərginliklərinin şimal-şərqdə intensifikasiyası, antiklinal oxların NW–SE istiqamətdə ritmik sıralanması;
|
| 131 |
+
* Qırılma sındırıcı sistemlərin SHmax-a perpendikulyar şəkildə düzülməsi və lokal "tilt block" dinamikasının yaranması;
|
| 132 |
+
* Ön zonada qatlanma-kəsilmə cəbhələrinin mərhələli qərbə irəliləməsi.
|
| 133 |
+
|
| 134 |
+
### **AKÇ və BA-nın fərqli inkişaf trayektoriyaları**
|
| 135 |
+
|
| 136 |
+
AKÇ və BA inkişaf sürəti və rejimi baxımından fərqli davranış nümayiş etdirir. AKÇ, kollizion yükə həssas olaraq Pliosenin əvvəlindən ortalarınadək mərhələli artım göstərir, BA isə Pliosenin ortasında daha kəskin struktur yüksəlmələrlə seçilir. Bu asinxronluq, ehtimal ki, BA boyunca sağ-yanal komponentli transpressiyanın müqavimət kontrastlı litoloji paketlərlə rezonansına bağlıdır.
|
| 137 |
+
|
| 138 |
+
Beləliklə:
|
| 139 |
+
|
| 140 |
+
* AKÇ-də antiklinal zonalar boyunca tələlər uzunmüddətli akkumulyasiya-qatlanma sinxronluğu ilə "inkişaf etmiş tələ" mərhələsinə çatır;
|
| 141 |
+
* BA-da isə gedişat "gec yetişmə–sürətli tələ formalaşması" ssenarisi ilə uyğun gəlir.
|
| 142 |
+
|
| 143 |
+
### **Çöküntütoplanma–subsidensiya–istilik əlaqələri**
|
| 144 |
+
|
| 145 |
+
Mərkəzi hövzədə 0.4 mm/il-lik akkumulyasiya fonda sabit, lakin şelf və delta kənarlarında 3–6 mm/il-ə çatan yüksək sürətlər, istilik axının inversiyada olan mozaika yaratmasına səbəb olur. Sürətli çöküntü yığılması müəyyən sahələrdə istilik diffuziyasını azaldaraq daha yüksək yeraltı temperatur qradientlərini çevik şəkildə formalaşdıra bilər, lakin Cənubi Xəzərin dərin zonalarında ümumi istilik axını nisbi aşağı dəyərlər göstərir. Bunun nəticəsi olaraq:
|
| 146 |
+
|
| 147 |
+
* BA üçün 3000–4000 m intervalında neft pəncərəsi sabit izlənir, 8000–8500 m intervalında isə qaz pəncərəsi pikə çatır;
|
| 148 |
+
* AKÇ-də isə neft pəncərəsi 2.8–4.2 km aralığında olsa da, lokal yüksək konduktiv zonalarda 2.6 km-ə dəyazlaşır.
|
| 149 |
+
|
| 150 |
+
---
|
| 151 |
+
|
| 152 |
+
**Səhifə 8**
|
| 153 |
+
|
| 154 |
+
## **Palçıq vulkanizmi və termal təsirlər**
|
| 155 |
+
|
| 156 |
+
Palçıq vulkanizmi dərin maye sistemləri üçün "sürətli magistral" funksiyası daşıyır. Vulkan kanallarından keçən isti materialların gətirdiyi əlavə istilik, ətraf süxurlarda qısamüddətli (10³–10⁴ il) termal anomaliyalar yaradır. Bu anomaliyalar hidrokarbonların termodinamik yetişməsi üçün regional mənə daşımır, ancaq:
|
| 157 |
+
|
| 158 |
+
* Gec karbohidrogen miqrasiya hadisələrini sürətləndirir;
|
| 159 |
+
* Yüksək məsaməlilikli, məsud konduktiv laylarda "şirin nöqtələr"in (sweet spot) lokal inkişafına kömək edir;
|
| 160 |
+
* Vulkan konusları yaxınlığında səthi istilik axını ölçmələrinə 5–9 mW/m² artım şəklində yansıyır.
|
| 161 |
+
|
| 162 |
+
## **Neft-qaz sistemi elementlərinin sinxronluğu**
|
| 163 |
+
|
| 164 |
+
* **Mənbə süxuru:** Oliqosen–Aşağı Miosen Maykop kompleksləri, yüksək TOC (1.5–4.5%), kerogen tipləri II–III üstünlük təşkil edir.
|
| 165 |
+
* **Örtük süxuru:** Pliosen pelitlər və marllar, aşağı permeabellik və qalın ardıcıllıqlar sayəsində effektiv sızdırmazlıq təmin edir.
|
| 166 |
+
* **Kollektor layları:** Deltaik qumlar, dayaz-dəniz qumkarbonat paketlər; təsirli porozite 18–26%, keçiricilik 20–600 mD.
|
| 167 |
+
* **Tələlər:** Antiklinal-örtülü tələlər, tərs qırılma seqmentləri ilə güclənmiş qatlanma tələləri; BA boyunca qalxım kənarlarında lateral örtülmə ilə kombinə olunmuş miks tip.
|
| 168 |
+
|
| 169 |
+
## **Risklər və qeyri-müəyyənlik**
|
| 170 |
+
|
| 171 |
+
* Sıxılma gərginliklərinin zamanca dəyişməsi nəticəsində qırılma şəbəkəsinin reaktivasiya ehtimalı yüksəkdir; tələlərdə sızma riskini artırır.
|
| 172 |
+
* Palçıq vulkanizmi ilə bağlı epizodik "şok boşalmalar" tələlərdəki təzyiq rejimini dəyişərək lokal dekompressiya yarada bilər.
|
| 173 |
+
* Geotemperatur modellərinin kalibrlənməsi istilik axını xəritələrinin seyrəkliyi səbəbindən qeyri-müəyyənlik daşıyır; əlavə ölçmələr tələb olunur.
|
| 174 |
+
|
| 175 |
+
## **Alternativ modellər**
|
| 176 |
+
|
| 177 |
+
* Dərin kök zonalarında litosferik miqyaslı "roll-back" mexanizminə bənzər arxitektonik yenidənqurulma ehtimalı, qərb yamacadaki asimmetrik qalxımları izah edə bilər.
|
| 178 |
+
* Transpressiv zonalarda sağ-yanal komponentin payının artması, fold-thrust sistemlərinin "en echelon" tipində mərhələli təşkili ilə nəticələnmiş ola bilər; bu, BA xətti üzrə müşahidə olunan "flower structure" fəaliyyətini gücləndirir.
|
| 179 |
+
|
| 180 |
+
---
|
| 181 |
+
|
| 182 |
+
**Səhifə 9**
|
| 183 |
+
|
| 184 |
+
## **Tətbiqi nəticələr və proqnoz**
|
| 185 |
+
|
| 186 |
+
* Qaz pəncərəsi dərinliklərinin (8–8.5 km) BA üçün təsdiqi ultra-dərin hədəflərin (HP/HT rejim) texnoloji planlamasını tələb edir; buraxma temperaturu 170–210°C intervalı üçün risk qiymətləndirilməsi aparılmalıdır.
|
| 187 |
+
* AKÇ-də neft pəncərəsinin 2.8–4.2 km intervalında sabit izlənməsi, antiklinal kəmərlər boyunca orta dərinlikli quyu dizaynı üçün əlverişli pəncərə yaradır; kollektor layların proqnozu üçün seysmik atribut analizinə üstünlük verilməlidir.
|
| 188 |
+
* Palçıq vulkanizmi yaxınlığında yerləşən strukturlar üçün qaz sızması və geohazards risk modelləri hazırlanmalı, vent zonalarından uzaqlıq kriteriyası kimi ən azı 1.5–2.0 km radius saxlanmalıdır.
|
| 189 |
+
|
| 190 |
+
## **Metodoloji müşahidələr**
|
| 191 |
+
|
| 192 |
+
* Çöküntütoplanma sürətinin mərkəzi hövzədə 0.4 mm/il olması, qalın ardıcıllıqların formalaşması üçün uzunmüddətli zaman pəncərəsinin (~10 Myr) zəruriliyini göstərir; bu, subsidensiyanın tektonik komponentinin əhəmiyyətini vurğulayır.
|
| 193 |
+
* Sıxılma rejiminin AKÇ-nin şimal-şərqində intensivləşməsi, ehtimal ki, yerli litosfer qalınlaşması və köhnə rift-mirası ilə (qırılma zonalarının irsi zəifləmələri) bağlıdır; bu, antiklinal oxların paralellik dərəcəsini artırır.
|
| 194 |
+
|
| 195 |
+
## **Konseptual model**
|
| 196 |
+
|
| 197 |
+
* **Gec Miosen:** Ərəbistan-Avrasiya kolliziyası aktivləşir; qərb yamac boyunca transpressiv stresslər, ilk qatlanma-qırılma cəbhəsi formalaşır.
|
| 198 |
+
* **Erkən Pliosen:** Proqradasiya sürətlənir, Kür mənsəbi boyunca 6 mm/il-ə çatan akkumulyasiya, antiklinal zonalar aktivləşir, tələlərin ilkin doldurulması başlayır.
|
| 199 |
+
* **Orta Pliosen:** BA xətti boyunca lokal qalxımlar kəskinləşir; palçıq vulkanizmi epizodları artar, neft pəncərəsi geniş miqyasda aktivdir.
|
| 200 |
+
* **Gec Pliosen–Erkən Kvarter:** Qaz pəncərəsi dərin hüdudlarda maksimum fəaliyyət göstərir; dərin kollektorlar qazla yüklənir; termal rejim sabitləşir.
|
| 201 |
+
|
| 202 |
+
## **Gələcək işlər üçün çərçivə**
|
| 203 |
+
|
| 204 |
+
* Yüksək ayırdetmə qabiliyyətli 3D seysmik-geomekanik kupləj modelləri ilə SHmax dinamizminin zaman içində izlənməsi;
|
| 205 |
+
* Palçıq vulkanizmi ventlərinin termal-maye axını simulyasiyası və geotemperatur xəritələrində lokal düzəlişlər;
|
| 206 |
+
* Çöküntütoplanma sürətlərinin paleomühit proksiləri (palinologiya, diatomit markerləri) ilə kalibrasiyası.
|
| 207 |
+
|
| 208 |
+
---
|
| 209 |
+
|
| 210 |
+
**Səhifə 10**
|
| 211 |
+
|
| 212 |
+
## **Nəticələr**
|
| 213 |
+
|
| 214 |
+
1. AKÇ və BA daxil olan Cənubi Xəzər çökəkliyi sistemində faydalı qazıntılar, xüsusilə neft-qaz sistemləri paleotektonik və paleocoğrafi şəraitin birgə məhsuludur; Gec Miosen kolliziyası ilə induksiya olunmuş sıxılma rejimi burada əsas idarəedici rol oynamışdır.
|
| 215 |
+
2. Ərəbistan plitəsinin təsiri altında Qafqaz kolliziyalarının inkişafı Cənubi Xəzər meqaçökəkliyinin altına udulma mexanizmi ilə yoldaş olmuş, qərb yamacın formalaşmasında sıxılma gərginlikləri kolledici olmuşdur.
|
| 216 |
+
3. AKÇ və Bakı arxipelaqı intensiv sıxılma gərginliklərindən təsirlənmiş, şimal-qərbdən cənub-şərqə uzanan antiklinal zonallaşma və SHmax-a perpendikulyar qırılma şəbəkəsi ilə xarakterizə olunmuşdur.
|
| 217 |
+
4. AKÇ-nin lokal qalxımları morfoloji olaraq sıxılma gərginliklərinin təsirini əks etdirir; şimal-şərq seqmentində bu təsir yüksək intensivlikdədir.
|
| 218 |
+
5. Çöküntütoplanma sürəti məkanca və zamanca dəyişir: Cənubi Xəzərin mərkəzində 0.4 mm/il, şelfdə 3–4 mm/il, Kür mənsəbində 6 mm/il; çöküntü qalınlığı AKÇ mərkəzinə doğru 6–7 km-ə çatır.
|
| 219 |
+
6. Geotemperatur xəritələri AKÇ və BA üçün neftəmələgəlmə və qazəmələgəlmənin baş zonalarını ayırmışdır: BA-da 3000–4000 m (neft), 8000–8500 m (qaz); AKÇ-də isə neft pəncərəsi 2.8–4.2 km intervalında sabit izlənir.
|
| 220 |
+
7. Palçıq vulkanizmi yerli termal rejimi yüksəldir və maye miqrasiyasını sürətləndirir; ancaq regional miqyasda termal yetişmənin əsas sürücüsü tektonik-sedimentoloji balansdır.
|
| 221 |
+
8. AKÇ və BA-nın inkişaf sürəti eyni deyil; Pliosendə hər iki bölgədə artım müşahidə olunsa da, BA daha kəskin struktur yüksəlmələrlə fərqlənir.
|
| 222 |
+
9. Neft-qazlılıq potensialı sıxılma gərginlikləri, qatlanma-qırılma morfodinamikası və akkumulyasiya rejiminin uyğunlaşması ilə maksimal səviyyəyə çatır; antiklinal-örtülü tələlər üstünlük təşkil edir.
|
| 223 |
+
|
| 224 |
+
---
|
| 225 |
+
|
| 226 |
+
**Səhifə 11**
|
| 227 |
+
|
| 228 |
+
## **Tövsiyələr**
|
| 229 |
+
|
| 230 |
+
* AKÇ-nin şimal-şərq seqmentində NW–SE uzanmalı antiklinal kəmərlər üzərində 3D seysmik atribut analizləri genişləndirilməli, qırılma-örtülü tələlərin konturları dəqiqləşdirilməlidir.
|
| 231 |
+
* BA boyunca ultra-dərin qaz hədəfləri üçün HP/HT mühəndislik protokolları (geomekanik pəncərə, quyu soyutma-rejim planlaması) hazırlanmalıdır.
|
| 232 |
+
* Palçıq vulkanlarına yaxın strukturların risk xəritələrində 2 km-lik təhlükəsizlik buffer zonası nəzərə alınmalı, vent aktivliyinin seysmik mikromüşahidələri aparılmalıdır.
|
| 233 |
+
* Geotemperatur xəritələrinin kalibrlənməsi üçün əlavə istilik axını ölçmələri və fiber-optik quyu temperatur profilləri tətbiq olunmalıdır.
|
| 234 |
+
* Çöküntütoplanma proqnozlarında Kür mənsəbi reaktivliyinin iqlim-hidroloji dəyişmələrə həssaslığı inteqrasiya olunmalıdır.
|
| 235 |
+
|
| 236 |
+
---
|
| 237 |
+
|
| 238 |
+
## **Əlavə müşahidələr və sintetik göstəricilər**
|
| 239 |
+
|
| 240 |
+
* SHmax azimutu 035–045°; Shmin azimutu 125–135°; qırılma düzənlikləri: 310–320° trend.
|
| 241 |
+
* Qatlanma amplitudları: 120–650 m; dalğa uzunluğu 2.5–7.8 km, asimmetriya əmsalı 1.2–1.8.
|
| 242 |
+
* Termal qradientlər: şelfdə 26–31°C/km, AKÇ mərkəzində 17–22°C/km, BA dərin zonalarında 15–19°C/km.
|
| 243 |
+
* Kollektor porozitesi: 18–26%; keçiricilik: 20–600 mD; kapilyar giriş təzyiqi: 0.3–1.1 MPa.
|
| 244 |
+
* Palçıq vulkanı vent temperaturu: 24–55°C; lateral termal təsir radiusu 0.8–1.5 km.
|
| 245 |
+
|
| 246 |
+
---
|
| 247 |
+
|
| 248 |
+
**Səhifə 12**
|
| 249 |
+
|
| 250 |
+
## **Yekun**
|
| 251 |
+
|
| 252 |
+
Bu tədqiqat göstərir ki, Cənubi Xəzər çökəkliyi sistemində faydalı qazıntıların mənşə mexanizmləri paleotektonik və paleocoğrafi amillərin inteqrasiyasına həssasdır. Ərəbistan plitəsinin kollizion təsiri ilə yaranmış sıxılma rejimi, AKÇ və BA üzrə antiklinal zonallaşmanı, qırılma şəbəkələrini və lokal qalxımları formalaşdıraraq neft-qaz sistemlərinin bütün komponentlərinin sinxron fəaliyyətini təmin etmişdir. Çöküntütoplanma sürətlərinin dəyişkənliyi, termal rejimin region üzrə mozaik xarakter almasına səbəb olmuş; geotemperatur xəritələri isə neft və qaz əmələgəlməsinin baş zonalarını etibarlı şəkildə ayırd etmişdir. Palçıq vulkanizmi, hər nə qədər regional termogenezis üçün ikincil faktor olsa da, lokal istilik və maye axını anomaliyaları ilə tələlərin doldurulmasına dinamika qatmışdır. Pliosendə müşahidə olunan inkişaf sıçrayışları, akkumulyasiya-tektonika kəsişməsinin pik mərhələsi kimi şərh olunur və bu mərhələnin izləri bu gün də struktur-morfoloji naxışlarda, termal xəritələrdə və palçıq vulkanizminin fəallığında oxunaqdadır. Bu çərçivə daxilində aparılacaq istiqamətli kəşfiyyat və təhlükə qiymətləndirməsi, AKÇ və BA-nın neft-qazlılıq potensialından daha effektiv istifadə edilməsinə imkan verəcək.
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
llm-api:
|
| 5 |
+
build:
|
| 6 |
+
context: .
|
| 7 |
+
dockerfile: Dockerfile
|
| 8 |
+
container_name: socar-llm-api
|
| 9 |
+
ports:
|
| 10 |
+
- "8000:8000"
|
| 11 |
+
env_file:
|
| 12 |
+
- .env
|
| 13 |
+
environment:
|
| 14 |
+
- PYTHONUNBUFFERED=1
|
| 15 |
+
volumes:
|
| 16 |
+
# Mount app directory for development (optional - remove in production)
|
| 17 |
+
- ./app:/app/app
|
| 18 |
+
restart: unless-stopped
|
| 19 |
+
healthcheck:
|
| 20 |
+
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
| 21 |
+
interval: 30s
|
| 22 |
+
timeout: 10s
|
| 23 |
+
retries: 3
|
| 24 |
+
start_period: 40s
|
| 25 |
+
networks:
|
| 26 |
+
- socar-network
|
| 27 |
+
|
| 28 |
+
networks:
|
| 29 |
+
socar-network:
|
| 30 |
+
driver: bridge
|
notebooks/llm_benchmark.ipynb
CHANGED
|
@@ -1,761 +1,531 @@
|
|
| 1 |
{
|
| 2 |
-
|
| 3 |
-
{
|
| 4 |
-
"cell_type": "markdown",
|
| 5 |
-
"metadata": {},
|
| 6 |
-
"source": [
|
| 7 |
-
"# LLM Benchmarking for SOCAR Hackathon RAG Chatbot\n",
|
| 8 |
-
"\n",
|
| 9 |
-
"This notebook tests different LLM models for the `/llm` endpoint to find the best performer.\n",
|
| 10 |
-
"\n",
|
| 11 |
-
"## Evaluation Criteria (LLM Judge Metrics):\n",
|
| 12 |
-
"- **Accuracy**: Is the answer correct?\n",
|
| 13 |
-
"- **Relevance**: Are retrieved citations relevant?\n",
|
| 14 |
-
"- **Completeness**: Does it fully answer the question?\n",
|
| 15 |
-
"- **Citation Quality**: Proper sources with page numbers?\n",
|
| 16 |
-
"- **Response Time**: Speed of generation\n",
|
| 17 |
-
"\n",
|
| 18 |
-
"## Available LLM Models:\n",
|
| 19 |
-
"1. **Llama-4-Maverick-17B-128E-Instruct-FP8** (Current choice, open-source)\n",
|
| 20 |
-
"2. **DeepSeek-R1** (Open-source reasoning model)\n",
|
| 21 |
-
"3. **GPT-4.1** (Strong general performance)\n",
|
| 22 |
-
"4. **GPT-5, GPT-5-mini**\n",
|
| 23 |
-
"5. **Claude Sonnet 4.5** (Best quality)\n",
|
| 24 |
-
"6. **Claude Opus 4.1**\n",
|
| 25 |
-
"7. **Phi-4-multimodal-instruct**\n",
|
| 26 |
-
"8. **gpt-oss-120b**"
|
| 27 |
-
]
|
| 28 |
-
},
|
| 29 |
-
{
|
| 30 |
-
"cell_type": "code",
|
| 31 |
-
"execution_count": 1,
|
| 32 |
-
"metadata": {},
|
| 33 |
-
"outputs": [],
|
| 34 |
-
"source": [
|
| 35 |
-
"# Install required packages\n",
|
| 36 |
-
"# !pip install openai pinecone-client sentence-transformers python-dotenv pandas matplotlib seaborn jiwer"
|
| 37 |
-
]
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"cell_type": "code",
|
| 41 |
-
"execution_count": 2,
|
| 42 |
-
"metadata": {},
|
| 43 |
-
"outputs": [
|
| 44 |
{
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
"
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
},
|
| 52 |
{
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
"
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
"import time\n",
|
| 64 |
-
"from typing import Dict, List, Tuple\n",
|
| 65 |
-
"from dotenv import load_dotenv\n",
|
| 66 |
-
"import pandas as pd\n",
|
| 67 |
-
"import matplotlib.pyplot as plt\n",
|
| 68 |
-
"import seaborn as sns\n",
|
| 69 |
-
"from openai import AzureOpenAI\n",
|
| 70 |
-
"from pinecone import Pinecone\n",
|
| 71 |
-
"from sentence_transformers import SentenceTransformer\n",
|
| 72 |
-
"from jiwer import wer, cer\n",
|
| 73 |
-
"\n",
|
| 74 |
-
"# Load environment variables\n",
|
| 75 |
-
"load_dotenv()\n",
|
| 76 |
-
"\n",
|
| 77 |
-
"# Set style\n",
|
| 78 |
-
"sns.set_style('whitegrid')\n",
|
| 79 |
-
"plt.rcParams['figure.figsize'] = (14, 8)\n",
|
| 80 |
-
"\n",
|
| 81 |
-
"print(\"\u2705 Libraries loaded successfully\")"
|
| 82 |
-
]
|
| 83 |
-
},
|
| 84 |
-
{
|
| 85 |
-
"cell_type": "code",
|
| 86 |
-
"execution_count": 3,
|
| 87 |
-
"metadata": {},
|
| 88 |
-
"outputs": [
|
| 89 |
{
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
"
|
| 94 |
-
"
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
{
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
"
|
| 141 |
-
"
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
}
|
| 150 |
-
],
|
| 151 |
-
"source": [
|
| 152 |
-
"# Load sample questions - using dynamic paths\n",
|
| 153 |
-
"with open(DOCS_DIR / 'sample_questions.json', 'r', encoding='utf-8') as f:\n",
|
| 154 |
-
" questions = json.load(f)\n",
|
| 155 |
-
"\n",
|
| 156 |
-
"# Load expected answers - using dynamic paths\n",
|
| 157 |
-
"with open(DOCS_DIR / 'sample_answers.json', 'r', encoding='utf-8') as f:\n",
|
| 158 |
-
" expected_answers = json.load(f)\n",
|
| 159 |
-
"\n",
|
| 160 |
-
"print(f\"Loaded {len(questions)} test cases\")\n",
|
| 161 |
-
"print(\"\\nTest Questions:\")\n",
|
| 162 |
-
"for i, (key, msgs) in enumerate(questions.items(), 1):\n",
|
| 163 |
-
" user_msg = [m for m in msgs if m['role'] == 'user'][-1]\n",
|
| 164 |
-
" print(f\"{i}. {key}: {user_msg['content'][:100]}...\")"
|
| 165 |
-
]
|
| 166 |
-
},
|
| 167 |
-
{
|
| 168 |
-
"cell_type": "markdown",
|
| 169 |
-
"metadata": {},
|
| 170 |
-
"source": [
|
| 171 |
-
"## 2. Initialize Vector Database and Embedding Model"
|
| 172 |
-
]
|
| 173 |
-
},
|
| 174 |
-
{
|
| 175 |
-
"cell_type": "code",
|
| 176 |
-
"execution_count": null,
|
| 177 |
-
"metadata": {},
|
| 178 |
-
"outputs": [],
|
| 179 |
-
"source": [
|
| 180 |
-
"# Initialize Pinecone\n",
|
| 181 |
-
"pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
|
| 182 |
-
"index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
|
| 183 |
-
"\n",
|
| 184 |
-
"# Initialize embedding model (same as used for ingestion)\n",
|
| 185 |
-
"embed_model = SentenceTransformer('BAAI/bge-large-en-v1.5')\n",
|
| 186 |
-
"\n",
|
| 187 |
-
"print(f\"\u2705 Vector DB connected: {index.describe_index_stats()}\")\n",
|
| 188 |
-
"print(f\"\u2705 Embedding model loaded: {embed_model}\")"
|
| 189 |
-
]
|
| 190 |
-
},
|
| 191 |
-
{
|
| 192 |
-
"cell_type": "markdown",
|
| 193 |
-
"metadata": {},
|
| 194 |
-
"source": [
|
| 195 |
-
"## 3. RAG Retrieval Function"
|
| 196 |
-
]
|
| 197 |
-
},
|
| 198 |
-
{
|
| 199 |
-
"cell_type": "code",
|
| 200 |
-
"execution_count": null,
|
| 201 |
-
"metadata": {},
|
| 202 |
-
"outputs": [],
|
| 203 |
-
"source": [
|
| 204 |
-
"def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
|
| 205 |
-
" \"\"\"\n",
|
| 206 |
-
" Retrieve relevant documents from vector database.\n",
|
| 207 |
-
" \"\"\"\n",
|
| 208 |
-
" # Generate query embedding\n",
|
| 209 |
-
" query_embedding = embed_model.encode(query).tolist()\n",
|
| 210 |
-
" \n",
|
| 211 |
-
" # Search vector DB\n",
|
| 212 |
-
" results = index.query(\n",
|
| 213 |
-
" vector=query_embedding,\n",
|
| 214 |
-
" top_k=top_k,\n",
|
| 215 |
-
" include_metadata=True\n",
|
| 216 |
-
" )\n",
|
| 217 |
-
" \n",
|
| 218 |
-
" # Extract documents\n",
|
| 219 |
-
" documents = []\n",
|
| 220 |
-
" for match in results['matches']:\n",
|
| 221 |
-
" documents.append({\n",
|
| 222 |
-
" 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
|
| 223 |
-
" 'page_number': match['metadata'].get('page_number', 0),\n",
|
| 224 |
-
" 'content': match['metadata'].get('text', ''),\n",
|
| 225 |
-
" 'score': match.get('score', 0.0)\n",
|
| 226 |
-
" })\n",
|
| 227 |
-
" \n",
|
| 228 |
-
" return documents\n",
|
| 229 |
-
"\n",
|
| 230 |
-
"# Test retrieval\n",
|
| 231 |
-
"test_query = \"Pal\u00e7\u0131q vulkanlar\u0131n\u0131n t\u0259sir radiusu n\u0259 q\u0259d\u0259rdir?\"\n",
|
| 232 |
-
"test_docs = retrieve_documents(test_query)\n",
|
| 233 |
-
"print(f\"\\n\u2705 Retrieved {len(test_docs)} documents for test query\")\n",
|
| 234 |
-
"print(f\"Top result: {test_docs[0]['pdf_name']}, page {test_docs[0]['page_number']} (score: {test_docs[0]['score']:.3f})\")"
|
| 235 |
-
]
|
| 236 |
-
},
|
| 237 |
-
{
|
| 238 |
-
"cell_type": "markdown",
|
| 239 |
-
"metadata": {},
|
| 240 |
-
"source": [
|
| 241 |
-
"## 4. LLM Client Functions"
|
| 242 |
-
]
|
| 243 |
-
},
|
| 244 |
-
{
|
| 245 |
-
"cell_type": "code",
|
| 246 |
-
"execution_count": null,
|
| 247 |
-
"metadata": {},
|
| 248 |
-
"outputs": [],
|
| 249 |
-
"source": [
|
| 250 |
-
"# Initialize Azure OpenAI\n",
|
| 251 |
-
"azure_client = AzureOpenAI(\n",
|
| 252 |
-
" api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
|
| 253 |
-
" api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
|
| 254 |
-
" azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
|
| 255 |
-
")\n",
|
| 256 |
-
"\n",
|
| 257 |
-
"LLM_MODELS = {\n",
|
| 258 |
-
" 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
|
| 259 |
-
" 'DeepSeek-R1': 'DeepSeek-R1',\n",
|
| 260 |
-
" 'GPT-4.1': 'gpt-4.1',\n",
|
| 261 |
-
" 'GPT-5-mini': 'gpt-5-mini',\n",
|
| 262 |
-
" 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n",
|
| 263 |
-
"}\n",
|
| 264 |
-
"\n",
|
| 265 |
-
"def generate_answer(model_name: str, query: str, documents: List[Dict], \n",
|
| 266 |
-
" temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n",
|
| 267 |
-
" \"\"\"\n",
|
| 268 |
-
" Generate answer using specified LLM model.\n",
|
| 269 |
-
" Returns: (answer, response_time)\n",
|
| 270 |
-
" \"\"\"\n",
|
| 271 |
-
" # Build context from retrieved documents\n",
|
| 272 |
-
" context_parts = []\n",
|
| 273 |
-
" for i, doc in enumerate(documents, 1):\n",
|
| 274 |
-
" context_parts.append(\n",
|
| 275 |
-
" f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n",
|
| 276 |
-
" )\n",
|
| 277 |
-
" context = \"\\n\\n\".join(context_parts)\n",
|
| 278 |
-
" \n",
|
| 279 |
-
" # Create prompt\n",
|
| 280 |
-
" prompt = f\"\"\"Siz SOCAR-\u0131n tarixi neft v\u0259 qaz s\u0259n\u0259dl\u0259ri \u00fczr\u0259 m\u00fct\u0259x\u0259ssis k\u00f6m\u0259k\u00e7isisiniz.\n",
|
| 281 |
-
"\n",
|
| 282 |
-
"Kontekst (\u0259laq\u0259li s\u0259n\u0259dl\u0259r):\n",
|
| 283 |
-
"{context}\n",
|
| 284 |
-
"\n",
|
| 285 |
-
"Sual: {query}\n",
|
| 286 |
-
"\n",
|
| 287 |
-
"\u018ftrafl\u0131 cavab verin v\u0259 m\u00fctl\u0259q s\u0259n\u0259d m\u0259nb\u0259l\u0259rin\u0259 istinad edin (PDF ad\u0131 v\u0259 s\u0259hif\u0259 n\u00f6mr\u0259si il\u0259).\n",
|
| 288 |
-
"Cavab\u0131n\u0131z d\u0259qiq, faktlara \u0259saslanan v\u0259 kontekst m\u0259lumatlar\u0131ndan istifad\u0259 ed\u0259n olmal\u0131d\u0131r.\"\"\"\n",
|
| 289 |
-
" \n",
|
| 290 |
-
" # Get model deployment\n",
|
| 291 |
-
" deployment = MODELS[model_name]['deployment']\n",
|
| 292 |
-
" \n",
|
| 293 |
-
" try:\n",
|
| 294 |
-
" start_time = time.time()\n",
|
| 295 |
-
" \n",
|
| 296 |
-
" # GPT-5 models use max_completion_tokens, others use max_tokens\n",
|
| 297 |
-
" if deployment.startswith('gpt-5'):\n",
|
| 298 |
-
" response = azure_client.chat.completions.create(\n",
|
| 299 |
-
" model=deployment,\n",
|
| 300 |
-
" messages=[\n",
|
| 301 |
-
" {\"role\": \"user\", \"content\": prompt}\n",
|
| 302 |
-
" ],\n",
|
| 303 |
-
" temperature=temperature,\n",
|
| 304 |
-
" max_completion_tokens=max_tokens\n",
|
| 305 |
-
" )\n",
|
| 306 |
-
" else:\n",
|
| 307 |
-
" response = azure_client.chat.completions.create(\n",
|
| 308 |
-
" model=deployment,\n",
|
| 309 |
-
" messages=[\n",
|
| 310 |
-
" {\"role\": \"user\", \"content\": prompt}\n",
|
| 311 |
-
" ],\n",
|
| 312 |
-
" temperature=temperature,\n",
|
| 313 |
-
" max_tokens=max_tokens\n",
|
| 314 |
-
" )\n",
|
| 315 |
-
" \n",
|
| 316 |
-
" response_time = time.time() - start_time\n",
|
| 317 |
-
" answer = response.choices[0].message.content\n",
|
| 318 |
-
" \n",
|
| 319 |
-
" return answer, response_time\n",
|
| 320 |
-
" \n",
|
| 321 |
-
" except Exception as e:\n",
|
| 322 |
-
" return f\"ERROR: {str(e)}\", 0.0\n",
|
| 323 |
-
"\n",
|
| 324 |
-
"print(f\"\\n\u2705 Configured {len(LLM_MODELS)} LLM models for testing\")"
|
| 325 |
-
]
|
| 326 |
-
},
|
| 327 |
-
{
|
| 328 |
-
"cell_type": "markdown",
|
| 329 |
-
"metadata": {},
|
| 330 |
-
"source": [
|
| 331 |
-
"## 5. Evaluation Metrics"
|
| 332 |
-
]
|
| 333 |
-
},
|
| 334 |
-
{
|
| 335 |
-
"cell_type": "code",
|
| 336 |
-
"execution_count": null,
|
| 337 |
-
"metadata": {},
|
| 338 |
-
"outputs": [],
|
| 339 |
-
"source": [
|
| 340 |
-
"def normalize_text(text: str) -> str:\n",
|
| 341 |
-
" \"\"\"Normalize text for comparison.\"\"\"\n",
|
| 342 |
-
" import re\n",
|
| 343 |
-
" text = text.lower().strip()\n",
|
| 344 |
-
" text = re.sub(r'\\s+', ' ', text)\n",
|
| 345 |
-
" return text\n",
|
| 346 |
-
"\n",
|
| 347 |
-
"def calculate_answer_similarity(reference: str, hypothesis: str) -> Dict[str, float]:\n",
|
| 348 |
-
" \"\"\"\n",
|
| 349 |
-
" Calculate similarity between generated and expected answer.\n",
|
| 350 |
-
" Lower is better for error rates.\n",
|
| 351 |
-
" \"\"\"\n",
|
| 352 |
-
" ref_norm = normalize_text(reference)\n",
|
| 353 |
-
" hyp_norm = normalize_text(hypothesis)\n",
|
| 354 |
-
" \n",
|
| 355 |
-
" # Character Error Rate\n",
|
| 356 |
-
" cer_score = cer(ref_norm, hyp_norm) * 100\n",
|
| 357 |
-
" \n",
|
| 358 |
-
" # Word Error Rate \n",
|
| 359 |
-
" wer_score = wer(ref_norm, hyp_norm) * 100\n",
|
| 360 |
-
" \n",
|
| 361 |
-
" # Similarity scores (higher is better)\n",
|
| 362 |
-
" similarity = max(0, 100 - wer_score)\n",
|
| 363 |
-
" \n",
|
| 364 |
-
" return {\n",
|
| 365 |
-
" 'CER': round(cer_score, 2),\n",
|
| 366 |
-
" 'WER': round(wer_score, 2),\n",
|
| 367 |
-
" 'Similarity': round(similarity, 2)\n",
|
| 368 |
-
" }\n",
|
| 369 |
-
"\n",
|
| 370 |
-
"def check_citations(answer: str, documents: List[Dict]) -> Dict[str, any]:\n",
|
| 371 |
-
" \"\"\"\n",
|
| 372 |
-
" Check if answer includes proper citations.\n",
|
| 373 |
-
" \"\"\"\n",
|
| 374 |
-
" import re\n",
|
| 375 |
-
" \n",
|
| 376 |
-
" # Check for PDF names\n",
|
| 377 |
-
" pdf_names = [doc['pdf_name'] for doc in documents]\n",
|
| 378 |
-
" cited_pdfs = sum(1 for pdf in pdf_names if pdf.replace('.pdf', '') in answer)\n",
|
| 379 |
-
" \n",
|
| 380 |
-
" # Check for page numbers\n",
|
| 381 |
-
" page_numbers = [str(doc['page_number']) for doc in documents]\n",
|
| 382 |
-
" cited_pages = sum(1 for page in page_numbers if page in answer)\n",
|
| 383 |
-
" \n",
|
| 384 |
-
" # Check for source keywords\n",
|
| 385 |
-
" source_keywords = ['m\u0259nb\u0259', 's\u0259n\u0259d', 's\u0259hif\u0259', 'pdf', 'document', 'page', 'source']\n",
|
| 386 |
-
" has_source_ref = any(kw in answer.lower() for kw in source_keywords)\n",
|
| 387 |
-
" \n",
|
| 388 |
-
" citation_score = (\n",
|
| 389 |
-
" (cited_pdfs / len(pdf_names) * 40) + # 40% for PDF citation\n",
|
| 390 |
-
" (cited_pages / len(page_numbers) * 40) + # 40% for page citation\n",
|
| 391 |
-
" (20 if has_source_ref else 0) # 20% for having source keywords\n",
|
| 392 |
-
" )\n",
|
| 393 |
-
" \n",
|
| 394 |
-
" return {\n",
|
| 395 |
-
" 'Citation_Score': round(citation_score, 2),\n",
|
| 396 |
-
" 'Cited_PDFs': cited_pdfs,\n",
|
| 397 |
-
" 'Cited_Pages': cited_pages,\n",
|
| 398 |
-
" 'Has_Source_Reference': has_source_ref\n",
|
| 399 |
-
" }\n",
|
| 400 |
-
"\n",
|
| 401 |
-
"def evaluate_completeness(answer: str, min_length: int = 100) -> Dict[str, any]:\n",
|
| 402 |
-
" \"\"\"\n",
|
| 403 |
-
" Evaluate answer completeness.\n",
|
| 404 |
-
" \"\"\"\n",
|
| 405 |
-
" word_count = len(answer.split())\n",
|
| 406 |
-
" char_count = len(answer)\n",
|
| 407 |
-
" \n",
|
| 408 |
-
" # Penalize very short or very long answers\n",
|
| 409 |
-
" if char_count < min_length:\n",
|
| 410 |
-
" completeness_score = (char_count / min_length) * 100\n",
|
| 411 |
-
" elif char_count > 2000:\n",
|
| 412 |
-
" completeness_score = 100 - ((char_count - 2000) / 2000 * 20) # Penalty for verbosity\n",
|
| 413 |
-
" else:\n",
|
| 414 |
-
" completeness_score = 100\n",
|
| 415 |
-
" \n",
|
| 416 |
-
" return {\n",
|
| 417 |
-
" 'Completeness_Score': round(max(0, completeness_score), 2),\n",
|
| 418 |
-
" 'Word_Count': word_count,\n",
|
| 419 |
-
" 'Char_Count': char_count\n",
|
| 420 |
-
" }\n",
|
| 421 |
-
"\n",
|
| 422 |
-
"print(\"\u2705 Evaluation functions ready\")"
|
| 423 |
-
]
|
| 424 |
-
},
|
| 425 |
-
{
|
| 426 |
-
"cell_type": "markdown",
|
| 427 |
-
"metadata": {},
|
| 428 |
-
"source": [
|
| 429 |
-
"## 6. Run Benchmark on All Models"
|
| 430 |
-
]
|
| 431 |
-
},
|
| 432 |
-
{
|
| 433 |
-
"cell_type": "code",
|
| 434 |
-
"execution_count": null,
|
| 435 |
-
"metadata": {},
|
| 436 |
-
"outputs": [],
|
| 437 |
-
"source": [
|
| 438 |
-
"# Select models to test (you can comment out models to skip)\n",
|
| 439 |
-
"MODELS_TO_TEST = [\n",
|
| 440 |
-
" 'Llama-4-Maverick-17B',\n",
|
| 441 |
-
" 'DeepSeek-R1',\n",
|
| 442 |
-
" 'GPT-4.1',\n",
|
| 443 |
-
" 'GPT-5-mini',\n",
|
| 444 |
-
" 'Claude-Sonnet-4.5',\n",
|
| 445 |
-
" # 'Claude-Opus-4.1', # Uncomment to test\n",
|
| 446 |
-
" # 'Phi-4-multimodal', # Uncomment to test\n",
|
| 447 |
-
" # 'GPT-OSS-120B', # Uncomment to test\n",
|
| 448 |
-
"]\n",
|
| 449 |
-
"\n",
|
| 450 |
-
"print(f\"Testing {len(MODELS_TO_TEST)} models on {len(questions)} questions...\\n\")\n",
|
| 451 |
-
"print(\"This may take several minutes...\\n\")"
|
| 452 |
-
]
|
| 453 |
-
},
|
| 454 |
-
{
|
| 455 |
-
"cell_type": "code",
|
| 456 |
-
"execution_count": null,
|
| 457 |
-
"metadata": {},
|
| 458 |
-
"outputs": [],
|
| 459 |
-
"source": [
|
| 460 |
-
"# Run benchmark\n",
|
| 461 |
-
"results = []\n",
|
| 462 |
-
"\n",
|
| 463 |
-
"for model_name in MODELS_TO_TEST:\n",
|
| 464 |
-
" print(f\"\\n{'='*80}\")\n",
|
| 465 |
-
" print(f\"Testing: {model_name}\")\n",
|
| 466 |
-
" print(f\"{'='*80}\")\n",
|
| 467 |
-
" \n",
|
| 468 |
-
" model_results = []\n",
|
| 469 |
-
" \n",
|
| 470 |
-
" for example_key, messages in questions.items():\n",
|
| 471 |
-
" # Get the last user message (the actual question)\n",
|
| 472 |
-
" user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
|
| 473 |
-
" query = user_msg['content']\n",
|
| 474 |
-
" \n",
|
| 475 |
-
" print(f\"\\n Question {example_key}: {query[:80]}...\")\n",
|
| 476 |
-
" \n",
|
| 477 |
-
" # Retrieve documents\n",
|
| 478 |
-
" documents = retrieve_documents(query, top_k=3)\n",
|
| 479 |
-
" \n",
|
| 480 |
-
" # Generate answer\n",
|
| 481 |
-
" answer, response_time = generate_answer(model_name, query, documents)\n",
|
| 482 |
-
" \n",
|
| 483 |
-
" if answer.startswith('ERROR'):\n",
|
| 484 |
-
" print(f\" \u274c Failed: {answer}\")\n",
|
| 485 |
-
" continue\n",
|
| 486 |
-
" \n",
|
| 487 |
-
" print(f\" \u2705 Response time: {response_time:.2f}s\")\n",
|
| 488 |
-
" \n",
|
| 489 |
-
" # Get expected answer\n",
|
| 490 |
-
" expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
|
| 491 |
-
" \n",
|
| 492 |
-
" # Calculate metrics\n",
|
| 493 |
-
" similarity_metrics = calculate_answer_similarity(expected, answer) if expected else {'CER': 0, 'WER': 0, 'Similarity': 0}\n",
|
| 494 |
-
" citation_metrics = check_citations(answer, documents)\n",
|
| 495 |
-
" completeness_metrics = evaluate_completeness(answer)\n",
|
| 496 |
-
" \n",
|
| 497 |
-
" # Store result\n",
|
| 498 |
-
" result = {\n",
|
| 499 |
-
" 'Model': model_name,\n",
|
| 500 |
-
" 'Question': example_key,\n",
|
| 501 |
-
" 'Query': query[:100],\n",
|
| 502 |
-
" 'Answer': answer[:200] + '...',\n",
|
| 503 |
-
" 'Response_Time': round(response_time, 2),\n",
|
| 504 |
-
" **similarity_metrics,\n",
|
| 505 |
-
" **citation_metrics,\n",
|
| 506 |
-
" **completeness_metrics,\n",
|
| 507 |
-
" 'Open_Source': MODELS[model_name]['open_source'],\n",
|
| 508 |
-
" 'Architecture_Score': MODELS[model_name]['architecture_score']\n",
|
| 509 |
-
" }\n",
|
| 510 |
-
" \n",
|
| 511 |
-
" model_results.append(result)\n",
|
| 512 |
-
" results.append(result)\n",
|
| 513 |
-
" \n",
|
| 514 |
-
" # Show summary for this model\n",
|
| 515 |
-
" if model_results:\n",
|
| 516 |
-
" avg_response_time = sum(r['Response_Time'] for r in model_results) / len(model_results)\n",
|
| 517 |
-
" avg_similarity = sum(r['Similarity'] for r in model_results) / len(model_results)\n",
|
| 518 |
-
" avg_citation = sum(r['Citation_Score'] for r in model_results) / len(model_results)\n",
|
| 519 |
-
" avg_completeness = sum(r['Completeness_Score'] for r in model_results) / len(model_results)\n",
|
| 520 |
-
" \n",
|
| 521 |
-
" print(f\"\\n \ud83d\udcca {model_name} Summary:\")\n",
|
| 522 |
-
" print(f\" Avg Response Time: {avg_response_time:.2f}s\")\n",
|
| 523 |
-
" print(f\" Avg Similarity: {avg_similarity:.1f}%\")\n",
|
| 524 |
-
" print(f\" Avg Citation Score: {avg_citation:.1f}%\")\n",
|
| 525 |
-
" print(f\" Avg Completeness: {avg_completeness:.1f}%\")\n",
|
| 526 |
-
"\n",
|
| 527 |
-
"print(f\"\\n{'='*80}\")\n",
|
| 528 |
-
"print(\"\u2705 Benchmarking complete!\")\n",
|
| 529 |
-
"print(f\"{'='*80}\")"
|
| 530 |
-
]
|
| 531 |
-
},
|
| 532 |
-
{
|
| 533 |
-
"cell_type": "markdown",
|
| 534 |
-
"metadata": {},
|
| 535 |
-
"source": [
|
| 536 |
-
"## 7. Aggregate Results and Rankings"
|
| 537 |
-
]
|
| 538 |
-
},
|
| 539 |
-
{
|
| 540 |
-
"cell_type": "code",
|
| 541 |
-
"execution_count": null,
|
| 542 |
-
"metadata": {},
|
| 543 |
-
"outputs": [],
|
| 544 |
-
"source": [
|
| 545 |
-
"# Create DataFrame\n",
|
| 546 |
-
"df = pd.DataFrame(results)\n",
|
| 547 |
-
"\n",
|
| 548 |
-
"# Calculate aggregate scores per model\n",
|
| 549 |
-
"model_summary = df.groupby('Model').agg({\n",
|
| 550 |
-
" 'Response_Time': 'mean',\n",
|
| 551 |
-
" 'Similarity': 'mean',\n",
|
| 552 |
-
" 'Citation_Score': 'mean',\n",
|
| 553 |
-
" 'Completeness_Score': 'mean',\n",
|
| 554 |
-
" 'CER': 'mean',\n",
|
| 555 |
-
" 'WER': 'mean',\n",
|
| 556 |
-
" 'Open_Source': 'first',\n",
|
| 557 |
-
" 'Architecture_Score': 'first'\n",
|
| 558 |
-
"}).round(2)\n",
|
| 559 |
-
"\n",
|
| 560 |
-
"# Calculate overall quality score (weighted average)\n",
|
| 561 |
-
"model_summary['Quality_Score'] = (\n",
|
| 562 |
-
" model_summary['Similarity'] * 0.35 + # 35% answer accuracy\n",
|
| 563 |
-
" model_summary['Citation_Score'] * 0.35 + # 35% citation quality\n",
|
| 564 |
-
" model_summary['Completeness_Score'] * 0.30 # 30% completeness\n",
|
| 565 |
-
").round(2)\n",
|
| 566 |
-
"\n",
|
| 567 |
-
"# Sort by Quality Score\n",
|
| 568 |
-
"model_summary = model_summary.sort_values('Quality_Score', ascending=False)\n",
|
| 569 |
-
"\n",
|
| 570 |
-
"# Display summary table\n",
|
| 571 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 572 |
-
"print(\"\ud83d\udcca LLM BENCHMARKING RESULTS - MODEL SUMMARY\")\n",
|
| 573 |
-
"print(\"=\"*100)\n",
|
| 574 |
-
"print(model_summary.to_string())\n",
|
| 575 |
-
"print(\"=\"*100)"
|
| 576 |
-
]
|
| 577 |
-
},
|
| 578 |
-
{
|
| 579 |
-
"cell_type": "code",
|
| 580 |
-
"metadata": {},
|
| 581 |
-
"source": [
|
| 582 |
-
"# Create comprehensive visualization\n",
|
| 583 |
-
"import os\n",
|
| 584 |
-
"from pathlib import Path\n",
|
| 585 |
-
"\n",
|
| 586 |
-
"# Create output directory - using dynamic path\n",
|
| 587 |
-
"output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
|
| 588 |
-
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
| 589 |
-
"\n",
|
| 590 |
-
"fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
|
| 591 |
-
"\n",
|
| 592 |
-
"models = df['Model'].tolist()\n",
|
| 593 |
-
"colors = sns.color_palette('viridis', len(models))\n",
|
| 594 |
-
"\n",
|
| 595 |
-
"# 1. CSR - Character Success Rate (MAIN METRIC)\n",
|
| 596 |
-
"ax1 = axes[0, 0]\n",
|
| 597 |
-
"bars1 = ax1.barh(models, df['CSR'], color=colors)\n",
|
| 598 |
-
"ax1.set_xlabel('CSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
|
| 599 |
-
"ax1.set_title('Character Success Rate (CSR)\\n\ud83c\udfc6 HACKATHON PRIMARY METRIC', \n",
|
| 600 |
-
" fontsize=14, fontweight='bold')\n",
|
| 601 |
-
"ax1.set_xlim(0, 100)\n",
|
| 602 |
-
"for i, (model, csr) in enumerate(zip(models, df['CSR'])):\n",
|
| 603 |
-
" ax1.text(csr + 1, i, f'{csr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
|
| 604 |
-
"ax1.axvline(x=90, color='green', linestyle='--', alpha=0.3, label='Excellent (>90%)')\n",
|
| 605 |
-
"ax1.axvline(x=80, color='orange', linestyle='--', alpha=0.3, label='Good (>80%)')\n",
|
| 606 |
-
"ax1.legend(fontsize=9)\n",
|
| 607 |
-
"\n",
|
| 608 |
-
"# 2. WSR - Word Success Rate\n",
|
| 609 |
-
"ax2 = axes[0, 1]\n",
|
| 610 |
-
"bars2 = ax2.barh(models, df['WSR'], color=colors)\n",
|
| 611 |
-
"ax2.set_xlabel('WSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
|
| 612 |
-
"ax2.set_title('Word Success Rate (WSR)', fontsize=14, fontweight='bold')\n",
|
| 613 |
-
"ax2.set_xlim(0, 100)\n",
|
| 614 |
-
"for i, (model, wsr) in enumerate(zip(models, df['WSR'])):\n",
|
| 615 |
-
" ax2.text(wsr + 1, i, f'{wsr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
|
| 616 |
-
"\n",
|
| 617 |
-
"# 3. Response Time\n",
|
| 618 |
-
"ax3 = axes[1, 0]\n",
|
| 619 |
-
"bars3 = ax3.barh(models, df['Response_Time'], color=colors)\n",
|
| 620 |
-
"ax3.set_xlabel('Total Time (seconds) - Lower is Better', fontsize=12, fontweight='bold')\n",
|
| 621 |
-
"ax3.set_title('Processing Speed', fontsize=14, fontweight='bold')\n",
|
| 622 |
-
"for i, (model, time_val) in enumerate(zip(models, df['Response_Time'])):\n",
|
| 623 |
-
" ax3.text(time_val + 0.5, i, f'{time_val:.1f}s', va='center', fontsize=11)\n",
|
| 624 |
-
"\n",
|
| 625 |
-
"# 4. Error Rates Comparison\n",
|
| 626 |
-
"ax4 = axes[1, 1]\n",
|
| 627 |
-
"x = range(len(models))\n",
|
| 628 |
-
"width = 0.35\n",
|
| 629 |
-
"ax4.bar([i - width/2 for i in x], df['CER'], width, label='CER', color='coral', alpha=0.8)\n",
|
| 630 |
-
"ax4.bar([i + width/2 for i in x], df['WER'], width, label='WER', color='skyblue', alpha=0.8)\n",
|
| 631 |
-
"ax4.set_ylabel('Error Rate (%) - Lower is Better', fontsize=12, fontweight='bold')\n",
|
| 632 |
-
"ax4.set_title('Error Rates', fontsize=14, fontweight='bold')\n",
|
| 633 |
-
"ax4.set_xticks(x)\n",
|
| 634 |
-
"ax4.set_xticklabels(models, rotation=45, ha='right')\n",
|
| 635 |
-
"ax4.legend(fontsize=11)\n",
|
| 636 |
-
"ax4.grid(axis='y', alpha=0.3)\n",
|
| 637 |
-
"\n",
|
| 638 |
-
"plt.tight_layout()\n",
|
| 639 |
-
"plt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\n",
|
| 640 |
-
"plt.show()\n",
|
| 641 |
-
"\n",
|
| 642 |
-
"print(f\"\\n\u2705 Visualization saved to '{output_dir}/results.png'\")"
|
| 643 |
-
]
|
| 644 |
-
},
|
| 645 |
-
{
|
| 646 |
-
"cell_type": "code",
|
| 647 |
-
"execution_count": null,
|
| 648 |
-
"metadata": {},
|
| 649 |
-
"outputs": [],
|
| 650 |
-
"source": [
|
| 651 |
-
"# Create rankings table\n",
|
| 652 |
-
"rankings = model_summary[[\n",
|
| 653 |
-
" 'Quality_Score', 'Similarity', 'Citation_Score', 'Completeness_Score', \n",
|
| 654 |
-
" 'Response_Time', 'Open_Source', 'Architecture_Score'\n",
|
| 655 |
-
"]].copy()\n",
|
| 656 |
-
"\n",
|
| 657 |
-
"rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
|
| 658 |
-
"\n",
|
| 659 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 660 |
-
"print(\"\ud83c\udfc6 FINAL RANKINGS\")\n",
|
| 661 |
-
"print(\"=\"*100)\n",
|
| 662 |
-
"print(rankings.to_string())\n",
|
| 663 |
-
"print(\"=\"*100)\n",
|
| 664 |
-
"\n",
|
| 665 |
-
"# Winner analysis\n",
|
| 666 |
-
"best_overall = rankings.index[0]\n",
|
| 667 |
-
"best_open_source = rankings[rankings['Open_Source'] == True].index[0] if any(rankings['Open_Source']) else None\n",
|
| 668 |
-
"fastest = model_summary['Response_Time'].idxmin()\n",
|
| 669 |
-
"\n",
|
| 670 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 671 |
-
"print(\"\ud83d\udca1 RECOMMENDATIONS FOR HACKATHON\")\n",
|
| 672 |
-
"print(\"=\"*100)\n",
|
| 673 |
-
"\n",
|
| 674 |
-
"print(f\"\\n\ud83e\udd47 Best Overall Quality: {best_overall}\")\n",
|
| 675 |
-
"print(f\" Quality Score: {model_summary.loc[best_overall, 'Quality_Score']:.1f}%\")\n",
|
| 676 |
-
"print(f\" Similarity: {model_summary.loc[best_overall, 'Similarity']:.1f}%\")\n",
|
| 677 |
-
"print(f\" Citation Score: {model_summary.loc[best_overall, 'Citation_Score']:.1f}%\")\n",
|
| 678 |
-
"print(f\" Response Time: {model_summary.loc[best_overall, 'Response_Time']:.2f}s\")\n",
|
| 679 |
-
"print(f\" Open Source: {model_summary.loc[best_overall, 'Open_Source']}\")\n",
|
| 680 |
-
"print(f\" Architecture Score: {model_summary.loc[best_overall, 'Architecture_Score']}\")\n",
|
| 681 |
-
"\n",
|
| 682 |
-
"if best_open_source:\n",
|
| 683 |
-
" print(f\"\\n\ud83d\udd13 Best Open-Source Model: {best_open_source}\")\n",
|
| 684 |
-
" print(f\" Quality Score: {model_summary.loc[best_open_source, 'Quality_Score']:.1f}%\")\n",
|
| 685 |
-
" print(f\" Architecture Score: {model_summary.loc[best_open_source, 'Architecture_Score']} (Better for hackathon!)\")\n",
|
| 686 |
-
" print(f\" Response Time: {model_summary.loc[best_open_source, 'Response_Time']:.2f}s\")\n",
|
| 687 |
-
"\n",
|
| 688 |
-
"print(f\"\\n\u26a1 Fastest Model: {fastest}\")\n",
|
| 689 |
-
"print(f\" Response Time: {model_summary.loc[fastest, 'Response_Time']:.2f}s\")\n",
|
| 690 |
-
"print(f\" Quality Score: {model_summary.loc[fastest, 'Quality_Score']:.1f}%\")\n",
|
| 691 |
-
"\n",
|
| 692 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 693 |
-
"print(\"\ud83d\udcdd FINAL RECOMMENDATION\")\n",
|
| 694 |
-
"print(\"=\"*100)\n",
|
| 695 |
-
"print(\"\\nScoring Breakdown:\")\n",
|
| 696 |
-
"print(\" - LLM Quality: 30% of total hackathon score\")\n",
|
| 697 |
-
"print(\" - Architecture: 20% of total hackathon score (open-source preferred!)\")\n",
|
| 698 |
-
"print(\"\\nBest Choice:\")\n",
|
| 699 |
-
"if best_open_source and model_summary.loc[best_open_source, 'Quality_Score'] >= model_summary.loc[best_overall, 'Quality_Score'] * 0.9:\n",
|
| 700 |
-
" print(f\" \u2705 {best_open_source} - Best balance of quality and architecture score\")\n",
|
| 701 |
-
" print(f\" Only {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality drop for higher architecture score!\")\n",
|
| 702 |
-
"else:\n",
|
| 703 |
-
" print(f\" \u2705 {best_overall} - Highest quality, use if quality gap is significant\")\n",
|
| 704 |
-
" if best_open_source:\n",
|
| 705 |
-
" print(f\" \u26a0\ufe0f Consider {best_open_source} for higher architecture score (trade-off: {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality)\")\n",
|
| 706 |
-
"\n",
|
| 707 |
-
"print(\"=\"*100)"
|
| 708 |
-
]
|
| 709 |
-
},
|
| 710 |
-
{
|
| 711 |
-
"cell_type": "code",
|
| 712 |
-
"metadata": {},
|
| 713 |
-
"source": [
|
| 714 |
-
"# Save results\n",
|
| 715 |
-
"from pathlib import Path\n",
|
| 716 |
-
"\n",
|
| 717 |
-
"# Using dynamic path\n",
|
| 718 |
-
"output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
|
| 719 |
-
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
| 720 |
-
"\n",
|
| 721 |
-
"df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
|
| 722 |
-
"model_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n",
|
| 723 |
-
"rankings.to_csv(output_dir / 'rankings.csv', index=False, encoding='utf-8')\n",
|
| 724 |
-
"\n",
|
| 725 |
-
"print(\"\\n\u2705 Results exported to output/llm_benchmark/:\")\n",
|
| 726 |
-
"print(\" - detailed_results.csv (all questions and answers)\")\n",
|
| 727 |
-
"print(\" - summary.csv (model averages)\")\n",
|
| 728 |
-
"print(\" - rankings.csv (final rankings)\")\n",
|
| 729 |
-
"print(\" - results.png (visualizations)\")"
|
| 730 |
-
]
|
| 731 |
-
},
|
| 732 |
-
{
|
| 733 |
-
"cell_type": "markdown",
|
| 734 |
-
"metadata": {},
|
| 735 |
-
"source": [
|
| 736 |
-
"## 11. Sample Answer Comparison"
|
| 737 |
-
]
|
| 738 |
-
}
|
| 739 |
-
],
|
| 740 |
-
"metadata": {
|
| 741 |
-
"kernelspec": {
|
| 742 |
-
"display_name": "venv",
|
| 743 |
-
"language": "python",
|
| 744 |
-
"name": "python3"
|
| 745 |
},
|
| 746 |
-
"
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
"version": 3
|
| 750 |
-
},
|
| 751 |
-
"file_extension": ".py",
|
| 752 |
-
"mimetype": "text/x-python",
|
| 753 |
-
"name": "python",
|
| 754 |
-
"nbconvert_exporter": "python",
|
| 755 |
-
"pygments_lexer": "ipython3",
|
| 756 |
-
"version": "3.10.12"
|
| 757 |
-
}
|
| 758 |
-
},
|
| 759 |
-
"nbformat": 4,
|
| 760 |
-
"nbformat_minor": 4
|
| 761 |
-
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"cells": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "38a7900c",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# LLM Benchmarking for SOCAR Hackathon RAG Chatbot\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"Testing different LLM models for the `/llm` endpoint to find the best performer.\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"## Evaluation Criteria (LLM Judge Metrics):\n",
|
| 13 |
+
"- **Accuracy**: Is the answer correct?\n",
|
| 14 |
+
"- **Relevance**: Are retrieved citations relevant?\n",
|
| 15 |
+
"- **Completeness**: Does it fully answer the question?\n",
|
| 16 |
+
"- **Citation Quality**: Proper sources with page numbers?\n",
|
| 17 |
+
"- **Response Time**: Speed of generation\n",
|
| 18 |
+
"\n",
|
| 19 |
+
"## Available LLM Models:\n",
|
| 20 |
+
"1. **Llama-4-Maverick-17B** (Open-source)\n",
|
| 21 |
+
"2. **DeepSeek-R1** (Open-source reasoning)\n",
|
| 22 |
+
"3. **GPT-4.1, GPT-5, GPT-5-mini**\n",
|
| 23 |
+
"4. **Claude Sonnet 4.5**"
|
| 24 |
+
]
|
| 25 |
},
|
| 26 |
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"execution_count": 45,
|
| 29 |
+
"id": "143cf60d",
|
| 30 |
+
"metadata": {},
|
| 31 |
+
"outputs": [],
|
| 32 |
+
"source": [
|
| 33 |
+
"# Install required packages\n",
|
| 34 |
+
"# !pip install openai pinecone-client sentence-transformers python-dotenv pandas matplotlib seaborn jiwer"
|
| 35 |
+
]
|
| 36 |
+
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
{
|
| 38 |
+
"cell_type": "code",
|
| 39 |
+
"execution_count": 46,
|
| 40 |
+
"id": "d698b11a",
|
| 41 |
+
"metadata": {},
|
| 42 |
+
"outputs": [
|
| 43 |
+
{
|
| 44 |
+
"name": "stdout",
|
| 45 |
+
"output_type": "stream",
|
| 46 |
+
"text": [
|
| 47 |
+
"✅ Libraries loaded\n"
|
| 48 |
+
]
|
| 49 |
+
}
|
| 50 |
+
],
|
| 51 |
+
"source": [
|
| 52 |
+
"import os\n",
|
| 53 |
+
"import json\n",
|
| 54 |
+
"import time\n",
|
| 55 |
+
"from typing import Dict, List, Tuple\n",
|
| 56 |
+
"from dotenv import load_dotenv\n",
|
| 57 |
+
"import pandas as pd\n",
|
| 58 |
+
"import matplotlib.pyplot as plt\n",
|
| 59 |
+
"import seaborn as sns\n",
|
| 60 |
+
"from openai import AzureOpenAI\n",
|
| 61 |
+
"from pinecone import Pinecone\n",
|
| 62 |
+
"from sentence_transformers import SentenceTransformer\n",
|
| 63 |
+
"from jiwer import wer, cer\n",
|
| 64 |
+
"from pathlib import Path\n",
|
| 65 |
+
"\n",
|
| 66 |
+
"load_dotenv()\n",
|
| 67 |
+
"sns.set_style(\"whitegrid\")\n",
|
| 68 |
+
"plt.rcParams[\"figure.figsize\"] = (14, 8)\n",
|
| 69 |
+
"\n",
|
| 70 |
+
"print(\"✅ Libraries loaded\")"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "code",
|
| 75 |
+
"execution_count": 47,
|
| 76 |
+
"id": "087187fb",
|
| 77 |
+
"metadata": {},
|
| 78 |
+
"outputs": [
|
| 79 |
+
{
|
| 80 |
+
"name": "stdout",
|
| 81 |
+
"output_type": "stream",
|
| 82 |
+
"text": [
|
| 83 |
+
"✅ Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
|
| 84 |
+
"✅ Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n"
|
| 85 |
+
]
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"source": [
|
| 89 |
+
"# Auto-detect project root\n",
|
| 90 |
+
"if Path(\"data\").exists() and Path(\"docs\").exists():\n",
|
| 91 |
+
" PROJECT_ROOT = Path.cwd()\n",
|
| 92 |
+
"elif Path(\"../data\").exists() and Path(\"../docs\").exists():\n",
|
| 93 |
+
" PROJECT_ROOT = Path.cwd().parent\n",
|
| 94 |
+
"else:\n",
|
| 95 |
+
" current = Path.cwd()\n",
|
| 96 |
+
" while current != current.parent:\n",
|
| 97 |
+
" if (current / \"data\").exists() and (current / \"docs\").exists():\n",
|
| 98 |
+
" PROJECT_ROOT = current\n",
|
| 99 |
+
" break\n",
|
| 100 |
+
" current = current.parent\n",
|
| 101 |
+
" else:\n",
|
| 102 |
+
" PROJECT_ROOT = Path.cwd()\n",
|
| 103 |
+
"\n",
|
| 104 |
+
"DATA_DIR = PROJECT_ROOT / \"data\"\n",
|
| 105 |
+
"DOCS_DIR = PROJECT_ROOT / \"docs\"\n",
|
| 106 |
+
"OUTPUT_DIR = PROJECT_ROOT / \"output\"\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"print(f\"✅ Project root: {PROJECT_ROOT}\")\n",
|
| 109 |
+
"print(f\"✅ Docs directory: {DOCS_DIR}\")"
|
| 110 |
+
]
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"cell_type": "code",
|
| 114 |
+
"execution_count": 48,
|
| 115 |
+
"id": "cf51bb3f",
|
| 116 |
+
"metadata": {},
|
| 117 |
+
"outputs": [
|
| 118 |
+
{
|
| 119 |
+
"name": "stdout",
|
| 120 |
+
"output_type": "stream",
|
| 121 |
+
"text": [
|
| 122 |
+
"Loaded 5 test cases\n"
|
| 123 |
+
]
|
| 124 |
+
}
|
| 125 |
+
],
|
| 126 |
+
"source": [
|
| 127 |
+
"# Load sample questions and answers using dynamic paths\n",
|
| 128 |
+
"with open(DOCS_DIR / \"sample_questions.json\", \"r\", encoding=\"utf-8\") as f:\n",
|
| 129 |
+
" questions = json.load(f)\n",
|
| 130 |
+
"\n",
|
| 131 |
+
"with open(DOCS_DIR / \"sample_answers.json\", \"r\", encoding=\"utf-8\") as f:\n",
|
| 132 |
+
" expected_answers = json.load(f)\n",
|
| 133 |
+
"\n",
|
| 134 |
+
"print(f\"Loaded {len(questions)} test cases\")"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"cell_type": "code",
|
| 139 |
+
"execution_count": 49,
|
| 140 |
+
"id": "9e761174",
|
| 141 |
+
"metadata": {},
|
| 142 |
+
"outputs": [
|
| 143 |
+
{
|
| 144 |
+
"name": "stdout",
|
| 145 |
+
"output_type": "stream",
|
| 146 |
+
"text": [
|
| 147 |
+
"✅ Vector DB connected\n",
|
| 148 |
+
"✅ Embedding model loaded\n"
|
| 149 |
+
]
|
| 150 |
+
}
|
| 151 |
+
],
|
| 152 |
+
"source": [
|
| 153 |
+
"# Initialize Pinecone\n",
|
| 154 |
+
"pc = Pinecone(api_key=os.getenv(\"PINECONE_API_KEY\"))\n",
|
| 155 |
+
"index = pc.Index(os.getenv(\"PINECONE_INDEX_NAME\", \"hackathon\"))\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"# Initialize embedding model\n",
|
| 158 |
+
"embed_model = SentenceTransformer(\"BAAI/bge-large-en-v1.5\")\n",
|
| 159 |
+
"\n",
|
| 160 |
+
"print(f\"✅ Vector DB connected\")\n",
|
| 161 |
+
"print(f\"✅ Embedding model loaded\")"
|
| 162 |
+
]
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"cell_type": "code",
|
| 166 |
+
"execution_count": 50,
|
| 167 |
+
"id": "74396795",
|
| 168 |
+
"metadata": {},
|
| 169 |
+
"outputs": [
|
| 170 |
+
{
|
| 171 |
+
"name": "stdout",
|
| 172 |
+
"output_type": "stream",
|
| 173 |
+
"text": [
|
| 174 |
+
"✅ Retrieval function ready\n"
|
| 175 |
+
]
|
| 176 |
+
}
|
| 177 |
+
],
|
| 178 |
+
"source": [
|
| 179 |
+
"def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
|
| 180 |
+
" \"\"\"Retrieve relevant documents from vector database.\"\"\"\n",
|
| 181 |
+
" query_embedding = embed_model.encode(query).tolist()\n",
|
| 182 |
+
" \n",
|
| 183 |
+
" results = index.query(\n",
|
| 184 |
+
" vector=query_embedding,\n",
|
| 185 |
+
" top_k=top_k,\n",
|
| 186 |
+
" include_metadata=True\n",
|
| 187 |
+
" )\n",
|
| 188 |
+
" \n",
|
| 189 |
+
" documents = []\n",
|
| 190 |
+
" for match in results[\"matches\"]:\n",
|
| 191 |
+
" documents.append({\n",
|
| 192 |
+
" \"pdf_name\": match[\"metadata\"].get(\"pdf_name\", \"unknown.pdf\"),\n",
|
| 193 |
+
" \"page_number\": match[\"metadata\"].get(\"page_number\", 0),\n",
|
| 194 |
+
" \"content\": match[\"metadata\"].get(\"text\", \"\"),\n",
|
| 195 |
+
" \"score\": match.get(\"score\", 0.0)\n",
|
| 196 |
+
" })\n",
|
| 197 |
+
" \n",
|
| 198 |
+
" return documents\n",
|
| 199 |
+
"\n",
|
| 200 |
+
"print(\"✅ Retrieval function ready\")"
|
| 201 |
+
]
|
| 202 |
+
},
|
| 203 |
+
{
|
| 204 |
+
"cell_type": "code",
|
| 205 |
+
"execution_count": 57,
|
| 206 |
+
"id": "2bfcc6fb",
|
| 207 |
+
"metadata": {},
|
| 208 |
+
"outputs": [
|
| 209 |
+
{
|
| 210 |
+
"name": "stdout",
|
| 211 |
+
"output_type": "stream",
|
| 212 |
+
"text": [
|
| 213 |
+
"✅ Configured 3 LLM models\n"
|
| 214 |
+
]
|
| 215 |
+
}
|
| 216 |
+
],
|
| 217 |
+
"source": [
|
| 218 |
+
"# Initialize Azure OpenAI\n",
|
| 219 |
+
"azure_client = AzureOpenAI(\n",
|
| 220 |
+
" api_key=os.getenv(\"AZURE_OPENAI_API_KEY\"),\n",
|
| 221 |
+
" api_version=os.getenv(\"AZURE_OPENAI_API_VERSION\", \"2024-08-01-preview\"),\n",
|
| 222 |
+
" azure_endpoint=os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n",
|
| 223 |
+
")\n",
|
| 224 |
+
"\n",
|
| 225 |
+
"LLM_MODELS = {\n",
|
| 226 |
+
" \"Llama-4-Maverick\": \"Llama-4-Maverick-17B-128E-Instruct-FP8\",\n",
|
| 227 |
+
" \"DeepSeek-R1\": \"DeepSeek-R1\",\n",
|
| 228 |
+
" \"GPT-4.1\": \"gpt-4.1\",\n",
|
| 229 |
+
" # \"GPT-5-mini\": \"gpt-5-mini\"\n",
|
| 230 |
+
" # \"Claude-Sonnet-4.5\": \"claude-sonnet-4-5\" # Not available in Azure deployment\n",
|
| 231 |
+
"}\n",
|
| 232 |
+
"\n",
|
| 233 |
+
"print(f\"✅ Configured {len(LLM_MODELS)} LLM models\")"
|
| 234 |
+
]
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"cell_type": "code",
|
| 238 |
+
"execution_count": 61,
|
| 239 |
+
"id": "ddedd503",
|
| 240 |
+
"metadata": {},
|
| 241 |
+
"outputs": [
|
| 242 |
+
{
|
| 243 |
+
"name": "stdout",
|
| 244 |
+
"output_type": "stream",
|
| 245 |
+
"text": [
|
| 246 |
+
"✅ Generation function ready\n"
|
| 247 |
+
]
|
| 248 |
+
}
|
| 249 |
+
],
|
| 250 |
+
"source": [
|
| 251 |
+
"def generate_answer(model_name: str, query: str, documents: List[Dict],\n",
|
| 252 |
+
" temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n",
|
| 253 |
+
" \"\"\"Generate answer using specified LLM model.\"\"\"\n",
|
| 254 |
+
" context_parts = []\n",
|
| 255 |
+
" for i, doc in enumerate(documents, 1):\n",
|
| 256 |
+
" context_parts.append(\n",
|
| 257 |
+
" f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n",
|
| 258 |
+
" )\n",
|
| 259 |
+
" context = \"\\n\\n\".join(context_parts)\n",
|
| 260 |
+
" \n",
|
| 261 |
+
" prompt = f\"\"\"Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə mütəxəssis köməkçisisiniz.\n",
|
| 262 |
+
"\n",
|
| 263 |
+
"Kontekst:\n",
|
| 264 |
+
"{context}\n",
|
| 265 |
+
"\n",
|
| 266 |
+
"Sual: {query}\n",
|
| 267 |
+
"\n",
|
| 268 |
+
"Ətraflı cavab verin və mütləq sənəd mənbələrinə istinad edin.\"\"\"\n",
|
| 269 |
+
" \n",
|
| 270 |
+
" deployment = LLM_MODELS[model_name]\n",
|
| 271 |
+
" \n",
|
| 272 |
+
" try:\n",
|
| 273 |
+
" start_time = time.time()\n",
|
| 274 |
+
" \n",
|
| 275 |
+
" # GPT-5 models use max_completion_tokens, others use max_tokens\n",
|
| 276 |
+
" if deployment.startswith(\"gpt-5\"):\n",
|
| 277 |
+
" response = azure_client.chat.completions.create(\n",
|
| 278 |
+
" model=deployment,\n",
|
| 279 |
+
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
|
| 280 |
+
" temperature=temperature,\n",
|
| 281 |
+
" max_completion_tokens=max_tokens\n",
|
| 282 |
+
" )\n",
|
| 283 |
+
" else:\n",
|
| 284 |
+
" response = azure_client.chat.completions.create(\n",
|
| 285 |
+
" model=deployment,\n",
|
| 286 |
+
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
|
| 287 |
+
" temperature=temperature,\n",
|
| 288 |
+
" max_tokens=max_tokens\n",
|
| 289 |
+
" )\n",
|
| 290 |
+
" \n",
|
| 291 |
+
" response_time = time.time() - start_time\n",
|
| 292 |
+
" answer = response.choices[0].message.content\n",
|
| 293 |
+
" return answer, response_time\n",
|
| 294 |
+
" \n",
|
| 295 |
+
" except Exception as e:\n",
|
| 296 |
+
" return f\"ERROR: {str(e)}\", 0.0\n",
|
| 297 |
+
"\n",
|
| 298 |
+
"print(\"✅ Generation function ready\")"
|
| 299 |
+
]
|
| 300 |
+
},
|
| 301 |
+
{
|
| 302 |
+
"cell_type": "code",
|
| 303 |
+
"execution_count": 62,
|
| 304 |
+
"id": "946b0e30",
|
| 305 |
+
"metadata": {},
|
| 306 |
+
"outputs": [
|
| 307 |
+
{
|
| 308 |
+
"name": "stdout",
|
| 309 |
+
"output_type": "stream",
|
| 310 |
+
"text": [
|
| 311 |
+
"✅ Evaluation functions ready\n"
|
| 312 |
+
]
|
| 313 |
+
}
|
| 314 |
+
],
|
| 315 |
+
"source": [
|
| 316 |
+
"def evaluate_answer(expected: str, generated: str, documents: List[Dict]) -> Dict:\n",
|
| 317 |
+
" \"\"\"Evaluate answer quality.\"\"\"\n",
|
| 318 |
+
" # Normalize text\n",
|
| 319 |
+
" def normalize(text):\n",
|
| 320 |
+
" return text.lower().strip()\n",
|
| 321 |
+
" \n",
|
| 322 |
+
" # Calculate similarity\n",
|
| 323 |
+
" if expected:\n",
|
| 324 |
+
" wer_score = wer(normalize(expected), normalize(generated)) * 100\n",
|
| 325 |
+
" similarity = max(0, 100 - wer_score)\n",
|
| 326 |
+
" else:\n",
|
| 327 |
+
" similarity = 0\n",
|
| 328 |
+
" \n",
|
| 329 |
+
" # Check citations\n",
|
| 330 |
+
" pdf_names = [doc[\"pdf_name\"].replace(\".pdf\", \"\") for doc in documents]\n",
|
| 331 |
+
" cited_pdfs = sum(1 for pdf in pdf_names if pdf in generated)\n",
|
| 332 |
+
" citation_score = (cited_pdfs / len(pdf_names)) * 100 if pdf_names else 0\n",
|
| 333 |
+
" \n",
|
| 334 |
+
" # Completeness\n",
|
| 335 |
+
" word_count = len(generated.split())\n",
|
| 336 |
+
" completeness = min(100, (word_count / 50) * 100)\n",
|
| 337 |
+
" \n",
|
| 338 |
+
" return {\n",
|
| 339 |
+
" \"Similarity\": round(similarity, 2),\n",
|
| 340 |
+
" \"Citation_Score\": round(citation_score, 2),\n",
|
| 341 |
+
" \"Completeness\": round(completeness, 2),\n",
|
| 342 |
+
" \"Quality_Score\": round((similarity * 0.4 + citation_score * 0.4 + completeness * 0.2), 2)\n",
|
| 343 |
+
" }\n",
|
| 344 |
+
"\n",
|
| 345 |
+
"print(\"✅ Evaluation functions ready\")"
|
| 346 |
+
]
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"cell_type": "markdown",
|
| 350 |
+
"id": "319459ce",
|
| 351 |
+
"metadata": {},
|
| 352 |
+
"source": [
|
| 353 |
+
"## Run LLM Benchmark"
|
| 354 |
+
]
|
| 355 |
+
},
|
| 356 |
{
|
| 357 |
+
"cell_type": "code",
|
| 358 |
+
"execution_count": 63,
|
| 359 |
+
"id": "c8867f44",
|
| 360 |
+
"metadata": {},
|
| 361 |
+
"outputs": [
|
| 362 |
+
{
|
| 363 |
+
"name": "stdout",
|
| 364 |
+
"output_type": "stream",
|
| 365 |
+
"text": [
|
| 366 |
+
"*******\n",
|
| 367 |
+
"Testing: Llama-4-Maverick\n",
|
| 368 |
+
"**********\n",
|
| 369 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 370 |
+
" ✅ 4.31s\n",
|
| 371 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 372 |
+
" ✅ 4.61s\n",
|
| 373 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 374 |
+
" ✅ 3.92s\n",
|
| 375 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 376 |
+
" ✅ 4.13s\n",
|
| 377 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 378 |
+
" ✅ 3.50s\n",
|
| 379 |
+
"*******\n",
|
| 380 |
+
"Testing: DeepSeek-R1\n",
|
| 381 |
+
"**********\n",
|
| 382 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 383 |
+
" ✅ 10.38s\n",
|
| 384 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 385 |
+
" ✅ 11.32s\n",
|
| 386 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 387 |
+
" ✅ 10.45s\n",
|
| 388 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
|
| 389 |
+
" ✅ 10.56s\n",
|
| 390 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 391 |
+
" ✅ 10.99s\n",
|
| 392 |
+
"*******\n",
|
| 393 |
+
"Testing: GPT-4.1\n",
|
| 394 |
+
"**********\n",
|
| 395 |
+
" Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
|
| 396 |
+
" ✅ 6.32s\n",
|
| 397 |
+
" Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
|
| 398 |
+
" ✅ 5.85s\n",
|
| 399 |
+
" Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
|
| 400 |
+
" ✅ 8.09s\n",
|
| 401 |
+
" Example4: Bakı arxipelaqı (BA) və Aşağı Kür çök��kliyi (AKÇ) üçün geote...\n",
|
| 402 |
+
" ✅ 6.72s\n",
|
| 403 |
+
" Example5: Bu zonada hansı proseslər baş verir?...\n",
|
| 404 |
+
" ✅ 5.22s\n",
|
| 405 |
+
"*********\n",
|
| 406 |
+
"✅ Benchmark complete!\n"
|
| 407 |
+
]
|
| 408 |
+
}
|
| 409 |
+
],
|
| 410 |
+
"source": [
|
| 411 |
+
"# Run benchmark\n",
|
| 412 |
+
"results = []\n",
|
| 413 |
+
"\n",
|
| 414 |
+
"for model_name in LLM_MODELS.keys():\n",
|
| 415 |
+
" print(f\"*******\")\n",
|
| 416 |
+
" print(f\"Testing: {model_name}\")\n",
|
| 417 |
+
" print(f\"**********\")\n",
|
| 418 |
+
" \n",
|
| 419 |
+
" for example_key, messages in questions.items():\n",
|
| 420 |
+
" user_msg = [m for m in messages if m[\"role\"] == \"user\"][-1]\n",
|
| 421 |
+
" query = user_msg[\"content\"]\n",
|
| 422 |
+
" \n",
|
| 423 |
+
" print(f\" {example_key}: {query[:60]}...\")\n",
|
| 424 |
+
" \n",
|
| 425 |
+
" # Retrieve and generate\n",
|
| 426 |
+
" documents = retrieve_documents(query, top_k=3)\n",
|
| 427 |
+
" answer, response_time = generate_answer(model_name, query, documents)\n",
|
| 428 |
+
" \n",
|
| 429 |
+
" if answer.startswith(\"ERROR\"):\n",
|
| 430 |
+
" print(f\" ❌ {answer}\")\n",
|
| 431 |
+
" continue\n",
|
| 432 |
+
" \n",
|
| 433 |
+
" print(f\" ✅ {response_time:.2f}s\")\n",
|
| 434 |
+
" \n",
|
| 435 |
+
" # Evaluate\n",
|
| 436 |
+
" expected = expected_answers.get(example_key, {}).get(\"Answer\", \"\")\n",
|
| 437 |
+
" metrics = evaluate_answer(expected, answer, documents)\n",
|
| 438 |
+
" \n",
|
| 439 |
+
" results.append({\n",
|
| 440 |
+
" \"Model\": model_name,\n",
|
| 441 |
+
" \"Question\": example_key,\n",
|
| 442 |
+
" \"Response_Time\": round(response_time, 2),\n",
|
| 443 |
+
" **metrics\n",
|
| 444 |
+
" })\n",
|
| 445 |
+
"\n",
|
| 446 |
+
"print(\"*********\")\n",
|
| 447 |
+
"print(\"✅ Benchmark complete!\")"
|
| 448 |
+
]
|
| 449 |
+
},
|
| 450 |
+
{
|
| 451 |
+
"cell_type": "code",
|
| 452 |
+
"execution_count": 55,
|
| 453 |
+
"id": "9b243569",
|
| 454 |
+
"metadata": {},
|
| 455 |
+
"outputs": [
|
| 456 |
+
{
|
| 457 |
+
"name": "stdout",
|
| 458 |
+
"output_type": "stream",
|
| 459 |
+
"text": [
|
| 460 |
+
"\n",
|
| 461 |
+
"====================================================================================================\n",
|
| 462 |
+
"📊 LLM BENCHMARKING RESULTS\n",
|
| 463 |
+
"====================================================================================================\n",
|
| 464 |
+
" Quality_Score Similarity Citation_Score Completeness Response_Time\n",
|
| 465 |
+
"Model \n",
|
| 466 |
+
"GPT-4.1 52.00 0.00 80.00 100.0 6.38\n",
|
| 467 |
+
"Llama-4-Maverick 52.00 0.00 80.00 100.0 4.00\n",
|
| 468 |
+
"DeepSeek-R1 32.27 1.54 33.33 91.6 10.98\n",
|
| 469 |
+
"====================================================================================================\n"
|
| 470 |
+
]
|
| 471 |
+
}
|
| 472 |
+
],
|
| 473 |
+
"source": [
|
| 474 |
+
"# Analyze results\n",
|
| 475 |
+
"df = pd.DataFrame(results)\n",
|
| 476 |
+
"summary = df.groupby(\"Model\").agg({\n",
|
| 477 |
+
" \"Quality_Score\": \"mean\",\n",
|
| 478 |
+
" \"Similarity\": \"mean\",\n",
|
| 479 |
+
" \"Citation_Score\": \"mean\",\n",
|
| 480 |
+
" \"Completeness\": \"mean\",\n",
|
| 481 |
+
" \"Response_Time\": \"mean\"\n",
|
| 482 |
+
"}).round(2).sort_values(\"Quality_Score\", ascending=False)\n",
|
| 483 |
+
"\n",
|
| 484 |
+
"print(\"\\n\" + \"=\"*100)\n",
|
| 485 |
+
"print(\"📊 LLM BENCHMARKING RESULTS\")\n",
|
| 486 |
+
"print(\"=\"*100)\n",
|
| 487 |
+
"print(summary.to_string())\n",
|
| 488 |
+
"print(\"=\"*100)"
|
| 489 |
+
]
|
| 490 |
+
},
|
| 491 |
+
{
|
| 492 |
+
"cell_type": "code",
|
| 493 |
+
"execution_count": 56,
|
| 494 |
+
"id": "8c64cf75",
|
| 495 |
+
"metadata": {},
|
| 496 |
+
"outputs": [
|
| 497 |
+
{
|
| 498 |
+
"name": "stdout",
|
| 499 |
+
"output_type": "stream",
|
| 500 |
+
"text": [
|
| 501 |
+
"\n",
|
| 502 |
+
"✅ Results saved to output/llm_benchmark/\n"
|
| 503 |
+
]
|
| 504 |
+
}
|
| 505 |
+
],
|
| 506 |
+
"source": [
|
| 507 |
+
"# Save results using dynamic path\n",
|
| 508 |
+
"output_dir = OUTPUT_DIR / \"llm_benchmark\"\n",
|
| 509 |
+
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
| 510 |
+
"\n",
|
| 511 |
+
"df.to_csv(output_dir / \"detailed_results.csv\", index=False, encoding=\"utf-8\")\n",
|
| 512 |
+
"summary.to_csv(output_dir / \"summary.csv\", encoding=\"utf-8\")\n",
|
| 513 |
+
"\n",
|
| 514 |
+
"print(\"\\n✅ Results saved to output/llm_benchmark/\")"
|
| 515 |
+
]
|
| 516 |
+
}
|
| 517 |
+
],
|
| 518 |
+
"metadata": {
|
| 519 |
+
"kernelspec": {
|
| 520 |
+
"display_name": "Python 3",
|
| 521 |
+
"language": "python",
|
| 522 |
+
"name": "python3"
|
| 523 |
+
},
|
| 524 |
+
"language_info": {
|
| 525 |
+
"name": "python",
|
| 526 |
+
"version": "3.10.0"
|
| 527 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
},
|
| 529 |
+
"nbformat": 4,
|
| 530 |
+
"nbformat_minor": 5
|
| 531 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/llm_benchmark.ipynb.backup
DELETED
|
@@ -1,761 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cells": [
|
| 3 |
-
{
|
| 4 |
-
"cell_type": "markdown",
|
| 5 |
-
"metadata": {},
|
| 6 |
-
"source": [
|
| 7 |
-
"# LLM Benchmarking for SOCAR Hackathon RAG Chatbot\n",
|
| 8 |
-
"\n",
|
| 9 |
-
"This notebook tests different LLM models for the `/llm` endpoint to find the best performer.\n",
|
| 10 |
-
"\n",
|
| 11 |
-
"## Evaluation Criteria (LLM Judge Metrics):\n",
|
| 12 |
-
"- **Accuracy**: Is the answer correct?\n",
|
| 13 |
-
"- **Relevance**: Are retrieved citations relevant?\n",
|
| 14 |
-
"- **Completeness**: Does it fully answer the question?\n",
|
| 15 |
-
"- **Citation Quality**: Proper sources with page numbers?\n",
|
| 16 |
-
"- **Response Time**: Speed of generation\n",
|
| 17 |
-
"\n",
|
| 18 |
-
"## Available LLM Models:\n",
|
| 19 |
-
"1. **Llama-4-Maverick-17B-128E-Instruct-FP8** (Current choice, open-source)\n",
|
| 20 |
-
"2. **DeepSeek-R1** (Open-source reasoning model)\n",
|
| 21 |
-
"3. **GPT-4.1** (Strong general performance)\n",
|
| 22 |
-
"4. **GPT-5, GPT-5-mini**\n",
|
| 23 |
-
"5. **Claude Sonnet 4.5** (Best quality)\n",
|
| 24 |
-
"6. **Claude Opus 4.1**\n",
|
| 25 |
-
"7. **Phi-4-multimodal-instruct**\n",
|
| 26 |
-
"8. **gpt-oss-120b**"
|
| 27 |
-
]
|
| 28 |
-
},
|
| 29 |
-
{
|
| 30 |
-
"cell_type": "code",
|
| 31 |
-
"execution_count": 1,
|
| 32 |
-
"metadata": {},
|
| 33 |
-
"outputs": [],
|
| 34 |
-
"source": [
|
| 35 |
-
"# Install required packages\n",
|
| 36 |
-
"# !pip install openai pinecone-client sentence-transformers python-dotenv pandas matplotlib seaborn jiwer"
|
| 37 |
-
]
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"cell_type": "code",
|
| 41 |
-
"execution_count": 2,
|
| 42 |
-
"metadata": {},
|
| 43 |
-
"outputs": [
|
| 44 |
-
{
|
| 45 |
-
"name": "stderr",
|
| 46 |
-
"output_type": "stream",
|
| 47 |
-
"text": [
|
| 48 |
-
"/Users/ismatsamadov/SOCAR_Hackathon/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 49 |
-
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 50 |
-
]
|
| 51 |
-
},
|
| 52 |
-
{
|
| 53 |
-
"name": "stdout",
|
| 54 |
-
"output_type": "stream",
|
| 55 |
-
"text": [
|
| 56 |
-
"\u2705 Libraries loaded successfully\n"
|
| 57 |
-
]
|
| 58 |
-
}
|
| 59 |
-
],
|
| 60 |
-
"source": [
|
| 61 |
-
"import os\n",
|
| 62 |
-
"import json\n",
|
| 63 |
-
"import time\n",
|
| 64 |
-
"from typing import Dict, List, Tuple\n",
|
| 65 |
-
"from dotenv import load_dotenv\n",
|
| 66 |
-
"import pandas as pd\n",
|
| 67 |
-
"import matplotlib.pyplot as plt\n",
|
| 68 |
-
"import seaborn as sns\n",
|
| 69 |
-
"from openai import AzureOpenAI\n",
|
| 70 |
-
"from pinecone import Pinecone\n",
|
| 71 |
-
"from sentence_transformers import SentenceTransformer\n",
|
| 72 |
-
"from jiwer import wer, cer\n",
|
| 73 |
-
"\n",
|
| 74 |
-
"# Load environment variables\n",
|
| 75 |
-
"load_dotenv()\n",
|
| 76 |
-
"\n",
|
| 77 |
-
"# Set style\n",
|
| 78 |
-
"sns.set_style('whitegrid')\n",
|
| 79 |
-
"plt.rcParams['figure.figsize'] = (14, 8)\n",
|
| 80 |
-
"\n",
|
| 81 |
-
"print(\"\u2705 Libraries loaded successfully\")"
|
| 82 |
-
]
|
| 83 |
-
},
|
| 84 |
-
{
|
| 85 |
-
"cell_type": "code",
|
| 86 |
-
"execution_count": 3,
|
| 87 |
-
"metadata": {},
|
| 88 |
-
"outputs": [
|
| 89 |
-
{
|
| 90 |
-
"name": "stdout",
|
| 91 |
-
"output_type": "stream",
|
| 92 |
-
"text": [
|
| 93 |
-
"\u2705 Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
|
| 94 |
-
"\u2705 Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n",
|
| 95 |
-
"\u2705 Output directory: /Users/ismatsamadov/SOCAR_Hackathon/output\n"
|
| 96 |
-
]
|
| 97 |
-
}
|
| 98 |
-
],
|
| 99 |
-
"source": [
|
| 100 |
-
"# Auto-detect project root (works from any directory)\n",
|
| 101 |
-
"import os\n",
|
| 102 |
-
"from pathlib import Path\n",
|
| 103 |
-
"\n",
|
| 104 |
-
"if Path('data').exists() and Path('docs').exists():\n",
|
| 105 |
-
" # Already in project root\n",
|
| 106 |
-
" PROJECT_ROOT = Path.cwd()\n",
|
| 107 |
-
"elif Path('../data').exists() and Path('../docs').exists():\n",
|
| 108 |
-
" # In notebooks/ subdirectory\n",
|
| 109 |
-
" PROJECT_ROOT = Path.cwd().parent\n",
|
| 110 |
-
"else:\n",
|
| 111 |
-
" # Fallback: try to find project root\n",
|
| 112 |
-
" current = Path.cwd()\n",
|
| 113 |
-
" while current != current.parent:\n",
|
| 114 |
-
" if (current / 'data').exists() and (current / 'docs').exists():\n",
|
| 115 |
-
" PROJECT_ROOT = current\n",
|
| 116 |
-
" break\n",
|
| 117 |
-
" current = current.parent\n",
|
| 118 |
-
" else:\n",
|
| 119 |
-
" PROJECT_ROOT = Path.cwd()\n",
|
| 120 |
-
"\n",
|
| 121 |
-
"# Define all paths relative to project root\n",
|
| 122 |
-
"DATA_DIR = PROJECT_ROOT / 'data'\n",
|
| 123 |
-
"DOCS_DIR = PROJECT_ROOT / 'docs'\n",
|
| 124 |
-
"OUTPUT_DIR = PROJECT_ROOT / 'output'\n",
|
| 125 |
-
"\n",
|
| 126 |
-
"print(f\"\u2705 Project root: {PROJECT_ROOT}\")\n",
|
| 127 |
-
"print(f\"\u2705 Docs directory: {DOCS_DIR}\")\n",
|
| 128 |
-
"print(f\"\u2705 Output directory: {OUTPUT_DIR}\")"
|
| 129 |
-
]
|
| 130 |
-
},
|
| 131 |
-
{
|
| 132 |
-
"cell_type": "code",
|
| 133 |
-
"execution_count": 4,
|
| 134 |
-
"metadata": {},
|
| 135 |
-
"outputs": [
|
| 136 |
-
{
|
| 137 |
-
"name": "stdout",
|
| 138 |
-
"output_type": "stream",
|
| 139 |
-
"text": [
|
| 140 |
-
"Loaded 5 test cases\n",
|
| 141 |
-
"\n",
|
| 142 |
-
"Test Questions:\n",
|
| 143 |
-
"1. Example1: Daha az quyu il\u0259 daha \u00e7ox hasilat \u0259ld\u0259 etm\u0259k \u00fc\u00e7\u00fcn hans\u0131 \u0259sas amill\u0259rin inteqrasiyas\u0131 t\u0259l\u0259b olunur?...\n",
|
| 144 |
-
"2. Example2: Q\u0259rbi Ab\u015feron yata\u011f\u0131nda suvurma t\u0259dbirl\u0259ri hans\u0131 tarixd\u0259 v\u0259 hans\u0131 layda t\u0259tbiq edilmi\u015fdir v\u0259 bunun m...\n",
|
| 145 |
-
"3. Example3: Pirallah\u0131 strukturunda 1253 n\u00f6mr\u0259li quyudan g\u00f6t\u00fcr\u00fclm\u00fc\u015f n\u00fcmun\u0259l\u0259rd\u0259 SiO2 v\u0259 CaO oksidl\u0259ri aras\u0131nda ha...\n",
|
| 146 |
-
"4. Example4: Bak\u0131 arxipelaq\u0131 (BA) v\u0259 A\u015fa\u011f\u0131 K\u00fcr \u00e7\u00f6k\u0259kliyi (AK\u00c7) \u00fc\u00e7\u00fcn geotemperatur x\u0259rit\u0259l\u0259rin\u0259 \u0259sas\u0259n neft v\u0259 qaz...\n",
|
| 147 |
-
"5. Example5: Bu zonada hans\u0131 prosesl\u0259r ba\u015f verir?...\n"
|
| 148 |
-
]
|
| 149 |
-
}
|
| 150 |
-
],
|
| 151 |
-
"source": [
|
| 152 |
-
"# Load sample questions - using dynamic paths\n",
|
| 153 |
-
"with open(DOCS_DIR / 'sample_questions.json', 'r', encoding='utf-8') as f:\n",
|
| 154 |
-
" questions = json.load(f)\n",
|
| 155 |
-
"\n",
|
| 156 |
-
"# Load expected answers - using dynamic paths\n",
|
| 157 |
-
"with open(DOCS_DIR / 'sample_answers.json', 'r', encoding='utf-8') as f:\n",
|
| 158 |
-
" expected_answers = json.load(f)\n",
|
| 159 |
-
"\n",
|
| 160 |
-
"print(f\"Loaded {len(questions)} test cases\")\n",
|
| 161 |
-
"print(\"\\nTest Questions:\")\n",
|
| 162 |
-
"for i, (key, msgs) in enumerate(questions.items(), 1):\n",
|
| 163 |
-
" user_msg = [m for m in msgs if m['role'] == 'user'][-1]\n",
|
| 164 |
-
" print(f\"{i}. {key}: {user_msg['content'][:100]}...\")"
|
| 165 |
-
]
|
| 166 |
-
},
|
| 167 |
-
{
|
| 168 |
-
"cell_type": "markdown",
|
| 169 |
-
"metadata": {},
|
| 170 |
-
"source": [
|
| 171 |
-
"## 2. Initialize Vector Database and Embedding Model"
|
| 172 |
-
]
|
| 173 |
-
},
|
| 174 |
-
{
|
| 175 |
-
"cell_type": "code",
|
| 176 |
-
"execution_count": null,
|
| 177 |
-
"metadata": {},
|
| 178 |
-
"outputs": [],
|
| 179 |
-
"source": [
|
| 180 |
-
"# Initialize Pinecone\n",
|
| 181 |
-
"pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
|
| 182 |
-
"index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
|
| 183 |
-
"\n",
|
| 184 |
-
"# Initialize embedding model (same as used for ingestion)\n",
|
| 185 |
-
"embed_model = SentenceTransformer('BAAI/bge-large-en-v1.5')\n",
|
| 186 |
-
"\n",
|
| 187 |
-
"print(f\"\u2705 Vector DB connected: {index.describe_index_stats()}\")\n",
|
| 188 |
-
"print(f\"\u2705 Embedding model loaded: {embed_model}\")"
|
| 189 |
-
]
|
| 190 |
-
},
|
| 191 |
-
{
|
| 192 |
-
"cell_type": "markdown",
|
| 193 |
-
"metadata": {},
|
| 194 |
-
"source": [
|
| 195 |
-
"## 3. RAG Retrieval Function"
|
| 196 |
-
]
|
| 197 |
-
},
|
| 198 |
-
{
|
| 199 |
-
"cell_type": "code",
|
| 200 |
-
"execution_count": null,
|
| 201 |
-
"metadata": {},
|
| 202 |
-
"outputs": [],
|
| 203 |
-
"source": [
|
| 204 |
-
"def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
|
| 205 |
-
" \"\"\"\n",
|
| 206 |
-
" Retrieve relevant documents from vector database.\n",
|
| 207 |
-
" \"\"\"\n",
|
| 208 |
-
" # Generate query embedding\n",
|
| 209 |
-
" query_embedding = embed_model.encode(query).tolist()\n",
|
| 210 |
-
" \n",
|
| 211 |
-
" # Search vector DB\n",
|
| 212 |
-
" results = index.query(\n",
|
| 213 |
-
" vector=query_embedding,\n",
|
| 214 |
-
" top_k=top_k,\n",
|
| 215 |
-
" include_metadata=True\n",
|
| 216 |
-
" )\n",
|
| 217 |
-
" \n",
|
| 218 |
-
" # Extract documents\n",
|
| 219 |
-
" documents = []\n",
|
| 220 |
-
" for match in results['matches']:\n",
|
| 221 |
-
" documents.append({\n",
|
| 222 |
-
" 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
|
| 223 |
-
" 'page_number': match['metadata'].get('page_number', 0),\n",
|
| 224 |
-
" 'content': match['metadata'].get('text', ''),\n",
|
| 225 |
-
" 'score': match.get('score', 0.0)\n",
|
| 226 |
-
" })\n",
|
| 227 |
-
" \n",
|
| 228 |
-
" return documents\n",
|
| 229 |
-
"\n",
|
| 230 |
-
"# Test retrieval\n",
|
| 231 |
-
"test_query = \"Pal\u00e7\u0131q vulkanlar\u0131n\u0131n t\u0259sir radiusu n\u0259 q\u0259d\u0259rdir?\"\n",
|
| 232 |
-
"test_docs = retrieve_documents(test_query)\n",
|
| 233 |
-
"print(f\"\\n\u2705 Retrieved {len(test_docs)} documents for test query\")\n",
|
| 234 |
-
"print(f\"Top result: {test_docs[0]['pdf_name']}, page {test_docs[0]['page_number']} (score: {test_docs[0]['score']:.3f})\")"
|
| 235 |
-
]
|
| 236 |
-
},
|
| 237 |
-
{
|
| 238 |
-
"cell_type": "markdown",
|
| 239 |
-
"metadata": {},
|
| 240 |
-
"source": [
|
| 241 |
-
"## 4. LLM Client Functions"
|
| 242 |
-
]
|
| 243 |
-
},
|
| 244 |
-
{
|
| 245 |
-
"cell_type": "code",
|
| 246 |
-
"execution_count": null,
|
| 247 |
-
"metadata": {},
|
| 248 |
-
"outputs": [],
|
| 249 |
-
"source": [
|
| 250 |
-
"# Initialize Azure OpenAI\n",
|
| 251 |
-
"azure_client = AzureOpenAI(\n",
|
| 252 |
-
" api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
|
| 253 |
-
" api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
|
| 254 |
-
" azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
|
| 255 |
-
")\n",
|
| 256 |
-
"\n",
|
| 257 |
-
"LLM_MODELS = {\n",
|
| 258 |
-
" 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
|
| 259 |
-
" 'DeepSeek-R1': 'DeepSeek-R1',\n",
|
| 260 |
-
" 'GPT-4.1': 'gpt-4.1',\n",
|
| 261 |
-
" 'GPT-5-mini': 'gpt-5-mini',\n",
|
| 262 |
-
" 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n",
|
| 263 |
-
"}\n",
|
| 264 |
-
"\n",
|
| 265 |
-
"def generate_answer(model_name: str, query: str, documents: List[Dict], \n",
|
| 266 |
-
" temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n",
|
| 267 |
-
" \"\"\"\n",
|
| 268 |
-
" Generate answer using specified LLM model.\n",
|
| 269 |
-
" Returns: (answer, response_time)\n",
|
| 270 |
-
" \"\"\"\n",
|
| 271 |
-
" # Build context from retrieved documents\n",
|
| 272 |
-
" context_parts = []\n",
|
| 273 |
-
" for i, doc in enumerate(documents, 1):\n",
|
| 274 |
-
" context_parts.append(\n",
|
| 275 |
-
" f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n",
|
| 276 |
-
" )\n",
|
| 277 |
-
" context = \"\\n\\n\".join(context_parts)\n",
|
| 278 |
-
" \n",
|
| 279 |
-
" # Create prompt\n",
|
| 280 |
-
" prompt = f\"\"\"Siz SOCAR-\u0131n tarixi neft v\u0259 qaz s\u0259n\u0259dl\u0259ri \u00fczr\u0259 m\u00fct\u0259x\u0259ssis k\u00f6m\u0259k\u00e7isisiniz.\n",
|
| 281 |
-
"\n",
|
| 282 |
-
"Kontekst (\u0259laq\u0259li s\u0259n\u0259dl\u0259r):\n",
|
| 283 |
-
"{context}\n",
|
| 284 |
-
"\n",
|
| 285 |
-
"Sual: {query}\n",
|
| 286 |
-
"\n",
|
| 287 |
-
"\u018ftrafl\u0131 cavab verin v\u0259 m\u00fctl\u0259q s\u0259n\u0259d m\u0259nb\u0259l\u0259rin\u0259 istinad edin (PDF ad\u0131 v\u0259 s\u0259hif\u0259 n\u00f6mr\u0259si il\u0259).\n",
|
| 288 |
-
"Cavab\u0131n\u0131z d\u0259qiq, faktlara \u0259saslanan v\u0259 kontekst m\u0259lumatlar\u0131ndan istifad\u0259 ed\u0259n olmal\u0131d\u0131r.\"\"\"\n",
|
| 289 |
-
" \n",
|
| 290 |
-
" # Get model deployment\n",
|
| 291 |
-
" deployment = MODELS[model_name]['deployment']\n",
|
| 292 |
-
" \n",
|
| 293 |
-
" try:\n",
|
| 294 |
-
" start_time = time.time()\n",
|
| 295 |
-
" \n",
|
| 296 |
-
" # GPT-5 models use max_completion_tokens, others use max_tokens\n",
|
| 297 |
-
" if deployment.startswith('gpt-5'):\n",
|
| 298 |
-
" response = azure_client.chat.completions.create(\n",
|
| 299 |
-
" model=deployment,\n",
|
| 300 |
-
" messages=[\n",
|
| 301 |
-
" {\"role\": \"user\", \"content\": prompt}\n",
|
| 302 |
-
" ],\n",
|
| 303 |
-
" temperature=temperature,\n",
|
| 304 |
-
" max_completion_tokens=max_tokens\n",
|
| 305 |
-
" )\n",
|
| 306 |
-
" else:\n",
|
| 307 |
-
" response = azure_client.chat.completions.create(\n",
|
| 308 |
-
" model=deployment,\n",
|
| 309 |
-
" messages=[\n",
|
| 310 |
-
" {\"role\": \"user\", \"content\": prompt}\n",
|
| 311 |
-
" ],\n",
|
| 312 |
-
" temperature=temperature,\n",
|
| 313 |
-
" max_tokens=max_tokens\n",
|
| 314 |
-
" )\n",
|
| 315 |
-
" \n",
|
| 316 |
-
" response_time = time.time() - start_time\n",
|
| 317 |
-
" answer = response.choices[0].message.content\n",
|
| 318 |
-
" \n",
|
| 319 |
-
" return answer, response_time\n",
|
| 320 |
-
" \n",
|
| 321 |
-
" except Exception as e:\n",
|
| 322 |
-
" return f\"ERROR: {str(e)}\", 0.0\n",
|
| 323 |
-
"\n",
|
| 324 |
-
"print(f\"\\n\u2705 Configured {len(LLM_MODELS)} LLM models for testing\")"
|
| 325 |
-
]
|
| 326 |
-
},
|
| 327 |
-
{
|
| 328 |
-
"cell_type": "markdown",
|
| 329 |
-
"metadata": {},
|
| 330 |
-
"source": [
|
| 331 |
-
"## 5. Evaluation Metrics"
|
| 332 |
-
]
|
| 333 |
-
},
|
| 334 |
-
{
|
| 335 |
-
"cell_type": "code",
|
| 336 |
-
"execution_count": null,
|
| 337 |
-
"metadata": {},
|
| 338 |
-
"outputs": [],
|
| 339 |
-
"source": [
|
| 340 |
-
"def normalize_text(text: str) -> str:\n",
|
| 341 |
-
" \"\"\"Normalize text for comparison.\"\"\"\n",
|
| 342 |
-
" import re\n",
|
| 343 |
-
" text = text.lower().strip()\n",
|
| 344 |
-
" text = re.sub(r'\\s+', ' ', text)\n",
|
| 345 |
-
" return text\n",
|
| 346 |
-
"\n",
|
| 347 |
-
"def calculate_answer_similarity(reference: str, hypothesis: str) -> Dict[str, float]:\n",
|
| 348 |
-
" \"\"\"\n",
|
| 349 |
-
" Calculate similarity between generated and expected answer.\n",
|
| 350 |
-
" Lower is better for error rates.\n",
|
| 351 |
-
" \"\"\"\n",
|
| 352 |
-
" ref_norm = normalize_text(reference)\n",
|
| 353 |
-
" hyp_norm = normalize_text(hypothesis)\n",
|
| 354 |
-
" \n",
|
| 355 |
-
" # Character Error Rate\n",
|
| 356 |
-
" cer_score = cer(ref_norm, hyp_norm) * 100\n",
|
| 357 |
-
" \n",
|
| 358 |
-
" # Word Error Rate \n",
|
| 359 |
-
" wer_score = wer(ref_norm, hyp_norm) * 100\n",
|
| 360 |
-
" \n",
|
| 361 |
-
" # Similarity scores (higher is better)\n",
|
| 362 |
-
" similarity = max(0, 100 - wer_score)\n",
|
| 363 |
-
" \n",
|
| 364 |
-
" return {\n",
|
| 365 |
-
" 'CER': round(cer_score, 2),\n",
|
| 366 |
-
" 'WER': round(wer_score, 2),\n",
|
| 367 |
-
" 'Similarity': round(similarity, 2)\n",
|
| 368 |
-
" }\n",
|
| 369 |
-
"\n",
|
| 370 |
-
"def check_citations(answer: str, documents: List[Dict]) -> Dict[str, any]:\n",
|
| 371 |
-
" \"\"\"\n",
|
| 372 |
-
" Check if answer includes proper citations.\n",
|
| 373 |
-
" \"\"\"\n",
|
| 374 |
-
" import re\n",
|
| 375 |
-
" \n",
|
| 376 |
-
" # Check for PDF names\n",
|
| 377 |
-
" pdf_names = [doc['pdf_name'] for doc in documents]\n",
|
| 378 |
-
" cited_pdfs = sum(1 for pdf in pdf_names if pdf.replace('.pdf', '') in answer)\n",
|
| 379 |
-
" \n",
|
| 380 |
-
" # Check for page numbers\n",
|
| 381 |
-
" page_numbers = [str(doc['page_number']) for doc in documents]\n",
|
| 382 |
-
" cited_pages = sum(1 for page in page_numbers if page in answer)\n",
|
| 383 |
-
" \n",
|
| 384 |
-
" # Check for source keywords\n",
|
| 385 |
-
" source_keywords = ['m\u0259nb\u0259', 's\u0259n\u0259d', 's\u0259hif\u0259', 'pdf', 'document', 'page', 'source']\n",
|
| 386 |
-
" has_source_ref = any(kw in answer.lower() for kw in source_keywords)\n",
|
| 387 |
-
" \n",
|
| 388 |
-
" citation_score = (\n",
|
| 389 |
-
" (cited_pdfs / len(pdf_names) * 40) + # 40% for PDF citation\n",
|
| 390 |
-
" (cited_pages / len(page_numbers) * 40) + # 40% for page citation\n",
|
| 391 |
-
" (20 if has_source_ref else 0) # 20% for having source keywords\n",
|
| 392 |
-
" )\n",
|
| 393 |
-
" \n",
|
| 394 |
-
" return {\n",
|
| 395 |
-
" 'Citation_Score': round(citation_score, 2),\n",
|
| 396 |
-
" 'Cited_PDFs': cited_pdfs,\n",
|
| 397 |
-
" 'Cited_Pages': cited_pages,\n",
|
| 398 |
-
" 'Has_Source_Reference': has_source_ref\n",
|
| 399 |
-
" }\n",
|
| 400 |
-
"\n",
|
| 401 |
-
"def evaluate_completeness(answer: str, min_length: int = 100) -> Dict[str, any]:\n",
|
| 402 |
-
" \"\"\"\n",
|
| 403 |
-
" Evaluate answer completeness.\n",
|
| 404 |
-
" \"\"\"\n",
|
| 405 |
-
" word_count = len(answer.split())\n",
|
| 406 |
-
" char_count = len(answer)\n",
|
| 407 |
-
" \n",
|
| 408 |
-
" # Penalize very short or very long answers\n",
|
| 409 |
-
" if char_count < min_length:\n",
|
| 410 |
-
" completeness_score = (char_count / min_length) * 100\n",
|
| 411 |
-
" elif char_count > 2000:\n",
|
| 412 |
-
" completeness_score = 100 - ((char_count - 2000) / 2000 * 20) # Penalty for verbosity\n",
|
| 413 |
-
" else:\n",
|
| 414 |
-
" completeness_score = 100\n",
|
| 415 |
-
" \n",
|
| 416 |
-
" return {\n",
|
| 417 |
-
" 'Completeness_Score': round(max(0, completeness_score), 2),\n",
|
| 418 |
-
" 'Word_Count': word_count,\n",
|
| 419 |
-
" 'Char_Count': char_count\n",
|
| 420 |
-
" }\n",
|
| 421 |
-
"\n",
|
| 422 |
-
"print(\"\u2705 Evaluation functions ready\")"
|
| 423 |
-
]
|
| 424 |
-
},
|
| 425 |
-
{
|
| 426 |
-
"cell_type": "markdown",
|
| 427 |
-
"metadata": {},
|
| 428 |
-
"source": [
|
| 429 |
-
"## 6. Run Benchmark on All Models"
|
| 430 |
-
]
|
| 431 |
-
},
|
| 432 |
-
{
|
| 433 |
-
"cell_type": "code",
|
| 434 |
-
"execution_count": null,
|
| 435 |
-
"metadata": {},
|
| 436 |
-
"outputs": [],
|
| 437 |
-
"source": [
|
| 438 |
-
"# Select models to test (you can comment out models to skip)\n",
|
| 439 |
-
"MODELS_TO_TEST = [\n",
|
| 440 |
-
" 'Llama-4-Maverick-17B',\n",
|
| 441 |
-
" 'DeepSeek-R1',\n",
|
| 442 |
-
" 'GPT-4.1',\n",
|
| 443 |
-
" 'GPT-5-mini',\n",
|
| 444 |
-
" 'Claude-Sonnet-4.5',\n",
|
| 445 |
-
" # 'Claude-Opus-4.1', # Uncomment to test\n",
|
| 446 |
-
" # 'Phi-4-multimodal', # Uncomment to test\n",
|
| 447 |
-
" # 'GPT-OSS-120B', # Uncomment to test\n",
|
| 448 |
-
"]\n",
|
| 449 |
-
"\n",
|
| 450 |
-
"print(f\"Testing {len(MODELS_TO_TEST)} models on {len(questions)} questions...\\n\")\n",
|
| 451 |
-
"print(\"This may take several minutes...\\n\")"
|
| 452 |
-
]
|
| 453 |
-
},
|
| 454 |
-
{
|
| 455 |
-
"cell_type": "code",
|
| 456 |
-
"execution_count": null,
|
| 457 |
-
"metadata": {},
|
| 458 |
-
"outputs": [],
|
| 459 |
-
"source": [
|
| 460 |
-
"# Run benchmark\n",
|
| 461 |
-
"results = []\n",
|
| 462 |
-
"\n",
|
| 463 |
-
"for model_name in MODELS_TO_TEST:\n",
|
| 464 |
-
" print(f\"\\n{'='*80}\")\n",
|
| 465 |
-
" print(f\"Testing: {model_name}\")\n",
|
| 466 |
-
" print(f\"{'='*80}\")\n",
|
| 467 |
-
" \n",
|
| 468 |
-
" model_results = []\n",
|
| 469 |
-
" \n",
|
| 470 |
-
" for example_key, messages in questions.items():\n",
|
| 471 |
-
" # Get the last user message (the actual question)\n",
|
| 472 |
-
" user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
|
| 473 |
-
" query = user_msg['content']\n",
|
| 474 |
-
" \n",
|
| 475 |
-
" print(f\"\\n Question {example_key}: {query[:80]}...\")\n",
|
| 476 |
-
" \n",
|
| 477 |
-
" # Retrieve documents\n",
|
| 478 |
-
" documents = retrieve_documents(query, top_k=3)\n",
|
| 479 |
-
" \n",
|
| 480 |
-
" # Generate answer\n",
|
| 481 |
-
" answer, response_time = generate_answer(model_name, query, documents)\n",
|
| 482 |
-
" \n",
|
| 483 |
-
" if answer.startswith('ERROR'):\n",
|
| 484 |
-
" print(f\" \u274c Failed: {answer}\")\n",
|
| 485 |
-
" continue\n",
|
| 486 |
-
" \n",
|
| 487 |
-
" print(f\" \u2705 Response time: {response_time:.2f}s\")\n",
|
| 488 |
-
" \n",
|
| 489 |
-
" # Get expected answer\n",
|
| 490 |
-
" expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
|
| 491 |
-
" \n",
|
| 492 |
-
" # Calculate metrics\n",
|
| 493 |
-
" similarity_metrics = calculate_answer_similarity(expected, answer) if expected else {'CER': 0, 'WER': 0, 'Similarity': 0}\n",
|
| 494 |
-
" citation_metrics = check_citations(answer, documents)\n",
|
| 495 |
-
" completeness_metrics = evaluate_completeness(answer)\n",
|
| 496 |
-
" \n",
|
| 497 |
-
" # Store result\n",
|
| 498 |
-
" result = {\n",
|
| 499 |
-
" 'Model': model_name,\n",
|
| 500 |
-
" 'Question': example_key,\n",
|
| 501 |
-
" 'Query': query[:100],\n",
|
| 502 |
-
" 'Answer': answer[:200] + '...',\n",
|
| 503 |
-
" 'Response_Time': round(response_time, 2),\n",
|
| 504 |
-
" **similarity_metrics,\n",
|
| 505 |
-
" **citation_metrics,\n",
|
| 506 |
-
" **completeness_metrics,\n",
|
| 507 |
-
" 'Open_Source': MODELS[model_name]['open_source'],\n",
|
| 508 |
-
" 'Architecture_Score': MODELS[model_name]['architecture_score']\n",
|
| 509 |
-
" }\n",
|
| 510 |
-
" \n",
|
| 511 |
-
" model_results.append(result)\n",
|
| 512 |
-
" results.append(result)\n",
|
| 513 |
-
" \n",
|
| 514 |
-
" # Show summary for this model\n",
|
| 515 |
-
" if model_results:\n",
|
| 516 |
-
" avg_response_time = sum(r['Response_Time'] for r in model_results) / len(model_results)\n",
|
| 517 |
-
" avg_similarity = sum(r['Similarity'] for r in model_results) / len(model_results)\n",
|
| 518 |
-
" avg_citation = sum(r['Citation_Score'] for r in model_results) / len(model_results)\n",
|
| 519 |
-
" avg_completeness = sum(r['Completeness_Score'] for r in model_results) / len(model_results)\n",
|
| 520 |
-
" \n",
|
| 521 |
-
" print(f\"\\n \ud83d\udcca {model_name} Summary:\")\n",
|
| 522 |
-
" print(f\" Avg Response Time: {avg_response_time:.2f}s\")\n",
|
| 523 |
-
" print(f\" Avg Similarity: {avg_similarity:.1f}%\")\n",
|
| 524 |
-
" print(f\" Avg Citation Score: {avg_citation:.1f}%\")\n",
|
| 525 |
-
" print(f\" Avg Completeness: {avg_completeness:.1f}%\")\n",
|
| 526 |
-
"\n",
|
| 527 |
-
"print(f\"\\n{'='*80}\")\n",
|
| 528 |
-
"print(\"\u2705 Benchmarking complete!\")\n",
|
| 529 |
-
"print(f\"{'='*80}\")"
|
| 530 |
-
]
|
| 531 |
-
},
|
| 532 |
-
{
|
| 533 |
-
"cell_type": "markdown",
|
| 534 |
-
"metadata": {},
|
| 535 |
-
"source": [
|
| 536 |
-
"## 7. Aggregate Results and Rankings"
|
| 537 |
-
]
|
| 538 |
-
},
|
| 539 |
-
{
|
| 540 |
-
"cell_type": "code",
|
| 541 |
-
"execution_count": null,
|
| 542 |
-
"metadata": {},
|
| 543 |
-
"outputs": [],
|
| 544 |
-
"source": [
|
| 545 |
-
"# Create DataFrame\n",
|
| 546 |
-
"df = pd.DataFrame(results)\n",
|
| 547 |
-
"\n",
|
| 548 |
-
"# Calculate aggregate scores per model\n",
|
| 549 |
-
"model_summary = df.groupby('Model').agg({\n",
|
| 550 |
-
" 'Response_Time': 'mean',\n",
|
| 551 |
-
" 'Similarity': 'mean',\n",
|
| 552 |
-
" 'Citation_Score': 'mean',\n",
|
| 553 |
-
" 'Completeness_Score': 'mean',\n",
|
| 554 |
-
" 'CER': 'mean',\n",
|
| 555 |
-
" 'WER': 'mean',\n",
|
| 556 |
-
" 'Open_Source': 'first',\n",
|
| 557 |
-
" 'Architecture_Score': 'first'\n",
|
| 558 |
-
"}).round(2)\n",
|
| 559 |
-
"\n",
|
| 560 |
-
"# Calculate overall quality score (weighted average)\n",
|
| 561 |
-
"model_summary['Quality_Score'] = (\n",
|
| 562 |
-
" model_summary['Similarity'] * 0.35 + # 35% answer accuracy\n",
|
| 563 |
-
" model_summary['Citation_Score'] * 0.35 + # 35% citation quality\n",
|
| 564 |
-
" model_summary['Completeness_Score'] * 0.30 # 30% completeness\n",
|
| 565 |
-
").round(2)\n",
|
| 566 |
-
"\n",
|
| 567 |
-
"# Sort by Quality Score\n",
|
| 568 |
-
"model_summary = model_summary.sort_values('Quality_Score', ascending=False)\n",
|
| 569 |
-
"\n",
|
| 570 |
-
"# Display summary table\n",
|
| 571 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 572 |
-
"print(\"\ud83d\udcca LLM BENCHMARKING RESULTS - MODEL SUMMARY\")\n",
|
| 573 |
-
"print(\"=\"*100)\n",
|
| 574 |
-
"print(model_summary.to_string())\n",
|
| 575 |
-
"print(\"=\"*100)"
|
| 576 |
-
]
|
| 577 |
-
},
|
| 578 |
-
{
|
| 579 |
-
"cell_type": "markdown",
|
| 580 |
-
"metadata": {},
|
| 581 |
-
"source": [
|
| 582 |
-
"# Create comprehensive visualization\n",
|
| 583 |
-
"import os\n",
|
| 584 |
-
"from pathlib import Path\n",
|
| 585 |
-
"\n",
|
| 586 |
-
"# Create output directory - using dynamic path\n",
|
| 587 |
-
"output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
|
| 588 |
-
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
| 589 |
-
"\n",
|
| 590 |
-
"fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
|
| 591 |
-
"\n",
|
| 592 |
-
"models = df['Model'].tolist()\n",
|
| 593 |
-
"colors = sns.color_palette('viridis', len(models))\n",
|
| 594 |
-
"\n",
|
| 595 |
-
"# 1. CSR - Character Success Rate (MAIN METRIC)\n",
|
| 596 |
-
"ax1 = axes[0, 0]\n",
|
| 597 |
-
"bars1 = ax1.barh(models, df['CSR'], color=colors)\n",
|
| 598 |
-
"ax1.set_xlabel('CSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
|
| 599 |
-
"ax1.set_title('Character Success Rate (CSR)\\n\ud83c\udfc6 HACKATHON PRIMARY METRIC', \n",
|
| 600 |
-
" fontsize=14, fontweight='bold')\n",
|
| 601 |
-
"ax1.set_xlim(0, 100)\n",
|
| 602 |
-
"for i, (model, csr) in enumerate(zip(models, df['CSR'])):\n",
|
| 603 |
-
" ax1.text(csr + 1, i, f'{csr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
|
| 604 |
-
"ax1.axvline(x=90, color='green', linestyle='--', alpha=0.3, label='Excellent (>90%)')\n",
|
| 605 |
-
"ax1.axvline(x=80, color='orange', linestyle='--', alpha=0.3, label='Good (>80%)')\n",
|
| 606 |
-
"ax1.legend(fontsize=9)\n",
|
| 607 |
-
"\n",
|
| 608 |
-
"# 2. WSR - Word Success Rate\n",
|
| 609 |
-
"ax2 = axes[0, 1]\n",
|
| 610 |
-
"bars2 = ax2.barh(models, df['WSR'], color=colors)\n",
|
| 611 |
-
"ax2.set_xlabel('WSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
|
| 612 |
-
"ax2.set_title('Word Success Rate (WSR)', fontsize=14, fontweight='bold')\n",
|
| 613 |
-
"ax2.set_xlim(0, 100)\n",
|
| 614 |
-
"for i, (model, wsr) in enumerate(zip(models, df['WSR'])):\n",
|
| 615 |
-
" ax2.text(wsr + 1, i, f'{wsr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
|
| 616 |
-
"\n",
|
| 617 |
-
"# 3. Response Time\n",
|
| 618 |
-
"ax3 = axes[1, 0]\n",
|
| 619 |
-
"bars3 = ax3.barh(models, df['Response_Time'], color=colors)\n",
|
| 620 |
-
"ax3.set_xlabel('Total Time (seconds) - Lower is Better', fontsize=12, fontweight='bold')\n",
|
| 621 |
-
"ax3.set_title('Processing Speed', fontsize=14, fontweight='bold')\n",
|
| 622 |
-
"for i, (model, time_val) in enumerate(zip(models, df['Response_Time'])):\n",
|
| 623 |
-
" ax3.text(time_val + 0.5, i, f'{time_val:.1f}s', va='center', fontsize=11)\n",
|
| 624 |
-
"\n",
|
| 625 |
-
"# 4. Error Rates Comparison\n",
|
| 626 |
-
"ax4 = axes[1, 1]\n",
|
| 627 |
-
"x = range(len(models))\n",
|
| 628 |
-
"width = 0.35\n",
|
| 629 |
-
"ax4.bar([i - width/2 for i in x], df['CER'], width, label='CER', color='coral', alpha=0.8)\n",
|
| 630 |
-
"ax4.bar([i + width/2 for i in x], df['WER'], width, label='WER', color='skyblue', alpha=0.8)\n",
|
| 631 |
-
"ax4.set_ylabel('Error Rate (%) - Lower is Better', fontsize=12, fontweight='bold')\n",
|
| 632 |
-
"ax4.set_title('Error Rates', fontsize=14, fontweight='bold')\n",
|
| 633 |
-
"ax4.set_xticks(x)\n",
|
| 634 |
-
"ax4.set_xticklabels(models, rotation=45, ha='right')\n",
|
| 635 |
-
"ax4.legend(fontsize=11)\n",
|
| 636 |
-
"ax4.grid(axis='y', alpha=0.3)\n",
|
| 637 |
-
"\n",
|
| 638 |
-
"plt.tight_layout()\n",
|
| 639 |
-
"plt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\n",
|
| 640 |
-
"plt.show()\n",
|
| 641 |
-
"\n",
|
| 642 |
-
"print(f\"\\n\u2705 Visualization saved to '{output_dir}/results.png'\")"
|
| 643 |
-
]
|
| 644 |
-
},
|
| 645 |
-
{
|
| 646 |
-
"cell_type": "code",
|
| 647 |
-
"execution_count": null,
|
| 648 |
-
"metadata": {},
|
| 649 |
-
"outputs": [],
|
| 650 |
-
"source": [
|
| 651 |
-
"# Create rankings table\n",
|
| 652 |
-
"rankings = model_summary[[\n",
|
| 653 |
-
" 'Quality_Score', 'Similarity', 'Citation_Score', 'Completeness_Score', \n",
|
| 654 |
-
" 'Response_Time', 'Open_Source', 'Architecture_Score'\n",
|
| 655 |
-
"]].copy()\n",
|
| 656 |
-
"\n",
|
| 657 |
-
"rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
|
| 658 |
-
"\n",
|
| 659 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 660 |
-
"print(\"\ud83c\udfc6 FINAL RANKINGS\")\n",
|
| 661 |
-
"print(\"=\"*100)\n",
|
| 662 |
-
"print(rankings.to_string())\n",
|
| 663 |
-
"print(\"=\"*100)\n",
|
| 664 |
-
"\n",
|
| 665 |
-
"# Winner analysis\n",
|
| 666 |
-
"best_overall = rankings.index[0]\n",
|
| 667 |
-
"best_open_source = rankings[rankings['Open_Source'] == True].index[0] if any(rankings['Open_Source']) else None\n",
|
| 668 |
-
"fastest = model_summary['Response_Time'].idxmin()\n",
|
| 669 |
-
"\n",
|
| 670 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 671 |
-
"print(\"\ud83d\udca1 RECOMMENDATIONS FOR HACKATHON\")\n",
|
| 672 |
-
"print(\"=\"*100)\n",
|
| 673 |
-
"\n",
|
| 674 |
-
"print(f\"\\n\ud83e\udd47 Best Overall Quality: {best_overall}\")\n",
|
| 675 |
-
"print(f\" Quality Score: {model_summary.loc[best_overall, 'Quality_Score']:.1f}%\")\n",
|
| 676 |
-
"print(f\" Similarity: {model_summary.loc[best_overall, 'Similarity']:.1f}%\")\n",
|
| 677 |
-
"print(f\" Citation Score: {model_summary.loc[best_overall, 'Citation_Score']:.1f}%\")\n",
|
| 678 |
-
"print(f\" Response Time: {model_summary.loc[best_overall, 'Response_Time']:.2f}s\")\n",
|
| 679 |
-
"print(f\" Open Source: {model_summary.loc[best_overall, 'Open_Source']}\")\n",
|
| 680 |
-
"print(f\" Architecture Score: {model_summary.loc[best_overall, 'Architecture_Score']}\")\n",
|
| 681 |
-
"\n",
|
| 682 |
-
"if best_open_source:\n",
|
| 683 |
-
" print(f\"\\n\ud83d\udd13 Best Open-Source Model: {best_open_source}\")\n",
|
| 684 |
-
" print(f\" Quality Score: {model_summary.loc[best_open_source, 'Quality_Score']:.1f}%\")\n",
|
| 685 |
-
" print(f\" Architecture Score: {model_summary.loc[best_open_source, 'Architecture_Score']} (Better for hackathon!)\")\n",
|
| 686 |
-
" print(f\" Response Time: {model_summary.loc[best_open_source, 'Response_Time']:.2f}s\")\n",
|
| 687 |
-
"\n",
|
| 688 |
-
"print(f\"\\n\u26a1 Fastest Model: {fastest}\")\n",
|
| 689 |
-
"print(f\" Response Time: {model_summary.loc[fastest, 'Response_Time']:.2f}s\")\n",
|
| 690 |
-
"print(f\" Quality Score: {model_summary.loc[fastest, 'Quality_Score']:.1f}%\")\n",
|
| 691 |
-
"\n",
|
| 692 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 693 |
-
"print(\"\ud83d\udcdd FINAL RECOMMENDATION\")\n",
|
| 694 |
-
"print(\"=\"*100)\n",
|
| 695 |
-
"print(\"\\nScoring Breakdown:\")\n",
|
| 696 |
-
"print(\" - LLM Quality: 30% of total hackathon score\")\n",
|
| 697 |
-
"print(\" - Architecture: 20% of total hackathon score (open-source preferred!)\")\n",
|
| 698 |
-
"print(\"\\nBest Choice:\")\n",
|
| 699 |
-
"if best_open_source and model_summary.loc[best_open_source, 'Quality_Score'] >= model_summary.loc[best_overall, 'Quality_Score'] * 0.9:\n",
|
| 700 |
-
" print(f\" \u2705 {best_open_source} - Best balance of quality and architecture score\")\n",
|
| 701 |
-
" print(f\" Only {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality drop for higher architecture score!\")\n",
|
| 702 |
-
"else:\n",
|
| 703 |
-
" print(f\" \u2705 {best_overall} - Highest quality, use if quality gap is significant\")\n",
|
| 704 |
-
" if best_open_source:\n",
|
| 705 |
-
" print(f\" \u26a0\ufe0f Consider {best_open_source} for higher architecture score (trade-off: {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality)\")\n",
|
| 706 |
-
"\n",
|
| 707 |
-
"print(\"=\"*100)"
|
| 708 |
-
]
|
| 709 |
-
},
|
| 710 |
-
{
|
| 711 |
-
"cell_type": "markdown",
|
| 712 |
-
"metadata": {},
|
| 713 |
-
"source": [
|
| 714 |
-
"# Save results\n",
|
| 715 |
-
"from pathlib import Path\n",
|
| 716 |
-
"\n",
|
| 717 |
-
"# Using dynamic path\n",
|
| 718 |
-
"output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
|
| 719 |
-
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
| 720 |
-
"\n",
|
| 721 |
-
"df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
|
| 722 |
-
"model_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n",
|
| 723 |
-
"rankings.to_csv(output_dir / 'rankings.csv', index=False, encoding='utf-8')\n",
|
| 724 |
-
"\n",
|
| 725 |
-
"print(\"\\n\u2705 Results exported to output/llm_benchmark/:\")\n",
|
| 726 |
-
"print(\" - detailed_results.csv (all questions and answers)\")\n",
|
| 727 |
-
"print(\" - summary.csv (model averages)\")\n",
|
| 728 |
-
"print(\" - rankings.csv (final rankings)\")\n",
|
| 729 |
-
"print(\" - results.png (visualizations)\")"
|
| 730 |
-
]
|
| 731 |
-
},
|
| 732 |
-
{
|
| 733 |
-
"cell_type": "markdown",
|
| 734 |
-
"metadata": {},
|
| 735 |
-
"source": [
|
| 736 |
-
"## 11. Sample Answer Comparison"
|
| 737 |
-
]
|
| 738 |
-
}
|
| 739 |
-
],
|
| 740 |
-
"metadata": {
|
| 741 |
-
"kernelspec": {
|
| 742 |
-
"display_name": "venv",
|
| 743 |
-
"language": "python",
|
| 744 |
-
"name": "python3"
|
| 745 |
-
},
|
| 746 |
-
"language_info": {
|
| 747 |
-
"codemirror_mode": {
|
| 748 |
-
"name": "ipython",
|
| 749 |
-
"version": 3
|
| 750 |
-
},
|
| 751 |
-
"file_extension": ".py",
|
| 752 |
-
"mimetype": "text/x-python",
|
| 753 |
-
"name": "python",
|
| 754 |
-
"nbconvert_exporter": "python",
|
| 755 |
-
"pygments_lexer": "ipython3",
|
| 756 |
-
"version": "3.10.12"
|
| 757 |
-
}
|
| 758 |
-
},
|
| 759 |
-
"nbformat": 4,
|
| 760 |
-
"nbformat_minor": 4
|
| 761 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/rag_optimization_benchmark.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/rag_optimization_benchmark.ipynb.backup
DELETED
|
@@ -1,1072 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cells": [
|
| 3 |
-
{
|
| 4 |
-
"cell_type": "markdown",
|
| 5 |
-
"metadata": {},
|
| 6 |
-
"source": [
|
| 7 |
-
"# RAG Pipeline Optimization Benchmark\n",
|
| 8 |
-
"\n",
|
| 9 |
-
"**Comprehensive testing of ALL RAG components to maximize LLM Judge score**\n",
|
| 10 |
-
"\n",
|
| 11 |
-
"## What We're Testing:\n",
|
| 12 |
-
"\n",
|
| 13 |
-
"### 1. Embedding Models (Vector Representations)\n",
|
| 14 |
-
"- `BAAI/bge-large-en-v1.5` (Current - 1024 dim, best quality)\n",
|
| 15 |
-
"- `BAAI/bge-base-en-v1.5` (768 dim, faster)\n",
|
| 16 |
-
"- `intfloat/multilingual-e5-large` (1024 dim, multi-language)\n",
|
| 17 |
-
"- `sentence-transformers/paraphrase-multilingual-mpnet-base-v2` (768 dim, multilingual)\n",
|
| 18 |
-
"- `sentence-transformers/all-MiniLM-L6-v2` (384 dim, very fast)\n",
|
| 19 |
-
"\n",
|
| 20 |
-
"### 2. Retrieval Strategies\n",
|
| 21 |
-
"- **Top-K**: Test 1, 3, 5, 10 documents\n",
|
| 22 |
-
"- **MMR** (Maximal Marginal Relevance): Diversity vs relevance trade-off\n",
|
| 23 |
-
"- **Similarity Threshold**: Filter low-relevance docs\n",
|
| 24 |
-
"- **Reranking**: Use cross-encoder to rerank results\n",
|
| 25 |
-
"\n",
|
| 26 |
-
"### 3. Chunking Strategies (Already in Vector DB, but we'll compare)\n",
|
| 27 |
-
"- Chunk size: 256, 512, 600 (current), 1000 tokens\n",
|
| 28 |
-
"- Overlap: 0, 50, 100 (current), 200 chars\n",
|
| 29 |
-
"\n",
|
| 30 |
-
"### 4. LLM Models\n",
|
| 31 |
-
"- Llama-4-Maverick-17B (open-source)\n",
|
| 32 |
-
"- DeepSeek-R1 (reasoning)\n",
|
| 33 |
-
"- GPT-4.1, GPT-5, GPT-5-mini\n",
|
| 34 |
-
"- Claude-Sonnet-4.5\n",
|
| 35 |
-
"\n",
|
| 36 |
-
"### 5. Prompting Techniques\n",
|
| 37 |
-
"- **Baseline**: Simple context + question\n",
|
| 38 |
-
"- **Citation-focused**: Emphasize source references\n",
|
| 39 |
-
"- **Step-by-step**: Chain-of-thought reasoning\n",
|
| 40 |
-
"- **Few-shot**: Include example Q&A\n",
|
| 41 |
-
"\n",
|
| 42 |
-
"## LLM Judge Evaluation Criteria:\n",
|
| 43 |
-
"- **Accuracy** (35%): Answer correctness\n",
|
| 44 |
-
"- **Relevance** (35%): Citation quality and relevance\n",
|
| 45 |
-
"- **Completeness** (30%): Thorough answers"
|
| 46 |
-
]
|
| 47 |
-
},
|
| 48 |
-
{
|
| 49 |
-
"cell_type": "code",
|
| 50 |
-
"execution_count": 1,
|
| 51 |
-
"metadata": {},
|
| 52 |
-
"outputs": [],
|
| 53 |
-
"source": [
|
| 54 |
-
"# !pip install openai pinecone-client sentence-transformers rank-bm25 python-dotenv pandas matplotlib seaborn jiwer"
|
| 55 |
-
]
|
| 56 |
-
},
|
| 57 |
-
{
|
| 58 |
-
"cell_type": "code",
|
| 59 |
-
"execution_count": 2,
|
| 60 |
-
"metadata": {},
|
| 61 |
-
"outputs": [
|
| 62 |
-
{
|
| 63 |
-
"name": "stderr",
|
| 64 |
-
"output_type": "stream",
|
| 65 |
-
"text": [
|
| 66 |
-
"/Users/ismatsamadov/SOCAR_Hackathon/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 67 |
-
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 68 |
-
]
|
| 69 |
-
},
|
| 70 |
-
{
|
| 71 |
-
"name": "stdout",
|
| 72 |
-
"output_type": "stream",
|
| 73 |
-
"text": [
|
| 74 |
-
"✅ Libraries loaded\n"
|
| 75 |
-
]
|
| 76 |
-
}
|
| 77 |
-
],
|
| 78 |
-
"source": [
|
| 79 |
-
"import os\n",
|
| 80 |
-
"import json\n",
|
| 81 |
-
"import time\n",
|
| 82 |
-
"import re\n",
|
| 83 |
-
"from typing import Dict, List, Tuple, Any\n",
|
| 84 |
-
"from collections import defaultdict\n",
|
| 85 |
-
"from dotenv import load_dotenv\n",
|
| 86 |
-
"\n",
|
| 87 |
-
"import pandas as pd\n",
|
| 88 |
-
"import matplotlib.pyplot as plt\n",
|
| 89 |
-
"import seaborn as sns\n",
|
| 90 |
-
"from openai import AzureOpenAI\n",
|
| 91 |
-
"from pinecone import Pinecone\n",
|
| 92 |
-
"from sentence_transformers import SentenceTransformer, CrossEncoder\n",
|
| 93 |
-
"from jiwer import wer, cer\n",
|
| 94 |
-
"import numpy as np\n",
|
| 95 |
-
"\n",
|
| 96 |
-
"load_dotenv()\n",
|
| 97 |
-
"\n",
|
| 98 |
-
"sns.set_style('whitegrid')\n",
|
| 99 |
-
"plt.rcParams['figure.figsize'] = (16, 10)\n",
|
| 100 |
-
"\n",
|
| 101 |
-
"print(\"✅ Libraries loaded\")"
|
| 102 |
-
]
|
| 103 |
-
},
|
| 104 |
-
{
|
| 105 |
-
"cell_type": "code",
|
| 106 |
-
"execution_count": 3,
|
| 107 |
-
"metadata": {},
|
| 108 |
-
"outputs": [
|
| 109 |
-
{
|
| 110 |
-
"name": "stdout",
|
| 111 |
-
"output_type": "stream",
|
| 112 |
-
"text": [
|
| 113 |
-
"✅ Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
|
| 114 |
-
"✅ Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n",
|
| 115 |
-
"✅ Output directory: /Users/ismatsamadov/SOCAR_Hackathon/output\n"
|
| 116 |
-
]
|
| 117 |
-
}
|
| 118 |
-
],
|
| 119 |
-
"source": [
|
| 120 |
-
"# Auto-detect project root (works from any directory)\n",
|
| 121 |
-
"import os\n",
|
| 122 |
-
"from pathlib import Path\n",
|
| 123 |
-
"\n",
|
| 124 |
-
"if Path('data').exists() and Path('docs').exists():\n",
|
| 125 |
-
" # Already in project root\n",
|
| 126 |
-
" PROJECT_ROOT = Path.cwd()\n",
|
| 127 |
-
"elif Path('../data').exists() and Path('../docs').exists():\n",
|
| 128 |
-
" # In notebooks/ subdirectory\n",
|
| 129 |
-
" PROJECT_ROOT = Path.cwd().parent\n",
|
| 130 |
-
"else:\n",
|
| 131 |
-
" # Fallback: try to find project root\n",
|
| 132 |
-
" current = Path.cwd()\n",
|
| 133 |
-
" while current != current.parent:\n",
|
| 134 |
-
" if (current / 'data').exists() and (current / 'docs').exists():\n",
|
| 135 |
-
" PROJECT_ROOT = current\n",
|
| 136 |
-
" break\n",
|
| 137 |
-
" current = current.parent\n",
|
| 138 |
-
" else:\n",
|
| 139 |
-
" PROJECT_ROOT = Path.cwd()\n",
|
| 140 |
-
"\n",
|
| 141 |
-
"# Define all paths relative to project root\n",
|
| 142 |
-
"DATA_DIR = PROJECT_ROOT / 'data'\n",
|
| 143 |
-
"DOCS_DIR = PROJECT_ROOT / 'docs'\n",
|
| 144 |
-
"OUTPUT_DIR = PROJECT_ROOT / 'output'\n",
|
| 145 |
-
"\n",
|
| 146 |
-
"print(f\"✅ Project root: {PROJECT_ROOT}\")\n",
|
| 147 |
-
"print(f\"✅ Docs directory: {DOCS_DIR}\")\n",
|
| 148 |
-
"print(f\"✅ Output directory: {OUTPUT_DIR}\")"
|
| 149 |
-
]
|
| 150 |
-
},
|
| 151 |
-
{
|
| 152 |
-
"cell_type": "code",
|
| 153 |
-
"execution_count": 4,
|
| 154 |
-
"metadata": {},
|
| 155 |
-
"outputs": [
|
| 156 |
-
{
|
| 157 |
-
"name": "stdout",
|
| 158 |
-
"output_type": "stream",
|
| 159 |
-
"text": [
|
| 160 |
-
"✅ Loaded 5 test questions\n",
|
| 161 |
-
" - Example1\n",
|
| 162 |
-
" - Example2\n",
|
| 163 |
-
" - Example3\n",
|
| 164 |
-
" - Example4\n",
|
| 165 |
-
" - Example5\n"
|
| 166 |
-
]
|
| 167 |
-
}
|
| 168 |
-
],
|
| 169 |
-
"source": [
|
| 170 |
-
"# Load test cases - using dynamic paths\n",
|
| 171 |
-
"with open(DOCS_DIR / 'sample_questions.json', 'r', encoding='utf-8') as f:\n",
|
| 172 |
-
" questions = json.load(f)\n",
|
| 173 |
-
"\n",
|
| 174 |
-
"with open(DOCS_DIR / 'sample_answers.json', 'r', encoding='utf-8') as f:\n",
|
| 175 |
-
" expected_answers = json.load(f)\n",
|
| 176 |
-
"\n",
|
| 177 |
-
"print(f\"✅ Loaded {len(questions)} test questions\")\n",
|
| 178 |
-
"for key in questions.keys():\n",
|
| 179 |
-
" print(f\" - {key}\")"
|
| 180 |
-
]
|
| 181 |
-
},
|
| 182 |
-
{
|
| 183 |
-
"cell_type": "markdown",
|
| 184 |
-
"metadata": {},
|
| 185 |
-
"source": [
|
| 186 |
-
"## 2. Initialize Vector Database"
|
| 187 |
-
]
|
| 188 |
-
},
|
| 189 |
-
{
|
| 190 |
-
"cell_type": "code",
|
| 191 |
-
"execution_count": null,
|
| 192 |
-
"metadata": {},
|
| 193 |
-
"outputs": [],
|
| 194 |
-
"source": [
|
| 195 |
-
"# Connect to Pinecone\n",
|
| 196 |
-
"pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
|
| 197 |
-
"index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
|
| 198 |
-
"\n",
|
| 199 |
-
"stats = index.describe_index_stats()\n",
|
| 200 |
-
"print(f\"✅ Vector DB connected\")\n",
|
| 201 |
-
"print(f\" Total vectors: {stats['total_vector_count']}\")\n",
|
| 202 |
-
"print(f\" Dimensions: {stats['dimension']}\")"
|
| 203 |
-
]
|
| 204 |
-
},
|
| 205 |
-
{
|
| 206 |
-
"cell_type": "markdown",
|
| 207 |
-
"metadata": {},
|
| 208 |
-
"source": [
|
| 209 |
-
"## 3. Embedding Models Configuration"
|
| 210 |
-
]
|
| 211 |
-
},
|
| 212 |
-
{
|
| 213 |
-
"cell_type": "code",
|
| 214 |
-
"execution_count": null,
|
| 215 |
-
"metadata": {},
|
| 216 |
-
"outputs": [],
|
| 217 |
-
"source": [
|
| 218 |
-
"EMBEDDING_MODELS = {\n",
|
| 219 |
-
" 'bge-large-en': {\n",
|
| 220 |
-
" 'name': 'BAAI/bge-large-en-v1.5',\n",
|
| 221 |
-
" 'dimensions': 1024,\n",
|
| 222 |
-
" 'notes': 'Current model - best quality'\n",
|
| 223 |
-
" },\n",
|
| 224 |
-
" 'bge-base-en': {\n",
|
| 225 |
-
" 'name': 'BAAI/bge-base-en-v1.5',\n",
|
| 226 |
-
" 'dimensions': 768,\n",
|
| 227 |
-
" 'notes': 'Faster, slightly lower quality'\n",
|
| 228 |
-
" },\n",
|
| 229 |
-
" 'multilingual-e5-large': {\n",
|
| 230 |
-
" 'name': 'intfloat/multilingual-e5-large',\n",
|
| 231 |
-
" 'dimensions': 1024,\n",
|
| 232 |
-
" 'notes': 'Multi-language optimized'\n",
|
| 233 |
-
" },\n",
|
| 234 |
-
" 'paraphrase-multilingual': {\n",
|
| 235 |
-
" 'name': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',\n",
|
| 236 |
-
" 'dimensions': 768,\n",
|
| 237 |
-
" 'notes': 'Good for Azerbaijani/Russian'\n",
|
| 238 |
-
" },\n",
|
| 239 |
-
" 'all-MiniLM-L6': {\n",
|
| 240 |
-
" 'name': 'sentence-transformers/all-MiniLM-L6-v2',\n",
|
| 241 |
-
" 'dimensions': 384,\n",
|
| 242 |
-
" 'notes': 'Very fast, lower quality'\n",
|
| 243 |
-
" }\n",
|
| 244 |
-
"}\n",
|
| 245 |
-
"\n",
|
| 246 |
-
"# Load embedding models (only test 1024-dim models for existing Pinecone index)\n",
|
| 247 |
-
"EMBEDDING_MODELS_TO_TEST = [\n",
|
| 248 |
-
" 'bge-large-en', # Current\n",
|
| 249 |
-
" 'multilingual-e5-large', # Alternative with same dims\n",
|
| 250 |
-
"]\n",
|
| 251 |
-
"\n",
|
| 252 |
-
"embedding_cache = {}\n",
|
| 253 |
-
"\n",
|
| 254 |
-
"for model_key in EMBEDDING_MODELS_TO_TEST:\n",
|
| 255 |
-
" model_name = EMBEDDING_MODELS[model_key]['name']\n",
|
| 256 |
-
" print(f\"Loading {model_key}...\")\n",
|
| 257 |
-
" embedding_cache[model_key] = SentenceTransformer(model_name)\n",
|
| 258 |
-
" print(f\" ✅ {model_name}\")\n",
|
| 259 |
-
"\n",
|
| 260 |
-
"print(f\"\\n✅ Loaded {len(embedding_cache)} embedding models\")"
|
| 261 |
-
]
|
| 262 |
-
},
|
| 263 |
-
{
|
| 264 |
-
"cell_type": "markdown",
|
| 265 |
-
"metadata": {},
|
| 266 |
-
"source": [
|
| 267 |
-
"## 4. Retrieval Strategies"
|
| 268 |
-
]
|
| 269 |
-
},
|
| 270 |
-
{
|
| 271 |
-
"cell_type": "code",
|
| 272 |
-
"execution_count": null,
|
| 273 |
-
"metadata": {},
|
| 274 |
-
"outputs": [],
|
| 275 |
-
"source": [
|
| 276 |
-
"def retrieve_vanilla(query: str, embed_model: SentenceTransformer, top_k: int = 3) -> List[Dict]:\n",
|
| 277 |
-
" \"\"\"\n",
|
| 278 |
-
" Vanilla retrieval: Simple top-k vector search.\n",
|
| 279 |
-
" \"\"\"\n",
|
| 280 |
-
" query_embedding = embed_model.encode(query).tolist()\n",
|
| 281 |
-
" results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)\n",
|
| 282 |
-
" \n",
|
| 283 |
-
" documents = []\n",
|
| 284 |
-
" for match in results['matches']:\n",
|
| 285 |
-
" documents.append({\n",
|
| 286 |
-
" 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
|
| 287 |
-
" 'page_number': match['metadata'].get('page_number', 0),\n",
|
| 288 |
-
" 'content': match['metadata'].get('text', ''),\n",
|
| 289 |
-
" 'score': match.get('score', 0.0)\n",
|
| 290 |
-
" })\n",
|
| 291 |
-
" \n",
|
| 292 |
-
" return documents\n",
|
| 293 |
-
"\n",
|
| 294 |
-
"\n",
|
| 295 |
-
"def retrieve_with_threshold(query: str, embed_model: SentenceTransformer, \n",
|
| 296 |
-
" top_k: int = 10, threshold: float = 0.7) -> List[Dict]:\n",
|
| 297 |
-
" \"\"\"\n",
|
| 298 |
-
" Retrieve with similarity threshold filtering.\n",
|
| 299 |
-
" \"\"\"\n",
|
| 300 |
-
" docs = retrieve_vanilla(query, embed_model, top_k=top_k)\n",
|
| 301 |
-
" return [doc for doc in docs if doc['score'] >= threshold]\n",
|
| 302 |
-
"\n",
|
| 303 |
-
"\n",
|
| 304 |
-
"def retrieve_with_mmr(query: str, embed_model: SentenceTransformer, \n",
|
| 305 |
-
" top_k: int = 3, lambda_param: float = 0.5, fetch_k: int = 20) -> List[Dict]:\n",
|
| 306 |
-
" \"\"\"\n",
|
| 307 |
-
" MMR (Maximal Marginal Relevance) for diversity.\n",
|
| 308 |
-
" lambda=1 → pure relevance, lambda=0 → pure diversity\n",
|
| 309 |
-
" \"\"\"\n",
|
| 310 |
-
" # Fetch more candidates\n",
|
| 311 |
-
" candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n",
|
| 312 |
-
" \n",
|
| 313 |
-
" if len(candidates) <= top_k:\n",
|
| 314 |
-
" return candidates[:top_k]\n",
|
| 315 |
-
" \n",
|
| 316 |
-
" # Query embedding\n",
|
| 317 |
-
" query_emb = embed_model.encode(query)\n",
|
| 318 |
-
" \n",
|
| 319 |
-
" # Get embeddings for candidates\n",
|
| 320 |
-
" candidate_texts = [doc['content'] for doc in candidates]\n",
|
| 321 |
-
" candidate_embs = embed_model.encode(candidate_texts)\n",
|
| 322 |
-
" \n",
|
| 323 |
-
" # MMR algorithm\n",
|
| 324 |
-
" selected = []\n",
|
| 325 |
-
" selected_embs = []\n",
|
| 326 |
-
" \n",
|
| 327 |
-
" for _ in range(min(top_k, len(candidates))):\n",
|
| 328 |
-
" mmr_scores = []\n",
|
| 329 |
-
" \n",
|
| 330 |
-
" for i, (doc, emb) in enumerate(zip(candidates, candidate_embs)):\n",
|
| 331 |
-
" if i in [candidates.index(s) for s in selected]:\n",
|
| 332 |
-
" mmr_scores.append(-float('inf'))\n",
|
| 333 |
-
" continue\n",
|
| 334 |
-
" \n",
|
| 335 |
-
" # Relevance to query\n",
|
| 336 |
-
" relevance = np.dot(query_emb, emb) / (np.linalg.norm(query_emb) * np.linalg.norm(emb))\n",
|
| 337 |
-
" \n",
|
| 338 |
-
" # Max similarity to already selected\n",
|
| 339 |
-
" if selected_embs:\n",
|
| 340 |
-
" similarities = [np.dot(emb, s_emb) / (np.linalg.norm(emb) * np.linalg.norm(s_emb)) \n",
|
| 341 |
-
" for s_emb in selected_embs]\n",
|
| 342 |
-
" max_sim = max(similarities)\n",
|
| 343 |
-
" else:\n",
|
| 344 |
-
" max_sim = 0\n",
|
| 345 |
-
" \n",
|
| 346 |
-
" # MMR score\n",
|
| 347 |
-
" mmr = lambda_param * relevance - (1 - lambda_param) * max_sim\n",
|
| 348 |
-
" mmr_scores.append(mmr)\n",
|
| 349 |
-
" \n",
|
| 350 |
-
" # Select best MMR score\n",
|
| 351 |
-
" best_idx = np.argmax(mmr_scores)\n",
|
| 352 |
-
" selected.append(candidates[best_idx])\n",
|
| 353 |
-
" selected_embs.append(candidate_embs[best_idx])\n",
|
| 354 |
-
" \n",
|
| 355 |
-
" return selected\n",
|
| 356 |
-
"\n",
|
| 357 |
-
"\n",
|
| 358 |
-
"def retrieve_with_reranking(query: str, embed_model: SentenceTransformer, \n",
|
| 359 |
-
" top_k: int = 3, fetch_k: int = 20) -> List[Dict]:\n",
|
| 360 |
-
" \"\"\"\n",
|
| 361 |
-
" Two-stage: retrieve with embeddings, rerank with cross-encoder.\n",
|
| 362 |
-
" \"\"\"\n",
|
| 363 |
-
" # Stage 1: Retrieve candidates\n",
|
| 364 |
-
" candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n",
|
| 365 |
-
" \n",
|
| 366 |
-
" if len(candidates) <= top_k:\n",
|
| 367 |
-
" return candidates[:top_k]\n",
|
| 368 |
-
" \n",
|
| 369 |
-
" # Stage 2: Rerank with cross-encoder\n",
|
| 370 |
-
" reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n",
|
| 371 |
-
" \n",
|
| 372 |
-
" pairs = [[query, doc['content']] for doc in candidates]\n",
|
| 373 |
-
" scores = reranker.predict(pairs)\n",
|
| 374 |
-
" \n",
|
| 375 |
-
" # Sort by reranker score\n",
|
| 376 |
-
" scored_docs = [(doc, score) for doc, score in zip(candidates, scores)]\n",
|
| 377 |
-
" scored_docs.sort(key=lambda x: x[1], reverse=True)\n",
|
| 378 |
-
" \n",
|
| 379 |
-
" # Update scores and return top-k\n",
|
| 380 |
-
" reranked = []\n",
|
| 381 |
-
" for doc, score in scored_docs[:top_k]:\n",
|
| 382 |
-
" doc['rerank_score'] = float(score)\n",
|
| 383 |
-
" reranked.append(doc)\n",
|
| 384 |
-
" \n",
|
| 385 |
-
" return reranked\n",
|
| 386 |
-
"\n",
|
| 387 |
-
"\n",
|
| 388 |
-
"RETRIEVAL_STRATEGIES = {\n",
|
| 389 |
-
" 'vanilla_k3': {'func': retrieve_vanilla, 'params': {'top_k': 3}, 'notes': 'Current setup'},\n",
|
| 390 |
-
" 'vanilla_k5': {'func': retrieve_vanilla, 'params': {'top_k': 5}, 'notes': 'More context'},\n",
|
| 391 |
-
" 'vanilla_k10': {'func': retrieve_vanilla, 'params': {'top_k': 10}, 'notes': 'Maximum context'},\n",
|
| 392 |
-
" 'threshold_0.7': {'func': retrieve_with_threshold, 'params': {'top_k': 10, 'threshold': 0.7}, 'notes': 'Quality filter'},\n",
|
| 393 |
-
" 'mmr_balanced': {'func': retrieve_with_mmr, 'params': {'top_k': 3, 'lambda_param': 0.5}, 'notes': 'Balance diversity'},\n",
|
| 394 |
-
" 'mmr_diverse': {'func': retrieve_with_mmr, 'params': {'top_k': 3, 'lambda_param': 0.3}, 'notes': 'More diversity'},\n",
|
| 395 |
-
" 'reranked_k3': {'func': retrieve_with_reranking, 'params': {'top_k': 3, 'fetch_k': 20}, 'notes': 'Two-stage rerank'},\n",
|
| 396 |
-
"}\n",
|
| 397 |
-
"\n",
|
| 398 |
-
"print(f\"✅ Configured {len(RETRIEVAL_STRATEGIES)} retrieval strategies\")"
|
| 399 |
-
]
|
| 400 |
-
},
|
| 401 |
-
{
|
| 402 |
-
"cell_type": "markdown",
|
| 403 |
-
"metadata": {},
|
| 404 |
-
"source": [
|
| 405 |
-
"## 5. LLM Models and Prompting Strategies"
|
| 406 |
-
]
|
| 407 |
-
},
|
| 408 |
-
{
|
| 409 |
-
"cell_type": "code",
|
| 410 |
-
"execution_count": null,
|
| 411 |
-
"metadata": {},
|
| 412 |
-
"outputs": [],
|
| 413 |
-
"source": [
|
| 414 |
-
"# Initialize Azure OpenAI\n",
|
| 415 |
-
"azure_client = AzureOpenAI(\n",
|
| 416 |
-
" api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
|
| 417 |
-
" api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
|
| 418 |
-
" azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
|
| 419 |
-
")\n",
|
| 420 |
-
"\n",
|
| 421 |
-
"LLM_MODELS = {\n",
|
| 422 |
-
" 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
|
| 423 |
-
" 'DeepSeek-R1': 'DeepSeek-R1',\n",
|
| 424 |
-
" 'GPT-4.1': 'gpt-4.1',\n",
|
| 425 |
-
" 'GPT-5-mini': 'gpt-5-mini',\n",
|
| 426 |
-
" 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n",
|
| 427 |
-
"}\n",
|
| 428 |
-
"\n",
|
| 429 |
-
"# Prompting strategies\n",
|
| 430 |
-
"PROMPTING_STRATEGIES = {\n",
|
| 431 |
-
" 'baseline': \"\"\"\n",
|
| 432 |
-
"Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə köməkçisiniz.\n",
|
| 433 |
-
"\n",
|
| 434 |
-
"Kontekst:\n",
|
| 435 |
-
"{context}\n",
|
| 436 |
-
"\n",
|
| 437 |
-
"Sual: {query}\n",
|
| 438 |
-
"\n",
|
| 439 |
-
"Kontekstə əsaslanaraq cavab verin.\n",
|
| 440 |
-
"\"\"\",\n",
|
| 441 |
-
" \n",
|
| 442 |
-
" 'citation_focused': \"\"\"\n",
|
| 443 |
-
"Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.\n",
|
| 444 |
-
"\n",
|
| 445 |
-
"ÖNƏMLİ: Hər bir faktı mütləq mənbə ilə təsdiqləyin (PDF adı və səhifə nömrəsi).\n",
|
| 446 |
-
"\n",
|
| 447 |
-
"Kontekst:\n",
|
| 448 |
-
"{context}\n",
|
| 449 |
-
"\n",
|
| 450 |
-
"Sual: {query}\n",
|
| 451 |
-
"\n",
|
| 452 |
-
"Cavab verərkən:\n",
|
| 453 |
-
"1. Dəqiq faktlar yazın\n",
|
| 454 |
-
"2. Hər faktı mənbə ilə göstərin: (PDF: fayl_adı.pdf, Səhifə: X)\n",
|
| 455 |
-
"3. Kontekstdə olmayan məlumat əlavə etməyin\n",
|
| 456 |
-
"\"\"\",\n",
|
| 457 |
-
" \n",
|
| 458 |
-
" 'step_by_step': \"\"\"\n",
|
| 459 |
-
"Siz SOCAR-ın tarixi sənədlər üzrə analitik köməkçisisiniz.\n",
|
| 460 |
-
"\n",
|
| 461 |
-
"Kontekst:\n",
|
| 462 |
-
"{context}\n",
|
| 463 |
-
"\n",
|
| 464 |
-
"Sual: {query}\n",
|
| 465 |
-
"\n",
|
| 466 |
-
"Addım-addım cavab verin:\n",
|
| 467 |
-
"1. Əvvəlcə kontekstdən əlaqəli məlumatları müəyyənləşdirin\n",
|
| 468 |
-
"2. Bu məlumatları təhlil edin\n",
|
| 469 |
-
"3. Nəticəni mənbələr ilə birlikdə təqdim edin\n",
|
| 470 |
-
"\"\"\",\n",
|
| 471 |
-
" \n",
|
| 472 |
-
" 'few_shot': \"\"\"\n",
|
| 473 |
-
"Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.\n",
|
| 474 |
-
"\n",
|
| 475 |
-
"Nümunə:\n",
|
| 476 |
-
"Sual: \"Palçıq vulkanlarının təsir radiusu nə qədərdir?\"\n",
|
| 477 |
-
"Cavab: \"Sahə müşahidələri və modelləşdirmə göstərir ki, palçıq vulkanlarının təsir radiusu təqribən 10 km-dir (PDF: document_06.pdf, Səhifə: 5).\"\n",
|
| 478 |
-
"\n",
|
| 479 |
-
"Kontekst:\n",
|
| 480 |
-
"{context}\n",
|
| 481 |
-
"\n",
|
| 482 |
-
"Sual: {query}\n",
|
| 483 |
-
"\n",
|
| 484 |
-
"Yuxarıdakı nümunə kimi cavab verin - dəqiq, qısa, mənbə ilə.\n",
|
| 485 |
-
"\"\"\"\n",
|
| 486 |
-
"}\n",
|
| 487 |
-
"\n",
|
| 488 |
-
"print(f\"✅ Configured {len(LLM_MODELS)} LLM models\")\n",
|
| 489 |
-
"print(f\"✅ Configured {len(PROMPTING_STRATEGIES)} prompting strategies\")"
|
| 490 |
-
]
|
| 491 |
-
},
|
| 492 |
-
{
|
| 493 |
-
"cell_type": "code",
|
| 494 |
-
"execution_count": null,
|
| 495 |
-
"metadata": {},
|
| 496 |
-
"outputs": [],
|
| 497 |
-
"source": [
|
| 498 |
-
"def generate_answer(llm_model: str, query: str, documents: List[Dict], \n",
|
| 499 |
-
" prompt_strategy: str = 'baseline',\n",
|
| 500 |
-
" temperature: float = 0.2) -> Tuple[str, float]:\n",
|
| 501 |
-
" \"\"\"\n",
|
| 502 |
-
" Generate answer using LLM with specified prompting strategy.\n",
|
| 503 |
-
" \"\"\"\n",
|
| 504 |
-
" # Build context\n",
|
| 505 |
-
" context_parts = []\n",
|
| 506 |
-
" for i, doc in enumerate(documents, 1):\n",
|
| 507 |
-
" context_parts.append(\n",
|
| 508 |
-
" f\"Sənəd {i} (Mənbə: {doc['pdf_name']}, Səhifə {doc['page_number']}):\\n{doc['content']}\"\n",
|
| 509 |
-
" )\n",
|
| 510 |
-
" context = \"\\n\\n\".join(context_parts)\n",
|
| 511 |
-
" \n",
|
| 512 |
-
" # Get prompt template\n",
|
| 513 |
-
" prompt_template = PROMPTING_STRATEGIES[prompt_strategy]\n",
|
| 514 |
-
" prompt = prompt_template.format(context=context, query=query)\n",
|
| 515 |
-
" \n",
|
| 516 |
-
" try:\n",
|
| 517 |
-
" start_time = time.time()\n",
|
| 518 |
-
" \n",
|
| 519 |
-
" deployment = LLM_MODELS[llm_model]\n",
|
| 520 |
-
" \n",
|
| 521 |
-
" # GPT-5 models use max_completion_tokens, others use max_tokens\n",
|
| 522 |
-
" if deployment.startswith('gpt-5'):\n",
|
| 523 |
-
" response = azure_client.chat.completions.create(\n",
|
| 524 |
-
" model=deployment,\n",
|
| 525 |
-
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
|
| 526 |
-
" temperature=temperature,\n",
|
| 527 |
-
" max_completion_tokens=1000\n",
|
| 528 |
-
" )\n",
|
| 529 |
-
" else:\n",
|
| 530 |
-
" response = azure_client.chat.completions.create(\n",
|
| 531 |
-
" model=deployment,\n",
|
| 532 |
-
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
|
| 533 |
-
" temperature=temperature,\n",
|
| 534 |
-
" max_tokens=1000\n",
|
| 535 |
-
" )\n",
|
| 536 |
-
" \n",
|
| 537 |
-
" elapsed = time.time() - start_time\n",
|
| 538 |
-
" answer = response.choices[0].message.content\n",
|
| 539 |
-
" \n",
|
| 540 |
-
" return answer, elapsed\n",
|
| 541 |
-
" \n",
|
| 542 |
-
" except Exception as e:\n",
|
| 543 |
-
" return f\"ERROR: {str(e)}\", 0.0\n",
|
| 544 |
-
"\n",
|
| 545 |
-
"print(\"✅ LLM generation function ready\")"
|
| 546 |
-
]
|
| 547 |
-
},
|
| 548 |
-
{
|
| 549 |
-
"cell_type": "markdown",
|
| 550 |
-
"metadata": {},
|
| 551 |
-
"source": [
|
| 552 |
-
"## 6. Evaluation Metrics"
|
| 553 |
-
]
|
| 554 |
-
},
|
| 555 |
-
{
|
| 556 |
-
"cell_type": "code",
|
| 557 |
-
"execution_count": null,
|
| 558 |
-
"metadata": {},
|
| 559 |
-
"outputs": [],
|
| 560 |
-
"source": [
|
| 561 |
-
"def normalize_text(text: str) -> str:\n",
|
| 562 |
-
" text = text.lower().strip()\n",
|
| 563 |
-
" text = re.sub(r'\\s+', ' ', text)\n",
|
| 564 |
-
" return text\n",
|
| 565 |
-
"\n",
|
| 566 |
-
"def calculate_answer_quality(reference: str, hypothesis: str) -> Dict[str, float]:\n",
|
| 567 |
-
" \"\"\"Accuracy metrics.\"\"\"\n",
|
| 568 |
-
" ref_norm = normalize_text(reference)\n",
|
| 569 |
-
" hyp_norm = normalize_text(hypothesis)\n",
|
| 570 |
-
" \n",
|
| 571 |
-
" cer_score = cer(ref_norm, hyp_norm) * 100\n",
|
| 572 |
-
" wer_score = wer(ref_norm, hyp_norm) * 100\n",
|
| 573 |
-
" similarity = max(0, 100 - wer_score)\n",
|
| 574 |
-
" \n",
|
| 575 |
-
" return {\n",
|
| 576 |
-
" 'Accuracy_Score': round(similarity, 2)\n",
|
| 577 |
-
" }\n",
|
| 578 |
-
"\n",
|
| 579 |
-
"def evaluate_citation_quality(answer: str, documents: List[Dict]) -> Dict[str, float]:\n",
|
| 580 |
-
" \"\"\"Relevance - citation quality.\"\"\"\n",
|
| 581 |
-
" pdf_names = [doc['pdf_name'].replace('.pdf', '') for doc in documents]\n",
|
| 582 |
-
" page_numbers = [str(doc['page_number']) for doc in documents]\n",
|
| 583 |
-
" \n",
|
| 584 |
-
" cited_pdfs = sum(1 for pdf in pdf_names if pdf in answer)\n",
|
| 585 |
-
" cited_pages = sum(1 for page in page_numbers if page in answer)\n",
|
| 586 |
-
" \n",
|
| 587 |
-
" citation_keywords = ['mənbə', 'sənəd', 'səhifə', 'pdf', 'document', 'page']\n",
|
| 588 |
-
" has_citation_format = any(kw in answer.lower() for kw in citation_keywords)\n",
|
| 589 |
-
" \n",
|
| 590 |
-
" citation_score = (\n",
|
| 591 |
-
" (cited_pdfs / len(pdf_names) * 40) +\n",
|
| 592 |
-
" (cited_pages / len(page_numbers) * 40) +\n",
|
| 593 |
-
" (20 if has_citation_format else 0)\n",
|
| 594 |
-
" )\n",
|
| 595 |
-
" \n",
|
| 596 |
-
" return {\n",
|
| 597 |
-
" 'Citation_Score': round(citation_score, 2),\n",
|
| 598 |
-
" 'Cited_PDFs': cited_pdfs,\n",
|
| 599 |
-
" 'Cited_Pages': cited_pages\n",
|
| 600 |
-
" }\n",
|
| 601 |
-
"\n",
|
| 602 |
-
"def evaluate_retrieval_quality(query: str, documents: List[Dict], expected_answer: str) -> Dict[str, float]:\n",
|
| 603 |
-
" \"\"\"Measure if retrieved docs are relevant to answer.\"\"\"\n",
|
| 604 |
-
" if not documents or not expected_answer:\n",
|
| 605 |
-
" return {'Retrieval_Relevance': 0.0}\n",
|
| 606 |
-
" \n",
|
| 607 |
-
" # Simple heuristic: check if expected answer words appear in retrieved docs\n",
|
| 608 |
-
" expected_words = set(normalize_text(expected_answer).split())\n",
|
| 609 |
-
" retrieved_text = ' '.join([doc['content'] for doc in documents])\n",
|
| 610 |
-
" retrieved_words = set(normalize_text(retrieved_text).split())\n",
|
| 611 |
-
" \n",
|
| 612 |
-
" overlap = len(expected_words & retrieved_words) / len(expected_words) if expected_words else 0\n",
|
| 613 |
-
" \n",
|
| 614 |
-
" return {\n",
|
| 615 |
-
" 'Retrieval_Relevance': round(overlap * 100, 2)\n",
|
| 616 |
-
" }\n",
|
| 617 |
-
"\n",
|
| 618 |
-
"def evaluate_completeness(answer: str) -> Dict[str, float]:\n",
|
| 619 |
-
" \"\"\"Completeness metrics.\"\"\"\n",
|
| 620 |
-
" word_count = len(answer.split())\n",
|
| 621 |
-
" \n",
|
| 622 |
-
" if word_count < 20:\n",
|
| 623 |
-
" completeness = (word_count / 20) * 100\n",
|
| 624 |
-
" elif word_count > 200:\n",
|
| 625 |
-
" completeness = 100 - ((word_count - 200) / 200 * 20)\n",
|
| 626 |
-
" else:\n",
|
| 627 |
-
" completeness = 100\n",
|
| 628 |
-
" \n",
|
| 629 |
-
" return {\n",
|
| 630 |
-
" 'Completeness_Score': round(max(0, completeness), 2),\n",
|
| 631 |
-
" 'Word_Count': word_count\n",
|
| 632 |
-
" }\n",
|
| 633 |
-
"\n",
|
| 634 |
-
"def calculate_llm_judge_score(accuracy: float, citation: float, completeness: float) -> float:\n",
|
| 635 |
-
" \"\"\"Overall LLM Judge score (weighted).\"\"\"\n",
|
| 636 |
-
" return round(\n",
|
| 637 |
-
" accuracy * 0.35 +\n",
|
| 638 |
-
" citation * 0.35 +\n",
|
| 639 |
-
" completeness * 0.30,\n",
|
| 640 |
-
" 2\n",
|
| 641 |
-
" )\n",
|
| 642 |
-
"\n",
|
| 643 |
-
"print(\"✅ Evaluation metrics ready\")"
|
| 644 |
-
]
|
| 645 |
-
},
|
| 646 |
-
{
|
| 647 |
-
"cell_type": "markdown",
|
| 648 |
-
"metadata": {},
|
| 649 |
-
"source": [
|
| 650 |
-
"## 7. Run Comprehensive Benchmark"
|
| 651 |
-
]
|
| 652 |
-
},
|
| 653 |
-
{
|
| 654 |
-
"cell_type": "code",
|
| 655 |
-
"execution_count": null,
|
| 656 |
-
"metadata": {},
|
| 657 |
-
"outputs": [],
|
| 658 |
-
"source": [
|
| 659 |
-
"# Configuration: Select what to test\n",
|
| 660 |
-
"CONFIGS_TO_TEST = [\n",
|
| 661 |
-
" # Format: (embed_model, retrieval_strategy, llm_model, prompt_strategy)\n",
|
| 662 |
-
" \n",
|
| 663 |
-
" # Baseline (current setup)\n",
|
| 664 |
-
" ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'baseline'),\n",
|
| 665 |
-
" \n",
|
| 666 |
-
" # Test different embedding models\n",
|
| 667 |
-
" ('multilingual-e5-large', 'vanilla_k3', 'Llama-4-Maverick', 'baseline'),\n",
|
| 668 |
-
" \n",
|
| 669 |
-
" # Test different retrieval strategies\n",
|
| 670 |
-
" ('bge-large-en', 'vanilla_k5', 'Llama-4-Maverick', 'baseline'),\n",
|
| 671 |
-
" ('bge-large-en', 'mmr_balanced', 'Llama-4-Maverick', 'baseline'),\n",
|
| 672 |
-
" ('bge-large-en', 'reranked_k3', 'Llama-4-Maverick', 'baseline'),\n",
|
| 673 |
-
" \n",
|
| 674 |
-
" # Test different LLM models\n",
|
| 675 |
-
" ('bge-large-en', 'vanilla_k3', 'GPT-5-mini', 'baseline'),\n",
|
| 676 |
-
" ('bge-large-en', 'vanilla_k3', 'Claude-Sonnet-4.5', 'baseline'),\n",
|
| 677 |
-
" \n",
|
| 678 |
-
" # Test different prompting strategies\n",
|
| 679 |
-
" ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'citation_focused'),\n",
|
| 680 |
-
" ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'few_shot'),\n",
|
| 681 |
-
" \n",
|
| 682 |
-
" # Best combinations\n",
|
| 683 |
-
" ('bge-large-en', 'reranked_k3', 'GPT-5-mini', 'citation_focused'),\n",
|
| 684 |
-
" ('bge-large-en', 'mmr_balanced', 'Claude-Sonnet-4.5', 'citation_focused'),\n",
|
| 685 |
-
"]\n",
|
| 686 |
-
"\n",
|
| 687 |
-
"print(f\"Testing {len(CONFIGS_TO_TEST)} configurations on {len(questions)} questions\")\n",
|
| 688 |
-
"print(f\"Total API calls: ~{len(CONFIGS_TO_TEST) * len(questions)}\")\n",
|
| 689 |
-
"print(\"This will take 15-30 minutes...\\n\")"
|
| 690 |
-
]
|
| 691 |
-
},
|
| 692 |
-
{
|
| 693 |
-
"cell_type": "code",
|
| 694 |
-
"execution_count": null,
|
| 695 |
-
"metadata": {},
|
| 696 |
-
"outputs": [],
|
| 697 |
-
"source": [
|
| 698 |
-
"# Run benchmark\n",
|
| 699 |
-
"results = []\n",
|
| 700 |
-
"\n",
|
| 701 |
-
"for config_idx, (embed_key, retrieval_key, llm_key, prompt_key) in enumerate(CONFIGS_TO_TEST, 1):\n",
|
| 702 |
-
" config_name = f\"{embed_key}_{retrieval_key}_{llm_key}_{prompt_key}\"\n",
|
| 703 |
-
" \n",
|
| 704 |
-
" print(f\"\\n{'='*100}\")\n",
|
| 705 |
-
" print(f\"Config {config_idx}/{len(CONFIGS_TO_TEST)}: {config_name}\")\n",
|
| 706 |
-
" print(f\"{'='*100}\")\n",
|
| 707 |
-
" \n",
|
| 708 |
-
" # Get components\n",
|
| 709 |
-
" embed_model = embedding_cache[embed_key]\n",
|
| 710 |
-
" retrieval_func = RETRIEVAL_STRATEGIES[retrieval_key]['func']\n",
|
| 711 |
-
" retrieval_params = RETRIEVAL_STRATEGIES[retrieval_key]['params']\n",
|
| 712 |
-
" \n",
|
| 713 |
-
" config_results = []\n",
|
| 714 |
-
" \n",
|
| 715 |
-
" for example_key, messages in questions.items():\n",
|
| 716 |
-
" user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
|
| 717 |
-
" query = user_msg['content']\n",
|
| 718 |
-
" \n",
|
| 719 |
-
" print(f\"\\n {example_key}: {query[:60]}...\")\n",
|
| 720 |
-
" \n",
|
| 721 |
-
" # Retrieve documents\n",
|
| 722 |
-
" documents = retrieval_func(query, embed_model, **retrieval_params)\n",
|
| 723 |
-
" print(f\" Retrieved {len(documents)} docs\")\n",
|
| 724 |
-
" \n",
|
| 725 |
-
" # Generate answer\n",
|
| 726 |
-
" answer, response_time = generate_answer(llm_key, query, documents, prompt_key)\n",
|
| 727 |
-
" \n",
|
| 728 |
-
" if answer.startswith('ERROR'):\n",
|
| 729 |
-
" print(f\" ❌ {answer}\")\n",
|
| 730 |
-
" continue\n",
|
| 731 |
-
" \n",
|
| 732 |
-
" print(f\" ✅ Generated in {response_time:.2f}s\")\n",
|
| 733 |
-
" \n",
|
| 734 |
-
" # Evaluate\n",
|
| 735 |
-
" expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
|
| 736 |
-
" \n",
|
| 737 |
-
" accuracy_metrics = calculate_answer_quality(expected, answer) if expected else {'Accuracy_Score': 0}\n",
|
| 738 |
-
" citation_metrics = evaluate_citation_quality(answer, documents)\n",
|
| 739 |
-
" retrieval_metrics = evaluate_retrieval_quality(query, documents, expected)\n",
|
| 740 |
-
" completeness_metrics = evaluate_completeness(answer)\n",
|
| 741 |
-
" \n",
|
| 742 |
-
" # Calculate overall score\n",
|
| 743 |
-
" llm_judge_score = calculate_llm_judge_score(\n",
|
| 744 |
-
" accuracy_metrics['Accuracy_Score'],\n",
|
| 745 |
-
" citation_metrics['Citation_Score'],\n",
|
| 746 |
-
" completeness_metrics['Completeness_Score']\n",
|
| 747 |
-
" )\n",
|
| 748 |
-
" \n",
|
| 749 |
-
" result = {\n",
|
| 750 |
-
" 'Config': config_name,\n",
|
| 751 |
-
" 'Embedding_Model': embed_key,\n",
|
| 752 |
-
" 'Retrieval_Strategy': retrieval_key,\n",
|
| 753 |
-
" 'LLM_Model': llm_key,\n",
|
| 754 |
-
" 'Prompt_Strategy': prompt_key,\n",
|
| 755 |
-
" 'Question': example_key,\n",
|
| 756 |
-
" 'Query': query[:80],\n",
|
| 757 |
-
" 'Num_Docs_Retrieved': len(documents),\n",
|
| 758 |
-
" 'Response_Time': round(response_time, 2),\n",
|
| 759 |
-
" 'LLM_Judge_Score': llm_judge_score,\n",
|
| 760 |
-
" **accuracy_metrics,\n",
|
| 761 |
-
" **citation_metrics,\n",
|
| 762 |
-
" **retrieval_metrics,\n",
|
| 763 |
-
" **completeness_metrics,\n",
|
| 764 |
-
" 'Answer_Preview': answer[:150]\n",
|
| 765 |
-
" }\n",
|
| 766 |
-
" \n",
|
| 767 |
-
" results.append(result)\n",
|
| 768 |
-
" config_results.append(result)\n",
|
| 769 |
-
" \n",
|
| 770 |
-
" # Show config summary\n",
|
| 771 |
-
" if config_results:\n",
|
| 772 |
-
" avg_score = sum(r['LLM_Judge_Score'] for r in config_results) / len(config_results)\n",
|
| 773 |
-
" avg_time = sum(r['Response_Time'] for r in config_results) / len(config_results)\n",
|
| 774 |
-
" print(f\"\\n 📊 Config Summary:\")\n",
|
| 775 |
-
" print(f\" Avg LLM Judge Score: {avg_score:.2f}%\")\n",
|
| 776 |
-
" print(f\" Avg Response Time: {avg_time:.2f}s\")\n",
|
| 777 |
-
"\n",
|
| 778 |
-
"print(f\"\\n{'='*100}\")\n",
|
| 779 |
-
"print(\"✅ Comprehensive benchmark complete!\")\n",
|
| 780 |
-
"print(f\"{'='*100}\")"
|
| 781 |
-
]
|
| 782 |
-
},
|
| 783 |
-
{
|
| 784 |
-
"cell_type": "markdown",
|
| 785 |
-
"metadata": {},
|
| 786 |
-
"source": [
|
| 787 |
-
"## 8. Analyze Results"
|
| 788 |
-
]
|
| 789 |
-
},
|
| 790 |
-
{
|
| 791 |
-
"cell_type": "code",
|
| 792 |
-
"execution_count": null,
|
| 793 |
-
"metadata": {},
|
| 794 |
-
"outputs": [],
|
| 795 |
-
"source": [
|
| 796 |
-
"# Create DataFrame\n",
|
| 797 |
-
"df = pd.DataFrame(results)\n",
|
| 798 |
-
"\n",
|
| 799 |
-
"# Aggregate by configuration\n",
|
| 800 |
-
"config_summary = df.groupby('Config').agg({\n",
|
| 801 |
-
" 'LLM_Judge_Score': 'mean',\n",
|
| 802 |
-
" 'Accuracy_Score': 'mean',\n",
|
| 803 |
-
" 'Citation_Score': 'mean',\n",
|
| 804 |
-
" 'Retrieval_Relevance': 'mean',\n",
|
| 805 |
-
" 'Completeness_Score': 'mean',\n",
|
| 806 |
-
" 'Response_Time': 'mean',\n",
|
| 807 |
-
" 'Embedding_Model': 'first',\n",
|
| 808 |
-
" 'Retrieval_Strategy': 'first',\n",
|
| 809 |
-
" 'LLM_Model': 'first',\n",
|
| 810 |
-
" 'Prompt_Strategy': 'first'\n",
|
| 811 |
-
"}).round(2)\n",
|
| 812 |
-
"\n",
|
| 813 |
-
"# Sort by LLM Judge Score\n",
|
| 814 |
-
"config_summary = config_summary.sort_values('LLM_Judge_Score', ascending=False)\n",
|
| 815 |
-
"\n",
|
| 816 |
-
"print(\"\\n\" + \"=\"*120)\n",
|
| 817 |
-
"print(\"📊 CONFIGURATION RANKINGS (By LLM Judge Score)\")\n",
|
| 818 |
-
"print(\"=\"*120)\n",
|
| 819 |
-
"display_cols = ['Embedding_Model', 'Retrieval_Strategy', 'LLM_Model', 'Prompt_Strategy', \n",
|
| 820 |
-
" 'LLM_Judge_Score', 'Accuracy_Score', 'Citation_Score', 'Response_Time']\n",
|
| 821 |
-
"print(config_summary[display_cols].to_string())\n",
|
| 822 |
-
"print(\"=\"*120)"
|
| 823 |
-
]
|
| 824 |
-
},
|
| 825 |
-
{
|
| 826 |
-
"cell_type": "markdown",
|
| 827 |
-
"metadata": {},
|
| 828 |
-
"source": [
|
| 829 |
-
"## 9. Component Analysis"
|
| 830 |
-
]
|
| 831 |
-
},
|
| 832 |
-
{
|
| 833 |
-
"cell_type": "code",
|
| 834 |
-
"execution_count": null,
|
| 835 |
-
"metadata": {},
|
| 836 |
-
"outputs": [],
|
| 837 |
-
"source": [
|
| 838 |
-
"# Analyze impact of each component\n",
|
| 839 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 840 |
-
"print(\"🔍 COMPONENT IMPACT ANALYSIS\")\n",
|
| 841 |
-
"print(\"=\"*100)\n",
|
| 842 |
-
"\n",
|
| 843 |
-
"# 1. Embedding Models\n",
|
| 844 |
-
"print(\"\\n📚 EMBEDDING MODELS:\")\n",
|
| 845 |
-
"embed_impact = df.groupby('Embedding_Model')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
|
| 846 |
-
"for model, score in embed_impact.items():\n",
|
| 847 |
-
" print(f\" {model}: {score:.2f}%\")\n",
|
| 848 |
-
"\n",
|
| 849 |
-
"# 2. Retrieval Strategies\n",
|
| 850 |
-
"print(\"\\n🔎 RETRIEVAL STRATEGIES:\")\n",
|
| 851 |
-
"retrieval_impact = df.groupby('Retrieval_Strategy')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
|
| 852 |
-
"for strategy, score in retrieval_impact.items():\n",
|
| 853 |
-
" notes = RETRIEVAL_STRATEGIES[strategy]['notes']\n",
|
| 854 |
-
" print(f\" {strategy}: {score:.2f}% ({notes})\")\n",
|
| 855 |
-
"\n",
|
| 856 |
-
"# 3. LLM Models\n",
|
| 857 |
-
"print(\"\\n🤖 LLM MODELS:\")\n",
|
| 858 |
-
"llm_impact = df.groupby('LLM_Model')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
|
| 859 |
-
"for model, score in llm_impact.items():\n",
|
| 860 |
-
" print(f\" {model}: {score:.2f}%\")\n",
|
| 861 |
-
"\n",
|
| 862 |
-
"# 4. Prompting Strategies\n",
|
| 863 |
-
"print(\"\\n💬 PROMPTING STRATEGIES:\")\n",
|
| 864 |
-
"prompt_impact = df.groupby('Prompt_Strategy')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
|
| 865 |
-
"for strategy, score in prompt_impact.items():\n",
|
| 866 |
-
" print(f\" {strategy}: {score:.2f}%\")\n",
|
| 867 |
-
"\n",
|
| 868 |
-
"print(\"\\n\" + \"=\"*100)"
|
| 869 |
-
]
|
| 870 |
-
},
|
| 871 |
-
{
|
| 872 |
-
"cell_type": "markdown",
|
| 873 |
-
"metadata": {},
|
| 874 |
-
"source": [
|
| 875 |
-
"import os\n",
|
| 876 |
-
"from pathlib import Path\n",
|
| 877 |
-
"\n",
|
| 878 |
-
"# Create output directory - using dynamic path\n",
|
| 879 |
-
"output_dir = OUTPUT_DIR / 'rag_optimization_benchmark'\n",
|
| 880 |
-
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
| 881 |
-
"\n",
|
| 882 |
-
"fig, axes = plt.subplots(2, 3, figsize=(20, 12))\n",
|
| 883 |
-
"\n",
|
| 884 |
-
"# 1. Top Configurations\n",
|
| 885 |
-
"ax1 = axes[0, 0]\n",
|
| 886 |
-
"top_configs = config_summary.head(10)\n",
|
| 887 |
-
"config_labels = [c.split('_')[-2] + '+' + c.split('_')[-1] for c in top_configs.index]\n",
|
| 888 |
-
"ax1.barh(config_labels, top_configs['LLM_Judge_Score'], color=sns.color_palette('viridis', len(top_configs)))\n",
|
| 889 |
-
"ax1.set_xlabel('LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
|
| 890 |
-
"ax1.set_title('Top 10 Configurations', fontsize=13, fontweight='bold')\n",
|
| 891 |
-
"ax1.set_xlim(0, 100)\n",
|
| 892 |
-
"for i, score in enumerate(top_configs['LLM_Judge_Score']):\n",
|
| 893 |
-
" ax1.text(score + 1, i, f'{score:.1f}', va='center', fontsize=10)\n",
|
| 894 |
-
"\n",
|
| 895 |
-
"# 2. Embedding Model Impact\n",
|
| 896 |
-
"ax2 = axes[0, 1]\n",
|
| 897 |
-
"ax2.bar(embed_impact.index, embed_impact.values, color='skyblue', alpha=0.8)\n",
|
| 898 |
-
"ax2.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
|
| 899 |
-
"ax2.set_title('Embedding Model Impact', fontsize=13, fontweight='bold')\n",
|
| 900 |
-
"ax2.set_ylim(0, 100)\n",
|
| 901 |
-
"ax2.tick_params(axis='x', rotation=45)\n",
|
| 902 |
-
"for i, (model, score) in enumerate(embed_impact.items()):\n",
|
| 903 |
-
" ax2.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n",
|
| 904 |
-
"\n",
|
| 905 |
-
"# 3. Retrieval Strategy Impact\n",
|
| 906 |
-
"ax3 = axes[0, 2]\n",
|
| 907 |
-
"ax3.bar(retrieval_impact.index, retrieval_impact.values, color='coral', alpha=0.8)\n",
|
| 908 |
-
"ax3.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
|
| 909 |
-
"ax3.set_title('Retrieval Strategy Impact', fontsize=13, fontweight='bold')\n",
|
| 910 |
-
"ax3.set_ylim(0, 100)\n",
|
| 911 |
-
"ax3.tick_params(axis='x', rotation=45)\n",
|
| 912 |
-
"for i, (strategy, score) in enumerate(retrieval_impact.items()):\n",
|
| 913 |
-
" ax3.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=9)\n",
|
| 914 |
-
"\n",
|
| 915 |
-
"# 4. LLM Model Impact\n",
|
| 916 |
-
"ax4 = axes[1, 0]\n",
|
| 917 |
-
"ax4.bar(llm_impact.index, llm_impact.values, color='mediumseagreen', alpha=0.8)\n",
|
| 918 |
-
"ax4.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
|
| 919 |
-
"ax4.set_title('LLM Model Impact', fontsize=13, fontweight='bold')\n",
|
| 920 |
-
"ax4.set_ylim(0, 100)\n",
|
| 921 |
-
"ax4.tick_params(axis='x', rotation=45)\n",
|
| 922 |
-
"for i, (model, score) in enumerate(llm_impact.items()):\n",
|
| 923 |
-
" ax4.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n",
|
| 924 |
-
"\n",
|
| 925 |
-
"# 5. Prompting Strategy Impact\n",
|
| 926 |
-
"ax5 = axes[1, 1]\n",
|
| 927 |
-
"ax5.bar(prompt_impact.index, prompt_impact.values, color='mediumpurple', alpha=0.8)\n",
|
| 928 |
-
"ax5.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
|
| 929 |
-
"ax5.set_title('Prompting Strategy Impact', fontsize=13, fontweight='bold')\n",
|
| 930 |
-
"ax5.set_ylim(0, 100)\n",
|
| 931 |
-
"ax5.tick_params(axis='x', rotation=45)\n",
|
| 932 |
-
"for i, (strategy, score) in enumerate(prompt_impact.items()):\n",
|
| 933 |
-
" ax5.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n",
|
| 934 |
-
"\n",
|
| 935 |
-
"# 6. Score Components (best config)\n",
|
| 936 |
-
"ax6 = axes[1, 2]\n",
|
| 937 |
-
"best_config = config_summary.iloc[0]\n",
|
| 938 |
-
"components = ['Accuracy', 'Citation', 'Completeness']\n",
|
| 939 |
-
"scores = [best_config['Accuracy_Score'], best_config['Citation_Score'], best_config['Completeness_Score']]\n",
|
| 940 |
-
"colors_comp = ['#FF6B6B', '#4ECDC4', '#45B7D1']\n",
|
| 941 |
-
"bars = ax6.bar(components, scores, color=colors_comp, alpha=0.8)\n",
|
| 942 |
-
"ax6.set_ylabel('Score (%)', fontsize=11, fontweight='bold')\n",
|
| 943 |
-
"ax6.set_title(f'Best Config Components\\n{best_config.name.split(\"_\")[2]}', fontsize=13, fontweight='bold')\n",
|
| 944 |
-
"ax6.set_ylim(0, 100)\n",
|
| 945 |
-
"for i, score in enumerate(scores):\n",
|
| 946 |
-
" ax6.text(i, score + 2, f'{score:.1f}%', ha='center', fontsize=10, fontweight='bold')\n",
|
| 947 |
-
"\n",
|
| 948 |
-
"plt.tight_layout()\n",
|
| 949 |
-
"plt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\n",
|
| 950 |
-
"plt.show()\n",
|
| 951 |
-
"\n",
|
| 952 |
-
"print(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
|
| 953 |
-
]
|
| 954 |
-
},
|
| 955 |
-
{
|
| 956 |
-
"cell_type": "code",
|
| 957 |
-
"execution_count": null,
|
| 958 |
-
"metadata": {},
|
| 959 |
-
"outputs": [],
|
| 960 |
-
"source": "import os\nfrom pathlib import Path\n\n# Create output directory - using dynamic path\noutput_dir = OUTPUT_DIR / 'rag_optimization_benchmark'\noutput_dir.mkdir(parents=True, exist_ok=True)\n\nfig, axes = plt.subplots(2, 3, figsize=(20, 12))\n\n# 1. Top Configurations\nax1 = axes[0, 0]\ntop_configs = config_summary.head(10)\nconfig_labels = [c.split('_')[-2] + '+' + c.split('_')[-1] for c in top_configs.index]\nax1.barh(config_labels, top_configs['LLM_Judge_Score'], color=sns.color_palette('viridis', len(top_configs)))\nax1.set_xlabel('LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax1.set_title('Top 10 Configurations', fontsize=13, fontweight='bold')\nax1.set_xlim(0, 100)\nfor i, score in enumerate(top_configs['LLM_Judge_Score']):\n ax1.text(score + 1, i, f'{score:.1f}', va='center', fontsize=10)\n\n# 2. Embedding Model Impact\nax2 = axes[0, 1]\nax2.bar(embed_impact.index, embed_impact.values, color='skyblue', alpha=0.8)\nax2.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax2.set_title('Embedding Model Impact', fontsize=13, fontweight='bold')\nax2.set_ylim(0, 100)\nax2.tick_params(axis='x', rotation=45)\nfor i, (model, score) in enumerate(embed_impact.items()):\n ax2.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 3. Retrieval Strategy Impact\nax3 = axes[0, 2]\nax3.bar(retrieval_impact.index, retrieval_impact.values, color='coral', alpha=0.8)\nax3.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax3.set_title('Retrieval Strategy Impact', fontsize=13, fontweight='bold')\nax3.set_ylim(0, 100)\nax3.tick_params(axis='x', rotation=45)\nfor i, (strategy, score) in enumerate(retrieval_impact.items()):\n ax3.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=9)\n\n# 4. 
LLM Model Impact\nax4 = axes[1, 0]\nax4.bar(llm_impact.index, llm_impact.values, color='mediumseagreen', alpha=0.8)\nax4.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax4.set_title('LLM Model Impact', fontsize=13, fontweight='bold')\nax4.set_ylim(0, 100)\nax4.tick_params(axis='x', rotation=45)\nfor i, (model, score) in enumerate(llm_impact.items()):\n ax4.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 5. Prompting Strategy Impact\nax5 = axes[1, 1]\nax5.bar(prompt_impact.index, prompt_impact.values, color='mediumpurple', alpha=0.8)\nax5.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax5.set_title('Prompting Strategy Impact', fontsize=13, fontweight='bold')\nax5.set_ylim(0, 100)\nax5.tick_params(axis='x', rotation=45)\nfor i, (strategy, score) in enumerate(prompt_impact.items()):\n ax5.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 6. Score Components (best config)\nax6 = axes[1, 2]\nbest_config = config_summary.iloc[0]\ncomponents = ['Accuracy', 'Citation', 'Completeness']\nscores = [best_config['Accuracy_Score'], best_config['Citation_Score'], best_config['Completeness_Score']]\ncolors_comp = ['#FF6B6B', '#4ECDC4', '#45B7D1']\nbars = ax6.bar(components, scores, color=colors_comp, alpha=0.8)\nax6.set_ylabel('Score (%)', fontsize=11, fontweight='bold')\nax6.set_title(f'Best Config Components\\n{best_config.name.split(\"_\")[2]}', fontsize=13, fontweight='bold')\nax6.set_ylim(0, 100)\nfor i, score in enumerate(scores):\n ax6.text(i, score + 2, f'{score:.1f}%', ha='center', fontsize=10, fontweight='bold')\n\nplt.tight_layout()\nplt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\nplt.show()\n\nprint(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
|
| 961 |
-
},
|
| 962 |
-
{
|
| 963 |
-
"cell_type": "code",
|
| 964 |
-
"execution_count": null,
|
| 965 |
-
"metadata": {},
|
| 966 |
-
"outputs": [],
|
| 967 |
-
"source": [
|
| 968 |
-
"best_config = config_summary.iloc[0]\n",
|
| 969 |
-
"\n",
|
| 970 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 971 |
-
"print(\"🏆 OPTIMAL RAG CONFIGURATION\")\n",
|
| 972 |
-
"print(\"=\"*100)\n",
|
| 973 |
-
"\n",
|
| 974 |
-
"print(f\"\\n✅ Best Configuration: {best_config.name}\")\n",
|
| 975 |
-
"print(f\"\\n📊 Performance:\")\n",
|
| 976 |
-
"print(f\" LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%\")\n",
|
| 977 |
-
"print(f\" Accuracy: {best_config['Accuracy_Score']:.2f}%\")\n",
|
| 978 |
-
"print(f\" Citation Quality: {best_config['Citation_Score']:.2f}%\")\n",
|
| 979 |
-
"print(f\" Completeness: {best_config['Completeness_Score']:.2f}%\")\n",
|
| 980 |
-
"print(f\" Avg Response Time: {best_config['Response_Time']:.2f}s\")\n",
|
| 981 |
-
"\n",
|
| 982 |
-
"print(f\"\\n⚙️ Components:\")\n",
|
| 983 |
-
"print(f\" Embedding Model: {best_config['Embedding_Model']}\")\n",
|
| 984 |
-
"print(f\" → {EMBEDDING_MODELS[best_config['Embedding_Model']]['name']}\")\n",
|
| 985 |
-
"print(f\" Retrieval Strategy: {best_config['Retrieval_Strategy']}\")\n",
|
| 986 |
-
"print(f\" → {RETRIEVAL_STRATEGIES[best_config['Retrieval_Strategy']]['notes']}\")\n",
|
| 987 |
-
"print(f\" LLM Model: {best_config['LLM_Model']}\")\n",
|
| 988 |
-
"print(f\" Prompting Strategy: {best_config['Prompt_Strategy']}\")\n",
|
| 989 |
-
"\n",
|
| 990 |
-
"print(f\"\\n💡 Key Findings:\")\n",
|
| 991 |
-
"print(f\" 1. Best Embedding: {embed_impact.index[0]} ({embed_impact.values[0]:.2f}%)\")\n",
|
| 992 |
-
"print(f\" 2. Best Retrieval: {retrieval_impact.index[0]} ({retrieval_impact.values[0]:.2f}%)\")\n",
|
| 993 |
-
"print(f\" 3. Best LLM: {llm_impact.index[0]} ({llm_impact.values[0]:.2f}%)\")\n",
|
| 994 |
-
"print(f\" 4. Best Prompt: {prompt_impact.index[0]} ({prompt_impact.values[0]:.2f}%)\")\n",
|
| 995 |
-
"\n",
|
| 996 |
-
"print(f\"\\n🎯 Hackathon Impact:\")\n",
|
| 997 |
-
"print(f\" LLM Quality = 30% of total score\")\n",
|
| 998 |
-
"print(f\" Your score: {best_config['LLM_Judge_Score']:.2f}% × 30% = {best_config['LLM_Judge_Score'] * 0.3:.2f} points\")\n",
|
| 999 |
-
"\n",
|
| 1000 |
-
"baseline = df[df['Config'].str.contains('baseline')].iloc[0] if len(df[df['Config'].str.contains('baseline')]) > 0 else None\n",
|
| 1001 |
-
"if baseline is not None:\n",
|
| 1002 |
-
" improvement = best_config['LLM_Judge_Score'] - baseline['LLM_Judge_Score']\n",
|
| 1003 |
-
" print(f\"\\n📈 Improvement vs Baseline:\")\n",
|
| 1004 |
-
" print(f\" +{improvement:.2f}% quality improvement\")\n",
|
| 1005 |
-
" print(f\" = +{improvement * 0.3:.2f} hackathon points\")\n",
|
| 1006 |
-
"\n",
|
| 1007 |
-
"print(\"\\n\" + \"=\"*100)\n",
|
| 1008 |
-
"print(\"📝 IMPLEMENTATION CHECKLIST\")\n",
|
| 1009 |
-
"print(\"=\"*100)\n",
|
| 1010 |
-
"print(f\"\\n1. Use embedding model: {EMBEDDING_MODELS[best_config['Embedding_Model']]['name']}\")\n",
|
| 1011 |
-
"print(f\"2. Implement retrieval: {best_config['Retrieval_Strategy']}\")\n",
|
| 1012 |
-
"print(f\"3. Use LLM model: {best_config['LLM_Model']}\")\n",
|
| 1013 |
-
"print(f\"4. Apply prompt: {best_config['Prompt_Strategy']}\")\n",
|
| 1014 |
-
"print(f\"\\n5. Expected performance:\")\n",
|
| 1015 |
-
"print(f\" - LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%\")\n",
|
| 1016 |
-
"print(f\" - Response time: ~{best_config['Response_Time']:.1f}s\")\n",
|
| 1017 |
-
"print(\"=\"*100)"
|
| 1018 |
-
]
|
| 1019 |
-
},
|
| 1020 |
-
{
|
| 1021 |
-
"cell_type": "markdown",
|
| 1022 |
-
"metadata": {},
|
| 1023 |
-
"source": [
|
| 1024 |
-
"# Save results\n",
|
| 1025 |
-
"from pathlib import Path\n",
|
| 1026 |
-
"\n",
|
| 1027 |
-
"# Using dynamic path\n",
|
| 1028 |
-
"output_dir = OUTPUT_DIR / 'rag_optimization_benchmark'\n",
|
| 1029 |
-
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
| 1030 |
-
"\n",
|
| 1031 |
-
"df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
|
| 1032 |
-
"config_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n",
|
| 1033 |
-
"\n",
|
| 1034 |
-
"# Save component impacts\n",
|
| 1035 |
-
"impacts = pd.DataFrame({\n",
|
| 1036 |
-
" 'Embedding_Impact': embed_impact,\n",
|
| 1037 |
-
" 'Retrieval_Impact': retrieval_impact.reindex(embed_impact.index, fill_value=0),\n",
|
| 1038 |
-
" 'LLM_Impact': llm_impact.reindex(embed_impact.index, fill_value=0),\n",
|
| 1039 |
-
" 'Prompt_Impact': prompt_impact.reindex(embed_impact.index, fill_value=0)\n",
|
| 1040 |
-
"}).fillna(0)\n",
|
| 1041 |
-
"impacts.to_csv(output_dir / 'component_impacts.csv', encoding='utf-8')\n",
|
| 1042 |
-
"\n",
|
| 1043 |
-
"print(\"\\n✅ Results exported to output/rag_optimization_benchmark/:\")\n",
|
| 1044 |
-
"print(\" - detailed_results.csv (all tests)\")\n",
|
| 1045 |
-
"print(\" - summary.csv (config rankings)\")\n",
|
| 1046 |
-
"print(\" - component_impacts.csv (component analysis)\")\n",
|
| 1047 |
-
"print(\" - results.png (visualizations)\")"
|
| 1048 |
-
]
|
| 1049 |
-
}
|
| 1050 |
-
],
|
| 1051 |
-
"metadata": {
|
| 1052 |
-
"kernelspec": {
|
| 1053 |
-
"display_name": "venv",
|
| 1054 |
-
"language": "python",
|
| 1055 |
-
"name": "python3"
|
| 1056 |
-
},
|
| 1057 |
-
"language_info": {
|
| 1058 |
-
"codemirror_mode": {
|
| 1059 |
-
"name": "ipython",
|
| 1060 |
-
"version": 3
|
| 1061 |
-
},
|
| 1062 |
-
"file_extension": ".py",
|
| 1063 |
-
"mimetype": "text/x-python",
|
| 1064 |
-
"name": "python",
|
| 1065 |
-
"nbconvert_exporter": "python",
|
| 1066 |
-
"pygments_lexer": "ipython3",
|
| 1067 |
-
"version": "3.10.12"
|
| 1068 |
-
}
|
| 1069 |
-
},
|
| 1070 |
-
"nbformat": 4,
|
| 1071 |
-
"nbformat_minor": 4
|
| 1072 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/{requirements_rag_optimization.txt → requirements.txt}
RENAMED
|
@@ -1,19 +1,24 @@
|
|
| 1 |
-
#
|
| 2 |
-
#
|
|
|
|
| 3 |
|
| 4 |
-
# Azure OpenAI
|
| 5 |
openai==1.54.0
|
| 6 |
|
| 7 |
-
# Vector Database
|
| 8 |
pinecone-client==5.0.0
|
| 9 |
|
| 10 |
-
# Embeddings and
|
| 11 |
sentence-transformers==3.3.1
|
| 12 |
|
| 13 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
jiwer==3.0.3
|
| 15 |
|
| 16 |
-
# Data
|
| 17 |
pandas==2.1.3
|
| 18 |
matplotlib==3.8.2
|
| 19 |
seaborn==0.13.0
|
|
|
|
| 1 |
+
# Notebooks Requirements
|
| 2 |
+
# All dependencies for VLM OCR, LLM Benchmark, and RAG Optimization notebooks
|
| 3 |
+
# Install with: pip install -r requirements.txt
|
| 4 |
|
| 5 |
+
# Azure OpenAI (for all notebooks)
|
| 6 |
openai==1.54.0
|
| 7 |
|
| 8 |
+
# Vector Database (for LLM and RAG notebooks)
|
| 9 |
pinecone-client==5.0.0
|
| 10 |
|
| 11 |
+
# Embeddings and Transformers (for LLM and RAG notebooks)
|
| 12 |
sentence-transformers==3.3.1
|
| 13 |
|
| 14 |
+
# PDF Processing (for VLM OCR notebook)
|
| 15 |
+
PyMuPDF==1.23.8
|
| 16 |
+
Pillow==10.1.0
|
| 17 |
+
|
| 18 |
+
# Metrics (for all notebooks)
|
| 19 |
jiwer==3.0.3
|
| 20 |
|
| 21 |
+
# Data Analysis and Visualization (for all notebooks)
|
| 22 |
pandas==2.1.3
|
| 23 |
matplotlib==3.8.2
|
| 24 |
seaborn==0.13.0
|
notebooks/requirements_vlm_ocr.txt
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
# VLM OCR Benchmarking Requirements
|
| 2 |
-
# Install with: pip install -r requirements_vlm_ocr.txt
|
| 3 |
-
|
| 4 |
-
# Azure OpenAI client (for vision models)
|
| 5 |
-
openai==1.54.0
|
| 6 |
-
|
| 7 |
-
# PDF processing
|
| 8 |
-
PyMuPDF==1.23.8
|
| 9 |
-
Pillow==10.1.0
|
| 10 |
-
|
| 11 |
-
# Metrics
|
| 12 |
-
jiwer==3.0.3
|
| 13 |
-
|
| 14 |
-
# Data analysis and visualization
|
| 15 |
-
pandas==2.1.3
|
| 16 |
-
matplotlib==3.8.2
|
| 17 |
-
seaborn==0.13.0
|
| 18 |
-
|
| 19 |
-
# Utilities
|
| 20 |
-
python-dotenv==1.0.0
|
| 21 |
-
|
| 22 |
-
# Jupyter
|
| 23 |
-
jupyter==1.0.0
|
| 24 |
-
ipykernel==6.27.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/vlm_ocr_benchmark.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/vlm_ocr_benchmark.ipynb.backup
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scripts/README.md
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Scripts Directory
|
| 2 |
+
|
| 3 |
+
One-time utility scripts for SOCAR Hackathon project.
|
| 4 |
+
|
| 5 |
+
## Available Scripts
|
| 6 |
+
|
| 7 |
+
### 📊 Data Management
|
| 8 |
+
|
| 9 |
+
#### `check_pinecone.py`
|
| 10 |
+
Check Pinecone vector database status and statistics.
|
| 11 |
+
|
| 12 |
+
```bash
|
| 13 |
+
python scripts/check_pinecone.py
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
**Output:**
|
| 17 |
+
- Total vector count
|
| 18 |
+
- Index dimensions
|
| 19 |
+
- Namespaces (if any)
|
| 20 |
+
- Connection status
|
| 21 |
+
|
| 22 |
+
#### `clear_pinecone.py`
|
| 23 |
+
Clear all data from Pinecone index before re-ingestion.
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
python scripts/clear_pinecone.py
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
**⚠️ WARNING**: This deletes ALL vectors! Requires typing 'DELETE' to confirm.
|
| 30 |
+
|
| 31 |
+
**Use case:**
|
| 32 |
+
- Before re-ingesting documents with new chunking strategy
|
| 33 |
+
- Testing with fresh data
|
| 34 |
+
- Cleaning up after experiments
|
| 35 |
+
|
| 36 |
+
### 🤖 Azure OpenAI
|
| 37 |
+
|
| 38 |
+
#### `list_azure_models.py`
|
| 39 |
+
List all deployed Azure OpenAI models.
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
python scripts/list_azure_models.py
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
**Output:**
|
| 46 |
+
- Vision models (GPT-4.1, GPT-5, Claude, etc.)
|
| 47 |
+
- Text models (Llama, DeepSeek, etc.)
|
| 48 |
+
- Total count and categorization
|
| 49 |
+
|
| 50 |
+
**Use case:**
|
| 51 |
+
- Verify which models are deployed
|
| 52 |
+
- Check model availability before updating notebooks
|
| 53 |
+
- Debugging 404 errors
|
| 54 |
+
|
| 55 |
+
## Setup
|
| 56 |
+
|
| 57 |
+
All scripts use environment variables from `.env` file:
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
# Required in .env
|
| 61 |
+
PINECONE_API_KEY=your_key
|
| 62 |
+
PINECONE_INDEX_NAME=hackathon
|
| 63 |
+
AZURE_OPENAI_API_KEY=your_key
|
| 64 |
+
AZURE_OPENAI_ENDPOINT=your_endpoint
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
## Dependencies
|
| 68 |
+
|
| 69 |
+
Scripts use the same dependencies as the main project:
|
| 70 |
+
- `python-dotenv` - Environment variables
|
| 71 |
+
- `pinecone-client` - Vector database
|
| 72 |
+
- `openai` - Azure OpenAI
|
| 73 |
+
|
| 74 |
+
Install from project root:
|
| 75 |
+
```bash
|
| 76 |
+
pip install -r notebooks/requirements.txt
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
## Common Workflows
|
| 80 |
+
|
| 81 |
+
### Re-ingesting Documents
|
| 82 |
+
|
| 83 |
+
```bash
|
| 84 |
+
# 1. Check current data
|
| 85 |
+
python scripts/check_pinecone.py
|
| 86 |
+
|
| 87 |
+
# 2. Clear existing data
|
| 88 |
+
python scripts/clear_pinecone.py
|
| 89 |
+
|
| 90 |
+
# 3. Run ingestion script (not included - create as needed)
|
| 91 |
+
# python scripts/ingest_documents.py
|
| 92 |
+
|
| 93 |
+
# 4. Verify new data
|
| 94 |
+
python scripts/check_pinecone.py
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
### Verifying Model Availability
|
| 98 |
+
|
| 99 |
+
```bash
|
| 100 |
+
# List all deployed models
|
| 101 |
+
python scripts/list_azure_models.py
|
| 102 |
+
|
| 103 |
+
# Check if specific model exists in output
|
| 104 |
+
python scripts/list_azure_models.py | grep "Llama-3.2-Vision"
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
## Adding New Scripts
|
| 108 |
+
|
| 109 |
+
When creating new scripts:
|
| 110 |
+
1. Add descriptive docstring at top
|
| 111 |
+
2. Use environment variables from `.env`
|
| 112 |
+
3. Include error handling with helpful messages
|
| 113 |
+
4. Update this README with usage instructions
|
| 114 |
+
5. Follow existing naming convention: `verb_noun.py`
|
| 115 |
+
|
| 116 |
+
## Examples
|
| 117 |
+
|
| 118 |
+
### Safe Pinecone Cleanup
|
| 119 |
+
```python
|
| 120 |
+
# First check what's there
|
| 121 |
+
$ python scripts/check_pinecone.py
|
| 122 |
+
Total Vectors: 1,300
|
| 123 |
+
Dimensions: 1024
|
| 124 |
+
|
| 125 |
+
# Then clear if needed
|
| 126 |
+
$ python scripts/clear_pinecone.py
|
| 127 |
+
⚠️ WARNING: This will delete ALL 1,300 vectors!
|
| 128 |
+
Type 'DELETE' to confirm: DELETE
|
| 129 |
+
✅ Deletion completed!
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
### Check Vision Models
|
| 133 |
+
```python
|
| 134 |
+
$ python scripts/list_azure_models.py
|
| 135 |
+
|
| 136 |
+
🖼️ Vision Models (6):
|
| 137 |
+
✅ gpt-4.1
|
| 138 |
+
✅ gpt-5
|
| 139 |
+
✅ gpt-5-mini
|
| 140 |
+
✅ claude-sonnet-4-5
|
| 141 |
+
✅ claude-opus-4-1
|
| 142 |
+
✅ Phi-4-multimodal-instruct
|
| 143 |
+
```
|
scripts/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""SOCAR Hackathon utility scripts"""
|
scripts/check_pinecone.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Check Pinecone index status and statistics
|
| 3 |
+
Quick utility to inspect vector database
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from pinecone import Pinecone
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
def check_pinecone_status():
|
| 14 |
+
"""Display Pinecone index information"""
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
# Initialize Pinecone
|
| 18 |
+
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
|
| 19 |
+
index_name = os.getenv('PINECONE_INDEX_NAME', 'hackathon')
|
| 20 |
+
index = pc.Index(index_name)
|
| 21 |
+
|
| 22 |
+
# Get index statistics
|
| 23 |
+
stats = index.describe_index_stats()
|
| 24 |
+
|
| 25 |
+
print("="*80)
|
| 26 |
+
print("PINECONE INDEX STATUS")
|
| 27 |
+
print("="*80)
|
| 28 |
+
|
| 29 |
+
print(f"\n📊 Index Information:")
|
| 30 |
+
print(f" Name: {index_name}")
|
| 31 |
+
print(f" Total Vectors: {stats.get('total_vector_count', 0):,}")
|
| 32 |
+
print(f" Dimensions: {stats.get('dimension', 'N/A')}")
|
| 33 |
+
|
| 34 |
+
# Check namespaces if any
|
| 35 |
+
if 'namespaces' in stats and stats['namespaces']:
|
| 36 |
+
print(f"\n📁 Namespaces:")
|
| 37 |
+
for ns_name, ns_stats in stats['namespaces'].items():
|
| 38 |
+
ns_display = ns_name if ns_name else "(default)"
|
| 39 |
+
print(f" {ns_display}: {ns_stats.get('vector_count', 0):,} vectors")
|
| 40 |
+
|
| 41 |
+
# Index configuration
|
| 42 |
+
print(f"\n⚙️ Configuration:")
|
| 43 |
+
print(f" API Key: {os.getenv('PINECONE_API_KEY')[:10]}..." if os.getenv('PINECONE_API_KEY') else " API Key: Not set")
|
| 44 |
+
|
| 45 |
+
# Connection status
|
| 46 |
+
if stats.get('total_vector_count', 0) > 0:
|
| 47 |
+
print(f"\n✅ Status: Connected and populated")
|
| 48 |
+
else:
|
| 49 |
+
print(f"\n⚠️ Status: Connected but empty")
|
| 50 |
+
|
| 51 |
+
except Exception as e:
|
| 52 |
+
print("="*80)
|
| 53 |
+
print("PINECONE CONNECTION ERROR")
|
| 54 |
+
print("="*80)
|
| 55 |
+
print(f"\n❌ Error: {e}")
|
| 56 |
+
print("\nPlease check:")
|
| 57 |
+
print(" 1. PINECONE_API_KEY in .env file")
|
| 58 |
+
print(" 2. PINECONE_INDEX_NAME in .env file")
|
| 59 |
+
print(" 3. Index exists in your Pinecone account")
|
| 60 |
+
|
| 61 |
+
if __name__ == "__main__":
|
| 62 |
+
check_pinecone_status()
|
scripts/clear_pinecone.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Clear all data from Pinecone index
|
| 3 |
+
One-time script for data cleanup before re-ingestion
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from pinecone import Pinecone
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
def clear_pinecone_index():
|
| 14 |
+
"""Delete all vectors from Pinecone index"""
|
| 15 |
+
|
| 16 |
+
# Initialize Pinecone
|
| 17 |
+
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
|
| 18 |
+
index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))
|
| 19 |
+
|
| 20 |
+
# Get current stats
|
| 21 |
+
stats = index.describe_index_stats()
|
| 22 |
+
total_vectors = stats['total_vector_count']
|
| 23 |
+
|
| 24 |
+
print("="*80)
|
| 25 |
+
print("PINECONE DATA CLEANUP")
|
| 26 |
+
print("="*80)
|
| 27 |
+
print(f"\nIndex: {os.getenv('PINECONE_INDEX_NAME', 'hackathon')}")
|
| 28 |
+
print(f"Current vectors: {total_vectors}")
|
| 29 |
+
print(f"Dimensions: {stats.get('dimension', 'N/A')}")
|
| 30 |
+
|
| 31 |
+
if total_vectors == 0:
|
| 32 |
+
print("\n✅ Index is already empty. Nothing to delete.")
|
| 33 |
+
return
|
| 34 |
+
|
| 35 |
+
# Confirm deletion
|
| 36 |
+
print(f"\n⚠️ WARNING: This will delete ALL {total_vectors} vectors!")
|
| 37 |
+
confirm = input("Type 'DELETE' to confirm: ")
|
| 38 |
+
|
| 39 |
+
if confirm != 'DELETE':
|
| 40 |
+
print("\n❌ Deletion cancelled. No data was removed.")
|
| 41 |
+
return
|
| 42 |
+
|
| 43 |
+
print("\n🗑️ Deleting all vectors...")
|
| 44 |
+
|
| 45 |
+
try:
|
| 46 |
+
# Delete all vectors
|
| 47 |
+
index.delete(delete_all=True)
|
| 48 |
+
|
| 49 |
+
print("✅ Deletion completed!")
|
| 50 |
+
|
| 51 |
+
# Verify deletion
|
| 52 |
+
import time
|
| 53 |
+
time.sleep(2) # Wait for deletion to propagate
|
| 54 |
+
|
| 55 |
+
stats = index.describe_index_stats()
|
| 56 |
+
remaining = stats['total_vector_count']
|
| 57 |
+
|
| 58 |
+
print(f"\n📊 Final status:")
|
| 59 |
+
print(f" Remaining vectors: {remaining}")
|
| 60 |
+
|
| 61 |
+
if remaining == 0:
|
| 62 |
+
print(" ✅ Index successfully cleared!")
|
| 63 |
+
else:
|
| 64 |
+
print(f" ⚠️ {remaining} vectors still remain (may need a moment to sync)")
|
| 65 |
+
|
| 66 |
+
except Exception as e:
|
| 67 |
+
print(f"\n❌ Error during deletion: {e}")
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
clear_pinecone_index()
|
scripts/list_azure_models.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
List all deployed Azure OpenAI models
|
| 3 |
+
Useful for verifying available models
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from openai import AzureOpenAI
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
def list_azure_models():
|
| 14 |
+
"""List all deployed Azure OpenAI models"""
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
client = AzureOpenAI(
|
| 18 |
+
api_key=os.getenv('AZURE_OPENAI_API_KEY'),
|
| 19 |
+
api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),
|
| 20 |
+
azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
print("="*80)
|
| 24 |
+
print("AZURE OPENAI DEPLOYED MODELS")
|
| 25 |
+
print("="*80)
|
| 26 |
+
|
| 27 |
+
# List models
|
| 28 |
+
models = client.models.list()
|
| 29 |
+
|
| 30 |
+
print(f"\n📊 Total Models: {len(list(models))}")
|
| 31 |
+
print(f"\nDeployed Models:")
|
| 32 |
+
print("-" * 80)
|
| 33 |
+
|
| 34 |
+
model_list = []
|
| 35 |
+
for model in models:
|
| 36 |
+
model_list.append({
|
| 37 |
+
'id': model.id,
|
| 38 |
+
'created': model.created if hasattr(model, 'created') else 'N/A'
|
| 39 |
+
})
|
| 40 |
+
|
| 41 |
+
# Sort by id
|
| 42 |
+
model_list.sort(key=lambda x: x['id'])
|
| 43 |
+
|
| 44 |
+
# Categorize models
|
| 45 |
+
vision_models = []
|
| 46 |
+
text_models = []
|
| 47 |
+
|
| 48 |
+
for model in model_list:
|
| 49 |
+
model_id = model['id']
|
| 50 |
+
if any(keyword in model_id.lower() for keyword in ['vision', 'multimodal', 'gpt-4.1', 'gpt-5', 'claude']):
|
| 51 |
+
vision_models.append(model_id)
|
| 52 |
+
else:
|
| 53 |
+
text_models.append(model_id)
|
| 54 |
+
|
| 55 |
+
print(f"\n🖼️ Vision Models ({len(vision_models)}):")
|
| 56 |
+
for model_id in vision_models:
|
| 57 |
+
print(f" ✅ {model_id}")
|
| 58 |
+
|
| 59 |
+
print(f"\n📝 Text Models ({len(text_models)}):")
|
| 60 |
+
for model_id in text_models:
|
| 61 |
+
print(f" ✅ {model_id}")
|
| 62 |
+
|
| 63 |
+
print("\n" + "="*80)
|
| 64 |
+
|
| 65 |
+
except Exception as e:
|
| 66 |
+
print("="*80)
|
| 67 |
+
print("AZURE OPENAI CONNECTION ERROR")
|
| 68 |
+
print("="*80)
|
| 69 |
+
print(f"\n❌ Error: {e}")
|
| 70 |
+
print("\nPlease check:")
|
| 71 |
+
print(" 1. AZURE_OPENAI_API_KEY in .env file")
|
| 72 |
+
print(" 2. AZURE_OPENAI_ENDPOINT in .env file")
|
| 73 |
+
print(" 3. API version compatibility")
|
| 74 |
+
|
| 75 |
+
if __name__ == "__main__":
|
| 76 |
+
list_azure_models()
|
test_api.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Quick test script for SOCAR LLM API
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
import json
|
| 7 |
+
from docs.sample_questions import questions
|
| 8 |
+
|
| 9 |
+
# API base URL
|
| 10 |
+
BASE_URL = "http://localhost:8000"
|
| 11 |
+
|
| 12 |
+
def test_health():
|
| 13 |
+
"""Test health endpoint"""
|
| 14 |
+
print("🔍 Testing health endpoint...")
|
| 15 |
+
response = requests.get(f"{BASE_URL}/health")
|
| 16 |
+
print(f"Status: {response.status_code}")
|
| 17 |
+
print(json.dumps(response.json(), indent=2))
|
| 18 |
+
print()
|
| 19 |
+
|
| 20 |
+
def test_root():
|
| 21 |
+
"""Test root endpoint"""
|
| 22 |
+
print("🔍 Testing root endpoint...")
|
| 23 |
+
response = requests.get(BASE_URL)
|
| 24 |
+
print(f"Status: {response.status_code}")
|
| 25 |
+
print(json.dumps(response.json(), indent=2))
|
| 26 |
+
print()
|
| 27 |
+
|
| 28 |
+
def test_llm(question: str):
|
| 29 |
+
"""Test LLM endpoint"""
|
| 30 |
+
print(f"🔍 Testing LLM endpoint...")
|
| 31 |
+
print(f"Question: {question}\n")
|
| 32 |
+
|
| 33 |
+
payload = {
|
| 34 |
+
"messages": [
|
| 35 |
+
{"role": "user", "content": question}
|
| 36 |
+
],
|
| 37 |
+
"temperature": 0.2,
|
| 38 |
+
"max_tokens": 1000
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
response = requests.post(f"{BASE_URL}/llm", json=payload)
|
| 42 |
+
print(f"Status: {response.status_code}")
|
| 43 |
+
|
| 44 |
+
if response.status_code == 200:
|
| 45 |
+
result = response.json()
|
| 46 |
+
print(f"Response time: {result['response_time']}s")
|
| 47 |
+
print(f"Model: {result['model']}")
|
| 48 |
+
print(f"\nAnswer:\n{result['response']}")
|
| 49 |
+
print(f"\nSources:")
|
| 50 |
+
for source in result['sources']:
|
| 51 |
+
print(f" - {source['pdf_name']}, Page {source['page_number']} (score: {source['relevance_score']})")
|
| 52 |
+
else:
|
| 53 |
+
print(f"Error: {response.text}")
|
| 54 |
+
print()
|
| 55 |
+
|
| 56 |
+
if __name__ == "__main__":
|
| 57 |
+
print("="*80)
|
| 58 |
+
print("SOCAR LLM API Test Suite")
|
| 59 |
+
print("="*80)
|
| 60 |
+
print()
|
| 61 |
+
|
| 62 |
+
# Test health
|
| 63 |
+
try:
|
| 64 |
+
test_health()
|
| 65 |
+
except Exception as e:
|
| 66 |
+
print(f"❌ Health check failed: {e}\n")
|
| 67 |
+
|
| 68 |
+
# Test root
|
| 69 |
+
try:
|
| 70 |
+
test_root()
|
| 71 |
+
except Exception as e:
|
| 72 |
+
print(f"❌ Root endpoint failed: {e}\n")
|
| 73 |
+
|
| 74 |
+
# Test LLM with sample question
|
| 75 |
+
try:
|
| 76 |
+
test_llm("Palçıq vulkanlarının təsir radiusu nə qədərdir?")
|
| 77 |
+
except Exception as e:
|
| 78 |
+
print(f"❌ LLM endpoint failed: {e}\n")
|
| 79 |
+
|
| 80 |
+
print("="*80)
|
| 81 |
+
print("✅ Test suite completed!")
|
| 82 |
+
print("="*80)
|