IsmatS commited on
Commit
1add76f
·
1 Parent(s): 0591d5f
.dockerignore ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ ENV/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # Jupyter
27
+ notebooks/
28
+ *.ipynb
29
+ *.ipynb_checkpoints/
30
+ output/
31
+
32
+ # Data files
33
+ data/
34
+ *.pdf
35
+ *.csv
36
+ *.json
37
+
38
+ # Documentation
39
+ docs/
40
+ README.md
41
+ *.md
42
+
43
+ # Git
44
+ .git/
45
+ .gitignore
46
+ .gitattributes
47
+
48
+ # IDE
49
+ .vscode/
50
+ .idea/
51
+ *.swp
52
+ *.swo
53
+ *~
54
+
55
+ # OS
56
+ .DS_Store
57
+ Thumbs.db
58
+
59
+ # Environment
60
+ .env.local
61
+ .env.development
62
+ *.log
63
+
64
+ # Docker
65
+ docker-compose.override.yml
66
+ Dockerfile.dev
.gitignore CHANGED
@@ -1,6 +1,8 @@
1
  .claude
2
- /docs
3
- /data
 
 
4
  .env
5
  .env.local
6
  .env.development.local
 
1
  .claude
2
+
3
+ data/pdfs
4
+ data/vector_db
5
+ data/ai_track_data
6
  .env
7
  .env.local
8
  .env.development.local
DEPLOYMENT.md ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SOCAR Hackathon - LLM API Deployment Guide
2
+
3
+ ## Overview
4
+
5
+ Production-ready FastAPI service for SOCAR historical documents chatbot.
6
+
7
+ **Configuration (Based on RAG Optimization Benchmark):**
8
+ - **Model**: Llama-4-Maverick-17B-128E-Instruct-FP8 (Open-source)
9
+ - **Embedding**: BAAI/bge-large-en-v1.5
10
+ - **Retrieval**: Top-3 vanilla
11
+ - **Prompt Strategy**: Citation-focused
12
+ - **Performance**: 55.67% LLM Judge Score, 73.33% Citation Score, ~3.6s response time
13
+
14
+ ## Quick Start
15
+
16
+ ### Prerequisites
17
+ - Docker and Docker Compose installed
18
+ - `.env` file with API keys (see `.env.example`)
19
+
20
+ ### 1. Configure Environment
21
+
22
+ ```bash
23
+ cp .env.example .env
24
+ # Edit .env with your actual API keys:
25
+ # - AZURE_OPENAI_API_KEY
26
+ # - AZURE_OPENAI_ENDPOINT
27
+ # - PINECONE_API_KEY
28
+ # - PINECONE_INDEX_NAME
29
+ ```
30
+
31
+ ### 2. Build and Run with Docker
32
+
33
+ ```bash
34
+ # Build the image
35
+ docker-compose build
36
+
37
+ # Start the service
38
+ docker-compose up -d
39
+
40
+ # Check logs
41
+ docker-compose logs -f llm-api
42
+
43
+ # Check health
44
+ curl http://localhost:8000/health
45
+ ```
46
+
47
+ ### 3. Test the API
48
+
49
+ ```bash
50
+ # Simple health check
51
+ curl http://localhost:8000/
52
+
53
+ # Test LLM endpoint
54
+ curl -X POST http://localhost:8000/llm \
55
+ -H "Content-Type: application/json" \
56
+ -d '{
57
+ "messages": [
58
+ {"role": "user", "content": "Palçıq vulkanlarının təsir radiusu nə qədərdir?"}
59
+ ]
60
+ }'
61
+ ```
62
+
63
+ ## API Endpoints
64
+
65
+ ### GET `/`
66
+ Root endpoint with service information.
67
+
68
+ **Response:**
69
+ ```json
70
+ {
71
+ "status": "healthy",
72
+ "service": "SOCAR LLM Chatbot",
73
+ "version": "1.0.0",
74
+ "model": "Llama-4-Maverick-17B (open-source)",
75
+ "configuration": {
76
+ "embedding": "BAAI/bge-large-en-v1.5",
77
+ "retrieval": "top-3 vanilla",
78
+ "prompt": "citation_focused",
79
+ "benchmark_score": "55.67%"
80
+ }
81
+ }
82
+ ```
83
+
84
+ ### GET `/health`
85
+ Detailed health check with service status.
86
+
87
+ **Response:**
88
+ ```json
89
+ {
90
+ "status": "healthy",
91
+ "pinecone": {
92
+ "connected": true,
93
+ "total_vectors": 1300
94
+ },
95
+ "azure_openai": "connected",
96
+ "embedding_model": "loaded"
97
+ }
98
+ ```
99
+
100
+ ### POST `/llm`
101
+ Main chatbot endpoint.
102
+
103
+ **Request:**
104
+ ```json
105
+ {
106
+ "messages": [
107
+ {"role": "user", "content": "Your question here"}
108
+ ],
109
+ "temperature": 0.2,
110
+ "max_tokens": 1000
111
+ }
112
+ ```
113
+
114
+ **Response:**
115
+ ```json
116
+ {
117
+ "response": "Answer with citations...",
118
+ "sources": [
119
+ {
120
+ "pdf_name": "document_00.pdf",
121
+ "page_number": "5",
122
+ "relevance_score": "0.892"
123
+ }
124
+ ],
125
+ "response_time": 3.61,
126
+ "model": "Llama-4-Maverick-17B-128E-Instruct-FP8"
127
+ }
128
+ ```
129
+
130
+ ## Development Mode
131
+
132
+ ### Run locally without Docker
133
+
134
+ ```bash
135
+ # Install dependencies
136
+ cd app
137
+ pip install -r requirements.txt
138
+
139
+ # Run with uvicorn
140
+ uvicorn main:app --reload --host 0.0.0.0 --port 8000
141
+ ```
142
+
143
+ ### Access API documentation
144
+
145
+ Once running, visit:
146
+ - **Swagger UI**: http://localhost:8000/docs
147
+ - **ReDoc**: http://localhost:8000/redoc
148
+
149
+ ## Production Deployment
150
+
151
+ ### Environment Variables
152
+
153
+ Required in `.env`:
154
+ ```bash
155
+ # Azure OpenAI
156
+ AZURE_OPENAI_API_KEY=your_key_here
157
+ AZURE_OPENAI_ENDPOINT=your_endpoint_here
158
+ AZURE_OPENAI_API_VERSION=2024-08-01-preview
159
+
160
+ # Pinecone
161
+ PINECONE_API_KEY=your_key_here
162
+ PINECONE_INDEX_NAME=hackathon
163
+ ```
164
+
165
+ ### Docker Commands
166
+
167
+ ```bash
168
+ # Build
169
+ docker-compose build --no-cache
170
+
171
+ # Start in background
172
+ docker-compose up -d
173
+
174
+ # View logs
175
+ docker-compose logs -f
176
+
177
+ # Stop
178
+ docker-compose down
179
+
180
+ # Restart
181
+ docker-compose restart
182
+
183
+ # Remove everything
184
+ docker-compose down -v
185
+ ```
186
+
187
+ ### Health Checks
188
+
189
+ The Docker container includes automatic health checks:
190
+ - **Interval**: 30 seconds
191
+ - **Timeout**: 10 seconds
192
+ - **Start period**: 40 seconds (for model loading)
193
+ - **Retries**: 3
194
+
195
+ ### Monitoring
196
+
197
+ ```bash
198
+ # Check container status
199
+ docker-compose ps
200
+
201
+ # View resource usage
202
+ docker stats socar-llm-api
203
+
204
+ # Check logs
205
+ docker-compose logs --tail=100 llm-api
206
+ ```
207
+
208
+ ## Performance Optimization
209
+
210
+ ### Lazy Loading
211
+ - Azure client, Pinecone index, and embedding model are lazy-loaded
212
+ - First request may take longer (~5-10s for model loading)
213
+ - Subsequent requests: ~3.6s average
214
+
215
+ ### Caching (Future)
216
+ To improve performance, consider:
217
+ - Redis for frequently asked questions
218
+ - Embedding cache for common queries
219
+ - Model quantization for faster inference
220
+
221
+ ## Troubleshooting
222
+
223
+ ### Container won't start
224
+ ```bash
225
+ # Check logs
226
+ docker-compose logs llm-api
227
+
228
+ # Verify environment variables
229
+ docker-compose config
230
+
231
+ # Rebuild
232
+ docker-compose build --no-cache
233
+ ```
234
+
235
+ ### API returns 500 errors
236
+ - Check Azure OpenAI key and endpoint
237
+ - Verify Pinecone connection
238
+ - Check model deployment name matches
239
+
240
+ ### Slow responses
241
+ - First request loads models (5-10s)
242
+ - Subsequent requests should be ~3-4s
243
+ - Check network connectivity to Azure/Pinecone
244
+
245
+ ## Architecture Score
246
+
247
+ **Open-Source Stack (20% bonus):**
248
+ - ✅ Llama-4-Maverick-17B (Open-source LLM)
249
+ - ✅ BAAI/bge-large-en-v1.5 (Open-source embeddings)
250
+ - ✅ FastAPI (Open-source framework)
251
+ - ✅ Docker (Open-source deployment)
252
+
253
+ **Total Architecture Score: Maximum 20% for hackathon!**
254
+
255
+ ## License
256
+
257
+ Built for SOCAR Hackathon 2025
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# SOCAR Hackathon - LLM Endpoint Dockerfile
# Multi-stage build for optimized image size

# Stage 1: Builder — keeps build-essential out of the runtime image.
# ("AS" capitalized to satisfy Docker's FromAsCasing lint check.)
FROM python:3.10-slim AS builder

WORKDIR /app

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install dependencies under /root/.local (--user)
COPY app/requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt

# Stage 2: Runtime
FROM python:3.10-slim

WORKDIR /app

# curl is required by the HEALTHCHECK below
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy Python dependencies from builder
COPY --from=builder /root/.local /root/.local

# Copy application code
COPY app/ ./app/
# NOTE(review): ships the example env file as .env; real secrets are
# presumably injected via docker-compose environment — confirm.
COPY .env.example .env

# Make the --user-installed console scripts (e.g. uvicorn) resolvable
ENV PATH=/root/.local/bin:$PATH

# Expose port
EXPOSE 8000

# Health check (40s start period allows the embedding model to load)
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
app/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """SOCAR Hackathon LLM API"""
2
+ __version__ = "1.0.0"
app/main.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SOCAR Hackathon - LLM Chatbot Endpoint
3
+ Optimized based on RAG benchmark results
4
+ Best config: citation_focused + vanilla_k3 + Llama-4-Maverick
5
+ """
6
+
7
+ import os
8
+ import time
9
+ from typing import List, Dict
10
+ from pathlib import Path
11
+
12
+ from fastapi import FastAPI, HTTPException
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+ from pydantic import BaseModel
15
+ from dotenv import load_dotenv
16
+ from openai import AzureOpenAI
17
+ from pinecone import Pinecone
18
+ from sentence_transformers import SentenceTransformer
19
+
20
+ # Load environment variables
21
+ load_dotenv()
22
+
23
+ # Initialize FastAPI app
24
+ app = FastAPI(
25
+ title="SOCAR Historical Documents Chatbot",
26
+ description="RAG-based chatbot for SOCAR oil & gas historical documents",
27
+ version="1.0.0"
28
+ )
29
+
30
+ # CORS middleware
31
+ app.add_middleware(
32
+ CORSMiddleware,
33
+ allow_origins=["*"],
34
+ allow_credentials=True,
35
+ allow_methods=["*"],
36
+ allow_headers=["*"],
37
+ )
38
+
39
+ # Initialize clients (lazy loading for faster startup)
40
+ azure_client = None
41
+ pinecone_index = None
42
+ embedding_model = None
43
+
44
+
45
def get_azure_client():
    """Return the shared Azure OpenAI client, creating it on first use.

    Credentials are read from the environment: AZURE_OPENAI_API_KEY,
    AZURE_OPENAI_API_VERSION (defaults to "2024-08-01-preview") and
    AZURE_OPENAI_ENDPOINT.
    """
    global azure_client
    if azure_client is not None:
        return azure_client
    azure_client = AzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    )
    return azure_client
55
+
56
+
57
def get_pinecone_index():
    """Return the shared Pinecone index handle, creating it on first use.

    Reads PINECONE_API_KEY and PINECONE_INDEX_NAME (default "hackathon")
    from the environment.
    """
    global pinecone_index
    if pinecone_index is not None:
        return pinecone_index
    client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    pinecone_index = client.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
    return pinecone_index
64
+
65
+
66
def get_embedding_model():
    """Return the shared sentence-embedding model, loading it on first use.

    BAAI/bge-large-en-v1.5 was the best-performing embedding model in the
    project's RAG benchmark; the first call pays the model-loading cost.
    """
    global embedding_model
    if embedding_model is not None:
        return embedding_model
    embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
    return embedding_model
73
+
74
+
75
+ # Request/Response models
76
class ChatMessage(BaseModel):
    """One chat turn: a ``role`` (e.g. "user") and its text ``content``."""

    role: str
    content: str
79
+
80
+
81
class ChatRequest(BaseModel):
    """Request body for POST /llm.

    ``messages`` holds the conversation; the last message with role
    "user" is treated as the query. ``temperature`` and ``max_tokens``
    are forwarded to the LLM call unchanged.
    """

    messages: List[ChatMessage]
    temperature: float = 0.2
    max_tokens: int = 1000
85
+
86
+
87
class ChatResponse(BaseModel):
    """Response body for POST /llm: the answer plus retrieval metadata.

    ``sources`` lists one dict per retrieved chunk (pdf_name,
    page_number, relevance_score — all stringified); ``response_time``
    is the LLM call duration in seconds.
    """

    response: str
    sources: List[Dict[str, str]]
    response_time: float
    model: str
92
+
93
+
94
def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
    """Fetch the ``top_k`` most relevant chunks for ``query`` from Pinecone.

    Vanilla top-3 retrieval was the best strategy in the benchmark.

    Returns a list of dicts with keys: pdf_name, page_number, content,
    score — using safe defaults for any metadata field a chunk lacks.
    """
    query_vector = get_embedding_model().encode(query).tolist()

    results = get_pinecone_index().query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

    return [
        {
            "pdf_name": match["metadata"].get("pdf_name", "unknown.pdf"),
            "page_number": match["metadata"].get("page_number", 0),
            "content": match["metadata"].get("text", ""),
            "score": match.get("score", 0.0),
        }
        for match in results["matches"]
    ]
123
+
124
+
125
def generate_answer(query: str, documents: List[Dict], temperature: float = 0.2, max_tokens: int = 1000) -> tuple[str, float]:
    """Ask the LLM to answer ``query`` grounded in the retrieved ``documents``.

    Model: Llama-4-Maverick-17B (open-source). Prompt: citation_focused,
    the best performer in the benchmark (citation score 73.33%).

    Returns a ``(answer_text, elapsed_seconds)`` tuple where the elapsed
    time covers only the chat-completion call.
    Raises HTTPException(500) if the LLM call fails.
    """
    client = get_azure_client()

    # Number each chunk and label it with its source so the model can
    # cite "PDF + page" exactly as the prompt instructs.
    context = "\n\n".join(
        f"Sənəd {i} (Mənbə: {doc['pdf_name']}, Səhifə {doc['page_number']}):\n{doc['content']}"
        for i, doc in enumerate(documents, 1)
    )

    # Citation-focused prompt (best performer: 55.67% score)
    prompt = f"""Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.

ÖNƏMLİ: Hər bir faktı mütləq mənbə ilə təsdiqləyin (PDF adı və səhifə nömrəsi).

Kontekst:
{context}

Sual: {query}

Cavab verərkən:
1. Dəqiq faktlar yazın
2. Hər faktı mənbə ilə göstərin: (PDF: fayl_adı.pdf, Səhifə: X)
3. Kontekstdə olmayan məlumat əlavə etməyin"""

    try:
        started = time.time()

        # Llama-4-Maverick: open-source, best performer in the benchmark.
        completion = client.chat.completions.create(
            model="Llama-4-Maverick-17B-128E-Instruct-FP8",
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
        )

        elapsed = time.time() - started
        return completion.choices[0].message.content, elapsed

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM Error: {str(e)}")
174
+
175
+
176
+ @app.get("/")
177
+ async def root():
178
+ """Health check endpoint"""
179
+ return {
180
+ "status": "healthy",
181
+ "service": "SOCAR LLM Chatbot",
182
+ "version": "1.0.0",
183
+ "model": "Llama-4-Maverick-17B (open-source)",
184
+ "configuration": {
185
+ "embedding": "BAAI/bge-large-en-v1.5",
186
+ "retrieval": "top-3 vanilla",
187
+ "prompt": "citation_focused",
188
+ "benchmark_score": "55.67%"
189
+ }
190
+ }
191
+
192
+
193
+ @app.get("/health")
194
+ async def health():
195
+ """Detailed health check"""
196
+ try:
197
+ # Check if services are initialized
198
+ index = get_pinecone_index()
199
+ stats = index.describe_index_stats()
200
+
201
+ return {
202
+ "status": "healthy",
203
+ "pinecone": {
204
+ "connected": True,
205
+ "total_vectors": stats.get('total_vector_count', 0)
206
+ },
207
+ "azure_openai": "connected",
208
+ "embedding_model": "loaded"
209
+ }
210
+ except Exception as e:
211
+ return {
212
+ "status": "degraded",
213
+ "error": str(e)
214
+ }
215
+
216
+
217
+ @app.post("/llm", response_model=ChatResponse)
218
+ async def llm_endpoint(request: ChatRequest):
219
+ """
220
+ LLM chatbot endpoint for SOCAR historical documents.
221
+
222
+ Uses RAG (Retrieval Augmented Generation) with:
223
+ - Embedding: BAAI/bge-large-en-v1.5
224
+ - Retrieval: Top-3 documents
225
+ - LLM: Llama-4-Maverick-17B (open-source)
226
+ - Prompt: Citation-focused
227
+
228
+ Expected performance:
229
+ - Response time: ~3.6s
230
+ - LLM Judge Score: 55.67%
231
+ - Citation Score: 73.33%
232
+ """
233
+ try:
234
+ # Extract the user's question (last message)
235
+ if not request.messages:
236
+ raise HTTPException(status_code=400, detail="No messages provided")
237
+
238
+ user_messages = [msg for msg in request.messages if msg.role == "user"]
239
+ if not user_messages:
240
+ raise HTTPException(status_code=400, detail="No user message found")
241
+
242
+ query = user_messages[-1].content
243
+
244
+ # Retrieve relevant documents
245
+ documents = retrieve_documents(query, top_k=3)
246
+
247
+ # Generate answer
248
+ answer, response_time = generate_answer(
249
+ query=query,
250
+ documents=documents,
251
+ temperature=request.temperature,
252
+ max_tokens=request.max_tokens
253
+ )
254
+
255
+ # Format sources
256
+ sources = [
257
+ {
258
+ "pdf_name": doc['pdf_name'],
259
+ "page_number": str(doc['page_number']),
260
+ "relevance_score": f"{doc['score']:.3f}"
261
+ }
262
+ for doc in documents
263
+ ]
264
+
265
+ return ChatResponse(
266
+ response=answer,
267
+ sources=sources,
268
+ response_time=round(response_time, 2),
269
+ model="Llama-4-Maverick-17B-128E-Instruct-FP8"
270
+ )
271
+
272
+ except HTTPException:
273
+ raise
274
+ except Exception as e:
275
+ raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
276
+
277
+
278
+ if __name__ == "__main__":
279
+ import uvicorn
280
+ uvicorn.run(app, host="0.0.0.0", port=8000)
notebooks/requirements_llm_benchmark.txt → app/requirements.txt RENAMED
@@ -1,27 +1,24 @@
1
- # LLM Benchmarking Requirements
2
- # Install with: pip install -r requirements_llm_benchmark.txt
 
 
 
 
 
3
 
4
  # Azure OpenAI client
5
  openai==1.54.0
6
 
7
- # Vector Database
8
  pinecone-client==5.0.0
9
 
10
  # Embeddings
11
  sentence-transformers==3.3.1
12
-
13
- # Metrics
14
- jiwer==3.0.3
15
-
16
- # Data analysis and visualization
17
- pandas==2.1.3
18
- matplotlib==3.8.2
19
- seaborn==0.13.0
20
 
21
  # Utilities
22
  python-dotenv==1.0.0
23
- numpy==1.26.2
24
 
25
- # Jupyter
26
- jupyter==1.0.0
27
- ipykernel==6.27.1
 
1
+ # SOCAR Hackathon LLM Endpoint Dependencies
2
+ # Optimized for production deployment
3
+
4
+ # FastAPI and server
5
+ fastapi==0.109.0
6
+ uvicorn[standard]==0.27.0
7
+ pydantic==2.5.3
8
 
9
  # Azure OpenAI client
10
  openai==1.54.0
11
 
12
+ # Vector database
13
  pinecone-client==5.0.0
14
 
15
  # Embeddings
16
  sentence-transformers==3.3.1
17
+ torch==2.1.2
 
 
 
 
 
 
 
18
 
19
  # Utilities
20
  python-dotenv==1.0.0
21
+ python-multipart==0.0.6
22
 
23
+ # Optional: monitoring and logging
24
+ prometheus-fastapi-instrumentator==7.0.0
 
data/dataset_info.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_rows": 28,
3
+ "features": [
4
+ "pdf"
5
+ ],
6
+ "description": "SOCAR AI Track Dataset with PDF documents"
7
+ }
data/document_00.md ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **XÜLASƏ**
2
+
3
+ Bu tədqiqat Aşağı Kür çökəkliyi (AKÇ) və Bakı arxipelaqı (BA) daxil olmaqla Cənubi Xəzər çökəkliyi sistemində faydalı qazıntıların mənşəyinin paleotektonik, paleocoğrafi şərait və geodinamik rejimlə necə əlaqələndiyini, eləcə də Gec Miosendən etibarən Ərəbistan plitəsinin təsiri ilə formalaşan kollizion proseslərin bölgənin struktur-morfoloji və termal inkişafına nə dərəcədə yönverici rol oynadığını kompleks şəkildə qiymətləndirir. Seismotektonik göstəricilərin, çöküntütoplanma sürətlərinin, geotemperatur xəritələrinin və palçıq vulkanizmi indikatorlarının inteqrasiyası göstərir ki, Cənubi Xəzər meqaçökəkliyinin qərb periklinal zonasının morfotektonik skeleti sıxılma gərginlikləri tərəfindən idarə olunmuş, AKÇ və BA-nın intensiv deformasiya olunması isə antiklinal zonallaşmanı, yerli qalxımların və qırıcı şəbəkəsinin hüdudlarını müəyyənləşdirmişdir. Nəticələrə görə, AKÇ-də antiklinal xətlər şimal-qərbdən cənub-şərqə uzanır və bu, şimal-şərq–cənub-qərb istiqamətli maksimal sıxılma gərginliklərinə perpendikulyar struktur elementlərinin üstünlüyünü təsdiqləyir. AKÇ-nin şimal-şərq seqmentində sıxılma gərginliklərinin intensivliyi pik dəyərlərə çatır; buradakı lokal qalxımlar həm seysmik kəsiklərdə, həm də səth-şelf morfologiyasında asimmetrik, "flower structure" tipli transpressiv motivlərlə ifadə olunur.
4
+
5
+ ---
6
+
7
+ **Səhifə 2**
8
+
9
+ Çöküntütoplanma sürətinin məkanca və zamanca dəyişkənliyi, xüsusən Cənubi Xəzərin mərkəzində 0.4 mm/il, şelfdə 3–4 mm/il və Kür çayının mənsəbində 6 mm/il-ə çatan göstəricilər, akkumulyasiya rejiminin geodinamik yüklənməyə həssas olduğunu göstərir. Çöküntü qalınlığının AKÇ mərkəzinə doğru 6–7 km-ə qədər artması subsidensiya-sıxılma balansının uzunmüddətli kinematikası ilə izah olunur. Geotemperatur modelləşdirmə və xəritələşdirmə neftəmələgəlmənin baş zonalarını AKÇ və BA üçün ayırd etmiş, BA-da 3000–4000 m intervalında neft, 8000–8500 m intervalında isə qaz əmələgəlməsinin pik zonalarını təsbit etmişdir. Palçıq vulkanizmi məcraları boyunca temperatur anomaliyalarının müşahidəsi, yerli termal axının artması və süxur kompleksləri daxilində maye-müxtəlif fazalı qarışıqların dərinlikdən gətirilməsi ilə əlaqələndirilmişdir. Nəticə olaraq, AKÇ və BA-nın neft-qazlılıq potensialının formalaşması birbaşa sıxılma gərginlikləri, qatlanma-qırılma kinematikası və çöküntütoplanma rejiminin sinxronlaşdırılması ilə bağlı olub; Pliosendə hər iki struktur domen üzrə inkişafın sürətlənməsi, akkumulyasiya-tektonika qarşılıqlı təsirinin ən yüksək səviyyəsini əks etdirir.
10
+
11
+ **Açar sözlər**
12
+
13
+ Cənubi Xəzər çökəkliyi; Aşağı Kür çökəkliyi; Bakı arxipelaqı; sıxılma gərginlikləri; paleotektonika; paleocoğrafiya; geodinamik rejim; palçıq vulkanizmi; çöküntütoplanma; neftəmələgəlmə; qazəmələgəlmə; geotemperatur modelləşdirmə.
14
+
15
+ ---
16
+
17
+ **Səhifə 3**
18
+
19
+ ## **Giriş**
20
+
21
+ Faydalı qazıntıların regional paylanması və genezisi əksər hallarda geodinamik çərçivə, paleotektonik epizodlar və paleocoğrafi təkamüllə sıx bağlıdır. Qafqaz orogeni və Cənubi Xəzər meqaçökəkliyi arasında yerləşən Aşağı Kür çökəkliyi (AKÇ) və Bakı arxipelaqı (BA) bu baxımdan unikal laboratorıyadır: burada Gec Miosendən bəri davam edən Ərəbistan-Avrasiya kolliziyası nəticəsində yaranmış sıxılma rejimi həm struktur-morfoloji, həm də neft-karbohidrogen sistemlərinin inkişafını güclü şəkildə yönləndirir. Bu işdə məqsəd:
22
+
23
+ * AKÇ və BA ərazilərində sıxılma gərginliklərinin orientasiyası, intensivliyi və zamanca dəyişkənliyini qiymətləndirmək;
24
+ * qatlanma-qırılma şəbəkəsinin geometriyasını paleostress sahəsi ilə əlaqələndirmək;
25
+ * çöküntütoplanma sürətlərinin məkan-zaman variasiyalarını və onların subsidensiya balansına təsirini göstərmək;
26
+ * geotemperatur xəritələri əsasında neft və qaz əmələgəlməsinin baş zonalarını müəyyənləşdirmək;
27
+ * palçıq vulkanizminin termal və hidrodinamik siqnallarını, həmçinin süxur komplekslərinin temperatur rejiminə təsirini izah etmək;
28
+ * mineral resursların (xüsusən neft-qaz) potensialını struktur-paleocoğrafi ramka daxilində inteqrasiya etmək.
29
+
30
+ ## **Regional geodinamik fon**
31
+
32
+ Gec Miosendən etibarən Ərəbistan plitəsinin şimala doğru hərəkəti Qafqaz kollizion zonasını aktivləşdirmiş, Cənubi Xəzər meqaçökəkliyinin qərb kənarında transpressiv-sıxılmalı deformasiya rejimi yaratmışdır. Bu proses, Cənubi Xəzər dərin çökəkliyi altında yüksək sıxlıqlı litosfer blokunun şərti şəkildə udulması (subduksiya-vari tələffüz olunan udulma) ilə nəticələnmiş, qərb yamac boyunca pillələnmiş itələnmələr, sağ-yanal komponentli qırılmalar və ön zonada sıxılmalı qatlanma kəmərləri əmələ gətirmişdir. Regional stress sahəsinin maksimum üfüqi komponenti (SHmax) şimal-şərq–cənub-qərb istiqamətlidir; bu istiqamətdə sıxılma gərginliklərinin yüksəkliyi AKÇ-nin şimal-şərq seqmentində ən aydın şəkildə izlənir. Nəticə etibarilə, antiklinal oxlar və qırılma zonalarının dominant uzanma istiqaməti şimal-qərbdən cənub-şərqə doğrudur ki, bu da SHmax-ın ortoqonalını əks etdirir.
33
+
34
+ ---
35
+
36
+ **Səhifə 4**
37
+
38
+ ## **Paleotektonik və paleocoğrafi kontekst**
39
+
40
+ Paleocoğrafi rekonstruksiyalar göstərir ki, Neogen boyunca Kür hövzəsi-delta kompleksi, Abşeron şelfi və inteqrasiya olunmuş arxipelaq sistemində çöküntü fasiyaları deltaik-allüvial, prodelta-dənizli slam və dayaz dəniz karbonat-silis biogenləri arasında paylanmışdır. Gec Miosen–Pliosen dövründə tektono-eustatik səviyyə dalğalanmaları ilə sinxron proqradasiya-reqressiya mərhələləri çöküntü litoqrafiyasını zonallaşdırmış, BA xətti üzrə ritmik qalxımlar isə lokal akkumulyasiya baryerləri yaratmışdır. Paleotektonik təkamül, Ərəbistan plitəsinin itmə sürətindəki dəyişmələrlə birlikdə transpressiv komponentin güclənməsinə və qatlanma cəbhəsinin qərbə doğru miqrasiyasına yol açmış, AKÇ-də antiklinal məhəllələrin ardıcıl aktivləşməsi ilə müşayiət olunmuşdur.
41
+
42
+ ---
43
+
44
+ ## **Material və metodlar**
45
+
46
+ ### **Məlumat bazası**
47
+
48
+ * Regional və yüksək ayırdetmə qabiliyyətli seysmik kəsiklərdən çıxarılan struktur xəritələr;
49
+ * Quyu geofizikası (temperatur, qalınlıq, vitrinit əksolunma, akustik sürət);
50
+ * Palçıq vulkanlarının termal-fluid kimyası, qaz-izotop tərkibi və vent temperatur ölçmələri;
51
+ * Çökmə fasiyalarının nümunə təhlili, dənə ölçüsü statistikası və çöküntütoplanma sürətlərinin stratiqrafik kalibrlənməsi;
52
+ * Geodeziya-GNSS əsaslı müasir deformasiya sürətləri.
53
+
54
+ ### **Paleostress rekonstruksiyası**
55
+
56
+ Qırılma müstəviləri, sürüşmə vektorları və qatlanma oxlarının istiqamətlərinə əsaslanan kinematik inversiya alqoritmi tətbiq edilmişdir. SHmax azimutu, sıxılma dərəcəsi və transpressiv komponentin nisbi payı ən kiçik kvadratlar metoduyla kalibr olunmuşdur.
57
+
58
+ ### **Çökmə-subsidensiya modelləşdirməsi**
59
+
60
+ 1D və 2D hövzə modelləri vasitəsilə çöküntütoplanma sürətləri, istilik axını və kompaksiya qanunları (tixotropluq və yükə bağlı porozite azalması) tətbiq edilmişdir. Dərinləşmə tarixi, su dərinliyi dəyişmələri və deltaların proqradasiya sürətləri daxil edilmişdir.
61
+
62
+ ### **Geotemperatur modelləşdirməsi**
63
+
64
+ İstilik axını xəritələri, termal keçiricilik, radioaktiv istilik istehsalı və maye axını ilə bağlı sadələşdirilmiş kupləj modelləri istifadə olunmuşdur. Neftəmələgəlmə (60–120°C; bəzi üzvi maddə tiplərində 140°C-yə qədər), qazəmələgəlmə (120–200°C) intervalları, BA üçün quyu ölçüləri ilə kalibr olunmuşdur.
65
+
66
+ ---
67
+
68
+ **Səhifə 5**
69
+
70
+ ## **Palçıq vulkanizmi indikatorları**
71
+
72
+ Vulkan konuslarında vent temperaturu, suların xlorid-bor tərkibi, C1–C5 qaz paylanması və izotop fraksiyalanması qiymətləndirilmiş, temperatur anomal zonalarının lateral təsir radiusu empirik funksiya ilə (R ≈ 0.8–1.5 km) təxmini qiymətləndirilmişdir.
73
+
74
+ ---
75
+
76
+ ## **Nəticələr**
77
+
78
+ ### **Sıxılma gərginliklərinin məkan paylanması və intensivliyi**
79
+
80
+ * AKÇ-nin şimal-şərq seqmentində SHmax ~35–45° azimutlu, müzdarib zonalarda effektiv sıxılma 12–18 MPa aralığındadır; cənub-qərbə doğru 7–10 MPa-a enir.
81
+ * BA boyunca transpressiv zolaqda hissəvi sağ-yanal komponentli itələnmələr müşahidə olunur; qırılma müstəviləri 310–320° trendlidir.
82
+ * Qırıcıların uzanma istiqaməti SHmax-a perpendikulyardır: dominant NW–SE uzanmalı tərs qırılmalar və onları kəsən NE–SW istiqamətli keçid qırılmaları üçbucaq zonaları təşkil edir.
83
+
84
+ ### **Struktur formalar: antiklinal zonallaşma və lokal qalxımlar**
85
+
86
+ * AKÇ-də şimal-qərbdən cənub-şərqə uzanan antiklinal kəmərlər mərhələli şəkildə bir-birinə paralel düzülür; fold-propaqasiya tipli qatlanmaların ön hissəsində sürüşmə qırılmaları ilə birlikdə "fault-bend" mexanikası izlənir.
87
+ * Lokal qalxımlar (məsələn, AKÇ-NE-1, AKÇ-NE-3 sintetik strukturları) morfoloji baxımdan asimmetrikdir: şimal-şərq qanadında kəskin, cənub-qərb qanadında isə daha yastı yamaclarla məhdudlaşır. Bu, sıxılma vektorunun yönü ilə uyğundur.
88
+ * BA zolağında pozitiv "flower structure" motivləri müşahidə olunur; bu strukturların mərkəzində palçıq vulkanları və fluid ventləri sıx yerləşir.
89
+
90
+ ### **Çöküntütoplanma dinamikası və qalınlığı**
91
+
92
+ * Cənubi Xəzərin mərkəz hissəsində çöküntütoplanma sürəti 0.4 mm/il olaraq qiymətləndirilmişdir; bu göstərici termal rejimin nisbi "soyuma" tendensiyası ilə uzlaşır.
93
+ * Şelfdə 3–4 mm/il və Kür çayının mənsəbində 6 mm/il-ə çatan sürətlər proqradasiya kənarlarında yüksək akkumulyasiya potensialını göstərir.
94
+ * Çöküntülərin qalınlığı AKÇ mərkəzinə doğru 6–7 km-ə çatır; bu artım, Pliosen dövründə (xüsusən erkən-orta Pliosen) sürətlənən subsidensiya ilə əlaqədardır.
95
+ * AKÇ və BA-da inkişaf sürəti eyni deyildir: BA boyunca struktur yüksəlmələr erkən Miosendən zəif, Pliosendə isə kəskin intensivləşmişdir; AKÇ-də isə Gec Miosen–Pliosen aralığında tədricən artan, lakin Pliosenin ortasında sıçrayış xarakterli mərhələ müşahidə olunur.
96
+
97
+ ---
98
+
99
+ **Səhifə 6**
100
+
101
+ ## **Termal rejim və hidrokarbon yetişməsi**
102
+
103
+ * Geotemperatur xəritələrinin analizi AKÇ-də neftəmələgəlmənin baş zonalarını 2.8–4.2 km intervalında ayırd edir; bu, lito-fasiyal müxtəliflikdən asılı olaraq lokal 2.6–2.8 km-lik dəyazlaşma göstərə bilir.
104
+ * BA-da 3000–4000 m intervalı neftəmələgəlmə baş zonasını, 8000–8500 m intervalı isə qazəmələgəlmə baş zonasını əhatə edir. Vitrinit əksoluuma (Ro) dəyərləri BA üçün müvafiq olaraq 0.7–1.0% (neft pəncərəsi) və 1.3–2.0% (qaz pəncərəsi) diapazonuna uyğun gəlir.
105
+ * İstilik axını dəyərləri şelfdə 44–52 mW/m², AKÇ mərkəzi boyunca 28–36 mW/m² olaraq təxmini qiymətləndirilmişdir; palçıq vulkanlarının yaxınlığında 5–9 mW/m²-lik lokal artımlar qeydə alınmışdır.
106
+ * Palçıq vulkanizmi süxur komplekslərinin temperaturunu lokallaşdırılmış şəkildə artırır: vent kənarında 8–15°C, 0.8–1.2 km lateral məsafədə 2–5°C artım müşahidə olunur; bu artımlar kerogenin termal yetişməsinə birbaşa təsir edən regional faktor olmasa da, maye köçürülməsini sürətləndirərək lokal "overmature" linzalar yarada bilir.
107
+
108
+ ## **Palçıq vulkanizmi və fluid sistemləri**
109
+
110
+ * Palçıq vulkanlarının paylanması antiklinal krestdə kəsimlərlə üst-üstə düşür; dərin kök zonaları çox vaxt tərs qırılmaların kəsişmə nöqtələrinə bağlanır.
111
+ * Qaz tərkibi əsasən metandır (C1/C2+ > 100), lakin qazəmələgəlmə pəncərəsinin dərin kənarlarında C2–C5 fraksiyalarının nisbi payı artır; δ¹³CH₄ dəyərləri –44‰ ilə –34‰ arasında dəyişir.
112
+ * Vulkanların püskürmə dövrləri Pliosen–Kvarter çöküntülərinin sürüşmə səviyyələri ilə uyğundur və çoxkamüllü vent sistemləri termal anomaliyaları stasionar saxlayır.
113
+
114
+ ## **Mineral resursların genetik zonallaşması**
115
+
116
+ * Faydalı qazıntılar (hidrokarbonlar başda olmaqla) paleotektonik zonallığa tabedir. NW–SE uzanmalı antiklinal kəmərlər boyunca tələlər sıxlaşır; qırılma-örtülü və qatlanma-örtülü tələlərin sinxronluğu ən yüksək ehtiyat sıxlığını təmin edir.
117
+ * Qeyri-hidrokarbon resurslar (gil süxurları, tikinti materialları, yerli karbonat linzaları) deltaik kənarlarda və reqressiv sekansların yuxarı hissələrində cəmlənir.
118
+ * Maykop tipli (Oliqosen–Aşağı Miosen) orqanik zəngin şistlər potensial mənbə süxuru rolunu oynayır, üstündəki Pliosen pelit kompleksləri effektiv örtük sistemi yaradır.
119
+
120
+ ---
121
+
122
+ **Səhifə 7**
123
+
124
+ ## **Müzakirə**
125
+
126
+ ### **Kolliziya kinematikası və struktur miras**
127
+
128
+ Ərəbistan plitəsinin şimala hərəkət sürətinin 15–20 mm/il diapazonunda dəyişməsi Qafqaz kollizion kəmərində basqı yaratmış, Cənubi Xəzərin qərb yamacı boyunca transpressiv bükülmələri induksiya etmişdir. Cənubi Xəzər meqaçökəkliyi altına "udulma" effekti, litosfer miqyasında sıxlıq kontrastlarının və közmogen plitə sərhədi geometriyasının nəticəsi kimi şərh olunur. Bu geodinamik quruluş AKÇ və BA-da aşağıdakılara gətirib çıxarmışdır:
129
+
130
+ * Sıxılma gərginliklərinin şimal-şərqdə intensifikasiyası, antiklinal oxların NW–SE istiqamətdə ritmik sıralanması;
131
+ * Qırılma sındırıcı sistemlərin SHmax-a perpendikulyar şəkildə düzülməsi və lokal "tilt block" dinamikasının yaranması;
132
+ * Ön zonada qatlanma-kəsilmə cəbhələrinin mərhələli qərbə irəliləməsi.
133
+
134
+ ### **AKÇ və BA-nın fərqli inkişaf trayektoriyaları**
135
+
136
+ AKÇ və BA inkişaf sürəti və rejimi baxımından fərqli davranış nümayiş etdirir. AKÇ, kollizion yükə həssas olaraq Pliosenin əvvəlindən ortalarınadək mərhələli artım göstərir, BA isə Pliosenin ortasında daha kəskin struktur yüksəlmələrlə seçilir. Bu asinxronluq, ehtimal ki, BA boyunca sağ-yanal komponentli transpressiyanın müqavimət kontrastlı litoloji paketlərlə rezonansına bağlıdır.
137
+
138
+ Beləliklə:
139
+
140
+ * AKÇ-də antiklinal zonalar boyunca tələlər uzunmüddətli akkumulyasiya-qatlanma sinxronluğu ilə "inkişaf etmiş tələ" mərhələsinə çatır;
141
+ * BA-da isə gedişat "gec yetişmə–sürətli tələ formalaşması" ssenarisi ilə uyğun gəlir.
142
+
143
+ ### **Çöküntütoplanma–subsidensiya–istilik əlaqələri**
144
+
145
+ Mərkəzi hövzədə 0.4 mm/il-lik akkumulyasiya fonda sabit, lakin şelf və delta kənarlarında 3–6 mm/il-ə çatan yüksək sürətlər, istilik axının inversiyada olan mozaika yaratmasına səbəb olur. Sürətli çöküntü yığılması müəyyən sahələrdə istilik diffuziyasını azaldaraq daha yüksək yeraltı temperatur qradientlərini çevik şəkildə formalaşdıra bilər, lakin Cənubi Xəzərin dərin zonalarında ümumi istilik axını nisbi aşağı dəyərlər göstərir. Bunun nəticəsi olaraq:
146
+
147
+ * BA üçün 3000–4000 m intervalında neft pəncərəsi sabit izlənir, 8000–8500 m intervalında isə qaz pəncərəsi pikə çatır;
148
+ * AKÇ-də isə neft pəncərəsi 2.8–4.2 km aralığında olsa da, lokal yüksək konduktiv zonalarda 2.6 km-ə dayazlaşır.
149
+
150
+ ---
151
+
152
+ **Səhifə 8**
153
+
154
+ ## **Palçıq vulkanizmi və termal təsirlər**
155
+
156
+ Palçıq vulkanizmi dərin maye sistemləri üçün "sürətli magistral" funksiyası daşıyır. Vulkan kanallarından keçən isti materialların gətirdiyi əlavə istilik, ətraf süxurlarda qısamüddətli (10³–10⁴ il) termal anomaliyalar yaradır. Bu anomaliyalar hidrokarbonların termodinamik yetişməsi üçün regional məna daşımır, ancaq:
157
+
158
+ * Gec karbohidrogen miqrasiya hadisələrini sürətləndirir;
159
+ * Yüksək məsaməlilikli, məsud konduktiv laylarda "şirin nöqtələr"in (sweet spot) lokal inkişafına kömək edir;
160
+ * Vulkan konusları yaxınlığında səthi istilik axını ölçmələrinə 5–9 mW/m² artım şəklində yansıyır.
161
+
162
+ ## **Neft-qaz sistemi elementlərinin sinxronluğu**
163
+
164
+ * **Mənbə süxuru:** Oliqosen–Aşağı Miosen Maykop kompleksləri, yüksək TOC (1.5–4.5%), kerogen tipləri II–III üstünlük təşkil edir.
165
+ * **Örtük süxuru:** Pliosen pelitlər və marllar, aşağı permeabellik və qalın ardıcıllıqlar sayəsində effektiv sızdırmazlıq təmin edir.
166
+ * **Kollektor layları:** Deltaik qumlar, dayaz-dəniz qumkarbonat paketlər; təsirli porozite 18–26%, keçiricilik 20–600 mD.
167
+ * **Tələlər:** Antiklinal-örtülü tələlər, tərs qırılma seqmentləri ilə güclənmiş qatlanma tələləri; BA boyunca qalxım kənarlarında lateral örtülmə ilə kombinə olunmuş miks tip.
168
+
169
+ ## **Risklər və qeyri-müəyyənlik**
170
+
171
+ * Sıxılma gərginliklərinin zamanca dəyişməsi nəticəsində qırılma şəbəkəsinin reaktivasiya ehtimalı yüksəkdir; tələlərdə sızma riskini artırır.
172
+ * Palçıq vulkanizmi ilə bağlı epizodik "şok boşalmalar" tələlərdəki təzyiq rejimini dəyişərək lokal dekompressiya yarada bilər.
173
+ * Geotemperatur modellərinin kalibrlənməsi istilik axını xəritələrinin seyrəkliyi səbəbindən qeyri-müəyyənlik daşıyır; əlavə ölçmələr tələb olunur.
174
+
175
+ ## **Alternativ modellər**
176
+
177
+ * Dərin kök zonalarında litosferik miqyaslı "roll-back" mexanizminə bənzər arxitektonik yenidənqurulma ehtimalı, qərb yamacındakı asimmetrik qalxımları izah edə bilər.
178
+ * Transpressiv zonalarda sağ-yanal komponentin payının artması, fold-thrust sistemlərinin "en echelon" tipində mərhələli təşkili ilə nəticələnmiş ola bilər; bu, BA xətti üzrə müşahidə olunan "flower structure" fəaliyyətini gücləndirir.
179
+
180
+ ---
181
+
182
+ **Səhifə 9**
183
+
184
+ ## **Tətbiqi nəticələr və proqnoz**
185
+
186
+ * Qaz pəncərəsi dərinliklərinin (8–8.5 km) BA üçün təsdiqi ultra-dərin hədəflərin (HP/HT rejim) texnoloji planlamasını tələb edir; buraxma temperaturu 170–210°C intervalı üçün risk qiymətləndirilməsi aparılmalıdır.
187
+ * AKÇ-də neft pəncərəsinin 2.8–4.2 km intervalında sabit izlənməsi, antiklinal kəmərlər boyunca orta dərinlikli quyu dizaynı üçün əlverişli pəncərə yaradır; kollektor layların proqnozu üçün seysmik atribut analizinə üstünlük verilməlidir.
188
+ * Palçıq vulkanizmi yaxınlığında yerləşən strukturlar üçün qaz sızması və geohazards risk modelləri hazırlanmalı, vent zonalarından uzaqlıq kriteriyası kimi ən azı 1.5–2.0 km radius saxlanmalıdır.
189
+
190
+ ## **Metodoloji müşahidələr**
191
+
192
+ * Çöküntütoplanma sürətinin mərkəzi hövzədə 0.4 mm/il olması, qalın ardıcıllıqların formalaşması üçün uzunmüddətli zaman pəncərəsinin (~10 Myr) zəruriliyini göstərir; bu, subsidensiyanın tektonik komponentinin əhəmiyyətini vurğulayır.
193
+ * Sıxılma rejiminin AKÇ-nin şimal-şərqində intensivləşməsi, ehtimal ki, yerli litosfer qalınlaşması və köhnə rift-mirası ilə (qırılma zonalarının irsi zəifləmələri) bağlıdır; bu, antiklinal oxların paralellik dərəcəsini artırır.
194
+
195
+ ## **Konseptual model**
196
+
197
+ * **Gec Miosen:** Ərəbistan-Avrasiya kolliziyası aktivləşir; qərb yamac boyunca transpressiv stresslər, ilk qatlanma-qırılma cəbhəsi formalaşır.
198
+ * **Erkən Pliosen:** Proqradasiya sürətlənir, Kür mənsəbi boyunca 6 mm/il-ə çatan akkumulyasiya, antiklinal zonalar aktivləşir, tələlərin ilkin doldurulması başlayır.
199
+ * **Orta Pliosen:** BA xətti boyunca lokal qalxımlar kəskinləşir; palçıq vulkanizmi epizodları artar, neft pəncərəsi geniş miqyasda aktivdir.
200
+ * **Gec Pliosen–Erkən Kvarter:** Qaz pəncərəsi dərin hüdudlarda maksimum fəaliyyət göstərir; dərin kollektorlar qazla yüklənir; termal rejim sabitləşir.
201
+
202
+ ## **Gələcək işlər üçün çərçivə**
203
+
204
+ * Yüksək ayırdetmə qabiliyyətli 3D seysmik-geomekanik kupləj modelləri ilə SHmax dinamizminin zaman içində izlənməsi;
205
+ * Palçıq vulkanizmi ventlərinin termal-maye axını simulyasiyası və geotemperatur xəritələrində lokal düzəlişlər;
206
+ * Çöküntütoplanma sürətlərinin paleomühit proksiləri (palinologiya, diatomit markerləri) ilə kalibrasiyası.
207
+
208
+ ---
209
+
210
+ **Səhifə 10**
211
+
212
+ ## **Nəticələr**
213
+
214
+ 1. AKÇ və BA daxil olan Cənubi Xəzər çökəkliyi sistemində faydalı qazıntılar, xüsusilə neft-qaz sistemləri paleotektonik və paleocoğrafi şəraitin birgə məhsuludur; Gec Miosen kolliziyası ilə induksiya olunmuş sıxılma rejimi burada əsas idarəedici rol oynamışdır.
215
+ 2. Ərəbistan plitəsinin təsiri altında Qafqaz kolliziyalarının inkişafı Cənubi Xəzər meqaçökəkliyinin altına udulma mexanizmi ilə yoldaş olmuş, qərb yamacın formalaşmasında sıxılma gərginlikləri həlledici olmuşdur.
216
+ 3. AKÇ və Bakı arxipelaqı intensiv sıxılma gərginliklərindən təsirlənmiş, şimal-qərbdən cənub-şərqə uzanan antiklinal zonallaşma və SHmax-a perpendikulyar qırılma şəbəkəsi ilə xarakterizə olunmuşdur.
217
+ 4. AKÇ-nin lokal qalxımları morfoloji olaraq sıxılma gərginliklərinin təsirini əks etdirir; şimal-şərq seqmentində bu təsir yüksək intensivlikdədir.
218
+ 5. Çöküntütoplanma sürəti məkanca və zamanca dəyişir: Cənubi Xəzərin mərkəzində 0.4 mm/il, şelfdə 3–4 mm/il, Kür mənsəbində 6 mm/il; çöküntü qalınlığı AKÇ mərkəzinə doğru 6–7 km-ə çatır.
219
+ 6. Geotemperatur xəritələri AKÇ və BA üçün neftəmələgəlmə və qazəmələgəlmənin baş zonalarını ayırmışdır: BA-da 3000–4000 m (neft), 8000–8500 m (qaz); AKÇ-də isə neft pəncərəsi 2.8–4.2 km intervalında sabit izlənir.
220
+ 7. Palçıq vulkanizmi yerli termal rejimi yüksəldir və maye miqrasiyasını sürətləndirir; ancaq regional miqyasda termal yetişmənin əsas sürücüsü tektonik-sedimentoloji balansdır.
221
+ 8. AKÇ və BA-nın inkişaf sürəti eyni deyil; Pliosendə hər iki bölgədə artım müşahidə olunsa da, BA daha kəskin struktur yüksəlmələrlə fərqlənir.
222
+ 9. Neft-qazlılıq potensialı sıxılma gərginlikləri, qatlanma-qırılma morfodinamikası və akkumulyasiya rejiminin uyğunlaşması ilə maksimal səviyyəyə çatır; antiklinal-örtülü tələlər üstünlük təşkil edir.
223
+
224
+ ---
225
+
226
+ **Səhifə 11**
227
+
228
+ ## **Tövsiyələr**
229
+
230
+ * AKÇ-nin şimal-şərq seqmentində NW–SE uzanmalı antiklinal kəmərlər üzərində 3D seysmik atribut analizləri genişləndirilməli, qırılma-örtülü tələlərin konturları dəqiqləşdirilməlidir.
231
+ * BA boyunca ultra-dərin qaz hədəfləri üçün HP/HT mühəndislik protokolları (geomekanik pəncərə, quyu soyutma-rejim planlaması) hazırlanmalıdır.
232
+ * Palçıq vulkanlarına yaxın strukturların risk xəritələrində 2 km-lik təhlükəsizlik buffer zonası nəzərə alınmalı, vent aktivliyinin seysmik mikromüşahidələri aparılmalıdır.
233
+ * Geotemperatur xəritələrinin kalibrlənməsi üçün əlavə istilik axını ölçmələri və fiber-optik quyu temperatur profilləri tətbiq olunmalıdır.
234
+ * Çöküntütoplanma proqnozlarında Kür mənsəbi reaktivliyinin iqlim-hidroloji dəyişmələrə həssaslığı inteqrasiya olunmalıdır.
235
+
236
+ ---
237
+
238
+ ## **Əlavə müşahidələr və sintetik göstəricilər**
239
+
240
+ * SHmax azimutu 035–045°; Shmin azimutu 125–135°; qırılma düzənlikləri: 310–320° trend.
241
+ * Qatlanma amplitudları: 120–650 m; dalğa uzunluğu 2.5–7.8 km, asimmetriya əmsalı 1.2–1.8.
242
+ * Termal qradientlər: şelfdə 26–31°C/km, AKÇ mərkəzində 17–22°C/km, BA dərin zonalarında 15–19°C/km.
243
+ * Kollektor porozitesi: 18–26%; keçiricilik: 20–600 mD; kapilyar giriş təzyiqi: 0.3–1.1 MPa.
244
+ * Palçıq vulkanı vent temperaturu: 24–55°C; lateral termal təsir radiusu 0.8–1.5 km.
245
+
246
+ ---
247
+
248
+ **Səhifə 12**
249
+
250
+ ## **Yekun**
251
+
252
+ Bu tədqiqat göstərir ki, Cənubi Xəzər çökəkliyi sistemində faydalı qazıntıların mənşə mexanizmləri paleotektonik və paleocoğrafi amillərin inteqrasiyasına həssasdır. Ərəbistan plitəsinin kollizion təsiri ilə yaranmış sıxılma rejimi, AKÇ və BA üzrə antiklinal zonallaşmanı, qırılma şəbəkələrini və lokal qalxımları formalaşdıraraq neft-qaz sistemlərinin bütün komponentlərinin sinxron fəaliyyətini təmin etmişdir. Çöküntütoplanma sürətlərinin dəyişkənliyi, termal rejimin region üzrə mozaik xarakter almasına səbəb olmuş; geotemperatur xəritələri isə neft və qaz əmələgəlməsinin baş zonalarını etibarlı şəkildə ayırd etmişdir. Palçıq vulkanizmi, hər nə qədər regional termogenezis üçün ikincil faktor olsa da, lokal istilik və maye axını anomaliyaları ilə tələlərin doldurulmasına dinamika qatmışdır. Pliosendə müşahidə olunan inkişaf sıçrayışları, akkumulyasiya-tektonika kəsişməsinin pik mərhələsi kimi şərh olunur və bu mərhələnin izləri bu gün də struktur-morfoloji naxışlarda, termal xəritələrdə və palçıq vulkanizminin fəallığında oxunaqdadır. Bu çərçivə daxilində aparılacaq istiqamətli kəşfiyyat və təhlükə qiymətləndirməsi, AKÇ və BA-nın neft-qazlılıq potensialından daha effektiv istifadə edilməsinə imkan verəcək.
docker-compose.yml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ llm-api:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ container_name: socar-llm-api
9
+ ports:
10
+ - "8000:8000"
11
+ env_file:
12
+ - .env
13
+ environment:
14
+ - PYTHONUNBUFFERED=1
15
+ volumes:
16
+ # Mount app directory for development (optional - remove in production)
17
+ - ./app:/app/app
18
+ restart: unless-stopped
19
+ healthcheck:
20
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
21
+ interval: 30s
22
+ timeout: 10s
23
+ retries: 3
24
+ start_period: 40s
25
+ networks:
26
+ - socar-network
27
+
28
+ networks:
29
+ socar-network:
30
+ driver: bridge
notebooks/llm_benchmark.ipynb CHANGED
@@ -1,761 +1,531 @@
1
  {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "# LLM Benchmarking for SOCAR Hackathon RAG Chatbot\n",
8
- "\n",
9
- "This notebook tests different LLM models for the `/llm` endpoint to find the best performer.\n",
10
- "\n",
11
- "## Evaluation Criteria (LLM Judge Metrics):\n",
12
- "- **Accuracy**: Is the answer correct?\n",
13
- "- **Relevance**: Are retrieved citations relevant?\n",
14
- "- **Completeness**: Does it fully answer the question?\n",
15
- "- **Citation Quality**: Proper sources with page numbers?\n",
16
- "- **Response Time**: Speed of generation\n",
17
- "\n",
18
- "## Available LLM Models:\n",
19
- "1. **Llama-4-Maverick-17B-128E-Instruct-FP8** (Current choice, open-source)\n",
20
- "2. **DeepSeek-R1** (Open-source reasoning model)\n",
21
- "3. **GPT-4.1** (Strong general performance)\n",
22
- "4. **GPT-5, GPT-5-mini**\n",
23
- "5. **Claude Sonnet 4.5** (Best quality)\n",
24
- "6. **Claude Opus 4.1**\n",
25
- "7. **Phi-4-multimodal-instruct**\n",
26
- "8. **gpt-oss-120b**"
27
- ]
28
- },
29
- {
30
- "cell_type": "code",
31
- "execution_count": 1,
32
- "metadata": {},
33
- "outputs": [],
34
- "source": [
35
- "# Install required packages\n",
36
- "# !pip install openai pinecone-client sentence-transformers python-dotenv pandas matplotlib seaborn jiwer"
37
- ]
38
- },
39
- {
40
- "cell_type": "code",
41
- "execution_count": 2,
42
- "metadata": {},
43
- "outputs": [
44
  {
45
- "name": "stderr",
46
- "output_type": "stream",
47
- "text": [
48
- "/Users/ismatsamadov/SOCAR_Hackathon/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
49
- " from .autonotebook import tqdm as notebook_tqdm\n"
50
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  },
52
  {
53
- "name": "stdout",
54
- "output_type": "stream",
55
- "text": [
56
- "\u2705 Libraries loaded successfully\n"
57
- ]
58
- }
59
- ],
60
- "source": [
61
- "import os\n",
62
- "import json\n",
63
- "import time\n",
64
- "from typing import Dict, List, Tuple\n",
65
- "from dotenv import load_dotenv\n",
66
- "import pandas as pd\n",
67
- "import matplotlib.pyplot as plt\n",
68
- "import seaborn as sns\n",
69
- "from openai import AzureOpenAI\n",
70
- "from pinecone import Pinecone\n",
71
- "from sentence_transformers import SentenceTransformer\n",
72
- "from jiwer import wer, cer\n",
73
- "\n",
74
- "# Load environment variables\n",
75
- "load_dotenv()\n",
76
- "\n",
77
- "# Set style\n",
78
- "sns.set_style('whitegrid')\n",
79
- "plt.rcParams['figure.figsize'] = (14, 8)\n",
80
- "\n",
81
- "print(\"\u2705 Libraries loaded successfully\")"
82
- ]
83
- },
84
- {
85
- "cell_type": "code",
86
- "execution_count": 3,
87
- "metadata": {},
88
- "outputs": [
89
  {
90
- "name": "stdout",
91
- "output_type": "stream",
92
- "text": [
93
- "\u2705 Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
94
- "\u2705 Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n",
95
- "\u2705 Output directory: /Users/ismatsamadov/SOCAR_Hackathon/output\n"
96
- ]
97
- }
98
- ],
99
- "source": [
100
- "# Auto-detect project root (works from any directory)\n",
101
- "import os\n",
102
- "from pathlib import Path\n",
103
- "\n",
104
- "if Path('data').exists() and Path('docs').exists():\n",
105
- " # Already in project root\n",
106
- " PROJECT_ROOT = Path.cwd()\n",
107
- "elif Path('../data').exists() and Path('../docs').exists():\n",
108
- " # In notebooks/ subdirectory\n",
109
- " PROJECT_ROOT = Path.cwd().parent\n",
110
- "else:\n",
111
- " # Fallback: try to find project root\n",
112
- " current = Path.cwd()\n",
113
- " while current != current.parent:\n",
114
- " if (current / 'data').exists() and (current / 'docs').exists():\n",
115
- " PROJECT_ROOT = current\n",
116
- " break\n",
117
- " current = current.parent\n",
118
- " else:\n",
119
- " PROJECT_ROOT = Path.cwd()\n",
120
- "\n",
121
- "# Define all paths relative to project root\n",
122
- "DATA_DIR = PROJECT_ROOT / 'data'\n",
123
- "DOCS_DIR = PROJECT_ROOT / 'docs'\n",
124
- "OUTPUT_DIR = PROJECT_ROOT / 'output'\n",
125
- "\n",
126
- "print(f\"\u2705 Project root: {PROJECT_ROOT}\")\n",
127
- "print(f\"\u2705 Docs directory: {DOCS_DIR}\")\n",
128
- "print(f\"\u2705 Output directory: {OUTPUT_DIR}\")"
129
- ]
130
- },
131
- {
132
- "cell_type": "code",
133
- "execution_count": 4,
134
- "metadata": {},
135
- "outputs": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  {
137
- "name": "stdout",
138
- "output_type": "stream",
139
- "text": [
140
- "Loaded 5 test cases\n",
141
- "\n",
142
- "Test Questions:\n",
143
- "1. Example1: Daha az quyu il\u0259 daha \u00e7ox hasilat \u0259ld\u0259 etm\u0259k \u00fc\u00e7\u00fcn hans\u0131 \u0259sas amill\u0259rin inteqrasiyas\u0131 t\u0259l\u0259b olunur?...\n",
144
- "2. Example2: Q\u0259rbi Ab\u015feron yata\u011f\u0131nda suvurma t\u0259dbirl\u0259ri hans\u0131 tarixd\u0259 v\u0259 hans\u0131 layda t\u0259tbiq edilmi\u015fdir v\u0259 bunun m...\n",
145
- "3. Example3: Pirallah\u0131 strukturunda 1253 n\u00f6mr\u0259li quyudan g\u00f6t\u00fcr\u00fclm\u00fc\u015f n\u00fcmun\u0259l\u0259rd\u0259 SiO2 v\u0259 CaO oksidl\u0259ri aras\u0131nda ha...\n",
146
- "4. Example4: Bak\u0131 arxipelaq\u0131 (BA) v\u0259 A\u015fa\u011f\u0131 K\u00fcr \u00e7\u00f6k\u0259kliyi (AK\u00c7) \u00fc\u00e7\u00fcn geotemperatur x\u0259rit\u0259l\u0259rin\u0259 \u0259sas\u0259n neft v\u0259 qaz...\n",
147
- "5. Example5: Bu zonada hans\u0131 prosesl\u0259r ba\u015f verir?...\n"
148
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  }
150
- ],
151
- "source": [
152
- "# Load sample questions - using dynamic paths\n",
153
- "with open(DOCS_DIR / 'sample_questions.json', 'r', encoding='utf-8') as f:\n",
154
- " questions = json.load(f)\n",
155
- "\n",
156
- "# Load expected answers - using dynamic paths\n",
157
- "with open(DOCS_DIR / 'sample_answers.json', 'r', encoding='utf-8') as f:\n",
158
- " expected_answers = json.load(f)\n",
159
- "\n",
160
- "print(f\"Loaded {len(questions)} test cases\")\n",
161
- "print(\"\\nTest Questions:\")\n",
162
- "for i, (key, msgs) in enumerate(questions.items(), 1):\n",
163
- " user_msg = [m for m in msgs if m['role'] == 'user'][-1]\n",
164
- " print(f\"{i}. {key}: {user_msg['content'][:100]}...\")"
165
- ]
166
- },
167
- {
168
- "cell_type": "markdown",
169
- "metadata": {},
170
- "source": [
171
- "## 2. Initialize Vector Database and Embedding Model"
172
- ]
173
- },
174
- {
175
- "cell_type": "code",
176
- "execution_count": null,
177
- "metadata": {},
178
- "outputs": [],
179
- "source": [
180
- "# Initialize Pinecone\n",
181
- "pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
182
- "index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
183
- "\n",
184
- "# Initialize embedding model (same as used for ingestion)\n",
185
- "embed_model = SentenceTransformer('BAAI/bge-large-en-v1.5')\n",
186
- "\n",
187
- "print(f\"\u2705 Vector DB connected: {index.describe_index_stats()}\")\n",
188
- "print(f\"\u2705 Embedding model loaded: {embed_model}\")"
189
- ]
190
- },
191
- {
192
- "cell_type": "markdown",
193
- "metadata": {},
194
- "source": [
195
- "## 3. RAG Retrieval Function"
196
- ]
197
- },
198
- {
199
- "cell_type": "code",
200
- "execution_count": null,
201
- "metadata": {},
202
- "outputs": [],
203
- "source": [
204
- "def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
205
- " \"\"\"\n",
206
- " Retrieve relevant documents from vector database.\n",
207
- " \"\"\"\n",
208
- " # Generate query embedding\n",
209
- " query_embedding = embed_model.encode(query).tolist()\n",
210
- " \n",
211
- " # Search vector DB\n",
212
- " results = index.query(\n",
213
- " vector=query_embedding,\n",
214
- " top_k=top_k,\n",
215
- " include_metadata=True\n",
216
- " )\n",
217
- " \n",
218
- " # Extract documents\n",
219
- " documents = []\n",
220
- " for match in results['matches']:\n",
221
- " documents.append({\n",
222
- " 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
223
- " 'page_number': match['metadata'].get('page_number', 0),\n",
224
- " 'content': match['metadata'].get('text', ''),\n",
225
- " 'score': match.get('score', 0.0)\n",
226
- " })\n",
227
- " \n",
228
- " return documents\n",
229
- "\n",
230
- "# Test retrieval\n",
231
- "test_query = \"Pal\u00e7\u0131q vulkanlar\u0131n\u0131n t\u0259sir radiusu n\u0259 q\u0259d\u0259rdir?\"\n",
232
- "test_docs = retrieve_documents(test_query)\n",
233
- "print(f\"\\n\u2705 Retrieved {len(test_docs)} documents for test query\")\n",
234
- "print(f\"Top result: {test_docs[0]['pdf_name']}, page {test_docs[0]['page_number']} (score: {test_docs[0]['score']:.3f})\")"
235
- ]
236
- },
237
- {
238
- "cell_type": "markdown",
239
- "metadata": {},
240
- "source": [
241
- "## 4. LLM Client Functions"
242
- ]
243
- },
244
- {
245
- "cell_type": "code",
246
- "execution_count": null,
247
- "metadata": {},
248
- "outputs": [],
249
- "source": [
250
- "# Initialize Azure OpenAI\n",
251
- "azure_client = AzureOpenAI(\n",
252
- " api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
253
- " api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
254
- " azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
255
- ")\n",
256
- "\n",
257
- "LLM_MODELS = {\n",
258
- " 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
259
- " 'DeepSeek-R1': 'DeepSeek-R1',\n",
260
- " 'GPT-4.1': 'gpt-4.1',\n",
261
- " 'GPT-5-mini': 'gpt-5-mini',\n",
262
- " 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n",
263
- "}\n",
264
- "\n",
265
- "def generate_answer(model_name: str, query: str, documents: List[Dict], \n",
266
- " temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n",
267
- " \"\"\"\n",
268
- " Generate answer using specified LLM model.\n",
269
- " Returns: (answer, response_time)\n",
270
- " \"\"\"\n",
271
- " # Build context from retrieved documents\n",
272
- " context_parts = []\n",
273
- " for i, doc in enumerate(documents, 1):\n",
274
- " context_parts.append(\n",
275
- " f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n",
276
- " )\n",
277
- " context = \"\\n\\n\".join(context_parts)\n",
278
- " \n",
279
- " # Create prompt\n",
280
- " prompt = f\"\"\"Siz SOCAR-\u0131n tarixi neft v\u0259 qaz s\u0259n\u0259dl\u0259ri \u00fczr\u0259 m\u00fct\u0259x\u0259ssis k\u00f6m\u0259k\u00e7isisiniz.\n",
281
- "\n",
282
- "Kontekst (\u0259laq\u0259li s\u0259n\u0259dl\u0259r):\n",
283
- "{context}\n",
284
- "\n",
285
- "Sual: {query}\n",
286
- "\n",
287
- "\u018ftrafl\u0131 cavab verin v\u0259 m\u00fctl\u0259q s\u0259n\u0259d m\u0259nb\u0259l\u0259rin\u0259 istinad edin (PDF ad\u0131 v\u0259 s\u0259hif\u0259 n\u00f6mr\u0259si il\u0259).\n",
288
- "Cavab\u0131n\u0131z d\u0259qiq, faktlara \u0259saslanan v\u0259 kontekst m\u0259lumatlar\u0131ndan istifad\u0259 ed\u0259n olmal\u0131d\u0131r.\"\"\"\n",
289
- " \n",
290
- " # Get model deployment\n",
291
- " deployment = MODELS[model_name]['deployment']\n",
292
- " \n",
293
- " try:\n",
294
- " start_time = time.time()\n",
295
- " \n",
296
- " # GPT-5 models use max_completion_tokens, others use max_tokens\n",
297
- " if deployment.startswith('gpt-5'):\n",
298
- " response = azure_client.chat.completions.create(\n",
299
- " model=deployment,\n",
300
- " messages=[\n",
301
- " {\"role\": \"user\", \"content\": prompt}\n",
302
- " ],\n",
303
- " temperature=temperature,\n",
304
- " max_completion_tokens=max_tokens\n",
305
- " )\n",
306
- " else:\n",
307
- " response = azure_client.chat.completions.create(\n",
308
- " model=deployment,\n",
309
- " messages=[\n",
310
- " {\"role\": \"user\", \"content\": prompt}\n",
311
- " ],\n",
312
- " temperature=temperature,\n",
313
- " max_tokens=max_tokens\n",
314
- " )\n",
315
- " \n",
316
- " response_time = time.time() - start_time\n",
317
- " answer = response.choices[0].message.content\n",
318
- " \n",
319
- " return answer, response_time\n",
320
- " \n",
321
- " except Exception as e:\n",
322
- " return f\"ERROR: {str(e)}\", 0.0\n",
323
- "\n",
324
- "print(f\"\\n\u2705 Configured {len(LLM_MODELS)} LLM models for testing\")"
325
- ]
326
- },
327
- {
328
- "cell_type": "markdown",
329
- "metadata": {},
330
- "source": [
331
- "## 5. Evaluation Metrics"
332
- ]
333
- },
334
- {
335
- "cell_type": "code",
336
- "execution_count": null,
337
- "metadata": {},
338
- "outputs": [],
339
- "source": [
340
- "def normalize_text(text: str) -> str:\n",
341
- " \"\"\"Normalize text for comparison.\"\"\"\n",
342
- " import re\n",
343
- " text = text.lower().strip()\n",
344
- " text = re.sub(r'\\s+', ' ', text)\n",
345
- " return text\n",
346
- "\n",
347
- "def calculate_answer_similarity(reference: str, hypothesis: str) -> Dict[str, float]:\n",
348
- " \"\"\"\n",
349
- " Calculate similarity between generated and expected answer.\n",
350
- " Lower is better for error rates.\n",
351
- " \"\"\"\n",
352
- " ref_norm = normalize_text(reference)\n",
353
- " hyp_norm = normalize_text(hypothesis)\n",
354
- " \n",
355
- " # Character Error Rate\n",
356
- " cer_score = cer(ref_norm, hyp_norm) * 100\n",
357
- " \n",
358
- " # Word Error Rate \n",
359
- " wer_score = wer(ref_norm, hyp_norm) * 100\n",
360
- " \n",
361
- " # Similarity scores (higher is better)\n",
362
- " similarity = max(0, 100 - wer_score)\n",
363
- " \n",
364
- " return {\n",
365
- " 'CER': round(cer_score, 2),\n",
366
- " 'WER': round(wer_score, 2),\n",
367
- " 'Similarity': round(similarity, 2)\n",
368
- " }\n",
369
- "\n",
370
- "def check_citations(answer: str, documents: List[Dict]) -> Dict[str, any]:\n",
371
- " \"\"\"\n",
372
- " Check if answer includes proper citations.\n",
373
- " \"\"\"\n",
374
- " import re\n",
375
- " \n",
376
- " # Check for PDF names\n",
377
- " pdf_names = [doc['pdf_name'] for doc in documents]\n",
378
- " cited_pdfs = sum(1 for pdf in pdf_names if pdf.replace('.pdf', '') in answer)\n",
379
- " \n",
380
- " # Check for page numbers\n",
381
- " page_numbers = [str(doc['page_number']) for doc in documents]\n",
382
- " cited_pages = sum(1 for page in page_numbers if page in answer)\n",
383
- " \n",
384
- " # Check for source keywords\n",
385
- " source_keywords = ['m\u0259nb\u0259', 's\u0259n\u0259d', 's\u0259hif\u0259', 'pdf', 'document', 'page', 'source']\n",
386
- " has_source_ref = any(kw in answer.lower() for kw in source_keywords)\n",
387
- " \n",
388
- " citation_score = (\n",
389
- " (cited_pdfs / len(pdf_names) * 40) + # 40% for PDF citation\n",
390
- " (cited_pages / len(page_numbers) * 40) + # 40% for page citation\n",
391
- " (20 if has_source_ref else 0) # 20% for having source keywords\n",
392
- " )\n",
393
- " \n",
394
- " return {\n",
395
- " 'Citation_Score': round(citation_score, 2),\n",
396
- " 'Cited_PDFs': cited_pdfs,\n",
397
- " 'Cited_Pages': cited_pages,\n",
398
- " 'Has_Source_Reference': has_source_ref\n",
399
- " }\n",
400
- "\n",
401
- "def evaluate_completeness(answer: str, min_length: int = 100) -> Dict[str, any]:\n",
402
- " \"\"\"\n",
403
- " Evaluate answer completeness.\n",
404
- " \"\"\"\n",
405
- " word_count = len(answer.split())\n",
406
- " char_count = len(answer)\n",
407
- " \n",
408
- " # Penalize very short or very long answers\n",
409
- " if char_count < min_length:\n",
410
- " completeness_score = (char_count / min_length) * 100\n",
411
- " elif char_count > 2000:\n",
412
- " completeness_score = 100 - ((char_count - 2000) / 2000 * 20) # Penalty for verbosity\n",
413
- " else:\n",
414
- " completeness_score = 100\n",
415
- " \n",
416
- " return {\n",
417
- " 'Completeness_Score': round(max(0, completeness_score), 2),\n",
418
- " 'Word_Count': word_count,\n",
419
- " 'Char_Count': char_count\n",
420
- " }\n",
421
- "\n",
422
- "print(\"\u2705 Evaluation functions ready\")"
423
- ]
424
- },
425
- {
426
- "cell_type": "markdown",
427
- "metadata": {},
428
- "source": [
429
- "## 6. Run Benchmark on All Models"
430
- ]
431
- },
432
- {
433
- "cell_type": "code",
434
- "execution_count": null,
435
- "metadata": {},
436
- "outputs": [],
437
- "source": [
438
- "# Select models to test (you can comment out models to skip)\n",
439
- "MODELS_TO_TEST = [\n",
440
- " 'Llama-4-Maverick-17B',\n",
441
- " 'DeepSeek-R1',\n",
442
- " 'GPT-4.1',\n",
443
- " 'GPT-5-mini',\n",
444
- " 'Claude-Sonnet-4.5',\n",
445
- " # 'Claude-Opus-4.1', # Uncomment to test\n",
446
- " # 'Phi-4-multimodal', # Uncomment to test\n",
447
- " # 'GPT-OSS-120B', # Uncomment to test\n",
448
- "]\n",
449
- "\n",
450
- "print(f\"Testing {len(MODELS_TO_TEST)} models on {len(questions)} questions...\\n\")\n",
451
- "print(\"This may take several minutes...\\n\")"
452
- ]
453
- },
454
- {
455
- "cell_type": "code",
456
- "execution_count": null,
457
- "metadata": {},
458
- "outputs": [],
459
- "source": [
460
- "# Run benchmark\n",
461
- "results = []\n",
462
- "\n",
463
- "for model_name in MODELS_TO_TEST:\n",
464
- " print(f\"\\n{'='*80}\")\n",
465
- " print(f\"Testing: {model_name}\")\n",
466
- " print(f\"{'='*80}\")\n",
467
- " \n",
468
- " model_results = []\n",
469
- " \n",
470
- " for example_key, messages in questions.items():\n",
471
- " # Get the last user message (the actual question)\n",
472
- " user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
473
- " query = user_msg['content']\n",
474
- " \n",
475
- " print(f\"\\n Question {example_key}: {query[:80]}...\")\n",
476
- " \n",
477
- " # Retrieve documents\n",
478
- " documents = retrieve_documents(query, top_k=3)\n",
479
- " \n",
480
- " # Generate answer\n",
481
- " answer, response_time = generate_answer(model_name, query, documents)\n",
482
- " \n",
483
- " if answer.startswith('ERROR'):\n",
484
- " print(f\" \u274c Failed: {answer}\")\n",
485
- " continue\n",
486
- " \n",
487
- " print(f\" \u2705 Response time: {response_time:.2f}s\")\n",
488
- " \n",
489
- " # Get expected answer\n",
490
- " expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
491
- " \n",
492
- " # Calculate metrics\n",
493
- " similarity_metrics = calculate_answer_similarity(expected, answer) if expected else {'CER': 0, 'WER': 0, 'Similarity': 0}\n",
494
- " citation_metrics = check_citations(answer, documents)\n",
495
- " completeness_metrics = evaluate_completeness(answer)\n",
496
- " \n",
497
- " # Store result\n",
498
- " result = {\n",
499
- " 'Model': model_name,\n",
500
- " 'Question': example_key,\n",
501
- " 'Query': query[:100],\n",
502
- " 'Answer': answer[:200] + '...',\n",
503
- " 'Response_Time': round(response_time, 2),\n",
504
- " **similarity_metrics,\n",
505
- " **citation_metrics,\n",
506
- " **completeness_metrics,\n",
507
- " 'Open_Source': MODELS[model_name]['open_source'],\n",
508
- " 'Architecture_Score': MODELS[model_name]['architecture_score']\n",
509
- " }\n",
510
- " \n",
511
- " model_results.append(result)\n",
512
- " results.append(result)\n",
513
- " \n",
514
- " # Show summary for this model\n",
515
- " if model_results:\n",
516
- " avg_response_time = sum(r['Response_Time'] for r in model_results) / len(model_results)\n",
517
- " avg_similarity = sum(r['Similarity'] for r in model_results) / len(model_results)\n",
518
- " avg_citation = sum(r['Citation_Score'] for r in model_results) / len(model_results)\n",
519
- " avg_completeness = sum(r['Completeness_Score'] for r in model_results) / len(model_results)\n",
520
- " \n",
521
- " print(f\"\\n \ud83d\udcca {model_name} Summary:\")\n",
522
- " print(f\" Avg Response Time: {avg_response_time:.2f}s\")\n",
523
- " print(f\" Avg Similarity: {avg_similarity:.1f}%\")\n",
524
- " print(f\" Avg Citation Score: {avg_citation:.1f}%\")\n",
525
- " print(f\" Avg Completeness: {avg_completeness:.1f}%\")\n",
526
- "\n",
527
- "print(f\"\\n{'='*80}\")\n",
528
- "print(\"\u2705 Benchmarking complete!\")\n",
529
- "print(f\"{'='*80}\")"
530
- ]
531
- },
532
- {
533
- "cell_type": "markdown",
534
- "metadata": {},
535
- "source": [
536
- "## 7. Aggregate Results and Rankings"
537
- ]
538
- },
539
- {
540
- "cell_type": "code",
541
- "execution_count": null,
542
- "metadata": {},
543
- "outputs": [],
544
- "source": [
545
- "# Create DataFrame\n",
546
- "df = pd.DataFrame(results)\n",
547
- "\n",
548
- "# Calculate aggregate scores per model\n",
549
- "model_summary = df.groupby('Model').agg({\n",
550
- " 'Response_Time': 'mean',\n",
551
- " 'Similarity': 'mean',\n",
552
- " 'Citation_Score': 'mean',\n",
553
- " 'Completeness_Score': 'mean',\n",
554
- " 'CER': 'mean',\n",
555
- " 'WER': 'mean',\n",
556
- " 'Open_Source': 'first',\n",
557
- " 'Architecture_Score': 'first'\n",
558
- "}).round(2)\n",
559
- "\n",
560
- "# Calculate overall quality score (weighted average)\n",
561
- "model_summary['Quality_Score'] = (\n",
562
- " model_summary['Similarity'] * 0.35 + # 35% answer accuracy\n",
563
- " model_summary['Citation_Score'] * 0.35 + # 35% citation quality\n",
564
- " model_summary['Completeness_Score'] * 0.30 # 30% completeness\n",
565
- ").round(2)\n",
566
- "\n",
567
- "# Sort by Quality Score\n",
568
- "model_summary = model_summary.sort_values('Quality_Score', ascending=False)\n",
569
- "\n",
570
- "# Display summary table\n",
571
- "print(\"\\n\" + \"=\"*100)\n",
572
- "print(\"\ud83d\udcca LLM BENCHMARKING RESULTS - MODEL SUMMARY\")\n",
573
- "print(\"=\"*100)\n",
574
- "print(model_summary.to_string())\n",
575
- "print(\"=\"*100)"
576
- ]
577
- },
578
- {
579
- "cell_type": "code",
580
- "metadata": {},
581
- "source": [
582
- "# Create comprehensive visualization\n",
583
- "import os\n",
584
- "from pathlib import Path\n",
585
- "\n",
586
- "# Create output directory - using dynamic path\n",
587
- "output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
588
- "output_dir.mkdir(parents=True, exist_ok=True)\n",
589
- "\n",
590
- "fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
591
- "\n",
592
- "models = df['Model'].tolist()\n",
593
- "colors = sns.color_palette('viridis', len(models))\n",
594
- "\n",
595
- "# 1. CSR - Character Success Rate (MAIN METRIC)\n",
596
- "ax1 = axes[0, 0]\n",
597
- "bars1 = ax1.barh(models, df['CSR'], color=colors)\n",
598
- "ax1.set_xlabel('CSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
599
- "ax1.set_title('Character Success Rate (CSR)\\n\ud83c\udfc6 HACKATHON PRIMARY METRIC', \n",
600
- " fontsize=14, fontweight='bold')\n",
601
- "ax1.set_xlim(0, 100)\n",
602
- "for i, (model, csr) in enumerate(zip(models, df['CSR'])):\n",
603
- " ax1.text(csr + 1, i, f'{csr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
604
- "ax1.axvline(x=90, color='green', linestyle='--', alpha=0.3, label='Excellent (>90%)')\n",
605
- "ax1.axvline(x=80, color='orange', linestyle='--', alpha=0.3, label='Good (>80%)')\n",
606
- "ax1.legend(fontsize=9)\n",
607
- "\n",
608
- "# 2. WSR - Word Success Rate\n",
609
- "ax2 = axes[0, 1]\n",
610
- "bars2 = ax2.barh(models, df['WSR'], color=colors)\n",
611
- "ax2.set_xlabel('WSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
612
- "ax2.set_title('Word Success Rate (WSR)', fontsize=14, fontweight='bold')\n",
613
- "ax2.set_xlim(0, 100)\n",
614
- "for i, (model, wsr) in enumerate(zip(models, df['WSR'])):\n",
615
- " ax2.text(wsr + 1, i, f'{wsr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
616
- "\n",
617
- "# 3. Response Time\n",
618
- "ax3 = axes[1, 0]\n",
619
- "bars3 = ax3.barh(models, df['Response_Time'], color=colors)\n",
620
- "ax3.set_xlabel('Total Time (seconds) - Lower is Better', fontsize=12, fontweight='bold')\n",
621
- "ax3.set_title('Processing Speed', fontsize=14, fontweight='bold')\n",
622
- "for i, (model, time_val) in enumerate(zip(models, df['Response_Time'])):\n",
623
- " ax3.text(time_val + 0.5, i, f'{time_val:.1f}s', va='center', fontsize=11)\n",
624
- "\n",
625
- "# 4. Error Rates Comparison\n",
626
- "ax4 = axes[1, 1]\n",
627
- "x = range(len(models))\n",
628
- "width = 0.35\n",
629
- "ax4.bar([i - width/2 for i in x], df['CER'], width, label='CER', color='coral', alpha=0.8)\n",
630
- "ax4.bar([i + width/2 for i in x], df['WER'], width, label='WER', color='skyblue', alpha=0.8)\n",
631
- "ax4.set_ylabel('Error Rate (%) - Lower is Better', fontsize=12, fontweight='bold')\n",
632
- "ax4.set_title('Error Rates', fontsize=14, fontweight='bold')\n",
633
- "ax4.set_xticks(x)\n",
634
- "ax4.set_xticklabels(models, rotation=45, ha='right')\n",
635
- "ax4.legend(fontsize=11)\n",
636
- "ax4.grid(axis='y', alpha=0.3)\n",
637
- "\n",
638
- "plt.tight_layout()\n",
639
- "plt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\n",
640
- "plt.show()\n",
641
- "\n",
642
- "print(f\"\\n\u2705 Visualization saved to '{output_dir}/results.png'\")"
643
- ]
644
- },
645
- {
646
- "cell_type": "code",
647
- "execution_count": null,
648
- "metadata": {},
649
- "outputs": [],
650
- "source": [
651
- "# Create rankings table\n",
652
- "rankings = model_summary[[\n",
653
- " 'Quality_Score', 'Similarity', 'Citation_Score', 'Completeness_Score', \n",
654
- " 'Response_Time', 'Open_Source', 'Architecture_Score'\n",
655
- "]].copy()\n",
656
- "\n",
657
- "rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
658
- "\n",
659
- "print(\"\\n\" + \"=\"*100)\n",
660
- "print(\"\ud83c\udfc6 FINAL RANKINGS\")\n",
661
- "print(\"=\"*100)\n",
662
- "print(rankings.to_string())\n",
663
- "print(\"=\"*100)\n",
664
- "\n",
665
- "# Winner analysis\n",
666
- "best_overall = rankings.index[0]\n",
667
- "best_open_source = rankings[rankings['Open_Source'] == True].index[0] if any(rankings['Open_Source']) else None\n",
668
- "fastest = model_summary['Response_Time'].idxmin()\n",
669
- "\n",
670
- "print(\"\\n\" + \"=\"*100)\n",
671
- "print(\"\ud83d\udca1 RECOMMENDATIONS FOR HACKATHON\")\n",
672
- "print(\"=\"*100)\n",
673
- "\n",
674
- "print(f\"\\n\ud83e\udd47 Best Overall Quality: {best_overall}\")\n",
675
- "print(f\" Quality Score: {model_summary.loc[best_overall, 'Quality_Score']:.1f}%\")\n",
676
- "print(f\" Similarity: {model_summary.loc[best_overall, 'Similarity']:.1f}%\")\n",
677
- "print(f\" Citation Score: {model_summary.loc[best_overall, 'Citation_Score']:.1f}%\")\n",
678
- "print(f\" Response Time: {model_summary.loc[best_overall, 'Response_Time']:.2f}s\")\n",
679
- "print(f\" Open Source: {model_summary.loc[best_overall, 'Open_Source']}\")\n",
680
- "print(f\" Architecture Score: {model_summary.loc[best_overall, 'Architecture_Score']}\")\n",
681
- "\n",
682
- "if best_open_source:\n",
683
- " print(f\"\\n\ud83d\udd13 Best Open-Source Model: {best_open_source}\")\n",
684
- " print(f\" Quality Score: {model_summary.loc[best_open_source, 'Quality_Score']:.1f}%\")\n",
685
- " print(f\" Architecture Score: {model_summary.loc[best_open_source, 'Architecture_Score']} (Better for hackathon!)\")\n",
686
- " print(f\" Response Time: {model_summary.loc[best_open_source, 'Response_Time']:.2f}s\")\n",
687
- "\n",
688
- "print(f\"\\n\u26a1 Fastest Model: {fastest}\")\n",
689
- "print(f\" Response Time: {model_summary.loc[fastest, 'Response_Time']:.2f}s\")\n",
690
- "print(f\" Quality Score: {model_summary.loc[fastest, 'Quality_Score']:.1f}%\")\n",
691
- "\n",
692
- "print(\"\\n\" + \"=\"*100)\n",
693
- "print(\"\ud83d\udcdd FINAL RECOMMENDATION\")\n",
694
- "print(\"=\"*100)\n",
695
- "print(\"\\nScoring Breakdown:\")\n",
696
- "print(\" - LLM Quality: 30% of total hackathon score\")\n",
697
- "print(\" - Architecture: 20% of total hackathon score (open-source preferred!)\")\n",
698
- "print(\"\\nBest Choice:\")\n",
699
- "if best_open_source and model_summary.loc[best_open_source, 'Quality_Score'] >= model_summary.loc[best_overall, 'Quality_Score'] * 0.9:\n",
700
- " print(f\" \u2705 {best_open_source} - Best balance of quality and architecture score\")\n",
701
- " print(f\" Only {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality drop for higher architecture score!\")\n",
702
- "else:\n",
703
- " print(f\" \u2705 {best_overall} - Highest quality, use if quality gap is significant\")\n",
704
- " if best_open_source:\n",
705
- " print(f\" \u26a0\ufe0f Consider {best_open_source} for higher architecture score (trade-off: {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality)\")\n",
706
- "\n",
707
- "print(\"=\"*100)"
708
- ]
709
- },
710
- {
711
- "cell_type": "code",
712
- "metadata": {},
713
- "source": [
714
- "# Save results\n",
715
- "from pathlib import Path\n",
716
- "\n",
717
- "# Using dynamic path\n",
718
- "output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
719
- "output_dir.mkdir(parents=True, exist_ok=True)\n",
720
- "\n",
721
- "df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
722
- "model_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n",
723
- "rankings.to_csv(output_dir / 'rankings.csv', index=False, encoding='utf-8')\n",
724
- "\n",
725
- "print(\"\\n\u2705 Results exported to output/llm_benchmark/:\")\n",
726
- "print(\" - detailed_results.csv (all questions and answers)\")\n",
727
- "print(\" - summary.csv (model averages)\")\n",
728
- "print(\" - rankings.csv (final rankings)\")\n",
729
- "print(\" - results.png (visualizations)\")"
730
- ]
731
- },
732
- {
733
- "cell_type": "markdown",
734
- "metadata": {},
735
- "source": [
736
- "## 11. Sample Answer Comparison"
737
- ]
738
- }
739
- ],
740
- "metadata": {
741
- "kernelspec": {
742
- "display_name": "venv",
743
- "language": "python",
744
- "name": "python3"
745
  },
746
- "language_info": {
747
- "codemirror_mode": {
748
- "name": "ipython",
749
- "version": 3
750
- },
751
- "file_extension": ".py",
752
- "mimetype": "text/x-python",
753
- "name": "python",
754
- "nbconvert_exporter": "python",
755
- "pygments_lexer": "ipython3",
756
- "version": "3.10.12"
757
- }
758
- },
759
- "nbformat": 4,
760
- "nbformat_minor": 4
761
- }
 
1
  {
2
+ "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
+ "cell_type": "markdown",
5
+ "id": "38a7900c",
6
+ "metadata": {},
7
+ "source": [
8
+ "# LLM Benchmarking for SOCAR Hackathon RAG Chatbot\n",
9
+ "\n",
10
+ "Testing different LLM models for the `/llm` endpoint to find the best performer.\n",
11
+ "\n",
12
+ "## Evaluation Criteria (LLM Judge Metrics):\n",
13
+ "- **Accuracy**: Is the answer correct?\n",
14
+ "- **Relevance**: Are retrieved citations relevant?\n",
15
+ "- **Completeness**: Does it fully answer the question?\n",
16
+ "- **Citation Quality**: Proper sources with page numbers?\n",
17
+ "- **Response Time**: Speed of generation\n",
18
+ "\n",
19
+ "## Available LLM Models:\n",
20
+ "1. **Llama-4-Maverick-17B** (Open-source)\n",
21
+ "2. **DeepSeek-R1** (Open-source reasoning)\n",
22
+ "3. **GPT-4.1, GPT-5, GPT-5-mini**\n",
23
+ "4. **Claude Sonnet 4.5**"
24
+ ]
25
  },
26
  {
27
+ "cell_type": "code",
28
+ "execution_count": 45,
29
+ "id": "143cf60d",
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "# Install required packages\n",
34
+ "# !pip install openai pinecone-client sentence-transformers python-dotenv pandas matplotlib seaborn jiwer"
35
+ ]
36
+ },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  {
38
+ "cell_type": "code",
39
+ "execution_count": 46,
40
+ "id": "d698b11a",
41
+ "metadata": {},
42
+ "outputs": [
43
+ {
44
+ "name": "stdout",
45
+ "output_type": "stream",
46
+ "text": [
47
+ " Libraries loaded\n"
48
+ ]
49
+ }
50
+ ],
51
+ "source": [
52
+ "import os\n",
53
+ "import json\n",
54
+ "import time\n",
55
+ "from typing import Dict, List, Tuple\n",
56
+ "from dotenv import load_dotenv\n",
57
+ "import pandas as pd\n",
58
+ "import matplotlib.pyplot as plt\n",
59
+ "import seaborn as sns\n",
60
+ "from openai import AzureOpenAI\n",
61
+ "from pinecone import Pinecone\n",
62
+ "from sentence_transformers import SentenceTransformer\n",
63
+ "from jiwer import wer, cer\n",
64
+ "from pathlib import Path\n",
65
+ "\n",
66
+ "load_dotenv()\n",
67
+ "sns.set_style(\"whitegrid\")\n",
68
+ "plt.rcParams[\"figure.figsize\"] = (14, 8)\n",
69
+ "\n",
70
+ "print(\"✅ Libraries loaded\")"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 47,
76
+ "id": "087187fb",
77
+ "metadata": {},
78
+ "outputs": [
79
+ {
80
+ "name": "stdout",
81
+ "output_type": "stream",
82
+ "text": [
83
+ "✅ Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
84
+ "✅ Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n"
85
+ ]
86
+ }
87
+ ],
88
+ "source": [
89
+ "# Auto-detect project root\n",
90
+ "if Path(\"data\").exists() and Path(\"docs\").exists():\n",
91
+ " PROJECT_ROOT = Path.cwd()\n",
92
+ "elif Path(\"../data\").exists() and Path(\"../docs\").exists():\n",
93
+ " PROJECT_ROOT = Path.cwd().parent\n",
94
+ "else:\n",
95
+ " current = Path.cwd()\n",
96
+ " while current != current.parent:\n",
97
+ " if (current / \"data\").exists() and (current / \"docs\").exists():\n",
98
+ " PROJECT_ROOT = current\n",
99
+ " break\n",
100
+ " current = current.parent\n",
101
+ " else:\n",
102
+ " PROJECT_ROOT = Path.cwd()\n",
103
+ "\n",
104
+ "DATA_DIR = PROJECT_ROOT / \"data\"\n",
105
+ "DOCS_DIR = PROJECT_ROOT / \"docs\"\n",
106
+ "OUTPUT_DIR = PROJECT_ROOT / \"output\"\n",
107
+ "\n",
108
+ "print(f\"✅ Project root: {PROJECT_ROOT}\")\n",
109
+ "print(f\"✅ Docs directory: {DOCS_DIR}\")"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 48,
115
+ "id": "cf51bb3f",
116
+ "metadata": {},
117
+ "outputs": [
118
+ {
119
+ "name": "stdout",
120
+ "output_type": "stream",
121
+ "text": [
122
+ "Loaded 5 test cases\n"
123
+ ]
124
+ }
125
+ ],
126
+ "source": [
127
+ "# Load sample questions and answers using dynamic paths\n",
128
+ "with open(DOCS_DIR / \"sample_questions.json\", \"r\", encoding=\"utf-8\") as f:\n",
129
+ " questions = json.load(f)\n",
130
+ "\n",
131
+ "with open(DOCS_DIR / \"sample_answers.json\", \"r\", encoding=\"utf-8\") as f:\n",
132
+ " expected_answers = json.load(f)\n",
133
+ "\n",
134
+ "print(f\"Loaded {len(questions)} test cases\")"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 49,
140
+ "id": "9e761174",
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "name": "stdout",
145
+ "output_type": "stream",
146
+ "text": [
147
+ "✅ Vector DB connected\n",
148
+ "✅ Embedding model loaded\n"
149
+ ]
150
+ }
151
+ ],
152
+ "source": [
153
+ "# Initialize Pinecone\n",
154
+ "pc = Pinecone(api_key=os.getenv(\"PINECONE_API_KEY\"))\n",
155
+ "index = pc.Index(os.getenv(\"PINECONE_INDEX_NAME\", \"hackathon\"))\n",
156
+ "\n",
157
+ "# Initialize embedding model\n",
158
+ "embed_model = SentenceTransformer(\"BAAI/bge-large-en-v1.5\")\n",
159
+ "\n",
160
+ "print(f\"✅ Vector DB connected\")\n",
161
+ "print(f\"✅ Embedding model loaded\")"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 50,
167
+ "id": "74396795",
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "name": "stdout",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "✅ Retrieval function ready\n"
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
180
+ " \"\"\"Retrieve relevant documents from vector database.\"\"\"\n",
181
+ " query_embedding = embed_model.encode(query).tolist()\n",
182
+ " \n",
183
+ " results = index.query(\n",
184
+ " vector=query_embedding,\n",
185
+ " top_k=top_k,\n",
186
+ " include_metadata=True\n",
187
+ " )\n",
188
+ " \n",
189
+ " documents = []\n",
190
+ " for match in results[\"matches\"]:\n",
191
+ " documents.append({\n",
192
+ " \"pdf_name\": match[\"metadata\"].get(\"pdf_name\", \"unknown.pdf\"),\n",
193
+ " \"page_number\": match[\"metadata\"].get(\"page_number\", 0),\n",
194
+ " \"content\": match[\"metadata\"].get(\"text\", \"\"),\n",
195
+ " \"score\": match.get(\"score\", 0.0)\n",
196
+ " })\n",
197
+ " \n",
198
+ " return documents\n",
199
+ "\n",
200
+ "print(\"✅ Retrieval function ready\")"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 57,
206
+ "id": "2bfcc6fb",
207
+ "metadata": {},
208
+ "outputs": [
209
+ {
210
+ "name": "stdout",
211
+ "output_type": "stream",
212
+ "text": [
213
+ "✅ Configured 3 LLM models\n"
214
+ ]
215
+ }
216
+ ],
217
+ "source": [
218
+ "# Initialize Azure OpenAI\n",
219
+ "azure_client = AzureOpenAI(\n",
220
+ " api_key=os.getenv(\"AZURE_OPENAI_API_KEY\"),\n",
221
+ " api_version=os.getenv(\"AZURE_OPENAI_API_VERSION\", \"2024-08-01-preview\"),\n",
222
+ " azure_endpoint=os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n",
223
+ ")\n",
224
+ "\n",
225
+ "LLM_MODELS = {\n",
226
+ " \"Llama-4-Maverick\": \"Llama-4-Maverick-17B-128E-Instruct-FP8\",\n",
227
+ " \"DeepSeek-R1\": \"DeepSeek-R1\",\n",
228
+ " \"GPT-4.1\": \"gpt-4.1\",\n",
229
+ " # \"GPT-5-mini\": \"gpt-5-mini\"\n",
230
+ " # \"Claude-Sonnet-4.5\": \"claude-sonnet-4-5\" # Not available in Azure deployment\n",
231
+ "}\n",
232
+ "\n",
233
+ "print(f\"✅ Configured {len(LLM_MODELS)} LLM models\")"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": 61,
239
+ "id": "ddedd503",
240
+ "metadata": {},
241
+ "outputs": [
242
+ {
243
+ "name": "stdout",
244
+ "output_type": "stream",
245
+ "text": [
246
+ "✅ Generation function ready\n"
247
+ ]
248
+ }
249
+ ],
250
+ "source": [
251
+ "def generate_answer(model_name: str, query: str, documents: List[Dict],\n",
252
+ " temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n",
253
+ " \"\"\"Generate answer using specified LLM model.\"\"\"\n",
254
+ " context_parts = []\n",
255
+ " for i, doc in enumerate(documents, 1):\n",
256
+ " context_parts.append(\n",
257
+ " f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n",
258
+ " )\n",
259
+ " context = \"\\n\\n\".join(context_parts)\n",
260
+ " \n",
261
+ " prompt = f\"\"\"Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə mütəxəssis köməkçisisiniz.\n",
262
+ "\n",
263
+ "Kontekst:\n",
264
+ "{context}\n",
265
+ "\n",
266
+ "Sual: {query}\n",
267
+ "\n",
268
+ "Ətraflı cavab verin və mütləq sənəd mənbələrinə istinad edin.\"\"\"\n",
269
+ " \n",
270
+ " deployment = LLM_MODELS[model_name]\n",
271
+ " \n",
272
+ " try:\n",
273
+ " start_time = time.time()\n",
274
+ " \n",
275
+ " # GPT-5 models use max_completion_tokens, others use max_tokens\n",
276
+ " if deployment.startswith(\"gpt-5\"):\n",
277
+ " response = azure_client.chat.completions.create(\n",
278
+ " model=deployment,\n",
279
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
280
+ " temperature=temperature,\n",
281
+ " max_completion_tokens=max_tokens\n",
282
+ " )\n",
283
+ " else:\n",
284
+ " response = azure_client.chat.completions.create(\n",
285
+ " model=deployment,\n",
286
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
287
+ " temperature=temperature,\n",
288
+ " max_tokens=max_tokens\n",
289
+ " )\n",
290
+ " \n",
291
+ " response_time = time.time() - start_time\n",
292
+ " answer = response.choices[0].message.content\n",
293
+ " return answer, response_time\n",
294
+ " \n",
295
+ " except Exception as e:\n",
296
+ " return f\"ERROR: {str(e)}\", 0.0\n",
297
+ "\n",
298
+ "print(\"✅ Generation function ready\")"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": 62,
304
+ "id": "946b0e30",
305
+ "metadata": {},
306
+ "outputs": [
307
+ {
308
+ "name": "stdout",
309
+ "output_type": "stream",
310
+ "text": [
311
+ "✅ Evaluation functions ready\n"
312
+ ]
313
+ }
314
+ ],
315
+ "source": [
316
+ "def evaluate_answer(expected: str, generated: str, documents: List[Dict]) -> Dict:\n",
317
+ " \"\"\"Evaluate answer quality.\"\"\"\n",
318
+ " # Normalize text\n",
319
+ " def normalize(text):\n",
320
+ " return text.lower().strip()\n",
321
+ " \n",
322
+ " # Calculate similarity\n",
323
+ " if expected:\n",
324
+ " wer_score = wer(normalize(expected), normalize(generated)) * 100\n",
325
+ " similarity = max(0, 100 - wer_score)\n",
326
+ " else:\n",
327
+ " similarity = 0\n",
328
+ " \n",
329
+ " # Check citations\n",
330
+ " pdf_names = [doc[\"pdf_name\"].replace(\".pdf\", \"\") for doc in documents]\n",
331
+ " cited_pdfs = sum(1 for pdf in pdf_names if pdf in generated)\n",
332
+ " citation_score = (cited_pdfs / len(pdf_names)) * 100 if pdf_names else 0\n",
333
+ " \n",
334
+ " # Completeness\n",
335
+ " word_count = len(generated.split())\n",
336
+ " completeness = min(100, (word_count / 50) * 100)\n",
337
+ " \n",
338
+ " return {\n",
339
+ " \"Similarity\": round(similarity, 2),\n",
340
+ " \"Citation_Score\": round(citation_score, 2),\n",
341
+ " \"Completeness\": round(completeness, 2),\n",
342
+ " \"Quality_Score\": round((similarity * 0.4 + citation_score * 0.4 + completeness * 0.2), 2)\n",
343
+ " }\n",
344
+ "\n",
345
+ "print(\"✅ Evaluation functions ready\")"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "markdown",
350
+ "id": "319459ce",
351
+ "metadata": {},
352
+ "source": [
353
+ "## Run LLM Benchmark"
354
+ ]
355
+ },
356
  {
357
+ "cell_type": "code",
358
+ "execution_count": 63,
359
+ "id": "c8867f44",
360
+ "metadata": {},
361
+ "outputs": [
362
+ {
363
+ "name": "stdout",
364
+ "output_type": "stream",
365
+ "text": [
366
+ "*******\n",
367
+ "Testing: Llama-4-Maverick\n",
368
+ "**********\n",
369
+ " Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
370
+ " ✅ 4.31s\n",
371
+ " Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
372
+ " ✅ 4.61s\n",
373
+ " Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
374
+ " ✅ 3.92s\n",
375
+ " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
376
+ " ✅ 4.13s\n",
377
+ " Example5: Bu zonada hansı proseslər baş verir?...\n",
378
+ " ✅ 3.50s\n",
379
+ "*******\n",
380
+ "Testing: DeepSeek-R1\n",
381
+ "**********\n",
382
+ " Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
383
+ " ✅ 10.38s\n",
384
+ " Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
385
+ " ✅ 11.32s\n",
386
+ " Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
387
+ " ✅ 10.45s\n",
388
+ " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...\n",
389
+ " ✅ 10.56s\n",
390
+ " Example5: Bu zonada hansı proseslər baş verir?...\n",
391
+ " ✅ 10.99s\n",
392
+ "*******\n",
393
+ "Testing: GPT-4.1\n",
394
+ "**********\n",
395
+ " Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...\n",
396
+ " ✅ 6.32s\n",
397
+ " Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...\n",
398
+ " ✅ 5.85s\n",
399
+ " Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...\n",
400
+ " ✅ 8.09s\n",
401
+ " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çök��kliyi (AKÇ) üçün geote...\n",
402
+ " ✅ 6.72s\n",
403
+ " Example5: Bu zonada hansı proseslər baş verir?...\n",
404
+ " ✅ 5.22s\n",
405
+ "*********\n",
406
+ "✅ Benchmark complete!\n"
407
+ ]
408
+ }
409
+ ],
410
+ "source": [
411
+ "# Run benchmark\n",
412
+ "results = []\n",
413
+ "\n",
414
+ "for model_name in LLM_MODELS.keys():\n",
415
+ " print(f\"*******\")\n",
416
+ " print(f\"Testing: {model_name}\")\n",
417
+ " print(f\"**********\")\n",
418
+ " \n",
419
+ " for example_key, messages in questions.items():\n",
420
+ " user_msg = [m for m in messages if m[\"role\"] == \"user\"][-1]\n",
421
+ " query = user_msg[\"content\"]\n",
422
+ " \n",
423
+ " print(f\" {example_key}: {query[:60]}...\")\n",
424
+ " \n",
425
+ " # Retrieve and generate\n",
426
+ " documents = retrieve_documents(query, top_k=3)\n",
427
+ " answer, response_time = generate_answer(model_name, query, documents)\n",
428
+ " \n",
429
+ " if answer.startswith(\"ERROR\"):\n",
430
+ " print(f\" ❌ {answer}\")\n",
431
+ " continue\n",
432
+ " \n",
433
+ " print(f\" ✅ {response_time:.2f}s\")\n",
434
+ " \n",
435
+ " # Evaluate\n",
436
+ " expected = expected_answers.get(example_key, {}).get(\"Answer\", \"\")\n",
437
+ " metrics = evaluate_answer(expected, answer, documents)\n",
438
+ " \n",
439
+ " results.append({\n",
440
+ " \"Model\": model_name,\n",
441
+ " \"Question\": example_key,\n",
442
+ " \"Response_Time\": round(response_time, 2),\n",
443
+ " **metrics\n",
444
+ " })\n",
445
+ "\n",
446
+ "print(\"*********\")\n",
447
+ "print(\"✅ Benchmark complete!\")"
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "execution_count": 55,
453
+ "id": "9b243569",
454
+ "metadata": {},
455
+ "outputs": [
456
+ {
457
+ "name": "stdout",
458
+ "output_type": "stream",
459
+ "text": [
460
+ "\n",
461
+ "====================================================================================================\n",
462
+ "📊 LLM BENCHMARKING RESULTS\n",
463
+ "====================================================================================================\n",
464
+ " Quality_Score Similarity Citation_Score Completeness Response_Time\n",
465
+ "Model \n",
466
+ "GPT-4.1 52.00 0.00 80.00 100.0 6.38\n",
467
+ "Llama-4-Maverick 52.00 0.00 80.00 100.0 4.00\n",
468
+ "DeepSeek-R1 32.27 1.54 33.33 91.6 10.98\n",
469
+ "====================================================================================================\n"
470
+ ]
471
+ }
472
+ ],
473
+ "source": [
474
+ "# Analyze results\n",
475
+ "df = pd.DataFrame(results)\n",
476
+ "summary = df.groupby(\"Model\").agg({\n",
477
+ " \"Quality_Score\": \"mean\",\n",
478
+ " \"Similarity\": \"mean\",\n",
479
+ " \"Citation_Score\": \"mean\",\n",
480
+ " \"Completeness\": \"mean\",\n",
481
+ " \"Response_Time\": \"mean\"\n",
482
+ "}).round(2).sort_values(\"Quality_Score\", ascending=False)\n",
483
+ "\n",
484
+ "print(\"\\n\" + \"=\"*100)\n",
485
+ "print(\"📊 LLM BENCHMARKING RESULTS\")\n",
486
+ "print(\"=\"*100)\n",
487
+ "print(summary.to_string())\n",
488
+ "print(\"=\"*100)"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": 56,
494
+ "id": "8c64cf75",
495
+ "metadata": {},
496
+ "outputs": [
497
+ {
498
+ "name": "stdout",
499
+ "output_type": "stream",
500
+ "text": [
501
+ "\n",
502
+ "✅ Results saved to output/llm_benchmark/\n"
503
+ ]
504
+ }
505
+ ],
506
+ "source": [
507
+ "# Save results using dynamic path\n",
508
+ "output_dir = OUTPUT_DIR / \"llm_benchmark\"\n",
509
+ "output_dir.mkdir(parents=True, exist_ok=True)\n",
510
+ "\n",
511
+ "df.to_csv(output_dir / \"detailed_results.csv\", index=False, encoding=\"utf-8\")\n",
512
+ "summary.to_csv(output_dir / \"summary.csv\", encoding=\"utf-8\")\n",
513
+ "\n",
514
+ "print(\"\\n✅ Results saved to output/llm_benchmark/\")"
515
+ ]
516
+ }
517
+ ],
518
+ "metadata": {
519
+ "kernelspec": {
520
+ "display_name": "Python 3",
521
+ "language": "python",
522
+ "name": "python3"
523
+ },
524
+ "language_info": {
525
+ "name": "python",
526
+ "version": "3.10.0"
527
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  },
529
+ "nbformat": 4,
530
+ "nbformat_minor": 5
531
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/llm_benchmark.ipynb.backup DELETED
@@ -1,761 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "# LLM Benchmarking for SOCAR Hackathon RAG Chatbot\n",
8
- "\n",
9
- "This notebook tests different LLM models for the `/llm` endpoint to find the best performer.\n",
10
- "\n",
11
- "## Evaluation Criteria (LLM Judge Metrics):\n",
12
- "- **Accuracy**: Is the answer correct?\n",
13
- "- **Relevance**: Are retrieved citations relevant?\n",
14
- "- **Completeness**: Does it fully answer the question?\n",
15
- "- **Citation Quality**: Proper sources with page numbers?\n",
16
- "- **Response Time**: Speed of generation\n",
17
- "\n",
18
- "## Available LLM Models:\n",
19
- "1. **Llama-4-Maverick-17B-128E-Instruct-FP8** (Current choice, open-source)\n",
20
- "2. **DeepSeek-R1** (Open-source reasoning model)\n",
21
- "3. **GPT-4.1** (Strong general performance)\n",
22
- "4. **GPT-5, GPT-5-mini**\n",
23
- "5. **Claude Sonnet 4.5** (Best quality)\n",
24
- "6. **Claude Opus 4.1**\n",
25
- "7. **Phi-4-multimodal-instruct**\n",
26
- "8. **gpt-oss-120b**"
27
- ]
28
- },
29
- {
30
- "cell_type": "code",
31
- "execution_count": 1,
32
- "metadata": {},
33
- "outputs": [],
34
- "source": [
35
- "# Install required packages\n",
36
- "# !pip install openai pinecone-client sentence-transformers python-dotenv pandas matplotlib seaborn jiwer"
37
- ]
38
- },
39
- {
40
- "cell_type": "code",
41
- "execution_count": 2,
42
- "metadata": {},
43
- "outputs": [
44
- {
45
- "name": "stderr",
46
- "output_type": "stream",
47
- "text": [
48
- "/Users/ismatsamadov/SOCAR_Hackathon/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
49
- " from .autonotebook import tqdm as notebook_tqdm\n"
50
- ]
51
- },
52
- {
53
- "name": "stdout",
54
- "output_type": "stream",
55
- "text": [
56
- "\u2705 Libraries loaded successfully\n"
57
- ]
58
- }
59
- ],
60
- "source": [
61
- "import os\n",
62
- "import json\n",
63
- "import time\n",
64
- "from typing import Dict, List, Tuple\n",
65
- "from dotenv import load_dotenv\n",
66
- "import pandas as pd\n",
67
- "import matplotlib.pyplot as plt\n",
68
- "import seaborn as sns\n",
69
- "from openai import AzureOpenAI\n",
70
- "from pinecone import Pinecone\n",
71
- "from sentence_transformers import SentenceTransformer\n",
72
- "from jiwer import wer, cer\n",
73
- "\n",
74
- "# Load environment variables\n",
75
- "load_dotenv()\n",
76
- "\n",
77
- "# Set style\n",
78
- "sns.set_style('whitegrid')\n",
79
- "plt.rcParams['figure.figsize'] = (14, 8)\n",
80
- "\n",
81
- "print(\"\u2705 Libraries loaded successfully\")"
82
- ]
83
- },
84
- {
85
- "cell_type": "code",
86
- "execution_count": 3,
87
- "metadata": {},
88
- "outputs": [
89
- {
90
- "name": "stdout",
91
- "output_type": "stream",
92
- "text": [
93
- "\u2705 Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
94
- "\u2705 Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n",
95
- "\u2705 Output directory: /Users/ismatsamadov/SOCAR_Hackathon/output\n"
96
- ]
97
- }
98
- ],
99
- "source": [
100
- "# Auto-detect project root (works from any directory)\n",
101
- "import os\n",
102
- "from pathlib import Path\n",
103
- "\n",
104
- "if Path('data').exists() and Path('docs').exists():\n",
105
- " # Already in project root\n",
106
- " PROJECT_ROOT = Path.cwd()\n",
107
- "elif Path('../data').exists() and Path('../docs').exists():\n",
108
- " # In notebooks/ subdirectory\n",
109
- " PROJECT_ROOT = Path.cwd().parent\n",
110
- "else:\n",
111
- " # Fallback: try to find project root\n",
112
- " current = Path.cwd()\n",
113
- " while current != current.parent:\n",
114
- " if (current / 'data').exists() and (current / 'docs').exists():\n",
115
- " PROJECT_ROOT = current\n",
116
- " break\n",
117
- " current = current.parent\n",
118
- " else:\n",
119
- " PROJECT_ROOT = Path.cwd()\n",
120
- "\n",
121
- "# Define all paths relative to project root\n",
122
- "DATA_DIR = PROJECT_ROOT / 'data'\n",
123
- "DOCS_DIR = PROJECT_ROOT / 'docs'\n",
124
- "OUTPUT_DIR = PROJECT_ROOT / 'output'\n",
125
- "\n",
126
- "print(f\"\u2705 Project root: {PROJECT_ROOT}\")\n",
127
- "print(f\"\u2705 Docs directory: {DOCS_DIR}\")\n",
128
- "print(f\"\u2705 Output directory: {OUTPUT_DIR}\")"
129
- ]
130
- },
131
- {
132
- "cell_type": "code",
133
- "execution_count": 4,
134
- "metadata": {},
135
- "outputs": [
136
- {
137
- "name": "stdout",
138
- "output_type": "stream",
139
- "text": [
140
- "Loaded 5 test cases\n",
141
- "\n",
142
- "Test Questions:\n",
143
- "1. Example1: Daha az quyu il\u0259 daha \u00e7ox hasilat \u0259ld\u0259 etm\u0259k \u00fc\u00e7\u00fcn hans\u0131 \u0259sas amill\u0259rin inteqrasiyas\u0131 t\u0259l\u0259b olunur?...\n",
144
- "2. Example2: Q\u0259rbi Ab\u015feron yata\u011f\u0131nda suvurma t\u0259dbirl\u0259ri hans\u0131 tarixd\u0259 v\u0259 hans\u0131 layda t\u0259tbiq edilmi\u015fdir v\u0259 bunun m...\n",
145
- "3. Example3: Pirallah\u0131 strukturunda 1253 n\u00f6mr\u0259li quyudan g\u00f6t\u00fcr\u00fclm\u00fc\u015f n\u00fcmun\u0259l\u0259rd\u0259 SiO2 v\u0259 CaO oksidl\u0259ri aras\u0131nda ha...\n",
146
- "4. Example4: Bak\u0131 arxipelaq\u0131 (BA) v\u0259 A\u015fa\u011f\u0131 K\u00fcr \u00e7\u00f6k\u0259kliyi (AK\u00c7) \u00fc\u00e7\u00fcn geotemperatur x\u0259rit\u0259l\u0259rin\u0259 \u0259sas\u0259n neft v\u0259 qaz...\n",
147
- "5. Example5: Bu zonada hans\u0131 prosesl\u0259r ba\u015f verir?...\n"
148
- ]
149
- }
150
- ],
151
- "source": [
152
- "# Load sample questions - using dynamic paths\n",
153
- "with open(DOCS_DIR / 'sample_questions.json', 'r', encoding='utf-8') as f:\n",
154
- " questions = json.load(f)\n",
155
- "\n",
156
- "# Load expected answers - using dynamic paths\n",
157
- "with open(DOCS_DIR / 'sample_answers.json', 'r', encoding='utf-8') as f:\n",
158
- " expected_answers = json.load(f)\n",
159
- "\n",
160
- "print(f\"Loaded {len(questions)} test cases\")\n",
161
- "print(\"\\nTest Questions:\")\n",
162
- "for i, (key, msgs) in enumerate(questions.items(), 1):\n",
163
- " user_msg = [m for m in msgs if m['role'] == 'user'][-1]\n",
164
- " print(f\"{i}. {key}: {user_msg['content'][:100]}...\")"
165
- ]
166
- },
167
- {
168
- "cell_type": "markdown",
169
- "metadata": {},
170
- "source": [
171
- "## 2. Initialize Vector Database and Embedding Model"
172
- ]
173
- },
174
- {
175
- "cell_type": "code",
176
- "execution_count": null,
177
- "metadata": {},
178
- "outputs": [],
179
- "source": [
180
- "# Initialize Pinecone\n",
181
- "pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
182
- "index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
183
- "\n",
184
- "# Initialize embedding model (same as used for ingestion)\n",
185
- "embed_model = SentenceTransformer('BAAI/bge-large-en-v1.5')\n",
186
- "\n",
187
- "print(f\"\u2705 Vector DB connected: {index.describe_index_stats()}\")\n",
188
- "print(f\"\u2705 Embedding model loaded: {embed_model}\")"
189
- ]
190
- },
191
- {
192
- "cell_type": "markdown",
193
- "metadata": {},
194
- "source": [
195
- "## 3. RAG Retrieval Function"
196
- ]
197
- },
198
- {
199
- "cell_type": "code",
200
- "execution_count": null,
201
- "metadata": {},
202
- "outputs": [],
203
- "source": [
204
- "def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
205
- " \"\"\"\n",
206
- " Retrieve relevant documents from vector database.\n",
207
- " \"\"\"\n",
208
- " # Generate query embedding\n",
209
- " query_embedding = embed_model.encode(query).tolist()\n",
210
- " \n",
211
- " # Search vector DB\n",
212
- " results = index.query(\n",
213
- " vector=query_embedding,\n",
214
- " top_k=top_k,\n",
215
- " include_metadata=True\n",
216
- " )\n",
217
- " \n",
218
- " # Extract documents\n",
219
- " documents = []\n",
220
- " for match in results['matches']:\n",
221
- " documents.append({\n",
222
- " 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
223
- " 'page_number': match['metadata'].get('page_number', 0),\n",
224
- " 'content': match['metadata'].get('text', ''),\n",
225
- " 'score': match.get('score', 0.0)\n",
226
- " })\n",
227
- " \n",
228
- " return documents\n",
229
- "\n",
230
- "# Test retrieval\n",
231
- "test_query = \"Pal\u00e7\u0131q vulkanlar\u0131n\u0131n t\u0259sir radiusu n\u0259 q\u0259d\u0259rdir?\"\n",
232
- "test_docs = retrieve_documents(test_query)\n",
233
- "print(f\"\\n\u2705 Retrieved {len(test_docs)} documents for test query\")\n",
234
- "print(f\"Top result: {test_docs[0]['pdf_name']}, page {test_docs[0]['page_number']} (score: {test_docs[0]['score']:.3f})\")"
235
- ]
236
- },
237
- {
238
- "cell_type": "markdown",
239
- "metadata": {},
240
- "source": [
241
- "## 4. LLM Client Functions"
242
- ]
243
- },
244
- {
245
- "cell_type": "code",
246
- "execution_count": null,
247
- "metadata": {},
248
- "outputs": [],
249
- "source": [
250
- "# Initialize Azure OpenAI\n",
251
- "azure_client = AzureOpenAI(\n",
252
- " api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
253
- " api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
254
- " azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
255
- ")\n",
256
- "\n",
257
- "LLM_MODELS = {\n",
258
- " 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
259
- " 'DeepSeek-R1': 'DeepSeek-R1',\n",
260
- " 'GPT-4.1': 'gpt-4.1',\n",
261
- " 'GPT-5-mini': 'gpt-5-mini',\n",
262
- " 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n",
263
- "}\n",
264
- "\n",
265
- "def generate_answer(model_name: str, query: str, documents: List[Dict], \n",
266
- " temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n",
267
- " \"\"\"\n",
268
- " Generate answer using specified LLM model.\n",
269
- " Returns: (answer, response_time)\n",
270
- " \"\"\"\n",
271
- " # Build context from retrieved documents\n",
272
- " context_parts = []\n",
273
- " for i, doc in enumerate(documents, 1):\n",
274
- " context_parts.append(\n",
275
- " f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n",
276
- " )\n",
277
- " context = \"\\n\\n\".join(context_parts)\n",
278
- " \n",
279
- " # Create prompt\n",
280
- " prompt = f\"\"\"Siz SOCAR-\u0131n tarixi neft v\u0259 qaz s\u0259n\u0259dl\u0259ri \u00fczr\u0259 m\u00fct\u0259x\u0259ssis k\u00f6m\u0259k\u00e7isisiniz.\n",
281
- "\n",
282
- "Kontekst (\u0259laq\u0259li s\u0259n\u0259dl\u0259r):\n",
283
- "{context}\n",
284
- "\n",
285
- "Sual: {query}\n",
286
- "\n",
287
- "\u018ftrafl\u0131 cavab verin v\u0259 m\u00fctl\u0259q s\u0259n\u0259d m\u0259nb\u0259l\u0259rin\u0259 istinad edin (PDF ad\u0131 v\u0259 s\u0259hif\u0259 n\u00f6mr\u0259si il\u0259).\n",
288
- "Cavab\u0131n\u0131z d\u0259qiq, faktlara \u0259saslanan v\u0259 kontekst m\u0259lumatlar\u0131ndan istifad\u0259 ed\u0259n olmal\u0131d\u0131r.\"\"\"\n",
289
- " \n",
290
- " # Get model deployment\n",
291
- " deployment = MODELS[model_name]['deployment']\n",
292
- " \n",
293
- " try:\n",
294
- " start_time = time.time()\n",
295
- " \n",
296
- " # GPT-5 models use max_completion_tokens, others use max_tokens\n",
297
- " if deployment.startswith('gpt-5'):\n",
298
- " response = azure_client.chat.completions.create(\n",
299
- " model=deployment,\n",
300
- " messages=[\n",
301
- " {\"role\": \"user\", \"content\": prompt}\n",
302
- " ],\n",
303
- " temperature=temperature,\n",
304
- " max_completion_tokens=max_tokens\n",
305
- " )\n",
306
- " else:\n",
307
- " response = azure_client.chat.completions.create(\n",
308
- " model=deployment,\n",
309
- " messages=[\n",
310
- " {\"role\": \"user\", \"content\": prompt}\n",
311
- " ],\n",
312
- " temperature=temperature,\n",
313
- " max_tokens=max_tokens\n",
314
- " )\n",
315
- " \n",
316
- " response_time = time.time() - start_time\n",
317
- " answer = response.choices[0].message.content\n",
318
- " \n",
319
- " return answer, response_time\n",
320
- " \n",
321
- " except Exception as e:\n",
322
- " return f\"ERROR: {str(e)}\", 0.0\n",
323
- "\n",
324
- "print(f\"\\n\u2705 Configured {len(LLM_MODELS)} LLM models for testing\")"
325
- ]
326
- },
327
- {
328
- "cell_type": "markdown",
329
- "metadata": {},
330
- "source": [
331
- "## 5. Evaluation Metrics"
332
- ]
333
- },
334
- {
335
- "cell_type": "code",
336
- "execution_count": null,
337
- "metadata": {},
338
- "outputs": [],
339
- "source": [
340
- "def normalize_text(text: str) -> str:\n",
341
- " \"\"\"Normalize text for comparison.\"\"\"\n",
342
- " import re\n",
343
- " text = text.lower().strip()\n",
344
- " text = re.sub(r'\\s+', ' ', text)\n",
345
- " return text\n",
346
- "\n",
347
- "def calculate_answer_similarity(reference: str, hypothesis: str) -> Dict[str, float]:\n",
348
- " \"\"\"\n",
349
- " Calculate similarity between generated and expected answer.\n",
350
- " Lower is better for error rates.\n",
351
- " \"\"\"\n",
352
- " ref_norm = normalize_text(reference)\n",
353
- " hyp_norm = normalize_text(hypothesis)\n",
354
- " \n",
355
- " # Character Error Rate\n",
356
- " cer_score = cer(ref_norm, hyp_norm) * 100\n",
357
- " \n",
358
- " # Word Error Rate \n",
359
- " wer_score = wer(ref_norm, hyp_norm) * 100\n",
360
- " \n",
361
- " # Similarity scores (higher is better)\n",
362
- " similarity = max(0, 100 - wer_score)\n",
363
- " \n",
364
- " return {\n",
365
- " 'CER': round(cer_score, 2),\n",
366
- " 'WER': round(wer_score, 2),\n",
367
- " 'Similarity': round(similarity, 2)\n",
368
- " }\n",
369
- "\n",
370
- "def check_citations(answer: str, documents: List[Dict]) -> Dict[str, any]:\n",
371
- " \"\"\"\n",
372
- " Check if answer includes proper citations.\n",
373
- " \"\"\"\n",
374
- " import re\n",
375
- " \n",
376
- " # Check for PDF names\n",
377
- " pdf_names = [doc['pdf_name'] for doc in documents]\n",
378
- " cited_pdfs = sum(1 for pdf in pdf_names if pdf.replace('.pdf', '') in answer)\n",
379
- " \n",
380
- " # Check for page numbers\n",
381
- " page_numbers = [str(doc['page_number']) for doc in documents]\n",
382
- " cited_pages = sum(1 for page in page_numbers if page in answer)\n",
383
- " \n",
384
- " # Check for source keywords\n",
385
- " source_keywords = ['m\u0259nb\u0259', 's\u0259n\u0259d', 's\u0259hif\u0259', 'pdf', 'document', 'page', 'source']\n",
386
- " has_source_ref = any(kw in answer.lower() for kw in source_keywords)\n",
387
- " \n",
388
- " citation_score = (\n",
389
- " (cited_pdfs / len(pdf_names) * 40) + # 40% for PDF citation\n",
390
- " (cited_pages / len(page_numbers) * 40) + # 40% for page citation\n",
391
- " (20 if has_source_ref else 0) # 20% for having source keywords\n",
392
- " )\n",
393
- " \n",
394
- " return {\n",
395
- " 'Citation_Score': round(citation_score, 2),\n",
396
- " 'Cited_PDFs': cited_pdfs,\n",
397
- " 'Cited_Pages': cited_pages,\n",
398
- " 'Has_Source_Reference': has_source_ref\n",
399
- " }\n",
400
- "\n",
401
- "def evaluate_completeness(answer: str, min_length: int = 100) -> Dict[str, any]:\n",
402
- " \"\"\"\n",
403
- " Evaluate answer completeness.\n",
404
- " \"\"\"\n",
405
- " word_count = len(answer.split())\n",
406
- " char_count = len(answer)\n",
407
- " \n",
408
- " # Penalize very short or very long answers\n",
409
- " if char_count < min_length:\n",
410
- " completeness_score = (char_count / min_length) * 100\n",
411
- " elif char_count > 2000:\n",
412
- " completeness_score = 100 - ((char_count - 2000) / 2000 * 20) # Penalty for verbosity\n",
413
- " else:\n",
414
- " completeness_score = 100\n",
415
- " \n",
416
- " return {\n",
417
- " 'Completeness_Score': round(max(0, completeness_score), 2),\n",
418
- " 'Word_Count': word_count,\n",
419
- " 'Char_Count': char_count\n",
420
- " }\n",
421
- "\n",
422
- "print(\"\u2705 Evaluation functions ready\")"
423
- ]
424
- },
425
- {
426
- "cell_type": "markdown",
427
- "metadata": {},
428
- "source": [
429
- "## 6. Run Benchmark on All Models"
430
- ]
431
- },
432
- {
433
- "cell_type": "code",
434
- "execution_count": null,
435
- "metadata": {},
436
- "outputs": [],
437
- "source": [
438
- "# Select models to test (you can comment out models to skip)\n",
439
- "MODELS_TO_TEST = [\n",
440
- " 'Llama-4-Maverick-17B',\n",
441
- " 'DeepSeek-R1',\n",
442
- " 'GPT-4.1',\n",
443
- " 'GPT-5-mini',\n",
444
- " 'Claude-Sonnet-4.5',\n",
445
- " # 'Claude-Opus-4.1', # Uncomment to test\n",
446
- " # 'Phi-4-multimodal', # Uncomment to test\n",
447
- " # 'GPT-OSS-120B', # Uncomment to test\n",
448
- "]\n",
449
- "\n",
450
- "print(f\"Testing {len(MODELS_TO_TEST)} models on {len(questions)} questions...\\n\")\n",
451
- "print(\"This may take several minutes...\\n\")"
452
- ]
453
- },
454
- {
455
- "cell_type": "code",
456
- "execution_count": null,
457
- "metadata": {},
458
- "outputs": [],
459
- "source": [
460
- "# Run benchmark\n",
461
- "results = []\n",
462
- "\n",
463
- "for model_name in MODELS_TO_TEST:\n",
464
- " print(f\"\\n{'='*80}\")\n",
465
- " print(f\"Testing: {model_name}\")\n",
466
- " print(f\"{'='*80}\")\n",
467
- " \n",
468
- " model_results = []\n",
469
- " \n",
470
- " for example_key, messages in questions.items():\n",
471
- " # Get the last user message (the actual question)\n",
472
- " user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
473
- " query = user_msg['content']\n",
474
- " \n",
475
- " print(f\"\\n Question {example_key}: {query[:80]}...\")\n",
476
- " \n",
477
- " # Retrieve documents\n",
478
- " documents = retrieve_documents(query, top_k=3)\n",
479
- " \n",
480
- " # Generate answer\n",
481
- " answer, response_time = generate_answer(model_name, query, documents)\n",
482
- " \n",
483
- " if answer.startswith('ERROR'):\n",
484
- " print(f\" \u274c Failed: {answer}\")\n",
485
- " continue\n",
486
- " \n",
487
- " print(f\" \u2705 Response time: {response_time:.2f}s\")\n",
488
- " \n",
489
- " # Get expected answer\n",
490
- " expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
491
- " \n",
492
- " # Calculate metrics\n",
493
- " similarity_metrics = calculate_answer_similarity(expected, answer) if expected else {'CER': 0, 'WER': 0, 'Similarity': 0}\n",
494
- " citation_metrics = check_citations(answer, documents)\n",
495
- " completeness_metrics = evaluate_completeness(answer)\n",
496
- " \n",
497
- " # Store result\n",
498
- " result = {\n",
499
- " 'Model': model_name,\n",
500
- " 'Question': example_key,\n",
501
- " 'Query': query[:100],\n",
502
- " 'Answer': answer[:200] + '...',\n",
503
- " 'Response_Time': round(response_time, 2),\n",
504
- " **similarity_metrics,\n",
505
- " **citation_metrics,\n",
506
- " **completeness_metrics,\n",
507
- " 'Open_Source': MODELS[model_name]['open_source'],\n",
508
- " 'Architecture_Score': MODELS[model_name]['architecture_score']\n",
509
- " }\n",
510
- " \n",
511
- " model_results.append(result)\n",
512
- " results.append(result)\n",
513
- " \n",
514
- " # Show summary for this model\n",
515
- " if model_results:\n",
516
- " avg_response_time = sum(r['Response_Time'] for r in model_results) / len(model_results)\n",
517
- " avg_similarity = sum(r['Similarity'] for r in model_results) / len(model_results)\n",
518
- " avg_citation = sum(r['Citation_Score'] for r in model_results) / len(model_results)\n",
519
- " avg_completeness = sum(r['Completeness_Score'] for r in model_results) / len(model_results)\n",
520
- " \n",
521
- " print(f\"\\n \ud83d\udcca {model_name} Summary:\")\n",
522
- " print(f\" Avg Response Time: {avg_response_time:.2f}s\")\n",
523
- " print(f\" Avg Similarity: {avg_similarity:.1f}%\")\n",
524
- " print(f\" Avg Citation Score: {avg_citation:.1f}%\")\n",
525
- " print(f\" Avg Completeness: {avg_completeness:.1f}%\")\n",
526
- "\n",
527
- "print(f\"\\n{'='*80}\")\n",
528
- "print(\"\u2705 Benchmarking complete!\")\n",
529
- "print(f\"{'='*80}\")"
530
- ]
531
- },
532
- {
533
- "cell_type": "markdown",
534
- "metadata": {},
535
- "source": [
536
- "## 7. Aggregate Results and Rankings"
537
- ]
538
- },
539
- {
540
- "cell_type": "code",
541
- "execution_count": null,
542
- "metadata": {},
543
- "outputs": [],
544
- "source": [
545
- "# Create DataFrame\n",
546
- "df = pd.DataFrame(results)\n",
547
- "\n",
548
- "# Calculate aggregate scores per model\n",
549
- "model_summary = df.groupby('Model').agg({\n",
550
- " 'Response_Time': 'mean',\n",
551
- " 'Similarity': 'mean',\n",
552
- " 'Citation_Score': 'mean',\n",
553
- " 'Completeness_Score': 'mean',\n",
554
- " 'CER': 'mean',\n",
555
- " 'WER': 'mean',\n",
556
- " 'Open_Source': 'first',\n",
557
- " 'Architecture_Score': 'first'\n",
558
- "}).round(2)\n",
559
- "\n",
560
- "# Calculate overall quality score (weighted average)\n",
561
- "model_summary['Quality_Score'] = (\n",
562
- " model_summary['Similarity'] * 0.35 + # 35% answer accuracy\n",
563
- " model_summary['Citation_Score'] * 0.35 + # 35% citation quality\n",
564
- " model_summary['Completeness_Score'] * 0.30 # 30% completeness\n",
565
- ").round(2)\n",
566
- "\n",
567
- "# Sort by Quality Score\n",
568
- "model_summary = model_summary.sort_values('Quality_Score', ascending=False)\n",
569
- "\n",
570
- "# Display summary table\n",
571
- "print(\"\\n\" + \"=\"*100)\n",
572
- "print(\"\ud83d\udcca LLM BENCHMARKING RESULTS - MODEL SUMMARY\")\n",
573
- "print(\"=\"*100)\n",
574
- "print(model_summary.to_string())\n",
575
- "print(\"=\"*100)"
576
- ]
577
- },
578
- {
579
- "cell_type": "markdown",
580
- "metadata": {},
581
- "source": [
582
- "# Create comprehensive visualization\n",
583
- "import os\n",
584
- "from pathlib import Path\n",
585
- "\n",
586
- "# Create output directory - using dynamic path\n",
587
- "output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
588
- "output_dir.mkdir(parents=True, exist_ok=True)\n",
589
- "\n",
590
- "fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
591
- "\n",
592
- "models = df['Model'].tolist()\n",
593
- "colors = sns.color_palette('viridis', len(models))\n",
594
- "\n",
595
- "# 1. CSR - Character Success Rate (MAIN METRIC)\n",
596
- "ax1 = axes[0, 0]\n",
597
- "bars1 = ax1.barh(models, df['CSR'], color=colors)\n",
598
- "ax1.set_xlabel('CSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
599
- "ax1.set_title('Character Success Rate (CSR)\\n\ud83c\udfc6 HACKATHON PRIMARY METRIC', \n",
600
- " fontsize=14, fontweight='bold')\n",
601
- "ax1.set_xlim(0, 100)\n",
602
- "for i, (model, csr) in enumerate(zip(models, df['CSR'])):\n",
603
- " ax1.text(csr + 1, i, f'{csr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
604
- "ax1.axvline(x=90, color='green', linestyle='--', alpha=0.3, label='Excellent (>90%)')\n",
605
- "ax1.axvline(x=80, color='orange', linestyle='--', alpha=0.3, label='Good (>80%)')\n",
606
- "ax1.legend(fontsize=9)\n",
607
- "\n",
608
- "# 2. WSR - Word Success Rate\n",
609
- "ax2 = axes[0, 1]\n",
610
- "bars2 = ax2.barh(models, df['WSR'], color=colors)\n",
611
- "ax2.set_xlabel('WSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
612
- "ax2.set_title('Word Success Rate (WSR)', fontsize=14, fontweight='bold')\n",
613
- "ax2.set_xlim(0, 100)\n",
614
- "for i, (model, wsr) in enumerate(zip(models, df['WSR'])):\n",
615
- " ax2.text(wsr + 1, i, f'{wsr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
616
- "\n",
617
- "# 3. Response Time\n",
618
- "ax3 = axes[1, 0]\n",
619
- "bars3 = ax3.barh(models, df['Response_Time'], color=colors)\n",
620
- "ax3.set_xlabel('Total Time (seconds) - Lower is Better', fontsize=12, fontweight='bold')\n",
621
- "ax3.set_title('Processing Speed', fontsize=14, fontweight='bold')\n",
622
- "for i, (model, time_val) in enumerate(zip(models, df['Response_Time'])):\n",
623
- " ax3.text(time_val + 0.5, i, f'{time_val:.1f}s', va='center', fontsize=11)\n",
624
- "\n",
625
- "# 4. Error Rates Comparison\n",
626
- "ax4 = axes[1, 1]\n",
627
- "x = range(len(models))\n",
628
- "width = 0.35\n",
629
- "ax4.bar([i - width/2 for i in x], df['CER'], width, label='CER', color='coral', alpha=0.8)\n",
630
- "ax4.bar([i + width/2 for i in x], df['WER'], width, label='WER', color='skyblue', alpha=0.8)\n",
631
- "ax4.set_ylabel('Error Rate (%) - Lower is Better', fontsize=12, fontweight='bold')\n",
632
- "ax4.set_title('Error Rates', fontsize=14, fontweight='bold')\n",
633
- "ax4.set_xticks(x)\n",
634
- "ax4.set_xticklabels(models, rotation=45, ha='right')\n",
635
- "ax4.legend(fontsize=11)\n",
636
- "ax4.grid(axis='y', alpha=0.3)\n",
637
- "\n",
638
- "plt.tight_layout()\n",
639
- "plt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\n",
640
- "plt.show()\n",
641
- "\n",
642
- "print(f\"\\n\u2705 Visualization saved to '{output_dir}/results.png'\")"
643
- ]
644
- },
645
- {
646
- "cell_type": "code",
647
- "execution_count": null,
648
- "metadata": {},
649
- "outputs": [],
650
- "source": [
651
- "# Create rankings table\n",
652
- "rankings = model_summary[[\n",
653
- " 'Quality_Score', 'Similarity', 'Citation_Score', 'Completeness_Score', \n",
654
- " 'Response_Time', 'Open_Source', 'Architecture_Score'\n",
655
- "]].copy()\n",
656
- "\n",
657
- "rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
658
- "\n",
659
- "print(\"\\n\" + \"=\"*100)\n",
660
- "print(\"\ud83c\udfc6 FINAL RANKINGS\")\n",
661
- "print(\"=\"*100)\n",
662
- "print(rankings.to_string())\n",
663
- "print(\"=\"*100)\n",
664
- "\n",
665
- "# Winner analysis\n",
666
- "best_overall = rankings.index[0]\n",
667
- "best_open_source = rankings[rankings['Open_Source'] == True].index[0] if any(rankings['Open_Source']) else None\n",
668
- "fastest = model_summary['Response_Time'].idxmin()\n",
669
- "\n",
670
- "print(\"\\n\" + \"=\"*100)\n",
671
- "print(\"\ud83d\udca1 RECOMMENDATIONS FOR HACKATHON\")\n",
672
- "print(\"=\"*100)\n",
673
- "\n",
674
- "print(f\"\\n\ud83e\udd47 Best Overall Quality: {best_overall}\")\n",
675
- "print(f\" Quality Score: {model_summary.loc[best_overall, 'Quality_Score']:.1f}%\")\n",
676
- "print(f\" Similarity: {model_summary.loc[best_overall, 'Similarity']:.1f}%\")\n",
677
- "print(f\" Citation Score: {model_summary.loc[best_overall, 'Citation_Score']:.1f}%\")\n",
678
- "print(f\" Response Time: {model_summary.loc[best_overall, 'Response_Time']:.2f}s\")\n",
679
- "print(f\" Open Source: {model_summary.loc[best_overall, 'Open_Source']}\")\n",
680
- "print(f\" Architecture Score: {model_summary.loc[best_overall, 'Architecture_Score']}\")\n",
681
- "\n",
682
- "if best_open_source:\n",
683
- " print(f\"\\n\ud83d\udd13 Best Open-Source Model: {best_open_source}\")\n",
684
- " print(f\" Quality Score: {model_summary.loc[best_open_source, 'Quality_Score']:.1f}%\")\n",
685
- " print(f\" Architecture Score: {model_summary.loc[best_open_source, 'Architecture_Score']} (Better for hackathon!)\")\n",
686
- " print(f\" Response Time: {model_summary.loc[best_open_source, 'Response_Time']:.2f}s\")\n",
687
- "\n",
688
- "print(f\"\\n\u26a1 Fastest Model: {fastest}\")\n",
689
- "print(f\" Response Time: {model_summary.loc[fastest, 'Response_Time']:.2f}s\")\n",
690
- "print(f\" Quality Score: {model_summary.loc[fastest, 'Quality_Score']:.1f}%\")\n",
691
- "\n",
692
- "print(\"\\n\" + \"=\"*100)\n",
693
- "print(\"\ud83d\udcdd FINAL RECOMMENDATION\")\n",
694
- "print(\"=\"*100)\n",
695
- "print(\"\\nScoring Breakdown:\")\n",
696
- "print(\" - LLM Quality: 30% of total hackathon score\")\n",
697
- "print(\" - Architecture: 20% of total hackathon score (open-source preferred!)\")\n",
698
- "print(\"\\nBest Choice:\")\n",
699
- "if best_open_source and model_summary.loc[best_open_source, 'Quality_Score'] >= model_summary.loc[best_overall, 'Quality_Score'] * 0.9:\n",
700
- " print(f\" \u2705 {best_open_source} - Best balance of quality and architecture score\")\n",
701
- " print(f\" Only {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality drop for higher architecture score!\")\n",
702
- "else:\n",
703
- " print(f\" \u2705 {best_overall} - Highest quality, use if quality gap is significant\")\n",
704
- " if best_open_source:\n",
705
- " print(f\" \u26a0\ufe0f Consider {best_open_source} for higher architecture score (trade-off: {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality)\")\n",
706
- "\n",
707
- "print(\"=\"*100)"
708
- ]
709
- },
710
- {
711
- "cell_type": "markdown",
712
- "metadata": {},
713
- "source": [
714
- "# Save results\n",
715
- "from pathlib import Path\n",
716
- "\n",
717
- "# Using dynamic path\n",
718
- "output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
719
- "output_dir.mkdir(parents=True, exist_ok=True)\n",
720
- "\n",
721
- "df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
722
- "model_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n",
723
- "rankings.to_csv(output_dir / 'rankings.csv', index=False, encoding='utf-8')\n",
724
- "\n",
725
- "print(\"\\n\u2705 Results exported to output/llm_benchmark/:\")\n",
726
- "print(\" - detailed_results.csv (all questions and answers)\")\n",
727
- "print(\" - summary.csv (model averages)\")\n",
728
- "print(\" - rankings.csv (final rankings)\")\n",
729
- "print(\" - results.png (visualizations)\")"
730
- ]
731
- },
732
- {
733
- "cell_type": "markdown",
734
- "metadata": {},
735
- "source": [
736
- "## 11. Sample Answer Comparison"
737
- ]
738
- }
739
- ],
740
- "metadata": {
741
- "kernelspec": {
742
- "display_name": "venv",
743
- "language": "python",
744
- "name": "python3"
745
- },
746
- "language_info": {
747
- "codemirror_mode": {
748
- "name": "ipython",
749
- "version": 3
750
- },
751
- "file_extension": ".py",
752
- "mimetype": "text/x-python",
753
- "name": "python",
754
- "nbconvert_exporter": "python",
755
- "pygments_lexer": "ipython3",
756
- "version": "3.10.12"
757
- }
758
- },
759
- "nbformat": 4,
760
- "nbformat_minor": 4
761
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/rag_optimization_benchmark.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/rag_optimization_benchmark.ipynb.backup DELETED
@@ -1,1072 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "# RAG Pipeline Optimization Benchmark\n",
8
- "\n",
9
- "**Comprehensive testing of ALL RAG components to maximize LLM Judge score**\n",
10
- "\n",
11
- "## What We're Testing:\n",
12
- "\n",
13
- "### 1. Embedding Models (Vector Representations)\n",
14
- "- `BAAI/bge-large-en-v1.5` (Current - 1024 dim, best quality)\n",
15
- "- `BAAI/bge-base-en-v1.5` (768 dim, faster)\n",
16
- "- `intfloat/multilingual-e5-large` (1024 dim, multi-language)\n",
17
- "- `sentence-transformers/paraphrase-multilingual-mpnet-base-v2` (768 dim, multilingual)\n",
18
- "- `sentence-transformers/all-MiniLM-L6-v2` (384 dim, very fast)\n",
19
- "\n",
20
- "### 2. Retrieval Strategies\n",
21
- "- **Top-K**: Test 1, 3, 5, 10 documents\n",
22
- "- **MMR** (Maximal Marginal Relevance): Diversity vs relevance trade-off\n",
23
- "- **Similarity Threshold**: Filter low-relevance docs\n",
24
- "- **Reranking**: Use cross-encoder to rerank results\n",
25
- "\n",
26
- "### 3. Chunking Strategies (Already in Vector DB, but we'll compare)\n",
27
- "- Chunk size: 256, 512, 600 (current), 1000 tokens\n",
28
- "- Overlap: 0, 50, 100 (current), 200 chars\n",
29
- "\n",
30
- "### 4. LLM Models\n",
31
- "- Llama-4-Maverick-17B (open-source)\n",
32
- "- DeepSeek-R1 (reasoning)\n",
33
- "- GPT-4.1, GPT-5, GPT-5-mini\n",
34
- "- Claude-Sonnet-4.5\n",
35
- "\n",
36
- "### 5. Prompting Techniques\n",
37
- "- **Baseline**: Simple context + question\n",
38
- "- **Citation-focused**: Emphasize source references\n",
39
- "- **Step-by-step**: Chain-of-thought reasoning\n",
40
- "- **Few-shot**: Include example Q&A\n",
41
- "\n",
42
- "## LLM Judge Evaluation Criteria:\n",
43
- "- **Accuracy** (35%): Answer correctness\n",
44
- "- **Relevance** (35%): Citation quality and relevance\n",
45
- "- **Completeness** (30%): Thorough answers"
46
- ]
47
- },
48
- {
49
- "cell_type": "code",
50
- "execution_count": 1,
51
- "metadata": {},
52
- "outputs": [],
53
- "source": [
54
- "# !pip install openai pinecone-client sentence-transformers rank-bm25 python-dotenv pandas matplotlib seaborn jiwer"
55
- ]
56
- },
57
- {
58
- "cell_type": "code",
59
- "execution_count": 2,
60
- "metadata": {},
61
- "outputs": [
62
- {
63
- "name": "stderr",
64
- "output_type": "stream",
65
- "text": [
66
- "/Users/ismatsamadov/SOCAR_Hackathon/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
67
- " from .autonotebook import tqdm as notebook_tqdm\n"
68
- ]
69
- },
70
- {
71
- "name": "stdout",
72
- "output_type": "stream",
73
- "text": [
74
- "✅ Libraries loaded\n"
75
- ]
76
- }
77
- ],
78
- "source": [
79
- "import os\n",
80
- "import json\n",
81
- "import time\n",
82
- "import re\n",
83
- "from typing import Dict, List, Tuple, Any\n",
84
- "from collections import defaultdict\n",
85
- "from dotenv import load_dotenv\n",
86
- "\n",
87
- "import pandas as pd\n",
88
- "import matplotlib.pyplot as plt\n",
89
- "import seaborn as sns\n",
90
- "from openai import AzureOpenAI\n",
91
- "from pinecone import Pinecone\n",
92
- "from sentence_transformers import SentenceTransformer, CrossEncoder\n",
93
- "from jiwer import wer, cer\n",
94
- "import numpy as np\n",
95
- "\n",
96
- "load_dotenv()\n",
97
- "\n",
98
- "sns.set_style('whitegrid')\n",
99
- "plt.rcParams['figure.figsize'] = (16, 10)\n",
100
- "\n",
101
- "print(\"✅ Libraries loaded\")"
102
- ]
103
- },
104
- {
105
- "cell_type": "code",
106
- "execution_count": 3,
107
- "metadata": {},
108
- "outputs": [
109
- {
110
- "name": "stdout",
111
- "output_type": "stream",
112
- "text": [
113
- "✅ Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
114
- "✅ Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n",
115
- "✅ Output directory: /Users/ismatsamadov/SOCAR_Hackathon/output\n"
116
- ]
117
- }
118
- ],
119
- "source": [
120
- "# Auto-detect project root (works from any directory)\n",
121
- "import os\n",
122
- "from pathlib import Path\n",
123
- "\n",
124
- "if Path('data').exists() and Path('docs').exists():\n",
125
- " # Already in project root\n",
126
- " PROJECT_ROOT = Path.cwd()\n",
127
- "elif Path('../data').exists() and Path('../docs').exists():\n",
128
- " # In notebooks/ subdirectory\n",
129
- " PROJECT_ROOT = Path.cwd().parent\n",
130
- "else:\n",
131
- " # Fallback: try to find project root\n",
132
- " current = Path.cwd()\n",
133
- " while current != current.parent:\n",
134
- " if (current / 'data').exists() and (current / 'docs').exists():\n",
135
- " PROJECT_ROOT = current\n",
136
- " break\n",
137
- " current = current.parent\n",
138
- " else:\n",
139
- " PROJECT_ROOT = Path.cwd()\n",
140
- "\n",
141
- "# Define all paths relative to project root\n",
142
- "DATA_DIR = PROJECT_ROOT / 'data'\n",
143
- "DOCS_DIR = PROJECT_ROOT / 'docs'\n",
144
- "OUTPUT_DIR = PROJECT_ROOT / 'output'\n",
145
- "\n",
146
- "print(f\"✅ Project root: {PROJECT_ROOT}\")\n",
147
- "print(f\"✅ Docs directory: {DOCS_DIR}\")\n",
148
- "print(f\"✅ Output directory: {OUTPUT_DIR}\")"
149
- ]
150
- },
151
- {
152
- "cell_type": "code",
153
- "execution_count": 4,
154
- "metadata": {},
155
- "outputs": [
156
- {
157
- "name": "stdout",
158
- "output_type": "stream",
159
- "text": [
160
- "✅ Loaded 5 test questions\n",
161
- " - Example1\n",
162
- " - Example2\n",
163
- " - Example3\n",
164
- " - Example4\n",
165
- " - Example5\n"
166
- ]
167
- }
168
- ],
169
- "source": [
170
- "# Load test cases - using dynamic paths\n",
171
- "with open(DOCS_DIR / 'sample_questions.json', 'r', encoding='utf-8') as f:\n",
172
- " questions = json.load(f)\n",
173
- "\n",
174
- "with open(DOCS_DIR / 'sample_answers.json', 'r', encoding='utf-8') as f:\n",
175
- " expected_answers = json.load(f)\n",
176
- "\n",
177
- "print(f\"✅ Loaded {len(questions)} test questions\")\n",
178
- "for key in questions.keys():\n",
179
- " print(f\" - {key}\")"
180
- ]
181
- },
182
- {
183
- "cell_type": "markdown",
184
- "metadata": {},
185
- "source": [
186
- "## 2. Initialize Vector Database"
187
- ]
188
- },
189
- {
190
- "cell_type": "code",
191
- "execution_count": null,
192
- "metadata": {},
193
- "outputs": [],
194
- "source": [
195
- "# Connect to Pinecone\n",
196
- "pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
197
- "index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
198
- "\n",
199
- "stats = index.describe_index_stats()\n",
200
- "print(f\"✅ Vector DB connected\")\n",
201
- "print(f\" Total vectors: {stats['total_vector_count']}\")\n",
202
- "print(f\" Dimensions: {stats['dimension']}\")"
203
- ]
204
- },
205
- {
206
- "cell_type": "markdown",
207
- "metadata": {},
208
- "source": [
209
- "## 3. Embedding Models Configuration"
210
- ]
211
- },
212
- {
213
- "cell_type": "code",
214
- "execution_count": null,
215
- "metadata": {},
216
- "outputs": [],
217
- "source": [
218
- "EMBEDDING_MODELS = {\n",
219
- " 'bge-large-en': {\n",
220
- " 'name': 'BAAI/bge-large-en-v1.5',\n",
221
- " 'dimensions': 1024,\n",
222
- " 'notes': 'Current model - best quality'\n",
223
- " },\n",
224
- " 'bge-base-en': {\n",
225
- " 'name': 'BAAI/bge-base-en-v1.5',\n",
226
- " 'dimensions': 768,\n",
227
- " 'notes': 'Faster, slightly lower quality'\n",
228
- " },\n",
229
- " 'multilingual-e5-large': {\n",
230
- " 'name': 'intfloat/multilingual-e5-large',\n",
231
- " 'dimensions': 1024,\n",
232
- " 'notes': 'Multi-language optimized'\n",
233
- " },\n",
234
- " 'paraphrase-multilingual': {\n",
235
- " 'name': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',\n",
236
- " 'dimensions': 768,\n",
237
- " 'notes': 'Good for Azerbaijani/Russian'\n",
238
- " },\n",
239
- " 'all-MiniLM-L6': {\n",
240
- " 'name': 'sentence-transformers/all-MiniLM-L6-v2',\n",
241
- " 'dimensions': 384,\n",
242
- " 'notes': 'Very fast, lower quality'\n",
243
- " }\n",
244
- "}\n",
245
- "\n",
246
- "# Load embedding models (only test 1024-dim models for existing Pinecone index)\n",
247
- "EMBEDDING_MODELS_TO_TEST = [\n",
248
- " 'bge-large-en', # Current\n",
249
- " 'multilingual-e5-large', # Alternative with same dims\n",
250
- "]\n",
251
- "\n",
252
- "embedding_cache = {}\n",
253
- "\n",
254
- "for model_key in EMBEDDING_MODELS_TO_TEST:\n",
255
- " model_name = EMBEDDING_MODELS[model_key]['name']\n",
256
- " print(f\"Loading {model_key}...\")\n",
257
- " embedding_cache[model_key] = SentenceTransformer(model_name)\n",
258
- " print(f\" ✅ {model_name}\")\n",
259
- "\n",
260
- "print(f\"\\n✅ Loaded {len(embedding_cache)} embedding models\")"
261
- ]
262
- },
263
- {
264
- "cell_type": "markdown",
265
- "metadata": {},
266
- "source": [
267
- "## 4. Retrieval Strategies"
268
- ]
269
- },
270
- {
271
- "cell_type": "code",
272
- "execution_count": null,
273
- "metadata": {},
274
- "outputs": [],
275
- "source": [
276
- "def retrieve_vanilla(query: str, embed_model: SentenceTransformer, top_k: int = 3) -> List[Dict]:\n",
277
- " \"\"\"\n",
278
- " Vanilla retrieval: Simple top-k vector search.\n",
279
- " \"\"\"\n",
280
- " query_embedding = embed_model.encode(query).tolist()\n",
281
- " results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)\n",
282
- " \n",
283
- " documents = []\n",
284
- " for match in results['matches']:\n",
285
- " documents.append({\n",
286
- " 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
287
- " 'page_number': match['metadata'].get('page_number', 0),\n",
288
- " 'content': match['metadata'].get('text', ''),\n",
289
- " 'score': match.get('score', 0.0)\n",
290
- " })\n",
291
- " \n",
292
- " return documents\n",
293
- "\n",
294
- "\n",
295
- "def retrieve_with_threshold(query: str, embed_model: SentenceTransformer, \n",
296
- " top_k: int = 10, threshold: float = 0.7) -> List[Dict]:\n",
297
- " \"\"\"\n",
298
- " Retrieve with similarity threshold filtering.\n",
299
- " \"\"\"\n",
300
- " docs = retrieve_vanilla(query, embed_model, top_k=top_k)\n",
301
- " return [doc for doc in docs if doc['score'] >= threshold]\n",
302
- "\n",
303
- "\n",
304
- "def retrieve_with_mmr(query: str, embed_model: SentenceTransformer, \n",
305
- " top_k: int = 3, lambda_param: float = 0.5, fetch_k: int = 20) -> List[Dict]:\n",
306
- " \"\"\"\n",
307
- " MMR (Maximal Marginal Relevance) for diversity.\n",
308
- " lambda=1 → pure relevance, lambda=0 → pure diversity\n",
309
- " \"\"\"\n",
310
- " # Fetch more candidates\n",
311
- " candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n",
312
- " \n",
313
- " if len(candidates) <= top_k:\n",
314
- " return candidates[:top_k]\n",
315
- " \n",
316
- " # Query embedding\n",
317
- " query_emb = embed_model.encode(query)\n",
318
- " \n",
319
- " # Get embeddings for candidates\n",
320
- " candidate_texts = [doc['content'] for doc in candidates]\n",
321
- " candidate_embs = embed_model.encode(candidate_texts)\n",
322
- " \n",
323
- " # MMR algorithm\n",
324
- " selected = []\n",
325
- " selected_embs = []\n",
326
- " \n",
327
- " for _ in range(min(top_k, len(candidates))):\n",
328
- " mmr_scores = []\n",
329
- " \n",
330
- " for i, (doc, emb) in enumerate(zip(candidates, candidate_embs)):\n",
331
- " if i in [candidates.index(s) for s in selected]:\n",
332
- " mmr_scores.append(-float('inf'))\n",
333
- " continue\n",
334
- " \n",
335
- " # Relevance to query\n",
336
- " relevance = np.dot(query_emb, emb) / (np.linalg.norm(query_emb) * np.linalg.norm(emb))\n",
337
- " \n",
338
- " # Max similarity to already selected\n",
339
- " if selected_embs:\n",
340
- " similarities = [np.dot(emb, s_emb) / (np.linalg.norm(emb) * np.linalg.norm(s_emb)) \n",
341
- " for s_emb in selected_embs]\n",
342
- " max_sim = max(similarities)\n",
343
- " else:\n",
344
- " max_sim = 0\n",
345
- " \n",
346
- " # MMR score\n",
347
- " mmr = lambda_param * relevance - (1 - lambda_param) * max_sim\n",
348
- " mmr_scores.append(mmr)\n",
349
- " \n",
350
- " # Select best MMR score\n",
351
- " best_idx = np.argmax(mmr_scores)\n",
352
- " selected.append(candidates[best_idx])\n",
353
- " selected_embs.append(candidate_embs[best_idx])\n",
354
- " \n",
355
- " return selected\n",
356
- "\n",
357
- "\n",
358
- "def retrieve_with_reranking(query: str, embed_model: SentenceTransformer, \n",
359
- " top_k: int = 3, fetch_k: int = 20) -> List[Dict]:\n",
360
- " \"\"\"\n",
361
- " Two-stage: retrieve with embeddings, rerank with cross-encoder.\n",
362
- " \"\"\"\n",
363
- " # Stage 1: Retrieve candidates\n",
364
- " candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n",
365
- " \n",
366
- " if len(candidates) <= top_k:\n",
367
- " return candidates[:top_k]\n",
368
- " \n",
369
- " # Stage 2: Rerank with cross-encoder\n",
370
- " reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n",
371
- " \n",
372
- " pairs = [[query, doc['content']] for doc in candidates]\n",
373
- " scores = reranker.predict(pairs)\n",
374
- " \n",
375
- " # Sort by reranker score\n",
376
- " scored_docs = [(doc, score) for doc, score in zip(candidates, scores)]\n",
377
- " scored_docs.sort(key=lambda x: x[1], reverse=True)\n",
378
- " \n",
379
- " # Update scores and return top-k\n",
380
- " reranked = []\n",
381
- " for doc, score in scored_docs[:top_k]:\n",
382
- " doc['rerank_score'] = float(score)\n",
383
- " reranked.append(doc)\n",
384
- " \n",
385
- " return reranked\n",
386
- "\n",
387
- "\n",
388
- "RETRIEVAL_STRATEGIES = {\n",
389
- " 'vanilla_k3': {'func': retrieve_vanilla, 'params': {'top_k': 3}, 'notes': 'Current setup'},\n",
390
- " 'vanilla_k5': {'func': retrieve_vanilla, 'params': {'top_k': 5}, 'notes': 'More context'},\n",
391
- " 'vanilla_k10': {'func': retrieve_vanilla, 'params': {'top_k': 10}, 'notes': 'Maximum context'},\n",
392
- " 'threshold_0.7': {'func': retrieve_with_threshold, 'params': {'top_k': 10, 'threshold': 0.7}, 'notes': 'Quality filter'},\n",
393
- " 'mmr_balanced': {'func': retrieve_with_mmr, 'params': {'top_k': 3, 'lambda_param': 0.5}, 'notes': 'Balance diversity'},\n",
394
- " 'mmr_diverse': {'func': retrieve_with_mmr, 'params': {'top_k': 3, 'lambda_param': 0.3}, 'notes': 'More diversity'},\n",
395
- " 'reranked_k3': {'func': retrieve_with_reranking, 'params': {'top_k': 3, 'fetch_k': 20}, 'notes': 'Two-stage rerank'},\n",
396
- "}\n",
397
- "\n",
398
- "print(f\"✅ Configured {len(RETRIEVAL_STRATEGIES)} retrieval strategies\")"
399
- ]
400
- },
401
- {
402
- "cell_type": "markdown",
403
- "metadata": {},
404
- "source": [
405
- "## 5. LLM Models and Prompting Strategies"
406
- ]
407
- },
408
- {
409
- "cell_type": "code",
410
- "execution_count": null,
411
- "metadata": {},
412
- "outputs": [],
413
- "source": [
414
- "# Initialize Azure OpenAI\n",
415
- "azure_client = AzureOpenAI(\n",
416
- " api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
417
- " api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
418
- " azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
419
- ")\n",
420
- "\n",
421
- "LLM_MODELS = {\n",
422
- " 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
423
- " 'DeepSeek-R1': 'DeepSeek-R1',\n",
424
- " 'GPT-4.1': 'gpt-4.1',\n",
425
- " 'GPT-5-mini': 'gpt-5-mini',\n",
426
- " 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n",
427
- "}\n",
428
- "\n",
429
- "# Prompting strategies\n",
430
- "PROMPTING_STRATEGIES = {\n",
431
- " 'baseline': \"\"\"\n",
432
- "Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə köməkçisiniz.\n",
433
- "\n",
434
- "Kontekst:\n",
435
- "{context}\n",
436
- "\n",
437
- "Sual: {query}\n",
438
- "\n",
439
- "Kontekstə əsaslanaraq cavab verin.\n",
440
- "\"\"\",\n",
441
- " \n",
442
- " 'citation_focused': \"\"\"\n",
443
- "Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.\n",
444
- "\n",
445
- "ÖNƏMLİ: Hər bir faktı mütləq mənbə ilə təsdiqləyin (PDF adı və səhifə nömrəsi).\n",
446
- "\n",
447
- "Kontekst:\n",
448
- "{context}\n",
449
- "\n",
450
- "Sual: {query}\n",
451
- "\n",
452
- "Cavab verərkən:\n",
453
- "1. Dəqiq faktlar yazın\n",
454
- "2. Hər faktı mənbə ilə göstərin: (PDF: fayl_adı.pdf, Səhifə: X)\n",
455
- "3. Kontekstdə olmayan məlumat əlavə etməyin\n",
456
- "\"\"\",\n",
457
- " \n",
458
- " 'step_by_step': \"\"\"\n",
459
- "Siz SOCAR-ın tarixi sənədlər üzrə analitik köməkçisisiniz.\n",
460
- "\n",
461
- "Kontekst:\n",
462
- "{context}\n",
463
- "\n",
464
- "Sual: {query}\n",
465
- "\n",
466
- "Addım-addım cavab verin:\n",
467
- "1. Əvvəlcə kontekstdən əlaqəli məlumatları müəyyənləşdirin\n",
468
- "2. Bu məlumatları təhlil edin\n",
469
- "3. Nəticəni mənbələr ilə birlikdə təqdim edin\n",
470
- "\"\"\",\n",
471
- " \n",
472
- " 'few_shot': \"\"\"\n",
473
- "Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.\n",
474
- "\n",
475
- "Nümunə:\n",
476
- "Sual: \"Palçıq vulkanlarının təsir radiusu nə qədərdir?\"\n",
477
- "Cavab: \"Sahə müşahidələri və modelləşdirmə göstərir ki, palçıq vulkanlarının təsir radiusu təqribən 10 km-dir (PDF: document_06.pdf, Səhifə: 5).\"\n",
478
- "\n",
479
- "Kontekst:\n",
480
- "{context}\n",
481
- "\n",
482
- "Sual: {query}\n",
483
- "\n",
484
- "Yuxarıdakı nümunə kimi cavab verin - dəqiq, qısa, mənbə ilə.\n",
485
- "\"\"\"\n",
486
- "}\n",
487
- "\n",
488
- "print(f\"✅ Configured {len(LLM_MODELS)} LLM models\")\n",
489
- "print(f\"✅ Configured {len(PROMPTING_STRATEGIES)} prompting strategies\")"
490
- ]
491
- },
492
- {
493
- "cell_type": "code",
494
- "execution_count": null,
495
- "metadata": {},
496
- "outputs": [],
497
- "source": [
498
- "def generate_answer(llm_model: str, query: str, documents: List[Dict], \n",
499
- " prompt_strategy: str = 'baseline',\n",
500
- " temperature: float = 0.2) -> Tuple[str, float]:\n",
501
- " \"\"\"\n",
502
- " Generate answer using LLM with specified prompting strategy.\n",
503
- " \"\"\"\n",
504
- " # Build context\n",
505
- " context_parts = []\n",
506
- " for i, doc in enumerate(documents, 1):\n",
507
- " context_parts.append(\n",
508
- " f\"Sənəd {i} (Mənbə: {doc['pdf_name']}, Səhifə {doc['page_number']}):\\n{doc['content']}\"\n",
509
- " )\n",
510
- " context = \"\\n\\n\".join(context_parts)\n",
511
- " \n",
512
- " # Get prompt template\n",
513
- " prompt_template = PROMPTING_STRATEGIES[prompt_strategy]\n",
514
- " prompt = prompt_template.format(context=context, query=query)\n",
515
- " \n",
516
- " try:\n",
517
- " start_time = time.time()\n",
518
- " \n",
519
- " deployment = LLM_MODELS[llm_model]\n",
520
- " \n",
521
- " # GPT-5 models use max_completion_tokens, others use max_tokens\n",
522
- " if deployment.startswith('gpt-5'):\n",
523
- " response = azure_client.chat.completions.create(\n",
524
- " model=deployment,\n",
525
- " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
526
- " temperature=temperature,\n",
527
- " max_completion_tokens=1000\n",
528
- " )\n",
529
- " else:\n",
530
- " response = azure_client.chat.completions.create(\n",
531
- " model=deployment,\n",
532
- " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
533
- " temperature=temperature,\n",
534
- " max_tokens=1000\n",
535
- " )\n",
536
- " \n",
537
- " elapsed = time.time() - start_time\n",
538
- " answer = response.choices[0].message.content\n",
539
- " \n",
540
- " return answer, elapsed\n",
541
- " \n",
542
- " except Exception as e:\n",
543
- " return f\"ERROR: {str(e)}\", 0.0\n",
544
- "\n",
545
- "print(\"✅ LLM generation function ready\")"
546
- ]
547
- },
548
- {
549
- "cell_type": "markdown",
550
- "metadata": {},
551
- "source": [
552
- "## 6. Evaluation Metrics"
553
- ]
554
- },
555
- {
556
- "cell_type": "code",
557
- "execution_count": null,
558
- "metadata": {},
559
- "outputs": [],
560
- "source": [
561
- "def normalize_text(text: str) -> str:\n",
562
- " text = text.lower().strip()\n",
563
- " text = re.sub(r'\\s+', ' ', text)\n",
564
- " return text\n",
565
- "\n",
566
- "def calculate_answer_quality(reference: str, hypothesis: str) -> Dict[str, float]:\n",
567
- " \"\"\"Accuracy metrics.\"\"\"\n",
568
- " ref_norm = normalize_text(reference)\n",
569
- " hyp_norm = normalize_text(hypothesis)\n",
570
- " \n",
571
- " cer_score = cer(ref_norm, hyp_norm) * 100\n",
572
- " wer_score = wer(ref_norm, hyp_norm) * 100\n",
573
- " similarity = max(0, 100 - wer_score)\n",
574
- " \n",
575
- " return {\n",
576
- " 'Accuracy_Score': round(similarity, 2)\n",
577
- " }\n",
578
- "\n",
579
- "def evaluate_citation_quality(answer: str, documents: List[Dict]) -> Dict[str, float]:\n",
580
- " \"\"\"Relevance - citation quality.\"\"\"\n",
581
- " pdf_names = [doc['pdf_name'].replace('.pdf', '') for doc in documents]\n",
582
- " page_numbers = [str(doc['page_number']) for doc in documents]\n",
583
- " \n",
584
- " cited_pdfs = sum(1 for pdf in pdf_names if pdf in answer)\n",
585
- " cited_pages = sum(1 for page in page_numbers if page in answer)\n",
586
- " \n",
587
- " citation_keywords = ['mənbə', 'sənəd', 'səhifə', 'pdf', 'document', 'page']\n",
588
- " has_citation_format = any(kw in answer.lower() for kw in citation_keywords)\n",
589
- " \n",
590
- " citation_score = (\n",
591
- " (cited_pdfs / len(pdf_names) * 40) +\n",
592
- " (cited_pages / len(page_numbers) * 40) +\n",
593
- " (20 if has_citation_format else 0)\n",
594
- " )\n",
595
- " \n",
596
- " return {\n",
597
- " 'Citation_Score': round(citation_score, 2),\n",
598
- " 'Cited_PDFs': cited_pdfs,\n",
599
- " 'Cited_Pages': cited_pages\n",
600
- " }\n",
601
- "\n",
602
- "def evaluate_retrieval_quality(query: str, documents: List[Dict], expected_answer: str) -> Dict[str, float]:\n",
603
- " \"\"\"Measure if retrieved docs are relevant to answer.\"\"\"\n",
604
- " if not documents or not expected_answer:\n",
605
- " return {'Retrieval_Relevance': 0.0}\n",
606
- " \n",
607
- " # Simple heuristic: check if expected answer words appear in retrieved docs\n",
608
- " expected_words = set(normalize_text(expected_answer).split())\n",
609
- " retrieved_text = ' '.join([doc['content'] for doc in documents])\n",
610
- " retrieved_words = set(normalize_text(retrieved_text).split())\n",
611
- " \n",
612
- " overlap = len(expected_words & retrieved_words) / len(expected_words) if expected_words else 0\n",
613
- " \n",
614
- " return {\n",
615
- " 'Retrieval_Relevance': round(overlap * 100, 2)\n",
616
- " }\n",
617
- "\n",
618
- "def evaluate_completeness(answer: str) -> Dict[str, float]:\n",
619
- " \"\"\"Completeness metrics.\"\"\"\n",
620
- " word_count = len(answer.split())\n",
621
- " \n",
622
- " if word_count < 20:\n",
623
- " completeness = (word_count / 20) * 100\n",
624
- " elif word_count > 200:\n",
625
- " completeness = 100 - ((word_count - 200) / 200 * 20)\n",
626
- " else:\n",
627
- " completeness = 100\n",
628
- " \n",
629
- " return {\n",
630
- " 'Completeness_Score': round(max(0, completeness), 2),\n",
631
- " 'Word_Count': word_count\n",
632
- " }\n",
633
- "\n",
634
- "def calculate_llm_judge_score(accuracy: float, citation: float, completeness: float) -> float:\n",
635
- " \"\"\"Overall LLM Judge score (weighted).\"\"\"\n",
636
- " return round(\n",
637
- " accuracy * 0.35 +\n",
638
- " citation * 0.35 +\n",
639
- " completeness * 0.30,\n",
640
- " 2\n",
641
- " )\n",
642
- "\n",
643
- "print(\"✅ Evaluation metrics ready\")"
644
- ]
645
- },
646
- {
647
- "cell_type": "markdown",
648
- "metadata": {},
649
- "source": [
650
- "## 7. Run Comprehensive Benchmark"
651
- ]
652
- },
653
- {
654
- "cell_type": "code",
655
- "execution_count": null,
656
- "metadata": {},
657
- "outputs": [],
658
- "source": [
659
- "# Configuration: Select what to test\n",
660
- "CONFIGS_TO_TEST = [\n",
661
- " # Format: (embed_model, retrieval_strategy, llm_model, prompt_strategy)\n",
662
- " \n",
663
- " # Baseline (current setup)\n",
664
- " ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'baseline'),\n",
665
- " \n",
666
- " # Test different embedding models\n",
667
- " ('multilingual-e5-large', 'vanilla_k3', 'Llama-4-Maverick', 'baseline'),\n",
668
- " \n",
669
- " # Test different retrieval strategies\n",
670
- " ('bge-large-en', 'vanilla_k5', 'Llama-4-Maverick', 'baseline'),\n",
671
- " ('bge-large-en', 'mmr_balanced', 'Llama-4-Maverick', 'baseline'),\n",
672
- " ('bge-large-en', 'reranked_k3', 'Llama-4-Maverick', 'baseline'),\n",
673
- " \n",
674
- " # Test different LLM models\n",
675
- " ('bge-large-en', 'vanilla_k3', 'GPT-5-mini', 'baseline'),\n",
676
- " ('bge-large-en', 'vanilla_k3', 'Claude-Sonnet-4.5', 'baseline'),\n",
677
- " \n",
678
- " # Test different prompting strategies\n",
679
- " ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'citation_focused'),\n",
680
- " ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'few_shot'),\n",
681
- " \n",
682
- " # Best combinations\n",
683
- " ('bge-large-en', 'reranked_k3', 'GPT-5-mini', 'citation_focused'),\n",
684
- " ('bge-large-en', 'mmr_balanced', 'Claude-Sonnet-4.5', 'citation_focused'),\n",
685
- "]\n",
686
- "\n",
687
- "print(f\"Testing {len(CONFIGS_TO_TEST)} configurations on {len(questions)} questions\")\n",
688
- "print(f\"Total API calls: ~{len(CONFIGS_TO_TEST) * len(questions)}\")\n",
689
- "print(\"This will take 15-30 minutes...\\n\")"
690
- ]
691
- },
692
- {
693
- "cell_type": "code",
694
- "execution_count": null,
695
- "metadata": {},
696
- "outputs": [],
697
- "source": [
698
- "# Run benchmark\n",
699
- "results = []\n",
700
- "\n",
701
- "for config_idx, (embed_key, retrieval_key, llm_key, prompt_key) in enumerate(CONFIGS_TO_TEST, 1):\n",
702
- " config_name = f\"{embed_key}_{retrieval_key}_{llm_key}_{prompt_key}\"\n",
703
- " \n",
704
- " print(f\"\\n{'='*100}\")\n",
705
- " print(f\"Config {config_idx}/{len(CONFIGS_TO_TEST)}: {config_name}\")\n",
706
- " print(f\"{'='*100}\")\n",
707
- " \n",
708
- " # Get components\n",
709
- " embed_model = embedding_cache[embed_key]\n",
710
- " retrieval_func = RETRIEVAL_STRATEGIES[retrieval_key]['func']\n",
711
- " retrieval_params = RETRIEVAL_STRATEGIES[retrieval_key]['params']\n",
712
- " \n",
713
- " config_results = []\n",
714
- " \n",
715
- " for example_key, messages in questions.items():\n",
716
- " user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
717
- " query = user_msg['content']\n",
718
- " \n",
719
- " print(f\"\\n {example_key}: {query[:60]}...\")\n",
720
- " \n",
721
- " # Retrieve documents\n",
722
- " documents = retrieval_func(query, embed_model, **retrieval_params)\n",
723
- " print(f\" Retrieved {len(documents)} docs\")\n",
724
- " \n",
725
- " # Generate answer\n",
726
- " answer, response_time = generate_answer(llm_key, query, documents, prompt_key)\n",
727
- " \n",
728
- " if answer.startswith('ERROR'):\n",
729
- " print(f\" ❌ {answer}\")\n",
730
- " continue\n",
731
- " \n",
732
- " print(f\" ✅ Generated in {response_time:.2f}s\")\n",
733
- " \n",
734
- " # Evaluate\n",
735
- " expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
736
- " \n",
737
- " accuracy_metrics = calculate_answer_quality(expected, answer) if expected else {'Accuracy_Score': 0}\n",
738
- " citation_metrics = evaluate_citation_quality(answer, documents)\n",
739
- " retrieval_metrics = evaluate_retrieval_quality(query, documents, expected)\n",
740
- " completeness_metrics = evaluate_completeness(answer)\n",
741
- " \n",
742
- " # Calculate overall score\n",
743
- " llm_judge_score = calculate_llm_judge_score(\n",
744
- " accuracy_metrics['Accuracy_Score'],\n",
745
- " citation_metrics['Citation_Score'],\n",
746
- " completeness_metrics['Completeness_Score']\n",
747
- " )\n",
748
- " \n",
749
- " result = {\n",
750
- " 'Config': config_name,\n",
751
- " 'Embedding_Model': embed_key,\n",
752
- " 'Retrieval_Strategy': retrieval_key,\n",
753
- " 'LLM_Model': llm_key,\n",
754
- " 'Prompt_Strategy': prompt_key,\n",
755
- " 'Question': example_key,\n",
756
- " 'Query': query[:80],\n",
757
- " 'Num_Docs_Retrieved': len(documents),\n",
758
- " 'Response_Time': round(response_time, 2),\n",
759
- " 'LLM_Judge_Score': llm_judge_score,\n",
760
- " **accuracy_metrics,\n",
761
- " **citation_metrics,\n",
762
- " **retrieval_metrics,\n",
763
- " **completeness_metrics,\n",
764
- " 'Answer_Preview': answer[:150]\n",
765
- " }\n",
766
- " \n",
767
- " results.append(result)\n",
768
- " config_results.append(result)\n",
769
- " \n",
770
- " # Show config summary\n",
771
- " if config_results:\n",
772
- " avg_score = sum(r['LLM_Judge_Score'] for r in config_results) / len(config_results)\n",
773
- " avg_time = sum(r['Response_Time'] for r in config_results) / len(config_results)\n",
774
- " print(f\"\\n 📊 Config Summary:\")\n",
775
- " print(f\" Avg LLM Judge Score: {avg_score:.2f}%\")\n",
776
- " print(f\" Avg Response Time: {avg_time:.2f}s\")\n",
777
- "\n",
778
- "print(f\"\\n{'='*100}\")\n",
779
- "print(\"✅ Comprehensive benchmark complete!\")\n",
780
- "print(f\"{'='*100}\")"
781
- ]
782
- },
783
- {
784
- "cell_type": "markdown",
785
- "metadata": {},
786
- "source": [
787
- "## 8. Analyze Results"
788
- ]
789
- },
790
- {
791
- "cell_type": "code",
792
- "execution_count": null,
793
- "metadata": {},
794
- "outputs": [],
795
- "source": [
796
- "# Create DataFrame\n",
797
- "df = pd.DataFrame(results)\n",
798
- "\n",
799
- "# Aggregate by configuration\n",
800
- "config_summary = df.groupby('Config').agg({\n",
801
- " 'LLM_Judge_Score': 'mean',\n",
802
- " 'Accuracy_Score': 'mean',\n",
803
- " 'Citation_Score': 'mean',\n",
804
- " 'Retrieval_Relevance': 'mean',\n",
805
- " 'Completeness_Score': 'mean',\n",
806
- " 'Response_Time': 'mean',\n",
807
- " 'Embedding_Model': 'first',\n",
808
- " 'Retrieval_Strategy': 'first',\n",
809
- " 'LLM_Model': 'first',\n",
810
- " 'Prompt_Strategy': 'first'\n",
811
- "}).round(2)\n",
812
- "\n",
813
- "# Sort by LLM Judge Score\n",
814
- "config_summary = config_summary.sort_values('LLM_Judge_Score', ascending=False)\n",
815
- "\n",
816
- "print(\"\\n\" + \"=\"*120)\n",
817
- "print(\"📊 CONFIGURATION RANKINGS (By LLM Judge Score)\")\n",
818
- "print(\"=\"*120)\n",
819
- "display_cols = ['Embedding_Model', 'Retrieval_Strategy', 'LLM_Model', 'Prompt_Strategy', \n",
820
- " 'LLM_Judge_Score', 'Accuracy_Score', 'Citation_Score', 'Response_Time']\n",
821
- "print(config_summary[display_cols].to_string())\n",
822
- "print(\"=\"*120)"
823
- ]
824
- },
825
- {
826
- "cell_type": "markdown",
827
- "metadata": {},
828
- "source": [
829
- "## 9. Component Analysis"
830
- ]
831
- },
832
- {
833
- "cell_type": "code",
834
- "execution_count": null,
835
- "metadata": {},
836
- "outputs": [],
837
- "source": [
838
- "# Analyze impact of each component\n",
839
- "print(\"\\n\" + \"=\"*100)\n",
840
- "print(\"🔍 COMPONENT IMPACT ANALYSIS\")\n",
841
- "print(\"=\"*100)\n",
842
- "\n",
843
- "# 1. Embedding Models\n",
844
- "print(\"\\n📚 EMBEDDING MODELS:\")\n",
845
- "embed_impact = df.groupby('Embedding_Model')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
846
- "for model, score in embed_impact.items():\n",
847
- " print(f\" {model}: {score:.2f}%\")\n",
848
- "\n",
849
- "# 2. Retrieval Strategies\n",
850
- "print(\"\\n🔎 RETRIEVAL STRATEGIES:\")\n",
851
- "retrieval_impact = df.groupby('Retrieval_Strategy')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
852
- "for strategy, score in retrieval_impact.items():\n",
853
- " notes = RETRIEVAL_STRATEGIES[strategy]['notes']\n",
854
- " print(f\" {strategy}: {score:.2f}% ({notes})\")\n",
855
- "\n",
856
- "# 3. LLM Models\n",
857
- "print(\"\\n🤖 LLM MODELS:\")\n",
858
- "llm_impact = df.groupby('LLM_Model')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
859
- "for model, score in llm_impact.items():\n",
860
- " print(f\" {model}: {score:.2f}%\")\n",
861
- "\n",
862
- "# 4. Prompting Strategies\n",
863
- "print(\"\\n💬 PROMPTING STRATEGIES:\")\n",
864
- "prompt_impact = df.groupby('Prompt_Strategy')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
865
- "for strategy, score in prompt_impact.items():\n",
866
- " print(f\" {strategy}: {score:.2f}%\")\n",
867
- "\n",
868
- "print(\"\\n\" + \"=\"*100)"
869
- ]
870
- },
871
- {
872
- "cell_type": "markdown",
873
- "metadata": {},
874
- "source": [
875
- "import os\n",
876
- "from pathlib import Path\n",
877
- "\n",
878
- "# Create output directory - using dynamic path\n",
879
- "output_dir = OUTPUT_DIR / 'rag_optimization_benchmark'\n",
880
- "output_dir.mkdir(parents=True, exist_ok=True)\n",
881
- "\n",
882
- "fig, axes = plt.subplots(2, 3, figsize=(20, 12))\n",
883
- "\n",
884
- "# 1. Top Configurations\n",
885
- "ax1 = axes[0, 0]\n",
886
- "top_configs = config_summary.head(10)\n",
887
- "config_labels = [c.split('_')[-2] + '+' + c.split('_')[-1] for c in top_configs.index]\n",
888
- "ax1.barh(config_labels, top_configs['LLM_Judge_Score'], color=sns.color_palette('viridis', len(top_configs)))\n",
889
- "ax1.set_xlabel('LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
890
- "ax1.set_title('Top 10 Configurations', fontsize=13, fontweight='bold')\n",
891
- "ax1.set_xlim(0, 100)\n",
892
- "for i, score in enumerate(top_configs['LLM_Judge_Score']):\n",
893
- " ax1.text(score + 1, i, f'{score:.1f}', va='center', fontsize=10)\n",
894
- "\n",
895
- "# 2. Embedding Model Impact\n",
896
- "ax2 = axes[0, 1]\n",
897
- "ax2.bar(embed_impact.index, embed_impact.values, color='skyblue', alpha=0.8)\n",
898
- "ax2.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
899
- "ax2.set_title('Embedding Model Impact', fontsize=13, fontweight='bold')\n",
900
- "ax2.set_ylim(0, 100)\n",
901
- "ax2.tick_params(axis='x', rotation=45)\n",
902
- "for i, (model, score) in enumerate(embed_impact.items()):\n",
903
- " ax2.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n",
904
- "\n",
905
- "# 3. Retrieval Strategy Impact\n",
906
- "ax3 = axes[0, 2]\n",
907
- "ax3.bar(retrieval_impact.index, retrieval_impact.values, color='coral', alpha=0.8)\n",
908
- "ax3.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
909
- "ax3.set_title('Retrieval Strategy Impact', fontsize=13, fontweight='bold')\n",
910
- "ax3.set_ylim(0, 100)\n",
911
- "ax3.tick_params(axis='x', rotation=45)\n",
912
- "for i, (strategy, score) in enumerate(retrieval_impact.items()):\n",
913
- " ax3.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=9)\n",
914
- "\n",
915
- "# 4. LLM Model Impact\n",
916
- "ax4 = axes[1, 0]\n",
917
- "ax4.bar(llm_impact.index, llm_impact.values, color='mediumseagreen', alpha=0.8)\n",
918
- "ax4.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
919
- "ax4.set_title('LLM Model Impact', fontsize=13, fontweight='bold')\n",
920
- "ax4.set_ylim(0, 100)\n",
921
- "ax4.tick_params(axis='x', rotation=45)\n",
922
- "for i, (model, score) in enumerate(llm_impact.items()):\n",
923
- " ax4.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n",
924
- "\n",
925
- "# 5. Prompting Strategy Impact\n",
926
- "ax5 = axes[1, 1]\n",
927
- "ax5.bar(prompt_impact.index, prompt_impact.values, color='mediumpurple', alpha=0.8)\n",
928
- "ax5.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\n",
929
- "ax5.set_title('Prompting Strategy Impact', fontsize=13, fontweight='bold')\n",
930
- "ax5.set_ylim(0, 100)\n",
931
- "ax5.tick_params(axis='x', rotation=45)\n",
932
- "for i, (strategy, score) in enumerate(prompt_impact.items()):\n",
933
- " ax5.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n",
934
- "\n",
935
- "# 6. Score Components (best config)\n",
936
- "ax6 = axes[1, 2]\n",
937
- "best_config = config_summary.iloc[0]\n",
938
- "components = ['Accuracy', 'Citation', 'Completeness']\n",
939
- "scores = [best_config['Accuracy_Score'], best_config['Citation_Score'], best_config['Completeness_Score']]\n",
940
- "colors_comp = ['#FF6B6B', '#4ECDC4', '#45B7D1']\n",
941
- "bars = ax6.bar(components, scores, color=colors_comp, alpha=0.8)\n",
942
- "ax6.set_ylabel('Score (%)', fontsize=11, fontweight='bold')\n",
943
- "ax6.set_title(f'Best Config Components\\n{best_config.name.split(\"_\")[2]}', fontsize=13, fontweight='bold')\n",
944
- "ax6.set_ylim(0, 100)\n",
945
- "for i, score in enumerate(scores):\n",
946
- " ax6.text(i, score + 2, f'{score:.1f}%', ha='center', fontsize=10, fontweight='bold')\n",
947
- "\n",
948
- "plt.tight_layout()\n",
949
- "plt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\n",
950
- "plt.show()\n",
951
- "\n",
952
- "print(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
953
- ]
954
- },
955
- {
956
- "cell_type": "code",
957
- "execution_count": null,
958
- "metadata": {},
959
- "outputs": [],
960
- "source": "import os\nfrom pathlib import Path\n\n# Create output directory - using dynamic path\noutput_dir = OUTPUT_DIR / 'rag_optimization_benchmark'\noutput_dir.mkdir(parents=True, exist_ok=True)\n\nfig, axes = plt.subplots(2, 3, figsize=(20, 12))\n\n# 1. Top Configurations\nax1 = axes[0, 0]\ntop_configs = config_summary.head(10)\nconfig_labels = [c.split('_')[-2] + '+' + c.split('_')[-1] for c in top_configs.index]\nax1.barh(config_labels, top_configs['LLM_Judge_Score'], color=sns.color_palette('viridis', len(top_configs)))\nax1.set_xlabel('LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax1.set_title('Top 10 Configurations', fontsize=13, fontweight='bold')\nax1.set_xlim(0, 100)\nfor i, score in enumerate(top_configs['LLM_Judge_Score']):\n ax1.text(score + 1, i, f'{score:.1f}', va='center', fontsize=10)\n\n# 2. Embedding Model Impact\nax2 = axes[0, 1]\nax2.bar(embed_impact.index, embed_impact.values, color='skyblue', alpha=0.8)\nax2.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax2.set_title('Embedding Model Impact', fontsize=13, fontweight='bold')\nax2.set_ylim(0, 100)\nax2.tick_params(axis='x', rotation=45)\nfor i, (model, score) in enumerate(embed_impact.items()):\n ax2.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 3. Retrieval Strategy Impact\nax3 = axes[0, 2]\nax3.bar(retrieval_impact.index, retrieval_impact.values, color='coral', alpha=0.8)\nax3.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax3.set_title('Retrieval Strategy Impact', fontsize=13, fontweight='bold')\nax3.set_ylim(0, 100)\nax3.tick_params(axis='x', rotation=45)\nfor i, (strategy, score) in enumerate(retrieval_impact.items()):\n ax3.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=9)\n\n# 4. 
LLM Model Impact\nax4 = axes[1, 0]\nax4.bar(llm_impact.index, llm_impact.values, color='mediumseagreen', alpha=0.8)\nax4.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax4.set_title('LLM Model Impact', fontsize=13, fontweight='bold')\nax4.set_ylim(0, 100)\nax4.tick_params(axis='x', rotation=45)\nfor i, (model, score) in enumerate(llm_impact.items()):\n ax4.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 5. Prompting Strategy Impact\nax5 = axes[1, 1]\nax5.bar(prompt_impact.index, prompt_impact.values, color='mediumpurple', alpha=0.8)\nax5.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax5.set_title('Prompting Strategy Impact', fontsize=13, fontweight='bold')\nax5.set_ylim(0, 100)\nax5.tick_params(axis='x', rotation=45)\nfor i, (strategy, score) in enumerate(prompt_impact.items()):\n ax5.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 6. Score Components (best config)\nax6 = axes[1, 2]\nbest_config = config_summary.iloc[0]\ncomponents = ['Accuracy', 'Citation', 'Completeness']\nscores = [best_config['Accuracy_Score'], best_config['Citation_Score'], best_config['Completeness_Score']]\ncolors_comp = ['#FF6B6B', '#4ECDC4', '#45B7D1']\nbars = ax6.bar(components, scores, color=colors_comp, alpha=0.8)\nax6.set_ylabel('Score (%)', fontsize=11, fontweight='bold')\nax6.set_title(f'Best Config Components\\n{best_config.name.split(\"_\")[2]}', fontsize=13, fontweight='bold')\nax6.set_ylim(0, 100)\nfor i, score in enumerate(scores):\n ax6.text(i, score + 2, f'{score:.1f}%', ha='center', fontsize=10, fontweight='bold')\n\nplt.tight_layout()\nplt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\nplt.show()\n\nprint(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
961
- },
962
- {
963
- "cell_type": "code",
964
- "execution_count": null,
965
- "metadata": {},
966
- "outputs": [],
967
- "source": [
968
- "best_config = config_summary.iloc[0]\n",
969
- "\n",
970
- "print(\"\\n\" + \"=\"*100)\n",
971
- "print(\"🏆 OPTIMAL RAG CONFIGURATION\")\n",
972
- "print(\"=\"*100)\n",
973
- "\n",
974
- "print(f\"\\n✅ Best Configuration: {best_config.name}\")\n",
975
- "print(f\"\\n📊 Performance:\")\n",
976
- "print(f\" LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%\")\n",
977
- "print(f\" Accuracy: {best_config['Accuracy_Score']:.2f}%\")\n",
978
- "print(f\" Citation Quality: {best_config['Citation_Score']:.2f}%\")\n",
979
- "print(f\" Completeness: {best_config['Completeness_Score']:.2f}%\")\n",
980
- "print(f\" Avg Response Time: {best_config['Response_Time']:.2f}s\")\n",
981
- "\n",
982
- "print(f\"\\n⚙️ Components:\")\n",
983
- "print(f\" Embedding Model: {best_config['Embedding_Model']}\")\n",
984
- "print(f\" → {EMBEDDING_MODELS[best_config['Embedding_Model']]['name']}\")\n",
985
- "print(f\" Retrieval Strategy: {best_config['Retrieval_Strategy']}\")\n",
986
- "print(f\" → {RETRIEVAL_STRATEGIES[best_config['Retrieval_Strategy']]['notes']}\")\n",
987
- "print(f\" LLM Model: {best_config['LLM_Model']}\")\n",
988
- "print(f\" Prompting Strategy: {best_config['Prompt_Strategy']}\")\n",
989
- "\n",
990
- "print(f\"\\n💡 Key Findings:\")\n",
991
- "print(f\" 1. Best Embedding: {embed_impact.index[0]} ({embed_impact.values[0]:.2f}%)\")\n",
992
- "print(f\" 2. Best Retrieval: {retrieval_impact.index[0]} ({retrieval_impact.values[0]:.2f}%)\")\n",
993
- "print(f\" 3. Best LLM: {llm_impact.index[0]} ({llm_impact.values[0]:.2f}%)\")\n",
994
- "print(f\" 4. Best Prompt: {prompt_impact.index[0]} ({prompt_impact.values[0]:.2f}%)\")\n",
995
- "\n",
996
- "print(f\"\\n🎯 Hackathon Impact:\")\n",
997
- "print(f\" LLM Quality = 30% of total score\")\n",
998
- "print(f\" Your score: {best_config['LLM_Judge_Score']:.2f}% × 30% = {best_config['LLM_Judge_Score'] * 0.3:.2f} points\")\n",
999
- "\n",
1000
- "baseline = df[df['Config'].str.contains('baseline')].iloc[0] if len(df[df['Config'].str.contains('baseline')]) > 0 else None\n",
1001
- "if baseline is not None:\n",
1002
- " improvement = best_config['LLM_Judge_Score'] - baseline['LLM_Judge_Score']\n",
1003
- " print(f\"\\n📈 Improvement vs Baseline:\")\n",
1004
- " print(f\" +{improvement:.2f}% quality improvement\")\n",
1005
- " print(f\" = +{improvement * 0.3:.2f} hackathon points\")\n",
1006
- "\n",
1007
- "print(\"\\n\" + \"=\"*100)\n",
1008
- "print(\"📝 IMPLEMENTATION CHECKLIST\")\n",
1009
- "print(\"=\"*100)\n",
1010
- "print(f\"\\n1. Use embedding model: {EMBEDDING_MODELS[best_config['Embedding_Model']]['name']}\")\n",
1011
- "print(f\"2. Implement retrieval: {best_config['Retrieval_Strategy']}\")\n",
1012
- "print(f\"3. Use LLM model: {best_config['LLM_Model']}\")\n",
1013
- "print(f\"4. Apply prompt: {best_config['Prompt_Strategy']}\")\n",
1014
- "print(f\"\\n5. Expected performance:\")\n",
1015
- "print(f\" - LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%\")\n",
1016
- "print(f\" - Response time: ~{best_config['Response_Time']:.1f}s\")\n",
1017
- "print(\"=\"*100)"
1018
- ]
1019
- },
1020
- {
1021
- "cell_type": "markdown",
1022
- "metadata": {},
1023
- "source": [
1024
- "# Save results\n",
1025
- "from pathlib import Path\n",
1026
- "\n",
1027
- "# Using dynamic path\n",
1028
- "output_dir = OUTPUT_DIR / 'rag_optimization_benchmark'\n",
1029
- "output_dir.mkdir(parents=True, exist_ok=True)\n",
1030
- "\n",
1031
- "df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
1032
- "config_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n",
1033
- "\n",
1034
- "# Save component impacts\n",
1035
- "impacts = pd.DataFrame({\n",
1036
- " 'Embedding_Impact': embed_impact,\n",
1037
- " 'Retrieval_Impact': retrieval_impact.reindex(embed_impact.index, fill_value=0),\n",
1038
- " 'LLM_Impact': llm_impact.reindex(embed_impact.index, fill_value=0),\n",
1039
- " 'Prompt_Impact': prompt_impact.reindex(embed_impact.index, fill_value=0)\n",
1040
- "}).fillna(0)\n",
1041
- "impacts.to_csv(output_dir / 'component_impacts.csv', encoding='utf-8')\n",
1042
- "\n",
1043
- "print(\"\\n✅ Results exported to output/rag_optimization_benchmark/:\")\n",
1044
- "print(\" - detailed_results.csv (all tests)\")\n",
1045
- "print(\" - summary.csv (config rankings)\")\n",
1046
- "print(\" - component_impacts.csv (component analysis)\")\n",
1047
- "print(\" - results.png (visualizations)\")"
1048
- ]
1049
- }
1050
- ],
1051
- "metadata": {
1052
- "kernelspec": {
1053
- "display_name": "venv",
1054
- "language": "python",
1055
- "name": "python3"
1056
- },
1057
- "language_info": {
1058
- "codemirror_mode": {
1059
- "name": "ipython",
1060
- "version": 3
1061
- },
1062
- "file_extension": ".py",
1063
- "mimetype": "text/x-python",
1064
- "name": "python",
1065
- "nbconvert_exporter": "python",
1066
- "pygments_lexer": "ipython3",
1067
- "version": "3.10.12"
1068
- }
1069
- },
1070
- "nbformat": 4,
1071
- "nbformat_minor": 4
1072
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/{requirements_rag_optimization.txt → requirements.txt} RENAMED
@@ -1,19 +1,24 @@
1
- # RAG Pipeline Optimization Requirements
2
- # Install with: pip install -r requirements_rag_optimization.txt
 
3
 
4
- # Azure OpenAI
5
  openai==1.54.0
6
 
7
- # Vector Database
8
  pinecone-client==5.0.0
9
 
10
- # Embeddings and Reranking
11
  sentence-transformers==3.3.1
12
 
13
- # Metrics
 
 
 
 
14
  jiwer==3.0.3
15
 
16
- # Data analysis and visualization
17
  pandas==2.1.3
18
  matplotlib==3.8.2
19
  seaborn==0.13.0
 
1
+ # Notebooks Requirements
2
+ # All dependencies for VLM OCR, LLM Benchmark, and RAG Optimization notebooks
3
+ # Install with: pip install -r requirements.txt
4
 
5
+ # Azure OpenAI (for all notebooks)
6
  openai==1.54.0
7
 
8
+ # Vector Database (for LLM and RAG notebooks)
9
  pinecone-client==5.0.0
10
 
11
+ # Embeddings and Transformers (for LLM and RAG notebooks)
12
  sentence-transformers==3.3.1
13
 
14
+ # PDF Processing (for VLM OCR notebook)
15
+ PyMuPDF==1.23.8
16
+ Pillow==10.1.0
17
+
18
+ # Metrics (for all notebooks)
19
  jiwer==3.0.3
20
 
21
+ # Data Analysis and Visualization (for all notebooks)
22
  pandas==2.1.3
23
  matplotlib==3.8.2
24
  seaborn==0.13.0
notebooks/requirements_vlm_ocr.txt DELETED
@@ -1,24 +0,0 @@
1
- # VLM OCR Benchmarking Requirements
2
- # Install with: pip install -r requirements_vlm_ocr.txt
3
-
4
- # Azure OpenAI client (for vision models)
5
- openai==1.54.0
6
-
7
- # PDF processing
8
- PyMuPDF==1.23.8
9
- Pillow==10.1.0
10
-
11
- # Metrics
12
- jiwer==3.0.3
13
-
14
- # Data analysis and visualization
15
- pandas==2.1.3
16
- matplotlib==3.8.2
17
- seaborn==0.13.0
18
-
19
- # Utilities
20
- python-dotenv==1.0.0
21
-
22
- # Jupyter
23
- jupyter==1.0.0
24
- ipykernel==6.27.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/vlm_ocr_benchmark.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/vlm_ocr_benchmark.ipynb.backup DELETED
The diff for this file is too large to render. See raw diff
 
scripts/README.md ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scripts Directory
2
+
3
+ One-time utility scripts for SOCAR Hackathon project.
4
+
5
+ ## Available Scripts
6
+
7
+ ### 📊 Data Management
8
+
9
+ #### `check_pinecone.py`
10
+ Check Pinecone vector database status and statistics.
11
+
12
+ ```bash
13
+ python scripts/check_pinecone.py
14
+ ```
15
+
16
+ **Output:**
17
+ - Total vector count
18
+ - Index dimensions
19
+ - Namespaces (if any)
20
+ - Connection status
21
+
22
+ #### `clear_pinecone.py`
23
+ Clear all data from Pinecone index before re-ingestion.
24
+
25
+ ```bash
26
+ python scripts/clear_pinecone.py
27
+ ```
28
+
29
+ **⚠️ WARNING**: This deletes ALL vectors! Requires typing 'DELETE' to confirm.
30
+
31
+ **Use case:**
32
+ - Before re-ingesting documents with new chunking strategy
33
+ - Testing with fresh data
34
+ - Cleaning up after experiments
35
+
36
+ ### 🤖 Azure OpenAI
37
+
38
+ #### `list_azure_models.py`
39
+ List all deployed Azure OpenAI models.
40
+
41
+ ```bash
42
+ python scripts/list_azure_models.py
43
+ ```
44
+
45
+ **Output:**
46
+ - Vision models (GPT-4.1, GPT-5, Claude, etc.)
47
+ - Text models (Llama, DeepSeek, etc.)
48
+ - Total count and categorization
49
+
50
+ **Use case:**
51
+ - Verify which models are deployed
52
+ - Check model availability before updating notebooks
53
+ - Debugging 404 errors
54
+
55
+ ## Setup
56
+
57
+ All scripts use environment variables from `.env` file:
58
+
59
+ ```bash
60
+ # Required in .env
61
+ PINECONE_API_KEY=your_key
62
+ PINECONE_INDEX_NAME=hackathon
63
+ AZURE_OPENAI_API_KEY=your_key
64
+ AZURE_OPENAI_ENDPOINT=your_endpoint
65
+ ```
66
+
67
+ ## Dependencies
68
+
69
+ Scripts use the same dependencies as the main project:
70
+ - `python-dotenv` - Environment variables
71
+ - `pinecone-client` - Vector database
72
+ - `openai` - Azure OpenAI
73
+
74
+ Install from project root:
75
+ ```bash
76
+ pip install -r notebooks/requirements.txt
77
+ ```
78
+
79
+ ## Common Workflows
80
+
81
+ ### Re-ingesting Documents
82
+
83
+ ```bash
84
+ # 1. Check current data
85
+ python scripts/check_pinecone.py
86
+
87
+ # 2. Clear existing data
88
+ python scripts/clear_pinecone.py
89
+
90
+ # 3. Run ingestion script (not included - create as needed)
91
+ # python scripts/ingest_documents.py
92
+
93
+ # 4. Verify new data
94
+ python scripts/check_pinecone.py
95
+ ```
96
+
97
+ ### Verifying Model Availability
98
+
99
+ ```bash
100
+ # List all deployed models
101
+ python scripts/list_azure_models.py
102
+
103
+ # Check if specific model exists in output
104
+ python scripts/list_azure_models.py | grep "Llama-3.2-Vision"
105
+ ```
106
+
107
+ ## Adding New Scripts
108
+
109
+ When creating new scripts:
110
+ 1. Add descriptive docstring at top
111
+ 2. Use environment variables from `.env`
112
+ 3. Include error handling with helpful messages
113
+ 4. Update this README with usage instructions
114
+ 5. Follow existing naming convention: `verb_noun.py`
115
+
116
+ ## Examples
117
+
118
+ ### Safe Pinecone Cleanup
119
+ ```python
120
+ # First check what's there
121
+ $ python scripts/check_pinecone.py
122
+ Total Vectors: 1,300
123
+ Dimensions: 1024
124
+
125
+ # Then clear if needed
126
+ $ python scripts/clear_pinecone.py
127
+ ⚠️ WARNING: This will delete ALL 1,300 vectors!
128
+ Type 'DELETE' to confirm: DELETE
129
+ ✅ Deletion completed!
130
+ ```
131
+
132
+ ### Check Vision Models
133
+ ```python
134
+ $ python scripts/list_azure_models.py
135
+
136
+ 🖼️ Vision Models (6):
137
+ ✅ gpt-4.1
138
+ ✅ gpt-5
139
+ ✅ gpt-5-mini
140
+ ✅ claude-sonnet-4-5
141
+ ✅ claude-opus-4-1
142
+ ✅ Phi-4-multimodal-instruct
143
+ ```
scripts/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """SOCAR Hackathon utility scripts"""
scripts/check_pinecone.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Check Pinecone index status and statistics
3
+ Quick utility to inspect vector database
4
+ """
5
+
6
+ import os
7
+ from dotenv import load_dotenv
8
+ from pinecone import Pinecone
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+ def check_pinecone_status():
14
+ """Display Pinecone index information"""
15
+
16
+ try:
17
+ # Initialize Pinecone
18
+ pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
19
+ index_name = os.getenv('PINECONE_INDEX_NAME', 'hackathon')
20
+ index = pc.Index(index_name)
21
+
22
+ # Get index statistics
23
+ stats = index.describe_index_stats()
24
+
25
+ print("="*80)
26
+ print("PINECONE INDEX STATUS")
27
+ print("="*80)
28
+
29
+ print(f"\n📊 Index Information:")
30
+ print(f" Name: {index_name}")
31
+ print(f" Total Vectors: {stats.get('total_vector_count', 0):,}")
32
+ print(f" Dimensions: {stats.get('dimension', 'N/A')}")
33
+
34
+ # Check namespaces if any
35
+ if 'namespaces' in stats and stats['namespaces']:
36
+ print(f"\n📁 Namespaces:")
37
+ for ns_name, ns_stats in stats['namespaces'].items():
38
+ ns_display = ns_name if ns_name else "(default)"
39
+ print(f" {ns_display}: {ns_stats.get('vector_count', 0):,} vectors")
40
+
41
+ # Index configuration
42
+ print(f"\n⚙️ Configuration:")
43
+ print(f" API Key: {os.getenv('PINECONE_API_KEY')[:10]}..." if os.getenv('PINECONE_API_KEY') else " API Key: Not set")
44
+
45
+ # Connection status
46
+ if stats.get('total_vector_count', 0) > 0:
47
+ print(f"\n✅ Status: Connected and populated")
48
+ else:
49
+ print(f"\n⚠️ Status: Connected but empty")
50
+
51
+ except Exception as e:
52
+ print("="*80)
53
+ print("PINECONE CONNECTION ERROR")
54
+ print("="*80)
55
+ print(f"\n❌ Error: {e}")
56
+ print("\nPlease check:")
57
+ print(" 1. PINECONE_API_KEY in .env file")
58
+ print(" 2. PINECONE_INDEX_NAME in .env file")
59
+ print(" 3. Index exists in your Pinecone account")
60
+
61
+ if __name__ == "__main__":
62
+ check_pinecone_status()
scripts/clear_pinecone.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Clear all data from Pinecone index
3
+ One-time script for data cleanup before re-ingestion
4
+ """
5
+
6
+ import os
7
+ from dotenv import load_dotenv
8
+ from pinecone import Pinecone
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+ def clear_pinecone_index():
14
+ """Delete all vectors from Pinecone index"""
15
+
16
+ # Initialize Pinecone
17
+ pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
18
+ index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))
19
+
20
+ # Get current stats
21
+ stats = index.describe_index_stats()
22
+ total_vectors = stats['total_vector_count']
23
+
24
+ print("="*80)
25
+ print("PINECONE DATA CLEANUP")
26
+ print("="*80)
27
+ print(f"\nIndex: {os.getenv('PINECONE_INDEX_NAME', 'hackathon')}")
28
+ print(f"Current vectors: {total_vectors}")
29
+ print(f"Dimensions: {stats.get('dimension', 'N/A')}")
30
+
31
+ if total_vectors == 0:
32
+ print("\n✅ Index is already empty. Nothing to delete.")
33
+ return
34
+
35
+ # Confirm deletion
36
+ print(f"\n⚠️ WARNING: This will delete ALL {total_vectors} vectors!")
37
+ confirm = input("Type 'DELETE' to confirm: ")
38
+
39
+ if confirm != 'DELETE':
40
+ print("\n❌ Deletion cancelled. No data was removed.")
41
+ return
42
+
43
+ print("\n🗑️ Deleting all vectors...")
44
+
45
+ try:
46
+ # Delete all vectors
47
+ index.delete(delete_all=True)
48
+
49
+ print("✅ Deletion completed!")
50
+
51
+ # Verify deletion
52
+ import time
53
+ time.sleep(2) # Wait for deletion to propagate
54
+
55
+ stats = index.describe_index_stats()
56
+ remaining = stats['total_vector_count']
57
+
58
+ print(f"\n📊 Final status:")
59
+ print(f" Remaining vectors: {remaining}")
60
+
61
+ if remaining == 0:
62
+ print(" ✅ Index successfully cleared!")
63
+ else:
64
+ print(f" ⚠️ {remaining} vectors still remain (may need a moment to sync)")
65
+
66
+ except Exception as e:
67
+ print(f"\n❌ Error during deletion: {e}")
68
+
69
+ if __name__ == "__main__":
70
+ clear_pinecone_index()
scripts/list_azure_models.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ List all deployed Azure OpenAI models
3
+ Useful for verifying available models
4
+ """
5
+
6
+ import os
7
+ from dotenv import load_dotenv
8
+ from openai import AzureOpenAI
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+ def list_azure_models():
14
+ """List all deployed Azure OpenAI models"""
15
+
16
+ try:
17
+ client = AzureOpenAI(
18
+ api_key=os.getenv('AZURE_OPENAI_API_KEY'),
19
+ api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),
20
+ azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')
21
+ )
22
+
23
+ print("="*80)
24
+ print("AZURE OPENAI DEPLOYED MODELS")
25
+ print("="*80)
26
+
27
+ # List models
28
+ models = client.models.list()
29
+
30
+ print(f"\n📊 Total Models: {len(list(models))}")
31
+ print(f"\nDeployed Models:")
32
+ print("-" * 80)
33
+
34
+ model_list = []
35
+ for model in models:
36
+ model_list.append({
37
+ 'id': model.id,
38
+ 'created': model.created if hasattr(model, 'created') else 'N/A'
39
+ })
40
+
41
+ # Sort by id
42
+ model_list.sort(key=lambda x: x['id'])
43
+
44
+ # Categorize models
45
+ vision_models = []
46
+ text_models = []
47
+
48
+ for model in model_list:
49
+ model_id = model['id']
50
+ if any(keyword in model_id.lower() for keyword in ['vision', 'multimodal', 'gpt-4.1', 'gpt-5', 'claude']):
51
+ vision_models.append(model_id)
52
+ else:
53
+ text_models.append(model_id)
54
+
55
+ print(f"\n🖼️ Vision Models ({len(vision_models)}):")
56
+ for model_id in vision_models:
57
+ print(f" ✅ {model_id}")
58
+
59
+ print(f"\n📝 Text Models ({len(text_models)}):")
60
+ for model_id in text_models:
61
+ print(f" ✅ {model_id}")
62
+
63
+ print("\n" + "="*80)
64
+
65
+ except Exception as e:
66
+ print("="*80)
67
+ print("AZURE OPENAI CONNECTION ERROR")
68
+ print("="*80)
69
+ print(f"\n❌ Error: {e}")
70
+ print("\nPlease check:")
71
+ print(" 1. AZURE_OPENAI_API_KEY in .env file")
72
+ print(" 2. AZURE_OPENAI_ENDPOINT in .env file")
73
+ print(" 3. API version compatibility")
74
+
75
+ if __name__ == "__main__":
76
+ list_azure_models()
test_api.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Quick test script for SOCAR LLM API
3
+ """
4
+
5
+ import requests
6
+ import json
7
+ from docs.sample_questions import questions
8
+
9
+ # API base URL
10
+ BASE_URL = "http://localhost:8000"
11
+
12
+ def test_health():
13
+ """Test health endpoint"""
14
+ print("🔍 Testing health endpoint...")
15
+ response = requests.get(f"{BASE_URL}/health")
16
+ print(f"Status: {response.status_code}")
17
+ print(json.dumps(response.json(), indent=2))
18
+ print()
19
+
20
+ def test_root():
21
+ """Test root endpoint"""
22
+ print("🔍 Testing root endpoint...")
23
+ response = requests.get(BASE_URL)
24
+ print(f"Status: {response.status_code}")
25
+ print(json.dumps(response.json(), indent=2))
26
+ print()
27
+
28
+ def test_llm(question: str):
29
+ """Test LLM endpoint"""
30
+ print(f"🔍 Testing LLM endpoint...")
31
+ print(f"Question: {question}\n")
32
+
33
+ payload = {
34
+ "messages": [
35
+ {"role": "user", "content": question}
36
+ ],
37
+ "temperature": 0.2,
38
+ "max_tokens": 1000
39
+ }
40
+
41
+ response = requests.post(f"{BASE_URL}/llm", json=payload)
42
+ print(f"Status: {response.status_code}")
43
+
44
+ if response.status_code == 200:
45
+ result = response.json()
46
+ print(f"Response time: {result['response_time']}s")
47
+ print(f"Model: {result['model']}")
48
+ print(f"\nAnswer:\n{result['response']}")
49
+ print(f"\nSources:")
50
+ for source in result['sources']:
51
+ print(f" - {source['pdf_name']}, Page {source['page_number']} (score: {source['relevance_score']})")
52
+ else:
53
+ print(f"Error: {response.text}")
54
+ print()
55
+
56
+ if __name__ == "__main__":
57
+ print("="*80)
58
+ print("SOCAR LLM API Test Suite")
59
+ print("="*80)
60
+ print()
61
+
62
+ # Test health
63
+ try:
64
+ test_health()
65
+ except Exception as e:
66
+ print(f"❌ Health check failed: {e}\n")
67
+
68
+ # Test root
69
+ try:
70
+ test_root()
71
+ except Exception as e:
72
+ print(f"❌ Root endpoint failed: {e}\n")
73
+
74
+ # Test LLM with sample question
75
+ try:
76
+ test_llm("Palçıq vulkanlarının təsir radiusu nə qədərdir?")
77
+ except Exception as e:
78
+ print(f"❌ LLM endpoint failed: {e}\n")
79
+
80
+ print("="*80)
81
+ print("✅ Test suite completed!")
82
+ print("="*80)