IsmatS committed on
Commit
0591d5f
·
1 Parent(s): a58b934
notebooks/llm_benchmark.ipynb CHANGED
@@ -53,7 +53,7 @@
53
  "name": "stdout",
54
  "output_type": "stream",
55
  "text": [
56
- " Libraries loaded successfully\n"
57
  ]
58
  }
59
  ],
@@ -78,19 +78,59 @@
78
  "sns.set_style('whitegrid')\n",
79
  "plt.rcParams['figure.figsize'] = (14, 8)\n",
80
  "\n",
81
- "print(\" Libraries loaded successfully\")"
82
  ]
83
  },
84
  {
85
- "cell_type": "markdown",
 
86
  "metadata": {},
 
 
 
 
 
 
 
 
 
 
 
87
  "source": [
88
- "## 1. Load Test Questions and Expected Answers"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  ]
90
  },
91
  {
92
  "cell_type": "code",
93
- "execution_count": 3,
94
  "metadata": {},
95
  "outputs": [
96
  {
@@ -100,21 +140,21 @@
100
  "Loaded 5 test cases\n",
101
  "\n",
102
  "Test Questions:\n",
103
- "1. Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasiyası tələb olunur?...\n",
104
- "2. Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə hansı layda tətbiq edilmişdir bunun m...\n",
105
- "3. Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 CaO oksidləri arasında ha...\n",
106
- "4. Example4: Bakı arxipelaqı (BA) Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrinə əsasən neft qaz...\n",
107
- "5. Example5: Bu zonada hansı proseslər baş verir?...\n"
108
  ]
109
  }
110
  ],
111
  "source": [
112
- "# Load sample questions\n",
113
- "with open('docs/sample_questions.json', 'r', encoding='utf-8') as f:\n",
114
  " questions = json.load(f)\n",
115
  "\n",
116
- "# Load expected answers\n",
117
- "with open('docs/sample_answers.json', 'r', encoding='utf-8') as f:\n",
118
  " expected_answers = json.load(f)\n",
119
  "\n",
120
  "print(f\"Loaded {len(questions)} test cases\")\n",
@@ -133,38 +173,9 @@
133
  },
134
  {
135
  "cell_type": "code",
136
- "execution_count": 4,
137
  "metadata": {},
138
- "outputs": [
139
- {
140
- "name": "stdout",
141
- "output_type": "stream",
142
- "text": [
143
- "✅ Vector DB connected: {'_response_info': {'raw_headers': {'connection': 'keep-alive',\n",
144
- " 'content-length': '188',\n",
145
- " 'content-type': 'application/json',\n",
146
- " 'date': 'Sun, 14 Dec 2025 03:21:33 GMT',\n",
147
- " 'grpc-status': '0',\n",
148
- " 'server': 'envoy',\n",
149
- " 'x-envoy-upstream-service-time': '4',\n",
150
- " 'x-pinecone-request-id': '3979707437017514155',\n",
151
- " 'x-pinecone-request-latency-ms': '4'}},\n",
152
- " 'dimension': 1024,\n",
153
- " 'index_fullness': 0.0,\n",
154
- " 'memoryFullness': 0.0,\n",
155
- " 'metric': 'cosine',\n",
156
- " 'namespaces': {'__default__': {'vector_count': 1300}},\n",
157
- " 'storageFullness': 0.0,\n",
158
- " 'total_vector_count': 1300,\n",
159
- " 'vector_type': 'dense'}\n",
160
- "✅ Embedding model loaded: SentenceTransformer(\n",
161
- " (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})\n",
162
- " (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})\n",
163
- " (2): Normalize()\n",
164
- ")\n"
165
- ]
166
- }
167
- ],
168
  "source": [
169
  "# Initialize Pinecone\n",
170
  "pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
@@ -173,8 +184,8 @@
173
  "# Initialize embedding model (same as used for ingestion)\n",
174
  "embed_model = SentenceTransformer('BAAI/bge-large-en-v1.5')\n",
175
  "\n",
176
- "print(f\" Vector DB connected: {index.describe_index_stats()}\")\n",
177
- "print(f\" Embedding model loaded: {embed_model}\")"
178
  ]
179
  },
180
  {
@@ -186,19 +197,9 @@
186
  },
187
  {
188
  "cell_type": "code",
189
- "execution_count": 5,
190
  "metadata": {},
191
- "outputs": [
192
- {
193
- "name": "stdout",
194
- "output_type": "stream",
195
- "text": [
196
- "\n",
197
- "✅ Retrieved 3 documents for test query\n",
198
- "Top result: document_10.pdf, page 8 (score: 0.767)\n"
199
- ]
200
- }
201
- ],
202
  "source": [
203
  "def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
204
  " \"\"\"\n",
@@ -227,9 +228,9 @@
227
  " return documents\n",
228
  "\n",
229
  "# Test retrieval\n",
230
- "test_query = \"Palçıq vulkanlarının təsir radiusu qədərdir?\"\n",
231
  "test_docs = retrieve_documents(test_query)\n",
232
- "print(f\"\\n Retrieved {len(test_docs)} documents for test query\")\n",
233
  "print(f\"Top result: {test_docs[0]['pdf_name']}, page {test_docs[0]['page_number']} (score: {test_docs[0]['score']:.3f})\")"
234
  ]
235
  },
@@ -245,7 +246,83 @@
245
  "execution_count": null,
246
  "metadata": {},
247
  "outputs": [],
248
- "source": "# Initialize Azure OpenAI\nazure_client = AzureOpenAI(\n api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n)\n\nLLM_MODELS = {\n 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n 'DeepSeek-R1': 'DeepSeek-R1',\n 'GPT-4.1': 'gpt-4.1',\n 'GPT-5-mini': 'gpt-5-mini',\n 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n}\n\ndef generate_answer(model_name: str, query: str, documents: List[Dict], \n temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n \"\"\"\n Generate answer using specified LLM model.\n Returns: (answer, response_time)\n \"\"\"\n # Build context from retrieved documents\n context_parts = []\n for i, doc in enumerate(documents, 1):\n context_parts.append(\n f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n )\n context = \"\\n\\n\".join(context_parts)\n \n # Create prompt\n prompt = f\"\"\"Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə mütəxəssis köməkçisisiniz.\n\nKontekst (əlaqəli sənədlər):\n{context}\n\nSual: {query}\n\nƏtraflı cavab verin və mütləq sənəd mənbələrinə istinad edin (PDF adı və səhifə nömrəsi ilə).\nCavabınız dəqiq, faktlara əsaslanan və kontekst məlumatlarından istifadə edən olmalıdır.\"\"\"\n \n # Get model deployment\n deployment = MODELS[model_name]['deployment']\n \n try:\n start_time = time.time()\n \n # GPT-5 models use max_completion_tokens, others use max_tokens\n if deployment.startswith('gpt-5'):\n response = azure_client.chat.completions.create(\n model=deployment,\n messages=[\n {\"role\": \"user\", \"content\": prompt}\n ],\n temperature=temperature,\n max_completion_tokens=max_tokens\n )\n else:\n response = azure_client.chat.completions.create(\n model=deployment,\n messages=[\n {\"role\": \"user\", \"content\": prompt}\n ],\n temperature=temperature,\n max_tokens=max_tokens\n )\n \n response_time = time.time() - 
start_time\n answer = response.choices[0].message.content\n \n return answer, response_time\n \n except Exception as e:\n return f\"ERROR: {str(e)}\", 0.0\n\nprint(f\"\\n✅ Configured {len(LLM_MODELS)} LLM models for testing\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  },
250
  {
251
  "cell_type": "markdown",
@@ -256,17 +333,9 @@
256
  },
257
  {
258
  "cell_type": "code",
259
- "execution_count": 7,
260
  "metadata": {},
261
- "outputs": [
262
- {
263
- "name": "stdout",
264
- "output_type": "stream",
265
- "text": [
266
- "✅ Evaluation functions ready\n"
267
- ]
268
- }
269
- ],
270
  "source": [
271
  "def normalize_text(text: str) -> str:\n",
272
  " \"\"\"Normalize text for comparison.\"\"\"\n",
@@ -313,7 +382,7 @@
313
  " cited_pages = sum(1 for page in page_numbers if page in answer)\n",
314
  " \n",
315
  " # Check for source keywords\n",
316
- " source_keywords = ['mənbə', 'sənəd', 'səhifə', 'pdf', 'document', 'page', 'source']\n",
317
  " has_source_ref = any(kw in answer.lower() for kw in source_keywords)\n",
318
  " \n",
319
  " citation_score = (\n",
@@ -350,7 +419,7 @@
350
  " 'Char_Count': char_count\n",
351
  " }\n",
352
  "\n",
353
- "print(\" Evaluation functions ready\")"
354
  ]
355
  },
356
  {
@@ -362,20 +431,9 @@
362
  },
363
  {
364
  "cell_type": "code",
365
- "execution_count": 8,
366
  "metadata": {},
367
- "outputs": [
368
- {
369
- "name": "stdout",
370
- "output_type": "stream",
371
- "text": [
372
- "Testing 5 models on 5 questions...\n",
373
- "\n",
374
- "This may take several minutes...\n",
375
- "\n"
376
- ]
377
- }
378
- ],
379
  "source": [
380
  "# Select models to test (you can comment out models to skip)\n",
381
  "MODELS_TO_TEST = [\n",
@@ -395,133 +453,9 @@
395
  },
396
  {
397
  "cell_type": "code",
398
- "execution_count": 9,
399
  "metadata": {},
400
- "outputs": [
401
- {
402
- "name": "stdout",
403
- "output_type": "stream",
404
- "text": [
405
- "\n",
406
- "================================================================================\n",
407
- "Testing: Llama-4-Maverick-17B\n",
408
- "================================================================================\n",
409
- "\n",
410
- " Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
411
- " ✅ Response time: 4.39s\n",
412
- "\n",
413
- " Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
414
- " ✅ Response time: 3.74s\n",
415
- "\n",
416
- " Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
417
- " ✅ Response time: 4.07s\n",
418
- "\n",
419
- " Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
420
- " ✅ Response time: 4.20s\n",
421
- "\n",
422
- " Question Example5: Bu zonada hansı proseslər baş verir?...\n",
423
- " ✅ Response time: 3.50s\n",
424
- "\n",
425
- " 📊 Llama-4-Maverick-17B Summary:\n",
426
- " Avg Response Time: 3.98s\n",
427
- " Avg Similarity: 0.0%\n",
428
- " Avg Citation Score: 84.0%\n",
429
- " Avg Completeness: 100.0%\n",
430
- "\n",
431
- "================================================================================\n",
432
- "Testing: DeepSeek-R1\n",
433
- "================================================================================\n",
434
- "\n",
435
- " Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
436
- " ✅ Response time: 10.00s\n",
437
- "\n",
438
- " Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
439
- " ✅ Response time: 10.39s\n",
440
- "\n",
441
- " Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
442
- " ✅ Response time: 10.73s\n",
443
- "\n",
444
- " Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
445
- " ✅ Response time: 12.17s\n",
446
- "\n",
447
- " Question Example5: Bu zonada hansı proseslər baş verir?...\n",
448
- " ✅ Response time: 10.72s\n",
449
- "\n",
450
- " 📊 DeepSeek-R1 Summary:\n",
451
- " Avg Response Time: 10.80s\n",
452
- " Avg Similarity: 0.0%\n",
453
- " Avg Citation Score: 80.0%\n",
454
- " Avg Completeness: 67.7%\n",
455
- "\n",
456
- "================================================================================\n",
457
- "Testing: GPT-4.1\n",
458
- "================================================================================\n",
459
- "\n",
460
- " Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
461
- " ✅ Response time: 6.66s\n",
462
- "\n",
463
- " Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
464
- " ✅ Response time: 5.05s\n",
465
- "\n",
466
- " Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
467
- " ✅ Response time: 7.65s\n",
468
- "\n",
469
- " Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
470
- " ✅ Response time: 6.68s\n",
471
- "\n",
472
- " Question Example5: Bu zonada hansı proseslər baş verir?...\n",
473
- " ✅ Response time: 3.69s\n",
474
- "\n",
475
- " 📊 GPT-4.1 Summary:\n",
476
- " Avg Response Time: 5.95s\n",
477
- " Avg Similarity: 0.0%\n",
478
- " Avg Citation Score: 84.0%\n",
479
- " Avg Completeness: 93.5%\n",
480
- "\n",
481
- "================================================================================\n",
482
- "Testing: GPT-5-mini\n",
483
- "================================================================================\n",
484
- "\n",
485
- " Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
486
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
487
- "\n",
488
- " Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
489
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
490
- "\n",
491
- " Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
492
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
493
- "\n",
494
- " Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
495
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
496
- "\n",
497
- " Question Example5: Bu zonada hansı proseslər baş verir?...\n",
498
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'message': \"Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead.\", 'type': 'invalid_request_error', 'param': 'max_tokens', 'code': 'unsupported_parameter'}}\n",
499
- "\n",
500
- "================================================================================\n",
501
- "Testing: Claude-Sonnet-4.5\n",
502
- "================================================================================\n",
503
- "\n",
504
- " Question Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasi...\n",
505
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
506
- "\n",
507
- " Question Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və hansı layda tətbiq e...\n",
508
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
509
- "\n",
510
- " Question Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümunələrdə SiO2 və CaO o...\n",
511
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
512
- "\n",
513
- " Question Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geotemperatur xəritələrin...\n",
514
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
515
- "\n",
516
- " Question Example5: Bu zonada hansı proseslər baş verir?...\n",
517
- " ❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
518
- "\n",
519
- "================================================================================\n",
520
- "✅ Benchmarking complete!\n",
521
- "================================================================================\n"
522
- ]
523
- }
524
- ],
525
  "source": [
526
  "# Run benchmark\n",
527
  "results = []\n",
@@ -547,10 +481,10 @@
547
  " answer, response_time = generate_answer(model_name, query, documents)\n",
548
  " \n",
549
  " if answer.startswith('ERROR'):\n",
550
- " print(f\" Failed: {answer}\")\n",
551
  " continue\n",
552
  " \n",
553
- " print(f\" Response time: {response_time:.2f}s\")\n",
554
  " \n",
555
  " # Get expected answer\n",
556
  " expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
@@ -584,14 +518,14 @@
584
  " avg_citation = sum(r['Citation_Score'] for r in model_results) / len(model_results)\n",
585
  " avg_completeness = sum(r['Completeness_Score'] for r in model_results) / len(model_results)\n",
586
  " \n",
587
- " print(f\"\\n 📊 {model_name} Summary:\")\n",
588
  " print(f\" Avg Response Time: {avg_response_time:.2f}s\")\n",
589
  " print(f\" Avg Similarity: {avg_similarity:.1f}%\")\n",
590
  " print(f\" Avg Citation Score: {avg_citation:.1f}%\")\n",
591
  " print(f\" Avg Completeness: {avg_completeness:.1f}%\")\n",
592
  "\n",
593
  "print(f\"\\n{'='*80}\")\n",
594
- "print(\" Benchmarking complete!\")\n",
595
  "print(f\"{'='*80}\")"
596
  ]
597
  },
@@ -604,26 +538,9 @@
604
  },
605
  {
606
  "cell_type": "code",
607
- "execution_count": 10,
608
  "metadata": {},
609
- "outputs": [
610
- {
611
- "name": "stdout",
612
- "output_type": "stream",
613
- "text": [
614
- "\n",
615
- "====================================================================================================\n",
616
- "📊 LLM BENCHMARKING RESULTS - MODEL SUMMARY\n",
617
- "====================================================================================================\n",
618
- " Response_Time Similarity Citation_Score Completeness_Score CER WER Open_Source Architecture_Score Quality_Score\n",
619
- "Model \n",
620
- "Llama-4-Maverick-17B 3.98 0.0 84.0 100.00 330.97 378.42 True High 59.40\n",
621
- "GPT-4.1 5.95 0.0 84.0 93.54 755.19 780.64 False Medium 57.46\n",
622
- "DeepSeek-R1 10.80 0.0 80.0 67.73 855.43 992.02 True High 48.32\n",
623
- "====================================================================================================\n"
624
- ]
625
- }
626
- ],
627
  "source": [
628
  "# Create DataFrame\n",
629
  "df = pd.DataFrame(results)\n",
@@ -652,17 +569,77 @@
652
  "\n",
653
  "# Display summary table\n",
654
  "print(\"\\n\" + \"=\"*100)\n",
655
- "print(\"📊 LLM BENCHMARKING RESULTS - MODEL SUMMARY\")\n",
656
  "print(\"=\"*100)\n",
657
  "print(model_summary.to_string())\n",
658
  "print(\"=\"*100)"
659
  ]
660
  },
661
  {
662
- "cell_type": "markdown",
663
  "metadata": {},
664
  "source": [
665
- "## 8. Visualizations"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  ]
667
  },
668
  {
@@ -670,71 +647,6 @@
670
  "execution_count": null,
671
  "metadata": {},
672
  "outputs": [],
673
- "source": "# Create comprehensive visualization\nimport os\nfrom pathlib import Path\n\n# Create output directory\noutput_dir = Path('output/llm_benchmark')\noutput_dir.mkdir(parents=True, exist_ok=True)\n\nfig, axes = plt.subplots(2, 3, figsize=(18, 12))\n\nmodels = model_summary.index.tolist()\ncolors = sns.color_palette('husl', len(models))\n\n# 1. Overall Quality Score\nax1 = axes[0, 0]\nbars1 = ax1.barh(models, model_summary['Quality_Score'], color=colors)\nax1.set_xlabel('Quality Score (Higher is Better)', fontsize=11)\nax1.set_title('Overall Quality Score\\n(Similarity 35% + Citation 35% + Completeness 30%)', \n fontsize=12, fontweight='bold')\nax1.set_xlim(0, 100)\nfor i, (model, score) in enumerate(zip(models, model_summary['Quality_Score'])):\n ax1.text(score + 1, i, f'{score:.1f}', va='center', fontsize=10, fontweight='bold')\n\n# 2. Answer Similarity (Accuracy)\nax2 = axes[0, 1]\nax2.barh(models, model_summary['Similarity'], color=colors)\nax2.set_xlabel('Similarity to Expected Answer (%)', fontsize=11)\nax2.set_title('Answer Accuracy', fontsize=12, fontweight='bold')\nax2.set_xlim(0, 100)\nfor i, (model, score) in enumerate(zip(models, model_summary['Similarity'])):\n ax2.text(score + 1, i, f'{score:.1f}%', va='center', fontsize=9)\n\n# 3. Citation Quality\nax3 = axes[0, 2]\nax3.barh(models, model_summary['Citation_Score'], color=colors)\nax3.set_xlabel('Citation Score (%)', fontsize=11)\nax3.set_title('Citation Quality\\n(PDF names + Page numbers)', fontsize=12, fontweight='bold')\nax3.set_xlim(0, 100)\nfor i, (model, score) in enumerate(zip(models, model_summary['Citation_Score'])):\n ax3.text(score + 1, i, f'{score:.1f}%', va='center', fontsize=9)\n\n# 4. 
Response Time\nax4 = axes[1, 0]\nax4.barh(models, model_summary['Response_Time'], color=colors)\nax4.set_xlabel('Response Time (seconds - Lower is Better)', fontsize=11)\nax4.set_title('Speed Performance', fontsize=12, fontweight='bold')\nfor i, (model, time) in enumerate(zip(models, model_summary['Response_Time'])):\n ax4.text(time + 0.1, i, f'{time:.2f}s', va='center', fontsize=9)\n\n# 5. Completeness\nax5 = axes[1, 1]\nax5.barh(models, model_summary['Completeness_Score'], color=colors)\nax5.set_xlabel('Completeness Score (%)', fontsize=11)\nax5.set_title('Answer Completeness', fontsize=12, fontweight='bold')\nax5.set_xlim(0, 100)\nfor i, (model, score) in enumerate(zip(models, model_summary['Completeness_Score'])):\n ax5.text(score + 1, i, f'{score:.1f}%', va='center', fontsize=9)\n\n# 6. Error Rates (CER vs WER)\nax6 = axes[1, 2]\nx = range(len(models))\nwidth = 0.35\nax6.bar([i - width/2 for i in x], model_summary['CER'], width, label='CER', alpha=0.8)\nax6.bar([i + width/2 for i in x], model_summary['WER'], width, label='WER', alpha=0.8)\nax6.set_ylabel('Error Rate (% - Lower is Better)', fontsize=11)\nax6.set_title('Error Rates', fontsize=12, fontweight='bold')\nax6.set_xticks(x)\nax6.set_xticklabels(models, rotation=45, ha='right')\nax6.legend()\nax6.grid(axis='y', alpha=0.3)\n\nplt.tight_layout()\nplt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\nplt.show()\n\nprint(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
674
- },
675
- {
676
- "cell_type": "markdown",
677
- "metadata": {},
678
- "source": [
679
- "## 9. Final Rankings and Recommendations"
680
- ]
681
- },
682
- {
683
- "cell_type": "code",
684
- "execution_count": 12,
685
- "metadata": {},
686
- "outputs": [
687
- {
688
- "name": "stdout",
689
- "output_type": "stream",
690
- "text": [
691
- "\n",
692
- "====================================================================================================\n",
693
- "🏆 FINAL RANKINGS\n",
694
- "====================================================================================================\n",
695
- " Rank Quality_Score Similarity Citation_Score Completeness_Score Response_Time Open_Source Architecture_Score\n",
696
- "Model \n",
697
- "Llama-4-Maverick-17B 1 59.40 0.0 84.0 100.00 3.98 True High\n",
698
- "GPT-4.1 2 57.46 0.0 84.0 93.54 5.95 False Medium\n",
699
- "DeepSeek-R1 3 48.32 0.0 80.0 67.73 10.80 True High\n",
700
- "====================================================================================================\n",
701
- "\n",
702
- "====================================================================================================\n",
703
- "💡 RECOMMENDATIONS FOR HACKATHON\n",
704
- "====================================================================================================\n",
705
- "\n",
706
- "🥇 Best Overall Quality: Llama-4-Maverick-17B\n",
707
- " Quality Score: 59.4%\n",
708
- " Similarity: 0.0%\n",
709
- " Citation Score: 84.0%\n",
710
- " Response Time: 3.98s\n",
711
- " Open Source: True\n",
712
- " Architecture Score: High\n",
713
- "\n",
714
- "🔓 Best Open-Source Model: Llama-4-Maverick-17B\n",
715
- " Quality Score: 59.4%\n",
716
- " Architecture Score: High (Better for hackathon!)\n",
717
- " Response Time: 3.98s\n",
718
- "\n",
719
- "⚡ Fastest Model: Llama-4-Maverick-17B\n",
720
- " Response Time: 3.98s\n",
721
- " Quality Score: 59.4%\n",
722
- "\n",
723
- "====================================================================================================\n",
724
- "📝 FINAL RECOMMENDATION\n",
725
- "====================================================================================================\n",
726
- "\n",
727
- "Scoring Breakdown:\n",
728
- " - LLM Quality: 30% of total hackathon score\n",
729
- " - Architecture: 20% of total hackathon score (open-source preferred!)\n",
730
- "\n",
731
- "Best Choice:\n",
732
- " ✅ Llama-4-Maverick-17B - Best balance of quality and architecture score\n",
733
- " Only 0.0% quality drop for higher architecture score!\n",
734
- "====================================================================================================\n"
735
- ]
736
- }
737
- ],
738
  "source": [
739
  "# Create rankings table\n",
740
  "rankings = model_summary[[\n",
@@ -745,7 +657,7 @@
745
  "rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
746
  "\n",
747
  "print(\"\\n\" + \"=\"*100)\n",
748
- "print(\"🏆 FINAL RANKINGS\")\n",
749
  "print(\"=\"*100)\n",
750
  "print(rankings.to_string())\n",
751
  "print(\"=\"*100)\n",
@@ -756,10 +668,10 @@
756
  "fastest = model_summary['Response_Time'].idxmin()\n",
757
  "\n",
758
  "print(\"\\n\" + \"=\"*100)\n",
759
- "print(\"💡 RECOMMENDATIONS FOR HACKATHON\")\n",
760
  "print(\"=\"*100)\n",
761
  "\n",
762
- "print(f\"\\n🥇 Best Overall Quality: {best_overall}\")\n",
763
  "print(f\" Quality Score: {model_summary.loc[best_overall, 'Quality_Score']:.1f}%\")\n",
764
  "print(f\" Similarity: {model_summary.loc[best_overall, 'Similarity']:.1f}%\")\n",
765
  "print(f\" Citation Score: {model_summary.loc[best_overall, 'Citation_Score']:.1f}%\")\n",
@@ -768,113 +680,61 @@
768
  "print(f\" Architecture Score: {model_summary.loc[best_overall, 'Architecture_Score']}\")\n",
769
  "\n",
770
  "if best_open_source:\n",
771
- " print(f\"\\n🔓 Best Open-Source Model: {best_open_source}\")\n",
772
  " print(f\" Quality Score: {model_summary.loc[best_open_source, 'Quality_Score']:.1f}%\")\n",
773
  " print(f\" Architecture Score: {model_summary.loc[best_open_source, 'Architecture_Score']} (Better for hackathon!)\")\n",
774
  " print(f\" Response Time: {model_summary.loc[best_open_source, 'Response_Time']:.2f}s\")\n",
775
  "\n",
776
- "print(f\"\\n Fastest Model: {fastest}\")\n",
777
  "print(f\" Response Time: {model_summary.loc[fastest, 'Response_Time']:.2f}s\")\n",
778
  "print(f\" Quality Score: {model_summary.loc[fastest, 'Quality_Score']:.1f}%\")\n",
779
  "\n",
780
  "print(\"\\n\" + \"=\"*100)\n",
781
- "print(\"📝 FINAL RECOMMENDATION\")\n",
782
  "print(\"=\"*100)\n",
783
  "print(\"\\nScoring Breakdown:\")\n",
784
  "print(\" - LLM Quality: 30% of total hackathon score\")\n",
785
  "print(\" - Architecture: 20% of total hackathon score (open-source preferred!)\")\n",
786
  "print(\"\\nBest Choice:\")\n",
787
  "if best_open_source and model_summary.loc[best_open_source, 'Quality_Score'] >= model_summary.loc[best_overall, 'Quality_Score'] * 0.9:\n",
788
- " print(f\" {best_open_source} - Best balance of quality and architecture score\")\n",
789
  " print(f\" Only {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality drop for higher architecture score!\")\n",
790
  "else:\n",
791
- " print(f\" {best_overall} - Highest quality, use if quality gap is significant\")\n",
792
  " if best_open_source:\n",
793
- " print(f\" ⚠️ Consider {best_open_source} for higher architecture score (trade-off: {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality)\")\n",
794
  "\n",
795
  "print(\"=\"*100)"
796
  ]
797
  },
798
  {
799
- "cell_type": "markdown",
800
  "metadata": {},
801
  "source": [
802
- "## 10. Export Results"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  ]
804
  },
805
- {
806
- "cell_type": "code",
807
- "execution_count": null,
808
- "metadata": {},
809
- "outputs": [],
810
- "source": "# Save results\nfrom pathlib import Path\n\noutput_dir = Path('output/llm_benchmark')\noutput_dir.mkdir(parents=True, exist_ok=True)\n\ndf.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\nmodel_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\nrankings.to_csv(output_dir / 'rankings.csv', index=False, encoding='utf-8')\n\nprint(\"\\n✅ Results exported to output/llm_benchmark/:\")\nprint(\" - detailed_results.csv (all questions and answers)\")\nprint(\" - summary.csv (model averages)\")\nprint(\" - rankings.csv (final rankings)\")\nprint(\" - results.png (visualizations)\")"
811
- },
812
  {
813
  "cell_type": "markdown",
814
  "metadata": {},
815
  "source": [
816
  "## 11. Sample Answer Comparison"
817
  ]
818
- },
819
- {
820
- "cell_type": "code",
821
- "execution_count": 14,
822
- "metadata": {},
823
- "outputs": [
824
- {
825
- "name": "stdout",
826
- "output_type": "stream",
827
- "text": [
828
- "\n",
829
- "====================================================================================================\n",
830
- "📝 SAMPLE ANSWER COMPARISON - Example1\n",
831
- "====================================================================================================\n",
832
- "\n",
833
- "❓ Question: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas amillərin inteqrasiyası tələb olunur?\n",
834
- "\n",
835
- "✅ Expected Answer:\n",
836
- "Daha az quyu ilə daha çox hasilat əldə etmək üçün düzgün seçilmiş texnoloji inteqrasiya (horizontal və çoxtərəfli qazma texnikaları) və qazma məhlullarının səmərəli idarə edilməsi tələb olunur. Bu yanaşma həm iqtisadi, həm də ekoloji baxımdan üstünlük yaradır.\n",
837
- "\n",
838
- "----------------------------------------------------------------------------------------------------\n",
839
- "\n",
840
- "🤖 Llama-4-Maverick-17B (Quality: 59.4%, Time: 4.39s):\n",
841
- "Daha az quyu ilə daha çox hasilat əldə etmək üçün düzgün seçilmiş texnoloji inteqrasiya və qazma məhlullarının səmərəli idarəsi əsas amillərdir. Bu, Document 1 (document_11.pdf, Səhifə 3)-də qeyd olun...\n",
842
- "----------------------------------------------------------------------------------------------------\n",
843
- "\n",
844
- "🤖 DeepSeek-R1 (Quality: 48.3%, Time: 10.00s):\n",
845
- "<think>\n",
846
- "Okay, let's tackle this question. The user is asking about the main factors that need to be integrated to achieve more production with fewer wells. They provided three documents, so I need to ...\n",
847
- "----------------------------------------------------------------------------------------------------\n",
848
- "\n",
849
- "🤖 GPT-4.1 (Quality: 57.5%, Time: 6.66s):\n",
850
- "Daha az quyu ilə daha çox hasilat əldə etmək üçün bir neçə əsas amilin inteqrasiyası tələb olunur. Bu amillər aşağıdakı kimi sistematik şəkildə sənəd mənbələrinə istinadla izah olunur:\n",
851
- "\n",
852
- "1. **Düzgün se...\n",
853
- "----------------------------------------------------------------------------------------------------\n",
854
- "====================================================================================================\n"
855
- ]
856
- }
857
- ],
858
- "source": [
859
- "# Show sample answers for first question\n",
860
- "sample_question = 'Example1'\n",
861
- "sample_results = df[df['Question'] == sample_question]\n",
862
- "\n",
863
- "print(\"\\n\" + \"=\"*100)\n",
864
- "print(f\"📝 SAMPLE ANSWER COMPARISON - {sample_question}\")\n",
865
- "print(\"=\"*100)\n",
866
- "\n",
867
- "print(f\"\\n❓ Question: {questions[sample_question][0]['content']}\")\n",
868
- "print(f\"\\n✅ Expected Answer:\\n{expected_answers[sample_question]['Answer']}\")\n",
869
- "print(\"\\n\" + \"-\"*100)\n",
870
- "\n",
871
- "for _, row in sample_results.iterrows():\n",
872
- " print(f\"\\n🤖 {row['Model']} (Quality: {model_summary.loc[row['Model'], 'Quality_Score']:.1f}%, Time: {row['Response_Time']:.2f}s):\")\n",
873
- " print(f\"{row['Answer']}\")\n",
874
- " print(\"-\"*100)\n",
875
- "\n",
876
- "print(\"=\"*100)"
877
- ]
878
  }
879
  ],
880
  "metadata": {
 
53
  "name": "stdout",
54
  "output_type": "stream",
55
  "text": [
56
+ "\u2705 Libraries loaded successfully\n"
57
  ]
58
  }
59
  ],
 
78
  "sns.set_style('whitegrid')\n",
79
  "plt.rcParams['figure.figsize'] = (14, 8)\n",
80
  "\n",
81
+ "print(\"\u2705 Libraries loaded successfully\")"
82
  ]
83
  },
84
  {
85
+ "cell_type": "code",
86
+ "execution_count": 3,
87
  "metadata": {},
88
+ "outputs": [
89
+ {
90
+ "name": "stdout",
91
+ "output_type": "stream",
92
+ "text": [
93
+ "\u2705 Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
94
+ "\u2705 Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n",
95
+ "\u2705 Output directory: /Users/ismatsamadov/SOCAR_Hackathon/output\n"
96
+ ]
97
+ }
98
+ ],
99
  "source": [
100
+ "# Auto-detect project root (works from any directory)\n",
101
+ "import os\n",
102
+ "from pathlib import Path\n",
103
+ "\n",
104
+ "if Path('data').exists() and Path('docs').exists():\n",
105
+ " # Already in project root\n",
106
+ " PROJECT_ROOT = Path.cwd()\n",
107
+ "elif Path('../data').exists() and Path('../docs').exists():\n",
108
+ " # In notebooks/ subdirectory\n",
109
+ " PROJECT_ROOT = Path.cwd().parent\n",
110
+ "else:\n",
111
+ " # Fallback: try to find project root\n",
112
+ " current = Path.cwd()\n",
113
+ " while current != current.parent:\n",
114
+ " if (current / 'data').exists() and (current / 'docs').exists():\n",
115
+ " PROJECT_ROOT = current\n",
116
+ " break\n",
117
+ " current = current.parent\n",
118
+ " else:\n",
119
+ " PROJECT_ROOT = Path.cwd()\n",
120
+ "\n",
121
+ "# Define all paths relative to project root\n",
122
+ "DATA_DIR = PROJECT_ROOT / 'data'\n",
123
+ "DOCS_DIR = PROJECT_ROOT / 'docs'\n",
124
+ "OUTPUT_DIR = PROJECT_ROOT / 'output'\n",
125
+ "\n",
126
+ "print(f\"\u2705 Project root: {PROJECT_ROOT}\")\n",
127
+ "print(f\"\u2705 Docs directory: {DOCS_DIR}\")\n",
128
+ "print(f\"\u2705 Output directory: {OUTPUT_DIR}\")"
129
  ]
130
  },
131
  {
132
  "cell_type": "code",
133
+ "execution_count": 4,
134
  "metadata": {},
135
  "outputs": [
136
  {
 
140
  "Loaded 5 test cases\n",
141
  "\n",
142
  "Test Questions:\n",
143
+ "1. Example1: Daha az quyu il\u0259 daha \u00e7ox hasilat \u0259ld\u0259 etm\u0259k \u00fc\u00e7\u00fcn hans\u0131 \u0259sas amill\u0259rin inteqrasiyas\u0131 t\u0259l\u0259b olunur?...\n",
144
+ "2. Example2: Q\u0259rbi Ab\u015feron yata\u011f\u0131nda suvurma t\u0259dbirl\u0259ri hans\u0131 tarixd\u0259 v\u0259 hans\u0131 layda t\u0259tbiq edilmi\u015fdir v\u0259 bunun m...\n",
145
+ "3. Example3: Pirallah\u0131 strukturunda 1253 n\u00f6mr\u0259li quyudan g\u00f6t\u00fcr\u00fclm\u00fc\u015f n\u00fcmun\u0259l\u0259rd\u0259 SiO2 v\u0259 CaO oksidl\u0259ri aras\u0131nda ha...\n",
146
+ "4. Example4: Bak\u0131 arxipelaq\u0131 (BA) v\u0259 A\u015fa\u011f\u0131 K\u00fcr \u00e7\u00f6k\u0259kliyi (AK\u00c7) \u00fc\u00e7\u00fcn geotemperatur x\u0259rit\u0259l\u0259rin\u0259 \u0259sas\u0259n neft v\u0259 qaz...\n",
147
+ "5. Example5: Bu zonada hans\u0131 prosesl\u0259r ba\u015f verir?...\n"
148
  ]
149
  }
150
  ],
151
  "source": [
152
+ "# Load sample questions - using dynamic paths\n",
153
+ "with open(DOCS_DIR / 'sample_questions.json', 'r', encoding='utf-8') as f:\n",
154
  " questions = json.load(f)\n",
155
  "\n",
156
+ "# Load expected answers - using dynamic paths\n",
157
+ "with open(DOCS_DIR / 'sample_answers.json', 'r', encoding='utf-8') as f:\n",
158
  " expected_answers = json.load(f)\n",
159
  "\n",
160
  "print(f\"Loaded {len(questions)} test cases\")\n",
 
173
  },
174
  {
175
  "cell_type": "code",
176
+ "execution_count": null,
177
  "metadata": {},
178
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  "source": [
180
  "# Initialize Pinecone\n",
181
  "pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
 
184
  "# Initialize embedding model (same as used for ingestion)\n",
185
  "embed_model = SentenceTransformer('BAAI/bge-large-en-v1.5')\n",
186
  "\n",
187
+ "print(f\"\u2705 Vector DB connected: {index.describe_index_stats()}\")\n",
188
+ "print(f\"\u2705 Embedding model loaded: {embed_model}\")"
189
  ]
190
  },
191
  {
 
197
  },
198
  {
199
  "cell_type": "code",
200
+ "execution_count": null,
201
  "metadata": {},
202
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
203
  "source": [
204
  "def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
205
  " \"\"\"\n",
 
228
  " return documents\n",
229
  "\n",
230
  "# Test retrieval\n",
231
+ "test_query = \"Pal\u00e7\u0131q vulkanlar\u0131n\u0131n t\u0259sir radiusu n\u0259 q\u0259d\u0259rdir?\"\n",
232
  "test_docs = retrieve_documents(test_query)\n",
233
+ "print(f\"\\n\u2705 Retrieved {len(test_docs)} documents for test query\")\n",
234
  "print(f\"Top result: {test_docs[0]['pdf_name']}, page {test_docs[0]['page_number']} (score: {test_docs[0]['score']:.3f})\")"
235
  ]
236
  },
 
246
  "execution_count": null,
247
  "metadata": {},
248
  "outputs": [],
249
+ "source": [
250
+ "# Initialize Azure OpenAI\n",
251
+ "azure_client = AzureOpenAI(\n",
252
+ " api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
253
+ " api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
254
+ " azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
255
+ ")\n",
256
+ "\n",
257
+ "LLM_MODELS = {\n",
258
+ " 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
259
+ " 'DeepSeek-R1': 'DeepSeek-R1',\n",
260
+ " 'GPT-4.1': 'gpt-4.1',\n",
261
+ " 'GPT-5-mini': 'gpt-5-mini',\n",
262
+ " 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n",
263
+ "}\n",
264
+ "\n",
265
+ "def generate_answer(model_name: str, query: str, documents: List[Dict], \n",
266
+ " temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n",
267
+ " \"\"\"\n",
268
+ " Generate answer using specified LLM model.\n",
269
+ " Returns: (answer, response_time)\n",
270
+ " \"\"\"\n",
271
+ " # Build context from retrieved documents\n",
272
+ " context_parts = []\n",
273
+ " for i, doc in enumerate(documents, 1):\n",
274
+ " context_parts.append(\n",
275
+ " f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n",
276
+ " )\n",
277
+ " context = \"\\n\\n\".join(context_parts)\n",
278
+ " \n",
279
+ " # Create prompt\n",
280
+ " prompt = f\"\"\"Siz SOCAR-\u0131n tarixi neft v\u0259 qaz s\u0259n\u0259dl\u0259ri \u00fczr\u0259 m\u00fct\u0259x\u0259ssis k\u00f6m\u0259k\u00e7isisiniz.\n",
281
+ "\n",
282
+ "Kontekst (\u0259laq\u0259li s\u0259n\u0259dl\u0259r):\n",
283
+ "{context}\n",
284
+ "\n",
285
+ "Sual: {query}\n",
286
+ "\n",
287
+ "\u018ftrafl\u0131 cavab verin v\u0259 m\u00fctl\u0259q s\u0259n\u0259d m\u0259nb\u0259l\u0259rin\u0259 istinad edin (PDF ad\u0131 v\u0259 s\u0259hif\u0259 n\u00f6mr\u0259si il\u0259).\n",
288
+ "Cavab\u0131n\u0131z d\u0259qiq, faktlara \u0259saslanan v\u0259 kontekst m\u0259lumatlar\u0131ndan istifad\u0259 ed\u0259n olmal\u0131d\u0131r.\"\"\"\n",
289
+ " \n",
290
+ " # Get model deployment\n",
291
+ " deployment = MODELS[model_name]['deployment']\n",
292
+ " \n",
293
+ " try:\n",
294
+ " start_time = time.time()\n",
295
+ " \n",
296
+ " # GPT-5 models use max_completion_tokens, others use max_tokens\n",
297
+ " if deployment.startswith('gpt-5'):\n",
298
+ " response = azure_client.chat.completions.create(\n",
299
+ " model=deployment,\n",
300
+ " messages=[\n",
301
+ " {\"role\": \"user\", \"content\": prompt}\n",
302
+ " ],\n",
303
+ " temperature=temperature,\n",
304
+ " max_completion_tokens=max_tokens\n",
305
+ " )\n",
306
+ " else:\n",
307
+ " response = azure_client.chat.completions.create(\n",
308
+ " model=deployment,\n",
309
+ " messages=[\n",
310
+ " {\"role\": \"user\", \"content\": prompt}\n",
311
+ " ],\n",
312
+ " temperature=temperature,\n",
313
+ " max_tokens=max_tokens\n",
314
+ " )\n",
315
+ " \n",
316
+ " response_time = time.time() - start_time\n",
317
+ " answer = response.choices[0].message.content\n",
318
+ " \n",
319
+ " return answer, response_time\n",
320
+ " \n",
321
+ " except Exception as e:\n",
322
+ " return f\"ERROR: {str(e)}\", 0.0\n",
323
+ "\n",
324
+ "print(f\"\\n\u2705 Configured {len(LLM_MODELS)} LLM models for testing\")"
325
+ ]
326
  },
327
  {
328
  "cell_type": "markdown",
 
333
  },
334
  {
335
  "cell_type": "code",
336
+ "execution_count": null,
337
  "metadata": {},
338
+ "outputs": [],
 
 
 
 
 
 
 
 
339
  "source": [
340
  "def normalize_text(text: str) -> str:\n",
341
  " \"\"\"Normalize text for comparison.\"\"\"\n",
 
382
  " cited_pages = sum(1 for page in page_numbers if page in answer)\n",
383
  " \n",
384
  " # Check for source keywords\n",
385
+ " source_keywords = ['m\u0259nb\u0259', 's\u0259n\u0259d', 's\u0259hif\u0259', 'pdf', 'document', 'page', 'source']\n",
386
  " has_source_ref = any(kw in answer.lower() for kw in source_keywords)\n",
387
  " \n",
388
  " citation_score = (\n",
 
419
  " 'Char_Count': char_count\n",
420
  " }\n",
421
  "\n",
422
+ "print(\"\u2705 Evaluation functions ready\")"
423
  ]
424
  },
425
  {
 
431
  },
432
  {
433
  "cell_type": "code",
434
+ "execution_count": null,
435
  "metadata": {},
436
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
437
  "source": [
438
  "# Select models to test (you can comment out models to skip)\n",
439
  "MODELS_TO_TEST = [\n",
 
453
  },
454
  {
455
  "cell_type": "code",
456
+ "execution_count": null,
457
  "metadata": {},
458
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  "source": [
460
  "# Run benchmark\n",
461
  "results = []\n",
 
481
  " answer, response_time = generate_answer(model_name, query, documents)\n",
482
  " \n",
483
  " if answer.startswith('ERROR'):\n",
484
+ " print(f\" \u274c Failed: {answer}\")\n",
485
  " continue\n",
486
  " \n",
487
+ " print(f\" \u2705 Response time: {response_time:.2f}s\")\n",
488
  " \n",
489
  " # Get expected answer\n",
490
  " expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
 
518
  " avg_citation = sum(r['Citation_Score'] for r in model_results) / len(model_results)\n",
519
  " avg_completeness = sum(r['Completeness_Score'] for r in model_results) / len(model_results)\n",
520
  " \n",
521
+ " print(f\"\\n \ud83d\udcca {model_name} Summary:\")\n",
522
  " print(f\" Avg Response Time: {avg_response_time:.2f}s\")\n",
523
  " print(f\" Avg Similarity: {avg_similarity:.1f}%\")\n",
524
  " print(f\" Avg Citation Score: {avg_citation:.1f}%\")\n",
525
  " print(f\" Avg Completeness: {avg_completeness:.1f}%\")\n",
526
  "\n",
527
  "print(f\"\\n{'='*80}\")\n",
528
+ "print(\"\u2705 Benchmarking complete!\")\n",
529
  "print(f\"{'='*80}\")"
530
  ]
531
  },
 
538
  },
539
  {
540
  "cell_type": "code",
541
+ "execution_count": null,
542
  "metadata": {},
543
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
544
  "source": [
545
  "# Create DataFrame\n",
546
  "df = pd.DataFrame(results)\n",
 
569
  "\n",
570
  "# Display summary table\n",
571
  "print(\"\\n\" + \"=\"*100)\n",
572
+ "print(\"\ud83d\udcca LLM BENCHMARKING RESULTS - MODEL SUMMARY\")\n",
573
  "print(\"=\"*100)\n",
574
  "print(model_summary.to_string())\n",
575
  "print(\"=\"*100)"
576
  ]
577
  },
578
  {
579
+ "cell_type": "code",
580
  "metadata": {},
581
  "source": [
582
+ "# Create comprehensive visualization\n",
583
+ "import os\n",
584
+ "from pathlib import Path\n",
585
+ "\n",
586
+ "# Create output directory - using dynamic path\n",
587
+ "output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
588
+ "output_dir.mkdir(parents=True, exist_ok=True)\n",
589
+ "\n",
590
+ "fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
591
+ "\n",
592
+ "models = df['Model'].tolist()\n",
593
+ "colors = sns.color_palette('viridis', len(models))\n",
594
+ "\n",
595
+ "# 1. CSR - Character Success Rate (MAIN METRIC)\n",
596
+ "ax1 = axes[0, 0]\n",
597
+ "bars1 = ax1.barh(models, df['CSR'], color=colors)\n",
598
+ "ax1.set_xlabel('CSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
599
+ "ax1.set_title('Character Success Rate (CSR)\\n\ud83c\udfc6 HACKATHON PRIMARY METRIC', \n",
600
+ " fontsize=14, fontweight='bold')\n",
601
+ "ax1.set_xlim(0, 100)\n",
602
+ "for i, (model, csr) in enumerate(zip(models, df['CSR'])):\n",
603
+ " ax1.text(csr + 1, i, f'{csr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
604
+ "ax1.axvline(x=90, color='green', linestyle='--', alpha=0.3, label='Excellent (>90%)')\n",
605
+ "ax1.axvline(x=80, color='orange', linestyle='--', alpha=0.3, label='Good (>80%)')\n",
606
+ "ax1.legend(fontsize=9)\n",
607
+ "\n",
608
+ "# 2. WSR - Word Success Rate\n",
609
+ "ax2 = axes[0, 1]\n",
610
+ "bars2 = ax2.barh(models, df['WSR'], color=colors)\n",
611
+ "ax2.set_xlabel('WSR (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
612
+ "ax2.set_title('Word Success Rate (WSR)', fontsize=14, fontweight='bold')\n",
613
+ "ax2.set_xlim(0, 100)\n",
614
+ "for i, (model, wsr) in enumerate(zip(models, df['WSR'])):\n",
615
+ " ax2.text(wsr + 1, i, f'{wsr:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
616
+ "\n",
617
+ "# 3. Response Time\n",
618
+ "ax3 = axes[1, 0]\n",
619
+ "bars3 = ax3.barh(models, df['Response_Time'], color=colors)\n",
620
+ "ax3.set_xlabel('Total Time (seconds) - Lower is Better', fontsize=12, fontweight='bold')\n",
621
+ "ax3.set_title('Processing Speed', fontsize=14, fontweight='bold')\n",
622
+ "for i, (model, time_val) in enumerate(zip(models, df['Response_Time'])):\n",
623
+ " ax3.text(time_val + 0.5, i, f'{time_val:.1f}s', va='center', fontsize=11)\n",
624
+ "\n",
625
+ "# 4. Error Rates Comparison\n",
626
+ "ax4 = axes[1, 1]\n",
627
+ "x = range(len(models))\n",
628
+ "width = 0.35\n",
629
+ "ax4.bar([i - width/2 for i in x], df['CER'], width, label='CER', color='coral', alpha=0.8)\n",
630
+ "ax4.bar([i + width/2 for i in x], df['WER'], width, label='WER', color='skyblue', alpha=0.8)\n",
631
+ "ax4.set_ylabel('Error Rate (%) - Lower is Better', fontsize=12, fontweight='bold')\n",
632
+ "ax4.set_title('Error Rates', fontsize=14, fontweight='bold')\n",
633
+ "ax4.set_xticks(x)\n",
634
+ "ax4.set_xticklabels(models, rotation=45, ha='right')\n",
635
+ "ax4.legend(fontsize=11)\n",
636
+ "ax4.grid(axis='y', alpha=0.3)\n",
637
+ "\n",
638
+ "plt.tight_layout()\n",
639
+ "plt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\n",
640
+ "plt.show()\n",
641
+ "\n",
642
+ "print(f\"\\n\u2705 Visualization saved to '{output_dir}/results.png'\")"
643
  ]
644
  },
645
  {
 
647
  "execution_count": null,
648
  "metadata": {},
649
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
650
  "source": [
651
  "# Create rankings table\n",
652
  "rankings = model_summary[[\n",
 
657
  "rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
658
  "\n",
659
  "print(\"\\n\" + \"=\"*100)\n",
660
+ "print(\"\ud83c\udfc6 FINAL RANKINGS\")\n",
661
  "print(\"=\"*100)\n",
662
  "print(rankings.to_string())\n",
663
  "print(\"=\"*100)\n",
 
668
  "fastest = model_summary['Response_Time'].idxmin()\n",
669
  "\n",
670
  "print(\"\\n\" + \"=\"*100)\n",
671
+ "print(\"\ud83d\udca1 RECOMMENDATIONS FOR HACKATHON\")\n",
672
  "print(\"=\"*100)\n",
673
  "\n",
674
+ "print(f\"\\n\ud83e\udd47 Best Overall Quality: {best_overall}\")\n",
675
  "print(f\" Quality Score: {model_summary.loc[best_overall, 'Quality_Score']:.1f}%\")\n",
676
  "print(f\" Similarity: {model_summary.loc[best_overall, 'Similarity']:.1f}%\")\n",
677
  "print(f\" Citation Score: {model_summary.loc[best_overall, 'Citation_Score']:.1f}%\")\n",
 
680
  "print(f\" Architecture Score: {model_summary.loc[best_overall, 'Architecture_Score']}\")\n",
681
  "\n",
682
  "if best_open_source:\n",
683
+ " print(f\"\\n\ud83d\udd13 Best Open-Source Model: {best_open_source}\")\n",
684
  " print(f\" Quality Score: {model_summary.loc[best_open_source, 'Quality_Score']:.1f}%\")\n",
685
  " print(f\" Architecture Score: {model_summary.loc[best_open_source, 'Architecture_Score']} (Better for hackathon!)\")\n",
686
  " print(f\" Response Time: {model_summary.loc[best_open_source, 'Response_Time']:.2f}s\")\n",
687
  "\n",
688
+ "print(f\"\\n\u26a1 Fastest Model: {fastest}\")\n",
689
  "print(f\" Response Time: {model_summary.loc[fastest, 'Response_Time']:.2f}s\")\n",
690
  "print(f\" Quality Score: {model_summary.loc[fastest, 'Quality_Score']:.1f}%\")\n",
691
  "\n",
692
  "print(\"\\n\" + \"=\"*100)\n",
693
+ "print(\"\ud83d\udcdd FINAL RECOMMENDATION\")\n",
694
  "print(\"=\"*100)\n",
695
  "print(\"\\nScoring Breakdown:\")\n",
696
  "print(\" - LLM Quality: 30% of total hackathon score\")\n",
697
  "print(\" - Architecture: 20% of total hackathon score (open-source preferred!)\")\n",
698
  "print(\"\\nBest Choice:\")\n",
699
  "if best_open_source and model_summary.loc[best_open_source, 'Quality_Score'] >= model_summary.loc[best_overall, 'Quality_Score'] * 0.9:\n",
700
+ " print(f\" \u2705 {best_open_source} - Best balance of quality and architecture score\")\n",
701
  " print(f\" Only {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality drop for higher architecture score!\")\n",
702
  "else:\n",
703
+ " print(f\" \u2705 {best_overall} - Highest quality, use if quality gap is significant\")\n",
704
  " if best_open_source:\n",
705
+ " print(f\" \u26a0\ufe0f Consider {best_open_source} for higher architecture score (trade-off: {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality)\")\n",
706
  "\n",
707
  "print(\"=\"*100)"
708
  ]
709
  },
710
  {
711
+ "cell_type": "code",
712
  "metadata": {},
713
  "source": [
714
+ "# Save results\n",
715
+ "from pathlib import Path\n",
716
+ "\n",
717
+ "# Using dynamic path\n",
718
+ "output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
719
+ "output_dir.mkdir(parents=True, exist_ok=True)\n",
720
+ "\n",
721
+ "df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
722
+ "model_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n",
723
+ "rankings.to_csv(output_dir / 'rankings.csv', index=False, encoding='utf-8')\n",
724
+ "\n",
725
+ "print(\"\\n\u2705 Results exported to output/llm_benchmark/:\")\n",
726
+ "print(\" - detailed_results.csv (all questions and answers)\")\n",
727
+ "print(\" - summary.csv (model averages)\")\n",
728
+ "print(\" - rankings.csv (final rankings)\")\n",
729
+ "print(\" - results.png (visualizations)\")"
730
  ]
731
  },
 
 
 
 
 
 
 
732
  {
733
  "cell_type": "markdown",
734
  "metadata": {},
735
  "source": [
736
  "## 11. Sample Answer Comparison"
737
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
738
  }
739
  ],
740
  "metadata": {
notebooks/llm_benchmark.ipynb.backup ADDED
@@ -0,0 +1,761 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# LLM Benchmarking for SOCAR Hackathon RAG Chatbot\n",
8
+ "\n",
9
+ "This notebook tests different LLM models for the `/llm` endpoint to find the best performer.\n",
10
+ "\n",
11
+ "## Evaluation Criteria (LLM Judge Metrics):\n",
12
+ "- **Accuracy**: Is the answer correct?\n",
13
+ "- **Relevance**: Are retrieved citations relevant?\n",
14
+ "- **Completeness**: Does it fully answer the question?\n",
15
+ "- **Citation Quality**: Proper sources with page numbers?\n",
16
+ "- **Response Time**: Speed of generation\n",
17
+ "\n",
18
+ "## Available LLM Models:\n",
19
+ "1. **Llama-4-Maverick-17B-128E-Instruct-FP8** (Current choice, open-source)\n",
20
+ "2. **DeepSeek-R1** (Open-source reasoning model)\n",
21
+ "3. **GPT-4.1** (Strong general performance)\n",
22
+ "4. **GPT-5, GPT-5-mini**\n",
23
+ "5. **Claude Sonnet 4.5** (Best quality)\n",
24
+ "6. **Claude Opus 4.1**\n",
25
+ "7. **Phi-4-multimodal-instruct**\n",
26
+ "8. **gpt-oss-120b**"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 1,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "# Install required packages\n",
36
+ "# !pip install openai pinecone-client sentence-transformers python-dotenv pandas matplotlib seaborn jiwer"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 2,
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stderr",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "/Users/ismatsamadov/SOCAR_Hackathon/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
49
+ " from .autonotebook import tqdm as notebook_tqdm\n"
50
+ ]
51
+ },
52
+ {
53
+ "name": "stdout",
54
+ "output_type": "stream",
55
+ "text": [
56
+ "\u2705 Libraries loaded successfully\n"
57
+ ]
58
+ }
59
+ ],
60
+ "source": [
61
+ "import os\n",
62
+ "import json\n",
63
+ "import time\n",
64
+ "from typing import Dict, List, Tuple\n",
65
+ "from dotenv import load_dotenv\n",
66
+ "import pandas as pd\n",
67
+ "import matplotlib.pyplot as plt\n",
68
+ "import seaborn as sns\n",
69
+ "from openai import AzureOpenAI\n",
70
+ "from pinecone import Pinecone\n",
71
+ "from sentence_transformers import SentenceTransformer\n",
72
+ "from jiwer import wer, cer\n",
73
+ "\n",
74
+ "# Load environment variables\n",
75
+ "load_dotenv()\n",
76
+ "\n",
77
+ "# Set style\n",
78
+ "sns.set_style('whitegrid')\n",
79
+ "plt.rcParams['figure.figsize'] = (14, 8)\n",
80
+ "\n",
81
+ "print(\"\u2705 Libraries loaded successfully\")"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 3,
87
+ "metadata": {},
88
+ "outputs": [
89
+ {
90
+ "name": "stdout",
91
+ "output_type": "stream",
92
+ "text": [
93
+ "\u2705 Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
94
+ "\u2705 Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n",
95
+ "\u2705 Output directory: /Users/ismatsamadov/SOCAR_Hackathon/output\n"
96
+ ]
97
+ }
98
+ ],
99
+ "source": [
100
+ "# Auto-detect project root (works from any directory)\n",
101
+ "import os\n",
102
+ "from pathlib import Path\n",
103
+ "\n",
104
+ "if Path('data').exists() and Path('docs').exists():\n",
105
+ " # Already in project root\n",
106
+ " PROJECT_ROOT = Path.cwd()\n",
107
+ "elif Path('../data').exists() and Path('../docs').exists():\n",
108
+ " # In notebooks/ subdirectory\n",
109
+ " PROJECT_ROOT = Path.cwd().parent\n",
110
+ "else:\n",
111
+ " # Fallback: try to find project root\n",
112
+ " current = Path.cwd()\n",
113
+ " while current != current.parent:\n",
114
+ " if (current / 'data').exists() and (current / 'docs').exists():\n",
115
+ " PROJECT_ROOT = current\n",
116
+ " break\n",
117
+ " current = current.parent\n",
118
+ " else:\n",
119
+ " PROJECT_ROOT = Path.cwd()\n",
120
+ "\n",
121
+ "# Define all paths relative to project root\n",
122
+ "DATA_DIR = PROJECT_ROOT / 'data'\n",
123
+ "DOCS_DIR = PROJECT_ROOT / 'docs'\n",
124
+ "OUTPUT_DIR = PROJECT_ROOT / 'output'\n",
125
+ "\n",
126
+ "print(f\"\u2705 Project root: {PROJECT_ROOT}\")\n",
127
+ "print(f\"\u2705 Docs directory: {DOCS_DIR}\")\n",
128
+ "print(f\"\u2705 Output directory: {OUTPUT_DIR}\")"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 4,
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "name": "stdout",
138
+ "output_type": "stream",
139
+ "text": [
140
+ "Loaded 5 test cases\n",
141
+ "\n",
142
+ "Test Questions:\n",
143
+ "1. Example1: Daha az quyu il\u0259 daha \u00e7ox hasilat \u0259ld\u0259 etm\u0259k \u00fc\u00e7\u00fcn hans\u0131 \u0259sas amill\u0259rin inteqrasiyas\u0131 t\u0259l\u0259b olunur?...\n",
144
+ "2. Example2: Q\u0259rbi Ab\u015feron yata\u011f\u0131nda suvurma t\u0259dbirl\u0259ri hans\u0131 tarixd\u0259 v\u0259 hans\u0131 layda t\u0259tbiq edilmi\u015fdir v\u0259 bunun m...\n",
145
+ "3. Example3: Pirallah\u0131 strukturunda 1253 n\u00f6mr\u0259li quyudan g\u00f6t\u00fcr\u00fclm\u00fc\u015f n\u00fcmun\u0259l\u0259rd\u0259 SiO2 v\u0259 CaO oksidl\u0259ri aras\u0131nda ha...\n",
146
+ "4. Example4: Bak\u0131 arxipelaq\u0131 (BA) v\u0259 A\u015fa\u011f\u0131 K\u00fcr \u00e7\u00f6k\u0259kliyi (AK\u00c7) \u00fc\u00e7\u00fcn geotemperatur x\u0259rit\u0259l\u0259rin\u0259 \u0259sas\u0259n neft v\u0259 qaz...\n",
147
+ "5. Example5: Bu zonada hans\u0131 prosesl\u0259r ba\u015f verir?...\n"
148
+ ]
149
+ }
150
+ ],
151
+ "source": [
152
+ "# Load sample questions - using dynamic paths\n",
153
+ "with open(DOCS_DIR / 'sample_questions.json', 'r', encoding='utf-8') as f:\n",
154
+ " questions = json.load(f)\n",
155
+ "\n",
156
+ "# Load expected answers - using dynamic paths\n",
157
+ "with open(DOCS_DIR / 'sample_answers.json', 'r', encoding='utf-8') as f:\n",
158
+ " expected_answers = json.load(f)\n",
159
+ "\n",
160
+ "print(f\"Loaded {len(questions)} test cases\")\n",
161
+ "print(\"\\nTest Questions:\")\n",
162
+ "for i, (key, msgs) in enumerate(questions.items(), 1):\n",
163
+ " user_msg = [m for m in msgs if m['role'] == 'user'][-1]\n",
164
+ " print(f\"{i}. {key}: {user_msg['content'][:100]}...\")"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "markdown",
169
+ "metadata": {},
170
+ "source": [
171
+ "## 2. Initialize Vector Database and Embedding Model"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "# Initialize Pinecone\n",
181
+ "pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
182
+ "index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
183
+ "\n",
184
+ "# Initialize embedding model (same as used for ingestion)\n",
185
+ "embed_model = SentenceTransformer('BAAI/bge-large-en-v1.5')\n",
186
+ "\n",
187
+ "print(f\"\u2705 Vector DB connected: {index.describe_index_stats()}\")\n",
188
+ "print(f\"\u2705 Embedding model loaded: {embed_model}\")"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "markdown",
193
+ "metadata": {},
194
+ "source": [
195
+ "## 3. RAG Retrieval Function"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:\n",
205
+ " \"\"\"\n",
206
+ " Retrieve relevant documents from vector database.\n",
207
+ " \"\"\"\n",
208
+ " # Generate query embedding\n",
209
+ " query_embedding = embed_model.encode(query).tolist()\n",
210
+ " \n",
211
+ " # Search vector DB\n",
212
+ " results = index.query(\n",
213
+ " vector=query_embedding,\n",
214
+ " top_k=top_k,\n",
215
+ " include_metadata=True\n",
216
+ " )\n",
217
+ " \n",
218
+ " # Extract documents\n",
219
+ " documents = []\n",
220
+ " for match in results['matches']:\n",
221
+ " documents.append({\n",
222
+ " 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
223
+ " 'page_number': match['metadata'].get('page_number', 0),\n",
224
+ " 'content': match['metadata'].get('text', ''),\n",
225
+ " 'score': match.get('score', 0.0)\n",
226
+ " })\n",
227
+ " \n",
228
+ " return documents\n",
229
+ "\n",
230
+ "# Test retrieval\n",
231
+ "test_query = \"Pal\u00e7\u0131q vulkanlar\u0131n\u0131n t\u0259sir radiusu n\u0259 q\u0259d\u0259rdir?\"\n",
232
+ "test_docs = retrieve_documents(test_query)\n",
233
+ "print(f\"\\n\u2705 Retrieved {len(test_docs)} documents for test query\")\n",
234
+ "print(f\"Top result: {test_docs[0]['pdf_name']}, page {test_docs[0]['page_number']} (score: {test_docs[0]['score']:.3f})\")"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "markdown",
239
+ "metadata": {},
240
+ "source": [
241
+ "## 4. LLM Client Functions"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": null,
247
+ "metadata": {},
248
+ "outputs": [],
249
+ "source": [
250
+ "# Initialize Azure OpenAI\n",
251
+ "azure_client = AzureOpenAI(\n",
252
+ " api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
253
+ " api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
254
+ " azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
255
+ ")\n",
256
+ "\n",
257
+ "# Model registry: deployment name plus metadata read by the benchmark loop\n",
+ "# (generate_answer uses MODELS[name]['deployment']; the results loop reads\n",
+ "# 'open_source' and 'architecture_score'). Metadata values are estimates -\n",
+ "# adjust to the hackathon's actual scoring if needed.\n",
+ "MODELS = {\n",
+ "    'Llama-4-Maverick': {'deployment': 'Llama-4-Maverick-17B-128E-Instruct-FP8', 'open_source': True, 'architecture_score': 100},\n",
+ "    'DeepSeek-R1': {'deployment': 'DeepSeek-R1', 'open_source': True, 'architecture_score': 100},\n",
+ "    'GPT-4.1': {'deployment': 'gpt-4.1', 'open_source': False, 'architecture_score': 50},\n",
+ "    'GPT-5-mini': {'deployment': 'gpt-5-mini', 'open_source': False, 'architecture_score': 50},\n",
+ "    'Claude-Sonnet-4.5': {'deployment': 'claude-sonnet-4-5', 'open_source': False, 'architecture_score': 50},\n",
+ "}\n",
+ "\n",
+ "# Backwards-compatible alias: some cells reference LLM_MODELS by name\n",
+ "LLM_MODELS = MODELS\n",
264
+ "\n",
265
+ "def generate_answer(model_name: str, query: str, documents: List[Dict], \n",
266
+ " temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:\n",
267
+ " \"\"\"\n",
268
+ " Generate answer using specified LLM model.\n",
269
+ " Returns: (answer, response_time)\n",
270
+ " \"\"\"\n",
271
+ " # Build context from retrieved documents\n",
272
+ " context_parts = []\n",
273
+ " for i, doc in enumerate(documents, 1):\n",
274
+ " context_parts.append(\n",
275
+ " f\"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\\n{doc['content']}\"\n",
276
+ " )\n",
277
+ " context = \"\\n\\n\".join(context_parts)\n",
278
+ " \n",
279
+ " # Create prompt\n",
280
+ " prompt = f\"\"\"Siz SOCAR-\u0131n tarixi neft v\u0259 qaz s\u0259n\u0259dl\u0259ri \u00fczr\u0259 m\u00fct\u0259x\u0259ssis k\u00f6m\u0259k\u00e7isisiniz.\n",
281
+ "\n",
282
+ "Kontekst (\u0259laq\u0259li s\u0259n\u0259dl\u0259r):\n",
283
+ "{context}\n",
284
+ "\n",
285
+ "Sual: {query}\n",
286
+ "\n",
287
+ "\u018ftrafl\u0131 cavab verin v\u0259 m\u00fctl\u0259q s\u0259n\u0259d m\u0259nb\u0259l\u0259rin\u0259 istinad edin (PDF ad\u0131 v\u0259 s\u0259hif\u0259 n\u00f6mr\u0259si il\u0259).\n",
288
+ "Cavab\u0131n\u0131z d\u0259qiq, faktlara \u0259saslanan v\u0259 kontekst m\u0259lumatlar\u0131ndan istifad\u0259 ed\u0259n olmal\u0131d\u0131r.\"\"\"\n",
289
+ " \n",
290
+ " # Get model deployment\n",
291
+ " deployment = MODELS[model_name]['deployment']\n",
292
+ " \n",
293
+ " try:\n",
294
+ " start_time = time.time()\n",
295
+ " \n",
296
+ " # GPT-5 models use max_completion_tokens, others use max_tokens\n",
297
+ " if deployment.startswith('gpt-5'):\n",
298
+ " response = azure_client.chat.completions.create(\n",
299
+ " model=deployment,\n",
300
+ " messages=[\n",
301
+ " {\"role\": \"user\", \"content\": prompt}\n",
302
+ " ],\n",
303
+ " temperature=temperature,\n",
304
+ " max_completion_tokens=max_tokens\n",
305
+ " )\n",
306
+ " else:\n",
307
+ " response = azure_client.chat.completions.create(\n",
308
+ " model=deployment,\n",
309
+ " messages=[\n",
310
+ " {\"role\": \"user\", \"content\": prompt}\n",
311
+ " ],\n",
312
+ " temperature=temperature,\n",
313
+ " max_tokens=max_tokens\n",
314
+ " )\n",
315
+ " \n",
316
+ " response_time = time.time() - start_time\n",
317
+ " answer = response.choices[0].message.content\n",
318
+ " \n",
319
+ " return answer, response_time\n",
320
+ " \n",
321
+ " except Exception as e:\n",
322
+ " return f\"ERROR: {str(e)}\", 0.0\n",
323
+ "\n",
324
+ "print(f\"\\n\u2705 Configured {len(LLM_MODELS)} LLM models for testing\")"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "markdown",
329
+ "metadata": {},
330
+ "source": [
331
+ "## 5. Evaluation Metrics"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": null,
337
+ "metadata": {},
338
+ "outputs": [],
339
+ "source": [
340
+ "def normalize_text(text: str) -> str:\n",
341
+ " \"\"\"Normalize text for comparison.\"\"\"\n",
342
+ " import re\n",
343
+ " text = text.lower().strip()\n",
344
+ " text = re.sub(r'\\s+', ' ', text)\n",
345
+ " return text\n",
346
+ "\n",
347
+ "def calculate_answer_similarity(reference: str, hypothesis: str) -> Dict[str, float]:\n",
348
+ " \"\"\"\n",
349
+ " Calculate similarity between generated and expected answer.\n",
350
+ " Lower is better for error rates.\n",
351
+ " \"\"\"\n",
352
+ " ref_norm = normalize_text(reference)\n",
353
+ " hyp_norm = normalize_text(hypothesis)\n",
354
+ " \n",
355
+ " # Character Error Rate\n",
356
+ " cer_score = cer(ref_norm, hyp_norm) * 100\n",
357
+ " \n",
358
+ " # Word Error Rate \n",
359
+ " wer_score = wer(ref_norm, hyp_norm) * 100\n",
360
+ " \n",
361
+ " # Similarity scores (higher is better)\n",
362
+ " similarity = max(0, 100 - wer_score)\n",
363
+ " \n",
364
+ " return {\n",
365
+ " 'CER': round(cer_score, 2),\n",
366
+ " 'WER': round(wer_score, 2),\n",
367
+ " 'Similarity': round(similarity, 2)\n",
368
+ " }\n",
369
+ "\n",
370
+ "def check_citations(answer: str, documents: List[Dict]) -> Dict[str, any]:\n",
371
+ " \"\"\"\n",
372
+ " Check if answer includes proper citations.\n",
373
+ " \"\"\"\n",
374
+ " import re\n",
375
+ " \n",
376
+ " # Check for PDF names\n",
377
+ " pdf_names = [doc['pdf_name'] for doc in documents]\n",
378
+ " cited_pdfs = sum(1 for pdf in pdf_names if pdf.replace('.pdf', '') in answer)\n",
379
+ " \n",
380
+ " # Check for page numbers\n",
381
+ " page_numbers = [str(doc['page_number']) for doc in documents]\n",
382
+ " cited_pages = sum(1 for page in page_numbers if page in answer)\n",
383
+ " \n",
384
+ " # Check for source keywords\n",
385
+ " source_keywords = ['m\u0259nb\u0259', 's\u0259n\u0259d', 's\u0259hif\u0259', 'pdf', 'document', 'page', 'source']\n",
386
+ " has_source_ref = any(kw in answer.lower() for kw in source_keywords)\n",
387
+ " \n",
388
+ "    # Guard against empty retrieval results (avoid ZeroDivisionError)\n",
+ "    citation_score = (\n",
+ "        (cited_pdfs / max(1, len(pdf_names)) * 40) +      # 40% for PDF citation\n",
+ "        (cited_pages / max(1, len(page_numbers)) * 40) +  # 40% for page citation\n",
+ "        (20 if has_source_ref else 0)                     # 20% for having source keywords\n",
+ "    )\n",
393
+ " \n",
394
+ " return {\n",
395
+ " 'Citation_Score': round(citation_score, 2),\n",
396
+ " 'Cited_PDFs': cited_pdfs,\n",
397
+ " 'Cited_Pages': cited_pages,\n",
398
+ " 'Has_Source_Reference': has_source_ref\n",
399
+ " }\n",
400
+ "\n",
401
+ "def evaluate_completeness(answer: str, min_length: int = 100) -> Dict[str, any]:\n",
402
+ " \"\"\"\n",
403
+ " Evaluate answer completeness.\n",
404
+ " \"\"\"\n",
405
+ " word_count = len(answer.split())\n",
406
+ " char_count = len(answer)\n",
407
+ " \n",
408
+ " # Penalize very short or very long answers\n",
409
+ " if char_count < min_length:\n",
410
+ " completeness_score = (char_count / min_length) * 100\n",
411
+ " elif char_count > 2000:\n",
412
+ " completeness_score = 100 - ((char_count - 2000) / 2000 * 20) # Penalty for verbosity\n",
413
+ " else:\n",
414
+ " completeness_score = 100\n",
415
+ " \n",
416
+ " return {\n",
417
+ " 'Completeness_Score': round(max(0, completeness_score), 2),\n",
418
+ " 'Word_Count': word_count,\n",
419
+ " 'Char_Count': char_count\n",
420
+ " }\n",
421
+ "\n",
422
+ "print(\"\u2705 Evaluation functions ready\")"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "markdown",
427
+ "metadata": {},
428
+ "source": [
429
+ "## 6. Run Benchmark on All Models"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "code",
434
+ "execution_count": null,
435
+ "metadata": {},
436
+ "outputs": [],
437
+ "source": [
438
+ "# Select models to test (you can comment out models to skip)\n",
439
+ "MODELS_TO_TEST = [\n",
440
+ "    'Llama-4-Maverick',  # key must match the MODELS registry\n",
441
+ " 'DeepSeek-R1',\n",
442
+ " 'GPT-4.1',\n",
443
+ " 'GPT-5-mini',\n",
444
+ " 'Claude-Sonnet-4.5',\n",
445
+ " # 'Claude-Opus-4.1', # Uncomment to test\n",
446
+ " # 'Phi-4-multimodal', # Uncomment to test\n",
447
+ " # 'GPT-OSS-120B', # Uncomment to test\n",
448
+ "]\n",
449
+ "\n",
450
+ "print(f\"Testing {len(MODELS_TO_TEST)} models on {len(questions)} questions...\\n\")\n",
451
+ "print(\"This may take several minutes...\\n\")"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": null,
457
+ "metadata": {},
458
+ "outputs": [],
459
+ "source": [
460
+ "# Run benchmark\n",
461
+ "results = []\n",
462
+ "\n",
463
+ "for model_name in MODELS_TO_TEST:\n",
464
+ " print(f\"\\n{'='*80}\")\n",
465
+ " print(f\"Testing: {model_name}\")\n",
466
+ " print(f\"{'='*80}\")\n",
467
+ " \n",
468
+ " model_results = []\n",
469
+ " \n",
470
+ " for example_key, messages in questions.items():\n",
471
+ " # Get the last user message (the actual question)\n",
472
+ " user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
473
+ " query = user_msg['content']\n",
474
+ " \n",
475
+ " print(f\"\\n Question {example_key}: {query[:80]}...\")\n",
476
+ " \n",
477
+ " # Retrieve documents\n",
478
+ " documents = retrieve_documents(query, top_k=3)\n",
479
+ " \n",
480
+ " # Generate answer\n",
481
+ " answer, response_time = generate_answer(model_name, query, documents)\n",
482
+ " \n",
483
+ " if answer.startswith('ERROR'):\n",
484
+ " print(f\" \u274c Failed: {answer}\")\n",
485
+ " continue\n",
486
+ " \n",
487
+ " print(f\" \u2705 Response time: {response_time:.2f}s\")\n",
488
+ " \n",
489
+ " # Get expected answer\n",
490
+ " expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
491
+ " \n",
492
+ " # Calculate metrics\n",
493
+ " similarity_metrics = calculate_answer_similarity(expected, answer) if expected else {'CER': 0, 'WER': 0, 'Similarity': 0}\n",
494
+ " citation_metrics = check_citations(answer, documents)\n",
495
+ " completeness_metrics = evaluate_completeness(answer)\n",
496
+ " \n",
497
+ " # Store result\n",
498
+ " result = {\n",
499
+ " 'Model': model_name,\n",
500
+ " 'Question': example_key,\n",
501
+ " 'Query': query[:100],\n",
502
+ " 'Answer': answer[:200] + '...',\n",
503
+ " 'Response_Time': round(response_time, 2),\n",
504
+ " **similarity_metrics,\n",
505
+ " **citation_metrics,\n",
506
+ " **completeness_metrics,\n",
507
+ " 'Open_Source': MODELS[model_name]['open_source'],\n",
508
+ " 'Architecture_Score': MODELS[model_name]['architecture_score']\n",
509
+ " }\n",
510
+ " \n",
511
+ " model_results.append(result)\n",
512
+ " results.append(result)\n",
513
+ " \n",
514
+ " # Show summary for this model\n",
515
+ " if model_results:\n",
516
+ " avg_response_time = sum(r['Response_Time'] for r in model_results) / len(model_results)\n",
517
+ " avg_similarity = sum(r['Similarity'] for r in model_results) / len(model_results)\n",
518
+ " avg_citation = sum(r['Citation_Score'] for r in model_results) / len(model_results)\n",
519
+ " avg_completeness = sum(r['Completeness_Score'] for r in model_results) / len(model_results)\n",
520
+ " \n",
521
+ " print(f\"\\n \ud83d\udcca {model_name} Summary:\")\n",
522
+ " print(f\" Avg Response Time: {avg_response_time:.2f}s\")\n",
523
+ " print(f\" Avg Similarity: {avg_similarity:.1f}%\")\n",
524
+ " print(f\" Avg Citation Score: {avg_citation:.1f}%\")\n",
525
+ " print(f\" Avg Completeness: {avg_completeness:.1f}%\")\n",
526
+ "\n",
527
+ "print(f\"\\n{'='*80}\")\n",
528
+ "print(\"\u2705 Benchmarking complete!\")\n",
529
+ "print(f\"{'='*80}\")"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "markdown",
534
+ "metadata": {},
535
+ "source": [
536
+ "## 7. Aggregate Results and Rankings"
537
+ ]
538
+ },
539
+ {
540
+ "cell_type": "code",
541
+ "execution_count": null,
542
+ "metadata": {},
543
+ "outputs": [],
544
+ "source": [
545
+ "# Create DataFrame\n",
546
+ "df = pd.DataFrame(results)\n",
547
+ "\n",
548
+ "# Calculate aggregate scores per model\n",
549
+ "model_summary = df.groupby('Model').agg({\n",
550
+ " 'Response_Time': 'mean',\n",
551
+ " 'Similarity': 'mean',\n",
552
+ " 'Citation_Score': 'mean',\n",
553
+ " 'Completeness_Score': 'mean',\n",
554
+ " 'CER': 'mean',\n",
555
+ " 'WER': 'mean',\n",
556
+ " 'Open_Source': 'first',\n",
557
+ " 'Architecture_Score': 'first'\n",
558
+ "}).round(2)\n",
559
+ "\n",
560
+ "# Calculate overall quality score (weighted average)\n",
561
+ "model_summary['Quality_Score'] = (\n",
562
+ " model_summary['Similarity'] * 0.35 + # 35% answer accuracy\n",
563
+ " model_summary['Citation_Score'] * 0.35 + # 35% citation quality\n",
564
+ " model_summary['Completeness_Score'] * 0.30 # 30% completeness\n",
565
+ ").round(2)\n",
566
+ "\n",
567
+ "# Sort by Quality Score\n",
568
+ "model_summary = model_summary.sort_values('Quality_Score', ascending=False)\n",
569
+ "\n",
570
+ "# Display summary table\n",
571
+ "print(\"\\n\" + \"=\"*100)\n",
572
+ "print(\"\ud83d\udcca LLM BENCHMARKING RESULTS - MODEL SUMMARY\")\n",
573
+ "print(\"=\"*100)\n",
574
+ "print(model_summary.to_string())\n",
575
+ "print(\"=\"*100)"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
581
+ "source": [
582
+ "# Create comprehensive visualization\n",
583
+ "import os\n",
584
+ "from pathlib import Path\n",
585
+ "\n",
586
+ "# Create output directory - using dynamic path\n",
587
+ "output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
588
+ "output_dir.mkdir(parents=True, exist_ok=True)\n",
589
+ "\n",
590
+ "fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
591
+ "\n",
592
+ "# Plot aggregated per-model metrics. model_summary (indexed by model name)\n",
+ "# is used instead of the per-question df: df has one row per (model, question)\n",
+ "# and has no 'CSR'/'WSR' columns - the computed metrics are Similarity,\n",
+ "# Citation_Score, CER, WER and Response_Time.\n",
+ "models = model_summary.index.tolist()\n",
+ "colors = sns.color_palette('viridis', len(models))\n",
+ "\n",
+ "# 1. Similarity - answer accuracy vs expected answers\n",
+ "ax1 = axes[0, 0]\n",
+ "ax1.barh(models, model_summary['Similarity'], color=colors)\n",
+ "ax1.set_xlabel('Similarity (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
+ "ax1.set_title('Answer Similarity', fontsize=14, fontweight='bold')\n",
+ "ax1.set_xlim(0, 100)\n",
+ "for i, val in enumerate(model_summary['Similarity']):\n",
+ "    ax1.text(val + 1, i, f'{val:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
+ "ax1.axvline(x=90, color='green', linestyle='--', alpha=0.3, label='Excellent (>90%)')\n",
+ "ax1.axvline(x=80, color='orange', linestyle='--', alpha=0.3, label='Good (>80%)')\n",
+ "ax1.legend(fontsize=9)\n",
+ "\n",
+ "# 2. Citation quality\n",
+ "ax2 = axes[0, 1]\n",
+ "ax2.barh(models, model_summary['Citation_Score'], color=colors)\n",
+ "ax2.set_xlabel('Citation Score (%) - Higher is Better', fontsize=12, fontweight='bold')\n",
+ "ax2.set_title('Citation Quality', fontsize=14, fontweight='bold')\n",
+ "ax2.set_xlim(0, 100)\n",
+ "for i, val in enumerate(model_summary['Citation_Score']):\n",
+ "    ax2.text(val + 1, i, f'{val:.2f}%', va='center', fontsize=11, fontweight='bold')\n",
+ "\n",
+ "# 3. Response time (average per model)\n",
+ "ax3 = axes[1, 0]\n",
+ "ax3.barh(models, model_summary['Response_Time'], color=colors)\n",
+ "ax3.set_xlabel('Avg Response Time (s) - Lower is Better', fontsize=12, fontweight='bold')\n",
+ "ax3.set_title('Processing Speed', fontsize=14, fontweight='bold')\n",
+ "for i, val in enumerate(model_summary['Response_Time']):\n",
+ "    ax3.text(val + 0.5, i, f'{val:.1f}s', va='center', fontsize=11)\n",
+ "\n",
+ "# 4. Error rates comparison (lower is better)\n",
+ "ax4 = axes[1, 1]\n",
+ "x = range(len(models))\n",
+ "width = 0.35\n",
+ "ax4.bar([i - width/2 for i in x], model_summary['CER'], width, label='CER', color='coral', alpha=0.8)\n",
+ "ax4.bar([i + width/2 for i in x], model_summary['WER'], width, label='WER', color='skyblue', alpha=0.8)\n",
+ "ax4.set_ylabel('Error Rate (%) - Lower is Better', fontsize=12, fontweight='bold')\n",
+ "ax4.set_title('Error Rates', fontsize=14, fontweight='bold')\n",
+ "ax4.set_xticks(x)\n",
+ "ax4.set_xticklabels(models, rotation=45, ha='right')\n",
+ "ax4.legend(fontsize=11)\n",
+ "ax4.grid(axis='y', alpha=0.3)\n",
637
+ "\n",
638
+ "plt.tight_layout()\n",
639
+ "plt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\n",
640
+ "plt.show()\n",
641
+ "\n",
642
+ "print(f\"\\n\u2705 Visualization saved to '{output_dir}/results.png'\")"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": null,
648
+ "metadata": {},
649
+ "outputs": [],
650
+ "source": [
651
+ "# Create rankings table\n",
652
+ "rankings = model_summary[[\n",
653
+ " 'Quality_Score', 'Similarity', 'Citation_Score', 'Completeness_Score', \n",
654
+ " 'Response_Time', 'Open_Source', 'Architecture_Score'\n",
655
+ "]].copy()\n",
656
+ "\n",
657
+ "rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
658
+ "\n",
659
+ "print(\"\\n\" + \"=\"*100)\n",
660
+ "print(\"\ud83c\udfc6 FINAL RANKINGS\")\n",
661
+ "print(\"=\"*100)\n",
662
+ "print(rankings.to_string())\n",
663
+ "print(\"=\"*100)\n",
664
+ "\n",
665
+ "# Winner analysis\n",
666
+ "best_overall = rankings.index[0]\n",
667
+ "best_open_source = rankings[rankings['Open_Source'] == True].index[0] if any(rankings['Open_Source']) else None\n",
668
+ "fastest = model_summary['Response_Time'].idxmin()\n",
669
+ "\n",
670
+ "print(\"\\n\" + \"=\"*100)\n",
671
+ "print(\"\ud83d\udca1 RECOMMENDATIONS FOR HACKATHON\")\n",
672
+ "print(\"=\"*100)\n",
673
+ "\n",
674
+ "print(f\"\\n\ud83e\udd47 Best Overall Quality: {best_overall}\")\n",
675
+ "print(f\" Quality Score: {model_summary.loc[best_overall, 'Quality_Score']:.1f}%\")\n",
676
+ "print(f\" Similarity: {model_summary.loc[best_overall, 'Similarity']:.1f}%\")\n",
677
+ "print(f\" Citation Score: {model_summary.loc[best_overall, 'Citation_Score']:.1f}%\")\n",
678
+ "print(f\" Response Time: {model_summary.loc[best_overall, 'Response_Time']:.2f}s\")\n",
679
+ "print(f\" Open Source: {model_summary.loc[best_overall, 'Open_Source']}\")\n",
680
+ "print(f\" Architecture Score: {model_summary.loc[best_overall, 'Architecture_Score']}\")\n",
681
+ "\n",
682
+ "if best_open_source:\n",
683
+ " print(f\"\\n\ud83d\udd13 Best Open-Source Model: {best_open_source}\")\n",
684
+ " print(f\" Quality Score: {model_summary.loc[best_open_source, 'Quality_Score']:.1f}%\")\n",
685
+ " print(f\" Architecture Score: {model_summary.loc[best_open_source, 'Architecture_Score']} (Better for hackathon!)\")\n",
686
+ " print(f\" Response Time: {model_summary.loc[best_open_source, 'Response_Time']:.2f}s\")\n",
687
+ "\n",
688
+ "print(f\"\\n\u26a1 Fastest Model: {fastest}\")\n",
689
+ "print(f\" Response Time: {model_summary.loc[fastest, 'Response_Time']:.2f}s\")\n",
690
+ "print(f\" Quality Score: {model_summary.loc[fastest, 'Quality_Score']:.1f}%\")\n",
691
+ "\n",
692
+ "print(\"\\n\" + \"=\"*100)\n",
693
+ "print(\"\ud83d\udcdd FINAL RECOMMENDATION\")\n",
694
+ "print(\"=\"*100)\n",
695
+ "print(\"\\nScoring Breakdown:\")\n",
696
+ "print(\" - LLM Quality: 30% of total hackathon score\")\n",
697
+ "print(\" - Architecture: 20% of total hackathon score (open-source preferred!)\")\n",
698
+ "print(\"\\nBest Choice:\")\n",
699
+ "if best_open_source and model_summary.loc[best_open_source, 'Quality_Score'] >= model_summary.loc[best_overall, 'Quality_Score'] * 0.9:\n",
700
+ " print(f\" \u2705 {best_open_source} - Best balance of quality and architecture score\")\n",
701
+ " print(f\" Only {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality drop for higher architecture score!\")\n",
702
+ "else:\n",
703
+ " print(f\" \u2705 {best_overall} - Highest quality, use if quality gap is significant\")\n",
704
+ " if best_open_source:\n",
705
+ " print(f\" \u26a0\ufe0f Consider {best_open_source} for higher architecture score (trade-off: {model_summary.loc[best_overall, 'Quality_Score'] - model_summary.loc[best_open_source, 'Quality_Score']:.1f}% quality)\")\n",
706
+ "\n",
707
+ "print(\"=\"*100)"
708
+ ]
709
+ },
710
+ {
711
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
713
+ "source": [
714
+ "# Save results\n",
715
+ "from pathlib import Path\n",
716
+ "\n",
717
+ "# Using dynamic path\n",
718
+ "output_dir = OUTPUT_DIR / 'llm_benchmark'\n",
719
+ "output_dir.mkdir(parents=True, exist_ok=True)\n",
720
+ "\n",
721
+ "df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
722
+ "model_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n",
723
+ "rankings.to_csv(output_dir / 'rankings.csv', index=False, encoding='utf-8')\n",
724
+ "\n",
725
+ "print(\"\\n\u2705 Results exported to output/llm_benchmark/:\")\n",
726
+ "print(\" - detailed_results.csv (all questions and answers)\")\n",
727
+ "print(\" - summary.csv (model averages)\")\n",
728
+ "print(\" - rankings.csv (final rankings)\")\n",
729
+ "print(\" - results.png (visualizations)\")"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "markdown",
734
+ "metadata": {},
735
+ "source": [
736
+ "## 11. Sample Answer Comparison"
737
+ ]
738
+ }
739
+ ],
740
+ "metadata": {
741
+ "kernelspec": {
742
+ "display_name": "venv",
743
+ "language": "python",
744
+ "name": "python3"
745
+ },
746
+ "language_info": {
747
+ "codemirror_mode": {
748
+ "name": "ipython",
749
+ "version": 3
750
+ },
751
+ "file_extension": ".py",
752
+ "mimetype": "text/x-python",
753
+ "name": "python",
754
+ "nbconvert_exporter": "python",
755
+ "pygments_lexer": "ipython3",
756
+ "version": "3.10.12"
757
+ }
758
+ },
759
+ "nbformat": 4,
760
+ "nbformat_minor": 4
761
+ }
notebooks/rag_optimization_benchmark.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/rag_optimization_benchmark.ipynb.backup ADDED
@@ -0,0 +1,1072 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# RAG Pipeline Optimization Benchmark\n",
8
+ "\n",
9
+ "**Comprehensive testing of ALL RAG components to maximize LLM Judge score**\n",
10
+ "\n",
11
+ "## What We're Testing:\n",
12
+ "\n",
13
+ "### 1. Embedding Models (Vector Representations)\n",
14
+ "- `BAAI/bge-large-en-v1.5` (Current - 1024 dim, best quality)\n",
15
+ "- `BAAI/bge-base-en-v1.5` (768 dim, faster)\n",
16
+ "- `intfloat/multilingual-e5-large` (1024 dim, multi-language)\n",
17
+ "- `sentence-transformers/paraphrase-multilingual-mpnet-base-v2` (768 dim, multilingual)\n",
18
+ "- `sentence-transformers/all-MiniLM-L6-v2` (384 dim, very fast)\n",
19
+ "\n",
20
+ "### 2. Retrieval Strategies\n",
21
+ "- **Top-K**: Test 1, 3, 5, 10 documents\n",
22
+ "- **MMR** (Maximal Marginal Relevance): Diversity vs relevance trade-off\n",
23
+ "- **Similarity Threshold**: Filter low-relevance docs\n",
24
+ "- **Reranking**: Use cross-encoder to rerank results\n",
25
+ "\n",
26
+ "### 3. Chunking Strategies (Already in Vector DB, but we'll compare)\n",
27
+ "- Chunk size: 256, 512, 600 (current), 1000 tokens\n",
28
+ "- Overlap: 0, 50, 100 (current), 200 chars\n",
29
+ "\n",
30
+ "### 4. LLM Models\n",
31
+ "- Llama-4-Maverick-17B (open-source)\n",
32
+ "- DeepSeek-R1 (reasoning)\n",
33
+ "- GPT-4.1, GPT-5, GPT-5-mini\n",
34
+ "- Claude-Sonnet-4.5\n",
35
+ "\n",
36
+ "### 5. Prompting Techniques\n",
37
+ "- **Baseline**: Simple context + question\n",
38
+ "- **Citation-focused**: Emphasize source references\n",
39
+ "- **Step-by-step**: Chain-of-thought reasoning\n",
40
+ "- **Few-shot**: Include example Q&A\n",
41
+ "\n",
42
+ "## LLM Judge Evaluation Criteria:\n",
43
+ "- **Accuracy** (35%): Answer correctness\n",
44
+ "- **Relevance** (35%): Citation quality and relevance\n",
45
+ "- **Completeness** (30%): Thorough answers"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 1,
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "# !pip install openai pinecone-client sentence-transformers rank-bm25 python-dotenv pandas matplotlib seaborn jiwer"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 2,
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "name": "stderr",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "/Users/ismatsamadov/SOCAR_Hackathon/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
67
+ " from .autonotebook import tqdm as notebook_tqdm\n"
68
+ ]
69
+ },
70
+ {
71
+ "name": "stdout",
72
+ "output_type": "stream",
73
+ "text": [
74
+ "✅ Libraries loaded\n"
75
+ ]
76
+ }
77
+ ],
78
+ "source": [
79
+ "import os\n",
80
+ "import json\n",
81
+ "import time\n",
82
+ "import re\n",
83
+ "from typing import Dict, List, Tuple, Any\n",
84
+ "from collections import defaultdict\n",
85
+ "from dotenv import load_dotenv\n",
86
+ "\n",
87
+ "import pandas as pd\n",
88
+ "import matplotlib.pyplot as plt\n",
89
+ "import seaborn as sns\n",
90
+ "from openai import AzureOpenAI\n",
91
+ "from pinecone import Pinecone\n",
92
+ "from sentence_transformers import SentenceTransformer, CrossEncoder\n",
93
+ "from jiwer import wer, cer\n",
94
+ "import numpy as np\n",
95
+ "\n",
96
+ "load_dotenv()\n",
97
+ "\n",
98
+ "sns.set_style('whitegrid')\n",
99
+ "plt.rcParams['figure.figsize'] = (16, 10)\n",
100
+ "\n",
101
+ "print(\"✅ Libraries loaded\")"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 3,
107
+ "metadata": {},
108
+ "outputs": [
109
+ {
110
+ "name": "stdout",
111
+ "output_type": "stream",
112
+ "text": [
113
+ "✅ Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
114
+ "✅ Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs\n",
115
+ "✅ Output directory: /Users/ismatsamadov/SOCAR_Hackathon/output\n"
116
+ ]
117
+ }
118
+ ],
119
+ "source": [
120
+ "# Auto-detect project root (works from any directory)\n",
121
+ "import os\n",
122
+ "from pathlib import Path\n",
123
+ "\n",
124
+ "if Path('data').exists() and Path('docs').exists():\n",
125
+ " # Already in project root\n",
126
+ " PROJECT_ROOT = Path.cwd()\n",
127
+ "elif Path('../data').exists() and Path('../docs').exists():\n",
128
+ " # In notebooks/ subdirectory\n",
129
+ " PROJECT_ROOT = Path.cwd().parent\n",
130
+ "else:\n",
131
+ " # Fallback: try to find project root\n",
132
+ " current = Path.cwd()\n",
133
+ " while current != current.parent:\n",
134
+ " if (current / 'data').exists() and (current / 'docs').exists():\n",
135
+ " PROJECT_ROOT = current\n",
136
+ " break\n",
137
+ " current = current.parent\n",
138
+ " else:\n",
139
+ " PROJECT_ROOT = Path.cwd()\n",
140
+ "\n",
141
+ "# Define all paths relative to project root\n",
142
+ "DATA_DIR = PROJECT_ROOT / 'data'\n",
143
+ "DOCS_DIR = PROJECT_ROOT / 'docs'\n",
144
+ "OUTPUT_DIR = PROJECT_ROOT / 'output'\n",
145
+ "\n",
146
+ "print(f\"✅ Project root: {PROJECT_ROOT}\")\n",
147
+ "print(f\"✅ Docs directory: {DOCS_DIR}\")\n",
148
+ "print(f\"✅ Output directory: {OUTPUT_DIR}\")"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 4,
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "name": "stdout",
158
+ "output_type": "stream",
159
+ "text": [
160
+ "✅ Loaded 5 test questions\n",
161
+ " - Example1\n",
162
+ " - Example2\n",
163
+ " - Example3\n",
164
+ " - Example4\n",
165
+ " - Example5\n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "# Load test cases - using dynamic paths\n",
171
+ "with open(DOCS_DIR / 'sample_questions.json', 'r', encoding='utf-8') as f:\n",
172
+ " questions = json.load(f)\n",
173
+ "\n",
174
+ "with open(DOCS_DIR / 'sample_answers.json', 'r', encoding='utf-8') as f:\n",
175
+ " expected_answers = json.load(f)\n",
176
+ "\n",
177
+ "print(f\"✅ Loaded {len(questions)} test questions\")\n",
178
+ "for key in questions.keys():\n",
179
+ " print(f\" - {key}\")"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "markdown",
184
+ "metadata": {},
185
+ "source": [
186
+ "## 2. Initialize Vector Database"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "# Connect to Pinecone\n",
196
+ "pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))\n",
197
+ "index = pc.Index(os.getenv('PINECONE_INDEX_NAME', 'hackathon'))\n",
198
+ "\n",
199
+ "stats = index.describe_index_stats()\n",
200
+ "print(f\"✅ Vector DB connected\")\n",
201
+ "print(f\" Total vectors: {stats['total_vector_count']}\")\n",
202
+ "print(f\" Dimensions: {stats['dimension']}\")"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "markdown",
207
+ "metadata": {},
208
+ "source": [
209
+ "## 3. Embedding Models Configuration"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": null,
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "EMBEDDING_MODELS = {\n",
219
+ " 'bge-large-en': {\n",
220
+ " 'name': 'BAAI/bge-large-en-v1.5',\n",
221
+ " 'dimensions': 1024,\n",
222
+ " 'notes': 'Current model - best quality'\n",
223
+ " },\n",
224
+ " 'bge-base-en': {\n",
225
+ " 'name': 'BAAI/bge-base-en-v1.5',\n",
226
+ " 'dimensions': 768,\n",
227
+ " 'notes': 'Faster, slightly lower quality'\n",
228
+ " },\n",
229
+ " 'multilingual-e5-large': {\n",
230
+ " 'name': 'intfloat/multilingual-e5-large',\n",
231
+ " 'dimensions': 1024,\n",
232
+ " 'notes': 'Multi-language optimized'\n",
233
+ " },\n",
234
+ " 'paraphrase-multilingual': {\n",
235
+ " 'name': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',\n",
236
+ " 'dimensions': 768,\n",
237
+ " 'notes': 'Good for Azerbaijani/Russian'\n",
238
+ " },\n",
239
+ " 'all-MiniLM-L6': {\n",
240
+ " 'name': 'sentence-transformers/all-MiniLM-L6-v2',\n",
241
+ " 'dimensions': 384,\n",
242
+ " 'notes': 'Very fast, lower quality'\n",
243
+ " }\n",
244
+ "}\n",
245
+ "\n",
246
+ "# Load embedding models (only test 1024-dim models for existing Pinecone index)\n",
247
+ "EMBEDDING_MODELS_TO_TEST = [\n",
248
+ " 'bge-large-en', # Current\n",
249
+ " 'multilingual-e5-large', # Alternative with same dims\n",
250
+ "]\n",
251
+ "\n",
252
+ "embedding_cache = {}\n",
253
+ "\n",
254
+ "for model_key in EMBEDDING_MODELS_TO_TEST:\n",
255
+ " model_name = EMBEDDING_MODELS[model_key]['name']\n",
256
+ " print(f\"Loading {model_key}...\")\n",
257
+ " embedding_cache[model_key] = SentenceTransformer(model_name)\n",
258
+ " print(f\" ✅ {model_name}\")\n",
259
+ "\n",
260
+ "print(f\"\\n✅ Loaded {len(embedding_cache)} embedding models\")"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "markdown",
265
+ "metadata": {},
266
+ "source": [
267
+ "## 4. Retrieval Strategies"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": null,
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": [
276
+ "def retrieve_vanilla(query: str, embed_model: SentenceTransformer, top_k: int = 3) -> List[Dict]:\n",
277
+ " \"\"\"\n",
278
+ " Vanilla retrieval: Simple top-k vector search.\n",
279
+ " \"\"\"\n",
280
+ " query_embedding = embed_model.encode(query).tolist()\n",
281
+ " results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)\n",
282
+ " \n",
283
+ " documents = []\n",
284
+ " for match in results['matches']:\n",
285
+ " documents.append({\n",
286
+ " 'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),\n",
287
+ " 'page_number': match['metadata'].get('page_number', 0),\n",
288
+ " 'content': match['metadata'].get('text', ''),\n",
289
+ " 'score': match.get('score', 0.0)\n",
290
+ " })\n",
291
+ " \n",
292
+ " return documents\n",
293
+ "\n",
294
+ "\n",
295
+ "def retrieve_with_threshold(query: str, embed_model: SentenceTransformer, \n",
296
+ " top_k: int = 10, threshold: float = 0.7) -> List[Dict]:\n",
297
+ " \"\"\"\n",
298
+ " Retrieve with similarity threshold filtering.\n",
299
+ " \"\"\"\n",
300
+ " docs = retrieve_vanilla(query, embed_model, top_k=top_k)\n",
301
+ " return [doc for doc in docs if doc['score'] >= threshold]\n",
302
+ "\n",
303
+ "\n",
304
+ "def retrieve_with_mmr(query: str, embed_model: SentenceTransformer,\n",
305
+ "                      top_k: int = 3, lambda_param: float = 0.5, fetch_k: int = 20) -> List[Dict]:\n",
306
+ "    \"\"\"\n",
307
+ "    MMR (Maximal Marginal Relevance) for diversity.\n",
308
+ "    lambda=1 → pure relevance, lambda=0 → pure diversity\n",
309
+ "    \"\"\"\n",
310
+ "    # Fetch more candidates than needed so MMR has room to diversify\n",
311
+ "    candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n",
312
+ "\n",
313
+ "    if len(candidates) <= top_k:\n",
314
+ "        return candidates[:top_k]\n",
315
+ "\n",
316
+ "    # Query embedding\n",
317
+ "    query_emb = embed_model.encode(query)\n",
318
+ "\n",
319
+ "    # Embeddings for all candidate documents\n",
320
+ "    candidate_texts = [doc['content'] for doc in candidates]\n",
321
+ "    candidate_embs = embed_model.encode(candidate_texts)\n",
322
+ "\n",
323
+ "    # MMR algorithm\n",
324
+ "    selected = []\n",
325
+ "    selected_embs = []\n",
326
+ "    # Track chosen positions by index: list.index() on dicts is O(n) per lookup\n",
327
+ "    # and returns the FIRST equal dict, so duplicate documents could be picked twice.\n",
328
+ "    selected_idx = set()\n",
329
+ "\n",
330
+ "    for _ in range(min(top_k, len(candidates))):\n",
331
+ "        mmr_scores = []\n",
332
+ "\n",
333
+ "        for i, (doc, emb) in enumerate(zip(candidates, candidate_embs)):\n",
334
+ "            if i in selected_idx:\n",
335
+ "                mmr_scores.append(-float('inf'))\n",
336
+ "                continue\n",
337
+ "\n",
338
+ "            # Relevance to query (cosine similarity)\n",
339
+ "            relevance = np.dot(query_emb, emb) / (np.linalg.norm(query_emb) * np.linalg.norm(emb))\n",
340
+ "\n",
341
+ "            # Max similarity to already selected\n",
342
+ "            if selected_embs:\n",
343
+ "                similarities = [np.dot(emb, s_emb) / (np.linalg.norm(emb) * np.linalg.norm(s_emb))\n",
344
+ "                                for s_emb in selected_embs]\n",
345
+ "                max_sim = max(similarities)\n",
346
+ "            else:\n",
347
+ "                max_sim = 0\n",
348
+ "\n",
349
+ "            # MMR score: trade off relevance vs. redundancy with already-picked docs\n",
350
+ "            mmr = lambda_param * relevance - (1 - lambda_param) * max_sim\n",
351
+ "            mmr_scores.append(mmr)\n",
352
+ "\n",
353
+ "        # Select best MMR score\n",
354
+ "        best_idx = int(np.argmax(mmr_scores))\n",
355
+ "        selected.append(candidates[best_idx])\n",
356
+ "        selected_embs.append(candidate_embs[best_idx])\n",
357
+ "        selected_idx.add(best_idx)\n",
358
+ "\n",
359
+ "    return selected\n",
356
+ "\n",
357
+ "\n",
358
+ "def retrieve_with_reranking(query: str, embed_model: SentenceTransformer,\n",
359
+ "                            top_k: int = 3, fetch_k: int = 20) -> List[Dict]:\n",
360
+ "    \"\"\"\n",
361
+ "    Two-stage: retrieve with embeddings, rerank with cross-encoder.\n",
362
+ "    \"\"\"\n",
363
+ "    # Stage 1: Retrieve candidates\n",
364
+ "    candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n",
365
+ "\n",
366
+ "    if len(candidates) <= top_k:\n",
367
+ "        return candidates[:top_k]\n",
368
+ "\n",
369
+ "    # Stage 2: Rerank with cross-encoder.\n",
370
+ "    # Cache the model on the function object: constructing CrossEncoder here\n",
371
+ "    # re-loads the weights from disk on EVERY query and dominates latency.\n",
372
+ "    if not hasattr(retrieve_with_reranking, '_reranker'):\n",
373
+ "        retrieve_with_reranking._reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n",
374
+ "    reranker = retrieve_with_reranking._reranker\n",
375
+ "\n",
376
+ "    pairs = [[query, doc['content']] for doc in candidates]\n",
377
+ "    scores = reranker.predict(pairs)\n",
378
+ "\n",
379
+ "    # Sort by reranker score (descending)\n",
380
+ "    scored_docs = [(doc, score) for doc, score in zip(candidates, scores)]\n",
381
+ "    scored_docs.sort(key=lambda x: x[1], reverse=True)\n",
382
+ "\n",
383
+ "    # Attach rerank score and return top-k\n",
384
+ "    reranked = []\n",
385
+ "    for doc, score in scored_docs[:top_k]:\n",
386
+ "        doc['rerank_score'] = float(score)\n",
387
+ "        reranked.append(doc)\n",
388
+ "\n",
389
+ "    return reranked\n",
386
+ "\n",
387
+ "\n",
388
+ "RETRIEVAL_STRATEGIES = {\n",
389
+ " 'vanilla_k3': {'func': retrieve_vanilla, 'params': {'top_k': 3}, 'notes': 'Current setup'},\n",
390
+ " 'vanilla_k5': {'func': retrieve_vanilla, 'params': {'top_k': 5}, 'notes': 'More context'},\n",
391
+ " 'vanilla_k10': {'func': retrieve_vanilla, 'params': {'top_k': 10}, 'notes': 'Maximum context'},\n",
392
+ " 'threshold_0.7': {'func': retrieve_with_threshold, 'params': {'top_k': 10, 'threshold': 0.7}, 'notes': 'Quality filter'},\n",
393
+ " 'mmr_balanced': {'func': retrieve_with_mmr, 'params': {'top_k': 3, 'lambda_param': 0.5}, 'notes': 'Balance diversity'},\n",
394
+ " 'mmr_diverse': {'func': retrieve_with_mmr, 'params': {'top_k': 3, 'lambda_param': 0.3}, 'notes': 'More diversity'},\n",
395
+ " 'reranked_k3': {'func': retrieve_with_reranking, 'params': {'top_k': 3, 'fetch_k': 20}, 'notes': 'Two-stage rerank'},\n",
396
+ "}\n",
397
+ "\n",
398
+ "print(f\"✅ Configured {len(RETRIEVAL_STRATEGIES)} retrieval strategies\")"
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "markdown",
403
+ "metadata": {},
404
+ "source": [
405
+ "## 5. LLM Models and Prompting Strategies"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": null,
411
+ "metadata": {},
412
+ "outputs": [],
413
+ "source": [
414
+ "# Initialize Azure OpenAI\n",
415
+ "azure_client = AzureOpenAI(\n",
416
+ " api_key=os.getenv('AZURE_OPENAI_API_KEY'),\n",
417
+ " api_version=os.getenv('AZURE_OPENAI_API_VERSION', '2024-08-01-preview'),\n",
418
+ " azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')\n",
419
+ ")\n",
420
+ "\n",
421
+ "LLM_MODELS = {\n",
422
+ " 'Llama-4-Maverick': 'Llama-4-Maverick-17B-128E-Instruct-FP8',\n",
423
+ " 'DeepSeek-R1': 'DeepSeek-R1',\n",
424
+ " 'GPT-4.1': 'gpt-4.1',\n",
425
+ " 'GPT-5-mini': 'gpt-5-mini',\n",
426
+ " 'Claude-Sonnet-4.5': 'claude-sonnet-4-5',\n",
427
+ "}\n",
428
+ "\n",
429
+ "# Prompting strategies\n",
430
+ "PROMPTING_STRATEGIES = {\n",
431
+ " 'baseline': \"\"\"\n",
432
+ "Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə köməkçisiniz.\n",
433
+ "\n",
434
+ "Kontekst:\n",
435
+ "{context}\n",
436
+ "\n",
437
+ "Sual: {query}\n",
438
+ "\n",
439
+ "Kontekstə əsaslanaraq cavab verin.\n",
440
+ "\"\"\",\n",
441
+ " \n",
442
+ " 'citation_focused': \"\"\"\n",
443
+ "Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.\n",
444
+ "\n",
445
+ "ÖNƏMLİ: Hər bir faktı mütləq mənbə ilə təsdiqləyin (PDF adı və səhifə nömrəsi).\n",
446
+ "\n",
447
+ "Kontekst:\n",
448
+ "{context}\n",
449
+ "\n",
450
+ "Sual: {query}\n",
451
+ "\n",
452
+ "Cavab verərkən:\n",
453
+ "1. Dəqiq faktlar yazın\n",
454
+ "2. Hər faktı mənbə ilə göstərin: (PDF: fayl_adı.pdf, Səhifə: X)\n",
455
+ "3. Kontekstdə olmayan məlumat əlavə etməyin\n",
456
+ "\"\"\",\n",
457
+ " \n",
458
+ " 'step_by_step': \"\"\"\n",
459
+ "Siz SOCAR-ın tarixi sənədlər üzrə analitik köməkçisisiniz.\n",
460
+ "\n",
461
+ "Kontekst:\n",
462
+ "{context}\n",
463
+ "\n",
464
+ "Sual: {query}\n",
465
+ "\n",
466
+ "Addım-addım cavab verin:\n",
467
+ "1. Əvvəlcə kontekstdən əlaqəli məlumatları müəyyənləşdirin\n",
468
+ "2. Bu məlumatları təhlil edin\n",
469
+ "3. Nəticəni mənbələr ilə birlikdə təqdim edin\n",
470
+ "\"\"\",\n",
471
+ " \n",
472
+ " 'few_shot': \"\"\"\n",
473
+ "Siz SOCAR-ın tarixi sənədlər üzrə mütəxəssis köməkçisisiniz.\n",
474
+ "\n",
475
+ "Nümunə:\n",
476
+ "Sual: \"Palçıq vulkanlarının təsir radiusu nə qədərdir?\"\n",
477
+ "Cavab: \"Sahə müşahidələri və modelləşdirmə göstərir ki, palçıq vulkanlarının təsir radiusu təqribən 10 km-dir (PDF: document_06.pdf, Səhifə: 5).\"\n",
478
+ "\n",
479
+ "Kontekst:\n",
480
+ "{context}\n",
481
+ "\n",
482
+ "Sual: {query}\n",
483
+ "\n",
484
+ "Yuxarıdakı nümunə kimi cavab verin - dəqiq, qısa, mənbə ilə.\n",
485
+ "\"\"\"\n",
486
+ "}\n",
487
+ "\n",
488
+ "print(f\"✅ Configured {len(LLM_MODELS)} LLM models\")\n",
489
+ "print(f\"✅ Configured {len(PROMPTING_STRATEGIES)} prompting strategies\")"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": null,
495
+ "metadata": {},
496
+ "outputs": [],
497
+ "source": [
498
+ "def generate_answer(llm_model: str, query: str, documents: List[Dict], \n",
499
+ " prompt_strategy: str = 'baseline',\n",
500
+ " temperature: float = 0.2) -> Tuple[str, float]:\n",
501
+ " \"\"\"\n",
502
+ " Generate answer using LLM with specified prompting strategy.\n",
503
+ " \"\"\"\n",
504
+ " # Build context\n",
505
+ " context_parts = []\n",
506
+ " for i, doc in enumerate(documents, 1):\n",
507
+ " context_parts.append(\n",
508
+ " f\"Sənəd {i} (Mənbə: {doc['pdf_name']}, Səhifə {doc['page_number']}):\\n{doc['content']}\"\n",
509
+ " )\n",
510
+ " context = \"\\n\\n\".join(context_parts)\n",
511
+ " \n",
512
+ " # Get prompt template\n",
513
+ " prompt_template = PROMPTING_STRATEGIES[prompt_strategy]\n",
514
+ " prompt = prompt_template.format(context=context, query=query)\n",
515
+ " \n",
516
+ " try:\n",
517
+ " start_time = time.time()\n",
518
+ " \n",
519
+ " deployment = LLM_MODELS[llm_model]\n",
520
+ " \n",
521
+ " # GPT-5 models use max_completion_tokens, others use max_tokens\n",
522
+ " if deployment.startswith('gpt-5'):\n",
523
+ " response = azure_client.chat.completions.create(\n",
524
+ " model=deployment,\n",
525
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
526
+ " temperature=temperature,\n",
527
+ " max_completion_tokens=1000\n",
528
+ " )\n",
529
+ " else:\n",
530
+ " response = azure_client.chat.completions.create(\n",
531
+ " model=deployment,\n",
532
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
533
+ " temperature=temperature,\n",
534
+ " max_tokens=1000\n",
535
+ " )\n",
536
+ " \n",
537
+ " elapsed = time.time() - start_time\n",
538
+ " answer = response.choices[0].message.content\n",
539
+ " \n",
540
+ " return answer, elapsed\n",
541
+ " \n",
542
+ " except Exception as e:\n",
543
+ " return f\"ERROR: {str(e)}\", 0.0\n",
544
+ "\n",
545
+ "print(\"✅ LLM generation function ready\")"
546
+ ]
547
+ },
548
+ {
549
+ "cell_type": "markdown",
550
+ "metadata": {},
551
+ "source": [
552
+ "## 6. Evaluation Metrics"
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "code",
557
+ "execution_count": null,
558
+ "metadata": {},
559
+ "outputs": [],
560
+ "source": [
561
+ "def normalize_text(text: str) -> str:\n",
562
+ " text = text.lower().strip()\n",
563
+ " text = re.sub(r'\\s+', ' ', text)\n",
564
+ " return text\n",
565
+ "\n",
566
+ "def calculate_answer_quality(reference: str, hypothesis: str) -> Dict[str, float]:\n",
567
+ " \"\"\"Accuracy metrics.\"\"\"\n",
568
+ " ref_norm = normalize_text(reference)\n",
569
+ " hyp_norm = normalize_text(hypothesis)\n",
570
+ " \n",
571
+ " cer_score = cer(ref_norm, hyp_norm) * 100\n",
572
+ " wer_score = wer(ref_norm, hyp_norm) * 100\n",
573
+ " similarity = max(0, 100 - wer_score)\n",
574
+ " \n",
575
+ " return {\n",
576
+ " 'Accuracy_Score': round(similarity, 2)\n",
577
+ " }\n",
578
+ "\n",
579
+ "def evaluate_citation_quality(answer: str, documents: List[Dict]) -> Dict[str, float]:\n",
580
+ "    \"\"\"Relevance - citation quality.\"\"\"\n",
581
+ "    # Guard: threshold-based retrieval (e.g. 'threshold_0.7') can return zero\n",
582
+ "    # documents, which would otherwise raise ZeroDivisionError below.\n",
583
+ "    if not documents:\n",
584
+ "        return {'Citation_Score': 0.0, 'Cited_PDFs': 0, 'Cited_Pages': 0}\n",
585
+ "\n",
586
+ "    pdf_names = [doc['pdf_name'].replace('.pdf', '') for doc in documents]\n",
587
+ "    page_numbers = [str(doc['page_number']) for doc in documents]\n",
588
+ "\n",
589
+ "    cited_pdfs = sum(1 for pdf in pdf_names if pdf in answer)\n",
590
+ "    cited_pages = sum(1 for page in page_numbers if page in answer)\n",
591
+ "\n",
592
+ "    citation_keywords = ['mənbə', 'sənəd', 'səhifə', 'pdf', 'document', 'page']\n",
593
+ "    has_citation_format = any(kw in answer.lower() for kw in citation_keywords)\n",
594
+ "\n",
595
+ "    # Weighting: 40% PDF-name citations + 40% page citations + 20% for any citation formatting\n",
596
+ "    citation_score = (\n",
597
+ "        (cited_pdfs / len(pdf_names) * 40) +\n",
598
+ "        (cited_pages / len(page_numbers) * 40) +\n",
599
+ "        (20 if has_citation_format else 0)\n",
600
+ "    )\n",
601
+ "\n",
602
+ "    return {\n",
603
+ "        'Citation_Score': round(citation_score, 2),\n",
604
+ "        'Cited_PDFs': cited_pdfs,\n",
605
+ "        'Cited_Pages': cited_pages\n",
606
+ "    }\n",
601
+ "\n",
602
+ "def evaluate_retrieval_quality(query: str, documents: List[Dict], expected_answer: str) -> Dict[str, float]:\n",
603
+ " \"\"\"Measure if retrieved docs are relevant to answer.\"\"\"\n",
604
+ " if not documents or not expected_answer:\n",
605
+ " return {'Retrieval_Relevance': 0.0}\n",
606
+ " \n",
607
+ " # Simple heuristic: check if expected answer words appear in retrieved docs\n",
608
+ " expected_words = set(normalize_text(expected_answer).split())\n",
609
+ " retrieved_text = ' '.join([doc['content'] for doc in documents])\n",
610
+ " retrieved_words = set(normalize_text(retrieved_text).split())\n",
611
+ " \n",
612
+ " overlap = len(expected_words & retrieved_words) / len(expected_words) if expected_words else 0\n",
613
+ " \n",
614
+ " return {\n",
615
+ " 'Retrieval_Relevance': round(overlap * 100, 2)\n",
616
+ " }\n",
617
+ "\n",
618
+ "def evaluate_completeness(answer: str) -> Dict[str, float]:\n",
619
+ " \"\"\"Completeness metrics.\"\"\"\n",
620
+ " word_count = len(answer.split())\n",
621
+ " \n",
622
+ " if word_count < 20:\n",
623
+ " completeness = (word_count / 20) * 100\n",
624
+ " elif word_count > 200:\n",
625
+ " completeness = 100 - ((word_count - 200) / 200 * 20)\n",
626
+ " else:\n",
627
+ " completeness = 100\n",
628
+ " \n",
629
+ " return {\n",
630
+ " 'Completeness_Score': round(max(0, completeness), 2),\n",
631
+ " 'Word_Count': word_count\n",
632
+ " }\n",
633
+ "\n",
634
+ "def calculate_llm_judge_score(accuracy: float, citation: float, completeness: float) -> float:\n",
635
+ " \"\"\"Overall LLM Judge score (weighted).\"\"\"\n",
636
+ " return round(\n",
637
+ " accuracy * 0.35 +\n",
638
+ " citation * 0.35 +\n",
639
+ " completeness * 0.30,\n",
640
+ " 2\n",
641
+ " )\n",
642
+ "\n",
643
+ "print(\"✅ Evaluation metrics ready\")"
644
+ ]
645
+ },
646
+ {
647
+ "cell_type": "markdown",
648
+ "metadata": {},
649
+ "source": [
650
+ "## 7. Run Comprehensive Benchmark"
651
+ ]
652
+ },
653
+ {
654
+ "cell_type": "code",
655
+ "execution_count": null,
656
+ "metadata": {},
657
+ "outputs": [],
658
+ "source": [
659
+ "# Configuration: Select what to test\n",
660
+ "CONFIGS_TO_TEST = [\n",
661
+ " # Format: (embed_model, retrieval_strategy, llm_model, prompt_strategy)\n",
662
+ " \n",
663
+ " # Baseline (current setup)\n",
664
+ " ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'baseline'),\n",
665
+ " \n",
666
+ " # Test different embedding models\n",
667
+ " ('multilingual-e5-large', 'vanilla_k3', 'Llama-4-Maverick', 'baseline'),\n",
668
+ " \n",
669
+ " # Test different retrieval strategies\n",
670
+ " ('bge-large-en', 'vanilla_k5', 'Llama-4-Maverick', 'baseline'),\n",
671
+ " ('bge-large-en', 'mmr_balanced', 'Llama-4-Maverick', 'baseline'),\n",
672
+ " ('bge-large-en', 'reranked_k3', 'Llama-4-Maverick', 'baseline'),\n",
673
+ " \n",
674
+ " # Test different LLM models\n",
675
+ " ('bge-large-en', 'vanilla_k3', 'GPT-5-mini', 'baseline'),\n",
676
+ " ('bge-large-en', 'vanilla_k3', 'Claude-Sonnet-4.5', 'baseline'),\n",
677
+ " \n",
678
+ " # Test different prompting strategies\n",
679
+ " ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'citation_focused'),\n",
680
+ " ('bge-large-en', 'vanilla_k3', 'Llama-4-Maverick', 'few_shot'),\n",
681
+ " \n",
682
+ " # Best combinations\n",
683
+ " ('bge-large-en', 'reranked_k3', 'GPT-5-mini', 'citation_focused'),\n",
684
+ " ('bge-large-en', 'mmr_balanced', 'Claude-Sonnet-4.5', 'citation_focused'),\n",
685
+ "]\n",
686
+ "\n",
687
+ "print(f\"Testing {len(CONFIGS_TO_TEST)} configurations on {len(questions)} questions\")\n",
688
+ "print(f\"Total API calls: ~{len(CONFIGS_TO_TEST) * len(questions)}\")\n",
689
+ "print(\"This will take 15-30 minutes...\\n\")"
690
+ ]
691
+ },
692
+ {
693
+ "cell_type": "code",
694
+ "execution_count": null,
695
+ "metadata": {},
696
+ "outputs": [],
697
+ "source": [
698
+ "# Run benchmark\n",
699
+ "results = []\n",
700
+ "\n",
701
+ "for config_idx, (embed_key, retrieval_key, llm_key, prompt_key) in enumerate(CONFIGS_TO_TEST, 1):\n",
702
+ " config_name = f\"{embed_key}_{retrieval_key}_{llm_key}_{prompt_key}\"\n",
703
+ " \n",
704
+ " print(f\"\\n{'='*100}\")\n",
705
+ " print(f\"Config {config_idx}/{len(CONFIGS_TO_TEST)}: {config_name}\")\n",
706
+ " print(f\"{'='*100}\")\n",
707
+ " \n",
708
+ " # Get components\n",
709
+ " embed_model = embedding_cache[embed_key]\n",
710
+ " retrieval_func = RETRIEVAL_STRATEGIES[retrieval_key]['func']\n",
711
+ " retrieval_params = RETRIEVAL_STRATEGIES[retrieval_key]['params']\n",
712
+ " \n",
713
+ " config_results = []\n",
714
+ " \n",
715
+ " for example_key, messages in questions.items():\n",
716
+ " user_msg = [m for m in messages if m['role'] == 'user'][-1]\n",
717
+ " query = user_msg['content']\n",
718
+ " \n",
719
+ " print(f\"\\n {example_key}: {query[:60]}...\")\n",
720
+ " \n",
721
+ " # Retrieve documents\n",
722
+ " documents = retrieval_func(query, embed_model, **retrieval_params)\n",
723
+ " print(f\" Retrieved {len(documents)} docs\")\n",
724
+ " \n",
725
+ " # Generate answer\n",
726
+ " answer, response_time = generate_answer(llm_key, query, documents, prompt_key)\n",
727
+ " \n",
728
+ " if answer.startswith('ERROR'):\n",
729
+ " print(f\" ❌ {answer}\")\n",
730
+ " continue\n",
731
+ " \n",
732
+ " print(f\" ✅ Generated in {response_time:.2f}s\")\n",
733
+ " \n",
734
+ " # Evaluate\n",
735
+ " expected = expected_answers.get(example_key, {}).get('Answer', '')\n",
736
+ " \n",
737
+ " accuracy_metrics = calculate_answer_quality(expected, answer) if expected else {'Accuracy_Score': 0}\n",
738
+ " citation_metrics = evaluate_citation_quality(answer, documents)\n",
739
+ " retrieval_metrics = evaluate_retrieval_quality(query, documents, expected)\n",
740
+ " completeness_metrics = evaluate_completeness(answer)\n",
741
+ " \n",
742
+ " # Calculate overall score\n",
743
+ " llm_judge_score = calculate_llm_judge_score(\n",
744
+ " accuracy_metrics['Accuracy_Score'],\n",
745
+ " citation_metrics['Citation_Score'],\n",
746
+ " completeness_metrics['Completeness_Score']\n",
747
+ " )\n",
748
+ " \n",
749
+ " result = {\n",
750
+ " 'Config': config_name,\n",
751
+ " 'Embedding_Model': embed_key,\n",
752
+ " 'Retrieval_Strategy': retrieval_key,\n",
753
+ " 'LLM_Model': llm_key,\n",
754
+ " 'Prompt_Strategy': prompt_key,\n",
755
+ " 'Question': example_key,\n",
756
+ " 'Query': query[:80],\n",
757
+ " 'Num_Docs_Retrieved': len(documents),\n",
758
+ " 'Response_Time': round(response_time, 2),\n",
759
+ " 'LLM_Judge_Score': llm_judge_score,\n",
760
+ " **accuracy_metrics,\n",
761
+ " **citation_metrics,\n",
762
+ " **retrieval_metrics,\n",
763
+ " **completeness_metrics,\n",
764
+ " 'Answer_Preview': answer[:150]\n",
765
+ " }\n",
766
+ " \n",
767
+ " results.append(result)\n",
768
+ " config_results.append(result)\n",
769
+ " \n",
770
+ " # Show config summary\n",
771
+ " if config_results:\n",
772
+ " avg_score = sum(r['LLM_Judge_Score'] for r in config_results) / len(config_results)\n",
773
+ " avg_time = sum(r['Response_Time'] for r in config_results) / len(config_results)\n",
774
+ " print(f\"\\n 📊 Config Summary:\")\n",
775
+ " print(f\" Avg LLM Judge Score: {avg_score:.2f}%\")\n",
776
+ " print(f\" Avg Response Time: {avg_time:.2f}s\")\n",
777
+ "\n",
778
+ "print(f\"\\n{'='*100}\")\n",
779
+ "print(\"✅ Comprehensive benchmark complete!\")\n",
780
+ "print(f\"{'='*100}\")"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "markdown",
785
+ "metadata": {},
786
+ "source": [
787
+ "## 8. Analyze Results"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "code",
792
+ "execution_count": null,
793
+ "metadata": {},
794
+ "outputs": [],
795
+ "source": [
796
+ "# Create DataFrame\n",
797
+ "df = pd.DataFrame(results)\n",
798
+ "\n",
799
+ "# Aggregate by configuration\n",
800
+ "config_summary = df.groupby('Config').agg({\n",
801
+ " 'LLM_Judge_Score': 'mean',\n",
802
+ " 'Accuracy_Score': 'mean',\n",
803
+ " 'Citation_Score': 'mean',\n",
804
+ " 'Retrieval_Relevance': 'mean',\n",
805
+ " 'Completeness_Score': 'mean',\n",
806
+ " 'Response_Time': 'mean',\n",
807
+ " 'Embedding_Model': 'first',\n",
808
+ " 'Retrieval_Strategy': 'first',\n",
809
+ " 'LLM_Model': 'first',\n",
810
+ " 'Prompt_Strategy': 'first'\n",
811
+ "}).round(2)\n",
812
+ "\n",
813
+ "# Sort by LLM Judge Score\n",
814
+ "config_summary = config_summary.sort_values('LLM_Judge_Score', ascending=False)\n",
815
+ "\n",
816
+ "print(\"\\n\" + \"=\"*120)\n",
817
+ "print(\"📊 CONFIGURATION RANKINGS (By LLM Judge Score)\")\n",
818
+ "print(\"=\"*120)\n",
819
+ "display_cols = ['Embedding_Model', 'Retrieval_Strategy', 'LLM_Model', 'Prompt_Strategy', \n",
820
+ " 'LLM_Judge_Score', 'Accuracy_Score', 'Citation_Score', 'Response_Time']\n",
821
+ "print(config_summary[display_cols].to_string())\n",
822
+ "print(\"=\"*120)"
823
+ ]
824
+ },
825
+ {
826
+ "cell_type": "markdown",
827
+ "metadata": {},
828
+ "source": [
829
+ "## 9. Component Analysis"
830
+ ]
831
+ },
832
+ {
833
+ "cell_type": "code",
834
+ "execution_count": null,
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": [
838
+ "# Analyze impact of each component\n",
839
+ "print(\"\\n\" + \"=\"*100)\n",
840
+ "print(\"🔍 COMPONENT IMPACT ANALYSIS\")\n",
841
+ "print(\"=\"*100)\n",
842
+ "\n",
843
+ "# 1. Embedding Models\n",
844
+ "print(\"\\n📚 EMBEDDING MODELS:\")\n",
845
+ "embed_impact = df.groupby('Embedding_Model')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
846
+ "for model, score in embed_impact.items():\n",
847
+ " print(f\" {model}: {score:.2f}%\")\n",
848
+ "\n",
849
+ "# 2. Retrieval Strategies\n",
850
+ "print(\"\\n🔎 RETRIEVAL STRATEGIES:\")\n",
851
+ "retrieval_impact = df.groupby('Retrieval_Strategy')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
852
+ "for strategy, score in retrieval_impact.items():\n",
853
+ " notes = RETRIEVAL_STRATEGIES[strategy]['notes']\n",
854
+ " print(f\" {strategy}: {score:.2f}% ({notes})\")\n",
855
+ "\n",
856
+ "# 3. LLM Models\n",
857
+ "print(\"\\n🤖 LLM MODELS:\")\n",
858
+ "llm_impact = df.groupby('LLM_Model')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
859
+ "for model, score in llm_impact.items():\n",
860
+ " print(f\" {model}: {score:.2f}%\")\n",
861
+ "\n",
862
+ "# 4. Prompting Strategies\n",
863
+ "print(\"\\n💬 PROMPTING STRATEGIES:\")\n",
864
+ "prompt_impact = df.groupby('Prompt_Strategy')['LLM_Judge_Score'].mean().sort_values(ascending=False)\n",
865
+ "for strategy, score in prompt_impact.items():\n",
866
+ " print(f\" {strategy}: {score:.2f}%\")\n",
867
+ "\n",
868
+ "print(\"\\n\" + \"=\"*100)"
869
+ ]
870
+ },
871
+ {
872
+ "cell_type": "markdown",
873
+ "metadata": {},
874
+ "source": [
875
+ "## 10. Visualizations"
876
+ ]
954
+ },
955
+ {
956
+ "cell_type": "code",
957
+ "execution_count": null,
958
+ "metadata": {},
959
+ "outputs": [],
960
+ "source": "import os\nfrom pathlib import Path\n\n# Create output directory - using dynamic path\noutput_dir = OUTPUT_DIR / 'rag_optimization_benchmark'\noutput_dir.mkdir(parents=True, exist_ok=True)\n\nfig, axes = plt.subplots(2, 3, figsize=(20, 12))\n\n# 1. Top Configurations\nax1 = axes[0, 0]\ntop_configs = config_summary.head(10)\nconfig_labels = [c.split('_')[-2] + '+' + c.split('_')[-1] for c in top_configs.index]\nax1.barh(config_labels, top_configs['LLM_Judge_Score'], color=sns.color_palette('viridis', len(top_configs)))\nax1.set_xlabel('LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax1.set_title('Top 10 Configurations', fontsize=13, fontweight='bold')\nax1.set_xlim(0, 100)\nfor i, score in enumerate(top_configs['LLM_Judge_Score']):\n ax1.text(score + 1, i, f'{score:.1f}', va='center', fontsize=10)\n\n# 2. Embedding Model Impact\nax2 = axes[0, 1]\nax2.bar(embed_impact.index, embed_impact.values, color='skyblue', alpha=0.8)\nax2.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax2.set_title('Embedding Model Impact', fontsize=13, fontweight='bold')\nax2.set_ylim(0, 100)\nax2.tick_params(axis='x', rotation=45)\nfor i, (model, score) in enumerate(embed_impact.items()):\n ax2.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 3. Retrieval Strategy Impact\nax3 = axes[0, 2]\nax3.bar(retrieval_impact.index, retrieval_impact.values, color='coral', alpha=0.8)\nax3.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax3.set_title('Retrieval Strategy Impact', fontsize=13, fontweight='bold')\nax3.set_ylim(0, 100)\nax3.tick_params(axis='x', rotation=45)\nfor i, (strategy, score) in enumerate(retrieval_impact.items()):\n ax3.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=9)\n\n# 4. 
LLM Model Impact\nax4 = axes[1, 0]\nax4.bar(llm_impact.index, llm_impact.values, color='mediumseagreen', alpha=0.8)\nax4.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax4.set_title('LLM Model Impact', fontsize=13, fontweight='bold')\nax4.set_ylim(0, 100)\nax4.tick_params(axis='x', rotation=45)\nfor i, (model, score) in enumerate(llm_impact.items()):\n ax4.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 5. Prompting Strategy Impact\nax5 = axes[1, 1]\nax5.bar(prompt_impact.index, prompt_impact.values, color='mediumpurple', alpha=0.8)\nax5.set_ylabel('Avg LLM Judge Score (%)', fontsize=11, fontweight='bold')\nax5.set_title('Prompting Strategy Impact', fontsize=13, fontweight='bold')\nax5.set_ylim(0, 100)\nax5.tick_params(axis='x', rotation=45)\nfor i, (strategy, score) in enumerate(prompt_impact.items()):\n ax5.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=10)\n\n# 6. Score Components (best config)\nax6 = axes[1, 2]\nbest_config = config_summary.iloc[0]\ncomponents = ['Accuracy', 'Citation', 'Completeness']\nscores = [best_config['Accuracy_Score'], best_config['Citation_Score'], best_config['Completeness_Score']]\ncolors_comp = ['#FF6B6B', '#4ECDC4', '#45B7D1']\nbars = ax6.bar(components, scores, color=colors_comp, alpha=0.8)\nax6.set_ylabel('Score (%)', fontsize=11, fontweight='bold')\nax6.set_title(f'Best Config Components\\n{best_config.name.split(\"_\")[2]}', fontsize=13, fontweight='bold')\nax6.set_ylim(0, 100)\nfor i, score in enumerate(scores):\n ax6.text(i, score + 2, f'{score:.1f}%', ha='center', fontsize=10, fontweight='bold')\n\nplt.tight_layout()\nplt.savefig(output_dir / 'results.png', dpi=300, bbox_inches='tight')\nplt.show()\n\nprint(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
961
+ },
962
+ {
963
+ "cell_type": "code",
964
+ "execution_count": null,
965
+ "metadata": {},
966
+ "outputs": [],
967
+ "source": [
968
+ "best_config = config_summary.iloc[0]\n",
969
+ "\n",
970
+ "print(\"\\n\" + \"=\"*100)\n",
971
+ "print(\"🏆 OPTIMAL RAG CONFIGURATION\")\n",
972
+ "print(\"=\"*100)\n",
973
+ "\n",
974
+ "print(f\"\\n✅ Best Configuration: {best_config.name}\")\n",
975
+ "print(f\"\\n📊 Performance:\")\n",
976
+ "print(f\" LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%\")\n",
977
+ "print(f\" Accuracy: {best_config['Accuracy_Score']:.2f}%\")\n",
978
+ "print(f\" Citation Quality: {best_config['Citation_Score']:.2f}%\")\n",
979
+ "print(f\" Completeness: {best_config['Completeness_Score']:.2f}%\")\n",
980
+ "print(f\" Avg Response Time: {best_config['Response_Time']:.2f}s\")\n",
981
+ "\n",
982
+ "print(f\"\\n⚙️ Components:\")\n",
983
+ "print(f\" Embedding Model: {best_config['Embedding_Model']}\")\n",
984
+ "print(f\" → {EMBEDDING_MODELS[best_config['Embedding_Model']]['name']}\")\n",
985
+ "print(f\" Retrieval Strategy: {best_config['Retrieval_Strategy']}\")\n",
986
+ "print(f\" → {RETRIEVAL_STRATEGIES[best_config['Retrieval_Strategy']]['notes']}\")\n",
987
+ "print(f\" LLM Model: {best_config['LLM_Model']}\")\n",
988
+ "print(f\" Prompting Strategy: {best_config['Prompt_Strategy']}\")\n",
989
+ "\n",
990
+ "print(f\"\\n💡 Key Findings:\")\n",
991
+ "print(f\" 1. Best Embedding: {embed_impact.index[0]} ({embed_impact.values[0]:.2f}%)\")\n",
992
+ "print(f\" 2. Best Retrieval: {retrieval_impact.index[0]} ({retrieval_impact.values[0]:.2f}%)\")\n",
993
+ "print(f\" 3. Best LLM: {llm_impact.index[0]} ({llm_impact.values[0]:.2f}%)\")\n",
994
+ "print(f\" 4. Best Prompt: {prompt_impact.index[0]} ({prompt_impact.values[0]:.2f}%)\")\n",
995
+ "\n",
996
+ "print(f\"\\n🎯 Hackathon Impact:\")\n",
997
+ "print(f\" LLM Quality = 30% of total score\")\n",
998
+ "print(f\" Your score: {best_config['LLM_Judge_Score']:.2f}% × 30% = {best_config['LLM_Judge_Score'] * 0.3:.2f} points\")\n",
999
+ "\n",
1000
+ "baseline = df[df['Config'].str.contains('baseline')].iloc[0] if len(df[df['Config'].str.contains('baseline')]) > 0 else None\n",
1001
+ "if baseline is not None:\n",
1002
+ " improvement = best_config['LLM_Judge_Score'] - baseline['LLM_Judge_Score']\n",
1003
+ " print(f\"\\n📈 Improvement vs Baseline:\")\n",
1004
+ " print(f\" +{improvement:.2f}% quality improvement\")\n",
1005
+ " print(f\" = +{improvement * 0.3:.2f} hackathon points\")\n",
1006
+ "\n",
1007
+ "print(\"\\n\" + \"=\"*100)\n",
1008
+ "print(\"📝 IMPLEMENTATION CHECKLIST\")\n",
1009
+ "print(\"=\"*100)\n",
1010
+ "print(f\"\\n1. Use embedding model: {EMBEDDING_MODELS[best_config['Embedding_Model']]['name']}\")\n",
1011
+ "print(f\"2. Implement retrieval: {best_config['Retrieval_Strategy']}\")\n",
1012
+ "print(f\"3. Use LLM model: {best_config['LLM_Model']}\")\n",
1013
+ "print(f\"4. Apply prompt: {best_config['Prompt_Strategy']}\")\n",
1014
+ "print(f\"\\n5. Expected performance:\")\n",
1015
+ "print(f\" - LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%\")\n",
1016
+ "print(f\" - Response time: ~{best_config['Response_Time']:.1f}s\")\n",
1017
+ "print(\"=\"*100)"
1018
+ ]
1019
+ },
1020
+ {
1021
+ "cell_type": "code",
+ "execution_count": null,
1022
+ "metadata": {},
+ "outputs": [],
1023
+ "source": [
1024
+ "# Save results\n",
1025
+ "from pathlib import Path\n",
1026
+ "\n",
1027
+ "# Using dynamic path\n",
1028
+ "output_dir = OUTPUT_DIR / 'rag_optimization_benchmark'\n",
1029
+ "output_dir.mkdir(parents=True, exist_ok=True)\n",
1030
+ "\n",
1031
+ "df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
1032
+ "config_summary.to_csv(output_dir / 'summary.csv', encoding='utf-8')\n",
1033
+ "\n",
1034
+ "# Save component impacts\n",
1035
+ "impacts = pd.DataFrame({\n",
1036
+ " 'Embedding_Impact': embed_impact,\n",
1037
+ " 'Retrieval_Impact': retrieval_impact.reindex(embed_impact.index, fill_value=0),\n",
1038
+ " 'LLM_Impact': llm_impact.reindex(embed_impact.index, fill_value=0),\n",
1039
+ " 'Prompt_Impact': prompt_impact.reindex(embed_impact.index, fill_value=0)\n",
1040
+ "}).fillna(0)\n",
1041
+ "impacts.to_csv(output_dir / 'component_impacts.csv', encoding='utf-8')\n",
1042
+ "\n",
1043
+ "print(\"\\n✅ Results exported to output/rag_optimization_benchmark/:\")\n",
1044
+ "print(\" - detailed_results.csv (all tests)\")\n",
1045
+ "print(\" - summary.csv (config rankings)\")\n",
1046
+ "print(\" - component_impacts.csv (component analysis)\")\n",
1047
+ "print(\" - results.png (visualizations)\")"
1048
+ ]
1049
+ }
1050
+ ],
1051
+ "metadata": {
1052
+ "kernelspec": {
1053
+ "display_name": "venv",
1054
+ "language": "python",
1055
+ "name": "python3"
1056
+ },
1057
+ "language_info": {
1058
+ "codemirror_mode": {
1059
+ "name": "ipython",
1060
+ "version": 3
1061
+ },
1062
+ "file_extension": ".py",
1063
+ "mimetype": "text/x-python",
1064
+ "name": "python",
1065
+ "nbconvert_exporter": "python",
1066
+ "pygments_lexer": "ipython3",
1067
+ "version": "3.10.12"
1068
+ }
1069
+ },
1070
+ "nbformat": 4,
1071
+ "nbformat_minor": 4
1072
+ }
notebooks/vlm_ocr_benchmark.ipynb CHANGED
@@ -31,7 +31,7 @@
31
  },
32
  {
33
  "cell_type": "code",
34
- "execution_count": 15,
35
  "metadata": {},
36
  "outputs": [],
37
  "source": [
@@ -41,7 +41,7 @@
41
  },
42
  {
43
  "cell_type": "code",
44
- "execution_count": 16,
45
  "metadata": {},
46
  "outputs": [
47
  {
@@ -81,15 +81,58 @@
81
  ]
82
  },
83
  {
84
- "cell_type": "markdown",
 
85
  "metadata": {},
 
 
 
 
 
 
 
 
 
 
 
 
86
  "source": [
87
- "## 1. Load Ground Truth"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  ]
89
  },
90
  {
91
  "cell_type": "code",
92
- "execution_count": 17,
93
  "metadata": {},
94
  "outputs": [
95
  {
@@ -122,22 +165,15 @@
122
  " \n",
123
  " return text.strip()\n",
124
  "\n",
125
- "# Load ground truth\n",
126
- "ground_truth = load_ground_truth('data/document_00.md')\n",
127
  "print(f\"✅ Ground truth loaded: {len(ground_truth)} characters\")\n",
128
  "print(f\"Preview:\\n{ground_truth[:300]}...\")"
129
  ]
130
  },
131
- {
132
- "cell_type": "markdown",
133
- "metadata": {},
134
- "source": [
135
- "## 2. PDF to Image Conversion"
136
- ]
137
- },
138
  {
139
  "cell_type": "code",
140
- "execution_count": 18,
141
  "metadata": {},
142
  "outputs": [
143
  {
@@ -181,23 +217,16 @@
181
  " image.save(buffered, format=format)\n",
182
  " return base64.b64encode(buffered.getvalue()).decode('utf-8')\n",
183
  "\n",
184
- "# Test conversion\n",
185
- "pdf_path = 'data/pdfs/document_00.pdf'\n",
186
  "test_images = pdf_to_images(pdf_path)\n",
187
  "print(f\"\\n✅ Converted PDF to {len(test_images)} images\")\n",
188
  "print(f\"First image size: {test_images[0].size}\")"
189
  ]
190
  },
191
- {
192
- "cell_type": "markdown",
193
- "metadata": {},
194
- "source": [
195
- "## 3. Vision-Language Model Client"
196
- ]
197
- },
198
  {
199
  "cell_type": "code",
200
- "execution_count": 19,
201
  "metadata": {},
202
  "outputs": [
203
  {
@@ -261,7 +290,7 @@
261
  },
262
  {
263
  "cell_type": "code",
264
- "execution_count": 20,
265
  "metadata": {},
266
  "outputs": [
267
  {
@@ -370,68 +399,6 @@
370
  "## 4. Metrics Calculation"
371
  ]
372
  },
373
- {
374
- "cell_type": "code",
375
- "execution_count": 21,
376
- "metadata": {},
377
- "outputs": [
378
- {
379
- "name": "stdout",
380
- "output_type": "stream",
381
- "text": [
382
- "✅ Metrics functions ready\n"
383
- ]
384
- }
385
- ],
386
- "source": [
387
- "def normalize_text(text: str) -> str:\n",
388
- " \"\"\"Normalize text for comparison.\"\"\"\n",
389
- " text = text.lower().strip()\n",
390
- " text = re.sub(r'\\s+', ' ', text)\n",
391
- " return text\n",
392
- "\n",
393
- "def calculate_ocr_metrics(reference: str, hypothesis: str) -> Dict[str, float]:\n",
394
- " \"\"\"\n",
395
- " Calculate comprehensive OCR metrics.\n",
396
- " \"\"\"\n",
397
- " ref_norm = normalize_text(reference)\n",
398
- " hyp_norm = normalize_text(hypothesis)\n",
399
- " \n",
400
- " # Character Error Rate\n",
401
- " cer_score = cer(ref_norm, hyp_norm) * 100\n",
402
- " \n",
403
- " # Word Error Rate\n",
404
- " wer_score = wer(ref_norm, hyp_norm) * 100\n",
405
- " \n",
406
- " # Success rates\n",
407
- " csr_score = max(0, 100 - cer_score)\n",
408
- " wsr_score = max(0, 100 - wer_score)\n",
409
- " \n",
410
- " # Length metrics\n",
411
- " ref_chars = len(ref_norm)\n",
412
- " hyp_chars = len(hyp_norm)\n",
413
- " ref_words = len(ref_norm.split())\n",
414
- " hyp_words = len(hyp_norm.split())\n",
415
- " \n",
416
- " char_length_acc = (min(ref_chars, hyp_chars) / max(ref_chars, hyp_chars) * 100) if max(ref_chars, hyp_chars) > 0 else 0\n",
417
- " word_length_acc = (min(ref_words, hyp_words) / max(ref_words, hyp_words) * 100) if max(ref_words, hyp_words) > 0 else 0\n",
418
- " \n",
419
- " return {\n",
420
- " 'CER': round(cer_score, 2),\n",
421
- " 'WER': round(wer_score, 2),\n",
422
- " 'CSR': round(csr_score, 2),\n",
423
- " 'WSR': round(wsr_score, 2),\n",
424
- " 'Char_Count_Ref': ref_chars,\n",
425
- " 'Char_Count_Hyp': hyp_chars,\n",
426
- " 'Word_Count_Ref': ref_words,\n",
427
- " 'Word_Count_Hyp': hyp_words,\n",
428
- " 'Char_Length_Accuracy': round(char_length_acc, 2),\n",
429
- " 'Word_Length_Accuracy': round(word_length_acc, 2)\n",
430
- " }\n",
431
- "\n",
432
- "print(\"✅ Metrics functions ready\")"
433
- ]
434
- },
435
  {
436
  "cell_type": "markdown",
437
  "metadata": {},
@@ -441,7 +408,7 @@
441
  },
442
  {
443
  "cell_type": "code",
444
- "execution_count": 22,
445
  "metadata": {},
446
  "outputs": [
447
  {
@@ -471,7 +438,7 @@
471
  },
472
  {
473
  "cell_type": "code",
474
- "execution_count": null,
475
  "metadata": {},
476
  "outputs": [
477
  {
@@ -486,9 +453,68 @@
486
  "Notes: Excellent OCR\n",
487
  "================================================================================\n",
488
  " Page 1/12: 9.2s\n",
489
- " Page 2/12: 9.5s\n",
490
- " Page 3/12: 10.9s\n",
491
- " Page 4/12: 10.7s\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  ]
493
  }
494
  ],
@@ -552,7 +578,7 @@
552
  },
553
  {
554
  "cell_type": "code",
555
- "execution_count": null,
556
  "metadata": {},
557
  "outputs": [
558
  {
@@ -564,7 +590,7 @@
564
  "📊 VLM OCR BENCHMARKING RESULTS\n",
565
  "====================================================================================================\n",
566
  " Model CSR WSR CER WER Response_Time Rating\n",
567
- "GPT-4.1 85.86 67.61 14.14 32.39 133.49 ⭐⭐⭐⭐⭐\n",
568
  "====================================================================================================\n"
569
  ]
570
  }
@@ -586,13 +612,6 @@
586
  "print(\"=\"*100)"
587
  ]
588
  },
589
- {
590
- "cell_type": "markdown",
591
- "metadata": {},
592
- "source": [
593
- "## 7. Visualizations"
594
- ]
595
- },
596
  {
597
  "cell_type": "code",
598
  "execution_count": null,
@@ -603,8 +622,8 @@
603
  "import os\n",
604
  "from pathlib import Path\n",
605
  "\n",
606
- "# Create output directory\n",
607
- "output_dir = Path('output/vlm_ocr_benchmark')\n",
608
  "output_dir.mkdir(parents=True, exist_ok=True)\n",
609
  "\n",
610
  "fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
@@ -662,133 +681,6 @@
662
  "print(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
663
  ]
664
  },
665
- {
666
- "cell_type": "markdown",
667
- "metadata": {},
668
- "source": [
669
- "## 8. Winner Analysis and Recommendations"
670
- ]
671
- },
672
- {
673
- "cell_type": "code",
674
- "execution_count": null,
675
- "metadata": {},
676
- "outputs": [
677
- {
678
- "name": "stdout",
679
- "output_type": "stream",
680
- "text": [
681
- "\n",
682
- "====================================================================================================\n",
683
- "🏆 FINAL RANKINGS\n",
684
- "====================================================================================================\n",
685
- " Rank Model CSR WSR CER WER Response_Time Rating\n",
686
- " 1 GPT-4.1 85.86 67.61 14.14 32.39 133.49 ⭐⭐⭐⭐⭐\n",
687
- "====================================================================================================\n",
688
- "\n",
689
- "====================================================================================================\n",
690
- "💡 RECOMMENDATIONS FOR HACKATHON\n",
691
- "====================================================================================================\n",
692
- "\n",
693
- "🥇 BEST OVERALL: GPT-4.1 ⭐⭐⭐⭐⭐\n",
694
- " CSR: 85.86% (Character Success)\n",
695
- " WSR: 67.61% (Word Success)\n",
696
- " CER: 14.14% (Character Error)\n",
697
- " WER: 32.39% (Word Error)\n",
698
- " Time: 133.49s for 12 pages\n",
699
- " Notes: Excellent OCR\n",
700
- "\n",
701
- "⚡ FASTEST: GPT-4.1\n",
702
- " Time: 133.49s\n",
703
- " CSR: 85.86%\n",
704
- "\n",
705
- "====================================================================================================\n",
706
- "📝 HACKATHON SCORING IMPACT\n",
707
- "====================================================================================================\n",
708
- "\n",
709
- "OCR Quality = 50% of total hackathon score\n",
710
- "\n",
711
- "Using GPT-4.1:\n",
712
- " - CSR: 85.86% × 50% = 42.93 points\n",
713
- " - This is 85.9% accuracy on character-level OCR\n",
714
- "\n",
715
- "⚠️ GOOD - Consider optimizing prompt or trying other models\n",
716
- "\n",
717
- "====================================================================================================\n",
718
- "🎯 FINAL RECOMMENDATION\n",
719
- "====================================================================================================\n",
720
- "\n",
721
- "Use: GPT-4.1\n",
722
- "Reason: Highest accuracy (85.86% CSR) for hackathon OCR benchmark\n",
723
- "Implementation: Use vision API with same prompt as tested\n",
724
- "====================================================================================================\n"
725
- ]
726
- }
727
- ],
728
- "source": [
729
- "# Rankings\n",
730
- "rankings = df[['Model', 'CSR', 'WSR', 'CER', 'WER', 'Response_Time', 'Rating']].copy()\n",
731
- "rankings.insert(0, 'Rank', range(1, len(rankings) + 1))\n",
732
- "\n",
733
- "print(\"\\n\" + \"=\"*100)\n",
734
- "print(\"🏆 FINAL RANKINGS\")\n",
735
- "print(\"=\"*100)\n",
736
- "print(rankings.to_string(index=False))\n",
737
- "print(\"=\"*100)\n",
738
- "\n",
739
- "# Winner\n",
740
- "best_model = df.iloc[0]\n",
741
- "fastest_model = df.loc[df['Response_Time'].idxmin()]\n",
742
- "\n",
743
- "print(\"\\n\" + \"=\"*100)\n",
744
- "print(\"💡 RECOMMENDATIONS FOR HACKATHON\")\n",
745
- "print(\"=\"*100)\n",
746
- "\n",
747
- "print(f\"\\n🥇 BEST OVERALL: {best_model['Model']} {best_model['Rating']}\")\n",
748
- "print(f\" CSR: {best_model['CSR']:.2f}% (Character Success)\")\n",
749
- "print(f\" WSR: {best_model['WSR']:.2f}% (Word Success)\")\n",
750
- "print(f\" CER: {best_model['CER']:.2f}% (Character Error)\")\n",
751
- "print(f\" WER: {best_model['WER']:.2f}% (Word Error)\")\n",
752
- "print(f\" Time: {best_model['Response_Time']:.2f}s for {len(images)} pages\")\n",
753
- "print(f\" Notes: {best_model['Notes']}\")\n",
754
- "\n",
755
- "print(f\"\\n⚡ FASTEST: {fastest_model['Model']}\")\n",
756
- "print(f\" Time: {fastest_model['Response_Time']:.2f}s\")\n",
757
- "print(f\" CSR: {fastest_model['CSR']:.2f}%\")\n",
758
- "\n",
759
- "print(\"\\n\" + \"=\"*100)\n",
760
- "print(\"📝 HACKATHON SCORING IMPACT\")\n",
761
- "print(\"=\"*100)\n",
762
- "print(\"\\nOCR Quality = 50% of total hackathon score\")\n",
763
- "print(f\"\\nUsing {best_model['Model']}:\")\n",
764
- "print(f\" - CSR: {best_model['CSR']:.2f}% × 50% = {best_model['CSR'] * 0.5:.2f} points\")\n",
765
- "print(f\" - This is {best_model['CSR']:.1f}% accuracy on character-level OCR\")\n",
766
- "\n",
767
- "if best_model['CSR'] >= 95:\n",
768
- " print(\"\\n✅ EXCELLENT - This will score very high on OCR!\")\n",
769
- "elif best_model['CSR'] >= 90:\n",
770
- " print(\"\\n✅ VERY GOOD - Strong OCR performance!\")\n",
771
- "elif best_model['CSR'] >= 85:\n",
772
- " print(\"\\n⚠️ GOOD - Consider optimizing prompt or trying other models\")\n",
773
- "else:\n",
774
- " print(\"\\n⚠️ NEEDS IMPROVEMENT - Try other models or adjust parameters\")\n",
775
- "\n",
776
- "print(\"\\n\" + \"=\"*100)\n",
777
- "print(\"🎯 FINAL RECOMMENDATION\")\n",
778
- "print(\"=\"*100)\n",
779
- "print(f\"\\nUse: {best_model['Model']}\")\n",
780
- "print(f\"Reason: Highest accuracy ({best_model['CSR']:.2f}% CSR) for hackathon OCR benchmark\")\n",
781
- "print(f\"Implementation: Use vision API with same prompt as tested\")\n",
782
- "print(\"=\"*100)"
783
- ]
784
- },
785
- {
786
- "cell_type": "markdown",
787
- "metadata": {},
788
- "source": [
789
- "## 9. Export Results"
790
- ]
791
- },
792
  {
793
  "cell_type": "code",
794
  "execution_count": null,
@@ -798,7 +690,8 @@
798
  "# Save results\n",
799
  "from pathlib import Path\n",
800
  "\n",
801
- "output_dir = Path('output/vlm_ocr_benchmark')\n",
 
802
  "output_dir.mkdir(parents=True, exist_ok=True)\n",
803
  "\n",
804
  "df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
@@ -821,50 +714,8 @@
821
  "cell_type": "code",
822
  "execution_count": null,
823
  "metadata": {},
824
- "outputs": [
825
- {
826
- "name": "stdout",
827
- "output_type": "stream",
828
- "text": [
829
- "\n",
830
- "====================================================================================================\n",
831
- "📝 SAMPLE TEXT COMPARISON (First 500 characters)\n",
832
- "====================================================================================================\n",
833
- "\n",
834
- "🎯 GROUND TRUTH:\n",
835
- "----------------------------------------------------------------------------------------------------\n",
836
- "XÜLASƏ\n",
837
- "\n",
838
- "Bu tədqiqat Aşağı Kür çökəkliyi (AKÇ) və Bakı arxipelaqı (BA) daxil olmaqla Cənubi Xəzər çökəkliyi sistemində faydalı qazıntıların mənşəyinin paleotektonik, paleocoğrafi şərait və geodinamik rejimlə necə əlaqələndiyini, eləcə də Gec Miosendən etibarən Ərəbistan plitəsinin təsiri ilə formalaşan kollizion proseslərin bölgənin struktur-morfoloji və termal inkişafına nə dərəcədə yönverici rol oynadığını kompleks şəkildə qiymətləndirir. Seismotektonik göstəricilərin, çöküntütoplanma sürətləri\n",
839
- "\n",
840
- "🤖 GPT-4.1 (CSR: 85.86%):\n",
841
- "----------------------------------------------------------------------------------------------------\n",
842
- "Xülasə\n",
843
- "Bu məqalə Apşə Kər çökəkliyi (AKÇ) və Bakı arxipelaqı (BA) daxil olmaqla Cənubi Xəzər çökəkliyi sistemində faydalı qazıntıların mənşəyinin paleotektonik, paleocoğrafi şərait və geodinamik rejim\n",
844
- "\n",
845
- "====================================================================================================\n"
846
- ]
847
- }
848
- ],
849
- "source": [
850
- "# Show comparison of first 500 characters\n",
851
- "print(\"\\n\" + \"=\"*100)\n",
852
- "print(\"📝 SAMPLE TEXT COMPARISON (First 500 characters)\")\n",
853
- "print(\"=\"*100)\n",
854
- "\n",
855
- "print(\"\\n🎯 GROUND TRUTH:\")\n",
856
- "print(\"-\" * 100)\n",
857
- "print(ground_truth[:500])\n",
858
- "\n",
859
- "for _, row in df.iterrows():\n",
860
- " print(f\"\\n🤖 {row['Model']} (CSR: {row['CSR']:.2f}%):\")\n",
861
- " print(\"-\" * 100)\n",
862
- " # Get first 500 chars from extracted text\n",
863
- " preview = row['Extracted_Preview'] if len(row['Extracted_Preview']) >= 500 else row['Extracted_Preview']\n",
864
- " print(preview[:500])\n",
865
- "\n",
866
- "print(\"\\n\" + \"=\"*100)"
867
- ]
868
  }
869
  ],
870
  "metadata": {
 
31
  },
32
  {
33
  "cell_type": "code",
34
+ "execution_count": 1,
35
  "metadata": {},
36
  "outputs": [],
37
  "source": [
 
41
  },
42
  {
43
  "cell_type": "code",
44
+ "execution_count": 2,
45
  "metadata": {},
46
  "outputs": [
47
  {
 
81
  ]
82
  },
83
  {
84
+ "cell_type": "code",
85
+ "execution_count": 3,
86
  "metadata": {},
87
+ "outputs": [
88
+ {
89
+ "name": "stdout",
90
+ "output_type": "stream",
91
+ "text": [
92
+ "✅ Project root: /Users/ismatsamadov/SOCAR_Hackathon\n",
93
+ "✅ Data directory: /Users/ismatsamadov/SOCAR_Hackathon/data\n",
94
+ "✅ PDFs directory: /Users/ismatsamadov/SOCAR_Hackathon/data/pdfs\n",
95
+ "✅ Output directory: /Users/ismatsamadov/SOCAR_Hackathon/output\n"
96
+ ]
97
+ }
98
+ ],
99
  "source": [
100
+ "# Auto-detect project root (works from any directory)\n",
101
+ "import os\n",
102
+ "from pathlib import Path\n",
103
+ "\n",
104
+ "if Path('data').exists() and Path('docs').exists():\n",
105
+ " # Already in project root\n",
106
+ " PROJECT_ROOT = Path.cwd()\n",
107
+ "elif Path('../data').exists() and Path('../docs').exists():\n",
108
+ " # In notebooks/ subdirectory\n",
109
+ " PROJECT_ROOT = Path.cwd().parent\n",
110
+ "else:\n",
111
+ " # Fallback: try to find project root\n",
112
+ " current = Path.cwd()\n",
113
+ " while current != current.parent:\n",
114
+ " if (current / 'data').exists() and (current / 'docs').exists():\n",
115
+ " PROJECT_ROOT = current\n",
116
+ " break\n",
117
+ " current = current.parent\n",
118
+ " else:\n",
119
+ " PROJECT_ROOT = Path.cwd()\n",
120
+ "\n",
121
+ "# Define all paths relative to project root\n",
122
+ "DATA_DIR = PROJECT_ROOT / 'data'\n",
123
+ "DOCS_DIR = PROJECT_ROOT / 'docs'\n",
124
+ "OUTPUT_DIR = PROJECT_ROOT / 'output'\n",
125
+ "PDFS_DIR = DATA_DIR / 'pdfs'\n",
126
+ "\n",
127
+ "print(f\"✅ Project root: {PROJECT_ROOT}\")\n",
128
+ "print(f\"✅ Data directory: {DATA_DIR}\")\n",
129
+ "print(f\"✅ PDFs directory: {PDFS_DIR}\")\n",
130
+ "print(f\"✅ Output directory: {OUTPUT_DIR}\")"
131
  ]
132
  },
133
  {
134
  "cell_type": "code",
135
+ "execution_count": 4,
136
  "metadata": {},
137
  "outputs": [
138
  {
 
165
  " \n",
166
  " return text.strip()\n",
167
  "\n",
168
+ "# Load ground truth - using dynamic path\n",
169
+ "ground_truth = load_ground_truth(str(DATA_DIR / 'document_00.md'))\n",
170
  "print(f\"✅ Ground truth loaded: {len(ground_truth)} characters\")\n",
171
  "print(f\"Preview:\\n{ground_truth[:300]}...\")"
172
  ]
173
  },
 
 
 
 
 
 
 
174
  {
175
  "cell_type": "code",
176
+ "execution_count": 5,
177
  "metadata": {},
178
  "outputs": [
179
  {
 
217
  " image.save(buffered, format=format)\n",
218
  " return base64.b64encode(buffered.getvalue()).decode('utf-8')\n",
219
  "\n",
220
+ "# Test conversion - using dynamic path\n",
221
+ "pdf_path = str(PDFS_DIR / 'document_00.pdf')\n",
222
  "test_images = pdf_to_images(pdf_path)\n",
223
  "print(f\"\\n✅ Converted PDF to {len(test_images)} images\")\n",
224
  "print(f\"First image size: {test_images[0].size}\")"
225
  ]
226
  },
 
 
 
 
 
 
 
227
  {
228
  "cell_type": "code",
229
+ "execution_count": 7,
230
  "metadata": {},
231
  "outputs": [
232
  {
 
290
  },
291
  {
292
  "cell_type": "code",
293
+ "execution_count": 8,
294
  "metadata": {},
295
  "outputs": [
296
  {
 
399
  "## 4. Metrics Calculation"
400
  ]
401
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  {
403
  "cell_type": "markdown",
404
  "metadata": {},
 
408
  },
409
  {
410
  "cell_type": "code",
411
+ "execution_count": 10,
412
  "metadata": {},
413
  "outputs": [
414
  {
 
438
  },
439
  {
440
  "cell_type": "code",
441
+ "execution_count": 11,
442
  "metadata": {},
443
  "outputs": [
444
  {
 
453
  "Notes: Excellent OCR\n",
454
  "================================================================================\n",
455
  " Page 1/12: 9.2s\n",
456
+ " Page 2/12: 8.8s\n",
457
+ " Page 3/12: 9.7s\n",
458
+ " Page 4/12: 10.7s\n",
459
+ " Page 5/12: 12.4s\n",
460
+ " Page 6/12: 9.9s\n",
461
+ " Page 7/12: 12.1s\n",
462
+ " Page 8/12: 10.7s\n",
463
+ " Page 9/12: 12.0s\n",
464
+ " Page 10/12: 10.9s\n",
465
+ " Page 11/12: 7.7s\n",
466
+ " Page 12/12: 6.6s\n",
467
+ "\n",
468
+ "✅ Total time: 120.73s\n",
469
+ "✅ Extracted: 22018 characters\n",
470
+ "\n",
471
+ "📊 Metrics:\n",
472
+ " CSR (Character Success): 86.17%\n",
473
+ " WSR (Word Success): 68.19%\n",
474
+ " CER (Character Error): 13.83%\n",
475
+ " WER (Word Error): 31.81%\n",
476
+ "\n",
477
+ "================================================================================\n",
478
+ "Testing: GPT-5 ⭐⭐⭐⭐⭐\n",
479
+ "Notes: Latest model\n",
480
+ "================================================================================\n",
481
+ "❌ Failed: ERROR: Completions.create() got an unexpected keyword argument 'max_completion_tokens'\n",
482
+ "\n",
483
+ "================================================================================\n",
484
+ "Testing: GPT-5-mini ⭐⭐⭐⭐⭐\n",
485
+ "Notes: Fast + excellent\n",
486
+ "================================================================================\n",
487
+ "❌ Failed: ERROR: Completions.create() got an unexpected keyword argument 'max_completion_tokens'\n",
488
+ "\n",
489
+ "================================================================================\n",
490
+ "Testing: Claude-Sonnet-4.5 ⭐⭐⭐⭐⭐\n",
491
+ "Notes: Very good OCR\n",
492
+ "================================================================================\n",
493
+ "❌ Failed: ERROR: Error code: 400 - {'error': {'code': 'unknown_model', 'message': 'Unknown model: claude-sonnet-4-5', 'details': 'Unknown model: claude-sonnet-4-5'}}\n",
494
+ "\n",
495
+ "================================================================================\n",
496
+ "Testing: Phi-4-multimodal ⭐⭐⭐⭐\n",
497
+ "Notes: Explicitly multimodal\n",
498
+ "================================================================================\n",
499
+ " Page 1/12: 32.6s\n",
500
+ " Page 2/12: 3.9s\n",
501
+ " Page 3/12: 76.4s\n",
502
+ " Page 4/12: 6.0s\n",
503
+ "❌ Failed: ERROR: Error code: 500 - {'statusCode': 500, 'message': 'Internal server error', 'activityId': '22657b68-c10c-4f10-b8d7-4f5d7e1d963b'}\n",
504
+ "\n",
505
+ "================================================================================\n",
506
+ "Testing: Llama-4-Maverick-17B ⭐⭐⭐⭐\n",
507
+ "Notes: Testing vision capability\n",
508
+ "================================================================================\n",
509
+ " Page 1/12: 6.8s\n",
510
+ " Page 2/12: 7.3s\n",
511
+ " Page 3/12: 9.0s\n",
512
+ " Page 4/12: 9.7s\n",
513
+ "❌ Failed: ERROR: Error code: 500 - {'statusCode': 500, 'message': 'Internal server error', 'activityId': '4a2cd3c9-b21e-4efd-a273-1a4a5e99e2c4'}\n",
514
+ "\n",
515
+ "================================================================================\n",
516
+ "✅ VLM OCR Benchmarking complete!\n",
517
+ "================================================================================\n"
518
  ]
519
  }
520
  ],
 
578
  },
579
  {
580
  "cell_type": "code",
581
+ "execution_count": 12,
582
  "metadata": {},
583
  "outputs": [
584
  {
 
590
  "📊 VLM OCR BENCHMARKING RESULTS\n",
591
  "====================================================================================================\n",
592
  " Model CSR WSR CER WER Response_Time Rating\n",
593
+ "GPT-4.1 86.17 68.19 13.83 31.81 120.73 ⭐⭐⭐⭐⭐\n",
594
  "====================================================================================================\n"
595
  ]
596
  }
 
612
  "print(\"=\"*100)"
613
  ]
614
  },
 
 
 
 
 
 
 
615
  {
616
  "cell_type": "code",
617
  "execution_count": null,
 
622
  "import os\n",
623
  "from pathlib import Path\n",
624
  "\n",
625
+ "# Create output directory - using dynamic path\n",
626
+ "output_dir = OUTPUT_DIR / 'vlm_ocr_benchmark'\n",
627
  "output_dir.mkdir(parents=True, exist_ok=True)\n",
628
  "\n",
629
  "fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
 
681
  "print(f\"\\n✅ Visualization saved to '{output_dir}/results.png'\")"
682
  ]
683
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
684
  {
685
  "cell_type": "code",
686
  "execution_count": null,
 
690
  "# Save results\n",
691
  "from pathlib import Path\n",
692
  "\n",
693
+ "# Using dynamic path\n",
694
+ "output_dir = OUTPUT_DIR / 'vlm_ocr_benchmark'\n",
695
  "output_dir.mkdir(parents=True, exist_ok=True)\n",
696
  "\n",
697
  "df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')\n",
 
714
  "cell_type": "code",
715
  "execution_count": null,
716
  "metadata": {},
717
+ "outputs": [],
718
+ "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
719
  }
720
  ],
721
  "metadata": {
notebooks/vlm_ocr_benchmark.ipynb.backup ADDED
The diff for this file is too large to render. See raw diff