{ "cells": [ { "cell_type": "markdown", "id": "8de99634", "metadata": {}, "source": [ "# RAG Pipeline Optimization Benchmark\n", "\n", "**Comprehensive testing of ALL RAG components to maximize LLM Judge score**\n", "\n", "## What We're Testing:\n", "\n", "### 1. Embedding Models\n", "- BAAI/bge-large-en-v1.5 (Current - 1024 dim)\n", "- intfloat/multilingual-e5-large (1024 dim, multi-language)\n", "\n", "### 2. Retrieval Strategies\n", "- Top-K: 3, 5, 10 documents\n", "- MMR (Maximal Marginal Relevance)\n", "- Reranking with cross-encoder\n", "\n", "### 3. LLM Models\n", "- Llama-4-Maverick-17B, DeepSeek-R1, GPT-5-mini, Claude-Sonnet-4.5\n", "\n", "### 4. Prompting Strategies\n", "- Baseline, Citation-focused, Step-by-step, Few-shot" ] }, { "cell_type": "code", "execution_count": 46, "id": "164d7005", "metadata": {}, "outputs": [], "source": [ "# !pip install openai pinecone-client sentence-transformers rank-bm25 python-dotenv pandas matplotlib seaborn jiwer" ] }, { "cell_type": "code", "execution_count": 47, "id": "5817ef87", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Libraries loaded\n" ] } ], "source": [ "import os\n", "import json\n", "import time\n", "import re\n", "from typing import Dict, List, Tuple\n", "from dotenv import load_dotenv\n", "from pathlib import Path\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from openai import AzureOpenAI\n", "from pinecone import Pinecone\n", "from sentence_transformers import SentenceTransformer, CrossEncoder\n", "from jiwer import wer, cer\n", "import numpy as np\n", "\n", "load_dotenv()\n", "sns.set_style(\"whitegrid\")\n", "plt.rcParams[\"figure.figsize\"] = (16, 10)\n", "\n", "print(\"✅ Libraries loaded\")" ] }, { "cell_type": "code", "execution_count": 48, "id": "bf5006a0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Project root: /Users/ismatsamadov/SOCAR_Hackathon\n" ] } ], "source": [ "# Auto-detect project root\n", "if Path(\"data\").exists() and Path(\"docs\").exists():\n", " PROJECT_ROOT = Path.cwd()\n", "elif Path(\"../data\").exists() and Path(\"../docs\").exists():\n", " PROJECT_ROOT = Path.cwd().parent\n", "else:\n", " current = Path.cwd()\n", " while current != current.parent:\n", " if (current / \"data\").exists() and (current / \"docs\").exists():\n", " PROJECT_ROOT = current\n", " break\n", " current = current.parent\n", " else:\n", " PROJECT_ROOT = Path.cwd()\n", "\n", "DATA_DIR = PROJECT_ROOT / \"data\"\n", "DOCS_DIR = PROJECT_ROOT / \"docs\"\n", "OUTPUT_DIR = PROJECT_ROOT / \"output\"\n", "\n", "print(f\"✅ Project root: {PROJECT_ROOT}\")" ] }, { "cell_type": "code", "execution_count": 49, "id": "f56de816", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Loaded 5 test questions\n" ] } ], "source": [ "# Load test cases using dynamic paths\n", "with open(DOCS_DIR / \"sample_questions.json\", \"r\", encoding=\"utf-8\") as f:\n", " questions = json.load(f)\n", "\n", "with open(DOCS_DIR / \"sample_answers.json\", \"r\", encoding=\"utf-8\") as f:\n", " expected_answers = json.load(f)\n", "\n", "print(f\"✅ Loaded {len(questions)} test questions\")" ] }, { "cell_type": "code", "execution_count": 50, "id": "01eb4d27", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Vector DB connected: 1300 vectors\n" ] } ], "source": [ "# Connect to Pinecone\n", "pc = Pinecone(api_key=os.getenv(\"PINECONE_API_KEY\"))\n", "index = pc.Index(os.getenv(\"PINECONE_INDEX_NAME\", \"hackathon\"))\n", "\n", "stats = index.describe_index_stats()\n", "print(f\"✅ Vector DB connected: {stats['total_vector_count']} vectors\")" ] }, { "cell_type": "code", "execution_count": 51, "id": "1184ac1b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading bge-large-en...\n", "Loading multilingual-e5-large...\n", "✅ Loaded 2 embedding models\n" ] } ], "source": [ "EMBEDDING_MODELS = {\n", " \"bge-large-en\": \"BAAI/bge-large-en-v1.5\",\n", " \"multilingual-e5-large\": \"intfloat/multilingual-e5-large\"\n", "}\n", "\n", "embedding_cache = {}\n", "for key, model_name in EMBEDDING_MODELS.items():\n", " print(f\"Loading {key}...\")\n", " embedding_cache[key] = SentenceTransformer(model_name)\n", "\n", "print(f\"✅ Loaded {len(embedding_cache)} embedding models\")" ] }, { "cell_type": "code", "execution_count": 52, "id": "bbfff41b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Configured 4 retrieval strategies\n" ] } ], "source": [ "def retrieve_vanilla(query: str, embed_model, top_k: int = 3):\n", " \"\"\"Vanilla retrieval: Simple top-k vector search.\"\"\"\n", " query_embedding = embed_model.encode(query).tolist()\n", " results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)\n", " \n", " documents = []\n", " for match in results[\"matches\"]:\n", " documents.append({\n", " \"pdf_name\": match[\"metadata\"].get(\"pdf_name\", \"unknown.pdf\"),\n", " \"page_number\": match[\"metadata\"].get(\"page_number\", 0),\n", " \"content\": match[\"metadata\"].get(\"text\", \"\"),\n", " \"score\": match.get(\"score\", 0.0)\n", " })\n", " return documents\n", "\n", "def retrieve_with_mmr(query: str, embed_model, top_k: int = 3, lambda_param: float = 0.5, fetch_k: int = 20):\n", " \"\"\"MMR for diversity.\"\"\"\n", " candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n", " if len(candidates) <= top_k:\n", " return candidates[:top_k]\n", " \n", " query_emb = embed_model.encode(query)\n", " candidate_texts = [doc[\"content\"] for doc in candidates]\n", " candidate_embs = embed_model.encode(candidate_texts)\n", " \n", " selected = []\n", " selected_embs = []\n", " \n", " for _ in range(min(top_k, len(candidates))):\n", " mmr_scores = []\n", " for i, (doc, emb) in enumerate(zip(candidates, candidate_embs)):\n", " if i in [candidates.index(s) for s in selected]:\n", " mmr_scores.append(-float(\"inf\"))\n", " continue\n", " \n", " relevance = np.dot(query_emb, emb) / (np.linalg.norm(query_emb) * np.linalg.norm(emb))\n", " if selected_embs:\n", " similarities = [np.dot(emb, s_emb) / (np.linalg.norm(emb) * np.linalg.norm(s_emb)) for s_emb in selected_embs]\n", " max_sim = max(similarities)\n", " else:\n", " max_sim = 0\n", " \n", " mmr = lambda_param * relevance - (1 - lambda_param) * max_sim\n", " mmr_scores.append(mmr)\n", " \n", " best_idx = np.argmax(mmr_scores)\n", " selected.append(candidates[best_idx])\n", " selected_embs.append(candidate_embs[best_idx])\n", " \n", " return selected\n", "\n", "def retrieve_with_reranking(query: str, embed_model, top_k: int = 3, fetch_k: int = 20):\n", " \"\"\"Two-stage: retrieve then rerank.\"\"\"\n", " candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)\n", " if len(candidates) <= top_k:\n", " return candidates[:top_k]\n", " \n", " reranker = CrossEncoder(\"cross-encoder/ms-marco-MiniLM-L-6-v2\")\n", " pairs = [[query, doc[\"content\"]] for doc in candidates]\n", " scores = reranker.predict(pairs)\n", " \n", " scored_docs = [(doc, score) for doc, score in zip(candidates, scores)]\n", " scored_docs.sort(key=lambda x: x[1], reverse=True)\n", " \n", " return [doc for doc, _ in scored_docs[:top_k]]\n", "\n", "RETRIEVAL_STRATEGIES = {\n", " \"vanilla_k3\": {\"func\": retrieve_vanilla, \"params\": {\"top_k\": 3}},\n", " \"vanilla_k5\": {\"func\": retrieve_vanilla, \"params\": {\"top_k\": 5}},\n", " \"mmr_balanced\": {\"func\": retrieve_with_mmr, \"params\": {\"top_k\": 3, \"lambda_param\": 0.5}},\n", " \"reranked_k3\": {\"func\": retrieve_with_reranking, \"params\": {\"top_k\": 3, \"fetch_k\": 20}}\n", "}\n", "\n", "print(f\"✅ Configured {len(RETRIEVAL_STRATEGIES)} retrieval strategies\")" ] }, { "cell_type": "code", "execution_count": 53, "id": "509c5466", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Configured 3 LLM models and 3 prompts\n" ] } ], "source": [ "azure_client = AzureOpenAI(\n", " api_key=os.getenv(\"AZURE_OPENAI_API_KEY\"),\n", " api_version=os.getenv(\"AZURE_OPENAI_API_VERSION\", \"2024-08-01-preview\"),\n", " azure_endpoint=os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n", ")\n", "\n", "LLM_MODELS = {\n", " \"Llama-4-Maverick\": \"Llama-4-Maverick-17B-128E-Instruct-FP8\",\n", " \"DeepSeek-R1\": \"DeepSeek-R1\",\n", " \"GPT-5-mini\": \"gpt-5-mini\"\n", "}\n", "# Note: Claude-Sonnet-4.5 removed - not available in Azure deployment\n", "\n", "PROMPTING_STRATEGIES = {\n", " \"baseline\": \"\"\"Cavab verin:\n", "{context}\n", "\n", "Sual: {query}\"\"\",\n", " \n", " \"citation_focused\": \"\"\"Mənbə göstərin:\n", "{context}\n", "\n", "Sual: {query}\n", "Hər faktı PDF və səhifə nömrəsi ilə göstərin.\"\"\",\n", " \n", " \"few_shot\": \"\"\"Nümunə: \"Palçıq vulkanlarının təsir radiusu 10 km-dir (PDF: doc.pdf, Səhifə: 5)\"\n", "\n", "{context}\n", "\n", "Sual: {query}\"\"\"\n", "}\n", "\n", "print(f\"✅ Configured {len(LLM_MODELS)} LLM models and {len(PROMPTING_STRATEGIES)} prompts\")" ] }, { "cell_type": "code", "execution_count": 54, "id": "2b6b48fe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Generation function ready\n" ] } ], "source": [ "def generate_answer(llm_model: str, query: str, documents: List[Dict], prompt_strategy: str = \"baseline\") -> Tuple[str, float]:\n", " \"\"\"Generate answer using LLM.\"\"\"\n", " context_parts = []\n", " for i, doc in enumerate(documents, 1):\n", " context_parts.append(f\"Sənəd {i} ({doc['pdf_name']}, Səhifə {doc['page_number']}):\\n{doc['content']}\")\n", " context = \"\\n\\n\".join(context_parts)\n", " \n", " prompt_template = PROMPTING_STRATEGIES[prompt_strategy]\n", " prompt = prompt_template.format(context=context, query=query)\n", " \n", " try:\n", " start_time = time.time()\n", " deployment = LLM_MODELS[llm_model]\n", " \n", " # GPT-5 models use max_completion_tokens, others use max_tokens\n", " if deployment.startswith(\"gpt-5\"):\n", " response = azure_client.chat.completions.create(\n", " model=deployment,\n", " messages=[{\"role\": \"user\", \"content\": prompt}],\n", " temperature=0.2,\n", " max_completion_tokens=1000\n", " )\n", " else:\n", " response = azure_client.chat.completions.create(\n", " model=deployment,\n", " messages=[{\"role\": \"user\", \"content\": prompt}],\n", " temperature=0.2,\n", " max_tokens=1000\n", " )\n", " \n", " elapsed = time.time() - start_time\n", " answer = response.choices[0].message.content\n", " return answer, elapsed\n", " \n", " except Exception as e:\n", " return f\"ERROR: {str(e)}\", 0.0\n", "\n", "print(\"✅ Generation function ready\")" ] }, { "cell_type": "code", "execution_count": 55, "id": "3bd17b98", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✅ Evaluation functions ready\n" ] } ], "source": [ "def evaluate_rag(expected: str, generated: str, documents: List[Dict]) -> Dict:\n", " \"\"\"Evaluate RAG answer quality.\"\"\"\n", " def normalize(text):\n", " return text.lower().strip()\n", " \n", " # Accuracy\n", " if expected:\n", " wer_score = wer(normalize(expected), normalize(generated)) * 100\n", " accuracy = max(0, 100 - wer_score)\n", " else:\n", " accuracy = 0\n", " \n", " # Citation quality\n", " pdf_names = [doc[\"pdf_name\"].replace(\".pdf\", \"\") for doc in documents]\n", " cited_pdfs = sum(1 for pdf in pdf_names if pdf in generated)\n", " citation_score = (cited_pdfs / len(pdf_names)) * 100 if pdf_names else 0\n", " \n", " # Completeness\n", " word_count = len(generated.split())\n", " completeness = min(100, (word_count / 30) * 100)\n", " \n", " # Overall LLM Judge Score\n", " llm_judge_score = round(accuracy * 0.35 + citation_score * 0.35 + completeness * 0.30, 2)\n", " \n", " return {\n", " \"Accuracy\": round(accuracy, 2),\n", " \"Citation_Score\": round(citation_score, 2),\n", " \"Completeness\": round(completeness, 2),\n", " \"LLM_Judge_Score\": llm_judge_score\n", " }\n", "\n", "print(\"✅ Evaluation functions ready\")" ] }, { "cell_type": "markdown", "id": "02fd6f6d", "metadata": {}, "source": [ "## Run Comprehensive Benchmark" ] }, { "cell_type": "code", "execution_count": 56, "id": "7b08f5bc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Testing 7 configurations on 5 questions\n" ] } ], "source": [ "# Select configurations to test\n", "CONFIGS_TO_TEST = [\n", " (\"bge-large-en\", \"vanilla_k3\", \"Llama-4-Maverick\", \"baseline\"),\n", " (\"bge-large-en\", \"vanilla_k3\", \"Llama-4-Maverick\", \"citation_focused\"),\n", " (\"bge-large-en\", \"vanilla_k3\", \"Llama-4-Maverick\", \"few_shot\"),\n", " (\"bge-large-en\", \"vanilla_k5\", \"Llama-4-Maverick\", \"baseline\"),\n", " (\"bge-large-en\", \"mmr_balanced\", \"Llama-4-Maverick\", \"baseline\"),\n", " (\"bge-large-en\", \"reranked_k3\", \"Llama-4-Maverick\", \"baseline\"),\n", " (\"multilingual-e5-large\", \"vanilla_k3\", \"Llama-4-Maverick\", \"baseline\")\n", "]\n", "\n", "print(f\"Testing {len(CONFIGS_TO_TEST)} configurations on {len(questions)} questions\")" ] }, { "cell_type": "code", "execution_count": 57, "id": "5fe11d34", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "********************************\n", "Config 1/7: bge-large-en_vanilla_k3_Llama-4-Maverick_baseline\n", " Example1: Daha az quyu ilə daha çox hasilat əldə e...\n", " ✅ Score: 43.2% (1.86s)\n", " Example2: Qərbi Abşeron yatağında suvurma tədbirlə...\n", " ✅ Score: 41.7% (2.32s)\n", " Example3: Pirallahı strukturunda 1253 nömrəli quyu...\n", " ✅ Score: 30.0% (2.23s)\n", " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkl...\n", " ✅ Score: 53.3% (3.55s)\n", " Example5: Bu zonada hansı proseslər baş verir?...\n", " ✅ Score: 30.0% (1.45s)\n", "********************************\n", "Config 2/7: bge-large-en_vanilla_k3_Llama-4-Maverick_citation_focused\n", " Example1: Daha az quyu ilə daha çox hasilat əldə e...\n", " ✅ Score: 65.0% (3.50s)\n", " Example2: Qərbi Abşeron yatağında suvurma tədbirlə...\n", " ✅ Score: 41.7% (3.54s)\n", " Example3: Pirallahı strukturunda 1253 nömrəli quyu...\n", " ✅ Score: 65.0% (3.78s)\n", " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkl...\n", " ✅ Score: 53.3% (4.11s)\n", " Example5: Bu zonada hansı proseslər baş verir?...\n", " ✅ Score: 53.3% (3.13s)\n", "********************************\n", "Config 3/7: bge-large-en_vanilla_k3_Llama-4-Maverick_few_shot\n", " Example1: Daha az quyu ilə daha çox hasilat əldə e...\n", " ✅ Score: 38.5% (1.53s)\n", " Example2: Qərbi Abşeron yatağında suvurma tədbirlə...\n", " ✅ Score: 41.7% (2.86s)\n", " Example3: Pirallahı strukturunda 1253 nömrəli quyu...\n", " ✅ Score: 30.0% (2.14s)\n", " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkl...\n", " ✅ Score: 53.3% (2.22s)\n", " Example5: Bu zonada hansı proseslər baş verir?...\n", " ✅ Score: 65.0% (2.09s)\n", "********************************\n", "Config 4/7: bge-large-en_vanilla_k5_Llama-4-Maverick_baseline\n", " Example1: Daha az quyu ilə daha çox hasilat əldə e...\n", " ✅ Score: 30.0% (3.49s)\n", " Example2: Qərbi Abşeron yatağında suvurma tədbirlə...\n", " ✅ Score: 44.0% (3.28s)\n", " Example3: Pirallahı strukturunda 1253 nömrəli quyu...\n", " ✅ Score: 30.0% (2.43s)\n", " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkl...\n", " ✅ Score: 44.0% (3.10s)\n", " Example5: Bu zonada hansı proseslər baş verir?...\n", " ✅ Score: 30.0% (4.62s)\n", "********************************\n", "Config 5/7: bge-large-en_mmr_balanced_Llama-4-Maverick_baseline\n", " Example1: Daha az quyu ilə daha çox hasilat əldə e...\n", " ✅ Score: 41.4% (1.28s)\n", " Example2: Qərbi Abşeron yatağında suvurma tədbirlə...\n", " ✅ Score: 30.0% (2.35s)\n", " Example3: Pirallahı strukturunda 1253 nömrəli quyu...\n", " ✅ Score: 30.0% (2.98s)\n", " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkl...\n", " ✅ Score: 41.7% (3.49s)\n", " Example5: Bu zonada hansı proseslər baş verir?...\n", " ✅ Score: 30.0% (2.54s)\n", "********************************\n", "Config 6/7: bge-large-en_reranked_k3_Llama-4-Maverick_baseline\n", " Example1: Daha az quyu ilə daha çox hasilat əldə e...\n", " ✅ Score: 43.2% (1.23s)\n", " Example2: Qərbi Abşeron yatağında suvurma tədbirlə...\n", " ✅ Score: 30.0% (3.73s)\n", " Example3: Pirallahı strukturunda 1253 nömrəli quyu...\n", " ✅ Score: 30.0% (2.68s)\n", " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkl...\n", " ✅ Score: 53.3% (3.21s)\n", " Example5: Bu zonada hansı proseslər baş verir?...\n", " ✅ Score: 30.0% (4.24s)\n", "********************************\n", "Config 7/7: multilingual-e5-large_vanilla_k3_Llama-4-Maverick_baseline\n", " Example1: Daha az quyu ilə daha çox hasilat əldə e...\n", " ✅ Score: 30.0% (4.30s)\n", " Example2: Qərbi Abşeron yatağında suvurma tədbirlə...\n", " ✅ Score: 30.0% (1.58s)\n", " Example3: Pirallahı strukturunda 1253 nömrəli quyu...\n", " ✅ Score: 30.0% (2.04s)\n", " Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkl...\n", " ✅ Score: 30.0% (4.57s)\n", " Example5: Bu zonada hansı proseslər baş verir?...\n", " ✅ Score: 30.0% (2.31s)\n", "********************************\n", "✅ Benchmark complete!\n" ] } ], "source": [ "results = []\n", "\n", "for config_idx, (embed_key, retrieval_key, llm_key, prompt_key) in enumerate(CONFIGS_TO_TEST, 1):\n", " config_name = f\"{embed_key}_{retrieval_key}_{llm_key}_{prompt_key}\"\n", " print(\"********************************\")\n", " print(f\"Config {config_idx}/{len(CONFIGS_TO_TEST)}: {config_name}\")\n", " \n", " embed_model = embedding_cache[embed_key]\n", " retrieval_func = RETRIEVAL_STRATEGIES[retrieval_key][\"func\"]\n", " retrieval_params = RETRIEVAL_STRATEGIES[retrieval_key][\"params\"]\n", " \n", " for example_key, messages in questions.items():\n", " user_msg = [m for m in messages if m[\"role\"] == \"user\"][-1]\n", " query = user_msg[\"content\"]\n", " \n", " print(f\" {example_key}: {query[:40]}...\")\n", " \n", " documents = retrieval_func(query, embed_model, **retrieval_params)\n", " answer, response_time = generate_answer(llm_key, query, documents, prompt_key)\n", " \n", " if answer.startswith(\"ERROR\"):\n", " print(f\" ❌ {answer}\")\n", " continue\n", " \n", " expected = expected_answers.get(example_key, {}).get(\"Answer\", \"\")\n", " metrics = evaluate_rag(expected, answer, documents)\n", " \n", " results.append({\n", " \"Config\": config_name,\n", " \"Embedding\": embed_key,\n", " \"Retrieval\": retrieval_key,\n", " \"LLM\": llm_key,\n", " \"Prompt\": prompt_key,\n", " \"Question\": example_key,\n", " \"Response_Time\": round(response_time, 2),\n", " **metrics\n", " })\n", " \n", " print(f\" ✅ Score: {metrics['LLM_Judge_Score']:.1f}% ({response_time:.2f}s)\")\n", "\n", "print(\"********************************\")\n", "print(\"✅ Benchmark complete!\")" ] }, { "cell_type": "code", "execution_count": 58, "id": "5fb97def", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "====================================================================================================\n", "📊 CONFIGURATION RANKINGS\n", "====================================================================================================\n", " LLM_Judge_Score Accuracy Citation_Score Completeness Response_Time\n", "Config \n", "bge-large-en_vanilla_k3_Llama-4-Maverick_citation_focused 55.67 0.00 73.33 100.0 3.61\n", "bge-large-en_vanilla_k3_Llama-4-Maverick_few_shot 45.70 4.86 40.00 100.0 2.17\n", "bge-large-en_vanilla_k3_Llama-4-Maverick_baseline 39.65 7.57 20.00 100.0 2.28\n", "bge-large-en_reranked_k3_Llama-4-Maverick_baseline 37.31 7.57 13.33 100.0 3.02\n", "bge-large-en_vanilla_k5_Llama-4-Maverick_baseline 35.60 0.00 16.00 100.0 3.38\n", "bge-large-en_mmr_balanced_Llama-4-Maverick_baseline 34.60 6.49 6.67 100.0 2.53\n", "multilingual-e5-large_vanilla_k3_Llama-4-Maverick_baseline 30.00 0.00 0.00 100.0 2.96\n", "====================================================================================================\n" ] } ], "source": [ "df = pd.DataFrame(results)\n", "\n", "config_summary = df.groupby(\"Config\").agg({\n", " \"LLM_Judge_Score\": \"mean\",\n", " \"Accuracy\": \"mean\",\n", " \"Citation_Score\": \"mean\",\n", " \"Completeness\": \"mean\",\n", " \"Response_Time\": \"mean\"\n", "}).round(2).sort_values(\"LLM_Judge_Score\", ascending=False)\n", "\n", "print(\"\\n\" + \"=\"*100)\n", "print(\"📊 CONFIGURATION RANKINGS\")\n", "print(\"=\"*100)\n", "print(config_summary.to_string())\n", "print(\"=\"*100)" ] }, { "cell_type": "code", "execution_count": 59, "id": "704b7148", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "====================================================================================================\n", "🏆 OPTIMAL RAG CONFIGURATION\n", "====================================================================================================\n", "Best Configuration: bge-large-en_vanilla_k3_Llama-4-Maverick_citation_focused\n", "LLM Judge Score: 55.67%\n", "Accuracy: 0.00%\n", "Citation Quality: 73.33%\n", "Response Time: 3.61s\n", "====================================================================================================\n" ] } ], "source": [ "best_config = config_summary.iloc[0]\n", "\n", "print(\"\\n\" + \"=\"*100)\n", "print(\"🏆 OPTIMAL RAG CONFIGURATION\")\n", "print(\"=\"*100)\n", "print(f\"Best Configuration: {best_config.name}\")\n", "print(f\"LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%\")\n", "print(f\"Accuracy: {best_config['Accuracy']:.2f}%\")\n", "print(f\"Citation Quality: {best_config['Citation_Score']:.2f}%\")\n", "print(f\"Response Time: {best_config['Response_Time']:.2f}s\")\n", "print(\"=\"*100)" ] }, { "cell_type": "code", "execution_count": 60, "id": "9d8ce688", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "✅ Results saved to output/rag_optimization_benchmark/\n" ] } ], "source": [ "# Save results using dynamic path\n", "output_dir = OUTPUT_DIR / \"rag_optimization_benchmark\"\n", "output_dir.mkdir(parents=True, exist_ok=True)\n", "\n", "df.to_csv(output_dir / \"detailed_results.csv\", index=False, encoding=\"utf-8\")\n", "config_summary.to_csv(output_dir / \"summary.csv\", encoding=\"utf-8\")\n", "\n", "print(\"\\n✅ Results saved to output/rag_optimization_benchmark/\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 5 }