# --- Imports & global configuration ---------------------------------------
# Standard library first, then third-party; plotting defaults set once here.
import json
import os
import time
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from jiwer import wer, cer
from openai import AzureOpenAI
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# Pull API keys / endpoints from the local .env file into the environment.
load_dotenv()

# Shared plotting style for every figure in this notebook.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 8)

print("✅ Libraries loaded")
# --- Locate the project root and wire up data sources ----------------------
def _locate_project_root() -> Path:
    """Return the nearest ancestor of the CWD (including the CWD itself,
    excluding the filesystem root) that contains both `data/` and `docs/`.

    Falls back to the CWD when no such ancestor exists, so the notebook
    still runs (with relative paths) from an unexpected location.
    """
    here = Path.cwd()
    # `parents` walks upward; drop the last entry (filesystem root),
    # matching the original `while current != current.parent` walk.
    for candidate in (here, *list(here.parents)[:-1]):
        if (candidate / "data").exists() and (candidate / "docs").exists():
            return candidate
    return here


PROJECT_ROOT = _locate_project_root()
DATA_DIR = PROJECT_ROOT / "data"
DOCS_DIR = PROJECT_ROOT / "docs"
OUTPUT_DIR = PROJECT_ROOT / "output"

print(f"✅ Project root: {PROJECT_ROOT}")
print(f"✅ Docs directory: {DOCS_DIR}")

# Test fixtures: questions keyed by example id, plus reference answers.
questions = json.loads((DOCS_DIR / "sample_questions.json").read_text(encoding="utf-8"))
expected_answers = json.loads((DOCS_DIR / "sample_answers.json").read_text(encoding="utf-8"))

print(f"Loaded {len(questions)} test cases")

# Vector store + the embedding model used at indexing time (must match).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
embed_model = SentenceTransformer("BAAI/bge-large-en-v1.5")

print("✅ Vector DB connected")
print("✅ Embedding model loaded")
def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
    """Embed `query` and return the `top_k` closest chunks from Pinecone.

    Each result dict carries `pdf_name`, `page_number`, `content` and the
    similarity `score`, with safe defaults when a metadata field is absent.
    """
    query_vector = embed_model.encode(query).tolist()
    hits = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

    return [
        {
            "pdf_name": hit["metadata"].get("pdf_name", "unknown.pdf"),
            "page_number": hit["metadata"].get("page_number", 0),
            "content": hit["metadata"].get("text", ""),
            "score": hit.get("score", 0.0),
        }
        for hit in hits["matches"]
    ]


print("✅ Retrieval function ready")

# --- Azure OpenAI client and the model deployments under test --------------
azure_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Friendly name -> Azure deployment name.
LLM_MODELS = {
    "Llama-4-Maverick": "Llama-4-Maverick-17B-128E-Instruct-FP8",
    "DeepSeek-R1": "DeepSeek-R1",
    "GPT-4.1": "gpt-4.1",
    # "GPT-5-mini": "gpt-5-mini"
    # "Claude-Sonnet-4.5": "claude-sonnet-4-5"  # Not available in Azure deployment
}

print(f"✅ Configured {len(LLM_MODELS)} LLM models")
def generate_answer(model_name: str, query: str, documents: List[Dict],
                    temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:
    """Generate a source-grounded answer using the given LLM deployment.

    Args:
        model_name: Key into LLM_MODELS mapping to an Azure deployment name.
        query: The user's question (Azerbaijani).
        documents: Retrieved chunks with `pdf_name`, `page_number`, `content`.
        temperature: Sampling temperature.
        max_tokens: Output-token budget.

    Returns:
        (answer, response_time_seconds). On any API failure this returns
        ("ERROR: <message>", 0.0) — a sentinel the benchmark loop checks —
        instead of raising, so one bad call does not abort the whole run.
    """
    context = "\n\n".join(
        f"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\n{doc['content']}"
        for i, doc in enumerate(documents, 1)
    )

    # Prompt is intentionally in Azerbaijani to match the corpus language.
    prompt = f"""Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə mütəxəssis köməkçisisiniz.

Kontekst:
{context}

Sual: {query}

Ətraflı cavab verin və mütləq sənəd mənbələrinə istinad edin."""

    deployment = LLM_MODELS[model_name]

    # GPT-5 deployments accept only `max_completion_tokens`; everything else
    # still uses `max_tokens`. Select the kwarg name instead of duplicating
    # the whole API call in two branches.
    token_param = "max_completion_tokens" if deployment.startswith("gpt-5") else "max_tokens"

    try:
        start_time = time.time()
        response = azure_client.chat.completions.create(
            model=deployment,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            **{token_param: max_tokens},
        )
        response_time = time.time() - start_time

        # `message.content` can legitimately be None (e.g. a filtered or
        # empty completion); fall back to "" so the caller's
        # `answer.startswith("ERROR")` never hits an AttributeError.
        answer = response.choices[0].message.content or ""
        return answer, response_time

    except Exception as e:
        # Best-effort: surface the failure as the ERROR sentinel string.
        return f"ERROR: {str(e)}", 0.0


print("✅ Generation function ready")


def evaluate_answer(expected: str, generated: str, documents: List[Dict]) -> Dict:
    """Score a generated answer with cheap heuristic metrics.

    Metrics (all on a 0-100 scale):
        Similarity:     100 - WER% between normalized texts, floored at 0.
        Citation_Score: share of retrieved PDF basenames cited verbatim.
        Completeness:   word count relative to a 50-word target, capped.
        Quality_Score:  0.4*Similarity + 0.4*Citation + 0.2*Completeness.
    """
    def normalize(text: str) -> str:
        return text.lower().strip()

    # jiwer's wer() raises on an empty reference, so score similarity only
    # when the *normalized* expected text is non-empty (guards against
    # whitespace-only references, not just empty strings).
    reference = normalize(expected) if expected else ""
    if reference:
        wer_score = wer(reference, normalize(generated)) * 100
        similarity = max(0, 100 - wer_score)
    else:
        similarity = 0

    # A citation counts when the PDF basename (without extension) appears
    # anywhere in the generated answer.
    pdf_names = [doc["pdf_name"].replace(".pdf", "") for doc in documents]
    cited_pdfs = sum(1 for pdf in pdf_names if pdf in generated)
    citation_score = (cited_pdfs / len(pdf_names)) * 100 if pdf_names else 0

    # Crude length proxy: 50+ words == fully complete.
    word_count = len(generated.split())
    completeness = min(100, (word_count / 50) * 100)

    return {
        "Similarity": round(similarity, 2),
        "Citation_Score": round(citation_score, 2),
        "Completeness": round(completeness, 2),
        "Quality_Score": round(similarity * 0.4 + citation_score * 0.4 + completeness * 0.2, 2),
    }


print("✅ Evaluation functions ready")
# --- Run the full model × question benchmark grid --------------------------
results = []

for model_name in LLM_MODELS:
    print("*******")
    print(f"Testing: {model_name}")
    print("**********")

    for example_key, messages in questions.items():
        # The query is the most recent user turn in the conversation.
        user_turns = [m for m in messages if m["role"] == "user"]
        query = user_turns[-1]["content"]

        print(f" {example_key}: {query[:60]}...")

        # Retrieve supporting chunks, then generate with the model under test.
        documents = retrieve_documents(query, top_k=3)
        answer, response_time = generate_answer(model_name, query, documents)

        # generate_answer signals failure with an "ERROR: ..." sentinel.
        if answer.startswith("ERROR"):
            print(f" ❌ {answer}")
            continue

        print(f" ✅ {response_time:.2f}s")

        expected = expected_answers.get(example_key, {}).get("Answer", "")
        row = {
            "Model": model_name,
            "Question": example_key,
            "Response_Time": round(response_time, 2),
        }
        row.update(evaluate_answer(expected, answer, documents))
        results.append(row)

print("*********")
print("✅ Benchmark complete!")
# --- Aggregate per-model metrics and persist the results -------------------
df = pd.DataFrame(results)

# Mean of every metric per model, best Quality_Score first.
summary = (
    df.groupby("Model")[
        ["Quality_Score", "Similarity", "Citation_Score", "Completeness", "Response_Time"]
    ]
    .mean()
    .round(2)
    .sort_values("Quality_Score", ascending=False)
)

banner = "=" * 100
print("\n" + banner)
print("📊 LLM BENCHMARKING RESULTS")
print(banner)
print(summary.to_string())
print(banner)

# Write both the per-question detail and the per-model summary as CSV.
output_dir = OUTPUT_DIR / "llm_benchmark"
output_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(output_dir / "detailed_results.csv", index=False, encoding="utf-8")
summary.to_csv(output_dir / "summary.csv", encoding="utf-8")

print("\n✅ Results saved to output/llm_benchmark/")