# --- Imports & global configuration ---------------------------------------
# Standard library first, then third-party; plotting defaults set once here.
import json
import os
import time
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from jiwer import wer, cer
from openai import AzureOpenAI
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# Pull API keys / endpoints from the local .env file into the environment.
load_dotenv()

# Shared plotting style for every figure in this notebook.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 8)

print("✅ Libraries loaded")
# --- Locate the project root and wire up data sources ----------------------
def _locate_project_root() -> Path:
    """Return the nearest ancestor of the CWD (including the CWD itself,
    excluding the filesystem root) that contains both `data/` and `docs/`.

    Falls back to the CWD when no such ancestor exists, so the notebook
    still runs (with relative paths) from an unexpected location.
    """
    here = Path.cwd()
    # `parents` walks upward; drop the last entry (filesystem root),
    # matching the original `while current != current.parent` walk.
    for candidate in (here, *list(here.parents)[:-1]):
        if (candidate / "data").exists() and (candidate / "docs").exists():
            return candidate
    return here


PROJECT_ROOT = _locate_project_root()
DATA_DIR = PROJECT_ROOT / "data"
DOCS_DIR = PROJECT_ROOT / "docs"
OUTPUT_DIR = PROJECT_ROOT / "output"

print(f"✅ Project root: {PROJECT_ROOT}")
print(f"✅ Docs directory: {DOCS_DIR}")

# Test fixtures: questions keyed by example id, plus reference answers.
questions = json.loads((DOCS_DIR / "sample_questions.json").read_text(encoding="utf-8"))
expected_answers = json.loads((DOCS_DIR / "sample_answers.json").read_text(encoding="utf-8"))

print(f"Loaded {len(questions)} test cases")

# Vector store + the embedding model used at indexing time (must match).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
embed_model = SentenceTransformer("BAAI/bge-large-en-v1.5")

print("✅ Vector DB connected")
print("✅ Embedding model loaded")
def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
    """Embed `query` and return the `top_k` closest chunks from Pinecone.

    Each result dict carries `pdf_name`, `page_number`, `content` and the
    similarity `score`, with safe defaults when a metadata field is absent.
    """
    query_vector = embed_model.encode(query).tolist()
    hits = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

    return [
        {
            "pdf_name": hit["metadata"].get("pdf_name", "unknown.pdf"),
            "page_number": hit["metadata"].get("page_number", 0),
            "content": hit["metadata"].get("text", ""),
            "score": hit.get("score", 0.0),
        }
        for hit in hits["matches"]
    ]


print("✅ Retrieval function ready")

# --- Azure OpenAI client and the model deployments under test --------------
azure_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Friendly name -> Azure deployment name.
LLM_MODELS = {
    "Llama-4-Maverick": "Llama-4-Maverick-17B-128E-Instruct-FP8",
    "DeepSeek-R1": "DeepSeek-R1",
    "GPT-4.1": "gpt-4.1",
    # "GPT-5-mini": "gpt-5-mini"
    # "Claude-Sonnet-4.5": "claude-sonnet-4-5"  # Not available in Azure deployment
}

print(f"✅ Configured {len(LLM_MODELS)} LLM models")
def generate_answer(model_name: str, query: str, documents: List[Dict],
                    temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:
    """Generate a source-grounded answer using the given LLM deployment.

    Args:
        model_name: Key into LLM_MODELS mapping to an Azure deployment name.
        query: The user's question (Azerbaijani).
        documents: Retrieved chunks with `pdf_name`, `page_number`, `content`.
        temperature: Sampling temperature.
        max_tokens: Output-token budget.

    Returns:
        (answer, response_time_seconds). On any API failure this returns
        ("ERROR: <message>", 0.0) — a sentinel the benchmark loop checks —
        instead of raising, so one bad call does not abort the whole run.
    """
    context = "\n\n".join(
        f"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\n{doc['content']}"
        for i, doc in enumerate(documents, 1)
    )

    # Prompt is intentionally in Azerbaijani to match the corpus language.
    prompt = f"""Siz SOCAR-ın tarixi neft və qaz sənədləri üzrə mütəxəssis köməkçisisiniz.

Kontekst:
{context}

Sual: {query}

Ətraflı cavab verin və mütləq sənəd mənbələrinə istinad edin."""

    deployment = LLM_MODELS[model_name]

    # GPT-5 deployments accept only `max_completion_tokens`; everything else
    # still uses `max_tokens`. Select the kwarg name instead of duplicating
    # the whole API call in two branches.
    token_param = "max_completion_tokens" if deployment.startswith("gpt-5") else "max_tokens"

    try:
        start_time = time.time()
        response = azure_client.chat.completions.create(
            model=deployment,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            **{token_param: max_tokens},
        )
        response_time = time.time() - start_time

        # `message.content` can legitimately be None (e.g. a filtered or
        # empty completion); fall back to "" so the caller's
        # `answer.startswith("ERROR")` never hits an AttributeError.
        answer = response.choices[0].message.content or ""
        return answer, response_time

    except Exception as e:
        # Best-effort: surface the failure as the ERROR sentinel string.
        return f"ERROR: {str(e)}", 0.0


print("✅ Generation function ready")


def evaluate_answer(expected: str, generated: str, documents: List[Dict]) -> Dict:
    """Score a generated answer with cheap heuristic metrics.

    Metrics (all on a 0-100 scale):
        Similarity:     100 - WER% between normalized texts, floored at 0.
        Citation_Score: share of retrieved PDF basenames cited verbatim.
        Completeness:   word count relative to a 50-word target, capped.
        Quality_Score:  0.4*Similarity + 0.4*Citation + 0.2*Completeness.
    """
    def normalize(text: str) -> str:
        return text.lower().strip()

    # jiwer's wer() raises on an empty reference, so score similarity only
    # when the *normalized* expected text is non-empty (guards against
    # whitespace-only references, not just empty strings).
    reference = normalize(expected) if expected else ""
    if reference:
        wer_score = wer(reference, normalize(generated)) * 100
        similarity = max(0, 100 - wer_score)
    else:
        similarity = 0

    # A citation counts when the PDF basename (without extension) appears
    # anywhere in the generated answer.
    pdf_names = [doc["pdf_name"].replace(".pdf", "") for doc in documents]
    cited_pdfs = sum(1 for pdf in pdf_names if pdf in generated)
    citation_score = (cited_pdfs / len(pdf_names)) * 100 if pdf_names else 0

    # Crude length proxy: 50+ words == fully complete.
    word_count = len(generated.split())
    completeness = min(100, (word_count / 50) * 100)

    return {
        "Similarity": round(similarity, 2),
        "Citation_Score": round(citation_score, 2),
        "Completeness": round(completeness, 2),
        "Quality_Score": round(similarity * 0.4 + citation_score * 0.4 + completeness * 0.2, 2),
    }


print("✅ Evaluation functions ready")
# --- Run the full model × question benchmark grid --------------------------
results = []

for model_name in LLM_MODELS:
    print("*******")
    print(f"Testing: {model_name}")
    print("**********")

    for example_key, messages in questions.items():
        # The query is the most recent user turn in the conversation.
        user_turns = [m for m in messages if m["role"] == "user"]
        query = user_turns[-1]["content"]

        print(f" {example_key}: {query[:60]}...")

        # Retrieve supporting chunks, then generate with the model under test.
        documents = retrieve_documents(query, top_k=3)
        answer, response_time = generate_answer(model_name, query, documents)

        # generate_answer signals failure with an "ERROR: ..." sentinel.
        if answer.startswith("ERROR"):
            print(f" ❌ {answer}")
            continue

        print(f" ✅ {response_time:.2f}s")

        expected = expected_answers.get(example_key, {}).get("Answer", "")
        row = {
            "Model": model_name,
            "Question": example_key,
            "Response_Time": round(response_time, 2),
        }
        row.update(evaluate_answer(expected, answer, documents))
        results.append(row)

print("*********")
print("✅ Benchmark complete!")
# --- Aggregate per-model metrics and persist the results -------------------
df = pd.DataFrame(results)

# Mean of every metric per model, best Quality_Score first.
summary = (
    df.groupby("Model")[
        ["Quality_Score", "Similarity", "Citation_Score", "Completeness", "Response_Time"]
    ]
    .mean()
    .round(2)
    .sort_values("Quality_Score", ascending=False)
)

banner = "=" * 100
print("\n" + banner)
print("📊 LLM BENCHMARKING RESULTS")
print(banner)
print(summary.to_string())
print(banner)

# Write both the per-question detail and the per-model summary as CSV.
output_dir = OUTPUT_DIR / "llm_benchmark"
output_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(output_dir / "detailed_results.csv", index=False, encoding="utf-8")
summary.to_csv(output_dir / "summary.csv", encoding="utf-8")

print("\n✅ Results saved to output/llm_benchmark/")