{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1d608e23",
   "metadata": {},
   "source": [
    "#### Adaptive Hybrid RAG Pipeline\n",
    "##### Data Ingestion and Importing important libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "d6e6dd37",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "from datasets import load_dataset\n",
    "from dotenv import load_dotenv\n",
    "import numpy as np\n",
    "from tqdm.auto import tqdm\n",
    "from sentence_transformers import SentenceTransformer,CrossEncoder\n",
    "import torch\n",
    "from groq import Groq\n",
    "from langchain_groq import ChatGroq\n",
    "import re\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34769b9e",
   "metadata": {},
   "source": [
    "#### Loading HF Token from HuggingFace for authenticated requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1404eeca",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv()\n",
    "os.environ[\"HF_TOKEN\"] = os.getenv(\"HF_TOKEN\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff8937c7",
   "metadata": {},
   "source": [
    "#### Data Chunking and Embedding \n",
    "###### (npy and json file downloaded from google colab with runtime set to T4 GPU due to laptop's CPU constraints(lack of GPU = 26hrs+ to load all 10000+ documents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "01c7f384",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<>:11: SyntaxWarning: invalid escape sequence '\\d'\n",
      "<>:11: SyntaxWarning: invalid escape sequence '\\d'\n",
      "C:\\Users\\mishr\\AppData\\Local\\Temp\\ipykernel_9072\\1794663228.py:11: SyntaxWarning: invalid escape sequence '\\d'\n",
      "  text = re.sub(r\"@xmath\\d+\", \"[MATH]\", text)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'\\nimport re\\nimport hashlib\\ndataset = load_dataset(\"ccdv/arxiv-summarization\")\\npaper = dataset[\"train\"][0]\\nprint(paper.keys())\\nsubset = dataset[\"train\"].select(range(10000))\\nsubset\\ndef clean_text(text):\\n    text = re.sub(r\"@xcite\", \"\", text)       \\n    text = re.sub(r\"@xmath\\\\d+\", \"[MATH]\", text)  \\n    text = re.sub(r\"[ \\t]+\", \" \", text)\\n    text = re.sub(r\"\\n{3,}\", \"\\n\\n\", text)\\n    return text.strip()\\ndef process_dataset(data):\\n    all_chunks = []\\n    splitter = RecursiveCharacterTextSplitter(\\n        chunk_size=1000,\\n        chunk_overlap=200,\\n        separators=[\"\\n\\n\", \"\\n\", \". \", \" \", \"\"]\\n    )\\n    for i, paper in enumerate(data):\\n        print(f\"[INFO] Processing document {i}...\")\\n        try:\\n            article = clean_text(paper.get(\"article\", \"\"))\\n            abstract = clean_text(paper.get(\"abstract\", \"\"))\\n            abstract_chunks = splitter.split_text(abstract)\\n            for chunk_idx, chunk in enumerate(abstract_chunks):\\n                chunk = chunk.strip()\\n                if len(chunk) < 50:\\n                    continue\\n                chunk_id = hashlib.md5(\\n                    f\"{i}_abstract_{chunk_idx}\".encode()\\n                ).hexdigest()\\n                all_chunks.append({\\n                    \"chunk_id\": chunk_id,\\n                    \"paper_id\": i,\\n                    \"section_title\": \"ABSTRACT\",\\n                    \"chunk_index\": chunk_idx,\\n                    \"text\": f\"{abstract[:200]} {chunk}\",\\n                    \"abstract\": abstract[:300],\\n                    \"token_estimate\": len(chunk.split())\\n                })\\n            article_chunks = splitter.split_text(article)\\n            for chunk_idx, chunk in enumerate(article_chunks):\\n                chunk = chunk.strip()\\n                if len(chunk) < 50:\\n                    continue\\n                chunk_id = 
hashlib.md5(\\n                    f\"{i}_article_{chunk_idx}\".encode()\\n                ).hexdigest()\\n                all_chunks.append({\\n                    \"chunk_id\": chunk_id,\\n                    \"paper_id\": i,\\n                    \"section_title\": \"ARTICLE\",\\n                    \"chunk_index\": chunk_idx,\\n                    \"text\": chunk,\\n                    \"abstract\": abstract[:300],\\n                    \"token_estimate\": len(chunk.split())\\n                })\\n\\n        except Exception as e:\\n            print(f\"[ERROR] Failed processing document {i}: {e}\")\\n\\n    print(f\"\\n[INFO] Successfully created {len(all_chunks)} chunks from {len(data)} papers\")\\n    return all_chunks\\nall_chunks = process_dataset(subset)\\nall_chunks\\n'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "import re\n",
    "import hashlib\n",
    "dataset = load_dataset(\"ccdv/arxiv-summarization\")\n",
    "paper = dataset[\"train\"][0]\n",
    "print(paper.keys())\n",
    "subset = dataset[\"train\"].select(range(10000))\n",
    "subset\n",
    "def clean_text(text):\n",
    "    text = re.sub(r\"@xcite\", \"\", text)       \n",
    "    text = re.sub(r\"@xmath\\d+\", \"[MATH]\", text)  \n",
    "    text = re.sub(r\"[ \\t]+\", \" \", text)\n",
    "    text = re.sub(r\"\\n{3,}\", \"\\n\\n\", text)\n",
    "    return text.strip()\n",
    "def process_dataset(data):\n",
    "    all_chunks = []\n",
    "    splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=1000,\n",
    "        chunk_overlap=200,\n",
    "        separators=[\"\\n\\n\", \"\\n\", \". \", \" \", \"\"]\n",
    "    )\n",
    "    for i, paper in enumerate(data):\n",
    "        print(f\"[INFO] Processing document {i}...\")\n",
    "        try:\n",
    "            article = clean_text(paper.get(\"article\", \"\"))\n",
    "            abstract = clean_text(paper.get(\"abstract\", \"\"))\n",
    "            abstract_chunks = splitter.split_text(abstract)\n",
    "            for chunk_idx, chunk in enumerate(abstract_chunks):\n",
    "                chunk = chunk.strip()\n",
    "                if len(chunk) < 50:\n",
    "                    continue\n",
    "                chunk_id = hashlib.md5(\n",
    "                    f\"{i}_abstract_{chunk_idx}\".encode()\n",
    "                ).hexdigest()\n",
    "                all_chunks.append({\n",
    "                    \"chunk_id\": chunk_id,\n",
    "                    \"paper_id\": i,\n",
    "                    \"section_title\": \"ABSTRACT\",\n",
    "                    \"chunk_index\": chunk_idx,\n",
    "                    \"text\": f\"{abstract[:200]} {chunk}\",\n",
    "                    \"abstract\": abstract[:300],\n",
    "                    \"token_estimate\": len(chunk.split())\n",
    "                })\n",
    "            article_chunks = splitter.split_text(article)\n",
    "            for chunk_idx, chunk in enumerate(article_chunks):\n",
    "                chunk = chunk.strip()\n",
    "                if len(chunk) < 50:\n",
    "                    continue\n",
    "                chunk_id = hashlib.md5(\n",
    "                    f\"{i}_article_{chunk_idx}\".encode()\n",
    "                ).hexdigest()\n",
    "                all_chunks.append({\n",
    "                    \"chunk_id\": chunk_id,\n",
    "                    \"paper_id\": i,\n",
    "                    \"section_title\": \"ARTICLE\",\n",
    "                    \"chunk_index\": chunk_idx,\n",
    "                    \"text\": chunk,\n",
    "                    \"abstract\": abstract[:300],\n",
    "                    \"token_estimate\": len(chunk.split())\n",
    "                })\n",
    "\n",
    "        except Exception as e:\n",
    "            print(f\"[ERROR] Failed processing document {i}: {e}\")\n",
    "\n",
    "    print(f\"\\n[INFO] Successfully created {len(all_chunks)} chunks from {len(data)} papers\")\n",
    "    return all_chunks\n",
    "all_chunks = process_dataset(subset)\n",
    "all_chunks\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe30ccfb",
   "metadata": {},
   "source": [
    "#### Loading relevant files downloaded from google colab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "fb5fc723",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Chunks: 443080\n",
      "Embeddings: (443080, 384)\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import numpy as np\n",
    "with open(\"../all_chunks_slim.json\") as f:\n",
    "    all_chunks = json.load(f)\n",
    "chunk_embeddings = np.load(\"../chunk_embeddings_10k (1).npy\")\n",
    "print(f\"Chunks: {len(all_chunks)}\")\n",
    "print(f\"Embeddings: {chunk_embeddings.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bf6fac2",
   "metadata": {},
   "source": [
    "#### Checking the contents of the 10000+ files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5802296d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "════════════════════════════════════════════════════════════\n",
      "TOP 30 DOMAIN WORDS IN YOUR CORPUS (stopwords removed)\n",
      "════════════════════════════════════════════════════════════\n",
      "  math                 2426  █████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\n",
      "  field                  78  ███████████████\n",
      "  time                   77  ███████████████\n",
      "  case                   64  ████████████\n",
      "  state                  64  ████████████\n",
      "  energy                 61  ████████████\n",
      "  order                  58  ███████████\n",
      "  function               56  ███████████\n",
      "  system                 54  ██████████\n",
      "  same                   49  █████████\n",
      "  since                  49  █████████\n",
      "  states                 48  █████████\n",
      "  only                   47  █████████\n",
      "  mass                   45  █████████\n",
      "  galaxies               44  ████████\n",
      "  different              43  ████████\n",
      "  above                  43  ████████\n",
      "  values                 43  ████████\n",
      "  first                  42  ████████\n",
      "  line                   40  ████████\n",
      "  frac                   39  ███████\n",
      "  high                   39  ███████\n",
      "  density                38  ███████\n",
      "  section                37  ███████\n",
      "  value                  37  ███████\n",
      "  magnetic               37  ███████\n",
      "  shown                  37  ███████\n",
      "  phase                  36  ███████\n",
      "  terms                  36  ███████\n",
      "  form                   36  ███████\n",
      "\n",
      "Total chunks in corpus : 443,080\n",
      "Sample size            : 300\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "from collections import Counter\n",
    "import re\n",
    "random.seed(42)\n",
    "sample_chunks = random.sample(all_chunks, min(300, len(all_chunks)))\n",
    "word_counts = Counter()\n",
    "stopwords = {\n",
    "    \"the\",\"a\",\"an\",\"of\",\"in\",\"to\",\"and\",\"is\",\"are\",\"for\",\"with\",\"that\",\n",
    "    \"this\",\"on\",\"by\",\"we\",\"be\",\"as\",\"from\",\"it\",\"or\",\"at\",\"our\",\"their\",\n",
    "    \"which\",\"have\",\"has\",\"been\",\"can\",\"also\",\"these\",\"they\",\"its\",\"was\",\n",
    "    \"were\",\"not\",\"but\",\"such\",\"using\",\"used\",\"when\",\"where\",\"all\",\"each\",\n",
    "    \"two\",\"one\",\"two\",\"three\",\"more\",\"than\",\"into\",\"over\",\"between\",\"after\",\n",
    "    \"both\",\"while\",\"than\",\"thus\",\"well\",\"here\",\"then\",\"there\",\"so\",\"may\",\n",
    "    \"will\",\"would\",\"could\",\"should\",\"about\",\"other\",\"through\",\"however\",\n",
    "    \"paper\",\"model\",\"show\",\"shows\",\"method\",\"approach\",\"results\",\"result\",\n",
    "    \"based\",\"proposed\",\"present\",\"use\",\"data\",\"set\",\"number\",\"given\"\n",
    "}\n",
    "for chunk in sample_chunks:\n",
    "    words = re.findall(r'\\b[a-z]{4,}\\b', chunk[\"text\"].lower())\n",
    "    for w in words:\n",
    "        if w not in stopwords:\n",
    "            word_counts[w] += 1\n",
    "print(\"═\" * 60)\n",
    "print(\"TOP 30 DOMAIN WORDS IN YOUR CORPUS (stopwords removed)\")\n",
    "print(\"═\" * 60)\n",
    "for word, count in word_counts.most_common(30):\n",
    "    bar = \"█\" * (count // 5)\n",
    "    print(f\"  {word:<20} {count:>4}  {bar}\")\n",
    "print()\n",
    "print(f\"Total chunks in corpus : {len(all_chunks):,}\")\n",
    "print(f\"Sample size            : {len(sample_chunks)}\")\n",
    "print()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2aff09ef",
   "metadata": {},
   "source": [
    "#### Run once to initialise BM25 index, save it to bm25_index_10k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b134658",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Split strings:  69%|██████▊   | 304183/443080 [00:18<00:07, 17657.93it/s]"
     ]
    }
   ],
   "source": [
    "import bm25s\n",
    "corpus_texts = [c[\"text\"] for c in all_chunks]\n",
    "retriever = bm25s.BM25()\n",
    "retriever.index(bm25s.tokenize(corpus_texts))\n",
    "retriever.save(\"bm25_index_10k\")\n",
    "print(f\"[INFO]BM25 index saved\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6299ae7",
   "metadata": {},
   "source": [
    "#### Once bm25s index is created simply load it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6d895079",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\mishr\\OneDrive\\Desktop\\adaptive-rag\\venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO]BM25 loaded from disk\n"
     ]
    }
   ],
   "source": [
    "import bm25s\n",
    "retriever = bm25s.BM25.load(\"bm25_index_10k\", load_corpus=True)\n",
    "print(f\"[INFO]BM25 loaded from disk\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1a83bbad",
   "metadata": {},
   "source": [
    "#### Initialise QDrant Client for VectorDB storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "36fbeb39",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\mishr\\AppData\\Local\\Temp\\ipykernel_17912\\359453757.py:3: UserWarning: Local mode is not recommended for collections with more than 20,000 points. Collection <papers> contains 443080 points. Consider using Qdrant in Docker or Qdrant Cloud for better performance with large datasets.\n",
      "  qdrant = QdrantClient(path=\"./qdrant_database_10k\")\n",
      "C:\\Users\\mishr\\AppData\\Local\\Temp\\ipykernel_17912\\359453757.py:4: DeprecationWarning: `recreate_collection` method is deprecated and will be removed in the future. Use `collection_exists` to check collection existence and `create_collection` instead.\n",
      "  qdrant.recreate_collection(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from qdrant_client import QdrantClient\n",
    "from qdrant_client.models import VectorParams, Distance, PointStruct\n",
    "qdrant = QdrantClient(path=\"./qdrant_database_10k\") \n",
    "qdrant.recreate_collection(\n",
    "    collection_name=\"papers\",\n",
    "    vectors_config=VectorParams(size=384, distance=Distance.COSINE)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7990100a",
   "metadata": {},
   "source": [
    "#### Run once to create collections Note: replace recreate_collection with create_collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5cba3048",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\mishr\\AppData\\Local\\Temp\\ipykernel_7008\\16691855.py:1: DeprecationWarning: `recreate_collection` method is deprecated and will be removed in the future. Use `collection_exists` to check collection existence and `create_collection` instead.\n",
      "  qdrant.recreate_collection(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Indexed 0/443080\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\mishr\\AppData\\Local\\Temp\\ipykernel_7008\\16691855.py:21: UserWarning: Local mode is not recommended for collections with more than 20,000 points. Current collection contains 20480 points. Consider using Qdrant in Docker or Qdrant Cloud for better performance with large datasets.\n",
      "  qdrant.upsert(collection_name=\"papers\", points=points)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO]Qdrant indexing complete\n"
     ]
    }
   ],
   "source": [
    "qdrant.recreate_collection(\n",
    "    collection_name=\"papers\",\n",
    "    vectors_config=VectorParams(size=384, distance=Distance.COSINE)\n",
    ")\n",
    "BATCH = 512\n",
    "for start in range(0, len(all_chunks), BATCH):\n",
    "    end = min(start + BATCH, len(all_chunks))\n",
    "    points = [\n",
    "        PointStruct(\n",
    "            id=i,\n",
    "            vector=chunk_embeddings[i].tolist(),\n",
    "            payload={\n",
    "                \"chunk_id\": all_chunks[i][\"chunk_id\"],\n",
    "                \"paper_id\": all_chunks[i][\"paper_id\"],\n",
    "                \"section\": all_chunks[i][\"section_title\"],\n",
    "                \"text\": all_chunks[i][\"text\"]\n",
    "            }\n",
    "        )\n",
    "        for i in range(start, end)\n",
    "    ]\n",
    "    qdrant.upsert(collection_name=\"papers\", points=points)\n",
    "    if start % 50000 == 0:\n",
    "        print(f\"Indexed {start}/{len(all_chunks)}\")\n",
    "print(f\"[INFO]Qdrant indexing complete\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1266a038",
   "metadata": {},
   "source": [
    "#### Sanity check to see if qdrant vectorized all the chunks->expected output : 443080"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "60514180",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total vectors in Qdrant: 443080\n"
     ]
    }
   ],
   "source": [
    "info = qdrant.get_collection(\"papers\")\n",
    "print(f\"Total vectors in Qdrant: {info.points_count}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bfe24ff4",
   "metadata": {},
   "source": [
    "#### Initialise all-MiniLM-L6-v2 Encoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c09fae3d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading weights: 100%|██████████| 103/103 [00:00<00:00, 2010.29it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bi_encoder ready\n"
     ]
    }
   ],
   "source": [
    "bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')\n",
    "print(\"bi_encoder ready\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a2c3a05",
   "metadata": {},
   "source": [
    "#### Initialize ms-marco-MiniLM-L-12-v2 cross encoder model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fddb7ab7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading weights: 100%|██████████| 201/201 [00:00<00:00, 3571.72it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cross encoder ready\n"
     ]
    }
   ],
   "source": [
    "cross_encoder = CrossEncoder(\"cross-encoder/ms-marco-MiniLM-L-12-v2\") #changed from all-MiniLM-L6-v2 and BAAI/bge-reranker-base\n",
    "print(\"Cross encoder ready\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b0fbf331",
   "metadata": {},
   "source": [
    "#### bm25 Search for keyword search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a6c4423",
   "metadata": {},
   "outputs": [],
   "source": [
    "def bm25_search(query, top_k=20):\n",
    "    tokenized = bm25s.tokenize(query)                       \n",
    "    results, scores = retriever.retrieve(tokenized, k=top_k) \n",
    "    output = []\n",
    "    for i, score in zip(results[0], scores[0]):\n",
    "        chunk = all_chunks[i].copy()\n",
    "        chunk[\"bm25_score\"] = float(score)\n",
    "        chunk[\"chunk_id\"] = str(chunk[\"chunk_id\"])\n",
    "        output.append(chunk)\n",
    "    return output"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8898b61",
   "metadata": {},
   "source": [
    "#### Semantic search for dense vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36c5a4a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def semantic_search(query, top_k=20):\n",
    "    query_emb = bi_encoder.encode([query]).tolist()[0]   \n",
    "    hits = qdrant.query_points(\n",
    "        collection_name=\"papers\",\n",
    "        query=query_emb,\n",
    "        limit=top_k\n",
    "    ).points\n",
    "    results = []\n",
    "    for hit in hits:\n",
    "        r = dict(hit.payload)\n",
    "        r[\"section_title\"] = r.pop(\"section\", \"UNKNOWN\")\n",
    "        r[\"chunk_id\"] = str(r.get(\"chunk_id\", \"\"))\n",
    "        results.append(r)\n",
    "    return results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d74a9df6",
   "metadata": {},
   "source": [
    "#### Reciprocal Rank fusion to normalize Semantic results + BM25 indexing for better results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "60b25a3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def reciprocal_rank_fusion(bm25_results, semantic_results, k=30):\n",
    "    scores = {}\n",
    "    chunk_map = {}\n",
    "    for rank, result in enumerate(bm25_results):\n",
    "        cid = str(result[\"chunk_id\"])\n",
    "        scores[cid] = scores.get(cid, 0) + 1 / (k + rank + 1)\n",
    "        chunk_map[cid] = result\n",
    "    for rank, result in enumerate(semantic_results):\n",
    "        cid = str(result[\"chunk_id\"])\n",
    "        scores[cid] = scores.get(cid, 0) + 1 / (k + rank + 1)\n",
    "        chunk_map[cid] = result\n",
    "    sorted_ids = sorted(scores, key=lambda x: scores[x], reverse=True)\n",
    "    results = []\n",
    "    for cid in sorted_ids:\n",
    "        chunk = chunk_map[cid].copy()\n",
    "        chunk[\"rrf_score\"] = scores[cid]\n",
    "        results.append(chunk)\n",
    "    return results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd86a6bb",
   "metadata": {},
   "source": [
    "#### Hybrid search combining bm25s and semantic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "62c92687",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def hybrid_search(query, top_k=50, fetch_k=100):\n",
    "    bm25_results = bm25_search(query, top_k=fetch_k)\n",
    "    semantic_results = semantic_search(query, top_k=fetch_k)\n",
    "    fused = reciprocal_rank_fusion(bm25_results, semantic_results)\n",
    "    return fused[:top_k]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d602b26e",
   "metadata": {},
   "source": [
    "#### Test for hybrid search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3e5f5254",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'chunk_id': '817383855449daaf928de75431e6145b',\n",
       "  'paper_id': 10,\n",
       "  'text': 'we demonstrate different types of swarm equilibria via examples . in section \\n [ sec : repulsive ] , we focus on purely repulsive endogenous interactions . \\n we consider a bounded domain with no exogenous forces , a half - line subject to gravitational forces , and an unbounded domain subject to a quadratic exogenous potential , modeling attraction to a light , chemical , or nutrient source . \\n for all three situations , we find exact solutions for swarm equilibria . \\n for the first two examples , these equilibria consist of a density distribution that is classical in the interior of the domain , but contains [MATH]-functions at the boundaries . for the third example , the equilibrium is compactly supported with the density dropping discontinuously to zero at the edge of the support .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.061669829222011384},\n",
       " {'chunk_id': '86480860ff7c16792aa1f822a587e234',\n",
       "  'paper_id': 10,\n",
       "  'text': 'we then derive an analogous continuum model and use variational methods to seek minimizers of its energy . \\n this process involves solution of a fredholm integral equation for the density . \\n for some choices of endogenous forces , we are able to find exact solutions . perhaps surprisingly , they are not always classical . in particular , they can involve [MATH]-function concentrations of mass at the domain boundary . \\n the rest of this paper is organized as follows . in section \\n [ sec : formulation ] , we create the mathematical framework for our study , and derive conditions for a particular density distribution to be an equilibrium solution , and to be stable to various classes of perturbations . in sections [ sec : repulsive ] and [ sec : morse ] , \\n we demonstrate different types of swarm equilibria via examples . in section \\n [ sec : repulsive ] , we focus on purely repulsive endogenous interactions .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.05689102564102564},\n",
       " {'chunk_id': 'cb0657d577a661a0e9ce465434a09c56',\n",
       "  'paper_id': 10,\n",
       "  'text': 'this energy can be interpreted as the continuum analog of the summed pairwise energy of the corresponding discrete ( particle ) model . \\n we will also exploit this energy to find equilibrium solutions and study their stability . in this paper \\n , we focus on equilibria of swarms and ask the following questions : * what sorts of density distributions do swarming systems make ? are they classical or nonclassical ? * how are the final density distributions reached affected by endogenous interactions , exogenous forces , boundaries , and the interplay of these ? * \\n how well can discrete and continuum swarming systems approximate each other ? to answer these questions , we formulate a general mathematical framework for discrete , interacting swarm members in one spatial dimension , also subject to exogenous forces . \\n we then derive an analogous continuum model and use variational methods to seek minimizers of its energy .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.052526595744680854},\n",
       " {'chunk_id': 'c9d225b2dda312fedd8ec56b9800b08f',\n",
       "  'paper_id': 5318,\n",
       "  'text': '+ + calculating the stability conditions of all three equilibria allows us to decide under which parameter conditions the swarm either collapses to a point ( [MATH] ) , freezes to a certain pattern ( [MATH] ) , or moves amorphously ( [MATH] ) . for equilibrium [MATH] , \\n all seven eigenvalues of the jacobian are accessible and it is found to be stable under the conditions [MATH] if these inequalities are satisfied , all swarm particles will collapse into the origin and will remain motionless apart from stochastic fluctuations . \\n time evolution of the deterministic equations ( [ eins])-([zwei ] ) for [MATH] ( dashed line ) , [MATH] ( solid - line ) , [MATH] ( dot - dashed line ) , [MATH] ( solid line ) , [MATH] , [MATH] and [MATH] ( not shown for simplicity of presentation ) for the collapse of the swarm to the minimum of the external potential ( [MATH] ) . \\n simulation parameters are [MATH] , [MATH] , [MATH] , [MATH] , [MATH] and [MATH] ( [MATH] ) . ]',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.0521680216802168},\n",
       " {'chunk_id': 'c93f9ef8e148a775654dd7ca43713f6e',\n",
       "  'paper_id': 10,\n",
       "  'text': 'the horizontal blue line indicates ( schematically ) that for [MATH] , the equilibrium consists of all mass concentrated at the origin ; as discussed above , this state is the global minimizer and ( we believe ) the global attractor . as mass \\n is increased through [MATH] , the equilibrium is a swarm minimizer consisting of a classical swarm in the air separated from the origin , and some mass concentrated on the ground . as [MATH] increases , the proportion of mass located on the ground decreases monotonically . \\n figure [ fig : quasi2dnumerics](b ) visualizes the support of the airborne swarm , which exists only for [MATH] ; the lower and upper data represent the coordinates of the bottom and top of the swarm , respectively . as mass \\n is increased , the span of the swarm increases monotonically . as established above , when [MATH] , swarm minimizers exist with two components .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.05083655083655084},\n",
       " {'chunk_id': '02c60d8c2c89aa9b8956a5439adbbfce',\n",
       "  'paper_id': 10,\n",
       "  'text': 'we study equilibrium configurations of swarming biological organisms subject to exogenous and pairwise endogenous forces . beginning with a discrete dynamical model \\n , we derive a variational descrip we study equilibrium configurations of swarming biological organisms subject to exogenous and pairwise endogenous forces . beginning with a discrete dynamical model \\n , we derive a variational description of the corresponding continuum population density . \\n equilibrium solutions are extrema of an energy functional , and satisfy a fredholm integral equation . \\n we find conditions for the extrema to be local minimizers , global minimizers , and minimizers with respect to infinitesimal lagrangian displacements of mass . \\n in one spatial dimension , for a variety of exogenous forces , endogenous forces , and domain configurations , we find exact analytical expressions for the equilibria . \\n these agree closely with numerical simulations of the underlying discrete model.the exact solutions provide a sampling of the wide variety of equilibrium configurations possible within our general swarm modeling framework .',\n",
       "  'section_title': 'ABSTRACT',\n",
       "  'rrf_score': 0.050505050505050504},\n",
       " {'chunk_id': 'fc81774421f77a46f4cf352c80b7081a',\n",
       "  'paper_id': 10,\n",
       "  'text': \"we take total mass [MATH] and set the domain half - width to be [MATH] . the interaction potential parameters [MATH] and [MATH] . \\n the solid line is the classical solution [MATH] . \\n dots correspond to the numerically - obtained equilibrium of the discrete system ( [ eq : discretesystem ] ) with [MATH] swarm members . \\n each `` lollipop '' at the domain boundary corresponds to a [MATH]-function of mass [MATH] in the analytical solution , and simultaneously to a superposition of [MATH] swarm members in the numerical simulation . \\n we now return to the locust swarm model of , discussed also in section [ sec : intro ] . \\n recall that locust swarms are observed to have a concentration of individuals on the ground , a gap or `` bubble '' where the density of individuals is near zero , and a sharply delineated swarm of flying individuals . \\n this behavior is reproduced in the model ( [ eq : locusts ] ) ; see figure [ fig : locust](b ) .\",\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.04816017316017316},\n",
       " {'chunk_id': 'bd7899f25d368af0905c00ba98d2010e',\n",
       "  'paper_id': 10,\n",
       "  'text': ', there is no equilibrium solution on an infinite domain . on a finite domain \\n , mass is partitioned between a classical solution in the interior and [MATH]-concentrations on the boundary . \\n we recall that for the locust model of ( see figure [ fig : locust ] ) a concentration of locusts occurs on the ground , with a seemingly classical component above , separated by a gap . \\n none of the one - dimensional solutions ( for the laplace and morse potentials ) discussed above contain a gap , that is , multiple swarm components that are spatially disconnected , suggesting that this configuration is intrinsically two - dimensional . to study this configuration , we computed a quasi - two - dimensional potential corresponding to a horizontally uniform swarm .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.04583333333333334},\n",
       " {'chunk_id': '27c1b3136864fa203f71c8e3e78b507a',\n",
       "  'paper_id': 4662,\n",
       "  'text': 'one of eq . and eq . . as boundary conditions we use the following : \\n the first boundary condition is eq . \\n taken on one of the boundaries , which simply determine the integration constant for eq . . \\n the 4 other conditions control the overall content of the box ( particularly , it is the same as in equilibrium ) [MATH] here [MATH] and [MATH] are the equilibrium values of the total overall mass and the total mass of the 1st component in the whole box . \\n the 2 more conditions are [MATH] which indicate the fact , that box boundaries are in the homogeneous region . in contrast to the equilibrium case \\n , the density in the non - equilibrium homogeneous region may vary with coordinate , so [MATH] differs from zero on the boundaries , and , in fact , they do . \\n the value of the [MATH] is however small in the homogeneous region , comparing to the value in the surface region , so we may neglect it and use such approximation .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.044444444444444446},\n",
       " {'chunk_id': '968d4fa78f163012ce6d5c9793b06065',\n",
       "  'paper_id': 10,\n",
       "  'text': \"therefore , the solution is a global minimizer . the solution [MATH] is shown schematically in figure [ fig : repulsion_schematic](a ) . \\n figure [ fig : repulsion_numerics](a ) compares analytical and numerical results for an example case where we take the total mass to be [MATH] and the finite domain to be [MATH] $ ] with [MATH] . \\n cross - hatched boxes indicate the boundary of the domain . \\n the solid line is the classical solution [MATH] . \\n dots correspond to the numerically - obtained equilibrium of the discrete system ( [ eq : discretesystem ] ) with [MATH] swarm members . \\n the density at each lagrangian grid point is estimated using the correspondence discussed in section ( [ sec : contmodel ] ) and pictured in figure [ fig : delta_schematic ] . \\n each `` lollipop '' at the domain boundary corresponds to a [MATH]-function of mass [MATH] in the analytical solution , and simultaneously to a superposition of [MATH] swarm members in the numerical simulation .\",\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.04444444444444444},\n",
       " {'chunk_id': 'adb706c03bb31500dff355df3e00a347',\n",
       "  'paper_id': 10,\n",
       "  'text': 'by contrast with the one - dimensional system of section [ sec : grav ] in which no gap is observed , these gap states appear to be the generic configuration for sufficiently large mass in the quasi - two - dimensional system . we conclude that dimensionality is crucial element for the formation of the bubble - like shape of real locust swarms . \\n in this paper we deeveloped a framework for studying equilibrium solutions for swarming problems . \\n we related the discrete swarming problem to an associated continuum model . \\n this continuum model has an energy formulation which enables analysis equilibrium solutions and their stability . \\n we derived conditions for an equilibrium solution to be a local minimizer , a global minimizer , and/or a swarm minimizer , that is , stable to infinitesimal lagrangian deformations of the mass . \\n we found many examples of compactly supported equilibrium solutions , which may be discontinuous at the boundary of the support .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.04298245614035087},\n",
       " {'chunk_id': '846fac6ecf37bb017d3f7ada493e4f10',\n",
       "  'paper_id': 10,\n",
       "  'text': 'we define the concept of a multi - component swarm equilibrium . \\n suppose the swarm s support can be divided into a set of [MATH] disjoint , closed , connected components [MATH] , that is [MATH] we define a swarm equilibrium as a configuration in which each individual swarm component is in equilibrium , [MATH] we can still define [MATH] in [MATH] [MATH] + f(x ) = \\\\int_{{\\\\omega_{{{\\\\bar \\\\rho}}}}}q(x - y ) { { \\\\bar \\\\rho}}(y)~dy + f(x),\\\\ ] ] but now [MATH] in [MATH] . \\n we can now define a swarm minimizer . \\n we say a swarm equilibrium is a swarm minimizer if [MATH] for some neighborhood of each component [MATH] of the swarm . in practice \\n this means that the swarm is an energy minimizer for infinitesimal redistributions of mass in the neighborhood of each component . \\n this might also be called a lagrangian minimizer in the sense that the equilibrium is a minimizer with respect to infinitesimal lagrangian deformations of the distributions .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.042674731182795696},\n",
       " {'chunk_id': '634b150943b085280c57ee92ec17902a',\n",
       "  'paper_id': 10,\n",
       "  'text': 'we demonstrated numerically that for a wide range of parameters , there exists a continuous family of swarm minimizers that consist of a concentration on the ground and a disconnected , classical component in the air , reminiscent of our earlier numerical studies of a discrete locust swarm model . \\n we believe that the analytical solutions we found provide a sampling of the rich tapestry of equilibrium solutions that manifest in the general model we have considered , and in nature . \\n we hope that these solutions will inspire further analysis and guide future modeling efforts . \\n cmt acknowledges support from the nsf through grants dms-0740484 and dms-1009633 . \\n ajb gratefully acknowledges the support from the nsf through grants dms-0807347 and dms-0730630 , and the hospitality of robert kohn and the courant institute of mathematical sciences .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.04160688665710187},\n",
       " {'chunk_id': '012c6600d33062cc2eb16585b8f77153',\n",
       "  'paper_id': 5318,\n",
       "  'text': '+ + _ dynamic _ states of the swarm are realized when reaching equilibrium [MATH] , since the sum of all kinetic energies then has a non - vanishing value . \\n [MATH] is found to be stable under the conditions [MATH] , [MATH] and [MATH] again , one zero eigenvalue appears and the ruth - hurwitz - theorem was applied to determine the stability of the system . \\n note that in [MATH] , the two equations for [MATH] and [MATH] define two planes in the space of variables [MATH] , whose intersection gives a line , i.e. a manifold of equilibria with dimension one , which matches the overall number of zero eigenvalues . \\n hence the theory of stable manifolds again ensures that we have converging trajectories to [MATH] under the conditions given above . \\n if these inequalities are satisfied by a specific parameter choice , the swarm reaches equilibrium [MATH] and appears to simulate the amorphous behavior of insect swarms .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.04126602564102564},\n",
       " {'chunk_id': '0f1c71e308fd3c3c819b0f1d57169104',\n",
       "  'paper_id': 3514,\n",
       "  'text': '[ fig : latticemodel](b ) . \\n this situation will be discussed in section [ jams ] . \\n the motor traffic through tube - like compartments in which one or several filaments are aligned parallel to the cylinder axis represents a simple system which mimics the transport in axons . \\n we have studied tube - like systems with various kinds of boundary condition : closed systems , periodic boundary conditions , open boundaries coupled to motor reservoirs , and half - open systems . \\n the simplest case is given by periodic boundary conditions which can be solved exactly . \\n in this case , the stationary probability distribution is given by a product measure ; the bound and unbound motor densities are constant and satisfy the radial equilibrium condition [MATH] where the last approximation usually holds under experimentally accessible conditions , where the unbound density is small , but the bound motor density can be of the order of one motor per binding site .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.039649256576439196},\n",
       " {'chunk_id': '12e4fd413d278c7c6f9ac4dabce23902',\n",
       "  'paper_id': 5318,\n",
       "  'text': 'additionally , also the variable [MATH] can be written in terms of [MATH] and [MATH] : [MATH] . in these new variables , \\n when omitting the noise influences , the system reads : [MATH] to handle the above nonlinear system , we first calculate stationary points , i.e. values of [MATH] , where [MATH] and linearize the equations around these equilibrium solutions . by calculating eigenvalues of the jacobian \\n , we can analyze stability properties . \\n we find three distinct equilibrium points [MATH] : [MATH] equilibria [MATH] and [MATH] are truly _ static _ configurations of the swarm , since [MATH] in both cases , whereas equilibirum [MATH] has a nonvanishing equilibrium value of [MATH] , which means that swarm particles are actually moving .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.039282990083905414},\n",
       " {'chunk_id': 'c66a4c0aac5a15a8813ac108226ab253',\n",
       "  'paper_id': 10,\n",
       "  'text': 'figure [ fig : lambda](c ) shows the critical case when [MATH] . in this case \\n the local minimum of [MATH] at [MATH] satisfies [MATH] and [MATH] . \\n figure [ fig : lambda](d ) shows the case when [MATH] and now [MATH] in the neighborhood of the minimum . in this case \\n the solution with the mass concentrated at the origin is only a swarm minimizer ; the energy of the system can be reduced by transporting some of the mass at the origin to the neighborhood of the local minimum . when [MATH] it is possible to construct a continuum of swarm minimizers . \\n we have conducted a range of simulations for varying [MATH] and have measured two basic properties of the solutions . \\n we set [MATH] and use [MATH] in all simulations of the discrete system . \\n initially , all the swarm members are high above the ground and we evolve the simulation to equilibrium . figure [ fig : quasi2dnumerics](a ) measures the mass on the ground as a percentage of the total swarm mass .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.035151515151515156},\n",
       " {'chunk_id': 'acaa0a88d265e020a2029db3d8c5bdf0',\n",
       "  'paper_id': 10,\n",
       "  'text': \"furthermore , for especially large systems , computation , though straightforward , may become a bottleneck . \\n continuum models are more amenable to analysis . \\n one well - studied continuum model is that of , a partial integrodifferential equation model for a swarm population density [MATH] in one spatial dimension : [MATH] the density [MATH] obeys a conservation equation , and [MATH] is the velocity field , which is determined via convolution with the antisymmetric pairwise endogenous force [MATH] , the one - dimensional analog of a social force like the one in ( [ eq : locusts ] ) . \\n the general model ( [ eq : introeq ] ) displays at least three solution types as identified in . \\n populations may concentrate to a point , reach a finite steady state , or spread . in , we identified conditions on the social interaction force [MATH] for each behavior to occur . \\n these conditions map out a `` phase diagram '' dividing parameter space into regions associated with each behavior .\",\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.03480392156862745},\n",
       " {'chunk_id': '40178222980779fd6eb9f079a04cf4c7',\n",
       "  'paper_id': 5318,\n",
       "  'text': 'according to stable manifold theory , for every chosen value of [MATH] , there exists a stable manifold on which the trajectory converges asymptotically to equilibrium [MATH] . \\n ( in the following , we have set [MATH] . ) in this second equilibrium state , the whole swarm is frozen to a static configuration , which satisfies [MATH] const . \\n relaxation to a jelly - like state ( [MATH] ) , according to equations ( [ eins])-([zwei ] ) . \\n simulation parameters are [MATH] , [MATH] , [MATH] , [MATH] , [MATH] and [MATH] ( [MATH] ) . ] \\n one example of such a frozen state is shown in figure [ fig : two ] . \\n if the swarm is subject to noise influences , the individual particles will oscillate around their fixed positions . \\n + + _ dynamic _ states of the swarm are realized when reaching equilibrium [MATH] , since the sum of all kinetic energies then has a non - vanishing value .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.03469952225295449},\n",
       " {'chunk_id': '273d867f9a2de1158d4d8ad825332ef3',\n",
       "  'paper_id': 804,\n",
       "  'text': 'however , it should be noted that the necessary conditions satisfied by such core - envelope models at the surface may not always turn out to be sufficient for describing the state of hydrostatic equilibrium [ because for an assigned value of [MATH] , the average density of such configurations may not always turn out to be less than or equal to the density of the homogeneous density sphere for the same mass , as indicated by eqs.(16 ) and ( 17 ) respectively ( it would depend upon the types of the density variations considered for the core and envelope regions and the the matching conditions at the core - envelope boundary ) ] . \\n thus , it follows that the criterion obtained in [ 20 ] is able to provide a _ necessary _ and _ sufficient _ condition for any regular configuration to be consistent with the state of hydrostatic equilibrium .',\n",
       "  'section_title': 'ARTICLE',\n",
       "  'rrf_score': 0.03142589118198874}]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Smoke-test hybrid retrieval on a sample query; `hybrid_results` is reused by the rerank test cell.\n",
    "query = \"what is swarm equilibrium density distribution boundary conditions\"\n",
    "hybrid_results = hybrid_search(query, top_k=20)\n",
    "hybrid_results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a6c96df",
   "metadata": {},
   "source": [
    "#### Rerank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "3ea9c98d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rerank(query, top_k_chunks, top_n=5):\n",
    "    pairs = [(query, c[\"text\"]) for c in top_k_chunks]\n",
    "    scores = cross_encoder.predict(pairs)\n",
    "    ranked = sorted(zip(scores, top_k_chunks), key=lambda x: x[0], reverse=True)\n",
    "    results = []\n",
    "    for score, chunk in ranked[:top_n]:\n",
    "        chunk = chunk.copy()\n",
    "        chunk[\"rerank_score\"] = float(score)\n",
    "        results.append(chunk)\n",
    "    return results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f3a3b4a",
   "metadata": {},
   "source": [
    "#### Test rerank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "276fe4d8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RERANK: 2.6198\n",
      "SEC:    ARTICLE\n",
      "TEXT:   this energy can be interpreted as the continuum analog of the summed pairwise energy of the corresponding discrete ( particle ) model . \n",
      " we will also exploit this energy to find equilibrium solutions and study their stability . in this paper \n",
      " , we \n",
      "---\n",
      "RERANK: 2.1184\n",
      "SEC:    ARTICLE\n",
      "TEXT:   we then derive an analogous continuum model and use variational methods to seek minimizers of its energy . \n",
      " this process involves solution of a fredholm integral equation for the density . \n",
      " for some choices of endogenous forces , we are able to fin\n",
      "---\n",
      "RERANK: 2.0673\n",
      "SEC:    ARTICLE\n",
      "TEXT:   we demonstrate different types of swarm equilibria via examples . in section \n",
      " [ sec : repulsive ] , we focus on purely repulsive endogenous interactions . \n",
      " we consider a bounded domain with no exogenous forces , a half - line subject to gravitation\n",
      "---\n",
      "RERANK: 2.0151\n",
      "SEC:    ABSTRACT\n",
      "TEXT:   we study equilibrium configurations of swarming biological organisms subject to exogenous and pairwise endogenous forces . beginning with a discrete dynamical model \n",
      " , we derive a variational descrip we study equilibrium configurations of swarming b\n",
      "---\n",
      "RERANK: 1.1149\n",
      "SEC:    ARTICLE\n",
      "TEXT:   by contrast with the one - dimensional system of section [ sec : grav ] in which no gap is observed , these gap states appear to be the generic configuration for sufficiently large mass in the quasi - two - dimensional system . we conclude that dimen\n",
      "---\n"
     ]
    }
   ],
   "source": [
    "# Rerank the hybrid hits for the sample query and preview the five best.\n",
    "query = \"what is swarm equilibrium density distribution boundary conditions\"\n",
    "reranked = rerank(query, hybrid_results, top_n=5)\n",
    "\n",
    "for r in reranked:\n",
    "    print(\n",
    "        f\"RERANK: {r['rerank_score']:.4f}\",\n",
    "        f\"SEC:    {r['section_title']}\",\n",
    "        f\"TEXT:   {r['text'][:250]}\",\n",
    "        \"---\",\n",
    "        sep=\"\\n\",\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7620b6dc",
   "metadata": {},
   "source": [
    "#### Load GROQ_API_KEY from .env"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "1bbcea7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv()\n",
    "# Fail fast with a readable message: copying a missing (None) value into\n",
    "# os.environ would raise an opaque TypeError instead. load_dotenv() has already\n",
    "# exported the key, so no re-assignment is needed.\n",
    "if not os.getenv(\"GROQ_API_KEY\"):\n",
    "    raise RuntimeError(\"GROQ_API_KEY is not set; add it to your .env file or environment.\")\n",
    "groq_client = Groq()  # picks up GROQ_API_KEY from the environment"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14437f78",
   "metadata": {},
   "source": [
    "#### Calculate confidence score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "e46487b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def confidence_score(query, rank_results):\n",
    "    if not rank_results:\n",
    "        return 0.0, []\n",
    "    scores = [r.get(\"rerank_score\", r.get(\"rrf_score\", 0.0)) for r in rank_results]\n",
    "    min_s, max_s = min(scores), max(scores)\n",
    "    if max_s > min_s:\n",
    "        norm = [(s - min_s) / (max_s - min_s) for s in scores]\n",
    "    else:\n",
    "        norm = [1.0] * len(scores)\n",
    "    top_score = norm[0]\n",
    "    avg_score = np.mean(norm)\n",
    "    dropoff   = norm[0] - norm[1] if len(norm) > 1 else 0\n",
    "    confidence = 0.5 * top_score + 0.3 * avg_score + 0.2 * min(dropoff * 2, 1.0)\n",
    "    return round(float(confidence), 4), scores"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27c5d811",
   "metadata": {},
   "source": [
    "#### Grade retrieval quality with an LLM. LLM used - llama-3.1-8b-instant"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "bb287e26",
   "metadata": {},
   "outputs": [],
   "source": [
    "def grading(query, rank_results, confidence):\n",
    "    if not rank_results:\n",
    "        return \"irrelevant\", \"no result found\"\n",
    "    top_chunks = \"\\n\\n--\\n\\n\".join(r[\"text\"][:300] for r in rank_results[:3])\n",
    "    response = groq_client.chat.completions.create(\n",
    "        model=\"llama-3.1-8b-instant\",\n",
    "        max_tokens=200,\n",
    "        messages=[{\n",
    "            \"role\": \"user\",\n",
    "            \"content\": f\"\"\"You are a retrieval quality grader.\n",
    "Query: {query}\n",
    "Retrieved chunks:\n",
    "{top_chunks}\n",
    "Grade the retrieval. Respond in this exact format:\n",
    "GRADE: <relevant|partial|irrelevant>\n",
    "REASON: <one sentence>\"\"\"\n",
    "        }]\n",
    "    ) \n",
    "    text = response.choices[0].message.content.strip()\n",
    "    grade_match = re.search(r\"GRADE:\\s*(relevant|partial|irrelevant)\", text, re.IGNORECASE)\n",
    "    reason_match = re.search(r\"REASON:\\s*(.+)\", text)\n",
    "    grade = grade_match.group(1).lower() if grade_match else \"irrelevant\"\n",
    "    reason = reason_match.group(1).strip() if reason_match else \"unknown\"\n",
    "    return grade, reason"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d175b562",
   "metadata": {},
   "source": [
    "#### Expand the query with LLM-generated variants and run hybrid search over all of them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "de937303",
   "metadata": {},
   "outputs": [],
   "source": [
    "def expand_query(query, n=2):\n",
    "    \"\"\"Return the original query followed by up to `n` LLM-generated variants.\n",
    "\n",
    "    Args:\n",
    "        query: The original search query.\n",
    "        n: Number of alternative queries to request and keep.\n",
    "\n",
    "    Returns:\n",
    "        A list starting with `query`, then at most `n` distinct variants. Blank\n",
    "        lines, repeats of the original and duplicate variants are dropped;\n",
    "        leading list markers and wrapping quotes are stripped.\n",
    "    \"\"\"\n",
    "    response = groq_client.chat.completions.create(\n",
    "        model=\"llama-3.1-8b-instant\",\n",
    "        max_tokens=150,\n",
    "        messages=[{\n",
    "            \"role\": \"user\",\n",
    "            \"content\": f\"\"\"Generate {n} alternative search queries for a scientific paper retrieval system.\n",
    "Original: {query}\n",
    "\n",
    "Rules:\n",
    "- Do NOT repeat or rephrase the original query\n",
    "- Use different vocabulary — avoid repeating key terms from the original\n",
    "- One must use broader conceptual language\n",
    "- One must use related technical synonyms or adjacent concepts\n",
    "\n",
    "Return ONLY the {n} new queries, one per line, no numbering, no original.\"\"\"\n",
    "        }]\n",
    "    )\n",
    "    # The message content can be None; treat that as \"no variants\".\n",
    "    content = response.choices[0].message.content or \"\"\n",
    "    variants = []\n",
    "    seen = {query.strip().lower()}\n",
    "    for line in content.splitlines():\n",
    "        # The model may number, bullet or quote its lines despite the instructions.\n",
    "        candidate = re.sub(r\"^\\s*(?:[-*]+|\\d+[.)])\\s+\", \"\", line).strip().strip(\"\\\"'\").strip()\n",
    "        key = candidate.lower()\n",
    "        if candidate and key not in seen:\n",
    "            seen.add(key)\n",
    "            variants.append(candidate)\n",
    "    return [query] + variants[:n]\n",
    "\n",
    "def hybrid_search_expanded(query, top_k=50, fetch_k=100):\n",
    "    \"\"\"Run hybrid search for the query and its LLM expansions and fuse the hits.\n",
    "\n",
    "    Each query variant retrieves up to `fetch_k` chunks. Hits are merged by\n",
    "    \"chunk_id\": a chunk's \"rrf_score\" values are summed across the variants\n",
    "    that found it, so chunks retrieved by several phrasings rank higher.\n",
    "    Sorting before truncation matters: plain concatenation would let the first\n",
    "    query's hits fill every one of the `top_k` slots.\n",
    "\n",
    "    Args:\n",
    "        query: The original search query.\n",
    "        top_k: Maximum number of fused chunks to return.\n",
    "        fetch_k: Number of chunks requested from hybrid_search per variant.\n",
    "\n",
    "    Returns:\n",
    "        Up to `top_k` chunk dicts (copies) in descending fused-score order;\n",
    "        their \"rrf_score\" holds the fused score.\n",
    "    \"\"\"\n",
    "    fused = {}\n",
    "    for q in expand_query(query):\n",
    "        for r in hybrid_search(q, top_k=fetch_k):\n",
    "            cid = r[\"chunk_id\"]\n",
    "            score = r.get(\"rrf_score\", 0.0)\n",
    "            if cid in fused:\n",
    "                fused[cid][\"rrf_score\"] += score\n",
    "            else:\n",
    "                # Copy so the fused score never leaks into hybrid_search's own dicts.\n",
    "                fused[cid] = {**r, \"rrf_score\": score}\n",
    "    # sorted() is stable, so ties keep first-seen order (original query first).\n",
    "    ranked = sorted(fused.values(), key=lambda c: c[\"rrf_score\"], reverse=True)\n",
    "    return ranked[:top_k]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ab76b3c",
   "metadata": {},
   "source": [
    "#### Retrieve chunks and grade them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "b35c206a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def retrieve_and_grade(query, top_k=5, fetch_k=50, confidence_threshold=0.45):\n",
    "    \"\"\"Retrieve chunks for a query, rerank if confidence is low, and grade them.\n",
    "\n",
    "    Args:\n",
    "        query: The user query string.\n",
    "        top_k: Number of chunks to keep in the final result.\n",
    "        fetch_k: Number of candidates requested from the expanded hybrid search.\n",
    "        confidence_threshold: Minimum confidence; below it the candidates are\n",
    "            reranked, and a final confidence below it marks the retrieval failed.\n",
    "\n",
    "    Returns:\n",
    "        Dict with keys \"query\", \"results\", \"confidence\", \"grade\", \"reason\",\n",
    "        \"retrieval_failed\" and \"fallback_message\". \"results\" is empty and\n",
    "        \"fallback_message\" is set when the retrieval failed.\n",
    "    \"\"\"\n",
    "    raw = hybrid_search_expanded(query, top_k=fetch_k)\n",
    "    pre_confidence, _ = confidence_score(query, raw[:top_k])\n",
    "    # Only rerank when there is something to rerank: an empty retrieval scores\n",
    "    # 0.0 and must not be sent to the cross-encoder.\n",
    "    if raw and pre_confidence < confidence_threshold:\n",
    "        results = rerank(query, raw, top_n=top_k)\n",
    "        confidence, _ = confidence_score(query, results)\n",
    "    else:\n",
    "        results = raw[:top_k]\n",
    "        confidence = pre_confidence\n",
    "    grade, reason = grading(query, results, confidence)\n",
    "    failed = (grade == \"irrelevant\" or confidence < confidence_threshold)\n",
    "    return {\n",
    "        \"query\": query,\n",
    "        \"results\": results if not failed else [],\n",
    "        \"confidence\": confidence,\n",
    "        \"grade\": grade,\n",
    "        \"reason\": reason,\n",
    "        \"retrieval_failed\": failed,\n",
    "        \"fallback_message\": \"No relevant information found for this query.\" if failed else None\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7fffed34",
   "metadata": {},
   "source": [
    "#### Test retrieve and grade"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "837e388f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "QUERY:      what is swarm equilibrium density distribution boundary conditions\n",
      "CONFIDENCE: 0.7867\n",
      "GRADE:      partial\n",
      "REASON:     The retrieved chunks contain some information related to the concept of \"swarm equilibrium density distribution\", including a mention of finding equilibrium solutions, but they do not specifically address the term \"boundary conditions\", which is the main query.\n",
      "FAILED:     False\n",
      "TOP RESULT: we demonstrate different types of swarm equilibria via examples . in section \n",
      " [ sec : repulsive ] , we focus on purely repulsive endogenous interacti\n",
      "────────────────────────────────────────────────────────────\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "QUERY:      what is the best recipe for pasta carbonara\n",
      "CONFIDENCE: 0.7901\n",
      "GRADE:      irrelevant\n",
      "REASON:     The provided chunks discuss topics unrelated to pasta carbonara, such as fluid dynamics, quantum mechanics, and nonlinear normal modes, and have no relevance to a recipe.\n",
      "FAILED:     True\n",
      "FALLBACK:   No relevant information found for this query.\n",
      "────────────────────────────────────────────────────────────\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "QUERY:      explain transformer attention mechanism\n",
      "CONFIDENCE: 0.7739\n",
      "GRADE:      partial\n",
      "REASON:     The retrieved chunks are about concepts from physics and problem-solving in visual saliency modeling, but they do not mention the transformer attention mechanism, making them only partially relevant.\n",
      "FAILED:     False\n",
      "TOP RESULT: can be viewed conceptually as a net current [MATH] circulating around a primary coil . \n",
      " circulating currents found high in the corona can be viewed a\n",
      "────────────────────────────────────────────────────────────\n"
     ]
    }
   ],
   "source": [
    "# Sanity-check the pipeline on an in-domain, an off-topic and a borderline query.\n",
    "queries = [\n",
    "    \"what is swarm equilibrium density distribution boundary conditions\",  # should pass\n",
    "    \"what is the best recipe for pasta carbonara\",                         # should fail\n",
    "    \"explain transformer attention mechanism\",                             # borderline\n",
    "]\n",
    "for q in queries:\n",
    "    result = retrieve_and_grade(q)\n",
    "    print(\n",
    "        f\"\\nQUERY:      {result['query']}\",\n",
    "        f\"CONFIDENCE: {result['confidence']}\",\n",
    "        f\"GRADE:      {result['grade']}\",\n",
    "        f\"REASON:     {result['reason']}\",\n",
    "        f\"FAILED:     {result['retrieval_failed']}\",\n",
    "        (\n",
    "            f\"FALLBACK:   {result['fallback_message']}\"\n",
    "            if result['retrieval_failed']\n",
    "            else f\"TOP RESULT: {result['results'][0]['text'][:150]}\"\n",
    "        ),\n",
    "        \"─\" * 60,\n",
    "        sep=\"\\n\",\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10f2b373",
   "metadata": {},
   "source": [
    "#### Rewrite query "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "11f8d6ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rewrite_query(original_query,grade,reason,attempt=1):\n",
    "    \"\"\"Ask the LLM to rewrite a query whose retrieval was graded poorly.\n",
    "\n",
    "    Args:\n",
    "        original_query: The query to rewrite.\n",
    "        grade: Retrieval grade of the previous attempt, passed to the prompt.\n",
    "        reason: Grader's explanation, passed to the prompt.\n",
    "        attempt: Selects the rewriting strategy (1, 2 or 3); any other value\n",
    "            uses strategy 1.\n",
    "\n",
    "    Returns:\n",
    "        The rewritten query with surrounding whitespace and wrapping quotes\n",
    "        removed, or `original_query` when the model returns nothing usable.\n",
    "    \"\"\"\n",
    "    strats = {\n",
    "        1:\"Rephrase the query using more technical/academic language\",\n",
    "        2:\"Break the query into its core concept only. Make it short,concise and to the point\",\n",
    "        3:\"Rewrite the query using different synonyms or related words\"\n",
    "    }\n",
    "    strat = strats.get(attempt,strats[1])\n",
    "    response = groq_client.chat.completions.create(\n",
    "        model = \"llama-3.1-8b-instant\",\n",
    "        max_tokens=100,\n",
    "        messages=[{\n",
    "            \"role\": \"user\",\n",
    "            \"content\": f\"\"\"You are a query rewriting assistant for a scientific paper retrieval system.\n",
    "\n",
    "Original query: {original_query}\n",
    "Retrieval grade: {grade}\n",
    "Reason it failed: {reason}\n",
    "Rewriting strategy: {strat}\n",
    "\n",
    "Respond with ONLY the rewritten query, nothing else.\"\"\"\n",
    "        }]\n",
    "    )\n",
    "    # The message content can be None, and the model may wrap its answer in quotes.\n",
    "    rewritten = (response.choices[0].message.content or \"\").strip().strip(\"\\\"'\").strip()\n",
    "    # An empty rewrite would make the next search meaningless; keep the original.\n",
    "    return rewritten or original_query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "2d2a9d67",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ordered (name, search function) pairs; retrieve_with_retry picks the entry matching the\n",
    "# attempt number and reuses the last one once attempts outnumber the list.\n",
    "# Every search function takes the query string and requests 50 candidates.\n",
    "RETRY_STRATEGIES = [\n",
    "    (\"hybrid\",   lambda q: hybrid_search(q, top_k=50)),\n",
    "    (\"bm25\",     lambda q: bm25_search(q, top_k=50)),\n",
    "    (\"semantic\", lambda q: semantic_search(q, top_k=50)),\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "69df6444",
   "metadata": {},
   "source": [
    "#### Retry mechanism. `max_retries` is set to 2 to prevent overloading/slowing down"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "4f019121",
   "metadata": {},
   "outputs": [],
   "source": [
    "def retrieve_with_retry(query, max_retries=2, confidence_threshold=0.45):\n",
    "    \"\"\"Retrieve passages for `query`, rewriting it and switching strategy after a failed attempt.\n",
    "\n",
    "    Attempt i uses RETRY_STRATEGIES[i] (the last entry is reused once the list runs out).\n",
    "    The top 5 raw hits are scored first; reranking only happens when that score is below\n",
    "    `confidence_threshold`. An attempt fails when it is graded \"irrelevant\" or its confidence\n",
    "    is below the threshold, and every rewrite starts again from the original `query`.\n",
    "\n",
    "    Args:\n",
    "        query: original user query.\n",
    "        max_retries: retries allowed after the first attempt (at most max_retries + 1 searches).\n",
    "        confidence_threshold: minimum confidence for an attempt to be accepted.\n",
    "            NOTE(review): logged confidences are unbounded (e.g. -8.2, 4.26) - confirm that\n",
    "            0.45 is on the same scale as confidence_score's output.\n",
    "\n",
    "    Returns:\n",
    "        dict with query, final_query, results, confidence, grade, reason, retrieval_failed and\n",
    "        attempts. When every attempt fails, results is empty, confidence is 0.0, grade is\n",
    "        \"irrelevant\" and a fallback_message is added.\n",
    "    \"\"\"\n",
    "    history = []\n",
    "    active_query = query\n",
    "    for attempt in range(max_retries + 1):\n",
    "        # clamp the index so attempts past the end of the list keep using the last strategy\n",
    "        strategy_index = min(attempt, len(RETRY_STRATEGIES) - 1)\n",
    "        strategy_name, search_fn = RETRY_STRATEGIES[strategy_index]\n",
    "        print(f\"\\n[ATTEMPT {attempt}] strategy={strategy_name}\")\n",
    "        print(f\"[QUERY]    {active_query}\")\n",
    "\n",
    "        candidates = search_fn(active_query)\n",
    "        confidence, _ = confidence_score(active_query, candidates[:5])\n",
    "        if confidence >= confidence_threshold:\n",
    "            # raw top hits already clear the threshold, so the reranker is skipped\n",
    "            results = candidates[:5]\n",
    "        else:\n",
    "            results = rerank(active_query, candidates, top_n=5)\n",
    "            confidence, _ = confidence_score(active_query, results)\n",
    "\n",
    "        grade, reason = grading(active_query, results, confidence)\n",
    "        failed = grade == \"irrelevant\" or confidence < confidence_threshold\n",
    "        history.append({\n",
    "            \"attempt\": attempt,\n",
    "            \"query\": active_query,\n",
    "            \"strategy\": strategy_name,\n",
    "            \"confidence\": confidence,\n",
    "            \"grade\": grade,\n",
    "            \"reason\": reason,\n",
    "            \"failed\": failed\n",
    "        })\n",
    "        print(f\"[GRADE]    {grade} | confidence={confidence} | failed={failed}\")\n",
    "\n",
    "        if failed:\n",
    "            if attempt >= max_retries:\n",
    "                break\n",
    "            # rewrites always start from the original query, not from the previous rewrite\n",
    "            active_query = rewrite_query(query, grade, reason, attempt=attempt + 1)\n",
    "            print(f\"[REWRITE]  {active_query}\")\n",
    "            continue\n",
    "\n",
    "        return {\n",
    "            \"query\": query,\n",
    "            \"final_query\": active_query,\n",
    "            \"results\": results,\n",
    "            \"confidence\": confidence,\n",
    "            \"grade\": grade,\n",
    "            \"reason\": reason,\n",
    "            \"retrieval_failed\": False,\n",
    "            \"attempts\": history\n",
    "        }\n",
    "\n",
    "    # every attempt failed: report an empty result set together with the full attempt history\n",
    "    return {\n",
    "        \"query\": query,\n",
    "        \"final_query\": active_query,\n",
    "        \"results\": [],\n",
    "        \"confidence\": 0.0,\n",
    "        \"grade\": \"irrelevant\",\n",
    "        \"reason\": \"all retry attempts failed\",\n",
    "        \"retrieval_failed\": True,\n",
    "        \"fallback_message\": \"I couldn't find relevant information after multiple attempts.\",\n",
    "        \"attempts\": history\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "67d5f597",
   "metadata": {},
   "source": [
    "#### Test retry mechanism"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "e113891a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "════════════════════════════════════════════════════════════\n",
      "ORIGINAL QUERY: what is the best recipe for pasta carbonara\n",
      "════════════════════════════════════════════════════════════\n",
      "\n",
      "[ATTEMPT 0] strategy=hybrid\n",
      "[QUERY]    what is the best recipe for pasta carbonara\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[GRADE]    irrelevant | confidence=-8.2033 | failed=True\n",
      "[REWRITE]  Query: \"Optimizing the preparation protocol for a traditional Italian pasta dish, specifically 'carbonara', incorporating a review of extant formulations and identification of a consensus recipe.\"\n",
      "\n",
      "[ATTEMPT 1] strategy=bm25\n",
      "[QUERY]    Query: \"Optimizing the preparation protocol for a traditional Italian pasta dish, specifically 'carbonara', incorporating a review of extant formulations and identification of a consensus recipe.\"\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[GRADE]    partial | confidence=-5.8053 | failed=True\n",
      "[REWRITE]  \"pasta carbonara preparation protocol\"\n",
      "\n",
      "[ATTEMPT 2] strategy=semantic\n",
      "[QUERY]    \"pasta carbonara preparation protocol\"\n",
      "[GRADE]    relevant | confidence=-7.5504 | failed=True\n",
      "[REWRITE]  What is the traditional method of preparing spaghetti with eggs and bacon.\n",
      "\n",
      "[ATTEMPT 3] strategy=semantic\n",
      "[QUERY]    What is the traditional method of preparing spaghetti with eggs and bacon.\n",
      "[GRADE]    irrelevant | confidence=-8.7406 | failed=True\n",
      "\n",
      "❌ FAILED\n",
      "FINAL QUERY:  What is the traditional method of preparing spaghetti with eggs and bacon.\n",
      "CONFIDENCE:   0.0\n",
      "GRADE:        irrelevant\n",
      "ATTEMPTS:     4\n",
      "FALLBACK:     I couldn't find relevant information after multiple attempts.\n",
      "\n",
      "ATTEMPT LOG:\n",
      "  ❌ [0] hybrid   | conf=-8.203 | grade=irrelevant   | query: what is the best recipe for pasta carbonara\n",
      "  ❌ [1] bm25     | conf=-5.805 | grade=partial      | query: Query: \"Optimizing the preparation protocol for a traditiona\n",
      "  ❌ [2] semantic | conf=-7.550 | grade=relevant     | query: \"pasta carbonara preparation protocol\"\n",
      "  ❌ [3] semantic | conf=-8.741 | grade=irrelevant   | query: What is the traditional method of preparing spaghetti with e\n",
      "\n",
      "════════════════════════════════════════════════════════════\n",
      "ORIGINAL QUERY: nonlinear stability analysis of aggregation equations\n",
      "════════════════════════════════════════════════════════════\n",
      "\n",
      "[ATTEMPT 0] strategy=hybrid\n",
      "[QUERY]    nonlinear stability analysis of aggregation equations\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[GRADE]    relevant | confidence=4.2569 | failed=False\n",
      "\n",
      "✅ SUCCESS\n",
      "FINAL QUERY:  nonlinear stability analysis of aggregation equations\n",
      "CONFIDENCE:   4.2569\n",
      "GRADE:        relevant\n",
      "ATTEMPTS:     1\n",
      "TOP RESULT:   a linear stability analysis is usually the first step of an analysis in this direction . for this \n",
      " the method of interest is applied to a scalar linear test equation and stability conditions on metho\n",
      "\n",
      "ATTEMPT LOG:\n",
      "  ✅ [0] hybrid   | conf=4.257 | grade=relevant     | query: nonlinear stability analysis of aggregation equations\n",
      "\n",
      "════════════════════════════════════════════════════════════\n",
      "ORIGINAL QUERY: what is swarm equilibrium density distribution boundary conditions\n",
      "════════════════════════════════════════════════════════════\n",
      "\n",
      "[ATTEMPT 0] strategy=hybrid\n",
      "[QUERY]    what is swarm equilibrium density distribution boundary conditions\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[GRADE]    relevant | confidence=2.1061 | failed=False\n",
      "\n",
      "✅ SUCCESS\n",
      "FINAL QUERY:  what is swarm equilibrium density distribution boundary conditions\n",
      "CONFIDENCE:   2.1061\n",
      "GRADE:        relevant\n",
      "ATTEMPTS:     1\n",
      "TOP RESULT:   this energy can be interpreted as the continuum analog of the summed pairwise energy of the corresponding discrete ( particle ) model . \n",
      " we will also exploit this energy to find equilibrium solutions\n",
      "\n",
      "ATTEMPT LOG:\n",
      "  ✅ [0] hybrid   | conf=2.106 | grade=relevant     | query: what is swarm equilibrium density distribution boundary cond\n"
     ]
    }
   ],
   "source": [
    "# Smoke test for the retry loop: one cooking query plus two queries phrased like paper topics.\n",
    "test_queries = [\n",
    "    \"what is the best recipe for pasta carbonara\",\n",
    "    \"nonlinear stability analysis of aggregation equations\",\n",
    "    \"what is swarm equilibrium density distribution boundary conditions\",\n",
    "]\n",
    "for q in test_queries:\n",
    "    print(\"\\n\" + \"═\" * 60)\n",
    "    print(f\"ORIGINAL QUERY: {q}\")\n",
    "    print(\"═\" * 60)\n",
    "    # max_retries=3 overrides the default of 2, so a query that keeps failing runs 4 attempts\n",
    "    result = retrieve_with_retry(q, max_retries=3)\n",
    "    print(f\"\\n{'✅ SUCCESS' if not result['retrieval_failed'] else '❌ FAILED'}\")\n",
    "    print(f\"FINAL QUERY:  {result['final_query']}\")\n",
    "    print(f\"CONFIDENCE:   {result['confidence']}\")\n",
    "    print(f\"GRADE:        {result['grade']}\")\n",
    "    print(f\"ATTEMPTS:     {len(result['attempts'])}\")\n",
    "    if not result['retrieval_failed']:\n",
    "        # only the first 200 characters of the top passage are shown\n",
    "        print(f\"TOP RESULT:   {result['results'][0]['text'][:200]}\")\n",
    "    else:\n",
    "        print(f\"FALLBACK:     {result['fallback_message']}\")\n",
    "\n",
    "    # per-attempt trace: strategy, confidence, grade and the query used (cut to 60 characters)\n",
    "    print(\"\\nATTEMPT LOG:\")\n",
    "    for a in result[\"attempts\"]:\n",
    "        status = \"✅\" if not a[\"failed\"] else \"❌\"\n",
    "        print(f\"  {status} [{a['attempt']}] {a['strategy']:8} | conf={a['confidence']:.3f} | grade={a['grade']:12} | query: {a['query'][:60]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ea96d7f9",
   "metadata": {},
   "source": [
    "#### Run once to generate a ground-truth benchmark based on the relevant arXiv files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "3b398d0b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "usable chunks: 422,488 / 443,080\n",
      "generating 50 queries...\n",
      "  [1/50] Quantum parameter influence in plasma dynamics\n",
      "  [2/50] local density and energy calculation in molecular dynamics simulations\n",
      "  [3/50] graphical model Bayesian learning two-step process\n",
      "  [4/50] MACHOs in Andromeda galaxy detection survey\n",
      "  [5/50] ultracompact x-ray binary pulsar optical pulsations\n",
      "  [6/50] method of crosses and wrenches for finding bordering faces in polyhedr\n",
      "  [7/50] entanglement degradation in non-inertial frames due to decoherence and\n",
      "  [8/50] quantum coherent scattering atomic ensemble angular momentum measureme\n",
      "  [9/50] columnar pins high temperature superconductors melting transition simu\n",
      "  [10/50] Soft X-ray transient long-term optical observation\n",
      "  [11/50] Sturm distance metric spaces mathematical definition\n",
      "  [12/50] String topology applications to lagrangian embeddings in symplectic ma\n",
      "  [13/50] galaxy virial factor inclinations effect on black hole mass\n",
      "  [14/50] 2dF galaxy redshift survey dynamical mass analysis\n",
      "  [15/50] magnetic helicity inversion methods in astrophysics\n",
      "  [16/50] partitioning bosonic frequency sums in continuum models for slow conve\n",
      "  [17/50] Linewidth-size relations of molecular clouds\n",
      "  [18/50] quantum field theory in dispersive media weak coupling limit\n",
      "  [19/50] scientific papers on star-forming regions with mixed radio and X-ray m\n",
      "  [20/50] afterglow emission mechanisms in gamma ray bursts\n",
      "  [21/50] Bayesian inference for spectrum sensing in cognitive radio systems\n",
      "  [22/50] integral inequality for nonlinear ordinary differential equations with\n",
      "  [23/50] effect of photon constraints on lepton distributions in particle physi\n",
      "  [24/50] bi-lipschitz equivalent metrics on the heisenberg group\n",
      "  [25/50] spectral radius of interconnected networks perturbation theory eigenva\n",
      "  [26/50] x-ray emission in ngc 4151 galaxy inner region\n",
      "  [27/50] gravitational duals for near-horizon CFTs in AdS space\n",
      "  [28/50] inductive construction of Lie superalgebras examples\n",
      "  [29/50] higgs cross section at LHC constrained by exclusive photon production\n",
      "  [30/50] Haar measure for classical Lie groups and orthosymplectic Lie superalg\n",
      "  [31/50] spot migration patterns in ii peg light curves\n",
      "  [32/50] spin polarization in materials with Rashba effect and Yang Mills field\n",
      "  [33/50] fuzzy automata with free monoid and string concatenation\n",
      "  [34/50] deconfined phase exponential correlation functions in field theory\n",
      "  [35/50] complexity of point set processing in geometric algorithms\n",
      "  [36/50] metal poor gas replenishment in large magellanic cloud\n",
      "  [37/50] relativistic jet formation in M87 black holes\n",
      "  [38/50] antiferromagnet relaxation rate temperature dependence disorder\n",
      "  [39/50] quantum mechanics interpretation collapse of the wave function in prob\n",
      "  [40/50] halo model predictions versus different color measurements in SDSS dat\n",
      "  [41/50] Extreme value laws in dynamical systems with observational noise\n",
      "  [42/50] black hole simulation boundary conditions\n",
      "  [43/50] Local stability of difference equation equilibria\n",
      "  [44/50] belle detector kekb asymmetric s decay analysis\n",
      "  [45/50] detection of warm hot intragroup medium in galaxy clusters\n",
      "  [46/50] satellite galaxy motion around isolated galaxies in the Sloan Digital \n",
      "  [47/50] Distributed power allocation in cloud radio access networks with coexi\n",
      "  [48/50] lozin's transformation effect on graph regularity\n",
      "  [49/50] quantum potential in ground state of scattering process\n",
      "  [50/50] charge disproportionation in triangular lattice materials\n",
      "saved to grounded_benchmark.json\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "def is_informative_chunk(chunk, min_words=30, max_math_ratio=0.15):\n",
    "    \"\"\"Return True when a chunk has enough words and is not dominated by [MATH] placeholders.\n",
    "\n",
    "    Args:\n",
    "        chunk: a dict with a \"text\" entry, or the text itself.\n",
    "        min_words: minimum number of whitespace-separated tokens required.\n",
    "        max_math_ratio: largest allowed share of tokens containing \"[MATH]\".\n",
    "\n",
    "    Returns:\n",
    "        bool; text with no tokens is never informative.\n",
    "    \"\"\"\n",
    "    text = chunk[\"text\"] if isinstance(chunk, dict) else chunk\n",
    "    tokens = text.split()\n",
    "    # the explicit emptiness check keeps min_words=0 from dividing by zero below\n",
    "    if not tokens or len(tokens) < min_words:\n",
    "        return False\n",
    "    math_tokens = sum(1 for t in tokens if '[MATH]' in t)\n",
    "    return math_tokens / len(tokens) <= max_math_ratio\n",
    "\n",
    "def generate_query_from_chunk(chunk_text):\n",
    "    \"\"\"Ask the Groq chat model for one search query that the given passage would answer.\n",
    "\n",
    "    Only the first 450 characters of `chunk_text` go into the prompt. The reply is returned\n",
    "    with surrounding whitespace, then double quotes, then single quotes stripped.\n",
    "    \"\"\"\n",
    "    prompt = f\"\"\"You are helping build a retrieval benchmark for scientific papers.\n",
    "\n",
    "Given this passage:\n",
    "\\\"\\\"\\\"{chunk_text[:450]}\\\"\\\"\\\"\n",
    "\n",
    "Write ONE natural search query that a researcher would type to find information like this.\n",
    "- Use plain language, not a copy of the passage\n",
    "- Do NOT use quotes or bullet points\n",
    "- Respond with ONLY the query, nothing else\"\"\"\n",
    "    completion = groq_client.chat.completions.create(\n",
    "        model=\"llama-3.1-8b-instant\",\n",
    "        max_tokens=80,\n",
    "        temperature=0.7,\n",
    "        messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "    )\n",
    "    reply = completion.choices[0].message.content\n",
    "    return reply.strip().strip('\"').strip(\"'\")\n",
    "\n",
    "def build_grounded_benchmark(all_chunks, n=50, seed=42):\n",
    "    \"\"\"Sample informative chunks and generate one LLM query per chunk as a retrieval benchmark.\n",
    "\n",
    "    Up to n // 2 chunks are drawn from ABSTRACT sections and the remainder from ARTICLE\n",
    "    sections. Chunks whose query generation raises are reported and skipped, so the result\n",
    "    can be shorter than n.\n",
    "\n",
    "    Args:\n",
    "        all_chunks: chunk dicts; reads \"text\", \"chunk_id\", \"paper_id\" and \"section_title\".\n",
    "        n: number of chunks to sample.\n",
    "        seed: seed for the sampling RNG.\n",
    "\n",
    "    Returns:\n",
    "        list of dicts with id, query, ground_truth_chunk_id, ground_truth_paper_id,\n",
    "        source_section and source_text_preview.\n",
    "    \"\"\"\n",
    "    # private RNG: same draws as random.seed(seed), without reseeding the global generator\n",
    "    rng = random.Random(seed)\n",
    "    pool = [c for c in all_chunks if is_informative_chunk(c)]\n",
    "    print(f\"usable chunks: {len(pool):,} / {len(all_chunks):,}\")\n",
    "\n",
    "    abstract_pool = [c for c in pool if c.get(\"section_title\") == \"ABSTRACT\"]\n",
    "    article_pool  = [c for c in pool if c.get(\"section_title\") == \"ARTICLE\"]\n",
    "    n_abstract = min(n // 2, len(abstract_pool))\n",
    "    n_article  = min(n - n_abstract, len(article_pool))\n",
    "\n",
    "    sampled = rng.sample(abstract_pool, n_abstract) + rng.sample(article_pool, n_article)\n",
    "    rng.shuffle(sampled)\n",
    "\n",
    "    print(f\"generating {len(sampled)} queries...\")\n",
    "    benchmark = []\n",
    "    for idx, chunk in enumerate(sampled):\n",
    "        try:\n",
    "            query = generate_query_from_chunk(chunk[\"text\"])\n",
    "            benchmark.append({\n",
    "                \"id\": idx,\n",
    "                \"query\": query,\n",
    "                \"ground_truth_chunk_id\": str(chunk[\"chunk_id\"]),\n",
    "                \"ground_truth_paper_id\": chunk[\"paper_id\"],\n",
    "                \"source_section\": chunk.get(\"section_title\", \"UNKNOWN\"),\n",
    "                \"source_text_preview\": chunk[\"text\"][:200],\n",
    "            })\n",
    "            print(f\"  [{idx+1}/{len(sampled)}] {query[:70]}\")\n",
    "        except Exception as e:\n",
    "            # best effort: a failed generation is reported and that chunk is left out\n",
    "            print(f\"  [{idx+1}/{len(sampled)}] failed: {e}\")\n",
    "\n",
    "    return benchmark\n",
    "\n",
    "# Generation costs one LLM call per item and samples at temperature 0.7, so re-running would\n",
    "# overwrite the benchmark with different queries. Reuse the saved file when it exists;\n",
    "# delete the file to force regeneration.\n",
    "benchmark_path = \"grounded_benchmark.json\"\n",
    "if os.path.exists(benchmark_path):\n",
    "    with open(benchmark_path) as f:\n",
    "        grounded_benchmark = json.load(f)\n",
    "    print(f\"found existing {benchmark_path} ({len(grounded_benchmark)} items), skipping generation\")\n",
    "else:\n",
    "    grounded_benchmark = build_grounded_benchmark(all_chunks, n=50)\n",
    "    with open(benchmark_path, \"w\") as f:\n",
    "        json.dump(grounded_benchmark, f, indent=2)\n",
    "    print(\"saved to grounded_benchmark.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39cad6d2",
   "metadata": {},
   "source": [
    "#### Load benchmark in json format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "f12dc2c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO] Loaded 50 benchmark items\n"
     ]
    }
   ],
   "source": [
    "# Read the benchmark back from grounded_benchmark.json, replacing any in-memory grounded_benchmark.\n",
    "with open(\"grounded_benchmark.json\") as f:\n",
    "    grounded_benchmark = json.load(f)\n",
    "print(f\"[INFO] Loaded {len(grounded_benchmark)} benchmark items\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0674c5d6",
   "metadata": {},
   "source": [
    "#### Recall and MRR metrics at paper level (soft) and exact-chunk level (strict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "d1c7db9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# exact = did we return the specific chunk the query came from\n",
    "# paper = did we return any chunk from the right paper\n",
    "def recall_at_k_exact(results, gt_chunk_id, k=5):\n",
    "    \"\"\"Return 1 if the ground-truth chunk is among the first k results, else 0.\n",
    "\n",
    "    Both ids are compared as strings, so an int or str `gt_chunk_id` gives the same answer.\n",
    "    \"\"\"\n",
    "    target = str(gt_chunk_id)\n",
    "    return int(any(str(r[\"chunk_id\"]) == target for r in results[:k]))\n",
    "def recall_at_k_paper(results, gt_paper_id, k=5):\n",
    "    \"\"\"Return 1 if any of the first k results comes from the ground-truth paper, else 0.\"\"\"\n",
    "    for hit in results[:k]:\n",
    "        if hit.get(\"paper_id\") == gt_paper_id:\n",
    "            return 1\n",
    "    return 0\n",
    "def mrr_exact(results, gt_chunk_id):\n",
    "    \"\"\"Return 1 / rank of the ground-truth chunk in `results` (ranks start at 1), or 0.0 if absent.\n",
    "\n",
    "    Both ids are compared as strings, so an int or str `gt_chunk_id` gives the same answer.\n",
    "    \"\"\"\n",
    "    target = str(gt_chunk_id)\n",
    "    for rank, r in enumerate(results, start=1):\n",
    "        if str(r[\"chunk_id\"]) == target:\n",
    "            return 1 / rank\n",
    "    return 0.0\n",
    "def mrr_paper(results, gt_paper_id):\n",
    "    \"\"\"Return 1 / rank of the first result from the ground-truth paper (ranks start at 1), or 0.0.\"\"\"\n",
    "    reciprocal_ranks = (\n",
    "        1 / position\n",
    "        for position, hit in enumerate(results, start=1)\n",
    "        if hit.get(\"paper_id\") == gt_paper_id\n",
    "    )\n",
    "    return next(reciprocal_ranks, 0.0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d7bbae6",
   "metadata": {},
   "source": [
    "#### BENCHMARK TO EVALUATE FULL PIPELINE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "199cb6cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "def run_grounded_eval(benchmark, top_k_retrieve=50, top_n_rerank=5):\n",
    "    \"\"\"Run retrieval + reranking over the benchmark and print recall / MRR summaries.\n",
    "\n",
    "    Each query is searched with hybrid_search_expanded, reranked, and scored against its\n",
    "    ground-truth chunk (exact) and ground-truth paper (paper). Queries that raise are\n",
    "    logged, collected and contribute nothing to any metric.\n",
    "\n",
    "    Args:\n",
    "        benchmark: items with \"query\", \"ground_truth_chunk_id\", \"ground_truth_paper_id\"\n",
    "            and \"source_section\".\n",
    "        top_k_retrieve: number of candidates requested before reranking.\n",
    "        top_n_rerank: number of results kept after reranking.\n",
    "\n",
    "    Returns:\n",
    "        defaultdict(list) mapping metric name to per-query values. The metric keys keep\n",
    "        their \"@5\" / \"@50\" names even when other cut-offs are passed.\n",
    "    \"\"\"\n",
    "    def _mean(values):\n",
    "        # np.mean([]) warns (\"Mean of empty slice\"); return nan quietly when nothing was scored\n",
    "        return np.mean(values) if len(values) else float(\"nan\")\n",
    "\n",
    "    metrics = defaultdict(list)\n",
    "    failed  = []\n",
    "    for item in benchmark:\n",
    "        query    = item[\"query\"]\n",
    "        gt_chunk = item[\"ground_truth_chunk_id\"]\n",
    "        gt_paper = item[\"ground_truth_paper_id\"]\n",
    "        section  = item[\"source_section\"]\n",
    "        try:\n",
    "            raw = hybrid_search_expanded(query, top_k=top_k_retrieve)\n",
    "            results = rerank(query, raw, top_n=top_n_rerank)\n",
    "            # compute every score before recording any, so a failure part-way through\n",
    "            # cannot leave the metric lists with different lengths\n",
    "            r5_raw    = recall_at_k_exact(raw[:top_n_rerank], gt_chunk, k=top_n_rerank)\n",
    "            r5_exact  = recall_at_k_exact(results, gt_chunk, k=top_n_rerank)\n",
    "            r5_pre    = recall_at_k_exact(raw,     gt_chunk, k=top_k_retrieve)\n",
    "            r5_paper  = recall_at_k_paper(results, gt_paper, k=top_n_rerank)\n",
    "            r50_paper = recall_at_k_paper(raw,     gt_paper, k=top_k_retrieve)\n",
    "            rr_exact  = mrr_exact(results, gt_chunk)\n",
    "            rr_paper  = mrr_paper(results, gt_paper)\n",
    "\n",
    "            metrics[\"recall@5_raw\"].append(r5_raw)\n",
    "            # same measurement as recall@5_exact; kept under both names for existing readers\n",
    "            metrics[\"recall@5_reranked\"].append(r5_exact)\n",
    "            metrics[\"recall@5_exact\"].append(r5_exact)\n",
    "            metrics[\"recall@5_paper\"].append(r5_paper)\n",
    "            metrics[f\"recall@5_exact_{section}\"].append(r5_exact)\n",
    "            metrics[f\"recall@5_paper_{section}\"].append(r5_paper)\n",
    "            metrics[\"recall@50_pre_exact\"].append(r5_pre)\n",
    "            metrics[\"recall@50_pre_paper\"].append(r50_paper)\n",
    "            metrics[\"mrr_exact\"].append(rr_exact)\n",
    "            metrics[\"mrr_paper\"].append(rr_paper)\n",
    "            # A = exact chunk kept, B = right paper kept, C = chunk retrieved but lost in rerank, F = miss\n",
    "            icon = \"A\" if r5_exact else (\"B\" if r5_paper else (\"C\" if r5_pre else \"F\"))\n",
    "            print(f\"{icon} [{section}] exact={r5_exact} paper={r5_paper} | {query[:60]}\")\n",
    "        except Exception as e:\n",
    "            # best effort: keep evaluating the remaining queries and report failures at the end\n",
    "            failed.append(query)\n",
    "            print(f\"[ERROR]error on: {query[:60]} — {e}\")\n",
    "    n_queries = len(benchmark) - len(failed)\n",
    "    print(f\"\\n--- results ({n_queries} queries) ---\")\n",
    "    print(f\"[INFO]recall@{top_k_retrieve} pre-rerank  (exact) : {_mean(metrics['recall@50_pre_exact']):.3f}\")\n",
    "    print(f\"[INFO]recall@{top_n_rerank}  post-rerank (exact) : {_mean(metrics['recall@5_exact']):.3f}\")\n",
    "    print(f\"[INFO]recall@{top_n_rerank}  post-rerank (paper) : {_mean(metrics['recall@5_paper']):.3f}\")\n",
    "    print(f\"[INFO]mrr exact                     : {_mean(metrics['mrr_exact']):.3f}\")\n",
    "    print(f\"[INFO]mrr paper                     : {_mean(metrics['mrr_paper']):.3f}\")\n",
    "\n",
    "    for section in [\"ABSTRACT\", \"ARTICLE\"]:\n",
    "        k = f\"recall@5_exact_{section}\"\n",
    "        if metrics[k]:\n",
    "            print(f\"\\n{section.lower()} ({len(metrics[k])} chunks)\")\n",
    "            print(f\"  exact : {_mean(metrics[k]):.3f}\")\n",
    "            print(f\"  paper : {_mean(metrics[f'recall@5_paper_{section}']):.3f}\")\n",
    "    true_lift = _mean(metrics[\"recall@5_reranked\"]) - _mean(metrics[\"recall@5_raw\"])\n",
    "    print(f\"\\n[DEBUG]true reranker lift (reranked@{top_n_rerank} vs raw@{top_n_rerank}) : {true_lift:+.3f}\")\n",
    "    if failed:\n",
    "        print(f\"\\n[DEBUG]failed on {len(failed)} queries: {failed}\")\n",
    "    return metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "e5f95129",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "B [ARTICLE] exact=0 paper=1 | Quantum parameter influence in plasma dynamics\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                             \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "B [ABSTRACT] exact=0 paper=1 | local density and energy calculation in molecular dynamics s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | graphical model Bayesian learning two-step process\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | MACHOs in Andromeda galaxy detection survey\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | ultracompact x-ray binary pulsar optical pulsations\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | method of crosses and wrenches for finding bordering faces i\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | entanglement degradation in non-inertial frames due to decoh\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | quantum coherent scattering atomic ensemble angular momentum\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | columnar pins high temperature superconductors melting trans\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | Soft X-ray transient long-term optical observation\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | Sturm distance metric spaces mathematical definition\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | String topology applications to lagrangian embeddings in sym\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "B [ARTICLE] exact=0 paper=1 | galaxy virial factor inclinations effect on black hole mass\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | 2dF galaxy redshift survey dynamical mass analysis\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | magnetic helicity inversion methods in astrophysics\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | partitioning bosonic frequency sums in continuum models for \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | Linewidth-size relations of molecular clouds\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | quantum field theory in dispersive media weak coupling limit\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F [ARTICLE] exact=0 paper=0 | scientific papers on star-forming regions with mixed radio a\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C [ARTICLE] exact=0 paper=0 | afterglow emission mechanisms in gamma ray bursts\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | Bayesian inference for spectrum sensing in cognitive radio s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F [ARTICLE] exact=0 paper=0 | integral inequality for nonlinear ordinary differential equa\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | effect of photon constraints on lepton distributions in part\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | bi-lipschitz equivalent metrics on the heisenberg group\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | spectral radius of interconnected networks perturbation theo\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | x-ray emission in ngc 4151 galaxy inner region\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F [ARTICLE] exact=0 paper=0 | gravitational duals for near-horizon CFTs in AdS space\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | inductive construction of Lie superalgebras examples\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | higgs cross section at LHC constrained by exclusive photon p\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | Haar measure for classical Lie groups and orthosymplectic Li\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | spot migration patterns in ii peg light curves\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "B [ABSTRACT] exact=0 paper=1 | spin polarization in materials with Rashba effect and Yang M\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | fuzzy automata with free monoid and string concatenation\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | deconfined phase exponential correlation functions in field \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F [ARTICLE] exact=0 paper=0 | complexity of point set processing in geometric algorithms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | metal poor gas replenishment in large magellanic cloud\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | relativistic jet formation in M87 black holes\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | antiferromagnet relaxation rate temperature dependence disor\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | quantum mechanics interpretation collapse of the wave functi\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | halo model predictions versus different color measurements i\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | Extreme value laws in dynamical systems with observational n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | black hole simulation boundary conditions\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | Local stability of difference equation equilibria\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | belle detector kekb asymmetric s decay analysis\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | detection of warm hot intragroup medium in galaxy clusters\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | satellite galaxy motion around isolated galaxies in the Sloa\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | Distributed power allocation in cloud radio access networks \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ABSTRACT] exact=1 paper=1 | lozin's transformation effect on graph regularity\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | quantum potential in ground state of scattering process\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A [ARTICLE] exact=1 paper=1 | charge disproportionation in triangular lattice materials\n",
      "\n",
      "--- results (50 queries) ---\n",
      "[INFO]recall@50 pre-rerank  (exact) : 0.880\n",
      "[INFO]recall@5  post-rerank (exact) : 0.820\n",
      "[INFO]recall@5  post-rerank (paper) : 0.900\n",
      "[INFO]mrr exact                     : 0.663\n",
      "[INFO]mrr paper                     : 0.830\n",
      "\n",
      "abstract (25 chunks)\n",
      "  exact : 0.920\n",
      "  paper : 1.000\n",
      "\n",
      "article (25 chunks)\n",
      "  exact : 0.720\n",
      "  paper : 0.800\n",
      "\n",
      "[DEBUG]true reranker lift (reranked@5 vs raw@5) : +0.040\n"
     ]
    }
   ],
   "source": [
    "# Retrieval benchmark run: 50 candidates retrieved per query, 5 kept after reranking.\n",
    "eval_metrics = run_grounded_eval(\n",
    "    grounded_benchmark,\n",
    "    top_k_retrieve=50,\n",
    "    top_n_rerank=5,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "e754d52f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chat model used by llm_route; temperature=0.0 requests the least random decoding.\n",
    "groq_api_key = os.environ.get(\"GROQ_API_KEY\")\n",
    "llm = ChatGroq(\n",
    "    groq_api_key=groq_api_key,\n",
    "    model_name=\"llama-3.1-8b-instant\",\n",
    "    temperature=0.0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "cbda8151",
   "metadata": {},
   "outputs": [],
   "source": [
    "def llm_route(query):\n",
    "    \"\"\"Ask the chat model which retrieval strategy suits ``query``.\n",
    "\n",
    "    Args:\n",
    "        query: Search query text; interpolated verbatim into the classifier prompt.\n",
    "\n",
    "    Returns:\n",
    "        dict: The JSON object produced by the model. The ``strategy`` key is always\n",
    "        present and is one of ``bm25``, ``dense``, ``hybrid`` or ``hybrid_rerank``;\n",
    "        any other keys (e.g. ``top_k``, ``rerank``) are passed through untouched.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the reply holds no JSON object. ``json.JSONDecodeError`` (a\n",
    "            ``ValueError`` subclass) is raised when the extracted span is malformed.\n",
    "    \"\"\"\n",
    "    prompt = f\"\"\"\n",
    "You are a deterministic retrieval policy classifier. Your sole function is to map an incoming query to the optimal retrieval configuration.\n",
    "\n",
    "Available retrieval strategies:\n",
    "1. bm25: Best for exact terminology, acronyms, specific codes, or exact paper/entity names.\n",
    "2. dense: Best for semantic similarity, conceptual questions, and natural language phrasing.\n",
    "3. hybrid: Best for mixed queries containing both specific keywords and broader concepts.\n",
    "4. hybrid_rerank: Best for difficult, complex, or multi-hop questions requiring high precision.\n",
    "\n",
    "[CRITICAL BEHAVIORAL CONSTRAINTS]\n",
    "- Do NOT output any free-form reasoning, thinking blocks, or chain-of-thought.\n",
    "- Do NOT output any explanations, introductions, or postscripts.\n",
    "- You must behave strictly as a token-optimized classifier.\n",
    "- Your output must be ONLY valid, minified JSON matching the schema below.\n",
    "\n",
    "[OUTPUT SCHEMA]\n",
    "{{\n",
    "  \"strategy\": \"bm25\" | \"dense\" | \"hybrid\" | \"hybrid_rerank\",\n",
    "  \"top_k\": integer,\n",
    "  \"rerank\": boolean\n",
    "}}\n",
    "\n",
    "Input Query: {query}\n",
    "JSON Output:\n",
    "\"\"\"\n",
    "    response = llm.invoke([prompt])\n",
    "    raw = response.content.strip()\n",
    "\n",
    "    try:\n",
    "        route = json.loads(raw)\n",
    "    except json.JSONDecodeError:\n",
    "        # Fall back to the first {...} span in case the reply wraps the JSON in extra text.\n",
    "        match = re.search(r'\\{.*?\\}', raw, re.DOTALL)\n",
    "        if not match:\n",
    "            raise ValueError(f\"No JSON found in LLM response: {raw!r}\")\n",
    "        route = json.loads(match.group())\n",
    "\n",
    "    # json.loads also accepts bare strings, numbers and arrays; callers index\n",
    "    # route[\"strategy\"], so anything other than an object is unusable.\n",
    "    if not isinstance(route, dict):\n",
    "        raise ValueError(f\"LLM response is not a JSON object: {raw!r}\")\n",
    "\n",
    "    # Tolerate casing/whitespace drift, and map a missing or unknown strategy to\n",
    "    # hybrid_rerank, the branch callers already take for unrecognised values.\n",
    "    strategy = str(route.get(\"strategy\", \"\")).strip().lower()\n",
    "    if strategy not in (\"bm25\", \"dense\", \"hybrid\", \"hybrid_rerank\"):\n",
    "        strategy = \"hybrid_rerank\"\n",
    "    route[\"strategy\"] = strategy\n",
    "    return route"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "b9380c9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_answer(query, results, top_k=3):\n",
    "    \"\"\"Produce a short answer to ``query`` from the leading retrieved chunks.\n",
    "\n",
    "    Args:\n",
    "        query: The question to answer.\n",
    "        results: Sequence of retrieved chunks; each item is indexed by its ``text`` key.\n",
    "        top_k: How many leading chunks are placed in the prompt (default 3).\n",
    "\n",
    "    Returns:\n",
    "        str: The model's reply with surrounding whitespace stripped.\n",
    "    \"\"\"\n",
    "    # Only the first 400 characters of each chunk are sent to the model.\n",
    "    snippets = [chunk[\"text\"][:400] for chunk in results[:top_k]]\n",
    "    context = \"\\n\\n---\\n\\n\".join(snippets)\n",
    "\n",
    "    user_prompt = f\"\"\"You are a scientific research assistant. Answer the query using only the provided context.\n",
    "\n",
    "Context:\n",
    "{context}\n",
    "\n",
    "Query: {query}\n",
    "\n",
    "Rules:\n",
    "- Answer concisely in 2-3 sentences\n",
    "- Only use information from the context\n",
    "- If the context doesn't contain the answer, say so\n",
    "\n",
    "Answer:\"\"\"\n",
    "\n",
    "    completion = groq_client.chat.completions.create(\n",
    "        model=\"llama-3.1-8b-instant\",\n",
    "        max_tokens=300,\n",
    "        messages=[{\"role\": \"user\", \"content\": user_prompt}],\n",
    "    )\n",
    "    reply = completion.choices[0].message.content\n",
    "    return reply.strip()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e67b0fdc",
   "metadata": {},
   "source": [
    "#### EVALUATING GENERATION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "7bd54b63",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ [relevant] 2dF galaxy redshift survey dynamical mass analysis\n",
      "   → We compute the dynamical mass, [MATH], for 809 isolated host galaxies in the 100k data release of the 2dF galaxy redshif\n",
      "   reason: The retrieved chunks include sentences directly related to the topic of dynamical mass analysis of the 2Df galaxy redshift survey.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ [relevant] relativistic jet formation in M87 black holes\n",
      "   → The accretion of matter onto a massive black hole is believed to feed the relativistic plasma jets found in active galac\n",
      "   reason: The retrieved chunks all directly relate to the topic of relativistic jet formation in M87 black holes, mentioning its connection to the accretion of matter onto a massive black hole in active galactic nuclei.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ [relevant] satellite galaxy motion around isolated galaxies in the Sloa\n",
      "   → The motion of satellite galaxies around normal galaxies, particularly at distances 50 - 500 kpc, can provide a sensitive\n",
      "   reason: The retrieved chunks of text mention concepts and data (e.g., Sloan Digital Sky Survey, isolated host galaxies, dynamical mass) related to the study of satellite galaxy motion around isolated galaxies.\n",
      "\n",
      "✅ [relevant] Bayesian inference for spectrum sensing in cognitive radio s\n",
      "   → For spectrum sensing in cognitive radio systems, a criterion from a Bayesian perspective is mentioned as \"minimum Bayesi\n",
      "   reason: The retrieved chunks describe the importance of reliable spectrum sensing and detection in cognitive radio systems, mentioning specific techniques and considerations, which is relevant to the topic of Bayesian inference for spectrum sensing.\n",
      "\n",
      "✅ [relevant] charge disproportionation in triangular lattice materials\n",
      "   → In the context of triangular lattice materials, charge disproportionation is mentioned as the intradimer charge dispropo\n",
      "   reason: The retrieved chunks discuss theoretical modeling and the mechanisms behind charge order and pec formation in triangular lattice materials, which is the topic specified in the query.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ [relevant] black hole simulation boundary conditions\n",
      "   → The black hole boundary conditions used are the black hole boundary conditions (BHBC) or the quasi-boundary conditions (\n",
      "   reason: The retrieved chunks provide information regarding the setup of black hole simulations, including boundary conditions and specific parameters used in the 3D simulation.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ [relevant] spin polarization in materials with Rashba effect and Yang M\n",
      "   → Spin polarization is presented in a quite general form using an effective Yang-Mills field, with time and space derivati\n",
      "   reason: The retrieved chunks explicitly mention key terms such as Rashba effect, Yang-Mills field, and spin-orbit interaction, indicating that they are highly relevant to the query topic.\n",
      "\n",
      "✅ [relevant] x-ray emission in ngc 4151 galaxy inner region\n",
      "   → The x-ray emission in the inner region of the NGC 4151 galaxy was studied within a radius of approximately [MATH] pc. Hi\n",
      "   reason: The retrieved chunks contain information about the X-ray emission in the NGC 4151 galaxy's inner region, including the use of Chandra observations and spectral analysis.\n",
      "\n",
      "✅ [relevant] detection of warm hot intragroup medium in galaxy clusters\n",
      "   → The warm-hot intragroup medium in galaxy clusters has thus far evaded direct detection. As a result, its density and tem\n",
      "   reason: The retrieved chunks provide information related to the detection of warm-hot intragroup medium in galaxy clusters, their properties, and the cooling processes that occur in these systems.\n",
      "\n",
      "✅ [relevant] Haar measure for classical Lie groups and orthosymplectic Li\n",
      "   → The Haar measure for the classical compact Lie groups [MATH], [MATH], or [MATH] is of unit mass. This unit mass is denot\n",
      "   reason: The retrieved chunks directly address the topics mentioned in the query, including Haar measures for classical Lie groups and orthosymplectic Lie superalgebra representations.\n",
      "\n",
      "--- generation eval summary ---\n",
      "relevant  : 10/10\n",
      "partial   : 0/10\n",
      "irrelevant: 0/10\n"
     ]
    }
   ],
   "source": [
    "def run_generation_eval(benchmark, top_k_retrieve=50, top_n_rerank=5, n=10, seed=None):\n",
    "    \"\"\"Route, retrieve, rerank, answer and grade a random sample of benchmark queries.\n",
    "\n",
    "    Args:\n",
    "        benchmark: Sequence of dicts carrying ``query``, ``ground_truth_chunk_id`` and\n",
    "            ``ground_truth_paper_id`` keys.\n",
    "        top_k_retrieve: Passed as ``top_k`` to bm25_search, semantic_search and\n",
    "            hybrid_search_expanded.\n",
    "        top_n_rerank: Passed as ``top_n`` to rerank.\n",
    "        n: Upper bound on the number of benchmark items sampled.\n",
    "        seed: Optional seed for the sample. ``None`` (default) draws from the global\n",
    "            ``random`` state; an int yields the same sample on every run.\n",
    "\n",
    "    Returns:\n",
    "        list[dict]: One record per sampled query with keys ``query``, ``strategy``,\n",
    "        ``grade``, ``reason``, ``answer``, ``gt_chunk`` and ``gt_paper``.\n",
    "\n",
    "    Prints one block per query and a final relevant/partial/irrelevant tally.\n",
    "    \"\"\"\n",
    "    # A private Random instance keeps a seeded run from touching the global RNG state.\n",
    "    sampler = random.Random(seed) if seed is not None else random\n",
    "    sample = sampler.sample(benchmark, min(n, len(benchmark)))\n",
    "    results_log = []\n",
    "    for item in sample:\n",
    "        query = item[\"query\"]\n",
    "        gt_chunk = item[\"ground_truth_chunk_id\"]\n",
    "        gt_paper = item[\"ground_truth_paper_id\"]\n",
    "        route = llm_route(query)\n",
    "        # .get(): a route with no strategy takes the default branch instead of raising\n",
    "        # KeyError and aborting the whole evaluation.\n",
    "        strategy = route.get(\"strategy\")\n",
    "        if strategy == \"bm25\":\n",
    "            results = bm25_search(query, top_k=top_k_retrieve)\n",
    "        elif strategy == \"dense\":\n",
    "            results = semantic_search(query, top_k=top_k_retrieve)\n",
    "        elif strategy == \"hybrid\":\n",
    "            results = hybrid_search_expanded(query, top_k=top_k_retrieve)\n",
    "        else:\n",
    "            results = retrieve_with_retry(query)\n",
    "        reranked = rerank(query, results, top_n=top_n_rerank)\n",
    "        answer = generate_answer(query, reranked)\n",
    "        # Top hit's rrf_score when present; 1.0 if the hit has none, 0.0 if nothing came back.\n",
    "        confidence = reranked[0].get(\"rrf_score\", 1.0) if reranked else 0.0\n",
    "        grade, reason = grading(query, reranked, confidence)\n",
    "        results_log.append({\n",
    "            \"query\": query,\n",
    "            \"strategy\": strategy,\n",
    "            \"grade\": grade,\n",
    "            \"reason\": reason,\n",
    "            \"answer\": answer,\n",
    "            \"gt_chunk\": gt_chunk,\n",
    "            \"gt_paper\": gt_paper,\n",
    "        })\n",
    "        icon = \"✅\" if grade == \"relevant\" else (\"⚠️\" if grade == \"partial\" else \"❌\")\n",
    "        print(f\"{icon} [{grade}] {query[:60]}\")\n",
    "        print(f\"   → {answer[:120]}\")\n",
    "        print(f\"   reason: {reason}\\n\")\n",
    "    grades = [r[\"grade\"] for r in results_log]\n",
    "    print(\"--- generation eval summary ---\")\n",
    "    print(f\"relevant  : {grades.count('relevant')}/{len(grades)}\")\n",
    "    print(f\"partial   : {grades.count('partial')}/{len(grades)}\")\n",
    "    print(f\"irrelevant: {grades.count('irrelevant')}/{len(grades)}\")\n",
    "    return results_log\n",
    "results_log = run_generation_eval(grounded_benchmark, n=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4a35e49",
   "metadata": {},
   "source": [
    "#### Testing generation quality"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "f02aca65",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                     \r"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'To get a job in 2026, following your passion and studying something you\\'re interested in might be a good approach, as hinted at by the phrase \"follow your passion\" and the mention of being lucky when a skill evolves into being marketable. No specific steps are provided in the given context, only general advice.'"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ad-hoc end-to-end check for a single query: route -> retrieve -> generate.\n",
    "query = \"how to get a job in 2026\"\n",
    "route = llm_route(query)\n",
    "# Same strategy dispatch as run_generation_eval, but the retrievers are called\n",
    "# without top_k and the results are not reranked before generation.\n",
    "if route[\"strategy\"] == \"bm25\":\n",
    "    results = bm25_search(query)\n",
    "\n",
    "elif route[\"strategy\"] == \"dense\":\n",
    "    results = semantic_search(query)\n",
    "\n",
    "elif route[\"strategy\"] == \"hybrid\":\n",
    "    results = hybrid_search_expanded(query)\n",
    "else:\n",
    "    # Any other strategy value (including hybrid_rerank) ends up here.\n",
    "    results = retrieve_with_retry(query)\n",
    "# generate_answer uses only the first top_k=3 results by default.\n",
    "answer = generate_answer(query,results)\n",
    "answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "099f603d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The provided context does not contain information on how to make pasta, so I cannot provide an answer.'"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "query = \"how to make pasta\"\n",
    "route = llm_route(query)\n",
    "# Retrieve context for THIS query. Reusing `results` from an earlier cell would\n",
    "# ground the answer in chunks fetched for a different query.\n",
    "if route[\"strategy\"] == \"bm25\":\n",
    "    results = bm25_search(query)\n",
    "elif route[\"strategy\"] == \"dense\":\n",
    "    results = semantic_search(query)\n",
    "elif route[\"strategy\"] == \"hybrid\":\n",
    "    results = hybrid_search_expanded(query)\n",
    "else:\n",
    "    # Any other strategy value (including hybrid_rerank) ends up here.\n",
    "    results = retrieve_with_retry(query)\n",
    "ans = generate_answer(query, results)\n",
    "ans"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv (3.13.5)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}