diff --git "a/rag.ipynb" "b/rag.ipynb"
new file mode 100644--- /dev/null
+++ "b/rag.ipynb"
@@ -0,0 +1,3007 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "i2nHJ6o7TQW3"
+ },
+ "source": [
+ "# 🚀 Multi-Document RAG System with Advanced Retrieval\n",
+ "\n",
+ "## Project Overview\n",
+ "This notebook implements a **production-ready Retrieval-Augmented Generation (RAG)** system capable of:\n",
+ "- Ingesting **multiple PDF documents** into a unified knowledge base\n",
+ "- Answering questions using **hybrid retrieval** (vector + keyword search)\n",
+ "- Providing **cited, verifiable answers** with source attribution\n",
+ "- **Comparing information** across multiple documents\n",
+ "\n",
+ "## Architecture Summary\n",
+ "```\n",
+ "User Query → Query Classification → Query Expansion (Multi-Query)\n",
+ "        ↓\n",
+ "HyDE Generation → Hybrid Retrieval (Vector + BM25)\n",
+ "        ↓\n",
+ "RRF Fusion → Cross-Encoder Re-ranking → LLM Generation\n",
+ "        ↓\n",
+ "Answer Verification → Final Response with Citations\n",
+ "```\n",
+ "\n",
+ "## Key Technologies\n",
+ "| Component | Technology |\n",
+ "|-----------|------------|\n",
+ "| LLM | Llama 3.3 70B (via Groq) |\n",
+ "| Embeddings | BAAI/bge-large-en-v1.5 |\n",
+ "| Re-ranker | BAAI/bge-reranker-v2-m3 |\n",
+ "| Vector DB | ChromaDB |\n",
+ "| Keyword Search | BM25 |\n",
+ "| UI | Gradio |\n",
+ "\n",
+ "## Requirements\n",
+ "- **Groq API Key** (free at console.groq.com)\n",
+ "- **Python 3.10+**\n",
+ "- **GPU recommended** but not required"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "AiaiOaSb-m1U",
+ "outputId": "4b784a1e-a4a0-43d8-b4ff-eb7b1255438b"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "π₯ Cleaning up the environment\n",
+ "\u001b[33mWARNING: Skipping langchain-community as it is not installed.\u001b[0m\u001b[33m\n",
+ "\u001b[0m\u001b[33mWARNING: Skipping langchain-groq as it is not installed.\u001b[0m\u001b[33m\n",
+ "\u001b[0mπ¦ Installing the Dependencies\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m61.0/61.0 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m18.0/18.0 MB\u001b[0m \u001b[31m56.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "dask-cudf-cu12 25.10.0 requires pandas<2.4.0dev0,>=2.0, which is not installed.\n",
+ "access 1.1.10.post3 requires pandas>=2.1.0, which is not installed.\n",
+ "access 1.1.10.post3 requires scipy>=1.14.1, which is not installed.\n",
+ "pandas-gbq 0.30.0 requires pandas>=1.1.4, which is not installed.\n",
+ "geemap 0.35.3 requires pandas, which is not installed.\n",
+ "yellowbrick 1.5 requires scikit-learn>=1.0.0, which is not installed.\n",
+ "yellowbrick 1.5 requires scipy>=1.0.0, which is not installed.\n",
+ "tensorflow-decision-forests 1.12.0 requires pandas, which is not installed.\n",
+ "librosa 0.11.0 requires scikit-learn>=1.1.0, which is not installed.\n",
+ "librosa 0.11.0 requires scipy>=1.6.0, which is not installed.\n",
+ "cmdstanpy 1.3.0 requires pandas, which is not installed.\n",
+ "albumentations 2.0.8 requires scipy>=1.10.0, which is not installed.\n",
+ "mizani 0.13.5 requires pandas>=2.2.0, which is not installed.\n",
+ "mizani 0.13.5 requires scipy>=1.8.0, which is not installed.\n",
+ "imbalanced-learn 0.14.1 requires scikit-learn<2,>=1.4.2, which is not installed.\n",
+ "imbalanced-learn 0.14.1 requires scipy<2,>=1.11.4, which is not installed.\n",
+ "hdbscan 0.8.41 requires scikit-learn>=1.6, which is not installed.\n",
+ "hdbscan 0.8.41 requires scipy>=1.0, which is not installed.\n",
+ "stumpy 1.13.0 requires scipy>=1.10, which is not installed.\n",
+ "spreg 1.8.4 requires pandas, which is not installed.\n",
+ "spreg 1.8.4 requires scikit-learn>=0.22, which is not installed.\n",
+ "spreg 1.8.4 requires scipy>=0.11, which is not installed.\n",
+ "spopt 0.7.0 requires pandas>=2.1.0, which is not installed.\n",
+ "spopt 0.7.0 requires scikit-learn>=1.4.0, which is not installed.\n",
+ "spopt 0.7.0 requires scipy>=1.12.0, which is not installed.\n",
+ "datasets 4.0.0 requires pandas, which is not installed.\n",
+ "plotnine 0.14.5 requires pandas>=2.2.0, which is not installed.\n",
+ "plotnine 0.14.5 requires scipy>=1.8.0, which is not installed.\n",
+ "pymc 5.27.0 requires pandas>=0.24.0, which is not installed.\n",
+ "pymc 5.27.0 requires scipy>=1.4.1, which is not installed.\n",
+ "db-dtypes 1.5.0 requires pandas<3.0.0,>=1.5.3, which is not installed.\n",
+ "sklearn-pandas 2.2.0 requires pandas>=1.1.4, which is not installed.\n",
+ "sklearn-pandas 2.2.0 requires scikit-learn>=0.23.0, which is not installed.\n",
+ "sklearn-pandas 2.2.0 requires scipy>=1.5.1, which is not installed.\n",
+ "pynndescent 0.6.0 requires scikit-learn>=0.18, which is not installed.\n",
+ "pynndescent 0.6.0 requires scipy>=1.0, which is not installed.\n",
+ "cvxpy 1.6.7 requires scipy>=1.11.0, which is not installed.\n",
+ "scikit-image 0.25.2 requires scipy>=1.11.4, which is not installed.\n",
+ "mlxtend 0.23.4 requires pandas>=0.24.2, which is not installed.\n",
+ "mlxtend 0.23.4 requires scikit-learn>=1.3.1, which is not installed.\n",
+ "mlxtend 0.23.4 requires scipy>=1.2.1, which is not installed.\n",
+ "clarabel 0.11.1 requires scipy, which is not installed.\n",
+ "mapclassify 2.10.0 requires pandas>=2.1, which is not installed.\n",
+ "mapclassify 2.10.0 requires scikit-learn>=1.4, which is not installed.\n",
+ "mapclassify 2.10.0 requires scipy>=1.12, which is not installed.\n",
+ "cudf-cu12 25.10.0 requires pandas<2.4.0dev0,>=2.0, which is not installed.\n",
+ "segregation 2.5.3 requires pandas, which is not installed.\n",
+ "segregation 2.5.3 requires scikit-learn>=0.21.3, which is not installed.\n",
+ "segregation 2.5.3 requires scipy, which is not installed.\n",
+ "bqplot 0.12.45 requires pandas<3.0.0,>=1.0.0, which is not installed.\n",
+ "osqp 1.0.5 requires scipy>=0.13.2, which is not installed.\n",
+ "giddy 2.3.8 requires scipy>=1.12, which is not installed.\n",
+ "pytensor 2.36.3 requires scipy<2,>=1, which is not installed.\n",
+ "matplotlib-venn 1.1.2 requires scipy, which is not installed.\n",
+ "mgwr 2.2.1 requires scipy>=0.11, which is not installed.\n",
+ "tsfresh 0.21.1 requires pandas>=0.25.0, which is not installed.\n",
+ "tsfresh 0.21.1 requires scikit-learn>=0.22.0, which is not installed.\n",
+ "tsfresh 0.21.1 requires scipy>=1.14.0; python_version >= \"3.10\", which is not installed.\n",
+ "arviz 0.22.0 requires pandas>=2.1.0, which is not installed.\n",
+ "arviz 0.22.0 requires scipy>=1.11.0, which is not installed.\n",
+ "inequality 1.1.2 requires pandas>=2.1, which is not installed.\n",
+ "inequality 1.1.2 requires scipy>=1.12, which is not installed.\n",
+ "missingno 0.5.2 requires scipy, which is not installed.\n",
+ "pysal 25.7 requires pandas>=1.4, which is not installed.\n",
+ "pysal 25.7 requires scikit-learn>=1.1, which is not installed.\n",
+ "pysal 25.7 requires scipy>=1.8, which is not installed.\n",
+ "xgboost 3.1.2 requires scipy, which is not installed.\n",
+ "prophet 1.2.1 requires pandas>=1.0.4, which is not installed.\n",
+ "cuml-cu12 25.10.0 requires scikit-learn>=1.4, which is not installed.\n",
+ "cuml-cu12 25.10.0 requires scipy>=1.8.0, which is not installed.\n",
+ "dopamine-rl 4.1.2 requires pandas>=0.24.2, which is not installed.\n",
+ "bigquery-magics 0.10.3 requires pandas>=1.2.0, which is not installed.\n",
+ "hyperopt 0.2.7 requires scipy, which is not installed.\n",
+ "bokeh 3.7.3 requires pandas>=1.2, which is not installed.\n",
+ "spint 1.0.7 requires scipy>=0.11, which is not installed.\n",
+ "fastai 2.8.6 requires pandas, which is not installed.\n",
+ "fastai 2.8.6 requires scikit-learn, which is not installed.\n",
+ "fastai 2.8.6 requires scipy, which is not installed.\n",
+ "geopandas 1.1.2 requires pandas>=2.0.0, which is not installed.\n",
+ "pointpats 2.5.2 requires pandas!=1.5.0,>=1.4, which is not installed.\n",
+ "pointpats 2.5.2 requires scipy>=1.10, which is not installed.\n",
+ "shap 0.50.0 requires pandas, which is not installed.\n",
+ "shap 0.50.0 requires scikit-learn, which is not installed.\n",
+ "shap 0.50.0 requires scipy, which is not installed.\n",
+ "spglm 1.1.0 requires scipy>=1.8, which is not installed.\n",
+ "cufflinks 0.17.3 requires pandas>=0.19.2, which is not installed.\n",
+ "gradio 5.50.0 requires pandas<3.0,>=1.0, which is not installed.\n",
+ "xarray 2025.12.0 requires pandas>=2.2, which is not installed.\n",
+ "tobler 0.13.0 requires pandas>=2.2, which is not installed.\n",
+ "tobler 0.13.0 requires scipy>=1.13, which is not installed.\n",
+ "scs 3.2.10 requires scipy, which is not installed.\n",
+ "statsmodels 0.14.6 requires pandas!=2.1.0,>=1.4, which is not installed.\n",
+ "statsmodels 0.14.6 requires scipy!=1.9.2,>=1.8, which is not installed.\n",
+ "esda 2.8.1 requires pandas>=2.1, which is not installed.\n",
+ "esda 2.8.1 requires scikit-learn>=1.4, which is not installed.\n",
+ "esda 2.8.1 requires scipy>=1.12, which is not installed.\n",
+ "xarray-einstats 0.9.1 requires scipy>=1.11, which is not installed.\n",
+ "holoviews 1.22.1 requires pandas>=1.3, which is not installed.\n",
+ "momepy 0.11.0 requires pandas>=2.0, which is not installed.\n",
+ "treelite 4.4.1 requires scipy, which is not installed.\n",
+ "libpysal 4.14.0 requires pandas>=2.1.0, which is not installed.\n",
+ "libpysal 4.14.0 requires scikit-learn>=1.4.0, which is not installed.\n",
+ "libpysal 4.14.0 requires scipy>=1.12.0, which is not installed.\n",
+ "jax 0.7.2 requires scipy>=1.13, which is not installed.\n",
+ "seaborn 0.13.2 requires pandas>=1.2, which is not installed.\n",
+ "jaxlib 0.7.2 requires scipy>=1.13, which is not installed.\n",
+ "umap-learn 0.5.9.post2 requires scikit-learn>=1.6, which is not installed.\n",
+ "umap-learn 0.5.9.post2 requires scipy>=1.3.1, which is not installed.\n",
+ "dask-cuda 25.10.0 requires pandas>=1.3, which is not installed.\n",
+ "spaghetti 1.7.6 requires pandas!=1.5.0,>=1.4, which is not installed.\n",
+ "spaghetti 1.7.6 requires scipy>=1.8, which is not installed.\n",
+ "quantecon 0.10.1 requires scipy>=1.5.0, which is not installed.\n",
+ "bigframes 2.31.0 requires pandas>=1.5.3, which is not installed.\n",
+ "lightgbm 4.6.0 requires scipy, which is not installed.\n",
+ "yfinance 0.2.66 requires pandas>=1.3.0, which is not installed.\n",
+ "opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= \"3.9\", but you have numpy 1.26.4 which is incompatible.\n",
+ "pytensor 2.36.3 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n",
+ "opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= \"3.9\", but you have numpy 1.26.4 which is incompatible.\n",
+ "shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.\n",
+ "tobler 0.13.0 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n",
+ "opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= \"3.9\", but you have numpy 1.26.4 which is incompatible.\n",
+ "rasterio 1.5.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.\n",
+ "jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n",
+ "jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m12.7/12.7 MB\u001b[0m \u001b[31m124.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "access 1.1.10.post3 requires scipy>=1.14.1, which is not installed.\n",
+ "mizani 0.13.5 requires scipy>=1.8.0, which is not installed.\n",
+ "spreg 1.8.4 requires scikit-learn>=0.22, which is not installed.\n",
+ "spreg 1.8.4 requires scipy>=0.11, which is not installed.\n",
+ "spopt 0.7.0 requires scikit-learn>=1.4.0, which is not installed.\n",
+ "spopt 0.7.0 requires scipy>=1.12.0, which is not installed.\n",
+ "plotnine 0.14.5 requires scipy>=1.8.0, which is not installed.\n",
+ "pymc 5.27.0 requires scipy>=1.4.1, which is not installed.\n",
+ "sklearn-pandas 2.2.0 requires scikit-learn>=0.23.0, which is not installed.\n",
+ "sklearn-pandas 2.2.0 requires scipy>=1.5.1, which is not installed.\n",
+ "mlxtend 0.23.4 requires scikit-learn>=1.3.1, which is not installed.\n",
+ "mlxtend 0.23.4 requires scipy>=1.2.1, which is not installed.\n",
+ "mapclassify 2.10.0 requires scikit-learn>=1.4, which is not installed.\n",
+ "mapclassify 2.10.0 requires scipy>=1.12, which is not installed.\n",
+ "segregation 2.5.3 requires scikit-learn>=0.21.3, which is not installed.\n",
+ "segregation 2.5.3 requires scipy, which is not installed.\n",
+ "giddy 2.3.8 requires scipy>=1.12, which is not installed.\n",
+ "mgwr 2.2.1 requires scipy>=0.11, which is not installed.\n",
+ "tsfresh 0.21.1 requires scikit-learn>=0.22.0, which is not installed.\n",
+ "tsfresh 0.21.1 requires scipy>=1.14.0; python_version >= \"3.10\", which is not installed.\n",
+ "arviz 0.22.0 requires scipy>=1.11.0, which is not installed.\n",
+ "inequality 1.1.2 requires scipy>=1.12, which is not installed.\n",
+ "pysal 25.7 requires scikit-learn>=1.1, which is not installed.\n",
+ "pysal 25.7 requires scipy>=1.8, which is not installed.\n",
+ "cuml-cu12 25.10.0 requires scikit-learn>=1.4, which is not installed.\n",
+ "cuml-cu12 25.10.0 requires scipy>=1.8.0, which is not installed.\n",
+ "spint 1.0.7 requires scipy>=0.11, which is not installed.\n",
+ "fastai 2.8.6 requires scikit-learn, which is not installed.\n",
+ "fastai 2.8.6 requires scipy, which is not installed.\n",
+ "pointpats 2.5.2 requires scipy>=1.10, which is not installed.\n",
+ "shap 0.50.0 requires scikit-learn, which is not installed.\n",
+ "shap 0.50.0 requires scipy, which is not installed.\n",
+ "spglm 1.1.0 requires scipy>=1.8, which is not installed.\n",
+ "tobler 0.13.0 requires scipy>=1.13, which is not installed.\n",
+ "statsmodels 0.14.6 requires scipy!=1.9.2,>=1.8, which is not installed.\n",
+ "esda 2.8.1 requires scikit-learn>=1.4, which is not installed.\n",
+ "esda 2.8.1 requires scipy>=1.12, which is not installed.\n",
+ "xarray-einstats 0.9.1 requires scipy>=1.11, which is not installed.\n",
+ "libpysal 4.14.0 requires scikit-learn>=1.4.0, which is not installed.\n",
+ "libpysal 4.14.0 requires scipy>=1.12.0, which is not installed.\n",
+ "spaghetti 1.7.6 requires scipy>=1.8, which is not installed.\n",
+ "shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.\n",
+ "tobler 0.13.0 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m38.2/38.2 MB\u001b[0m \u001b[31m19.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "yellowbrick 1.5 requires scikit-learn>=1.0.0, which is not installed.\n",
+ "librosa 0.11.0 requires scikit-learn>=1.1.0, which is not installed.\n",
+ "imbalanced-learn 0.14.1 requires scikit-learn<2,>=1.4.2, which is not installed.\n",
+ "hdbscan 0.8.41 requires scikit-learn>=1.6, which is not installed.\n",
+ "spreg 1.8.4 requires scikit-learn>=0.22, which is not installed.\n",
+ "spopt 0.7.0 requires scikit-learn>=1.4.0, which is not installed.\n",
+ "sklearn-pandas 2.2.0 requires scikit-learn>=0.23.0, which is not installed.\n",
+ "pynndescent 0.6.0 requires scikit-learn>=0.18, which is not installed.\n",
+ "mlxtend 0.23.4 requires scikit-learn>=1.3.1, which is not installed.\n",
+ "mapclassify 2.10.0 requires scikit-learn>=1.4, which is not installed.\n",
+ "segregation 2.5.3 requires scikit-learn>=0.21.3, which is not installed.\n",
+ "tsfresh 0.21.1 requires scikit-learn>=0.22.0, which is not installed.\n",
+ "pysal 25.7 requires scikit-learn>=1.1, which is not installed.\n",
+ "cuml-cu12 25.10.0 requires scikit-learn>=1.4, which is not installed.\n",
+ "fastai 2.8.6 requires scikit-learn, which is not installed.\n",
+ "sentence-transformers 5.2.0 requires scikit-learn, which is not installed.\n",
+ "shap 0.50.0 requires scikit-learn, which is not installed.\n",
+ "esda 2.8.1 requires scikit-learn>=1.4, which is not installed.\n",
+ "libpysal 4.14.0 requires scikit-learn>=1.4.0, which is not installed.\n",
+ "umap-learn 0.5.9.post2 requires scikit-learn>=1.6, which is not installed.\n",
+ "access 1.1.10.post3 requires scipy>=1.14.1, but you have scipy 1.13.1 which is incompatible.\n",
+ "pytensor 2.36.3 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n",
+ "tsfresh 0.21.1 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\n",
+ "shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.\n",
+ "tobler 0.13.0 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n",
+ "jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n",
+ "jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m397.0/397.0 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m311.8/311.8 kB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m65.5/65.5 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "mlxtend 0.23.4 requires scikit-learn>=1.3.1, which is not installed.\n",
+ "segregation 2.5.3 requires scikit-learn>=0.21.3, which is not installed.\n",
+ "pysal 25.7 requires scikit-learn>=1.1, which is not installed.\n",
+ "cuml-cu12 25.10.0 requires scikit-learn>=1.4, which is not installed.\n",
+ "fastai 2.8.6 requires scikit-learn, which is not installed.\n",
+ "sentence-transformers 5.2.0 requires scikit-learn, which is not installed.\n",
+ "shap 0.50.0 requires scikit-learn, which is not installed.\n",
+ "esda 2.8.1 requires scikit-learn>=1.4, which is not installed.\n",
+ "libpysal 4.14.0 requires scikit-learn>=1.4.0, which is not installed.\n",
+ "langgraph-prebuilt 1.0.5 requires langchain-core>=1.0.0, but you have langchain-core 0.2.40 which is incompatible.\n",
+ "shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.\n",
+ "tobler 0.13.0 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n",
+ "google-adk 1.21.0 requires tenacity<10.0.0,>=9.0.0, but you have tenacity 8.5.0 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m40.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m55.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m397.1/397.1 kB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m51.0/51.0 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "langgraph-prebuilt 1.0.5 requires langchain-core>=1.0.0, but you have langchain-core 0.2.43 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m137.5/137.5 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m52.0/52.0 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m584.3/584.3 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m74.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m278.2/278.2 kB\u001b[0m \u001b[31m30.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m107.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m17.4/17.4 MB\u001b[0m \u001b[31m129.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m72.5/72.5 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m220.0/220.0 kB\u001b[0m \u001b[31m24.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m66.4/66.4 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m132.6/132.6 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m167.6/167.6 kB\u001b[0m \u001b[31m19.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m88.0/88.0 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-exporter-otlp-proto-common==1.37.0, but you have opentelemetry-exporter-otlp-proto-common 1.39.1 which is incompatible.\n",
+ "opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-proto==1.37.0, but you have opentelemetry-proto 1.39.1 which is incompatible.\n",
+ "opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-sdk~=1.37.0, but you have opentelemetry-sdk 1.39.1 which is incompatible.\n",
+ "opentelemetry-exporter-gcp-logging 1.11.0a0 requires opentelemetry-sdk<1.39.0,>=1.35.0, but you have opentelemetry-sdk 1.39.1 which is incompatible.\n",
+ "google-adk 1.21.0 requires opentelemetry-api<=1.37.0,>=1.37.0, but you have opentelemetry-api 1.39.1 which is incompatible.\n",
+ "google-adk 1.21.0 requires opentelemetry-sdk<=1.37.0,>=1.37.0, but you have opentelemetry-sdk 1.39.1 which is incompatible.\n",
+ "google-adk 1.21.0 requires tenacity<10.0.0,>=9.0.0, but you have tenacity 8.5.0 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m227.1/227.1 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m119.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "tsfresh 0.21.1 requires scipy>=1.14.0; python_version >= \"3.10\", but you have scipy 1.13.1 which is incompatible.\n",
+ "shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.\u001b[0m\u001b[31m\n",
+ "\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m295.8/295.8 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCRITICAL: Go to 'Runtime' > 'Restart session' NOW.\n",
+ "After restarting, run Cell 2.\n",
+ "Requirement already satisfied: gradio in /usr/local/lib/python3.12/dist-packages (5.50.0)\n",
+ "Requirement already satisfied: aiofiles<25.0,>=22.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (24.1.0)\n",
+ "Requirement already satisfied: anyio<5.0,>=3.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (4.12.1)\n",
+ "Requirement already satisfied: brotli>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (1.2.0)\n",
+ "Requirement already satisfied: fastapi<1.0,>=0.115.2 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.123.10)\n",
+ "Requirement already satisfied: ffmpy in /usr/local/lib/python3.12/dist-packages (from gradio) (1.0.0)\n",
+ "Requirement already satisfied: gradio-client==1.14.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (1.14.0)\n",
+ "Requirement already satisfied: groovy~=0.1 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.1.2)\n",
+ "Requirement already satisfied: httpx<1.0,>=0.24.1 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.28.1)\n",
+ "Requirement already satisfied: huggingface-hub<2.0,>=0.33.5 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.36.0)\n",
+ "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (3.1.6)\n",
+ "Requirement already satisfied: markupsafe<4.0,>=2.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (3.0.3)\n",
+ "Requirement already satisfied: numpy<3.0,>=1.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (1.26.4)\n",
+ "Requirement already satisfied: orjson~=3.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (3.11.5)\n",
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.12/dist-packages (from gradio) (24.2)\n",
+ "Requirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (2.2.2)\n",
+ "Requirement already satisfied: pillow<12.0,>=8.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (11.3.0)\n",
+ "Requirement already satisfied: pydantic<=2.12.3,>=2.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (2.12.3)\n",
+ "Requirement already satisfied: pydub in /usr/local/lib/python3.12/dist-packages (from gradio) (0.25.1)\n",
+ "Requirement already satisfied: python-multipart>=0.0.18 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.0.21)\n",
+ "Requirement already satisfied: pyyaml<7.0,>=5.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (6.0.3)\n",
+ "Requirement already satisfied: ruff>=0.9.3 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.14.11)\n",
+ "Requirement already satisfied: safehttpx<0.2.0,>=0.1.6 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.1.7)\n",
+ "Requirement already satisfied: semantic-version~=2.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (2.10.0)\n",
+ "Requirement already satisfied: starlette<1.0,>=0.40.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.50.0)\n",
+ "Requirement already satisfied: tomlkit<0.14.0,>=0.12.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.13.3)\n",
+ "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.21.1)\n",
+ "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (4.15.0)\n",
+ "Requirement already satisfied: uvicorn>=0.14.0 in /usr/local/lib/python3.12/dist-packages (from gradio) (0.40.0)\n",
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.12/dist-packages (from gradio-client==1.14.0->gradio) (2025.3.0)\n",
+ "Requirement already satisfied: websockets<16.0,>=13.0 in /usr/local/lib/python3.12/dist-packages (from gradio-client==1.14.0->gradio) (15.0.1)\n",
+ "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.12/dist-packages (from anyio<5.0,>=3.0->gradio) (3.11)\n",
+ "Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from fastapi<1.0,>=0.115.2->gradio) (0.0.4)\n",
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx<1.0,>=0.24.1->gradio) (2026.1.4)\n",
+ "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1.0,>=0.24.1->gradio) (1.0.9)\n",
+ "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1.0,>=0.24.1->gradio) (0.16.0)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.33.5->gradio) (3.20.2)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.33.5->gradio) (2.32.4)\n",
+ "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.33.5->gradio) (4.67.1)\n",
+ "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.33.5->gradio) (1.2.0)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas<3.0,>=1.0->gradio) (2.9.0.post0)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas<3.0,>=1.0->gradio) (2025.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas<3.0,>=1.0->gradio) (2025.3)\n",
+ "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<=2.12.3,>=2.0->gradio) (0.7.0)\n",
+ "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic<=2.12.3,>=2.0->gradio) (2.41.4)\n",
+ "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<=2.12.3,>=2.0->gradio) (0.4.2)\n",
+ "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.12/dist-packages (from typer<1.0,>=0.12->gradio) (8.3.1)\n",
+ "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from typer<1.0,>=0.12->gradio) (1.5.4)\n",
+ "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.12/dist-packages (from typer<1.0,>=0.12->gradio) (13.9.4)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas<3.0,>=1.0->gradio) (1.17.0)\n",
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (4.0.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.19.2)\n",
+ "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub<2.0,>=0.33.5->gradio) (3.4.4)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->huggingface-hub<2.0,>=0.33.5->gradio) (2.5.0)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio) (0.1.2)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# ==========================================\n",
+ "# CELL 1: DEPENDENCY Installation\n",
+ "# ==========================================\n",
+ "import os\n",
+ "\n",
+ "print(\"π₯ Cleaning up the environment\")\n",
+ "\n",
+ "# 1. Uninstall EVERYTHING to ensure no \"ghost\" versions remain\n",
+ "!pip uninstall -y -q numpy pandas scipy scikit-learn langchain langchain-community langchain-core langchain-groq\n",
+ "\n",
+ "\n",
+ "print(\"π¦ Installing the Dependencies\")\n",
+ "\n",
+ "# CORE MATH LIBRARIES\n",
+ "!pip install -q numpy==1.26.4\n",
+ "!pip install -q pandas==2.2.2\n",
+ "!pip install -q scipy==1.13.1\n",
+ "\n",
+ "# LANGCHAIN 0.2 ECOSYSTEM (\n",
+ "# We strictly pin these to the 0.2 series to avoid the breaking 0.3 update\n",
+ "!pip install -q langchain-core==0.2.40\n",
+ "!pip install -q langchain-community==0.2.16\n",
+ "!pip install -q langchain==0.2.16\n",
+ "!pip install -q langchain-groq==0.1.9\n",
+ "!pip install -q langchain-text-splitters==0.2.4\n",
+ "\n",
+ "# VECTOR DATABASE & EMBEDDINGS\n",
+ "!pip install -q chromadb==0.5.5\n",
+ "!pip install -q sentence-transformers==3.0.1\n",
+ "!pip install -q pypdf==4.3.1\n",
+ "!pip install -q rank-bm25==0.2.2\n",
+ "\n",
+ "\n",
+ "print(\"CRITICAL: Go to 'Runtime' > 'Restart session' NOW.\")\n",
+ "print(\"After restarting, run Cell 2.\")\n",
+ "\n",
+ "!pip install gradio"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "e7y44MyPVC3_"
+ },
+ "source": [
+ "This cell imports all required libraries and sets up the compute device (GPU if available, else CPU).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "iUn1NfUQBHPK",
+ "outputId": "bf945e96-5495-4d71-e3d2-b6bae439b0dc"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "System ready. Running on: CUDA\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import json\n",
+ "import torch\n",
+ "import numpy as np\n",
+ "from typing import List, Dict, Tuple, Optional\n",
+ "from collections import defaultdict\n",
+ "from dataclasses import dataclass\n",
+ "import hashlib\n",
+ "import gradio as gr\n",
+ "from datetime import datetime\n",
+ "\n",
+ "# Core Imports\n",
+ "from langchain_community.document_loaders import PyPDFLoader\n",
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+ "from langchain_community.vectorstores import Chroma\n",
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
+ "from langchain_community.retrievers import BM25Retriever\n",
+ "from langchain_groq import ChatGroq\n",
+ "from langchain.schema import Document\n",
+ "\n",
+ "# Advanced Models\n",
+ "from sentence_transformers import SentenceTransformer, CrossEncoder\n",
+ "\n",
+ "# Setup Device\n",
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+ "print(f\"System ready. Running on: {device.upper()}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Gojw-RfdVOa5"
+ },
+ "source": [
+ "## Core Data Structures\n",
+ "\n",
+ "### QueryProfile Dataclass\n",
+ "\n",
+ "**Purpose**: Encapsulates the result of query classification to guide retrieval strategy.\n",
+ "\n",
+ "| Field | Type | Description | Example Values |\n",
+ "|-------|------|-------------|----------------|\n",
+ "| `query_type` | str | Category of question | `\"factoid\"`, `\"summary\"`, `\"comparison\"`, `\"extraction\"`, `\"reasoning\"` |\n",
+ "| `intent` | str | Same as query_type (for extensibility) | Same as above |\n",
+ "| `needs_multi_docs` | bool | Does query span multiple documents? | `True` for comparison queries |\n",
+ "| `requires_comparison` | bool | Is this a compare/contrast question? | `True` if \"compare\", \"difference\" in query |\n",
+ "| `answer_style` | str | How to format the answer | `\"direct\"`, `\"bullets\"`, `\"steps\"` |\n",
+ "| `k` | int | Number of chunks to retrieve | 5-12 (auto-tuned based on query type) |\n",
+ "\n",
+ "### Query Type β Retrieval Strategy Mapping:\n",
+ "```\n",
+ "factoid β k=6, style=direct (simple fact lookup)\n",
+ "summary β k=10, style=bullets (overview questions)\n",
+ "comparison β k=12, style=bullets (cross-document comparison)\n",
+ "extraction β k=8, style=direct (extract specific info)\n",
+ "reasoning β k=10, style=steps (explain how/why)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "zs63mc1EBls0"
+ },
+ "outputs": [],
+ "source": [
+ "@dataclass\n",
+ "class QueryProfile:\n",
+ " query_type: str\n",
+ " intent: str\n",
+ " needs_multi_docs: bool\n",
+ " requires_comparison: bool\n",
+ " answer_style: str\n",
+ " k: int\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_voziDjOVl6M"
+ },
+ "source": [
+ "### QueryCache Class\n",
+ "\n",
+ "**Purpose**: LRU-style cache to avoid redundant LLM calls for repeated queries.\n",
+ "\n",
+ "#### How It Works:\n",
+ "1. **Key Generation**: MD5 hash of query string\n",
+ "2. **Storage**: Dictionary mapping hash β response\n",
+ "3. **Eviction**: FIFO (First-In-First-Out) when `max_size` exceeded\n",
+ "\n",
+ "#### Methods:\n",
+ "| Method | Input | Output | Description |\n",
+ "|--------|-------|--------|-------------|\n",
+ "| `get(query)` | Query string | Response or `None` | Check if query is cached |\n",
+ "| `set(query, response)` | Query + Response | None | Store result, evict oldest if full |\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "oWpJgmpwN1tS"
+ },
+ "outputs": [],
+ "source": [
+ "class QueryCache:\n",
+ " \"\"\"Simple cache for repeated queries\"\"\"\n",
+ " def __init__(self, max_size=100):\n",
+ " self.cache = {}\n",
+ " self.max_size = max_size\n",
+ "\n",
+ " def get(self, query: str) -> Optional[str]:\n",
+ " key = hashlib.md5(query.encode()).hexdigest()\n",
+ " return self.cache.get(key)\n",
+ "\n",
+ " def set(self, query: str, response: str):\n",
+ " key = hashlib.md5(query.encode()).hexdigest()\n",
+ " if len(self.cache) >= self.max_size:\n",
+ " self.cache.pop(next(iter(self.cache)))\n",
+ " self.cache[key] = response\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "rwBQl2SjVuW3"
+ },
+ "source": [
+ "### SemanticChunker Class\n",
+ "\n",
+ "**Purpose**: Split documents into semantically coherent chunks (vs. arbitrary character-based splits).\n",
+ "\n",
+ "#### Why Semantic Chunking?\n",
+ "| Traditional Chunking | Semantic Chunking |\n",
+ "|---------------------|-------------------|\n",
+ "| Splits at fixed character count | Splits at topic boundaries |\n",
+ "| May cut mid-sentence/concept | Preserves complete ideas |\n",
+ "| Lower retrieval relevance | Higher retrieval relevance |\n",
+ "\n",
+ "#### Algorithm:\n",
+ "```\n",
+ "1. Split text into sentences (by \". \")\n",
+ "2. Encode each sentence with SentenceTransformer\n",
+ "3. For each consecutive sentence pair:\n",
+ " - Compute cosine similarity\n",
+ " - If similarity > threshold AND size < max:\n",
+ " β Add to current chunk\n",
+ " - Else:\n",
+ " β Save chunk, start new one\n",
+ "4. Return list of semantic chunks\n",
+ "```\n",
+ "\n",
+ "#### Parameters:\n",
+ "| Parameter | Default | Description |\n",
+ "|-----------|---------|-------------|\n",
+ "| `model_name` | `all-MiniLM-L6-v2` | Sentence embedding model |\n",
+ "| `max_chunk_size` | 1000 | Maximum characters per chunk |\n",
+ "| `similarity_threshold` | 0.5 | Cosine similarity threshold for grouping |"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "id": "ZqF22Kb5Nv7e"
+ },
+ "outputs": [],
+ "source": [
+ "class SemanticChunker:\n",
+ " \"\"\"Advanced semantic chunking using sentence embeddings\"\"\"\n",
+ " def __init__(self, model_name=\"sentence-transformers/all-MiniLM-L6-v2\"):\n",
+ " self.model = SentenceTransformer(model_name, device=device)\n",
+ "\n",
+ " def chunk_document(self, text: str, max_chunk_size=1000, similarity_threshold=0.5):\n",
+ " \"\"\"Split text into semantically coherent chunks\"\"\"\n",
+ " sentences = text.replace('\\n', ' ').split('. ')\n",
+ " sentences = [s.strip() + '.' for s in sentences if s.strip()]\n",
+ "\n",
+ " if not sentences:\n",
+ " return [text]\n",
+ "\n",
+ " embeddings = self.model.encode(sentences)\n",
+ " chunks = []\n",
+ " current_chunk = [sentences[0]]\n",
+ " current_size = len(sentences[0])\n",
+ "\n",
+ " for i in range(1, len(sentences)):\n",
+ " similarity = np.dot(embeddings[i-1], embeddings[i]) / (\n",
+ " np.linalg.norm(embeddings[i-1]) * np.linalg.norm(embeddings[i])\n",
+ " )\n",
+ " sentence_len = len(sentences[i])\n",
+ "\n",
+ " if similarity > similarity_threshold and current_size + sentence_len < max_chunk_size:\n",
+ " current_chunk.append(sentences[i])\n",
+ " current_size += sentence_len\n",
+ " else:\n",
+ " chunks.append(' '.join(current_chunk))\n",
+ " current_chunk = [sentences[i]]\n",
+ " current_size = sentence_len\n",
+ "\n",
+ " if current_chunk:\n",
+ " chunks.append(' '.join(current_chunk))\n",
+ "\n",
+ " return chunks\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yTCw6h2YWCZq"
+ },
+ "source": [
+ "### ReciprocalRankFusion (RRF) Class\n",
+ "\n",
+ "**Purpose**: Combine multiple ranked retrieval lists into a single optimal ranking.\n",
+ "\n",
+ "#### The Problem RRF Solves:\n",
+ "When using multiple retrievers (vector search, keyword search, etc.), each returns a ranked list. How do we combine them?\n",
+ "\n",
+ "#### RRF Formula:\n",
+ "```\n",
+ "score(doc) = Ξ£ 1 / (k + rank_i + 1)\n",
+ "```\n",
+ "Where:\n",
+ "- `k` = 60 (smoothing constant, standard value)\n",
+ "- `rank_i` = position of document in retrieval list i (0-indexed)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "nxiRtNGINqQr"
+ },
+ "outputs": [],
+ "source": [
+ "class ReciprocalRankFusion:\n",
+ " \"\"\"RRF for combining multiple retrieval results\"\"\"\n",
+ " @staticmethod\n",
+ " def fuse(retrieval_results: List[List[Document]], k=60) -> List[Document]:\n",
+ " doc_scores = defaultdict(float)\n",
+ " doc_map = {}\n",
+ "\n",
+ " for docs in retrieval_results:\n",
+ " for rank, doc in enumerate(docs):\n",
+ " doc_id = doc.metadata.get('chunk_id') or f\"{doc.metadata.get('pdf_id', 'unknown')}::{hashlib.md5(doc.page_content.encode()).hexdigest()}\"\n",
+ " doc_scores[doc_id] += 1 / (k + rank + 1)\n",
+ " doc_map[doc_id] = doc\n",
+ "\n",
+ " sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)\n",
+ " return [doc_map[doc_id] for doc_id, _ in sorted_docs]\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "VtUYO8vPWMzp"
+ },
+ "source": [
+ "## EnhancedRAG - Complete RAG Engine\n",
+ "\n",
+ "This is the **core class** that orchestrates the entire RAG pipeline. All document ingestion, retrieval, and generation flows through this class.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### Class Architecture\n",
+ "\n",
+ "```\n",
+ "EnhancedRAGv3\n",
+ "βββ Storage Layer\n",
+ "β βββ vector_db (ChromaDB) # Semantic search index\n",
+ "β βββ bm25_retriever # Keyword search index\n",
+ "β βββ documents (List) # All document chunks\n",
+ "β βββ pdf_metadata (Dict) # PDF tracking {name: {path, pages, chunks, pdf_id}}\n",
+ "β\n",
+ "βββ Model Layer (Lazy-loaded for memory efficiency)\n",
+ "β βββ embedding_model # BAAI/bge-large-en-v1.5 (~1.2GB)\n",
+ "β βββ cross_encoder # BAAI/bge-reranker-v2-m3 (~560MB)\n",
+ "β βββ semantic_chunker # all-MiniLM-L6-v2 (~90MB)\n",
+ "β βββ query_model # all-MiniLM-L6-v2 (~90MB)\n",
+ "β\n",
+ "βββ LLM Layer\n",
+ "β βββ llm (ChatGroq) # Llama 3.3 70B via Groq API\n",
+ "β\n",
+ "βββ Utility Layer\n",
+ " βββ cache (QueryCache) # Response caching (max 100 queries)\n",
+ " βββ api_key # Groq API key\n",
+ "```\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### Method Reference\n",
+ "\n",
+ "| Method | Purpose | Key Details |\n",
+ "|--------|---------|-------------|\n",
+ "| `__init__(api_key)` | Initialize system | Sets up LLM, all other models lazy-loaded |\n",
+ "| `load_models()` | Load ML models | BGE embeddings β CrossEncoder β Chunker β Query model |\n",
+ "| `ingest_pdf(path)` | Process PDF | Extract β Chunk β Index in ChromaDB + BM25 |\n",
+ "| `chat(query)` | Answer questions | Full pipeline: classify β expand β retrieve β rerank β generate |\n",
+ "| `summarize_document()` | Summarize all docs | Map-reduce: batch summaries β final synthesis |\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### 1. Initialization & Model Loading\n",
+ "\n",
+ "**`__init__(api_key)`** - Sets up the system with Groq API key. Models are NOT loaded yet (lazy loading for faster startup).\n",
+ "\n",
+ "**`load_models()`** - Loads all ML models with progress tracking:\n",
+ "\n",
+ "| Progress | Model | Size | Purpose |\n",
+ "|----------|-------|------|---------|\n",
+ "| 10% β 40% | BAAI/bge-large-en-v1.5 | ~1.2GB | Document & query embeddings (1024-dim, normalized) |\n",
+ "| 40% β 60% | BAAI/bge-reranker-v2-m3 | ~560MB | Cross-encoder re-ranking |\n",
+ "| 60% β 80% | all-MiniLM-L6-v2 | ~90MB | Semantic chunking |\n",
+ "| 80% β 100% | all-MiniLM-L6-v2 | ~90MB | Query processing |\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### 2. Document Ingestion Pipeline\n",
+ "\n",
+ "**`ingest_pdf(pdf_path, use_semantic_chunking=True)`**\n",
+ "\n",
+ "```\n",
+ "PDF File\n",
+ " β\n",
+ "βββββββββββββββββββββββ\n",
+ "β 1. PyPDFLoader β Extract text from each page\n",
+ "βββββββββββββββββββββββ\n",
+ " β\n",
+ "βββββββββββββββββββββββ\n",
+ "β 2. Duplicate Check β Skip if pdf_name already in pdf_metadata\n",
+ "βββββββββββββββββββββββ\n",
+ " β\n",
+ "βββββββββββββββββββββββ\n",
+ "β 3. Chunking β SemanticChunker (default) or RecursiveTextSplitter\n",
+ "βββββββββββββββββββββββ\n",
+ " β\n",
+ "βββββββββββββββββββββββ\n",
+ "β 4. Add Metadata β {page, source, pdf_name, pdf_id, chunk_id}\n",
+ "βββββββββββββββββββββββ\n",
+ " β\n",
+ "βββββββββββββββββββββββ\n",
+ "β 5. Rebuild Indexes β ChromaDB (vector) + BM25 (keyword) with ALL docs\n",
+ "βββββββββββββββββββββββ\n",
+ "```\n",
+ "\n",
+ "**Chunk Metadata Schema:**\n",
+ "```python\n",
+ "{\n",
+ " \"page\": 0, # 0-indexed page number\n",
+ " \"source\": \"/path/to/doc.pdf\", # Full file path\n",
+ " \"pdf_name\": \"doc.pdf\", # Filename only\n",
+ " \"pdf_id\": \"a1b2c3d4\", # 8-char MD5 hash (unique per PDF)\n",
+ " \"chunk_id\": \"a1b2c3d4-42\" # Unique chunk identifier\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### 3. Query Classification\n",
+ "\n",
+ "**`_classify_query(query) β QueryProfile`**\n",
+ "\n",
+ "Determines optimal retrieval strategy using LLM + heuristic fallback:\n",
+ "\n",
+ "| Query Type | Trigger Keywords | k | Answer Style |\n",
+ "|------------|------------------|---|--------------|\n",
+ "| `factoid` | \"what is\", \"who is\", \"define\" | 6 | direct |\n",
+ "| `summary` | \"summarize\", \"overview\", \"key points\" | 10 | bullets |\n",
+ "| `comparison` | \"compare\", \"difference\", \"vs\", \"between\" | 12 | bullets |\n",
+ "| `extraction` | (default) | 8 | direct |\n",
+ "| `reasoning` | \"explain\", \"how does\", \"why\" | 10 | steps |\n",
+ "\n",
+ "**Returns:** `QueryProfile(query_type, intent, needs_multi_docs, requires_comparison, answer_style, k)`\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### 4. Query Enhancement Techniques\n",
+ "\n",
+ "**`_generate_hyde_document(query) β str`** - HyDE (Hypothetical Document Embeddings)\n",
+ "\n",
+ "```\n",
+ "Query: \"What is attention?\"\n",
+ " β LLM generates\n",
+ "HyDE Doc: \"The attention mechanism is a neural network component\n",
+ " that allows models to focus on relevant parts...\"\n",
+ " β\n",
+ "Used for retrieval (matches real docs better than short query!)\n",
+ "```\n",
+ "\n",
+ "**`_expand_query(query) β List[str]`** - Multi-Query Expansion\n",
+ "\n",
+ "```\n",
+ "Original: \"What are the benefits of transformers?\"\n",
+ " β LLM generates 3 variants\n",
+ "[\n",
+ " \"What are the benefits of transformers?\", # Original\n",
+ " \"What advantages do transformer models offer?\", # Variant 1\n",
+ " \"Why are transformers better than RNNs?\", # Variant 2\n",
+ " \"What makes transformer architecture effective?\" # Variant 3\n",
+ "]\n",
+ " β\n",
+ "All used for retrieval β RRF fuses results\n",
+ "```\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### 5. Hybrid Retrieval Pipeline\n",
+ "\n",
+ "**`_retrieve_with_rrf(query, k, fetch_factor=2) β List[Document]`**\n",
+ "\n",
+ "```\n",
+ "Query\n",
+ " β\n",
+ " βββββββββββββββββββββββββββββββββββββββ\n",
+ " β β\n",
+ "ββββββββββββββββββββββ ββββββββββββββββββββββ\n",
+ "β Vector Search (MMR)β β BM25 Search β\n",
+ "β β β β\n",
+ "β β’ Semantic match β β β’ Exact keywords β\n",
+ "β β’ lambda=0.6 β β β’ Term frequency β\n",
+ "β (relevance+ β β β\n",
+ "β diversity) β β β\n",
+ "ββββββββββββββββββββββ ββββββββββββββββββββββ\n",
+ " β β\n",
+ " βββββββββββββββββββ¬ββββββββββββββββββββ\n",
+ " β\n",
+ " ββββββββββββββββββ\n",
+ " β RRF Fusion β score = Ξ£ 1/(60 + rank + 1)\n",
+ " ββββββββββββββββββ\n",
+ " β\n",
+ " Fused ranked list\n",
+ "```\n",
+ "\n",
+ "**Why Hybrid?**\n",
+ "- Vector: Understands synonyms, semantic similarity\n",
+ "- BM25: Exact term matching, handles rare words\n",
+ "- Combined: Best of both worlds\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### 6. Re-ranking & PDF Diversity\n",
+ "\n",
+ "**`_rerank_documents(query, documents, top_k) β List[(Document, score)]`**\n",
+ "\n",
+ "Uses **CrossEncoder** for neural re-ranking:\n",
+ "- Bi-encoder (initial): Fast but less accurate (query/doc encoded separately)\n",
+ "- Cross-encoder (re-rank): Slower but accurate (query+doc processed together)\n",
+ "\n",
+ "**Comparison Query Boost:** For comparison queries, documents containing keywords like \"compared to\", \"in contrast\", \"whereas\" get +10% score boost per keyword.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "**`_ensure_pdf_diversity(query, documents, target_docs=2) β List[Document]`**\n",
+ "\n",
+ "For multi-document queries, ensures chunks from ALL loaded PDFs:\n",
+ "\n",
+ "```\n",
+ "Problem: Query about \"both papers\" returns only Paper A chunks\n",
+ " β\n",
+ "Solution: Detect missing PDFs β filtered vector search β add their chunks\n",
+ " β\n",
+ "Result: [chunk_A1, chunk_A2, chunk_A3, chunk_B1, chunk_B2]\n",
+ "```\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### 7. Main Chat Pipeline\n",
+ "\n",
+ "**`chat(query, use_hyde=True, use_multi_query=True) β (answer, citations, metadata)`**\n",
+ "\n",
+ "```\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ "β 1. CACHE CHECK β Return immediately if query cached β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 2. CLASSIFY QUERY β β QueryProfile (type, k, style) β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 3. EXPAND QUERY β Generate 3 alternative phrasings β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 4. GENERATE HyDE β Create hypothetical answer document β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 5. RETRIEVE β For EACH query variant: β\n",
+ "β β β’ Vector search (MMR) β\n",
+ "β β β’ BM25 search β\n",
+ "β β β’ RRF fusion β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 6. GLOBAL RRF β Fuse results from all query variants β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 7. PDF DIVERSITY β Ensure chunks from all loaded PDFs β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 8. RERANK β CrossEncoder neural scoring β top k β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 9. BUILD CONTEXT β Format: \"[Source 1]: chunk content...\" β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 10. LLM GENERATE β Answer with inline [Source X] citationsβ\n",
+ "β β (Different prompts for comparison) β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 11. VERIFY (complex) β Self-check: direct? structured? If not β\n",
+ "β β β regenerate improved answer β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β 12. CACHE & RETURN β Store result, return (answer, cites, β\n",
+ "β β metadata) β\n",
+ "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ "```\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### 8. Document Summarization\n",
+ "\n",
+ "**`summarize_document(max_chunks=None) β (summary, metadata)`**\n",
+ "\n",
+ "Uses **Map-Reduce** pattern:\n",
+ "\n",
+ "```\n",
+ "MAP PHASE:\n",
+ " Chunks [1-10] β LLM β 3-5 bullet summary\n",
+ " Chunks [11-20] β LLM β 3-5 bullet summary\n",
+ " ...\n",
+ " Chunks [n-m] β LLM β 3-5 bullet summary\n",
+ "\n",
+ "REDUCE PHASE:\n",
+ " All batch summaries β LLM β Final structured summary:\n",
+ " β’ Overview (2-3 sentences)\n",
+ " β’ Main Topics (bullets)\n",
+ " β’ Important Details (3-5 points)\n",
+ " β’ Conclusion\n",
+ "```\n",
+ "\n",
+ "---\n",
+ "\n",
+ "### Key Parameters Reference\n",
+ "\n",
+ "| Parameter | Location | Default | Description |\n",
+ "|-----------|----------|---------|-------------|\n",
+ "| `k` | QueryProfile | 5-12 | Chunks to retrieve (auto-tuned by query type) |\n",
+ "| `fetch_factor` | _retrieve_with_rrf | 2 | Multiplier for initial retrieval pool |\n",
+ "| `lambda_mult` | MMR search | 0.6 | Diversity vs relevance (0=diverse, 1=relevant) |\n",
+ "| `similarity_threshold` | SemanticChunker | 0.5 | Cosine sim for chunk boundaries |\n",
+ "| `max_chunk_size` | SemanticChunker | 1000 | Max characters per chunk |\n",
+ "| `chunk_size` | TextSplitter | 800 | Fallback chunker size |\n",
+ "| `chunk_overlap` | TextSplitter | 150 | Character overlap between chunks |\n",
+ "| `max_size` | QueryCache | 100 | Maximum cached queries |\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "iXFJQbsgNVpz"
+ },
+ "outputs": [],
+ "source": [
+ "class EnhancedRAGv3:\n",
+ " def __init__(self, api_key: str):\n",
+ " self.vector_db = None\n",
+ " self.bm25_retriever = None\n",
+ " self.documents = []\n",
+ " self.pdf_metadata = {} # Track multiple PDFs\n",
+ " self.doc_headers = {} # Store extracted headers (title, authors, abstract) per PDF\n",
+ " self.cache = QueryCache()\n",
+ " self.api_key = api_key\n",
+ " self.is_initialized = False\n",
+ "\n",
+ " # Initialize LLM\n",
+ " self.llm = ChatGroq(\n",
+ " temperature=0,\n",
+ " model_name=\"llama-3.3-70b-versatile\",\n",
+ " groq_api_key=api_key\n",
+ " )\n",
+ "\n",
+ " # Models (loaded on demand)\n",
+ " self.embedding_model = None\n",
+ " self.cross_encoder = None\n",
+ " self.semantic_chunker = None\n",
+ " self.query_model = None\n",
+ "\n",
+ " def load_models(self, progress=gr.Progress()):\n",
+ " \"\"\"Load all models with progress tracking\"\"\"\n",
+ " if self.is_initialized:\n",
+ " return \"Models already loaded.\"\n",
+ "\n",
+ " progress(0.1, desc=\"Loading BGE embeddings...\")\n",
+ " self.embedding_model = HuggingFaceEmbeddings(\n",
+ " model_name=\"BAAI/bge-large-en-v1.5\",\n",
+ " model_kwargs={'device': device, 'trust_remote_code': True},\n",
+ " encode_kwargs={'normalize_embeddings': True}\n",
+ " )\n",
+ "\n",
+ " progress(0.4, desc=\"Loading Re-ranker...\")\n",
+ " self.cross_encoder = CrossEncoder('BAAI/bge-reranker-v2-m3', device=device)\n",
+ "\n",
+ " progress(0.6, desc=\"Loading Semantic Chunker...\")\n",
+ " self.semantic_chunker = SemanticChunker()\n",
+ "\n",
+ " progress(0.8, desc=\"Loading Query Model...\")\n",
+ " self.query_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)\n",
+ "\n",
+ " progress(1.0, desc=\"Complete\")\n",
+ " self.is_initialized = True\n",
+ " return \"All models loaded successfully.\"\n",
+ "\n",
+ " def _extract_document_header(self, pages: List[Document], pdf_name: str, pdf_id: str) -> Dict:\n",
+ " \"\"\"Extract title, authors, and abstract from first pages of PDF\"\"\"\n",
+ " # Get text from first 2 pages (where metadata usually is)\n",
+ " header_text = \"\"\n",
+ " for i, page in enumerate(pages[:2]):\n",
+ " header_text += page.page_content + \"\\n\\n\"\n",
+ " \n",
+ " # Use LLM to extract structured metadata\n",
+ " extraction_prompt = f\"\"\"Extract the following information from this academic paper's first pages.\n",
+ "Return ONLY a JSON object with these keys:\n",
+ "- title: The paper's title (string)\n",
+ "- authors: List of author names (array of strings)\n",
+ "- abstract: The paper's abstract if present (string, or null if not found)\n",
+ "- institutions: List of institutions/affiliations if present (array of strings, or empty array)\n",
+ "\n",
+ "Text from first pages:\n",
+ "{header_text[:4000]}\n",
+ "\n",
+ "JSON:\"\"\"\n",
+ "\n",
+ " try:\n",
+ " response = self.llm.invoke(extraction_prompt)\n",
+ " # Parse JSON from response\n",
+ " import re\n",
+ " json_match = re.search(r'\\{[\\s\\S]*\\}', response.content)\n",
+ " if json_match:\n",
+ " metadata = json.loads(json_match.group())\n",
+ " metadata['pdf_name'] = pdf_name\n",
+ " metadata['pdf_id'] = pdf_id\n",
+ " metadata['raw_header'] = header_text[:2000] # Store raw text too\n",
+ " return metadata\n",
+ " except Exception as e:\n",
+ " print(f\"Header extraction error: {e}\")\n",
+ " \n",
+ " # Fallback: return raw header text\n",
+ " return {\n",
+ " 'title': None,\n",
+ " 'authors': [],\n",
+ " 'abstract': None,\n",
+ " 'institutions': [],\n",
+ " 'pdf_name': pdf_name,\n",
+ " 'pdf_id': pdf_id,\n",
+ " 'raw_header': header_text[:2000]\n",
+ " }\n",
+ "\n",
+ " def _is_metadata_query(self, query: str) -> Tuple[bool, str]:\n",
+ " \"\"\"Check if query is asking for basic document metadata\"\"\"\n",
+ " query_lower = query.lower()\n",
+ " \n",
+ " # Author queries\n",
+ " author_patterns = ['who are the authors', 'who wrote', 'author', 'authors', 'written by', 'by whom']\n",
+ " if any(p in query_lower for p in author_patterns):\n",
+ " return True, 'authors'\n",
+ " \n",
+ " # Title queries\n",
+ " title_patterns = ['what is the title', 'title of', 'paper title', 'document title', 'name of the paper']\n",
+ " if any(p in query_lower for p in title_patterns):\n",
+ " return True, 'title'\n",
+ " \n",
+ " # Abstract queries\n",
+ " abstract_patterns = ['what is the abstract', 'abstract of', 'paper abstract', 'summarize the abstract']\n",
+ " if any(p in query_lower for p in abstract_patterns):\n",
+ " return True, 'abstract'\n",
+ " \n",
+ " # Institution queries\n",
+ " institution_patterns = ['which institution', 'which university', 'affiliation', 'where are the authors from']\n",
+ " if any(p in query_lower for p in institution_patterns):\n",
+ " return True, 'institutions'\n",
+ " \n",
+ " return False, None\n",
+ "\n",
+ " def _answer_metadata_query(self, query: str, metadata_type: str) -> Tuple[str, str, str]:\n",
+ " \"\"\"Answer queries about document metadata directly\"\"\"\n",
+ " if not self.doc_headers:\n",
+ " return \"No document metadata available.\", \"\", \"\"\n",
+ " \n",
+ " # Build response from stored headers\n",
+ " responses = []\n",
+ " citations = []\n",
+ " \n",
+ " for pdf_name, header in self.doc_headers.items():\n",
+ " if metadata_type == 'authors':\n",
+ " authors = header.get('authors', [])\n",
+ " if authors:\n",
+ " author_str = \", \".join(authors)\n",
+ " responses.append(f\"**{pdf_name}**: {author_str}\")\n",
+ " else:\n",
+ " # Fallback to raw header\n",
+ " responses.append(f\"**{pdf_name}**: Authors could not be automatically extracted. See first page.\")\n",
+ " \n",
+ " elif metadata_type == 'title':\n",
+ " title = header.get('title')\n",
+ " if title:\n",
+ " responses.append(f\"**{pdf_name}**: {title}\")\n",
+ " else:\n",
+ " responses.append(f\"**{pdf_name}**: Title could not be automatically extracted.\")\n",
+ " \n",
+ " elif metadata_type == 'abstract':\n",
+ " abstract = header.get('abstract')\n",
+ " if abstract:\n",
+ " responses.append(f\"**{pdf_name}**:\\n{abstract}\")\n",
+ " else:\n",
+ " responses.append(f\"**{pdf_name}**: Abstract not found in first pages.\")\n",
+ " \n",
+ " elif metadata_type == 'institutions':\n",
+ " institutions = header.get('institutions', [])\n",
+ " if institutions:\n",
+ " inst_str = \", \".join(institutions)\n",
+ " responses.append(f\"**{pdf_name}**: {inst_str}\")\n",
+ " else:\n",
+ " responses.append(f\"**{pdf_name}**: Institutions could not be automatically extracted.\")\n",
+ " \n",
+ " # Create citation from raw header\n",
+ " snippet = header.get('raw_header', '')[:300] + \"...\"\n",
+ " citations.append(f\"\"\"\n",
+ "[1] {pdf_name} β Page 1 | Relevance: High (Document Header)
\n",
+ "{snippet}
\n",
+ "[{idx}] {pdf_name} β Page {page} | Relevance: {relevance} ({score:.2f})
\n",
+ "{snippet}
\n",
+ "