{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "081405cc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader\n",
    "from langchain_community.vectorstores import FAISS\n",
    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "3c40840f",
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_NAME = \"sentence-transformers/all-MiniLM-L12-v2\"\n",
    "DATA_PATH=\"data/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "90fc0a47",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading documents from data/...\n",
      "Loaded 2087 PDF document(s).\n",
      "Split into 25938 chunks.\n",
      "Creating and saving FAISS vector store...\n"
     ]
    }
   ],
   "source": [
    "embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)\n",
    "\n",
    "print(f\"Loading documents from {DATA_PATH}...\")\n",
    "loader = DirectoryLoader(\n",
    "    DATA_PATH,\n",
    "    glob='*.pdf',         \n",
    "    loader_cls=PyPDFLoader  \n",
    ")\n",
    "documents = loader.load()\n",
    "\n",
    "if not documents:\n",
    "    print(\"No PDF documents found. Make sure your PDFs are in the /data folder.\")\n",
    "    exit()\n",
    "\n",
    "print(f\"Loaded {len(documents)} PDF document(s).\")\n",
    "\n",
    "# 3. Split Documents\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=300, \n",
    "    chunk_overlap=200,\n",
    "    separators=[\"\\n\\n\", \"\\n\", \".\", \"!\", \"?\", \" \", \"\"]\n",
    "    )\n",
    "docs = text_splitter.split_documents(documents)\n",
    "\n",
    "print(f\"Split into {len(docs)} chunks.\")\n",
    "\n",
    "# 4. Create and Save FAISS Vector Store\n",
    "print(\"Creating and saving FAISS vector store...\")\n",
    "db = FAISS.from_documents(docs, embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ca0ee2b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading embedding model: sentence-transformers/all-MiniLM-L12-v2...\n",
      "\n",
      "✅ Retriever is ready.\n",
      "   Enter your query to test. Type 'exit' to quit.\n",
      "\n",
      "--- Retrieving docs for: 'who is director' ---\n",
      "\n",
      "--- Document 1 ---\n",
      "Source: data/iiitdmj_crawl_data_1.pdf\n",
      "Page: 133\n",
      "\n",
      "Content:\n",
      "director@iiitdmj.ac.in\n",
      "2.\n",
      "Deputy Director\n",
      "To be nominated on appointment\n",
      "3.\n",
      "Deans (Ex-officio)\n",
      "1. Dr. Mukesh Kumar Roy\n",
      "Faculty-in-Charge (Student Affairs)\n",
      "mkroy@iiitdmj.ac.in\n",
      "2. Prof. V. K. Gupta\n",
      "Professor In-charge (Academic)\n",
      "dean.acad@iiitdmj.ac.in\n",
      "3. Prof. Pritee Khanna\n",
      "--------------------\n",
      "\n",
      "--- Document 2 ---\n",
      "Source: data/IIITDM Jabalpur.pdf\n",
      "Page: 2\n",
      "\n",
      "Content:\n",
      " The Deputy Director  (to be nominated on appointment) \n",
      " The Deans \n",
      " The Heads of various disciplines and \n",
      " The Registrar \n",
      " \n",
      " \n",
      " \n",
      " \n",
      "Building And Works Committee \n",
      "S. No.  Name Designation  \n",
      "1.    Prof. Bhartendu Kumar  Singh \n",
      "Director \n",
      "PDPM-IIITDM Jabalpur \n",
      "director@iiitdmj.ac.in\n",
      "--------------------\n",
      "\n",
      "--- Document 3 ---\n",
      "Source: data/iiitdmj_crawl_data_1.pdf\n",
      "Page: 133\n",
      "\n",
      "Content:\n",
      "S. No.\n",
      "Name\n",
      "Address\n",
      "1.\n",
      "Director as Chairperson (Ex-officio)\n",
      "Prof. Bhartendu K Singh (Director)\n",
      "director@iiitdmj.ac.in\n",
      "2.\n",
      "Deputy Director\n",
      "To be nominated on appointment\n",
      "3.\n",
      "Deans (Ex-officio)\n",
      "1. Dr. Mukesh Kumar Roy\n",
      "Faculty-in-Charge (Student Affairs)\n",
      "mkroy@iiitdmj.ac.in\n",
      "2. Prof. V. K. Gupta\n",
      "--------------------\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "from langchain_community.vectorstores import FAISS\n",
    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "\n",
    "\n",
    "def check_retriever():\n",
    "    \"\"\"\n",
    "    A standalone script to test the FAISS retriever.\n",
    "    \"\"\"\n",
    "    \n",
    "    # 1. Load the Embedding Model\n",
    "    print(f\"Loading embedding model: {MODEL_NAME}...\")\n",
    "    try:\n",
    "        # This line might show a deprecation warning, which is OK.\n",
    "        # It's the same one your agent.py is using.\n",
    "        embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)\n",
    "    except Exception as e:\n",
    "        print(f\"Error loading embeddings: {e}\")\n",
    "        print(\"Make sure 'sentence-transformers' is installed: pip install sentence-transformers\")\n",
    "        return\n",
    "\n",
    "    # # 2. Load the FAISS Vector Store\n",
    "    # print(f\"Loading FAISS index from: {DB_FAISS_PATH}...\")\n",
    "    # try:\n",
    "    #     db = FAISS.load_local(\n",
    "    #         DB_FAISS_PATH, \n",
    "    #         embeddings, \n",
    "    #         allow_dangerous_deserialization=True # This is required\n",
    "    #     )\n",
    "    # except Exception as e:\n",
    "    #     print(f\"Error loading FAISS index: {e}\")\n",
    "    #     print(\"Be sure you have run 'python ingest.py' successfully first.\")\n",
    "    #     return\n",
    "\n",
    "    retriever = db.as_retriever(search_kwargs={'k': 3})\n",
    "    \n",
    "    print(\"\\n✅ Retriever is ready.\")\n",
    "    print(\"   Enter your query to test. Type 'exit' to quit.\")\n",
    "    \n",
    "    while True:\n",
    "        try:\n",
    "            query = input(\"\\nQuery> \")\n",
    "            if query.lower() == 'exit':\n",
    "                break\n",
    "            if not query:\n",
    "                continue\n",
    "                \n",
    "            print(f\"\\n--- Retrieving docs for: '{query}' ---\")\n",
    "            \n",
    "            documents = retriever.invoke(query)\n",
    "            \n",
    "            if not documents:\n",
    "                print(\"\\n!!! No documents found. !!!\")\n",
    "            else:\n",
    "                for i, doc in enumerate(documents):\n",
    "                    print(f\"\\n--- Document {i+1} ---\")\n",
    "                    print(f\"Source: {doc.metadata.get('source', 'N/A')}\")\n",
    "                    print(f\"Page: {doc.metadata.get('page', 'N/A')}\")\n",
    "                    print(\"\\nContent:\")\n",
    "                    print(doc.page_content)\n",
    "                    print(\"-\" * 20)\n",
    "                    \n",
    "        except Exception as e:\n",
    "            print(f\"An error occurred: {e}\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    check_retriever()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "45430224",
   "metadata": {},
   "outputs": [],
   "source": [
    "DB_FAISS_PATH = \"vectorstore/faiss_index2\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "9488f2a3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully created and saved FAISS index to vectorstore/faiss_index2\n"
     ]
    }
   ],
   "source": [
    "db = FAISS.from_documents(docs, embeddings)\n",
    "db.save_local(DB_FAISS_PATH)\n",
    "\n",
    "print(f\"Successfully created and saved FAISS index to {DB_FAISS_PATH}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bef0e8c2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}