{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7d2e9a10",
   "metadata": {},
   "source": [
    "# Build and test a FAISS vector store from PDFs\n",
    "\n",
    "This notebook loads every PDF in `DATA_PATH`, splits the loaded text into overlapping chunks, embeds the chunks with a sentence-transformers model, and saves the resulting FAISS index to `DB_FAISS_PATH`. The last section is an interactive prompt for spot-checking what the retriever returns.\n",
    "\n",
    "Run the cells top to bottom (Restart Kernel → Run All). The interactive prompt is the final cell so that the index is always built and saved before it starts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "081405cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader\n",
    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "from langchain_community.vectorstores import FAISS\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "\n",
    "# Read environment variables from a local .env file, if one exists.\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c40840f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embedding model and input/output locations.\n",
    "MODEL_NAME = \"sentence-transformers/all-MiniLM-L12-v2\"\n",
    "DATA_PATH = \"data/\"\n",
    "DB_FAISS_PATH = \"vectorstore/faiss_index2\"\n",
    "\n",
    "# Chunking parameters, in characters.\n",
    "# NOTE(review): the overlap is two thirds of the chunk size, so neighbouring chunks\n",
    "# share most of their text and a top-k search can return near-duplicate passages.\n",
    "# The values are kept as they were; consider a smaller overlap if that is unwanted.\n",
    "CHUNK_SIZE = 300\n",
    "CHUNK_OVERLAP = 200\n",
    "\n",
    "# Number of chunks the retriever check returns per query.\n",
    "TOP_K = 3"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4c1f6e3",
   "metadata": {},
   "source": [
    "## Load the PDFs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90fc0a47",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Loading documents from {DATA_PATH}...\")\n",
    "loader = DirectoryLoader(\n",
    "    DATA_PATH,\n",
    "    glob=\"*.pdf\",\n",
    "    loader_cls=PyPDFLoader,\n",
    ")\n",
    "documents = loader.load()\n",
    "\n",
    "# Raise instead of calling exit(): in a notebook exit() asks the kernel to shut\n",
    "# down, whereas an exception stops this cell (and Run All) with a clear message.\n",
    "if not documents:\n",
    "    raise FileNotFoundError(\n",
    "        f\"No PDF documents found. Make sure your PDFs are in the {DATA_PATH} folder.\"\n",
    "    )\n",
    "\n",
    "print(f\"Loaded {len(documents)} PDF document(s).\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c82a5d47",
   "metadata": {},
   "source": [
    "## Split into chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45430224",
   "metadata": {},
   "outputs": [],
   "source": [
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=CHUNK_SIZE,\n",
    "    chunk_overlap=CHUNK_OVERLAP,\n",
    "    separators=[\"\\n\\n\", \"\\n\", \".\", \"!\", \"?\", \" \", \"\"],\n",
    ")\n",
    "docs = text_splitter.split_documents(documents)\n",
    "\n",
    "print(f\"Split into {len(docs)} chunks.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5907b2c",
   "metadata": {},
   "source": [
    "## Embed the chunks and save the FAISS index\n",
    "\n",
    "Embedding every chunk is the expensive step, so the index is built exactly once here and saved straight away."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9488f2a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)\n",
    "\n",
    "print(\"Creating and saving FAISS vector store...\")\n",
    "db = FAISS.from_documents(docs, embeddings)\n",
    "db.save_local(DB_FAISS_PATH)\n",
    "\n",
    "print(f\"Successfully created and saved FAISS index to {DB_FAISS_PATH}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3a6d1b8",
   "metadata": {},
   "source": [
    "## Test the retriever\n",
    "\n",
    "`check_retriever()` queries the in-memory `db` built above. To test an index saved in an earlier session without re-embedding, create `embeddings` with the same `MODEL_NAME`, load the index, and pass it in:\n",
    "\n",
    "```python\n",
    "saved_db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)\n",
    "check_retriever(saved_db)\n",
    "```\n",
    "\n",
    "`allow_dangerous_deserialization=True` unpickles stored data, so only use it on index files you created yourself."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ca0ee2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_retriever(vector_store=None, k=TOP_K):\n",
    "    \"\"\"Interactively query a FAISS vector store and print the retrieved chunks.\n",
    "\n",
    "    Args:\n",
    "        vector_store: Store to query. When omitted, the notebook-level ``db`` is used.\n",
    "        k: Number of chunks to retrieve per query.\n",
    "\n",
    "    The prompt loop ends when the user types 'exit', interrupts the kernel, or\n",
    "    when input cannot be read (for example during non-interactive execution).\n",
    "    \"\"\"\n",
    "    store = db if vector_store is None else vector_store\n",
    "    retriever = store.as_retriever(search_kwargs={\"k\": k})\n",
    "\n",
    "    print(\"\\n✅ Retriever is ready.\")\n",
    "    print(\" Enter your query to test. Type 'exit' to quit.\")\n",
    "\n",
    "    while True:\n",
    "        # Read input outside the retrieval try-block: a failing input() must end\n",
    "        # the loop, not be reported and retried forever.\n",
    "        try:\n",
    "            query = input(\"\\nQuery> \").strip()\n",
    "        except (EOFError, KeyboardInterrupt):\n",
    "            print(\"\\nStopping retriever check.\")\n",
    "            break\n",
    "        except Exception as e:\n",
    "            # Raised when the frontend cannot provide stdin at all.\n",
    "            print(f\"Input is not available: {e}\")\n",
    "            break\n",
    "\n",
    "        if query.lower() == \"exit\":\n",
    "            break\n",
    "        if not query:\n",
    "            continue\n",
    "\n",
    "        print(f\"\\n--- Retrieving docs for: '{query}' ---\")\n",
    "\n",
    "        # A failed search is reported and the prompt stays usable.\n",
    "        try:\n",
    "            results = retriever.invoke(query)\n",
    "        except Exception as e:\n",
    "            print(f\"An error occurred: {e}\")\n",
    "            continue\n",
    "\n",
    "        if not results:\n",
    "            print(\"\\n!!! No documents found. !!!\")\n",
    "            continue\n",
    "\n",
    "        for rank, doc in enumerate(results, start=1):\n",
    "            print(f\"\\n--- Document {rank} ---\")\n",
    "            print(f\"Source: {doc.metadata.get('source', 'N/A')}\")\n",
    "            print(f\"Page: {doc.metadata.get('page', 'N/A')}\")\n",
    "            print(\"\\nContent:\")\n",
    "            print(doc.page_content)\n",
    "            print(\"-\" * 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bef0e8c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "check_retriever()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}