In [20]:
import os
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

load_dotenv()

True

In [21]:
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
DATA_PATH="data/"

In [22]:
embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)

print(f"Loading documents from {DATA_PATH}...")
loader = DirectoryLoader(
 DATA_PATH,
 glob='*.pdf', 
 loader_cls=PyPDFLoader 
)
documents = loader.load()

if not documents:
 print("No PDF documents found. Make sure your PDFs are in the /data folder.")
 exit()

print(f"Loaded {len(documents)} PDF document(s).")

# 3. Split Documents
text_splitter = RecursiveCharacterTextSplitter(
 chunk_size=300, 
 chunk_overlap=200,
 separators=["\n\n", "\n", ".", "!", "?", " ", ""]
 )
docs = text_splitter.split_documents(documents)

print(f"Split into {len(docs)} chunks.")

# 4. Create and Save FAISS Vector Store
print("Creating and saving FAISS vector store...")
db = FAISS.from_documents(docs, embeddings)

Loading documents from data/...
Loaded 2087 PDF document(s).
Split into 25938 chunks.
Creating and saving FAISS vector store...


In [None]:
import sys
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings


def check_retriever():
 """
 A standalone script to test the FAISS retriever.
 """
 
 # 1. Load the Embedding Model
 print(f"Loading embedding model: {MODEL_NAME}...")
 try:
 # This line might show a deprecation warning, which is OK.
 # It's the same one your agent.py is using.
 embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
 except Exception as e:
 print(f"Error loading embeddings: {e}")
 print("Make sure 'sentence-transformers' is installed: pip install sentence-transformers")
 return

 # # 2. Load the FAISS Vector Store
 # print(f"Loading FAISS index from: {DB_FAISS_PATH}...")
 # try:
 # db = FAISS.load_local(
 # DB_FAISS_PATH, 
 # embeddings, 
 # allow_dangerous_deserialization=True # This is required
 # )
 # except Exception as e:
 # print(f"Error loading FAISS index: {e}")
 # print("Be sure you have run 'python ingest.py' successfully first.")
 # return

 retriever = db.as_retriever(search_kwargs={'k': 3})
 
 print("\n✅ Retriever is ready.")
 print(" Enter your query to test. Type 'exit' to quit.")
 
 while True:
 try:
 query = input("\nQuery> ")
 if query.lower() == 'exit':
 break
 if not query:
 continue
 
 print(f"\n--- Retrieving docs for: '{query}' ---")
 
 documents = retriever.invoke(query)
 
 if not documents:
 print("\n!!! No documents found. !!!")
 else:
 for i, doc in enumerate(documents):
 print(f"\n--- Document {i+1} ---")
 print(f"Source: {doc.metadata.get('source', 'N/A')}")
 print(f"Page: {doc.metadata.get('page', 'N/A')}")
 print("\nContent:")
 print(doc.page_content)
 print("-" * 20)
 
 except Exception as e:
 print(f"An error occurred: {e}")

if __name__ == "__main__":
 check_retriever()


Loading embedding model: sentence-transformers/all-MiniLM-L12-v2...

✅ Retriever is ready.
 Enter your query to test. Type 'exit' to quit.

--- Retrieving docs for: 'who is director' ---

--- Document 1 ---
Source: data/iiitdmj_crawl_data_1.pdf
Page: 133

Content:
director@iiitdmj.ac.in
2.
Deputy Director
To be nominated on appointment
3.
Deans (Ex-officio)
1. Dr. Mukesh Kumar Roy
Faculty-in-Charge (Student Affairs)
mkroy@iiitdmj.ac.in
2. Prof. V. K. Gupta
Professor In-charge (Academic)
dean.acad@iiitdmj.ac.in
3. Prof. Pritee Khanna
--------------------

--- Document 2 ---
Source: data/IIITDM Jabalpur.pdf
Page: 2

Content:
 The Deputy Director (to be nominated on appointment) 
 The Deans 
 The Heads of various disciplines and 
 The Registrar 
 
 
 
 
Building And Works Committee 
S. No. Name Designation 
1. Prof. Bhartendu Kumar Singh 
Director 
PDPM-IIITDM Jabalpur 
director@iiitdmj.ac.in
--------------------

--- Document 3 ---
Source: data/iiitdmj_crawl_data_1.pdf
Page: 133

Con

In [24]:
DB_FAISS_PATH = "vectorstore/faiss_index2"


In [25]:
db = FAISS.from_documents(docs, embeddings)
db.save_local(DB_FAISS_PATH)

print(f"Successfully created and saved FAISS index to {DB_FAISS_PATH}")

Successfully created and saved FAISS index to vectorstore/faiss_index2
