PrinceDeepakSiddharth12 commited on
Commit
bc0299d
·
verified ·
1 Parent(s): 22c0bde

Upload 5 files

Browse files
Files changed (5) hide show
  1. agent.py +281 -0
  2. app.py +159 -0
  3. check.ipynb +305 -0
  4. ingest.py +43 -0
  5. requirements.txt +13 -0
agent.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import TypedDict, Annotated, List, Literal
3
+ from langchain_google_genai import ChatGoogleGenerativeAI
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
8
+ from langchain_core.messages import HumanMessage, AIMessage, BaseMessage, SystemMessage
9
+ from langchain_core.documents import Document
10
+ from langgraph.graph import StateGraph, END
11
+ from langgraph.checkpoint.memory import MemorySaver
12
+ from langgraph.graph import add_messages
13
+ from dotenv import load_dotenv
14
+
15
load_dotenv()  # read GOOGLE_API_KEY etc. from a local .env file

# Two model handles: a streaming one for user-facing answers and a
# non-streaming one for internal rewrite/classify/title calls.
# temperature=0 keeps both as deterministic as the API allows.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0, streaming=True)
classification_llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

# Embedding model must match the one used when the index was built
# (see ingest.py / check.ipynb).
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")
# NOTE(review): allow_dangerous_deserialization unpickles the on-disk index;
# acceptable only because the index is produced locally — confirm it is never
# sourced from untrusted uploads.
db = FAISS.load_local("vectorstore/faiss_index2", embeddings, allow_dangerous_deserialization=True)
retriever = db.as_retriever(search_kwargs={'k': 3})  # top-3 chunks per query
23
+
24
class AgentState(TypedDict):
    """Shared state threaded through every LangGraph node."""
    # Conversation so far; the add_messages reducer appends rather than replaces.
    messages: Annotated[list, add_messages]
    # Documents retrieved for the current turn.
    context: List[Document]
    # History-aware standalone reformulation of the latest user question.
    rewritten_query: str
    # Routing decision emitted by the classifier node.
    query_type: Literal["simple_rag", "comparative_rag", "conversational"]
    # Decomposed sub-questions used only on the comparative path.
    sub_queries: List[str]
30
+
31
def format_history_for_prompt(messages: list[BaseMessage]) -> str:
    """Render human/AI turns as a plain-text transcript.

    System messages (and any other message types) are skipped.
    """
    lines = []
    for message in messages:
        if isinstance(message, HumanMessage):
            lines.append(f"Human: {message.content}")
        elif isinstance(message, AIMessage):
            lines.append(f"AI: {message.content}")
    return "\n".join(lines)
37
+
38
def format_docs_for_prompt(docs: List[Document]) -> str:
    """Concatenate document bodies, blank-line separated, for prompt context."""
    bodies = [doc.page_content for doc in docs]
    return "\n\n".join(bodies)
40
+
41
+
42
def inject_system_prompt(state: AgentState) -> dict:
    """Prepend the assistant persona exactly once per thread.

    Returns an empty update when the checkpointed history already
    contains a SystemMessage, so re-entry is a no-op.
    """
    print("---NODE: INJECT_SYSTEM_PROMPT (START)---")
    for message in state["messages"]:
        if isinstance(message, SystemMessage):
            return {}
    system_prompt = (
        "You are a helpful and professional assistant for IIITDMJ. "
        "You must answer user questions based *only* on the retrieved context. "
        "If the context does not contain the answer, you must state that "
        "you do not have that information. Do not make up answers."
    )
    return {"messages": [SystemMessage(content=system_prompt)]}
54
+
55
def rewrite_query_node(state: AgentState) -> dict:
    """Rewrite the latest user question into a standalone query.

    Uses the chat history (all messages except the last) to resolve
    pronouns/references; a first-turn question passes through unchanged.
    """
    print("---NODE: REWRITE_QUERY---")
    # Most recent human turn, scanning newest-first.
    latest_human = next(
        (m for m in reversed(state["messages"]) if isinstance(m, HumanMessage)),
        None,
    )
    last_query = latest_human.content if latest_human else ""
    chat_history = format_history_for_prompt(state["messages"][:-1])

    if not chat_history:
        # Nothing to resolve against — the question already stands alone.
        print(f"--- Standalone Query: {last_query} ---")
        return {"rewritten_query": last_query}

    prompt = ChatPromptTemplate.from_template(
        """Given the following chat history and the user's latest question,
        rewrite the user's question to be a standalone question...
        Chat History: {chat_history}
        Latest Question: {query}
        Standalone Question:"""
    )
    rewrite_chain = prompt | classification_llm | StrOutputParser()
    rewritten_query = rewrite_chain.invoke(
        {"chat_history": chat_history, "query": last_query}
    )
    print(f"--- Rewritten Query: {rewritten_query} ---")
    return {"rewritten_query": rewritten_query}
80
+
81
def classify_query_node(state: AgentState) -> dict:
    """Route the rewritten query into one of the three handling paths.

    Defaults to "simple_rag" when the classifier's reply matches neither
    of the other two labels.
    """
    print("---NODE: CLASSIFY_QUERY---")
    query = state["rewritten_query"]
    prompt = ChatPromptTemplate.from_template(
        """Classify the user's query into one of three categories:
        1. **simple_rag**: ...
        2. **comparative_rag**: ...
        3. **conversational**: ...
        Query: {query}
        """
    )
    classification_chain = prompt | classification_llm | StrOutputParser()
    raw = classification_chain.invoke({"query": query})

    lowered = raw.lower()
    if "comparative_rag" in lowered:
        decision = "comparative_rag"
    elif "conversational" in lowered:
        decision = "conversational"
    else:
        decision = "simple_rag"
    print(f"--- Decision: {decision} ---")
    return {"query_type": decision}
100
+
101
def handle_chat_node(state: AgentState) -> dict:
    """Path A: generate an answer from the chat history alone (no retrieval)."""
    print("---NODE: HANDLE_CHAT---")
    chat_history = format_history_for_prompt(state["messages"])
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful college assistant. Answer the user's question based on the chat history. Be conversational."),
        ("user", "Here is the chat history (including my last question):\n{chat_history}\n\nNow, please provide a conversational answer."),
    ])
    chain = prompt | llm | StrOutputParser()
    answer = chain.invoke({"chat_history": chat_history})

    print(f"--- HANDLE_CHAT generated answer: {answer} ---")

    return {"messages": [AIMessage(content=answer)]}
118
+
119
def retrieve_docs_node(state: AgentState) -> dict:
    """Path B step 1: fetch the top-k chunks for the rewritten query."""
    print("---NODE: RETRIEVE_DOCS (SIMPLE)---")
    documents = retriever.invoke(state["rewritten_query"])
    print("\n--- RETRIEVED CONTEXT ---")
    if not documents:
        print("!!! No context retrieved. !!!")
    else:
        for i, doc in enumerate(documents):
            print(f"DOC {i+1}: Source: {doc.metadata.get('source', 'N/A')}, Page: {doc.metadata.get('page', 'N/A')}")
    print("---------------------------\n")
    return {"context": documents}
130
+
131
def generate_answer_node(state: AgentState) -> dict:
    """Path B step 2: answer the query strictly from the retrieved context.

    Appends a formatted source list (file name + page) to the answer,
    unless the model produced the "visit the website" fallback reply.
    Returns a state update appending one AIMessage.
    """
    print("---NODE: GENERATE_ANSWER (SIMPLE)---")
    query = state["rewritten_query"]
    context_docs = state["context"]
    context_str = format_docs_for_prompt(context_docs)

    prompt = ChatPromptTemplate.from_messages([
        ("system", (
            "You are a helpful assistant. Answer the user's question based *only* on the retrieved context. "
            "If the context is empty or irrelevant, you *must* state that you do not have the information "
            "and recommend visiting the official Indian Institute of Information Technology, Design and Manufacturing, Jabalpur (IIITDM Jabalpur) website (https://www.iiitdmj.ac.in/) for more details."
        )),
        ("user", "Context:\n{context}\n\nQuestion:\n{query}")
    ])

    generation_chain = prompt | llm | StrOutputParser()
    answer = generation_chain.invoke({"context": context_str, "query": query})

    sources = []
    if context_docs:
        for i, doc in enumerate(context_docs):
            source_file = doc.metadata.get('source', 'N/A')
            # os.path.basename instead of split('/')[-1]: robust to
            # platform-native separators and to non-path metadata.
            source_name = os.path.basename(source_file)
            page_num = doc.metadata.get('page', 'N/A')
            sources.append(f" {i+1}. {source_name} (Page: {page_num})")

    # Heuristic: the fallback reply mentions the website; skip sources then.
    if sources and "website" not in answer:
        pretty_answer = answer + "\n--- \n**Sources:**\n" + "\n".join(sources)
    else:
        pretty_answer = answer

    return {"messages": [AIMessage(content=pretty_answer)]}
163
+
164
def decompose_query_node(state: AgentState) -> dict:
    """Path C step 1: split a comparative question into simpler sub-queries.

    The LLM is asked for a JSON object with a "queries" list. If the reply
    deviates from that schema (missing key, non-dict, empty list), fall back
    to the original query so retrieval still happens instead of raising
    KeyError mid-graph.
    """
    print("---NODE: DECOMPOSE_QUERY---")
    query = state["rewritten_query"]
    prompt = ChatPromptTemplate.from_template(
        """You are a query decomposition assistant...
        Query: {query}
        Respond with a JSON object..."""
    )
    parser = JsonOutputParser()
    decomposition_chain = prompt | classification_llm | parser
    result = decomposition_chain.invoke({"query": query})

    sub_queries = result.get("queries") if isinstance(result, dict) else None
    if not sub_queries:
        sub_queries = [query]
    print(f"--- Sub-queries: {sub_queries} ---")
    return {"sub_queries": sub_queries}
177
+
178
def retrieve_multi_docs_node(state: AgentState) -> dict:
    """Path C step 2: retrieve for every sub-query and de-duplicate.

    De-duplication is keyed on page_content; a later duplicate replaces
    an earlier one.
    """
    print("---NODE: RETRIEVE_DOCS (MULTI)---")
    deduped = {}
    for sub_query in state["sub_queries"]:
        for doc in retriever.invoke(sub_query):
            deduped[doc.page_content] = doc
    unique_docs = list(deduped.values())
    print("\n--- RETRIEVED CONTEXT (MULTI) ---")
    if not unique_docs:
        print("!!! No context retrieved. !!!")
    else:
        for i, doc in enumerate(unique_docs):
            print(f"DOC {i+1}: Source: {doc.metadata.get('source', 'N/A')}, Page: {doc.metadata.get('page', 'N/A')}")
    print("---------------------------\n")
    return {"context": unique_docs}
194
+
195
def generate_synthesized_answer_node(state: AgentState) -> dict:
    """Path C step 3: synthesize one answer from the multi-query context.

    Mirrors generate_answer_node but prompts the model to synthesize a
    comparative answer. Appends a source list (file name + page) unless
    the model produced the "visit the website" fallback reply.
    """
    print("---NODE: GENERATE_ANSWER (SYNTHESIZED)---")
    query = state["rewritten_query"]
    context_docs = state["context"]
    context_str = format_docs_for_prompt(context_docs)

    prompt = ChatPromptTemplate.from_messages([
        ("system", (
            "You are a helpful assistant. Your task is to answer a comparative question based on the provided context. "
            "Synthesize the information from the context to form a comprehensive answer. "
            "If the context is insufficient, you *must* state that you do not have the information "
            "and recommend visiting the official Indian Institute of Information Technology, Design and Manufacturing, Jabalpur (IIITDM Jabalpur) website (https://www.iiitdmj.ac.in/) for more details."
        )),
        ("user", (
            "Here is the context I've gathered:\n{context}\n\n"
            "Now, please answer this original question:\n{query}"
        ))
    ])

    generation_chain = prompt | llm | StrOutputParser()
    answer = generation_chain.invoke({"context": context_str, "query": query})

    sources = []
    if context_docs:
        for i, doc in enumerate(context_docs):
            source_file = doc.metadata.get('source', 'N/A')
            # os.path.basename instead of split('/')[-1]: robust to
            # platform-native separators and to non-path metadata.
            source_name = os.path.basename(source_file)
            page_num = doc.metadata.get('page', 'N/A')
            sources.append(f" {i+1}. {source_name} (Page: {page_num})")

    # Heuristic: the fallback reply mentions the website; skip sources then.
    if sources and "website" not in answer:
        pretty_answer = answer + "\n--- \n**Sources:**\n" + "\n".join(sources)
    else:
        pretty_answer = answer

    return {"messages": [AIMessage(content=pretty_answer)]}
231
+
232
def router(state: AgentState) -> Literal["conversational", "simple_rag", "comparative_rag"]:
    """Conditional-edge selector: forward the classifier's decision verbatim."""
    decision = state["query_type"]
    print(f"--- ROUTING TO: {decision} ---")
    return decision
235
+
236
# In-memory checkpointer: per-thread conversation state, lost on restart.
checkpointer = MemorySaver()

def build_graph():
    """Assemble and compile the LangGraph workflow.

    Topology:
        inject_system_prompt -> rewrite_query -> classify_query
        classify_query --router--> handle_chat | retrieve_docs | decompose_query
        retrieve_docs -> generate_answer -> END
        decompose_query -> retrieve_multi_docs -> generate_synthesized_answer -> END
        handle_chat -> END
    """
    workflow = StateGraph(AgentState)

    node_table = {
        "inject_system_prompt": inject_system_prompt,
        "rewrite_query": rewrite_query_node,
        "classify_query": classify_query_node,
        "handle_chat": handle_chat_node,
        "retrieve_docs": retrieve_docs_node,
        "generate_answer": generate_answer_node,
        "decompose_query": decompose_query_node,
        "retrieve_multi_docs": retrieve_multi_docs_node,
        "generate_synthesized_answer": generate_synthesized_answer_node,
    }
    for node_name, node_fn in node_table.items():
        workflow.add_node(node_name, node_fn)

    workflow.set_entry_point("inject_system_prompt")
    workflow.add_edge("inject_system_prompt", "rewrite_query")
    workflow.add_edge("rewrite_query", "classify_query")
    workflow.add_conditional_edges(
        "classify_query",
        router,
        {
            "conversational": "handle_chat",
            "simple_rag": "retrieve_docs",
            "comparative_rag": "decompose_query",
        },
    )
    for source, target in (
        ("handle_chat", END),
        ("retrieve_docs", "generate_answer"),
        ("generate_answer", END),
        ("decompose_query", "retrieve_multi_docs"),
        ("retrieve_multi_docs", "generate_synthesized_answer"),
        ("generate_synthesized_answer", END),
    ):
        workflow.add_edge(source, target)

    return workflow.compile(checkpointer=checkpointer)

chatbot = build_graph()
274
+
275
if __name__ == "__main__":
    # Smoke test: push one question through the compiled graph and print
    # the newest message of each streamed state snapshot.
    config = {"configurable": {"thread_id": "test-direct-run-1"}}
    print("\n--- Testing Direct Run ---")
    inputs = {"messages": [HumanMessage(content="What is the name of director?")]}
    for event in chatbot.stream(inputs, config, stream_mode="values"):
        if "messages" in event:
            event["messages"][-1].pretty_print()
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from agent import chatbot, classification_llm
3
+ from langchain_core.messages import HumanMessage, AIMessage, BaseMessage, SystemMessage
4
+ import uuid
5
+ import asyncio
6
+
7
def generate_thread_id():
    """Return a fresh UUID4 identifying one conversation thread."""
    return uuid.uuid4()
10
+
11
def reset_chat():
    """Start a brand-new conversation.

    Allocates a fresh thread id (via the shared helper instead of a
    duplicated uuid.uuid4() call), registers it in the sidebar list,
    and clears the rendered message history.
    """
    thread_id = generate_thread_id()
    st.session_state['thread_id'] = thread_id
    add_thread(thread_id)
    st.session_state['message_history'] = []
16
+
17
def add_thread(thread_id):
    """Register a thread id in the sidebar list (idempotent).

    New threads get a default title numbered by list position.
    """
    threads = st.session_state['chat_threads']
    if thread_id in threads:
        return
    threads.append(thread_id)
    st.session_state['thread_titles'][thread_id] = f"New Chat {len(threads)}"
21
+
22
def load_conversation(thread_id):
    """Fetch a thread's messages from the LangGraph checkpointer.

    Returns an empty list on any failure so the UI degrades gracefully.
    """
    try:
        snapshot = chatbot.get_state(config={'configurable': {'thread_id': thread_id}})
        raw = snapshot.values.get('messages', []) if snapshot else []
        return [item for item in raw if isinstance(item, BaseMessage)]
    except Exception as e:
        print(f"Error loading conversation for thread {thread_id}: {e}")
        return []
30
+
31
def generate_title(query):
    """Ask the small LLM for a short (<=5 word) thread title.

    Falls back to "Chat" on an error or an empty reply.
    """
    print("--- Generating Title ---")
    try:
        response = classification_llm.invoke(
            f"Summarize this query into a very short title (max 5 words): {query}"
        )
        title = response.content.strip().strip('"')
    except Exception as e:
        print(f"Error generating title: {e}")
        return "Chat"
    return title or "Chat"
41
+
42
# First-run initialisation of per-session state.
for _key, _default in (
    ('message_history', []),
    ('chat_threads', []),
    ('thread_titles', {}),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default
if 'thread_id' not in st.session_state:
    st.session_state['thread_id'] = generate_thread_id()
add_thread(st.session_state['thread_id'])
47
+
48
st.sidebar.title("IIITDMJ Chatbot")
if st.sidebar.button("➕ New Chat"):
    reset_chat()
    st.rerun()
st.sidebar.header("My Conversations")
# Newest threads first.
for thread_id in st.session_state['chat_threads'][::-1]:
    title = st.session_state['thread_titles'].get(thread_id, "Untitled Chat")
    if st.sidebar.button(title, key=f"thread_{thread_id}", use_container_width=True):
        # Switch threads: reload checkpointed messages into the plain-dict
        # form the render loop expects, then rerun to repaint.
        st.session_state['thread_id'] = thread_id
        messages = load_conversation(thread_id)
        temp_messages = []
        for msg in messages:
            # The injected persona prompt is internal — never shown.
            if isinstance(msg, SystemMessage):
                continue
            role = 'user' if isinstance(msg, HumanMessage) else 'assistant'
            temp_messages.append({'role': role, 'content': msg.content})
        st.session_state['message_history'] = temp_messages
        st.rerun()
65
+
66
st.title("IIITDMJ College Assistant")
st.caption("This bot uses a local vector store and LangGraph to answer your questions.")

# Repaint the full transcript on every script run.
for message in st.session_state['message_history']:
    with st.chat_message(message['role']):
        if message['role'] == 'assistant':
            # Assistant replies are wrapped in HTML to control font size.
            st.markdown(f"<div style='font-size: 15px;'>{message['content']}</div>", unsafe_allow_html=True)
        else:
            st.markdown(message['content'])
75
+
76
user_input = st.chat_input("Ask about IIITDMJ...")

if user_input:

    # Every graph invocation for this turn shares the thread's checkpoint.
    CONFIG = {'configurable': {'thread_id': st.session_state['thread_id']}}

    st.session_state['message_history'].append({'role': 'user', 'content': user_input})
    with st.chat_message('user'):
        st.markdown(user_input)

    with st.chat_message('assistant'):
        placeholder = st.empty()
        ai_message_content = ""

        try:
            print(f"\n--- Streaming response for Thread ID: {st.session_state['thread_id']} ---")

            async def stream_agent_events(stream_placeholder):
                # Stream token chunks live into the placeholder, while also
                # capturing the final node output as a fallback for the case
                # where no token-level events arrive.
                local_ai_message_content_streamed = ""
                local_final_node_output = None
                local_final_node_name = ""

                async for event in chatbot.astream_events(
                    {'messages': [HumanMessage(content=user_input)]},
                    config=CONFIG,
                    version="v1"
                ):
                    kind = event["event"]
                    name = event["name"]

                    # Token chunks are accepted only from the three
                    # answer-producing nodes of the graph.
                    if kind == "on_chat_model_stream":
                        if name in ("generate_answer", "generate_synthesized_answer", "handle_chat"):
                            chunk_content = event["data"]["chunk"].content
                            if chunk_content:
                                local_ai_message_content_streamed += chunk_content
                                # ▌ acts as a typing cursor while streaming.
                                stream_placeholder.markdown(f"<div style='font-size: 15px;'>{local_ai_message_content_streamed}▌</div>", unsafe_allow_html=True)

                    if kind == "on_chain_end":
                        if name in ("generate_answer", "generate_synthesized_answer", "handle_chat"):
                            if "output" in event.get("data", {}) and isinstance(event["data"]["output"], dict):
                                local_final_node_output = event["data"]["output"]
                                local_final_node_name = name
                                print(f"--- Captured final output from node: {name} ---")

                return local_ai_message_content_streamed, local_final_node_output, local_final_node_name

            streamed_content, final_output, final_name = asyncio.run(stream_agent_events(placeholder))

            if not streamed_content and final_output:
                # No token-level chunks: render the node's whole message at once.
                print(f"--- Using fallback: No stream content captured. Using final output from {final_name}. ---")
                if "messages" in final_output and final_output["messages"]:
                    ai_message_content = final_output["messages"][-1].content
                    placeholder.markdown(f"<div style='font-size: 15px;'>{ai_message_content}</div>", unsafe_allow_html=True)
                else:
                    print(f"--- Fallback failed: Final output from {final_name} had unexpected format: {final_output} ---")
                    ai_message_content = "Sorry, I couldn't generate a response (fallback error)."
                    placeholder.markdown(ai_message_content)

            elif streamed_content:
                # Streaming finished: re-render once more without the cursor.
                ai_message_content = streamed_content
                placeholder.markdown(f"<div style='font-size: 15px;'>{ai_message_content}</div>", unsafe_allow_html=True)
            else:
                print("--- Fallback failed: No stream content and no final output captured. ---")
                ai_message_content = "Sorry, I couldn't generate a response (capture error)."
                placeholder.markdown(ai_message_content)

        except Exception as e:
            st.error(f"An error occurred: {e}")
            print(f"ERROR DURING STREAM/FALLBACK: {e}")
            ai_message_content = "Sorry, I encountered an error during execution."
            placeholder.markdown(ai_message_content)

        if not ai_message_content:
            ai_message_content = "Sorry, I couldn't generate a response."

    st.session_state['message_history'].append({'role': 'assistant', 'content': ai_message_content})

    current_id = st.session_state['thread_id']
    current_title = st.session_state['thread_titles'].get(current_id, "New Chat")
    # Auto-title the thread after the first exchange only (user + assistant).
    if current_title.startswith("New Chat") and len(st.session_state['message_history']) <= 2:
        summarized_title = generate_title(user_input)
        st.session_state['thread_titles'][current_id] = summarized_title
        st.rerun()
check.ipynb ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 20,
6
+ "id": "081405cc",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "text/plain": [
12
+ "True"
13
+ ]
14
+ },
15
+ "execution_count": 20,
16
+ "metadata": {},
17
+ "output_type": "execute_result"
18
+ }
19
+ ],
20
+ "source": [
21
+ "import os\n",
22
+ "from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader\n",
23
+ "from langchain_community.vectorstores import FAISS\n",
24
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
25
+ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
26
+ "from dotenv import load_dotenv\n",
27
+ "\n",
28
+ "load_dotenv()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 21,
34
+ "id": "3c40840f",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "MODEL_NAME = \"sentence-transformers/all-MiniLM-L12-v2\"\n",
39
+ "DATA_PATH=\"data/\""
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 22,
45
+ "id": "90fc0a47",
46
+ "metadata": {},
47
+ "outputs": [
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "Loading documents from data/...\n",
53
+ "Loaded 2087 PDF document(s).\n",
54
+ "Split into 25938 chunks.\n",
55
+ "Creating and saving FAISS vector store...\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)\n",
61
+ "\n",
62
+ "print(f\"Loading documents from {DATA_PATH}...\")\n",
63
+ "loader = DirectoryLoader(\n",
64
+ " DATA_PATH,\n",
65
+ " glob='*.pdf', \n",
66
+ " loader_cls=PyPDFLoader \n",
67
+ ")\n",
68
+ "documents = loader.load()\n",
69
+ "\n",
70
+ "if not documents:\n",
71
+ " print(\"No PDF documents found. Make sure your PDFs are in the /data folder.\")\n",
72
+ " exit()\n",
73
+ "\n",
74
+ "print(f\"Loaded {len(documents)} PDF document(s).\")\n",
75
+ "\n",
76
+ "# 3. Split Documents\n",
77
+ "text_splitter = RecursiveCharacterTextSplitter(\n",
78
+ " chunk_size=300, \n",
79
+ " chunk_overlap=200,\n",
80
+ " separators=[\"\\n\\n\", \"\\n\", \".\", \"!\", \"?\", \" \", \"\"]\n",
81
+ " )\n",
82
+ "docs = text_splitter.split_documents(documents)\n",
83
+ "\n",
84
+ "print(f\"Split into {len(docs)} chunks.\")\n",
85
+ "\n",
86
+ "# 4. Create and Save FAISS Vector Store\n",
87
+ "print(\"Creating and saving FAISS vector store...\")\n",
88
+ "db = FAISS.from_documents(docs, embeddings)"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "id": "9ca0ee2b",
95
+ "metadata": {},
96
+ "outputs": [
97
+ {
98
+ "name": "stdout",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "Loading embedding model: sentence-transformers/all-MiniLM-L12-v2...\n",
102
+ "\n",
103
+ "✅ Retriever is ready.\n",
104
+ " Enter your query to test. Type 'exit' to quit.\n",
105
+ "\n",
106
+ "--- Retrieving docs for: 'who is director' ---\n",
107
+ "\n",
108
+ "--- Document 1 ---\n",
109
+ "Source: data/iiitdmj_crawl_data_1.pdf\n",
110
+ "Page: 133\n",
111
+ "\n",
112
+ "Content:\n",
113
+ "director@iiitdmj.ac.in\n",
114
+ "2.\n",
115
+ "Deputy Director\n",
116
+ "To be nominated on appointment\n",
117
+ "3.\n",
118
+ "Deans (Ex-officio)\n",
119
+ "1. Dr. Mukesh Kumar Roy\n",
120
+ "Faculty-in-Charge (Student Affairs)\n",
121
+ "mkroy@iiitdmj.ac.in\n",
122
+ "2. Prof. V. K. Gupta\n",
123
+ "Professor In-charge (Academic)\n",
124
+ "dean.acad@iiitdmj.ac.in\n",
125
+ "3. Prof. Pritee Khanna\n",
126
+ "--------------------\n",
127
+ "\n",
128
+ "--- Document 2 ---\n",
129
+ "Source: data/IIITDM Jabalpur.pdf\n",
130
+ "Page: 2\n",
131
+ "\n",
132
+ "Content:\n",
133
+ " The Deputy Director (to be nominated on appointment) \n",
134
+ " The Deans \n",
135
+ " The Heads of various disciplines and \n",
136
+ " The Registrar \n",
137
+ " \n",
138
+ " \n",
139
+ " \n",
140
+ " \n",
141
+ "Building And Works Committee \n",
142
+ "S. No. Name Designation \n",
143
+ "1. Prof. Bhartendu Kumar Singh \n",
144
+ "Director \n",
145
+ "PDPM-IIITDM Jabalpur \n",
146
+ "director@iiitdmj.ac.in\n",
147
+ "--------------------\n",
148
+ "\n",
149
+ "--- Document 3 ---\n",
150
+ "Source: data/iiitdmj_crawl_data_1.pdf\n",
151
+ "Page: 133\n",
152
+ "\n",
153
+ "Content:\n",
154
+ "S. No.\n",
155
+ "Name\n",
156
+ "Address\n",
157
+ "1.\n",
158
+ "Director as Chairperson (Ex-officio)\n",
159
+ "Prof. Bhartendu K Singh (Director)\n",
160
+ "director@iiitdmj.ac.in\n",
161
+ "2.\n",
162
+ "Deputy Director\n",
163
+ "To be nominated on appointment\n",
164
+ "3.\n",
165
+ "Deans (Ex-officio)\n",
166
+ "1. Dr. Mukesh Kumar Roy\n",
167
+ "Faculty-in-Charge (Student Affairs)\n",
168
+ "mkroy@iiitdmj.ac.in\n",
169
+ "2. Prof. V. K. Gupta\n",
170
+ "--------------------\n"
171
+ ]
172
+ }
173
+ ],
174
+ "source": [
175
+ "import sys\n",
176
+ "from langchain_community.vectorstores import FAISS\n",
177
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
178
+ "\n",
179
+ "\n",
180
+ "def check_retriever():\n",
181
+ " \"\"\"\n",
182
+ " A standalone script to test the FAISS retriever.\n",
183
+ " \"\"\"\n",
184
+ " \n",
185
+ " # 1. Load the Embedding Model\n",
186
+ " print(f\"Loading embedding model: {MODEL_NAME}...\")\n",
187
+ " try:\n",
188
+ " # This line might show a deprecation warning, which is OK.\n",
189
+ " # It's the same one your agent.py is using.\n",
190
+ " embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)\n",
191
+ " except Exception as e:\n",
192
+ " print(f\"Error loading embeddings: {e}\")\n",
193
+ " print(\"Make sure 'sentence-transformers' is installed: pip install sentence-transformers\")\n",
194
+ " return\n",
195
+ "\n",
196
+ " # # 2. Load the FAISS Vector Store\n",
197
+ " # print(f\"Loading FAISS index from: {DB_FAISS_PATH}...\")\n",
198
+ " # try:\n",
199
+ " # db = FAISS.load_local(\n",
200
+ " # DB_FAISS_PATH, \n",
201
+ " # embeddings, \n",
202
+ " # allow_dangerous_deserialization=True # This is required\n",
203
+ " # )\n",
204
+ " # except Exception as e:\n",
205
+ " # print(f\"Error loading FAISS index: {e}\")\n",
206
+ " # print(\"Be sure you have run 'python ingest.py' successfully first.\")\n",
207
+ " # return\n",
208
+ "\n",
209
+ " retriever = db.as_retriever(search_kwargs={'k': 3})\n",
210
+ " \n",
211
+ " print(\"\\n✅ Retriever is ready.\")\n",
212
+ " print(\" Enter your query to test. Type 'exit' to quit.\")\n",
213
+ " \n",
214
+ " while True:\n",
215
+ " try:\n",
216
+ " query = input(\"\\nQuery> \")\n",
217
+ " if query.lower() == 'exit':\n",
218
+ " break\n",
219
+ " if not query:\n",
220
+ " continue\n",
221
+ " \n",
222
+ " print(f\"\\n--- Retrieving docs for: '{query}' ---\")\n",
223
+ " \n",
224
+ " documents = retriever.invoke(query)\n",
225
+ " \n",
226
+ " if not documents:\n",
227
+ " print(\"\\n!!! No documents found. !!!\")\n",
228
+ " else:\n",
229
+ " for i, doc in enumerate(documents):\n",
230
+ " print(f\"\\n--- Document {i+1} ---\")\n",
231
+ " print(f\"Source: {doc.metadata.get('source', 'N/A')}\")\n",
232
+ " print(f\"Page: {doc.metadata.get('page', 'N/A')}\")\n",
233
+ " print(\"\\nContent:\")\n",
234
+ " print(doc.page_content)\n",
235
+ " print(\"-\" * 20)\n",
236
+ " \n",
237
+ " except Exception as e:\n",
238
+ " print(f\"An error occurred: {e}\")\n",
239
+ "\n",
240
+ "if __name__ == \"__main__\":\n",
241
+ " check_retriever()\n"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 24,
247
+ "id": "45430224",
248
+ "metadata": {},
249
+ "outputs": [],
250
+ "source": [
251
+ "DB_FAISS_PATH = \"vectorstore/faiss_index2\"\n"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type": "code",
256
+ "execution_count": 25,
257
+ "id": "9488f2a3",
258
+ "metadata": {},
259
+ "outputs": [
260
+ {
261
+ "name": "stdout",
262
+ "output_type": "stream",
263
+ "text": [
264
+ "Successfully created and saved FAISS index to vectorstore/faiss_index2\n"
265
+ ]
266
+ }
267
+ ],
268
+ "source": [
269
+ "db = FAISS.from_documents(docs, embeddings)\n",
270
+ "db.save_local(DB_FAISS_PATH)\n",
271
+ "\n",
272
+ "print(f\"Successfully created and saved FAISS index to {DB_FAISS_PATH}\")"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": null,
278
+ "id": "bef0e8c2",
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": []
282
+ }
283
+ ],
284
+ "metadata": {
285
+ "kernelspec": {
286
+ "display_name": "venv",
287
+ "language": "python",
288
+ "name": "python3"
289
+ },
290
+ "language_info": {
291
+ "codemirror_mode": {
292
+ "name": "ipython",
293
+ "version": 3
294
+ },
295
+ "file_extension": ".py",
296
+ "mimetype": "text/x-python",
297
+ "name": "python",
298
+ "nbconvert_exporter": "python",
299
+ "pygments_lexer": "ipython3",
300
+ "version": "3.13.7"
301
+ }
302
+ },
303
+ "nbformat": 4,
304
+ "nbformat_minor": 5
305
+ }
ingest.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
3
+ from langchain_community.vectorstores import FAISS
4
+ from langchain_community.embeddings import HuggingFaceEmbeddings
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from dotenv import load_dotenv
7
+
8
+ load_dotenv()
9
+
10
# Ingestion script: load PDFs from data/, split into chunks, and build a
# FAISS index for the retriever.

DATA_PATH = "data/"
# NOTE(review): agent.py loads "vectorstore/faiss_index2", so running this
# script does not refresh the index the app actually uses. Confirm which
# path is canonical before unifying them.
DB_FAISS_PATH = "vectorstore/faiss_index"
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"  # Model for embeddings

# Must be the same embedding model the retriever uses at query time.
embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)

print(f"Loading documents from {DATA_PATH}...")
loader = DirectoryLoader(
    DATA_PATH,
    glob='*.pdf',
    loader_cls=PyPDFLoader
)
documents = loader.load()

if not documents:
    # raise SystemExit instead of the interactive-only exit() builtin:
    # the message still reaches the user, and the process now exits with
    # a non-zero status (bare exit() reported success on this failure path).
    raise SystemExit("No PDF documents found. Make sure your PDFs are in the /data folder.")

print(f"Loaded {len(documents)} PDF document(s).")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?", " ", ""]
)
docs = text_splitter.split_documents(documents)

print(f"Split into {len(docs)} chunks.")

print("Creating and saving FAISS vector store...")
db = FAISS.from_documents(docs, embeddings)
db.save_local(DB_FAISS_PATH)

print(f"Successfully created and saved FAISS index to {DB_FAISS_PATH}")
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ langchain
3
+ langchain-community
4
+ langgraph
5
+ langchain-google-genai
6
+ langchain-huggingface
7
+ faiss-cpu
8
+ sentence-transformers
9
+ pypdf
10
+ python-dotenv
11
+ langchain-text-splitters
12
+ pydantic
13
+ tiktoken