zidankhan committed on
Commit
c0f74f5
Β·
verified Β·
1 Parent(s): 014d53f

Upload 43 files

Browse files
Files changed (44) hide show
  1. .gitattributes +1 -0
  2. FinalProject/__init__.py +0 -0
  3. FinalProject/__pycache__/__init__.cpython-311.pyc +0 -0
  4. FinalProject/agents/__pycache__/graph.cpython-311.pyc +0 -0
  5. FinalProject/agents/__pycache__/graph.cpython-313.pyc +0 -0
  6. FinalProject/agents/__pycache__/state.cpython-311.pyc +0 -0
  7. FinalProject/agents/graph.py +40 -0
  8. FinalProject/agents/nodes/__init__.py +0 -0
  9. FinalProject/agents/nodes/__pycache__/__init__.cpython-311.pyc +0 -0
  10. FinalProject/agents/nodes/__pycache__/__init__.cpython-313.pyc +0 -0
  11. FinalProject/agents/nodes/__pycache__/answer_node.cpython-311.pyc +0 -0
  12. FinalProject/agents/nodes/__pycache__/rag_node.cpython-311.pyc +0 -0
  13. FinalProject/agents/nodes/__pycache__/rag_node.cpython-313.pyc +0 -0
  14. FinalProject/agents/nodes/__pycache__/router_noder.cpython-311.pyc +0 -0
  15. FinalProject/agents/nodes/__pycache__/wiki_node.cpython-311.pyc +0 -0
  16. FinalProject/agents/nodes/__pycache__/wiki_node.cpython-313.pyc +0 -0
  17. FinalProject/agents/nodes/answer_node.py +32 -0
  18. FinalProject/agents/nodes/jskdnvcoa.ipynb +61 -0
  19. FinalProject/agents/nodes/rag_node.py +13 -0
  20. FinalProject/agents/nodes/router_noder.py +47 -0
  21. FinalProject/agents/nodes/wiki_node.py +17 -0
  22. FinalProject/agents/state.py +12 -0
  23. FinalProject/agents/test.ipynb +51 -0
  24. FinalProject/app.py +165 -0
  25. FinalProject/data/__init__.py +0 -0
  26. FinalProject/data/__pycache__/__init__.cpython-311.pyc +0 -0
  27. FinalProject/data/__pycache__/dataingestion.cpython-311.pyc +0 -0
  28. FinalProject/data/dataingestion.py +33 -0
  29. FinalProject/data/pdfs/RU-MILITARY.pdf +3 -0
  30. FinalProject/data/pdfs/__init__.py +0 -0
  31. FinalProject/models/__init__.py +0 -0
  32. FinalProject/models/__pycache__/__init__.cpython-311.pyc +0 -0
  33. FinalProject/models/__pycache__/__init__.cpython-313.pyc +0 -0
  34. FinalProject/models/__pycache__/embedding.cpython-311.pyc +0 -0
  35. FinalProject/models/__pycache__/embedding.cpython-313.pyc +0 -0
  36. FinalProject/models/__pycache__/llm.cpython-311.pyc +0 -0
  37. FinalProject/models/__pycache__/retriever.cpython-311.pyc +0 -0
  38. FinalProject/models/__pycache__/retriever.cpython-313.pyc +0 -0
  39. FinalProject/models/embedding.py +6 -0
  40. FinalProject/models/llm.py +7 -0
  41. FinalProject/models/retriever.py +31 -0
  42. FinalProject/requirements.txt +10 -0
  43. FinalProject/test.ipynb +79 -0
  44. app.py +165 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ FinalProject/data/pdfs/RU-MILITARY.pdf filter=lfs diff=lfs merge=lfs -text
FinalProject/__init__.py ADDED
File without changes
FinalProject/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (166 Bytes). View file
 
FinalProject/agents/__pycache__/graph.cpython-311.pyc ADDED
Binary file (1.95 kB). View file
 
FinalProject/agents/__pycache__/graph.cpython-313.pyc ADDED
Binary file (1.66 kB). View file
 
FinalProject/agents/__pycache__/state.cpython-311.pyc ADDED
Binary file (1.18 kB). View file
 
FinalProject/agents/graph.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Wire up the agent LangGraph: router -> (document | wiki) -> answer."""
import os
import sys

# Make the FinalProject package root importable when this module is loaded
# directly (it lives one level below the package root).
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.join(current_dir, os.pardir)
sys.path.insert(0, project_root)

from agents.nodes.rag_node import rag_node
from agents.nodes.wiki_node import wiki_node
from agents.nodes.answer_node import answer_node
from agents.nodes.router_noder import route_node, route_decision
from agents.state import AgentGraph
from langgraph.graph import StateGraph

graph = StateGraph(state_schema=AgentGraph)

# One node per responsibility: classify, retrieve (two sources), answer.
for node_name, node_fn in (
    ("router", route_node),
    ("document", rag_node),
    ("wiki", wiki_node),
    ("answer", answer_node),
):
    graph.add_node(node_name, node_fn)

graph.set_entry_point("router")

# route_decision returns "rag" or "wiki"; map those onto the node names.
graph.add_conditional_edges(
    source="router",
    path=route_decision,
    path_map={"rag": "document", "wiki": "wiki"},
)

# Both retrieval paths converge on the answer node.
graph.add_edge("document", "answer")
graph.add_edge("wiki", "answer")

graph.set_finish_point("answer")

# Compiled, runnable graph consumed by the Streamlit front-end.
app = graph.compile()
FinalProject/agents/nodes/__init__.py ADDED
File without changes
FinalProject/agents/nodes/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (179 Bytes). View file
 
FinalProject/agents/nodes/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (177 Bytes). View file
 
FinalProject/agents/nodes/__pycache__/answer_node.cpython-311.pyc ADDED
Binary file (1.91 kB). View file
 
FinalProject/agents/nodes/__pycache__/rag_node.cpython-311.pyc ADDED
Binary file (708 Bytes). View file
 
FinalProject/agents/nodes/__pycache__/rag_node.cpython-313.pyc ADDED
Binary file (653 Bytes). View file
 
FinalProject/agents/nodes/__pycache__/router_noder.cpython-311.pyc ADDED
Binary file (2.48 kB). View file
 
FinalProject/agents/nodes/__pycache__/wiki_node.cpython-311.pyc ADDED
Binary file (1.04 kB). View file
 
FinalProject/agents/nodes/__pycache__/wiki_node.cpython-313.pyc ADDED
Binary file (989 Bytes). View file
 
FinalProject/agents/nodes/answer_node.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys

# Resolve the project root from this file's location, NOT from os.getcwd():
# the original getcwd()-based path only worked when the process happened to
# be started from FinalProject/agents/nodes (e.g. it broke under
# `streamlit run FinalProject/app.py` from the repository root).
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(_THIS_DIR, os.pardir, os.pardir))
sys.path.insert(0, project_root)

from models.llm import get_llm
from langchain_core.prompts import ChatPromptTemplate


def answer_node(state):
    """Generate the final answer from the retrieved context.

    Expects in ``state``:
      * ``messages``  – chat history; the last entry carries the user query.
      * ``documents`` – documents retrieved by the rag/wiki node.
      * ``api_key``   – Groq API key used to construct the LLM.

    Returns a partial state update ``{"messages": [ai_response]}``.
    Raises ``ValueError`` when no API key is present in the state.
    """
    question = state["messages"][-1].content
    documents = state["documents"]
    # Separate chunks visibly so the model can tell them apart.
    context = "\n\n----\n\n".join(doc.page_content for doc in documents)

    api_key = state.get("api_key")
    if not api_key:
        raise ValueError("API Key not found in state.")

    model = get_llm(api=api_key)

    prompt = ChatPromptTemplate.from_messages([
        ("system", "Your job is to provide a concise answer to the user query from the provided context: {context}"),
        ("user", "{query}"),
    ])

    answer_chain = prompt | model
    response = answer_chain.invoke({"query": question, "context": context})

    # The messages channel uses an additive reducer, so returning a one-item
    # list appends the response to the history.
    return {"messages": [response]}
FinalProject/agents/nodes/jskdnvcoa.ipynb ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2be3c102",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "import sys\n",
12
+ "\n",
13
+ "\n",
14
+ "project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))\n",
15
+ "\n",
16
+ "sys.path.insert(0, project_root)\n",
17
+ "\n",
18
+ "print(f\"Added {project_root} to system path.\")\n"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "d31913ce",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "from models.llm import get_llm"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "4edc211f",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": []
38
+ }
39
+ ],
40
+ "metadata": {
41
+ "kernelspec": {
42
+ "display_name": "myenv",
43
+ "language": "python",
44
+ "name": "python3"
45
+ },
46
+ "language_info": {
47
+ "codemirror_mode": {
48
+ "name": "ipython",
49
+ "version": 3
50
+ },
51
+ "file_extension": ".py",
52
+ "mimetype": "text/x-python",
53
+ "name": "python",
54
+ "nbconvert_exporter": "python",
55
+ "pygments_lexer": "ipython3",
56
+ "version": "3.11.8"
57
+ }
58
+ },
59
+ "nbformat": 4,
60
+ "nbformat_minor": 5
61
+ }
FinalProject/agents/nodes/rag_node.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def rag_node(state):
    """Retrieve document chunks relevant to the latest user question.

    Reads the question from the last entry of ``state["messages"]`` and the
    retriever from ``state["rag_retriever"]``.  When no retriever is present
    (no PDFs were uploaded), retrieval is skipped and an empty document list
    is returned so the graph can still proceed to the answer node.
    """
    question = state["messages"][-1].content
    retriever = state.get("rag_retriever")

    if retriever is None:
        print("RAG source is not available. Skipping retrieval.")
        return {"documents": []}

    return {"documents": retriever.invoke(question)}
FinalProject/agents/nodes/router_noder.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys

# Resolve the project root from this file's location, NOT from os.getcwd():
# the getcwd()-based path only worked when the process was started from
# FinalProject/agents/nodes.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(_THIS_DIR, os.pardir, os.pardir))
sys.path.insert(0, project_root)

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from data.dataingestion import load_all_pdfs
from models.llm import get_llm  # the duplicate import of get_llm was removed

# Loaded once at import time and reused for every routing decision.
document = load_all_pdfs()


def route_node(state):
    """Classify the user's question as document-specific or general knowledge.

    Asks the LLM whether the question relates to the bundled PDFs ('rag') or
    to general knowledge ('wikipedia'), and stores the decision under the
    ``source`` key ("rag" or "wiki") for the conditional edge to read.

    Raises ``ValueError`` when no API key is present in the state.
    """
    question = state["messages"][-1].content

    api_key = state.get("api_key")
    if not api_key:
        raise ValueError("API Key not found in state.")

    model = get_llm(api=api_key)

    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert router.
    Your task is to classify the user's question based on its content:
    1. 'rag': If the question is related to the topics provided in these documents : {documents}
    2. 'wikipedia': If the question is about general knowledge, history, people, or events.
    Return ONLY a single word string: 'rag' or 'wikipedia'.
    """),
        ("user", "{question}"),
    ])

    route_chain = prompt | model | StrOutputParser()
    route = route_chain.invoke({"question": question, "documents": document})

    # Normalise before matching: models sometimes reply with extra casing or
    # whitespace (e.g. "RAG.", " Rag"), which the raw substring test missed.
    if "rag" in route.strip().lower():
        decision = "rag"
        print("routing to rag")
    else:
        decision = "wiki"
        print("routing to wikipedia")

    return {"source": decision}


def route_decision(state):
    """Conditional-edge selector: return the source chosen by route_node."""
    return state["source"]
FinalProject/agents/nodes/wiki_node.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys

# Resolve the project root from this file's location, NOT from os.getcwd():
# the getcwd()-based path only worked when the process was started from
# FinalProject/agents/nodes.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(_THIS_DIR, os.pardir, os.pardir))
sys.path.insert(0, project_root)

from models.retriever import get_wiki_retriever


def wiki_node(state):
    """Fetch Wikipedia articles relevant to the latest user question.

    Returns a partial state update with the retrieved documents, and marks
    ``source`` as "final" so downstream consumers know retrieval is done.
    """
    retriever = get_wiki_retriever()
    question = state["messages"][-1].content
    documents = retriever.invoke(question)

    return {"documents": documents, "source": "final"}
FinalProject/agents/state.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# TypedDict/Annotated/Literal belong to the standard typing module; the
# original pulled them from langgraph.graph.message, which only works (if at
# all) by accident of that module's own imports.
from typing import Annotated, Any, List, Literal, Optional, TypedDict
from operator import add

from langchain_core.messages import BaseMessage
from langchain_core.documents import Document


class AgentGraph(TypedDict):
    """Shared state flowing through the agent graph.

    ``messages`` accumulates across nodes via the ``operator.add`` reducer;
    every other channel is replaced by the most recent node update.
    """

    # Chat history; each node's returned list is concatenated onto this.
    messages: Annotated[List[BaseMessage], add]
    # Chunks retrieved by the rag or wiki node for the current question.
    documents: List[Document]
    # Routing decision / progress marker written by the router and wiki nodes.
    source: Literal["rag", "wiki", "final"]
    # Groq API key supplied by the UI.
    api_key: Optional[str]
    # Retriever built from user-uploaded PDFs (None when nothing uploaded).
    # Declared here so LangGraph keeps the key instead of dropping it: app.py
    # writes it into the initial state and rag_node reads it back.
    rag_retriever: Optional[Any]
FinalProject/agents/test.ipynb ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "41112477",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from graph import app"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "id": "19a2b36e",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": []
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "49732fe8",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": []
28
+ }
29
+ ],
30
+ "metadata": {
31
+ "kernelspec": {
32
+ "display_name": "myenv",
33
+ "language": "python",
34
+ "name": "python3"
35
+ },
36
+ "language_info": {
37
+ "codemirror_mode": {
38
+ "name": "ipython",
39
+ "version": 3
40
+ },
41
+ "file_extension": ".py",
42
+ "mimetype": "text/x-python",
43
+ "name": "python",
44
+ "nbconvert_exporter": "python",
45
+ "pygments_lexer": "ipython3",
46
+ "version": "3.11.8"
47
+ }
48
+ },
49
+ "nbformat": 4,
50
+ "nbformat_minor": 5
51
+ }
FinalProject/app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front-end for the LangGraph multi-source RAG agent."""
import hashlib
import os
import sys
import tempfile
from typing import Tuple

import streamlit as st

# --- PATH SETUP ---
# Must run BEFORE any project-local import so the app also works when the
# process is not started from this directory (the original inserted the path
# only after `from agents.graph import app` had already executed).
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)

# --- PAGE CONFIGURATION ---
# set_page_config must be the first Streamlit command of a script run, so it
# comes before any code path that could call st.error().
st.set_page_config(
    page_title="GraphQuery RAG Agent",
    page_icon="🤖",
    layout="wide"
)

from langchain_core.messages import HumanMessage
from agents.graph import app

# models/retriever.py must provide this function: it accepts a list of PDF
# file paths and returns a LangChain retriever object.
try:
    from models.retriever import get_rag_retriever_from_paths
except ImportError:
    st.error("Could not import get_rag_retriever_from_paths. Please check your models/retriever.py file.")
    st.stop()  # Streamlit-safe halt; sys.exit() surfaces as a SystemExit traceback


# --- CACHED FUNCTION TO BUILD RAG RETRIEVER ---
# The cache is keyed on (filename, content-hash) pairs instead of temp-file
# paths: tempfile paths change on every script rerun, so the original cache
# missed (and re-indexed) on each page refresh.  The `_payloads` parameter
# has a leading underscore, which tells Streamlit to exclude it from hashing.
@st.cache_resource
def load_and_index_documents(file_keys: Tuple[Tuple[str, str], ...],
                             _payloads: Tuple[Tuple[str, bytes], ...]):
    """Build (and cache) a RAG retriever for the uploaded PDFs.

    ``file_keys`` are (name, sha256-hex) pairs forming the cache key;
    ``_payloads`` are (name, raw-bytes) pairs carrying the actual content.
    Returns None when there is nothing to index or indexing fails.
    """
    if not file_keys:
        return None

    with st.spinner(f"Indexing {len(file_keys)} PDF file(s)... This may take a moment."):
        try:
            # PyPDFLoader needs real file paths, so write the uploaded bytes
            # to a temporary directory for the duration of indexing.
            with tempfile.TemporaryDirectory() as temp_dir:
                paths = []
                for name, data in _payloads:
                    path = os.path.join(temp_dir, name)
                    with open(path, "wb") as fh:
                        fh.write(data)
                    paths.append(path)
                # Indexing completes before the temp dir is removed.
                retriever = get_rag_retriever_from_paths(paths)
            st.success(f"Indexed {len(file_keys)} PDF file(s) successfully!")
            return retriever
        except Exception as e:
            st.error(f"Failed to index documents: {e}")
            return None


# --- SIDEBAR (Settings, Key, and Upload) ---
with st.sidebar:
    st.header("⚙️ Agent Settings")
    st.caption("Configure your LLM and Access Key.")

    # API key is kept only in this session's state.
    api_key = st.text_input(
        "**Groq API Key (Required):**",
        type="password",
        help="Paste your private Groq API Key here. It is used only for this session.",
    )

    st.divider()

    # 1. FILE UPLOAD SECTION
    st.subheader("📚 Document Upload")
    uploaded_files = st.file_uploader(
        "Upload your own PDFs for RAG context:",
        type=["pdf"],
        accept_multiple_files=True
    )

    # 2. FILE HASHING & INDEXING LOGIC
    rag_retriever = None
    if uploaded_files:
        payloads = tuple((f.name, bytes(f.getbuffer())) for f in uploaded_files)
        file_keys = tuple(
            (name, hashlib.sha256(data).hexdigest()) for name, data in payloads
        )
        # 3. Build the retriever; cached until the set of uploads changes.
        rag_retriever = load_and_index_documents(file_keys, payloads)
    else:
        # Clear the cache when no files are uploaded to ensure a clean state.
        st.info("No documents uploaded. Only Wikipedia lookup is enabled.")
        load_and_index_documents.clear()

    st.divider()
    st.subheader("🛠️ Features")
    st.info(f"RAG (Document Context) status: {'**ENABLED**' if rag_retriever else 'DISABLED'}")
    st.info("Wikipedia Routing is always active.")
    st.text("MORE COMING SOON ⏱️")

# --- MAIN INTERFACE (Header) ---
st.markdown(
    """
    # 🧠 LangGraph Query Model
    ### Multi-Source RAG Agent
    Ask a question related to your uploaded documents or general knowledge.
    """
)
st.divider()

# --- STATE INITIALIZATION ---
initial_state_base = {
    "documents": [],
    "source": "",
    "api_key": api_key,
    # Pass the dynamically created retriever to the graph state.
    "rag_retriever": rag_retriever,
}

# --- CHAT INPUT AND LOGIC ---
with st.form(key='query_form', clear_on_submit=True):
    user_query = st.text_input(
        "**Your Question:**",
        placeholder="e.g., What is the significance of the military-industrial complex in Russia?",
        label_visibility="collapsed"
    )
    submit_button = st.form_submit_button(label='Ask the Agent 🚀')


# --- EXECUTION LOGIC ---
if submit_button and user_query:
    if not api_key:
        st.error("🔑 **Error:** Please enter your Groq API Key in the sidebar to run the query.")
        st.stop()

    st.info("🔄 **Querying the Agent...** Please wait.")

    # Prepare the per-run state: base settings plus the user's message.
    initial_state = initial_state_base.copy()
    initial_state["messages"] = [HumanMessage(content=user_query)]

    with st.spinner('Thinking... Routing and Retrieving Context...'):
        try:
            response = app.invoke(initial_state)

            # --- Output Display ---
            final_message = response["messages"][-1].content

            st.success("✅ **Agent Response:**")
            st.markdown(final_message)
            st.divider()

            # Optional: show which path the graph took and how much context.
            with st.expander("🔍 **Debug Info (Agent Flow)**"):
                st.write(f"**Final Source:** {response.get('source', 'Unknown')}")
                if 'documents' in response and response['documents']:
                    st.write(f"**Retrieved Documents:** {len(response['documents'])} chunks used.")

        except Exception as e:
            st.error("❌ **Agent Failure:** An error occurred during execution.")
            st.exception(e)

elif not user_query and not api_key:
    st.markdown("👋 Start by entering your **Groq API Key** in the sidebar and asking a question above!")
FinalProject/data/__init__.py ADDED
File without changes
FinalProject/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (171 Bytes). View file
 
FinalProject/data/__pycache__/dataingestion.cpython-311.pyc ADDED
Binary file (2.09 kB). View file
 
FinalProject/data/dataingestion.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.documents import Document
from typing import List

# Folder of bundled PDFs, resolved relative to this module so loading works
# regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
PDF_FOLDER = os.path.join(BASE_DIR, "pdfs")


def load_all_pdfs() -> List[Document]:
    """Load every PDF page found under ``data/pdfs``.

    Creates the folder (and returns an empty list) when it does not exist
    yet.  Each loaded page gets a ``source_short`` metadata entry carrying
    just the file name.  Loading errors are reported and swallowed, also
    yielding an empty list.
    """
    if not os.path.exists(PDF_FOLDER):
        os.makedirs(PDF_FOLDER, exist_ok=True)
        print(f"Created PDF ingestion directory: {PDF_FOLDER}")
        return []

    try:
        all_docs = PyPDFDirectoryLoader(PDF_FOLDER).load()
        print(f"Successfully loaded {len(all_docs)} document pages from the 'pdfs' folder.")

        # Attach a short, path-free source name for display purposes.
        for doc in all_docs:
            if 'source' in doc.metadata:
                doc.metadata['source_short'] = os.path.basename(doc.metadata['source'])

        return all_docs
    except Exception as e:
        print(f"Error loading PDFs: {e}")
        return []
FinalProject/data/pdfs/RU-MILITARY.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a435f3f6f06baf8883cab3f2c84cbe0be0a1268df3a3da74f2e3f17161dd6a7
3
+ size 1527223
FinalProject/data/pdfs/__init__.py ADDED
File without changes
FinalProject/models/__init__.py ADDED
File without changes
FinalProject/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (173 Bytes). View file
 
FinalProject/models/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (171 Bytes). View file
 
FinalProject/models/__pycache__/embedding.cpython-311.pyc ADDED
Binary file (458 Bytes). View file
 
FinalProject/models/__pycache__/embedding.cpython-313.pyc ADDED
Binary file (407 Bytes). View file
 
FinalProject/models/__pycache__/llm.cpython-311.pyc ADDED
Binary file (449 Bytes). View file
 
FinalProject/models/__pycache__/retriever.cpython-311.pyc ADDED
Binary file (1.85 kB). View file
 
FinalProject/models/__pycache__/retriever.cpython-313.pyc ADDED
Binary file (1.6 kB). View file
 
FinalProject/models/embedding.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from langchain_huggingface import HuggingFaceEmbeddings


def get_embeddings():
    """Return the sentence-transformer embedding model used for indexing.

    HuggingFaceEmbeddings declares the checkpoint field as ``model_name``;
    the original ``model=`` keyword is not a declared field and fails the
    model's pydantic validation.
    """
    return HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2"
    )
FinalProject/models/llm.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
from langchain_groq import ChatGroq


def get_llm(api):
    """Build a Groq chat model (llama-3.3-70b-versatile) for the given API key."""
    config = {
        "model": "llama-3.3-70b-versatile",
        "api_key": api,
    }
    return ChatGroq(**config)
FinalProject/models/retriever.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from functools import lru_cache
from typing import List

from langchain_community.retrievers import WikipediaRetriever
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from .embedding import get_embeddings


@lru_cache(maxsize=1)
def _get_embedder():
    """Create the embedding model on first use.

    The original built the model at import time, so merely importing this
    module paid the full model download/initialisation cost; lazy creation
    defers that until a retriever is actually requested (still created only
    once thanks to the cache).
    """
    return get_embeddings()


def get_rag_retriever_from_paths(pdf_paths: List[str]):
    """Load PDFs from the given paths, split them, and build a Chroma retriever.

    Raises ``ValueError`` when no pages could be loaded (Chroma would
    otherwise fail later with an opaque empty-collection error).
    """
    all_docs = []
    for path in pdf_paths:
        all_docs.extend(PyPDFLoader(path).load())

    if not all_docs:
        raise ValueError("No pages could be loaded from the given PDF paths.")

    # chunk_overlap of 270 keeps generous context continuity between chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=270)
    splits = text_splitter.split_documents(all_docs)

    # In-memory vector store; rebuilt per upload set by the caller's cache.
    vectorstore = Chroma.from_documents(documents=splits, embedding=_get_embedder())
    return vectorstore.as_retriever()


def get_wiki_retriever():
    """Return a Wikipedia retriever limited to the top two results."""
    return WikipediaRetriever(top_k_results=2)
FinalProject/requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ langchain-core
3
+ langchain-community
4
+ langchain-text-splitters
5
+ langgraph
6
+ langchain-groq
7
+ langchain-huggingface
8
+ pypdf
9
+ sentence-transformers
10
+ chromadb
FinalProject/test.ipynb ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "f6960caa",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "text/plain": [
12
+ "[Document(metadata={'source': 'c:\\\\Users\\\\xyzai\\\\Documents\\\\GraphDB\\\\FinalProject\\\\data\\\\pdfs\\\\RU-MILITARY.pdf', 'creationdate': '2014-04-22T10:23:16+02:00', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'trapped': '/False', 'page_label': '3', 'total_pages': 4, 'moddate': '2014-04-22T10:23:18+02:00', 'page': 2, 'producer': 'Adobe PDF Library 10.0.1'}, page_content='developing the military-industrial complex.\\nThe Kremlin’s massive armaments program \\nand its reform of the military-industrial \\ncomplex also has significance in terms of \\nindustrial and social policy. The military in-\\ndustry employs two million workers; five \\nper cent of the Russian population depend \\non it for their livelihood. In this way, the \\nKremlin is β€œsolving” an issue in Soviet style: \\nFunding for the military is once more tak -\\ning on a central role in society. It is hoped \\nthat this will boost innovation and global \\nRussian Demographics\\nRussia has a security apparatus \\nproportionally more than twice \\nthe size of that of the US.'),\n",
13
+ " Document(metadata={'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2014-04-22T10:23:16+02:00', 'source': 'c:\\\\Users\\\\xyzai\\\\Documents\\\\GraphDB\\\\FinalProject\\\\data\\\\pdfs\\\\RU-MILITARY.pdf', 'producer': 'Adobe PDF Library 10.0.1', 'moddate': '2014-04-22T10:23:18+02:00', 'total_pages': 4, 'page_label': '3', 'page': 2, 'trapped': '/False'}, page_content='developing the military-industrial complex.\\nThe Kremlin’s massive armaments program \\nand its reform of the military-industrial \\ncomplex also has significance in terms of \\nindustrial and social policy. The military in-\\ndustry employs two million workers; five \\nper cent of the Russian population depend \\non it for their livelihood. In this way, the \\nKremlin is β€œsolving” an issue in Soviet style: \\nFunding for the military is once more tak -\\ning on a central role in society. It is hoped \\nthat this will boost innovation and global \\nRussian Demographics\\nRussia has a security apparatus \\nproportionally more than twice \\nthe size of that of the US.'),\n",
14
+ " Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'source': 'c:\\\\Users\\\\xyzai\\\\Documents\\\\GraphDB\\\\FinalProject\\\\data\\\\pdfs\\\\RU-MILITARY.pdf', 'total_pages': 4, 'trapped': '/False', 'creationdate': '2014-04-22T10:23:16+02:00', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'page_label': '4', 'moddate': '2014-04-22T10:23:18+02:00', 'page': 3}, page_content='there are bottlenecks in production capaci-\\nty, for example in aircraft production and \\nshipbuilding. The plans to enhance military \\ntransport aviation can only be realized if \\ncapacity is expanded rapidly. Russia also \\ndepends on cooperation with Ukraine: So \\nfar, many motors for helicopters and air -\\ncraft as well as rockets have been produced \\nin Ukraine. Russia lacks the know-how for \\nproducing many of the parts required. The \\ncurrent conflict is putting a strain on this \\ncooperation and necessitates import substi-\\ntutes, which entail great cost and delays.\\nThe Effects of Remilitarization\\nUnder Vladimir Putin, the modernization \\nof Russia’s armed forces has become a pri -\\nority for the first time since 1991. For sev-\\neral years, considerable sums have been ex-\\npended on this reform. However, challenges \\nremain when it comes to technology and \\norganizational culture; and demographic \\nproblems are also still an issue. Moreover, \\nthe lagging economic output will exacer -'),\n",
15
+ " Document(metadata={'creationdate': '2014-04-22T10:23:16+02:00', 'total_pages': 4, 'moddate': '2014-04-22T10:23:18+02:00', 'trapped': '/False', 'page_label': '4', 'source': 'c:\\\\Users\\\\xyzai\\\\Documents\\\\GraphDB\\\\FinalProject\\\\data\\\\pdfs\\\\RU-MILITARY.pdf', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'page': 3, 'producer': 'Adobe PDF Library 10.0.1'}, page_content='there are bottlenecks in production capaci-\\nty, for example in aircraft production and \\nshipbuilding. The plans to enhance military \\ntransport aviation can only be realized if \\ncapacity is expanded rapidly. Russia also \\ndepends on cooperation with Ukraine: So \\nfar, many motors for helicopters and air -\\ncraft as well as rockets have been produced \\nin Ukraine. Russia lacks the know-how for \\nproducing many of the parts required. The \\ncurrent conflict is putting a strain on this \\ncooperation and necessitates import substi-\\ntutes, which entail great cost and delays.\\nThe Effects of Remilitarization\\nUnder Vladimir Putin, the modernization \\nof Russia’s armed forces has become a pri -\\nority for the first time since 1991. For sev-\\neral years, considerable sums have been ex-\\npended on this reform. However, challenges \\nremain when it comes to technology and \\norganizational culture; and demographic \\nproblems are also still an issue. Moreover, \\nthe lagging economic output will exacer -')]"
16
+ ]
17
+ },
18
+ "execution_count": 4,
19
+ "metadata": {},
20
+ "output_type": "execute_result"
21
+ }
22
+ ],
23
+ "source": [
24
+ "from langchain_community.retrievers import WikipediaRetriever\n",
25
+ "from langchain_community.vectorstores import Chroma\n",
26
+ "from data.dataingestion import load_all_pdfs\n",
27
+ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
28
+ "from langchain_community.vectorstores import Chroma\n",
29
+ "from models.embedding import get_embeddings\n",
30
+ "embeddings = get_embeddings()\n",
31
+ "pdf_data = load_all_pdfs()\n",
32
+ "\n",
33
+ "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=270)\n",
34
+ "splits = text_splitter.split_documents(pdf_data)\n",
35
+ "vectorstore = Chroma.from_documents(documents=splits,embedding=embeddings)\n",
36
+ "rag_retriever = vectorstore.as_retriever()\n",
37
+ "resp=rag_retriever.invoke(\"russian military\")\n",
38
+ "resp \n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "261ab304",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": []
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "id": "c7e948d4",
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": []
56
+ }
57
+ ],
58
+ "metadata": {
59
+ "kernelspec": {
60
+ "display_name": "myenv",
61
+ "language": "python",
62
+ "name": "python3"
63
+ },
64
+ "language_info": {
65
+ "codemirror_mode": {
66
+ "name": "ipython",
67
+ "version": 3
68
+ },
69
+ "file_extension": ".py",
70
+ "mimetype": "text/x-python",
71
+ "name": "python",
72
+ "nbconvert_exporter": "python",
73
+ "pygments_lexer": "ipython3",
74
+ "version": "3.11.8"
75
+ }
76
+ },
77
+ "nbformat": 4,
78
+ "nbformat_minor": 5
79
+ }
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# --- PATH SETUP ---
# The project root (where agents/ and models/ live) must be on sys.path
# BEFORE any local import. The original imported agents.graph and
# models.retriever first, which raises ImportError whenever the app is
# launched from a different working directory.
import os
import sys
import tempfile
from typing import List

current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)

import streamlit as st
from langchain_core.messages import HumanMessage

from agents.graph import app

# Ensure you have implemented this function in FinalProject/models/retriever.py
# It should accept a list of PDF file paths and return a LangChain Retriever object.
try:
    from models.retriever import get_rag_retriever_from_paths
except ImportError:
    st.error("Could not import get_rag_retriever_from_paths. Please check your models/retriever.py file.")
    sys.exit()


# --- PAGE CONFIGURATION ---
st.set_page_config(
    page_title="GraphQuery RAG Agent",
    page_icon="πŸ€–",
    layout="wide"
)
28
+
29
# --- CACHED FUNCTION TO BUILD RAG RETRIEVER ---
# Streamlit hashes the file_paths list, so the expensive indexing step only
# reruns when the list of paths changes (files added/removed).
@st.cache_resource
def load_and_index_documents(file_paths: List[str]):
    """Build (and cache) a RAG retriever over the given PDF paths.

    Returns the retriever object, or None when no paths are supplied or
    when indexing fails (the error is surfaced in the UI).
    """
    if not file_paths:
        return None

    count = len(file_paths)
    with st.spinner(f"Indexing {count} PDF file(s)... This may take a moment."):
        try:
            # Delegates the actual loading/embedding to models/retriever.py.
            retriever = get_rag_retriever_from_paths(file_paths)
        except Exception as exc:
            st.error(f"Failed to index documents: {exc}")
            return None
        st.success(f"Indexed {count} PDF file(s) successfully!")
        return retriever
47
+
48
# --- SIDEBAR (Settings, Key, and Upload) ---
with st.sidebar:
    st.header("βš™οΈ Agent Settings")
    st.caption("Configure your LLM and Access Key.")

    # API Key Input (used only for this session; passed into the graph state).
    api_key = st.text_input(
        "**Groq API Key (Required):**",
        type="password",
        help="Paste your private Groq API Key here. It is used only for this session.",
    )

    st.divider()

    # 1. FILE UPLOAD SECTION
    st.subheader("πŸ“š Document Upload")
    uploaded_files = st.file_uploader(
        "Upload your own PDFs for RAG context:",
        type=["pdf"],
        accept_multiple_files=True
    )

    # 2. FILE SAVING & INDEXING LOGIC
    file_paths = []
    rag_retriever = None

    if uploaded_files:
        import hashlib

        # Streamlit uploads live in memory; LangChain's PyPDFLoader needs real
        # file paths, so each upload is written to disk. The paths are derived
        # from the file's content hash so they are STABLE across Streamlit
        # reruns: a fresh TemporaryDirectory per rerun (as before) changed the
        # paths on every interaction, which defeated st.cache_resource
        # (re-indexing the same PDFs each time) and deleted the files when the
        # context manager exited.
        upload_dir = os.path.join(tempfile.gettempdir(), "graphquery_uploads")
        os.makedirs(upload_dir, exist_ok=True)
        for uploaded_file in uploaded_files:
            data = uploaded_file.getbuffer()
            digest = hashlib.sha256(data).hexdigest()[:16]
            file_path = os.path.join(upload_dir, f"{digest}_{uploaded_file.name}")
            # Content-addressed name: skip the write if this exact file
            # was already saved on a previous rerun.
            if not os.path.exists(file_path):
                with open(file_path, "wb") as f:
                    f.write(data)
            file_paths.append(file_path)

        # 3. Build the retriever; cached on the (now stable) list of paths.
        rag_retriever = load_and_index_documents(file_paths)

    else:
        # Clear the cache if no files are uploaded to ensure a clean state.
        st.info("No documents uploaded. Only Wikipedia lookup is enabled.")
        load_and_index_documents.clear()  # Clears the cache for this function

    st.divider()
    st.subheader("πŸ› οΈ Features")
    st.info(f"RAG (Document Context) status: {'**ENABLED**' if rag_retriever else 'DISABLED'}")
    st.info("Wikipedia Routing is always active.")
    st.text("MORE COMING SOON ⏱️")
100
+
101
# --- MAIN INTERFACE (Header) ---
st.markdown(
    """
    # 🧠 LangGraph Query Model
    ### Multi-Source RAG Agent
    Ask a question related to your uploaded documents or general knowledge.
    """
)
st.divider()

# --- STATE INITIALIZATION ---
# Base graph state shared by every query; the retriever built in the sidebar
# (or None when nothing was uploaded) rides along into the LangGraph app.
initial_state_base = dict(
    documents=[],
    source="",
    api_key=api_key,
    rag_retriever=rag_retriever,
)

# --- CHAT INPUT AND LOGIC ---
with st.form(key="query_form", clear_on_submit=True):
    user_query = st.text_input(
        "**Your Question:**",
        placeholder="e.g., What is the significance of the military-industrial complex in Russia?",
        label_visibility="collapsed",
    )
    submit_button = st.form_submit_button(label="Ask the Agent πŸš€")
128
+
129
+
130
# --- EXECUTION LOGIC ---

if submit_button and user_query:
    # The agent cannot run without a Groq key; bail out before invoking.
    if not api_key:
        st.error("πŸ”‘ **Error:** Please enter your Groq API Key in the sidebar to run the query.")
        st.stop()

    st.info("πŸ”„ **Querying the Agent...** Please wait.")

    # Fresh per-query state: the shared base settings plus the user's message.
    initial_state = {**initial_state_base,
                     "messages": [HumanMessage(content=user_query)]}

    with st.spinner('Thinking... Routing and Retrieving Context...'):
        try:
            response = app.invoke(initial_state)

            # --- Output Display ---
            final_message = response["messages"][-1].content
            st.success("βœ… **Agent Response:**")
            st.markdown(final_message)
            st.divider()

            # Optional: Show debug info about the route taken and RAG usage.
            with st.expander("πŸ” **Debug Info (Agent Flow)**"):
                st.write(f"**Final Source:** {response.get('source', 'Unknown')}")
                retrieved = response.get('documents')
                if retrieved:
                    st.write(f"**Retrieved Documents:** {len(retrieved)} chunks used.")

        except Exception as err:
            st.error("❌ **Agent Failure:** An error occurred during execution.")
            st.exception(err)

elif not user_query and not api_key:
    st.markdown("πŸ‘‹ Start by entering your **Groq API Key** in the sidebar and asking a question above!")