Spaces:
Sleeping
Sleeping
Upload 43 files
Browse files- .gitattributes +1 -0
- FinalProject/__init__.py +0 -0
- FinalProject/__pycache__/__init__.cpython-311.pyc +0 -0
- FinalProject/agents/__pycache__/graph.cpython-311.pyc +0 -0
- FinalProject/agents/__pycache__/graph.cpython-313.pyc +0 -0
- FinalProject/agents/__pycache__/state.cpython-311.pyc +0 -0
- FinalProject/agents/graph.py +40 -0
- FinalProject/agents/nodes/__init__.py +0 -0
- FinalProject/agents/nodes/__pycache__/__init__.cpython-311.pyc +0 -0
- FinalProject/agents/nodes/__pycache__/__init__.cpython-313.pyc +0 -0
- FinalProject/agents/nodes/__pycache__/answer_node.cpython-311.pyc +0 -0
- FinalProject/agents/nodes/__pycache__/rag_node.cpython-311.pyc +0 -0
- FinalProject/agents/nodes/__pycache__/rag_node.cpython-313.pyc +0 -0
- FinalProject/agents/nodes/__pycache__/router_noder.cpython-311.pyc +0 -0
- FinalProject/agents/nodes/__pycache__/wiki_node.cpython-311.pyc +0 -0
- FinalProject/agents/nodes/__pycache__/wiki_node.cpython-313.pyc +0 -0
- FinalProject/agents/nodes/answer_node.py +32 -0
- FinalProject/agents/nodes/jskdnvcoa.ipynb +61 -0
- FinalProject/agents/nodes/rag_node.py +13 -0
- FinalProject/agents/nodes/router_noder.py +47 -0
- FinalProject/agents/nodes/wiki_node.py +17 -0
- FinalProject/agents/state.py +12 -0
- FinalProject/agents/test.ipynb +51 -0
- FinalProject/app.py +165 -0
- FinalProject/data/__init__.py +0 -0
- FinalProject/data/__pycache__/__init__.cpython-311.pyc +0 -0
- FinalProject/data/__pycache__/dataingestion.cpython-311.pyc +0 -0
- FinalProject/data/dataingestion.py +33 -0
- FinalProject/data/pdfs/RU-MILITARY.pdf +3 -0
- FinalProject/data/pdfs/__init__.py +0 -0
- FinalProject/models/__init__.py +0 -0
- FinalProject/models/__pycache__/__init__.cpython-311.pyc +0 -0
- FinalProject/models/__pycache__/__init__.cpython-313.pyc +0 -0
- FinalProject/models/__pycache__/embedding.cpython-311.pyc +0 -0
- FinalProject/models/__pycache__/embedding.cpython-313.pyc +0 -0
- FinalProject/models/__pycache__/llm.cpython-311.pyc +0 -0
- FinalProject/models/__pycache__/retriever.cpython-311.pyc +0 -0
- FinalProject/models/__pycache__/retriever.cpython-313.pyc +0 -0
- FinalProject/models/embedding.py +6 -0
- FinalProject/models/llm.py +7 -0
- FinalProject/models/retriever.py +31 -0
- FinalProject/requirements.txt +10 -0
- FinalProject/test.ipynb +79 -0
- app.py +165 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
FinalProject/data/pdfs/RU-MILITARY.pdf filter=lfs diff=lfs merge=lfs -text
|
FinalProject/__init__.py
ADDED
|
File without changes
|
FinalProject/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (166 Bytes). View file
|
|
|
FinalProject/agents/__pycache__/graph.cpython-311.pyc
ADDED
|
Binary file (1.95 kB). View file
|
|
|
FinalProject/agents/__pycache__/graph.cpython-313.pyc
ADDED
|
Binary file (1.66 kB). View file
|
|
|
FinalProject/agents/__pycache__/state.cpython-311.pyc
ADDED
|
Binary file (1.18 kB). View file
|
|
|
FinalProject/agents/graph.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys

# Make the project root (FinalProject/) importable: this file lives in
# FinalProject/agents/, so one level up is the directory that contains the
# `agents`, `models` and `data` packages imported below.
current_dir = os.path.dirname(os.path.abspath(__file__))

project_root = os.path.join(current_dir, os.pardir)

sys.path.insert(0, project_root)

from agents.nodes.rag_node import rag_node
from agents.nodes.wiki_node import wiki_node
from agents.nodes.answer_node import answer_node
from agents.nodes.router_noder import route_node,route_decision
from agents.state import AgentGraph
from langgraph.graph import StateGraph

# Agent workflow: router -> (document | wiki) -> answer.
graph = StateGraph(state_schema=AgentGraph)

graph.add_node("router",route_node)
graph.add_node("document",rag_node)
graph.add_node("wiki",wiki_node)
graph.add_node("answer",answer_node)

graph.set_entry_point("router")

# route_node stores its decision under state["source"]; route_decision reads
# it back and this mapping picks the next node from that value.
graph.add_conditional_edges(
    source="router",
    path=route_decision,
    path_map={
        "rag":"document",
        "wiki":"wiki"
    }
)

# Both retrieval branches feed the answer node, which ends the run.
graph.add_edge("document","answer")
graph.add_edge("wiki","answer")

graph.set_finish_point("answer")

# Compiled runnable; imported by the Streamlit app as `app` and invoked with
# the initial state dict.
app = graph.compile()
FinalProject/agents/nodes/__init__.py
ADDED
|
File without changes
|
FinalProject/agents/nodes/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (179 Bytes). View file
|
|
|
FinalProject/agents/nodes/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (177 Bytes). View file
|
|
|
FinalProject/agents/nodes/__pycache__/answer_node.cpython-311.pyc
ADDED
|
Binary file (1.91 kB). View file
|
|
|
FinalProject/agents/nodes/__pycache__/rag_node.cpython-311.pyc
ADDED
|
Binary file (708 Bytes). View file
|
|
|
FinalProject/agents/nodes/__pycache__/rag_node.cpython-313.pyc
ADDED
|
Binary file (653 Bytes). View file
|
|
|
FinalProject/agents/nodes/__pycache__/router_noder.cpython-311.pyc
ADDED
|
Binary file (2.48 kB). View file
|
|
|
FinalProject/agents/nodes/__pycache__/wiki_node.cpython-311.pyc
ADDED
|
Binary file (1.04 kB). View file
|
|
|
FinalProject/agents/nodes/__pycache__/wiki_node.cpython-313.pyc
ADDED
|
Binary file (989 Bytes). View file
|
|
|
FinalProject/agents/nodes/answer_node.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys

# Anchor the project root (FinalProject/) to this file's location instead of
# os.getcwd(): the old working-directory-relative path only resolved correctly
# when the process happened to be started inside agents/nodes/.
_current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(_current_dir, os.pardir, os.pardir))

sys.path.insert(0, project_root)

from models.llm import get_llm
from langchain_core.prompts import ChatPromptTemplate


def answer_node(state):
    """Generate the final answer from the retrieved documents.

    Reads the latest user message and the documents gathered by the
    rag/wiki nodes, then asks the LLM for a concise answer grounded in
    that context. Returns a partial state update: {"messages": [response]}.

    Raises:
        ValueError: when no API key is present in the state.
    """
    question = state["messages"][-1].content
    document = state["documents"]
    # Join chunks with a visible separator so the model can tell them apart.
    context = "\n\n----\n\n".join([docs.page_content for docs in document])

    api_key = state.get("api_key")
    if not api_key:
        raise ValueError("API Key not found in state.")

    model = get_llm(api=api_key)

    prompt = ChatPromptTemplate.from_messages([
        ("system", "Your job is to provide a concise answer to the user query from the provided context: {context}"),
        ("user", "{query}"),
    ])

    answer_chain = prompt | model

    response = answer_chain.invoke({"query": question, "context": context})

    return {"messages": [response]}
FinalProject/agents/nodes/jskdnvcoa.ipynb
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "2be3c102",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))\n",
|
| 15 |
+
"\n",
|
| 16 |
+
"sys.path.insert(0, project_root)\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"print(f\"Added {project_root} to system path.\")\n"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": null,
|
| 24 |
+
"id": "d31913ce",
|
| 25 |
+
"metadata": {},
|
| 26 |
+
"outputs": [],
|
| 27 |
+
"source": [
|
| 28 |
+
"from models.llm import get_llm"
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"cell_type": "code",
|
| 33 |
+
"execution_count": null,
|
| 34 |
+
"id": "4edc211f",
|
| 35 |
+
"metadata": {},
|
| 36 |
+
"outputs": [],
|
| 37 |
+
"source": []
|
| 38 |
+
}
|
| 39 |
+
],
|
| 40 |
+
"metadata": {
|
| 41 |
+
"kernelspec": {
|
| 42 |
+
"display_name": "myenv",
|
| 43 |
+
"language": "python",
|
| 44 |
+
"name": "python3"
|
| 45 |
+
},
|
| 46 |
+
"language_info": {
|
| 47 |
+
"codemirror_mode": {
|
| 48 |
+
"name": "ipython",
|
| 49 |
+
"version": 3
|
| 50 |
+
},
|
| 51 |
+
"file_extension": ".py",
|
| 52 |
+
"mimetype": "text/x-python",
|
| 53 |
+
"name": "python",
|
| 54 |
+
"nbconvert_exporter": "python",
|
| 55 |
+
"pygments_lexer": "ipython3",
|
| 56 |
+
"version": "3.11.8"
|
| 57 |
+
}
|
| 58 |
+
},
|
| 59 |
+
"nbformat": 4,
|
| 60 |
+
"nbformat_minor": 5
|
| 61 |
+
}
|
FinalProject/agents/nodes/rag_node.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rag_node(state):
    """Retrieve documents relevant to the latest user message.

    Returns a partial state update of the form {"documents": [...]}.
    When no retriever is present in the state, retrieval is skipped and
    an empty document list is returned instead of failing.
    """
    query = state["messages"][-1].content

    retriever = state.get("rag_retriever")
    if retriever is None:
        # No index was built (e.g. no PDFs uploaded) — degrade gracefully.
        print("RAG source is not available. Skipping retrieval.")
        return {"documents": []}

    return {"documents": retriever.invoke(query)}
FinalProject/agents/nodes/router_noder.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys

# Anchor the project root (FinalProject/) to this file's location rather than
# os.getcwd(), so the import path works no matter where the process starts.
_current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(_current_dir, os.pardir, os.pardir))

sys.path.insert(0, project_root)

# Note: get_llm was previously imported twice; the duplicate is removed.
from models.llm import get_llm
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from data.dataingestion import load_all_pdfs

# Loaded once at import time; used only as context in the routing prompt.
document = load_all_pdfs()


def route_node(state):
    """Classify the latest user message as a document question ('rag') or a
    general-knowledge question ('wiki') and store the decision in the state.

    Raises:
        ValueError: when no API key is present in the state.
    """
    question = state["messages"][-1].content

    api_key = state.get("api_key")
    if not api_key:
        raise ValueError("API Key not found in state.")

    model = get_llm(api=api_key)

    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert router.
Your task is to classify the user's question based on its content:
1. 'rag': If the question is related to the topics provided in these documents : {documents}
2. 'wikipedia': If the question is about general knowledge, history, people, or events.
Return ONLY a single word string: 'rag' or 'wikipedia'.
"""),
        ("user", "{question}"),
    ])

    route_chain = prompt | model | StrOutputParser()

    route = route_chain.invoke({"question": question, "documents": document})

    # Anything that is not recognizably 'rag' falls back to Wikipedia.
    if "rag" in route:
        decision = "rag"
        print("routing to rag")
    else:
        decision = "wiki"
        print("routing to wikipedia")

    return {"source": decision}


def route_decision(state):
    """Conditional-edge selector: return the decision stored by route_node."""
    return state["source"]
FinalProject/agents/nodes/wiki_node.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys

# Anchor the project root (FinalProject/) to this file's location instead of
# os.getcwd(), so the import below works from any working directory.
_current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(_current_dir, os.pardir, os.pardir))

sys.path.insert(0, project_root)


from models.retriever import get_wiki_retriever


def wiki_node(state):
    """Fetch Wikipedia context for the latest user message.

    Returns a partial state update with the retrieved documents and marks the
    source as 'final' so the graph proceeds straight to the answer node.
    """
    retriever = get_wiki_retriever()
    question = state["messages"][-1].content
    document = retriever.invoke(question)

    return {"documents": document, "source": "final"}
FinalProject/agents/state.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# TypedDict, Annotated and Literal belong to `typing`; importing them from
# langgraph.graph.message relied on an incidental re-export. The unused
# StateGraph import is dropped.
from typing import Annotated, List, Literal, Optional, TypedDict
from operator import add

from langchain_core.messages import BaseMessage
from langchain_core.documents import Document


class AgentGraph(TypedDict):
    """Shared LangGraph state passed between router, retrieval and answer nodes."""

    # Conversation history; `add` makes LangGraph append new messages
    # instead of replacing the list on each node update.
    messages: Annotated[List[BaseMessage], add]
    # Documents retrieved by the rag/wiki nodes for the answer node to use.
    documents: List[Document]
    # Routing decision produced by route_node / consumed by route_decision.
    source: Literal["rag", "wiki", "final"]
    # Groq API key supplied by the UI; validated inside the nodes.
    api_key: Optional[str]
FinalProject/agents/test.ipynb
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 4,
|
| 6 |
+
"id": "41112477",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"from graph import app"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": null,
|
| 16 |
+
"id": "19a2b36e",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": []
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": null,
|
| 24 |
+
"id": "49732fe8",
|
| 25 |
+
"metadata": {},
|
| 26 |
+
"outputs": [],
|
| 27 |
+
"source": []
|
| 28 |
+
}
|
| 29 |
+
],
|
| 30 |
+
"metadata": {
|
| 31 |
+
"kernelspec": {
|
| 32 |
+
"display_name": "myenv",
|
| 33 |
+
"language": "python",
|
| 34 |
+
"name": "python3"
|
| 35 |
+
},
|
| 36 |
+
"language_info": {
|
| 37 |
+
"codemirror_mode": {
|
| 38 |
+
"name": "ipython",
|
| 39 |
+
"version": 3
|
| 40 |
+
},
|
| 41 |
+
"file_extension": ".py",
|
| 42 |
+
"mimetype": "text/x-python",
|
| 43 |
+
"name": "python",
|
| 44 |
+
"nbconvert_exporter": "python",
|
| 45 |
+
"pygments_lexer": "ipython3",
|
| 46 |
+
"version": "3.11.8"
|
| 47 |
+
}
|
| 48 |
+
},
|
| 49 |
+
"nbformat": 4,
|
| 50 |
+
"nbformat_minor": 5
|
| 51 |
+
}
|
FinalProject/app.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys
import tempfile
from typing import List

import streamlit as st
from langchain_core.messages import HumanMessage

# --- PATH SETUP ---
# Must run BEFORE the project imports below: in the original file the
# `agents.graph` / `models.retriever` imports executed first, so the path
# insertion they depend on came too late.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)

from agents.graph import app

# Ensure you have implemented this function in FinalProject/models/retriever.py
# It should accept a list of PDF file paths and return a LangChain Retriever object.
try:
    from models.retriever import get_rag_retriever_from_paths
except ImportError:
    st.error("Could not import get_rag_retriever_from_paths. Please check your models/retriever.py file.")
    sys.exit()

# --- PAGE CONFIGURATION ---
st.set_page_config(
    page_title="GraphQuery RAG Agent",
    page_icon="🤖",
    layout="wide"
)

# --- CACHED FUNCTION TO BUILD RAG RETRIEVER ---
# Hashing trick: By passing file_paths (a list of strings), Streamlit hashes the list.
# The expensive function only runs if the list of paths changes (i.e., files are added/removed).
@st.cache_resource
def load_and_index_documents(file_paths: List[str]):
    """Loads documents and creates/returns a RAG retriever."""
    if not file_paths:
        return None

    with st.spinner(f"Indexing {len(file_paths)} PDF file(s)... This may take a moment."):
        try:
            # Calls the function from your models/retriever.py
            retriever = get_rag_retriever_from_paths(file_paths)
            st.success(f"Indexed {len(file_paths)} PDF file(s) successfully!")
            return retriever
        except Exception as e:
            st.error(f"Failed to index documents: {e}")
            return None

# --- SIDEBAR (Settings, Key, and Upload) ---
with st.sidebar:
    st.header("⚙️ Agent Settings")
    st.caption("Configure your LLM and Access Key.")

    # API Key Input
    api_key = st.text_input(
        "**Groq API Key (Required):**",
        type="password",
        help="Paste your private Groq API Key here. It is used only for this session.",
    )

    st.divider()

    # 1. FILE UPLOAD SECTION
    st.subheader("📄 Document Upload")
    uploaded_files = st.file_uploader(
        "Upload your own PDFs for RAG context:",
        type=["pdf"],
        accept_multiple_files=True
    )

    # 2. FILE SAVING & INDEXING LOGIC
    file_paths = []
    rag_retriever = None

    if uploaded_files:
        # Streamlit files are in memory; we must write them to a temporary file
        # so LangChain's PyPDFLoader (which needs a file path) can read them.
        with tempfile.TemporaryDirectory() as temp_dir:
            for uploaded_file in uploaded_files:
                file_path = os.path.join(temp_dir, uploaded_file.name)
                # Write the file bytes to the temporary path
                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                file_paths.append(file_path)

            # 3. Build the retriever and cache it based on the list of paths
            # NOTE: We pass the list of temporary paths to the cached function.
            rag_retriever = load_and_index_documents(file_paths)

    else:
        # Clear the cache if no files are uploaded to ensure a clean state
        st.info("No documents uploaded. Only Wikipedia lookup is enabled.")
        load_and_index_documents.clear()  # Clears the cache for this function

    st.divider()
    st.subheader("🛠️ Features")
    st.info(f"RAG (Document Context) status: {'**ENABLED**' if rag_retriever else 'DISABLED'}")
    st.info("Wikipedia Routing is always active.")
    st.text("MORE COMING SOON ⏱️")

# --- MAIN INTERFACE (Header) ---
st.markdown(
    """
    # 🧠 LangGraph Query Model
    ### Multi-Source RAG Agent
    Ask a question related to your uploaded documents or general knowledge.
    """
)
st.divider()

# --- STATE INITIALIZATION ---
initial_state_base = {
    "documents": [],
    "source": "",
    "api_key": api_key,
    # Pass the dynamically created retriever to the graph state
    "rag_retriever": rag_retriever
}

# --- CHAT INPUT AND LOGIC ---
with st.form(key='query_form', clear_on_submit=True):
    user_query = st.text_input(
        "**Your Question:**",
        placeholder="e.g., What is the significance of the military-industrial complex in Russia?",
        label_visibility="collapsed"
    )
    submit_button = st.form_submit_button(label='Ask the Agent 🚀')


# --- EXECUTION LOGIC ---

if submit_button and user_query:
    if not api_key:
        st.error("🔑 **Error:** Please enter your Groq API Key in the sidebar to run the query.")
        st.stop()

    st.info("🚀 **Querying the Agent...** Please wait.")

    # Prepare state
    initial_state = initial_state_base.copy()
    initial_state["messages"] = [HumanMessage(content=user_query)]

    with st.spinner('Thinking... Routing and Retrieving Context...'):
        try:
            response = app.invoke(initial_state)

            # --- Output Display ---
            final_message = response["messages"][-1].content

            # NOTE(review): this string was mojibake-mangled and split across
            # lines in the dump; reconstructed as a single literal. The emoji
            # throughout this file are best-effort restorations — confirm
            # against the original upload.
            st.success("✅ **Agent Response:**")
            st.markdown(final_message)
            st.divider()

            # Optional: Show debug info
            with st.expander("🔍 **Debug Info (Agent Flow)**"):
                st.write(f"**Final Source:** {response.get('source', 'Unknown')}")
                if 'documents' in response and response['documents']:
                    st.write(f"**Retrieved Documents:** {len(response['documents'])} chunks used.")

        except Exception as e:
            st.error("❌ **Agent Failure:** An error occurred during execution.")
            st.exception(e)

elif not user_query and not api_key:
    st.markdown("👋 Start by entering your **Groq API Key** in the sidebar and asking a question above!")
FinalProject/data/__init__.py
ADDED
|
File without changes
|
FinalProject/data/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (171 Bytes). View file
|
|
|
FinalProject/data/__pycache__/dataingestion.cpython-311.pyc
ADDED
|
Binary file (2.09 kB). View file
|
|
|
FinalProject/data/dataingestion.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from typing import List

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.documents import Document

# PDF source folder, resolved relative to this file so ingestion works from
# any working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

PDF_FOLDER = os.path.join(BASE_DIR, "pdfs")


def load_all_pdfs() -> List[Document]:
    """Load every PDF page under the `pdfs` folder as a list of Documents.

    Creates the folder (and returns an empty list) when it does not exist
    yet; any loader failure is reported and likewise yields an empty list.
    """
    if not os.path.exists(PDF_FOLDER):
        os.makedirs(PDF_FOLDER, exist_ok=True)
        print(f"Created PDF ingestion directory: {PDF_FOLDER}")
        return []

    try:
        all_docs = PyPDFDirectoryLoader(PDF_FOLDER).load()
        print(f"Successfully loaded {len(all_docs)} document pages from the 'pdfs' folder.")

        # Attach a short, path-free source name for display/prompting.
        for doc in all_docs:
            if 'source' in doc.metadata:
                doc.metadata['source_short'] = os.path.basename(doc.metadata['source'])

        return all_docs
    except Exception as e:
        # Best-effort ingestion: report the failure but keep the app running.
        print(f"Error loading PDFs: {e}")
        return []
FinalProject/data/pdfs/RU-MILITARY.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a435f3f6f06baf8883cab3f2c84cbe0be0a1268df3a3da74f2e3f17161dd6a7
|
| 3 |
+
size 1527223
|
FinalProject/data/pdfs/__init__.py
ADDED
|
File without changes
|
FinalProject/models/__init__.py
ADDED
|
File without changes
|
FinalProject/models/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (173 Bytes). View file
|
|
|
FinalProject/models/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (171 Bytes). View file
|
|
|
FinalProject/models/__pycache__/embedding.cpython-311.pyc
ADDED
|
Binary file (458 Bytes). View file
|
|
|
FinalProject/models/__pycache__/embedding.cpython-313.pyc
ADDED
|
Binary file (407 Bytes). View file
|
|
|
FinalProject/models/__pycache__/llm.cpython-311.pyc
ADDED
|
Binary file (449 Bytes). View file
|
|
|
FinalProject/models/__pycache__/retriever.cpython-311.pyc
ADDED
|
Binary file (1.85 kB). View file
|
|
|
FinalProject/models/__pycache__/retriever.cpython-313.pyc
ADDED
|
Binary file (1.6 kB). View file
|
|
|
FinalProject/models/embedding.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from langchain_huggingface import HuggingFaceEmbeddings

def get_embeddings():
    """Return the sentence-transformers embedding model used for indexing.

    Note: HuggingFaceEmbeddings takes the checkpoint name via the
    `model_name` keyword (the original passed `model`, which is not the
    documented parameter).
    """
    return HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2"
    )
FinalProject/models/llm.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from langchain_groq import ChatGroq

# Single Groq-hosted chat model used across the project.
_MODEL_NAME = "llama-3.3-70b-versatile"


def get_llm(api):
    """Build a ChatGroq client authenticated with *api* (a Groq API key)."""
    return ChatGroq(model=_MODEL_NAME, api_key=api)
FinalProject/models/retriever.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from typing import List

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.retrievers import WikipediaRetriever
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

from .embedding import get_embeddings

# Embedding model is created once at import time and shared by every index.
embedder = get_embeddings()


def get_rag_retriever_from_paths(pdf_paths: List[str]):
    """Loads PDFs from a list of paths, splits them, and creates a Chroma retriever."""
    pages = []
    for pdf_path in pdf_paths:
        pages.extend(PyPDFLoader(pdf_path).load())

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=270)
    chunks = splitter.split_documents(pages)

    store = Chroma.from_documents(documents=chunks, embedding=embedder)
    return store.as_retriever()


def get_wiki_retriever():
    """Return a Wikipedia retriever limited to the two best matches."""
    return WikipediaRetriever(top_k_results=2)
FinalProject/requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
langchain-core
|
| 3 |
+
langchain-community
|
| 4 |
+
langchain-text-splitters
|
| 5 |
+
langgraph
|
| 6 |
+
langchain-groq
|
| 7 |
+
langchain-huggingface
|
| 8 |
+
pypdf
|
| 9 |
+
sentence-transformers
|
| 10 |
+
chromadb
|
FinalProject/test.ipynb
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 4,
|
| 6 |
+
"id": "f6960caa",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"data": {
|
| 11 |
+
"text/plain": [
|
| 12 |
+
"[Document(metadata={'source': 'c:\\\\Users\\\\xyzai\\\\Documents\\\\GraphDB\\\\FinalProject\\\\data\\\\pdfs\\\\RU-MILITARY.pdf', 'creationdate': '2014-04-22T10:23:16+02:00', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'trapped': '/False', 'page_label': '3', 'total_pages': 4, 'moddate': '2014-04-22T10:23:18+02:00', 'page': 2, 'producer': 'Adobe PDF Library 10.0.1'}, page_content='developing the military-industrial complex.\\nThe Kremlinβs massive armaments program \\nand its reform of the military-industrial \\ncomplex also has significance in terms of \\nindustrial and social policy. The military in-\\ndustry employs two million workers; five \\nper cent of the Russian population depend \\non it for their livelihood. In this way, the \\nKremlin is βsolvingβ an issue in Soviet style: \\nFunding for the military is once more tak -\\ning on a central role in society. It is hoped \\nthat this will boost innovation and global \\nRussian Demographics\\nRussia has a security apparatus \\nproportionally more than twice \\nthe size of that of the US.'),\n",
|
| 13 |
+
" Document(metadata={'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2014-04-22T10:23:16+02:00', 'source': 'c:\\\\Users\\\\xyzai\\\\Documents\\\\GraphDB\\\\FinalProject\\\\data\\\\pdfs\\\\RU-MILITARY.pdf', 'producer': 'Adobe PDF Library 10.0.1', 'moddate': '2014-04-22T10:23:18+02:00', 'total_pages': 4, 'page_label': '3', 'page': 2, 'trapped': '/False'}, page_content='developing the military-industrial complex.\\nThe Kremlinβs massive armaments program \\nand its reform of the military-industrial \\ncomplex also has significance in terms of \\nindustrial and social policy. The military in-\\ndustry employs two million workers; five \\nper cent of the Russian population depend \\non it for their livelihood. In this way, the \\nKremlin is βsolvingβ an issue in Soviet style: \\nFunding for the military is once more tak -\\ning on a central role in society. It is hoped \\nthat this will boost innovation and global \\nRussian Demographics\\nRussia has a security apparatus \\nproportionally more than twice \\nthe size of that of the US.'),\n",
|
| 14 |
+
" Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'source': 'c:\\\\Users\\\\xyzai\\\\Documents\\\\GraphDB\\\\FinalProject\\\\data\\\\pdfs\\\\RU-MILITARY.pdf', 'total_pages': 4, 'trapped': '/False', 'creationdate': '2014-04-22T10:23:16+02:00', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'page_label': '4', 'moddate': '2014-04-22T10:23:18+02:00', 'page': 3}, page_content='there are bottlenecks in production capaci-\\nty, for example in aircraft production and \\nshipbuilding. The plans to enhance military \\ntransport aviation can only be realized if \\ncapacity is expanded rapidly. Russia also \\ndepends on cooperation with Ukraine: So \\nfar, many motors for helicopters and air -\\ncraft as well as rockets have been produced \\nin Ukraine. Russia lacks the know-how for \\nproducing many of the parts required. The \\ncurrent conflict is putting a strain on this \\ncooperation and necessitates import substi-\\ntutes, which entail great cost and delays.\\nThe Effects of Remilitarization\\nUnder Vladimir Putin, the modernization \\nof Russiaβs armed forces has become a pri -\\nority for the first time since 1991. For sev-\\neral years, considerable sums have been ex-\\npended on this reform. However, challenges \\nremain when it comes to technology and \\norganizational culture; and demographic \\nproblems are also still an issue. Moreover, \\nthe lagging economic output will exacer -'),\n",
|
| 15 |
+
" Document(metadata={'creationdate': '2014-04-22T10:23:16+02:00', 'total_pages': 4, 'moddate': '2014-04-22T10:23:18+02:00', 'trapped': '/False', 'page_label': '4', 'source': 'c:\\\\Users\\\\xyzai\\\\Documents\\\\GraphDB\\\\FinalProject\\\\data\\\\pdfs\\\\RU-MILITARY.pdf', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'page': 3, 'producer': 'Adobe PDF Library 10.0.1'}, page_content='there are bottlenecks in production capaci-\\nty, for example in aircraft production and \\nshipbuilding. The plans to enhance military \\ntransport aviation can only be realized if \\ncapacity is expanded rapidly. Russia also \\ndepends on cooperation with Ukraine: So \\nfar, many motors for helicopters and air -\\ncraft as well as rockets have been produced \\nin Ukraine. Russia lacks the know-how for \\nproducing many of the parts required. The \\ncurrent conflict is putting a strain on this \\ncooperation and necessitates import substi-\\ntutes, which entail great cost and delays.\\nThe Effects of Remilitarization\\nUnder Vladimir Putin, the modernization \\nof Russiaβs armed forces has become a pri -\\nority for the first time since 1991. For sev-\\neral years, considerable sums have been ex-\\npended on this reform. However, challenges \\nremain when it comes to technology and \\norganizational culture; and demographic \\nproblems are also still an issue. Moreover, \\nthe lagging economic output will exacer -')]"
|
| 16 |
+
]
|
| 17 |
+
},
|
| 18 |
+
"execution_count": 4,
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"output_type": "execute_result"
|
| 21 |
+
}
|
| 22 |
+
],
|
| 23 |
+
"source": [
|
| 24 |
+
"from langchain_community.retrievers import WikipediaRetriever\n",
|
| 25 |
+
"from langchain_community.vectorstores import Chroma\n",
|
| 26 |
+
"from data.dataingestion import load_all_pdfs\n",
|
| 27 |
+
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
| 28 |
+
"from langchain_community.vectorstores import Chroma\n",
|
| 29 |
+
"from models.embedding import get_embeddings\n",
|
| 30 |
+
"embeddings = get_embeddings()\n",
|
| 31 |
+
"pdf_data = load_all_pdfs()\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=270)\n",
|
| 34 |
+
"splits = text_splitter.split_documents(pdf_data)\n",
|
| 35 |
+
"vectorstore = Chroma.from_documents(documents=splits,embedding=embeddings)\n",
|
| 36 |
+
"rag_retriever = vectorstore.as_retriever()\n",
|
| 37 |
+
"resp=rag_retriever.invoke(\"russian military\")\n",
|
| 38 |
+
"resp \n"
|
| 39 |
+
]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"cell_type": "code",
|
| 43 |
+
"execution_count": null,
|
| 44 |
+
"id": "261ab304",
|
| 45 |
+
"metadata": {},
|
| 46 |
+
"outputs": [],
|
| 47 |
+
"source": []
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "code",
|
| 51 |
+
"execution_count": null,
|
| 52 |
+
"id": "c7e948d4",
|
| 53 |
+
"metadata": {},
|
| 54 |
+
"outputs": [],
|
| 55 |
+
"source": []
|
| 56 |
+
}
|
| 57 |
+
],
|
| 58 |
+
"metadata": {
|
| 59 |
+
"kernelspec": {
|
| 60 |
+
"display_name": "myenv",
|
| 61 |
+
"language": "python",
|
| 62 |
+
"name": "python3"
|
| 63 |
+
},
|
| 64 |
+
"language_info": {
|
| 65 |
+
"codemirror_mode": {
|
| 66 |
+
"name": "ipython",
|
| 67 |
+
"version": 3
|
| 68 |
+
},
|
| 69 |
+
"file_extension": ".py",
|
| 70 |
+
"mimetype": "text/x-python",
|
| 71 |
+
"name": "python",
|
| 72 |
+
"nbconvert_exporter": "python",
|
| 73 |
+
"pygments_lexer": "ipython3",
|
| 74 |
+
"version": "3.11.8"
|
| 75 |
+
}
|
| 76 |
+
},
|
| 77 |
+
"nbformat": 4,
|
| 78 |
+
"nbformat_minor": 5
|
| 79 |
+
}
|
app.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import sys
import tempfile
from typing import List

import streamlit as st
from langchain_core.messages import HumanMessage

# --- PATH SETUP ---
# Put this file's directory on sys.path BEFORE importing project-local
# packages (agents.*, models.*). In the original ordering the path insert
# happened after those imports, so they only resolved when Streamlit was
# launched from this directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, current_dir)

from agents.graph import app  # compiled LangGraph application used by the UI

# Ensure you have implemented this function in FinalProject/models/retriever.py
# It should accept a list of PDF file paths and return a LangChain Retriever object.
try:
    from models.retriever import get_rag_retriever_from_paths
except ImportError:
    st.error("Could not import get_rag_retriever_from_paths. Please check your models/retriever.py file.")
    sys.exit()

# --- PAGE CONFIGURATION ---
st.set_page_config(
    page_title="GraphQuery RAG Agent",
    page_icon="π€",
    layout="wide"
)
+
|
| 29 |
+
# --- CACHED FUNCTION TO BUILD RAG RETRIEVER ---
# st.cache_resource keys on the argument: the list of file paths is hashed,
# so the expensive indexing step reruns only when that list changes.
@st.cache_resource
def load_and_index_documents(file_paths: List[str]):
    """Loads documents and creates/returns a RAG retriever."""
    if not file_paths:
        # Nothing to index — caller treats None as "RAG disabled".
        return None

    with st.spinner(f"Indexing {len(file_paths)} PDF file(s)... This may take a moment."):
        try:
            # Heavy lifting lives in models/retriever.py.
            retriever = get_rag_retriever_from_paths(file_paths)
            st.success(f"Indexed {len(file_paths)} PDF file(s) successfully!")
        except Exception as e:
            st.error(f"Failed to index documents: {e}")
            return None
        return retriever
|
| 48 |
+
# --- SIDEBAR (Settings, Key, and Upload) ---
with st.sidebar:
    st.header("βοΈ Agent Settings")
    st.caption("Configure your LLM and Access Key.")

    # Per-session Groq key; never persisted server-side.
    api_key = st.text_input(
        "**Groq API Key (Required):**",
        type="password",
        help="Paste your private Groq API Key here. It is used only for this session.",
    )

    st.divider()

    # 1. FILE UPLOAD SECTION
    st.subheader("π Document Upload")
    uploaded_files = st.file_uploader(
        "Upload your own PDFs for RAG context:",
        type=["pdf"],
        accept_multiple_files=True
    )

    # 2. FILE SAVING & INDEXING LOGIC
    file_paths = []
    rag_retriever = None

    if uploaded_files:
        # Uploaded files live in memory; PyPDFLoader needs real paths, so
        # spill each one into a temporary directory first.
        with tempfile.TemporaryDirectory() as temp_dir:
            for pdf in uploaded_files:
                dest = os.path.join(temp_dir, pdf.name)
                with open(dest, "wb") as f:
                    f.write(pdf.getbuffer())
                file_paths.append(dest)

            # 3. Build (and cache) the retriever while the temp files still
            # exist — the directory is wiped when this block exits.
            rag_retriever = load_and_index_documents(file_paths)

    else:
        # No uploads: drop any cached index so state stays clean.
        st.info("No documents uploaded. Only Wikipedia lookup is enabled.")
        load_and_index_documents.clear()


    st.divider()
    st.subheader("π οΈ Features")
    st.info(f"RAG (Document Context) status: {'**ENABLED**' if rag_retriever else 'DISABLED'}")
    st.info("Wikipedia Routing is always active.")
    st.text("MORE COMING SOON β±οΈ")
|
| 101 |
+
# --- MAIN INTERFACE (Header) ---
st.markdown(
    """
# π§  LangGraph Query Model
### Multi-Source RAG Agent
Ask a question related to your uploaded documents or general knowledge.
"""
)
st.divider()

# --- STATE INITIALIZATION ---
# Template for the graph's input state; the user message is added per query.
initial_state_base = {
    "documents": [],
    "source": "",
    "api_key": api_key,
    # Retriever built in the sidebar (None when no PDFs were uploaded).
    "rag_retriever": rag_retriever
}

# --- CHAT INPUT AND LOGIC ---
# clear_on_submit resets the text box after each question.
with st.form(key='query_form', clear_on_submit=True):
    user_query = st.text_input(
        "**Your Question:**",
        placeholder="e.g., What is the significance of the military-industrial complex in Russia?",
        label_visibility="collapsed"
    )
    submit_button = st.form_submit_button(label='Ask the Agent π')
| 130 |
+
# --- EXECUTION LOGIC ---
# Runs one agent invocation per form submission; requires both a query and a key.
if submit_button and user_query:
    if not api_key:
        st.error("π **Error:** Please enter your Groq API Key in the sidebar to run the query.")
        st.stop()

    st.info("π **Querying the Agent...** Please wait.")

    # Prepare state: shallow-copy the base template and attach the user turn.
    initial_state = initial_state_base.copy()
    initial_state["messages"] = [HumanMessage(content=user_query)]

    with st.spinner('Thinking... Routing and Retrieving Context...'):
        try:
            response = app.invoke(initial_state)

            # --- Output Display ---
            # The graph appends its answer; the last message is the reply.
            final_message = response["messages"][-1].content

            # NOTE: the success banner is a single string literal — it was
            # previously broken across two lines, which is a syntax error.
            st.success("β **Agent Response:**")
            st.markdown(final_message)
            st.divider()

            # Optional: show which route/source the agent used.
            with st.expander("π **Debug Info (Agent Flow)**"):
                st.write(f"**Final Source:** {response.get('source', 'Unknown')}")
                if 'documents' in response and response['documents']:
                    st.write(f"**Retrieved Documents:** {len(response['documents'])} chunks used.")

        except Exception as e:
            st.error("β **Agent Failure:** An error occurred during execution.")
            st.exception(e)

# Onboarding hint — shown only when BOTH the query and the key are missing.
elif not user_query and not api_key:
    st.markdown("π Start by entering your **Groq API Key** in the sidebar and asking a question above!")