Dinesh310 committed on
Commit
8586611
·
verified ·
1 Parent(s): c086254

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +133 -108
streamlit_app.py CHANGED
@@ -2,8 +2,11 @@ import streamlit as st
2
  from pathlib import Path
3
  import sys
4
  import os
 
5
 
6
- # Add src to path to ensure imports work correctly
 
 
7
  sys.path.append(str(Path(__file__).parent))
8
 
9
  from src.config.config import Config
@@ -11,164 +14,186 @@ from src.document_ingestion.document_processor import DocumentProcessor
11
  from src.vectorstore.vectorstore import VectorStore
12
  from src.graph_builder.graph_builder import GraphBuilder
13
 
14
- # --- Page Configuration ---
 
 
15
  st.set_page_config(
16
  page_title="Agentic PDF RAG",
17
  page_icon="🧠",
18
  layout="wide"
19
  )
20
 
21
- # Custom CSS for chat styling
 
 
22
  st.markdown("""
23
- <style>
24
- .stChatMessage { border-radius: 10px; margin-bottom: 10px; }
25
- .stSidebar { background-color: #f8f9fa; }
26
- </style>
27
  """, unsafe_allow_html=True)
28
 
 
 
 
29
  def init_session_state():
30
- """Initializes all required session state variables"""
31
- if 'rag_system' not in st.session_state:
32
- st.session_state.rag_system = None
33
- if 'messages' not in st.session_state:
34
- st.session_state.messages = [
35
- {"role": "assistant", "content": "Hello! Please upload PDF documents in the sidebar to begin our technical deep-dive."}
36
- ]
37
- if 'processed_files' not in st.session_state:
38
- st.session_state.processed_files = []
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  def process_documents(uploaded_files):
41
  """
42
- Handles multi-file ingestion:
43
- 1. Loops through all uploaded files
44
- 2. Saves each to a temp path
45
- 3. Aggregates all document chunks
46
- 4. Initializes VectorStore and Graph once
47
  """
48
  try:
49
- doc_processor = DocumentProcessor(
50
  chunk_size=Config.CHUNK_SIZE,
51
  chunk_overlap=Config.CHUNK_OVERLAP
52
  )
53
-
54
- all_docs = []
55
-
56
- # Ensure a temporary directory exists
57
  temp_dir = Path("temp_uploads")
58
  temp_dir.mkdir(exist_ok=True)
59
-
60
- for uploaded_file in uploaded_files:
61
- # 1. Save uploaded bytes to a local string path
62
- temp_path = temp_dir / uploaded_file.name
 
63
  with open(temp_path, "wb") as f:
64
- f.write(uploaded_file.getvalue())
65
-
66
- # 2. Process this specific PDF into chunks
67
- # Assuming your DocumentProcessor.process_pdf takes a string path
68
- docs = doc_processor.process_pdf(str(temp_path))
 
 
 
69
  all_docs.extend(docs)
70
-
71
- # 3. Clean up the temporary file immediately after processing
72
- if temp_path.exists():
73
- os.remove(temp_path)
74
 
75
  if not all_docs:
76
- st.error("No text could be extracted from the uploaded files.")
77
  return None, 0
78
 
79
- # 4. Create Vector Store with the combined list of all chunks
80
  vector_store = VectorStore()
81
  vector_store.create_vectorstore(all_docs)
82
-
83
- # 5. Build the Agentic Graph using the compiled retriever
84
- graph_builder = GraphBuilder(
85
  retriever=vector_store.get_retriever(),
86
  llm=Config.get_llm()
87
  )
88
- graph_builder.build()
89
-
90
- return graph_builder, len(all_docs)
91
 
92
  except Exception as e:
93
- st.error(f"Critical Error during ingestion: {str(e)}")
94
  return None, 0
95
 
 
 
 
96
  def main():
97
  init_session_state()
98
-
99
- # --- Sidebar UI ---
100
  with st.sidebar:
101
- st.header("Document Ingestion")
 
102
  uploaded_files = st.file_uploader(
103
- "Upload PDF files",
104
- type="pdf",
105
- accept_multiple_files=True,
106
- help="You can select multiple files at once."
107
  )
108
-
109
  if st.button("πŸ› οΈ Build Knowledge Base", type="primary"):
110
- if uploaded_files:
111
- with st.spinner("Analyzing PDF structure and generating embeddings..."):
112
- rag_system, num_chunks = process_documents(uploaded_files)
113
- if rag_system:
114
- st.session_state.rag_system = rag_system
115
- st.session_state.processed_files = [f.name for f in uploaded_files]
116
-
117
- # Add success notification to chat
118
- confirm_msg = f"I have successfully indexed {num_chunks} chunks from: {', '.join(st.session_state.processed_files)}."
119
- st.session_state.messages.append({"role": "assistant", "content": confirm_msg})
120
- st.rerun() # Refresh to show the message immediately
121
  else:
122
- st.warning("Please upload at least one PDF first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  if st.session_state.processed_files:
125
  st.markdown("---")
126
- st.subheader("Loaded Documents")
127
  for f in st.session_state.processed_files:
128
- st.caption(f"βœ… {f}")
129
-
130
- if st.button("Clear Chat History"):
131
- st.session_state.messages = [{"role": "assistant", "content": "Chat cleared. How can I help with the current documents?"}]
 
 
132
  st.rerun()
133
 
134
- # --- Main Chat UI ---
135
- st.title("πŸ” Agentic RAG Explorer")
136
- st.caption("Powered by LangGraph & Vector Embeddings")
137
 
138
- # Display existing chat history
139
- for message in st.session_state.messages:
140
- with st.chat_message(message["role"]):
141
- st.markdown(message["content"])
142
 
143
- # Chat Input logic
144
- if prompt := st.chat_input("Ask a question about your documents..."):
145
- st.chat_message("user").markdown(prompt)
146
  st.session_state.messages.append({"role": "user", "content": prompt})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
- if st.session_state.rag_system:
149
- with st.chat_message("assistant"):
150
- with st.spinner("Agent searching knowledge base..."):
151
- try:
152
- # Call your GraphBuilder's run method
153
- result = st.session_state.rag_system.run(prompt)
154
- answer = result.get('answer', "I couldn't find a definitive answer.")
155
- st.markdown(answer)
156
-
157
- # Show Source Citations in an Expander
158
- if result.get('retrieved_docs'):
159
- with st.expander("πŸ” View Referenced Context"):
160
- for i, doc in enumerate(result['retrieved_docs'], 1):
161
- source_name = Path(doc.metadata.get('source', 'Unknown')).name
162
- page_num = doc.metadata.get('page', 'N/A')
163
- st.markdown(f"**Source {i}:** {source_name} (Page {page_num})")
164
- st.info(doc.page_content[:400] + "...")
165
-
166
- st.session_state.messages.append({"role": "assistant", "content": answer})
167
-
168
- except Exception as e:
169
- st.error(f"Search Error: {str(e)}")
170
- else:
171
- st.warning("Please upload and build the knowledge base first!")
172
 
 
173
  if __name__ == "__main__":
174
- main()
 
2
  from pathlib import Path
3
  import sys
4
  import os
5
+ import hashlib
6
 
7
+ # -------------------------------------------------
8
+ # Path setup
9
+ # -------------------------------------------------
10
  sys.path.append(str(Path(__file__).parent))
11
 
12
  from src.config.config import Config
 
14
  from src.vectorstore.vectorstore import VectorStore
15
  from src.graph_builder.graph_builder import GraphBuilder
16
 
17
+ # -------------------------------------------------
18
+ # Page config
19
+ # -------------------------------------------------
20
  st.set_page_config(
21
  page_title="Agentic PDF RAG",
22
  page_icon="🧠",
23
  layout="wide"
24
  )
25
 
26
+ # -------------------------------------------------
27
+ # Styles
28
+ # -------------------------------------------------
29
  st.markdown("""
30
+ <style>
31
+ .stChatMessage { border-radius: 10px; margin-bottom: 10px; }
32
+ .stSidebar { background-color: #f8f9fa; }
33
+ </style>
34
  """, unsafe_allow_html=True)
35
 
36
+ # -------------------------------------------------
37
+ # Session state
38
+ # -------------------------------------------------
39
def init_session_state():
    """Seed st.session_state with every key the app relies on.

    Existing values are never overwritten, so reruns of the Streamlit
    script keep the current conversation and knowledge base intact.
    """
    initial_greeting = {"role": "assistant", "content": "πŸ‘‹ Upload one or more PDFs from the sidebar and start chatting across them."}
    for key, initial_value in (
        ("rag_system", None),
        ("messages", [initial_greeting]),
        ("processed_files", []),
        ("kb_hash", None),
    ):
        if key not in st.session_state:
            st.session_state[key] = initial_value
51
+
52
+ # -------------------------------------------------
53
+ # Helpers
54
+ # -------------------------------------------------
55
def compute_files_hash(files):
    """Return a fingerprint (hex digest) of the uploaded files.

    Used only to skip rebuilding the knowledge base when the exact same
    PDFs are submitted again — this is a cache key, not a security hash,
    so MD5 is acceptable here.

    Each file's name and content are length-prefixed before hashing so
    the encoding is injective: without delimiters, ("a.pdf", b"bc") and
    ("a.pdfb", b"c") would produce the same digest.

    Args:
        files: iterable of objects exposing ``.name`` (str) and
            ``.getvalue()`` (bytes), e.g. Streamlit UploadedFile.

    Returns:
        str: hex digest; order-sensitive with respect to ``files``.
    """
    hasher = hashlib.md5()
    for f in files:
        name_bytes = f.name.encode("utf-8")
        content = f.getvalue()
        # 8-byte big-endian length prefixes delimit each field.
        hasher.update(len(name_bytes).to_bytes(8, "big"))
        hasher.update(name_bytes)
        hasher.update(len(content).to_bytes(8, "big"))
        hasher.update(content)
    return hasher.hexdigest()
62
 
63
def process_documents(uploaded_files):
    """Ingest multiple uploaded PDFs into ONE combined knowledge base.

    Each uploaded file is written to a temporary path (the PDF loader
    needs a real file on disk), chunked, tagged with its original
    filename, and the combined chunks are indexed into a single vector
    store that backs the agent graph.

    Args:
        uploaded_files: iterable of Streamlit UploadedFile objects.

    Returns:
        tuple: (graph, chunk_count) on success, or (None, 0) when no
        text could be extracted or ingestion failed.
    """
    try:
        processor = DocumentProcessor(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP
        )

        temp_dir = Path("temp_uploads")
        temp_dir.mkdir(exist_ok=True)

        all_docs = []

        for file in uploaded_files:
            temp_path = temp_dir / file.name
            with open(temp_path, "wb") as f:
                f.write(file.getvalue())

            try:
                docs = processor.process_pdf(str(temp_path))
            finally:
                # Remove the temp copy even when parsing raises, so
                # failed uploads don't accumulate on disk.
                temp_path.unlink(missing_ok=True)

            # Tag every chunk with the original upload name so source
            # citations show the real filename, not the temp path.
            for d in docs:
                d.metadata["source"] = file.name

            all_docs.extend(docs)

        if not all_docs:
            # Surface the problem instead of failing silently (e.g.
            # scanned, image-only PDFs with no extractable text).
            st.error("No text could be extracted from the uploaded files.")
            return None, 0

        vector_store = VectorStore()
        vector_store.create_vectorstore(all_docs)

        graph = GraphBuilder(
            retriever=vector_store.get_retriever(),
            llm=Config.get_llm()
        )
        graph.build()

        return graph, len(all_docs)

    except Exception as e:
        # Broad catch is intentional at this UI boundary: report the
        # failure in-app and let Streamlit keep running.
        st.error(f"Ingestion failed: {e}")
        return None, 0
109
 
110
+ # -------------------------------------------------
111
+ # Main app
112
+ # -------------------------------------------------
113
def main():
    """Render the Streamlit app: sidebar ingestion controls + chat UI.

    Side effects: mutates st.session_state (rag_system, messages,
    processed_files, kb_hash) and triggers st.rerun() after state
    changes so the UI reflects them immediately.
    """
    init_session_state()

    # ---------------- Sidebar ----------------
    with st.sidebar:
        st.header("πŸ“„ Document Ingestion")

        uploaded_files = st.file_uploader(
            "Upload PDF files",
            type="pdf",
            accept_multiple_files=True
        )

        if st.button("πŸ› οΈ Build Knowledge Base", type="primary"):
            if not uploaded_files:
                st.warning("Upload at least one PDF.")
            else:
                new_hash = compute_files_hash(uploaded_files)

                # Skip re-indexing when the exact same files were
                # already built this session.
                if new_hash == st.session_state.kb_hash:
                    st.info("Knowledge base already built for these PDFs.")
                else:
                    with st.spinner("Indexing PDFs and building agent graph..."):
                        rag, chunks = process_documents(uploaded_files)

                    if rag:
                        st.session_state.rag_system = rag
                        st.session_state.processed_files = [f.name for f in uploaded_files]
                        st.session_state.kb_hash = new_hash

                        msg = (
                            f"βœ… Knowledge base ready!\n\n"
                            f"Indexed **{chunks} chunks** from:\n"
                            + "\n".join(f"- {f}" for f in st.session_state.processed_files)
                        )
                        st.session_state.messages.append({"role": "assistant", "content": msg})
                        st.rerun()

        if st.session_state.processed_files:
            st.markdown("---")
            st.subheader("πŸ“š Loaded PDFs")
            for f in st.session_state.processed_files:
                st.caption(f"βœ” {f}")

            if st.button("🧹 Clear Chat"):
                st.session_state.messages = [
                    {"role": "assistant", "content": "Chat cleared. Ask anything about the loaded PDFs!"}
                ]
                st.rerun()

    # ---------------- Main Chat ----------------
    st.title("πŸ” Agentic Multi-PDF Chat")
    st.caption("Ask questions across all uploaded documents")

    # Replay the running conversation on every rerun.
    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]):
            st.markdown(msg["content"])

    if prompt := st.chat_input("Ask a question across all PDFs..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        st.chat_message("user").markdown(prompt)

        if not st.session_state.rag_system:
            st.warning("Build the knowledge base first.")
            return

        with st.chat_message("assistant"):
            with st.spinner("Thinking across documents..."):
                # Guard the agent call: a retrieval/LLM failure must not
                # crash the whole Streamlit script mid-render.
                try:
                    result = st.session_state.rag_system.run(prompt)
                except Exception as e:
                    st.error(f"Search Error: {e}")
                    return

            answer = result.get("answer", "No clear answer found.")
            st.markdown(answer)

            if result.get("retrieved_docs"):
                with st.expander("πŸ“Œ Sources"):
                    for i, doc in enumerate(result["retrieved_docs"], 1):
                        st.markdown(
                            f"**{i}. {doc.metadata.get('source', 'Unknown')} "
                            f"(Page {doc.metadata.get('page', 'N/A')})**"
                        )
                        st.info(doc.page_content[:400] + "...")

        # Only persist the assistant turn when the agent succeeded.
        st.session_state.messages.append({"role": "assistant", "content": answer})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
# -------------------------------------------------
# Script entry point — run the app when executed directly
# (e.g. `streamlit run streamlit_app.py`).
if __name__ == "__main__":
    main()