dnj0 committed on
Commit
d3aa2b9
·
verified ·
1 Parent(s): 84f81fc

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +99 -60
src/app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  import os
3
  from pathlib import Path
4
  from rag_pipeline import RAGPipeline
5
- import shutil
6
 
7
  # Page configuration
8
  st.set_page_config(
@@ -20,8 +20,8 @@ if "uploaded_files" not in st.session_state:
20
  st.session_state.uploaded_files = []
21
  if "rag_pipeline" not in st.session_state:
22
  st.session_state.rag_pipeline = None
23
- if "needs_reindex" not in st.session_state:
24
- st.session_state.needs_reindex = False
25
 
26
  # Sidebar configuration
27
  with st.sidebar:
@@ -33,6 +33,9 @@ with st.sidebar:
33
  help="Path to directory containing PDF files"
34
  )
35
 
 
 
 
36
  device = st.selectbox(
37
  "πŸ–₯️ Device",
38
  ["cpu", "cuda"],
@@ -49,36 +52,44 @@ with st.sidebar:
49
 
50
  st.divider()
51
 
52
- # PDF Upload Section
53
  st.subheader("πŸ“€ Upload PDF Files")
54
 
55
- uploaded_pdfs = st.file_uploader(
56
- "Choose PDF files to upload",
57
- type="pdf",
58
- accept_multiple_files=True,
59
- help="Select one or more PDF files to add to the system"
60
- )
61
-
62
- if uploaded_pdfs:
63
- # Create PDF directory if not exists
64
- os.makedirs(pdf_dir, exist_ok=True)
65
 
66
- upload_button = st.button("⬆️ Upload PDFs", use_container_width=True)
67
 
68
- if upload_button:
 
69
  uploaded_count = 0
 
70
  for uploaded_file in uploaded_pdfs:
71
- file_path = os.path.join(pdf_dir, uploaded_file.name)
72
-
73
- # Save file
74
- with open(file_path, "wb") as f:
75
- f.write(uploaded_file.getbuffer())
76
-
77
- st.session_state.uploaded_files.append(uploaded_file.name)
78
- uploaded_count += 1
 
 
 
 
 
79
 
80
- st.success(f"βœ… Uploaded {uploaded_count} PDF(s) successfully!")
81
- st.session_state.needs_reindex = True
 
 
 
82
 
83
  st.divider()
84
 
@@ -86,59 +97,94 @@ with st.sidebar:
86
  pdf_files = list(Path(pdf_dir).glob("*.pdf"))
87
  if pdf_files:
88
  st.subheader(f"πŸ“š Documents ({len(pdf_files)})")
 
89
  for pdf_file in pdf_files:
90
  col1, col2 = st.columns([4, 1])
91
  with col1:
92
  st.write(f"β€’ {pdf_file.name}")
93
  with col2:
94
  if st.button("πŸ—‘οΈ", key=f"delete_{pdf_file.name}", help="Delete this file"):
95
- os.remove(pdf_file)
96
- st.session_state.needs_reindex = True
97
- st.rerun()
 
 
 
 
 
 
 
98
 
99
  st.divider()
100
 
101
- # Reindex button
102
- if st.button("πŸ”„ Reload & Index PDFs", use_container_width=True):
103
- st.session_state.rag_pipeline = None
104
- st.session_state.needs_reindex = True
105
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
 
108
- # Initialize pipeline in session state
109
  @st.cache_resource
110
  def init_rag_pipeline(_device, _pdf_dir):
111
  """Initialize RAG pipeline (cached)"""
112
- # Create PDF directory if not exists
113
  os.makedirs(_pdf_dir, exist_ok=True)
114
 
115
- # Check if PDFs exist
116
  pdf_files = list(Path(_pdf_dir).glob("*.pdf"))
117
  if not pdf_files:
118
- return None, f"No PDF files found in {_pdf_dir}. Upload PDFs using the sidebar."
119
 
120
  try:
121
- with st.spinner("⏳ Initializing RAG pipeline..."):
122
  pipeline = RAGPipeline(pdf_dir=_pdf_dir, device=_device)
123
- with st.spinner("⏳ Indexing PDFs..."):
124
- pipeline.index_pdfs()
 
 
125
  return pipeline, None
126
  except Exception as e:
127
  return None, str(e)
128
 
129
 
130
  # Get or initialize pipeline
131
- if st.session_state.rag_pipeline is None or st.session_state.needs_reindex:
132
- pipeline, error = init_rag_pipeline(device, pdf_dir)
133
- if error:
134
- st.error(f"❌ Error: {error}")
135
- st.info("πŸ’‘ **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  st.stop()
137
- st.session_state.rag_pipeline = pipeline
138
- st.session_state.needs_reindex = False
139
  else:
140
  pipeline = st.session_state.rag_pipeline
141
 
 
142
  # Main content
143
  if pipeline:
144
  # Tabs
@@ -170,22 +216,19 @@ if pipeline:
170
  try:
171
  result = pipeline.answer_question(question, n_context_docs=n_context_docs)
172
  except Exception as e:
173
- st.error(f"Error generating answer: {str(e)}")
174
  result = None
175
 
176
  if result and result.get("answer"):
177
  st.success("βœ“ Answer generated!")
178
 
179
- # Display answer
180
  st.subheader("πŸ“ Answer")
181
  st.write(result["answer"])
182
 
183
- # Display sources
184
  with st.expander("πŸ“š Sources Used"):
185
  for i, source in enumerate(result["sources"], 1):
186
  st.write(f"{i}. {source}")
187
 
188
- # Display stats
189
  col1, col2 = st.columns(2)
190
  with col1:
191
  st.metric("Documents Used", result.get("context_used", 0))
@@ -206,7 +249,7 @@ if pipeline:
206
  st.subheader("πŸ“„ Document Summary")
207
  st.write(summary)
208
  except Exception as e:
209
- st.error(f"Error generating summary: {str(e)}")
210
 
211
  # Tab 3: Document Retrieval
212
  with tab3:
@@ -253,15 +296,11 @@ if pipeline:
253
  info = pipeline.vector_store.get_collection_info()
254
  col1, col2, col3, col4 = st.columns(4)
255
  with col1:
256
- st.metric("πŸ“š Documents", info.get("document_count", 0))
257
  with col2:
258
  st.metric("πŸ–₯️ Device", device.upper())
259
  with col3:
260
- st.metric("πŸ” Context Docs", n_context_docs)
261
  with col4:
262
  pdf_count = len(list(Path(pdf_dir).glob("*.pdf")))
263
- st.metric("πŸ“ PDF Files", pdf_count)
264
-
265
- else:
266
- st.error("❌ Failed to initialize RAG pipeline")
267
- st.info("πŸ’‘ **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")
 
2
  import os
3
  from pathlib import Path
4
  from rag_pipeline import RAGPipeline
5
+ import time
6
 
7
  # Page configuration
8
  st.set_page_config(
 
20
  st.session_state.uploaded_files = []
21
  if "rag_pipeline" not in st.session_state:
22
  st.session_state.rag_pipeline = None
23
+ if "last_upload_time" not in st.session_state:
24
+ st.session_state.last_upload_time = 0
25
 
26
  # Sidebar configuration
27
  with st.sidebar:
 
33
  help="Path to directory containing PDF files"
34
  )
35
 
36
+ # Ensure directory exists
37
+ os.makedirs(pdf_dir, exist_ok=True)
38
+
39
  device = st.selectbox(
40
  "πŸ–₯️ Device",
41
  ["cpu", "cuda"],
 
52
 
53
  st.divider()
54
 
55
+ # PDF Upload Section with Form
56
  st.subheader("πŸ“€ Upload PDF Files")
57
 
58
+ # Use a form to separate file upload from submission
59
+ with st.form("pdf_upload_form", clear_on_submit=True):
60
+ uploaded_pdfs = st.file_uploader(
61
+ "Choose PDF files to upload",
62
+ type="pdf",
63
+ accept_multiple_files=True,
64
+ help="Select one or more PDF files to add to the system"
65
+ )
 
 
66
 
67
+ submit_button = st.form_submit_button("⬆️ Upload PDFs", use_container_width=True)
68
 
69
+ if submit_button and uploaded_pdfs:
70
+ upload_successful = True
71
  uploaded_count = 0
72
+
73
  for uploaded_file in uploaded_pdfs:
74
+ try:
75
+ file_path = os.path.join(pdf_dir, uploaded_file.name)
76
+
77
+ # Save file to disk
78
+ with open(file_path, "wb") as f:
79
+ f.write(uploaded_file.getbuffer())
80
+
81
+ st.session_state.uploaded_files.append(uploaded_file.name)
82
+ uploaded_count += 1
83
+
84
+ except Exception as e:
85
+ st.error(f"Failed to upload {uploaded_file.name}: {str(e)}")
86
+ upload_successful = False
87
 
88
+ if upload_successful and uploaded_count > 0:
89
+ st.session_state.last_upload_time = time.time()
90
+ st.success(f"βœ… Uploaded {uploaded_count} PDF(s) successfully!")
91
+ st.info("πŸ“Œ Click 'Reload & Index PDFs' below to process them.")
92
+ # Don't call st.rerun() here - let form handle clear_on_submit
93
 
94
  st.divider()
95
 
 
97
  pdf_files = list(Path(pdf_dir).glob("*.pdf"))
98
  if pdf_files:
99
  st.subheader(f"πŸ“š Documents ({len(pdf_files)})")
100
+
101
  for pdf_file in pdf_files:
102
  col1, col2 = st.columns([4, 1])
103
  with col1:
104
  st.write(f"β€’ {pdf_file.name}")
105
  with col2:
106
  if st.button("πŸ—‘οΈ", key=f"delete_{pdf_file.name}", help="Delete this file"):
107
+ try:
108
+ os.remove(pdf_file)
109
+ st.session_state.rag_pipeline = None # Clear pipeline
110
+ st.success(f"Deleted {pdf_file.name}")
111
+ time.sleep(0.5)
112
+ st.rerun()
113
+ except Exception as e:
114
+ st.error(f"Failed to delete: {str(e)}")
115
+ else:
116
+ st.info("πŸ“­ No PDF files in directory yet")
117
 
118
  st.divider()
119
 
120
+ # Reload/Index button
121
+ col1, col2 = st.columns(2)
122
+ with col1:
123
+ if st.button("πŸ”„ Reload & Index", use_container_width=True):
124
+ st.session_state.rag_pipeline = None # Clear cached pipeline
125
+ st.rerun()
126
+
127
+ with col2:
128
+ if st.button("πŸ—‘οΈ Clear All", use_container_width=True):
129
+ # Delete all PDFs
130
+ for pdf_file in Path(pdf_dir).glob("*.pdf"):
131
+ try:
132
+ os.remove(pdf_file)
133
+ except:
134
+ pass
135
+ st.session_state.rag_pipeline = None
136
+ st.session_state.uploaded_files = []
137
+ st.success("All PDFs cleared")
138
+ time.sleep(0.5)
139
+ st.rerun()
140
 
141
 
142
+ # Initialize pipeline
143
  @st.cache_resource
144
  def init_rag_pipeline(_device, _pdf_dir):
145
  """Initialize RAG pipeline (cached)"""
 
146
  os.makedirs(_pdf_dir, exist_ok=True)
147
 
 
148
  pdf_files = list(Path(_pdf_dir).glob("*.pdf"))
149
  if not pdf_files:
150
+ return None, f"No PDF files found in {_pdf_dir}"
151
 
152
  try:
153
+ with st.spinner("⏳ Initializing models..."):
154
  pipeline = RAGPipeline(pdf_dir=_pdf_dir, device=_device)
155
+
156
+ with st.spinner("⏳ Indexing PDFs..."):
157
+ pipeline.index_pdfs()
158
+
159
  return pipeline, None
160
  except Exception as e:
161
  return None, str(e)
162
 
163
 
164
  # Get or initialize pipeline
165
+ if st.session_state.rag_pipeline is None:
166
+ pdf_files = list(Path(pdf_dir).glob("*.pdf"))
167
+
168
+ if pdf_files:
169
+ pipeline, error = init_rag_pipeline(device, pdf_dir)
170
+ if error:
171
+ st.error(f"❌ Error: {error}")
172
+ st.stop()
173
+ st.session_state.rag_pipeline = pipeline
174
+ else:
175
+ st.warning("πŸ“­ No PDF files found")
176
+ st.info("""
177
+ **How to get started:**
178
+ 1. πŸ“€ Upload PDF files using the sidebar file uploader
179
+ 2. βœ… Click 'Upload PDFs' to save them
180
+ 3. πŸ”„ Click 'Reload & Index PDFs' to process
181
+ 4. ❓ Ask questions in the Q&A tab
182
+ """)
183
  st.stop()
 
 
184
  else:
185
  pipeline = st.session_state.rag_pipeline
186
 
187
+
188
  # Main content
189
  if pipeline:
190
  # Tabs
 
216
  try:
217
  result = pipeline.answer_question(question, n_context_docs=n_context_docs)
218
  except Exception as e:
219
+ st.error(f"Error: {str(e)}")
220
  result = None
221
 
222
  if result and result.get("answer"):
223
  st.success("βœ“ Answer generated!")
224
 
 
225
  st.subheader("πŸ“ Answer")
226
  st.write(result["answer"])
227
 
 
228
  with st.expander("πŸ“š Sources Used"):
229
  for i, source in enumerate(result["sources"], 1):
230
  st.write(f"{i}. {source}")
231
 
 
232
  col1, col2 = st.columns(2)
233
  with col1:
234
  st.metric("Documents Used", result.get("context_used", 0))
 
249
  st.subheader("πŸ“„ Document Summary")
250
  st.write(summary)
251
  except Exception as e:
252
+ st.error(f"Error: {str(e)}")
253
 
254
  # Tab 3: Document Retrieval
255
  with tab3:
 
296
  info = pipeline.vector_store.get_collection_info()
297
  col1, col2, col3, col4 = st.columns(4)
298
  with col1:
299
+ st.metric("πŸ“š Chunks", info.get("document_count", 0))
300
  with col2:
301
  st.metric("πŸ–₯️ Device", device.upper())
302
  with col3:
303
+ st.metric("πŸ” Context", n_context_docs)
304
  with col4:
305
  pdf_count = len(list(Path(pdf_dir).glob("*.pdf")))
306
+ st.metric("πŸ“ PDFs", pdf_count)