dnj0 committed on
Commit
84f81fc
·
verified ·
1 Parent(s): 4c5d0c1

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +60 -99
src/app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  import os
3
  from pathlib import Path
4
  from rag_pipeline import RAGPipeline
5
- import time
6
 
7
  # Page configuration
8
  st.set_page_config(
@@ -20,8 +20,8 @@ if "uploaded_files" not in st.session_state:
20
  st.session_state.uploaded_files = []
21
  if "rag_pipeline" not in st.session_state:
22
  st.session_state.rag_pipeline = None
23
- if "last_upload_time" not in st.session_state:
24
- st.session_state.last_upload_time = 0
25
 
26
  # Sidebar configuration
27
  with st.sidebar:
@@ -33,9 +33,6 @@ with st.sidebar:
33
  help="Path to directory containing PDF files"
34
  )
35
 
36
- # Ensure directory exists
37
- os.makedirs(pdf_dir, exist_ok=True)
38
-
39
  device = st.selectbox(
40
  "πŸ–₯️ Device",
41
  ["cpu", "cuda"],
@@ -52,44 +49,36 @@ with st.sidebar:
52
 
53
  st.divider()
54
 
55
- # PDF Upload Section with Form
56
  st.subheader("πŸ“€ Upload PDF Files")
57
 
58
- # Use a form to separate file upload from submission
59
- with st.form("pdf_upload_form", clear_on_submit=True):
60
- uploaded_pdfs = st.file_uploader(
61
- "Choose PDF files to upload",
62
- type="pdf",
63
- accept_multiple_files=True,
64
- help="Select one or more PDF files to add to the system"
65
- )
 
 
66
 
67
- submit_button = st.form_submit_button("⬆️ Upload PDFs", use_container_width=True)
68
 
69
- if submit_button and uploaded_pdfs:
70
- upload_successful = True
71
  uploaded_count = 0
72
-
73
  for uploaded_file in uploaded_pdfs:
74
- try:
75
- file_path = os.path.join(pdf_dir, uploaded_file.name)
76
-
77
- # Save file to disk
78
- with open(file_path, "wb") as f:
79
- f.write(uploaded_file.getbuffer())
80
-
81
- st.session_state.uploaded_files.append(uploaded_file.name)
82
- uploaded_count += 1
83
-
84
- except Exception as e:
85
- st.error(f"Failed to upload {uploaded_file.name}: {str(e)}")
86
- upload_successful = False
87
 
88
- if upload_successful and uploaded_count > 0:
89
- st.session_state.last_upload_time = time.time()
90
- st.success(f"βœ… Uploaded {uploaded_count} PDF(s) successfully!")
91
- st.info("πŸ“Œ Click 'Reload & Index PDFs' below to process them.")
92
- # Don't call st.rerun() here - let form handle clear_on_submit
93
 
94
  st.divider()
95
 
@@ -97,94 +86,59 @@ with st.sidebar:
97
  pdf_files = list(Path(pdf_dir).glob("*.pdf"))
98
  if pdf_files:
99
  st.subheader(f"πŸ“š Documents ({len(pdf_files)})")
100
-
101
  for pdf_file in pdf_files:
102
  col1, col2 = st.columns([4, 1])
103
  with col1:
104
  st.write(f"β€’ {pdf_file.name}")
105
  with col2:
106
  if st.button("πŸ—‘οΈ", key=f"delete_{pdf_file.name}", help="Delete this file"):
107
- try:
108
- os.remove(pdf_file)
109
- st.session_state.rag_pipeline = None # Clear pipeline
110
- st.success(f"Deleted {pdf_file.name}")
111
- time.sleep(0.5)
112
- st.rerun()
113
- except Exception as e:
114
- st.error(f"Failed to delete: {str(e)}")
115
- else:
116
- st.info("πŸ“­ No PDF files in directory yet")
117
 
118
  st.divider()
119
 
120
- # Reload/Index button
121
- col1, col2 = st.columns(2)
122
- with col1:
123
- if st.button("πŸ”„ Reload & Index", use_container_width=True):
124
- st.session_state.rag_pipeline = None # Clear cached pipeline
125
- st.rerun()
126
-
127
- with col2:
128
- if st.button("πŸ—‘οΈ Clear All", use_container_width=True):
129
- # Delete all PDFs
130
- for pdf_file in Path(pdf_dir).glob("*.pdf"):
131
- try:
132
- os.remove(pdf_file)
133
- except:
134
- pass
135
- st.session_state.rag_pipeline = None
136
- st.session_state.uploaded_files = []
137
- st.success("All PDFs cleared")
138
- time.sleep(0.5)
139
- st.rerun()
140
 
141
 
142
- # Initialize pipeline
143
  @st.cache_resource
144
  def init_rag_pipeline(_device, _pdf_dir):
145
  """Initialize RAG pipeline (cached)"""
 
146
  os.makedirs(_pdf_dir, exist_ok=True)
147
 
 
148
  pdf_files = list(Path(_pdf_dir).glob("*.pdf"))
149
  if not pdf_files:
150
- return None, f"No PDF files found in {_pdf_dir}"
151
 
152
  try:
153
- with st.spinner("⏳ Initializing models..."):
154
  pipeline = RAGPipeline(pdf_dir=_pdf_dir, device=_device)
155
-
156
- with st.spinner("⏳ Indexing PDFs..."):
157
- pipeline.index_pdfs()
158
-
159
  return pipeline, None
160
  except Exception as e:
161
  return None, str(e)
162
 
163
 
164
  # Get or initialize pipeline
165
- if st.session_state.rag_pipeline is None:
166
- pdf_files = list(Path(pdf_dir).glob("*.pdf"))
167
-
168
- if pdf_files:
169
- pipeline, error = init_rag_pipeline(device, pdf_dir)
170
- if error:
171
- st.error(f"❌ Error: {error}")
172
- st.stop()
173
- st.session_state.rag_pipeline = pipeline
174
- else:
175
- st.warning("πŸ“­ No PDF files found")
176
- st.info("""
177
- **How to get started:**
178
- 1. πŸ“€ Upload PDF files using the sidebar file uploader
179
- 2. βœ… Click 'Upload PDFs' to save them
180
- 3. πŸ”„ Click 'Reload & Index PDFs' to process
181
- 4. ❓ Ask questions in the Q&A tab
182
- """)
183
  st.stop()
 
 
184
  else:
185
  pipeline = st.session_state.rag_pipeline
186
 
187
-
188
  # Main content
189
  if pipeline:
190
  # Tabs
@@ -216,19 +170,22 @@ if pipeline:
216
  try:
217
  result = pipeline.answer_question(question, n_context_docs=n_context_docs)
218
  except Exception as e:
219
- st.error(f"Error: {str(e)}")
220
  result = None
221
 
222
  if result and result.get("answer"):
223
  st.success("βœ“ Answer generated!")
224
 
 
225
  st.subheader("πŸ“ Answer")
226
  st.write(result["answer"])
227
 
 
228
  with st.expander("πŸ“š Sources Used"):
229
  for i, source in enumerate(result["sources"], 1):
230
  st.write(f"{i}. {source}")
231
 
 
232
  col1, col2 = st.columns(2)
233
  with col1:
234
  st.metric("Documents Used", result.get("context_used", 0))
@@ -249,7 +206,7 @@ if pipeline:
249
  st.subheader("πŸ“„ Document Summary")
250
  st.write(summary)
251
  except Exception as e:
252
- st.error(f"Error: {str(e)}")
253
 
254
  # Tab 3: Document Retrieval
255
  with tab3:
@@ -296,11 +253,15 @@ if pipeline:
296
  info = pipeline.vector_store.get_collection_info()
297
  col1, col2, col3, col4 = st.columns(4)
298
  with col1:
299
- st.metric("πŸ“š Chunks", info.get("document_count", 0))
300
  with col2:
301
  st.metric("πŸ–₯️ Device", device.upper())
302
  with col3:
303
- st.metric("πŸ” Context", n_context_docs)
304
  with col4:
305
  pdf_count = len(list(Path(pdf_dir).glob("*.pdf")))
306
- st.metric("πŸ“ PDFs", pdf_count)
 
 
 
 
 
2
  import os
3
  from pathlib import Path
4
  from rag_pipeline import RAGPipeline
5
+ import shutil
6
 
7
  # Page configuration
8
  st.set_page_config(
 
20
  st.session_state.uploaded_files = []
21
  if "rag_pipeline" not in st.session_state:
22
  st.session_state.rag_pipeline = None
23
+ if "needs_reindex" not in st.session_state:
24
+ st.session_state.needs_reindex = False
25
 
26
  # Sidebar configuration
27
  with st.sidebar:
 
33
  help="Path to directory containing PDF files"
34
  )
35
 
 
 
 
36
  device = st.selectbox(
37
  "πŸ–₯️ Device",
38
  ["cpu", "cuda"],
 
49
 
50
  st.divider()
51
 
52
+ # PDF Upload Section
53
  st.subheader("πŸ“€ Upload PDF Files")
54
 
55
+ uploaded_pdfs = st.file_uploader(
56
+ "Choose PDF files to upload",
57
+ type="pdf",
58
+ accept_multiple_files=True,
59
+ help="Select one or more PDF files to add to the system"
60
+ )
61
+
62
+ if uploaded_pdfs:
63
+ # Create PDF directory if not exists
64
+ os.makedirs(pdf_dir, exist_ok=True)
65
 
66
+ upload_button = st.button("⬆️ Upload PDFs", use_container_width=True)
67
 
68
+ if upload_button:
 
69
  uploaded_count = 0
 
70
  for uploaded_file in uploaded_pdfs:
71
+ file_path = os.path.join(pdf_dir, uploaded_file.name)
72
+
73
+ # Save file
74
+ with open(file_path, "wb") as f:
75
+ f.write(uploaded_file.getbuffer())
76
+
77
+ st.session_state.uploaded_files.append(uploaded_file.name)
78
+ uploaded_count += 1
 
 
 
 
 
79
 
80
+ st.success(f"βœ… Uploaded {uploaded_count} PDF(s) successfully!")
81
+ st.session_state.needs_reindex = True
 
 
 
82
 
83
  st.divider()
84
 
 
86
  pdf_files = list(Path(pdf_dir).glob("*.pdf"))
87
  if pdf_files:
88
  st.subheader(f"πŸ“š Documents ({len(pdf_files)})")
 
89
  for pdf_file in pdf_files:
90
  col1, col2 = st.columns([4, 1])
91
  with col1:
92
  st.write(f"β€’ {pdf_file.name}")
93
  with col2:
94
  if st.button("πŸ—‘οΈ", key=f"delete_{pdf_file.name}", help="Delete this file"):
95
+ os.remove(pdf_file)
96
+ st.session_state.needs_reindex = True
97
+ st.rerun()
 
 
 
 
 
 
 
98
 
99
  st.divider()
100
 
101
+ # Reindex button
102
+ if st.button("πŸ”„ Reload & Index PDFs", use_container_width=True):
103
+ st.session_state.rag_pipeline = None
104
+ st.session_state.needs_reindex = True
105
+ st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
 
108
+ # Initialize pipeline in session state
109
  @st.cache_resource
110
  def init_rag_pipeline(_device, _pdf_dir):
111
  """Initialize RAG pipeline (cached)"""
112
+ # Create PDF directory if not exists
113
  os.makedirs(_pdf_dir, exist_ok=True)
114
 
115
+ # Check if PDFs exist
116
  pdf_files = list(Path(_pdf_dir).glob("*.pdf"))
117
  if not pdf_files:
118
+ return None, f"No PDF files found in {_pdf_dir}. Upload PDFs using the sidebar."
119
 
120
  try:
121
+ with st.spinner("⏳ Initializing RAG pipeline..."):
122
  pipeline = RAGPipeline(pdf_dir=_pdf_dir, device=_device)
123
+ with st.spinner("⏳ Indexing PDFs..."):
124
+ pipeline.index_pdfs()
 
 
125
  return pipeline, None
126
  except Exception as e:
127
  return None, str(e)
128
 
129
 
130
  # Get or initialize pipeline
131
+ if st.session_state.rag_pipeline is None or st.session_state.needs_reindex:
132
+ pipeline, error = init_rag_pipeline(device, pdf_dir)
133
+ if error:
134
+ st.error(f"❌ Error: {error}")
135
+ st.info("πŸ’‘ **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  st.stop()
137
+ st.session_state.rag_pipeline = pipeline
138
+ st.session_state.needs_reindex = False
139
  else:
140
  pipeline = st.session_state.rag_pipeline
141
 
 
142
  # Main content
143
  if pipeline:
144
  # Tabs
 
170
  try:
171
  result = pipeline.answer_question(question, n_context_docs=n_context_docs)
172
  except Exception as e:
173
+ st.error(f"Error generating answer: {str(e)}")
174
  result = None
175
 
176
  if result and result.get("answer"):
177
  st.success("βœ“ Answer generated!")
178
 
179
+ # Display answer
180
  st.subheader("πŸ“ Answer")
181
  st.write(result["answer"])
182
 
183
+ # Display sources
184
  with st.expander("πŸ“š Sources Used"):
185
  for i, source in enumerate(result["sources"], 1):
186
  st.write(f"{i}. {source}")
187
 
188
+ # Display stats
189
  col1, col2 = st.columns(2)
190
  with col1:
191
  st.metric("Documents Used", result.get("context_used", 0))
 
206
  st.subheader("πŸ“„ Document Summary")
207
  st.write(summary)
208
  except Exception as e:
209
+ st.error(f"Error generating summary: {str(e)}")
210
 
211
  # Tab 3: Document Retrieval
212
  with tab3:
 
253
  info = pipeline.vector_store.get_collection_info()
254
  col1, col2, col3, col4 = st.columns(4)
255
  with col1:
256
+ st.metric("πŸ“š Documents", info.get("document_count", 0))
257
  with col2:
258
  st.metric("πŸ–₯️ Device", device.upper())
259
  with col3:
260
+ st.metric("πŸ” Context Docs", n_context_docs)
261
  with col4:
262
  pdf_count = len(list(Path(pdf_dir).glob("*.pdf")))
263
+ st.metric("πŸ“ PDF Files", pdf_count)
264
+
265
+ else:
266
+ st.error("❌ Failed to initialize RAG pipeline")
267
+ st.info("πŸ’‘ **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")