cryogenic22 committed on
Commit
d7c897b
·
verified ·
1 Parent(s): acec4a7

Update utils/database.py

Browse files
Files changed (1) hide show
  1. utils/database.py +42 -31
utils/database.py CHANGED
@@ -146,24 +146,24 @@ def verify_vector_store(vector_store):
146
 
147
  def handle_document_upload(uploaded_files):
148
  """Handle document upload with improved chunking and progress tracking."""
 
 
 
 
 
 
149
  try:
150
  # Initialize session state variables
151
- # Initialize persistence manager
152
- persistence = PersistenceManager()
153
-
154
- # Generate a session ID based on timestamp and files
155
- session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
156
-
157
  if 'qa_system' not in st.session_state:
158
  st.session_state.qa_system = None
159
  if 'vector_store' not in st.session_state:
160
  st.session_state.vector_store = None
161
 
162
- # Create progress containers
163
- progress_container = st.empty()
164
- status_container = st.empty()
165
- details_container = st.empty()
166
- progress_bar = progress_container.progress(0)
167
 
168
  # Initialize embeddings (10% progress)
169
  status_container.info("🔄 Initializing embeddings model...")
@@ -175,7 +175,7 @@ def handle_document_upload(uploaded_files):
175
 
176
  # Initialize document chunker
177
  chunker = DocumentChunker(
178
- chunk_size=1000, # Adjust these parameters based on your needs
179
  chunk_overlap=200,
180
  max_tokens_per_chunk=2000
181
  )
@@ -195,18 +195,26 @@ def handle_document_upload(uploaded_files):
195
  tmp_file.write(uploaded_file.getvalue())
196
  tmp_file.flush()
197
 
198
- # Load PDF content
199
- loader = PyPDFLoader(tmp_file.name)
200
- pdf_documents = loader.load()
201
- content = "\n".join(doc.page_content for doc in pdf_documents)
202
-
203
- # Store original content in database
204
- doc_id = insert_document(st.session_state.db_conn, file_name, content)
205
- if not doc_id:
206
- status_container.error(f"❌ Failed to store document: {file_name}")
207
- continue
208
-
209
- document_pairs.append((content, file_name))
 
 
 
 
 
 
 
 
210
 
211
  current_progress += progress_per_file
212
  progress_bar.progress(int(current_progress))
@@ -225,7 +233,8 @@ def handle_document_upload(uploaded_files):
225
  return
226
 
227
  progress_bar.progress(80)
228
- # After chunking documents
 
229
  persistence.save_chunks(chunks, chunk_metadatas, session_id)
230
 
231
  # Initialize vector store (90% progress)
@@ -237,11 +246,12 @@ def handle_document_upload(uploaded_files):
237
  status_container.error("❌ Failed to initialize vector store")
238
  return
239
 
240
- st.session_state.vector_store = vector_store
241
- progress_bar.progress(90)
242
  persistence.save_vector_store(vector_store, session_id)
243
- # Store session ID in state
244
  st.session_state.current_session_id = session_id
 
 
245
  # Initialize QA system (100% progress)
246
  status_container.info("🔄 Setting up QA system...")
247
  qa_system = initialize_qa_system(vector_store)
@@ -260,8 +270,9 @@ def handle_document_upload(uploaded_files):
260
  - Documents processed: {len(document_pairs)}
261
  - Total chunks created: {len(chunks)}
262
  - Average chunk size: {sum(len(chunk) for chunk in chunks) / len(chunks):.0f} characters
263
- - Vector store initialized
264
  - QA system ready
 
265
 
266
  You can now start asking questions about your documents!
267
  """)
@@ -270,14 +281,14 @@ def handle_document_upload(uploaded_files):
270
  st.session_state.chat_ready = True
271
 
272
  except Exception as e:
273
- status_container.error(f"❌ Error processing documents: {e}")
274
  details_container.error(traceback.format_exc())
275
  st.session_state.vector_store = None
276
  st.session_state.qa_system = None
277
  st.session_state.chat_ready = False
278
- st.error(f"Error in document upload: {str(e)}")
279
 
280
  finally:
 
281
  if st.session_state.get('qa_system') is not None:
282
  time.sleep(5)
283
  progress_container.empty()
 
146
 
147
  def handle_document_upload(uploaded_files):
148
  """Handle document upload with improved chunking and progress tracking."""
149
+ # Initialize containers first - before any processing
150
+ progress_container = st.empty()
151
+ status_container = st.empty()
152
+ details_container = st.empty()
153
+ progress_bar = progress_container.progress(0)
154
+
155
  try:
156
  # Initialize session state variables
 
 
 
 
 
 
157
  if 'qa_system' not in st.session_state:
158
  st.session_state.qa_system = None
159
  if 'vector_store' not in st.session_state:
160
  st.session_state.vector_store = None
161
 
162
+ # Initialize persistence manager
163
+ persistence = PersistenceManager()
164
+
165
+ # Generate a session ID based on timestamp and files
166
+ session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
167
 
168
  # Initialize embeddings (10% progress)
169
  status_container.info("🔄 Initializing embeddings model...")
 
175
 
176
  # Initialize document chunker
177
  chunker = DocumentChunker(
178
+ chunk_size=1000,
179
  chunk_overlap=200,
180
  max_tokens_per_chunk=2000
181
  )
 
195
  tmp_file.write(uploaded_file.getvalue())
196
  tmp_file.flush()
197
 
198
+ try:
199
+ # Load PDF content
200
+ loader = PyPDFLoader(tmp_file.name)
201
+ pdf_documents = loader.load()
202
+ content = "\n".join(doc.page_content for doc in pdf_documents)
203
+
204
+ # Store original content in database
205
+ doc_id = insert_document(st.session_state.db_conn, file_name, content)
206
+ if not doc_id:
207
+ status_container.error(f"❌ Failed to store document: {file_name}")
208
+ continue
209
+
210
+ document_pairs.append((content, file_name))
211
+
212
+ finally:
213
+ # Ensure temporary file is cleaned up
214
+ try:
215
+ os.unlink(tmp_file.name)
216
+ except Exception as e:
217
+ st.warning(f"Could not delete temporary file: {e}")
218
 
219
  current_progress += progress_per_file
220
  progress_bar.progress(int(current_progress))
 
233
  return
234
 
235
  progress_bar.progress(80)
236
+
237
+ # Save chunks for persistence
238
  persistence.save_chunks(chunks, chunk_metadatas, session_id)
239
 
240
  # Initialize vector store (90% progress)
 
246
  status_container.error("❌ Failed to initialize vector store")
247
  return
248
 
249
+ # Save vector store and update session state
 
250
  persistence.save_vector_store(vector_store, session_id)
251
+ st.session_state.vector_store = vector_store
252
  st.session_state.current_session_id = session_id
253
+ progress_bar.progress(90)
254
+
255
  # Initialize QA system (100% progress)
256
  status_container.info("🔄 Setting up QA system...")
257
  qa_system = initialize_qa_system(vector_store)
 
270
  - Documents processed: {len(document_pairs)}
271
  - Total chunks created: {len(chunks)}
272
  - Average chunk size: {sum(len(chunk) for chunk in chunks) / len(chunks):.0f} characters
273
+ - Vector store initialized and saved
274
  - QA system ready
275
+ - Session ID: {session_id}
276
 
277
  You can now start asking questions about your documents!
278
  """)
 
281
  st.session_state.chat_ready = True
282
 
283
  except Exception as e:
284
+ status_container.error(f"❌ Error processing documents: {str(e)}")
285
  details_container.error(traceback.format_exc())
286
  st.session_state.vector_store = None
287
  st.session_state.qa_system = None
288
  st.session_state.chat_ready = False
 
289
 
290
  finally:
291
+ # Clean up progress display after successful processing
292
  if st.session_state.get('qa_system') is not None:
293
  time.sleep(5)
294
  progress_container.empty()