cryogenic22 committed on
Commit
368e7bf
·
verified ·
1 Parent(s): 354813e

Update utils/database.py

Browse files
Files changed (1) hide show
  1. utils/database.py +43 -57
utils/database.py CHANGED
@@ -148,29 +148,20 @@ def verify_vector_store(vector_store):
148
 
149
 
150
  def handle_document_upload(uploaded_files):
151
- """Handle document upload with progress tracking."""
152
  try:
153
- # Initialize session state variables if they don't exist
154
  if 'qa_system' not in st.session_state:
155
  st.session_state.qa_system = None
156
  if 'vector_store' not in st.session_state:
157
  st.session_state.vector_store = None
158
 
159
- # Create a progress container
160
  progress_container = st.empty()
161
  status_container = st.empty()
162
  details_container = st.empty()
163
-
164
- # Initialize progress bar
165
  progress_bar = progress_container.progress(0)
166
- status_container.info("🔄 Initializing document processing...")
167
 
168
- # Reset existing states
169
- if st.session_state.vector_store is not None:
170
- st.session_state.vector_store = None
171
- if st.session_state.qa_system is not None:
172
- st.session_state.qa_system = None
173
-
174
  # Initialize embeddings (10% progress)
175
  status_container.info("🔄 Initializing embeddings model...")
176
  embeddings = get_embeddings_model()
@@ -179,12 +170,16 @@ def handle_document_upload(uploaded_files):
179
  return
180
  progress_bar.progress(10)
181
 
182
- # Process documents
183
- documents = []
184
- document_names = []
 
 
 
185
 
186
- # Calculate progress steps per file
187
- progress_per_file = 70 / len(uploaded_files) # 70% of progress for file processing
 
188
  current_progress = 10
189
 
190
  for idx, uploaded_file in enumerate(uploaded_files):
@@ -192,98 +187,89 @@ def handle_document_upload(uploaded_files):
192
  status_container.info(f"🔄 Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
193
  details_container.text(f"📄 Current file: {file_name}")
194
 
195
- # Create a temporary file to save the PDF
196
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
197
- # Write the uploaded file content to the temporary file
198
  tmp_file.write(uploaded_file.getvalue())
199
  tmp_file.flush()
200
 
201
- # Use PyPDFLoader to load the PDF
202
  loader = PyPDFLoader(tmp_file.name)
203
  pdf_documents = loader.load()
204
-
205
- # Extract text content from the PDF
206
  content = "\n".join(doc.page_content for doc in pdf_documents)
207
 
208
- # Store in database
209
- details_container.text(f"💾 Storing {file_name} in database...")
210
  doc_id = insert_document(st.session_state.db_conn, file_name, content)
211
  if not doc_id:
212
  status_container.error(f"❌ Failed to store document: {file_name}")
213
  continue
214
 
215
- documents.append(content)
216
- document_names.append(file_name)
217
 
218
- # Update progress
219
  current_progress += progress_per_file
220
  progress_bar.progress(int(current_progress))
221
-
222
- if not documents:
223
  status_container.error("❌ No documents were successfully processed")
224
  return
225
-
226
- # Initialize vector store (80-90% progress)
 
 
 
 
 
 
 
 
 
 
 
227
  status_container.info("🔄 Initializing vector store...")
228
  details_container.text("🔍 Creating vector embeddings...")
229
- vector_store = initialize_faiss(embeddings, documents, document_names)
 
230
  if not vector_store:
231
  status_container.error("❌ Failed to initialize vector store")
232
  return
233
 
234
- # Store vector store in session state
235
  st.session_state.vector_store = vector_store
236
  progress_bar.progress(90)
237
 
238
- # Verify vector store
239
- status_container.info("🔄 Verifying document indexing...")
240
- details_container.text("✨ Performing final checks...")
241
- if not verify_vector_store(vector_store):
242
- status_container.error("❌ Vector store verification failed")
243
- return
244
-
245
- # Initialize QA system (90-100% progress)
246
  status_container.info("🔄 Setting up QA system...")
247
  qa_system = initialize_qa_system(vector_store)
 
248
  if not qa_system:
249
  status_container.error("❌ Failed to initialize QA system")
250
  return
251
-
252
- # Store QA system in session state
253
- st.session_state.qa_system = qa_system
254
 
255
- # Complete!
256
  progress_bar.progress(100)
 
 
257
  status_container.success("✅ Documents processed successfully!")
258
- details_container.markdown("""
259
  🎉 **Ready to chat!**
260
- - Documents loaded: {}
261
- - Total content size: {:.2f} KB
 
262
  - Vector store initialized
263
  - QA system ready
264
 
265
  You can now start asking questions about your documents!
266
- """.format(
267
- len(documents),
268
- sum(len(doc) for doc in documents) / 1024
269
- ))
270
 
271
- # Add notification
272
  st.balloons()
273
-
274
- # Set chat ready flag
275
  st.session_state.chat_ready = True
276
 
277
  except Exception as e:
278
  status_container.error(f"❌ Error processing documents: {e}")
279
  details_container.error(traceback.format_exc())
280
- # Reset states on error
281
  st.session_state.vector_store = None
282
  st.session_state.qa_system = None
283
  st.session_state.chat_ready = False
284
 
285
  finally:
286
- # Clean up progress display after 5 seconds if successful
287
  if st.session_state.get('qa_system') is not None:
288
  time.sleep(5)
289
  progress_container.empty()
 
148
 
149
 
150
  def handle_document_upload(uploaded_files):
151
+ """Handle document upload with improved chunking and progress tracking."""
152
  try:
153
+ # Initialize session state variables
154
  if 'qa_system' not in st.session_state:
155
  st.session_state.qa_system = None
156
  if 'vector_store' not in st.session_state:
157
  st.session_state.vector_store = None
158
 
159
+ # Create progress containers
160
  progress_container = st.empty()
161
  status_container = st.empty()
162
  details_container = st.empty()
 
 
163
  progress_bar = progress_container.progress(0)
 
164
 
 
 
 
 
 
 
165
  # Initialize embeddings (10% progress)
166
  status_container.info("🔄 Initializing embeddings model...")
167
  embeddings = get_embeddings_model()
 
170
  return
171
  progress_bar.progress(10)
172
 
173
+ # Initialize document chunker
174
+ chunker = DocumentChunker(
175
+ chunk_size=1000, # Adjust these parameters based on your needs
176
+ chunk_overlap=200,
177
+ max_tokens_per_chunk=2000
178
+ )
179
 
180
+ # Process documents
181
+ document_pairs = [] # List to store (content, filename) pairs
182
+ progress_per_file = 70 / len(uploaded_files)
183
  current_progress = 10
184
 
185
  for idx, uploaded_file in enumerate(uploaded_files):
 
187
  status_container.info(f"🔄 Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
188
  details_container.text(f"📄 Current file: {file_name}")
189
 
190
+ # Create temporary file for PDF processing
191
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
 
192
  tmp_file.write(uploaded_file.getvalue())
193
  tmp_file.flush()
194
 
195
+ # Load PDF content
196
  loader = PyPDFLoader(tmp_file.name)
197
  pdf_documents = loader.load()
 
 
198
  content = "\n".join(doc.page_content for doc in pdf_documents)
199
 
200
+ # Store original content in database
 
201
  doc_id = insert_document(st.session_state.db_conn, file_name, content)
202
  if not doc_id:
203
  status_container.error(f"❌ Failed to store document: {file_name}")
204
  continue
205
 
206
+ document_pairs.append((content, file_name))
 
207
 
 
208
  current_progress += progress_per_file
209
  progress_bar.progress(int(current_progress))
210
+
211
+ if not document_pairs:
212
  status_container.error("❌ No documents were successfully processed")
213
  return
214
+
215
+ # Chunk documents (80% progress)
216
+ status_container.info("🔄 Chunking documents...")
217
+ details_container.text("📑 Splitting documents into manageable chunks...")
218
+ chunks, chunk_metadatas = chunker.process_documents(document_pairs)
219
+
220
+ if not chunks:
221
+ status_container.error("❌ Failed to chunk documents")
222
+ return
223
+
224
+ progress_bar.progress(80)
225
+
226
+ # Initialize vector store (90% progress)
227
  status_container.info("🔄 Initializing vector store...")
228
  details_container.text("🔍 Creating vector embeddings...")
229
+ vector_store = initialize_faiss(embeddings, chunks, chunk_metadatas)
230
+
231
  if not vector_store:
232
  status_container.error("❌ Failed to initialize vector store")
233
  return
234
 
 
235
  st.session_state.vector_store = vector_store
236
  progress_bar.progress(90)
237
 
238
+ # Initialize QA system (100% progress)
 
 
 
 
 
 
 
239
  status_container.info("🔄 Setting up QA system...")
240
  qa_system = initialize_qa_system(vector_store)
241
+
242
  if not qa_system:
243
  status_container.error("❌ Failed to initialize QA system")
244
  return
 
 
 
245
 
246
+ st.session_state.qa_system = qa_system
247
  progress_bar.progress(100)
248
+
249
+ # Success message
250
  status_container.success("✅ Documents processed successfully!")
251
+ details_container.markdown(f"""
252
  🎉 **Ready to chat!**
253
+ - Documents processed: {len(document_pairs)}
254
+ - Total chunks created: {len(chunks)}
255
+ - Average chunk size: {sum(len(chunk) for chunk in chunks) / len(chunks):.0f} characters
256
  - Vector store initialized
257
  - QA system ready
258
 
259
  You can now start asking questions about your documents!
260
+ """)
 
 
 
261
 
 
262
  st.balloons()
 
 
263
  st.session_state.chat_ready = True
264
 
265
  except Exception as e:
266
  status_container.error(f"❌ Error processing documents: {e}")
267
  details_container.error(traceback.format_exc())
 
268
  st.session_state.vector_store = None
269
  st.session_state.qa_system = None
270
  st.session_state.chat_ready = False
271
 
272
  finally:
 
273
  if st.session_state.get('qa_system') is not None:
274
  time.sleep(5)
275
  progress_container.empty()