cryogenic22 committed on
Commit
b5ab699
·
verified ·
1 Parent(s): f4cf8fa

Update utils/database.py

Browse files
Files changed (1) hide show
  1. utils/database.py +82 -65
utils/database.py CHANGED
@@ -97,36 +97,6 @@ def create_tables(conn):
97
  st.error(f"Error: {e}")
98
 
99
 
100
- def process_document(file_path):
101
- """
102
- Process a PDF document with proper chunking.
103
-
104
- Args:
105
- file_path (str): Path to the PDF file.
106
- Returns:
107
- tuple: (list of document chunks, full content of the document).
108
- """
109
- # Load PDF
110
- loader = PyPDFLoader(file_path)
111
- documents = loader.load()
112
-
113
- # Create text splitter
114
- text_splitter = RecursiveCharacterTextSplitter(
115
- chunk_size=1000,
116
- chunk_overlap=200,
117
- length_function=len,
118
- separators=["\n\n", "\n", " ", ""]
119
- )
120
-
121
- # Split documents into chunks
122
- chunks = text_splitter.split_documents(documents)
123
-
124
- # Extract text content for database storage
125
- full_content = "\n".join(doc.page_content for doc in documents)
126
-
127
- return chunks, full_content
128
-
129
-
130
  def get_documents(conn):
131
  """
132
  Retrieve all documents from the database.
@@ -199,12 +169,16 @@ def verify_vector_store(vector_store):
199
  return False
200
 
201
 
202
- def handle_document_upload(uploaded_files):
203
- """
204
- Handle document upload with progress tracking.
205
 
 
 
 
 
206
  Args:
207
- uploaded_files (list): List of uploaded files.
 
 
208
  """
209
  try:
210
  # Initialize session state variables if they don't exist
@@ -213,7 +187,7 @@ def handle_document_upload(uploaded_files):
213
  if 'vector_store' not in st.session_state:
214
  st.session_state.vector_store = None
215
 
216
- # Create a progress container
217
  progress_container = st.empty()
218
  status_container = st.empty()
219
  details_container = st.empty()
@@ -223,17 +197,15 @@ def handle_document_upload(uploaded_files):
223
  status_container.info("πŸ”„ Initializing document processing...")
224
 
225
  # Reset existing states
226
- if st.session_state.vector_store is not None:
227
- st.session_state.vector_store = None
228
- if st.session_state.qa_system is not None:
229
- st.session_state.qa_system = None
230
 
231
  # Initialize embeddings (10% progress)
232
  status_container.info("πŸ”„ Initializing embeddings model...")
233
  embeddings = get_embeddings_model()
234
  if not embeddings:
235
  status_container.error("❌ Failed to initialize embeddings model")
236
- return
237
  progress_bar.progress(10)
238
 
239
  # Process documents
@@ -244,6 +216,8 @@ def handle_document_upload(uploaded_files):
244
  progress_per_file = 70 / len(uploaded_files)
245
  current_progress = 10
246
 
 
 
247
  for idx, uploaded_file in enumerate(uploaded_files):
248
  file_name = uploaded_file.name
249
  status_container.info(f"πŸ”„ Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
@@ -262,9 +236,18 @@ def handle_document_upload(uploaded_files):
262
  status_container.error(f"❌ Failed to store document: {file_name}")
263
  continue
264
 
 
 
 
 
 
265
  # Add chunks with metadata
266
  for chunk in chunks:
267
- chunk.metadata["source"] = file_name
 
 
 
 
268
  all_chunks.extend(chunks)
269
 
270
  documents.append(content)
@@ -273,7 +256,7 @@ def handle_document_upload(uploaded_files):
273
  current_progress += progress_per_file
274
  progress_bar.progress(int(current_progress))
275
 
276
- # Initialize vector store with chunks instead of full documents
277
  status_container.info("πŸ”„ Initializing vector store...")
278
  vector_store = FAISS.from_documents(
279
  all_chunks,
@@ -285,55 +268,89 @@ def handle_document_upload(uploaded_files):
285
  details_container.text("✨ Performing final checks...")
286
  if not verify_vector_store(vector_store):
287
  status_container.error("❌ Vector store verification failed")
288
- return
289
 
290
  # Initialize QA system (90-100% progress)
291
  status_container.info("πŸ”„ Setting up QA system...")
292
  qa_system = initialize_qa_system(vector_store)
293
  if not qa_system:
294
  status_container.error("❌ Failed to initialize QA system")
295
- return
296
-
297
- # Store QA system in session state
 
 
 
 
 
 
 
298
  st.session_state.qa_system = qa_system
299
 
300
  # Complete!
301
  progress_bar.progress(100)
302
  status_container.success("βœ… Documents processed successfully!")
303
  details_container.markdown(
304
- """
305
  πŸŽ‰ **Ready to chat!**
306
- - Documents loaded: {}
307
- - Total content size: {:.2f} KB
308
- - Vector store initialized
309
- - QA system ready
310
-
311
  You can now start asking questions about your documents!
312
- """.format(
313
- len(documents),
314
- sum(len(doc) for doc in documents) / 1024
315
- )
316
  )
317
 
318
  # Add notification
319
  st.balloons()
320
 
321
- # Set chat ready flag
322
- st.session_state.chat_ready = True
 
 
 
 
 
323
 
324
  except Exception as e:
325
- status_container.error(f"❌ Error processing documents: {e}")
326
- details_container.error(traceback.format_exc())
 
327
  # Reset states on error
328
  st.session_state.vector_store = None
329
  st.session_state.qa_system = None
330
  st.session_state.chat_ready = False
 
331
 
332
- finally:
333
- # Clean up progress display after 5 seconds if successful
334
- if st.session_state.get('qa_system') is not None:
335
- time.sleep(5)
336
- progress_container.empty()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
 
339
  def display_vector_store_info():
 
97
  st.error(f"Error: {e}")
98
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def get_documents(conn):
101
  """
102
  Retrieve all documents from the database.
 
169
  return False
170
 
171
 
172
+ # utils/database.py
 
 
173
 
174
+ def handle_document_upload(uploaded_files, **kwargs):
175
+ """
176
+ Handle document upload with progress tracking and collection support.
177
+
178
  Args:
179
+ uploaded_files (list): List of uploaded files
180
+ **kwargs: Additional arguments including:
181
+ - collection_id (int, optional): ID of the collection to add documents to
182
  """
183
  try:
184
  # Initialize session state variables if they don't exist
 
187
  if 'vector_store' not in st.session_state:
188
  st.session_state.vector_store = None
189
 
190
+ # Create progress containers
191
  progress_container = st.empty()
192
  status_container = st.empty()
193
  details_container = st.empty()
 
197
  status_container.info("πŸ”„ Initializing document processing...")
198
 
199
  # Reset existing states
200
+ st.session_state.vector_store = None
201
+ st.session_state.qa_system = None
 
 
202
 
203
  # Initialize embeddings (10% progress)
204
  status_container.info("πŸ”„ Initializing embeddings model...")
205
  embeddings = get_embeddings_model()
206
  if not embeddings:
207
  status_container.error("❌ Failed to initialize embeddings model")
208
+ return False
209
  progress_bar.progress(10)
210
 
211
  # Process documents
 
216
  progress_per_file = 70 / len(uploaded_files)
217
  current_progress = 10
218
 
219
+ collection_id = kwargs.get('collection_id')
220
+
221
  for idx, uploaded_file in enumerate(uploaded_files):
222
  file_name = uploaded_file.name
223
  status_container.info(f"πŸ”„ Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
 
236
  status_container.error(f"❌ Failed to store document: {file_name}")
237
  continue
238
 
239
+ # Add to collection if specified
240
+ if collection_id:
241
+ if not add_document_to_collection(st.session_state.db_conn, doc_id, collection_id):
242
+ status_container.warning(f"⚠️ Failed to add document to collection: {file_name}")
243
+
244
  # Add chunks with metadata
245
  for chunk in chunks:
246
+ chunk.metadata.update({
247
+ "source": file_name,
248
+ "document_id": doc_id,
249
+ "collection_id": collection_id if collection_id else None
250
+ })
251
  all_chunks.extend(chunks)
252
 
253
  documents.append(content)
 
256
  current_progress += progress_per_file
257
  progress_bar.progress(int(current_progress))
258
 
259
+ # Initialize vector store with chunks
260
  status_container.info("πŸ”„ Initializing vector store...")
261
  vector_store = FAISS.from_documents(
262
  all_chunks,
 
268
  details_container.text("✨ Performing final checks...")
269
  if not verify_vector_store(vector_store):
270
  status_container.error("❌ Vector store verification failed")
271
+ return False
272
 
273
  # Initialize QA system (90-100% progress)
274
  status_container.info("πŸ”„ Setting up QA system...")
275
  qa_system = initialize_qa_system(vector_store)
276
  if not qa_system:
277
  status_container.error("❌ Failed to initialize QA system")
278
+ return False
279
+
280
+ # Store in session state
281
+ if collection_id:
282
+ if 'vector_stores' not in st.session_state:
283
+ st.session_state.vector_stores = {}
284
+ st.session_state.vector_stores[collection_id] = vector_store
285
+ else:
286
+ st.session_state.vector_store = vector_store
287
+
288
  st.session_state.qa_system = qa_system
289
 
290
  # Complete!
291
  progress_bar.progress(100)
292
  status_container.success("βœ… Documents processed successfully!")
293
  details_container.markdown(
294
+ f"""
295
  πŸŽ‰ **Ready to chat!**
296
+ - Documents processed: {len(documents)}
297
+ - Total content size: {sum(len(doc) for doc in documents) / 1024:.2f} KB
298
+ - {"Added to collection" if collection_id else "Processed as standalone documents"}
299
+
 
300
  You can now start asking questions about your documents!
301
+ """
 
 
 
302
  )
303
 
304
  # Add notification
305
  st.balloons()
306
 
307
+ # Clean up progress display after 3 seconds
308
+ time.sleep(3)
309
+ progress_container.empty()
310
+ status_container.empty()
311
+ details_container.empty()
312
+
313
+ return True
314
 
315
  except Exception as e:
316
+ st.error(f"❌ Error processing documents: {str(e)}")
317
+ if status_container:
318
+ status_container.error(traceback.format_exc())
319
  # Reset states on error
320
  st.session_state.vector_store = None
321
  st.session_state.qa_system = None
322
  st.session_state.chat_ready = False
323
+ return False
324
 
325
+ def process_document(file_path):
326
+ """
327
+ Process a PDF document with proper chunking.
328
+
329
+ Args:
330
+ file_path (str): Path to the PDF file
331
+
332
+ Returns:
333
+ tuple: (list of document chunks, full content of the document)
334
+ """
335
+ # Load PDF
336
+ loader = PyPDFLoader(file_path)
337
+ documents = loader.load()
338
+
339
+ # Create text splitter
340
+ text_splitter = RecursiveCharacterTextSplitter(
341
+ chunk_size=1000,
342
+ chunk_overlap=200,
343
+ length_function=len,
344
+ separators=["\n\n", "\n", " ", ""]
345
+ )
346
+
347
+ # Split documents into chunks
348
+ chunks = text_splitter.split_documents(documents)
349
+
350
+ # Extract full content for database storage
351
+ full_content = "\n".join(doc.page_content for doc in documents)
352
+
353
+ return chunks, full_content
354
 
355
 
356
  def display_vector_store_info():