Update utils/database.py
Browse files- utils/database.py +82 -65
utils/database.py
CHANGED
|
@@ -97,36 +97,6 @@ def create_tables(conn):
|
|
| 97 |
st.error(f"Error: {e}")
|
| 98 |
|
| 99 |
|
| 100 |
-
def process_document(file_path):
|
| 101 |
-
"""
|
| 102 |
-
Process a PDF document with proper chunking.
|
| 103 |
-
|
| 104 |
-
Args:
|
| 105 |
-
file_path (str): Path to the PDF file.
|
| 106 |
-
Returns:
|
| 107 |
-
tuple: (list of document chunks, full content of the document).
|
| 108 |
-
"""
|
| 109 |
-
# Load PDF
|
| 110 |
-
loader = PyPDFLoader(file_path)
|
| 111 |
-
documents = loader.load()
|
| 112 |
-
|
| 113 |
-
# Create text splitter
|
| 114 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
| 115 |
-
chunk_size=1000,
|
| 116 |
-
chunk_overlap=200,
|
| 117 |
-
length_function=len,
|
| 118 |
-
separators=["\n\n", "\n", " ", ""]
|
| 119 |
-
)
|
| 120 |
-
|
| 121 |
-
# Split documents into chunks
|
| 122 |
-
chunks = text_splitter.split_documents(documents)
|
| 123 |
-
|
| 124 |
-
# Extract text content for database storage
|
| 125 |
-
full_content = "\n".join(doc.page_content for doc in documents)
|
| 126 |
-
|
| 127 |
-
return chunks, full_content
|
| 128 |
-
|
| 129 |
-
|
| 130 |
def get_documents(conn):
|
| 131 |
"""
|
| 132 |
Retrieve all documents from the database.
|
|
@@ -199,12 +169,16 @@ def verify_vector_store(vector_store):
|
|
| 199 |
return False
|
| 200 |
|
| 201 |
|
| 202 |
-
|
| 203 |
-
"""
|
| 204 |
-
Handle document upload with progress tracking.
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
Args:
|
| 207 |
-
uploaded_files (list): List of uploaded files
|
|
|
|
|
|
|
| 208 |
"""
|
| 209 |
try:
|
| 210 |
# Initialize session state variables if they don't exist
|
|
@@ -213,7 +187,7 @@ def handle_document_upload(uploaded_files):
|
|
| 213 |
if 'vector_store' not in st.session_state:
|
| 214 |
st.session_state.vector_store = None
|
| 215 |
|
| 216 |
-
# Create
|
| 217 |
progress_container = st.empty()
|
| 218 |
status_container = st.empty()
|
| 219 |
details_container = st.empty()
|
|
@@ -223,17 +197,15 @@ def handle_document_upload(uploaded_files):
|
|
| 223 |
status_container.info("π Initializing document processing...")
|
| 224 |
|
| 225 |
# Reset existing states
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
if st.session_state.qa_system is not None:
|
| 229 |
-
st.session_state.qa_system = None
|
| 230 |
|
| 231 |
# Initialize embeddings (10% progress)
|
| 232 |
status_container.info("π Initializing embeddings model...")
|
| 233 |
embeddings = get_embeddings_model()
|
| 234 |
if not embeddings:
|
| 235 |
status_container.error("β Failed to initialize embeddings model")
|
| 236 |
-
return
|
| 237 |
progress_bar.progress(10)
|
| 238 |
|
| 239 |
# Process documents
|
|
@@ -244,6 +216,8 @@ def handle_document_upload(uploaded_files):
|
|
| 244 |
progress_per_file = 70 / len(uploaded_files)
|
| 245 |
current_progress = 10
|
| 246 |
|
|
|
|
|
|
|
| 247 |
for idx, uploaded_file in enumerate(uploaded_files):
|
| 248 |
file_name = uploaded_file.name
|
| 249 |
status_container.info(f"π Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
|
|
@@ -262,9 +236,18 @@ def handle_document_upload(uploaded_files):
|
|
| 262 |
status_container.error(f"β Failed to store document: {file_name}")
|
| 263 |
continue
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
# Add chunks with metadata
|
| 266 |
for chunk in chunks:
|
| 267 |
-
chunk.metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
all_chunks.extend(chunks)
|
| 269 |
|
| 270 |
documents.append(content)
|
|
@@ -273,7 +256,7 @@ def handle_document_upload(uploaded_files):
|
|
| 273 |
current_progress += progress_per_file
|
| 274 |
progress_bar.progress(int(current_progress))
|
| 275 |
|
| 276 |
-
# Initialize vector store with chunks
|
| 277 |
status_container.info("π Initializing vector store...")
|
| 278 |
vector_store = FAISS.from_documents(
|
| 279 |
all_chunks,
|
|
@@ -285,55 +268,89 @@ def handle_document_upload(uploaded_files):
|
|
| 285 |
details_container.text("β¨ Performing final checks...")
|
| 286 |
if not verify_vector_store(vector_store):
|
| 287 |
status_container.error("β Vector store verification failed")
|
| 288 |
-
return
|
| 289 |
|
| 290 |
# Initialize QA system (90-100% progress)
|
| 291 |
status_container.info("π Setting up QA system...")
|
| 292 |
qa_system = initialize_qa_system(vector_store)
|
| 293 |
if not qa_system:
|
| 294 |
status_container.error("β Failed to initialize QA system")
|
| 295 |
-
return
|
| 296 |
-
|
| 297 |
-
# Store
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
st.session_state.qa_system = qa_system
|
| 299 |
|
| 300 |
# Complete!
|
| 301 |
progress_bar.progress(100)
|
| 302 |
status_container.success("β
Documents processed successfully!")
|
| 303 |
details_container.markdown(
|
| 304 |
-
"""
|
| 305 |
π **Ready to chat!**
|
| 306 |
-
- Documents
|
| 307 |
-
- Total content size: {:.2f} KB
|
| 308 |
-
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
You can now start asking questions about your documents!
|
| 312 |
-
"""
|
| 313 |
-
len(documents),
|
| 314 |
-
sum(len(doc) for doc in documents) / 1024
|
| 315 |
-
)
|
| 316 |
)
|
| 317 |
|
| 318 |
# Add notification
|
| 319 |
st.balloons()
|
| 320 |
|
| 321 |
-
#
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
except Exception as e:
|
| 325 |
-
|
| 326 |
-
|
|
|
|
| 327 |
# Reset states on error
|
| 328 |
st.session_state.vector_store = None
|
| 329 |
st.session_state.qa_system = None
|
| 330 |
st.session_state.chat_ready = False
|
|
|
|
| 331 |
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
|
| 338 |
|
| 339 |
def display_vector_store_info():
|
|
|
|
| 97 |
st.error(f"Error: {e}")
|
| 98 |
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
def get_documents(conn):
|
| 101 |
"""
|
| 102 |
Retrieve all documents from the database.
|
|
|
|
| 169 |
return False
|
| 170 |
|
| 171 |
|
| 172 |
+
# utils/database.py
|
|
|
|
|
|
|
| 173 |
|
| 174 |
+
def handle_document_upload(uploaded_files, **kwargs):
|
| 175 |
+
"""
|
| 176 |
+
Handle document upload with progress tracking and collection support.
|
| 177 |
+
|
| 178 |
Args:
|
| 179 |
+
uploaded_files (list): List of uploaded files
|
| 180 |
+
**kwargs: Additional arguments including:
|
| 181 |
+
- collection_id (int, optional): ID of the collection to add documents to
|
| 182 |
"""
|
| 183 |
try:
|
| 184 |
# Initialize session state variables if they don't exist
|
|
|
|
| 187 |
if 'vector_store' not in st.session_state:
|
| 188 |
st.session_state.vector_store = None
|
| 189 |
|
| 190 |
+
# Create progress containers
|
| 191 |
progress_container = st.empty()
|
| 192 |
status_container = st.empty()
|
| 193 |
details_container = st.empty()
|
|
|
|
| 197 |
status_container.info("π Initializing document processing...")
|
| 198 |
|
| 199 |
# Reset existing states
|
| 200 |
+
st.session_state.vector_store = None
|
| 201 |
+
st.session_state.qa_system = None
|
|
|
|
|
|
|
| 202 |
|
| 203 |
# Initialize embeddings (10% progress)
|
| 204 |
status_container.info("π Initializing embeddings model...")
|
| 205 |
embeddings = get_embeddings_model()
|
| 206 |
if not embeddings:
|
| 207 |
status_container.error("β Failed to initialize embeddings model")
|
| 208 |
+
return False
|
| 209 |
progress_bar.progress(10)
|
| 210 |
|
| 211 |
# Process documents
|
|
|
|
| 216 |
progress_per_file = 70 / len(uploaded_files)
|
| 217 |
current_progress = 10
|
| 218 |
|
| 219 |
+
collection_id = kwargs.get('collection_id')
|
| 220 |
+
|
| 221 |
for idx, uploaded_file in enumerate(uploaded_files):
|
| 222 |
file_name = uploaded_file.name
|
| 223 |
status_container.info(f"π Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
|
|
|
|
| 236 |
status_container.error(f"β Failed to store document: {file_name}")
|
| 237 |
continue
|
| 238 |
|
| 239 |
+
# Add to collection if specified
|
| 240 |
+
if collection_id:
|
| 241 |
+
if not add_document_to_collection(st.session_state.db_conn, doc_id, collection_id):
|
| 242 |
+
status_container.warning(f"β οΈ Failed to add document to collection: {file_name}")
|
| 243 |
+
|
| 244 |
# Add chunks with metadata
|
| 245 |
for chunk in chunks:
|
| 246 |
+
chunk.metadata.update({
|
| 247 |
+
"source": file_name,
|
| 248 |
+
"document_id": doc_id,
|
| 249 |
+
"collection_id": collection_id if collection_id else None
|
| 250 |
+
})
|
| 251 |
all_chunks.extend(chunks)
|
| 252 |
|
| 253 |
documents.append(content)
|
|
|
|
| 256 |
current_progress += progress_per_file
|
| 257 |
progress_bar.progress(int(current_progress))
|
| 258 |
|
| 259 |
+
# Initialize vector store with chunks
|
| 260 |
status_container.info("π Initializing vector store...")
|
| 261 |
vector_store = FAISS.from_documents(
|
| 262 |
all_chunks,
|
|
|
|
| 268 |
details_container.text("β¨ Performing final checks...")
|
| 269 |
if not verify_vector_store(vector_store):
|
| 270 |
status_container.error("β Vector store verification failed")
|
| 271 |
+
return False
|
| 272 |
|
| 273 |
# Initialize QA system (90-100% progress)
|
| 274 |
status_container.info("π Setting up QA system...")
|
| 275 |
qa_system = initialize_qa_system(vector_store)
|
| 276 |
if not qa_system:
|
| 277 |
status_container.error("β Failed to initialize QA system")
|
| 278 |
+
return False
|
| 279 |
+
|
| 280 |
+
# Store in session state
|
| 281 |
+
if collection_id:
|
| 282 |
+
if 'vector_stores' not in st.session_state:
|
| 283 |
+
st.session_state.vector_stores = {}
|
| 284 |
+
st.session_state.vector_stores[collection_id] = vector_store
|
| 285 |
+
else:
|
| 286 |
+
st.session_state.vector_store = vector_store
|
| 287 |
+
|
| 288 |
st.session_state.qa_system = qa_system
|
| 289 |
|
| 290 |
# Complete!
|
| 291 |
progress_bar.progress(100)
|
| 292 |
status_container.success("β
Documents processed successfully!")
|
| 293 |
details_container.markdown(
|
| 294 |
+
f"""
|
| 295 |
π **Ready to chat!**
|
| 296 |
+
- Documents processed: {len(documents)}
|
| 297 |
+
- Total content size: {sum(len(doc) for doc in documents) / 1024:.2f} KB
|
| 298 |
+
- {"Added to collection" if collection_id else "Processed as standalone documents"}
|
| 299 |
+
|
|
|
|
| 300 |
You can now start asking questions about your documents!
|
| 301 |
+
"""
|
|
|
|
|
|
|
|
|
|
| 302 |
)
|
| 303 |
|
| 304 |
# Add notification
|
| 305 |
st.balloons()
|
| 306 |
|
| 307 |
+
# Clean up progress display after 3 seconds
|
| 308 |
+
time.sleep(3)
|
| 309 |
+
progress_container.empty()
|
| 310 |
+
status_container.empty()
|
| 311 |
+
details_container.empty()
|
| 312 |
+
|
| 313 |
+
return True
|
| 314 |
|
| 315 |
except Exception as e:
|
| 316 |
+
st.error(f"β Error processing documents: {str(e)}")
|
| 317 |
+
if status_container:
|
| 318 |
+
status_container.error(traceback.format_exc())
|
| 319 |
# Reset states on error
|
| 320 |
st.session_state.vector_store = None
|
| 321 |
st.session_state.qa_system = None
|
| 322 |
st.session_state.chat_ready = False
|
| 323 |
+
return False
|
| 324 |
|
| 325 |
+
def process_document(file_path, chunk_size=1000, chunk_overlap=200):
    """
    Process a PDF document with proper chunking.

    Args:
        file_path (str): Path to the PDF file.
        chunk_size (int, optional): Maximum characters per chunk. Defaults to 1000.
        chunk_overlap (int, optional): Characters shared between consecutive
            chunks so context is preserved across chunk boundaries. Defaults to 200.

    Returns:
        tuple: (list of document chunks, full content of the document)
    """
    # Load PDF
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Create text splitter; separators are tried in order, so splits prefer
    # paragraph breaks, then line breaks, then spaces, before falling back to
    # a hard character cut.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    # Split documents into chunks
    chunks = text_splitter.split_documents(documents)

    # Extract full content for database storage
    full_content = "\n".join(doc.page_content for doc in documents)

    return chunks, full_content
|
| 354 |
|
| 355 |
|
| 356 |
def display_vector_store_info():
|