RFP_Analyzer_Agent_backup

Build error

App Files Community

cryogenic22 committed on Dec 1, 2024

Commit

292eb86

verified ·

1 Parent(s): 10c877a

Update utils/database.py

Browse files

Files changed (1) hide show

utils/database.py +137 -136

utils/database.py CHANGED Viewed

@@ -1,4 +1,5 @@
 # utils/database.py
 # Update the imports first
 from langchain_community.chat_models import ChatOpenAI
 from langchain_core.messages import (
@@ -25,12 +26,18 @@ import traceback
 import time
 import io
 import tempfile
-from langchain_community.document_loaders import PyPDFLoader
 from sqlite3 import Error
 def create_connection(db_file):
-    """Create a database connection to the SQLite database."""
     conn = None
     try:
         conn = sqlite3.connect(db_file)
@@ -39,8 +46,14 @@ def create_connection(db_file):
         st.error("Failed to connect to database. Please try again or contact support.")
     return None
 def create_tables(conn):
-    """Create necessary tables in the database."""
     try:
         sql_create_documents_table = '''
         CREATE TABLE IF NOT EXISTS documents (
@@ -81,11 +94,18 @@ def create_tables(conn):
 def process_document(file_path):
-    """Process a PDF document with proper chunking."""
     # Load PDF
     loader = PyPDFLoader(file_path)
     documents = loader.load()
     # Create text splitter
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
@@ -93,52 +113,54 @@ def process_document(file_path):
         length_function=len,
         separators=["\n\n", "\n", " ", ""]
     )
     # Split documents into chunks
     chunks = text_splitter.split_documents(documents)
     # Extract text content for database storage
     full_content = "\n".join(doc.page_content for doc in documents)
     return chunks, full_content
 def get_documents(conn):
-    """Retrieve all documents from the database.
     Args:
-        conn: SQLite database connection
     Returns:
-        tuple: (list of document contents, list of document names)
     """
     try:
         cursor = conn.cursor()
         cursor.execute("SELECT content, name FROM documents")
         results = cursor.fetchall()
         if not results:
             return [], []
         # Separate contents and names
         document_contents = [row[0] for row in results]
         document_names = [row[1] for row in results]
         return document_contents, document_names
     except Error as e:
         st.error(f"Error retrieving documents: {e}")
         return [], []
 def insert_document(conn, name, content):
-    """Insert a new document into the database.
     Args:
-        conn: SQLite database connection
-        name (str): Name of the document
-        content (str): Content of the document
     Returns:
-        int: ID of the inserted document, or None if insertion failed
     """
     try:
         cursor = conn.cursor()
@@ -147,19 +169,20 @@ def insert_document(conn, name, content):
         cursor.execute(sql, (name, content))
         conn.commit()
         return cursor.lastrowid
     except Error as e:
         st.error(f"Error inserting document: {e}")
         return None
 def verify_vector_store(vector_store):
-    """Verify that the vector store has documents loaded.
     Args:
-        vector_store: FAISS vector store instance
     Returns:
-        bool: True if vector store is properly initialized with documents
     """
     try:
         # Try to perform a simple similarity search
@@ -170,31 +193,35 @@ def verify_vector_store(vector_store):
         return False
 def handle_document_upload(uploaded_files):
-    """Handle document upload with progress tracking."""
     try:
         # Initialize session state variables if they don't exist
         if 'qa_system' not in st.session_state:
             st.session_state.qa_system = None
         if 'vector_store' not in st.session_state:
             st.session_state.vector_store = None
         # Create a progress container
         progress_container = st.empty()
         status_container = st.empty()
         details_container = st.empty()
         # Initialize progress bar
         progress_bar = progress_container.progress(0)
         status_container.info("🔄 Initializing document processing...")
         # Reset existing states
         if st.session_state.vector_store is not None:
             st.session_state.vector_store = None
         if st.session_state.qa_system is not None:
             st.session_state.qa_system = None
         # Initialize embeddings (10% progress)
         status_container.info("🔄 Initializing embeddings model...")
         embeddings = get_embeddings_model()
@@ -202,142 +229,92 @@ def handle_document_upload(uploaded_files):
             status_container.error("❌ Failed to initialize embeddings model")
             return
         progress_bar.progress(10)
-         # Process documents
         all_chunks = []
         documents = []
         document_names = []
         progress_per_file = 70 / len(uploaded_files)
         current_progress = 10
         for idx, uploaded_file in enumerate(uploaded_files):
             file_name = uploaded_file.name
             status_container.info(f"🔄 Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
             # Create temporary file
             with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                 tmp_file.write(uploaded_file.getvalue())
                 tmp_file.flush()
                 # Process document with chunking
                 chunks, content = process_document(tmp_file.name)
                 # Store in database
                 doc_id = insert_document(st.session_state.db_conn, file_name, content)
                 if not doc_id:
                     status_container.error(f"❌ Failed to store document: {file_name}")
                     continue
                 # Add chunks with metadata
                 for chunk in chunks:
                     chunk.metadata["source"] = file_name
                 all_chunks.extend(chunks)
                 documents.append(content)
                 document_names.append(file_name)
             current_progress += progress_per_file
             progress_bar.progress(int(current_progress))
         # Initialize vector store with chunks instead of full documents
         status_container.info("🔄 Initializing vector store...")
         vector_store = FAISS.from_documents(
             all_chunks,
             embeddings
         )
-        # Calculate progress steps per file
-        progress_per_file = 70 / len(uploaded_files)  # 70% of progress for file processing
-        current_progress = 10
-        for idx, uploaded_file in enumerate(uploaded_files):
-            file_name = uploaded_file.name
-            status_container.info(f"🔄 Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
-            details_container.text(f"📄 Current file: {file_name}")
-            # Create a temporary file to save the PDF
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
-                # Write the uploaded file content to the temporary file
-                tmp_file.write(uploaded_file.getvalue())
-                tmp_file.flush()
-                # Use PyPDFLoader to load the PDF
-                loader = PyPDFLoader(tmp_file.name)
-                pdf_documents = loader.load()
-                # Extract text content from the PDF
-                content = "\n".join(doc.page_content for doc in pdf_documents)
-                # Store in database
-                details_container.text(f"💾 Storing {file_name} in database...")
-                doc_id = insert_document(st.session_state.db_conn, file_name, content)
-                if not doc_id:
-                    status_container.error(f"❌ Failed to store document: {file_name}")
-                    continue
-                documents.append(content)
-                document_names.append(file_name)
-            # Update progress
-            current_progress += progress_per_file
-            progress_bar.progress(int(current_progress))
-        if not documents:
-            status_container.error("❌ No documents were successfully processed")
-            return
-        # Initialize vector store (80-90% progress)
-        status_container.info("🔄 Initializing vector store...")
-        details_container.text("🔍 Creating vector embeddings...")
-        vector_store = initialize_faiss(embeddings, documents, document_names)
-        if not vector_store:
-            status_container.error("❌ Failed to initialize vector store")
-            return
-        # Store vector store in session state
-        st.session_state.vector_store = vector_store
-        progress_bar.progress(90)
         # Verify vector store
         status_container.info("🔄 Verifying document indexing...")
         details_container.text("✨ Performing final checks...")
         if not verify_vector_store(vector_store):
             status_container.error("❌ Vector store verification failed")
             return
         # Initialize QA system (90-100% progress)
         status_container.info("🔄 Setting up QA system...")
         qa_system = initialize_qa_system(vector_store)
         if not qa_system:
             status_container.error("❌ Failed to initialize QA system")
             return
         # Store QA system in session state
         st.session_state.qa_system = qa_system
         # Complete!
         progress_bar.progress(100)
         status_container.success("✅ Documents processed successfully!")
-        details_container.markdown("""
-        🎉 **Ready to chat!**
-        - Documents loaded: {}
-        - Total content size: {:.2f} KB
-        - Vector store initialized
-        - QA system ready
-        You can now start asking questions about your documents!
-        """.format(
-            len(documents),
-            sum(len(doc) for doc in documents) / 1024
-        ))
         # Add notification
         st.balloons()
         # Set chat ready flag
         st.session_state.chat_ready = True
     except Exception as e:
         status_container.error(f"❌ Error processing documents: {e}")
         details_container.error(traceback.format_exc())
@@ -345,44 +322,46 @@ def handle_document_upload(uploaded_files):
         st.session_state.vector_store = None
         st.session_state.qa_system = None
         st.session_state.chat_ready = False
-        except Exception as e:
     finally:
         # Clean up progress display after 5 seconds if successful
         if st.session_state.get('qa_system') is not None:
             time.sleep(5)
             progress_container.empty()
 def display_vector_store_info():
-    """Display information about the current vector store state."""
     if 'vector_store' not in st.session_state:
         st.info("ℹ️ No documents loaded yet.")
         return
     try:
         # Get the vector store from session state
         vector_store = st.session_state.vector_store
         # Get basic stats
         test_query = vector_store.similarity_search("test", k=1)
         doc_count = len(test_query)
         # Create an expander for detailed info
         with st.expander("📊 Knowledge Base Status"):
             col1, col2 = st.columns(2)
             with col1:
                 st.metric(
                     label="Documents Loaded",
                     value=doc_count
                 )
             with col2:
                 st.metric(
                     label="System Status",
                     value="Ready" if verify_vector_store(vector_store) else "Not Ready"
                 )
             # Display sample queries
             if verify_vector_store(vector_store):
                 st.markdown("### 🔍 Sample Document Snippets")
@@ -391,14 +370,21 @@ def display_vector_store_info():
                     with st.container():
                         st.markdown(f"**Snippet {i}:**")
                         st.text(doc.page_content[:200] + "...")
     except Exception as e:
         st.error(f"Error displaying vector store info: {e}")
         st.error(traceback.format_exc())
 def initialize_qa_system(vector_store):
-    """Initialize QA system with optimized retrieval."""
     try:
         llm = ChatOpenAI(
             temperature=0.5,
@@ -439,8 +425,7 @@ Tone and Language: Use formal and professional language, ensuring clarity and pr
 Accuracy: Double-check all information for accuracy and completeness before providing it to the user.
-            """),
             MessagesPlaceholder(variable_name="chat_history"),
             ("human", "{input}\n\nContext: {context}")
         ])
@@ -474,10 +459,20 @@ Accuracy: Double-check all information for accuracy and completeness before prov
     except Exception as e:
         st.error(f"Error initializing QA system: {e}")
         return None
 # FAISS vector store initialization
 def initialize_faiss(embeddings, documents, document_names):
-    """Initialize FAISS vector store."""
     try:
         from langchain.vectorstores import FAISS
@@ -491,10 +486,16 @@ def initialize_faiss(embeddings, documents, document_names):
         st.error(f"Error initializing FAISS: {e}")
         return None
 # Embeddings model retrieval
 @st.cache_resource
 def get_embeddings_model():
-    """Get the embeddings model."""
     try:
         from langchain.embeddings import HuggingFaceEmbeddings
@@ -503,4 +504,4 @@ def get_embeddings_model():
         return embeddings
     except Exception as e:
         st.error(f"Error loading embeddings model: {e}")
-        return None

 # utils/database.py
 # Update the imports first
 from langchain_community.chat_models import ChatOpenAI
 from langchain_core.messages import (
 import time
 import io
 import tempfile
 from sqlite3 import Error
 def create_connection(db_file):
+    """
+    Create a database connection to the SQLite database.
+    Args:
+        db_file (str): Path to the SQLite database file.
+    Returns:
+        sqlite3.Connection: Database connection object or None if an error occurs.
+    """
     conn = None
     try:
         conn = sqlite3.connect(db_file)
         st.error("Failed to connect to database. Please try again or contact support.")
     return None
 def create_tables(conn):
+    """
+    Create necessary tables in the database.
+    Args:
+        conn (sqlite3.Connection): SQLite database connection.
+    """
     try:
         sql_create_documents_table = '''
         CREATE TABLE IF NOT EXISTS documents (
 def process_document(file_path):
+    """
+    Process a PDF document with proper chunking.
+    Args:
+        file_path (str): Path to the PDF file.
+    Returns:
+        tuple: (list of document chunks, full content of the document).
+    """
     # Load PDF
     loader = PyPDFLoader(file_path)
     documents = loader.load()
     # Create text splitter
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         length_function=len,
         separators=["\n\n", "\n", " ", ""]
     )
     # Split documents into chunks
     chunks = text_splitter.split_documents(documents)
     # Extract text content for database storage
     full_content = "\n".join(doc.page_content for doc in documents)
     return chunks, full_content
 def get_documents(conn):
+    """
+    Retrieve all documents from the database.
     Args:
+        conn (sqlite3.Connection): SQLite database connection.
     Returns:
+        tuple: (list of document contents, list of document names).
     """
     try:
         cursor = conn.cursor()
         cursor.execute("SELECT content, name FROM documents")
         results = cursor.fetchall()
         if not results:
             return [], []
         # Separate contents and names
         document_contents = [row[0] for row in results]
         document_names = [row[1] for row in results]
         return document_contents, document_names
     except Error as e:
         st.error(f"Error retrieving documents: {e}")
         return [], []
 def insert_document(conn, name, content):
+    """
+    Insert a new document into the database.
     Args:
+        conn (sqlite3.Connection): SQLite database connection.
+        name (str): Name of the document.
+        content (str): Content of the document.
     Returns:
+        int: ID of the inserted document, or None if insertion failed.
     """
     try:
         cursor = conn.cursor()
         cursor.execute(sql, (name, content))
         conn.commit()
         return cursor.lastrowid
     except Error as e:
         st.error(f"Error inserting document: {e}")
         return None
 def verify_vector_store(vector_store):
+    """
+    Verify that the vector store has documents loaded.
     Args:
+        vector_store (FAISS): FAISS vector store instance.
     Returns:
+        bool: True if vector store is properly initialized with documents.
     """
     try:
         # Try to perform a simple similarity search
         return False
 def handle_document_upload(uploaded_files):
+    """
+    Handle document upload with progress tracking.
+    Args:
+        uploaded_files (list): List of uploaded files.
+    """
     try:
         # Initialize session state variables if they don't exist
         if 'qa_system' not in st.session_state:
             st.session_state.qa_system = None
         if 'vector_store' not in st.session_state:
             st.session_state.vector_store = None
         # Create a progress container
         progress_container = st.empty()
         status_container = st.empty()
         details_container = st.empty()
         # Initialize progress bar
         progress_bar = progress_container.progress(0)
         status_container.info("🔄 Initializing document processing...")
         # Reset existing states
         if st.session_state.vector_store is not None:
             st.session_state.vector_store = None
         if st.session_state.qa_system is not None:
             st.session_state.qa_system = None
         # Initialize embeddings (10% progress)
         status_container.info("🔄 Initializing embeddings model...")
         embeddings = get_embeddings_model()
             status_container.error("❌ Failed to initialize embeddings model")
             return
         progress_bar.progress(10)
+        # Process documents
         all_chunks = []
         documents = []
         document_names = []
         progress_per_file = 70 / len(uploaded_files)
         current_progress = 10
         for idx, uploaded_file in enumerate(uploaded_files):
             file_name = uploaded_file.name
             status_container.info(f"🔄 Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
             # Create temporary file
             with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                 tmp_file.write(uploaded_file.getvalue())
                 tmp_file.flush()
                 # Process document with chunking
                 chunks, content = process_document(tmp_file.name)
                 # Store in database
                 doc_id = insert_document(st.session_state.db_conn, file_name, content)
                 if not doc_id:
                     status_container.error(f"❌ Failed to store document: {file_name}")
                     continue
                 # Add chunks with metadata
                 for chunk in chunks:
                     chunk.metadata["source"] = file_name
                 all_chunks.extend(chunks)
                 documents.append(content)
                 document_names.append(file_name)
             current_progress += progress_per_file
             progress_bar.progress(int(current_progress))
         # Initialize vector store with chunks instead of full documents
         status_container.info("🔄 Initializing vector store...")
         vector_store = FAISS.from_documents(
             all_chunks,
             embeddings
         )
         # Verify vector store
         status_container.info("🔄 Verifying document indexing...")
         details_container.text("✨ Performing final checks...")
         if not verify_vector_store(vector_store):
             status_container.error("❌ Vector store verification failed")
             return
         # Initialize QA system (90-100% progress)
         status_container.info("🔄 Setting up QA system...")
         qa_system = initialize_qa_system(vector_store)
         if not qa_system:
             status_container.error("❌ Failed to initialize QA system")
             return
         # Store QA system in session state
         st.session_state.qa_system = qa_system
         # Complete!
         progress_bar.progress(100)
         status_container.success("✅ Documents processed successfully!")
+        details_container.markdown(
+            """
+            🎉 **Ready to chat!**
+            - Documents loaded: {}
+            - Total content size: {:.2f} KB
+            - Vector store initialized
+            - QA system ready
+            You can now start asking questions about your documents!
+            """.format(
+                len(documents),
+                sum(len(doc) for doc in documents) / 1024
+            )
+        )
         # Add notification
         st.balloons()
         # Set chat ready flag
         st.session_state.chat_ready = True
     except Exception as e:
         status_container.error(f"❌ Error processing documents: {e}")
         details_container.error(traceback.format_exc())
         st.session_state.vector_store = None
         st.session_state.qa_system = None
         st.session_state.chat_ready = False
     finally:
         # Clean up progress display after 5 seconds if successful
         if st.session_state.get('qa_system') is not None:
             time.sleep(5)
             progress_container.empty()
 def display_vector_store_info():
+    """
+    Display information about the current vector store state.
+    """
     if 'vector_store' not in st.session_state:
         st.info("ℹ️ No documents loaded yet.")
         return
     try:
         # Get the vector store from session state
         vector_store = st.session_state.vector_store
         # Get basic stats
         test_query = vector_store.similarity_search("test", k=1)
         doc_count = len(test_query)
         # Create an expander for detailed info
         with st.expander("📊 Knowledge Base Status"):
             col1, col2 = st.columns(2)
             with col1:
                 st.metric(
                     label="Documents Loaded",
                     value=doc_count
                 )
             with col2:
                 st.metric(
                     label="System Status",
                     value="Ready" if verify_vector_store(vector_store) else "Not Ready"
                 )
             # Display sample queries
             if verify_vector_store(vector_store):
                 st.markdown("### 🔍 Sample Document Snippets")
                     with st.container():
                         st.markdown(f"**Snippet {i}:**")
                         st.text(doc.page_content[:200] + "...")
     except Exception as e:
         st.error(f"Error displaying vector store info: {e}")
         st.error(traceback.format_exc())
 def initialize_qa_system(vector_store):
+    """
+    Initialize QA system with optimized retrieval.
+    Args:
+        vector_store (FAISS): FAISS vector store instance.
+    Returns:
+        dict: QA system chain or None if initialization fails.
+    """
     try:
         llm = ChatOpenAI(
             temperature=0.5,
 Accuracy: Double-check all information for accuracy and completeness before providing it to the user.
+"""),
             MessagesPlaceholder(variable_name="chat_history"),
             ("human", "{input}\n\nContext: {context}")
         ])
     except Exception as e:
         st.error(f"Error initializing QA system: {e}")
         return None
 # FAISS vector store initialization
 def initialize_faiss(embeddings, documents, document_names):
+    """
+    Initialize FAISS vector store.
+    Args:
+        embeddings (Embeddings): Embeddings model to use.
+        documents (list): List of document contents.
+        document_names (list): List of document names.
+    Returns:
+        FAISS: FAISS vector store instance or None if initialization fails.
+    """
     try:
         from langchain.vectorstores import FAISS
         st.error(f"Error initializing FAISS: {e}")
         return None
 # Embeddings model retrieval
 @st.cache_resource
 def get_embeddings_model():
+    """
+    Get the embeddings model.
+    Returns:
+        Embeddings: Embeddings model instance or None if loading fails.
+    """
     try:
         from langchain.embeddings import HuggingFaceEmbeddings
         return embeddings
     except Exception as e:
         st.error(f"Error loading embeddings model: {e}")
+        return None