Spaces:
Build error
Build error
Update utils/database.py
Browse files — utils/database.py (+76 −5)
utils/database.py
CHANGED
|
@@ -14,6 +14,9 @@ from langchain.chat_models import ChatOpenAI
|
|
| 14 |
from langchain.agents import AgentExecutor, Tool, create_openai_tools_agent
|
| 15 |
from langchain.agents.format_scratchpad.tools import format_to_tool_messages
|
| 16 |
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
import os
|
| 19 |
import streamlit as st
|
|
@@ -76,7 +79,28 @@ def create_tables(conn):
|
|
| 76 |
except Error as e:
|
| 77 |
st.error(f"Error: {e}")
|
| 78 |
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
def get_documents(conn):
|
| 82 |
"""Retrieve all documents from the database.
|
|
@@ -179,10 +203,50 @@ def handle_document_upload(uploaded_files):
|
|
| 179 |
return
|
| 180 |
progress_bar.progress(10)
|
| 181 |
|
| 182 |
-
|
|
|
|
| 183 |
documents = []
|
| 184 |
document_names = []
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
# Calculate progress steps per file
|
| 187 |
progress_per_file = 70 / len(uploaded_files) # 70% of progress for file processing
|
| 188 |
current_progress = 10
|
|
@@ -281,6 +345,7 @@ def handle_document_upload(uploaded_files):
|
|
| 281 |
st.session_state.vector_store = None
|
| 282 |
st.session_state.qa_system = None
|
| 283 |
st.session_state.chat_ready = False
|
|
|
|
| 284 |
|
| 285 |
finally:
|
| 286 |
# Clean up progress display after 5 seconds if successful
|
|
@@ -333,16 +398,22 @@ def display_vector_store_info():
|
|
| 333 |
|
| 334 |
|
| 335 |
def initialize_qa_system(vector_store):
|
| 336 |
-
"""Initialize QA system with
|
| 337 |
try:
|
| 338 |
llm = ChatOpenAI(
|
| 339 |
temperature=0.5,
|
| 340 |
model_name="gpt-4",
|
|
|
|
| 341 |
api_key=os.environ.get("OPENAI_API_KEY")
|
| 342 |
)
|
| 343 |
|
| 344 |
-
#
|
| 345 |
-
retriever = vector_store.as_retriever(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
|
| 347 |
# Create a template that enforces clean formatting
|
| 348 |
prompt = ChatPromptTemplate.from_messages([
|
|
|
|
| 14 |
from langchain.agents import AgentExecutor, Tool, create_openai_tools_agent
|
| 15 |
from langchain.agents.format_scratchpad.tools import format_to_tool_messages
|
| 16 |
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
|
| 17 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 18 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 19 |
+
from langchain.vectorstores import FAISS
|
| 20 |
|
| 21 |
import os
|
| 22 |
import streamlit as st
|
|
|
|
| 79 |
except Error as e:
|
| 80 |
st.error(f"Error: {e}")
|
| 81 |
|
| 82 |
+
|
| 83 |
+
def process_document(file_path):
    """Process a PDF document with proper chunking.

    Loads the PDF at *file_path*, splits its pages into overlapping
    chunks suitable for embedding, and also returns the full page text
    for database storage.

    Parameters
    ----------
    file_path : str
        Filesystem path to the PDF to process.

    Returns
    -------
    tuple
        ``(chunks, full_content)`` where ``chunks`` is the list of
        split langchain ``Document`` objects (for the vector store)
        and ``full_content`` is all page text joined with newlines
        (for the SQL ``documents`` table).
    """
    # Load PDF — one langchain Document per page
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Create text splitter.
    # Recursive splitter tries the separators in order: paragraph
    # break, line break, space, then hard character cut — so chunks
    # land on natural boundaries where possible.  200-char overlap
    # preserves context across chunk edges.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    # Split documents into chunks
    chunks = text_splitter.split_documents(documents)

    # Extract text content for database storage
    full_content = "\n".join(doc.page_content for doc in documents)

    return chunks, full_content
|
| 104 |
|
| 105 |
def get_documents(conn):
|
| 106 |
"""Retrieve all documents from the database.
|
|
|
|
| 203 |
return
|
| 204 |
progress_bar.progress(10)
|
| 205 |
|
| 206 |
+
# Process documents
|
| 207 |
+
all_chunks = []
|
| 208 |
documents = []
|
| 209 |
document_names = []
|
| 210 |
|
| 211 |
+
progress_per_file = 70 / len(uploaded_files)
|
| 212 |
+
current_progress = 10
|
| 213 |
+
|
| 214 |
+
for idx, uploaded_file in enumerate(uploaded_files):
|
| 215 |
+
file_name = uploaded_file.name
|
| 216 |
+
status_container.info(f"🔄 Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
|
| 217 |
+
|
| 218 |
+
# Create temporary file
|
| 219 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
|
| 220 |
+
tmp_file.write(uploaded_file.getvalue())
|
| 221 |
+
tmp_file.flush()
|
| 222 |
+
|
| 223 |
+
# Process document with chunking
|
| 224 |
+
chunks, content = process_document(tmp_file.name)
|
| 225 |
+
|
| 226 |
+
# Store in database
|
| 227 |
+
doc_id = insert_document(st.session_state.db_conn, file_name, content)
|
| 228 |
+
if not doc_id:
|
| 229 |
+
status_container.error(f"❌ Failed to store document: {file_name}")
|
| 230 |
+
continue
|
| 231 |
+
|
| 232 |
+
# Add chunks with metadata
|
| 233 |
+
for chunk in chunks:
|
| 234 |
+
chunk.metadata["source"] = file_name
|
| 235 |
+
all_chunks.extend(chunks)
|
| 236 |
+
|
| 237 |
+
documents.append(content)
|
| 238 |
+
document_names.append(file_name)
|
| 239 |
+
|
| 240 |
+
current_progress += progress_per_file
|
| 241 |
+
progress_bar.progress(int(current_progress))
|
| 242 |
+
|
| 243 |
+
# Initialize vector store with chunks instead of full documents
|
| 244 |
+
status_container.info("🔄 Initializing vector store...")
|
| 245 |
+
vector_store = FAISS.from_documents(
|
| 246 |
+
all_chunks,
|
| 247 |
+
embeddings
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
# Calculate progress steps per file
|
| 251 |
progress_per_file = 70 / len(uploaded_files) # 70% of progress for file processing
|
| 252 |
current_progress = 10
|
|
|
|
| 345 |
st.session_state.vector_store = None
|
| 346 |
st.session_state.qa_system = None
|
| 347 |
st.session_state.chat_ready = False
|
| 348 |
+
except Exception as e:
|
| 349 |
|
| 350 |
finally:
|
| 351 |
# Clean up progress display after 5 seconds if successful
|
|
|
|
| 398 |
|
| 399 |
|
| 400 |
def initialize_qa_system(vector_store):
|
| 401 |
+
"""Initialize QA system with optimized retrieval."""
|
| 402 |
try:
|
| 403 |
llm = ChatOpenAI(
|
| 404 |
temperature=0.5,
|
| 405 |
model_name="gpt-4",
|
| 406 |
+
max_tokens=4000, # Explicitly set max tokens
|
| 407 |
api_key=os.environ.get("OPENAI_API_KEY")
|
| 408 |
)
|
| 409 |
|
| 410 |
+
# Optimize retriever settings
|
| 411 |
+
retriever = vector_store.as_retriever(
|
| 412 |
+
search_kwargs={
|
| 413 |
+
"k": 3, # Retrieve fewer, more relevant chunks
|
| 414 |
+
"fetch_k": 5 # Consider more candidates before selecting top k
|
| 415 |
+
}
|
| 416 |
+
)
|
| 417 |
|
| 418 |
# Create a template that enforces clean formatting
|
| 419 |
prompt = ChatPromptTemplate.from_messages([
|