File size: 17,419 Bytes
7333afe
09e8d84
61cecf9
03a5156
 
61cecf9
2329f67
61cecf9
2329f67
 
8a72544
09e8d84
64b3386
 
 
69ee452
64b3386
 
03a5156
50e05f5
 
4e1e6ae
 
 
50e05f5
03a5156
59815da
03a5156
 
9f3cc4f
03a5156
 
 
 
041398a
03a5156
 
14499b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03a5156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44d3a72
14499b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4fbfef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35cf712
44d3a72
35cf712
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ab15d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f61c1a4
 
368e7bf
d7c897b
 
 
 
 
 
f61c1a4
368e7bf
f777bc6
 
 
 
 
d7c897b
 
 
 
 
f61c1a4
 
 
 
 
 
 
 
 
368e7bf
 
d7c897b
368e7bf
 
 
f61c1a4
368e7bf
 
 
f61c1a4
 
 
 
 
 
 
368e7bf
4e1e6ae
 
 
 
d7c897b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f61c1a4
 
 
368e7bf
 
f61c1a4
 
368e7bf
 
 
 
 
 
 
 
 
 
 
d7c897b
 
96aced1
368e7bf
 
f61c1a4
 
368e7bf
 
f61c1a4
 
 
f777bc6
d7c897b
96aced1
d7c897b
96aced1
d7c897b
 
368e7bf
f61c1a4
 
368e7bf
f61c1a4
 
 
 
368e7bf
f61c1a4
368e7bf
 
f61c1a4
368e7bf
f61c1a4
368e7bf
 
 
d7c897b
f61c1a4
d7c897b
f61c1a4
 
368e7bf
f61c1a4
 
f777bc6
 
f61c1a4
d7c897b
f61c1a4
f777bc6
 
 
f61c1a4
 
d7c897b
f777bc6
f61c1a4
4e1e6ae
1118663
1957e91
 
 
 
 
 
 
8ea261f
 
 
1957e91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ea261f
2bd9b5d
 
7333afe
cbeca91
7333afe
09e8d84
09d4055
09e8d84
 
f4bcb1f
09e8d84
cbeca91
d39a1aa
2bd9b5d
8c249d6
4571b33
fccb2e5
 
311a114
fccb2e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c249d6
 
939f85b
 
cbeca91
 
 
 
 
 
 
 
 
 
 
 
 
 
2329f67
 
cbeca91
d39a1aa
914b163
2329f67
8c249d6
2329f67
 
7333afe
4571b33
7333afe
 
 
 
4590d5a
96aced1
03a5156
7333afe
03a5156
7333afe
 
59815da
7333afe
 
 
59815da
7333afe
 
 
 
 
 
03a5156
59815da
7333afe
03a5156
7333afe
 
59815da
7333afe
 
 
 
 
03a5156
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
# utils/database.py
# Standard library
import io
import os
import sqlite3
import tempfile
import time
import traceback
from datetime import datetime
from pathlib import Path
from sqlite3 import Error

# Third-party
import streamlit as st
from langchain_community.chat_models import ChatOpenAI
from langchain_core.messages import (
    HumanMessage,
    AIMessage,
    SystemMessage,
    BaseMessage
)
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI  # NOTE(review): shadows the langchain_community ChatOpenAI above — confirm which one is intended
from langchain.agents import AgentExecutor, Tool, create_openai_tools_agent
from langchain.agents.format_scratchpad.tools import format_to_tool_messages
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_community.document_loaders import PyPDFLoader

# Local
from utils.document_chunker import DocumentChunker

def create_connection(db_file):
    """Open a connection to the SQLite database at *db_file*.

    Args:
        db_file: Path to the database file (or ":memory:").

    Returns:
        sqlite3.Connection, or None when the connection fails
        (a user-facing error is shown via Streamlit instead).
    """
    try:
        return sqlite3.connect(db_file)
    except Error:
        st.error("Failed to connect to database. Please try again or contact support.")
        return None



# Connection factory: creates a fresh per-call connection so each Streamlit thread gets its own.
def get_db_connection():
    """Create a fresh SQLite connection to ``data/rfp_analysis.db``.

    A new connection is created on every call so that each Streamlit
    thread works with its own connection (sqlite3 connections must not
    be shared across threads by default). The schema is ensured before
    the connection is handed out.

    Returns:
        sqlite3.Connection | None: open connection with tables created,
        or None when the connection could not be established.
    """
    # Bug fix: ``Path`` was used here without ever being imported in this
    # module, which raised NameError at runtime.
    from pathlib import Path

    try:
        data_dir = Path("data")
        data_dir.mkdir(exist_ok=True)
        db_path = data_dir / 'rfp_analysis.db'

        # Create new connection
        conn = sqlite3.connect(str(db_path))

        # Create tables if they don't exist
        create_tables(conn)

        return conn
    except Exception as e:
        st.error(f"Database connection error: {str(e)}")
        return None

def create_tables(conn):
    """Ensure the schema exists: documents, queries, and annotations.

    Args:
        conn: Open SQLite connection.

    Errors are reported to the Streamlit UI rather than raised.
    """
    try:
        documents_ddl = '''
        CREATE TABLE IF NOT EXISTS documents (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            content TEXT NOT NULL,
            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        '''

        queries_ddl = '''
        CREATE TABLE IF NOT EXISTS queries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            query TEXT NOT NULL,
            response TEXT NOT NULL,
            document_id INTEGER,
            query_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (document_id) REFERENCES documents (id)
        );
        '''

        annotations_ddl = '''
        CREATE TABLE IF NOT EXISTS annotations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            document_id INTEGER NOT NULL,
            annotation TEXT NOT NULL,
            page_number INTEGER,
            annotation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (document_id) REFERENCES documents (id)
        );
        '''

        # Execute each DDL statement; IF NOT EXISTS makes this idempotent.
        for ddl in (documents_ddl, queries_ddl, annotations_ddl):
            conn.execute(ddl)
    except Error as e:
        st.error(f"Error: {e}")

def insert_document(name, content):
    """Insert a document using a connection created on demand.

    NOTE(review): this definition is shadowed by the later
    ``insert_document(conn, name, content)`` defined further down this
    module, so it is effectively dead code — confirm which variant is
    the intended public API.

    Args:
        name (str): Document name.
        content (str): Document body.

    Returns:
        int | None: rowid of the inserted document, or None on failure.
    """
    # Bug fix: pre-bind conn so the except handler cannot hit
    # UnboundLocalError when get_db_connection() itself raises.
    conn = None
    try:
        conn = get_db_connection()
        if conn is None:
            return None

        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO documents (name, content) VALUES (?, ?)",
            (name, content)
        )
        conn.commit()
        doc_id = cursor.lastrowid
        conn.close()
        return doc_id
    except Exception as e:
        st.error(f"Error inserting document: {str(e)}")
        if conn:
            conn.rollback()
            conn.close()
        return None
def get_documents(conn):
    """Fetch every stored document.

    Args:
        conn: Open SQLite connection.

    Returns:
        tuple: (list of document contents, list of document names);
        two empty lists when nothing is stored or on error.
    """
    try:
        rows = conn.cursor().execute("SELECT content, name FROM documents").fetchall()

        if not rows:
            return [], []

        # Transpose the (content, name) row pairs into parallel lists.
        contents, names = zip(*rows)
        return list(contents), list(names)

    except Error as e:
        st.error(f"Error retrieving documents: {e}")
        return [], []

def insert_document(conn, name, content):
    """Store a new document row.

    Args:
        conn: Open SQLite connection.
        name (str): Name of the document.
        content (str): Content of the document.

    Returns:
        int | None: rowid of the inserted record, or None if the
        insertion failed.
    """
    try:
        stmt = '''INSERT INTO documents (name, content)
                VALUES (?, ?)'''
        cur = conn.cursor()
        cur.execute(stmt, (name, content))
        conn.commit()
        return cur.lastrowid

    except Error as e:
        st.error(f"Error inserting document: {e}")
        return None

def verify_vector_store(vector_store):
    """Check whether the vector store can answer a trivial query.

    Args:
        vector_store: FAISS vector store instance.

    Returns:
        bool: True when a k=1 probe search yields at least one result;
        False on empty results or any error (reported via Streamlit).
    """
    try:
        probe = vector_store.similarity_search("test", k=1)
        return len(probe) > 0
    except Exception as e:
        st.error(f"Vector store verification failed: {e}")
        return False

def handle_document_upload(uploaded_files):
    """Handle document upload with improved chunking and progress tracking."""
    # Initialize containers first - before any processing
    progress_container = st.empty()
    status_container = st.empty()
    details_container = st.empty()
    progress_bar = progress_container.progress(0)

    try:
        # Initialize session state variables
        if 'qa_system' not in st.session_state:
            st.session_state.qa_system = None
        if 'vector_store' not in st.session_state:
            st.session_state.vector_store = None
        
        # Initialize persistence manager
        persistence = PersistenceManager()
        
        # Generate a session ID based on timestamp and files
        session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        
        # Initialize embeddings (10% progress)
        status_container.info("πŸ”„ Initializing embeddings model...")
        embeddings = get_embeddings_model()
        if not embeddings:
            status_container.error("❌ Failed to initialize embeddings model")
            return
        progress_bar.progress(10)
        
        # Initialize document chunker
        chunker = DocumentChunker(
            chunk_size=1000,
            chunk_overlap=200,
            max_tokens_per_chunk=2000
        )
        
        # Process documents
        document_pairs = []  # List to store (content, filename) pairs
        progress_per_file = 70 / len(uploaded_files)
        current_progress = 10
        
        for idx, uploaded_file in enumerate(uploaded_files):
            file_name = uploaded_file.name
            status_container.info(f"πŸ”„ Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
            details_container.text(f"πŸ“„ Current file: {file_name}")
            
            # Create temporary file for PDF processing
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file.flush()
                
                try:
                    # Load PDF content
                    loader = PyPDFLoader(tmp_file.name)
                    pdf_documents = loader.load()
                    content = "\n".join(doc.page_content for doc in pdf_documents)
                    
                    # Store original content in database
                    doc_id = insert_document(st.session_state.db_conn, file_name, content)
                    if not doc_id:
                        status_container.error(f"❌ Failed to store document: {file_name}")
                        continue
                    
                    document_pairs.append((content, file_name))
                    
                finally:
                    # Ensure temporary file is cleaned up
                    try:
                        os.unlink(tmp_file.name)
                    except Exception as e:
                        st.warning(f"Could not delete temporary file: {e}")
            
            current_progress += progress_per_file
            progress_bar.progress(int(current_progress))
        
        if not document_pairs:
            status_container.error("❌ No documents were successfully processed")
            return
        
        # Chunk documents (80% progress)
        status_container.info("πŸ”„ Chunking documents...")
        details_container.text("πŸ“‘ Splitting documents into manageable chunks...")
        chunks, chunk_metadatas = chunker.process_documents(document_pairs)
        
        if not chunks:
            status_container.error("❌ Failed to chunk documents")
            return
        
        progress_bar.progress(80)
        
        # Save chunks for persistence
        persistence.save_chunks(chunks, chunk_metadatas, session_id)
        
        # Initialize vector store (90% progress)
        status_container.info("πŸ”„ Initializing vector store...")
        details_container.text("πŸ” Creating vector embeddings...")
        vector_store = initialize_faiss(embeddings, chunks, chunk_metadatas)
        
        if not vector_store:
            status_container.error("❌ Failed to initialize vector store")
            return
        
        # Save vector store and update session state
        persistence.save_vector_store(vector_store, session_id)
        st.session_state.vector_store = vector_store
        st.session_state.current_session_id = session_id
        progress_bar.progress(90)
        
        # Initialize QA system (100% progress)
        status_container.info("πŸ”„ Setting up QA system...")
        qa_system = initialize_qa_system(vector_store)
        
        if not qa_system:
            status_container.error("❌ Failed to initialize QA system")
            return
        
        st.session_state.qa_system = qa_system
        progress_bar.progress(100)
        
        # Success message
        status_container.success("βœ… Documents processed successfully!")
        details_container.markdown(f"""
        πŸŽ‰ **Ready to chat!**
        - Documents processed: {len(document_pairs)}
        - Total chunks created: {len(chunks)}
        - Average chunk size: {sum(len(chunk) for chunk in chunks) / len(chunks):.0f} characters
        - Vector store initialized and saved
        - QA system ready
        - Session ID: {session_id}
        
        You can now start asking questions about your documents!
        """)
        
        st.balloons()
        st.session_state.chat_ready = True
        
    except Exception as e:
        status_container.error(f"❌ Error processing documents: {str(e)}")
        details_container.error(traceback.format_exc())
        st.session_state.vector_store = None
        st.session_state.qa_system = None
        st.session_state.chat_ready = False
        
    finally:
        # Clean up progress display after successful processing
        if st.session_state.get('qa_system') is not None:
            time.sleep(5)
            progress_container.empty()

def display_vector_store_info():
    """Display information about the current vector store state."""
    if 'vector_store' not in st.session_state:
        st.info("ℹ️ No documents loaded yet.")
        return
        
    try:
        # Get the vector store from session state
        vector_store = st.session_state.vector_store
        
        # Get basic stats
        test_query = vector_store.similarity_search("test", k=1)
        doc_count = len(test_query)
        
        # Create an expander for detailed info
        with st.expander("πŸ“Š Knowledge Base Status"):
            col1, col2 = st.columns(2)
            
            with col1:
                st.metric(
                    label="Documents Loaded",
                    value=doc_count
                )
                
            with col2:
                st.metric(
                    label="System Status",
                    value="Ready" if verify_vector_store(vector_store) else "Not Ready"
                )
            
            # Display sample queries
            if verify_vector_store(vector_store):
                st.markdown("### πŸ” Sample Document Snippets")
                sample_docs = vector_store.similarity_search("", k=3)
                for i, doc in enumerate(sample_docs, 1):
                    with st.container():
                        st.markdown(f"**Snippet {i}:**")
                        st.text(doc.page_content[:200] + "...")
                        
    except Exception as e:
        st.error(f"Error displaying vector store info: {e}")
        st.error(traceback.format_exc())


def initialize_qa_system(vector_store):
    """Build a conversational QA chain over the given vector store.

    The chain maps the caller's input dict ({"input", "chat_history"})
    into a system-prompted ChatPromptTemplate and pipes it into GPT-4.
    Retrieved context (top-2 chunks) is injected into the human message.

    Args:
        vector_store: FAISS vector store to retrieve context from.

    Returns:
        A runnable chain (prompt | llm), or None when setup fails
        (the error is shown in the Streamlit UI).
    """
    try:
        llm = ChatOpenAI(
            temperature=0.5,
            model_name="gpt-4",
            api_key=os.environ.get("OPENAI_API_KEY")
        )

        # Create retriever function: only the 2 most similar chunks are used
        # as context for each question.
        retriever = vector_store.as_retriever(search_kwargs={"k": 2})

        # Create a template that enforces clean formatting
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert consultant specializing in analyzing Request for Proposal (RFP) documents. Your goal is to assist users by providing clear, concise, and professional insights based on the content provided. Please adhere to the following guidelines when crafting your responses:

Begin with a summary that highlights the key findings or answers the main query.

Structured Format: Use clear and descriptive section headers to organize the information logically.

Bullet Points: Utilize bullet points for lists or complex information to enhance readability.

Source Attribution: Cite specific sections or page numbers from the RFP document when referencing information.

Professional Formatting: Maintain a clean and professional layout using Markdown formatting where appropriate (e.g., headings, bold, italics).

Focused Content: Keep your responses concise and directly related to the user's query, avoiding unnecessary information.

Scope Awareness: If a query falls outside the provided information or context, politely acknowledge this and suggest consulting the relevant sections or additional sources.

Confidentiality: Respect the confidentiality of the information provided and avoid sharing any sensitive data beyond the scope of the query.

Tone and Language: Use formal and professional language, ensuring clarity and precision in your responses.

Accuracy: Double-check all information for accuracy and completeness before providing it to the user.

 
            """),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}\n\nContext: {context}")
        ])

        def get_chat_history(inputs):
            # Keep only well-formed message objects; anything else (or a
            # non-list history) degrades to an empty history.
            chat_history = inputs.get("chat_history", [])
            if not isinstance(chat_history, list):
                return []
            return [msg for msg in chat_history if isinstance(msg, BaseMessage)]

        def get_context(inputs):
            # Retrieve the top-k chunks and label each with its source file.
            # NOTE(review): get_relevant_documents is deprecated in newer
            # langchain releases in favor of retriever.invoke — confirm the
            # pinned langchain version supports it.
            docs = retriever.get_relevant_documents(inputs["input"])
            context_parts = []
            for doc in docs:
                source = doc.metadata.get('source', 'Unknown source')
                context_parts.append(f"\nFrom {source}:\n{doc.page_content}")
            return "\n".join(context_parts)

        # Assemble the runnable: the dict stage computes context/history/input
        # in parallel, then feeds the prompt, then the model.
        chain = (
            {
                "context": get_context,
                "chat_history": get_chat_history,
                "input": lambda x: x["input"]
            }
            | prompt
            | llm
        )

        return chain

    except Exception as e:
        st.error(f"Error initializing QA system: {e}")
        return None
        

# FAISS vector store initialization
def initialize_faiss(embeddings, documents, document_names):
    """Build a FAISS vector store from raw text chunks.

    Args:
        embeddings: Embedding model used to vectorize the texts.
        documents (list[str]): Text chunks to index.
        document_names (list): One entry per chunk — either a plain
            source name (str) or a ready-made metadata dict.
            handle_document_upload in this module passes chunk metadata
            dicts; the original code wrapped those in {"source": ...}
            again, producing nested metadata. Dicts are now passed
            through unchanged, while bare names keep the old wrapping
            (backward compatible).

    Returns:
        FAISS vector store, or None on failure (error shown in the UI).
    """
    try:
        from langchain.vectorstores import FAISS

        # Accept pre-built metadata dicts as-is; wrap bare names.
        metadatas = [
            meta if isinstance(meta, dict) else {"source": meta}
            for meta in document_names
        ]
        vector_store = FAISS.from_texts(
            documents,
            embeddings,
            metadatas=metadatas,
        )
        return vector_store
    except Exception as e:
        st.error(f"Error initializing FAISS: {e}")
        return None

# Embeddings model retrieval
@st.cache_resource
def get_embeddings_model():
    """Load (and cache via Streamlit) the sentence-transformer embeddings.

    Returns:
        HuggingFaceEmbeddings instance, or None when loading fails
        (the error is shown in the Streamlit UI).
    """
    try:
        from langchain.embeddings import HuggingFaceEmbeddings

        return HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
    except Exception as e:
        st.error(f"Error loading embeddings model: {e}")
        return None