Spaces:
Paused
Paused
File size: 17,419 Bytes
7333afe 09e8d84 61cecf9 03a5156 61cecf9 2329f67 61cecf9 2329f67 8a72544 09e8d84 64b3386 69ee452 64b3386 03a5156 50e05f5 4e1e6ae 50e05f5 03a5156 59815da 03a5156 9f3cc4f 03a5156 041398a 03a5156 14499b7 03a5156 44d3a72 14499b7 f4fbfef 35cf712 44d3a72 35cf712 6ab15d2 f61c1a4 368e7bf d7c897b f61c1a4 368e7bf f777bc6 d7c897b f61c1a4 368e7bf d7c897b 368e7bf f61c1a4 368e7bf f61c1a4 368e7bf 4e1e6ae d7c897b f61c1a4 368e7bf f61c1a4 368e7bf d7c897b 96aced1 368e7bf f61c1a4 368e7bf f61c1a4 f777bc6 d7c897b 96aced1 d7c897b 96aced1 d7c897b 368e7bf f61c1a4 368e7bf f61c1a4 368e7bf f61c1a4 368e7bf f61c1a4 368e7bf f61c1a4 368e7bf d7c897b f61c1a4 d7c897b f61c1a4 368e7bf f61c1a4 f777bc6 f61c1a4 d7c897b f61c1a4 f777bc6 f61c1a4 d7c897b f777bc6 f61c1a4 4e1e6ae 1118663 1957e91 8ea261f 1957e91 8ea261f 2bd9b5d 7333afe cbeca91 7333afe 09e8d84 09d4055 09e8d84 f4bcb1f 09e8d84 cbeca91 d39a1aa 2bd9b5d 8c249d6 4571b33 fccb2e5 311a114 fccb2e5 8c249d6 939f85b cbeca91 2329f67 cbeca91 d39a1aa 914b163 2329f67 8c249d6 2329f67 7333afe 4571b33 7333afe 4590d5a 96aced1 03a5156 7333afe 03a5156 7333afe 59815da 7333afe 59815da 7333afe 03a5156 59815da 7333afe 03a5156 7333afe 59815da 7333afe 03a5156 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 |
# utils/database.py
# Standard library
import io
import os
import sqlite3
import tempfile
import time
import traceback
from datetime import datetime
from pathlib import Path
from sqlite3 import Error

# Third-party
import streamlit as st
from langchain_community.chat_models import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.messages import (
    HumanMessage,
    AIMessage,
    SystemMessage,
    BaseMessage
)
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import ConversationalRetrievalChain
# NOTE(review): duplicate ChatOpenAI import — this one shadows the
# langchain_community import above; consider removing one of the two.
from langchain.chat_models import ChatOpenAI
from langchain.agents import AgentExecutor, Tool, create_openai_tools_agent
from langchain.agents.format_scratchpad.tools import format_to_tool_messages
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser

# Local
from utils.document_chunker import DocumentChunker
def create_connection(db_file):
    """Open a connection to the SQLite database at *db_file*.

    Args:
        db_file: path to the database file (or ":memory:").

    Returns:
        The open connection, or None when the attempt fails (a
        user-facing error is shown via Streamlit in that case).
    """
    try:
        return sqlite3.connect(db_file)
    except Error:
        st.error("Failed to connect to database. Please try again or contact support.")
        return None
# Per-call connection factory: SQLite connections are not shareable across
# threads, so each caller gets a fresh one.
def get_db_connection():
    """Create a new SQLite connection to data/rfp_analysis.db.

    Ensures the data directory and the schema exist before returning.

    Returns:
        The open connection, or None on failure (error surfaced via Streamlit).
    """
    try:
        storage_dir = Path("data")
        storage_dir.mkdir(exist_ok=True)
        connection = sqlite3.connect(str(storage_dir / 'rfp_analysis.db'))
        # Guarantee the schema is present before handing the connection out.
        create_tables(connection)
        return connection
    except Exception as e:
        st.error(f"Database connection error: {str(e)}")
        return None
def create_tables(conn):
    """Create the documents, queries and annotations tables if missing.

    Args:
        conn: open SQLite connection. Errors are reported via Streamlit
        rather than raised.
    """
    schema_statements = (
        '''
        CREATE TABLE IF NOT EXISTS documents (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            content TEXT NOT NULL,
            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        ''',
        '''
        CREATE TABLE IF NOT EXISTS queries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            query TEXT NOT NULL,
            response TEXT NOT NULL,
            document_id INTEGER,
            query_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (document_id) REFERENCES documents (id)
        );
        ''',
        '''
        CREATE TABLE IF NOT EXISTS annotations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            document_id INTEGER NOT NULL,
            annotation TEXT NOT NULL,
            page_number INTEGER,
            annotation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (document_id) REFERENCES documents (id)
        );
        ''',
    )
    try:
        for statement in schema_statements:
            conn.execute(statement)
    except Error as e:
        st.error(f"Error: {e}")
def insert_document(name, content):
    """Insert a document using a fresh thread-safe connection.

    NOTE(review): this definition is shadowed by the 3-argument
    insert_document(conn, name, content) defined later in this module, so
    callers actually bind the later version — consider removing one of the
    two definitions.

    Args:
        name: document name (e.g. the uploaded file name).
        content: full document text.

    Returns:
        int: id of the inserted row, or None on failure.
    """
    # FIX: bind conn before the try block — previously, if get_db_connection()
    # itself raised, the `if conn:` in the except handler hit a NameError.
    conn = None
    try:
        conn = get_db_connection()
        if conn is None:
            return None
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO documents (name, content) VALUES (?, ?)",
            (name, content)
        )
        conn.commit()
        doc_id = cursor.lastrowid
        conn.close()
        return doc_id
    except Exception as e:
        st.error(f"Error inserting document: {str(e)}")
        if conn:
            conn.rollback()
            conn.close()
        return None
def get_documents(conn):
    """Fetch every stored document.

    Args:
        conn: open SQLite connection.

    Returns:
        tuple: (list of document contents, list of document names); both
        lists are empty when the table has no rows or on a database error.
    """
    try:
        rows = conn.execute("SELECT content, name FROM documents").fetchall()
    except Error as e:
        st.error(f"Error retrieving documents: {e}")
        return [], []
    if not rows:
        return [], []
    # Transpose the (content, name) rows into two parallel lists.
    contents, names = zip(*rows)
    return list(contents), list(names)
def insert_document(conn, name, content):
    """Insert one document row into the given connection's database.

    Args:
        conn: open SQLite connection.
        name (str): name of the document.
        content (str): content of the document.

    Returns:
        int: id of the inserted document, or None if insertion failed.
    """
    insert_sql = '''INSERT INTO documents (name, content)
                    VALUES (?, ?)'''
    try:
        cur = conn.cursor()
        cur.execute(insert_sql, (name, content))
        conn.commit()
        return cur.lastrowid
    except Error as e:
        st.error(f"Error inserting document: {e}")
        return None
def verify_vector_store(vector_store):
    """Check that the vector store can serve at least one document.

    Args:
        vector_store: FAISS vector store instance.

    Returns:
        bool: True when a probe similarity search returns a result;
        False when it returns nothing or raises.
    """
    try:
        # A one-result probe search doubles as a health check.
        probe = vector_store.similarity_search("test", k=1)
    except Exception as e:
        st.error(f"Vector store verification failed: {e}")
        return False
    return len(probe) > 0
def handle_document_upload(uploaded_files):
    """Handle document upload with improved chunking and progress tracking.

    Pipeline: store each uploaded PDF's raw text in the database, chunk the
    texts, build a FAISS vector store, then wire up the QA system — updating
    a Streamlit progress bar throughout. Results are published through
    st.session_state (vector_store, qa_system, current_session_id,
    chat_ready).

    Args:
        uploaded_files: iterable of Streamlit UploadedFile objects (PDFs).

    NOTE(review): relies on PersistenceManager, get_embeddings_model,
    initialize_faiss and initialize_qa_system being defined/importable at
    module scope, and on st.session_state.db_conn holding an open
    connection — confirm the calling page sets these up.
    """
    # Initialize containers first - before any processing
    progress_container = st.empty()
    status_container = st.empty()
    details_container = st.empty()
    progress_bar = progress_container.progress(0)
    try:
        # FIX: guard against an empty upload list — previously
        # `70 / len(uploaded_files)` below raised ZeroDivisionError.
        if not uploaded_files:
            status_container.error("β No files were uploaded")
            return
        # Initialize session state variables
        if 'qa_system' not in st.session_state:
            st.session_state.qa_system = None
        if 'vector_store' not in st.session_state:
            st.session_state.vector_store = None
        # Initialize persistence manager
        persistence = PersistenceManager()
        # Generate a session ID based on timestamp and files
        session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        # Initialize embeddings (10% progress)
        status_container.info("π Initializing embeddings model...")
        embeddings = get_embeddings_model()
        if not embeddings:
            status_container.error("β Failed to initialize embeddings model")
            return
        progress_bar.progress(10)
        # Initialize document chunker
        chunker = DocumentChunker(
            chunk_size=1000,
            chunk_overlap=200,
            max_tokens_per_chunk=2000
        )
        # Process documents; files get the 10%-80% band of the bar.
        document_pairs = []  # List to store (content, filename) pairs
        progress_per_file = 70 / len(uploaded_files)
        current_progress = 10
        for idx, uploaded_file in enumerate(uploaded_files):
            file_name = uploaded_file.name
            status_container.info(f"π Processing document {idx + 1}/{len(uploaded_files)}: {file_name}")
            details_container.text(f"π Current file: {file_name}")
            # Create temporary file for PDF processing (delete=False so the
            # path remains valid after the with-block closes the handle).
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file.flush()
            try:
                # Load PDF content
                loader = PyPDFLoader(tmp_file.name)
                pdf_documents = loader.load()
                content = "\n".join(doc.page_content for doc in pdf_documents)
                # Store original content in database
                doc_id = insert_document(st.session_state.db_conn, file_name, content)
                if not doc_id:
                    status_container.error(f"β Failed to store document: {file_name}")
                    continue
                document_pairs.append((content, file_name))
            finally:
                # Ensure temporary file is cleaned up
                try:
                    os.unlink(tmp_file.name)
                except Exception as e:
                    st.warning(f"Could not delete temporary file: {e}")
            current_progress += progress_per_file
            progress_bar.progress(int(current_progress))
        if not document_pairs:
            status_container.error("β No documents were successfully processed")
            return
        # Chunk documents (80% progress)
        status_container.info("π Chunking documents...")
        details_container.text("π Splitting documents into manageable chunks...")
        chunks, chunk_metadatas = chunker.process_documents(document_pairs)
        if not chunks:
            status_container.error("β Failed to chunk documents")
            return
        progress_bar.progress(80)
        # Save chunks for persistence
        persistence.save_chunks(chunks, chunk_metadatas, session_id)
        # Initialize vector store (90% progress)
        status_container.info("π Initializing vector store...")
        details_container.text("π Creating vector embeddings...")
        # NOTE(review): initialize_faiss's third parameter is named
        # document_names and is wrapped as {"source": name}; passing
        # chunk_metadatas here means metadata dicts end up nested under
        # "source" — confirm this is intended.
        vector_store = initialize_faiss(embeddings, chunks, chunk_metadatas)
        if not vector_store:
            status_container.error("β Failed to initialize vector store")
            return
        # Save vector store and update session state
        persistence.save_vector_store(vector_store, session_id)
        st.session_state.vector_store = vector_store
        st.session_state.current_session_id = session_id
        progress_bar.progress(90)
        # Initialize QA system (100% progress)
        status_container.info("π Setting up QA system...")
        qa_system = initialize_qa_system(vector_store)
        if not qa_system:
            status_container.error("β Failed to initialize QA system")
            return
        st.session_state.qa_system = qa_system
        progress_bar.progress(100)
        # Success message.
        # FIX: this string literal was split across two physical lines
        # (unterminated string → SyntaxError); rejoined onto one line.
        status_container.success("β Documents processed successfully!")
        details_container.markdown(f"""
        π **Ready to chat!**
        - Documents processed: {len(document_pairs)}
        - Total chunks created: {len(chunks)}
        - Average chunk size: {sum(len(chunk) for chunk in chunks) / len(chunks):.0f} characters
        - Vector store initialized and saved
        - QA system ready
        - Session ID: {session_id}
        You can now start asking questions about your documents!
        """)
        st.balloons()
        st.session_state.chat_ready = True
    except Exception as e:
        status_container.error(f"β Error processing documents: {str(e)}")
        details_container.error(traceback.format_exc())
        st.session_state.vector_store = None
        st.session_state.qa_system = None
        st.session_state.chat_ready = False
    finally:
        # Clean up progress display after successful processing
        if st.session_state.get('qa_system') is not None:
            time.sleep(5)
            progress_container.empty()
def display_vector_store_info():
    """Display information about the current vector store state.

    Renders a Streamlit expander with a document count, a readiness metric,
    and up to three sample snippets pulled from the store.
    """
    # FIX: the store is explicitly set to None elsewhere in this module
    # (e.g. after a failed upload), so a plain membership test let a stored
    # None fall through to similarity_search and hit the error path.
    if st.session_state.get('vector_store') is None:
        st.info("βΉοΈ No documents loaded yet.")
        return
    try:
        # Get the vector store from session state
        vector_store = st.session_state.vector_store
        # Get basic stats via a one-result probe search
        test_query = vector_store.similarity_search("test", k=1)
        doc_count = len(test_query)
        # Create an expander for detailed info
        with st.expander("π Knowledge Base Status"):
            col1, col2 = st.columns(2)
            with col1:
                st.metric(
                    label="Documents Loaded",
                    value=doc_count
                )
            with col2:
                st.metric(
                    label="System Status",
                    value="Ready" if verify_vector_store(vector_store) else "Not Ready"
                )
            # Display sample queries
            if verify_vector_store(vector_store):
                st.markdown("### π Sample Document Snippets")
                sample_docs = vector_store.similarity_search("", k=3)
                for i, doc in enumerate(sample_docs, 1):
                    with st.container():
                        st.markdown(f"**Snippet {i}:**")
                        st.text(doc.page_content[:200] + "...")
    except Exception as e:
        st.error(f"Error displaying vector store info: {e}")
        st.error(traceback.format_exc())
def initialize_qa_system(vector_store):
    """Initialize QA system with proper chat handling.

    Builds an LCEL chain: an input dict {"input", "chat_history"} is mapped
    to {"context", "chat_history", "input"}, piped through the RFP-analyst
    prompt, then through GPT-4.

    Args:
        vector_store: FAISS store used as the retriever (top-2 chunks per query).

    Returns:
        The runnable chain, or None if setup failed (error shown via Streamlit).
    """
    try:
        # GPT-4 at moderate temperature; API key read from the environment.
        llm = ChatOpenAI(
            temperature=0.5,
            model_name="gpt-4",
            api_key=os.environ.get("OPENAI_API_KEY")
        )
        # Create retriever function
        retriever = vector_store.as_retriever(search_kwargs={"k": 2})
        # Create a template that enforces clean formatting
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert consultant specializing in analyzing Request for Proposal (RFP) documents. Your goal is to assist users by providing clear, concise, and professional insights based on the content provided. Please adhere to the following guidelines when crafting your responses:
Begin with a summary that highlights the key findings or answers the main query.
Structured Format: Use clear and descriptive section headers to organize the information logically.
Bullet Points: Utilize bullet points for lists or complex information to enhance readability.
Source Attribution: Cite specific sections or page numbers from the RFP document when referencing information.
Professional Formatting: Maintain a clean and professional layout using Markdown formatting where appropriate (e.g., headings, bold, italics).
Focused Content: Keep your responses concise and directly related to the user's query, avoiding unnecessary information.
Scope Awareness: If a query falls outside the provided information or context, politely acknowledge this and suggest consulting the relevant sections or additional sources.
Confidentiality: Respect the confidentiality of the information provided and avoid sharing any sensitive data beyond the scope of the query.
Tone and Language: Use formal and professional language, ensuring clarity and precision in your responses.
Accuracy: Double-check all information for accuracy and completeness before providing it to the user.
"""),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}\n\nContext: {context}")
        ])
        def get_chat_history(inputs):
            # Pass through only well-formed LangChain message objects;
            # anything that isn't a list collapses to empty history.
            chat_history = inputs.get("chat_history", [])
            if not isinstance(chat_history, list):
                return []
            return [msg for msg in chat_history if isinstance(msg, BaseMessage)]
        def get_context(inputs):
            # Format retrieved chunks with their source attribution.
            # NOTE(review): get_relevant_documents is deprecated in newer
            # LangChain releases in favor of retriever.invoke — confirm the
            # pinned version.
            docs = retriever.get_relevant_documents(inputs["input"])
            context_parts = []
            for doc in docs:
                source = doc.metadata.get('source', 'Unknown source')
                context_parts.append(f"\nFrom {source}:\n{doc.page_content}")
            return "\n".join(context_parts)
        # Assemble the per-call input mapping, then pipe prompt -> model.
        chain = (
            {
                "context": get_context,
                "chat_history": get_chat_history,
                "input": lambda x: x["input"]
            }
            | prompt
            | llm
        )
        return chain
    except Exception as e:
        st.error(f"Error initializing QA system: {e}")
        return None
# FAISS vector store initialization
def initialize_faiss(embeddings, documents, document_names):
    """Build a FAISS vector store from raw texts.

    Args:
        embeddings: embeddings model used to vectorize the texts.
        documents: list of text strings to index.
        document_names: one source label per text, stored as metadata.

    Returns:
        The FAISS store, or None on failure (error shown via Streamlit).
    """
    try:
        from langchain.vectorstores import FAISS
        # Tag every text with its originating source for later attribution.
        source_metadata = [{"source": name} for name in document_names]
        return FAISS.from_texts(documents, embeddings, metadatas=source_metadata)
    except Exception as e:
        st.error(f"Error initializing FAISS: {e}")
        return None
# Embeddings model retrieval
@st.cache_resource
def get_embeddings_model():
    """Load the MiniLM sentence-transformer embeddings model.

    Cached by Streamlit so the model is loaded once per server process.

    Returns:
        The HuggingFaceEmbeddings instance, or None if loading failed.
    """
    try:
        from langchain.embeddings import HuggingFaceEmbeddings
        return HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
    except Exception as e:
        st.error(f"Error loading embeddings model: {e}")
        return None