Spaces:
Paused
Paused
| # Import Dependencies (dependencies.py) | |
| import streamlit as st | |
| from langchain.chains import RetrievalQA | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader | |
| from transformers import pipeline | |
| import re | |
| import sqlite3 | |
| from sqlite3 import Error | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| import requests | |
| import pandas as pd | |
| from pydrive.auth import GoogleAuth | |
| from pydrive.drive import GoogleDrive | |
| from io import BytesIO | |
| from googleapiclient.discovery import build | |
| from googleapiclient.http import MediaIoBaseDownload | |
| from google.oauth2 import service_account | |
| import tempfile | |
| import os | |
| from langchain.llms import OpenAI # Import the OpenAI class | |
| from langchain.chat_models import ChatOpenAI # Import ChatOpenAI | |
| from langchain.memory import ConversationBufferMemory | |
| from langchain.agents import create_openai_tools_agent, AgentExecutor, Tool | |
| from langchain.prompts import ( | |
| ChatPromptTemplate, | |
| MessagesPlaceholder, | |
| ) # Import necessary classes | |
| # SQLite Database Functions (database.py) | |
def create_connection(db_file):
    """Open a SQLite connection to ``db_file``.

    Args:
        db_file: Database path (or any target accepted by ``sqlite3.connect``).

    Returns:
        The open ``sqlite3`` connection, or ``None`` when ``sqlite3`` raises an
        ``Error``; in that case the error is shown through ``st.error``.
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        st.error(f"Error: {exc}")
    # Only reached after a reported failure.
    return None
def create_tables(conn):
    """Create the ``documents``, ``queries`` and ``annotations`` tables.

    Each table is created with ``CREATE TABLE IF NOT EXISTS``, so calling this
    on a database that already has them leaves the existing tables in place.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        None. A ``sqlite3.Error`` raised while executing the statements is
        reported through ``st.error`` instead of propagating.
    """
    documents_ddl = """
    CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL,
        content TEXT NOT NULL,
        upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    """
    queries_ddl = """
    CREATE TABLE IF NOT EXISTS queries (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        query TEXT NOT NULL,
        response TEXT NOT NULL,
        document_id INTEGER,
        query_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        FOREIGN KEY (document_id) REFERENCES documents (id)
    );
    """
    annotations_ddl = """
    CREATE TABLE IF NOT EXISTS annotations (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        document_id INTEGER NOT NULL,
        annotation TEXT NOT NULL,
        page_number INTEGER,
        annotation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        FOREIGN KEY (document_id) REFERENCES documents (id)
    );
    """
    try:
        cursor = conn.cursor()
        # Parent table first, then the two tables that reference it.
        for ddl in (documents_ddl, queries_ddl, annotations_ddl):
            cursor.execute(ddl)
    except Error as exc:
        st.error(f"Error: {exc}")
| # FAISS Initialization (faiss_initialization.py) | |
def initialize_faiss(embeddings, documents, document_names):
    """Build a FAISS vector store from text snippets.

    Args:
        embeddings: Embedding model handed to ``FAISS.from_texts``.
        documents: Texts to index.
        document_names: Names used to build one ``{"source": name}`` metadata
            dict per entry.
            NOTE(review): presumably this must be the same length as
            ``documents`` so each text gets a source - confirm against callers.

    Returns:
        The vector store, or ``None`` when any exception is raised; the
        exception is reported through ``st.error``.
    """
    try:
        source_tags = [{"source": doc_name} for doc_name in document_names]
        return FAISS.from_texts(documents, embeddings, metadatas=source_tags)
    except Exception as exc:
        st.error(f"Error initializing FAISS: {exc}")
    return None
| # Document Upload & Parsing Functions (document_parsing.py) | |
def upload_and_parse_documents(documents):
    """Parse uploaded PDF files into text chunks and per-page text.

    Each upload is copied to a temporary file so ``PyPDFLoader`` can read it
    from disk. The temporary file is removed in a ``finally`` clause, so it is
    cleaned up even when loading or splitting fails.

    Args:
        documents: Iterable of uploaded file objects exposing ``.name`` and
            ``.read()``.

    Returns:
        A tuple ``(all_texts, document_names, document_pages)``:
        ``all_texts`` is a flat list of text chunks across all parsed files,
        ``document_names`` lists the names of the successfully parsed files,
        and ``document_pages`` holds, for each of those files in the same
        order, the list of its pages' full text.

    Files whose name was already parsed in this call are skipped with a
    warning. A file that fails to parse is reported through ``st.error`` and
    contributes nothing to any of the three lists.
    """
    all_texts = []
    document_names = []
    document_pages = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    for doc in documents:
        tmp_file_path = None
        try:
            if doc.name in document_names:
                st.warning(
                    f"Duplicate file name detected: {doc.name}. This file will be ignored.",
                    icon="⚠️",
                )
                continue  # Skip to the next file

            # delete=False: the loader reopens the file by path after we close it.
            with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
                # Record the path before writing so a failed write is still cleaned up.
                tmp_file_path = tmp_file.name
                tmp_file.write(doc.read())

            pages = PyPDFLoader(tmp_file_path).load()

            # Build this document's results locally and only publish them once
            # every page has been processed, so the three output lists stay
            # aligned even if a page fails halfway through.
            page_contents = []
            doc_chunks = []
            for page in pages:
                doc_chunks.extend(text_splitter.split_text(page.page_content))
                page_contents.append(page.page_content)

            document_names.append(doc.name)
            all_texts.extend(doc_chunks)
            document_pages.append(page_contents)
        except Exception as e:
            st.error(f"Error parsing document {doc.name}: {e}")
        finally:
            # Remove the temporary file on success and on failure alike.
            if tmp_file_path is not None and os.path.exists(tmp_file_path):
                os.remove(tmp_file_path)
    return all_texts, document_names, document_pages
def parse_pdf_from_url(url, timeout=30):
    """Download a PDF from ``url`` and split its text into chunks.

    The download is written to a uniquely named temporary file (rather than a
    fixed ``temp.pdf`` in the working directory) so concurrent calls cannot
    overwrite each other's data, and the file is removed when parsing ends.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            one, a stalled server would block the call indefinitely.

    Returns:
        ``(all_texts, document_name)`` where ``all_texts`` is the list of text
        chunks and ``document_name`` is the last ``/``-separated segment of
        ``url``. On any failure the error is shown through ``st.error`` and
        ``(None, None)`` is returned.
    """
    tmp_file_path = None
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # delete=False: the loader reopens the file by path after we close it.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(response.content)

        pages = PyPDFLoader(tmp_file_path).load()
        document_name = url.split("/")[-1]
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=100
        )
        all_texts = []
        for page in pages:
            all_texts.extend(text_splitter.split_text(page.page_content))
        return all_texts, document_name
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and non-2xx responses.
        st.error(f"Failed to download PDF from URL: {e}")
        return None, None
    except Exception as e:
        st.error(f"Error parsing PDF from URL: {e}")
        return None, None
    finally:
        # Runs on every exit path, including the returns above.
        if tmp_file_path is not None and os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)
def parse_pdf_from_google_drive(file_id):
    """Download a PDF from Google Drive and split its text into chunks.

    Credentials are built from the ``gdrive_service_account`` entry of
    ``st.secrets``. The file is streamed directly into a uniquely named
    temporary file (rather than a fixed ``temp_drive.pdf`` in the working
    directory), which is removed once parsing ends.

    Args:
        file_id: Drive file identifier passed to ``files().get_media``.

    Returns:
        ``(all_texts, document_name)`` where ``all_texts`` is the list of text
        chunks and ``document_name`` is ``"GoogleDrive_<file_id>.pdf"``. On any
        failure the error is shown through ``st.error`` and ``(None, None)`` is
        returned.
    """
    tmp_file_path = None
    try:
        # Authenticate and create the drive service
        credentials = service_account.Credentials.from_service_account_info(
            st.secrets["gdrive_service_account"],
            scopes=["https://www.googleapis.com/auth/drive"],
        )
        service = build("drive", "v3", credentials=credentials)
        request = service.files().get_media(fileId=file_id)

        # delete=False: the loader reopens the file by path after we close it.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file_path = tmp_file.name
            downloader = MediaIoBaseDownload(tmp_file, request)
            done = False
            while not done:
                # next_chunk() returns (progress, finished); progress is unused.
                _, done = downloader.next_chunk()

        pages = PyPDFLoader(tmp_file_path).load()
        document_name = f"GoogleDrive_{file_id}.pdf"
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=100
        )
        all_texts = []
        for page in pages:
            all_texts.extend(text_splitter.split_text(page.page_content))
        return all_texts, document_name
    except Exception as e:
        st.error(f"Error downloading PDF from Google Drive: {e}")
        return None, None
    finally:
        # Runs on every exit path, including the returns above.
        if tmp_file_path is not None and os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)
| # Embeddings for Semantic Search (embeddings.py) | |
def get_embeddings_model():
    """Load the sentence-transformers embedding model used for semantic search.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2``, or ``None`` when
        construction raises; the exception is reported through ``st.error``.
    """
    try:
        return HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
    except Exception as exc:
        st.error(f"Error loading embeddings model: {exc}")
    return None
| # QA System Initialization (qa_system.py) | |
def initialize_qa_system(vector_store):
    """Build a retrieval-augmented chat chain over ``vector_store``.

    The returned chain takes a question, fetches the 3 most similar chunks
    from ``vector_store``, places their text in the system prompt as context,
    and sends the prompt to the chat model.

    Args:
        vector_store: Vector store exposing ``as_retriever``.

    Returns:
        A runnable chain, or ``None`` when construction fails (the exception
        is reported through ``st.error``). The chain accepts either the
        question string itself or a dict with the question under ``"input"``.
    """
    try:
        # Imported locally: the module's top-level imports do not bring any
        # runnable helpers into scope.
        from langchain.schema.runnable import RunnableLambda

        llm = ChatOpenAI(
            temperature=0.5,
            model_name="gpt-4",
            api_key=os.environ.get("OPENAI_API_KEY"),
        )

        # The system message carries a {context} slot; without it the
        # retrieved passages would be computed but never shown to the model.
        prompt = ChatPromptTemplate.from_messages([
            (
                "system",
                "You are an expert consultant specializing in analyzing "
                "Request for Proposal (RFP) documents.\n"
                "Your goal is to provide clear, accurate responses based on "
                "the provided context.\n"
                "Start with a direct answer and organize additional details "
                "under relevant headers.\n\n"
                "Context:\n{context}",
            ),
            ("human", "{input}"),
        ])

        retriever = vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 3},
        )

        def _question_of(payload):
            # Accept both chain.invoke("...") and chain.invoke({"input": "..."}).
            if isinstance(payload, dict):
                return payload["input"]
            return payload

        def _join_documents(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        question = RunnableLambda(_question_of)

        # The retriever must receive the bare question string, so the
        # question is extracted first on both branches of the mapping.
        chain = (
            {
                "input": question,
                "context": question | retriever | RunnableLambda(_join_documents),
            }
            | prompt
            | llm
        )
        return chain
    except Exception as e:
        st.error(f"Error initializing QA system: {e}")
        return None