|
|
|
|
|
import streamlit as st |
|
|
from langchain.chains import RetrievalQA |
|
|
from langchain_community.vectorstores import FAISS |
|
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
|
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader |
|
|
from transformers import pipeline |
|
|
import re |
|
|
import sqlite3 |
|
|
from sqlite3 import Error |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
import requests |
|
|
import pandas as pd |
|
|
from pydrive.auth import GoogleAuth |
|
|
from pydrive.drive import GoogleDrive |
|
|
from io import BytesIO |
|
|
from googleapiclient.discovery import build |
|
|
from googleapiclient.http import MediaIoBaseDownload |
|
|
from google.oauth2 import service_account |
|
|
import tempfile |
|
|
import os |
|
|
from langchain.llms import OpenAI |
|
|
from langchain.chat_models import ChatOpenAI |
|
|
from langchain.memory import ConversationBufferMemory |
|
|
from langchain.agents import create_openai_tools_agent, AgentExecutor, Tool |
|
|
from langchain.prompts import ( |
|
|
ChatPromptTemplate, |
|
|
MessagesPlaceholder, |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
def create_connection(db_file):
    """Open a SQLite connection to ``db_file``.

    Args:
        db_file: Database location handed straight to ``sqlite3.connect``.

    Returns:
        The open connection, or ``None`` when ``sqlite3`` raises an ``Error``
        (the error is shown to the user through ``st.error``).
    """
    try:
        connection = sqlite3.connect(db_file)
    except Error as db_error:
        st.error(f"Error: {db_error}")
        return None
    return connection
|
|
|
|
|
|
|
|
def create_tables(conn):
    """Create the ``documents``, ``queries`` and ``annotations`` tables.

    Every statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this
    function repeatedly on the same database is safe.

    Args:
        conn: An open ``sqlite3`` connection, or ``None``. ``None`` is
            accepted because ``create_connection`` returns it on failure;
            in that case an error is reported and nothing is executed.

    Returns:
        None. ``sqlite3`` errors are reported through ``st.error`` instead of
        being raised.
    """
    if conn is None:
        # Without this guard a failed connection surfaces as an uncaught
        # AttributeError, because only sqlite3.Error is handled below.
        st.error("Error: no database connection available")
        return

    create_statements = (
        """
        CREATE TABLE IF NOT EXISTS documents (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            content TEXT NOT NULL,
            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS queries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            query TEXT NOT NULL,
            response TEXT NOT NULL,
            document_id INTEGER,
            query_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (document_id) REFERENCES documents (id)
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS annotations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            document_id INTEGER NOT NULL,
            annotation TEXT NOT NULL,
            page_number INTEGER,
            annotation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (document_id) REFERENCES documents (id)
        );
        """,
    )

    try:
        cursor = conn.cursor()
        try:
            # documents must come first: the other two tables reference it.
            for statement in create_statements:
                cursor.execute(statement)
        finally:
            cursor.close()
        # Explicit commit so the schema is persisted even if the caller's
        # connection happens to have a transaction open.
        conn.commit()
    except Error as e:
        st.error(f"Error: {e}")
|
|
|
|
|
|
|
|
|
|
|
def initialize_faiss(embeddings, documents, document_names):
    """Build a FAISS vector store from text chunks.

    Args:
        embeddings: Embedding model passed to ``FAISS.from_texts``.
        documents: Texts to index.
        document_names: Names used to build one ``{"source": name}`` metadata
            dict each.

    Returns:
        The vector store, or ``None`` when construction raises (the error is
        shown through ``st.error``).
    """
    try:
        # NOTE(review): one metadata dict is produced per entry of
        # document_names; presumably this must line up one-to-one with
        # `documents` -- confirm against the caller.
        source_metadata = []
        for name in document_names:
            source_metadata.append({"source": name})
        return FAISS.from_texts(documents, embeddings, metadatas=source_metadata)
    except Exception as build_error:
        st.error(f"Error initializing FAISS: {build_error}")
        return None
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data
def upload_and_parse_documents(documents):
    """Parse uploaded PDF files into text chunks and per-page text.

    Each upload is written to a temporary file, loaded with ``PyPDFLoader``
    and split page by page with a ``RecursiveCharacterTextSplitter``
    (chunk size 1000, overlap 100). A file whose name was already processed
    in this call is skipped with a warning.

    Args:
        documents: Iterable of uploaded file objects exposing ``.name`` and
            ``.read()``.

    Returns:
        A tuple ``(all_texts, document_names, document_pages)``:
        the chunks of every successfully parsed document, the names of those
        documents, and for each of them the list of raw page texts.
        A document that fails to parse is reported through ``st.error`` and
        contributes nothing to any of the three lists.
    """
    all_texts = []
    document_names = []
    document_pages = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

    for doc in documents:
        tmp_file_path = None
        try:
            if doc.name in document_names:
                st.warning(
                    f"Duplicate file name detected: {doc.name}. This file will be ignored.",
                    icon="⚠️",
                )
                continue

            with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
                # Record the path first so cleanup still happens if the
                # write itself fails.
                tmp_file_path = tmp_file.name
                tmp_file.write(doc.read())

            pages = PyPDFLoader(tmp_file_path).load()

            # Collect into locals and publish only after the whole document
            # parsed, so the three result lists never get out of sync.
            doc_chunks = []
            page_contents = []
            for page in pages:
                doc_chunks.extend(text_splitter.split_text(page.page_content))
                page_contents.append(page.page_content)

            document_names.append(doc.name)
            all_texts.extend(doc_chunks)
            document_pages.append(page_contents)
        except Exception as e:
            st.error(f"Error parsing document {doc.name}: {e}")
        finally:
            # Remove the temp file on every path, including parse failures.
            if tmp_file_path is not None:
                try:
                    os.remove(tmp_file_path)
                except OSError:
                    # Deliberate best-effort cleanup: a leftover temp file
                    # must not discard an otherwise successful parse.
                    pass

    return all_texts, document_names, document_pages
|
|
|
|
|
|
|
|
@st.cache_data
def parse_pdf_from_url(url):
    """Download a PDF from ``url`` and split its text into chunks.

    The response body is stored in a uniquely named temporary file (removed
    again before returning), loaded with ``PyPDFLoader`` and split page by
    page with a ``RecursiveCharacterTextSplitter`` (chunk size 1000,
    overlap 100).

    Args:
        url: Address of the PDF to download.

    Returns:
        ``(all_texts, document_name)`` where ``document_name`` is the text
        after the last ``/`` of ``url``; ``(None, None)`` when the download
        or the parsing fails (the failure is shown through ``st.error``).
    """
    tmp_file_path = None
    try:
        # A timeout keeps an unresponsive server from blocking the app
        # forever; timeouts are RequestException subclasses, handled below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # A unique temp file instead of a fixed "temp.pdf" in the working
        # directory, so concurrent sessions cannot overwrite each other.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(response.content)

        pages = PyPDFLoader(tmp_file_path).load()
        document_name = url.split("/")[-1]
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=100
        )
        all_texts = []
        for page in pages:
            all_texts.extend(text_splitter.split_text(page.page_content))
        return all_texts, document_name
    except requests.exceptions.RequestException as e:
        st.error(f"Failed to download PDF from URL: {e}")
        return None, None
    except Exception as e:
        st.error(f"Error parsing PDF from URL: {e}")
        return None, None
    finally:
        if tmp_file_path is not None:
            try:
                os.remove(tmp_file_path)
            except OSError:
                # Deliberate best-effort cleanup; never mask the real result.
                pass
|
|
|
|
|
|
|
|
@st.cache_data
def parse_pdf_from_google_drive(file_id):
    """Download a PDF from Google Drive and split its text into chunks.

    Credentials are built from the ``gdrive_service_account`` entry of
    ``st.secrets``. The file is downloaded chunk by chunk into memory,
    written to a uniquely named temporary file (removed again before
    returning), loaded with ``PyPDFLoader`` and split page by page with a
    ``RecursiveCharacterTextSplitter`` (chunk size 1000, overlap 100).

    Args:
        file_id: Google Drive id of the file to download.

    Returns:
        ``(all_texts, document_name)`` with ``document_name`` set to
        ``GoogleDrive_<file_id>.pdf``; ``(None, None)`` on any failure (the
        failure is shown through ``st.error``).
    """
    tmp_file_path = None
    try:
        credentials = service_account.Credentials.from_service_account_info(
            st.secrets["gdrive_service_account"],
            scopes=["https://www.googleapis.com/auth/drive"],
        )
        service = build("drive", "v3", credentials=credentials)
        request = service.files().get_media(fileId=file_id)

        buffer = BytesIO()
        downloader = MediaIoBaseDownload(buffer, request)
        done = False
        while not done:
            # The progress status is not used; only the completion flag is.
            _status, done = downloader.next_chunk()

        # A unique temp file instead of a fixed "temp_drive.pdf" in the
        # working directory, so concurrent sessions cannot overwrite each
        # other.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(buffer.getvalue())

        pages = PyPDFLoader(tmp_file_path).load()
        document_name = f"GoogleDrive_{file_id}.pdf"
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=100
        )
        all_texts = []
        for page in pages:
            all_texts.extend(text_splitter.split_text(page.page_content))
        return all_texts, document_name
    except Exception as e:
        st.error(f"Error downloading PDF from Google Drive: {e}")
        return None, None
    finally:
        if tmp_file_path is not None:
            try:
                os.remove(tmp_file_path)
            except OSError:
                # Deliberate best-effort cleanup; never mask the real result.
                pass
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def get_embeddings_model():
    """Load the ``sentence-transformers/all-MiniLM-L6-v2`` embedding model.

    Returns:
        A ``HuggingFaceEmbeddings`` instance, or ``None`` when loading raises
        (the error is shown through ``st.error``).
    """
    try:
        return HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
    except Exception as load_error:
        st.error(f"Error loading embeddings model: {load_error}")
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): per the Streamlit docs, arguments whose name starts with an
# underscore are excluded from the cache key. This function has no other
# argument, so the first executor built is presumably reused even when a
# different vector store is passed later -- confirm this is intended.
@st.cache_resource
def initialize_qa_system(_vector_store):
    """Build the tool-calling agent that answers questions about the documents.

    The agent uses a ``gpt-4`` ``ChatOpenAI`` model (temperature 0, API key
    read from the ``OPENAI_API_KEY`` environment variable) and a single
    ``Search`` tool that returns the 2 most relevant entries of
    ``_vector_store``. Conversation history is kept in a
    ``ConversationBufferMemory`` under the ``chat_history`` key.

    Args:
        _vector_store: Vector store exposing ``as_retriever``.

    Returns:
        The configured ``AgentExecutor``, or ``None`` when construction
        raises (the error is shown through ``st.error``).
    """
    try:
        llm = ChatOpenAI(
            temperature=0,
            model_name="gpt-4",
            api_key=os.environ.get("OPENAI_API_KEY"),
        )

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "You are a helpful assistant"),
                MessagesPlaceholder(variable_name="chat_history"),
                ("human", "{input}"),
                MessagesPlaceholder(variable_name="agent_scratchpad"),
            ]
        )

        retriever = _vector_store.as_retriever(search_kwargs={"k": 2})
        tools = [
            Tool(
                name="Search",
                func=retriever.get_relevant_documents,
                description="useful for when you need to answer questions about the documents you have been uploaded. Input should be a fully formed question.",
            )
        ]

        # The prompt fills "chat_history" through a MessagesPlaceholder, which
        # needs a list of messages; without return_messages=True the memory
        # hands back the history as one formatted string instead.
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
        )

        agent = create_openai_tools_agent(llm=llm, tools=tools, prompt=prompt)
        agent_executor = AgentExecutor(
            agent=agent,
            tools=tools,
            verbose=True,
            memory=memory,
        )
        return agent_executor
    except Exception as e:
        st.error(f"Error initializing QA system: {e}")
        return None