Spaces:
Build error
Build error
Update backend.py
Browse files- backend.py +54 -25
backend.py
CHANGED
|
@@ -23,7 +23,10 @@ from langchain.llms import OpenAI # Import the OpenAI class
|
|
| 23 |
from langchain.chat_models import ChatOpenAI # Import ChatOpenAI
|
| 24 |
from langchain.memory import ConversationBufferMemory
|
| 25 |
from langchain.agents import create_openai_tools_agent, AgentExecutor
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
# SQLite Database Functions (database.py)
|
|
@@ -35,18 +38,19 @@ def create_connection(db_file):
|
|
| 35 |
st.error(f"Error: {e}")
|
| 36 |
return None
|
| 37 |
|
|
|
|
| 38 |
def create_tables(conn):
|
| 39 |
try:
|
| 40 |
-
sql_create_documents_table =
|
| 41 |
CREATE TABLE IF NOT EXISTS documents (
|
| 42 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 43 |
name TEXT NOT NULL,
|
| 44 |
content TEXT NOT NULL,
|
| 45 |
upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
| 46 |
);
|
| 47 |
-
|
| 48 |
|
| 49 |
-
sql_create_queries_table =
|
| 50 |
CREATE TABLE IF NOT EXISTS queries (
|
| 51 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 52 |
query TEXT NOT NULL,
|
|
@@ -55,9 +59,9 @@ def create_tables(conn):
|
|
| 55 |
query_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 56 |
FOREIGN KEY (document_id) REFERENCES documents (id)
|
| 57 |
);
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
sql_create_annotations_table =
|
| 61 |
CREATE TABLE IF NOT EXISTS annotations (
|
| 62 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 63 |
document_id INTEGER NOT NULL,
|
|
@@ -66,8 +70,8 @@ def create_tables(conn):
|
|
| 66 |
annotation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 67 |
FOREIGN KEY (document_id) REFERENCES documents (id)
|
| 68 |
);
|
| 69 |
-
|
| 70 |
-
|
| 71 |
c = conn.cursor()
|
| 72 |
c.execute(sql_create_documents_table)
|
| 73 |
c.execute(sql_create_queries_table)
|
|
@@ -75,15 +79,21 @@ def create_tables(conn):
|
|
| 75 |
except Error as e:
|
| 76 |
st.error(f"Error: {e}")
|
| 77 |
|
|
|
|
| 78 |
# FAISS Initialization (faiss_initialization.py)
|
| 79 |
def initialize_faiss(embeddings, documents, document_names):
|
| 80 |
try:
|
| 81 |
-
vector_store = FAISS.from_texts(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
return vector_store
|
| 83 |
except Exception as e:
|
| 84 |
st.error(f"Error initializing FAISS: {e}")
|
| 85 |
return None
|
| 86 |
|
|
|
|
| 87 |
# Document Upload & Parsing Functions (document_parsing.py)
|
| 88 |
@st.cache_data
|
| 89 |
def upload_and_parse_documents(documents):
|
|
@@ -94,7 +104,10 @@ def upload_and_parse_documents(documents):
|
|
| 94 |
for doc in documents:
|
| 95 |
try:
|
| 96 |
if doc.name in document_names:
|
| 97 |
-
st.warning(
|
|
|
|
|
|
|
|
|
|
| 98 |
continue # Skip to the next file
|
| 99 |
|
| 100 |
# Create a temporary file
|
|
@@ -118,7 +131,8 @@ def upload_and_parse_documents(documents):
|
|
| 118 |
except Exception as e:
|
| 119 |
st.error(f"Error parsing document {doc.name}: {e}")
|
| 120 |
return all_texts, document_names, document_pages
|
| 121 |
-
|
|
|
|
| 122 |
@st.cache_data
|
| 123 |
def parse_pdf_from_url(url):
|
| 124 |
try:
|
|
@@ -130,7 +144,9 @@ def parse_pdf_from_url(url):
|
|
| 130 |
pages = loader.load()
|
| 131 |
all_texts = []
|
| 132 |
document_name = url.split("/")[-1]
|
| 133 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
|
|
|
|
| 134 |
for page in pages:
|
| 135 |
chunks = text_splitter.split_text(page.page_content)
|
| 136 |
all_texts.extend(chunks)
|
|
@@ -142,15 +158,16 @@ def parse_pdf_from_url(url):
|
|
| 142 |
st.error(f"Error parsing PDF from URL: {e}")
|
| 143 |
return None, None
|
| 144 |
|
|
|
|
| 145 |
@st.cache_data
|
| 146 |
def parse_pdf_from_google_drive(file_id):
|
| 147 |
try:
|
| 148 |
# Authenticate and create the drive service
|
| 149 |
credentials = service_account.Credentials.from_service_account_info(
|
| 150 |
st.secrets["gdrive_service_account"],
|
| 151 |
-
scopes=["https://www.googleapis.com/auth/drive"]
|
| 152 |
)
|
| 153 |
-
service = build(
|
| 154 |
request = service.files().get_media(fileId=file_id)
|
| 155 |
fh = BytesIO()
|
| 156 |
downloader = MediaIoBaseDownload(fh, request)
|
|
@@ -164,7 +181,9 @@ def parse_pdf_from_google_drive(file_id):
|
|
| 164 |
pages = loader.load()
|
| 165 |
all_texts = []
|
| 166 |
document_name = f"GoogleDrive_{file_id}.pdf"
|
| 167 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
|
|
|
|
| 168 |
for page in pages:
|
| 169 |
chunks = text_splitter.split_text(page.page_content)
|
| 170 |
all_texts.extend(chunks)
|
|
@@ -173,6 +192,7 @@ def parse_pdf_from_google_drive(file_id):
|
|
| 173 |
st.error(f"Error downloading PDF from Google Drive: {e}")
|
| 174 |
return None, None
|
| 175 |
|
|
|
|
| 176 |
# Embeddings for Semantic Search (embeddings.py)
|
| 177 |
@st.cache_resource
|
| 178 |
def get_embeddings_model():
|
|
@@ -184,6 +204,7 @@ def get_embeddings_model():
|
|
| 184 |
st.error(f"Error loading embeddings model: {e}")
|
| 185 |
return None
|
| 186 |
|
|
|
|
| 187 |
# QA System Initialization (qa_system.py)
|
| 188 |
|
| 189 |
|
|
@@ -193,29 +214,37 @@ def initialize_qa_system(_vector_store):
|
|
| 193 |
llm = ChatOpenAI(
|
| 194 |
temperature=0,
|
| 195 |
model_name="gpt-4", # Or another OpenAI model like "gpt-3.5-turbo"
|
| 196 |
-
api_key=os.environ.get(
|
| 197 |
)
|
| 198 |
|
| 199 |
# Define the prompt template
|
| 200 |
-
prompt = ChatPromptTemplate.from_messages(
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
|
|
|
| 206 |
|
| 207 |
# Define the tools
|
| 208 |
tools = [
|
| 209 |
Tool(
|
| 210 |
name="Search",
|
| 211 |
-
func=_vector_store.as_retriever(
|
|
|
|
|
|
|
| 212 |
description="useful for when you need to answer questions about the documents you have been uploaded. Input should be a fully formed question.",
|
| 213 |
)
|
| 214 |
]
|
| 215 |
|
| 216 |
# Create the agent and executor
|
| 217 |
agent = create_openai_tools_agent(llm=llm, tools=tools, prompt=prompt)
|
| 218 |
-
agent_executor = AgentExecutor(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
return agent_executor # Return the agent executor
|
| 221 |
except Exception as e:
|
|
|
|
| 23 |
from langchain.chat_models import ChatOpenAI # Import ChatOpenAI
|
| 24 |
from langchain.memory import ConversationBufferMemory
|
| 25 |
from langchain.agents import create_openai_tools_agent, AgentExecutor
|
| 26 |
+
from langchain.prompts import (
|
| 27 |
+
ChatPromptTemplate,
|
| 28 |
+
MessagesPlaceholder,
|
| 29 |
+
) # Import necessary classes
|
| 30 |
|
| 31 |
|
| 32 |
# SQLite Database Functions (database.py)
|
|
|
|
| 38 |
st.error(f"Error: {e}")
|
| 39 |
return None
|
| 40 |
|
| 41 |
+
|
| 42 |
def create_tables(conn):
|
| 43 |
try:
|
| 44 |
+
sql_create_documents_table = """
|
| 45 |
CREATE TABLE IF NOT EXISTS documents (
|
| 46 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 47 |
name TEXT NOT NULL,
|
| 48 |
content TEXT NOT NULL,
|
| 49 |
upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
| 50 |
);
|
| 51 |
+
"""
|
| 52 |
|
| 53 |
+
sql_create_queries_table = """
|
| 54 |
CREATE TABLE IF NOT EXISTS queries (
|
| 55 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 56 |
query TEXT NOT NULL,
|
|
|
|
| 59 |
query_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 60 |
FOREIGN KEY (document_id) REFERENCES documents (id)
|
| 61 |
);
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
sql_create_annotations_table = """
|
| 65 |
CREATE TABLE IF NOT EXISTS annotations (
|
| 66 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 67 |
document_id INTEGER NOT NULL,
|
|
|
|
| 70 |
annotation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
| 71 |
FOREIGN KEY (document_id) REFERENCES documents (id)
|
| 72 |
);
|
| 73 |
+
"""
|
| 74 |
+
|
| 75 |
c = conn.cursor()
|
| 76 |
c.execute(sql_create_documents_table)
|
| 77 |
c.execute(sql_create_queries_table)
|
|
|
|
| 79 |
except Error as e:
|
| 80 |
st.error(f"Error: {e}")
|
| 81 |
|
| 82 |
+
|
| 83 |
# FAISS Initialization (faiss_initialization.py)
|
| 84 |
def initialize_faiss(embeddings, documents, document_names):
|
| 85 |
try:
|
| 86 |
+
vector_store = FAISS.from_texts(
|
| 87 |
+
documents,
|
| 88 |
+
embeddings,
|
| 89 |
+
metadatas=[{"source": name} for name in document_names],
|
| 90 |
+
)
|
| 91 |
return vector_store
|
| 92 |
except Exception as e:
|
| 93 |
st.error(f"Error initializing FAISS: {e}")
|
| 94 |
return None
|
| 95 |
|
| 96 |
+
|
| 97 |
# Document Upload & Parsing Functions (document_parsing.py)
|
| 98 |
@st.cache_data
|
| 99 |
def upload_and_parse_documents(documents):
|
|
|
|
| 104 |
for doc in documents:
|
| 105 |
try:
|
| 106 |
if doc.name in document_names:
|
| 107 |
+
st.warning(
|
| 108 |
+
f"Duplicate file name detected: {doc.name}. This file will be ignored.",
|
| 109 |
+
icon="⚠️",
|
| 110 |
+
)
|
| 111 |
continue # Skip to the next file
|
| 112 |
|
| 113 |
# Create a temporary file
|
|
|
|
| 131 |
except Exception as e:
|
| 132 |
st.error(f"Error parsing document {doc.name}: {e}")
|
| 133 |
return all_texts, document_names, document_pages
|
| 134 |
+
|
| 135 |
+
|
| 136 |
@st.cache_data
|
| 137 |
def parse_pdf_from_url(url):
|
| 138 |
try:
|
|
|
|
| 144 |
pages = loader.load()
|
| 145 |
all_texts = []
|
| 146 |
document_name = url.split("/")[-1]
|
| 147 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
| 148 |
+
chunk_size=1000, chunk_overlap=100
|
| 149 |
+
)
|
| 150 |
for page in pages:
|
| 151 |
chunks = text_splitter.split_text(page.page_content)
|
| 152 |
all_texts.extend(chunks)
|
|
|
|
| 158 |
st.error(f"Error parsing PDF from URL: {e}")
|
| 159 |
return None, None
|
| 160 |
|
| 161 |
+
|
| 162 |
@st.cache_data
|
| 163 |
def parse_pdf_from_google_drive(file_id):
|
| 164 |
try:
|
| 165 |
# Authenticate and create the drive service
|
| 166 |
credentials = service_account.Credentials.from_service_account_info(
|
| 167 |
st.secrets["gdrive_service_account"],
|
| 168 |
+
scopes=["https://www.googleapis.com/auth/drive"],
|
| 169 |
)
|
| 170 |
+
service = build("drive", "v3", credentials=credentials)
|
| 171 |
request = service.files().get_media(fileId=file_id)
|
| 172 |
fh = BytesIO()
|
| 173 |
downloader = MediaIoBaseDownload(fh, request)
|
|
|
|
| 181 |
pages = loader.load()
|
| 182 |
all_texts = []
|
| 183 |
document_name = f"GoogleDrive_{file_id}.pdf"
|
| 184 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
| 185 |
+
chunk_size=1000, chunk_overlap=100
|
| 186 |
+
)
|
| 187 |
for page in pages:
|
| 188 |
chunks = text_splitter.split_text(page.page_content)
|
| 189 |
all_texts.extend(chunks)
|
|
|
|
| 192 |
st.error(f"Error downloading PDF from Google Drive: {e}")
|
| 193 |
return None, None
|
| 194 |
|
| 195 |
+
|
| 196 |
# Embeddings for Semantic Search (embeddings.py)
|
| 197 |
@st.cache_resource
|
| 198 |
def get_embeddings_model():
|
|
|
|
| 204 |
st.error(f"Error loading embeddings model: {e}")
|
| 205 |
return None
|
| 206 |
|
| 207 |
+
|
| 208 |
# QA System Initialization (qa_system.py)
|
| 209 |
|
| 210 |
|
|
|
|
| 214 |
llm = ChatOpenAI(
|
| 215 |
temperature=0,
|
| 216 |
model_name="gpt-4", # Or another OpenAI model like "gpt-3.5-turbo"
|
| 217 |
+
api_key=os.environ.get("OPENAI_API_KEY"),
|
| 218 |
)
|
| 219 |
|
| 220 |
# Define the prompt template
|
| 221 |
+
prompt = ChatPromptTemplate.from_messages(
|
| 222 |
+
[
|
| 223 |
+
("system", "You are a helpful assistant"),
|
| 224 |
+
MessagesPlaceholder(variable_name="chat_history"),
|
| 225 |
+
("human", "{input}"),
|
| 226 |
+
]
|
| 227 |
+
)
|
| 228 |
|
| 229 |
# Define the tools
|
| 230 |
tools = [
|
| 231 |
Tool(
|
| 232 |
name="Search",
|
| 233 |
+
func=_vector_store.as_retriever(
|
| 234 |
+
search_kwargs={"k": 2}
|
| 235 |
+
).get_relevant_documents,
|
| 236 |
description="useful for when you need to answer questions about the documents you have been uploaded. Input should be a fully formed question.",
|
| 237 |
)
|
| 238 |
]
|
| 239 |
|
| 240 |
# Create the agent and executor
|
| 241 |
agent = create_openai_tools_agent(llm=llm, tools=tools, prompt=prompt)
|
| 242 |
+
agent_executor = AgentExecutor(
|
| 243 |
+
agent=agent,
|
| 244 |
+
tools=tools,
|
| 245 |
+
verbose=True,
|
| 246 |
+
memory=ConversationBufferMemory(memory_key="chat_history"),
|
| 247 |
+
)
|
| 248 |
|
| 249 |
return agent_executor # Return the agent executor
|
| 250 |
except Exception as e:
|