Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,6 +19,8 @@ from googleapiclient.http import MediaIoBaseDownload
|
|
| 19 |
from google.oauth2 import service_account
|
| 20 |
import tempfile
|
| 21 |
import os
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
# SQLite Database Functions (database.py)
|
|
@@ -88,12 +90,16 @@ def upload_and_parse_documents(documents):
|
|
| 88 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
| 89 |
for doc in documents:
|
| 90 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
# Create a temporary file
|
| 92 |
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
|
| 93 |
tmp_file.write(doc.read())
|
| 94 |
tmp_file_path = tmp_file.name
|
| 95 |
|
| 96 |
-
loader = PyPDFLoader(tmp_file_path)
|
| 97 |
pages = loader.load()
|
| 98 |
document_names.append(doc.name)
|
| 99 |
page_contents = []
|
|
@@ -104,13 +110,12 @@ def upload_and_parse_documents(documents):
|
|
| 104 |
document_pages.append(page_contents)
|
| 105 |
|
| 106 |
# Remove the temporary file
|
| 107 |
-
os.remove(tmp_file_path)
|
| 108 |
|
| 109 |
except Exception as e:
|
| 110 |
st.error(f"Error parsing document {doc.name}: {e}")
|
| 111 |
return all_texts, document_names, document_pages
|
| 112 |
|
| 113 |
-
|
| 114 |
@st.cache_data
|
| 115 |
def parse_pdf_from_url(url):
|
| 116 |
try:
|
|
@@ -177,16 +182,19 @@ def get_embeddings_model():
|
|
| 177 |
return None
|
| 178 |
|
| 179 |
# QA System Initialization (qa_system.py)
|
|
|
|
| 180 |
@st.cache_resource
|
| 181 |
-
def initialize_qa_system(_vector_store):
|
| 182 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
qa_pipeline = RetrievalQA.from_chain_type(
|
| 184 |
-
llm=
|
| 185 |
-
|
| 186 |
-
model="gpt-4",
|
| 187 |
-
api_key=os.environ.get('OPENAI_API_KEY'),
|
| 188 |
-
prompt_template="Extract the specific details relevant to the query accurately from the document without adding additional information that is not present in the text. Provide concise, clear responses that stay within the boundaries of the document's content."),
|
| 189 |
-
retriever=_vector_store.as_retriever() # Use '_vector_store' here as well
|
| 190 |
)
|
| 191 |
return qa_pipeline
|
| 192 |
except Exception as e:
|
|
|
|
| 19 |
from google.oauth2 import service_account
|
| 20 |
import tempfile
|
| 21 |
import os
|
| 22 |
+
from langchain.llms import OpenAI # Import the OpenAI class
|
| 23 |
+
|
| 24 |
|
| 25 |
|
| 26 |
# SQLite Database Functions (database.py)
|
|
|
|
| 90 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
| 91 |
for doc in documents:
|
| 92 |
try:
|
| 93 |
+
if doc.name in document_names:
|
| 94 |
+
st.warning(f"Duplicate file name detected: {doc.name}. This file will be ignored.", icon="⚠️")
|
| 95 |
+
continue # Skip to the next file
|
| 96 |
+
|
| 97 |
# Create a temporary file
|
| 98 |
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
|
| 99 |
tmp_file.write(doc.read())
|
| 100 |
tmp_file_path = tmp_file.name
|
| 101 |
|
| 102 |
+
loader = PyPDFLoader(tmp_file_path)
|
| 103 |
pages = loader.load()
|
| 104 |
document_names.append(doc.name)
|
| 105 |
page_contents = []
|
|
|
|
| 110 |
document_pages.append(page_contents)
|
| 111 |
|
| 112 |
# Remove the temporary file
|
| 113 |
+
os.remove(tmp_file_path)
|
| 114 |
|
| 115 |
except Exception as e:
|
| 116 |
st.error(f"Error parsing document {doc.name}: {e}")
|
| 117 |
return all_texts, document_names, document_pages
|
| 118 |
|
|
|
|
| 119 |
@st.cache_data
|
| 120 |
def parse_pdf_from_url(url):
|
| 121 |
try:
|
|
|
|
| 182 |
return None
|
| 183 |
|
| 184 |
# QA System Initialization (qa_system.py)
|
| 185 |
+
|
| 186 |
@st.cache_resource
|
| 187 |
+
def initialize_qa_system(_vector_store):
|
| 188 |
try:
|
| 189 |
+
llm = OpenAI(
|
| 190 |
+
model_name="gpt-4", # Or another OpenAI model like "text-davinci-003"
|
| 191 |
+
api_key=st.secrets["OPENAI_API_KEY"],
|
| 192 |
+
prompt_template="Extract the specific details relevant to the query accurately from the document without adding additional information that is not present in the text. Provide concise, clear responses that stay within the boundaries of the document's content."
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
qa_pipeline = RetrievalQA.from_chain_type(
|
| 196 |
+
llm=llm,
|
| 197 |
+
retriever=_vector_store.as_retriever()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
)
|
| 199 |
return qa_pipeline
|
| 200 |
except Exception as e:
|