# chatpdf / app.py — Streamlit app to chat with an uploaded PDF
# (originally published on HuggingFace Spaces; commit 4abbd9e).
import os
import streamlit as st
# from PyPDF2 import PdfReader
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from llama_index.llama_pack import download_llama_pack

# Download and install the llama-pack at import time. This is a module-level
# side effect: it hits the network / writes to ./embedded_tables_unstructured_pack
# every time the module is imported.
# NOTE(review): `EmbeddedTablesUnstructuredRetrieverPack` is never referenced in
# the visible code below — confirm whether this download is still needed.
EmbeddedTablesUnstructuredRetrieverPack = download_llama_pack(
"EmbeddedTablesUnstructuredRetrieverPack", "./embedded_tables_unstructured_pack"
)
import subprocess  # NOTE(review): unused in the visible code — confirm before removing
def read_pdf(file_path):
    """Load a PDF from ``file_path`` and return it split into Document pages.

    Uses LangChain's ``PyPDFLoader`` (text only, no image extraction) and its
    default ``load_and_split`` page splitting.
    """
    print(f"Parámetros: file_path: {file_path}")
    return PyPDFLoader(file_path, extract_images=False).load_and_split()
# Load environment variables (presumably OPENAI_API_KEY for the OpenAI
# clients below — confirm against the deployment) from a local .env file.
load_dotenv()
def _build_chat_history(messages):
    """Fold a flat role/content message list into (human, ai) answer pairs.

    ``ConversationalRetrievalChain`` expects ``chat_history`` as a list of
    ``(human_message, ai_message)`` tuples — not ``(role, content)`` tuples.
    Unanswered trailing user messages are dropped.
    """
    pairs = []
    pending_question = None
    for message in messages:
        if message["role"] == "user":
            pending_question = message["content"]
        elif message["role"] == "assistant" and pending_question is not None:
            pairs.append((pending_question, message["content"]))
            pending_question = None
    return pairs


# Main Streamlit app
def main():
    """Streamlit entry point: upload a PDF, index it with FAISS, chat with it.

    Streamlit reruns this whole function on every user interaction, so the
    expensive embedding/indexing step is cached in ``st.session_state`` and
    only redone when a different file is uploaded.
    """
    archivo_pdf = st.file_uploader("Cargar archivo PDF", type=["pdf"])

    with st.sidebar:
        st.title('🤗💬 ChatPDF')
        st.markdown('''
## Instrucciones
Cargar un archivo PDF.
Esperar unos segundos y aparecerá la ventana de chat.
Finalmente, comenzar a chatear con el PDF.
''')

    if archivo_pdf is None:
        return

    # Persist the upload to disk because PyPDFLoader reads from a file path.
    file_path = os.path.join(os.getcwd(), archivo_pdf.name)
    with open(file_path, "wb") as f:
        f.write(archivo_pdf.getvalue())

    try:
        pages = read_pdf(file_path)
        st.info("The content of the PDF is hidden. Type your query in the chat window.")
    except FileNotFoundError:
        st.error(f"No se encontró el archivo: {file_path}")
        return
    except Exception as e:
        st.error(f"Error durante la lectura del archivo: {e}")
        return

    # Build the vector index only once per uploaded file; previously this ran
    # on every rerun, re-embedding the whole document for each chat message.
    if st.session_state.get("indexed_file") != archivo_pdf.name:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=20,
            length_function=len,
            add_start_index=True,
        )
        documents = text_splitter.split_documents(pages)
        embeddings = OpenAIEmbeddings()
        vectorstore = FAISS.from_documents(documents, embedding=embeddings)
        st.session_state.processed_data = {
            "document_chunks": documents,
            "vectorstore": vectorstore,
        }
        st.session_state.indexed_file = archivo_pdf.name
    vectorstore = st.session_state.processed_data["vectorstore"]

    # Load the Langchain chatbot
    llm = ChatOpenAI(temperature=0, max_tokens=1000, model_name="gpt-3.5-turbo")
    qa = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever())

    # Initialize Streamlit chat UI and replay the stored conversation.
    if "messages" not in st.session_state:
        st.session_state.messages = []
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("Haz tus preguntas..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Pass only *prior* turns as (human, ai) pairs; the current prompt
        # goes in "question". The old code sent (role, content) tuples
        # including the prompt itself, which corrupts the chain's history.
        history = _build_chat_history(st.session_state.messages[:-1])
        result = qa({"question": prompt, "chat_history": history})

        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = result["answer"]
            message_placeholder.markdown(full_response + "|")
            message_placeholder.markdown(full_response)
        st.session_state.messages.append({"role": "assistant", "content": full_response})


if __name__ == "__main__":
    main()