"""Streamlit ChatPDF app: upload a PDF and chat with its contents.

Pipeline: PyPDFLoader -> RecursiveCharacterTextSplitter -> OpenAI
embeddings -> FAISS vector store -> ConversationalRetrievalChain
backed by gpt-3.5-turbo.
"""

import os
import subprocess  # NOTE(review): unused below — confirm before removing.

import streamlit as st
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from llama_index.llama_pack import download_llama_pack

# Download and install the retriever pack's dependencies at import time.
# NOTE(review): the pack class itself is never used below — presumably only
# the download side effect is wanted; confirm before removing.
EmbeddedTablesUnstructuredRetrieverPack = download_llama_pack(
    "EmbeddedTablesUnstructuredRetrieverPack",
    "./embedded_tables_unstructured_pack",
)

# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()


def read_pdf(file_path):
    """Load the PDF at *file_path* and return a list of per-page Documents.

    Uses PyPDFLoader.load_and_split(), so each Document carries page text
    plus metadata; image extraction is disabled.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
    """
    loader = PyPDFLoader(file_path, extract_images=False)
    return loader.load_and_split()


def _build_chat_history(messages):
    """Pair consecutive user/assistant messages into (question, answer) tuples.

    ConversationalRetrievalChain expects chat_history as (human, ai) pairs;
    unanswered trailing user messages are dropped.
    """
    history = []
    pending_question = None
    for message in messages:
        if message["role"] == "user":
            pending_question = message["content"]
        elif pending_question is not None:
            history.append((pending_question, message["content"]))
            pending_question = None
    return history


def main():
    """Render the Streamlit UI: PDF upload, indexing, and the chat loop."""
    archivo_pdf = st.file_uploader("Cargar archivo PDF", type=["pdf"])

    with st.sidebar:
        st.title('🤗💬 ChatPDF')
        st.markdown('''
        ## Instrucciones
        Cargar un archivo PDF.
        Esperar unos segundos y aparecerá la ventana de chat.
        Finalmente, comenzar a chatear con el PDF.
        ''')

    if archivo_pdf is None:
        return

    # Persist the upload to the working directory so PyPDFLoader can read it
    # from a real filesystem path.
    file_path = os.path.join(os.getcwd(), archivo_pdf.name)
    with open(file_path, "wb") as f:
        f.write(archivo_pdf.getvalue())

    try:
        docs = read_pdf(file_path)
        # FIX: this message was a single literal broken across a raw newline
        # in the original source (a syntax error).
        st.info("The content of the PDF is hidden. Type your query in the chat window.")
    except FileNotFoundError:
        st.error(f"No se encontró el archivo: {file_path}")
        return
    except Exception as e:
        # Surface any other parsing failure to the user instead of crashing.
        st.error(f"Error durante la lectura del archivo: {e}")
        return

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
        add_start_index=True,
    )
    documents = splitter.split_documents(docs)

    # Embed the chunks and build the FAISS index; keep both in session state
    # so reruns can reuse them.
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embedding=embeddings)
    st.session_state.processed_data = {
        "document_chunks": documents,
        "vectorstore": vectorstore,
    }

    llm = ChatOpenAI(temperature=0, max_tokens=1000, model_name="gpt-3.5-turbo")
    qa = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever())

    # Initialize and replay the chat transcript.
    if "messages" not in st.session_state:
        st.session_state.messages = []

    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("Haz tus preguntas..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # BUG FIX: the chain expects (human, ai) answer pairs, not
        # (role, content) tuples, and the current question must not be
        # duplicated into the history — hence messages[:-1].
        chat_history = _build_chat_history(st.session_state.messages[:-1])
        result = qa({"question": prompt, "chat_history": chat_history})

        with st.chat_message("assistant"):
            placeholder = st.empty()
            full_response = result["answer"]
            # Brief cursor effect, then the final answer.
            placeholder.markdown(full_response + "|")
            placeholder.markdown(full_response)

        st.session_state.messages.append(
            {"role": "assistant", "content": full_response}
        )


if __name__ == "__main__":
    main()