File size: 4,391 Bytes
810f6e5
d8fa018
fee4b6c
a26048f
 
 
 
 
 
658d39c
810f6e5
56f4ff9
 
435e1e6
56f4ff9
 
 
 
a26048f
837924a
435e1e6
 
 
a26048f
 
d1ad982
293d55c
658d39c
 
 
a26048f
810f6e5
 
 
 
 
 
 
5dcf9d3
a5930f5
 
810f6e5
5dcf9d3
810f6e5
5dcf9d3
 
e8f6569
 
8c8901e
e8f6569
810f6e5
 
 
a5930f5
810f6e5
a5930f5
810f6e5
a5930f5
810f6e5
a5930f5
658d39c
 
 
 
 
 
810f6e5
 
 
 
5dcf9d3
810f6e5
 
5dcf9d3
810f6e5
 
 
 
7dc4e70
 
 
810f6e5
286044f
810f6e5
e8f6569
 
810f6e5
 
 
750e91f
 
810f6e5
 
 
 
 
 
7dc4e70
810f6e5
 
 
 
 
 
 
 
 
 
 
 
5dcf9d3
810f6e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import streamlit as st
# from PyPDF2 import PdfReader
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader

from llama_index.llama_pack import download_llama_pack


# download and install dependencies
# NOTE(review): the downloaded pack class is bound here but never used below —
# presumably this call is kept only for its side effect of installing the
# pack's dependencies into ./embedded_tables_unstructured_pack; confirm before
# removing.
EmbeddedTablesUnstructuredRetrieverPack = download_llama_pack(
    "EmbeddedTablesUnstructuredRetrieverPack", "./embedded_tables_unstructured_pack"
)

# import requests
# NOTE(review): `subprocess` is imported but not referenced anywhere in this
# file — candidate for removal if no other chunk uses it.
import subprocess


def read_pdf(file_path):
    """Load the PDF at *file_path* and return it split into page documents.

    Uses LangChain's ``PyPDFLoader`` (images are not extracted); the return
    value is the list of ``Document`` objects produced by ``load_and_split``.
    """
    print(f"Parámetros: file_path: {file_path}")
    loader = PyPDFLoader(file_path, extract_images=False)
    return loader.load_and_split()


# Load environment variables from a local .env file at import time
# (presumably this supplies OPENAI_API_KEY for OpenAIEmbeddings/ChatOpenAI
# below — verify the .env contents).
load_dotenv()


# Main Streamlit app
def main():
    """Streamlit entry point: upload a PDF, index it, and chat over its content.

    Side effects:
        - Writes the uploaded PDF next to the current working directory.
        - Stores the vectorstore, the name of the indexed file, and the chat
          transcript in ``st.session_state`` so they survive Streamlit reruns.
    """
    archivo_pdf = st.file_uploader("Cargar archivo PDF", type=["pdf"])

    with st.sidebar:
        st.title('🤗💬 ChatPDF')
        st.markdown('''
        ## Instrucciones
        Cargar un archivo PDF.

        Esperar unos segundos y aparecerá la ventana de chat.
        
        Finalmente, comenzar a chatear con el PDF.
        ''')

    if archivo_pdf is not None:
        file_path = os.path.join(os.getcwd(), archivo_pdf.name)

        # Streamlit re-runs this whole script on every user interaction.
        # Only re-read / re-split / re-embed (paid OpenAI calls) when a NEW
        # file is uploaded; otherwise reuse the vectorstore cached in
        # session_state — previously this re-indexed on every chat message.
        if st.session_state.get("indexed_file") != archivo_pdf.name:
            with open(file_path, "wb") as f:
                f.write(archivo_pdf.getvalue())

            try:
                pages = read_pdf(file_path)
                st.info("The content of the PDF is hidden. Type your query in the chat window.")
            except FileNotFoundError:
                st.error(f"No se encontró el archivo: {file_path}")
                return
            except Exception as e:
                st.error(f"Error durante la lectura del archivo: {e}")
                return

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=20,
                length_function=len,
                add_start_index=True,
            )
            documents = text_splitter.split_documents(pages)

            # Vectorize the chunks and cache everything for later reruns.
            embeddings = OpenAIEmbeddings()
            vectorstore = FAISS.from_documents(documents, embedding=embeddings)

            st.session_state.processed_data = {
                "document_chunks": documents,
                "vectorstore": vectorstore,
            }
            st.session_state.indexed_file = archivo_pdf.name
            # A new document starts a fresh conversation.
            st.session_state.messages = []
        else:
            vectorstore = st.session_state.processed_data["vectorstore"]

        # Load the Langchain chatbot.
        llm = ChatOpenAI(temperature=0, max_tokens=1000, model_name="gpt-3.5-turbo")
        qa = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever())

        # Initialize Streamlit chat UI.
        if "messages" not in st.session_state:
            st.session_state.messages = []

        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

        if prompt := st.chat_input("Haz tus preguntas..."):
            # ConversationalRetrievalChain expects chat_history as
            # (user_question, assistant_answer) pairs — NOT (role, content)
            # tuples — and it must contain only COMPLETED exchanges, so build
            # it before appending the new, not-yet-answered question.
            history = st.session_state.messages
            chat_history = [
                (history[i]["content"], history[i + 1]["content"])
                for i in range(0, len(history) - 1, 2)
                if history[i]["role"] == "user"
                and history[i + 1]["role"] == "assistant"
            ]

            st.session_state.messages.append({"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)

            result = qa({"question": prompt, "chat_history": chat_history})

            full_response = result["answer"]
            with st.chat_message("assistant"):
                st.markdown(full_response)
            st.session_state.messages.append({"role": "assistant", "content": full_response})

# Script entry point (run via `streamlit run <this file>`).
if __name__ == "__main__":
    main()