# chatpdf / app.py — Streamlit app to chat with an uploaded PDF
# (originally published on HuggingFace Spaces; commit 4abbd9e).
import os
import streamlit as st
# from PyPDF2 import PdfReader
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from llama_index.llama_pack import download_llama_pack

# Download and install the llama-pack at import time. This is a module-level
# side effect: it hits the network / writes to ./embedded_tables_unstructured_pack
# every time the module is imported.
# NOTE(review): `EmbeddedTablesUnstructuredRetrieverPack` is never referenced in
# the visible code below — confirm whether this download is still needed.
EmbeddedTablesUnstructuredRetrieverPack = download_llama_pack(
"EmbeddedTablesUnstructuredRetrieverPack", "./embedded_tables_unstructured_pack"
)
import subprocess  # NOTE(review): unused in the visible code — confirm before removing
def read_pdf(file_path):
    """Load a PDF from ``file_path`` and return it split into Document pages.

    Uses LangChain's ``PyPDFLoader`` (text only, no image extraction) and its
    default ``load_and_split`` page splitting.
    """
    print(f"Parámetros: file_path: {file_path}")
    return PyPDFLoader(file_path, extract_images=False).load_and_split()
# Load environment variables (presumably OPENAI_API_KEY for the OpenAI
# clients below — confirm against the deployment) from a local .env file.
load_dotenv()
def _build_chat_history(messages):
    """Fold a flat role/content message list into (human, ai) answer pairs.

    ``ConversationalRetrievalChain`` expects ``chat_history`` as a list of
    ``(human_message, ai_message)`` tuples — not ``(role, content)`` tuples.
    Unanswered trailing user messages are dropped.
    """
    pairs = []
    pending_question = None
    for message in messages:
        if message["role"] == "user":
            pending_question = message["content"]
        elif message["role"] == "assistant" and pending_question is not None:
            pairs.append((pending_question, message["content"]))
            pending_question = None
    return pairs


# Main Streamlit app
def main():
    """Streamlit entry point: upload a PDF, index it with FAISS, chat with it.

    Streamlit reruns this whole function on every user interaction, so the
    expensive embedding/indexing step is cached in ``st.session_state`` and
    only redone when a different file is uploaded.
    """
    archivo_pdf = st.file_uploader("Cargar archivo PDF", type=["pdf"])

    with st.sidebar:
        st.title('🤗💬 ChatPDF')
        st.markdown('''
## Instrucciones
Cargar un archivo PDF.
Esperar unos segundos y aparecerá la ventana de chat.
Finalmente, comenzar a chatear con el PDF.
''')

    if archivo_pdf is None:
        return

    # Persist the upload to disk because PyPDFLoader reads from a file path.
    file_path = os.path.join(os.getcwd(), archivo_pdf.name)
    with open(file_path, "wb") as f:
        f.write(archivo_pdf.getvalue())

    try:
        pages = read_pdf(file_path)
        st.info("The content of the PDF is hidden. Type your query in the chat window.")
    except FileNotFoundError:
        st.error(f"No se encontró el archivo: {file_path}")
        return
    except Exception as e:
        st.error(f"Error durante la lectura del archivo: {e}")
        return

    # Build the vector index only once per uploaded file; previously this ran
    # on every rerun, re-embedding the whole document for each chat message.
    if st.session_state.get("indexed_file") != archivo_pdf.name:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=20,
            length_function=len,
            add_start_index=True,
        )
        documents = text_splitter.split_documents(pages)
        embeddings = OpenAIEmbeddings()
        vectorstore = FAISS.from_documents(documents, embedding=embeddings)
        st.session_state.processed_data = {
            "document_chunks": documents,
            "vectorstore": vectorstore,
        }
        st.session_state.indexed_file = archivo_pdf.name
    vectorstore = st.session_state.processed_data["vectorstore"]

    # Load the Langchain chatbot
    llm = ChatOpenAI(temperature=0, max_tokens=1000, model_name="gpt-3.5-turbo")
    qa = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever())

    # Initialize Streamlit chat UI and replay the stored conversation.
    if "messages" not in st.session_state:
        st.session_state.messages = []
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("Haz tus preguntas..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Pass only *prior* turns as (human, ai) pairs; the current prompt
        # goes in "question". The old code sent (role, content) tuples
        # including the prompt itself, which corrupts the chain's history.
        history = _build_chat_history(st.session_state.messages[:-1])
        result = qa({"question": prompt, "chat_history": history})

        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = result["answer"]
            message_placeholder.markdown(full_response + "|")
            message_placeholder.markdown(full_response)
        st.session_state.messages.append({"role": "assistant", "content": full_response})


if __name__ == "__main__":
    main()