# Hugging Face Space: PDF Question/Answer app (Gradio + LangChain).
| """## Import necessary libraries""" | |
| import os | |
| import shutil | |
| import json | |
| from langchain.document_loaders import PyPDFLoader | |
| from langchain.document_loaders import PyPDFDirectoryLoader | |
| from langchain.llms import OpenAI | |
| from langchain.prompts import PromptTemplate | |
| from langchain.chains import LLMChain | |
| from langchain.output_parsers import PydanticOutputParser | |
| from pydantic import BaseModel, Field | |
| from langchain.document_loaders import YoutubeLoader | |
| from langchain.document_loaders import WebBaseLoader | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.embeddings.openai import OpenAIEmbeddings | |
| from langchain.vectorstores import Chroma | |
| from langchain.chat_models import ChatOpenAI | |
| from langchain.chains import RetrievalQA | |
| #from google.colab import drive | |
| from google.oauth2 import service_account | |
| from google.cloud import translate_v2 as translate | |
| import gradio as gr | |
| """## Access KEY""" | |
| #ACCESS_KEY = os.environ.get("ACCESS_KEY") | |
| service_account_info = json.loads(os.environ.get("SERVICE_ACCOUNT_FILE")) | |
| credentials = service_account.Credentials.from_service_account_info(service_account_info) | |
| """ ## Load PDF """ | |
| class LoadPdf: | |
| def __init__(self, pdf_file): | |
| if not self.is_pdf_file(pdf_file): | |
| raise gr.Error("Invalid file extension. Please load a PDF file") | |
| self.pdf_file = pdf_file | |
| def is_pdf_file(self, file_path): | |
| _, file_extension = os.path.splitext(file_path) | |
| return file_extension.lower() == ".pdf" | |
| def read_file(self): | |
| loader = PyPDFLoader(self.pdf_file) | |
| data = loader.load() | |
| return data | |
| """## Request OpenAI""" | |
| class QuestionAnswer: | |
| def __init__(self, data, question, user_key): | |
| self.data = data | |
| self.question = question | |
| self.key = user_key | |
| def make_qa(self): | |
| #Splitter | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) | |
| splits = text_splitter.split_documents(self.data) | |
| #Persist dir | |
| persist_directory = 'files/chroma/' | |
| #EMbedings | |
| embedding = OpenAIEmbeddings(openai_api_key=self.key) | |
| retriever = Chroma.from_documents(documents=splits, | |
| embedding=embedding, | |
| persist_directory=persist_directory).as_retriever() | |
| # initialize the LLM | |
| llm = ChatOpenAI(temperature=0.2, model="gpt-3.5-turbo-16k", openai_api_key=self.key) | |
| question_answer = RetrievalQA.from_chain_type(llm=llm, retriever=retriever) | |
| make_question = f'{self.question}' | |
| return question_answer.run(make_question) | |
| """## Translation""" | |
| class TranslateOutput: | |
| def __init__(self, credentials): | |
| self.credentials = credentials | |
| def list_languages(self): | |
| client = translate.client.Client(credentials=self.credentials) | |
| languages = client.get_languages() | |
| language_names = [language['name'] for language in languages] | |
| return language_names | |
| def all_languages(self): | |
| client = translate.client.Client(credentials=self.credentials) | |
| languages = client.get_languages() | |
| return languages | |
| def translate_text(self, text, target_language): | |
| client = translate.client.Client(target_language=target_language, credentials=self.credentials) | |
| if isinstance(text, bytes): | |
| text = text.decode("utf-8") | |
| result = client.translate(text, target_language=target_language) | |
| return result["translatedText"] | |
| """## Run QA """ | |
| def run_qa(files,checkboxes,question,language,user_key): | |
| #secret_key = os.environ.get("SECRET_KEY") | |
| if user_key is None: | |
| return 'Introduza OpenAI API KEY' | |
| full_filenames = [file.name for file in files] | |
| available_files = [os.path.basename(path) for path in full_filenames] | |
| chosen_files = checkboxes | |
| # Filter files that are both available and chosen | |
| loadable_files = [file for file in available_files if file in chosen_files] | |
| # debug messages | |
| print(f"=> Available Files: {str(available_files)}") | |
| print(f"=> Chosen Files: {str(chosen_files)}") | |
| print(f"=> Question for Files: {str(question)}") | |
| print(f"=> Language to use: {str(language)}") | |
| # clear data | |
| data='' | |
| # Load files | |
| for file in loadable_files: | |
| print(f"=> Loading chosen file: {str(file)}") | |
| pdf_loader = LoadPdf("pdfs/"+file) | |
| data = pdf_loader.read_file() | |
| # Run the model | |
| qa = QuestionAnswer(data, question, user_key) | |
| answer_open_ai = qa.make_qa() | |
| # Translate output | |
| language_selected = language | |
| translate_output = TranslateOutput(credentials) | |
| for i in translate_output.all_languages(): | |
| if i['name'] == language_selected: | |
| iso_code = i['language'] | |
| break | |
| print(f"=> Answer OpenAI: {answer_open_ai}") | |
| print(f"=> Target Language IsoCode: {iso_code}") | |
| answer = translate_output.translate_text(answer_open_ai, target_language=iso_code) | |
| print(f"=> Translated Answer OpenAI: {answer}") | |
| return answer | |
# Define a function to be called when files are uploaded
def on_files_upload(files):
    """Copy uploaded files into ./pdfs and refresh the checkbox choices.

    files -- Gradio file objects; each has a `.name` temp-file path.
    Returns an updated gr.CheckboxGroup listing the uploaded base names.
    """
    # save files to files dir (exist_ok makes the pre-check redundant)
    files_dir = "pdfs"
    os.makedirs(files_dir, exist_ok=True)
    filenames = []
    for fileobj in files:
        # BUG FIX: os.path.basename() needs the path STRING; the original
        # passed the file object itself (`fileobj`), raising a TypeError.
        name = os.path.basename(fileobj.name)
        shutil.copyfile(fileobj.name, os.path.join(files_dir, name))
        filenames.append(name)
    # checkbox group update
    return gr.CheckboxGroup(choices=filenames)
# Define a function to be called when files are cleared
def on_files_cleared():
    """Remove the local pdfs directory and empty the checkbox choices."""
    if os.path.exists("pdfs"):
        shutil.rmtree("pdfs")
    return gr.CheckboxGroup(choices=[])
# Define the Gradio interface
title = "Question/Answer over Documents"
subtitle = "OpenAI GPT 3.5 Turbo LLM assisted Question/Answer over multiple PDF context documents"
authors = "Hugo Cavalaria "
custom_layout = "<h1>{}</h1><h2>{}</h2><p>{}</p>".format(title, subtitle, authors)
# Get the list of languages available.
# FIX: dropped the redundant identity comprehension `[i for i in ...]` --
# list_languages() already returns a list of names.
translate_output = TranslateOutput(credentials)
language_names = translate_output.list_languages()
# Gradio Interface
with gr.Blocks() as interface:
    # Header banner: title / subtitle / author.
    with gr.Row():
        with gr.Column(scale=2):
            gr.HTML(custom_layout)
    # Main area: inputs on the left, answer and button on the right.
    with gr.Row():
        with gr.Column(scale=1):
            upload_pdfs = gr.Files(label="Upload multiple PDF files.",
                                   interactive=True,
                                   file_types=['.pdf'],
                                   container=True)
            checkbox_group = gr.CheckboxGroup(label="Select the files to question.",
                                              choices=[],
                                              interactive=True)
            question_text = gr.Textbox(label="Question:")
            answer_language = gr.Dropdown(label="Answer translation to:",
                                          choices=language_names,
                                          value="Portuguese")
            secret_key = gr.Textbox(label="OpenAI API Key:")
        with gr.Column(scale=1):
            output_status = gr.Textbox(label="Answer:")
            btn = gr.Button("Ask")
    # Event wiring: ask button, file upload, and file clear.
    btn.click(fn=run_qa,
              inputs=[upload_pdfs, checkbox_group, question_text, answer_language, secret_key],
              outputs=[output_status])
    upload_pdfs.upload(fn=on_files_upload,
                       inputs=[upload_pdfs],
                       outputs=[checkbox_group],
                       show_progress="full")
    upload_pdfs.clear(fn=on_files_cleared,
                      inputs=None,
                      outputs=[checkbox_group])
| """## Launch Interface""" | |
| # launch interface | |
| if __name__ == "__main__": | |
| interface.launch(share=False, debug=True) |