Spaces:

jackkuo
/

PaperExtractGPT

Sleeping

App Files Files Community

PaperExtractGPT / app.py

jackkuo

Update app.py

900c0a5 verified about 1 year ago

raw

history blame contribute delete

5.65 kB

	from openai import OpenAI
	import gradio as gr
	import fitz # PyMuPDF
	from PIL import Image
	from pathlib import Path
	import os


	# Endpoint credentials are read from the environment; either value is
	# None when its variable is unset.
	api_key = os.environ.get('API_KEY')
	base_url = os.environ.get("BASE_URL")

	# Shared client used by openai_api() for chat-completion requests.
	client = OpenAI(api_key=api_key, base_url=base_url)


	def extract_pdf_pypdf(pdf_dir):
	    """Extract the plain text of every page of a PDF.

	    Args:
	        pdf_dir: Path of the PDF file; it is passed straight to
	            ``fitz.open``.

	    Returns:
	        The text of all readable pages, each followed by a blank line, or
	        ``None`` when the document cannot be opened.
	    """
	    try:
	        doc = fitz.open(pdf_dir)
	    except Exception as e:
	        print(f"Error opening PDF: {e}")
	        return None

	    parts = []
	    try:
	        for page in range(doc.page_count):
	            try:
	                text = doc.load_page(page).get_text("text")
	                parts.append(text + "\n\n")
	            except Exception as e:
	                # Best effort: skip an unreadable page and keep the rest.
	                print(f"Error reading page {page}: {e}")
	    finally:
	        # Always release the underlying file, even on an unexpected error.
	        doc.close()

	    return "".join(parts)


	def openai_api(messages):
	    """Send a chat request and return the streamed reply as one string.

	    Args:
	        messages: Chat messages in the format accepted by
	            ``client.chat.completions.create``.

	    Returns:
	        The concatenated text of all streamed chunks, or ``None`` if the
	        request or the stream raises.
	    """
	    try:
	        completion = client.chat.completions.create(
	            model="claude-3-5-sonnet-20240620",
	            messages=messages,
	            temperature=0.1,
	            max_tokens=8192,
	            stream=True,
	        )
	        pieces = []
	        for chunk in completion:
	            # A streamed chunk may carry an empty `choices` list; indexing
	            # it would raise and throw away the text received so far.
	            if not chunk.choices:
	                continue
	            piece = chunk.choices[0].delta.content
	            if piece:
	                pieces.append(piece)
	        return "".join(pieces)
	    except Exception as ex:
	        print("API error:", ex)
	        return None


	def predict(input_text, pdf_file):
	    """Run the extraction prompt against the text of an uploaded PDF.

	    Args:
	        input_text: The user's extraction prompt, appended after the paper
	            text in the user message.
	        pdf_file: The uploaded PDF, either a path string or an object whose
	            ``name`` attribute is the path; ``None`` when nothing was
	            uploaded.

	    Returns:
	        The model's reply, or a short user-facing message when no file was
	        uploaded, the PDF could not be read, or the API call produced
	        nothing.
	    """
	    if pdf_file is None:
	        return "Please upload a PDF file to proceed."

	    # Accept a plain path string as well as an upload object with `.name`.
	    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
	    file_content = extract_pdf_pypdf(pdf_path)
	    if file_content is None:
	        # extract_pdf_pypdf returns None when the PDF cannot be opened;
	        # concatenating None into the prompt below would raise TypeError.
	        return "Could not read the uploaded PDF. Please check the file and try again."

	    messages = [
	        {
	            "role": "system",
	            "content": "You are an expert in information extraction from scientific literature.",
	        },
	        {"role": "user", "content": """Provided Text:
	'''
	{{""" + file_content + """}}
	'''
	""" + input_text}
	    ]
	    extract_result = openai_api(messages)

	    return extract_result or "Too many users. Please wait a moment!"


	def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
	    """Render each page of a PDF to a PNG file.

	    Args:
	        pdf_path: Path of the PDF to render.
	        image_folder: Directory that receives the images; created if
	            missing.
	        dpi: Rendering resolution passed to ``get_pixmap``.

	    Returns:
	        List of image paths (as strings) in page order, named
	        ``page_<n>.png`` with ``n`` starting at 1.
	    """
	    # Make sure the output folder exists.
	    out_dir = Path(image_folder)
	    os.makedirs(out_dir, exist_ok=True)

	    pdf_document = fitz.open(pdf_path)
	    image_paths = []
	    try:
	        # Render every page at the requested DPI and save it as a PNG.
	        for page_number, page in enumerate(pdf_document, start=1):
	            pix = page.get_pixmap(dpi=dpi)
	            image_path = out_dir / f"page_{page_number}.png"
	            Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path)
	            image_paths.append(str(image_path))
	    finally:
	        # Close the document even if rendering or saving a page fails.
	        pdf_document.close()
	    return image_paths


	def display_pdf_images(file):
	    """Render an uploaded PDF to page images for the gallery.

	    Args:
	        file: Path of the uploaded PDF, or a falsy value (``None`` / empty
	            string) when nothing has been uploaded.

	    Returns:
	        List of image paths, one per page; an empty list when there is no
	        file to render.
	    """
	    # Nothing uploaded yet: show an empty gallery instead of handing a
	    # missing path to the PDF renderer.
	    if not file:
	        return []
	    return convert_pdf_to_images(file)


	# Example prompt: request the bibliographic fields as a markdown table.
	en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
	If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
	"""

	# Example prompt: request the same fields in JSON format.
	en_2 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a JSON format?
	If any of this information was not available in the paper, please replace it with the string `""`. If the property contains multiple entities, please use a list to contain.
	"""

	# One row per example prompt; passed to gr.Examples for the prompt textbox.
	examples = [[en_1], [en_2]]

	# Build the Gradio UI.
	# NOTE(review): indentation is flat in this view, so the exact nesting of
	# the `with` blocks below cannot be confirmed here — verify the layout
	# against the running app.
	with gr.Blocks(title="PaperExtractGPT") as demo:
	gr.Markdown(
	'''<h1 align="center"> Paper Extract GPT </h1>
	<p>How to use:
	<br><strong>1</strong>: Upload your PDF.
	<br><strong>2</strong>: Click "View PDF" to preview it.
	<br><strong>3</strong>: Enter your extraction prompt in the input box.
	<br><strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
	</p>'''
	)
	with gr.Row():
	# PDF upload, a sample-file example, the "View PDF" button and the page gallery.
	with gr.Column():
	file_input = gr.File(label="Upload your PDF", type="filepath")
	example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
	viewer_button = gr.Button("View PDF")
	file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")

	# Prompt box, example prompts, Generate/Clear buttons and the output area.
	with gr.Column():
	model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
	example = gr.Examples(examples=examples, inputs=model_input)
	with gr.Row():
	gen = gr.Button("Generate")
	clr = gr.Button("Clear")
	# The initial value is a sample result table displayed before any run.
	outputs = gr.Markdown(label='Output', value="""\| Title \| Journal \| Year \| Author \| Institution \| Email \|
	\|---------------------------------------------\|--------------------\|------\|-----------------------------------------------\|-------------------------------------------------------\|-----------------------\|
	\| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India \| J. Geomag. Geoelectr. \| 1973 \| R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK \| National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad \| "" \|
	""")

	# Generate: run predict on (prompt, uploaded file) and show the result in the output area.
	gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
	# Clear: reset both the prompt box and the output area to empty strings.
	clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
	# View PDF: render the uploaded file to page images and show them in the gallery.
	viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)

	# Launch the Gradio app.
	demo.launch()