Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import
|
|
|
|
|
|
|
| 3 |
import os
|
| 4 |
-
|
| 5 |
-
import fitz
|
| 6 |
|
| 7 |
api_key = os.getenv('API_KEY')
|
| 8 |
base_url = os.getenv("BASE_URL")
|
|
@@ -14,20 +16,21 @@ client = OpenAI(
|
|
| 14 |
|
| 15 |
|
| 16 |
def extract_pdf_pypdf(pdf_dir):
|
| 17 |
-
path = pdf_dir
|
| 18 |
-
|
| 19 |
try:
|
| 20 |
-
doc = fitz.open(
|
| 21 |
-
except:
|
| 22 |
-
print("
|
| 23 |
return None
|
| 24 |
|
| 25 |
page_count = doc.page_count
|
| 26 |
file_content = ""
|
| 27 |
for page in range(page_count):
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
return file_content
|
| 33 |
|
|
@@ -39,26 +42,13 @@ def openai_api(messages):
|
|
| 39 |
messages=messages,
|
| 40 |
temperature=0.1,
|
| 41 |
max_tokens=8192,
|
| 42 |
-
# timeout=300,
|
| 43 |
stream=True
|
| 44 |
)
|
|
|
|
|
|
|
|
|
|
| 45 |
except Exception as ex:
|
| 46 |
-
print("
|
| 47 |
-
return None
|
| 48 |
-
|
| 49 |
-
if completion:
|
| 50 |
-
try:
|
| 51 |
-
response_2_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in
|
| 52 |
-
completion]
|
| 53 |
-
print("response tokens:", len(response_2_list))
|
| 54 |
-
|
| 55 |
-
response_2_content = ''.join(response_2_list)
|
| 56 |
-
return response_2_content
|
| 57 |
-
except Exception as ex:
|
| 58 |
-
print("第二轮 出现如下异常%s" % ex)
|
| 59 |
-
return None
|
| 60 |
-
else:
|
| 61 |
-
print("第二轮出现异常")
|
| 62 |
return None
|
| 63 |
|
| 64 |
|
|
@@ -83,29 +73,30 @@ def predict(input_text, pdf_file):
|
|
| 83 |
return extract_result or "Too many users. Please wait a moment!"
|
| 84 |
|
| 85 |
|
| 86 |
-
def
|
| 87 |
-
|
| 88 |
-
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
|
| 102 |
-
# Encode as base64 for embedding in HTML
|
| 103 |
-
b64_data = base64.b64encode(pdf_data).decode('utf-8')
|
| 104 |
-
return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
|
|
|
| 109 |
|
| 110 |
|
| 111 |
en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
|
|
@@ -120,22 +111,20 @@ examples = [[en_1], [en_2]]
|
|
| 120 |
|
| 121 |
with gr.Blocks(title="PaperExtractGPT") as demo:
|
| 122 |
gr.Markdown(
|
| 123 |
-
'''<
|
| 124 |
-
<
|
| 125 |
-
<
|
| 126 |
-
<br
|
| 127 |
-
<br
|
| 128 |
-
<br
|
| 129 |
-
|
| 130 |
-
</p>
|
| 131 |
-
'''
|
| 132 |
)
|
| 133 |
with gr.Row():
|
| 134 |
with gr.Column():
|
| 135 |
-
gr.Markdown('## Upload PDF')
|
| 136 |
file_input = gr.File(label="Upload your PDF", type="filepath")
|
|
|
|
| 137 |
viewer_button = gr.Button("View PDF")
|
| 138 |
-
file_out = gr.
|
| 139 |
|
| 140 |
with gr.Column():
|
| 141 |
model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
|
@@ -143,13 +132,13 @@ with gr.Blocks(title="PaperExtractGPT") as demo:
|
|
| 143 |
with gr.Row():
|
| 144 |
gen = gr.Button("Generate")
|
| 145 |
clr = gr.Button("Clear")
|
| 146 |
-
outputs = gr.Markdown(label='Output',
|
| 147 |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
|
| 148 |
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
|
| 149 |
""")
|
| 150 |
|
| 151 |
gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
|
| 152 |
clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
|
| 153 |
-
viewer_button.click(
|
| 154 |
|
| 155 |
demo.launch()
|
|
|
|
| 1 |
+
from openai import OpenAI
|
| 2 |
import gradio as gr
|
| 3 |
+
import fitz # PyMuPDF
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from pathlib import Path
|
| 6 |
import os
|
| 7 |
+
|
|
|
|
| 8 |
|
| 9 |
api_key = os.getenv('API_KEY')
|
| 10 |
base_url = os.getenv("BASE_URL")
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def extract_pdf_pypdf(pdf_dir):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_dir: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The text of all readable pages, each followed by a blank line,
        or ``None`` when the document cannot be opened.
    """
    try:
        doc = fitz.open(pdf_dir)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None

    page_texts = []
    try:
        for page in range(doc.page_count):
            try:
                page_texts.append(doc.load_page(page).get_text("text"))
            except Exception as e:
                # Best effort: one unreadable page must not discard the rest.
                print(f"Error reading page {page}: {e}")
    finally:
        # Release the underlying file handle even if iteration fails.
        doc.close()

    # Same layout as before: every page's text followed by a blank line.
    return "".join(f"{text}\n\n" for text in page_texts)
|
| 36 |
|
|
|
|
| 42 |
messages=messages,
|
| 43 |
temperature=0.1,
|
| 44 |
max_tokens=8192,
|
|
|
|
| 45 |
stream=True
|
| 46 |
)
|
| 47 |
+
response = ''.join(
|
| 48 |
+
[chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in completion])
|
| 49 |
+
return response
|
| 50 |
except Exception as ex:
|
| 51 |
+
print("API error:", ex)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
return None
|
| 53 |
|
| 54 |
|
|
|
|
| 73 |
return extract_result or "Too many users. Please wait a moment!"
|
| 74 |
|
| 75 |
|
| 76 |
+
def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF to render.
        image_folder: Directory that receives the page images; it is
            created when it does not exist yet.
        dpi: Resolution handed to ``page.get_pixmap``.

    Returns:
        The image file paths as strings, in page order. Files are named
        ``page_<n>.png`` with ``n`` starting at 1.
    """
    # Create the folder that stores the rendered images.
    os.makedirs(image_folder, exist_ok=True)

    # Open the PDF document.
    pdf_document = fitz.open(pdf_path)
    image_paths = []
    try:
        # Walk through each page and render it at the requested DPI.
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]
            pix = page.get_pixmap(dpi=dpi)
            image_path = Path(image_folder) / f"page_{page_number + 1}.png"
            # NOTE(review): assumes the pixmap is 3-channel RGB without
            # alpha, as get_pixmap is called with no colorspace/alpha
            # arguments here - confirm if those are ever added.
            Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path)
            image_paths.append(str(image_path))  # Collect each page's image path.
    finally:
        # Close the document even when rendering or saving a page fails.
        pdf_document.close()

    return image_paths
|
| 94 |
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
+
def display_pdf_images(file):
    """Render an uploaded PDF into page images for the viewer.

    Args:
        file: Path of the uploaded PDF, or a falsy value (``None`` / empty
            string) when nothing has been uploaded.

    Returns:
        The list of page image paths produced by ``convert_pdf_to_images``,
        or an empty list when no file was provided.
    """
    # Nothing uploaded yet: show an empty gallery instead of trying to open it.
    if not file:
        return []

    # Convert the PDF into high-resolution images and return their paths.
    return convert_pdf_to_images(file)
|
| 100 |
|
| 101 |
|
| 102 |
en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
|
|
|
|
| 111 |
|
| 112 |
with gr.Blocks(title="PaperExtractGPT") as demo:
|
| 113 |
gr.Markdown(
|
| 114 |
+
'''<h1 align="center"> Paper Extract GPT </h1>
|
| 115 |
+
<p>How to use:
|
| 116 |
+
<br><strong>1</strong>: Upload your PDF.
|
| 117 |
+
<br><strong>2</strong>: Click "View PDF" to preview it.
|
| 118 |
+
<br><strong>3</strong>: Enter your extraction prompt in the input box.
|
| 119 |
+
<br><strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
|
| 120 |
+
</p>'''
|
|
|
|
|
|
|
| 121 |
)
|
| 122 |
with gr.Row():
|
| 123 |
with gr.Column():
|
|
|
|
| 124 |
file_input = gr.File(label="Upload your PDF", type="filepath")
|
| 125 |
+
example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
|
| 126 |
viewer_button = gr.Button("View PDF")
|
| 127 |
+
file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
|
| 128 |
|
| 129 |
with gr.Column():
|
| 130 |
model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
|
|
|
| 132 |
with gr.Row():
|
| 133 |
gen = gr.Button("Generate")
|
| 134 |
clr = gr.Button("Clear")
|
| 135 |
+
outputs = gr.Markdown(label='Output', value="""| Title | Journal | Year | Author | Institution | Email |
|
| 136 |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
|
| 137 |
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
|
| 138 |
""")
|
| 139 |
|
| 140 |
gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
|
| 141 |
clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
|
| 142 |
+
viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
|
| 143 |
|
| 144 |
demo.launch()
|