Spaces:

jackkuo
/

PaperExtractGPT

Sleeping

App Files Community

jackkuo committed on Oct 29, 2024

Commit

900c0a5

verified ·

1 Parent(s): 0ad8950

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -59

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import gradio as gr
-import base64
 import os
-from openai import OpenAI
-import fitz
 api_key = os.getenv('API_KEY')
 base_url = os.getenv("BASE_URL")
@@ -14,20 +16,21 @@ client = OpenAI(
 def extract_pdf_pypdf(pdf_dir):
-    path = pdf_dir
     try:
-        doc = fitz.open(path)
-    except:
-        print("can not read pdf")
         return None
     page_count = doc.page_count
     file_content = ""
     for page in range(page_count):
-        text = doc.load_page(page).get_text("text")
-        # 防止目录中包含References
-        file_content += text + "\n\n"
     return file_content
@@ -39,26 +42,13 @@ def openai_api(messages):
             messages=messages,
             temperature=0.1,
             max_tokens=8192,
-            # timeout=300,
             stream=True
         )
     except Exception as ex:
-        print("api 出现如下异常%s" % ex)
-        return None
-    if completion:
-        try:
-            response_2_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in
-                               completion]
-            print("response tokens:", len(response_2_list))
-            response_2_content = ''.join(response_2_list)
-            return response_2_content
-        except Exception as ex:
-            print("第二轮 出现如下异常%s" % ex)
-            return None
-    else:
-        print("第二轮出现异常")
         return None
@@ -83,29 +73,30 @@ def predict(input_text, pdf_file):
     return extract_result or "Too many users. Please wait a moment!"
-def view_pdf(pdf_file, max_pages=3):
-    if pdf_file is None:
-        return "Please upload a PDF file to view."
-    try:
-        # Open the PDF file
-        doc = fitz.open(pdf_file.name)
-        # Only read up to `max_pages` pages to reduce size for large PDFs
-        preview_pdf = fitz.open()  # Create an empty PDF for the preview
-        for page_num in range(min(max_pages, doc.page_count)):
-            preview_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
-        # Save the preview as a temporary in-memory file
-        pdf_data = preview_pdf.tobytes()
-        # Encode as base64 for embedding in HTML
-        b64_data = base64.b64encode(pdf_data).decode('utf-8')
-        return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
-    except Exception as e:
-        print(f"Error displaying PDF: {e}")
-        return "Error displaying PDF. Please try re-uploading."
 en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
@@ -120,22 +111,20 @@ examples = [[en_1], [en_2]]
 with gr.Blocks(title="PaperExtractGPT") as demo:
     gr.Markdown(
-        '''<p align="center">
-        <h1 align="center"> Paper Extract GPT </h1>
-        <p> How to use:
-        <br> <strong>1</strong>: Upload your PDF.
-        <br> <strong>2</strong>: Click "View PDF" to preview it.
-        <br> <strong>3</strong>: Enter your extraction prompt in the input box.
-        <br> <strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
-        </p>
-        '''
     )
     with gr.Row():
         with gr.Column():
-            gr.Markdown('## Upload PDF')
             file_input = gr.File(label="Upload your PDF", type="filepath")
             viewer_button = gr.Button("View PDF")
-            file_out = gr.HTML(label="PDF Preview")
         with gr.Column():
             model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
@@ -143,13 +132,13 @@ with gr.Blocks(title="PaperExtractGPT") as demo:
             with gr.Row():
                 gen = gr.Button("Generate")
                 clr = gr.Button("Clear")
-            outputs = gr.Markdown(label='Output', show_label=True,  value="""| Title                                       | Journal            | Year | Author                                        | Institution                                           | Email                 |
 |---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
 | Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
 """)
     gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
     clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
-    viewer_button.click(view_pdf, inputs=file_input, outputs=file_out)
 demo.launch()

+from openai import OpenAI
 import gradio as gr
+import fitz  # PyMuPDF
+from PIL import Image
+from pathlib import Path
 import os
 api_key = os.getenv('API_KEY')
 base_url = os.getenv("BASE_URL")
 def extract_pdf_pypdf(pdf_dir):
     try:
+        doc = fitz.open(pdf_dir)
+    except Exception as e:
+        print(f"Error opening PDF: {e}")
         return None
     page_count = doc.page_count
     file_content = ""
     for page in range(page_count):
+        try:
+            text = doc.load_page(page).get_text("text")
+            file_content += text + "\n\n"
+        except Exception as e:
+            print(f"Error reading page {page}: {e}")
+            continue
     return file_content
             messages=messages,
             temperature=0.1,
             max_tokens=8192,
             stream=True
         )
+        response = ''.join(
+            [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in completion])
+        return response
     except Exception as ex:
+        print("API error:", ex)
         return None
     return extract_result or "Too many users. Please wait a moment!"
+def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
+    # 创建存储图像的文件夹
+    os.makedirs(image_folder, exist_ok=True)
+    # 打开PDF文档
+    pdf_document = fitz.open(pdf_path)
+    image_paths = []
+    # 遍历每一页PDF，并生成高DPI的图像
+    for page_number in range(len(pdf_document)):
+        page = pdf_document[page_number]
+        pix = page.get_pixmap(dpi=dpi)
+        image_path = Path(image_folder) / f"page_{page_number + 1}.png"
+        Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path)
+        image_paths.append(str(image_path))  # 收集每一页的图像路径
+    pdf_document.close()
+    return image_paths
+def display_pdf_images(file):
+    # 转换PDF为高清图像
+    image_paths = convert_pdf_to_images(file)
+    return image_paths  # 返回图像路径列表以显示
 en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
 with gr.Blocks(title="PaperExtractGPT") as demo:
     gr.Markdown(
+        '''<h1 align="center"> Paper Extract GPT </h1>
+        <p>How to use:
+        <br><strong>1</strong>: Upload your PDF.
+        <br><strong>2</strong>: Click "View PDF" to preview it.
+        <br><strong>3</strong>: Enter your extraction prompt in the input box.
+        <br><strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
+        </p>'''
     )
     with gr.Row():
         with gr.Column():
             file_input = gr.File(label="Upload your PDF", type="filepath")
+            example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
             viewer_button = gr.Button("View PDF")
+            file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
         with gr.Column():
             model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
             with gr.Row():
                 gen = gr.Button("Generate")
                 clr = gr.Button("Clear")
+            outputs = gr.Markdown(label='Output', value="""| Title                                       | Journal            | Year | Author                                        | Institution                                           | Email                 |
 |---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
 | Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
 """)
     gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
     clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
+    viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
 demo.launch()