Spaces:

jackkuo
/

PDF-text-extractor

Sleeping

App Files Community

jackkuo committed on Oct 29, 2024

Commit

c1df903

verified ·

1 Parent(s): a013bda

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -23

app.py CHANGED Viewed

@@ -1,51 +1,95 @@
 import gradio as gr
 import fitz  # PyMuPDF
-from base64 import b64encode
-from gradio_pdf import PDF
 def read_pdf(file):
-    # 打开PDF文件
     pdf_document = fitz.open(file)
     text = ""
-    # 遍历每一页PDF，并提取文本
     for page in pdf_document:
         text += page.get_text()
     pdf_document.close()
     return text
-# def display_pdf(file):
-#     # 将PDF文件内容编码为base64
-#     with open(file.name, "rb") as f:
-#         encoded_pdf = b64encode(f.read()).decode('utf-8')
-#     # 使用HTML嵌入PDF查看器
-#     pdf_html = f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="100%" height="600px"></iframe>'
-#     return pdf_html
-def display_pdf(file):
-    return file  # 返回文件路径，以便PDF组件使用
 # 使用Blocks布局
 with gr.Blocks() as app:
     gr.Markdown('''<h1 align="center"> PDF Text Extractor </h1>''')
-    gr.Markdown('''<p  align="center">Upload a PDF file to extract its text and view it.</p>''')
     with gr.Row():
         with gr.Column(scale=1):
-            file_input = gr.File(label="Upload a PDF file", type="filepath")  # 设置为 'filepath'
             with gr.Row():
-                display_button = gr.Button("Display PDF", variant="primary")
-                extract_button = gr.Button("Extract Text", variant="secondary")
-            # pdf_viewer = gr.HTML(label="PDF Viewer")
-            pdf_viewer = PDF(label="PDF Viewer")  # 使用gradio_pdf的PDF组件
         with gr.Column(scale=1):
-            text_output = gr.Textbox(label="Extracted Text", interactive=False,
-                                     placeholder="Extracted text will appear here...", lines=49)
     # 连接按钮和功能
     extract_button.click(read_pdf, inputs=file_input, outputs=text_output)
-    display_button.click(display_pdf, inputs=file_input, outputs=pdf_viewer)
-app.launch()

 import gradio as gr
 import fitz  # PyMuPDF
+from PIL import Image
+from pathlib import Path
+import os
+def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
+    # 创建存储图像的文件夹
+    os.makedirs(image_folder, exist_ok=True)
+    # 打开PDF文档
+    pdf_document = fitz.open(pdf_path)
+    image_paths = []
+    # 遍历每一页PDF，并生成高DPI的图像
+    for page_number in range(len(pdf_document)):
+        page = pdf_document[page_number]
+        pix = page.get_pixmap(dpi=dpi)
+        image_path = Path(image_folder) / f"page_{page_number + 1}.png"
+        Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path)
+        image_paths.append(str(image_path))  # 收集每一页的图像路径
+    pdf_document.close()
+    return image_paths
 def read_pdf(file):
+    # 提取PDF中的文本
     pdf_document = fitz.open(file)
     text = ""
     for page in pdf_document:
         text += page.get_text()
     pdf_document.close()
     return text
+def display_pdf_images(file):
+    # 转换PDF为高清图像
+    image_paths = convert_pdf_to_images(file)
+    return image_paths  # 返回图像路径列表以显示
+# 示例PDF路径
+example_pdf_path = "./sample.pdf"  # 将此替换为您的示例 PDF 的实际路径
 # 使用Blocks布局
 with gr.Blocks() as app:
     gr.Markdown('''<h1 align="center"> PDF Text Extractor </h1>''')
+    gr.Markdown('''<p align="center">Upload a PDF file to extract its text and view it.</p>''')
     with gr.Row():
         with gr.Column(scale=1):
+            file_input = gr.File(label="Upload a PDF file", type="filepath")
             with gr.Row():
+                display_button = gr.Button("Display PDF", variant="secondary")
+                extract_button = gr.Button("Extract Text", variant="primary")
+            # 使用 Gallery 作为 PDF 查看器，并指定列数和高度
+            pdf_viewer = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
         with gr.Column(scale=1):
+            text_output = gr.Textbox(
+                label="Extracted Text",
+                interactive=True,
+                placeholder="Extracted text will appear here...",
+                lines=39,
+                max_lines=39,  # 设置最大行数，如果超过将显示滚动条
+                autoscroll=False,  # 设置自动滚动到底部
+                show_copy_button=True,
+                elem_id="text-output"
+            )
+    # 添加一个预设示例PDF
+    gr.Examples(
+        examples=[[example_pdf_path]],
+        inputs=file_input,
+        outputs=[pdf_viewer, text_output],
+        fn=lambda file: (display_pdf_images(file), read_pdf(file))
+    )
     # 连接按钮和功能
     extract_button.click(read_pdf, inputs=file_input, outputs=text_output)
+    display_button.click(display_pdf_images, inputs=file_input, outputs=pdf_viewer)
+# 自定义样式
+app.css = """
+#text-output {
+    width: 100%;
+    max-width: 600px;
+    overflow-y: auto;
+}
+"""
+app.launch()