Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import fitz # PyMuPDF
|
| 3 |
+
from base64 import b64encode
|
| 4 |
+
from gradio_pdf import PDF
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def read_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: Path to the PDF file to read. ``None`` or an empty path is
            treated as "nothing uploaded".

    Returns:
        The text of all pages concatenated in page order, or an empty
        string when no file was provided.
    """
    # The extract button can be clicked before anything is uploaded; report
    # no text instead of handing a missing path to PyMuPDF.
    if not file:
        return ""
    # The context manager closes the document even if extraction raises.
    with fitz.open(file) as pdf_document:
        # join() avoids rebuilding the accumulated string for every page.
        return "".join(page.get_text() for page in pdf_document)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# def display_pdf(file):
|
| 19 |
+
# # 将PDF文件内容编码为base64
|
| 20 |
+
# with open(file.name, "rb") as f:
|
| 21 |
+
# encoded_pdf = b64encode(f.read()).decode('utf-8')
|
| 22 |
+
# # 使用HTML嵌入PDF查看器
|
| 23 |
+
# pdf_html = f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="100%" height="600px"></iframe>'
|
| 24 |
+
# return pdf_html
|
| 25 |
+
def display_pdf(file):
    """Hand the uploaded file path through unchanged for the PDF viewer.

    Args:
        file: Value received from the file input.

    Returns:
        The same value, untouched, so the PDF component can load it.
    """
    return file
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Build the interface with the Blocks layout API.
# NOTE(review): the nesting below is reconstructed from a flattened diff
# view; confirm it against the original app.py.
with gr.Blocks() as app:
    gr.Markdown('''<h1 align="center"> PDF Text Extractor </h1>''')
    gr.Markdown('''<p align="center">Upload a PDF file to extract its text and view it.</p>''')

    with gr.Row():
        # Left column: upload control, action buttons and the PDF viewer.
        with gr.Column(scale=1):
            # type="filepath" makes the callbacks receive a path string.
            file_input = gr.File(
                label="Upload a PDF file",
                type="filepath",
            )
            with gr.Row():
                display_button = gr.Button("Display PDF", variant="primary")
                extract_button = gr.Button("Extract Text", variant="secondary")
            # pdf_viewer = gr.HTML(label="PDF Viewer")
            # Viewer component provided by the gradio_pdf package.
            pdf_viewer = PDF(label="PDF Viewer")

        # Right column: read-only box showing the extracted text.
        with gr.Column(scale=1):
            text_output = gr.Textbox(
                label="Extracted Text",
                interactive=False,
                placeholder="Extracted text will appear here...",
                lines=49,
            )

    # Wire each button to its handler.
    extract_button.click(fn=read_pdf, inputs=file_input, outputs=text_output)
    display_button.click(fn=display_pdf, inputs=file_input, outputs=pdf_viewer)

app.launch()
|