AloneDancer committed on
Commit
1c1c960
·
verified ·
1 Parent(s): 0d0aea0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +194 -65
app.py CHANGED
@@ -1,65 +1,194 @@
1
- import gradio as gr
2
- import os
3
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
4
- from magic_pdf.pipe.UNIPipe import UNIPipe
5
- from magic_pdf.tools.common import parse_pdf_by_unipipe
6
-
7
def convert_pdf(pdf_file, enable_formula, enable_table, max_pages=100):
    """Convert an uploaded PDF to Markdown and save the result under ``output/``.

    Args:
        pdf_file: Uploaded file object exposing the on-disk path as ``.name``,
            or ``None`` when nothing was uploaded.
        enable_formula: Whether formula recognition is requested.
        enable_table: Whether table recognition is requested.
        max_pages: Upper bound on the number of pages forwarded to the parser.

    Returns:
        ``(markdown_text, output_path)`` on success, or
        ``("Error: ...", None)`` when no file was given or conversion failed.
    """
    # Report a missing upload explicitly instead of surfacing the
    # AttributeError that ``None.name`` would raise.
    if pdf_file is None:
        return "Error: no PDF file was uploaded", None

    try:
        # Path of the uploaded file
        file_path = pdf_file.name
        parent_path = os.path.dirname(file_path)

        # Set up the MinerU pipeline
        image_writer = DiskReaderWriter(parent_path)
        jso_useful_key = {
            "model_list": [],
            "enable_formula_recognition": enable_formula,
            "enable_table_recognition": enable_table,
        }
        pipe = UNIPipe(file_path, jso_useful_key, image_writer=image_writer)

        # Run PDF parsing
        pipe.pipe_classify()
        pipe.pipe_parse()

        # Convert to Markdown
        # NOTE(review): the attribute name ``pipe_mkユニ`` looks garbled; confirm
        # the intended UNIPipe method before relying on this call.
        md_content = parse_pdf_by_unipipe(file_path, pipe.pipe_mkユニ, max_pages=max_pages)

        # Persist the Markdown next to the app so it can be offered for download
        output_dir = "output"
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, f"{os.path.basename(file_path)}.md")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(md_content)

        return md_content, output_file
    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"Error: {str(e)}", None
39
-
40
# Define the Gradio interface: inputs on the left, results on the right.
with gr.Blocks(title="MinerU PDF to Markdown Converter") as iface:
    gr.Markdown("# MinerU PDF to Markdown Converter")
    gr.Markdown("Upload a PDF file, configure options, and convert to Markdown.")

    with gr.Row():
        # Input column: upload widget, recognition toggles, page limit.
        with gr.Column():
            pdf_input = gr.File(
                label="Upload PDF File",
                file_types=[".pdf"],
            )
            formula_checkbox = gr.Checkbox(
                label="Enable formula recognition",
                value=False,
            )
            table_checkbox = gr.Checkbox(
                label="Enable table recognition",
                value=False,
            )
            max_pages = gr.Number(
                label="Max Pages to Process",
                value=100,
                precision=0,
            )
            convert_button = gr.Button("Convert to Markdown")

        # Output column: rendered Markdown plus the saved file.
        with gr.Column():
            markdown_output = gr.Markdown(label="Markdown Output")
            download_button = gr.File(label="Download Markdown File")

    # Wire the button to the converter.
    convert_button.click(
        fn=convert_pdf,
        inputs=[pdf_input, formula_checkbox, table_checkbox, max_pages],
        outputs=[markdown_output, download_button],
    )

# Start the Gradio app.
iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF
3
+ from pymupdf4llm import get_markdown
4
+ import os
5
+ import tempfile
6
+ import aiofiles
7
+ from typing import Optional
8
+ import markdown2
9
+ from pathlib import Path
10
+
11
class MinerUProcessor:
    """Convert PDF and image documents into Markdown text."""

    def __init__(self):
        # Lower-case extensions (with leading dot) that callers may submit.
        self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']

    async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Process a file and return its Markdown content.

        PDFs are converted directly; any other extension is first wrapped in
        a temporary single-page PDF, which is removed again whether or not
        the conversion succeeds.

        Args:
            file_path: Path of the document to convert.
            enable_formula: Forwarded to the converter as ``use_latex``.
            enable_table: Forwarded to the converter as ``use_table``.

        Returns:
            The Markdown text, or a message starting with ``处理文件时出错``
            when anything goes wrong (exceptions are not propagated).
        """
        temp_pdf_path = None
        try:
            file_ext = os.path.splitext(file_path)[1].lower()

            if file_ext == '.pdf':
                source_path = file_path
            else:
                # Images are converted to a PDF first, then processed like one.
                temp_pdf_path = await self.image_to_pdf(file_path)
                source_path = temp_pdf_path

            # NOTE(review): confirm that the installed pymupdf4llm exposes
            # ``get_markdown`` and accepts ``use_latex``/``use_table``.
            return get_markdown(
                source_path,
                write_images=False,
                use_latex=enable_formula,
                use_table=enable_table
            )

        except Exception as e:
            return f"处理文件时出错: {str(e)}"

        finally:
            # Always clean up the intermediate PDF, even when conversion failed.
            if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
                try:
                    os.remove(temp_pdf_path)
                except OSError:
                    # Best-effort cleanup; never mask the conversion result.
                    pass

    async def image_to_pdf(self, image_path: str) -> str:
        """Convert an image into a single-page temporary PDF.

        Args:
            image_path: Path of the image to convert.

        Returns:
            Path of the newly written PDF. The caller owns the file and is
            responsible for deleting it.

        Raises:
            Exception: Whatever PyMuPDF raises while reading the image or
                writing the PDF; no partial output file is left behind.
        """
        img = fitz.open(image_path)
        try:
            rect = img[0].rect
            pdf_bytes = img.convert_to_pdf()
        finally:
            img.close()

        doc = fitz.open()
        try:
            img_pdf = fitz.open("pdf", pdf_bytes)
            try:
                page = doc.new_page(width=rect.width, height=rect.height)
                page.show_pdf_page(rect, img_pdf, 0)

                # Reserve a unique path and release the handle right away so
                # it is not leaked and PyMuPDF can write to the path itself.
                with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_pdf:
                    temp_pdf_path = temp_pdf.name
                try:
                    doc.save(temp_pdf_path)
                except Exception:
                    if os.path.exists(temp_pdf_path):
                        os.remove(temp_pdf_path)
                    raise
            finally:
                img_pdf.close()
        finally:
            doc.close()

        return temp_pdf_path
65
+
66
async def process_document(
    file: Optional[str] = None,
    enable_formula: bool = True,
    enable_table: bool = True
):
    """Convert an uploaded document and build the values shown in the UI.

    Args:
        file: The upload. A filesystem path (what ``gr.File`` delivers with
            ``type="filepath"``) or, for compatibility, an object exposing
            the path as ``.name``. ``None`` when nothing was uploaded.
        enable_formula: Whether formula recognition is requested.
        enable_table: Whether table recognition is requested.

    Returns:
        A 3-tuple ``(markdown_text, html_preview, download_path)``. On any
        failure the first item is a human-readable message and the other two
        are ``None``.
    """
    if file is None:
        return "请上传文件", None, None

    try:
        processor = MinerUProcessor()

        # Accept both a plain path and a file-like wrapper with ``.name``;
        # calling ``.name`` on a str path would raise AttributeError.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext not in processor.supported_formats:
            return f"不支持的文件格式: {file_ext}", None, None

        # Run the conversion
        markdown_content = await processor.process_file(file_path, enable_formula, enable_table)

        # The processor reports failures as text; do not render or offer
        # that message for download as if it were a converted document.
        if markdown_content.startswith("处理文件时出错"):
            return markdown_content, None, None

        # Build the HTML preview
        html_preview = markdown2.markdown(markdown_content)

        # Write the Markdown to a real file named after the upload, because
        # the download component needs an existing path, not a bare name.
        original_name = os.path.basename(file_path)
        base_name = os.path.splitext(original_name)[0]
        download_path = os.path.join(tempfile.mkdtemp(), f"{base_name}.md")
        with open(download_path, "w", encoding="utf-8") as fh:
            fh.write(markdown_content)

        return markdown_content, html_preview, download_path

    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"处理过程中出错: {str(e)}", None, None
100
+
101
def create_download_file(markdown_content: str, filename: str):
    """Write the Markdown text to a temporary ``.md`` file for download.

    Args:
        markdown_content: Text currently shown in the Markdown output box.
        filename: Current value of the download component; accepted for the
            event signature but not used to name the file.

    Returns:
        Path of the written UTF-8 file, or ``None`` when there is no content
        or the content is one of the app's own status/error messages.
    """
    # Every status/error message the app can place in the output box; none
    # of them should be saved as if it were a converted document.
    non_document_prefixes = (
        "处理过程中出错",
        "处理文件时出错",
        "不支持的文件格式",
        "请上传文件",
    )
    if not markdown_content or markdown_content.startswith(non_document_prefixes):
        return None

    # delete=False keeps the file on disk after the handle is closed so the
    # UI can serve it.
    with tempfile.NamedTemporaryFile(
        suffix='.md', delete=False, mode='w', encoding='utf-8'
    ) as temp_file:
        temp_file.write(markdown_content)

    return temp_file.name
112
+
113
# Build the Gradio interface: controls on the left, results on the right.
with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 📄 MinerU Document Processor

    将PDF和图片文档转换为Markdown格式,支持公式和表格识别。
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" hands the handler a plain path string.
            file_input = gr.File(
                label="上传文档",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"],
                type="filepath"
            )

            with gr.Group():
                gr.Markdown("### 识别选项")
                enable_formula = gr.Checkbox(
                    label="Enable formula recognition",
                    value=True,
                    info="是否启用公式识别"
                )
                enable_table = gr.Checkbox(
                    label="Enable table recognition",
                    value=True,
                    info="是否启用表格识别"
                )

            process_btn = gr.Button("🚀 开始处理", variant="primary")

            # Shown from the start: no handler ever toggles visibility, so a
            # hidden button would leave the download unreachable.
            download_btn = gr.DownloadButton(
                "📥 下载Markdown文件",
                visible=True
            )

        with gr.Column(scale=2):
            with gr.Tab("Markdown预览"):
                markdown_output = gr.Code(
                    label="转换结果 (Markdown)",
                    language="markdown",
                    lines=20,
                    interactive=False
                )

            with gr.Tab("HTML预览"):
                html_output = gr.HTML(
                    label="HTML预览"
                )

    # Event wiring: run the conversion when the process button is clicked.
    process_btn.click(
        fn=process_document,
        inputs=[file_input, enable_formula, enable_table],
        outputs=[markdown_output, html_output, download_btn]
    )

    # Download handling: (re)generate the file from the current Markdown text.
    download_btn.click(
        fn=create_download_file,
        inputs=[markdown_output, download_btn],
        outputs=download_btn
    )

    # Examples
    # NOTE(review): these URLs are placeholders on example.com; confirm they
    # resolve to real files or replace them with bundled sample documents.
    gr.Examples(
        examples=[
            ["https://example.com/sample.pdf", True, True],
            ["https://example.com/sample.png", False, True]
        ],
        inputs=[file_input, enable_formula, enable_table],
        outputs=[markdown_output, html_output, download_btn],
        fn=process_document,
        cache_examples=False
    )

if __name__ == "__main__":
    # Listen on all interfaces on port 7860 without a public share link.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )