AloneDancer committed on
Commit
55cb0f8
·
verified ·
1 Parent(s): 7a422b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -48
app.py CHANGED
@@ -1,57 +1,76 @@
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
- from pymupdf4llm import get_markdown
4
  import os
5
  import tempfile
6
  import aiofiles
7
  from typing import Optional
8
  import markdown2
9
  from pathlib import Path
10
- import asyncio
 
11
 
12
  class MinerUProcessor:
13
  def __init__(self):
14
  self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
17
  """处理文件并返回markdown内容"""
18
  try:
19
  file_ext = os.path.splitext(file_path)[1].lower()
20
 
21
  if file_ext == '.pdf':
22
- # 处理PDF文件
23
- markdown_text = get_markdown(
24
- file_path,
25
- write_images=False,
26
- use_latex=enable_formula,
27
- use_table=enable_table
28
- )
29
  else:
30
- # 处理图片文件
31
- doc = fitz.open()
32
- img = fitz.open(file_path)
33
- pdf_bytes = img.convert_to_pdf()
34
- img.close()
35
-
36
- # 保存为临时PDF
37
- temp_pdf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
38
- temp_pdf.close()
39
-
40
- with open(temp_pdf.name, 'wb') as f:
41
- f.write(pdf_bytes)
42
-
43
- markdown_text = get_markdown(
44
- temp_pdf.name,
45
- write_images=False,
46
- use_latex=enable_formula,
47
- use_table=enable_table
48
- )
49
-
50
- # 清理临时文件
51
- if os.path.exists(temp_pdf.name):
52
- os.remove(temp_pdf.name)
53
-
54
- return markdown_text
55
 
56
  except Exception as e:
57
  return f"处理文件时出错: {str(e)}"
@@ -63,7 +82,7 @@ async def process_document(
63
  ):
64
  """处理文档的主函数"""
65
  if file is None:
66
- return "请上传文件", None, None
67
 
68
  try:
69
  processor = MinerUProcessor()
@@ -71,7 +90,7 @@ async def process_document(
71
  file_ext = os.path.splitext(file_path)[1].lower()
72
 
73
  if file_ext not in processor.supported_formats:
74
- return f"不支持的文件格式: {file_ext}", None, None
75
 
76
  # 异步处理文件
77
  markdown_content = await processor.process_file(file_path, enable_formula, enable_table)
@@ -87,7 +106,7 @@ async def process_document(
87
  return markdown_content, html_preview, gr.DownloadButton("📥 下载Markdown文件", visible=True)
88
 
89
  except Exception as e:
90
- return f"处理过程中出错: {str(e)}", None, None
91
 
92
  def create_download_file(markdown_content: str):
93
  """创建下载文件"""
@@ -102,18 +121,20 @@ def create_download_file(markdown_content: str):
102
  return temp_file.name
103
 
104
  # 创建Gradio界面
105
- with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as demo:
106
  gr.Markdown("""
107
- # 📄 MinerU Document Processor
 
 
108
 
109
- PDF和图片文档转换为Markdown格式,支持公式和表格识别。
110
  """)
111
 
112
  with gr.Row():
113
  with gr.Column(scale=1):
114
  file_input = gr.File(
115
- label="上传文档",
116
- file_types=[".pdf", ".png", ".jpg", ".jpeg"],
117
  type="filepath"
118
  )
119
 
@@ -121,8 +142,8 @@ with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as dem
121
  gr.Markdown("### 识别选项")
122
  enable_formula = gr.Checkbox(
123
  label="Enable formula recognition",
124
- value=True,
125
- info="是否启用公式识别"
126
  )
127
  enable_table = gr.Checkbox(
128
  label="Enable table recognition",
@@ -140,10 +161,8 @@ with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as dem
140
  interactive=False
141
  )
142
 
143
- html_output = gr.HTML(
144
- label="HTML预览",
145
- visible=False
146
- )
147
 
148
  download_btn = gr.DownloadButton(
149
  "📥 下载Markdown文件",
@@ -163,6 +182,22 @@ with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as dem
163
  inputs=markdown_output,
164
  outputs=download_btn
165
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  if __name__ == "__main__":
168
  demo.launch(
 
1
  import gradio as gr
2
  import fitz # PyMuPDF
 
3
  import os
4
  import tempfile
5
  import aiofiles
6
  from typing import Optional
7
  import markdown2
8
  from pathlib import Path
9
+ import io
10
+ import base64
11
 
12
  class MinerUProcessor:
13
  def __init__(self):
14
  self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']
15
 
16
+ async def process_pdf(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
17
+ """处理PDF文件并转换为Markdown"""
18
+ try:
19
+ doc = fitz.open(file_path)
20
+ markdown_content = ""
21
+
22
+ for page_num in range(len(doc)):
23
+ page = doc.load_page(page_num)
24
+
25
+ # 获取文本内容
26
+ text = page.get_text()
27
+
28
+ # 简单的Markdown转换
29
+ markdown_content += f"# Page {page_num + 1}\n\n"
30
+ markdown_content += text.replace('\n', ' \n') + "\n\n"
31
+
32
+ # 如果启用表格识别,尝试提取表格
33
+ if enable_table:
34
+ tabs = page.find_tables()
35
+ if tabs.tables:
36
+ markdown_content += "## Tables\n\n"
37
+ for i, tab in enumerate(tabs.tables):
38
+ table_data = tab.extract()
39
+ if table_data:
40
+ markdown_content += f"### Table {i + 1}\n\n"
41
+ # 简单的表格Markdown格式
42
+ for row_idx, row in enumerate(table_data):
43
+ if row_idx == 0: # 表头
44
+ markdown_content += "| " + " | ".join(str(cell) for cell in row) + " |\n"
45
+ markdown_content += "| " + " | ".join("---" for _ in row) + " |\n"
46
+ else:
47
+ markdown_content += "| " + " | ".join(str(cell) for cell in row) + " |\n"
48
+ markdown_content += "\n"
49
+
50
+ doc.close()
51
+ return markdown_content
52
+
53
+ except Exception as e:
54
+ return f"处理PDF时出错: {str(e)}"
55
+
56
+ async def process_image(self, file_path: str) -> str:
57
+ """处理图片文件(简单返回提示信息)"""
58
+ try:
59
+ # 对于图片文件,我们使用OCR功能(需要tesseract,这里简单处理)
60
+ return f"# Image File: {os.path.basename(file_path)}\n\n⚠️ 图片OCR功能需要额外配置Tesseract。当前仅支持PDF文件的文本提取。"
61
+
62
+ except Exception as e:
63
+ return f"处理图片时出错: {str(e)}"
64
+
65
  async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
66
  """处理文件并返回markdown内容"""
67
  try:
68
  file_ext = os.path.splitext(file_path)[1].lower()
69
 
70
  if file_ext == '.pdf':
71
+ return await self.process_pdf(file_path, enable_formula, enable_table)
 
 
 
 
 
 
72
  else:
73
+ return await self.process_image(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  except Exception as e:
76
  return f"处理文件时出错: {str(e)}"
 
82
  ):
83
  """处理文档的主函数"""
84
  if file is None:
85
+ return "请上传PDF文件", None, gr.DownloadButton("📥 下载Markdown文件", visible=False)
86
 
87
  try:
88
  processor = MinerUProcessor()
 
90
  file_ext = os.path.splitext(file_path)[1].lower()
91
 
92
  if file_ext not in processor.supported_formats:
93
+ return f"不支持的文件格式: {file_ext}。支持格式: {', '.join(processor.supported_formats)}", None, gr.DownloadButton("📥 下载Markdown文件", visible=False)
94
 
95
  # 异步处理文件
96
  markdown_content = await processor.process_file(file_path, enable_formula, enable_table)
 
106
  return markdown_content, html_preview, gr.DownloadButton("📥 下载Markdown文件", visible=True)
107
 
108
  except Exception as e:
109
+ return f"处理过程中出错: {str(e)}", None, gr.DownloadButton("📥 下载Markdown文件", visible=False)
110
 
111
  def create_download_file(markdown_content: str):
112
  """创建下载文件"""
 
121
  return temp_file.name
122
 
123
  # 创建Gradio界面
124
+ with gr.Blocks(title="MinerU PDF to Markdown", theme=gr.themes.Soft()) as demo:
125
  gr.Markdown("""
126
+ # 📄 MinerU PDF to Markdown Converter
127
+
128
+ 将PDF文档转换为Markdown格式,支持基本的文本提取和表格识别。
129
 
130
+ **注意**: 当前版本主要支持PDF文本提取,公式识别需要额外配置。
131
  """)
132
 
133
  with gr.Row():
134
  with gr.Column(scale=1):
135
  file_input = gr.File(
136
+ label="上传PDF文档",
137
+ file_types=[".pdf"],
138
  type="filepath"
139
  )
140
 
 
142
  gr.Markdown("### 识别选项")
143
  enable_formula = gr.Checkbox(
144
  label="Enable formula recognition",
145
+ value=False,
146
+ info="需要额外配置(当前不可用)"
147
  )
148
  enable_table = gr.Checkbox(
149
  label="Enable table recognition",
 
161
  interactive=False
162
  )
163
 
164
+ with gr.Accordion("HTML预览", open=False):
165
+ html_output = gr.HTML()
 
 
166
 
167
  download_btn = gr.DownloadButton(
168
  "📥 下载Markdown文件",
 
182
  inputs=markdown_output,
183
  outputs=download_btn
184
  )
185
+
186
+ # 添加说明
187
+ gr.Markdown("""
188
+ ## 使用说明
189
+
190
+ 1. 上传PDF文件
191
+ 2. 选择识别选项
192
+ 3. 点击"开始处理"按钮
193
+ 4. 查看转换结果并下载Markdown文件
194
+
195
+ ## 限制
196
+
197
+ - 当前主要支持PDF文本提取
198
+ - 公式识别需要额外配置OCR服务
199
+ - 表格识别为基本功能
200
+ """)
201
 
202
  if __name__ == "__main__":
203
  demo.launch(