Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,57 +1,76 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import fitz # PyMuPDF
|
| 3 |
-
from pymupdf4llm import get_markdown
|
| 4 |
import os
|
| 5 |
import tempfile
|
| 6 |
import aiofiles
|
| 7 |
from typing import Optional
|
| 8 |
import markdown2
|
| 9 |
from pathlib import Path
|
| 10 |
-
import
|
|
|
|
| 11 |
|
| 12 |
class MinerUProcessor:
|
| 13 |
def __init__(self):
|
| 14 |
self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
|
| 17 |
"""处理文件并返回markdown内容"""
|
| 18 |
try:
|
| 19 |
file_ext = os.path.splitext(file_path)[1].lower()
|
| 20 |
|
| 21 |
if file_ext == '.pdf':
|
| 22 |
-
|
| 23 |
-
markdown_text = get_markdown(
|
| 24 |
-
file_path,
|
| 25 |
-
write_images=False,
|
| 26 |
-
use_latex=enable_formula,
|
| 27 |
-
use_table=enable_table
|
| 28 |
-
)
|
| 29 |
else:
|
| 30 |
-
|
| 31 |
-
doc = fitz.open()
|
| 32 |
-
img = fitz.open(file_path)
|
| 33 |
-
pdf_bytes = img.convert_to_pdf()
|
| 34 |
-
img.close()
|
| 35 |
-
|
| 36 |
-
# 保存为临时PDF
|
| 37 |
-
temp_pdf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
| 38 |
-
temp_pdf.close()
|
| 39 |
-
|
| 40 |
-
with open(temp_pdf.name, 'wb') as f:
|
| 41 |
-
f.write(pdf_bytes)
|
| 42 |
-
|
| 43 |
-
markdown_text = get_markdown(
|
| 44 |
-
temp_pdf.name,
|
| 45 |
-
write_images=False,
|
| 46 |
-
use_latex=enable_formula,
|
| 47 |
-
use_table=enable_table
|
| 48 |
-
)
|
| 49 |
-
|
| 50 |
-
# 清理临时文件
|
| 51 |
-
if os.path.exists(temp_pdf.name):
|
| 52 |
-
os.remove(temp_pdf.name)
|
| 53 |
-
|
| 54 |
-
return markdown_text
|
| 55 |
|
| 56 |
except Exception as e:
|
| 57 |
return f"处理文件时出错: {str(e)}"
|
|
@@ -63,7 +82,7 @@ async def process_document(
|
|
| 63 |
):
|
| 64 |
"""处理文档的主函数"""
|
| 65 |
if file is None:
|
| 66 |
-
return "
|
| 67 |
|
| 68 |
try:
|
| 69 |
processor = MinerUProcessor()
|
|
@@ -71,7 +90,7 @@ async def process_document(
|
|
| 71 |
file_ext = os.path.splitext(file_path)[1].lower()
|
| 72 |
|
| 73 |
if file_ext not in processor.supported_formats:
|
| 74 |
-
return f"不支持的文件格式: {file_ext}", None,
|
| 75 |
|
| 76 |
# 异步处理文件
|
| 77 |
markdown_content = await processor.process_file(file_path, enable_formula, enable_table)
|
|
@@ -87,7 +106,7 @@ async def process_document(
|
|
| 87 |
return markdown_content, html_preview, gr.DownloadButton("📥 下载Markdown文件", visible=True)
|
| 88 |
|
| 89 |
except Exception as e:
|
| 90 |
-
return f"处理过程中出错: {str(e)}", None,
|
| 91 |
|
| 92 |
def create_download_file(markdown_content: str):
|
| 93 |
"""创建下载文件"""
|
|
@@ -102,18 +121,20 @@ def create_download_file(markdown_content: str):
|
|
| 102 |
return temp_file.name
|
| 103 |
|
| 104 |
# 创建Gradio界面
|
| 105 |
-
with gr.Blocks(title="MinerU
|
| 106 |
gr.Markdown("""
|
| 107 |
-
# 📄 MinerU
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
|
| 110 |
""")
|
| 111 |
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=1):
|
| 114 |
file_input = gr.File(
|
| 115 |
-
label="
|
| 116 |
-
file_types=[".pdf"
|
| 117 |
type="filepath"
|
| 118 |
)
|
| 119 |
|
|
@@ -121,8 +142,8 @@ with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as dem
|
|
| 121 |
gr.Markdown("### 识别选项")
|
| 122 |
enable_formula = gr.Checkbox(
|
| 123 |
label="Enable formula recognition",
|
| 124 |
-
value=
|
| 125 |
-
info="
|
| 126 |
)
|
| 127 |
enable_table = gr.Checkbox(
|
| 128 |
label="Enable table recognition",
|
|
@@ -140,10 +161,8 @@ with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as dem
|
|
| 140 |
interactive=False
|
| 141 |
)
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
visible=False
|
| 146 |
-
)
|
| 147 |
|
| 148 |
download_btn = gr.DownloadButton(
|
| 149 |
"📥 下载Markdown文件",
|
|
@@ -163,6 +182,22 @@ with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as dem
|
|
| 163 |
inputs=markdown_output,
|
| 164 |
outputs=download_btn
|
| 165 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
if __name__ == "__main__":
|
| 168 |
demo.launch(
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import fitz # PyMuPDF
|
|
|
|
| 3 |
import os
|
| 4 |
import tempfile
|
| 5 |
import aiofiles
|
| 6 |
from typing import Optional
|
| 7 |
import markdown2
|
| 8 |
from pathlib import Path
|
| 9 |
+
import io
|
| 10 |
+
import base64
|
| 11 |
|
| 12 |
class MinerUProcessor:
    """Convert uploaded documents to Markdown text using PyMuPDF.

    PDF files have their page text (and, optionally, detected tables)
    extracted. Image files are accepted but only produce a notice, because
    no OCR backend is wired in. Every public method reports failures by
    returning an error string rather than raising, so the result can always
    be shown directly in the UI.
    """

    def __init__(self):
        # Extensions (lower-case, with leading dot) that callers may accept.
        self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']

    @staticmethod
    def _format_cell(cell: object) -> str:
        """Return *cell* as text that is safe inside a Markdown table cell."""
        if cell is None:
            # Empty cells are extracted as None; render them blank instead of
            # the literal text "None".
            return ""
        # A raw pipe would start a new column, and a raw newline would end
        # the table row, so escape the former and fold the latter.
        escaped = str(cell).replace("|", "\\|")
        return "<br>".join(escaped.splitlines())

    @classmethod
    def _format_row(cls, row) -> str:
        """Return one Markdown table line (without newline) for *row*."""
        return "| " + " | ".join(cls._format_cell(cell) for cell in row) + " |"

    @classmethod
    def _table_to_markdown(cls, table_data) -> str:
        """Render extracted rows as a Markdown table; the first row is the header.

        *table_data* must be a non-empty sequence of rows. Every emitted line
        is newline-terminated.
        """
        header, *body = table_data
        lines = [
            cls._format_row(header),
            "| " + " | ".join("---" for _ in header) + " |",
        ]
        lines.extend(cls._format_row(row) for row in body)
        return "\n".join(lines) + "\n"

    @classmethod
    def _tables_section(cls, extracted_tables) -> str:
        """Render the "## Tables" section for one page.

        *extracted_tables* is a list with one entry (a list of rows) per
        detected table. Returns an empty string when no table was detected.
        Tables without data are skipped but still consume their number, so
        "Table N" always refers to the N-th detected table.
        """
        if not extracted_tables:
            return ""
        parts = ["## Tables\n\n"]
        for number, table_data in enumerate(extracted_tables, start=1):
            if table_data:
                parts.append(f"### Table {number}\n\n")
                parts.append(cls._table_to_markdown(table_data))
                parts.append("\n")
        return "".join(parts)

    async def process_pdf(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Extract a PDF's text, page by page, as Markdown.

        Args:
            file_path: Path of the PDF to read.
            enable_formula: Accepted for interface compatibility; this
                implementation does not use it.
            enable_table: When true, tables detected on each page are
                appended after that page's text.

        Returns:
            The Markdown text, or an error message string if processing fails.
        """
        try:
            parts = []
            # The context manager closes the document even when a page fails
            # part-way through.
            with fitz.open(file_path) as doc:
                for page_number, page in enumerate(doc, start=1):
                    parts.append(f"# Page {page_number}\n\n")
                    # Two trailing spaces are what Markdown treats as a hard
                    # line break, preserving the PDF's line structure.
                    parts.append(page.get_text().replace('\n', '  \n') + "\n\n")

                    if enable_table:
                        tables = page.find_tables().tables
                        parts.append(
                            self._tables_section([tab.extract() for tab in tables])
                        )
            return "".join(parts)

        except Exception as e:
            return f"处理PDF时出错: {str(e)}"

    async def process_image(self, file_path: str) -> str:
        """Return a notice for an image file; no OCR is performed.

        Args:
            file_path: Path of the image; only its base name is used.

        Returns:
            A Markdown notice naming the file, or an error message string.
        """
        try:
            return f"# Image File: {os.path.basename(file_path)}\n\n⚠️ 图片OCR功能需要额外配置Tesseract。当前仅支持PDF文件的文本提取。"

        except Exception as e:
            return f"处理图片时出错: {str(e)}"

    async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Process a file and return its Markdown content.

        Files whose extension is ``.pdf`` (case-insensitive) go through
        :meth:`process_pdf`; every other extension is treated as an image.

        Returns:
            The Markdown text, or an error message string if processing fails.
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()

            if file_ext == '.pdf':
                return await self.process_pdf(file_path, enable_formula, enable_table)
            return await self.process_image(file_path)

        except Exception as e:
            return f"处理文件时出错: {str(e)}"
|
|
|
|
| 82 |
):
|
| 83 |
"""处理文档的主函数"""
|
| 84 |
if file is None:
|
| 85 |
+
return "请上传PDF文件", None, gr.DownloadButton("📥 下载Markdown文件", visible=False)
|
| 86 |
|
| 87 |
try:
|
| 88 |
processor = MinerUProcessor()
|
|
|
|
| 90 |
file_ext = os.path.splitext(file_path)[1].lower()
|
| 91 |
|
| 92 |
if file_ext not in processor.supported_formats:
|
| 93 |
+
return f"不支持的文件格式: {file_ext}。支持格式: {', '.join(processor.supported_formats)}", None, gr.DownloadButton("📥 下载Markdown文件", visible=False)
|
| 94 |
|
| 95 |
# 异步处理文件
|
| 96 |
markdown_content = await processor.process_file(file_path, enable_formula, enable_table)
|
|
|
|
| 106 |
return markdown_content, html_preview, gr.DownloadButton("📥 下载Markdown文件", visible=True)
|
| 107 |
|
| 108 |
except Exception as e:
|
| 109 |
+
return f"处理过程中出错: {str(e)}", None, gr.DownloadButton("📥 下载Markdown文件", visible=False)
|
| 110 |
|
| 111 |
def create_download_file(markdown_content: str):
|
| 112 |
"""创建下载文件"""
|
|
|
|
| 121 |
return temp_file.name
|
| 122 |
|
| 123 |
# 创建Gradio界面
|
| 124 |
+
with gr.Blocks(title="MinerU PDF to Markdown", theme=gr.themes.Soft()) as demo:
|
| 125 |
gr.Markdown("""
|
| 126 |
+
# 📄 MinerU PDF to Markdown Converter
|
| 127 |
+
|
| 128 |
+
将PDF文档转换为Markdown格式,支持基本的文本提取和表格识别。
|
| 129 |
|
| 130 |
+
**注意**: 当前版本主要支持PDF文本提取,公式识别需要额外配置。
|
| 131 |
""")
|
| 132 |
|
| 133 |
with gr.Row():
|
| 134 |
with gr.Column(scale=1):
|
| 135 |
file_input = gr.File(
|
| 136 |
+
label="上传PDF文档",
|
| 137 |
+
file_types=[".pdf"],
|
| 138 |
type="filepath"
|
| 139 |
)
|
| 140 |
|
|
|
|
| 142 |
gr.Markdown("### 识别选项")
|
| 143 |
enable_formula = gr.Checkbox(
|
| 144 |
label="Enable formula recognition",
|
| 145 |
+
value=False,
|
| 146 |
+
info="需要额外配置(当前不可用)"
|
| 147 |
)
|
| 148 |
enable_table = gr.Checkbox(
|
| 149 |
label="Enable table recognition",
|
|
|
|
| 161 |
interactive=False
|
| 162 |
)
|
| 163 |
|
| 164 |
+
with gr.Accordion("HTML预览", open=False):
|
| 165 |
+
html_output = gr.HTML()
|
|
|
|
|
|
|
| 166 |
|
| 167 |
download_btn = gr.DownloadButton(
|
| 168 |
"📥 下载Markdown文件",
|
|
|
|
| 182 |
inputs=markdown_output,
|
| 183 |
outputs=download_btn
|
| 184 |
)
|
| 185 |
+
|
| 186 |
+
# 添加说明
|
| 187 |
+
gr.Markdown("""
|
| 188 |
+
## 使用说明
|
| 189 |
+
|
| 190 |
+
1. 上传PDF文件
|
| 191 |
+
2. 选择识别选项
|
| 192 |
+
3. 点击"开始处理"按钮
|
| 193 |
+
4. 查看转换结果并下载Markdown文件
|
| 194 |
+
|
| 195 |
+
## 限制
|
| 196 |
+
|
| 197 |
+
- 当前主要支持PDF文本提取
|
| 198 |
+
- 公式识别需要额外配置OCR服务
|
| 199 |
+
- 表格识别为基本功能
|
| 200 |
+
""")
|
| 201 |
|
| 202 |
if __name__ == "__main__":
|
| 203 |
demo.launch(
|