# MinerU-noGPU / app.py
# AloneDancer's picture
# Update app.py
# 55cb0f8 verified
import gradio as gr
import fitz # PyMuPDF
import os
import tempfile
import aiofiles
from typing import Optional
import markdown2
from pathlib import Path
import io
import base64
class MinerUProcessor:
    """Convert documents to Markdown using PyMuPDF text extraction.

    PDFs are converted page by page (plain text plus optional table
    extraction). Any other file type is routed to ``process_image``, which
    only returns a notice because no OCR backend is wired in.

    The public coroutines report failures by returning a human-readable
    error string instead of raising.
    """

    def __init__(self):
        # Lower-case file extensions (with leading dot) that callers may accept.
        self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']

    @staticmethod
    def _format_cell(cell) -> str:
        """Return one table cell as text that is safe inside a Markdown row."""
        if cell is None:
            # Empty cells are extracted as None; render them blank, not "None".
            return ""
        # A Markdown table row must stay on one line, so collapse whitespace
        # (including embedded newlines) and escape the column separator.
        flattened = " ".join(str(cell).split())
        return flattened.replace("|", "\\|")

    @classmethod
    def _table_to_markdown(cls, table_data) -> str:
        """Render extracted rows as a Markdown table; the first row is the header."""
        lines = []
        for row_idx, row in enumerate(table_data):
            lines.append("| " + " | ".join(cls._format_cell(cell) for cell in row) + " |\n")
            if row_idx == 0:
                lines.append("| " + " | ".join("---" for _ in row) + " |\n")
        return "".join(lines)

    @staticmethod
    def _page_text_to_markdown(page_number: int, text: str) -> str:
        """Render one page's plain text under a ``# Page N`` heading."""
        # Two trailing spaces before a newline form a Markdown hard line
        # break, which keeps the PDF's line structure in rendered output.
        return f"# Page {page_number}\n\n" + text.replace("\n", "  \n") + "\n\n"

    async def process_pdf(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Extract text (and optionally tables) from a PDF as Markdown.

        Args:
            file_path: Path of the PDF to open.
            enable_formula: Accepted for interface compatibility; not used
                by this implementation.
            enable_table: When true, tables found on each page are appended
                below that page's text.

        Returns:
            The Markdown document, or an error message string if anything
            fails while reading the PDF.
        """
        try:
            chunks = []
            # The context manager closes the document even when a page fails.
            with fitz.open(file_path) as doc:
                for page_number, page in enumerate(doc, start=1):
                    chunks.append(self._page_text_to_markdown(page_number, page.get_text()))
                    if not enable_table:
                        continue
                    tables = page.find_tables().tables
                    if not tables:
                        continue
                    chunks.append("## Tables\n\n")
                    for table_number, table in enumerate(tables, start=1):
                        table_data = table.extract()
                        if table_data:
                            chunks.append(f"### Table {table_number}\n\n")
                            chunks.append(self._table_to_markdown(table_data))
                            chunks.append("\n")
            return "".join(chunks)
        except Exception as e:
            # Best-effort boundary: callers expect a string, never an exception.
            return f"处理PDF时出错: {str(e)}"

    async def process_image(self, file_path: str) -> str:
        """Return a Markdown notice for an image file (no OCR is performed).

        Args:
            file_path: Path of the image; only its base name is used.

        Returns:
            A Markdown snippet naming the file and explaining that OCR is
            unavailable, or an error message string on failure.
        """
        try:
            return f"# Image File: {os.path.basename(file_path)}\n\n⚠️ 图片OCR功能需要额外配置Tesseract。当前仅支持PDF文件的文本提取。"
        except Exception as e:
            return f"处理图片时出错: {str(e)}"

    async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Dispatch a file to the PDF or image handler based on its extension.

        Args:
            file_path: Path of the file to convert.
            enable_formula: Forwarded to ``process_pdf``.
            enable_table: Forwarded to ``process_pdf``.

        Returns:
            The handler's Markdown output, or an error message string.
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext == '.pdf':
                return await self.process_pdf(file_path, enable_formula, enable_table)
            # Everything that is not a PDF is treated as an image.
            return await self.process_image(file_path)
        except Exception as e:
            return f"处理文件时出错: {str(e)}"
async def process_document(
    file: Optional[gr.components.File] = None,
    enable_formula: bool = True,
    enable_table: bool = True
):
    """Convert an uploaded document and produce the three UI outputs.

    Args:
        file: The uploaded file as delivered by Gradio. Either a plain path
            string or an object exposing the path as ``.name`` is accepted.
        enable_formula: Forwarded to the processor.
        enable_table: Forwarded to the processor.

    Returns:
        A ``(markdown_text, html_preview, download_button)`` tuple. On
        success the download button is visible and already points at a
        ``<original base name>.md`` file containing the Markdown. On any
        failure the first element is a message, and the button is hidden.
    """
    button_label = "📥 下载Markdown文件"
    # Messages the processor returns (instead of raising) when it fails.
    processor_error_prefixes = ("处理PDF时出错", "处理图片时出错", "处理文件时出错")

    def hidden_button():
        return gr.DownloadButton(button_label, visible=False)

    if file is None:
        return "请上传PDF文件", None, hidden_button()
    try:
        processor = MinerUProcessor()
        # Works for both a path string and a file-like wrapper with .name.
        file_path = getattr(file, "name", file)
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in processor.supported_formats:
            return (
                f"不支持的文件格式: {file_ext}。支持格式: {', '.join(processor.supported_formats)}",
                None,
                hidden_button(),
            )

        markdown_content = await processor.process_file(file_path, enable_formula, enable_table)
        html_preview = markdown2.markdown(markdown_content)

        if markdown_content.startswith(processor_error_prefixes):
            # Show the error text, but do not offer it as a download.
            return markdown_content, html_preview, hidden_button()

        # Write the result into its own temp directory so the downloaded
        # file can carry the original document's base name.
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        download_path = os.path.join(tempfile.mkdtemp(prefix="mineru_"), f"{base_name}.md")
        with open(download_path, "w", encoding="utf-8") as handle:
            handle.write(markdown_content)

        return (
            markdown_content,
            html_preview,
            gr.DownloadButton(button_label, value=download_path, visible=True),
        )
    except Exception as e:
        # Top-level UI boundary: surface the failure as text.
        return f"处理过程中出错: {str(e)}", None, hidden_button()
def create_download_file(markdown_content: str):
    """Write Markdown text to a temporary ``.md`` file for downloading.

    Args:
        markdown_content: The text currently shown in the result box.

    Returns:
        The path of the newly written UTF-8 file, or ``None`` when the text
        is empty or is one of the app's status/error messages rather than a
        conversion result. The file is created with ``delete=False`` and is
        not removed by this function.
    """
    # Every status/error message prefix this app emits. The legacy
    # "请上传文件" prefix is kept for backward compatibility.
    non_downloadable_prefixes = (
        "请上传PDF文件",
        "请上传文件",
        "不支持的文件格式",
        "处理过程中出错",
        "处理PDF时出错",
        "处理图片时出错",
        "处理文件时出错",
    )
    if not markdown_content or markdown_content.startswith(non_downloadable_prefixes):
        return None

    # The context manager closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(
        suffix='.md', delete=False, mode='w', encoding='utf-8'
    ) as temp_file:
        temp_file.write(markdown_content)
    return temp_file.name
# Build the Gradio interface.
with gr.Blocks(title="MinerU PDF to Markdown", theme=gr.themes.Soft()) as demo:
    # Page header shown above the controls.
    gr.Markdown("""
# 📄 MinerU PDF to Markdown Converter
将PDF文档转换为Markdown格式,支持基本的文本提取和表格识别。
**注意**: 当前版本主要支持PDF文本提取,公式识别需要额外配置。
""")
    with gr.Row():
        # Left column: upload widget, recognition options, and the run button.
        with gr.Column(scale=1):
            file_input = gr.File(
                label="上传PDF文档",
                file_types=[".pdf"],
                type="filepath"
            )
            with gr.Group():
                gr.Markdown("### 识别选项")
                # Off by default; the info text marks it as currently unavailable.
                enable_formula = gr.Checkbox(
                    label="Enable formula recognition",
                    value=False,
                    info="需要额外配置(当前不可用)"
                )
                enable_table = gr.Checkbox(
                    label="Enable table recognition",
                    value=True,
                    info="是否启用表格识别"
                )
            process_btn = gr.Button("🚀 开始处理", variant="primary")
        # Right column: Markdown result, collapsible HTML preview, download button.
        with gr.Column(scale=2):
            markdown_output = gr.Code(
                label="转换结果 (Markdown)",
                language="markdown",
                lines=15,
                interactive=False
            )
            with gr.Accordion("HTML预览", open=False):
                html_output = gr.HTML()
            # Created hidden; process_document returns a replacement button
            # whose visibility depends on the outcome.
            download_btn = gr.DownloadButton(
                "📥 下载Markdown文件",
                visible=False
            )
    # Wire up event handlers: run the conversion and fill the three outputs.
    process_btn.click(
        fn=process_document,
        inputs=[file_input, enable_formula, enable_table],
        outputs=[markdown_output, html_output, download_btn]
    )
    # Download handling: write the current Markdown text to a temp file and
    # hand its path back to the download button.
    download_btn.click(
        fn=create_download_file,
        inputs=markdown_output,
        outputs=download_btn
    )
    # Usage notes and known limitations shown below the controls.
    gr.Markdown("""
## 使用说明
1. 上传PDF文件
2. 选择识别选项
3. 点击"开始处理"按钮
4. 查看转换结果并下载Markdown文件
## 限制
- 当前主要支持PDF文本提取
- 公式识别需要额外配置OCR服务
- 表格识别为基本功能
""")

# Start the server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )