Spaces:
Running
Running
File size: 7,646 Bytes
1c1c960 55cb0f8 1c1c960 55cb0f8 1c1c960 55cb0f8 1c1c960 55cb0f8 1c1c960 55cb0f8 1c1c960 55cb0f8 1c1c960 09bf09a 1c1c960 09bf09a 1c1c960 55cb0f8 1c1c960 09bf09a 1c1c960 55cb0f8 1c1c960 55cb0f8 1c1c960 55cb0f8 1c1c960 55cb0f8 1c1c960 55cb0f8 1c1c960 09bf09a 55cb0f8 09bf09a 1c1c960 09bf09a 1c1c960 55cb0f8 1c1c960 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
import gradio as gr
import fitz # PyMuPDF
import os
import tempfile
import aiofiles
from typing import Optional
import markdown2
from pathlib import Path
import io
import base64
class MinerUProcessor:
    """Convert uploaded documents to Markdown text.

    PDF files are read with PyMuPDF (``fitz``). Image files are accepted but
    only yield a placeholder notice, because no OCR step is implemented here.
    Every ``process_*`` coroutine reports a failure by returning a message
    string rather than raising.
    """

    def __init__(self):
        # Lower-case file extensions the application accepts.
        self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']

    @staticmethod
    def _format_cell(cell) -> str:
        """Render one table cell as text that cannot break a Markdown row."""
        if cell is None:
            # An empty cell should stay empty instead of showing "None".
            return ""
        # A raw pipe would start a new column and a newline would end the row.
        return str(cell).replace("|", "\\|").replace("\n", " ")

    @staticmethod
    def _table_to_markdown(table_data) -> str:
        """Format extracted table rows as a Markdown pipe table.

        Args:
            table_data: Sequence of rows, each a sequence of cell values.
                The first row is used as the header.

        Returns:
            The table text, one line per row, with a ``---`` separator line
            after the header. An empty string when there are no rows.
        """
        lines = []
        for row_idx, row in enumerate(table_data):
            cells = " | ".join(MinerUProcessor._format_cell(cell) for cell in row)
            lines.append(f"| {cells} |\n")
            if row_idx == 0:
                # Header row: follow it with the column separator line.
                lines.append("| " + " | ".join("---" for _ in row) + " |\n")
        return "".join(lines)

    async def process_pdf(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Extract the text (and optionally the tables) of a PDF as Markdown.

        Args:
            file_path: Path of the PDF file to open.
            enable_formula: Accepted for interface compatibility; not used by
                this implementation.
            enable_table: When true, tables found on each page are appended
                below that page's text.

        Returns:
            The Markdown text, with one ``# Page N`` section per page, or a
            message starting with ``处理PDF时出错`` if anything fails.
        """
        try:
            doc = fitz.open(file_path)
            try:
                parts = []
                for page_number, page in enumerate(doc, start=1):
                    text = page.get_text()
                    parts.append(f"# Page {page_number}\n\n")
                    # Two trailing spaces make a Markdown hard line break,
                    # which keeps the line structure of the extracted text.
                    parts.append(text.replace('\n', '  \n') + "\n\n")

                    if not enable_table:
                        continue
                    tables = page.find_tables().tables
                    if not tables:
                        continue
                    parts.append("## Tables\n\n")
                    for table_number, table in enumerate(tables, start=1):
                        table_data = table.extract()
                        if table_data:
                            parts.append(f"### Table {table_number}\n\n")
                            parts.append(self._table_to_markdown(table_data))
                            parts.append("\n")
                return "".join(parts)
            finally:
                # Release the document even when extraction fails part-way.
                doc.close()
        except Exception as e:
            return f"处理PDF时出错: {str(e)}"

    async def process_image(self, file_path: str) -> str:
        """Return a Markdown notice for an image file.

        No OCR is performed; the result only names the file and states that
        image OCR needs additional configuration.

        Args:
            file_path: Path of the image file; only its base name is used.

        Returns:
            The notice text, or a message starting with ``处理图片时出错`` if
            building it fails.
        """
        try:
            return f"# Image File: {os.path.basename(file_path)}\n\n⚠️ 图片OCR功能需要额外配置Tesseract。当前仅支持PDF文件的文本提取。"
        except Exception as e:
            return f"处理图片时出错: {str(e)}"

    async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Dispatch a file to the matching handler and return its Markdown.

        A ``.pdf`` extension (case-insensitive) goes to ``process_pdf``;
        every other extension goes to ``process_image``.

        Args:
            file_path: Path of the file to process.
            enable_formula: Forwarded to ``process_pdf``.
            enable_table: Forwarded to ``process_pdf``.

        Returns:
            The handler's result, or a message starting with
            ``处理文件时出错`` if dispatching fails.
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext == '.pdf':
                return await self.process_pdf(file_path, enable_formula, enable_table)
            return await self.process_image(file_path)
        except Exception as e:
            return f"处理文件时出错: {str(e)}"
async def process_document(
    file: Optional[gr.components.File] = None,
    enable_formula: bool = True,
    enable_table: bool = True
):
    """Convert the uploaded file and build the three UI outputs.

    Args:
        file: The uploaded file. Either a path string or an object whose
            ``name`` attribute holds the path; ``None`` when nothing was
            uploaded.
        enable_formula: Forwarded to the processor.
        enable_table: Forwarded to the processor.

    Returns:
        A 3-tuple ``(markdown_text, html_preview, download_button)``.
        On success the button is visible and carries a ``.md`` file named
        after the upload. On any failure the first item is the message to
        display, the preview is ``None`` and the button is hidden.
    """
    button_label = "📥 下载Markdown文件"
    # Prefixes of the failure messages the processor returns instead of raising.
    processor_error_prefixes = ("处理PDF时出错", "处理图片时出错", "处理文件时出错")

    if file is None:
        return "请上传PDF文件", None, gr.DownloadButton(button_label, visible=False)

    try:
        processor = MinerUProcessor()
        # Accept both a plain path string and a file wrapper exposing `.name`.
        file_path = getattr(file, "name", file)
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext not in processor.supported_formats:
            return f"不支持的文件格式: {file_ext}。支持格式: {', '.join(processor.supported_formats)}", None, gr.DownloadButton(button_label, visible=False)

        markdown_content = await processor.process_file(file_path, enable_formula, enable_table)

        if markdown_content.startswith(processor_error_prefixes):
            # A failure message is not a document: no preview, no download.
            return markdown_content, None, gr.DownloadButton(button_label, visible=False)

        # The text comes from an uploaded document, so raw HTML is escaped;
        # the "tables" extra renders the pipe tables the processor emits.
        html_preview = markdown2.markdown(
            markdown_content,
            extras=["tables"],
            safe_mode="escape",
        )

        # Write the result under the upload's own base name so the button
        # already holds a file when the user first clicks it.
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        download_filename = f"{base_name}.md"
        download_path = os.path.join(tempfile.mkdtemp(), download_filename)
        with open(download_path, "w", encoding="utf-8") as handle:
            handle.write(markdown_content)

        return markdown_content, html_preview, gr.DownloadButton(button_label, value=download_path, visible=True)
    except Exception as e:
        return f"处理过程中出错: {str(e)}", None, gr.DownloadButton(button_label, visible=False)
def create_download_file(markdown_content: str):
    """Write converted Markdown to a temporary ``.md`` file for download.

    Args:
        markdown_content: Text currently shown in the result box.

    Returns:
        The path of the newly written UTF-8 file, or ``None`` when the text
        is empty or is one of the application's prompt/error messages rather
        than a converted document. The file is not deleted automatically.
    """
    # Every message the app shows in place of real output starts with one of
    # these. "请上传文件" is kept so text from the earlier wording still matches.
    non_document_prefixes = (
        "处理过程中出错",
        "处理PDF时出错",
        "处理图片时出错",
        "处理文件时出错",
        "不支持的文件格式",
        "请上传PDF文件",
        "请上传文件",
    )
    if not markdown_content or markdown_content.startswith(non_document_prefixes):
        return None

    # delete=False: the file must outlive this function so it can be served.
    with tempfile.NamedTemporaryFile(suffix='.md', delete=False, mode='w', encoding='utf-8') as temp_file:
        temp_file.write(markdown_content)
    return temp_file.name
# Build the Gradio interface.
# NOTE(review): the nesting below was reconstructed from a copy of the file
# whose indentation had been lost — confirm the layout against the original.
with gr.Blocks(title="MinerU PDF to Markdown", theme=gr.themes.Soft()) as demo:
    # Page header shown above the controls.
    gr.Markdown("""
# 📄 MinerU PDF to Markdown Converter
将PDF文档转换为Markdown格式,支持基本的文本提取和表格识别。
**注意**: 当前版本主要支持PDF文本提取,公式识别需要额外配置。
""")
    with gr.Row():
        # Left column: upload widget, recognition options and the run button.
        with gr.Column(scale=1):
            # The file picker is restricted to ".pdf" and hands the handler a file path.
            file_input = gr.File(
                label="上传PDF文档",
                file_types=[".pdf"],
                type="filepath"
            )
            with gr.Group():
                gr.Markdown("### 识别选项")
                # Off by default; the info text marks it as currently unavailable.
                enable_formula = gr.Checkbox(
                    label="Enable formula recognition",
                    value=False,
                    info="需要额外配置(当前不可用)"
                )
                enable_table = gr.Checkbox(
                    label="Enable table recognition",
                    value=True,
                    info="是否启用表格识别"
                )
            process_btn = gr.Button("🚀 开始处理", variant="primary")
        # Right column: read-only Markdown result, collapsible HTML preview, download button.
        with gr.Column(scale=2):
            markdown_output = gr.Code(
                label="转换结果 (Markdown)",
                language="markdown",
                lines=15,
                interactive=False
            )
            with gr.Accordion("HTML预览", open=False):
                html_output = gr.HTML()
            # Created hidden; process_document returns an updated button for this slot.
            download_btn = gr.DownloadButton(
                "📥 下载Markdown文件",
                visible=False
            )
    # Event wiring: the run button converts the upload and fills all three outputs.
    process_btn.click(
        fn=process_document,
        inputs=[file_input, enable_formula, enable_table],
        outputs=[markdown_output, html_output, download_btn]
    )
    # Download handling: clicking the button writes the current result text to
    # a file and feeds the returned path back into the button.
    download_btn.click(
        fn=create_download_file,
        inputs=markdown_output,
        outputs=download_btn
    )
    # Usage instructions and known limitations shown below the controls.
    gr.Markdown("""
## 使用说明
1. 上传PDF文件
2. 选择识别选项
3. 点击"开始处理"按钮
4. 查看转换结果并下载Markdown文件
## 限制
- 当前主要支持PDF文本提取
- 公式识别需要额外配置OCR服务
- 表格识别为基本功能
""")
if __name__ == "__main__":
    # Launch settings used when the file is run as a script:
    # host 0.0.0.0, port 7860, sharing disabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)