"""Gradio web app that converts an uploaded PDF into Markdown using PyMuPDF.

The module exposes:

* ``MinerUProcessor`` - extracts page text (and optionally tables) from a PDF
  and renders it as Markdown; image inputs only yield an informational note.
* ``process_document`` - the Gradio event handler behind the "process" button.
* ``create_download_file`` - writes Markdown to a temporary ``.md`` file.
* ``demo`` - the ``gr.Blocks`` application, launched when run as a script.
"""

import base64
import io
import os
import tempfile
from pathlib import Path
from typing import Any, Optional

import aiofiles
import fitz  # PyMuPDF
import gradio as gr
import markdown2

# Label shared by every state of the download button.
DOWNLOAD_LABEL = "📥 下载Markdown文件"

# Message shown when the handler is invoked without an uploaded file.
UPLOAD_PROMPT = "请上传PDF文件"

# Prefixes of every status/error message this module can emit in place of
# real Markdown.  Content starting with one of these is never offered as a
# download.  "请上传文件" is kept for backward compatibility with the guard
# the original code used.
_ERROR_PREFIXES = (
    UPLOAD_PROMPT,
    "请上传文件",
    "不支持的文件格式",
    "处理PDF时出错",
    "处理图片时出错",
    "处理文件时出错",
    "处理过程中出错",
)


def _format_table_cell(cell: Any) -> str:
    """Render one table cell as text that is safe inside a Markdown table row."""
    if cell is None:
        # The table extractor may yield None for empty cells; show them as blank.
        return ""
    # A raw pipe or newline would split the row, so neutralise both.
    return str(cell).replace("|", "\\|").replace("\n", " ")


def _format_table_row(row) -> str:
    """Render a sequence of cells as a single Markdown table row."""
    return "| " + " | ".join(_format_table_cell(cell) for cell in row) + " |\n"


def _table_to_markdown(rows) -> str:
    """Render extracted table rows as a Markdown table; the first row is the header."""
    if not rows:
        return ""
    header, *body = rows
    lines = [
        _format_table_row(header),
        "| " + " | ".join("---" for _ in header) + " |\n",
    ]
    lines.extend(_format_table_row(row) for row in body)
    return "".join(lines)


def _resolve_upload_path(file: Any) -> Optional[str]:
    """Return the filesystem path of an upload, or None when nothing was uploaded.

    ``gr.File(type="filepath")`` hands the handler a plain ``str``; file-like
    wrapper objects expose the path as ``.name``.  Both shapes are accepted.
    """
    if not file:
        return None
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return getattr(file, "name", None)


def _hidden_download_button():
    """Return a download-button update that hides the button."""
    return gr.DownloadButton(DOWNLOAD_LABEL, visible=False)


class MinerUProcessor:
    """Convert supported documents to Markdown text."""

    def __init__(self):
        # File extensions (lower-case, with leading dot) accepted by the app.
        self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']

    @staticmethod
    def _tables_markdown(page) -> str:
        """Return the Markdown "Tables" section for one page, or "" if it has none."""
        found = page.find_tables()
        if not found.tables:
            return ""
        parts = ["## Tables\n\n"]
        for index, table in enumerate(found.tables, start=1):
            rows = table.extract()
            if rows:
                parts.append(f"### Table {index}\n\n")
                parts.append(_table_to_markdown(rows))
                parts.append("\n")
        return "".join(parts)

    async def process_pdf(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Extract the text of every page of a PDF and return it as Markdown.

        Args:
            file_path: Path of the PDF file to open.
            enable_formula: Accepted for interface compatibility; not used by
                this implementation.
            enable_table: When true, tables detected on each page are appended
                to that page's section as Markdown tables.

        Returns:
            The Markdown text, or a message starting with "处理PDF时出错" if
            anything went wrong (errors are reported, not raised).
        """
        try:
            parts = []
            # The context manager closes the document even if a page fails.
            with fitz.open(file_path) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text()
                    parts.append(f"# Page {page_num}\n\n")
                    # Two trailing spaces make a Markdown hard line break, so
                    # the PDF's own line structure survives rendering.
                    parts.append(text.replace('\n', '  \n') + "\n\n")
                    if enable_table:
                        parts.append(self._tables_markdown(page))
            return "".join(parts)
        except Exception as e:
            return f"处理PDF时出错: {str(e)}"

    async def process_image(self, file_path: str) -> str:
        """Return a Markdown note explaining that image OCR is not available.

        Args:
            file_path: Path of the image file; only its base name is used.

        Returns:
            The informational Markdown text, or a message starting with
            "处理图片时出错" on failure.
        """
        try:
            return f"# Image File: {os.path.basename(file_path)}\n\n⚠️ 图片OCR功能需要额外配置Tesseract。当前仅支持PDF文件的文本提取。"
        except Exception as e:
            return f"处理图片时出错: {str(e)}"

    async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Dispatch a file to the PDF or image handler based on its extension.

        Args:
            file_path: Path of the file to convert.
            enable_formula: Forwarded to ``process_pdf``.
            enable_table: Forwarded to ``process_pdf``.

        Returns:
            Markdown text, or a message starting with "处理文件时出错" on failure.
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext == '.pdf':
                return await self.process_pdf(file_path, enable_formula, enable_table)
            return await self.process_image(file_path)
        except Exception as e:
            return f"处理文件时出错: {str(e)}"


async def process_document(
    file: Optional[Any] = None,
    enable_formula: bool = True,
    enable_table: bool = True
):
    """Gradio handler: convert the uploaded file and prepare its download.

    Args:
        file: The uploaded file, either a path string (what
            ``gr.File(type="filepath")`` delivers) or an object exposing the
            path as ``.name``; ``None`` when nothing was uploaded.
        enable_formula: Forwarded to the processor.
        enable_table: Forwarded to the processor.

    Returns:
        A 3-tuple ``(markdown_text, html_preview, download_button_update)``.
        On any failure the first element is a human-readable message, the
        preview is ``None`` and the download button is hidden.
    """
    file_path = _resolve_upload_path(file)
    if file_path is None:
        return UPLOAD_PROMPT, None, _hidden_download_button()

    try:
        processor = MinerUProcessor()
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext not in processor.supported_formats:
            return f"不支持的文件格式: {file_ext}。支持格式: {', '.join(processor.supported_formats)}", None, _hidden_download_button()

        markdown_content = await processor.process_file(file_path, enable_formula, enable_table)

        # Name the download after the uploaded document.
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        download_path = create_download_file(markdown_content, f"{base_name}.md")
        if download_path is None:
            # The processor reported an error (or produced nothing): show the
            # message but do not offer it as a Markdown download.
            return markdown_content, None, _hidden_download_button()

        # "tables" renders the generated pipe tables; safe_mode escapes any
        # raw HTML that came out of the (untrusted) document text.
        html_preview = markdown2.markdown(
            markdown_content, extras=["tables"], safe_mode="escape"
        )

        # Supplying the file as the button's value makes a single click
        # download it; no separate click handler is needed.
        return markdown_content, html_preview, gr.DownloadButton(
            DOWNLOAD_LABEL, value=download_path, visible=True
        )

    except Exception as e:
        return f"处理过程中出错: {str(e)}", None, _hidden_download_button()


def create_download_file(markdown_content: str, filename: Optional[str] = None):
    """Write Markdown text to a temporary file and return its path.

    Args:
        markdown_content: The text to save.
        filename: Optional file name for the download.  Only its base name is
            used, so it cannot point outside the fresh temporary directory
            the file is written into.  When omitted, a randomly named
            ``.md`` temporary file is created.

    Returns:
        The path of the written file, or ``None`` when ``markdown_content`` is
        empty or is one of this module's status/error messages.  The file is
        not deleted automatically.
    """
    if not markdown_content or markdown_content.startswith(_ERROR_PREFIXES):
        return None

    safe_name = os.path.basename(filename) if filename else ""
    if safe_name:
        target = os.path.join(tempfile.mkdtemp(prefix="mineru_"), safe_name)
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(markdown_content)
        return target

    with tempfile.NamedTemporaryFile(
        suffix='.md', delete=False, mode='w', encoding='utf-8'
    ) as temp_file:
        temp_file.write(markdown_content)
    return temp_file.name


# Build the Gradio interface.
with gr.Blocks(title="MinerU PDF to Markdown", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 📄 MinerU PDF to Markdown Converter

    将PDF文档转换为Markdown格式,支持基本的文本提取和表格识别。

    **注意**: 当前版本主要支持PDF文本提取,公式识别需要额外配置。
    """)

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="上传PDF文档",
                file_types=[".pdf"],
                type="filepath"
            )

            with gr.Group():
                gr.Markdown("### 识别选项")
                enable_formula = gr.Checkbox(
                    label="Enable formula recognition",
                    value=False,
                    info="需要额外配置(当前不可用)"
                )
                enable_table = gr.Checkbox(
                    label="Enable table recognition",
                    value=True,
                    info="是否启用表格识别"
                )

            process_btn = gr.Button("🚀 开始处理", variant="primary")

        with gr.Column(scale=2):
            markdown_output = gr.Code(
                label="转换结果 (Markdown)",
                language="markdown",
                lines=15,
                interactive=False
            )

            with gr.Accordion("HTML预览", open=False):
                html_output = gr.HTML()

            download_btn = gr.DownloadButton(
                DOWNLOAD_LABEL,
                visible=False
            )

    # Processing fills the Markdown view, the HTML preview and the download
    # button (including the file it serves) in one step.
    process_btn.click(
        fn=process_document,
        inputs=[file_input, enable_formula, enable_table],
        outputs=[markdown_output, html_output, download_btn]
    )

    # Usage notes.
    gr.Markdown("""
    ## 使用说明
    1. 上传PDF文件
    2. 选择识别选项
    3. 点击"开始处理"按钮
    4. 查看转换结果并下载Markdown文件

    ## 限制
    - 当前主要支持PDF文本提取
    - 公式识别需要额外配置OCR服务
    - 表格识别为基本功能
    """)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )