# Source: Hugging Face Space "AloneDancer/MinerU-noGPU" (status: Running).
# File size: 7,646 bytes.

import gradio as gr
import fitz  # PyMuPDF
import os
import tempfile
import aiofiles
from typing import Optional
import markdown2
from pathlib import Path
import io
import base64

class MinerUProcessor:
    """Convert uploaded documents to Markdown text.

    PDF files are read with PyMuPDF (``fitz``): the text of every page is
    extracted and, optionally, the tables found on each page are rendered as
    Markdown pipe tables. Image files are accepted but only yield a notice,
    because no OCR backend is wired in.

    All ``process_*`` methods report failures by returning an error message
    string instead of raising.
    """

    def __init__(self):
        # Lower-case file extensions (including the dot) accepted by callers.
        self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']

    @staticmethod
    def _format_cell(cell) -> str:
        """Return one table cell as text that cannot break a Markdown row.

        Empty cells (``None``) become an empty string rather than the text
        "None"; pipes are escaped and line breaks become ``<br>`` so the
        cell stays on a single table row.
        """
        if cell is None:
            return ""
        text = str(cell).replace("|", "\\|")
        return text.replace("\r\n", "<br>").replace("\n", "<br>")

    @classmethod
    def _table_to_markdown(cls, table_data) -> str:
        """Render rows of cells as a Markdown pipe table; row 0 is the header."""
        lines = []
        for row_idx, row in enumerate(table_data):
            lines.append("| " + " | ".join(cls._format_cell(cell) for cell in row) + " |\n")
            if row_idx == 0:
                # The separator row must follow the header for a valid table.
                lines.append("| " + " | ".join("---" for _ in row) + " |\n")
        return "".join(lines)

    async def process_pdf(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Extract the text (and optionally tables) of a PDF as Markdown.

        Args:
            file_path: Path of the PDF to open.
            enable_formula: Accepted for interface compatibility; not used by
                this implementation.
            enable_table: When true, tables detected on each page are appended
                under a "## Tables" heading.

        Returns:
            The Markdown text, or an error message starting with
            "处理PDF时出错" if anything fails.
        """
        try:
            parts = []
            # The context manager closes the document even when a page fails,
            # so the file handle is never leaked on the error path.
            with fitz.open(file_path) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text()

                    parts.append(f"# Page {page_num}\n\n")
                    # Two trailing spaces force a Markdown hard line break.
                    parts.append(text.replace('\n', '  \n') + "\n\n")

                    if not enable_table:
                        continue

                    tables = page.find_tables().tables
                    if not tables:
                        continue

                    parts.append("## Tables\n\n")
                    for table_num, tab in enumerate(tables, start=1):
                        table_data = tab.extract()
                        if table_data:
                            parts.append(f"### Table {table_num}\n\n")
                            parts.append(self._table_to_markdown(table_data))
                            parts.append("\n")

            return "".join(parts)

        except Exception as e:
            return f"处理PDF时出错: {str(e)}"

    async def process_image(self, file_path: str) -> str:
        """Return a Markdown notice for an image file; no OCR is performed.

        Args:
            file_path: Path of the image; only its base name is used.

        Returns:
            A notice naming the file, or an error message starting with
            "处理图片时出错" if building the notice fails.
        """
        try:
            return f"# Image File: {os.path.basename(file_path)}\n\n⚠️ 图片OCR功能需要额外配置Tesseract。当前仅支持PDF文件的文本提取。"

        except Exception as e:
            return f"处理图片时出错: {str(e)}"

    async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Dispatch a file to the PDF or image handler based on its extension.

        Args:
            file_path: Path of the file to process.
            enable_formula: Forwarded to ``process_pdf``.
            enable_table: Forwarded to ``process_pdf``.

        Returns:
            The handler's Markdown output, or an error message starting with
            "处理文件时出错" if dispatching fails.
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()

            # Anything that is not a PDF is treated as an image.
            if file_ext == '.pdf':
                return await self.process_pdf(file_path, enable_formula, enable_table)
            return await self.process_image(file_path)

        except Exception as e:
            return f"处理文件时出错: {str(e)}"

async def process_document(
    file: Optional[str] = None,
    enable_formula: bool = True,
    enable_table: bool = True
):
    """Handle a click on the process button: convert the upload to Markdown.

    Args:
        file: The uploaded file. The file input is configured with
            ``type="filepath"``, so this is normally a path string; an object
            exposing the path as ``.name`` is accepted as well.
        enable_formula: Forwarded to the processor.
        enable_table: Whether tables should be extracted from PDFs.

    Returns:
        A 3-tuple ``(markdown_text, html_preview, download_button)``. On any
        failure the first item is a user-facing message, the preview is
        ``None`` and the download button is hidden.
    """
    def hidden_button():
        # Every failure path hides the download button the same way.
        return gr.DownloadButton("📥 下载Markdown文件", visible=False)

    if file is None:
        return "请上传PDF文件", None, hidden_button()

    try:
        processor = MinerUProcessor()

        # A path string has no `.name`; reading it unconditionally raised
        # AttributeError for every upload.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext not in processor.supported_formats:
            return f"不支持的文件格式: {file_ext}。支持格式: {', '.join(processor.supported_formats)}", None, hidden_button()

        markdown_content = await processor.process_file(file_path, enable_formula, enable_table)

        # Render the Markdown to HTML for the preview pane.
        html_preview = markdown2.markdown(markdown_content)

        return markdown_content, html_preview, gr.DownloadButton("📥 下载Markdown文件", visible=True)

    except Exception as e:
        return f"处理过程中出错: {str(e)}", None, hidden_button()

def create_download_file(markdown_content: str):
    """Write the Markdown text to a temporary ``.md`` file for download.

    Args:
        markdown_content: Text currently shown in the Markdown output box.

    Returns:
        Path of the newly written UTF-8 file, or ``None`` when there is
        nothing worth downloading (empty text, or one of the app's own
        status/error messages). The file is not deleted automatically.
    """
    # Prefixes of the messages the app shows instead of a conversion result.
    # "请上传文件" is kept for backward compatibility; the prompt actually
    # displayed by the app is "请上传PDF文件".
    non_downloadable_prefixes = (
        "处理过程中出错",
        "请上传PDF文件",
        "请上传文件",
        "不支持的文件格式",
    )
    if not markdown_content or markdown_content.startswith(non_downloadable_prefixes):
        return None

    # delete=False keeps the file on disk after closing so it can be served;
    # the with-block guarantees the handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile(suffix='.md', delete=False, mode='w', encoding='utf-8') as temp_file:
        temp_file.write(markdown_content)

    return temp_file.name

# Build the Gradio interface. Components are created inside nested layout
# context managers, so the order of the statements below defines the layout.
with gr.Blocks(title="MinerU PDF to Markdown", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 📄 MinerU PDF to Markdown Converter
    
    将PDF文档转换为Markdown格式，支持基本的文本提取和表格识别。
    
    **注意**: 当前版本主要支持PDF文本提取，公式识别需要额外配置。
    """)
    
    with gr.Row():
        # Left column: file upload, recognition options and the process button.
        with gr.Column(scale=1):
            # The picker is limited to .pdf and is configured with
            # type="filepath".
            file_input = gr.File(
                label="上传PDF文档",
                file_types=[".pdf"],
                type="filepath"
            )
            
            with gr.Group():
                gr.Markdown("### 识别选项")
                # Off by default; the info text marks it as currently unavailable.
                enable_formula = gr.Checkbox(
                    label="Enable formula recognition", 
                    value=False,
                    info="需要额外配置（当前不可用）"
                )
                # On by default.
                enable_table = gr.Checkbox(
                    label="Enable table recognition", 
                    value=True,
                    info="是否启用表格识别"
                )
            
            process_btn = gr.Button("🚀 开始处理", variant="primary")
            
        # Right column: read-only Markdown result, collapsible HTML preview
        # and the download button.
        with gr.Column(scale=2):
            markdown_output = gr.Code(
                label="转换结果 (Markdown)",
                language="markdown",
                lines=15,
                interactive=False
            )
            
            with gr.Accordion("HTML预览", open=False):
                html_output = gr.HTML()
            
            # Starts hidden; process_document returns a button update that
            # controls its visibility.
            download_btn = gr.DownloadButton(
                "📥 下载Markdown文件",
                visible=False
            )
    
    # Event wiring: clicking the process button runs process_document with the
    # file and both checkboxes; its three return values fill the Markdown box,
    # the HTML preview and the download button.
    process_btn.click(
        fn=process_document,
        inputs=[file_input, enable_formula, enable_table],
        outputs=[markdown_output, html_output, download_btn]
    )
    
    # Download handling: clicking the download button passes the current
    # Markdown text to create_download_file and feeds the result back into
    # the button.
    download_btn.click(
        fn=create_download_file,
        inputs=markdown_output,
        outputs=download_btn
    )
    
    # Usage notes and known limitations shown at the bottom of the page.
    gr.Markdown("""
    ## 使用说明
    
    1. 上传PDF文件
    2. 选择识别选项
    3. 点击"开始处理"按钮
    4. 查看转换结果并下载Markdown文件
    
    ## 限制
    
    - 当前主要支持PDF文本提取
    - 公式识别需要额外配置OCR服务
    - 表格识别为基本功能
    """)

if __name__ == "__main__":
    # Launch settings when run as a script: bind to 0.0.0.0 on port 7860
    # with sharing turned off.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)