# Hugging Face Spaces page header (status: Build error)
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import os | |
| import tempfile | |
| import aiofiles | |
| from typing import Optional | |
| import markdown2 | |
| from pathlib import Path | |
| import io | |
| import base64 | |
class MinerUProcessor:
    """Convert uploaded documents into Markdown text.

    PDF files are read with PyMuPDF (``fitz``): the text of every page is
    extracted and, optionally, the tables PyMuPDF detects are rendered as
    Markdown tables. Image files are accepted but only produce a notice,
    because no OCR backend is wired in.
    """

    def __init__(self):
        # Lower-case file extensions (with leading dot) the app accepts.
        self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']

    @staticmethod
    def _format_cell(cell) -> str:
        """Render one table cell as text that is safe inside a Markdown row."""
        # Empty cells are extracted as None; show them as blank instead of
        # the literal text "None".
        if cell is None:
            return ""
        # A raw "|" or a newline inside a cell would split the Markdown row.
        return str(cell).replace("|", "\\|").replace("\n", " ").strip()

    @classmethod
    def _table_to_markdown(cls, rows) -> str:
        """Render extracted rows as a Markdown table; the first row is the header."""
        lines = []
        for row_idx, row in enumerate(rows):
            lines.append("| " + " | ".join(cls._format_cell(cell) for cell in row) + " |")
            if row_idx == 0:
                # Header separator required by Markdown table syntax.
                lines.append("| " + " | ".join("---" for _ in row) + " |")
        return "\n".join(lines) + "\n"

    def _tables_section(self, page) -> str:
        """Return the "## Tables" Markdown section for one page ("" if no tables)."""
        tables = page.find_tables().tables
        if not tables:
            return ""
        parts = ["## Tables\n\n"]
        for i, tab in enumerate(tables, start=1):
            table_data = tab.extract()
            if table_data:
                parts.append(f"### Table {i}\n\n")
                parts.append(self._table_to_markdown(table_data))
                parts.append("\n")
        return "".join(parts)

    async def process_pdf(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Extract a PDF's text (and optionally its tables) as Markdown.

        Args:
            file_path: Path of the PDF file to open.
            enable_formula: Accepted for interface compatibility; formula
                recognition is not implemented, so the flag is unused.
            enable_table: When true, tables detected on each page are
                appended as Markdown tables.

        Returns:
            The Markdown text, or a message starting with "处理PDF时出错"
            if anything goes wrong (errors are reported, not raised).
        """
        try:
            parts = []
            # The context manager closes the document even when a page
            # fails to parse part-way through.
            with fitz.open(file_path) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text()
                    parts.append(f"# Page {page_num}\n\n")
                    # Two trailing spaces are Markdown's hard line break, so
                    # the PDF's own line structure survives rendering.
                    parts.append(text.replace('\n', '  \n') + "\n\n")
                    if enable_table:
                        parts.append(self._tables_section(page))
            return "".join(parts)
        except Exception as e:
            return f"处理PDF时出错: {str(e)}"

    async def process_image(self, file_path: str) -> str:
        """Return a Markdown notice for an image file; no OCR is performed."""
        try:
            # OCR would need Tesseract, which is not configured here.
            return f"# Image File: {os.path.basename(file_path)}\n\n⚠️ 图片OCR功能需要额外配置Tesseract。当前仅支持PDF文件的文本提取。"
        except Exception as e:
            return f"处理图片时出错: {str(e)}"

    async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
        """Dispatch on the file extension and return the Markdown result.

        ``.pdf`` (case-insensitive) goes to :meth:`process_pdf`; every other
        extension is treated as an image. Errors are returned as a message
        starting with "处理文件时出错" rather than raised.
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext == '.pdf':
                return await self.process_pdf(file_path, enable_formula, enable_table)
            return await self.process_image(file_path)
        except Exception as e:
            return f"处理文件时出错: {str(e)}"
async def process_document(
    file: Optional[str] = None,
    enable_formula: bool = True,
    enable_table: bool = True
):
    """Gradio click handler: convert the uploaded file to Markdown.

    Args:
        file: The uploaded file. ``gr.File(type="filepath")`` passes a plain
            path string; objects exposing the path as ``.name`` are accepted
            as well. ``None`` means nothing was uploaded.
        enable_formula: Forwarded to the processor (currently unused there).
        enable_table: Whether detected tables are rendered as Markdown.

    Returns:
        A 3-tuple ``(text, html, button)``: the Markdown text (or a
        user-facing status message), the HTML preview (``None`` on failure)
        and a ``gr.DownloadButton`` update. On success the button is visible
        and already points at a ``<original name>.md`` file.
    """
    button_label = "📥 下载Markdown文件"

    if file is None:
        return "请上传PDF文件", None, gr.DownloadButton(button_label, visible=False)

    try:
        processor = MinerUProcessor()

        # With type="filepath" the component delivers a str; calling
        # ``.name`` on it would raise AttributeError.
        file_path = file if isinstance(file, str) else file.name
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext not in processor.supported_formats:
            message = f"不支持的文件格式: {file_ext}。支持格式: {', '.join(processor.supported_formats)}"
            return message, None, gr.DownloadButton(button_label, visible=False)

        markdown_content = await processor.process_file(file_path, enable_formula, enable_table)

        # The "tables" extra is required for markdown2 to turn pipe tables
        # into <table> markup in the preview.
        html_preview = markdown2.markdown(markdown_content, extras=["tables"])

        # Write the result next to a fresh temp directory so the download
        # carries the original document's name. The file is intentionally
        # left on disk for the web server to serve.
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        download_filename = f"{base_name}.md"
        download_path = os.path.join(tempfile.mkdtemp(), download_filename)
        with open(download_path, "w", encoding="utf-8") as md_file:
            md_file.write(markdown_content)

        # Giving the button its value now means the very first click
        # already has a file to download.
        return (
            markdown_content,
            html_preview,
            gr.DownloadButton(button_label, value=download_path, visible=True),
        )
    except Exception as e:
        return f"处理过程中出错: {str(e)}", None, gr.DownloadButton(button_label, visible=False)
def create_download_file(markdown_content: str):
    """Write the Markdown text to a temporary ``.md`` file for download.

    Args:
        markdown_content: Text currently shown in the result box.

    Returns:
        The path of the newly written UTF-8 file, or ``None`` when the text
        is empty or is one of the app's status/error messages rather than a
        conversion result. The file is not deleted automatically because it
        must outlive this call to be served.
    """
    # Prefixes of the status messages the UI shows instead of a conversion
    # result; these must never be offered as a download.
    status_prefixes = (
        "处理过程中出错",
        "请上传文件",
        "请上传PDF文件",
        "不支持的文件格式",
    )
    if not markdown_content or markdown_content.startswith(status_prefixes):
        return None

    # delete=False keeps the file after closing; the context manager still
    # guarantees the handle is closed if the write fails.
    with tempfile.NamedTemporaryFile(
        suffix='.md', delete=False, mode='w', encoding='utf-8'
    ) as temp_file:
        temp_file.write(markdown_content)
    return temp_file.name
# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="MinerU PDF to Markdown") as demo:
    # Page title and short description.
    gr.Markdown(
        value="""
    # 📄 MinerU PDF to Markdown Converter
    将PDF文档转换为Markdown格式,支持基本的文本提取和表格识别。
    **注意**: 当前版本主要支持PDF文本提取,公式识别需要额外配置。
    """
    )

    with gr.Row():
        # Left column: upload widget, recognition options and the run button.
        with gr.Column(scale=1):
            file_input = gr.File(
                type="filepath",
                file_types=[".pdf"],
                label="上传PDF文档",
            )

            with gr.Group():
                gr.Markdown(value="### 识别选项")
                enable_formula = gr.Checkbox(
                    value=False,
                    label="Enable formula recognition",
                    info="需要额外配置(当前不可用)",
                )
                enable_table = gr.Checkbox(
                    value=True,
                    label="Enable table recognition",
                    info="是否启用表格识别",
                )

            process_btn = gr.Button(value="🚀 开始处理", variant="primary")

        # Right column: Markdown result, collapsible HTML preview, download.
        with gr.Column(scale=2):
            markdown_output = gr.Code(
                language="markdown",
                label="转换结果 (Markdown)",
                interactive=False,
                lines=15,
            )

            with gr.Accordion(label="HTML预览", open=False):
                html_output = gr.HTML()

            # Hidden until a conversion has produced something to download.
            download_btn = gr.DownloadButton(
                label="📥 下载Markdown文件",
                visible=False,
            )

    # Event wiring: run the conversion when the button is pressed.
    process_btn.click(
        fn=process_document,
        inputs=[file_input, enable_formula, enable_table],
        outputs=[markdown_output, html_output, download_btn],
    )

    # Download handling: turn the shown Markdown into a file for the button.
    download_btn.click(
        fn=create_download_file,
        inputs=[markdown_output],
        outputs=[download_btn],
    )

    # Usage notes and known limitations.
    gr.Markdown(
        value="""
    ## 使用说明
    1. 上传PDF文件
    2. 选择识别选项
    3. 点击"开始处理"按钮
    4. 查看转换结果并下载Markdown文件
    ## 限制
    - 当前主要支持PDF文本提取
    - 公式识别需要额外配置OCR服务
    - 表格识别为基本功能
    """
    )
if __name__ == "__main__":
    # Listen on every interface at port 7860 and do not create a public
    # share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)