MinerU-noGPU

Build error

App Files Files Community

AloneDancer committed on Aug 23, 2025

Commit

55cb0f8

verified ·

1 Parent(s): 7a422b5

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -48

app.py CHANGED Viewed

@@ -1,57 +1,76 @@
 import gradio as gr
 import fitz  # PyMuPDF
-from pymupdf4llm import get_markdown
 import os
 import tempfile
 import aiofiles
 from typing import Optional
 import markdown2
 from pathlib import Path
-import asyncio
 class MinerUProcessor:
     def __init__(self):
         self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']
     async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
         """处理文件并返回markdown内容"""
         try:
             file_ext = os.path.splitext(file_path)[1].lower()
             if file_ext == '.pdf':
-                # 处理PDF文件
-                markdown_text = get_markdown(
-                    file_path,
-                    write_images=False,
-                    use_latex=enable_formula,
-                    use_table=enable_table
-                )
             else:
-                # 处理图片文件
-                doc = fitz.open()
-                img = fitz.open(file_path)
-                pdf_bytes = img.convert_to_pdf()
-                img.close()
-                # 保存为临时PDF
-                temp_pdf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
-                temp_pdf.close()
-                with open(temp_pdf.name, 'wb') as f:
-                    f.write(pdf_bytes)
-                markdown_text = get_markdown(
-                    temp_pdf.name,
-                    write_images=False,
-                    use_latex=enable_formula,
-                    use_table=enable_table
-                )
-                # 清理临时文件
-                if os.path.exists(temp_pdf.name):
-                    os.remove(temp_pdf.name)
-            return markdown_text
         except Exception as e:
             return f"处理文件时出错: {str(e)}"
@@ -63,7 +82,7 @@ async def process_document(
 ):
     """处理文档的主函数"""
     if file is None:
-        return "请上传文件", None, None
     try:
         processor = MinerUProcessor()
@@ -71,7 +90,7 @@ async def process_document(
         file_ext = os.path.splitext(file_path)[1].lower()
         if file_ext not in processor.supported_formats:
-            return f"不支持的文件格式: {file_ext}", None, None
         # 异步处理文件
         markdown_content = await processor.process_file(file_path, enable_formula, enable_table)
@@ -87,7 +106,7 @@ async def process_document(
         return markdown_content, html_preview, gr.DownloadButton("📥 下载Markdown文件", visible=True)
     except Exception as e:
-        return f"处理过程中出错: {str(e)}", None, None
 def create_download_file(markdown_content: str):
     """创建下载文件"""
@@ -102,18 +121,20 @@ def create_download_file(markdown_content: str):
     return temp_file.name
 # 创建Gradio界面
-with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 📄 MinerU Document Processor
-    将PDF和图片文档转换为Markdown格式，支持公式和表格识别。
     """)
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
-                label="上传文档",
-                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                 type="filepath"
             )
@@ -121,8 +142,8 @@ with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as dem
                 gr.Markdown("### 识别选项")
                 enable_formula = gr.Checkbox(
                     label="Enable formula recognition",
-                    value=True,
-                    info="是否启用公式识别"
                 )
                 enable_table = gr.Checkbox(
                     label="Enable table recognition",
@@ -140,10 +161,8 @@ with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as dem
                 interactive=False
             )
-            html_output = gr.HTML(
-                label="HTML预览",
-                visible=False
-            )
             download_btn = gr.DownloadButton(
                 "📥 下载Markdown文件",
@@ -163,6 +182,22 @@ with gr.Blocks(title="MinerU Document Processor", theme=gr.themes.Soft()) as dem
         inputs=markdown_output,
         outputs=download_btn
     )
 if __name__ == "__main__":
     demo.launch(

 import gradio as gr
 import fitz  # PyMuPDF
 import os
 import tempfile
 import aiofiles
 from typing import Optional
 import markdown2
 from pathlib import Path
+import io
+import base64
 class MinerUProcessor:
     """Convert uploaded documents to Markdown using PyMuPDF only.

     PDF files get per-page text extraction plus optional table extraction.
     Image files only produce a notice, because no OCR backend is wired in.
     All public methods report failures by returning an error string rather
     than raising, so callers can show the result directly in the UI.
     """

     def __init__(self):
         # Extensions callers may accept; process_file routes everything
         # except '.pdf' to process_image.
         self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']

     @staticmethod
     def _format_cell(cell) -> str:
         """Render one table cell as text that cannot break a Markdown row."""
         if cell is None:
             # Empty cells are extracted as None; show them as blank rather
             # than the literal text "None".
             return ""
         # A raw '|' would start a new column and a raw newline would end
         # the row, so escape the former and turn the latter into <br>.
         return str(cell).replace("|", "\\|").replace("\n", "<br>")

     @classmethod
     def _table_to_markdown(cls, table_data) -> str:
         """Render extracted rows as a pipe table; the first row is the header."""
         lines = []
         for row_idx, row in enumerate(table_data):
             cells = " | ".join(cls._format_cell(cell) for cell in row)
             lines.append("| " + cells + " |\n")
             if row_idx == 0:
                 # Separator row that marks the first row as the header.
                 lines.append("| " + " | ".join("---" for _ in row) + " |\n")
         return "".join(lines)

     async def process_pdf(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
         """Extract text (and optionally tables) from a PDF as Markdown.

         Args:
             file_path: Path of the PDF to open.
             enable_formula: Accepted for interface compatibility; formula
                 recognition is not implemented, so the value is ignored.
             enable_table: When true, tables found on each page are appended
                 after that page's text.

         Returns:
             The Markdown text, or an error message string if anything fails.
         """
         try:
             parts = []
             # The context manager closes the document even when a page
             # fails part-way through, which a trailing close() would not.
             with fitz.open(file_path) as doc:
                 for page_num, page in enumerate(doc, start=1):
                     text = page.get_text()
                     parts.append(f"# Page {page_num}\n\n")
                     # Two trailing spaces force a Markdown hard line break.
                     parts.append(text.replace('\n', '  \n') + "\n\n")
                     if not enable_table:
                         continue
                     tabs = page.find_tables()
                     if not tabs.tables:
                         continue
                     parts.append("## Tables\n\n")
                     for i, tab in enumerate(tabs.tables, start=1):
                         table_data = tab.extract()
                         if table_data:
                             parts.append(f"### Table {i}\n\n")
                             parts.append(self._table_to_markdown(table_data))
                             parts.append("\n")
             return "".join(parts)
         except Exception as e:
             return f"处理PDF时出错: {str(e)}"

     async def process_image(self, file_path: str) -> str:
         """Return a Markdown notice for an image file; no OCR is performed."""
         try:
             # OCR would need Tesseract, which is not configured here, so only
             # the file name and an explanatory notice are returned.
             return f"# Image File: {os.path.basename(file_path)}\n\n⚠️ 图片OCR功能需要额外配置Tesseract。当前仅支持PDF文件的文本提取。"
         except Exception as e:
             return f"处理图片时出错: {str(e)}"

     async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
         """Dispatch a file to the PDF or image handler and return Markdown.

         The choice is made on the lower-cased file extension: '.pdf' goes to
         process_pdf, anything else to process_image. Errors are returned as
         a message string instead of being raised.
         """
         try:
             file_ext = os.path.splitext(file_path)[1].lower()
             if file_ext == '.pdf':
                 return await self.process_pdf(file_path, enable_formula, enable_table)
             else:
                 return await self.process_image(file_path)
         except Exception as e:
             return f"处理文件时出错: {str(e)}"
 ):
     """处理文档的主函数"""
     if file is None:
+        return "请上传PDF文件", None, gr.DownloadButton("📥 下载Markdown文件", visible=False)
     try:
         processor = MinerUProcessor()
         file_ext = os.path.splitext(file_path)[1].lower()
         if file_ext not in processor.supported_formats:
+            return f"不支持的文件格式: {file_ext}。支持格式: {', '.join(processor.supported_formats)}", None, gr.DownloadButton("📥 下载Markdown文件", visible=False)
         # 异步处理文件
         markdown_content = await processor.process_file(file_path, enable_formula, enable_table)
         return markdown_content, html_preview, gr.DownloadButton("📥 下载Markdown文件", visible=True)
     except Exception as e:
+        return f"处理过程中出错: {str(e)}", None, gr.DownloadButton("📥 下载Markdown文件", visible=False)
 def create_download_file(markdown_content: str):
     """创建下载文件"""
     return temp_file.name
 # 创建Gradio界面
+with gr.Blocks(title="MinerU PDF to Markdown", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 📄 MinerU PDF to Markdown Converter
+    将PDF文档转换为Markdown格式，支持基本的文本提取和表格识别。
+    **注意**: 当前版本主要支持PDF文本提取，公式识别需要额外配置。
     """)
     with gr.Row():
         with gr.Column(scale=1):
             file_input = gr.File(
+                label="上传PDF文档",
+                file_types=[".pdf"],
                 type="filepath"
             )
                 gr.Markdown("### 识别选项")
                 enable_formula = gr.Checkbox(
                     label="Enable formula recognition",
+                    value=False,
+                    info="需要额外配置（当前不可用）"
                 )
                 enable_table = gr.Checkbox(
                     label="Enable table recognition",
                 interactive=False
             )
+            with gr.Accordion("HTML预览", open=False):
+                html_output = gr.HTML()
             download_btn = gr.DownloadButton(
                 "📥 下载Markdown文件",
         inputs=markdown_output,
         outputs=download_btn
     )
+    # 添加说明
+    gr.Markdown("""
+    ## 使用说明
+    1. 上传PDF文件
+    2. 选择识别选项
+    3. 点击"开始处理"按钮
+    4. 查看转换结果并下载Markdown文件
+    ## 限制
+    - 当前主要支持PDF文本提取
+    - 公式识别需要额外配置OCR服务
+    - 表格识别为基本功能
+    """)
 if __name__ == "__main__":
     demo.launch(