MinerU-noGPU

Build error

App Files Files Community

MinerU-noGPU / app.py

AloneDancer

Update app.py

55cb0f8 verified 5 months ago

raw

history blame contribute delete

7.65 kB

	import gradio as gr
	import fitz # PyMuPDF
	import os
	import tempfile
	import aiofiles
	from typing import Optional
	import markdown2
	from pathlib import Path
	import io
	import base64

	class MinerUProcessor:
	    """Convert uploaded documents to Markdown using CPU-only tooling.

	    PDFs are read with PyMuPDF (``fitz``): the text of every page is emitted
	    and, optionally, the tables detected on each page. Image files are
	    accepted but only produce a notice, because no OCR is performed here.
	    """

	    def __init__(self):
	        # Lower-case file extensions (including the dot) this processor accepts.
	        self.supported_formats = ['.pdf', '.png', '.jpg', '.jpeg', '.bmp', '.tiff']

	    @staticmethod
	    def _format_cell(cell) -> str:
	        """Render one table cell so that it cannot break its Markdown row."""
	        if cell is None:
	            # Empty cells are extracted as None; show them blank, not "None".
	            return ""
	        # A raw newline would end the row early, so collapse all whitespace.
	        flattened = " ".join(str(cell).split())
	        # A literal pipe would be read as a column separator unless escaped.
	        return flattened.replace("|", "\\|")

	    @staticmethod
	    def _table_to_markdown(rows) -> str:
	        """Render extracted rows as a Markdown table; the first row is the header."""
	        lines = []
	        for row_idx, row in enumerate(rows):
	            cells = [MinerUProcessor._format_cell(cell) for cell in row]
	            lines.append("| " + " | ".join(cells) + " |\n")
	            if row_idx == 0:
	                # Separator line that turns the first row into the table header.
	                lines.append("| " + " | ".join("---" for _ in row) + " |\n")
	        return "".join(lines)

	    async def process_pdf(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
	        """Extract the text (and optionally the tables) of a PDF as Markdown.

	        Args:
	            file_path: Path of the PDF to open with PyMuPDF.
	            enable_formula: Accepted for interface compatibility; this method
	                never reads it.
	            enable_table: When true, tables found on a page are appended after
	                that page's text.

	        Returns:
	            The Markdown text, or a message starting with "处理PDF时出错" when
	            any step raises.
	        """
	        try:
	            parts = []
	            # The context manager closes the document even if a page fails.
	            with fitz.open(file_path) as doc:
	                for page_number, page in enumerate(doc, start=1):
	                    text = page.get_text()

	                    parts.append(f"# Page {page_number}\n\n")
	                    # Two trailing spaces are what Markdown treats as a hard
	                    # line break; a single space is silently ignored.
	                    parts.append(text.replace('\n', '  \n') + "\n\n")

	                    if not enable_table:
	                        continue
	                    tables = page.find_tables().tables
	                    if not tables:
	                        continue

	                    parts.append("## Tables\n\n")
	                    for i, tab in enumerate(tables):
	                        table_data = tab.extract()
	                        if table_data:
	                            parts.append(f"### Table {i + 1}\n\n")
	                            parts.append(self._table_to_markdown(table_data))
	                            parts.append("\n")

	            return "".join(parts)

	        except Exception as e:
	            # Errors are reported as text because the caller displays the result.
	            return f"处理PDF时出错: {str(e)}"

	    async def process_image(self, file_path: str) -> str:
	        """Return a Markdown notice for an image file; no OCR is performed.

	        Args:
	            file_path: Path of the image; only its base name is used.

	        Returns:
	            A heading naming the file followed by a notice, or a message
	            starting with "处理图片时出错" when building it raises.
	        """
	        try:
	            return f"# Image File: {os.path.basename(file_path)}\n\n⚠️ 图片OCR功能需要额外配置Tesseract。当前仅支持PDF文件的文本提取。"

	        except Exception as e:
	            return f"处理图片时出错: {str(e)}"

	    async def process_file(self, file_path: str, enable_formula: bool, enable_table: bool) -> str:
	        """Dispatch a file to the PDF or image handler by its extension.

	        Args:
	            file_path: Path of the file to convert.
	            enable_formula: Forwarded to ``process_pdf``.
	            enable_table: Forwarded to ``process_pdf``.

	        Returns:
	            The handler's Markdown (or error text), or a message starting with
	            "处理文件时出错" when dispatching itself raises.
	        """
	        try:
	            file_ext = os.path.splitext(file_path)[1].lower()

	            if file_ext == '.pdf':
	                return await self.process_pdf(file_path, enable_formula, enable_table)
	            # Everything that is not a PDF is treated as an image.
	            return await self.process_image(file_path)

	        except Exception as e:
	            return f"处理文件时出错: {str(e)}"

	async def process_document(
	    file: Optional[gr.components.File] = None,
	    enable_formula: bool = True,
	    enable_table: bool = True
	):
	    """Convert an uploaded document and build the values for the three outputs.

	    Args:
	        file: The uploaded file. Either a filesystem path (what a file input
	            configured with ``type="filepath"`` delivers) or an object that
	            exposes the path as ``.name``. ``None`` means nothing was uploaded.
	        enable_formula: Forwarded to the processor.
	        enable_table: Forwarded to the processor.

	    Returns:
	        A 3-tuple ``(markdown_or_message, html_preview_or_None, download_button)``.
	        The download button is hidden unless a conversion was performed.
	    """
	    def hidden_download_button():
	        # Every failure path hides the download button in the same way.
	        return gr.DownloadButton("📥 下载Markdown文件", visible=False)

	    if file is None:
	        return "请上传PDF文件", None, hidden_download_button()

	    try:
	        processor = MinerUProcessor()
	        # A "filepath" upload is a plain string, which has no `.name`; only
	        # file-like wrappers carry the path in that attribute.
	        if isinstance(file, (str, os.PathLike)):
	            file_path = os.fspath(file)
	        else:
	            file_path = file.name
	        file_ext = os.path.splitext(file_path)[1].lower()

	        if file_ext not in processor.supported_formats:
	            return f"不支持的文件格式: {file_ext}。支持格式: {', '.join(processor.supported_formats)}", None, hidden_download_button()

	        markdown_content = await processor.process_file(file_path, enable_formula, enable_table)

	        # The text comes from an untrusted upload, so raw HTML in it is escaped;
	        # the "tables" extra is needed for pipe tables to render in the preview.
	        html_preview = markdown2.markdown(
	            markdown_content,
	            extras=["tables"],
	            safe_mode="escape",
	        )

	        # Name the download after the uploaded file, with a .md extension.
	        original_name = os.path.basename(file_path)
	        base_name = os.path.splitext(original_name)[0]
	        download_filename = f"{base_name}.md"

	        # Attach the file to the button now, so the first click already has
	        # something to download.
	        download_dir = tempfile.mkdtemp(prefix="mineru_")
	        download_path = os.path.join(download_dir, download_filename)
	        with open(download_path, "w", encoding="utf-8") as md_file:
	            md_file.write(markdown_content)

	        return markdown_content, html_preview, gr.DownloadButton("📥 下载Markdown文件", value=download_path, visible=True)

	    except Exception as e:
	        # Top-level UI boundary: report the failure as text instead of raising.
	        return f"处理过程中出错: {str(e)}", None, hidden_download_button()

	def create_download_file(markdown_content: str):
	    """Write the Markdown text to a temporary ``.md`` file for download.

	    Args:
	        markdown_content: The text currently shown in the result box.

	    Returns:
	        The path of the newly written UTF-8 file, or ``None`` when the text is
	        empty or is one of the application's status/error messages rather
	        than a conversion result.
	    """
	    # Prefixes of every message the app shows in place of a real result.
	    # "请上传文件" is kept from the earlier check for backward compatibility.
	    message_prefixes = (
	        "处理过程中出错",
	        "处理PDF时出错",
	        "处理图片时出错",
	        "处理文件时出错",
	        "不支持的文件格式",
	        "请上传PDF文件",
	        "请上传文件",
	    )
	    if not markdown_content or markdown_content.startswith(message_prefixes):
	        return None

	    # delete=False keeps the file on disk after closing so it can be served;
	    # the with-block guarantees the handle is closed even if the write fails.
	    with tempfile.NamedTemporaryFile(suffix='.md', delete=False, mode='w', encoding='utf-8') as temp_file:
	        temp_file.write(markdown_content)

	    return temp_file.name

	# Build the Gradio interface: inputs in the left column, results on the right.
	with gr.Blocks(theme=gr.themes.Soft(), title="MinerU PDF to Markdown") as demo:
	    # Page header and a short description of what the tool supports.
	    gr.Markdown("""
	# 📄 MinerU PDF to Markdown Converter

	将PDF文档转换为Markdown格式，支持基本的文本提取和表格识别。

	注意: 当前版本主要支持PDF文本提取，公式识别需要额外配置。
	""")

	    with gr.Row():
	        # Left column: file upload, recognition options and the run button.
	        with gr.Column(scale=1):
	            file_input = gr.File(
	                type="filepath",
	                file_types=[".pdf"],
	                label="上传PDF文档",
	            )

	            with gr.Group():
	                gr.Markdown("### 识别选项")
	                enable_formula = gr.Checkbox(
	                    value=False,
	                    label="Enable formula recognition",
	                    info="需要额外配置（当前不可用）",
	                )
	                enable_table = gr.Checkbox(
	                    value=True,
	                    label="Enable table recognition",
	                    info="是否启用表格识别",
	                )

	            process_btn = gr.Button("🚀 开始处理", variant="primary")

	        # Right column: Markdown result, collapsible HTML preview, download.
	        with gr.Column(scale=2):
	            markdown_output = gr.Code(
	                interactive=False,
	                lines=15,
	                language="markdown",
	                label="转换结果 (Markdown)",
	            )

	            with gr.Accordion("HTML预览", open=False):
	                html_output = gr.HTML()

	            download_btn = gr.DownloadButton("📥 下载Markdown文件", visible=False)

	    # Run the conversion when the process button is clicked.
	    process_btn.click(
	        process_document,
	        inputs=[file_input, enable_formula, enable_table],
	        outputs=[markdown_output, html_output, download_btn],
	    )

	    # Produce the downloadable file from the text currently displayed.
	    download_btn.click(create_download_file, inputs=markdown_output, outputs=download_btn)

	    # Usage instructions and known limitations shown below the controls.
	    gr.Markdown("""
	## 使用说明

	1. 上传PDF文件
	2. 选择识别选项
	3. 点击"开始处理"按钮
	4. 查看转换结果并下载Markdown文件

	## 限制

	- 当前主要支持PDF文本提取
	- 公式识别需要额外配置OCR服务
	- 表格识别为基本功能
	""")

	if __name__ == "__main__":
	    # Serve on all network interfaces at port 7860, without a public share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	    }
	    demo.launch(**launch_options)