Spaces:

marcosremar2
/

docker_mineru

Sleeping

App Files Files Community

docker_mineru / app.py

marcosremar2

Fix: Use PymuDocDataset in API endpoint

53a34c2 8 months ago

raw

history blame

6.44 kB

	from fastapi import FastAPI, UploadFile, File, HTTPException
	from fastapi.responses import JSONResponse
	from fastapi.middleware.cors import CORSMiddleware
	import tempfile
	import os
	import json
	import traceback
	from datetime import datetime
	from typing import Dict, List, Any, Optional

	# Import necessary components from magic_pdf based on convert_pdf.py
	from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
	from magic_pdf.data.dataset import PymuDocDataset
	from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
	from magic_pdf.config.enums import SupportedPdfParseMethod

	# Long-form API description, handed to FastAPI as the app's `description` below.
	app_description = """
	# MinerU PDF Processor API

	This API provides PDF processing capabilities using MinerU's magic-pdf library.
	It extracts text content and generates markdown from PDF documents.

	## Features:
	- PDF text extraction
	- Markdown conversion
	- Layout analysis (via output files)
	"""

	# The ASGI application object; routes and middleware below attach to it.
	app = FastAPI(
	    version="1.0.0",
	    title="MinerU PDF API",
	    contact=dict(name="PDF Converter Service"),
	    description=app_description,
	)

	# Add CORS middleware so browser clients on any origin can call the API.
	#
	# Credentials are deliberately disabled: a wildcard origin must not be
	# combined with credentialed requests (FastAPI's CORS docs require explicit
	# origins, methods and headers whenever allow_credentials=True). The
	# endpoints visible in this file use neither cookies nor Authorization
	# headers, so anonymous cross-origin access is all that is needed.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],  # Allow all origins
	    allow_credentials=False,  # Must stay False while origins is a wildcard
	    allow_methods=["*"],  # Allow all methods
	    allow_headers=["*"],  # Allow all headers
	)

	# Output locations, relative to the process working directory (the app's
	# working directory inside the container). Images go in a subdirectory of
	# the markdown output directory.
	local_md_dir = "output"
	local_image_dir = "output/images"

	# Make sure both directories exist before any request tries to write to them.
	os.makedirs(local_md_dir, exist_ok=True)
	os.makedirs(local_image_dir, exist_ok=True)

	# Health check endpoint
	@app.get("/health", tags=["Health"])
	async def health_check() -> Dict[str, Any]:
	    """
	    Report that the service is up.

	    Returns:
	        A dict with a fixed "status" of "healthy", the current time from
	        `datetime.now()` as an ISO-formatted string under "timestamp",
	        and the service name under "service".
	    """
	    current_time = datetime.now().isoformat()
	    # Key order is kept stable so the serialized response does not change.
	    return dict(
	        status="healthy",
	        timestamp=current_time,
	        service="mineru-pdf-processor",
	    )

	def _remove_files_in(directory: str) -> None:
	    """Delete the regular files directly inside *directory*; subdirectories are left alone."""
	    for entry in os.listdir(directory):
	        entry_path = os.path.join(directory, entry)
	        # Skip anything that is not a plain file: os.remove() raises on directories.
	        if os.path.isfile(entry_path):
	            os.remove(entry_path)


	@app.post("/extract", tags=["PDF Processing"])
	async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
	    """
	    Process a PDF file using PymuDocDataset and return the extracted markdown content.

	    Parameters:
	        file: The PDF file to process. Its filename must end in ".pdf"
	            (case-insensitive) and its content must not be empty.

	    Returns:
	        A JSON object with "filename", "status" ("success") and
	        "markdown_content". If processing fails, a JSONResponse with status
	        500 and the keys "error", "detail" and "filename" is returned instead.

	    Raises:
	        HTTPException: 400 when the upload has no ".pdf" filename or is empty.

	    Side effects:
	        Clears the files in the shared output directories, then writes the
	        images, debug PDFs, JSON dumps and markdown for this request there.
	    """
	    if not file.filename or not file.filename.lower().endswith('.pdf'):
	        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

	    content = await file.read()
	    # Reject empty uploads up front; otherwise they only fail deep inside the
	    # parser and come back as a misleading 500.
	    if not content:
	        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

	    temp_pdf_path = None

	    # NOTE(review): everything below is synchronous, so it runs on the event
	    # loop thread, and it shares one output directory between requests. Moving
	    # it to a worker thread would also require per-request output directories.
	    try:
	        # Save the uploaded PDF to a temporary file. Only its unique name is
	        # used (for the output file names); parsing reads the bytes directly.
	        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
	            # Record the path before writing so `finally` can remove the file
	            # even if the write itself fails.
	            temp_pdf_path = temp_pdf.name
	            temp_pdf.write(content)

	        # Clear previous output files (optional, depending on desired behavior)
	        # You might want to handle output naming differently in a multi-user API context
	        # For simplicity, we'll clear the output dir here like in convert_pdf.py
	        _remove_files_in(local_image_dir)
	        _remove_files_in(local_md_dir)

	        # Get filename and prepare output paths for magic-pdf
	        pdf_file_name = os.path.basename(temp_pdf_path)
	        name_without_suff = os.path.splitext(pdf_file_name)[0]
	        image_dir_rel_path = str(os.path.basename(local_image_dir))  # Relative path for markdown image links

	        # Setup writers
	        image_writer = FileBasedDataWriter(local_image_dir)
	        md_writer = FileBasedDataWriter(local_md_dir)

	        # Use PymuDocDataset for processing
	        ds = PymuDocDataset(content)  # Pass pdf bytes directly

	        # Inference and pipeline based on PDF type
	        if ds.classify() == SupportedPdfParseMethod.OCR:
	            infer_result = ds.apply(doc_analyze, ocr=True)
	            pipe_result = infer_result.pipe_ocr_mode(image_writer)
	        else:
	            infer_result = ds.apply(doc_analyze, ocr=False)
	            pipe_result = infer_result.pipe_txt_mode(image_writer)

	        # Optional: Generate intermediate output files (comment out if not needed for API)
	        infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
	        pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
	        pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
	        pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir_rel_path)
	        pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')

	        # Get markdown content
	        md_content = pipe_result.get_markdown(image_dir_rel_path)

	        # Dump markdown to file (optional for API, but useful for debugging/access)
	        md_file_path = f"{name_without_suff}.md"
	        pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
	        print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")

	        # Return the markdown content in the response
	        return {
	            "filename": file.filename,
	            "status": "success",
	            "markdown_content": md_content
	            # You could potentially add links to the generated files here if needed
	            # "output_files": { ... }
	        }

	    except Exception as e:
	        # Top-level boundary for the request: log the failure and report it
	        # as a 500 with the same payload shape clients already receive.
	        error_detail = str(e)
	        error_trace = traceback.format_exc()

	        # Log the error
	        print(f"Error processing PDF: {error_detail}")
	        print(error_trace)

	        return JSONResponse(
	            status_code=500,
	            content={
	                "error": "Error processing PDF",
	                "detail": error_detail,
	                # The filename was validated as non-empty above.
	                "filename": file.filename
	            }
	        )

	    finally:
	        # Clean up the temporary file
	        if temp_pdf_path and os.path.exists(temp_pdf_path):
	            try:
	                os.unlink(temp_pdf_path)
	            except OSError as cleanup_error:
	                # Best-effort cleanup: a leftover temp file must not change
	                # the response, but the failure should not vanish silently.
	                print(f"Warning: could not remove temporary file {temp_pdf_path}: {cleanup_error}")

	if __name__ == "__main__":
	    # uvicorn is imported here, not at module top, so it is only needed
	    # when the file is run directly as a script.
	    import uvicorn

	    # Listen on all interfaces, port 7860, with auto-reload off.
	    server_options = {"host": "0.0.0.0", "port": 7860, "reload": False}
	    uvicorn.run("app:app", **server_options)