Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, UploadFile, File, HTTPException | |
| from fastapi.responses import JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| import tempfile | |
| import os | |
| import json | |
| import traceback | |
| from datetime import datetime | |
| from typing import Dict, List, Any, Optional | |
| # Import necessary components from magic_pdf based on convert_pdf.py | |
| from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader | |
| from magic_pdf.data.dataset import PymuDocDataset | |
| from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze | |
| from magic_pdf.config.enums import SupportedPdfParseMethod | |
# ---------------------------------------------------------------------------
# Application metadata
# ---------------------------------------------------------------------------
# Markdown text passed to FastAPI as the API description.
app_description = """
# MinerU PDF Processor API
This API provides PDF processing capabilities using MinerU's magic-pdf library.
It extracts text content and generates markdown from PDF documents.
## Features:
- PDF text extraction
- Markdown conversion
- Layout analysis (via output files)
"""

app = FastAPI(
    title="MinerU PDF API",
    description=app_description,
    version="1.0.0",
    contact={"name": "PDF Converter Service"},
)

# CORS: every origin, method and header is accepted, with credentials enabled.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive - confirm this is intended for the deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Output locations, relative to the process working directory. The image
# directory lives inside the markdown/output directory.
local_image_dir = "output/images"
local_md_dir = "output"
for _output_dir in (local_image_dir, local_md_dir):
    os.makedirs(_output_dir, exist_ok=True)
# Health check endpoint
# NOTE(review): no route decorator is visible on this function in the reviewed
# text - confirm that it is registered on `app`.
async def health_check() -> Dict[str, Any]:
    """Report that the service is running.

    Returns:
        A dict with a fixed ``status`` of ``"healthy"``, the current local
        time as an ISO-8601 string under ``timestamp``, and the fixed
        ``service`` name.
    """
    current_time = datetime.now().isoformat()
    return dict(
        status="healthy",
        timestamp=current_time,
        service="mineru-pdf-processor",
    )
def _clear_output_files() -> None:
    """Delete the regular files left in the image and markdown output directories."""
    for directory in (local_image_dir, local_md_dir):
        for entry in os.listdir(directory):
            entry_path = os.path.join(directory, entry)
            # Only remove regular files: os.remove() raises on directories,
            # and the image directory is itself nested in the output directory.
            if os.path.isfile(entry_path):
                os.remove(entry_path)


# NOTE(review): no route decorator is visible on this function in the reviewed
# text - confirm that it is registered on `app`.
async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
    """Convert an uploaded PDF to markdown with magic-pdf.

    The upload is validated, the shared output directories are emptied, the
    PDF bytes are run through the OCR or text pipeline (chosen by
    ``PymuDocDataset.classify()``), and the debug artefacts plus the markdown
    file are written under ``local_md_dir``.

    Parameters:
        file: The uploaded PDF file.

    Returns:
        On success, a dict with ``filename``, ``status`` (``"success"``) and
        ``markdown_content``. If processing fails, a ``JSONResponse`` with
        status 500 carrying ``error``, ``detail`` and ``filename``.

    Raises:
        HTTPException: 400 when the filename is missing or does not end in
            ``.pdf``, or when the uploaded file is empty.
    """
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")

    content = await file.read()
    if not content:
        # Reject empty uploads up front instead of surfacing a parser error as a 500.
        raise HTTPException(status_code=400, detail="Invalid file. The uploaded PDF is empty.")

    temp_pdf_path = None
    try:
        # Persist the upload to a temporary file; its unique name is reused
        # below as the stem of every generated output file.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf.write(content)
            temp_pdf_path = temp_pdf.name

        # Start from empty output directories, as convert_pdf.py does.
        # NOTE(review): the directories are shared by all requests, so this
        # also removes artefacts of earlier requests - revisit for multi-user use.
        _clear_output_files()

        # Derive the output file stem and the image path used in markdown links.
        pdf_file_name = os.path.basename(temp_pdf_path)
        name_without_suff = os.path.splitext(pdf_file_name)[0]
        image_dir_rel_path = str(os.path.basename(local_image_dir))

        # Writers for extracted images and for markdown/JSON artefacts.
        image_writer = FileBasedDataWriter(local_image_dir)
        md_writer = FileBasedDataWriter(local_md_dir)

        # The dataset is built from the raw PDF bytes, not from the temp file.
        ds = PymuDocDataset(content)

        # Pick the OCR or text pipeline according to the dataset classification.
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        # Intermediate artefacts, kept for debugging / later access.
        infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
        pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
        pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
        pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir_rel_path)
        pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')

        # Markdown returned to the caller, plus a copy written to disk.
        md_content = pipe_result.get_markdown(image_dir_rel_path)
        md_file_path = f"{name_without_suff}.md"
        pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
        print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")

        return {
            "filename": file.filename,
            "status": "success",
            "markdown_content": md_content
        }
    except Exception as e:
        # Top-level boundary: log the failure and report it as a 500 response.
        error_detail = str(e)
        error_trace = traceback.format_exc()
        print(f"Error processing PDF: {error_detail}")
        print(error_trace)
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                # file.filename was validated as non-empty before the try block.
                "filename": file.filename
            }
        )
    finally:
        # Best-effort removal of the temporary file; a failure here must not
        # mask the response, but it is logged rather than silently dropped.
        if temp_pdf_path is not None:
            try:
                os.unlink(temp_pdf_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {temp_pdf_path}: {cleanup_error}")
if __name__ == "__main__":
    # uvicorn is only needed when this file is run directly, so it is
    # imported here rather than at the top of the module.
    import uvicorn

    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        reload=False,
    )