Spaces:

marcosremar2
/

docker_mineru

Sleeping

App Files Community

marcosremar2 committed on May 3

Commit

d179ac1

1 Parent(s): 3d9ca9a

Update PDF to Markdown converter API with NVIDIA L4 support

Browse files

Files changed (2) hide show

app.py +6 -167
app/main.py +8 -1

app.py CHANGED Viewed

@@ -1,171 +1,10 @@
-from fastapi import FastAPI, UploadFile, File, HTTPException
-from fastapi.responses import JSONResponse
-from fastapi.middleware.cors import CORSMiddleware
-import tempfile
-import os
-import json
-import traceback
-from datetime import datetime
-from typing import Dict, List, Any, Optional
-# Import necessary components from magic_pdf based on convert_pdf.py
-from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.config.enums import SupportedPdfParseMethod
-# Application metadata
-app_description = """
-# MinerU PDF Processor API
-This API provides PDF processing capabilities using MinerU's magic-pdf library.
-It extracts text content and generates markdown from PDF documents.
-## Features:
-- PDF text extraction
-- Markdown conversion
-- Layout analysis (via output files)
 """
-app = FastAPI(
-    title="MinerU PDF API",
-    description=app_description,
-    version="1.0.0",
-    contact={
-        "name": "PDF Converter Service",
-    },
-)
-# Add CORS middleware to allow cross-origin requests
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # Allow all origins
-    allow_credentials=True,
-    allow_methods=["*"],  # Allow all methods
-    allow_headers=["*"],  # Allow all headers
-)
-# Define output directories (relative to the app's working directory in the container)
-local_image_dir, local_md_dir = "output/images", "output"
-os.makedirs(local_image_dir, exist_ok=True)
-os.makedirs(local_md_dir, exist_ok=True)
-# Health check endpoint
-@app.get("/health", tags=["Health"])
-async def health_check() -> Dict[str, Any]:
-    """
-    Health check endpoint to verify the service is running.
-    Returns the service status and current time.
-    """
-    return {
-        "status": "healthy",
-        "timestamp": datetime.now().isoformat(),
-        "service": "mineru-pdf-processor"
-    }
-@app.post("/extract", tags=["PDF Processing"])
-async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
-    """
-    Process a PDF file using PymuDocDataset and return the extracted markdown content.
-    Parameters:
-        file: The PDF file to process
-    Returns:
-        A JSON object containing the extracted markdown and status.
-    """
-    if not file.filename or not file.filename.lower().endswith('.pdf'):
-        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
-    content = await file.read()
-    temp_pdf_path = None
-    try:
-        # Save the uploaded PDF to a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
-            temp_pdf.write(content)
-            temp_pdf_path = temp_pdf.name
-        # Clear previous output files (optional, depending on desired behavior)
-        # You might want to handle output naming differently in a multi-user API context
-        # For simplicity, we'll clear the output dir here like in convert_pdf.py
-        for item in os.listdir(local_image_dir):
-            os.remove(os.path.join(local_image_dir, item))
-        for item in os.listdir(local_md_dir):
-             if os.path.isfile(os.path.join(local_md_dir, item)):
-                 os.remove(os.path.join(local_md_dir, item))
-        # Get filename and prepare output paths for magic-pdf
-        pdf_file_name = os.path.basename(temp_pdf_path)
-        name_without_suff = os.path.splitext(pdf_file_name)[0]
-        image_dir_rel_path = str(os.path.basename(local_image_dir)) # Relative path for markdown image links
-        # Setup writers
-        image_writer = FileBasedDataWriter(local_image_dir)
-        md_writer = FileBasedDataWriter(local_md_dir)
-        # Use PymuDocDataset for processing
-        ds = PymuDocDataset(content) # Pass pdf bytes directly
-        # Inference and pipeline based on PDF type
-        if ds.classify() == SupportedPdfParseMethod.OCR:
-            infer_result = ds.apply(doc_analyze, ocr=True)
-            pipe_result = infer_result.pipe_ocr_mode(image_writer)
-        else:
-            infer_result = ds.apply(doc_analyze, ocr=False)
-            pipe_result = infer_result.pipe_txt_mode(image_writer)
-        # Optional: Generate intermediate output files (comment out if not needed for API)
-        infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
-        pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
-        pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
-        pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir_rel_path)
-        pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
-        # Get markdown content
-        md_content = pipe_result.get_markdown(image_dir_rel_path)
-        # Dump markdown to file (optional for API, but useful for debugging/access)
-        md_file_path = f"{name_without_suff}.md"
-        pipe_result.dump_md(md_writer, md_file_path, image_dir_rel_path)
-        print(f"Markdown saved to: {os.path.join(local_md_dir, md_file_path)}")
-        # Return the markdown content in the response
-        return {
-            "filename": file.filename,
-            "status": "success",
-            "markdown_content": md_content
-            # You could potentially add links to the generated files here if needed
-            # "output_files": { ... }
-        }
-    except Exception as e:
-        error_detail = str(e)
-        error_trace = traceback.format_exc()
-        # Log the error
-        print(f"Error processing PDF: {error_detail}")
-        print(error_trace)
-        return JSONResponse(
-            status_code=500,
-            content={
-                "error": "Error processing PDF",
-                "detail": error_detail,
-                "filename": file.filename if file and hasattr(file, 'filename') else None
-            }
-        )
-    finally:
-        # Clean up the temporary file
-        if temp_pdf_path and os.path.exists(temp_pdf_path):
-            try:
-                os.unlink(temp_pdf_path)
-            except Exception:
-                pass
 if __name__ == "__main__":
-    # Keep uvicorn import here for local running
-    import uvicorn
-    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)

+"""
+Simple entry point for Hugging Face Spaces.
+This file redirects to the FastAPI app in the app directory.
 """
+from app.main import app
 if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run("app:app", host="0.0.0.0", port=7860)

app/main.py CHANGED Viewed

@@ -57,11 +57,18 @@ async def health_check() -> Dict[str, Any]:
     Health check endpoint to verify the service is running.
     Returns the service status and current time.
     """
     return {
         "status": "healthy",
         "timestamp": datetime.now().isoformat(),
         "service": "pdf-to-markdown-converter",
-        "gpu": "CUDA enabled" if torch.cuda.is_available() else "CPU only"
     }
 @app.post("/convert", tags=["PDF Processing"])

     Health check endpoint to verify the service is running.
     Returns the service status and current time.
     """
+    gpu_info = {
+        "cuda_available": torch.cuda.is_available(),
+        "device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
+        "device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A",
+        "current_device": torch.cuda.current_device() if torch.cuda.is_available() else -1
+    }
     return {
         "status": "healthy",
         "timestamp": datetime.now().isoformat(),
         "service": "pdf-to-markdown-converter",
+        "gpu": gpu_info
     }
 @app.post("/convert", tags=["PDF Processing"])