Spaces:

marcosremar2
/

MinerUapi

Runtime error

App Files Files Community

marcosremar2 committed on May 2, 2025

Commit

4e3d16d

0 Parent(s):

Initial deployment of MinerU PDF API

Browse files

Files changed (8) hide show

.dockerignore +41 -0
.gitattributes +35 -0
Dockerfile +87 -0
README.md +138 -0
app.py +705 -0
entrypoint.sh +202 -0
requirements.txt +10 -0
space_config.json +4 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,41 @@

+# Git files
+.git
+.gitignore
+# Python cache files
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Editor directories and files
+.idea
+.vscode
+*.swp
+*.swo
+# OS files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,87 @@

+FROM nvidia/cuda:12.1.0-base-ubuntu22.04
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    python3-venv \
+    python3-dev \
+    wget \
+    git \
+    build-essential \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    imagemagick \
+    ghostscript \
+    poppler-utils \
+    libmagickwand-dev \
+    fonts-freefont-ttf \
+    ffmpeg \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    pkg-config \
+    libcairo2-dev \
+    && rm -rf /var/lib/apt/lists/*
+# Configure ImageMagick policy to allow PDF conversion (needed for sample PDF creation)
+RUN if [ -f "/etc/ImageMagick-6/policy.xml" ]; then \
+        sed -i 's/rights="none" pattern="PDF"/rights="read|write" pattern="PDF"/g' /etc/ImageMagick-6/policy.xml; \
+    fi
+# Create a virtual environment
+RUN python3 -m venv /opt/mineru_venv
+ENV PATH="/opt/mineru_venv/bin:$PATH"
+# Upgrade pip in the virtual environment
+RUN pip install --upgrade pip
+# Clone the MinerU repository (shallow clone: history is not needed for the install)
+RUN git clone --depth 1 https://github.com/opendatalab/MinerU.git /tmp/MinerU
+# Install required packages
+RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+# Install MinerU with all features
+# NOTE: this is an editable install, so /tmp/MinerU must stay in the image
+WORKDIR /tmp/MinerU
+RUN pip install --no-cache-dir -e ".[full]"
+# Install additional dependencies for the web application
+RUN pip install --no-cache-dir flask==2.3.3 flask-cors==4.0.0 werkzeug==2.3.7
+# Create a non-root user for Hugging Face Spaces
+# This is critical for permissions on HF Spaces
+RUN useradd -m -u 1000 user
+# Create working directories and hand them to the runtime user, so that the
+# /tmp/samples fallback used by entrypoint.sh is actually writable
+RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output /tmp/samples /app/samples \
+    && chown -R user:user /tmp/pdf_uploads /tmp/pdf_output /tmp/samples /app
+# Model weights are NOT downloaded at build time; nothing in this file fetches them.
+# Copy the application files, owned by the runtime user (avoids a second chown layer)
+WORKDIR /app
+COPY --chown=user:user . /app/
+RUN mkdir -p /home/user/.config/magic_pdf && chown -R user:user /home/user/.config
+# Expose the port
+EXPOSE 7860
+# Set up entrypoint
+RUN chmod +x /app/entrypoint.sh
+# Switch to non-root user for running the app
+USER user
+# Start the application
+CMD ["/app/entrypoint.sh"]

README.md ADDED Viewed

	@@ -0,0 +1,138 @@

+---
+title: MinerU PDF Converter
+emoji: 📄
+colorFrom: blue
+colorTo: green
+sdk: docker
+app_port: 7860
+---
+# MinerU PDF Converter
+This Space provides a service for converting PDF files to Markdown and JSON formats using the MinerU PDF extraction tool.
+## Features
+- Web interface for uploading and converting PDF files
+- RESTful API for programmatic access
+- Health monitoring endpoint
+- High-quality PDF extraction with support for tables, formulas, and complex layouts
+- Output in both Markdown and structured JSON formats
+- Comprehensive error handling and fallback mechanisms
+## API Usage
+The service exposes several API endpoints for programmatic access:
+### 1. PDF Conversion Endpoint
+```
+POST /api/convert
+```
+**Request:**
+- Content-Type: multipart/form-data
+- Body: form field 'file' containing the PDF file
+**Response:**
+```json
+{
+  "success": true,
+  "message": "PDF conversion successful",
+  "job_id": "uuid",
+  "base_filename": "filename",
+  "file_info": {
+    "original_filename": "document.pdf",
+    "size_bytes": 42950,
+    "content_type": "application/pdf"
+  },
+  "markdown": "# Converted markdown content...",
+  "json": {
+    "title": "Document Title",
+    "sections": [...]
+  },
+  "log": "Processing log...",
+  "files": {
+    "markdown_path": "document.md",
+    "json_path": "document.json"
+  }
+}
+```
+### 2. Health Check Endpoint
+```
+GET /health
+```
+**Response:**
+```json
+{
+  "status": "healthy",
+  "version": "1.1.0",
+  "environment": {
+    "python_version": "3.10.12",
+    "platform": "Linux-6.1.58+-x86_64-with-glibc2.35",
+    "processor": "x86_64"
+  },
+  "configuration": {
+    "upload_folder_exists": true,
+    "output_folder_exists": true,
+    "magic_pdf_installed": true
+  }
+}
+```
+### Client Example
+A Python client script (`api_client.py`) is included in this repository for easy integration:
+```bash
+# Example usage
+python api_client.py path/to/your/document.pdf --api-url https://marcosremar2-mineruapi.hf.space
+```
+The client includes features such as:
+- Automatic health check to verify API status
+- Retry logic for failed requests
+- Progress tracking
+- Comprehensive error handling
+You can also use curl:
+```bash
+curl -X POST -F "file=@path/to/your/document.pdf" https://marcosremar2-mineruapi.hf.space/api/convert
+```
+And check health with:
+```bash
+curl https://marcosremar2-mineruapi.hf.space/health
+```
+## Web Interface
+The Space also provides a web interface where you can:
+- Upload PDF files for conversion
+- View the generated Markdown and JSON
+- Download the converted files
+- View processing logs
+## Implementation Details
+This service uses:
+- MinerU for high-quality PDF extraction
+- PyMuPDF as a fallback conversion method
+- Flask web server for the interface and API
+- Docker container for deployment on Hugging Face Spaces
+## Error Handling
+The service includes robust error handling:
+- Automatic fallback to local PDF conversion if MinerU is unavailable
+- Detailed error messages and logs
+- API responses include comprehensive details for debugging
+## Learn More
+For more information about MinerU, visit [the MinerU repository](https://github.com/opendatalab/MinerU).

app.py ADDED Viewed

	@@ -0,0 +1,705 @@

+from flask import Flask, request, jsonify, render_template_string, redirect, url_for, send_from_directory
+import os
+import subprocess
+import tempfile
+import uuid
+import json
+import shutil
+import time
+import platform
+import sys
+from werkzeug.utils import secure_filename
+from flask_cors import CORS  # Add CORS support
+# Flask application object; all routes below are registered on it
+app = Flask(__name__)
+CORS(app)  # Enable CORS for all routes
+# Use user home directory for better permission handling
+USER_HOME = os.path.expanduser("~")
+# Folder used by the /sample route for its generated files
+UPLOAD_FOLDER = os.path.join(USER_HOME, 'pdf_uploads')
+# Each conversion job gets its own sub-directory (named by job id) in here
+OUTPUT_FOLDER = os.path.join(USER_HOME, 'pdf_output')
+# Create upload and output directories
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+os.makedirs(OUTPUT_FOLDER, exist_ok=True)
+# Version information (reported by the /health endpoint)
+APP_VERSION = "1.1.0"
+HTML_TEMPLATE = """
+<!DOCTYPE html>
+<html>
+<head>
+    <title>MinerU PDF Processing</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            max-width: 900px;
+            margin: 0 auto;
+            padding: 20px;
+            line-height: 1.6;
+        }
+        .container {
+            background-color: #f9f9f9;
+            padding: 20px;
+            border-radius: 8px;
+            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+            margin-bottom: 20px;
+        }
+        h1 {
+            color: #2c3e50;
+        }
+        pre {
+            background-color: #f1f1f1;
+            padding: 10px;
+            border-radius: 4px;
+            overflow-x: auto;
+            max-height: 300px;
+            overflow-y: auto;
+        }
+        .command {
+            font-family: monospace;
+            background-color: #eee;
+            padding: 5px;
+            border-radius: 3px;
+        }
+        .upload-form {
+            margin: 20px 0;
+            padding: 15px;
+            border: 1px solid #ddd;
+            border-radius: 8px;
+        }
+        .btn {
+            background-color: #4CAF50;
+            color: white;
+            padding: 8px 16px;
+            border: none;
+            border-radius: 4px;
+            cursor: pointer;
+            font-size: 16px;
+        }
+        .btn:hover {
+            background-color: #45a049;
+        }
+        .loading {
+            display: none;
+            color: #666;
+            margin-top: 10px;
+        }
+        .result-section {
+            margin-top: 20px;
+        }
+        .tab {
+            overflow: hidden;
+            border: 1px solid #ccc;
+            background-color: #f1f1f1;
+            margin-top: 20px;
+        }
+        .tab button {
+            background-color: inherit;
+            float: left;
+            border: none;
+            outline: none;
+            cursor: pointer;
+            padding: 10px 16px;
+            transition: 0.3s;
+        }
+        .tab button:hover {
+            background-color: #ddd;
+        }
+        .tab button.active {
+            background-color: #ccc;
+        }
+        .tabcontent {
+            display: none;
+            padding: 6px 12px;
+            border: 1px solid #ccc;
+            border-top: none;
+            max-height: 500px;
+            overflow-y: auto;
+            white-space: pre-wrap;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>MinerU PDF Processing Service</h1>
+        <p>This service uses MinerU to convert PDF documents to Markdown and JSON formats.</p>
+        <h2>GPU Status</h2>
+        <pre id="gpuStatus">Loading...</pre>
+        <div class="upload-form">
+            <h2>Convert PDF File</h2>
+            <form action="/convert" method="post" enctype="multipart/form-data" id="uploadForm">
+                <input type="file" name="file" accept=".pdf" required>
+                <button type="submit" class="btn">Convert PDF</button>
+            </form>
+            <div id="loadingIndicator" class="loading">Processing PDF file... This may take a minute.</div>
+        </div>
+        <div class="result-section" id="resultSection" style="display: none;">
+            <h2>Conversion Results</h2>
+            <div id="resultInfo"></div>
+            <div class="tab">
+                <button class="tablinks" onclick="openTab(event, 'Markdown')" id="defaultOpen">Markdown</button>
+                <button class="tablinks" onclick="openTab(event, 'JSON')">JSON</button>
+                <button class="tablinks" onclick="openTab(event, 'Log')">Processing Log</button>
+            </div>
+            <div id="Markdown" class="tabcontent">
+                <pre id="markdownContent"></pre>
+                <a id="downloadMarkdown" class="btn" style="margin-top: 10px;">Download Markdown</a>
+            </div>
+            <div id="JSON" class="tabcontent">
+                <pre id="jsonContent"></pre>
+                <a id="downloadJson" class="btn" style="margin-top: 10px;">Download JSON</a>
+            </div>
+            <div id="Log" class="tabcontent">
+                <pre id="logContent"></pre>
+            </div>
+        </div>
+        <h2>Available Commands</h2>
+        <p>MinerU provides the following commands:</p>
+        <p><span class="command">magic-pdf</span> - Process PDF documents</p>
+        <h2>Help Output</h2>
+        <pre id="helpOutput">Loading...</pre>
+    </div>
+    <script>
+        // Fetch GPU status
+        fetch('/gpu-status')
+            .then(response => response.json())
+            .then(data => {
+                document.getElementById('gpuStatus').textContent = data.output;
+            })
+            .catch(error => {
+                document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
+            });
+        // Fetch help output
+        fetch('/help-output')
+            .then(response => response.json())
+            .then(data => {
+                document.getElementById('helpOutput').textContent = data.output;
+            })
+            .catch(error => {
+                document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
+            });
+        // Tab functionality
+        function openTab(evt, tabName) {
+            var i, tabcontent, tablinks;
+            tabcontent = document.getElementsByClassName("tabcontent");
+            for (i = 0; i < tabcontent.length; i++) {
+                tabcontent[i].style.display = "none";
+            }
+            tablinks = document.getElementsByClassName("tablinks");
+            for (i = 0; i < tablinks.length; i++) {
+                tablinks[i].className = tablinks[i].className.replace(" active", "");
+            }
+            document.getElementById(tabName).style.display = "block";
+            evt.currentTarget.className += " active";
+        }
+        // Set up form submission
+        document.getElementById('uploadForm').addEventListener('submit', function(e) {
+            e.preventDefault();
+            const loadingIndicator = document.getElementById('loadingIndicator');
+            loadingIndicator.style.display = 'block';
+            const resultSection = document.getElementById('resultSection');
+            resultSection.style.display = 'none';
+            const formData = new FormData(this);
+            fetch('/convert', {
+                method: 'POST',
+                body: formData
+            })
+            .then(response => response.json())
+            .then(data => {
+                loadingIndicator.style.display = 'none';
+                resultSection.style.display = 'block';
+                document.getElementById('resultInfo').textContent = data.message;
+                // Handle Markdown content
+                if (data.markdown) {
+                    document.getElementById('markdownContent').textContent = data.markdown;
+                    const downloadMarkdown = document.getElementById('downloadMarkdown');
+                    downloadMarkdown.href = data.markdown_url;
+                    downloadMarkdown.download = data.base_filename + '.md';
+                }
+                // Handle JSON content
+                if (data.json) {
+                    document.getElementById('jsonContent').textContent = JSON.stringify(data.json, null, 2);
+                    const downloadJson = document.getElementById('downloadJson');
+                    downloadJson.href = data.json_url;
+                    downloadJson.download = data.base_filename + '.json';
+                }
+                // Handle log content
+                if (data.log) {
+                    document.getElementById('logContent').textContent = data.log;
+                }
+                // Open the markdown tab by default
+                document.getElementById('defaultOpen').click();
+            })
+            .catch(error => {
+                loadingIndicator.style.display = 'none';
+                alert('Error: ' + error.message);
+            });
+        });
+    </script>
+</body>
+</html>
+"""
+@app.route('/')
+def index():
+    return render_template_string(HTML_TEMPLATE)
+@app.route('/gpu-status')
+def gpu_status():
+    import subprocess
+    try:
+        output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT).decode('utf-8')
+    except subprocess.CalledProcessError as e:
+        output = f"Error running nvidia-smi: {e.output.decode('utf-8')}"
+    except FileNotFoundError:
+        output = "nvidia-smi command not found. GPU may not be available."
+    return jsonify({"output": output})
+@app.route('/help-output')
+def help_output():
+    import subprocess
+    try:
+        output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT).decode('utf-8')
+    except subprocess.CalledProcessError as e:
+        output = f"Error running magic-pdf --help: {e.output.decode('utf-8')}"
+    except FileNotFoundError:
+        output = "magic-pdf command not found. MinerU may not be installed correctly."
+    return jsonify({"output": output})
+@app.route('/convert', methods=['POST'])
+def convert_pdf():
+    if 'file' not in request.files:
+        return jsonify({"error": "No file part"}), 400
+    file = request.files['file']
+    if file.filename == '':
+        return jsonify({"error": "No selected file"}), 400
+    if not file.filename.lower().endswith('.pdf'):
+        return jsonify({"error": "File must be a PDF"}), 400
+    # Generate a unique ID for this conversion
+    job_id = str(uuid.uuid4())
+    job_dir = os.path.join(OUTPUT_FOLDER, job_id)
+    os.makedirs(job_dir, exist_ok=True)
+    # Save the uploaded file
+    filename = secure_filename(file.filename)
+    base_filename = os.path.splitext(filename)[0]
+    pdf_path = os.path.join(job_dir, filename)
+    file.save(pdf_path)
+    # Run magic-pdf on the file
+    output_dir = os.path.join(job_dir, "output")
+    os.makedirs(output_dir, exist_ok=True)
+    log_file = os.path.join(job_dir, "conversion.log")
+    try:
+        # Define the default config dictionary first
+        default_config = {
+            "device-mode": "cpu",
+            "layout-config": {
+                "model": "doclayout_yolo",
+                "enable": True
+            },
+            "formula-config": {
+                "mfd_model": "yolo_v8_mfd",
+                "mfr_model": "unimernet_small",
+                "enable": True
+            },
+            "table-config": {
+                "model": "rapid_table",
+                "sub_model": "slanet_plus",
+                "enable": True,
+                "max_time": 400
+            }
+        }
+        # Create the magic-pdf.json configuration file in .config if it doesn't exist
+        config_dir = os.path.expanduser("~/.config/magic_pdf")
+        os.makedirs(config_dir, exist_ok=True)
+        config_file = os.path.join(config_dir, "magic-pdf.json")
+        if not os.path.exists(config_file):
+            with open(config_file, 'w') as f:
+                json.dump(default_config, f, indent=2)
+        # Also create the config in the home directory as fallback
+        home_config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
+        if not os.path.exists(home_config_file):
+            with open(home_config_file, 'w') as f:
+                json.dump(default_config, f, indent=2)
+        # Add a small delay to ensure config file is written before magic-pdf runs
+        time.sleep(0.5)
+        # Use magic-pdf to convert the PDF to Markdown and JSON
+        cmd = [
+            'magic-pdf',
+            '--path', pdf_path,
+            '--output-dir', output_dir
+        ]
+        # Run the command and capture output
+        with open(log_file, 'w') as f:
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                text=True,
+                bufsize=1
+            )
+            # Write process output to log file in real-time
+            for line in process.stdout:
+                f.write(line)
+                f.flush()
+            process.wait()
+            if process.returncode != 0:
+                return jsonify({
+                    "error": f"PDF conversion failed with code {process.returncode}",
+                    "log": open(log_file, 'r').read()
+                }), 500
+        # Get the generated markdown and JSON
+        markdown_file = os.path.join(output_dir, f"{base_filename}.md")
+        json_file = os.path.join(output_dir, f"{base_filename}.json")
+        # Check if the output files exist
+        markdown_content = ""
+        json_content = {}
+        if os.path.exists(markdown_file):
+            with open(markdown_file, 'r', encoding='utf-8') as f:
+                markdown_content = f.read()
+        if os.path.exists(json_file):
+            with open(json_file, 'r', encoding='utf-8') as f:
+                json_content = json.load(f)
+        # Read the log file
+        with open(log_file, 'r') as f:
+            log_content = f.read()
+        # Copy the output files to a location accessible for download
+        output_markdown = os.path.join(job_dir, f"{base_filename}.md")
+        output_json = os.path.join(job_dir, f"{base_filename}.json")
+        if os.path.exists(markdown_file):
+            shutil.copy(markdown_file, output_markdown)
+        if os.path.exists(json_file):
+            shutil.copy(json_file, output_json)
+        # Return the conversion results
+        return jsonify({
+            "message": f"PDF '{filename}' converted successfully",
+            "markdown": markdown_content,
+            "json": json_content,
+            "log": log_content,
+            "base_filename": base_filename,
+            "job_id": job_id,
+            "markdown_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.md"),
+            "json_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.json")
+        })
+    except Exception as e:
+        import traceback
+        error_details = traceback.format_exc()
+        return jsonify({
+            "error": f"Failed to convert PDF: {str(e)}",
+            "details": error_details
+        }), 500
+@app.route('/download/<job_id>/<filename>')
+def download_file(job_id, filename):
+    job_dir = os.path.join(OUTPUT_FOLDER, job_id)
+    return send_from_directory(job_dir, filename)
+# Add a sample PDF for testing
+@app.route('/sample')
+def add_sample():
+    try:
+        # Create a tiny text-only PDF using Podofoimpose (if available) or other method
+        sample_dir = os.path.join(UPLOAD_FOLDER, 'sample')
+        os.makedirs(sample_dir, exist_ok=True)
+        sample_path = os.path.join(sample_dir, 'sample.pdf')
+        # Use simple text for the sample
+        with open(os.path.join(sample_dir, 'sample.txt'), 'w') as f:
+            f.write("This is a sample PDF for testing MinerU.\n\nIt contains simple text to demonstrate the PDF to Markdown and JSON conversion capabilities.")
+        # Try to convert the text to PDF if possible
+        try:
+            subprocess.run(['convert', '-size', '612x792', 'caption:@' + os.path.join(sample_dir, 'sample.txt'), sample_path])
+        except:
+            # If ImageMagick's convert fails, try another approach
+            return jsonify({"error": "Could not create sample PDF. Please upload your own PDF file."}), 500
+        return jsonify({"message": "Sample PDF created", "path": sample_path})
+    except Exception as e:
+        return jsonify({"error": f"Failed to create sample PDF: {str(e)}"}), 500
+@app.route('/health')
+def health_check():
+    """
+    Health check endpoint for monitoring.
+    Returns basic information about the service status.
+    """
+    try:
+        # Check if magic-pdf command exists
+        has_magic_pdf = False
+        try:
+            subprocess.run(['magic-pdf', '--version'], capture_output=True, check=False)
+            has_magic_pdf = True
+        except FileNotFoundError:
+            pass
+        # Get runtime information
+        health_info = {
+            'status': 'healthy',
+            'version': APP_VERSION,
+            'environment': {
+                'python_version': platform.python_version(),
+                'platform': platform.platform(),
+                'processor': platform.processor()
+            },
+            'configuration': {
+                'upload_folder_exists': os.path.exists(UPLOAD_FOLDER),
+                'output_folder_exists': os.path.exists(OUTPUT_FOLDER),
+                'magic_pdf_installed': has_magic_pdf
+            }
+        }
+        return jsonify(health_info)
+    except Exception as e:
+        return jsonify({
+            'status': 'unhealthy',
+            'error': str(e)
+        }), 500
+@app.route('/api/convert', methods=['POST'])
+def api_convert_pdf():
+    """
+    API endpoint for programmatic access to PDF conversion.
+    Request:
+        - POST request with 'file' field containing PDF file
+    Response:
+        - JSON with conversion results
+    """
+    # Validate request
+    if 'file' not in request.files:
+        return jsonify({
+            'success': False,
+            'error': 'No file provided. Please upload a PDF file.'
+        }), 400
+    file = request.files['file']
+    if file.filename == '':
+        return jsonify({
+            'success': False,
+            'error': 'No file selected. Please select a PDF file.'
+        }), 400
+    # Check if the file is a PDF
+    if not file.filename.lower().endswith('.pdf'):
+        return jsonify({
+            'success': False,
+            'error': 'Invalid file format. Please upload a PDF file.'
+        }), 400
+    # Generate a job ID
+    job_id = str(uuid.uuid4())
+    # Create job directory
+    job_dir = os.path.join(OUTPUT_FOLDER, job_id)
+    os.makedirs(job_dir, exist_ok=True)
+    # Save the uploaded file
+    filename = secure_filename(file.filename)
+    base_filename, _ = os.path.splitext(filename)
+    pdf_path = os.path.join(job_dir, filename)
+    file.save(pdf_path)
+    try:
+        # Define the default config dictionary first
+        default_config = {
+            "device-mode": "cpu",
+            "layout-config": {
+                "model": "doclayout_yolo",
+                "enable": True
+            },
+            "formula-config": {
+                "mfd_model": "yolo_v8_mfd",
+                "mfr_model": "unimernet_small",
+                "enable": True
+            },
+            "table-config": {
+                "model": "rapid_table",
+                "sub_model": "slanet_plus",
+                "enable": True,
+                "max_time": 400
+            }
+        }
+        # Create the magic-pdf.json configuration file in .config if it doesn't exist
+        config_dir = os.path.expanduser("~/.config/magic_pdf")
+        os.makedirs(config_dir, exist_ok=True)
+        config_file = os.path.join(config_dir, "magic-pdf.json")
+        if not os.path.exists(config_file):
+            with open(config_file, 'w') as f:
+                json.dump(default_config, f, indent=2)
+        # Also create the config in the home directory as fallback
+        home_config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
+        if not os.path.exists(home_config_file):
+            with open(home_config_file, 'w') as f:
+                json.dump(default_config, f, indent=2)
+        # Add a small delay to ensure config file is written before magic-pdf runs
+        time.sleep(0.5)
+        # Log the conversion process
+        log_file = os.path.join(job_dir, "conversion.log")
+        with open(log_file, "w") as log:
+            # Run the MinerU magic-pdf command with correct parameters
+            command = ["magic-pdf", "--path", pdf_path, "--output-dir", job_dir]
+            process = subprocess.Popen(
+                command,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                universal_newlines=True
+            )
+            output = []
+            for line in process.stdout:
+                output.append(line)
+                log.write(line)
+                log.flush()
+            process.wait()
+            exit_code = process.returncode
+        if exit_code != 0:
+            error_message = ''.join(output) if output else "Unknown error during PDF conversion"
+            return jsonify({
+                'success': False,
+                'error': 'PDF conversion failed. Please check the log for details.',
+                'log': error_message,
+                'exit_code': exit_code
+            }), 500
+        # Check for output files
+        markdown_file = os.path.join(job_dir, f"{base_filename}.md")
+        json_file = os.path.join(job_dir, f"{base_filename}.json")
+        # If files don't exist in the job directory, check the same directory as the PDF
+        pdf_dir = os.path.dirname(pdf_path)
+        if not os.path.exists(markdown_file):
+            alt_markdown_file = os.path.join(pdf_dir, f"{base_filename}.md")
+            if os.path.exists(alt_markdown_file):
+                markdown_file = alt_markdown_file
+            else:
+                # Try to find any markdown file in the output directory
+                md_files = [f for f in os.listdir(job_dir) if f.endswith('.md')]
+                if md_files:
+                    markdown_file = os.path.join(job_dir, md_files[0])
+        if not os.path.exists(json_file):
+            alt_json_file = os.path.join(pdf_dir, f"{base_filename}.json")
+            if os.path.exists(alt_json_file):
+                json_file = alt_json_file
+            else:
+                # Try to find any JSON file in the output directory
+                json_files = [f for f in os.listdir(job_dir) if f.endswith('.json')]
+                if json_files:
+                    json_file = os.path.join(job_dir, json_files[0])
+        # Read markdown content
+        markdown_content = ""
+        if os.path.exists(markdown_file):
+            with open(markdown_file, 'r', encoding='utf-8') as f:
+                markdown_content = f.read()
+        else:
+            print(f"Warning: Markdown file not found at {markdown_file}")
+        # Read JSON content
+        json_content = {}
+        if os.path.exists(json_file):
+            with open(json_file, 'r', encoding='utf-8') as f:
+                json_content = json.load(f)
+        else:
+            print(f"Warning: JSON file not found at {json_file}")
+        # Read log content
+        log_content = ""
+        with open(log_file, 'r', encoding='utf-8') as f:
+            log_content = f.read()
+        # Create the result
+        result = {
+            'success': True,
+            'message': 'PDF conversion successful',
+            'job_id': job_id,
+            'base_filename': base_filename,
+            'file_info': {
+                'original_filename': filename,
+                'size_bytes': os.path.getsize(pdf_path),
+                'content_type': 'application/pdf'
+            },
+            'markdown': markdown_content,
+            'json': json_content,
+            'log': log_content,
+            'files': {
+                'markdown_path': os.path.basename(markdown_file) if os.path.exists(markdown_file) else None,
+                'json_path': os.path.basename(json_file) if os.path.exists(json_file) else None
+            }
+        }
+        return jsonify(result)
+    except Exception as e:
+        import traceback
+        error_details = traceback.format_exc()
+        return jsonify({
+            'success': False,
+            'error': f'An error occurred during PDF conversion: {str(e)}',
+            'details': error_details,
+            'job_id': job_id
+        }), 500
+# Script entry point: when this file is executed directly (not imported),
+# start Flask's built-in server on all interfaces, port 7860, with debug off.
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860, debug=False)

entrypoint.sh ADDED Viewed

	@@ -0,0 +1,202 @@

+#!/bin/bash
+set -e
+# Activate virtual environment
+source /opt/mineru_venv/bin/activate
+# Display GPU information
+echo "Checking NVIDIA GPU status:"
+nvidia-smi
+# Display MinerU version
+echo "MinerU version:"
+magic-pdf --version
+# Create the MinerU config directory and a samples directory
+mkdir -p "$HOME/.config/magic_pdf"
+# Prefer /app/samples; fall back to /tmp/samples when it cannot be created or
+# is not writable. The fallback is created explicitly: a pre-existing but
+# read-only /app/samples makes `mkdir -p` succeed, which would otherwise leave
+# /tmp/samples missing and break every write below.
+SAMPLES_DIR="/app/samples"
+if ! mkdir -p "$SAMPLES_DIR" 2>/dev/null || [ ! -w "$SAMPLES_DIR" ]; then
+    SAMPLES_DIR="/tmp/samples"
+    mkdir -p "$SAMPLES_DIR"
+fi
+# Download a sample PDF for testing if a non-empty one doesn't exist yet.
+# `-s` (not `-f`) is used because a failed `wget -O` leaves a zero-byte file
+# behind, which must not count as "already downloaded" on the next start.
+if [ ! -s "$SAMPLES_DIR/sample.pdf" ]; then
+    echo "Downloading sample PDF for testing..."
+    # Download a paper from arXiv; `|| true` keeps a failed download from
+    # aborting the script under `set -e`.
+    wget -q "https://arxiv.org/pdf/2201.08239.pdf" -O "$SAMPLES_DIR/sample.pdf" || true
+    # If that fails, try another source
+    if [ ! -s "$SAMPLES_DIR/sample.pdf" ]; then
+        wget -q "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" -O "$SAMPLES_DIR/sample.pdf" || true
+    fi
+    # If both fail, create a simple PDF with text
+    if [ ! -s "$SAMPLES_DIR/sample.pdf" ]; then
+        echo "Failed to download sample PDF, creating a simple PDF text file..."
+        echo "This is a sample PDF document for testing MinerU.
+        MinerU is a high-quality tool for converting PDF to Markdown and JSON formats.
+        This file was created for testing purposes." > "$SAMPLES_DIR/sample.txt"
+        # Render the text to PDF with ImageMagick's `convert` when available.
+        # The sample is optional, so a conversion failure only warns.
+        if command -v convert &> /dev/null; then
+            convert -size 612x792 -background white -fill black caption:@"$SAMPLES_DIR/sample.txt" "$SAMPLES_DIR/sample.pdf" \
+                || echo "WARNING: Could not create a sample PDF file automatically."
+        else
+            echo "WARNING: Could not create a sample PDF file automatically."
+        fi
+    fi
+fi
+# Create the magic-pdf.json config file
+CONFIG_DIR="$HOME/.config/magic_pdf"
+mkdir -p "$CONFIG_DIR"
+if [ ! -f "$CONFIG_DIR/magic-pdf.json" ]; then
+    echo "Creating magic-pdf.json configuration file..."
+    cat > "$CONFIG_DIR/magic-pdf.json" << EOF
+{
+    "device-mode": "gpu",
+    "layout-config": {
+        "model": "doclayout_yolo",
+        "enable": true
+    },
+    "formula-config": {
+        "mfd_model": "yolo_v8_mfd",
+        "mfr_model": "unimernet_small",
+        "enable": true
+    },
+    "table-config": {
+        "model": "rapid_table",
+        "sub_model": "slanet_plus",
+        "enable": true,
+        "max_time": 400
+    }
+}
+EOF
+fi
+# Start the Flask application if it exists, otherwise provide a shell
+if [ -f "/app/app.py" ]; then
+    echo "Starting Flask application..."
+    python /app/app.py
+else
+    echo "No app.py found. Starting a simple server..."
+    # Create a simple server that shows MinerU is installed
+    TMP_APP_PATH="$HOME/simple_app.py"
+    cat > "$TMP_APP_PATH" << 'EOF'
+from flask import Flask, request, jsonify, render_template_string
+app = Flask(__name__)
+HTML_TEMPLATE = """
+<!DOCTYPE html>
+<html>
+<head>
+    <title>MinerU PDF Processing</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 20px;
+        }
+        .container {
+            background-color: #f9f9f9;
+            padding: 20px;
+            border-radius: 8px;
+            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        }
+        h1 {
+            color: #2c3e50;
+        }
+        pre {
+            background-color: #f1f1f1;
+            padding: 10px;
+            border-radius: 4px;
+            overflow-x: auto;
+        }
+        .command {
+            font-family: monospace;
+            background-color: #eee;
+            padding: 5px;
+            border-radius: 3px;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>MinerU PDF Processing Service</h1>
+        <p>This Space provides PDF processing capabilities using MinerU.</p>
+        <h2>GPU Status</h2>
+        <pre id="gpuStatus">Loading...</pre>
+        <h2>Available Commands</h2>
+        <p>MinerU provides the following commands:</p>
+        <p><span class="command">magic-pdf</span> - Process PDF documents</p>
+        <h2>Help Output</h2>
+        <pre id="helpOutput">Loading...</pre>
+    </div>
+    <script>
+        // Fetch GPU status
+        fetch('/gpu-status')
+            .then(response => response.json())
+            .then(data => {
+                document.getElementById('gpuStatus').textContent = data.output;
+            })
+            .catch(error => {
+                document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
+            });
+        // Fetch help output
+        fetch('/help-output')
+            .then(response => response.json())
+            .then(data => {
+                document.getElementById('helpOutput').textContent = data.output;
+            })
+            .catch(error => {
+                document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
+            });
+    </script>
+</body>
+</html>
+"""
+@app.route('/')
+def index():
+    return render_template_string(HTML_TEMPLATE)
+@app.route('/gpu-status')
+def gpu_status():
+    import subprocess
+    try:
+        output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT).decode('utf-8')
+    except subprocess.CalledProcessError as e:
+        output = f"Error running nvidia-smi: {e.output.decode('utf-8')}"
+    except FileNotFoundError:
+        output = "nvidia-smi command not found. GPU may not be available."
+    return jsonify({"output": output})
+@app.route('/help-output')
+def help_output():
+    import subprocess
+    try:
+        output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT).decode('utf-8')
+    except subprocess.CalledProcessError as e:
+        output = f"Error running magic-pdf --help: {e.output.decode('utf-8')}"
+    except FileNotFoundError:
+        output = "magic-pdf command not found. MinerU may not be installed correctly."
+    return jsonify({"output": output})
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860)
+EOF
+    python "$TMP_APP_PATH"
+fi

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+flask==2.3.3
+transformers>=4.37.0
+torch>=2.0.0
+sentencepiece>=0.1.99
+requests>=2.31.0
+accelerate>=0.25.0
+einops>=0.6.0
+packaging>=23.0
+werkzeug>=2.3.0
+flask-cors>=4.0.0

space_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "runtime": "docker",
+    "hardware": "nvidia-l4-1x-16gb"
+}