Spaces:

marcosremar2
/

MinerUapi

Runtime error

App Files Community

marcosremar2 committed on May 2, 2025

Commit

128728a

1 Parent(s): a7cd086

Fix MinerU PDF API by adding model download and explicit configuration

Browse files

Files changed (5) hide show

.gitattributes +6 -32
Dockerfile +27 -4
app.py +303 -593
entrypoint.sh +69 -144
requirements.txt +8 -8

.gitattributes CHANGED Viewed

@@ -1,35 +1,9 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.pth filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text

Dockerfile CHANGED Viewed

@@ -44,7 +44,7 @@ RUN pip install --upgrade pip
 RUN git clone https://github.com/opendatalab/MinerU.git /tmp/MinerU
 # Install required packages
-RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
 # Install MinerU with all features
 WORKDIR /tmp/MinerU
@@ -57,14 +57,37 @@ RUN pip install --no-cache-dir flask==2.3.3 flask-cors==4.0.0 werkzeug==2.3.7
 RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output
 RUN mkdir -p /tmp/samples
 # Create a non-root user for Hugging Face Spaces
 # This is critical for permissions on HF Spaces
 RUN useradd -m -u 1000 user
 RUN mkdir -p /app/samples && chown -R user:user /app
-# Download model weights
-RUN echo "Downloading MinerU model weights..."
-# This step will automatically download model weights during the first run
 # Copy the application files
 WORKDIR /app

 RUN git clone https://github.com/opendatalab/MinerU.git /tmp/MinerU
 # Install required packages
+RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
 # Install MinerU with all features
 WORKDIR /tmp/MinerU
 RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output
 RUN mkdir -p /tmp/samples
+# Create models directory structure
+RUN mkdir -p /tmp/models/MFD/YOLO
+RUN mkdir -p /tmp/models/MFR/unimernet
+RUN mkdir -p /tmp/models/table/rapid
+RUN mkdir -p /tmp/models/layout/doclayout
+# Download model weights
+RUN echo "Downloading MinerU model weights..."
+WORKDIR /tmp/models
+# Download the YOLO model for formula detection
+RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfd/yolo_v8_mfd.pt -O /tmp/models/MFD/YOLO/yolo_v8_ft.pt || echo "Failed to download yolo_v8_ft.pt"
+RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfd/yolo_v8_mfd.pt -O /tmp/models/MFD/YOLO/yolo_v8_mfd.pt || echo "Failed to download yolo_v8_mfd.pt"
+# Download the UniMERNet model for formula recognition
+RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfr/unimernet_small.pth -O /tmp/models/MFR/unimernet/unimernet_small.pth || echo "Failed to download unimernet_small.pth"
+# Download the Rapid Table detection model
+RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/table/rapid_table.pt -O /tmp/models/table/rapid/rapid_table.pt || echo "Failed to download rapid_table.pt"
+RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/table/slanet_plus.pt -O /tmp/models/table/rapid/slanet_plus.pt || echo "Failed to download slanet_plus.pt"
+# Download the DocLayout YOLO model
+RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/layout/doclayout_yolo.pt -O /tmp/models/layout/doclayout/doclayout_yolo.pt || echo "Failed to download doclayout_yolo.pt"
 # Create a non-root user for Hugging Face Spaces
 # This is critical for permissions on HF Spaces
 RUN useradd -m -u 1000 user
 RUN mkdir -p /app/samples && chown -R user:user /app
+# Set permissions for model files
+RUN chown -R user:user /tmp/models
 # Copy the application files
 WORKDIR /app

app.py CHANGED Viewed

@@ -1,705 +1,415 @@
-from flask import Flask, request, jsonify, render_template_string, redirect, url_for, send_from_directory
 import os
 import subprocess
 import tempfile
-import uuid
 import json
-import shutil
 import time
-import platform
-import sys
-from werkzeug.utils import secure_filename
-from flask_cors import CORS  # Add CORS support
 app = Flask(__name__)
-CORS(app)  # Enable CORS for all routes
-# Use user home directory for better permission handling
-USER_HOME = os.path.expanduser("~")
-UPLOAD_FOLDER = os.path.join(USER_HOME, 'pdf_uploads')
-OUTPUT_FOLDER = os.path.join(USER_HOME, 'pdf_output')
-# Create upload and output directories
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(OUTPUT_FOLDER, exist_ok=True)
-# Version information
-APP_VERSION = "1.1.0"
 HTML_TEMPLATE = """
 <!DOCTYPE html>
 <html>
 <head>
-    <title>MinerU PDF Processing</title>
     <style>
         body {
-            font-family: Arial, sans-serif;
-            max-width: 900px;
             margin: 0 auto;
             padding: 20px;
-            line-height: 1.6;
         }
         .container {
-            background-color: #f9f9f9;
             padding: 20px;
             border-radius: 8px;
             box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-            margin-bottom: 20px;
         }
-        h1 {
-            color: #2c3e50;
         }
-        pre {
-            background-color: #f1f1f1;
             padding: 10px;
-            border-radius: 4px;
-            overflow-x: auto;
-            max-height: 300px;
-            overflow-y: auto;
-        }
-        .command {
-            font-family: monospace;
-            background-color: #eee;
-            padding: 5px;
             border-radius: 3px;
         }
-        .upload-form {
             margin: 20px 0;
-            padding: 15px;
-            border: 1px solid #ddd;
-            border-radius: 8px;
         }
-        .btn {
-            background-color: #4CAF50;
             color: white;
-            padding: 8px 16px;
             border: none;
             border-radius: 4px;
             cursor: pointer;
-            font-size: 16px;
         }
-        .btn:hover {
-            background-color: #45a049;
         }
-        .loading {
-            display: none;
-            color: #666;
-            margin-top: 10px;
-        }
-        .result-section {
             margin-top: 20px;
         }
-        .tab {
-            overflow: hidden;
-            border: 1px solid #ccc;
-            background-color: #f1f1f1;
-            margin-top: 20px;
         }
-        .tab button {
-            background-color: inherit;
-            float: left;
-            border: none;
-            outline: none;
-            cursor: pointer;
-            padding: 10px 16px;
-            transition: 0.3s;
         }
-        .tab button:hover {
-            background-color: #ddd;
         }
-        .tab button.active {
-            background-color: #ccc;
-        }
-        .tabcontent {
-            display: none;
-            padding: 6px 12px;
-            border: 1px solid #ccc;
-            border-top: none;
-            max-height: 500px;
-            overflow-y: auto;
-            white-space: pre-wrap;
         }
     </style>
 </head>
 <body>
     <div class="container">
-        <h1>MinerU PDF Processing Service</h1>
-        <p>This service uses MinerU to convert PDF documents to Markdown and JSON formats.</p>
-        <h2>GPU Status</h2>
-        <pre id="gpuStatus">Loading...</pre>
-        <div class="upload-form">
-            <h2>Convert PDF File</h2>
-            <form action="/convert" method="post" enctype="multipart/form-data" id="uploadForm">
-                <input type="file" name="file" accept=".pdf" required>
-                <button type="submit" class="btn">Convert PDF</button>
-            </form>
-            <div id="loadingIndicator" class="loading">Processing PDF file... This may take a minute.</div>
         </div>
-        <div class="result-section" id="resultSection" style="display: none;">
-            <h2>Conversion Results</h2>
-            <div id="resultInfo"></div>
-            <div class="tab">
-                <button class="tablinks" onclick="openTab(event, 'Markdown')" id="defaultOpen">Markdown</button>
-                <button class="tablinks" onclick="openTab(event, 'JSON')">JSON</button>
-                <button class="tablinks" onclick="openTab(event, 'Log')">Processing Log</button>
-            </div>
-            <div id="Markdown" class="tabcontent">
-                <pre id="markdownContent"></pre>
-                <a id="downloadMarkdown" class="btn" style="margin-top: 10px;">Download Markdown</a>
-            </div>
-            <div id="JSON" class="tabcontent">
-                <pre id="jsonContent"></pre>
-                <a id="downloadJson" class="btn" style="margin-top: 10px;">Download JSON</a>
-            </div>
-            <div id="Log" class="tabcontent">
-                <pre id="logContent"></pre>
-            </div>
         </div>
-        <h2>Available Commands</h2>
-        <p>MinerU provides the following commands:</p>
-        <p><span class="command">magic-pdf</span> - Process PDF documents</p>
-        <h2>Help Output</h2>
-        <pre id="helpOutput">Loading...</pre>
     </div>
     <script>
-        // Fetch GPU status
-        fetch('/gpu-status')
-            .then(response => response.json())
-            .then(data => {
-                document.getElementById('gpuStatus').textContent = data.output;
-            })
-            .catch(error => {
-                document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
-            });
-        // Fetch help output
-        fetch('/help-output')
-            .then(response => response.json())
-            .then(data => {
-                document.getElementById('helpOutput').textContent = data.output;
-            })
-            .catch(error => {
-                document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
-            });
-        // Tab functionality
-        function openTab(evt, tabName) {
-            var i, tabcontent, tablinks;
-            tabcontent = document.getElementsByClassName("tabcontent");
-            for (i = 0; i < tabcontent.length; i++) {
-                tabcontent[i].style.display = "none";
-            }
-            tablinks = document.getElementsByClassName("tablinks");
-            for (i = 0; i < tablinks.length; i++) {
-                tablinks[i].className = tablinks[i].className.replace(" active", "");
             }
-            document.getElementById(tabName).style.display = "block";
-            evt.currentTarget.className += " active";
-        }
-        // Set up form submission
-        document.getElementById('uploadForm').addEventListener('submit', function(e) {
-            e.preventDefault();
-            const loadingIndicator = document.getElementById('loadingIndicator');
-            loadingIndicator.style.display = 'block';
-            const resultSection = document.getElementById('resultSection');
-            resultSection.style.display = 'none';
-            const formData = new FormData(this);
-            fetch('/convert', {
-                method: 'POST',
-                body: formData
-            })
-            .then(response => response.json())
-            .then(data => {
-                loadingIndicator.style.display = 'none';
-                resultSection.style.display = 'block';
-                document.getElementById('resultInfo').textContent = data.message;
-                // Handle Markdown content
-                if (data.markdown) {
-                    document.getElementById('markdownContent').textContent = data.markdown;
-                    const downloadMarkdown = document.getElementById('downloadMarkdown');
-                    downloadMarkdown.href = data.markdown_url;
-                    downloadMarkdown.download = data.base_filename + '.md';
-                }
-                // Handle JSON content
-                if (data.json) {
-                    document.getElementById('jsonContent').textContent = JSON.stringify(data.json, null, 2);
-                    const downloadJson = document.getElementById('downloadJson');
-                    downloadJson.href = data.json_url;
-                    downloadJson.download = data.base_filename + '.json';
                 }
-                // Handle log content
-                if (data.log) {
-                    document.getElementById('logContent').textContent = data.log;
-                }
-                // Open the markdown tab by default
-                document.getElementById('defaultOpen').click();
-            })
-            .catch(error => {
-                loadingIndicator.style.display = 'none';
-                alert('Error: ' + error.message);
-            });
         });
     </script>
 </body>
 </html>
 """
 @app.route('/')
 def index():
-    return render_template_string(HTML_TEMPLATE)
 @app.route('/gpu-status')
 def gpu_status():
-    import subprocess
     try:
-        output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT).decode('utf-8')
-    except subprocess.CalledProcessError as e:
-        output = f"Error running nvidia-smi: {e.output.decode('utf-8')}"
-    except FileNotFoundError:
-        output = "nvidia-smi command not found. GPU may not be available."
     return jsonify({"output": output})
 @app.route('/help-output')
 def help_output():
-    import subprocess
     try:
-        output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT).decode('utf-8')
-    except subprocess.CalledProcessError as e:
-        output = f"Error running magic-pdf --help: {e.output.decode('utf-8')}"
-    except FileNotFoundError:
-        output = "magic-pdf command not found. MinerU may not be installed correctly."
     return jsonify({"output": output})
-@app.route('/convert', methods=['POST'])
 def convert_pdf():
     if 'file' not in request.files:
-        return jsonify({"error": "No file part"}), 400
     file = request.files['file']
-    if file.filename == '':
-        return jsonify({"error": "No selected file"}), 400
-    if not file.filename.lower().endswith('.pdf'):
-        return jsonify({"error": "File must be a PDF"}), 400
-    # Generate a unique ID for this conversion
-    job_id = str(uuid.uuid4())
-    job_dir = os.path.join(OUTPUT_FOLDER, job_id)
-    os.makedirs(job_dir, exist_ok=True)
-    # Save the uploaded file
-    filename = secure_filename(file.filename)
-    base_filename = os.path.splitext(filename)[0]
-    pdf_path = os.path.join(job_dir, filename)
-    file.save(pdf_path)
-    # Run magic-pdf on the file
-    output_dir = os.path.join(job_dir, "output")
-    os.makedirs(output_dir, exist_ok=True)
-    log_file = os.path.join(job_dir, "conversion.log")
     try:
-        # Define the default config dictionary first
-        default_config = {
-            "device-mode": "cpu",
-            "layout-config": {
-                "model": "doclayout_yolo",
-                "enable": True
-            },
-            "formula-config": {
-                "mfd_model": "yolo_v8_mfd",
-                "mfr_model": "unimernet_small",
-                "enable": True
-            },
-            "table-config": {
-                "model": "rapid_table",
-                "sub_model": "slanet_plus",
-                "enable": True,
-                "max_time": 400
-            }
-        }
-        # Create the magic-pdf.json configuration file in .config if it doesn't exist
-        config_dir = os.path.expanduser("~/.config/magic_pdf")
-        os.makedirs(config_dir, exist_ok=True)
-        config_file = os.path.join(config_dir, "magic-pdf.json")
-        if not os.path.exists(config_file):
-            with open(config_file, 'w') as f:
-                json.dump(default_config, f, indent=2)
-        # Also create the config in the home directory as fallback
-        home_config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
-        if not os.path.exists(home_config_file):
-            with open(home_config_file, 'w') as f:
-                json.dump(default_config, f, indent=2)
-        # Add a small delay to ensure config file is written before magic-pdf runs
-        time.sleep(0.5)
-        # Use magic-pdf to convert the PDF to Markdown and JSON
-        cmd = [
-            'magic-pdf',
-            '--path', pdf_path,
-            '--output-dir', output_dir
-        ]
-        # Run the command and capture output
-        with open(log_file, 'w') as f:
-            process = subprocess.Popen(
-                cmd,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-                bufsize=1
-            )
-            # Write process output to log file in real-time
-            for line in process.stdout:
-                f.write(line)
-                f.flush()
-            process.wait()
             if process.returncode != 0:
                 return jsonify({
-                    "error": f"PDF conversion failed with code {process.returncode}",
-                    "log": open(log_file, 'r').read()
                 }), 500
-        # Get the generated markdown and JSON
-        markdown_file = os.path.join(output_dir, f"{base_filename}.md")
-        json_file = os.path.join(output_dir, f"{base_filename}.json")
-        # Check if the output files exist
-        markdown_content = ""
-        json_content = {}
-        if os.path.exists(markdown_file):
-            with open(markdown_file, 'r', encoding='utf-8') as f:
-                markdown_content = f.read()
-        if os.path.exists(json_file):
-            with open(json_file, 'r', encoding='utf-8') as f:
-                json_content = json.load(f)
-        # Read the log file
-        with open(log_file, 'r') as f:
-            log_content = f.read()
-        # Copy the output files to a location accessible for download
-        output_markdown = os.path.join(job_dir, f"{base_filename}.md")
-        output_json = os.path.join(job_dir, f"{base_filename}.json")
-        if os.path.exists(markdown_file):
-            shutil.copy(markdown_file, output_markdown)
-        if os.path.exists(json_file):
-            shutil.copy(json_file, output_json)
-        # Return the conversion results
-        return jsonify({
-            "message": f"PDF '{filename}' converted successfully",
-            "markdown": markdown_content,
-            "json": json_content,
-            "log": log_content,
-            "base_filename": base_filename,
-            "job_id": job_id,
-            "markdown_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.md"),
-            "json_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.json")
-        })
-    except Exception as e:
-        import traceback
-        error_details = traceback.format_exc()
-        return jsonify({
-            "error": f"Failed to convert PDF: {str(e)}",
-            "details": error_details
-        }), 500
-@app.route('/download/<job_id>/<filename>')
-def download_file(job_id, filename):
-    job_dir = os.path.join(OUTPUT_FOLDER, job_id)
-    return send_from_directory(job_dir, filename)
-# Add a sample PDF for testing
-@app.route('/sample')
-def add_sample():
-    try:
-        # Create a tiny text-only PDF using Podofoimpose (if available) or other method
-        sample_dir = os.path.join(UPLOAD_FOLDER, 'sample')
-        os.makedirs(sample_dir, exist_ok=True)
-        sample_path = os.path.join(sample_dir, 'sample.pdf')
-        # Use simple text for the sample
-        with open(os.path.join(sample_dir, 'sample.txt'), 'w') as f:
-            f.write("This is a sample PDF for testing MinerU.\n\nIt contains simple text to demonstrate the PDF to Markdown and JSON conversion capabilities.")
-        # Try to convert the text to PDF if possible
-        try:
-            subprocess.run(['convert', '-size', '612x792', 'caption:@' + os.path.join(sample_dir, 'sample.txt'), sample_path])
-        except:
-            # If ImageMagick's convert fails, try another approach
-            return jsonify({"error": "Could not create sample PDF. Please upload your own PDF file."}), 500
-        return jsonify({"message": "Sample PDF created", "path": sample_path})
-    except Exception as e:
-        return jsonify({"error": f"Failed to create sample PDF: {str(e)}"}), 500
-@app.route('/health')
-def health_check():
-    """
-    Health check endpoint for monitoring.
-    Returns basic information about the service status.
-    """
-    try:
-        # Check if magic-pdf command exists
-        has_magic_pdf = False
-        try:
-            subprocess.run(['magic-pdf', '--version'], capture_output=True, check=False)
-            has_magic_pdf = True
-        except FileNotFoundError:
-            pass
-        # Get runtime information
-        health_info = {
-            'status': 'healthy',
-            'version': APP_VERSION,
-            'environment': {
-                'python_version': platform.python_version(),
-                'platform': platform.platform(),
-                'processor': platform.processor()
-            },
-            'configuration': {
-                'upload_folder_exists': os.path.exists(UPLOAD_FOLDER),
-                'output_folder_exists': os.path.exists(OUTPUT_FOLDER),
-                'magic_pdf_installed': has_magic_pdf
-            }
-        }
-        return jsonify(health_info)
-    except Exception as e:
-        return jsonify({
-            'status': 'unhealthy',
-            'error': str(e)
-        }), 500
-@app.route('/api/convert', methods=['POST'])
-def api_convert_pdf():
-    """
-    API endpoint for programmatic access to PDF conversion.
-    Request:
-        - POST request with 'file' field containing PDF file
-    Response:
-        - JSON with conversion results
-    """
-    # Validate request
-    if 'file' not in request.files:
-        return jsonify({
-            'success': False,
-            'error': 'No file provided. Please upload a PDF file.'
-        }), 400
-    file = request.files['file']
-    if file.filename == '':
-        return jsonify({
-            'success': False,
-            'error': 'No file selected. Please select a PDF file.'
-        }), 400
-    # Check if the file is a PDF
-    if not file.filename.lower().endswith('.pdf'):
-        return jsonify({
-            'success': False,
-            'error': 'Invalid file format. Please upload a PDF file.'
-        }), 400
-    # Generate a job ID
-    job_id = str(uuid.uuid4())
-    # Create job directory
-    job_dir = os.path.join(OUTPUT_FOLDER, job_id)
-    os.makedirs(job_dir, exist_ok=True)
-    # Save the uploaded file
-    filename = secure_filename(file.filename)
-    base_filename, _ = os.path.splitext(filename)
-    pdf_path = os.path.join(job_dir, filename)
-    file.save(pdf_path)
-    try:
-        # Define the default config dictionary first
-        default_config = {
-            "device-mode": "cpu",
-            "layout-config": {
-                "model": "doclayout_yolo",
-                "enable": True
-            },
-            "formula-config": {
-                "mfd_model": "yolo_v8_mfd",
-                "mfr_model": "unimernet_small",
-                "enable": True
-            },
-            "table-config": {
-                "model": "rapid_table",
-                "sub_model": "slanet_plus",
-                "enable": True,
-                "max_time": 400
-            }
-        }
-        # Create the magic-pdf.json configuration file in .config if it doesn't exist
-        config_dir = os.path.expanduser("~/.config/magic_pdf")
-        os.makedirs(config_dir, exist_ok=True)
-        config_file = os.path.join(config_dir, "magic-pdf.json")
-        if not os.path.exists(config_file):
-            with open(config_file, 'w') as f:
-                json.dump(default_config, f, indent=2)
-        # Also create the config in the home directory as fallback
-        home_config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
-        if not os.path.exists(home_config_file):
-            with open(home_config_file, 'w') as f:
-                json.dump(default_config, f, indent=2)
-        # Add a small delay to ensure config file is written before magic-pdf runs
-        time.sleep(0.5)
-        # Log the conversion process
-        log_file = os.path.join(job_dir, "conversion.log")
-        with open(log_file, "w") as log:
-            # Run the MinerU magic-pdf command with correct parameters
-            command = ["magic-pdf", "--path", pdf_path, "--output-dir", job_dir]
-            process = subprocess.Popen(
-                command,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                universal_newlines=True
-            )
-            output = []
-            for line in process.stdout:
-                output.append(line)
-                log.write(line)
-                log.flush()
-            process.wait()
-            exit_code = process.returncode
-        if exit_code != 0:
-            error_message = ''.join(output) if output else "Unknown error during PDF conversion"
             return jsonify({
-                'success': False,
-                'error': 'PDF conversion failed. Please check the log for details.',
-                'log': error_message,
-                'exit_code': exit_code
             }), 500
-        # Check for output files
-        markdown_file = os.path.join(job_dir, f"{base_filename}.md")
-        json_file = os.path.join(job_dir, f"{base_filename}.json")
-        # If files don't exist in the job directory, check the same directory as the PDF
-        pdf_dir = os.path.dirname(pdf_path)
-        if not os.path.exists(markdown_file):
-            alt_markdown_file = os.path.join(pdf_dir, f"{base_filename}.md")
-            if os.path.exists(alt_markdown_file):
-                markdown_file = alt_markdown_file
-            else:
-                # Try to find any markdown file in the output directory
-                md_files = [f for f in os.listdir(job_dir) if f.endswith('.md')]
-                if md_files:
-                    markdown_file = os.path.join(job_dir, md_files[0])
-        if not os.path.exists(json_file):
-            alt_json_file = os.path.join(pdf_dir, f"{base_filename}.json")
-            if os.path.exists(alt_json_file):
-                json_file = alt_json_file
-            else:
-                # Try to find any JSON file in the output directory
-                json_files = [f for f in os.listdir(job_dir) if f.endswith('.json')]
-                if json_files:
-                    json_file = os.path.join(job_dir, json_files[0])
-        # Read markdown content
-        markdown_content = ""
-        if os.path.exists(markdown_file):
-            with open(markdown_file, 'r', encoding='utf-8') as f:
-                markdown_content = f.read()
-        else:
-            print(f"Warning: Markdown file not found at {markdown_file}")
-        # Read JSON content
-        json_content = {}
-        if os.path.exists(json_file):
-            with open(json_file, 'r', encoding='utf-8') as f:
-                json_content = json.load(f)
-        else:
-            print(f"Warning: JSON file not found at {json_file}")
-        # Read log content
-        log_content = ""
-        with open(log_file, 'r', encoding='utf-8') as f:
-            log_content = f.read()
-        # Create the result
-        result = {
-            'success': True,
-            'message': 'PDF conversion successful',
-            'job_id': job_id,
-            'base_filename': base_filename,
-            'file_info': {
-                'original_filename': filename,
-                'size_bytes': os.path.getsize(pdf_path),
-                'content_type': 'application/pdf'
-            },
-            'markdown': markdown_content,
-            'json': json_content,
-            'log': log_content,
-            'files': {
-                'markdown_path': os.path.basename(markdown_file) if os.path.exists(markdown_file) else None,
-                'json_path': os.path.basename(json_file) if os.path.exists(json_file) else None
-            }
-        }
-        return jsonify(result)
     except Exception as e:
-        import traceback
-        error_details = traceback.format_exc()
         return jsonify({
-            'success': False,
-            'error': f'An error occurred during PDF conversion: {str(e)}',
-            'details': error_details,
-            'job_id': job_id
         }), 500
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=7860, debug=False)

+from flask import Flask, request, jsonify, send_file, render_template_string, abort
+from flask_cors import CORS
 import os
+import uuid
+import traceback
+import logging
 import subprocess
 import tempfile
 import json
 import time
+import shutil
 app = Flask(__name__)
+CORS(app)
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+# Constants
+UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
+OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', '/tmp/pdf_output')
+ALLOWED_EXTENSIONS = {'pdf'}
+MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
+# Ensure the directories exist
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(OUTPUT_FOLDER, exist_ok=True)
+# Function to check if file extension is allowed
+def allowed_file(filename):
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+# Template for the main page
 HTML_TEMPLATE = """
 <!DOCTYPE html>
 <html>
 <head>
+    <title>MinerU PDF Converter</title>
     <style>
         body {
+            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+            line-height: 1.6;
+            color: #333;
+            max-width: 800px;
             margin: 0 auto;
             padding: 20px;
+            background-color: #f5f8fa;
+        }
+        h1 {
+            color: #2c3e50;
+            border-bottom: 2px solid #3498db;
+            padding-bottom: 10px;
         }
         .container {
+            background-color: white;
             padding: 20px;
             border-radius: 8px;
             box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+            margin-top: 20px;
         }
+        .info {
+            background-color: #e8f4fc;
+            padding: 15px;
+            border-radius: 5px;
+            margin: 15px 0;
         }
+        .code {
+            background-color: #f4f4f4;
             padding: 10px;
             border-radius: 3px;
+            font-family: monospace;
+            overflow-x: auto;
         }
+        form {
             margin: 20px 0;
         }
+        input[type=file] {
+            padding: 10px;
+            width: 100%;
+            margin-bottom: 10px;
+        }
+        button {
+            padding: 10px 15px;
+            background-color: #3498db;
             color: white;
             border: none;
             border-radius: 4px;
             cursor: pointer;
         }
+        button:hover {
+            background-color: #2980b9;
         }
+        .result {
             margin-top: 20px;
+            display: none;
         }
+        .loading {
+            display: none;
+            text-align: center;
+            margin: 20px 0;
         }
+        .error {
+            background-color: #fee;
+            border-left: 4px solid #e74c3c;
+            padding: 10px;
+            margin: 10px 0;
         }
+        a {
+            color: #3498db;
+            text-decoration: none;
         }
+        a:hover {
+            text-decoration: underline;
         }
     </style>
 </head>
 <body>
+    <h1>📄 MinerU PDF Converter</h1>
     <div class="container">
+        <h2>Convert PDF to Markdown and JSON</h2>
+        <p>Upload a PDF file to convert it to Markdown and structured JSON.</p>
+        <div class="info">
+            <h3>Features</h3>
+            <ul>
+                <li>High-quality PDF extraction</li>
+                <li>Support for tables, formulas, and complex layouts</li>
+                <li>Output in both Markdown and structured JSON</li>
+                <li>Comprehensive error handling</li>
+            </ul>
         </div>
+        <form id="uploadForm" enctype="multipart/form-data">
+            <input type="file" id="pdfFile" accept=".pdf" required>
+            <button type="submit">Convert PDF</button>
+        </form>
+        <div id="loading" class="loading">
+            <p>Converting PDF... This may take a minute for large files.</p>
         </div>
+        <div id="error" class="error" style="display: none;"></div>
+        <div id="result" class="result">
+            <h3>Conversion Results</h3>
+            <p>Your PDF has been converted successfully!</p>
+            <p><a id="markdownLink" href="#" download>Download Markdown</a></p>
+            <p><a id="jsonLink" href="#" download>Download JSON</a></p>
+        </div>
+        <div class="info">
+            <h3>API Usage</h3>
+            <p>You can also use our API endpoint to convert PDFs programmatically:</p>
+            <div class="code">
+                curl -X POST -F "file=@your_file.pdf" https://marcosremar2-mineruapi.hf.space/api/convert
+            </div>
+        </div>
     </div>
     <script>
+        document.getElementById('uploadForm').addEventListener('submit', async function(e) {
+            e.preventDefault();
+            const fileInput = document.getElementById('pdfFile');
+            const file = fileInput.files[0];
+            if (!file) {
+                showError('Please select a PDF file to upload.');
+                return;
             }
+            if (file.size > ${MAX_FILE_SIZE}) {
+                showError(`File size exceeds the ${MAX_FILE_SIZE / (1024 * 1024)}MB limit.`);
+                return;
+            }
+            // Show loading indicator
+            document.getElementById('loading').style.display = 'block';
+            document.getElementById('error').style.display = 'none';
+            document.getElementById('result').style.display = 'none';
+            const formData = new FormData();
+            formData.append('file', file);
+            try {
+                const response = await fetch('/api/convert', {
+                    method: 'POST',
+                    body: formData
+                });
+                const result = await response.json();
+                // Hide loading indicator
+                document.getElementById('loading').style.display = 'none';
+                if (response.ok) {
+                    // Show result links
+                    const markdownLink = document.getElementById('markdownLink');
+                    const jsonLink = document.getElementById('jsonLink');
+                    if (result.markdown_url) {
+                        markdownLink.href = result.markdown_url;
+                        markdownLink.download = file.name.replace('.pdf', '.md');
+                    } else {
+                        markdownLink.parentElement.style.display = 'none';
+                    }
+                    if (result.json_url) {
+                        jsonLink.href = result.json_url;
+                        jsonLink.download = file.name.replace('.pdf', '.json');
+                    } else {
+                        jsonLink.parentElement.style.display = 'none';
+                    }
+                    document.getElementById('result').style.display = 'block';
+                } else {
+                    showError(result.error || 'Failed to convert PDF. Please try again.');
                 }
+            } catch (error) {
+                document.getElementById('loading').style.display = 'none';
+                showError('An error occurred. Please try again later.');
+                console.error(error);
+            }
         });
+        function showError(message) {
+            const errorElement = document.getElementById('error');
+            errorElement.textContent = message;
+            errorElement.style.display = 'block';
+        }
     </script>
 </body>
 </html>
 """
+# Route for the main page
 @app.route('/')
 def index():
+    """Render the upload page from HTML_TEMPLATE, passing MAX_FILE_SIZE to the template."""
+    return render_template_string(HTML_TEMPLATE, MAX_FILE_SIZE=MAX_FILE_SIZE)
+# Route for the health check
+@app.route('/health')
+def health_check():
+    """Report service health as JSON.
+
+    The payload contains the output of `magic-pdf --version`, whether
+    `nvidia-smi` ran successfully, whether four expected model files exist
+    under /tmp/models, and a timestamp.
+
+    Returns HTTP 200 with status "healthy", or HTTP 500 with status
+    "unhealthy" and the error text when any step raises.
+    """
+    try:
+        # Get MinerU version
+        # check=False: a non-zero exit is reported in the payload instead of raising.
+        # A missing magic-pdf executable still raises and lands in the outer handler.
+        process = subprocess.run(['magic-pdf', '--version'],
+                                capture_output=True, text=True, check=False)
+        version = process.stdout.strip() if process.returncode == 0 else "Error getting version"
+        # Check CUDA/GPU availability
+        try:
+            subprocess.run(['nvidia-smi'], capture_output=True, check=True)
+            gpu_available = True
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            gpu_available = False
+        # Check if model directories exist
+        # Each value is the existence of one specific model file, not of a directory.
+        model_dirs = {
+            "yolo_model": os.path.exists("/tmp/models/MFD/YOLO/yolo_v8_ft.pt"),
+            "unimernet_model": os.path.exists("/tmp/models/MFR/unimernet/unimernet_small.pth"),
+            "rapid_table_model": os.path.exists("/tmp/models/table/rapid/rapid_table.pt"),
+            "doclayout_model": os.path.exists("/tmp/models/layout/doclayout/doclayout_yolo.pt")
+        }
+        return jsonify({
+            "status": "healthy",
+            "version": version,
+            "gpu_available": gpu_available,
+            "model_dirs": model_dirs,
+            "timestamp": time.time()
+        })
+    except Exception as e:
+        logging.error(f"Health check error: {str(e)}")
+        return jsonify({
+            "status": "unhealthy",
+            "error": str(e),
+            "timestamp": time.time()
+        }), 500
+# Route to display GPU status
 @app.route('/gpu-status')
 def gpu_status():
+    """Return the combined stdout/stderr of `nvidia-smi` as JSON under the "output" key.
+
+    When the command is missing or exits non-zero, "output" holds a
+    "GPU information not available" message instead; the response is not
+    given an error status in either case.
+    """
     try:
+        output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT, text=True)
+    except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        output = f"GPU information not available: {str(e)}"
     return jsonify({"output": output})
+# Route to display magic-pdf help
 @app.route('/help-output')
 def help_output():
+    """Return the combined stdout/stderr of `magic-pdf --help` as JSON under the "output" key.
+
+    When the command is missing or exits non-zero, "output" holds a
+    "Help information not available" message instead.
+    """
     try:
+        output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT, text=True)
+    except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        output = f"Help information not available: {str(e)}"
     return jsonify({"output": output})
+# Route for PDF conversion
+@app.route('/api/convert', methods=['POST'])
 def convert_pdf():
     if 'file' not in request.files:
+        return jsonify({"error": "No file part in the request"}), 400
     file = request.files['file']
+    if file.filename == '':
+        return jsonify({"error": "No file selected"}), 400
+    if not allowed_file(file.filename):
+        return jsonify({"error": f"Only {', '.join(ALLOWED_EXTENSIONS)} files are allowed"}), 400
     try:
+        # Create a unique session ID
+        session_id = str(uuid.uuid4())
+        session_dir = os.path.join(OUTPUT_FOLDER, session_id)
+        os.makedirs(session_dir, exist_ok=True)
+        # Save the uploaded file
+        input_path = os.path.join(UPLOAD_FOLDER, f"{session_id}_{file.filename}")
+        file.save(input_path)
+        logging.info(f"Saved uploaded file to {input_path}")
+        # Get output file paths
+        base_filename = os.path.splitext(os.path.basename(file.filename))[0]
+        base_filename = ''.join(c if c.isalnum() or c in ['_', '-', '.'] else '_' for c in base_filename)
+        markdown_path = os.path.join(session_dir, f"{base_filename}.md")
+        json_path = os.path.join(session_dir, f"{base_filename}.json")
+        # Define file URLs
+        markdown_url = f"/download/{session_id}/{base_filename}.md"
+        json_url = f"/download/{session_id}/{base_filename}.json"
+        try:
+            # Run MinerU to convert the PDF
+            cmd = [
+                "magic-pdf",
+                "-i", input_path,
+                "-o", markdown_path,
+                "--json", json_path,
+                "-d", "gpu" if os.path.exists("/tmp/models/MFD/YOLO/yolo_v8_ft.pt") else "cpu",
+                "--mfd"
+            ]
+            process = subprocess.run(cmd, capture_output=True, text=True, check=False)
             if process.returncode != 0:
+                logging.error(f"MinerU conversion failed: {process.stderr}")
                 return jsonify({
+                    "error": "PDF conversion failed",
+                    "details": process.stderr,
+                    "command": " ".join(cmd)
                 }), 500
+            # Check if output files exist
+            md_exists = os.path.exists(markdown_path)
+            json_exists = os.path.exists(json_path)
+            if not md_exists:
+                logging.warning(f"Markdown file not found at {markdown_path}")
+            if not json_exists:
+                logging.warning(f"JSON file not found at {json_path}")
+            return jsonify({
+                "success": True,
+                "session_id": session_id,
+                "markdown_url": markdown_url if md_exists else None,
+                "json_url": json_url if json_exists else None,
+                "message": "PDF conversion completed"
+            })
+        except Exception as e:
+            logging.error(f"Error during conversion: {str(e)}")
+            logging.error(traceback.format_exc())
             return jsonify({
+                "error": "Error processing PDF file",
+                "details": str(e)
             }), 500
     except Exception as e:
+        logging.error(f"General error: {str(e)}")
+        logging.error(traceback.format_exc())
         return jsonify({
+            "error": "Failed to process request",
+            "details": str(e)
         }), 500
+    finally:
+        # Clean up the input file
+        try:
+            if os.path.exists(input_path):
+                os.remove(input_path)
+        except Exception as e:
+            logging.warning(f"Failed to clean up input file: {str(e)}")
+# Route to download converted files
+@app.route('/download/<session_id>/<filename>')
+def download_file(session_id, filename):
+    # Validate the session ID and filename
+    if not all(c.isalnum() or c == '-' for c in session_id):
+        abort(400, "Invalid session ID")
+    base_path = os.path.join(OUTPUT_FOLDER, session_id)
+    file_path = os.path.join(base_path, filename)
+    if not os.path.exists(file_path):
+        abort(404, "File not found")
+    return send_file(file_path, as_attachment=True)
+# When executed as a script, serve the app on all interfaces, port 7860
 if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860)

entrypoint.sh CHANGED Viewed

@@ -5,198 +5,123 @@ set -e
 source /opt/mineru_venv/bin/activate
 # Display GPU information
 echo "Checking NVIDIA GPU status:"
-nvidia-smi
 # Display MinerU version
 echo "MinerU version:"
-magic-pdf --version
 # Create a samples directory
 mkdir -p $HOME/.config/magic_pdf
-mkdir -p /app/samples || mkdir -p /tmp/samples
-# Define the samples directory based on what's writable
-if [ -w "/app/samples" ]; then
-    SAMPLES_DIR="/app/samples"
-else
-    SAMPLES_DIR="/tmp/samples"
-fi
 # Download a sample PDF for testing if it doesn't exist
-if [ ! -f "$SAMPLES_DIR/sample.pdf" ]; then
-    echo "Downloading sample PDF for testing..."
     # Download a simple paper from arXiv (using a small one for quick processing)
-    wget -q "https://arxiv.org/pdf/2201.08239.pdf" -O "$SAMPLES_DIR/sample.pdf" || true
     # If that fails, try another source
-    if [ ! -s "$SAMPLES_DIR/sample.pdf" ]; then
-        wget -q "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" -O "$SAMPLES_DIR/sample.pdf" || true
     fi
     # If both fail, create a simple PDF with text
-    if [ ! -s "$SAMPLES_DIR/sample.pdf" ]; then
         echo "Failed to download sample PDF, creating a simple PDF text file..."
         echo "This is a sample PDF document for testing MinerU.
         MinerU is a high-quality tool for converting PDF to Markdown and JSON formats.
-        This file was created for testing purposes." > "$SAMPLES_DIR/sample.txt"
         # Try using different methods to create a PDF
         if command -v convert &> /dev/null; then
-            convert -size 612x792 -background white -fill black caption:@"$SAMPLES_DIR/sample.txt" "$SAMPLES_DIR/sample.pdf"
         else
             echo "WARNING: Could not create a sample PDF file automatically."
         fi
     fi
 fi
-# Create the magic-pdf.json config file
-CONFIG_DIR="$HOME/.config/magic_pdf"
-mkdir -p "$CONFIG_DIR"
-if [ ! -f "$CONFIG_DIR/magic-pdf.json" ]; then
-    echo "Creating magic-pdf.json configuration file..."
-    cat > "$CONFIG_DIR/magic-pdf.json" << EOF
 {
     "device-mode": "gpu",
     "layout-config": {
         "model": "doclayout_yolo",
         "enable": true
     },
     "formula-config": {
         "mfd_model": "yolo_v8_mfd",
         "mfr_model": "unimernet_small",
         "enable": true
     },
     "table-config": {
         "model": "rapid_table",
         "sub_model": "slanet_plus",
         "enable": true,
         "max_time": 400
     }
 }
 EOF
-fi
-# Start the Flask application if it exists, otherwise provide a shell
-if [ -f "/app/app.py" ]; then
-    echo "Starting Flask application..."
-    python /app/app.py
-else
-    echo "No app.py found. Starting a simple server..."
-    # Create a simple server that shows MinerU is installed
-    TMP_APP_PATH="$HOME/simple_app.py"
-    cat > "$TMP_APP_PATH" << 'EOF'
-from flask import Flask, request, jsonify, render_template_string
-app = Flask(__name__)
-HTML_TEMPLATE = """
-<!DOCTYPE html>
-<html>
-<head>
-    <title>MinerU PDF Processing</title>
-    <style>
-        body {
-            font-family: Arial, sans-serif;
-            max-width: 800px;
-            margin: 0 auto;
-            padding: 20px;
-        }
-        .container {
-            background-color: #f9f9f9;
-            padding: 20px;
-            border-radius: 8px;
-            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-        }
-        h1 {
-            color: #2c3e50;
-        }
-        pre {
-            background-color: #f1f1f1;
-            padding: 10px;
-            border-radius: 4px;
-            overflow-x: auto;
-        }
-        .command {
-            font-family: monospace;
-            background-color: #eee;
-            padding: 5px;
-            border-radius: 3px;
-        }
-    </style>
-</head>
-<body>
-    <div class="container">
-        <h1>MinerU PDF Processing Service</h1>
-        <p>This Space provides PDF processing capabilities using MinerU.</p>
-        <h2>GPU Status</h2>
-        <pre id="gpuStatus">Loading...</pre>
-        <h2>Available Commands</h2>
-        <p>MinerU provides the following commands:</p>
-        <p><span class="command">magic-pdf</span> - Process PDF documents</p>
-        <h2>Help Output</h2>
-        <pre id="helpOutput">Loading...</pre>
-    </div>
-    <script>
-        // Fetch GPU status
-        fetch('/gpu-status')
-            .then(response => response.json())
-            .then(data => {
-                document.getElementById('gpuStatus').textContent = data.output;
-            })
-            .catch(error => {
-                document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
-            });
-        // Fetch help output
-        fetch('/help-output')
-            .then(response => response.json())
-            .then(data => {
-                document.getElementById('helpOutput').textContent = data.output;
-            })
-            .catch(error => {
-                document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
-            });
-    </script>
-</body>
-</html>
-"""
-@app.route('/')
-def index():
-    return render_template_string(HTML_TEMPLATE)
-@app.route('/gpu-status')
-def gpu_status():
-    import subprocess
-    try:
-        output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT).decode('utf-8')
-    except subprocess.CalledProcessError as e:
-        output = f"Error running nvidia-smi: {e.output.decode('utf-8')}"
-    except FileNotFoundError:
-        output = "nvidia-smi command not found. GPU may not be available."
-    return jsonify({"output": output})
-@app.route('/help-output')
-def help_output():
-    import subprocess
-    try:
-        output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT).decode('utf-8')
-    except subprocess.CalledProcessError as e:
-        output = f"Error running magic-pdf --help: {e.output.decode('utf-8')}"
-    except FileNotFoundError:
-        output = "magic-pdf command not found. MinerU may not be installed correctly."
-    return jsonify({"output": output})
-if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=7860)
-EOF
-    python "$TMP_APP_PATH"
-fi

 source /opt/mineru_venv/bin/activate
 # Display GPU information
+echo "===== Application Startup at $(date +'%Y-%m-%d %H:%M:%S') ====="
+echo ""
 echo "Checking NVIDIA GPU status:"
+nvidia-smi || echo "No NVIDIA GPU detected, running in CPU mode"
 # Display MinerU version
 echo "MinerU version:"
+magic-pdf --version || echo "Error: MinerU magic-pdf not found"
+# Create directories for models if they don't exist
+mkdir -p /tmp/models/MFD/YOLO
+mkdir -p /tmp/models/MFR/unimernet
+mkdir -p /tmp/models/table/rapid
+mkdir -p /tmp/models/layout/doclayout
+# Check if model files exist, if not, download them
+echo "Checking model files..."
+MODEL_FILES=(
+    "/tmp/models/MFD/YOLO/yolo_v8_ft.pt"
+    "/tmp/models/MFD/YOLO/yolo_v8_mfd.pt"
+    "/tmp/models/MFR/unimernet/unimernet_small.pth"
+    "/tmp/models/table/rapid/rapid_table.pt"
+    "/tmp/models/table/rapid/slanet_plus.pt"
+    "/tmp/models/layout/doclayout/doclayout_yolo.pt"
+)
+MODELS_REPO="https://huggingface.co/opendatalab/MinerU/resolve/main/models"
+MODEL_URLS=(
+    "${MODELS_REPO}/mfd/yolo_v8_mfd.pt"
+    "${MODELS_REPO}/mfd/yolo_v8_mfd.pt"
+    "${MODELS_REPO}/mfr/unimernet_small.pth"
+    "${MODELS_REPO}/table/rapid_table.pt"
+    "${MODELS_REPO}/table/slanet_plus.pt"
+    "${MODELS_REPO}/layout/doclayout_yolo.pt"
+)
+for i in "${!MODEL_FILES[@]}"; do
+    # Test with -s rather than -f: an empty file left by an earlier failed
+    # download must not be reported as an existing model.
+    if [ ! -s "${MODEL_FILES[$i]}" ]; then
+        echo "Downloading ${MODEL_FILES[$i]}..."
+        # Fetch into a temporary name and rename only on success, because
+        # wget -O creates the output file even when the transfer fails.
+        if ! { wget -q "${MODEL_URLS[$i]}" -O "${MODEL_FILES[$i]}.part" \
+               && mv "${MODEL_FILES[$i]}.part" "${MODEL_FILES[$i]}"; }; then
+            echo "Failed to download ${MODEL_FILES[$i]}"
+            rm -f "${MODEL_FILES[$i]}.part"
+        fi
+    else
+        echo "${MODEL_FILES[$i]} already exists."
+    fi
+done
 # Create a samples directory
+mkdir -p $HOME/samples
 mkdir -p $HOME/.config/magic_pdf
 # Download a sample PDF for testing if it doesn't exist
+echo "Downloading sample PDF for testing..."
+if [ ! -f "$HOME/samples/sample.pdf" ]; then
     # Download a simple paper from arXiv (using a small one for quick processing)
+    wget -q "https://arxiv.org/pdf/2201.08239.pdf" -O "$HOME/samples/sample.pdf" || true
     # If that fails, try another source
+    if [ ! -s "$HOME/samples/sample.pdf" ]; then
+        wget -q "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" -O "$HOME/samples/sample.pdf" || true
     fi
     # If both fail, create a simple PDF with text
+    if [ ! -s "$HOME/samples/sample.pdf" ]; then
         echo "Failed to download sample PDF, creating a simple PDF text file..."
         echo "This is a sample PDF document for testing MinerU.
         MinerU is a high-quality tool for converting PDF to Markdown and JSON formats.
+        This file was created for testing purposes." > "$HOME/samples/sample.txt"
         # Try using different methods to create a PDF
         if command -v convert &> /dev/null; then
+            convert -size 612x792 -background white -fill black caption:@"$HOME/samples/sample.txt" "$HOME/samples/sample.pdf"
         else
             echo "WARNING: Could not create a sample PDF file automatically."
         fi
     fi
 fi
+# Create the magic-pdf.json config file with paths to the downloaded models
+echo "Creating magic-pdf.json configuration file..."
+cat > "$HOME/.config/magic_pdf/magic-pdf.json" << EOF
 {
     "device-mode": "gpu",
+    "models-path": "/tmp/models",
     "layout-config": {
         "model": "doclayout_yolo",
+        "model_path": "/tmp/models/layout/doclayout/doclayout_yolo.pt",
         "enable": true
     },
     "formula-config": {
         "mfd_model": "yolo_v8_mfd",
+        "mfd_model_path": "/tmp/models/MFD/YOLO/yolo_v8_mfd.pt",
         "mfr_model": "unimernet_small",
+        "mfr_model_path": "/tmp/models/MFR/unimernet/unimernet_small.pth",
         "enable": true
     },
     "table-config": {
         "model": "rapid_table",
+        "model_path": "/tmp/models/table/rapid/rapid_table.pt",
         "sub_model": "slanet_plus",
+        "sub_model_path": "/tmp/models/table/rapid/slanet_plus.pt",
         "enable": true,
         "max_time": 400
     }
 }
 EOF
+# Also create it in the home directory as some versions of MinerU look for it there
+cp "$HOME/.config/magic_pdf/magic-pdf.json" "$HOME/magic-pdf.json"
+# List model files to verify they're present
+echo "Verifying model files:"
+ls -la /tmp/models/MFD/YOLO/ || echo "YOLO models directory issue"
+ls -la /tmp/models/MFR/unimernet/ || echo "UniMERNet models directory issue"
+ls -la /tmp/models/table/rapid/ || echo "Table models directory issue"
+ls -la /tmp/models/layout/doclayout/ || echo "Layout models directory issue"
+# Start the Flask application
+echo "Starting Flask application..."
+python /app/app.py

requirements.txt CHANGED Viewed

@@ -1,10 +1,10 @@
 flask==2.3.3
-transformers>=4.37.0
-torch>=2.0.0
-sentencepiece>=0.1.99
 requests>=2.31.0
-accelerate>=0.25.0
-einops>=0.6.0
-packaging>=23.0
-werkzeug>=2.3.0
-flask-cors>=4.0.0

 flask==2.3.3
+werkzeug==2.3.7
+flask-cors==4.0.0
 requests>=2.31.0
+pillow>=9.4.0
+numpy>=1.24.0
+wget>=3.2
+magic-pdf[full]>=1.3.0
+uuid>=1.30
+python-magic>=0.4.27