Spaces:
Runtime error
Runtime error
Commit ·
128728a
1
Parent(s): a7cd086
Fix MinerU PDF API by adding model download and explicit configuration
Browse files
- .gitattributes +6 -32
- Dockerfile +27 -4
- app.py +303 -593
- entrypoint.sh +69 -144
- requirements.txt +8 -8
.gitattributes
CHANGED
|
@@ -1,35 +1,9 @@
|
|
| 1 |
-
*.
|
| 2 |
-
*.
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.
|
| 18 |
-
*.
|
| 19 |
-
*.
|
| 20 |
-
*.
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
*.model filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
| 5 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
CHANGED
|
@@ -44,7 +44,7 @@ RUN pip install --upgrade pip
|
|
| 44 |
RUN git clone https://github.com/opendatalab/MinerU.git /tmp/MinerU
|
| 45 |
|
| 46 |
# Install required packages
|
| 47 |
-
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/
|
| 48 |
|
| 49 |
# Install MinerU with all features
|
| 50 |
WORKDIR /tmp/MinerU
|
|
@@ -57,14 +57,37 @@ RUN pip install --no-cache-dir flask==2.3.3 flask-cors==4.0.0 werkzeug==2.3.7
|
|
| 57 |
RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output
|
| 58 |
RUN mkdir -p /tmp/samples
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# Create a non-root user for Hugging Face Spaces
|
| 61 |
# This is critical for permissions on HF Spaces
|
| 62 |
RUN useradd -m -u 1000 user
|
| 63 |
RUN mkdir -p /app/samples && chown -R user:user /app
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
RUN
|
| 67 |
-
# This step will automatically download model weights during the first run
|
| 68 |
|
| 69 |
# Copy the application files
|
| 70 |
WORKDIR /app
|
|
|
|
| 44 |
RUN git clone https://github.com/opendatalab/MinerU.git /tmp/MinerU
|
| 45 |
|
| 46 |
# Install required packages
|
| 47 |
+
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
|
| 48 |
|
| 49 |
# Install MinerU with all features
|
| 50 |
WORKDIR /tmp/MinerU
|
|
|
|
| 57 |
RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output
|
| 58 |
RUN mkdir -p /tmp/samples
|
| 59 |
|
| 60 |
+
# Create models directory structure
|
| 61 |
+
RUN mkdir -p /tmp/models/MFD/YOLO
|
| 62 |
+
RUN mkdir -p /tmp/models/MFR/unimernet
|
| 63 |
+
RUN mkdir -p /tmp/models/table/rapid
|
| 64 |
+
RUN mkdir -p /tmp/models/layout/doclayout
|
| 65 |
+
|
| 66 |
+
# Download model weights
|
| 67 |
+
RUN echo "Downloading MinerU model weights..."
|
| 68 |
+
WORKDIR /tmp/models
|
| 69 |
+
|
| 70 |
+
# Download the YOLO model for formula detection
|
| 71 |
+
# Best-effort download of the yolo_v8_mfd.pt weights, stored under the yolo_v8_ft.pt name.
# wget -O creates/truncates the target before fetching, so on failure the empty stub is
# removed; otherwise a zero-byte file would look like a valid model at runtime.
RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfd/yolo_v8_mfd.pt \
        -O /tmp/models/MFD/YOLO/yolo_v8_ft.pt \
    || { echo "Failed to download yolo_v8_ft.pt"; rm -f /tmp/models/MFD/YOLO/yolo_v8_ft.pt; }
|
| 72 |
+
# Best-effort download of the formula-detection weights (yolo_v8_mfd.pt).
# wget -O creates/truncates the target before fetching, so on failure the empty stub is
# removed; otherwise a zero-byte file would look like a valid model at runtime.
RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfd/yolo_v8_mfd.pt \
        -O /tmp/models/MFD/YOLO/yolo_v8_mfd.pt \
    || { echo "Failed to download yolo_v8_mfd.pt"; rm -f /tmp/models/MFD/YOLO/yolo_v8_mfd.pt; }
|
| 73 |
+
|
| 74 |
+
# Download the UniMERNet model for formula recognition
|
| 75 |
+
# Best-effort download of the formula-recognition weights (unimernet_small.pth).
# wget -O creates/truncates the target before fetching, so on failure the empty stub is
# removed; otherwise a zero-byte file would look like a valid model at runtime.
RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfr/unimernet_small.pth \
        -O /tmp/models/MFR/unimernet/unimernet_small.pth \
    || { echo "Failed to download unimernet_small.pth"; rm -f /tmp/models/MFR/unimernet/unimernet_small.pth; }
|
| 76 |
+
|
| 77 |
+
# Download the Rapid Table detection model
|
| 78 |
+
# Best-effort download of the table model weights (rapid_table.pt).
# wget -O creates/truncates the target before fetching, so on failure the empty stub is
# removed; otherwise a zero-byte file would look like a valid model at runtime.
RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/table/rapid_table.pt \
        -O /tmp/models/table/rapid/rapid_table.pt \
    || { echo "Failed to download rapid_table.pt"; rm -f /tmp/models/table/rapid/rapid_table.pt; }
|
| 79 |
+
# Best-effort download of the table sub-model weights (slanet_plus.pt).
# wget -O creates/truncates the target before fetching, so on failure the empty stub is
# removed; otherwise a zero-byte file would look like a valid model at runtime.
RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/table/slanet_plus.pt \
        -O /tmp/models/table/rapid/slanet_plus.pt \
    || { echo "Failed to download slanet_plus.pt"; rm -f /tmp/models/table/rapid/slanet_plus.pt; }
|
| 80 |
+
|
| 81 |
+
# Download the DocLayout YOLO model
|
| 82 |
+
# Best-effort download of the layout-detection weights (doclayout_yolo.pt).
# wget -O creates/truncates the target before fetching, so on failure the empty stub is
# removed; otherwise a zero-byte file would look like a valid model at runtime.
RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/layout/doclayout_yolo.pt \
        -O /tmp/models/layout/doclayout/doclayout_yolo.pt \
    || { echo "Failed to download doclayout_yolo.pt"; rm -f /tmp/models/layout/doclayout/doclayout_yolo.pt; }
|
| 83 |
+
|
| 84 |
# Create a non-root user for Hugging Face Spaces
|
| 85 |
# This is critical for permissions on HF Spaces
|
| 86 |
RUN useradd -m -u 1000 user
|
| 87 |
RUN mkdir -p /app/samples && chown -R user:user /app
|
| 88 |
|
| 89 |
+
# Set permissions for model files
|
| 90 |
+
RUN chown -R user:user /tmp/models
|
|
|
|
| 91 |
|
| 92 |
# Copy the application files
|
| 93 |
WORKDIR /app
|
app.py
CHANGED
|
@@ -1,705 +1,415 @@
|
|
| 1 |
-
from flask import Flask, request, jsonify,
|
|
|
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
| 3 |
import subprocess
|
| 4 |
import tempfile
|
| 5 |
-
import uuid
|
| 6 |
import json
|
| 7 |
-
import shutil
|
| 8 |
import time
|
| 9 |
-
import
|
| 10 |
-
import sys
|
| 11 |
-
from werkzeug.utils import secure_filename
|
| 12 |
-
from flask_cors import CORS # Add CORS support
|
| 13 |
|
| 14 |
app = Flask(__name__)
|
| 15 |
-
CORS(app)
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 21 |
|
| 22 |
-
#
|
| 23 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
| 24 |
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
|
| 25 |
|
| 26 |
-
#
|
| 27 |
-
|
|
|
|
| 28 |
|
|
|
|
| 29 |
HTML_TEMPLATE = """
|
| 30 |
<!DOCTYPE html>
|
| 31 |
<html>
|
| 32 |
<head>
|
| 33 |
-
<title>MinerU PDF
|
| 34 |
<style>
|
| 35 |
body {
|
| 36 |
-
font-family:
|
| 37 |
-
|
|
|
|
|
|
|
| 38 |
margin: 0 auto;
|
| 39 |
padding: 20px;
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
}
|
| 42 |
.container {
|
| 43 |
-
background-color:
|
| 44 |
padding: 20px;
|
| 45 |
border-radius: 8px;
|
| 46 |
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 47 |
-
margin-
|
| 48 |
}
|
| 49 |
-
|
| 50 |
-
color: #
|
|
|
|
|
|
|
|
|
|
| 51 |
}
|
| 52 |
-
|
| 53 |
-
background-color: #
|
| 54 |
padding: 10px;
|
| 55 |
-
border-radius: 4px;
|
| 56 |
-
overflow-x: auto;
|
| 57 |
-
max-height: 300px;
|
| 58 |
-
overflow-y: auto;
|
| 59 |
-
}
|
| 60 |
-
.command {
|
| 61 |
-
font-family: monospace;
|
| 62 |
-
background-color: #eee;
|
| 63 |
-
padding: 5px;
|
| 64 |
border-radius: 3px;
|
|
|
|
|
|
|
| 65 |
}
|
| 66 |
-
|
| 67 |
margin: 20px 0;
|
| 68 |
-
padding: 15px;
|
| 69 |
-
border: 1px solid #ddd;
|
| 70 |
-
border-radius: 8px;
|
| 71 |
}
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
color: white;
|
| 75 |
-
padding: 8px 16px;
|
| 76 |
border: none;
|
| 77 |
border-radius: 4px;
|
| 78 |
cursor: pointer;
|
| 79 |
-
font-size: 16px;
|
| 80 |
}
|
| 81 |
-
|
| 82 |
-
background-color: #
|
| 83 |
}
|
| 84 |
-
.
|
| 85 |
-
display: none;
|
| 86 |
-
color: #666;
|
| 87 |
-
margin-top: 10px;
|
| 88 |
-
}
|
| 89 |
-
.result-section {
|
| 90 |
margin-top: 20px;
|
|
|
|
| 91 |
}
|
| 92 |
-
.
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
margin-top: 20px;
|
| 97 |
}
|
| 98 |
-
.
|
| 99 |
-
background-color:
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
cursor: pointer;
|
| 104 |
-
padding: 10px 16px;
|
| 105 |
-
transition: 0.3s;
|
| 106 |
}
|
| 107 |
-
|
| 108 |
-
|
|
|
|
| 109 |
}
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
}
|
| 113 |
-
.tabcontent {
|
| 114 |
-
display: none;
|
| 115 |
-
padding: 6px 12px;
|
| 116 |
-
border: 1px solid #ccc;
|
| 117 |
-
border-top: none;
|
| 118 |
-
max-height: 500px;
|
| 119 |
-
overflow-y: auto;
|
| 120 |
-
white-space: pre-wrap;
|
| 121 |
}
|
| 122 |
</style>
|
| 123 |
</head>
|
| 124 |
<body>
|
|
|
|
|
|
|
| 125 |
<div class="container">
|
| 126 |
-
<
|
| 127 |
-
<p>
|
| 128 |
-
|
| 129 |
-
<
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
<
|
| 136 |
-
|
| 137 |
-
</form>
|
| 138 |
-
<div id="loadingIndicator" class="loading">Processing PDF file... This may take a minute.</div>
|
| 139 |
</div>
|
| 140 |
-
|
| 141 |
-
<
|
| 142 |
-
<
|
| 143 |
-
<
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
<button class="tablinks" onclick="openTab(event, 'Log')">Processing Log</button>
|
| 149 |
-
</div>
|
| 150 |
-
|
| 151 |
-
<div id="Markdown" class="tabcontent">
|
| 152 |
-
<pre id="markdownContent"></pre>
|
| 153 |
-
<a id="downloadMarkdown" class="btn" style="margin-top: 10px;">Download Markdown</a>
|
| 154 |
-
</div>
|
| 155 |
-
|
| 156 |
-
<div id="JSON" class="tabcontent">
|
| 157 |
-
<pre id="jsonContent"></pre>
|
| 158 |
-
<a id="downloadJson" class="btn" style="margin-top: 10px;">Download JSON</a>
|
| 159 |
-
</div>
|
| 160 |
-
|
| 161 |
-
<div id="Log" class="tabcontent">
|
| 162 |
-
<pre id="logContent"></pre>
|
| 163 |
-
</div>
|
| 164 |
</div>
|
| 165 |
|
| 166 |
-
<
|
| 167 |
-
|
| 168 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
-
<
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
</div>
|
| 173 |
-
|
| 174 |
<script>
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
.then(response => response.json())
|
| 178 |
-
.then(data => {
|
| 179 |
-
document.getElementById('gpuStatus').textContent = data.output;
|
| 180 |
-
})
|
| 181 |
-
.catch(error => {
|
| 182 |
-
document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
|
| 183 |
-
});
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
.then(response => response.json())
|
| 188 |
-
.then(data => {
|
| 189 |
-
document.getElementById('helpOutput').textContent = data.output;
|
| 190 |
-
})
|
| 191 |
-
.catch(error => {
|
| 192 |
-
document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
|
| 193 |
-
});
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
tabcontent = document.getElementsByClassName("tabcontent");
|
| 199 |
-
for (i = 0; i < tabcontent.length; i++) {
|
| 200 |
-
tabcontent[i].style.display = "none";
|
| 201 |
-
}
|
| 202 |
-
tablinks = document.getElementsByClassName("tablinks");
|
| 203 |
-
for (i = 0; i < tablinks.length; i++) {
|
| 204 |
-
tablinks[i].className = tablinks[i].className.replace(" active", "");
|
| 205 |
}
|
| 206 |
-
document.getElementById(tabName).style.display = "block";
|
| 207 |
-
evt.currentTarget.className += " active";
|
| 208 |
-
}
|
| 209 |
-
|
| 210 |
-
// Set up form submission
|
| 211 |
-
document.getElementById('uploadForm').addEventListener('submit', function(e) {
|
| 212 |
-
e.preventDefault();
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
const formData = new FormData(
|
|
|
|
| 221 |
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
.then(data => {
|
| 228 |
-
loadingIndicator.style.display = 'none';
|
| 229 |
-
resultSection.style.display = 'block';
|
| 230 |
|
| 231 |
-
|
| 232 |
|
| 233 |
-
//
|
| 234 |
-
|
| 235 |
-
document.getElementById('markdownContent').textContent = data.markdown;
|
| 236 |
-
const downloadMarkdown = document.getElementById('downloadMarkdown');
|
| 237 |
-
downloadMarkdown.href = data.markdown_url;
|
| 238 |
-
downloadMarkdown.download = data.base_filename + '.md';
|
| 239 |
-
}
|
| 240 |
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
document.getElementById('
|
| 244 |
-
const
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
}
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
// Open the markdown tab by default
|
| 255 |
-
document.getElementById('defaultOpen').click();
|
| 256 |
-
})
|
| 257 |
-
.catch(error => {
|
| 258 |
-
loadingIndicator.style.display = 'none';
|
| 259 |
-
alert('Error: ' + error.message);
|
| 260 |
-
});
|
| 261 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
</script>
|
| 263 |
</body>
|
| 264 |
</html>
|
| 265 |
"""
|
| 266 |
|
|
|
|
| 267 |
@app.route('/')
|
| 268 |
def index():
|
| 269 |
-
return render_template_string(HTML_TEMPLATE)
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
@app.route('/gpu-status')
|
| 272 |
def gpu_status():
|
| 273 |
-
import subprocess
|
| 274 |
try:
|
| 275 |
-
output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT)
|
| 276 |
-
except subprocess.CalledProcessError as e:
|
| 277 |
-
output = f"
|
| 278 |
-
except FileNotFoundError:
|
| 279 |
-
output = "nvidia-smi command not found. GPU may not be available."
|
| 280 |
return jsonify({"output": output})
|
| 281 |
|
|
|
|
| 282 |
@app.route('/help-output')
|
| 283 |
def help_output():
|
| 284 |
-
import subprocess
|
| 285 |
try:
|
| 286 |
-
output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT)
|
| 287 |
-
except subprocess.CalledProcessError as e:
|
| 288 |
-
output = f"
|
| 289 |
-
except FileNotFoundError:
|
| 290 |
-
output = "magic-pdf command not found. MinerU may not be installed correctly."
|
| 291 |
return jsonify({"output": output})
|
| 292 |
|
| 293 |
-
|
|
|
|
| 294 |
def convert_pdf():
|
| 295 |
if 'file' not in request.files:
|
| 296 |
-
return jsonify({"error": "No file part"}), 400
|
| 297 |
|
| 298 |
file = request.files['file']
|
| 299 |
-
if file.filename == '':
|
| 300 |
-
return jsonify({"error": "No selected file"}), 400
|
| 301 |
-
|
| 302 |
-
if not file.filename.lower().endswith('.pdf'):
|
| 303 |
-
return jsonify({"error": "File must be a PDF"}), 400
|
| 304 |
-
|
| 305 |
-
# Generate a unique ID for this conversion
|
| 306 |
-
job_id = str(uuid.uuid4())
|
| 307 |
-
job_dir = os.path.join(OUTPUT_FOLDER, job_id)
|
| 308 |
-
os.makedirs(job_dir, exist_ok=True)
|
| 309 |
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
base_filename = os.path.splitext(filename)[0]
|
| 313 |
-
pdf_path = os.path.join(job_dir, filename)
|
| 314 |
-
file.save(pdf_path)
|
| 315 |
-
|
| 316 |
-
# Run magic-pdf on the file
|
| 317 |
-
output_dir = os.path.join(job_dir, "output")
|
| 318 |
-
os.makedirs(output_dir, exist_ok=True)
|
| 319 |
|
| 320 |
-
|
|
|
|
| 321 |
|
| 322 |
try:
|
| 323 |
-
#
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
"model": "doclayout_yolo",
|
| 328 |
-
"enable": True
|
| 329 |
-
},
|
| 330 |
-
"formula-config": {
|
| 331 |
-
"mfd_model": "yolo_v8_mfd",
|
| 332 |
-
"mfr_model": "unimernet_small",
|
| 333 |
-
"enable": True
|
| 334 |
-
},
|
| 335 |
-
"table-config": {
|
| 336 |
-
"model": "rapid_table",
|
| 337 |
-
"sub_model": "slanet_plus",
|
| 338 |
-
"enable": True,
|
| 339 |
-
"max_time": 400
|
| 340 |
-
}
|
| 341 |
-
}
|
| 342 |
|
| 343 |
-
#
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
if not os.path.exists(home_config_file):
|
| 355 |
-
with open(home_config_file, 'w') as f:
|
| 356 |
-
json.dump(default_config, f, indent=2)
|
| 357 |
-
|
| 358 |
-
# Add a small delay to ensure config file is written before magic-pdf runs
|
| 359 |
-
time.sleep(0.5)
|
| 360 |
-
|
| 361 |
-
# Use magic-pdf to convert the PDF to Markdown and JSON
|
| 362 |
-
cmd = [
|
| 363 |
-
'magic-pdf',
|
| 364 |
-
'--path', pdf_path,
|
| 365 |
-
'--output-dir', output_dir
|
| 366 |
-
]
|
| 367 |
|
| 368 |
-
#
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
|
| 383 |
-
process.
|
| 384 |
|
| 385 |
if process.returncode != 0:
|
|
|
|
| 386 |
return jsonify({
|
| 387 |
-
"error":
|
| 388 |
-
"
|
|
|
|
| 389 |
}), 500
|
| 390 |
-
|
| 391 |
-
# Get the generated markdown and JSON
|
| 392 |
-
markdown_file = os.path.join(output_dir, f"{base_filename}.md")
|
| 393 |
-
json_file = os.path.join(output_dir, f"{base_filename}.json")
|
| 394 |
-
|
| 395 |
-
# Check if the output files exist
|
| 396 |
-
markdown_content = ""
|
| 397 |
-
json_content = {}
|
| 398 |
-
|
| 399 |
-
if os.path.exists(markdown_file):
|
| 400 |
-
with open(markdown_file, 'r', encoding='utf-8') as f:
|
| 401 |
-
markdown_content = f.read()
|
| 402 |
-
|
| 403 |
-
if os.path.exists(json_file):
|
| 404 |
-
with open(json_file, 'r', encoding='utf-8') as f:
|
| 405 |
-
json_content = json.load(f)
|
| 406 |
-
|
| 407 |
-
# Read the log file
|
| 408 |
-
with open(log_file, 'r') as f:
|
| 409 |
-
log_content = f.read()
|
| 410 |
-
|
| 411 |
-
# Copy the output files to a location accessible for download
|
| 412 |
-
output_markdown = os.path.join(job_dir, f"{base_filename}.md")
|
| 413 |
-
output_json = os.path.join(job_dir, f"{base_filename}.json")
|
| 414 |
-
|
| 415 |
-
if os.path.exists(markdown_file):
|
| 416 |
-
shutil.copy(markdown_file, output_markdown)
|
| 417 |
-
|
| 418 |
-
if os.path.exists(json_file):
|
| 419 |
-
shutil.copy(json_file, output_json)
|
| 420 |
-
|
| 421 |
-
# Return the conversion results
|
| 422 |
-
return jsonify({
|
| 423 |
-
"message": f"PDF '{filename}' converted successfully",
|
| 424 |
-
"markdown": markdown_content,
|
| 425 |
-
"json": json_content,
|
| 426 |
-
"log": log_content,
|
| 427 |
-
"base_filename": base_filename,
|
| 428 |
-
"job_id": job_id,
|
| 429 |
-
"markdown_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.md"),
|
| 430 |
-
"json_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.json")
|
| 431 |
-
})
|
| 432 |
-
|
| 433 |
-
except Exception as e:
|
| 434 |
-
import traceback
|
| 435 |
-
error_details = traceback.format_exc()
|
| 436 |
-
return jsonify({
|
| 437 |
-
"error": f"Failed to convert PDF: {str(e)}",
|
| 438 |
-
"details": error_details
|
| 439 |
-
}), 500
|
| 440 |
-
|
| 441 |
-
@app.route('/download/<job_id>/<filename>')
|
| 442 |
-
def download_file(job_id, filename):
|
| 443 |
-
job_dir = os.path.join(OUTPUT_FOLDER, job_id)
|
| 444 |
-
return send_from_directory(job_dir, filename)
|
| 445 |
-
|
| 446 |
-
# Add a sample PDF for testing
|
| 447 |
-
@app.route('/sample')
|
| 448 |
-
def add_sample():
|
| 449 |
-
try:
|
| 450 |
-
# Create a tiny text-only PDF using Podofoimpose (if available) or other method
|
| 451 |
-
sample_dir = os.path.join(UPLOAD_FOLDER, 'sample')
|
| 452 |
-
os.makedirs(sample_dir, exist_ok=True)
|
| 453 |
-
sample_path = os.path.join(sample_dir, 'sample.pdf')
|
| 454 |
-
|
| 455 |
-
# Use simple text for the sample
|
| 456 |
-
with open(os.path.join(sample_dir, 'sample.txt'), 'w') as f:
|
| 457 |
-
f.write("This is a sample PDF for testing MinerU.\n\nIt contains simple text to demonstrate the PDF to Markdown and JSON conversion capabilities.")
|
| 458 |
-
|
| 459 |
-
# Try to convert the text to PDF if possible
|
| 460 |
-
try:
|
| 461 |
-
subprocess.run(['convert', '-size', '612x792', 'caption:@' + os.path.join(sample_dir, 'sample.txt'), sample_path])
|
| 462 |
-
except:
|
| 463 |
-
# If ImageMagick's convert fails, try another approach
|
| 464 |
-
return jsonify({"error": "Could not create sample PDF. Please upload your own PDF file."}), 500
|
| 465 |
-
|
| 466 |
-
return jsonify({"message": "Sample PDF created", "path": sample_path})
|
| 467 |
-
except Exception as e:
|
| 468 |
-
return jsonify({"error": f"Failed to create sample PDF: {str(e)}"}), 500
|
| 469 |
-
|
| 470 |
-
@app.route('/health')
|
| 471 |
-
def health_check():
|
| 472 |
-
"""
|
| 473 |
-
Health check endpoint for monitoring.
|
| 474 |
-
Returns basic information about the service status.
|
| 475 |
-
"""
|
| 476 |
-
try:
|
| 477 |
-
# Check if magic-pdf command exists
|
| 478 |
-
has_magic_pdf = False
|
| 479 |
-
try:
|
| 480 |
-
subprocess.run(['magic-pdf', '--version'], capture_output=True, check=False)
|
| 481 |
-
has_magic_pdf = True
|
| 482 |
-
except FileNotFoundError:
|
| 483 |
-
pass
|
| 484 |
-
|
| 485 |
-
# Get runtime information
|
| 486 |
-
health_info = {
|
| 487 |
-
'status': 'healthy',
|
| 488 |
-
'version': APP_VERSION,
|
| 489 |
-
'environment': {
|
| 490 |
-
'python_version': platform.python_version(),
|
| 491 |
-
'platform': platform.platform(),
|
| 492 |
-
'processor': platform.processor()
|
| 493 |
-
},
|
| 494 |
-
'configuration': {
|
| 495 |
-
'upload_folder_exists': os.path.exists(UPLOAD_FOLDER),
|
| 496 |
-
'output_folder_exists': os.path.exists(OUTPUT_FOLDER),
|
| 497 |
-
'magic_pdf_installed': has_magic_pdf
|
| 498 |
-
}
|
| 499 |
-
}
|
| 500 |
-
|
| 501 |
-
return jsonify(health_info)
|
| 502 |
-
except Exception as e:
|
| 503 |
-
return jsonify({
|
| 504 |
-
'status': 'unhealthy',
|
| 505 |
-
'error': str(e)
|
| 506 |
-
}), 500
|
| 507 |
-
|
| 508 |
-
@app.route('/api/convert', methods=['POST'])
|
| 509 |
-
def api_convert_pdf():
|
| 510 |
-
"""
|
| 511 |
-
API endpoint for programmatic access to PDF conversion.
|
| 512 |
-
|
| 513 |
-
Request:
|
| 514 |
-
- POST request with 'file' field containing PDF file
|
| 515 |
-
|
| 516 |
-
Response:
|
| 517 |
-
- JSON with conversion results
|
| 518 |
-
"""
|
| 519 |
-
# Validate request
|
| 520 |
-
if 'file' not in request.files:
|
| 521 |
-
return jsonify({
|
| 522 |
-
'success': False,
|
| 523 |
-
'error': 'No file provided. Please upload a PDF file.'
|
| 524 |
-
}), 400
|
| 525 |
-
|
| 526 |
-
file = request.files['file']
|
| 527 |
-
|
| 528 |
-
if file.filename == '':
|
| 529 |
-
return jsonify({
|
| 530 |
-
'success': False,
|
| 531 |
-
'error': 'No file selected. Please select a PDF file.'
|
| 532 |
-
}), 400
|
| 533 |
-
|
| 534 |
-
# Check if the file is a PDF
|
| 535 |
-
if not file.filename.lower().endswith('.pdf'):
|
| 536 |
-
return jsonify({
|
| 537 |
-
'success': False,
|
| 538 |
-
'error': 'Invalid file format. Please upload a PDF file.'
|
| 539 |
-
}), 400
|
| 540 |
-
|
| 541 |
-
# Generate a job ID
|
| 542 |
-
job_id = str(uuid.uuid4())
|
| 543 |
-
|
| 544 |
-
# Create job directory
|
| 545 |
-
job_dir = os.path.join(OUTPUT_FOLDER, job_id)
|
| 546 |
-
os.makedirs(job_dir, exist_ok=True)
|
| 547 |
-
|
| 548 |
-
# Save the uploaded file
|
| 549 |
-
filename = secure_filename(file.filename)
|
| 550 |
-
base_filename, _ = os.path.splitext(filename)
|
| 551 |
-
|
| 552 |
-
pdf_path = os.path.join(job_dir, filename)
|
| 553 |
-
file.save(pdf_path)
|
| 554 |
-
|
| 555 |
-
try:
|
| 556 |
-
# Define the default config dictionary first
|
| 557 |
-
default_config = {
|
| 558 |
-
"device-mode": "cpu",
|
| 559 |
-
"layout-config": {
|
| 560 |
-
"model": "doclayout_yolo",
|
| 561 |
-
"enable": True
|
| 562 |
-
},
|
| 563 |
-
"formula-config": {
|
| 564 |
-
"mfd_model": "yolo_v8_mfd",
|
| 565 |
-
"mfr_model": "unimernet_small",
|
| 566 |
-
"enable": True
|
| 567 |
-
},
|
| 568 |
-
"table-config": {
|
| 569 |
-
"model": "rapid_table",
|
| 570 |
-
"sub_model": "slanet_plus",
|
| 571 |
-
"enable": True,
|
| 572 |
-
"max_time": 400
|
| 573 |
-
}
|
| 574 |
-
}
|
| 575 |
-
|
| 576 |
-
# Create the magic-pdf.json configuration file in .config if it doesn't exist
|
| 577 |
-
config_dir = os.path.expanduser("~/.config/magic_pdf")
|
| 578 |
-
os.makedirs(config_dir, exist_ok=True)
|
| 579 |
-
config_file = os.path.join(config_dir, "magic-pdf.json")
|
| 580 |
-
|
| 581 |
-
if not os.path.exists(config_file):
|
| 582 |
-
with open(config_file, 'w') as f:
|
| 583 |
-
json.dump(default_config, f, indent=2)
|
| 584 |
-
|
| 585 |
-
# Also create the config in the home directory as fallback
|
| 586 |
-
home_config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
|
| 587 |
-
if not os.path.exists(home_config_file):
|
| 588 |
-
with open(home_config_file, 'w') as f:
|
| 589 |
-
json.dump(default_config, f, indent=2)
|
| 590 |
-
|
| 591 |
-
# Add a small delay to ensure config file is written before magic-pdf runs
|
| 592 |
-
time.sleep(0.5)
|
| 593 |
-
|
| 594 |
-
# Log the conversion process
|
| 595 |
-
log_file = os.path.join(job_dir, "conversion.log")
|
| 596 |
-
with open(log_file, "w") as log:
|
| 597 |
-
# Run the MinerU magic-pdf command with correct parameters
|
| 598 |
-
command = ["magic-pdf", "--path", pdf_path, "--output-dir", job_dir]
|
| 599 |
-
process = subprocess.Popen(
|
| 600 |
-
command,
|
| 601 |
-
stdout=subprocess.PIPE,
|
| 602 |
-
stderr=subprocess.STDOUT,
|
| 603 |
-
universal_newlines=True
|
| 604 |
-
)
|
| 605 |
|
| 606 |
-
output
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
|
| 612 |
-
process.wait()
|
| 613 |
-
exit_code = process.returncode
|
| 614 |
-
|
| 615 |
-
if exit_code != 0:
|
| 616 |
-
error_message = ''.join(output) if output else "Unknown error during PDF conversion"
|
| 617 |
return jsonify({
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
'log': error_message,
|
| 621 |
-
'exit_code': exit_code
|
| 622 |
}), 500
|
| 623 |
-
|
| 624 |
-
# Check for output files
|
| 625 |
-
markdown_file = os.path.join(job_dir, f"{base_filename}.md")
|
| 626 |
-
json_file = os.path.join(job_dir, f"{base_filename}.json")
|
| 627 |
-
|
| 628 |
-
# If files don't exist in the job directory, check the same directory as the PDF
|
| 629 |
-
pdf_dir = os.path.dirname(pdf_path)
|
| 630 |
-
if not os.path.exists(markdown_file):
|
| 631 |
-
alt_markdown_file = os.path.join(pdf_dir, f"{base_filename}.md")
|
| 632 |
-
if os.path.exists(alt_markdown_file):
|
| 633 |
-
markdown_file = alt_markdown_file
|
| 634 |
-
else:
|
| 635 |
-
# Try to find any markdown file in the output directory
|
| 636 |
-
md_files = [f for f in os.listdir(job_dir) if f.endswith('.md')]
|
| 637 |
-
if md_files:
|
| 638 |
-
markdown_file = os.path.join(job_dir, md_files[0])
|
| 639 |
-
|
| 640 |
-
if not os.path.exists(json_file):
|
| 641 |
-
alt_json_file = os.path.join(pdf_dir, f"{base_filename}.json")
|
| 642 |
-
if os.path.exists(alt_json_file):
|
| 643 |
-
json_file = alt_json_file
|
| 644 |
-
else:
|
| 645 |
-
# Try to find any JSON file in the output directory
|
| 646 |
-
json_files = [f for f in os.listdir(job_dir) if f.endswith('.json')]
|
| 647 |
-
if json_files:
|
| 648 |
-
json_file = os.path.join(job_dir, json_files[0])
|
| 649 |
-
|
| 650 |
-
# Read markdown content
|
| 651 |
-
markdown_content = ""
|
| 652 |
-
if os.path.exists(markdown_file):
|
| 653 |
-
with open(markdown_file, 'r', encoding='utf-8') as f:
|
| 654 |
-
markdown_content = f.read()
|
| 655 |
-
else:
|
| 656 |
-
print(f"Warning: Markdown file not found at {markdown_file}")
|
| 657 |
-
|
| 658 |
-
# Read JSON content
|
| 659 |
-
json_content = {}
|
| 660 |
-
if os.path.exists(json_file):
|
| 661 |
-
with open(json_file, 'r', encoding='utf-8') as f:
|
| 662 |
-
json_content = json.load(f)
|
| 663 |
-
else:
|
| 664 |
-
print(f"Warning: JSON file not found at {json_file}")
|
| 665 |
-
|
| 666 |
-
# Read log content
|
| 667 |
-
log_content = ""
|
| 668 |
-
with open(log_file, 'r', encoding='utf-8') as f:
|
| 669 |
-
log_content = f.read()
|
| 670 |
-
|
| 671 |
-
# Create the result
|
| 672 |
-
result = {
|
| 673 |
-
'success': True,
|
| 674 |
-
'message': 'PDF conversion successful',
|
| 675 |
-
'job_id': job_id,
|
| 676 |
-
'base_filename': base_filename,
|
| 677 |
-
'file_info': {
|
| 678 |
-
'original_filename': filename,
|
| 679 |
-
'size_bytes': os.path.getsize(pdf_path),
|
| 680 |
-
'content_type': 'application/pdf'
|
| 681 |
-
},
|
| 682 |
-
'markdown': markdown_content,
|
| 683 |
-
'json': json_content,
|
| 684 |
-
'log': log_content,
|
| 685 |
-
'files': {
|
| 686 |
-
'markdown_path': os.path.basename(markdown_file) if os.path.exists(markdown_file) else None,
|
| 687 |
-
'json_path': os.path.basename(json_file) if os.path.exists(json_file) else None
|
| 688 |
-
}
|
| 689 |
-
}
|
| 690 |
-
|
| 691 |
-
return jsonify(result)
|
| 692 |
-
|
| 693 |
except Exception as e:
|
| 694 |
-
|
| 695 |
-
|
| 696 |
|
| 697 |
return jsonify({
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
'details': error_details,
|
| 701 |
-
'job_id': job_id
|
| 702 |
}), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 703 |
|
| 704 |
if __name__ == '__main__':
|
| 705 |
-
app.run(host='0.0.0.0', port=7860
|
|
|
|
| 1 |
+
from flask import Flask, request, jsonify, send_file, render_template_string, abort
|
| 2 |
+
from flask_cors import CORS
|
| 3 |
import os
|
| 4 |
+
import uuid
|
| 5 |
+
import traceback
|
| 6 |
+
import logging
|
| 7 |
import subprocess
|
| 8 |
import tempfile
|
|
|
|
| 9 |
import json
|
|
|
|
| 10 |
import time
|
| 11 |
+
import shutil
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
app = Flask(__name__)
|
| 14 |
+
CORS(app)
|
| 15 |
+
|
| 16 |
+
# Configure logging
|
| 17 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 18 |
|
| 19 |
+
# Constants
|
| 20 |
+
UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
|
| 21 |
+
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', '/tmp/pdf_output')
|
| 22 |
+
ALLOWED_EXTENSIONS = {'pdf'}
|
| 23 |
+
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
|
| 24 |
|
| 25 |
+
# Ensure the directories exist
|
| 26 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
| 27 |
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
|
| 28 |
|
| 29 |
+
# Function to check if file extension is allowed
|
| 30 |
+
def allowed_file(filename):
    """Tell whether ``filename`` carries an extension from ALLOWED_EXTENSIONS.

    Only the text after the last dot is considered, compared case-insensitively.
    A name without any dot is never allowed.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
|
| 32 |
|
| 33 |
+
# Template for the main page
|
| 34 |
HTML_TEMPLATE = """
|
| 35 |
<!DOCTYPE html>
|
| 36 |
<html>
|
| 37 |
<head>
|
| 38 |
+
<title>MinerU PDF Converter</title>
|
| 39 |
<style>
|
| 40 |
body {
|
| 41 |
+
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
| 42 |
+
line-height: 1.6;
|
| 43 |
+
color: #333;
|
| 44 |
+
max-width: 800px;
|
| 45 |
margin: 0 auto;
|
| 46 |
padding: 20px;
|
| 47 |
+
background-color: #f5f8fa;
|
| 48 |
+
}
|
| 49 |
+
h1 {
|
| 50 |
+
color: #2c3e50;
|
| 51 |
+
border-bottom: 2px solid #3498db;
|
| 52 |
+
padding-bottom: 10px;
|
| 53 |
}
|
| 54 |
.container {
|
| 55 |
+
background-color: white;
|
| 56 |
padding: 20px;
|
| 57 |
border-radius: 8px;
|
| 58 |
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 59 |
+
margin-top: 20px;
|
| 60 |
}
|
| 61 |
+
.info {
|
| 62 |
+
background-color: #e8f4fc;
|
| 63 |
+
padding: 15px;
|
| 64 |
+
border-radius: 5px;
|
| 65 |
+
margin: 15px 0;
|
| 66 |
}
|
| 67 |
+
.code {
|
| 68 |
+
background-color: #f4f4f4;
|
| 69 |
padding: 10px;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
border-radius: 3px;
|
| 71 |
+
font-family: monospace;
|
| 72 |
+
overflow-x: auto;
|
| 73 |
}
|
| 74 |
+
form {
|
| 75 |
margin: 20px 0;
|
|
|
|
|
|
|
|
|
|
| 76 |
}
|
| 77 |
+
input[type=file] {
|
| 78 |
+
padding: 10px;
|
| 79 |
+
width: 100%;
|
| 80 |
+
margin-bottom: 10px;
|
| 81 |
+
}
|
| 82 |
+
button {
|
| 83 |
+
padding: 10px 15px;
|
| 84 |
+
background-color: #3498db;
|
| 85 |
color: white;
|
|
|
|
| 86 |
border: none;
|
| 87 |
border-radius: 4px;
|
| 88 |
cursor: pointer;
|
|
|
|
| 89 |
}
|
| 90 |
+
button:hover {
|
| 91 |
+
background-color: #2980b9;
|
| 92 |
}
|
| 93 |
+
.result {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
margin-top: 20px;
|
| 95 |
+
display: none;
|
| 96 |
}
|
| 97 |
+
.loading {
|
| 98 |
+
display: none;
|
| 99 |
+
text-align: center;
|
| 100 |
+
margin: 20px 0;
|
|
|
|
| 101 |
}
|
| 102 |
+
.error {
|
| 103 |
+
background-color: #fee;
|
| 104 |
+
border-left: 4px solid #e74c3c;
|
| 105 |
+
padding: 10px;
|
| 106 |
+
margin: 10px 0;
|
|
|
|
|
|
|
|
|
|
| 107 |
}
|
| 108 |
+
a {
|
| 109 |
+
color: #3498db;
|
| 110 |
+
text-decoration: none;
|
| 111 |
}
|
| 112 |
+
a:hover {
|
| 113 |
+
text-decoration: underline;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
}
|
| 115 |
</style>
|
| 116 |
</head>
|
| 117 |
<body>
|
| 118 |
+
<h1>📄 MinerU PDF Converter</h1>
|
| 119 |
+
|
| 120 |
<div class="container">
|
| 121 |
+
<h2>Convert PDF to Markdown and JSON</h2>
|
| 122 |
+
<p>Upload a PDF file to convert it to Markdown and structured JSON.</p>
|
| 123 |
+
|
| 124 |
+
<div class="info">
|
| 125 |
+
<h3>Features</h3>
|
| 126 |
+
<ul>
|
| 127 |
+
<li>High-quality PDF extraction</li>
|
| 128 |
+
<li>Support for tables, formulas, and complex layouts</li>
|
| 129 |
+
<li>Output in both Markdown and structured JSON</li>
|
| 130 |
+
<li>Comprehensive error handling</li>
|
| 131 |
+
</ul>
|
|
|
|
|
|
|
| 132 |
</div>
|
| 133 |
+
|
| 134 |
+
<form id="uploadForm" enctype="multipart/form-data">
|
| 135 |
+
<input type="file" id="pdfFile" accept=".pdf" required>
|
| 136 |
+
<button type="submit">Convert PDF</button>
|
| 137 |
+
</form>
|
| 138 |
+
|
| 139 |
+
<div id="loading" class="loading">
|
| 140 |
+
<p>Converting PDF... This may take a minute for large files.</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
</div>
|
| 142 |
|
| 143 |
+
<div id="error" class="error" style="display: none;"></div>
|
| 144 |
+
|
| 145 |
+
<div id="result" class="result">
|
| 146 |
+
<h3>Conversion Results</h3>
|
| 147 |
+
<p>Your PDF has been converted successfully!</p>
|
| 148 |
+
<p><a id="markdownLink" href="#" download>Download Markdown</a></p>
|
| 149 |
+
<p><a id="jsonLink" href="#" download>Download JSON</a></p>
|
| 150 |
+
</div>
|
| 151 |
|
| 152 |
+
<div class="info">
|
| 153 |
+
<h3>API Usage</h3>
|
| 154 |
+
<p>You can also use our API endpoint to convert PDFs programmatically:</p>
|
| 155 |
+
<div class="code">
|
| 156 |
+
curl -X POST -F "file=@your_file.pdf" https://marcosremar2-mineruapi.hf.space/api/convert
|
| 157 |
+
</div>
|
| 158 |
+
</div>
|
| 159 |
</div>
|
| 160 |
+
|
| 161 |
<script>
|
| 162 |
+
document.getElementById('uploadForm').addEventListener('submit', async function(e) {
|
| 163 |
+
e.preventDefault();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
+
const fileInput = document.getElementById('pdfFile');
|
| 166 |
+
const file = fileInput.files[0];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
+
if (!file) {
|
| 169 |
+
showError('Please select a PDF file to upload.');
|
| 170 |
+
return;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
+
if (file.size > ${MAX_FILE_SIZE}) {
|
| 174 |
+
showError(`File size exceeds the ${MAX_FILE_SIZE / (1024 * 1024)}MB limit.`);
|
| 175 |
+
return;
|
| 176 |
+
}
|
| 177 |
|
| 178 |
+
// Show loading indicator
|
| 179 |
+
document.getElementById('loading').style.display = 'block';
|
| 180 |
+
document.getElementById('error').style.display = 'none';
|
| 181 |
+
document.getElementById('result').style.display = 'none';
|
| 182 |
|
| 183 |
+
const formData = new FormData();
|
| 184 |
+
formData.append('file', file);
|
| 185 |
|
| 186 |
+
try {
|
| 187 |
+
const response = await fetch('/api/convert', {
|
| 188 |
+
method: 'POST',
|
| 189 |
+
body: formData
|
| 190 |
+
});
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
+
const result = await response.json();
|
| 193 |
|
| 194 |
+
// Hide loading indicator
|
| 195 |
+
document.getElementById('loading').style.display = 'none';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
+
if (response.ok) {
|
| 198 |
+
// Show result links
|
| 199 |
+
const markdownLink = document.getElementById('markdownLink');
|
| 200 |
+
const jsonLink = document.getElementById('jsonLink');
|
| 201 |
+
|
| 202 |
+
if (result.markdown_url) {
|
| 203 |
+
markdownLink.href = result.markdown_url;
|
| 204 |
+
markdownLink.download = file.name.replace('.pdf', '.md');
|
| 205 |
+
} else {
|
| 206 |
+
markdownLink.parentElement.style.display = 'none';
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
if (result.json_url) {
|
| 210 |
+
jsonLink.href = result.json_url;
|
| 211 |
+
jsonLink.download = file.name.replace('.pdf', '.json');
|
| 212 |
+
} else {
|
| 213 |
+
jsonLink.parentElement.style.display = 'none';
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
document.getElementById('result').style.display = 'block';
|
| 217 |
+
} else {
|
| 218 |
+
showError(result.error || 'Failed to convert PDF. Please try again.');
|
| 219 |
}
|
| 220 |
+
} catch (error) {
|
| 221 |
+
document.getElementById('loading').style.display = 'none';
|
| 222 |
+
showError('An error occurred. Please try again later.');
|
| 223 |
+
console.error(error);
|
| 224 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
});
|
| 226 |
+
|
| 227 |
+
function showError(message) {
            // Put the message into the #error box and make the box visible.
            const box = document.getElementById('error');
            box.textContent = message;
            box.style.display = 'block';
        }
|
| 232 |
</script>
|
| 233 |
</body>
|
| 234 |
</html>
|
| 235 |
"""
|
| 236 |
|
| 237 |
+
# Route for the main page
|
| 238 |
@app.route('/')
def index():
    """Render the upload page, exposing MAX_FILE_SIZE to the template."""
    template_context = {'MAX_FILE_SIZE': MAX_FILE_SIZE}
    return render_template_string(HTML_TEMPLATE, **template_context)
|
| 241 |
|
| 242 |
+
# Route for the health check
|
| 243 |
+
@app.route('/health')
def health_check():
    """Report service health as JSON.

    The response carries the `magic-pdf --version` output (or a placeholder
    when the command exits non-zero), whether `nvidia-smi` ran successfully,
    which of four expected model files exist under /tmp/models, and a
    timestamp. Any unexpected exception yields a 500 response with
    status "unhealthy".
    """
    try:
        # Get MinerU version
        version_proc = subprocess.run(['magic-pdf', '--version'],
                                      capture_output=True, text=True, check=False)
        if version_proc.returncode == 0:
            version = version_proc.stdout.strip()
        else:
            version = "Error getting version"

        # Check CUDA/GPU availability: a clean nvidia-smi exit counts as "available"
        try:
            subprocess.run(['nvidia-smi'], capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            gpu_available = False
        else:
            gpu_available = True

        # Check if model files exist
        expected_models = {
            "yolo_model": "/tmp/models/MFD/YOLO/yolo_v8_ft.pt",
            "unimernet_model": "/tmp/models/MFR/unimernet/unimernet_small.pth",
            "rapid_table_model": "/tmp/models/table/rapid/rapid_table.pt",
            "doclayout_model": "/tmp/models/layout/doclayout/doclayout_yolo.pt",
        }
        model_dirs = {label: os.path.exists(path) for label, path in expected_models.items()}

        payload = {
            "status": "healthy",
            "version": version,
            "gpu_available": gpu_available,
            "model_dirs": model_dirs,
            "timestamp": time.time()
        }
        return jsonify(payload)
    except Exception as e:
        logging.error(f"Health check error: {str(e)}")
        failure = {
            "status": "unhealthy",
            "error": str(e),
            "timestamp": time.time()
        }
        return jsonify(failure), 500
|
| 280 |
+
|
| 281 |
+
# Route to display GPU status
|
| 282 |
@app.route('/gpu-status')
def gpu_status():
    """Return the text output of `nvidia-smi` as JSON under the key "output".

    When the command is missing or exits with an error, the value is an
    explanatory message instead.
    """
    try:
        report = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT, text=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        return jsonify({"output": f"GPU information not available: {str(e)}"})
    return jsonify({"output": report})
|
| 289 |
|
| 290 |
+
# Route to display magic-pdf help
|
| 291 |
@app.route('/help-output')
def help_output():
    """Return the output of `magic-pdf --help` as JSON under the key "output".

    stderr is merged into stdout. If the command cannot be found or exits
    with an error, an explanatory message is returned in its place.
    """
    help_command = ['magic-pdf', '--help']
    try:
        text = subprocess.check_output(help_command, stderr=subprocess.STDOUT, text=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        text = f"Help information not available: {str(e)}"
    response_body = {"output": text}
    return jsonify(response_body)
|
| 298 |
|
| 299 |
+
# Route for PDF conversion
|
| 300 |
+
@app.route('/api/convert', methods=['POST'])
def convert_pdf():
    """Convert an uploaded PDF with the `magic-pdf` command line tool.

    Expects a multipart form upload in the field ``file``.

    Responses (all JSON):
      * 400 - the field is missing, no file was selected, or the extension
        is not in ALLOWED_EXTENSIONS.
      * 500 - `magic-pdf` exited non-zero ("PDF conversion failed") or an
        exception was raised while processing.
      * 200 - ``success``, ``session_id``, ``markdown_url`` / ``json_url``
        (each None when the corresponding output file was not produced) and
        ``message``.

    The uploaded file is always removed afterwards. The per-session output
    directory is kept only when the request succeeds, so failed requests do
    not accumulate empty directories.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part in the request"}), 400

    file = request.files['file']

    if file.filename == '':
        return jsonify({"error": "No file selected"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": f"Only {', '.join(ALLOWED_EXTENSIONS)} files are allowed"}), 400

    # Bound up front so the cleanup in `finally` never trips over a name that
    # was not assigned because an earlier statement raised.
    input_path = None
    session_dir = None
    succeeded = False

    try:
        # Create a unique session ID
        session_id = str(uuid.uuid4())
        session_dir = os.path.join(OUTPUT_FOLDER, session_id)
        os.makedirs(session_dir, exist_ok=True)

        # Reduce the client-supplied name to a safe stem; the raw name is
        # untrusted and must not take part in any path on disk.
        base_filename = os.path.splitext(os.path.basename(file.filename))[0]
        base_filename = ''.join(c if c.isalnum() or c in ['_', '-', '.'] else '_' for c in base_filename)
        if not base_filename.strip('.'):
            # Names such as ".pdf" leave nothing usable behind.
            base_filename = 'document'

        # allowed_file() already guaranteed this extension is in ALLOWED_EXTENSIONS.
        extension = file.filename.rsplit('.', 1)[1].lower()

        # Save the uploaded file
        input_path = os.path.join(UPLOAD_FOLDER, f"{session_id}_{base_filename}.{extension}")
        file.save(input_path)
        logging.info(f"Saved uploaded file to {input_path}")

        # Get output file paths
        markdown_path = os.path.join(session_dir, f"{base_filename}.md")
        json_path = os.path.join(session_dir, f"{base_filename}.json")

        # Define file URLs
        markdown_url = f"/download/{session_id}/{base_filename}.md"
        json_url = f"/download/{session_id}/{base_filename}.json"

        try:
            # Run MinerU to convert the PDF
            # NOTE(review): these flags (-i, --json, --mfd, and -d taking
            # "gpu"/"cpu") should be confirmed against `magic-pdf --help`
            # for the installed version. The device is also chosen from the
            # presence of a model file rather than from an actual GPU check.
            cmd = [
                "magic-pdf",
                "-i", input_path,
                "-o", markdown_path,
                "--json", json_path,
                "-d", "gpu" if os.path.exists("/tmp/models/MFD/YOLO/yolo_v8_ft.pt") else "cpu",
                "--mfd"
            ]

            process = subprocess.run(cmd, capture_output=True, text=True, check=False)

            if process.returncode != 0:
                logging.error(f"MinerU conversion failed: {process.stderr}")
                return jsonify({
                    "error": "PDF conversion failed",
                    "details": process.stderr,
                    "command": " ".join(cmd)
                }), 500

            # Check if output files exist
            md_exists = os.path.exists(markdown_path)
            json_exists = os.path.exists(json_path)

            if not md_exists:
                logging.warning(f"Markdown file not found at {markdown_path}")

            if not json_exists:
                logging.warning(f"JSON file not found at {json_path}")

            succeeded = True
            return jsonify({
                "success": True,
                "session_id": session_id,
                "markdown_url": markdown_url if md_exists else None,
                "json_url": json_url if json_exists else None,
                "message": "PDF conversion completed"
            })

        except Exception as e:
            logging.error(f"Error during conversion: {str(e)}")
            logging.error(traceback.format_exc())

            return jsonify({
                "error": "Error processing PDF file",
                "details": str(e)
            }), 500

    except Exception as e:
        logging.error(f"General error: {str(e)}")
        logging.error(traceback.format_exc())

        return jsonify({
            "error": "Failed to process request",
            "details": str(e)
        }), 500
    finally:
        # Clean up the input file
        try:
            if input_path and os.path.exists(input_path):
                os.remove(input_path)
        except Exception as e:
            logging.warning(f"Failed to clean up input file: {str(e)}")

        # Nothing can be downloaded from a failed session, so drop its directory.
        if session_dir and not succeeded:
            shutil.rmtree(session_dir, ignore_errors=True)
|
| 398 |
+
|
| 399 |
+
# Route to download converted files
|
| 400 |
+
@app.route('/download/<session_id>/<filename>')
def download_file(session_id, filename):
    """Send a converted file from a session's output directory as an attachment.

    Aborts with 400 when ``session_id`` contains anything other than
    alphanumerics and hyphens, and with 404 when ``filename`` does not name a
    regular file located directly inside that session's directory.
    """
    # Validate the session ID
    if not all(c.isalnum() or c == '-' for c in session_id):
        abort(400, "Invalid session ID")

    base_path = os.path.realpath(os.path.join(OUTPUT_FOLDER, session_id))
    file_path = os.path.realpath(os.path.join(base_path, filename))

    # Validate the filename: after resolving "..", and symlinks, the target
    # must still sit directly in the session directory.
    if os.path.dirname(file_path) != base_path:
        abort(404, "File not found")

    # isfile() rather than exists(): a directory must not reach send_file().
    if not os.path.isfile(file_path):
        abort(404, "File not found")

    return send_file(file_path, as_attachment=True)
|
| 413 |
|
| 414 |
if __name__ == '__main__':
    # Direct execution: serve on all interfaces, port 7860.
    app.run(port=7860, host='0.0.0.0')
|
entrypoint.sh
CHANGED
|
@@ -5,198 +5,123 @@ set -e
|
|
| 5 |
source /opt/mineru_venv/bin/activate
|
| 6 |
|
| 7 |
# Display GPU information
|
|
|
|
|
|
|
| 8 |
echo "Checking NVIDIA GPU status:"
|
| 9 |
-
nvidia-smi
|
| 10 |
|
| 11 |
# Display MinerU version
|
| 12 |
echo "MinerU version:"
|
| 13 |
-
magic-pdf --version
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# Create a samples directory
|
|
|
|
| 16 |
mkdir -p $HOME/.config/magic_pdf
|
| 17 |
-
mkdir -p /app/samples || mkdir -p /tmp/samples
|
| 18 |
-
|
| 19 |
-
# Define the samples directory based on what's writable
|
| 20 |
-
if [ -w "/app/samples" ]; then
|
| 21 |
-
SAMPLES_DIR="/app/samples"
|
| 22 |
-
else
|
| 23 |
-
SAMPLES_DIR="/tmp/samples"
|
| 24 |
-
fi
|
| 25 |
|
| 26 |
# Download a sample PDF for testing if it doesn't exist
|
| 27 |
-
|
| 28 |
-
|
| 29 |
# Download a simple paper from arXiv (using a small one for quick processing)
|
| 30 |
-
wget -q "https://arxiv.org/pdf/2201.08239.pdf" -O "$
|
| 31 |
|
| 32 |
# If that fails, try another source
|
| 33 |
-
if [ ! -s "$
|
| 34 |
-
wget -q "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" -O "$
|
| 35 |
fi
|
| 36 |
|
| 37 |
# If both fail, create a simple PDF with text
|
| 38 |
-
if [ ! -s "$
|
| 39 |
echo "Failed to download sample PDF, creating a simple PDF text file..."
|
| 40 |
echo "This is a sample PDF document for testing MinerU.
|
| 41 |
|
| 42 |
MinerU is a high-quality tool for converting PDF to Markdown and JSON formats.
|
| 43 |
|
| 44 |
-
This file was created for testing purposes." > "$
|
| 45 |
|
| 46 |
# Try using different methods to create a PDF
|
| 47 |
if command -v convert &> /dev/null; then
|
| 48 |
-
convert -size 612x792 -background white -fill black caption:@"$
|
| 49 |
else
|
| 50 |
echo "WARNING: Could not create a sample PDF file automatically."
|
| 51 |
fi
|
| 52 |
fi
|
| 53 |
fi
|
| 54 |
|
| 55 |
-
# Create the magic-pdf.json config file
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
if [ ! -f "$CONFIG_DIR/magic-pdf.json" ]; then
|
| 59 |
-
echo "Creating magic-pdf.json configuration file..."
|
| 60 |
-
cat > "$CONFIG_DIR/magic-pdf.json" << EOF
|
| 61 |
{
|
| 62 |
"device-mode": "gpu",
|
|
|
|
| 63 |
"layout-config": {
|
| 64 |
"model": "doclayout_yolo",
|
|
|
|
| 65 |
"enable": true
|
| 66 |
},
|
| 67 |
"formula-config": {
|
| 68 |
"mfd_model": "yolo_v8_mfd",
|
|
|
|
| 69 |
"mfr_model": "unimernet_small",
|
|
|
|
| 70 |
"enable": true
|
| 71 |
},
|
| 72 |
"table-config": {
|
| 73 |
"model": "rapid_table",
|
|
|
|
| 74 |
"sub_model": "slanet_plus",
|
|
|
|
| 75 |
"enable": true,
|
| 76 |
"max_time": 400
|
| 77 |
}
|
| 78 |
}
|
| 79 |
EOF
|
| 80 |
-
fi
|
| 81 |
-
|
| 82 |
-
# Start the Flask application if it exists, otherwise provide a shell
|
| 83 |
-
if [ -f "/app/app.py" ]; then
|
| 84 |
-
echo "Starting Flask application..."
|
| 85 |
-
python /app/app.py
|
| 86 |
-
else
|
| 87 |
-
echo "No app.py found. Starting a simple server..."
|
| 88 |
-
# Create a simple server that shows MinerU is installed
|
| 89 |
-
TMP_APP_PATH="$HOME/simple_app.py"
|
| 90 |
-
cat > "$TMP_APP_PATH" << 'EOF'
|
| 91 |
-
from flask import Flask, request, jsonify, render_template_string
|
| 92 |
|
| 93 |
-
|
|
|
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
body {
|
| 102 |
-
font-family: Arial, sans-serif;
|
| 103 |
-
max-width: 800px;
|
| 104 |
-
margin: 0 auto;
|
| 105 |
-
padding: 20px;
|
| 106 |
-
}
|
| 107 |
-
.container {
|
| 108 |
-
background-color: #f9f9f9;
|
| 109 |
-
padding: 20px;
|
| 110 |
-
border-radius: 8px;
|
| 111 |
-
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 112 |
-
}
|
| 113 |
-
h1 {
|
| 114 |
-
color: #2c3e50;
|
| 115 |
-
}
|
| 116 |
-
pre {
|
| 117 |
-
background-color: #f1f1f1;
|
| 118 |
-
padding: 10px;
|
| 119 |
-
border-radius: 4px;
|
| 120 |
-
overflow-x: auto;
|
| 121 |
-
}
|
| 122 |
-
.command {
|
| 123 |
-
font-family: monospace;
|
| 124 |
-
background-color: #eee;
|
| 125 |
-
padding: 5px;
|
| 126 |
-
border-radius: 3px;
|
| 127 |
-
}
|
| 128 |
-
</style>
|
| 129 |
-
</head>
|
| 130 |
-
<body>
|
| 131 |
-
<div class="container">
|
| 132 |
-
<h1>MinerU PDF Processing Service</h1>
|
| 133 |
-
<p>This Space provides PDF processing capabilities using MinerU.</p>
|
| 134 |
-
|
| 135 |
-
<h2>GPU Status</h2>
|
| 136 |
-
<pre id="gpuStatus">Loading...</pre>
|
| 137 |
-
|
| 138 |
-
<h2>Available Commands</h2>
|
| 139 |
-
<p>MinerU provides the following commands:</p>
|
| 140 |
-
<p><span class="command">magic-pdf</span> - Process PDF documents</p>
|
| 141 |
-
|
| 142 |
-
<h2>Help Output</h2>
|
| 143 |
-
<pre id="helpOutput">Loading...</pre>
|
| 144 |
-
</div>
|
| 145 |
-
|
| 146 |
-
<script>
|
| 147 |
-
// Fetch GPU status
|
| 148 |
-
fetch('/gpu-status')
|
| 149 |
-
.then(response => response.json())
|
| 150 |
-
.then(data => {
|
| 151 |
-
document.getElementById('gpuStatus').textContent = data.output;
|
| 152 |
-
})
|
| 153 |
-
.catch(error => {
|
| 154 |
-
document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
|
| 155 |
-
});
|
| 156 |
-
|
| 157 |
-
// Fetch help output
|
| 158 |
-
fetch('/help-output')
|
| 159 |
-
.then(response => response.json())
|
| 160 |
-
.then(data => {
|
| 161 |
-
document.getElementById('helpOutput').textContent = data.output;
|
| 162 |
-
})
|
| 163 |
-
.catch(error => {
|
| 164 |
-
document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
|
| 165 |
-
});
|
| 166 |
-
</script>
|
| 167 |
-
</body>
|
| 168 |
-
</html>
|
| 169 |
-
"""
|
| 170 |
-
|
| 171 |
-
@app.route('/')
|
| 172 |
-
def index():
|
| 173 |
-
return render_template_string(HTML_TEMPLATE)
|
| 174 |
-
|
| 175 |
-
@app.route('/gpu-status')
|
| 176 |
-
def gpu_status():
|
| 177 |
-
import subprocess
|
| 178 |
-
try:
|
| 179 |
-
output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT).decode('utf-8')
|
| 180 |
-
except subprocess.CalledProcessError as e:
|
| 181 |
-
output = f"Error running nvidia-smi: {e.output.decode('utf-8')}"
|
| 182 |
-
except FileNotFoundError:
|
| 183 |
-
output = "nvidia-smi command not found. GPU may not be available."
|
| 184 |
-
return jsonify({"output": output})
|
| 185 |
-
|
| 186 |
-
@app.route('/help-output')
|
| 187 |
-
def help_output():
|
| 188 |
-
import subprocess
|
| 189 |
-
try:
|
| 190 |
-
output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT).decode('utf-8')
|
| 191 |
-
except subprocess.CalledProcessError as e:
|
| 192 |
-
output = f"Error running magic-pdf --help: {e.output.decode('utf-8')}"
|
| 193 |
-
except FileNotFoundError:
|
| 194 |
-
output = "magic-pdf command not found. MinerU may not be installed correctly."
|
| 195 |
-
return jsonify({"output": output})
|
| 196 |
-
|
| 197 |
-
if __name__ == '__main__':
|
| 198 |
-
app.run(host='0.0.0.0', port=7860)
|
| 199 |
-
EOF
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
| 5 |
source /opt/mineru_venv/bin/activate

# Display GPU information and remember the outcome, so the MinerU config
# written further down matches what was actually detected.
echo "===== Application Startup at $(date +'%Y-%m-%d %H:%M:%S') ====="
echo ""
echo "Checking NVIDIA GPU status:"
if nvidia-smi; then
    # NOTE(review): "cuda" is the GPU value documented for MinerU's
    # device-mode setting; confirm against the installed magic-pdf version.
    DEVICE_MODE="cuda"
else
    echo "No NVIDIA GPU detected, running in CPU mode"
    DEVICE_MODE="cpu"
fi

# Display MinerU version
echo "MinerU version:"
magic-pdf --version || echo "Error: MinerU magic-pdf not found"

# Create directories for models if they don't exist
mkdir -p /tmp/models/MFD/YOLO
mkdir -p /tmp/models/MFR/unimernet
mkdir -p /tmp/models/table/rapid
mkdir -p /tmp/models/layout/doclayout

# Check if model files exist, if not, download them.
# MODEL_FILES and MODEL_URLS are parallel arrays: entry i of one belongs to
# entry i of the other.
echo "Checking model files..."
MODEL_FILES=(
    "/tmp/models/MFD/YOLO/yolo_v8_ft.pt"
    "/tmp/models/MFD/YOLO/yolo_v8_mfd.pt"
    "/tmp/models/MFR/unimernet/unimernet_small.pth"
    "/tmp/models/table/rapid/rapid_table.pt"
    "/tmp/models/table/rapid/slanet_plus.pt"
    "/tmp/models/layout/doclayout/doclayout_yolo.pt"
)

MODELS_REPO="https://huggingface.co/opendatalab/MinerU/resolve/main/models"
# NOTE(review): the first two entries point at the same URL, so yolo_v8_ft.pt
# is a copy of yolo_v8_mfd.pt -- confirm this is intended.
MODEL_URLS=(
    "${MODELS_REPO}/mfd/yolo_v8_mfd.pt"
    "${MODELS_REPO}/mfd/yolo_v8_mfd.pt"
    "${MODELS_REPO}/mfr/unimernet_small.pth"
    "${MODELS_REPO}/table/rapid_table.pt"
    "${MODELS_REPO}/table/slanet_plus.pt"
    "${MODELS_REPO}/layout/doclayout_yolo.pt"
)

for i in "${!MODEL_FILES[@]}"; do
    target="${MODEL_FILES[$i]}"
    # -s, not -f: an empty file left by an interrupted download is not a model.
    if [ -s "$target" ]; then
        echo "$target already exists."
        continue
    fi

    echo "Downloading $target..."
    # Download next to the target and rename only on success, so a failed
    # transfer never leaves something that looks like a finished model.
    if wget -q "${MODEL_URLS[$i]}" -O "$target.part" && [ -s "$target.part" ]; then
        mv "$target.part" "$target"
    else
        rm -f "$target.part" "$target"
        echo "Failed to download $target"
    fi
done

# Create a samples directory
SAMPLES_DIR="$HOME/samples"
SAMPLE_PDF="$SAMPLES_DIR/sample.pdf"
mkdir -p "$SAMPLES_DIR"
mkdir -p "$HOME/.config/magic_pdf"

# Download a sample PDF for testing if it doesn't exist
echo "Downloading sample PDF for testing..."
if [ ! -s "$SAMPLE_PDF" ]; then
    # Download a simple paper from arXiv (using a small one for quick processing)
    wget -q "https://arxiv.org/pdf/2201.08239.pdf" -O "$SAMPLE_PDF" || true

    # If that fails, try another source
    if [ ! -s "$SAMPLE_PDF" ]; then
        wget -q "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" -O "$SAMPLE_PDF" || true
    fi

    # If both fail, create a simple PDF with text
    if [ ! -s "$SAMPLE_PDF" ]; then
        echo "Failed to download sample PDF, creating a simple PDF text file..."
        echo "This is a sample PDF document for testing MinerU.

MinerU is a high-quality tool for converting PDF to Markdown and JSON formats.

This file was created for testing purposes." > "$SAMPLES_DIR/sample.txt"

        # Try using different methods to create a PDF
        if command -v convert &> /dev/null; then
            # The sample is optional: a failing conversion must not stop the
            # service from starting under `set -e`.
            convert -size 612x792 -background white -fill black caption:@"$SAMPLES_DIR/sample.txt" "$SAMPLE_PDF" \
                || echo "WARNING: Could not create a sample PDF file automatically."
        else
            echo "WARNING: Could not create a sample PDF file automatically."
        fi
    fi
fi

# Create the magic-pdf.json config file with paths to the downloaded models.
# The heredoc delimiter is unquoted on purpose so ${DEVICE_MODE} is expanded.
echo "Creating magic-pdf.json configuration file..."
cat > "$HOME/.config/magic_pdf/magic-pdf.json" << EOF
{
    "device-mode": "${DEVICE_MODE}",
    "models-path": "/tmp/models",
    "layout-config": {
        "model": "doclayout_yolo",
        "model_path": "/tmp/models/layout/doclayout/doclayout_yolo.pt",
        "enable": true
    },
    "formula-config": {
        "mfd_model": "yolo_v8_mfd",
        "mfd_model_path": "/tmp/models/MFD/YOLO/yolo_v8_mfd.pt",
        "mfr_model": "unimernet_small",
        "mfr_model_path": "/tmp/models/MFR/unimernet/unimernet_small.pth",
        "enable": true
    },
    "table-config": {
        "model": "rapid_table",
        "model_path": "/tmp/models/table/rapid/rapid_table.pt",
        "sub_model": "slanet_plus",
        "sub_model_path": "/tmp/models/table/rapid/slanet_plus.pt",
        "enable": true,
        "max_time": 400
    }
}
EOF

# Also create it in the home directory as some versions of MinerU look for it there
cp "$HOME/.config/magic_pdf/magic-pdf.json" "$HOME/magic-pdf.json"

# List model files to verify they're present
echo "Verifying model files:"
ls -la /tmp/models/MFD/YOLO/ || echo "YOLO models directory issue"
ls -la /tmp/models/MFR/unimernet/ || echo "UniMERNet models directory issue"
ls -la /tmp/models/table/rapid/ || echo "Table models directory issue"
ls -la /tmp/models/layout/doclayout/ || echo "Layout models directory issue"

# Start the Flask application. `exec` replaces this shell, so the Python
# process receives the container's stop signals directly.
echo "Starting Flask application..."
exec python /app/app.py
|
requirements.txt
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
flask==2.3.3
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
sentencepiece>=0.1.99
|
| 5 |
requests>=2.31.0
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
| 1 |
flask==2.3.3
werkzeug==2.3.7
flask-cors==4.0.0
requests>=2.31.0
pillow>=9.4.0
numpy>=1.24.0
wget>=3.2
magic-pdf[full]>=1.3.0
# uuid is part of the Python standard library; the PyPI "uuid" distribution is
# an obsolete backport and must not be installed alongside it.
python-magic>=0.4.27
|