marcosremar2 commited on
Commit
128728a
·
1 Parent(s): a7cd086

Fix MinerU PDF API by adding model download and explicit configuration

Browse files
Files changed (5) hide show
  1. .gitattributes +6 -32
  2. Dockerfile +27 -4
  3. app.py +303 -593
  4. entrypoint.sh +69 -144
  5. requirements.txt +8 -8
.gitattributes CHANGED
@@ -1,35 +1,9 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.pth filter=lfs diff=lfs merge=lfs -text
2
+ *.pt filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
4
  *.model filter=lfs diff=lfs merge=lfs -text
 
 
 
5
  *.onnx filter=lfs diff=lfs merge=lfs -text
6
+ *.pdf filter=lfs diff=lfs merge=lfs -text
7
+ *.png filter=lfs diff=lfs merge=lfs -text
8
+ *.jpg filter=lfs diff=lfs merge=lfs -text
9
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile CHANGED
@@ -44,7 +44,7 @@ RUN pip install --upgrade pip
44
  RUN git clone https://github.com/opendatalab/MinerU.git /tmp/MinerU
45
 
46
  # Install required packages
47
- RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
48
 
49
  # Install MinerU with all features
50
  WORKDIR /tmp/MinerU
@@ -57,14 +57,37 @@ RUN pip install --no-cache-dir flask==2.3.3 flask-cors==4.0.0 werkzeug==2.3.7
57
  RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output
58
  RUN mkdir -p /tmp/samples
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  # Create a non-root user for Hugging Face Spaces
61
  # This is critical for permissions on HF Spaces
62
  RUN useradd -m -u 1000 user
63
  RUN mkdir -p /app/samples && chown -R user:user /app
64
 
65
- # Download model weights
66
- RUN echo "Downloading MinerU model weights..."
67
- # This step will automatically download model weights during the first run
68
 
69
  # Copy the application files
70
  WORKDIR /app
 
44
  RUN git clone https://github.com/opendatalab/MinerU.git /tmp/MinerU
45
 
46
  # Install required packages
47
+ RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
48
 
49
  # Install MinerU with all features
50
  WORKDIR /tmp/MinerU
 
57
  RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output
58
  RUN mkdir -p /tmp/samples
59
 
60
+ # Create models directory structure
61
+ RUN mkdir -p /tmp/models/MFD/YOLO
62
+ RUN mkdir -p /tmp/models/MFR/unimernet
63
+ RUN mkdir -p /tmp/models/table/rapid
64
+ RUN mkdir -p /tmp/models/layout/doclayout
65
+
66
+ # Download model weights
67
+ RUN echo "Downloading MinerU model weights..."
68
+ WORKDIR /tmp/models
69
+
70
+ # Download the YOLO model for formula detection
71
+ RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfd/yolo_v8_mfd.pt -O /tmp/models/MFD/YOLO/yolo_v8_ft.pt || echo "Failed to download yolo_v8_ft.pt"
72
+ RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfd/yolo_v8_mfd.pt -O /tmp/models/MFD/YOLO/yolo_v8_mfd.pt || echo "Failed to download yolo_v8_mfd.pt"
73
+
74
+ # Download the UniMERNet model for formula recognition
75
+ RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfr/unimernet_small.pth -O /tmp/models/MFR/unimernet/unimernet_small.pth || echo "Failed to download unimernet_small.pth"
76
+
77
+ # Download the Rapid Table detection model
78
+ RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/table/rapid_table.pt -O /tmp/models/table/rapid/rapid_table.pt || echo "Failed to download rapid_table.pt"
79
+ RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/table/slanet_plus.pt -O /tmp/models/table/rapid/slanet_plus.pt || echo "Failed to download slanet_plus.pt"
80
+
81
+ # Download the DocLayout YOLO model
82
+ RUN wget -q https://huggingface.co/opendatalab/MinerU/resolve/main/models/layout/doclayout_yolo.pt -O /tmp/models/layout/doclayout/doclayout_yolo.pt || echo "Failed to download doclayout_yolo.pt"
83
+
84
  # Create a non-root user for Hugging Face Spaces
85
  # This is critical for permissions on HF Spaces
86
  RUN useradd -m -u 1000 user
87
  RUN mkdir -p /app/samples && chown -R user:user /app
88
 
89
+ # Set permissions for model files
90
+ RUN chown -R user:user /tmp/models
 
91
 
92
  # Copy the application files
93
  WORKDIR /app
app.py CHANGED
@@ -1,705 +1,415 @@
1
- from flask import Flask, request, jsonify, render_template_string, redirect, url_for, send_from_directory
 
2
  import os
 
 
 
3
  import subprocess
4
  import tempfile
5
- import uuid
6
  import json
7
- import shutil
8
  import time
9
- import platform
10
- import sys
11
- from werkzeug.utils import secure_filename
12
- from flask_cors import CORS # Add CORS support
13
 
14
  app = Flask(__name__)
15
- CORS(app) # Enable CORS for all routes
 
 
 
16
 
17
- # Use user home directory for better permission handling
18
- USER_HOME = os.path.expanduser("~")
19
- UPLOAD_FOLDER = os.path.join(USER_HOME, 'pdf_uploads')
20
- OUTPUT_FOLDER = os.path.join(USER_HOME, 'pdf_output')
 
21
 
22
- # Create upload and output directories
23
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
24
  os.makedirs(OUTPUT_FOLDER, exist_ok=True)
25
 
26
- # Version information
27
- APP_VERSION = "1.1.0"
 
28
 
 
29
  HTML_TEMPLATE = """
30
  <!DOCTYPE html>
31
  <html>
32
  <head>
33
- <title>MinerU PDF Processing</title>
34
  <style>
35
  body {
36
- font-family: Arial, sans-serif;
37
- max-width: 900px;
 
 
38
  margin: 0 auto;
39
  padding: 20px;
40
- line-height: 1.6;
 
 
 
 
 
41
  }
42
  .container {
43
- background-color: #f9f9f9;
44
  padding: 20px;
45
  border-radius: 8px;
46
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
47
- margin-bottom: 20px;
48
  }
49
- h1 {
50
- color: #2c3e50;
 
 
 
51
  }
52
- pre {
53
- background-color: #f1f1f1;
54
  padding: 10px;
55
- border-radius: 4px;
56
- overflow-x: auto;
57
- max-height: 300px;
58
- overflow-y: auto;
59
- }
60
- .command {
61
- font-family: monospace;
62
- background-color: #eee;
63
- padding: 5px;
64
  border-radius: 3px;
 
 
65
  }
66
- .upload-form {
67
  margin: 20px 0;
68
- padding: 15px;
69
- border: 1px solid #ddd;
70
- border-radius: 8px;
71
  }
72
- .btn {
73
- background-color: #4CAF50;
 
 
 
 
 
 
74
  color: white;
75
- padding: 8px 16px;
76
  border: none;
77
  border-radius: 4px;
78
  cursor: pointer;
79
- font-size: 16px;
80
  }
81
- .btn:hover {
82
- background-color: #45a049;
83
  }
84
- .loading {
85
- display: none;
86
- color: #666;
87
- margin-top: 10px;
88
- }
89
- .result-section {
90
  margin-top: 20px;
 
91
  }
92
- .tab {
93
- overflow: hidden;
94
- border: 1px solid #ccc;
95
- background-color: #f1f1f1;
96
- margin-top: 20px;
97
  }
98
- .tab button {
99
- background-color: inherit;
100
- float: left;
101
- border: none;
102
- outline: none;
103
- cursor: pointer;
104
- padding: 10px 16px;
105
- transition: 0.3s;
106
  }
107
- .tab button:hover {
108
- background-color: #ddd;
 
109
  }
110
- .tab button.active {
111
- background-color: #ccc;
112
- }
113
- .tabcontent {
114
- display: none;
115
- padding: 6px 12px;
116
- border: 1px solid #ccc;
117
- border-top: none;
118
- max-height: 500px;
119
- overflow-y: auto;
120
- white-space: pre-wrap;
121
  }
122
  </style>
123
  </head>
124
  <body>
 
 
125
  <div class="container">
126
- <h1>MinerU PDF Processing Service</h1>
127
- <p>This service uses MinerU to convert PDF documents to Markdown and JSON formats.</p>
128
-
129
- <h2>GPU Status</h2>
130
- <pre id="gpuStatus">Loading...</pre>
131
-
132
- <div class="upload-form">
133
- <h2>Convert PDF File</h2>
134
- <form action="/convert" method="post" enctype="multipart/form-data" id="uploadForm">
135
- <input type="file" name="file" accept=".pdf" required>
136
- <button type="submit" class="btn">Convert PDF</button>
137
- </form>
138
- <div id="loadingIndicator" class="loading">Processing PDF file... This may take a minute.</div>
139
  </div>
140
-
141
- <div class="result-section" id="resultSection" style="display: none;">
142
- <h2>Conversion Results</h2>
143
- <div id="resultInfo"></div>
144
-
145
- <div class="tab">
146
- <button class="tablinks" onclick="openTab(event, 'Markdown')" id="defaultOpen">Markdown</button>
147
- <button class="tablinks" onclick="openTab(event, 'JSON')">JSON</button>
148
- <button class="tablinks" onclick="openTab(event, 'Log')">Processing Log</button>
149
- </div>
150
-
151
- <div id="Markdown" class="tabcontent">
152
- <pre id="markdownContent"></pre>
153
- <a id="downloadMarkdown" class="btn" style="margin-top: 10px;">Download Markdown</a>
154
- </div>
155
-
156
- <div id="JSON" class="tabcontent">
157
- <pre id="jsonContent"></pre>
158
- <a id="downloadJson" class="btn" style="margin-top: 10px;">Download JSON</a>
159
- </div>
160
-
161
- <div id="Log" class="tabcontent">
162
- <pre id="logContent"></pre>
163
- </div>
164
  </div>
165
 
166
- <h2>Available Commands</h2>
167
- <p>MinerU provides the following commands:</p>
168
- <p><span class="command">magic-pdf</span> - Process PDF documents</p>
 
 
 
 
 
169
 
170
- <h2>Help Output</h2>
171
- <pre id="helpOutput">Loading...</pre>
 
 
 
 
 
172
  </div>
173
-
174
  <script>
175
- // Fetch GPU status
176
- fetch('/gpu-status')
177
- .then(response => response.json())
178
- .then(data => {
179
- document.getElementById('gpuStatus').textContent = data.output;
180
- })
181
- .catch(error => {
182
- document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
183
- });
184
 
185
- // Fetch help output
186
- fetch('/help-output')
187
- .then(response => response.json())
188
- .then(data => {
189
- document.getElementById('helpOutput').textContent = data.output;
190
- })
191
- .catch(error => {
192
- document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
193
- });
194
 
195
- // Tab functionality
196
- function openTab(evt, tabName) {
197
- var i, tabcontent, tablinks;
198
- tabcontent = document.getElementsByClassName("tabcontent");
199
- for (i = 0; i < tabcontent.length; i++) {
200
- tabcontent[i].style.display = "none";
201
- }
202
- tablinks = document.getElementsByClassName("tablinks");
203
- for (i = 0; i < tablinks.length; i++) {
204
- tablinks[i].className = tablinks[i].className.replace(" active", "");
205
  }
206
- document.getElementById(tabName).style.display = "block";
207
- evt.currentTarget.className += " active";
208
- }
209
-
210
- // Set up form submission
211
- document.getElementById('uploadForm').addEventListener('submit', function(e) {
212
- e.preventDefault();
213
 
214
- const loadingIndicator = document.getElementById('loadingIndicator');
215
- loadingIndicator.style.display = 'block';
 
 
216
 
217
- const resultSection = document.getElementById('resultSection');
218
- resultSection.style.display = 'none';
 
 
219
 
220
- const formData = new FormData(this);
 
221
 
222
- fetch('/convert', {
223
- method: 'POST',
224
- body: formData
225
- })
226
- .then(response => response.json())
227
- .then(data => {
228
- loadingIndicator.style.display = 'none';
229
- resultSection.style.display = 'block';
230
 
231
- document.getElementById('resultInfo').textContent = data.message;
232
 
233
- // Handle Markdown content
234
- if (data.markdown) {
235
- document.getElementById('markdownContent').textContent = data.markdown;
236
- const downloadMarkdown = document.getElementById('downloadMarkdown');
237
- downloadMarkdown.href = data.markdown_url;
238
- downloadMarkdown.download = data.base_filename + '.md';
239
- }
240
 
241
- // Handle JSON content
242
- if (data.json) {
243
- document.getElementById('jsonContent').textContent = JSON.stringify(data.json, null, 2);
244
- const downloadJson = document.getElementById('downloadJson');
245
- downloadJson.href = data.json_url;
246
- downloadJson.download = data.base_filename + '.json';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  }
248
-
249
- // Handle log content
250
- if (data.log) {
251
- document.getElementById('logContent').textContent = data.log;
252
- }
253
-
254
- // Open the markdown tab by default
255
- document.getElementById('defaultOpen').click();
256
- })
257
- .catch(error => {
258
- loadingIndicator.style.display = 'none';
259
- alert('Error: ' + error.message);
260
- });
261
  });
 
 
 
 
 
 
262
  </script>
263
  </body>
264
  </html>
265
  """
266
 
 
267
  @app.route('/')
268
  def index():
269
- return render_template_string(HTML_TEMPLATE)
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  @app.route('/gpu-status')
272
  def gpu_status():
273
- import subprocess
274
  try:
275
- output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT).decode('utf-8')
276
- except subprocess.CalledProcessError as e:
277
- output = f"Error running nvidia-smi: {e.output.decode('utf-8')}"
278
- except FileNotFoundError:
279
- output = "nvidia-smi command not found. GPU may not be available."
280
  return jsonify({"output": output})
281
 
 
282
  @app.route('/help-output')
283
  def help_output():
284
- import subprocess
285
  try:
286
- output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT).decode('utf-8')
287
- except subprocess.CalledProcessError as e:
288
- output = f"Error running magic-pdf --help: {e.output.decode('utf-8')}"
289
- except FileNotFoundError:
290
- output = "magic-pdf command not found. MinerU may not be installed correctly."
291
  return jsonify({"output": output})
292
 
293
- @app.route('/convert', methods=['POST'])
 
294
  def convert_pdf():
295
  if 'file' not in request.files:
296
- return jsonify({"error": "No file part"}), 400
297
 
298
  file = request.files['file']
299
- if file.filename == '':
300
- return jsonify({"error": "No selected file"}), 400
301
-
302
- if not file.filename.lower().endswith('.pdf'):
303
- return jsonify({"error": "File must be a PDF"}), 400
304
-
305
- # Generate a unique ID for this conversion
306
- job_id = str(uuid.uuid4())
307
- job_dir = os.path.join(OUTPUT_FOLDER, job_id)
308
- os.makedirs(job_dir, exist_ok=True)
309
 
310
- # Save the uploaded file
311
- filename = secure_filename(file.filename)
312
- base_filename = os.path.splitext(filename)[0]
313
- pdf_path = os.path.join(job_dir, filename)
314
- file.save(pdf_path)
315
-
316
- # Run magic-pdf on the file
317
- output_dir = os.path.join(job_dir, "output")
318
- os.makedirs(output_dir, exist_ok=True)
319
 
320
- log_file = os.path.join(job_dir, "conversion.log")
 
321
 
322
  try:
323
- # Define the default config dictionary first
324
- default_config = {
325
- "device-mode": "cpu",
326
- "layout-config": {
327
- "model": "doclayout_yolo",
328
- "enable": True
329
- },
330
- "formula-config": {
331
- "mfd_model": "yolo_v8_mfd",
332
- "mfr_model": "unimernet_small",
333
- "enable": True
334
- },
335
- "table-config": {
336
- "model": "rapid_table",
337
- "sub_model": "slanet_plus",
338
- "enable": True,
339
- "max_time": 400
340
- }
341
- }
342
 
343
- # Create the magic-pdf.json configuration file in .config if it doesn't exist
344
- config_dir = os.path.expanduser("~/.config/magic_pdf")
345
- os.makedirs(config_dir, exist_ok=True)
346
- config_file = os.path.join(config_dir, "magic-pdf.json")
347
 
348
- if not os.path.exists(config_file):
349
- with open(config_file, 'w') as f:
350
- json.dump(default_config, f, indent=2)
351
 
352
- # Also create the config in the home directory as fallback
353
- home_config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
354
- if not os.path.exists(home_config_file):
355
- with open(home_config_file, 'w') as f:
356
- json.dump(default_config, f, indent=2)
357
-
358
- # Add a small delay to ensure config file is written before magic-pdf runs
359
- time.sleep(0.5)
360
-
361
- # Use magic-pdf to convert the PDF to Markdown and JSON
362
- cmd = [
363
- 'magic-pdf',
364
- '--path', pdf_path,
365
- '--output-dir', output_dir
366
- ]
367
 
368
- # Run the command and capture output
369
- with open(log_file, 'w') as f:
370
- process = subprocess.Popen(
371
- cmd,
372
- stdout=subprocess.PIPE,
373
- stderr=subprocess.STDOUT,
374
- text=True,
375
- bufsize=1
376
- )
377
-
378
- # Write process output to log file in real-time
379
- for line in process.stdout:
380
- f.write(line)
381
- f.flush()
382
 
383
- process.wait()
384
 
385
  if process.returncode != 0:
 
386
  return jsonify({
387
- "error": f"PDF conversion failed with code {process.returncode}",
388
- "log": open(log_file, 'r').read()
 
389
  }), 500
390
-
391
- # Get the generated markdown and JSON
392
- markdown_file = os.path.join(output_dir, f"{base_filename}.md")
393
- json_file = os.path.join(output_dir, f"{base_filename}.json")
394
-
395
- # Check if the output files exist
396
- markdown_content = ""
397
- json_content = {}
398
-
399
- if os.path.exists(markdown_file):
400
- with open(markdown_file, 'r', encoding='utf-8') as f:
401
- markdown_content = f.read()
402
-
403
- if os.path.exists(json_file):
404
- with open(json_file, 'r', encoding='utf-8') as f:
405
- json_content = json.load(f)
406
-
407
- # Read the log file
408
- with open(log_file, 'r') as f:
409
- log_content = f.read()
410
-
411
- # Copy the output files to a location accessible for download
412
- output_markdown = os.path.join(job_dir, f"{base_filename}.md")
413
- output_json = os.path.join(job_dir, f"{base_filename}.json")
414
-
415
- if os.path.exists(markdown_file):
416
- shutil.copy(markdown_file, output_markdown)
417
-
418
- if os.path.exists(json_file):
419
- shutil.copy(json_file, output_json)
420
-
421
- # Return the conversion results
422
- return jsonify({
423
- "message": f"PDF '{filename}' converted successfully",
424
- "markdown": markdown_content,
425
- "json": json_content,
426
- "log": log_content,
427
- "base_filename": base_filename,
428
- "job_id": job_id,
429
- "markdown_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.md"),
430
- "json_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.json")
431
- })
432
-
433
- except Exception as e:
434
- import traceback
435
- error_details = traceback.format_exc()
436
- return jsonify({
437
- "error": f"Failed to convert PDF: {str(e)}",
438
- "details": error_details
439
- }), 500
440
-
441
- @app.route('/download/<job_id>/<filename>')
442
- def download_file(job_id, filename):
443
- job_dir = os.path.join(OUTPUT_FOLDER, job_id)
444
- return send_from_directory(job_dir, filename)
445
-
446
- # Add a sample PDF for testing
447
- @app.route('/sample')
448
- def add_sample():
449
- try:
450
- # Create a tiny text-only PDF using Podofoimpose (if available) or other method
451
- sample_dir = os.path.join(UPLOAD_FOLDER, 'sample')
452
- os.makedirs(sample_dir, exist_ok=True)
453
- sample_path = os.path.join(sample_dir, 'sample.pdf')
454
-
455
- # Use simple text for the sample
456
- with open(os.path.join(sample_dir, 'sample.txt'), 'w') as f:
457
- f.write("This is a sample PDF for testing MinerU.\n\nIt contains simple text to demonstrate the PDF to Markdown and JSON conversion capabilities.")
458
-
459
- # Try to convert the text to PDF if possible
460
- try:
461
- subprocess.run(['convert', '-size', '612x792', 'caption:@' + os.path.join(sample_dir, 'sample.txt'), sample_path])
462
- except:
463
- # If ImageMagick's convert fails, try another approach
464
- return jsonify({"error": "Could not create sample PDF. Please upload your own PDF file."}), 500
465
-
466
- return jsonify({"message": "Sample PDF created", "path": sample_path})
467
- except Exception as e:
468
- return jsonify({"error": f"Failed to create sample PDF: {str(e)}"}), 500
469
-
470
- @app.route('/health')
471
- def health_check():
472
- """
473
- Health check endpoint for monitoring.
474
- Returns basic information about the service status.
475
- """
476
- try:
477
- # Check if magic-pdf command exists
478
- has_magic_pdf = False
479
- try:
480
- subprocess.run(['magic-pdf', '--version'], capture_output=True, check=False)
481
- has_magic_pdf = True
482
- except FileNotFoundError:
483
- pass
484
-
485
- # Get runtime information
486
- health_info = {
487
- 'status': 'healthy',
488
- 'version': APP_VERSION,
489
- 'environment': {
490
- 'python_version': platform.python_version(),
491
- 'platform': platform.platform(),
492
- 'processor': platform.processor()
493
- },
494
- 'configuration': {
495
- 'upload_folder_exists': os.path.exists(UPLOAD_FOLDER),
496
- 'output_folder_exists': os.path.exists(OUTPUT_FOLDER),
497
- 'magic_pdf_installed': has_magic_pdf
498
- }
499
- }
500
-
501
- return jsonify(health_info)
502
- except Exception as e:
503
- return jsonify({
504
- 'status': 'unhealthy',
505
- 'error': str(e)
506
- }), 500
507
-
508
- @app.route('/api/convert', methods=['POST'])
509
- def api_convert_pdf():
510
- """
511
- API endpoint for programmatic access to PDF conversion.
512
-
513
- Request:
514
- - POST request with 'file' field containing PDF file
515
-
516
- Response:
517
- - JSON with conversion results
518
- """
519
- # Validate request
520
- if 'file' not in request.files:
521
- return jsonify({
522
- 'success': False,
523
- 'error': 'No file provided. Please upload a PDF file.'
524
- }), 400
525
-
526
- file = request.files['file']
527
-
528
- if file.filename == '':
529
- return jsonify({
530
- 'success': False,
531
- 'error': 'No file selected. Please select a PDF file.'
532
- }), 400
533
-
534
- # Check if the file is a PDF
535
- if not file.filename.lower().endswith('.pdf'):
536
- return jsonify({
537
- 'success': False,
538
- 'error': 'Invalid file format. Please upload a PDF file.'
539
- }), 400
540
-
541
- # Generate a job ID
542
- job_id = str(uuid.uuid4())
543
-
544
- # Create job directory
545
- job_dir = os.path.join(OUTPUT_FOLDER, job_id)
546
- os.makedirs(job_dir, exist_ok=True)
547
-
548
- # Save the uploaded file
549
- filename = secure_filename(file.filename)
550
- base_filename, _ = os.path.splitext(filename)
551
-
552
- pdf_path = os.path.join(job_dir, filename)
553
- file.save(pdf_path)
554
-
555
- try:
556
- # Define the default config dictionary first
557
- default_config = {
558
- "device-mode": "cpu",
559
- "layout-config": {
560
- "model": "doclayout_yolo",
561
- "enable": True
562
- },
563
- "formula-config": {
564
- "mfd_model": "yolo_v8_mfd",
565
- "mfr_model": "unimernet_small",
566
- "enable": True
567
- },
568
- "table-config": {
569
- "model": "rapid_table",
570
- "sub_model": "slanet_plus",
571
- "enable": True,
572
- "max_time": 400
573
- }
574
- }
575
-
576
- # Create the magic-pdf.json configuration file in .config if it doesn't exist
577
- config_dir = os.path.expanduser("~/.config/magic_pdf")
578
- os.makedirs(config_dir, exist_ok=True)
579
- config_file = os.path.join(config_dir, "magic-pdf.json")
580
-
581
- if not os.path.exists(config_file):
582
- with open(config_file, 'w') as f:
583
- json.dump(default_config, f, indent=2)
584
-
585
- # Also create the config in the home directory as fallback
586
- home_config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
587
- if not os.path.exists(home_config_file):
588
- with open(home_config_file, 'w') as f:
589
- json.dump(default_config, f, indent=2)
590
-
591
- # Add a small delay to ensure config file is written before magic-pdf runs
592
- time.sleep(0.5)
593
-
594
- # Log the conversion process
595
- log_file = os.path.join(job_dir, "conversion.log")
596
- with open(log_file, "w") as log:
597
- # Run the MinerU magic-pdf command with correct parameters
598
- command = ["magic-pdf", "--path", pdf_path, "--output-dir", job_dir]
599
- process = subprocess.Popen(
600
- command,
601
- stdout=subprocess.PIPE,
602
- stderr=subprocess.STDOUT,
603
- universal_newlines=True
604
- )
605
 
606
- output = []
607
- for line in process.stdout:
608
- output.append(line)
609
- log.write(line)
610
- log.flush()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
 
612
- process.wait()
613
- exit_code = process.returncode
614
-
615
- if exit_code != 0:
616
- error_message = ''.join(output) if output else "Unknown error during PDF conversion"
617
  return jsonify({
618
- 'success': False,
619
- 'error': 'PDF conversion failed. Please check the log for details.',
620
- 'log': error_message,
621
- 'exit_code': exit_code
622
  }), 500
623
-
624
- # Check for output files
625
- markdown_file = os.path.join(job_dir, f"{base_filename}.md")
626
- json_file = os.path.join(job_dir, f"{base_filename}.json")
627
-
628
- # If files don't exist in the job directory, check the same directory as the PDF
629
- pdf_dir = os.path.dirname(pdf_path)
630
- if not os.path.exists(markdown_file):
631
- alt_markdown_file = os.path.join(pdf_dir, f"{base_filename}.md")
632
- if os.path.exists(alt_markdown_file):
633
- markdown_file = alt_markdown_file
634
- else:
635
- # Try to find any markdown file in the output directory
636
- md_files = [f for f in os.listdir(job_dir) if f.endswith('.md')]
637
- if md_files:
638
- markdown_file = os.path.join(job_dir, md_files[0])
639
-
640
- if not os.path.exists(json_file):
641
- alt_json_file = os.path.join(pdf_dir, f"{base_filename}.json")
642
- if os.path.exists(alt_json_file):
643
- json_file = alt_json_file
644
- else:
645
- # Try to find any JSON file in the output directory
646
- json_files = [f for f in os.listdir(job_dir) if f.endswith('.json')]
647
- if json_files:
648
- json_file = os.path.join(job_dir, json_files[0])
649
-
650
- # Read markdown content
651
- markdown_content = ""
652
- if os.path.exists(markdown_file):
653
- with open(markdown_file, 'r', encoding='utf-8') as f:
654
- markdown_content = f.read()
655
- else:
656
- print(f"Warning: Markdown file not found at {markdown_file}")
657
-
658
- # Read JSON content
659
- json_content = {}
660
- if os.path.exists(json_file):
661
- with open(json_file, 'r', encoding='utf-8') as f:
662
- json_content = json.load(f)
663
- else:
664
- print(f"Warning: JSON file not found at {json_file}")
665
-
666
- # Read log content
667
- log_content = ""
668
- with open(log_file, 'r', encoding='utf-8') as f:
669
- log_content = f.read()
670
-
671
- # Create the result
672
- result = {
673
- 'success': True,
674
- 'message': 'PDF conversion successful',
675
- 'job_id': job_id,
676
- 'base_filename': base_filename,
677
- 'file_info': {
678
- 'original_filename': filename,
679
- 'size_bytes': os.path.getsize(pdf_path),
680
- 'content_type': 'application/pdf'
681
- },
682
- 'markdown': markdown_content,
683
- 'json': json_content,
684
- 'log': log_content,
685
- 'files': {
686
- 'markdown_path': os.path.basename(markdown_file) if os.path.exists(markdown_file) else None,
687
- 'json_path': os.path.basename(json_file) if os.path.exists(json_file) else None
688
- }
689
- }
690
-
691
- return jsonify(result)
692
-
693
  except Exception as e:
694
- import traceback
695
- error_details = traceback.format_exc()
696
 
697
  return jsonify({
698
- 'success': False,
699
- 'error': f'An error occurred during PDF conversion: {str(e)}',
700
- 'details': error_details,
701
- 'job_id': job_id
702
  }), 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703
 
704
  if __name__ == '__main__':
705
- app.run(host='0.0.0.0', port=7860, debug=False)
 
1
+ from flask import Flask, request, jsonify, send_file, render_template_string, abort
2
+ from flask_cors import CORS
3
  import os
4
+ import uuid
5
+ import traceback
6
+ import logging
7
  import subprocess
8
  import tempfile
 
9
  import json
 
10
  import time
11
+ import shutil
 
 
 
12
 
13
  app = Flask(__name__)
14
+ CORS(app)
15
+
16
+ # Configure logging
17
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
18
 
19
+ # Constants
20
+ UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
21
+ OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', '/tmp/pdf_output')
22
+ ALLOWED_EXTENSIONS = {'pdf'}
23
+ MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
24
 
25
+ # Ensure the directories exist
26
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
27
  os.makedirs(OUTPUT_FOLDER, exist_ok=True)
28
 
29
+ # Function to check if file extension is allowed
30
+ def allowed_file(filename):
31
+ return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
32
 
33
+ # Template for the main page
34
  HTML_TEMPLATE = """
35
  <!DOCTYPE html>
36
  <html>
37
  <head>
38
+ <title>MinerU PDF Converter</title>
39
  <style>
40
  body {
41
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
42
+ line-height: 1.6;
43
+ color: #333;
44
+ max-width: 800px;
45
  margin: 0 auto;
46
  padding: 20px;
47
+ background-color: #f5f8fa;
48
+ }
49
+ h1 {
50
+ color: #2c3e50;
51
+ border-bottom: 2px solid #3498db;
52
+ padding-bottom: 10px;
53
  }
54
  .container {
55
+ background-color: white;
56
  padding: 20px;
57
  border-radius: 8px;
58
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
59
+ margin-top: 20px;
60
  }
61
+ .info {
62
+ background-color: #e8f4fc;
63
+ padding: 15px;
64
+ border-radius: 5px;
65
+ margin: 15px 0;
66
  }
67
+ .code {
68
+ background-color: #f4f4f4;
69
  padding: 10px;
 
 
 
 
 
 
 
 
 
70
  border-radius: 3px;
71
+ font-family: monospace;
72
+ overflow-x: auto;
73
  }
74
+ form {
75
  margin: 20px 0;
 
 
 
76
  }
77
+ input[type=file] {
78
+ padding: 10px;
79
+ width: 100%;
80
+ margin-bottom: 10px;
81
+ }
82
+ button {
83
+ padding: 10px 15px;
84
+ background-color: #3498db;
85
  color: white;
 
86
  border: none;
87
  border-radius: 4px;
88
  cursor: pointer;
 
89
  }
90
+ button:hover {
91
+ background-color: #2980b9;
92
  }
93
+ .result {
 
 
 
 
 
94
  margin-top: 20px;
95
+ display: none;
96
  }
97
+ .loading {
98
+ display: none;
99
+ text-align: center;
100
+ margin: 20px 0;
 
101
  }
102
+ .error {
103
+ background-color: #fee;
104
+ border-left: 4px solid #e74c3c;
105
+ padding: 10px;
106
+ margin: 10px 0;
 
 
 
107
  }
108
+ a {
109
+ color: #3498db;
110
+ text-decoration: none;
111
  }
112
+ a:hover {
113
+ text-decoration: underline;
 
 
 
 
 
 
 
 
 
114
  }
115
  </style>
116
  </head>
117
  <body>
118
+ <h1>📄 MinerU PDF Converter</h1>
119
+
120
  <div class="container">
121
+ <h2>Convert PDF to Markdown and JSON</h2>
122
+ <p>Upload a PDF file to convert it to Markdown and structured JSON.</p>
123
+
124
+ <div class="info">
125
+ <h3>Features</h3>
126
+ <ul>
127
+ <li>High-quality PDF extraction</li>
128
+ <li>Support for tables, formulas, and complex layouts</li>
129
+ <li>Output in both Markdown and structured JSON</li>
130
+ <li>Comprehensive error handling</li>
131
+ </ul>
 
 
132
  </div>
133
+
134
+ <form id="uploadForm" enctype="multipart/form-data">
135
+ <input type="file" id="pdfFile" accept=".pdf" required>
136
+ <button type="submit">Convert PDF</button>
137
+ </form>
138
+
139
+ <div id="loading" class="loading">
140
+ <p>Converting PDF... This may take a minute for large files.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  </div>
142
 
143
+ <div id="error" class="error" style="display: none;"></div>
144
+
145
+ <div id="result" class="result">
146
+ <h3>Conversion Results</h3>
147
+ <p>Your PDF has been converted successfully!</p>
148
+ <p><a id="markdownLink" href="#" download>Download Markdown</a></p>
149
+ <p><a id="jsonLink" href="#" download>Download JSON</a></p>
150
+ </div>
151
 
152
+ <div class="info">
153
+ <h3>API Usage</h3>
154
+ <p>You can also use our API endpoint to convert PDFs programmatically:</p>
155
+ <div class="code">
156
+ curl -X POST -F "file=@your_file.pdf" https://marcosremar2-mineruapi.hf.space/api/convert
157
+ </div>
158
+ </div>
159
  </div>
160
+
161
  <script>
162
+ document.getElementById('uploadForm').addEventListener('submit', async function(e) {
163
+ e.preventDefault();
 
 
 
 
 
 
 
164
 
165
+ const fileInput = document.getElementById('pdfFile');
166
+ const file = fileInput.files[0];
 
 
 
 
 
 
 
167
 
168
+ if (!file) {
169
+ showError('Please select a PDF file to upload.');
170
+ return;
 
 
 
 
 
 
 
171
  }
 
 
 
 
 
 
 
172
 
173
+ if (file.size > ${MAX_FILE_SIZE}) {
174
+ showError(`File size exceeds the ${MAX_FILE_SIZE / (1024 * 1024)}MB limit.`);
175
+ return;
176
+ }
177
 
178
+ // Show loading indicator
179
+ document.getElementById('loading').style.display = 'block';
180
+ document.getElementById('error').style.display = 'none';
181
+ document.getElementById('result').style.display = 'none';
182
 
183
+ const formData = new FormData();
184
+ formData.append('file', file);
185
 
186
+ try {
187
+ const response = await fetch('/api/convert', {
188
+ method: 'POST',
189
+ body: formData
190
+ });
 
 
 
191
 
192
+ const result = await response.json();
193
 
194
+ // Hide loading indicator
195
+ document.getElementById('loading').style.display = 'none';
 
 
 
 
 
196
 
197
+ if (response.ok) {
198
+ // Show result links
199
+ const markdownLink = document.getElementById('markdownLink');
200
+ const jsonLink = document.getElementById('jsonLink');
201
+
202
+ if (result.markdown_url) {
203
+ markdownLink.href = result.markdown_url;
204
+ markdownLink.download = file.name.replace('.pdf', '.md');
205
+ } else {
206
+ markdownLink.parentElement.style.display = 'none';
207
+ }
208
+
209
+ if (result.json_url) {
210
+ jsonLink.href = result.json_url;
211
+ jsonLink.download = file.name.replace('.pdf', '.json');
212
+ } else {
213
+ jsonLink.parentElement.style.display = 'none';
214
+ }
215
+
216
+ document.getElementById('result').style.display = 'block';
217
+ } else {
218
+ showError(result.error || 'Failed to convert PDF. Please try again.');
219
  }
220
+ } catch (error) {
221
+ document.getElementById('loading').style.display = 'none';
222
+ showError('An error occurred. Please try again later.');
223
+ console.error(error);
224
+ }
 
 
 
 
 
 
 
 
225
  });
226
+
227
function showError(message) {
    // Write the message into the #error banner and make the banner visible.
    const banner = document.getElementById('error');
    banner.textContent = message;
    banner.style.display = 'block';
}
232
  </script>
233
  </body>
234
  </html>
235
  """
236
 
237
# Route for the main page
@app.route('/')
def index():
    """Render the upload page.

    The module-level ``MAX_FILE_SIZE`` is exposed to the template under the
    same name.
    """
    template_vars = {"MAX_FILE_SIZE": MAX_FILE_SIZE}
    return render_template_string(HTML_TEMPLATE, **template_vars)
241
 
242
# Route for the health check
@app.route('/health')
def health_check():
    """Report service health as JSON.

    Returns:
        On success, a JSON object with ``status`` ("healthy"), the output of
        ``magic-pdf --version``, whether ``nvidia-smi`` ran successfully, a
        presence flag for each expected model file, and a timestamp.
        If anything raises, a JSON object with ``status`` ("unhealthy"), the
        error text and a timestamp, with HTTP status 500.
    """
    try:
        # MinerU version: a non-zero exit is reported in-band, not as a failure.
        version_proc = subprocess.run(
            ['magic-pdf', '--version'],
            capture_output=True,
            text=True,
            check=False,
        )
        if version_proc.returncode == 0:
            version = version_proc.stdout.strip()
        else:
            version = "Error getting version"

        # GPU probe: a missing binary or a non-zero exit both mean "no GPU".
        try:
            subprocess.run(['nvidia-smi'], capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            gpu_available = False
        else:
            gpu_available = True

        # Model weight files the service expects on disk.
        expected_files = {
            "yolo_model": "/tmp/models/MFD/YOLO/yolo_v8_ft.pt",
            "unimernet_model": "/tmp/models/MFR/unimernet/unimernet_small.pth",
            "rapid_table_model": "/tmp/models/table/rapid/rapid_table.pt",
            "doclayout_model": "/tmp/models/layout/doclayout/doclayout_yolo.pt",
        }
        model_dirs = {
            label: os.path.exists(location)
            for label, location in expected_files.items()
        }

        report = {
            "status": "healthy",
            "version": version,
            "gpu_available": gpu_available,
            "model_dirs": model_dirs,
            "timestamp": time.time(),
        }
        return jsonify(report)
    except Exception as e:
        logging.error(f"Health check error: {str(e)}")
        failure = {
            "status": "unhealthy",
            "error": str(e),
            "timestamp": time.time(),
        }
        return jsonify(failure), 500
280
+
281
# Route to display GPU status
@app.route('/gpu-status')
def gpu_status():
    """Return the combined stdout/stderr of ``nvidia-smi`` as JSON.

    When the command is missing or exits non-zero, the ``output`` field
    carries an explanatory message instead.
    """
    try:
        completed = subprocess.run(
            ['nvidia-smi'],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        return jsonify({"output": f"GPU information not available: {str(e)}"})
    return jsonify({"output": completed.stdout})
289
 
290
# Route to display magic-pdf help
@app.route('/help-output')
def help_output():
    """Return the combined stdout/stderr of ``magic-pdf --help`` as JSON.

    When the command is missing or exits non-zero, the ``output`` field
    carries an explanatory message instead.
    """
    try:
        completed = subprocess.run(
            ['magic-pdf', '--help'],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            check=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        return jsonify({"output": f"Help information not available: {str(e)}"})
    return jsonify({"output": completed.stdout})
298
 
299
def _sanitize_upload_name(filename):
    """Reduce a client-supplied file name to one safe path component.

    Directory parts are dropped, every character other than alphanumerics,
    ``_``, ``-`` and ``.`` is replaced by ``_``, and leading dots are stripped
    so the result can never be ``..`` or a hidden file.
    """
    # Clients may send Windows-style paths; normalise before basename().
    name = os.path.basename(filename.replace('\\', '/'))
    name = ''.join(c if c.isalnum() or c in '_-.' else '_' for c in name)
    return name.lstrip('.') or 'upload'


def _find_output(root, patterns):
    """Return the first file under *root* matching any glob in *patterns*.

    Patterns are tried in order; within one pattern the lexicographically
    smallest path wins. Returns ``None`` when nothing matches.
    """
    from pathlib import Path

    for pattern in patterns:
        matches = sorted(p for p in Path(root).rglob(pattern) if p.is_file())
        if matches:
            return str(matches[0])
    return None


# Route for PDF conversion
@app.route('/api/convert', methods=['POST'])
def convert_pdf():
    """Convert an uploaded PDF to Markdown and JSON with the MinerU CLI.

    Expects a multipart form field named ``file``.

    Returns:
        200 with ``success``, ``session_id``, ``markdown_url``, ``json_url``
        (each URL is ``None`` when that output was not produced) and
        ``message``.
        400 when the file part is missing, unnamed, or has a disallowed
        extension.
        500 with ``error`` and ``details`` (plus ``command`` when the CLI
        exited non-zero) when conversion fails.

    The uploaded input file is always removed before returning.
    """
    import shutil

    if 'file' not in request.files:
        return jsonify({"error": "No file part in the request"}), 400

    file = request.files['file']

    if file.filename == '':
        return jsonify({"error": "No file selected"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": f"Only {', '.join(ALLOWED_EXTENSIONS)} files are allowed"}), 400

    # Bound before the try so the cleanup in ``finally`` is always well-defined.
    input_path = None
    try:
        # Create a unique session ID
        session_id = str(uuid.uuid4())
        session_dir = os.path.join(OUTPUT_FOLDER, session_id)
        os.makedirs(session_dir, exist_ok=True)

        # Never join the raw client file name into a path: a name such as
        # "../../x.pdf" would escape UPLOAD_FOLDER.
        safe_name = _sanitize_upload_name(file.filename)
        base_filename = os.path.splitext(safe_name)[0] or 'document'

        # Save the uploaded file
        input_path = os.path.join(UPLOAD_FOLDER, f"{session_id}_{safe_name}")
        file.save(input_path)
        logging.info(f"Saved uploaded file to {input_path}")

        # Stable locations the download route serves from
        markdown_path = os.path.join(session_dir, f"{base_filename}.md")
        json_path = os.path.join(session_dir, f"{base_filename}.json")

        # Define file URLs
        markdown_url = f"/download/{session_id}/{base_filename}.md"
        json_url = f"/download/{session_id}/{base_filename}.json"

        try:
            # magic-pdf takes an input path (-p), an output *directory* (-o)
            # and a parse method (-m). The compute device is not a CLI option;
            # it is read from the "device-mode" key of magic-pdf.json.
            cmd = [
                "magic-pdf",
                "-p", input_path,
                "-o", session_dir,
                "-m", "auto",
            ]

            process = subprocess.run(cmd, capture_output=True, text=True, check=False)

            if process.returncode != 0:
                logging.error(f"MinerU conversion failed: {process.stderr}")
                return jsonify({
                    "error": "PDF conversion failed",
                    "details": process.stderr,
                    "command": " ".join(cmd)
                }), 500

            # The CLI writes into nested sub-directories of the output
            # directory, so search for the results and copy them to the flat
            # paths the download URLs point at.
            produced_md = _find_output(session_dir, ("*.md",))
            produced_json = _find_output(
                session_dir,
                ("*_content_list.json", "*_middle.json", "*.json"),
            )
            for produced, destination in ((produced_md, markdown_path),
                                          (produced_json, json_path)):
                if produced and os.path.abspath(produced) != os.path.abspath(destination):
                    shutil.copyfile(produced, destination)

            # Check if output files exist
            md_exists = os.path.exists(markdown_path)
            json_exists = os.path.exists(json_path)

            if not md_exists:
                logging.warning(f"Markdown file not found at {markdown_path}")

            if not json_exists:
                logging.warning(f"JSON file not found at {json_path}")

            return jsonify({
                "success": True,
                "session_id": session_id,
                "markdown_url": markdown_url if md_exists else None,
                "json_url": json_url if json_exists else None,
                "message": "PDF conversion completed"
            })

        except Exception as e:
            logging.exception(f"Error during conversion: {str(e)}")

            return jsonify({
                "error": "Error processing PDF file",
                "details": str(e)
            }), 500

    except Exception as e:
        logging.exception(f"General error: {str(e)}")

        return jsonify({
            "error": "Failed to process request",
            "details": str(e)
        }), 500
    finally:
        # Clean up the input file (best effort; never mask the real response)
        if input_path is not None:
            try:
                os.remove(input_path)
            except FileNotFoundError:
                pass
            except OSError as e:
                logging.warning(f"Failed to clean up input file: {str(e)}")
398
+
399
# Route to download converted files
@app.route('/download/<session_id>/<filename>')
def download_file(session_id, filename):
    """Send one converted file from a session's output directory.

    Args:
        session_id: Session identifier from the URL; only alphanumerics and
            ``-`` are accepted.
        filename: Name of a file directly inside the session directory.

    Aborts with 400 for a malformed session ID or file name, and with 404
    when the path is not an existing regular file.
    """
    # Validate the session ID and filename
    if not all(c.isalnum() or c == '-' for c in session_id):
        abort(400, "Invalid session ID")

    # Only a plain file name is acceptable: "." / ".." or anything containing
    # a path separator would point outside the session directory.
    if filename in ('.', '..') or os.path.basename(filename) != filename:
        abort(400, "Invalid filename")

    base_path = os.path.join(OUTPUT_FOLDER, session_id)
    file_path = os.path.join(base_path, filename)

    # isfile() rather than exists(): a directory must not reach send_file.
    if not os.path.isfile(file_path):
        abort(404, "File not found")

    # Absolute path so send_file looks at the same file that was just checked,
    # regardless of the directory it resolves relative paths against.
    return send_file(os.path.abspath(file_path), as_attachment=True)


if __name__ == '__main__':
    # Listen on all interfaces so the server is reachable from outside the
    # process's own host namespace.
    app.run(host='0.0.0.0', port=7860)
entrypoint.sh CHANGED
@@ -5,198 +5,123 @@ set -e
5
  source /opt/mineru_venv/bin/activate
6
 
7
  # Display GPU information
 
 
8
  echo "Checking NVIDIA GPU status:"
9
- nvidia-smi
10
 
11
  # Display MinerU version
12
  echo "MinerU version:"
13
- magic-pdf --version
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Create a samples directory
 
16
  mkdir -p $HOME/.config/magic_pdf
17
- mkdir -p /app/samples || mkdir -p /tmp/samples
18
-
19
- # Define the samples directory based on what's writable
20
- if [ -w "/app/samples" ]; then
21
- SAMPLES_DIR="/app/samples"
22
- else
23
- SAMPLES_DIR="/tmp/samples"
24
- fi
25
 
26
  # Download a sample PDF for testing if it doesn't exist
27
- if [ ! -f "$SAMPLES_DIR/sample.pdf" ]; then
28
- echo "Downloading sample PDF for testing..."
29
  # Download a simple paper from arXiv (using a small one for quick processing)
30
- wget -q "https://arxiv.org/pdf/2201.08239.pdf" -O "$SAMPLES_DIR/sample.pdf" || true
31
 
32
  # If that fails, try another source
33
- if [ ! -s "$SAMPLES_DIR/sample.pdf" ]; then
34
- wget -q "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" -O "$SAMPLES_DIR/sample.pdf" || true
35
  fi
36
 
37
  # If both fail, create a simple PDF with text
38
- if [ ! -s "$SAMPLES_DIR/sample.pdf" ]; then
39
  echo "Failed to download sample PDF, creating a simple PDF text file..."
40
  echo "This is a sample PDF document for testing MinerU.
41
 
42
  MinerU is a high-quality tool for converting PDF to Markdown and JSON formats.
43
 
44
- This file was created for testing purposes." > "$SAMPLES_DIR/sample.txt"
45
 
46
  # Try using different methods to create a PDF
47
  if command -v convert &> /dev/null; then
48
- convert -size 612x792 -background white -fill black caption:@"$SAMPLES_DIR/sample.txt" "$SAMPLES_DIR/sample.pdf"
49
  else
50
  echo "WARNING: Could not create a sample PDF file automatically."
51
  fi
52
  fi
53
  fi
54
 
55
- # Create the magic-pdf.json config file
56
- CONFIG_DIR="$HOME/.config/magic_pdf"
57
- mkdir -p "$CONFIG_DIR"
58
- if [ ! -f "$CONFIG_DIR/magic-pdf.json" ]; then
59
- echo "Creating magic-pdf.json configuration file..."
60
- cat > "$CONFIG_DIR/magic-pdf.json" << EOF
61
  {
62
  "device-mode": "gpu",
 
63
  "layout-config": {
64
  "model": "doclayout_yolo",
 
65
  "enable": true
66
  },
67
  "formula-config": {
68
  "mfd_model": "yolo_v8_mfd",
 
69
  "mfr_model": "unimernet_small",
 
70
  "enable": true
71
  },
72
  "table-config": {
73
  "model": "rapid_table",
 
74
  "sub_model": "slanet_plus",
 
75
  "enable": true,
76
  "max_time": 400
77
  }
78
  }
79
  EOF
80
- fi
81
-
82
- # Start the Flask application if it exists, otherwise provide a shell
83
- if [ -f "/app/app.py" ]; then
84
- echo "Starting Flask application..."
85
- python /app/app.py
86
- else
87
- echo "No app.py found. Starting a simple server..."
88
- # Create a simple server that shows MinerU is installed
89
- TMP_APP_PATH="$HOME/simple_app.py"
90
- cat > "$TMP_APP_PATH" << 'EOF'
91
- from flask import Flask, request, jsonify, render_template_string
92
 
93
- app = Flask(__name__)
 
94
 
95
- HTML_TEMPLATE = """
96
- <!DOCTYPE html>
97
- <html>
98
- <head>
99
- <title>MinerU PDF Processing</title>
100
- <style>
101
- body {
102
- font-family: Arial, sans-serif;
103
- max-width: 800px;
104
- margin: 0 auto;
105
- padding: 20px;
106
- }
107
- .container {
108
- background-color: #f9f9f9;
109
- padding: 20px;
110
- border-radius: 8px;
111
- box-shadow: 0 2px 4px rgba(0,0,0,0.1);
112
- }
113
- h1 {
114
- color: #2c3e50;
115
- }
116
- pre {
117
- background-color: #f1f1f1;
118
- padding: 10px;
119
- border-radius: 4px;
120
- overflow-x: auto;
121
- }
122
- .command {
123
- font-family: monospace;
124
- background-color: #eee;
125
- padding: 5px;
126
- border-radius: 3px;
127
- }
128
- </style>
129
- </head>
130
- <body>
131
- <div class="container">
132
- <h1>MinerU PDF Processing Service</h1>
133
- <p>This Space provides PDF processing capabilities using MinerU.</p>
134
-
135
- <h2>GPU Status</h2>
136
- <pre id="gpuStatus">Loading...</pre>
137
-
138
- <h2>Available Commands</h2>
139
- <p>MinerU provides the following commands:</p>
140
- <p><span class="command">magic-pdf</span> - Process PDF documents</p>
141
-
142
- <h2>Help Output</h2>
143
- <pre id="helpOutput">Loading...</pre>
144
- </div>
145
-
146
- <script>
147
- // Fetch GPU status
148
- fetch('/gpu-status')
149
- .then(response => response.json())
150
- .then(data => {
151
- document.getElementById('gpuStatus').textContent = data.output;
152
- })
153
- .catch(error => {
154
- document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
155
- });
156
-
157
- // Fetch help output
158
- fetch('/help-output')
159
- .then(response => response.json())
160
- .then(data => {
161
- document.getElementById('helpOutput').textContent = data.output;
162
- })
163
- .catch(error => {
164
- document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
165
- });
166
- </script>
167
- </body>
168
- </html>
169
- """
170
-
171
- @app.route('/')
172
- def index():
173
- return render_template_string(HTML_TEMPLATE)
174
-
175
- @app.route('/gpu-status')
176
- def gpu_status():
177
- import subprocess
178
- try:
179
- output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT).decode('utf-8')
180
- except subprocess.CalledProcessError as e:
181
- output = f"Error running nvidia-smi: {e.output.decode('utf-8')}"
182
- except FileNotFoundError:
183
- output = "nvidia-smi command not found. GPU may not be available."
184
- return jsonify({"output": output})
185
-
186
- @app.route('/help-output')
187
- def help_output():
188
- import subprocess
189
- try:
190
- output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT).decode('utf-8')
191
- except subprocess.CalledProcessError as e:
192
- output = f"Error running magic-pdf --help: {e.output.decode('utf-8')}"
193
- except FileNotFoundError:
194
- output = "magic-pdf command not found. MinerU may not be installed correctly."
195
- return jsonify({"output": output})
196
-
197
- if __name__ == '__main__':
198
- app.run(host='0.0.0.0', port=7860)
199
- EOF
200
 
201
- python "$TMP_APP_PATH"
202
- fi
 
 
5
  source /opt/mineru_venv/bin/activate
6
 
7
  # Display GPU information
8
echo "===== Application Startup at $(date +'%Y-%m-%d %H:%M:%S') ====="
echo ""
echo "Checking NVIDIA GPU status:"
# Remember the probe result: it decides the "device-mode" written to the
# MinerU config below, so the config matches the hardware actually present.
if nvidia-smi; then
    DEVICE_MODE="cuda"
else
    echo "No NVIDIA GPU detected, running in CPU mode"
    DEVICE_MODE="cpu"
fi

# Display MinerU version
echo "MinerU version:"
magic-pdf --version || echo "Error: MinerU magic-pdf not found"

# Create directories for models if they don't exist
mkdir -p /tmp/models/MFD/YOLO
mkdir -p /tmp/models/MFR/unimernet
mkdir -p /tmp/models/table/rapid
mkdir -p /tmp/models/layout/doclayout

# Check if model files exist, if not, download them
echo "Checking model files..."
MODEL_FILES=(
    "/tmp/models/MFD/YOLO/yolo_v8_ft.pt"
    "/tmp/models/MFD/YOLO/yolo_v8_mfd.pt"
    "/tmp/models/MFR/unimernet/unimernet_small.pth"
    "/tmp/models/table/rapid/rapid_table.pt"
    "/tmp/models/table/rapid/slanet_plus.pt"
    "/tmp/models/layout/doclayout/doclayout_yolo.pt"
)

# MODEL_URLS[i] is the source for MODEL_FILES[i].
# NOTE(review): the first two entries are the same URL, so yolo_v8_ft.pt is a
# copy of yolo_v8_mfd.pt - confirm this is intended, and confirm that these
# paths exist in the repository.
MODELS_REPO="https://huggingface.co/opendatalab/MinerU/resolve/main/models"
MODEL_URLS=(
    "${MODELS_REPO}/mfd/yolo_v8_mfd.pt"
    "${MODELS_REPO}/mfd/yolo_v8_mfd.pt"
    "${MODELS_REPO}/mfr/unimernet_small.pth"
    "${MODELS_REPO}/table/rapid_table.pt"
    "${MODELS_REPO}/table/slanet_plus.pt"
    "${MODELS_REPO}/layout/doclayout_yolo.pt"
)

for i in "${!MODEL_FILES[@]}"; do
    target="${MODEL_FILES[$i]}"
    # -s (exists and non-empty): an empty file is a failed download, not a model.
    if [ -s "$target" ]; then
        echo "$target already exists."
        continue
    fi

    echo "Downloading $target..."
    # Download to a temporary name and rename on success, so an interrupted or
    # failed transfer never leaves something that looks like a valid model.
    if wget -q "${MODEL_URLS[$i]}" -O "$target.part"; then
        mv "$target.part" "$target"
    else
        echo "Failed to download $target"
        rm -f "$target.part" "$target"
    fi
done

# Create a samples directory
mkdir -p "$HOME/samples"
mkdir -p "$HOME/.config/magic_pdf"

# Download a sample PDF for testing if it doesn't exist
if [ ! -s "$HOME/samples/sample.pdf" ]; then
    echo "Downloading sample PDF for testing..."
    # Download a simple paper from arXiv (using a small one for quick processing)
    wget -q "https://arxiv.org/pdf/2201.08239.pdf" -O "$HOME/samples/sample.pdf" || true

    # If that fails, try another source
    if [ ! -s "$HOME/samples/sample.pdf" ]; then
        wget -q "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" -O "$HOME/samples/sample.pdf" || true
    fi

    # If both fail, create a simple PDF with text
    if [ ! -s "$HOME/samples/sample.pdf" ]; then
        echo "Failed to download sample PDF, creating a simple PDF text file..."
        echo "This is a sample PDF document for testing MinerU.

MinerU is a high-quality tool for converting PDF to Markdown and JSON formats.

This file was created for testing purposes." > "$HOME/samples/sample.txt"

        # Try using different methods to create a PDF
        if command -v convert &> /dev/null; then
            # A sample file is optional; do not let a failure here abort startup.
            convert -size 612x792 -background white -fill black caption:@"$HOME/samples/sample.txt" "$HOME/samples/sample.pdf" \
                || echo "WARNING: Could not create a sample PDF file automatically."
        else
            echo "WARNING: Could not create a sample PDF file automatically."
        fi
    fi
fi

# Create the magic-pdf.json config file with paths to the downloaded models.
# The heredoc delimiter is unquoted on purpose so ${DEVICE_MODE} is expanded.
echo "Creating magic-pdf.json configuration file..."
cat > "$HOME/.config/magic_pdf/magic-pdf.json" << EOF
{
    "device-mode": "${DEVICE_MODE}",
    "models-path": "/tmp/models",
    "layout-config": {
        "model": "doclayout_yolo",
        "model_path": "/tmp/models/layout/doclayout/doclayout_yolo.pt",
        "enable": true
    },
    "formula-config": {
        "mfd_model": "yolo_v8_mfd",
        "mfd_model_path": "/tmp/models/MFD/YOLO/yolo_v8_mfd.pt",
        "mfr_model": "unimernet_small",
        "mfr_model_path": "/tmp/models/MFR/unimernet/unimernet_small.pth",
        "enable": true
    },
    "table-config": {
        "model": "rapid_table",
        "model_path": "/tmp/models/table/rapid/rapid_table.pt",
        "sub_model": "slanet_plus",
        "sub_model_path": "/tmp/models/table/rapid/slanet_plus.pt",
        "enable": true,
        "max_time": 400
    }
}
EOF

# Also create it in the home directory as some versions of MinerU look for it there
cp "$HOME/.config/magic_pdf/magic-pdf.json" "$HOME/magic-pdf.json"

# List model files to verify they're present
echo "Verifying model files:"
ls -la /tmp/models/MFD/YOLO/ || echo "YOLO models directory issue"
ls -la /tmp/models/MFR/unimernet/ || echo "UniMERNet models directory issue"
ls -la /tmp/models/table/rapid/ || echo "Table models directory issue"
ls -la /tmp/models/layout/doclayout/ || echo "Layout models directory issue"

# Start the Flask application. exec replaces the shell so the server receives
# signals (e.g. SIGTERM on container stop) directly.
echo "Starting Flask application..."
exec python /app/app.py
requirements.txt CHANGED
@@ -1,10 +1,10 @@
1
  flask==2.3.3
2
- transformers>=4.37.0
3
- torch>=2.0.0
4
- sentencepiece>=0.1.99
5
  requests>=2.31.0
6
- accelerate>=0.25.0
7
- einops>=0.6.0
8
- packaging>=23.0
9
- werkzeug>=2.3.0
10
- flask-cors>=4.0.0
 
 
1
  flask==2.3.3
2
werkzeug==2.3.7
flask-cors==4.0.0
requests>=2.31.0
pillow>=9.4.0
numpy>=1.24.0
wget>=3.2
magic-pdf[full]>=1.3.0
# "uuid" is intentionally not listed: it is part of the Python standard
# library, and the PyPI distribution of that name is an obsolete backport.
python-magic>=0.4.27