marcosremar2 committed on
Commit
8005bb9
·
1 Parent(s): eaa0dd6

Fix magic-pdf installation and remove fallbacks, ensuring proper operation in Hugging Face Spaces

Browse files
Files changed (4) hide show
  1. Dockerfile +25 -13
  2. app.py +152 -222
  3. entrypoint.sh +68 -59
  4. requirements.txt +6 -2
Dockerfile CHANGED
@@ -9,6 +9,8 @@ RUN apt-get update && \
9
  libglib2.0-0 \
10
  wget \
11
  git \
 
 
12
  && rm -rf /var/lib/apt/lists/*
13
 
14
  # Set working directory
@@ -18,34 +20,44 @@ WORKDIR /app
18
  ENV PYTHONUNBUFFERED=1 \
19
  PYTHONDONTWRITEBYTECODE=1 \
20
  UPLOAD_FOLDER=/tmp/pdf_uploads \
21
- OUTPUT_FOLDER=/tmp/pdf_output
 
 
22
 
23
  # Create necessary directories and set permissions
24
- RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output /tmp/models \
 
 
 
 
25
  && chmod -R 777 /tmp/pdf_uploads /tmp/pdf_output /tmp/models
26
 
 
 
 
27
  # Copy requirements and app files
28
  COPY requirements.txt /app/
29
  COPY *.py /app/
30
  COPY entrypoint.sh /app/
31
  RUN chmod +x /app/entrypoint.sh
32
 
33
- # Create a non-root user and change ownership
34
- RUN useradd -m -u 1000 appuser && \
35
- chown -R appuser:appuser /app /tmp/pdf_uploads /tmp/pdf_output /tmp/models
36
-
37
  # Install Python dependencies
38
- RUN pip install --no-cache-dir -r requirements.txt && \
39
- pip install --no-cache-dir pymupdf
 
 
 
40
 
41
- # Switch to non-root user
42
- USER appuser
 
43
 
44
- # Install MinerU and PyMuPDF (fallback)
45
- RUN pip install --no-cache-dir --user minerupdf
 
46
 
47
  # Test the installation
48
- RUN ~/.local/bin/magic-pdf --version || echo "magic-pdf not working, will fallback to PyMuPDF"
49
 
50
  # Expose port
51
  EXPOSE 7860
 
9
  libglib2.0-0 \
10
  wget \
11
  git \
12
+ libopenblas-dev \
13
+ libgomp1 \
14
  && rm -rf /var/lib/apt/lists/*
15
 
16
  # Set working directory
 
20
  ENV PYTHONUNBUFFERED=1 \
21
  PYTHONDONTWRITEBYTECODE=1 \
22
  UPLOAD_FOLDER=/tmp/pdf_uploads \
23
+ OUTPUT_FOLDER=/tmp/pdf_output \
24
+ HF_HUB_DISABLE_PROGRESS_BARS=1 \
25
+ HF_HUB_ENABLE_HF_TRANSFER=0
26
 
27
  # Create necessary directories and set permissions
28
+ RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output \
29
+ && mkdir -p /tmp/models/MFD/YOLO \
30
+ && mkdir -p /tmp/models/MFR/unimernet \
31
+ && mkdir -p /tmp/models/table/rapid \
32
+ && mkdir -p /tmp/models/layout/doclayout \
33
  && chmod -R 777 /tmp/pdf_uploads /tmp/pdf_output /tmp/models
34
 
35
+ # Create magic-pdf config directories
36
+ RUN mkdir -p /root/.config/magic_pdf
37
+
38
  # Copy requirements and app files
39
  COPY requirements.txt /app/
40
  COPY *.py /app/
41
  COPY entrypoint.sh /app/
42
  RUN chmod +x /app/entrypoint.sh
43
 
 
 
 
 
44
  # Install Python dependencies
45
+ RUN pip install --no-cache-dir --upgrade pip setuptools wheel
46
+ RUN pip install --no-cache-dir -r requirements.txt
47
+
48
+ # Copy default magic-pdf config
49
+ RUN echo '{"device-mode":"cpu","layout-config":{"model":"doclayout_yolo","enable":true},"formula-config":{"mfd_model":"yolo_v8_mfd","mfr_model":"unimernet_small","enable":false},"table-config":{"model":"rapid_table","sub_model":"slanet_plus","enable":false}}' > /root/.config/magic_pdf/magic-pdf.json
50
 
51
+ # Download model files directly
52
+ RUN curl -L https://huggingface.co/marcosremar2/mineru-models/resolve/main/doclayout_yolo.pt -o /tmp/models/layout/doclayout/doclayout_yolo.pt
53
+ RUN curl -L https://huggingface.co/marcosremar2/mineru-models/resolve/main/yolo_v8_ft.pt -o /tmp/models/MFD/YOLO/yolo_v8_ft.pt
54
 
55
+ # Verify model files were downloaded correctly
56
+ RUN ls -la /tmp/models/layout/doclayout/doclayout_yolo.pt
57
+ RUN ls -la /tmp/models/MFD/YOLO/yolo_v8_ft.pt
58
 
59
  # Test the installation
60
+ RUN magic-pdf --version
61
 
62
  # Expose port
63
  EXPOSE 7860
app.py CHANGED
@@ -26,39 +26,61 @@ logging.basicConfig(
26
  logger = logging.getLogger()
27
 
28
  # Constants
29
- UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
30
- OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', '/tmp/pdf_output')
31
  ALLOWED_EXTENSIONS = {'pdf'}
32
  MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
33
 
34
- # Ensure the directories exist and are writable
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  try:
36
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
37
- os.makedirs(OUTPUT_FOLDER, exist_ok=True)
38
- # Test if we can write to these directories
39
- test_upload_file = os.path.join(UPLOAD_FOLDER, '.test_file')
40
- test_output_file = os.path.join(OUTPUT_FOLDER, '.test_file')
41
-
42
- with open(test_upload_file, 'w') as f:
43
- f.write('test')
44
- os.remove(test_upload_file)
45
-
46
- with open(test_output_file, 'w') as f:
47
- f.write('test')
48
- os.remove(test_output_file)
49
-
50
- logging.info(f"Using upload folder: {UPLOAD_FOLDER}")
51
- logging.info(f"Using output folder: {OUTPUT_FOLDER}")
52
  except Exception as e:
53
- # Fall back to user's home directory
54
- logging.warning(f"Cannot use specified directories: {str(e)}")
55
- USER_HOME = os.path.expanduser("~")
56
- UPLOAD_FOLDER = os.path.join(USER_HOME, 'pdf_uploads')
57
- OUTPUT_FOLDER = os.path.join(USER_HOME, 'pdf_output')
58
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
59
- os.makedirs(OUTPUT_FOLDER, exist_ok=True)
60
- logging.info(f"Using fallback upload folder: {UPLOAD_FOLDER}")
61
- logging.info(f"Using fallback output folder: {OUTPUT_FOLDER}")
62
 
63
  # Function to check if file extension is allowed
64
  def allowed_file(filename):
@@ -364,220 +386,128 @@ def help_output():
364
  # Route for PDF conversion
365
  @app.route('/api/convert', methods=['POST'])
366
  def convert_pdf():
367
- if 'file' not in request.files:
368
- return jsonify({"error": "No file part in the request"}), 400
369
-
370
- file = request.files['file']
371
-
372
- if file.filename == '':
373
- return jsonify({"error": "No file selected"}), 400
374
-
375
- if not allowed_file(file.filename):
376
- return jsonify({"error": f"Only {', '.join(ALLOWED_EXTENSIONS)} files are allowed"}), 400
377
-
378
  try:
379
- # Create a unique session ID
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  session_id = str(uuid.uuid4())
381
  session_dir = os.path.join(OUTPUT_FOLDER, session_id)
382
-
383
- try:
384
- os.makedirs(session_dir, exist_ok=True)
385
- logging.info(f"Created session directory: {session_dir}")
386
- except PermissionError as e:
387
- logging.error(f"Permission error creating directory {session_dir}: {str(e)}")
388
- # Try using a temp directory instead
389
- session_dir = tempfile.mkdtemp(prefix="minerupdf_")
390
- logging.info(f"Using temporary directory instead: {session_dir}")
391
 
392
  # Save the uploaded file
393
- input_path = os.path.join(UPLOAD_FOLDER, f"{session_id}_{file.filename}")
394
- try:
395
- file.save(input_path)
396
- logging.info(f"Saved uploaded file to {input_path}")
397
- except PermissionError:
398
- # Try saving in the temp directory
399
- input_path = os.path.join(session_dir, f"{session_id}_{file.filename}")
400
- file.save(input_path)
401
- logging.info(f"Saved uploaded file to alternate location: {input_path}")
402
 
403
- # Get output file paths
404
- base_filename = os.path.splitext(os.path.basename(file.filename))[0]
405
- base_filename = ''.join(c if c.isalnum() or c in ['_', '-', '.'] else '_' for c in base_filename)
406
 
 
407
  markdown_path = os.path.join(session_dir, f"{base_filename}.md")
408
  json_path = os.path.join(session_dir, f"{base_filename}.json")
409
 
410
- # Define alternate locations to search for generated files
411
- alternate_locations = [
412
- os.path.dirname(input_path), # Check PDF directory
413
- os.path.join(OUTPUT_FOLDER, base_filename), # Check output folder with base filename
414
- session_dir # Default session directory
415
- ]
416
-
417
  # URLs for file download
418
  markdown_url = f"/download/{session_id}/{base_filename}.md"
419
  json_url = f"/download/{session_id}/{base_filename}.json"
420
 
421
- try:
422
- # Run MinerU to convert the PDF
423
- cmd = [
424
- "magic-pdf",
425
- "--path", input_path,
426
- "--output-dir", session_dir
427
- ]
428
-
429
- logging.info(f"Running command: {' '.join(cmd)}")
430
- process = subprocess.run(cmd, capture_output=True, text=True, check=False)
431
-
432
- if process.returncode != 0:
433
- logging.error(f"MinerU conversion failed: {process.stderr}")
434
-
435
- # Fallback to PyMuPDF conversion
436
- try:
437
- import fitz # PyMuPDF
438
-
439
- logging.info(f"Falling back to PyMuPDF for PDF conversion: {input_path}")
440
- # Open the PDF file
441
- pdf = fitz.open(input_path)
442
-
443
- # Extract text and create markdown
444
- markdown_content = f"# {base_filename}\n\n"
445
- json_content = {
446
- "title": base_filename,
447
- "pages": len(pdf),
448
- "sections": []
449
- }
450
-
451
- for page_num in range(len(pdf)):
452
- page = pdf[page_num]
453
- text = page.get_text()
454
-
455
- # Add to markdown
456
- markdown_content += f"## Page {page_num + 1}\n\n{text}\n\n"
457
-
458
- # Add to JSON
459
- json_content["sections"].append({
460
- "title": f"Page {page_num + 1}",
461
- "content": text
462
- })
463
-
464
- # Save markdown
465
- with open(markdown_path, 'w', encoding='utf-8') as f:
466
- f.write(markdown_content)
467
-
468
- # Save JSON
469
- with open(json_path, 'w', encoding='utf-8') as f:
470
- json.dump(json_content, f, indent=2, ensure_ascii=False)
471
-
472
- logging.info(f"Successfully converted PDF to markdown and JSON using PyMuPDF")
473
- except ImportError:
474
- # PyMuPDF not available
475
- return jsonify({
476
- "error": "PDF conversion failed with both MinerU and PyMuPDF",
477
- "details": process.stderr,
478
- "command": " ".join(cmd)
479
- }), 500
480
-
481
- # Check if files were generated
482
- md_exists = os.path.exists(markdown_path)
483
- json_exists = os.path.exists(json_path)
484
-
485
- # Search in alternate locations if files not found
486
- if not md_exists or not json_exists:
487
- logging.info("Files not found in primary location, searching alternate locations...")
488
-
489
- for location in alternate_locations:
490
- if not md_exists:
491
- alt_md_path = os.path.join(location, f"{base_filename}.md")
492
- if os.path.exists(alt_md_path):
493
- # Copy to session directory for download access
494
- try:
495
- shutil.copy(alt_md_path, markdown_path)
496
- md_exists = True
497
- logging.info(f"Found and copied Markdown file from {alt_md_path}")
498
- except:
499
- # If copy fails, update path to point to original
500
- markdown_path = alt_md_path
501
- md_exists = True
502
- logging.info(f"Found Markdown file at alternate location: {alt_md_path}")
503
-
504
- if not json_exists:
505
- alt_json_path = os.path.join(location, f"{base_filename}.json")
506
- if os.path.exists(alt_json_path):
507
- # Copy to session directory for download access
508
- try:
509
- shutil.copy(alt_json_path, json_path)
510
- json_exists = True
511
- logging.info(f"Found and copied JSON file from {alt_json_path}")
512
- except:
513
- # If copy fails, update path to point to original
514
- json_path = alt_json_path
515
- json_exists = True
516
- logging.info(f"Found JSON file at alternate location: {alt_json_path}")
517
-
518
- # If still not found, check for any markdown or JSON files in the directory
519
- if not md_exists:
520
- md_files = [f for f in os.listdir(session_dir) if f.endswith('.md')]
521
- if md_files:
522
- markdown_path = os.path.join(session_dir, md_files[0])
523
- md_exists = True
524
- logging.info(f"Found Markdown file at alternate location: {markdown_path}")
525
-
526
- if not json_exists:
527
- json_files = [f for f in os.listdir(session_dir) if f.endswith('.json')]
528
- if json_files:
529
- json_path = os.path.join(session_dir, json_files[0])
530
- json_exists = True
531
- logging.info(f"Found JSON file at alternate location: {json_path}")
532
-
533
- if md_exists:
534
- with open(markdown_path, 'r', encoding='utf-8') as f:
535
- markdown_content = f.read()
536
- else:
537
- markdown_content = None
538
- logging.warning(f"Markdown file not found at {markdown_path}")
539
-
540
- if json_exists:
541
- with open(json_path, 'r', encoding='utf-8') as f:
542
- json_content = json.load(f)
543
- else:
544
- json_content = None
545
- logging.warning(f"JSON file not found at {json_path}")
546
-
547
- return jsonify({
548
- "success": True,
549
- "session_id": session_id,
550
- "markdown_url": markdown_url if md_exists else None,
551
- "json_url": json_url if json_exists else None,
552
- "markdown_content": markdown_content,
553
- "json_content": json_content,
554
- "message": "PDF conversion completed"
555
- })
556
-
557
- except Exception as e:
558
- logging.error(f"Error during conversion: {str(e)}")
559
- logging.error(traceback.format_exc())
560
 
561
  return jsonify({
562
- "error": "Error processing PDF file",
563
- "details": str(e)
 
564
  }), 500
565
-
566
- except Exception as e:
567
- logging.error(f"General error: {str(e)}")
568
- logging.error(traceback.format_exc())
569
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
570
  return jsonify({
571
- "error": "Failed to process request",
572
- "details": str(e)
 
573
  }), 500
574
- finally:
575
- # Clean up the input file
576
- try:
577
- if 'input_path' in locals() and os.path.exists(input_path):
578
- os.remove(input_path)
579
- except Exception as e:
580
- logging.warning(f"Failed to clean up input file: {str(e)}")
581
 
582
  # Route to download converted files
583
  @app.route('/download/<session_id>/<filename>')
 
26
  logger = logging.getLogger()
27
 
28
  # Constants
29
+ UPLOAD_FOLDER = '/tmp/pdf_uploads'
30
+ OUTPUT_FOLDER = '/tmp/pdf_output'
31
  ALLOWED_EXTENSIONS = {'pdf'}
32
  MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
33
 
34
+ # Ensure the directories exist
35
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
36
+ os.makedirs(OUTPUT_FOLDER, exist_ok=True)
37
+ logging.info(f"Using upload folder: {UPLOAD_FOLDER}")
38
+ logging.info(f"Using output folder: {OUTPUT_FOLDER}")
39
+
40
+ # Create config directories and files if they don't exist
41
+ CONFIG_DIR = os.path.join(os.path.expanduser("~"), ".config", "magic_pdf")
42
+ os.makedirs(CONFIG_DIR, exist_ok=True)
43
+ CONFIG_FILE = os.path.join(CONFIG_DIR, "magic-pdf.json")
44
+
45
+ # Default configuration
46
+ DEFAULT_CONFIG = {
47
+ "device-mode": "cpu",
48
+ "layout-config": {
49
+ "model": "doclayout_yolo",
50
+ "enable": True
51
+ },
52
+ "formula-config": {
53
+ "mfd_model": "yolo_v8_mfd",
54
+ "mfr_model": "unimernet_small",
55
+ "enable": False
56
+ },
57
+ "table-config": {
58
+ "model": "rapid_table",
59
+ "sub_model": "slanet_plus",
60
+ "enable": False
61
+ }
62
+ }
63
+
64
+ # Write config file if it doesn't exist
65
+ if not os.path.exists(CONFIG_FILE):
66
+ with open(CONFIG_FILE, 'w') as f:
67
+ json.dump(DEFAULT_CONFIG, f, indent=2)
68
+ logging.info(f"Created magic-pdf config at {CONFIG_FILE}")
69
+
70
+ # Also create the config in the home directory as a fallback
71
+ HOME_CONFIG = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
72
+ if not os.path.exists(HOME_CONFIG):
73
+ with open(HOME_CONFIG, 'w') as f:
74
+ json.dump(DEFAULT_CONFIG, f, indent=2)
75
+ logging.info(f"Created magic-pdf config at {HOME_CONFIG}")
76
+
77
+ # Verify magic-pdf installation
78
  try:
79
+ result = subprocess.run(['magic-pdf', '--version'], capture_output=True, text=True)
80
+ logging.info(f"magic-pdf version: {result.stdout.strip()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  except Exception as e:
82
+ logging.error(f"Error checking magic-pdf: {str(e)}")
83
+ raise RuntimeError("magic-pdf is not installed properly. Please check installation.")
 
 
 
 
 
 
 
84
 
85
  # Function to check if file extension is allowed
86
  def allowed_file(filename):
 
386
  # Route for PDF conversion
387
  @app.route('/api/convert', methods=['POST'])
388
  def convert_pdf():
389
+ """
390
+ API endpoint for PDF conversion.
391
+ """
 
 
 
 
 
 
 
 
392
  try:
393
+ # Check if file is in the request
394
+ if 'file' not in request.files:
395
+ return jsonify({
396
+ "success": False,
397
+ "error": "No file part in the request"
398
+ }), 400
399
+
400
+ file = request.files['file']
401
+
402
+ # Check if file is selected
403
+ if file.filename == '':
404
+ return jsonify({
405
+ "success": False,
406
+ "error": "No file selected"
407
+ }), 400
408
+
409
+ # Check if file is a PDF
410
+ if not allowed_file(file.filename):
411
+ return jsonify({
412
+ "success": False,
413
+ "error": "File must be a PDF"
414
+ }), 400
415
+
416
+ # Generate session ID
417
  session_id = str(uuid.uuid4())
418
  session_dir = os.path.join(OUTPUT_FOLDER, session_id)
419
+ os.makedirs(session_dir, exist_ok=True)
 
 
 
 
 
 
 
 
420
 
421
  # Save the uploaded file
422
+ filename = os.path.basename(file.filename)
423
+ base_filename = os.path.splitext(filename)[0]
424
+ input_path = os.path.join(session_dir, filename)
425
+ file.save(input_path)
 
 
 
 
 
426
 
427
+ logging.info(f"Saved PDF file to {input_path}")
 
 
428
 
429
+ # Set paths for output files
430
  markdown_path = os.path.join(session_dir, f"{base_filename}.md")
431
  json_path = os.path.join(session_dir, f"{base_filename}.json")
432
 
 
 
 
 
 
 
 
433
  # URLs for file download
434
  markdown_url = f"/download/{session_id}/{base_filename}.md"
435
  json_url = f"/download/{session_id}/{base_filename}.json"
436
 
437
+ # Run magic-pdf to convert the PDF
438
+ command = ["magic-pdf", "--path", input_path, "--output-dir", session_dir]
439
+
440
+ logging.info(f"Running command: {' '.join(command)}")
441
+
442
+ # Execute the command and capture output
443
+ process_log = []
444
+ process = subprocess.Popen(
445
+ command,
446
+ stdout=subprocess.PIPE,
447
+ stderr=subprocess.STDOUT,
448
+ universal_newlines=True
449
+ )
450
+
451
+ # Capture output in real-time
452
+ for line in process.stdout:
453
+ process_log.append(line)
454
+ logging.info(f"magic-pdf output: {line.strip()}")
455
+
456
+ # Wait for process to complete
457
+ return_code = process.wait()
458
+
459
+ if return_code != 0:
460
+ error_message = "\n".join(process_log)
461
+ logging.error(f"magic-pdf failed with code {return_code}")
462
+ logging.error(f"Output: {error_message}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
 
464
  return jsonify({
465
+ "success": False,
466
+ "error": f"PDF conversion failed with exit code {return_code}",
467
+ "log": error_message
468
  }), 500
 
 
 
 
469
 
470
+ # Check if files were generated
471
+ md_exists = os.path.exists(markdown_path)
472
+ json_exists = os.path.exists(json_path)
473
+
474
+ logging.info(f"Markdown file exists: {md_exists} at {markdown_path}")
475
+ logging.info(f"JSON file exists: {json_exists} at {json_path}")
476
+
477
+ # Read file contents if they exist
478
+ if md_exists:
479
+ with open(markdown_path, 'r', encoding='utf-8') as f:
480
+ markdown_content = f.read()
481
+ else:
482
+ markdown_content = None
483
+ logging.warning(f"Markdown file not found at {markdown_path}")
484
+
485
+ if json_exists:
486
+ with open(json_path, 'r', encoding='utf-8') as f:
487
+ json_content = json.load(f)
488
+ else:
489
+ json_content = None
490
+ logging.warning(f"JSON file not found at {json_path}")
491
+
492
+ # Return results
493
+ return jsonify({
494
+ "success": True,
495
+ "session_id": session_id,
496
+ "markdown_url": markdown_url if md_exists else None,
497
+ "json_url": json_url if json_exists else None,
498
+ "markdown_content": markdown_content,
499
+ "json_content": json_content,
500
+ "input_file": filename,
501
+ "log": "\n".join(process_log)
502
+ })
503
+
504
+ except Exception as e:
505
+ logging.exception("Error in convert_pdf endpoint")
506
  return jsonify({
507
+ "success": False,
508
+ "error": str(e),
509
+ "traceback": traceback.format_exc()
510
  }), 500
 
 
 
 
 
 
 
511
 
512
  # Route to download converted files
513
  @app.route('/download/<session_id>/<filename>')
entrypoint.sh CHANGED
@@ -8,39 +8,25 @@ echo "Current directory: $(pwd)"
8
  echo "Directory listing:"
9
  ls -la
10
 
11
- # Create necessary directories
12
- mkdir -p /tmp/pdf_uploads
13
- mkdir -p /tmp/pdf_output
14
- mkdir -p /tmp/models/MFD/YOLO
15
- mkdir -p /tmp/models/MFR/unimernet
16
- mkdir -p /tmp/models/table/rapid
17
- mkdir -p /tmp/models/layout/doclayout
18
 
19
- # Set permissions (ensure directories are writable)
20
- chmod -R 777 /tmp/pdf_uploads
21
- chmod -R 777 /tmp/pdf_output
22
- chmod -R 777 /tmp/models
23
-
24
- # Check if magic-pdf command exists
25
- if ! command -v magic-pdf &> /dev/null; then
26
- echo "Warning: magic-pdf command not found. Installing MinerU PDF processing tools..."
27
- pip install --no-cache-dir minerupdf
28
-
29
- # Verify installation
30
- if command -v magic-pdf &> /dev/null; then
31
- echo "Successfully installed magic-pdf."
32
- else
33
- echo "Failed to install magic-pdf. Will use PyMuPDF fallback."
34
- pip install --no-cache-dir pymupdf
35
  fi
36
- else
37
- echo "magic-pdf command is already installed."
38
- fi
39
 
40
- # Configure magic-pdf
41
- mkdir -p ~/.config/magic_pdf
42
- cat > ~/.config/magic_pdf/magic-pdf.json << EOL
43
- {
44
  "device-mode": "cpu",
45
  "layout-config": {
46
  "model": "doclayout_yolo",
@@ -49,61 +35,84 @@ cat > ~/.config/magic_pdf/magic-pdf.json << EOL
49
  "formula-config": {
50
  "mfd_model": "yolo_v8_mfd",
51
  "mfr_model": "unimernet_small",
52
- "enable": true
53
  },
54
  "table-config": {
55
  "model": "rapid_table",
56
  "sub_model": "slanet_plus",
57
- "enable": true,
58
- "max_time": 400
59
  }
60
- }
61
- EOL
62
 
63
- # Download required model files if they don't exist or are empty
64
- function download_model() {
 
 
 
 
 
 
 
 
65
  local model_path=$1
66
  local model_url=$2
67
  local max_attempts=3
68
  local attempt=1
69
 
 
 
70
  if [ ! -f "$model_path" ] || [ ! -s "$model_path" ]; then
71
- echo "Downloading model to $model_path from $model_url"
72
  while [ $attempt -le $max_attempts ]; do
73
- echo "Download attempt $attempt of $max_attempts..."
74
- curl -L "$model_url" --output "$model_path" --retry 3 --retry-delay 2
75
 
 
76
  if [ -f "$model_path" ] && [ -s "$model_path" ]; then
77
- echo "Successfully downloaded model to $model_path ($(du -h "$model_path" | cut -f1) used)"
 
78
  return 0
 
 
 
 
 
79
  fi
80
-
81
- echo "Download failed or file is empty. Retrying..."
82
- rm -f "$model_path"
83
- attempt=$((attempt+1))
84
- sleep 2
85
  done
86
 
87
- echo "Failed to download model after $max_attempts attempts"
88
- return 1
89
  else
90
- echo "Model already exists at $model_path ($(du -h "$model_path" | cut -f1) used)"
91
- return 0
92
  fi
93
  }
94
 
95
- # Download models
96
- download_model "/tmp/models/MFD/YOLO/yolo_v8_ft.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/yolo_v8_ft.pt"
97
- download_model "/tmp/models/MFR/unimernet/unimernet_small.pth" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/unimernet_small.pth"
98
- download_model "/tmp/models/table/rapid/rapid_table.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/rapid_table.pt"
99
- download_model "/tmp/models/layout/doclayout/doclayout_yolo.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/doclayout_yolo.pt"
100
 
101
- # Verify magic-pdf installation and functionality
 
 
 
 
 
102
  if command -v magic-pdf &> /dev/null; then
103
- echo "Testing magic-pdf command..."
104
- magic-pdf --version || echo "magic-pdf command exists but may not be functioning properly"
105
  else
106
- echo "Warning: magic-pdf command still not available. Will use PyMuPDF as fallback."
 
 
 
 
 
 
 
 
 
 
107
  fi
108
 
109
  # Start the Flask application
 
8
  echo "Directory listing:"
9
  ls -la
10
 
11
+ # Set environment variables to disable HF progress bars (prevents hanging)
12
+ export HF_HUB_DISABLE_PROGRESS_BARS=1
13
+ export HF_HUB_ENABLE_HF_TRANSFER=0
 
 
 
 
14
 
15
+ # Verify directories existence and permissions
16
+ echo "Checking directories..."
17
+ for dir in /tmp/pdf_uploads /tmp/pdf_output /tmp/models/MFD/YOLO /tmp/models/MFR/unimernet /tmp/models/table/rapid /tmp/models/layout/doclayout
18
+ do
19
+ if [ ! -d "$dir" ]; then
20
+ echo "Creating directory: $dir"
21
+ mkdir -p "$dir"
 
 
 
 
 
 
 
 
 
22
  fi
23
+ chmod -R 777 "$dir"
24
+ echo "Directory $dir is ready"
25
+ done
26
 
27
+ # Copy config file to all possible locations to ensure it's found
28
+ echo "Setting up magic-pdf config..."
29
+ CONFIG_CONTENT='{
 
30
  "device-mode": "cpu",
31
  "layout-config": {
32
  "model": "doclayout_yolo",
 
35
  "formula-config": {
36
  "mfd_model": "yolo_v8_mfd",
37
  "mfr_model": "unimernet_small",
38
+ "enable": false
39
  },
40
  "table-config": {
41
  "model": "rapid_table",
42
  "sub_model": "slanet_plus",
43
+ "enable": false
 
44
  }
45
+ }'
 
46
 
47
+ # Create config in all possible locations
48
+ mkdir -p ~/.config/magic_pdf
49
+ echo "$CONFIG_CONTENT" > ~/.config/magic_pdf/magic-pdf.json
50
+ echo "$CONFIG_CONTENT" > ~/magic-pdf.json
51
+ echo "$CONFIG_CONTENT" > /app/magic-pdf.json
52
+ echo "$CONFIG_CONTENT" > /home/user/magic-pdf.json
53
+ echo "$CONFIG_CONTENT" > /root/.config/magic_pdf/magic-pdf.json
54
+
55
+ # Download model function with validation
56
+ function download_model_with_validation() {
57
  local model_path=$1
58
  local model_url=$2
59
  local max_attempts=3
60
  local attempt=1
61
 
62
+ echo "Checking model file: $model_path"
63
+
64
  if [ ! -f "$model_path" ] || [ ! -s "$model_path" ]; then
 
65
  while [ $attempt -le $max_attempts ]; do
66
+ echo "Downloading model attempt $attempt/$max_attempts: $model_url"
67
+ curl -L --retry 5 --retry-delay 2 "$model_url" -o "$model_path"
68
 
69
+ # Verify file exists and has content
70
  if [ -f "$model_path" ] && [ -s "$model_path" ]; then
71
+ size=$(du -h "$model_path" | cut -f1)
72
+ echo "✅ Model downloaded successfully ($size): $model_path"
73
  return 0
74
+ else
75
+ echo "❌ Download failed or file is empty. Retrying..."
76
+ rm -f "$model_path" 2>/dev/null
77
+ attempt=$((attempt+1))
78
+ sleep 2
79
  fi
 
 
 
 
 
80
  done
81
 
82
+ echo "Failed to download model after $max_attempts attempts: $model_url"
83
+ exit 1
84
  else
85
+ size=$(du -h "$model_path" | cut -f1)
86
+ echo "✅ Model already exists ($size): $model_path"
87
  fi
88
  }
89
 
90
+ # Download and verify all required models
91
+ echo "Verifying model files..."
92
+ download_model_with_validation "/tmp/models/layout/doclayout/doclayout_yolo.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/doclayout_yolo.pt"
93
+ download_model_with_validation "/tmp/models/MFD/YOLO/yolo_v8_ft.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/yolo_v8_ft.pt"
 
94
 
95
+ # List all model files for verification
96
+ echo "Model files verification:"
97
+ find /tmp/models -type f -exec ls -la {} \;
98
+
99
+ # Verify magic-pdf exists and is executable
100
+ echo "Checking magic-pdf installation..."
101
  if command -v magic-pdf &> /dev/null; then
102
+ echo "magic-pdf found. Testing version:"
103
+ magic-pdf --version
104
  else
105
+ echo "magic-pdf command not found. Installing:"
106
+ pip install --no-cache-dir minerupdf==1.3.6
107
+ if command -v magic-pdf &> /dev/null; then
108
+ echo "magic-pdf installed successfully. Testing version:"
109
+ magic-pdf --version
110
+ else
111
+ echo "Failed to install magic-pdf. Check PATH and installation."
112
+ echo "PATH: $PATH"
113
+ pip list | grep miner
114
+ exit 1
115
+ fi
116
  fi
117
 
118
  # Start the Flask application
requirements.txt CHANGED
@@ -4,5 +4,9 @@ werkzeug==2.3.7
4
  Pillow>=9.0.0
5
  numpy>=1.20.0
6
  requests>=2.25.0
7
- pymupdf>=1.20.0
8
- minerupdf
 
 
 
 
 
4
  Pillow>=9.0.0
5
  numpy>=1.20.0
6
  requests>=2.25.0
7
+ opencv-python-headless>=4.5.0
8
+ torch>=1.8.0
9
+ torchvision>=0.9.0
10
+ transformers>=4.15.0
11
+ huggingface_hub>=0.11.0
12
+ minerupdf==1.3.6