Spaces:
Runtime error
Runtime error
Commit ·
8005bb9
1
Parent(s): eaa0dd6
Fix magic-pdf installation and remove fallbacks, ensuring proper operation in Hugging Face Spaces
Browse files
- Dockerfile +25 -13
- app.py +152 -222
- entrypoint.sh +68 -59
- requirements.txt +6 -2
Dockerfile
CHANGED
|
@@ -9,6 +9,8 @@ RUN apt-get update && \
|
|
| 9 |
libglib2.0-0 \
|
| 10 |
wget \
|
| 11 |
git \
|
|
|
|
|
|
|
| 12 |
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
|
| 14 |
# Set working directory
|
|
@@ -18,34 +20,44 @@ WORKDIR /app
|
|
| 18 |
ENV PYTHONUNBUFFERED=1 \
|
| 19 |
PYTHONDONTWRITEBYTECODE=1 \
|
| 20 |
UPLOAD_FOLDER=/tmp/pdf_uploads \
|
| 21 |
-
OUTPUT_FOLDER=/tmp/pdf_output
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# Create necessary directories and set permissions
|
| 24 |
-
RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
&& chmod -R 777 /tmp/pdf_uploads /tmp/pdf_output /tmp/models
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
# Copy requirements and app files
|
| 28 |
COPY requirements.txt /app/
|
| 29 |
COPY *.py /app/
|
| 30 |
COPY entrypoint.sh /app/
|
| 31 |
RUN chmod +x /app/entrypoint.sh
|
| 32 |
|
| 33 |
-
# Create a non-root user and change ownership
|
| 34 |
-
RUN useradd -m -u 1000 appuser && \
|
| 35 |
-
chown -R appuser:appuser /app /tmp/pdf_uploads /tmp/pdf_output /tmp/models
|
| 36 |
-
|
| 37 |
# Install Python dependencies
|
| 38 |
-
RUN pip install --no-cache-dir -
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
|
|
|
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
RUN
|
|
|
|
| 46 |
|
| 47 |
# Test the installation
|
| 48 |
-
RUN
|
| 49 |
|
| 50 |
# Expose port
|
| 51 |
EXPOSE 7860
|
|
|
|
| 9 |
libglib2.0-0 \
|
| 10 |
wget \
|
| 11 |
git \
|
| 12 |
+
libopenblas-dev \
|
| 13 |
+
libgomp1 \
|
| 14 |
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
|
| 16 |
# Set working directory
|
|
|
|
| 20 |
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    UPLOAD_FOLDER=/tmp/pdf_uploads \
    OUTPUT_FOLDER=/tmp/pdf_output \
    HF_HUB_DISABLE_PROGRESS_BARS=1 \
    HF_HUB_ENABLE_HF_TRANSFER=0

# Create the upload, output and model directories and make them world-writable
RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output \
    && mkdir -p /tmp/models/MFD/YOLO \
    && mkdir -p /tmp/models/MFR/unimernet \
    && mkdir -p /tmp/models/table/rapid \
    && mkdir -p /tmp/models/layout/doclayout \
    && chmod -R 777 /tmp/pdf_uploads /tmp/pdf_output /tmp/models

# Create magic-pdf config directory
RUN mkdir -p /root/.config/magic_pdf

# Copy requirements and app files
COPY requirements.txt /app/
COPY *.py /app/
COPY entrypoint.sh /app/
RUN chmod +x /app/entrypoint.sh

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip setuptools wheel
RUN pip install --no-cache-dir -r requirements.txt

# Write the default magic-pdf config (CPU mode, layout enabled, formula and table disabled)
RUN echo '{"device-mode":"cpu","layout-config":{"model":"doclayout_yolo","enable":true},"formula-config":{"mfd_model":"yolo_v8_mfd","mfr_model":"unimernet_small","enable":false},"table-config":{"model":"rapid_table","sub_model":"slanet_plus","enable":false}}' > /root/.config/magic_pdf/magic-pdf.json

# Download model files directly.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as the model file; the retry flags match entrypoint.sh.
RUN curl --fail -L --retry 5 --retry-delay 2 https://huggingface.co/marcosremar2/mineru-models/resolve/main/doclayout_yolo.pt -o /tmp/models/layout/doclayout/doclayout_yolo.pt
RUN curl --fail -L --retry 5 --retry-delay 2 https://huggingface.co/marcosremar2/mineru-models/resolve/main/yolo_v8_ft.pt -o /tmp/models/MFD/YOLO/yolo_v8_ft.pt

# Verify model files were downloaded correctly (present and non-empty)
RUN test -s /tmp/models/layout/doclayout/doclayout_yolo.pt && ls -la /tmp/models/layout/doclayout/doclayout_yolo.pt
RUN test -s /tmp/models/MFD/YOLO/yolo_v8_ft.pt && ls -la /tmp/models/MFD/YOLO/yolo_v8_ft.pt

# Test the installation
RUN magic-pdf --version

# Expose port
EXPOSE 7860
|
app.py
CHANGED
|
@@ -26,39 +26,61 @@ logging.basicConfig(
|
|
| 26 |
logger = logging.getLogger()
|
| 27 |
|
| 28 |
# Constants
|
| 29 |
-
UPLOAD_FOLDER =
|
| 30 |
-
OUTPUT_FOLDER =
|
| 31 |
ALLOWED_EXTENSIONS = {'pdf'}
|
| 32 |
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
|
| 33 |
|
| 34 |
-
# Ensure the directories exist
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
try:
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
# Test if we can write to these directories
|
| 39 |
-
test_upload_file = os.path.join(UPLOAD_FOLDER, '.test_file')
|
| 40 |
-
test_output_file = os.path.join(OUTPUT_FOLDER, '.test_file')
|
| 41 |
-
|
| 42 |
-
with open(test_upload_file, 'w') as f:
|
| 43 |
-
f.write('test')
|
| 44 |
-
os.remove(test_upload_file)
|
| 45 |
-
|
| 46 |
-
with open(test_output_file, 'w') as f:
|
| 47 |
-
f.write('test')
|
| 48 |
-
os.remove(test_output_file)
|
| 49 |
-
|
| 50 |
-
logging.info(f"Using upload folder: {UPLOAD_FOLDER}")
|
| 51 |
-
logging.info(f"Using output folder: {OUTPUT_FOLDER}")
|
| 52 |
except Exception as e:
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
USER_HOME = os.path.expanduser("~")
|
| 56 |
-
UPLOAD_FOLDER = os.path.join(USER_HOME, 'pdf_uploads')
|
| 57 |
-
OUTPUT_FOLDER = os.path.join(USER_HOME, 'pdf_output')
|
| 58 |
-
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
| 59 |
-
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
|
| 60 |
-
logging.info(f"Using fallback upload folder: {UPLOAD_FOLDER}")
|
| 61 |
-
logging.info(f"Using fallback output folder: {OUTPUT_FOLDER}")
|
| 62 |
|
| 63 |
# Function to check if file extension is allowed
|
| 64 |
def allowed_file(filename):
|
|
@@ -364,220 +386,128 @@ def help_output():
|
|
| 364 |
# Route for PDF conversion
|
| 365 |
@app.route('/api/convert', methods=['POST'])
|
| 366 |
def convert_pdf():
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
file = request.files['file']
|
| 371 |
-
|
| 372 |
-
if file.filename == '':
|
| 373 |
-
return jsonify({"error": "No file selected"}), 400
|
| 374 |
-
|
| 375 |
-
if not allowed_file(file.filename):
|
| 376 |
-
return jsonify({"error": f"Only {', '.join(ALLOWED_EXTENSIONS)} files are allowed"}), 400
|
| 377 |
-
|
| 378 |
try:
|
| 379 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
session_id = str(uuid.uuid4())
|
| 381 |
session_dir = os.path.join(OUTPUT_FOLDER, session_id)
|
| 382 |
-
|
| 383 |
-
try:
|
| 384 |
-
os.makedirs(session_dir, exist_ok=True)
|
| 385 |
-
logging.info(f"Created session directory: {session_dir}")
|
| 386 |
-
except PermissionError as e:
|
| 387 |
-
logging.error(f"Permission error creating directory {session_dir}: {str(e)}")
|
| 388 |
-
# Try using a temp directory instead
|
| 389 |
-
session_dir = tempfile.mkdtemp(prefix="minerupdf_")
|
| 390 |
-
logging.info(f"Using temporary directory instead: {session_dir}")
|
| 391 |
|
| 392 |
# Save the uploaded file
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
except PermissionError:
|
| 398 |
-
# Try saving in the temp directory
|
| 399 |
-
input_path = os.path.join(session_dir, f"{session_id}_{file.filename}")
|
| 400 |
-
file.save(input_path)
|
| 401 |
-
logging.info(f"Saved uploaded file to alternate location: {input_path}")
|
| 402 |
|
| 403 |
-
|
| 404 |
-
base_filename = os.path.splitext(os.path.basename(file.filename))[0]
|
| 405 |
-
base_filename = ''.join(c if c.isalnum() or c in ['_', '-', '.'] else '_' for c in base_filename)
|
| 406 |
|
|
|
|
| 407 |
markdown_path = os.path.join(session_dir, f"{base_filename}.md")
|
| 408 |
json_path = os.path.join(session_dir, f"{base_filename}.json")
|
| 409 |
|
| 410 |
-
# Define alternate locations to search for generated files
|
| 411 |
-
alternate_locations = [
|
| 412 |
-
os.path.dirname(input_path), # Check PDF directory
|
| 413 |
-
os.path.join(OUTPUT_FOLDER, base_filename), # Check output folder with base filename
|
| 414 |
-
session_dir # Default session directory
|
| 415 |
-
]
|
| 416 |
-
|
| 417 |
# URLs for file download
|
| 418 |
markdown_url = f"/download/{session_id}/{base_filename}.md"
|
| 419 |
json_url = f"/download/{session_id}/{base_filename}.json"
|
| 420 |
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
"pages": len(pdf),
|
| 448 |
-
"sections": []
|
| 449 |
-
}
|
| 450 |
-
|
| 451 |
-
for page_num in range(len(pdf)):
|
| 452 |
-
page = pdf[page_num]
|
| 453 |
-
text = page.get_text()
|
| 454 |
-
|
| 455 |
-
# Add to markdown
|
| 456 |
-
markdown_content += f"## Page {page_num + 1}\n\n{text}\n\n"
|
| 457 |
-
|
| 458 |
-
# Add to JSON
|
| 459 |
-
json_content["sections"].append({
|
| 460 |
-
"title": f"Page {page_num + 1}",
|
| 461 |
-
"content": text
|
| 462 |
-
})
|
| 463 |
-
|
| 464 |
-
# Save markdown
|
| 465 |
-
with open(markdown_path, 'w', encoding='utf-8') as f:
|
| 466 |
-
f.write(markdown_content)
|
| 467 |
-
|
| 468 |
-
# Save JSON
|
| 469 |
-
with open(json_path, 'w', encoding='utf-8') as f:
|
| 470 |
-
json.dump(json_content, f, indent=2, ensure_ascii=False)
|
| 471 |
-
|
| 472 |
-
logging.info(f"Successfully converted PDF to markdown and JSON using PyMuPDF")
|
| 473 |
-
except ImportError:
|
| 474 |
-
# PyMuPDF not available
|
| 475 |
-
return jsonify({
|
| 476 |
-
"error": "PDF conversion failed with both MinerU and PyMuPDF",
|
| 477 |
-
"details": process.stderr,
|
| 478 |
-
"command": " ".join(cmd)
|
| 479 |
-
}), 500
|
| 480 |
-
|
| 481 |
-
# Check if files were generated
|
| 482 |
-
md_exists = os.path.exists(markdown_path)
|
| 483 |
-
json_exists = os.path.exists(json_path)
|
| 484 |
-
|
| 485 |
-
# Search in alternate locations if files not found
|
| 486 |
-
if not md_exists or not json_exists:
|
| 487 |
-
logging.info("Files not found in primary location, searching alternate locations...")
|
| 488 |
-
|
| 489 |
-
for location in alternate_locations:
|
| 490 |
-
if not md_exists:
|
| 491 |
-
alt_md_path = os.path.join(location, f"{base_filename}.md")
|
| 492 |
-
if os.path.exists(alt_md_path):
|
| 493 |
-
# Copy to session directory for download access
|
| 494 |
-
try:
|
| 495 |
-
shutil.copy(alt_md_path, markdown_path)
|
| 496 |
-
md_exists = True
|
| 497 |
-
logging.info(f"Found and copied Markdown file from {alt_md_path}")
|
| 498 |
-
except:
|
| 499 |
-
# If copy fails, update path to point to original
|
| 500 |
-
markdown_path = alt_md_path
|
| 501 |
-
md_exists = True
|
| 502 |
-
logging.info(f"Found Markdown file at alternate location: {alt_md_path}")
|
| 503 |
-
|
| 504 |
-
if not json_exists:
|
| 505 |
-
alt_json_path = os.path.join(location, f"{base_filename}.json")
|
| 506 |
-
if os.path.exists(alt_json_path):
|
| 507 |
-
# Copy to session directory for download access
|
| 508 |
-
try:
|
| 509 |
-
shutil.copy(alt_json_path, json_path)
|
| 510 |
-
json_exists = True
|
| 511 |
-
logging.info(f"Found and copied JSON file from {alt_json_path}")
|
| 512 |
-
except:
|
| 513 |
-
# If copy fails, update path to point to original
|
| 514 |
-
json_path = alt_json_path
|
| 515 |
-
json_exists = True
|
| 516 |
-
logging.info(f"Found JSON file at alternate location: {alt_json_path}")
|
| 517 |
-
|
| 518 |
-
# If still not found, check for any markdown or JSON files in the directory
|
| 519 |
-
if not md_exists:
|
| 520 |
-
md_files = [f for f in os.listdir(session_dir) if f.endswith('.md')]
|
| 521 |
-
if md_files:
|
| 522 |
-
markdown_path = os.path.join(session_dir, md_files[0])
|
| 523 |
-
md_exists = True
|
| 524 |
-
logging.info(f"Found Markdown file at alternate location: {markdown_path}")
|
| 525 |
-
|
| 526 |
-
if not json_exists:
|
| 527 |
-
json_files = [f for f in os.listdir(session_dir) if f.endswith('.json')]
|
| 528 |
-
if json_files:
|
| 529 |
-
json_path = os.path.join(session_dir, json_files[0])
|
| 530 |
-
json_exists = True
|
| 531 |
-
logging.info(f"Found JSON file at alternate location: {json_path}")
|
| 532 |
-
|
| 533 |
-
if md_exists:
|
| 534 |
-
with open(markdown_path, 'r', encoding='utf-8') as f:
|
| 535 |
-
markdown_content = f.read()
|
| 536 |
-
else:
|
| 537 |
-
markdown_content = None
|
| 538 |
-
logging.warning(f"Markdown file not found at {markdown_path}")
|
| 539 |
-
|
| 540 |
-
if json_exists:
|
| 541 |
-
with open(json_path, 'r', encoding='utf-8') as f:
|
| 542 |
-
json_content = json.load(f)
|
| 543 |
-
else:
|
| 544 |
-
json_content = None
|
| 545 |
-
logging.warning(f"JSON file not found at {json_path}")
|
| 546 |
-
|
| 547 |
-
return jsonify({
|
| 548 |
-
"success": True,
|
| 549 |
-
"session_id": session_id,
|
| 550 |
-
"markdown_url": markdown_url if md_exists else None,
|
| 551 |
-
"json_url": json_url if json_exists else None,
|
| 552 |
-
"markdown_content": markdown_content,
|
| 553 |
-
"json_content": json_content,
|
| 554 |
-
"message": "PDF conversion completed"
|
| 555 |
-
})
|
| 556 |
-
|
| 557 |
-
except Exception as e:
|
| 558 |
-
logging.error(f"Error during conversion: {str(e)}")
|
| 559 |
-
logging.error(traceback.format_exc())
|
| 560 |
|
| 561 |
return jsonify({
|
| 562 |
-
"
|
| 563 |
-
"
|
|
|
|
| 564 |
}), 500
|
| 565 |
-
|
| 566 |
-
except Exception as e:
|
| 567 |
-
logging.error(f"General error: {str(e)}")
|
| 568 |
-
logging.error(traceback.format_exc())
|
| 569 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 570 |
return jsonify({
|
| 571 |
-
"
|
| 572 |
-
"
|
|
|
|
| 573 |
}), 500
|
| 574 |
-
finally:
|
| 575 |
-
# Clean up the input file
|
| 576 |
-
try:
|
| 577 |
-
if 'input_path' in locals() and os.path.exists(input_path):
|
| 578 |
-
os.remove(input_path)
|
| 579 |
-
except Exception as e:
|
| 580 |
-
logging.warning(f"Failed to clean up input file: {str(e)}")
|
| 581 |
|
| 582 |
# Route to download converted files
|
| 583 |
@app.route('/download/<session_id>/<filename>')
|
|
|
|
| 26 |
logger = logging.getLogger()
|
| 27 |
|
| 28 |
# Constants
UPLOAD_FOLDER = '/tmp/pdf_uploads'
OUTPUT_FOLDER = '/tmp/pdf_output'
ALLOWED_EXTENSIONS = {'pdf'}
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB

# Ensure the directories exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
logging.info(f"Using upload folder: {UPLOAD_FOLDER}")
logging.info(f"Using output folder: {OUTPUT_FOLDER}")

# Create config directories and files if they don't exist
CONFIG_DIR = os.path.join(os.path.expanduser("~"), ".config", "magic_pdf")
os.makedirs(CONFIG_DIR, exist_ok=True)
CONFIG_FILE = os.path.join(CONFIG_DIR, "magic-pdf.json")

# Default configuration: CPU mode, layout detection enabled,
# formula and table recognition disabled.
DEFAULT_CONFIG = {
    "device-mode": "cpu",
    "layout-config": {
        "model": "doclayout_yolo",
        "enable": True
    },
    "formula-config": {
        "mfd_model": "yolo_v8_mfd",
        "mfr_model": "unimernet_small",
        "enable": False
    },
    "table-config": {
        "model": "rapid_table",
        "sub_model": "slanet_plus",
        "enable": False
    }
}

# Second copy of the config, placed directly in the home directory as a fallback
HOME_CONFIG = os.path.join(os.path.expanduser("~"), "magic-pdf.json")

# Write each config file only if it doesn't exist yet (never overwrite)
for _config_path in (CONFIG_FILE, HOME_CONFIG):
    if not os.path.exists(_config_path):
        with open(_config_path, 'w', encoding='utf-8') as f:
            json.dump(DEFAULT_CONFIG, f, indent=2)
        logging.info(f"Created magic-pdf config at {_config_path}")

# Verify magic-pdf installation
try:
    # check=True turns a non-zero exit status into an exception, so a
    # present-but-broken magic-pdf is rejected as well as a missing one.
    result = subprocess.run(
        ['magic-pdf', '--version'],
        capture_output=True,
        text=True,
        check=True,
    )
    logging.info(f"magic-pdf version: {result.stdout.strip()}")
except Exception as e:
    logging.error(f"Error checking magic-pdf: {str(e)}")
    raise RuntimeError("magic-pdf is not installed properly. Please check installation.") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
# Function to check if file extension is allowed
|
| 86 |
def allowed_file(filename):
|
|
|
|
| 386 |
# Route for PDF conversion
|
| 387 |
@app.route('/api/convert', methods=['POST'])
def convert_pdf():
    """
    API endpoint for PDF conversion.

    Expects a multipart upload under the ``file`` form field. The PDF is
    saved into a fresh per-request session directory under OUTPUT_FOLDER and
    converted by running the ``magic-pdf`` command-line tool.

    Returns:
        A JSON response. On success it carries ``session_id``, the download
        URLs and contents of the Markdown/JSON output (``None`` for a file
        that was not found), the input file name and the tool's log.
        Validation failures return HTTP 400; a non-zero exit code from
        ``magic-pdf`` or any unexpected exception returns HTTP 500.
    """
    try:
        # Check if file is in the request
        if 'file' not in request.files:
            return jsonify({
                "success": False,
                "error": "No file part in the request"
            }), 400

        file = request.files['file']

        # Check if file is selected
        if file.filename == '':
            return jsonify({
                "success": False,
                "error": "No file selected"
            }), 400

        # Check if file is a PDF
        if not allowed_file(file.filename):
            return jsonify({
                "success": False,
                "error": "File must be a PDF"
            }), 400

        # Generate session ID
        session_id = str(uuid.uuid4())
        session_dir = os.path.join(OUTPUT_FOLDER, session_id)
        os.makedirs(session_dir, exist_ok=True)

        # Save the uploaded file (basename drops any directory part the
        # client put in the file name)
        filename = os.path.basename(file.filename)
        base_filename = os.path.splitext(filename)[0]
        input_path = os.path.join(session_dir, filename)
        file.save(input_path)

        logging.info(f"Saved PDF file to {input_path}")

        # Set paths for output files
        # NOTE(review): this assumes magic-pdf writes <name>.md / <name>.json
        # directly into session_dir; confirm against the CLI's output layout.
        markdown_path = os.path.join(session_dir, f"{base_filename}.md")
        json_path = os.path.join(session_dir, f"{base_filename}.json")

        # URLs for file download
        markdown_url = f"/download/{session_id}/{base_filename}.md"
        json_url = f"/download/{session_id}/{base_filename}.json"

        # Run magic-pdf to convert the PDF
        command = ["magic-pdf", "--path", input_path, "--output-dir", session_dir]

        logging.info(f"Running command: {' '.join(command)}")

        # Execute the command and capture output. The context manager closes
        # the pipe and reaps the child even if reading the output raises.
        process_log = []
        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True
        ) as process:
            # Capture output in real-time
            for line in process.stdout:
                process_log.append(line)
                logging.info(f"magic-pdf output: {line.strip()}")

            # Wait for process to complete
            return_code = process.wait()

        # Each captured line still ends with its own newline, so join without
        # a separator; joining on "\n" would double every line break.
        process_output = "".join(process_log)

        if return_code != 0:
            error_message = process_output
            logging.error(f"magic-pdf failed with code {return_code}")
            logging.error(f"Output: {error_message}")

            return jsonify({
                "success": False,
                "error": f"PDF conversion failed with exit code {return_code}",
                "log": error_message
            }), 500

        # Check if files were generated
        md_exists = os.path.exists(markdown_path)
        json_exists = os.path.exists(json_path)

        logging.info(f"Markdown file exists: {md_exists} at {markdown_path}")
        logging.info(f"JSON file exists: {json_exists} at {json_path}")

        # Read file contents if they exist
        if md_exists:
            with open(markdown_path, 'r', encoding='utf-8') as f:
                markdown_content = f.read()
        else:
            markdown_content = None
            logging.warning(f"Markdown file not found at {markdown_path}")

        if json_exists:
            with open(json_path, 'r', encoding='utf-8') as f:
                json_content = json.load(f)
        else:
            json_content = None
            logging.warning(f"JSON file not found at {json_path}")

        # Return results
        return jsonify({
            "success": True,
            "session_id": session_id,
            "markdown_url": markdown_url if md_exists else None,
            "json_url": json_url if json_exists else None,
            "markdown_content": markdown_content,
            "json_content": json_content,
            "input_file": filename,
            "log": process_output
        })

    except Exception as e:
        logging.exception("Error in convert_pdf endpoint")
        # NOTE(review): the full traceback is sent back to the client, which
        # exposes server internals; consider keeping it in the log only.
        return jsonify({
            "success": False,
            "error": str(e),
            "traceback": traceback.format_exc()
        }), 500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
# Route to download converted files
|
| 513 |
@app.route('/download/<session_id>/<filename>')
|
entrypoint.sh
CHANGED
|
@@ -8,39 +8,25 @@ echo "Current directory: $(pwd)"
|
|
| 8 |
echo "Directory listing:"
|
| 9 |
ls -la
|
| 10 |
|
| 11 |
-
#
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
mkdir -p /tmp/models/MFD/YOLO
|
| 15 |
-
mkdir -p /tmp/models/MFR/unimernet
|
| 16 |
-
mkdir -p /tmp/models/table/rapid
|
| 17 |
-
mkdir -p /tmp/models/layout/doclayout
|
| 18 |
|
| 19 |
-
#
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
echo "Warning: magic-pdf command not found. Installing MinerU PDF processing tools..."
|
| 27 |
-
pip install --no-cache-dir minerupdf
|
| 28 |
-
|
| 29 |
-
# Verify installation
|
| 30 |
-
if command -v magic-pdf &> /dev/null; then
|
| 31 |
-
echo "Successfully installed magic-pdf."
|
| 32 |
-
else
|
| 33 |
-
echo "Failed to install magic-pdf. Will use PyMuPDF fallback."
|
| 34 |
-
pip install --no-cache-dir pymupdf
|
| 35 |
fi
|
| 36 |
-
|
| 37 |
-
echo "
|
| 38 |
-
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
{
|
| 44 |
"device-mode": "cpu",
|
| 45 |
"layout-config": {
|
| 46 |
"model": "doclayout_yolo",
|
|
@@ -49,61 +35,84 @@ cat > ~/.config/magic_pdf/magic-pdf.json << EOL
|
|
| 49 |
"formula-config": {
|
| 50 |
"mfd_model": "yolo_v8_mfd",
|
| 51 |
"mfr_model": "unimernet_small",
|
| 52 |
-
"enable":
|
| 53 |
},
|
| 54 |
"table-config": {
|
| 55 |
"model": "rapid_table",
|
| 56 |
"sub_model": "slanet_plus",
|
| 57 |
-
"enable":
|
| 58 |
-
"max_time": 400
|
| 59 |
}
|
| 60 |
-
}
|
| 61 |
-
EOL
|
| 62 |
|
| 63 |
-
#
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
local model_path=$1
|
| 66 |
local model_url=$2
|
| 67 |
local max_attempts=3
|
| 68 |
local attempt=1
|
| 69 |
|
|
|
|
|
|
|
| 70 |
if [ ! -f "$model_path" ] || [ ! -s "$model_path" ]; then
|
| 71 |
-
echo "Downloading model to $model_path from $model_url"
|
| 72 |
while [ $attempt -le $max_attempts ]; do
|
| 73 |
-
echo "
|
| 74 |
-
curl -L
|
| 75 |
|
|
|
|
| 76 |
if [ -f "$model_path" ] && [ -s "$model_path" ]; then
|
| 77 |
-
|
|
|
|
| 78 |
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
fi
|
| 80 |
-
|
| 81 |
-
echo "Download failed or file is empty. Retrying..."
|
| 82 |
-
rm -f "$model_path"
|
| 83 |
-
attempt=$((attempt+1))
|
| 84 |
-
sleep 2
|
| 85 |
done
|
| 86 |
|
| 87 |
-
echo "Failed to download model after $max_attempts attempts"
|
| 88 |
-
|
| 89 |
else
|
| 90 |
-
|
| 91 |
-
|
| 92 |
fi
|
| 93 |
}
|
| 94 |
|
| 95 |
-
# Download models
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
download_model "/tmp/models/layout/doclayout/doclayout_yolo.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/doclayout_yolo.pt"
|
| 100 |
|
| 101 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
if command -v magic-pdf &> /dev/null; then
|
| 103 |
-
echo "
|
| 104 |
-
magic-pdf --version
|
| 105 |
else
|
| 106 |
-
echo "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
fi
|
| 108 |
|
| 109 |
# Start the Flask application
|
|
|
|
| 8 |
echo "Directory listing:"
|
| 9 |
ls -la
|
| 10 |
|
| 11 |
+
# Turn off Hugging Face Hub progress bars and hf_transfer (prevents hanging)
export HF_HUB_DISABLE_PROGRESS_BARS=1 HF_HUB_ENABLE_HF_TRANSFER=0

# Make sure every working directory exists and is world-writable
echo "Checking directories..."
for dir in \
    /tmp/pdf_uploads \
    /tmp/pdf_output \
    /tmp/models/MFD/YOLO \
    /tmp/models/MFR/unimernet \
    /tmp/models/table/rapid \
    /tmp/models/layout/doclayout
do
    # Only announce and create directories that are actually missing
    [ -d "$dir" ] || { echo "Creating directory: $dir"; mkdir -p "$dir"; }
    chmod -R 777 "$dir"
    echo "Directory $dir is ready"
done
|
| 26 |
|
| 27 |
+
# Copy config file to all possible locations to ensure it's found
|
| 28 |
+
echo "Setting up magic-pdf config..."
|
| 29 |
+
CONFIG_CONTENT='{
|
|
|
|
| 30 |
"device-mode": "cpu",
|
| 31 |
"layout-config": {
|
| 32 |
"model": "doclayout_yolo",
|
|
|
|
| 35 |
"formula-config": {
|
| 36 |
"mfd_model": "yolo_v8_mfd",
|
| 37 |
"mfr_model": "unimernet_small",
|
| 38 |
+
"enable": false
|
| 39 |
},
|
| 40 |
"table-config": {
|
| 41 |
"model": "rapid_table",
|
| 42 |
"sub_model": "slanet_plus",
|
| 43 |
+
"enable": false
|
|
|
|
| 44 |
}
|
| 45 |
+
}'
|
|
|
|
| 46 |
|
| 47 |
+
# Create config in all possible locations.
# Each write is best-effort: a location whose directory is missing or not
# writable is reported and skipped instead of producing a raw shell error.
mkdir -p ~/.config/magic_pdf
for config_target in \
    "$HOME/.config/magic_pdf/magic-pdf.json" \
    "$HOME/magic-pdf.json" \
    /app/magic-pdf.json \
    /home/user/magic-pdf.json \
    /root/.config/magic_pdf/magic-pdf.json
do
    # The braces let 2>/dev/null also silence the shell's own redirection error
    if { echo "$CONFIG_CONTENT" > "$config_target"; } 2>/dev/null; then
        echo "Wrote magic-pdf config: $config_target"
    else
        echo "Warning: could not write magic-pdf config to $config_target"
    fi
done
|
| 54 |
+
|
| 55 |
+
# Download a model file unless a non-empty copy is already present.
#   $1 - destination path of the model file
#   $2 - URL to download the model from
# Returns 0 once the file exists with content; exits the whole script with
# status 1 when every download attempt fails.
function download_model_with_validation() {
    local model_path=$1
    local model_url=$2
    local max_attempts=3
    local attempt=1
    local size

    echo "Checking model file: $model_path"

    if [ ! -f "$model_path" ] || [ ! -s "$model_path" ]; then
        while [ "$attempt" -le "$max_attempts" ]; do
            echo "Downloading model attempt $attempt/$max_attempts: $model_url"
            # --fail stops curl from saving an HTTP error page as the model,
            # which would otherwise pass the non-empty check below.
            # The || branch keeps a curl failure from ending the retry loop.
            curl --fail -L --retry 5 --retry-delay 2 "$model_url" -o "$model_path" \
                || echo "curl exited with status $?"

            # Verify file exists and has content
            if [ -f "$model_path" ] && [ -s "$model_path" ]; then
                size=$(du -h "$model_path" | cut -f1)
                echo "✅ Model downloaded successfully ($size): $model_path"
                return 0
            else
                echo "❌ Download failed or file is empty. Retrying..."
                rm -f "$model_path" 2>/dev/null
                attempt=$((attempt+1))
                sleep 2
            fi
        done

        echo "❌ Failed to download model after $max_attempts attempts: $model_url"
        exit 1
    else
        size=$(du -h "$model_path" | cut -f1)
        echo "✅ Model already exists ($size): $model_path"
    fi
}
|
| 89 |
|
| 90 |
+
# Fetch (or confirm) each required model file
echo "Verifying model files..."
download_model_with_validation "/tmp/models/layout/doclayout/doclayout_yolo.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/doclayout_yolo.pt"
download_model_with_validation "/tmp/models/MFD/YOLO/yolo_v8_ft.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/yolo_v8_ft.pt"

# Print every file under the model tree for inspection
echo "Model files verification:"
find /tmp/models -type f -exec ls -la {} \;

# Make sure the magic-pdf command is available, installing it if necessary
echo "Checking magic-pdf installation..."
if ! command -v magic-pdf &> /dev/null; then
    echo "magic-pdf command not found. Installing:"
    pip install --no-cache-dir minerupdf==1.3.6
    if ! command -v magic-pdf &> /dev/null; then
        # Still missing after the install: print diagnostics and stop
        echo "Failed to install magic-pdf. Check PATH and installation."
        echo "PATH: $PATH"
        pip list | grep miner
        exit 1
    fi
    echo "magic-pdf installed successfully. Testing version:"
else
    echo "magic-pdf found. Testing version:"
fi
magic-pdf --version
|
| 117 |
|
| 118 |
# Start the Flask application
|
requirements.txt
CHANGED
|
@@ -4,5 +4,9 @@ werkzeug==2.3.7
|
|
| 4 |
Pillow>=9.0.0
|
| 5 |
numpy>=1.20.0
|
| 6 |
requests>=2.25.0
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
Pillow>=9.0.0
|
| 5 |
numpy>=1.20.0
|
| 6 |
requests>=2.25.0
|
| 7 |
+
opencv-python-headless>=4.5.0
|
| 8 |
+
torch>=1.8.0
|
| 9 |
+
torchvision>=0.9.0
|
| 10 |
+
transformers>=4.15.0
|
| 11 |
+
huggingface_hub>=0.11.0
|
| 12 |
+
minerupdf==1.3.6
|