#!/usr/bin/env python3
"""
Pre-flight validation script for MinerU OCR Service.

Run by entrypoint.sh BEFORE uvicorn starts.
Exits 0 if all checks pass.
Exits 1 if any CRITICAL check fails — this crashes the container loudly
so Hugging Face logs show an actionable error instead of a silent crash
or a healthy-looking service that fails on every request.

Usage:
    python validate.py          # run all checks, exit 0/1
    python validate.py --soft   # run all checks, always exit 0 (log only)

── FORENSIC NOTES (2025-06) ──────────────────────────────────────────────────

OCR engine:
    The pipeline (full) backend uses paddleocr2pytorch — a self-contained
    PyTorch reimplementation of PaddleOCR bundled inside the magic-pdf wheel.
    It uses: torch, cv2, numpy, pyclipper, shapely, yaml.
    paddlepaddle and paddleocr packages are NOT installed and NOT needed.

    pp_structure_v2.py (which imports paddleocr) is only loaded in 'lite'
    model mode. Pipeline backend always uses 'full' mode (CustomPEKModel).
    That file is never imported at runtime.

OCR model path resolution (from pytorch_paddle.py):
    ocr_models_dir = os.path.join(get_local_models_dir(), 'OCR', 'paddleocr_torch')
    det_model_path = os.path.join(ocr_models_dir, det_filename)
    where det_filename comes from models_config.yml keyed by language.

    Default CPU path: lang='ch' → forced to 'ch_lite' on CPU device.
    After Dockerfile Layer 3.5 patch:
        ch_lite.det = ch_PP-OCRv5_det_infer.pth (was ch_PP-OCRv3 — not in HF repo)
        ch_lite.rec = ch_PP-OCRv5_rec_infer.pth (unchanged — already in HF repo)

Arch config lookup (from pytorchocr_utility.py):
    get_arch_config(model_path) uses Path(model_path).stem as the key into
    arch_config.yaml (bundled in magic-pdf wheel). Both replacement filenames
    have entries in arch_config.yaml — verified before patch was written.

OpenCV conflict handling:
    doclayout-yolo, ultralytics, and rapid-table all declare opencv-python
    (non-headless) as a required dep. pip installs the full build in Layer 3.
    Layer 4 force-reinstalls opencv-python-headless to overwrite cv2.
    Both packages expose an identical cv2 API so all callers work correctly
    at runtime. pip-check shows warnings but they are harmless.

onnxruntime:
    rapid-table declares onnxruntime>1.17.0 as a required (non-optional) dep.
    pip resolves it automatically when magic-pdf[full] is installed in Layer 3.

slanet-plus.onnx (table model):
    Bundled inside the magic-pdf wheel at:
        magic_pdf/resources/slanet_plus/slanet-plus.onnx
    NOT downloaded from HF Hub — no separate download needed.
"""

import importlib
import json
import os
import shutil
import sys
import tempfile
import traceback

SOFT_MODE = "--soft" in sys.argv  # never exit 1, just print

MODELS_DIR = "/app/models"
EXTRACT_KIT_MODELS = os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0", "models")
LAYOUT_MARKER = os.path.join(EXTRACT_KIT_MODELS, "Layout")  # canary directory
CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")


# ── helpers ────────────────────────────────────────────────────────────────────
def ok(label: str, detail: str = "") -> None:
    """Print a passing check line; *detail* is appended in parentheses if given."""
    suffix = f" ({detail})" if detail else ""
    print(f" ✓ {label}{suffix}", flush=True)


def fail(label: str, detail: str, critical: bool = True) -> None:
    """Print a failing check line tagged CRITICAL or WARNING (does not record it)."""
    tag = "CRITICAL" if critical else "WARNING"
    print(f" ✗ [{tag}] {label}: {detail}", flush=True)


def section(title: str) -> None:
    """Print a section banner: blank line, rule, title, rule."""
    print(f"\n{'─' * 60}", flush=True)
    print(f" {title}", flush=True)
    print(f"{'─' * 60}", flush=True)


def cv2_gui_backends(build_info: str) -> list[str]:
    """Return the GTK/Qt toolkits that an OpenCV build report marks as enabled.

    *build_info* is the text returned by ``cv2.getBuildInformation()``.
    The report is scanned line by line as ``key: value`` pairs. A toolkit
    counts as enabled when either the ``GUI:`` summary line names GTK or Qt,
    or a ``GTK…``/``QT…`` line has a value starting with ``YES``.

    A plain substring test is not enough: headless builds still print lines
    such as ``GTK+: NO``, which would be misread as a GUI build.
    """
    enabled: list[str] = []
    for raw_line in build_info.splitlines():
        key, sep, value = raw_line.partition(":")
        if not sep:
            continue  # banner / free-text line, not a key: value pair
        key = key.strip()
        value = value.strip()
        key_upper = key.upper()
        value_upper = value.upper()
        if key_upper == "GUI":
            # Summary line, e.g. "GUI: QT5" or "GUI: NONE" (may also be empty).
            if value_upper.startswith(("GTK", "QT")):
                enabled.append(value.split()[0])
        elif key_upper.startswith(("GTK", "QT")) and value_upper.startswith("YES"):
            enabled.append(key)
    return enabled


# ── check registry ─────────────────────────────────────────────────────────────
failures: list[tuple[str, str]] = []
warnings: list[tuple[str, str]] = []


def record_fail(label: str, detail: str, critical: bool = True) -> None:
    """Print a failed check and remember it for the final summary.

    Critical failures go to ``failures`` (they make the script exit 1 unless
    --soft is given); non-critical ones go to ``warnings``.
    """
    fail(label, detail, critical)
    if critical:
        failures.append((label, detail))
    else:
        warnings.append((label, detail))


# ═══════════════════════════════════════════════════════════════════════════════
print("\n" + "═" * 60, flush=True)
print(" MinerU OCR Service — Pre-flight Validation", flush=True)
print("═" * 60, flush=True)

# ── 1. Python version ──────────────────────────────────────────────────────────
section("1. Python runtime")
pv = sys.version_info
if pv >= (3, 10):
    ok("Python version", f"{pv.major}.{pv.minor}.{pv.micro}")
else:
    record_fail("Python version", f"{pv.major}.{pv.minor} detected — magic-pdf requires >= 3.10")

# ── 2. cv2 ─────────────────────────────────────────────────────────────────────
section("2. OpenCV (cv2)")
try:
    import cv2

    ok("cv2 import", f"version {cv2.__version__}")
    build = cv2.getBuildInformation()
    # Only toolkits reported as enabled count; "GTK+: NO" in a headless
    # build must not trigger the warning.
    if cv2_gui_backends(build):
        record_fail("cv2 build", "GUI backend detected — use opencv-python-headless", critical=False)
    else:
        ok("cv2 headless", "no GUI backend detected")
except ImportError as exc:
    record_fail(
        "cv2 import",
        f"{exc}. "
        "Layer 4 force-reinstall of opencv-python-headless may have failed. "
        "Check Docker build log for the 'pip install --force-reinstall opencv-python-headless' step.",
    )
except Exception as exc:
    record_fail("cv2 import", f"unexpected error: {exc}")

# ── 3. PyTorch ─────────────────────────────────────────────────────────────────
section("3. PyTorch + TorchVision")
try:
    import torch

    ok("torch import", f"version {torch.__version__}")
    if torch.cuda.is_available():
        record_fail("torch CUDA", "CUDA detected on CPU-only space — unexpected", critical=False)
    else:
        ok("torch device", "CPU-only (expected for free tier)")
except ImportError as exc:
    record_fail(
        "torch import",
        f"{exc}. "
        "Install from PyTorch CPU index BEFORE magic-pdf in Dockerfile Layer 2: "
        "pip install --index-url https://download.pytorch.org/whl/cpu torch torchvision",
    )
except Exception as exc:
    record_fail("torch import", f"unexpected: {exc}")

try:
    import torchvision

    ok("torchvision import", f"version {torchvision.__version__}")
except ImportError as exc:
    record_fail("torchvision import", str(exc))
except Exception as exc:
    record_fail("torchvision import", f"unexpected: {exc}")

# ── 4. ultralytics ─────────────────────────────────────────────────────────────
section("4. ultralytics (YOLO — required by doclayout_yolo)")
try:
    import ultralytics

    ok("ultralytics import", f"version {ultralytics.__version__}")
except ImportError as exc:
    record_fail(
        "ultralytics import",
        f"{exc}. "
        "Provided by magic-pdf[full]. "
        "ROOT CAUSE: [full-cpu] is NOT a valid extra in magic-pdf 1.3.12 — "
        "pip silently installed only the base package when given an unknown extra. "
        "Dockerfile Layer 3 must use magic-pdf[full]==1.3.12 (not [full-cpu]).",
    )
except Exception as exc:
    record_fail("ultralytics import", f"unexpected: {exc}")

# ── 5. doclayout_yolo ──────────────────────────────────────────────────────────
section("5. doclayout_yolo (layout detection model)")
try:
    import doclayout_yolo

    ok("doclayout_yolo import", f"version {getattr(doclayout_yolo, '__version__', 'unknown')}")
except ImportError as exc:
    record_fail(
        "doclayout_yolo import",
        f"{exc}. "
        "Provided by magic-pdf[full] (version 0.0.2b1). "
        "doclayout-yolo==0.0.2b1 is only on the myhloli custom wheel index — "
        "Dockerfile Layer 3 must include: "
        "--extra-index-url https://myhloli.github.io/wheels/",
    )
except Exception as exc:
    record_fail("doclayout_yolo import", f"unexpected: {exc}")

# ── 6. rapid_table ─────────────────────────────────────────────────────────────
section("6. rapid_table (table extraction)")
try:
    import rapid_table

    ok("rapid_table import", f"version {getattr(rapid_table, '__version__', 'unknown')}")
except ImportError as exc:
    record_fail(
        "rapid_table import",
        f"{exc}. Provided by magic-pdf[full]. Check Layer 3 install.",
    )
except Exception as exc:
    record_fail("rapid_table import", f"unexpected: {exc}")

# ── 7. onnxruntime ─────────────────────────────────────────────────────────────
section("7. onnxruntime (required by rapid-table for table model inference)")
# onnxruntime is a required (non-optional) dep of rapid-table>=1.0.5.
# pip pulls it in on its own when magic-pdf[full] is installed in Layer 3,
# so a missing onnxruntime points at a failed rapid-table install.
try:
    import onnxruntime

    _ort_version = onnxruntime.__version__
    ok("onnxruntime import", f"version {_ort_version}")
except ImportError as exc:
    _ort_hint = (
        "onnxruntime is a required dep of rapid-table>=1.0.5. "
        "Its absence means rapid-table failed to install in Layer 3. "
        "Check Docker build log for rapid-table install errors."
    )
    record_fail("onnxruntime import", f"{exc}. " + _ort_hint)
except Exception as exc:
    record_fail("onnxruntime import", f"unexpected: {exc}")

# ── 8. magic_pdf core imports ──────────────────────────────────────────────────
section("8. magic_pdf core imports")

# (module path, names that module must expose)
REQUIRED_IMPORTS = [
    ("magic_pdf.data.dataset", ["PymuDocDataset", "ImageDataset"]),
    ("magic_pdf.data.data_reader_writer", ["FileBasedDataReader", "FileBasedDataWriter"]),
    ("magic_pdf.model.doc_analyze_by_custom_model", ["doc_analyze"]),
    ("magic_pdf.config.enums", ["SupportedPdfParseMethod"]),
]

for module_path, symbols in REQUIRED_IMPORTS:
    try:
        loaded = importlib.import_module(module_path)
        absent = [name for name in symbols if not hasattr(loaded, name)]
        if not absent:
            ok(module_path, ", ".join(symbols))
        else:
            record_fail(module_path, f"missing symbols: {absent}")
    except ImportError as exc:
        record_fail(module_path, str(exc))
    except Exception as exc:
        record_fail(module_path, f"unexpected: {exc}")

# ── 8b. paddleocr2pytorch (OCR engine bundled inside magic-pdf wheel) ──────────
section("8b. paddleocr2pytorch (PyTorch OCR — bundled in magic-pdf wheel)")
# The pipeline backend's OCR engine. It is not a standalone pip package:
# it ships inside the magic-pdf wheel under
#     magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/
# so if it cannot be imported, magic-pdf itself did not install correctly.
try:
    from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR

    ok("PytorchPaddleOCR (paddleocr2pytorch)", "bundled inside magic-pdf wheel — no paddlepaddle pkg needed")
except ImportError as exc:
    record_fail(
        "PytorchPaddleOCR import",
        f"{exc}. "
        "This module is bundled inside magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/. "
        "If missing, magic-pdf itself did not install correctly.",
    )
except Exception as exc:
    record_fail("PytorchPaddleOCR import", f"unexpected: {exc}")

# ── 8c. Deprecated API check ───────────────────────────────────────────────────
section("8c. Deprecated API check (should NOT exist)")

# Module paths of the old API; each one is expected to fail to import.
OBSOLETE = [
    "magic_pdf.pipe.UNIPipe",
    "magic_pdf.rw.DiskReaderWriter",
]

for mod_path in OBSOLETE:
    try:
        importlib.import_module(mod_path)
        record_fail(mod_path, "still importable — code may use old API", critical=False)
    except ImportError:
        ok(f"{mod_path} (correctly absent)")
    except Exception as exc:
        # A module that exists but blows up on import must not take the whole
        # validator down with a raw traceback; report it like every other check.
        record_fail(mod_path, f"unexpected: {exc}", critical=False)

# ── 9. End-to-end pipeline smoke test ─────────────────────────────────────────
section("9. End-to-end pipeline smoke test")
# Import-only smoke test: pulls in the names the request path needs, in one go.
try:
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze  # noqa: F401
    import ultralytics  # noqa: F401
    from magic_pdf.data.dataset import ImageDataset  # noqa: F401
    from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter  # noqa: F401

    ok("Pipeline imports (doc_analyze + ultralytics + ImageDataset + readers)", "all OK")
except ImportError as exc:
    record_fail(
        "Pipeline smoke test",
        f"Full pipeline import chain failed: {exc}. "
        "This means POST /extract will fail on every request.",
    )
except Exception as exc:
    record_fail("Pipeline smoke test", f"unexpected: {exc}")

# ── 10. Config file ────────────────────────────────────────────────────────────
section("10. MinerU config (magic-pdf.json)")
_cfg: dict = {}
if os.path.exists(CONFIG_PATH):
    try:
        # JSON is UTF-8 by specification; do not depend on the container locale.
        with open(CONFIG_PATH, encoding="utf-8") as f:
            _cfg = json.load(f)
        required_keys = ["models-dir", "device-mode"]
        missing_keys = [k for k in required_keys if k not in _cfg]
        if missing_keys:
            record_fail("Config keys", f"missing: {missing_keys}")
        else:
            ok("Config file", CONFIG_PATH)
            ok("device-mode", _cfg.get("device-mode", "?"))
            ok("models-dir", _cfg.get("models-dir", "?"))
            ok("formula-enable", str(_cfg.get("formula-config", {}).get("enable", "?")))
            ok("table-enable", str(_cfg.get("table-config", {}).get("enable", "?")))
    except json.JSONDecodeError as exc:
        record_fail("Config file", f"invalid JSON: {exc}")
    except Exception as exc:
        record_fail("Config file", str(exc))
else:
    record_fail(
        "Config file",
        f"not found at {CONFIG_PATH}. "
        "Run download_models.py or check Docker build log.",
    )

# ── 11. Model directory structure ─────────────────────────────────────────────
section("11. Model directory structure")

# (label, directory that must exist)
model_dir_checks = [
    ("PDF-Extract-Kit-1.0 root", os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0")),
    ("Layout models", os.path.join(EXTRACT_KIT_MODELS, "Layout")),
    ("Layout/YOLO", os.path.join(EXTRACT_KIT_MODELS, "Layout", "YOLO")),
    ("OCR models", os.path.join(EXTRACT_KIT_MODELS, "OCR")),
    ("OCR/paddleocr_torch", os.path.join(EXTRACT_KIT_MODELS, "OCR", "paddleocr_torch")),
    ("Table models (TabRec)", os.path.join(EXTRACT_KIT_MODELS, "TabRec")),
]

for label, path in model_dir_checks:
    if os.path.isdir(path):
        try:
            n = sum(1 for _ in os.scandir(path))
            ok(label, f"{n} entries [{path}]")
        except OSError:
            # Directory exists but cannot be listed; still counts as present.
            ok(label, path)
    else:
        record_fail(label, f"directory not found: {path}")

lr_dir = os.path.join(MODELS_DIR, "layoutreader")
if os.path.isdir(lr_dir):
    ok("layoutreader (optional)", lr_dir)
else:
    record_fail("layoutreader (optional)", "not found — MinerU will use fallback ordering (non-critical)", critical=False)

# ── 11b. Critical model weight files ──────────────────────────────────────────
section("11b. Critical model weight files")
#
# These are the EXACT files MinerU will try to open when processing a document
# on a CPU deployment (default language = ch → forced to ch_lite on CPU).
#
# After Dockerfile Layer 3.5 patch, models_config.yml now references:
#   ch_lite.det = ch_PP-OCRv5_det_infer.pth (patched from v3 — v3 NOT in repo)
#   ch_lite.rec = ch_PP-OCRv5_rec_infer.pth (unchanged — always in repo)
#
# Layout uses doclayout_yolo (from magic-pdf.json layout-config).
# Table (rapid_table) uses slanet-plus.onnx BUNDLED IN THE WHEEL — not here.
# Formula is DISABLED — MFD/MFR files not required.
#
# Any CRITICAL failure here = service boots but crashes on first document.
_ocr_dir = os.path.join(EXTRACT_KIT_MODELS, "OCR", "paddleocr_torch")

CRITICAL_WEIGHT_FILES: list[tuple[str, str, str]] = [
    # (label, relative-to-EXTRACT_KIT_MODELS, reason)
    (
        "OCR det weight (ch_lite, default CPU lang)",
        os.path.join("OCR", "paddleocr_torch", "ch_PP-OCRv5_det_infer.pth"),
        "Patched from ch_PP-OCRv3_det_infer.pth (absent in HF repo). "
        "Missing = all OCR will crash at model load time."
    ),
    (
        "OCR rec weight (ch_lite)",
        os.path.join("OCR", "paddleocr_torch", "ch_PP-OCRv5_rec_infer.pth"),
        "Recognition model for ch_lite. "
        "Missing = OCR loads det but crashes at recognition."
    ),
    (
        "OCR cls weight (angle classifier)",
        os.path.join("OCR", "paddleocr_torch", "ch_ptocr_mobile_v2.0_cls_infer.pth"),
        "Used when use_angle_cls=True. Default is False so non-critical, "
        "but its absence causes crash if angle classification is enabled."
    ),
    (
        "Layout YOLO weight (doclayout_yolo)",
        os.path.join("Layout", "YOLO", "doclayout_yolo_docstructbench_imgsz1280_2501.pt"),
        "Layout detection model. Missing = layout detection crashes on every document."
    ),
    (
        "Layout LayoutLMv3 weight",
        os.path.join("Layout", "LayoutLMv3", "model_final.pth"),
        "Alternative layout model. Required even when doclayout_yolo is primary "
        "because model_configs.yaml always lists it."
    ),
    (
        "Multilingual OCR det (en/latin fallback)",
        os.path.join("OCR", "paddleocr_torch", "Multilingual_PP-OCRv3_det_infer.pth"),
        "Patched det for en and latin languages. Missing = crash if lang=en/latin."
    ),
]

# cls weight is only critical if use_angle_cls=True (default False)
NON_CRITICAL_LABELS = {"OCR cls weight (angle classifier)"}

for label, rel_path, reason in CRITICAL_WEIGHT_FILES:
    full_path = os.path.join(EXTRACT_KIT_MODELS, rel_path)
    is_critical = label not in NON_CRITICAL_LABELS
    if os.path.isfile(full_path):
        size_mb = os.path.getsize(full_path) / (1024 * 1024)
        ok(label, f"{size_mb:.1f} MB [{full_path}]")
    else:
        record_fail(
            label,
            f"FILE NOT FOUND: {full_path}\n"
            f" Reason: {reason}",
            critical=is_critical,
        )

# ── 11c. models_config.yml consistency check ──────────────────────────────────
section("11c. models_config.yml consistency check")
#
# Reads the installed models_config.yml (inside magic_pdf package) and verifies
# that every det/rec file it references for the default CPU language (ch_lite)
# actually exists on disk in the expected location.
#
# This catches future version drift between the magic-pdf package and the HF repo
# BEFORE the service starts, rather than mid-request.
try:
    import magic_pdf
    import yaml as _yaml
    from pathlib import Path as _Path

    _pkg = _Path(magic_pdf.__file__).parent
    _mcfg = _pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml'
    if not _mcfg.exists():
        record_fail("models_config.yml", f"not found at expected path: {_mcfg}")
    else:
        with open(_mcfg, encoding="utf-8") as _f:
            _mc = _yaml.safe_load(_f)
        _ocr_torch = os.path.join(EXTRACT_KIT_MODELS, "OCR", "paddleocr_torch")
        # Check the two languages actually used on this CPU deployment
        _check_langs = ["ch_lite", "ch"]
        _mc_ok = True

        # safe_load() returns None for an empty file and may return a non-mapping;
        # say so explicitly instead of failing with an opaque AttributeError.
        _lang_table = _mc.get("lang") if isinstance(_mc, dict) else None
        if not isinstance(_lang_table, dict):
            record_fail(
                "models_config.yml",
                f"no 'lang' mapping found in {_mcfg} — file is empty or its schema changed; "
                "det/rec weight files could not be verified",
                critical=False,
            )
            _mc_ok = False
            _check_langs = []

        for _lang in _check_langs:
            _entry = _lang_table.get(_lang)
            if not isinstance(_entry, dict):
                # Without this, a renamed/removed language would pass silently
                # and the summary line below would claim everything is consistent.
                record_fail(
                    f"models_config[{_lang}]",
                    "language entry not found in models_config.yml — "
                    "its det/rec weight files could not be verified",
                    critical=False,
                )
                _mc_ok = False
                continue
            for _field in ("det", "rec"):
                _fname = _entry.get(_field)
                if not _fname:
                    continue
                _fpath = os.path.join(_ocr_torch, _fname)
                if os.path.isfile(_fpath):
                    ok(f"models_config[{_lang}].{_field}", _fname)
                else:
                    record_fail(
                        f"models_config[{_lang}].{_field}",
                        f"Config references '{_fname}' but file not found at:\n"
                        f" {_fpath}\n"
                        f" Dockerfile Layer 3.5 patch may not have run, "
                        f"or HF repo changed its file structure again.",
                        critical=True,
                    )
                    _mc_ok = False
        if _mc_ok:
            ok("models_config.yml consistency", "all referenced det/rec files exist on disk")
except Exception as _exc:
    # Best-effort check: import/parse problems are reported as warnings only.
    record_fail("models_config.yml consistency check", f"unexpected error: {_exc}", critical=False)

# ── 11d. Bundled wheel resources ──────────────────────────────────────────────
section("11d. Bundled wheel resources (inside magic_pdf package)")
#
# These files are shipped inside the magic-pdf wheel itself.
# They do NOT come from the HF download. Their absence means the wheel
# installed incorrectly or was corrupted.
try:
    import magic_pdf as _mp
    from pathlib import Path as _P

    _pkg_root = _P(_mp.__file__).parent
    # (label, path inside the installed magic_pdf package)
    _bundled = [
        ("slanet-plus.onnx (table model)", _pkg_root / "resources" / "slanet_plus" / "slanet-plus.onnx"),
        ("fasttext langdetect model", _pkg_root / "resources" / "fasttext-langdetect" / "lid.176.ftz"),
        ("YOLO langdetect model", _pkg_root / "resources" / "yolov11-langdetect" / "yolo_v11_ft.pt"),
        ("model_configs.yaml (weight path map)", _pkg_root / "resources" / "model_config" / "model_configs.yaml"),
    ]
    for _lbl, _p in _bundled:
        if _p.exists():
            _sz = _p.stat().st_size / (1024 * 1024)
            ok(_lbl, f"{_sz:.2f} MB")
        else:
            record_fail(_lbl, f"expected inside wheel at {_p} — magic-pdf install may be corrupted")
except Exception as _exc:
    record_fail("Bundled wheel resources check", f"unexpected: {_exc}", critical=False)

# ── 12. Temp storage ───────────────────────────────────────────────────────────
section("12. Temp storage")
try:
    # TemporaryDirectory removes the directory even when the write fails,
    # so a failed probe does not leave debris behind.
    with tempfile.TemporaryDirectory(prefix="mineru_validate_") as td:
        test_file = os.path.join(td, "write_test.bin")
        with open(test_file, "wb") as f:
            f.write(b"x" * 4096)
        written = os.path.getsize(test_file)
        # Explicit check rather than `assert`, which is stripped under `python -O`.
        if written != 4096:
            raise OSError(f"short write: expected 4096 bytes, found {written}")
    ok("Temp write + delete", tempfile.gettempdir())
except Exception as exc:
    record_fail("Temp storage", str(exc))

# ── 13. System memory (cgroups) ────────────────────────────────────────────────
section("13. System memory (cgroups)")
mem_source = "unknown"
total_mb = used_mb = 0

# Source 1: cgroups v2. A limit of "max" means "unlimited", so fall through.
# OSError (not just FileNotFoundError) is caught so that an unreadable cgroup
# file cannot crash the validator.
try:
    with open("/sys/fs/cgroup/memory.max") as f:
        raw = f.read().strip()
    if raw != "max":
        limit = int(raw)
        with open("/sys/fs/cgroup/memory.current") as f:
            used_bytes = int(f.read().strip())
        total_mb = limit // (1024 * 1024)
        used_mb = used_bytes // (1024 * 1024)
        mem_source = "cgroups v2"
except (OSError, ValueError):
    pass

# Source 2: cgroups v1. Limits of 128 GiB or more are treated as "no limit set".
if total_mb == 0:
    try:
        with open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f:
            limit = int(f.read().strip())
        with open("/sys/fs/cgroup/memory/memory.usage_in_bytes") as f:
            used_bytes = int(f.read().strip())
        if limit < 128 * 1024 * 1024 * 1024:
            total_mb = limit // (1024 * 1024)
            used_mb = used_bytes // (1024 * 1024)
            mem_source = "cgroups v1"
    except (OSError, ValueError):
        pass

# Source 3: /proc/meminfo (values are in kB).
if total_mb == 0:
    try:
        info: dict[str, int] = {}
        with open("/proc/meminfo") as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    info[parts[0].rstrip(":")] = int(parts[1])
        if info.get("MemTotal", 0):
            total_mb = info.get("MemTotal", 0) // 1024
            used_mb = (info.get("MemTotal", 0) - info.get("MemAvailable", 0)) // 1024
            mem_source = "/proc/meminfo (may show host RAM)"
    except (OSError, ValueError):
        pass

ok("Memory source", mem_source)
ok("Total memory", f"{total_mb} MB")
ok("Used memory", f"{used_mb} MB")
ok("Free memory", f"{total_mb - used_mb} MB")

if total_mb == 0:
    # None of the three sources produced a usable total; make that visible
    # instead of silently reporting 0 MB as a success.
    record_fail(
        "Memory total",
        "could not be determined from cgroups or /proc/meminfo",
        critical=False,
    )
elif total_mb > 32 * 1024:
    record_fail(
        "Memory total",
        f"{total_mb} MB seems too large — cgroups may not be available; "
        "/proc/meminfo showing host RAM. Memory guard in main.py will be conservative.",
        critical=False,
    )

# ── 14. /proc/meminfo sanity ───────────────────────────────────────────────────
section("14. /proc/meminfo (reference)")
try:
    with open("/proc/meminfo") as f:
        lines = f.readlines()[:5]
    for line in lines:
        parts = line.split()
        if len(parts) >= 2:
            kb = int(parts[1])
            ok(parts[0].rstrip(":"), f"{kb // 1024} MB")
except Exception as exc:
    record_fail("/proc/meminfo", str(exc), critical=False)

# ═══════════════════════════════════════════════════════════════════════════════
# Summary
# ═══════════════════════════════════════════════════════════════════════════════
print("\n" + "═" * 60, flush=True)
print(" Validation Summary", flush=True)
print("═" * 60, flush=True)

if warnings:
    print(f"\n ⚠ {len(warnings)} warning(s):", flush=True)
    for label, detail in warnings:
        print(f" • {label}: {detail}", flush=True)

if failures:
    print(f"\n ✗ {len(failures)} CRITICAL failure(s):", flush=True)
    for label, detail in failures:
        print(f" • {label}: {detail}", flush=True)
    print("\n Service will NOT start until these are resolved.", flush=True)
    print(" Check Dockerfile pip layers and Docker build log.", flush=True)
    print("═" * 60 + "\n", flush=True)
    # --soft downgrades critical failures to log-only: fall through to exit 0.
    if not SOFT_MODE:
        sys.exit(1)
else:
    print("\n ✓ All critical checks passed", flush=True)
    if warnings:
        print(f" ⚠ {len(warnings)} non-critical warning(s) — see above", flush=True)
    print("\n Service is ready to start.", flush=True)
    print("═" * 60 + "\n", flush=True)

sys.exit(0)