#!/usr/bin/env python3
"""
Pre-flight validation script for MinerU OCR Service.

Run by entrypoint.sh BEFORE uvicorn starts.
Exits 0 if all checks pass.
Exits 1 if any CRITICAL check fails — this crashes the container loudly so
Hugging Face logs show an actionable error instead of a silent crash or a
healthy-looking service that fails on every request.

Usage:
    python validate.py          # run all checks, exit 0/1
    python validate.py --soft   # run all checks, always exit 0 (log only)
"""

import importlib
import json
import os
import shutil
import sys
import tempfile
import time
import traceback
from typing import Optional

SOFT_MODE = "--soft" in sys.argv  # never exit 1, just print

MODELS_DIR = "/app/models"
EXTRACT_KIT_MODELS = os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0", "models")
LAYOUT_MARKER = os.path.join(EXTRACT_KIT_MODELS, "Layout")  # canary directory
CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")

BYTES_PER_MB = 1024 * 1024
# cgroups v1 limits at or above this value are ignored and the next memory
# source is tried (presumably v1 reports a huge number when unlimited).
CGROUP_V1_LIMIT_CEILING = 128 * 1024 * 1024 * 1024
# Size of the probe file written by the temp-storage check.
TEMP_PROBE_BYTES = 4096


# ── helpers ────────────────────────────────────────────────────────────────────

def ok(label: str, detail: str = "") -> None:
    """Print a passing check line, with *detail* in parentheses when given."""
    suffix = f" ({detail})" if detail else ""
    print(f" ✓ {label}{suffix}", flush=True)


def fail(label: str, detail: str, critical: bool = True) -> None:
    """Print a failing check line tagged CRITICAL or WARNING (print only)."""
    tag = "CRITICAL" if critical else "WARNING"
    print(f" ✗ [{tag}] {label}: {detail}", flush=True)


def section(title: str) -> None:
    """Print a section header framed by two 60-character rules."""
    print(f"\n{'─' * 60}", flush=True)
    print(f" {title}", flush=True)
    print(f"{'─' * 60}", flush=True)


# ── check registry ─────────────────────────────────────────────────────────────

failures: list[tuple[str, str]] = []  # (label, detail)
warnings: list[tuple[str, str]] = []


def record_fail(label: str, detail: str, critical: bool = True) -> None:
    """Print the failure and remember it for the final summary.

    Critical entries go to ``failures`` (these decide the exit code);
    non-critical ones go to ``warnings``.
    """
    fail(label, detail, critical)
    if critical:
        failures.append((label, detail))
    else:
        warnings.append((label, detail))


# ── 1. Python version ──────────────────────────────────────────────────────────

def check_python_runtime() -> None:
    """Require Python >= 3.10 (critical)."""
    section("1. Python runtime")
    pv = sys.version_info
    if pv >= (3, 10):
        ok("Python version", f"{pv.major}.{pv.minor}.{pv.micro}")
    else:
        record_fail(
            "Python version",
            f"{pv.major}.{pv.minor} detected — magic-pdf requires >= 3.10",
        )


# ── 2. cv2 ─────────────────────────────────────────────────────────────────────

def detect_gui_backends(build_info: str) -> list[str]:
    """Return the GTK/Qt rows of an OpenCV build report that are enabled.

    The report is scanned row by row (``<name>: <value>``). A row counts
    only when its name's first word is GTK or QT (ignoring ``+`` and a
    trailing version digit) and its value starts with ``YES``. A plain
    substring test is not enough: rows such as ``GTK+: NO`` contain the
    backend name even though the backend is disabled.

    Returns the matching row names in order of appearance; an empty list
    means no GTK/Qt backend was reported as enabled.
    """
    enabled: list[str] = []
    for line in build_info.splitlines():
        name, colon, value = line.partition(":")
        name = name.strip()
        if not colon or not name:
            continue
        first_word = name.split()[0].upper().rstrip("+0123456789")
        if first_word not in ("GTK", "QT"):
            continue
        if value.strip().upper().startswith("YES"):
            enabled.append(name)
    return enabled


def check_opencv() -> None:
    """Import cv2 (critical) and warn when a GTK/Qt GUI backend is built in."""
    section("2. OpenCV (cv2)")
    try:
        import cv2

        ok("cv2 import", f"version {cv2.__version__}")
        # Confirm headless (no X11 dep) by checking build info
        if detect_gui_backends(cv2.getBuildInformation()):
            record_fail(
                "cv2 build",
                "GUI backend detected — use opencv-python-headless",
                critical=False,
            )
        else:
            ok("cv2 headless", "no GUI backend detected")
    except ImportError as exc:
        record_fail(
            "cv2 import",
            f"{exc}. "
            "Add 'opencv-python-headless>=4.8.0' to Dockerfile pip layer 1 "
            "BEFORE magic-pdf install.",
        )
    except Exception as exc:
        record_fail("cv2 import", f"unexpected error: {exc}")


# ── 3. magic_pdf core ──────────────────────────────────────────────────────────

REQUIRED_IMPORTS = [
    ("magic_pdf.data.dataset", ["PymuDocDataset", "ImageDataset"]),
    ("magic_pdf.data.data_reader_writer", ["FileBasedDataReader", "FileBasedDataWriter"]),
    ("magic_pdf.model.doc_analyze_by_custom_model", ["doc_analyze"]),
    ("magic_pdf.config.enums", ["SupportedPdfParseMethod"]),
]

OBSOLETE = [
    "magic_pdf.pipe.UNIPipe",
    "magic_pdf.rw.DiskReaderWriter",
]


def check_magic_pdf_imports() -> None:
    """Import every required module and verify its expected symbols (critical)."""
    section("3. magic_pdf core imports")
    for module_path, symbols in REQUIRED_IMPORTS:
        try:
            mod = importlib.import_module(module_path)
        except ImportError as exc:
            record_fail(module_path, str(exc))
            continue
        except Exception as exc:
            record_fail(module_path, f"unexpected: {exc}")
            continue
        missing = [s for s in symbols if not hasattr(mod, s)]
        if missing:
            record_fail(module_path, f"missing symbols: {missing}")
        else:
            ok(module_path, ", ".join(symbols))


def check_deprecated_api() -> None:
    """Confirm removed/deprecated imports are truly gone (warnings only)."""
    section("3b. Deprecated API check (should NOT exist)")
    for mod_path in OBSOLETE:
        try:
            importlib.import_module(mod_path)
        except ImportError:
            ok(f"{mod_path} (correctly absent)")
        except Exception as exc:
            # The module exists but blew up while importing; report it
            # instead of letting the validator die with a raw traceback.
            record_fail(
                mod_path,
                f"import raised {type(exc).__name__}: {exc}",
                critical=False,
            )
        else:
            record_fail(mod_path, "still importable — code may use old API", critical=False)


# ── 4. Config file ─────────────────────────────────────────────────────────────

def check_config() -> Optional[dict]:
    """Validate magic-pdf.json and return its parsed content.

    Returns the parsed JSON object when the file could be read and its
    root is an object (even if required keys are missing, so the model
    check can still inspect ``models-dir``); returns ``None`` otherwise.
    Every problem found is recorded as a critical failure.
    """
    section("4. MinerU config (magic-pdf.json)")
    if not os.path.exists(CONFIG_PATH):
        record_fail(
            "Config file",
            f"not found at {CONFIG_PATH}. "
            "Run download_models.py or check Docker build log.",
        )
        return None

    try:
        with open(CONFIG_PATH, encoding="utf-8") as f:
            cfg = json.load(f)
    except json.JSONDecodeError as exc:
        record_fail("Config file", f"invalid JSON: {exc}")
        return None
    except Exception as exc:
        record_fail("Config file", str(exc))
        return None

    if not isinstance(cfg, dict):
        record_fail("Config file", f"expected a JSON object, got {type(cfg).__name__}")
        return None

    try:
        required_keys = ["models-dir", "device-mode"]
        missing_keys = [k for k in required_keys if k not in cfg]
        if missing_keys:
            record_fail("Config keys", f"missing: {missing_keys}")
        else:
            ok("Config file", CONFIG_PATH)
            ok("device-mode", cfg.get("device-mode", "?"))
            ok("models-dir", cfg.get("models-dir", "?"))
            ok("formula-enable", str(cfg.get("formula-config", {}).get("enable", "?")))
            ok("table-enable", str(cfg.get("table-config", {}).get("enable", "?")))
    except Exception as exc:
        # e.g. "formula-config" present but not an object
        record_fail("Config file", str(exc))
    return cfg


# ── 5. Model files ─────────────────────────────────────────────────────────────

MODEL_CHECKS = [
    ("PDF-Extract-Kit-1.0 root", os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0")),
    ("Layout models (canary)", LAYOUT_MARKER),
    ("MFD models", os.path.join(EXTRACT_KIT_MODELS, "MFD")),
    ("Table models", os.path.join(EXTRACT_KIT_MODELS, "TabRec")),
]


def check_model_files(cfg: Optional[dict]) -> None:
    """Check the model directories and the config's ``models-dir``.

    *cfg* is the parsed config from :func:`check_config`, or ``None`` when
    it could not be loaded (that failure has already been reported, so the
    ``models-dir`` check is skipped).
    """
    section("5. Model files")
    for label, path in MODEL_CHECKS:
        if not os.path.isdir(path):
            record_fail(label, f"directory not found: {path}")
            continue
        # Count entries for a sanity check
        try:
            n = sum(1 for _ in os.scandir(path))
            ok(label, f"{path} ({n} entries)")
        except OSError:
            ok(label, path)

    # layoutreader — optional
    lr_dir = os.path.join(MODELS_DIR, "layoutreader")
    if os.path.isdir(lr_dir):
        ok("layoutreader (optional)", lr_dir)
    else:
        record_fail(
            "layoutreader (optional)",
            "not found — MinerU will use fallback ordering (non-critical)",
            critical=False,
        )

    # Validate config models-dir points to existing path
    if cfg is None:
        return  # already reported above
    cfg_models = cfg.get("models-dir", "")
    if not cfg_models:
        return
    if not isinstance(cfg_models, str):
        record_fail(
            "Config models-dir",
            f"must be a string path, got {type(cfg_models).__name__}",
        )
    elif os.path.isdir(cfg_models):
        ok("Config models-dir exists", cfg_models)
    else:
        record_fail("Config models-dir", f"points to missing path: {cfg_models}")


# ── 6. Temp storage ────────────────────────────────────────────────────────────

def check_temp_storage() -> None:
    """Write, size-check and delete a small file in a fresh temp directory."""
    section("6. Temp storage")
    try:
        # The context manager removes the directory even when the probe
        # fails, so a failed check leaves nothing behind.
        with tempfile.TemporaryDirectory(prefix="mineru_validate_") as td:
            test_file = os.path.join(td, "write_test.bin")
            with open(test_file, "wb") as f:
                f.write(b"x" * TEMP_PROBE_BYTES)
            written = os.path.getsize(test_file)
            if written != TEMP_PROBE_BYTES:
                raise OSError(
                    f"wrote {TEMP_PROBE_BYTES} bytes but file size is {written}"
                )
        ok("Temp write + delete", tempfile.gettempdir())
    except Exception as exc:
        record_fail("Temp storage", str(exc))


# ── 7. System memory ───────────────────────────────────────────────────────────

def _read_int_file(path: str) -> int:
    """Return the integer stored in the text file at *path*."""
    with open(path) as f:
        return int(f.read().strip())


def parse_meminfo(lines) -> dict[str, int]:
    """Parse ``/proc/meminfo``-style lines into ``{field: number}``.

    Each line with at least two whitespace-separated fields contributes
    one entry: the first field without its trailing ``:`` mapped to the
    second field as an int. Shorter lines are skipped.

    Raises:
        ValueError: if a second field is not an integer.
    """
    info: dict[str, int] = {}
    for line in lines:
        parts = line.split()
        if len(parts) >= 2:
            info[parts[0].rstrip(":")] = int(parts[1])
    return info


def detect_memory() -> tuple[int, int, str]:
    """Return ``(total_mb, used_mb, source)`` from the first usable source.

    Sources are tried in order: cgroups v2, cgroups v1, ``/proc/meminfo``.
    A source is used only when all of its files could be read, so the
    numbers and the source label always belong together. When nothing is
    readable the result is ``(0, 0, "unknown")``.
    """
    # cgroups v2 — "max" means no limit is set, so fall through.
    try:
        with open("/sys/fs/cgroup/memory.max") as f:
            raw = f.read().strip()
        if raw != "max":
            total_mb = int(raw) // BYTES_PER_MB
            used_mb = _read_int_file("/sys/fs/cgroup/memory.current") // BYTES_PER_MB
            if total_mb > 0:
                return total_mb, used_mb, "cgroups v2"
    except (OSError, ValueError):
        pass

    # cgroups v1
    try:
        limit = _read_int_file("/sys/fs/cgroup/memory/memory.limit_in_bytes")
        used_bytes = _read_int_file("/sys/fs/cgroup/memory/memory.usage_in_bytes")
        if limit < CGROUP_V1_LIMIT_CEILING and limit // BYTES_PER_MB > 0:
            return limit // BYTES_PER_MB, used_bytes // BYTES_PER_MB, "cgroups v1"
    except (OSError, ValueError):
        pass

    # /proc/meminfo — values are taken as kB (divided by 1024 to get MB)
    try:
        with open("/proc/meminfo") as f:
            info = parse_meminfo(f)
        total_kb = info.get("MemTotal", 0)
        used_kb = total_kb - info.get("MemAvailable", 0)
        return total_kb // 1024, used_kb // 1024, "/proc/meminfo (may show host RAM)"
    except (OSError, ValueError):
        pass

    return 0, 0, "unknown"


def check_system_memory() -> None:
    """Report memory figures and warn when the total looks like host RAM."""
    section("7. System memory (cgroups)")
    total_mb, used_mb, mem_source = detect_memory()
    ok("Memory source", mem_source)
    ok("Total memory", f"{total_mb} MB")
    ok("Used memory", f"{used_mb} MB")
    ok("Free memory", f"{total_mb - used_mb} MB")
    if total_mb > 32 * 1024:
        record_fail(
            "Memory total",
            f"{total_mb} MB seems too large for a container — "
            "cgroups may not be available; /proc/meminfo is showing host RAM. "
            "Memory guard in main.py will be conservative.",
            critical=False,
        )


# ── 8. /proc/meminfo sanity ────────────────────────────────────────────────────

def check_meminfo_reference() -> None:
    """Print the first five /proc/meminfo entries in MB (warning on error)."""
    section("8. /proc/meminfo (for reference)")
    try:
        with open("/proc/meminfo") as f:
            # zip() with range(5) stops after five lines without reading
            # the rest of the file.
            for _, line in zip(range(5), f):
                parts = line.split()
                if len(parts) >= 2:
                    kb = int(parts[1])
                    ok(parts[0].rstrip(":"), f"{kb // 1024} MB")
    except Exception as exc:
        record_fail("/proc/meminfo", str(exc), critical=False)


# ═══════════════════════════════════════════════════════════════════════════════
# Summary
# ═══════════════════════════════════════════════════════════════════════════════

def report_summary(soft_mode: bool) -> int:
    """Print the collected warnings/failures and return the exit code.

    Returns 1 when there is at least one critical failure and *soft_mode*
    is false; returns 0 in every other case.
    """
    print("\n" + "═" * 60, flush=True)
    print(" Validation Summary", flush=True)
    print("═" * 60, flush=True)

    if warnings:
        print(f"\n ⚠ {len(warnings)} warning(s):", flush=True)
        for label, detail in warnings:
            print(f" • {label}: {detail}", flush=True)

    if failures:
        print(f"\n ✗ {len(failures)} CRITICAL failure(s):", flush=True)
        for label, detail in failures:
            print(f" • {label}: {detail}", flush=True)
        print("\n Service will NOT start until these are resolved.", flush=True)
        print(" Check Dockerfile pip layers and Docker build log.", flush=True)
        print("═" * 60 + "\n", flush=True)
        return 0 if soft_mode else 1

    print("\n ✓ All critical checks passed", flush=True)
    if warnings:
        print(f" ⚠ {len(warnings)} non-critical warning(s) — see above", flush=True)
    print("\n Service is ready to start.", flush=True)
    print("═" * 60 + "\n", flush=True)
    return 0


def main(soft_mode: bool = SOFT_MODE) -> int:
    """Run every check in order and return the process exit code.

    Args:
        soft_mode: when true, critical failures are still printed but the
            returned code is always 0. Defaults to whether ``--soft`` was
            on the command line.
    """
    # Start from a clean registry so repeated calls do not accumulate.
    failures.clear()
    warnings.clear()

    print("\n" + "═" * 60, flush=True)
    print(" MinerU OCR Service — Pre-flight Validation", flush=True)
    print("═" * 60, flush=True)

    check_python_runtime()
    check_opencv()
    check_magic_pdf_imports()
    check_deprecated_api()
    cfg = check_config()
    check_model_files(cfg)
    check_temp_storage()
    check_system_memory()
    check_meminfo_reference()

    return report_summary(soft_mode)


if __name__ == "__main__":
    sys.exit(main())