node-2 / validate.py
sfdghsdvxfbgn's picture
Upload 7 files
d42d358 verified
Raw
History Blame Contribute Delete
13 kB
#!/usr/bin/env python3
"""
Pre-flight validation script for MinerU OCR Service.
Run by entrypoint.sh BEFORE uvicorn starts.
Exits 0 if all checks pass.
Exits 1 if any CRITICAL check fails — this crashes the container loudly
so Hugging Face logs show an actionable error instead of a silent crash
or a healthy-looking service that fails on every request.
Usage:
python validate.py # run all checks, exit 0/1
python validate.py --soft # run all checks, always exit 0 (log only)
"""
import importlib
import json
import os
import shutil
import sys
import tempfile
import time
import traceback
# With --soft on the command line the script only logs and never exits 1.
SOFT_MODE = any(arg == "--soft" for arg in sys.argv)
# Base directory under which the model checks look.
MODELS_DIR = "/app/models"
# Model tree of PDF-Extract-Kit inside MODELS_DIR.
EXTRACT_KIT_MODELS = os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0", "models")
# Canary directory used by the model-file check.
LAYOUT_MARKER = os.path.join(EXTRACT_KIT_MODELS, "Layout")
# MinerU configuration file in the current user's home directory.
CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")
# ── helpers ────────────────────────────────────────────────────────────────────
def ok(label: str, detail: str = "") -> None:
    """Print one line reporting that a check passed.

    Args:
        label: Name of the check.
        detail: Optional extra text, appended in parentheses when non-empty.
    """
    suffix = f" ({detail})" if detail else ""
    # The marker was a mis-decoded UTF-8 sequence; emit the intended check mark.
    print(f" ✓ {label}{suffix}", flush=True)
def fail(label: str, detail: str, critical: bool = True) -> None:
    """Print one line reporting that a check failed.

    Args:
        label: Name of the check.
        detail: Explanation of what went wrong.
        critical: When true the line is tagged CRITICAL, otherwise WARNING.
    """
    tag = "CRITICAL" if critical else "WARNING"
    # The marker was a mis-decoded UTF-8 sequence; emit the intended cross mark.
    print(f" ✗ [{tag}] {label}: {detail}", flush=True)
def section(title: str) -> None:
    """Print *title* framed above and below by a 60-character horizontal rule."""
    rule = "─" * 60
    # A blank line precedes the top rule; every line is flushed immediately.
    for text in (f"\n{rule}", f" {title}", rule):
        print(text, flush=True)
# ── check registry ─────────────────────────────────────────────────────────────
# Outcomes collected while the checks run; each entry is (label, detail).
# The annotations are quoted so they are not evaluated at import time:
# subscripting the built-in list/tuple raises TypeError before Python 3.9,
# which would crash the script before the Python-version check can report.
failures: "list[tuple[str, str]]" = []  # critical problems
warnings: "list[tuple[str, str]]" = []  # non-critical problems
def record_fail(label: str, detail: str, critical: bool = True) -> None:
    """Print a failed check and store it for the final summary.

    Critical entries are appended to ``failures``; all others go to
    ``warnings``.
    """
    fail(label, detail, critical)
    bucket = failures if critical else warnings
    bucket.append((label, detail))
# ═══════════════════════════════════════════════════════════════════════════════
# Opening banner framed by two 60-character double rules.
banner_rule = "═" * 60
print("\n" + banner_rule, flush=True)
# The dash in the title was a mis-decoded UTF-8 sequence; emit the em dash.
print(" MinerU OCR Service — Pre-flight Validation", flush=True)
print(banner_rule, flush=True)
# ── 1. Python version ──────────────────────────────────────────────────────────
section("1. Python runtime")
pv = sys.version_info
# Anything older than Python 3.10 is recorded as a critical failure.
if pv >= (3, 10):
    ok("Python version", f"{pv.major}.{pv.minor}.{pv.micro}")
else:
    record_fail(
        "Python version",
        # The dash in this message was mis-decoded UTF-8; the em dash is restored.
        f"{pv.major}.{pv.minor} detected — magic-pdf requires >= 3.10",
    )
# ── 2. cv2 ─────────────────────────────────────────────────────────────────────
def _enabled_gui_backends(build_info: str) -> list:
    """Return the GTK/Qt entries that an OpenCV build report marks as enabled.

    The report is scanned for ``key: value`` lines. An entry counts as enabled
    only when its key starts with GTK or QT (case-insensitive) and its value
    starts with YES. A plain substring test is not enough: the report
    typically lists disabled backends too (for example ``GTK+: NO``), which
    would be reported as a GUI build.

    Args:
        build_info: Text returned by ``cv2.getBuildInformation()``.

    Returns:
        The keys of the enabled GTK/Qt entries, in report order.
    """
    enabled = []
    for raw_line in build_info.splitlines():
        key, sep, value = raw_line.partition(":")
        if not sep:
            continue
        key = key.strip()
        is_gui_key = key.upper().startswith(("GTK", "QT"))
        if is_gui_key and value.strip().upper().startswith("YES"):
            enabled.append(key)
    return enabled


section("2. OpenCV (cv2)")
try:
    import cv2

    ok("cv2 import", f"version {cv2.__version__}")
    # Confirm headless (no X11 dep) by checking build info
    build = cv2.getBuildInformation()
    if _enabled_gui_backends(build):
        record_fail(
            "cv2 build",
            "GUI backend detected — use opencv-python-headless",
            critical=False,
        )
    else:
        ok("cv2 headless", "no GUI backend detected")
except ImportError as exc:
    record_fail(
        "cv2 import",
        f"{exc}. "
        "Add 'opencv-python-headless>=4.8.0' to Dockerfile pip layer 1 "
        "BEFORE magic-pdf install.",
    )
except Exception as exc:
    # Any other failure while importing or querying cv2 is also critical.
    record_fail("cv2 import", f"unexpected error: {exc}")
# ── 3. magic_pdf core ──────────────────────────────────────────────────────────
section("3. magic_pdf core imports")
# Each entry pairs a dotted module path with the names that module must expose.
REQUIRED_IMPORTS = [
    ("magic_pdf.data.dataset", ["PymuDocDataset", "ImageDataset"]),
    ("magic_pdf.data.data_reader_writer", ["FileBasedDataReader", "FileBasedDataWriter"]),
    ("magic_pdf.model.doc_analyze_by_custom_model", ["doc_analyze"]),
    ("magic_pdf.config.enums", ["SupportedPdfParseMethod"]),
]
for dotted_name, wanted in REQUIRED_IMPORTS:
    try:
        loaded = importlib.import_module(dotted_name)
        absent = list(filter(lambda name: not hasattr(loaded, name), wanted))
        if not absent:
            ok(dotted_name, ", ".join(wanted))
        else:
            record_fail(dotted_name, f"missing symbols: {absent}")
    except ImportError as exc:
        record_fail(dotted_name, str(exc))
    except Exception as exc:
        # Anything other than ImportError is reported with an "unexpected" prefix.
        record_fail(dotted_name, f"unexpected: {exc}")
# Confirm removed/deprecated imports are truly gone
section("3b. Deprecated API check (should NOT exist)")
# Modules that are expected to be absent; finding one is only a warning.
OBSOLETE = [
    "magic_pdf.pipe.UNIPipe",
    "magic_pdf.rw.DiskReaderWriter",
]
for mod_path in OBSOLETE:
    try:
        importlib.import_module(mod_path)
    except ImportError:
        ok(f"{mod_path} (correctly absent)")
    except Exception as exc:
        # A module that raises something other than ImportError while loading
        # must not crash the validator; report it as a warning like the
        # importable case.
        record_fail(
            mod_path,
            f"import raised unexpected error: {exc}",
            critical=False,
        )
    else:
        record_fail(
            mod_path,
            "still importable — code may use old API",
            critical=False,
        )
# ── 4. Config file ─────────────────────────────────────────────────────────────
section("4. MinerU config (magic-pdf.json)")
if not os.path.exists(CONFIG_PATH):
    record_fail(
        "Config file",
        f"not found at {CONFIG_PATH}. Run download_models.py or check Docker build log.",
    )
else:
    try:
        with open(CONFIG_PATH) as config_file:
            cfg = json.load(config_file)
        # Both keys must be present; the values themselves are only echoed.
        absent = [key for key in ("models-dir", "device-mode") if key not in cfg]
        if not absent:
            ok("Config file", CONFIG_PATH)
            for key in ("device-mode", "models-dir"):
                ok(key, cfg.get(key, "?"))
            # Optional sub-sections: print their "enable" flag, or "?" if unset.
            for shown_as, block_key in (
                ("formula-enable", "formula-config"),
                ("table-enable", "table-config"),
            ):
                ok(shown_as, str(cfg.get(block_key, {}).get("enable", "?")))
        else:
            record_fail("Config keys", f"missing: {absent}")
    except json.JSONDecodeError as exc:
        record_fail("Config file", f"invalid JSON: {exc}")
    except Exception as exc:
        record_fail("Config file", str(exc))
# ── 5. Model files ─────────────────────────────────────────────────────────────
section("5. Model files")
# (label, directory) pairs; every directory must exist and contain something.
model_checks = [
    ("PDF-Extract-Kit-1.0 root", os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0")),
    ("Layout models (canary)", LAYOUT_MARKER),
    ("MFD models", os.path.join(EXTRACT_KIT_MODELS, "MFD")),
    ("Table models", os.path.join(EXTRACT_KIT_MODELS, "TabRec")),
]
for label, path in model_checks:
    if not os.path.isdir(path):
        record_fail(label, f"directory not found: {path}")
        continue
    try:
        # The context manager closes the scandir iterator deterministically.
        with os.scandir(path) as entries:
            n = sum(1 for _ in entries)
    except OSError:
        # The directory exists but cannot be listed; keep the lenient pass.
        ok(label, path)
        continue
    if n == 0:
        # An empty directory cannot hold model files, so the count must
        # actually be acted on rather than just printed.
        record_fail(label, f"directory exists but is empty: {path}")
    else:
        ok(label, f"{path} ({n} entries)")
# layoutreader — optional: a missing directory is only recorded as a warning.
lr_dir = os.path.join(MODELS_DIR, "layoutreader")
if os.path.isdir(lr_dir):
    ok("layoutreader (optional)", lr_dir)
else:
    record_fail(
        "layoutreader (optional)",
        # The dash in this message was mis-decoded UTF-8; the em dash is restored.
        "not found — MinerU will use fallback ordering (non-critical)",
        critical=False,
    )
# Validate config models-dir points to existing path
try:
    with open(CONFIG_PATH) as config_file:
        cfg = json.load(config_file)
    cfg_models = cfg.get("models-dir", "")
    # An empty or missing value is skipped silently; only a non-empty path
    # is checked against the filesystem.
    if cfg_models:
        if os.path.isdir(cfg_models):
            ok("Config models-dir exists", cfg_models)
        else:
            record_fail("Config models-dir", f"points to missing path: {cfg_models}")
except Exception:
    pass  # already reported above
# ── 6. Temp storage ────────────────────────────────────────────────────────────
section("6. Temp storage")
try:
    # TemporaryDirectory removes the scratch directory even when the write or
    # the size check fails, so a failed run no longer leaves files behind.
    with tempfile.TemporaryDirectory(prefix="mineru_validate_") as td:
        test_file = os.path.join(td, "write_test.bin")
        payload = b"x" * 4096
        with open(test_file, "wb") as f:
            f.write(payload)
        written = os.path.getsize(test_file)
        # Explicit check instead of ``assert``, which is stripped under -O.
        if written != len(payload):
            raise OSError(
                f"short write: expected {len(payload)} bytes, found {written}"
            )
    ok("Temp write + delete", tempfile.gettempdir())
except Exception as exc:
    record_fail("Temp storage", str(exc))
# ── 7. System memory ───────────────────────────────────────────────────────────
section("7. System memory (cgroups)")
MIB = 1024 * 1024
mem_source = "unknown"
total_mb = used_mb = 0

# First choice: cgroups v2 files. A limit of "max" leaves total_mb at 0 so
# that the fallbacks below are tried.
# OSError (not just FileNotFoundError) is caught so that an unreadable file,
# e.g. PermissionError, cannot crash this informational section.
try:
    with open("/sys/fs/cgroup/memory.max") as f:
        raw = f.read().strip()
    if raw != "max":
        total_mb = int(raw) // MIB
    with open("/sys/fs/cgroup/memory.current") as f:
        used_mb = int(f.read().strip()) // MIB
    mem_source = "cgroups v2"
except (OSError, ValueError):
    pass

# Second choice: cgroups v1 files. Only limits below 128 GiB are accepted.
if total_mb == 0:
    try:
        with open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f:
            limit = int(f.read().strip())
        with open("/sys/fs/cgroup/memory/memory.usage_in_bytes") as f:
            used_bytes = int(f.read().strip())
        if limit < 128 * 1024 * MIB:
            total_mb = limit // MIB
            used_mb = used_bytes // MIB
            mem_source = "cgroups v1"
    except (OSError, ValueError):
        pass

# Last resort: /proc/meminfo, whose values are in kB.
if total_mb == 0:
    try:
        # Quoted annotation: not evaluated at runtime, so it cannot raise on
        # interpreters older than 3.9.
        info: "dict[str, int]" = {}
        with open("/proc/meminfo") as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    info[parts[0].rstrip(":")] = int(parts[1])
        total_mb = info.get("MemTotal", 0) // 1024
        used_mb = (info.get("MemTotal", 0) - info.get("MemAvailable", 0)) // 1024
        mem_source = "/proc/meminfo (may show host RAM)"
    except (OSError, ValueError):
        pass

ok("Memory source", mem_source)
ok("Total memory", f"{total_mb} MB")
ok("Used memory", f"{used_mb} MB")
ok("Free memory", f"{total_mb - used_mb} MB")

# More than 32 GiB is treated as a sign that host RAM is being reported.
if total_mb > 32 * 1024:
    record_fail(
        "Memory total",
        f"{total_mb} MB seems too large for a container — "
        "cgroups may not be available; /proc/meminfo is showing host RAM. "
        "Memory guard in main.py will be conservative.",
        critical=False,
    )
# ── 8. /proc/meminfo sanity ────────────────────────────────────────────────────
section("8. /proc/meminfo (for reference)")
try:
    with open("/proc/meminfo") as meminfo:
        head = meminfo.readlines()[:5]
    # Echo the first five entries, converting the kB figure to MB.
    for entry in head:
        fields = entry.split()
        if len(fields) < 2:
            continue
        ok(fields[0].rstrip(":"), f"{int(fields[1]) // 1024} MB")
except Exception as exc:
    record_fail("/proc/meminfo", str(exc), critical=False)
# ═══════════════════════════════════════════════════════════════════════════════
# Summary
# ═══════════════════════════════════════════════════════════════════════════════
# Final report: warnings first, then either the critical failures or the
# all-clear message. The glyphs below replace mis-decoded UTF-8 sequences.
summary_rule = "═" * 60
print("\n" + summary_rule, flush=True)
print(" Validation Summary", flush=True)
print(summary_rule, flush=True)
if warnings:
    print(f"\n ⚠ {len(warnings)} warning(s):", flush=True)
    for label, detail in warnings:
        print(f" • {label}: {detail}", flush=True)
if failures:
    print(f"\n ✗ {len(failures)} CRITICAL failure(s):", flush=True)
    for label, detail in failures:
        print(f" • {label}: {detail}", flush=True)
    if SOFT_MODE:
        # In soft mode the script exits 0, so claiming that the service will
        # not start would be wrong.
        print("\n --soft given: exiting 0 despite the failures above.", flush=True)
    else:
        print("\n Service will NOT start until these are resolved.", flush=True)
    print(" Check Dockerfile pip layers and Docker build log.", flush=True)
    print(summary_rule + "\n", flush=True)
    if not SOFT_MODE:
        sys.exit(1)
else:
    print("\n ✓ All critical checks passed", flush=True)
    if warnings:
        print(f" ⚠ {len(warnings)} non-critical warning(s) — see above", flush=True)
    print("\n Service is ready to start.", flush=True)
    print(summary_rule + "\n", flush=True)
# Reached on success, and in soft mode even when critical checks failed.
sys.exit(0)