Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Pre-flight validation script for MinerU OCR Service. | |
| Run by entrypoint.sh BEFORE uvicorn starts. | |
| Exits 0 if all checks pass. | |
| Exits 1 if any CRITICAL check fails — this crashes the container loudly | |
| so Hugging Face logs show an actionable error instead of a silent crash | |
| or a healthy-looking service that fails on every request. | |
| Usage: | |
| python validate.py # run all checks, exit 0/1 | |
| python validate.py --soft # run all checks, always exit 0 (log only) | |
| """ | |
| import importlib | |
| import json | |
| import os | |
| import shutil | |
| import sys | |
| import tempfile | |
| import time | |
| import traceback | |
# ── runtime configuration ────────────────────────────────────────────────────
# With --soft anywhere on the command line the script only logs and never
# exits with status 1.
SOFT_MODE = "--soft" in sys.argv

# Root of the model tree and the PDF-Extract-Kit sub-tree inside it.
MODELS_DIR = "/app/models"
EXTRACT_KIT_MODELS = os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0", "models")

# Canary directory for the layout models.
LAYOUT_MARKER = os.path.join(EXTRACT_KIT_MODELS, "Layout")

# MinerU config file, resolved inside the current user's home directory.
CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")
# ── helpers ──────────────────────────────────────────────────────────────────
def ok(label: str, detail: str = "") -> None:
    """Print one line reporting that a check passed.

    Args:
        label: Short name of the check.
        detail: Optional extra information; when non-empty it is appended
            in parentheses after the label.
    """
    suffix = f" ({detail})" if detail else ""
    # The pass mark had been mis-decoded into a stray Greek letter; use a
    # real check mark so passing lines are recognisable at a glance.
    print(f" ✓ {label}{suffix}", flush=True)
def fail(label: str, detail: str, critical: bool = True) -> None:
    """Print one line reporting that a check failed.

    This only prints; it does not record the failure anywhere.

    Args:
        label: Short name of the check.
        detail: Human-readable explanation of what went wrong.
        critical: Selects the ``CRITICAL`` tag when true, ``WARNING``
            otherwise.
    """
    tag = "CRITICAL" if critical else "WARNING"
    # The failure mark had been mis-decoded into a stray Greek letter; use a
    # real cross so failing lines are recognisable at a glance.
    print(f" ✗ [{tag}] {label}: {detail}", flush=True)
def section(title: str) -> None:
    """Print a section heading framed by two 60-character rules.

    Output is a blank line, a rule, the title, and a closing rule.

    Args:
        title: Heading text shown between the rules.
    """
    # The rule character had been mis-decoded; draw it with a proper
    # box-drawing dash.
    rule = "─" * 60
    print(f"\n{rule}", flush=True)
    print(f" {title}", flush=True)
    print(rule, flush=True)
# ── check registry ───────────────────────────────────────────────────────────
# Each entry is a (label, detail) pair appended by record_fail().
failures: list[tuple[str, str]] = []
warnings: list[tuple[str, str]] = []


def record_fail(label: str, detail: str, critical: bool = True) -> None:
    """Print a failed check and remember it.

    The line is printed through ``fail``; the ``(label, detail)`` pair is
    then appended to ``failures`` when *critical* is true and to
    ``warnings`` otherwise.
    """
    fail(label, detail, critical)
    bucket = failures if critical else warnings
    bucket.append((label, detail))
# ─────────────────────────────────────────────────────────────────────────────
# Opening banner. The rule and the dash in the title had been mis-decoded
# into stray Greek letters; they are restored to the intended glyphs here.
print("\n" + "═" * 60, flush=True)
print(" MinerU OCR Service — Pre-flight Validation", flush=True)
print("═" * 60, flush=True)
# ── 1. Python version ────────────────────────────────────────────────────────
# Critical check: the interpreter must be at least 3.10.
section("1. Python runtime")
pv = sys.version_info
if pv >= (3, 10):
    ok("Python version", f"{pv.major}.{pv.minor}.{pv.micro}")
else:
    # The dash in this message had been mis-decoded; restored to an em dash.
    record_fail("Python version",
                f"{pv.major}.{pv.minor} detected — magic-pdf requires >= 3.10")
# ── 2. cv2 ───────────────────────────────────────────────────────────────────
# Critical: cv2 must import. Non-critical: warn when the build reports an
# enabled GUI toolkit.
section("2. OpenCV (cv2)")
try:
    import cv2
    ok("cv2 import", f"version {cv2.__version__}")
    # Confirm headless (no X11 dep) by checking build info.
    build = cv2.getBuildInformation()
    # A bare substring test misfires in both directions: the build summary
    # can name a toolkit that is switched off (e.g. "GTK+: NO"), and a
    # case-sensitive "Qt" never matches a "QT: YES" line. Parse the
    # "key: value" lines instead and only count toolkits whose value is
    # neither empty nor NO/NONE.
    # NOTE(review): relies on the usual "KEY: VALUE" layout of OpenCV's
    # build summary — confirm against the pinned opencv build.
    gui_backends = []
    for raw_line in build.splitlines():
        key, sep, value = raw_line.partition(":")
        key = key.strip().upper()
        value = value.strip()
        if not sep or not key.startswith(("GUI", "QT", "GTK")):
            continue
        if value and not value.upper().startswith("NO"):
            gui_backends.append(f"{key}={value}")
    if gui_backends:
        record_fail("cv2 build", "GUI backend detected — use opencv-python-headless",
                    critical=False)
    else:
        ok("cv2 headless", "no GUI backend detected")
except ImportError as exc:
    record_fail(
        "cv2 import",
        f"{exc}. "
        "Add 'opencv-python-headless>=4.8.0' to Dockerfile pip layer 1 "
        "BEFORE magic-pdf install.",
    )
except Exception as exc:
    record_fail("cv2 import", f"unexpected error: {exc}")
# ── 3. magic_pdf core ────────────────────────────────────────────────────────
# Import each required module and verify the listed attributes exist on it.
section("3. magic_pdf core imports")
# (module path, attribute names that must be present on the imported module)
REQUIRED_IMPORTS = [
    ("magic_pdf.data.dataset", ["PymuDocDataset", "ImageDataset"]),
    ("magic_pdf.data.data_reader_writer", ["FileBasedDataReader", "FileBasedDataWriter"]),
    ("magic_pdf.model.doc_analyze_by_custom_model", ["doc_analyze"]),
    ("magic_pdf.config.enums", ["SupportedPdfParseMethod"]),
]
for module_path, symbols in REQUIRED_IMPORTS:
    try:
        loaded = importlib.import_module(module_path)
        absent = [name for name in symbols if not hasattr(loaded, name)]
        if not absent:
            ok(module_path, ", ".join(symbols))
        else:
            record_fail(module_path, f"missing symbols: {absent}")
    except ImportError as exc:
        record_fail(module_path, str(exc))
    except Exception as exc:
        record_fail(module_path, f"unexpected: {exc}")
# Confirm removed/deprecated imports are truly gone.
# An ImportError is the good outcome here; a successful import is a warning.
section("3b. Deprecated API check (should NOT exist)")
OBSOLETE = [
    "magic_pdf.pipe.UNIPipe",
    "magic_pdf.rw.DiskReaderWriter",
]
for mod_path in OBSOLETE:
    try:
        importlib.import_module(mod_path)
    except ImportError:
        ok(f"{mod_path} (correctly absent)")
    except Exception as exc:
        # The module exists but blew up while importing. That must not take
        # the whole validator down; report it as the same non-critical
        # finding, since the legacy module is evidently still installed.
        record_fail(mod_path, f"present but failed to import: {exc}", critical=False)
    else:
        # The dash in this message had been mis-decoded; restored.
        record_fail(mod_path, "still importable — code may use old API", critical=False)
# ── 4. Config file ───────────────────────────────────────────────────────────
# The config must exist, parse as JSON and carry the mandatory keys.
section("4. MinerU config (magic-pdf.json)")
if not os.path.exists(CONFIG_PATH):
    record_fail(
        "Config file",
        f"not found at {CONFIG_PATH}. "
        "Run download_models.py or check Docker build log.",
    )
else:
    try:
        with open(CONFIG_PATH) as f:
            cfg = json.load(f)
        absent_keys = [key for key in ("models-dir", "device-mode") if key not in cfg]
        if absent_keys:
            record_fail("Config keys", f"missing: {absent_keys}")
        else:
            ok("Config file", CONFIG_PATH)
            # Echo the top-level settings first, then the nested enable flags.
            for key in ("device-mode", "models-dir"):
                ok(key, cfg.get(key, "?"))
            for shown, parent in (("formula-enable", "formula-config"),
                                  ("table-enable", "table-config")):
                ok(shown, str(cfg.get(parent, {}).get("enable", "?")))
    except json.JSONDecodeError as exc:
        record_fail("Config file", f"invalid JSON: {exc}")
    except Exception as exc:
        record_fail("Config file", str(exc))
# ── 5. Model files ───────────────────────────────────────────────────────────
# Every directory listed here must exist; a missing one is a critical failure.
section("5. Model files")
model_checks = [
    ("PDF-Extract-Kit-1.0 root", os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0")),
    ("Layout models (canary)", LAYOUT_MARKER),
    ("MFD models", os.path.join(EXTRACT_KIT_MODELS, "MFD")),
    ("Table models", os.path.join(EXTRACT_KIT_MODELS, "TabRec")),
]
for label, path in model_checks:
    if not os.path.isdir(path):
        record_fail(label, f"directory not found: {path}")
        continue
    # Report the entry count as a sanity check; fall back to the bare path
    # when the directory cannot be listed.
    try:
        entry_count = len(list(os.scandir(path)))
        ok(label, f"{path} ({entry_count} entries)")
    except OSError:
        ok(label, path)
# layoutreader — optional: its absence is only a warning.
lr_dir = os.path.join(MODELS_DIR, "layoutreader")
if os.path.isdir(lr_dir):
    ok("layoutreader (optional)", lr_dir)
else:
    # The dash in this message had been mis-decoded; restored to an em dash.
    record_fail("layoutreader (optional)",
                "not found — MinerU will use fallback ordering (non-critical)",
                critical=False)
# Validate that the config's models-dir points at an existing directory.
# The config is re-read here; any problem loading it is deliberately ignored
# at this point (best-effort check).
try:
    with open(CONFIG_PATH) as f:
        cfg = json.load(f)
    cfg_models = cfg.get("models-dir", "")
    if cfg_models:
        if os.path.isdir(cfg_models):
            ok("Config models-dir exists", cfg_models)
        else:
            record_fail("Config models-dir", f"points to missing path: {cfg_models}")
except Exception:
    pass  # already reported above
# ── 6. Temp storage ──────────────────────────────────────────────────────────
# Write a 4 KiB file into a fresh temp directory and verify its size.
section("6. Temp storage")
try:
    # TemporaryDirectory removes the directory even when the write or the
    # size check fails, so a failed probe no longer leaves files behind.
    with tempfile.TemporaryDirectory(prefix="mineru_validate_") as td:
        test_file = os.path.join(td, "write_test.bin")
        with open(test_file, "wb") as f:
            f.write(b"x" * 4096)
        written = os.path.getsize(test_file)
        # Explicit check instead of `assert`: asserts vanish under -O and
        # produce an empty message when they do fire.
        if written != 4096:
            raise OSError(f"wrote 4096 bytes but file size is {written}")
    ok("Temp write + delete", tempfile.gettempdir())
except Exception as exc:
    record_fail("Temp storage", str(exc))
# ── 7. System memory ─────────────────────────────────────────────────────────
# Determine the memory limit and usage, trying in order: cgroups v2,
# cgroups v1, then /proc/meminfo. Results land in total_mb / used_mb /
# mem_source. Unreadable files (missing, permission denied, ...) are treated
# as "source not available" rather than crashing the validator.
section("7. System memory (cgroups)")
mem_source = "unknown"
total_mb = used_mb = 0

# cgroups v2: memory.max holds either a byte count or the literal "max".
try:
    with open("/sys/fs/cgroup/memory.max") as f:
        raw = f.read().strip()
    if raw != "max":
        total_mb = int(raw) // (1024 * 1024)
    with open("/sys/fs/cgroup/memory.current") as f:
        used_mb = int(f.read().strip()) // (1024 * 1024)
    # Only claim this source when it actually produced a limit.
    if total_mb > 0:
        mem_source = "cgroups v2"
except (OSError, ValueError):
    pass

# cgroups v1: limits of 128 GiB or more are treated as "no limit set".
if total_mb == 0:
    try:
        with open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f:
            limit = int(f.read().strip())
        with open("/sys/fs/cgroup/memory/memory.usage_in_bytes") as f:
            used_bytes = int(f.read().strip())
        if limit < 128 * 1024 * 1024 * 1024:
            total_mb = limit // (1024 * 1024)
            used_mb = used_bytes // (1024 * 1024)
            mem_source = "cgroups v1"
    except (OSError, ValueError):
        pass

# Last resort: /proc/meminfo (values are in kB).
if total_mb == 0:
    try:
        info: dict[str, int] = {}
        with open("/proc/meminfo") as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    info[parts[0].rstrip(":")] = int(parts[1])
        total_mb = info.get("MemTotal", 0) // 1024
        used_mb = (info.get("MemTotal", 0) - info.get("MemAvailable", 0)) // 1024
        mem_source = "/proc/meminfo (may show host RAM)"
    except (OSError, ValueError):
        pass

ok("Memory source", mem_source)
ok("Total memory", f"{total_mb} MB")
ok("Used memory", f"{used_mb} MB")
ok("Free memory", f"{total_mb - used_mb} MB")
if total_mb > 32 * 1024:
    # The dash in this message had been mis-decoded; restored to an em dash.
    record_fail(
        "Memory total",
        f"{total_mb} MB seems too large for a container — "
        "cgroups may not be available; /proc/meminfo is showing host RAM. "
        "Memory guard in main.py will be conservative.",
        critical=False,
    )
# ── 8. /proc/meminfo sanity ──────────────────────────────────────────────────
# Echo the first five /proc/meminfo entries, converted from kB to MB.
section("8. /proc/meminfo (for reference)")
try:
    with open("/proc/meminfo") as f:
        head = f.readlines()[:5]
    for fields in map(str.split, head):
        if len(fields) < 2:
            continue
        kb = int(fields[1])
        ok(fields[0].rstrip(":"), f"{kb // 1024} MB")
except Exception as exc:
    record_fail("/proc/meminfo", str(exc), critical=False)
# ─────────────────────────────────────────────────────────────────────────────
# Summary
# ─────────────────────────────────────────────────────────────────────────────
# Print recorded warnings and critical failures, then exit: status 1 when
# there are critical failures and --soft was not given, status 0 otherwise.
# The rule, bullet and status glyphs had been mis-decoded and are restored.
print("\n" + "═" * 60, flush=True)
print(" Validation Summary", flush=True)
print("═" * 60, flush=True)
if warnings:
    print(f"\n ⚠ {len(warnings)} warning(s):", flush=True)
    for label, detail in warnings:
        print(f" • {label}: {detail}", flush=True)
if failures:
    print(f"\n ✗ {len(failures)} CRITICAL failure(s):", flush=True)
    for label, detail in failures:
        print(f" • {label}: {detail}", flush=True)
    if SOFT_MODE:
        # In --soft mode the script exits 0, so claiming the service will
        # not start would be misleading.
        print("\n --soft given: exiting 0 despite the failures above.", flush=True)
    else:
        print("\n Service will NOT start until these are resolved.", flush=True)
    print(" Check Dockerfile pip layers and Docker build log.", flush=True)
    print("═" * 60 + "\n", flush=True)
    if not SOFT_MODE:
        sys.exit(1)
else:
    print("\n ✓ All critical checks passed", flush=True)
    if warnings:
        print(f" ⚠ {len(warnings)} non-critical warning(s) — see above", flush=True)
    print("\n Service is ready to start.", flush=True)
    print("═" * 60 + "\n", flush=True)
sys.exit(0)