Spaces:
Running
Running
#!/usr/bin/env python3
"""
Pre-flight validation script for MinerU OCR Service.

Run by entrypoint.sh BEFORE uvicorn starts.
Exits 0 if all checks pass.
Exits 1 if any CRITICAL check fails — this crashes the container loudly
so Hugging Face logs show an actionable error instead of a silent crash
or a healthy-looking service that fails on every request.

Usage:
    python validate.py          # run all checks, exit 0/1
    python validate.py --soft   # run all checks, always exit 0 (log only)

── FORENSIC NOTES (2025-06) ──────────────────────────────────────────────────

OCR engine:
    The pipeline (full) backend uses paddleocr2pytorch — a self-contained
    PyTorch reimplementation of PaddleOCR bundled inside the magic-pdf wheel.
    It uses: torch, cv2, numpy, pyclipper, shapely, yaml.
    The paddlepaddle and paddleocr packages are NOT installed and NOT needed.
    pp_structure_v2.py (which imports paddleocr) is only loaded in 'lite' model
    mode. The pipeline backend always uses 'full' mode (CustomPEKModel), so that
    file is never imported at runtime.

OCR model path resolution (from pytorch_paddle.py):
    ocr_models_dir = os.path.join(get_local_models_dir(), 'OCR', 'paddleocr_torch')
    det_model_path = os.path.join(ocr_models_dir, det_filename)
    where det_filename comes from models_config.yml keyed by language.
    Default CPU path: lang='ch' → forced to 'ch_lite' on CPU device.
    After the Dockerfile Layer 3.5 patch:
        ch_lite.det = ch_PP-OCRv5_det_infer.pth (was ch_PP-OCRv3 — not in HF repo)
        ch_lite.rec = ch_PP-OCRv5_rec_infer.pth (unchanged — already in HF repo)

Arch config lookup (from pytorchocr_utility.py):
    get_arch_config(model_path) uses Path(model_path).stem as the key into
    arch_config.yaml (bundled in the magic-pdf wheel). Both replacement filenames
    have entries in arch_config.yaml — verified before the patch was written.

OpenCV conflict handling:
    doclayout-yolo, ultralytics, and rapid-table all declare opencv-python
    (non-headless) as a required dep. pip installs the full build in Layer 3.
    Layer 4 force-reinstalls opencv-python-headless to overwrite cv2. Both
    packages expose an identical cv2 API, so all callers work correctly at
    runtime. pip-check shows warnings, but they are harmless.

onnxruntime:
    rapid-table declares onnxruntime>1.17.0 as a required (non-optional) dep.
    pip resolves it automatically when magic-pdf[full] is installed in Layer 3.

slanet-plus.onnx (table model):
    Bundled inside the magic-pdf wheel at:
        magic_pdf/resources/slanet_plus/slanet-plus.onnx
    NOT downloaded from HF Hub — no separate download needed.
"""
| import importlib | |
| import json | |
| import os | |
| import shutil | |
| import sys | |
| import tempfile | |
| import traceback | |
# Command-line switch: when --soft is given the script only logs and never
# exits with status 1.
SOFT_MODE = any(arg == "--soft" for arg in sys.argv)  # never exit 1, just print

# Filesystem layout shared by the model checks further down.
MODELS_DIR = "/app/models"
EXTRACT_KIT_MODELS = os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0", "models")
LAYOUT_MARKER = os.path.join(
    MODELS_DIR, "PDF-Extract-Kit-1.0", "models", "Layout"
)  # canary directory

# MinerU configuration file in the current user's home directory.
CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")
# ── helpers ──────────────────────────────────────────────────────────────────
def ok(label: str, detail: str = "") -> None:
    """Print one passing-check line to stdout.

    Args:
        label: Short name of the check that passed.
        detail: Optional extra information; appended in parentheses when
            non-empty.
    """
    suffix = f" ({detail})" if detail else ""
    # The marker was a mis-encoded glyph identical to the failure marker;
    # use a real check mark so passes are recognisable at a glance in the log.
    print(f" ✓ {label}{suffix}", flush=True)
def fail(label: str, detail: str, critical: bool = True) -> None:
    """Print one failing-check line to stdout.

    Args:
        label: Short name of the check that failed.
        detail: Explanation of the failure.
        critical: When true the line is tagged ``CRITICAL``, otherwise
            ``WARNING``.
    """
    tag = "CRITICAL" if critical else "WARNING"
    # The marker was a mis-encoded glyph identical to the pass marker;
    # use a real cross so failures stand out in the log.
    print(f" ✗ [{tag}] {label}: {detail}", flush=True)
def section(title: str) -> None:
    """Print a section heading framed by two 60-character rule lines.

    Args:
        title: Heading text shown between the rules.
    """
    # The rule character was mis-encoded; a box-drawing dash restores the
    # intended horizontal separator.
    rule = "─" * 60
    print(f"\n{rule}", flush=True)
    print(f" {title}", flush=True)
    print(rule, flush=True)
# ── check registry ───────────────────────────────────────────────────────────
# (label, detail) pairs accumulated while the checks run; the summary at the
# end of the script reads both lists.
failures: list[tuple[str, str]] = []
warnings: list[tuple[str, str]] = []


def record_fail(label: str, detail: str, critical: bool = True) -> None:
    """Print a failed check and remember it for the final summary.

    Args:
        label: Short name of the check that failed.
        detail: Explanation of the failure.
        critical: True appends to ``failures``; False appends to ``warnings``.
    """
    fail(label, detail, critical)
    bucket = failures if critical else warnings
    bucket.append((label, detail))
# ─────────────────────────────────────────────────────────────────────────────
# Banner. The rule character and the dash were mis-encoded in the log output;
# they are restored to a double rule and an em dash.
_banner_rule = "═" * 60
print("\n" + _banner_rule, flush=True)
print(" MinerU OCR Service — Pre-flight Validation", flush=True)
print(_banner_rule, flush=True)

# ── 1. Python version ────────────────────────────────────────────────────────
section("1. Python runtime")
pv = sys.version_info
if pv < (3, 10):
    # Critical: recorded in `failures`, so the script exits 1 unless --soft.
    record_fail(
        "Python version",
        f"{pv.major}.{pv.minor} detected — magic-pdf requires >= 3.10",
    )
else:
    ok("Python version", f"{pv.major}.{pv.minor}.{pv.micro}")
# ── 2. cv2 ───────────────────────────────────────────────────────────────────
section("2. OpenCV (cv2)")
try:
    import cv2

    ok("cv2 import", f"version {cv2.__version__}")
    build = cv2.getBuildInformation()
    # NOTE(review): plain substring match on the build report — confirm that a
    # headless build never mentions these markers (e.g. as a "NO" entry).
    gui_backend_found = any(marker in build for marker in ("GTK", "Qt"))
    if not gui_backend_found:
        ok("cv2 headless", "no GUI backend detected")
    else:
        # A GUI build is only a warning; it does not block startup.
        record_fail(
            "cv2 build",
            "GUI backend detected β use opencv-python-headless",
            critical=False,
        )
except Exception as exc:
    if isinstance(exc, ImportError):
        cv2_hint = (
            "Layer 4 force-reinstall of opencv-python-headless may have failed. "
            "Check Docker build log for the 'pip install --force-reinstall opencv-python-headless' step."
        )
        record_fail("cv2 import", f"{exc}. " + cv2_hint)
    else:
        record_fail("cv2 import", f"unexpected error: {exc}")
# ── 3. PyTorch ───────────────────────────────────────────────────────────────
section("3. PyTorch + TorchVision")

# torch: must import; a visible CUDA device is only reported as a warning.
try:
    import torch

    ok("torch import", f"version {torch.__version__}")
    if not torch.cuda.is_available():
        ok("torch device", "CPU-only (expected for free tier)")
    else:
        record_fail(
            "torch CUDA",
            "CUDA detected on CPU-only space β unexpected",
            critical=False,
        )
except ImportError as exc:
    torch_hint = (
        "Install from PyTorch CPU index BEFORE magic-pdf in Dockerfile Layer 2: "
        "pip install --index-url https://download.pytorch.org/whl/cpu torch torchvision"
    )
    record_fail("torch import", f"{exc}. " + torch_hint)
except Exception as exc:
    record_fail("torch import", f"unexpected: {exc}")

# torchvision: checked independently so both failures are reported.
try:
    import torchvision

    ok("torchvision import", f"version {torchvision.__version__}")
except ImportError as exc:
    record_fail("torchvision import", f"{exc}")
except Exception as exc:
    record_fail("torchvision import", f"unexpected: {exc}")
# ── 4. ultralytics ───────────────────────────────────────────────────────────
section("4. ultralytics (YOLO β required by doclayout_yolo)")
try:
    import ultralytics

    ok("ultralytics import", f"version {ultralytics.__version__}")
except ImportError as exc:
    # The hint records the root cause found for this failure mode.
    ultralytics_hint = (
        "Provided by magic-pdf[full]. "
        "ROOT CAUSE: [full-cpu] is NOT a valid extra in magic-pdf 1.3.12 β "
        "pip silently installed only the base package when given an unknown extra. "
        "Dockerfile Layer 3 must use magic-pdf[full]==1.3.12 (not [full-cpu])."
    )
    record_fail("ultralytics import", f"{exc}. " + ultralytics_hint)
except Exception as exc:
    record_fail("ultralytics import", f"unexpected: {exc}")
# ── 5. doclayout_yolo ────────────────────────────────────────────────────────
section("5. doclayout_yolo (layout detection model)")
try:
    import doclayout_yolo

    # The package may not expose __version__, hence the getattr fallback.
    doclayout_version = getattr(doclayout_yolo, "__version__", "unknown")
    ok("doclayout_yolo import", f"version {doclayout_version}")
except ImportError as exc:
    doclayout_hint = (
        "Provided by magic-pdf[full] (version 0.0.2b1). "
        "doclayout-yolo==0.0.2b1 is only on the myhloli custom wheel index β "
        "Dockerfile Layer 3 must include: "
        "--extra-index-url https://myhloli.github.io/wheels/"
    )
    record_fail("doclayout_yolo import", f"{exc}. " + doclayout_hint)
except Exception as exc:
    record_fail("doclayout_yolo import", f"unexpected: {exc}")
# ── 6. rapid_table ───────────────────────────────────────────────────────────
section("6. rapid_table (table extraction)")
try:
    import rapid_table

    # The package may not expose __version__, hence the getattr fallback.
    rapid_table_version = getattr(rapid_table, "__version__", "unknown")
    ok("rapid_table import", f"version {rapid_table_version}")
except Exception as exc:
    if isinstance(exc, ImportError):
        record_fail(
            "rapid_table import",
            f"{exc}. Provided by magic-pdf[full]. Check Layer 3 install.",
        )
    else:
        record_fail("rapid_table import", f"unexpected: {exc}")
# ── 7. onnxruntime ───────────────────────────────────────────────────────────
section("7. onnxruntime (required by rapid-table for table model inference)")
# rapid-table>=1.0.5 lists onnxruntime as a mandatory dependency, so pip pulls
# it in together with magic-pdf[full] in Layer 3. A missing onnxruntime
# therefore points at a failed rapid-table install.
try:
    import onnxruntime

    ok("onnxruntime import", f"version {onnxruntime.__version__}")
except ImportError as exc:
    onnx_hint = (
        "onnxruntime is a required dep of rapid-table>=1.0.5. "
        "Its absence means rapid-table failed to install in Layer 3. "
        "Check Docker build log for rapid-table install errors."
    )
    record_fail("onnxruntime import", f"{exc}. " + onnx_hint)
except Exception as exc:
    record_fail("onnxruntime import", f"unexpected: {exc}")
# ── 8. magic_pdf core imports ────────────────────────────────────────────────
section("8. magic_pdf core imports")

# (module path, names that module must expose)
REQUIRED_IMPORTS = [
    ("magic_pdf.data.dataset", ["PymuDocDataset", "ImageDataset"]),
    ("magic_pdf.data.data_reader_writer", ["FileBasedDataReader", "FileBasedDataWriter"]),
    ("magic_pdf.model.doc_analyze_by_custom_model", ["doc_analyze"]),
    ("magic_pdf.config.enums", ["SupportedPdfParseMethod"]),
]

for module_path, symbols in REQUIRED_IMPORTS:
    try:
        mod = importlib.import_module(module_path)
        missing = [name for name in symbols if not hasattr(mod, name)]
        if not missing:
            ok(module_path, ", ".join(symbols))
        else:
            record_fail(module_path, f"missing symbols: {missing}")
    except ImportError as exc:
        record_fail(module_path, f"{exc}")
    except Exception as exc:
        record_fail(module_path, f"unexpected: {exc}")
# ── 8b. paddleocr2pytorch (OCR engine bundled inside magic-pdf wheel) ────────
section("8b. paddleocr2pytorch (PyTorch OCR β bundled in magic-pdf wheel)")
# The pipeline backend's OCR engine is not a pip package of its own: it ships
# inside the magic-pdf wheel under
#   magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/
# so a failed import here means magic-pdf itself is not installed correctly.
try:
    from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR

    ok(
        "PytorchPaddleOCR (paddleocr2pytorch)",
        "bundled inside magic-pdf wheel β no paddlepaddle pkg needed",
    )
except ImportError as exc:
    paddle_hint = (
        "This module is bundled inside magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/. "
        "If missing, magic-pdf itself did not install correctly."
    )
    record_fail("PytorchPaddleOCR import", f"{exc}. " + paddle_hint)
except Exception as exc:
    record_fail("PytorchPaddleOCR import", f"unexpected: {exc}")
# ── 8c. Deprecated API check ─────────────────────────────────────────────────
section("8c. Deprecated API check (should NOT exist)")

# Module paths of the old API; each is expected to fail to import.
OBSOLETE = [
    "magic_pdf.pipe.UNIPipe",
    "magic_pdf.rw.DiskReaderWriter",
]

for mod_path in OBSOLETE:
    try:
        importlib.import_module(mod_path)
    except ImportError:
        ok(f"{mod_path} (correctly absent)")
    except Exception as exc:
        # Previously any non-ImportError raised while importing a legacy module
        # escaped and aborted the validator before the summary was printed.
        # The module is present (it ran far enough to raise), so report it as
        # the same non-critical warning as the importable case.
        record_fail(
            mod_path,
            f"present but raised on import ({exc!r}) — code may use old API",
            critical=False,
        )
    else:
        record_fail(mod_path, "still importable — code may use old API", critical=False)
# ── 9. End-to-end pipeline smoke test ────────────────────────────────────────
section("9. End-to-end pipeline smoke test")
# Imports, in one go, the names listed below; nothing is executed beyond the
# imports themselves.
try:
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze  # noqa: F401
    import ultralytics  # noqa: F401
    from magic_pdf.data.dataset import ImageDataset  # noqa: F401
    from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter  # noqa: F401

    ok("Pipeline imports (doc_analyze + ultralytics + ImageDataset + readers)", "all OK")
except Exception as exc:
    if isinstance(exc, ImportError):
        record_fail(
            "Pipeline smoke test",
            f"Full pipeline import chain failed: {exc}. "
            "This means POST /extract will fail on every request.",
        )
    else:
        record_fail("Pipeline smoke test", f"unexpected: {exc}")
# ── 10. Config file ──────────────────────────────────────────────────────────
section("10. MinerU config (magic-pdf.json)")

# Parsed configuration; stays empty unless the file holds a JSON object.
_cfg: dict = {}

if not os.path.exists(CONFIG_PATH):
    record_fail(
        "Config file",
        f"not found at {CONFIG_PATH}. "
        "Run download_models.py or check Docker build log.",
    )
else:
    try:
        # JSON is UTF-8 by specification; do not depend on the container locale.
        with open(CONFIG_PATH, encoding="utf-8") as f:
            _loaded = json.load(f)
    except json.JSONDecodeError as exc:
        record_fail("Config file", f"invalid JSON: {exc}")
    except Exception as exc:
        record_fail("Config file", str(exc))
    else:
        if not isinstance(_loaded, dict):
            # A list/string/number root used to surface later as an opaque
            # "object has no attribute 'get'" message.
            record_fail(
                "Config file",
                f"top-level JSON value must be an object, got {type(_loaded).__name__}",
            )
        else:
            _cfg = _loaded
            required_keys = ["models-dir", "device-mode"]
            missing_keys = [k for k in required_keys if k not in _cfg]
            if missing_keys:
                record_fail("Config keys", f"missing: {missing_keys}")
            else:
                ok("Config file", CONFIG_PATH)
                ok("device-mode", _cfg.get("device-mode", "?"))
                ok("models-dir", _cfg.get("models-dir", "?"))
                # Informational only: show "?" when the sub-section is absent
                # or is not an object, instead of raising.
                for _flag_label, _section_key in (
                    ("formula-enable", "formula-config"),
                    ("table-enable", "table-config"),
                ):
                    _sub = _cfg.get(_section_key, {})
                    _enabled = _sub.get("enable", "?") if isinstance(_sub, dict) else "?"
                    ok(_flag_label, str(_enabled))
# ── 11. Model directory structure ────────────────────────────────────────────
section("11. Model directory structure")

# (label, directory that must exist)
model_dir_checks = [
    ("PDF-Extract-Kit-1.0 root", os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0")),
    ("Layout models", os.path.join(EXTRACT_KIT_MODELS, "Layout")),
    ("Layout/YOLO", os.path.join(EXTRACT_KIT_MODELS, "Layout", "YOLO")),
    ("OCR models", os.path.join(EXTRACT_KIT_MODELS, "OCR")),
    ("OCR/paddleocr_torch", os.path.join(EXTRACT_KIT_MODELS, "OCR", "paddleocr_torch")),
    ("Table models (TabRec)", os.path.join(EXTRACT_KIT_MODELS, "TabRec")),
]

for label, path in model_dir_checks:
    if not os.path.isdir(path):
        record_fail(label, f"directory not found: {path}")
        continue
    try:
        n = len(os.listdir(path))
    except OSError:
        # The directory exists but cannot be listed: still counts as present.
        ok(label, path)
    else:
        ok(label, f"{n} entries [{path}]")

# layoutreader is optional; its absence is only a warning.
lr_dir = os.path.join(MODELS_DIR, "layoutreader")
if not os.path.isdir(lr_dir):
    record_fail(
        "layoutreader (optional)",
        "not found β MinerU will use fallback ordering (non-critical)",
        critical=False,
    )
else:
    ok("layoutreader (optional)", lr_dir)
# ── 11b. Critical model weight files ─────────────────────────────────────────
section("11b. Critical model weight files")
#
# These are the EXACT files MinerU will try to open when processing a document
# on a CPU deployment (default language = ch → forced to ch_lite on CPU).
#
# After the Dockerfile Layer 3.5 patch, models_config.yml references:
#   ch_lite.det = ch_PP-OCRv5_det_infer.pth (patched from v3 — v3 NOT in repo)
#   ch_lite.rec = ch_PP-OCRv5_rec_infer.pth (unchanged — always in repo)
#
# Layout uses doclayout_yolo (from magic-pdf.json layout-config).
# Table (rapid_table) uses slanet-plus.onnx BUNDLED IN THE WHEEL — not here.
# Formula is DISABLED — MFD/MFR files not required.
#
# Any CRITICAL failure here = service boots but crashes on first document.
CRITICAL_WEIGHT_FILES: list[tuple[str, str, str]] = [
    # (label, relative-to-EXTRACT_KIT_MODELS, reason)
    (
        "OCR det weight (ch_lite, default CPU lang)",
        os.path.join("OCR", "paddleocr_torch", "ch_PP-OCRv5_det_infer.pth"),
        "Patched from ch_PP-OCRv3_det_infer.pth (absent in HF repo). "
        "Missing = all OCR will crash at model load time.",
    ),
    (
        "OCR rec weight (ch_lite)",
        os.path.join("OCR", "paddleocr_torch", "ch_PP-OCRv5_rec_infer.pth"),
        "Recognition model for ch_lite. "
        "Missing = OCR loads det but crashes at recognition.",
    ),
    (
        "OCR cls weight (angle classifier)",
        os.path.join("OCR", "paddleocr_torch", "ch_ptocr_mobile_v2.0_cls_infer.pth"),
        "Used when use_angle_cls=True. Default is False so non-critical, "
        "but its absence causes crash if angle classification is enabled.",
    ),
    (
        "Layout YOLO weight (doclayout_yolo)",
        os.path.join("Layout", "YOLO", "doclayout_yolo_docstructbench_imgsz1280_2501.pt"),
        "Layout detection model. Missing = layout detection crashes on every document.",
    ),
    (
        "Layout LayoutLMv3 weight",
        os.path.join("Layout", "LayoutLMv3", "model_final.pth"),
        "Alternative layout model. Required even when doclayout_yolo is primary "
        "because model_configs.yaml always lists it.",
    ),
    (
        "Multilingual OCR det (en/latin fallback)",
        os.path.join("OCR", "paddleocr_torch", "Multilingual_PP-OCRv3_det_infer.pth"),
        "Patched det for en and latin languages. Missing = crash if lang=en/latin.",
    ),
]

# cls weight is only critical if use_angle_cls=True (default False)
NON_CRITICAL_LABELS = {"OCR cls weight (angle classifier)"}

for label, rel_path, reason in CRITICAL_WEIGHT_FILES:
    full_path = os.path.join(EXTRACT_KIT_MODELS, rel_path)
    is_critical = label not in NON_CRITICAL_LABELS
    try:
        # None means "no regular file at this path".
        size_bytes = os.path.getsize(full_path) if os.path.isfile(full_path) else None
    except OSError as exc:
        # The file vanished or is unreadable between the two calls; report it
        # rather than letting the validator crash.
        record_fail(label, f"cannot stat {full_path}: {exc}", critical=is_critical)
        continue

    if size_bytes is None:
        record_fail(
            label,
            f"FILE NOT FOUND: {full_path}\n"
            f" Reason: {reason}",
            critical=is_critical,
        )
    elif size_bytes == 0:
        # An empty weight file (e.g. an interrupted download) can never be
        # loaded, so it must not be reported as present.
        record_fail(
            label,
            f"FILE IS EMPTY (0 bytes): {full_path}\n"
            f" Reason: {reason}",
            critical=is_critical,
        )
    else:
        size_mb = size_bytes / (1024 * 1024)
        ok(label, f"{size_mb:.1f} MB [{full_path}]")
# ── 11c. models_config.yml consistency check ─────────────────────────────────
section("11c. models_config.yml consistency check")
#
# Reads the installed models_config.yml (inside the magic_pdf package) and
# verifies that every det/rec file it references for the languages used on
# this CPU deployment actually exists on disk in the expected location.
#
# This catches version drift between the magic-pdf package and the HF repo
# BEFORE the service starts, rather than mid-request.
try:
    import magic_pdf
    import yaml as _yaml
    from pathlib import Path as _Path

    _pkg = _Path(magic_pdf.__file__).parent
    _mcfg = _pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml'
    if not _mcfg.exists():
        record_fail("models_config.yml", f"not found at expected path: {_mcfg}")
    else:
        with open(_mcfg, encoding="utf-8") as _f:
            # safe_load returns None for an empty document; treat it as {}.
            _mc = _yaml.safe_load(_f) or {}
        _ocr_torch = os.path.join(EXTRACT_KIT_MODELS, "OCR", "paddleocr_torch")
        # Check the two languages actually used on this CPU deployment
        _check_langs = ["ch_lite", "ch"]
        _mc_ok = True
        _checked = 0  # number of det/rec references actually verified
        _lang_table = _mc.get("lang") or {}
        for _lang in _check_langs:
            _entry = _lang_table.get(_lang) or {}
            for _field in ("det", "rec"):
                _fname = _entry.get(_field)
                if not _fname:
                    continue
                _checked += 1
                _fpath = os.path.join(_ocr_torch, _fname)
                if os.path.isfile(_fpath):
                    ok(f"models_config[{_lang}].{_field}", _fname)
                else:
                    record_fail(
                        f"models_config[{_lang}].{_field}",
                        f"Config references '{_fname}' but file not found at:\n"
                        f" {_fpath}\n"
                        f" Dockerfile Layer 3.5 patch may not have run, "
                        f"or HF repo changed its file structure again.",
                        critical=True,
                    )
                    _mc_ok = False
        if _checked == 0:
            # Previously this case printed "all referenced det/rec files exist"
            # although nothing had been looked up. Warn instead of passing.
            record_fail(
                "models_config.yml consistency",
                f"no det/rec entries found for {_check_langs} — nothing was verified; "
                "the config layout may have changed",
                critical=False,
            )
        elif _mc_ok:
            ok("models_config.yml consistency", "all referenced det/rec files exist on disk")
except Exception as _exc:
    record_fail("models_config.yml consistency check", f"unexpected error: {_exc}", critical=False)
# ── 11d. Bundled wheel resources ─────────────────────────────────────────────
section("11d. Bundled wheel resources (inside magic_pdf package)")
#
# Files shipped inside the magic-pdf wheel itself rather than fetched from the
# HF download. If one is missing, the wheel was installed incorrectly or is
# corrupted.
try:
    import magic_pdf as _mp
    from pathlib import Path as _P

    _pkg_root = _P(_mp.__file__).parent
    _resources = _pkg_root / "resources"
    # (label, expected path inside the installed package)
    _bundled = [
        ("slanet-plus.onnx (table model)",
         _resources / "slanet_plus" / "slanet-plus.onnx"),
        ("fasttext langdetect model",
         _resources / "fasttext-langdetect" / "lid.176.ftz"),
        ("YOLO langdetect model",
         _resources / "yolov11-langdetect" / "yolo_v11_ft.pt"),
        ("model_configs.yaml (weight path map)",
         _resources / "model_config" / "model_configs.yaml"),
    ]
    for _lbl, _p in _bundled:
        if not _p.exists():
            record_fail(_lbl, f"expected inside wheel at {_p} β magic-pdf install may be corrupted")
            continue
        _sz = _p.stat().st_size / (1024 * 1024)
        ok(_lbl, f"{_sz:.2f} MB")
except Exception as _exc:
    record_fail("Bundled wheel resources check", f"unexpected: {_exc}", critical=False)
# ── 12. Temp storage ─────────────────────────────────────────────────────────
section("12. Temp storage")
try:
    # The context manager removes the directory even when the write or the
    # size check fails; the earlier mkdtemp/rmtree pair leaked it on error.
    with tempfile.TemporaryDirectory(prefix="mineru_validate_") as td:
        test_file = os.path.join(td, "write_test.bin")
        payload = b"x" * 4096
        with open(test_file, "wb") as f:
            f.write(payload)
        written = os.path.getsize(test_file)
        # Explicit check instead of `assert`: asserts are stripped under -O and
        # an AssertionError would produce an empty failure message.
        if written != len(payload):
            raise OSError(
                f"short write: expected {len(payload)} bytes, found {written}"
            )
    ok("Temp write + delete", tempfile.gettempdir())
except Exception as exc:
    record_fail("Temp storage", str(exc))
# ── 13. System memory (cgroups) ──────────────────────────────────────────────
section("13. System memory (cgroups)")

_MIB = 1024 * 1024


def _read_int_file(path: str) -> int:
    """Return the integer stored in *path*; raises OSError or ValueError."""
    with open(path) as fh:
        return int(fh.read().strip())


def _cgroup_v2_memory() -> "tuple[int, int] | None":
    """Return (total_mb, used_mb) from cgroups v2, or None if unusable."""
    try:
        with open("/sys/fs/cgroup/memory.max") as fh:
            raw = fh.read().strip()
        if raw == "max":  # no limit configured
            return None
        total = int(raw) // _MIB
        used = _read_int_file("/sys/fs/cgroup/memory.current") // _MIB
    except (OSError, ValueError):
        # OSError also covers PermissionError, which previously escaped and
        # aborted the whole validator.
        return None
    return total, used


def _cgroup_v1_memory() -> "tuple[int, int] | None":
    """Return (total_mb, used_mb) from cgroups v1, or None if unusable."""
    try:
        limit = _read_int_file("/sys/fs/cgroup/memory/memory.limit_in_bytes")
        used_bytes = _read_int_file("/sys/fs/cgroup/memory/memory.usage_in_bytes")
    except (OSError, ValueError):
        return None
    # Limits of 128 GiB or more are treated as "no limit set".
    if limit >= 128 * 1024 * _MIB:
        return None
    return limit // _MIB, used_bytes // _MIB


def _proc_meminfo_memory() -> "tuple[int, int] | None":
    """Return (total_mb, used_mb) from /proc/meminfo, or None if unusable."""
    info: dict[str, int] = {}
    try:
        with open("/proc/meminfo") as fh:
            for line in fh:
                parts = line.split()
                if len(parts) >= 2:
                    info[parts[0].rstrip(":")] = int(parts[1])
    except (OSError, ValueError):
        return None
    total_kb = info.get("MemTotal", 0)
    return total_kb // 1024, (total_kb - info.get("MemAvailable", 0)) // 1024


mem_source = "unknown"
total_mb = used_mb = 0
# Probe in order of preference; the first source that yields a positive total
# wins, so mem_source always names the source the figures came from.
for _source_name, _probe in (
    ("cgroups v2", _cgroup_v2_memory),
    ("cgroups v1", _cgroup_v1_memory),
    ("/proc/meminfo (may show host RAM)", _proc_meminfo_memory),
):
    _reading = _probe()
    if _reading is not None and _reading[0] > 0:
        total_mb, used_mb = _reading
        mem_source = _source_name
        break

if total_mb > 0:
    ok("Memory source", mem_source)
    ok("Total memory", f"{total_mb} MB")
    ok("Used memory", f"{used_mb} MB")
    ok("Free memory", f"{total_mb - used_mb} MB")
else:
    # Nothing was readable: say so instead of reporting "0 MB" as a pass.
    record_fail(
        "Memory",
        "could not determine memory from cgroups v2, cgroups v1 or /proc/meminfo",
        critical=False,
    )

if total_mb > 32 * 1024:
    record_fail(
        "Memory total",
        f"{total_mb} MB seems too large — cgroups may not be available; "
        "/proc/meminfo showing host RAM. Memory guard in main.py will be conservative.",
        critical=False,
    )
# ── 14. /proc/meminfo sanity ─────────────────────────────────────────────────
section("14. /proc/meminfo (reference)")
# Echo the first five lines of /proc/meminfo, with the numeric column divided
# by 1024 and labelled MB, purely as reference output.
try:
    with open("/proc/meminfo") as f:
        lines = f.readlines()[:5]
    for fields in map(str.split, lines):
        if len(fields) < 2:
            continue
        ok(fields[0].rstrip(":"), f"{int(fields[1]) // 1024} MB")
except Exception as exc:
    record_fail("/proc/meminfo", str(exc), critical=False)
# ─────────────────────────────────────────────────────────────────────────────
# Summary
# ─────────────────────────────────────────────────────────────────────────────
# Prints collected warnings and critical failures, then exits: status 1 when
# there are critical failures and --soft was not given, status 0 otherwise.
# The status glyphs and rule lines were mis-encoded and are restored here.
_summary_rule = "═" * 60
print("\n" + _summary_rule, flush=True)
print(" Validation Summary", flush=True)
print(_summary_rule, flush=True)

if warnings:
    print(f"\n ⚠ {len(warnings)} warning(s):", flush=True)
    for label, detail in warnings:
        print(f" • {label}: {detail}", flush=True)

if failures:
    print(f"\n ✗ {len(failures)} CRITICAL failure(s):", flush=True)
    for label, detail in failures:
        print(f" • {label}: {detail}", flush=True)
    if SOFT_MODE:
        # In --soft mode the script exits 0, so the old "Service will NOT
        # start" message contradicted what actually happens next.
        print("\n --soft given: continuing despite the failures above (exit code 0).", flush=True)
    else:
        print("\n Service will NOT start until these are resolved.", flush=True)
    print(" Check Dockerfile pip layers and Docker build log.", flush=True)
    print(_summary_rule + "\n", flush=True)
    if not SOFT_MODE:
        sys.exit(1)
else:
    print("\n ✓ All critical checks passed", flush=True)
    if warnings:
        print(f" ⚠ {len(warnings)} non-critical warning(s) — see above", flush=True)
    print("\n Service is ready to start.", flush=True)
    print(_summary_rule + "\n", flush=True)

sys.exit(0)