Spaces:
Sleeping
Sleeping
File size: 12,966 Bytes
d42d358 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 | #!/usr/bin/env python3
"""
Pre-flight validation script for MinerU OCR Service.
Run by entrypoint.sh BEFORE uvicorn starts.
Exits 0 if all checks pass.
Exits 1 if any CRITICAL check fails — this crashes the container loudly
so Hugging Face logs show an actionable error instead of a silent crash
or a healthy-looking service that fails on every request.
Usage:
python validate.py # run all checks, exit 0/1
python validate.py --soft # run all checks, always exit 0 (log only)
"""
import importlib
import json
import os
import shutil
import sys
import tempfile
import time
import traceback
# Soft mode: "--soft" anywhere on the command line means "log problems but
# never exit 1".
SOFT_MODE = any(arg == "--soft" for arg in sys.argv)

# Locations the checks below inspect.
MODELS_DIR = "/app/models"
EXTRACT_KIT_MODELS = os.path.join(
    os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0"),
    "models",
)
# Canary directory: used as the "Layout models" entry in the model-file checks.
LAYOUT_MARKER = os.path.join(EXTRACT_KIT_MODELS, "Layout")
CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")
# ── helpers ───────────────────────────────────────────────────────────────────
def ok(label: str, detail: str = "") -> None:
    """Print a success line for *label*.

    When *detail* is non-empty it is appended in parentheses after the label.
    Output is flushed immediately.
    """
    if detail:
        line = f" β {label} ({detail})"
    else:
        line = f" β {label}"
    print(line, flush=True)
def fail(label: str, detail: str, critical: bool = True) -> None:
    """Print a failure line for *label* with *detail*.

    The line is tagged ``CRITICAL`` when *critical* is true and ``WARNING``
    otherwise.  This only prints; it does not record anything.
    """
    if critical:
        severity = "CRITICAL"
    else:
        severity = "WARNING"
    print(f" β [{severity}] {label}: {detail}", flush=True)
def section(title: str) -> None:
    """Print a section header: a blank line, a rule, *title*, and a second rule.

    Each rule is the rule glyph repeated 60 times.  Every line is flushed as
    soon as it is printed.
    """
    rule = "β" * 60
    print("\n" + rule, flush=True)
    print(" " + title, flush=True)
    print(rule, flush=True)
# ── check registry ────────────────────────────────────────────────────────────
# Both lists hold (label, detail) pairs; the summary at the end of the script
# prints them and picks the exit code from `failures`.
failures: list[tuple[str, str]] = []
warnings: list[tuple[str, str]] = []


def record_fail(label: str, detail: str, critical: bool = True) -> None:
    """Print a failure via ``fail()`` and remember it for the summary.

    Critical failures are appended to ``failures``; non-critical ones to
    ``warnings``.
    """
    fail(label, detail, critical)
    bucket = failures if critical else warnings
    bucket.append((label, detail))
# ══════════════════════════════════════════════════════════════════════════════
# Opening banner: blank line + rule, title, rule (each flushed on its own).
for _banner_line in (
    "\n" + "β" * 60,
    " MinerU OCR Service β Pre-flight Validation",
    "β" * 60,
):
    print(_banner_line, flush=True)
# ── 1. Python version ─────────────────────────────────────────────────────────
section("1. Python runtime")
pv = sys.version_info
if pv < (3, 10):
    # Too old: recorded as a critical failure.
    record_fail(
        "Python version",
        f"{pv.major}.{pv.minor} detected β magic-pdf requires >= 3.10",
    )
else:
    ok("Python version", f"{pv.major}.{pv.minor}.{pv.micro}")
# ── 2. cv2 ────────────────────────────────────────────────────────────────────
section("2. OpenCV (cv2)")


def _cv2_gui_backends(build_info: str) -> list[str]:
    """Return the GTK/Qt entries an OpenCV build report marks as enabled.

    The report is treated as a list of ``key: value`` lines.  A plain
    substring search for "GTK" or "Qt" also matches entries that are switched
    off (e.g. ``GTK+: NO``) and unrelated text such as paths, so only these
    lines count:

    * a line whose key's first word is GTK/QT (optionally followed by ``+``
      and/or digits), or
    * the ``GUI`` summary line when its value mentions GTK or QT,

    and only when the value is non-empty and does not start with "NO"
    (which covers both ``NO`` and ``NONE``).

    Args:
        build_info: text returned by ``cv2.getBuildInformation()``.

    Returns:
        The matching ``"key: value"`` strings, in report order; empty when no
        GTK/Qt backend is enabled.
    """
    enabled: list[str] = []
    for raw_line in build_info.splitlines():
        key, sep, value = raw_line.partition(":")
        if not sep:
            continue
        key, value = key.strip(), value.strip()
        if not value or value.upper().startswith("NO"):
            continue
        words = key.upper().split()
        if not words:
            continue
        names_toolkit = words[0].rstrip("+0123456789") in ("GTK", "QT")
        gui_summary = words == ["GUI"] and any(
            toolkit in value.upper() for toolkit in ("GTK", "QT")
        )
        if names_toolkit or gui_summary:
            enabled.append(f"{key}: {value}")
    return enabled


try:
    import cv2
    ok("cv2 import", f"version {cv2.__version__}")
    # Confirm headless (no X11 dep) by inspecting the build report.
    build = cv2.getBuildInformation()
    if _cv2_gui_backends(build):
        # Non-critical: recorded as a warning only.
        record_fail("cv2 build", "GUI backend detected β use opencv-python-headless",
                    critical=False)
    else:
        ok("cv2 headless", "no GUI backend detected")
except ImportError as exc:
    record_fail(
        "cv2 import",
        f"{exc}. "
        "Add 'opencv-python-headless>=4.8.0' to Dockerfile pip layer 1 "
        "BEFORE magic-pdf install.",
    )
except Exception as exc:
    # Anything else raised while importing or querying cv2 is also critical.
    record_fail("cv2 import", f"unexpected error: {exc}")
# ── 3. magic_pdf core ─────────────────────────────────────────────────────────
section("3. magic_pdf core imports")
# (module path, attribute names that module must expose)
REQUIRED_IMPORTS = [
    (
        "magic_pdf.data.dataset",
        ["PymuDocDataset", "ImageDataset"],
    ),
    (
        "magic_pdf.data.data_reader_writer",
        ["FileBasedDataReader", "FileBasedDataWriter"],
    ),
    (
        "magic_pdf.model.doc_analyze_by_custom_model",
        ["doc_analyze"],
    ),
    (
        "magic_pdf.config.enums",
        ["SupportedPdfParseMethod"],
    ),
]
for module_path, symbols in REQUIRED_IMPORTS:
    try:
        module = importlib.import_module(module_path)
        absent = [name for name in symbols if not hasattr(module, name)]
        if not absent:
            ok(module_path, ", ".join(symbols))
        else:
            record_fail(module_path, f"missing symbols: {absent}")
    except ImportError as exc:
        record_fail(module_path, str(exc))
    except Exception as exc:
        # A module that raises something else while importing is still a
        # critical failure.
        record_fail(module_path, f"unexpected: {exc}")
# Confirm removed/deprecated imports are truly gone
section("3b. Deprecated API check (should NOT exist)")
OBSOLETE = [
    "magic_pdf.pipe.UNIPipe",
    "magic_pdf.rw.DiskReaderWriter",
]
for mod_path in OBSOLETE:
    try:
        importlib.import_module(mod_path)
    except ImportError:
        # Expected outcome: the legacy module no longer exists.
        ok(f"{mod_path} (correctly absent)")
    except Exception as exc:
        # The module failed with something other than ImportError while
        # being imported.  Report it as a warning instead of letting the
        # exception abort the whole validation run, matching how the
        # required-import check handles unexpected errors.
        record_fail(mod_path, f"unexpected: {exc}", critical=False)
    else:
        record_fail(mod_path, "still importable β code may use old API", critical=False)
# ── 4. Config file ────────────────────────────────────────────────────────────
section("4. MinerU config (magic-pdf.json)")
if not os.path.exists(CONFIG_PATH):
    record_fail(
        "Config file",
        f"not found at {CONFIG_PATH}. "
        "Run download_models.py or check Docker build log.",
    )
else:
    try:
        with open(CONFIG_PATH) as f:
            cfg = json.load(f)
        required_keys = ["models-dir", "device-mode"]
        missing_keys = [key for key in required_keys if key not in cfg]
        if not missing_keys:
            # Echo the settings so they show up in the startup log.
            ok("Config file", CONFIG_PATH)
            ok("device-mode", cfg.get("device-mode", "?"))
            ok("models-dir", cfg.get("models-dir", "?"))
            formula_cfg = cfg.get("formula-config", {})
            ok("formula-enable", str(formula_cfg.get("enable", "?")))
            table_cfg = cfg.get("table-config", {})
            ok("table-enable", str(table_cfg.get("enable", "?")))
        else:
            record_fail("Config keys", f"missing: {missing_keys}")
    except json.JSONDecodeError as exc:
        record_fail("Config file", f"invalid JSON: {exc}")
    except Exception as exc:
        # Any other error while reading or inspecting the file is critical too.
        record_fail("Config file", str(exc))
# ── 5. Model files ────────────────────────────────────────────────────────────
section("5. Model files")
# (label, directory that must exist) -- a missing directory is critical.
model_checks = [
    ("PDF-Extract-Kit-1.0 root", os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0")),
    ("Layout models (canary)", LAYOUT_MARKER),
    ("MFD models", os.path.join(EXTRACT_KIT_MODELS, "MFD")),
    ("Table models", os.path.join(EXTRACT_KIT_MODELS, "TabRec")),
]
for label, path in model_checks:
    if not os.path.isdir(path):
        record_fail(label, f"directory not found: {path}")
        continue
    try:
        # Entry count is shown purely as a sanity hint.
        entry_count = len(list(os.scandir(path)))
        ok(label, f"{path} ({entry_count} entries)")
    except OSError:
        # Directory exists but could not be listed: report it without a count.
        ok(label, path)

# layoutreader is optional: its absence is only a warning.
lr_dir = os.path.join(MODELS_DIR, "layoutreader")
if not os.path.isdir(lr_dir):
    record_fail(
        "layoutreader (optional)",
        "not found β MinerU will use fallback ordering (non-critical)",
        critical=False,
    )
else:
    ok("layoutreader (optional)", lr_dir)

# Validate that the config's models-dir points to an existing path.
try:
    with open(CONFIG_PATH) as f:
        cfg = json.load(f)
    cfg_models = cfg.get("models-dir", "")
    if cfg_models:
        if os.path.isdir(cfg_models):
            ok("Config models-dir exists", cfg_models)
        else:
            record_fail("Config models-dir", f"points to missing path: {cfg_models}")
except Exception:
    # Deliberately silent: problems reading the config are reported by the
    # config-file check in section 4.
    pass
# ── 6. Temp storage ───────────────────────────────────────────────────────────
section("6. Temp storage")
try:
    # TemporaryDirectory removes the directory on exit even when the write or
    # the size check fails, so a failed run does not leave files behind.
    with tempfile.TemporaryDirectory(prefix="mineru_validate_") as td:
        test_file = os.path.join(td, "write_test.bin")
        payload = b"x" * 4096
        with open(test_file, "wb") as f:
            f.write(payload)
        written = os.path.getsize(test_file)
        # Explicit check instead of `assert`: asserts are stripped under
        # `python -O`, and a bare AssertionError has an empty message.
        if written != len(payload):
            raise OSError(
                f"short write: expected {len(payload)} bytes, found {written}"
            )
    ok("Temp write + delete", tempfile.gettempdir())
except Exception as exc:
    # Create, write, size check and cleanup errors all land here as critical.
    record_fail("Temp storage", str(exc))
# ── 7. System memory ──────────────────────────────────────────────────────────
section("7. System memory (cgroups)")


def _read_int_file(path: str) -> "int | None":
    """Return the integer stored in *path*, or None if it is unavailable.

    None is returned when the file cannot be opened or read (missing,
    permission denied, any other OSError) or when its content is not an
    integer -- for example the literal "max" in cgroups v2 ``memory.max``.
    """
    try:
        with open(path) as fh:
            return int(fh.read().strip())
    except (OSError, ValueError):
        return None


_BYTES_PER_MB = 1024 * 1024
mem_source = "unknown"
total_mb = used_mb = 0

# 1) cgroups v2.  A non-numeric limit ("max") reads as None and is skipped.
_v2_limit = _read_int_file("/sys/fs/cgroup/memory.max")
if _v2_limit:
    total_mb = _v2_limit // _BYTES_PER_MB
if total_mb:
    used_mb = (_read_int_file("/sys/fs/cgroup/memory.current") or 0) // _BYTES_PER_MB
    mem_source = "cgroups v2"

# 2) cgroups v1.  Both files must be readable.  Limits at or above 128 GiB
#    are ignored (v1 reports an unlimited cgroup as a very large number).
if total_mb == 0:
    _v1_limit = _read_int_file("/sys/fs/cgroup/memory/memory.limit_in_bytes")
    _v1_usage = _read_int_file("/sys/fs/cgroup/memory/memory.usage_in_bytes")
    if (
        _v1_limit is not None
        and _v1_usage is not None
        and _v1_limit < 128 * 1024 * 1024 * 1024
    ):
        total_mb = _v1_limit // _BYTES_PER_MB
        used_mb = _v1_usage // _BYTES_PER_MB
        if total_mb:
            mem_source = "cgroups v1"

# 3) /proc/meminfo fallback (values are in kB).
if total_mb == 0:
    try:
        info: dict[str, int] = {}
        with open("/proc/meminfo") as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    info[parts[0].rstrip(":")] = int(parts[1])
    except (OSError, ValueError):
        pass  # best effort: the undetermined total is reported below
    else:
        total_mb = info.get("MemTotal", 0) // 1024
        used_mb = (info.get("MemTotal", 0) - info.get("MemAvailable", 0)) // 1024
        if total_mb:
            mem_source = "/proc/meminfo (may show host RAM)"

ok("Memory source", mem_source)
if total_mb > 0:
    ok("Total memory", f"{total_mb} MB")
    ok("Used memory", f"{used_mb} MB")
    ok("Free memory", f"{total_mb - used_mb} MB")
else:
    # Do not print "0 MB" as if it were a successful measurement.
    record_fail(
        "Memory total",
        "could not be determined from cgroups or /proc/meminfo",
        critical=False,
    )

# The warning text blames /proc/meminfo for showing host RAM, so only emit it
# when the figure really came from there.
if total_mb > 32 * 1024 and mem_source.startswith("/proc/meminfo"):
    record_fail(
        "Memory total",
        f"{total_mb} MB seems too large for a container β "
        "cgroups may not be available; /proc/meminfo is showing host RAM. "
        "Memory guard in main.py will be conservative.",
        critical=False,
    )
# ── 8. /proc/meminfo sanity ───────────────────────────────────────────────────
section("8. /proc/meminfo (for reference)")
try:
    with open("/proc/meminfo") as f:
        lines = f.readlines()[:5]
    # Echo the first five entries, converting the kB figure to MB.
    for fields in map(str.split, lines):
        if len(fields) < 2:
            continue
        name, kb_text = fields[0], fields[1]
        ok(name.rstrip(":"), f"{int(kb_text) // 1024} MB")
except Exception as exc:
    # Reference output only, so any problem is just a warning.
    record_fail("/proc/meminfo", str(exc), critical=False)
# ══════════════════════════════════════════════════════════════════════════════
# Summary
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "β" * 60, flush=True)
print(" Validation Summary", flush=True)
print("β" * 60, flush=True)

if warnings:
    print(f"\n β {len(warnings)} warning(s):", flush=True)
    for label, detail in warnings:
        print(f" β’ {label}: {detail}", flush=True)

if not failures:
    # Success path: always exits 0.
    print("\n β All critical checks passed", flush=True)
    if warnings:
        print(f" β {len(warnings)} non-critical warning(s) β see above", flush=True)
    print("\n Service is ready to start.", flush=True)
    print("β" * 60 + "\n", flush=True)
    sys.exit(0)

# Failure path: list every critical failure, then exit 1 unless --soft was
# given (in soft mode the script simply runs off the end).
print(f"\n β {len(failures)} CRITICAL failure(s):", flush=True)
for label, detail in failures:
    print(f" β’ {label}: {detail}", flush=True)
print("\n Service will NOT start until these are resolved.", flush=True)
print(" Check Dockerfile pip layers and Docker build log.", flush=True)
print("β" * 60 + "\n", flush=True)
if not SOFT_MODE:
    sys.exit(1)
|