File size: 12,966 Bytes
d42d358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#!/usr/bin/env python3
"""
Pre-flight validation script for MinerU OCR Service.

Run by entrypoint.sh BEFORE uvicorn starts.
Exits 0 if all checks pass.
Exits 1 if any CRITICAL check fails — this crashes the container loudly
so Hugging Face logs show an actionable error instead of a silent crash
or a healthy-looking service that fails on every request.

Usage:
    python validate.py           # run all checks, exit 0/1
    python validate.py --soft    # run all checks, always exit 0 (log only)
"""

import importlib
import json
import os
import shutil
import sys
import tempfile
import time
import traceback

# With --soft on the command line, problems are still printed but the script
# never exits with status 1.
SOFT_MODE = any(arg == "--soft" for arg in sys.argv)

# Filesystem locations inspected by the checks below.
MODELS_DIR = "/app/models"
EXTRACT_KIT_MODELS = os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0", "models")
# The Layout directory serves as the canary for the model files.
LAYOUT_MARKER = os.path.join(EXTRACT_KIT_MODELS, "Layout")
CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")


# ── helpers ────────────────────────────────────────────────────────────────────
def ok(label: str, detail: str = "") -> None:
    """Print one passing-check line to stdout and flush immediately.

    Args:
        label: Short name of the check that passed.
        detail: Optional extra information; when non-empty it is appended in
            parentheses after the label.
    """
    suffix = f"  ({detail})" if detail else ""
    # The marker is a real check mark (U+2713); the previous literal was the
    # mis-decoded form of that same character.
    print(f"  ✓  {label}{suffix}", flush=True)


def fail(label: str, detail: str, critical: bool = True) -> None:
    """Print one failing-check line to stdout and flush immediately.

    This only prints; it does not record the failure anywhere.

    Args:
        label: Short name of the check that failed.
        detail: Explanation shown after the label.
        critical: When true the line is tagged ``CRITICAL``, otherwise
            ``WARNING``.
    """
    tag = "CRITICAL" if critical else "WARNING"
    # The marker is a real ballot X (U+2717); the previous literal was the
    # mis-decoded form of that same character.
    print(f"  ✗  [{tag}] {label}: {detail}", flush=True)


def section(title: str) -> None:
    """Print a section heading: a blank line, a 60-column rule, the indented
    title, and a closing rule. Every line is flushed as it is written."""
    rule = "─" * 60
    for text in (f"\n{rule}", f"  {title}", rule):
        print(text, flush=True)


# ── check registry ─────────────────────────────────────────────────────────────
# Collected (label, detail) pairs; both lists are read by the final summary.
failures: list[tuple[str, str]] = []   # critical problems
warnings: list[tuple[str, str]] = []   # non-critical problems


def record_fail(label: str, detail: str, critical: bool = True) -> None:
    """Print a failed check and remember it for the summary.

    The line is printed through ``fail``; the ``(label, detail)`` pair is then
    appended to ``failures`` when ``critical`` is true, else to ``warnings``.
    """
    fail(label, detail, critical)
    bucket = failures if critical else warnings
    bucket.append((label, detail))


# ═══════════════════════════════════════════════════════════════════════════════
# Opening banner. The title uses a real em dash (U+2014); the previous literal
# was the mis-decoded form of that character.
print("\n" + "═" * 60, flush=True)
print("  MinerU OCR Service — Pre-flight Validation", flush=True)
print("═" * 60, flush=True)

# ── 1. Python version ──────────────────────────────────────────────────────────
section("1. Python runtime")
# sys.version_info compares as a tuple, so (3, 10) is the inclusive minimum.
pv = sys.version_info
if pv >= (3, 10):
    ok("Python version", f"{pv.major}.{pv.minor}.{pv.micro}")
else:
    # Critical failure. The message uses a real em dash instead of the
    # mis-decoded byte sequence the literal previously contained.
    record_fail("Python version",
                f"{pv.major}.{pv.minor} detected — magic-pdf requires >= 3.10")

# ── 2. cv2 ─────────────────────────────────────────────────────────────────────
section("2. OpenCV (cv2)")
try:
    import cv2
    ok("cv2 import", f"version {cv2.__version__}")

    # Confirm headless (no X11 dep) by checking build info.
    # A plain substring test is not reliable: a report row such as "GTK+: NO"
    # names the toolkit while saying it is disabled, and a case-sensitive
    # "Qt" test can miss a "QT:" row. Parse the "key: value" rows instead.
    # NOTE(review): the row formats are taken from OpenCV's CMake status
    # report; confirm against the build shipped in the image.
    gui_rows = []
    for row in cv2.getBuildInformation().splitlines():
        key, sep, value = row.partition(":")
        if not sep:
            continue
        key = key.strip().upper()
        value = value.strip().upper()
        if key == "GUI":
            # Summary row, e.g. "GUI: GTK3", "GUI: QT5" or "GUI: NONE".
            enabled = "GTK" in value or "QT" in value
        elif key.startswith(("GTK", "QT")):
            # Per-toolkit row: anything other than empty or "NO" is enabled.
            enabled = bool(value) and not value.startswith("NO")
        else:
            continue
        if enabled:
            gui_rows.append(row.strip())

    if gui_rows:
        record_fail("cv2 build", "GUI backend detected — use opencv-python-headless",
                    critical=False)
    else:
        ok("cv2 headless", "no GUI backend detected")
except ImportError as exc:
    record_fail(
        "cv2 import",
        f"{exc}. "
        "Add 'opencv-python-headless>=4.8.0' to Dockerfile pip layer 1 "
        "BEFORE magic-pdf install.",
    )
except Exception as exc:
    # Anything else raised while importing or inspecting cv2 is reported as a
    # critical failure rather than aborting the validation run.
    record_fail("cv2 import", f"unexpected error: {exc}")

# ── 3. magic_pdf core ──────────────────────────────────────────────────────────
section("3. magic_pdf core imports")

# Each entry: (module to import, attribute names that module must expose).
REQUIRED_IMPORTS = [
    ("magic_pdf.data.dataset", ["PymuDocDataset", "ImageDataset"]),
    ("magic_pdf.data.data_reader_writer", ["FileBasedDataReader", "FileBasedDataWriter"]),
    ("magic_pdf.model.doc_analyze_by_custom_model", ["doc_analyze"]),
    ("magic_pdf.config.enums", ["SupportedPdfParseMethod"]),
]

for dotted_name, wanted in REQUIRED_IMPORTS:
    try:
        module = importlib.import_module(dotted_name)
        absent = [name for name in wanted if not hasattr(module, name)]
        if not absent:
            ok(dotted_name, ", ".join(wanted))
        else:
            record_fail(dotted_name, f"missing symbols: {absent}")
    except ImportError as err:
        # The module (or one of its own dependencies) could not be imported.
        record_fail(dotted_name, str(err))
    except Exception as err:
        # Import-time errors of any other kind are still critical failures.
        record_fail(dotted_name, f"unexpected: {err}")

# Confirm removed/deprecated imports are truly gone
section("3b. Deprecated API check (should NOT exist)")
OBSOLETE = [
    "magic_pdf.pipe.UNIPipe",
    "magic_pdf.rw.DiskReaderWriter",
]
for mod_path in OBSOLETE:
    try:
        importlib.import_module(mod_path)
    except ImportError:
        # Expected outcome: the old module path no longer resolves.
        ok(f"{mod_path} (correctly absent)")
    except Exception as exc:
        # A module that exists but raises something other than ImportError
        # must not abort the whole validation run; report it as a warning,
        # as the core-import check does for unexpected errors.
        record_fail(mod_path, f"present but failed to import: {exc}",
                    critical=False)
    else:
        record_fail(mod_path, "still importable — code may use old API", critical=False)

# ── 4. Config file ─────────────────────────────────────────────────────────────
section("4. MinerU config (magic-pdf.json)")
if os.path.exists(CONFIG_PATH):
    try:
        # Read as UTF-8 explicitly so parsing does not depend on the
        # container's locale encoding.
        with open(CONFIG_PATH, encoding="utf-8") as f:
            cfg = json.load(f)
        if not isinstance(cfg, dict):
            # Valid JSON, but not an object: key lookups below cannot work.
            record_fail(
                "Config file",
                f"top-level JSON value must be an object, got {type(cfg).__name__}",
            )
        else:
            required_keys = ["models-dir", "device-mode"]
            missing_keys = [k for k in required_keys if k not in cfg]
            if missing_keys:
                record_fail("Config keys", f"missing: {missing_keys}")
            else:
                ok("Config file", CONFIG_PATH)
                ok("device-mode", cfg.get("device-mode", "?"))
                ok("models-dir",  cfg.get("models-dir",  "?"))
                # These two lines are informational only. A sub-config that
                # is absent or not an object (e.g. null) is shown as "?"
                # instead of raising and being recorded as a critical error.
                for shown_as, key in (("formula-enable", "formula-config"),
                                      ("table-enable", "table-config")):
                    sub_cfg = cfg.get(key, {})
                    enabled = sub_cfg.get("enable", "?") if isinstance(sub_cfg, dict) else "?"
                    ok(shown_as, str(enabled))
    except json.JSONDecodeError as exc:
        record_fail("Config file", f"invalid JSON: {exc}")
    except Exception as exc:
        # Unreadable file, wrong encoding, etc.
        record_fail("Config file", str(exc))
else:
    record_fail(
        "Config file",
        f"not found at {CONFIG_PATH}. "
        "Run download_models.py or check Docker build log.",
    )

# ── 5. Model files ─────────────────────────────────────────────────────────────
section("5. Model files")

# (label shown in the report, directory that must exist)
model_checks = [
    ("PDF-Extract-Kit-1.0 root", os.path.join(MODELS_DIR, "PDF-Extract-Kit-1.0")),
    ("Layout models (canary)", LAYOUT_MARKER),
    ("MFD models", os.path.join(EXTRACT_KIT_MODELS, "MFD")),
    ("Table models", os.path.join(EXTRACT_KIT_MODELS, "TabRec")),
]

for name, directory in model_checks:
    if not os.path.isdir(directory):
        record_fail(name, f"directory not found: {directory}")
        continue
    try:
        # The entry count is reported for information only.
        n = len(os.listdir(directory))
        ok(name, f"{directory}  ({n} entries)")
    except OSError:
        # Directory exists but cannot be listed: still report it as present.
        ok(name, directory)

# layoutreader — optional: its absence is recorded as a warning, never as a
# critical failure.
lr_dir = os.path.join(MODELS_DIR, "layoutreader")
if os.path.isdir(lr_dir):
    ok("layoutreader (optional)", lr_dir)
else:
    # The message uses a real em dash in place of the mis-decoded sequence.
    record_fail("layoutreader (optional)",
                "not found — MinerU will use fallback ordering (non-critical)",
                critical=False)

# Validate config models-dir points to existing path
try:
    with open(CONFIG_PATH, encoding="utf-8") as f:
        cfg = json.load(f)
    # A missing key is reported by the config-file check; here only a key that
    # is present is examined, so the same problem is not reported twice.
    if "models-dir" in cfg:
        cfg_models = cfg["models-dir"]
        if not cfg_models:
            # Previously an empty/null value fell through without any
            # output, so a broken config looked healthy.
            record_fail("Config models-dir", "is set but empty")
        elif os.path.isdir(cfg_models):
            ok("Config models-dir exists", cfg_models)
        else:
            record_fail("Config models-dir", f"points to missing path: {cfg_models}")
except Exception:
    # Deliberately silent: an unreadable or unparsable config file has
    # already been reported by the config-file check.
    pass

# ── 6. Temp storage ────────────────────────────────────────────────────────────
section("6. Temp storage")
try:
    # TemporaryDirectory removes the directory on exit from the with-block
    # even when the write or the size check fails, so a failed run no longer
    # leaves a mineru_validate_* directory behind.
    with tempfile.TemporaryDirectory(prefix="mineru_validate_") as td:
        test_file = os.path.join(td, "write_test.bin")
        with open(test_file, "wb") as f:
            f.write(b"x" * 4096)
        written = os.path.getsize(test_file)
        # Explicit check instead of `assert`: asserts are stripped under
        # `python -O` and produce an empty failure message.
        if written != 4096:
            raise OSError(f"wrote 4096 bytes but file size is {written}")
    ok("Temp write + delete", tempfile.gettempdir())
except Exception as exc:
    record_fail("Temp storage", str(exc))

# ── 7. System memory ───────────────────────────────────────────────────────────
section("7. System memory (cgroups)")
mem_source = "unknown"
total_mb = used_mb = 0
_MIB = 1024 * 1024

# Source 1: cgroups v2. The literal "max" is treated as "no limit set".
# Both files are read before anything is assigned, so a partial read falls
# through to the next source instead of leaving a total with no usage.
# OSError (not just FileNotFoundError) is caught so an unreadable file, e.g.
# a permission error, cannot abort the validation run.
try:
    with open("/sys/fs/cgroup/memory.max") as f:
        raw = f.read().strip()
    if raw != "max":
        limit = int(raw)
        with open("/sys/fs/cgroup/memory.current") as f:
            used_bytes = int(f.read().strip())
        total_mb = limit // _MIB
        used_mb = used_bytes // _MIB
        mem_source = "cgroups v2"
except (OSError, ValueError):
    pass

# Source 2: cgroups v1. Limits of 128 GiB or more are ignored here, i.e.
# treated as "effectively unlimited".
if total_mb == 0:
    try:
        with open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f:
            limit = int(f.read().strip())
        with open("/sys/fs/cgroup/memory/memory.usage_in_bytes") as f:
            used_bytes = int(f.read().strip())
        if limit < 128 * 1024 * _MIB:
            total_mb = limit // _MIB
            used_mb = used_bytes // _MIB
            mem_source = "cgroups v1"
    except (OSError, ValueError):
        pass

# Source 3: /proc/meminfo (values are in kB).
if total_mb == 0:
    try:
        info: dict[str, int] = {}
        with open("/proc/meminfo") as f:
            for line in f:
                parts = line.split()
                if len(parts) >= 2:
                    info[parts[0].rstrip(":")] = int(parts[1])
        meminfo_total_mb = info.get("MemTotal", 0) // 1024
        if meminfo_total_mb > 0:
            total_mb = meminfo_total_mb
            used_mb = (info.get("MemTotal", 0) - info.get("MemAvailable", 0)) // 1024
            mem_source = "/proc/meminfo (may show host RAM)"
    except (OSError, ValueError):
        pass

if total_mb > 0:
    ok("Memory source", mem_source)
    ok("Total memory",  f"{total_mb} MB")
    ok("Used memory",   f"{used_mb} MB")
    ok("Free memory",   f"{total_mb - used_mb} MB")
else:
    # Nothing could be read: say so instead of printing "0 MB" as a pass.
    record_fail(
        "Memory detection",
        "could not determine memory from cgroups v2, cgroups v1 or /proc/meminfo",
        critical=False,
    )

if total_mb > 32 * 1024:
    # Only blame /proc/meminfo when that is where the number came from.
    if mem_source.startswith("/proc/meminfo"):
        reason = "cgroups may not be available; /proc/meminfo is showing host RAM. "
    else:
        reason = f"value reported by {mem_source}; check the container memory limit. "
    record_fail(
        "Memory total",
        f"{total_mb} MB seems too large for a container — "
        + reason
        + "Memory guard in main.py will be conservative.",
        critical=False,
    )

# ── 8. /proc/meminfo sanity ────────────────────────────────────────────────────
section("8. /proc/meminfo (for reference)")
try:
    with open("/proc/meminfo") as meminfo:
        # Keep only the first five rows of the file.
        head = [row for _, row in zip(range(5), meminfo)]
    for row in head:
        fields = row.split()
        if len(fields) < 2:
            continue
        # Second field is divided by 1024 and shown as MB.
        ok(fields[0].rstrip(":"), f"{int(fields[1]) // 1024} MB")
except Exception as exc:
    # Purely informational section: any problem is only a warning.
    record_fail("/proc/meminfo", str(exc), critical=False)

# ═══════════════════════════════════════════════════════════════════════════════
# Summary
# ═══════════════════════════════════════════════════════════════════════════════
# Final report: list warnings, then critical failures, then choose the exit
# status. The bullet, check and cross glyphs are the real characters; the
# previous literals held their mis-decoded byte sequences.
print("\n" + "═" * 60, flush=True)
print("  Validation Summary", flush=True)
print("═" * 60, flush=True)

if warnings:
    print(f"\n  ⚠  {len(warnings)} warning(s):", flush=True)
    for label, detail in warnings:
        print(f"     • {label}: {detail}", flush=True)

if failures:
    print(f"\n  ✗  {len(failures)} CRITICAL failure(s):", flush=True)
    for label, detail in failures:
        print(f"     • {label}: {detail}", flush=True)
    if SOFT_MODE:
        # In --soft mode the script does not exit 1, so claiming that the
        # service will not start would be misleading.
        print("\n  --soft mode: exiting 0 despite the failures above.", flush=True)
    else:
        print("\n  Service will NOT start until these are resolved.", flush=True)
    print("  Check Dockerfile pip layers and Docker build log.", flush=True)
    print("═" * 60 + "\n", flush=True)
    if not SOFT_MODE:
        sys.exit(1)
else:
    print("\n  ✓  All critical checks passed", flush=True)
    if warnings:
        print(f"  ⚠  {len(warnings)} non-critical warning(s) — see above", flush=True)
    print("\n  Service is ready to start.", flush=True)
    print("═" * 60 + "\n", flush=True)
    sys.exit(0)