# ─────────────────────────────────────────────────────────────────────────────
# MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend)
#
# Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU
#
# ── OCR ROUTING ARCHITECTURE ──────────────────────────────────────────────────
#
# FAST PATH (images: jpg/png/webp/bmp/heic/etc)
#   rapidocr-onnxruntime ≥ 1.3.22
#     - Pure ONNX inference — no PaddleOCR / paddlepaddle needed
#     - Models bundled in the pip wheel (~50 MB); no first-use download
#     - Target latency: 1–5 s on CPU
#   Multi-pass: if RapidOCR confidence < 0.65 → MinerU fallback automatically
#
# HEAVY PATH (PDFs, multi-page, forms with layout)
#   MinerU (magic-pdf pipeline backend)
#     - Layout detection (doclayout_yolo)
#     - OCR (paddleocr2pytorch — PyTorch reimplementation bundled in wheel)
#     - Markdown reconstruction
#     - Target latency: 5–30 s on CPU
#
# ── ROOT CAUSE HISTORY ────────────────────────────────────────────────────────
#
# FAILURE 1: [full-cpu] is NOT a valid extra → pip silently installs base only
#   Fix: magic-pdf[full]==1.3.12
#
# FAILURE 2: opencv non-headless conflict
#   Fix: Layer 4 force-reinstall of opencv-python-headless
#
# FAILURE 3: ch_PP-OCRv3_det_infer.pth not in HF repo (repo updated to v5)
#   Fix: Layer 3.5 patches models_config.yml inside installed wheel:
#     ch_PP-OCRv3_det → ch_PP-OCRv5_det           (all ch* langs)
#     en_PP-OCRv3_det → Multilingual_PP-OCRv3_det (en, latin)
#   Arch safety: both replacement stems verified in arch_config.yaml
#
# ── System packages ────────────────────────────────────────────────────────────
#   libgl1        — OpenCV needs libGL.so.1 for ALL image operations (not just GUI)
#   libglib2.0-0  — GLib; required by OpenCV and many C extensions
#   libgomp1      — OpenMP; required by ONNX Runtime and YOLO inference
#   poppler-utils — pdfinfo/pdftoppm; used by MinerU PDF pre-processing
# ─────────────────────────────────────────────────────────────────────────────

FROM python:3.10-slim

# Interpreter behaviour, listening port and MinerU CPU/pipeline selection,
# declared together in a single ENV instruction.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=7860 \
    MINERU_DEVICE_MODE=cpu \
    MINERU_BACKEND=pipeline

# ── System dependencies ────────────────────────────────────────────────────────
# The apt package index is removed in the same layer so it never reaches the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        poppler-utils && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# ── Layer 1: FastAPI + lightweight runtime deps ───────────────────────────────
# rapidocr-onnxruntime: ONNX-based fast OCR engine; models bundled in wheel.
#   - requires onnxruntime (will be auto-resolved or overridden by magic-pdf deps)
#   - requires numpy, pyclipper, shapely — all covered by magic-pdf[full]
#   - ~50 MB wheel; zero first-use model download needed
# opencv-python-headless: placeholder; will be force-reinstalled in Layer 4
RUN pip install --no-cache-dir --timeout 300 \
        "fastapi>=0.115.0" \
        "uvicorn[standard]>=0.32.0" \
        "python-multipart>=0.0.12" \
        "Pillow>=10.0.0" \
        "pillow-heif>=0.18.0" \
        "huggingface_hub>=0.25.0" \
        "opencv-python-headless>=4.8.0" \
        "rapidocr-onnxruntime>=1.3.22" \
        "python-docx>=1.1.0" \
        "python-pptx>=0.6.23" \
        "openpyxl>=3.1.0"

# ── Layer 2: CPU-only PyTorch — MUST precede magic-pdf ───────────────────────
# PyPI serves the CUDA-enabled torch wheel by default (~2.5 GB).
# Installing from the official CPU wheel index first causes pip to treat the
# already-installed CPU build as satisfying magic-pdf's torch requirement.
RUN pip install --no-cache-dir --timeout 600 \
        --index-url https://download.pytorch.org/whl/cpu \
        "torch>=2.2.2,!=2.5.0,!=2.5.1,<3" \
        "torchvision>=0.15.2"

# ── Layer 3: magic-pdf with the CORRECT extras ────────────────────────────────
# [full] provides ultralytics, doclayout-yolo==0.0.2b1, rapid-table, shapely,
# pyclipper, omegaconf, matplotlib, ftfy, dill, PyYAML, openai, albumentations.
# doclayout-yolo==0.0.2b1 is ONLY on the myhloli index — not on PyPI.
# onnxruntime resolved automatically as transitive dep of rapid-table.
RUN pip install --no-cache-dir --timeout 600 \
        --extra-index-url https://myhloli.github.io/wheels/ \
        "magic-pdf[full]==1.3.12"

# ── Layer 3.5: Patch OCR model config ────────────────────────────────────────
# HF repo opendatalab/PDF-Extract-Kit-1.0 was updated to v5 det models.
# magic-pdf 1.3.12 models_config.yml still references v3 det files (absent).
# This patch runs at build time so download_models.py fetches correct files.
#
# NOTE: the heredoc form of RUN used below requires a BuildKit-enabled builder.
# The script exits non-zero (failing the build) if a replacement model has no
# matching key in arch_config.yaml, and warns on stderr if nothing was patched.
RUN python3 - <<'PYEOF'
import sys
from pathlib import Path

import yaml

import magic_pdf

# Both resource files ship inside the installed magic_pdf package.
pkg = Path(magic_pdf.__file__).parent
cfg_path = pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml'
arch_path = pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml'

print(f"Patching: {cfg_path}")
# Explicit UTF-8: the config is re-written with allow_unicode=True, so reading
# and writing must not depend on the build environment's locale encoding.
with open(cfg_path, encoding='utf-8') as f:
    config = yaml.safe_load(f)
with open(arch_path, encoding='utf-8') as f:
    arch_text = f.read()

# Old detection-model filename -> replacement filename.
DET_MAP = {
    'ch_PP-OCRv3_det_infer.pth': 'ch_PP-OCRv5_det_infer.pth',
    'en_PP-OCRv3_det_infer.pth': 'Multilingual_PP-OCRv3_det_infer.pth',
}

patched = 0
for lang, files in config['lang'].items():
    old = files.get('det', '')
    if old in DET_MAP:
        new = DET_MAP[old]
        # The arch config is looked up by the model filename without ".pth".
        arch_key = new.removesuffix('.pth')
        if (arch_key + ':') not in arch_text:
            print(f"ERROR: arch key '{arch_key}' not found in arch_config.yaml", file=sys.stderr)
            sys.exit(1)
        files['det'] = new
        print(f" [{lang}] det: {old} -> {new}")
        patched += 1

if patched == 0:
    # Not fatal, but a no-op patch most likely means the packaged config no
    # longer matches DET_MAP and this layer needs to be revisited.
    print("WARNING: no 'det' entry matched DET_MAP; nothing was patched.", file=sys.stderr)

with open(cfg_path, 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False, allow_unicode=True)
print(f"Patched {patched} language entries. models_config.yml updated.")
PYEOF

# ── Layer 4: Restore headless OpenCV ─────────────────────────────────────────
# Layer 3 pulled opencv-python (non-headless) via doclayout-yolo/ultralytics/
# rapid-table. Force-reinstall headless build so cv2 works on this slim image.
RUN pip install --no-cache-dir --timeout 300 --force-reinstall \
        "opencv-python-headless>=4.8.0"

# ── Application code ──────────────────────────────────────────────────────────
# All four files land in the current working directory; the entrypoint script
# is then marked executable.
COPY download_models.py validate.py main.py entrypoint.sh ./
RUN chmod +x entrypoint.sh

# ── Download models at build time ─────────────────────────────────────────────
# MFR (formula recognition, ~1-2 GB) excluded — disabled in config.
# rapidocr-onnxruntime models are BUNDLED in the pip wheel; no download needed.
RUN python download_models.py

# Keep a copy of the magic-pdf config under /app/config.
# NOTE(review): presumably download_models.py is what creates
# /root/magic-pdf.json — confirm against that script.
RUN mkdir -p /app/config && \
    cp /root/magic-pdf.json /app/config/magic-pdf.json

# ── Runtime ───────────────────────────────────────────────────────────────────
EXPOSE 7860
ENTRYPOINT ["/app/entrypoint.sh"]