# Source: Hugging Face Space crazylemonade/openskill-ocr (status: Running)
# File size: 8,396 Bytes · revision 0ad3f89

# ─────────────────────────────────────────────────────────────────────────────
# MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend)
#
# Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU
#
# ── OCR ROUTING ARCHITECTURE ──────────────────────────────────────────────────
#
# FAST PATH  (images: jpg/png/webp/bmp/heic/etc)
#   rapidocr-onnxruntime ≥ 1.3.22
#     - Pure ONNX inference — no PaddleOCR / paddlepaddle needed
#     - Models bundled in the pip wheel (~50 MB); no first-use download
#     - Target latency: 1–5 s on CPU
#   Multi-pass: if RapidOCR confidence < 0.65 → MinerU fallback automatically
#
# HEAVY PATH (PDFs, multi-page, forms with layout)
#   MinerU (magic-pdf pipeline backend)
#     - Layout detection (doclayout_yolo)
#     - OCR (paddleocr2pytorch — PyTorch reimplementation bundled in wheel)
#     - Markdown reconstruction
#     - Target latency: 5–30 s on CPU
#
# ── ROOT CAUSE HISTORY ────────────────────────────────────────────────────────
#
# FAILURE 1: [full-cpu] is NOT a valid extra → pip silently installs base only
#   Fix: magic-pdf[full]==1.3.12
#
# FAILURE 2: opencv non-headless conflict
#   Fix: Layer 4 force-reinstall of opencv-python-headless
#
# FAILURE 3: ch_PP-OCRv3_det_infer.pth not in HF repo (repo updated to v5)
#   Fix: Layer 3.5 patches models_config.yml inside installed wheel:
#        ch_PP-OCRv3_det → ch_PP-OCRv5_det   (all ch* langs)
#        en_PP-OCRv3_det → Multilingual_PP-OCRv3_det  (en, latin)
#   Arch safety: both replacement stems verified in arch_config.yaml
#
# ── System packages ────────────────────────────────────────────────────────────
#   libgl1       — OpenCV needs libGL.so.1 for ALL image operations (not just GUI)
#   libglib2.0-0 — GLib; required by OpenCV and many C extensions
#   libgomp1     — OpenMP; required by ONNX Runtime and YOLO inference
#   poppler-utils — pdfinfo/pdftoppm; used by MinerU PDF pre-processing
# ─────────────────────────────────────────────────────────────────────────────

# Base image: the slim variant of the official CPython 3.10 image.
FROM python:3.10-slim

# Environment baked into the image, declared in a single instruction:
#   PYTHONUNBUFFERED / PYTHONDONTWRITEBYTECODE — standard CPython switches
#       (unbuffered stdout/stderr; no .pyc files written).
#   PORT — same value as the port EXPOSE'd at the end of this file.
#   MINERU_DEVICE_MODE / MINERU_BACKEND — CPU device mode and the "pipeline"
#       backend (presumably read by MinerU / the app at runtime — confirm).
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=7860 \
    MINERU_DEVICE_MODE=cpu \
    MINERU_BACKEND=pipeline

# ── System dependencies ────────────────────────────────────────────────────────
# Shared libraries and tools needed by the Python stack (see the header notes
# for what each package is for). Update, install and list cleanup happen in one
# layer so the apt package lists never persist in the image.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends \
        libgl1 libglib2.0-0 libgomp1 poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# All following relative paths (COPY targets, scripts) resolve against /app.
WORKDIR /app

# ── Layer 1: FastAPI + lightweight runtime deps ───────────────────────────────
# Web server stack, image/office-document helpers and the fast OCR engine.
#   rapidocr-onnxruntime — ONNX-based OCR; its models ship inside the wheel
#       (~50 MB), so nothing has to be downloaded on first use. Its own deps
#       (onnxruntime, numpy, pyclipper, shapely) are resolved here or later
#       overridden/covered by the magic-pdf[full] install.
#   opencv-python-headless — installed here only as a placeholder; Layer 4
#       force-reinstalls it after magic-pdf has pulled in its dependencies.
RUN pip install --no-cache-dir --timeout=300 \
        'fastapi>=0.115.0' \
        'uvicorn[standard]>=0.32.0' \
        'python-multipart>=0.0.12' \
        'Pillow>=10.0.0' \
        'pillow-heif>=0.18.0' \
        'huggingface_hub>=0.25.0' \
        'opencv-python-headless>=4.8.0' \
        'rapidocr-onnxruntime>=1.3.22' \
        'python-docx>=1.1.0' \
        'python-pptx>=0.6.23' \
        'openpyxl>=3.1.0'

# ── Layer 2: CPU-only PyTorch — MUST precede magic-pdf ───────────────────────
# The default torch wheel on PyPI is the CUDA build (~2.5 GB). Pulling torch
# from the official CPU wheel index *before* magic-pdf means pip later sees
# magic-pdf's torch requirement as already satisfied by this CPU build.
RUN pip install --no-cache-dir --timeout=600 \
        --index-url=https://download.pytorch.org/whl/cpu \
        'torch>=2.2.2,!=2.5.0,!=2.5.1,<3' \
        'torchvision>=0.15.2'

# ── Layer 3: magic-pdf with the CORRECT extras ────────────────────────────────
# The [full] extra brings in ultralytics, doclayout-yolo==0.0.2b1, rapid-table,
# shapely, pyclipper, omegaconf, matplotlib, ftfy, dill, PyYAML, openai and
# albumentations.
# doclayout-yolo==0.0.2b1 is published ONLY on the myhloli wheel index (not on
# PyPI), hence the extra index URL.
# onnxruntime arrives automatically as a transitive dependency of rapid-table.
RUN pip install --no-cache-dir --timeout=600 \
        --extra-index-url=https://myhloli.github.io/wheels/ \
        'magic-pdf[full]==1.3.12'

# ── Layer 3.5: Patch OCR model config ────────────────────────────────────────
# HF repo opendatalab/PDF-Extract-Kit-1.0 was updated to v5 det models.
# magic-pdf 1.3.12 models_config.yml still references v3 det files (absent).
# This patch runs at build time so download_models.py fetches correct files.
#
# Behaviour of the embedded script:
#   - every language entry whose 'det' file is a key of DET_MAP is rewritten
#     to the mapped replacement file;
#   - the build FAILS (exit 1) if a replacement's stem has no matching
#     "<stem>:" text in arch_config.yaml;
#   - if nothing matched, a warning is printed to stderr and the config file
#     is left untouched (the build continues).
# The heredoc form requires BuildKit.
RUN python3 - <<'PYEOF'
import sys
from pathlib import Path

import yaml
import magic_pdf

# Both resource files live side by side inside the installed wheel.
resources = (
    Path(magic_pdf.__file__).parent
    / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources'
)
cfg_path = resources / 'models_config.yml'
arch_path = resources / 'arch_config.yaml'

# Stale det weight file -> replacement det weight file.
DET_MAP = {
    'ch_PP-OCRv3_det_infer.pth':  'ch_PP-OCRv5_det_infer.pth',
    'en_PP-OCRv3_det_infer.pth':  'Multilingual_PP-OCRv3_det_infer.pth',
}

print(f"Patching: {cfg_path}")

# Explicit UTF-8 so the result does not depend on the build locale.
with open(cfg_path, encoding='utf-8') as f:
    config = yaml.safe_load(f)
with open(arch_path, encoding='utf-8') as f:
    arch_text = f.read()

patched = 0
for lang, files in config['lang'].items():
    # Skip entries that are not mappings instead of crashing on .get().
    if not isinstance(files, dict):
        continue
    old = files.get('det', '')
    if old not in DET_MAP:
        continue
    new = DET_MAP[old]
    arch_key = new[:-4]  # file name without the ".pth" suffix
    # Refuse to point at a model that arch_config.yaml does not mention.
    if (arch_key + ':') not in arch_text:
        print(f"ERROR: arch key '{arch_key}' not found in arch_config.yaml", file=sys.stderr)
        sys.exit(1)
    files['det'] = new
    print(f"  [{lang}] det: {old}  ->  {new}")
    patched += 1

if patched:
    with open(cfg_path, 'w', encoding='utf-8') as f:
        yaml.dump(config, f, default_flow_style=False, allow_unicode=True)
    print(f"Patched {patched} language entries. models_config.yml updated.")
else:
    # Make a no-op visible: it means the installed config no longer matches
    # the file names this patch was written for.
    print(
        "WARNING: no 'det' entries matched DET_MAP; models_config.yml left unchanged.",
        file=sys.stderr,
    )
PYEOF

# ── Layer 4: Restore headless OpenCV ─────────────────────────────────────────
# doclayout-yolo / ultralytics / rapid-table (Layer 3) drag in the non-headless
# opencv-python build. Reinstalling the headless wheel on top of it makes cv2
# usable again on this slim image.
# NOTE(review): --force-reinstall without --no-deps presumably also reinstalls
# opencv's own dependencies (e.g. numpy) — confirm the resulting versions.
RUN pip install --no-cache-dir --force-reinstall --timeout=300 \
        'opencv-python-headless>=4.8.0'

# ── Application code ──────────────────────────────────────────────────────────
# The Python scripts are copied into the current WORKDIR in a single layer.
# entrypoint.sh gets its executable mode at copy time (COPY --chmod, BuildKit)
# instead of through a follow-up `RUN chmod`, which would add another layer
# just to change a mode bit.
COPY download_models.py validate.py main.py ./
COPY --chmod=755 entrypoint.sh ./

# ── Download models at build time ─────────────────────────────────────────────
# Model weights are fetched while the image is built.
#   - MFR (formula recognition, ~1-2 GB) is left out — it is disabled in config.
#   - rapidocr-onnxruntime needs no download: its models are BUNDLED in the
#     pip wheel.
RUN python download_models.py

# Keep a copy of the generated MinerU config under /app/config.
# (/root/magic-pdf.json is presumably written by download_models.py — confirm.)
RUN mkdir --parents /app/config && cp /root/magic-pdf.json /app/config/

# ── Runtime ───────────────────────────────────────────────────────────────────
# EXPOSE only documents the listening port (same value as ENV PORT above).
# The container starts the entrypoint script directly, in exec form.
EXPOSE 7860/tcp
ENTRYPOINT ["/app/entrypoint.sh"]