openskill-ocr / Dockerfile
crazylemonade's picture
Upload 7 files
0ad3f89 verified
Raw
History Blame Contribute Delete
8.4 kB
# ─────────────────────────────────────────────────────────────────────────────
# MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend)
#
# Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU
#
# ── OCR ROUTING ARCHITECTURE ──────────────────────────────────────────────────
#
# FAST PATH (images: jpg/png/webp/bmp/heic/etc)
# rapidocr-onnxruntime ≥ 1.3.22
# - Pure ONNX inference — no PaddleOCR / paddlepaddle needed
# - Models bundled in the pip wheel (~50 MB); no first-use download
# - Target latency: 1–5 s on CPU
# Multi-pass: if RapidOCR confidence < 0.65 → MinerU fallback automatically
#
# HEAVY PATH (PDFs, multi-page, forms with layout)
# MinerU (magic-pdf pipeline backend)
# - Layout detection (doclayout_yolo)
# - OCR (paddleocr2pytorch — PyTorch reimplementation bundled in wheel)
# - Markdown reconstruction
# - Target latency: 5–30 s on CPU
#
# ── ROOT CAUSE HISTORY ────────────────────────────────────────────────────────
#
# FAILURE 1: [full-cpu] is NOT a valid extra → pip silently installs base only
# Fix: magic-pdf[full]==1.3.12
#
# FAILURE 2: opencv non-headless conflict
# Fix: Layer 4 force-reinstall of opencv-python-headless
#
# FAILURE 3: ch_PP-OCRv3_det_infer.pth not in HF repo (repo updated to v5)
# Fix: Layer 3.5 patches models_config.yml inside installed wheel:
# ch_PP-OCRv3_det → ch_PP-OCRv5_det (all ch* langs)
# en_PP-OCRv3_det → Multilingual_PP-OCRv3_det (en, latin)
# Arch safety: both replacement stems verified in arch_config.yaml
#
# ── System packages ────────────────────────────────────────────────────────────
# libgl1 — OpenCV needs libGL.so.1 for ALL image operations (not just GUI)
# libglib2.0-0 — GLib; required by OpenCV and many C extensions
# libgomp1 — OpenMP; required by ONNX Runtime and YOLO inference
# poppler-utils — pdfinfo/pdftoppm; used by MinerU PDF pre-processing
# ─────────────────────────────────────────────────────────────────────────────
# Base image: the slim variant of the official CPython 3.10 image.
FROM python:3.10-slim

# Container-wide environment, declared in a single instruction:
#   PYTHONUNBUFFERED / PYTHONDONTWRITEBYTECODE — CPython switches for stream
#       buffering and .pyc generation
#   PORT — same number that is EXPOSEd at the end of this file
#   MINERU_DEVICE_MODE / MINERU_BACKEND — presumably read by MinerU or the app
#       to select the CPU pipeline backend; verify against the consuming code
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=7860 \
    MINERU_DEVICE_MODE=cpu \
    MINERU_BACKEND=pipeline
# ── System dependencies ────────────────────────────────────────────────────────
# Installs the four OS packages the Python stack links against or shells out to.
# The apt package index is removed in the same layer so it never reaches the
# image. `set -eu` aborts the step on the first failing command, which is what
# the `&&` chain form of this step would also do.
RUN <<EOF
set -eu
apt-get update
apt-get install --yes --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libgomp1 \
    poppler-utils
rm -rf /var/lib/apt/lists/*
EOF
WORKDIR /app
# ── Layer 1: FastAPI + lightweight runtime deps ───────────────────────────────
# Web server stack, image / Office-document helpers, and the fast-path OCR
# engine. Requirements are listed alphabetically to keep diffs small.
#
#   rapidocr-onnxruntime   — ONNX-based fast OCR engine (~50 MB wheel with the
#                            models bundled, so nothing is downloaded on first
#                            use). Its onnxruntime / numpy / pyclipper / shapely
#                            requirements are resolved here or again by the
#                            magic-pdf[full] install in Layer 3.
#   opencv-python-headless — placeholder only; Layer 4 force-reinstalls it once
#                            the magic-pdf dependency tree has been installed.
RUN pip install --no-cache-dir --timeout=300 \
        "fastapi>=0.115.0" \
        "huggingface_hub>=0.25.0" \
        "opencv-python-headless>=4.8.0" \
        "openpyxl>=3.1.0" \
        "Pillow>=10.0.0" \
        "pillow-heif>=0.18.0" \
        "python-docx>=1.1.0" \
        "python-multipart>=0.0.12" \
        "python-pptx>=0.6.23" \
        "rapidocr-onnxruntime>=1.3.22" \
        "uvicorn[standard]>=0.32.0"
# ── Layer 2: CPU-only PyTorch — MUST precede magic-pdf ───────────────────────
# The torch wheel that PyPI serves by default is the CUDA build (~2.5 GB).
# Pulling torch and torchvision from the official CPU wheel index first means
# the Layer 3 resolver finds a torch that already meets magic-pdf's requirement
# and keeps the CPU build in place.
RUN pip install --no-cache-dir --timeout=600 \
        --index-url=https://download.pytorch.org/whl/cpu \
        "torch>=2.2.2,!=2.5.0,!=2.5.1,<3" \
        "torchvision>=0.15.2"
# ── Layer 3: magic-pdf with the CORRECT extras ────────────────────────────────
# The [full] extra is what brings in ultralytics, doclayout-yolo==0.0.2b1,
# rapid-table, shapely, pyclipper, omegaconf, matplotlib, ftfy, dill, PyYAML,
# openai and albumentations.
#   * doclayout-yolo==0.0.2b1 is published ONLY on the myhloli wheel index (not
#     on PyPI), hence the extra index URL.
#   * onnxruntime arrives automatically as a transitive dependency of
#     rapid-table.
RUN pip install --no-cache-dir --timeout=600 \
        --extra-index-url=https://myhloli.github.io/wheels/ \
        "magic-pdf[full]==1.3.12"
# ── Layer 3.5: Patch OCR model config ────────────────────────────────────────
# HF repo opendatalab/PDF-Extract-Kit-1.0 was updated to v5 det models, while
# magic-pdf 1.3.12's models_config.yml still references v3 det files (absent).
# This build-time patch rewrites the `det` entry of every language that points
# at a retired file, so that download_models.py fetches files that exist.
#
# Behaviour of the embedded script:
#   * every replacement must have a matching "<stem>:" key in arch_config.yaml,
#     otherwise the build fails with exit code 1;
#   * if no language entry matches, a warning is printed to stderr (the build
#     continues) so a silent no-op is visible in the build log;
#   * files are read and written as UTF-8 explicitly rather than relying on the
#     container locale.
# The heredoc delimiter is quoted, so the shell performs no expansion inside
# the Python source.
RUN python3 - <<'PYEOF'
import sys
from pathlib import Path

import yaml
import magic_pdf

# Resource directory inside the installed magic_pdf package.
resources = (
    Path(magic_pdf.__file__).parent
    / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources'
)
cfg_path = resources / 'models_config.yml'
arch_path = resources / 'arch_config.yaml'

# Retired det weight file -> replacement weight file.
DET_MAP = {
    'ch_PP-OCRv3_det_infer.pth': 'ch_PP-OCRv5_det_infer.pth',
    'en_PP-OCRv3_det_infer.pth': 'Multilingual_PP-OCRv3_det_infer.pth',
}

print(f"Patching: {cfg_path}")
with open(cfg_path, encoding='utf-8') as f:
    config = yaml.safe_load(f)
with open(arch_path, encoding='utf-8') as f:
    arch_text = f.read()

patched = 0
for lang, files in config['lang'].items():
    old = files.get('det', '')
    if old not in DET_MAP:
        continue
    new = DET_MAP[old]
    # The arch config is keyed by the weight file name without its ".pth" suffix.
    arch_key = new[:-4]
    if (arch_key + ':') not in arch_text:
        print(f"ERROR: arch key '{arch_key}' not found in arch_config.yaml", file=sys.stderr)
        sys.exit(1)
    files['det'] = new
    print(f" [{lang}] det: {old} -> {new}")
    patched += 1

if patched == 0:
    # Not fatal: the installed config may simply no longer need this patch.
    print(
        "WARNING: no language entry referenced a det file listed in DET_MAP; "
        "check whether this patch is still required.",
        file=sys.stderr,
    )

with open(cfg_path, 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False, allow_unicode=True)
print(f"Patched {patched} language entries. models_config.yml updated.")
PYEOF
# ── Layer 4: Restore headless OpenCV ─────────────────────────────────────────
# The Layer 3 dependency tree (doclayout-yolo / ultralytics / rapid-table)
# brings in the non-headless opencv-python. Force-reinstalling the headless
# build afterwards puts its cv2 files back on top so cv2 works on this slim
# image.
# NOTE(review): --force-reinstall also reinstalls this package's dependencies
# (e.g. numpy) — confirm the versions it selects still suit the Layer 3 stack.
RUN pip install --no-cache-dir --force-reinstall --timeout=300 \
        "opencv-python-headless>=4.8.0"
# ── Download models at build time ─────────────────────────────────────────────
# This step runs BEFORE the frequently edited application code is copied, so a
# change to main.py or entrypoint.sh no longer invalidates the cached
# model-download layer. validate.py is copied alongside the downloader in case
# the downloader imports it.
# NOTE(review): assumes download_models.py does not import main.py — confirm.
#
# MFR (formula recognition, ~1-2 GB) excluded — disabled in config.
# rapidocr-onnxruntime models are BUNDLED in the pip wheel; no download needed.
#
# After the download, /root/magic-pdf.json is copied to /app/config/ so a copy
# of the config also lives under the application directory.
COPY download_models.py validate.py ./
RUN python download_models.py \
    && mkdir -p /app/config \
    && cp /root/magic-pdf.json /app/config/magic-pdf.json

# ── Application code ──────────────────────────────────────────────────────────
# entrypoint.sh receives its executable bit at copy time (--chmod), avoiding a
# separate `RUN chmod` layer that would store the file a second time.
COPY main.py ./
COPY --chmod=755 entrypoint.sh ./
# ── Runtime ───────────────────────────────────────────────────────────────────
# EXPOSE documents the service port (the same number as the PORT variable set
# near the top of this file); it does not publish the port by itself.
# The entrypoint uses exec form, so the script is started directly with no
# intermediate `/bin/sh -c`.
EXPOSE 7860/tcp
ENTRYPOINT ["/app/entrypoint.sh"]