# ─────────────────────────────────────────────────────────────────────────────
# MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend)
#
# Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU
#
# ── OCR ROUTING ARCHITECTURE ──────────────────────────────────────────────────
#
# FAST PATH (images: jpg/png/webp/bmp/heic/etc.)
#   rapidocr-onnxruntime ≥ 1.3.22
#     - Pure ONNX inference — no PaddleOCR / paddlepaddle needed
#     - Models bundled in the pip wheel (~50 MB); no first-use download
#     - Target latency: 1–5 s on CPU
#   Multi-pass: if RapidOCR confidence < 0.65 → automatic MinerU fallback
#
# HEAVY PATH (PDFs, multi-page, forms with layout)
#   MinerU (magic-pdf pipeline backend)
#     - Layout detection (doclayout_yolo)
#     - OCR (paddleocr2pytorch — PyTorch reimplementation bundled in wheel)
#     - Markdown reconstruction
#     - Target latency: 5–30 s on CPU
#
# ── ROOT CAUSE HISTORY ────────────────────────────────────────────────────────
#
# FAILURE 1: [full-cpu] is NOT a valid extra — pip silently installs base only
#   Fix: magic-pdf[full]==1.3.12
#
# FAILURE 2: opencv non-headless conflict
#   Fix: Layer 4 force-reinstall of opencv-python-headless
#
# FAILURE 3: ch_PP-OCRv3_det_infer.pth not in HF repo (repo updated to v5)
#   Fix: Layer 3.5 patches models_config.yml inside installed wheel:
#     ch_PP-OCRv3_det → ch_PP-OCRv5_det (all ch* langs)
#     en_PP-OCRv3_det → Multilingual_PP-OCRv3_det (en, latin)
#   Arch safety: both replacement stems verified in arch_config.yaml
#
# ── System packages ───────────────────────────────────────────────────────────
#   libgl1        — OpenCV needs libGL.so.1 for ALL image operations (not just GUI)
#   libglib2.0-0  — GLib; required by OpenCV and many C extensions
#   libgomp1      — OpenMP; required by ONNX Runtime and YOLO inference
#   poppler-utils — pdfinfo/pdftoppm; used by MinerU PDF pre-processing
# ─────────────────────────────────────────────────────────────────────────────
# Base image: the slim variant of the official CPython 3.10 image.
FROM python:3.10-slim

# Environment baked into the image (visible at build time and at runtime):
#   PYTHONUNBUFFERED / PYTHONDONTWRITEBYTECODE — standard CPython switches
#   PORT                                       — same value as the EXPOSE below
#   MINERU_DEVICE_MODE / MINERU_BACKEND        — presumably read by the app /
#                                                MinerU to pick the CPU pipeline
#                                                backend; confirm in main.py
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=7860 \
    MINERU_DEVICE_MODE=cpu \
    MINERU_BACKEND=pipeline
# ── System dependencies ───────────────────────────────────────────────────────
# Index refresh, install and apt-list cleanup happen in ONE layer so the
# package lists never end up stored in the image.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# Relative paths in the instructions that follow resolve under /app.
WORKDIR /app
# ── Layer 1: FastAPI + lightweight runtime deps ───────────────────────────────
# rapidocr-onnxruntime: ONNX-based fast OCR engine; models bundled in wheel.
#   - requires onnxruntime (auto-resolved, or overridden by magic-pdf deps)
#   - requires numpy, pyclipper, shapely — all covered by magic-pdf[full]
#   - ~50 MB wheel; zero first-use model download needed
# opencv-python-headless: placeholder; it is force-reinstalled in Layer 4.
RUN pip install \
        --no-cache-dir \
        --timeout 300 \
        "fastapi>=0.115.0" \
        "uvicorn[standard]>=0.32.0" \
        "python-multipart>=0.0.12" \
        "Pillow>=10.0.0" \
        "pillow-heif>=0.18.0" \
        "huggingface_hub>=0.25.0" \
        "opencv-python-headless>=4.8.0" \
        "rapidocr-onnxruntime>=1.3.22" \
        "python-docx>=1.1.0" \
        "python-pptx>=0.6.23" \
        "openpyxl>=3.1.0"
# ── Layer 2: CPU-only PyTorch — MUST precede magic-pdf ────────────────────────
# By default PyPI hands out the CUDA-enabled torch wheel (~2.5 GB).
# Pulling torch from the official CPU wheel index first means pip later sees
# the installed CPU build as already satisfying magic-pdf's torch requirement.
RUN pip install \
        --no-cache-dir \
        --timeout 600 \
        --index-url https://download.pytorch.org/whl/cpu \
        "torch>=2.2.2,!=2.5.0,!=2.5.1,<3" \
        "torchvision>=0.15.2"
# ── Layer 3: magic-pdf with the CORRECT extras ────────────────────────────────
# [full] brings in ultralytics, doclayout-yolo==0.0.2b1, rapid-table, shapely,
# pyclipper, omegaconf, matplotlib, ftfy, dill, PyYAML, openai, albumentations.
# doclayout-yolo==0.0.2b1 exists ONLY on the myhloli index — not on PyPI —
# hence the extra index URL.
# onnxruntime is resolved automatically as a transitive dep of rapid-table.
RUN pip install \
        --no-cache-dir \
        --timeout 600 \
        --extra-index-url https://myhloli.github.io/wheels/ \
        "magic-pdf[full]==1.3.12"
# ── Layer 3.5: Patch OCR model config ─────────────────────────────────────────
# HF repo opendatalab/PDF-Extract-Kit-1.0 was updated to v5 det models.
# magic-pdf 1.3.12 models_config.yml still references v3 det files (absent).
# This patch runs at build time so download_models.py fetches correct files.
#
# What the embedded script does:
#   1. locates models_config.yml / arch_config.yaml inside the installed
#      magic_pdf package,
#   2. rewrites every per-language 'det' entry that appears in DET_MAP,
#   3. aborts the build (exit 1) if a replacement stem has no '<stem>:' entry
#      in arch_config.yaml,
#   4. writes the patched config back in place and reports how many entries
#      changed (a warning goes to stderr when nothing matched).
RUN python3 - <<'PYEOF'
import sys
from pathlib import Path

import yaml
import magic_pdf

pkg = Path(magic_pdf.__file__).parent
cfg_path = pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml'
arch_path = pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml'
print(f"Patching: {cfg_path}")

# Explicit UTF-8: the dump below uses allow_unicode=True, so reading and
# writing must not depend on whatever locale the build container has.
with open(cfg_path, encoding='utf-8') as f:
    config = yaml.safe_load(f)
with open(arch_path, encoding='utf-8') as f:
    arch_text = f.read()

# Detector weight file referenced by the wheel -> file to reference instead.
DET_MAP = {
    'ch_PP-OCRv3_det_infer.pth': 'ch_PP-OCRv5_det_infer.pth',
    'en_PP-OCRv3_det_infer.pth': 'Multilingual_PP-OCRv3_det_infer.pth',
}

patched = 0
for lang, files in config['lang'].items():
    old = files.get('det', '')
    if old not in DET_MAP:
        continue
    new = DET_MAP[old]
    # Strip the '.pth' suffix; the stem must show up as a key in arch_config.yaml.
    arch_key = new[:-4]
    if (arch_key + ':') not in arch_text:
        print(f"ERROR: arch key '{arch_key}' not found in arch_config.yaml", file=sys.stderr)
        sys.exit(1)
    files['det'] = new
    print(f" [{lang}] det: {old} -> {new}")
    patched += 1

if patched == 0:
    # Not fatal, but almost certainly means the wheel layout or file names
    # differ from what this patch was written against.
    print("WARNING: no 'det' entry matched DET_MAP; nothing was replaced.", file=sys.stderr)

with open(cfg_path, 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False, allow_unicode=True)
print(f"Patched {patched} language entries. models_config.yml updated.")
PYEOF
# ── Layer 4: Restore headless OpenCV ──────────────────────────────────────────
# Layer 3 pulled opencv-python (non-headless) via doclayout-yolo/ultralytics/
# rapid-table. Force-reinstalling the headless build overwrites cv2 so it works
# on this slim image.
# NOTE(review): --force-reinstall also reinstalls the package's dependencies
# (e.g. numpy), possibly at a different version than Layer 3 resolved —
# consider --no-deps after confirming the build still imports cleanly.
RUN pip install \
        --no-cache-dir \
        --timeout 300 \
        --force-reinstall \
        "opencv-python-headless>=4.8.0"
# ── Application code ──────────────────────────────────────────────────────────
# Python sources go into the current WORKDIR (/app).
COPY download_models.py validate.py main.py ./
# Copy the entrypoint with its executable bits already set. A follow-up
# `RUN chmod +x` would create an extra layer holding a second copy of the file.
# (--chmod needs BuildKit, which this file already relies on for heredocs.)
COPY --chmod=755 entrypoint.sh ./
# ── Download models at build time ─────────────────────────────────────────────
# MFR (formula recognition, ~1-2 GB) excluded — disabled in config.
# rapidocr-onnxruntime models are BUNDLED in the pip wheel; no download needed.
RUN python download_models.py

# Keep a copy of /root/magic-pdf.json under /app/config.
# (presumably the file is produced by download_models.py — verify there)
RUN mkdir -p /app/config && \
    cp /root/magic-pdf.json /app/config/magic-pdf.json
# ── Runtime ───────────────────────────────────────────────────────────────────
# Documents the listening port; same value as the PORT variable set near the top.
EXPOSE 7860
# Exec (JSON-array) form: entrypoint.sh is started directly, with no
# intermediate `/bin/sh -c` wrapper.
ENTRYPOINT ["/app/entrypoint.sh"]