Spaces:
Sleeping
Sleeping
| # ───────────────────────────────────────────────────────────────────────────── | |
| # MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend) | |
| # | |
| # Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU | |
| # | |
| # ── OCR ROUTING ARCHITECTURE ───────────────────────────────────────────────── | |
| # | |
| # FAST PATH (images: jpg/png/webp/bmp/heic/etc.) | |
| # rapidocr-onnxruntime ≥ 1.3.22 | |
| # - Pure ONNX inference — no PaddleOCR / paddlepaddle needed | |
| # - Models bundled in the pip wheel (~50 MB); no first-use download | |
| # - Target latency: 1–5 s on CPU | |
| # Multi-pass: if RapidOCR confidence < 0.65 → MinerU fallback automatically | |
| # | |
| # HEAVY PATH (PDFs, multi-page, forms with layout) | |
| # MinerU (magic-pdf pipeline backend) | |
| # - Layout detection (doclayout_yolo) | |
| # - OCR (paddleocr2pytorch — PyTorch reimplementation bundled in wheel) | |
| # - Markdown reconstruction | |
| # - Target latency: 5–30 s on CPU | |
| # | |
| # ── ROOT CAUSE HISTORY ─────────────────────────────────────────────────────── | |
| # | |
| # FAILURE 1: [full-cpu] is NOT a valid extra — pip silently installs base only | |
| # Fix: magic-pdf[full]==1.3.12 | |
| # | |
| # FAILURE 2: opencv non-headless conflict | |
| # Fix: Layer 4 force-reinstall of opencv-python-headless | |
| # | |
| # FAILURE 3: ch_PP-OCRv3_det_infer.pth not in HF repo (repo updated to v5) | |
| # Fix: Layer 3.5 patches models_config.yml inside installed wheel: | |
| # ch_PP-OCRv3_det → ch_PP-OCRv5_det (all ch* langs) | |
| # en_PP-OCRv3_det → Multilingual_PP-OCRv3_det (en, latin) | |
| # Arch safety: both replacement stems verified in arch_config.yaml | |
| # | |
| # ── System packages ────────────────────────────────────────────────────────── | |
| # libgl1 — OpenCV needs libGL.so.1 for ALL image operations (not just GUI) | |
| # libglib2.0-0 — GLib; required by OpenCV and many C extensions | |
| # libgomp1 — OpenMP; required by ONNX Runtime and YOLO inference | |
| # poppler-utils — pdfinfo/pdftoppm; used by MinerU PDF pre-processing | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| FROM python:3.10-slim | |
| ENV PYTHONUNBUFFERED=1 | |
| ENV PYTHONDONTWRITEBYTECODE=1 | |
| ENV PORT=7860 | |
| ENV MINERU_DEVICE_MODE=cpu | |
| ENV MINERU_BACKEND=pipeline | |
| # ── System dependencies ────────────────────────────────────────────────────── | |
| RUN apt-get update \ | |
| && apt-get install -y --no-install-recommends \ | |
| libgl1 \ | |
| libglib2.0-0 \ | |
| libgomp1 \ | |
| poppler-utils \ | |
| && rm -rf /var/lib/apt/lists/* | |
| WORKDIR /app | |
| # ── Layer 1: FastAPI + lightweight runtime deps ────────────────────────────── | |
| # rapidocr-onnxruntime: ONNX-based fast OCR engine; models bundled in wheel. | |
| # - requires onnxruntime (will be auto-resolved or overridden by magic-pdf deps) | |
| # - requires numpy, pyclipper, shapely — all covered by magic-pdf[full] | |
| # - ~50 MB wheel; zero first-use model download needed | |
| # opencv-python-headless: placeholder; will be force-reinstalled in Layer 4 | |
| RUN pip install --no-cache-dir --timeout 300 \ | |
| "fastapi>=0.115.0" \ | |
| "uvicorn[standard]>=0.32.0" \ | |
| "python-multipart>=0.0.12" \ | |
| "Pillow>=10.0.0" \ | |
| "pillow-heif>=0.18.0" \ | |
| "huggingface_hub>=0.25.0" \ | |
| "opencv-python-headless>=4.8.0" \ | |
| "rapidocr-onnxruntime>=1.3.22" \ | |
| "python-docx>=1.1.0" \ | |
| "python-pptx>=0.6.23" \ | |
| "openpyxl>=3.1.0" | |
| # ── Layer 2: CPU-only PyTorch — MUST precede magic-pdf ─────────────────────── | |
| # PyPI serves the CUDA-enabled torch wheel by default (~2.5 GB). | |
| # Installing from the official CPU wheel index first causes pip to treat the | |
| # already-installed CPU build as satisfying magic-pdf's torch requirement. | |
| RUN pip install --no-cache-dir --timeout 600 \ | |
| --index-url https://download.pytorch.org/whl/cpu \ | |
| "torch>=2.2.2,!=2.5.0,!=2.5.1,<3" \ | |
| "torchvision>=0.15.2" | |
| # ── Layer 3: magic-pdf with the CORRECT extras ─────────────────────────────── | |
| # [full] provides ultralytics, doclayout-yolo==0.0.2b1, rapid-table, shapely, | |
| # pyclipper, omegaconf, matplotlib, ftfy, dill, PyYAML, openai, albumentations. | |
| # doclayout-yolo==0.0.2b1 is ONLY on the myhloli index — not on PyPI. | |
| # onnxruntime resolved automatically as transitive dep of rapid-table. | |
| RUN pip install --no-cache-dir --timeout 600 \ | |
| --extra-index-url https://myhloli.github.io/wheels/ \ | |
| "magic-pdf[full]==1.3.12" | |
| # ── Layer 3.5: Patch OCR model config ──────────────────────────────────────── | |
| # HF repo opendatalab/PDF-Extract-Kit-1.0 was updated to v5 det models. | |
| # magic-pdf 1.3.12 models_config.yml still references v3 det files (absent). | |
| # This patch runs at build time so download_models.py fetches correct files. | |
| RUN python3 - <<'PYEOF' | |
| import sys, yaml | |
| from pathlib import Path | |
| import magic_pdf | |
| pkg = Path(magic_pdf.__file__).parent | |
| cfg_path = pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml' | |
| arch_path = pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml' | |
| print(f"Patching: {cfg_path}") | |
| with open(cfg_path) as f: | |
| config = yaml.safe_load(f) | |
| with open(arch_path) as f: | |
| arch_text = f.read() | |
| DET_MAP = { | |
| 'ch_PP-OCRv3_det_infer.pth': 'ch_PP-OCRv5_det_infer.pth', | |
| 'en_PP-OCRv3_det_infer.pth': 'Multilingual_PP-OCRv3_det_infer.pth', | |
| } | |
| patched = 0 | |
| for lang, files in config['lang'].items(): | |
| old = files.get('det', '') | |
| if old in DET_MAP: | |
| new = DET_MAP[old] | |
| arch_key = new[:-4] | |
| if (arch_key + ':') not in arch_text: | |
| print(f"ERROR: arch key '{arch_key}' not found in arch_config.yaml", file=sys.stderr) | |
| sys.exit(1) | |
| files['det'] = new | |
| print(f" | |