File size: 8,396 Bytes
0ad3f89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# ─────────────────────────────────────────────────────────────────────────────
# MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend)
#
# Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU
#
# ── OCR ROUTING ARCHITECTURE ──────────────────────────────────────────────────
#
# FAST PATH  (images: jpg/png/webp/bmp/heic/etc)
#   rapidocr-onnxruntime ≥ 1.3.22
#     - Pure ONNX inference — no PaddleOCR / paddlepaddle needed
#     - Models bundled in the pip wheel (~50 MB); no first-use download
#     - Target latency: 1–5 s on CPU
#   Multi-pass: if RapidOCR confidence < 0.65 → MinerU fallback automatically
#
# HEAVY PATH (PDFs, multi-page, forms with layout)
#   MinerU (magic-pdf pipeline backend)
#     - Layout detection (doclayout_yolo)
#     - OCR (paddleocr2pytorch — PyTorch reimplementation bundled in wheel)
#     - Markdown reconstruction
#     - Target latency: 5–30 s on CPU
#
# ── ROOT CAUSE HISTORY ────────────────────────────────────────────────────────
#
# FAILURE 1: [full-cpu] is NOT a valid extra → pip silently installs base only
#   Fix: magic-pdf[full]==1.3.12
#
# FAILURE 2: opencv non-headless conflict
#   Fix: Layer 4 force-reinstall of opencv-python-headless
#
# FAILURE 3: ch_PP-OCRv3_det_infer.pth not in HF repo (repo updated to v5)
#   Fix: Layer 3.5 patches models_config.yml inside installed wheel:
#        ch_PP-OCRv3_det → ch_PP-OCRv5_det   (all ch* langs)
#        en_PP-OCRv3_det → Multilingual_PP-OCRv3_det  (en, latin)
#   Arch safety: both replacement stems verified in arch_config.yaml
#
# ── System packages ────────────────────────────────────────────────────────────
#   libgl1        — OpenCV needs libGL.so.1 for ALL image operations (not just GUI)
#   libglib2.0-0  — GLib; required by OpenCV and many C extensions
#   libgomp1      — OpenMP; required by ONNX Runtime and YOLO inference
#   poppler-utils — pdfinfo/pdftoppm; used by MinerU PDF pre-processing
# ─────────────────────────────────────────────────────────────────────────────

# Pin the Debian release as well as the Python minor version. The bare
# "3.10-slim" tag is re-pointed to a newer Debian release when upstream
# rebases it, which can change or rename the apt packages installed below.
FROM python:3.10-slim-bookworm

# Runtime environment, set in a single instruction:
#   PYTHONUNBUFFERED / PYTHONDONTWRITEBYTECODE — unbuffered stdout/stderr, no .pyc files
#   PORT                                       — same value as the EXPOSE'd port
#   MINERU_DEVICE_MODE / MINERU_BACKEND        — presumably read by the application
#                                                code (not visible here); confirm
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=7860 \
    MINERU_DEVICE_MODE=cpu \
    MINERU_BACKEND=pipeline

# ── System dependencies ────────────────────────────────────────────────────────
# Index refresh, install and list cleanup run in one layer so the apt lists
# are never persisted in the image.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# Relative paths in the instructions that follow (COPY destinations, the
# working directory of RUN) resolve against /app.
WORKDIR /app

# ── Layer 1: FastAPI + lightweight runtime deps ───────────────────────────────
# rapidocr-onnxruntime is the ONNX-based fast OCR engine; its models ship in the
# wheel (~50 MB), so nothing is downloaded on first use.
#   - needs onnxruntime (auto-resolved here, or overridden by magic-pdf deps)
#   - needs numpy, pyclipper, shapely — all covered by magic-pdf[full]
# opencv-python-headless is only a placeholder at this point; Layer 4
# force-reinstalls it.
RUN python -m pip install --timeout=300 --no-cache-dir \
        "fastapi>=0.115.0" \
        "uvicorn[standard]>=0.32.0" \
        "python-multipart>=0.0.12" \
        "Pillow>=10.0.0" \
        "pillow-heif>=0.18.0" \
        "huggingface_hub>=0.25.0" \
        "opencv-python-headless>=4.8.0" \
        "rapidocr-onnxruntime>=1.3.22" \
        "python-docx>=1.1.0" \
        "python-pptx>=0.6.23" \
        "openpyxl>=3.1.0"

# ── Layer 2: CPU-only PyTorch — MUST precede magic-pdf ───────────────────────
# The default torch wheel on PyPI is the CUDA-enabled build (~2.5 GB).
# Installing the CPU build from the official PyTorch CPU wheel index first
# means pip later sees magic-pdf's torch requirement as already satisfied.
RUN python -m pip install --timeout=600 --no-cache-dir \
        --index-url=https://download.pytorch.org/whl/cpu \
        "torch>=2.2.2,!=2.5.0,!=2.5.1,<3" \
        "torchvision>=0.15.2"

# ── Layer 3: magic-pdf with the CORRECT extras ────────────────────────────────
# The [full] extra brings in ultralytics, doclayout-yolo==0.0.2b1, rapid-table,
# shapely, pyclipper, omegaconf, matplotlib, ftfy, dill, PyYAML, openai and
# albumentations.
# doclayout-yolo==0.0.2b1 is ONLY on the myhloli index — not on PyPI — hence
# the extra index URL.
# onnxruntime is resolved automatically as a transitive dep of rapid-table.
RUN python -m pip install --timeout=600 --no-cache-dir \
        --extra-index-url=https://myhloli.github.io/wheels/ \
        "magic-pdf[full]==1.3.12"

# ── Layer 3.5: Patch OCR model config ────────────────────────────────────────
# HF repo opendatalab/PDF-Extract-Kit-1.0 was updated to v5 det models.
# magic-pdf 1.3.12 models_config.yml still references v3 det files (absent).
# This patch runs at build time so download_models.py fetches correct files.
#
# The script rewrites the 'det' entry of every language in models_config.yml
# according to DET_MAP. The build fails if a replacement model has no matching
# key in arch_config.yaml or if the config has no 'lang' mapping; it only warns
# (on stderr) if no entry needed patching.
RUN python3 - <<'PYEOF'
import sys
from pathlib import Path

import yaml
import magic_pdf

# Both YAML files live in the same resources directory of the installed wheel.
resources = (
    Path(magic_pdf.__file__).parent
    / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources'
)
cfg_path = resources / 'models_config.yml'
arch_path = resources / 'arch_config.yaml'

print(f"Patching: {cfg_path}")

# Read and write with an explicit encoding so the result does not depend on
# the build environment's locale (the dump below uses allow_unicode=True).
config = yaml.safe_load(cfg_path.read_text(encoding='utf-8'))
arch_text = arch_path.read_text(encoding='utf-8')

# Old det weight file -> replacement det weight file.
DET_MAP = {
    'ch_PP-OCRv3_det_infer.pth':  'ch_PP-OCRv5_det_infer.pth',
    'en_PP-OCRv3_det_infer.pth':  'Multilingual_PP-OCRv3_det_infer.pth',
}

langs = config.get('lang') if isinstance(config, dict) else None
if not isinstance(langs, dict):
    print(f"ERROR: no 'lang' mapping found in {cfg_path}", file=sys.stderr)
    sys.exit(1)

patched = 0
for lang, files in langs.items():
    if not isinstance(files, dict):
        continue
    old = files.get('det', '')
    new = DET_MAP.get(old)
    if new is None:
        continue
    # The arch config is keyed by the weight file name without its extension.
    arch_key = new.removesuffix('.pth')
    if (arch_key + ':') not in arch_text:
        print(f"ERROR: arch key '{arch_key}' not found in arch_config.yaml", file=sys.stderr)
        sys.exit(1)
    files['det'] = new
    print(f"  [{lang}] det: {old}  ->  {new}")
    patched += 1

if patched == 0:
    # Not fatal, but it means the v3 -> v5 fix was not applied to anything.
    print("WARNING: no 'det' entries matched DET_MAP; models_config.yml content is unchanged.",
          file=sys.stderr)

with open(cfg_path, 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False, allow_unicode=True)

print(f"Patched {patched} language entries. models_config.yml updated.")
PYEOF

# ── Layer 4: Restore headless OpenCV ─────────────────────────────────────────
# Layer 3 pulled opencv-python (non-headless) via doclayout-yolo/ultralytics/
# rapid-table. Force-reinstall headless build so cv2 works on this slim image.
#
# --no-deps: without it, --force-reinstall also reinstalls every dependency of
# the package (numpy in particular) at whatever version is newest, ignoring
# the versions the earlier layers resolved for torch / magic-pdf. Only the
# cv2 files need to be replaced here.
RUN pip install --no-cache-dir --timeout 300 \
        --force-reinstall --no-deps \
        "opencv-python-headless>=4.8.0"

# ── Application code ──────────────────────────────────────────────────────────
# The executable bit is set by COPY itself rather than by a follow-up
# `RUN chmod`, which would add a layer containing a second copy of the script.
# --chmod needs BuildKit, which this file already requires for its heredoc RUN.
COPY download_models.py validate.py main.py ./
COPY --chmod=755 entrypoint.sh ./

# ── Download models at build time ─────────────────────────────────────────────
# MFR (formula recognition, ~1-2 GB) excluded — disabled in config.
# rapidocr-onnxruntime models are BUNDLED in the pip wheel; no download needed.
# NOTE(review): this step runs after main.py, validate.py and entrypoint.sh are
# copied, so any change to those files invalidates this layer's cache and the
# download runs again. Copying only download_models.py before this step would
# avoid that — confirm first that download_models.py does not import the others.
RUN python download_models.py

# Place a copy of /root/magic-pdf.json under /app/config.
# (The file is presumably written by download_models.py in the step above — confirm.)
RUN mkdir -p /app/config \
    && cp /root/magic-pdf.json /app/config/

# ── Runtime ───────────────────────────────────────────────────────────────────
# EXPOSE only documents the port; 7860 is the same value as the PORT variable
# set with ENV earlier in this file.
# ENTRYPOINT uses the exec (JSON array) form, so the script is started directly
# rather than through a `/bin/sh -c` wrapper.
EXPOSE 7860
ENTRYPOINT ["/app/entrypoint.sh"]