File size: 4,416 Bytes
d42d358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# ─────────────────────────────────────────────────────────────────────────────
# MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend)
#
# Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU
#
# System packages — what was removed and why:
#   libreoffice      — 1.5 GB installed; caused build timeouts/OOM.
#   libsm6 libxext6
#   libxrender-dev   — X11 display stubs; only needed for cv2.imshow() GUI.
#                      Headless server never opens a display.
#   libmagic1        — Only needed by python-magic, which is not used.
#   wget curl        — Runtime testing tools, not needed inside container.
#
# System packages — what was kept and why:
#   libgl1           — OpenCV requires libGL.so.1 for all image ops (not GUI).
#   libglib2.0-0     — GLib; required by OpenCV and many C extensions.
#   libgomp1         — OpenMP; required by ONNX Runtime and YOLO inference.
#   poppler-utils    — pdfinfo / pdftoppm used by MinerU PDF pre-processing.
#
# Pip strategy — TWO separate RUN layers for cache granularity:
#   Layer 1: small/fast packages + opencv-python-headless.
#            opencv-python-headless MUST be in this layer so that when
#            magic-pdf resolves cv2 in layer 2, the headless wheel is already
#            present and pip keeps it (avoids pulling in the full X11 build).
#   Layer 2: magic-pdf[full-cpu] — large, slow, custom wheel index.
#            Separate layer so code-only rebuilds don't re-download 2+ GB.
# ─────────────────────────────────────────────────────────────────────────────

FROM python:3.10-slim

# Runtime environment, declared in a single instruction:
#   PYTHONUNBUFFERED=1         stdout/stderr are not buffered by Python.
#   PYTHONDONTWRITEBYTECODE=1  Python does not write .pyc files.
#   PORT=7860                  same port number that is EXPOSEd below.
#   MINERU_DEVICE_MODE / MINERU_BACKEND
#                              select the CPU "pipeline" configuration;
#                              presumably read by the application code,
#                              which is not visible here (TODO confirm).
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=7860 \
    MINERU_DEVICE_MODE=cpu \
    MINERU_BACKEND=pipeline

# ── System dependencies (minimal confirmed set) ───────────────────────────────
# Index refresh, install and list cleanup all happen in ONE layer so the apt
# package lists never persist in the image. `set -e` aborts the layer on the
# first failing command, which is equivalent to chaining every step with `&&`.
RUN set -e; \
    apt-get update; \
    apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        poppler-utils \
    ; \
    rm -rf /var/lib/apt/lists/*

# All following relative paths (COPY destinations, scripts) resolve to /app.
WORKDIR /app

# ── Layer 1: small + opencv-headless (cached unless versions change) ──────────
# Lightweight web/imaging dependencies plus opencv-python-headless, installed
# ahead of the large magic-pdf layer with the intent that cv2 is provided by
# the headless wheel rather than the X11-dependent opencv-python build.
# NOTE(review): pip matches requirements by distribution name, so this only
# helps if nothing in magic-pdf's dependency tree requires `opencv-python`
# itself — confirm with `pip list` in the built image.
RUN pip install --no-cache-dir --timeout=300 \
        'fastapi>=0.115.0' \
        'uvicorn[standard]>=0.32.0' \
        'python-multipart>=0.0.12' \
        'Pillow>=10.0.0' \
        'pillow-heif>=0.18.0' \
        'huggingface_hub>=0.25.0' \
        'opencv-python-headless>=4.8.0'

# ── Layer 2: magic-pdf (large; cached unless version pin changes) ─────────────
# Exact version pin keeps this heavy layer reproducible. The extra index is
# consulted in addition to PyPI for wheels that magic-pdf's extras need.
RUN pip install \
        --no-cache-dir \
        --timeout=300 \
        --extra-index-url=https://myhloli.github.io/wheels/ \
        'magic-pdf[full-cpu]==1.3.12'

# ── Download models at build time ─────────────────────────────────────────────
# Baked into image = zero cold-start download delay.
# Only download_models.py is copied before the download step, so this layer's
# cache key depends solely on that script and the layers above it. Editing
# main.py, validate.py or entrypoint.sh therefore does NOT invalidate the slow
# (~15-minute) model download. Previously all application files were copied
# first, so any code change forced a full re-download: a rebuilt layer starts
# from a filesystem without the models, so skip-if-exists logic in the script
# cannot help.
# NOTE(review): assumes download_models.py does not import main.py or
# validate.py — if it does, add that file to this COPY.
# MFR (formula recognition, ~1-2 GB) is excluded — disabled in config.
COPY download_models.py .
RUN python download_models.py

# Persist config; entrypoint.sh restores it if /root is wiped on restart.
RUN mkdir -p /app/config && cp /root/magic-pdf.json /app/config/magic-pdf.json

# ── Application code ──────────────────────────────────────────────────────────
# Copied last because these files change most often; everything above stays
# cached on code-only rebuilds.
COPY validate.py main.py entrypoint.sh ./
RUN chmod +x entrypoint.sh

# ── Runtime ───────────────────────────────────────────────────────────────────
# EXPOSE only documents the listening port; it is taken from the PORT variable
# set by ENV earlier in this file (7860) so the two cannot drift apart.
# Exec-form ENTRYPOINT runs the script directly, without a wrapping shell.
EXPOSE ${PORT}
ENTRYPOINT ["/app/entrypoint.sh"]