Spaces:
Running
Running
File size: 4,416 Bytes
# ─────────────────────────────────────────────────────────────────────────────
# MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend)
#
# Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU
#
# System packages — what was removed and why:
#   libreoffice      — 1.5 GB installed; caused build timeouts/OOM.
#   libsm6 libxext6
#   libxrender-dev   — X11 display stubs; only needed for cv2.imshow() GUI.
#                      A headless server never opens a display.
#   libmagic1        — Only needed by python-magic, which is not used.
#   wget curl        — Runtime testing tools; not needed inside the container.
#
# System packages — what was kept and why:
#   libgl1           — OpenCV requires libGL.so.1 for all image ops (not just GUI).
#   libglib2.0-0     — GLib; required by OpenCV and many C extensions.
#   libgomp1         — OpenMP; required by ONNX Runtime and YOLO inference.
#   poppler-utils    — pdfinfo / pdftoppm, used by MinerU PDF pre-processing.
#
# Pip strategy — TWO separate RUN layers for cache granularity:
#   Layer 1: small/fast packages + opencv-python-headless.
#            opencv-python-headless is installed in this layer so that cv2 is
#            already importable when magic-pdf is installed in layer 2; the
#            intent is to keep the headless wheel rather than pull in the
#            full X11 build.
#   Layer 2: magic-pdf[full-cpu] — large, slow, custom wheel index.
#            Kept as a separate layer so code-only rebuilds don't re-download 2+ GB.
# ─────────────────────────────────────────────────────────────────────────────
# Base image: Debian-based slim Python 3.10.
FROM python:3.10-slim

# Runtime environment, declared in a single instruction.
#   PYTHONUNBUFFERED=1         stdout/stderr are written immediately, so logs
#                              show up in the container log stream without delay.
#   PYTHONDONTWRITEBYTECODE=1  Python does not write .pyc files.
#   PORT                       same value as the port EXPOSEd at the end of
#                              this file (presumably read by the app/entrypoint
#                              — confirm).
#   MINERU_DEVICE_MODE /
#   MINERU_BACKEND             select CPU inference with the "pipeline" backend
#                              (presumably read by MinerU or main.py — confirm).
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PORT=7860 \
    MINERU_DEVICE_MODE=cpu \
    MINERU_BACKEND=pipeline
# ── System dependencies (minimal confirmed set) ──────────────────────────────
# update + install + list cleanup happen in ONE layer, so the apt package index
# is never stale and never ends up stored in the image.
# --no-install-recommends keeps optional extras out of the image.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# All following relative paths (COPY destinations, RUN commands) resolve
# against /app; WORKDIR creates the directory if it does not exist.
WORKDIR /app
# ── Layer 1: small packages + opencv-python-headless ─────────────────────────
# Kept separate from the magic-pdf install so this layer is reused from cache
# as long as this instruction (and everything before it) is unchanged.
# The specifiers are lower bounds (>=), so a cached layer keeps whatever
# versions were resolved when it was first built.
#
# opencv-python-headless is installed here so cv2 is present before magic-pdf
# is installed in the next layer.
# NOTE(review): pip treats opencv-python and opencv-python-headless as two
# different distributions. If magic-pdf (or one of its dependencies) requires
# "opencv-python", pre-installing the headless wheel may not prevent the full
# build from being installed as well — verify with `pip list` in the image.
RUN pip install --no-cache-dir --timeout=300 \
        "fastapi>=0.115.0" \
        "huggingface_hub>=0.25.0" \
        "opencv-python-headless>=4.8.0" \
        "Pillow>=10.0.0" \
        "pillow-heif>=0.18.0" \
        "python-multipart>=0.0.12" \
        "uvicorn[standard]>=0.32.0"
# ── Layer 2: magic-pdf (large; cached unless the version pin changes) ────────
# Exact version pin keeps this heavy layer reproducible and cacheable.
# The extra index is consulted in addition to PyPI for wheels.
# NOTE(review): confirm that release 1.3.12 actually defines the "full-cpu"
# extra; pip only prints a warning for an unknown extra and then installs the
# base package without the optional dependencies.
RUN pip install \
        --no-cache-dir \
        --timeout=300 \
        --extra-index-url=https://myhloli.github.io/wheels/ \
        "magic-pdf[full-cpu]==1.3.12"
# ── Download models at build time ────────────────────────────────────────────
# Baked into the image = zero cold-start download delay.
#
# Only download_models.py is copied before the download step. Docker
# invalidates every layer that follows a changed COPY, and a rebuilt RUN layer
# starts from its parent's filesystem (which contains no models), so
# skip-if-exists logic inside the script cannot save a re-download. Copying
# main.py / validate.py / entrypoint.sh AFTER this step is what lets code-only
# rebuilds reuse the cached model layer.
# NOTE(review): assumes download_models.py does not import validate.py or
# main.py at build time — confirm before merging.
# Per the original note, MFR (formula recognition, ~1-2 GB) is excluded and
# disabled in the generated config.
COPY download_models.py .
RUN python download_models.py

# Keep a copy of the generated config under /app/config; entrypoint.sh is
# expected to restore it if /root is wiped on restart.
RUN mkdir -p /app/config && cp /root/magic-pdf.json /app/config/magic-pdf.json

# ── Application code ─────────────────────────────────────────────────────────
# Copied last because it changes most often; edits here invalidate only the
# small layers below.
COPY validate.py main.py entrypoint.sh ./
RUN chmod +x entrypoint.sh
# ── Runtime ──────────────────────────────────────────────────────────────────
# EXPOSE is documentation only (it does not publish the port); the number
# matches the PORT environment variable set near the top of this file.
EXPOSE 7860/tcp

# Exec (JSON-array) form: the script is started directly, without a wrapping
# /bin/sh -c, so it receives container signals such as SIGTERM itself.
ENTRYPOINT ["/app/entrypoint.sh"]
|