Spaces:

sfdghsdvxfbgn
/

node-2

Sleeping

App Files Files Community

node-2 / Dockerfile

sfdghsdvxfbgn

Upload 7 files

d42d358 verified 15 days ago

Raw

History Blame Contribute Delete

4.42 kB

	# ─────────────────────────────────────────────────────────────────────────────
	# MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend)
	#
	# Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU
	#
	# System packages — what was removed and why:
	# libreoffice — 1.5 GB installed; caused build timeouts/OOM.
	# libsm6 libxext6
	# libxrender-dev — X11 display stubs; only needed for cv2.imshow() GUI.
	# Headless server never opens a display.
	# libmagic1 — Only needed by python-magic, which is not used.
	# wget curl — Runtime testing tools, not needed inside container.
	#
	# System packages — what was kept and why:
	# libgl1 — OpenCV requires libGL.so.1 for all image ops (not GUI).
	# libglib2.0-0 — GLib; required by OpenCV and many C extensions.
	# libgomp1 — OpenMP; required by ONNX Runtime and YOLO inference.
	# poppler-utils — pdfinfo / pdftoppm used by MinerU PDF pre-processing.
	#
	# Pip strategy — TWO separate RUN layers for cache granularity:
	# Layer 1: small/fast packages + opencv-python-headless.
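	# opencv-python-headless is installed in this layer so that it is
	# already present when magic-pdf is installed in layer 2. Note that
	# pip matches requirements by distribution name, not by the cv2
	# import: the full X11 build is avoided only if nothing in layer 2
	# requires "opencv-python" by name (verify with `pip list`).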
	# Layer 2: magic-pdf[full-cpu] — large, slow, custom wheel index.
	# Separate layer so code-only rebuilds don't re-download 2+ GB.
	# ─────────────────────────────────────────────────────────────────────────────

	# Base image: slim variant of the official Python 3.10 image.
	FROM python:3.10-slim

	# Runtime environment, declared in one ENV instruction.
	#   PYTHONUNBUFFERED=1        — stdout/stderr are not buffered by the interpreter.
	#   PYTHONDONTWRITEBYTECODE=1 — the interpreter does not write .pyc files.
	#   PORT=7860                 — same value as the EXPOSE instruction in this file.
	#   MINERU_DEVICE_MODE / MINERU_BACKEND — presumably read by the application
	#     or MinerU to select CPU-only pipeline mode; confirm against main.py.
	ENV PYTHONUNBUFFERED=1 \
	    PYTHONDONTWRITEBYTECODE=1 \
	    PORT=7860 \
	    MINERU_DEVICE_MODE=cpu \
	    MINERU_BACKEND=pipeline

	# ── System dependencies (minimal confirmed set) ───────────────────────────────
	# Index refresh, install and list cleanup share a single RUN so the apt
	# package lists are never stored in an image layer.
	RUN apt-get update && \
	    apt-get install --yes --no-install-recommends \
	        libgl1 \
	        libglib2.0-0 \
	        libgomp1 \
	        poppler-utils && \
	    rm -rf /var/lib/apt/lists/*

	# All following relative paths (COPY destinations, RUN commands) resolve here.
	WORKDIR /app

	# ── Layer 1: small + opencv-headless (cached until this RUN line changes) ─────
	# Lightweight web/image dependencies, installed ahead of the heavy magic-pdf
	# layer so the two are cached independently.
	# opencv-python-headless is installed here so it is already in site-packages
	# when magic-pdf is installed in the next layer.
	# NOTE(review): pip resolves by distribution name, so this keeps the full
	# opencv-python build out only if no layer-2 requirement names "opencv-python"
	# — confirm with `pip list` in the built image.
	RUN pip install \
	        --no-cache-dir \
	        --timeout=300 \
	        "fastapi>=0.115.0" \
	        "uvicorn[standard]>=0.32.0" \
	        "python-multipart>=0.0.12" \
	        "Pillow>=10.0.0" \
	        "pillow-heif>=0.18.0" \
	        "huggingface_hub>=0.25.0" \
	        "opencv-python-headless>=4.8.0"

	# ── Layer 2: magic-pdf (large; cached unless version pin changes) ─────────────
	# Exact version pin; the extra index is searched in addition to PyPI.
	RUN pip install \
	        --no-cache-dir \
	        --timeout=300 \
	        --extra-index-url=https://myhloli.github.io/wheels/ \
	        "magic-pdf[full-cpu]==1.3.12"

	# ── Download models at build time ─────────────────────────────────────────────
	# Baked into image = zero cold-start download delay.
	# Only download_models.py is copied before this step, so the cache key of the
	# download layer depends on that one file. Edits to main.py, validate.py or
	# entrypoint.sh no longer invalidate the model download.
	# (Docker rebuilds every layer after a changed COPY, and a rebuilt RUN starts
	# from its parent layer's filesystem, so skip-if-exists logic inside the
	# script cannot reuse previously downloaded files across rebuilds.)
	# NOTE(review): assumes download_models.py does not import validate.py or
	# main.py — if it does, add those files to this COPY.
	# MFR (formula recognition, ~1-2 GB) is excluded — disabled in config.
	COPY download_models.py .
	RUN python download_models.py

	# Persist config; entrypoint.sh restores it if /root is wiped on restart.
	RUN mkdir -p /app/config && cp /root/magic-pdf.json /app/config/magic-pdf.json

	# ── Application code ──────────────────────────────────────────────────────────
	# Copied last because it changes most often; only these cheap layers are
	# rebuilt on a code-only change.
	COPY validate.py main.py entrypoint.sh ./
	RUN chmod +x entrypoint.sh

	# ── Runtime ───────────────────────────────────────────────────────────────────
	# EXPOSE only documents the listening port (TCP is the default protocol);
	# it does not publish it.
	EXPOSE 7860/tcp
	# Exec form: the script is started directly, without a wrapping shell.
	ENTRYPOINT ["/app/entrypoint.sh"]