Spaces:

crazylemonade
/

openskill-ocr

Sleeping

App Files Files Community

openskill-ocr / Dockerfile

crazylemonade

Upload 7 files

0ad3f89 verified 14 days ago

Raw

History Blame Contribute Delete

8.4 kB

	# ─────────────────────────────────────────────────────────────────────────────
	# MinerU OCR Service — Hugging Face Docker Space (CPU / pipeline backend)
	#
	# Optimized for FREE tier: 2 vCPU · 16 GB RAM · 50 GB Disk · No GPU
	#
	# ── OCR ROUTING ARCHITECTURE ──────────────────────────────────────────────────
	#
	# FAST PATH (images: jpg/png/webp/bmp/heic/etc)
	# rapidocr-onnxruntime ≥ 1.3.22
	# - Pure ONNX inference — no PaddleOCR / paddlepaddle needed
	# - Models bundled in the pip wheel (~50 MB); no first-use download
	# - Target latency: 1–5 s on CPU
	# Multi-pass: if RapidOCR confidence < 0.65 → MinerU fallback automatically
	#
	# HEAVY PATH (PDFs, multi-page, forms with layout)
	# MinerU (magic-pdf pipeline backend)
	# - Layout detection (doclayout_yolo)
	# - OCR (paddleocr2pytorch — PyTorch reimplementation bundled in wheel)
	# - Markdown reconstruction
	# - Target latency: 5–30 s on CPU
	#
	# ── ROOT CAUSE HISTORY ────────────────────────────────────────────────────────
	#
	# FAILURE 1: [full-cpu] is NOT a valid extra → pip only prints a warning and
	# installs the base package without any of the optional dependencies
	# Fix: magic-pdf[full]==1.3.12
	#
	# FAILURE 2: opencv non-headless conflict
	# Fix: Layer 4 force-reinstall of opencv-python-headless
	#
	# FAILURE 3: ch_PP-OCRv3_det_infer.pth not in HF repo (repo updated to v5)
	# Fix: Layer 3.5 patches models_config.yml inside installed wheel:
	# ch_PP-OCRv3_det → ch_PP-OCRv5_det (all ch* langs)
	# en_PP-OCRv3_det → Multilingual_PP-OCRv3_det (en, latin)
	# Arch safety: both replacement stems verified in arch_config.yaml
	#
	# ── System packages ────────────────────────────────────────────────────────────
	# libgl1 — provides libGL.so.1; the non-headless cv2 build fails at import without it, so every image operation breaks (not just GUI calls)
	# libglib2.0-0 — GLib; required by OpenCV and many C extensions
	# libgomp1 — OpenMP; required by ONNX Runtime and YOLO inference
	# poppler-utils — pdfinfo/pdftoppm; used by MinerU PDF pre-processing
	# ─────────────────────────────────────────────────────────────────────────────

	# Base image: slim CPython 3.10 with the Debian release pinned explicitly.
	# The floating `python:3.10-slim` tag is re-pointed to newer Debian releases
	# over time; the apt package names installed later in this file
	# (e.g. libglib2.0-0) are release-specific, so an unpinned base can break or
	# silently change a rebuild.
	FROM python:3.10-slim-bookworm

	# Environment baked into the image, declared in one instruction.
	#   PYTHONUNBUFFERED=1         — Python writes stdout/stderr without buffering
	#   PYTHONDONTWRITEBYTECODE=1  — Python does not write .pyc files
	#   PORT=7860                  — same port number as the EXPOSE at the end of
	#                                this file; presumably read by entrypoint.sh /
	#                                main.py (not visible here — TODO confirm)
	#   MINERU_DEVICE_MODE / MINERU_BACKEND — select the CPU "pipeline" setup named
	#                                in the file header; the consumer of these
	#                                variables is outside this file
	ENV PYTHONUNBUFFERED=1 \
	    PYTHONDONTWRITEBYTECODE=1 \
	    PORT=7860 \
	    MINERU_DEVICE_MODE=cpu \
	    MINERU_BACKEND=pipeline

	# ── System dependencies ────────────────────────────────────────────────────────
	# Refresh the package index, install the four native packages listed in the
	# file header, and delete the apt lists inside the same layer so they never
	# reach the image. `set -e` aborts on the first failing step, which gives the
	# `;`-separated commands the same stop-on-error behaviour as an `&&` chain.
	RUN set -e; \
	    apt-get update; \
	    apt-get install -y --no-install-recommends \
	        libgl1 \
	        libglib2.0-0 \
	        libgomp1 \
	        poppler-utils; \
	    rm -rf /var/lib/apt/lists/*

	# Working directory for every following RUN/COPY and for the container
	# process; the relative COPY destinations (`.`) further down resolve to /app,
	# which is where ENTRYPOINT expects entrypoint.sh.
	WORKDIR /app

	# ── Layer 1: FastAPI + lightweight runtime deps ───────────────────────────────
	# Web stack (fastapi, uvicorn, python-multipart), image handling (Pillow,
	# pillow-heif), Hub client, office-document readers, and the fast-path OCR
	# engine.
	#
	# rapidocr-onnxruntime notes:
	#   - ONNX-based OCR; its models ship inside the wheel (~50 MB), so nothing is
	#     downloaded on first use
	#   - depends on onnxruntime, numpy, pyclipper and shapely; these are resolved
	#     here or later by the magic-pdf[full] install
	#
	# opencv-python-headless is only a placeholder at this point: Layer 4
	# force-reinstalls it after the magic-pdf dependency tree has been installed.
	RUN pip install --no-cache-dir --timeout=300 \
	    'fastapi>=0.115.0' \
	    'uvicorn[standard]>=0.32.0' \
	    'python-multipart>=0.0.12' \
	    'Pillow>=10.0.0' \
	    'pillow-heif>=0.18.0' \
	    'huggingface_hub>=0.25.0' \
	    'opencv-python-headless>=4.8.0' \
	    'rapidocr-onnxruntime>=1.3.22' \
	    'python-docx>=1.1.0' \
	    'python-pptx>=0.6.23' \
	    'openpyxl>=3.1.0'

	# ── Layer 2: CPU-only PyTorch — MUST precede magic-pdf ───────────────────────
	# The default torch wheel on PyPI is the CUDA-enabled build (~2.5 GB).
	# --index-url swaps PyPI for the official PyTorch CPU wheel index for this one
	# command, so only CPU builds are candidates. Because torch is then already
	# installed when Layer 3 runs, pip accepts it as satisfying magic-pdf's torch
	# requirement instead of fetching the CUDA build.
	RUN pip install \
	    --no-cache-dir \
	    --timeout=600 \
	    --index-url=https://download.pytorch.org/whl/cpu \
	    'torch>=2.2.2,!=2.5.0,!=2.5.1,<3' \
	    'torchvision>=0.15.2'

	# ── Layer 3: magic-pdf with the CORRECT extras ────────────────────────────────
	# The [full] extra brings in ultralytics, doclayout-yolo==0.0.2b1, rapid-table,
	# shapely, pyclipper, omegaconf, matplotlib, ftfy, dill, PyYAML, openai and
	# albumentations.
	#   - doclayout-yolo==0.0.2b1 is published ONLY on the myhloli wheel index
	#     (not on PyPI), hence --extra-index-url, which adds that index alongside
	#     PyPI
	#   - onnxruntime arrives automatically as a transitive dependency of
	#     rapid-table
	RUN pip install \
	    --no-cache-dir \
	    --timeout=600 \
	    --extra-index-url=https://myhloli.github.io/wheels/ \
	    'magic-pdf[full]==1.3.12'

	# ── Layer 3.5: Patch OCR model config ────────────────────────────────────────
	# HF repo opendatalab/PDF-Extract-Kit-1.0 was updated to v5 det models, while
	# magic-pdf 1.3.12's models_config.yml still references v3 det files that are
	# no longer there. This step rewrites the `det` entries inside the installed
	# package at build time, before download_models.py runs.
	#
	# Behaviour of the script:
	#   - every `lang` entry whose `det` file is a key of DET_MAP is switched to
	#     the mapped replacement
	#   - before switching, the replacement's stem (file name without ".pth") must
	#     appear as a key in arch_config.yaml; otherwise the build fails (exit 1)
	#   - if nothing matched, the file is left untouched and a warning is printed
	#     instead of claiming the config was updated
	#
	# `<<-` (rather than `<<`) strips leading TAB characters from the heredoc body
	# and its terminator, so the script works whether or not this file is
	# tab-indented. Python indentation inside the heredoc therefore uses spaces.
	RUN python3 - <<-'PYEOF'
	import sys
	from pathlib import Path

	import yaml
	import magic_pdf

	pkg = Path(magic_pdf.__file__).parent
	cfg_path = pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml'
	arch_path = pkg / 'model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml'

	print(f"Patching: {cfg_path}")

	# Explicit UTF-8: the dump below uses allow_unicode=True, so the result must
	# not depend on the build environment's locale encoding.
	with open(cfg_path, encoding='utf-8') as f:
	    config = yaml.safe_load(f)
	with open(arch_path, encoding='utf-8') as f:
	    arch_text = f.read()

	# Detection weights referenced by the wheel -> replacement to use instead.
	DET_MAP = {
	    'ch_PP-OCRv3_det_infer.pth': 'ch_PP-OCRv5_det_infer.pth',
	    'en_PP-OCRv3_det_infer.pth': 'Multilingual_PP-OCRv3_det_infer.pth',
	}

	patched = 0
	for lang, files in config['lang'].items():
	    # Skip malformed/empty language entries instead of crashing on .get().
	    if not isinstance(files, dict):
	        continue
	    old = files.get('det', '')
	    if old not in DET_MAP:
	        continue
	    new = DET_MAP[old]
	    arch_key = new[:-4]  # drop the ".pth" suffix
	    # Refuse to point at weights that have no architecture definition.
	    if (arch_key + ':') not in arch_text:
	        print(f"ERROR: arch key '{arch_key}' not found in arch_config.yaml", file=sys.stderr)
	        sys.exit(1)
	    files['det'] = new
	    print(f" [{lang}] det: {old} -> {new}")
	    patched += 1

	if patched:
	    with open(cfg_path, 'w', encoding='utf-8') as f:
	        # sort_keys=False keeps the original key order so the rewritten file
	        # differs from the shipped one only in the patched values.
	        yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
	    print(f"Patched {patched} language entries. models_config.yml updated.")
	else:
	    # Nothing matched: most likely the installed config no longer uses the
	    # file names in DET_MAP. Make that visible in the build log.
	    print("WARNING: no det entries matched DET_MAP; models_config.yml left unchanged.", file=sys.stderr)
	PYEOF

	# ── Layer 4: Restore headless OpenCV ─────────────────────────────────────────
	# The Layer 3 dependency tree (doclayout-yolo / ultralytics / rapid-table)
	# installs the non-headless opencv-python package. Force-reinstalling the
	# headless build afterwards puts the headless cv2 back in place so that cv2
	# works on this slim image.
	RUN pip install \
	    --no-cache-dir \
	    --timeout=300 \
	    --force-reinstall \
	    'opencv-python-headless>=4.8.0'

	# ── Application code ──────────────────────────────────────────────────────────
	# The three Python modules are copied in one instruction. entrypoint.sh gets
	# its executable bit at copy time via --chmod, which removes the separate
	# `RUN chmod` layer (a second copy of the file in the image). --chmod needs
	# BuildKit, which this file already requires for its heredoc step.
	#
	# NOTE(review): all of these files land before the model-download step, so
	# editing main.py, validate.py or entrypoint.sh invalidates the cached model
	# download. If download_models.py is self-contained, copying only that file
	# before the download and the rest after it would avoid the re-download —
	# TODO confirm it imports none of the other files before reordering.
	COPY download_models.py validate.py main.py ./
	COPY --chmod=755 entrypoint.sh ./

	# ── Download models at build time ─────────────────────────────────────────────
	# Models are fetched while the image is built rather than at container start.
	#   - MFR (formula recognition, ~1-2 GB) is left out because it is disabled in
	#     the config
	#   - rapidocr-onnxruntime needs nothing here: its models are BUNDLED in the
	#     pip wheel
	RUN python download_models.py

	# Place a copy of the magic-pdf config under /app/config.
	# NOTE(review): /root/magic-pdf.json is presumably written by
	# download_models.py — nothing else in this file creates it; confirm.
	RUN mkdir -p /app/config \
	    && cp /root/magic-pdf.json /app/config/

	# ── Runtime ───────────────────────────────────────────────────────────────────
	# EXPOSE only documents the listening port (TCP is the default protocol and is
	# spelled out here); it matches the PORT value set near the top of the file.
	# The entrypoint uses exec form, so entrypoint.sh is started directly without
	# a wrapping shell.
	EXPOSE 7860/tcp
	ENTRYPOINT ["/app/entrypoint.sh"]