Spaces:

outcomelabs
/

md-parser

Paused

App Files Files Community

md-parser / Dockerfile

sidoutcome

feat: support both API_TOKEN and API_DEV_TOKEN

6162371 2 months ago

raw

history blame contribute delete

5.14 kB

	# Hugging Face Spaces Dockerfile for MinerU Document Parser API
	# Based on official MinerU Docker deployment
	# Optimized for L40S GPU (Ada Lovelace architecture, 48GB VRAM)
	# Build: v1.4.0 - Using mineru[core] for full backend support

	# Use official vLLM image as base (includes CUDA, PyTorch, vLLM properly configured)
	# v0.14.1 includes security patches (CVE-2025-66448/CVE-2025-30165) and memory leak fixes
	# Supports Ampere, Ada Lovelace, Hopper architectures (L40S is Ada Lovelace)
	# NOTE(review): the tag is pinned but the digest is not — consider pinning by
	# digest if fully reproducible builds are required.
	FROM vllm/vllm-openai:v0.14.1

	# Root is needed for the apt-get install and useradd steps that follow;
	# the build switches to the unprivileged "user" account further down.
	USER root

	# Log marker carrying the UTC time at which this layer was executed.
	# $(date ...) is expanded by the build shell, so a cached layer does not re-run it.
	RUN echo "========== BUILD STARTED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') =========="

	# STEP 1 — OS packages: Noto fonts and fontconfig (fonts are required by MinerU),
	# curl (used by the HEALTHCHECK), plus libgl1 and poppler-utils.
	# The font cache is rebuilt and the apt package lists are removed within this
	# same layer so they do not persist in the image.
	RUN echo "========== STEP 1: Installing system dependencies ==========" \
	    && apt-get update \
	    && apt-get install -y --no-install-recommends \
	        curl \
	        fontconfig \
	        fonts-noto-cjk \
	        fonts-noto-core \
	        libgl1 \
	        poppler-utils \
	    && fc-cache -fv \
	    && rm -rf /var/lib/apt/lists/* \
	    && echo "========== System dependencies installed =========="

	# Hugging Face Spaces requires a non-root account: create "user" with UID 1000
	# and a home directory.
	RUN useradd --create-home --uid 1000 user

	# Runtime environment. MINERU_MODEL_SOURCE is intentionally absent here; it is
	# set later, after the model download step.

	# Python interpreter behaviour: unbuffered output, no .pyc files
	ENV PYTHONUNBUFFERED=1 \
	    PYTHONDONTWRITEBYTECODE=1

	# Application defaults
	ENV MINERU_BACKEND=pipeline \
	    MINERU_LANG=en \
	    MAX_FILE_SIZE_MB=1024 \
	    VLLM_GPU_MEMORY_UTILIZATION=0.4

	# Home directory and cache locations, all under /home/user
	ENV HOME=/home/user \
	    XDG_CACHE_HOME=/home/user/.cache \
	    HF_HOME=/home/user/.cache/huggingface \
	    TORCH_HOME=/home/user/.cache/torch \
	    MODELSCOPE_CACHE=/home/user/.cache/modelscope

	# Search paths: user-level pip scripts come first on PATH; LD_LIBRARY_PATH is
	# prefixed with the pip-installed NVIDIA cuDNN directory (libcudnn.so.9).
	# The python3.12 segment is hard-coded and must match the interpreter in the base image.
	ENV PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH \
	    LD_LIBRARY_PATH=/home/user/.local/lib/python3.12/site-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH

	# Pre-create the app directory and the cache directories named in the ENV
	# block, then give the whole home directory to "user".
	RUN mkdir --parents \
	        /home/user/app \
	        /home/user/.cache/huggingface \
	        /home/user/.cache/modelscope \
	        /home/user/.cache/torch \
	    && chown --recursive user:user /home/user

	# All remaining steps run unprivileged, from the app directory
	USER user
	WORKDIR /home/user/app

	# Copy only requirements.txt first so the dependency layer below is rebuilt
	# only when the requirements change, not on every source edit.
	COPY --chown=user:user requirements.txt .

	# STEP 2 — Python dependencies, installed into the user site (~/.local).
	# - nvidia-cudnn-cu12 provides libcudnn.so.9 required by torch.
	# - modelscope is force-reinstalled in user space for torch compatibility.
	# - --no-cache-dir keeps pip's download cache (which would land under
	#   XDG_CACHE_HOME=/home/user/.cache) out of the image layer.
	# - The closing pipeline prints the key packages; the pipe and the regex
	#   alternation must be unescaped "|" — an escaped "\|" is passed to pip as a
	#   literal argument and makes grep match a literal pipe character.
	#   grep exits non-zero (failing the build) if none of the packages is listed.
	RUN echo "========== STEP 2: Installing Python dependencies ==========" \
	    && pip install --user --no-cache-dir --upgrade pip \
	    && pip install --user --no-cache-dir nvidia-cudnn-cu12 \
	    && pip install --user --no-cache-dir -r requirements.txt \
	    && echo "Reinstalling modelscope in user space for torch compatibility..." \
	    && pip install --user --no-cache-dir --force-reinstall modelscope \
	    && echo "Installed packages:" \
	    && pip list --user | grep -E "(mineru|fastapi|uvicorn|httpx|pydantic|modelscope|torch|cudnn|doclayout)" \
	    && echo "========== Python dependencies installed =========="

	# STEP 3a — write the MinerU config before any model is downloaded.
	# mineru-models-download reads ~/mineru.json to learn where models are stored;
	# both the pipeline and vlm entries point at the same models directory.
	RUN echo "========== STEP 3a: Creating MinerU config ==========" \
	    && mkdir -p /home/user/.cache/mineru/models \
	    && printf '%s\n' '{"models-dir": {"pipeline": "/home/user/.cache/mineru/models", "vlm": "/home/user/.cache/mineru/models"}, "config_version": "1.3.1"}' > /home/user/mineru.json \
	    && cat /home/user/mineru.json \
	    && echo "========== MinerU config created =========="

	# STEP 3b — download all MinerU models with the official tool, then print a
	# best-effort summary of the caches.
	#
	# Each summary command is wrapped in { ...; } with its own "||" fallback.
	# Without the grouping, "&&" and "||" bind left-to-right with equal precedence,
	# so a failing mineru-models-download would fall through to the first
	# "|| echo" and the build would succeed with no models in the image.
	# With the grouping, a failed download aborts the build, while a missing
	# directory in the summary only prints the placeholder text.
	RUN echo "========== STEP 3b: Downloading MinerU models ==========" \
	    && echo "This downloads all required models (~4-5GB)..." \
	    && echo "Cache directories before download:" \
	    && ls -la /home/user/.cache/ \
	    && echo "Downloading all models from huggingface..." \
	    && mineru-models-download --source huggingface --model_type all \
	    && echo "" \
	    && echo "========== Model cache summary ==========" \
	    && echo "MinerU models cache:" \
	    && { du -sh /home/user/.cache/mineru 2>/dev/null || echo " (empty)"; } \
	    && { ls -la /home/user/.cache/mineru/models 2>/dev/null || echo " (no files)"; } \
	    && { find /home/user/.cache/mineru -type f 2>/dev/null | head -20 || echo " (no files found)"; } \
	    && echo "HuggingFace cache:" \
	    && { du -sh /home/user/.cache/huggingface 2>/dev/null || echo " (empty)"; } \
	    && echo "Total cache size:" \
	    && { du -sh /home/user/.cache 2>/dev/null || echo " (empty)"; } \
	    && echo "========== Models downloaded =========="

	# Switch MinerU to the local model source only now, after the download step,
	# so models are not fetched again at runtime.
	ENV MINERU_MODEL_SOURCE=local

	# Application source: the whole build context, owned by "user"
	COPY --chown=user:user . .

	# Log the resulting app directory and the UTC completion time
	RUN echo "Files in app directory:" \
	    && ls -la /home/user/app/ \
	    && echo "========== BUILD COMPLETED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') =========="

	# Port the uvicorn server below binds to (documentation only; does not publish it)
	EXPOSE 7860

	# Health check: probe the root endpoint every 30s, allowing a 300s start-up
	# period and 5 retries before the container is marked unhealthy.
	# The fallback must be an unescaped "||": with "\|\|" the shell hands "||",
	# "exit" and "1" to curl as extra URLs and the probe can never succeed.
	# -f fails on HTTP errors, -sS hides the progress meter but keeps error messages.
	HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=5 \
	    CMD curl -fsS http://localhost:7860/ || exit 1

	# Override vLLM entrypoint and run our FastAPI server
	# (exec form, single worker, 300s keep-alive timeout)
	ENTRYPOINT []
	CMD ["/usr/bin/python3", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "300"]