nologik commited on
Commit
5dd0dc4
·
verified ·
1 Parent(s): 7ef921e

Add vLLM Docker image for DGX Spark (Blackwell GB10) with CUDA graphs support

Browse files
Files changed (3) hide show
  1. Dockerfile +184 -0
  2. README.md +150 -0
  3. vllm_cmakelists.patch +49 -0
Dockerfile ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================================
2
+ # DGX Spark Optimized vLLM - Built from main branch
3
+ # ============================================================================
4
+ # Purpose: Build vLLM from source to include non-gated activations support
5
+ # for Nemotron3-Nano and other hybrid Mamba-Transformer models
6
+ #
7
+ # Key Features:
8
+ # - vLLM built from main branch (includes PR #29004 for non-gated activations)
9
+ # - CUDA 13.0 support for DGX Spark (GB10, compute capability 12.1)
10
+ # - FlashInfer for optimized attention and MoE kernels
11
+ # - Full CUDA graph support for hybrid models
12
+ #
13
+ # Build:
14
+ # docker build -t vllm-dgx-spark:v11 .
15
+ #
16
+ # Usage:
17
+ # docker run --gpus all --ipc=host -p 8000:8000 \
18
+ # -e VLLM_FLASHINFER_MOE_BACKEND=latency \
19
+ # vllm-dgx-spark:v11 \
20
+ # serve <model> --quantization modelopt_fp4 --kv-cache-dtype fp8
21
+ # ============================================================================
22
+
23
+ FROM nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04
24
+
25
+ LABEL maintainer="avarok"
26
+ LABEL version="v11"
27
+ LABEL description="vLLM with non-gated activations support for Nemotron3-Nano on DGX Spark"
28
+
29
+ # Build arguments for cache busting and version pinning
+ # Bump CACHEBUST_* to invalidate the corresponding Docker layer cache without
+ # touching anything else; VLLM_COMMIT pins the vLLM revision checked out below.
30
+ ARG VLLM_COMMIT=main
31
+ ARG CACHEBUST_DEPS=1
32
+ ARG CACHEBUST_VLLM=1
33
+
34
+ # ============================================================================
35
+ # System Dependencies
36
+ # ============================================================================
+ # NOTE: the '#' lines inside this backslash-continued RUN are stripped by the
+ # Dockerfile parser before the shell sees them, so the continuation stays valid.
37
+ RUN apt-get update && apt-get install -y \
38
+ python3.12 python3.12-venv python3.12-dev python3-pip \
39
+ git wget curl patch \
40
+ cmake build-essential ninja-build \
41
+ # InfiniBand/RDMA libraries for multi-node
42
+ libibverbs1 libibverbs-dev ibverbs-providers rdma-core perftest \
43
+ # Network utilities
44
+ iproute2 iputils-ping net-tools openssh-client \
45
+ && rm -rf /var/lib/apt/lists/*
46
+
47
+ # ============================================================================
48
+ # Python Virtual Environment
49
+ # ============================================================================
+ # Putting the venv first on PATH makes every later `pip`/`python` resolve to it.
50
+ WORKDIR /workspace
51
+ RUN python3.12 -m venv /opt/venv
52
+ ENV PATH="/opt/venv/bin:$PATH"
53
+ ENV VIRTUAL_ENV="/opt/venv"
54
+
55
+ # Upgrade pip
56
+ RUN pip install --upgrade pip setuptools wheel
57
+
58
+ # ============================================================================
59
+ # PyTorch and Core Dependencies
60
+ # ============================================================================
+ # Re-declaring CACHEBUST_DEPS here scopes the build-arg so a bumped value only
+ # invalidates the dependency layers below, not the apt layer above.
61
+ ARG CACHEBUST_DEPS
62
+ # Install PyTorch with CUDA 13.0 support
63
+ RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
64
+
65
+ # Install xgrammar (structured output generation)
66
+ RUN pip install xgrammar
67
+
68
+ # Install FlashInfer (pre-release for CUDA 13.0 support)
69
+ RUN pip install flashinfer-python --pre
70
+
71
+ # IMPORTANT: Remove triton after installations as it causes CUDA 13.0 errors
72
+ # Both PyTorch and xgrammar pull it in as dependency
+ # `|| true` keeps the build going if triton was never installed.
73
+ RUN pip uninstall -y triton || true && echo "Triton removed (if present)"
74
+
75
+ # ============================================================================
76
+ # Clone and Build vLLM from Source
77
+ # ============================================================================
78
+ ARG CACHEBUST_VLLM
79
+ ARG VLLM_COMMIT
80
+
+ # Cloning into `.` relies on /workspace/vllm being empty (fresh WORKDIR).
81
+ WORKDIR /workspace/vllm
82
+ RUN git clone --recursive https://github.com/vllm-project/vllm.git . && \
83
+ git checkout ${VLLM_COMMIT} && \
84
+ echo "Building vLLM from commit: $(git rev-parse HEAD)"
85
+
86
+ # Prepare for existing torch installation
+ # NOTE(review): use_existing_torch.py is a vLLM helper that rewrites its
+ # requirements to reuse the torch installed above — confirm it still exists
+ # at the pinned commit.
87
+ RUN python3 use_existing_torch.py
88
+
89
+ # Remove flashinfer from requirements (we installed it separately)
90
+ RUN sed -i "/flashinfer/d" requirements/cuda.txt || true
91
+ RUN sed -i '/^triton\b/d' requirements/test.txt || true
92
+
93
+ # Install build requirements
94
+ RUN pip install -r requirements/build.txt
95
+
96
+ # ============================================================================
97
+ # CMakeLists Patch for DGX Spark (GB10)
98
+ # ============================================================================
99
+ # This patch removes problematic SM12.x architectures from certain kernel
100
+ # compilations that cause issues on DGX Spark's GB10 GPU
+ # `|| echo` makes the patch best-effort so upstream CMakeLists drift does not
+ # hard-fail the build; the echoed message is the only signal it was skipped.
101
+ COPY vllm_cmakelists.patch .
102
+ RUN patch -p1 < vllm_cmakelists.patch || echo "Patch may have already been applied or is not needed"
103
+
104
+ # ============================================================================
105
+ # Build Environment Variables
106
+ # ============================================================================
107
+ # GB10 compute capability 12.1 (Blackwell architecture)
108
+ # The 'f' suffix is the CUDA 12.9+/13 "family" arch qualifier: it builds
+ # family-conditional binaries compatible across the SM 12.x family. It is NOT
+ # classic '+PTX' forward-JIT — NOTE(review): confirm intent vs. CUDA 13 docs.
109
+ ENV TORCH_CUDA_ARCH_LIST="12.1f"
+ # NOTE(review): CUDA_VISIBLE_ARCHITECTURES is not a standard CUDA/CMake
+ # variable — verify something in this build actually consumes it.
110
+ ENV CUDA_VISIBLE_ARCHITECTURES="12.1"
111
+
112
+ # Triton paths
+ # Point any triton remnants at the system ptxas (the pip triton was removed).
113
+ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
114
+
115
+ # Note: Do NOT set TORCH_ALLOW_TF32_CUBLAS_OVERRIDE as it conflicts with PyTorch's new TF32 API
116
+ # TF32 is enabled by default on Ampere+ GPUs
117
+
118
+ # ============================================================================
119
+ # Build vLLM
120
+ # ============================================================================
+ # --no-build-isolation so the build sees the torch/cmake/ninja installed above.
121
+ RUN pip install --no-build-isolation . -v
122
+
123
+ # ============================================================================
124
+ # Clean up source directory to avoid import conflicts
125
+ # ============================================================================
126
+ # The source vllm/ directory must be removed or Python will import from it
127
+ # instead of the installed package (which has compiled _C extensions)
128
+ WORKDIR /workspace
129
+ RUN rm -rf /workspace/vllm
130
+
131
+ # ============================================================================
132
+ # Install Additional Runtime Dependencies
133
+ # ============================================================================
+ # Quote the extras spec: an unquoted ray[default] is a shell glob pattern and
+ # may expand or error depending on the shell's glob settings.
134
+ RUN pip install "ray[default]"
135
+
136
+ # ============================================================================
137
+ # Download Tiktoken Encodings
138
+ # ============================================================================
+ # Pre-fetch tokenizer data so serving works without outbound network access.
139
+ ENV TIKTOKEN_ENCODINGS_BASE=/workspace/tiktoken_encodings
140
+ RUN mkdir -p ${TIKTOKEN_ENCODINGS_BASE} && \
141
+ wget -O ${TIKTOKEN_ENCODINGS_BASE}/o200k_base.tiktoken \
142
+ "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
143
+ wget -O ${TIKTOKEN_ENCODINGS_BASE}/cl100k_base.tiktoken \
144
+ "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
145
+
146
+ # ============================================================================
147
+ # NCCL Configuration for InfiniBand/RoCE Multi-GPU
148
+ # ============================================================================
149
+ ENV NCCL_IB_DISABLE=0
150
+ ENV NCCL_DEBUG=WARN
151
+ ENV NCCL_NET_GDR_LEVEL=2
152
+ ENV NCCL_IB_TIMEOUT=23
+ # NOTE(review): GID index 0 is typical for InfiniBand; RoCE v2 fabrics usually
+ # need index 3 — verify against the target fabric.
153
+ ENV NCCL_IB_GID_INDEX=0
+ # PyTorch >= 2.2 renamed this knob to TORCH_NCCL_ASYNC_ERROR_HANDLING; set
+ # both so old and new torch honor it without relying on the deprecation shim.
154
+ ENV NCCL_ASYNC_ERROR_HANDLING=1
+ ENV TORCH_NCCL_ASYNC_ERROR_HANDLING=1
155
+ ENV TORCH_NCCL_BLOCKING_WAIT=1
156
+
157
+ # ============================================================================
158
+ # vLLM V1 Engine and Optimization Settings
159
+ # ============================================================================
160
+ # Enable V1 engine for hybrid model support
+ # (V1 is already the default on recent vLLM; setting it there is harmless.)
161
+ ENV VLLM_USE_V1=1
162
+
163
+ # FlashInfer attention backend
164
+ ENV VLLM_ATTENTION_BACKEND=FLASHINFER
165
+
166
+ # CUDA graph mode for hybrid Mamba-Transformer models
+ # NOTE(review): confirm the pinned vLLM commit reads VLLM_CUDA_GRAPH_MODE —
+ # cudagraph mode is often configured via the compilation config instead.
167
+ ENV VLLM_CUDA_GRAPH_MODE=full_and_piecewise
168
+
169
+ # FlashInfer MoE for NVFP4 quantization (required for non-gated activations like ReLU²)
170
+ ENV VLLM_USE_FLASHINFER_MOE_FP4=1
171
+ # Default baked into the image for SM12.1 compatibility; override at runtime
+ # with -e VLLM_FLASHINFER_MOE_BACKEND=... if needed.
172
+ ENV VLLM_FLASHINFER_MOE_BACKEND=latency
173
+
174
+ # ============================================================================
175
+ # Finalize
176
+ # ============================================================================
177
+ WORKDIR /workspace
178
+
179
+ # Expose vLLM API port
180
+ EXPOSE 8000
181
+
182
+ # Default entrypoint
+ # With ENTRYPOINT ["vllm"], arguments passed to `docker run` become vllm
+ # subcommands (e.g. `serve <model>`); CMD supplies the fallback `--help`.
183
+ ENTRYPOINT ["vllm"]
184
+ CMD ["--help"]
README.md ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: vLLM for DGX Spark (Blackwell GB10)
3
+ emoji: 🚀
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ tags:
10
+ - vllm
11
+ - dgx-spark
12
+ - blackwell
13
+ - gb10
14
+ - nemotron
15
+ - cuda-13
16
+ ---
17
+
18
+ # vLLM for DGX Spark (Blackwell GB10)
19
+
20
+ Optimized vLLM Docker image for running Nemotron3-Nano and other models on NVIDIA DGX Spark with CUDA graphs enabled.
21
+
22
+ ## Credits
23
+
24
+ - **Model**: [cybermotaz/nemotron3-nano-nvfp4-w4a16](https://huggingface.co/cybermotaz/nemotron3-nano-nvfp4-w4a16) - NVFP4 quantization by [@cybermotaz](https://huggingface.co/cybermotaz)
25
+ - **Original Model**: [nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16) by NVIDIA
26
+ - **This Docker Image**: `avarok/vllm-dgx-spark` — resolves the DGX Spark (GB10/SM12.1) build and runtime issues described below
27
+
28
+ ## Performance
29
+
30
+ | Mode | Throughput |
31
+ |------|------------|
32
+ | Eager mode (`--enforce-eager`) | ~42 tok/s |
33
+ | **CUDA graphs enabled** | **~66-67 tok/s** |
34
+
35
+ **~60% speedup** with CUDA graphs on DGX Spark GB10!
36
+
37
+ ## Quick Start (One-Liner)
38
+
39
+ ```bash
40
+ docker run --rm -it --gpus all --ipc=host -p 8000:8000 -e VLLM_FLASHINFER_MOE_BACKEND=latency -v ~/.cache/huggingface:/root/.cache/huggingface avarok/vllm-dgx-spark:v11 serve cybermotaz/nemotron3-nano-nvfp4-w4a16 --quantization modelopt_fp4 --kv-cache-dtype fp8 --trust-remote-code --max-model-len 131072 --gpu-memory-utilization 0.85
41
+ ```
42
+
43
+ Then test with:
44
+ ```bash
45
+ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{"model":"cybermotaz/nemotron3-nano-nvfp4-w4a16","messages":[{"role":"user","content":"Hello!"}],"max_tokens":100}'
46
+ ```
47
+
48
+ ## What This Image Fixes
49
+
50
+ This image solves several compatibility issues when running vLLM on DGX Spark (Blackwell GB10, SM12.1):
51
+
52
+ | Issue | Solution |
53
+ |-------|----------|
54
+ | Non-gated activations (ReLU²) not supported | Built from vLLM main branch with PR #29004 |
55
+ | CUDA architecture mismatch | Built with `TORCH_CUDA_ARCH_LIST="12.1f"` for GB10 |
56
+ | SM120 CUTLASS kernel failures | Uses `VLLM_FLASHINFER_MOE_BACKEND=latency` |
57
+ | FP4/scaled_mm kernel issues | CMakeLists patch to restrict to SM10.0 |
58
+ | CUDA 13.0 compatibility | Full CUDA 13.0 + PyTorch cu130 support |
59
+
60
+ ## Docker Image
61
+
62
+ ```bash
63
+ docker pull avarok/vllm-dgx-spark:v11
64
+ ```
65
+
66
+ Image size: ~27GB
67
+
68
+ ## Building From Source
69
+
70
+ If you prefer to build the image yourself:
71
+
72
+ ```bash
73
+ git clone https://huggingface.co/avarok/vllm-dgx-spark
74
+ cd vllm-dgx-spark
75
+ docker build -t vllm-dgx-spark:v11 .
76
+ ```
77
+
78
+ Build time: ~45-60 minutes on DGX Spark
79
+
80
+ ## Environment Variables
81
+
82
+ | Variable | Value | Description |
83
+ |----------|-------|-------------|
84
+ | `VLLM_FLASHINFER_MOE_BACKEND` | `latency` | **Required** for SM12.1 compatibility |
85
+ | `VLLM_USE_V1` | `1` (default) | Use V1 engine |
86
+ | `VLLM_ATTENTION_BACKEND` | `FLASHINFER` (default) | FlashInfer attention |
87
+ | `VLLM_CUDA_GRAPH_MODE` | `full_and_piecewise` (default) | CUDA graph mode |
88
+
89
+ ## Full Run Command
90
+
91
+ ```bash
92
+ docker run -d --name vllm-nemotron \
93
+ --gpus all --ipc=host -p 8000:8000 \
94
+ -e VLLM_FLASHINFER_MOE_BACKEND=latency \
95
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
96
+ avarok/vllm-dgx-spark:v11 \
97
+ serve cybermotaz/nemotron3-nano-nvfp4-w4a16 \
98
+ --quantization modelopt_fp4 \
99
+ --kv-cache-dtype fp8 \
100
+ --trust-remote-code \
101
+ --max-model-len 131072 \
102
+ --gpu-memory-utilization 0.85 \
103
+ --enable-auto-tool-choice \
104
+ --tool-call-parser qwen3_coder \
105
+ --reasoning-parser deepseek_r1
106
+ ```
107
+
108
+ ## Startup Time
109
+
110
+ First startup takes ~8-10 minutes due to:
111
+ - torch.compile (~5 min)
112
+ - FlashInfer autotuning (~2 min)
113
+ - CUDA graph capture (~1 min)
114
+
115
+ Subsequent startups with cached compilation are faster.
116
+
117
+ ## Hardware Requirements
118
+
119
+ - NVIDIA DGX Spark with GB10 GPU (SM12.1, Blackwell architecture)
120
+ - 128GB unified memory
121
+ - CUDA 13.0+
122
+
123
+ ## Troubleshooting
124
+
125
+ ### "Failed to initialize cutlass TMA WS grouped gemm"
126
+ Make sure you're using `-e VLLM_FLASHINFER_MOE_BACKEND=latency`. The `throughput` backend has SM120 kernel issues on SM12.1.
127
+
128
+ ### Memory errors
129
+ Reduce `--gpu-memory-utilization` to 0.75 or lower, or reduce `--max-model-len`.
130
+
131
+ ### Slow performance (~42 tok/s instead of ~67 tok/s)
132
+ Check that CUDA graphs are enabled (no `--enforce-eager` flag) and startup completed successfully. Look for "Capturing CUDA graphs" in the logs.
133
+
134
+ ## Files in This Repo
135
+
136
+ - `Dockerfile` - Reproducible build for vLLM on DGX Spark
137
+ - `vllm_cmakelists.patch` - Patch for SM12.x kernel compatibility
138
+ - `README.md` - This file
139
+
140
+ ## License
141
+
142
+ Apache 2.0
143
+
144
+ ## Links
145
+
146
+ - [Docker Hub: avarok/vllm-dgx-spark](https://hub.docker.com/r/avarok/vllm-dgx-spark)
147
+ - [vLLM Project](https://github.com/vllm-project/vllm)
148
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer)
149
+ - [Original Nemotron Model](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16)
150
+ - [NVFP4 Quantization by cybermotaz](https://huggingface.co/cybermotaz/nemotron3-nano-nvfp4-w4a16)
vllm_cmakelists.patch ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================================
2
+ # vLLM CMakeLists.txt Patch for DGX Spark (GB10)
3
+ # ============================================================================
4
+ # This patch removes SM12.0/12.1 architectures from certain CUDA kernel
5
+ # compilations that have issues on DGX Spark's GB10 GPU.
6
+ #
7
+ # The GB10 GPU has compute capability 12.1, but certain FP4 and scaled_mm
8
+ # kernels compiled for SM12.x cause runtime errors. This patch restricts
9
+ # those kernels to SM10.0 (Hopper) architecture only, while still allowing
10
+ # the main model to run on SM12.x.
11
+ # ============================================================================
12
+
13
+ diff --git a/CMakeLists.txt b/CMakeLists.txt
14
+ index 7cb94f919..f860e533e 100644
15
+ --- a/CMakeLists.txt
16
+ +++ b/CMakeLists.txt
17
+ @@ -594,9 +594,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
18
+
19
+ # FP4 Archs and flags
20
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
21
+ - cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
22
+ + cuda_archs_loose_intersection(FP4_ARCHS "10.0f" "${CUDA_ARCHS}")
23
+ else()
24
+ - cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
25
+ + cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
26
+ endif()
27
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
28
+ set(SRCS
29
+ @@ -668,7 +668,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
30
+ endif()
31
+
32
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
33
+ - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
34
+ + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
35
+ else()
36
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
37
+ endif()
38
+ @@ -716,9 +716,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
39
+ endif()
40
+
41
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
42
+ - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
43
+ + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
44
+ else()
45
+ - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
46
+ + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
47
+ endif()
48
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
49
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")