nologik commited on
Commit
5dd0dc4
·
verified ·
1 Parent(s): 7ef921e

Add vLLM Docker image for DGX Spark (Blackwell GB10) with CUDA graphs support

Browse files
Files changed (3) hide show
  1. Dockerfile +184 -0
  2. README.md +150 -0
  3. vllm_cmakelists.patch +49 -0
Dockerfile ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================================
2
+ # DGX Spark Optimized vLLM - Built from main branch
3
+ # ============================================================================
4
+ # Purpose: Build vLLM from source to include non-gated activations support
5
+ # for Nemotron3-Nano and other hybrid Mamba-Transformer models
6
+ #
7
+ # Key Features:
8
+ # - vLLM built from main branch (includes PR #29004 for non-gated activations)
9
+ # - CUDA 13.0 support for DGX Spark (GB10, compute capability 12.1)
10
+ # - FlashInfer for optimized attention and MoE kernels
11
+ # - Full CUDA graph support for hybrid models
12
+ #
13
+ # Build:
14
+ # docker build -t vllm-dgx-spark:v11 .
15
+ #
16
+ # Usage:
17
+ # docker run --gpus all --ipc=host -p 8000:8000 \
18
+ # -e VLLM_FLASHINFER_MOE_BACKEND=latency \
19
+ # vllm-dgx-spark:v11 \
20
+ # serve <model> --quantization modelopt_fp4 --kv-cache-dtype fp8
21
+ # ============================================================================
22
+
23
+ FROM nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04
24
+
25
+ LABEL maintainer="avarok"
26
+ LABEL version="v11"
27
+ LABEL description="vLLM with non-gated activations support for Nemotron3-Nano on DGX Spark"
28
+
29
+ # Build arguments for cache busting and version pinning
+ # Bump CACHEBUST_* to invalidate the corresponding Docker layer cache without
+ # touching anything else; VLLM_COMMIT pins the vLLM revision checked out below.
30
+ ARG VLLM_COMMIT=main
31
+ ARG CACHEBUST_DEPS=1
32
+ ARG CACHEBUST_VLLM=1
33
+
34
+ # ============================================================================
35
+ # System Dependencies
36
+ # ============================================================================
+ # NOTE: the '#' lines inside this backslash-continued RUN are stripped by the
+ # Dockerfile parser before the shell sees them, so the continuation stays valid.
37
+ RUN apt-get update && apt-get install -y \
38
+ python3.12 python3.12-venv python3.12-dev python3-pip \
39
+ git wget curl patch \
40
+ cmake build-essential ninja-build \
41
+ # InfiniBand/RDMA libraries for multi-node
42
+ libibverbs1 libibverbs-dev ibverbs-providers rdma-core perftest \
43
+ # Network utilities
44
+ iproute2 iputils-ping net-tools openssh-client \
45
+ && rm -rf /var/lib/apt/lists/*
46
+
47
+ # ============================================================================
48
+ # Python Virtual Environment
49
+ # ============================================================================
+ # Putting the venv first on PATH makes every later `pip`/`python` resolve to it.
50
+ WORKDIR /workspace
51
+ RUN python3.12 -m venv /opt/venv
52
+ ENV PATH="/opt/venv/bin:$PATH"
53
+ ENV VIRTUAL_ENV="/opt/venv"
54
+
55
+ # Upgrade pip
56
+ RUN pip install --upgrade pip setuptools wheel
57
+
58
+ # ============================================================================
59
+ # PyTorch and Core Dependencies
60
+ # ============================================================================
+ # Re-declaring CACHEBUST_DEPS here scopes the build-arg so a bumped value only
+ # invalidates the dependency layers below, not the apt layer above.
61
+ ARG CACHEBUST_DEPS
62
+ # Install PyTorch with CUDA 13.0 support
63
+ RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
64
+
65
+ # Install xgrammar (structured output generation)
66
+ RUN pip install xgrammar
67
+
68
+ # Install FlashInfer (pre-release for CUDA 13.0 support)
69
+ RUN pip install flashinfer-python --pre
70
+
71
+ # IMPORTANT: Remove triton after installations as it causes CUDA 13.0 errors
72
+ # Both PyTorch and xgrammar pull it in as dependency
+ # `|| true` keeps the build going if triton was never installed.
73
+ RUN pip uninstall -y triton || true && echo "Triton removed (if present)"
74
+
75
+ # ============================================================================
76
+ # Clone and Build vLLM from Source
77
+ # ============================================================================
78
+ ARG CACHEBUST_VLLM
79
+ ARG VLLM_COMMIT
80
+
+ # Cloning into `.` relies on /workspace/vllm being empty (fresh WORKDIR).
81
+ WORKDIR /workspace/vllm
82
+ RUN git clone --recursive https://github.com/vllm-project/vllm.git . && \
83
+ git checkout ${VLLM_COMMIT} && \
84
+ echo "Building vLLM from commit: $(git rev-parse HEAD)"
85
+
86
+ # Prepare for existing torch installation
+ # NOTE(review): use_existing_torch.py is a vLLM helper that rewrites its
+ # requirements to reuse the torch installed above — confirm it still exists
+ # at the pinned commit.
87
+ RUN python3 use_existing_torch.py
88
+
89
+ # Remove flashinfer from requirements (we installed it separately)
90
+ RUN sed -i "/flashinfer/d" requirements/cuda.txt || true
91
+ RUN sed -i '/^triton\b/d' requirements/test.txt || true
92
+
93
+ # Install build requirements
94
+ RUN pip install -r requirements/build.txt
95
+
96
+ # ============================================================================
97
+ # CMakeLists Patch for DGX Spark (GB10)
98
+ # ============================================================================
99
+ # This patch removes problematic SM12.x architectures from certain kernel
100
+ # compilations that cause issues on DGX Spark's GB10 GPU
+ # `|| echo` makes the patch best-effort so upstream CMakeLists drift does not
+ # hard-fail the build; the echoed message is the only signal it was skipped.
101
+ COPY vllm_cmakelists.patch .
102
+ RUN patch -p1 < vllm_cmakelists.patch || echo "Patch may have already been applied or is not needed"
103
+
104
+ # ============================================================================
105
+ # Build Environment Variables
106
+ # ============================================================================
107
+ # GB10 compute capability 12.1 (Blackwell architecture)
108
+ # The 'f' suffix is the CUDA 12.9+/13 "family" arch qualifier: it builds
+ # family-conditional binaries compatible across the SM 12.x family. It is NOT
+ # classic '+PTX' forward-JIT — NOTE(review): confirm intent vs. CUDA 13 docs.
109
+ ENV TORCH_CUDA_ARCH_LIST="12.1f"
+ # NOTE(review): CUDA_VISIBLE_ARCHITECTURES is not a standard CUDA/CMake
+ # variable — verify something in this build actually consumes it.
110
+ ENV CUDA_VISIBLE_ARCHITECTURES="12.1"
111
+
112
+ # Triton paths
+ # Point any triton remnants at the system ptxas (the pip triton was removed).
113
+ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
114
+
115
+ # Note: Do NOT set TORCH_ALLOW_TF32_CUBLAS_OVERRIDE as it conflicts with PyTorch's new TF32 API
116
+ # TF32 is enabled by default on Ampere+ GPUs
117
+
118
+ # ============================================================================
119
+ # Build vLLM
120
+ # ============================================================================
+ # --no-build-isolation so the build sees the torch/cmake/ninja installed above.
121
+ RUN pip install --no-build-isolation . -v
122
+
123
+ # ============================================================================
124
+ # Clean up source directory to avoid import conflicts
125
+ # ============================================================================
126
+ # The source vllm/ directory must be removed or Python will import from it
127
+ # instead of the installed package (which has compiled _C extensions)
128
+ WORKDIR /workspace
129
+ RUN rm -rf /workspace/vllm
130
+
131
+ # ============================================================================
132
+ # Install Additional Runtime Dependencies
133
+ # ============================================================================
+ # Quote the extras spec: an unquoted ray[default] is a shell glob pattern and
+ # may expand or error depending on the shell's glob settings.
134
+ RUN pip install "ray[default]"
135
+
136
+ # ============================================================================
137
+ # Download Tiktoken Encodings
138
+ # ============================================================================
+ # Pre-fetch tokenizer data so serving works without outbound network access.
139
+ ENV TIKTOKEN_ENCODINGS_BASE=/workspace/tiktoken_encodings
140
+ RUN mkdir -p ${TIKTOKEN_ENCODINGS_BASE} && \
141
+ wget -O ${TIKTOKEN_ENCODINGS_BASE}/o200k_base.tiktoken \
142
+ "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
143
+ wget -O ${TIKTOKEN_ENCODINGS_BASE}/cl100k_base.tiktoken \
144
+ "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
145
+
146
+ # ============================================================================
147
+ # NCCL Configuration for InfiniBand/RoCE Multi-GPU
148
+ # ============================================================================
149
+ ENV NCCL_IB_DISABLE=0
150
+ ENV NCCL_DEBUG=WARN
151
+ ENV NCCL_NET_GDR_LEVEL=2
152
+ ENV NCCL_IB_TIMEOUT=23
+ # NOTE(review): GID index 0 is typical for InfiniBand; RoCE v2 fabrics usually
+ # need index 3 — verify against the target fabric.
153
+ ENV NCCL_IB_GID_INDEX=0
+ # PyTorch >= 2.2 renamed this knob to TORCH_NCCL_ASYNC_ERROR_HANDLING; set
+ # both so old and new torch honor it without relying on the deprecation shim.
154
+ ENV NCCL_ASYNC_ERROR_HANDLING=1
+ ENV TORCH_NCCL_ASYNC_ERROR_HANDLING=1
155
+ ENV TORCH_NCCL_BLOCKING_WAIT=1
156
+
157
+ # ============================================================================
158
+ # vLLM V1 Engine and Optimization Settings
159
+ # ============================================================================
160
+ # Enable V1 engine for hybrid model support
+ # (V1 is already the default on recent vLLM; setting it there is harmless.)
161
+ ENV VLLM_USE_V1=1
162
+
163
+ # FlashInfer attention backend
164
+ ENV VLLM_ATTENTION_BACKEND=FLASHINFER
165
+
166
+ # CUDA graph mode for hybrid Mamba-Transformer models
+ # NOTE(review): confirm the pinned vLLM commit reads VLLM_CUDA_GRAPH_MODE —
+ # cudagraph mode is often configured via the compilation config instead.
167
+ ENV VLLM_CUDA_GRAPH_MODE=full_and_piecewise
168
+
169
+ # FlashInfer MoE for NVFP4 quantization (required for non-gated activations like ReLU²)
170
+ ENV VLLM_USE_FLASHINFER_MOE_FP4=1
171
+ # Default baked into the image for SM12.1 compatibility; override at runtime
+ # with -e VLLM_FLASHINFER_MOE_BACKEND=... if needed.
172
+ ENV VLLM_FLASHINFER_MOE_BACKEND=latency
173
+
174
+ # ============================================================================
175
+ # Finalize
176
+ # ============================================================================
177
+ WORKDIR /workspace
178
+
179
+ # Expose vLLM API port
180
+ EXPOSE 8000
181
+
182
+ # Default entrypoint
+ # With ENTRYPOINT ["vllm"], arguments passed to `docker run` become vllm
+ # subcommands (e.g. `serve <model>`); CMD supplies the fallback `--help`.
183
+ ENTRYPOINT ["vllm"]
184
+ CMD ["--help"]
README.md ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: vLLM for DGX Spark (Blackwell GB10)
3
+ emoji: 🚀
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ license: apache-2.0
9
+ tags:
10
+ - vllm
11
+ - dgx-spark
12
+ - blackwell
13
+ - gb10
14
+ - nemotron
15
+ - cuda-13
16
+ ---
17
+
18
+ # vLLM for DGX Spark (Blackwell GB10)
19
+
20
+ Optimized vLLM Docker image for running Nemotron3-Nano and other models on NVIDIA DGX Spark with CUDA graphs enabled.
21
+
22
+ ## Credits
23
+
24
+ - **Model**: [cybermotaz/nemotron3-nano-nvfp4-w4a16](https://huggingface.co/cybermotaz/nemotron3-nano-nvfp4-w4a16) - NVFP4 quantization by [@cybermotaz](https://huggingface.co/cybermotaz)
25
+ - **Original Model**: [nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16) by NVIDIA
26
+ - **This Docker Image**: `avarok/vllm-dgx-spark` — resolves the DGX Spark (GB10/SM12.1) build and runtime issues described below
27
+
28
+ ## Performance
29
+
30
+ | Mode | Throughput |
31
+ |------|------------|
32
+ | Eager mode (`--enforce-eager`) | ~42 tok/s |
33
+ | **CUDA graphs enabled** | **~66-67 tok/s** |
34
+
35
+ **~60% speedup** with CUDA graphs on DGX Spark GB10!
36
+
37
+ ## Quick Start (One-Liner)
38
+
39
+ ```bash
40
+ docker run --rm -it --gpus all --ipc=host -p 8000:8000 -e VLLM_FLASHINFER_MOE_BACKEND=latency -v ~/.cache/huggingface:/root/.cache/huggingface avarok/vllm-dgx-spark:v11 serve cybermotaz/nemotron3-nano-nvfp4-w4a16 --quantization modelopt_fp4 --kv-cache-dtype fp8 --trust-remote-code --max-model-len 131072 --gpu-memory-utilization 0.85
41
+ ```
42
+
43
+ Then test with:
44
+ ```bash
45
+ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{"model":"cybermotaz/nemotron3-nano-nvfp4-w4a16","messages":[{"role":"user","content":"Hello!"}],"max_tokens":100}'
46
+ ```
47
+
48
+ ## What This Image Fixes
49
+
50
+ This image solves several compatibility issues when running vLLM on DGX Spark (Blackwell GB10, SM12.1):
51
+
52
+ | Issue | Solution |
53
+ |-------|----------|
54
+ | Non-gated activations (ReLU²) not supported | Built from vLLM main branch with PR #29004 |
55
+ | CUDA architecture mismatch | Built with `TORCH_CUDA_ARCH_LIST="12.1f"` for GB10 |
56
+ | SM120 CUTLASS kernel failures | Uses `VLLM_FLASHINFER_MOE_BACKEND=latency` |
57
+ | FP4/scaled_mm kernel issues | CMakeLists patch to restrict to SM10.0 |
58
+ | CUDA 13.0 compatibility | Full CUDA 13.0 + PyTorch cu130 support |
59
+
60
+ ## Docker Image
61
+
62
+ ```bash
63
+ docker pull avarok/vllm-dgx-spark:v11
64
+ ```
65
+
66
+ Image size: ~27GB
67
+
68
+ ## Building From Source
69
+
70
+ If you prefer to build the image yourself:
71
+
72
+ ```bash
73
+ git clone https://huggingface.co/avarok/vllm-dgx-spark
74
+ cd vllm-dgx-spark
75
+ docker build -t vllm-dgx-spark:v11 .
76
+ ```
77
+
78
+ Build time: ~45-60 minutes on DGX Spark
79
+
80
+ ## Environment Variables
81
+
82
+ | Variable | Value | Description |
83
+ |----------|-------|-------------|
84
+ | `VLLM_FLASHINFER_MOE_BACKEND` | `latency` | **Required** for SM12.1 compatibility |
85
+ | `VLLM_USE_V1` | `1` (default) | Use V1 engine |
86
+ | `VLLM_ATTENTION_BACKEND` | `FLASHINFER` (default) | FlashInfer attention |
87
+ | `VLLM_CUDA_GRAPH_MODE` | `full_and_piecewise` (default) | CUDA graph mode |
88
+
89
+ ## Full Run Command
90
+
91
+ ```bash
92
+ docker run -d --name vllm-nemotron \
93
+ --gpus all --ipc=host -p 8000:8000 \
94
+ -e VLLM_FLASHINFER_MOE_BACKEND=latency \
95
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
96
+ avarok/vllm-dgx-spark:v11 \
97
+ serve cybermotaz/nemotron3-nano-nvfp4-w4a16 \
98
+ --quantization modelopt_fp4 \
99
+ --kv-cache-dtype fp8 \
100
+ --trust-remote-code \
101
+ --max-model-len 131072 \
102
+ --gpu-memory-utilization 0.85 \
103
+ --enable-auto-tool-choice \
104
+ --tool-call-parser qwen3_coder \
105
+ --reasoning-parser deepseek_r1
106
+ ```
107
+
108
+ ## Startup Time
109
+
110
+ First startup takes ~8-10 minutes due to:
111
+ - torch.compile (~5 min)
112
+ - FlashInfer autotuning (~2 min)
113
+ - CUDA graph capture (~1 min)
114
+
115
+ Subsequent startups with cached compilation are faster.
116
+
117
+ ## Hardware Requirements
118
+
119
+ - NVIDIA DGX Spark with GB10 GPU (SM12.1, Blackwell architecture)
120
+ - 128GB unified memory
121
+ - CUDA 13.0+
122
+
123
+ ## Troubleshooting
124
+
125
+ ### "Failed to initialize cutlass TMA WS grouped gemm"
126
+ Make sure you're using `-e VLLM_FLASHINFER_MOE_BACKEND=latency`. The `throughput` backend has SM120 kernel issues on SM12.1.
127
+
128
+ ### Memory errors
129
+ Reduce `--gpu-memory-utilization` to 0.75 or lower, or reduce `--max-model-len`.
130
+
131
+ ### Slow performance (~42 tok/s instead of ~67 tok/s)
132
+ Check that CUDA graphs are enabled (no `--enforce-eager` flag) and startup completed successfully. Look for "Capturing CUDA graphs" in the logs.
133
+
134
+ ## Files in This Repo
135
+
136
+ - `Dockerfile` - Reproducible build for vLLM on DGX Spark
137
+ - `vllm_cmakelists.patch` - Patch for SM12.x kernel compatibility
138
+ - `README.md` - This file
139
+
140
+ ## License
141
+
142
+ Apache 2.0
143
+
144
+ ## Links
145
+
146
+ - [Docker Hub: avarok/vllm-dgx-spark](https://hub.docker.com/r/avarok/vllm-dgx-spark)
147
+ - [vLLM Project](https://github.com/vllm-project/vllm)
148
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer)
149
+ - [Original Nemotron Model](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16)
150
+ - [NVFP4 Quantization by cybermotaz](https://huggingface.co/cybermotaz/nemotron3-nano-nvfp4-w4a16)
vllm_cmakelists.patch ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================================
2
+ # vLLM CMakeLists.txt Patch for DGX Spark (GB10)
3
+ # ============================================================================
4
+ # This patch removes SM12.0/12.1 architectures from certain CUDA kernel
5
+ # compilations that have issues on DGX Spark's GB10 GPU.
6
+ #
7
+ # The GB10 GPU has compute capability 12.1, but certain FP4 and scaled_mm
8
+ # kernels compiled for SM12.x cause runtime errors. This patch restricts
9
+ # those kernels to SM10.0 (Hopper) architecture only, while still allowing
10
+ # the main model to run on SM12.x.
11
+ # ============================================================================
12
+
13
+ diff --git a/CMakeLists.txt b/CMakeLists.txt
14
+ index 7cb94f919..f860e533e 100644
15
+ --- a/CMakeLists.txt
16
+ +++ b/CMakeLists.txt
17
+ @@ -594,9 +594,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
18
+
19
+ # FP4 Archs and flags
20
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
21
+ - cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
22
+ + cuda_archs_loose_intersection(FP4_ARCHS "10.0f" "${CUDA_ARCHS}")
23
+ else()
24
+ - cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
25
+ + cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
26
+ endif()
27
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
28
+ set(SRCS
29
+ @@ -668,7 +668,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
30
+ endif()
31
+
32
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
33
+ - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
34
+ + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
35
+ else()
36
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
37
+ endif()
38
+ @@ -716,9 +716,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
39
+ endif()
40
+
41
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
42
+ - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
43
+ + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
44
+ else()
45
+ - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
46
+ + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
47
+ endif()
48
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
49
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")