Add vLLM Docker image for DGX Spark (Blackwell GB10) with CUDA graphs support
Browse files- Dockerfile +184 -0
- README.md +150 -0
- vllm_cmakelists.patch +49 -0
Dockerfile
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ============================================================================
# DGX Spark Optimized vLLM - Built from main branch
# ============================================================================
# Purpose: Build vLLM from source to include non-gated activations support
# for Nemotron3-Nano and other hybrid Mamba-Transformer models
#
# Key Features:
# - vLLM built from main branch (includes PR #29004 for non-gated activations)
# - CUDA 13.0 support for DGX Spark (GB10, compute capability 12.1)
# - FlashInfer for optimized attention and MoE kernels
# - Full CUDA graph support for hybrid models
#
# Build:
#   docker build -t vllm-dgx-spark:v11 .
#
# Usage:
#   docker run --gpus all --ipc=host -p 8000:8000 \
#     -e VLLM_FLASHINFER_MOE_BACKEND=latency \
#     vllm-dgx-spark:v11 \
#     serve <model> --quantization modelopt_fp4 --kv-cache-dtype fp8
# ============================================================================

FROM nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04

LABEL maintainer="avarok"
LABEL version="v11"
LABEL description="vLLM with non-gated activations support for Nemotron3-Nano on DGX Spark"

# Build arguments for cache busting and version pinning
ARG VLLM_COMMIT=main
ARG CACHEBUST_DEPS=1
ARG CACHEBUST_VLLM=1

# ============================================================================
# System Dependencies
# ============================================================================
# --no-install-recommends keeps the image lean; ca-certificates is listed
# explicitly because the HTTPS wget downloads below depend on it.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.12 python3.12-venv python3.12-dev python3-pip \
    git wget curl patch ca-certificates \
    cmake build-essential ninja-build \
    # InfiniBand/RDMA libraries for multi-node
    libibverbs1 libibverbs-dev ibverbs-providers rdma-core perftest \
    # Network utilities
    iproute2 iputils-ping net-tools openssh-client \
    && rm -rf /var/lib/apt/lists/*

# ============================================================================
# Python Virtual Environment
# ============================================================================
WORKDIR /workspace
RUN python3.12 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"

# Upgrade pip (--no-cache-dir throughout to keep layers small)
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# ============================================================================
# PyTorch and Core Dependencies
# ============================================================================
# Re-declare so bumping CACHEBUST_DEPS invalidates the cache from this point.
ARG CACHEBUST_DEPS
# Install PyTorch with CUDA 13.0 support
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130

# Install xgrammar (structured output generation)
RUN pip install --no-cache-dir xgrammar

# Install FlashInfer (pre-release for CUDA 13.0 support)
RUN pip install --no-cache-dir flashinfer-python --pre

# IMPORTANT: Remove triton after installations as it causes CUDA 13.0 errors.
# Both PyTorch and xgrammar pull it in as a dependency.
RUN pip uninstall -y triton || true && echo "Triton removed (if present)"

# ============================================================================
# Clone and Build vLLM from Source
# ============================================================================
ARG CACHEBUST_VLLM
ARG VLLM_COMMIT

WORKDIR /workspace/vllm
RUN git clone --recursive https://github.com/vllm-project/vllm.git . && \
    git checkout ${VLLM_COMMIT} && \
    echo "Building vLLM from commit: $(git rev-parse HEAD)"

# Prepare for existing torch installation
RUN python3 use_existing_torch.py

# Remove flashinfer from requirements (we installed it separately) and keep
# triton out of the test requirements (see uninstall above).
RUN sed -i "/flashinfer/d" requirements/cuda.txt || true
RUN sed -i '/^triton\b/d' requirements/test.txt || true

# Install build requirements
RUN pip install --no-cache-dir -r requirements/build.txt

# ============================================================================
# CMakeLists Patch for DGX Spark (GB10)
# ============================================================================
# This patch removes problematic SM12.x architectures from certain kernel
# compilations that cause issues on DGX Spark's GB10 GPU.
# Best-effort on purpose: the patch may already be merged upstream.
COPY vllm_cmakelists.patch .
RUN patch -p1 < vllm_cmakelists.patch || echo "Patch may have already been applied or is not needed"

# ============================================================================
# Build Environment Variables
# ============================================================================
# GB10 compute capability 12.1 (Blackwell architecture)
# The 'f' suffix enables forward compatibility (PTX JIT for future architectures)
ENV TORCH_CUDA_ARCH_LIST="12.1f"
ENV CUDA_VISIBLE_ARCHITECTURES="12.1"

# Triton paths
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas

# Note: Do NOT set TORCH_ALLOW_TF32_CUBLAS_OVERRIDE as it conflicts with PyTorch's new TF32 API
# TF32 is enabled by default on Ampere+ GPUs

# ============================================================================
# Build vLLM
# ============================================================================
RUN pip install --no-build-isolation . -v

# ============================================================================
# Clean up source directory to avoid import conflicts
# ============================================================================
# The source vllm/ directory must be removed or Python will import from it
# instead of the installed package (which has compiled _C extensions)
WORKDIR /workspace
RUN rm -rf /workspace/vllm

# ============================================================================
# Install Additional Runtime Dependencies
# ============================================================================
# Quoted so the shell cannot glob-expand the [default] extras specifier.
RUN pip install --no-cache-dir "ray[default]"

# ============================================================================
# Download Tiktoken Encodings
# ============================================================================
ENV TIKTOKEN_ENCODINGS_BASE=/workspace/tiktoken_encodings
RUN mkdir -p ${TIKTOKEN_ENCODINGS_BASE} && \
    wget -O ${TIKTOKEN_ENCODINGS_BASE}/o200k_base.tiktoken \
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
    wget -O ${TIKTOKEN_ENCODINGS_BASE}/cl100k_base.tiktoken \
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

# ============================================================================
# NCCL Configuration for InfiniBand/RoCE Multi-GPU
# ============================================================================
ENV NCCL_IB_DISABLE=0
ENV NCCL_DEBUG=WARN
ENV NCCL_NET_GDR_LEVEL=2
ENV NCCL_IB_TIMEOUT=23
ENV NCCL_IB_GID_INDEX=0
ENV NCCL_ASYNC_ERROR_HANDLING=1
ENV TORCH_NCCL_BLOCKING_WAIT=1

# ============================================================================
# vLLM V1 Engine and Optimization Settings
# ============================================================================
# Enable V1 engine for hybrid model support
ENV VLLM_USE_V1=1

# FlashInfer attention backend
ENV VLLM_ATTENTION_BACKEND=FLASHINFER

# CUDA graph mode for hybrid Mamba-Transformer models
ENV VLLM_CUDA_GRAPH_MODE=full_and_piecewise

# FlashInfer MoE for NVFP4 quantization (required for non-gated activations like ReLU²)
ENV VLLM_USE_FLASHINFER_MOE_FP4=1
# Default to the latency backend for SM12.1 compatibility; override with
# -e VLLM_FLASHINFER_MOE_BACKEND=... at runtime if needed.
ENV VLLM_FLASHINFER_MOE_BACKEND=latency

# ============================================================================
# Finalize
# ============================================================================
WORKDIR /workspace

# Expose vLLM API port
EXPOSE 8000

# Default entrypoint
ENTRYPOINT ["vllm"]
CMD ["--help"]
|
README.md
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: vLLM for DGX Spark (Blackwell GB10)
|
| 3 |
+
emoji: 🚀
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: apache-2.0
|
| 9 |
+
tags:
|
| 10 |
+
- vllm
|
| 11 |
+
- dgx-spark
|
| 12 |
+
- blackwell
|
| 13 |
+
- gb10
|
| 14 |
+
- nemotron
|
| 15 |
+
- cuda-13
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
# vLLM for DGX Spark (Blackwell GB10)
|
| 19 |
+
|
| 20 |
+
Optimized vLLM Docker image for running Nemotron3-Nano and other models on NVIDIA DGX Spark with CUDA graphs enabled.
|
| 21 |
+
|
| 22 |
+
## Credits
|
| 23 |
+
|
| 24 |
+
- **Model**: [cybermotaz/nemotron3-nano-nvfp4-w4a16](https://huggingface.co/cybermotaz/nemotron3-nano-nvfp4-w4a16) - NVFP4 quantization by [@cybermotaz](https://huggingface.co/cybermotaz)
|
| 25 |
+
- **Original Model**: [nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16) by NVIDIA
|
| 26 |
+
- **This Docker Image**: `avarok/vllm-dgx-spark`, which resolves vLLM build and runtime issues on DGX Spark (GB10/SM12.1)
|
| 27 |
+
|
| 28 |
+
## Performance
|
| 29 |
+
|
| 30 |
+
| Mode | Throughput |
|
| 31 |
+
|------|------------|
|
| 32 |
+
| Eager mode (`--enforce-eager`) | ~42 tok/s |
|
| 33 |
+
| **CUDA graphs enabled** | **~66-67 tok/s** |
|
| 34 |
+
|
| 35 |
+
**~60% speedup** with CUDA graphs on DGX Spark GB10!
|
| 36 |
+
|
| 37 |
+
## Quick Start (One-Liner)
|
| 38 |
+
|
| 39 |
+
```bash
|
| 40 |
+
docker run --rm -it --gpus all --ipc=host -p 8000:8000 -e VLLM_FLASHINFER_MOE_BACKEND=latency -v ~/.cache/huggingface:/root/.cache/huggingface avarok/vllm-dgx-spark:v11 serve cybermotaz/nemotron3-nano-nvfp4-w4a16 --quantization modelopt_fp4 --kv-cache-dtype fp8 --trust-remote-code --max-model-len 131072 --gpu-memory-utilization 0.85
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
Then test with:
|
| 44 |
+
```bash
|
| 45 |
+
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{"model":"cybermotaz/nemotron3-nano-nvfp4-w4a16","messages":[{"role":"user","content":"Hello!"}],"max_tokens":100}'
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
## What This Image Fixes
|
| 49 |
+
|
| 50 |
+
This image solves several compatibility issues when running vLLM on DGX Spark (Blackwell GB10, SM12.1):
|
| 51 |
+
|
| 52 |
+
| Issue | Solution |
|
| 53 |
+
|-------|----------|
|
| 54 |
+
| Non-gated activations (ReLU²) not supported | Built from vLLM main branch with PR #29004 |
|
| 55 |
+
| CUDA architecture mismatch | Built with `TORCH_CUDA_ARCH_LIST="12.1f"` for GB10 |
|
| 56 |
+
| SM120 CUTLASS kernel failures | Uses `VLLM_FLASHINFER_MOE_BACKEND=latency` |
|
| 57 |
+
| FP4/scaled_mm kernel issues | CMakeLists patch restricting those kernels to SM10.x architectures |
|
| 58 |
+
| CUDA 13.0 compatibility | Full CUDA 13.0 + PyTorch cu130 support |
|
| 59 |
+
|
| 60 |
+
## Docker Image
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
docker pull avarok/vllm-dgx-spark:v11
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
Image size: ~27GB
|
| 67 |
+
|
| 68 |
+
## Building From Source
|
| 69 |
+
|
| 70 |
+
If you prefer to build the image yourself:
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
git clone https://huggingface.co/avarok/vllm-dgx-spark
|
| 74 |
+
cd vllm-dgx-spark
|
| 75 |
+
docker build -t vllm-dgx-spark:v11 .
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
Build time: ~45-60 minutes on DGX Spark
|
| 79 |
+
|
| 80 |
+
## Environment Variables
|
| 81 |
+
|
| 82 |
+
| Variable | Value | Description |
|
| 83 |
+
|----------|-------|-------------|
|
| 84 |
+
| `VLLM_FLASHINFER_MOE_BACKEND` | `latency` | **Required** for SM12.1 compatibility |
|
| 85 |
+
| `VLLM_USE_V1` | `1` (default) | Use V1 engine |
|
| 86 |
+
| `VLLM_ATTENTION_BACKEND` | `FLASHINFER` (default) | FlashInfer attention |
|
| 87 |
+
| `VLLM_CUDA_GRAPH_MODE` | `full_and_piecewise` (default) | CUDA graph mode |
|
| 88 |
+
|
| 89 |
+
## Full Run Command
|
| 90 |
+
|
| 91 |
+
```bash
|
| 92 |
+
docker run -d --name vllm-nemotron \
|
| 93 |
+
--gpus all --ipc=host -p 8000:8000 \
|
| 94 |
+
-e VLLM_FLASHINFER_MOE_BACKEND=latency \
|
| 95 |
+
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
| 96 |
+
avarok/vllm-dgx-spark:v11 \
|
| 97 |
+
serve cybermotaz/nemotron3-nano-nvfp4-w4a16 \
|
| 98 |
+
--quantization modelopt_fp4 \
|
| 99 |
+
--kv-cache-dtype fp8 \
|
| 100 |
+
--trust-remote-code \
|
| 101 |
+
--max-model-len 131072 \
|
| 102 |
+
--gpu-memory-utilization 0.85 \
|
| 103 |
+
--enable-auto-tool-choice \
|
| 104 |
+
--tool-call-parser qwen3_coder \
|
| 105 |
+
--reasoning-parser deepseek_r1
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
## Startup Time
|
| 109 |
+
|
| 110 |
+
First startup takes ~8-10 minutes due to:
|
| 111 |
+
- torch.compile (~5 min)
|
| 112 |
+
- FlashInfer autotuning (~2 min)
|
| 113 |
+
- CUDA graph capture (~1 min)
|
| 114 |
+
|
| 115 |
+
Subsequent startups with cached compilation are faster.
|
| 116 |
+
|
| 117 |
+
## Hardware Requirements
|
| 118 |
+
|
| 119 |
+
- NVIDIA DGX Spark with GB10 GPU (SM12.1, Blackwell architecture)
|
| 120 |
+
- 128GB unified memory
|
| 121 |
+
- CUDA 13.0+
|
| 122 |
+
|
| 123 |
+
## Troubleshooting
|
| 124 |
+
|
| 125 |
+
### "Failed to initialize cutlass TMA WS grouped gemm"
|
| 126 |
+
Make sure you're using `-e VLLM_FLASHINFER_MOE_BACKEND=latency`. The `throughput` backend has SM120 kernel issues on SM12.1.
|
| 127 |
+
|
| 128 |
+
### Memory errors
|
| 129 |
+
Reduce `--gpu-memory-utilization` to 0.75 or lower, or reduce `--max-model-len`.
|
| 130 |
+
|
| 131 |
+
### Slow performance (~42 tok/s instead of ~67 tok/s)
|
| 132 |
+
Check that CUDA graphs are enabled (no `--enforce-eager` flag) and startup completed successfully. Look for "Capturing CUDA graphs" in the logs.
|
| 133 |
+
|
| 134 |
+
## Files in This Repo
|
| 135 |
+
|
| 136 |
+
- `Dockerfile` - Reproducible build for vLLM on DGX Spark
|
| 137 |
+
- `vllm_cmakelists.patch` - Patch for SM12.x kernel compatibility
|
| 138 |
+
- `README.md` - This file
|
| 139 |
+
|
| 140 |
+
## License
|
| 141 |
+
|
| 142 |
+
Apache 2.0
|
| 143 |
+
|
| 144 |
+
## Links
|
| 145 |
+
|
| 146 |
+
- [Docker Hub: avarok/vllm-dgx-spark](https://hub.docker.com/r/avarok/vllm-dgx-spark)
|
| 147 |
+
- [vLLM Project](https://github.com/vllm-project/vllm)
|
| 148 |
+
- [FlashInfer](https://github.com/flashinfer-ai/flashinfer)
|
| 149 |
+
- [Original Nemotron Model](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16)
|
| 150 |
+
- [NVFP4 Quantization by cybermotaz](https://huggingface.co/cybermotaz/nemotron3-nano-nvfp4-w4a16)
|
vllm_cmakelists.patch
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ============================================================================
# vLLM CMakeLists.txt Patch for DGX Spark (GB10)
# ============================================================================
# This patch removes SM12.0/12.1 architectures from certain CUDA kernel
# compilations that have issues on DGX Spark's GB10 GPU.
#
# The GB10 GPU has compute capability 12.1, but certain FP4 and scaled_mm
# kernels compiled for SM12.x cause runtime errors. This patch restricts
# those kernels to the SM10.x (Blackwell datacenter) architectures only,
# while still allowing the main model to run on SM12.x.
#
# NOTE: unified-diff format — context lines below begin with a single space;
# do not strip trailing/leading whitespace or `patch -p1` will reject hunks.
# ============================================================================

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7cb94f919..f860e533e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -594,9 +594,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # FP4 Archs and flags
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
@@ -668,7 +668,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
   else()
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   endif()
@@ -716,9 +716,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")