2 DGX Spark cluster recipe

#6
by susni - opened

This recipe uses eugr's vllm repo. Thanks to the guys on the DGX Spark forum for getting this up and running on day one. Some of the settings here might be redundant or suboptimal; I just needed it working.

https://forums.developer.nvidia.com/t/minimax-m2-7-nfvp4-recipe-benchmarks/366324

# Serve /models/MiniMax-M2.7-NVFP4 through vLLM with tensor parallelism of 2
# and Ray as the distributed executor, started via ./launch-cluster.sh.
#
# Before running:
#   - replace /your/path/here with the host directory that holds the model
#     (it is bind-mounted at /models via the extra docker args below);
#   - replace your_alias_here with the model name clients should request.
#
# This is ONE command. Every line except the last must end with a backslash
# and nothing may follow that backslash (no trailing spaces, no comments),
# otherwise the shell runs each line as a separate command.
#
# VLLM_SPARK_EXTRA_DOCKER_ARGS is given as an environment prefix so that it
# is placed in the launcher's environment for this invocation only.
#
# NOTE(review): the -e NAME=VALUE pairs and the `exec` word are arguments
# interpreted by launch-cluster.sh itself (presumably container environment
# variables and a "run this command" action) -- confirm against that script.
VLLM_SPARK_EXTRA_DOCKER_ARGS="-v /your/path/here:/models" \
./launch-cluster.sh -t vllm-node \
  -e VLLM_USE_FLASHINFER_MOE_FP4=1 \
  -e VLLM_NVFP4_GEMM_BACKEND=flashinfer-cutlass \
  -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
  -e OMP_NUM_THREADS=8 \
  -e VLLM_FLOAT32_MATMUL_PRECISION=high \
  -e VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 \
  -e VLLM_FLASHINFER_MOE_BACKEND=throughput \
  exec \
  vllm serve /models/MiniMax-M2.7-NVFP4 \
  --host 0.0.0.0 \
  --port 8000 \
  --served-model-name your_alias_here \
  --attention-backend flashinfer \
  --max-num-seqs 5 \
  --max-num-batched-tokens 8192 \
  --gpu-memory-utilization 0.85 \
  --mamba_ssm_cache_dtype float32 \
  --enable-prefix-caching \
  --enable-auto-tool-choice \
  --tool-call-parser minimax_m2 \
  --reasoning-parser minimax_m2_append_think \
  --kv-cache-dtype fp8 \
  --quantization modelopt_fp4 \
  --moe-backend flashinfer_cutlass \
  --disable-custom-all-reduce \
  --dtype auto \
  --trust-remote-code \
  --tensor-parallel-size 2 \
  --distributed-executor-backend ray

Sign up or log in to comment