Recipe for a cluster of two DGX Sparks
This uses eugr's vLLM repo. Thanks to the guys on the DGX Spark forum for getting this up and running on day one. Some of the settings here might be redundant or suboptimal; I just needed it working.
https://forums.developer.nvidia.com/t/minimax-m2-7-nfvp4-recipe-benchmarks/366324
# Launch `vllm serve` for MiniMax-M2.7-NVFP4 through launch-cluster.sh.
#
# This is ONE command: every line except the last ends in a backslash so the
# shell joins them. Without the continuations each line would be run as a
# separate command and launch-cluster.sh would start with no arguments.
#
# VLLM_SPARK_EXTRA_DOCKER_ARGS is written as a prefix assignment so that it is
# placed in launch-cluster.sh's environment; a bare assignment on its own line
# would stay a non-exported shell variable and never reach the script.
# NOTE(review): presumably launch-cluster.sh appends this value to its
# `docker run` arguments (here: a bind mount onto /models) - confirm against
# the script.
#
# Before running, replace:
#   /your/path/here  - host directory that contains MiniMax-M2.7-NVFP4
#   your_alias_here  - value passed to --served-model-name
#
# Everything after `exec` is the command line handed to launch-cluster.sh's
# `exec` action (assumed to run inside the vllm-node container - TODO confirm).
# --tensor-parallel-size 2 with the ray executor matches the two-machine setup
# this recipe targets.
VLLM_SPARK_EXTRA_DOCKER_ARGS="-v /your/path/here:/models" \
  ./launch-cluster.sh -t vllm-node \
  -e VLLM_USE_FLASHINFER_MOE_FP4=1 \
  -e VLLM_NVFP4_GEMM_BACKEND=flashinfer-cutlass \
  -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
  -e OMP_NUM_THREADS=8 \
  -e VLLM_FLOAT32_MATMUL_PRECISION=high \
  -e VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 \
  -e VLLM_FLASHINFER_MOE_BACKEND=throughput \
  exec \
  vllm serve /models/MiniMax-M2.7-NVFP4 \
  --host 0.0.0.0 \
  --port 8000 \
  --served-model-name your_alias_here \
  --attention-backend flashinfer \
  --max-num-seqs 5 \
  --max-num-batched-tokens 8192 \
  --gpu-memory-utilization 0.85 \
  --mamba_ssm_cache_dtype float32 \
  --enable-prefix-caching \
  --enable-auto-tool-choice \
  --tool-call-parser minimax_m2 \
  --reasoning-parser minimax_m2_append_think \
  --kv-cache-dtype fp8 \
  --quantization modelopt_fp4 \
  --moe-backend flashinfer_cutlass \
  --disable-custom-all-reduce \
  --dtype auto \
  --trust-remote-code \
  --tensor-parallel-size 2 \
  --distributed-executor-backend ray