Working vLLM Command
#10
by nmitchko - opened
Using the vLLM nightly build (cu130):
# Launch an OpenAI-compatible vLLM server for lukealonso/MiniMax-M2.7-NVFP4,
# exposed on port 23333 under the served model name "MM-27".
#
# Environment (applies to this one command only, nothing is exported):
#   CUDA_VISIBLE_DEVICES=0,1        expose only GPUs 0 and 1; matches
#                                   --tensor-parallel-size 2 below
#   OMP_NUM_THREADS=8               cap OpenMP worker threads at 8
#   NCCL_P2P_DISABLE=1              turn off NCCL peer-to-peer GPU transport
#   SAFETENSORS_FAST_GPU=1          NOTE(review): presumably speeds up weight
#   VLLM_USE_FLASHINFER_MOE_FP4=0   loading / disables the FlashInfer FP4 MoE
#                                   path -- confirm against the vLLM build used
#
# Flags worth knowing:
#   --max-model-len 130k            lowercase "k" is a decimal suffix in vLLM's
#                                   CLI (130k = 130,000 tokens)
#   --kv-cache-dtype turboquant_k8v4
#                                   NOTE(review): not available in every vLLM
#                                   release; the post targets a nightly build
#   --enable-expert-parallel        spelled with hyphens like every other flag
#
# Comments cannot be placed between the continuation lines below: a "#" after
# a trailing backslash would terminate the command early.
CUDA_VISIBLE_DEVICES=0,1 \
SAFETENSORS_FAST_GPU=1 \
OMP_NUM_THREADS=8 \
NCCL_P2P_DISABLE=1 \
VLLM_USE_FLASHINFER_MOE_FP4=0 \
vllm serve lukealonso/MiniMax-M2.7-NVFP4 \
--port 23333 \
--trust-remote-code \
--served-model-name MM-27 \
--tensor-parallel-size 2 \
--enable-auto-tool-choice \
--tool-call-parser minimax_m2 \
--reasoning-parser minimax_m2_append_think \
--disable-custom-all-reduce \
--enable-expert-parallel \
--gpu-memory-utilization 0.95 \
--max-num-seqs 8 \
--kv-cache-dtype turboquant_k8v4 \
--max-model-len 130k