I can't start the model with the OpenAI-compatible vLLM Docker image on a B300, but it starts fine on a B200.

by luweizhou2016

model=/models/nv_Qwen3-235B-A22B-NVFP
max_model_len=22000 # must be >= the input + output token lengths
max_num_seqs=4096 # max sequences in one scheduling iteration
max_num_batched_tokens=100000 # max batched tokens across all sequences
tensor_parallel_size=1
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_USE_FLASHINFER_MOE_FP4=1
export VLLM_FLASHINFER_MOE_BACKEND=latency

vllm serve $model --host localhost --port 8000 --swap-space 64 \
    --max-num-batched-tokens ${max_num_batched_tokens} \
    --max-model-len ${max_model_len} \
    -dp 1 --tensor-parallel-size ${tensor_parallel_size} \
    --max-num-seqs ${max_num_seqs} \
    --kv-cache-dtype fp8 --gpu-memory-utilization 0.85 \
    --no-enable-prefix-caching --async-scheduling
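
To help narrow things down, a quick check I would run inside the same container (this assumes torch and flashinfer are importable there) is to print the FlashInfer version and the detected compute capability. B200 reports (10, 0); if the image's FlashInfer kernels weren't built for the B300's architecture, that would be consistent with it starting on B200 but failing on B300.

# sanity check inside the container: FlashInfer version + GPU compute capability
python3 -c "import torch, flashinfer; print('flashinfer', flashinfer.__version__); print('compute capability', torch.cuda.get_device_capability())"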

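Once the server does come up (as it does on the B200), a minimal smoke test against vLLM's OpenAI-compatible endpoint, using the same host, port, and model path as in the launch script above:

# send one short chat completion request to verify the server responds
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "/models/nv_Qwen3-235B-A22B-NVFP", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 16}'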