# docker-run.sh — bare docker run command (without start-qwen3-coder-next.sh lifecycle logic)
# Useful for manual testing or embedding in other scripts.
#
# Usage: bash docker-run.sh
#
# Environment variables:
# HF_TOKEN — optional Hugging Face token (required for gated models)
# HF_CACHE — local weight cache path (default: ~/.cache/huggingface)
set -euo pipefail

# Host directory used as the Hugging Face cache; defaults to the per-user
# cache location. Created up front so the bind mount below always has an
# existing source directory.
HF_CACHE="${HF_CACHE:-${HOME}/.cache/huggingface}"
mkdir -p -- "${HF_CACHE}"

# Options consumed by `docker run` itself: container identity, GPU access,
# published port, cache bind mount, shared memory size, and the vLLM
# environment switches.
docker_opts=(
  --name qwen3-coder-next-vllm
  --rm
  --runtime=nvidia
  --gpus all
  -p 0.0.0.0:8000:8000
  -v "${HF_CACHE}:/root/.cache/huggingface"
  --shm-size=32g
  -e VLLM_NVFP4_GEMM_BACKEND=marlin
  -e VLLM_TEST_FORCE_FP8_MARLIN=1
  -e VLLM_USE_FLASHINFER_MOE_FP4=0
  -e VLLM_MARLIN_USE_ATOMIC_ADD=1
)

# Forward HF_TOKEN only when it is set and non-empty. Passing the bare
# variable name makes docker copy the value from this process's environment,
# so the secret never appears on the docker command line (where `ps` or
# `set -x` tracing would expose it). The export guarantees docker can see the
# variable even if this file was sourced with a non-exported HF_TOKEN.
if [[ -n "${HF_TOKEN:-}" ]]; then
  export HF_TOKEN
  docker_opts+=(-e HF_TOKEN)
fi

# Arguments placed after the image name: the model id followed by the
# server flags.
vllm_args=(
  gdubicki/Qwen3-Coder-Next-NVFP4-GB10
  --dtype auto
  --gpu-memory-utilization 0.90
  --kv-cache-dtype fp8
  --max-model-len 262144
  --max-num-seqs 64
  --max-num-batched-tokens 8192
  --attention-backend flashinfer
  --enable-prefix-caching
  --enable-chunked-prefill
  --enable-auto-tool-choice
  --tool-call-parser qwen3_coder
  --host 0.0.0.0
  --port 8000
)

docker run "${docker_opts[@]}" vllm/vllm-openai:cu130-nightly "${vllm_args[@]}"
# ---------------------------------------------------------------------------
# Flag reference:
#
# vllm/vllm-openai:cu130-nightly
# Native qwen3_next support (vLLM 0.19+).
#
# VLLM_NVFP4_GEMM_BACKEND=marlin
# SM12.1 (GB10) has no native CUTLASS FP4 kernel.
# Marlin handles NVFP4 W4A16 GEMM — 15% faster than CUTLASS for 512 experts.
#
# VLLM_TEST_FORCE_FP8_MARLIN=1
# Forces FP8 Marlin path on GB10 SM12.1.
#
# VLLM_USE_FLASHINFER_MOE_FP4=0
# FlashInfer MoE FP4 path not supported on GB10 SM12.1.
#
# VLLM_MARLIN_USE_ATOMIC_ADD=1
# GB10-specific Marlin optimization for correct FP4 GEMM on SM12.1.
#
# No --quantization flag
# compressed-tensors format is auto-detected from config.json.
#
# --dtype auto
# BF16 for non-quantized layers (DeltaNet linear_attn, router gates, lm_head).
#
# --gpu-memory-utilization 0.90
# 0.90 × 128 GB = 115 GB for vLLM. Weights: ~43 GB. KV cache: ~72 GB.
# Safe limit per saricles testing (0.93 is risky).
#
# --kv-cache-dtype fp8
# FP8 KV cache. Only applies to the 12 full-attention layers (not DeltaNet).
#
# --max-model-len 262144
# Full native context. Tested with FP8 KV cache by saricles.
#
# --max-num-seqs 64
# Max concurrent requests.
#
# --max-num-batched-tokens 8192
# Limits tokens per batch — prevents OOM on long contexts.
#
# --attention-backend flashinfer
# Required for FP8 KV cache + chunked prefill on GB10.
#
# --enable-prefix-caching
# Reuses KV cache for repeated prompt prefixes (system prompts, etc.).
#
# --enable-chunked-prefill
# Reduces memory spikes during long-prompt processing.
#
# --enable-auto-tool-choice --tool-call-parser qwen3_coder
# Enables OpenAI-compatible tool calling for this model.
#
# --host 0.0.0.0 --port 8000
# OpenAI-compatible REST API, reachable from LAN.
# ---------------------------------------------------------------------------