qwen3-coder-next / docker-run.sh
gdubicki's picture
Switch model source to gdubicki/Qwen3-Coder-Next-NVFP4-GB10
088ba1d verified
#!/usr/bin/env bash
# docker-run.sh — bare docker run command (without start-qwen3-coder-next.sh lifecycle logic)
# Useful for manual testing or embedding in other scripts.
#
# Usage: bash docker-run.sh
#
# Environment variables:
# HF_TOKEN — optional Hugging Face token (required for gated models)
# HF_CACHE — local weight cache path (default: ~/.cache/huggingface)
set -euo pipefail

# Host-side Hugging Face cache; bind-mounted into the container below so
# downloaded weights are reused across runs. Created up front if missing.
HF_CACHE="${HF_CACHE:-${HOME}/.cache/huggingface}"
mkdir -p -- "${HF_CACHE}"

# Options for `docker run` itself (everything that precedes the image name).
# The VLLM_* variables are explained in the flag reference at the end of
# this file.
docker_args=(
  --name qwen3-coder-next-vllm
  --rm
  --runtime=nvidia
  --gpus all
  -p 0.0.0.0:8000:8000
  -v "${HF_CACHE}:/root/.cache/huggingface"
  --shm-size=32g
  -e VLLM_NVFP4_GEMM_BACKEND=marlin
  -e VLLM_TEST_FORCE_FP8_MARLIN=1
  -e VLLM_USE_FLASHINFER_MOE_FP4=0
  -e VLLM_MARLIN_USE_ATOMIC_ADD=1
)

# Forward the Hugging Face token only when one is set. It is passed by NAME
# (`-e HF_TOKEN`, no `=value`): docker then copies the value from this
# process's environment, so the secret never appears in the docker command
# line (visible via `ps` / /proc/<pid>/cmdline). `export` guarantees the
# variable is in the environment docker inherits.
if [[ -n "${HF_TOKEN:-}" ]]; then
  export HF_TOKEN
  docker_args+=(-e HF_TOKEN)
fi

# Arguments handed to the container after the image name: the model to
# serve followed by the server flags (see the flag reference at the end of
# this file).
vllm_args=(
  gdubicki/Qwen3-Coder-Next-NVFP4-GB10
  --dtype auto
  --gpu-memory-utilization 0.90
  --kv-cache-dtype fp8
  --max-model-len 262144
  --max-num-seqs 64
  --max-num-batched-tokens 8192
  --attention-backend flashinfer
  --enable-prefix-caching
  --enable-chunked-prefill
  --enable-auto-tool-choice
  --tool-call-parser qwen3_coder
  --host 0.0.0.0
  --port 8000
)

docker run "${docker_args[@]}" \
  vllm/vllm-openai:cu130-nightly \
  "${vllm_args[@]}"
# ---------------------------------------------------------------------------
# Flag reference:
#
# vllm/vllm-openai:cu130-nightly
# Native qwen3_next support (vLLM 0.19+).
#
# VLLM_NVFP4_GEMM_BACKEND=marlin
# SM12.1 (GB10) has no native CUTLASS FP4 kernel.
# Marlin handles NVFP4 W4A16 GEMM — 15% faster than CUTLASS for 512 experts.
#
# VLLM_TEST_FORCE_FP8_MARLIN=1
# Forces FP8 Marlin path on GB10 SM12.1.
#
# VLLM_USE_FLASHINFER_MOE_FP4=0
# FlashInfer MoE FP4 path not supported on GB10 SM12.1.
#
# VLLM_MARLIN_USE_ATOMIC_ADD=1
# GB10-specific Marlin optimization for correct FP4 GEMM on SM12.1.
#
# No --quantization flag
# compressed-tensors format is auto-detected from config.json.
#
# --dtype auto
# BF16 for non-quantized layers (DeltaNet linear_attn, router gates, lm_head).
#
# --gpu-memory-utilization 0.90
# 0.90 × 128 GB ≈ 115 GB for vLLM. Weights: ~43 GB. KV cache: ~72 GB.
# Safe limit per saricles testing (0.93 is risky).
#
# --kv-cache-dtype fp8
# FP8 KV cache. Only applies to the 12 full-attention layers (not DeltaNet).
#
# --max-model-len 262144
# Full native context. Tested with FP8 KV cache by saricles.
#
# --max-num-seqs 64
# Max concurrent requests.
#
# --max-num-batched-tokens 8192
# Limits tokens per batch — prevents OOM on long contexts.
#
# --attention-backend flashinfer
# Required for FP8 KV cache + chunked prefill on GB10.
#
# --enable-prefix-caching
# Reuses KV cache for repeated prompt prefixes (system prompts, etc.).
#
# --enable-chunked-prefill
# Reduces memory spikes during long-prompt processing.
#
# --enable-auto-tool-choice --tool-call-parser qwen3_coder
# Enables OpenAI-compatible tool calling for this model.
#
# --host 0.0.0.0 --port 8000
# OpenAI-compatible REST API, reachable from LAN.
# ---------------------------------------------------------------------------