#!/usr/bin/env bash
# docker-run.sh — bare docker run command (without start-qwen3-coder-next.sh
# lifecycle logic). Useful for manual testing or embedding in other scripts.
#
# Usage: bash docker-run.sh
#
# Environment variables:
#   HF_TOKEN — optional Hugging Face token (required for gated models)
#   HF_CACHE — local weight cache path (default: ~/.cache/huggingface)

set -euo pipefail

HF_CACHE="${HF_CACHE:-${HOME}/.cache/huggingface}"

# Create the cache directory up front so it exists, owned by the invoking
# user, before it is bind-mounted into the container.
mkdir -p "${HF_CACHE}"

# Options consumed by `docker run` itself (everything before the image name).
docker_args=(
  --name qwen3-coder-next-vllm
  --rm
  --runtime=nvidia
  --gpus all
  -p 0.0.0.0:8000:8000
  -v "${HF_CACHE}:/root/.cache/huggingface"
  --shm-size=32g
  -e VLLM_NVFP4_GEMM_BACKEND=marlin
  -e VLLM_TEST_FORCE_FP8_MARLIN=1
  -e VLLM_USE_FLASHINFER_MOE_FP4=0
  -e VLLM_MARLIN_USE_ATOMIC_ADD=1
)

# Forward HF_TOKEN only when it is set and non-empty. The name-only form
# (`-e HF_TOKEN`) makes docker copy the value from its own environment, so the
# secret never appears in the process argument list (visible via `ps`).
if [[ -n "${HF_TOKEN:-}" ]]; then
  export HF_TOKEN
  docker_args+=(-e HF_TOKEN)
fi

# Arguments passed to the container after the model name.
# See the flag reference at the bottom of this file for the rationale.
vllm_args=(
  --dtype auto
  --gpu-memory-utilization 0.90
  --kv-cache-dtype fp8
  --max-model-len 262144
  --max-num-seqs 64
  --max-num-batched-tokens 8192
  --attention-backend flashinfer
  --enable-prefix-caching
  --enable-chunked-prefill
  --enable-auto-tool-choice
  --tool-call-parser qwen3_coder
  --host 0.0.0.0
  --port 8000
)

docker run \
  "${docker_args[@]}" \
  vllm/vllm-openai:cu130-nightly \
  gdubicki/Qwen3-Coder-Next-NVFP4-GB10 \
  "${vllm_args[@]}"

# ---------------------------------------------------------------------------
# Flag reference:
#
# vllm/vllm-openai:cu130-nightly
#   Native qwen3_next support (vLLM 0.19+).
#
# VLLM_NVFP4_GEMM_BACKEND=marlin
#   SM12.1 (GB10) has no native CUTLASS FP4 kernel.
#   Marlin handles NVFP4 W4A16 GEMM — 15% faster than CUTLASS for 512 experts.
#
# VLLM_TEST_FORCE_FP8_MARLIN=1
#   Forces FP8 Marlin path on GB10 SM12.1.
#
# VLLM_USE_FLASHINFER_MOE_FP4=0
#   FlashInfer MoE FP4 path not supported on GB10 SM12.1.
#
# VLLM_MARLIN_USE_ATOMIC_ADD=1
#   GB10-specific Marlin optimization for correct FP4 GEMM on SM12.1.
#
# No --quantization flag
#   compressed-tensors format is auto-detected from config.json.
#
# --dtype auto
#   BF16 for non-quantized layers (DeltaNet linear_attn, router gates, lm_head).
#
# --gpu-memory-utilization 0.90
#   0.90 × 128 GB = 115 GB for vLLM. Weights: ~43 GB. KV cache: ~72 GB.
#   Safe limit per saricles testing (0.93 is risky).
#
# --kv-cache-dtype fp8
#   FP8 KV cache. Only applies to the 12 full-attention layers (not DeltaNet).
#
# --max-model-len 262144
#   Full native context. Tested with FP8 KV cache by saricles.
#
# --max-num-seqs 64
#   Max concurrent requests.
#
# --max-num-batched-tokens 8192
#   Limits tokens per batch — prevents OOM on long contexts.
#
# --attention-backend flashinfer
#   Required for FP8 KV cache + chunked prefill on GB10.
#
# --enable-prefix-caching
#   Reuses KV cache for repeated prompt prefixes (system prompts, etc.).
#
# --enable-chunked-prefill
#   Reduces memory spikes during long-prompt processing.
#
# --enable-auto-tool-choice --tool-call-parser qwen3_coder
#   Enables OpenAI-compatible tool calling for this model.
#
# --host 0.0.0.0 --port 8000
#   OpenAI-compatible REST API, reachable from LAN.
# ---------------------------------------------------------------------------