Switch model source to gdubicki/Qwen3-Coder-Next-NVFP4-GB10

088ba1d verified about 2 months ago

3.29 kB

	#!/usr/bin/env bash
	# docker-run.sh — bare docker run command (without start-qwen3-coder-next.sh lifecycle logic)
	# Useful for manual testing or embedding in other scripts.
	#
	# Usage: bash docker-run.sh
	#
	# Environment variables:
	# HF_TOKEN — optional Hugging Face token (required for gated models)
	# HF_CACHE — local weight cache path (default: ~/.cache/huggingface)

	set -euo pipefail

	# Container image and Hugging Face model repository. The rationale for each
	# flag below is given in the "Flag reference" comment at the end of this file.
	readonly image="vllm/vllm-openai:cu130-nightly"
	readonly model="gdubicki/Qwen3-Coder-Next-NVFP4-GB10"

	# Host-side weight cache; created if missing so the bind mount below always
	# points at an existing directory.
	HF_CACHE="${HF_CACHE:-${HOME}/.cache/huggingface}"
	mkdir -p -- "${HF_CACHE}"

	# Options consumed by `docker run` itself (everything before the image name).
	docker_args=(
	  --name qwen3-coder-next-vllm
	  --rm
	  --runtime=nvidia
	  --gpus all
	  -p 0.0.0.0:8000:8000
	  -v "${HF_CACHE}:/root/.cache/huggingface"
	  --shm-size=32g
	  -e VLLM_NVFP4_GEMM_BACKEND=marlin
	  -e VLLM_TEST_FORCE_FP8_MARLIN=1
	  -e VLLM_USE_FLASHINFER_MOE_FP4=0
	  -e VLLM_MARLIN_USE_ATOMIC_ADD=1
	)

	# Forward HF_TOKEN only when it is set and non-empty. The name-only form
	# (`-e HF_TOKEN`) makes docker read the value from its own environment, so the
	# secret never appears on the command line where `ps` could expose it.
	# `${HF_TOKEN:-}` keeps `set -u` from aborting when the variable is unset.
	if [[ -n "${HF_TOKEN:-}" ]]; then
	  export HF_TOKEN
	  docker_args+=(-e HF_TOKEN)
	fi

	# Arguments placed after the model name; these are handed to the container.
	vllm_args=(
	  --dtype auto
	  --gpu-memory-utilization 0.90
	  --kv-cache-dtype fp8
	  --max-model-len 262144
	  --max-num-seqs 64
	  --max-num-batched-tokens 8192
	  --attention-backend flashinfer
	  --enable-prefix-caching
	  --enable-chunked-prefill
	  --enable-auto-tool-choice
	  --tool-call-parser qwen3_coder
	  --host 0.0.0.0
	  --port 8000
	)

	docker run "${docker_args[@]}" "${image}" "${model}" "${vllm_args[@]}"

	# ---------------------------------------------------------------------------
	# Flag reference:
	#
	# vllm/vllm-openai:cu130-nightly
	# Native qwen3_next support (vLLM 0.19+).
	#
	# VLLM_NVFP4_GEMM_BACKEND=marlin
	# SM12.1 (GB10) has no native CUTLASS FP4 kernel.
	# Marlin handles NVFP4 W4A16 GEMM — 15% faster than CUTLASS for 512 experts.
	#
	# VLLM_TEST_FORCE_FP8_MARLIN=1
	# Forces FP8 Marlin path on GB10 SM12.1.
	#
	# VLLM_USE_FLASHINFER_MOE_FP4=0
	# FlashInfer MoE FP4 path not supported on GB10 SM12.1.
	#
	# VLLM_MARLIN_USE_ATOMIC_ADD=1
	# GB10-specific Marlin optimization for correct FP4 GEMM on SM12.1.
	#
	# No --quantization flag
	# compressed-tensors format is auto-detected from config.json.
	#
	# --dtype auto
	# BF16 for non-quantized layers (DeltaNet linear_attn, router gates, lm_head).
	#
	# --gpu-memory-utilization 0.90
	# 0.90 × 128 GB = 115 GB for vLLM. Weights: ~43 GB. KV cache: ~72 GB.
	# Safe limit per saricles testing (0.93 is risky).
	#
	# --kv-cache-dtype fp8
	# FP8 KV cache. Only applies to the 12 full-attention layers (not DeltaNet).
	#
	# --max-model-len 262144
	# Full native context. Tested with FP8 KV cache by saricles.
	#
	# --max-num-seqs 64
	# Max concurrent requests.
	#
	# --max-num-batched-tokens 8192
	# Limits tokens per batch — prevents OOM on long contexts.
	#
	# --attention-backend flashinfer
	# Required for FP8 KV cache + chunked prefill on GB10.
	#
	# --enable-prefix-caching
	# Reuses KV cache for repeated prompt prefixes (system prompts, etc.).
	#
	# --enable-chunked-prefill
	# Reduces memory spikes during long-prompt processing.
	#
	# --enable-auto-tool-choice --tool-call-parser qwen3_coder
	# Enables OpenAI-compatible tool calling for this model.
	#
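	# --host 0.0.0.0 --port 8000
	# OpenAI-compatible REST API, reachable from LAN (published via -p 0.0.0.0:8000:8000).
	# No --api-key is passed, so the endpoint is unauthenticated — use only on a trusted network.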
	# ---------------------------------------------------------------------------