#!/usr/bin/env bash
# docker-run.sh — bare docker run command (without start-qwen3-coder-next.sh lifecycle logic)
# Useful for manual testing or embedding in other scripts.
#
# Usage: bash docker-run.sh
#
# Environment variables:
#   HF_TOKEN   — optional Hugging Face token (required for gated models)
#   HF_CACHE   — local weight cache path (default: ~/.cache/huggingface)

set -euo pipefail

# Host-side weight cache. It is bind-mounted into the container below, so it
# must exist before docker starts (otherwise docker creates it owned by root).
HF_CACHE="${HF_CACHE:-${HOME}/.cache/huggingface}"
mkdir -p -- "${HF_CACHE}"

# Options consumed by `docker run` itself (everything before the image name).
docker_opts=(
    --name qwen3-coder-next-vllm
    --rm
    --runtime=nvidia
    --gpus all
    -p 0.0.0.0:8000:8000
    -v "${HF_CACHE}:/root/.cache/huggingface"
    --shm-size=32g
    -e VLLM_NVFP4_GEMM_BACKEND=marlin
    -e VLLM_TEST_FORCE_FP8_MARLIN=1
    -e VLLM_USE_FLASHINFER_MOE_FP4=0
    -e VLLM_MARLIN_USE_ATOMIC_ADD=1
)

# Forward the Hugging Face token only when one is set and non-empty.
# It is passed by NAME (`-e HF_TOKEN`, no `=value`): docker then copies the
# value from its own environment, so the secret never appears on the command
# line where `ps` or /proc/<pid>/cmdline would expose it to other local users.
if [[ -n "${HF_TOKEN:-}" ]]; then
    export HF_TOKEN  # make sure the docker CLI process actually inherits it
    docker_opts+=(-e HF_TOKEN)
fi

# Arguments handed to the vLLM server inside the container: the model to
# serve followed by its serving flags (see the flag reference further down).
vllm_args=(
    gdubicki/Qwen3-Coder-Next-NVFP4-GB10
    --dtype auto
    --gpu-memory-utilization 0.90
    --kv-cache-dtype fp8
    --max-model-len 262144
    --max-num-seqs 64
    --max-num-batched-tokens 8192
    --attention-backend flashinfer
    --enable-prefix-caching
    --enable-chunked-prefill
    --enable-auto-tool-choice
    --tool-call-parser qwen3_coder
    --host 0.0.0.0
    --port 8000
)

# Runs in the foreground; the container is removed on exit because of --rm.
docker run \
    "${docker_opts[@]}" \
    vllm/vllm-openai:cu130-nightly \
    "${vllm_args[@]}"

# ---------------------------------------------------------------------------
# Flag reference:
#
#  vllm/vllm-openai:cu130-nightly
#       Native qwen3_next support (vLLM 0.19+).
#
#  VLLM_NVFP4_GEMM_BACKEND=marlin
#       SM12.1 (GB10) has no native CUTLASS FP4 kernel.
#       Marlin handles NVFP4 W4A16 GEMM — 15% faster than CUTLASS for 512 experts.
#
#  VLLM_TEST_FORCE_FP8_MARLIN=1
#       Forces FP8 Marlin path on GB10 SM12.1.
#
#  VLLM_USE_FLASHINFER_MOE_FP4=0
#       FlashInfer MoE FP4 path not supported on GB10 SM12.1.
#
#  VLLM_MARLIN_USE_ATOMIC_ADD=1
#       GB10-specific Marlin optimization for correct FP4 GEMM on SM12.1.
#
#  No --quantization flag
#       compressed-tensors format is auto-detected from config.json.
#
#  --dtype auto
#       BF16 for non-quantized layers (DeltaNet linear_attn, router gates, lm_head).
#
#  --gpu-memory-utilization 0.90
#       0.90 × 128 GB ≈ 115 GB for vLLM. Weights: ~43 GB. KV cache: ~72 GB.
#       Safe limit per saricles testing (0.93 is risky).
#
#  --kv-cache-dtype fp8
#       FP8 KV cache. Only applies to the 12 full-attention layers (not DeltaNet).
#
#  --max-model-len 262144
#       Full native context. Tested with FP8 KV cache by saricles.
#
#  --max-num-seqs 64
#       Max concurrent requests.
#
#  --max-num-batched-tokens 8192
#       Limits tokens per batch — prevents OOM on long contexts.
#
#  --attention-backend flashinfer
#       Required for FP8 KV cache + chunked prefill on GB10.
#
#  --enable-prefix-caching
#       Reuses KV cache for repeated prompt prefixes (system prompts, etc.).
#
#  --enable-chunked-prefill
#       Reduces memory spikes during long-prompt processing.
#
#  --enable-auto-tool-choice --tool-call-parser qwen3_coder
#       Enables OpenAI-compatible tool calling for this model.
#
#  --host 0.0.0.0 --port 8000
#       OpenAI-compatible REST API, reachable from LAN.
# ---------------------------------------------------------------------------