File size: 7,585 Bytes

#!/usr/bin/env bash
set -euo pipefail

# Mode selection: the first positional argument names the mode and falls back
# to "serve" when it is absent or empty. It is then dropped so that "$@" holds
# only the arguments that follow it.
COMMAND="${1:-serve}"
[ "$#" -eq 0 ] || shift

# Runtime configuration. Each setting may be supplied through the environment;
# when it is unset or empty the default below is used.
: "${MODEL:=Intel/gemma-4-31B-it-int4-AutoRound}"                # --model, and the bench --tokenizer
: "${DRAFT:=RedHatAI/gemma-4-31B-it-speculator.eagle3}"          # "model" in --speculative-config
: "${SERVED_MODEL_NAME:=gemmacut-spectral}"                      # --served-model-name
: "${SPECTRAL_SIDECAR:=/opt/gemmacut/artifacts/spectral_sidecar_chat_v2.pt}"  # --spectral-calibration
: "${VLLM_SOURCE:=/opt/vllm-spectral}"                           # tree copied by prepare_overlay
: "${PORT:=8000}"                                                # --port, also used by the clients
: "${MAX_MODEL_LEN:=512}"                                        # --max-model-len
: "${MAX_NUM_BATCHED_TOKENS:=512}"                               # --max-num-batched-tokens
: "${MAX_NUM_SEQS:=2}"                                           # --max-num-seqs
: "${GPU_MEMORY_UTILIZATION:=0.8}"                               # --gpu-memory-utilization
: "${NUM_SPEC_TOKENS:=3}"                                        # "num_speculative_tokens"
: "${ENABLE_SPECTRAL:=1}"                                        # "1" adds the --spectral-* flags
: "${ENABLE_EAGLE:=1}"                                           # "1" adds --speculative-config
: "${SPECTRAL_CUDA_GRAPH:=1}"                                    # exported further down
: "${VLLM_LOGGING_LEVEL:=INFO}"                                  # exported further down
: "${DISABLE_HYBRID_KV_CACHE_MANAGER:=0}"                        # "1" adds the matching flag
: "${RESULTS_ROOT:=/workspace/results_bench}"                    # parent of per-run output dirs

# Settings handed to child processes through the environment.
export VLLM_LOGGING_LEVEL SPECTRAL_CUDA_GRAPH

# Exported switches; each keeps a caller-supplied value and otherwise takes
# the default given here.
: "${SPECTRAL_TRITON_COMPRESS:=1}"
: "${SPECTRAL_TRITON_DEQUANT:=1}"
: "${SPECTRAL_VERIFY:=0}"
: "${HF_HUB_DISABLE_XET:=1}"
export SPECTRAL_TRITON_COMPRESS SPECTRAL_TRITON_DEQUANT SPECTRAL_VERIFY HF_HUB_DISABLE_XET

# SPECTRAL_SHARED_ALLOC is always removed, whatever the caller had set.
unset SPECTRAL_SHARED_ALLOC

# HF_HUB_OFFLINE is forwarded only when it is exactly "1"; any other value
# (including empty) is removed from the environment.
case "${HF_HUB_OFFLINE:-0}" in
  1) export HF_HUB_OFFLINE=1 ;;
  *) unset HF_HUB_OFFLINE ;;
esac

#######################################
# Create a fresh working copy of the vLLM source tree, link selected files
# from the vllm package that python3 imports into that copy, and put the copy
# at the front of PYTHONPATH.
# Globals:
#   VLLM_SOURCE       (read)  directory that is copied
#   ENABLE_SPECTRAL   (read)  when "1", SPECTRAL_SIDECAR must be an existing file
#   SPECTRAL_SIDECAR  (read)
#   SPECTRAL_RUN_SRC  (read)  destination of the copy
#                             (default /tmp/vllm-spectral-run); it is deleted
#                             and recreated on every call
#   PYTHONPATH        (written, exported)
# Exits:
#   1 when VLLM_SOURCE is missing, the sidecar is required but missing, or the
#   destination is the same directory as VLLM_SOURCE.
#######################################
prepare_overlay() {
  local run_src="${SPECTRAL_RUN_SRC:-/tmp/vllm-spectral-run}"
  local site
  local f
  local had_nullglob=0

  if [ ! -d "$VLLM_SOURCE" ]; then
    echo "Missing VLLM_SOURCE: $VLLM_SOURCE" >&2
    exit 1
  fi
  if [ "$ENABLE_SPECTRAL" = "1" ] && [ ! -f "$SPECTRAL_SIDECAR" ]; then
    echo "Missing SPECTRAL_SIDECAR: $SPECTRAL_SIDECAR" >&2
    exit 1
  fi
  # The destination is wiped below; refuse to wipe the tree we are about to copy.
  if [ "$run_src" -ef "$VLLM_SOURCE" ]; then
    echo "SPECTRAL_RUN_SRC must not be the same directory as VLLM_SOURCE: $run_src" >&2
    exit 1
  fi

  # Directory of the vllm package as resolved by python3.
  site="$(python3 - <<'PY'
import pathlib
import vllm
print(pathlib.Path(vllm.__file__).resolve().parent)
PY
)"

  rm -rf -- "${run_src:?}"
  cp -a -- "$VLLM_SOURCE" "$run_src"

  # nullglob makes patterns without a match expand to nothing, so the loops
  # below simply skip files that the package does not ship. Remember the
  # previous setting so it can be put back afterwards.
  if shopt -q nullglob; then
    had_nullglob=1
  fi
  shopt -s nullglob
  for f in "$site"/_C*.so "$site"/_moe_C*.so "$site"/_flashmla*.so "$site"/cumem_allocator*.so; do
    ln -sf "$f" "$run_src/vllm/"
  done
  mkdir -p "$run_src/vllm/vllm_flash_attn"
  for f in "$site"/vllm_flash_attn/_vllm_fa2_C*.so "$site"/vllm_flash_attn/_vllm_fa3_C*.so; do
    ln -sf "$f" "$run_src/vllm/vllm_flash_attn/"
  done
  ln -sfn "$site/vllm_flash_attn/cute" "$run_src/vllm/vllm_flash_attn/cute"
  ln -sfn "$site/vllm_flash_attn/layers" "$run_src/vllm/vllm_flash_attn/layers"
  mkdir -p "$run_src/vllm/third_party" "$run_src/vllm/third_party/flashmla"
  ln -sfn "$site/third_party/triton_kernels" "$run_src/vllm/third_party/triton_kernels"
  ln -sf "$site/third_party/flashmla/flash_mla_interface.py" "$run_src/vllm/third_party/flashmla/"
  if [ "$had_nullglob" = "0" ]; then
    shopt -u nullglob
  fi

  export PYTHONPATH="$run_src:$run_src/vllm/third_party${PYTHONPATH:+:$PYTHONPATH}"
}

#######################################
# Print the api_server command-line arguments, each one followed by a NUL
# byte, so a caller can rebuild the list exactly as it was assembled.
# Globals (read):
#   HOST (default 0.0.0.0), PORT, MODEL, SERVED_MODEL_NAME, MAX_MODEL_LEN,
#   MAX_NUM_BATCHED_TOKENS, MAX_NUM_SEQS, GPU_MEMORY_UTILIZATION,
#   ENABLE_SPECTRAL, SPECTRAL_SIDECAR, ENABLE_EAGLE, DRAFT, NUM_SPEC_TOKENS,
#   DISABLE_HYBRID_KV_CACHE_MANAGER
# Outputs:
#   NUL-delimited argument list on stdout
#######################################
server_args() {
  local -a argv
  local spec_json

  # Arguments that are always present.
  argv=(
    --host "${HOST:-0.0.0.0}"
    --port "$PORT"
    --model "$MODEL"
    --served-model-name "$SERVED_MODEL_NAME"
    --kv-cache-dtype fp8_e4m3
    --max-model-len "$MAX_MODEL_LEN"
    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
    --max-num-seqs "$MAX_NUM_SEQS"
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
    --compilation-config '{"compile_sizes": []}'
  )

  # Optional groups, each switched on by its variable being exactly "1".
  if [[ $ENABLE_SPECTRAL == 1 ]]; then
    argv+=(--spectral-calibration "$SPECTRAL_SIDECAR" --spectral-quantize)
  fi

  if [[ $ENABLE_EAGLE == 1 ]]; then
    printf -v spec_json \
      '{"model":"%s","num_speculative_tokens":%s,"method":"eagle3"}' \
      "$DRAFT" "$NUM_SPEC_TOKENS"
    argv+=(--speculative-config "$spec_json")
  fi

  if [[ $DISABLE_HYBRID_KV_CACHE_MANAGER == 1 ]]; then
    argv+=(--disable-hybrid-kv-cache-manager)
  fi

  printf '%s\0' "${argv[@]}"
}

#######################################
# Prepare the source overlay, then replace this shell with the OpenAI API
# server process.
# Arguments:
#   "$@" - extra arguments appended after the ones produced by server_args
# Returns:
#   Does not return when exec succeeds.
#######################################
run_server() {
  prepare_overlay
  local args=()
  local item  # kept local so the read loop does not leave a global behind
  # server_args emits NUL-terminated items; read them back one by one so
  # values containing spaces or newlines stay intact.
  while IFS= read -r -d '' item; do
    args+=("$item")
  done < <(server_args)
  exec python3 -m vllm.entrypoints.openai.api_server "${args[@]}" "$@"
}

#######################################
# Block until the background server answers GET /v1/models with HTTP 200.
# Environment (read by the embedded Python, so these must be exported):
#   SERVER_PID      - process id that must stay alive while waiting
#   PORT            - port polled on 127.0.0.1
#   SERVER_TIMEOUT  - seconds to wait before giving up (default 300)
# Outputs:
#   "SERVER_READY" on stdout once the server responds
# Returns:
#   0 when ready; non-zero with a message on stderr when the process is gone
#   or the deadline passes.
#######################################
wait_for_server() {
  # Quoted delimiter: the Python source is passed through verbatim, with no
  # shell expansion applied to it.
  python3 - <<'PY'
import os
import sys
import time
import urllib.request

pid = int(os.environ["SERVER_PID"])
port = int(os.environ["PORT"])
deadline = time.time() + int(os.environ.get("SERVER_TIMEOUT", "300"))
url = f"http://127.0.0.1:{port}/v1/models"
while time.time() < deadline:
    # Signal 0 only checks that the process can still be signalled.
    try:
        os.kill(pid, 0)
    except OSError:
        raise SystemExit("server exited early")
    try:
        with urllib.request.urlopen(url, timeout=2) as response:
            if response.status == 200:
                print("SERVER_READY", flush=True)
                # SystemExit is not an Exception subclass, so the handler
                # below does not swallow it.
                raise SystemExit(0)
    except Exception:
        pass
    # Pause after every unsuccessful poll, including a response that was
    # received without error but was not a 200.
    time.sleep(1)
raise SystemExit("server did not become ready")
PY
}

#######################################
# Start the API server in the background, bound to 127.0.0.1, and wait until
# it is ready.
# Globals:
#   SERVER_LOG  (read)     file receiving the server's stdout and stderr
#   HOST        (written, exported)  forced to 127.0.0.1
#   SERVER_PID  (written, exported)  pid of the background server
#   PORT        (exported)
# Side effects:
#   Installs an EXIT trap that kills the server and waits for it.
#######################################
start_background_server() {
  prepare_overlay
  local args=()
  local item  # kept local so the read loop does not leave a global behind
  HOST=127.0.0.1
  export HOST
  while IFS= read -r -d '' item; do
    args+=("$item")
  done < <(server_args)
  python3 -m vllm.entrypoints.openai.api_server "${args[@]}" > "$SERVER_LOG" 2>&1 &
  SERVER_PID=$!
  # wait_for_server reads both values from the environment.
  export SERVER_PID PORT
  # Single quotes: $SERVER_PID is expanded when the trap fires, not now.
  trap 'kill "$SERVER_PID" >/dev/null 2>&1 || true; wait "$SERVER_PID" >/dev/null 2>&1 || true' EXIT
  wait_for_server
}

#######################################
# Send two fixed chat prompts to the local server and check that each reply
# contains the expected answer (case-insensitive substring match).
# Globals:
#   SERVED_MODEL_NAME (read)  model name sent in the request
#   PORT              (read)  port of the server on 127.0.0.1
# Outputs:
#   One "<prompt> => <reply>" line per prompt, then "SMOKE_PROMPTS_OK".
# Returns:
#   Non-zero when a request fails or a reply lacks the expected text.
#######################################
run_smoke_client() {
  # The values travel through the environment and the heredoc delimiter is
  # quoted, so nothing from the shell is spliced into the Python source; a
  # quote or backslash in a value can no longer alter the program.
  SERVED_MODEL_NAME="$SERVED_MODEL_NAME" PORT="$PORT" python3 - <<'PY'
import json
import os
import urllib.request

model = os.environ["SERVED_MODEL_NAME"]
port = os.environ["PORT"]
url = f"http://127.0.0.1:{port}/v1/chat/completions"
checks = [
    ("What is 2+2? Answer with just the number.", "4"),
    ("Paris is the capital of which country? Answer with one word.", "France"),
]

for prompt, expected in checks:
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 16,
        "temperature": 0,
    }
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=120) as response:
        data = json.load(response)
    text = data["choices"][0]["message"]["content"].strip()
    print(f"{prompt} => {text}", flush=True)
    if expected.lower() not in text.lower():
        raise SystemExit(
            f"semantic smoke failed: expected {expected!r} in response {text!r}")

print("SMOKE_PROMPTS_OK", flush=True)
PY
}

#######################################
# Smoke mode: start the server in the background, run the prompt checks and
# keep their output next to the server log.
# Globals:
#   RUN_ID       (read/written)  defaults to smoke_<timestamp>
#   RESULTS_DIR  (read)          output directory; overrides RESULTS_ROOT/RUN_ID
#   RESULTS_ROOT (read)
#   OUT, SERVER_LOG (written)
# Outputs:
#   The client output, then a "SMOKE_OUT=<dir>" line.
#######################################
run_smoke() {
  RUN_ID="${RUN_ID:-smoke_$(date +%Y%m%d_%H%M%S)}"
  OUT="${RESULTS_DIR:-$RESULTS_ROOT/$RUN_ID}"
  SERVER_LOG="$OUT/server.log"

  mkdir -p "$OUT"
  start_background_server

  # tee keeps a copy of the client output in the run directory.
  run_smoke_client | tee "$OUT/smoke_outputs.txt"
  printf 'SMOKE_OUT=%s\n' "$OUT"
}

#######################################
# Bench mode: start the server in the background, optionally run the smoke
# prompts, then run "bench serve" against it and store the results.
# Globals:
#   RUN_ID       (read/written)  defaults to tokens_sec_phase2_eagle_<timestamp>
#   RESULTS_DIR  (read)          output directory; overrides RESULTS_ROOT/RUN_ID
#   RESULTS_ROOT, PORT, SERVED_MODEL_NAME, MODEL (read)
#   RUN_SMOKE    (read)  "1" runs the smoke prompts first
#   SMOKE_ONLY   (read)  "1" exits with status 0 before the benchmark
#   INPUT_LEN, OUTPUT_LEN, NUM_PROMPTS, NUM_WARMUPS, REQUEST_RATE (read)
#                defaults 128, 32, 8, 1, inf
#   OUT, SERVER_LOG (written)
# Outputs:
#   Benchmark output (also saved to bench.log), then "BENCH_OUT=<dir>".
#######################################
run_bench() {
  local -a bench_cmd

  RUN_ID="${RUN_ID:-tokens_sec_phase2_eagle_$(date +%Y%m%d_%H%M%S)}"
  OUT="${RESULTS_DIR:-$RESULTS_ROOT/$RUN_ID}"
  SERVER_LOG="$OUT/server.log"

  mkdir -p "$OUT"
  start_background_server

  if [ "${RUN_SMOKE:-0}" = "1" ]; then
    run_smoke_client | tee "$OUT/smoke_outputs.txt"
  fi

  if [ "${SMOKE_ONLY:-0}" = "1" ]; then
    printf '%s\n' "SMOKE_ONLY=1; skipping benchmark"
    printf 'BENCH_OUT=%s\n' "$OUT"
    exit 0
  fi

  # The benchmark command is assembled as an array so every value stays a
  # single argument.
  bench_cmd=(
    python3 -m vllm.entrypoints.cli.main bench serve
    --backend openai-chat
    --base-url "http://127.0.0.1:$PORT"
    --endpoint /v1/chat/completions
    --model "$SERVED_MODEL_NAME"
    --tokenizer "$MODEL"
    --dataset-name random
    --random-input-len "${INPUT_LEN:-128}"
    --random-output-len "${OUTPUT_LEN:-32}"
    --num-prompts "${NUM_PROMPTS:-8}"
    --num-warmups "${NUM_WARMUPS:-1}"
    --request-rate "${REQUEST_RATE:-inf}"
    --temperature 0
    --ignore-eos
    --disable-tqdm
    --save-result
    --result-dir "$OUT"
    --result-filename bench.json
  )
  # stderr is merged into stdout so bench.log captures both streams.
  "${bench_cmd[@]}" 2>&1 | tee "$OUT/bench.log"

  printf 'BENCH_OUT=%s\n' "$OUT"
}

# Mode dispatch. "serve" forwards the remaining arguments to the server;
# "smoke" and "bench" take none. Any other first argument (bash and sh
# included) is executed as a command with the remaining arguments, replacing
# this script.
case "$COMMAND" in
  serve) run_server "$@" ;;
  smoke) run_smoke ;;
  bench) run_bench ;;
  *) exec "$COMMAND" "$@" ;;
esac