#!/usr/bin/env bash set -euo pipefail COMMAND="${1:-serve}" if [ "$#" -gt 0 ]; then shift fi MODEL="${MODEL:-Intel/gemma-4-31B-it-int4-AutoRound}" DRAFT="${DRAFT:-RedHatAI/gemma-4-31B-it-speculator.eagle3}" SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-gemmacut-spectral}" SPECTRAL_SIDECAR="${SPECTRAL_SIDECAR:-/opt/gemmacut/artifacts/spectral_sidecar_chat_v2.pt}" VLLM_SOURCE="${VLLM_SOURCE:-/opt/vllm-spectral}" PORT="${PORT:-8000}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-512}" MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-512}" MAX_NUM_SEQS="${MAX_NUM_SEQS:-2}" GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.8}" NUM_SPEC_TOKENS="${NUM_SPEC_TOKENS:-3}" ENABLE_SPECTRAL="${ENABLE_SPECTRAL:-1}" ENABLE_EAGLE="${ENABLE_EAGLE:-1}" SPECTRAL_CUDA_GRAPH="${SPECTRAL_CUDA_GRAPH:-1}" VLLM_LOGGING_LEVEL="${VLLM_LOGGING_LEVEL:-INFO}" DISABLE_HYBRID_KV_CACHE_MANAGER="${DISABLE_HYBRID_KV_CACHE_MANAGER:-0}" RESULTS_ROOT="${RESULTS_ROOT:-/workspace/results_bench}" export VLLM_LOGGING_LEVEL export SPECTRAL_CUDA_GRAPH export SPECTRAL_TRITON_COMPRESS="${SPECTRAL_TRITON_COMPRESS:-1}" export SPECTRAL_TRITON_DEQUANT="${SPECTRAL_TRITON_DEQUANT:-1}" export SPECTRAL_VERIFY="${SPECTRAL_VERIFY:-0}" export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}" unset SPECTRAL_SHARED_ALLOC if [ "${HF_HUB_OFFLINE:-0}" = "1" ]; then export HF_HUB_OFFLINE=1 else unset HF_HUB_OFFLINE fi prepare_overlay() { local run_src="${SPECTRAL_RUN_SRC:-/tmp/vllm-spectral-run}" local site if [ ! -d "$VLLM_SOURCE" ]; then echo "Missing VLLM_SOURCE: $VLLM_SOURCE" >&2 exit 1 fi if [ "$ENABLE_SPECTRAL" = "1" ] && [ ! -f "$SPECTRAL_SIDECAR" ]; then echo "Missing SPECTRAL_SIDECAR: $SPECTRAL_SIDECAR" >&2 exit 1 fi site="$(python3 - <<'PY' import pathlib import vllm print(pathlib.Path(vllm.__file__).resolve().parent) PY )" rm -rf "$run_src" cp -a "$VLLM_SOURCE" "$run_src" shopt -s nullglob for f in "$site"/_C*.so "$site"/_moe_C*.so "$site"/_flashmla*.so "$site"/cumem_allocator*.so; do ln -sf "$f" "$run_src/vllm/" done mkdir -p "$run_src/vllm/vllm_flash_attn" for f in "$site"/vllm_flash_attn/_vllm_fa2_C*.so "$site"/vllm_flash_attn/_vllm_fa3_C*.so; do ln -sf "$f" "$run_src/vllm/vllm_flash_attn/" done ln -sfn "$site/vllm_flash_attn/cute" "$run_src/vllm/vllm_flash_attn/cute" ln -sfn "$site/vllm_flash_attn/layers" "$run_src/vllm/vllm_flash_attn/layers" mkdir -p "$run_src/vllm/third_party" "$run_src/vllm/third_party/flashmla" ln -sfn "$site/third_party/triton_kernels" "$run_src/vllm/third_party/triton_kernels" ln -sf "$site/third_party/flashmla/flash_mla_interface.py" "$run_src/vllm/third_party/flashmla/" shopt -u nullglob export PYTHONPATH="$run_src:$run_src/vllm/third_party${PYTHONPATH:+:$PYTHONPATH}" } server_args() { local args=( --host "${HOST:-0.0.0.0}" --port "$PORT" --model "$MODEL" --served-model-name "$SERVED_MODEL_NAME" --kv-cache-dtype fp8_e4m3 --max-model-len "$MAX_MODEL_LEN" --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" --max-num-seqs "$MAX_NUM_SEQS" --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" --compilation-config "{\"compile_sizes\": []}" ) if [ "$ENABLE_SPECTRAL" = "1" ]; then args+=( --spectral-calibration "$SPECTRAL_SIDECAR" --spectral-quantize ) fi if [ "$ENABLE_EAGLE" = "1" ]; then args+=(--speculative-config "{\"model\":\"$DRAFT\",\"num_speculative_tokens\":$NUM_SPEC_TOKENS,\"method\":\"eagle3\"}") fi if [ "$DISABLE_HYBRID_KV_CACHE_MANAGER" = "1" ]; then args+=(--disable-hybrid-kv-cache-manager) fi printf '%s\0' "${args[@]}" } run_server() { prepare_overlay local args=() while IFS= read -r -d '' item; do args+=("$item") done < <(server_args) exec python3 -m vllm.entrypoints.openai.api_server "${args[@]}" "$@" } wait_for_server() { python3 - < "$SERVER_LOG" 2>&1 & SERVER_PID=$! export SERVER_PID PORT trap 'kill "$SERVER_PID" >/dev/null 2>&1 || true; wait "$SERVER_PID" >/dev/null 2>&1 || true' EXIT wait_for_server } run_smoke_client() { python3 - < {text}", flush=True) if expected.lower() not in text.lower(): raise SystemExit( f"semantic smoke failed: expected {expected!r} in response {text!r}") print("SMOKE_PROMPTS_OK", flush=True) PY } run_smoke() { RUN_ID="${RUN_ID:-smoke_$(date +%Y%m%d_%H%M%S)}" OUT="${RESULTS_DIR:-$RESULTS_ROOT/$RUN_ID}" mkdir -p "$OUT" SERVER_LOG="$OUT/server.log" start_background_server run_smoke_client | tee "$OUT/smoke_outputs.txt" echo "SMOKE_OUT=$OUT" } run_bench() { RUN_ID="${RUN_ID:-tokens_sec_phase2_eagle_$(date +%Y%m%d_%H%M%S)}" OUT="${RESULTS_DIR:-$RESULTS_ROOT/$RUN_ID}" mkdir -p "$OUT" SERVER_LOG="$OUT/server.log" start_background_server if [ "${RUN_SMOKE:-0}" = "1" ]; then run_smoke_client | tee "$OUT/smoke_outputs.txt" fi if [ "${SMOKE_ONLY:-0}" = "1" ]; then echo "SMOKE_ONLY=1; skipping benchmark" echo "BENCH_OUT=$OUT" exit 0 fi python3 -m vllm.entrypoints.cli.main bench serve \ --backend openai-chat \ --base-url "http://127.0.0.1:$PORT" \ --endpoint /v1/chat/completions \ --model "$SERVED_MODEL_NAME" \ --tokenizer "$MODEL" \ --dataset-name random \ --random-input-len "${INPUT_LEN:-128}" \ --random-output-len "${OUTPUT_LEN:-32}" \ --num-prompts "${NUM_PROMPTS:-8}" \ --num-warmups "${NUM_WARMUPS:-1}" \ --request-rate "${REQUEST_RATE:-inf}" \ --temperature 0 \ --ignore-eos \ --disable-tqdm \ --save-result \ --result-dir "$OUT" \ --result-filename bench.json \ 2>&1 | tee "$OUT/bench.log" echo "BENCH_OUT=$OUT" } case "$COMMAND" in serve) run_server "$@" ;; smoke) run_smoke ;; bench) run_bench ;; bash|sh) exec "$COMMAND" "$@" ;; *) exec "$COMMAND" "$@" ;; esac