#!/usr/bin/env bash
# Launcher script: the first positional argument names the sub-command to run
# and is stored in COMMAND; whatever follows it stays in "$@".
set -euo pipefail

# Fall back to "serve" when no sub-command was given. The sub-command is then
# dropped from the positional parameters (only when there is one to drop, since
# `shift` with nothing to shift would fail under `set -e`).
COMMAND="${1:-serve}"
if (( $# > 0 )); then
  shift
fi
|
|
# Configuration knobs. Each one keeps the value supplied by the environment
# when that value is set and non-empty, and otherwise takes the default below.
: "${MODEL:=Intel/gemma-4-31B-it-int4-AutoRound}"
: "${DRAFT:=RedHatAI/gemma-4-31B-it-speculator.eagle3}"
: "${SERVED_MODEL_NAME:=gemmacut-spectral}"
: "${SPECTRAL_SIDECAR:=/opt/gemmacut/artifacts/spectral_sidecar_chat_v2.pt}"
: "${VLLM_SOURCE:=/opt/vllm-spectral}"
: "${PORT:=8000}"
: "${MAX_MODEL_LEN:=512}"
: "${MAX_NUM_BATCHED_TOKENS:=512}"
: "${MAX_NUM_SEQS:=2}"
: "${GPU_MEMORY_UTILIZATION:=0.8}"
: "${NUM_SPEC_TOKENS:=3}"
# Feature toggles: the rest of the script compares these against the string "1".
: "${ENABLE_SPECTRAL:=1}"
: "${ENABLE_EAGLE:=1}"
: "${SPECTRAL_CUDA_GRAPH:=1}"
: "${VLLM_LOGGING_LEVEL:=INFO}"
: "${DISABLE_HYBRID_KV_CACHE_MANAGER:=0}"
: "${RESULTS_ROOT:=/workspace/results_bench}"
|
|
# Environment handed to child processes. The SPECTRAL_* switches and
# HF_HUB_DISABLE_XET keep a caller-supplied non-empty value and otherwise take
# the default given here.
: "${SPECTRAL_TRITON_COMPRESS:=1}"
: "${SPECTRAL_TRITON_DEQUANT:=1}"
: "${SPECTRAL_VERIFY:=0}"
: "${HF_HUB_DISABLE_XET:=1}"
export VLLM_LOGGING_LEVEL SPECTRAL_CUDA_GRAPH
export SPECTRAL_TRITON_COMPRESS SPECTRAL_TRITON_DEQUANT SPECTRAL_VERIFY
export HF_HUB_DISABLE_XET
# Never pass an inherited SPECTRAL_SHARED_ALLOC on to child processes.
unset SPECTRAL_SHARED_ALLOC
|
|
# Normalise HF_HUB_OFFLINE: only the exact value "1" is kept (and exported);
# any other value, or none at all, removes the variable entirely.
case "${HF_HUB_OFFLINE:-0}" in
  1)
    export HF_HUB_OFFLINE=1
    ;;
  *)
    unset HF_HUB_OFFLINE
    ;;
esac
|
|
#######################################
# Stage a fresh copy of the vLLM source tree and put it first on PYTHONPATH.
#
# The copy is taken from VLLM_SOURCE. Compiled extension modules (*.so) and a
# few sub-packages are symlinked into it from the vllm package that python3
# currently imports.
#
# Globals:
#   VLLM_SOURCE       (read)    source tree to copy; must be a directory
#   ENABLE_SPECTRAL   (read)    when "1", SPECTRAL_SIDECAR must be a file
#   SPECTRAL_SIDECAR  (read)    path checked when ENABLE_SPECTRAL is "1"
#   SPECTRAL_RUN_SRC  (read)    staging directory, default /tmp/vllm-spectral-run
#   PYTHONPATH        (written) staging dir and its vllm/third_party prepended
# Outputs:
#   Diagnostics on stderr when a prerequisite is missing.
# Returns:
#   Exits the shell with status 1 when a prerequisite is missing.
#######################################
prepare_overlay() {
  local run_src="${SPECTRAL_RUN_SRC:-/tmp/vllm-spectral-run}"
  local site
  local f
  local nullglob_was_off=0

  if [ ! -d "$VLLM_SOURCE" ]; then
    echo "Missing VLLM_SOURCE: $VLLM_SOURCE" >&2
    exit 1
  fi
  if [ "$ENABLE_SPECTRAL" = "1" ] && [ ! -f "$SPECTRAL_SIDECAR" ]; then
    echo "Missing SPECTRAL_SIDECAR: $SPECTRAL_SIDECAR" >&2
    exit 1
  fi

  # Directory of the vllm package python3 resolves right now; it supplies the
  # files that get linked into the staged copy.
  site="$(python3 - <<'PY'
import pathlib
import vllm
print(pathlib.Path(vllm.__file__).resolve().parent)
PY
  )"
  # Stop here rather than link from a bogus location when the lookup printed
  # nothing usable.
  if [ -z "$site" ] || [ ! -d "$site" ]; then
    echo "Could not locate the installed vllm package directory" >&2
    exit 1
  fi

  # Start from a clean staging directory on every run. ${run_src:?} aborts
  # instead of expanding to an empty path.
  rm -rf -- "${run_src:?}"
  cp -a -- "$VLLM_SOURCE" "$run_src"

  # With nullglob, patterns that match nothing vanish instead of being linked
  # literally. Remember the previous setting so it can be restored afterwards.
  shopt -q nullglob || nullglob_was_off=1
  shopt -s nullglob
  for f in "$site"/_C*.so "$site"/_moe_C*.so "$site"/_flashmla*.so "$site"/cumem_allocator*.so; do
    ln -sf "$f" "$run_src/vllm/"
  done
  mkdir -p "$run_src/vllm/vllm_flash_attn"
  for f in "$site"/vllm_flash_attn/_vllm_fa2_C*.so "$site"/vllm_flash_attn/_vllm_fa3_C*.so; do
    ln -sf "$f" "$run_src/vllm/vllm_flash_attn/"
  done
  ln -sfn "$site/vllm_flash_attn/cute" "$run_src/vllm/vllm_flash_attn/cute"
  ln -sfn "$site/vllm_flash_attn/layers" "$run_src/vllm/vllm_flash_attn/layers"
  mkdir -p "$run_src/vllm/third_party" "$run_src/vllm/third_party/flashmla"
  ln -sfn "$site/third_party/triton_kernels" "$run_src/vllm/third_party/triton_kernels"
  ln -sf "$site/third_party/flashmla/flash_mla_interface.py" "$run_src/vllm/third_party/flashmla/"
  if [ "$nullglob_was_off" = "1" ]; then
    shopt -u nullglob
  fi

  export PYTHONPATH="$run_src:$run_src/vllm/third_party${PYTHONPATH:+:$PYTHONPATH}"
}
|
|
#######################################
# Emit the API-server command-line arguments, one per NUL-terminated record.
#
# NUL termination lets the caller rebuild the exact argument list even though
# some arguments are JSON strings containing spaces and quotes.
#
# Globals (read):
#   HOST (default 0.0.0.0), PORT, MODEL, SERVED_MODEL_NAME, MAX_MODEL_LEN,
#   MAX_NUM_BATCHED_TOKENS, MAX_NUM_SEQS, GPU_MEMORY_UTILIZATION,
#   ENABLE_SPECTRAL, SPECTRAL_SIDECAR, ENABLE_EAGLE, DRAFT, NUM_SPEC_TOKENS,
#   DISABLE_HYBRID_KV_CACHE_MANAGER
# Outputs:
#   NUL-delimited arguments on stdout.
#######################################
server_args() {
  local spec_json
  local -a argv=()

  # Arguments that are always present.
  argv+=(--host "${HOST:-0.0.0.0}")
  argv+=(--port "$PORT")
  argv+=(--model "$MODEL")
  argv+=(--served-model-name "$SERVED_MODEL_NAME")
  argv+=(--kv-cache-dtype fp8_e4m3)
  argv+=(--max-model-len "$MAX_MODEL_LEN")
  argv+=(--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS")
  argv+=(--max-num-seqs "$MAX_NUM_SEQS")
  argv+=(--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION")
  argv+=(--compilation-config '{"compile_sizes": []}')

  # Optional feature groups, each enabled by its toggle being exactly "1".
  if [[ "$ENABLE_SPECTRAL" == "1" ]]; then
    argv+=(--spectral-calibration "$SPECTRAL_SIDECAR" --spectral-quantize)
  fi
  if [[ "$ENABLE_EAGLE" == "1" ]]; then
    printf -v spec_json \
      '{"model":"%s","num_speculative_tokens":%s,"method":"eagle3"}' \
      "$DRAFT" "$NUM_SPEC_TOKENS"
    argv+=(--speculative-config "$spec_json")
  fi
  if [[ "$DISABLE_HYBRID_KV_CACHE_MANAGER" == "1" ]]; then
    argv+=(--disable-hybrid-kv-cache-manager)
  fi

  printf '%s\0' "${argv[@]}"
}
|
|
#######################################
# Prepare the source overlay, then replace this shell with the API server.
#
# Arguments:
#   Any extra arguments are appended after the generated server arguments.
# Returns:
#   Does not return on success; `exec` replaces the current process.
#######################################
run_server() {
  # Keep the loop variable local, like the array it fills.
  local item
  local args=()

  prepare_overlay
  # server_args emits NUL-terminated records; read them back one at a time so
  # arguments containing spaces or quotes survive intact.
  while IFS= read -r -d '' item; do
    args+=("$item")
  done < <(server_args)
  exec python3 -m vllm.entrypoints.openai.api_server "${args[@]}" "$@"
}
|
|
#######################################
# Block until the background server answers on /v1/models, or give up.
#
# Environment (read by the embedded Python, so these must be exported):
#   SERVER_PID      PID of the server process to watch
#   PORT            port polled on 127.0.0.1
#   SERVER_TIMEOUT  seconds to keep trying (default 300)
# Outputs:
#   "SERVER_READY" on stdout once the endpoint returns HTTP 200.
# Returns:
#   0 when ready; non-zero, with a message on stderr, when the server process
#   is gone or the deadline passes.
#######################################
wait_for_server() {
  # The delimiter is quoted because the script needs no shell expansion.
  python3 - <<'PY'
import os
import sys
import time
import urllib.request

server_pid = int(os.environ["SERVER_PID"])
server_port = int(os.environ["PORT"])
give_up_at = time.time() + int(os.getenv("SERVER_TIMEOUT", "300"))
models_url = f"http://127.0.0.1:{server_port}/v1/models"

while time.time() < give_up_at:
    # Signal 0 delivers nothing; it only checks that the PID can be signalled.
    try:
        os.kill(server_pid, 0)
    except OSError:
        sys.exit("server exited early")
    # Any failure to fetch the endpoint means "not up yet": pause and retry.
    try:
        with urllib.request.urlopen(models_url, timeout=2) as reply:
            if reply.status == 200:
                print("SERVER_READY", flush=True)
                sys.exit(0)
    except Exception:
        time.sleep(1)

sys.exit("server did not become ready")
PY
}
|
|
#######################################
# Launch the API server in the background, bound to loopback, and wait until
# it is ready.
#
# Globals:
#   SERVER_LOG  (read)    file receiving the server's stdout and stderr
#   HOST        (written) forced to 127.0.0.1 and exported
#   SERVER_PID  (written) PID of the background server, exported
#   PORT        (written) exported so the readiness probe can read it
# Side effects:
#   Installs an EXIT trap that kills and reaps the server.
# Returns:
#   Status of wait_for_server.
#######################################
start_background_server() {
  # Local so the loop variable does not linger in the script's global scope.
  local item
  local args=()

  prepare_overlay
  # Must be set before server_args runs so the server binds to loopback only.
  HOST=127.0.0.1
  export HOST
  while IFS= read -r -d '' item; do
    args+=("$item")
  done < <(server_args)
  python3 -m vllm.entrypoints.openai.api_server "${args[@]}" > "$SERVER_LOG" 2>&1 &
  SERVER_PID=$!
  export SERVER_PID PORT
  # Single quotes: SERVER_PID is expanded when the trap fires, not now.
  trap 'kill "$SERVER_PID" >/dev/null 2>&1 || true; wait "$SERVER_PID" >/dev/null 2>&1 || true' EXIT
  wait_for_server
}
|
|
#######################################
# Send two fixed chat prompts to the local server and check each reply
# contains the expected answer (case-insensitive substring match).
#
# Globals (read):
#   SERVED_MODEL_NAME  model name placed in the request payload
#   PORT               port of the server on 127.0.0.1
# Outputs:
#   One "<prompt> => <reply>" line per check, then "SMOKE_PROMPTS_OK".
# Returns:
#   Non-zero when a request fails or a reply lacks the expected text.
#######################################
run_smoke_client() {
  # The values travel through the environment and the heredoc delimiter is
  # quoted, so they are never spliced into Python source. A quote or backslash
  # in a value can therefore not break or alter the script.
  SERVED_MODEL_NAME="$SERVED_MODEL_NAME" PORT="$PORT" python3 - <<'PY'
import json
import os
import urllib.request

model = os.environ["SERVED_MODEL_NAME"]
port = os.environ["PORT"]
url = f"http://127.0.0.1:{port}/v1/chat/completions"
checks = [
    ("What is 2+2? Answer with just the number.", "4"),
    ("Paris is the capital of which country? Answer with one word.", "France"),
]

for prompt, expected in checks:
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 16,
        "temperature": 0,
    }
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=120) as response:
        data = json.load(response)
    # Treat a missing or null content as an empty reply, so it is reported as
    # a smoke failure rather than crashing with AttributeError.
    text = (data["choices"][0]["message"].get("content") or "").strip()
    print(f"{prompt} => {text}", flush=True)
    if expected.lower() not in text.lower():
        raise SystemExit(
            f"semantic smoke failed: expected {expected!r} in response {text!r}")

print("SMOKE_PROMPTS_OK", flush=True)
PY
}
|
|
#######################################
# Start the server in the background and run the smoke prompts against it.
#
# Globals:
#   RUN_ID       (written) kept if already set, else smoke_<timestamp>
#   RESULTS_DIR  (read)    output directory override
#   RESULTS_ROOT (read)    parent of the default output directory
#   OUT          (written) output directory actually used
#   SERVER_LOG   (written) server log path inside OUT
# Outputs:
#   Smoke client output (also saved to OUT/smoke_outputs.txt), followed by a
#   "SMOKE_OUT=<dir>" line.
#######################################
run_smoke() {
  RUN_ID="${RUN_ID:-smoke_$(date +%Y%m%d_%H%M%S)}"
  OUT="${RESULTS_DIR:-${RESULTS_ROOT}/${RUN_ID}}"
  SERVER_LOG="${OUT}/server.log"
  mkdir -p "$OUT"

  start_background_server
  run_smoke_client | tee "${OUT}/smoke_outputs.txt"
  printf 'SMOKE_OUT=%s\n' "$OUT"
}
|
|
#######################################
# Start the server in the background and run `vllm bench serve` against it.
#
# Globals:
#   RUN_ID       (written) kept if set, else tokens_sec_phase2_eagle_<timestamp>
#   RESULTS_DIR  (read)    output directory override
#   RESULTS_ROOT (read)    parent of the default output directory
#   OUT          (written) output directory actually used
#   SERVER_LOG   (written) server log path inside OUT
#   RUN_SMOKE    (read)    "1" runs the smoke prompts before benchmarking
#   SMOKE_ONLY   (read)    "1" exits with status 0 before benchmarking
#   PORT, SERVED_MODEL_NAME, MODEL (read)
#   INPUT_LEN (128), OUTPUT_LEN (32), NUM_PROMPTS (8), NUM_WARMUPS (1),
#   REQUEST_RATE (inf) (read; defaults in parentheses)
# Outputs:
#   Benchmark output (also saved to OUT/bench.log), followed by a
#   "BENCH_OUT=<dir>" line.
#######################################
run_bench() {
  local -a bench_cmd

  RUN_ID="${RUN_ID:-tokens_sec_phase2_eagle_$(date +%Y%m%d_%H%M%S)}"
  OUT="${RESULTS_DIR:-${RESULTS_ROOT}/${RUN_ID}}"
  SERVER_LOG="${OUT}/server.log"
  mkdir -p "$OUT"
  start_background_server

  if [[ "${RUN_SMOKE:-0}" == "1" ]]; then
    run_smoke_client | tee "${OUT}/smoke_outputs.txt"
  fi
  if [[ "${SMOKE_ONLY:-0}" == "1" ]]; then
    printf '%s\n' "SMOKE_ONLY=1; skipping benchmark"
    printf 'BENCH_OUT=%s\n' "$OUT"
    exit 0
  fi

  bench_cmd=(
    python3 -m vllm.entrypoints.cli.main bench serve
    --backend openai-chat
    --base-url "http://127.0.0.1:${PORT}"
    --endpoint /v1/chat/completions
    --model "$SERVED_MODEL_NAME"
    --tokenizer "$MODEL"
    --dataset-name random
    --random-input-len "${INPUT_LEN:-128}"
    --random-output-len "${OUTPUT_LEN:-32}"
    --num-prompts "${NUM_PROMPTS:-8}"
    --num-warmups "${NUM_WARMUPS:-1}"
    --request-rate "${REQUEST_RATE:-inf}"
    --temperature 0
    --ignore-eos
    --disable-tqdm
    --save-result
    --result-dir "$OUT"
    --result-filename bench.json
  )
  # Fold stderr into stdout so the saved log is complete.
  "${bench_cmd[@]}" 2>&1 | tee "${OUT}/bench.log"

  printf 'BENCH_OUT=%s\n' "$OUT"
}
|
|
# Sub-command dispatch. serve forwards the remaining arguments to the server;
# smoke and bench take none.
case "$COMMAND" in
  serve) run_server "$@" ;;
  smoke) run_smoke ;;
  bench) run_bench ;;
  *)
    # Everything else, bash and sh included, is run as a program in place of
    # this script, with the remaining arguments passed through.
    exec "$COMMAND" "$@"
    ;;
esac
|
|