gemmacut-spectral / docker /entrypoint.sh
satya007's picture
Add constrained smoke switches
b025705 verified
#!/usr/bin/env bash
# Entrypoint configuration: strict mode, sub-command selection, and every
# tunable resolved from the environment with a fallback default.
set -euo pipefail

# The first positional argument names the sub-command (default: serve); it is
# shifted away so that "$@" holds only the remaining arguments.
COMMAND="${1:-serve}"
if (( $# > 0 )); then
  shift
fi

# Model, draft model and sidecar locations. An unset OR empty variable falls
# back to the default shown here.
: "${MODEL:=Intel/gemma-4-31B-it-int4-AutoRound}"
: "${DRAFT:=RedHatAI/gemma-4-31B-it-speculator.eagle3}"
: "${SERVED_MODEL_NAME:=gemmacut-spectral}"
: "${SPECTRAL_SIDECAR:=/opt/gemmacut/artifacts/spectral_sidecar_chat_v2.pt}"
: "${VLLM_SOURCE:=/opt/vllm-spectral}"

# Server sizing knobs, passed to the api_server command line by server_args.
: "${PORT:=8000}"
: "${MAX_MODEL_LEN:=512}"
: "${MAX_NUM_BATCHED_TOKENS:=512}"
: "${MAX_NUM_SEQS:=2}"
: "${GPU_MEMORY_UTILIZATION:=0.8}"
: "${NUM_SPEC_TOKENS:=3}"

# Feature switches; the value "1" enables the corresponding option.
: "${ENABLE_SPECTRAL:=1}"
: "${ENABLE_EAGLE:=1}"
: "${DISABLE_HYBRID_KV_CACHE_MANAGER:=0}"

# Parent directory for smoke/bench result folders.
: "${RESULTS_ROOT:=/workspace/results_bench}"

# Settings handed to child processes through the environment.
export VLLM_LOGGING_LEVEL="${VLLM_LOGGING_LEVEL:-INFO}"
export SPECTRAL_CUDA_GRAPH="${SPECTRAL_CUDA_GRAPH:-1}"
export SPECTRAL_TRITON_COMPRESS="${SPECTRAL_TRITON_COMPRESS:-1}"
export SPECTRAL_TRITON_DEQUANT="${SPECTRAL_TRITON_DEQUANT:-1}"
export SPECTRAL_VERIFY="${SPECTRAL_VERIFY:-0}"
export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}"

# SPECTRAL_SHARED_ALLOC is never inherited from the caller's environment.
unset SPECTRAL_SHARED_ALLOC

# HF_HUB_OFFLINE is exported only when it is exactly "1"; any other value
# (including empty) removes the variable altogether.
case "${HF_HUB_OFFLINE:-0}" in
  1) export HF_HUB_OFFLINE=1 ;;
  *) unset HF_HUB_OFFLINE ;;
esac
#######################################
# Copies the source tree at VLLM_SOURCE to a scratch directory, symlinks the
# shared objects and support directories of the installed vllm package into
# that copy, and prepends the copy to PYTHONPATH.
# Globals:
#   VLLM_SOURCE       (read)  directory that is copied
#   ENABLE_SPECTRAL   (read)  "1" makes the sidecar file mandatory
#   SPECTRAL_SIDECAR  (read)  path checked when ENABLE_SPECTRAL is "1"
#   SPECTRAL_RUN_SRC  (read)  optional scratch location
#                             (default /tmp/vllm-spectral-run)
#   PYTHONPATH        (read, written, exported)
# Arguments:
#   None
# Outputs:
#   A "Missing ..." message on stderr when a required path is absent.
# Returns:
#   Exits the shell with status 1 when VLLM_SOURCE is not a directory or the
#   required sidecar file does not exist.
#######################################
prepare_overlay() {
  local run_src="${SPECTRAL_RUN_SRC:-/tmp/vllm-spectral-run}"
  local site
  local f
  local had_nullglob=0

  if [[ ! -d "$VLLM_SOURCE" ]]; then
    echo "Missing VLLM_SOURCE: $VLLM_SOURCE" >&2
    exit 1
  fi
  if [[ "$ENABLE_SPECTRAL" == "1" && ! -f "$SPECTRAL_SIDECAR" ]]; then
    echo "Missing SPECTRAL_SIDECAR: $SPECTRAL_SIDECAR" >&2
    exit 1
  fi

  # Directory of the vllm package that python3 imports.
  site="$(python3 - <<'PY'
import pathlib
import vllm
print(pathlib.Path(vllm.__file__).resolve().parent)
PY
  )"

  # Start from a clean copy on every call; ':?' aborts instead of expanding to
  # an empty path.
  rm -rf -- "${run_src:?}"
  cp -a "$VLLM_SOURCE" "$run_src"

  # nullglob makes a pattern without matches expand to nothing, so the loops
  # below simply skip libraries that are not present. Remember the caller's
  # setting so it can be put back afterwards.
  if shopt -q nullglob; then
    had_nullglob=1
  fi
  shopt -s nullglob

  for f in "$site"/_C*.so "$site"/_moe_C*.so "$site"/_flashmla*.so "$site"/cumem_allocator*.so; do
    ln -sf "$f" "$run_src/vllm/"
  done

  mkdir -p "$run_src/vllm/vllm_flash_attn"
  for f in "$site"/vllm_flash_attn/_vllm_fa2_C*.so "$site"/vllm_flash_attn/_vllm_fa3_C*.so; do
    ln -sf "$f" "$run_src/vllm/vllm_flash_attn/"
  done
  # -n replaces an existing link instead of descending into the directory it
  # points at.
  ln -sfn "$site/vllm_flash_attn/cute" "$run_src/vllm/vllm_flash_attn/cute"
  ln -sfn "$site/vllm_flash_attn/layers" "$run_src/vllm/vllm_flash_attn/layers"

  mkdir -p "$run_src/vllm/third_party" "$run_src/vllm/third_party/flashmla"
  ln -sfn "$site/third_party/triton_kernels" "$run_src/vllm/third_party/triton_kernels"
  ln -sf "$site/third_party/flashmla/flash_mla_interface.py" "$run_src/vllm/third_party/flashmla/"

  if [[ "$had_nullglob" == "0" ]]; then
    shopt -u nullglob
  fi

  # The copy (and its third_party directory) take precedence over anything
  # already on PYTHONPATH.
  export PYTHONPATH="$run_src:$run_src/vllm/third_party${PYTHONPATH:+:$PYTHONPATH}"
}
#######################################
# Prints the api_server command-line arguments, one NUL-terminated string per
# argument, so callers can rebuild the list without word-splitting.
# Globals (all read):
#   HOST (optional, default 0.0.0.0), PORT, MODEL, SERVED_MODEL_NAME,
#   MAX_MODEL_LEN, MAX_NUM_BATCHED_TOKENS, MAX_NUM_SEQS,
#   GPU_MEMORY_UTILIZATION, ENABLE_SPECTRAL, SPECTRAL_SIDECAR, ENABLE_EAGLE,
#   DRAFT, NUM_SPEC_TOKENS, DISABLE_HYBRID_KV_CACHE_MANAGER
# Arguments:
#   None
# Outputs:
#   NUL-delimited argument list on stdout.
#######################################
server_args() {
  local -a argv=()
  local spec_json

  # Options that are always present.
  argv+=(--host "${HOST:-0.0.0.0}")
  argv+=(--port "$PORT")
  argv+=(--model "$MODEL")
  argv+=(--served-model-name "$SERVED_MODEL_NAME")
  argv+=(--kv-cache-dtype fp8_e4m3)
  argv+=(--max-model-len "$MAX_MODEL_LEN")
  argv+=(--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS")
  argv+=(--max-num-seqs "$MAX_NUM_SEQS")
  argv+=(--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION")
  argv+=(--compilation-config '{"compile_sizes": []}')

  # Optional groups, each gated by a switch whose enabling value is "1".
  if [[ "$ENABLE_SPECTRAL" == "1" ]]; then
    argv+=(--spectral-calibration "$SPECTRAL_SIDECAR")
    argv+=(--spectral-quantize)
  fi

  if [[ "$ENABLE_EAGLE" == "1" ]]; then
    # DRAFT and NUM_SPEC_TOKENS are inserted verbatim into the JSON text.
    printf -v spec_json \
      '{"model":"%s","num_speculative_tokens":%s,"method":"eagle3"}' \
      "$DRAFT" "$NUM_SPEC_TOKENS"
    argv+=(--speculative-config "$spec_json")
  fi

  if [[ "$DISABLE_HYBRID_KV_CACHE_MANAGER" == "1" ]]; then
    argv+=(--disable-hybrid-kv-cache-manager)
  fi

  printf '%s\0' "${argv[@]}"
}
#######################################
# Prepares the overlay and then replaces the shell with the OpenAI-compatible
# api_server, running it in the foreground.
# Arguments:
#   Any extra arguments are appended after the ones produced by server_args.
# Returns:
#   Does not return on success (the process is replaced via exec).
#######################################
run_server() {
  local -a cmdline=(python3 -m vllm.entrypoints.openai.api_server)
  local arg

  prepare_overlay

  # server_args emits NUL-terminated strings; read them back one by one so
  # that values containing spaces stay intact.
  while IFS= read -r -d '' arg; do
    cmdline+=("$arg")
  done < <(server_args)

  exec "${cmdline[@]}" "$@"
}
#######################################
# Polls http://127.0.0.1:$PORT/v1/models until it answers with HTTP 200.
# Environment (read by the embedded Python, so the variables must be exported):
#   SERVER_PID      PID of the server process being waited for
#   PORT            TCP port the server listens on
#   SERVER_TIMEOUT  optional overall limit in seconds (default 300)
# Outputs:
#   "SERVER_READY" on stdout once the endpoint answers; an error message on
#   stderr otherwise.
# Returns:
#   0 when the server is ready; non-zero when the process can no longer be
#   signalled or the timeout elapses.
#######################################
wait_for_server() {
  # The delimiter is quoted so the shell passes the Python text through
  # untouched; all inputs come from the environment instead.
  python3 - <<'PY'
import os
import time
import urllib.request

pid = int(os.environ["SERVER_PID"])
port = int(os.environ["PORT"])
# A monotonic clock keeps the deadline unaffected by wall-clock adjustments.
deadline = time.monotonic() + int(os.environ.get("SERVER_TIMEOUT", "300"))
url = f"http://127.0.0.1:{port}/v1/models"

while time.monotonic() < deadline:
    # Signal 0 delivers nothing; it only checks that the PID can be signalled.
    try:
        os.kill(pid, 0)
    except OSError:
        raise SystemExit("server exited early")
    try:
        with urllib.request.urlopen(url, timeout=2) as response:
            if response.status == 200:
                print("SERVER_READY", flush=True)
                raise SystemExit(0)
    except Exception:
        # Connection refused, timeouts and HTTP errors all mean "not yet".
        pass
    # Pause after every unsuccessful attempt, including a reply whose status
    # is not 200, so the loop never spins.
    time.sleep(1)

raise SystemExit("server did not become ready")
PY
}
#######################################
# Starts the api_server in the background bound to 127.0.0.1, arranges for it
# to be stopped when the shell exits, and blocks until it is ready.
# Globals:
#   SERVER_LOG  (read)     file receiving the server's stdout and stderr
#   HOST        (written)  forced to 127.0.0.1 and exported
#   SERVER_PID  (written)  PID of the background server, exported
#   PORT        (read)     exported for wait_for_server
# Arguments:
#   None
# Returns:
#   Status of wait_for_server; exits the shell with an error message when
#   SERVER_LOG is unset or empty.
#######################################
start_background_server() {
  local args=()
  local item

  # Fail with a clear message rather than on an empty redirection target.
  : "${SERVER_LOG:?SERVER_LOG must be set before starting the server}"

  prepare_overlay

  # The server started here is only reached over loopback.
  HOST=127.0.0.1
  export HOST

  # server_args emits NUL-terminated strings; collect them into an array.
  while IFS= read -r -d '' item; do
    args+=("$item")
  done < <(server_args)

  python3 -m vllm.entrypoints.openai.api_server "${args[@]}" > "$SERVER_LOG" 2>&1 &
  SERVER_PID=$!
  export SERVER_PID PORT

  # Stop and reap the server on every exit path; both steps are best-effort
  # because the process may already be gone.
  trap 'kill "$SERVER_PID" >/dev/null 2>&1 || true; wait "$SERVER_PID" >/dev/null 2>&1 || true' EXIT

  wait_for_server
}
#######################################
# Sends two fixed chat prompts to the local server and checks that each reply
# contains the expected answer (case-insensitive substring match).
# Globals:
#   SERVED_MODEL_NAME  (read)  model name placed in each request
#   PORT               (read)  port of the server on 127.0.0.1
# Arguments:
#   None
# Outputs:
#   One "<prompt> => <reply>" line per check, then "SMOKE_PROMPTS_OK".
# Returns:
#   Non-zero when a request fails or a reply lacks the expected text.
#######################################
run_smoke_client() {
  # Both values are handed over through the environment and the heredoc
  # delimiter is quoted, so a model name containing quotes, backslashes or
  # "$" can neither break nor alter the Python program.
  SERVED_MODEL_NAME="$SERVED_MODEL_NAME" PORT="$PORT" python3 - <<'PY'
import json
import os
import urllib.request

model = os.environ["SERVED_MODEL_NAME"]
port = os.environ["PORT"]
url = f"http://127.0.0.1:{port}/v1/chat/completions"

checks = [
    ("What is 2+2? Answer with just the number.", "4"),
    ("Paris is the capital of which country? Answer with one word.", "France"),
]

for prompt, expected in checks:
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 16,
        "temperature": 0,
    }
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=120) as response:
        data = json.load(response)
    text = data["choices"][0]["message"]["content"].strip()
    print(f"{prompt} => {text}", flush=True)
    if expected.lower() not in text.lower():
        raise SystemExit(
            f"semantic smoke failed: expected {expected!r} in response {text!r}")

print("SMOKE_PROMPTS_OK", flush=True)
PY
}
#######################################
# Starts a background server, runs the smoke prompts against it and stores
# their output next to the server log.
# Globals:
#   RUN_ID        (read, written)  defaults to smoke_<timestamp>
#   RESULTS_DIR   (read)           optional explicit output directory
#   RESULTS_ROOT  (read)           parent directory used when RESULTS_DIR is
#                                  unset or empty
#   OUT, SERVER_LOG (written)
# Arguments:
#   None
# Outputs:
#   The smoke client's output, followed by "SMOKE_OUT=<directory>".
#######################################
run_smoke() {
  : "${RUN_ID:=smoke_$(date +%Y%m%d_%H%M%S)}"

  # An explicit RESULTS_DIR wins; otherwise results go under RESULTS_ROOT.
  if [[ -n "${RESULTS_DIR:-}" ]]; then
    OUT="$RESULTS_DIR"
  else
    OUT="$RESULTS_ROOT/$RUN_ID"
  fi
  mkdir -p "$OUT"

  # start_background_server redirects the server's output to this file.
  SERVER_LOG="$OUT/server.log"
  start_background_server

  # Show the client's output and keep a copy in the results directory.
  run_smoke_client | tee "$OUT/smoke_outputs.txt"
  printf 'SMOKE_OUT=%s\n' "$OUT"
}
#######################################
# Starts a background server, optionally runs the smoke prompts, then runs
# "vllm bench serve" against it with a random dataset and saves the results.
# Globals:
#   RUN_ID        (read, written)  defaults to tokens_sec_phase2_eagle_<timestamp>
#   RESULTS_DIR   (read)           optional explicit output directory
#   RESULTS_ROOT  (read)           parent directory used otherwise
#   RUN_SMOKE     (read)           "1" runs the smoke client first
#   SMOKE_ONLY    (read)           "1" exits with status 0 before benchmarking
#   PORT, SERVED_MODEL_NAME, MODEL (read)
#   INPUT_LEN, OUTPUT_LEN, NUM_PROMPTS, NUM_WARMUPS, REQUEST_RATE
#                 (read)           optional; defaults 128, 32, 8, 1, inf
#   OUT, SERVER_LOG (written)
# Arguments:
#   None
# Outputs:
#   Benchmark output (also written to bench.log), then "BENCH_OUT=<directory>".
#######################################
run_bench() {
  local -a bench_cmd

  : "${RUN_ID:=tokens_sec_phase2_eagle_$(date +%Y%m%d_%H%M%S)}"
  if [[ -n "${RESULTS_DIR:-}" ]]; then
    OUT="$RESULTS_DIR"
  else
    OUT="$RESULTS_ROOT/$RUN_ID"
  fi
  mkdir -p "$OUT"

  SERVER_LOG="$OUT/server.log"
  start_background_server

  if [[ "${RUN_SMOKE:-0}" == "1" ]]; then
    run_smoke_client | tee "$OUT/smoke_outputs.txt"
  fi

  # SMOKE_ONLY ends the whole script here, before any benchmark is started.
  if [[ "${SMOKE_ONLY:-0}" == "1" ]]; then
    printf '%s\n' "SMOKE_ONLY=1; skipping benchmark" "BENCH_OUT=$OUT"
    exit 0
  fi

  bench_cmd=(
    python3 -m vllm.entrypoints.cli.main bench serve
    --backend openai-chat
    --base-url "http://127.0.0.1:$PORT"
    --endpoint /v1/chat/completions
    --model "$SERVED_MODEL_NAME"
    --tokenizer "$MODEL"
    --dataset-name random
    --random-input-len "${INPUT_LEN:-128}"
    --random-output-len "${OUTPUT_LEN:-32}"
    --num-prompts "${NUM_PROMPTS:-8}"
    --num-warmups "${NUM_WARMUPS:-1}"
    --request-rate "${REQUEST_RATE:-inf}"
    --temperature 0
    --ignore-eos
    --disable-tqdm
    --save-result
    --result-dir "$OUT"
    --result-filename bench.json
  )

  # stderr is merged into stdout so bench.log captures everything shown.
  "${bench_cmd[@]}" 2>&1 | tee "$OUT/bench.log"
  printf 'BENCH_OUT=%s\n' "$OUT"
}
# Dispatch on the sub-command chosen at the top of the script. "$@" holds the
# arguments that followed it on the command line.
case "$COMMAND" in
  serve) run_server "$@" ;;
  smoke) run_smoke ;;
  bench) run_bench ;;
  *)
    # Anything else, bash and sh included, is run as given with the remaining
    # arguments and replaces this shell.
    exec "$COMMAND" "$@"
    ;;
esac