gemmacut-spectral / docker /entrypoint.sh

Add constrained smoke switches

b025705 verified about 1 month ago

7.59 kB

	#!/usr/bin/env bash
	# Container entrypoint. The first positional argument selects the sub-command
	# (default: serve); the remaining arguments stay in "$@" for that command.
	# Every setting below can be overridden from the environment.
	set -euo pipefail

	COMMAND="${1:-serve}"
	# Drop the sub-command from "$@" only when one was actually supplied.
	(( $# == 0 )) || shift

	# Model and server configuration (shell variables; exported only if the
	# caller already exported them).
	: "${MODEL:=Intel/gemma-4-31B-it-int4-AutoRound}"
	: "${DRAFT:=RedHatAI/gemma-4-31B-it-speculator.eagle3}"
	: "${SERVED_MODEL_NAME:=gemmacut-spectral}"
	: "${SPECTRAL_SIDECAR:=/opt/gemmacut/artifacts/spectral_sidecar_chat_v2.pt}"
	: "${VLLM_SOURCE:=/opt/vllm-spectral}"
	: "${PORT:=8000}"
	: "${MAX_MODEL_LEN:=512}"
	: "${MAX_NUM_BATCHED_TOKENS:=512}"
	: "${MAX_NUM_SEQS:=2}"
	: "${GPU_MEMORY_UTILIZATION:=0.8}"
	: "${NUM_SPEC_TOKENS:=3}"
	: "${ENABLE_SPECTRAL:=1}"
	: "${ENABLE_EAGLE:=1}"
	: "${SPECTRAL_CUDA_GRAPH:=1}"
	: "${VLLM_LOGGING_LEVEL:=INFO}"
	: "${DISABLE_HYBRID_KV_CACHE_MANAGER:=0}"
	: "${RESULTS_ROOT:=/workspace/results_bench}"

	# Settings that child processes read from their environment.
	: "${SPECTRAL_TRITON_COMPRESS:=1}"
	: "${SPECTRAL_TRITON_DEQUANT:=1}"
	: "${SPECTRAL_VERIFY:=0}"
	: "${HF_HUB_DISABLE_XET:=1}"
	export VLLM_LOGGING_LEVEL SPECTRAL_CUDA_GRAPH
	export SPECTRAL_TRITON_COMPRESS SPECTRAL_TRITON_DEQUANT SPECTRAL_VERIFY
	export HF_HUB_DISABLE_XET
	# Never inherit this switch from the caller's environment.
	unset SPECTRAL_SHARED_ALLOC

	# HF_HUB_OFFLINE is either exactly "1" or absent; any other value is removed.
	case "${HF_HUB_OFFLINE:-0}" in
		1) export HF_HUB_OFFLINE=1 ;;
		*) unset HF_HUB_OFFLINE ;;
	esac

	#######################################
	# Build a runnable copy of the vLLM source tree and put it on PYTHONPATH.
	#
	# The tree at VLLM_SOURCE is copied to a scratch directory, then compiled
	# extension modules and a few package directories are symlinked in from the
	# vllm package that python3 currently imports (presumably because the source
	# tree carries no compiled binaries -- confirm against the image build).
	#
	# Globals:
	#   VLLM_SOURCE       (read) source tree to copy; must be a directory
	#   ENABLE_SPECTRAL   (read) when "1", SPECTRAL_SIDECAR must be a file
	#   SPECTRAL_SIDECAR  (read) sidecar path checked for existence
	#   SPECTRAL_RUN_SRC  (read, optional) scratch directory; it is removed and
	#                     recreated on every call (default /tmp/vllm-spectral-run)
	#   PYTHONPATH        (written) scratch tree prepended, then exported
	# Outputs:
	#   A "Missing ..." message on stderr when a prerequisite is absent.
	# Returns:
	#   Exits the shell with status 1 when a prerequisite is absent.
	#######################################
	prepare_overlay() {
		local run_src="${SPECTRAL_RUN_SRC:-/tmp/vllm-spectral-run}"
		local site lib

		if [ ! -d "$VLLM_SOURCE" ]; then
			echo "Missing VLLM_SOURCE: $VLLM_SOURCE" >&2
			exit 1
		fi
		if [ "$ENABLE_SPECTRAL" = "1" ] && [ ! -f "$SPECTRAL_SIDECAR" ]; then
			echo "Missing SPECTRAL_SIDECAR: $SPECTRAL_SIDECAR" >&2
			exit 1
		fi

		# Directory of the vllm package python3 imports. "<<-" strips the leading
		# tabs so the here-document body and its terminator may be indented.
		site="$(python3 - <<-'PY'
			import pathlib
			import vllm
			print(pathlib.Path(vllm.__file__).resolve().parent)
			PY
		)"

		rm -rf "$run_src"
		cp -a "$VLLM_SOURCE" "$run_src"

		# Link only the extension modules that exist in this installation. The
		# names contain no glob characters, so nullglob cannot filter them and an
		# explicit existence test is needed to avoid dangling symlinks.
		for lib in _C.so _moe_C.so _flashmla.so cumem_allocator.so; do
			[ -e "$site/$lib" ] || continue
			ln -sf "$site/$lib" "$run_src/vllm/"
		done
		mkdir -p "$run_src/vllm/vllm_flash_attn"
		for lib in _vllm_fa2_C.so _vllm_fa3_C.so; do
			[ -e "$site/vllm_flash_attn/$lib" ] || continue
			ln -sf "$site/vllm_flash_attn/$lib" "$run_src/vllm/vllm_flash_attn/"
		done
		ln -sfn "$site/vllm_flash_attn/cute" "$run_src/vllm/vllm_flash_attn/cute"
		ln -sfn "$site/vllm_flash_attn/layers" "$run_src/vllm/vllm_flash_attn/layers"
		mkdir -p "$run_src/vllm/third_party" "$run_src/vllm/third_party/flashmla"
		ln -sfn "$site/third_party/triton_kernels" "$run_src/vllm/third_party/triton_kernels"
		ln -sf "$site/third_party/flashmla/flash_mla_interface.py" "$run_src/vllm/third_party/flashmla/"

		export PYTHONPATH="$run_src:$run_src/vllm/third_party${PYTHONPATH:+:$PYTHONPATH}"
	}

	#######################################
	# Print the API-server command-line words on stdout, each terminated by a
	# NUL byte so callers can read them back into an array without word
	# splitting.
	#
	# Globals (read):
	#   HOST (optional, default 0.0.0.0), PORT, MODEL, SERVED_MODEL_NAME,
	#   MAX_MODEL_LEN, MAX_NUM_BATCHED_TOKENS, MAX_NUM_SEQS,
	#   GPU_MEMORY_UTILIZATION, ENABLE_SPECTRAL, SPECTRAL_SIDECAR, ENABLE_EAGLE,
	#   DRAFT, NUM_SPEC_TOKENS, DISABLE_HYBRID_KV_CACHE_MANAGER
	#######################################
	server_args() {
		local -a argv=()
		local spec_json

		# Options that are always present.
		argv+=(--host "${HOST:-0.0.0.0}")
		argv+=(--port "$PORT")
		argv+=(--model "$MODEL")
		argv+=(--served-model-name "$SERVED_MODEL_NAME")
		argv+=(--kv-cache-dtype fp8_e4m3)
		argv+=(--max-model-len "$MAX_MODEL_LEN")
		argv+=(--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS")
		argv+=(--max-num-seqs "$MAX_NUM_SEQS")
		argv+=(--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION")
		argv+=(--compilation-config '{"compile_sizes": []}')

		# Optional feature switches; each is enabled only by the exact value "1".
		if [[ "$ENABLE_SPECTRAL" == "1" ]]; then
			argv+=(--spectral-calibration "$SPECTRAL_SIDECAR" --spectral-quantize)
		fi
		if [[ "$ENABLE_EAGLE" == "1" ]]; then
			printf -v spec_json \
				'{"model":"%s","num_speculative_tokens":%s,"method":"eagle3"}' \
				"$DRAFT" "$NUM_SPEC_TOKENS"
			argv+=(--speculative-config "$spec_json")
		fi
		if [[ "$DISABLE_HYBRID_KV_CACHE_MANAGER" == "1" ]]; then
			argv+=(--disable-hybrid-kv-cache-manager)
		fi

		printf '%s\0' "${argv[@]}"
	}

	#######################################
	# Prepare the source overlay, then replace this shell with the OpenAI API
	# server process.
	#
	# Arguments:
	#   "$@" - extra words appended after the generated server options
	# Returns:
	#   Does not return when exec succeeds.
	#######################################
	run_server() {
		prepare_overlay

		# Start from the fixed interpreter prefix and append the generated options,
		# which server_args emits as NUL-terminated words.
		local -a cmdline=(python3 -m vllm.entrypoints.openai.api_server)
		local word
		while IFS= read -r -d '' word; do
			cmdline+=("$word")
		done < <(server_args)

		exec "${cmdline[@]}" "$@"
	}

	#######################################
	# Block until the background server answers HTTP 200 on /v1/models.
	#
	# The polling logic runs in python3, which reads these from its environment,
	# so they must be exported:
	#   SERVER_PID      pid of the server process to watch
	#   PORT            port polled on 127.0.0.1
	#   SERVER_TIMEOUT  optional overall deadline in seconds (default 300)
	# Outputs:
	#   "SERVER_READY" on stdout once the server responds.
	# Returns:
	#   0 when ready; non-zero with a message on stderr when the server process
	#   has gone away or the deadline passes.
	#######################################
	wait_for_server() {
		# The delimiter is quoted so the shell never expands anything inside the
		# Python source; "<<-" strips the leading tabs used for indentation.
		python3 - <<-'PY'
			import os
			import sys
			import time
			import urllib.request

			pid = int(os.environ["SERVER_PID"])
			port = int(os.environ["PORT"])
			deadline = time.time() + int(os.environ.get("SERVER_TIMEOUT", "300"))
			url = f"http://127.0.0.1:{port}/v1/models"
			while time.time() < deadline:
			    # Signal 0 delivers nothing; it only checks that the pid still exists.
			    try:
			        os.kill(pid, 0)
			    except OSError:
			        raise SystemExit("server exited early")
			    try:
			        with urllib.request.urlopen(url, timeout=2) as response:
			            if response.status == 200:
			                print("SERVER_READY", flush=True)
			                raise SystemExit(0)
			    except Exception:
			        # Connection errors are expected while the server is starting.
			        pass
			    # Pause after every unsuccessful poll, including a non-200 reply,
			    # so the loop never spins without a delay.
			    time.sleep(1)
			raise SystemExit("server did not become ready")
			PY
	}

	#######################################
	# Start the API server in the background, bound to loopback, and wait until
	# it is ready.
	#
	# Globals:
	#   SERVER_LOG  (read)    file receiving the server's stdout and stderr
	#   HOST        (written) forced to 127.0.0.1 and exported
	#   SERVER_PID  (written) pid of the background server, exported
	#   PORT        (read)    exported so the readiness probe can see it
	# Side effects:
	#   Installs an EXIT trap that terminates and reaps the server.
	# Returns:
	#   The status of wait_for_server.
	#######################################
	start_background_server() {
		local -a args=()
		local item

		prepare_overlay

		# Benchmarks and smoke tests talk to the server over loopback only.
		HOST=127.0.0.1
		export HOST
		while IFS= read -r -d '' item; do
			args+=("$item")
		done < <(server_args)

		python3 -m vllm.entrypoints.openai.api_server "${args[@]}" > "$SERVER_LOG" 2>&1 &
		SERVER_PID=$!
		export SERVER_PID PORT

		# Best-effort cleanup: the server may already be gone, so failures of kill
		# and wait are deliberately ignored. The "||" must be real shell operators;
		# written as "\|\|" they would be passed to kill/wait as arguments.
		trap 'kill "$SERVER_PID" >/dev/null 2>&1 || true; wait "$SERVER_PID" >/dev/null 2>&1 || true' EXIT
		wait_for_server
	}

	#######################################
	# Send two fixed chat prompts to the local server and check that each reply
	# contains the expected answer (case-insensitive substring match).
	#
	# Globals (read):
	#   SERVED_MODEL_NAME  model name placed in each request
	#   PORT               port of the server on 127.0.0.1
	# Outputs:
	#   One "<prompt> => <reply>" line per prompt, then "SMOKE_PROMPTS_OK".
	# Returns:
	#   0 when both replies match; non-zero when a reply lacks the expected
	#   text or a request raises.
	#######################################
	run_smoke_client() {
		# Values reach Python through the environment instead of being spliced
		# into the source, so quotes or backslashes in them cannot break or alter
		# the program. The quoted delimiter disables shell expansion in the body;
		# "<<-" strips the leading tabs used for indentation.
		SERVED_MODEL_NAME="$SERVED_MODEL_NAME" PORT="$PORT" python3 - <<-'PY'
			import json
			import os
			import urllib.request

			model = os.environ["SERVED_MODEL_NAME"]
			port = os.environ["PORT"]
			url = f"http://127.0.0.1:{port}/v1/chat/completions"
			checks = [
			    ("What is 2+2? Answer with just the number.", "4"),
			    ("Paris is the capital of which country? Answer with one word.", "France"),
			]

			for prompt, expected in checks:
			    payload = {
			        "model": model,
			        "messages": [{"role": "user", "content": prompt}],
			        "max_tokens": 16,
			        "temperature": 0,
			    }
			    request = urllib.request.Request(
			        url,
			        data=json.dumps(payload).encode("utf-8"),
			        headers={"Content-Type": "application/json"},
			        method="POST",
			    )
			    with urllib.request.urlopen(request, timeout=120) as response:
			        data = json.load(response)
			    text = data["choices"][0]["message"]["content"].strip()
			    print(f"{prompt} => {text}", flush=True)
			    if expected.lower() not in text.lower():
			        raise SystemExit(
			            f"semantic smoke failed: expected {expected!r} in response {text!r}")

			print("SMOKE_PROMPTS_OK", flush=True)
			PY
	}

	#######################################
	# Start a background server and run the smoke prompts against it.
	#
	# Globals:
	#   RUN_ID        (read/written) run label; defaults to smoke_<timestamp>
	#   RESULTS_DIR   (read, optional) output directory override
	#   RESULTS_ROOT  (read) parent of the default output directory
	#   OUT           (written) output directory actually used
	#   SERVER_LOG    (written) server log path inside OUT
	# Outputs:
	#   The smoke client's output (also saved to OUT/smoke_outputs.txt), then
	#   "SMOKE_OUT=<dir>".
	#######################################
	run_smoke() {
		RUN_ID="${RUN_ID:-smoke_$(date +%Y%m%d_%H%M%S)}"
		OUT="${RESULTS_DIR:-$RESULTS_ROOT/$RUN_ID}"
		mkdir -p "$OUT"
		SERVER_LOG="$OUT/server.log"
		start_background_server
		# A real pipe: the client output is shown and saved at the same time.
		# With pipefail enabled by the script, a failing client fails the pipeline.
		run_smoke_client | tee "$OUT/smoke_outputs.txt"
		echo "SMOKE_OUT=$OUT"
	}

	#######################################
	# Start a background server and run "vllm bench serve" against it.
	#
	# Globals:
	#   RUN_ID        (read/written) run label; defaults to
	#                 tokens_sec_phase2_eagle_<timestamp>
	#   RESULTS_DIR   (read, optional) output directory override
	#   RESULTS_ROOT  (read) parent of the default output directory
	#   OUT           (written) output directory actually used
	#   SERVER_LOG    (written) server log path inside OUT
	#   RUN_SMOKE     (read, optional) "1" runs the smoke prompts first
	#   SMOKE_ONLY    (read, optional) "1" exits 0 before the benchmark
	#   PORT, SERVED_MODEL_NAME, MODEL (read) benchmark target and tokenizer
	#   INPUT_LEN, OUTPUT_LEN, NUM_PROMPTS, NUM_WARMUPS, REQUEST_RATE
	#                 (read, optional) workload knobs; defaults 128, 32, 8, 1, inf
	# Outputs:
	#   Benchmark output (also saved to OUT/bench.log; results requested as
	#   OUT/bench.json), then "BENCH_OUT=<dir>".
	#######################################
	run_bench() {
		RUN_ID="${RUN_ID:-tokens_sec_phase2_eagle_$(date +%Y%m%d_%H%M%S)}"
		OUT="${RESULTS_DIR:-$RESULTS_ROOT/$RUN_ID}"
		mkdir -p "$OUT"
		SERVER_LOG="$OUT/server.log"
		start_background_server

		if [ "${RUN_SMOKE:-0}" = "1" ]; then
			# A real pipe: show the smoke output and keep a copy on disk.
			run_smoke_client | tee "$OUT/smoke_outputs.txt"
		fi
		if [ "${SMOKE_ONLY:-0}" = "1" ]; then
			echo "SMOKE_ONLY=1; skipping benchmark"
			echo "BENCH_OUT=$OUT"
			exit 0
		fi

		# stderr is merged into stdout so bench.log captures both streams.
		python3 -m vllm.entrypoints.cli.main bench serve \
			--backend openai-chat \
			--base-url "http://127.0.0.1:$PORT" \
			--endpoint /v1/chat/completions \
			--model "$SERVED_MODEL_NAME" \
			--tokenizer "$MODEL" \
			--dataset-name random \
			--random-input-len "${INPUT_LEN:-128}" \
			--random-output-len "${OUTPUT_LEN:-32}" \
			--num-prompts "${NUM_PROMPTS:-8}" \
			--num-warmups "${NUM_WARMUPS:-1}" \
			--request-rate "${REQUEST_RATE:-inf}" \
			--temperature 0 \
			--ignore-eos \
			--disable-tqdm \
			--save-result \
			--result-dir "$OUT" \
			--result-filename bench.json \
			2>&1 | tee "$OUT/bench.log"

		echo "BENCH_OUT=$OUT"
	}

	# Dispatch on the sub-command taken from the first positional argument.
	# Only "serve" forwards the remaining arguments; "smoke" and "bench" are
	# configured through environment variables.
	case "$COMMAND" in
		serve)
			run_server "$@"
			;;
		smoke)
			run_smoke
			;;
		bench)
			run_bench
			;;
		*)
			# Anything else -- including an interactive bash or sh -- is run as
			# given, replacing this shell. A separate bash/sh arm did exactly the
			# same thing, so one default arm covers both.
			exec "$COMMAND" "$@"
			;;
	esac