kaiju-coder-7-quantized-runtime / scripts /run-gojira-b-vllm-serving-benchmark.sh
restokes92's picture
Upload Kaiju Coder 7 runtime quantization recipe
6d7449a verified
#!/usr/bin/env bash
#
# run-gojira-b-vllm-serving-benchmark.sh
#
# Swaps the running model server over to vLLM, waits for the vLLM endpoint to
# list the expected model, runs scripts/benchmark_kaiju_serving.py against it,
# and (unless told otherwise) switches back to SGLang when the script exits.
#
# Environment overrides (all optional):
#   KAIJU_VLLM_HOST            host used to build the endpoint URL
#                              (default 100.109.109.14)
#   KAIJU_VLLM_PORT            port used to build the endpoint URL (default 18084)
#   KAIJU_VLLM_MODEL_NAME      model id expected in /models and passed to the
#                              benchmark (default kaiju-coder-7)
#   KAIJU_VLLM_CONTEXT         context length handed to the vLLM start script and
#                              to the benchmark (default 16384)
#   KAIJU_VLLM_READY_TIMEOUT   seconds to wait for the endpoint (default 900)
#   KAIJU_VLLM_KEEP_RUNNING    "1" leaves vLLM up on exit instead of restoring
#                              SGLang (default 0)
#   KAIJU_VLLM_PROMPTS         whitespace-separated prompt names
#                              (default "identity code_patch")
#   KAIJU_VLLM_MAX_TOKENS      --max-tokens for the benchmark (default 128)
#   KAIJU_VLLM_PROMPT_TIMEOUT  --timeout for the benchmark (default 300)
set -euo pipefail

# Repository root: the parent of the directory that contains this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# The host used to be baked into BASE_URL; it is now overridable like every
# other setting, with the previous address as the default.
# NOTE(review): this only changes the address this script probes and
# benchmarks. Whether the start script binds to the same host is not visible
# from this file; confirm before overriding.
HOST="${KAIJU_VLLM_HOST:-100.109.109.14}"
PORT="${KAIJU_VLLM_PORT:-18084}"
MODEL="${KAIJU_VLLM_MODEL_NAME:-kaiju-coder-7}"
CONTEXT="${KAIJU_VLLM_CONTEXT:-16384}"
READY_TIMEOUT="${KAIJU_VLLM_READY_TIMEOUT:-900}"
KEEP_VLLM="${KAIJU_VLLM_KEEP_RUNNING:-0}"
PROMPTS="${KAIJU_VLLM_PROMPTS:-identity code_patch}"
MAX_TOKENS="${KAIJU_VLLM_MAX_TOKENS:-128}"
TIMEOUT="${KAIJU_VLLM_PROMPT_TIMEOUT:-300}"
BASE_URL="http://${HOST}:${PORT}/v1"

# Settings are fixed for the rest of the run. They are declared readonly on a
# separate line so a failure in the ROOT command substitution is not masked.
readonly ROOT HOST PORT MODEL CONTEXT READY_TIMEOUT KEEP_VLLM PROMPTS \
  MAX_TOKENS TIMEOUT BASE_URL
#######################################
# EXIT handler that switches serving back from vLLM to SGLang.
#
# Does nothing when KEEP_VLLM is "1". Otherwise it runs the vLLM stop script
# and then the SGLang start script, passing KAIJU_QWEN36_MERGED_CONTEXT
# (default 32768) to the latter. Both steps are best-effort so that the
# handler never changes the script's exit status, but a failed SGLang start
# is reported on stderr instead of being dropped silently.
#
# Globals:   KEEP_VLLM (read), ROOT (read),
#            KAIJU_QWEN36_MERGED_CONTEXT (read, optional)
# Arguments: none
# Outputs:   a warning on stderr if the SGLang start script fails
# Returns:   0 always
#######################################
restore_sglang() {
  if [[ "${KEEP_VLLM}" == "1" ]]; then
    return 0
  fi

  # A failing stop script is tolerated here; its failure is intentionally
  # ignored and its output discarded.
  "${ROOT}/scripts/stop-qwen36-merged-vllm.sh" >/dev/null 2>&1 || true

  if ! KAIJU_QWEN36_MERGED_CONTEXT="${KAIJU_QWEN36_MERGED_CONTEXT:-32768}" \
    "${ROOT}/scripts/start-qwen36-merged-sglang.sh" >/dev/null 2>&1; then
    # `|| true` keeps the handler from failing if stderr itself is unusable.
    printf 'warning: failed to restart SGLang via %s\n' \
      "${ROOT}/scripts/start-qwen36-merged-sglang.sh" >&2 || true
  fi
  return 0
}
# Register the restore handler before touching any service so that every exit
# path from here on (success, failure, or an abort triggered by `set -e`)
# runs restore_sglang.
trap restore_sglang EXIT
# Run both stop scripts before starting vLLM. Unlike the best-effort calls in
# restore_sglang, these are not guarded: under `set -e` a non-zero exit from
# either one aborts the run.
"${ROOT}/scripts/stop-qwen36-merged-sglang.sh"
"${ROOT}/scripts/stop-qwen36-merged-vllm.sh"
# Start vLLM, handing it the requested context length through the environment.
KAIJU_VLLM_CONTEXT="${CONTEXT}" "${ROOT}/scripts/start-qwen36-merged-vllm.sh"
# Poll ${BASE_URL}/models every 10 seconds until the response contains the
# model name in double quotes, giving up with exit status 1 once READY_TIMEOUT
# seconds have elapsed.
deadline=$((SECONDS + READY_TIMEOUT))
while true; do
  # The body is captured first instead of piping curl into `grep -q`:
  #  * under `pipefail`, grep exiting on its first match can make curl fail
  #    with a write error, turning a successful probe into a failed one;
  #  * the model name is compared literally rather than as a regex.
  # --max-time bounds each probe so a hung connection cannot outlive the
  # deadline check. A failed request simply yields an empty body.
  models_response="$(curl -fsSL --max-time 30 "${BASE_URL}/models" || true)"
  if [[ "${models_response}" == *"\"${MODEL}\""* ]]; then
    break
  fi
  if (( SECONDS >= deadline )); then
    echo "vLLM endpoint did not become ready at ${BASE_URL}" >&2
    exit 1
  fi
  sleep 10
done
#######################################
# Run the serving benchmark against the endpoint.
#
# PROMPTS is a whitespace-separated list that must reach the benchmark as
# separate arguments. It is split into an array explicitly so that the words
# are NOT subject to pathname expansion (an unquoted ${PROMPTS} would expand
# a value such as `*` against the current directory).
#
# Globals:   ROOT, BASE_URL, MODEL, CONTEXT, PROMPTS, MAX_TOKENS, TIMEOUT (read)
# Arguments: none
# Returns:   the exit status of the benchmark process
#######################################
run_benchmark() {
  local -a prompt_names=()
  # -d '' reads the whole value (including any embedded newlines) and always
  # reports EOF, hence the `|| true`.
  IFS=$' \t\n' read -r -d '' -a prompt_names <<< "${PROMPTS}" || true

  # The ${arr[@]+...} form keeps an empty list from tripping `set -u` on
  # older bash releases.
  python3 "${ROOT}/scripts/benchmark_kaiju_serving.py" \
    --base-url "${BASE_URL}" \
    --model "${MODEL}" \
    --contexts "${CONTEXT}" \
    --prompts ${prompt_names[@]+"${prompt_names[@]}"} \
    --max-tokens "${MAX_TOKENS}" \
    --timeout "${TIMEOUT}"
}

run_benchmark