#!/usr/bin/env bash
# Thanatos-27B — tok/s benchmark via Ollama.
#
# Reads timing from Ollama's /api/chat response metadata (eval_count and
# eval_duration are authoritative — no client-side stopwatch noise) and
# averages over a handful of prompts that vary in output length so the
# number generalises a bit beyond a single shape.
#
# Usage:
#   ./scripts/bench.sh                              # uses MODEL=thanatos-27b
#   MODEL=thanatos-27b ./scripts/bench.sh
#   HOST=http://localhost:11434 ./scripts/bench.sh
#
# Requires: curl, jq, a running Ollama daemon with the model created.

set -euo pipefail

MODEL="${MODEL:-thanatos-27b}"
HOST="${HOST:-http://localhost:11434}"

# Mix of short / medium / long output lengths — a single shape would skew
# the average toward whatever the model decides to do for that prompt.
PROMPTS=(
  "Reply with only the word OK."
  "Explain the time complexity of mergesort in one short paragraph."
  "Write a 120-word explanation of what a Bloom filter is and when to use it."
)

# Coloured status lines. Errors (red) go to stderr; the rest go to stdout.
red()   { printf "\033[31m%s\033[0m\n" "$*" >&2; }
green() { printf "\033[32m%s\033[0m\n" "$*"; }
blue()  { printf "\033[34m%s\033[0m\n" "$*"; }

#######################################
# tok_per_s -> "X.YZ" (always 2 dp, floored).
# Uses integer arithmetic only, so the floor is exact (no binary
# floating-point artefacts such as 1.15 being reported as 1.14).
# Arguments: $1 - token count (integer)
#            $2 - duration in nanoseconds (integer, > 0)
# Outputs:   tokens per second on stdout
# Returns:   1 if the duration is not positive
#######################################
tok_per_s() {
  local count=$1 ns=$2
  if (( ns <= 0 )); then
    red "[!] tok_per_s: duration must be positive (got ${ns})"
    return 1
  fi
  # tokens * 1e9 / ns == tokens per second; the remainder yields the
  # two fractional digits.
  local scaled=$(( count * 1000000000 ))
  printf '%d.%02d\n' "$(( scaled / ns ))" "$(( scaled % ns * 100 / ns ))"
}

#######################################
# Check whether a model is listed in an /api/tags response.
# The comparison is case-insensitive. A name given without a tag matches
# "<name>:<any tag>"; a name given with a tag must match in full. A mere
# prefix (e.g. "thanatos" vs "thanatos-27b:latest") does NOT match.
# Arguments: $1 - JSON body returned by /api/tags
#            $2 - model name to look for
# Returns:   0 if present, non-zero otherwise
#######################################
model_present() {
  jq -e --arg m "$2" '
    ($m | ascii_downcase) as $want
    | any(.models[]?;
        ((.name // "") | ascii_downcase) as $have
        | ($have == $want)
          or (($want | contains(":") | not)
              and ($have | startswith($want + ":"))))
  ' >/dev/null <<<"$1"
}

#######################################
# Send one non-streaming user message to /api/chat.
# Globals:   HOST, MODEL (read)
# Arguments: $1 - prompt text
# Outputs:   raw JSON response on stdout
# Returns:   non-zero if building the payload or the HTTP request fails
#######################################
chat() {
  local payload
  payload="$(jq -n --arg m "${MODEL}" --arg p "$1" '{
    model: $m,
    messages: [{role: "user", content: $p}],
    stream: false
  }')" || return
  curl -fsS "${HOST}/api/chat" \
    -H 'Content-Type: application/json' \
    -d "${payload}"
}

#######################################
# Run the benchmark: preflight checks, warmup, one request per prompt,
# then the aggregate throughput.
# Globals:   HOST, MODEL, PROMPTS (read)
#######################################
main() {
  local dep
  for dep in curl jq; do
    if ! command -v "$dep" >/dev/null 2>&1; then
      red "[!] missing dependency: $dep"
      exit 1
    fi
  done

  # Single /api/tags fetch covers both the reachability and model checks.
  local tags
  if ! tags="$(curl -fsS "${HOST}/api/tags")"; then
    red "[!] Ollama not reachable at ${HOST}"
    exit 1
  fi

  # Match case-insensitively: Ollama 0.24's API tag list preserves the
  # case of whatever `general.name` it inferred at create time, which
  # can differ from the case the user passed to `ollama create` / typed
  # into `ollama run`. `ollama show` resolves either casing to the same
  # model, so the bench check should too.
  if ! model_present "${tags}" "${MODEL}"; then
    red "[!] Model '${MODEL}' not found. Build it first: ./scripts/build.sh"
    exit 1
  fi

  blue "[*] host: ${HOST}"
  blue "[*] model: ${MODEL}"
  blue "[*] prompts: ${#PROMPTS[@]}"
  echo

  # Warmup — the first call pays the model-load cost; we don't want that
  # in the average. The result is discarded.
  blue "[*] warmup..."
  if ! chat "warmup" >/dev/null; then
    red "[!] warmup request to ${HOST}/api/chat failed"
    exit 1
  fi

  local total_tokens=0 total_ns=0
  local i resp eval_count eval_ns rate

  printf "%-4s %8s %12s %8s\n" "#" "tokens" "eval_ms" "tok/s"
  printf "%-4s %8s %12s %8s\n" "----" "--------" "------------" "--------"

  for i in "${!PROMPTS[@]}"; do
    if ! resp="$(chat "${PROMPTS[$i]}")"; then
      red "[!] prompt $((i+1)) request failed"
      exit 1
    fi

    eval_count="$(jq -r '.eval_count // 0' <<<"$resp")"
    eval_ns="$(jq -r '.eval_duration // 0' <<<"$resp")"

    if (( eval_count <= 0 || eval_ns <= 0 )); then
      red "[!] prompt $((i+1)) returned no timing data"
      # Best-effort peek at the response for debugging; never let a jq or
      # SIGPIPE failure here pre-empt the exit below.
      jq -r '.message.content // .' <<<"$resp" | head -3 >&2 || true
      exit 1
    fi

    rate="$(tok_per_s "$eval_count" "$eval_ns")"
    printf "%-4s %8s %12s %8s\n" \
      "$((i+1))" "$eval_count" "$(( eval_ns / 1000000 ))" "$rate"

    total_tokens=$(( total_tokens + eval_count ))
    total_ns=$(( total_ns + eval_ns ))
  done

  echo
  local avg
  avg="$(tok_per_s "$total_tokens" "$total_ns")"
  green "[+] aggregate: ${total_tokens} tokens / $(( total_ns / 1000000 )) ms = ${avg} tok/s"
}

main "$@"