File size: 3,881 Bytes
d344201
7197abd
d344201
 
 
 
 
 
 
7197abd
 
d344201
 
 
 
 
7197abd
d344201
 
32d9533
d344201
 
 
32d9533
 
 
 
 
d344201
 
 
 
 
 
32d9533
 
d344201
 
 
124302d
 
 
 
 
 
d344201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32d9533
d344201
 
 
 
 
 
 
32d9533
d344201
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env bash
# Thanatos-27B — tok/s benchmark via Ollama.
#
# Reads timing from Ollama's /api/chat response metadata (eval_count and
# eval_duration are authoritative — no client-side stopwatch noise) and
# averages over a handful of prompts that vary in output length so the
# number generalises a bit beyond a single shape.
#
# Usage:
#   ./scripts/bench.sh                       # uses MODEL=thanatos-27b
#   MODEL=thanatos-27b ./scripts/bench.sh
#   HOST=http://localhost:11434 ./scripts/bench.sh
#
# Requires: curl, jq, a running Ollama daemon with the model created.
# Strict mode: abort on a failing command, on expansion of an unset
# variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Caller-overridable settings. Each default is applied only when the
# variable is unset or empty in the environment.
: "${MODEL:=thanatos-27b}"
: "${HOST:=http://localhost:11434}"

# Colour helpers. Each prints all of its arguments (joined with spaces) as
# a single line wrapped in an ANSI colour sequence followed by a reset.
# red writes to stderr; green and blue write to stdout.
#
# _paint <sgr-code> [text...] does the actual formatting. The text is
# passed as a printf argument, never as the format, so '%' is safe.
_paint() {
    local sgr="$1"
    shift
    printf '\033[%sm%s\033[0m\n' "$sgr" "$*"
}
red()   { _paint 31 "$@" >&2; }
green() { _paint 32 "$@"; }
blue()  { _paint 34 "$@"; }

# tok_per_s <eval_count> <eval_duration_ns> -> "X.YZ" (always 2 dp, floored).
#
# Arguments:
#   $1 - number of generated tokens (non-negative decimal integer)
#   $2 - generation time in nanoseconds (positive decimal integer)
# Outputs: tokens per second on stdout, e.g. "12.30".
# Returns: 0 on success; 1 (with a message on stderr) when an argument is
#          not a plain decimal integer or the duration is zero.
#
# The computation is done in integer arithmetic on purpose. A floating
# point pipeline mis-floors exact values: 29 tokens in 100 s is 0.29, but
# 0.29 * 100 evaluates to 28.999999999999996, which floors to 0.28.
# Integer math is exact as long as count * 1e9 fits in a signed 64-bit
# integer (about 9.2e9 tokens), far beyond what a benchmark run produces.
tok_per_s() {
    local count="${1-}" ns="${2-}"

    if [[ ! "$count" =~ ^[0-9]+$ || ! "$ns" =~ ^[0-9]+$ ]]; then
        printf '[!] tok_per_s: expected two non-negative integers, got "%s" "%s"\n' \
            "$count" "$ns" >&2
        return 1
    fi

    # 10# forces base 10 so a leading zero is not read as octal.
    count=$(( 10#$count ))
    ns=$(( 10#$ns ))

    if (( ns == 0 )); then
        printf '[!] tok_per_s: duration must be greater than zero\n' >&2
        return 1
    fi

    # tokens/s = count / (ns / 1e9) = count * 1e9 / ns.
    local scaled=$(( count * 1000000000 ))
    local whole=$(( scaled / ns ))
    # The remainder is < ns, so scaling it by 100 yields the two floored
    # fractional digits without losing precision.
    local frac=$(( (scaled % ns) * 100 / ns ))

    printf '%d.%02d\n' "$whole" "$frac"
}

# Fail fast when a required external tool is not on PATH. The loop stops
# at the first tool that cannot be found and exits with status 1.
required_tools=(curl jq)
for tool in "${required_tools[@]}"; do
    command -v "$tool" >/dev/null 2>&1 && continue
    red "[!] missing dependency: $tool"
    exit 1
done

# Preflight. One /api/tags request serves both checks: if the request
# itself fails the daemon is treated as unreachable, and its body is then
# searched for the requested model.
TAGS="$(curl -fsS "${HOST}/api/tags")" || {
    red "[!] Ollama not reachable at ${HOST}"
    exit 1
}

# Model lookup lower-cases both sides before comparing. Ollama 0.24's tag
# list keeps the case of the `general.name` it inferred at create time,
# which may differ from what the user passed to `ollama create` or typed
# into `ollama run`; `ollama show` resolves either spelling to the same
# model, so this check does as well. The comparison is a prefix match on
# the tag name, so MODEL does not need to include a ":tag" suffix.
model_filter='.models[] | select(.name | ascii_downcase | startswith($m | ascii_downcase))'
jq -e --arg m "${MODEL}" "${model_filter}" >/dev/null <<<"${TAGS}" || {
    red "[!] Model '${MODEL}' not found. Build it first: ./scripts/build.sh"
    exit 1
}

# Benchmark prompts, ordered from short to long expected output. Using
# several output lengths keeps the average from being dominated by
# whatever the model happens to do for one prompt shape.
PROMPTS=()
PROMPTS+=("Reply with only the word OK.")
PROMPTS+=("Explain the time complexity of mergesort in one short paragraph.")
PROMPTS+=("Write a 120-word explanation of what a Bloom filter is and when to use it.")

# Run header: show the effective settings before any chat request is sent.
blue "[*] host:  ${HOST}"
blue "[*] model: ${MODEL}"
blue "[*] prompts: ${#PROMPTS[@]}"
printf '\n'

# Warmup request. The first call pays the model-load cost, which must not
# leak into the measured average, so the response body is thrown away.
blue "[*] warmup..."
curl --fail --silent --show-error \
    --header 'Content-Type: application/json' \
    --data "$(jq --null-input --arg m "${MODEL}" '{
        model: $m,
        messages: [{role:"user", content:"warmup"}],
        stream: false
    }')" \
    "${HOST}/api/chat" >/dev/null

# Benchmark pass: send every prompt once (non-streaming), print one table
# row per prompt, then an aggregate line.
#
# Timing is taken from the eval_count / eval_duration fields of the
# /api/chat response rather than measured client-side. The aggregate is
# total tokens over total eval time, i.e. weighted by output length, not a
# mean of the per-prompt rates.
TOTAL_TOKENS=0
TOTAL_NS=0

printf "%-4s  %8s  %12s  %8s\n" "#" "tokens" "eval_ms" "tok/s"
printf "%-4s  %8s  %12s  %8s\n" "----" "--------" "------------" "--------"

for i in "${!PROMPTS[@]}"; do
    prompt="${PROMPTS[$i]}"
    # jq builds the request body so the prompt is JSON-escaped correctly.
    resp="$(curl -fsS "${HOST}/api/chat" \
        -H 'Content-Type: application/json' \
        -d "$(jq -n --arg m "${MODEL}" --arg p "$prompt" '{
            model: $m,
            messages: [{role:"user", content:$p}],
            stream: false
        }')")"

    # `// 0` maps a missing or null field to 0 so the guard below catches it.
    eval_count="$(jq -r '.eval_count // 0' <<<"$resp")"
    eval_ns="$(jq -r '.eval_duration // 0' <<<"$resp")"

    if [[ "$eval_count" -eq 0 || "$eval_ns" -eq 0 ]]; then
        red "[!] prompt $((i+1)) returned no timing data"
        # Show the first lines of the reply (or of the raw response) as a
        # diagnostic, on stderr next to the error line so stdout carries
        # only the results table. `|| true` is deliberate: if head closes
        # the pipe early, pipefail + errexit would otherwise abort here
        # with the pipeline's status instead of reaching `exit 1`.
        { jq -r '.message.content // .' <<<"$resp" | head -n 3; } >&2 || true
        exit 1
    fi

    eval_ms=$(( eval_ns / 1000000 ))
    toks_per_s="$(tok_per_s "$eval_count" "$eval_ns")"
    printf "%-4s  %8s  %12s  %8s\n" "$((i+1))" "$eval_count" "$eval_ms" "$toks_per_s"

    TOTAL_TOKENS=$(( TOTAL_TOKENS + eval_count ))
    TOTAL_NS=$(( TOTAL_NS + eval_ns ))
done

echo
avg="$(tok_per_s "$TOTAL_TOKENS" "$TOTAL_NS")"
green "[+] aggregate: ${TOTAL_TOKENS} tokens / $(( TOTAL_NS / 1000000 )) ms = ${avg} tok/s"