File size: 6,384 Bytes
2d05890
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env bash
# Mac M-series PEFT PROBE. Short bench (15 cases, stratified across all 11
# MetroLLM-Bench categories) for cross-Mac comparison of TTFT + tok/s + RAM
# without paying the 156-case wallclock.
#
# Captures the same telemetry shape as run_bench.sh, just with N small enough
# that running on M2 Air / M4 Pro / M2 Max each takes 15-30 min.
#
# Run:
#   bash scripts/mac_bench/run_probe.sh 2b           # 15 stratified MARTA cases
#   bash scripts/mac_bench/run_probe.sh 4b --ctx 16384
#
# Output: results/mac_bench/<chip>-<ram>gb-<size>-probe/

set -u

# Resolve the script's absolute location BEFORE changing directory: "$0" may be
# a relative path, which stops resolving once the working directory moves
# (this is what --help needs further down).
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" >/dev/null 2>&1 && pwd)" || exit 1
SCRIPT_PATH="$SCRIPT_DIR/$(basename -- "$0")"
# Everything below uses paths relative to the directory two levels above the
# script.
cd -- "$SCRIPT_DIR/../.." || exit 1

# 15 case IDs covering all 11 categories (A-K) of the MARTA case set.
# One case per category, plus extras for A (2), C (2) and K (3); C/K are
# considered the most diagnostic.
PROBE_CASE_IDS="MARTA-A-001,MARTA-A-005,MARTA-B-001,MARTA-C-001,MARTA-C-005,MARTA-D-001,MARTA-E-001,MARTA-F-001,MARTA-G-001,MARTA-H-001,MARTA-I-001,MARTA-J-001,MARTA-K-001,MARTA-K-002,MARTA-K-003"

# Print the one-line synopsis to stderr.
usage() {
  echo "Usage: $0 {2b|4b|9b|27b} [--ctx N]" >&2
}

# Print the header comment block of this script (the lines after the shebang,
# up to the first non-comment line) with the leading "# " stripped.
print_help() {
  awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$SCRIPT_PATH"
}

# ---- arg parse ----
# Accepts one positional model size plus an optional context size given as
# "--ctx N" or "--ctx=N". If several positionals are given, the last one wins.
SIZE=""
CTX=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --ctx)
      # Check explicitly: under `set -u` a missing value would otherwise abort
      # with a bare "unbound variable" error.
      if [[ $# -lt 2 ]]; then
        echo "--ctx requires a value" >&2
        usage
        exit 2
      fi
      CTX="$2"
      shift 2
      ;;
    --ctx=*)
      CTX="${1#--ctx=}"
      shift
      ;;
    -h|--help)
      print_help
      exit 0
      ;;
    -*)
      echo "Unknown option: $1" >&2
      usage
      exit 2
      ;;
    *)
      SIZE="$1"
      shift
      ;;
  esac
done

if [[ -z "$SIZE" ]]; then
  usage
  exit 2
fi
case "$SIZE" in
  2b|4b|9b|27b) ;;
  *)
    echo "Bad size" >&2
    usage
    exit 2
    ;;
esac
# Upper-cased size (e.g. 2b -> 2B); tr is used rather than ${SIZE^^} so the
# script does not depend on bash 4+.
SIZE_UP=$(printf '%s' "$SIZE" | tr '[:lower:]' '[:upper:]')

# Default context size per model when --ctx was not given (or was empty);
# otherwise require a plain non-negative integer.
if [[ -z "$CTX" ]]; then
  case "$SIZE" in
    2b)         CTX=32768 ;;
    4b|9b|27b)  CTX=16384 ;;
  esac
elif ! [[ "$CTX" =~ ^[0-9]+$ ]]; then
  echo "--ctx must be a non-negative integer, got: $CTX" >&2
  exit 2
fi

# ---- model artifact: Hugging Face repo id and local cache location ----
LOCAL_GGUF_DIR="data/mac_models"
GGUF_NAME="Qwen3.5-${SIZE_UP}-metro-v23-Q4_K_M.gguf"
REPO_ID="continker/Qwen3.5-${SIZE_UP}-metro-v23"
LOCAL_GGUF="${LOCAL_GGUF_DIR}/${GGUF_NAME}"

# ---- host identification (used to name the output directory) ----
# CPU brand string with the first "Apple " removed and spaces turned into "-".
CHIP=$(sysctl -n machdep.cpu.brand_string | sed -e 's/Apple //' -e 's/ /-/g')
# hw.memsize divided by 1024^3 and rounded to a whole number.
RAM_GB=$(sysctl -n hw.memsize | awk '{ printf "%.0f", $1 / (1024 * 1024 * 1024) }')

# ---- per-run output directory ----
RUN_TAG="${CHIP}-${RAM_GB}gb-${SIZE}-probe"
OUT_DIR="results/mac_bench/${RUN_TAG}"
mkdir -p "$OUT_DIR"

# ---- local service ports ----
LLAMA_PORT=8081
MOCK_PORT=8102

# ---- artifacts written into the output directory ----
LLAMA_LOG="${OUT_DIR}/llama_server.log"
RSS_LOG="${OUT_DIR}/llama_rss.log"
MOCK_LOG="${OUT_DIR}/mock_server.log"
RAW_RESULTS="${OUT_DIR}/marta_raw.json"
SCORED_RESULTS="${OUT_DIR}/marta_scored.json"
TELEMETRY_JSON="${OUT_DIR}/telemetry.json"

# Write all arguments, joined by spaces, to stdout as a single line prefixed
# with the current local time in "[YYYY-MM-DD HH:MM:SS]" form.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*"
}

# require_tool TOOL BREW_PKG
#   Exit with status 1 (after logging the Homebrew install hint) when TOOL is
#   not found on PATH.
require_tool() {
  local tool=$1 brew_pkg=$2
  if ! command -v "$tool" >/dev/null 2>&1; then
    log "ERROR: brew install $brew_pkg"
    exit 1
  fi
}

require_tool llama-server llama.cpp
require_tool uv uv

# ---- download GGUF ----
# Fetch the quantized model into the local cache directory unless a file with
# the expected name is already there.
mkdir -p "$LOCAL_GGUF_DIR"
if [[ ! -f "$LOCAL_GGUF" ]]; then
  log "Downloading $REPO_ID/$GGUF_NAME"
  # Unquoted heredoc: the shell substitutes the three variables into the
  # Python source before it is fed to the interpreter on stdin.
  uv run --with huggingface_hub python - <<PY
import os
from huggingface_hub import hf_hub_download

os.makedirs("$LOCAL_GGUF_DIR", exist_ok=True)
hf_hub_download(
    repo_id="$REPO_ID",
    filename="$GGUF_NAME",
    local_dir="$LOCAL_GGUF_DIR",
)
PY
else
  gguf_size=$(du -h "$LOCAL_GGUF" | awk '{print $1}')
  log "GGUF cached: $LOCAL_GGUF ($gguf_size)"
fi

# The download's own exit status is not inspected; the presence of the file is
# the success criterion.
if [[ ! -f "$LOCAL_GGUF" ]]; then
  log "ERROR: download failed"
  exit 1
fi

# ---- kill stale processes ----
# kill_port PORT
#   Stop every process that lsof reports as LISTENING on TCP port PORT.
#   Sends SIGTERM first, waits one second, then sends SIGKILL only to the
#   processes that are still alive. Restricting the lookup to listeners keeps
#   processes that merely hold a client connection on that port number from
#   being killed.
#   Returns 0 whether or not anything had to be killed; lsof failures
#   (including lsof being unavailable) are treated as "nothing to kill".
kill_port() {
  local port=${1:?kill_port: port required}
  local pids pid survivors=""

  pids=$(lsof -t -n -P -iTCP:"$port" -sTCP:LISTEN 2>/dev/null || true)
  if [[ -z "$pids" ]]; then
    return 0
  fi

  # $pids is deliberately unquoted below: it is a whitespace-separated PID list.
  # shellcheck disable=SC2086
  kill $pids 2>/dev/null || true
  sleep 1

  for pid in $pids; do
    if kill -0 "$pid" 2>/dev/null; then
      survivors="$survivors $pid"
    fi
  done
  if [[ -n "$survivors" ]]; then
    # shellcheck disable=SC2086
    kill -9 $survivors 2>/dev/null || true
    sleep 1
  fi
  return 0
}
# Free both service ports (llama-server first, then the mock server) before
# anything is started.
for stale_port in "$LLAMA_PORT" "$MOCK_PORT"; do
  kill_port "$stale_port"
done

# ---- cleanup on any exit ----
# Background jobs started from a non-interactive shell ignore SIGINT, so
# without this trap a Ctrl-C (or any early exit) would leave llama-server and
# the helpers running. The PIDs are read with a default because the trap can
# fire before some of them have been assigned.
cleanup_servers() {
  local pid
  for pid in "${MOCK_PID:-}" "${RSS_PID:-}" "${LLAMA_PID:-}"; do
    if [[ -n "$pid" ]]; then
      kill "$pid" 2>/dev/null || true
    fi
  done
}
trap cleanup_servers EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Upper bound, in seconds, on each readiness wait below. Override with the
# PROBE_READY_TIMEOUT environment variable; non-numeric values fall back to 600.
READY_TIMEOUT="${PROBE_READY_TIMEOUT:-600}"
[[ "$READY_TIMEOUT" =~ ^[0-9]+$ ]] || READY_TIMEOUT=600

# wait_for_health NAME URL PID LOGFILE INTERVAL TIMEOUT
#   Poll URL with curl every INTERVAL seconds until it answers successfully.
#   Returns 1, after logging an error and the last 30 lines of LOGFILE, when
#   process PID has exited or TIMEOUT seconds have elapsed. Returns 0 once the
#   endpoint is healthy.
wait_for_health() {
  local name=$1 url=$2 pid=$3 logfile=$4 interval=$5 timeout=$6
  local deadline=$((SECONDS + timeout))
  until curl -sf "$url" >/dev/null 2>&1; do
    if ! kill -0 "$pid" 2>/dev/null; then
      log "ERROR: $name died"
      tail -30 "$logfile"
      return 1
    fi
    if (( SECONDS >= deadline )); then
      log "ERROR: $name not healthy after ${timeout}s"
      tail -30 "$logfile"
      return 1
    fi
    sleep "$interval"
  done
}

# ---- start llama-server ----
log "Starting llama-server :$LLAMA_PORT (Metal, parallel=1, ctx=$CTX)"
llama-server --model "$LOCAL_GGUF" --port "$LLAMA_PORT" --n-gpu-layers 999 \
  --ctx-size "$CTX" --parallel 1 --flash-attn on --no-mmap \
  --alias "${SIZE}-metro-v23" > "$LLAMA_LOG" 2>&1 &
LLAMA_PID=$!
wait_for_health "llama-server" "http://localhost:${LLAMA_PORT}/health" \
  "$LLAMA_PID" "$LLAMA_LOG" 2 "$READY_TIMEOUT" || exit 1
log "llama-server ready (PID=$LLAMA_PID)"

# ---- RSS sampler ----
# Once per second, append "<epoch-seconds> <rss>" (rss as printed by
# `ps -o rss=`) to the RSS log for as long as llama-server is alive.
( while kill -0 "$LLAMA_PID" 2>/dev/null; do
    rss=$(ps -o rss= -p "$LLAMA_PID" 2>/dev/null | tr -d ' ')
    [[ -n "$rss" ]] && echo "$(date +%s) $rss"
    sleep 1
  done ) > "$RSS_LOG" 2>&1 &
RSS_PID=$!

# ---- start mock_server ----
uv run python -m harness.mock_server --system marta --port "$MOCK_PORT" > "$MOCK_LOG" 2>&1 &
MOCK_PID=$!
# On failure the EXIT trap stops llama-server and the RSS sampler.
wait_for_health "mock" "http://localhost:${MOCK_PORT}/health" \
  "$MOCK_PID" "$MOCK_LOG" 1 "$READY_TIMEOUT" || exit 1

# ---- run probe ----
# Count the IDs instead of hard-coding the number so the log line stays
# correct if PROBE_CASE_IDS is edited.
IFS=',' read -r -a probe_ids <<< "$PROBE_CASE_IDS"
log "Running probe (${#probe_ids[@]} stratified cases): $PROBE_CASE_IDS"

RUN_START=$(date +%s)
# Full runner output goes to runner.log; only its last 5 lines reach the
# terminal.
uv run python -m harness.runner \
  --cases cases/marta_cases.json --system marta \
  --llm-url "http://localhost:${LLAMA_PORT}/v1" --llm-key sk-mac-bench \
  --llm-model "${SIZE}-metro-v23" \
  --case-ids "$PROBE_CASE_IDS" \
  --thinking --parallel 1 \
  --mock-url "http://localhost:${MOCK_PORT}" \
  --output "$RAW_RESULTS" 2>&1 | tee "$OUT_DIR/runner.log" | tail -5
# Must be read immediately: the pipeline's own status is tail's, which would
# hide a runner failure.
RUNNER_RC=${PIPESTATUS[0]}
RUN_END=$(date +%s)
log "Runner wallclock: $((RUN_END - RUN_START))s"
if (( RUNNER_RC != 0 )); then
  # Report only; the following steps still run on whatever was produced.
  log "WARNING: runner exited with status $RUNNER_RC (see $OUT_DIR/runner.log)"
fi

# ---- shutdown ----
# Ask the mock server and llama-server to terminate; failures (e.g. the
# process is already gone) are ignored on purpose.
for svc_pid in "$MOCK_PID" "$LLAMA_PID"; do
  kill "$svc_pid" 2>/dev/null || true
done

# Give llama-server two seconds, then force-kill it if it is still around.
sleep 2
kill -9 "$LLAMA_PID" 2>/dev/null || true

# Stop the RSS sampler and reap all background children.
kill "$RSS_PID" 2>/dev/null || true
wait 2>/dev/null || true

# ---- score (judge always on) ----
# Scoring is skipped when the runner produced no raw results file. Only the
# last 3 lines of the scorer's combined output are shown.
if [[ -f "$RAW_RESULTS" ]]; then
  uv run python -m harness.scorer \
    --system marta \
    --results "$RAW_RESULTS" \
    --output "$SCORED_RESULTS" 2>&1 | tail -3
fi

# ---- telemetry ----
# Hand the logs, result files and host/model metadata to the telemetry parser,
# which writes the combined report to $TELEMETRY_JSON.
runner_wallclock_s=$((RUN_END - RUN_START))
telemetry_args=(
  --llama-log "$LLAMA_LOG"
  --rss-log "$RSS_LOG"
  --raw-results "$RAW_RESULTS"
  --scored-results "$SCORED_RESULTS"
  --runner-wallclock "$runner_wallclock_s"
  --chip "$CHIP"
  --ram-gb "$RAM_GB"
  --size "$SIZE"
  --ctx-size "$CTX"
  --output "$TELEMETRY_JSON"
)
uv run python scripts/mac_bench/parse_telemetry.py "${telemetry_args[@]}"

log "Done. Output: $OUT_DIR"

# ---- summary ----
# Print a short human-readable digest of the telemetry file. Fail with a clear
# message instead of a Python traceback when the file was never written.
if [[ ! -f "$TELEMETRY_JSON" ]]; then
  log "ERROR: telemetry file not found: $TELEMETRY_JSON"
  exit 1
fi

# The path is passed as an argument (and the heredoc is quoted) so that no
# shell value is ever spliced into the Python source.
# The 'hardware', 'model' and 'perf' keys are required; the two 'eval' fields
# fall back to 'n/a' when absent.
uv run python - "$TELEMETRY_JSON" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    t = json.load(fh)

print(f"  chip:           {t['hardware']['chip']} ({t['hardware']['ram_gb']} GB)")
print(f"  model:          {t['model']['size']} ctx={t['model']['ctx_size']}")
print(f"  tier1:          {t['eval'].get('tier1_composite', 'n/a')}  (n={t['eval'].get('n_cases', 'n/a')})")
print(f"  decode tok/s:   {t['perf']['decode_tok_s_median']:.1f} median, {t['perf']['decode_tok_s_p10']:.1f} p10")
print(f"  ttft ms:        {t['perf']['ttft_ms_median']:.0f} median")
print(f"  peak rss:       {t['perf']['peak_rss_gb']:.2f} GB")
print(f"  wallclock:      {t['perf']['runner_wallclock_s']}s")
PY