File size: 6,384 Bytes
#!/usr/bin/env bash
# Mac M-series PEFT PROBE. Short bench (~15 cases, stratified across all 11
# MetroLLM-Bench categories) for cross-Mac comparison of TTFT + tok/s + RAM
# without paying the 156-case wallclock.
#
# Captures the same telemetry shape as run_bench.sh, just with N small enough
# that running on M2 Air / M4 Pro / M2 Max each takes 15-30 min.
#
# Run:
# bash scripts/mac_bench/run_probe.sh 2b # 15 stratified MARTA cases
# bash scripts/mac_bench/run_probe.sh 4b --ctx 16384
#
# Output: results/mac_bench/<chip>-<ram>gb-<size>-probe/
# Treat any reference to an unset variable as a fatal error.
set -u

# Run from two directories above this script (the repository root when the
# script lives at scripts/mac_bench/, as in the usage examples), so the
# relative paths used further down resolve regardless of the caller's cwd.
script_dir=$(dirname "$0")
cd "${script_dir}/../.." || exit 1

# Fifteen MARTA case IDs spanning the eleven categories A-K: two each for
# A and C, three for K, and one for every other category.
probe_cases=(
  MARTA-A-001 MARTA-A-005
  MARTA-B-001
  MARTA-C-001 MARTA-C-005
  MARTA-D-001
  MARTA-E-001
  MARTA-F-001
  MARTA-G-001
  MARTA-H-001
  MARTA-I-001
  MARTA-J-001
  MARTA-K-001 MARTA-K-002 MARTA-K-003
)
# Join with commas inside a subshell so the caller's IFS is left untouched.
PROBE_CASE_IDS=$(IFS=,; printf '%s' "${probe_cases[*]}")
# ---- arg parse ----
# Accepted command line: {2b|4b|9b|27b} [--ctx N | --ctx=N] [-h|--help]
# Sets SIZE (model size as typed), SIZE_UP (upper-cased SIZE) and CTX
# (context size; defaults per model size when --ctx is not given).

# Print an optional error line followed by the usage line on stderr, then
# exit with status 2.
usage_error() {
  if [[ -n "${1:-}" ]]; then
    echo "$1" >&2
  fi
  echo "Usage: $0 {2b|4b|9b|27b} [--ctx N]" >&2
  exit 2
}

# Print the comment block at the top of this script with the comment
# markers stripped.
print_help() {
  local self=$0
  # The script changes directory before parsing arguments, so a relative $0
  # has to be resolved against the directory the caller started in.
  if [[ "$self" != /* && -n "${OLDPWD:-}" && -r "$OLDPWD/$self" ]]; then
    self="$OLDPWD/$self"
  fi
  # Skip the shebang, print comment lines, stop at the first line after the
  # header that is not a comment (so section markers further down are not
  # mistaken for help text).
  awk '
    /^#!/ { next }
    /^#/  { sub(/^# ?/, ""); print; seen = 1; next }
    seen  { exit }
  ' "$self"
}

SIZE=""
CTX=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --ctx)
      # Without this check a trailing bare --ctx dies with an
      # "unbound variable" error on $2 because of `set -u`.
      [[ $# -ge 2 ]] || usage_error "ERROR: --ctx requires a value"
      CTX="$2"
      shift 2
      ;;
    --ctx=*)
      CTX="${1#--ctx=}"
      shift
      ;;
    -h|--help)
      print_help
      exit 0
      ;;
    -*)
      # Previously an unknown flag was stored as SIZE and then silently
      # overwritten by the next positional argument.
      usage_error "ERROR: unknown option: $1"
      ;;
    *)
      [[ -z "$SIZE" ]] || usage_error "ERROR: unexpected extra argument: $1"
      SIZE="$1"
      shift
      ;;
  esac
done

[[ -n "$SIZE" ]] || usage_error
case "$SIZE" in
  2b|4b|9b|27b) ;;
  *) echo "Bad size" >&2; exit 2 ;;
esac
# tr instead of ${SIZE^^} keeps this working on bash 3.2.
SIZE_UP=$(echo "$SIZE" | tr '[:lower:]' '[:upper:]')

# Default context size when --ctx was not given (or was given empty).
if [[ -z "$CTX" ]]; then
  case "$SIZE" in
    2b) CTX=32768 ;;
    4b|9b|27b) CTX=16384 ;;
  esac
fi

# CTX is handed to llama-server as --ctx-size later in the script; reject
# anything that is not a plain non-negative integer up front.
if [[ ! "$CTX" =~ ^[0-9]+$ ]]; then
  usage_error "ERROR: --ctx must be a non-negative integer, got '$CTX'"
fi
# ---- ports ----
LLAMA_PORT=8081
MOCK_PORT=8102

# ---- model artefact: repo id, file name and local cache location ----
REPO_ID="continker/Qwen3.5-${SIZE_UP}-metro-v23"
GGUF_NAME="Qwen3.5-${SIZE_UP}-metro-v23-Q4_K_M.gguf"
LOCAL_GGUF_DIR="data/mac_models"
LOCAL_GGUF="${LOCAL_GGUF_DIR}/${GGUF_NAME}"

# ---- hardware tag ----
# CHIP: the CPU brand string with the first "Apple " removed and every
# space turned into a dash.
chip_brand=$(sysctl -n machdep.cpu.brand_string)
chip_brand=${chip_brand/Apple /}
CHIP=${chip_brand// /-}
# RAM_GB: hw.memsize divided by 1024^3, rounded to an integer by "%.0f".
RAM_GB=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1/1024/1024/1024}')

# ---- output directory and the files written into it ----
RUN_TAG="${CHIP}-${RAM_GB}gb-${SIZE}-probe"
OUT_DIR="results/mac_bench/${RUN_TAG}"
mkdir -p "$OUT_DIR"

LLAMA_LOG="${OUT_DIR}/llama_server.log"
RSS_LOG="${OUT_DIR}/llama_rss.log"
MOCK_LOG="${OUT_DIR}/mock_server.log"
RAW_RESULTS="${OUT_DIR}/marta_raw.json"
SCORED_RESULTS="${OUT_DIR}/marta_scored.json"
TELEMETRY_JSON="${OUT_DIR}/telemetry.json"
# Print a timestamped message.
# Arguments: all arguments, joined by single spaces, form the message.
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*"
}
# ---- required tools ----
# Each entry is "<command>:<Homebrew formula named in the error message>".
# The first missing command aborts the script with status 1.
for requirement in "llama-server:llama.cpp" "uv:uv"; do
  if ! command -v "${requirement%%:*}" >/dev/null 2>&1; then
    log "ERROR: brew install ${requirement#*:}"
    exit 1
  fi
done
# ---- download GGUF ----
# Fetch the model file into $LOCAL_GGUF_DIR unless it is already there, then
# insist that it exists before going any further.
mkdir -p "$LOCAL_GGUF_DIR"
if [[ ! -f "$LOCAL_GGUF" ]]; then
  log "Downloading $REPO_ID/$GGUF_NAME"
  # Unquoted heredoc: the shell substitutes the repo, file name and target
  # directory into the Python source before Python sees it.
  uv run --with huggingface_hub python - <<PY
from huggingface_hub import hf_hub_download
import os

os.makedirs("$LOCAL_GGUF_DIR", exist_ok=True)
hf_hub_download(
    repo_id="$REPO_ID",
    filename="$GGUF_NAME",
    local_dir="$LOCAL_GGUF_DIR",
)
PY
else
  gguf_size=$(du -h "$LOCAL_GGUF" | awk '{print $1}')
  log "GGUF cached: $LOCAL_GGUF ($gguf_size)"
fi

# The download's exit status is not inspected; the file's presence is the
# success criterion.
if [[ ! -f "$LOCAL_GGUF" ]]; then
  log "ERROR: download failed"
  exit 1
fi
# ---- kill stale processes ----
# Free a TCP port by terminating whatever is listening on it.
# Arguments: $1 - TCP port number.
# Outputs:   an error line on stderr when $1 is not a number.
# Returns:   0 when nothing was listening (lsof errors are suppressed and
#            treated the same way) or after the listeners have been
#            signalled; 2 when $1 is not a port number.
kill_port() {
  local port=${1:-}
  local pids pid survivors=""

  if [[ ! "$port" =~ ^[0-9]+$ ]]; then
    echo "kill_port: invalid port '$port'" >&2
    return 2
  fi

  # -sTCP:LISTEN limits the match to listening sockets. A bare `-i :PORT`
  # also reports clients of the port and any process that merely holds a
  # connection to a *remote* port with the same number; those must not be
  # killed.
  pids=$(lsof -nP -t -iTCP:"$port" -sTCP:LISTEN 2>/dev/null || true)
  [[ -n "$pids" ]] || return 0

  # Word splitting is intended here: lsof -t prints one PID per line.
  # shellcheck disable=SC2086
  kill $pids 2>/dev/null || true
  sleep 1

  # Escalate to SIGKILL only for processes that ignored SIGTERM.
  for pid in $pids; do
    if kill -0 "$pid" 2>/dev/null; then
      survivors="$survivors $pid"
    fi
  done
  if [[ -n "$survivors" ]]; then
    # shellcheck disable=SC2086
    kill -9 $survivors 2>/dev/null || true
    sleep 1
  fi
  return 0
}
# Clear anything left over on the two ports from a previous run.
kill_port "$LLAMA_PORT"
kill_port "$MOCK_PORT"

# ---- background-job cleanup ----
# Terminate every background job this shell still has running. Installed as
# an EXIT trap so llama-server and the other helpers started below are not
# left behind when the script exits early or is interrupted. `jobs -pr`
# lists only jobs of this shell that are still running, so after the normal
# shutdown section has reaped them this is a no-op.
cleanup_background_jobs() {
  local running
  running=$(jobs -pr)
  [[ -n "$running" ]] || return 0
  # Word splitting is intended: one PID per line.
  # shellcheck disable=SC2086
  kill $running 2>/dev/null || true
}
trap cleanup_background_jobs EXIT
# Turn Ctrl-C / SIGTERM into a regular exit so the EXIT trap above runs.
trap 'exit 130' INT
trap 'exit 143' TERM

# ---- start llama-server ----
log "Starting llama-server :$LLAMA_PORT (Metal, parallel=1, ctx=$CTX)"
llama-server --model "$LOCAL_GGUF" --port "$LLAMA_PORT" --n-gpu-layers 999 \
  --ctx-size "$CTX" --parallel 1 --flash-attn on --no-mmap \
  --alias "${SIZE}-metro-v23" > "$LLAMA_LOG" 2>&1 &
LLAMA_PID=$!

# Poll /health every 2 s until it answers successfully; give up as soon as
# the server process is gone, showing the tail of its log.
until curl -sf "http://localhost:${LLAMA_PORT}/health" >/dev/null 2>&1; do
  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
    log "ERROR: llama-server died"
    tail -30 "$LLAMA_LOG"
    exit 1
  fi
  sleep 2
done
log "llama-server ready (PID=$LLAMA_PID)"
# ---- RSS sampler ----
# Background subshell: once per second, for as long as llama-server is
# alive, append "<epoch seconds> <rss>" to $RSS_LOG, where <rss> is the
# value `ps -o rss=` reports with spaces removed. The subshell's stderr goes
# to the same file.
(
  while :; do
    kill -0 "$LLAMA_PID" 2>/dev/null || break
    sample=$(ps -o rss= -p "$LLAMA_PID" 2>/dev/null | tr -d ' ')
    if [[ -n "$sample" ]]; then
      echo "$(date +%s) $sample"
    fi
    sleep 1
  done
) > "$RSS_LOG" 2>&1 &
RSS_PID=$!
# ---- start mock_server ----
# Launch the MARTA mock server in the background, logging to $MOCK_LOG.
uv run python -m harness.mock_server --system marta --port "$MOCK_PORT" \
  > "$MOCK_LOG" 2>&1 &
MOCK_PID=$!

# Poll /health once per second. If the mock process disappears first, show
# the tail of its log, stop llama-server and the RSS sampler, and abort.
mock_health_url="http://localhost:${MOCK_PORT}/health"
while ! curl -sf "$mock_health_url" >/dev/null 2>&1; do
  if ! kill -0 "$MOCK_PID" 2>/dev/null; then
    log "ERROR: mock died"
    tail -30 "$MOCK_LOG"
    kill $LLAMA_PID $RSS_PID 2>/dev/null
    exit 1
  fi
  sleep 1
done
# ---- run probe ----
# Run the harness against the local llama-server and mock server for the
# probe case IDs. The full runner output is kept in $OUT_DIR/runner.log;
# only its last five lines are echoed to the terminal.
log "Running probe (15 stratified cases): $PROBE_CASE_IDS"
RUN_START=$(date +%s)
uv run python -m harness.runner \
  --cases cases/marta_cases.json --system marta \
  --llm-url "http://localhost:${LLAMA_PORT}/v1" --llm-key sk-mac-bench \
  --llm-model "${SIZE}-metro-v23" \
  --case-ids "$PROBE_CASE_IDS" \
  --thinking --parallel 1 \
  --mock-url "http://localhost:${MOCK_PORT}" \
  --output "$RAW_RESULTS" 2>&1 | tee "$OUT_DIR/runner.log" | tail -5
# The pipeline's own status is tail's; the runner's status has to be read
# from PIPESTATUS immediately, before any other command overwrites it.
RUNNER_RC=${PIPESTATUS[0]}
RUN_END=$(date +%s)
log "Runner wallclock: $((RUN_END - RUN_START))s"

# Report a failed run but keep going: the servers started earlier still
# have to be shut down by the section that follows.
if [[ "$RUNNER_RC" -ne 0 ]]; then
  log "WARNING: runner exited with status $RUNNER_RC (see $OUT_DIR/runner.log)"
fi
# ---- shutdown ----
# Ask the mock server and llama-server to stop, allow two seconds, then
# force-kill llama-server, stop the RSS sampler and reap all children.
# Every step is best-effort: failures (e.g. already-exited PIDs) are ignored.
for server_pid in "$MOCK_PID" "$LLAMA_PID"; do
  kill "$server_pid" 2>/dev/null || true
done
sleep 2
kill -9 "$LLAMA_PID" 2>/dev/null || true
kill "$RSS_PID" 2>/dev/null || true
wait 2>/dev/null || true
# ---- score (judge always on) ----
# Score the raw results only when the runner produced them; the last three
# lines of the scorer's output are shown.
if [[ -f "$RAW_RESULTS" ]]; then
  uv run python -m harness.scorer \
    --system marta \
    --results "$RAW_RESULTS" \
    --output "$SCORED_RESULTS" 2>&1 \
    | tail -3
fi
# ---- telemetry ----
# Hand the logs, result files, wallclock and hardware/model identifiers to
# parse_telemetry.py, which writes $TELEMETRY_JSON.
telemetry_args=(
  --llama-log "$LLAMA_LOG"
  --rss-log "$RSS_LOG"
  --raw-results "$RAW_RESULTS"
  --scored-results "$SCORED_RESULTS"
  --runner-wallclock "$((RUN_END - RUN_START))"
  --chip "$CHIP"
  --ram-gb "$RAM_GB"
  --size "$SIZE"
  --ctx-size "$CTX"
  --output "$TELEMETRY_JSON"
)
uv run python scripts/mac_bench/parse_telemetry.py "${telemetry_args[@]}"
log "Done. Output: $OUT_DIR"

# Print a short human-readable summary of the telemetry file. The path is
# passed as an argument and the heredoc is quoted, so the Python source
# reaches the interpreter verbatim.
uv run python - "$TELEMETRY_JSON" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    t = json.loads(fh.read())

print(f" chip: {t['hardware']['chip']} ({t['hardware']['ram_gb']} GB)")
print(f" model: {t['model']['size']} ctx={t['model']['ctx_size']}")
print(f" tier1: {t['eval'].get('tier1_composite', 'n/a')} (n={t['eval'].get('n_cases', 'n/a')})")
print(f" decode tok/s: {t['perf']['decode_tok_s_median']:.1f} median, {t['perf']['decode_tok_s_p10']:.1f} p10")
print(f" ttft ms: {t['perf']['ttft_ms_median']:.0f} median")
print(f" peak rss: {t['perf']['peak_rss_gb']:.2f} GB")
print(f" wallclock: {t['perf']['runner_wallclock_s']}s")
PY
|