File size: 9,547 Bytes

2d05890

#!/usr/bin/env bash
# Mac M-series PEFT bench. One model size per invocation.
#
# What it captures: tier1 / composite (existing scorer) + decode tok/s + TTFT
# + peak RAM + chip/RAM/fanless metadata. Single-system MARTA bench (~150 cases).
#
# Artefacts are pulled from the continker/ HF org; the bench needs no teacher box and no network path back to the LAN.
#
# Prereqs (one-time per Mac):
#   - macOS 14+ (Apple Silicon)
#   - Homebrew
#   - llama.cpp:  brew install llama.cpp
#   - uv:         brew install uv  (or `curl -LsSf https://astral.sh/uv/install.sh | sh`)
#   - Repo cloned + `uv sync` in the repo root
#   - .env with ANTHROPIC_API_KEY (for Tier 2 judge)
#
# Run:
#   bash scripts/mac_bench/run_bench.sh 2b               # default ctx=32768 (2B may chain long)
#   bash scripts/mac_bench/run_bench.sh 4b               # default ctx=16384
#   bash scripts/mac_bench/run_bench.sh 9b               # default ctx=16384
#   bash scripts/mac_bench/run_bench.sh 9b --ctx 8192    # tighter ctx for low-RAM Macs
#   (skip 27b on Macs <48 GB unified RAM)
#
# Context-size requirements (fp16 KV cache, --parallel 1, see docs in README.md):
#   p99 final-conversation tokens, measured across 8 Qwen3.5 PEFT/base models on MARTA:
#     2B FT: not yet measured (v17 2B PEFT hit 18.8K → 32K default for safety)
#     4B FT: 8.7K  (16K default → 7.3K headroom for next response)
#     9B FT: 7.8K  (16K default → 8.2K headroom)
#     27B FT: 9.6K (16K default → 6.4K headroom)
#   llama.cpp allocates the full KV cache UPFRONT at server start.
#   Reducing ctx-size below the defaults risks "context full" mid-bench failures.

# Abort on any reference to an unset variable.
set -o nounset
# Run from two directory levels above this script's own directory; the
# relative paths used further down are resolved from there.
if ! cd "$(dirname "$0")/../.."; then
  exit 1
fi

# ---- arg parse ----
# Accepts one positional model size (2b|4b|9b|27b) plus an optional context
# size override given as `--ctx N` or `--ctx=N`. If several positionals are
# passed, the last one wins. An empty --ctx value leaves CTX empty so the
# per-size default applies later.
# Exit status 2 is used for every usage error.
SIZE=""
CTX=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --ctx)
      # A trailing bare `--ctx` would otherwise trip `set -u` on the unbound
      # $2 and abort with an unhelpful "unbound variable" message.
      if [[ $# -lt 2 ]]; then
        echo "Option --ctx requires a value" >&2
        echo "Usage: $0 {2b|4b|9b|27b} [--ctx N]" >&2
        exit 2
      fi
      CTX="$2"
      shift 2
      ;;
    --ctx=*)
      CTX="${1#--ctx=}"
      shift
      ;;
    -h|--help)
      # The help text is scraped from this file's own header comment.
      grep -E '^# (Run:|  bash|  -|  $)' "$0" | sed 's/^# *//'
      exit 0
      ;;
    *)
      SIZE="$1"
      shift
      ;;
  esac
done
if [[ -z "$SIZE" ]]; then
  echo "Usage: $0 {2b|4b|9b|27b} [--ctx N]" >&2
  exit 2
fi
case "$SIZE" in
  2b|4b|9b|27b) ;;
  *) echo "Bad size: $SIZE" >&2; exit 2 ;;
esac
# CTX later feeds awk arithmetic and llama-server's --ctx-size, so anything
# other than a plain positive decimal integer is rejected here.
case "$CTX" in
  "") ;;
  *[!0-9]*|0*) echo "Bad --ctx value: $CTX (expected a positive integer)" >&2; exit 2 ;;
esac
# Upper-cased size is used to build the repo and GGUF names (e.g. 9b -> 9B).
SIZE_UP=$(echo "$SIZE" | tr '[:lower:]' '[:upper:]')

# Per-model default context size, applied only when --ctx was not supplied.
# Values are powers of 2 that cover the measured p99 plus roughly 6K headroom.
# Override with --ctx for tight-RAM Macs; below 8192 risks bench failures on long chains.
if [[ -z "$CTX" ]]; then
  case "$SIZE" in
    # 2B may retry more, and its KV cost is small (~1.2 GB), so 32K is cheap.
    2b)
      CTX=32768
      ;;
    # Measured maxima: 4B 10.3K, 9B 8.2K, 27B 11.5K (27B is not run on Mac in
    # the default flow); 16K covers all three.
    4b|9b|27b)
      CTX=16384
      ;;
  esac
fi

# Model artefact: source repo, GGUF file name, and where it is cached locally.
MODEL_STEM="Qwen3.5-${SIZE_UP}-metro-v23"
REPO_ID="continker/${MODEL_STEM}"
GGUF_NAME="${MODEL_STEM}-Q4_K_M.gguf"
LOCAL_GGUF_DIR="data/mac_models"
LOCAL_GGUF="${LOCAL_GGUF_DIR}/${GGUF_NAME}"

# Hardware tag: CPU brand string with the "Apple " prefix dropped and spaces
# turned into dashes, plus installed memory rounded to whole GB.
CHIP=$(sysctl -n machdep.cpu.brand_string | sed -e 's/Apple //' -e 's/ /-/g')
RAM_GB=$(sysctl -n hw.memsize | awk '{ printf "%.0f", $1 / 1073741824 }')
RUN_TAG="${CHIP}-${RAM_GB}gb-${SIZE}"

# Every artefact of a run lands in one directory keyed by the hardware tag.
OUT_DIR="results/mac_bench/${RUN_TAG}"
mkdir -p "${OUT_DIR}"

# Ports differ from the box-bench defaults (8080 / 8100) so that both benches,
# including both mocks, can run concurrently on the same Mac.
LLAMA_PORT=8081
MOCK_PORT=8102

# Per-run log and result files.
LLAMA_LOG="${OUT_DIR}/llama_server.log"
RSS_LOG="${OUT_DIR}/llama_rss.log"
MOCK_LOG="${OUT_DIR}/mock_server.log"
RAW_RESULTS="${OUT_DIR}/marta_raw.json"
SCORED_RESULTS="${OUT_DIR}/marta_scored.json"
TELEMETRY_JSON="${OUT_DIR}/telemetry.json"

# Timestamped logger: writes "[YYYY-MM-DD HH:MM:SS] <arguments joined by spaces>"
# to stdout.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*"
}

# ---- prereq checks ----
# require_tool TOOL FORMULA
#   Ends the script with status 1 (after logging the brew formula to install)
#   when TOOL cannot be found on PATH.
require_tool() {
  local tool=$1
  local formula=$2
  if command -v "$tool" >/dev/null 2>&1; then
    return 0
  fi
  log "ERROR: $tool not on PATH. brew install $formula"
  exit 1
}

require_tool llama-server llama.cpp
require_tool uv uv

# Neither Hugging Face CLI is mandatory; their absence only triggers a note.
command -v hf >/dev/null 2>&1 \
  || command -v huggingface-cli >/dev/null 2>&1 \
  || log "Note: 'hf' CLI not found; will use uv-managed huggingface_hub Python lib for download."

# ---- download GGUF if missing ----
# Reuses the cached file when present; otherwise fetches it with the
# huggingface_hub Python library inside a uv-managed environment.
mkdir -p "$LOCAL_GGUF_DIR"
if [[ -f "$LOCAL_GGUF" ]]; then
  log "GGUF cached: $LOCAL_GGUF ($(du -h "$LOCAL_GGUF" | awk '{print $1}'))"
else
  log "Downloading $REPO_ID/$GGUF_NAME -> $LOCAL_GGUF"
  # Repo, file name and target directory travel as argv and the heredoc is
  # quoted, so the shell never splices values into the Python source (quotes
  # or backslashes in a value can no longer break or alter the program).
  uv run --with huggingface_hub python - "$REPO_ID" "$GGUF_NAME" "$LOCAL_GGUF_DIR" <<'PY'
import os
import sys

from huggingface_hub import hf_hub_download

repo_id, filename, local_dir = sys.argv[1:4]
os.makedirs(local_dir, exist_ok=True)
path = hf_hub_download(
    repo_id=repo_id,
    filename=filename,
    local_dir=local_dir,
)
print("downloaded:", path)
PY
fi

# The presence of the file is the success criterion, whatever the exit status
# of the download command was.
if [[ ! -f "$LOCAL_GGUF" ]]; then
  log "ERROR: download failed"
  exit 1
fi

# ---- kill anything on llama port + mock port ----
# kill_port PORT
#   Frees PORT by terminating whatever is LISTENING on it over TCP: SIGTERM
#   first, then SIGKILL for anything still listening one second later.
#   Only listeners are targeted; a plain `lsof -i :PORT` also matches
#   processes that merely hold a connection involving that port number
#   and would kill them too.
#   Always returns 0; having nothing to kill is not an error.
kill_port() {
  local port=$1
  local pids
  pids=$(lsof -t -nP -iTCP:"$port" -sTCP:LISTEN 2>/dev/null || true)
  if [[ -z "$pids" ]]; then
    return 0
  fi
  # $pids is a whitespace-separated PID list; word splitting is intended.
  # shellcheck disable=SC2086
  kill $pids 2>/dev/null || true
  sleep 1
  pids=$(lsof -t -nP -iTCP:"$port" -sTCP:LISTEN 2>/dev/null || true)
  if [[ -n "$pids" ]]; then
    # shellcheck disable=SC2086
    kill -9 $pids 2>/dev/null || true
    sleep 1
  fi
  return 0
}
# Clear both ports this run is about to bind: llama-server first, then the mock.
for stale_port in "$LLAMA_PORT" "$MOCK_PORT"; do
  kill_port "$stale_port"
done

# ---- estimated RAM check ----
# kv_cache_gb PER_TOK_KB CTX
#   Prints the KV-cache size in GB (two decimals) for CTX tokens at
#   PER_TOK_KB kilobytes per token. Values reach awk via -v instead of being
#   spliced into the awk program text.
kv_cache_gb() {
  awk -v per_tok_kb="$1" -v ctx="$2" \
    'BEGIN { printf "%.2f", per_tok_kb * ctx / 1024 / 1024 }'
}

# total_mem_gb WEIGHTS_GB KV_GB
#   Prints weights + KV cache + a fixed 1.5 GB allowance for Metal/buffers,
#   in GB with one decimal.
total_mem_gb() {
  awk -v weights="$1" -v kv="$2" 'BEGIN { printf "%.1f", weights + kv + 1.5 }'
}

# Rough KV cost (fp16, GQA): 2B = 36 KB/tok, 4B/9B = 144 KB/tok, 27B = 256 KB/tok
case "$SIZE" in
  2b)  KV_PER_TOK_KB=36;  WEIGHTS_GB=1.2 ;;
  4b)  KV_PER_TOK_KB=144; WEIGHTS_GB=2.6 ;;
  9b)  KV_PER_TOK_KB=144; WEIGHTS_GB=5.3 ;;
  27b) KV_PER_TOK_KB=256; WEIGHTS_GB=16.0 ;;
esac
KV_GB=$(kv_cache_gb "$KV_PER_TOK_KB" "$CTX")
EST_GB=$(total_mem_gb "$WEIGHTS_GB" "$KV_GB")
log "Mem estimate: weights $WEIGHTS_GB GB + KV@${CTX} $KV_GB GB + overhead 1.5 GB = ${EST_GB} GB total."
log "Available: ${RAM_GB} GB unified. (macOS + apps typically reserve 4-6 GB.)"
# Non-fatal: the figures are rough, so an estimate above installed RAM is only
# flagged and the run proceeds.
if awk -v est="$EST_GB" -v ram="$RAM_GB" 'BEGIN { exit !(est > ram) }'; then
  log "WARN: estimated ${EST_GB} GB exceeds the ${RAM_GB} GB installed; consider a smaller --ctx or model size."
fi

# ---- start llama-server ----
# Safety net: whenever the script exits (normally, through an early `exit`, or
# after SIGINT/SIGTERM) terminate every background job that is still running.
# Only live jobs of this shell are targeted, as reported by `jobs -p`, so this
# also covers background helpers started later in the script and is a no-op
# once they have been stopped and reaped.
stop_background_jobs() {
  local bg_pids
  bg_pids=$(jobs -p)
  if [[ -n "$bg_pids" ]]; then
    # Whitespace-separated PID list; word splitting is intended.
    # shellcheck disable=SC2086
    kill $bg_pids 2>/dev/null || true
  fi
}
trap stop_background_jobs EXIT
# Convert signals into a regular exit so the EXIT handler above runs.
trap 'exit 130' INT
trap 'exit 143' TERM

log "Starting llama-server on :$LLAMA_PORT (Metal full-offload, parallel=1, ctx=$CTX)"
llama-server \
  --model "$LOCAL_GGUF" \
  --port "$LLAMA_PORT" \
  --n-gpu-layers 999 \
  --ctx-size "$CTX" \
  --parallel 1 \
  --flash-attn on \
  --alias "${SIZE}-metro-v23" \
  --no-mmap \
  > "$LLAMA_LOG" 2>&1 &
LLAMA_PID=$!
log "llama-server PID=$LLAMA_PID"

# Poll /health every 2 s until curl reports success; bail out with the tail of
# the server log if the process disappears in the meantime.
log "Waiting for llama-server health..."
until curl -sf "http://localhost:${LLAMA_PORT}/health" >/dev/null 2>&1; do
  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
    log "ERROR: llama-server died during startup. Last 30 lines:"
    tail -30 "$LLAMA_LOG"
    exit 1
  fi
  sleep 2
done
log "llama-server ready"

# ---- start RSS sampler (1s cadence) ----
# Background loop that writes one "<epoch-seconds> <rss>" line per second to
# $RSS_LOG, using the RSS value printed by ps, for as long as the llama-server
# PID is alive. Empty ps output is skipped.
{
  while kill -0 "$LLAMA_PID" 2>/dev/null; do
    sample=$(ps -o rss= -p "$LLAMA_PID" 2>/dev/null | tr -d ' ')
    if [[ -n "$sample" ]]; then
      echo "$(date +%s) $sample"
    fi
    sleep 1
  done
} > "$RSS_LOG" 2>&1 &
RSS_PID=$!

# ---- start mock_server ----
log "Starting mock_server on :$MOCK_PORT (system=marta)"
uv run python -m harness.mock_server \
  --system marta \
  --port "$MOCK_PORT" \
  > "$MOCK_LOG" 2>&1 &
MOCK_PID=$!

# Poll the mock's /health once a second. If the mock process is gone before it
# ever answers, show the end of its log, stop llama-server and the RSS sampler,
# and give up.
while ! curl -sf "http://localhost:${MOCK_PORT}/health" >/dev/null 2>&1; do
  if kill -0 "$MOCK_PID" 2>/dev/null; then
    sleep 1
    continue
  fi
  log "ERROR: mock_server died. Last 30 lines:"
  tail -30 "$MOCK_LOG"
  kill "$LLAMA_PID" "$RSS_PID" 2>/dev/null || true
  exit 1
done

# ---- run bench ----
# Runs the MARTA cases against the local llama-server. The full runner output
# goes to runner.log; only its last 5 lines are echoed to the console.
RUN_START=$(date +%s)
log "Running runner (MARTA, parallel=1, thinking on)..."
uv run python -m harness.runner \
    --cases "cases/marta_cases.json" --system marta \
    --llm-url "http://localhost:${LLAMA_PORT}/v1" \
    --llm-key "sk-mac-bench" \
    --llm-model "${SIZE}-metro-v23" \
    --thinking --parallel 1 \
    --mock-url "http://localhost:${MOCK_PORT}" \
    --output "$RAW_RESULTS" 2>&1 | tee "$OUT_DIR/runner.log" | tail -5
# A pipeline's status is that of its last stage (tail), so testing the pipeline
# itself can never reveal a runner failure. PIPESTATUS[0] holds the runner's own
# exit status and must be read immediately after the pipeline.
RUNNER_RC=${PIPESTATUS[0]}
if [[ "$RUNNER_RC" -ne 0 ]]; then
  log "WARN: runner returned non-zero; will still attempt scoring"
fi
RUN_END=$(date +%s)
log "Runner wallclock: $((RUN_END - RUN_START))s"

# ---- shutdown llama + mock + rss in correct order ----
# quiet_kill [SIGNAL-OPTION] PID...
#   Best-effort kill: forwards its arguments to kill, hides its error output,
#   and never fails.
quiet_kill() {
  kill "$@" 2>/dev/null || true
}

log "Stopping mock_server..."
quiet_kill "$MOCK_PID"

log "Stopping llama-server..."
quiet_kill "$LLAMA_PID"
# Allow 2 s for a clean exit, then force-kill whatever is left.
sleep 2
quiet_kill -9 "$LLAMA_PID"

quiet_kill "$RSS_PID"
# Reap all background children before moving on.
wait 2>/dev/null || true

# ---- score (scorer always uses LLM judge; needs ANTHROPIC_API_KEY in .env) ----
# Scoring is skipped with a warning when the runner left no raw results file;
# otherwise only the last 5 lines of scorer output are shown.
if [[ ! -f "$RAW_RESULTS" ]]; then
  log "WARN: no raw results to score"
else
  log "Scoring (Claude Haiku judge)..."
  uv run python -m harness.scorer \
    --system marta \
    --results "$RAW_RESULTS" \
    --output "$SCORED_RESULTS" 2>&1 \
    | tail -5
fi

# ---- parse telemetry ----
# Hands the logs, results and run metadata to parse_telemetry.py, which writes
# its output to $TELEMETRY_JSON.
log "Parsing telemetry..."
RUNNER_WALLCLOCK_S=$((RUN_END - RUN_START))
TELEMETRY_ARGS=(
  --llama-log "$LLAMA_LOG"
  --rss-log "$RSS_LOG"
  --raw-results "$RAW_RESULTS"
  --scored-results "$SCORED_RESULTS"
  --runner-wallclock "$RUNNER_WALLCLOCK_S"
  --chip "$CHIP"
  --ram-gb "$RAM_GB"
  --size "$SIZE"
  --ctx-size "$CTX"
  --output "$TELEMETRY_JSON"
)
uv run python scripts/mac_bench/parse_telemetry.py "${TELEMETRY_ARGS[@]}"

log "Done. Output: $OUT_DIR"
log ""
log "Telemetry summary:"
# Without the telemetry file there is nothing to summarise; report that plainly
# instead of letting Python die with a traceback.
if [[ ! -f "$TELEMETRY_JSON" ]]; then
  log "ERROR: $TELEMETRY_JSON not found; telemetry parsing did not produce output"
  exit 1
fi
# The path is passed as argv and the heredoc is quoted, so nothing is spliced
# into the Python source and no quote escaping is needed.
uv run python - "$TELEMETRY_JSON" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    t = json.load(fh)
print(f"  chip:           {t['hardware']['chip']} ({t['hardware']['ram_gb']} GB)")
print(f"  model:          {t['model']['size']} ({t['model']['gguf_gb']:.2f} GB GGUF)")
print(f"  tier1:          {t['eval'].get('tier1_composite', 'n/a')}")
print(f"  composite:      {t['eval'].get('metrollm_composite', 'n/a')}")
print(f"  decode tok/s:   {t['perf']['decode_tok_s_median']:.1f} median, {t['perf']['decode_tok_s_p10']:.1f} p10")
print(f"  ttft ms:        {t['perf']['ttft_ms_median']:.0f} median")
print(f"  peak rss:       {t['perf']['peak_rss_gb']:.2f} GB")
print(f"  wallclock:      {t['perf']['runner_wallclock_s']}s")
PY