| #!/usr/bin/env bash |
#
# Benchmark a metro-v23 GGUF locally on a Mac: download the model, serve it
# with llama.cpp (Metal), run the MARTA harness against it, score the results,
# and parse throughput/RSS telemetry into a summary JSON.
#
# Run:
#  bash scripts/mac_bench/run_mac_bench.sh {2b|4b|9b|27b} [--ctx N]
#  - 2b|4b|9b|27b  model size (selects the matching HF repo and GGUF file)
#  - --ctx N       context-window override (otherwise a per-size default)
#
|
|
set -u
cd "$(dirname "$0")/../.." || exit 1

# --- Argument parsing -------------------------------------------------------
# Positional: model size (2b|4b|9b|27b). Optional: --ctx N context override.
SIZE=""
CTX=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --ctx)
      # Guard a trailing `--ctx` with no value: under `set -u` the bare "$2"
      # would abort with an opaque "unbound variable" error instead.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: --ctx requires a value" >&2
        exit 2
      fi
      CTX="$2"; shift 2 ;;
    --ctx=*) CTX="${1#--ctx=}"; shift ;;
    -h|--help)
      # Help text lives in the header comment block; extract and strip '# '.
      grep -E '^# (Run:| bash| -| $)' "$0" | sed 's/^# *//'
      exit 0
      ;;
    *) SIZE="$1"; shift ;;
  esac
done
if [[ -z "$SIZE" ]]; then
  echo "Usage: $0 {2b|4b|9b|27b} [--ctx N]" >&2
  exit 2
fi
case "$SIZE" in
  2b|4b|9b|27b) ;;
  *) echo "Bad size: $SIZE" >&2; exit 2 ;;
esac
# Reject a non-numeric --ctx early, before it reaches llama-server.
if [[ -n "$CTX" && ! "$CTX" =~ ^[0-9]+$ ]]; then
  echo "Bad --ctx (expected a positive integer): $CTX" >&2
  exit 2
fi
SIZE_UP=$(tr '[:lower:]' '[:upper:]' <<<"$SIZE")
|
|
| |
| |
# Per-size default context window when --ctx was not given.
# SIZE is already validated above to one of 2b|4b|9b|27b.
if [[ -z "$CTX" ]]; then
  case "$SIZE" in
    2b)        CTX=32768 ;;
    4b|9b|27b) CTX=16384 ;;
  esac
fi
|
|
# --- Model identifiers and local cache location -----------------------------
LOCAL_GGUF_DIR="data/mac_models"
MODEL_BASE="Qwen3.5-${SIZE_UP}-metro-v23"
REPO_ID="continker/${MODEL_BASE}"
GGUF_NAME="${MODEL_BASE}-Q4_K_M.gguf"
LOCAL_GGUF="${LOCAL_GGUF_DIR}/${GGUF_NAME}"
|
|
# --- Run context: hardware fingerprint, output dir, well-known paths --------
# e.g. "Apple M2 Pro" -> "M2-Pro"; memsize bytes -> whole GB.
CHIP=$(sysctl -n machdep.cpu.brand_string | sed -e 's/Apple //' -e 's/ /-/g')
RAM_GB=$(sysctl -n hw.memsize | awk '{printf "%.0f", $1 / (1024 * 1024 * 1024)}')
RUN_TAG="${CHIP}-${RAM_GB}gb-${SIZE}"
OUT_DIR="results/mac_bench/${RUN_TAG}"
mkdir -p "$OUT_DIR"

# Fixed local ports for the two servers.
LLAMA_PORT=8081
MOCK_PORT=8102

# All run artifacts live under the per-run output directory.
LLAMA_LOG="${OUT_DIR}/llama_server.log"
RSS_LOG="${OUT_DIR}/llama_rss.log"
MOCK_LOG="${OUT_DIR}/mock_server.log"
RAW_RESULTS="${OUT_DIR}/marta_raw.json"
SCORED_RESULTS="${OUT_DIR}/marta_scored.json"
TELEMETRY_JSON="${OUT_DIR}/telemetry.json"
|
|
| log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; } |
|
|
| |
# --- Preflight: required CLI tools ------------------------------------------
command -v llama-server >/dev/null 2>&1 || {
  log "ERROR: llama-server not on PATH. brew install llama.cpp"
  exit 1
}
command -v uv >/dev/null 2>&1 || {
  log "ERROR: uv not on PATH. brew install uv"
  exit 1
}
# The HF CLI is optional; downloads fall back to the Python library.
if ! { command -v hf || command -v huggingface-cli; } >/dev/null 2>&1; then
  log "Note: 'hf' CLI not found; will use uv-managed huggingface_hub Python lib for download."
fi
|
|
| |
# --- Model download ---------------------------------------------------------
# Fetch the GGUF from the Hugging Face Hub unless it is already cached.
mkdir -p "$LOCAL_GGUF_DIR"
if [[ -f "$LOCAL_GGUF" ]]; then
  log "GGUF cached: $LOCAL_GGUF ($(du -h "$LOCAL_GGUF" | awk '{print $1}'))"
else
  log "Downloading $REPO_ID/$GGUF_NAME -> $LOCAL_GGUF"
  # Unquoted heredoc delimiter on purpose: the shell interpolates $REPO_ID,
  # $GGUF_NAME and $LOCAL_GGUF_DIR into the Python source before it runs.
  # These values are script-controlled (validated size + fixed strings), not
  # user-typed free text. Failure is tolerated here; the existence check
  # below is the real gate.
  uv run --with huggingface_hub python - <<PY
from huggingface_hub import hf_hub_download
import os
os.makedirs("$LOCAL_GGUF_DIR", exist_ok=True)
path = hf_hub_download(
    repo_id="$REPO_ID",
    filename="$GGUF_NAME",
    local_dir="$LOCAL_GGUF_DIR",
)
print("downloaded:", path)
PY
fi
|
|
# Hard fail when nothing landed on disk (download error or wrong filename).
[[ -f "$LOCAL_GGUF" ]] || { log "ERROR: download failed"; exit 1; }
|
|
| |
# kill_port PORT — free a TCP port: SIGTERM any listeners, wait, then SIGKILL
# stragglers. Always returns 0. (The previous version ended in
# `[[ -n "$pids" ]] && { ... }`, which made the function return 1 whenever the
# TERM alone cleared the port — harmless today, but a landmine under `set -e`.)
kill_port() {
  local port=$1
  local pids
  pids=$(lsof -t -i :"${port}" -P -n 2>/dev/null || true)
  if [[ -n "$pids" ]]; then
    # $pids is a whitespace-separated PID list; word-splitting is intended.
    # shellcheck disable=SC2086
    kill $pids 2>/dev/null || true
    sleep 1
    pids=$(lsof -t -i :"${port}" -P -n 2>/dev/null || true)
    if [[ -n "$pids" ]]; then
      # shellcheck disable=SC2086
      kill -9 $pids 2>/dev/null || true
      sleep 1
    fi
  fi
  return 0
}
kill_port "$LLAMA_PORT"
kill_port "$MOCK_PORT"
|
|
| |
| |
# --- Memory estimate (informational only) -----------------------------------
# Rough per-size constants: KV-cache cost per token (KB) and Q4_K_M weight
# size (GB). Used only for the advisory log lines below.
case "$SIZE" in
  2b)  KV_PER_TOK_KB=36;  WEIGHTS_GB=1.2 ;;
  4b)  KV_PER_TOK_KB=144; WEIGHTS_GB=2.6 ;;
  9b)  KV_PER_TOK_KB=144; WEIGHTS_GB=5.3 ;;
  27b) KV_PER_TOK_KB=256; WEIGHTS_GB=16.0 ;;
esac
# Pass shell values into awk via -v instead of interpolating them into the
# program text: an empty variable would previously have produced an awk
# syntax error, and interpolation is an injection anti-pattern in general.
KV_GB=$(awk -v kb="$KV_PER_TOK_KB" -v ctx="$CTX" \
  'BEGIN {printf "%.2f", kb * ctx / 1024 / 1024}')
EST_GB=$(awk -v w="$WEIGHTS_GB" -v kv="$KV_GB" \
  'BEGIN {printf "%.1f", w + kv + 1.5}')
log "Mem estimate: weights $WEIGHTS_GB GB + KV@${CTX} $KV_GB GB + overhead 1.5 GB = ${EST_GB} GB total."
log "Available: ${RAM_GB} GB unified. (macOS + apps typically reserve 4-6 GB.)"
|
|
| |
# --- llama-server launch -----------------------------------------------------
log "Starting llama-server on :$LLAMA_PORT (Metal full-offload, parallel=1, ctx=$CTX)"
LLAMA_ARGS=(
  --model "$LOCAL_GGUF"
  --port "$LLAMA_PORT"
  --n-gpu-layers 999            # offload every layer
  --ctx-size "$CTX"
  --parallel 1
  --flash-attn on
  --alias "${SIZE}-metro-v23"   # model name the runner requests
  --no-mmap                     # load weights into RAM rather than mmap
)
llama-server "${LLAMA_ARGS[@]}" > "$LLAMA_LOG" 2>&1 &
LLAMA_PID=$!
log "llama-server PID=$LLAMA_PID"
|
|
| |
# --- Wait for llama-server health -------------------------------------------
# Previously this loop could spin forever if the server hung without dying
# (e.g. stuck during model load). Bound it: 10 minutes is generous even for
# the 27b model on slow storage.
log "Waiting for llama-server health..."
HEALTH_DEADLINE=$(( $(date +%s) + 600 ))
until curl -sf "http://localhost:${LLAMA_PORT}/health" >/dev/null 2>&1; do
  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
    log "ERROR: llama-server died during startup. Last 30 lines:"
    tail -30 "$LLAMA_LOG"
    exit 1
  fi
  if (( $(date +%s) > HEALTH_DEADLINE )); then
    log "ERROR: llama-server not healthy within 600s. Last 30 lines:"
    tail -30 "$LLAMA_LOG"
    kill "$LLAMA_PID" 2>/dev/null || true
    exit 1
  fi
  sleep 2
done
log "llama-server ready"
|
|
| |
# --- RSS sampler -------------------------------------------------------------
# Background subshell: once a second, while llama-server is alive, append
# "<epoch-seconds> <rss-kb>" to the RSS log for later telemetry parsing.
(
  while kill -0 "$LLAMA_PID" 2>/dev/null; do
    sample_kb=$(ps -o rss= -p "$LLAMA_PID" 2>/dev/null | tr -d ' ')
    if [[ -n "$sample_kb" ]]; then
      printf '%s %s\n' "$(date +%s)" "$sample_kb"
    fi
    sleep 1
  done
) > "$RSS_LOG" 2>&1 &
RSS_PID=$!
|
|
| |
# --- Mock backend launch -----------------------------------------------------
log "Starting mock_server on :$MOCK_PORT (system=marta)"
uv run python -m harness.mock_server --system marta --port "$MOCK_PORT" \
  > "$MOCK_LOG" 2>&1 &
MOCK_PID=$!
# Bounded wait (the old loop could spin forever if the process hung without
# dying); a lightweight mock should bind well within a minute.
MOCK_DEADLINE=$(( $(date +%s) + 60 ))
until curl -sf "http://localhost:${MOCK_PORT}/health" >/dev/null 2>&1; do
  if ! kill -0 "$MOCK_PID" 2>/dev/null; then
    log "ERROR: mock_server died. Last 30 lines:"
    tail -30 "$MOCK_LOG"
    kill "$LLAMA_PID" "$RSS_PID" 2>/dev/null || true
    exit 1
  fi
  if (( $(date +%s) > MOCK_DEADLINE )); then
    log "ERROR: mock_server not healthy within 60s. Last 30 lines:"
    tail -30 "$MOCK_LOG"
    kill "$MOCK_PID" "$LLAMA_PID" "$RSS_PID" 2>/dev/null || true
    exit 1
  fi
  sleep 1
done
|
|
| |
# --- Benchmark run -----------------------------------------------------------
RUN_START=$(date +%s)
log "Running runner (MARTA, parallel=1, thinking on)..."
uv run python -m harness.runner \
  --cases "cases/marta_cases.json" --system marta \
  --llm-url "http://localhost:${LLAMA_PORT}/v1" \
  --llm-key "sk-mac-bench" \
  --llm-model "${SIZE}-metro-v23" \
  --thinking --parallel 1 \
  --mock-url "http://localhost:${MOCK_PORT}" \
  --output "$RAW_RESULTS" 2>&1 | tee "$OUT_DIR/runner.log" | tail -5
# Without pipefail, the old `if ! runner | tee | tail` only ever tested
# tail's (always-zero) exit status, so runner failures were silently
# swallowed. Check the first pipeline stage explicitly.
if [[ "${PIPESTATUS[0]}" -ne 0 ]]; then
  log "WARN: runner returned non-zero; will still attempt scoring"
fi
RUN_END=$(date +%s)
log "Runner wallclock: $((RUN_END - RUN_START))s"
|
|
| |
# --- Teardown ----------------------------------------------------------------
# Order matters: stop the mock first, TERM llama-server, give it 2s, then
# escalate to KILL in case it ignored SIGTERM. NOTE(review): the RSS sampler
# is stopped last, presumably so it can record samples through shutdown.
# The final `wait` reaps all background children before scoring starts.
log "Stopping mock_server..."
kill "$MOCK_PID" 2>/dev/null || true
log "Stopping llama-server..."
kill "$LLAMA_PID" 2>/dev/null || true
sleep 2
kill -9 "$LLAMA_PID" 2>/dev/null || true
kill "$RSS_PID" 2>/dev/null || true
wait 2>/dev/null || true
|
|
| |
# --- Scoring -----------------------------------------------------------------
if [[ ! -f "$RAW_RESULTS" ]]; then
  log "WARN: no raw results to score"
else
  log "Scoring (Claude Haiku judge)..."
  uv run python -m harness.scorer \
    --system marta --results "$RAW_RESULTS" --output "$SCORED_RESULTS" \
    2>&1 | tail -5
fi
|
|
| |
# --- Telemetry ---------------------------------------------------------------
log "Parsing telemetry..."
TELEMETRY_ARGS=(
  --llama-log "$LLAMA_LOG"
  --rss-log "$RSS_LOG"
  --raw-results "$RAW_RESULTS"
  --scored-results "$SCORED_RESULTS"
  --runner-wallclock "$((RUN_END - RUN_START))"
  --chip "$CHIP" --ram-gb "$RAM_GB" --size "$SIZE"
  --ctx-size "$CTX"
  --output "$TELEMETRY_JSON"
)
uv run python scripts/mac_bench/parse_telemetry.py "${TELEMETRY_ARGS[@]}"
|
|
# --- Summary -----------------------------------------------------------------
log "Done. Output: $OUT_DIR"
log ""
log "Telemetry summary:"
# Inline Python pretty-printer for telemetry.json. The double-quoted -c string
# is interpolated by the shell (only $TELEMETRY_JSON is substituted); the
# f-strings' own double quotes are escaped as \" so they survive that pass.
# The .get(...) fallbacks tolerate a run where scoring was skipped.
uv run python -c "
import json
t = json.loads(open('$TELEMETRY_JSON').read())
print(f\" chip: {t['hardware']['chip']} ({t['hardware']['ram_gb']} GB)\")
print(f\" model: {t['model']['size']} ({t['model']['gguf_gb']:.2f} GB GGUF)\")
print(f\" tier1: {t['eval'].get('tier1_composite', 'n/a')}\")
print(f\" composite: {t['eval'].get('metrollm_composite', 'n/a')}\")
print(f\" decode tok/s: {t['perf']['decode_tok_s_median']:.1f} median, {t['perf']['decode_tok_s_p10']:.1f} p10\")
print(f\" ttft ms: {t['perf']['ttft_ms_median']:.0f} median\")
print(f\" peak rss: {t['perf']['peak_rss_gb']:.2f} GB\")
print(f\" wallclock: {t['perf']['runner_wallclock_s']}s\")
"
|
|