#!/usr/bin/env bash
# Mac M-series PEFT THERMAL/SUSTAINED-LOAD bench.
#
# Replays the MARTA cases against a local llama-server for up to N minutes
# (one pass through the case set: the runner is stopped when the duration
# elapses and is not restarted if it finishes early), while a parallel sampler
# records tok/s + RSS every 30 s. Captures the cold-start → sustained →
# throttle curve under a realistic kiosk-dialogue workload (multi-round
# tool-using cases, not synthetic 1024-token streams).
#
# **Run only on fanless / passively-cooled silicon.** On fan-cooled Macs
# (M2 Pro, M2 Max, M3/M4 Pro/Max) the curve is flat — use run_probe.sh
# instead for cross-Mac comparison.
#
# Run:
# bash scripts/mac_bench/run_thermal.sh 2b # default 45 min
# bash scripts/mac_bench/run_thermal.sh 2b --duration 30m
# bash scripts/mac_bench/run_thermal.sh 4b --duration 60m --ctx 16384
#
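# Output: results/mac_bench/<chip>-<ram>gb-<size>-thermal/
# - thermal_curve.csv (one row per 30 s window)
# - thermal_curve.json (full samples + cold/sustained/throttle summary)
# - marta_thermal_raw.json (runner --output file)
# - llama_server.log
# - llama_rss.log (epoch seconds + llama-server RSS, one line per second)
# - mock_server.log
# - runner.log
# - sampler.log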
set -o nounset  # any reference to an unset variable aborts the script
# Relative paths used below (data/, results/, scripts/, cases/) are resolved
# from the directory two levels above this script; give up if it is unreachable.
if ! cd "$(dirname "$0")/../.."; then
  exit 1
fi
# ---- arg parse ----
SIZE=""
CTX=""
DURATION_RAW="45m"

# Print this script's leading comment header as help text: the first
# contiguous run of "#" lines (shebang excluded), with the "# " prefix removed.
# Stops at the first non-comment line after the header, so section comments
# further down the file are not dumped into the help output.
print_help() {
  awk '
    /^#!/     { next }
    /^#/      { in_header = 1; sub(/^# ?/, ""); print; next }
    in_header { exit }
  ' "$0"
}

# Parse the command line into the globals SIZE, CTX and DURATION_RAW.
# Arguments: the script's "$@".
# Exits 0 after printing help for -h/--help; exits 2 when --ctx/--duration is
# missing its value or an unknown option is given. The last positional
# argument wins as SIZE.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --ctx|--duration)
        # Check explicitly: under `set -u` a bare "$2" would abort with an
        # unhelpful "unbound variable" message.
        if (( $# < 2 )); then
          echo "ERROR: $1 requires a value" >&2
          exit 2
        fi
        if [[ "$1" == --ctx ]]; then
          CTX=$2
        else
          DURATION_RAW=$2
        fi
        shift 2
        ;;
      --ctx=*)
        CTX=${1#--ctx=}
        shift
        ;;
      --duration=*)
        DURATION_RAW=${1#--duration=}
        shift
        ;;
      -h|--help)
        print_help
        exit 0
        ;;
      -*)
        echo "ERROR: unknown option: $1" >&2
        exit 2
        ;;
      *)
        SIZE=$1
        shift
        ;;
    esac
  done
}
parse_args "$@"
# A model size is mandatory: without one, print the usage line and exit 2.
if [[ -z "$SIZE" ]]; then
  echo "Usage: $0 {2b|4b|9b|27b} [--ctx N] [--duration 45m]" >&2
  exit 2
fi
# Only 2b, 4b, 9b and 27b are accepted.
if [[ "$SIZE" != 2b && "$SIZE" != 4b && "$SIZE" != 9b && "$SIZE" != 27b ]]; then
  echo "Bad size" >&2
  exit 2
fi
# Upper-case form of the size (2b -> 2B), used in the model repo/file names.
SIZE_UP=$(printf '%s\n' "$SIZE" | tr '[:lower:]' '[:upper:]')
# Parse duration: accept "45m", "30m", "1h", "1800s", or bare seconds.
#
# duration_to_seconds RAW
#   Prints RAW converted to whole seconds on stdout and returns 0.
#   Returns 1 (printing nothing) unless RAW is a non-negative integer with an
#   optional s/m/h suffix, so values such as "1.5h" or "abc" are rejected here
#   instead of blowing up later inside an arithmetic expansion.
duration_to_seconds() {
  local raw=$1
  local pattern='^([0-9]+)([smh]?)$'
  [[ "$raw" =~ $pattern ]] || return 1
  # Force base 10 so zero-padded input such as "08m" is not parsed as octal.
  local count=$(( 10#${BASH_REMATCH[1]} ))
  case "${BASH_REMATCH[2]}" in
    h) echo $(( count * 3600 )) ;;
    m) echo $(( count * 60 )) ;;
    *) echo "$count" ;;
  esac
}
# An empty value (e.g. "--duration=") falls back to the 45m default.
if ! DURATION_SEC=$(duration_to_seconds "${DURATION_RAW:-45m}"); then
  echo "Bad duration '${DURATION_RAW}': use e.g. 45m, 1h, 1800s or 1800" >&2
  exit 2
fi
# resolve_ctx SIZE REQUESTED
#   Prints the context size to use and returns 0.
#   - REQUESTED non-empty: must be a positive integer, otherwise returns 1.
#     Checking here fails fast instead of letting llama-server reject the
#     value after the model has been fetched.
#   - REQUESTED empty: the per-size default (2b -> 32768; 4b/9b/27b -> 16384);
#     prints nothing for any other SIZE.
resolve_ctx() {
  local size=$1 requested=$2
  if [[ -n "$requested" ]]; then
    local positive_int='^[1-9][0-9]*$'
    [[ "$requested" =~ $positive_int ]] || return 1
    echo "$requested"
    return 0
  fi
  case "$size" in
    2b) echo 32768 ;;
    4b|9b|27b) echo 16384 ;;
  esac
  return 0
}
if ! resolved_ctx=$(resolve_ctx "$SIZE" "$CTX"); then
  echo "Bad --ctx '$CTX': expected a positive integer" >&2
  exit 2
fi
CTX=$resolved_ctx
# Model artefacts: Hugging Face repo id, GGUF file name and local cache path.
LOCAL_GGUF_DIR=data/mac_models
GGUF_NAME=Qwen3.5-${SIZE_UP}-metro-v23-Q4_K_M.gguf
REPO_ID=continker/Qwen3.5-${SIZE_UP}-metro-v23
LOCAL_GGUF=${LOCAL_GGUF_DIR}/${GGUF_NAME}

# Host identity for the run tag: the CPU brand string with "Apple " removed and
# spaces turned into dashes, plus hw.memsize divided by 1024 three times and
# rounded to a whole number.
CHIP=$(sysctl -n machdep.cpu.brand_string | sed -e 's/Apple //' -e 's/ /-/g')
RAM_GB=$(sysctl -n hw.memsize | awk '{ printf "%.0f", $1 / 1024 / 1024 / 1024 }')

# Per-run output directory, created before anything writes into it.
RUN_TAG=${CHIP}-${RAM_GB}gb-${SIZE}-thermal
OUT_DIR=results/mac_bench/${RUN_TAG}
mkdir -p "$OUT_DIR"
# Fixed local ports: llama-server and the mock server.
MOCK_PORT=8102
LLAMA_PORT=8081

# Every log and result file lives inside the per-run output directory.
CURVE_JSON=${OUT_DIR}/thermal_curve.json
CURVE_CSV=${OUT_DIR}/thermal_curve.csv
RAW_RESULTS=${OUT_DIR}/marta_thermal_raw.json
MOCK_LOG=${OUT_DIR}/mock_server.log
RSS_LOG=${OUT_DIR}/llama_rss.log
LLAMA_LOG=${OUT_DIR}/llama_server.log
# Print a message on stdout prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp
# taken from `date`. Multiple arguments are joined the way "$*" joins them.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*"
}
# Both external tools must be on PATH before anything is started; the error
# names the Homebrew package that provides the missing one.
if ! command -v llama-server >/dev/null 2>&1; then
  log "ERROR: brew install llama.cpp"
  exit 1
fi
if ! command -v uv >/dev/null 2>&1; then
  log "ERROR: brew install uv"
  exit 1
fi
# ---- download GGUF ----
# Fetch the model file into LOCAL_GGUF_DIR unless it is already there.
mkdir -p "$LOCAL_GGUF_DIR"
if [[ ! -f "$LOCAL_GGUF" ]]; then
  log "Downloading $REPO_ID/$GGUF_NAME"
  # Quoted heredoc: the shell values reach Python as argv and are never
  # spliced into its source text. A failing downloader is reported right away.
  if ! uv run --with huggingface_hub python - \
      "$REPO_ID" "$GGUF_NAME" "$LOCAL_GGUF_DIR" <<'PY'
import os
import sys

from huggingface_hub import hf_hub_download

repo_id, filename, local_dir = sys.argv[1:4]
os.makedirs(local_dir, exist_ok=True)
hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)
PY
  then
    log "ERROR: download of $REPO_ID/$GGUF_NAME failed"
    exit 1
  fi
fi
# Final guard: the file must exist whichever path was taken above.
[[ -f "$LOCAL_GGUF" ]] || { log "ERROR: download failed"; exit 1; }
# ---- kill stale ports ----
# kill_port PORT
#   Stop whatever is LISTENING on TCP PORT: SIGTERM first, then SIGKILL after
#   one second for any process that is still alive.
#   Only listeners are targeted; a plain `lsof -i :PORT` also matches processes
#   that merely hold a connection involving that port number (for example a
#   client talking to a remote service on the same port).
#   Always returns 0; a missing lsof or an idle port is not an error.
kill_port() {
  local port=$1 pids pid
  pids=$(lsof -t -iTCP:"$port" -sTCP:LISTEN -P -n 2>/dev/null || true)
  [[ -n "$pids" ]] || return 0
  # lsof -t prints one PID per line; word splitting is intended here.
  # shellcheck disable=SC2086
  kill $pids 2>/dev/null || true
  sleep 1
  for pid in $pids; do
    if kill -0 "$pid" 2>/dev/null; then
      kill -9 "$pid" 2>/dev/null || true
    fi
  done
  sleep 1
  return 0
}
kill_port "$LLAMA_PORT"
kill_port "$MOCK_PORT"
# ---- start llama-server ----
# Stop every background process this script has launched so far. PID variables
# that are not set yet are skipped, so this is safe at any point in the run.
cleanup_children() {
  local pid
  for pid in "${RUNNER_PID:-}" "${SAMPLER_PID:-}" "${MOCK_PID:-}" \
             "${RSS_PID:-}" "${LLAMA_PID:-}"; do
    if [[ -n "$pid" ]]; then
      kill "$pid" 2>/dev/null
    fi
  done
  return 0
}
# Without this, Ctrl-C or an early `exit` leaves llama-server and the Python
# helpers running: bash starts a script's background commands with SIGINT
# ignored, so they do not die with the script. INT/TERM are turned into a
# normal exit so the EXIT trap runs on every path.
trap cleanup_children EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

log "Starting llama-server :$LLAMA_PORT (Metal, parallel=1, ctx=$CTX)"
llama-server --model "$LOCAL_GGUF" --port "$LLAMA_PORT" --n-gpu-layers 999 \
  --ctx-size "$CTX" --parallel 1 --flash-attn on --no-mmap \
  --alias "${SIZE}-metro-v23" > "$LLAMA_LOG" 2>&1 &
LLAMA_PID=$!
# Poll /health every 2 s until it answers successfully; abort with the tail of
# the server log if the process exits first.
until curl -sf "http://localhost:${LLAMA_PORT}/health" >/dev/null 2>&1; do
  if ! kill -0 "$LLAMA_PID" 2>/dev/null; then
    log "ERROR: llama-server died"
    tail -30 "$LLAMA_LOG"
    exit 1
  fi
  sleep 2
done
log "llama-server ready (PID=$LLAMA_PID)"
# ---- RSS sampler (1 s cadence, full duration) ----
# Background loop: while llama-server is alive, append "<epoch-seconds> <rss>"
# (the rss column from ps, spaces stripped) to RSS_LOG once per second.
# Samples where ps returns nothing are skipped.
{
  while kill -0 "$LLAMA_PID" 2>/dev/null; do
    rss_now=$(ps -o rss= -p "$LLAMA_PID" 2>/dev/null | tr -d ' ')
    if [[ -n "$rss_now" ]]; then
      echo "$(date +%s) $rss_now"
    fi
    sleep 1
  done
} > "$RSS_LOG" 2>&1 &
RSS_PID=$!
# ---- start mock_server ----
# Launch the MARTA mock server in the background and poll its /health endpoint
# once per second. If the process exits before answering, stop llama-server
# and the RSS sampler and abort.
uv run python -m harness.mock_server --system marta --port "$MOCK_PORT" \
  > "$MOCK_LOG" 2>&1 &
MOCK_PID=$!
while ! curl --silent --fail "http://localhost:${MOCK_PORT}/health" >/dev/null 2>&1; do
  if ! kill -0 "$MOCK_PID" 2>/dev/null; then
    log "ERROR: mock died"
    kill $LLAMA_PID $RSS_PID 2>/dev/null
    exit 1
  fi
  sleep 1
done
# ---- start thermal sampler in parallel (real-time poll of llama log + RSS) ----
log "Starting thermal sampler (interval=30s, duration=${DURATION_SEC}s)"
# The sampler is given the llama-server log and the RSS log as inputs and the
# CSV/JSON curve files as outputs; its own stdout/stderr go to sampler.log.
sampler_cmd=(
  uv run python scripts/mac_bench/thermal_sampler.py
  --llama-log "$LLAMA_LOG"
  --rss-log "$RSS_LOG"
  --out-csv "$CURVE_CSV"
  --out-json "$CURVE_JSON"
  --interval 30
  --duration "$DURATION_SEC"
)
"${sampler_cmd[@]}" > "$OUT_DIR/sampler.log" 2>&1 &
SAMPLER_PID=$!
# ---- start runner against full 156-case MARTA set; will be killed at duration ----
log "Starting runner (full MARTA, parallel=1, thinking on) — will run for ${DURATION_RAW}"
# RUN_START is the reference point for the "finished early" and total
# wallclock messages later on.
RUN_START=$(date +%s)
# The runner talks to llama-server's /v1 endpoint and to the mock server; its
# results go to RAW_RESULTS and its stdout/stderr to runner.log.
runner_cmd=(
  uv run python -m harness.runner
  --cases cases/marta_cases.json
  --system marta
  --llm-url "http://localhost:${LLAMA_PORT}/v1"
  --llm-key sk-mac-bench
  --llm-model "${SIZE}-metro-v23"
  --thinking
  --parallel 1
  --mock-url "http://localhost:${MOCK_PORT}"
  --output "$RAW_RESULTS"
)
"${runner_cmd[@]}" > "$OUT_DIR/runner.log" 2>&1 &
RUNNER_PID=$!
# ---- wait until the sampler finishes or the deadline passes ----
# wait_for_sampler SAMPLER_PID RUNNER_PID DEADLINE [POLL_SEC]
#   Blocks until the sampler process has exited or the epoch-seconds DEADLINE
#   is reached, polling every POLL_SEC seconds (default 5).
#   If the runner exits first (very fast hardware), that is logged once and the
#   wait CONTINUES: the sampler keeps recording against the warm, idle
#   llama-server, and that flat tail is meaningful (hardware idle behavior).
#   Breaking out of the loop at that point would send the script straight into
#   shutdown, which stops the sampler after at most 60 s.
wait_for_sampler() {
  local sampler_pid=$1 runner_pid=$2 deadline=$3 poll_sec=${4:-5}
  local runner_exit_logged=0
  while (( $(date +%s) < deadline )); do
    # Sampler done: all the data we need has been collected.
    kill -0 "$sampler_pid" 2>/dev/null || break
    if (( ! runner_exit_logged )) && ! kill -0 "$runner_pid" 2>/dev/null; then
      log "Runner finished early at $(( $(date +%s) - RUN_START ))s; sampler continuing on warm llama-server"
      runner_exit_logged=1
    fi
    sleep "$poll_sec"
  done
}
DEADLINE=$(( $(date +%s) + DURATION_SEC + 30 )) # +30s grace for sampler write
wait_for_sampler "$SAMPLER_PID" "$RUNNER_PID" "$DEADLINE"
# ---- shutdown ----
log "Stopping runner (PID=$RUNNER_PID)..."
# TERM first, then KILL after a 2 s grace period. Failures are ignored because
# the runner may already have exited.
kill -TERM "$RUNNER_PID" 2>/dev/null || true
sleep 2
kill -KILL "$RUNNER_PID" 2>/dev/null || true

log "Waiting for sampler to finish (max 60s)..."
SAMPLER_DEADLINE=$(( $(date +%s) + 60 ))
while kill -0 "$SAMPLER_PID" 2>/dev/null; do
  if (( $(date +%s) >= SAMPLER_DEADLINE )); then
    break
  fi
  sleep 2
done

# Stop the sampler, mock server and llama-server, in that order. llama-server
# alone gets a follow-up KILL after 2 s, and the RSS sampler is stopped last.
for victim in "$SAMPLER_PID" "$MOCK_PID" "$LLAMA_PID"; do
  kill "$victim" 2>/dev/null || true
done
sleep 2
kill -KILL "$LLAMA_PID" 2>/dev/null || true
kill "$RSS_PID" 2>/dev/null || true
# Reap all background jobs before reporting.
wait 2>/dev/null || true

RUN_END=$(date +%s)
log "Total wallclock: $((RUN_END - RUN_START))s"
# ---- print summary ----
# Print the headline numbers from the sampler's JSON summary, or point at
# sampler.log when the file was never written.
log "Done. Output: $OUT_DIR"
log ""
log "Thermal summary:"
if [[ -f "$CURVE_JSON" ]]; then
  # Quoted heredoc: the JSON path is passed as argv[1] rather than spliced into
  # the Python source, so any character in the path is handled safely.
  uv run python - "$CURVE_JSON" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    s = json.load(fh)
print(f" duration: {s['duration_sec']}s, samples: {s['n_samples']}")
print(f" cold: {s['tok_s_cold']:.1f} tok/s")
print(f" sustained: {s['tok_s_sustained_last5']:.1f} tok/s (last 5 samples)")
print(f" median: {s['tok_s_median_overall']:.1f} tok/s (overall)")
print(f" throttle: {s['throttle_pct_cold_to_sustained']:+.1f}% (cold → sustained)")
print(f" peak rss: {s['peak_rss_gb']:.2f} GB")
PY
else
  log "(no thermal_curve.json — sampler may have failed; see $OUT_DIR/sampler.log)"
fi