#!/usr/bin/env bash
# Launch the BLT pilot AND chain the pre-registered z-ablation eval on the
# final checkpoint. The eval (eval.py) is the *success criterion*; per the
# pre-registration in README.md, we look at Δ_random and Δ_zero BEFORE
# looking at raw GSM8K accuracy.
#
# Usage: run with no arguments. The script returns immediately after starting
# a detached wrapper that runs training and then the eval.
#
# Files written under $OUT:
#   run.log   - training output plus "[wrapper]" status lines
#   eval.log  - eval output
#   run.pid   - PID of the detached wrapper
#   final/ablation_n200.json - passed to the eval as its --out path
set -euo pipefail

# The python modules are invoked as experiments.blt_reasoner.*, so the
# working directory must be the parent of experiments/.
cd /home/ubuntu

export TOKENIZERS_PARALLELISM=false
export TRANSFORMERS_NO_ADVISORY_WARNINGS=1
export HF_HUB_DISABLE_PROGRESS_BARS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

OUT=/home/ubuntu/work/blt_pilot1
CFG=/home/ubuntu/experiments/blt_reasoner/configs/pilot_qwen15b_gsm8k.json
mkdir -p "$OUT"
LOG="$OUT/run.log"
EVAL_LOG="$OUT/eval.log"

# The inner script is single-quoted and receives its paths as positional
# arguments, so they are never re-parsed as shell code and stay correctly
# quoted even if a path contains spaces or metacharacters.
#
# The wrapper's own stdout/stderr are appended to $LOG (so shell-level errors
# are not lost and nohup does not create a stray nohup.out), and stdin is
# detached from the launching terminal.
nohup bash -c '
  set -euo pipefail
  cfg=$1
  out=$2
  log=$3
  eval_log=$4

  # 1) Train; only proceed to the eval if training exits 0.
  rc=0
  python3 -u -m experiments.blt_reasoner.train --config "$cfg" \
    >> "$log" 2>&1 || rc=$?
  if (( rc != 0 )); then
    # Record the failure so run.log distinguishes "failed" from "still running".
    echo "[wrapper] training exited $rc — skipping z-ablation eval" >> "$log"
    exit "$rc"
  fi
  echo "[wrapper] training exited 0 — running z-ablation eval" >> "$log"

  # 2) z-ablation eval on the final ckpt (pre-registered success criterion).
  rc=0
  python3 -u -m experiments.blt_reasoner.eval \
    --ckpt "$out/final" \
    --config "$cfg" \
    --n 200 \
    --out "$out/final/ablation_n200.json" \
    >> "$eval_log" 2>&1 || rc=$?
  if (( rc != 0 )); then
    echo "[wrapper] eval exited $rc; see $eval_log" >> "$log"
    exit "$rc"
  fi
  echo "[wrapper] eval exited 0; ablation written to $out/final/ablation_n200.json" >> "$log"
' blt_pilot_wrapper "$CFG" "$OUT" "$LOG" "$EVAL_LOG" >> "$LOG" 2>&1 < /dev/null &

# Record the background job PID so the run can be monitored or killed later.
PID=$!
echo "$PID" > "$OUT/run.pid"
echo "Launched BLT pilot+eval wrapper pid=$PID log=$LOG eval_log=$EVAL_LOG"