#!/usr/bin/env bash
# Launch the BLT pilot AND chain the pre-registered z-ablation eval on the
# final checkpoint. The eval (eval.py) is the *success criterion*; per the
# pre-registration in README.md, we look at Δ_random and Δ_zero BEFORE
# looking at raw GSM8K accuracy.
# Strict mode: stop on any failing command, on use of an unset variable,
# and on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# All later commands run from the home directory.
# NOTE(review): presumably required so `python3 -m experiments.…` resolves
# the `experiments` package relative to the cwd — confirm.
cd /home/ubuntu

# Environment inherited by every process launched from this script.
export \
  TOKENIZERS_PARALLELISM=false \
  TRANSFORMERS_NO_ADVISORY_WARNINGS=1 \
  HF_HUB_DISABLE_PROGRESS_BARS=1 \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Run output directory and the experiment config file.
OUT=/home/ubuntu/work/blt_pilot1
CFG=/home/ubuntu/experiments/blt_reasoner/configs/pilot_qwen15b_gsm8k.json

# Log files live inside the output directory: one for training plus wrapper
# status lines, one for the eval.
LOG="${OUT}/run.log"
EVAL_LOG="${OUT}/eval.log"

# Make sure the output directory exists before anything writes into it.
mkdir -p -- "${OUT}"

# 1) Train, then 2) run the z-ablation eval, chained inside one detached
#    wrapper; the eval only runs if training exits 0.
#
# The wrapper script is single-quoted, so nothing in it is expanded at launch
# time. The paths are handed over as positional parameters ($1..$4) and quoted
# at every use, so they stay intact even if they contain spaces or shell
# metacharacters.
#
# nohup's own stdin/stdout/stderr are redirected explicitly: this prevents a
# stray nohup.out in the cwd and leaves no inherited terminal descriptors on
# the detached job. Anything the wrapper shell itself prints ends up in $LOG.
nohup bash -c '
  set -euo pipefail
  cfg=$1 out=$2 log=$3 eval_log=$4

  # Training. On failure, record the exit code in the run log and exit with
  # that same status so the eval is never run on a broken checkpoint.
  rc=0
  python3 -u -m experiments.blt_reasoner.train --config "$cfg" \
    >> "$log" 2>&1 || rc=$?
  if (( rc != 0 )); then
    echo "[wrapper] training exited $rc — skipping z-ablation eval" >> "$log"
    exit "$rc"
  fi

  echo "[wrapper] training exited 0 — running z-ablation eval" >> "$log"
  # z-ablation eval on the final ckpt (pre-registered success criterion).
  # Its output goes to the separate eval log; only status lines go to $log.
  rc=0
  python3 -u -m experiments.blt_reasoner.eval \
      --ckpt "$out/final" \
      --config "$cfg" \
      --n 200 \
      --out "$out/final/ablation_n200.json" \
      >> "$eval_log" 2>&1 || rc=$?
  if (( rc != 0 )); then
    echo "[wrapper] eval exited $rc — see $eval_log" >> "$log"
    exit "$rc"
  fi
  echo "[wrapper] eval exited 0; ablation written to $out/final/ablation_n200.json" >> "$log"
' blt_pilot_wrapper "$CFG" "$OUT" "$LOG" "$EVAL_LOG" </dev/null >>"$LOG" 2>&1 &

# Record the background job PID so the run can be inspected or stopped later.
PID=$!
echo "$PID" > "$OUT/run.pid"
echo "Launched BLT pilot+eval wrapper pid=$PID log=$LOG eval_log=$EVAL_LOG"