#!/usr/bin/env bash # Chain script: wait for pilot final ablation, then train + eval + push two # control ablations sequentially. # # Run as: # BLT_HF_TOKEN= nohup bash queue_controls.sh > /home/ubuntu/work/queue_controls.log 2>&1 & # # Each control is 3000 K=4 steps (~1.6h on GH200) followed by an n=100 z-ablation. # Both then get pushed under `controls//` in the HF repo. set -uo pipefail REPO="LauraGG/blt-reasoner-pilot1" PILOT_FINAL_ABL="/home/ubuntu/work/blt_pilot1/final/ablation_n200.json" LOG="/home/ubuntu/work/queue_controls.log" log() { echo "[$(date +%T)] $*" | tee -a "$LOG"; } cd /home/ubuntu export TOKENIZERS_PARALLELISM=false TRANSFORMERS_NO_ADVISORY_WARNINGS=1 HF_HUB_DISABLE_PROGRESS_BARS=1 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # ---- 1) wait until pilot final ablation is on disk ------------------------ log "queue_controls.sh starting; waiting for $PILOT_FINAL_ABL" DEADLINE=$(( $(date +%s) + 6*3600 )) # hard 6h wait limit while [ ! -f "$PILOT_FINAL_ABL" ]; do if [ "$(date +%s)" -gt "$DEADLINE" ]; then log "FATAL: deadline exceeded waiting for pilot final ablation; aborting" exit 1 fi sleep 60 done log "pilot final ablation present; proceeding" sleep 30 # let GPU drain run_control () { local name="$1" local cfg="$2" local out_dir="$3" log "==========================================" log "CONTROL [$name]: train cfg=$cfg out=$out_dir" log "==========================================" python3 -u -m experiments.blt_reasoner.train --config "$cfg" \ > "$out_dir/train.log" 2>&1 rc=$? log "control [$name]: train exit=$rc" if [ $rc -ne 0 ]; then log "control [$name]: train FAILED; skipping eval+push" return fi log "control [$name]: running n=100 K=4 z-ablation on final ckpt" python3 -u -m experiments.blt_reasoner.eval \ --ckpt "$out_dir/final" --config "$cfg" \ --n 100 --K 4 --max_new_tokens 192 --temperature 0.0 \ --out "$out_dir/final/ablation_K4_n100.json" \ >> "$out_dir/eval.log" 2>&1 rc=$? log "control [$name]: eval exit=$rc" log "control [$name]: pushing to HF under controls/$name/" python3 - <