#!/usr/bin/env bash
# Auto-launches after current block_z exp finishes. Runs Options 1+3 experiment:
#   * Option 1: full-y InfoNCE target (max_len=128)
#   * Option 3: MLP projector π (d→4d→d with GELU)
# Then runs BOTH TF-ablation and AR-ablation (the real, autoregressive eval) on
# the final ckpt, and pushes everything to HF under exp7b_opt13/.
#
# Flow of this section:
#   1. Wait (up to 4 hours) for the block_z experiment's result file to appear.
#   2. Run a short smoke training; abort the whole queue if it fails.
#   3. Run the full training and log its exit code.

# No `-e` here: exit codes of the long-running steps are captured and logged
# explicitly instead of aborting the script on the first failure.
set -uo pipefail

REPO="LauraGG/blt-reasoner-pilot1"
OUT="/home/ubuntu/work/blt_exp7b_opt13"
CFG="/home/ubuntu/experiments/blt_reasoner/configs/exp7b_opt13.json"
# Smoke config lives next to the full config, with a `_smoke` suffix.
SMOKE_CFG="${CFG%.json}_smoke.json"
WAIT_FOR="/home/ubuntu/work/blt_exp7b_blockz/final/ablation_teacher_forced.json"
LOG="/home/ubuntu/work/queue_opt13_exp.log"

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date +%T)] $*" | tee -a "$LOG"; }

# $OUT holds every log and result written below; without it nothing can run.
mkdir -p "$OUT" || { echo "cannot create $OUT" >&2; exit 1; }

# The python modules are invoked as `-m experiments....`, which is resolved
# relative to the working directory, so a failed cd must stop the script.
cd /home/ubuntu || { log "cannot cd to /home/ubuntu; aborting"; exit 1; }

export TOKENIZERS_PARALLELISM=false
export TRANSFORMERS_NO_ADVISORY_WARNINGS=1
export HF_HUB_DISABLE_PROGRESS_BARS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

log "queue_opt13_exp.sh starting; waiting for block_z experiment to finish at $WAIT_FOR"

# Poll once a minute for the upstream result file; give up waiting after four
# hours and carry on regardless (deliberate best-effort).
DEADLINE=$(( $(date +%s) + 4*3600 ))
while [[ ! -f "$WAIT_FOR" ]]; do
  if (( $(date +%s) > DEADLINE )); then
    log "deadline reached waiting for block_z exp; proceeding anyway"
    break
  fi
  sleep 60
done

log "proceeding; waiting 30s for GPU drain"
sleep 30

log "==========================================="
log "OPTIONS 1+3 EXPERIMENT: full-y InfoNCE + MLP projector"
log "==========================================="

# Quick smoke to catch bugs before the multi-hour run
log "running 5-step smoke..."
python3 -u -m experiments.blt_reasoner.train \
  --config "$SMOKE_CFG" \
  > "$OUT/smoke.log" 2>&1
SMOKE_RC=$?
log "smoke exit=$SMOKE_RC"
if (( SMOKE_RC != 0 )); then
  log "SMOKE FAILED — see $OUT/smoke.log; aborting full run"
  tail -n 20 "$OUT/smoke.log" | tee -a "$LOG"
  exit 1
fi

log "smoke OK; launching full training (3500 steps)"
python3 -u -m experiments.blt_reasoner.train --config "$CFG" \
  > "$OUT/train.log" 2>&1
TRAIN_RC=$?
# NOTE(review): a non-zero training exit is only logged, not treated as fatal;
# presumably so later steps can still run on whatever was produced — confirm.
log "full train exit=$TRAIN_RC"
# Two ablations: TF (cheap, comparable to baseline TF) and AR (the real test) log "running TF n=200 K=16 ablation on final" python3 -u -m experiments.blt_reasoner.scripts.ablate_teacher_forced \ --ckpt "$OUT/final" --config "$CFG" --n 200 --K 16 \ --out "$OUT/final/ablation_teacher_forced.json" \ > "$OUT/tf_eval.log" 2>&1 log "TF ablate exit=$?" log "running AR n=200 K=16 ablation on final" python3 -u -m experiments.blt_reasoner.eval \ --ckpt "$OUT/final" --config "$CFG" --n 200 --K 16 \ --max_new_tokens 192 --temperature 0.0 \ --out "$OUT/final/ablation_n200_K16.json" \ > "$OUT/ar_eval.log" 2>&1 log "AR ablate exit=$?" log "pushing exp7b_opt13/ to HF" python3 - <