blt-reasoner-pilot1 / code /scripts /queue_opt13_exp.sh
LauraGG's picture
Refresh code/ with latest BLT-Reasoner sources (post-campaign)
bc7101b verified
#!/usr/bin/env bash
# Queued launcher: waits (up to 4 hours) for the current block_z experiment to
# finish, runs a short smoke test, then runs the Options 1+3 experiment:
# * Option 1: full-y InfoNCE target (max_len=128)
# * Option 3: MLP projector π (d→4d→d with GELU)
# Afterwards it runs BOTH the teacher-forced (TF) ablation and the
# autoregressive (AR) ablation (the real, autoregressive eval) on the final
# checkpoint, and pushes everything to HF under exp7b_opt13/.
# Shell options: unset variables are errors and a pipeline fails if any stage
# fails. Note that -e is NOT enabled; later steps log each exit status and
# carry on instead of aborting.
set -u
set -o pipefail

# Hugging Face repo that receives the results.
REPO="LauraGG/blt-reasoner-pilot1"

# Scratch root shared by this experiment, the one it waits on, and the log.
WORK_ROOT="/home/ubuntu/work"

# Output directory for this experiment (logs + checkpoints are written here).
OUT="${WORK_ROOT}/blt_exp7b_opt13"

# Config for the full training run and both ablations.
CFG="/home/ubuntu/experiments/blt_reasoner/configs/exp7b_opt13.json"

# Marker file whose existence signals that the block_z experiment is finished.
WAIT_FOR="${WORK_ROOT}/blt_exp7b_blockz/final/ablation_teacher_forced.json"

# Append-only log written by log().
LOG="${WORK_ROOT}/queue_opt13_exp.log"
# Print a message prefixed with the current wall-clock time ([HH:MM:SS]) to
# stdout and append the same line to the file named by $LOG.
# Arguments: all arguments are joined into a single message.
log() {
  local stamp
  stamp=$(date +%T)
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG"
}
# Prepare the output directory and working directory before anything else.
# The script does not run with `set -e`, so both steps are checked explicitly:
# failing here is far cheaper than discovering the problem only after the
# (up to 4 hour) wait below, when the smoke run cannot write its log.
if ! mkdir -p "$OUT"; then
  log "cannot create output directory $OUT; aborting"
  exit 1
fi

# The python steps are launched as `python3 -m experiments.…`; presumably the
# `experiments` package is importable from this directory — confirm if moved.
if ! cd /home/ubuntu; then
  log "cannot cd to /home/ubuntu; aborting"
  exit 1
fi

# Quieten tokenizer / transformers / HF hub output in the captured logs.
export TOKENIZERS_PARALLELISM=false TRANSFORMERS_NO_ADVISORY_WARNINGS=1 HF_HUB_DISABLE_PROGRESS_BARS=1
# PyTorch CUDA caching-allocator setting.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Poll once a minute until the block_z marker file appears, giving up after
# four hours and continuing regardless. Afterwards pause 30 seconds (logged
# as a GPU drain) before starting any work.
log "queue_opt13_exp.sh starting; waiting for block_z experiment to finish at $WAIT_FOR"
DEADLINE=$(date +%s)
DEADLINE=$(( DEADLINE + 4*3600 ))
until [[ -f "$WAIT_FOR" ]]; do
  if (( $(date +%s) > DEADLINE )); then
    log "deadline reached waiting for block_z exp; proceeding anyway"
    break
  fi
  sleep 60
done
log "proceeding; waiting 30s for GPU drain"
sleep 30
# Banner marking the start of the experiment in the log.
banner_rule="==========================================="
log "$banner_rule"
log "OPTIONS 1+3 EXPERIMENT: full-y InfoNCE + MLP projector"
log "$banner_rule"

# Short smoke run with the dedicated smoke config: a cheap gate that catches
# bugs before committing to the multi-hour training run. Its output goes to
# $OUT/smoke.log; a non-zero exit aborts the whole script with status 1.
log "running 5-step smoke..."
SMOKE_RC=0
python3 -u -m experiments.blt_reasoner.train \
  --config /home/ubuntu/experiments/blt_reasoner/configs/exp7b_opt13_smoke.json \
  > "$OUT/smoke.log" 2>&1 || SMOKE_RC=$?
log "smoke exit=$SMOKE_RC"
if (( SMOKE_RC != 0 )); then
  log "SMOKE FAILED — see $OUT/smoke.log; aborting full run"
  # Surface the end of the smoke log on stdout and in the queue log.
  tail -20 "$OUT/smoke.log" | tee -a "$LOG"
  exit 1
fi
# Full training run with the real config; stdout and stderr are captured in
# $OUT/train.log. The exit status is only logged — the script continues with
# the ablations and the push whatever the outcome.
log "smoke OK; launching full training (3500 steps)"
train_rc=0
python3 -u -m experiments.blt_reasoner.train --config "$CFG" \
  > "$OUT/train.log" 2>&1 || train_rc=$?
log "full train exit=$train_rc"
# Two ablations on the final checkpoint: teacher-forced (TF; cheap, comparable
# to the baseline TF numbers) and autoregressive (AR; the real test). Each
# one's exit status is logged and the script carries on either way.

# Arguments common to both ablation commands.
ablate_args=(--ckpt "$OUT/final" --config "$CFG" --n 200 --K 16)

log "running TF n=200 K=16 ablation on final"
tf_rc=0
python3 -u -m experiments.blt_reasoner.scripts.ablate_teacher_forced \
  "${ablate_args[@]}" \
  --out "$OUT/final/ablation_teacher_forced.json" \
  > "$OUT/tf_eval.log" 2>&1 || tf_rc=$?
log "TF ablate exit=$tf_rc"

log "running AR n=200 K=16 ablation on final"
ar_rc=0
python3 -u -m experiments.blt_reasoner.eval \
  "${ablate_args[@]}" \
  --max_new_tokens 192 --temperature 0.0 \
  --out "$OUT/final/ablation_n200_K16.json" \
  > "$OUT/ar_eval.log" 2>&1 || ar_rc=$?
log "AR ablate exit=$ar_rc"
# Upload the whole output directory to the Hugging Face repo under
# exp7b_opt13/. Requires BLT_HF_TOKEN in the environment (must start with
# "hf_").
#
# The folder and repo id are handed to Python through environment variables
# and the heredoc delimiter is quoted, so the shell never splices text into
# the Python source (a quote or backslash in a path can no longer break or
# alter the program).
#
# As for every other step, the exit status is logged; in addition it becomes
# the script's exit status, so a failed upload is not reported as success.
log "pushing exp7b_opt13/ to HF"
PUSH_FOLDER="$OUT" PUSH_REPO="$REPO" python3 - <<'PYEND'
import os
from huggingface_hub import HfApi
token = os.environ.get("BLT_HF_TOKEN", "").strip()
assert token.startswith("hf_"), "BLT_HF_TOKEN missing"
api = HfApi(token=token)
api.upload_folder(folder_path=os.environ["PUSH_FOLDER"], path_in_repo="exp7b_opt13",
                  repo_id=os.environ["PUSH_REPO"], repo_type="model",
                  commit_message="EXP: Options 1+3 (full-y InfoNCE + MLP projector)")
print("[push] done")
PYEND
PUSH_RC=$?
log "push exit=$PUSH_RC"
log "queue_opt13_exp.sh DONE"
exit "$PUSH_RC"