blt-reasoner-pilot1 / code /scripts /queue_controls.sh
LauraGG's picture
Refresh code/ with latest BLT-Reasoner sources (post-campaign)
bc7101b verified
#!/usr/bin/env bash
# Chain script: wait for pilot final ablation, then train + eval + push two
# control ablations sequentially.
#
# Run as:
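# BLT_HF_TOKEN=<hf_token> nohup bash queue_controls.sh > /home/ubuntu/work/queue_controls.nohup.log 2>&1 &
# (Redirect to a file other than queue_controls.log: log() already appends every
# message to that file with `tee -a`, so a truncating `>` onto the same path
# would duplicate log lines and let the two writers overwrite each other.)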
#
# Each control is 3000 K=4 steps (~1.6h on GH200) followed by an n=100 z-ablation.
# Both then get pushed under `controls/<name>/` in the HF repo.
# Shell options: treat unset variables as errors (-u) and make a pipeline fail
# when any of its stages fails (pipefail). errexit (-e) is not enabled, so a
# failing command does not abort the script by itself.
set -u
set -o pipefail

# Hugging Face repo id that the control results are uploaded to.
REPO="LauraGG/blt-reasoner-pilot1"

# File whose appearance on disk signals that the pilot's final ablation is done.
PILOT_FINAL_ABL="/home/ubuntu/work/blt_pilot1/final/ablation_n200.json"

# File that log() appends every message to.
LOG="/home/ubuntu/work/queue_controls.log"
# log MESSAGE...
# Joins the arguments into one line, prefixes it with the current time as
# [HH:MM:SS], prints it to stdout and appends the same line to the file
# named by $LOG.
log() {
  local stamp
  stamp=$(date +%T)
  printf '%s\n' "[${stamp}] $*" | tee -a "$LOG"
}
# Everything below is launched from /home/ubuntu (presumably so that
# `python3 -m experiments...` resolves the package from the working directory).
# Abort immediately if the directory is unavailable instead of waiting for
# hours and then failing at the first python invocation.
cd /home/ubuntu || {
  log "FATAL: cannot cd to /home/ubuntu; aborting"
  exit 1
}

# Environment for the Python child processes.
export TOKENIZERS_PARALLELISM=false TRANSFORMERS_NO_ADVISORY_WARNINGS=1 HF_HUB_DISABLE_PROGRESS_BARS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# ---- 1) wait until pilot final ablation is on disk ------------------------
log "queue_controls.sh starting; waiting for $PILOT_FINAL_ABL"
DEADLINE=$(( $(date +%s) + 6*3600 )) # hard 6h wait limit
# Poll once a minute until the marker file exists or the deadline passes.
until [[ -f "$PILOT_FINAL_ABL" ]]; do
  if (( $(date +%s) > DEADLINE )); then
    log "FATAL: deadline exceeded waiting for pilot final ablation; aborting"
    exit 1
  fi
  sleep 60
done
log "pilot final ablation present; proceeding"
sleep 30 # let GPU drain
#######################################
# Train one control ablation, evaluate its final checkpoint and upload the
# output directory to the Hugging Face repo under controls/<name>/.
# Globals:
#   REPO          (read) Hugging Face repo id to upload to
#   BLT_HF_TOKEN  (read, by the embedded Python) must start with "hf_"
# Arguments:
#   $1 - control name (used in log lines and as the repo sub-folder)
#   $2 - path to the training/eval config
#   $3 - output directory (train.log, eval.log, final/ live here)
# Returns:
#   training exit status if training failed (eval and push are skipped);
#   push exit status if the upload failed;
#   otherwise the eval exit status (0 when everything succeeded).
#   The upload is attempted even when eval failed.
#######################################
run_control () {
  local name="$1"
  local cfg="$2"
  local out_dir="$3"
  # Keep the status variables local so they do not leak into the caller.
  local rc eval_rc push_rc

  log "=========================================="
  log "CONTROL [$name]: train cfg=$cfg out=$out_dir"
  log "=========================================="

  # The log redirections below need the directory to exist.
  if ! mkdir -p -- "$out_dir"; then
    log "control [$name]: cannot create $out_dir"
    return 1
  fi

  python3 -u -m experiments.blt_reasoner.train --config "$cfg" \
    > "$out_dir/train.log" 2>&1
  rc=$?
  log "control [$name]: train exit=$rc"
  if (( rc != 0 )); then
    log "control [$name]: train FAILED; skipping eval+push"
    # Propagate the real status; a bare `return` would report log's 0.
    return "$rc"
  fi

  log "control [$name]: running n=100 K=4 z-ablation on final ckpt"
  python3 -u -m experiments.blt_reasoner.eval \
    --ckpt "$out_dir/final" --config "$cfg" \
    --n 100 --K 4 --max_new_tokens 192 --temperature 0.0 \
    --out "$out_dir/final/ablation_K4_n100.json" \
    >> "$out_dir/eval.log" 2>&1
  eval_rc=$?
  log "control [$name]: eval exit=$eval_rc"

  log "control [$name]: pushing to HF under controls/$name/"
  # Values are handed over through the environment and the heredoc is quoted,
  # so no shell text is spliced into the Python source.
  BLT_CTRL_OUT_DIR="$out_dir" BLT_CTRL_NAME="$name" BLT_CTRL_REPO="$REPO" \
    python3 - <<'PY'
import os
from huggingface_hub import HfApi

token = os.environ.get("BLT_HF_TOKEN", "").strip()
assert token.startswith("hf_"), "BLT_HF_TOKEN missing"
name = os.environ["BLT_CTRL_NAME"]
api = HfApi(token=token)
api.upload_folder(
    folder_path=os.environ["BLT_CTRL_OUT_DIR"],
    path_in_repo=f"controls/{name}",
    repo_id=os.environ["BLT_CTRL_REPO"], repo_type="model",
    commit_message=f"Add control: {name} (final ckpt + n=100 ablation)",
)
print("[push] done")
PY
  push_rc=$?
  if (( push_rc != 0 )); then
    log "control [$name]: push FAILED (exit=$push_rc)"
    return "$push_rc"
  fi
  log "control [$name]: push done"
  return "$eval_rc"
}
# ---- 2) run no-infonce ---------------------------------------------------
# Config and output directory for the first control.
no_infonce_cfg="/home/ubuntu/experiments/blt_reasoner/configs/control_no_infonce.json"
no_infonce_out="/home/ubuntu/work/blt_control_no_infonce"
mkdir -p "$no_infonce_out"
run_control no_infonce "$no_infonce_cfg" "$no_infonce_out"

# ---- 3) run no-bottleneck ------------------------------------------------
# Config and output directory for the second control; started only after the
# first run_control call has returned.
no_bottleneck_cfg="/home/ubuntu/experiments/blt_reasoner/configs/control_no_bottleneck.json"
no_bottleneck_out="/home/ubuntu/work/blt_control_no_bottleneck"
mkdir -p "$no_bottleneck_out"
run_control no_bottleneck "$no_bottleneck_cfg" "$no_bottleneck_out"

log "queue_controls.sh DONE"