#!/usr/bin/env bash
# Chain script: wait for pilot final ablation, then train + eval + push two
# control ablations sequentially.
#
# Run as:
# BLT_HF_TOKEN=<hf_token> nohup bash queue_controls.sh > /home/ubuntu/work/queue_controls.log 2>&1 &
#
# Each control is 3000 K=4 steps (~1.6h on GH200) followed by an n=100 z-ablation.
# Both then get pushed under `controls/<name>/` in the HF repo.
# Shell options: treat unset variables as errors and let a pipeline report the
# failure of any of its stages. `-e` is intentionally not enabled; the steps
# below inspect `$?` themselves and decide how to continue.
set -u
set -o pipefail

# Hugging Face model repo that receives the control runs.
REPO='LauraGG/blt-reasoner-pilot1'
# File whose appearance signals that the pilot's final ablation has finished.
PILOT_FINAL_ABL='/home/ubuntu/work/blt_pilot1/final/ablation_n200.json'
# Log file written by this script.
LOG='/home/ubuntu/work/queue_controls.log'
#######################################
# Emit one timestamped log line.
# Globals:
#   LOG (read) - path of the log file
# Arguments:
#   $* - message words, joined with spaces
# Outputs:
#   The line "[HH:MM:SS] message" on stdout, and in $LOG exactly once.
#######################################
log() {
  local line
  line="[$(date +%T)] $*"
  # The documented way to launch this script redirects stdout into $LOG.
  # Tee-appending to the same file in that situation records every line twice
  # (through two different file offsets), so only tee when stdout is going
  # somewhere other than $LOG.
  if [[ /dev/fd/1 -ef "$LOG" ]]; then
    printf '%s\n' "$line"
  else
    printf '%s\n' "$line" | tee -a "$LOG"
  fi
}
# Work from /home/ubuntu. The later `python3 -m experiments...` invocations
# presumably rely on this being the working directory, and the script does not
# run with `-e`, so a failed cd has to be caught explicitly instead of letting
# the run continue from the wrong place.
cd /home/ubuntu || {
  log "FATAL: cannot cd to /home/ubuntu; aborting"
  exit 1
}

# Environment for the unattended Python runs: tokenizer parallelism off,
# advisory warnings and Hub progress bars suppressed.
export TOKENIZERS_PARALLELISM=false TRANSFORMERS_NO_ADVISORY_WARNINGS=1 HF_HUB_DISABLE_PROGRESS_BARS=1
# CUDA caching-allocator configuration passed through to PyTorch.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# ---- 1) wait until pilot final ablation is on disk ------------------------
# Poll once a minute for the pilot's final ablation file. If it has not shown
# up six hours after this point, log a fatal message and exit with status 1.
log "queue_controls.sh starting; waiting for $PILOT_FINAL_ABL"
DEADLINE=$(( $(date +%s) + 6*3600 )) # hard 6h wait limit
until [[ -f "$PILOT_FINAL_ABL" ]]; do
  if (( $(date +%s) > DEADLINE )); then
    log "FATAL: deadline exceeded waiting for pilot final ablation; aborting"
    exit 1
  fi
  sleep 60
done
log "pilot final ablation present; proceeding"
# Short pause before starting the next job (let GPU drain).
sleep 30
#######################################
# Train one control ablation, run the z-ablation eval on its final checkpoint,
# and upload the whole output directory to the HF repo under controls/<name>/.
# Globals:
#   REPO (read)         - HF model repo that receives the upload
#   BLT_HF_TOKEN (read) - HF token; read from the environment by the push step
# Arguments:
#   $1 - control name; also used as the folder name under controls/
#   $2 - path to the config file given to both train and eval
#   $3 - existing output directory; train.log and eval.log are written here
#        and eval reads the checkpoint from $3/final
# Returns:
#   0 when train, eval and push all succeed. A train failure skips eval and
#   push and returns the train exit status. An eval failure is logged, the
#   push is still attempted, and the eval status is returned. A push failure
#   returns the push exit status.
#######################################
run_control() {
  local name="$1"
  local cfg="$2"
  local out_dir="$3"
  local rc=0
  local result=0

  log "=========================================="
  log "CONTROL [$name]: train cfg=$cfg out=$out_dir"
  log "=========================================="
  python3 -u -m experiments.blt_reasoner.train --config "$cfg" \
    > "$out_dir/train.log" 2>&1
  rc=$?
  log "control [$name]: train exit=$rc"
  if (( rc != 0 )); then
    log "control [$name]: train FAILED; skipping eval+push"
    return "$rc"
  fi

  log "control [$name]: running n=100 K=4 z-ablation on final ckpt"
  python3 -u -m experiments.blt_reasoner.eval \
    --ckpt "$out_dir/final" --config "$cfg" \
    --n 100 --K 4 --max_new_tokens 192 --temperature 0.0 \
    --out "$out_dir/final/ablation_K4_n100.json" \
    >> "$out_dir/eval.log" 2>&1
  rc=$?
  log "control [$name]: eval exit=$rc"
  if (( rc != 0 )); then
    # Keep the original best-effort behaviour: still push what exists, but
    # make the failure visible in the log and in the return status.
    log "control [$name]: eval FAILED; pushing anyway"
    result=$rc
  fi

  log "control [$name]: pushing to HF under controls/$name/"
  # Values are handed to Python through the environment and the heredoc is
  # quoted, so nothing from the shell is spliced into the Python source.
  CTRL_NAME="$name" CTRL_OUT_DIR="$out_dir" CTRL_REPO="$REPO" python3 - <<'PY'
import os
from huggingface_hub import HfApi

token = os.environ.get("BLT_HF_TOKEN", "").strip()
if not token.startswith("hf_"):
    raise SystemExit("BLT_HF_TOKEN missing")
name = os.environ["CTRL_NAME"]
api = HfApi(token=token)
api.upload_folder(
    folder_path=os.environ["CTRL_OUT_DIR"],
    path_in_repo=f"controls/{name}",
    repo_id=os.environ["CTRL_REPO"],
    repo_type="model",
    commit_message=f"Add control: {name} (final ckpt + n=100 ablation)",
)
print("[push] done")
PY
  rc=$?
  if (( rc != 0 )); then
    # Do not claim success when the upload process itself failed.
    log "control [$name]: push FAILED (exit=$rc)"
    return "$rc"
  fi
  log "control [$name]: push done"
  return "$result"
}
# ---- 2) + 3) run the controls: no-infonce, then no-bottleneck ------------
# Each control gets its own output directory under /home/ubuntu/work and its
# own config file; they run strictly one after the other.
for ctrl in no_infonce no_bottleneck; do
  mkdir -p "/home/ubuntu/work/blt_control_${ctrl}"
  run_control "$ctrl" \
    "/home/ubuntu/experiments/blt_reasoner/configs/control_${ctrl}.json" \
    "/home/ubuntu/work/blt_control_${ctrl}"
done
log "queue_controls.sh DONE"
|