#!/usr/bin/env bash
#
# queue_opt13_exp.sh - queued runner for the "Options 1+3" experiment
# (full-y InfoNCE + MLP projector).
#
# Stages, in order:
#   1. wait (up to 4 hours) for the previous block_z experiment to write its
#      teacher-forced ablation file, then pause 30 seconds;
#   2. run a short smoke test and abort if it fails;
#   3. run the full training;
#   4. run the TF and AR ablations on the final checkpoint;
#   5. upload the output directory to the Hugging Face repo.
#
# Required environment: BLT_HF_TOKEN (read by the upload stage).

# -u: expanding an unset variable is an error.
# pipefail: a pipeline fails if any of its stages fails.
# -e is intentionally not enabled: later stages read $? after commands that
# are allowed to fail, which would be unreachable under errexit.
set -uo pipefail

# Hugging Face repo that receives the results.
REPO="LauraGG/blt-reasoner-pilot1"
# Output directory of this experiment (stage logs and the final checkpoint).
OUT="/home/ubuntu/work/blt_exp7b_opt13"
# Config passed to the full training run and to both ablations.
CFG="/home/ubuntu/experiments/blt_reasoner/configs/exp7b_opt13.json"
# File whose appearance signals that the previous experiment has finished.
WAIT_FOR="/home/ubuntu/work/blt_exp7b_blockz/final/ablation_teacher_forced.json"
# Log file that log() appends to.
LOG="/home/ubuntu/work/queue_opt13_exp.log"
readonly REPO OUT CFG WAIT_FOR LOG
#######################################
# Print a timestamped message and append the same line to the log file.
# Globals:   LOG (read) - path of the file the line is appended to
# Arguments: message words, joined via "$*"
# Outputs:   "[HH:MM:SS] message" on stdout and at the end of $LOG
#######################################
log() {
  # printf with a fixed format keeps the message text literal.
  printf '[%s] %s\n' "$(date +%T)" "$*" | tee -a "$LOG"
}
# Prepare the working environment for the python stages.
# Every stage redirects its output into $OUT, so failing to create it is fatal.
mkdir -p "$OUT" || { printf 'cannot create %s\n' "$OUT" >&2; exit 1; }
# The stages run `python3 -m experiments.blt_reasoner...`, which resolves the
# package relative to the working directory; stop here if the cd fails rather
# than launching every stage from the wrong place.
cd /home/ubuntu || { printf 'cannot cd to /home/ubuntu\n' >&2; exit 1; }
# Library settings inherited by the python stages (tokenizers, transformers,
# huggingface_hub progress bars, PyTorch CUDA allocator).
export TOKENIZERS_PARALLELISM=false
export TRANSFORMERS_NO_ADVISORY_WARNINGS=1
export HF_HUB_DISABLE_PROGRESS_BARS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Hold the queue until the previous (block_z) experiment has produced its
# ablation file. The file is polled once a minute; after 4 hours the wait is
# abandoned and the script carries on regardless.
log "queue_opt13_exp.sh starting; waiting for block_z experiment to finish at $WAIT_FOR"
DEADLINE=$(( $(date +%s) + 4 * 60 * 60 ))
until [[ -f "$WAIT_FOR" ]]; do
  if (( $(date +%s) > DEADLINE )); then
    log "deadline reached waiting for block_z exp; proceeding anyway"
    break
  fi
  sleep 60
done
# Fixed pause before any GPU work starts (the log message calls it GPU drain).
log "proceeding; waiting 30s for GPU drain"
sleep 30
# Announce the experiment, then run a short smoke test with the dedicated
# smoke config before committing to the full run.
for banner_line in \
  "===========================================" \
  "OPTIONS 1+3 EXPERIMENT: full-y InfoNCE + MLP projector" \
  "==========================================="; do
  log "$banner_line"
done

log "running 5-step smoke..."
# stdout and stderr of the smoke run are captured in $OUT/smoke.log; the exit
# status is kept in SMOKE_RC so it can be logged before it is acted on.
SMOKE_RC=0
python3 -u -m experiments.blt_reasoner.train \
  --config /home/ubuntu/experiments/blt_reasoner/configs/exp7b_opt13_smoke.json \
  > "$OUT/smoke.log" 2>&1 || SMOKE_RC=$?
log "smoke exit=$SMOKE_RC"
if (( SMOKE_RC != 0 )); then
  # Surface the tail of the smoke log in the queue log, then give up.
  log "SMOKE FAILED — see $OUT/smoke.log; aborting full run"
  tail -20 "$OUT/smoke.log" | tee -a "$LOG"
  exit 1
fi
# Full training run with the experiment config; stdout and stderr are captured
# in $OUT/train.log. The exit status is only logged: the script continues with
# the following stages whatever the outcome.
log "smoke OK; launching full training (3500 steps)"
train_rc=0
python3 -u -m experiments.blt_reasoner.train --config "$CFG" \
  > "$OUT/train.log" 2>&1 || train_rc=$?
log "full train exit=$train_rc"
# Evaluate the final checkpoint with two ablations, both at n=200 and K=16:
#   - TF: the teacher-forced ablation script;
#   - AR: the eval module with greedy settings (temperature 0.0, up to 192 new
#     tokens) - presumably autoregressive generation; confirm against eval.py.
# Each run's output goes to its own log in $OUT and its exit status is logged.
# Both runs read the checkpoint from, and write their JSON result into,
# $OUT/final. If that directory is missing (e.g. training failed), the runs
# cannot succeed, so they are skipped with a log line instead.
final_ckpt="$OUT/final"
if [[ -d "$final_ckpt" ]]; then
  log "running TF n=200 K=16 ablation on final"
  python3 -u -m experiments.blt_reasoner.scripts.ablate_teacher_forced \
    --ckpt "$final_ckpt" --config "$CFG" --n 200 --K 16 \
    --out "$final_ckpt/ablation_teacher_forced.json" \
    > "$OUT/tf_eval.log" 2>&1
  log "TF ablate exit=$?"

  log "running AR n=200 K=16 ablation on final"
  python3 -u -m experiments.blt_reasoner.eval \
    --ckpt "$final_ckpt" --config "$CFG" --n 200 --K 16 \
    --max_new_tokens 192 --temperature 0.0 \
    --out "$final_ckpt/ablation_n200_K16.json" \
    > "$OUT/ar_eval.log" 2>&1
  log "AR ablate exit=$?"
else
  log "no final checkpoint at $final_ckpt; skipping TF and AR ablations"
fi
# Upload the whole output directory to the Hugging Face model repo under
# exp7b_opt13/. Requires BLT_HF_TOKEN in the environment.
# The folder and repo are handed to Python through environment variables and
# the heredoc delimiter is quoted, so the shell never splices values into the
# Python source. The upload's exit status is logged like every other stage;
# the script still finishes with the DONE line either way.
log "pushing exp7b_opt13/ to HF"
PUSH_DIR="$OUT" PUSH_REPO="$REPO" python3 - <<'PYEND'
import os
import sys

from huggingface_hub import HfApi

token = os.environ.get("BLT_HF_TOKEN", "").strip()
# Explicit check instead of assert, which is removed when Python runs with -O.
if not token.startswith("hf_"):
    sys.exit("BLT_HF_TOKEN missing")
api = HfApi(token=token)
api.upload_folder(
    folder_path=os.environ["PUSH_DIR"],
    path_in_repo="exp7b_opt13",
    repo_id=os.environ["PUSH_REPO"],
    repo_type="model",
    commit_message="EXP: Options 1+3 (full-y InfoNCE + MLP projector)",
)
print("[push] done")
PYEND
log "push exit=$?"
log "queue_opt13_exp.sh DONE"
|