File size: 3,268 Bytes
bc7101b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env bash
# Chain script: wait for pilot final ablation, then train + eval + push two
# control ablations sequentially.
#
# Run as:
#   BLT_HF_TOKEN=<hf_token> nohup bash queue_controls.sh > /home/ubuntu/work/queue_controls.log 2>&1 &
#
# Each control is 3000 K=4 steps (~1.6h on GH200) followed by an n=100 z-ablation.
# Both then get pushed under `controls/<name>/` in the HF repo.

# Treat unset variables as errors and make a pipeline fail when any stage
# fails. errexit is intentionally not enabled: the steps below capture and
# report exit codes themselves.
set -o nounset -o pipefail

# Hugging Face repo id that receives the control uploads.
REPO='LauraGG/blt-reasoner-pilot1'
# File whose appearance signals that the pilot's final ablation has finished.
PILOT_FINAL_ABL='/home/ubuntu/work/blt_pilot1/final/ablation_n200.json'
# File that log() appends every message to.
LOG='/home/ubuntu/work/queue_controls.log'

# Emit one timestamped message ("[HH:MM:SS] <args...>") to stdout and to $LOG.
#
# Globals:   LOG (read) - path of the log file to append to
# Arguments: the message words; joined with spaces
# Outputs:   the formatted line on stdout, plus a copy appended to $LOG
#
# When stdout is already the $LOG file (the documented invocation redirects
# stdout there with `>`), the line is written once through stdout only.
# Teeing in that case writes every message twice through two different file
# descriptions with different offsets, which duplicates and overwrites lines.
log() {
    local line
    line="[$(date +%T)] $*"
    if [[ /dev/stdout -ef "$LOG" ]]; then
        printf '%s\n' "$line"
    else
        printf '%s\n' "$line" | tee -a "$LOG"
    fi
}

# Work from /home/ubuntu: the training and eval steps are started with
# `python3 -m experiments.blt_reasoner.*`, and the configs they use live under
# /home/ubuntu/experiments. Abort if the directory is unavailable rather than
# launching those modules from the wrong place.
cd /home/ubuntu || { log "FATAL: cannot cd to /home/ubuntu; aborting"; exit 1; }

# Environment inherited by every Python child process started below.
export TOKENIZERS_PARALLELISM=false
export TRANSFORMERS_NO_ADVISORY_WARNINGS=1
export HF_HUB_DISABLE_PROGRESS_BARS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# ---- 1) wait until pilot final ablation is on disk ------------------------
# Poll once a minute for the pilot's result file; give up after six hours.
log "queue_controls.sh starting; waiting for $PILOT_FINAL_ABL"
DEADLINE=$(( $(date +%s) + 6*3600 ))   # epoch seconds after which we abort
until [[ -f "$PILOT_FINAL_ABL" ]]; do
    if (( $(date +%s) > DEADLINE )); then
        log "FATAL: deadline exceeded waiting for pilot final ablation; aborting"
        exit 1
    fi
    sleep 60
done
log "pilot final ablation present; proceeding"
# Short pause before starting the first control so the GPU can drain.
sleep 30

# Train one control ablation, evaluate its final checkpoint, then upload the
# whole output directory to the Hugging Face repo under controls/<name>/.
#
# Globals:
#   REPO          (read) repo id handed to upload_folder
#   BLT_HF_TOKEN  (read by the embedded Python) must start with "hf_"
# Arguments:
#   $1 - control name; also the folder name under controls/ in the repo
#   $2 - path of the JSON config passed to train and eval via --config
#   $3 - output directory; receives train.log / eval.log and is uploaded
# Returns:
#   0 when train, eval and push all succeed. Otherwise the exit status of the
#   failing step: train (eval and push are skipped), else push, else eval.
run_control () {
    local name="$1"
    local cfg="$2"
    local out_dir="$3"
    local rc eval_rc

    # The log redirections below need the directory to exist.
    mkdir -p -- "$out_dir"

    log "=========================================="
    log "CONTROL [$name]: train  cfg=$cfg  out=$out_dir"
    log "=========================================="
    python3 -u -m experiments.blt_reasoner.train --config "$cfg" \
        > "$out_dir/train.log" 2>&1
    rc=$?
    log "control [$name]: train exit=$rc"
    if (( rc != 0 )); then
        log "control [$name]: train FAILED; skipping eval+push"
        return "$rc"
    fi

    log "control [$name]: running n=100 K=4 z-ablation on final ckpt"
    python3 -u -m experiments.blt_reasoner.eval \
        --ckpt "$out_dir/final" --config "$cfg" \
        --n 100 --K 4 --max_new_tokens 192 --temperature 0.0 \
        --out "$out_dir/final/ablation_K4_n100.json" \
        >> "$out_dir/eval.log" 2>&1
    eval_rc=$?
    log "control [$name]: eval exit=$eval_rc"
    if (( eval_rc != 0 )); then
        # Best effort: the trained checkpoint is still uploaded, but make it
        # obvious in the log that the ablation result may be missing.
        log "control [$name]: eval FAILED; pushing without a complete ablation"
    fi

    log "control [$name]: pushing to HF under controls/$name/"
    # Quoted heredoc: the Python source is passed verbatim and the shell
    # values arrive as sys.argv, so paths or names containing quotes or
    # backslashes cannot corrupt the program text.
    python3 - "$out_dir" "$name" "$REPO" <<'PY'
import os
import sys

from huggingface_hub import HfApi

out_dir, name, repo = sys.argv[1:4]
token = os.environ.get("BLT_HF_TOKEN", "").strip()
if not token.startswith("hf_"):
    sys.exit("BLT_HF_TOKEN missing")
api = HfApi(token=token)
api.upload_folder(
    folder_path=out_dir,
    path_in_repo=f"controls/{name}",
    repo_id=repo, repo_type="model",
    commit_message=f"Add control: {name} (final ckpt + n=100 ablation)",
)
print("[push] done")
PY
    rc=$?
    if (( rc != 0 )); then
        log "control [$name]: push FAILED (exit=$rc)"
        return "$rc"
    fi
    log "control [$name]: push done"
    return "$eval_rc"
}

# ---- 2) + 3) run the controls in order: no-infonce, then no-bottleneck ----
# Each control gets its own work directory (created if absent) and the
# matching config file; the second starts only after the first has returned.
for control in no_infonce no_bottleneck; do
    control_dir="/home/ubuntu/work/blt_control_${control}"
    mkdir -p "$control_dir"
    run_control "$control" \
        "/home/ubuntu/experiments/blt_reasoner/configs/control_${control}.json" \
        "$control_dir"
done

log "queue_controls.sh DONE"