JulianHJR committed on
Commit
9fa8ff4
·
verified ·
1 Parent(s): 398396b

Upload runallsingle.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. runallsingle.sh +195 -0
runallsingle.sh ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # ============================================================
3
+ # Student Simulation v5 — SINGLE-GPU pipeline (May 2026)
4
+ # ============================================================
5
+ #
6
+ # WHEN TO USE THIS instead of runall.sh
7
+ # - You only have one GPU available, OR
8
+ # - You want to debug stage 14 without the parallel-shard machinery, OR
9
+ # - The 6-GPU version (runall.sh) is hanging and you need a known-good run.
10
+ #
11
+ # DEFAULT GPU
12
+ # GPU index 7 (CUDA_VISIBLE_DEVICES=7). Override with `CUDA_VISIBLE_DEVICES=N bash runallsingle.sh`.
13
+ # Inside slurm with --gres=gpu:1 the cgroup will renumber the visible
14
+ # device to 0; we detect that and DO NOT override CUDA_VISIBLE_DEVICES if
15
+ # slurm has already set it. That way the same script works in both
16
+ # contexts.
17
+ #
18
+ # WHAT'S DIFFERENT FROM runall.sh
19
+ # - Stage 14 runs ALL 16 probe-ranked layers in a SINGLE process — no
20
+ # sharding, no shard files, no merge step. Output goes straight to the
21
+ # canonical per_layer_calibration_monitoring.json path.
22
+ # - Baselines for stage 14 are computed ONCE (hoisted in the patched
23
+ # 14_calibrate_per_layer.py), so no time wasted re-running the same
24
+ # baselines per layer.
25
+ # - CPU thread caps are still applied so a single big-MoE process doesn't
26
+ # accidentally oversubscribe a 200-thread node.
27
+ #
28
+ # ROUGH RUNTIME (n_test=10, n_repeats=3, 16 layers, max_new_tokens=2048):
29
+ # baselines: 30 gens (~15-30 min)
30
+ # per-layer: 16 layers × 4 alphas × 3 repeats × 10 problems = 1920 gens
31
+ # total: ~16-25 hours on one H20-3e (vs ~3-4h on 6 cards in parallel
32
+ # IF the 6-GPU run actually works).
33
+ #
34
+ # QUICK START
35
+ # bash runallsingle.sh # full single-GPU pipeline
36
+ # STAGES=5b,14,16,15,13 bash runallsingle.sh # skip data prep
37
+ # STAGES=14 N_CALIB=5 bash runallsingle.sh # quick stage-14 smoke test
38
+ # CUDA_VISIBLE_DEVICES=3 bash runallsingle.sh # use GPU 3 instead of 7
39
+ # ============================================================
40
+
41
# Strict mode: abort on command failure (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Resolve the absolute directory that contains this script and make it the
# working directory, so the relative `scripts/...` paths used below resolve
# no matter where the script was launched from.
#   - CDPATH is cleared for the lookup: with CDPATH set, `cd` on a relative
#     path may print the directory it resolved to, which would end up inside
#     the captured value.
#   - `--` keeps a path that begins with `-` from being parsed as an option.
PROJECT_ROOT="$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)"
cd -- "$PROJECT_ROOT"
47
+
48
+ # ============================================================
49
+ # GPU SELECTION
50
+ # ============================================================
51
+ # Priority:
52
+ # 1. CUDA_VISIBLE_DEVICES already set (e.g. by slurm or by user) → respect it.
53
+ # 2. SLURM_JOB_ID is set but CUDA_VISIBLE_DEVICES is not → unusual but trust slurm.
54
+ # 3. Otherwise → default to GPU 7.
55
# Decide which GPU the pipeline sees. A non-empty CUDA_VISIBLE_DEVICES
# always wins; otherwise a slurm job (SLURM_JOB_ID non-empty) is left
# untouched; only when neither is present do we pin the default device 7.
if [[ -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
  if [[ -z "${SLURM_JOB_ID:-}" ]]; then
    # Neither the caller nor slurm chose a device: fall back to GPU 7.
    export CUDA_VISIBLE_DEVICES=7
    echo "[gpu] Defaulting to CUDA_VISIBLE_DEVICES=7"
    echo "[gpu] Override with: CUDA_VISIBLE_DEVICES=N bash $(basename "$0")"
  else
    # Under slurm with no explicit device list: do not export anything.
    echo "[gpu] Inside slurm but CUDA_VISIBLE_DEVICES unset — letting slurm/cgroup decide"
  fi
else
  # Caller (or slurm) already selected the device(s); just report it.
  echo "[gpu] Using existing CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
fi
64
+
65
+ # ============================================================
66
+ # CPU THREAD CAPS
67
+ # ============================================================
68
+ # A single process loading a 30B MoE will spawn (cores) BLAS threads by default.
69
+ # On a 200-thread node that's fine alone, but pinned-memory copies and tokenizer
70
+ # work both benefit from a sane cap. These caps are also defensive in case this
71
+ # script is launched alongside another job on the same node.
72
# Give each thread-count variable a default of 16 when it is unset or empty,
# then export all three so child processes inherit the cap. Values already
# provided by the caller are kept unchanged.
: "${OMP_NUM_THREADS:=16}" "${MKL_NUM_THREADS:=16}" "${NUMEXPR_NUM_THREADS:=16}"
export OMP_NUM_THREADS MKL_NUM_THREADS NUMEXPR_NUM_THREADS

# Always forced to "false", regardless of what the caller had set.
export TOKENIZERS_PARALLELISM=false
76
+
77
+ # ============================================================
78
+ # PATHS
79
+ # ============================================================
80
# Data directory: caller-supplied DATA_ROOT, else <PROJECT_ROOT>/data.
: "${DATA_ROOT:=$PROJECT_ROOT/data}"
export DATA_ROOT

# Put the project root first on PYTHONPATH, ahead of any existing entries.
export PYTHONPATH="$PROJECT_ROOT:${PYTHONPATH:-}"

# Sample-size knobs; each can be overridden from the environment.
: "${N_TRAIN:=150}"
: "${N_MATH_TEST:=50}"
: "${N_AIME:=30}"
: "${N_GPQA:=20}"
: "${N_CALIB:=10}"
: "${N_K_TEST:=10}"
: "${N_REPEATS:=3}"

# Every banner and stage output below is appended to this log file, so its
# directory (and the results directory) must exist first.
RUNALL_LOG="$DATA_ROOT/logs/runallsingle.log"
mkdir -p "$DATA_ROOT/logs" "$DATA_ROOT/results"
93
+
94
# Startup banner: show the effective configuration on stdout and append the
# same lines to the run log.
{
  echo "========================================================="
  echo "Student Simulation v5 (SINGLE-GPU) - $(date)"
  echo "PROJECT_ROOT: $PROJECT_ROOT"
  echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-<slurm-managed>}"
  echo "OMP_NUM_THREADS: $OMP_NUM_THREADS"
  echo "N_CALIB: $N_CALIB N_REPEATS: $N_REPEATS"
  echo "N_K_TEST: $N_K_TEST"
  echo "========================================================="
} | tee -a "$RUNALL_LOG"

# Run the project's configs.paths module and capture its output (stdout and
# stderr) in the log. NOTE(review): presumably this prints or validates the
# configured paths — confirm against configs/paths.py.
python -m configs.paths 2>&1 | tee -a "$RUNALL_LOG"

# Comma-separated list of stage ids to execute; stages not listed here are
# skipped by run_stage. Override from the environment to run a subset.
STAGES="${STAGES:-1,2,3,4,5,6,7,8,5b,14,16,15,13}"
107
+
108
#######################################
# Run one pipeline stage if it is selected, logging its output and duration.
# Globals:
#   STAGES     (read) comma-separated list of stage ids to run
#   RUNALL_LOG (read) log file that all output is appended to
# Arguments:
#   $1   - stage id (e.g. 2, 5b, 14)
#   $2   - human-readable stage name, used only in log lines
#   $3.. - command (and its arguments) that implements the stage
# Outputs:
#   Stage banner, the command's combined stdout/stderr, and a timing line on
#   stdout, all appended to RUNALL_LOG as well. On failure a "FAILED" line
#   goes to stderr and to RUNALL_LOG.
# Returns:
#   0 when the stage is skipped or succeeds; otherwise the exit status of the
#   stage command (or of tee, if only the logging side failed).
#######################################
run_stage() {
  local stage_num="$1"
  local stage_name="$2"
  shift 2

  # Wrapping both sides in commas makes this an exact-id match, so stage "5"
  # does not match "5b" or "15".
  if [[ ",$STAGES," != *",$stage_num,"* ]]; then
    echo "[skip] Stage $stage_num: $stage_name" | tee -a "$RUNALL_LOG"
    return 0
  fi

  echo "" | tee -a "$RUNALL_LOG"
  echo "==================== Stage $stage_num: $stage_name ====================" | tee -a "$RUNALL_LOG"

  local t_start t_end
  local -a pipe_rc=()
  t_start=$(date +%s)
  # Run the pipeline as an `if` condition so errexit cannot abort here, and
  # snapshot PIPESTATUS in both branches: without pipefail the condition only
  # reflects tee's status, so the command's own status must be read directly.
  if "$@" 2>&1 | tee -a "$RUNALL_LOG"; then
    pipe_rc=("${PIPESTATUS[@]}")
  else
    pipe_rc=("${PIPESTATUS[@]}")
  fi
  t_end=$(date +%s)

  # Prefer the stage command's status; fall back to tee's so a logging
  # failure is still reported.
  local status="${pipe_rc[0]}"
  if (( status == 0 )); then
    status="${pipe_rc[1]}"
  fi

  if (( status != 0 )); then
    echo "Stage $stage_num FAILED (exit code $status) after $((t_end - t_start))s" \
      | tee -a "$RUNALL_LOG" >&2
    return "$status"
  fi

  echo "Stage $stage_num took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
}
123
+
124
+ # ============================================================
125
+ # Data-prep / direction-extraction stages (always single-GPU)
126
+ # ============================================================
127
# Stage 1 is additionally gated on SKIP_DOWNLOAD: any non-empty value
# suppresses the call entirely (not even a "[skip]" line is logged).
if [[ "${SKIP_DOWNLOAD:-}" == "" ]]; then
  run_stage 1 "Download model" python scripts/01_download_model.py
fi

# Stage 2 takes the dataset sizes configured above.
generate_cots_cmd=(
  python scripts/02_generate_cots.py
  --n_train "$N_TRAIN"
  --n_math_test "$N_MATH_TEST"
  --n_aime "$N_AIME"
  --n_gpqa "$N_GPQA"
  --resume
)
run_stage 2 "Generate CoTs" "${generate_cots_cmd[@]}"

# Stages 3-8 and 5b, in pipeline order. Stage 6 and stage 5b are the only
# ones here that are not passed --resume.
run_stage 3 "Label CoTs" python scripts/03_label_cots.py --resume
run_stage 4 "Capture routing" python scripts/04_capture_routing.py --resume
run_stage 5 "Select top experts" python scripts/05_select_top_experts.py --resume
run_stage 6 "Interaction analysis" python scripts/06_interaction_analysis.py
run_stage 7 "Capture residuals" python scripts/07_capture_residuals.py --resume
run_stage 8 "Compute v4_clean directions" python scripts/08_compute_directions.py --resume
run_stage 5b "Probe-based layer ranking" python scripts/05b_probe_ranking.py --dim monitoring
157
+
158
+ # ============================================================
159
+ # Stage 14 — UNSHARDED.
160
+ # All 16 layers in one process, baselines hoisted, output goes
161
+ # directly to per_layer_calibration_monitoring.json. No shard
162
+ # files, no merge step.
163
+ # ============================================================
164
# Stage 14 as a single invocation of the calibration script for the
# "monitoring" dimension, sized by N_CALIB problems and N_REPEATS repeats.
calibrate_cmd=(
  python scripts/14_calibrate_per_layer.py
  --dim monitoring
  --n_test "$N_CALIB"
  --n_repeats "$N_REPEATS"
)
run_stage 14 "Per-layer calibration (single-GPU, all layers)" "${calibrate_cmd[@]}"
169
+
170
+ # ============================================================
171
+ # Final stages (single-GPU in both versions)
172
+ # ============================================================
173
# Stage 16: cumulative top-k sweep over N_K_TEST problems.
topk_cmd=(
  python scripts/16_cumulative_topk.py
  --dim monitoring
  --n_test "$N_K_TEST"
)
run_stage 16 "Cumulative top-k multi-layer sweep" "${topk_cmd[@]}"

# Stage 15: calibrated inference; results are written under DATA_ROOT/results.
infer_cmd=(
  python scripts/15_infer_calibrated.py
  --dim monitoring
  --auto_problems
  --save_to "$DATA_ROOT/results/infer_calibrated_monitoring_v5.json"
)
run_stage 15 "Calibrated inference (monitoring)" "${infer_cmd[@]}"

# Stage 13 runs last even though its number is lower.
run_stage 13 "Final analysis + report" python scripts/13_analyze_and_report.py
184
+
185
# Completion banner: a blank line, the finish timestamp, and the list of
# output files to inspect, shown on stdout and appended to the run log.
tee -a "$RUNALL_LOG" <<EOF

=========================================================
v5 single-GPU pipeline complete - $(date)
=========================================================
KEY FILES TO READ FIRST:
 $DATA_ROOT/checkpoints/probe_layer_ranking_monitoring.json
 $DATA_ROOT/results/per_layer_calibration_monitoring.json <- safe_layers
 $DATA_ROOT/results/stage16_cumulative_topk_summary.json <- collapse cliff
 $DATA_ROOT/results/infer_calibrated_monitoring_v5.json <- final output
 $DATA_ROOT/results/final_report.md
=========================================================
EOF