JulianHJR committed on
Commit
c04eaa2
·
verified ·
1 Parent(s): 9fa8ff4

Upload runall.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. runall.sh +213 -0
runall.sh ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # ============================================================
3
+ # Student Simulation v5 — 6-GPU pipeline (May 2026)
4
+ # ============================================================
5
+ #
6
+ # WHAT v5 DOES DIFFERENTLY (vs v4):
7
+ # - DROPPED stage 9 (global α sweep). v4 small-batch proved it unreliable
8
+ # for 30B MoE multi-layer interaction.
9
+ # - NEW stage 5b: probe-based layer ranking. Pick top-K layers by
10
+ # linear-probe accuracy on the existing residuals. Replaces v4's
11
+ # "take back half" heuristic.
12
+ # - HARDENED stage 14:
13
+ # * residual_after_general < 0.3 → AUTO-SKIP (noise vector)
14
+ # * n_repeats=3 (averaged) → kill single-run noise
15
+ # * min_reduction_threshold=1.0 → no "noise victories"
16
+ # * active_threshold=1, side_effect_rate=0.25 (relaxed from v4)
17
+ # * SHARDED across 6 GPUs (each card calibrates ~3 of 16 layers)
18
+ # - NEW stage 16: cumulative top-k multi-layer sweep.
19
+ # k = 1, 3, 5, 8, 12, 16 layers steered together.
20
+ # Reports collapse rate at each k → finds the multi-layer cliff.
21
+ #
22
+ # 6-GPU LAYOUT
23
+ # Single-GPU phase (GPU 0):
24
+ # stages 1-8 data prep, expert select, residuals, directions
25
+ # stage 5b probe ranking (uses residuals only, fast)
26
+ # Parallel phase (GPU 0-5, 6 cards in parallel):
27
+ # stage 14 per-layer calibration, sharded 0/6 ... 5/6
28
+ # merged into one calibration file
29
+ # Single-GPU again (GPU 0):
30
+ # stage 16 cumulative top-k multi-layer sweep
31
+ # stage 15 calibrated inference (baseline vs intervened)
32
+ # stage 13 final analysis + report
33
+ # ============================================================
34
+ #
35
+ # QUICK START
36
+ # bash runall.sh # full pipeline
37
+ # STAGES=5b,14,16,15,13 bash runall.sh # skip data prep
38
+ # STAGES=14,16,15,13 bash runall.sh # skip probe (already ran)
39
+ #
40
+ # ENV VARS
41
+ # STAGES comma-list of stages to run
42
+ # N_TRAIN # CoTs to generate (default 150)
43
+ # N_CALIB # problems for stage 14 (default 10)
44
+ # N_K_TEST # problems for stage 16 (default 10)
45
+ # N_REPEATS # stage 14 repeats (default 3)
46
+ # PROBE_TOP_K # # layers from stage 5b (default 16)
47
+ # ============================================================
48
+
49
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Always operate from the directory that contains this script.
PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)"
cd "$PROJECT_ROOT"

# DATA_ROOT may be supplied by the caller; otherwise it lives under the project.
: "${DATA_ROOT:=$PROJECT_ROOT/data}"
export DATA_ROOT
export PYTHONPATH="$PROJECT_ROOT:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false

# CPU thread caps (each defaults to 8, caller-supplied values win).
# Author's rationale: without a cap every one of the 6 parallel shards starts
# roughly one BLAS thread per core, so the shards oversubscribe the CPU and
# generation appears to hang. 8 threads x 6 processes = 48 threads.
: "${OMP_NUM_THREADS:=8}" "${MKL_NUM_THREADS:=8}" "${NUMEXPR_NUM_THREADS:=8}"
export OMP_NUM_THREADS MKL_NUM_THREADS NUMEXPR_NUM_THREADS

# Problem counts and repeat counts; every one can be overridden from the env.
: "${N_TRAIN:=150}"
: "${N_MATH_TEST:=50}"
: "${N_AIME:=30}"
: "${N_GPQA:=20}"
: "${N_CALIB:=10}"
: "${N_K_TEST:=10}"
: "${N_REPEATS:=3}"

mkdir -p "$DATA_ROOT/logs" "$DATA_ROOT/results"
RUNALL_LOG="$DATA_ROOT/logs/runall.log"

# Start-of-run banner, shown on the terminal and appended to the run log.
{
    echo "========================================================="
    echo "Student Simulation v5 (6-GPU) - $(date)"
    echo "PROJECT_ROOT: $PROJECT_ROOT"
    echo "N_CALIB: $N_CALIB N_REPEATS: $N_REPEATS"
    echo "N_K_TEST: $N_K_TEST"
    echo "========================================================="
} | tee -a "$RUNALL_LOG"

# Run the project's path-configuration module and record its output in the log.
python -m configs.paths 2>&1 | tee -a "$RUNALL_LOG"

# Default stage list, in execution order; override with STAGES=... in the env.
: "${STAGES:=1,2,3,4,5,6,7,8,5b,14,16,15,13}"
89
+
90
+ run_stage() {
91
+ local stage_num="$1"
92
+ local stage_name="$2"
93
+ shift 2
94
+ if [[ ",$STAGES," != *",$stage_num,"* ]]; then
95
+ echo "[skip] Stage $stage_num: $stage_name" | tee -a "$RUNALL_LOG"
96
+ return 0
97
+ fi
98
+ echo "" | tee -a "$RUNALL_LOG"
99
+ echo "==================== Stage $stage_num: $stage_name ====================" | tee -a "$RUNALL_LOG"
100
+ local t_start; t_start=$(date +%s)
101
+ "$@" 2>&1 | tee -a "$RUNALL_LOG"
102
+ local t_end; t_end=$(date +%s)
103
+ echo "Stage $stage_num took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
104
+ }
105
+
106
# ------------------------------------------------------------
# Single-GPU phase: every stage below sees only GPU 0.
# ------------------------------------------------------------
export CUDA_VISIBLE_DEVICES=0

# Stage 1 is bypassed completely (no "[skip]" line either) whenever
# SKIP_DOWNLOAD is set to a non-empty value.
if [[ -z "${SKIP_DOWNLOAD:-}" ]]; then
    run_stage 1 "Download model" python scripts/01_download_model.py
fi

# Dataset sizes for CoT generation come from the N_* settings.
cot_args=(
    --n_train "$N_TRAIN"
    --n_math_test "$N_MATH_TEST"
    --n_aime "$N_AIME"
    --n_gpqa "$N_GPQA"
    --resume
)
run_stage 2 "Generate CoTs" python scripts/02_generate_cots.py "${cot_args[@]}"

run_stage 3 "Label CoTs" python scripts/03_label_cots.py --resume

run_stage 4 "Capture routing" python scripts/04_capture_routing.py --resume

run_stage 5 "Select top experts" python scripts/05_select_top_experts.py --resume

run_stage 6 "Interaction analysis" python scripts/06_interaction_analysis.py

run_stage 7 "Capture residuals" python scripts/07_capture_residuals.py --resume

run_stage 8 "Compute v4_clean directions" python scripts/08_compute_directions.py --resume

# Stage 5b is deliberately ordered after stages 7 and 8 in the default list.
run_stage 5b "Probe-based layer ranking" python scripts/05b_probe_ranking.py --dim monitoring
139
+
140
# ============================================================
# 6-GPU PARALLEL PHASE: stage 14 sharded
# ============================================================
# Launches six background calibration processes (shard N pinned to GPU N),
# waits for ALL of them, and merges the per-shard JSON files only if every
# shard exited successfully. Each shard logs to
# $DATA_ROOT/logs/14_mon_shard<N>.log.
if [[ ",$STAGES," == *",14,"* ]]; then
    echo "" | tee -a "$RUNALL_LOG"
    echo "==================== 6-GPU Stage 14 (sharded) ====================" | tee -a "$RUNALL_LOG"
    t_start=$(date +%s)

    PIDS=()
    SHARD_FILES=()
    for shard_id in 0 1 2 3 4 5; do
        out_path="$DATA_ROOT/results/per_layer_calibration_monitoring_shard${shard_id}.json"
        SHARD_FILES+=("$out_path")
        (
            # Export inside the subshell so the GPU binding is scoped to this
            # shard only and is already in place when python starts.
            export CUDA_VISIBLE_DEVICES="$shard_id"
            echo "[shard $shard_id] CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
                > "$DATA_ROOT/logs/14_mon_shard${shard_id}.log"
            python scripts/14_calibrate_per_layer.py \
                --dim monitoring \
                --n_test "$N_CALIB" \
                --n_repeats "$N_REPEATS" \
                --layer_shard "${shard_id}/6" \
                --shard_id "shard${shard_id}" \
                >> "$DATA_ROOT/logs/14_mon_shard${shard_id}.log" 2>&1
        ) &
        PIDS+=("$!")
        echo "Spawned stage 14 shard $shard_id on GPU $shard_id (PID $!)" | tee -a "$RUNALL_LOG"
    done

    # `wait pid1 pid2 ...` reports only the exit status of the LAST pid, so a
    # crash in an earlier shard would go unnoticed and the merge would run on
    # missing or stale shard files. Wait on each PID individually instead.
    # PIDS was filled in shard order, so the array index is the shard id.
    failed_shards=()
    for idx in "${!PIDS[@]}"; do
        if ! wait "${PIDS[$idx]}"; then
            failed_shards+=("$idx")
            echo "Stage 14 shard $idx FAILED (see $DATA_ROOT/logs/14_mon_shard${idx}.log)" | tee -a "$RUNALL_LOG"
        fi
    done
    if (( ${#failed_shards[@]} > 0 )); then
        echo "Stage 14 aborted: ${#failed_shards[@]} of ${#PIDS[@]} shards failed (${failed_shards[*]}); skipping merge" | tee -a "$RUNALL_LOG"
        exit 1
    fi
    echo "All 6 stage-14 shards finished" | tee -a "$RUNALL_LOG"

    # Merge the six per-shard files into one calibration file.
    python scripts/14_merge_shards.py \
        --dim monitoring \
        --shards "${SHARD_FILES[@]}" \
        2>&1 | tee -a "$RUNALL_LOG"

    t_end=$(date +%s)
    echo "Stage 14 (parallel + merge) took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
fi
185
+
186
# ============================================================
# Single-GPU final stages (back on GPU 0 only)
# ============================================================
export CUDA_VISIBLE_DEVICES=0

run_stage 16 "Cumulative top-k multi-layer sweep" \
    python scripts/16_cumulative_topk.py --dim monitoring --n_test "$N_K_TEST"

# Where stage 15 is told to save its result.
infer_out="$DATA_ROOT/results/infer_calibrated_monitoring_v5.json"
run_stage 15 "Calibrated inference (monitoring)" \
    python scripts/15_infer_calibrated.py --dim monitoring --auto_problems --save_to "$infer_out"

run_stage 13 "Final analysis + report" python scripts/13_analyze_and_report.py

# Closing banner with the files worth opening first; goes to both the
# terminal and the run log.
{
    echo ""
    echo "========================================================="
    echo "v5 pipeline complete - $(date)"
    echo "========================================================="
    echo "KEY FILES TO READ FIRST:"
    echo " $DATA_ROOT/checkpoints/probe_layer_ranking_monitoring.json"
    echo " $DATA_ROOT/results/per_layer_calibration_monitoring.json <- safe_layers"
    echo " $DATA_ROOT/results/stage16_cumulative_topk_summary.json <- collapse cliff"
    echo " $DATA_ROOT/results/infer_calibrated_monitoring_v5.json <- final output"
    echo " $DATA_ROOT/results/final_report.md"
    echo "========================================================="
} | tee -a "$RUNALL_LOG"