Upload runall.sh with huggingface_hub
Browse files
runall.sh
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# ============================================================
|
| 3 |
+
# Student Simulation v5 — 6-GPU pipeline (May 2026)
|
| 4 |
+
# ============================================================
|
| 5 |
+
#
|
| 6 |
+
# WHAT v5 DOES DIFFERENTLY (vs v4):
|
| 7 |
+
# - DROPPED stage 9 (global α sweep). v4 small-batch proved it unreliable
|
| 8 |
+
# for 30B MoE multi-layer interaction.
|
| 9 |
+
# - NEW stage 5b: probe-based layer ranking. Pick top-K layers by
|
| 10 |
+
# linear-probe accuracy on the existing residuals. Replaces v4's
|
| 11 |
+
# "take back half" heuristic.
|
| 12 |
+
# - HARDENED stage 14:
|
| 13 |
+
# * residual_after_general < 0.3 → AUTO-SKIP (noise vector)
|
| 14 |
+
# * n_repeats=3 (averaged) → kill single-run noise
|
| 15 |
+
# * min_reduction_threshold=1.0 → no "noise victories"
|
| 16 |
+
# * active_threshold=1, side_effect_rate=0.25 (relaxed from v4)
|
| 17 |
+
# * SHARDED across 6 GPUs (each card calibrates ~3 of 16 layers)
|
| 18 |
+
# - NEW stage 16: cumulative top-k multi-layer sweep.
|
| 19 |
+
# k = 1, 3, 5, 8, 12, 16 layers steered together.
|
| 20 |
+
# Reports collapse rate at each k → finds the multi-layer cliff.
|
| 21 |
+
#
|
| 22 |
+
# 6-GPU LAYOUT
|
| 23 |
+
# Single-GPU phase (GPU 0):
|
| 24 |
+
# stages 1-8 data prep, expert select, residuals, directions
|
| 25 |
+
# stage 5b probe ranking (uses residuals only, fast)
|
| 26 |
+
# Parallel phase (GPU 0-5, 6 cards in parallel):
|
| 27 |
+
# stage 14 per-layer calibration, sharded 0/6 ... 5/6
|
| 28 |
+
# merged into one calibration file
|
| 29 |
+
# Single-GPU again (GPU 0):
|
| 30 |
+
# stage 16 cumulative top-k multi-layer sweep
|
| 31 |
+
# stage 15 calibrated inference (baseline vs intervened)
|
| 32 |
+
# stage 13 final analysis + report
|
| 33 |
+
# ============================================================
|
| 34 |
+
#
|
| 35 |
+
# QUICK START
|
| 36 |
+
# bash runall.sh # full pipeline
|
| 37 |
+
# STAGES=5b,14,16,15,13 bash runall.sh # skip data prep
|
| 38 |
+
# STAGES=14,16,15,13 bash runall.sh # skip probe (already ran)
|
| 39 |
+
#
|
| 40 |
+
# ENV VARS
|
| 41 |
+
# STAGES comma-list of stages to run
|
| 42 |
+
# N_TRAIN # CoTs to generate (default 150)
|
| 43 |
+
# N_CALIB # problems for stage 14 (default 10)
|
| 44 |
+
# N_K_TEST # problems for stage 16 (default 10)
|
| 45 |
+
# N_REPEATS # stage 14 repeats (default 3)
|
| 46 |
+
# PROBE_TOP_K # # layers from stage 5b (default 16)
|
| 47 |
+
# ============================================================
|
| 48 |
+
|
| 49 |
+
# Strict mode: stop on any failing command, on use of an unset variable, and
# on a failure in any stage of a pipeline (not just the last one).
set -euo pipefail

# Resolve the directory holding this script and run everything from there so
# the relative "scripts/..." paths used by the stages resolve.
script_dir="$(dirname "$0")"
PROJECT_ROOT="$(cd "$script_dir" && pwd)"
cd "$PROJECT_ROOT"

# Environment handed to every child process. DATA_ROOT keeps a caller-supplied
# value and otherwise defaults to <project>/data.
: "${DATA_ROOT:=$PROJECT_ROOT/data}"
PYTHONPATH="$PROJECT_ROOT:${PYTHONPATH:-}"
TOKENIZERS_PARALLELISM=false
export DATA_ROOT PYTHONPATH TOKENIZERS_PARALLELISM

# CPU thread caps (caller-supplied values win). Author's rationale: uncapped,
# each of the 6 parallel shards spawns ~64 BLAS threads (PyTorch defaults to
# nproc), so 6 x 64 = 384 threads compete for ~64 cores and generation appears
# to hang. 8 threads x 6 processes = 48 threads stays within capacity.
: "${OMP_NUM_THREADS:=8}" "${MKL_NUM_THREADS:=8}" "${NUMEXPR_NUM_THREADS:=8}"
export OMP_NUM_THREADS MKL_NUM_THREADS NUMEXPR_NUM_THREADS

# Problem counts and repeat count; each can be overridden from the environment.
: "${N_TRAIN:=150}" "${N_MATH_TEST:=50}" "${N_AIME:=30}" "${N_GPQA:=20}"
: "${N_CALIB:=10}" "${N_K_TEST:=10}" "${N_REPEATS:=3}"

# Output directories and the cumulative (append-only) pipeline log.
mkdir -p "$DATA_ROOT/logs" "$DATA_ROOT/results"
RUNALL_LOG="$DATA_ROOT/logs/runall.log"

# Start-of-run banner, mirrored to the terminal and the log.
{
  echo "========================================================="
  echo "Student Simulation v5 (6-GPU) - $(date)"
  echo "PROJECT_ROOT: $PROJECT_ROOT"
  echo "N_CALIB: $N_CALIB N_REPEATS: $N_REPEATS"
  echo "N_K_TEST: $N_K_TEST"
  echo "========================================================="
} | tee -a "$RUNALL_LOG"

# Run the project's configs.paths module, capturing its output in the log.
python -m configs.paths 2>&1 | tee -a "$RUNALL_LOG"

# Comma-separated list of stages to execute; defaults to the full pipeline.
: "${STAGES:=1,2,3,4,5,6,7,8,5b,14,16,15,13}"
|
| 89 |
+
|
| 90 |
+
#######################################
# Run one pipeline stage if it is selected, mirroring its output to the log.
#
# Globals:
#   STAGES      (read)  comma-separated list of selected stage ids
#   RUNALL_LOG  (read)  log file that all output is appended to
# Arguments:
#   $1   - stage id (e.g. "5" or "5b"); matched as a whole list element, so
#          "5" does not match "5b"
#   $2   - human-readable stage name, used only in log lines
#   $3.. - command and arguments to execute
# Outputs:
#   Stage header, the command's combined stdout/stderr, and a timing line on
#   stdout; a FAILED line on stderr if the command fails. All of it is also
#   appended to $RUNALL_LOG.
# Returns:
#   0 if the stage is skipped or the command succeeds; otherwise the command's
#   own exit status (or tee's, if only the logging side failed). Under the
#   script's `set -e` a non-zero return still aborts the pipeline.
#######################################
run_stage() {
  local stage_num="$1"
  local stage_name="$2"
  shift 2

  # Wrap both sides in commas so only whole list elements match.
  if [[ ",$STAGES," != *",$stage_num,"* ]]; then
    echo "[skip] Stage $stage_num: $stage_name" | tee -a "$RUNALL_LOG"
    return 0
  fi

  echo "" | tee -a "$RUNALL_LOG"
  echo "==================== Stage $stage_num: $stage_name ====================" | tee -a "$RUNALL_LOG"

  local t_start t_end
  t_start=$(date +%s)

  # Run the pipeline as an `if` condition so a failure does not trip `set -e`
  # before it can be logged, and read PIPESTATUS so the command's real exit
  # code is seen even when `pipefail` is not enabled by the caller.
  local -a pipe_status
  if "$@" 2>&1 | tee -a "$RUNALL_LOG"; then
    pipe_status=("${PIPESTATUS[@]}")
  else
    pipe_status=("${PIPESTATUS[@]}")
  fi

  t_end=$(date +%s)

  if (( pipe_status[0] != 0 )); then
    echo "Stage $stage_num FAILED (exit ${pipe_status[0]}) after $((t_end - t_start))s" \
      | tee -a "$RUNALL_LOG" >&2
    return "${pipe_status[0]}"
  fi
  if (( pipe_status[1] != 0 )); then
    # The command succeeded but its output could not be logged.
    return "${pipe_status[1]}"
  fi

  echo "Stage $stage_num took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
}
|
| 105 |
+
|
| 106 |
+
# Single-GPU stages
|
| 107 |
+
# Expose only GPU 0 to the stages in this section.
export CUDA_VISIBLE_DEVICES=0

# Stage 1 is bypassed entirely (not even a "[skip]" line) when SKIP_DOWNLOAD
# is set to any non-empty value.
[[ -n "${SKIP_DOWNLOAD:-}" ]] \
  || run_stage 1 "Download model" python scripts/01_download_model.py

# Stage 2 takes the dataset sizes configured via the N_* variables.
cot_args=(
  --n_train "$N_TRAIN"
  --n_math_test "$N_MATH_TEST"
  --n_aime "$N_AIME"
  --n_gpqa "$N_GPQA"
  --resume
)
run_stage 2 "Generate CoTs" python scripts/02_generate_cots.py "${cot_args[@]}"

run_stage 3 "Label CoTs" python scripts/03_label_cots.py --resume

run_stage 4 "Capture routing" python scripts/04_capture_routing.py --resume

run_stage 5 "Select top experts" python scripts/05_select_top_experts.py --resume

run_stage 6 "Interaction analysis" python scripts/06_interaction_analysis.py

run_stage 7 "Capture residuals" python scripts/07_capture_residuals.py --resume

run_stage 8 "Compute v4_clean directions" python scripts/08_compute_directions.py --resume

# Stage 5b is deliberately placed after stages 7/8 in the run order.
run_stage 5b "Probe-based layer ranking" python scripts/05b_probe_ranking.py --dim monitoring
|
| 139 |
+
|
| 140 |
+
# ============================================================
|
| 141 |
+
# 6-GPU PARALLEL PHASE: stage 14 sharded
|
| 142 |
+
# ============================================================
|
| 143 |
+
# Stage 14: per-layer calibration, fanned out as six background shards (one
# per GPU, ids 0-5), followed by a merge of the per-shard result files.
# Runs only when "14" is an element of the comma-separated $STAGES list.
if [[ ",$STAGES," == *",14,"* ]]; then
  echo "" | tee -a "$RUNALL_LOG"
  echo "==================== 6-GPU Stage 14 (sharded) ====================" | tee -a "$RUNALL_LOG"
  t_start=$(date +%s)

  PIDS=()
  SHARD_FILES=()
  for shard_id in 0 1 2 3 4 5; do
    # Result file later handed to the merge step.
    # NOTE(review): assumes 14_calibrate_per_layer.py derives this exact path
    # from --dim/--shard_id; it is not passed to the script - confirm.
    out_path="$DATA_ROOT/results/per_layer_calibration_monitoring_shard${shard_id}.json"
    SHARD_FILES+=("$out_path")
    (
      # Pin this shard to a single GPU. The export lives inside the subshell,
      # so it applies to everything the shard runs and cannot leak into the
      # parent shell or sibling shards. (The original author reported the
      # inline `VAR=val python ... | tee` form as unreliable under `&`.)
      export CUDA_VISIBLE_DEVICES="$shard_id"
      # First write truncates the per-shard log; python output is appended.
      echo "[shard $shard_id] CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
        > "$DATA_ROOT/logs/14_mon_shard${shard_id}.log"
      python scripts/14_calibrate_per_layer.py \
        --dim monitoring \
        --n_test "$N_CALIB" \
        --n_repeats "$N_REPEATS" \
        --layer_shard "${shard_id}/6" \
        --shard_id "shard${shard_id}" \
        >> "$DATA_ROOT/logs/14_mon_shard${shard_id}.log" 2>&1
    ) &
    PIDS+=("$!")
    echo "Spawned stage 14 shard $shard_id on GPU $shard_id (PID $!)" | tee -a "$RUNALL_LOG"
  done

  # Wait for every shard individually. `wait pid1 pid2 ...` only returns the
  # status of the LAST pid, so a crash in any earlier shard would go unnoticed
  # and the merge would run on missing or stale shard files. Array index ==
  # shard id because shards are spawned in order 0..5.
  failed_shards=""
  for shard_idx in "${!PIDS[@]}"; do
    if ! wait "${PIDS[$shard_idx]}"; then
      failed_shards+=" $shard_idx"
    fi
  done

  if [[ -n "$failed_shards" ]]; then
    echo "ERROR: stage 14 shard(s) failed:${failed_shards} - see $DATA_ROOT/logs/14_mon_shard<N>.log; skipping merge" \
      | tee -a "$RUNALL_LOG" >&2
    exit 1
  fi
  echo "All 6 stage-14 shards finished" | tee -a "$RUNALL_LOG"

  # Merge the six per-shard files via the project's merge script.
  python scripts/14_merge_shards.py \
    --dim monitoring \
    --shards "${SHARD_FILES[@]}" \
    2>&1 | tee -a "$RUNALL_LOG"

  t_end=$(date +%s)
  echo "Stage 14 (parallel + merge) took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
fi
|
| 185 |
+
|
| 186 |
+
# ============================================================
|
| 187 |
+
# Single-GPU final stages
|
| 188 |
+
# ============================================================
|
| 189 |
+
# Back to a single device for the remaining stages.
export CUDA_VISIBLE_DEVICES=0

run_stage 16 "Cumulative top-k multi-layer sweep" \
  python scripts/16_cumulative_topk.py --dim monitoring --n_test "$N_K_TEST"

# Where stage 15 is told to save its result; listed again in the summary below.
infer_out="$DATA_ROOT/results/infer_calibrated_monitoring_v5.json"
run_stage 15 "Calibrated inference (monitoring)" \
  python scripts/15_infer_calibrated.py --dim monitoring --auto_problems --save_to "$infer_out"

run_stage 13 "Final analysis + report" python scripts/13_analyze_and_report.py

# Closing banner with pointers to the main outputs, echoed to the terminal and
# appended to the pipeline log (one line per printf argument).
printf '%s\n' \
  "" \
  "=========================================================" \
  "v5 pipeline complete - $(date)" \
  "=========================================================" \
  "KEY FILES TO READ FIRST:" \
  " $DATA_ROOT/checkpoints/probe_layer_ranking_monitoring.json" \
  " $DATA_ROOT/results/per_layer_calibration_monitoring.json <- safe_layers" \
  " $DATA_ROOT/results/stage16_cumulative_topk_summary.json <- collapse cliff" \
  " $infer_out <- final output" \
  " $DATA_ROOT/results/final_report.md" \
  "=========================================================" \
  | tee -a "$RUNALL_LOG"
|