#!/bin/bash
# ============================================================
# Student Simulation v5 — 6-GPU pipeline (May 2026)
# ============================================================
#
# WHAT v5 DOES DIFFERENTLY (vs v4):
# - DROPPED stage 9 (global α sweep). v4 small-batch proved it unreliable
# for 30B MoE multi-layer interaction.
# - NEW stage 5b: probe-based layer ranking. Pick top-K layers by
# linear-probe accuracy on the existing residuals. Replaces v4's
# "take back half" heuristic.
# - HARDENED stage 14:
# * residual_after_general < 0.3 → AUTO-SKIP (noise vector)
# * n_repeats=3 (averaged) → kill single-run noise
# * min_reduction_threshold=1.0 → no "noise victories"
# * active_threshold=1, side_effect_rate=0.25 (relaxed from v4)
# * SHARDED across 6 GPUs (each card calibrates ~3 of 16 layers)
# - NEW stage 16: cumulative top-k multi-layer sweep.
# k = 1, 3, 5, 8, 12, 16 layers steered together.
# Reports collapse rate at each k → finds the multi-layer cliff.
#
# 6-GPU LAYOUT
# Single-GPU phase (GPU 0):
# stages 1-8 data prep, expert select, residuals, directions
# stage 5b probe ranking (uses residuals only, fast)
# Parallel phase (GPU 0-5, 6 cards in parallel):
# stage 14 per-layer calibration, sharded 0/6 ... 5/6
# merged into one calibration file
# Single-GPU again (GPU 0):
# stage 16 cumulative top-k multi-layer sweep
# stage 15 calibrated inference (baseline vs intervened)
# stage 13 final analysis + report
# ============================================================
#
# QUICK START
# bash runall.sh # full pipeline
# STAGES=5b,14,16,15,13 bash runall.sh # skip data prep
# STAGES=14,16,15,13 bash runall.sh # skip probe (already ran)
#
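# ENV VARS
#   STAGES         comma-separated list of stages to run
#                  (default 1,2,3,4,5,6,7,8,5b,14,16,15,13)
#   SKIP_DOWNLOAD  any non-empty value skips stage 1 (model download)
#   DATA_ROOT      root for logs/ and results/ (default <script dir>/data)
#   N_TRAIN        number of CoTs to generate (default 150)
#   N_MATH_TEST    passed to stage 2 as --n_math_test (default 50)
#   N_AIME         passed to stage 2 as --n_aime (default 30)
#   N_GPQA         passed to stage 2 as --n_gpqa (default 20)
#   N_CALIB        number of problems for stage 14 (default 10)
#   N_K_TEST       number of problems for stage 16 (default 10)
#   N_REPEATS      number of stage 14 repeats (default 3)
#   PROBE_TOP_K    number of layers from stage 5b (default 16)
#   OMP_NUM_THREADS / MKL_NUM_THREADS / NUMEXPR_NUM_THREADS
#                  per-process CPU thread caps (default 8 each)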
# ============================================================
# Strict mode: stop on the first failing command, on any unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Always operate from the directory that contains this script.
PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)"
cd "$PROJECT_ROOT"

# Environment handed to every child process.
: "${DATA_ROOT:=$PROJECT_ROOT/data}"
PYTHONPATH="$PROJECT_ROOT:${PYTHONPATH:-}"
TOKENIZERS_PARALLELISM=false
export DATA_ROOT PYTHONPATH TOKENIZERS_PARALLELISM

# CPU thread caps (caller-supplied values win). Per the original author's
# note: uncapped, each of the 6 parallel shards spawns ~64 BLAS threads, so
# 6 processes x 64 = 384 threads contend for ~64 cores and generation appears
# to hang; 8 threads x 6 processes = 48 stays within capacity.
: "${OMP_NUM_THREADS:=8}" "${MKL_NUM_THREADS:=8}" "${NUMEXPR_NUM_THREADS:=8}"
export OMP_NUM_THREADS MKL_NUM_THREADS NUMEXPR_NUM_THREADS

# Workload sizes; each one can be overridden from the environment.
: "${N_TRAIN:=150}"
: "${N_MATH_TEST:=50}"
: "${N_AIME:=30}"
: "${N_GPQA:=20}"
: "${N_CALIB:=10}"
: "${N_K_TEST:=10}"
: "${N_REPEATS:=3}"

# Output directories and the shared run log (appended to across runs).
for subdir in logs results; do
  mkdir -p "$DATA_ROOT/$subdir"
done
RUNALL_LOG="$DATA_ROOT/logs/runall.log"

# Start-of-run banner, shown on the terminal and appended to the run log.
{
  echo "========================================================="
  echo "Student Simulation v5 (6-GPU) - $(date)"
  echo "PROJECT_ROOT: $PROJECT_ROOT"
  echo "N_CALIB: $N_CALIB N_REPEATS: $N_REPEATS"
  echo "N_K_TEST: $N_K_TEST"
  echo "========================================================="
} | tee -a "$RUNALL_LOG"

# Run the project's configs.paths module and record its output in the log.
python -m configs.paths 2>&1 | tee -a "$RUNALL_LOG"

# Stage selection: every stage unless the caller narrowed the list.
: "${STAGES:=1,2,3,4,5,6,7,8,5b,14,16,15,13}"
#######################################
# Run one pipeline stage if it is selected in STAGES, teeing its combined
# stdout/stderr into the run log and recording how long it took.
# Globals:
#   STAGES      (read) comma-separated stage ids; whitespace is ignored
#   RUNALL_LOG  (read) log file that every message is appended to
# Arguments:
#   $1   stage id exactly as it appears in STAGES (e.g. 5b, 14)
#   $2   human-readable stage name used in log lines
#   $3+  command and arguments that implement the stage
# Outputs:
#   Progress lines and the command's output on stdout; a failure line on
#   stderr. Everything is also appended to RUNALL_LOG.
# Returns:
#   0 if the stage was skipped or succeeded, otherwise the non-zero status
#   of the stage pipeline (which aborts the script under `set -e`).
#######################################
run_stage() {
  local stage_num="$1"
  local stage_name="$2"
  shift 2

  # Strip whitespace so that lists such as "5b, 14, 16" select correctly.
  local selected=",${STAGES//[[:space:]]/},"
  if [[ "$selected" != *",$stage_num,"* ]]; then
    echo "[skip] Stage $stage_num: $stage_name" | tee -a "$RUNALL_LOG"
    return 0
  fi

  echo "" | tee -a "$RUNALL_LOG"
  echo "==================== Stage $stage_num: $stage_name ====================" | tee -a "$RUNALL_LOG"

  local t_start t_end rc=0
  t_start=$(date +%s)
  # Capture the status instead of letting `set -e` kill the script here, so
  # the failure can be written to the log first. The stage's own exit code is
  # visible through the pipe only because `set -o pipefail` is enabled at the
  # top of this script.
  "$@" 2>&1 | tee -a "$RUNALL_LOG" || rc=$?
  t_end=$(date +%s)

  if (( rc != 0 )); then
    echo "Stage $stage_num FAILED (exit $rc) after $((t_end - t_start))s" \
      | tee -a "$RUNALL_LOG" >&2
    return "$rc"
  fi
  echo "Stage $stage_num took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
}
# ------------------------------------------------------------
# Single-GPU phase: every stage below sees only GPU 0.
# ------------------------------------------------------------
export CUDA_VISIBLE_DEVICES=0

# Stage 1 is bypassed entirely (no "[skip]" line is logged) when
# SKIP_DOWNLOAD holds any non-empty value.
if [[ "${SKIP_DOWNLOAD:-}" == "" ]]; then
  run_stage 1 "Download model" python scripts/01_download_model.py
fi

# Dataset-size flags forwarded to the CoT generator.
cot_size_flags=(
  --n_train "$N_TRAIN"
  --n_math_test "$N_MATH_TEST"
  --n_aime "$N_AIME"
  --n_gpqa "$N_GPQA"
)
run_stage 2 "Generate CoTs" python scripts/02_generate_cots.py "${cot_size_flags[@]}" --resume

run_stage 3 "Label CoTs" python scripts/03_label_cots.py --resume
run_stage 4 "Capture routing" python scripts/04_capture_routing.py --resume
run_stage 5 "Select top experts" python scripts/05_select_top_experts.py --resume
run_stage 6 "Interaction analysis" python scripts/06_interaction_analysis.py
run_stage 7 "Capture residuals" python scripts/07_capture_residuals.py --resume
run_stage 8 "Compute v4_clean directions" python scripts/08_compute_directions.py --resume

# Stage 5b is deliberately launched after stages 7 and 8 rather than in
# numeric order.
run_stage 5b "Probe-based layer ranking" python scripts/05b_probe_ranking.py --dim monitoring
# ============================================================
# 6-GPU PARALLEL PHASE: stage 14 sharded
# ============================================================

#######################################
# Wait for every given background PID and report each one that failed.
# `wait pid1 pid2 ...` returns only the status of the LAST pid, so an early
# shard could fail unnoticed; waiting on each PID separately closes that gap.
# Must be called directly (not in a pipeline or command substitution): a
# subshell cannot wait on its parent's children.
# Globals:
#   RUNALL_LOG  (read, optional) failure lines are appended to it when set
# Arguments:
#   PIDs of background children of the current shell, in spawn order
# Outputs:
#   One line per failed job on stderr (job index = position in the argument
#   list, starting at 0)
# Returns:
#   0 if every job exited 0 (or no PIDs were given), 1 otherwise
#######################################
wait_all_pids() {
  local pid rc idx=0 n_failed=0
  for pid in "$@"; do
    rc=0
    wait "$pid" || rc=$?
    if (( rc != 0 )); then
      echo "[error] background job #$idx (PID $pid) exited with status $rc" \
        | tee -a "${RUNALL_LOG:-/dev/null}" >&2
      n_failed=$((n_failed + 1))
    fi
    idx=$((idx + 1))
  done
  if (( n_failed > 0 )); then
    return 1
  fi
  return 0
}

if [[ ",$STAGES," == *",14,"* ]]; then
  echo "" | tee -a "$RUNALL_LOG"
  echo "==================== 6-GPU Stage 14 (sharded) ====================" | tee -a "$RUNALL_LOG"
  t_start=$(date +%s)

  PIDS=()
  SHARD_FILES=()
  for shard_id in 0 1 2 3 4 5; do
    out_path="$DATA_ROOT/results/per_layer_calibration_monitoring_shard${shard_id}.json"
    SHARD_FILES+=("$out_path")
    (
      # Export inside the subshell so the GPU binding is scoped to this one
      # shard and the parent shell's CUDA_VISIBLE_DEVICES is left untouched.
      export CUDA_VISIBLE_DEVICES="$shard_id"
      # First line truncates the per-shard log; python output is appended.
      echo "[shard $shard_id] CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
        > "$DATA_ROOT/logs/14_mon_shard${shard_id}.log"
      python scripts/14_calibrate_per_layer.py \
        --dim monitoring \
        --n_test "$N_CALIB" \
        --n_repeats "$N_REPEATS" \
        --layer_shard "${shard_id}/6" \
        --shard_id "shard${shard_id}" \
        >> "$DATA_ROOT/logs/14_mon_shard${shard_id}.log" 2>&1
    ) &
    PIDS+=("$!")
    echo "Spawned stage 14 shard $shard_id on GPU $shard_id (PID $!)" | tee -a "$RUNALL_LOG"
  done

  # Barrier: all shards must succeed before merging. Shards are spawned in
  # order, so the job index reported on failure equals the shard id.
  if ! wait_all_pids "${PIDS[@]}"; then
    echo "ERROR: one or more stage-14 shards failed; see $DATA_ROOT/logs/14_mon_shard*.log - skipping merge" \
      | tee -a "$RUNALL_LOG" >&2
    exit 1
  fi
  echo "All 6 stage-14 shards finished" | tee -a "$RUNALL_LOG"

  # Merge
  python scripts/14_merge_shards.py \
    --dim monitoring \
    --shards "${SHARD_FILES[@]}" \
    2>&1 | tee -a "$RUNALL_LOG"

  t_end=$(date +%s)
  echo "Stage 14 (parallel + merge) took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
fi
# ------------------------------------------------------------
# Closing phase: back to GPU 0 only for the last three stages.
# ------------------------------------------------------------
export CUDA_VISIBLE_DEVICES=0

# Stage 16: cumulative top-k sweep over N_K_TEST problems.
topk_cmd=(python scripts/16_cumulative_topk.py --dim monitoring --n_test "$N_K_TEST")
run_stage 16 "Cumulative top-k multi-layer sweep" "${topk_cmd[@]}"

# Stage 15: calibrated inference, saved to the v5 results file.
infer_cmd=(
  python scripts/15_infer_calibrated.py
  --dim monitoring
  --auto_problems
  --save_to "$DATA_ROOT/results/infer_calibrated_monitoring_v5.json"
)
run_stage 15 "Calibrated inference (monitoring)" "${infer_cmd[@]}"

# Stage 13: final analysis and report.
run_stage 13 "Final analysis + report" python scripts/13_analyze_and_report.py

# Closing banner with the output files worth opening first; printed to the
# terminal and appended to the run log.
{
  echo ""
  echo "========================================================="
  echo "v5 pipeline complete - $(date)"
  echo "========================================================="
  echo "KEY FILES TO READ FIRST:"
  echo " $DATA_ROOT/checkpoints/probe_layer_ranking_monitoring.json"
  echo " $DATA_ROOT/results/per_layer_calibration_monitoring.json <- safe_layers"
  echo " $DATA_ROOT/results/stage16_cumulative_topk_summary.json <- collapse cliff"
  echo " $DATA_ROOT/results/infer_calibrated_monitoring_v5.json <- final output"
  echo " $DATA_ROOT/results/final_report.md"
  echo "========================================================="
} | tee -a "$RUNALL_LOG"
|