#!/bin/bash
# ============================================================
# Student Simulation v5 — 6-GPU pipeline (May 2026)
# ============================================================
#
# WHAT v5 DOES DIFFERENTLY (vs v4):
#   - DROPPED stage 9 (global α sweep). v4 small-batch proved it unreliable
#     for 30B MoE multi-layer interaction.
#   - NEW stage 5b: probe-based layer ranking. Pick top-K layers by
#     linear-probe accuracy on the existing residuals. Replaces v4's
#     "take back half" heuristic.
#   - HARDENED stage 14:
#       * residual_after_general < 0.3 → AUTO-SKIP (noise vector)
#       * n_repeats=3 (averaged) → kill single-run noise
#       * min_reduction_threshold=1.0 → no "noise victories"
#       * active_threshold=1, side_effect_rate=0.25 (relaxed from v4)
#       * SHARDED across 6 GPUs (each card calibrates ~3 of 16 layers)
#   - NEW stage 16: cumulative top-k multi-layer sweep.
#       k = 1, 3, 5, 8, 12, 16 layers steered together.
#       Reports collapse rate at each k → finds the multi-layer cliff.
#
# 6-GPU LAYOUT
#   Single-GPU phase (GPU 0):
#     stages 1-8   data prep, expert select, residuals, directions
#     stage 5b     probe ranking (uses residuals only, fast)
#   Parallel phase (GPU 0-5, 6 cards in parallel):
#     stage 14     per-layer calibration, sharded 0/6 ... 5/6
#                  merged into one calibration file
#   Single-GPU again (GPU 0):
#     stage 16     cumulative top-k multi-layer sweep
#     stage 15     calibrated inference (baseline vs intervened)
#     stage 13     final analysis + report
# ============================================================
#
# QUICK START
#   bash runall.sh                              # full pipeline
#   STAGES=5b,14,16,15,13 bash runall.sh        # skip data prep
#   STAGES=14,16,15,13 bash runall.sh           # skip probe (already ran)
#
# ENV VARS
#   STAGES         comma-list of stages to run
#                  (default 1,2,3,4,5,6,7,8,5b,14,16,15,13)
#   DATA_ROOT      data/log/result directory (default $PROJECT_ROOT/data)
#   SKIP_DOWNLOAD  if non-empty, stage 1 (model download) is not invoked
#   N_TRAIN        # CoTs to generate (default 150)
#   N_MATH_TEST    passed to stage 2 as --n_math_test (default 50)
#   N_AIME         passed to stage 2 as --n_aime (default 30)
#   N_GPQA         passed to stage 2 as --n_gpqa (default 20)
#   N_CALIB        # problems for stage 14 (default 10)
#   N_K_TEST       # problems for stage 16 (default 10)
#   N_REPEATS      # stage 14 repeats (default 3)
#   PROBE_TOP_K    # layers from stage 5b (default 16)
#                  NOTE(review): this script never reads or forwards
#                  PROBE_TOP_K; presumably scripts/05b_probe_ranking.py
#                  picks it up from the environment — confirm.
#   OMP_NUM_THREADS / MKL_NUM_THREADS / NUMEXPR_NUM_THREADS
#                  per-process CPU thread caps (default 8 each)
# ============================================================

set -e
set -u
set -o pipefail

# Run from the directory containing this script so the relative
# scripts/... paths below resolve regardless of the caller's cwd.
PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)"
cd "$PROJECT_ROOT"

export DATA_ROOT="${DATA_ROOT:-$PROJECT_ROOT/data}"
export PYTHONPATH="$PROJECT_ROOT:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false

# CPU thread caps. Without these, each of the 6 parallel shards spawns ~64
# BLAS threads (PyTorch defaults to nproc), so 6 processes × 64 = 384 threads
# fight for ~64 cores → cache thrash → generation appears to hang.
# 8 threads × 6 procs = 48 threads, well within capacity.
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-8}"
export NUMEXPR_NUM_THREADS="${NUMEXPR_NUM_THREADS:-8}"

N_TRAIN="${N_TRAIN:-150}"
N_MATH_TEST="${N_MATH_TEST:-50}"
N_AIME="${N_AIME:-30}"
N_GPQA="${N_GPQA:-20}"
N_CALIB="${N_CALIB:-10}"
N_K_TEST="${N_K_TEST:-10}"
N_REPEATS="${N_REPEATS:-3}"

mkdir -p "$DATA_ROOT/logs" "$DATA_ROOT/results"
RUNALL_LOG="$DATA_ROOT/logs/runall.log"

echo "=========================================================" | tee -a "$RUNALL_LOG"
echo "Student Simulation v5 (6-GPU) - $(date)" | tee -a "$RUNALL_LOG"
echo "PROJECT_ROOT: $PROJECT_ROOT" | tee -a "$RUNALL_LOG"
echo "N_CALIB: $N_CALIB N_REPEATS: $N_REPEATS" | tee -a "$RUNALL_LOG"
echo "N_K_TEST: $N_K_TEST" | tee -a "$RUNALL_LOG"
echo "=========================================================" | tee -a "$RUNALL_LOG"

python -m configs.paths 2>&1 | tee -a "$RUNALL_LOG"

STAGES="${STAGES:-1,2,3,4,5,6,7,8,5b,14,16,15,13}"

#######################################
# Run one pipeline stage if it is listed in STAGES, logging its output
# and wall-clock duration to RUNALL_LOG.
# Globals:
#   STAGES (read), RUNALL_LOG (read)
# Arguments:
#   $1  - stage id, matched as a whole comma-delimited token of STAGES
#   $2  - human-readable stage name (used in log lines only)
#   $3+ - command and arguments to execute
# Returns:
#   0 when the stage is skipped or succeeds. With `set -e` and
#   `pipefail` active, a failing stage command aborts the whole script.
#######################################
run_stage() {
  local stage_num="$1"
  local stage_name="$2"
  shift 2

  # Wrapping both sides in commas makes "5" match only the token "5",
  # not "5b" or "15".
  if [[ ",$STAGES," != *",$stage_num,"* ]]; then
    echo "[skip] Stage $stage_num: $stage_name" | tee -a "$RUNALL_LOG"
    return 0
  fi

  echo "" | tee -a "$RUNALL_LOG"
  echo "==================== Stage $stage_num: $stage_name ====================" | tee -a "$RUNALL_LOG"

  local t_start
  t_start=$(date +%s)
  "$@" 2>&1 | tee -a "$RUNALL_LOG"
  local t_end
  t_end=$(date +%s)
  echo "Stage $stage_num took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
}

# Single-GPU stages
export CUDA_VISIBLE_DEVICES=0

if [[ -z "${SKIP_DOWNLOAD:-}" ]]; then
  run_stage 1 "Download model" \
    python scripts/01_download_model.py
fi

run_stage 2 "Generate CoTs" \
  python scripts/02_generate_cots.py \
  --n_train "$N_TRAIN" --n_math_test "$N_MATH_TEST" \
  --n_aime "$N_AIME" --n_gpqa "$N_GPQA" --resume

run_stage 3 "Label CoTs" \
  python scripts/03_label_cots.py --resume

run_stage 4 "Capture routing" \
  python scripts/04_capture_routing.py --resume

run_stage 5 "Select top experts" \
  python scripts/05_select_top_experts.py --resume

run_stage 6 "Interaction analysis" \
  python scripts/06_interaction_analysis.py

run_stage 7 "Capture residuals" \
  python scripts/07_capture_residuals.py --resume

run_stage 8 "Compute v4_clean directions" \
  python scripts/08_compute_directions.py --resume

run_stage 5b "Probe-based layer ranking" \
  python scripts/05b_probe_ranking.py --dim monitoring

# ============================================================
# 6-GPU PARALLEL PHASE: stage 14 sharded
# ============================================================
if [[ ",$STAGES," == *",14,"* ]]; then
  echo "" | tee -a "$RUNALL_LOG"
  echo "==================== 6-GPU Stage 14 (sharded) ====================" | tee -a "$RUNALL_LOG"
  t_start=$(date +%s)

  PIDS=()
  SHARD_FILES=()
  for shard_id in 0 1 2 3 4 5; do
    out_path="$DATA_ROOT/results/per_layer_calibration_monitoring_shard${shard_id}.json"
    SHARD_FILES+=("$out_path")
    (
      # Bind this shard to ONE GPU by exporting inside the subshell before
      # python starts. The export is scoped to this subshell, so the parent
      # shell and the sibling shards keep their own CUDA_VISIBLE_DEVICES.
      export CUDA_VISIBLE_DEVICES="$shard_id"
      # '>' truncates the per-shard log on every run; python appends below.
      echo "[shard $shard_id] CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
        > "$DATA_ROOT/logs/14_mon_shard${shard_id}.log"
      python scripts/14_calibrate_per_layer.py \
        --dim monitoring \
        --n_test "$N_CALIB" \
        --n_repeats "$N_REPEATS" \
        --layer_shard "${shard_id}/6" \
        --shard_id "shard${shard_id}" \
        >> "$DATA_ROOT/logs/14_mon_shard${shard_id}.log" 2>&1
    ) &
    PIDS+=("$!")
    echo "Spawned stage 14 shard $shard_id on GPU $shard_id (PID $!)" | tee -a "$RUNALL_LOG"
  done

  # Reap every shard individually. `wait pid1 pid2 ...` only reports the
  # exit status of the LAST pid, so a crash in any earlier shard would go
  # unnoticed and the merge below would run on missing/partial shard files.
  # `|| shard_rc=$?` keeps `set -e` from aborting before all shards are
  # reaped, so no background job is left running unattended.
  FAILED_SHARDS=()
  for shard_idx in "${!PIDS[@]}"; do
    shard_rc=0
    wait "${PIDS[$shard_idx]}" || shard_rc=$?
    if (( shard_rc != 0 )); then
      # PIDS was filled in shard order, so the array index is the shard id.
      FAILED_SHARDS+=("$shard_idx")
      echo "[error] stage 14 shard $shard_idx (PID ${PIDS[$shard_idx]}) exited with status $shard_rc; see $DATA_ROOT/logs/14_mon_shard${shard_idx}.log" \
        | tee -a "$RUNALL_LOG" >&2
    fi
  done
  if (( ${#FAILED_SHARDS[@]} > 0 )); then
    echo "[error] stage 14 failed on shard(s): ${FAILED_SHARDS[*]}; skipping merge" \
      | tee -a "$RUNALL_LOG" >&2
    exit 1
  fi
  echo "All 6 stage-14 shards finished" | tee -a "$RUNALL_LOG"

  # Merge
  python scripts/14_merge_shards.py \
    --dim monitoring \
    --shards "${SHARD_FILES[@]}" \
    2>&1 | tee -a "$RUNALL_LOG"

  t_end=$(date +%s)
  echo "Stage 14 (parallel + merge) took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
fi

# ============================================================
# Single-GPU final stages
# ============================================================
export CUDA_VISIBLE_DEVICES=0

run_stage 16 "Cumulative top-k multi-layer sweep" \
  python scripts/16_cumulative_topk.py \
  --dim monitoring --n_test "$N_K_TEST"

run_stage 15 "Calibrated inference (monitoring)" \
  python scripts/15_infer_calibrated.py \
  --dim monitoring --auto_problems \
  --save_to "$DATA_ROOT/results/infer_calibrated_monitoring_v5.json"

run_stage 13 "Final analysis + report" \
  python scripts/13_analyze_and_report.py

echo "" | tee -a "$RUNALL_LOG"
echo "=========================================================" | tee -a "$RUNALL_LOG"
echo "v5 pipeline complete - $(date)" | tee -a "$RUNALL_LOG"
echo "=========================================================" | tee -a "$RUNALL_LOG"
echo "KEY FILES TO READ FIRST:" | tee -a "$RUNALL_LOG"
echo " $DATA_ROOT/checkpoints/probe_layer_ranking_monitoring.json" | tee -a "$RUNALL_LOG"
echo " $DATA_ROOT/results/per_layer_calibration_monitoring.json <- safe_layers" | tee -a "$RUNALL_LOG"
echo " $DATA_ROOT/results/stage16_cumulative_topk_summary.json <- collapse cliff" | tee -a "$RUNALL_LOG"
echo " $DATA_ROOT/results/infer_calibrated_monitoring_v5.json <- final output" | tee -a "$RUNALL_LOG"
echo " $DATA_ROOT/results/final_report.md" | tee -a "$RUNALL_LOG"
echo "=========================================================" | tee -a "$RUNALL_LOG"