tempv2 / runall.sh

Upload runall.sh with huggingface_hub

c04eaa2 verified 20 days ago

8.68 kB

	#!/bin/bash
	# ============================================================
	# Student Simulation v5 — 6-GPU pipeline (May 2026)
	# ============================================================
	#
	# WHAT v5 DOES DIFFERENTLY (vs v4):
	# - DROPPED stage 9 (global α sweep). v4 small-batch proved it unreliable
	# for 30B MoE multi-layer interaction.
	# - NEW stage 5b: probe-based layer ranking. Pick top-K layers by
	# linear-probe accuracy on the existing residuals. Replaces v4's
	# "take back half" heuristic.
	# - HARDENED stage 14:
	# * residual_after_general < 0.3 → AUTO-SKIP (noise vector)
	# * n_repeats=3 (averaged) → kill single-run noise
	# * min_reduction_threshold=1.0 → no "noise victories"
	# * active_threshold=1, side_effect_rate=0.25 (relaxed from v4)
	# * SHARDED across 6 GPUs (each card calibrates ~3 of 16 layers)
	# - NEW stage 16: cumulative top-k multi-layer sweep.
	# k = 1, 3, 5, 8, 12, 16 layers steered together.
	# Reports collapse rate at each k → finds the multi-layer cliff.
	#
	# 6-GPU LAYOUT
	# Single-GPU phase (GPU 0):
	# stages 1-8 data prep, expert select, residuals, directions
	# stage 5b probe ranking (uses residuals only, fast)
	# Parallel phase (GPU 0-5, 6 cards in parallel):
	# stage 14 per-layer calibration, sharded 0/6 ... 5/6
	# merged into one calibration file
	# Single-GPU again (GPU 0):
	# stage 16 cumulative top-k multi-layer sweep
	# stage 15 calibrated inference (baseline vs intervened)
	# stage 13 final analysis + report
	# ============================================================
	#
	# QUICK START
	# bash runall.sh # full pipeline
	# STAGES=5b,14,16,15,13 bash runall.sh # skip data prep
	# STAGES=14,16,15,13 bash runall.sh # skip probe (already ran)
	#
	# ENV VARS
	# STAGES comma-list of stages to run
	# N_TRAIN # CoTs to generate (default 150)
	# N_CALIB # problems for stage 14 (default 10)
	# N_K_TEST # problems for stage 16 (default 10)
	# N_REPEATS # stage 14 repeats (default 3)
	# PROBE_TOP_K # layers from stage 5b (default 16)
	# ============================================================

	# Strict mode: stop on any failing command, on any reference to an unset
	# variable, and when any stage of a pipeline fails.
	set -euo pipefail

	# Resolve the directory containing this script and work from there.
	PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)"
	cd "$PROJECT_ROOT"

	# Environment passed on to child processes. DATA_ROOT may be supplied by the
	# caller; otherwise it lives under the project directory.
	: "${DATA_ROOT:=$PROJECT_ROOT/data}"
	export DATA_ROOT
	export PYTHONPATH="$PROJECT_ROOT:${PYTHONPATH:-}"
	export TOKENIZERS_PARALLELISM=false

	# Per-process CPU thread caps (each overridable from the environment).
	# Author's rationale: uncapped, every one of the 6 parallel shards starts
	# ~64 BLAS threads, i.e. 6 x 64 = 384 threads competing for ~64 cores, and
	# generation appears to hang. At 8 threads x 6 processes = 48 threads the
	# machine is not oversubscribed.
	: "${OMP_NUM_THREADS:=8}"
	: "${MKL_NUM_THREADS:=8}"
	: "${NUMEXPR_NUM_THREADS:=8}"
	export OMP_NUM_THREADS MKL_NUM_THREADS NUMEXPR_NUM_THREADS

	# Problem counts and repeat count; each keeps a caller-supplied value and
	# otherwise falls back to the default shown.
	: "${N_TRAIN:=150}"
	: "${N_MATH_TEST:=50}"
	: "${N_AIME:=30}"
	: "${N_GPQA:=20}"
	: "${N_CALIB:=10}"
	: "${N_K_TEST:=10}"
	: "${N_REPEATS:=3}"

	# Create the log and result directories up front so that every later
	# `tee -a` and the per-shard log redirections have somewhere to write.
	mkdir -p "$DATA_ROOT/logs" "$DATA_ROOT/results"
	RUNALL_LOG="$DATA_ROOT/logs/runall.log"

	# Start-of-run banner, shown on the terminal and appended to the run log.
	# The pipe must be a real `|`: a backslash-escaped `\|` is just an argument,
	# so echo would print "| tee -a ..." and the log would never be written.
	{
	  echo "========================================================="
	  echo "Student Simulation v5 (6-GPU) - $(date)"
	  echo "PROJECT_ROOT: $PROJECT_ROOT"
	  echo "N_CALIB: $N_CALIB N_REPEATS: $N_REPEATS"
	  echo "N_K_TEST: $N_K_TEST"
	  echo "========================================================="
	} | tee -a "$RUNALL_LOG"

	# Run the project's path module and capture whatever it prints in the log.
	# NOTE(review): presumably this prints/validates the resolved data paths --
	# confirm against configs/paths.py.
	python -m configs.paths 2>&1 | tee -a "$RUNALL_LOG"

	# Comma-separated list of stage ids to execute. Defaults to the full
	# pipeline; stage 5b is listed after 8 because it runs after the residuals
	# and directions are computed.
	STAGES="${STAGES:-1,2,3,4,5,6,7,8,5b,14,16,15,13}"

	#######################################
	# Run one pipeline stage if it is selected in STAGES. Logs a banner, the
	# stage's combined stdout/stderr, and its wall-clock duration to both the
	# terminal and RUNALL_LOG.
	# Globals:   STAGES (read)     - comma-separated list of stage ids to run
	#            RUNALL_LOG (read) - log file that all output is appended to
	# Arguments: $1    - stage id (e.g. 5, 5b, 14)
	#            $2    - human-readable stage name
	#            $3... - command (with arguments) that implements the stage
	# Returns:   0 if the stage was skipped or succeeded; otherwise the
	#            non-zero status of the stage pipeline (under `set -e` this
	#            aborts the script). Detecting a failing stage command relies
	#            on `set -o pipefail` being active.
	#######################################
	run_stage() {
	  local stage_num="$1"
	  local stage_name="$2"
	  shift 2

	  # Substring match on the comma-wrapped list: ",1,5b,14," contains ",5b,"
	  # but not ",5,". The surrounding `*` globs are essential -- without them
	  # this is an exact comparison and every stage would be skipped whenever
	  # STAGES names more than one stage.
	  if [[ ",$STAGES," != *",$stage_num,"* ]]; then
	    echo "[skip] Stage $stage_num: $stage_name" | tee -a "$RUNALL_LOG"
	    return 0
	  fi

	  echo "" | tee -a "$RUNALL_LOG"
	  echo "==================== Stage $stage_num: $stage_name ====================" | tee -a "$RUNALL_LOG"

	  local t_start t_end rc=0
	  t_start=$(date +%s)
	  # `|| rc=$?` keeps `set -e` from killing the script before the failure
	  # has been written to the log; the status is returned below instead.
	  "$@" 2>&1 | tee -a "$RUNALL_LOG" || rc=$?
	  t_end=$(date +%s)

	  if (( rc != 0 )); then
	    echo "Stage $stage_num FAILED (exit $rc) after $((t_end - t_start))s" | tee -a "$RUNALL_LOG" >&2
	    return "$rc"
	  fi
	  echo "Stage $stage_num took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
	}

	# ---- Single-GPU phase: everything below sees only GPU 0 ----
	export CUDA_VISIBLE_DEVICES=0

	# Stage 1 is additionally gated on SKIP_DOWNLOAD: any non-empty value
	# bypasses the call entirely (not even a "[skip]" line is logged).
	if [[ -z "${SKIP_DOWNLOAD:-}" ]]; then
	  run_stage 1 "Download model" python scripts/01_download_model.py
	fi

	# Stages 2-8 run in order; each call is a no-op unless its id is in STAGES.
	run_stage 2 "Generate CoTs" \
	  python scripts/02_generate_cots.py \
	    --n_train "$N_TRAIN" \
	    --n_math_test "$N_MATH_TEST" \
	    --n_aime "$N_AIME" \
	    --n_gpqa "$N_GPQA" \
	    --resume

	run_stage 3 "Label CoTs" python scripts/03_label_cots.py --resume

	run_stage 4 "Capture routing" python scripts/04_capture_routing.py --resume

	run_stage 5 "Select top experts" python scripts/05_select_top_experts.py --resume

	run_stage 6 "Interaction analysis" python scripts/06_interaction_analysis.py

	run_stage 7 "Capture residuals" python scripts/07_capture_residuals.py --resume

	run_stage 8 "Compute v4_clean directions" python scripts/08_compute_directions.py --resume

	# Stage 5b is deliberately placed after stages 7/8: per the header notes it
	# ranks layers using the residuals that already exist.
	run_stage 5b "Probe-based layer ranking" \
	  python scripts/05b_probe_ranking.py \
	    --dim monitoring

	# ============================================================
	# 6-GPU PARALLEL PHASE: stage 14 sharded
	# ============================================================
	# Runs only when "14" is one of the comma-separated ids in STAGES. The `*`
	# globs make this a membership test; a plain string comparison would match
	# only when STAGES is exactly "14".
	if [[ ",$STAGES," == *",14,"* ]]; then
	  echo "" | tee -a "$RUNALL_LOG"
	  echo "==================== 6-GPU Stage 14 (sharded) ====================" | tee -a "$RUNALL_LOG"
	  t_start=$(date +%s)

	  PIDS=()         # background PID of each shard; index == shard id
	  SHARD_FILES=()  # per-shard result files handed to the merge step
	  for shard_id in 0 1 2 3 4 5; do
	    # NOTE(review): this path is assumed to be where 14_calibrate_per_layer.py
	    # writes its output for --shard_id "shard<N>" -- confirm in that script.
	    out_path="$DATA_ROOT/results/per_layer_calibration_monitoring_shard${shard_id}.json"
	    SHARD_FILES+=("$out_path")
	    (
	      # Export inside the subshell so the GPU binding is scoped to this
	      # shard and is in place before python starts; the parent shell's
	      # CUDA_VISIBLE_DEVICES is left untouched.
	      export CUDA_VISIBLE_DEVICES="$shard_id"
	      # `>` truncates the shard log from any earlier run; python appends.
	      echo "[shard $shard_id] CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
	        > "$DATA_ROOT/logs/14_mon_shard${shard_id}.log"
	      python scripts/14_calibrate_per_layer.py \
	        --dim monitoring \
	        --n_test "$N_CALIB" \
	        --n_repeats "$N_REPEATS" \
	        --layer_shard "${shard_id}/6" \
	        --shard_id "shard${shard_id}" \
	        >> "$DATA_ROOT/logs/14_mon_shard${shard_id}.log" 2>&1
	    ) &
	    PIDS+=("$!")
	    echo "Spawned stage 14 shard $shard_id on GPU $shard_id (PID $!)" | tee -a "$RUNALL_LOG"
	  done

	  # Reap every shard individually. `wait pid1 pid2 ...` reports only the
	  # status of the LAST pid, and `set -e` does not react to background job
	  # failures, so a single multi-pid wait would let a crashed shard slip
	  # through to the merge step. `if ! wait` also keeps `set -e` from aborting
	  # before the remaining shards have been collected.
	  failed_shards=()
	  for idx in "${!PIDS[@]}"; do
	    if ! wait "${PIDS[$idx]}"; then
	      failed_shards+=("$idx")
	    fi
	  done
	  if (( ${#failed_shards[@]} > 0 )); then
	    echo "ERROR: stage-14 shard(s) failed: ${failed_shards[*]} (see $DATA_ROOT/logs/14_mon_shard<N>.log)" \
	      | tee -a "$RUNALL_LOG" >&2
	    exit 1
	  fi
	  echo "All 6 stage-14 shards finished" | tee -a "$RUNALL_LOG"

	  # Merge the per-shard results into a single calibration file.
	  python scripts/14_merge_shards.py \
	    --dim monitoring \
	    --shards "${SHARD_FILES[@]}" \
	    2>&1 | tee -a "$RUNALL_LOG"

	  t_end=$(date +%s)
	  echo "Stage 14 (parallel + merge) took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
	fi

	# ============================================================
	# Back to a single GPU (GPU 0) for the closing stages
	# ============================================================
	export CUDA_VISIBLE_DEVICES=0

	# Execution order is 16 -> 15 -> 13; each call is a no-op unless its id is
	# listed in STAGES.
	run_stage 16 "Cumulative top-k multi-layer sweep" \
	  python scripts/16_cumulative_topk.py \
	    --dim monitoring \
	    --n_test "$N_K_TEST"

	run_stage 15 "Calibrated inference (monitoring)" \
	  python scripts/15_infer_calibrated.py \
	    --dim monitoring \
	    --auto_problems \
	    --save_to "$DATA_ROOT/results/infer_calibrated_monitoring_v5.json"

	run_stage 13 "Final analysis + report" python scripts/13_analyze_and_report.py

	# Completion banner plus a pointer list of the output files, shown on the
	# terminal and appended to the run log. The whole group goes through one
	# real `|` pipe; a backslash-escaped `\|` would make echo print the text
	# "| tee -a ..." and the log would never be written.
	{
	  echo ""
	  echo "========================================================="
	  echo "v5 pipeline complete - $(date)"
	  echo "========================================================="
	  echo "KEY FILES TO READ FIRST:"
	  echo " $DATA_ROOT/checkpoints/probe_layer_ranking_monitoring.json"
	  echo " $DATA_ROOT/results/per_layer_calibration_monitoring.json <- safe_layers"
	  echo " $DATA_ROOT/results/stage16_cumulative_topk_summary.json <- collapse cliff"
	  echo " $DATA_ROOT/results/infer_calibrated_monitoring_v5.json <- final output"
	  echo " $DATA_ROOT/results/final_report.md"
	  echo "========================================================="
	} | tee -a "$RUNALL_LOG"