v2 / runall.sh

Upload folder using huggingface_hub

e53f10b verified about 1 month ago

5.14 kB

	#!/bin/bash
	# ============================================================
	# Student Simulation — Full Pipeline (v2, Apr 2026)
	# ============================================================
	#
	# CHANGES vs v1:
	# - α sweep restricted to [0, 1] only (no over-suppression)
	# - 2 direction versions (v1_raw + v_pca_subspace) instead of 4
	# - --save_texts default ON (CoT always saved)
	# - 08b attention diagnostic added
	# - 10_infer.py runs as sanity check after sweep
	# - LLM rater removed
	# - max_new_tokens raised to support full Qwen3-Thinking output
	# ============================================================

	set -e
	set -u
	set -o pipefail

	PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)"
	cd "$PROJECT_ROOT"

	export DATA_ROOT="${DATA_ROOT:-$PROJECT_ROOT/data}"
	export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
	export PYTHONPATH="$PROJECT_ROOT:${PYTHONPATH:-}"
	export TOKENIZERS_PARALLELISM=false

	N_TRAIN="${N_TRAIN:-150}"
	N_MATH_TEST="${N_MATH_TEST:-50}"
	N_AIME="${N_AIME:-30}"
	N_GPQA="${N_GPQA:-20}"
	N_SWEEP="${N_SWEEP:-30}"

	# Joint anti-leak flag (default OFF; set JOINT=1 to enable)
	JOINT_FLAG=""
	if [[ "${JOINT:-0}" == "1" ]]; then
	JOINT_FLAG="--joint"
	fi

	mkdir -p "$DATA_ROOT/logs"
	RUNALL_LOG="$DATA_ROOT/logs/runall.log"

	echo "=========================================================" \| tee -a "$RUNALL_LOG"
	echo "Student Simulation v2 - $(date)" \| tee -a "$RUNALL_LOG"
	echo "PROJECT_ROOT: $PROJECT_ROOT" \| tee -a "$RUNALL_LOG"
	echo "DATA_ROOT: $DATA_ROOT" \| tee -a "$RUNALL_LOG"
	echo "CUDA: $CUDA_VISIBLE_DEVICES" \| tee -a "$RUNALL_LOG"
	echo "N_TRAIN: $N_TRAIN" \| tee -a "$RUNALL_LOG"
	echo "N_SWEEP: $N_SWEEP" \| tee -a "$RUNALL_LOG"
	echo "JOINT: ${JOINT:-0}" \| tee -a "$RUNALL_LOG"
	echo "=========================================================" \| tee -a "$RUNALL_LOG"

	python -m configs.paths 2>&1 \| tee -a "$RUNALL_LOG"

	# Stage selector
	STAGES="${STAGES:-0,1,2,3,4,5,6,7,8,8b,9,10,12,13}"

	run_stage() {
	local stage_num="$1"
	local stage_name="$2"
	shift 2
	if [[ ",$STAGES," != ",$stage_num," ]]; then
	echo "[skip] Stage $stage_num: $stage_name" \| tee -a "$RUNALL_LOG"
	return 0
	fi
	echo "" \| tee -a "$RUNALL_LOG"
	echo "==================== Stage $stage_num: $stage_name ====================" \| tee -a "$RUNALL_LOG"
	local t_start
	t_start=$(date +%s)
	"$@" 2>&1 \| tee -a "$RUNALL_LOG"
	local t_end
	t_end=$(date +%s)
	echo "Stage $stage_num took $((t_end - t_start))s" \| tee -a "$RUNALL_LOG"
	}

	# ============================================================
	if [[ -z "${SKIP_DOWNLOAD:-}" ]]; then
	run_stage 1 "Download model" \
	python scripts/01_download_model.py
	fi

	run_stage 2 "Generate CoTs (MATH training + test sets)" \
	python scripts/02_generate_cots.py \
	--n_train "$N_TRAIN" --n_math_test "$N_MATH_TEST" \
	--n_aime "$N_AIME" --n_gpqa "$N_GPQA" --resume

	run_stage 3 "Label CoTs (decision points)" \
	python scripts/03_label_cots.py --resume

	run_stage 4 "Capture routing dumps (Stage 1)" \
	python scripts/04_capture_routing.py --resume

	run_stage 5 "Select top experts" \
	python scripts/05_select_top_experts.py --resume

	run_stage 6 "Interaction analysis (Stage 2)" \
	python scripts/06_interaction_analysis.py

	run_stage 7 "Capture decision-point residuals" \
	python scripts/07_capture_residuals.py --resume

	run_stage 8 "Compute direction vectors (v1_raw + v_pca_subspace)" \
	python scripts/08_compute_directions.py --resume

	run_stage 8b "Attention diagnostic (informational)" \
	python scripts/08b_attention_diagnostic.py --resume

	run_stage 9 "Steering sweep (α ∈ [0,1], with text saving)" \
	python scripts/09_steering_sweep.py \
	--n_test "$N_SWEEP" --resume \
	--save_texts $JOINT_FLAG

	run_stage 10 "Sanity-check inference on 2 default problems" \
	python scripts/10_infer.py \
	--auto_problems \
	--dim planning --version v_pca_subspace \
	--alphas 1.0 0.5 0.0 \
	--save_to "$DATA_ROOT/results/infer_sanity_planning.json"

	run_stage 10b "Sanity-check inference (monitoring)" \
	python scripts/10_infer.py \
	--auto_problems \
	--dim monitoring --version v_pca_subspace \
	--alphas 1.0 0.5 0.0 \
	--save_to "$DATA_ROOT/results/infer_sanity_monitoring.json"

	run_stage 12 "Downstream accuracy eval" \
	python scripts/12_downstream_eval.py --resume

	run_stage 13 "Final analysis + report" \
	python scripts/13_analyze_and_report.py

	echo "" \| tee -a "$RUNALL_LOG"
	echo "=========================================================" \| tee -a "$RUNALL_LOG"
	echo "Pipeline complete - $(date)" \| tee -a "$RUNALL_LOG"
	echo "Final report: $DATA_ROOT/results/final_report.md" \| tee -a "$RUNALL_LOG"
	echo "Sweep curves: $DATA_ROOT/results/sweep_curves.png" \| tee -a "$RUNALL_LOG"
	echo "Attn diagnostic: $DATA_ROOT/results/attention_diagnostic.png" \| tee -a "$RUNALL_LOG"
	echo "Sanity inference: $DATA_ROOT/results/infer_sanity_*.json" \| tee -a "$RUNALL_LOG"
	echo "=========================================================" \| tee -a "$RUNALL_LOG"