JulianHJR
/

tempv2

Model card Files Files and versions

xet

Community

JulianHJR committed 22 days ago

Commit

9fa8ff4

verified ·

1 Parent(s): 398396b

Upload runallsingle.sh with huggingface_hub

Browse files

Files changed (1) hide show

runallsingle.sh +195 -0

runallsingle.sh ADDED Viewed

	@@ -0,0 +1,195 @@

+#!/bin/bash
+# ============================================================
+# Student Simulation v5 — SINGLE-GPU pipeline (May 2026)
+# ============================================================
+#
+# WHEN TO USE THIS instead of runall.sh
+#   - You only have one GPU available, OR
+#   - You want to debug stage 14 without the parallel-shard machinery, OR
+#   - The 6-GPU version (runall.sh) is hanging and you need a known-good run.
+#
+# DEFAULT GPU
+#   GPU index 7 (CUDA_VISIBLE_DEVICES=7). Override with `CUDA_VISIBLE_DEVICES=N bash runallsingle.sh`.
+#   Inside slurm with --gres=gpu:1 the cgroup will renumber the visible
+#   device to 0; we detect that and DO NOT override CUDA_VISIBLE_DEVICES if
+#   slurm has already set it. That way the same script works in both
+#   contexts.
+#
+# WHAT'S DIFFERENT FROM runall.sh
+#   - Stage 14 runs ALL 16 probe-ranked layers in a SINGLE process — no
+#     sharding, no shard files, no merge step. Output goes straight to the
+#     canonical per_layer_calibration_monitoring.json path.
+#   - Baselines for stage 14 are computed ONCE (hoisted in the patched
+#     14_calibrate_per_layer.py), so no time wasted re-running the same
+#     baselines per layer.
+#   - CPU thread caps are still applied so a single big-MoE process doesn't
+#     accidentally oversubscribe a 200-thread node.
+#
+# ROUGH RUNTIME (n_test=10, n_repeats=3, 16 layers, max_new_tokens=2048):
+#   baselines:   30 gens (~15-30 min)
+#   per-layer:   16 layers × 4 alphas × 3 repeats × 10 problems = 1920 gens
+#   total:       ~16-25 hours on one H20-3e (vs ~3-4h on 6 cards in parallel
+#                IF the 6-GPU run actually works).
+#
+# QUICK START
+#   bash runallsingle.sh                            # full single-GPU pipeline
+#   STAGES=5b,14,16,15,13 bash runallsingle.sh      # skip data prep
+#   STAGES=14 N_CALIB=5 bash runallsingle.sh        # quick stage-14 smoke test
+#   CUDA_VISIBLE_DEVICES=3 bash runallsingle.sh     # use GPU 3 instead of 7
+# ============================================================
+set -e
+set -u
+set -o pipefail
+PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)"
+cd "$PROJECT_ROOT"
+# ============================================================
+# GPU SELECTION
+# ============================================================
+# Priority:
+#   1. CUDA_VISIBLE_DEVICES already set (e.g. by slurm or by user) → respect it.
+#   2. SLURM_JOB_ID is set but CUDA_VISIBLE_DEVICES is not → unusual but trust slurm.
+#   3. Otherwise → default to GPU 7.
+if [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
+    echo "[gpu] Using existing CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+elif [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "[gpu] Inside slurm but CUDA_VISIBLE_DEVICES unset — letting slurm/cgroup decide"
+else
+    export CUDA_VISIBLE_DEVICES=7
+    echo "[gpu] Defaulting to CUDA_VISIBLE_DEVICES=7"
+    echo "[gpu] Override with: CUDA_VISIBLE_DEVICES=N bash $(basename "$0")"
+fi
+# ============================================================
+# CPU THREAD CAPS
+# ============================================================
+# A single process loading a 30B MoE will spawn one BLAS thread per core by default.
+# On a 200-thread node that's fine alone, but pinned-memory copies and tokenizer
+# work both benefit from a sane cap. These caps are also defensive in case this
+# script is launched alongside another job on the same node.
+# Give each thread-count variable the value 16 unless the caller already
+# supplied a non-empty one, then export all three.
+: "${OMP_NUM_THREADS:=16}" "${MKL_NUM_THREADS:=16}" "${NUMEXPR_NUM_THREADS:=16}"
+export OMP_NUM_THREADS MKL_NUM_THREADS NUMEXPR_NUM_THREADS
+# Unlike the caps above, this one is forced regardless of any inherited value.
+export TOKENIZERS_PARALLELISM=false
+# ============================================================
+# PATHS
+# ============================================================
+export DATA_ROOT="${DATA_ROOT:-$PROJECT_ROOT/data}"
+export PYTHONPATH="$PROJECT_ROOT:${PYTHONPATH:-}"
+N_TRAIN="${N_TRAIN:-150}"
+N_MATH_TEST="${N_MATH_TEST:-50}"
+N_AIME="${N_AIME:-30}"
+N_GPQA="${N_GPQA:-20}"
+N_CALIB="${N_CALIB:-10}"
+N_K_TEST="${N_K_TEST:-10}"
+N_REPEATS="${N_REPEATS:-3}"
+mkdir -p "$DATA_ROOT/logs" "$DATA_ROOT/results"
+RUNALL_LOG="$DATA_ROOT/logs/runallsingle.log"
+echo "=========================================================" | tee -a "$RUNALL_LOG"
+echo "Student Simulation v5 (SINGLE-GPU) - $(date)" | tee -a "$RUNALL_LOG"
+echo "PROJECT_ROOT:        $PROJECT_ROOT" | tee -a "$RUNALL_LOG"
+echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-<slurm-managed>}" | tee -a "$RUNALL_LOG"
+echo "OMP_NUM_THREADS:      $OMP_NUM_THREADS" | tee -a "$RUNALL_LOG"
+echo "N_CALIB:              $N_CALIB    N_REPEATS: $N_REPEATS" | tee -a "$RUNALL_LOG"
+echo "N_K_TEST:             $N_K_TEST" | tee -a "$RUNALL_LOG"
+echo "=========================================================" | tee -a "$RUNALL_LOG"
+python -m configs.paths 2>&1 | tee -a "$RUNALL_LOG"
+# Default stage list. Same as runall.sh but stage 14 is run UNSHARDED.
+STAGES="${STAGES:-1,2,3,4,5,6,7,8,5b,14,16,15,13}"
+run_stage() {
+    local stage_num="$1"
+    local stage_name="$2"
+    shift 2
+    if [[ ",$STAGES," != *",$stage_num,"* ]]; then
+        echo "[skip] Stage $stage_num: $stage_name" | tee -a "$RUNALL_LOG"
+        return 0
+    fi
+    echo "" | tee -a "$RUNALL_LOG"
+    echo "==================== Stage $stage_num: $stage_name ====================" | tee -a "$RUNALL_LOG"
+    local t_start; t_start=$(date +%s)
+    "$@" 2>&1 | tee -a "$RUNALL_LOG"
+    local t_end; t_end=$(date +%s)
+    echo "Stage $stage_num took $((t_end - t_start))s" | tee -a "$RUNALL_LOG"
+}
+# ============================================================
+# Data-prep / direction-extraction stages (always single-GPU)
+# ============================================================
+# Stage 1 has an extra gate: any non-empty SKIP_DOWNLOAD bypasses the call
+# altogether, independent of STAGES.
+if [[ -z "${SKIP_DOWNLOAD:-}" ]]; then
+    run_stage 1 "Download model" python scripts/01_download_model.py
+fi
+# Stage 2 takes the dataset-size knobs; collect them in an array for clarity.
+cot_args=(
+    --n_train "$N_TRAIN"
+    --n_math_test "$N_MATH_TEST"
+    --n_aime "$N_AIME"
+    --n_gpqa "$N_GPQA"
+    --resume
+)
+run_stage 2 "Generate CoTs" python scripts/02_generate_cots.py "${cot_args[@]}"
+run_stage 3 "Label CoTs" python scripts/03_label_cots.py --resume
+run_stage 4 "Capture routing" python scripts/04_capture_routing.py --resume
+run_stage 5 "Select top experts" python scripts/05_select_top_experts.py --resume
+run_stage 6 "Interaction analysis" python scripts/06_interaction_analysis.py
+run_stage 7 "Capture residuals" python scripts/07_capture_residuals.py --resume
+run_stage 8 "Compute v4_clean directions" python scripts/08_compute_directions.py --resume
+run_stage 5b "Probe-based layer ranking" python scripts/05b_probe_ranking.py --dim monitoring
+# ============================================================
+# Stage 14 — UNSHARDED.
+# All 16 layers in one process, baselines hoisted, output goes
+# directly to per_layer_calibration_monitoring.json. No shard
+# files, no merge step.
+# ============================================================
+run_stage 14 "Per-layer calibration (single-GPU, all layers)" \
+    python scripts/14_calibrate_per_layer.py \
+        --dim monitoring \
+        --n_test "$N_CALIB" \
+        --n_repeats "$N_REPEATS"
+# ============================================================
+# Final stages (single-GPU in both versions)
+# ============================================================
+run_stage 16 "Cumulative top-k multi-layer sweep" \
+    python scripts/16_cumulative_topk.py \
+        --dim monitoring --n_test "$N_K_TEST"
+run_stage 15 "Calibrated inference (monitoring)" \
+    python scripts/15_infer_calibrated.py \
+        --dim monitoring --auto_problems \
+        --save_to "$DATA_ROOT/results/infer_calibrated_monitoring_v5.json"
+run_stage 13 "Final analysis + report" \
+    python scripts/13_analyze_and_report.py
+# Closing banner: one line per printf argument, shown on the terminal and
+# appended to the run log.
+rule="========================================================="
+printf '%s\n' \
+    "" \
+    "$rule" \
+    "v5 single-GPU pipeline complete - $(date)" \
+    "$rule" \
+    "KEY FILES TO READ FIRST:" \
+    "  $DATA_ROOT/checkpoints/probe_layer_ranking_monitoring.json" \
+    "  $DATA_ROOT/results/per_layer_calibration_monitoring.json    <- safe_layers" \
+    "  $DATA_ROOT/results/stage16_cumulative_topk_summary.json     <- collapse cliff" \
+    "  $DATA_ROOT/results/infer_calibrated_monitoring_v5.json      <- final output" \
+    "  $DATA_ROOT/results/final_report.md" \
+    "$rule" \
+    | tee -a "$RUNALL_LOG"