curriculum-cot-code / _runs /launch_baseline_push_v5.sh

Add data/ JSONLs + _runs/ launch scripts (override .gitignore)

48c96cf verified 10 days ago

3.48 kB

	#!/usr/bin/env bash
	# Wave-5: push baseline 1.5B past solve=0.35.
	#
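	# Idea: best ckpts so far cap at per-cell-exact ~0.943, with solve=0.35.
	# (If the 20 cells were independent, 0.943^20 ~= 0.31, so the observed 0.35
	# sits a bit above that estimate.)
	# To reach solve=0.5 we need exact ~= 0.965 (0.965^20 ~= 0.49). That's +2.2pp
	# of per-cell exact.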
	#
	# 4 variants, single-GPU each, on GPUs 4..7.
	# All start from the leader (pipe_m post-S3-GRPO at solve=0.35) or its S3 SFT
	# ckpt, then push S3 GRPO further with different levers:
	# - lower LR (escape / fine refine)
	# - longer steps (3000 instead of 1500)
	# - KL anchor (beta>0) to prevent regression
	# - sharper rewards (mirror what worked for the latent's `s3_grpo_sharp_rwd`)
	set -euo pipefail

	# Fixed locations for this sweep: project root, the wave-4 sweep directory
	# that holds the anchor runs, and the pipeline script each variant executes.
	ROOT=/home/ubuntu/curriculum_cot
	SWEEP_ROOT=/home/ubuntu/curriculum_cot/_runs/baseline_1p5b_v4_20260523_184952
	PIPELINE=$ROOT/_runs/baseline_1p5b_pipeline_v4.sh

	#######################################
	# Print the most recently modified checkpoint-* directory inside a run dir.
	# Uses a glob plus the -nt test instead of parsing `ls -t`, so paths are
	# never word-split and a missing run dir is not an error.
	# Arguments: $1 - directory expected to contain checkpoint-* subdirectories
	# Outputs:   newest matching directory on stdout, or an empty line if none
	#            (only directories are considered; other matches are skipped)
	# Returns:   always 0, so `set -e` does not abort here and the caller's own
	#            existence check can report what is missing.
	#######################################
	latest_ckpt() {
	  local run_dir=$1 newest='' cand
	  for cand in "$run_dir"/checkpoint-*; do
	    # Also skips the literal, unexpanded pattern when nothing matches.
	    [[ -d "$cand" ]] || continue
	    if [[ -z "$newest" || "$cand" -nt "$newest" ]]; then
	      newest=$cand
	    fi
	  done
	  printf '%s\n' "$newest"
	}

	# best wave-2 anchors
	PIPE_M_S3GRPO_LATEST=$(latest_ckpt "$SWEEP_ROOT/pipe_m_s3sft_from_b/s3_grpo")
	PIPE_M_S3SFT_LATEST=$SWEEP_ROOT/pipe_m_s3sft_from_b/s3_sft/checkpoint-step-02400
	PIPE_O_S3SFT_LATEST=$SWEEP_ROOT/pipe_o_s3sft_lr5e6/s3_sft/checkpoint-step-02400
	PIPE_J_S3GRPO_LATEST=$(latest_ckpt "$SWEEP_ROOT/pipe_j_s3sft_lr5e5_lr1e5/s3_grpo")

	# Sanity: every anchor checkpoint must be an existing directory before any
	# job is started. All missing anchors are reported (with the variable that
	# holds them) rather than only the first, then the script exits with 1.
	missing_anchor=0
	for anchor_var in PIPE_M_S3GRPO_LATEST PIPE_M_S3SFT_LATEST \
	                  PIPE_O_S3SFT_LATEST PIPE_J_S3GRPO_LATEST; do
	  anchor_path=${!anchor_var:-}
	  if [[ ! -d "$anchor_path" ]]; then
	    printf 'MISSING: %s (%s)\n' "$anchor_path" "$anchor_var" >&2
	    missing_anchor=1
	  fi
	done
	(( missing_anchor == 0 )) || exit 1

	# Passed to every variant as S2_SFT_CKPT; its existence is not checked here.
	CKPT_LR5E5=$ROOT/checkpoints/sudoku-9x9-20empty-baseline-1p5b-sweep/baseline_lr5e5_lowsft_v3/s2_sft_v3/checkpoint-step-03000

	#######################################
	# Start one pipeline variant as a detached background job.
	# Globals:   ROOT, SWEEP_ROOT, CKPT_LR5E5, PIPELINE (read)
	# Arguments: $1   - value handed to the pipeline as GPU
	#            $2   - variant name; also the output subdirectory of SWEEP_ROOT
	#            $3.. - extra NAME=VALUE settings appended to the env(1) call
	# Outputs:   "GPU <gpu> -> <variant> pid=<pid>" on stdout; appends
	#            "<pid> <gpu> <variant>" to $SWEEP_ROOT/PIDS.txt; the job's own
	#            stdout/stderr go to <output dir>/nohup.log
	# Returns:   2 when fewer than two arguments are given, mkdir's status if
	#            the output directory cannot be created, otherwise 0
	#######################################
	launch() {
	  if (( $# < 2 )); then
	    printf 'usage: launch GPU VARIANT [NAME=VALUE...]\n' >&2
	    return 2
	  fi
	  local gpu=$1 variant=$2
	  shift 2

	  local out=$SWEEP_ROOT/$variant
	  mkdir -p -- "$out" || return

	  # Caller-supplied assignments come last in the env(1) argument list, after
	  # the defaults set here.
	  nohup env ROOT="$ROOT" VARIANT="$variant" GPU="$gpu" S2_SFT_CKPT="$CKPT_LR5E5" \
	    OUTPUT_ROOT="$out" USE_WANDB=0 WANDB_MODE=offline "$@" \
	    bash "$PIPELINE" </dev/null >"$out/nohup.log" 2>&1 &
	  local pid=$!

	  printf '%s %s %s\n' "$pid" "$gpu" "$variant" >>"$SWEEP_ROOT/PIDS.txt"
	  # Deliberately best-effort: a failed disown must not stop later launches.
	  disown "$pid" 2>/dev/null || true
	  printf 'GPU %s -> %s pid=%s\n' "$gpu" "$variant" "$pid"
	}

	# Each variant's settings are collected in an array and handed to launch in
	# the same order they would be written inline.

	# pipe_t (GPU 4): resume S3 GRPO from pipe_m's latest S3-GRPO checkpoint with
	# a lower LR, a KL anchor (GRPO_BETA) and 3000 steps. The anchor is meant to
	# hold the policy near the SFT reference and avoid the earlier regression.
	pipe_t_settings=(
	  START_PHASE=s3_grpo
	  S3_GRPO_INIT="$PIPE_M_S3GRPO_LATEST"
	  GRPO_LR=1e-6
	  GRPO_BS=32
	  GRPO_GA=1
	  GRPO_NG=8
	  GRPO_BETA=0.04
	  S3_GRPO_MAX_STEPS=3000
	  USE_GC=0
	)
	launch 4 pipe_t_grpo_low_kl "${pipe_t_settings[@]}"

	# pipe_u (GPU 5): redo S3 GRPO starting from pipe_m's S3-SFT checkpoint, with
	# sharper rewards (mirrors the latent `s3_grpo_sharp_rwd` recipe: a bigger
	# penalty for bad outputs).
	pipe_u_settings=(
	  START_PHASE=s3_grpo
	  S3_GRPO_INIT="$PIPE_M_S3SFT_LATEST"
	  GRPO_LR=5e-6
	  GRPO_BS=32
	  GRPO_GA=1
	  GRPO_NG=8
	  REWARD_GOOD=1.5
	  PENALTY_BAD=2.0
	  PENALTY_MAL=4.0
	  S3_GRPO_MAX_STEPS=3000
	  USE_GC=0
	)
	launch 5 pipe_u_grpo_sharp_rwd "${pipe_u_settings[@]}"

	# pipe_v (GPU 6): extend pipe_o's S3 SFT (the strongest pure-SFT path) at a
	# very low LR for 4000 more steps, followed by S3 GRPO at LR=1e-6.
	pipe_v_settings=(
	  START_PHASE=s3_sft
	  S3_SFT_INIT="$PIPE_O_S3SFT_LATEST"
	  SFT_LR_S3=2e-6
	  SFT_BS=16
	  SFT_GA=1
	  S3_SFT_MAX_STEPS=4000
	  GRPO_LR=1e-6
	  GRPO_BS=32
	  GRPO_GA=1
	  GRPO_NG=8
	  S3_GRPO_MAX_STEPS=2000
	  USE_GC=0
	)
	launch 6 pipe_v_sft_extend "${pipe_v_settings[@]}"

	# pipe_w (GPU 7): resume S3 GRPO from pipe_j's latest checkpoint with a very
	# low LR and a KL anchor. pipe_j is a different lineage from pipe_m, so this
	# is an independent push.
	pipe_w_settings=(
	  START_PHASE=s3_grpo
	  S3_GRPO_INIT="$PIPE_J_S3GRPO_LATEST"
	  GRPO_LR=2e-6
	  GRPO_BS=32
	  GRPO_GA=1
	  GRPO_NG=8
	  GRPO_BETA=0.02
	  S3_GRPO_MAX_STEPS=3000
	  USE_GC=0
	)
	launch 7 pipe_w_j_low_kl "${pipe_w_settings[@]}"

	# Summary: print the last four PIDS.txt entries, one per launch call in
	# this script.
	echo
	echo "=== launched ==="
	tail -n 4 -- "$SWEEP_ROOT/PIDS.txt"