#!/usr/bin/env bash
# Launch two adaptive-k variants (single-stage cell-policy at stage_i=3,
# no curriculum, but with growing recurrent-hidden thought tokens k).
#
# Each variant is started as a detached background process; this script
# returns as soon as both are spawned and does not wait for them.
#
# Environment:
#   ROOT        project root (default: /home/ubuntu/curriculum_cot)
#   PYTHON_BIN  interpreter used to run the pipeline
#               (default: /opt/pytorch/bin/python)
#
# Outputs (under ${ROOT}/_runs/adaptive_k_cellpolicy_<timestamp>/):
#   <variant>/console.log  combined stdout+stderr of that variant
#   PIDS.txt               one "<variant>=<pid>" line per launched variant
set -euo pipefail

readonly ROOT="${ROOT:-/home/ubuntu/curriculum_cot}"
readonly PYTHON_BIN="${PYTHON_BIN:-/opt/pytorch/bin/python}"
readonly PY="${ROOT}/_runs/adaptive_k_cellpolicy_pipeline.py"

TS="$(date +%Y%m%d_%H%M%S)"
readonly TS
readonly SWEEP_ROOT="${ROOT}/_runs/adaptive_k_cellpolicy_${TS}"

# Print a message to stderr and abort the script.
die() {
  printf '[launch] error: %s\n' "$*" >&2
  exit 1
}

# Pre-flight checks, done before any directory is created. Background jobs
# are not covered by `set -e`, so without these a missing pipeline script or
# interpreter would only be visible inside each variant's console.log while
# this script still reported PIDs as if the sweep had started.
[[ -f "${PY}" ]] || die "pipeline script not found: ${PY}"
command -v "${PYTHON_BIN}" > /dev/null \
  || die "python interpreter not found or not executable: ${PYTHON_BIN}"

mkdir -p "${SWEEP_ROOT}"

#######################################
# Start one pipeline variant in the background, detached from this shell.
# Globals:
#   SWEEP_ROOT, PY, PYTHON_BIN (read)
# Arguments:
#   $1   - variant name; also used as the output sub-directory name
#   $2   - value forwarded to the pipeline's --gpu option
#   $3.. - extra command-line options forwarded verbatim to the pipeline
# Outputs:
#   Writes a status line to stdout, redirects the child's stdout/stderr to
#   <out>/console.log and appends "<variant>=<pid>" to ${SWEEP_ROOT}/PIDS.txt.
#   A recorded PID only means the process was spawned, not that it is healthy.
#######################################
launch() {
  (($# >= 2)) || die "usage: launch VARIANT GPU [PIPELINE_ARGS...]"
  local variant="$1" gpu="$2"
  shift 2

  local out="${SWEEP_ROOT}/${variant}"
  mkdir -p "${out}"
  printf '%s\n' "[launch] ${variant} on GPU ${gpu} out=${out}"

  # nohup makes the child ignore SIGHUP so it keeps running after the
  # launching terminal/session goes away.
  nohup "${PYTHON_BIN}" -u "${PY}" \
    --variant "${variant}" \
    --gpu "${gpu}" \
    --output_root "${out}" \
    "$@" > "${out}/console.log" 2>&1 &
  local pid=$!

  # Drop the job from this shell's job table. Best-effort on purpose: if the
  # child has already exited there may be nothing left to disown, and that
  # must not abort the remaining launches under `set -e`.
  disown "${pid}" || true

  printf '%s\n' "${variant}=${pid}" >> "${SWEEP_ROOT}/PIDS.txt"
}

# adaptive_a: classic schedule (start at k=0, plateau-bumps with eps=0.01).
launch adaptive_a_eps01 2 \
  --start_k 0 --max_k 4 --steps_per_phase 600 --max_phases_per_k 2 \
  --plateau_eps 0.01 --sft_lr 2e-5 --sft_bs 8 --sft_ga 4 \
  --grpo_steps 1500 --grpo_lr 5e-6 --grpo_bs 8 --grpo_ga 4 --grpo_ng 8

# adaptive_b: faster k-growth (max_phases_per_k=1, force bump every phase).
launch adaptive_b_fastgrow 3 \
  --start_k 0 --max_k 4 --steps_per_phase 800 --max_phases_per_k 1 \
  --plateau_eps 1.0 --sft_lr 2e-5 --sft_bs 8 --sft_ga 4 \
  --grpo_steps 1500 --grpo_lr 5e-6 --grpo_bs 8 --grpo_ga 4 --grpo_ng 8

printf '%s\n' "[launch] sweep root: ${SWEEP_ROOT}"
printf '%s\n' "[launch] PIDs:"
cat "${SWEEP_ROOT}/PIDS.txt"