#!/usr/bin/env bash
# Launch two adaptive-k variants (single-stage cell-policy at stage_i=3,
# no curriculum, but with growing recurrent-hidden thought tokens k).
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Project checkout; a non-empty ROOT from the environment overrides the default.
: "${ROOT:=/home/ubuntu/curriculum_cot}"

# Pipeline script that every launched variant executes.
PY="${ROOT}/_runs/adaptive_k_cellpolicy_pipeline.py"

# Every invocation writes into its own sweep directory, named after the
# launch time (second resolution).
TS="$(date '+%Y%m%d_%H%M%S')"
SWEEP_ROOT="${ROOT}/_runs/adaptive_k_cellpolicy_${TS}"
mkdir -p "${SWEEP_ROOT}"

#######################################
# Start one pipeline variant as a detached background job.
# Globals:
#   SWEEP_ROOT  (read) sweep directory; the variant gets its own subdirectory
#               and its PID is appended to ${SWEEP_ROOT}/PIDS.txt
#   PY          (read) path of the pipeline script to run
#   PYTHON_BIN  (read, optional) interpreter to use; defaults to
#               /opt/pytorch/bin/python
# Arguments:
#   $1   - variant name (names the output subdirectory, passed as --variant)
#   $2   - GPU value, passed as --gpu
#   $3.. - extra CLI arguments forwarded verbatim to the pipeline script
# Outputs:
#   One "[launch] ..." status line on stdout; diagnostics on stderr.
#   The job's stdout/stderr go to <SWEEP_ROOT>/<variant>/console.log.
# Returns:
#   0 once the job has been started, 2 on bad usage, 1 if the interpreter or
#   the pipeline script cannot be found or the output dir cannot be created.
#######################################
launch() {
  if (( $# < 2 )) || [[ -z "$1" || -z "$2" ]]; then
    echo "usage: launch <variant> <gpu> [pipeline args...]" >&2
    return 2
  fi
  local variant="$1" gpu="$2"
  shift 2

  # Fail fast: without these checks the background job would die on its own
  # and a dead PID would still be recorded in PIDS.txt.
  local python_bin="${PYTHON_BIN:-/opt/pytorch/bin/python}"
  if ! command -v "${python_bin}" > /dev/null 2>&1; then
    echo "[launch] python interpreter not found: ${python_bin}" >&2
    return 1
  fi
  if [[ ! -f "${PY}" ]]; then
    echo "[launch] pipeline script not found: ${PY}" >&2
    return 1
  fi

  local out="${SWEEP_ROOT}/${variant}"
  mkdir -p "${out}" || return 1
  echo "[launch] ${variant} on GPU ${gpu}  out=${out}"

  # nohup + background + disown: the job keeps running after this script and
  # the invoking terminal are gone. -u is passed to the interpreter as-is
  # (for CPython: unbuffered stdio, so console.log fills in real time).
  nohup "${python_bin}" -u "${PY}" \
    --variant "${variant}" \
    --gpu "${gpu}" \
    --output_root "${out}" \
    "$@" > "${out}/console.log" 2>&1 &
  local pid=$!
  # Best effort: a job that has already exited cannot be disowned, which is
  # not an error for the launcher.
  disown "${pid}" || true
  echo "${variant}=${pid}" >> "${SWEEP_ROOT}/PIDS.txt"
}

# Arguments shared by both variants. They are kept in arrays and spliced in at
# the same positions, so each launch receives the same argv, in the same
# order, as a fully spelled-out call.
k_range=(--start_k 0 --max_k 4)
train_args=(
  --sft_lr 2e-5 --sft_bs 8 --sft_ga 4
  --grpo_steps 1500 --grpo_lr 5e-6 --grpo_bs 8 --grpo_ga 4 --grpo_ng 8
)

# adaptive_a: classic schedule (start at k=0, plateau-bumps with eps=0.01).
launch adaptive_a_eps01 2 \
  "${k_range[@]}" \
  --steps_per_phase 600 --max_phases_per_k 2 --plateau_eps 0.01 \
  "${train_args[@]}"

# adaptive_b: faster k-growth (max_phases_per_k=1, force bump every phase).
launch adaptive_b_fastgrow 3 \
  "${k_range[@]}" \
  --steps_per_phase 800 --max_phases_per_k 1 --plateau_eps 1.0 \
  "${train_args[@]}"

# Summary: where the sweep lives and which PIDs were recorded for it.
printf '[launch] sweep root: %s\n' "${SWEEP_ROOT}"
printf '%s\n' "[launch] PIDs:"
cat "${SWEEP_ROOT}/PIDS.txt"