#!/usr/bin/env bash
# Strawman baseline for the rebuttal: vanilla LoRA, no curriculum, no thought
# tokens, single-shot whole-puzzle prediction. SFT followed by GRPO.
#
# Same model (Qwen2.5-1.5B-Instruct), same LoRA (r=32, α=64, dropout=0.05),
# same JSONL data file, same Qwen chat template wrapping as the cell-policy
# experiments. The ONLY differences from the cell-policy baseline are:
#   - no per-cell expansion (one example per puzzle)
#   - no stage_i / curriculum
#   - no multi_value_oversample, no exact_match_bonus / cardinality penalties
#   - reward = number of correct values out of 20 + whole-solve bonus
#
# Usage: run with no arguments. The script creates a timestamped sweep
# directory under ${ROOT}/_runs, records its path in
# ${ROOT}/_runs/current_strawman_sweep_dir, launches one detached SFT->GRPO
# pipeline per variant, and returns immediately. Per-variant output goes to
# <sweep>/<tag>/pipeline.log; launched PIDs are listed in <sweep>/PIDS.txt.
set -euo pipefail

readonly ROOT=/home/ubuntu/curriculum_cot
readonly SCRIPT=${ROOT}/_runs/simple_baseline_sudoku_train.py
readonly PYTHON_BIN=/opt/pytorch/bin/python
readonly TRAIN_JSONL=${ROOT}/data/sudoku_t3_20empty_value_qwen_text_stage1_train.jsonl
readonly EVAL_JSONL=${ROOT}/data/sudoku_t3_20empty_value_qwen_text_stage1_eval.jsonl

# Print an error message to stderr and abort the launcher.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print "[HH:MM:SS] === <message> ===" to stdout.
_banner() {
  printf '[%s] === %s ===\n' "$(date +%H:%M:%S)" "$*"
}

#######################################
# Run one phase of the training script and report a failure in the log.
# Globals:   PYTHON_BIN, SCRIPT (read)
# Arguments: $1 - run tag (used only in the failure banner)
#            $2 - value passed to --phase
#            $3.. - remaining flags, forwarded verbatim after --phase
# Outputs:   the training script's output, plus a FAILED banner on error
# Returns:   the training script's exit status
#######################################
_run_phase() {
  local tag=$1 phase=$2
  shift 2
  local rc=0
  "${PYTHON_BIN}" -u "${SCRIPT}" --phase "${phase}" "$@" || rc=$?
  if (( rc != 0 )); then
    _banner "${tag} FAILED during ${phase} (exit ${rc})"
  fi
  return "${rc}"
}

# Fail fast, before anything is written: the pipelines run detached and only
# write to their own log file, so without these checks a missing interpreter,
# script or data file would still end with "=== launched ===" and exit 0.
[[ -x "${PYTHON_BIN}" ]] \
  || die "python interpreter not found or not executable: ${PYTHON_BIN}"
for required_file in "${SCRIPT}" "${TRAIN_JSONL}" "${EVAL_JSONL}"; do
  [[ -f "${required_file}" ]] || die "required file not found: ${required_file}"
done

# Assignment is kept separate from `readonly` so a failing $(date) is not masked.
SWEEP_ROOT=${ROOT}/_runs/strawman_baseline_$(date +%Y%m%d_%H%M%S)
readonly SWEEP_ROOT
mkdir -p "${SWEEP_ROOT}"
# Pointer file holding the path of the most recently launched sweep.
printf '%s\n' "${SWEEP_ROOT}" > "${ROOT}/_runs/current_strawman_sweep_dir"
printf 'SWEEP_ROOT=%s\n' "${SWEEP_ROOT}"

export TOKENIZERS_PARALLELISM=false
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export HF_HOME="${ROOT}/.hf_cache"
# NOTE(review): recent transformers releases appear to deprecate
# TRANSFORMERS_CACHE in favour of HF_HOME; kept for older versions - confirm.
export TRANSFORMERS_CACHE="${ROOT}/.hf_cache"
export WANDB_MODE=offline

#######################################
# Launch one detached SFT -> GRPO pipeline pinned to a single GPU.
# GRPO starts from the adapter directory <out>/sft/final and runs only if the
# SFT phase exits successfully.
# Globals:   SWEEP_ROOT, TRAIN_JSONL, EVAL_JSONL, PYTHON_BIN, SCRIPT (read)
# Arguments: $1 - GPU id, exported as CUDA_VISIBLE_DEVICES
#            $2 - run tag; output goes to ${SWEEP_ROOT}/<tag>
#            $3 - SFT learning rate
#            $4 - GRPO learning rate
#            $5 - SFT --max_steps
#            $6 - GRPO --max_steps
# Outputs:   one "GPU .. -> tag pid=.. log=.." line on stdout; appends
#            "<pid> <gpu> <tag>" to ${SWEEP_ROOT}/PIDS.txt
# Returns:   0 once the background job has been started
#######################################
run_pipeline() {
  (( $# >= 6 )) || die "run_pipeline: expected 6 arguments, got $#"
  local gpu=$1 tag=$2 sft_lr=$3 grpo_lr=$4 sft_max=$5 grpo_max=$6
  local out=${SWEEP_ROOT}/${tag}
  local log=${out}/pipeline.log
  local pid

  # Flags passed identically to both phases. They are split into two arrays so
  # that each phase's argv keeps the same flag order as before.
  local -a data_args=(
    --train_jsonl "${TRAIN_JSONL}"
    --eval_jsonl "${EVAL_JSONL}"
  )
  local -a shared_args=(
    --logging_steps 25
    --save_steps 200
    --eval_rows 100
    --max_completion_length 96
    --max_prompt_length 1024
    --lora_r 32 --lora_alpha 64 --lora_dropout 0.05
    --seed 0
  )

  mkdir -p "${out}"
  : > "${log}"  # start every launch with an empty log

  # Everything the subshell prints, including the shell's own diagnostics,
  # is appended to this run's log.
  (
    export CUDA_VISIBLE_DEVICES="${gpu}"

    _banner "${tag} on GPU ${gpu}: SFT lr=${sft_lr} max_steps=${sft_max}"
    _run_phase "${tag}" sft \
      "${data_args[@]}" \
      --output_dir "${out}/sft" \
      --learning_rate "${sft_lr}" \
      --max_steps "${sft_max}" \
      --per_device_train_batch_size 8 \
      --gradient_accumulation_steps 2 \
      --num_epochs 8 \
      "${shared_args[@]}" \
      || exit "$?"

    _banner "${tag} on GPU ${gpu}: GRPO lr=${grpo_lr} max_steps=${grpo_max}"
    _run_phase "${tag}" grpo \
      --init_adapter_dir "${out}/sft/final" \
      "${data_args[@]}" \
      --output_dir "${out}/grpo" \
      --learning_rate "${grpo_lr}" \
      --max_steps "${grpo_max}" \
      --per_device_train_batch_size 4 \
      --gradient_accumulation_steps 2 \
      --num_generations 8 \
      --beta 0.0 \
      --temperature 1.0 \
      --num_epochs 50 \
      "${shared_args[@]}" \
      || exit "$?"

    _banner "${tag} DONE"
  ) >> "${log}" 2>&1 &
  pid=$!

  printf '%s %s %s\n' "${pid}" "${gpu}" "${tag}" >> "${SWEEP_ROOT}/PIDS.txt"
  # Best effort: a failure to disown must not abort the launcher.
  disown "${pid}" 2>/dev/null || true
  printf 'GPU %s -> %s pid=%s log=%s\n' "${gpu}" "${tag}" "${pid}" "${log}"
}

# 2 variants on GPUs 0,1: explore SFT LR (5e-5 and 1e-4) — same GRPO LR (5e-6).
run_pipeline 0 strawman_a_sft5e5_grpo5e6 5e-5 5e-6 2000 1500
run_pipeline 1 strawman_b_sft1e4_grpo5e6 1e-4 5e-6 2000 1500

echo
echo "=== launched ==="
cat "${SWEEP_ROOT}/PIDS.txt"