#!/usr/bin/env bash
#
# Launch one GRPO training run from a config derived from BASE_CONFIG.
#
# Every setting below can be overridden from the environment:
#   BASE_CONFIG, ACCELERATE_CONFIG, NUM_PROCESSES, PYTHON_BIN, OUT_ROOT,
#   LOG_PATH, WANDB_MODE, HF_HUB_OFFLINE, HF_DATASETS_OFFLINE,
#   TRANSFORMERS_OFFLINE, PYTHONUNBUFFERED, ACCELERATE_LOG_LEVEL.
set -eo pipefail

# Resolve the repository root from this script's location and run from there,
# so the relative default paths below resolve against the repository.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
cd "${REPO_ROOT}"

# nounset is switched on only after the environment script has been sourced.
# NOTE(review): presumably because that script references unset variables —
# confirm before moving `set -u` earlier.
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/conda_env.sh"
set -u

# Train one GRPO run with:
# - lambda = 0.1
# - strict format gate = true
# - non-multiplicative interaction (correctness_length)
# NOTE(review): the config generated by this script sets
# length_penalty_interaction to "correctness_length_format", not
# "correctness_length" — confirm which one is intended.
BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
NUM_PROCESSES="${NUM_PROCESSES:-4}"
PYTHON_BIN="${PYTHON_BIN:-python}"
OUT_ROOT="${OUT_ROOT:-artifacts/sweeps/reward_variants_lambda_0p1}"

# Default the W&B / Hugging Face tooling to offline settings and unbuffered
# Python output unless the caller has already chosen something else.
export WANDB_MODE="${WANDB_MODE:-offline}"
export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"
export ACCELERATE_LOG_LEVEL="${ACCELERATE_LOG_LEVEL:-info}"

mkdir -p "${OUT_ROOT}"

LOG_PATH="${LOG_PATH:-${OUT_ROOT}/run_lambda_0p1_existing_gate_token_util_$(date +%Y%m%d_%H%M%S).log}"
# A caller-supplied LOG_PATH may live outside OUT_ROOT; create its directory
# so tee can open the file instead of running the whole job without a log.
mkdir -p "$(dirname "${LOG_PATH}")"
# From here on, stdout and stderr are appended to the log file and still
# echoed to the original stdout.
exec > >(tee -a "${LOG_PATH}") 2>&1

# The generated config lives in a scratch directory that is removed when the
# script exits, so it does not persist after the run.
TMP_DIR="$(mktemp -d "${OUT_ROOT}/tmp_cfgs.XXXXXX")"
# `:?` aborts the cleanup if TMP_DIR is ever empty; `--` stops option parsing.
trap 'rm -rf -- "${TMP_DIR:?}"' EXIT
CFG_PATH="${TMP_DIR}/grpo_lambda_0p1_existing_gate_token_util.yaml"

echo "Preparing config..."
echo "Base config: ${BASE_CONFIG}"
echo "Out root: ${OUT_ROOT}"
echo "Accelerate config: ${ACCELERATE_CONFIG}"
echo "Num processes (GPUs): ${NUM_PROCESSES}"
echo "Log path: ${LOG_PATH}"

# Build the run-specific YAML: load BASE_CONFIG, apply the overrides for this
# run, and write the result to CFG_PATH. The paths are handed to Python via
# the environment; the heredoc delimiter is quoted so the shell expands
# nothing inside the Python source.
BASE_CONFIG="${BASE_CONFIG}" \
OUT_ROOT="${OUT_ROOT}" \
CFG_PATH="${CFG_PATH}" \
REPO_ROOT="${REPO_ROOT}" \
"${PYTHON_BIN}" - <<'PY'
import os
from pathlib import Path

import yaml


def section(parent, key):
    """Return parent[key] as a dict, creating it when it is absent or null.

    A YAML key written with no value (e.g. ``rewards:``) loads as None;
    treat that the same as a missing section instead of failing on it.
    """
    value = parent.get(key)
    if value is None:
        value = parent[key] = {}
    if not isinstance(value, dict):
        raise SystemExit(
            f"Config key '{key}' must be a mapping, got {type(value).__name__}"
        )
    return value


repo_root = Path(os.environ["REPO_ROOT"])
base_config = Path(os.environ["BASE_CONFIG"])
out_root = Path(os.environ["OUT_ROOT"])
cfg_path = Path(os.environ["CFG_PATH"])

with base_config.open("r", encoding="utf-8") as handle:
    cfg = yaml.safe_load(handle)

# safe_load yields None for an empty document and may yield a list or scalar;
# fail with a readable message rather than an AttributeError further down.
if not isinstance(cfg, dict):
    raise SystemExit(
        f"Base config {base_config} must contain a top-level mapping, "
        f"got {type(cfg).__name__}"
    )

# Point the token utilisation reward at the results file inside the repo.
rewards_kwargs = section(section(cfg, "rewards"), "kwargs")
token_util = section(rewards_kwargs, "token_utilisation_reward")
token_util["results_jsonl_path"] = str(
    repo_root / "artifacts/eval/gsm8k_train_zeroshot/results.jsonl"
)

# Objective overrides that define this run.
obj = section(section(cfg, "objective"), "kwargs")
obj["enable_length_penalty"] = True
obj["enable_token_utilisation_reward"] = True
obj["reward_mode"] = "weighted_length_penalty"
obj["length_penalty_lambda"] = 0.1
obj["length_penalty_interaction"] = "correctness_length_format"
obj["strict_format_gate"] = True
obj["non_strict_penalty"] = -1.0

# Suffix the base run name and send outputs to a run-specific directory.
trainer = section(cfg, "trainer")
base_run_name = trainer.get("run_name", "grpo")
trainer["run_name"] = f"{base_run_name}-lambda-0p1-existing-gate-token-util"
trainer["output_dir"] = str(out_root / "run_lambda_0p1_existing_gate_token_util")

# sort_keys=False keeps the key order of the base config in the output.
with cfg_path.open("w", encoding="utf-8") as handle:
    yaml.safe_dump(cfg, handle, sort_keys=False)

print(f"Wrote config: {cfg_path}")
print(f"Run name: {trainer['run_name']}")
print(f"Output dir: {trainer['output_dir']}")
PY

echo
echo "Starting training..."
# Expose the repository's src/ directory on the Python import path.
export PYTHONPATH="${REPO_ROOT}/src"

# Collect the launcher options in an array so each value stays one quoted
# word, then hand them to `accelerate launch` in a single call.
launch_args=(
  --config_file "${ACCELERATE_CONFIG}"
  --num_processes "${NUM_PROCESSES}"
  src/train_grpo.py
  --config "${CFG_PATH}"
)
accelerate launch "${launch_args[@]}"

# Closing summary: a blank line, then where the outputs and the log ended up.
printf '%s\n' \
  "" \
  "Done." \
  "Model outputs/checkpoints under: ${OUT_ROOT}/run_lambda_0p1_existing_gate_token_util" \
  "Full log: ${LOG_PATH}"