#!/usr/bin/env bash
# Abort on the first failing command and on failures inside pipelines.
set -eo pipefail

# Resolve the directory holding this script and its parent (the repo root).
# CDPATH is blanked for each `cd`: with an exported CDPATH a relative `cd`
# may land in a different directory and print it to stdout, which would
# corrupt the captured path. `--` protects paths that start with a dash.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(CDPATH='' cd -- "${SCRIPT_DIR}/.." && pwd)"

# Every relative path used below is interpreted from the repo root.
cd -- "${REPO_ROOT}"

# shellcheck disable=SC1091
source "${SCRIPT_DIR}/conda_env.sh"

# nounset is enabled only after sourcing the environment script
# (presumably because that script may read unset variables — confirm).
set -u
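# Train one GRPO run with:
# - lambda = 0.1
# - strict format gate = true
# - non-multiplicative interaction (correctness_length)
# NOTE(review): the generated config below sets length_penalty_interaction to
# "correctness_length_format" (not "correctness_length") and also enables the
# token utilisation reward — confirm which interaction mode is intended.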
# --- Run settings -----------------------------------------------------------
# Each value can be overridden from the environment; relative paths are
# resolved from the current working directory.
BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
NUM_PROCESSES="${NUM_PROCESSES:-4}"
PYTHON_BIN="${PYTHON_BIN:-python}"
OUT_ROOT="${OUT_ROOT:-artifacts/sweeps/reward_variants_lambda_0p1}"

# Defaults exported to child processes; values already set by the caller win.
export WANDB_MODE="${WANDB_MODE:-offline}"
export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"
export ACCELERATE_LOG_LEVEL="${ACCELERATE_LOG_LEVEL:-info}"

# --- Logging ----------------------------------------------------------------
mkdir -p -- "${OUT_ROOT}"
LOG_PATH="${LOG_PATH:-${OUT_ROOT}/run_lambda_0p1_existing_gate_token_util_$(date +%Y%m%d_%H%M%S).log}"
# A caller-supplied LOG_PATH may live outside OUT_ROOT; create its directory
# so tee can open the file instead of failing while the run carries on
# without a log.
mkdir -p -- "$(dirname -- "${LOG_PATH}")"
# From here on, stdout and stderr go to the terminal and are appended to the log.
exec > >(tee -a "${LOG_PATH}") 2>&1

# Fail early (and in the log) when an input file is missing, rather than
# partway through config generation or at launch time.
for required_file in "${BASE_CONFIG}" "${ACCELERATE_CONFIG}"; do
  if [[ ! -f "${required_file}" ]]; then
    echo "ERROR: required file not found: ${required_file}" >&2
    exit 1
  fi
done

# --- Scratch space for the generated config ----------------------------------
TMP_DIR="$(mktemp -d "${OUT_ROOT}/tmp_cfgs.XXXXXX")"
# Removed on every exit path; ${TMP_DIR:?} refuses to run rm -rf on an empty path.
trap 'rm -rf -- "${TMP_DIR:?}"' EXIT
CFG_PATH="${TMP_DIR}/grpo_lambda_0p1_existing_gate_token_util.yaml"

echo "Preparing config..."
echo "Base config: ${BASE_CONFIG}"
echo "Out root: ${OUT_ROOT}"
echo "Accelerate config: ${ACCELERATE_CONFIG}"
echo "Num processes (GPUs): ${NUM_PROCESSES}"
echo "Log path: ${LOG_PATH}"
# Derive the run config from the base config. The inputs are handed to the
# inline Python program through its environment; the heredoc delimiter is
# quoted so the shell performs no expansion inside the Python source.
BASE_CONFIG="${BASE_CONFIG}" \
OUT_ROOT="${OUT_ROOT}" \
CFG_PATH="${CFG_PATH}" \
REPO_ROOT="${REPO_ROOT}" \
  "${PYTHON_BIN}" - <<'PY'
import os
from pathlib import Path

import yaml


def ensure_mapping(parent, key):
    """Return parent[key] as a dict, creating it when absent or null.

    A YAML key written with no value (e.g. ``rewards:``) loads as None, which
    dict.setdefault would leave in place; this replaces it with an empty dict.
    """
    value = parent.get(key)
    if value is None:
        value = {}
        parent[key] = value
    elif not isinstance(value, dict):
        raise SystemExit(
            f"Expected a mapping at '{key}', got {type(value).__name__}"
        )
    return value


repo_root = Path(os.environ["REPO_ROOT"])
base_config = Path(os.environ["BASE_CONFIG"])
out_root = Path(os.environ["OUT_ROOT"])
cfg_path = Path(os.environ["CFG_PATH"])

with base_config.open("r", encoding="utf-8") as handle:
    cfg = yaml.safe_load(handle)

# safe_load returns None for an empty document; start from an empty config.
if cfg is None:
    cfg = {}
if not isinstance(cfg, dict):
    raise SystemExit(f"Base config must be a YAML mapping: {base_config}")

# Point the token utilisation reward at the results file under the repo root.
rewards_kwargs = ensure_mapping(ensure_mapping(cfg, "rewards"), "kwargs")
token_util = ensure_mapping(rewards_kwargs, "token_utilisation_reward")
token_util["results_jsonl_path"] = str(
    repo_root / "artifacts/eval/gsm8k_train_zeroshot/results.jsonl"
)

# Objective overrides for this run.
obj = ensure_mapping(ensure_mapping(cfg, "objective"), "kwargs")
obj["enable_length_penalty"] = True
obj["enable_token_utilisation_reward"] = True
obj["reward_mode"] = "weighted_length_penalty"
obj["length_penalty_lambda"] = 0.1
obj["length_penalty_interaction"] = "correctness_length_format"
obj["strict_format_gate"] = True
obj["non_strict_penalty"] = -1.0

# Give the run its own name and output directory under OUT_ROOT.
trainer = ensure_mapping(cfg, "trainer")
# `or` also covers a run_name that is present but null/empty in the base config.
base_run_name = trainer.get("run_name") or "grpo"
trainer["run_name"] = f"{base_run_name}-lambda-0p1-existing-gate-token-util"
trainer["output_dir"] = str(out_root / "run_lambda_0p1_existing_gate_token_util")

with cfg_path.open("w", encoding="utf-8") as handle:
    # sort_keys=False keeps the key order of the base config.
    yaml.safe_dump(cfg, handle, sort_keys=False)

print(f"Wrote config: {cfg_path}")
print(f"Run name: {trainer['run_name']}")
print(f"Output dir: {trainer['output_dir']}")
PY
echo
echo "Starting training..."

# Put the repo's src/ first on the import path while keeping any entries the
# environment already provides, instead of discarding them. The ${VAR:+...}
# form expands to nothing when PYTHONPATH is unset or empty, so it is safe
# under `set -u` and never leaves a dangling ':'.
export PYTHONPATH="${REPO_ROOT}/src${PYTHONPATH:+:${PYTHONPATH}}"

# Options for `accelerate launch` itself; everything after the script path
# is passed through to src/train_grpo.py.
launch_args=(
  --config_file "${ACCELERATE_CONFIG}"
  --num_processes "${NUM_PROCESSES}"
)
accelerate launch "${launch_args[@]}" \
  src/train_grpo.py \
  --config "${CFG_PATH}"

echo
echo "Done."
echo "Model outputs/checkpoints under: ${OUT_ROOT}/run_lambda_0p1_existing_gate_token_util"
echo "Full log: ${LOG_PATH}"
|