#!/usr/bin/env bash
#
# Launch one GRPO training run with the length penalty (lambda = 0.1), the
# strict format gate and the token-utilisation reward switched on.
#
# Optional environment overrides: BASE_CONFIG, ACCELERATE_CONFIG,
# NUM_PROCESSES, PYTHON_BIN, OUT_ROOT, LOG_PATH.
#
# nounset is intentionally not enabled here; see the `set -u` further down.
set -eo pipefail

# Resolve everything from this script's own location so the run behaves the
# same regardless of the caller's working directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." && pwd)"
cd -- "${REPO_ROOT}"

# Give a readable error instead of a bare "No such file" from `source`.
if [[ ! -r "${SCRIPT_DIR}/conda_env.sh" ]]; then
  printf 'error: cannot read %s\n' "${SCRIPT_DIR}/conda_env.sh" >&2
  exit 1
fi
# shellcheck source=/dev/null
source "${SCRIPT_DIR}/conda_env.sh"

# Turned on only after the environment script has been sourced. Presumably
# conda_env.sh (or hooks it triggers) reads unset variables -- confirm before
# moving this up into the first `set` line.
set -u

# Run parameters. Each one keeps a value already present in the caller's
# environment and otherwise falls back to the default shown here. Relative
# paths are resolved against the current working directory.
BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
NUM_PROCESSES="${NUM_PROCESSES:-4}"
PYTHON_BIN="${PYTHON_BIN:-python}"
OUT_ROOT="${OUT_ROOT:-artifacts/sweeps/reward_variants_lambda_0p1}"

# Fail fast on a malformed process count instead of after the config has been
# generated and the launcher has started.
if [[ ! "${NUM_PROCESSES}" =~ ^[1-9][0-9]*$ ]]; then
  printf 'error: NUM_PROCESSES must be a positive integer, got: %s\n' \
    "${NUM_PROCESSES}" >&2
  exit 2
fi

# Defaults exported to every child process; a value that is already set in the
# environment wins over the default.
export WANDB_MODE="${WANDB_MODE:-offline}"
export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-1}"
export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"
export ACCELERATE_LOG_LEVEL="${ACCELERATE_LOG_LEVEL:-info}"

# Create the output root, then mirror everything this script prints from here
# on (stdout and stderr) into a log file while still showing it on the
# terminal.
mkdir -p -- "${OUT_ROOT}"
LOG_PATH="${LOG_PATH:-${OUT_ROOT}/run_lambda_0p1_existing_gate_token_util_$(date +%Y%m%d_%H%M%S).log}"
# A caller-supplied LOG_PATH may point outside OUT_ROOT; make sure its
# directory exists, otherwise tee cannot open the file.
mkdir -p -- "$(dirname -- "${LOG_PATH}")"
exec > >(tee -a "${LOG_PATH}") 2>&1

# Scratch directory for the generated config; the EXIT trap removes it on
# every exit path. ${TMP_DIR:?} makes the expansion fail rather than letting
# `rm -rf` ever run against an empty path.
TMP_DIR="$(mktemp -d "${OUT_ROOT}/tmp_cfgs.XXXXXX")"
trap 'rm -rf -- "${TMP_DIR:?}"' EXIT
CFG_PATH="${TMP_DIR}/grpo_lambda_0p1_existing_gate_token_util.yaml"

# Print the effective settings, one per line, before any work is done.
printf '%s\n' \
  "Preparing config..." \
  "Base config: ${BASE_CONFIG}" \
  "Out root: ${OUT_ROOT}" \
  "Accelerate config: ${ACCELERATE_CONFIG}" \
  "Num processes (GPUs): ${NUM_PROCESSES}" \
  "Log path: ${LOG_PATH}"

# Derive the run-specific config from BASE_CONFIG and write it to CFG_PATH.
# The heredoc delimiter is quoted, so bash expands nothing inside the Python
# program; the values it needs are handed over as environment variables.
BASE_CONFIG="${BASE_CONFIG}" \
OUT_ROOT="${OUT_ROOT}" \
CFG_PATH="${CFG_PATH}" \
REPO_ROOT="${REPO_ROOT}" \
  "${PYTHON_BIN}" - <<'PY'
"""Load the base YAML config, apply this run's overrides, write the result."""
import os
from pathlib import Path

import yaml

repo_root = Path(os.environ["REPO_ROOT"])
base_config = Path(os.environ["BASE_CONFIG"])
out_root = Path(os.environ["OUT_ROOT"])
cfg_path = Path(os.environ["CFG_PATH"])


def ensure_mapping(parent, key):
    """Return parent[key] as a dict, creating it when absent or null.

    Exits with a readable message when the key holds a non-mapping value.
    """
    node = parent.get(key)
    if node is None:
        # Covers a missing key as well as a bare `key:` line, which YAML
        # loads as None and on which dict.setdefault would hand back None.
        node = parent[key] = {}
    elif not isinstance(node, dict):
        raise SystemExit(
            f"{base_config}: expected '{key}' to be a mapping, "
            f"got {type(node).__name__}"
        )
    return node


if not base_config.is_file():
    raise SystemExit(f"Base config not found: {base_config}")

with base_config.open("r", encoding="utf-8") as handle:
    # An empty YAML document loads as None; treat it as an empty config.
    cfg = yaml.safe_load(handle) or {}
if not isinstance(cfg, dict):
    raise SystemExit(
        f"{base_config}: top level must be a mapping, got {type(cfg).__name__}"
    )

# Give the token-utilisation reward the absolute path of its results file.
rewards_kwargs = ensure_mapping(ensure_mapping(cfg, "rewards"), "kwargs")
token_util = ensure_mapping(rewards_kwargs, "token_utilisation_reward")
token_util["results_jsonl_path"] = str(
    repo_root / "artifacts/eval/gsm8k_train_zeroshot/results.jsonl"
)

# Objective overrides that define this experiment variant.
obj = ensure_mapping(ensure_mapping(cfg, "objective"), "kwargs")
obj.update(
    {
        "enable_length_penalty": True,
        "enable_token_utilisation_reward": True,
        "reward_mode": "weighted_length_penalty",
        "length_penalty_lambda": 0.1,
        "length_penalty_interaction": "correctness_length_format",
        "strict_format_gate": True,
        "non_strict_penalty": -1.0,
    }
)

# Suffix the run name and send outputs to a dedicated directory under OUT_ROOT.
trainer = ensure_mapping(cfg, "trainer")
# `or` also covers an explicit null run_name, not only a missing key.
base_run_name = trainer.get("run_name") or "grpo"
trainer["run_name"] = f"{base_run_name}-lambda-0p1-existing-gate-token-util"
trainer["output_dir"] = str(out_root / "run_lambda_0p1_existing_gate_token_util")

with cfg_path.open("w", encoding="utf-8") as handle:
    # sort_keys=False keeps the key order of the base config.
    yaml.safe_dump(cfg, handle, sort_keys=False)

print(f"Wrote config: {cfg_path}")
print(f"Run name: {trainer['run_name']}")
print(f"Output dir: {trainer['output_dir']}")
PY

echo
echo "Starting training..."
# Put the repository's src/ first on the module search path while keeping any
# entries that are already set, rather than discarding them. The ${VAR:+...}
# form is safe under `set -u` when PYTHONPATH is unset.
export PYTHONPATH="${REPO_ROOT}/src${PYTHONPATH:+:${PYTHONPATH}}"
# src/train_grpo.py is given relative to the current working directory.
accelerate launch \
  --config_file "${ACCELERATE_CONFIG}" \
  --num_processes "${NUM_PROCESSES}" \
  src/train_grpo.py \
  --config "${CFG_PATH}"

# Only reached when the launch above succeeded (errexit is active).
echo
echo "Done."
echo "Model outputs/checkpoints under: ${OUT_ROOT}/run_lambda_0p1_existing_gate_token_util"
echo "Full log: ${LOG_PATH}"