#!/usr/bin/env bash
# Evaluate every checkpoint-* under a single checkpoints/permanent directory, then write
# JSON/CSV summary, per-checkpoint JSONL, and accuracy_vs_avg_cot_words.svg (scatter + step labels).
#
# Usage:
#   PERMANENT_ROOT=artifacts/runs/.../checkpoints/permanent ./scripts/eval_permanent_root_acc_cot.sh
#
# Required environment:
#   PERMANENT_ROOT       Existing checkpoints/permanent directory. The script changes
#                        into the repository root before checking it, so a relative
#                        path is interpreted from the repository root.
#
# Optional environment (default in parentheses):
#   RUN_LABEL            (permanent)
#   OUT_ROOT             (unset; only forwarded to the evaluator when non-empty)
#   BASE_CONFIG          (configs/grpo_llama32_3b_bf16.yaml)
#   ACCELERATE_CONFIG    (configs/accelerate_ddp_4gpu.yaml)
#   NUM_PROCESSES        (4)
#   EVAL_MAX_SAMPLES     (200)
#   EVAL_BATCH_SIZE      (4)
#   ROLLOUT_SAMPLES      (8)
#   WANDB_MODE           (offline)
#   HF_HUB_OFFLINE       (1)
#   HF_DATASETS_OFFLINE  (0)
#   TRANSFORMERS_OFFLINE (1)
#   PYTHONUNBUFFERED     (1)
#
# Exits non-zero if PERMANENT_ROOT is unset/empty or not a directory, if
# ACCELERATE_CONFIG is not a file, or if any command (including the launch) fails.

# nounset (-u) is deliberately switched on only after conda_env.sh has been sourced.
# NOTE(review): presumably because that script is not nounset-clean; confirm.
set -eo pipefail

# Print a diagnostic to stderr and abort with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
cd "${REPO_ROOT}"

# shellcheck disable=SC1091
source "${SCRIPT_DIR}/conda_env.sh"
set -u

: "${PERMANENT_ROOT:?Set PERMANENT_ROOT to your checkpoints/permanent directory}"

BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
NUM_PROCESSES="${NUM_PROCESSES:-4}"
RUN_LABEL="${RUN_LABEL:-permanent}"
EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES:-200}"
EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-4}"
ROLLOUT_SAMPLES="${ROLLOUT_SAMPLES:-8}"

# Fail fast on bad paths instead of discovering them only after the worker
# processes have been spawned. The current directory is REPO_ROOT here, which is
# also the working directory of the launched processes.
[[ -d "${PERMANENT_ROOT}" ]] \
  || die "PERMANENT_ROOT is not a directory: ${PERMANENT_ROOT} (relative paths are resolved from ${REPO_ROOT})"
[[ -f "${ACCELERATE_CONFIG}" ]] \
  || die "ACCELERATE_CONFIG is not a file: ${ACCELERATE_CONFIG} (relative paths are resolved from ${REPO_ROOT})"

export WANDB_MODE="${WANDB_MODE:-offline}"
# Allow dataset download into cache unless you already mirrored GSM8K (set HF_DATASETS_OFFLINE=1).
export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-0}"
export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"

# NAME=value pairs handed to env(1) for the launch only; they are not exported
# into this shell. PYTHONPATH is set to the repository's src/ directory and
# replaces any inherited PYTHONPATH for the launched command.
CMD_ENV=(
  PYTHONPATH="${REPO_ROOT}/src"
  BASE_CONFIG="${BASE_CONFIG}"
  PERMANENT_ROOT="${PERMANENT_ROOT}"
  RUN_LABEL="${RUN_LABEL}"
  EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES}"
  EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE}"
  ROLLOUT_SAMPLES="${ROLLOUT_SAMPLES}"
)
# OUT_ROOT is forwarded only when the caller supplied a non-empty value; otherwise
# the evaluator is left to use its own default.
if [[ -n "${OUT_ROOT:-}" ]]; then
  CMD_ENV+=(OUT_ROOT="${OUT_ROOT}")
fi

env "${CMD_ENV[@]}" accelerate launch \
  --config_file "${ACCELERATE_CONFIG}" \
  --num_processes "${NUM_PROCESSES}" \
  src/eval_permanent_checkpoints.py

# NOTE(review): "default: /eval_permanent" looks like a truncated path; the real
# default is decided in src/eval_permanent_checkpoints.py. Confirm and fix there.
echo "Done. Summary and accuracy_vs_avg_cot_words.svg under OUT_ROOT (default: /eval_permanent)."