neuralese_temp / scripts /eval_permanent_root_acc_cot.sh
psidharth567's picture
Export neuralese codebase (cache and .env excluded).
dbc69f3
#!/usr/bin/env bash
# Evaluate every checkpoint-* under a single checkpoints/permanent directory, then write
# JSON/CSV summary, per-checkpoint JSONL, and accuracy_vs_avg_cot_words.svg (scatter + step labels).
#
# Usage:
# PERMANENT_ROOT=artifacts/runs/.../checkpoints/permanent ./scripts/eval_permanent_root_acc_cot.sh
#
# Optional: RUN_LABEL (default permanent), OUT_ROOT, BASE_CONFIG, ACCELERATE_CONFIG, EVAL_MAX_SAMPLES,
# EVAL_BATCH_SIZE, ROLLOUT_SAMPLES, NUM_PROCESSES, WANDB_MODE, offline HF vars.
# Strict mode is switched on in two stages: errexit and pipefail first, nounset
# only after conda_env.sh has been sourced (presumably because that helper
# references variables that may be unset -- confirm before merging the two).
set -eo pipefail

# Locate the repository root from this script's own path and run from there.
# Every relative path below (including a relative PERMANENT_ROOT supplied by
# the caller) is therefore resolved against REPO_ROOT, not the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
cd "${REPO_ROOT}"

# shellcheck disable=SC1091
source "${SCRIPT_DIR}/conda_env.sh"
set -u

# Required input: abort with a usage hint when unset or empty.
: "${PERMANENT_ROOT:?Set PERMANENT_ROOT to your checkpoints/permanent directory}"

# Tunables; each one can be overridden from the caller's environment.
BASE_CONFIG="${BASE_CONFIG:-configs/grpo_llama32_3b_bf16.yaml}"
ACCELERATE_CONFIG="${ACCELERATE_CONFIG:-configs/accelerate_ddp_4gpu.yaml}"
NUM_PROCESSES="${NUM_PROCESSES:-4}"
RUN_LABEL="${RUN_LABEL:-permanent}"
EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES:-200}"
EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE:-4}"
ROLLOUT_SAMPLES="${ROLLOUT_SAMPLES:-8}"

# Fail fast on bad paths so a typo is reported here, with the directory the
# path was resolved against, instead of after the worker processes have started.
if [[ ! -d "${PERMANENT_ROOT}" ]]; then
  printf 'error: PERMANENT_ROOT is not a directory (relative paths resolve from %s): %s\n' \
    "${REPO_ROOT}" "${PERMANENT_ROOT}" >&2
  exit 1
fi
if [[ ! -f "${ACCELERATE_CONFIG}" ]]; then
  printf 'error: ACCELERATE_CONFIG is not a file (relative paths resolve from %s): %s\n' \
    "${REPO_ROOT}" "${ACCELERATE_CONFIG}" >&2
  exit 1
fi

# Exported so the launched processes inherit them. Caller-provided values win.
export WANDB_MODE="${WANDB_MODE:-offline}"
# Allow dataset download into cache unless you already mirrored GSM8K (set HF_DATASETS_OFFLINE=1).
export HF_HUB_OFFLINE="${HF_HUB_OFFLINE:-1}"
export HF_DATASETS_OFFLINE="${HF_DATASETS_OFFLINE:-0}"
export TRANSFORMERS_OFFLINE="${TRANSFORMERS_OFFLINE:-1}"
export PYTHONUNBUFFERED="${PYTHONUNBUFFERED:-1}"

# NAME=value pairs handed to env(1) for the launch only. They are kept in an
# array so values containing spaces survive as single words.
CMD_ENV=(
  PYTHONPATH="${REPO_ROOT}/src"
  BASE_CONFIG="${BASE_CONFIG}"
  PERMANENT_ROOT="${PERMANENT_ROOT}"
  RUN_LABEL="${RUN_LABEL}"
  EVAL_MAX_SAMPLES="${EVAL_MAX_SAMPLES}"
  EVAL_BATCH_SIZE="${EVAL_BATCH_SIZE}"
  ROLLOUT_SAMPLES="${ROLLOUT_SAMPLES}"
)
# OUT_ROOT is forwarded only when the caller set it to a non-empty value;
# otherwise it is left out of the env(1) arguments entirely.
if [[ -n "${OUT_ROOT:-}" ]]; then
  CMD_ENV+=(OUT_ROOT="${OUT_ROOT}")
fi

env "${CMD_ENV[@]}" accelerate launch \
  --config_file "${ACCELERATE_CONFIG}" \
  --num_processes "${NUM_PROCESSES}" \
  src/eval_permanent_checkpoints.py

# Reached only when the launch exited with status 0 (errexit is active).
printf '%s\n' "Done. Summary and accuracy_vs_avg_cot_words.svg under OUT_ROOT (default: <PERMANENT_ROOT>/eval_permanent)."