#!/usr/bin/env bash
# Run all evals after the production training jobs finish.
#
# This script orchestrates the post-training eval sweep:
#   1. Policy baseline (deterministic, no LLM)
#   2. Base model eval per size (Qwen3-0.6B / 1.7B / 4B, untrained)
#   3. Trained model eval per size (3 trained checkpoints)
#
# Outputs go to outputs/eval_<mode>_<size>.json — exactly the layout consumed
# by scripts/make_plots.py.
# NOTE(review): the placeholders in that file name were missing from the copy
# this header was restored from ("eval__.json"); <mode>/<size> is a presumed
# reconstruction — confirm against scripts/make_plots.py.
#
# Required env (one HF write-token with read access to the trained model repos):
#   HF_TOKEN       token for downloading the trained models from HF Hub
#
# Optional env:
#   ENV_BASE_URL   default https://agarwalanu3103-clarify-rl.hf.space
#   API_BASE_URL   default https://router.huggingface.co/v1 (HF Inference Router)
#   LIMIT          max scenarios to evaluate (default 100, set to 300 for full)
#   TIMEOUT_S      per-scenario timeout (default 60)
#   SKIP_POLICY    "1" to skip the policy baseline (already have it)
#   SKIP_BASE      "1" to skip base-model evals
#   SKIP_TRAINED   "1" to skip trained-model evals
#
# Usage:
#   HF_TOKEN=hf_xxx ./scripts/run_post_train_eval.sh
#
# Trained model repo names (these are the OUTPUT_DIRs from launch_all.sh):
#   <hf-username>/clarify-rl-grpo-qwen3-0-6b
#   <hf-username>/clarify-rl-grpo-qwen3-1-7b
#   <hf-username>/clarify-rl-grpo-qwen3-4b
#
# Set MODEL_0_6B / MODEL_1_7B / MODEL_4B env vars if your usernames differ.

# Strict mode: exit on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# Hard requirement: ':?' aborts the script with this message when HF_TOKEN is
# unset or empty.
: "${HF_TOKEN:?HF_TOKEN required (read access to trained model repos)}"

# Optional settings: ':=' assigns the default only when the variable is unset
# or empty, so values supplied by the caller's environment always win.
: "${ENV_BASE_URL:=https://agarwalanu3103-clarify-rl.hf.space}"
: "${API_BASE_URL:=https://router.huggingface.co/v1}"
: "${LIMIT:=100}"
: "${TIMEOUT_S:=60}"
: "${SKIP_POLICY:=0}"
: "${SKIP_BASE:=0}"
: "${SKIP_TRAINED:=0}"

# Defaults point every model size at the agarwalanu3103 account; override the
# MODEL_* variables for runs that were pushed under a different username.
: "${MODEL_0_6B:=agarwalanu3103/clarify-rl-grpo-qwen3-0-6b}"
: "${MODEL_1_7B:=agarwalanu3103/clarify-rl-grpo-qwen3-1-7b}"
: "${MODEL_4B:=agarwalanu3103/clarify-rl-grpo-qwen3-4b}"

# Result directory, resolved relative to the current working directory.
OUT_DIR="outputs"
mkdir -p "$OUT_DIR"

# NOTE(review): the source this preamble was restored from continues with a
# statement beginning 'cat <' whose remainder is not visible (most likely a
# here-document). It is intentionally not reconstructed here; restore the rest
# of the script from the upstream file before relying on this one.