#!/bin/bash
# Mirror of run_eval_v10_r2_ckpt956.sh, evaluating the v11 over-prediction
# variant. Picks the highest-numbered checkpoint under
# outputs_forensics/v11_overpred, runs evaluate_forensics.py as 8 parallel
# ranks (one per GPU index 0-7), then runs evaluate_grounding_metrics.py on
# the combined output directory.
# Settings: cot=false, max_new_tokens=64, temp=0.0, 8 ranks.
#
# Environment:
#   CKPT_ROOT  Directory holding checkpoint-<N> subdirectories
#              (default: outputs_forensics/v11_overpred, relative to the
#              project directory this script cd's into).
#
# Exit status: non-zero if no checkpoint is found, if any rank fails, or if
# the metrics step fails.

# -e: abort on unhandled errors; -u: unset variables are errors;
# pipefail: the final `python ... | tee` fails when python fails, not only
# when tee fails.
set -euo pipefail

# Lowercase on purpose: this is a script-local constant, not an environment
# variable handed to the Python processes.
readonly num_ranks=8

cd /mnt/local-fast/zhangt/forensics_grpo

CKPT_ROOT="${CKPT_ROOT:-outputs_forensics/v11_overpred}"

# Select the checkpoint directory with the largest numeric suffix. Globbing
# (instead of parsing `ls` output) keeps paths containing whitespace intact.
MODEL=""
best_step=-1
for dir in "${CKPT_ROOT}"/checkpoint-*; do
  # Also skips the literal pattern left behind when the glob matches nothing.
  [[ -d "$dir" ]] || continue
  step=${dir##*-}
  # Ignore suffixes that are not plain integers (e.g. checkpoint-final).
  [[ "$step" =~ ^[0-9]+$ ]] || continue
  # 10# forces base 10 so a suffix with a leading zero is not read as octal.
  if (( 10#$step > best_step )); then
    best_step=$(( 10#$step ))
    MODEL=$dir
  fi
done

if [[ -z "$MODEL" ]]; then
  echo "No checkpoint found under ${CKPT_ROOT}/checkpoint-*" >&2
  exit 1
fi

# checkpoint-956 -> ckpt956
ckpt_base=${MODEL##*/}
CKPT_TAG=${ckpt_base/checkpoint-/ckpt}
OUT="eval_v11_overpred_${CKPT_TAG}"
mkdir -p "$OUT/logs"
echo "Evaluating $MODEL -> $OUT"

export PATH="/mnt/local-fast/zhangt/torch_env/bin:$PATH"
# Append the previous value only when it is non-empty: a trailing ':' would
# add an empty entry, which the dynamic linker treats as the current
# directory.
export LD_LIBRARY_PATH="/opt/conda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PYTHONPATH=".${PYTHONPATH:+:${PYTHONPATH}}"

# Fan out one background process per rank. Each process is restricted to a
# single GPU through CUDA_VISIBLE_DEVICES, so --device is always 0
# (presumably the index within the visible set -- confirm against
# evaluate_forensics.py).
pids=()
for (( rank = 0; rank < num_ranks; rank++ )); do
  CUDA_VISIBLE_DEVICES=$rank python evaluate_forensics.py \
    --model_path "$MODEL" \
    --rank "$rank" --world_size "$num_ranks" --device 0 \
    --out_dir "$OUT" \
    --cot false --max_new_tokens 64 --temperature 0.0 \
    > "$OUT/logs/rank_${rank}.log" 2>&1 &
  pids+=("$!")
done

# Barrier. A bare `wait` returns 0 even when jobs failed, and `set -e` does
# not apply to background jobs, so collect every rank's status explicitly.
# All ranks are reaped before deciding, so every failure gets reported.
failed=0
for rank in "${!pids[@]}"; do
  if ! wait "${pids[rank]}"; then
    echo "rank ${rank} failed; see $OUT/logs/rank_${rank}.log" >&2
    failed=1
  fi
done
if (( failed )); then
  exit 1
fi
echo "all ${num_ranks} ranks done"

python evaluate_grounding_metrics.py --out_dir "$OUT" \
  | tee "$OUT/grounding_metrics.txt"