File size: 1,352 Bytes
33569f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/bin/bash
# Mirror of run_eval_v10_r2_ckpt956.sh, but evaluates the latest checkpoint
# of the strict single-span TempSamp-R1 baseline. cot=false, max_new_tokens=64,
# temp=0.0, 8 ranks.
set -e

cd /mnt/local-fast/zhangt/forensics_grpo

CKPT_ROOT=/mnt/local-fast/zhangt/baselines/tempsamp_r1/logs/TempSampR1_single_span_forensics_7B_8gpu_4ep

# Pick the checkpoint-<N>/ subdirectory with the largest numeric step N.
# The shell glob replaces parsing of `ls` output: the trailing slash limits
# matches to directories, and entries whose suffix is not a plain number
# (e.g. checkpoint-tmp) are skipped instead of being sorted as step 0.
MODEL=""
best_step=-1
for cand in "${CKPT_ROOT}"/checkpoint-*/; do
  # When nothing matches, the pattern is left as a literal string; skip it.
  [[ -d "$cand" ]] || continue
  cand=${cand%/}
  step=${cand##*-}
  [[ "$step" =~ ^[0-9]+$ ]] || continue
  # 10# forces base 10 so a zero-padded step such as 0090 is not read as octal.
  if (( 10#$step > best_step )); then
    best_step=$(( 10#$step ))
    MODEL=$cand
  fi
done
if [[ -z "$MODEL" ]]; then
  echo "No checkpoint found under ${CKPT_ROOT}/checkpoint-*" >&2
  exit 1
fi

# checkpoint-956 -> ckpt956; the tag names the per-checkpoint output directory.
model_name=${MODEL##*/}
CKPT_TAG=${model_name/checkpoint-/ckpt}
OUT=eval_tempsamp_single_span_${CKPT_TAG}
mkdir -p "$OUT/logs"
echo "Evaluating $MODEL  ->  $OUT"

# Put the project's Python environment first on PATH, add the conda libraries
# to the dynamic-linker search path, and make the current directory importable.
# ${VAR:+:${VAR}} appends the previous value only when it is non-empty, so an
# unset variable no longer leaves a trailing ':' behind. For LD_LIBRARY_PATH
# an empty entry is treated as the current working directory.
export PATH="/mnt/local-fast/zhangt/torch_env/bin:$PATH"
export LD_LIBRARY_PATH="/opt/conda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PYTHONPATH=".${PYTHONPATH:+:${PYTHONPATH}}"

# Fan out one evaluate_forensics.py process per rank, each restricted to a
# single GPU through CUDA_VISIBLE_DEVICES (hence, presumably, --device 0), and
# each writing to its own log file under $OUT/logs.
WORLD_SIZE=8
pids=()
for (( R = 0; R < WORLD_SIZE; R++ )); do
  CUDA_VISIBLE_DEVICES=$R python evaluate_forensics.py \
      --model_path "$MODEL" \
      --rank "$R" --world_size "$WORLD_SIZE" --device 0 \
      --out_dir "$OUT" \
      --cot false --max_new_tokens 64 --temperature 0.0 \
      > "$OUT/logs/rank_${R}.log" 2>&1 &
  pids+=("$!")
done

# A bare `wait` always returns 0 and `set -e` does not see background
# failures, so wait on every PID and collect the ranks that exited non-zero.
failed=()
for R in "${!pids[@]}"; do
  if ! wait "${pids[$R]}"; then
    failed+=("$R")
  fi
done
if (( ${#failed[@]} > 0 )); then
  echo "rank(s) ${failed[*]} failed; see $OUT/logs/rank_<rank>.log" >&2
  exit 1
fi
echo "all ${WORLD_SIZE} ranks done"

# Run evaluate_grounding_metrics.py on the output directory, showing its
# stdout on the terminal and saving a copy to grounding_metrics.txt.
# pipefail makes the pipeline report the Python script's exit status; without
# it the status is tee's, and a failed metrics run would still exit 0.
set -o pipefail
python evaluate_grounding_metrics.py --out_dir "$OUT" | tee "$OUT/grounding_metrics.txt"