File size: 3,263 Bytes
0d00bbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/bin/bash
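# Evaluate the accuracy of the model variants listed in MODELS (below) on the
# benchmarks designated by the competition.
#
# Usage:
#   bash run_eval.sh              # evaluate every model in MODELS
#   bash run_eval.sh bf16         # evaluate only the named model
#
# Results are saved to results/<model_name>/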

# ── Environment and paths ─────────────────────────────────────────────────────
# Load conda's shell hooks, switch to the evaluation environment, and put that
# environment's bin/ directory at the front of PATH.
source /home/chengyingying/miniconda3/etc/profile.d/conda.sh
conda activate pangu1b_eval
PATH="/home/chengyingying/miniconda3/envs/pangu1b_eval/bin:${PATH}"
export PATH

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Root directory of the checkpoints referenced by MODELS.
CKPT_DIR="/home/chengyingying/pangu_pretrain/checkpoints"
# Root directory for evaluation output; one subdirectory per model is used.
RESULTS_DIR="/home/chengyingying/pangu_pretrain/evaluate_benchmarks/results"

# ── Benchmark configuration ───────────────────────────────────────────────────
# Comma-separated task names (standard lm-eval identifiers).
TASKS="mmlu,gsm8k,minerva_math,hellaswag,arc_easy,arc_challenge"

# Few-shot count per task, following the convention of each benchmark's paper.
# piqa has an entry here although it is not listed in TASKS.
declare -A TASK_SHOTS=()
TASK_SHOTS["mmlu"]="5"
TASK_SHOTS["gsm8k"]="5"
# MATH500 is evaluated through minerva_math (4-shot CoT).
TASK_SHOTS["minerva_math"]="4"
TASK_SHOTS["hellaswag"]="10"
TASK_SHOTS["arc_easy"]="25"
TASK_SHOTS["arc_challenge"]="25"
TASK_SHOTS["piqa"]="0"

# ── Models to evaluate ────────────────────────────────────────────────────────
# Maps a short model name to its checkpoint directory under $CKPT_DIR.
#
# Earlier model set, kept commented out for reference:
#   ["bf16"]="$CKPT_DIR/bf16/final"
#   ["cts"]="$CKPT_DIR/cts/final"
#   ["dts_exp"]="$CKPT_DIR/dts_exp/final"
declare -A MODELS=()
MODELS["bf16"]="$CKPT_DIR/bf16/final"
MODELS["cts"]="$CKPT_DIR/cts/final"
MODELS["max_quant"]="$CKPT_DIR/max_quant/final"
MODELS["bf16_lr1e5"]="$CKPT_DIR/bf16_lr1e5/final"
MODELS["max_quant_lr1e5"]="$CKPT_DIR/max_quant_lr1e5/final"
# Currently disabled entry:
# MODELS["dts_exp"]="$CKPT_DIR/dts_exp_2/final"

# ── Target selection ──────────────────────────────────────────────────────────
# select_targets [MODEL_NAME...]
#   Fills the global TARGETS array with the names of the models to evaluate.
#     - no argument (or an empty first argument): every key of MODELS
#     - one or more names: exactly those models, in the order given
#   Globals:  MODELS (read), TARGETS (written)
#   Outputs:  an "Unknown model" diagnostic on stderr for an invalid name
#   Exits:    status 1 if any requested name is not a key of MODELS
select_targets() {
    local name

    if [[ -z "${1:-}" ]]; then
        TARGETS=("${!MODELS[@]}")
        return 0
    fi

    # Validate every name before touching TARGETS so a typo in a later
    # argument is reported before any evaluation starts.
    for name in "$@"; do
        # The emptiness check comes first: an empty subscript on an
        # associative array is itself an error in bash.
        if [[ -z "$name" || -z "${MODELS[$name]:-}" ]]; then
            echo "Unknown model: $name. Available: ${!MODELS[*]}" >&2
            exit 1
        fi
    done
    TARGETS=("$@")
}

select_targets "$@"

# ── Per-model evaluation ──────────────────────────────────────────────────────
# "<model>/<task>" entries for every run that did not finish successfully.
FAILED_RUNS=()

# evaluate_model MODEL_NAME
#   Runs lm_eval once per task in the comma-separated TASKS list against the
#   checkpoint registered under MODEL_NAME in MODELS. Each run uses the
#   few-shot count from TASK_SHOTS, writes results to $RESULTS_DIR/MODEL_NAME
#   and appends its combined stdout/stderr to eval.log in that directory.
#   A failing task does not stop the remaining tasks.
#   Globals:  MODELS, RESULTS_DIR, TASKS, TASK_SHOTS (read);
#             FAILED_RUNS (appended to)
#   Returns:  0 if every task succeeded, 1 otherwise
evaluate_model() {
    local model_name=$1
    local model_path="${MODELS[$model_name]}"
    local out_dir="$RESULTS_DIR/$model_name"
    local -a task_list=()
    local task shots status
    local failures=0

    if ! mkdir -p "$out_dir"; then
        echo "Cannot create output directory: $out_dir" >&2
        FAILED_RUNS+=("$model_name/*")
        return 1
    fi

    echo "======================================================"
    echo "Evaluating: $model_name  ($model_path)"
    echo "======================================================"

    # TASKS is the single source of truth for the task list.
    IFS=',' read -r -a task_list <<< "$TASKS"

    for task in "${task_list[@]}"; do
        # Skip empty fields (e.g. a trailing comma in TASKS).
        [[ -n "$task" ]] || continue

        shots="${TASK_SHOTS[$task]:-}"
        if [[ -z "$shots" ]]; then
            echo "No few-shot count configured for task: $task" >&2
            FAILED_RUNS+=("$model_name/$task")
            failures=$((failures + 1))
            continue
        fi

        # --apply_chat_template is deliberately not passed. Per the lm-eval
        # CLI documentation no chat template is applied by default, while a
        # value following the flag is interpreted as a template name, so the
        # former "--apply_chat_template False" did not reliably disable
        # templating. NOTE(review): confirm against the installed version.
        lm_eval \
            --model vllm \
            --model_args "pretrained=$model_path,dtype=bfloat16,trust_remote_code=True,tensor_parallel_size=1" \
            --tasks "$task" \
            --num_fewshot "$shots" \
            --output_path "$out_dir" \
            --log_samples \
            2>&1 | tee -a "$out_dir/eval.log"
        # The pipeline's own status is tee's; take lm_eval's explicitly.
        status=${PIPESTATUS[0]}

        if (( status != 0 )); then
            echo "lm_eval failed for $model_name/$task (exit $status)" >&2
            FAILED_RUNS+=("$model_name/$task")
            failures=$((failures + 1))
        fi
    done

    if (( failures > 0 )); then
        echo "Finished $model_name with $failures failed task(s); see $out_dir/eval.log" >&2
        return 1
    fi

    echo "Done: $model_name -> $out_dir"
}

for MODEL_NAME in "${TARGETS[@]}"; do
    evaluate_model "$MODEL_NAME"
done

echo ""
if (( ${#FAILED_RUNS[@]} > 0 )); then
    # Report failures and exit non-zero instead of claiming success.
    echo "Evaluations finished with failures: ${FAILED_RUNS[*]}" >&2
    exit 1
fi
echo "All evaluations complete. Run compare_results.py to see summary."