#!/bin/bash
#
# Evaluate model checkpoints with lm_eval (vllm backend).
#
# Usage:
#   <this script>              evaluate every model listed in MODELS
#   <this script> <model>      evaluate only the named model
#
# Results and an eval.log are written to $RESULTS_DIR/<model>/.

# --- Python environment -----------------------------------------------------
# Strict mode (set -euo pipefail) is intentionally not enabled here: conda's
# activation scripts are not guaranteed to be nounset-clean, and the
# evaluation loop is meant to keep going after a single task fails.
CONDA_ROOT="/home/chengyingying/miniconda3"
CONDA_ENV="pangu1b_eval"

if [[ -r "$CONDA_ROOT/etc/profile.d/conda.sh" ]]; then
  # shellcheck source=/dev/null
  source "$CONDA_ROOT/etc/profile.d/conda.sh"
  if ! conda activate "$CONDA_ENV"; then
    echo "WARNING: 'conda activate $CONDA_ENV' failed; relying on PATH only." >&2
  fi
else
  echo "WARNING: $CONDA_ROOT/etc/profile.d/conda.sh is not readable; relying on PATH only." >&2
fi

# Put the environment's bin directory first on PATH even if activation did
# not take effect, so the environment's executables are found first.
export PATH="$CONDA_ROOT/envs/$CONDA_ENV/bin:$PATH"

# Fail fast instead of emitting "command not found" once per model and task.
if ! command -v lm_eval >/dev/null 2>&1; then
  echo "ERROR: lm_eval not found on PATH (expected in $CONDA_ROOT/envs/$CONDA_ENV/bin)." >&2
  exit 1
fi

# --- Paths ------------------------------------------------------------------
# Absolute path of the directory containing this script.
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd)"
# Root under which one sub-directory per evaluated model is created.
RESULTS_DIR="/home/chengyingying/pangu_pretrain/evaluate_benchmarks/results"
# Root directory holding the checkpoints referenced by MODELS.
CKPT_DIR="/home/chengyingying/pangu_pretrain/checkpoints"

# --- Benchmark configuration ------------------------------------------------
# Comma-separated list of benchmark tasks to evaluate. Keep it in sync with
# the task list used by the evaluation loop.
TASKS="mmlu,gsm8k,minerva_math,hellaswag,arc_easy,arc_challenge"

# Number of few-shot examples per task (the value given to --num_fewshot).
# "piqa" has an entry here but is not part of TASKS.
declare -A TASK_SHOTS=(
  ["mmlu"]="5"
  ["gsm8k"]="5"
  ["minerva_math"]="4"
  ["hellaswag"]="10"
  ["arc_easy"]="25"
  ["arc_challenge"]="25"
  ["piqa"]="0"
)

# --- Models -----------------------------------------------------------------
# Model name -> checkpoint path under $CKPT_DIR. The name is what the command
# line accepts and is also used as the results sub-directory name.
declare -A MODELS=(
  ["bf16"]="$CKPT_DIR/bf16/final"
  ["cts"]="$CKPT_DIR/cts/final"
  ["max_quant"]="$CKPT_DIR/max_quant/final"
  ["bf16_lr1e5"]="$CKPT_DIR/bf16_lr1e5/final"
  ["max_quant_lr1e5"]="$CKPT_DIR/max_quant_lr1e5/final"
)

# --- Target selection -------------------------------------------------------
#######################################
# Decide which models to evaluate.
# Globals:   MODELS (read), TARGETS (written)
# Arguments: zero or more model names; with no (or an empty) first argument
#            every key of MODELS is selected.
# Outputs:   an error message on stderr when a name is not a key of MODELS
# Returns:   0 on success, 1 if any requested name is unknown
#######################################
select_targets() {
  local name

  if [[ -z "${1:-}" ]]; then
    TARGETS=("${!MODELS[@]}")
    return 0
  fi

  for name in "$@"; do
    # The empty-name test comes first: an empty associative-array subscript
    # is itself an error in bash.
    if [[ -z "$name" || -z "${MODELS[$name]:-}" ]]; then
      echo "Unknown model: $name. Available: ${!MODELS[*]}" >&2
      return 1
    fi
  done
  TARGETS=("$@")
}

select_targets "$@" || exit 1

# --- Evaluation -------------------------------------------------------------
#######################################
# Run every task in TASKS against one model, one lm_eval invocation per task.
# A failing task does not stop the remaining tasks, but it is reported on
# stderr and reflected in the return status.
# Globals:   MODELS, RESULTS_DIR, TASKS, TASK_SHOTS (all read)
# Arguments: $1 - model name (a key of MODELS)
# Outputs:   lm_eval output on stdout, also appended to
#            $RESULTS_DIR/<model>/eval.log
# Returns:   0 if every task succeeded, 1 otherwise
#######################################
evaluate_model() {
  local model_name=$1
  local model_path="${MODELS[$model_name]:-}"
  local out_dir="$RESULTS_DIR/$model_name"
  local task shots status failed=0
  local -a task_list=()

  if [[ -z "$model_path" ]]; then
    echo "ERROR: no checkpoint path configured for model '$model_name'." >&2
    return 1
  fi
  if ! mkdir -p "$out_dir"; then
    echo "ERROR: cannot create output directory $out_dir." >&2
    return 1
  fi

  echo "======================================================"
  echo "Evaluating: $model_name ($model_path)"
  echo "======================================================"

  # TASKS is the single source of truth for the task list.
  IFS=',' read -r -a task_list <<< "${TASKS:-}"
  if (( ${#task_list[@]} == 0 )); then
    echo "ERROR: TASKS is empty; nothing to evaluate for $model_name." >&2
    return 1
  fi

  for task in "${task_list[@]}"; do
    shots="${TASK_SHOTS[$task]:-}"
    if [[ -z "$shots" ]]; then
      echo "ERROR: no few-shot count configured for task '$task'; skipping." >&2
      failed=1
      continue
    fi

    # --apply_chat_template is deliberately not passed. Omitting it leaves
    # chat templating off, whereas an explicit "False" argument is presumably
    # received by lm-eval as a non-empty string (i.e. treated as enabled or
    # as a template name) -- NOTE(review): confirm against the installed
    # lm-eval version.
    lm_eval \
      --model vllm \
      --model_args "pretrained=$model_path,dtype=bfloat16,trust_remote_code=True,tensor_parallel_size=1" \
      --tasks "$task" \
      --num_fewshot "$shots" \
      --output_path "$out_dir" \
      --log_samples \
      2>&1 | tee -a "$out_dir/eval.log"
    # The pipeline's own status is tee's; lm_eval's is the first stage.
    status=${PIPESTATUS[0]}
    if (( status != 0 )); then
      echo "ERROR: lm_eval failed for model '$model_name', task '$task' (exit $status)." >&2
      failed=1
    fi
  done

  if (( failed != 0 )); then
    echo "Finished with errors: $model_name → $out_dir" >&2
    return 1
  fi
  echo "Done: $model_name → $out_dir"
  return 0
}

EVAL_FAILURES=0
for MODEL_NAME in "${TARGETS[@]}"; do
  evaluate_model "$MODEL_NAME" || EVAL_FAILURES=$((EVAL_FAILURES + 1))
done

echo ""
if (( EVAL_FAILURES > 0 )); then
  echo "Evaluations finished with failures for $EVAL_FAILURES model(s); check the eval.log files under $RESULTS_DIR." >&2
  exit 1
fi
echo "All evaluations complete. Run compare_results.py to see summary."