File size: 2,528 Bytes
5342d35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/bash

LOG_FILE="/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/logs/main_loop_log.txt"
COMBINED_LOG_FILE="/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/logs"

# List of HuggingFace-style model directories
models=(

     "/mnt/data8tb/Documents/llm/llm_models/deepseek-ai_deepseek-moe-16b-base"
     "/mnt/data8tb/Documents/llm/llm_models/Deepseek-ai_deepseek-moe-16b-chat"
     "/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen-7B-Chat"
     "/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen-7B"
     "/mnt/data8tb/Documents/llm/llm_models/baichuan-inc_Baichuan2-13B-Chat"
     "/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-14B-Instruct"
     "/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen3-14B"


)

# Benchmark task (customize if needed)
task="mmlu,arc_challenge,gsm8k,bbh,truthfulqa,piqa,hellaswag,winogrande,boolq,drop,triviaqa,\
nq_open,sciq,qnli,gpqa,openbookqa,anli_r1,anli_r2,anli_r3"

# Output directory
output_dir="/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/results"

# CUDA device
device="cuda:0"

# Loop through models
for model_path in "${models[@]}"; do

    model_name=$(basename "$model_path")
    ERR_FILE="$COMBINED_LOG_FILE/${model_name}_stderr.log"

    {
    gpu_log_file="/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/gpu_usage/${model_name}_gpu_usage.csv"

    echo "==== New Run: $(date) ====" >> "$gpu_log_file"
    nvidia-smi --query-gpu=timestamp,utilization.gpu,memory.used --format=csv -l 60 >> "$gpu_log_file" &
    SMI_PID=$!
    trap "kill $SMI_PID" EXIT  
    START=$(date +%s)

    echo "🔥 Starting benchmark for $model_name"

    output_file="${output_dir}"

    lm_eval \
        --model hf \
        --model_args pretrained="${model_path}",trust_remote_code=True,device_map=auto \
        --tasks "${task}" \
        --device "${device}" \
        --batch_size auto\
        --output_path "${output_file}" 

    
    END=$(date +%s)
    DURATION=$((END - START))

    HOURS=$((DURATION / 3600))
    MINUTES=$(((DURATION % 3600) / 60))
    SECONDS=$((DURATION % 60))

    echo "$model_name: ${HOURS}h ${MINUTES}m ${SECONDS}s"
    kill $SMI_PID

    # Check if previous command failed
    if [ $? -ne 0 ]; then
        echo "❌ Benchmark failed for $model_name. Skipping to next..."
        continue
    fi

    echo "✅ Benchmark completed for $model_name"
    echo ""
    } > >(tee -a "$LOG_FILE" | tee -a "$ERR_FILE") 2> >(tee -a "$ERR_FILE" >&2)
done

echo "🏁 ALL DONE!"