export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mkdir -p logs TASKS=( "gsm8k:128" "math500:128" "aime24:30" "aime25:30" "humaneval:164" "mbpp:128" "livecodebench:128" "swe-bench:128" "mt-bench:80" "alpaca:128" ) for task in "${TASKS[@]}"; do IFS=':' read -r DATASET_NAME MAX_SAMPLES <<< "$task" echo "========================================================" echo "Running Benchmark: $DATASET_NAME with $MAX_SAMPLES samples" echo "========================================================" torchrun \ --nproc_per_node=8 \ --master_port=29600 \ benchmark.py \ --dataset "$DATASET_NAME" \ --max-samples "$MAX_SAMPLES" \ --model-name-or-path Qwen/Qwen3-4B \ --draft-name-or-path z-lab/Qwen3-4B-DFlash-b16 \ --max-new-tokens 2048 \ --temperature 0.0 \ 2>&1 | tee "logs/${DATASET_NAME}.log" done