export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

mkdir -p logs

TASKS=(
  "gsm8k:128"
  "math500:128"
  "aime24:30"
  "aime25:30"
  "humaneval:164"
  "mbpp:128"
  "livecodebench:128"
  "swe-bench:128"
  "mt-bench:80"
  "alpaca:128"
)

for task in "${TASKS[@]}"; do
  IFS=':' read -r DATASET_NAME MAX_SAMPLES <<< "$task"

  echo "========================================================"
  echo "Running Benchmark: $DATASET_NAME with $MAX_SAMPLES samples"
  echo "========================================================"

  torchrun \
    --nproc_per_node=8 \
    --master_port=29600 \
    benchmark.py \
    --dataset "$DATASET_NAME" \
    --max-samples "$MAX_SAMPLES" \
    --model-name-or-path Qwen/Qwen3-4B \
    --draft-name-or-path z-lab/Qwen3-4B-DFlash-b16 \
    --max-new-tokens 2048 \
    --temperature 0.0 \
    2>&1 | tee "logs/${DATASET_NAME}.log"

done