File size: 2,619 Bytes
2d67aa6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash

# DFlash evaluation: compare baseline vs multi-step denoising (8 GPU data parallel)
#
# Usage:
#   bash examples/run_eval_dflash.sh          # run step=1,2,3 all
#   bash examples/run_eval_dflash.sh 2        # only step=2
#
# Each GPU loads target+draft model independently, samples are split across GPUs.

# Fail fast: abort on errors and unset variables; with pipefail, a failing
# `torchrun ... | tee` pipeline is reported as a failure instead of tee's 0.
set -euo pipefail

# Resolve the repo root from this script's own location (robust to spaces).
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")

# Activate conda env. Guarded so a missing conda install produces a clear
# warning instead of an opaque abort under `set -e`.
CONDA_SH=/workspace/miniconda3/etc/profile.d/conda.sh
if [ -f "$CONDA_SH" ]; then
    source "$CONDA_SH"
    conda activate specforge
else
    echo "WARNING: $CONDA_SH not found; using current environment" >&2
fi

# ${PYTHONPATH:-} keeps `set -u` happy when PYTHONPATH is not already set.
export PYTHONPATH=$ROOT_DIR:${PYTHONPATH:-}
export HF_DATASETS_CACHE=/workspace/hanrui/datasets
export HF_HOME=/workspace/hanrui/cache/specforge_hf_home
export HF_DATASETS_OFFLINE=1
export HF_HUB_OFFLINE=1
export TRANSFORMERS_OFFLINE=1

# ============ Configuration ============
# Every knob below is overridable via an environment variable of the same name.
NUM_GPUS=${NUM_GPUS:-8}
TARGET_MODEL=${TARGET_MODEL:-"/workspace/models/Qwen3-8B"}
DRAFT_MODEL=${DRAFT_MODEL:-"/workspace/models/Qwen3-8B-DFlash-b16"}
DATASET=${DATASET:-"math500"}
MAX_SAMPLES=${MAX_SAMPLES:-500}
MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-512}
TEMPERATURE=${TEMPERATURE:-0.0}
OUTPUT_DIR=${OUTPUT_DIR:-"$ROOT_DIR/results/dflash_eval"}
# ========================================

mkdir -p -- "$OUTPUT_DIR"

#######################################
# Run one DFlash evaluation sweep across all GPUs via torchrun.
# Globals:   NUM_GPUS, ROOT_DIR, TARGET_MODEL, DRAFT_MODEL, DATASET,
#            MAX_SAMPLES, MAX_NEW_TOKENS, TEMPERATURE, OUTPUT_DIR (all read)
# Arguments: $1 - number of denoise steps for this run
# Outputs:   eval output to stdout and to $OUTPUT_DIR/<dataset>_steps<N>.log
# Returns:   torchrun's exit status (not tee's)
#######################################
run_eval() {
    local steps=$1
    echo ""
    echo "============================================"
    echo "  Running DFlash eval: denoise_steps=$steps"
    echo "  GPUs: $NUM_GPUS, Samples: $MAX_SAMPLES"
    echo "============================================"

    torchrun \
        --standalone \
        --nproc_per_node "$NUM_GPUS" \
        "$ROOT_DIR/scripts/eval_dflash.py" \
        --target-model-path "$TARGET_MODEL" \
        --draft-model-path "$DRAFT_MODEL" \
        --dataset "$DATASET" \
        --max-samples "$MAX_SAMPLES" \
        --max-new-tokens "$MAX_NEW_TOKENS" \
        --num-denoise-steps "$steps" \
        --temperature "$TEMPERATURE" \
        --output-file "$OUTPUT_DIR/${DATASET}_steps${steps}.json" \
        2>&1 | tee "$OUTPUT_DIR/${DATASET}_steps${steps}.log"
    # `tee` is the last pipeline stage, so without this the function would
    # report success even when torchrun failed; propagate torchrun's status.
    return "${PIPESTATUS[0]}"
}

# Dispatch: run only the requested step count if given, otherwise sweep
# steps 1-3 and print a summary table. ${1:-} is safe under `set -u`.
if [ -n "${1:-}" ]; then
    run_eval "$1"
else
    run_eval 1
    run_eval 2
    run_eval 3

    echo ""
    echo "============================================"
    echo "  All evaluations complete!"
    echo "  Results in: $OUTPUT_DIR/"
    echo "============================================"
    echo ""
    echo "Quick comparison:"
    for f in "$OUTPUT_DIR/${DATASET}"_steps*.json; do
        [ -e "$f" ] || continue   # no matches: skip the literal glob pattern
        # Extract N from "..._stepsN.json" with parameter expansion
        # (portable; avoids the GNU-grep-only `grep -oP`).
        steps=${f##*_steps}
        steps=${steps%.json}
        # Pass the path as argv rather than interpolating it into the code
        # string, so unusual filenames cannot break the Python snippet.
        tau=$(python -c 'import json, sys; d = json.load(open(sys.argv[1])); print("%.2f" % d["results"]["avg_tau"])' "$f" 2>/dev/null || echo "N/A")
        echo "  steps=$steps  avg_tau=$tau"
    done
fi