#!/usr/bin/env bash
# bench_hccl.sh — HCCL parameter-matrix benchmark for TG
#
# Sweeps every HCCL_ALGO × HCCL_BUFFSIZE combination, launches each one
# N_RUNS times and keeps the best (numerically highest) value parsed from the
# launcher's "decode :" output line, to find the best configuration.
# A fixed prompt, --seed 0 and a fixed --n-predict (default 150) keep the
# runs comparable.
#
# Environment overrides: MODEL_DIR, TP_SIZE, N_PREDICT, N_RUNS, PROMPT.
# Results are written as CSV to /tmp/bench_hccl_results.csv.

# NOTE: deliberately no `set -e` / `pipefail` — a failed or unparsable run is
# recorded as 0 instead of aborting the whole matrix.
set -u

# All paths below are relative to the parent of the script directory; if the
# cd fails every run would silently record 0, so stop right away.
cd "$(dirname -- "$0")/.." || {
  echo "bench_hccl.sh: cannot cd to $(dirname -- "$0")/.." >&2
  exit 1
}

MODEL="${MODEL_DIR:-/path/to/Qwen3-235B-A22B-Instruct-2507-BF16}"
BIN="./build/qwen3-moe-aclnn"
LAUNCH="./scripts/tp_launch.sh"
TP="${TP_SIZE:-16}"
N_PREDICT="${N_PREDICT:-150}"
N_RUNS="${N_RUNS:-2}"
PROMPT="${PROMPT:-The history of artificial intelligence spans several decades and}"
VOCAB="tokenizer_data/vocab.bin"

OUT=/tmp/bench_hccl_results.csv

# With zero runs there is nothing to take a "best" of, and a non-numeric
# value would break the run loop, so reject both up front.
if ! [[ "$N_RUNS" =~ ^[1-9][0-9]*$ ]]; then
  echo "bench_hccl.sh: N_RUNS must be a positive integer (got '$N_RUNS')" >&2
  exit 2
fi

echo "algo,buffsize,runs,best_tgs" > "$OUT"

#######################################
# Benchmark one HCCL_ALGO / HCCL_BUFFSIZE combination.
# Runs the launcher N_RUNS times, parses one value per run, appends a CSV row
# "algo,buffsize,run1|run2|...,best" to $OUT and prints a one-line summary.
# Globals:
#   LAUNCH TP BIN MODEL PROMPT N_PREDICT VOCAB N_RUNS OUT (read)
#   HCCL_ALGO HCCL_BUFFSIZE (exported / unset)
# Arguments:
#   $1 - HCCL_ALGO value; empty means "(auto)"
#   $2 - HCCL_BUFFSIZE value
# Outputs:
#   one human-readable line on stdout; one CSV row appended to $OUT
#######################################
run_one() {
  local algo="$1" buf="$2"
  local tgs=()
  local run out best joined

  # The environment is the same for every run of this combination, so set it
  # once. For "(auto)" the variable is removed instead of exported empty.
  # NOTE(review): assumes HCCL picks its default algorithm when HCCL_ALGO is
  # unset — confirm against the HCCL documentation.
  export HCCL_BUFFSIZE="$buf"
  if [[ -n "$algo" ]]; then
    export HCCL_ALGO="$algo"
  else
    unset HCCL_ALGO
  fi

  for ((run = 1; run <= N_RUNS; run++)); do
    # Take the third-from-last field of the first "decode :" line. awk keeps
    # reading to end of input so the launcher never gets SIGPIPE, and only
    # one value is emitted even if several lines match (a multi-line value
    # would corrupt the CSV row).
    out=$("$LAUNCH" "$TP" "$BIN" --model-dir "$MODEL" \
      --prompt "$PROMPT" --n-predict "$N_PREDICT" \
      --vocab "$VOCAB" --seed 0 2>&1 \
      | awk '/decode :/ && !found { value = $(NF-2); found = 1 }
             END { if (found) print value }')
    # A crashed launcher or missing "decode :" line counts as 0.
    tgs+=("${out:-0}")
  done

  # Numeric maximum; LC_ALL=C pins "." as the decimal separator.
  best=$(printf '%s\n' "${tgs[@]}" | LC_ALL=C sort -n | tail -n 1)

  # Per-run values are "|"-separated so the row stays at four CSV fields.
  joined=$(IFS='|'; printf '%s' "${tgs[*]}")
  printf '%s\n' "$algo,$buf,$joined,$best" >> "$OUT"

  printf " %-22s buf=%-4s %s best=%s\n" \
    "${algo:-(auto)}" "$buf" "${tgs[*]}" "$best"
}

# Align comma-separated stdin into columns; pass it through unchanged when
# `column` is not installed.
format_table() {
  if command -v column > /dev/null 2>&1; then
    column -t -s,
  else
    cat
  fi
}

# Matrix
ALGOS=("" "level0:ring" "level0:fullmesh")
BUFSIZES=("100" "200" "400")

echo "HCCL matrix: ${#ALGOS[@]} algos × ${#BUFSIZES[@]} buffsizes × ${N_RUNS} runs each"
echo "Results → $OUT"
echo ""

for algo in "${ALGOS[@]}"; do
  for buf in "${BUFSIZES[@]}"; do
    run_one "$algo" "$buf"
  done
done

echo ""
echo "====== Summary (sorted by best TG) ======"
# Header first, then data rows by 4th field (best), highest first. The empty
# algo field is shown as "(auto)" for display only (the CSV file is left
# untouched) so that `column` cannot collapse it and shift the row.
{
  head -n 1 "$OUT"
  tail -n +2 "$OUT" | LC_ALL=C sort -t, -k4,4 -gr
} \
  | awk -F, -v OFS=, 'NR > 1 && $1 == "" { $1 = "(auto)" } { print }' \
  | format_table