#!/usr/bin/env bash
# bench_hccl_adv.sh — advanced HCCL parameter tuning.
#
# Starting from the already-established ring:200 baseline
# (HCCL_ALGO=level0:ring, HCCL_BUFFSIZE=200), layer additional knobs such as
# HCCL_OP_EXPANSION_MODE=AIV on top and record, for each configuration, the
# value parsed from the launcher's "decode :" output line.
#
# Environment:
#   MODEL_DIR  model directory (optional; falls back to the placeholder below)
#
# Output:
#   One CSV row per configuration in $OUT, a progress line per configuration
#   on stdout, and finally a table sorted by the "best" column.

# Deliberately no `set -e` / `pipefail`: a run that fails or prints no
# "decode :" line is recorded as 0 instead of aborting the whole sweep.
set -u

# All relative paths below are resolved from the parent of the script's dir.
cd "$(dirname "$0")/.." || {
  printf 'cannot cd to %s/..\n' "$(dirname "$0")" >&2
  exit 1
}

MODEL="${MODEL_DIR:-/path/to/Qwen3-235B-A22B-Instruct-2507-BF16}"
BIN="./build/qwen3-moe-aclnn"
LAUNCH="./scripts/tp_launch.sh"
TP=16
N_PREDICT=200
N_RUNS=2
PROMPT="The history of artificial intelligence spans several decades and"
VOCAB="tokenizer_data/vocab.bin"

OUT=/tmp/bench_hccl_adv.csv

#######################################
# Run one configuration N_RUNS times and record the results.
# Globals:
#   LAUNCH, TP, BIN, MODEL, PROMPT, N_PREDICT, VOCAB, N_RUNS, OUT (all read)
# Arguments:
#   $1   - configuration label (written verbatim as the first CSV field, so
#          it must not contain a comma)
#   $2.. - optional KEY=VALUE environment assignments for this configuration;
#          they are placed after the baseline assignments on the env command
#          line, so a repeated key overrides the baseline value
# Outputs:
#   Appends "<label>,<run1>,...,<runN>,<best>,<median>" to $OUT and prints a
#   human-readable summary line to stdout.
#######################################
run_one() {
  local name="$1"
  shift

  local -a tgs=()
  local out i
  for ((i = 1; i <= N_RUNS; i++)); do
    # "$@" is quoted so each KEY=VALUE reaches env as exactly one argument.
    out=$(env HCCL_ALGO=level0:ring HCCL_BUFFSIZE=200 "$@" \
      "$LAUNCH" "$TP" "$BIN" --model-dir "$MODEL" \
      --prompt "$PROMPT" --n-predict "$N_PREDICT" \
      --vocab "$VOCAB" --seed 0 2>&1 \
      | awk '/decode :/ {print $(NF-2)}')
    # No "decode :" line (e.g. the launch failed) is recorded as 0.
    tgs+=("${out:-0}")
  done

  local -a sorted=()
  mapfile -t sorted < <(printf '%s\n' "${tgs[@]}" | sort -n)

  local best="${sorted[-1]}"

  # True median: the middle element for an odd count, the mean of the two
  # middle elements for an even count (otherwise with two runs the median
  # would always equal the best value).
  local median
  median=$(printf '%s\n' "${sorted[@]}" | awk '
    { v[NR] = $1 }
    END {
      if (NR % 2) print v[(NR + 1) / 2]
      else print (v[NR / 2] + v[NR / 2 + 1]) / 2
    }')

  local runs_csv
  runs_csv=$(IFS=,; printf '%s' "${tgs[*]}")

  printf '%s\n' "${name},${runs_csv},${best},${median}" >> "$OUT"
  printf " %-40s %s best=%s median=%s\n" "$name" "${tgs[*]}" "$best" "$median"
}

#######################################
# Write the CSV header, run every configuration, then print the sorted table.
# Globals:
#   N_RUNS, OUT (read)
#######################################
main() {
  # Header columns follow N_RUNS: config,run1,...,runN,best,median.
  local header="config"
  local i
  for ((i = 1; i <= N_RUNS; i++)); do
    header+=",run${i}"
  done
  printf '%s\n' "${header},best,median" > "$OUT"

  echo "Adv HCCL bench: baseline ring:200 + additional knobs"
  echo "Results → $OUT"
  echo ""

  run_one "baseline (ring+200 only)"
  run_one "+ OP_EXPANSION_MODE=AIV" HCCL_OP_EXPANSION_MODE=AIV
  run_one "+ OP_BASE_FFTS_MODE=1" HCCL_OP_BASE_FFTS_MODE_ENABLE=1
  run_one "+ OP_EXPANSION=AIV + FFTS=1" \
    HCCL_OP_EXPANSION_MODE=AIV HCCL_OP_BASE_FFTS_MODE_ENABLE=1
  run_one "+ OP_EXPANSION=AIV + BUF=256" \
    HCCL_OP_EXPANSION_MODE=AIV HCCL_BUFFSIZE=256
  run_one "+ OP_EXPANSION=AIV + BUF=512" \
    HCCL_OP_EXPANSION_MODE=AIV HCCL_BUFFSIZE=512
  run_one "+ OP_EXPANSION=AIV + ALGO=fullmesh" \
    HCCL_OP_EXPANSION_MODE=AIV HCCL_ALGO=level0:fullmesh

  echo ""
  echo "====== Sorted by best TG ======"
  # "best" sits right after the label and the N_RUNS per-run columns; bound
  # the sort key to that single field rather than field-to-end-of-line.
  local best_col=$((N_RUNS + 2))
  {
    head -n 1 "$OUT"
    tail -n +2 "$OUT" | sort -t, -k"${best_col},${best_col}" -gr
  } | column -t -s,
}

main "$@"