llm_mutil_npu / scripts /bench_hccl.sh
xianglarry's picture
Initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU
4b9fefd
#!/usr/bin/env bash
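# bench_hccl.sh — HCCL parameter-matrix benchmark for TG
#
# Sweeps every HCCL_ALGO × HCCL_BUFFSIZE combination, runs each one N_RUNS
# times, and keeps the best (highest) decode figure to find the best setting.
# A fixed prompt, seed=0 and a fixed n_predict (default 150) keep runs comparable.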
# Treat any use of an unset variable as an error.
set -u

# Work from the parent of this script's directory so the relative paths below
# (build/, scripts/, tokenizer_data/) resolve. Abort if that fails rather than
# running the whole matrix from the wrong directory.
cd "$(dirname "$0")/.." || {
  printf 'bench_hccl: cannot cd to %s\n' "$(dirname "$0")/.." >&2
  exit 1
}

# Model directory handed to the binary via --model-dir (override: MODEL_DIR).
MODEL="${MODEL_DIR:-/path/to/Qwen3-235B-A22B-Instruct-2507-BF16}"
# Inference binary and the launcher script that starts it.
BIN="./build/qwen3-moe-aclnn"
LAUNCH="./scripts/tp_launch.sh"
# First argument given to the launcher (override: TP_SIZE).
TP="${TP_SIZE:-16}"
# Value passed as --n-predict, and number of repetitions per combination.
N_PREDICT="${N_PREDICT:-150}"
N_RUNS="${N_RUNS:-2}"
# Fixed prompt so every run does the same work.
PROMPT="${PROMPT:-The history of artificial intelligence spans several decades and}"
VOCAB="tokenizer_data/vocab.bin"
# Results CSV. Defaults to the historical location; BENCH_HCCL_OUT overrides it.
OUT="${BENCH_HCCL_OUT:-/tmp/bench_hccl_results.csv}"

# Start a fresh results file that contains only the header row.
echo "algo,buffsize,runs,best_tgs" > "$OUT" || {
  printf 'bench_hccl: cannot write %s\n' "$OUT" >&2
  exit 1
}
#######################################
# Benchmark one HCCL_ALGO / HCCL_BUFFSIZE combination.
#
# Runs the launcher N_RUNS times, extracts one figure per run from the first
# output line containing "decode :" (its third-from-last field), appends a
# CSV row to $OUT and prints a one-line report.
#
# Globals:   LAUNCH, TP, BIN, MODEL, PROMPT, N_PREDICT, VOCAB, N_RUNS, OUT (read)
#            HCCL_ALGO, HCCL_BUFFSIZE (exported / unset)
# Arguments: $1 - HCCL_ALGO value; empty means "(auto)"
#            $2 - HCCL_BUFFSIZE value
# Outputs:   report line on stdout; CSV row appended to $OUT; warnings on stderr
#######################################
run_one() {
  local algo="$1" buf="$2"
  local -a tgs=()
  local r out
  local best=0 runs_spaced="" runs_piped=""

  # The empty algo is the "(auto)" row: drop the variable instead of exporting
  # an empty string, so the library sees no setting at all.
  # NOTE(review): assumes HCCL treats an unset HCCL_ALGO as its default — confirm.
  if [[ -n "$algo" ]]; then
    export HCCL_ALGO="$algo"
  else
    unset HCCL_ALGO
  fi
  export HCCL_BUFFSIZE="$buf"

  for ((r = 1; r <= N_RUNS; r++)); do
    # Keep only the first "decode :" line so extra matches cannot inject
    # newlines into the CSV. awk still reads the stream to the end, so the
    # launched processes never receive SIGPIPE.
    out=$(
      "$LAUNCH" "$TP" "$BIN" --model-dir "$MODEL" \
        --prompt "$PROMPT" --n-predict "$N_PREDICT" \
        --vocab "$VOCAB" --seed 0 2>&1 \
        | awk '/decode :/ && !seen { print $(NF-2); seen = 1 }'
    )
    if [[ -z "$out" ]]; then
      # A run that produced no figure is recorded as 0, but say so.
      printf 'run_one: no "decode :" line (algo=%s buf=%s run=%d); recording 0\n' \
        "${algo:-(auto)}" "$buf" "$r" >&2
      out=0
    fi
    tgs+=("$out")
  done

  # Guard the empty case (N_RUNS=0): expanding an empty array trips `set -u`
  # on older bash, and there is no maximum to pick.
  if ((${#tgs[@]} > 0)); then
    runs_spaced="${tgs[*]}"
    # Runs are '|'-separated inside their single CSV field.
    runs_piped="${runs_spaced// /|}"
    # LC_ALL=C keeps '.' as the decimal point whatever the caller's locale.
    best=$(printf '%s\n' "${tgs[@]}" | LC_ALL=C sort -n | tail -n 1)
  fi

  printf '%s,%s,%s,%s\n' "$algo" "$buf" "$runs_piped" "$best" >> "$OUT"
  printf " %-22s buf=%-4s %s best=%s\n" \
    "${algo:-(auto)}" "$buf" "$runs_spaced" "$best"
}
# Matrix of settings to sweep. The empty algo entry is the row that run_one
# reports as "(auto)".
ALGOS=("" "level0:ring" "level0:fullmesh")
BUFSIZES=("100" "200" "400")
echo "HCCL matrix: ${#ALGOS[@]} algos × ${#BUFSIZES[@]} buffsizes × ${N_RUNS} runs each"
echo "Results → $OUT"
echo ""
# Every algo is paired with every buffer size; run_one appends one CSV row each.
for algo in "${ALGOS[@]}"; do
  for buf in "${BUFSIZES[@]}"; do
    run_one "$algo" "$buf"
  done
done
echo ""
echo "====== Summary (sorted by best TG) ======"
# Header first, then data rows ordered by column 4 (best), highest first.
# - LC_ALL=C keeps the numeric sort independent of the caller's locale.
# - The empty algo is shown as "(auto)" in this table only (the CSV file is
#   left as written): some `column` implementations merge adjacent delimiters,
#   which would shift those rows one column to the left.
{
  head -n 1 "$OUT"
  tail -n +2 "$OUT" | LC_ALL=C sort -t, -k4,4gr
} | awk -F, -v OFS=, 'NR > 1 && $1 == "" { $1 = "(auto)" } { print }' \
  | column -t -s,