Initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU

4b9fefd 20 days ago

1.79 kB

	#!/usr/bin/env bash
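	# bench_hccl.sh — HCCL parameter-matrix benchmark for TG
	#
	# Sweeps every HCCL_ALGO × HCCL_BUFFSIZE combination, runs each one N_RUNS times,
	# and keeps the best (highest) value parsed from the launcher's "decode :" line
	# to find the best configuration.
	# A fixed prompt, seed=0 and a fixed n_predict (N_PREDICT, default 150) keep runs comparable.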
	set -u
	# BIN, LAUNCH and VOCAB below are relative paths, so they only resolve from
	# the parent of this script's directory. Abort if that directory cannot be
	# entered instead of benchmarking from whatever the caller's cwd happens to be.
	cd -- "$(dirname -- "$0")/.." || {
	  printf 'bench_hccl: cannot cd to %s/..\n' "$(dirname -- "$0")" >&2
	  exit 1
	}

	# Tunables. Every ${VAR:-default} form can be overridden from the environment
	# (MODEL_DIR, TP_SIZE, N_PREDICT, N_RUNS, PROMPT).
	MODEL="${MODEL_DIR:-/path/to/Qwen3-235B-A22B-Instruct-2507-BF16}"
	BIN="./build/qwen3-moe-aclnn"
	LAUNCH="./scripts/tp_launch.sh"
	TP="${TP_SIZE:-16}"
	N_PREDICT="${N_PREDICT:-150}"
	N_RUNS="${N_RUNS:-2}"
	PROMPT="${PROMPT:-The history of artificial intelligence spans several decades and}"
	VOCAB="tokenizer_data/vocab.bin"

	# Results file: truncated here, then one row is appended per configuration.
	OUT=/tmp/bench_hccl_results.csv
	echo "algo,buffsize,runs,best_tgs" > "$OUT" || exit 1

	#######################################
	# Benchmark one HCCL configuration and record the result.
	#
	# Runs the launcher N_RUNS times, extracts the third-from-last field of the
	# first "decode :" line of each run (0 when no such line is printed), appends
	# one CSV row "algo,buffsize,run1|run2|...,best" to OUT and prints a summary
	# line to stdout. "best" is the numerically largest per-run value.
	#
	# Globals:   LAUNCH TP BIN MODEL PROMPT N_PREDICT VOCAB N_RUNS OUT (read)
	#            HCCL_ALGO, HCCL_BUFFSIZE (exported / unset)
	# Arguments: $1 - HCCL_ALGO value; empty means "leave HCCL_ALGO unset"
	#            $2 - HCCL_BUFFSIZE value
	# Returns:   0 on success, 1 when N_RUNS yields no runs
	#######################################
	run_one() {
	  local algo="$1" buf="$2"
	  local -a tgs=()
	  local i out best joined

	  export HCCL_BUFFSIZE="$buf"
	  if [[ -n "$algo" ]]; then
	    export HCCL_ALGO="$algo"
	  else
	    # The empty matrix entry is displayed as "(auto)"; unset the variable so a
	    # stale or empty value is not handed to the launcher.
	    unset HCCL_ALGO
	  fi

	  for ((i = 0; i < N_RUNS; i++)); do
	    # A single awk replaces grep|awk. Only the first match is kept so that
	    # several "decode :" lines cannot smuggle newlines into the CSV row, and
	    # awk still drains the launcher's output to the end.
	    out=$("$LAUNCH" "$TP" "$BIN" --model-dir "$MODEL" \
	      --prompt "$PROMPT" --n-predict "$N_PREDICT" \
	      --vocab "$VOCAB" --seed 0 2>&1 \
	      | awk '/decode :/ && !seen { v = $(NF-2); seen = 1 } END { if (seen) print v }')
	    tgs+=("${out:-0}")
	  done

	  if ((${#tgs[@]} == 0)); then
	    printf 'run_one: N_RUNS=%s gave no runs for algo=%s buf=%s\n' \
	      "$N_RUNS" "${algo:-(auto)}" "$buf" >&2
	    return 1
	  fi

	  # LC_ALL=C keeps "." as the decimal point regardless of the caller's locale.
	  best=$(printf '%s\n' "${tgs[@]}" | LC_ALL=C sort -n | tail -n 1)
	  # Join only the per-run values with "|"; algo and buf are written untouched.
	  joined=$(IFS='|'; printf '%s' "${tgs[*]}")
	  printf '%s,%s,%s,%s\n' "$algo" "$buf" "$joined" "$best" >> "$OUT"
	  printf " %-22s buf=%-4s %s best=%s\n" \
	    "${algo:-(auto)}" "$buf" "${tgs[*]}" "$best"
	}

	# Benchmark matrix: every ALGOS entry is combined with every BUFSIZES entry.
	# The empty algo entry is the one run_one displays as "(auto)".
	ALGOS=("" "level0:ring" "level0:fullmesh")
	BUFSIZES=("100" "200" "400")

	echo "HCCL matrix: ${#ALGOS[@]} algos × ${#BUFSIZES[@]} buffsizes × ${N_RUNS} runs each"
	echo "Results → $OUT"
	echo ""

	for algo in "${ALGOS[@]}"; do
	  for buf in "${BUFSIZES[@]}"; do
	    run_one "$algo" "$buf"
	  done
	done

	echo ""
	echo "====== Summary (sorted by best TG) ======"
	# Header first, then the data rows ordered by column 4 only (best value,
	# general-numeric, descending). The empty algo field is shown as "(auto)" so
	# the table matches run_one's per-row output and no leading cell is empty
	# when `column` splits on commas.
	{
	  head -n 1 "$OUT"
	  tail -n +2 "$OUT" | LC_ALL=C sort -t, -k4,4gr
	} | sed 's/^,/(auto),/' | column -t -s,