llm_mutil_npu / scripts /bench_tg.sh
xianglarry's picture
Initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU
4b9fefd
#!/usr/bin/env bash
# bench_tg.sh — stable TG measurement: N runs × N_PREDICT tokens, then report
# min / median / mean / max of the throughput parsed from the launcher's
# "decode :" line(s).
#
# Usage: ./scripts/bench_tg.sh [N_RUNS] (default 5)
# LCA_WARMUP=3 ./scripts/bench_tg.sh (with warmup enabled)
#
# Environment:
#   MODEL_DIR   model directory passed as --model-dir
#   N_PREDICT   tokens generated per run (default 200)
#   LCA_WARMUP  only echoed by this script; presumably consumed by the
#               binary to absorb cold-starts — TODO confirm
#
# Runs that produce no parsable throughput are reported on stderr and are
# excluded from the statistics (they are NOT counted as 0 t/s).
# Exit status: 0 on success, 1 if nothing could be measured, 2 on bad usage.
set -u

#######################################
# Filter launcher output down to throughput samples.
# Reads text on stdin; for every line containing "decode :" whose
# third-from-last field is a plain non-negative decimal number, prints that
# field on its own line. Lines that are too short or non-numeric are ignored.
#######################################
extract_decode_tps() {
  LC_ALL=C awk '
    /decode :/ && NF >= 3 && $(NF-2) ~ /^[0-9]+([.][0-9]+)?$/ { print $(NF-2) }
  '
}

#######################################
# Print the median of the numeric arguments.
# Odd count: the middle sample, printed verbatim.
# Even count: the mean of the two middle samples, formatted with "%.2f".
# Returns: 1 (printing nothing) when called without arguments.
#######################################
median_of() {
  (( $# > 0 )) || return 1
  printf '%s\n' "$@" | LC_ALL=C sort -n | LC_ALL=C awk '
    { v[NR] = $1 }
    END {
      if (NR % 2) print v[(NR + 1) / 2]
      else printf "%.2f\n", (v[NR / 2] + v[NR / 2 + 1]) / 2
    }'
}

#######################################
# Print the arithmetic mean of the numeric arguments, formatted with "%.2f".
# Returns: 1 (printing nothing) when called without arguments.
#######################################
mean_of() {
  (( $# > 0 )) || return 1
  printf '%s\n' "$@" | LC_ALL=C awk '{ s += $1 } END { printf "%.2f\n", s / NR }'
}

#######################################
# Run the benchmark N times and print per-run results plus a summary.
# Arguments: $1 - number of runs (optional, default 5, positive integer)
# Returns:   0 on success, 1 if no run yielded a sample, 2 on bad usage
#######################################
main() {
  local n_runs="${1:-5}"
  if ! [[ "$n_runs" =~ ^[0-9]+$ ]] || (( 10#$n_runs < 1 )); then
    printf 'bench_tg: N_RUNS must be a positive integer, got "%s"\n' "$n_runs" >&2
    return 2
  fi
  n_runs=$(( 10#$n_runs ))  # normalise leading zeros (avoid octal parsing)

  # All relative paths below are resolved from the repository root.
  cd "$(dirname "$0")/.." || {
    echo "bench_tg: cannot cd to repository root" >&2
    return 1
  }

  local MODEL="${MODEL_DIR:-/path/to/Qwen3-235B-A22B-Instruct-2507-BF16}"
  local BIN="./build/qwen3-moe-aclnn"
  local N_PREDICT="${N_PREDICT:-200}"
  local PROMPT="The history of artificial intelligence spans several decades and"
  local VOCAB="tokenizer_data/vocab.bin"
  local launcher="./scripts/tp_launch.sh"

  # Fail fast instead of silently recording N empty runs.
  if [[ ! -x "$launcher" ]]; then
    echo "bench_tg: launcher not found or not executable: $launcher" >&2
    return 1
  fi

  echo "bench_tg: $n_runs runs × $N_PREDICT tokens (LCA_WARMUP=${LCA_WARMUP:-0})"

  local -a tgs=() vals=()
  local r raw failed=0
  for (( r = 1; r <= n_runs; r++ )); do
    raw=$("$launcher" 16 "$BIN" --model-dir "$MODEL" \
      --prompt "$PROMPT" --n-predict "$N_PREDICT" \
      --vocab "$VOCAB" --seed 0 2>&1 | extract_decode_tps)
    if [[ -z "$raw" ]]; then
      printf ' run %d: FAILED (no "decode :" throughput in launcher output)\n' "$r" >&2
      failed=$(( failed + 1 ))
      continue
    fi
    # A run may emit several "decode :" lines; each value is kept as a sample.
    mapfile -t vals <<<"$raw"
    printf " run %d: %s t/s\n" "$r" "${vals[*]}"
    tgs+=("${vals[@]}")
  done

  if (( ${#tgs[@]} == 0 )); then
    echo "bench_tg: no successful runs; nothing to summarize" >&2
    return 1
  fi
  if (( failed > 0 )); then
    echo "bench_tg: warning: $failed of $n_runs runs failed and were excluded" >&2
  fi

  local -a sorted=()
  mapfile -t sorted < <(printf '%s\n' "${tgs[@]}" | LC_ALL=C sort -n)
  local min="${sorted[0]}"
  local max="${sorted[${#sorted[@]} - 1]}"
  local median mean
  median=$(median_of "${tgs[@]}")
  mean=$(mean_of "${tgs[@]}")

  echo ""
  echo "====== Summary ======"
  echo " all : ${tgs[*]}"
  echo " min : $min t/s"
  echo " median : $median t/s"
  echo " mean : $mean t/s"
  echo " max : $max t/s"
}

main "$@"