#!/usr/bin/env bash
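# bench_hccl.sh — HCCL parameter-matrix benchmark for TG
#
# Sweeps every HCCL_ALGO × HCCL_BUFFSIZE combination, runs each one N_RUNS
# times, and reports the best (largest) value parsed from the "decode :" line
# of each combination (not the median).
# A fixed prompt, seed=0 and a fixed N_PREDICT (default 150) keep runs comparable.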
# Treat any use of an unset variable as an error.
set -u

# Every path below is relative to the parent of the directory holding this
# script; stop immediately if that directory cannot be entered, otherwise the
# benchmark would run with paths resolved from the wrong place.
cd "$(dirname -- "$0")/.." || {
    echo "bench_hccl: cannot cd to $(dirname -- "$0")/.." >&2
    exit 1
}

# Tunables: each value can be overridden from the environment
# (MODEL_DIR, TP_SIZE, N_PREDICT, N_RUNS, PROMPT).
MODEL="${MODEL_DIR:-/path/to/Qwen3-235B-A22B-Instruct-2507-BF16}"
BIN="./build/qwen3-moe-aclnn"
LAUNCH="./scripts/tp_launch.sh"
TP="${TP_SIZE:-16}"
N_PREDICT="${N_PREDICT:-150}"
N_RUNS="${N_RUNS:-2}"
PROMPT="${PROMPT:-The history of artificial intelligence spans several decades and}"
VOCAB="tokenizer_data/vocab.bin"

# Results file: truncated on every invocation and seeded with the CSV header.
OUT=/tmp/bench_hccl_results.csv
echo "algo,buffsize,runs,best_tgs" > "$OUT" || {
    echo "bench_hccl: cannot write $OUT" >&2
    exit 1
}

#######################################
# Benchmark one (HCCL_ALGO, HCCL_BUFFSIZE) combination.
#
# Runs the launcher N_RUNS times, extracts the third-from-last field of the
# last "decode :" line of each run, appends one CSV row
# (algo,buffsize,run1|run2|...,best) to $OUT and prints a summary line.
#
# Globals:
#   read:    LAUNCH TP BIN MODEL PROMPT N_PREDICT VOCAB N_RUNS OUT
#   written: HCCL_ALGO (exported, or unset when ALGO is empty),
#            HCCL_BUFFSIZE (exported)
# Arguments:
#   $1 - value for HCCL_ALGO; empty means "do not set it" (shown as "(auto)")
#   $2 - value for HCCL_BUFFSIZE
# Outputs:
#   one human-readable line on stdout; warnings on stderr
# Returns:
#   0 on success, 1 if N_RUNS yields no runs at all
#######################################
run_one() {
    local algo="$1" buf="$2"
    local -a tgs=()
    local r out best joined

    export HCCL_BUFFSIZE="$buf"
    if [[ -n "$algo" ]]; then
        export HCCL_ALGO="$algo"
    else
        # Exporting an empty string is not the same as leaving the variable
        # unset, and a stale value from the caller's environment must not leak
        # into the "(auto)" measurement.
        unset HCCL_ALGO
    fi

    for ((r = 1; r <= N_RUNS; r++)); do
        # Keep only the last matching line so that the captured value is always
        # a single token, even if the launcher prints "decode :" more than once.
        out=$("$LAUNCH" "$TP" "$BIN" --model-dir "$MODEL" \
                --prompt "$PROMPT" --n-predict "$N_PREDICT" \
                --vocab "$VOCAB" --seed 0 2>&1 \
              | awk '/decode :/ { v = $(NF-2) } END { if (v != "") print v }')
        if [[ -z "$out" ]]; then
            # A run with no parsable result is recorded as 0; say so instead of
            # letting it pass for a real measurement.
            printf 'run_one: run %s (algo=%s buf=%s) produced no "decode :" line; recording 0\n' \
                "$r" "${algo:-(auto)}" "$buf" >&2
        fi
        tgs+=("${out:-0}")
    done

    if ((${#tgs[@]} == 0)); then
        printf 'run_one: N_RUNS=%s yields no runs\n' "$N_RUNS" >&2
        return 1
    fi

    # Largest value wins; -g matches the comparison used by the final summary.
    best=$(printf '%s\n' "${tgs[@]}" | LC_ALL=C sort -g | tail -n 1)

    # Join the individual runs with '|' so they stay inside one CSV field.
    joined=$(IFS='|'; printf '%s' "${tgs[*]}")
    printf '%s,%s,%s,%s\n' "$algo" "$buf" "$joined" "$best" >> "$OUT"

    printf "  %-22s  buf=%-4s  %s  best=%s\n" \
        "${algo:-(auto)}" "$buf" "${tgs[*]}" "$best"
}

# Matrix of configurations to sweep. The empty algo entry is the case that
# run_one labels "(auto)".
ALGOS=("" "level0:ring" "level0:fullmesh")
BUFSIZES=("100" "200" "400")

echo "HCCL matrix: ${#ALGOS[@]} algos × ${#BUFSIZES[@]} buffsizes × ${N_RUNS} runs each"
echo "Results → $OUT"
echo ""

# Run every algo × buffsize pair; run_one appends one CSV row per pair to $OUT.
for algo in "${ALGOS[@]}"; do
    for buf in "${BUFSIZES[@]}"; do
        run_one "$algo" "$buf"
    done
done

echo ""
echo "====== Summary (sorted by best TG) ======"
# Header first, then the data rows ordered by column 4 (best) descending.
# The empty algo field is rendered as "(auto)" for display only: an empty
# leading field can be collapsed by some `column` implementations, which would
# shift that row's values one column to the left. $OUT itself is not modified.
{
    head -n 1 "$OUT"
    tail -n +2 "$OUT" \
        | awk -F, -v OFS=, '$1 == "" { $1 = "(auto)" } { print }' \
        | LC_ALL=C sort -t, -k4,4 -gr
} | column -t -s,