Hanrui / SpecForge-ext /test_accept_length.md

Add files using upload-large-folder tool

7c50656 verified 27 days ago

9.06 kB

	# Accept Length 测试指南

	## 0. 准备工作

	### 创建目录
	```bash
	# Create the log and result directories inside the project checkout.
	# The && keeps mkdir from creating them in the current directory if cd fails.
	cd /workspace/hanrui/SpecForge-ext && mkdir -p logs results
	```

	### 下载数据集（首次运行）
	```bash
	# Run the dataset download script from the project checkout.
	# The && prevents running a same-named script from another directory if cd fails.
	cd /workspace/hanrui/SpecForge-ext && python download_datasets.py
	```

	数据保存位置：
	- MT-Bench: `/workspace/hanrui/datasets/mtbench/question.jsonl`
	- GSM8K: `/workspace/hanrui/datasets/gsm8k/test.jsonl`
	- HumanEval: `/workspace/hanrui/datasets/humaneval/test.jsonl`

	---

	## 1. 测试 Baseline 模型

	### 启动服务器（终端1）
	```bash
	# Work from the project checkout.
	cd /workspace/hanrui/SpecForge-ext

	# Environment: list loopback and private-network ranges in both spellings
	# of the no-proxy variable (the lower-case one reuses the same value).
	export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
	export no_proxy="$NO_PROXY"

	# Arguments for the baseline server: target model Qwen3-8B with the
	# EAGLE3 draft model stored at /workspace/qwen3_8b_eagle3.
	baseline_server_args=(
	  --model /workspace/Qwen3-8B
	  --speculative-algorithm EAGLE3
	  --speculative-draft-model-path /workspace/qwen3_8b_eagle3
	  --speculative-num-steps 3
	  --speculative-eagle-topk 1
	  --speculative-num-draft-tokens 4
	  --mem-fraction-static 0.75
	  --cuda-graph-max-bs 1
	  --tp 1
	  --trust-remote-code
	  --host 0.0.0.0
	  --port 30000
	  --dtype bfloat16
	  --skip-server-warmup
	)

	# Start the baseline server in the foreground of this terminal.
	python3 -m sglang.launch_server "${baseline_server_args[@]}"
	```

	等待看到 `Application startup complete` 后，继续下一步。

	### 运行三个 Benchmark（终端2）
	```bash
	# Run the three benchmarks against the already-running baseline server and
	# keep a timestamped copy of each run's console output under logs/.
	cd /workspace/hanrui/SpecForge-ext
	# NOTE(review): this path spells "Hanrui" with a capital H, unlike the other
	# paths in this guide — confirm it is the intended conda environment.
	conda activate /workspace/Hanrui/

	# Environment: list loopback and private-network ranges in both spellings
	# of the no-proxy variable.
	export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
	export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"

	# NOTE(review): --config-list 1,3,1,4 presumably mirrors the server's
	# num-steps 3 / eagle-topk 1 / num-draft-tokens 4 flags — confirm against
	# benchmarks/bench_eagle3.py.

	# 1. MT-Bench
	# The pipe must be a real "|": an escaped "\|" would be handed to python as
	# a literal argument and tee would never write the log.
	echo "=== Running MT-Bench (Baseline) ==="
	python benchmarks/bench_eagle3.py \
	  --model-path /workspace/Qwen3-8B \
	  --host 10.1.1.31 \
	  --port 30000 \
	  --config-list 1,3,1,4 \
	  --benchmark-list mtbench:80 \
	  --dtype bfloat16 \
	  --skip-launch-server \
	  --name baseline_mtbench \
	  --output-dir ./results \
	  2>&1 | tee "logs/baseline_mtbench_$(date +%Y%m%d_%H%M%S).log"

	# 2. GSM8K
	echo "=== Running GSM8K (Baseline) ==="
	python benchmarks/bench_eagle3.py \
	  --model-path /workspace/Qwen3-8B \
	  --host 10.1.1.31 \
	  --port 30000 \
	  --config-list 1,3,1,4 \
	  --benchmark-list gsm8k:100 \
	  --dtype bfloat16 \
	  --skip-launch-server \
	  --name baseline_gsm8k \
	  --output-dir ./results \
	  2>&1 | tee "logs/baseline_gsm8k_$(date +%Y%m%d_%H%M%S).log"

	# 3. HumanEval
	echo "=== Running HumanEval (Baseline) ==="
	python benchmarks/bench_eagle3.py \
	  --model-path /workspace/Qwen3-8B \
	  --host 10.1.1.31 \
	  --port 30000 \
	  --config-list 1,3,1,4 \
	  --benchmark-list humaneval:164 \
	  --dtype bfloat16 \
	  --skip-launch-server \
	  --name baseline_humaneval \
	  --output-dir ./results \
	  2>&1 | tee "logs/baseline_humaneval_$(date +%Y%m%d_%H%M%S).log"

	echo "=== Baseline 测试完成 ==="
	```

	---

	## 2. 测试训练后的模型

	### 停止 Baseline 服务器并启动训练后的服务器（终端1）
	```bash
	# Replace the running baseline server with one that uses the trained
	# draft model checkpoint.
	cd /workspace/hanrui/SpecForge-ext

	# Stop the old server. A fixed pause alone does not guarantee it has exited,
	# so poll for up to 60 s until no matching process remains, then keep the
	# original 5 s pause as a grace period before reusing port 30000.
	pkill -f "sglang.launch_server"
	for _ in {1..60}; do
	  pgrep -f "sglang.launch_server" >/dev/null || break
	  sleep 1
	done
	sleep 5

	# Environment: list loopback and private-network ranges in both spellings
	# of the no-proxy variable.
	export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
	export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"

	# Start the server with the trained draft model; every other flag has the
	# same value as in the baseline launch so the two runs are comparable.
	python3 -m sglang.launch_server \
	  --model /workspace/Qwen3-8B \
	  --speculative-algorithm EAGLE3 \
	  --speculative-draft-model-path /workspace/hanrui/SpecForge-ext/outputs/qwen3-8b-qwen3eagle-5layer/epoch_9_step_12310 \
	  --speculative-num-steps 3 \
	  --speculative-eagle-topk 1 \
	  --speculative-num-draft-tokens 4 \
	  --mem-fraction-static 0.75 \
	  --cuda-graph-max-bs 1 \
	  --tp 1 \
	  --trust-remote-code \
	  --host 0.0.0.0 \
	  --port 30000 \
	  --dtype bfloat16 \
	  --skip-server-warmup
	```

	等待看到 `Application startup complete` 后，继续下一步。

	### 运行三个 Benchmark（终端2）
	```bash
	# Run the three benchmarks against the already-running trained-model server
	# and keep a timestamped copy of each run's console output under logs/.
	cd /workspace/hanrui/SpecForge-ext
	# NOTE(review): this path spells "Hanrui" with a capital H, unlike the other
	# paths in this guide — confirm it is the intended conda environment.
	conda activate /workspace/Hanrui/

	# Environment: list loopback and private-network ranges in both spellings
	# of the no-proxy variable.
	export NO_PROXY="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
	export no_proxy="localhost,127.0.0.1,::1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"

	# NOTE(review): --config-list 1,3,1,4 presumably mirrors the server's
	# num-steps 3 / eagle-topk 1 / num-draft-tokens 4 flags — confirm against
	# benchmarks/bench_eagle3.py.

	# 1. MT-Bench
	# The pipe must be a real "|": an escaped "\|" would be handed to python as
	# a literal argument and tee would never write the log.
	echo "=== Running MT-Bench (Trained) ==="
	python benchmarks/bench_eagle3.py \
	  --model-path /workspace/Qwen3-8B \
	  --host 10.1.1.31 \
	  --port 30000 \
	  --config-list 1,3,1,4 \
	  --benchmark-list mtbench:80 \
	  --dtype bfloat16 \
	  --skip-launch-server \
	  --name trained_mtbench \
	  --output-dir ./results \
	  2>&1 | tee "logs/trained_mtbench_$(date +%Y%m%d_%H%M%S).log"

	# 2. GSM8K
	echo "=== Running GSM8K (Trained) ==="
	python benchmarks/bench_eagle3.py \
	  --model-path /workspace/Qwen3-8B \
	  --host 10.1.1.31 \
	  --port 30000 \
	  --config-list 1,3,1,4 \
	  --benchmark-list gsm8k:100 \
	  --dtype bfloat16 \
	  --skip-launch-server \
	  --name trained_gsm8k \
	  --output-dir ./results \
	  2>&1 | tee "logs/trained_gsm8k_$(date +%Y%m%d_%H%M%S).log"

	# 3. HumanEval
	echo "=== Running HumanEval (Trained) ==="
	python benchmarks/bench_eagle3.py \
	  --model-path /workspace/Qwen3-8B \
	  --host 10.1.1.31 \
	  --port 30000 \
	  --config-list 1,3,1,4 \
	  --benchmark-list humaneval:164 \
	  --dtype bfloat16 \
	  --skip-launch-server \
	  --name trained_humaneval \
	  --output-dir ./results \
	  2>&1 | tee "logs/trained_humaneval_$(date +%Y%m%d_%H%M%S).log"

	echo "=== Trained 测试完成 ==="
	```

	---

	## 3. 查看结果

	### 日志和结果文件位置
	所有日志保存在：`/workspace/hanrui/SpecForge-ext/logs/`
	- `baseline_mtbench_*.log`
	- `baseline_gsm8k_*.log`
	- `baseline_humaneval_*.log`
	- `trained_mtbench_*.log`
	- `trained_gsm8k_*.log`
	- `trained_humaneval_*.log`

	所有结果保存在：`/workspace/hanrui/SpecForge-ext/results/`
	- `baseline_mtbench_*.jsonl`
	- `baseline_gsm8k_*.jsonl`
	- `baseline_humaneval_*.jsonl`
	- `trained_mtbench_*.jsonl`
	- `trained_gsm8k_*.jsonl`
	- `trained_humaneval_*.jsonl`

	### 生成对比报告
	```bash
	# Print a baseline-vs-trained comparison for each benchmark, using the
	# lexicographically last result file of each kind under results/.
	cd /workspace/hanrui/SpecForge-ext

	# "<<-" strips leading tabs from the here-document, so the script and its
	# EOF terminator work whether or not this snippet is tab-indented; the
	# quoted delimiter keeps the shell from expanding anything in the Python
	# code. Python indentation below uses spaces only.
	python3 <<-'EOF'
	import json
	import glob


	def pct_change(delta, base):
	    """Return delta as a percentage of base, or NaN when base is zero."""
	    return (delta / base) * 100 if base else float('nan')


	print("=" * 80)
	print("Accept Length 对比报告")
	print("=" * 80)

	datasets = ['mtbench', 'gsm8k', 'humaneval']

	for dataset in datasets:
	    print(f"\n{'=' * 80}")
	    print(f"{dataset.upper()} 结果对比")
	    print('=' * 80)

	    baseline_files = sorted(glob.glob(f'results/baseline_{dataset}_*.jsonl'))
	    trained_files = sorted(glob.glob(f'results/trained_{dataset}_*.jsonl'))

	    # Both sides are required for a comparison; skip the dataset otherwise.
	    if not baseline_files or not trained_files:
	        print(f" 未找到 {dataset} 的结果文件")
	        continue

	    # Each result file is parsed as one JSON document.
	    with open(baseline_files[-1], 'r') as f:
	        baseline = json.load(f)

	    with open(trained_files[-1], 'r') as f:
	        trained = json.load(f)

	    baseline_metrics = baseline[dataset][0]['metrics'][0]
	    trained_metrics = trained[dataset][0]['metrics'][0]

	    # Accuracy may be absent or None on either side.
	    baseline_acc = baseline_metrics.get('accuracy')
	    trained_acc = trained_metrics.get('accuracy')

	    print(f"\nBaseline:")
	    print(f" Accept Length: {baseline_metrics['accept_length']:.4f}")
	    print(f" Output Throughput: {baseline_metrics['output_throughput']:.2f} tokens/s")
	    if baseline_acc is not None:
	        print(f" Accuracy: {baseline_acc:.2%}")

	    print(f"\nTrained:")
	    print(f" Accept Length: {trained_metrics['accept_length']:.4f}")
	    print(f" Output Throughput: {trained_metrics['output_throughput']:.2f} tokens/s")
	    if trained_acc is not None:
	        print(f" Accuracy: {trained_acc:.2%}")

	    accept_diff = trained_metrics['accept_length'] - baseline_metrics['accept_length']
	    accept_pct = pct_change(accept_diff, baseline_metrics['accept_length'])

	    throughput_diff = trained_metrics['output_throughput'] - baseline_metrics['output_throughput']
	    throughput_pct = pct_change(throughput_diff, baseline_metrics['output_throughput'])

	    print(f"\n差异:")
	    print(f" Accept Length: {accept_diff:+.4f} ({accept_pct:+.2f}%)")
	    print(f" Throughput: {throughput_diff:+.2f} tokens/s ({throughput_pct:+.2f}%)")

	    # Only report an accuracy delta when both runs actually have one.
	    if baseline_acc is not None and trained_acc is not None:
	        acc_pct = (trained_acc - baseline_acc) * 100
	        print(f" Accuracy: {acc_pct:+.2f} percentage points")

	print("\n" + "=" * 80)
	EOF
	```

	---

	## 4. 快速查看单个结果
	```bash
	# Print the accept_length field from every matching result file.
	# jq reads the files directly; the original "cat ... \| jq" passed a literal
	# "|" and "jq" to cat as file names instead of forming a pipeline.
	cd /workspace/hanrui/SpecForge-ext

	# Baseline accept_length per benchmark
	jq '.mtbench[0].metrics[0].accept_length' results/baseline_mtbench_*.jsonl
	jq '.gsm8k[0].metrics[0].accept_length' results/baseline_gsm8k_*.jsonl
	jq '.humaneval[0].metrics[0].accept_length' results/baseline_humaneval_*.jsonl

	# Trained accept_length per benchmark
	jq '.mtbench[0].metrics[0].accept_length' results/trained_mtbench_*.jsonl
	jq '.gsm8k[0].metrics[0].accept_length' results/trained_gsm8k_*.jsonl
	jq '.humaneval[0].metrics[0].accept_length' results/trained_humaneval_*.jsonl
	```