Hanrui / syxin /start_server_dflash.sh
Lekr0's picture
Add files using upload-large-folder tool
7c50656 verified
#!/bin/bash
# Evaluate DFlash-LoRA-Inject: measure accepted length OFFLINE.
# 8 GPUs parallel by default, each GPU runs a shard of prompts independently.
#
# WHY offline?
# sglang STANDALONE treats draft as an independent autoregressive model,
# completely ignoring the layer-by-layer injection that LoRA-Inject was
# trained with. Result: accept_length ≈ 4.7 for ALL models (no signal).
#
# sglang DFLASH expects the DFlash-b16 architecture (5-layer, fc+hidden_norm),
# which is structurally different from LoRA-Inject (full 36-layer + LoRA).
#
# So we run offline spec-generate with the correct injection pattern.
#
# Usage:
# bash start_server_dflash.sh # 8 GPUs, all benchmarks
# bash start_server_dflash.sh 4 # 4 GPUs
# bash start_server_dflash.sh 8 humaneval # specific benchmark
# bash start_server_dflash.sh 8 --num-samples 20 # quick test
# Abort on the first unhandled command failure.
set -e

# Absolute directory containing this script, so the evaluator can be located
# regardless of the caller's working directory.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

#######################################
# Report whether the first argument looks like an option (starts with "-").
# Arguments: $1 - first command-line argument; may be absent
# Returns:   0 if $1 begins with "-", 1 otherwise (including when absent)
#######################################
first_arg_is_option() {
  [[ "${1-}" == -* ]]
}

# The first positional argument is the GPU count (default: 8); everything
# after it is forwarded to the evaluator. When the first argument is already
# an option (e.g. `--num-samples 20`), no GPU count was given: keep the
# default and leave the arguments untouched instead of mistaking the option
# for a GPU count.
if first_arg_is_option "$@"; then
  NUM_GPUS=8
else
  NUM_GPUS=${1:-8}
  # Consume the GPU-count argument only if one was actually supplied.
  if (( $# > 0 )); then
    shift
  fi
fi
# ---- defaults ----
# Each value below is used unless a non-empty variable of the same name is
# already set in the environment, e.g.
#   CKPT=epoch_2_step_900 bash start_server_dflash.sh 4
# With nothing set in the environment the values are exactly the defaults.
BASE_MODEL=${BASE_MODEL:-/workspace/models/Qwen3-8B}
ADAPTER_ROOT=${ADAPTER_ROOT:-/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject}
CKPT=${CKPT:-epoch_3_step_1400}
MERGED=${MERGED:-/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject-merged}
RESULT_DIR=${RESULT_DIR:-/workspace/hanrui/syxin_old/Specforge/benchmarks/results}
# Interpreter used to launch the evaluator.
PYTHON=${PYTHON:-/workspace/miniconda3/envs/spec/bin/python3}
# Print a summary of the run configuration before launching.
# One printf call emits every argument on its own line.
banner_rule="============================================"
printf '%s\n' \
  "$banner_rule" \
  " DFlash-LoRA-Inject Offline Evaluation" \
  " target : $BASE_MODEL" \
  " ckpt : $CKPT" \
  " merged : $MERGED" \
  " GPUs : $NUM_GPUS" \
  "$banner_rule"
# Launch the evaluator through torch.distributed.run with NUM_GPUS processes
# on this node. Every expansion is quoted so that paths containing whitespace
# or glob characters reach the evaluator as single, unmodified arguments.
# Any remaining command-line arguments are forwarded verbatim via "$@".
"$PYTHON" -m torch.distributed.run \
  --standalone \
  --nproc_per_node "$NUM_GPUS" \
  "$SCRIPT_DIR/eval_dflash_lora_inject.py" \
  --base-model "$BASE_MODEL" \
  --adapter-root "$ADAPTER_ROOT" \
  --ckpt "$CKPT" \
  --merged-path "$MERGED" \
  --block-size 16 \
  --output-dir "$RESULT_DIR" \
  "$@"