#!/bin/bash
# Evaluate DFlash-LoRA-Inject: measure accepted length OFFLINE.
# 8 GPUs parallel by default, each GPU runs a shard of prompts independently.
#
# WHY offline?
#   sglang STANDALONE treats draft as an independent autoregressive model,
#   completely ignoring the layer-by-layer injection that LoRA-Inject was
#   trained with. Result: accept_length ≈ 4.7 for ALL models (no signal).
#
#   sglang DFLASH expects the DFlash-b16 architecture (5-layer, fc+hidden_norm),
#   which is structurally different from LoRA-Inject (full 36-layer + LoRA).
#
#   So we run offline spec-generate with the correct injection pattern.
#
# Usage:
#   bash start_server_dflash.sh                     # 8 GPUs, all benchmarks
#   bash start_server_dflash.sh 4                   # 4 GPUs
#   bash start_server_dflash.sh 8 humaneval         # specific benchmark
#   bash start_server_dflash.sh 8 --num-samples 20  # quick test
#
# Arguments:
#   $1    number of processes to launch on this node (default: 8)
#   $2..  forwarded verbatim to eval_dflash_lora_inject.py

set -euo pipefail

# Directory containing this script; the evaluation entry point lives next to it.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

# The first positional argument is the GPU count. Drop it so that "$@" holds
# only the extra arguments destined for the evaluation script.
NUM_GPUS=${1:-8}
if (($# > 0)); then
  shift
fi

# The GPU count is positional: an option in first place means it was omitted
# and a flag meant for the evaluation script would be consumed as the count.
case "$NUM_GPUS" in
  -*)
    printf 'error: first argument must be the GPU count, got option %s\n' \
      "$NUM_GPUS" >&2
    printf 'usage: %s [NUM_GPUS] [eval args...]\n' "${0##*/}" >&2
    exit 2
    ;;
esac

# ---- defaults ----
BASE_MODEL=/workspace/models/Qwen3-8B
ADAPTER_ROOT=/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject
CKPT=epoch_3_step_1400
MERGED=/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-inject-merged
RESULT_DIR=/workspace/hanrui/syxin_old/Specforge/benchmarks/results
PYTHON=/workspace/miniconda3/envs/spec/bin/python3

echo "============================================"
echo " DFlash-LoRA-Inject Offline Evaluation"
echo " target : $BASE_MODEL"
echo " ckpt : $CKPT"
echo " merged : $MERGED"
echo " GPUs : $NUM_GPUS"
echo "============================================"

# Launch one worker per GPU on this node. Every expansion is quoted so that
# paths or forwarded arguments containing spaces or glob characters reach the
# launcher intact.
"$PYTHON" -m torch.distributed.run \
  --standalone \
  --nproc_per_node "$NUM_GPUS" \
  "$SCRIPT_DIR/eval_dflash_lora_inject.py" \
  --base-model "$BASE_MODEL" \
  --adapter-root "$ADAPTER_ROOT" \
  --ckpt "$CKPT" \
  --merged-path "$MERGED" \
  --block-size 16 \
  --output-dir "$RESULT_DIR" \
  "$@"