Hanrui / syxin /run_train_multinode.sh
Lekr0's picture
Add files using upload-large-folder tool
7c50656 verified
#!/bin/bash
set -euo pipefail

# Defaults. NUM_GPUS and OUTPUT_DIR may be overridden by the first two
# positional arguments (see parse_args).
ROOT_DIR=/workspace/hanrui/syxin/Specforge
NUM_GPUS=8
OUTPUT_DIR=$ROOT_DIR/outputs/qwen3-8b-sft-32gpu-v3
CACHE_DIR=/tmp/specforge_cache
EXTRA_ARGS=()

#######################################
# Parse the command line: [NUM_GPUS] [OUTPUT_DIR] [extra training flags...]
#
# Each positional is optional and is consumed only when it does not start
# with '-'. The first argument used to be taken as NUM_GPUS unconditionally,
# so a flags-only invocation (e.g. `--num-epochs 1`) ended up with the flag
# name in NUM_GPUS and its value in OUTPUT_DIR instead of being forwarded.
#
# Globals:   NUM_GPUS, OUTPUT_DIR, EXTRA_ARGS (written)
# Arguments: the script's own positional parameters
#######################################
parse_args() {
  if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
    NUM_GPUS=$1
    shift
  fi
  if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
    OUTPUT_DIR=$1
    shift
  fi
  # Everything left over is kept verbatim, one array element per argument.
  EXTRA_ARGS=("$@")
}

parse_args "$@"
# Environment variables
#
# Every on-disk cache is placed under CACHE_DIR, the same directory that is
# passed to the trainer via --cache-dir, so there is a single place to change
# the cache root. The := fallback only matters if CACHE_DIR is unset.
: "${CACHE_DIR:=/tmp/specforge_cache}"
export TORCHINDUCTOR_CACHE_DIR="$CACHE_DIR/compiled_kernels"
export SPECFORGE_DATA_NUM_PROC=16
# Both spellings are set to the same value; presumably these are the older
# and newer names of PyTorch's allocator config variable -- TODO confirm.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True
# Prepend ROOT_DIR; append the previous PYTHONPATH only when it is non-empty
# so no trailing ':' (an empty path entry) is left behind.
export PYTHONPATH="${ROOT_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
export HF_DATASETS_CACHE="$CACHE_DIR/hf_datasets"
export HF_HOME="$CACHE_DIR/hf_home"
# Interpreter selection, in priority order:
#   1. a non-empty PYTHON_BIN already present in the environment,
#   2. the interpreter at DEFAULT_SPECFORGE_PY, if that file is executable,
#   3. whatever `python3` resolves to on PATH.
DEFAULT_SPECFORGE_PY=/workspace/miniconda3/envs/spec/bin/python3
if [[ -n "${PYTHON_BIN:-}" ]]; then
  : # caller-supplied value wins; nothing to do
elif [[ -x "$DEFAULT_SPECFORGE_PY" ]]; then
  PYTHON_BIN="$DEFAULT_SPECFORGE_PY"
else
  PYTHON_BIN=python3
fi
# The training script path below is relative, so run from ROOT_DIR.
cd "$ROOT_DIR"

# northjob has already set up the distributed environment variables via
# torchrun. Run the training script directly; do not launch
# torch.distributed.run again.
#
# All expansions are quoted so paths containing whitespace (OUTPUT_DIR can
# come from the command line) reach the trainer as single arguments.
# The ${EXTRA_ARGS[@]+...} form expands to nothing when the array is empty;
# a bare "${EXTRA_ARGS[@]}" aborts under `set -u` on bash older than 4.4.
"$PYTHON_BIN" scripts/train_dflash_lora_inject.py \
  --target-model-path /workspace/models/Qwen3-8B \
  --target-model-backend hf \
  --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K \
  --output-dir "$OUTPUT_DIR" \
  --block-size 16 \
  --attention-backend additive \
  --attn-implementation sdpa \
  --max-length 2048 \
  --batch-size 4 \
  --accumulation-steps 16 \
  --num-epochs 3 \
  --learning-rate 5e-5 \
  --loss-decay-gamma 7 \
  --gradient-checkpointing \
  --chat-template qwen \
  --log-interval 50 \
  --save-interval 500 \
  --cache-dir "$CACHE_DIR" \
  --lora-rank 32 \
  --lora-alpha 64 \
  --lora-dropout 0.1 \
  --trust-remote-code \
  --dataloader-num-workers 0 \
  ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}