FasterDFlash
/

Hanrui

Model card Files Files and versions

Hanrui / syxin /run_train_multinode.sh

Lekr0's picture

Add files using upload-large-folder tool

7c50656 verified about 1 month ago

history blame contribute delete

1.82 kB

	#!/bin/bash
	# Launcher for scripts/train_dflash_lora_inject.py.
	#
	# Usage: <this script> [NUM_GPUS] [OUTPUT_DIR] [extra trainer args...]
	#   NUM_GPUS    optional first positional; must not start with '-'.
	#   OUTPUT_DIR  optional second positional; must not start with '-'.
	#   Everything left over is collected in EXTRA_ARGS.
	set -euo pipefail

	ROOT_DIR=/workspace/hanrui/syxin/Specforge
	# NOTE(review): NUM_GPUS is accepted for interface compatibility, but nothing
	# in the visible part of this script reads it afterwards.
	NUM_GPUS=8
	OUTPUT_DIR=$ROOT_DIR/outputs/qwen3-8b-sft-32gpu-v3
	CACHE_DIR=/tmp/specforge_cache
	EXTRA_ARGS=()

	#######################################
	# Parse the optional positionals and collect the remaining arguments.
	# Globals:   NUM_GPUS, OUTPUT_DIR, EXTRA_ARGS (written)
	# Arguments: the script's command-line arguments
	#######################################
	parse_args() {
	  # A leading '-' marks an option, not a positional value. Without this
	  # guard the first option would be swallowed as NUM_GPUS and its value
	  # would then be mistaken for OUTPUT_DIR.
	  if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
	    NUM_GPUS=$1
	    shift
	  fi
	  if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
	    OUTPUT_DIR=$1
	    shift
	  fi
	  EXTRA_ARGS=("$@")
	}

	parse_args "$@"

	# Environment variables
	# Every cache location hangs off a single root so it cannot drift from
	# CACHE_DIR; the fallback is the value this script has always used.
	cache_root=${CACHE_DIR:-/tmp/specforge_cache}
	export TORCHINDUCTOR_CACHE_DIR="$cache_root/compiled_kernels"
	export SPECFORGE_DATA_NUM_PROC=16
	# Both spellings of the allocator setting are exported, presumably so that
	# PyTorch versions reading either name pick it up — TODO confirm.
	export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
	export PYTORCH_ALLOC_CONF=expandable_segments:True
	# Append the previous PYTHONPATH only when it is non-empty, so no trailing
	# ':' (an empty path entry) is left behind.
	export PYTHONPATH="${ROOT_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
	export HF_DATASETS_CACHE="$cache_root/hf_datasets"
	export HF_HOME="$cache_root/hf_home"

	# Interpreter selection, in priority order:
	#   1. a non-empty PYTHON_BIN supplied by the caller's environment,
	#   2. the default interpreter path below, when it is executable,
	#   3. whatever `python3` resolves to on PATH.
	DEFAULT_SPECFORGE_PY=/workspace/miniconda3/envs/spec/bin/python3
	if [[ -n "${PYTHON_BIN:-}" ]]; then
	  : # caller-provided interpreter wins; leave it untouched
	elif [[ -x "$DEFAULT_SPECFORGE_PY" ]]; then
	  PYTHON_BIN="$DEFAULT_SPECFORGE_PY"
	else
	  PYTHON_BIN=python3
	fi

	# Run from the project root so the relative script path below resolves.
	cd "$ROOT_DIR" || { printf 'cannot cd to %s\n' "$ROOT_DIR" >&2; exit 1; }

	# northjob has already set the distributed environment variables via torchrun.
	# Run the training script directly; do not launch torch.distributed.run again.
	#
	# PYTHON_BIN is intentionally left unquoted so that an override carrying
	# extra words keeps working exactly as before.
	# The ${EXTRA_ARGS[@]+...} form expands to nothing when the array is empty;
	# a plain "${EXTRA_ARGS[@]}" aborts under `set -u` on bash older than 4.4.
	# shellcheck disable=SC2086
	$PYTHON_BIN scripts/train_dflash_lora_inject.py \
	  --target-model-path /workspace/models/Qwen3-8B \
	  --target-model-backend hf \
	  --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K \
	  --output-dir "$OUTPUT_DIR" \
	  --block-size 16 \
	  --attention-backend additive \
	  --attn-implementation sdpa \
	  --max-length 2048 \
	  --batch-size 4 \
	  --accumulation-steps 16 \
	  --num-epochs 3 \
	  --learning-rate 5e-5 \
	  --loss-decay-gamma 7 \
	  --gradient-checkpointing \
	  --chat-template qwen \
	  --log-interval 50 \
	  --save-interval 500 \
	  --cache-dir "$CACHE_DIR" \
	  --lora-rank 32 \
	  --lora-alpha 64 \
	  --lora-dropout 0.1 \
	  --trust-remote-code \
	  --dataloader-num-workers 0 \
	  ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}