Hanrui / syxin /launch_train.sh
Lekr0's picture
Add files using upload-large-folder tool
7c50656 verified
#!/bin/bash
#
# Launches scripts/train_dflash_lora_inject.py under torchrun with 8 processes
# per node, using the model at /workspace/models/Qwen3-8B.
#
# Usage:
#   launch_train.sh [EXTRA_TRAINING_ARGS...]
#
#   Any arguments given to this script are appended verbatim after the
#   built-in flags on the training script's command line. With no arguments
#   the command line is exactly the built-in one.
#
# Environment:
#   The variables exported below are set unconditionally; values already
#   present in the caller's environment are overwritten.
#
# All paths are absolute and tied to the /workspace layout; the script exits
# (set -e) if the Specforge checkout directory cannot be entered.
set -euo pipefail

# The training script path passed to torchrun is relative to this directory.
cd /workspace/hanrui/syxin/Specforge

# Cache locations, all kept under /workspace/hanrui/cache.
export TORCHINDUCTOR_CACHE_DIR=/workspace/hanrui/cache/compiled_kernels
export HF_DATASETS_CACHE=/workspace/hanrui/cache/hf_datasets
export HF_HOME=/workspace/hanrui/cache/hf_home

# NOTE(review): presumably the worker count for SpecForge dataset
# preprocessing -- confirm against the training code.
export SPECFORGE_DATA_NUM_PROC=16

# The same allocator setting is exported under both variable names.
# NOTE(review): presumably so it takes effect whichever name the installed
# PyTorch reads -- confirm against the PyTorch version in use.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True

# Flags for scripts/train_dflash_lora_inject.py, kept in their original order.
train_args=(
  --target-model-path /workspace/models/Qwen3-8B
  --target-model-backend hf
  --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K
  # NOTE(review): the directory name says "32gpu", but this launcher starts
  # 8 processes and passes no --nnodes -- confirm the name is intentional.
  --output-dir outputs/qwen3-8b-sft-32gpu-v2
  --block-size 16
  --attention-backend additive
  --attn-implementation sdpa
  --max-length 2048
  --batch-size 4
  --accumulation-steps 8
  --num-epochs 3
  --learning-rate 5e-5
  --loss-decay-gamma 7
  --gradient-checkpointing
  --chat-template qwen
  --log-interval 50
  --save-interval 500
  --cache-dir /workspace/hanrui/cache
  --lora-rank 32
  --lora-alpha 64
  --lora-dropout 0.1
  --trust-remote-code
  --dataloader-num-workers 0
)

# exec replaces this shell with torchrun, so a signal sent to the launcher's
# PID (e.g. by a job scheduler or `kill`) is delivered to torchrun itself
# instead of killing only the wrapper shell, and torchrun's exit status is the
# script's exit status.
exec torchrun --nproc_per_node=8 \
  scripts/train_dflash_lora_inject.py \
  "${train_args[@]}" \
  "$@"