#!/bin/bash
#
# Launches scripts/train_dflash_lora_inject.py through torchrun with 8
# processes on this node, using /workspace/models/Qwen3-8B as the target
# model and writing results to outputs/qwen3-8b-sft-32gpu-v2 (relative to
# the Specforge checkout this script changes into).
#
# NOTE(review): the output directory name says "32gpu" while
# --nproc_per_node is 8 and no multi-node options are passed here —
# confirm that the name (or the process count) is what you intend.

# Strict mode lives on its own line: options placed on the shebang line are
# not applied when the script is run as `bash script.sh`.
set -euo pipefail

# The training script path and --output-dir below are relative to this
# directory; with `set -e` a failed cd aborts before anything is launched.
cd /workspace/hanrui/syxin/Specforge

# Cache locations, all kept under /workspace/hanrui/cache.
export TORCHINDUCTOR_CACHE_DIR=/workspace/hanrui/cache/compiled_kernels
export HF_DATASETS_CACHE=/workspace/hanrui/cache/hf_datasets
export HF_HOME=/workspace/hanrui/cache/hf_home

export SPECFORGE_DATA_NUM_PROC=16

# The same allocator setting is exported under two variable names —
# presumably to cover both the older and the newer PyTorch spelling; verify
# against the installed PyTorch version.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True

# Arguments for the training script, kept in an array so they can be grouped
# and commented without relying on backslash line continuations.
train_args=(
  # Model and data
  --target-model-path /workspace/models/Qwen3-8B
  --target-model-backend hf
  --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K
  --output-dir outputs/qwen3-8b-sft-32gpu-v2
  --chat-template qwen
  --cache-dir /workspace/hanrui/cache
  --trust-remote-code

  # Attention and sequence settings
  --block-size 16
  --attention-backend additive
  --attn-implementation sdpa
  --max-length 2048

  # Optimization
  # NOTE(review): if --batch-size is per process, the effective batch is
  # 4 * 8 accumulation steps * 8 processes = 256 — confirm in the trainer.
  --batch-size 4
  --accumulation-steps 8
  --num-epochs 3
  --learning-rate 5e-5
  --loss-decay-gamma 7
  --gradient-checkpointing

  # LoRA adapter
  --lora-rank 32
  --lora-alpha 64
  --lora-dropout 0.1

  # Logging, checkpointing and data loading
  --log-interval 50
  --save-interval 500
  --dataloader-num-workers 0
)

torchrun --nproc_per_node=8 \
  scripts/train_dflash_lora_inject.py \
  "${train_args[@]}"