#!/bin/bash
#
# Launch scripts/train_dflash_lora_inject.py with a fixed set of training
# hyper-parameters.
#
# Usage:
#   <this-script> [NUM_GPUS] [OUTPUT_DIR] [EXTRA_ARGS...]
#
#   NUM_GPUS    Optional first positional argument (must not start with "-").
#               It is stored but not referenced by any command in this script.
#   OUTPUT_DIR  Optional second positional argument (must not start with "-");
#               passed to the trainer as --output-dir.
#   EXTRA_ARGS  Every remaining argument is appended verbatim to the trainer
#               command line.
#
# Environment:
#   PYTHON_BIN  Interpreter to use. When unset or empty, falls back to
#               DEFAULT_SPECFORGE_PY if that file is executable, else python3.
#   PYTHONPATH  ROOT_DIR is prepended to any existing value.
set -euo pipefail

ROOT_DIR=/workspace/hanrui/syxin/Specforge
NUM_GPUS=8
OUTPUT_DIR="$ROOT_DIR/outputs/qwen3-8b-sft-32gpu-v3"
CACHE_DIR=/tmp/specforge_cache

# Parse arguments.
# A leading "-" marks the start of the pass-through options, so neither
# positional slot may swallow such a token (otherwise an invocation like
# `script --lora-rank 16` would treat "--lora-rank" as NUM_GPUS and "16" as
# OUTPUT_DIR).
if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
  NUM_GPUS=$1
  shift
fi
if [[ $# -ge 1 && "${1:0:1}" != "-" ]]; then
  OUTPUT_DIR=$1
  shift
fi
EXTRA_ARGS=("$@")

# Environment variables.
# All cache locations live under CACHE_DIR.
export TORCHINDUCTOR_CACHE_DIR="$CACHE_DIR/compiled_kernels"
export SPECFORGE_DATA_NUM_PROC=16
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True
export PYTHONPATH="$ROOT_DIR:${PYTHONPATH:-}"
export HF_DATASETS_CACHE="$CACHE_DIR/hf_datasets"
export HF_HOME="$CACHE_DIR/hf_home"

# Python binary: honour a caller-supplied PYTHON_BIN, otherwise prefer the
# dedicated conda environment and fall back to whatever python3 is on PATH.
DEFAULT_SPECFORGE_PY=/workspace/miniconda3/envs/spec/bin/python3
if [[ -z "${PYTHON_BIN:-}" ]]; then
  if [[ -x "$DEFAULT_SPECFORGE_PY" ]]; then
    PYTHON_BIN="$DEFAULT_SPECFORGE_PY"
  else
    PYTHON_BIN=python3
  fi
fi

# The trainer is addressed by a path relative to ROOT_DIR.
cd "$ROOT_DIR" || {
  printf 'error: cannot cd to %s\n' "$ROOT_DIR" >&2
  exit 1
}

# northjob has already set the distributed environment variables via torchrun.
# Run the training script directly; do not launch torch.distributed.run again.
#
# The ${EXTRA_ARGS[@]+...} form expands to nothing when the array is empty,
# which avoids the "unbound variable" error that `set -u` raises for empty
# arrays on bash versions older than 4.4.
"$PYTHON_BIN" scripts/train_dflash_lora_inject.py \
  --target-model-path /workspace/models/Qwen3-8B \
  --target-model-backend hf \
  --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K \
  --output-dir "$OUTPUT_DIR" \
  --block-size 16 \
  --attention-backend additive \
  --attn-implementation sdpa \
  --max-length 2048 \
  --batch-size 4 \
  --accumulation-steps 16 \
  --num-epochs 3 \
  --learning-rate 5e-5 \
  --loss-decay-gamma 7 \
  --gradient-checkpointing \
  --chat-template qwen \
  --log-interval 50 \
  --save-interval 500 \
  --cache-dir "$CACHE_DIR" \
  --lora-rank 32 \
  --lora-alpha 64 \
  --lora-dropout 0.1 \
  --trust-remote-code \
  --dataloader-num-workers 0 \
  ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}