#!/bin/bash
#
# Launches scripts/train_dflash_lora_inject.py for the Qwen3-8B target model
# through `python -m torch.distributed.run` in standalone mode.
#
# Usage: <this-script> [NUM_GPUS] [EXTRA_ARGS...]
#   NUM_GPUS    value handed to --nproc_per_node (default: 8)
#   EXTRA_ARGS  any further arguments, appended verbatim after the built-in
#               options of the training script
#
# Exit status: that of the torch.distributed.run process.

# Stop on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

readonly ROOT_DIR=/workspace/hanrui/syxin/Specforge
readonly CACHE_DIR=/tmp/specforge_cache
readonly PYTHON_BIN=/workspace/miniconda3/envs/spec/bin/python3

# Cache locations, all kept under CACHE_DIR.
export TORCHINDUCTOR_CACHE_DIR="${CACHE_DIR}/compiled_kernels"
export HF_DATASETS_CACHE="${CACHE_DIR}/hf_datasets"
export HF_HOME="${CACHE_DIR}/hf_home"

export SPECFORGE_DATA_NUM_PROC=16

# NOTE(review): both variable names are set to the same value; presumably this
# covers PyTorch versions that read one name or the other -- confirm.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_ALLOC_CONF=expandable_segments:True

# Put the repository root first on the module search path while keeping
# whatever PYTHONPATH the caller already had (empty when unset).
export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}"

# First positional argument selects the per-node process count. It is passed
# through untouched (not forced to be numeric) so that any value accepted by
# --nproc_per_node keeps working.
NUM_GPUS=${1:-8}
if (( $# >= 1 )); then
  shift
fi
# From here on "$@" holds only the extra arguments. It is forwarded directly
# rather than through a copy array, because expanding an empty array under
# `set -u` is an error on bash releases older than 4.4.

# Options for torch.distributed.run itself.
launcher_args=(
  --standalone
  --nproc_per_node "${NUM_GPUS}"
)

# Options for the training script.
train_args=(
  --target-model-path /workspace/models/Qwen3-8B
  --target-model-backend hf
  --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K
  --output-dir "${ROOT_DIR}/outputs/qwen3-8b-dflash-lora-inject"
  --block-size 16
  --attention-backend additive
  --attn-implementation sdpa
  --max-length 2048
  --batch-size 8
  --accumulation-steps 8
  --num-epochs 3
  --learning-rate 5e-5
  --loss-decay-gamma 7
  --gradient-checkpointing
  --chat-template qwen
  --log-interval 50
  --save-interval 500
  --cache-dir "${CACHE_DIR}"
  --lora-rank 32
  --lora-alpha 64
  --lora-dropout 0.1
  --trust-remote-code
  --dataloader-num-workers 0
  --early-stop
  --early-stop-patience 5
  --early-stop-min-delta 0.005
)

# Caller-supplied extras come last, after every built-in option.
"${PYTHON_BIN}" -m torch.distributed.run \
  "${launcher_args[@]}" \
  "${ROOT_DIR}/scripts/train_dflash_lora_inject.py" \
  "${train_args[@]}" \
  "$@"