#!/bin/bash
#
# Launches scripts/train_dflash_lora_inject.py through torchrun with 8
# processes on this node, after switching into the Specforge checkout and
# exporting the cache / allocator environment variables listed below.
#
# Usage: run the script directly; it takes no arguments. All paths and
# hyper-parameters are hard-coded in this file.
#
# NOTE(review): the output directory name mentions "32gpu" while this script
# starts 8 processes per node and passes no multi-node options -- confirm that
# the name is intentional (e.g. the script is started on several nodes by an
# outer launcher).
set -euo pipefail

# Relative paths below (the training script, the output directory) are
# resolved against the repository root.
cd /workspace/hanrui/syxin/Specforge

# Environment for the training processes.
# NOTE(review): PYTORCH_CUDA_ALLOC_CONF and PYTORCH_ALLOC_CONF carry the same
# value; presumably both spellings are set so that either PyTorch naming
# scheme picks it up -- confirm against the installed PyTorch version.
export \
  TORCHINDUCTOR_CACHE_DIR=/workspace/hanrui/cache/compiled_kernels \
  SPECFORGE_DATA_NUM_PROC=16 \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  PYTORCH_ALLOC_CONF=expandable_segments:True \
  HF_DATASETS_CACHE=/workspace/hanrui/cache/hf_datasets \
  HF_HOME=/workspace/hanrui/cache/hf_home

# Arguments handed to the training script, kept in an array so that each
# flag/value pair sits on its own line without backslash continuations.
# The order matches the order in which they are passed on the command line.
train_args=(
  # Target model and training data.
  --target-model-path /workspace/models/Qwen3-8B
  --target-model-backend hf
  --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K

  # Output location, relative to the repository root.
  --output-dir outputs/qwen3-8b-sft-32gpu-v2

  # Attention / sequence settings.
  --block-size 16
  --attention-backend additive
  --attn-implementation sdpa
  --max-length 2048

  # Optimisation schedule.
  --batch-size 4
  --accumulation-steps 8
  --num-epochs 3
  --learning-rate 5e-5
  --loss-decay-gamma 7
  --gradient-checkpointing

  # Templating, logging, checkpointing and caching.
  --chat-template qwen
  --log-interval 50
  --save-interval 500
  --cache-dir /workspace/hanrui/cache

  # LoRA adapter settings.
  --lora-rank 32
  --lora-alpha 64
  --lora-dropout 0.1

  # Miscellaneous.
  --trust-remote-code
  --dataloader-num-workers 0
)

# torchrun is the last command, so its exit status becomes the script's.
torchrun --nproc_per_node=8 scripts/train_dflash_lora_inject.py "${train_args[@]}"