# Atlas A2

```shell
# 2 nodes * 8 cards per node
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NNODES=2 \
NODE_RANK=1 \
MASTER_ADDR=xxx.xxx.xxx.xxx \
MASTER_PORT=29500 \
NPROC_PER_NODE=8 \
HCCL_SOCKET_IFNAME=xxx \
megatron sft \
    --model 'Qwen/Qwen3-8B' \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \
    --save './SAVE' \
    --tuner_type 'lora' \
    --lora_rank 8 \
    --lora_alpha 32 \
    --target_modules 'all-linear' \
    --tensor_model_parallel_size 2 \
    --pipeline_model_parallel_size 1 \
    --context_parallel_size 1 \
    --sequence_parallel true \
    --micro_batch_size 1 \
    --global_batch_size 64 \
    --recompute_granularity selective \
    --recompute_modules core_attn \
    --cross_entropy_loss_fusion true \
    --no_gradient_accumulation_fusion true \
    --lr 1e-4 \
    --lr_warmup_fraction 0.05 \
    --min_lr 1e-5 \
    --max_epochs 1 \
    --log_interval 5 \
    --num_workers 4
```
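This is a torchrun-style multi-node launch: the same command must be started on both machines, changing only `NODE_RANK` (0 on the node whose IP is given as `MASTER_ADDR`, 1 in the command shown here), and `MASTER_ADDR`, `MASTER_PORT`, and `HCCL_SOCKET_IFNAME` must refer to a network interface reachable from both nodes. The sketch below works out the parallelism bookkeeping implied by the flags above; it is standard Megatron-LM arithmetic done by hand, not output of the `megatron` CLI, and the shell variable names are illustrative only.

```shell
# Illustrative arithmetic only; these variable names are not real CLI flags.
WORLD_SIZE=$((2 * 8))                # NNODES * NPROC_PER_NODE = 16 NPUs
TP=2; PP=1; CP=1                     # tensor / pipeline / context parallel sizes
DP=$((WORLD_SIZE / (TP * PP * CP)))  # data-parallel size = 16 / 2 = 8
MICRO_BS=1; GLOBAL_BS=64
GRAD_ACC=$((GLOBAL_BS / (MICRO_BS * DP)))  # 64 / (1 * 8) = 8 accumulation steps
echo "DP=${DP}, gradient accumulation steps=${GRAD_ACC}"
```

Each of the 8 data-parallel replicas therefore accumulates 8 micro-batches of size 1 per optimizer step; if you change any of these flags, keep `global_batch_size` divisible by `micro_batch_size` times the data-parallel size, or Megatron will refuse to start.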