# Atlas A2: 2 nodes * 8 cards per node
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NNODES=2 \
NODE_RANK=1 \
MASTER_ADDR=xxx.xxx.xxx.xxx \
MASTER_PORT=29500 \
NPROC_PER_NODE=8 \
HCCL_SOCKET_IFNAME=xxx \
megatron sft \
    --model 'Qwen/Qwen3-8B' \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#1000' \
    --save './SAVE' \
    --tuner_type 'lora' \
    --lora_rank 8 \
    --lora_alpha 32 \
    --target_modules 'all-linear' \
    --tensor_model_parallel_size 2 \
    --pipeline_model_parallel_size 1 \
    --context_parallel_size 1 \
    --sequence_parallel true \
    --micro_batch_size 1 \
    --global_batch_size 64 \
    --recompute_granularity selective \
    --recompute_modules core_attn \
    --cross_entropy_loss_fusion true \
    --no_gradient_accumulation_fusion true \
    --lr 1e-4 \
    --lr_warmup_fraction 0.05 \
    --min_lr 1e-5 \
    --max_epochs 1 \
    --log_interval 5 \
    --num_workers 4
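
Launch the same command on both nodes, changing only `NODE_RANK` (0 on the node whose IP is `MASTER_ADDR`, 1 on the other, as shown here); fill in `MASTER_ADDR`, `MASTER_PORT`, and `HCCL_SOCKET_IFNAME` with the master node's IP, a free port, and the network interface used for HCCL communication.

For orientation, the data-parallel degree and gradient-accumulation steps implied by this configuration follow the standard Megatron-LM relations. The sketch below is illustrative only; its variable names are not flags of the command above.

```python
# Derived parallelism figures for the 2-node * 8-card example above,
# assuming the standard Megatron-LM relations (illustrative sketch).
world_size = 2 * 8        # NNODES * NPROC_PER_NODE
tp, pp, cp = 2, 1, 1      # tensor / pipeline / context parallel sizes
micro_batch_size = 1
global_batch_size = 64

# Data-parallel replicas = total ranks / (TP * PP * CP)
data_parallel_size = world_size // (tp * pp * cp)                                # 16 // 2 = 8
# Micro-batches accumulated per optimizer step
grad_accum_steps = global_batch_size // (micro_batch_size * data_parallel_size)  # 64 // 8 = 8

print(data_parallel_size, grad_accum_steps)  # -> 8 8
```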