# For more information on multi-node training launch methods, refer to:
# https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NNODES=2 \
NODE_RANK=0 \
MASTER_ADDR=127.0.0.1 \
MASTER_PORT=29500 \
NPROC_PER_NODE=4 \
megatron sft \
    --load Qwen2.5-14B-mcore \
    --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
    --tensor_model_parallel_size 4 \
    --micro_batch_size 1 \
    --global_batch_size 16 \
    --packing true \
    --recompute_granularity selective \
    --train_iters 2000 \
    --eval_iters 50 \
    --finetune true \
    --cross_entropy_loss_fusion true \
    --lr 1e-5 \
    --lr_warmup_iters 100 \
    --min_lr 1e-6 \
    --save megatron_output/Qwen2.5-14B \
    --eval_interval 200 \
    --save_interval 200 \
    --max_length 8192 \
    --num_workers 8 \
    --dataset_num_proc 8 \
    --no_save_optim true \
    --no_save_rng true \
    --sequence_parallel true \
    --use_flash_attn true
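
# A minimal sketch of the matching launch command on the second node (rank 1),
# assuming the same environment, script, and dataset are available there.
# Only NODE_RANK differs per node; MASTER_ADDR must be node 0's reachable IP
# on both nodes (the xxx.xxx.xxx.xxx below is a placeholder, not a real value).
# The megatron sft arguments are identical to the node-0 command above.
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NNODES=2 \
NODE_RANK=1 \
MASTER_ADDR=xxx.xxx.xxx.xxx \
MASTER_PORT=29500 \
NPROC_PER_NODE=4 \
megatron sft \
    --load Qwen2.5-14B-mcore \
    --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
    --tensor_model_parallel_size 4 \
    --micro_batch_size 1 \
    --global_batch_size 16 \
    --packing true \
    --recompute_granularity selective \
    --train_iters 2000 \
    --eval_iters 50 \
    --finetune true \
    --cross_entropy_loss_fusion true \
    --lr 1e-5 \
    --lr_warmup_iters 100 \
    --min_lr 1e-6 \
    --save megatron_output/Qwen2.5-14B \
    --eval_interval 200 \
    --save_interval 200 \
    --max_length 8192 \
    --num_workers 8 \
    --dataset_num_proc 8 \
    --no_save_optim true \
    --no_save_rng true \
    --sequence_parallel true \
    --use_flash_attn true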