# Env: 8 * A100
# Max Length: 65536
# GPU Memory: 8 * 40GiB, Training Speed 26s/it
NPROC_PER_NODE=8 \
CELOSS_PARALLEL_SIZE=2048 \
swift sft \
    --model Qwen/Qwen2.5-3B-Instruct \
    --dataset 'AI-ModelScope/LongAlpaca-12k' \
    --load_from_cache_file true \
    --train_type lora \
    --torch_dtype bfloat16 \
    --per_device_train_batch_size 4 \
    --target_modules all-linear \
    --gradient_accumulation_steps 8 \
    --save_total_limit 2 \
    --save_only_model true \
    --save_steps 50 \
    --max_length 65536 \
    --warmup_ratio 0.05 \
    --attn_impl flash_attn \
    --sequence_parallel_size 8 \
    --logging_steps 1 \
    --use_logits_to_keep false \
    --padding_free true
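
With --sequence_parallel_size 8 equal to NPROC_PER_NODE=8, the eight ranks split each sequence among themselves, so each GPU holds roughly 65536 / 8 = 8192 tokens. CELOSS_PARALLEL_SIZE=2048 appears to bound how many token positions the cross-entropy loss is evaluated over at a time, capping the loss computation's peak memory on long sequences. Assuming that reading of the variable, below is a minimal PyTorch sketch of the general chunked cross-entropy idea; the function name and shapes are illustrative, not ms-swift's internals.

```python
import torch
import torch.nn.functional as F

def chunked_ce_loss(logits: torch.Tensor, labels: torch.Tensor,
                    chunk_size: int = 2048) -> torch.Tensor:
    """Cross-entropy over [num_tokens, vocab] logits, evaluated in
    chunks of `chunk_size` token positions to cap peak memory.
    Hypothetical illustration, not ms-swift's implementation."""
    losses = []
    for start in range(0, logits.size(0), chunk_size):
        chunk_logits = logits[start:start + chunk_size]
        chunk_labels = labels[start:start + chunk_size]
        # Sum per chunk; averaging is deferred until all chunks are done
        # so the result matches a single full-sequence mean loss.
        losses.append(F.cross_entropy(chunk_logits, chunk_labels,
                                      reduction="sum", ignore_index=-100))
    num_valid = (labels != -100).sum().clamp(min=1)
    return torch.stack(losses).sum() / num_valid

if __name__ == "__main__":
    vocab, num_tokens = 32, 65536
    logits = torch.randn(num_tokens, vocab)
    labels = torch.randint(0, vocab, (num_tokens,))
    # The chunked result matches the plain full-tensor cross-entropy.
    print(chunked_ce_loss(logits, labels),
          F.cross_entropy(logits, labels))
```

Chunking the loss introduces no approximation: each chunk's summed loss is exact and only the peak size of the softmax intermediates changes, which is why it trades a small amount of kernel-launch overhead for a large memory saving at 65536-token lengths.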