#!/bin/bash
#
# Launch a supervised fine-tuning (SFT) run of train_sft.py through torchrun,
# with one worker process per device on this node.
#
# Every setting below can be overridden from the environment; when unset, the
# built-in default is used. Example:
#   NUM_GPUS=4 OUTPUT_DIR=./out_debug ./run_sft.sh
#
# Environment (all optional):
#   MODEL_PATH          value passed as --model_name_or_path
#   DATA_PATH           value passed as --data_path
#   OUTPUT_DIR          value passed as --output_dir
#   DS_CONFIG           value passed as --deepspeed
#   NUM_GPUS            number of worker processes / visible devices
#   BATCH_SIZE_PER_GPU  value passed as --per_device_train_batch_size
#   GRAD_ACCUM_STEPS    value passed as --gradient_accumulation_steps
#   MAX_SEQ_LENGTH      value passed as --max_seq_length

# Abort on the first failing command, on use of an unset variable, and on a
# failure in any pipeline stage.
set -euo pipefail

MODEL_PATH="${MODEL_PATH:-/model/BitCPM-CANN-1B-unquantized}"
DATA_PATH="${DATA_PATH:-/dataset/HuggingFaceH4_ultrachat_200k/data/train_sft-00000-of-00003-a3ecf92756993583.parquet}"
OUTPUT_DIR="${OUTPUT_DIR:-./output_sft}"
DS_CONFIG="${DS_CONFIG:-./ds_config.json}"

NUM_GPUS="${NUM_GPUS:-8}"
BATCH_SIZE_PER_GPU="${BATCH_SIZE_PER_GPU:-2}"
GRAD_ACCUM_STEPS="${GRAD_ACCUM_STEPS:-1}"
MAX_SEQ_LENGTH="${MAX_SEQ_LENGTH:-8192}"

# Reject a non-numeric or zero device count before it reaches torchrun.
if [[ ! "${NUM_GPUS}" =~ ^[1-9][0-9]*$ ]]; then
  printf 'error: NUM_GPUS must be a positive integer, got "%s"\n' \
    "${NUM_GPUS}" >&2
  exit 2
fi

# Build the comma-separated device list "0,1,...,NUM_GPUS-1" so the visible
# devices always match the number of processes torchrun spawns.
device_ids=""
for ((i = 0; i < NUM_GPUS; i++)); do
  device_ids+="${device_ids:+,}${i}"
done

# The same device list is exported under both the Ascend and the CUDA names.
export ASCEND_RT_VISIBLE_DEVICES="${device_ids}"
export CUDA_VISIBLE_DEVICES="${device_ids}"
# NOTE(review): presumably tells DeepSpeed to skip its CUDA version check when
# running on non-CUDA devices -- confirm against the DeepSpeed version in use.
export DS_SKIP_CUDA_CHECK=1

# Collect the training arguments in an array so that every value is passed as
# exactly one word, even if a path contains spaces or glob characters.
train_args=(
  --model_name_or_path "${MODEL_PATH}"
  --data_path "${DATA_PATH}"
  --max_seq_length "${MAX_SEQ_LENGTH}"
  --output_dir "${OUTPUT_DIR}"
  --per_device_train_batch_size "${BATCH_SIZE_PER_GPU}"
  --gradient_accumulation_steps "${GRAD_ACCUM_STEPS}"
  --max_steps 100
  --learning_rate 2e-5
  --lr_scheduler_type cosine
  --warmup_ratio 0.2
  --weight_decay 0.0
  --logging_steps 2
  --save_steps 500
  --save_total_limit 3
  --bf16
  --deepspeed "${DS_CONFIG}"
  --gradient_checkpointing
  --seed 42
  --dataloader_num_workers 4
  --report_to tensorboard
  --logging_dir /data/tensorboard/sft
  --train_on_prompt false
  # Single-quoted so the JSON reaches the trainer verbatim as one argument.
  --gradient_checkpointing_kwargs '{"use_reentrant": false}'
)

# The script's exit status is torchrun's exit status.
torchrun --nproc_per_node="${NUM_GPUS}" train_sft.py "${train_args[@]}"