#!/bin/bash
# Launch full-parameter SFT for Qwen3-4B-Base with ms-swift.
# HF/ModelScope caches are pinned under the repo and evaluation/datasets
# are forced offline so the run never hits the network.
set -eux
# The training command below is piped through `tee`; without pipefail a
# failing `swift sft` would be masked by tee's exit status of 0.
set -o pipefail

# Repo root = parent of the directory containing this script.
# $(…) instead of backticks, and every expansion quoted (SC2086).
ROOT_DIR=$(dirname "$(dirname "$(readlink -f "$0")")")

export HF_HOME="$ROOT_DIR/cache/"
export MODELSCOPE_CACHE="$ROOT_DIR/cache/"
export HF_EVALUATE_OFFLINE=1
export HF_DATASETS_OFFLINE=1
export NPROC_PER_NODE=8
# ---- job configuration --------------------------------------------------
model_name=Qwen3-4B-Base

# Continued-pretrained checkpoint to fine-tune (reuse $model_name so the
# path stays consistent if the model is changed) and the DeepSpeed ZeRO-2
# bf16 config.
model_dir="$ROOT_DIR/exps_arr/$model_name/cpt_mono_0.5B"
config_file="$ROOT_DIR/configs/ds_z2_config_bf16.json"

dataset="$ROOT_DIR/data_arr/sft_0915_0.1/train.jsonl"
val_dataset="$ROOT_DIR/data_arr/sft_0915_0.1/valid.jsonl"
per_device_train_batch_size=12
gradient_accumulation_steps=1

max_lengths=1024
num_train_epochs=1

task=sft_0915_0.1
tag=base

# One output directory per (model, task, tag); keep a copy of this launcher
# next to the checkpoints for reproducibility. `--` guards against paths
# that begin with a dash; all expansions quoted (SC2086).
output_dir="$ROOT_DIR/exps_arr/$model_name/$task/$tag"
mkdir -p -- "$output_dir"
cp -- "$0" "$output_dir"
|
# Full-parameter SFT via ms-swift. Fractional --eval_steps/--save_steps
# (0.1) mean "every 10% of the total steps"; only the 3 newest checkpoints
# are kept, and all console output is mirrored into train.log.
# All variable expansions are quoted so paths with spaces survive (SC2086);
# pipeline failure is caught by `set -o pipefail` at the top of the script.
swift sft \
    --deepspeed "$config_file" \
    --add_version False \
    --check_model False \
    --load_from_cache_file \
    --model "$model_dir" \
    --train_type full \
    --attn_impl flash_attn \
    --dataset "$dataset" \
    --split_dataset_ratio 0 \
    --val_dataset "$val_dataset" \
    --torch_dtype bfloat16 \
    --num_train_epochs "$num_train_epochs" \
    --per_device_train_batch_size "$per_device_train_batch_size" \
    --per_device_eval_batch_size "$per_device_train_batch_size" \
    --learning_rate 2e-5 \
    --gradient_accumulation_steps "$gradient_accumulation_steps" \
    --save_strategy steps \
    --logging_strategy steps \
    --eval_strategy steps \
    --eval_steps 0.1 \
    --save_steps 0.1 \
    --logging_steps 10 \
    --max_length "$max_lengths" \
    --output_dir "$output_dir" \
    --create_checkpoint_symlink \
    --warmup_ratio 0.01 \
    --dataloader_num_workers 8 \
    --dataset_num_proc 16 \
    --seed 42 \
    --report_to tensorboard \
    --save_only_model \
    --save_total_limit 3 \
    --ddp_timeout 180000000 | tee "$output_dir/train.log"
|
|
|
|
| |
# Evaluate the best checkpoint ("best" symlink comes from
# --create_checkpoint_symlink above).
# NOTE(review): inference.sh is resolved relative to the caller's CWD —
# presumably the script is launched from its own directory; confirm, or
# switch to an absolute path such as "$ROOT_DIR"/… .
bash inference.sh "$output_dir/best"