#!/bin/bash
# lmt-arr/scripts/sft_mt_4b.sh
set -euxo pipefail # pipefail: don't let tee mask a failed swift run
# repo root, assuming this script lives in <repo>/scripts/
ROOT_DIR=$(dirname "$(dirname "$(readlink -f "$0")")")
export HF_HOME="$ROOT_DIR/cache/"
export MODELSCOPE_CACHE="$ROOT_DIR/cache/"
export HF_EVALUATE_OFFLINE=1
export HF_DATASETS_OFFLINE=1
export NPROC_PER_NODE=8
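# ms-swift's launcher reads NPROC_PER_NODE and spawns one torchrun worker per GPU
# (8 per node here); the offline flags above keep datasets/evaluate on the local cache.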
# model
# model_name=GemmaX2-28-2B-Pretrain
# model_name=Qwen2.5-3B
# model_name=Qwen2.5-7B
model_name=Qwen3-4B-Base
# model_dir=$ROOT_DIR/model_card/$model_name
model_dir=$ROOT_DIR/exps_arr/Qwen3-4B-Base/cpt_mono_0.5B
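# NOTE: SFT starts from the continued-pretraining checkpoint (cpt_mono_0.5B,
# presumably CPT on ~0.5B tokens of monolingual data) rather than the raw base
# weights; switch to the model_card path above to fine-tune the plain base model.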
config_file=$ROOT_DIR/configs/ds_z2_config_bf16.json
# resume_from_checkpoint=
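# To resume an interrupted run, set resume_from_checkpoint to a checkpoint dir and
# add this line to the swift sft call below (forwarded to the transformers Trainer):
#   --resume_from_checkpoint "$resume_from_checkpoint" \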
# data
dataset=$ROOT_DIR/data_arr/sft_0915_0.1/train.jsonl
val_dataset=$ROOT_DIR/data_arr/sft_0915_0.1/valid.jsonl
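# Both JSONL files are assumed to be in ms-swift's standard messages format, e.g.:
#   {"messages": [{"role": "user", "content": "<source text>"},
#                 {"role": "assistant", "content": "<target translation>"}]}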
per_device_train_batch_size=12
gradient_accumulation_steps=1
max_length=1024
num_train_epochs=1
# save
task=sft_0915_0.1
tag=base
output_dir=$ROOT_DIR/exps_arr/$model_name/$task/$tag
mkdir -p "$output_dir"
cp "$0" "$output_dir" # snapshot this script alongside the run for reproducibility
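# Flag notes for the swift sft call below:
# - eval_steps/save_steps < 1 are fractions of total training steps
#   (i.e. evaluate and save every 10% of the run).
# - --save_only_model drops optimizer/scheduler state from checkpoints:
#   smaller files, but no exact resume.
# - --create_checkpoint_symlink maintains "best"/"last" symlinks under
#   output_dir; the inference step at the end relies on "best".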
swift sft \
--deepspeed $config_file \
--add_version False \
--check_model False \
--load_from_cache_file \
--model $model_dir \
--train_type full \
--attn_impl flash_attn \
--dataset $dataset \
--split_dataset_ratio 0 \
--val_dataset $val_dataset \
--torch_dtype bfloat16 \
--num_train_epochs $num_train_epochs \
--per_device_train_batch_size $per_device_train_batch_size \
--per_device_eval_batch_size $per_device_train_batch_size \
--learning_rate 2e-5 \
--gradient_accumulation_steps $gradient_accumulation_steps \
--save_strategy steps \
--logging_strategy steps \
--eval_strategy steps \
--eval_steps 0.1 \
--save_steps 0.1 \
--logging_steps 10 \
--max_length $max_length \
--output_dir $output_dir \
--create_checkpoint_symlink \
--warmup_ratio 0.01 \
--dataloader_num_workers 8 \
--dataset_num_proc 16 \
--seed 42 \
--report_to tensorboard \
--save_only_model \
--save_total_limit 3 \
--ddp_timeout 180000000 | tee "$output_dir/train.log"
# predict
bash inference.sh "$output_dir/best"
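# inference.sh is assumed to sit next to this script (it is called with a relative
# path), so launch from scripts/, e.g.: cd scripts && bash sft_mt_4b.sh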