Upload folder using huggingface_hub
Browse files- scripts/cpt_mt_4b.sh +86 -0
- scripts/eval_multi.sh +60 -0
- scripts/inference.sh +168 -0
- scripts/nohup.out +0 -0
- scripts/result_mt.xlsx +0 -0
- scripts/sft_mt_4b.sh +76 -0
scripts/cpt_mt_4b.sh
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Continued pre-training (CPT) of Qwen3-4B-Base on 10-language monolingual
# data (0.5B tokens) via ms-swift (`swift pt`) with DeepSpeed ZeRO-2 bf16.
# Usage: bash cpt_mt_4b.sh   (expects model_card/, data_arr/, configs/ under repo root)
set -euxo pipefail  # pipefail: otherwise `| tee` masks a failed training run

# Repo root = parent of the directory containing this script.
ROOT_DIR=$(dirname "$(dirname "$(readlink -f "$0")")")

export HF_HOME="$ROOT_DIR/cache/"
export MODELSCOPE_CACHE="$ROOT_DIR/cache/"
export HF_EVALUATE_OFFLINE=1   # never hit the network for evaluate/datasets
export HF_DATASETS_OFFLINE=1
export NPROC_PER_NODE=8        # GPUs per node for swift's launcher
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# model
model_name=Qwen3-4B-Base
model_dir=$ROOT_DIR/model_card/$model_name

# config_file=$ROOT_DIR/configs/ds_z0_config.json
config_file=$ROOT_DIR/configs/ds_z2_config_bf16.json
# resume_from_checkpoint=$ROOT_DIR/exps_arr/Qwen3-1.7B-Base/cpt_96b_s2/60_langs_continue/checkpoint-14000

# data
train_dataset=(
    "$ROOT_DIR/data_arr/10lang_cpt_mono_0.5B/train1.jsonl"
)
val_dataset=$ROOT_DIR/data_arr/10lang_cpt_mono_0.5B/valid.jsonl
# val_dataset=$ROOT_DIR/data/60lang_cpt_96b_s2/valid.jsonl
per_device_train_batch_size=25 # 20
per_device_eval_batch_size=25
gradient_accumulation_steps=3 # 4 for 10B, 6 for 15B
max_lengths=2048
max_steps=5000

# save
task=cpt_10lang_mono
tag=0.5B


##############################


output_dir=$ROOT_DIR/exps_arr/$model_name/$task/$tag
mkdir -p "$output_dir"
cp "$0" "$output_dir"  # snapshot this script alongside the run outputs

# --resume_from_checkpoint $resume_from_checkpoint \

swift pt \
    --deepspeed "$config_file" \
    --add_version False \
    --check_model False \
    --model "$model_dir" \
    --train_type full \
    --streaming true \
    --packing true \
    --attn_impl flash_attn \
    --dataset "${train_dataset[@]}" \
    --split_dataset_ratio 0 \
    --val_dataset "$val_dataset" \
    --torch_dtype bfloat16 \
    --per_device_train_batch_size $per_device_train_batch_size \
    --per_device_eval_batch_size $per_device_eval_batch_size \
    --learning_rate 2e-5 \
    --warmup_ratio 0.05 \
    --gradient_accumulation_steps $gradient_accumulation_steps \
    --save_strategy steps \
    --logging_strategy steps \
    --eval_strategy steps \
    --eval_steps 1000 \
    --save_steps 1000 \
    --logging_steps 10 \
    --max_length $max_lengths \
    --max_steps $max_steps \
    --output_dir "$output_dir" \
    --dataloader_num_workers 8 \
    --dataset_num_proc 1 \
    --seed 42 \
    --report_to tensorboard \
    --ddp_timeout 180000000 | tee "$output_dir/train.log"

# --save_only_model \

####
# bash sft_mt.sh

# benchmark
# bash $ROOT_DIR/llm_evaluation/scripts/eval_all.sh
scripts/eval_multi.sh
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Score pre-existing MT decodes on FLORES-200 (zh<->X, both directions)
# with BLEU + COMET-22 via src/mt_scoring.py.
# Usage: bash eval_multi.sh <decode_dir>
#   <decode_dir> must contain <lp>/hypo.<lp>.txt for every language pair.
# NOTE: the original shebang was "# !/bin/bash" (a plain comment), so the
# script ran under the caller's default shell; fixed to a real shebang.
set -euxo pipefail
ROOT_DIR=$(dirname "$(dirname "$(readlink -f "$0")")")

export HF_HOME="./cache/"
export HF_DATASETS_CACHE="./cache/huggingface_cache/datasets"
export HF_EVALUATE_OFFLINE=1
export HF_DATASETS_OFFLINE=1

# Fail fast instead of silently scoring non-existent files when the
# argument is missing (previously defaulted to "").
decode_dir=${1:?"usage: $0 <decode_dir>"}

comet_model=$ROOT_DIR/model_card/wmt22-comet-da/checkpoints/model.ckpt
xcome_model=$ROOT_DIR/model_card/XCOMET-XXL/checkpoints/model.ckpt

src_file_strs=""
ref_file_strs=""
hypo_file_strs=""
lang_pair_strs=""

# Build comma-joined parallel lists (lang pair, src, ref, hypo) covering
# both translation directions of every zh-X pair.
for lang in en de ru bn hi th jv sw si km; do
    for src in $lang zh; do

        if [ "$src" = "zh" ]; then # zh -> X
            src_lang=zh
            tgt_lang=$lang
        else # X -> zh
            src_lang=$lang
            tgt_lang=zh
        fi

        lp=${src_lang}2${tgt_lang}
        # hypo_file=$decode_dir/${lang_pair}.txt
        # hypo_file=$decode_dir/hypo.${lp}.txt
        hypo_file=$decode_dir/$lp/hypo.${lp}.txt
        # hypo_file=$decode_dir/niu.${lp}.txt
        # hypo_file=$decode_dir/hypo.${lp}.$tgt_lang
        src_file=$ROOT_DIR/data/flores200/zh-${lang}/test.zh-$lang.$src_lang
        ref_file=$ROOT_DIR/data/flores200/zh-${lang}/test.zh-$lang.$tgt_lang

        # ${var:+$var,} prepends a comma only when the list is non-empty.
        src_file_strs=${src_file_strs:+$src_file_strs,}$src_file
        ref_file_strs=${ref_file_strs:+$ref_file_strs,}$ref_file
        hypo_file_strs=${hypo_file_strs:+$hypo_file_strs,}$hypo_file
        lang_pair_strs=${lang_pair_strs:+$lang_pair_strs,}$lp

    done
done


# metric="bleu,comet_22,xcomet_xxl"
metric="bleu,comet_22"
python "$ROOT_DIR/src/mt_scoring.py" \
    --metric $metric \
    --comet_22_path "$comet_model" \
    --xcomet_xxl_path "$xcome_model" \
    --lang_pair $lang_pair_strs \
    --src_file $src_file_strs \
    --ref_file $ref_file_strs \
    --hypo_file $hypo_file_strs \
    --record_file "result_mt.xlsx"
scripts/inference.sh
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Decode FLORES-200 test sets (zh<->X, both directions) with a trained model
# via `swift infer`, then score all hypotheses with BLEU + COMET-22.
# Usage: bash inference.sh <predict_model_dir>
set -euxo pipefail  # pipefail: otherwise `| tee` masks a failed decode
ROOT_DIR=$(dirname "$(dirname "$(readlink -f "$0")")")

export HF_HOME="$ROOT_DIR/cache/"
export MODELSCOPE_CACHE="$ROOT_DIR/cache/"
export HF_EVALUATE_OFFLINE=1
export HF_DATASETS_OFFLINE=1

config_file=$ROOT_DIR/configs/accelerate_config.yaml  # NOTE(review): unused below — confirm before deleting
export NPROC_PER_NODE=8


# model — fail fast: with the old ${1:-""} default an empty value made
# output_dir start at "/" and fed the `rm -rf` below a near-root path.
predict_model_dir=${1:?"usage: $0 <predict_model_dir>"}


# eval
comet_model=$ROOT_DIR/model_card/wmt22-comet-da/checkpoints/model.ckpt
xcome_model=$ROOT_DIR/model_card/XCOMET-XXL/checkpoints/model.ckpt


lang_pair_strs=""
src_file_strs=""
ref_file_strs=""
hypo_file_strs=""

# for lang in en ja ru de ug; do
for lang in en de ru bn hi th jv sw si km; do
    # for lang in en ja ko ru de fr it pt es;do
    for src in $lang zh; do

        if [ "$src" = "zh" ]; then # zh -> X
            src_lang=zh
            tgt_lang=$lang
        else # X -> zh
            src_lang=$lang
            tgt_lang=zh
        fi

        lp=${src_lang}2${tgt_lang}
        src_file=$ROOT_DIR/data_arr/flores200/zh-${lang}/test.zh-$lang.$src_lang
        ref_file=$ROOT_DIR/data_arr/flores200/zh-${lang}/test.zh-$lang.$tgt_lang
        # test_file=$ROOT_DIR/data_arr/sft_100k_ugbomn/test.$lp.jsonl
        test_file=$ROOT_DIR/data_arr/test/test.$lp.jsonl

        output_dir=$predict_model_dir/decode_result/$lp
        mkdir -p "$output_dir"
        ############################!!!!!
        # ${output_dir:?} aborts the script instead of expanding to "/*"
        # should the variable ever be empty/unset.
        rm -rf "${output_dir:?}"/*
        #######################
        cp "$0" "$output_dir"  # snapshot this script alongside the decodes

        swift infer \
            --infer_backend pt \
            --val_dataset "$test_file" \
            --load_from_cache_file True \
            --dataset_shuffle False \
            --val_dataset_shuffle False \
            --model "$predict_model_dir" \
            --torch_dtype bfloat16 \
            --max_new_tokens 1024 \
            --max_batch_size 16 \
            --num_beams 5 \
            --max_length 1024 \
            --dataset_num_proc 8 \
            --temperature 0 \
            --result_path "$output_dir/generated_predictions.jsonl" | tee "$output_dir/train.log"

        # Extract plain-text hypotheses from the JSONL decode output.
        jq -r '.response' "$output_dir/generated_predictions.jsonl" > "$output_dir/hypo.$lp.txt"

        hypo_file=$output_dir/hypo.$lp.txt

        # Comma-joined accumulators for mt_scoring.py; ${var:+$var,} adds a
        # separator only when the list is non-empty.
        lang_pair_strs=${lang_pair_strs:+$lang_pair_strs,}$lp
        src_file_strs=${src_file_strs:+$src_file_strs,}$src_file
        ref_file_strs=${ref_file_strs:+$ref_file_strs,}$ref_file
        hypo_file_strs=${hypo_file_strs:+$hypo_file_strs,}$hypo_file
    done
done


# # metric="bleu,comet_22,xcomet_xxl"
metric="bleu,comet_22"
python "$ROOT_DIR/src/mt_scoring.py" \
    --metric $metric \
    --comet_22_path "$comet_model" \
    --xcomet_xxl_path "$xcome_model" \
    --lang_pair $lang_pair_strs \
    --src_file $src_file_strs \
    --ref_file $ref_file_strs \
    --hypo_file $hypo_file_strs \
    --record_file "result_mt.xlsx"


# --- historical en<->mn_cn variant, kept for reference -------------------
# lang_pair_strs=""
# src_file_strs=""
# ref_file_strs=""
# hypo_file_strs=""

# #
# # for lang in ja ko ru de fr it pt es;do
# # for lang in ja ru de ug; do
# for lang in mn_cn; do
# for src in $lang en ;do

# if [ $src = "en" ]; then # en2zh
#     src_lang=en
#     tgt_lang=$lang
# else # zh2en
#     src_lang=$lang
#     tgt_lang=en
# fi

# lp=${src_lang}2${tgt_lang}
# src_file=$ROOT_DIR/data_arr/flores200/en-${lang}/test.en-$lang.$src_lang
# ref_file=$ROOT_DIR/data_arr/flores200/en-${lang}/test.en-$lang.$tgt_lang
# # test_file=$ROOT_DIR/data_arr/sft_100k_ugbomn/test.$lp.jsonl
# test_file=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/merge_0701/train1/test/test.$lp.jsonl

# output_dir=$predict_model_dir/decode_result/$lp
# mkdir -p $output_dir
# #############################!!!!!
# rm -rf $output_dir/*
# ########################
# cp $0 $output_dir

# # --load_args False \
# swift infer \
#     --infer_backend pt \
#     --val_dataset $test_file \
#     --load_from_cache_file True \
#     --dataset_shuffle False \
#     --val_dataset_shuffle False \
#     --model $predict_model_dir \
#     --torch_dtype bfloat16 \
#     --max_new_tokens 1024 \
#     --max_batch_size 8 \
#     --num_beams 5 \
#     --max_length 1024 \
#     --dataset_num_proc 8 \
#     --temperature 0 \
#     --result_path $output_dir/generated_predictions.jsonl | tee $output_dir/train.log


# jq -r '.response' $output_dir/generated_predictions.jsonl > $output_dir/hypo.$lp.txt

# hypo_file=$output_dir/hypo.$lp.txt

# lang_pair_strs=${lang_pair_strs:+$lang_pair_strs,}$lp
# src_file_strs=${src_file_strs:+$src_file_strs,}$src_file
# ref_file_strs=${ref_file_strs:+$ref_file_strs,}$ref_file
# hypo_file_strs=${hypo_file_strs:+$hypo_file_strs,}$hypo_file

# done
# done

# # metric="bleu,comet_22,xcomet_xxl"
# metric="bleu,comet_22"
# python $ROOT_DIR/src/mt_scoring.py \
#     --metric $metric \
#     --comet_22_path $comet_model \
#     --xcomet_xxl_path $xcome_model \
#     --lang_pair $lang_pair_strs \
#     --src_file $src_file_strs \
#     --ref_file $ref_file_strs \
#     --hypo_file $hypo_file_strs \
#     --record_file "result_mt.xlsx"
scripts/nohup.out
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scripts/result_mt.xlsx
ADDED
|
Binary file (5.79 kB). View file
|
|
|
scripts/sft_mt_4b.sh
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Supervised fine-tuning (SFT) of Qwen3-4B-Base for MT via ms-swift
# (`swift sft`) with DeepSpeed ZeRO-2 bf16, then decode + score the best
# checkpoint with inference.sh.
set -euxo pipefail  # pipefail: otherwise `| tee` masks a failed training run
ROOT_DIR=$(dirname "$(dirname "$(readlink -f "$0")")")

export HF_HOME="$ROOT_DIR/cache/"
export MODELSCOPE_CACHE="$ROOT_DIR/cache/"
export HF_EVALUATE_OFFLINE=1
export HF_DATASETS_OFFLINE=1
export NPROC_PER_NODE=8

# model
# model_name=GemmaX2-28-2B-Pretrain
# model_name=Qwen2.5-3B
# model_name=Qwen2.5-7B
model_name=Qwen3-4B-Base
model_dir=$ROOT_DIR/model_card/$model_name
# model_dir=$ROOT_DIR/exps_arr/Qwen3-4B-Base/cpt_mono_0.5B
config_file=$ROOT_DIR/configs/ds_z2_config_bf16.json
# resume_from_checkpoint=

# data
dataset=$ROOT_DIR/data_arr/sft_0915_0.1/train.jsonl
val_dataset=$ROOT_DIR/data_arr/sft_0915_0.1/valid.jsonl
per_device_train_batch_size=12
gradient_accumulation_steps=1

max_lengths=1024
num_train_epochs=1

# save
task=sft_0915_0.1
tag=base

output_dir=$ROOT_DIR/exps_arr/$model_name/$task/$tag
mkdir -p "$output_dir"
cp "$0" "$output_dir"  # snapshot this script alongside the run outputs


swift sft \
    --deepspeed "$config_file" \
    --add_version False \
    --check_model False \
    --load_from_cache_file \
    --model "$model_dir" \
    --train_type full \
    --attn_impl flash_attn \
    --dataset "$dataset" \
    --split_dataset_ratio 0 \
    --val_dataset "$val_dataset" \
    --torch_dtype bfloat16 \
    --num_train_epochs $num_train_epochs \
    --per_device_train_batch_size $per_device_train_batch_size \
    --per_device_eval_batch_size $per_device_train_batch_size \
    --learning_rate 2e-5 \
    --gradient_accumulation_steps $gradient_accumulation_steps \
    --save_strategy steps \
    --logging_strategy steps \
    --eval_strategy steps \
    --eval_steps 0.1 \
    --save_steps 0.1 \
    --logging_steps 10 \
    --max_length $max_lengths \
    --output_dir "$output_dir" \
    --create_checkpoint_symlink \
    --warmup_ratio 0.01 \
    --dataloader_num_workers 8 \
    --dataset_num_proc 16 \
    --seed 42 \
    --report_to tensorboard \
    --save_only_model \
    --save_total_limit 3 \
    --ddp_timeout 180000000 | tee "$output_dir/train.log"


# predict — absolute path so this step works regardless of the caller's CWD
# (the original `bash inference.sh` only worked when launched from scripts/).
bash "$ROOT_DIR/scripts/inference.sh" "$output_dir/best"