sleepyhead111 committed
Commit 746b7a6 · verified · 1 Parent(s): 0365ba9

Upload folder using huggingface_hub

scripts/cpt_mt_4b.sh ADDED
@@ -0,0 +1,86 @@
+ #!/bin/bash
+ set -eux
+ ROOT_DIR=$(dirname $(dirname `readlink -f $0`))
+
+ export HF_HOME="$ROOT_DIR/cache/"
+ export MODELSCOPE_CACHE="$ROOT_DIR/cache/"
+ export HF_EVALUATE_OFFLINE=1
+ export HF_DATASETS_OFFLINE=1
+ export NPROC_PER_NODE=8
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
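+ # expandable_segments lets the CUDA caching allocator grow existing segments,
+ # which reduces fragmentation on long packed-sequence runs.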
+
+ # model
+ model_name=Qwen3-4B-Base
+ model_dir=$ROOT_DIR/model_card/$model_name
+
+ # config_file=$ROOT_DIR/configs/ds_z0_config.json
+ config_file=$ROOT_DIR/configs/ds_z2_config_bf16.json
+ # resume_from_checkpoint=$ROOT_DIR/exps_arr/Qwen3-1.7B-Base/cpt_96b_s2/60_langs_continue/checkpoint-14000
+
+ # data
+ train_dataset=(
+ $ROOT_DIR/data_arr/10lang_cpt_mono_0.5B/train1.jsonl
+
+ )
+ val_dataset=$ROOT_DIR/data_arr/10lang_cpt_mono_0.5B/valid.jsonl
+ # val_dataset=$ROOT_DIR/data/60lang_cpt_96b_s2/valid.jsonl
+ per_device_train_batch_size=25 # 20
+ per_device_eval_batch_size=25
+ gradient_accumulation_steps=3 # 4 for 10B, 6 for 15B
+ max_lengths=2048
+ max_steps=5000
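+ # Effective batch per optimizer step: 8 GPUs x 25 x 3 = 600 packed sequences,
+ # roughly 1.2M tokens at max_length 2048.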
+
+ # save
+ task=cpt_10lang_mono
+ tag=0.5B
+
+
+ ##############################
+
+
+ output_dir=$ROOT_DIR/exps_arr/$model_name/$task/$tag
+ mkdir -p $output_dir
+ cp $0 $output_dir
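+ # Copying $0 into the run directory keeps a snapshot of the exact launch config next to the checkpoints.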
+
+ # --resume_from_checkpoint $resume_from_checkpoint \
+
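+ # With --streaming the dataset is iterable (no fixed length), so the run is
+ # bounded by --max_steps rather than by epochs.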
+ swift pt \
+     --deepspeed $config_file \
+     --add_version False \
+     --check_model False \
+     --model $model_dir \
+     --train_type full \
+     --streaming true \
+     --packing true \
+     --attn_impl flash_attn \
+     --dataset "${train_dataset[@]}" \
+     --split_dataset_ratio 0 \
+     --val_dataset $val_dataset \
+     --torch_dtype bfloat16 \
+     --per_device_train_batch_size $per_device_train_batch_size \
+     --per_device_eval_batch_size $per_device_eval_batch_size \
+     --learning_rate 2e-5 \
+     --warmup_ratio 0.05 \
+     --gradient_accumulation_steps $gradient_accumulation_steps \
+     --save_strategy steps \
+     --logging_strategy steps \
+     --eval_strategy steps \
+     --eval_steps 1000 \
+     --save_steps 1000 \
+     --logging_steps 10 \
+     --max_length $max_lengths \
+     --max_steps $max_steps \
+     --output_dir $output_dir \
+     --dataloader_num_workers 8 \
+     --dataset_num_proc 1 \
+     --seed 42 \
+     --report_to tensorboard \
+     --ddp_timeout 180000000 | tee $output_dir/train.log
+
+ # --save_only_model \
+
+ ####
+ # bash sft_mt.sh
+
+ # benchmark
+ # bash $ROOT_DIR/llm_evaluation/scripts/eval_all.sh
scripts/eval_multi.sh ADDED
@@ -0,0 +1,60 @@
+ #!/bin/bash
+ set -eux
+ ROOT_DIR=$(dirname $(dirname `readlink -f $0`))
+
+ export HF_HOME="./cache/"
+ export HF_DATASETS_CACHE="./cache/huggingface_cache/datasets"
+ export HF_EVALUATE_OFFLINE=1
+ export HF_DATASETS_OFFLINE=1
+
+ decode_dir=${1:-""}
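+ # Usage: bash scripts/eval_multi.sh <decode_dir>, where <decode_dir> holds
+ # one $lp/hypo.$lp.txt file per language pair.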
+
+ comet_model=$ROOT_DIR/model_card/wmt22-comet-da/checkpoints/model.ckpt
+ xcomet_model=$ROOT_DIR/model_card/XCOMET-XXL/checkpoints/model.ckpt
+
+ src_file_strs=""
+ ref_file_strs=""
+ hypo_file_strs=""
+ lang_pair_strs=""
+
+ for lang in en de ru bn hi th jv sw si km; do
+     for src in $lang zh; do
+
+         if [ $src = "zh" ]; then # zh -> $lang
+             src_lang=zh
+             tgt_lang=$lang
+         else # $lang -> zh
+             src_lang=$lang
+             tgt_lang=zh
+         fi
+
+         lp=${src_lang}2${tgt_lang}
+         # hypo_file=$decode_dir/${lang_pair}.txt
+         # hypo_file=$decode_dir/hypo.${lp}.txt
+         hypo_file=$decode_dir/$lp/hypo.${lp}.txt
+         # hypo_file=$decode_dir/niu.${lp}.txt
+         # hypo_file=$decode_dir/hypo.${lp}.$tgt_lang
+         # hypo_file=$decode_dir/hypo_${lang_pair}.txt
+         src_file=$ROOT_DIR/data/flores200/zh-${lang}/test.zh-$lang.$src_lang
+         ref_file=$ROOT_DIR/data/flores200/zh-${lang}/test.zh-$lang.$tgt_lang
+
+         src_file_strs=${src_file_strs:+$src_file_strs,}$src_file
+         ref_file_strs=${ref_file_strs:+$ref_file_strs,}$ref_file
+         hypo_file_strs=${hypo_file_strs:+$hypo_file_strs,}$hypo_file
+         lang_pair_strs=${lang_pair_strs:+$lang_pair_strs,}$lp
+
+     done
+ done
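+ # ${var:+$var,} expands to "$var," only when var is already non-empty, so each
+ # *_strs variable accumulates a comma-separated list whose entries stay
+ # index-aligned across src/ref/hypo/lang_pair.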
+
+
+ # metric="bleu,comet_22,xcomet_xxl"
+ metric="bleu,comet_22"
+ python $ROOT_DIR/src/mt_scoring.py \
+     --metric $metric \
+     --comet_22_path $comet_model \
+     --xcomet_xxl_path $xcomet_model \
+     --lang_pair $lang_pair_strs \
+     --src_file $src_file_strs \
+     --ref_file $ref_file_strs \
+     --hypo_file $hypo_file_strs \
+     --record_file "result_mt.xlsx"
scripts/inference.sh ADDED
@@ -0,0 +1,168 @@
+ #!/bin/bash
+ set -eux
+ ROOT_DIR=$(dirname $(dirname `readlink -f $0`))
+
+ export HF_HOME="$ROOT_DIR/cache/"
+ export MODELSCOPE_CACHE="$ROOT_DIR/cache/"
+ export HF_EVALUATE_OFFLINE=1
+ export HF_DATASETS_OFFLINE=1
+
+ config_file=$ROOT_DIR/configs/accelerate_config.yaml # note: not referenced below
+ export NPROC_PER_NODE=8
+
+
+ # model
+ predict_model_dir=${1:-""}
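+ # Usage: bash scripts/inference.sh <model_or_checkpoint_dir>;
+ # decode results are written under <dir>/decode_result/<lp>.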
+
+
+ # eval
+ comet_model=$ROOT_DIR/model_card/wmt22-comet-da/checkpoints/model.ckpt
+ xcomet_model=$ROOT_DIR/model_card/XCOMET-XXL/checkpoints/model.ckpt
+
+
+ lang_pair_strs=""
+ src_file_strs=""
+ ref_file_strs=""
+ hypo_file_strs=""
+
+ # for lang in en ja ru de ug; do
+ for lang in en de ru bn hi th jv sw si km; do
+ # for lang in en ja ko ru de fr it pt es; do
+     for src in $lang zh; do
+
+         if [ $src = "zh" ]; then # zh -> $lang
+             src_lang=zh
+             tgt_lang=$lang
+         else # $lang -> zh
+             src_lang=$lang
+             tgt_lang=zh
+         fi
+
+         lp=${src_lang}2${tgt_lang}
+         src_file=$ROOT_DIR/data_arr/flores200/zh-${lang}/test.zh-$lang.$src_lang
+         ref_file=$ROOT_DIR/data_arr/flores200/zh-${lang}/test.zh-$lang.$tgt_lang
+         # test_file=$ROOT_DIR/data_arr/sft_100k_ugbomn/test.$lp.jsonl
+         test_file=$ROOT_DIR/data_arr/test/test.$lp.jsonl
+
+         output_dir=$predict_model_dir/decode_result/$lp
+         mkdir -p $output_dir
+         # WARNING: wipes any previous decode output for this language pair.
+         rm -rf $output_dir/*
+         cp $0 $output_dir
+
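+         # num_beams 5 with temperature 0 gives deterministic beam-search decoding.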
+         swift infer \
+             --infer_backend pt \
+             --val_dataset $test_file \
+             --load_from_cache_file True \
+             --dataset_shuffle False \
+             --val_dataset_shuffle False \
+             --model $predict_model_dir \
+             --torch_dtype bfloat16 \
+             --max_new_tokens 1024 \
+             --max_batch_size 16 \
+             --num_beams 5 \
+             --max_length 1024 \
+             --dataset_num_proc 8 \
+             --temperature 0 \
+             --result_path $output_dir/generated_predictions.jsonl | tee $output_dir/infer.log
+
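+         # Flatten the JSONL predictions to one plain-text hypothesis per line for the scorer.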
+         jq -r '.response' $output_dir/generated_predictions.jsonl > $output_dir/hypo.$lp.txt
+
+         hypo_file=$output_dir/hypo.$lp.txt
+
+         lang_pair_strs=${lang_pair_strs:+$lang_pair_strs,}$lp
+         src_file_strs=${src_file_strs:+$src_file_strs,}$src_file
+         ref_file_strs=${ref_file_strs:+$ref_file_strs,}$ref_file
+         hypo_file_strs=${hypo_file_strs:+$hypo_file_strs,}$hypo_file
+     done
+ done
+
+
+ # metric="bleu,comet_22,xcomet_xxl"
+ metric="bleu,comet_22"
+ python $ROOT_DIR/src/mt_scoring.py \
+     --metric $metric \
+     --comet_22_path $comet_model \
+     --xcomet_xxl_path $xcomet_model \
+     --lang_pair $lang_pair_strs \
+     --src_file $src_file_strs \
+     --ref_file $ref_file_strs \
+     --hypo_file $hypo_file_strs \
+     --record_file "result_mt.xlsx"
+
+
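+ # Disabled variant below: the same decode-and-score loop for en<->mn_cn,
+ # pointing at a machine-local test set.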
+ # lang_pair_strs=""
+ # src_file_strs=""
+ # ref_file_strs=""
+ # hypo_file_strs=""
+
+ # # for lang in ja ko ru de fr it pt es; do
+ # # for lang in ja ru de ug; do
+ # for lang in mn_cn; do
+ # for src in $lang en; do
+
+ # if [ $src = "en" ]; then # en -> $lang
+ # src_lang=en
+ # tgt_lang=$lang
+ # else # $lang -> en
+ # src_lang=$lang
+ # tgt_lang=en
+ # fi
+
+ # lp=${src_lang}2${tgt_lang}
+ # src_file=$ROOT_DIR/data_arr/flores200/en-${lang}/test.en-$lang.$src_lang
+ # ref_file=$ROOT_DIR/data_arr/flores200/en-${lang}/test.en-$lang.$tgt_lang
+ # # test_file=$ROOT_DIR/data_arr/sft_100k_ugbomn/test.$lp.jsonl
+ # test_file=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/merge_0701/train1/test/test.$lp.jsonl
+
+ # output_dir=$predict_model_dir/decode_result/$lp
+ # mkdir -p $output_dir
+ # # WARNING: wipes any previous decode output for this language pair.
+ # rm -rf $output_dir/*
+ # cp $0 $output_dir
+
+ # # --load_args False \
+ # swift infer \
+ #     --infer_backend pt \
+ #     --val_dataset $test_file \
+ #     --load_from_cache_file True \
+ #     --dataset_shuffle False \
+ #     --val_dataset_shuffle False \
+ #     --model $predict_model_dir \
+ #     --torch_dtype bfloat16 \
+ #     --max_new_tokens 1024 \
+ #     --max_batch_size 8 \
+ #     --num_beams 5 \
+ #     --max_length 1024 \
+ #     --dataset_num_proc 8 \
+ #     --temperature 0 \
+ #     --result_path $output_dir/generated_predictions.jsonl | tee $output_dir/infer.log
+
+
+ # jq -r '.response' $output_dir/generated_predictions.jsonl > $output_dir/hypo.$lp.txt
+
+ # hypo_file=$output_dir/hypo.$lp.txt
+
+ # lang_pair_strs=${lang_pair_strs:+$lang_pair_strs,}$lp
+ # src_file_strs=${src_file_strs:+$src_file_strs,}$src_file
+ # ref_file_strs=${ref_file_strs:+$ref_file_strs,}$ref_file
+ # hypo_file_strs=${hypo_file_strs:+$hypo_file_strs,}$hypo_file
+
+ # done
+ # done
+
+ # # metric="bleu,comet_22,xcomet_xxl"
+ # metric="bleu,comet_22"
+ # python $ROOT_DIR/src/mt_scoring.py \
+ #     --metric $metric \
+ #     --comet_22_path $comet_model \
+ #     --xcomet_xxl_path $xcomet_model \
+ #     --lang_pair $lang_pair_strs \
+ #     --src_file $src_file_strs \
+ #     --ref_file $ref_file_strs \
+ #     --hypo_file $hypo_file_strs \
+ #     --record_file "result_mt.xlsx"
+
scripts/nohup.out ADDED
The diff for this file is too large to render. See raw diff
 
scripts/result_mt.xlsx ADDED
Binary file (5.79 kB).
 
scripts/sft_mt_4b.sh ADDED
@@ -0,0 +1,76 @@
+ #!/bin/bash
+ set -eux
+ ROOT_DIR=$(dirname $(dirname `readlink -f $0`))
+
+ export HF_HOME="$ROOT_DIR/cache/"
+ export MODELSCOPE_CACHE="$ROOT_DIR/cache/"
+ export HF_EVALUATE_OFFLINE=1
+ export HF_DATASETS_OFFLINE=1
+ export NPROC_PER_NODE=8
+
+ # model
+ # model_name=GemmaX2-28-2B-Pretrain
+ # model_name=Qwen2.5-3B
+ # model_name=Qwen2.5-7B
+ model_name=Qwen3-4B-Base
+ model_dir=$ROOT_DIR/model_card/$model_name
+ # model_dir=$ROOT_DIR/exps_arr/Qwen3-4B-Base/cpt_mono_0.5B
+ config_file=$ROOT_DIR/configs/ds_z2_config_bf16.json
+ # resume_from_checkpoint=
+
+ # data
+ dataset=$ROOT_DIR/data_arr/sft_0915_0.1/train.jsonl
+ val_dataset=$ROOT_DIR/data_arr/sft_0915_0.1/valid.jsonl
+ per_device_train_batch_size=12
+ gradient_accumulation_steps=1
+
+ max_lengths=1024
+ num_train_epochs=1
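+ # Effective batch per optimizer step: 8 GPUs x 12 x 1 = 96 sequences.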
+
+ # save
+ task=sft_0915_0.1
+ tag=base
+
+ output_dir=$ROOT_DIR/exps_arr/$model_name/$task/$tag
+ mkdir -p $output_dir
+ cp $0 $output_dir
+
+
+ swift sft \
+     --deepspeed $config_file \
+     --add_version False \
+     --check_model False \
+     --load_from_cache_file \
+     --model $model_dir \
+     --train_type full \
+     --attn_impl flash_attn \
+     --dataset $dataset \
+     --split_dataset_ratio 0 \
+     --val_dataset $val_dataset \
+     --torch_dtype bfloat16 \
+     --num_train_epochs $num_train_epochs \
+     --per_device_train_batch_size $per_device_train_batch_size \
+     --per_device_eval_batch_size $per_device_train_batch_size \
+     --learning_rate 2e-5 \
+     --gradient_accumulation_steps $gradient_accumulation_steps \
+     --save_strategy steps \
+     --logging_strategy steps \
+     --eval_strategy steps \
+     --eval_steps 0.1 \
+     --save_steps 0.1 \
+     --logging_steps 10 \
+     --max_length $max_lengths \
+     --output_dir $output_dir \
+     --create_checkpoint_symlink \
+     --warmup_ratio 0.01 \
+     --dataloader_num_workers 8 \
+     --dataset_num_proc 16 \
+     --seed 42 \
+     --report_to tensorboard \
+     --save_only_model \
+     --save_total_limit 3 \
+     --ddp_timeout 180000000 | tee $output_dir/train.log
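+ # eval_steps/save_steps values below 1 are interpreted by the trainer as a
+ # fraction of total training steps (0.1 = every 10% of the run).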
+
+
+ # predict
+ bash $ROOT_DIR/scripts/inference.sh $output_dir/best
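+ # $output_dir/best is assumed to be the symlink produced by --create_checkpoint_symlink above.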