#!/usr/bin/env bash
# Decode a fine-tuned MT model on flores200 test sets (zh <-> X) and score
# the hypotheses with BLEU / COMET via src/mt_scoring.py.
# Usage: bash decode_eval.sh <predict_model_dir>
# pipefail is required: 'swift infer ... | tee log' would otherwise mask a
# failed inference with tee's exit status under plain 'set -e'.
set -euxo pipefail

# Repo root = two levels up from this script's resolved path.
ROOT_DIR=$(dirname "$(dirname "$(readlink -f "$0")")")

# Keep all HF / ModelScope caches inside the repo and force offline mode
# so evaluation never touches the network.
export HF_HOME="$ROOT_DIR/cache/"
export MODELSCOPE_CACHE="$ROOT_DIR/cache/"
export HF_EVALUATE_OFFLINE=1
export HF_DATASETS_OFFLINE=1

# NOTE(review): config_file is not used below — kept in case a later chunk
# of this script (or a sourced sibling) reads it.
config_file=$ROOT_DIR/configs/accelerate_config.yaml
export NPROC_PER_NODE=8

# model: checkpoint directory to decode with (positional arg 1)
predict_model_dir=${1:-""}

# eval: local COMET checkpoints consumed by mt_scoring.py
comet_model=$ROOT_DIR/model_card/wmt22-comet-da/checkpoints/model.ckpt
xcome_model=$ROOT_DIR/model_card/XCOMET-XXL/checkpoints/model.ckpt

# Comma-separated accumulators, one entry per decoded language pair.
lang_pair_strs=""
src_file_strs=""
ref_file_strs=""
hypo_file_strs=""
# Decode each language paired with Chinese, in both directions (X->zh, zh->X).
# Alternative language sets kept for reference:
# for lang in en ja ru de ug; do
# for lang in en ja ko ru de fr it pt es; do
for lang in en de ru bn hi th jv sw si km; do
  for src in "$lang" zh; do
    if [[ "$src" == "zh" ]]; then  # zh -> X
      src_lang=zh
      tgt_lang=$lang
    else                           # X -> zh
      src_lang=$lang
      tgt_lang=zh
    fi
    lp=${src_lang}2${tgt_lang}

    # flores200 source/reference sides for this direction.
    src_file=$ROOT_DIR/data_arr/flores200/zh-${lang}/test.zh-$lang.$src_lang
    ref_file=$ROOT_DIR/data_arr/flores200/zh-${lang}/test.zh-$lang.$tgt_lang
    # test_file=$ROOT_DIR/data_arr/sft_100k_ugbomn/test.$lp.jsonl
    test_file=$ROOT_DIR/data_arr/test/test.$lp.jsonl

    output_dir=$predict_model_dir/decode_result/$lp
    mkdir -p "$output_dir"
    ############################!!!!!
    # Clear stale results; ':?' aborts the script if output_dir is ever
    # empty/unset instead of expanding to 'rm -rf /*'.
    rm -rf -- "${output_dir:?}"/*
    #######################
    # Snapshot this script next to the results for reproducibility.
    cp "$0" "$output_dir"

    swift infer \
        --infer_backend pt \
        --val_dataset "$test_file" \
        --load_from_cache_file True \
        --dataset_shuffle False \
        --val_dataset_shuffle False \
        --model "$predict_model_dir" \
        --torch_dtype bfloat16 \
        --max_new_tokens 1024 \
        --max_batch_size 16 \
        --num_beams 5 \
        --max_length 1024 \
        --dataset_num_proc 8 \
        --temperature 0 \
        --result_path "$output_dir/generated_predictions.jsonl" | tee "$output_dir/train.log"

    # Strip the JSONL down to plain-text hypotheses, one per line.
    jq -r '.response' "$output_dir/generated_predictions.jsonl" > "$output_dir/hypo.$lp.txt"
    hypo_file=$output_dir/hypo.$lp.txt

    # Append to the comma-separated lists consumed by mt_scoring.py
    # (the ':+' form avoids a leading comma on the first entry).
    lang_pair_strs=${lang_pair_strs:+$lang_pair_strs,}$lp
    src_file_strs=${src_file_strs:+$src_file_strs,}$src_file
    ref_file_strs=${ref_file_strs:+$ref_file_strs,}$ref_file
    hypo_file_strs=${hypo_file_strs:+$hypo_file_strs,}$hypo_file
  done
done
# Score all accumulated hypotheses in one pass.
# metric="bleu,comet_22,xcomet_xxl"
metric="bleu,comet_22"
python "$ROOT_DIR/src/mt_scoring.py" \
    --metric "$metric" \
    --comet_22_path "$comet_model" \
    --xcomet_xxl_path "$xcome_model" \
    --lang_pair "$lang_pair_strs" \
    --src_file "$src_file_strs" \
    --ref_file "$ref_file_strs" \
    --hypo_file "$hypo_file_strs" \
    --record_file "result_mt.xlsx"
# ---------------------------------------------------------------------------
# Disabled second pass: en <-> mn_cn evaluation (kept for reference).
# ---------------------------------------------------------------------------
# lang_pair_strs=""
# src_file_strs=""
# ref_file_strs=""
# hypo_file_strs=""
| # # | |
| # # for lang in ja ko ru de fr it pt es;do | |
| # # for lang in ja ru de ug; do | |
| # for lang in mn_cn; do | |
| # for src in $lang en ;do | |
| # if [ $src = "en" ]; then # en2zh | |
| # src_lang=en | |
| # tgt_lang=$lang | |
| # else # zh2en | |
| # src_lang=$lang | |
| # tgt_lang=en | |
| # fi | |
| # lp=${src_lang}2${tgt_lang} | |
| # src_file=$ROOT_DIR/data_arr/flores200/en-${lang}/test.en-$lang.$src_lang | |
| # ref_file=$ROOT_DIR/data_arr/flores200/en-${lang}/test.en-$lang.$tgt_lang | |
| # # test_file=$ROOT_DIR/data_arr/sft_100k_ugbomn/test.$lp.jsonl | |
| # test_file=/mnt/nvme1/luoyingfeng/llm-mt/data_arr/merge_0701/train1/test/test.$lp.jsonl | |
| # output_dir=$predict_model_dir/decode_result/$lp | |
| # mkdir -p $output_dir | |
| # #############################!!!!! | |
| # rm -rf $output_dir/* | |
| # ######################## | |
| # cp $0 $output_dir | |
| # # --load_args False \ | |
| # swift infer \ | |
| # --infer_backend pt \ | |
| # --val_dataset $test_file \ | |
| # --load_from_cache_file True \ | |
| # --dataset_shuffle False \ | |
| # --val_dataset_shuffle False \ | |
| # --model $predict_model_dir \ | |
| # --torch_dtype bfloat16 \ | |
| # --max_new_tokens 1024 \ | |
| # --max_batch_size 8 \ | |
| # --num_beams 5 \ | |
| # --max_length 1024 \ | |
| # --dataset_num_proc 8 \ | |
| # --temperature 0 \ | |
| # --result_path $output_dir/generated_predictions.jsonl | tee $output_dir/train.log | |
| # jq -r '.response' $output_dir/generated_predictions.jsonl > $output_dir/hypo.$lp.txt | |
| # hypo_file=$output_dir/hypo.$lp.txt | |
| # lang_pair_strs=${lang_pair_strs:+$lang_pair_strs,}$lp | |
| # src_file_strs=${src_file_strs:+$src_file_strs,}$src_file | |
| # ref_file_strs=${ref_file_strs:+$ref_file_strs,}$ref_file | |
| # hypo_file_strs=${hypo_file_strs:+$hypo_file_strs,}$hypo_file | |
| # done | |
| # done | |
| # # metric="bleu,comet_22,xcomet_xxl" | |
| # metric="bleu,comet_22" | |
| # python $ROOT_DIR/src/mt_scoring.py \ | |
| # --metric $metric \ | |
| # --comet_22_path $comet_model \ | |
| # --xcomet_xxl_path $xcome_model \ | |
| # --lang_pair $lang_pair_strs \ | |
| # --src_file $src_file_strs \ | |
| # --ref_file $ref_file_strs \ | |
| # --hypo_file $hypo_file_strs \ | |
| # --record_file "result_mt.xlsx" | |