#!/usr/bin/bash
set -eux

train_device=0,1,2,3,4,5,6,7
eval_device=0

# xzq-fairseq
root_dir=$(dirname "$PWD")

src_lang=en
tgt_lang=de
threshold=0.7
data_name=wmt23
# pair_lang=${src_lang}-${tgt_lang}
task_name=${src_lang}2${tgt_lang}
data_dir=$root_dir/data/${task_name}/${threshold}
raw_data_dir=$data_dir/raw
trainable_data_dir=$data_dir/trainable_data

## eval & decode params
decode_max_tokens=2048
beam=5
nbest=1
lenpen=1.0

## common params
criterion=label_smoothed_cross_entropy
label_smoothing=0.1
seed=42
max_epoch=40
keep_last_epochs=1
keep_best_checkpoints=5
patience=5
num_workers=8

## model-specific params
conf_name=transformer_big

# Global batch = num_gpus * max_tokens * update_freq. For language pairs with
# large training sets (tens of millions of sentence pairs), a global batch
# above 100k tokens tends to work well; the settings below give
# 8 * 8192 * 4 = 262,144 tokens per update.
if [ "$conf_name" == "transformer_big" ]; then
    arch=transformer_vaswani_wmt_en_de_big
    activation_fn=relu
    encoder_ffn_embed_dim=4096
    share_all_embeddings=0
    share_decoder_input_output_embed=1
    learning_rate=1e-3
    warmup=4000
    max_tokens=8192
    weight_decay=0.0
    dropout=0.3
    gradient_accumulation_steps=4
else
    echo "unknown conf_name=$conf_name"
    exit 1
fi

model_dir=$root_dir/exps/$task_name/${threshold}/${conf_name}_${data_name}
mkdir -p $model_dir
cp ${BASH_SOURCE[0]} $model_dir

# count GPUs from the comma-separated device list
gpu_num=$(echo "$train_device" | awk '{split($0,arr,",");print length(arr)}')
export CUDA_VISIBLE_DEVICES=$train_device

cmd="fairseq-train $trainable_data_dir \
    --distributed-world-size $gpu_num -s $src_lang -t $tgt_lang \
    --arch $arch \
    --fp16 \
    --optimizer adam --clip-norm 0.0 \
    --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates $warmup \
    --lr $learning_rate --adam-betas '(0.9, 0.98)' \
    --weight-decay $weight_decay \
    --dropout $dropout \
    --criterion $criterion --label-smoothing $label_smoothing \
    --max-epoch $max_epoch \
    --max-tokens $max_tokens \
    --update-freq $gradient_accumulation_steps \
    --activation-fn $activation_fn \
    --encoder-ffn-embed-dim $encoder_ffn_embed_dim \
    --seed $seed \
    --num-workers $num_workers \
    --no-epoch-checkpoints \
    --keep-last-epochs $keep_last_epochs \
    --keep-best-checkpoints $keep_best_checkpoints \
    --patience $patience \
    --no-progress-bar \
    --log-interval 100 \
    --task translation \
    --ddp-backend no_c10d \
    --save-dir $model_dir \
    --tensorboard-logdir $model_dir"

# optional params
if [ $share_all_embeddings -eq 1 ]; then
    cmd=${cmd}" --share-all-embeddings"
fi
if [ $share_decoder_input_output_embed -eq 1 ]; then
    cmd=${cmd}" --share-decoder-input-output-embed"
fi
if [ ${max_update:=0} -ne 0 ]; then
    cmd=${cmd}" --max-update $max_update"
fi

# run command
cur_time=$(date +"%Y-%m-%d %H:%M:%S")
echo "=============$cur_time===================" >> $model_dir/train.log
cmd="nohup ${cmd} >> $model_dir/train.log 2>&1 &"
eval $cmd

# wait  # uncomment to block until training finishes before decoding
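## Optional: average the best checkpoints kept by --keep-best-checkpoints before
## decoding; this often buys a small quality gain. A minimal sketch, assuming a
## fairseq source checkout at $root_dir/fairseq (adjust to wherever
## scripts/average_checkpoints.py lives) and fairseq's default
## checkpoint.best_loss_*.pt naming.
# python $root_dir/fairseq/scripts/average_checkpoints.py \
#     --inputs $model_dir/checkpoint.best_loss_*.pt \
#     --output $model_dir/checkpoint_avg.pt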
# ### decode
# checkpoint_path=$model_dir/checkpoint_best.pt
# save_dir=$model_dir/decode_result
# mkdir -p $save_dir
# cp ${BASH_SOURCE[0]} $save_dir

# # map fairseq split names to test-set names
# declare -A gen_subset_dict
# gen_subset_dict=([test]=flores [test1]=wmt22 [test2]=wmt23)

# for gen_subset in ${!gen_subset_dict[*]}
# do
#     decode_file=$save_dir/decode_${gen_subset_dict[$gen_subset]}_beam${beam}_lenpen${lenpen}.$tgt_lang
#     pure_file=$save_dir/pure_decode_${gen_subset_dict[$gen_subset]}_beam${beam}_lenpen${lenpen}.$tgt_lang
#     # NOTE: add --user-dir here only if you rely on custom fairseq extensions
#     # ($user_dir is not defined anywhere in this script)
#     CUDA_VISIBLE_DEVICES=$eval_device fairseq-generate \
#         $trainable_data_dir \
#         -s $src_lang -t $tgt_lang \
#         --gen-subset $gen_subset \
#         --path $checkpoint_path \
#         --max-tokens $decode_max_tokens \
#         --beam $beam \
#         --nbest $nbest \
#         --lenpen $lenpen \
#         --seed $seed \
#         --remove-bpe | tee $decode_file

#     ### eval
#     # extract hypotheses (H-* lines) in source order and detokenize
#     grep ^H $decode_file | LC_ALL=C sort -V | cut -f3- | perl $root_dir/mosesdecoder/scripts/tokenizer/detokenizer.perl -l $tgt_lang > $pure_file
#     eval_file=$model_dir/eval_${gen_subset_dict[$gen_subset]}.log
#     cur_time=$(date +"%Y-%m-%d %H:%M:%S")
#     echo "=============$cur_time===================" >> $eval_file
#     echo $checkpoint_path >> $eval_file
#     tail -n1 $decode_file >> $eval_file  # fairseq's internal tokenized BLEU
#     # get scores
#     src_file=$raw_data_dir/test.${gen_subset_dict[$gen_subset]}.$src_lang
#     ref_file=$raw_data_dir/test.${gen_subset_dict[$gen_subset]}.$tgt_lang
#     sacrebleu_file=$save_dir/sacrebleu.${gen_subset_dict[$gen_subset]}.beam${beam}_lenpen${lenpen}
#     comet22_file=$save_dir/comet22.${gen_subset_dict[$gen_subset]}.beam${beam}_lenpen${lenpen}
#     sacrebleu $ref_file -i $pure_file -w 2 >> $eval_file
#     comet-score -s $src_file -t $pure_file -r $ref_file --model $root_dir/wmt22-comet-da/checkpoints/model.ckpt | tee $comet22_file
#     echo "Comet22 Score" >> $eval_file
#     tail -n1 $comet22_file >> $eval_file  # keep only the system-level (average) COMET score
#     echo -e "decode finished! \n decode tokenized file in $decode_file \n detokenized file in $pure_file \n sacrebleu file in $eval_file"
# done
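## Optional smoke test of the trained model with fairseq-interactive. A minimal
## sketch: it assumes stdin is already tokenized/BPE-encoded with the same
## pipeline as the training data, so raw text will not translate well.
# echo "Hello world" | CUDA_VISIBLE_DEVICES=$eval_device fairseq-interactive \
#     $trainable_data_dir \
#     -s $src_lang -t $tgt_lang \
#     --path $model_dir/checkpoint_best.pt \
#     --beam $beam --remove-bpe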