#!/usr/bin/bash
set -eux
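
# GPU ids: all eight cards for training; eval_device is presumably read by a
# companion decoding/evaluation step (it is not used by the command below)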
train_device=0,1,2,3,4,5,6,7
eval_device=0
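
# project root: the parent of the directory this script is launched from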
root_dir=$(dirname "$PWD")
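
# language pair and the threshold that selects the data variant under data/<task>/<threshold>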
src_lang=en
tgt_lang=de
threshold=0.7
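
# dataset tag used to name the experiment directory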
data_name=wmt23
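
# derived task name and data directories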
task_name=${src_lang}2${tgt_lang}
data_dir=$root_dir/data/${task_name}/${threshold}
raw_data_dir=$data_dir/raw
trainable_data_dir=$data_dir/trainable_data
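
# decoding settings (beam size, n-best, length penalty); not consumed by
# fairseq-train below, presumably used by a separate evaluation step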
decode_max_tokens=2048
beam=5
nbest=1
lenpen=1.0
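
# training and checkpointing hyperparameters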
criterion=label_smoothed_cross_entropy
label_smoothing=0.1
seed=42
max_epoch=40
keep_last_epochs=1
keep_best_checkpoints=5
patience=5
num_workers=8
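
# model configuration; only transformer_big is defined here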
conf_name=transformer_big

if [ "$conf_name" == "transformer_big" ]; then
    arch=transformer_vaswani_wmt_en_de_big
    activation_fn=relu
    encoder_ffn_embed_dim=4096
    share_all_embeddings=0
    share_decoder_input_output_embed=1
    learning_rate=1e-3
    warmup=4000
    max_tokens=8192
    weight_decay=0.0
    dropout=0.3
    gradient_accumulation_steps=4
else
    echo "unknown conf_name=$conf_name"
    exit 1
fi
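
# experiment directory; a copy of this script is stored there for reproducibility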
model_dir=$root_dir/exps/$task_name/${threshold}/${conf_name}_${data_name}
mkdir -p "$model_dir"
cp "${BASH_SOURCE[0]}" "$model_dir"
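
# distributed world size = number of comma-separated ids in train_device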
gpu_num=$(echo "$train_device" | awk '{split($0,arr,",");print length(arr)}')
export CUDA_VISIBLE_DEVICES=$train_device
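
# assemble the fairseq-train command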
cmd="fairseq-train $trainable_data_dir \
    --distributed-world-size $gpu_num -s $src_lang -t $tgt_lang \
    --arch $arch \
    --fp16 \
    --optimizer adam --clip-norm 0.0 \
    --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates $warmup \
    --lr $learning_rate --adam-betas '(0.9, 0.98)' \
    --weight-decay $weight_decay \
    --dropout $dropout \
    --criterion $criterion --label-smoothing $label_smoothing \
    --max-epoch $max_epoch \
    --max-tokens $max_tokens \
    --update-freq $gradient_accumulation_steps \
    --activation-fn $activation_fn \
    --encoder-ffn-embed-dim $encoder_ffn_embed_dim \
    --seed $seed \
    --num-workers $num_workers \
    --no-epoch-checkpoints \
    --keep-last-epochs $keep_last_epochs \
    --keep-best-checkpoints $keep_best_checkpoints \
    --patience $patience \
    --no-progress-bar \
    --log-interval 100 \
    --task translation \
    --ddp-backend no_c10d \
    --save-dir $model_dir \
    --tensorboard-logdir $model_dir"
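
# append optional flags according to the configuration above; max_update may be
# provided by the caller's environment and defaults to 0 (disabled)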
if [ $share_all_embeddings -eq 1 ]; then
    cmd=${cmd}" --share-all-embeddings"
fi
if [ $share_decoder_input_output_embed -eq 1 ]; then
    cmd=${cmd}" --share-decoder-input-output-embed"
fi
if [ ${max_update:=0} -ne 0 ]; then
    cmd=${cmd}" --max-update $max_update"
fi
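
# write a timestamp header to the log, then launch training in the background via nohup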
cur_time=$(date +"%Y-%m-%d %H:%M:%S")
echo "=============$cur_time===================" >> "$model_dir/train.log"
cmd="nohup ${cmd} >> $model_dir/train.log 2>&1 &"

eval $cmd