---
model:
  d_model: 768
  enc_layers: 12
  dec_layers: 2
  n_heads: 16
  ffn_dim: 4096
  max_len: 256
  vocab_size_src: 32000
  vocab_size_tgt: 32000
  dropout: 0.05
  mlp_type: "standard"  # standard or gated
  activation: "gelu"  # gelu or silu
  norm_type: "layernorm"  # layernorm or rmsnorm
  ff_bias: true
  tie_decoder_embeddings: false
  layernorm_eps: 1.0e-5
data:
  src_lang: "uk"
  tgt_lang: "en"
  src_dev_path: "dev.ukr"
  tgt_dev_path: "dev.eng"
  max_tokens_per_batch: 6000
  src_spm_nbest_size: -1
  src_spm_alpha: 0.5
  tgt_spm_nbest_size: 1
  tgt_spm_alpha: 1.0
  corpora:
    - src_file: "train.cleaned.filtered.ukr"
      tgt_file: "train.cleaned.filtered.eng"
      weight: 1
      start_step: 1000
    - src_file: "finetranslations.ukr_Cyrl-eng_Latn.ukr_Cyrl"
      tgt_file: "finetranslations.ukr_Cyrl-eng_Latn.eng_Latn"
      weight: 1
      start_step: 0
      stop_step: 80000
train:
  experiment_name: "uken-base"
  lr: 2.5e-3
  grad_clip: 0.5
  accum_steps: 20
  max_checkpoints: 10
  precision: "bfloat16"
  warmup_steps: 5000
  max_steps: 108000
  eval_steps: 1000
export:
  k: 5