model:
  d_model: 768
  enc_layers: 12
  dec_layers: 2
  n_heads: 16
  ffn_dim: 4096
  max_len: 256
  vocab_size_src: 32000
  vocab_size_tgt: 32000
  dropout: 0.05
  mlp_type: "standard" # standard or gated
  activation: "gelu" # gelu or silu
  norm_type: "layernorm" # layernorm or rmsnorm
  ff_bias: true
  tie_decoder_embeddings: false
  layernorm_eps: 1.0e-5

data:
  src_lang: "uk"
  tgt_lang: "en"
  src_dev_path: "dev.ukr"
  tgt_dev_path: "dev.eng"
  max_tokens_per_batch: 6000
  src_spm_nbest_size: -1
  src_spm_alpha: 0.5
  tgt_spm_nbest_size: 1
  tgt_spm_alpha: 1.0
  corpora:
    - src_file: "train.cleaned.filtered.ukr"
      tgt_file: "train.cleaned.filtered.eng"
      weight: 1
      start_step: 1000
    - src_file: "finetranslations.ukr_Cyrl-eng_Latn.ukr_Cyrl"
      tgt_file: "finetranslations.ukr_Cyrl-eng_Latn.eng_Latn"
      weight: 1
      start_step: 0
      stop_step: 80000

train:
  experiment_name: "uken-base"
  lr: 2.5e-3
  grad_clip: 0.5
  accum_steps: 20
  max_checkpoints: 10
  precision: "bfloat16"
  warmup_steps: 5000
  max_steps: 108000
  eval_steps: 1000

export:
  k: 5
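
# Notes (illustrative assumptions, not asserted by this config):
#
# * Effective batch size: assuming gradient accumulation sums gradients over
#   accum_steps micro-batches before each optimizer update, one update sees
#   roughly max_tokens_per_batch * accum_steps = 6000 * 20 = 120,000 tokens.
#
# * Corpus scheduling: start_step / stop_step presumably gate when a corpus
#   contributes batches. A minimal Python sketch of that gating, using a
#   hypothetical active_corpora helper; the inclusive/exclusive boundary
#   behavior is an assumption:
#
#       def active_corpora(corpora, step):
#           # Keep corpora whose [start_step, stop_step) window contains
#           # step; a missing start_step defaults to 0 and a missing
#           # stop_step means the corpus never stops.
#           return [c for c in corpora
#                   if c.get("start_step", 0) <= step
#                   < c.get("stop_step", float("inf"))]
#
# * Subword regularization: src_spm_nbest_size: -1 with src_spm_alpha: 0.5
#   corresponds to SentencePiece's sampling mode (sample a segmentation from
#   the full lattice, smoothed by alpha), i.e. roughly
#   sp.encode(line, out_type=str, enable_sampling=True, nbest_size=-1,
#   alpha=0.5) in the SentencePiece Python API, assuming the data loader uses
#   that sampling encoder. nbest_size: 1 on the target side keeps the target
#   segmentation deterministic.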