---
# Training run configuration for experiment "isen1".
# Language pair: data.src_lang "is" -> data.tgt_lang "en".

train:
  experiment_name: "isen1"

  # Optimisation schedule.
  # Keep the decimal point in exponent floats (2.5e-3, not 25e-4): YAML 1.1
  # loaders such as PyYAML read a dot-less exponent form as a string.
  lr: 2.5e-3
  accum_steps: 6
  warmup_steps: 10000
  max_steps: 100000
  scheduler_type: "cosine"

  # Evaluation and checkpointing.
  eval_steps: 1000
  max_checkpoints: 10
  checkpoint_strategy: "best"
  # NOTE(review): patience 0 presumably disables early stopping; confirm
  # against the trainer before relying on it.
  early_stopping_patience: 0
  early_stopping_metric: "chrf"

  # Numerics and compilation.
  precision: "bfloat16"  # or "float16" with an older GPU
  enable_torch_compile: true

  # Weight averaging; ema_start_step is set equal to warmup_steps above.
  use_ema: true
  ema_decay: 0.9999
  ema_start_step: 10000

  # Regularisation.
  z_loss_coeff: 0.0005
  weight_decay_embeddings: false

data:
  src_lang: "is"
  tgt_lang: "en"

  # Validation set (source / target sides).
  src_dev_path: "quickmt-valid.is-en.is"
  tgt_dev_path: "quickmt-valid.is-en.en"

  # Batching and loader settings.
  input_sentence_size: 10000000
  max_tokens_per_batch: 20000
  buffer_size: 40000
  num_workers: 4
  prefetch_factor: 128
  pad_multiple: 1

  # Training corpora. The two auxiliary corpora carry stop_step: 80000,
  # which is below train.max_steps (100000); the first corpus has no
  # stop_step.
  # NOTE(review): presumably the final 20000 steps therefore draw only
  # from the first corpus; confirm how the loader treats stop_step/weight.
  corpora:
    - src_file: "quickmt-train.is-en.is"
      tgt_file: "quickmt-train.is-en.en"
      weight: 10
      start_step: 0
    - src_file: "finetranslations-sample-is-en.is"
      tgt_file: "finetranslations-sample-is-en.en"
      weight: 4
      start_step: 0
      stop_step: 80000
    - src_file: "newscrawl2024-en-backtranslated-is.is"
      tgt_file: "newscrawl2024-en-backtranslated-is.en"
      weight: 5
      start_step: 0
      stop_step: 80000

model:
  # Layer counts and widths.
  d_model: 768
  enc_layers: 12
  dec_layers: 2
  n_heads: 12
  ffn_dim: 4096
  max_len: 256

  # Vocabulary sizes for the source and target sides.
  vocab_size_src: 32000
  vocab_size_tgt: 32000

  # Block internals.
  norm_type: "rmsnorm"
  mlp_type: "gated"
  activation: "silu"
  ff_bias: false
  layernorm_eps: 1.0e-5
  dropout: 0.1

export:
  # Same value as model.max_len.
  max_len: 256