---
# Configuration for experiment "isen1" (src_lang "is" -> tgt_lang "en").
#
# NOTE(review): this file had lost its indentation and was wrapped in table
# pipes ("| key: value | |"). The nesting below was reconstructed from the key
# grouping; all keys and values are unchanged. Confirm the structure (in
# particular that `corpora` belongs under `data`) against the consumer's schema.

train:
  experiment_name: "isen1"
  lr: 2.5e-3
  accum_steps: 6
  warmup_steps: 10000
  max_steps: 100000
  eval_steps: 1000
  max_checkpoints: 10
  precision: "bfloat16"  # or float16 with an older GPU
  enable_torch_compile: true
  checkpoint_strategy: "best"
  # NOTE(review): a patience of 0 presumably disables early stopping — confirm.
  early_stopping_patience: 0
  early_stopping_metric: "chrf"
  use_ema: true
  ema_decay: 0.9999
  # Same value as warmup_steps above.
  ema_start_step: 10000
  z_loss_coeff: 0.0005
  weight_decay_embeddings: false
  scheduler_type: "cosine"

data:
  src_lang: "is"
  tgt_lang: "en"
  src_dev_path: "quickmt-valid.is-en.is"
  tgt_dev_path: "quickmt-valid.is-en.en"
  input_sentence_size: 10000000
  max_tokens_per_batch: 20000
  buffer_size: 40000
  num_workers: 4
  prefetch_factor: 128
  pad_multiple: 1
  # Parallel training corpora. Each entry names a source/target file pair with
  # a weight and a start_step; the second and third entries also set
  # stop_step: 80000, which is below train.max_steps (100000).
  # NOTE(review): presumably those two corpora are dropped from sampling for
  # the final 20000 steps — confirm against the data loader.
  corpora:
    - src_file: "quickmt-train.is-en.is"
      tgt_file: "quickmt-train.is-en.en"
      weight: 10
      start_step: 0
    - src_file: "finetranslations-sample-is-en.is"
      tgt_file: "finetranslations-sample-is-en.en"
      weight: 4
      start_step: 0
      stop_step: 80000
    - src_file: "newscrawl2024-en-backtranslated-is.is"
      tgt_file: "newscrawl2024-en-backtranslated-is.en"
      weight: 5
      start_step: 0
      stop_step: 80000

model:
  d_model: 768
  enc_layers: 12
  dec_layers: 2
  n_heads: 12
  ffn_dim: 4096
  max_len: 256
  vocab_size_src: 32000
  vocab_size_tgt: 32000
  norm_type: "rmsnorm"
  mlp_type: "gated"
  activation: "silu"
  ff_bias: false
  layernorm_eps: 1.0e-5
  dropout: 0.1

export:
  # Same value as model.max_len.
  max_len: 256