# quickmt-is-en / config.yaml
# Uploaded by radinplaid ("Upload folder using huggingface_hub"), commit 2e2007c (verified).
# File size: 1.49 kB
# Training-run settings.
# NOTE(review): the nesting under `train:` is reconstructed from key order;
# confirm it against the schema the config loader expects.
train:
  experiment_name: "isen1"

  # Step budget and optimiser schedule.
  lr: 2.5e-3
  accum_steps: 6
  warmup_steps: 10000
  max_steps: 100000

  # Evaluation and checkpointing.
  eval_steps: 1000
  max_checkpoints: 10
  precision: "bfloat16"  # or float16 with an older GPU
  enable_torch_compile: true
  checkpoint_strategy: "best"
  # NOTE(review): a patience of 0 presumably disables early stopping — confirm.
  early_stopping_patience: 0
  early_stopping_metric: "chrf"

  # Weight averaging; ema_start_step has the same value as warmup_steps (10000).
  use_ema: true
  ema_decay: 0.9999
  ema_start_step: 10000

  # Regularisation.
  z_loss_coeff: 0.0005
  weight_decay_embeddings: false
  scheduler_type: "cosine"
# Dataset and batching settings for the is -> en language pair.
# NOTE(review): the nesting under `data:` and the `corpora` list structure are
# reconstructed from key order; confirm against the config loader's schema.
data:
  src_lang: "is"
  tgt_lang: "en"

  # Held-out validation pair (source / target files).
  src_dev_path: "quickmt-valid.is-en.is"
  tgt_dev_path: "quickmt-valid.is-en.en"

  input_sentence_size: 10000000
  max_tokens_per_batch: 20000
  buffer_size: 40000
  num_workers: 4
  prefetch_factor: 128
  pad_multiple: 1

  # Training corpora. Each entry pairs a source file with a target file.
  # NOTE(review): `weight` presumably sets the relative sampling rate and
  # `start_step` / `stop_step` the step window in which a corpus is used —
  # confirm against the data loader.
  corpora:
    # No stop_step on this entry.
    - src_file: "quickmt-train.is-en.is"
      tgt_file: "quickmt-train.is-en.en"
      weight: 10
      start_step: 0
    # stop_step (80000) is lower than train max_steps (100000).
    - src_file: "finetranslations-sample-is-en.is"
      tgt_file: "finetranslations-sample-is-en.en"
      weight: 4
      start_step: 0
      stop_step: 80000
    # Same 0-80000 step window as the previous entry.
    - src_file: "newscrawl2024-en-backtranslated-is.is"
      tgt_file: "newscrawl2024-en-backtranslated-is.en"
      weight: 5
      start_step: 0
      stop_step: 80000
# Model architecture settings.
# NOTE(review): the nesting under `model:` is reconstructed from key order;
# confirm it against the schema the config loader expects.
model:
  # 12 encoder layers against 2 decoder layers.
  d_model: 768
  enc_layers: 12
  dec_layers: 2
  # d_model / n_heads = 64; presumably the per-head width — confirm.
  n_heads: 12
  ffn_dim: 4096
  max_len: 256

  # Source and target vocabulary sizes are configured separately (both 32000).
  vocab_size_src: 32000
  vocab_size_tgt: 32000

  # Layer internals.
  norm_type: "rmsnorm"
  mlp_type: "gated"
  activation: "silu"
  ff_bias: false
  # NOTE(review): presumably also the epsilon used by rmsnorm — confirm.
  layernorm_eps: 1.0e-5
  dropout: 0.1
# Export settings.
# NOTE(review): `max_len` is nested here based on key order; left unindented it
# is a top-level key rather than a child of `export:` — confirm against the
# config loader's schema.
export:
  max_len: 256