```yaml
## IO
save_data: data
overwrite: true
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard

### Vocab
src_vocab: fren/fr.eole.vocab
tgt_vocab: fren/en.eole.vocab
src_vocab_size: 32000
tgt_vocab_size: 32000
vocab_size_multiple: 8
share_vocab: false
n_sample: 0

data:
  corpus_1:
    path_src: fren/train.cleaned.filtered.fr
    path_tgt: fren/train.cleaned.filtered.en
    weight: 200
  corpus_2:
    path_src: ../data/newscrawl.backtrans.cleaned.filtered.fr
    path_tgt: ../data/newscrawl.backtrans.cleaned.filtered.en
    weight: 35
  corpus_3:
    path_src: ../data/madlad.backtrans.cleaned.filtered.fr
    path_tgt: ../data/madlad.backtrans.cleaned.filtered.en
    weight: 68
  corpus_4:
    path_src: ../data/hansard.fr
    path_tgt: ../data/hansard.en
    weight: 5
  valid:
    path_src: fren/dev.fr
    path_tgt: fren/dev.en

transforms: [sentencepiece, filtertoolong]
transforms_configs:
  sentencepiece:
    src_subword_model: "fren/fr.spm.model"
    tgt_subword_model: "fren/en.spm.model"
  filtertoolong:
    src_seq_length: 256
    tgt_seq_length: 256

training:
  # Run configuration
  model_path: quickmt-fr-en-eole-model
  keep_checkpoint: 4
  train_steps: 200000
  save_checkpoint_steps: 5000
  valid_steps: 5000

  # Train on a single GPU
  world_size: 1
  gpu_ranks: [0]

  # Batching: 120,000 effective tokens per step (batch_size x accum_count)
  # On an RTX 5090: batch_size 15000 with accum_count 8
  batch_type: "tokens"
  batch_size: 6000
  valid_batch_size: 2048
  batch_size_multiple: 8
  accum_count: [20]
  accum_steps: [0]

  # Optimizer & compute
  compute_dtype: "fp16"
  optim: "adamw"
  #use_amp: true
  learning_rate: 3.0
  warmup_steps: 5000
  decay_method: "noam"
  adam_beta2: 0.998

  # Data loading
  bucket_size: 256000
  num_workers: 4
  prefetch_factor: 128

  # Hyperparameters
  dropout_steps: [0]
  dropout: [0.1]
  attention_dropout: [0.1]
  max_grad_norm: 0
  label_smoothing: 0.1
  average_decay: 0.0001
  param_init_method: xavier_uniform
  normalization: "tokens"

model:
  architecture: "transformer"
  share_embeddings: false
  share_decoder_embeddings: false
  add_estimator: false
  add_ffnbias: true
  add_qkvbias: false
  layer_norm: standard
  mlp_activation_fn: gelu
  hidden_size: 768
  encoder:
    layers: 12
  decoder:
    layers: 2
  heads: 16
  transformer_ff: 4096
  embeddings:
    word_vec_size: 768
    position_encoding_type: "SinusoidalInterleaved"
```
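This is a standard Eole YAML config, typically launched with Eole's CLI (`eole train -config config.yaml`). Two of its values interact in ways worth spelling out: the effective batch is `batch_size × accum_count` tokens per optimizer step, and `decay_method: noam` ties the learning rate to `hidden_size` and `warmup_steps`. The sketch below is plain Python rather than Eole internals; the `noam_lr` helper is hypothetical and simply mirrors the standard Noam formula (Vaswani et al., 2017) with this config's values.

```python
# Minimal sketch (not Eole's internals) of the "noam" learning-rate
# schedule selected above, using this config's values: learning_rate 3.0,
# warmup_steps 5000, hidden_size 768. The rate warms up linearly for
# warmup_steps, then decays as 1/sqrt(step).

def noam_lr(step: int, base_lr: float = 3.0,
            hidden_size: int = 768, warmup_steps: int = 5000) -> float:
    """Learning rate at a given optimizer step under Noam decay."""
    step = max(step, 1)  # guard against division by zero at step 0
    scale = hidden_size ** -0.5
    return base_lr * scale * min(step ** -0.5, step * warmup_steps ** -1.5)

# Effective tokens per optimizer step: batch_size * accum_count.
effective_batch = 6000 * 20  # 120,000 tokens, matching the comment in the config
print(f"effective batch: {effective_batch:,} tokens per optimizer step")

for step in (100, 5000, 50000, 200000):
    print(f"step {step:>6}: lr = {noam_lr(step):.6f}")
```

With these values the schedule peaks at roughly 1.5e-3 at step 5000 and decays as 1/sqrt(step) for the remainder of the 200,000-step run.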