---
# Training configuration for a quickmt Persian→English (fa-en) translation model
# (eole / OpenNMT-style schema). Reconstructed from a garbled dump: stray
# trailing "|" markers and lone-"|" filler lines removed, and key nesting
# restored per the eole config layout (data / transforms_configs / training /
# training.model). NOTE(review): nesting follows the standard eole schema —
# confirm against the tool version in use.

# --- IO / logging ---
save_data: data
overwrite: true  # canonical lowercase boolean (was "True")
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard

# --- Vocabulary ---
src_vocab: fa.eole.vocab
tgt_vocab: en.eole.vocab
src_vocab_size: 32000
tgt_vocab_size: 32000
vocab_size_multiple: 8  # pad vocab to a multiple of 8 for tensor-core efficiency
share_vocab: false
n_sample: 0

# --- Corpora (weights control sampling ratio between corpora) ---
data:
  corpus_1:
    path_src: hf://quickmt/quickmt-train.fa-en/fa
    path_tgt: hf://quickmt/quickmt-train.fa-en/en
    path_sco: hf://quickmt/quickmt-train.fa-en/sco
    weight: 2
  corpus_2:
    path_src: hf://quickmt/newscrawl2024-en-backtranslated-fa/fa
    path_tgt: hf://quickmt/newscrawl2024-en-backtranslated-fa/en
    path_sco: hf://quickmt/newscrawl2024-en-backtranslated-fa/sco
    weight: 1
  corpus_3:
    path_src: hf://quickmt/madlad400-en-backtranslated-fa/fa
    path_tgt: hf://quickmt/madlad400-en-backtranslated-fa/en
    path_sco: hf://quickmt/madlad400-en-backtranslated-fa/sco
    weight: 2
  valid:
    path_src: dev.fa
    path_tgt: dev.en

# --- On-the-fly data transforms ---
transforms: [sentencepiece, filtertoolong]
transforms_configs:
  sentencepiece:
    src_subword_model: "fa.spm.model"
    tgt_subword_model: "en.spm.model"
  filtertoolong:
    src_seq_length: 256
    tgt_seq_length: 256

training:
  # Checkpointing / schedule
  model_path: quickmt-fa-en-eole-model
  keep_checkpoint: 4
  train_steps: 200000
  save_checkpoint_steps: 5000
  valid_steps: 5000

  # Distributed / hardware
  world_size: 1
  gpu_ranks: [0]

  # Batching (token-based batches; effective batch = 6000 tokens x 20 accum)
  batch_type: "tokens"
  batch_size: 6000
  valid_batch_size: 2048
  batch_size_multiple: 8
  accum_count: [20]
  accum_steps: [0]

  # Optimizer (Noam schedule: learning_rate is a scale factor, not an absolute LR)
  compute_dtype: "fp16"
  optim: "adamw"
  learning_rate: 3.0
  warmup_steps: 5000
  decay_method: "noam"
  adam_beta2: 0.998

  # Data-loading pipeline
  bucket_size: 256000
  num_workers: 4
  prefetch_factor: 128

  # Regularization / initialization
  dropout_steps: [0]
  dropout: [0.1]
  attention_dropout: [0.1]
  max_grad_norm: 0  # 0 disables gradient clipping
  label_smoothing: 0.1
  average_decay: 0.0001
  param_init_method: xavier_uniform
  normalization: "tokens"

  # --- Model architecture (deep-encoder / shallow-decoder transformer) ---
  model:
    architecture: "transformer"
    share_embeddings: false
    share_decoder_embeddings: false
    add_estimator: false
    add_ffnbias: true
    add_qkvbias: false
    layer_norm: standard
    mlp_activation_fn: gelu
    hidden_size: 768
    encoder:
      layers: 12
    decoder:
      layers: 2
    heads: 16
    transformer_ff: 4096
    embeddings:
      word_vec_size: 768
      position_encoding_type: "SinusoidalInterleaved"