## IO
save_data: data
overwrite: true
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard
### Vocab
src_vocab: ar.eole.vocab
tgt_vocab: en.eole.vocab
src_vocab_size: 32000
tgt_vocab_size: 32000
vocab_size_multiple: 8
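# Note (not in the original file): vocab_size_multiple pads the vocabulary size up
# to a multiple of 8 so the embedding and output-projection matrices stay
# tensor-core friendly under fp16.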
share_vocab: false
n_sample: 0
data:
    corpus_1:
        path_src: train.ar
        path_tgt: train.en
        weight: 2
    corpus_2:
        path_src: newscrawl.backtrans.ar
        path_tgt: newscrawl.2024.en
        weight: 1
    corpus_3:
        path_src: madlad.backtrans.ar
        path_tgt: madlad.en
        weight: 2
    valid:
        path_src: valid.ar
        path_tgt: valid.en
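# Corpus weights are relative sampling ratios (standard eole/OpenNMT behavior, not
# stated here): training examples are drawn from corpus_1/corpus_2/corpus_3 at
# roughly 2:1:2.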
# data:
#     corpus_1:
#         path_src: hf://quickmt/quickmt-train.ar-en/ar
#         path_tgt: hf://quickmt/quickmt-train.ar-en/en
#         path_sco: hf://quickmt/quickmt-train.ar-en/sco
#         weight: 2
#     corpus_2:
#         path_src: hf://quickmt/newscrawl2024-en-backtranslated-ar/ar
#         path_tgt: hf://quickmt/newscrawl2024-en-backtranslated-ar/en
#         path_sco: hf://quickmt/newscrawl2024-en-backtranslated-ar/sco
#         weight: 1
#     corpus_3:
#         path_src: hf://quickmt/madlad400-en-backtranslated-ar/ar
#         path_tgt: hf://quickmt/madlad400-en-backtranslated-ar/en
#         path_sco: hf://quickmt/madlad400-en-backtranslated-ar/sco
#         weight: 2
#     valid:
#         path_src: valid.ar
#         path_tgt: valid.en
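# The commented-out block above appears to be the same training mixture read
# directly from the quickmt datasets on the Hugging Face Hub via hf:// paths; the
# extra path_sco files presumably carry per-pair quality scores (assumption).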
transforms: [sentencepiece, filtertoolong]
transforms_configs:
    sentencepiece:
        src_subword_model: "ar.spm.model"
        tgt_subword_model: "en.spm.model"
    filtertoolong:
        src_seq_length: 256
        tgt_seq_length: 256
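# Transforms apply in list order (eole convention): sentencepiece tokenizes into
# subwords first, then filtertoolong drops any pair longer than 256 tokens on
# either side, so the length limit counts subword tokens, not raw words.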
training:
    # Run configuration
    model_path: quickmt-ar-en-eole-model
    keep_checkpoint: 4
    train_steps: 200000
    save_checkpoint_steps: 5000
    valid_steps: 5000
    # Train on a single GPU
    world_size: 1
    gpu_ranks: [0]
    # Batching 120,000 tokens
    # For RTX 5090, 15000 batch size, accum_count 8
    batch_type: "tokens"
    batch_size: 15000
    valid_batch_size: 2048
    batch_size_multiple: 8
    accum_count: [8]
    accum_steps: [0]
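    # Sanity check on the comment above: 15,000 tokens per forward pass with
    # gradient accumulation of 8 gives 15,000 x 8 = 120,000 tokens per optimizer
    # update on the single GPU.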
# Optimizer & Compute
compute_dtype: "fp16"
optim: "adamw"
use_amp: True
learning_rate: 3.0
warmup_steps: 5000
decay_method: "noam"
adam_beta2: 0.998
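    # Assuming eole's "noam" matches the standard Transformer schedule,
    # lr(step) = 3.0 * hidden_size^-0.5 * min(step^-0.5, step * 5000^-1.5),
    # which peaks at roughly 3.0 / (sqrt(768) * sqrt(5000)) ~= 1.5e-3 at step 5000.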
    # Data loading
    bucket_size: 256000
    num_workers: 4
    prefetch_factor: 128
    # Hyperparams
    dropout_steps: [0]
    dropout: [0.1]
    attention_dropout: [0.1]
    max_grad_norm: 0
    label_smoothing: 0.1
    average_decay: 0.0001
    param_init_method: xavier_uniform
    normalization: "tokens"
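    # Two conventions worth noting (our reading of eole, not stated in the file):
    # max_grad_norm 0 disables gradient clipping, and average_decay 0.0001 keeps an
    # exponential moving average of the parameters for validation and checkpointing.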
model:
    architecture: "transformer"
    share_embeddings: false
    share_decoder_embeddings: false
    add_estimator: false
    add_ffnbias: true
    add_qkvbias: false
    layer_norm: standard
    mlp_activation_fn: gelu
    hidden_size: 768
    encoder:
        layers: 12
    decoder:
        layers: 2
    heads: 16
    transformer_ff: 4096
    embeddings:
        word_vec_size: 768
        position_encoding_type: "SinusoidalInterleaved"
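    # Deep-encoder/shallow-decoder layout (12 encoder layers, 2 decoder layers): a
    # common NMT trade-off that preserves quality while making autoregressive
    # decoding substantially faster.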