Translation
Arabic
English
Eval Results
File size: 3,089 Bytes
321e894
a81dfb3
321e894
 
 
 
 
 
 
 
 
 
a81dfb3
 
321e894
 
 
 
 
 
a81dfb3
 
 
 
 
 
 
 
 
 
 
321e894
a81dfb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321e894
 
 
 
 
 
 
 
 
 
 
 
a81dfb3
321e894
 
a81dfb3
 
321e894
 
 
 
 
a81dfb3
 
321e894
a81dfb3
 
321e894
a81dfb3
321e894
 
 
 
a81dfb3
 
 
321e894
 
 
 
 
a81dfb3
321e894
a81dfb3
321e894
 
 
 
a81dfb3
 
321e894
 
 
 
 
 
 
 
a81dfb3
321e894
a81dfb3
321e894
a81dfb3
 
 
321e894
a81dfb3
321e894
 
a81dfb3
321e894
 
a81dfb3
321e894
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
## IO
save_data: data
overwrite: true
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard

## Vocab
src_vocab: ar.eole.vocab
tgt_vocab: en.eole.vocab
src_vocab_size: 32000
tgt_vocab_size: 32000
vocab_size_multiple: 8
share_vocab: false
n_sample: 0

data:
    corpus_1:
        path_src: train.ar
        path_tgt: train.en
        weight: 2
    corpus_2:
        path_src: newscrawl.backtrans.ar
        path_tgt: newscrawl.2024.en
        weight: 1
    corpus_3:
        path_src: madlad.backtrans.ar
        path_tgt: madlad.en
        weight: 2
    valid:
        path_src: valid.ar
        path_tgt: valid.en

# data:
#     corpus_1:
#         path_src: hf://quickmt/quickmt-train.ar-en/ar
#         path_tgt: hf://quickmt/quickmt-train.ar-en/en
#         path_sco: hf://quickmt/quickmt-train.ar-en/sco
#         weight: 2
#     corpus_2:
#         path_src: hf://quickmt/newscrawl2024-en-backtranslated-ar/ar
#         path_tgt: hf://quickmt/newscrawl2024-en-backtranslated-ar/en
#         path_sco: hf://quickmt/newscrawl2024-en-backtranslated-ar/sco
#         weight: 1
#     corpus_3:
#         path_src: hf://quickmt/madlad400-en-backtranslated-ar/ar
#         path_tgt: hf://quickmt/madlad400-en-backtranslated-ar/en
#         path_sco: hf://quickmt/madlad400-en-backtranslated-ar/sco
#         weight: 2
#     valid:
#         path_src: valid.ar
#         path_tgt: valid.en



transforms: [sentencepiece, filtertoolong]
transforms_configs:
  sentencepiece:
    src_subword_model: "ar.spm.model"
    tgt_subword_model: "en.spm.model"
  filtertoolong:
    src_seq_length: 256
    tgt_seq_length: 256

training:
    # Run configuration
    model_path: quickmt-ar-en-eole-model
    keep_checkpoint: 4
    train_steps: 200000
    save_checkpoint_steps: 5000
    valid_steps: 5000
    
    # Train on a single GPU
    world_size: 1
    gpu_ranks: [0]

    # Batching 120,000 tokens
    # For RTX 5090, 15000 batch size, accum_count 8
    batch_type: "tokens"
    batch_size: 15000
    valid_batch_size: 2048
    batch_size_multiple: 8
    accum_count: [8]
    accum_steps: [0]

    # Optimizer & Compute
    compute_dtype: "fp16"
    optim: "adamw"
    use_amp: true
    learning_rate: 3.0
    warmup_steps: 5000
    decay_method: "noam"
    adam_beta2: 0.998

    # Data loading
    bucket_size: 256000
    num_workers: 4
    prefetch_factor: 128
    
    # Hyperparams
    dropout_steps: [0]
    dropout: [0.1]
    attention_dropout: [0.1]
    max_grad_norm: 0
    label_smoothing: 0.1
    average_decay: 0.0001
    param_init_method: xavier_uniform
    normalization: "tokens"

model:
    architecture: "transformer"
    share_embeddings: false
    share_decoder_embeddings: false
    add_estimator: false
    add_ffnbias: true
    add_qkvbias: false
    layer_norm: standard
    mlp_activation_fn: gelu
    hidden_size: 768
    encoder:
        layers: 12
    decoder:
        layers: 2
    heads: 16
    transformer_ff: 4096
    embeddings:
        word_vec_size: 768
        position_encoding_type: "SinusoidalInterleaved"