## IO
save_data: data
overwrite: true
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard

### Vocab
src_vocab: fren/fr.eole.vocab
tgt_vocab: fren/en.eole.vocab
src_vocab_size: 32000
tgt_vocab_size: 32000
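# Padding each vocab to a multiple of 8 keeps the embedding and output-projection
# dimensions aligned with fp16 tensor-core tile sizes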
vocab_size_multiple: 8
share_vocab: false
n_sample: 0

data:
    corpus_1:
        path_src: fren/train.cleaned.filtered.fr
        path_tgt: fren/train.cleaned.filtered.en
        weight: 200
    corpus_2:
        path_src: ../data/newscrawl.backtrans.cleaned.filtered.fr
        path_tgt: ../data/newscrawl.backtrans.cleaned.filtered.en
        weight: 35
    corpus_3:
        path_src: ../data/madlad.backtrans.cleaned.filtered.fr
        path_tgt: ../data/madlad.backtrans.cleaned.filtered.en
        weight: 68
    corpus_4:
        path_src: ../data/hansard.fr
        path_tgt: ../data/hansard.en
        weight: 5
    valid:
        path_src: fren/dev.fr
        path_tgt: fren/dev.en
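# Corpus weights set relative sampling ratios (hedged: assuming Eole draws
# roughly `weight` examples from each corpus per mixing round, as in
# OpenNMT-py). With weights 200/35/68/5 (sum 308), the training mix is about:
#   corpus_1 ~65%, corpus_2 ~11%, corpus_3 ~22%, corpus_4 ~2%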

transforms: [sentencepiece, filtertoolong]
transforms_configs:
    sentencepiece:
        src_subword_model: "fren/fr.spm.model"
        tgt_subword_model: "fren/en.spm.model"
    filtertoolong:
        src_seq_length: 256
        tgt_seq_length: 256
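# Transforms run in the listed order, so filtertoolong counts subword tokens
# produced by sentencepiece: pairs longer than 256 subwords on either side are
# dropped (hedged: standard OpenNMT-py/Eole transform semantics)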

training:
    # Run configuration
    model_path: quickmt-fr-en-eole-model
    keep_checkpoint: 4
    train_steps: 200000
    save_checkpoint_steps: 5000
    valid_steps: 5000
    
    # Train on a single GPU
    world_size: 1
    gpu_ranks: [0]

    # Batching: ~120,000 tokens per optimizer step
    # For an RTX 5090, use batch_size 15000 with accum_count 8
    batch_type: "tokens"
    batch_size: 6000
    valid_batch_size: 2048
    batch_size_multiple: 8
    accum_count: [20]
    accum_steps: [0]
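    # Worked math (hedged: assuming Eole scales the effective batch by gradient
    # accumulation, as OpenNMT-py does):
    #   tokens per update = batch_size * accum_count * world_size
    #                     = 6000 * 20 * 1 = 120,000
    #   RTX 5090 variant:   15000 * 8 * 1 = 120,000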

    # Optimizer & Compute
    compute_dtype: "fp16"
    optim: "adamw"
    #use_amp: True
    learning_rate: 3.0
    warmup_steps: 5000
    decay_method: "noam"
    adam_beta2: 0.998
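    # Noam schedule (hedged: assuming Eole uses the Vaswani et al. formula with
    # the model's hidden_size, as OpenNMT-py did):
    #   lr(step) = learning_rate * hidden_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    # Peak LR at step 5000: 3.0 / (sqrt(768) * sqrt(5000)) ~= 1.5e-3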

    # Data loading
    bucket_size: 256000
    num_workers: 4
    prefetch_factor: 128
    
    # Hyperparams
    dropout_steps: [0]
    dropout: [0.1]
    attention_dropout: [0.1]
    max_grad_norm: 0
    label_smoothing: 0.1
    average_decay: 0.0001
    param_init_method: xavier_uniform
    normalization: "tokens"
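    # Notes (hedged, per common OpenNMT-py/Eole semantics):
    # - max_grad_norm 0 disables gradient-norm clipping
    # - average_decay 0.0001 maintains an exponential moving average of the weights
    # - normalization "tokens" averages the loss over target tokens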

model:
    architecture: "transformer"
    share_embeddings: false
    share_decoder_embeddings: false
    add_estimator: false
    add_ffnbias: true
    add_qkvbias: false
    layer_norm: standard
    mlp_activation_fn: gelu
    hidden_size: 768
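    # Deep encoder, shallow decoder: 12 encoder layers vs. 2 decoder layers
    # shifts compute into the parallelizable encoder so that autoregressive
    # decoding stays fast at inference time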
    encoder:
        layers: 12
    decoder:
        layers: 2
    heads: 16
    transformer_ff: 4096
    embeddings:
        word_vec_size: 768
        position_encoding_type: "SinusoidalInterleaved"
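
# Typical usage (a sketch; assumes Eole's CLI entry points, and "fren.yaml"
# is a hypothetical name for this file):
#   eole build_vocab -config fren.yaml
#   eole train -config fren.yaml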