# quickmt-ar-en — eole-config.yaml
# Arabic → English translation model training configuration (quickmt, Eval Results on model card).
# Uploaded by radinplaid via huggingface_hub (commit a81dfb3, verified).
## IO
save_data: data
# Canonical lowercase boolean (yamllint `truthy`); `True` is a YAML 1.1-ism.
overwrite: true
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard

### Vocab
src_vocab: ar.eole.vocab
tgt_vocab: en.eole.vocab
src_vocab_size: 32000
tgt_vocab_size: 32000
# Pad vocab to a multiple of 8 for tensor-core-friendly embedding shapes.
vocab_size_multiple: 8
share_vocab: false
n_sample: 0
# Training/validation corpora. `weight` sets the relative sampling ratio
# between corpora when building batches.
# NOTE(review): nesting restored — the scraped source had all keys flush-left,
# which parses as duplicate top-level keys and loses every corpus but the last.
data:
  corpus_1:
    path_src: train.ar
    path_tgt: train.en
    weight: 2
  corpus_2:
    path_src: newscrawl.backtrans.ar
    path_tgt: newscrawl.2024.en
    weight: 1
  corpus_3:
    path_src: madlad.backtrans.ar
    path_tgt: madlad.en
    weight: 2
  valid:
    path_src: valid.ar
    path_tgt: valid.en
# data:
# corpus_1:
# path_src: hf://quickmt/quickmt-train.ar-en/ar
# path_tgt: hf://quickmt/quickmt-train.ar-en/en
# path_sco: hf://quickmt/quickmt-train.ar-en/sco
# weight: 2
# corpus_2:
# path_src: hf://quickmt/newscrawl2024-en-backtranslated-ar/ar
# path_tgt: hf://quickmt/newscrawl2024-en-backtranslated-ar/en
# path_sco: hf://quickmt/newscrawl2024-en-backtranslated-ar/sco
# weight: 1
# corpus_3:
# path_src: hf://quickmt/madlad400-en-backtranslated-ar/ar
# path_tgt: hf://quickmt/madlad400-en-backtranslated-ar/en
# path_sco: hf://quickmt/madlad400-en-backtranslated-ar/sco
# weight: 2
# valid:
# path_src: valid.ar
# path_tgt: valid.en
# Pipeline applied to every example: subword-tokenize, then drop
# over-length pairs (either side > 256 subword tokens).
transforms: [sentencepiece, filtertoolong]
transforms_configs:
  sentencepiece:
    src_subword_model: "ar.spm.model"
    tgt_subword_model: "en.spm.model"
  filtertoolong:
    src_seq_length: 256
    tgt_seq_length: 256
training:
  # Run configuration
  model_path: quickmt-ar-en-eole-model
  keep_checkpoint: 4
  train_steps: 200000
  save_checkpoint_steps: 5000
  valid_steps: 5000

  # Train on a single GPU
  world_size: 1
  gpu_ranks: [0]

  # Batching: ~120,000 tokens per effective batch
  # (RTX 5090: batch_size 15000 × accum_count 8)
  batch_type: "tokens"
  batch_size: 15000
  valid_batch_size: 2048
  batch_size_multiple: 8
  accum_count: [8]
  accum_steps: [0]

  # Optimizer & compute
  compute_dtype: "fp16"
  optim: "adamw"
  # Canonical lowercase boolean (was `True`, a YAML 1.1-ism).
  use_amp: true
  # With decay_method "noam" the learning rate is scaled by the schedule,
  # so 3.0 is a schedule multiplier, not a raw step size.
  learning_rate: 3.0
  warmup_steps: 5000
  decay_method: "noam"
  adam_beta2: 0.998

  # Data loading
  bucket_size: 256000
  num_workers: 4
  prefetch_factor: 128

  # Hyperparameters
  dropout_steps: [0]
  dropout: [0.1]
  attention_dropout: [0.1]
  # 0 disables gradient-norm clipping.
  max_grad_norm: 0
  label_smoothing: 0.1
  average_decay: 0.0001
  param_init_method: xavier_uniform
  normalization: "tokens"
# Deep-encoder / shallow-decoder transformer (12 enc / 2 dec layers),
# a common layout for fast-inference MT models.
model:
  architecture: "transformer"
  # No tying: Arabic/English vocabularies are separate (share_vocab: false).
  share_embeddings: false
  share_decoder_embeddings: false
  add_estimator: false
  add_ffnbias: true
  add_qkvbias: false
  layer_norm: standard
  mlp_activation_fn: gelu
  hidden_size: 768
  encoder:
    layers: 12
  decoder:
    layers: 2
  heads: 16
  transformer_ff: 4096
  embeddings:
    # Must match hidden_size for the transformer residual stream.
    word_vec_size: 768
    position_encoding_type: "SinusoidalInterleaved"