# NeMo/examples/nlp/machine_translation/conf/huggingface.yaml
name: HuggingFaceEncoder
do_training: True # set to False if only preprocessing data
do_testing: False # set to True to run evaluation on test data after training
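# Usage note (assumption: this config is consumed by NeMo's Hydra-based NMT training
# script, examples/nlp/machine_translation/enc_dec_nmt.py). A typical launch overrides
# the data paths on the command line, e.g.:
#   python enc_dec_nmt.py --config-path=conf --config-name=huggingface \
#     model.train_ds.src_file_name=train.en model.train_ds.tgt_file_name=train.de \
#     model.validation_ds.src_file_name=dev.en model.validation_ds.tgt_file_name=dev.de
# The file names above are placeholders, not part of this config.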
model:
  beam_size: 4
  len_pen: 0.6
  max_generation_delta: -1
  label_smoothing: 0.1
  shared_tokenizer: false
  preproc_out_dir: null
  src_language: 'en'
  tgt_language: 'de'
  train_ds:
    src_file_name: null
    tgt_file_name: null
    use_tarred_dataset: False # if true tar_file_name and meta_file_name will be used (or created automatically)
    # config for preprocessing training data and creating a tarred dataset automatically
    tar_file_prefix: parallel # prefix for tar file names
    tar_files: null # if data has already been preprocessed (rest of config ignored)
    metadata_file: null # metadata for tarred dataset
    lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding
    num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
    shard_strategy: scatter # tarred dataset shard distribution strategy
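    # Note (assumption based on NeMo's tarred-dataset documentation): valid values are
    # 'scatter' (each worker reads a disjoint subset of shards) and
    # 'replicate' (every worker reads all shards).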
    n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2)
    tokens_in_batch: 512
    clean: true
    max_seq_length: 512
    shuffle: true
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8
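    # Example (hypothetical paths, sketched from the comments above): to only preprocess raw
    # parallel text into a tarred dataset, run with
    #   do_training=False \
    #   model.preproc_out_dir=/data/preproc \
    #   model.train_ds.use_tarred_dataset=True \
    #   model.train_ds.src_file_name=train.en model.train_ds.tgt_file_name=train.de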
  validation_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8
  test_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8
  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1
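    # Note (assumption based on common NeMo scheduler semantics): warmup_ratio: 0.1 warms the
    # learning rate up over the first 10% of total training steps before the
    # inverse-square-root decay takes over.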
  encoder_tokenizer:
    library: huggingface
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null
  decoder_tokenizer:
    library: yttm
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null
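  # Note (assumption): library 'yttm' refers to the YouTokenToMe BPE tokenizer;
  # tokenizer_model is expected to point to a trained BPE model file, or one may be
  # created during the preprocessing step.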
  encoder:
    library: huggingface
    model_name: bert-base-uncased
    pretrained: false
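  # Note (assumption based on how NeMo wraps HuggingFace encoders): with pretrained: false,
  # the bert-base-uncased architecture is instantiated with random weights; pretrained: true
  # would load the published HuggingFace checkpoint instead.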
  decoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 2
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    inner_size: 2048
    num_layers: 6
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    pre_ln: false
  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0
    use_transformer_init: true
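  # Note (assumption): the head is the final classification layer that projects decoder hidden
  # states to target-vocabulary logits; log_softmax: true makes it emit log-probabilities,
  # which pairs with the label_smoothing setting above.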
trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
  accelerator: gpu
  enable_checkpointing: False
  logger: False
  log_every_n_steps: 50 # Interval of logging.
  check_val_every_n_epoch: 1
  benchmark: False
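# Note (assumption based on standard NeMo example configs): enable_checkpointing and logger
# are disabled here because exp_manager (below) is typically responsible for creating the
# checkpoint callback and loggers.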
exp_manager:
  name: HuggingFaceEncoder
  files_to_copy: []
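  # Commonly enabled exp_manager options, shown commented out (field names assumed from
  # NeMo's exp_manager; not part of the original config):
  # exp_dir: null                    # defaults to ./nemo_experiments when null
  # create_tensorboard_logger: True
  # create_checkpoint_callback: True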