---
# NeMo Duplex Text Normalization config (tagger + decoder).
name: &name DuplexTextNormalization
mode: joint # Three possible choices ['tn', 'itn', 'joint']
lang: ??? # Supported languages are ['en', 'ru', 'de', 'multilingual']; '???' is OmegaConf's mandatory-value marker — must be supplied at launch
# Pretrained Nemo Models (set to a .nemo path to skip training that component)
tagger_pretrained_model: null
decoder_pretrained_model: null
# Tagger
# PyTorch Lightning Trainer arguments for the tagger model.
tagger_trainer:
  devices: 1 # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 5 # the number of training epochs (for ru or de or multilingual, try 10)
  enable_checkpointing: false # provided by exp_manager
  logger: false # provided by exp_manager
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
  accelerator: gpu
  strategy: ddp
# Tagger model: a token classifier built on a pretrained transformer.
tagger_model:
  do_training: true
  transformer: albert-base-v2 # For ru, try cointegrated/rubert-tiny | For de, try bert-base-german-cased | For multilingual, try bert-base-multilingual-cased
  tokenizer: ${tagger_model.transformer} # reuse the transformer name for the tokenizer
  max_sequence_len: 128
  nemo_path: ${tagger_exp_manager.exp_dir}/tagger_model.nemo # exported .nemo path
  lang: ${lang}
  mode: ${mode}

  optim:
    name: adamw
    lr: 5e-5
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_token_precision
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.1
      last_epoch: -1
# NeMo exp_manager settings for the tagger run (logging + checkpointing).
tagger_exp_manager:
  exp_dir: nemo_experiments # where to store logs and checkpoints
  name: tagger_training # name of experiment
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_token_precision"
    mode: "max"
    save_best_model: true
    always_save_nemo: true
# Decoder
# PyTorch Lightning Trainer arguments for the seq2seq decoder model.
decoder_trainer:
  devices: 1 # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 3 # the number of training epochs
  enable_checkpointing: false # provided by exp_manager
  logger: false # provided by exp_manager
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
  accelerator: gpu
  strategy: ddp
  log_every_n_steps: 1 # Interval of logging.
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
# Decoder model: a seq2seq transformer that rewrites tagged spans.
decoder_model:
  do_training: true
  transformer: t5-small # For ru, try cointegrated/rut5-base | For de or multilingual, try google/mt5-base
  max_sequence_len: 80
  tokenizer: ${decoder_model.transformer} # reuse the transformer name for the tokenizer
  nemo_path: ${decoder_exp_manager.exp_dir}/decoder_model.nemo # exported .nemo path
  lang: ${lang}
  mode: ${mode}

  # Options related to covering grammars for TN
  use_cg: false # Use covering grammars to avoid catastrophic errors
  neural_confidence_threshold: 0.99 # If the neural model is not confident, then use the covering grammars
  n_tagged: 1 # number of tagged options to consider, -1 - to get all possible tagged options

  optim:
    name: adamw
    lr: 2e-4
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.0
      last_epoch: -1
# NeMo exp_manager settings for the decoder run (logging + checkpointing).
decoder_exp_manager:
  exp_dir: nemo_experiments # where to store logs and checkpoints
  name: decoder_training # name of experiment
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_loss"
    mode: "min"
    save_best_model: true
# Data
# Dataset settings shared by both the tagger and the decoder.
data:
  train_ds:
    data_path: train.tsv # provide the full path to the file. Ignored when using tarred dataset, tar_metadata_file is used instead.
    batch_size: 64 # local training batch size for each worker. Ignored when using tarred dataset, the batch size of the tarred dataset is used instead.
    shuffle: true
    max_insts: -1 # Maximum number of instances (-1 means no limit)
    # Refer to the text_normalization doc for more information about data augmentation
    tagger_data_augmentation: false
    decoder_data_augmentation: true
    use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    use_tarred_dataset: false # if true tar_metadata_file will be used
    tar_metadata_file: null # metadata for tarred dataset. A JSON file containing the list of tar_files in "text_tar_filepaths" field
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled

  validation_ds:
    data_path: dev.tsv # provide the full path to the file. Provide multiple paths to run evaluation on multiple datasets
    batch_size: 64
    shuffle: false
    max_insts: -1 # Maximum number of instances (-1 means no limit)
    use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false

  test_ds:
    data_path: test.tsv # provide the full path to the file
    batch_size: 64
    shuffle: false
    use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    errors_log_fp: errors.txt # Path to the file for logging the errors
# Inference
inference:
  interactive: false # Set to true if you want to enable the interactive mode when running duplex_text_normalization_test.py
  from_file: null # Path to the raw text, no labels required. Each sentence on a separate line
  batch_size: 16 # batch size for inference.from_file