name: &name DuplexTextNormalization
# Task mode of the duplex pipeline.
mode: joint # Three possible choices ['tn', 'itn', 'joint']
# '???' is the OmegaConf mandatory-value marker: this key MUST be supplied
# (e.g. on the command line) or config resolution raises an error.
lang: ??? # Supported languages are ['en', 'ru', 'de', 'multilingual']
# Pretrained Nemo Models
# null = no pretrained .nemo checkpoint is loaded (presumably the model is
# trained from scratch — confirm against the training script).
tagger_pretrained_model: null
decoder_pretrained_model: null
# Tagger
# PyTorch Lightning Trainer arguments for the tagger stage.
tagger_trainer:
  devices: 1 # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 5 # the number of training epochs (for ru or de or multilingual, try 10)
  # Canonical lowercase booleans (yamllint `truthy`); checkpointing/logging are
  # handled by exp_manager, so the Trainer's own versions are disabled here.
  enable_checkpointing: false # provided by exp_manager
  logger: false # provided by exp_manager
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
  accelerator: gpu
  strategy: ddp
# Configuration of the duplex tagger model.
tagger_model:
  do_training: true
  # Hugging Face transformer name used for the tagger.
  transformer: albert-base-v2 # For ru, try cointegrated/rubert-tiny | For de, try bert-base-german-cased | For multilingual, try bert-base-multilingual-cased
  tokenizer: ${tagger_model.transformer} # reuses the transformer name via OmegaConf interpolation
  max_sequence_len: 128
  nemo_path: ${tagger_exp_manager.exp_dir}/tagger_model.nemo # exported .nemo path
  lang: ${lang} # inherited from the top-level setting
  mode: ${mode} # inherited from the top-level setting
  optim:
    name: adamw
    lr: 5e-5
    weight_decay: 0.01
    sched:
      name: WarmupAnnealing
      # pytorch lightning args
      monitor: val_token_precision
      reduce_on_plateau: false
      # scheduler config override
      warmup_steps: null # null: warmup length is derived from warmup_ratio instead
      warmup_ratio: 0.1
      last_epoch: -1
# Experiment manager settings for the tagger stage (logging + checkpointing).
tagger_exp_manager:
  exp_dir: nemo_experiments # where to store logs and checkpoints
  name: tagger_training # name of experiment
  # Booleans normalized to canonical lowercase for consistency with the
  # rest of this stanza (yamllint `truthy`); values are unchanged.
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_token_precision"
    mode: "max" # higher precision is better
    save_best_model: true
    always_save_nemo: true
# Decoder
# PyTorch Lightning Trainer arguments for the decoder stage.
decoder_trainer:
  devices: 1 # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 3 # the number of training epochs
  # Canonical lowercase boolean (was `False`), matching `logger: false` below.
  enable_checkpointing: false # provided by exp_manager
  logger: false # provided by exp_manager
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
  accelerator: gpu
  strategy: ddp
  log_every_n_steps: 1 # Interval of logging.
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
# Configuration of the duplex decoder model.
decoder_model:
  do_training: true
  # Hugging Face transformer name used for the decoder.
  transformer: t5-small # For ru, try cointegrated/rut5-base | For de or multilingual, try google/mt5-base
  max_sequence_len: 80
  tokenizer: ${decoder_model.transformer} # reuses the transformer name via OmegaConf interpolation
  nemo_path: ${decoder_exp_manager.exp_dir}/decoder_model.nemo # exported .nemo path
  lang: ${lang} # inherited from the top-level setting
  mode: ${mode} # inherited from the top-level setting
  # Options related to covering grammars for TN
  use_cg: false # Use covering grammars to avoid catastrophic errors
  neural_confidence_threshold: 0.99 # If the neural model is not confident, then use the covering grammars
  n_tagged: 1 # number of tagged options to consider, -1 - to get all possible tagged options
  optim:
    name: adamw
    lr: 2e-4
    weight_decay: 0.01
    sched:
      name: WarmupAnnealing
      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false
      # scheduler config override
      warmup_steps: null # null: warmup length is derived from warmup_ratio instead
      warmup_ratio: 0.0
      last_epoch: -1
# Experiment manager settings for the decoder stage (logging + checkpointing).
decoder_exp_manager:
  exp_dir: nemo_experiments # where to store logs and checkpoints
  name: decoder_training # name of experiment
  # Booleans normalized to canonical lowercase (were `True`) for consistency
  # with the rest of the file (yamllint `truthy`); values are unchanged.
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_loss"
    mode: "min" # lower loss is better
    save_best_model: true
# Data
# Dataset configuration shared by the tagger and decoder stages.
data:
  train_ds:
    data_path: train.tsv # provide the full path to the file. Ignored when using tarred dataset, tar_metadata_file is used instead.
    batch_size: 64 # local training batch size for each worker. Ignored when using tarred dataset, the batch size of the tarred dataset is used instead.
    shuffle: true
    max_insts: -1 # Maximum number of instances (-1 means no limit)
    # Refer to the text_normalization doc for more information about data augmentation
    tagger_data_augmentation: false
    decoder_data_augmentation: true
    use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    # Canonical lowercase boolean (was `False`), matching the rest of the stanza.
    use_tarred_dataset: false # if true tar_metadata_file will be used
    tar_metadata_file: null # metadata for tarred dataset. A JSON file containing the list of tar_files in "text_tar_filepaths" field
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
  validation_ds:
    data_path: dev.tsv # provide the full path to the file. Provide multiple paths to run evaluation on multiple datasets
    batch_size: 64
    shuffle: false
    max_insts: -1 # Maximum number of instances (-1 means no limit)
    use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
  test_ds:
    data_path: test.tsv # provide the full path to the file
    batch_size: 64
    shuffle: false
    use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    errors_log_fp: errors.txt # Path to the file for logging the errors
# Inference
# Options used at inference/evaluation time.
inference:
  interactive: false # Set to true if you want to enable the interactive mode when running duplex_text_normalization_test.py
  from_file: null # Path to the raw text, no labels required. Each sentence on a separate line
  batch_size: 16 # batch size for inference.from_file