name: &name DuplexTextNormalization
mode: joint  # Three possible choices ['tn', 'itn', 'joint']
lang: ???  # Supported languages are ['en', 'ru', 'de', 'multilingual']

# Pretrained Nemo Models
tagger_pretrained_model: null
decoder_pretrained_model: null

# Tagger
tagger_trainer:
  devices: 1  # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 5  # the number of training epochs (for ru or de or multilingual, try 10)
  enable_checkpointing: false  # provided by exp_manager
  logger: false  # provided by exp_manager
  accumulate_grad_batches: 1  # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32  # Should be set to 16 for O1 and O2 to enable the AMP.
  accelerator: gpu
  strategy: ddp

tagger_model:
  do_training: true
  transformer: albert-base-v2  # For ru, try cointegrated/rubert-tiny | For de, try bert-base-german-cased | For multilingual, try bert-base-multilingual-cased
  tokenizer: ${tagger_model.transformer}
  max_sequence_len: 128
  nemo_path: ${tagger_exp_manager.exp_dir}/tagger_model.nemo  # exported .nemo path
  lang: ${lang}
  mode: ${mode}

  optim:
    name: adamw
    lr: 5e-5
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_token_precision
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.1
      last_epoch: -1

tagger_exp_manager:
  exp_dir: nemo_experiments  # where to store logs and checkpoints
  name: tagger_training  # name of experiment
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_token_precision"
    mode: "max"
    save_best_model: true
    always_save_nemo: true

# Decoder
decoder_trainer:
  devices: 1  # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 3  # the number of training epochs
  enable_checkpointing: false  # provided by exp_manager
  logger: false  # provided by exp_manager
  accumulate_grad_batches: 1  # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32  # Should be set to 16 for O1 and O2 to enable the AMP.
  accelerator: gpu
  strategy: ddp
  log_every_n_steps: 1  # Interval of logging.
  val_check_interval: 1.0  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  resume_from_checkpoint: null  # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.

decoder_model:
  do_training: true
  transformer: t5-small  # For ru, try cointegrated/rut5-base | For de or multilingual, try google/mt5-base
  max_sequence_len: 80
  tokenizer: ${decoder_model.transformer}
  nemo_path: ${decoder_exp_manager.exp_dir}/decoder_model.nemo  # exported .nemo path
  lang: ${lang}
  mode: ${mode}

  # Options related to covering grammars for TN
  use_cg: false  # Use covering grammars to avoid catastrophic errors
  neural_confidence_threshold: 0.99  # If the neural model is not confident, then use the covering grammars
  n_tagged: 1  # number of tagged options to consider, -1 - to get all possible tagged options

  optim:
    name: adamw
    lr: 2e-4
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.0
      last_epoch: -1

decoder_exp_manager:
  exp_dir: nemo_experiments  # where to store logs and checkpoints
  name: decoder_training  # name of experiment
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_loss"
    mode: "min"
    save_best_model: true

# Data
data:
  train_ds:
    data_path: train.tsv  # provide the full path to the file. Ignored when using tarred dataset, tar_metadata_file is used instead.
    batch_size: 64  # local training batch size for each worker. Ignored when using tarred dataset, the batch size of the tarred dataset is used instead.
    shuffle: true
    max_insts: -1  # Maximum number of instances (-1 means no limit)
    # Refer to the text_normalization doc for more information about data augmentation
    tagger_data_augmentation: false
    decoder_data_augmentation: true
    use_cache: false  # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    use_tarred_dataset: false  # if true tar_metadata_file will be used
    tar_metadata_file: null  # metadata for tarred dataset. A JSON file containing the list of tar_files in "text_tar_filepaths" field
    tar_shuffle_n: 100  # How many samples to look ahead and load to be shuffled

  validation_ds:
    data_path: dev.tsv  # provide the full path to the file. Provide multiple paths to run evaluation on multiple datasets
    batch_size: 64
    shuffle: false
    max_insts: -1  # Maximum number of instances (-1 means no limit)
    use_cache: false  # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false

  test_ds:
    data_path: test.tsv  # provide the full path to the file
    batch_size: 64
    shuffle: false
    use_cache: false  # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    errors_log_fp: errors.txt  # Path to the file for logging the errors

# Inference
inference:
  interactive: false  # Set to true if you want to enable the interactive mode when running duplex_text_normalization_test.py
  from_file: null  # Path to the raw text, no labels required. Each sentence on a separate line
  batch_size: 16  # batch size for inference.from_file