name: &name DuplexTextNormalization
mode: joint # Three possible choices ['tn', 'itn', 'joint']
lang: ??? # Supported languages are ['en', 'ru', 'de', 'multilingual']

# Pretrained Nemo Models
tagger_pretrained_model: null
decoder_pretrained_model: null

# Tagger
tagger_trainer:
  devices: 1 # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 5 # the number of training epochs (for ru, de, or multilingual, try 10)
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32 # Should be set to 16 for O1 and O2 to enable AMP.
  accelerator: gpu
  strategy: ddp

tagger_model:
  do_training: true
  transformer: albert-base-v2 # For ru, try cointegrated/rubert-tiny | For de, try bert-base-german-cased | For multilingual, try bert-base-multilingual-cased
  tokenizer: ${tagger_model.transformer}
  max_sequence_len: 128
  nemo_path: ${tagger_exp_manager.exp_dir}/tagger_model.nemo # exported .nemo path
  lang: ${lang}
  mode: ${mode}

  optim:
    name: adamw
    lr: 5e-5
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_token_precision
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.1
      last_epoch: -1

tagger_exp_manager:
  exp_dir: nemo_experiments # where to store logs and checkpoints
  name: tagger_training # name of experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_token_precision"
    mode: "max"
    save_best_model: true
    always_save_nemo: true

# Decoder
decoder_trainer:
  devices: 1 # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 3 # the number of training epochs
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32 # Should be set to 16 for O1 and O2 to enable AMP.
  accelerator: gpu
  strategy: ddp
  log_every_n_steps: 1 # Interval of logging.
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or to an int for a number of iterations
  resume_from_checkpoint: null # The path to a checkpoint file to continue training; restores the whole state including the epoch, step, LR schedulers, apex, etc.

decoder_model:
  do_training: true
  transformer: t5-small # For ru, try cointegrated/rut5-base | For de or multilingual, try google/mt5-base
  max_sequence_len: 80
  tokenizer: ${decoder_model.transformer}
  nemo_path: ${decoder_exp_manager.exp_dir}/decoder_model.nemo # exported .nemo path
  lang: ${lang}
  mode: ${mode}

  # Options related to covering grammars for TN
  use_cg: false # Use covering grammars to avoid catastrophic errors
  neural_confidence_threshold: 0.99 # If the neural model is not confident, then use the covering grammars
  n_tagged: 1 # number of tagged options to consider; -1 to get all possible tagged options

  optim:
    name: adamw
    lr: 2e-4
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.0
      last_epoch: -1

decoder_exp_manager:
  exp_dir: nemo_experiments # where to store logs and checkpoints
  name: decoder_training # name of experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_loss"
    mode: "min"
    save_best_model: True

# Data
data:
  train_ds:
    data_path: train.tsv # provide the full path to the file. Ignored when using a tarred dataset; tar_metadata_file is used instead.
    batch_size: 64 # local training batch size for each worker. Ignored when using a tarred dataset; the batch size of the tarred dataset is used instead.
    shuffle: true
    max_insts: -1 # Maximum number of instances (-1 means no limit)
    # Refer to the text_normalization doc for more information about data augmentation
    tagger_data_augmentation: false
    decoder_data_augmentation: true
    use_cache: false # uses a cache to store the processed dataset; useful for large datasets to speed things up (especially when using multiple GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    use_tarred_dataset: False # if true, tar_metadata_file will be used
    tar_metadata_file: null # metadata for the tarred dataset: a JSON file containing the list of tar files in the "text_tar_filepaths" field
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled

  validation_ds:
    data_path: dev.tsv # provide the full path to the file. Provide multiple paths to run evaluation on multiple datasets
    batch_size: 64
    shuffle: false
    max_insts: -1 # Maximum number of instances (-1 means no limit)
    use_cache: false # uses a cache to store the processed dataset; useful for large datasets to speed things up (especially when using multiple GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false

  test_ds:
    data_path: test.tsv # provide the full path to the file
    batch_size: 64
    shuffle: false
    use_cache: false # uses a cache to store the processed dataset; useful for large datasets to speed things up (especially when using multiple GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    errors_log_fp: errors.txt # Path to the file for logging the errors

# Inference
inference:
  interactive: false # Set to true to enable interactive mode when running duplex_text_normalization_test.py
  from_file: null # Path to the raw text, no labels required. Each sentence on a separate line
  batch_size: 16 # batch size for inference with from_file
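
# Example usage (a minimal sketch: the training script name and all paths below
# are illustrative assumptions; duplex_text_normalization_test.py is the script
# referenced by inference.interactive above). Note that `lang` must be
# overridden on the command line, since it is a mandatory (???) field.
#
# Training (Hydra overrides on the command line):
#   python duplex_text_normalization_train.py \
#       lang=en \
#       mode=joint \
#       data.train_ds.data_path=/path/to/train.tsv \
#       data.validation_ds.data_path=/path/to/dev.tsv
#
# Evaluation / interactive inference with trained checkpoints:
#   python duplex_text_normalization_test.py \
#       lang=en \
#       mode=joint \
#       tagger_pretrained_model=/path/to/tagger_model.nemo \
#       decoder_pretrained_model=/path/to/decoder_model.nemo \
#       inference.interactive=true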