name: &name DuplexTextNormalization
mode: joint # Three possible choices ['tn', 'itn', 'joint']
lang: ??? # Supported languages are ['en', 'ru', 'de', 'multilingual']

# Pretrained Nemo Models
tagger_pretrained_model: null
decoder_pretrained_model: null

# Tagger
tagger_trainer:
  devices: 1 # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 5 # the number of training epochs (for ru, de, or multilingual, try 10)
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32 # Should be set to 16 for O1 and O2 to enable AMP.
  accelerator: gpu
  strategy: ddp

tagger_model:
  do_training: true
  transformer: albert-base-v2 # For ru, try cointegrated/rubert-tiny | For de, try bert-base-german-cased | For multilingual, try bert-base-multilingual-cased
  tokenizer: ${tagger_model.transformer}
  max_sequence_len: 128
  nemo_path: ${tagger_exp_manager.exp_dir}/tagger_model.nemo # exported .nemo path
  lang: ${lang}
  mode: ${mode}

  optim:
    name: adamw
    lr: 5e-5
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_token_precision
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.1
      last_epoch: -1

tagger_exp_manager:
  exp_dir: nemo_experiments # where to store logs and checkpoints
  name: tagger_training # name of experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_token_precision"
    mode: "max"
    save_best_model: true
    always_save_nemo: true

# Decoder
decoder_trainer:
  devices: 1 # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 3 # the number of training epochs
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32 # Should be set to 16 for O1 and O2 to enable AMP.
  accelerator: gpu
  strategy: ddp
  log_every_n_steps: 1 # Interval of logging.
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or to an int for a number of iterations
  resume_from_checkpoint: null # The path to a checkpoint file to continue training; restores the whole state including the epoch, step, LR schedulers, apex, etc.

decoder_model:
  do_training: true
  transformer: t5-small # For ru, try cointegrated/rut5-base | For de or multilingual, try google/mt5-base
  max_sequence_len: 80
  tokenizer: ${decoder_model.transformer}
  nemo_path: ${decoder_exp_manager.exp_dir}/decoder_model.nemo # exported .nemo path
  lang: ${lang}
  mode: ${mode}

  # Options related to covering grammars for TN
  use_cg: false # Use covering grammars to avoid catastrophic errors
  neural_confidence_threshold: 0.99 # If the neural model is not confident, then use the covering grammars
  n_tagged: 1 # number of tagged options to consider; -1 to get all possible tagged options

  optim:
    name: adamw
    lr: 2e-4
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.0
      last_epoch: -1

decoder_exp_manager:
  exp_dir: nemo_experiments # where to store logs and checkpoints
  name: decoder_training # name of experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    save_top_k: 3
    monitor: "val_loss"
    mode: "min"
    save_best_model: True

# Data
data:
  train_ds:
    data_path: train.tsv # provide the full path to the file. Ignored when using a tarred dataset; tar_metadata_file is used instead.
    batch_size: 64 # local training batch size for each worker. Ignored when using a tarred dataset; the batch size of the tarred dataset is used instead.
    shuffle: true
    max_insts: -1 # Maximum number of instances (-1 means no limit)
    # Refer to the text_normalization doc for more information about data augmentation
    tagger_data_augmentation: false
    decoder_data_augmentation: true
    use_cache: false # uses a cache to store the processed dataset; useful for large datasets to speed things up (especially when using multiple GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    use_tarred_dataset: False # if true, tar_metadata_file will be used
    tar_metadata_file: null # metadata for the tarred dataset: a JSON file containing the list of tar files in the "text_tar_filepaths" field
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled

  validation_ds:
    data_path: dev.tsv # provide the full path to the file. Provide multiple paths to run evaluation on multiple datasets
    batch_size: 64
    shuffle: false
    max_insts: -1 # Maximum number of instances (-1 means no limit)
    use_cache: false # uses a cache to store the processed dataset; useful for large datasets to speed things up (especially when using multiple GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false

  test_ds:
    data_path: test.tsv # provide the full path to the file
    batch_size: 64
    shuffle: false
    use_cache: false # uses a cache to store the processed dataset; useful for large datasets to speed things up (especially when using multiple GPUs)
    num_workers: 3
    pin_memory: false
    drop_last: false
    errors_log_fp: errors.txt # Path to the file for logging the errors

# Inference
inference:
  interactive: false # Set to true to enable interactive mode when running duplex_text_normalization_test.py
  from_file: null # Path to the raw text, no labels required. Each sentence on a separate line
  batch_size: 16 # batch size for inference with from_file
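
# Example usage (a minimal sketch: the training script name and all paths below
# are illustrative assumptions; duplex_text_normalization_test.py is the script
# referenced by inference.interactive above). Note that `lang` must be
# overridden on the command line, since it is a mandatory (???) field.
#
# Training (Hydra overrides on the command line):
#   python duplex_text_normalization_train.py \
#       lang=en \
#       mode=joint \
#       data.train_ds.data_path=/path/to/train.tsv \
#       data.validation_ds.data_path=/path/to/dev.tsv
#
# Evaluation / interactive inference with trained checkpoints:
#   python duplex_text_normalization_test.py \
#       lang=en \
#       mode=joint \
#       tagger_pretrained_model=/path/to/tagger_model.nemo \
#       decoder_pretrained_model=/path/to/decoder_model.nemo \
#       inference.interactive=true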