File size: 5,986 Bytes
7934b29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Name of this configuration. The `&name` anchor allows it to be reused
# elsewhere in the document through the `*name` alias.
name: &name DuplexTextNormalization

# Three possible choices: 'tn', 'itn', 'joint'.
mode: joint

# Supported languages: 'en', 'ru', 'de', 'multilingual'.
# `???` is a placeholder rather than a language code; a real value has to be
# supplied before this config is used.
lang: ???

# Pretrained NeMo models (null means no pretrained model is specified).
tagger_pretrained_model: null
decoder_pretrained_model: null

# Tagger
# Trainer settings for the tagger.
tagger_trainer:
  # The number of GPUs, 0 for CPU.
  devices: 1
  num_nodes: 1
  # The number of training epochs (for ru, de or multilingual, try 10).
  max_epochs: 5
  # Checkpointing and logging are provided by exp_manager, so both are
  # switched off here. Booleans are spelled in canonical lowercase.
  enable_checkpointing: false
  logger: false
  # Accumulates grads every k batches.
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  # Should be set to 16 for O1 and O2 to enable the AMP.
  precision: 32
  accelerator: gpu
  strategy: ddp

# Model, optimizer and scheduler settings for the tagger.
tagger_model:
  do_training: true
  # Suggested alternatives per language:
  #   ru           -> cointegrated/rubert-tiny
  #   de           -> bert-base-german-cased
  #   multilingual -> bert-base-multilingual-cased
  transformer: albert-base-v2
  # The tokenizer follows whatever transformer is selected above.
  tokenizer: ${tagger_model.transformer}
  max_sequence_len: 128
  # Exported .nemo path.
  nemo_path: ${tagger_exp_manager.exp_dir}/tagger_model.nemo
  # Both values are taken from the top-level `lang` and `mode` keys.
  lang: ${lang}
  mode: ${mode}

  optim:
    name: adamw
    # Written with an explicit decimal point: YAML 1.1 loaders only resolve
    # exponent notation as a float when the mantissa contains a '.', and
    # would otherwise load this value as a string.
    lr: 5.0e-5
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_token_precision
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.1
      last_epoch: -1

# Experiment manager settings for the tagger run.
tagger_exp_manager:
  # Where to store logs and checkpoints.
  exp_dir: nemo_experiments
  # Name of experiment.
  name: tagger_training
  # Booleans use the canonical lowercase spelling throughout this mapping.
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    save_top_k: 3
    # The monitored metric is ranked with mode "max".
    monitor: "val_token_precision"
    mode: "max"
    save_best_model: true
    always_save_nemo: true

# Decoder
# Trainer settings for the decoder.
decoder_trainer:
  # The number of GPUs, 0 for CPU.
  devices: 1
  num_nodes: 1
  # The number of training epochs.
  max_epochs: 3
  # Checkpointing and logging are provided by exp_manager, so both are
  # switched off here. Booleans are spelled in canonical lowercase.
  enable_checkpointing: false
  logger: false
  # Accumulates grads every k batches.
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  # Should be set to 16 for O1 and O2 to enable the AMP.
  precision: 32
  accelerator: gpu
  strategy: ddp
  # Interval of logging.
  log_every_n_steps: 1
  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations.
  val_check_interval: 1.0
  # The path to a checkpoint file to continue the training; restores the whole
  # state including the epoch, step, LR schedulers, apex, etc.
  resume_from_checkpoint: null

# Model, optimizer and scheduler settings for the decoder.
decoder_model:
  do_training: true
  # Suggested alternatives per language:
  #   ru                 -> cointegrated/rut5-base
  #   de or multilingual -> google/mt5-base
  transformer: t5-small
  max_sequence_len: 80
  # The tokenizer follows whatever transformer is selected above.
  tokenizer: ${decoder_model.transformer}
  # Exported .nemo path.
  nemo_path: ${decoder_exp_manager.exp_dir}/decoder_model.nemo
  # Both values are taken from the top-level `lang` and `mode` keys.
  lang: ${lang}
  mode: ${mode}

  # Options related to covering grammars for TN
  # Use covering grammars to avoid catastrophic errors.
  use_cg: false
  # If the neural model is not confident, then use the covering grammars.
  neural_confidence_threshold: 0.99
  # Number of tagged options to consider; -1 to get all possible tagged options.
  n_tagged: 1

  optim:
    name: adamw
    # Written with an explicit decimal point: YAML 1.1 loaders only resolve
    # exponent notation as a float when the mantissa contains a '.', and
    # would otherwise load this value as a string.
    lr: 2.0e-4
    weight_decay: 0.01

    sched:
      name: WarmupAnnealing

      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.0
      last_epoch: -1

# Experiment manager settings for the decoder run.
decoder_exp_manager:
  # Where to store logs and checkpoints.
  exp_dir: nemo_experiments
  # Name of experiment.
  name: decoder_training
  # Booleans use the canonical lowercase spelling throughout this mapping.
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    save_top_k: 3
    # The monitored metric is ranked with mode "min".
    monitor: "val_loss"
    mode: "min"
    save_best_model: true

# Data
# Dataset settings for the train, validation and test splits.
data:
  train_ds:
    # Provide the full path to the file. Ignored when using tarred dataset,
    # tar_metadata_file is used instead.
    data_path: train.tsv
    # Local training batch size for each worker. Ignored when using tarred
    # dataset, the batch size of the tarred dataset is used instead.
    batch_size: 64
    shuffle: true
    # Maximum number of instances (-1 means no limit).
    max_insts: -1
    # Refer to the text_normalization doc for more information about data augmentation
    tagger_data_augmentation: false
    decoder_data_augmentation: true
    # Uses a cache to store the processed dataset; you may use it for large
    # datasets for speed up (especially when using multi GPUs).
    use_cache: false
    num_workers: 3
    pin_memory: false
    drop_last: false
    # If true, tar_metadata_file will be used.
    use_tarred_dataset: false
    # Metadata for tarred dataset. A JSON file containing the list of
    # tar_files in "text_tar_filepaths" field.
    tar_metadata_file: null
    # How many samples to look ahead and load to be shuffled.
    tar_shuffle_n: 100

  validation_ds:
    # Provide the full path to the file. Provide multiple paths to run
    # evaluation on multiple datasets.
    data_path: dev.tsv
    batch_size: 64
    shuffle: false
    # Maximum number of instances (-1 means no limit).
    max_insts: -1
    # Uses a cache to store the processed dataset; you may use it for large
    # datasets for speed up (especially when using multi GPUs).
    use_cache: false
    num_workers: 3
    pin_memory: false
    drop_last: false

  test_ds:
    # Provide the full path to the file.
    data_path: test.tsv
    batch_size: 64
    shuffle: false
    # Uses a cache to store the processed dataset; you may use it for large
    # datasets for speed up (especially when using multi GPUs).
    use_cache: false
    num_workers: 3
    pin_memory: false
    drop_last: false
    # Path to the file for logging the errors.
    errors_log_fp: errors.txt

# Inference
# Settings used when running inference.
inference:
  # Set to true if you want to enable the interactive mode when running
  # duplex_text_normalization_test.py.
  interactive: false
  # Path to the raw text, no labels required. Each sentence on a separate line.
  from_file: null
  # Batch size for inference.from_file.
  batch_size: 16