File size: 2,587 Bytes
08276da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Acoustic model. `_target_` is a dotted import path; presumably the config
# loader instantiates it with the sibling keys as arguments — confirm.
model:
  _target_: src.model.ConformerModel
  # NOTE(review): presumably the per-frame feature size. 128 matches the
  # torchaudio MelSpectrogram default `n_mels`, which the transforms in this
  # file leave unset — confirm the two are meant to stay in sync.
  input_dim: 128
# Experiment-tracking writer settings (target class: CometMLWriter).
writer:
  _target_: src.logger.CometMLWriter
  project_name: pytorch_template_asr_example
  # No workspace is set explicitly.
  workspace: null
  run_name: conformer_30m
  mode: online
  # Presumably the loss keys the writer reports; only `loss` is listed.
  loss_names:
  - loss
  log_checkpoints: false
  # `run_id` below is exactly 32 characters, matching `id_length`.
  # NOTE(review): a fixed id is presumably pinned so the resumed run
  # (see trainer.resume_from) keeps logging to the same experiment — confirm.
  id_length: 32
  run_id: m2guzao93o9ytjxogwt78mftkyiqalsf
# Metrics per phase: none are configured for `train`, four for `inference`.
metrics:
  train: []
  inference:
  # Each entry pairs a metric class (`_target_`) with the `name` it is given.
  # `WER_(Argmax)` is also the name referenced by trainer.monitor.
  # NOTE(review): the Argmax* classes presumably score greedy (argmax)
  # decoding; which decoding plain WER/CER use is not visible here — confirm.
  - _target_: src.metrics.ArgmaxCERMetric
    name: CER_(Argmax)
  - _target_: src.metrics.ArgmaxWERMetric
    name: WER_(Argmax)
  - _target_: src.metrics.WER
    name: WER
  - _target_: src.metrics.CER
    name: CER
# Dataset splits. All three use LibrispeechDataset; they differ in `part` and
# in which instance-transform set they reference.
datasets:
  train:
    _target_: src.datasets.LibrispeechDataset
    part: train-other-500
    # `${...}` references transforms.instance_transforms.train in this file.
    instance_transforms: ${transforms.instance_transforms.train}
  val:
    _target_: src.datasets.LibrispeechDataset
    # NOTE(review): validation runs on test-clean, and trainer.monitor tracks
    # a val_* metric, so checkpoint selection appears to depend on a test
    # split — confirm this is intended.
    part: test-clean
    # Same (augmentation-free) transform set as `test`.
    instance_transforms: ${transforms.instance_transforms.inference}
  test:
    _target_: src.datasets.LibrispeechDataset
    part: test-other
    instance_transforms: ${transforms.instance_transforms.inference}
# Arguments for torch.utils.data.DataLoader. Only batch size, worker count
# and pin_memory are set here; anything else (dataset, shuffle, collate_fn,
# ...) is not configured in this block.
dataloader:
  _target_: torch.utils.data.DataLoader
  batch_size: 30
  num_workers: 2
  pin_memory: true
# Transform sets referenced by the `datasets` section via interpolation.
transforms:
  # NOTE(review): presumably applied per sample, with each key naming the
  # field the transform operates on — confirm against the dataset code.
  instance_transforms:
    # Training set: spectrogram extraction plus waveform augmentations.
    train:
      # Only sample_rate is set; every other MelSpectrogram argument is left
      # to the torchaudio default.
      get_spectrogram:
        _target_: torchaudio.transforms.MelSpectrogram
        sample_rate: 16000
      # Augmentations chained through torchvision's Compose.
      # NOTE(review): `p` is presumably the probability of applying each
      # augmentation independently — confirm in src.transforms.wav_augs.
      audio:
        _target_: torchvision.transforms.v2.Compose
        transforms:
        - _target_: src.transforms.wav_augs.Gain
          sample_rate: 16000
          min_gain_in_db: -6
          max_gain_in_db: 6
          p: 0.2
        - _target_: src.transforms.wav_augs.Shift
          p: 0.2
        - _target_: src.transforms.wav_augs.PitchShift
          min_semitones: -2
          max_semitones: 2
          p: 0.2
        - _target_: src.transforms.wav_augs.Noise
          p: 0.3
    # Inference set: same spectrogram settings as train, no `audio` entry.
    inference:
      get_spectrogram:
        _target_: torchaudio.transforms.MelSpectrogram
        sample_rate: 16000
  # No batch-level transforms are configured for either phase.
  batch_transforms:
    train: null
    inference: null
# Optimizer: AdamW; `lr` is the only argument set here.
optimizer:
  _target_: torch.optim.AdamW
  # NOTE(review): lr_scheduler in this file is OneCycleLR, which derives the
  # learning rate from its own max_lr; this value is presumably overridden
  # once the scheduler is attached — confirm before tuning it.
  lr: 5.0e-05
# One-cycle learning-rate schedule. Its length is tied to the trainer
# section through the two interpolations below.
lr_scheduler:
  _target_: torch.optim.lr_scheduler.OneCycleLR
  max_lr: 0.0001
  # Per the PyTorch docs: fraction of the cycle spent increasing the LR.
  pct_start: 0.1
  # With the trainer values in this file: 1300 steps per epoch, 150 epochs.
  steps_per_epoch: ${trainer.epoch_len}
  epochs: ${trainer.n_epochs}
  anneal_strategy: cos
# Training loss: CTCLossWrapper; no arguments are set here.
loss_function:
  _target_: src.loss.CTCLossWrapper
# Text encoder: CTCTextEncoder; no arguments (e.g. an alphabet) are set here.
text_encoder:
  _target_: src.text_encoder.CTCTextEncoder
# Training-loop settings. `n_epochs` and `epoch_len` are also consumed by
# lr_scheduler via interpolation.
trainer:
  # NOTE(review): presumably a logging interval in steps — confirm.
  log_step: 200
  n_epochs: 150
  epoch_len: 1300
  # NOTE(review): presumably the batch keys moved to the device — confirm.
  device_tensors:
  - spectrogram
  - text_encoded
  # Bare file name; the directory it is resolved against is not visible here.
  resume_from: checkpoint-epoch62.pth
  device: auto
  override: false
  # Looks like "<mode> <metric>": minimise `val_WER_(Argmax)`. `WER_(Argmax)`
  # is a metric name under metrics.inference; `val` is a key under datasets.
  monitor: min val_WER_(Argmax)
  save_period: 5
  # Interpolates to n_epochs (150).
  # NOTE(review): if this is a patience count, early stopping can never
  # trigger before training ends — confirm that is intended.
  early_stop: ${trainer.n_epochs}
  save_dir: saved
  seed: 1