{
  "name": "default_test_config",
  "n_gpu": 1,
  "text_encoder": {
    "type": "CTCCharTextEncoder",
    "args": {
      "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa",
      "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt"
    }
  },
  "preprocessing": {
    "sr": 16000,
    "spectrogram": {
      "type": "MelSpectrogram",
      "args": {
        "n_mels": 256
      }
    },
    "log_spec": true
  },
  "augmentations": {
    "random_apply_p": 0.6,
    "wave": [
      {
        "type": "AddColoredNoise",
        "args": {
          "p": 1,
          "sample_rate": 16000
        }
      },
      {
        "type": "Gain",
        "args": {
          "p": 0.8,
          "sample_rate": 16000
        }
      },
      {
        "type": "HighPassFilter",
        "args": {
          "p": 0,
          "sample_rate": 16000
        }
      },
      {
        "type": "LowPassFilter",
        "args": {
          "p": 0,
          "sample_rate": 16000
        }
      },
      {
        "type": "PitchShift",
        "args": {
          "p": 0.8,
          "min_transpose_semitones": -2,
          "max_transpose_semitones": 2,
          "sample_rate": 16000
        }
      },
      {
        "type": "PolarityInversion",
        "args": {
          "p": 0.8,
          "sample_rate": 16000
        }
      },
      {
        "type": "Shift",
        "args": {
          "p": 0.8,
          "sample_rate": 16000
        }
      }
    ],
    "spectrogram": [
      {
        "type": "TimeMasking",
        "args": {
          "time_mask_param": 80,
          "p": 0.05
        }
      },
      {
        "type": "FrequencyMasking",
        "args": {
          "freq_mask_param": 80
        }
      }
    ]
  },
  "arch": {
    "type": "DeepSpeech2Model",
    "args": {
      "n_feats": 256,
      "n_rnn_layers": 6,
      "rnn_hidden_size": 512,
      "rnn_dropout": 0.2
    }
  },
  "data": {
    "test": {
      "batch_size": 64,
      "num_workers": 4,
      "datasets": [
        {
          "type": "LibrispeechDataset",
          "args": {
            "part": "test-other"
          }
        }
      ]
    }
  },
  "optimizer": {
    "type": "AdamW",
    "args": {
      "lr": 0.0003,
      "weight_decay": 1e-05
    }
  },
  "loss": {
    "type": "CTCLoss",
    "args": {}
  },
  "metrics": [
    {
      "type": "ArgmaxWERMetric",
      "args": {
        "name": "WER (argmax)"
      }
    },
    {
      "type": "ArgmaxCERMetric",
      "args": {
        "name": "CER (argmax)"
      }
    },
    {
      "type": "BeamSearchWERMetric",
      "args": {
        "beam_size": 4,
        "name": "WER (beam search)"
      }
    },
    {
      "type": "BeamSearchCERMetric",
      "args": {
        "beam_size": 4,
        "name": "CER (beam search)"
      }
    },
    {
      "type": "LanguageModelWERMetric",
      "args": {
        "name": "WER (LM)"
      }
    },
    {
      "type": "LanguageModelCERMetric",
      "args": {
        "name": "CER (LM)"
      }
    }
  ],
  "lr_scheduler": {
    "type": "OneCycleLR",
    "args": {
      "steps_per_epoch": 1000,
      "epochs": 50,
      "anneal_strategy": "cos",
      "max_lr": 0.0003,
      "pct_start": 0.1
    }
  },
  "trainer": {
    "epochs": 50,
    "save_dir": "saved/",
    "save_period": 5,
    "verbosity": 2,
    "monitor": "min val_loss",
    "early_stop": 100,
    "visualize": "wandb",
    "wandb_project": "asr_project",
    "len_epoch": 1000,
    "grad_norm_clip": 10
  }
}