| feature_extractor: | |
| class_path: vocos.feature_extractors.MelSpectrogramFeatures | |
| init_args: | |
| sample_rate: 24000 | |
| n_fft: 1024 | |
| hop_length: 256 | |
| n_mels: 100 | |
| padding: center | |
| backbone: | |
| class_path: vocos.models.VocosBackbone | |
| init_args: | |
| input_channels: 100 | |
| dim: 512 | |
| intermediate_dim: 1536 | |
| num_layers: 8 | |
| head: | |
| class_path: vocos.heads.ISTFTHead | |
| init_args: | |
| dim: 512 | |
| n_fft: 1024 | |
| hop_length: 256 | |
| padding: center | |
| head_48k: | |
| class_path: vocos.heads.ISTFTHead | |
| init_args: | |
| dim: 512 | |
| n_fft: 1024 | |
| hop_length: 256 | |
| padding: center | |
| upsampler: | |
| class_path: linacodec.vocoder.upsampler_block.UpSamplerBlock | |
| init_args: | |
| in_channels: 512 | |
| upsample_factors: [2, 1] | |
| kernel_sizes: [8, 8] | |