{
  "feature_extractor": {
    "class_path": "vocos.feature_extractors.EncodecFeatures",
    "init_args": {
      "encodec_model": "encodec_24khz",
      "bandwidths": [
        1.5,
        3.0,
        6.0,
        12.0
      ],
      "train_codebooks": false
    }
  },
  "backbone": {
    "class_path": "vocos.models.VocosBackbone",
    "init_args": {
      "input_channels": 128,
      "dim": 384,
      "intermediate_dim": 1152,
      "num_layers": 8,
      "adanorm_num_embeddings": 4
    }
  },
  "head": {
    "class_path": "vocos.heads.ISTFTHead",
    "init_args": {
      "dim": 384,
      "n_fft": 1280,
      "hop_length": 320,
      "padding": "same"
    }
  }
}