| { | |
| "adanorm_num_embeddings": 4, | |
| "architectures": [ | |
| "VocosWithEncodecModel" | |
| ], | |
| "bandwidths": [ | |
| 1.5, | |
| 3.0, | |
| 6.0, | |
| 12.0 | |
| ], | |
| "encodec_config": { | |
| "audio_channels": 1, | |
| "chunk_length_s": null, | |
| "codebook_dim": 128, | |
| "codebook_size": 1024, | |
| "compress": 2, | |
| "dilation_growth_rate": 2, | |
| "hidden_size": 128, | |
| "kernel_size": 7, | |
| "last_kernel_size": 7, | |
| "model_type": "encodec", | |
| "norm_type": "weight_norm", | |
| "normalize": false, | |
| "num_filters": 32, | |
| "num_lstm_layers": 2, | |
| "num_residual_layers": 1, | |
| "overlap": null, | |
| "pad_mode": "reflect", | |
| "residual_kernel_size": 3, | |
| "sampling_rate": 24000, | |
| "target_bandwidths": [ | |
| 1.5, | |
| 3.0, | |
| 6.0, | |
| 12.0, | |
| 24.0 | |
| ], | |
| "trim_right_ratio": 1.0, | |
| "upsampling_ratios": [ | |
| 8, | |
| 5, | |
| 4, | |
| 2 | |
| ], | |
| "use_causal_conv": true, | |
| "use_conv_shortcut": true | |
| }, | |
| "hidden_dim": 384, | |
| "hop_length": 320, | |
| "input_channels": 128, | |
| "intermediate_dim": 1152, | |
| "kernel_size": 7, | |
| "layer_norm_eps": 1e-06, | |
| "layer_scale_init_value": 0.125, | |
| "model_type": "vocos_with_encodec", | |
| "n_fft": 1280, | |
| "num_layers": 8, | |
| "padding": 3, | |
| "spec_padding": "same", | |
| "torch_dtype": "float32", | |
| "train_codebooks": false, | |
| "transformers_version": "4.55.2", | |
| "use_adaptive_norm": true | |
| } | |