{
  "model_type": "wavcoch",
  "architectures": [
    "WavCoch"
  ],
  "auto_map": {
    "AutoConfig": "configuration_wavcoch.WavCochConfig",
    "AutoModel": "modeling_wavcoch.WavCoch"
  },
  "torch_dtype": "float32",
  "transformers_version": "4.40.0",
  "sample_rate": 16000,
  "causal_pad_mode": "repeat",
  "out_channels": 211,
  "has_vocoder": true,
  "vocoder_upsample_rates": [
    5,
    4,
    2,
    2
  ],
  "vocoder_upsample_kernel_sizes": [
    10,
    8,
    4,
    4
  ],
  "vocoder_upsample_initial_channel": 512,
  "vocoder_resblock": "1",
  "vocoder_resblock_kernel_sizes": [
    11,
    7,
    3
  ],
  "vocoder_resblock_dilation_sizes": [
    [
      1,
      3,
      5
    ],
    [
      1,
      3,
      5
    ],
    [
      1,
      3,
      5
    ]
  ],
  "window_size": 1001,
  "window_padding": 1000,
  "hop_length": 80,
  "causal_convs": true,
  "encoder_layers": 8,
  "encoder_dim": 512,
  "encoder_kernel_size": 3,
  "decoder_layers": 8,
  "decoder_dim": 512,
  "decoder_kernel_size": 9,
  "quantizer": "FSQ",
  "channels": [
    8,
    8,
    8,
    4,
    4
  ],
  "vocab_size": 8192
}