{
  "_name_or_path": "TuKoResearch/WavTokenizer",
  "architectures": ["WavTokenizer"],
  "auto_map": {
    "AutoConfig": "configuration_wavtokenizer.WavTokenizerConfig",
    "AutoModel": "modeling_wavtokenizer.WavTokenizer"
  },
  "model_type": "wavtokenizer",
  "sample_rate": 24000,
  "n_fft": 1280,
  "hop_length": 320,
  "n_mels": 128,
  "padding": "center",
  "feature_dim": 512,
  "encoder_dim": 32,
  "encoder_rates": [2, 4, 5, 8],
  "latent_dim": 512,
  "codebook_size": 4096,
  "codebook_dim": 512,
  "num_quantizers": 1,
  "backbone_type": "vocos",
  "backbone_dim": 768,
  "backbone_num_blocks": 12,
  "backbone_intermediate_dim": 2304,
  "backbone_kernel_size": 7,
  "backbone_layer_scale_init_value": 1e-6,
  "head_type": "istft",
  "head_dim": 641,
  "use_attention": false,
  "attention_dim": 768,
  "attention_heads": 8,
  "attention_layers": 0,
  "torch_dtype": "float32",
  "transformers_version": "4.40.0"
}