{
  "_name_or_path": "TuKoResearch/WavTokenizer",
  "architectures": [
    "WavTokenizer"
  ],
  "auto_map": {
    "AutoConfig": "configuration_wavtokenizer.WavTokenizerConfig",
    "AutoModel": "modeling_wavtokenizer.WavTokenizer"
  },
  "model_type": "wavtokenizer",
  "sample_rate": 24000,
  "n_fft": 1280,
  "hop_length": 320,
  "n_mels": 128,
  "padding": "center",
  "feature_dim": 512,
  "encoder_dim": 32,
  "encoder_rates": [
    2,
    4,
    5,
    8
  ],
  "latent_dim": 512,
  "codebook_size": 4096,
  "codebook_dim": 512,
  "num_quantizers": 1,
  "backbone_type": "vocos",
  "backbone_dim": 768,
  "backbone_num_blocks": 12,
  "backbone_intermediate_dim": 2304,
  "backbone_kernel_size": 7,
  "backbone_layer_scale_init_value": 1e-6,
  "head_type": "istft",
  "head_dim": 641,
  "use_attention": false,
  "attention_dim": 768,
  "attention_heads": 8,
  "attention_layers": 0,
  "torch_dtype": "float32",
  "transformers_version": "4.40.0"
}