{ "code_dim": 3072, "decoder_upsample_rate": 2560, "dtype": "float32", "encoder_downsample_rate": 1280, "initializer_range": 0.02, "input_sample_rate": 16000, "input_sampling_rate": 16000, "model_type": "xy_tokenizer", "output_sample_rate": 32000, "params": { "acoustic_decoder_kwargs": { "activation_function": "gelu", "d_model": 768, "decoder_attention_heads": 12, "decoder_ffn_dim": 3072, "decoder_layers": 12, "hop_length": 160, "kernel_size": 3, "max_audio_seconds": 30, "num_mel_bins": 80, "sampling_rate": 16000, "scale_embedding": false, "stride_size": 2 }, "acoustic_encoder_kwargs": { "activation_function": "gelu", "d_model": 768, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072, "encoder_layers": 12, "hop_length": 160, "kernel_size": 3, "max_audio_seconds": 30, "num_mel_bins": 80, "sampling_rate": 16000, "scale_embedding": false, "stride_size": 2 }, "downsample_kwargs": { "avg_pooler": 4, "d_model": 768 }, "feature_extractor_kwargs": { "chunk_length": 30, "feature_size": 80, "hop_length": 160, "n_fft": 400, "n_samples": 480000, "nb_max_frames": 3000, "padding_side": "right", "padding_value": 0.0, "return_attention_mask": true, "return_tensors": "pt", "sampling_rate": 16000 }, "post_rvq_adapter_kwargs": { "d_model": 768, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072, "encoder_layers": 4, "input_dim": 3072, "max_source_positions": 375, "output_dim": 3072 }, "pre_rvq_adapter_kwargs": { "d_model": 768, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072, "encoder_layers": 4, "input_dim": 1536, "max_source_positions": 1500, "output_dim": 768 }, "quantizer_kwargs": { "codebook_dim": 512, "codebook_size": 1024, "input_dim": 3072, "num_quantizers": 8, "output_dim": 3072, "quantizer_dropout": 0.0, "rvq_dim": 512 }, "semantic_encoder_adapter_kwargs": { "d_model": 768, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072, "encoder_layers": 4, "input_dim": 768, "max_source_positions": 1500, "output_dim": 768 }, "semantic_encoder_kwargs": { "activation_function": "gelu", "d_model": 768, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072, "encoder_layers": 12, "hop_length": 160, "kernel_size": 3, "max_audio_seconds": 30, "num_mel_bins": 80, "sampling_rate": 16000, "scale_embedding": false, "stride_size": 2 }, "upsample_kwargs": { "d_model": 768, "stride": 4 }, "vocos_kwargs": { "dim": 512, "hop_size": 320, "input_channels": 80, "intermediate_dim": 4096, "n_fft": 1280, "num_layers": 30, "padding": "same" } }, "sampling_rate": 32000, "transformers_version": "4.56.1", "use_cache": true }