MOSS_TTSD_tokenizer_hf / config.json
CloudRipple's picture
Init repository with huggingface version of MOSS_TTSD_tokenizer
9f16a12 verified
{
"code_dim": 3072,
"decoder_upsample_rate": 2560,
"dtype": "float32",
"encoder_downsample_rate": 1280,
"initializer_range": 0.02,
"input_sample_rate": 16000,
"input_sampling_rate": 16000,
"model_type": "xy_tokenizer",
"output_sample_rate": 32000,
"params": {
"acoustic_decoder_kwargs": {
"activation_function": "gelu",
"d_model": 768,
"decoder_attention_heads": 12,
"decoder_ffn_dim": 3072,
"decoder_layers": 12,
"hop_length": 160,
"kernel_size": 3,
"max_audio_seconds": 30,
"num_mel_bins": 80,
"sampling_rate": 16000,
"scale_embedding": false,
"stride_size": 2
},
"acoustic_encoder_kwargs": {
"activation_function": "gelu",
"d_model": 768,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072,
"encoder_layers": 12,
"hop_length": 160,
"kernel_size": 3,
"max_audio_seconds": 30,
"num_mel_bins": 80,
"sampling_rate": 16000,
"scale_embedding": false,
"stride_size": 2
},
"downsample_kwargs": {
"avg_pooler": 4,
"d_model": 768
},
"feature_extractor_kwargs": {
"chunk_length": 30,
"feature_size": 80,
"hop_length": 160,
"n_fft": 400,
"n_samples": 480000,
"nb_max_frames": 3000,
"padding_side": "right",
"padding_value": 0.0,
"return_attention_mask": true,
"return_tensors": "pt",
"sampling_rate": 16000
},
"post_rvq_adapter_kwargs": {
"d_model": 768,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072,
"encoder_layers": 4,
"input_dim": 3072,
"max_source_positions": 375,
"output_dim": 3072
},
"pre_rvq_adapter_kwargs": {
"d_model": 768,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072,
"encoder_layers": 4,
"input_dim": 1536,
"max_source_positions": 1500,
"output_dim": 768
},
"quantizer_kwargs": {
"codebook_dim": 512,
"codebook_size": 1024,
"input_dim": 3072,
"num_quantizers": 8,
"output_dim": 3072,
"quantizer_dropout": 0.0,
"rvq_dim": 512
},
"semantic_encoder_adapter_kwargs": {
"d_model": 768,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072,
"encoder_layers": 4,
"input_dim": 768,
"max_source_positions": 1500,
"output_dim": 768
},
"semantic_encoder_kwargs": {
"activation_function": "gelu",
"d_model": 768,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072,
"encoder_layers": 12,
"hop_length": 160,
"kernel_size": 3,
"max_audio_seconds": 30,
"num_mel_bins": 80,
"sampling_rate": 16000,
"scale_embedding": false,
"stride_size": 2
},
"upsample_kwargs": {
"d_model": 768,
"stride": 4
},
"vocos_kwargs": {
"dim": 512,
"hop_size": 320,
"input_channels": 80,
"intermediate_dim": 4096,
"n_fft": 1280,
"num_layers": 30,
"padding": "same"
}
},
"sampling_rate": 32000,
"transformers_version": "4.56.1",
"use_cache": true
}