{
"model_type": "xy_tokenizer",
"auto_map": {
"AutoFeatureExtractor": "feature_extraction_xy_tokenizer.XYTokenizerFeatureExtractor",
"AutoConfig": "configuration_xy_tokenizer.XYTokenizerConfig",
"AutoModel": "modeling_xy_tokenizer.XYTokenizerModel"
},
"input_sample_rate": 16000,
"output_sample_rate": 24000,
"encoder_downsample_rate": 1280,
"decoder_upsample_rate": 1920,
"code_dim": 3072,
"params": {
"feature_extractor_kwargs": {
"chunk_length": 30,
"feature_size": 80,
"hop_length": 160,
"n_fft": 400,
"n_samples": 480000,
"nb_max_frames": 3000,
"padding_side": "right",
"padding_value": 0.0,
"sampling_rate": 16000,
"return_attention_mask": true,
"return_tensors": "pt"
},
"semantic_encoder_kwargs": {
"num_mel_bins": 80,
"sampling_rate": 16000,
"hop_length": 160,
"stride_size": 2,
"kernel_size": 3,
"d_model": 768,
"scale_embedding": false,
"max_audio_seconds": 30,
"encoder_layers": 12,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072,
"activation_function": "gelu"
},
"semantic_encoder_adapter_kwargs": {
"input_dim": 768,
"output_dim": 768,
"d_model": 768,
"max_source_positions": 1500,
"encoder_layers": 4,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072
},
"acoustic_encoder_kwargs": {
"num_mel_bins": 80,
"sampling_rate": 16000,
"hop_length": 160,
"stride_size": 2,
"kernel_size": 3,
"d_model": 768,
"scale_embedding": false,
"max_audio_seconds": 30,
"encoder_layers": 12,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072,
"activation_function": "gelu"
},
"pre_rvq_adapter_kwargs": {
"input_dim": 1536,
"output_dim": 768,
"d_model": 768,
"max_source_positions": 1500,
"encoder_layers": 4,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072
},
"downsample_kwargs": {
"d_model": 768,
"avg_pooler": 4
},
"quantizer_kwargs": {
"input_dim": 3072,
"rvq_dim": 512,
"output_dim": 3072,
"num_quantizers": 8,
"codebook_size": 1024,
"codebook_dim": 512,
"quantizer_dropout": 0.0
},
"post_rvq_adapter_kwargs": {
"input_dim": 3072,
"output_dim": 3072,
"d_model": 768,
"max_source_positions": 375,
"encoder_layers": 4,
"encoder_attention_heads": 12,
"encoder_ffn_dim": 3072
},
"upsample_kwargs": {
"d_model": 768,
"stride": 4
},
"acoustic_decoder_kwargs": {
"num_mel_bins": 80,
"sampling_rate": 16000,
"hop_length": 160,
"stride_size": 2,
"kernel_size": 3,
"d_model": 768,
"scale_embedding": false,
"max_audio_seconds": 30,
"decoder_layers": 12,
"decoder_attention_heads": 12,
"decoder_ffn_dim": 3072,
"activation_function": "gelu"
},
"vocos_kwargs": {
"input_channels": 80,
"dim": 512,
"intermediate_dim": 4096,
"num_layers": 30,
"n_fft": 960,
"hop_size": 240,
"padding": "same"
}
},
"torch_dtype": "float32",
"transformers_version": "4.51.0"
}