{
  "model_type": "xy_tokenizer",
  "auto_map": {
    "AutoFeatureExtractor": "feature_extraction_xy_tokenizer.XYTokenizerFeatureExtractor",
    "AutoConfig": "configuration_xy_tokenizer.XYTokenizerConfig",
    "AutoModel": "modeling_xy_tokenizer.XYTokenizerModel"
  },
  "input_sample_rate": 16000,
  "output_sample_rate": 24000,
  "encoder_downsample_rate": 1280,
  "decoder_upsample_rate": 1920,
  "code_dim": 3072,
  "params": {
    "feature_extractor_kwargs": {
      "chunk_length": 30,
      "feature_size": 80,
      "hop_length": 160,
      "n_fft": 400,
      "n_samples": 480000,
      "nb_max_frames": 3000,
      "padding_side": "right",
      "padding_value": 0.0,
      "sampling_rate": 16000,
      "return_attention_mask": true,
      "return_tensors": "pt"
    },
    "semantic_encoder_kwargs": {
      "num_mel_bins": 80,
      "sampling_rate": 16000,
      "hop_length": 160,
      "stride_size": 2,
      "kernel_size": 3,
      "d_model": 768,
      "scale_embedding": false,
      "max_audio_seconds": 30,
      "encoder_layers": 12,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072,
      "activation_function": "gelu"
    },
    "semantic_encoder_adapter_kwargs": {
      "input_dim": 768,
      "output_dim": 768,
      "d_model": 768,
      "max_source_positions": 1500,
      "encoder_layers": 4,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072
    },
    "acoustic_encoder_kwargs": {
      "num_mel_bins": 80,
      "sampling_rate": 16000,
      "hop_length": 160,
      "stride_size": 2,
      "kernel_size": 3,
      "d_model": 768,
      "scale_embedding": false,
      "max_audio_seconds": 30,
      "encoder_layers": 12,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072,
      "activation_function": "gelu"
    },
    "pre_rvq_adapter_kwargs": {
      "input_dim": 1536,
      "output_dim": 768,
      "d_model": 768,
      "max_source_positions": 1500,
      "encoder_layers": 4,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072
    },
    "downsample_kwargs": {
      "d_model": 768,
      "avg_pooler": 4
    },
    "quantizer_kwargs": {
      "input_dim": 3072,
      "rvq_dim": 512,
      "output_dim": 3072,
      "num_quantizers": 8,
      "codebook_size": 1024,
      "codebook_dim": 512,
      "quantizer_dropout": 0.0
    },
    "post_rvq_adapter_kwargs": {
      "input_dim": 3072,
      "output_dim": 3072,
      "d_model": 768,
      "max_source_positions": 375,
      "encoder_layers": 4,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072
    },
    "upsample_kwargs": {
      "d_model": 768,
      "stride": 4
    },
    "acoustic_decoder_kwargs": {
      "num_mel_bins": 80,
      "sampling_rate": 16000,
      "hop_length": 160,
      "stride_size": 2,
      "kernel_size": 3,
      "d_model": 768,
      "scale_embedding": false,
      "max_audio_seconds": 30,
      "decoder_layers": 12,
      "decoder_attention_heads": 12,
      "decoder_ffn_dim": 3072,
      "activation_function": "gelu"
    },
    "vocos_kwargs": {
      "input_channels": 80,
      "dim": 512,
      "intermediate_dim": 4096,
      "num_layers": 30,
      "n_fft": 960,
      "hop_size": 240,
      "padding": "same"
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.51.0"
}