File size: 5,332 Bytes
2784721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
{
  "architecture": "voxcpm2",
  "lm_config": {
    "bos_token_id": 1,
    "eos_token_id": 2,
    "hidden_size": 2048,
    "intermediate_size": 6144,
    "max_position_embeddings": 32768,
    "num_attention_heads": 16,
    "num_hidden_layers": 28,
    "num_key_value_heads": 2,
    "rms_norm_eps": 1e-05,
    "rope_theta": 10000,
    "kv_channels": 128,
    "rope_scaling": {
      "type": "longrope",
      "long_factor": [
        0.9977997200264581,
        1.014658295992452,
        1.0349680404997148,
        1.059429246056193,
        1.0888815016813513,
        1.1243301355211495,
        1.166977103606075,
        1.2182568066927284,
        1.2798772354275727,
        1.3538666751582975,
        1.4426259039919596,
        1.5489853358570191,
        1.6762658237220625,
        1.8283407612492941,
        2.0096956085876183,
        2.225478927469756,
        2.481536379650452,
        2.784415934557119,
        3.1413289096347365,
        3.560047844772632,
        4.048719380066383,
        4.615569542115128,
        5.2684819496549835,
        6.014438591970396,
        6.858830049237097,
        7.804668263503327,
        8.851768731513417,
        9.99600492938444,
        11.228766118181639,
        12.536757560834843,
        13.902257701387796,
        15.303885189125953,
        16.717837610115794,
        18.119465097853947,
        19.484965238406907,
        20.792956681060105,
        22.02571786985731,
        23.16995406772833,
        24.217054535738416,
        25.16289275000465,
        26.007284207271347,
        26.753240849586767,
        27.40615325712662,
        27.973003419175363,
        28.461674954469114,
        28.880393889607006,
        29.237306864684626,
        29.540186419591297,
        29.79624387177199,
        30.01202719065413,
        30.193382037992453,
        30.34545697551969,
        30.47273746338473,
        30.579096895249787,
        30.66785612408345,
        30.741845563814174,
        30.80346599254902,
        30.85474569563567,
        30.897392663720595,
        30.932841297560394,
        30.962293553185553,
        30.986754758742034,
        31.007064503249293,
        31.02392307921529
      ],
      "short_factor": [
        0.9977997200264581,
        1.014658295992452,
        1.0349680404997148,
        1.059429246056193,
        1.0888815016813513,
        1.1243301355211495,
        1.166977103606075,
        1.2182568066927284,
        1.2798772354275727,
        1.3538666751582975,
        1.4426259039919596,
        1.5489853358570191,
        1.6762658237220625,
        1.8283407612492941,
        2.0096956085876183,
        2.225478927469756,
        2.481536379650452,
        2.784415934557119,
        3.1413289096347365,
        3.560047844772632,
        4.048719380066383,
        4.615569542115128,
        5.2684819496549835,
        6.014438591970396,
        6.858830049237097,
        7.804668263503327,
        8.851768731513417,
        9.99600492938444,
        11.228766118181639,
        12.536757560834843,
        13.902257701387796,
        15.303885189125953,
        16.717837610115794,
        18.119465097853947,
        19.484965238406907,
        20.792956681060105,
        22.02571786985731,
        23.16995406772833,
        24.217054535738416,
        25.16289275000465,
        26.007284207271347,
        26.753240849586767,
        27.40615325712662,
        27.973003419175363,
        28.461674954469114,
        28.880393889607006,
        29.237306864684626,
        29.540186419591297,
        29.79624387177199,
        30.01202719065413,
        30.193382037992453,
        30.34545697551969,
        30.47273746338473,
        30.579096895249787,
        30.66785612408345,
        30.741845563814174,
        30.80346599254902,
        30.85474569563567,
        30.897392663720595,
        30.932841297560394,
        30.962293553185553,
        30.986754758742034,
        31.007064503249293,
        31.02392307921529
      ],
      "original_max_position_embeddings": 32768
    },
    "vocab_size": 73448,
    "use_mup": false,
    "scale_emb": 12,
    "dim_model_base": 256,
    "scale_depth": 1.4
  },
  "patch_size": 4,
  "feat_dim": 64,
  "scalar_quantization_latent_dim": 512,
  "scalar_quantization_scale": 9,
  "residual_lm_num_layers": 8,
  "residual_lm_no_rope": true,
  "encoder_config": {
    "hidden_dim": 1024,
    "ffn_dim": 4096,
    "num_heads": 16,
    "num_layers": 12,
    "kv_channels": 128
  },
  "dit_config": {
    "hidden_dim": 1024,
    "ffn_dim": 4096,
    "num_heads": 16,
    "num_layers": 12,
    "kv_channels": 128,
    "mean_mode": false,
    "cfm_config": {
      "sigma_min": 1e-06,
      "solver": "euler",
      "t_scheduler": "log-norm",
      "inference_cfg_rate": 2.0
    }
  },
  "audio_vae_config": {
    "encoder_dim": 128,
    "encoder_rates": [
      2,
      5,
      8,
      8
    ],
    "latent_dim": 64,
    "decoder_dim": 2048,
    "decoder_rates": [
      8,
      6,
      5,
      2,
      2,
      2
    ],
    "sr_bin_boundaries": [
      20000,
      30000,
      40000
    ],
    "sample_rate": 16000,
    "out_sample_rate": 48000
  },
  "max_length": 8192,
  "model_type": "voxcpm2",
  "quantization": {
    "bits": 4,
    "group_size": 64,
    "targets": [
      "base_lm",
      "residual_lm"
    ]
  }
}