File size: 8,074 Bytes
4e9e0e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
{
  "barbet_config": {
    "attention_dropout": 0.0,
    "attention_sink": false,
    "bos_token_id": 114689,
    "eos_token_id": 114690,
    "global_attention_layers": [
      0,
      4,
      8,
      12,
      16,
      20,
      24
    ],
    "head_dim": 128,
    "hidden_dropout": 0.0,
    "hidden_size": 1536,
    "initializer_range": 0.02,
    "intermediate_size": 5120,
    "mamba_d_conv": 4,
    "mamba_d_state": 64,
    "mamba_expand": 2,
    "mamba_layers": [
      3,
      7,
      11,
      15,
      19,
      23,
      27
    ],
    "max_position_embeddings": 262144,
    "mtp_enabled": false,
    "mtp_loss_weights": {
      "2": 0.2,
      "3": 0.1
    },
    "mtp_offsets": [
      2,
      3
    ],
    "num_attention_heads": 16,
    "num_hidden_layers": 28,
    "num_key_value_heads": 2,
    "pad_token_id": 114691,
    "qk_clip_alpha": 0.5,
    "qk_clip_threshold": 100.0,
    "qk_logit_clip": false,
    "qk_norm": true,
    "rms_norm_eps": 1e-06,
    "rope_theta": 10000000.0,
    "sliding_window_size": 8192,
    "tie_word_embeddings": true,
    "unk_token_id": 114688,
    "use_cache": true,
    "vocab_size": 114944
  },
  "vox_lm_config": {
    "bos_token_id": 1,
    "eos_token_id": 2,
    "hidden_size": 2048,
    "intermediate_size": 6144,
    "max_position_embeddings": 32768,
    "num_attention_heads": 16,
    "num_hidden_layers": 28,
    "num_key_value_heads": 2,
    "rms_norm_eps": 1e-05,
    "rope_scaling": {
      "type": "longrope",
      "long_factor": [
        0.9977997200264581,
        1.014658295992452,
        1.0349680404997148,
        1.059429246056193,
        1.0888815016813513,
        1.1243301355211495,
        1.166977103606075,
        1.2182568066927284,
        1.2798772354275727,
        1.3538666751582975,
        1.4426259039919596,
        1.5489853358570191,
        1.6762658237220625,
        1.8283407612492941,
        2.0096956085876183,
        2.225478927469756,
        2.481536379650452,
        2.784415934557119,
        3.1413289096347365,
        3.560047844772632,
        4.048719380066383,
        4.615569542115128,
        5.2684819496549835,
        6.014438591970396,
        6.858830049237097,
        7.804668263503327,
        8.851768731513417,
        9.99600492938444,
        11.228766118181639,
        12.536757560834843,
        13.902257701387796,
        15.303885189125953,
        16.717837610115794,
        18.119465097853947,
        19.484965238406907,
        20.792956681060105,
        22.02571786985731,
        23.16995406772833,
        24.217054535738416,
        25.16289275000465,
        26.007284207271347,
        26.753240849586767,
        27.40615325712662,
        27.973003419175363,
        28.461674954469114,
        28.880393889607006,
        29.237306864684626,
        29.540186419591297,
        29.79624387177199,
        30.01202719065413,
        30.193382037992453,
        30.34545697551969,
        30.47273746338473,
        30.579096895249787,
        30.66785612408345,
        30.741845563814174,
        30.80346599254902,
        30.85474569563567,
        30.897392663720595,
        30.932841297560394,
        30.962293553185553,
        30.986754758742034,
        31.007064503249293,
        31.02392307921529
      ],
      "short_factor": [
        0.9977997200264581,
        1.014658295992452,
        1.0349680404997148,
        1.059429246056193,
        1.0888815016813513,
        1.1243301355211495,
        1.166977103606075,
        1.2182568066927284,
        1.2798772354275727,
        1.3538666751582975,
        1.4426259039919596,
        1.5489853358570191,
        1.6762658237220625,
        1.8283407612492941,
        2.0096956085876183,
        2.225478927469756,
        2.481536379650452,
        2.784415934557119,
        3.1413289096347365,
        3.560047844772632,
        4.048719380066383,
        4.615569542115128,
        5.2684819496549835,
        6.014438591970396,
        6.858830049237097,
        7.804668263503327,
        8.851768731513417,
        9.99600492938444,
        11.228766118181639,
        12.536757560834843,
        13.902257701387796,
        15.303885189125953,
        16.717837610115794,
        18.119465097853947,
        19.484965238406907,
        20.792956681060105,
        22.02571786985731,
        23.16995406772833,
        24.217054535738416,
        25.16289275000465,
        26.007284207271347,
        26.753240849586767,
        27.40615325712662,
        27.973003419175363,
        28.461674954469114,
        28.880393889607006,
        29.237306864684626,
        29.540186419591297,
        29.79624387177199,
        30.01202719065413,
        30.193382037992453,
        30.34545697551969,
        30.47273746338473,
        30.579096895249787,
        30.66785612408345,
        30.741845563814174,
        30.80346599254902,
        30.85474569563567,
        30.897392663720595,
        30.932841297560394,
        30.962293553185553,
        30.986754758742034,
        31.007064503249293,
        31.02392307921529
      ],
      "original_max_position_embeddings": 32768
    },
    "vocab_size": 73448,
    "use_mup": false,
    "scale_emb": 12.0,
    "dim_model_base": 256,
    "scale_depth": 1.4,
    "rope_theta": 10000.0,
    "kv_channels": 128,
    "no_rope": false
  },
  "patch_size": 4,
  "feat_dim": 64,
  "residual_lm_num_layers": 8,
  "residual_lm_no_rope": true,
  "scalar_quantization_latent_dim": 512,
  "scalar_quantization_scale": 9,
  "encoder_config": {
    "hidden_dim": 1024,
    "ffn_dim": 4096,
    "num_heads": 16,
    "num_layers": 12,
    "kv_channels": 128
  },
  "dit_config": {
    "hidden_dim": 1024,
    "ffn_dim": 4096,
    "num_heads": 16,
    "num_layers": 12,
    "kv_channels": 128,
    "dit_mean_mode": false,
    "cfm_config": {
      "sigma_min": 1e-06,
      "solver": "euler",
      "t_scheduler": "log-norm",
      "training_cfg_rate": 0.1,
      "inference_cfg_rate": 2.0,
      "reg_loss_type": "l1",
      "ratio_r_neq_t_range": [
        0.25,
        0.75
      ],
      "noise_cond_prob_range": [
        0.0,
        0.0
      ],
      "noise_cond_scale": 0.0
    }
  },
  "audio_vae_config": {
    "encoder_dim": 128,
    "encoder_rates": [
      2,
      5,
      8,
      8
    ],
    "latent_dim": 64,
    "decoder_dim": 2048,
    "decoder_rates": [
      8,
      6,
      5,
      2,
      2,
      2
    ],
    "depthwise": true,
    "sample_rate": 16000,
    "out_sample_rate": 48000,
    "use_noise_block": false,
    "sr_bin_boundaries": [
      20000,
      30000,
      40000
    ],
    "cond_type": "scale_bias",
    "cond_dim": 128,
    "cond_out_layer": false
  },
  "adapter_config": {
    "num_residual_blocks": 1,
    "ffn_mult": 2.0,
    "rms_norm_eps": 1e-06
  },
  "speaker_embed_dim": 192,
  "audio_start_token": -1,
  "audio_end_token": -1,
  "ref_audio_start_token": -1,
  "ref_audio_end_token": -1,
  "spk_token": -1,
  "barbet_effective_vocab_size": null,
  "max_length": 8192,
  "device": "cuda",
  "dtype": "bfloat16",
  "generation_defaults": {
    "cfg_value": 2.8,
    "inference_timesteps": 9,
    "max_len": 2000,
    "retry_badcase": true,
    "retry_badcase_max_times": 3,
    "retry_badcase_ratio_threshold": 6.0,
    "speaker_id": "hung_yi_lee",
    "speaker_source_dataset": "voidful/hung-yi_lee",
    "speaker_centroid_path": "checkpoints/hung_yi_lee_speaker_centroids.pt",
    "speaker_centroid_sha256": "e1d4c95a4c33935ff1fee0ab47fa796dcc13908a183c60e9b02bc0a61c541c4c",
    "speaker_centroid_dim": 192
  },
  "generation_defaults_source": {
    "scope": "tts_hard_sentences_zh_500 + Breeze-ASR-25 normalized CER",
    "sentences": "/home/voidful/tts_hard_sentences_zh_500.txt",
    "asr_model": "MediaTek-Research/Breeze-ASR-25",
    "conversion": "s2twp",
    "normalized_cer": 0.09669792733863977,
    "mixed_token_error_rate": 0.0911015155363644,
    "char_errors": 1227,
    "char_reference_length": 12689,
    "evaluated_examples": 500,
    "trial": "hy_cfg2p8_steps9",
    "run": "hungyi_high_refine_hy_cfg2p8_steps9_20260620"
  }
}