VoxCPM2-bf16 / config.json
acul3's picture
Upload config.json with huggingface_hub
953e9c8 verified
{
"architecture": "voxcpm2",
"lm_config": {
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_size": 2048,
"intermediate_size": 6144,
"max_position_embeddings": 32768,
"num_attention_heads": 16,
"num_hidden_layers": 28,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-05,
"rope_theta": 10000,
"kv_channels": 128,
"rope_scaling": {
"type": "longrope",
"long_factor": [
0.9977997200264581,
1.014658295992452,
1.0349680404997148,
1.059429246056193,
1.0888815016813513,
1.1243301355211495,
1.166977103606075,
1.2182568066927284,
1.2798772354275727,
1.3538666751582975,
1.4426259039919596,
1.5489853358570191,
1.6762658237220625,
1.8283407612492941,
2.0096956085876183,
2.225478927469756,
2.481536379650452,
2.784415934557119,
3.1413289096347365,
3.560047844772632,
4.048719380066383,
4.615569542115128,
5.2684819496549835,
6.014438591970396,
6.858830049237097,
7.804668263503327,
8.851768731513417,
9.99600492938444,
11.228766118181639,
12.536757560834843,
13.902257701387796,
15.303885189125953,
16.717837610115794,
18.119465097853947,
19.484965238406907,
20.792956681060105,
22.02571786985731,
23.16995406772833,
24.217054535738416,
25.16289275000465,
26.007284207271347,
26.753240849586767,
27.40615325712662,
27.973003419175363,
28.461674954469114,
28.880393889607006,
29.237306864684626,
29.540186419591297,
29.79624387177199,
30.01202719065413,
30.193382037992453,
30.34545697551969,
30.47273746338473,
30.579096895249787,
30.66785612408345,
30.741845563814174,
30.80346599254902,
30.85474569563567,
30.897392663720595,
30.932841297560394,
30.962293553185553,
30.986754758742034,
31.007064503249293,
31.02392307921529
],
"short_factor": [
0.9977997200264581,
1.014658295992452,
1.0349680404997148,
1.059429246056193,
1.0888815016813513,
1.1243301355211495,
1.166977103606075,
1.2182568066927284,
1.2798772354275727,
1.3538666751582975,
1.4426259039919596,
1.5489853358570191,
1.6762658237220625,
1.8283407612492941,
2.0096956085876183,
2.225478927469756,
2.481536379650452,
2.784415934557119,
3.1413289096347365,
3.560047844772632,
4.048719380066383,
4.615569542115128,
5.2684819496549835,
6.014438591970396,
6.858830049237097,
7.804668263503327,
8.851768731513417,
9.99600492938444,
11.228766118181639,
12.536757560834843,
13.902257701387796,
15.303885189125953,
16.717837610115794,
18.119465097853947,
19.484965238406907,
20.792956681060105,
22.02571786985731,
23.16995406772833,
24.217054535738416,
25.16289275000465,
26.007284207271347,
26.753240849586767,
27.40615325712662,
27.973003419175363,
28.461674954469114,
28.880393889607006,
29.237306864684626,
29.540186419591297,
29.79624387177199,
30.01202719065413,
30.193382037992453,
30.34545697551969,
30.47273746338473,
30.579096895249787,
30.66785612408345,
30.741845563814174,
30.80346599254902,
30.85474569563567,
30.897392663720595,
30.932841297560394,
30.962293553185553,
30.986754758742034,
31.007064503249293,
31.02392307921529
],
"original_max_position_embeddings": 32768
},
"vocab_size": 73448,
"use_mup": false,
"scale_emb": 12,
"dim_model_base": 256,
"scale_depth": 1.4
},
"patch_size": 4,
"feat_dim": 64,
"scalar_quantization_latent_dim": 512,
"scalar_quantization_scale": 9,
"residual_lm_num_layers": 8,
"residual_lm_no_rope": true,
"encoder_config": {
"hidden_dim": 1024,
"ffn_dim": 4096,
"num_heads": 16,
"num_layers": 12,
"kv_channels": 128
},
"dit_config": {
"hidden_dim": 1024,
"ffn_dim": 4096,
"num_heads": 16,
"num_layers": 12,
"kv_channels": 128,
"mean_mode": false,
"cfm_config": {
"sigma_min": 1e-06,
"solver": "euler",
"t_scheduler": "log-norm",
"inference_cfg_rate": 2.0
}
},
"audio_vae_config": {
"encoder_dim": 128,
"encoder_rates": [
2,
5,
8,
8
],
"latent_dim": 64,
"decoder_dim": 2048,
"decoder_rates": [
8,
6,
5,
2,
2,
2
],
"sr_bin_boundaries": [
20000,
30000,
40000
],
"sample_rate": 16000,
"out_sample_rate": 48000
},
"max_length": 8192,
"model_type": "voxcpm2"
}