leduclinh's picture
Duplicate from aufklarer/CosyVoice3-0.5B-MLX-4bit
88c0abe
{
"model_type": "cosyvoice3",
"version": "Fun-CosyVoice3-0.5B-2512",
"llm": {
"hidden_size": 896,
"num_hidden_layers": 24,
"num_attention_heads": 14,
"num_key_value_heads": 2,
"intermediate_size": 4864,
"head_dim": 64,
"max_position_embeddings": 32768,
"vocab_size": 151936,
"rms_norm_eps": 1e-06,
"rope_theta": 1000000.0,
"tie_word_embeddings": true,
"speech_token_size": 6561,
"text_token_size": 151936
},
"flow": {
"input_size": 512,
"output_size": 80,
"vocab_size": 6561,
"spk_embed_dim": 192,
"token_frame_rate": 25,
"token_mel_ratio": 2,
"pre_lookahead_len": 3,
"dit": {
"dim": 1024,
"depth": 22,
"heads": 16,
"dim_head": 64,
"ff_mult": 2,
"mel_dim": 80,
"spk_dim": 80,
"static_chunk_size": 50
}
},
"hifigan": {
"sampling_rate": 24000,
"in_channels": 80,
"base_channels": 512,
"nb_harmonics": 8,
"upsample_rates": [
8,
5,
3
],
"upsample_kernel_sizes": [
16,
11,
7
],
"istft_n_fft": 16,
"istft_hop_len": 4,
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"source_resblock_kernel_sizes": [
7,
7,
11
],
"nsf_alpha": 0.1,
"nsf_sigma": 0.003,
"nsf_voiced_threshold": 10,
"audio_limit": 0.99
},
"mel": {
"n_fft": 1920,
"num_mels": 80,
"hop_size": 480,
"win_size": 1920,
"sample_rate": 24000
},
"tokenizer": {
"type": "fsq",
"codebook_size": 6561,
"frame_rate": 25
},
"quantization": {
"bits": 4,
"group_size": 64,
"quantized_layers": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
"speech_head"
]
}
}