qazljlj committed on
Commit
b5eb06b
·
verified ·
1 Parent(s): 6840fbc

Upload 22 files

Browse files
Files changed (23) hide show
  1. .gitattributes +1 -0
  2. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/blobs/dfc11073787daf1b0f9c0f1499487ab5f4c93738 +14 -0
  3. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/refs/main +1 -0
  4. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/.gitattributes +36 -0
  5. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/config.json +27 -0
  6. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/generation_config.json +14 -0
  7. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/merges.txt +0 -0
  8. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/model.safetensors +3 -0
  9. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/tokenizer_config.json +40 -0
  10. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/vocab.json +0 -0
  11. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/README.md +235 -0
  12. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/asset/dingding.png +3 -0
  13. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/campplus.onnx +3 -0
  14. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/config.json +1 -0
  15. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/configuration.json +1 -0
  16. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/cosyvoice3.yaml +223 -0
  17. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/flow.decoder.estimator.fp32.onnx +3 -0
  18. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/flow.pt +3 -0
  19. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/hift.pt +3 -0
  20. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/llm.pt +3 -0
  21. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/llm.rl.pt +3 -0
  22. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/speech_tokenizer_v3.batch.onnx +3 -0
  23. models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/speech_tokenizer_v3.onnx +3 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ models/diffusion_models/qwen-image-edit-2511-Q5_K_M.gguf filter=lfs diff=lfs mer
37
  models/diffusion_models/qwen-image-2512-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
38
  models/LLM/Qwen3-VL-8B-Instruct-abliterated-v2.mmproj-f16.gguf filter=lfs diff=lfs merge=lfs -text
39
  models/LLM/Qwen3-VL-8B-Instruct-abliterated-v2.Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
 
 
37
  models/diffusion_models/qwen-image-2512-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
38
  models/LLM/Qwen3-VL-8B-Instruct-abliterated-v2.mmproj-f16.gguf filter=lfs diff=lfs merge=lfs -text
39
  models/LLM/Qwen3-VL-8B-Instruct-abliterated-v2.Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
40
+ models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/asset/dingding.png filter=lfs diff=lfs merge=lfs -text
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/blobs/dfc11073787daf1b0f9c0f1499487ab5f4c93738 ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "pad_token_id": 151643,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 151645,
7
+ 151643
8
+ ],
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_p": 0.8,
12
+ "top_k": 20,
13
+ "transformers_version": "4.37.0"
14
+ }
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/refs/main ADDED
@@ -0,0 +1 @@
 
 
1
+ 29e01c4e8d000f4bcd70751be16fa94bf3d85a18
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ asset/dingding.png filter=lfs diff=lfs merge=lfs -text
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 896,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 4864,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 24,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 14,
16
+ "num_hidden_layers": 24,
17
+ "num_key_value_heads": 2,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_theta": 1000000.0,
20
+ "sliding_window": 32768,
21
+ "tie_word_embeddings": true,
22
+ "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.40.1",
24
+ "use_cache": true,
25
+ "use_sliding_window": false,
26
+ "vocab_size": 151936
27
+ }
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "pad_token_id": 151643,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 151645,
7
+ 151643
8
+ ],
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_p": 0.8,
12
+ "top_k": 20,
13
+ "transformers_version": "4.37.0"
14
+ }
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:130282af0dfa9fe5840737cc49a0d339d06075f83c5a315c3372c9a0740d0b96
3
+ size 988097824
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|im_end|>",
34
+ "errors": "replace",
35
+ "model_max_length": 32768,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/CosyVoice-BlankEN/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/README.md ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - zh
5
+ - en
6
+ - fr
7
+ - es
8
+ - ja
9
+ - ko
10
+ - it
11
+ - ru
12
+ - de
13
+ pipeline_tag: text-to-speech
14
+ ---
15
+
16
+ ![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)
17
+
18
+ ## 👉🏻 CosyVoice 👈🏻
19
+
20
+ **Fun-CosyVoice 3.0**: [Demos](https://funaudiollm.github.io/cosyvoice3/); [Paper](https://arxiv.org/abs/2505.17589); [Modelscope](https://www.modelscope.cn/models/FunAudioLLM/Fun-CosyVoice3-0.5B-2512); [Huggingface](https://huggingface.co/FunAudioLLM/Fun-CosyVoice3-0.5B-2512); [CV3-Eval](https://github.com/FunAudioLLM/CV3-Eval)
21
+
22
+ **CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B)
23
+
24
+ **CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/models/iic/CosyVoice-300M); [HuggingFace](https://huggingface.co/FunAudioLLM/CosyVoice-300M)
25
+
26
+ ## Highlight🔥
27
+
28
+ **Fun-CosyVoice 3.0** is an advanced text-to-speech (TTS) system based on large language models (LLM), surpassing its predecessor (CosyVoice 2.0) in content consistency, speaker similarity, and prosody naturalness. It is designed for zero-shot multilingual speech synthesis in the wild.
29
+ ### Key Features
30
+ - **Language Coverage**: Covers 9 common languages (Chinese, English, Japanese, Korean, German, Spanish, French, Italian, Russian) and 18+ Chinese dialects/accents (Guangdong, Minnan, Sichuan, Dongbei, Shan3xi, Shan1xi, Shanghai, Tianjin, Shandong, Ningxia, Gansu, etc.), and supports both multi-lingual and cross-lingual zero-shot voice cloning.
31
+ - **Content Consistency & Naturalness**: Achieves state-of-the-art performance in content consistency, speaker similarity, and prosody naturalness.
32
+ - **Pronunciation Inpainting**: Supports pronunciation inpainting of Chinese Pinyin and English CMU phonemes, providing more controllability and making it suitable for production use.
33
+ - **Text Normalization**: Supports reading of numbers, special symbols and various text formats without a traditional frontend module.
34
+ - **Bi-Streaming**: Supports both text-in streaming and audio-out streaming, and achieves latency as low as 150ms while maintaining high-quality audio output.
35
+ - **Instruct Support**: Supports various instructions such as languages, dialects, emotions, speed, volume, etc.
36
+
37
+
38
+ ## Roadmap
39
+
40
+ - [x] 2025/12
41
+
42
+ - [x] release Fun-CosyVoice3-0.5B-2512 base model, rl model and its training/inference script
43
+ - [x] release Fun-CosyVoice3-0.5B modelscope gradio space
44
+
45
+ - [x] 2025/08
46
+
47
+ - [x] Thanks to the contribution from NVIDIA Yuekai Zhang, add triton trtllm runtime support and cosyvoice2 grpo training support
48
+
49
+ - [x] 2025/07
50
+
51
+ - [x] release Fun-CosyVoice 3.0 eval set
52
+
53
+ - [x] 2025/05
54
+
55
+ - [x] add CosyVoice2-0.5B vllm support
56
+
57
+ - [x] 2024/12
58
+
59
+ - [x] 25hz CosyVoice2-0.5B released
60
+
61
+ - [x] 2024/09
62
+
63
+ - [x] 25hz CosyVoice-300M base model
64
+ - [x] 25hz CosyVoice-300M voice conversion function
65
+
66
+ - [x] 2024/08
67
+
68
+ - [x] Repetition Aware Sampling(RAS) inference for llm stability
69
+ - [x] Streaming inference mode support, including kv cache and sdpa for rtf optimization
70
+
71
+ - [x] 2024/07
72
+
73
+ - [x] Flow matching training support
74
+ - [x] WeTextProcessing support when ttsfrd is not available
75
+ - [x] Fastapi server and client
76
+
77
+ ## Evaluation
78
+
79
+ | Model | Open-Source | Model Size | test-zh<br>CER (%) ↓ | test-zh<br>Speaker Similarity (%) ↑ | test-en<br>WER (%) ↓ | test-en<br>Speaker Similarity (%) ↑ | test-hard<br>CER (%) ↓ | test-hard<br>Speaker Similarity (%) ↑ |
80
+ | :--- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
81
+ | Human | - | - | 1.26 | 75.5 | 2.14 | 73.4 | - | - |
82
+ | Seed-TTS | ❌ | - | 1.12 | 79.6 | 2.25 | 76.2 | 7.59 | 77.6 |
83
+ | MiniMax-Speech | ❌ | - | 0.83 | 78.3 | 1.65 | 69.2 | - | - |
84
+ | F5-TTS | ✅ | 0.3B | 1.52 | 74.1 | 2.00 | 64.7 | 8.67 | 71.3 |
85
+ | Spark TTS | ✅ | 0.5B | 1.2 | 66.0 | 1.98 | 57.3 | - | - |
86
+ | CosyVoice2 | ✅ | 0.5B | 1.45 | 75.7 | 2.57 | 65.9 | 6.83 | 72.4 |
87
+ | FireRedTTS2 | ✅ | 1.5B | 1.14 | 73.2 | 1.95 | 66.5 | - | - |
88
+ | Index-TTS2 | ✅ | 1.5B | 1.03 | 76.5 | 2.23 | 70.6 | 7.12 | 75.5 |
89
+ | VibeVoice-1.5B | ✅ | 1.5B | 1.16 | 74.4 | 3.04 | 68.9 | - | - |
90
+ | VibeVoice-Realtime | ✅ | 0.5B | - | - | 2.05 | 63.3 | - | - |
91
+ | HiggsAudio-v2 | ✅ | 3B | 1.50 | 74.0 | 2.44 | 67.7 | - | - |
92
+ | VoxCPM | ✅ | 0.5B | 0.93 | 77.2 | 1.85 | 72.9 | 8.87 | 73.0 |
93
+ | GLM-TTS | ✅ | 1.5B | 1.03 | 76.1 | - | - | - | - |
94
+ | GLM-TTS RL | ✅ | 1.5B | 0.89 | 76.4 | - | - | - | - |
95
+ | Fun-CosyVoice3-0.5B-2512 | ✅ | 0.5B | 1.21 | 78.0 | 2.24 | 71.8 | 6.71 | 75.8 |
96
+ | Fun-CosyVoice3-0.5B-2512_RL | ✅ | 0.5B | 0.81 | 77.4 | 1.68 | 69.5 | 5.44 | 75.0 |
97
+
98
+
99
+ ## Install
100
+
101
+ ### Clone and install
102
+
103
+ - Clone the repo
104
+ ``` sh
105
+ git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
106
+ # If cloning the submodules failed due to network errors, re-run the following commands until they succeed
107
+ cd CosyVoice
108
+ git submodule update --init --recursive
109
+ ```
110
+
111
+ - Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
112
+ - Create Conda env:
113
+
114
+ ``` sh
115
+ conda create -n cosyvoice -y python=3.10
116
+ conda activate cosyvoice
117
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
118
+
119
+ # If you encounter sox compatibility issues
120
+ # ubuntu
121
+ sudo apt-get install sox libsox-dev
122
+ # centos
123
+ sudo yum install sox sox-devel
124
+ ```
125
+
126
+ ### Model download
127
+
128
+ ``` python
129
+ from huggingface_hub import snapshot_download
130
+ snapshot_download('FunAudioLLM/Fun-CosyVoice3-0.5B-2512', local_dir='pretrained_models/Fun-CosyVoice3-0.5B')
131
+ snapshot_download('FunAudioLLM/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
132
+ ```
133
+
134
+ Optionally, you can unzip the `ttsfrd` resource and install the `ttsfrd` package for better text normalization performance.
135
+
136
+ Note that this step is optional. If you do not install the `ttsfrd` package, wetext is used by default.
137
+
138
+ ``` sh
139
+ cd pretrained_models/CosyVoice-ttsfrd/
140
+ unzip resource.zip -d .
141
+ pip install ttsfrd_dependency-0.1-py3-none-any.whl
142
+ pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
143
+ ```
144
+
145
+ ### Basic Usage
146
+
147
+ ``` python
148
+ import sys
149
+ sys.path.append('third_party/Matcha-TTS')
150
+ from cosyvoice.cli.cosyvoice import AutoModel
151
+ import torchaudio
152
+
153
+ """ CosyVoice3 Usage, check https://funaudiollm.github.io/cosyvoice3/ for more details
154
+ """
155
+ cosyvoice = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B')
156
+ # en zero_shot usage
157
+ for i, j in enumerate(cosyvoice.inference_zero_shot('CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities.', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
158
+ './asset/zero_shot_prompt.wav', stream=False)):
159
+ torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
160
+ # zh zero_shot usage
161
+ for i, j in enumerate(cosyvoice.inference_zero_shot('八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
162
+ './asset/zero_shot_prompt.wav', stream=False)):
163
+ torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
164
+
165
+ # fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L280
166
+ for i, j in enumerate(cosyvoice.inference_cross_lingual('You are a helpful assistant.<|endofprompt|>[breath]因为他们那一辈人[breath]在乡里面住的要习惯一点,[breath]邻居都很活络,[breath]嗯,都很熟悉。[breath]',
167
+ './asset/zero_shot_prompt.wav', stream=False)):
168
+ torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
169
+
170
+ # instruct usage, for supported control, check cosyvoice/utils/common.py#L28
171
+ for i, j in enumerate(cosyvoice.inference_instruct2('好少咯,一般系放嗰啲国庆啊,中秋嗰啲可能会咯。', 'You are a helpful assistant. 请用广东话表达。<|endofprompt|>',
172
+ './asset/zero_shot_prompt.wav', stream=False)):
173
+ torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
174
+ for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>',
175
+ './asset/zero_shot_prompt.wav', stream=False)):
176
+ torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
177
+
178
+ # hotfix usage
179
+ for i, j in enumerate(cosyvoice.inference_zero_shot('高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
180
+ './asset/zero_shot_prompt.wav', stream=False)):
181
+ torchaudio.save('hotfix_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
182
+ ```
183
+
184
+ ## Discussion & Communication
185
+
186
+ You can directly discuss on [Github Issues](https://github.com/FunAudioLLM/CosyVoice/issues).
187
+
188
+ You can also scan the QR code to join our official Dingding chat group.
189
+
190
+ <img src="./asset/dingding.png" width="250px">
191
+
192
+ ## Acknowledgements
193
+
194
+ 1. We borrowed a lot of code from [FunASR](https://github.com/modelscope/FunASR).
195
+ 2. We borrowed a lot of code from [FunCodec](https://github.com/modelscope/FunCodec).
196
+ 3. We borrowed a lot of code from [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
197
+ 4. We borrowed a lot of code from [AcademiCodec](https://github.com/yangdongchao/AcademiCodec).
198
+ 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
199
+
200
+ ## Citations
201
+
202
+ ``` bibtex
203
+ @article{du2024cosyvoice,
204
+ title={Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens},
205
+ author={Du, Zhihao and Chen, Qian and Zhang, Shiliang and Hu, Kai and Lu, Heng and Yang, Yexin and Hu, Hangrui and Zheng, Siqi and Gu, Yue and Ma, Ziyang and others},
206
+ journal={arXiv preprint arXiv:2407.05407},
207
+ year={2024}
208
+ }
209
+
210
+ @article{du2024cosyvoice2,
211
+ title={Cosyvoice 2: Scalable streaming speech synthesis with large language models},
212
+ author={Du, Zhihao and Wang, Yuxuan and Chen, Qian and Shi, Xian and Lv, Xiang and Zhao, Tianyu and Gao, Zhifu and Yang, Yexin and Gao, Changfeng and Wang, Hui and others},
213
+ journal={arXiv preprint arXiv:2412.10117},
214
+ year={2024}
215
+ }
216
+
217
+ @article{du2025cosyvoice,
218
+ title={CosyVoice 3: Towards In-the-wild Speech Generation via Scaling-up and Post-training},
219
+ author={Du, Zhihao and Gao, Changfeng and Wang, Yuxuan and Yu, Fan and Zhao, Tianyu and Wang, Hao and Lv, Xiang and Wang, Hui and Shi, Xian and An, Keyu and others},
220
+ journal={arXiv preprint arXiv:2505.17589},
221
+ year={2025}
222
+ }
223
+
224
+ @inproceedings{lyu2025build,
225
+ title={Build LLM-Based Zero-Shot Streaming TTS System with Cosyvoice},
226
+ author={Lyu, Xiang and Wang, Yuxuan and Zhao, Tianyu and Wang, Hao and Liu, Huadai and Du, Zhihao},
227
+ booktitle={ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
228
+ pages={1--2},
229
+ year={2025},
230
+ organization={IEEE}
231
+ }
232
+ ```
233
+
234
+ ## Disclaimer
235
+ The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/asset/dingding.png ADDED

Git LFS Details

  • SHA256: 7f04815e2e676d31b089af6fa270135f3214f2193d5e0ad98b491d007d48f1c6
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/campplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework":"Pytorch","task":"text-to-speech"}
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/cosyvoice3.yaml ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1986]
3
+ __set_seed2: !apply:numpy.random.seed [1986]
4
+ __set_seed3: !apply:torch.manual_seed [1986]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
6
+
7
+ # fixed params
8
+ sample_rate: 24000
9
+ llm_input_size: 896
10
+ llm_output_size: 896
11
+ spk_embed_dim: 192
12
+ qwen_pretrain_path: ''
13
+ token_frame_rate: 25
14
+ token_mel_ratio: 2
15
+
16
+ # stream related params
17
+ chunk_size: 25 # streaming inference chunk size, in token
18
+ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
19
+
20
+ # model params
21
+ # for all class/function included in this repo, we use !<name> or !<new> for initialization, so that users may find all corresponding class/function according to one single yaml.
22
+ # for system/third_party class/function, we do not require this.
23
+ llm: !new:cosyvoice.llm.llm.CosyVoice3LM
24
+ llm_input_size: !ref <llm_input_size>
25
+ llm_output_size: !ref <llm_output_size>
26
+ speech_token_size: 6561
27
+ length_normalized_loss: True
28
+ lsm_weight: 0
29
+ mix_ratio: [5, 15]
30
+ llm: !new:cosyvoice.llm.llm.Qwen2Encoder
31
+ pretrain_path: !ref <qwen_pretrain_path>
32
+ sampling: !name:cosyvoice.utils.common.ras_sampling
33
+ top_p: 0.8
34
+ top_k: 25
35
+ win_size: 10
36
+ tau_r: 0.1
37
+
38
+ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT
39
+ input_size: 80
40
+ output_size: 80
41
+ spk_embed_dim: !ref <spk_embed_dim>
42
+ output_type: 'mel'
43
+ vocab_size: 6561
44
+ input_frame_rate: !ref <token_frame_rate>
45
+ only_mask_loss: True
46
+ token_mel_ratio: !ref <token_mel_ratio>
47
+ pre_lookahead_len: 3
48
+ pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer
49
+ in_channels: 80
50
+ channels: 1024
51
+ pre_lookahead_len: 3
52
+ decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
53
+ in_channels: 240
54
+ n_spks: 1
55
+ spk_emb_dim: 80
56
+ cfm_params: !new:omegaconf.DictConfig
57
+ content:
58
+ sigma_min: 1e-06
59
+ solver: 'euler'
60
+ t_scheduler: 'cosine'
61
+ training_cfg_rate: 0.2
62
+ inference_cfg_rate: 0.7
63
+ reg_loss_type: 'l1'
64
+ estimator: !new:cosyvoice.flow.DiT.dit.DiT
65
+ dim: 1024
66
+ depth: 22
67
+ heads: 16
68
+ dim_head: 64
69
+ ff_mult: 2
70
+ mel_dim: 80
71
+ mu_dim: 80
72
+ spk_dim: 80
73
+ out_channels: 80
74
+ static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
75
+ num_decoding_left_chunks: !ref <num_decoding_left_chunks>
76
+
77
+ hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator
78
+ in_channels: 80
79
+ base_channels: 512
80
+ nb_harmonics: 8
81
+ sampling_rate: !ref <sample_rate>
82
+ nsf_alpha: 0.1
83
+ nsf_sigma: 0.003
84
+ nsf_voiced_threshold: 10
85
+ upsample_rates: [8, 5, 3]
86
+ upsample_kernel_sizes: [16, 11, 7]
87
+ istft_params:
88
+ n_fft: 16
89
+ hop_len: 4
90
+ resblock_kernel_sizes: [3, 7, 11]
91
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
92
+ source_resblock_kernel_sizes: [7, 7, 11]
93
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
94
+ lrelu_slope: 0.1
95
+ audio_limit: 0.99
96
+ conv_pre_look_right: 4
97
+ f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor
98
+ num_class: 1
99
+ in_channels: 80
100
+ cond_channels: 512
101
+
102
+ # gan related module
103
+ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
104
+ n_fft: 1920
105
+ num_mels: 80
106
+ sampling_rate: !ref <sample_rate>
107
+ hop_size: 480
108
+ win_size: 1920
109
+ fmin: 0
110
+ fmax: null
111
+ center: False
112
+ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
113
+ generator: !ref <hift>
114
+ discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
115
+ mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
116
+ mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
117
+ mel_spec_transform: [
118
+ !ref <mel_spec_transform1>
119
+ ]
120
+
121
+ # processor functions
122
+ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
123
+ get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
124
+ token_path: !ref <qwen_pretrain_path>
125
+ skip_special_tokens: True
126
+ version: cosyvoice3
127
+ allowed_special: 'all'
128
+ tokenize: !name:cosyvoice.dataset.processor.tokenize
129
+ get_tokenizer: !ref <get_tokenizer>
130
+ allowed_special: !ref <allowed_special>
131
+ filter: !name:cosyvoice.dataset.processor.filter
132
+ max_length: 40960
133
+ min_length: 100
134
+ token_max_length: 200
135
+ token_min_length: 1
136
+ resample: !name:cosyvoice.dataset.processor.resample
137
+ resample_rate: !ref <sample_rate>
138
+ truncate: !name:cosyvoice.dataset.processor.truncate
139
+ truncate_length: 24480 # must be a multiple of hop_size
140
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
141
+ n_fft: 1920
142
+ num_mels: 80
143
+ sampling_rate: !ref <sample_rate>
144
+ hop_size: 480
145
+ win_size: 1920
146
+ fmin: 0
147
+ fmax: null
148
+ center: False
149
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
150
+ feat_extractor: !ref <feat_extractor>
151
+ compute_f0: !name:cosyvoice.dataset.processor.compute_f0
152
+ sample_rate: !ref <sample_rate>
153
+ hop_size: 480
154
+ parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
155
+ normalize: True
156
+ shuffle: !name:cosyvoice.dataset.processor.shuffle
157
+ shuffle_size: 1000
158
+ sort: !name:cosyvoice.dataset.processor.sort
159
+ sort_size: 500 # sort_size should be less than shuffle_size
160
+ batch: !name:cosyvoice.dataset.processor.batch
161
+ batch_type: 'dynamic'
162
+ max_frames_in_batch: 2000
163
+ padding: !name:cosyvoice.dataset.processor.padding
164
+ use_spk_embedding: False # change to True during sft
165
+
166
+
167
+ # dataset processor pipeline
168
+ data_pipeline: [
169
+ !ref <parquet_opener>,
170
+ !ref <tokenize>,
171
+ !ref <filter>,
172
+ !ref <resample>,
173
+ !ref <compute_fbank>,
174
+ !ref <parse_embedding>,
175
+ !ref <shuffle>,
176
+ !ref <sort>,
177
+ !ref <batch>,
178
+ !ref <padding>,
179
+ ]
180
+ data_pipeline_gan: [
181
+ !ref <parquet_opener>,
182
+ !ref <tokenize>,
183
+ !ref <filter>,
184
+ !ref <resample>,
185
+ !ref <truncate>,
186
+ !ref <compute_fbank>,
187
+ !ref <compute_f0>,
188
+ !ref <parse_embedding>,
189
+ !ref <shuffle>,
190
+ !ref <sort>,
191
+ !ref <batch>,
192
+ !ref <padding>,
193
+ ]
194
+
195
+ # llm flow train conf
196
+ train_conf:
197
+ optim: adam
198
+ optim_conf:
199
+ lr: 1e-5 # change to 1e-5 during sft
200
+ scheduler: constantlr # change to constantlr during sft
201
+ scheduler_conf:
202
+ warmup_steps: 2500
203
+ max_epoch: 200
204
+ grad_clip: 5
205
+ accum_grad: 2
206
+ log_interval: 100
207
+ save_per_step: -1
208
+
209
+ # gan train conf
210
+ train_conf_gan:
211
+ optim: adam
212
+ optim_conf:
213
+ lr: 0.0002 # use small lr for gan training
214
+ scheduler: constantlr
215
+ optim_d: adam
216
+ optim_conf_d:
217
+ lr: 0.0002 # use small lr for gan training
218
+ scheduler_d: constantlr
219
+ max_epoch: 200
220
+ grad_clip: 5
221
+ accum_grad: 1 # in gan training, accum_grad must be 1
222
+ log_interval: 100
223
+ save_per_step: -1
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/flow.decoder.estimator.fp32.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b51b9533a55937762b262bf2cf9c6220ce40760f76d6532cb16a6a6d84059a8
3
+ size 1326216933
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/flow.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6fab32a7825e5b0bc855ddd948f8db9370b0a786fbc249caa4595e95b608e4b
3
+ size 1329116148
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/hift.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b279d7641eb97ae55b3b540cfba4f953c26492a2df758328a89a4d007ab87a65
3
+ size 83202622
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/llm.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69f43bd545131c30e98947fb360ea8b4dc9916d8e83dded7757c7ea4f5a24970
3
+ size 2024669519
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/llm.rl.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74d34b01a80c7154670ae75ac372d1b1712c78bceae9f467eb9f1f6f61ec764f
3
+ size 2024682701
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/speech_tokenizer_v3.batch.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b156b8a7bbff436585e153f4637b9a368009005ac66efa108a6c8bfb34e5ee43
3
+ size 969451579
models/cosyvoice/Fun-CosyVoice3-0.5B/models--FunAudioLLM--Fun-CosyVoice3-0.5B-2512/snapshots/29e01c4e8d000f4bcd70751be16fa94bf3d85a18/speech_tokenizer_v3.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23236a74175dbdda47afc66dbadd5bcb41303c467a57c261cb8539ad9db9208d
3
+ size 969451503