leduclinh aufklarer committed on
Commit
88c0abe
·
0 Parent(s):

Duplicate from aufklarer/CosyVoice3-0.5B-MLX-4bit

Browse files

Co-authored-by: Ivan <aufklarer@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - zh
4
+ - en
5
+ - ja
6
+ - ko
7
+ - de
8
+ - es
9
+ - fr
10
+ - it
11
+ - ru
12
+ license: apache-2.0
13
+ tags:
14
+ - tts
15
+ - text-to-speech
16
+ - speech-synthesis
17
+ - mlx
18
+ - apple-silicon
19
+ - cosyvoice
20
+ base_model: FunAudioLLM/Fun-CosyVoice3-0.5B-2512
21
+ pipeline_tag: text-to-speech
22
+ ---
23
+
24
+ # CosyVoice3-0.5B MLX 4-bit
25
+
26
+ [CosyVoice 3](https://arxiv.org/abs/2505.17589) text-to-speech model converted to MLX safetensors format with 4-bit quantization for Apple Silicon inference.
27
+
28
+ Converted from [FunAudioLLM/Fun-CosyVoice3-0.5B-2512](https://huggingface.co/FunAudioLLM/Fun-CosyVoice3-0.5B-2512).
29
+
30
+ **Swift inference**: [ivan-digital/qwen3-asr-swift](https://github.com/ivan-digital/qwen3-asr-swift)
31
+
32
+ ## Model Details
33
+
34
+ | Component | Architecture | Size |
35
+ |-----------|-------------|------|
36
+ | LLM | Qwen2.5-0.5B (24L, 896d, 14Q/2KV heads) | 467 MB (4-bit) |
37
+ | DiT Flow Matching | 22-layer DiT (1024d, 16 heads, 10 ODE steps) | 634 MB (fp16) |
38
+ | HiFi-GAN Vocoder | NSF + F0 predictor + ISTFT | 79 MB (fp16) |
39
+ | **Total** | | **~1.2 GB** |
40
+
41
+ ## Pipeline
42
+
43
+ ```
44
+ Text → LLM (Qwen2.5-0.5B) → Speech Tokens (FSQ 6561) → DiT Flow Matching → Mel (80-band) → HiFi-GAN → Audio (24kHz)
45
+ ```
46
+
47
+ ## Languages
48
+
49
+ Chinese, English, Japanese, Korean, German, Spanish, French, Italian, Russian
50
+
51
+ ## Files
52
+
53
+ - `llm.safetensors` — LLM weights (4-bit quantized)
54
+ - `flow.safetensors` — DiT flow matching decoder (fp16)
55
+ - `hifigan.safetensors` — HiFi-GAN vocoder (fp16, weight-norm folded)
56
+ - `config.json` — Model configuration
57
+
58
+ ## Conversion Details
59
+
60
+ - LLM: 4-bit quantization (group_size=64) of attention projections, MLP, and speech head
61
+ - Flow: fp16 (flow matching is sensitive to quantization)
62
+ - HiFi-GAN: fp16 with weight normalization folded (`w = g * v / ||v||`)
63
+ - Conv1d weights transposed from PyTorch `[out, in, kernel]` to MLX `[out, kernel, in]`
64
+
65
+ ## Usage
66
+
67
+ For use with [ivan-digital/qwen3-asr-swift](https://github.com/ivan-digital/qwen3-asr-swift):
68
+
69
+ ```swift
70
+ import CosyVoiceTTS
71
+
72
+ let model = try await CosyVoiceTTSModel.fromPretrained()
73
+ let audio = model.synthesize(text: "Hello, how are you?", language: "english")
74
+ ```
75
+
76
+ ### CLI
77
+
78
+ ```bash
79
+ swift run cosyvoice-tts-cli --text "Hello, how are you?" --lang english --output hello.wav
80
+ ```
81
+
82
+ ## License
83
+
84
+ Apache 2.0 (same as upstream CosyVoice 3)
85
+
86
+ ## Citation
87
+
88
+ ```bibtex
89
+ @article{du2025cosyvoice3,
90
+ title={CosyVoice 3: Towards In-the-wild Speech Generation via Scaling-up and Post-training},
91
+ author={Du, Zhihao and others},
92
+ journal={arXiv preprint arXiv:2505.17589},
93
+ year={2025}
94
+ }
95
+ ```
config.json ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "cosyvoice3",
3
+ "version": "Fun-CosyVoice3-0.5B-2512",
4
+ "llm": {
5
+ "hidden_size": 896,
6
+ "num_hidden_layers": 24,
7
+ "num_attention_heads": 14,
8
+ "num_key_value_heads": 2,
9
+ "intermediate_size": 4864,
10
+ "head_dim": 64,
11
+ "max_position_embeddings": 32768,
12
+ "vocab_size": 151936,
13
+ "rms_norm_eps": 1e-06,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "speech_token_size": 6561,
17
+ "text_token_size": 151936
18
+ },
19
+ "flow": {
20
+ "input_size": 512,
21
+ "output_size": 80,
22
+ "vocab_size": 6561,
23
+ "spk_embed_dim": 192,
24
+ "token_frame_rate": 25,
25
+ "token_mel_ratio": 2,
26
+ "pre_lookahead_len": 3,
27
+ "dit": {
28
+ "dim": 1024,
29
+ "depth": 22,
30
+ "heads": 16,
31
+ "dim_head": 64,
32
+ "ff_mult": 2,
33
+ "mel_dim": 80,
34
+ "spk_dim": 80,
35
+ "static_chunk_size": 50
36
+ }
37
+ },
38
+ "hifigan": {
39
+ "sampling_rate": 24000,
40
+ "in_channels": 80,
41
+ "base_channels": 512,
42
+ "nb_harmonics": 8,
43
+ "upsample_rates": [
44
+ 8,
45
+ 5,
46
+ 3
47
+ ],
48
+ "upsample_kernel_sizes": [
49
+ 16,
50
+ 11,
51
+ 7
52
+ ],
53
+ "istft_n_fft": 16,
54
+ "istft_hop_len": 4,
55
+ "resblock_kernel_sizes": [
56
+ 3,
57
+ 7,
58
+ 11
59
+ ],
60
+ "resblock_dilation_sizes": [
61
+ [
62
+ 1,
63
+ 3,
64
+ 5
65
+ ],
66
+ [
67
+ 1,
68
+ 3,
69
+ 5
70
+ ],
71
+ [
72
+ 1,
73
+ 3,
74
+ 5
75
+ ]
76
+ ],
77
+ "source_resblock_kernel_sizes": [
78
+ 7,
79
+ 7,
80
+ 11
81
+ ],
82
+ "nsf_alpha": 0.1,
83
+ "nsf_sigma": 0.003,
84
+ "nsf_voiced_threshold": 10,
85
+ "audio_limit": 0.99
86
+ },
87
+ "mel": {
88
+ "n_fft": 1920,
89
+ "num_mels": 80,
90
+ "hop_size": 480,
91
+ "win_size": 1920,
92
+ "sample_rate": 24000
93
+ },
94
+ "tokenizer": {
95
+ "type": "fsq",
96
+ "codebook_size": 6561,
97
+ "frame_rate": 25
98
+ },
99
+ "quantization": {
100
+ "bits": 4,
101
+ "group_size": 64,
102
+ "quantized_layers": [
103
+ "q_proj",
104
+ "k_proj",
105
+ "v_proj",
106
+ "o_proj",
107
+ "gate_proj",
108
+ "up_proj",
109
+ "down_proj",
110
+ "speech_head"
111
+ ]
112
+ }
113
+ }
flow.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00357cda773dc8d17af72570aa655fc826f514c0314b40b57b9eeb690853db0a
3
+ size 194964136
hifigan.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffae1b73e76efc7ea9778c3c07c24e64ffe7c124caf5f43a67f4fec9fcaca97c
3
+ size 83086548
llm.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d7cfab99cf91c569509174a7383a46427405302b93dd60097e8187e46bc3757
3
+ size 489278536
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|im_end|>",
34
+ "errors": "replace",
35
+ "model_max_length": 32768,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
weight_shapes.json ADDED
The diff for this file is too large to render. See raw diff