alexwengg committed on
Commit
d988946
·
verified ·
1 Parent(s): e8cc0e1

Upload 38 files

Browse files
Files changed (38) hide show
  1. Flow-N250-fp32.mlmodelc/analytics/coremldata.bin +3 -0
  2. Flow-N250-fp32.mlmodelc/coremldata.bin +3 -0
  3. Flow-N250-fp32.mlmodelc/model.mil +0 -0
  4. Flow-N250-fp32.mlmodelc/weights/weight.bin +3 -0
  5. Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  6. Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  7. Flow-N250-fp32.mlpackage/Manifest.json +18 -0
  8. HiFT-T500-fp16.mlmodelc/analytics/coremldata.bin +3 -0
  9. HiFT-T500-fp16.mlmodelc/coremldata.bin +3 -0
  10. HiFT-T500-fp16.mlmodelc/model.mil +0 -0
  11. HiFT-T500-fp16.mlmodelc/weights/weight.bin +3 -0
  12. HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  13. HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  14. HiFT-T500-fp16.mlpackage/Manifest.json +18 -0
  15. LLM-Decode-M768-fp16.mlmodelc/analytics/coremldata.bin +3 -0
  16. LLM-Decode-M768-fp16.mlmodelc/coremldata.bin +3 -0
  17. LLM-Decode-M768-fp16.mlmodelc/model.mil +0 -0
  18. LLM-Decode-M768-fp16.mlmodelc/weights/weight.bin +3 -0
  19. LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  20. LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  21. LLM-Decode-M768-fp16.mlpackage/Manifest.json +18 -0
  22. LLM-Prefill-T256-M768-fp16.mlmodelc/analytics/coremldata.bin +3 -0
  23. LLM-Prefill-T256-M768-fp16.mlmodelc/coremldata.bin +3 -0
  24. LLM-Prefill-T256-M768-fp16.mlmodelc/model.mil +0 -0
  25. LLM-Prefill-T256-M768-fp16.mlmodelc/weights/weight.bin +3 -0
  26. LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  27. LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  28. LLM-Prefill-T256-M768-fp16.mlpackage/Manifest.json +18 -0
  29. README.md +98 -0
  30. embeddings/embeddings-runtime-fp32.safetensors +3 -0
  31. embeddings/speech_embedding-fp16.safetensors +3 -0
  32. manifest.json +172 -0
  33. tokenizer/merges.txt +0 -0
  34. tokenizer/special_tokens.json +283 -0
  35. tokenizer/tokenizer_config.json +40 -0
  36. tokenizer/vocab.json +0 -0
  37. voices/cosyvoice3-default-zh.json +3 -0
  38. voices/cosyvoice3-default-zh.safetensors +3 -0
Flow-N250-fp32.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098b273f0b0891822792c749e1d71fa660a7501a57f38e1c05f55486075f84b6
3
+ size 243
Flow-N250-fp32.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f3fec3b119e5324229c04c10c7da66db7b1de7ea9b16d99b5124ac2fa129a8c
3
+ size 491
Flow-N250-fp32.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
Flow-N250-fp32.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:429480f1c2a509c2f3f612e679b113943d482409a3a4eb00fcafb95fc23b4d4c
3
+ size 1329136000
Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4af44aa2368a20aeb2700165866525cac034f54f771d3021731d1ebed36ce499
3
+ size 3939600
Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:429480f1c2a509c2f3f612e679b113943d482409a3a4eb00fcafb95fc23b4d4c
3
+ size 1329136000
Flow-N250-fp32.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "04D8665B-267F-4E5D-901F-C9EDC843DC37": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "2AB57201-1486-4222-A707-440F776BB6EC": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "2AB57201-1486-4222-A707-440F776BB6EC"
18
+ }
HiFT-T500-fp16.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d62418181d6824d86adf8345d29346f6f334751a785f00b895d90c609cfa7830
3
+ size 243
HiFT-T500-fp16.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef50842cece141ff17df5420ab0ab6daecf456e58509c2c35877989e52fc9bea
3
+ size 436
HiFT-T500-fp16.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
HiFT-T500-fp16.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17b8e2bb0a67be7c1e67fdf3cad23741b5cf353461173918040dc4d3bd8c6519
3
+ size 46124000
HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7eb104c2bc5f37a4678fae006adca5b1605f96beeb8b57d51bd402589cb8776c
3
+ size 316657
HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17b8e2bb0a67be7c1e67fdf3cad23741b5cf353461173918040dc4d3bd8c6519
3
+ size 46124000
HiFT-T500-fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "208BEE18-4DCB-466A-AFB9-22F89DEC193F": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "FA462075-CE04-4BFA-854A-FE4287ED328F": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "FA462075-CE04-4BFA-854A-FE4287ED328F"
18
+ }
LLM-Decode-M768-fp16.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:618f3e003e54e6455bf498e6947f7a3c26d17d86005b0947f907b24a629fa1cd
3
+ size 243
LLM-Decode-M768-fp16.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c55086879450735729e18522b2d229a9ecd4b81bd5357997336788cb1dc6e513
3
+ size 514
LLM-Decode-M768-fp16.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
LLM-Decode-M768-fp16.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd3d0286f7645d10aedafaf3033d68412d2bd53828ff084c3322920d00efec27
3
+ size 727959122
LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3248aa37d9579596982f56c220be92380c7412807a7ab680fb37e85494fb9e3
3
+ size 600357
LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd3d0286f7645d10aedafaf3033d68412d2bd53828ff084c3322920d00efec27
3
+ size 727959122
LLM-Decode-M768-fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "533AD866-41F8-4A31-BE97-CFFFE5CAEBE1": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "C40E8821-472D-4251-B03E-0AAEAEFF2462": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "C40E8821-472D-4251-B03E-0AAEAEFF2462"
18
+ }
LLM-Prefill-T256-M768-fp16.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c8f5a18e2ca38e70ec09bb926989bf110ee28ca9486303e8498c87d9a51fdc5
3
+ size 243
LLM-Prefill-T256-M768-fp16.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:029e60d0af895b8ae43d3277707f0288dafd3397f1a1f92aa034aa327de65f17
3
+ size 492
LLM-Prefill-T256-M768-fp16.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
LLM-Prefill-T256-M768-fp16.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8f8cd89961a52aa583fc988de9c35ceee556dbfc4a1f1329a608e10086b4606
3
+ size 728414866
LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9d9b57839d2cfb7da953b485f929e8e5d3e766f10efc97ffc38addba0b9f147
3
+ size 620444
LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8f8cd89961a52aa583fc988de9c35ceee556dbfc4a1f1329a608e10086b4606
3
+ size 728414866
LLM-Prefill-T256-M768-fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "18AF7B08-983C-4F06-820F-5C60330ED316": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "C2D0C244-33DC-4550-9334-CB28AA3FFED8": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "C2D0C244-33DC-4550-9334-CB28AA3FFED8"
18
+ }
README.md ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - zh
5
+ pipeline_tag: text-to-speech
6
+ tags:
7
+ - tts
8
+ - cosyvoice3
9
+ - coreml
10
+ - apple-silicon
11
+ - ane
12
+ - mandarin
13
+ library_name: fluidaudio
14
+ ---
15
+
16
+ # CosyVoice3 (Mandarin) — CoreML Models for FluidAudio
17
+
18
+ CoreML conversions of CosyVoice3's four inference stages, frozen to the exact
19
+ shapes the [FluidAudio](https://github.com/FluidInference/FluidAudio) Swift
20
+ package's `CosyVoice3TtsManager` loads at runtime. Targets Apple Silicon
21
+ (M-series) with the Neural Engine for LLM + HiFT, CPU for Flow.
22
+
23
+ A default voice ships in `voices/` so the repo is self-contained. Additional
24
+ voices (as they're extracted) live in the companion repo
25
+ `FluidInference/cosyvoice3-voices-zh`.
26
+
27
+ ## Shipping configuration (frozen)
28
+
29
+ Each model is shipped in two formats: `.mlpackage` (source, portable) and
30
+ `.mlmodelc` (pre-compiled for macOS 14 / iOS 17 + Apple Silicon). Swift can
31
+ load either; `.mlmodelc` skips the one-time compile step on first use
32
+ (~20-30 s for Flow without it).
33
+
34
+ | Model | Compute | Purpose | dtype |
35
+ |---|---|---|---|
36
+ | `LLM-Prefill-T256-M768-fp16` | CPU + ANE | Qwen2-0.5B prefill, 256-token context, 768-slot KV cache | fp16 |
37
+ | `LLM-Decode-M768-fp16` | CPU + ANE | Single-step AR decode, 768-slot KV cache, 24 layers × 2 KV heads × 64 dim | fp16 |
38
+ | `Flow-N250-fp32` | CPU only | Speech-token → mel (80-bin, 24 kHz), N_total=250 | fp32 (fp16 NaNs on fused LayerNorm) |
39
+ | `HiFT-T500-fp16` | CPU + ANE | Mel → 24 kHz PCM, T=500 frames | fp16 |
40
+
41
+ Total disk footprint (`.mlmodelc` + `.mlpackage` + runtime tables): ~6.6 GB.
42
+ If you only need one format, delete the other after download.
43
+
44
+ ## Runtime tables
45
+
46
+ `embeddings/`
47
+ - `embeddings-runtime-fp32.safetensors` — 542 MB. Qwen2 `model.embed_tokens.weight`
48
+ at **runtime** (post-`.float()`) dtype. Required for bit-exact parity with
49
+ the Python reference — shipping raw `.pt` weights introduces ~4.7e-4 error
50
+ through the HuggingFace dtype round-trip. Swift mmaps this file.
51
+ - `speech_embedding-fp16.safetensors` — 12 MB. CosyVoice3 `speech_embedding`
52
+ table (6761 × 896 fp16); row-lookup per decoded speech token.
53
+
54
+ `voices/`
55
+ - `cosyvoice3-default-zh.safetensors` + `.json` — default zero-shot voice
56
+ bundle extracted from CosyVoice upstream `zero_shot_prompt.wav`
57
+ (utterance: "希望你以后能够做的比我还好呦。", N_speech = 87).
58
+ Schema documented in the voices repo README.
59
+
60
+ `tokenizer/`
61
+ - `vocab.json` + `merges.txt` + `tokenizer_config.json` — stock Qwen2 BPE
62
+ tokenizer assets (copied from HuggingFace `FunAudioLLM/CosyVoice-BlankEN`).
63
+ - `special_tokens.json` — map of 281 runtime-added CosyVoice3 special tokens → IDs
64
+ (`<|endofprompt|>`, `[breath]`, ARPAbet phonemes, etc.). Covers IDs
65
+ 151643..151923.
66
+
67
+ ## Swift usage (FluidAudio)
68
+
69
+ ```swift
70
+ import FluidAudio
71
+
72
+ let manager = CosyVoice3TtsManager(
73
+ modelsDirectory: modelsURL, // this repo root
74
+ tokenizerDirectory: modelsURL.appendingPathComponent("tokenizer"),
75
+ textEmbeddingsFile: modelsURL.appendingPathComponent("embeddings/embeddings-runtime-fp32.safetensors"),
76
+ specialTokensFile: modelsURL.appendingPathComponent("tokenizer/special_tokens.json"))
77
+ try await manager.initialize()
78
+
79
+ let prompt = try CosyVoice3PromptAssets.load(
80
+ from: voiceURL.appendingPathComponent("cosyvoice3-default-zh.safetensors"))
81
+
82
+ let result = try await manager.synthesize(
83
+ text: "今天天气真的很不错,适合出门散步。",
84
+ promptAssets: prompt)
85
+ // result.samples — [Float] @ 24 kHz mono
86
+ ```
87
+
88
+ ## Model graph quick reference
89
+
90
+ - Qwen2 decoder: hidden=896, 24 layers, 14 Q heads, 2 KV heads, head_dim=64
91
+ - Speech vocab: 6761 (6561 tokens + sos/eos/task_id/stops)
92
+ - SOS=6561, EOS=6562, TASK_ID=6563
93
+ - Flow: 80-bin mel @ 24 kHz, hop=480, n_fft=1920
94
+ - HiFT: iSTFT-based vocoder, upsamples mel to 24 kHz PCM
95
+
96
+ ## License
97
+
98
+ Apache-2.0. Derived from FunAudioLLM/CosyVoice3 weights; see upstream license.
embeddings/embeddings-runtime-fp32.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:143f2698c0be3c3ef66e6e172899f1c2f99011169c405fe8d9925dff1df93203
3
+ size 568770400
embeddings/speech_embedding-fp16.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ed70f85074a2625eb86fb38c09c13b6c4ba87b48b92f345a38a8b97b48aabc1
3
+ size 12115808
manifest.json ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "cosyvoice3-coreml",
3
+ "version": "1.0.0",
4
+ "language": "zh",
5
+ "library": "fluidaudio",
6
+ "description": "CoreML conversions of CosyVoice3 Mandarin TTS (Qwen2-0.5B LLM + Flow mel generator + HiFT vocoder).",
7
+ "pipeline_tag": "text-to-speech",
8
+ "sample_rate_hz": 24000,
9
+ "compute": {
10
+ "target_platform": "Apple Silicon (M-series)",
11
+ "min_os": "macOS 14 / iOS 17",
12
+ "neural_engine": ["LLM-Prefill", "LLM-Decode", "HiFT"],
13
+ "cpu_only": ["Flow"]
14
+ },
15
+ "model_graph": {
16
+ "llm_hidden_dim": 896,
17
+ "llm_layers": 24,
18
+ "llm_query_heads": 14,
19
+ "llm_kv_heads": 2,
20
+ "llm_head_dim": 64,
21
+ "llm_text_vocab": 151936,
22
+ "speech_vocab": 6761,
23
+ "speech_sos": 6561,
24
+ "speech_eos": 6562,
25
+ "speech_task_id": 6563,
26
+ "mel_bins": 80,
27
+ "mel_hop": 480,
28
+ "mel_nfft": 1920
29
+ },
30
+ "models": [
31
+ {
32
+ "name": "LLM-Prefill-T256-M768-fp16",
33
+ "paths": {
34
+ "mlpackage": "LLM-Prefill-T256-M768-fp16.mlpackage",
35
+ "mlmodelc": "LLM-Prefill-T256-M768-fp16.mlmodelc"
36
+ },
37
+ "dtype": "fp16",
38
+ "compute_units": "cpuAndNeuralEngine",
39
+ "purpose": "Qwen2 prefill over 256-token context, initializes 768-slot KV cache.",
40
+ "size_bytes": 729042944,
41
+ "inputs": {
42
+ "inputs_embeds": "[1, 256, 896] fp16",
43
+ "attention_mask": "[1, 256] int32",
44
+ "position_ids": "[1, 256] int32"
45
+ },
46
+ "outputs": {
47
+ "logits": "[1, 256, 6761] fp16 (speech vocab)",
48
+ "kv_k_out": "[24, 1, 2, 768, 64] fp16",
49
+ "kv_v_out": "[24, 1, 2, 768, 64] fp16"
50
+ }
51
+ },
52
+ {
53
+ "name": "LLM-Decode-M768-fp16",
54
+ "paths": {
55
+ "mlpackage": "LLM-Decode-M768-fp16.mlpackage",
56
+ "mlmodelc": "LLM-Decode-M768-fp16.mlmodelc"
57
+ },
58
+ "dtype": "fp16",
59
+ "compute_units": "cpuAndNeuralEngine",
60
+ "purpose": "Single-step AR decode against a 768-slot KV cache.",
61
+ "size_bytes": 728567808,
62
+ "inputs": {
63
+ "inputs_embeds": "[1, 1, 896] fp16",
64
+ "cur_len": "[1] int32",
65
+ "kv_k_in": "[24, 1, 2, 768, 64] fp16",
66
+ "kv_v_in": "[24, 1, 2, 768, 64] fp16"
67
+ },
68
+ "outputs": {
69
+ "logits": "[1, 1, 6761] fp16",
70
+ "kv_k_out": "[24, 1, 2, 768, 64] fp16",
71
+ "kv_v_out": "[24, 1, 2, 768, 64] fp16"
72
+ }
73
+ },
74
+ {
75
+ "name": "Flow-N250-fp32",
76
+ "paths": {
77
+ "mlpackage": "Flow-N250-fp32.mlpackage",
78
+ "mlmodelc": "Flow-N250-fp32.mlmodelc"
79
+ },
80
+ "dtype": "fp32",
81
+ "compute_units": "cpuOnly",
82
+ "purpose": "Speech tokens -> 80-bin log-mel @ 24 kHz. fp16 produces NaNs on fused LayerNorm.",
83
+ "size_bytes": 1333084160,
84
+ "inputs": {
85
+ "token_total": "[1, 250] int32 (prompt_ids || new_ids, right-padded)",
86
+ "num_prompt_tokens": "[1] int32",
87
+ "num_new_tokens": "[1] int32",
88
+ "prompt_feat": "[1, 500, 80] fp32 (right-padded)",
89
+ "num_prompt_mel": "[1] int32",
90
+ "embedding": "[1, 192] fp32 (CAMPPlus speaker embedding)"
91
+ },
92
+ "outputs": {
93
+ "mel": "[1, 80, 500] fp32 (full buffer; slice to num_prompt_mel..num_prompt_mel+2*N_new)"
94
+ }
95
+ },
96
+ {
97
+ "name": "HiFT-T500-fp16",
98
+ "paths": {
99
+ "mlpackage": "HiFT-T500-fp16.mlpackage",
100
+ "mlmodelc": "HiFT-T500-fp16.mlmodelc"
101
+ },
102
+ "dtype": "fp16",
103
+ "compute_units": "cpuAndNeuralEngine",
104
+ "purpose": "Mel -> 24 kHz PCM via iSTFT-based vocoder.",
105
+ "size_bytes": 46448640,
106
+ "inputs": {
107
+ "mel": "[1, 80, 500] fp16 (right-padded)",
108
+ "num_valid_frames": "[1] int32"
109
+ },
110
+ "outputs": {
111
+ "audio": "[1, 240000] fp16 (clip to 480 * num_valid_frames samples)"
112
+ }
113
+ }
114
+ ],
115
+ "embeddings": [
116
+ {
117
+ "name": "embeddings-runtime-fp32",
118
+ "path": "embeddings/embeddings-runtime-fp32.safetensors",
119
+ "shape": [151936, 896],
120
+ "dtype": "fp32",
121
+ "size_bytes": 568770400,
122
+ "purpose": "Qwen2 model.embed_tokens.weight at post-.float() runtime dtype. Required for bit-exact parity with Python reference. Swift mmaps this file."
123
+ },
124
+ {
125
+ "name": "speech_embedding-fp16",
126
+ "path": "embeddings/speech_embedding-fp16.safetensors",
127
+ "shape": [6761, 896],
128
+ "dtype": "fp16",
129
+ "size_bytes": 12115808,
130
+ "purpose": "CosyVoice3 speech_embedding table. Row-lookup per decoded speech token in the decode loop."
131
+ }
132
+ ],
133
+ "tokenizer": {
134
+ "kind": "qwen2-bpe",
135
+ "vocab_file": "tokenizer/vocab.json",
136
+ "merges_file": "tokenizer/merges.txt",
137
+ "config_file": "tokenizer/tokenizer_config.json",
138
+ "special_tokens_file": "tokenizer/special_tokens.json",
139
+ "base_vocab_size": 151936,
140
+ "special_token_count": 281,
141
+ "special_token_id_range": [151643, 151923],
142
+ "required_tokens": {
143
+ "endofprompt": 151646,
144
+ "endoftext": 151643,
145
+ "im_start": 151644,
146
+ "im_end": 151645
147
+ }
148
+ },
149
+ "voices": [
150
+ {
151
+ "voice_id": "cosyvoice3-default-zh",
152
+ "files": {
153
+ "tensors": "voices/cosyvoice3-default-zh.safetensors",
154
+ "metadata": "voices/cosyvoice3-default-zh.json"
155
+ },
156
+ "reference_wav": "CosyVoice upstream zero_shot_prompt.wav",
157
+ "prompt_utterance": "希望你以后能够做的比我还好呦。",
158
+ "n_speech": 87,
159
+ "mel_frames": 174,
160
+ "size_bytes": 57244
161
+ }
162
+ ],
163
+ "additional_voices_repo": "FluidInference/cosyvoice3-voices-zh",
164
+ "swift": {
165
+ "library": "FluidAudio",
166
+ "manager": "CosyVoice3TtsManager",
167
+ "public_api": "synthesize(text: String, promptAssets: CosyVoice3PromptAssets) async throws -> SynthesisResult",
168
+ "default_voice": "voices/cosyvoice3-default-zh.safetensors"
169
+ },
170
+ "license": "Apache-2.0",
171
+ "upstream": "FunAudioLLM/CosyVoice3"
172
+ }
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens.json ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_start|>": 151644,
4
+ "<|im_end|>": 151645,
5
+ "<|endofprompt|>": 151646,
6
+ "[breath]": 151647,
7
+ "<strong>": 151648,
8
+ "</strong>": 151649,
9
+ "[noise]": 151650,
10
+ "[laughter]": 151651,
11
+ "[cough]": 151652,
12
+ "[clucking]": 151653,
13
+ "[accent]": 151654,
14
+ "[quick_breath]": 151655,
15
+ "<laughter>": 151656,
16
+ "</laughter>": 151657,
17
+ "[hissing]": 151658,
18
+ "[sigh]": 151659,
19
+ "[vocalized-noise]": 151660,
20
+ "[lipsmack]": 151661,
21
+ "[mn]": 151662,
22
+ "<|endofsystem|>": 151663,
23
+ "[AA]": 151664,
24
+ "[AA0]": 151665,
25
+ "[AA1]": 151666,
26
+ "[AA2]": 151667,
27
+ "[AE]": 151668,
28
+ "[AE0]": 151669,
29
+ "[AE1]": 151670,
30
+ "[AE2]": 151671,
31
+ "[AH]": 151672,
32
+ "[AH0]": 151673,
33
+ "[AH1]": 151674,
34
+ "[AH2]": 151675,
35
+ "[AO]": 151676,
36
+ "[AO0]": 151677,
37
+ "[AO1]": 151678,
38
+ "[AO2]": 151679,
39
+ "[AW]": 151680,
40
+ "[AW0]": 151681,
41
+ "[AW1]": 151682,
42
+ "[AW2]": 151683,
43
+ "[AY]": 151684,
44
+ "[AY0]": 151685,
45
+ "[AY1]": 151686,
46
+ "[AY2]": 151687,
47
+ "[B]": 151688,
48
+ "[CH]": 151689,
49
+ "[D]": 151690,
50
+ "[DH]": 151691,
51
+ "[EH]": 151692,
52
+ "[EH0]": 151693,
53
+ "[EH1]": 151694,
54
+ "[EH2]": 151695,
55
+ "[ER]": 151696,
56
+ "[ER0]": 151697,
57
+ "[ER1]": 151698,
58
+ "[ER2]": 151699,
59
+ "[EY]": 151700,
60
+ "[EY0]": 151701,
61
+ "[EY1]": 151702,
62
+ "[EY2]": 151703,
63
+ "[F]": 151704,
64
+ "[G]": 151705,
65
+ "[HH]": 151706,
66
+ "[IH]": 151707,
67
+ "[IH0]": 151708,
68
+ "[IH1]": 151709,
69
+ "[IH2]": 151710,
70
+ "[IY]": 151711,
71
+ "[IY0]": 151712,
72
+ "[IY1]": 151713,
73
+ "[IY2]": 151714,
74
+ "[JH]": 151715,
75
+ "[K]": 151716,
76
+ "[L]": 151717,
77
+ "[M]": 151718,
78
+ "[N]": 151719,
79
+ "[NG]": 151720,
80
+ "[OW]": 151721,
81
+ "[OW0]": 151722,
82
+ "[OW1]": 151723,
83
+ "[OW2]": 151724,
84
+ "[OY]": 151725,
85
+ "[OY0]": 151726,
86
+ "[OY1]": 151727,
87
+ "[OY2]": 151728,
88
+ "[P]": 151729,
89
+ "[R]": 151730,
90
+ "[S]": 151731,
91
+ "[SH]": 151732,
92
+ "[T]": 151733,
93
+ "[TH]": 151734,
94
+ "[UH]": 151735,
95
+ "[UH0]": 151736,
96
+ "[UH1]": 151737,
97
+ "[UH2]": 151738,
98
+ "[UW]": 151739,
99
+ "[UW0]": 151740,
100
+ "[UW1]": 151741,
101
+ "[UW2]": 151742,
102
+ "[V]": 151743,
103
+ "[W]": 151744,
104
+ "[Y]": 151745,
105
+ "[Z]": 151746,
106
+ "[ZH]": 151747,
107
+ "[a]": 151748,
108
+ "[ai]": 151749,
109
+ "[an]": 151750,
110
+ "[ang]": 151751,
111
+ "[ao]": 151752,
112
+ "[b]": 151753,
113
+ "[c]": 151754,
114
+ "[ch]": 151755,
115
+ "[d]": 151756,
116
+ "[e]": 151757,
117
+ "[ei]": 151758,
118
+ "[en]": 151759,
119
+ "[eng]": 151760,
120
+ "[f]": 151761,
121
+ "[g]": 151762,
122
+ "[h]": 151763,
123
+ "[i]": 151764,
124
+ "[ian]": 151765,
125
+ "[in]": 151766,
126
+ "[ing]": 151767,
127
+ "[iu]": 151768,
128
+ "[ià]": 151769,
129
+ "[iàn]": 151770,
130
+ "[iàng]": 151771,
131
+ "[iào]": 151772,
132
+ "[iá]": 151773,
133
+ "[ián]": 151774,
134
+ "[iáng]": 151775,
135
+ "[iáo]": 151776,
136
+ "[iè]": 151777,
137
+ "[ié]": 151778,
138
+ "[iòng]": 151779,
139
+ "[ióng]": 151780,
140
+ "[iù]": 151781,
141
+ "[iú]": 151782,
142
+ "[iā]": 151783,
143
+ "[iān]": 151784,
144
+ "[iāng]": 151785,
145
+ "[iāo]": 151786,
146
+ "[iē]": 151787,
147
+ "[iě]": 151788,
148
+ "[iōng]": 151789,
149
+ "[iū]": 151790,
150
+ "[iǎ]": 151791,
151
+ "[iǎn]": 151792,
152
+ "[iǎng]": 151793,
153
+ "[iǎo]": 151794,
154
+ "[iǒng]": 151795,
155
+ "[iǔ]": 151796,
156
+ "[j]": 151797,
157
+ "[k]": 151798,
158
+ "[l]": 151799,
159
+ "[m]": 151800,
160
+ "[n]": 151801,
161
+ "[o]": 151802,
162
+ "[ong]": 151803,
163
+ "[ou]": 151804,
164
+ "[p]": 151805,
165
+ "[q]": 151806,
166
+ "[r]": 151807,
167
+ "[s]": 151808,
168
+ "[sh]": 151809,
169
+ "[t]": 151810,
170
+ "[u]": 151811,
171
+ "[uang]": 151812,
172
+ "[ue]": 151813,
173
+ "[un]": 151814,
174
+ "[uo]": 151815,
175
+ "[uà]": 151816,
176
+ "[uài]": 151817,
177
+ "[uàn]": 151818,
178
+ "[uàng]": 151819,
179
+ "[uá]": 151820,
180
+ "[uái]": 151821,
181
+ "[uán]": 151822,
182
+ "[uáng]": 151823,
183
+ "[uè]": 151824,
184
+ "[ué]": 151825,
185
+ "[uì]": 151826,
186
+ "[uí]": 151827,
187
+ "[uò]": 151828,
188
+ "[uó]": 151829,
189
+ "[uā]": 151830,
190
+ "[uāi]": 151831,
191
+ "[uān]": 151832,
192
+ "[uāng]": 151833,
193
+ "[uē]": 151834,
194
+ "[uě]": 151835,
195
+ "[uī]": 151836,
196
+ "[uō]": 151837,
197
+ "[uǎ]": 151838,
198
+ "[uǎi]": 151839,
199
+ "[uǎn]": 151840,
200
+ "[uǎng]": 151841,
201
+ "[uǐ]": 151842,
202
+ "[uǒ]": 151843,
203
+ "[vè]": 151844,
204
+ "[w]": 151845,
205
+ "[x]": 151846,
206
+ "[y]": 151847,
207
+ "[z]": 151848,
208
+ "[zh]": 151849,
209
+ "[à]": 151850,
210
+ "[ài]": 151851,
211
+ "[àn]": 151852,
212
+ "[àng]": 151853,
213
+ "[ào]": 151854,
214
+ "[á]": 151855,
215
+ "[ái]": 151856,
216
+ "[án]": 151857,
217
+ "[áng]": 151858,
218
+ "[áo]": 151859,
219
+ "[è]": 151860,
220
+ "[èi]": 151861,
221
+ "[èn]": 151862,
222
+ "[èng]": 151863,
223
+ "[èr]": 151864,
224
+ "[é]": 151865,
225
+ "[éi]": 151866,
226
+ "[én]": 151867,
227
+ "[éng]": 151868,
228
+ "[ér]": 151869,
229
+ "[ì]": 151870,
230
+ "[ìn]": 151871,
231
+ "[ìng]": 151872,
232
+ "[í]": 151873,
233
+ "[ín]": 151874,
234
+ "[íng]": 151875,
235
+ "[ò]": 151876,
236
+ "[òng]": 151877,
237
+ "[òu]": 151878,
238
+ "[ó]": 151879,
239
+ "[óng]": 151880,
240
+ "[óu]": 151881,
241
+ "[ù]": 151882,
242
+ "[ùn]": 151883,
243
+ "[ú]": 151884,
244
+ "[ún]": 151885,
245
+ "[ā]": 151886,
246
+ "[āi]": 151887,
247
+ "[ān]": 151888,
248
+ "[āng]": 151889,
249
+ "[āo]": 151890,
250
+ "[ē]": 151891,
251
+ "[ēi]": 151892,
252
+ "[ēn]": 151893,
253
+ "[ēng]": 151894,
254
+ "[ě]": 151895,
255
+ "[ěi]": 151896,
256
+ "[ěn]": 151897,
257
+ "[ěng]": 151898,
258
+ "[ěr]": 151899,
259
+ "[ī]": 151900,
260
+ "[īn]": 151901,
261
+ "[īng]": 151902,
262
+ "[ō]": 151903,
263
+ "[ōng]": 151904,
264
+ "[ōu]": 151905,
265
+ "[ū]": 151906,
266
+ "[ūn]": 151907,
267
+ "[ǎ]": 151908,
268
+ "[ǎi]": 151909,
269
+ "[ǎn]": 151910,
270
+ "[ǎng]": 151911,
271
+ "[ǎo]": 151912,
272
+ "[ǐ]": 151913,
273
+ "[ǐn]": 151914,
274
+ "[ǐng]": 151915,
275
+ "[ǒ]": 151916,
276
+ "[ǒng]": 151917,
277
+ "[ǒu]": 151918,
278
+ "[ǔ]": 151919,
279
+ "[ǔn]": 151920,
280
+ "[ǘ]": 151921,
281
+ "[ǚ]": 151922,
282
+ "[ǜ]": 151923
283
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
30
+ "bos_token": null,
31
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "<|im_end|>",
34
+ "errors": "replace",
35
+ "model_max_length": 32768,
36
+ "pad_token": "<|endoftext|>",
37
+ "split_special_tokens": false,
38
+ "tokenizer_class": "Qwen2Tokenizer",
39
+ "unk_token": null
40
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
voices/cosyvoice3-default-zh.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "prompt_text": "You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。"
3
+ }
voices/cosyvoice3-default-zh.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3486fa47b1f36ca647b41592affb32d37914927161e8ce1f286107e4422e86b
3
+ size 57244