Upload 38 files
Browse files- Flow-N250-fp32.mlmodelc/analytics/coremldata.bin +3 -0
- Flow-N250-fp32.mlmodelc/coremldata.bin +3 -0
- Flow-N250-fp32.mlmodelc/model.mil +0 -0
- Flow-N250-fp32.mlmodelc/weights/weight.bin +3 -0
- Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Flow-N250-fp32.mlpackage/Manifest.json +18 -0
- HiFT-T500-fp16.mlmodelc/analytics/coremldata.bin +3 -0
- HiFT-T500-fp16.mlmodelc/coremldata.bin +3 -0
- HiFT-T500-fp16.mlmodelc/model.mil +0 -0
- HiFT-T500-fp16.mlmodelc/weights/weight.bin +3 -0
- HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- HiFT-T500-fp16.mlpackage/Manifest.json +18 -0
- LLM-Decode-M768-fp16.mlmodelc/analytics/coremldata.bin +3 -0
- LLM-Decode-M768-fp16.mlmodelc/coremldata.bin +3 -0
- LLM-Decode-M768-fp16.mlmodelc/model.mil +0 -0
- LLM-Decode-M768-fp16.mlmodelc/weights/weight.bin +3 -0
- LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- LLM-Decode-M768-fp16.mlpackage/Manifest.json +18 -0
- LLM-Prefill-T256-M768-fp16.mlmodelc/analytics/coremldata.bin +3 -0
- LLM-Prefill-T256-M768-fp16.mlmodelc/coremldata.bin +3 -0
- LLM-Prefill-T256-M768-fp16.mlmodelc/model.mil +0 -0
- LLM-Prefill-T256-M768-fp16.mlmodelc/weights/weight.bin +3 -0
- LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- LLM-Prefill-T256-M768-fp16.mlpackage/Manifest.json +18 -0
- README.md +98 -0
- embeddings/embeddings-runtime-fp32.safetensors +3 -0
- embeddings/speech_embedding-fp16.safetensors +3 -0
- manifest.json +172 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens.json +283 -0
- tokenizer/tokenizer_config.json +40 -0
- tokenizer/vocab.json +0 -0
- voices/cosyvoice3-default-zh.json +3 -0
- voices/cosyvoice3-default-zh.safetensors +3 -0
Flow-N250-fp32.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:098b273f0b0891822792c749e1d71fa660a7501a57f38e1c05f55486075f84b6
|
| 3 |
+
size 243
|
Flow-N250-fp32.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f3fec3b119e5324229c04c10c7da66db7b1de7ea9b16d99b5124ac2fa129a8c
|
| 3 |
+
size 491
|
Flow-N250-fp32.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Flow-N250-fp32.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:429480f1c2a509c2f3f612e679b113943d482409a3a4eb00fcafb95fc23b4d4c
|
| 3 |
+
size 1329136000
|
Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4af44aa2368a20aeb2700165866525cac034f54f771d3021731d1ebed36ce499
|
| 3 |
+
size 3939600
|
Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:429480f1c2a509c2f3f612e679b113943d482409a3a4eb00fcafb95fc23b4d4c
|
| 3 |
+
size 1329136000
|
Flow-N250-fp32.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"04D8665B-267F-4E5D-901F-C9EDC843DC37": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"2AB57201-1486-4222-A707-440F776BB6EC": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "2AB57201-1486-4222-A707-440F776BB6EC"
|
| 18 |
+
}
|
HiFT-T500-fp16.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d62418181d6824d86adf8345d29346f6f334751a785f00b895d90c609cfa7830
|
| 3 |
+
size 243
|
HiFT-T500-fp16.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef50842cece141ff17df5420ab0ab6daecf456e58509c2c35877989e52fc9bea
|
| 3 |
+
size 436
|
HiFT-T500-fp16.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
HiFT-T500-fp16.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17b8e2bb0a67be7c1e67fdf3cad23741b5cf353461173918040dc4d3bd8c6519
|
| 3 |
+
size 46124000
|
HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7eb104c2bc5f37a4678fae006adca5b1605f96beeb8b57d51bd402589cb8776c
|
| 3 |
+
size 316657
|
HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17b8e2bb0a67be7c1e67fdf3cad23741b5cf353461173918040dc4d3bd8c6519
|
| 3 |
+
size 46124000
|
HiFT-T500-fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"208BEE18-4DCB-466A-AFB9-22F89DEC193F": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"FA462075-CE04-4BFA-854A-FE4287ED328F": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "FA462075-CE04-4BFA-854A-FE4287ED328F"
|
| 18 |
+
}
|
LLM-Decode-M768-fp16.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:618f3e003e54e6455bf498e6947f7a3c26d17d86005b0947f907b24a629fa1cd
|
| 3 |
+
size 243
|
LLM-Decode-M768-fp16.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c55086879450735729e18522b2d229a9ecd4b81bd5357997336788cb1dc6e513
|
| 3 |
+
size 514
|
LLM-Decode-M768-fp16.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LLM-Decode-M768-fp16.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd3d0286f7645d10aedafaf3033d68412d2bd53828ff084c3322920d00efec27
|
| 3 |
+
size 727959122
|
LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3248aa37d9579596982f56c220be92380c7412807a7ab680fb37e85494fb9e3
|
| 3 |
+
size 600357
|
LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd3d0286f7645d10aedafaf3033d68412d2bd53828ff084c3322920d00efec27
|
| 3 |
+
size 727959122
|
LLM-Decode-M768-fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"533AD866-41F8-4A31-BE97-CFFFE5CAEBE1": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"C40E8821-472D-4251-B03E-0AAEAEFF2462": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "C40E8821-472D-4251-B03E-0AAEAEFF2462"
|
| 18 |
+
}
|
LLM-Prefill-T256-M768-fp16.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c8f5a18e2ca38e70ec09bb926989bf110ee28ca9486303e8498c87d9a51fdc5
|
| 3 |
+
size 243
|
LLM-Prefill-T256-M768-fp16.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:029e60d0af895b8ae43d3277707f0288dafd3397f1a1f92aa034aa327de65f17
|
| 3 |
+
size 492
|
LLM-Prefill-T256-M768-fp16.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
LLM-Prefill-T256-M768-fp16.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8f8cd89961a52aa583fc988de9c35ceee556dbfc4a1f1329a608e10086b4606
|
| 3 |
+
size 728414866
|
LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9d9b57839d2cfb7da953b485f929e8e5d3e766f10efc97ffc38addba0b9f147
|
| 3 |
+
size 620444
|
LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8f8cd89961a52aa583fc988de9c35ceee556dbfc4a1f1329a608e10086b4606
|
| 3 |
+
size 728414866
|
LLM-Prefill-T256-M768-fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"18AF7B08-983C-4F06-820F-5C60330ED316": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"C2D0C244-33DC-4550-9334-CB28AA3FFED8": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "C2D0C244-33DC-4550-9334-CB28AA3FFED8"
|
| 18 |
+
}
|
README.md
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language:
|
| 4 |
+
- zh
|
| 5 |
+
pipeline_tag: text-to-speech
|
| 6 |
+
tags:
|
| 7 |
+
- tts
|
| 8 |
+
- cosyvoice3
|
| 9 |
+
- coreml
|
| 10 |
+
- apple-silicon
|
| 11 |
+
- ane
|
| 12 |
+
- mandarin
|
| 13 |
+
library_name: fluidaudio
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# CosyVoice3 (Mandarin) — CoreML Models for FluidAudio
|
| 17 |
+
|
| 18 |
+
CoreML conversions of CosyVoice3's four inference stages, frozen to the exact
|
| 19 |
+
shapes the [FluidAudio](https://github.com/FluidInference/FluidAudio) Swift
|
| 20 |
+
package's `CosyVoice3TtsManager` loads at runtime. Targets Apple Silicon
|
| 21 |
+
(M-series) with the Neural Engine for LLM + HiFT, CPU for Flow.
|
| 22 |
+
|
| 23 |
+
A default voice ships in `voices/` so the repo is self-contained. Additional
|
| 24 |
+
voices (as they're extracted) live in the companion repo
|
| 25 |
+
`FluidInference/cosyvoice3-voices-zh`.
|
| 26 |
+
|
| 27 |
+
## Shipping configuration (frozen)
|
| 28 |
+
|
| 29 |
+
Each model is shipped in two formats: `.mlpackage` (source, portable) and
|
| 30 |
+
`.mlmodelc` (pre-compiled for macOS 14 / iOS 17 + Apple Silicon). Swift can
|
| 31 |
+
load either; `.mlmodelc` skips the one-time compile step on first use
|
| 32 |
+
(~20-30 s for Flow without it).
|
| 33 |
+
|
| 34 |
+
| Model | Compute | Purpose | dtype |
|
| 35 |
+
|---|---|---|---|
|
| 36 |
+
| `LLM-Prefill-T256-M768-fp16` | CPU + ANE | Qwen2-0.5B prefill, 256-token context, 768-slot KV cache | fp16 |
|
| 37 |
+
| `LLM-Decode-M768-fp16` | CPU + ANE | Single-step AR decode, 768-slot KV cache, 24 layers × 2 KV heads × 64 dim | fp16 |
|
| 38 |
+
| `Flow-N250-fp32` | CPU only | Speech-token → mel (80-bin, 24 kHz), N_total=250 | fp32 (fp16 NaNs on fused LayerNorm) |
|
| 39 |
+
| `HiFT-T500-fp16` | CPU + ANE | Mel → 24 kHz PCM, T=500 frames | fp16 |
|
| 40 |
+
|
| 41 |
+
Total footprint (`.mlmodelc` + `.mlpackage` + runtime tables): ~6.6 GB on
|
| 42 |
+
disk. If you only need one format, delete the other after download.
|
| 43 |
+
|
| 44 |
+
## Runtime tables
|
| 45 |
+
|
| 46 |
+
`embeddings/`
|
| 47 |
+
- `embeddings-runtime-fp32.safetensors` — 542 MB. Qwen2 `model.embed_tokens.weight`
|
| 48 |
+
at **runtime** (post-`.float()`) dtype. Required for bit-exact parity with
|
| 49 |
+
the Python reference — shipping raw `.pt` weights introduces ~4.7e-4 error
|
| 50 |
+
through the HuggingFace dtype round-trip. Swift mmaps this file.
|
| 51 |
+
- `speech_embedding-fp16.safetensors` — 12 MB. CosyVoice3 `speech_embedding`
|
| 52 |
+
table (6761 × 896 fp16); row-lookup per decoded speech token.
|
| 53 |
+
|
| 54 |
+
`voices/`
|
| 55 |
+
- `cosyvoice3-default-zh.safetensors` + `.json` — default zero-shot voice
|
| 56 |
+
bundle extracted from CosyVoice upstream `zero_shot_prompt.wav`
|
| 57 |
+
(utterance: "希望你以后能够做的比我还好呦。", N_speech = 87).
|
| 58 |
+
Schema documented in the voices repo README.
|
| 59 |
+
|
| 60 |
+
`tokenizer/`
|
| 61 |
+
- `vocab.json` + `merges.txt` + `tokenizer_config.json` — stock Qwen2 BPE
|
| 62 |
+
tokenizer assets (copied from HuggingFace `FunAudioLLM/CosyVoice-BlankEN`).
|
| 63 |
+
- `special_tokens.json` — special token → ID map with 281 entries: the 3 stock Qwen2 specials plus 278 CosyVoice3 runtime additions
|
| 64 |
+
(`<|endofprompt|>`, `[breath]`, ARPAbet phonemes, etc.). Covers IDs
|
| 65 |
+
151643..151923.
|
| 66 |
+
|
| 67 |
+
## Swift usage (FluidAudio)
|
| 68 |
+
|
| 69 |
+
```swift
|
| 70 |
+
import FluidAudio
|
| 71 |
+
|
| 72 |
+
let manager = CosyVoice3TtsManager(
|
| 73 |
+
modelsDirectory: modelsURL, // this repo root
|
| 74 |
+
tokenizerDirectory: modelsURL.appendingPathComponent("tokenizer"),
|
| 75 |
+
textEmbeddingsFile: modelsURL.appendingPathComponent("embeddings/embeddings-runtime-fp32.safetensors"),
|
| 76 |
+
specialTokensFile: modelsURL.appendingPathComponent("tokenizer/special_tokens.json"))
|
| 77 |
+
try await manager.initialize()
|
| 78 |
+
|
| 79 |
+
let prompt = try CosyVoice3PromptAssets.load(
|
| 80 |
+
from: modelsURL.appendingPathComponent("voices/cosyvoice3-default-zh.safetensors"))
|
| 81 |
+
|
| 82 |
+
let result = try await manager.synthesize(
|
| 83 |
+
text: "今天天气真的很不错,适合出门散步。",
|
| 84 |
+
promptAssets: prompt)
|
| 85 |
+
// result.samples — [Float] @ 24 kHz mono
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
## Model graph quick reference
|
| 89 |
+
|
| 90 |
+
- Qwen2 decoder: hidden=896, 24 layers, 14 Q heads, 2 KV heads, head_dim=64
|
| 91 |
+
- Speech vocab: 6761 (6561 tokens + sos/eos/task_id/stops)
|
| 92 |
+
- SOS=6561, EOS=6562, TASK_ID=6563
|
| 93 |
+
- Flow: 80-bin mel @ 24 kHz, hop=480, n_fft=1920
|
| 94 |
+
- HiFT: iSTFT-based vocoder, upsamples mel to 24 kHz PCM
|
| 95 |
+
|
| 96 |
+
## License
|
| 97 |
+
|
| 98 |
+
Apache-2.0. Derived from FunAudioLLM/CosyVoice3 weights; see upstream license.
|
embeddings/embeddings-runtime-fp32.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:143f2698c0be3c3ef66e6e172899f1c2f99011169c405fe8d9925dff1df93203
|
| 3 |
+
size 568770400
|
embeddings/speech_embedding-fp16.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ed70f85074a2625eb86fb38c09c13b6c4ba87b48b92f345a38a8b97b48aabc1
|
| 3 |
+
size 12115808
|
manifest.json
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "cosyvoice3-coreml",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"language": "zh",
|
| 5 |
+
"library": "fluidaudio",
|
| 6 |
+
"description": "CoreML conversions of CosyVoice3 Mandarin TTS (Qwen2-0.5B LLM + Flow mel generator + HiFT vocoder).",
|
| 7 |
+
"pipeline_tag": "text-to-speech",
|
| 8 |
+
"sample_rate_hz": 24000,
|
| 9 |
+
"compute": {
|
| 10 |
+
"target_platform": "Apple Silicon (M-series)",
|
| 11 |
+
"min_os": "macOS 14 / iOS 17",
|
| 12 |
+
"neural_engine": ["LLM-Prefill", "LLM-Decode", "HiFT"],
|
| 13 |
+
"cpu_only": ["Flow"]
|
| 14 |
+
},
|
| 15 |
+
"model_graph": {
|
| 16 |
+
"llm_hidden_dim": 896,
|
| 17 |
+
"llm_layers": 24,
|
| 18 |
+
"llm_query_heads": 14,
|
| 19 |
+
"llm_kv_heads": 2,
|
| 20 |
+
"llm_head_dim": 64,
|
| 21 |
+
"llm_text_vocab": 151936,
|
| 22 |
+
"speech_vocab": 6761,
|
| 23 |
+
"speech_sos": 6561,
|
| 24 |
+
"speech_eos": 6562,
|
| 25 |
+
"speech_task_id": 6563,
|
| 26 |
+
"mel_bins": 80,
|
| 27 |
+
"mel_hop": 480,
|
| 28 |
+
"mel_nfft": 1920
|
| 29 |
+
},
|
| 30 |
+
"models": [
|
| 31 |
+
{
|
| 32 |
+
"name": "LLM-Prefill-T256-M768-fp16",
|
| 33 |
+
"paths": {
|
| 34 |
+
"mlpackage": "LLM-Prefill-T256-M768-fp16.mlpackage",
|
| 35 |
+
"mlmodelc": "LLM-Prefill-T256-M768-fp16.mlmodelc"
|
| 36 |
+
},
|
| 37 |
+
"dtype": "fp16",
|
| 38 |
+
"compute_units": "cpuAndNeuralEngine",
|
| 39 |
+
"purpose": "Qwen2 prefill over 256-token context, initializes 768-slot KV cache.",
|
| 40 |
+
"size_bytes": 729042944,
|
| 41 |
+
"inputs": {
|
| 42 |
+
"inputs_embeds": "[1, 256, 896] fp16",
|
| 43 |
+
"attention_mask": "[1, 256] int32",
|
| 44 |
+
"position_ids": "[1, 256] int32"
|
| 45 |
+
},
|
| 46 |
+
"outputs": {
|
| 47 |
+
"logits": "[1, 256, 6761] fp16 (speech vocab)",
|
| 48 |
+
"kv_k_out": "[24, 1, 2, 768, 64] fp16",
|
| 49 |
+
"kv_v_out": "[24, 1, 2, 768, 64] fp16"
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"name": "LLM-Decode-M768-fp16",
|
| 54 |
+
"paths": {
|
| 55 |
+
"mlpackage": "LLM-Decode-M768-fp16.mlpackage",
|
| 56 |
+
"mlmodelc": "LLM-Decode-M768-fp16.mlmodelc"
|
| 57 |
+
},
|
| 58 |
+
"dtype": "fp16",
|
| 59 |
+
"compute_units": "cpuAndNeuralEngine",
|
| 60 |
+
"purpose": "Single-step AR decode against a 768-slot KV cache.",
|
| 61 |
+
"size_bytes": 728567808,
|
| 62 |
+
"inputs": {
|
| 63 |
+
"inputs_embeds": "[1, 1, 896] fp16",
|
| 64 |
+
"cur_len": "[1] int32",
|
| 65 |
+
"kv_k_in": "[24, 1, 2, 768, 64] fp16",
|
| 66 |
+
"kv_v_in": "[24, 1, 2, 768, 64] fp16"
|
| 67 |
+
},
|
| 68 |
+
"outputs": {
|
| 69 |
+
"logits": "[1, 1, 6761] fp16",
|
| 70 |
+
"kv_k_out": "[24, 1, 2, 768, 64] fp16",
|
| 71 |
+
"kv_v_out": "[24, 1, 2, 768, 64] fp16"
|
| 72 |
+
}
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "Flow-N250-fp32",
|
| 76 |
+
"paths": {
|
| 77 |
+
"mlpackage": "Flow-N250-fp32.mlpackage",
|
| 78 |
+
"mlmodelc": "Flow-N250-fp32.mlmodelc"
|
| 79 |
+
},
|
| 80 |
+
"dtype": "fp32",
|
| 81 |
+
"compute_units": "cpuOnly",
|
| 82 |
+
"purpose": "Speech tokens -> 80-bin log-mel @ 24 kHz. fp16 produces NaNs on fused LayerNorm.",
|
| 83 |
+
"size_bytes": 1333084160,
|
| 84 |
+
"inputs": {
|
| 85 |
+
"token_total": "[1, 250] int32 (prompt_ids || new_ids, right-padded)",
|
| 86 |
+
"num_prompt_tokens": "[1] int32",
|
| 87 |
+
"num_new_tokens": "[1] int32",
|
| 88 |
+
"prompt_feat": "[1, 500, 80] fp32 (right-padded)",
|
| 89 |
+
"num_prompt_mel": "[1] int32",
|
| 90 |
+
"embedding": "[1, 192] fp32 (CAMPPlus speaker embedding)"
|
| 91 |
+
},
|
| 92 |
+
"outputs": {
|
| 93 |
+
"mel": "[1, 80, 500] fp32 (full buffer; slice to num_prompt_mel..num_prompt_mel+2*N_new)"
|
| 94 |
+
}
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"name": "HiFT-T500-fp16",
|
| 98 |
+
"paths": {
|
| 99 |
+
"mlpackage": "HiFT-T500-fp16.mlpackage",
|
| 100 |
+
"mlmodelc": "HiFT-T500-fp16.mlmodelc"
|
| 101 |
+
},
|
| 102 |
+
"dtype": "fp16",
|
| 103 |
+
"compute_units": "cpuAndNeuralEngine",
|
| 104 |
+
"purpose": "Mel -> 24 kHz PCM via iSTFT-based vocoder.",
|
| 105 |
+
"size_bytes": 46448640,
|
| 106 |
+
"inputs": {
|
| 107 |
+
"mel": "[1, 80, 500] fp16 (right-padded)",
|
| 108 |
+
"num_valid_frames": "[1] int32"
|
| 109 |
+
},
|
| 110 |
+
"outputs": {
|
| 111 |
+
"audio": "[1, 240000] fp16 (clip to 480 * num_valid_frames samples)"
|
| 112 |
+
}
|
| 113 |
+
}
|
| 114 |
+
],
|
| 115 |
+
"embeddings": [
|
| 116 |
+
{
|
| 117 |
+
"name": "embeddings-runtime-fp32",
|
| 118 |
+
"path": "embeddings/embeddings-runtime-fp32.safetensors",
|
| 119 |
+
"shape": [151936, 896],
|
| 120 |
+
"dtype": "fp32",
|
| 121 |
+
"size_bytes": 568770400,
|
| 122 |
+
"purpose": "Qwen2 model.embed_tokens.weight at post-.float() runtime dtype. Required for bit-exact parity with Python reference. Swift mmaps this file."
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"name": "speech_embedding-fp16",
|
| 126 |
+
"path": "embeddings/speech_embedding-fp16.safetensors",
|
| 127 |
+
"shape": [6761, 896],
|
| 128 |
+
"dtype": "fp16",
|
| 129 |
+
"size_bytes": 12115808,
|
| 130 |
+
"purpose": "CosyVoice3 speech_embedding table. Row-lookup per decoded speech token in the decode loop."
|
| 131 |
+
}
|
| 132 |
+
],
|
| 133 |
+
"tokenizer": {
|
| 134 |
+
"kind": "qwen2-bpe",
|
| 135 |
+
"vocab_file": "tokenizer/vocab.json",
|
| 136 |
+
"merges_file": "tokenizer/merges.txt",
|
| 137 |
+
"config_file": "tokenizer/tokenizer_config.json",
|
| 138 |
+
"special_tokens_file": "tokenizer/special_tokens.json",
|
| 139 |
+
"base_vocab_size": 151936,
|
| 140 |
+
"special_token_count": 281,
|
| 141 |
+
"special_token_id_range": [151643, 151923],
|
| 142 |
+
"required_tokens": {
|
| 143 |
+
"endofprompt": 151646,
|
| 144 |
+
"endoftext": 151643,
|
| 145 |
+
"im_start": 151644,
|
| 146 |
+
"im_end": 151645
|
| 147 |
+
}
|
| 148 |
+
},
|
| 149 |
+
"voices": [
|
| 150 |
+
{
|
| 151 |
+
"voice_id": "cosyvoice3-default-zh",
|
| 152 |
+
"files": {
|
| 153 |
+
"tensors": "voices/cosyvoice3-default-zh.safetensors",
|
| 154 |
+
"metadata": "voices/cosyvoice3-default-zh.json"
|
| 155 |
+
},
|
| 156 |
+
"reference_wav": "CosyVoice upstream zero_shot_prompt.wav",
|
| 157 |
+
"prompt_utterance": "希望你以后能够做的比我还好呦。",
|
| 158 |
+
"n_speech": 87,
|
| 159 |
+
"mel_frames": 174,
|
| 160 |
+
"size_bytes": 57244
|
| 161 |
+
}
|
| 162 |
+
],
|
| 163 |
+
"additional_voices_repo": "FluidInference/cosyvoice3-voices-zh",
|
| 164 |
+
"swift": {
|
| 165 |
+
"library": "FluidAudio",
|
| 166 |
+
"manager": "CosyVoice3TtsManager",
|
| 167 |
+
"public_api": "synthesize(text: String, promptAssets: CosyVoice3PromptAssets) async throws -> SynthesisResult",
|
| 168 |
+
"default_voice": "voices/cosyvoice3-default-zh.safetensors"
|
| 169 |
+
},
|
| 170 |
+
"license": "Apache-2.0",
|
| 171 |
+
"upstream": "FunAudioLLM/CosyVoice3"
|
| 172 |
+
}
|
tokenizer/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer/special_tokens.json
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"<|endoftext|>": 151643,
|
| 3 |
+
"<|im_start|>": 151644,
|
| 4 |
+
"<|im_end|>": 151645,
|
| 5 |
+
"<|endofprompt|>": 151646,
|
| 6 |
+
"[breath]": 151647,
|
| 7 |
+
"<strong>": 151648,
|
| 8 |
+
"</strong>": 151649,
|
| 9 |
+
"[noise]": 151650,
|
| 10 |
+
"[laughter]": 151651,
|
| 11 |
+
"[cough]": 151652,
|
| 12 |
+
"[clucking]": 151653,
|
| 13 |
+
"[accent]": 151654,
|
| 14 |
+
"[quick_breath]": 151655,
|
| 15 |
+
"<laughter>": 151656,
|
| 16 |
+
"</laughter>": 151657,
|
| 17 |
+
"[hissing]": 151658,
|
| 18 |
+
"[sigh]": 151659,
|
| 19 |
+
"[vocalized-noise]": 151660,
|
| 20 |
+
"[lipsmack]": 151661,
|
| 21 |
+
"[mn]": 151662,
|
| 22 |
+
"<|endofsystem|>": 151663,
|
| 23 |
+
"[AA]": 151664,
|
| 24 |
+
"[AA0]": 151665,
|
| 25 |
+
"[AA1]": 151666,
|
| 26 |
+
"[AA2]": 151667,
|
| 27 |
+
"[AE]": 151668,
|
| 28 |
+
"[AE0]": 151669,
|
| 29 |
+
"[AE1]": 151670,
|
| 30 |
+
"[AE2]": 151671,
|
| 31 |
+
"[AH]": 151672,
|
| 32 |
+
"[AH0]": 151673,
|
| 33 |
+
"[AH1]": 151674,
|
| 34 |
+
"[AH2]": 151675,
|
| 35 |
+
"[AO]": 151676,
|
| 36 |
+
"[AO0]": 151677,
|
| 37 |
+
"[AO1]": 151678,
|
| 38 |
+
"[AO2]": 151679,
|
| 39 |
+
"[AW]": 151680,
|
| 40 |
+
"[AW0]": 151681,
|
| 41 |
+
"[AW1]": 151682,
|
| 42 |
+
"[AW2]": 151683,
|
| 43 |
+
"[AY]": 151684,
|
| 44 |
+
"[AY0]": 151685,
|
| 45 |
+
"[AY1]": 151686,
|
| 46 |
+
"[AY2]": 151687,
|
| 47 |
+
"[B]": 151688,
|
| 48 |
+
"[CH]": 151689,
|
| 49 |
+
"[D]": 151690,
|
| 50 |
+
"[DH]": 151691,
|
| 51 |
+
"[EH]": 151692,
|
| 52 |
+
"[EH0]": 151693,
|
| 53 |
+
"[EH1]": 151694,
|
| 54 |
+
"[EH2]": 151695,
|
| 55 |
+
"[ER]": 151696,
|
| 56 |
+
"[ER0]": 151697,
|
| 57 |
+
"[ER1]": 151698,
|
| 58 |
+
"[ER2]": 151699,
|
| 59 |
+
"[EY]": 151700,
|
| 60 |
+
"[EY0]": 151701,
|
| 61 |
+
"[EY1]": 151702,
|
| 62 |
+
"[EY2]": 151703,
|
| 63 |
+
"[F]": 151704,
|
| 64 |
+
"[G]": 151705,
|
| 65 |
+
"[HH]": 151706,
|
| 66 |
+
"[IH]": 151707,
|
| 67 |
+
"[IH0]": 151708,
|
| 68 |
+
"[IH1]": 151709,
|
| 69 |
+
"[IH2]": 151710,
|
| 70 |
+
"[IY]": 151711,
|
| 71 |
+
"[IY0]": 151712,
|
| 72 |
+
"[IY1]": 151713,
|
| 73 |
+
"[IY2]": 151714,
|
| 74 |
+
"[JH]": 151715,
|
| 75 |
+
"[K]": 151716,
|
| 76 |
+
"[L]": 151717,
|
| 77 |
+
"[M]": 151718,
|
| 78 |
+
"[N]": 151719,
|
| 79 |
+
"[NG]": 151720,
|
| 80 |
+
"[OW]": 151721,
|
| 81 |
+
"[OW0]": 151722,
|
| 82 |
+
"[OW1]": 151723,
|
| 83 |
+
"[OW2]": 151724,
|
| 84 |
+
"[OY]": 151725,
|
| 85 |
+
"[OY0]": 151726,
|
| 86 |
+
"[OY1]": 151727,
|
| 87 |
+
"[OY2]": 151728,
|
| 88 |
+
"[P]": 151729,
|
| 89 |
+
"[R]": 151730,
|
| 90 |
+
"[S]": 151731,
|
| 91 |
+
"[SH]": 151732,
|
| 92 |
+
"[T]": 151733,
|
| 93 |
+
"[TH]": 151734,
|
| 94 |
+
"[UH]": 151735,
|
| 95 |
+
"[UH0]": 151736,
|
| 96 |
+
"[UH1]": 151737,
|
| 97 |
+
"[UH2]": 151738,
|
| 98 |
+
"[UW]": 151739,
|
| 99 |
+
"[UW0]": 151740,
|
| 100 |
+
"[UW1]": 151741,
|
| 101 |
+
"[UW2]": 151742,
|
| 102 |
+
"[V]": 151743,
|
| 103 |
+
"[W]": 151744,
|
| 104 |
+
"[Y]": 151745,
|
| 105 |
+
"[Z]": 151746,
|
| 106 |
+
"[ZH]": 151747,
|
| 107 |
+
"[a]": 151748,
|
| 108 |
+
"[ai]": 151749,
|
| 109 |
+
"[an]": 151750,
|
| 110 |
+
"[ang]": 151751,
|
| 111 |
+
"[ao]": 151752,
|
| 112 |
+
"[b]": 151753,
|
| 113 |
+
"[c]": 151754,
|
| 114 |
+
"[ch]": 151755,
|
| 115 |
+
"[d]": 151756,
|
| 116 |
+
"[e]": 151757,
|
| 117 |
+
"[ei]": 151758,
|
| 118 |
+
"[en]": 151759,
|
| 119 |
+
"[eng]": 151760,
|
| 120 |
+
"[f]": 151761,
|
| 121 |
+
"[g]": 151762,
|
| 122 |
+
"[h]": 151763,
|
| 123 |
+
"[i]": 151764,
|
| 124 |
+
"[ian]": 151765,
|
| 125 |
+
"[in]": 151766,
|
| 126 |
+
"[ing]": 151767,
|
| 127 |
+
"[iu]": 151768,
|
| 128 |
+
"[ià]": 151769,
|
| 129 |
+
"[iàn]": 151770,
|
| 130 |
+
"[iàng]": 151771,
|
| 131 |
+
"[iào]": 151772,
|
| 132 |
+
"[iá]": 151773,
|
| 133 |
+
"[ián]": 151774,
|
| 134 |
+
"[iáng]": 151775,
|
| 135 |
+
"[iáo]": 151776,
|
| 136 |
+
"[iè]": 151777,
|
| 137 |
+
"[ié]": 151778,
|
| 138 |
+
"[iòng]": 151779,
|
| 139 |
+
"[ióng]": 151780,
|
| 140 |
+
"[iù]": 151781,
|
| 141 |
+
"[iú]": 151782,
|
| 142 |
+
"[iā]": 151783,
|
| 143 |
+
"[iān]": 151784,
|
| 144 |
+
"[iāng]": 151785,
|
| 145 |
+
"[iāo]": 151786,
|
| 146 |
+
"[iē]": 151787,
|
| 147 |
+
"[iě]": 151788,
|
| 148 |
+
"[iōng]": 151789,
|
| 149 |
+
"[iū]": 151790,
|
| 150 |
+
"[iǎ]": 151791,
|
| 151 |
+
"[iǎn]": 151792,
|
| 152 |
+
"[iǎng]": 151793,
|
| 153 |
+
"[iǎo]": 151794,
|
| 154 |
+
"[iǒng]": 151795,
|
| 155 |
+
"[iǔ]": 151796,
|
| 156 |
+
"[j]": 151797,
|
| 157 |
+
"[k]": 151798,
|
| 158 |
+
"[l]": 151799,
|
| 159 |
+
"[m]": 151800,
|
| 160 |
+
"[n]": 151801,
|
| 161 |
+
"[o]": 151802,
|
| 162 |
+
"[ong]": 151803,
|
| 163 |
+
"[ou]": 151804,
|
| 164 |
+
"[p]": 151805,
|
| 165 |
+
"[q]": 151806,
|
| 166 |
+
"[r]": 151807,
|
| 167 |
+
"[s]": 151808,
|
| 168 |
+
"[sh]": 151809,
|
| 169 |
+
"[t]": 151810,
|
| 170 |
+
"[u]": 151811,
|
| 171 |
+
"[uang]": 151812,
|
| 172 |
+
"[ue]": 151813,
|
| 173 |
+
"[un]": 151814,
|
| 174 |
+
"[uo]": 151815,
|
| 175 |
+
"[uà]": 151816,
|
| 176 |
+
"[uài]": 151817,
|
| 177 |
+
"[uàn]": 151818,
|
| 178 |
+
"[uàng]": 151819,
|
| 179 |
+
"[uá]": 151820,
|
| 180 |
+
"[uái]": 151821,
|
| 181 |
+
"[uán]": 151822,
|
| 182 |
+
"[uáng]": 151823,
|
| 183 |
+
"[uè]": 151824,
|
| 184 |
+
"[ué]": 151825,
|
| 185 |
+
"[uì]": 151826,
|
| 186 |
+
"[uí]": 151827,
|
| 187 |
+
"[uò]": 151828,
|
| 188 |
+
"[uó]": 151829,
|
| 189 |
+
"[uā]": 151830,
|
| 190 |
+
"[uāi]": 151831,
|
| 191 |
+
"[uān]": 151832,
|
| 192 |
+
"[uāng]": 151833,
|
| 193 |
+
"[uē]": 151834,
|
| 194 |
+
"[uě]": 151835,
|
| 195 |
+
"[uī]": 151836,
|
| 196 |
+
"[uō]": 151837,
|
| 197 |
+
"[uǎ]": 151838,
|
| 198 |
+
"[uǎi]": 151839,
|
| 199 |
+
"[uǎn]": 151840,
|
| 200 |
+
"[uǎng]": 151841,
|
| 201 |
+
"[uǐ]": 151842,
|
| 202 |
+
"[uǒ]": 151843,
|
| 203 |
+
"[vè]": 151844,
|
| 204 |
+
"[w]": 151845,
|
| 205 |
+
"[x]": 151846,
|
| 206 |
+
"[y]": 151847,
|
| 207 |
+
"[z]": 151848,
|
| 208 |
+
"[zh]": 151849,
|
| 209 |
+
"[à]": 151850,
|
| 210 |
+
"[ài]": 151851,
|
| 211 |
+
"[àn]": 151852,
|
| 212 |
+
"[àng]": 151853,
|
| 213 |
+
"[ào]": 151854,
|
| 214 |
+
"[á]": 151855,
|
| 215 |
+
"[ái]": 151856,
|
| 216 |
+
"[án]": 151857,
|
| 217 |
+
"[áng]": 151858,
|
| 218 |
+
"[áo]": 151859,
|
| 219 |
+
"[è]": 151860,
|
| 220 |
+
"[èi]": 151861,
|
| 221 |
+
"[èn]": 151862,
|
| 222 |
+
"[èng]": 151863,
|
| 223 |
+
"[èr]": 151864,
|
| 224 |
+
"[é]": 151865,
|
| 225 |
+
"[éi]": 151866,
|
| 226 |
+
"[én]": 151867,
|
| 227 |
+
"[éng]": 151868,
|
| 228 |
+
"[ér]": 151869,
|
| 229 |
+
"[ì]": 151870,
|
| 230 |
+
"[ìn]": 151871,
|
| 231 |
+
"[ìng]": 151872,
|
| 232 |
+
"[í]": 151873,
|
| 233 |
+
"[ín]": 151874,
|
| 234 |
+
"[íng]": 151875,
|
| 235 |
+
"[ò]": 151876,
|
| 236 |
+
"[òng]": 151877,
|
| 237 |
+
"[òu]": 151878,
|
| 238 |
+
"[ó]": 151879,
|
| 239 |
+
"[óng]": 151880,
|
| 240 |
+
"[óu]": 151881,
|
| 241 |
+
"[ù]": 151882,
|
| 242 |
+
"[ùn]": 151883,
|
| 243 |
+
"[ú]": 151884,
|
| 244 |
+
"[ún]": 151885,
|
| 245 |
+
"[ā]": 151886,
|
| 246 |
+
"[āi]": 151887,
|
| 247 |
+
"[ān]": 151888,
|
| 248 |
+
"[āng]": 151889,
|
| 249 |
+
"[āo]": 151890,
|
| 250 |
+
"[ē]": 151891,
|
| 251 |
+
"[ēi]": 151892,
|
| 252 |
+
"[ēn]": 151893,
|
| 253 |
+
"[ēng]": 151894,
|
| 254 |
+
"[ě]": 151895,
|
| 255 |
+
"[ěi]": 151896,
|
| 256 |
+
"[ěn]": 151897,
|
| 257 |
+
"[ěng]": 151898,
|
| 258 |
+
"[ěr]": 151899,
|
| 259 |
+
"[ī]": 151900,
|
| 260 |
+
"[īn]": 151901,
|
| 261 |
+
"[īng]": 151902,
|
| 262 |
+
"[ō]": 151903,
|
| 263 |
+
"[ōng]": 151904,
|
| 264 |
+
"[ōu]": 151905,
|
| 265 |
+
"[ū]": 151906,
|
| 266 |
+
"[ūn]": 151907,
|
| 267 |
+
"[ǎ]": 151908,
|
| 268 |
+
"[ǎi]": 151909,
|
| 269 |
+
"[ǎn]": 151910,
|
| 270 |
+
"[ǎng]": 151911,
|
| 271 |
+
"[ǎo]": 151912,
|
| 272 |
+
"[ǐ]": 151913,
|
| 273 |
+
"[ǐn]": 151914,
|
| 274 |
+
"[ǐng]": 151915,
|
| 275 |
+
"[ǒ]": 151916,
|
| 276 |
+
"[ǒng]": 151917,
|
| 277 |
+
"[ǒu]": 151918,
|
| 278 |
+
"[ǔ]": 151919,
|
| 279 |
+
"[ǔn]": 151920,
|
| 280 |
+
"[ǘ]": 151921,
|
| 281 |
+
"[ǚ]": 151922,
|
| 282 |
+
"[ǜ]": 151923
|
| 283 |
+
}
|
tokenizer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"151643": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": false,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
},
|
| 12 |
+
"151644": {
|
| 13 |
+
"content": "<|im_start|>",
|
| 14 |
+
"lstrip": false,
|
| 15 |
+
"normalized": false,
|
| 16 |
+
"rstrip": false,
|
| 17 |
+
"single_word": false,
|
| 18 |
+
"special": true
|
| 19 |
+
},
|
| 20 |
+
"151645": {
|
| 21 |
+
"content": "<|im_end|>",
|
| 22 |
+
"lstrip": false,
|
| 23 |
+
"normalized": false,
|
| 24 |
+
"rstrip": false,
|
| 25 |
+
"single_word": false,
|
| 26 |
+
"special": true
|
| 27 |
+
}
|
| 28 |
+
},
|
| 29 |
+
"additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
|
| 30 |
+
"bos_token": null,
|
| 31 |
+
"chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "<|im_end|>",
|
| 34 |
+
"errors": "replace",
|
| 35 |
+
"model_max_length": 32768,
|
| 36 |
+
"pad_token": "<|endoftext|>",
|
| 37 |
+
"split_special_tokens": false,
|
| 38 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 39 |
+
"unk_token": null
|
| 40 |
+
}
|
tokenizer/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
voices/cosyvoice3-default-zh.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"prompt_text": "You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。"
|
| 3 |
+
}
|
voices/cosyvoice3-default-zh.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3486fa47b1f36ca647b41592affb32d37914927161e8ce1f286107e4422e86b
|
| 3 |
+
size 57244
|