alexwengg committed on Apr 21

Commit

d988946

verified ·

1 Parent(s): e8cc0e1

Upload 38 files

Browse files

Files changed (38) hide show

Flow-N250-fp32.mlmodelc/analytics/coremldata.bin +3 -0
Flow-N250-fp32.mlmodelc/coremldata.bin +3 -0
Flow-N250-fp32.mlmodelc/model.mil +0 -0
Flow-N250-fp32.mlmodelc/weights/weight.bin +3 -0
Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
Flow-N250-fp32.mlpackage/Manifest.json +18 -0
HiFT-T500-fp16.mlmodelc/analytics/coremldata.bin +3 -0
HiFT-T500-fp16.mlmodelc/coremldata.bin +3 -0
HiFT-T500-fp16.mlmodelc/model.mil +0 -0
HiFT-T500-fp16.mlmodelc/weights/weight.bin +3 -0
HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
HiFT-T500-fp16.mlpackage/Manifest.json +18 -0
LLM-Decode-M768-fp16.mlmodelc/analytics/coremldata.bin +3 -0
LLM-Decode-M768-fp16.mlmodelc/coremldata.bin +3 -0
LLM-Decode-M768-fp16.mlmodelc/model.mil +0 -0
LLM-Decode-M768-fp16.mlmodelc/weights/weight.bin +3 -0
LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
LLM-Decode-M768-fp16.mlpackage/Manifest.json +18 -0
LLM-Prefill-T256-M768-fp16.mlmodelc/analytics/coremldata.bin +3 -0
LLM-Prefill-T256-M768-fp16.mlmodelc/coremldata.bin +3 -0
LLM-Prefill-T256-M768-fp16.mlmodelc/model.mil +0 -0
LLM-Prefill-T256-M768-fp16.mlmodelc/weights/weight.bin +3 -0
LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
LLM-Prefill-T256-M768-fp16.mlpackage/Manifest.json +18 -0
README.md +98 -0
embeddings/embeddings-runtime-fp32.safetensors +3 -0
embeddings/speech_embedding-fp16.safetensors +3 -0
manifest.json +172 -0
tokenizer/merges.txt +0 -0
tokenizer/special_tokens.json +283 -0
tokenizer/tokenizer_config.json +40 -0
tokenizer/vocab.json +0 -0
voices/cosyvoice3-default-zh.json +3 -0
voices/cosyvoice3-default-zh.safetensors +3 -0

Flow-N250-fp32.mlmodelc/analytics/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:098b273f0b0891822792c749e1d71fa660a7501a57f38e1c05f55486075f84b6
+size 243

Flow-N250-fp32.mlmodelc/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f3fec3b119e5324229c04c10c7da66db7b1de7ea9b16d99b5124ac2fa129a8c
+size 491

Flow-N250-fp32.mlmodelc/model.mil ADDED Viewed

The diff for this file is too large to render. See raw diff

Flow-N250-fp32.mlmodelc/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:429480f1c2a509c2f3f612e679b113943d482409a3a4eb00fcafb95fc23b4d4c
+size 1329136000

Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4af44aa2368a20aeb2700165866525cac034f54f771d3021731d1ebed36ce499
+size 3939600

Flow-N250-fp32.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:429480f1c2a509c2f3f612e679b113943d482409a3a4eb00fcafb95fc23b4d4c
+size 1329136000

Flow-N250-fp32.mlpackage/Manifest.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "fileFormatVersion": "1.0.0",
+    "itemInfoEntries": {
+        "04D8665B-267F-4E5D-901F-C9EDC843DC37": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
+        },
+        "2AB57201-1486-4222-A707-440F776BB6EC": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
+        }
+    },
+    "rootModelIdentifier": "2AB57201-1486-4222-A707-440F776BB6EC"
+}

HiFT-T500-fp16.mlmodelc/analytics/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d62418181d6824d86adf8345d29346f6f334751a785f00b895d90c609cfa7830
+size 243

HiFT-T500-fp16.mlmodelc/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef50842cece141ff17df5420ab0ab6daecf456e58509c2c35877989e52fc9bea
+size 436

HiFT-T500-fp16.mlmodelc/model.mil ADDED Viewed

The diff for this file is too large to render. See raw diff

HiFT-T500-fp16.mlmodelc/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17b8e2bb0a67be7c1e67fdf3cad23741b5cf353461173918040dc4d3bd8c6519
+size 46124000

HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7eb104c2bc5f37a4678fae006adca5b1605f96beeb8b57d51bd402589cb8776c
+size 316657

HiFT-T500-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17b8e2bb0a67be7c1e67fdf3cad23741b5cf353461173918040dc4d3bd8c6519
+size 46124000

HiFT-T500-fp16.mlpackage/Manifest.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "fileFormatVersion": "1.0.0",
+    "itemInfoEntries": {
+        "208BEE18-4DCB-466A-AFB9-22F89DEC193F": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
+        },
+        "FA462075-CE04-4BFA-854A-FE4287ED328F": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
+        }
+    },
+    "rootModelIdentifier": "FA462075-CE04-4BFA-854A-FE4287ED328F"
+}

LLM-Decode-M768-fp16.mlmodelc/analytics/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:618f3e003e54e6455bf498e6947f7a3c26d17d86005b0947f907b24a629fa1cd
+size 243

LLM-Decode-M768-fp16.mlmodelc/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c55086879450735729e18522b2d229a9ecd4b81bd5357997336788cb1dc6e513
+size 514

LLM-Decode-M768-fp16.mlmodelc/model.mil ADDED Viewed

The diff for this file is too large to render. See raw diff

LLM-Decode-M768-fp16.mlmodelc/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd3d0286f7645d10aedafaf3033d68412d2bd53828ff084c3322920d00efec27
+size 727959122

LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d3248aa37d9579596982f56c220be92380c7412807a7ab680fb37e85494fb9e3
+size 600357

LLM-Decode-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd3d0286f7645d10aedafaf3033d68412d2bd53828ff084c3322920d00efec27
+size 727959122

LLM-Decode-M768-fp16.mlpackage/Manifest.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "fileFormatVersion": "1.0.0",
+    "itemInfoEntries": {
+        "533AD866-41F8-4A31-BE97-CFFFE5CAEBE1": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
+        },
+        "C40E8821-472D-4251-B03E-0AAEAEFF2462": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
+        }
+    },
+    "rootModelIdentifier": "C40E8821-472D-4251-B03E-0AAEAEFF2462"
+}

LLM-Prefill-T256-M768-fp16.mlmodelc/analytics/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c8f5a18e2ca38e70ec09bb926989bf110ee28ca9486303e8498c87d9a51fdc5
+size 243

LLM-Prefill-T256-M768-fp16.mlmodelc/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:029e60d0af895b8ae43d3277707f0288dafd3397f1a1f92aa034aa327de65f17
+size 492

LLM-Prefill-T256-M768-fp16.mlmodelc/model.mil ADDED Viewed

The diff for this file is too large to render. See raw diff

LLM-Prefill-T256-M768-fp16.mlmodelc/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8f8cd89961a52aa583fc988de9c35ceee556dbfc4a1f1329a608e10086b4606
+size 728414866

LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9d9b57839d2cfb7da953b485f929e8e5d3e766f10efc97ffc38addba0b9f147
+size 620444

LLM-Prefill-T256-M768-fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8f8cd89961a52aa583fc988de9c35ceee556dbfc4a1f1329a608e10086b4606
+size 728414866

LLM-Prefill-T256-M768-fp16.mlpackage/Manifest.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "fileFormatVersion": "1.0.0",
+    "itemInfoEntries": {
+        "18AF7B08-983C-4F06-820F-5C60330ED316": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
+        },
+        "C2D0C244-33DC-4550-9334-CB28AA3FFED8": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
+        }
+    },
+    "rootModelIdentifier": "C2D0C244-33DC-4550-9334-CB28AA3FFED8"
+}

README.md ADDED Viewed

	@@ -0,0 +1,98 @@

+---
+license: apache-2.0
+language:
+  - zh
+pipeline_tag: text-to-speech
+tags:
+  - tts
+  - cosyvoice3
+  - coreml
+  - apple-silicon
+  - ane
+  - mandarin
+library_name: fluidaudio
+---
+# CosyVoice3 (Mandarin) — CoreML Models for FluidAudio
+CoreML conversions of CosyVoice3's four inference stages, frozen to the exact
+shapes the [FluidAudio](https://github.com/FluidInference/FluidAudio) Swift
+package's `CosyVoice3TtsManager` loads at runtime. Targets Apple Silicon
+(M-series) with the Neural Engine for LLM + HiFT, CPU for Flow.
+A default voice ships in `voices/` so the repo is self-contained. Additional
+voices (as they're extracted) live in the companion repo
+`FluidInference/cosyvoice3-voices-zh`.
+## Shipping configuration (frozen)
+Each model is shipped in two formats: `.mlpackage` (source, portable) and
+`.mlmodelc` (pre-compiled for macOS 14 / iOS 17 + Apple Silicon). Swift can
+load either; `.mlmodelc` skips the one-time compile step on first use
+(~20-30 s for Flow without it).
+| Model | Compute | Purpose | dtype |
+|---|---|---|---|
+| `LLM-Prefill-T256-M768-fp16` | CPU + ANE | Qwen2-0.5B prefill, 256-token context, 768-slot KV cache | fp16 |
+| `LLM-Decode-M768-fp16` | CPU + ANE | Single-step AR decode, 768-slot KV cache, 24 layers × 2 KV heads × 64 dim | fp16 |
+| `Flow-N250-fp32` | CPU only | Speech-token → mel (80-bin, 24 kHz), N_total=250 | fp32 (fp16 produces NaNs in the fused LayerNorm) |
+| `HiFT-T500-fp16` | CPU + ANE | Mel → 24 kHz PCM, T=500 frames | fp16 |
+Total disk footprint (`.mlmodelc` + `.mlpackage` + runtime tables): ~6.6 GB.
+If you only need one format, delete the other after download.
+## Runtime tables
+`embeddings/`
+- `embeddings-runtime-fp32.safetensors` — 542 MB. Qwen2 `model.embed_tokens.weight`
+  at **runtime** (post-`.float()`) dtype. Required for bit-exact parity with
+  the Python reference — shipping raw `.pt` weights introduces ~4.7e-4 error
+  through the HuggingFace dtype round-trip. Swift mmaps this file.
+- `speech_embedding-fp16.safetensors` — 12 MB. CosyVoice3 `speech_embedding`
+  table (6761 × 896 fp16); row-lookup per decoded speech token.
+`voices/`
+- `cosyvoice3-default-zh.safetensors` + `.json` — default zero-shot voice
+  bundle extracted from CosyVoice upstream `zero_shot_prompt.wav`
+  (utterance: "希望你以后能够做的比我还好呦。", N_speech = 87).
+  Schema documented in the voices repo README.
+`tokenizer/`
+- `vocab.json` + `merges.txt` + `tokenizer_config.json` — stock Qwen2 BPE
+  tokenizer assets (copied from HuggingFace `FunAudioLLM/CosyVoice-BlankEN`).
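+- `special_tokens.json` — special-token → ID map with 281 entries: the stock
+  Qwen2 control tokens (`<|endoftext|>`, `<|im_start|>`, `<|im_end|>`) plus the
+  tokens CosyVoice3 adds at runtime (`<|endofprompt|>`, `[breath]`, ARPAbet
+  phonemes, pinyin finals, etc.). Covers IDs 151643..151923 inclusive.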
+## Swift usage (FluidAudio)
+```swift
+import FluidAudio
+let manager = CosyVoice3TtsManager(
+    modelsDirectory:     modelsURL,                            // this repo root
+    tokenizerDirectory:  modelsURL.appendingPathComponent("tokenizer"),
+    textEmbeddingsFile:  modelsURL.appendingPathComponent("embeddings/embeddings-runtime-fp32.safetensors"),
+    specialTokensFile:   modelsURL.appendingPathComponent("tokenizer/special_tokens.json"))
+try await manager.initialize()
+let voiceURL = modelsURL.appendingPathComponent("voices")  // default voice ships in voices/
+let prompt = try CosyVoice3PromptAssets.load(
+    from: voiceURL.appendingPathComponent("cosyvoice3-default-zh.safetensors"))
+let result = try await manager.synthesize(
+    text: "今天天气真的很不错，适合出门散步。",
+    promptAssets: prompt)
+// result.samples — [Float] @ 24 kHz mono
+```
+## Model graph quick reference
+- Qwen2 decoder: hidden=896, 24 layers, 14 Q heads, 2 KV heads, head_dim=64
+- Speech vocab: 6761 (6561 tokens + sos/eos/task_id/stops)
+- SOS=6561, EOS=6562, TASK_ID=6563
+- Flow: 80-bin mel @ 24 kHz, hop=480, n_fft=1920
+- HiFT: iSTFT-based vocoder, upsamples mel to 24 kHz PCM
+## License
+Apache-2.0. Derived from FunAudioLLM/CosyVoice3 weights; see upstream license.

embeddings/embeddings-runtime-fp32.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:143f2698c0be3c3ef66e6e172899f1c2f99011169c405fe8d9925dff1df93203
+size 568770400

embeddings/speech_embedding-fp16.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ed70f85074a2625eb86fb38c09c13b6c4ba87b48b92f345a38a8b97b48aabc1
+size 12115808

manifest.json ADDED Viewed

	@@ -0,0 +1,172 @@

+{
+  "name": "cosyvoice3-coreml",
+  "version": "1.0.0",
+  "language": "zh",
+  "library": "fluidaudio",
+  "description": "CoreML conversions of CosyVoice3 Mandarin TTS (Qwen2-0.5B LLM + Flow mel generator + HiFT vocoder).",
+  "pipeline_tag": "text-to-speech",
+  "sample_rate_hz": 24000,
+  "compute": {
+    "target_platform": "Apple Silicon (M-series)",
+    "min_os": "macOS 14 / iOS 17",
+    "neural_engine": ["LLM-Prefill", "LLM-Decode", "HiFT"],
+    "cpu_only": ["Flow"]
+  },
+  "model_graph": {
+    "llm_hidden_dim": 896,
+    "llm_layers": 24,
+    "llm_query_heads": 14,
+    "llm_kv_heads": 2,
+    "llm_head_dim": 64,
+    "llm_text_vocab": 151936,
+    "speech_vocab": 6761,
+    "speech_sos": 6561,
+    "speech_eos": 6562,
+    "speech_task_id": 6563,
+    "mel_bins": 80,
+    "mel_hop": 480,
+    "mel_nfft": 1920
+  },
+  "models": [
+    {
+      "name": "LLM-Prefill-T256-M768-fp16",
+      "paths": {
+        "mlpackage": "LLM-Prefill-T256-M768-fp16.mlpackage",
+        "mlmodelc":  "LLM-Prefill-T256-M768-fp16.mlmodelc"
+      },
+      "dtype": "fp16",
+      "compute_units": "cpuAndNeuralEngine",
+      "purpose": "Qwen2 prefill over 256-token context, initializes 768-slot KV cache.",
+      "size_bytes": 729042944,
+      "inputs": {
+        "inputs_embeds": "[1, 256, 896] fp16",
+        "attention_mask": "[1, 256] int32",
+        "position_ids": "[1, 256] int32"
+      },
+      "outputs": {
+        "logits": "[1, 256, 6761] fp16 (speech vocab)",
+        "kv_k_out": "[24, 1, 2, 768, 64] fp16",
+        "kv_v_out": "[24, 1, 2, 768, 64] fp16"
+      }
+    },
+    {
+      "name": "LLM-Decode-M768-fp16",
+      "paths": {
+        "mlpackage": "LLM-Decode-M768-fp16.mlpackage",
+        "mlmodelc":  "LLM-Decode-M768-fp16.mlmodelc"
+      },
+      "dtype": "fp16",
+      "compute_units": "cpuAndNeuralEngine",
+      "purpose": "Single-step AR decode against a 768-slot KV cache.",
+      "size_bytes": 728567808,
+      "inputs": {
+        "inputs_embeds": "[1, 1, 896] fp16",
+        "cur_len": "[1] int32",
+        "kv_k_in": "[24, 1, 2, 768, 64] fp16",
+        "kv_v_in": "[24, 1, 2, 768, 64] fp16"
+      },
+      "outputs": {
+        "logits": "[1, 1, 6761] fp16",
+        "kv_k_out": "[24, 1, 2, 768, 64] fp16",
+        "kv_v_out": "[24, 1, 2, 768, 64] fp16"
+      }
+    },
+    {
+      "name": "Flow-N250-fp32",
+      "paths": {
+        "mlpackage": "Flow-N250-fp32.mlpackage",
+        "mlmodelc":  "Flow-N250-fp32.mlmodelc"
+      },
+      "dtype": "fp32",
+      "compute_units": "cpuOnly",
+      "purpose": "Speech tokens -> 80-bin log-mel @ 24 kHz. fp16 produces NaNs on fused LayerNorm.",
+      "size_bytes": 1333084160,
+      "inputs": {
+        "token_total": "[1, 250] int32 (prompt_ids || new_ids, right-padded)",
+        "num_prompt_tokens": "[1] int32",
+        "num_new_tokens": "[1] int32",
+        "prompt_feat": "[1, 500, 80] fp32 (right-padded)",
+        "num_prompt_mel": "[1] int32",
+        "embedding": "[1, 192] fp32 (CAMPPlus speaker embedding)"
+      },
+      "outputs": {
+        "mel": "[1, 80, 500] fp32 (full buffer; slice to num_prompt_mel..num_prompt_mel+2*N_new)"
+      }
+    },
+    {
+      "name": "HiFT-T500-fp16",
+      "paths": {
+        "mlpackage": "HiFT-T500-fp16.mlpackage",
+        "mlmodelc":  "HiFT-T500-fp16.mlmodelc"
+      },
+      "dtype": "fp16",
+      "compute_units": "cpuAndNeuralEngine",
+      "purpose": "Mel -> 24 kHz PCM via iSTFT-based vocoder.",
+      "size_bytes": 46448640,
+      "inputs": {
+        "mel": "[1, 80, 500] fp16 (right-padded)",
+        "num_valid_frames": "[1] int32"
+      },
+      "outputs": {
+        "audio": "[1, 240000] fp16 (clip to 480 * num_valid_frames samples)"
+      }
+    }
+  ],
+  "embeddings": [
+    {
+      "name": "embeddings-runtime-fp32",
+      "path": "embeddings/embeddings-runtime-fp32.safetensors",
+      "shape": [151936, 896],
+      "dtype": "fp32",
+      "size_bytes": 568770400,
+      "purpose": "Qwen2 model.embed_tokens.weight at post-.float() runtime dtype. Required for bit-exact parity with Python reference. Swift mmaps this file."
+    },
+    {
+      "name": "speech_embedding-fp16",
+      "path": "embeddings/speech_embedding-fp16.safetensors",
+      "shape": [6761, 896],
+      "dtype": "fp16",
+      "size_bytes": 12115808,
+      "purpose": "CosyVoice3 speech_embedding table. Row-lookup per decoded speech token in the decode loop."
+    }
+  ],
+  "tokenizer": {
+    "kind": "qwen2-bpe",
+    "vocab_file": "tokenizer/vocab.json",
+    "merges_file": "tokenizer/merges.txt",
+    "config_file": "tokenizer/tokenizer_config.json",
+    "special_tokens_file": "tokenizer/special_tokens.json",
+    "base_vocab_size": 151936,
+    "special_token_count": 281,
+    "special_token_id_range": [151643, 151923],
+    "required_tokens": {
+      "endofprompt": 151646,
+      "endoftext": 151643,
+      "im_start": 151644,
+      "im_end": 151645
+    }
+  },
+  "voices": [
+    {
+      "voice_id": "cosyvoice3-default-zh",
+      "files": {
+        "tensors":  "voices/cosyvoice3-default-zh.safetensors",
+        "metadata": "voices/cosyvoice3-default-zh.json"
+      },
+      "reference_wav": "CosyVoice upstream zero_shot_prompt.wav",
+      "prompt_utterance": "希望你以后能够做的比我还好呦。",
+      "n_speech": 87,
+      "mel_frames": 174,
+      "size_bytes": 57244
+    }
+  ],
+  "additional_voices_repo": "FluidInference/cosyvoice3-voices-zh",
+  "swift": {
+    "library": "FluidAudio",
+    "manager": "CosyVoice3TtsManager",
+    "public_api": "synthesize(text: String, promptAssets: CosyVoice3PromptAssets) async throws -> SynthesisResult",
+    "default_voice": "voices/cosyvoice3-default-zh.safetensors"
+  },
+  "license": "Apache-2.0",
+  "upstream": "FunAudioLLM/CosyVoice3"
+}

tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/special_tokens.json ADDED Viewed

	@@ -0,0 +1,283 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_start|>": 151644,
+  "<|im_end|>": 151645,
+  "<|endofprompt|>": 151646,
+  "[breath]": 151647,
+  "<strong>": 151648,
+  "</strong>": 151649,
+  "[noise]": 151650,
+  "[laughter]": 151651,
+  "[cough]": 151652,
+  "[clucking]": 151653,
+  "[accent]": 151654,
+  "[quick_breath]": 151655,
+  "<laughter>": 151656,
+  "</laughter>": 151657,
+  "[hissing]": 151658,
+  "[sigh]": 151659,
+  "[vocalized-noise]": 151660,
+  "[lipsmack]": 151661,
+  "[mn]": 151662,
+  "<|endofsystem|>": 151663,
+  "[AA]": 151664,
+  "[AA0]": 151665,
+  "[AA1]": 151666,
+  "[AA2]": 151667,
+  "[AE]": 151668,
+  "[AE0]": 151669,
+  "[AE1]": 151670,
+  "[AE2]": 151671,
+  "[AH]": 151672,
+  "[AH0]": 151673,
+  "[AH1]": 151674,
+  "[AH2]": 151675,
+  "[AO]": 151676,
+  "[AO0]": 151677,
+  "[AO1]": 151678,
+  "[AO2]": 151679,
+  "[AW]": 151680,
+  "[AW0]": 151681,
+  "[AW1]": 151682,
+  "[AW2]": 151683,
+  "[AY]": 151684,
+  "[AY0]": 151685,
+  "[AY1]": 151686,
+  "[AY2]": 151687,
+  "[B]": 151688,
+  "[CH]": 151689,
+  "[D]": 151690,
+  "[DH]": 151691,
+  "[EH]": 151692,
+  "[EH0]": 151693,
+  "[EH1]": 151694,
+  "[EH2]": 151695,
+  "[ER]": 151696,
+  "[ER0]": 151697,
+  "[ER1]": 151698,
+  "[ER2]": 151699,
+  "[EY]": 151700,
+  "[EY0]": 151701,
+  "[EY1]": 151702,
+  "[EY2]": 151703,
+  "[F]": 151704,
+  "[G]": 151705,
+  "[HH]": 151706,
+  "[IH]": 151707,
+  "[IH0]": 151708,
+  "[IH1]": 151709,
+  "[IH2]": 151710,
+  "[IY]": 151711,
+  "[IY0]": 151712,
+  "[IY1]": 151713,
+  "[IY2]": 151714,
+  "[JH]": 151715,
+  "[K]": 151716,
+  "[L]": 151717,
+  "[M]": 151718,
+  "[N]": 151719,
+  "[NG]": 151720,
+  "[OW]": 151721,
+  "[OW0]": 151722,
+  "[OW1]": 151723,
+  "[OW2]": 151724,
+  "[OY]": 151725,
+  "[OY0]": 151726,
+  "[OY1]": 151727,
+  "[OY2]": 151728,
+  "[P]": 151729,
+  "[R]": 151730,
+  "[S]": 151731,
+  "[SH]": 151732,
+  "[T]": 151733,
+  "[TH]": 151734,
+  "[UH]": 151735,
+  "[UH0]": 151736,
+  "[UH1]": 151737,
+  "[UH2]": 151738,
+  "[UW]": 151739,
+  "[UW0]": 151740,
+  "[UW1]": 151741,
+  "[UW2]": 151742,
+  "[V]": 151743,
+  "[W]": 151744,
+  "[Y]": 151745,
+  "[Z]": 151746,
+  "[ZH]": 151747,
+  "[a]": 151748,
+  "[ai]": 151749,
+  "[an]": 151750,
+  "[ang]": 151751,
+  "[ao]": 151752,
+  "[b]": 151753,
+  "[c]": 151754,
+  "[ch]": 151755,
+  "[d]": 151756,
+  "[e]": 151757,
+  "[ei]": 151758,
+  "[en]": 151759,
+  "[eng]": 151760,
+  "[f]": 151761,
+  "[g]": 151762,
+  "[h]": 151763,
+  "[i]": 151764,
+  "[ian]": 151765,
+  "[in]": 151766,
+  "[ing]": 151767,
+  "[iu]": 151768,
+  "[ià]": 151769,
+  "[iàn]": 151770,
+  "[iàng]": 151771,
+  "[iào]": 151772,
+  "[iá]": 151773,
+  "[ián]": 151774,
+  "[iáng]": 151775,
+  "[iáo]": 151776,
+  "[iè]": 151777,
+  "[ié]": 151778,
+  "[iòng]": 151779,
+  "[ióng]": 151780,
+  "[iù]": 151781,
+  "[iú]": 151782,
+  "[iā]": 151783,
+  "[iān]": 151784,
+  "[iāng]": 151785,
+  "[iāo]": 151786,
+  "[iē]": 151787,
+  "[iě]": 151788,
+  "[iōng]": 151789,
+  "[iū]": 151790,
+  "[iǎ]": 151791,
+  "[iǎn]": 151792,
+  "[iǎng]": 151793,
+  "[iǎo]": 151794,
+  "[iǒng]": 151795,
+  "[iǔ]": 151796,
+  "[j]": 151797,
+  "[k]": 151798,
+  "[l]": 151799,
+  "[m]": 151800,
+  "[n]": 151801,
+  "[o]": 151802,
+  "[ong]": 151803,
+  "[ou]": 151804,
+  "[p]": 151805,
+  "[q]": 151806,
+  "[r]": 151807,
+  "[s]": 151808,
+  "[sh]": 151809,
+  "[t]": 151810,
+  "[u]": 151811,
+  "[uang]": 151812,
+  "[ue]": 151813,
+  "[un]": 151814,
+  "[uo]": 151815,
+  "[uà]": 151816,
+  "[uài]": 151817,
+  "[uàn]": 151818,
+  "[uàng]": 151819,
+  "[uá]": 151820,
+  "[uái]": 151821,
+  "[uán]": 151822,
+  "[uáng]": 151823,
+  "[uè]": 151824,
+  "[ué]": 151825,
+  "[uì]": 151826,
+  "[uí]": 151827,
+  "[uò]": 151828,
+  "[uó]": 151829,
+  "[uā]": 151830,
+  "[uāi]": 151831,
+  "[uān]": 151832,
+  "[uāng]": 151833,
+  "[uē]": 151834,
+  "[uě]": 151835,
+  "[uī]": 151836,
+  "[uō]": 151837,
+  "[uǎ]": 151838,
+  "[uǎi]": 151839,
+  "[uǎn]": 151840,
+  "[uǎng]": 151841,
+  "[uǐ]": 151842,
+  "[uǒ]": 151843,
+  "[vè]": 151844,
+  "[w]": 151845,
+  "[x]": 151846,
+  "[y]": 151847,
+  "[z]": 151848,
+  "[zh]": 151849,
+  "[à]": 151850,
+  "[ài]": 151851,
+  "[àn]": 151852,
+  "[àng]": 151853,
+  "[ào]": 151854,
+  "[á]": 151855,
+  "[ái]": 151856,
+  "[án]": 151857,
+  "[áng]": 151858,
+  "[áo]": 151859,
+  "[è]": 151860,
+  "[èi]": 151861,
+  "[èn]": 151862,
+  "[èng]": 151863,
+  "[èr]": 151864,
+  "[é]": 151865,
+  "[éi]": 151866,
+  "[én]": 151867,
+  "[éng]": 151868,
+  "[ér]": 151869,
+  "[ì]": 151870,
+  "[ìn]": 151871,
+  "[ìng]": 151872,
+  "[í]": 151873,
+  "[ín]": 151874,
+  "[íng]": 151875,
+  "[ò]": 151876,
+  "[òng]": 151877,
+  "[òu]": 151878,
+  "[ó]": 151879,
+  "[óng]": 151880,
+  "[óu]": 151881,
+  "[ù]": 151882,
+  "[ùn]": 151883,
+  "[ú]": 151884,
+  "[ún]": 151885,
+  "[ā]": 151886,
+  "[āi]": 151887,
+  "[ān]": 151888,
+  "[āng]": 151889,
+  "[āo]": 151890,
+  "[ē]": 151891,
+  "[ēi]": 151892,
+  "[ēn]": 151893,
+  "[ēng]": 151894,
+  "[ě]": 151895,
+  "[ěi]": 151896,
+  "[ěn]": 151897,
+  "[ěng]": 151898,
+  "[ěr]": 151899,
+  "[ī]": 151900,
+  "[īn]": 151901,
+  "[īng]": 151902,
+  "[ō]": 151903,
+  "[ōng]": 151904,
+  "[ōu]": 151905,
+  "[ū]": 151906,
+  "[ūn]": 151907,
+  "[ǎ]": 151908,
+  "[ǎi]": 151909,
+  "[ǎn]": 151910,
+  "[ǎng]": 151911,
+  "[ǎo]": 151912,
+  "[ǐ]": 151913,
+  "[ǐn]": 151914,
+  "[ǐng]": 151915,
+  "[ǒ]": 151916,
+  "[ǒng]": 151917,
+  "[ǒu]": 151918,
+  "[ǔ]": 151919,
+  "[ǔn]": 151920,
+  "[ǘ]": 151921,
+  "[ǚ]": 151922,
+  "[ǜ]": 151923
+}

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voices/cosyvoice3-default-zh.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "prompt_text": "You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。"
+}

voices/cosyvoice3-default-zh.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3486fa47b1f36ca647b41592affb32d37914927161e8ce1f286107e4422e86b
+size 57244