Commit ·
2dc4c4a
0
Parent(s):
Duplicate from Zyphra/ZONOS2
Browse filesCo-authored-by: Gabriel Clark <gabrielclark3330@users.noreply.huggingface.co>
- .gitattributes +38 -0
- README.md +121 -0
- assets/ZONOS2BlogThumbnail.png +3 -0
- assets/zonos2_arlooop_animated.gif +3 -0
- model.pth +3 -0
- params.json +59 -0
.gitattributes
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
zonos2_arlooop_animated.gif filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
ZONOS2[[:space:]]Blog[[:space:]]Thumbnail.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/ZONOS2BlogThumbnail.png filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
pipeline_tag: text-to-speech
|
| 4 |
+
library_name: ZONOS2
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# ZONOS2
|
| 8 |
+
|
| 9 |
+
<p align="center">
|
| 10 |
+
<img src="./assets/ZONOS2BlogThumbnail.png" alt="ZONOS2 title card" width="750" />
|
| 11 |
+
</p>
|
| 12 |
+
|
| 13 |
+
<div align="center">
|
| 14 |
+
<a href="https://discord.gg/gTW9JwST8q" target="_blank">
|
| 15 |
+
<img src="https://img.shields.io/badge/Join%20Our%20Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
|
| 16 |
+
</a>
|
| 17 |
+
</div>
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
ZONOS2 is our latest text-to-speech model, trained on more than 6 million hours of varied multilingual speech. It delivers expressiveness and quality on par with—or even surpassing—top TTS providers, at low latency thanks to its mixture-of-experts (MoE) architecture. ZONOS2 excels at high-fidelity, naturalistic voice cloning.
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
During inference, we use NeMo text-normalized (TN) UTF-8 bytes and an ECAPA-TDNN embedding to generate DAC tokens with our MoE backbone. An overview of the inference pipeline is shown below.
|
| 26 |
+
<p align="center">
|
| 27 |
+
<img src="./assets/zonos2_arlooop_animated.gif" alt="ZONOS2 inference overview animation" width="750" />
|
| 28 |
+
</p>
|
| 29 |
+
|
| 30 |
+
Language support is as follows.
|
| 31 |
+
| Tier | Languages |
|
| 32 |
+
| ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| 33 |
+
| Tier 1 | English, Mandarin Chinese, Japanese |
|
| 34 |
+
| Tier 2 | Korean, Russian, Italian, Portuguese, French, Spanish, Vietnamese, German, Hebrew, Dutch |
|
| 35 |
+
| Tier 3 | Swedish, Hindi, Tamil, Telugu, Thai, Norwegian, Bengali, Tagalog, Arabic, Danish, Indonesian, Polish, Ukrainian, Romanian, Finnish, Hungarian, Lithuanian, Estonian, Slovak, Croatian, Latvian |
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
For local inference we provide a high-performance TTS inference server built on [Mini-SGLang](https://github.com/sgl-project/mini-sglang).
|
| 39 |
+
|
| 40 |
+
**For more details and speech samples, check out our [blog](https://www.zyphra.com/our-work/zonos2).**
|
| 41 |
+
|
| 42 |
+
**We also have a hosted version available at [cloud.zyphra.com/audio-playground](https://cloud.zyphra.com/audio-playground).**
|
| 43 |
+
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
## Quick Start
|
| 47 |
+
|
| 48 |
+
> **Platform Support**: Linux only (x86_64). Requires NVIDIA GPU with CUDA toolkit matching your driver version (`nvidia-smi` to check).
|
| 49 |
+
|
| 50 |
+
### 1. Installation
|
| 51 |
+
|
| 52 |
+
Requires [uv](https://docs.astral.sh/uv/getting-started/installation/).
|
| 53 |
+
|
| 54 |
+
```bash
|
| 55 |
+
git clone https://github.com/Zyphra/ZONOS2.git
|
| 56 |
+
cd ZONOS2
|
| 57 |
+
uv sync
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### 2. Launch the TTS Server
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
uv run python -m minisgl --model-path Zyphra/ZONOS2 --tts-default-voices-dir ./default_voices/
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
`uv run` always uses the project environment, so no venv activation is needed.
|
| 67 |
+
|
| 68 |
+
The server starts on `http://localhost:1919` by default. TTS mode is auto-detected for zonos2 models.
|
| 69 |
+
`--tts-default-voices-dir <folder>` pre-populates the web UI with voice-clone
|
| 70 |
+
speakers from disk; the folder is scanned recursively for speaker audio
|
| 71 |
+
(`.wav`, `.mp3`, `.flac`, `.m4a`, `.ogg`, `.opus`, `.aac`, `.webm`) and saved
|
| 72 |
+
embeddings (`.npy`, `.npz`). The newest voice is selected automatically on
|
| 73 |
+
startup.
|
| 74 |
+
|
| 75 |
+
### 3. Generate Speech
|
| 76 |
+
|
| 77 |
+
**curl:**
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
curl -X POST http://localhost:1919/tts/generate \
|
| 81 |
+
-H "Content-Type: application/json" \
|
| 82 |
+
-d '{"text": "Hello world", "stream": true}' \
|
| 83 |
+
--output output.pcm
|
| 84 |
+
|
| 85 |
+
# Convert to WAV
|
| 86 |
+
ffmpeg -f f32le -ar 44100 -ac 1 -i output.pcm output.wav
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
**Web UI:** Open `http://localhost:1919/` in your browser.
|
| 90 |
+
|
| 91 |
+
## Python API (offline inference)
|
| 92 |
+
|
| 93 |
+
You can also run the engine directly in a Python script, without starting a
|
| 94 |
+
server, via `TTSLLM`:
|
| 95 |
+
|
| 96 |
+
```python
|
| 97 |
+
from minisgl.message import TTSSamplingParams
|
| 98 |
+
from minisgl.tts import TTSLLM
|
| 99 |
+
|
| 100 |
+
tts = TTSLLM(model_path="Zyphra/ZONOS2")
|
| 101 |
+
|
| 102 |
+
results = tts.generate(
|
| 103 |
+
["Hello from the offline Python API.", "Batched prompts work too."],
|
| 104 |
+
TTSSamplingParams(seed=42),
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
for i, result in enumerate(results):
|
| 108 |
+
print(f"frames={len(result['audio_tokens'])}, eos_frame={result['eos_frame']}")
|
| 109 |
+
tts.save_audio(result["audio"], f"output_{i}.wav")
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
## Citation
|
| 114 |
+
If you find this model useful in an academic context, please cite it as:
|
| 115 |
+
```
|
| 116 |
+
@misc{zyphra2025zonos,
|
| 117 |
+
title = {Zonos V2 Technical Report},
|
| 118 |
+
author = {Gabriel Clark and Sofian Mejjoute and Mohamed Osman and George Close and Beren Millidge},
|
| 119 |
+
year = {2026},
|
| 120 |
+
}
|
| 121 |
+
```
|
assets/ZONOS2BlogThumbnail.png
ADDED
|
|
Git LFS Details
|
assets/zonos2_arlooop_animated.gif
ADDED
|
Git LFS Details
|
model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f6aa0fff9036ee44ccbc625d40aa6bdd8ea223480a5447e9f6aad70c38b6ecd
|
| 3 |
+
size 15336390655
|
params.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "zonos2",
|
| 3 |
+
"dtype": "bfloat16",
|
| 4 |
+
"n_layers": 28,
|
| 5 |
+
"dim": 2048,
|
| 6 |
+
"head_dim": 128,
|
| 7 |
+
"n_heads": null,
|
| 8 |
+
"n_kv_heads": 4,
|
| 9 |
+
"ffn_dim_multiplier": 1.5,
|
| 10 |
+
"multiple_of": 256,
|
| 11 |
+
"norm_eps": 1e-05,
|
| 12 |
+
"rope_theta": 10000.0,
|
| 13 |
+
"max_seqlen": 6144,
|
| 14 |
+
"n_codebooks": 9,
|
| 15 |
+
"codebook_size": 1024,
|
| 16 |
+
"eoa_id": 1024,
|
| 17 |
+
"audio_pad_id": 1025,
|
| 18 |
+
"text_vocab": 519,
|
| 19 |
+
"loss_softcap": 15.0,
|
| 20 |
+
"speaker_enabled": true,
|
| 21 |
+
"speaker_embedding_dim": 2048,
|
| 22 |
+
"speaker_lda_dim": 1024,
|
| 23 |
+
"speaker_background_token_enabled": true,
|
| 24 |
+
"accurate_mode_token_enabled": true,
|
| 25 |
+
"speaking_rate_num_buckets": 8,
|
| 26 |
+
"speaking_rate_buckets": ["0-8", "8-11", "11-14", "14-17", "17-21", "21-28", "28-40", "40+"],
|
| 27 |
+
"quality_num_buckets": 60,
|
| 28 |
+
"quality_features": [
|
| 29 |
+
"lufs",
|
| 30 |
+
"estimated_snr",
|
| 31 |
+
"max_pause",
|
| 32 |
+
"estimated_bandlimit_hz",
|
| 33 |
+
"leading_silence_s",
|
| 34 |
+
"trailing_silence_s"
|
| 35 |
+
],
|
| 36 |
+
"quality_buckets": {
|
| 37 |
+
"lufs": ["-1000--50", "-50--45.5", "-45.5--41", "-41--36.5", "-36.5--32", "-32--27.5", "-27.5--23", "-23--18.5", "-18.5--14", "-14--9.5", "-9.5--5", "-5+"],
|
| 38 |
+
"estimated_snr": ["-1000-0", "0-6", "6-12", "12-18", "18-24", "24-30", "30-36", "36-42", "42-48", "48-54", "54-60", "60+"],
|
| 39 |
+
"max_pause": ["0-0.5", "0.5-1", "1-1.5", "1.5-2", "2-2.5", "2.5-3", "3-3.5", "3.5-4", "4-4.5", "4.5-5", "5-5.5", "5.5-6"],
|
| 40 |
+
"estimated_bandlimit_hz": ["495.3-3433", "3433-6371", "6371-9310", "9310-12248", "12248-15186", "15186-18124", "18124-21062", "21062-24000"],
|
| 41 |
+
"leading_silence_s": ["0-0.05", "0.05-0.1", "0.1-0.25", "0.25-0.5", "0.5-1", "1-2", "2-4", "4+"],
|
| 42 |
+
"trailing_silence_s": ["0-0.05", "0.05-0.1", "0.1-0.25", "0.25-0.5", "0.5-1", "1-2", "2-4", "4+"]
|
| 43 |
+
},
|
| 44 |
+
"quality_dropout": {
|
| 45 |
+
"lufs": 0.25,
|
| 46 |
+
"estimated_snr": 0.25,
|
| 47 |
+
"max_pause": 0.25,
|
| 48 |
+
"estimated_bandlimit_hz": 0.25,
|
| 49 |
+
"leading_silence_s": 0.25,
|
| 50 |
+
"trailing_silence_s": 0.25
|
| 51 |
+
},
|
| 52 |
+
"moe_impl": "sonic",
|
| 53 |
+
"moe_n_experts": 16,
|
| 54 |
+
"moe_router_topk": 1,
|
| 55 |
+
"special_topk_layers": {"26": 2},
|
| 56 |
+
"moe_router_dim": 128,
|
| 57 |
+
"moe_start_from_layer": 3,
|
| 58 |
+
"moe_end_from_layer": 1
|
| 59 |
+
}
|