krcv gabrielclark3330 committed on
Commit
2dc4c4a
·
0 Parent(s):

Duplicate from Zyphra/ZONOS2

Browse files

Co-authored-by: Gabriel Clark <gabrielclark3330@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ zonos2_arlooop_animated.gif filter=lfs diff=lfs merge=lfs -text
37
+ ZONOS2[[:space:]]Blog[[:space:]]Thumbnail.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/ZONOS2BlogThumbnail.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: text-to-speech
4
+ library_name: ZONOS2
5
+ ---
6
+
7
+ # ZONOS2
8
+
9
+ <p align="center">
10
+ <img src="./assets/ZONOS2BlogThumbnail.png" alt="ZONOS2 title card" width="750" />
11
+ </p>
12
+
13
+ <div align="center">
14
+ <a href="https://discord.gg/gTW9JwST8q" target="_blank">
15
+ <img src="https://img.shields.io/badge/Join%20Our%20Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
16
+ </a>
17
+ </div>
18
+
19
+ ---
20
+
21
+
22
+ ZONOS2 is our latest text-to-speech model, trained on more than 6 million hours of varied multilingual speech. It delivers expressiveness and quality on par with—or even surpassing—top TTS providers, at low latency thanks to its mixture-of-experts (MoE) architecture. ZONOS2 excels at high-fidelity, naturalistic voice cloning.
23
+
24
+
25
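+ During inference, the input text is normalized with NeMo text normalization (TN) and encoded as UTF-8 bytes; these bytes, together with an ECAPA-TDNN speaker embedding, condition our MoE backbone, which generates DAC audio tokens. An overview of the inference pipeline is shown below.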
26
+ <p align="center">
27
+ <img src="./assets/zonos2_arlooop_animated.gif" alt="ZONOS2 inference overview animation" width="750" />
28
+ </p>
29
+
30
+ Language support is as follows.
31
+ | Tier | Languages |
32
+ | ------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
33
+ | Tier 1 | English, Mandarin Chinese, Japanese |
34
+ | Tier 2 | Korean, Russian, Italian, Portuguese, French, Spanish, Vietnamese, German, Hebrew, Dutch |
35
+ | Tier 3 | Swedish, Hindi, Tamil, Telugu, Thai, Norwegian, Bengali, Tagalog, Arabic, Danish, Indonesian, Polish, Ukrainian, Romanian, Finnish, Hungarian, Lithuanian, Estonian, Slovak, Croatian, Latvian |
36
+
37
+
38
+ For local inference we provide a high-performance TTS inference server built on [Mini-SGLang](https://github.com/sgl-project/mini-sglang).
39
+
40
+ **For more details and speech samples, check out our [blog](https://www.zyphra.com/our-work/zonos2).**
41
+
42
+ **We also have a hosted version available at [cloud.zyphra.com/audio-playground](https://cloud.zyphra.com/audio-playground).**
43
+
44
+ ---
45
+
46
+ ## Quick Start
47
+
48
+ > **Platform Support**: Linux only (x86_64). Requires an NVIDIA GPU and a CUDA toolkit that matches your driver version (run `nvidia-smi` to check).
49
+
50
+ ### 1. Installation
51
+
52
+ Requires [uv](https://docs.astral.sh/uv/getting-started/installation/).
53
+
54
+ ```bash
55
+ git clone https://github.com/Zyphra/ZONOS2.git
56
+ cd ZONOS2
57
+ uv sync
58
+ ```
59
+
60
+ ### 2. Launch the TTS Server
61
+
62
+ ```bash
63
+ uv run python -m minisgl --model-path Zyphra/ZONOS2 --tts-default-voices-dir ./default_voices/
64
+ ```
65
+
66
+ `uv run` always uses the project environment, so no venv activation is needed.
67
+
68
+ The server starts on `http://localhost:1919` by default. TTS mode is auto-detected for zonos2 models.
69
+ `--tts-default-voices-dir <folder>` pre-populates the web UI with voice-clone
70
+ speakers from disk; the folder is scanned recursively for speaker audio
71
+ (`.wav`, `.mp3`, `.flac`, `.m4a`, `.ogg`, `.opus`, `.aac`, `.webm`) and saved
72
+ embeddings (`.npy`, `.npz`). The newest voice is selected automatically on
73
+ startup.
74
+
75
+ ### 3. Generate Speech
76
+
77
+ **curl:**
78
+
79
+ ```bash
80
+ curl -X POST http://localhost:1919/tts/generate \
81
+ -H "Content-Type: application/json" \
82
+ -d '{"text": "Hello world", "stream": true}' \
83
+ --output output.pcm
84
+
85
+ # Convert to WAV
86
+ ffmpeg -f f32le -ar 44100 -ac 1 -i output.pcm output.wav
87
+ ```
88
+
89
+ **Web UI:** Open `http://localhost:1919/` in your browser.
90
+
91
+ ## Python API (offline inference)
92
+
93
+ You can also run the engine directly in a Python script, without starting a
94
+ server, via `TTSLLM`:
95
+
96
+ ```python
97
+ from minisgl.message import TTSSamplingParams
98
+ from minisgl.tts import TTSLLM
99
+
100
+ tts = TTSLLM(model_path="Zyphra/ZONOS2")
101
+
102
+ results = tts.generate(
103
+ ["Hello from the offline Python API.", "Batched prompts work too."],
104
+ TTSSamplingParams(seed=42),
105
+ )
106
+
107
+ for i, result in enumerate(results):
108
+     print(f"frames={len(result['audio_tokens'])}, eos_frame={result['eos_frame']}")
109
+     tts.save_audio(result["audio"], f"output_{i}.wav")
110
+ ```
111
+
112
+
113
+ ## Citation
114
+ If you find this model useful in an academic context, please cite it as:
115
+ ```
116
+ @misc{zyphra2025zonos,
117
+ title = {Zonos V2 Technical Report},
118
+ author = {Gabriel Clark and Sofian Mejjoute and Mohamed Osman and George Close and Beren Millidge},
119
+ year = {2026},
120
+ }
121
+ ```
assets/ZONOS2BlogThumbnail.png ADDED

Git LFS Details

  • SHA256: d9c3c09b213fe59c7bd0214a75219f37ff4cf51da45a245f4943e759fdeef47c
  • Pointer size: 131 Bytes
  • Size of remote file: 595 kB
assets/zonos2_arlooop_animated.gif ADDED

Git LFS Details

  • SHA256: 6fe9bb07bfe7651272be63beed980231b3e2c27d0dd2c7b5eb33554e5fa24900
  • Pointer size: 133 Bytes
  • Size of remote file: 14.1 MB
model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f6aa0fff9036ee44ccbc625d40aa6bdd8ea223480a5447e9f6aad70c38b6ecd
3
+ size 15336390655
params.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "zonos2",
3
+ "dtype": "bfloat16",
4
+ "n_layers": 28,
5
+ "dim": 2048,
6
+ "head_dim": 128,
7
+ "n_heads": null,
8
+ "n_kv_heads": 4,
9
+ "ffn_dim_multiplier": 1.5,
10
+ "multiple_of": 256,
11
+ "norm_eps": 1e-05,
12
+ "rope_theta": 10000.0,
13
+ "max_seqlen": 6144,
14
+ "n_codebooks": 9,
15
+ "codebook_size": 1024,
16
+ "eoa_id": 1024,
17
+ "audio_pad_id": 1025,
18
+ "text_vocab": 519,
19
+ "loss_softcap": 15.0,
20
+ "speaker_enabled": true,
21
+ "speaker_embedding_dim": 2048,
22
+ "speaker_lda_dim": 1024,
23
+ "speaker_background_token_enabled": true,
24
+ "accurate_mode_token_enabled": true,
25
+ "speaking_rate_num_buckets": 8,
26
+ "speaking_rate_buckets": ["0-8", "8-11", "11-14", "14-17", "17-21", "21-28", "28-40", "40+"],
27
+ "quality_num_buckets": 60,
28
+ "quality_features": [
29
+ "lufs",
30
+ "estimated_snr",
31
+ "max_pause",
32
+ "estimated_bandlimit_hz",
33
+ "leading_silence_s",
34
+ "trailing_silence_s"
35
+ ],
36
+ "quality_buckets": {
37
+ "lufs": ["-1000--50", "-50--45.5", "-45.5--41", "-41--36.5", "-36.5--32", "-32--27.5", "-27.5--23", "-23--18.5", "-18.5--14", "-14--9.5", "-9.5--5", "-5+"],
38
+ "estimated_snr": ["-1000-0", "0-6", "6-12", "12-18", "18-24", "24-30", "30-36", "36-42", "42-48", "48-54", "54-60", "60+"],
39
+ "max_pause": ["0-0.5", "0.5-1", "1-1.5", "1.5-2", "2-2.5", "2.5-3", "3-3.5", "3.5-4", "4-4.5", "4.5-5", "5-5.5", "5.5-6"],
40
+ "estimated_bandlimit_hz": ["495.3-3433", "3433-6371", "6371-9310", "9310-12248", "12248-15186", "15186-18124", "18124-21062", "21062-24000"],
41
+ "leading_silence_s": ["0-0.05", "0.05-0.1", "0.1-0.25", "0.25-0.5", "0.5-1", "1-2", "2-4", "4+"],
42
+ "trailing_silence_s": ["0-0.05", "0.05-0.1", "0.1-0.25", "0.25-0.5", "0.5-1", "1-2", "2-4", "4+"]
43
+ },
44
+ "quality_dropout": {
45
+ "lufs": 0.25,
46
+ "estimated_snr": 0.25,
47
+ "max_pause": 0.25,
48
+ "estimated_bandlimit_hz": 0.25,
49
+ "leading_silence_s": 0.25,
50
+ "trailing_silence_s": 0.25
51
+ },
52
+ "moe_impl": "sonic",
53
+ "moe_n_experts": 16,
54
+ "moe_router_topk": 1,
55
+ "special_topk_layers": {"26": 2},
56
+ "moe_router_dim": 128,
57
+ "moe_start_from_layer": 3,
58
+ "moe_end_from_layer": 1
59
+ }