ailuntz committed on
Commit
100bd71
·
verified ·
1 Parent(s): 632cd07

Add Supertonic 3 MLX metadata and graph topology

Browse files
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
  *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.npz filter=lfs diff=lfs merge=lfs -text
2
+ *.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ base_model: Supertone/supertonic-3
4
+ library_name: mlx
5
+ tags:
6
+ - mlx
7
+ - text-to-speech
8
+ - on-device
9
+ - audio
10
+ ---
11
+
12
+ # Supertonic 3 MLX
13
+
14
+ This repository contains a community MLX conversion of [`Supertone/supertonic-3`](https://huggingface.co/Supertone/supertonic-3).
15
+
16
+ The original ONNX graphs are converted into JSON topology plus NPZ initializers. Inference is executed with MLX arrays through the Supertonic-specific graph runtime in [`ailuntx/supertonic`](https://github.com/ailuntx/supertonic).
17
+
18
+ ```bash
19
+ git clone https://github.com/ailuntx/supertonic
20
+ cd supertonic
21
+ python scripts/infer_mlx.py \
22
+ --model /path/to/supertonic-3 \
23
+ --text "Supertonic 3 is running with MLX." \
24
+ --lang en \
25
+ --voice M1 \
26
+ --total-step 8 \
27
+ --output output.wav
28
+ ```
29
+
30
+ The MLX graph runtime has been checked against ONNX Runtime on the official assets; per-stage maximum absolute errors are around `1e-5`.
31
+
32
+ ## Original Model Card
33
+
34
+ ---
35
+ license: openrail
36
+ language:
37
+ - en
38
+ - ko
39
+ - ja
40
+ - ar
41
+ - bg
42
+ - cs
43
+ - da
44
+ - de
45
+ - el
46
+ - es
47
+ - et
48
+ - fi
49
+ - fr
50
+ - hi
51
+ - hr
52
+ - hu
53
+ - id
54
+ - it
55
+ - lt
56
+ - lv
57
+ - nl
58
+ - pl
59
+ - pt
60
+ - ro
61
+ - ru
62
+ - sk
63
+ - sl
64
+ - sv
65
+ - tr
66
+ - uk
67
+ - vi
68
+ pipeline_tag: text-to-speech
69
+ tags:
70
+ - text-to-speech
71
+ - speech-synthesis
72
+ - tts
73
+ - onnx
74
+ - multilingual
75
+ - on-device
76
+ library_name: supertonic
77
+ ---
78
+
79
+ # Supertonic 3 | Lightning Fast, On-Device, Accurate TTS
80
+
81
+ ![Supertonic 3 Preview](img/Supertonic3_HeroImage.png)
82
+
83
+ <p align="center">
84
+ <a href="https://huggingface.co/spaces/Supertone/supertonic-3"><img src="https://img.shields.io/badge/Demo-Hugging_Face-yellow?style=for-the-badge" alt="Demo"></a>
85
+ <a href="https://github.com/supertone-inc/supertonic"><img src="https://img.shields.io/badge/Code-GitHub-black?style=for-the-badge&logo=github" alt="Code"></a>
86
+ <a href="https://pypi.org/project/supertonic/"><img src="https://img.shields.io/badge/Python-SDK-blue?style=for-the-badge&logo=python" alt="Python SDK"></a>
87
+ </p>
88
+
89
+ **Supertonic** is a lightweight text-to-speech system for local inference. It runs with ONNX Runtime entirely on your device, with no cloud call required for synthesis.
90
+
91
+ **Supertonic 3** expands the open-weight release from 5 to **31 languages**, improves reading stability, and reduces repeat/skip failures.
92
+
93
+ ## Quick Start
94
+
95
+ Install the Python SDK and generate speech immediately. On first run, the SDK downloads the model assets from Hugging Face.
96
+
97
+ ```bash
98
+ pip install supertonic
99
+ ```
100
+
101
+ ```python
102
+ from supertonic import TTS
103
+
104
+ tts = TTS(auto_download=True)
105
+ style = tts.get_voice_style(voice_name="M1")
106
+
107
+ text = "A gentle breeze moved through the open window while everyone listened to the story."
108
+ wav, duration = tts.synthesize(text, voice_style=style, lang="en")
109
+
110
+ tts.save_audio(wav, "output.wav")
111
+ print(f"Generated {duration:.2f}s of audio")
112
+ ```
113
+
114
+ ## What's New in Supertonic 3
115
+
116
+ - **31 languages**: expanded from the 5-language Supertonic 2 release.
117
+ - **More stable reading**: fewer repeat and skip failures, especially on short and long utterances.
118
+ - **Higher speaker similarity**: improved similarity across the shared-language set compared with Supertonic 2.
119
+ - **Expression tags**: supports simple tags such as `<laugh>`, `<breath>`, and `<sigh>`.
120
+
121
+ ## Custom Voices and Audio Samples
122
+
123
+ The open-weight package includes fixed preset voice styles for immediate local inference. If you want to hear how Supertonic 3 performs with zero-shot custom voice styles, visit the [Audio Sample Demo](https://supertonic3.github.io/) to compare reference audio and generated speech across several use cases. To create your own Supertonic 3 voice-style JSON from reference audio, use [Supertonic Voice Builder](https://supertonic.supertone.ai/voice-builder); purchased Voice Builder styles include downloadable embeddings for both Supertonic 2 and Supertonic 3.
124
+
125
+ Here are a few reference/generated pairs from the audio sample demo:
126
+
127
+ **Call center, English**
128
+ Text: Good morning, thank you for calling. How can I help you today?
129
+
130
+ | Reference voice | Supertonic 3 output |
131
+ |---|---|
132
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/nora_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/nora_supertonic3.wav"></audio> |
133
+
134
+ **Character voice, Japanese**
135
+ Text: ふふっ、退屈してたところなの。ちょうどいい遊び相手、見つけたかも♪
136
+
137
+ | Reference voice | Supertonic 3 output |
138
+ |---|---|
139
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/moka_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/moka_supertonic3.wav"></audio> |
140
+
141
+ **Elder character voice, Korean**
142
+ Text: 혼자 떠나기엔 길이 험하구나. 이 낡은 검을 가져가거라. 언젠가 어둠이 네 이름을 부르더라도, 부디 빛을 잊지 말거라.
143
+
144
+ | Reference voice | Supertonic 3 output |
145
+ |---|---|
146
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/alphonse_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/alphonse_supertonic3.wav"></audio> |
147
+
148
+ **Audiobook, English**
149
+ Text: I was not afraid of silence. I had lived with it long enough to know that, sometimes, it speaks more honestly than people do.
150
+
151
+ | Reference voice | Supertonic 3 output |
152
+ |---|---|
153
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/luna_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/luna_supertonic3.wav"></audio> |
154
+
155
+ **Audiobook, Japanese**
156
+ Text: その朝、ロンドンの霧はいつになく低く垂れこめていた。私はただの訪問者だと思っていたが、ホームズの目はすでに別の結論にたどり着いていた。
157
+
158
+ | Reference voice | Supertonic 3 output |
159
+ |---|---|
160
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/watson_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/watson_supertonic3.wav"></audio> |
161
+
162
+ **News, English**
163
+ Text: Here’s a story worth paying attention to. Supertone has released Supertonic 3, its on-device TTS model. This version expands support to thirty-one languages and improves reading stability.
164
+
165
+ | Reference voice | Supertonic 3 output |
166
+ |---|---|
167
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/keld_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/keld_supertonic3.wav"></audio> |
168
+
169
+ ## Performance Highlights
170
+
171
+ Supertonic 3 is designed for practical on-device inference: compact enough to run locally, while staying competitive with much larger open TTS systems.
172
+
173
+ ### Reading Accuracy
174
+
175
+ <p align="center">
176
+ <img src="img/metrics/s3_vs_measured_wer_range_voxcpm2.png" alt="Supertonic 3 reading accuracy compared with measured model ranges and VoxCPM2">
177
+ </p>
178
+
179
+ Across measured languages, Supertonic 3 stays within a competitive WER/CER range against much larger open TTS models such as VoxCPM2, while preserving a lightweight on-device deployment path. Asterisked languages use CER; the others use WER.
180
+
181
+ ### Supertonic 2 to Supertonic 3
182
+
183
+ <p align="center">
184
+ <img src="img/metrics/supertonic2_vs_3_comparison.png" alt="Supertonic 2 and Supertonic 3 comparison">
185
+ </p>
186
+
187
+ Compared with Supertonic 2, Supertonic 3 reduces repeat and skip failures, improves speaker similarity across the shared-language set, and expands language coverage from 5 to 31 languages.
188
+
189
+ ### Runtime Footprint
190
+
191
+ <p align="center">
192
+ <img src="img/metrics/runtime_cpu_gpu_latency_memory.png" alt="Supertonic CPU runtime compared with GPU baselines">
193
+ </p>
194
+
195
+ Supertonic 3 runs fast on CPU, even compared with larger baselines measured on A100 GPU, and uses substantially less memory. It does not require a GPU, which makes local, browser, and edge deployment much easier.
196
+
197
+ ### Model Size
198
+
199
+ <p align="center">
200
+ <img src="img/metrics/model_size_comparison.png" alt="Model size comparison">
201
+ </p>
202
+
203
+ At about 99M parameters across the public ONNX assets, Supertonic 3 is much smaller than 0.7B to 2B class open TTS systems. The smaller model size is a practical advantage for download size, startup time, and on-device inference.
204
+
205
+ ## Supported Languages
206
+
207
+ | Code | Language | Code | Language | Code | Language | Code | Language |
208
+ |------|----------|------|----------|------|----------|------|----------|
209
+ | `en` | English | `ko` | Korean | `ja` | Japanese | `ar` | Arabic |
210
+ | `bg` | Bulgarian | `cs` | Czech | `da` | Danish | `de` | German |
211
+ | `el` | Greek | `es` | Spanish | `et` | Estonian | `fi` | Finnish |
212
+ | `fr` | French | `hi` | Hindi | `hr` | Croatian | `hu` | Hungarian |
213
+ | `id` | Indonesian | `it` | Italian | `lt` | Lithuanian | `lv` | Latvian |
214
+ | `nl` | Dutch | `pl` | Polish | `pt` | Portuguese | `ro` | Romanian |
215
+ | `ru` | Russian | `sk` | Slovak | `sl` | Slovenian | `sv` | Swedish |
216
+ | `tr` | Turkish | `uk` | Ukrainian | `vi` | Vietnamese | | |
217
+
218
+ ## License
219
+
220
+ This project's sample code is released under the MIT License. See the [GitHub repository](https://github.com/supertone-inc/supertonic) for details.
221
+
222
+ The accompanying model is released under the OpenRAIL-M License. See the [LICENSE](https://huggingface.co/Supertone/supertonic-3/blob/main/LICENSE) file in the original `Supertone/supertonic-3` repository for details.
223
+
224
+ This model was trained using PyTorch, which is licensed under the BSD 3-Clause License but is not redistributed with this project. See the [PyTorch license](https://docs.pytorch.org/FBGEMM/general/License.html) for details.
225
+
226
+ Copyright (c) 2026 Supertone Inc.
README.official.md ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: openrail
3
+ language:
4
+ - en
5
+ - ko
6
+ - ja
7
+ - ar
8
+ - bg
9
+ - cs
10
+ - da
11
+ - de
12
+ - el
13
+ - es
14
+ - et
15
+ - fi
16
+ - fr
17
+ - hi
18
+ - hr
19
+ - hu
20
+ - id
21
+ - it
22
+ - lt
23
+ - lv
24
+ - nl
25
+ - pl
26
+ - pt
27
+ - ro
28
+ - ru
29
+ - sk
30
+ - sl
31
+ - sv
32
+ - tr
33
+ - uk
34
+ - vi
35
+ pipeline_tag: text-to-speech
36
+ tags:
37
+ - text-to-speech
38
+ - speech-synthesis
39
+ - tts
40
+ - onnx
41
+ - multilingual
42
+ - on-device
43
+ library_name: supertonic
44
+ ---
45
+
46
+ # Supertonic 3 | Lightning Fast, On-Device, Accurate TTS
47
+
48
+ ![Supertonic 3 Preview](img/Supertonic3_HeroImage.png)
49
+
50
+ <p align="center">
51
+ <a href="https://huggingface.co/spaces/Supertone/supertonic-3"><img src="https://img.shields.io/badge/Demo-Hugging_Face-yellow?style=for-the-badge" alt="Demo"></a>
52
+ <a href="https://github.com/supertone-inc/supertonic"><img src="https://img.shields.io/badge/Code-GitHub-black?style=for-the-badge&logo=github" alt="Code"></a>
53
+ <a href="https://pypi.org/project/supertonic/"><img src="https://img.shields.io/badge/Python-SDK-blue?style=for-the-badge&logo=python" alt="Python SDK"></a>
54
+ </p>
55
+
56
+ **Supertonic** is a lightweight text-to-speech system for local inference. It runs with ONNX Runtime entirely on your device, with no cloud call required for synthesis.
57
+
58
+ **Supertonic 3** expands the open-weight release from 5 to **31 languages**, improves reading stability, and reduces repeat/skip failures.
59
+
60
+ ## Quick Start
61
+
62
+ Install the Python SDK and generate speech immediately. On first run, the SDK downloads the model assets from Hugging Face.
63
+
64
+ ```bash
65
+ pip install supertonic
66
+ ```
67
+
68
+ ```python
69
+ from supertonic import TTS
70
+
71
+ tts = TTS(auto_download=True)
72
+ style = tts.get_voice_style(voice_name="M1")
73
+
74
+ text = "A gentle breeze moved through the open window while everyone listened to the story."
75
+ wav, duration = tts.synthesize(text, voice_style=style, lang="en")
76
+
77
+ tts.save_audio(wav, "output.wav")
78
+ print(f"Generated {duration:.2f}s of audio")
79
+ ```
80
+
81
+ ## What's New in Supertonic 3
82
+
83
+ - **31 languages**: expanded from the 5-language Supertonic 2 release.
84
+ - **More stable reading**: fewer repeat and skip failures, especially on short and long utterances.
85
+ - **Higher speaker similarity**: improved similarity across the shared-language set compared with Supertonic 2.
86
+ - **Expression tags**: supports simple tags such as `<laugh>`, `<breath>`, and `<sigh>`.
87
+
88
+ ## Custom Voices and Audio Samples
89
+
90
+ The open-weight package includes fixed preset voice styles for immediate local inference. If you want to hear how Supertonic 3 performs with zero-shot custom voice styles, visit the [Audio Sample Demo](https://supertonic3.github.io/) to compare reference audio and generated speech across several use cases. To create your own Supertonic 3 voice-style JSON from reference audio, use [Supertonic Voice Builder](https://supertonic.supertone.ai/voice-builder); purchased Voice Builder styles include downloadable embeddings for both Supertonic 2 and Supertonic 3.
91
+
92
+ Here are a few reference/generated pairs from the audio sample demo:
93
+
94
+ **Call center, English**
95
+ Text: Good morning, thank you for calling. How can I help you today?
96
+
97
+ | Reference voice | Supertonic 3 output |
98
+ |---|---|
99
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/nora_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/nora_supertonic3.wav"></audio> |
100
+
101
+ **Character voice, Japanese**
102
+ Text: ふふっ、退屈してたところなの。ちょうどいい遊び相手、見つけたかも♪
103
+
104
+ | Reference voice | Supertonic 3 output |
105
+ |---|---|
106
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/moka_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/moka_supertonic3.wav"></audio> |
107
+
108
+ **Elder character voice, Korean**
109
+ Text: 혼자 떠나기엔 길이 험하구나. 이 낡은 검을 가져가거라. 언젠가 어둠이 네 이름을 부르더라도, 부디 빛을 잊지 말거라.
110
+
111
+ | Reference voice | Supertonic 3 output |
112
+ |---|---|
113
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/alphonse_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/alphonse_supertonic3.wav"></audio> |
114
+
115
+ **Audiobook, English**
116
+ Text: I was not afraid of silence. I had lived with it long enough to know that, sometimes, it speaks more honestly than people do.
117
+
118
+ | Reference voice | Supertonic 3 output |
119
+ |---|---|
120
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/luna_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/luna_supertonic3.wav"></audio> |
121
+
122
+ **Audiobook, Japanese**
123
+ Text: その朝、ロンドンの霧はいつになく低く垂れこめていた。私はただの訪問者だと思っていたが、ホームズの目はすでに別の結論にたどり着いていた。
124
+
125
+ | Reference voice | Supertonic 3 output |
126
+ |---|---|
127
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/watson_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/watson_supertonic3.wav"></audio> |
128
+
129
+ **News, English**
130
+ Text: Here’s a story worth paying attention to. Supertone has released Supertonic 3, its on-device TTS model. This version expands support to thirty-one languages and improves reading stability.
131
+
132
+ | Reference voice | Supertonic 3 output |
133
+ |---|---|
134
+ | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/keld_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/keld_supertonic3.wav"></audio> |
135
+
136
+ ## Performance Highlights
137
+
138
+ Supertonic 3 is designed for practical on-device inference: compact enough to run locally, while staying competitive with much larger open TTS systems.
139
+
140
+ ### Reading Accuracy
141
+
142
+ <p align="center">
143
+ <img src="img/metrics/s3_vs_measured_wer_range_voxcpm2.png" alt="Supertonic 3 reading accuracy compared with measured model ranges and VoxCPM2">
144
+ </p>
145
+
146
+ Across measured languages, Supertonic 3 stays within a competitive WER/CER range against much larger open TTS models such as VoxCPM2, while preserving a lightweight on-device deployment path. Asterisked languages use CER; the others use WER.
147
+
148
+ ### Supertonic 2 to Supertonic 3
149
+
150
+ <p align="center">
151
+ <img src="img/metrics/supertonic2_vs_3_comparison.png" alt="Supertonic 2 and Supertonic 3 comparison">
152
+ </p>
153
+
154
+ Compared with Supertonic 2, Supertonic 3 reduces repeat and skip failures, improves speaker similarity across the shared-language set, and expands language coverage from 5 to 31 languages.
155
+
156
+ ### Runtime Footprint
157
+
158
+ <p align="center">
159
+ <img src="img/metrics/runtime_cpu_gpu_latency_memory.png" alt="Supertonic CPU runtime compared with GPU baselines">
160
+ </p>
161
+
162
+ Supertonic 3 runs fast on CPU, even compared with larger baselines measured on A100 GPU, and uses substantially less memory. It does not require a GPU, which makes local, browser, and edge deployment much easier.
163
+
164
+ ### Model Size
165
+
166
+ <p align="center">
167
+ <img src="img/metrics/model_size_comparison.png" alt="Model size comparison">
168
+ </p>
169
+
170
+ At about 99M parameters across the public ONNX assets, Supertonic 3 is much smaller than 0.7B to 2B class open TTS systems. The smaller model size is a practical advantage for download size, startup time, and on-device inference.
171
+
172
+ ## Supported Languages
173
+
174
+ | Code | Language | Code | Language | Code | Language | Code | Language |
175
+ |------|----------|------|----------|------|----------|------|----------|
176
+ | `en` | English | `ko` | Korean | `ja` | Japanese | `ar` | Arabic |
177
+ | `bg` | Bulgarian | `cs` | Czech | `da` | Danish | `de` | German |
178
+ | `el` | Greek | `es` | Spanish | `et` | Estonian | `fi` | Finnish |
179
+ | `fr` | French | `hi` | Hindi | `hr` | Croatian | `hu` | Hungarian |
180
+ | `id` | Indonesian | `it` | Italian | `lt` | Lithuanian | `lv` | Latvian |
181
+ | `nl` | Dutch | `pl` | Polish | `pt` | Portuguese | `ro` | Romanian |
182
+ | `ru` | Russian | `sk` | Slovak | `sl` | Slovenian | `sv` | Swedish |
183
+ | `tr` | Turkish | `uk` | Ukrainian | `vi` | Vietnamese | | |
184
+
185
+ ## License
186
+
187
+ This project's sample code is released under the MIT License. See the [GitHub repository](https://github.com/supertone-inc/supertonic) for details.
188
+
189
+ The accompanying model is released under the OpenRAIL-M License. See the [LICENSE](https://huggingface.co/Supertone/supertonic-3/blob/main/LICENSE) file in this repository for details.
190
+
191
+ This model was trained using PyTorch, which is licensed under the BSD 3-Clause License but is not redistributed with this project. See the [PyTorch license](https://docs.pytorch.org/FBGEMM/general/License.html) for details.
192
+
193
+ Copyright (c) 2026 Supertone Inc.
graphs/duration_predictor.json ADDED
The diff for this file is too large to render. See raw diff
 
graphs/text_encoder.json ADDED
The diff for this file is too large to render. See raw diff
 
graphs/vector_estimator.json ADDED
The diff for this file is too large to render. See raw diff
 
graphs/vocoder.json ADDED
The diff for this file is too large to render. See raw diff
 
mlx_manifest.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "format": "supertonic-mlx-graph",
3
+ "source_repo": "Supertone/supertonic-3",
4
+ "target_repo": "mlx-community/supertonic-3",
5
+ "graphs": [
6
+ "duration_predictor",
7
+ "text_encoder",
8
+ "vector_estimator",
9
+ "vocoder"
10
+ ],
11
+ "sample_rate": 44100
12
+ }
tts.json ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tts_version": "v1.7.3",
3
+ "split": "opensource-multilingual",
4
+ "ttl": {
5
+ "latent_dim": 24,
6
+ "chunk_compress_factor": 6,
7
+ "batch_expander": {
8
+ "n_batch_expand": 6
9
+ },
10
+ "normalizer": {
11
+ "scale": 0.25
12
+ },
13
+ "text_encoder": {
14
+ "n_langs": 0,
15
+ "lang_emb_dim": 0,
16
+ "text_embedder": {
17
+ "char_emb_dim": 256
18
+ },
19
+ "convnext": {
20
+ "idim": 256,
21
+ "ksz": 5,
22
+ "intermediate_dim": 1024,
23
+ "num_layers": 6,
24
+ "dilation_lst": [
25
+ 1,
26
+ 1,
27
+ 2,
28
+ 2,
29
+ 4,
30
+ 4
31
+ ]
32
+ },
33
+ "attn_encoder": {
34
+ "hidden_channels": 256,
35
+ "filter_channels": 1024,
36
+ "n_heads": 4,
37
+ "n_layers": 4,
38
+ "p_dropout": 0.0
39
+ },
40
+ "proj_out": {
41
+ "idim": 256,
42
+ "odim": 256
43
+ }
44
+ },
45
+ "flow_matching": {
46
+ "sig_min": 1e-08
47
+ },
48
+ "style_encoder": {
49
+ "proj_in": {
50
+ "ldim": 24,
51
+ "chunk_compress_factor": 6,
52
+ "odim": 256
53
+ },
54
+ "convnext": {
55
+ "idim": 256,
56
+ "ksz": 5,
57
+ "intermediate_dim": 1024,
58
+ "num_layers": 6,
59
+ "dilation_lst": [
60
+ 1,
61
+ 1,
62
+ 1,
63
+ 1,
64
+ 1,
65
+ 1
66
+ ]
67
+ },
68
+ "style_token_layer": {
69
+ "input_dim": 256,
70
+ "n_style": 50,
71
+ "style_key_dim": 256,
72
+ "style_value_dim": 256,
73
+ "prototype_dim": 256,
74
+ "n_units": 256,
75
+ "n_heads": 2
76
+ }
77
+ },
78
+ "speech_prompted_text_encoder": {
79
+ "text_dim": 256,
80
+ "style_dim": 256,
81
+ "n_units": 256,
82
+ "n_heads": 2
83
+ },
84
+ "uncond_masker": {
85
+ "prob_both_uncond": 0.04,
86
+ "prob_text_uncond": 0.01,
87
+ "std": 0.1,
88
+ "text_dim": 256,
89
+ "n_style": 50,
90
+ "style_key_dim": 256,
91
+ "style_value_dim": 256
92
+ },
93
+ "vector_field": {
94
+ "n_langs": 0,
95
+ "lang_emb_dim": 0,
96
+ "proj_in": {
97
+ "ldim": 24,
98
+ "chunk_compress_factor": 6,
99
+ "odim": 512
100
+ },
101
+ "time_encoder": {
102
+ "time_dim": 64,
103
+ "hdim": 256
104
+ },
105
+ "main_blocks": {
106
+ "n_blocks": 4,
107
+ "time_cond_layer": {
108
+ "idim": 512,
109
+ "time_dim": 64
110
+ },
111
+ "style_cond_layer": {
112
+ "idim": 512,
113
+ "style_dim": 256
114
+ },
115
+ "text_cond_layer": {
116
+ "idim": 512,
117
+ "text_dim": 256,
118
+ "n_heads": 8,
119
+ "n_units": 512,
120
+ "use_residual": true,
121
+ "rotary_base": 10000,
122
+ "rotary_scale": 10
123
+ },
124
+ "convnext_0": {
125
+ "idim": 512,
126
+ "ksz": 5,
127
+ "intermediate_dim": 2048,
128
+ "num_layers": 4,
129
+ "dilation_lst": [
130
+ 1,
131
+ 2,
132
+ 4,
133
+ 8
134
+ ]
135
+ },
136
+ "convnext_1": {
137
+ "idim": 512,
138
+ "ksz": 5,
139
+ "intermediate_dim": 2048,
140
+ "num_layers": 1,
141
+ "dilation_lst": [
142
+ 1
143
+ ]
144
+ },
145
+ "convnext_2": {
146
+ "idim": 512,
147
+ "ksz": 5,
148
+ "intermediate_dim": 2048,
149
+ "num_layers": 1,
150
+ "dilation_lst": [
151
+ 1
152
+ ]
153
+ }
154
+ },
155
+ "last_convnext": {
156
+ "idim": 512,
157
+ "ksz": 5,
158
+ "intermediate_dim": 2048,
159
+ "num_layers": 4,
160
+ "dilation_lst": [
161
+ 1,
162
+ 1,
163
+ 1,
164
+ 1
165
+ ]
166
+ },
167
+ "proj_out": {
168
+ "idim": 512,
169
+ "chunk_compress_factor": 6,
170
+ "ldim": 24
171
+ }
172
+ }
173
+ },
174
+ "ae": {
175
+ "sample_rate": 44100,
176
+ "n_delay": 0,
177
+ "base_chunk_size": 512,
178
+ "chunk_compress_factor": 1,
179
+ "ldim": 24,
180
+ "encoder": {
181
+ "spec_processor": {
182
+ "n_fft": 2048,
183
+ "win_length": 2048,
184
+ "hop_length": 512,
185
+ "n_mels": 228,
186
+ "sample_rate": 44100,
187
+ "eps": 1e-05,
188
+ "norm_mean": 0.0,
189
+ "norm_std": 1.0
190
+ },
191
+ "ksz_init": 7,
192
+ "ksz": 7,
193
+ "num_layers": 10,
194
+ "dilation_lst": [
195
+ 1,
196
+ 1,
197
+ 1,
198
+ 1,
199
+ 1,
200
+ 1,
201
+ 1,
202
+ 1,
203
+ 1,
204
+ 1
205
+ ],
206
+ "intermediate_dim": 2048,
207
+ "idim": 1253,
208
+ "hdim": 512,
209
+ "odim": 24
210
+ },
211
+ "decoder": {
212
+ "ksz_init": 7,
213
+ "ksz": 7,
214
+ "num_layers": 10,
215
+ "dilation_lst": [
216
+ 1,
217
+ 2,
218
+ 4,
219
+ 1,
220
+ 2,
221
+ 4,
222
+ 1,
223
+ 1,
224
+ 1,
225
+ 1
226
+ ],
227
+ "intermediate_dim": 2048,
228
+ "idim": 24,
229
+ "hdim": 512,
230
+ "head": {
231
+ "idim": 512,
232
+ "hdim": 2048,
233
+ "odim": 512,
234
+ "ksz": 3
235
+ }
236
+ }
237
+ },
238
+ "dp": {
239
+ "latent_dim": 24,
240
+ "chunk_compress_factor": 6,
241
+ "normalizer": {
242
+ "scale": 1.0
243
+ },
244
+ "sentence_encoder": {
245
+ "char_emb_dim": 64,
246
+ "text_embedder": {
247
+ "char_emb_dim": 64
248
+ },
249
+ "convnext": {
250
+ "idim": 64,
251
+ "ksz": 5,
252
+ "intermediate_dim": 256,
253
+ "num_layers": 6,
254
+ "dilation_lst": [
255
+ 1,
256
+ 1,
257
+ 1,
258
+ 1,
259
+ 1,
260
+ 1
261
+ ]
262
+ },
263
+ "attn_encoder": {
264
+ "hidden_channels": 64,
265
+ "filter_channels": 256,
266
+ "n_heads": 2,
267
+ "n_layers": 2,
268
+ "p_dropout": 0.0
269
+ },
270
+ "proj_out": {
271
+ "idim": 64,
272
+ "odim": 64
273
+ }
274
+ },
275
+ "style_encoder": {
276
+ "proj_in": {
277
+ "ldim": 24,
278
+ "chunk_compress_factor": 6,
279
+ "odim": 64
280
+ },
281
+ "convnext": {
282
+ "idim": 64,
283
+ "ksz": 5,
284
+ "intermediate_dim": 256,
285
+ "num_layers": 4,
286
+ "dilation_lst": [
287
+ 1,
288
+ 1,
289
+ 1,
290
+ 1
291
+ ]
292
+ },
293
+ "style_token_layer": {
294
+ "input_dim": 64,
295
+ "n_style": 8,
296
+ "style_key_dim": 0,
297
+ "style_value_dim": 16,
298
+ "prototype_dim": 64,
299
+ "n_units": 64,
300
+ "n_heads": 2
301
+ }
302
+ },
303
+ "predictor": {
304
+ "sentence_dim": 64,
305
+ "n_style": 8,
306
+ "style_dim": 16,
307
+ "hdim": 128,
308
+ "n_layer": 2
309
+ }
310
+ }
311
+ }
unicode_indexer.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/F1.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/F2.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/F3.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/F4.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/F5.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/M1.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/M2.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/M3.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/M4.json ADDED
The diff for this file is too large to render. See raw diff
 
voice_styles/M5.json ADDED
The diff for this file is too large to render. See raw diff