Text-to-Speech
MLX
Supertonic
supertonic-3-mlx
supertonic-3
apple-silicon
tts
speech-synthesis
multilingual
flow-matching
Instructions to use ambassadia/supertonic-3-mlx with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use ambassadia/supertonic-3-mlx with MLX:
```bash
# Download the model from the Hub
pip install huggingface_hub[hf_xet]
huggingface-cli download --local-dir supertonic-3-mlx ambassadia/supertonic-3-mlx
```
- Supertonic
How to use ambassadia/supertonic-3-mlx with Supertonic:
```python
from supertonic import TTS

tts = TTS(auto_download=True)
style = tts.get_voice_style(voice_name="M1")
text = "The train delay was announced at 4:45 PM on Wed, Apr 3, 2024 due to track maintenance."
wav, duration = tts.synthesize(text, voice_style=style)
tts.save_audio(wav, "output.wav")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- LM Studio
| { | |
| "models": [ | |
| { | |
| "model": "VectorEstimator", | |
| "onnx": "vector_estimator.onnx", | |
| "safetensors": "weights/vector_estimator.safetensors", | |
| "bytes": 256053073, | |
| "sha256": "2359240f2dcaee03b4800102aa0bea00223d2867ab752ef01af2b1cfaf92f3a6", | |
| "weights_kept": 351, | |
| "weights_dropped": 120, | |
| "dropped_detail": { | |
| "tts.ae.vector_field.proj_in.net.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.1.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.1.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.1.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.1.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.2.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.2.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.2.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.2.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.3.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.3.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.3.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.0.convnext.3.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.2.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.2.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.2.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.2.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.4.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.4.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.4.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.4.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.1.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.1.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.1.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.1.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.2.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.2.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.2.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.2.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.3.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.3.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.3.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.6.convnext.3.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.8.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.8.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.8.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.8.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.10.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.10.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.10.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.10.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.1.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.1.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.1.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.1.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.2.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.2.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.2.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.2.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.3.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.3.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.3.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.12.convnext.3.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.14.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.14.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.14.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.14.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.16.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.16.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.16.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.16.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.1.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.1.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.1.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.1.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.2.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.2.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.2.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.2.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.3.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.3.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.3.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.18.convnext.3.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.20.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.20.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.20.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.20.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.22.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.22.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.22.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.main_blocks.22.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.0.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.0.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.0.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.0.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.1.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.1.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.1.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.1.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.2.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.2.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.2.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.2.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.3.pwconv1.weight": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.3.pwconv1.bias": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.3.pwconv2.weight": "not-in-model", | |
| "tts.ae.vector_field.last_convnext.convnext.3.pwconv2.bias": "not-in-model", | |
| "tts.ae.vector_field.proj_out.net.weight": "not-in-model", | |
| "<missing>.vector_field.main_blocks.9.attn.theta": "expected-but-not-extracted", | |
| "<missing>.vector_field.main_blocks.9.attn.increments": "expected-but-not-extracted", | |
| "<missing>.vector_field.main_blocks.15.attn.theta": "expected-but-not-extracted", | |
| "<missing>.vector_field.main_blocks.15.attn.increments": "expected-but-not-extracted", | |
| "<missing>.vector_field.main_blocks.21.attn.theta": "expected-but-not-extracted", | |
| "<missing>.vector_field.main_blocks.21.attn.increments": "expected-but-not-extracted" | |
| }, | |
| "elapsed_s": 0.289 | |
| }, | |
| { | |
| "model": "TextEncoder", | |
| "onnx": "text_encoder.onnx", | |
| "safetensors": "weights/text_encoder.safetensors", | |
| "bytes": 36022466, | |
| "sha256": "9df20bb79496718b36d2c0fc37636d3f78d6ef751b2899ff6dfeb975ae737ada", | |
| "weights_kept": 146, | |
| "weights_dropped": 0, | |
| "dropped_detail": {}, | |
| "elapsed_s": 0.035 | |
| }, | |
| { | |
| "model": "DurationPredictor", | |
| "onnx": "duration_predictor.onnx", | |
| "safetensors": "weights/duration_predictor.safetensors", | |
| "bytes": 3470807, | |
| "sha256": "cd473acb6e0ac27426084488ccb3b3cc184e70d05db90897e2b892846db5dcb3", | |
| "weights_kept": 98, | |
| "weights_dropped": 0, | |
| "dropped_detail": {}, | |
| "elapsed_s": 0.007 | |
| }, | |
| { | |
| "model": "Vocoder", | |
| "onnx": "vocoder.onnx", | |
| "safetensors": "weights/vocoder.safetensors", | |
| "bytes": 101364763, | |
| "sha256": "b2ec31ab7c554f6e15b9a6780554b5d3502345de7848b310966bfb4e1ea4e526", | |
| "weights_kept": 103, | |
| "weights_dropped": 0, | |
| "dropped_detail": {}, | |
| "elapsed_s": 0.079 | |
| } | |
| ], | |
| "ancillary": [ | |
| { | |
| "name": "unicode_indexer.json", | |
| "bytes": 277676, | |
| "sha256": "9bf7346e43883a81f8645c81224f786d43c5b57f3641f6e7671a7d6c493cb24f" | |
| }, | |
| { | |
| "name": "voice_styles/F1.json", | |
| "bytes": 292046, | |
| "sha256": "bbdec6ee00231c2c742ad05483df5334cab3b52fda3ba38e6a07059c4563dbc2" | |
| }, | |
| { | |
| "name": "voice_styles/F2.json", | |
| "bytes": 292423, | |
| "sha256": "7c722c6a72707b1a77f035d67f0d1351ba187738e06f7683e8c72b1df3477fc6" | |
| }, | |
| { | |
| "name": "voice_styles/F3.json", | |
| "bytes": 290794, | |
| "sha256": "12f6ef2573baa2defa1128069cb59f203e3ab67c92af77b42df8a0e3a2f7c6ab" | |
| }, | |
| { | |
| "name": "voice_styles/F4.json", | |
| "bytes": 291808, | |
| "sha256": "c2fa764c1225a76dfc3e2c73e8aa4f70d9ee48793860eb34c295fff01c2e032b" | |
| }, | |
| { | |
| "name": "voice_styles/F5.json", | |
| "bytes": 291479, | |
| "sha256": "45966e73316415626cf41a7d1c6f3b4c70dbc1ba2bee5c1978ef0ce33244fc8d" | |
| }, | |
| { | |
| "name": "voice_styles/M1.json", | |
| "bytes": 291748, | |
| "sha256": "e35604687f5d23694b8e91593a93eec0e4eca6c0b02bb8ed69139ab2ea6b0a5b" | |
| }, | |
| { | |
| "name": "voice_styles/M2.json", | |
| "bytes": 292055, | |
| "sha256": "b76cbf62bac707c710cf0ae5aba5e31eea1a6339a9734bfae33ab98499534a50" | |
| }, | |
| { | |
| "name": "voice_styles/M3.json", | |
| "bytes": 290198, | |
| "sha256": "ea1ac35ccb91b0d7ecad533a2fbd0eec10c91513d8951e3b25fbba99954e159b" | |
| }, | |
| { | |
| "name": "voice_styles/M4.json", | |
| "bytes": 291522, | |
| "sha256": "ca8eefad4fcd989c9379032ff3e50738adc547eeb5e221b82593a6d7b3bac303" | |
| }, | |
| { | |
| "name": "voice_styles/M5.json", | |
| "bytes": 291469, | |
| "sha256": "dd22b92740314321f8ae11c5e87f8dd60d060f15dd3a632b5adf77f471f77af2" | |
| } | |
| ] | |
| } |