Text-to-Speech
Core ML
Supertonic
speech
audio
tts
ane
apple-silicon
flow-matching
diffusion
multilingual
Instructions to use FluidInference/supertonic-3-coreml with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Supertonic
How to use FluidInference/supertonic-3-coreml with Supertonic:
```python
from supertonic import TTS

tts = TTS(auto_download=True)
style = tts.get_voice_style(voice_name="M1")
text = "The train delay was announced at 4:45 PM on Wed, Apr 3, 2024 due to track maintenance."
wav, duration = tts.synthesize(text, voice_style=style)
tts.save_audio(wav, "output.wav")
```
- Notebooks
- Google Colab
- Kaggle
| """Minimal self-contained Supertonic-3 CoreML inference script. | |
| Loads the four .mlpackage modules from this directory, tokenizes text via | |
| unicode_indexer.json, runs the 8-step flow-matching loop, and writes a 44.1 kHz | |
| WAV. No external dependencies beyond `coremltools`, `numpy`, and `soundfile`. | |
| Example | |
| ------- | |
| python infer.py "Hello, world." --voice-style voice_styles/M1.json -o hello.wav | |
| python infer.py "Bonjour le monde." --lang fr --voice-style voice_styles/M1.json -o fr.wav | |
| For the full driver (text chunking, batch synthesis, multi-utt) see the | |
| mobius conversion repo: github.com/FluidInference/mobius | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import re | |
| import time | |
| from pathlib import Path | |
| from typing import Tuple | |
| from unicodedata import normalize | |
| import coremltools as ct | |
| import numpy as np | |
# Languages supported by Supertonic-3 v1.7.3.
AVAILABLE_LANGS = [
    "en", "ko", "ja", "ar", "bg", "cs", "da", "de", "el", "es",
    "et", "fi", "fr", "hi", "hr", "hu", "id", "it", "lt", "lv",
    "nl", "pl", "pt", "ro", "ru", "sk", "sl", "sv", "tr", "uk",
    "vi", "na",
]

# CoreML shape pins (must match conversion settings; see mobius trials.md).
TEXT_T_FIXED = 128  # text_encoder / duration_predictor pinned T
VEC_EST_L_MIN = 17  # vector_estimator latent/text RangeDim lower bound

# Emoji / pictograph / regional-indicator ranges stripped from the input text.
_EMOJI_RE = re.compile(
    "[\U0001f600-\U0001f64f\U0001f300-\U0001f5ff\U0001f680-\U0001f6ff"
    "\U0001f700-\U0001f77f\U0001f780-\U0001f7ff\U0001f800-\U0001f8ff"
    "\U0001f900-\U0001f9ff\U0001fa00-\U0001fa6f\U0001fa70-\U0001faff"
    "\u2600-\u26ff\u2700-\u27bf\U0001f1e6-\U0001f1ff]+",
    flags=re.UNICODE,
)

# Single-character substitutions: typographic dashes/quotes are folded to
# ASCII, and structural symbols are turned into spaces.
_CHAR_REPL = {
    "–": "-", "‑": "-", "—": "-", "_": " ",
    "\u201c": '"', "\u201d": '"', "\u2018": "'", "\u2019": "'",
    "´": "'", "`": "'",
    "[": " ", "]": " ", "|": " ", "/": " ", "#": " ", "→": " ", "←": " ",
}

# Sentence-final punctuation / closing marks; text not ending in one gets a ".".
_END_PUNCT_RE = re.compile(r"[.!?;:,'\"')\]}…。」』】〉》›»]$")


def _replace_chars(text: str) -> str:
    """Apply every substitution from ``_CHAR_REPL`` to *text*."""
    for src, dst in _CHAR_REPL.items():
        text = text.replace(src, dst)
    return text


def preprocess_text(text: str, lang: str) -> str:
    """Normalize *text* and wrap it in ``<lang>…</lang>`` tags.

    Steps: character substitution, NFKD normalization, emoji removal,
    character substitution again, whitespace collapsing, and appending a
    trailing "." when the text does not already end in punctuation.

    Args:
        text: Raw input text.
        lang: Language code; must be one of ``AVAILABLE_LANGS``.

    Returns:
        The cleaned text surrounded by opening and closing language tags.

    Raises:
        ValueError: If *lang* is not in ``AVAILABLE_LANGS``.
    """
    # Fail fast: no point cleaning text for a language we cannot synthesize.
    if lang not in AVAILABLE_LANGS:
        raise ValueError(f"Unsupported lang '{lang}'. Available: {AVAILABLE_LANGS}")

    # First pass BEFORE normalization. NFKD rewrites some table keys into
    # other characters (non-breaking hyphen U+2011 -> U+2010, acute accent
    # U+00B4 -> space + combining U+0301), so substituting only afterwards
    # would never match them.
    text = _replace_chars(text)
    text = normalize("NFKD", text)
    text = _EMOJI_RE.sub("", text)
    # Second pass AFTER normalization. Compatibility forms (for example
    # fullwidth brackets) only become table keys once NFKD has folded them.
    text = _replace_chars(text)

    text = re.sub(r"\s+", " ", text).strip()
    if not _END_PUNCT_RE.search(text):
        text += "."
    return f"<{lang}>" + text + f"</{lang}>"
def tokenize(text: str, lang: str, indexer: list) -> Tuple[np.ndarray, np.ndarray]:
    """Convert text to (text_ids[1, T], text_mask[1, 1, T]) padded to TEXT_T_FIXED.

    The text is first cleaned and language-tagged by ``preprocess_text``; each
    character's code point is then looked up in *indexer*.

    Args:
        text: Raw input text.
        lang: Language code forwarded to ``preprocess_text``.
        indexer: Table indexed by Unicode code point that yields the token id.

    Returns:
        ``(ids, mask)``: ``ids`` is int32 with shape ``(1, TEXT_T_FIXED)`` and
        ``mask`` is float32 with shape ``(1, 1, TEXT_T_FIXED)``, holding 1.0 at
        real token positions and 0.0 at padding.

    Raises:
        ValueError: Propagated from ``preprocess_text`` for an unsupported lang.
        IndexError: If a character's code point lies outside *indexer*.
    """
    import warnings

    s = preprocess_text(text, lang)
    codepoints = [ord(c) for c in s]
    if len(codepoints) > TEXT_T_FIXED:
        # The models are pinned to TEXT_T_FIXED positions, so the tail
        # (including the closing language tag) cannot be fed in. Keep the
        # truncation but make the data loss visible to the caller.
        warnings.warn(
            f"Input is {len(codepoints)} characters after preprocessing; "
            f"truncating to {TEXT_T_FIXED}. The end of the text will not be spoken.",
            stacklevel=2,
        )
        codepoints = codepoints[:TEXT_T_FIXED]

    table_size = len(indexer)
    for cp in codepoints:
        if cp >= table_size:
            raise IndexError(
                f"character {chr(cp)!r} (U+{cp:04X}) is outside the unicode "
                f"indexer table (size {table_size})"
            )

    count = len(codepoints)
    ids = np.zeros((1, TEXT_T_FIXED), dtype=np.int32)
    mask = np.zeros((1, 1, TEXT_T_FIXED), dtype=np.float32)
    ids[0, :count] = [indexer[cp] for cp in codepoints]
    mask[0, 0, :count] = 1.0
    return ids, mask
def load_voice_style(path: Path) -> Tuple[np.ndarray, np.ndarray]:
    """Read a voice-style JSON file and return its two style tensors.

    The file must contain ``style_ttl`` and ``style_dp`` entries, each with a
    ``dims`` list and a ``data`` payload. Each payload is converted to float32
    and reshaped to ``(1, dims[1], dims[2])``.

    Returns:
        ``(style_ttl, style_dp)`` as float32 arrays.
    """
    with open(path) as handle:
        style = json.load(handle)

    ttl_entry, dp_entry = style["style_ttl"], style["style_dp"]
    ttl_dims, dp_dims = ttl_entry["dims"], dp_entry["dims"]

    tensors = []
    for entry, dims in ((ttl_entry, ttl_dims), (dp_entry, dp_dims)):
        values = np.array(entry["data"], dtype=np.float32)
        # Batch axis is fixed to 1; the remaining two come from the file.
        tensors.append(values.reshape(1, dims[1], dims[2]))
    return tensors[0], tensors[1]
def sample_noisy_latent(
    duration_sec: float, sample_rate: int, base_chunk_size: int,
    chunk_compress_factor: int, latent_dim: int, rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw the initial Gaussian latent for an utterance of the given duration.

    The latent length is the number of ``base_chunk_size *
    chunk_compress_factor``-sample chunks needed to cover
    ``int(duration_sec * sample_rate)`` audio samples, rounded up.

    Returns:
        ``(noisy, latent_mask)``: ``noisy`` is float32 with shape
        ``(1, latent_dim * chunk_compress_factor, L)`` and ``latent_mask`` is
        float32 ones with shape ``(1, 1, L)``.
    """
    samples_per_frame = base_chunk_size * chunk_compress_factor
    total_samples = int(duration_sec * sample_rate)
    # Round up so a partial trailing chunk still gets a latent frame.
    n_frames = (total_samples + samples_per_frame - 1) // samples_per_frame

    channels = latent_dim * chunk_compress_factor
    noise = rng.standard_normal((1, channels, n_frames)).astype(np.float32)
    # Single utterance, so every frame is valid and the mask is all ones.
    latent_mask = np.ones((1, 1, n_frames), dtype=np.float32)
    return noise * latent_mask, latent_mask
def pad_last(arr: np.ndarray, target: int) -> np.ndarray:
    """Zero-pad the last axis of *arr* on the right up to length *target*.

    If the last axis is already at least *target* long, *arr* itself is
    returned unchanged (it is never truncated).
    """
    shortfall = target - arr.shape[-1]
    if shortfall <= 0:
        return arr
    # Leave every leading axis untouched; extend only the trailing one.
    widths = [(0, 0) for _ in range(arr.ndim - 1)]
    widths.append((0, shortfall))
    return np.pad(arr, widths, constant_values=0.0)
class Supertonic3TTS:
    """Supertonic-3 text-to-speech pipeline backed by four CoreML models.

    The pipeline runs DurationPredictor, TextEncoder, an iterative
    VectorEstimator loop, and Vocoder, all loaded as ``.mlpackage`` bundles
    from one directory together with ``tts.json`` and ``unicode_indexer.json``.
    """

    def __init__(self, model_dir: Path, compute_units: ct.ComputeUnit = ct.ComputeUnit.CPU_AND_NE,
                 seed: int | None = None):
        """Load configuration, the unicode indexer, and the four CoreML models.

        Args:
            model_dir: Directory holding ``tts.json``, ``unicode_indexer.json``
                and the ``.mlpackage`` bundles. A plain string is accepted.
            compute_units: CoreML compute-unit selection passed to every model.
            seed: Optional seed for the latent-noise generator. ``None``
                (default) draws fresh entropy, so output varies between runs.
        """
        model_dir = Path(model_dir)  # tolerate str input; `/` below needs a Path

        with open(model_dir / "tts.json") as f:
            cfg = json.load(f)
        self.sample_rate = int(cfg["ae"]["sample_rate"])
        self.base_chunk_size = int(cfg["ae"]["base_chunk_size"])
        self.ccf = int(cfg["ttl"]["chunk_compress_factor"])
        self.ldim = int(cfg["ttl"]["latent_dim"])

        with open(model_dir / "unicode_indexer.json") as f:
            self.indexer = json.load(f)

        def _load(name: str) -> ct.models.MLModel:
            # coremltools loads .mlpackage; .mlmodelc is for direct Swift/Obj-C use.
            return ct.models.MLModel(
                str(model_dir / f"{name}.mlpackage"),
                compute_units=compute_units,
            )

        print(f"Loading models from {model_dir} (compute_units={compute_units.name})")
        self.dp = _load("DurationPredictor")
        self.te = _load("TextEncoder")
        self.ve = _load("VectorEstimator")
        self.vc = _load("Vocoder")
        self.rng = np.random.default_rng(seed)

    def synthesize(self, text: str, voice_style_path: Path, lang: str = "en",
                   total_step: int = 8, speed: float = 1.05) -> Tuple[np.ndarray, float]:
        """Synthesize *text* and return ``(wav, duration_seconds)``.

        Args:
            text: Text to speak.
            voice_style_path: Voice-style JSON file (see ``load_voice_style``).
            lang: Language code forwarded to ``tokenize``.
            total_step: Number of VectorEstimator iterations; must be >= 1.
            speed: Speaking-rate factor; the predicted duration is divided by
                it, so values above 1 shorten the audio. Must be > 0.

        Returns:
            A 1-D float32 waveform at ``self.sample_rate`` and its duration in
            seconds (the predicted duration after applying *speed*).

        Raises:
            ValueError: If *total_step* < 1 or *speed* is not positive, or if
                *lang* is unsupported.
        """
        # Validate before any file or model work: zero steps would vocode the
        # raw noise, and a non-positive speed yields a meaningless duration.
        if total_step < 1:
            raise ValueError(f"total_step must be >= 1, got {total_step}")
        if not speed > 0:
            raise ValueError(f"speed must be > 0, got {speed}")

        ttl, dp_style = load_voice_style(voice_style_path)
        text_ids, text_mask = tokenize(text, lang, self.indexer)

        # 1. Duration.
        dp_out = self.dp.predict({
            "text_ids": text_ids, "style_dp": dp_style, "text_mask": text_mask,
        })
        # reshape(-1) makes the scalar extraction independent of output rank.
        predicted = np.asarray(dp_out["duration"], dtype=np.float32).reshape(-1)[0]
        duration = float(predicted) / speed

        # 2. Text embedding.
        te_out = self.te.predict({
            "text_ids": text_ids, "style_ttl": ttl, "text_mask": text_mask,
        })
        text_emb = np.asarray(te_out["text_emb"], dtype=np.float32)

        # 3. Noisy latent, padded up to the estimator's minimum length.
        noisy, latent_mask = sample_noisy_latent(
            duration, self.sample_rate, self.base_chunk_size, self.ccf, self.ldim, self.rng,
        )
        L_true = noisy.shape[-1]
        L_use = max(L_true, VEC_EST_L_MIN)
        noisy = pad_last(noisy, L_use)
        latent_mask = pad_last(latent_mask, L_use)  # padded frames stay masked out (0)

        # 4. Iterative flow-matching refinement (total_step passes).
        xt = noisy
        total_t = np.array([float(total_step)], dtype=np.float32)
        for step in range(total_step):
            cur_t = np.array([float(step)], dtype=np.float32)
            ve_out = self.ve.predict({
                "noisy_latent": xt, "text_emb": text_emb, "style_ttl": ttl,
                "latent_mask": latent_mask, "text_mask": text_mask,
                "current_step": cur_t, "total_step": total_t,
            })
            xt = np.asarray(ve_out["denoised_latent"], dtype=np.float32)

        # 5. Vocoder -> waveform.
        vc_out = self.vc.predict({"latent": xt})
        wav = np.asarray(vc_out["wav"], dtype=np.float32)
        wav = wav[:, : (self.base_chunk_size * self.ccf) * L_true]  # trim pad
        wav = wav[0, : int(self.sample_rate * duration)]  # trim per-sample
        return wav, duration
def main() -> None:
    """Command-line entry point: synthesize one utterance and write a WAV file.

    Parses the arguments, loads the models from ``--model-dir``, synthesizes
    the given text, writes it to ``--output`` and prints timing statistics.
    """
    ap = argparse.ArgumentParser(description="Supertonic-3 CoreML TTS — minimal demo")
    ap.add_argument("text", type=str, help="Text to synthesize")
    ap.add_argument("--voice-style", type=Path, default=Path("voice_styles/M1.json"))
    # Validate the language here so a typo fails immediately instead of after
    # the (slow) model load.
    ap.add_argument("--lang", type=str, default="en", choices=AVAILABLE_LANGS,
                    metavar="LANG", help="Language code (default: en)")
    ap.add_argument("--model-dir", type=Path, default=Path("."))
    ap.add_argument("-o", "--output", type=Path, default=Path("output.wav"))
    ap.add_argument("--total-step", type=int, default=8)
    ap.add_argument("--speed", type=float, default=1.05)
    ap.add_argument("--compute-units", type=str, default="CPU_AND_NE",
                    choices=["CPU_ONLY", "CPU_AND_GPU", "CPU_AND_NE", "ALL"])
    args = ap.parse_args()

    # Imported lazily so a missing optional dependency gives a friendly hint.
    try:
        import soundfile as sf
    except ImportError as e:
        raise SystemExit("install soundfile: pip install soundfile") from e

    tts = Supertonic3TTS(args.model_dir, getattr(ct.ComputeUnit, args.compute_units))

    # perf_counter is monotonic, so the measurement is immune to clock changes.
    t0 = time.perf_counter()
    wav, dur = tts.synthesize(args.text, args.voice_style, args.lang, args.total_step, args.speed)
    elapsed = time.perf_counter() - t0

    sf.write(args.output, wav, tts.sample_rate)
    rtfx = dur / elapsed if elapsed > 0 else float("inf")
    print(f"wrote {args.output} ({dur:.2f}s audio in {elapsed:.2f}s, RTFx {rtfx:.1f}x)")


if __name__ == "__main__":
    main()