Add Supertonic 3 MLX metadata and graph topology

Browse files

Files changed (20) hide show

.gitattributes +1 -34
README.md +226 -0
README.official.md +193 -0
graphs/duration_predictor.json +0 -0
graphs/text_encoder.json +0 -0
graphs/vector_estimator.json +0 -0
graphs/vocoder.json +0 -0
mlx_manifest.json +12 -0
tts.json +311 -0
unicode_indexer.json +0 -0
voice_styles/F1.json +0 -0
voice_styles/F2.json +0 -0
voice_styles/F3.json +0 -0
voice_styles/F4.json +0 -0
voice_styles/F5.json +0 -0
voice_styles/M1.json +0 -0
voice_styles/M2.json +0 -0
voice_styles/M3.json +0 -0
voice_styles/M4.json +0 -0
voice_styles/M5.json +0 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
















1	*.npz filter=lfs diff=lfs merge=lfs -text
2	+ *.wav filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,226 @@

+---
+license: openrail
+base_model: Supertone/supertonic-3
+library_name: mlx
+tags:
+- mlx
+- text-to-speech
+- on-device
+- audio
+---
+# Supertonic 3 MLX
+This repository contains a community MLX conversion of [`Supertone/supertonic-3`](https://huggingface.co/Supertone/supertonic-3).
+The original ONNX graphs are converted into JSON topology plus NPZ initializers. Inference is executed with MLX arrays through the Supertonic-specific graph runtime in [`ailuntx/supertonic`](https://github.com/ailuntx/supertonic).
+```bash
+git clone https://github.com/ailuntx/supertonic
+cd supertonic
+python scripts/infer_mlx.py \
+  --model /path/to/supertonic-3 \
+  --text "Supertonic 3 is running with MLX." \
+  --lang en \
+  --voice M1 \
+  --total-step 8 \
+  --output output.wav
+```
+The MLX graph runtime has been checked against ONNX Runtime on the official assets; per-stage maximum absolute errors are around `1e-5`.
+## Original Model Card
+```yaml
+license: openrail
+language:
+    - en
+    - ko
+    - ja
+    - ar
+    - bg
+    - cs
+    - da
+    - de
+    - el
+    - es
+    - et
+    - fi
+    - fr
+    - hi
+    - hr
+    - hu
+    - id
+    - it
+    - lt
+    - lv
+    - nl
+    - pl
+    - pt
+    - ro
+    - ru
+    - sk
+    - sl
+    - sv
+    - tr
+    - uk
+    - vi
+pipeline_tag: text-to-speech
+tags:
+    - text-to-speech
+    - speech-synthesis
+    - tts
+    - onnx
+    - multilingual
+    - on-device
+library_name: supertonic
+```
+# Supertonic 3 | Lightning Fast, On-Device, Accurate TTS
+![Supertonic 3 Preview](img/Supertonic3_HeroImage.png)
+<p align="center">
+  <a href="https://huggingface.co/spaces/Supertone/supertonic-3"><img src="https://img.shields.io/badge/Demo-Hugging_Face-yellow?style=for-the-badge" alt="Demo"></a>
+  <a href="https://github.com/supertone-inc/supertonic"><img src="https://img.shields.io/badge/Code-GitHub-black?style=for-the-badge&logo=github" alt="Code"></a>
+  <a href="https://pypi.org/project/supertonic/"><img src="https://img.shields.io/badge/Python-SDK-blue?style=for-the-badge&logo=python" alt="Python SDK"></a>
+</p>
+**Supertonic** is a lightweight text-to-speech system for local inference. It runs with ONNX Runtime entirely on your device, with no cloud call required for synthesis.
+**Supertonic 3** expands the open-weight release from 5 to **31 languages**, improves reading stability, and reduces repeat/skip failures.
+## Quick Start
+Install the Python SDK and generate speech immediately. On first run, the SDK downloads the model assets from Hugging Face.
+```bash
+pip install supertonic
+```
+```python
+from supertonic import TTS
+tts = TTS(auto_download=True)
+style = tts.get_voice_style(voice_name="M1")
+text = "A gentle breeze moved through the open window while everyone listened to the story."
+wav, duration = tts.synthesize(text, voice_style=style, lang="en")
+tts.save_audio(wav, "output.wav")
+print(f"Generated {duration:.2f}s of audio")
+```
+## What's New in Supertonic 3
+- **31 languages**: expanded from the 5-language Supertonic 2 release.
+- **More stable reading**: fewer repeat and skip failures, especially on short and long utterances.
+- **Higher speaker similarity**: improved similarity across the shared-language set compared with Supertonic 2.
+- **Expression tags**: supports simple tags such as `<laugh>`, `<breath>`, and `<sigh>`.
+## Custom Voices and Audio Samples
+The open-weight package includes fixed preset voice styles for immediate local inference. If you want to hear how Supertonic 3 performs with zero-shot custom voice styles, visit the [Audio Sample Demo](https://supertonic3.github.io/) to compare reference audio and generated speech across several use cases. To create your own Supertonic 3 voice-style JSON from reference audio, use [Supertonic Voice Builder](https://supertonic.supertone.ai/voice-builder); purchased Voice Builder styles include downloadable embeddings for both Supertonic 2 and Supertonic 3.
+Here are a few reference/generated pairs from the audio sample demo:
+**Call center, English**
+Text: Good morning, thank you for calling. How can I help you today?
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/nora_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/nora_supertonic3.wav"></audio> |
+**Character voice, Japanese**
+Text: ふふっ、退屈してたところなの。ちょうどいい遊び相手、見つけたかも♪
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/moka_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/moka_supertonic3.wav"></audio> |
+**Elder character voice, Korean**
+Text: 혼자 떠나기엔 길이 험하구나. 이 낡은 검을 가져가거라. 언젠가 어둠이 네 이름을 부르더라도, 부디 빛을 잊지 말거라.
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/alphonse_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/alphonse_supertonic3.wav"></audio> |
+**Audiobook, English**
+Text: I was not afraid of silence. I had lived with it long enough to know that, sometimes, it speaks more honestly than people do.
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/luna_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/luna_supertonic3.wav"></audio> |
+**Audiobook, Japanese**
+Text: その朝、ロンドンの霧はいつになく低く垂れこめていた。私はただの訪問者だと思っていたが、ホームズの目はすでに別の結論にたどり着いていた。
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/watson_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/watson_supertonic3.wav"></audio> |
+**News, English**
+Text: Here’s a story worth paying attention to. Supertone has released Supertonic 3, its on-device TTS model. This version expands support to thirty-one languages and improves reading stability.
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/keld_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/keld_supertonic3.wav"></audio> |
+## Performance Highlights
+Supertonic 3 is designed for practical on-device inference: compact enough to run locally, while staying competitive with much larger open TTS systems.
+### Reading Accuracy
+<p align="center">
+  <img src="img/metrics/s3_vs_measured_wer_range_voxcpm2.png" alt="Supertonic 3 reading accuracy compared with measured model ranges and VoxCPM2">
+</p>
+Across measured languages, Supertonic 3 stays within a competitive WER/CER range against much larger open TTS models such as VoxCPM2, while preserving a lightweight on-device deployment path. Asterisked languages use CER; the others use WER.
+### Supertonic 2 to Supertonic 3
+<p align="center">
+  <img src="img/metrics/supertonic2_vs_3_comparison.png" alt="Supertonic 2 and Supertonic 3 comparison">
+</p>
+Compared with Supertonic 2, Supertonic 3 reduces repeat and skip failures, improves speaker similarity across the shared-language set, and expands language coverage from 5 to 31 languages.
+### Runtime Footprint
+<p align="center">
+  <img src="img/metrics/runtime_cpu_gpu_latency_memory.png" alt="Supertonic CPU runtime compared with GPU baselines">
+</p>
+Supertonic 3 runs fast on CPU, even compared with larger baselines measured on A100 GPU, and uses substantially less memory. It does not require a GPU, which makes local, browser, and edge deployment much easier.
+### Model Size
+<p align="center">
+  <img src="img/metrics/model_size_comparison.png" alt="Model size comparison">
+</p>
+At about 99M parameters across the public ONNX assets, Supertonic 3 is much smaller than 0.7B to 2B class open TTS systems. The smaller model size is a practical advantage for download size, startup time, and on-device inference.
+## Supported Languages
+| Code | Language | Code | Language | Code | Language | Code | Language |
+|------|----------|------|----------|------|----------|------|----------|
+| `en` | English | `ko` | Korean | `ja` | Japanese | `ar` | Arabic |
+| `bg` | Bulgarian | `cs` | Czech | `da` | Danish | `de` | German |
+| `el` | Greek | `es` | Spanish | `et` | Estonian | `fi` | Finnish |
+| `fr` | French | `hi` | Hindi | `hr` | Croatian | `hu` | Hungarian |
+| `id` | Indonesian | `it` | Italian | `lt` | Lithuanian | `lv` | Latvian |
+| `nl` | Dutch | `pl` | Polish | `pt` | Portuguese | `ro` | Romanian |
+| `ru` | Russian | `sk` | Slovak | `sl` | Slovenian | `sv` | Swedish |
+| `tr` | Turkish | `uk` | Ukrainian | `vi` | Vietnamese | | |
+## License
+This project's sample code is released under the MIT License. See the [GitHub repository](https://github.com/supertone-inc/supertonic) for details.
+The accompanying model is released under the OpenRAIL-M License. See the [LICENSE](https://huggingface.co/Supertone/supertonic-3/blob/main/LICENSE) file in this repository for details.
+This model was trained using PyTorch, which is licensed under the BSD 3-Clause License but is not redistributed with this project. See the [PyTorch license](https://docs.pytorch.org/FBGEMM/general/License.html) for details.
+Copyright (c) 2026 Supertone Inc.

README.official.md ADDED Viewed

	@@ -0,0 +1,193 @@

+---
+license: openrail
+language:
+    - en
+    - ko
+    - ja
+    - ar
+    - bg
+    - cs
+    - da
+    - de
+    - el
+    - es
+    - et
+    - fi
+    - fr
+    - hi
+    - hr
+    - hu
+    - id
+    - it
+    - lt
+    - lv
+    - nl
+    - pl
+    - pt
+    - ro
+    - ru
+    - sk
+    - sl
+    - sv
+    - tr
+    - uk
+    - vi
+pipeline_tag: text-to-speech
+tags:
+    - text-to-speech
+    - speech-synthesis
+    - tts
+    - onnx
+    - multilingual
+    - on-device
+library_name: supertonic
+---
+# Supertonic 3 | Lightning Fast, On-Device, Accurate TTS
+![Supertonic 3 Preview](img/Supertonic3_HeroImage.png)
+<p align="center">
+  <a href="https://huggingface.co/spaces/Supertone/supertonic-3"><img src="https://img.shields.io/badge/Demo-Hugging_Face-yellow?style=for-the-badge" alt="Demo"></a>
+  <a href="https://github.com/supertone-inc/supertonic"><img src="https://img.shields.io/badge/Code-GitHub-black?style=for-the-badge&logo=github" alt="Code"></a>
+  <a href="https://pypi.org/project/supertonic/"><img src="https://img.shields.io/badge/Python-SDK-blue?style=for-the-badge&logo=python" alt="Python SDK"></a>
+</p>
+**Supertonic** is a lightweight text-to-speech system for local inference. It runs with ONNX Runtime entirely on your device, with no cloud call required for synthesis.
+**Supertonic 3** expands the open-weight release from 5 to **31 languages**, improves reading stability, and reduces repeat/skip failures.
+## Quick Start
+Install the Python SDK and generate speech immediately. On first run, the SDK downloads the model assets from Hugging Face.
+```bash
+pip install supertonic
+```
+```python
+from supertonic import TTS
+tts = TTS(auto_download=True)
+style = tts.get_voice_style(voice_name="M1")
+text = "A gentle breeze moved through the open window while everyone listened to the story."
+wav, duration = tts.synthesize(text, voice_style=style, lang="en")
+tts.save_audio(wav, "output.wav")
+print(f"Generated {duration:.2f}s of audio")
+```
+## What's New in Supertonic 3
+- **31 languages**: expanded from the 5-language Supertonic 2 release.
+- **More stable reading**: fewer repeat and skip failures, especially on short and long utterances.
+- **Higher speaker similarity**: improved similarity across the shared-language set compared with Supertonic 2.
+- **Expression tags**: supports simple tags such as `<laugh>`, `<breath>`, and `<sigh>`.
+## Custom Voices and Audio Samples
+The open-weight package includes fixed preset voice styles for immediate local inference. If you want to hear how Supertonic 3 performs with zero-shot custom voice styles, visit the [Audio Sample Demo](https://supertonic3.github.io/) to compare reference audio and generated speech across several use cases. To create your own Supertonic 3 voice-style JSON from reference audio, use [Supertonic Voice Builder](https://supertonic.supertone.ai/voice-builder); purchased Voice Builder styles include downloadable embeddings for both Supertonic 2 and Supertonic 3.
+Here are a few reference/generated pairs from the audio sample demo:
+**Call center, English**
+Text: Good morning, thank you for calling. How can I help you today?
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/nora_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/nora_supertonic3.wav"></audio> |
+**Character voice, Japanese**
+Text: ふふっ、退屈してたところなの。ちょうどいい遊び相手、見つけたかも♪
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/moka_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/moka_supertonic3.wav"></audio> |
+**Elder character voice, Korean**
+Text: 혼자 떠나기엔 길이 험하구나. 이 낡은 검을 가져가거라. 언젠가 어둠이 네 이름을 부르더라도, 부디 빛을 잊지 말거라.
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/alphonse_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/alphonse_supertonic3.wav"></audio> |
+**Audiobook, English**
+Text: I was not afraid of silence. I had lived with it long enough to know that, sometimes, it speaks more honestly than people do.
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/luna_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/luna_supertonic3.wav"></audio> |
+**Audiobook, Japanese**
+Text: その朝、ロンドンの霧はいつになく低く垂れこめていた。私はただの訪問者だと思っていたが、ホームズの目はすでに別の結論にたどり着いていた。
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/watson_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/watson_supertonic3.wav"></audio> |
+**News, English**
+Text: Here’s a story worth paying attention to. Supertone has released Supertonic 3, its on-device TTS model. This version expands support to thirty-one languages and improves reading stability.
+| Reference voice | Supertonic 3 output |
+|---|---|
+| <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/keld_reference.wav"></audio> | <audio controls preload="metadata" src="https://huggingface.co/Supertone/supertonic-3/resolve/main/audio_samples/keld_supertonic3.wav"></audio> |
+## Performance Highlights
+Supertonic 3 is designed for practical on-device inference: compact enough to run locally, while staying competitive with much larger open TTS systems.
+### Reading Accuracy
+<p align="center">
+  <img src="img/metrics/s3_vs_measured_wer_range_voxcpm2.png" alt="Supertonic 3 reading accuracy compared with measured model ranges and VoxCPM2">
+</p>
+Across measured languages, Supertonic 3 stays within a competitive WER/CER range against much larger open TTS models such as VoxCPM2, while preserving a lightweight on-device deployment path. Asterisked languages use CER; the others use WER.
+### Supertonic 2 to Supertonic 3
+<p align="center">
+  <img src="img/metrics/supertonic2_vs_3_comparison.png" alt="Supertonic 2 and Supertonic 3 comparison">
+</p>
+Compared with Supertonic 2, Supertonic 3 reduces repeat and skip failures, improves speaker similarity across the shared-language set, and expands language coverage from 5 to 31 languages.
+### Runtime Footprint
+<p align="center">
+  <img src="img/metrics/runtime_cpu_gpu_latency_memory.png" alt="Supertonic CPU runtime compared with GPU baselines">
+</p>
+Supertonic 3 runs fast on CPU, even compared with larger baselines measured on A100 GPU, and uses substantially less memory. It does not require a GPU, which makes local, browser, and edge deployment much easier.
+### Model Size
+<p align="center">
+  <img src="img/metrics/model_size_comparison.png" alt="Model size comparison">
+</p>
+At about 99M parameters across the public ONNX assets, Supertonic 3 is much smaller than 0.7B to 2B class open TTS systems. The smaller model size is a practical advantage for download size, startup time, and on-device inference.
+## Supported Languages
+| Code | Language | Code | Language | Code | Language | Code | Language |
+|------|----------|------|----------|------|----------|------|----------|
+| `en` | English | `ko` | Korean | `ja` | Japanese | `ar` | Arabic |
+| `bg` | Bulgarian | `cs` | Czech | `da` | Danish | `de` | German |
+| `el` | Greek | `es` | Spanish | `et` | Estonian | `fi` | Finnish |
+| `fr` | French | `hi` | Hindi | `hr` | Croatian | `hu` | Hungarian |
+| `id` | Indonesian | `it` | Italian | `lt` | Lithuanian | `lv` | Latvian |
+| `nl` | Dutch | `pl` | Polish | `pt` | Portuguese | `ro` | Romanian |
+| `ru` | Russian | `sk` | Slovak | `sl` | Slovenian | `sv` | Swedish |
+| `tr` | Turkish | `uk` | Ukrainian | `vi` | Vietnamese | | |
+## License
+This project's sample code is released under the MIT License. See the [GitHub repository](https://github.com/supertone-inc/supertonic) for details.
+The accompanying model is released under the OpenRAIL-M License. See the [LICENSE](https://huggingface.co/Supertone/supertonic-3/blob/main/LICENSE) file in this repository for details.
+This model was trained using PyTorch, which is licensed under the BSD 3-Clause License but is not redistributed with this project. See the [PyTorch license](https://docs.pytorch.org/FBGEMM/general/License.html) for details.
+Copyright (c) 2026 Supertone Inc.

graphs/duration_predictor.json ADDED Viewed

The diff for this file is too large to render. See raw diff

graphs/text_encoder.json ADDED Viewed

The diff for this file is too large to render. See raw diff

graphs/vector_estimator.json ADDED Viewed

The diff for this file is too large to render. See raw diff

graphs/vocoder.json ADDED Viewed

The diff for this file is too large to render. See raw diff

mlx_manifest.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "format": "supertonic-mlx-graph",
+  "source_repo": "Supertone/supertonic-3",
+  "target_repo": "mlx-community/supertonic-3",
+  "graphs": [
+    "duration_predictor",
+    "text_encoder",
+    "vector_estimator",
+    "vocoder"
+  ],
+  "sample_rate": 44100
+}

tts.json ADDED Viewed

	@@ -0,0 +1,311 @@

+{
+    "tts_version": "v1.7.3",
+    "split": "opensource-multilingual",
+    "ttl": {
+        "latent_dim": 24,
+        "chunk_compress_factor": 6,
+        "batch_expander": {
+            "n_batch_expand": 6
+        },
+        "normalizer": {
+            "scale": 0.25
+        },
+        "text_encoder": {
+            "n_langs": 0,
+            "lang_emb_dim": 0,
+            "text_embedder": {
+                "char_emb_dim": 256
+            },
+            "convnext": {
+                "idim": 256,
+                "ksz": 5,
+                "intermediate_dim": 1024,
+                "num_layers": 6,
+                "dilation_lst": [
+                    1,
+                    1,
+                    2,
+                    2,
+                    4,
+                    4
+                ]
+            },
+            "attn_encoder": {
+                "hidden_channels": 256,
+                "filter_channels": 1024,
+                "n_heads": 4,
+                "n_layers": 4,
+                "p_dropout": 0.0
+            },
+            "proj_out": {
+                "idim": 256,
+                "odim": 256
+            }
+        },
+        "flow_matching": {
+            "sig_min": 1e-08
+        },
+        "style_encoder": {
+            "proj_in": {
+                "ldim": 24,
+                "chunk_compress_factor": 6,
+                "odim": 256
+            },
+            "convnext": {
+                "idim": 256,
+                "ksz": 5,
+                "intermediate_dim": 1024,
+                "num_layers": 6,
+                "dilation_lst": [
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1
+                ]
+            },
+            "style_token_layer": {
+                "input_dim": 256,
+                "n_style": 50,
+                "style_key_dim": 256,
+                "style_value_dim": 256,
+                "prototype_dim": 256,
+                "n_units": 256,
+                "n_heads": 2
+            }
+        },
+        "speech_prompted_text_encoder": {
+            "text_dim": 256,
+            "style_dim": 256,
+            "n_units": 256,
+            "n_heads": 2
+        },
+        "uncond_masker": {
+            "prob_both_uncond": 0.04,
+            "prob_text_uncond": 0.01,
+            "std": 0.1,
+            "text_dim": 256,
+            "n_style": 50,
+            "style_key_dim": 256,
+            "style_value_dim": 256
+        },
+        "vector_field": {
+            "n_langs": 0,
+            "lang_emb_dim": 0,
+            "proj_in": {
+                "ldim": 24,
+                "chunk_compress_factor": 6,
+                "odim": 512
+            },
+            "time_encoder": {
+                "time_dim": 64,
+                "hdim": 256
+            },
+            "main_blocks": {
+                "n_blocks": 4,
+                "time_cond_layer": {
+                    "idim": 512,
+                    "time_dim": 64
+                },
+                "style_cond_layer": {
+                    "idim": 512,
+                    "style_dim": 256
+                },
+                "text_cond_layer": {
+                    "idim": 512,
+                    "text_dim": 256,
+                    "n_heads": 8,
+                    "n_units": 512,
+                    "use_residual": true,
+                    "rotary_base": 10000,
+                    "rotary_scale": 10
+                },
+                "convnext_0": {
+                    "idim": 512,
+                    "ksz": 5,
+                    "intermediate_dim": 2048,
+                    "num_layers": 4,
+                    "dilation_lst": [
+                        1,
+                        2,
+                        4,
+                        8
+                    ]
+                },
+                "convnext_1": {
+                    "idim": 512,
+                    "ksz": 5,
+                    "intermediate_dim": 2048,
+                    "num_layers": 1,
+                    "dilation_lst": [
+                        1
+                    ]
+                },
+                "convnext_2": {
+                    "idim": 512,
+                    "ksz": 5,
+                    "intermediate_dim": 2048,
+                    "num_layers": 1,
+                    "dilation_lst": [
+                        1
+                    ]
+                }
+            },
+            "last_convnext": {
+                "idim": 512,
+                "ksz": 5,
+                "intermediate_dim": 2048,
+                "num_layers": 4,
+                "dilation_lst": [
+                    1,
+                    1,
+                    1,
+                    1
+                ]
+            },
+            "proj_out": {
+                "idim": 512,
+                "chunk_compress_factor": 6,
+                "ldim": 24
+            }
+        }
+    },
+    "ae": {
+        "sample_rate": 44100,
+        "n_delay": 0,
+        "base_chunk_size": 512,
+        "chunk_compress_factor": 1,
+        "ldim": 24,
+        "encoder": {
+            "spec_processor": {
+                "n_fft": 2048,
+                "win_length": 2048,
+                "hop_length": 512,
+                "n_mels": 228,
+                "sample_rate": 44100,
+                "eps": 1e-05,
+                "norm_mean": 0.0,
+                "norm_std": 1.0
+            },
+            "ksz_init": 7,
+            "ksz": 7,
+            "num_layers": 10,
+            "dilation_lst": [
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1
+            ],
+            "intermediate_dim": 2048,
+            "idim": 1253,
+            "hdim": 512,
+            "odim": 24
+        },
+        "decoder": {
+            "ksz_init": 7,
+            "ksz": 7,
+            "num_layers": 10,
+            "dilation_lst": [
+                1,
+                2,
+                4,
+                1,
+                2,
+                4,
+                1,
+                1,
+                1,
+                1
+            ],
+            "intermediate_dim": 2048,
+            "idim": 24,
+            "hdim": 512,
+            "head": {
+                "idim": 512,
+                "hdim": 2048,
+                "odim": 512,
+                "ksz": 3
+            }
+        }
+    },
+    "dp": {
+        "latent_dim": 24,
+        "chunk_compress_factor": 6,
+        "normalizer": {
+            "scale": 1.0
+        },
+        "sentence_encoder": {
+            "char_emb_dim": 64,
+            "text_embedder": {
+                "char_emb_dim": 64
+            },
+            "convnext": {
+                "idim": 64,
+                "ksz": 5,
+                "intermediate_dim": 256,
+                "num_layers": 6,
+                "dilation_lst": [
+                    1,
+                    1,
+                    1,
+                    1,
+                    1,
+                    1
+                ]
+            },
+            "attn_encoder": {
+                "hidden_channels": 64,
+                "filter_channels": 256,
+                "n_heads": 2,
+                "n_layers": 2,
+                "p_dropout": 0.0
+            },
+            "proj_out": {
+                "idim": 64,
+                "odim": 64
+            }
+        },
+        "style_encoder": {
+            "proj_in": {
+                "ldim": 24,
+                "chunk_compress_factor": 6,
+                "odim": 64
+            },
+            "convnext": {
+                "idim": 64,
+                "ksz": 5,
+                "intermediate_dim": 256,
+                "num_layers": 4,
+                "dilation_lst": [
+                    1,
+                    1,
+                    1,
+                    1
+                ]
+            },
+            "style_token_layer": {
+                "input_dim": 64,
+                "n_style": 8,
+                "style_key_dim": 0,
+                "style_value_dim": 16,
+                "prototype_dim": 64,
+                "n_units": 64,
+                "n_heads": 2
+            }
+        },
+        "predictor": {
+            "sentence_dim": 64,
+            "n_style": 8,
+            "style_dim": 16,
+            "hdim": 128,
+            "n_layer": 2
+        }
+    }
+}

unicode_indexer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/F1.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/F2.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/F3.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/F4.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/F5.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/M1.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/M2.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/M3.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/M4.json ADDED Viewed

The diff for this file is too large to render. See raw diff

voice_styles/M5.json ADDED Viewed

The diff for this file is too large to render. See raw diff