niobures committed on
Commit
62d7cc7
·
verified ·
1 Parent(s): dc389b6

MOSS-TTS-Nano-100M-ONNX

Browse files
.gitattributes CHANGED
@@ -69,3 +69,5 @@ models/MOSS-TTS-Realtime-ONNX/onnx_models/local_transformer_f32/local_transforme
69
  models/MOSS-TTS-Realtime-ONNX/tokenizers/tokenizer.json filter=lfs diff=lfs merge=lfs -text
70
  models/MOSS-TTS/tokenizer.json filter=lfs diff=lfs merge=lfs -text
71
  MOSS-TTS[[:space:]]Technical[[:space:]]Report.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
69
  models/MOSS-TTS-Realtime-ONNX/tokenizers/tokenizer.json filter=lfs diff=lfs merge=lfs -text
70
  models/MOSS-TTS/tokenizer.json filter=lfs diff=lfs merge=lfs -text
71
  MOSS-TTS[[:space:]]Technical[[:space:]]Report.pdf filter=lfs diff=lfs merge=lfs -text
72
+ models/MOSS-TTS-Nano-100M-ONNX/moss_tts_global_shared.data filter=lfs diff=lfs merge=lfs -text
73
+ models/MOSS-TTS-Nano-100M-ONNX/moss_tts_local_shared.data filter=lfs diff=lfs merge=lfs -text
models/MOSS-TTS-Nano-100M-ONNX/.gitattributes ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.data filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
38
+ *.jpg filter=lfs diff=lfs merge=lfs -text
39
+ *.wav filter=lfs diff=lfs merge=lfs -text
40
+ *.gguf filter=lfs diff=lfs merge=lfs -text
41
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
models/MOSS-TTS-Nano-100M-ONNX/README.md ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: onnx
4
+ pipeline_tag: text-to-speech
5
+ tags:
6
+ - text-to-speech
7
+ - audio
8
+ - moss-tts-family
9
+ - moss-tts-nano
10
+ - onnx
11
+ - onnxruntime
12
+ - browser
13
+ - cpu
14
+ ---
15
+
16
+ # MOSS-TTS-Nano-100M-ONNX
17
+
18
+ This repository provides the **ONNX exports** of [MOSS-TTS-Nano](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano), a **0.1B multilingual tiny speech generation model** from [MOSI.AI](https://mosi.cn/#hero) and the [OpenMOSS team](https://www.open-moss.com/). It is designed for **torch-free**, lightweight deployment on CPU and in the browser, and is intended to be used together with [MOSS-Audio-Tokenizer-Nano-ONNX](https://huggingface.co/OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano-ONNX).
19
+
20
+ ## Overview
21
+
22
+ MOSS-TTS-Nano focuses on the parts of TTS deployment that matter most in practice: **small footprint**, **low latency**, **good enough quality for realtime products**, and **simple local setup**. It uses a pure autoregressive **Audio Tokenizer + LLM** pipeline and keeps the inference workflow friendly for browser demos, local CPU runtimes, and other lightweight integrations.
23
+
24
+ Main characteristics:
25
+
26
+ - **Tiny model size**: about **0.1B parameters**
27
+ - **Native audio format**: **48 kHz**, **2-channel** output
28
+ - **Multilingual**: same language coverage as the PyTorch `MOSS-TTS-Nano` release
29
+ - **Pure autoregressive architecture**: built on **Audio Tokenizer + LLM**
30
+ - **Streaming-friendly export**: split into prefill / decode-step / local decoder ONNX graphs
31
+ - **CPU and browser deployment**: designed for `onnxruntime` and `onnxruntime-web`
32
+
33
+ This repository contains only the exported ONNX graphs and the files they need at runtime (external weight data, the text tokenizer, and integration metadata). If you want the original PyTorch model card and plug-and-play local inference scripts, please use [OpenMOSS-Team/MOSS-TTS-Nano](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano) or the [OpenMOSS/MOSS-TTS-Nano](https://github.com/OpenMOSS/MOSS-TTS-Nano) source repository.
34
+
35
+ ## Supported Backends
36
+
37
+ | Backend | Runtime | Use Case |
38
+ |---------|---------|----------|
39
+ | **ONNX Runtime (CPU)** | `onnxruntime` | Local CPU inference |
40
+ | **ONNX Runtime Web** | `onnxruntime-web` | Browser demos / extensions |
41
+
42
+ ## Repository Contents
43
+
44
+ | File | Description |
45
+ |------|-------------|
46
+ | `moss_tts_prefill.onnx` | Global transformer prefill graph |
47
+ | `moss_tts_decode_step.onnx` | Global transformer decode-step graph with KV cache |
48
+ | `moss_tts_local_decoder.onnx` | Local decoder graph |
49
+ | `moss_tts_local_cached_step.onnx` | Local cached-step graph |
50
+ | `moss_tts_local_fixed_sampled_frame.onnx` | Local frame sampling graph |
51
+ | `moss_tts_global_shared.data` | External weights shared by the global graphs |
52
+ | `moss_tts_local_shared.data` | External weights shared by the local graphs |
53
+ | `tokenizer.model` | SentencePiece tokenizer used by the text frontend |
54
+ | `tts_browser_onnx_meta.json` | Metadata for ONNX Runtime integration |
55
+ | `browser_poc_manifest.json` | Example manifest for browser-based integration |
56
+
57
+ ## Quick Start
58
+
59
+ ```bash
60
+ huggingface-cli download OpenMOSS-Team/MOSS-TTS-Nano-100M-ONNX \
61
+ --local-dir weights/MOSS-TTS-Nano-100M-ONNX
62
+
63
+ huggingface-cli download OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano-ONNX \
64
+ --local-dir weights/MOSS-Audio-Tokenizer-Nano-ONNX
65
+ ```
66
+
67
+ The TTS repo provides the language model and text tokenizer exports, while the companion codec repo provides waveform encode/decode ONNX models.
68
+
69
+ ## Main Repositories
70
+
71
+ | Repository | Description |
72
+ |------------|-------------|
73
+ | [OpenMOSS/MOSS-TTS-Nano](https://github.com/OpenMOSS/MOSS-TTS-Nano) | MOSS-TTS-Nano source code, demos, and PyTorch inference |
74
+ | [OpenMOSS-Team/MOSS-TTS-Nano](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano) | PyTorch MOSS-TTS-Nano weights |
75
+ | [OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano-ONNX](https://huggingface.co/OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano-ONNX) | Companion ONNX audio tokenizer |
76
+ | [OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano](https://huggingface.co/OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano) | PyTorch audio tokenizer weights |
77
+ | [OpenMOSS/MOSS-TTS-Nano-Reader](https://github.com/OpenMOSS/MOSS-TTS-Nano-Reader) | Browser reading application built on top of the ONNX stack |
78
+
79
+ ## About MOSS-TTS-Nano
80
+
81
+ MOSS-TTS-Nano is an open-source multilingual tiny speech generation model built for realtime speech generation and lightweight deployment. The ONNX export keeps the same core architecture as the PyTorch release while making it easier to integrate into browser and CPU-only runtimes without a PyTorch dependency.
82
+
83
+ For the full project introduction, demos, and PyTorch usage, see:
84
+
85
+ - [MOSS-TTS-Nano Repository](https://github.com/OpenMOSS/MOSS-TTS-Nano)
86
+ - [MOSS-TTS-Nano on Hugging Face](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano)
87
+
88
+ ## Citation
89
+
90
+ If you use the MOSS-TTS work in your research or product, please cite:
91
+
92
+ ```bibtex
93
+ @misc{openmoss2026mossttsnano,
94
+ title={MOSS-TTS-Nano},
95
+ author={OpenMOSS Team},
96
+ year={2026},
97
+ howpublished={GitHub repository},
98
+ url={https://github.com/OpenMOSS/MOSS-TTS-Nano}
99
+ }
100
+ ```
101
+
102
+ ```bibtex
103
+ @misc{gong2026mossttstechnicalreport,
104
+ title={MOSS-TTS Technical Report},
105
+ author={Yitian Gong and Botian Jiang and Yiwei Zhao and Yucheng Yuan and Kuangwei Chen and Yaozhou Jiang and Cheng Chang and Dong Hong and Mingshu Chen and Ruixiao Li and Yiyang Zhang and Yang Gao and Hanfu Chen and Ke Chen and Songlin Wang and Xiaogui Yang and Yuqian Zhang and Kexin Huang and ZhengYuan Lin and Kang Yu and Ziqi Chen and Jin Wang and Zhaoye Fei and Qinyuan Cheng and Shimin Li and Xipeng Qiu},
106
+ year={2026},
107
+ eprint={2603.18090},
108
+ archivePrefix={arXiv},
109
+ primaryClass={cs.SD},
110
+ url={https://arxiv.org/abs/2603.18090}
111
+ }
112
+ ```
113
+
114
+ ```bibtex
115
+ @misc{gong2026mossaudiotokenizerscalingaudiotokenizers,
116
+ title={MOSS-Audio-Tokenizer: Scaling Audio Tokenizers for Future Audio Foundation Models},
117
+ author={Yitian Gong and Kuangwei Chen and Zhaoye Fei and Xiaogui Yang and Ke Chen and Yang Wang and Kexin Huang and Mingshu Chen and Ruixiao Li and Qingyuan Cheng and Shimin Li and Xipeng Qiu},
118
+ year={2026},
119
+ eprint={2602.10934},
120
+ archivePrefix={arXiv},
121
+ primaryClass={cs.SD},
122
+ url={https://arxiv.org/abs/2602.10934}
123
+ }
124
+ ```
models/MOSS-TTS-Nano-100M-ONNX/browser_poc_manifest.json ADDED
The diff for this file is too large to render. See raw diff
 
models/MOSS-TTS-Nano-100M-ONNX/moss_tts_decode_step.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:698cbc2fc1c2feca16e5895614ed52bbb32ded10f236c076f477b2e69abf32d8
3
+ size 291483
models/MOSS-TTS-Nano-100M-ONNX/moss_tts_global_shared.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bce8312c3df6a44545302cae229b61054fe0672e0b252ba59cba47adeed831dc
3
+ size 440813568
models/MOSS-TTS-Nano-100M-ONNX/moss_tts_local_cached_step.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa9035fefc1c138a951a8bcfc0374fb03a25f1ece67f7f7f53bce349b84a1dd5
3
+ size 53685
models/MOSS-TTS-Nano-100M-ONNX/moss_tts_local_decoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51aa754301b38550a5f9adda0ad93bd3dc95819afb511e6dcabf4a90b345a454
3
+ size 49231
models/MOSS-TTS-Nano-100M-ONNX/moss_tts_local_fixed_sampled_frame.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40cdb00efc171c450cf91468e01429caa41b0252222cd308e978f58fe354afa8
3
+ size 471262
models/MOSS-TTS-Nano-100M-ONNX/moss_tts_local_shared.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bae7782032c0fb12490ab42afe009f87ae6c75a0f0596fc7b5c08e4d5ee93916
3
+ size 229678080
models/MOSS-TTS-Nano-100M-ONNX/moss_tts_prefill.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d56126dcd0574c2f15d98fc6b35eda68d0386b5bd9c5e38e28548d6f2ea8f3db
3
+ size 283305
models/MOSS-TTS-Nano-100M-ONNX/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M-ONNX
models/MOSS-TTS-Nano-100M-ONNX/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c353ee1479b536bf414c1b247f5542b6607fb8ae91320e5af1781fee200fddff
3
+ size 470897
models/MOSS-TTS-Nano-100M-ONNX/tts_browser_onnx_meta.json ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "format_version": 1,
3
+ "checkpoint_path": "MOSS-TTS-Nano",
4
+ "files": {
5
+ "prefill": "moss_tts_prefill.onnx",
6
+ "decode_step": "moss_tts_decode_step.onnx",
7
+ "local_decoder": "moss_tts_local_decoder.onnx",
8
+ "local_cached_step": "moss_tts_local_cached_step.onnx",
9
+ "local_fixed_sampled_frame": "moss_tts_local_fixed_sampled_frame.onnx"
10
+ },
11
+ "external_data_files": {
12
+ "moss_tts_prefill.onnx": [
13
+ "moss_tts_global_shared.data"
14
+ ],
15
+ "moss_tts_decode_step.onnx": [
16
+ "moss_tts_global_shared.data"
17
+ ],
18
+ "moss_tts_local_decoder.onnx": [
19
+ "moss_tts_local_shared.data"
20
+ ],
21
+ "moss_tts_local_cached_step.onnx": [
22
+ "moss_tts_local_shared.data"
23
+ ],
24
+ "moss_tts_local_fixed_sampled_frame.onnx": [
25
+ "moss_tts_local_shared.data"
26
+ ]
27
+ },
28
+ "model_config": {
29
+ "n_vq": 16,
30
+ "row_width": 17,
31
+ "hidden_size": 768,
32
+ "global_layers": 12,
33
+ "global_heads": 12,
34
+ "head_dim": 64,
35
+ "local_layers": 1,
36
+ "local_heads": 12,
37
+ "local_head_dim": 64,
38
+ "vocab_size": 16384,
39
+ "audio_codebook_sizes": [
40
+ 1024,
41
+ 1024,
42
+ 1024,
43
+ 1024,
44
+ 1024,
45
+ 1024,
46
+ 1024,
47
+ 1024,
48
+ 1024,
49
+ 1024,
50
+ 1024,
51
+ 1024,
52
+ 1024,
53
+ 1024,
54
+ 1024,
55
+ 1024
56
+ ],
57
+ "audio_pad_token_id": 1024,
58
+ "pad_token_id": 3,
59
+ "im_start_token_id": 4,
60
+ "im_end_token_id": 5,
61
+ "audio_start_token_id": 6,
62
+ "audio_end_token_id": 7,
63
+ "audio_user_slot_token_id": 8,
64
+ "audio_assistant_slot_token_id": 9
65
+ },
66
+ "onnx": {
67
+ "opset": 17,
68
+ "prefill_output_names": [
69
+ "global_hidden",
70
+ "present_key_0",
71
+ "present_value_0",
72
+ "present_key_1",
73
+ "present_value_1",
74
+ "present_key_2",
75
+ "present_value_2",
76
+ "present_key_3",
77
+ "present_value_3",
78
+ "present_key_4",
79
+ "present_value_4",
80
+ "present_key_5",
81
+ "present_value_5",
82
+ "present_key_6",
83
+ "present_value_6",
84
+ "present_key_7",
85
+ "present_value_7",
86
+ "present_key_8",
87
+ "present_value_8",
88
+ "present_key_9",
89
+ "present_value_9",
90
+ "present_key_10",
91
+ "present_value_10",
92
+ "present_key_11",
93
+ "present_value_11"
94
+ ],
95
+ "decode_input_names": [
96
+ "input_ids",
97
+ "past_valid_lengths",
98
+ "past_key_0",
99
+ "past_value_0",
100
+ "past_key_1",
101
+ "past_value_1",
102
+ "past_key_2",
103
+ "past_value_2",
104
+ "past_key_3",
105
+ "past_value_3",
106
+ "past_key_4",
107
+ "past_value_4",
108
+ "past_key_5",
109
+ "past_value_5",
110
+ "past_key_6",
111
+ "past_value_6",
112
+ "past_key_7",
113
+ "past_value_7",
114
+ "past_key_8",
115
+ "past_value_8",
116
+ "past_key_9",
117
+ "past_value_9",
118
+ "past_key_10",
119
+ "past_value_10",
120
+ "past_key_11",
121
+ "past_value_11"
122
+ ],
123
+ "decode_output_names": [
124
+ "global_hidden",
125
+ "present_key_0",
126
+ "present_value_0",
127
+ "present_key_1",
128
+ "present_value_1",
129
+ "present_key_2",
130
+ "present_value_2",
131
+ "present_key_3",
132
+ "present_value_3",
133
+ "present_key_4",
134
+ "present_value_4",
135
+ "present_key_5",
136
+ "present_value_5",
137
+ "present_key_6",
138
+ "present_value_6",
139
+ "present_key_7",
140
+ "present_value_7",
141
+ "present_key_8",
142
+ "present_value_8",
143
+ "present_key_9",
144
+ "present_value_9",
145
+ "present_key_10",
146
+ "present_value_10",
147
+ "present_key_11",
148
+ "present_value_11"
149
+ ],
150
+ "local_cached_input_names": [
151
+ "global_hidden",
152
+ "text_token_id",
153
+ "audio_token_id",
154
+ "channel_index",
155
+ "step_type",
156
+ "past_valid_lengths",
157
+ "local_past_key_0",
158
+ "local_past_value_0"
159
+ ],
160
+ "local_cached_output_names": [
161
+ "text_logits",
162
+ "audio_logits",
163
+ "local_present_key_0",
164
+ "local_present_value_0"
165
+ ],
166
+ "local_fixed_sampled_frame_input_names": [
167
+ "global_hidden",
168
+ "repetition_seen_mask",
169
+ "assistant_random_u",
170
+ "audio_random_u"
171
+ ],
172
+ "local_fixed_sampled_frame_output_names": [
173
+ "should_continue",
174
+ "frame_token_ids"
175
+ ],
176
+ "fixed_sampled_frame_constants": {
177
+ "text_temperature": 1.0,
178
+ "text_top_p": 1.0,
179
+ "text_top_k": 50,
180
+ "audio_temperature": 0.8,
181
+ "audio_top_p": 0.95,
182
+ "audio_top_k": 25,
183
+ "audio_repetition_penalty": 1.2
184
+ }
185
+ }
186
+ }