niobures commited on
Commit
052c0a4
·
verified ·
1 Parent(s): 5d5fac5

MOSS-TTS-Nano-100M

Browse files
.gitattributes CHANGED
@@ -72,3 +72,5 @@ MOSS-TTS[[:space:]]Technical[[:space:]]Report.pdf filter=lfs diff=lfs merge=lfs
72
  models/MOSS-TTS-Nano-100M-ONNX/moss_tts_global_shared.data filter=lfs diff=lfs merge=lfs -text
73
  models/MOSS-TTS-Nano-100M-ONNX/moss_tts_local_shared.data filter=lfs diff=lfs merge=lfs -text
74
  models/MOSS-TTS-Local-Transformer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
72
  models/MOSS-TTS-Nano-100M-ONNX/moss_tts_global_shared.data filter=lfs diff=lfs merge=lfs -text
73
  models/MOSS-TTS-Nano-100M-ONNX/moss_tts_local_shared.data filter=lfs diff=lfs merge=lfs -text
74
  models/MOSS-TTS-Local-Transformer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
75
+ models/MOSS-TTS-Nano-100M/assets/images/arch_moss_audio_tokenizer_nano.png filter=lfs diff=lfs merge=lfs -text
76
+ models/MOSS-TTS-Nano-100M/assets/images/concept.png filter=lfs diff=lfs merge=lfs -text
models/MOSS-TTS-Nano-100M/.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.jpg filter=lfs diff=lfs merge=lfs -text
models/MOSS-TTS-Nano-100M/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
models/MOSS-TTS-Nano-100M/README.md ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - text-to-speech
5
+ language:
6
+ - zh
7
+ - en
8
+ - de
9
+ - es
10
+ - fr
11
+ - ja
12
+ - it
13
+ - he
14
+ - ko
15
+ - ru
16
+ - fa
17
+ - ar
18
+ - pl
19
+ - pt
20
+ - cs
21
+ - da
22
+ - sv
23
+ - hu
24
+ - el
25
+ - tr
26
+ ---
27
+
28
+ # MOSS-TTS-Nano
29
+
30
+ <br>
31
+
32
+ <p align="center">
33
+ <img src="./assets/images/OpenMOSS_Logo.png" height="70" align="middle" />
34
+ &nbsp;&nbsp;&nbsp;&nbsp;
35
+ <img src="./assets/images/mosi-logo.png" height="50" align="middle" />
36
+ </p>
37
+
38
+ <div align="center">
39
+ <a href="https://clawhub.ai/luogao2333/moss-tts-voice"><img src="https://img.shields.io/badge/🦞_OpenClaw-Skills-8A2BE2" alt="OpenClaw"></a>
40
+ <a href="https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano"><img src="https://img.shields.io/badge/Huggingface-Models-orange?logo=huggingface" alt="Hugging Face"></a>
41
+ <a href="https://modelscope.cn/models/openmoss/MOSS-TTS-Nano"><img src="https://img.shields.io/badge/ModelScope-Models-7B61FF?logo=modelscope&amp;logoColor=white"></a>
42
+ <a href="https://mosi.cn/#models"><img src="https://img.shields.io/badge/Blog-View-blue?logo=internet-explorer" alt="Blog"></a>
43
+ <a href="https://arxiv.org/abs/2603.18090"><img src="https://img.shields.io/badge/Arxiv-2603.18090-red?logo=arxiv" alt="arXiv"></a>
44
+
45
+ <a href="https://studio.mosi.cn/experiments/moss-tts-nano"><img src="https://img.shields.io/badge/AIStudio-Try-green?logo=internet-explorer" alt="AI Studio"></a>
46
+ <a href="https://studio.mosi.cn/docs/moss-tts-nano"><img src="https://img.shields.io/badge/API-Docs-00A3FF?logo=fastapi" alt="API Docs"></a>
47
+ <a href="https://x.com/Open_MOSS"><img src="https://img.shields.io/badge/Twitter-Follow-black?logo=x" alt="X (Twitter)"></a>
48
+ <a href="https://discord.gg/Xf3aXddCjc"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord" alt="Discord"></a>
49
+ <a href="./assets/images/wechat.jpg"><img src="https://img.shields.io/badge/WeChat-Join-07C160?logo=wechat&amp;logoColor=white" alt="WeChat"></a>
50
+ </div>
51
+
52
+ MOSS-TTS-Nano is an open-source **multilingual tiny speech generation model** from [MOSI.AI](https://mosi.cn/#hero) and the [OpenMOSS team](https://www.open-moss.com/). With only **0.1B parameters**, it is designed for **realtime speech generation**, can run directly on **CPU without a GPU**, and keeps the deployment stack simple enough for local demos, web serving, and lightweight product integration.
53
+
54
+ ## News
55
+
56
+ * 2026.4.10: We release **MOSS-TTS-Nano**. A demo Space is available at [OpenMOSS-Team/MOSS-TTS-Nano](https://huggingface.co/spaces/OpenMOSS-Team/MOSS-TTS-Nano). You can also view the demo and more details at [openmoss.github.io/MOSS-TTS-Nano-Demo/](https://openmoss.github.io/MOSS-TTS-Nano-Demo/).
57
+
58
+ ## Demo
59
+
60
+ - Online Demo: [https://openmoss.github.io/MOSS-TTS-Nano-Demo/](https://openmoss.github.io/MOSS-TTS-Nano-Demo/)
61
+ - Hugging Face Space: [OpenMOSS-Team/MOSS-TTS-Nano](https://huggingface.co/spaces/OpenMOSS-Team/MOSS-TTS-Nano)
62
+
63
+ ## Contents
64
+
65
+ - [News](#news)
66
+ - [Demo](#demo)
67
+ - [Introduction](#introduction)
68
+ - [Main Features](#main-features)
69
+ - [Supported Languages](#supported-languages)
70
+ - [Quickstart](#quickstart)
71
+ - [Environment Setup](#environment-setup)
72
+ - [Voice Clone with `infer.py`](#voice-clone-with-inferpy)
73
+ - [Local Web Demo with `app.py`](#local-web-demo-with-apppy)
74
+ - [CLI Command: `moss-tts-nano generate`](#cli-command-moss-tts-nano-generate)
75
+ - [CLI Command: `moss-tts-nano serve`](#cli-command-moss-tts-nano-serve)
76
+ - [MOSS-Audio-Tokenizer-Nano](#moss-audio-tokenizer-nano)
77
+ - [License](#license)
78
+ - [Citation](#citation)
79
+ - [Star History](#star-history)
80
+
81
+ ## Introduction
82
+
83
+ <p align="center">
84
+ <img src="./assets/images/concept.png" alt="MOSS-TTS-Nano concept" width="85%" />
85
+ </p>
86
+
87
+ MOSS-TTS-Nano focuses on the part of TTS deployment that matters most in practice: **small footprint**, **low latency**, **good enough quality for realtime products**, and **simple local setup**. It uses a pure autoregressive **Audio Tokenizer + LLM** pipeline and keeps the inference workflow friendly for both terminal users and web-demo users.
88
+
89
+ ### Main Features
90
+
91
+ - **Tiny model size**: only **0.1B parameters**
92
+ - **Native audio format**: **48 kHz**, **2-channel** output
93
+ - **Multilingual**: supports **Chinese, English, and more**
94
+ - **Pure autoregressive architecture**: built on **Audio Tokenizer + LLM**
95
+ - **Streaming inference**: low realtime latency and fast first audio
96
+ - **CPU friendly**: streaming generation can run on a **4-core CPU**
97
+ - **Long-text capable**: supports long input with automatic chunked voice cloning
98
+ - **Open-source deployment**: direct `python infer.py`, `python app.py`, and packaged CLI support
99
+
100
+ ## Supported Languages
101
+
102
+ MOSS-TTS-Nano currently supports **20 languages**:
103
+
104
+ | Language | Code | Flag | Language | Code | Flag | Language | Code | Flag |
105
+ |---|---|---|---|---|---|---|---|---|
106
+ | Chinese | zh | 🇨🇳 | English | en | 🇺🇸 | German | de | 🇩🇪 |
107
+ | Spanish | es | 🇪🇸 | French | fr | 🇫🇷 | Japanese | ja | 🇯🇵 |
108
+ | Italian | it | 🇮🇹 | Hungarian | hu | 🇭🇺 | Korean | ko | 🇰🇷 |
109
+ | Russian | ru | 🇷🇺 | Persian (Farsi) | fa | 🇮🇷 | Arabic | ar | 🇸🇦 |
110
+ | Polish | pl | 🇵🇱 | Portuguese | pt | 🇵🇹 | Czech | cs | 🇨🇿 |
111
+ | Danish | da | 🇩🇰 | Swedish | sv | 🇸🇪 | Greek | el | 🇬🇷 |
112
+ | Turkish | tr | 🇹🇷 | Hebrew | he | 🇮🇱 | | | |
113
+
114
+ ## Quickstart
115
+
116
+ ### Environment Setup
117
+
118
+ We recommend creating a clean Python environment first, then installing the project in editable mode so that the `moss-tts-nano` command becomes available locally.
119
+ The examples below intentionally keep arguments minimal and rely on the repository defaults.
120
+ By default, the code loads `OpenMOSS-Team/MOSS-TTS-Nano` and `OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano`.
121
+
122
+ #### Using Conda
123
+
124
+ ```bash
125
+ conda create -n moss-tts-nano python=3.12 -y
126
+ conda activate moss-tts-nano
127
+
128
+ git clone https://github.com/OpenMOSS/MOSS-TTS-Nano.git
129
+ cd MOSS-TTS-Nano
130
+
131
+ pip install -r requirements.txt
132
+ pip install -e .
133
+ ```
134
+
135
+ If `WeTextProcessing` fails to install from `requirements.txt`, try installing it manually in the same environment:
136
+
137
+ ```bash
138
+ conda install -c conda-forge pynini=2.1.6.post1 -y
139
+ pip install git+https://github.com/WhizZest/WeTextProcessing.git
140
+ ```
141
+
142
+ ### Voice Clone with `infer.py`
143
+
144
+ This repository keeps the direct Python entrypoint for local inference. The example below uses **voice clone mode**, which is the main recommended workflow for MOSS-TTS-Nano.
145
+
146
+ ```bash
147
+ python infer.py \
148
+ --prompt-audio-path assets/audio/zh_1.wav \
149
+ --text "欢迎关注模思智能、上海创智学院与复旦大学自然语言处理实验室。"
150
+ ```
151
+
152
+ This writes audio to `generated_audio/infer_output.wav` by default.
153
+
154
+ ### Local Web Demo with `app.py`
155
+
156
+ You can launch the local FastAPI demo for browser-based testing:
157
+
158
+ ```bash
159
+ python app.py
160
+ ```
161
+
162
+ Then open `http://127.0.0.1:18083` in your browser.
163
+
164
+ ### CLI Command: `moss-tts-nano generate`
165
+
166
+ After `pip install -e .`, you can call the packaged CLI directly:
167
+
168
+ ```bash
169
+ moss-tts-nano generate \
170
+ --prompt-speech assets/audio/zh_1.wav \
171
+ --text "欢迎关注模思智能、上海创智学院与复旦大学自然语言处理实验室。"
172
+ ```
173
+
174
+ Useful notes:
175
+
176
+ - `moss-tts-nano generate` writes to `generated_audio/moss_tts_nano_output.wav` by default.
177
+ - `--prompt-speech` is the friendly alias for the reference audio path used by voice cloning.
178
+ - `--text-file` is supported for long-form synthesis.
179
+
180
+ ### CLI Command: `moss-tts-nano serve`
181
+
182
+ You can also launch the web demo through the packaged CLI:
183
+
184
+ ```bash
185
+ moss-tts-nano serve
186
+ ```
187
+
188
+ This command forwards to `app.py`, keeps the model loaded in memory, and serves the local browser demo plus HTTP generation endpoints.
189
+
190
+ ## MOSS-Audio-Tokenizer-Nano
191
+
192
+ <a id="mat-intro"></a>
193
+ ### Introduction
194
+ **MOSS-Audio-Tokenizer** is the unified discrete audio interface for the entire MOSS-TTS family. It is built on the **Cat** (**C**ausal **A**udio **T**okenizer with **T**ransformer) architecture, a CNN-free audio tokenizer composed entirely of causal Transformer blocks. It serves as the shared audio backbone for MOSS-TTS, MOSS-TTS-Nano, MOSS-TTSD, MOSS-VoiceGenerator, MOSS-SoundEffect, and MOSS-TTS-Realtime, providing a consistent audio representation across the full product family.
195
+
196
+ To further improve perceptual quality while reducing inference cost, we trained **MOSS-Audio-Tokenizer-Nano**, a lightweight tokenizer with approximately **20 million parameters** designed for high-fidelity audio compression. It supports **48 kHz** input and output as well as **stereo audio**, which helps reduce compression loss and improve listening quality. It can compress **48 kHz stereo audio** into a **12.5 Hz** token stream and uses **RVQ with 16 codebooks**, enabling high-fidelity reconstruction across variable bitrates from **0.125 kbps to 4 kbps**.
197
+
198
+
199
+ To learn more about setup, advanced usage, and evaluation metrics, please visit the [MOSS-Audio-Tokenizer Repository](https://github.com/OpenMOSS/MOSS-Audio-Tokenizer).
200
+
201
+ <p align="center">
202
+ <img src="./assets/images/arch_moss_audio_tokenizer_nano.png" alt="MOSS-Audio-Tokenizer-Nano architecture" width="100%" />
203
+ Architecture of MOSS-Audio-Tokenizer-Nano
204
+ </p>
205
+
206
+ ### Model Weights
207
+
208
+ | Model | Hugging Face | ModelScope |
209
+ |:-----:|:------------:|:----------:|
210
+ | **MOSS-Audio-Tokenizer-Nano** | [![Hugging Face](https://img.shields.io/badge/Huggingface-Model-orange?logo=huggingface)](https://huggingface.co/OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano) | [![ModelScope](https://img.shields.io/badge/ModelScope-Models-7B61FF?logo=modelscope&amp;logoColor=white)](https://modelscope.cn/models/openmoss/MOSS-Audio-Tokenizer-Nano) |
211
+
212
+
213
+ ## License
214
+
215
+ This repository will follow the license specified in the root `LICENSE` file. If you are reading this before that file is published, please treat the repository as **not yet licensed for redistribution**.
216
+
217
+ ## Citation
218
+
219
+ If you use the MOSS-TTS work in your research or product, please cite:
220
+
221
+ ```bibtex
222
+ @misc{openmoss2026mossttsnano,
223
+ title={MOSS-TTS-Nano},
224
+ author={OpenMOSS Team},
225
+ year={2026},
226
+ howpublished={GitHub repository},
227
+ url={https://github.com/OpenMOSS/MOSS-TTS-Nano}
228
+ }
229
+ ```
230
+
231
+ ```bibtex
232
+ @misc{gong2026mossttstechnicalreport,
233
+ title={MOSS-TTS Technical Report},
234
+ author={Yitian Gong and Botian Jiang and Yiwei Zhao and Yucheng Yuan and Kuangwei Chen and Yaozhou Jiang and Cheng Chang and Dong Hong and Mingshu Chen and Ruixiao Li and Yiyang Zhang and Yang Gao and Hanfu Chen and Ke Chen and Songlin Wang and Xiaogui Yang and Yuqian Zhang and Kexin Huang and ZhengYuan Lin and Kang Yu and Ziqi Chen and Jin Wang and Zhaoye Fei and Qinyuan Cheng and Shimin Li and Xipeng Qiu},
235
+ year={2026},
236
+ eprint={2603.18090},
237
+ archivePrefix={arXiv},
238
+ primaryClass={cs.SD},
239
+ url={https://arxiv.org/abs/2603.18090}
240
+ }
241
+ ```
242
+
243
+ ```bibtex
244
+ @misc{gong2026mossaudiotokenizerscalingaudiotokenizers,
245
+ title={MOSS-Audio-Tokenizer: Scaling Audio Tokenizers for Future Audio Foundation Models},
246
+ author={Yitian Gong and Kuangwei Chen and Zhaoye Fei and Xiaogui Yang and Ke Chen and Yang Wang and Kexin Huang and Mingshu Chen and Ruixiao Li and Qingyuan Cheng and Shimin Li and Xipeng Qiu},
247
+ year={2026},
248
+ eprint={2602.10934},
249
+ archivePrefix={arXiv},
250
+ primaryClass={cs.SD},
251
+ url={https://arxiv.org/abs/2602.10934},
252
+ }
253
+ ```
models/MOSS-TTS-Nano-100M/__init__.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Package entry point: re-exports the public classes and registers them for Auto* loading."""
import logging

from .configuration_moss_tts_nano import MossTTSNanoConfig
from .modeling_moss_tts_nano import (
    MossTTSNanoForCausalLM,
    MossTTSNanoGenerationOutput,
    MossTTSNanoOutput,
)
from .tokenization_moss_tts_nano import MossTTSNanoSentencePieceTokenizer

_logger = logging.getLogger(__name__)


def _register_for_auto_class(cls, *auto_class_args):
    """Best-effort call to ``cls.register_for_auto_class(*auto_class_args)``.

    Registration is optional for importing this package, so any failure is
    tolerated; it is reported at debug level instead of being dropped silently.
    """
    try:
        cls.register_for_auto_class(*auto_class_args)
    except Exception as exc:  # deliberate best-effort: never break the import
        _logger.debug(
            "Skipping auto-class registration of %s%r: %s",
            getattr(cls, "__name__", cls),
            auto_class_args,
            exc,
        )


# Same registration order as before: config, model (twice), tokenizer.
_register_for_auto_class(MossTTSNanoConfig)

for auto_class_name in ("AutoModel", "AutoModelForCausalLM"):
    _register_for_auto_class(MossTTSNanoForCausalLM, auto_class_name)

_register_for_auto_class(MossTTSNanoSentencePieceTokenizer, "AutoTokenizer")

__all__ = [
    "MossTTSNanoConfig",
    "MossTTSNanoForCausalLM",
    "MossTTSNanoSentencePieceTokenizer",
    "MossTTSNanoGenerationOutput",
    "MossTTSNanoOutput",
]
models/MOSS-TTS-Nano-100M/assets/images/OpenMOSS_Logo.png ADDED
models/MOSS-TTS-Nano-100M/assets/images/arch_moss_audio_tokenizer_nano.png ADDED

Git LFS Details

  • SHA256: 2975096ead35b386724868a86a79de46a044eea2cbb815fb75b16f8ac9511db4
  • Pointer size: 131 Bytes
  • Size of remote file: 174 kB
models/MOSS-TTS-Nano-100M/assets/images/concept.png ADDED

Git LFS Details

  • SHA256: 18c079211d63da4e3bc622d49c72ecb96d0f5f078fc912fb5f29065cd4ad3a5f
  • Pointer size: 132 Bytes
  • Size of remote file: 2.23 MB
models/MOSS-TTS-Nano-100M/assets/images/mosi-logo.png ADDED
models/MOSS-TTS-Nano-100M/assets/images/wechat.jpg ADDED
models/MOSS-TTS-Nano-100M/config.json ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "MossTTSNanoForCausalLM"
5
+ ],
6
+ "attn_implementation": "flash_attention_2",
7
+ "audio_assistant_slot_token_id": 9,
8
+ "audio_codebook_sizes": [
9
+ 1024,
10
+ 1024,
11
+ 1024,
12
+ 1024,
13
+ 1024,
14
+ 1024,
15
+ 1024,
16
+ 1024,
17
+ 1024,
18
+ 1024,
19
+ 1024,
20
+ 1024,
21
+ 1024,
22
+ 1024,
23
+ 1024,
24
+ 1024
25
+ ],
26
+ "audio_end_token_id": 7,
27
+ "audio_pad_token_id": 1024,
28
+ "audio_start_token_id": 6,
29
+ "audio_tokenizer_pretrained_name_or_path": "OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano",
30
+ "audio_tokenizer_sample_rate": 48000,
31
+ "audio_tokenizer_type": "moss-audio-tokenizer-nano",
32
+ "audio_user_slot_token_id": 8,
33
+ "audio_vocab_size": 1024,
34
+ "bad_words_ids": null,
35
+ "begin_suppress_tokens": null,
36
+ "bos_token_id": null,
37
+ "chunk_size_feed_forward": 0,
38
+ "cross_attention_hidden_size": null,
39
+ "decoder_start_token_id": null,
40
+ "diversity_penalty": 0.0,
41
+ "do_sample": false,
42
+ "dtype": "float32",
43
+ "early_stopping": false,
44
+ "encoder_no_repeat_ngram_size": 0,
45
+ "eos_token_id": null,
46
+ "exponential_decay_length_penalty": null,
47
+ "finetuning_task": null,
48
+ "forced_bos_token_id": null,
49
+ "forced_eos_token_id": null,
50
+ "gpt2_config": {
51
+ "_name_or_path": "",
52
+ "activation_function": "gelu_new",
53
+ "add_cross_attention": false,
54
+ "architectures": null,
55
+ "attn_pdrop": 0.0,
56
+ "bad_words_ids": null,
57
+ "begin_suppress_tokens": null,
58
+ "bos_token_id": 1,
59
+ "chunk_size_feed_forward": 0,
60
+ "cross_attention_hidden_size": null,
61
+ "decoder_start_token_id": null,
62
+ "diversity_penalty": 0.0,
63
+ "do_sample": false,
64
+ "dtype": null,
65
+ "early_stopping": false,
66
+ "embd_pdrop": 0.0,
67
+ "encoder_no_repeat_ngram_size": 0,
68
+ "eos_token_id": 2,
69
+ "exponential_decay_length_penalty": null,
70
+ "finetuning_task": null,
71
+ "forced_bos_token_id": null,
72
+ "forced_eos_token_id": null,
73
+ "id2label": {
74
+ "0": "LABEL_0",
75
+ "1": "LABEL_1"
76
+ },
77
+ "initializer_range": 0.02,
78
+ "is_decoder": false,
79
+ "is_encoder_decoder": false,
80
+ "label2id": {
81
+ "LABEL_0": 0,
82
+ "LABEL_1": 1
83
+ },
84
+ "layer_norm_epsilon": 1e-05,
85
+ "length_penalty": 1.0,
86
+ "max_length": 20,
87
+ "min_length": 0,
88
+ "model_type": "gpt2",
89
+ "n_ctx": 32768,
90
+ "n_embd": 768,
91
+ "n_head": 12,
92
+ "n_inner": 3072,
93
+ "n_layer": 12,
94
+ "n_positions": 32768,
95
+ "no_repeat_ngram_size": 0,
96
+ "num_beam_groups": 1,
97
+ "num_beams": 1,
98
+ "num_return_sequences": 1,
99
+ "output_attentions": false,
100
+ "output_hidden_states": false,
101
+ "output_scores": false,
102
+ "pad_token_id": 3,
103
+ "position_embedding_type": "rope",
104
+ "prefix": null,
105
+ "problem_type": null,
106
+ "pruned_heads": {},
107
+ "remove_invalid_values": false,
108
+ "reorder_and_upcast_attn": false,
109
+ "repetition_penalty": 1.0,
110
+ "resid_pdrop": 0.0,
111
+ "return_dict": true,
112
+ "return_dict_in_generate": false,
113
+ "rope_base": 10000.0,
114
+ "scale_attn_by_inverse_layer_idx": false,
115
+ "scale_attn_weights": true,
116
+ "sep_token_id": null,
117
+ "summary_activation": null,
118
+ "summary_first_dropout": 0.1,
119
+ "summary_proj_to_labels": true,
120
+ "summary_type": "cls_index",
121
+ "summary_use_proj": true,
122
+ "suppress_tokens": null,
123
+ "task_specific_params": null,
124
+ "temperature": 1.0,
125
+ "tf_legacy_loss": false,
126
+ "tie_encoder_decoder": false,
127
+ "tie_word_embeddings": true,
128
+ "tokenizer_class": null,
129
+ "top_k": 50,
130
+ "top_p": 1.0,
131
+ "torchscript": false,
132
+ "transformers_version": "4.57.1",
133
+ "typical_p": 1.0,
134
+ "use_bfloat16": false,
135
+ "use_cache": true,
136
+ "vocab_size": 16384
137
+ },
138
+ "hidden_size": 768,
139
+ "id2label": {
140
+ "0": "LABEL_0",
141
+ "1": "LABEL_1"
142
+ },
143
+ "im_end_token_id": 5,
144
+ "im_start_token_id": 4,
145
+ "initializer_range": 0.02,
146
+ "is_decoder": false,
147
+ "is_encoder_decoder": false,
148
+ "label2id": {
149
+ "LABEL_0": 0,
150
+ "LABEL_1": 1
151
+ },
152
+ "length_penalty": 1.0,
153
+ "local_transformer_attn_implementation": "flash_attention_2",
154
+ "local_transformer_layers": 1,
155
+ "max_length": 20,
156
+ "max_position_embeddings": 32768,
157
+ "min_length": 0,
158
+ "model_architecture": "global_local_transformer",
159
+ "model_type": "moss_tts_nano",
160
+ "n_vq": 16,
161
+ "no_repeat_ngram_size": 0,
162
+ "num_beam_groups": 1,
163
+ "num_beams": 1,
164
+ "num_return_sequences": 1,
165
+ "output_attentions": false,
166
+ "output_hidden_states": false,
167
+ "output_scores": false,
168
+ "pad_token_id": 3,
169
+ "prefix": null,
170
+ "problem_type": null,
171
+ "pruned_heads": {},
172
+ "remove_invalid_values": false,
173
+ "repetition_penalty": 1.0,
174
+ "return_dict": true,
175
+ "return_dict_in_generate": false,
176
+ "sep_token_id": null,
177
+ "suppress_tokens": null,
178
+ "task_specific_params": null,
179
+ "temperature": 1.0,
180
+ "tf_legacy_loss": false,
181
+ "tie_encoder_decoder": false,
182
+ "tie_word_embeddings": true,
183
+ "tokenizer_class": "MossTTSNanoSentencePieceTokenizer",
184
+ "tokenizer_use_fast": false,
185
+ "top_k": 50,
186
+ "top_p": 1.0,
187
+ "torchscript": false,
188
+ "transformers_version": "4.57.1",
189
+ "typical_p": 1.0,
190
+ "use_bfloat16": false,
191
+ "vocab_size": 16384,
192
+ "auto_map": {
193
+ "AutoConfig": "configuration_moss_tts_nano.MossTTSNanoConfig",
194
+ "AutoModel": "modeling_moss_tts_nano.MossTTSNanoForCausalLM",
195
+ "AutoModelForCausalLM": "modeling_moss_tts_nano.MossTTSNanoForCausalLM"
196
+ }
197
+ }
models/MOSS-TTS-Nano-100M/configuration_moss_tts_nano.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ from typing import Any, Dict, Optional, Union
3
+
4
+ from transformers.configuration_utils import PretrainedConfig
5
+ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
6
+
7
+
8
class MossTTSNanoConfig(PretrainedConfig):
    """Configuration for MOSS-TTS-Nano: a wrapped ``GPT2Config`` plus audio/token settings.

    Args:
        gpt2_config: Backbone configuration. A ``dict`` is expanded into
            ``GPT2Config(**gpt2_config)``, ``None`` produces a default
            ``GPT2Config()``, and any other object is stored as given.
        n_vq: Number of audio codebooks. Must be a positive integer.
        audio_vocab_size: Size used for every codebook when
            ``audio_codebook_sizes`` is not given; otherwise it must be at
            least ``max(audio_codebook_sizes)``. May be ``None`` only when
            ``audio_codebook_sizes`` is provided.
        audio_codebook_sizes: Optional per-codebook sizes; its length must equal
            ``n_vq`` and every entry must be positive.
        audio_pad_token_id: Audio padding id; must be
            ``>= max(audio_codebook_sizes)`` so it lies outside every codebook.
        pad_token_id: Stored on the config and forwarded to ``PretrainedConfig``.
        im_start_token_id, im_end_token_id, audio_start_token_id,
        audio_end_token_id, audio_user_slot_token_id,
        audio_assistant_slot_token_id: Special token ids, stored unchanged.
        tokenizer_use_fast, audio_tokenizer_type,
        audio_tokenizer_pretrained_name_or_path, audio_tokenizer_sample_rate,
        attn_implementation, initializer_range, model_architecture,
        local_transformer_layers: Stored unchanged.
        local_transformer_attn_implementation: Falls back to
            ``attn_implementation`` when ``None``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``n_vq`` is not positive, or if the codebook sizes,
            ``audio_vocab_size`` and ``audio_pad_token_id`` are inconsistent.
    """

    model_type = "moss_tts_nano"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        gpt2_config: Optional[Union[GPT2Config, Dict[str, Any]]] = None,
        n_vq: int = 8,
        audio_vocab_size: Optional[int] = 1024,
        audio_codebook_sizes: Optional[list[int]] = None,
        audio_pad_token_id: int = 1024,
        pad_token_id: int = 151643,
        im_start_token_id: int = 151644,
        im_end_token_id: int = 151645,
        audio_start_token_id: int = 151652,
        audio_end_token_id: int = 151653,
        audio_user_slot_token_id: int = 151654,
        audio_assistant_slot_token_id: int = 151656,
        tokenizer_use_fast: bool = False,
        audio_tokenizer_type: str = "moss-audio-tokenizer-nano",
        audio_tokenizer_pretrained_name_or_path: Optional[str] = "OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano",
        audio_tokenizer_sample_rate: int = 48000,
        attn_implementation: str = "flash_attention_2",
        initializer_range: float = 0.02,
        model_architecture: str = "global_local_transformer",
        local_transformer_layers: int = 4,
        local_transformer_attn_implementation: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        # Normalize the backbone config: dict -> GPT2Config, None -> defaults.
        if isinstance(gpt2_config, dict):
            self.gpt2_config = GPT2Config(**gpt2_config)
        elif gpt2_config is None:
            self.gpt2_config = GPT2Config()
        else:
            self.gpt2_config = gpt2_config

        self.n_vq = int(n_vq)
        # A non-positive n_vq previously surfaced further down as an unrelated
        # ValueError (length mismatch or "max() arg is an empty sequence");
        # reject it here with a message that names the actual problem.
        if self.n_vq <= 0:
            raise ValueError(f"n_vq must be a positive integer, got {n_vq!r}.")

        if audio_codebook_sizes is None:
            if audio_vocab_size is None:
                raise ValueError("audio_vocab_size must be set when audio_codebook_sizes is not provided.")
            # Every codebook shares the same size when no explicit list is given.
            resolved_audio_codebook_sizes = [int(audio_vocab_size)] * self.n_vq
        else:
            resolved_audio_codebook_sizes = [int(codebook_size) for codebook_size in audio_codebook_sizes]
        if len(resolved_audio_codebook_sizes) != self.n_vq:
            raise ValueError(
                "audio_codebook_sizes must have length n_vq "
                f"(expected {self.n_vq}, got {len(resolved_audio_codebook_sizes)})."
            )
        if any(codebook_size <= 0 for codebook_size in resolved_audio_codebook_sizes):
            raise ValueError("audio_codebook_sizes must contain positive integers.")

        max_audio_codebook_size = max(resolved_audio_codebook_sizes)
        if audio_vocab_size is not None and int(audio_vocab_size) < max_audio_codebook_size:
            raise ValueError(
                "audio_vocab_size must be >= max(audio_codebook_sizes) "
                f"(got {audio_vocab_size}, expected at least {max_audio_codebook_size})."
            )

        self.audio_codebook_sizes = resolved_audio_codebook_sizes
        # Without an explicit vocab size, use the largest codebook size.
        self.audio_vocab_size = (
            max_audio_codebook_size if audio_vocab_size is None else int(audio_vocab_size)
        )
        self.audio_pad_token_id = int(audio_pad_token_id)
        if self.audio_pad_token_id < max_audio_codebook_size:
            raise ValueError(
                "audio_pad_token_id must be >= max(audio_codebook_sizes) so pad stays outside every codebook "
                f"(got {self.audio_pad_token_id}, max codebook size {max_audio_codebook_size})."
            )
        self.pad_token_id = pad_token_id
        self.im_start_token_id = im_start_token_id
        self.im_end_token_id = im_end_token_id
        self.audio_start_token_id = audio_start_token_id
        self.audio_end_token_id = audio_end_token_id
        self.audio_user_slot_token_id = audio_user_slot_token_id
        self.audio_assistant_slot_token_id = audio_assistant_slot_token_id
        self.tokenizer_use_fast = tokenizer_use_fast
        self.audio_tokenizer_type = audio_tokenizer_type
        self.audio_tokenizer_pretrained_name_or_path = audio_tokenizer_pretrained_name_or_path
        self.audio_tokenizer_sample_rate = audio_tokenizer_sample_rate
        self.attn_implementation = attn_implementation
        self.initializer_range = initializer_range
        self.model_architecture = model_architecture
        self.local_transformer_layers = local_transformer_layers
        # The local transformer inherits the global attention implementation
        # unless one is given explicitly.
        self.local_transformer_attn_implementation = (
            attn_implementation
            if local_transformer_attn_implementation is None
            else local_transformer_attn_implementation
        )
        # Mirror the backbone's dimensions at the top level of this config.
        self.vocab_size = self.gpt2_config.vocab_size
        self.hidden_size = self.gpt2_config.hidden_size
        self.max_position_embeddings = self.gpt2_config.n_positions

        super().__init__(pad_token_id=pad_token_id, **kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """Return the config as a dict, with ``gpt2_config`` serialized via its own ``to_dict()``."""
        output = super().to_dict()
        output["gpt2_config"] = self.gpt2_config.to_dict()
        return output
106
+
107
+
108
+ __all__ = ["MossTTSNanoConfig"]
models/MOSS-TTS-Nano-100M/gpt2_decoder.py ADDED
@@ -0,0 +1,618 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.utils.checkpoint
10
+ from transformers.activations import ACT2FN
11
+ from transformers.modeling_outputs import BaseModelOutputWithPast
12
+ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
13
+
14
+ try:
15
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
16
+ from flash_attn.bert_padding import pad_input, unpad_input
17
+
18
+ _FLASH_ATTN_AVAILABLE = True
19
+ except Exception:
20
+ flash_attn_func = None
21
+ flash_attn_varlen_func = None
22
+ pad_input = None
23
+ unpad_input = None
24
+ _FLASH_ATTN_AVAILABLE = False
25
+
26
+
27
@dataclass
class PackedSequenceMetadata:
    """Bookkeeping record that travels alongside a packed batch of sequences.

    NOTE(review): the field meanings below are inferred from the names and the
    flash-attn imports in this module; confirm against the code that builds
    and consumes this record.
    """

    # Required fields.
    cu_seqlens: torch.Tensor  # presumably cumulative sequence lengths - TODO confirm
    max_seqlen: int  # presumably the longest sequence length in the pack - TODO confirm

    # Optional fields; all default to None.
    indices: Optional[torch.Tensor] = None  # presumably unpad indices - TODO confirm
    batch_size: Optional[int] = None
    seq_len: Optional[int] = None
34
+
35
+
36
class MossTTSNanoGPT2RotaryEmbedding(nn.Module):
    """Produces the cos/sin tables used for rotary position embeddings (RoPE).

    Args:
        dim: Per-head dimension; must be even.
        base: Base of the geometric progression of inverse frequencies.

    Raises:
        ValueError: If ``dim`` is odd.
    """

    def __init__(self, dim: int, base: float = 10000.0) -> None:
        super().__init__()
        if dim % 2 != 0:
            raise ValueError(f"RoPE head_dim must be even, got {dim}")
        # Kept so the inverse frequencies can be rebuilt in float32 if the
        # registered buffer is later downcast by ``module.to(half/bfloat16)``.
        self._dim = int(dim)
        self._base = float(base)
        self.register_buffer("inv_freq", self._compute_inv_freq(), persistent=False)

    def _compute_inv_freq(self, device: Optional[torch.device] = None) -> torch.Tensor:
        """Return the float32 inverse frequencies ``1 / base ** (2i / dim)``."""
        exponents = torch.arange(0, self._dim, 2, dtype=torch.float32, device=device) / self._dim
        return 1.0 / (self._base ** exponents)

    def forward(
        self,
        position_ids: torch.LongTensor,
        *,
        device: torch.device,
        dtype: torch.dtype,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Compute ``(cos, sin)`` for the given positions.

        Args:
            position_ids: Positions of shape ``(seq,)`` or ``(batch, seq)``;
                a 1-D input gets a leading batch axis of size 1.
            device: Device on which the tables are computed.
            dtype: Dtype of the returned tensors.

        Returns:
            ``cos`` and ``sin``, each of shape ``(batch, seq, 1, dim)``, with
            every frequency repeated twice along the last axis.
        """
        if position_ids.ndim == 1:
            position_ids = position_ids.unsqueeze(0)

        inv_freq = self.inv_freq
        if inv_freq.dtype in (torch.float16, torch.bfloat16):
            # The buffer was downcast together with the module. Computing angles
            # in half precision rounds both the frequencies and the integer
            # positions (bfloat16 cannot represent integers above 256 exactly),
            # so rebuild the frequencies and do the math in float32 instead.
            inv_freq = self._compute_inv_freq(device=device)
        else:
            inv_freq = inv_freq.to(device=device)

        positions = position_ids.to(device=device, dtype=inv_freq.dtype)
        freqs = torch.einsum("bs,d->bsd", positions, inv_freq)
        # Only the final tables are cast to the requested dtype.
        cos = freqs.cos().repeat_interleave(2, dim=-1).unsqueeze(2).to(dtype=dtype)
        sin = freqs.sin().repeat_interleave(2, dim=-1).unsqueeze(2).to(dtype=dtype)
        return cos, sin
57
+
58
+
59
def rotate_half(hidden_states: torch.Tensor) -> torch.Tensor:
    """Rotate adjacent pairs along the last axis: ``(a, b) -> (-b, a)``.

    The last axis is read as interleaved pairs ``[x0, x1, x2, x3, ...]`` and
    the result is ``[-x1, x0, -x3, x2, ...]`` with the input's shape.
    """
    first_of_pair = hidden_states[..., 0::2]
    second_of_pair = hidden_states[..., 1::2]
    swapped = torch.stack([second_of_pair.neg(), first_of_pair], dim=-1)
    return swapped.reshape(hidden_states.shape)
63
+
64
+
65
def apply_rotary_pos_emb(
    hidden_states: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
) -> torch.Tensor:
    """Apply a rotary embedding: ``x * cos + rotate_half(x) * sin``.

    ``cos`` and ``sin`` must be broadcastable against ``hidden_states``.
    """
    scaled_by_cos = hidden_states * cos
    rotated_by_sin = rotate_half(hidden_states) * sin
    return scaled_by_cos + rotated_by_sin
71
+
72
+
73
class MossTTSNanoGPT2MLP(nn.Module):
    """Position-wise feed-forward sub-layer: linear -> activation -> linear -> dropout."""

    def __init__(self, config: GPT2Config) -> None:
        """Build the projections from ``config``.

        The inner width is ``config.n_inner`` when it is truthy, otherwise four
        times ``config.hidden_size``.
        """
        super().__init__()
        model_width = int(config.hidden_size)
        expanded_width = int(config.n_inner or 4 * model_width)
        self.fc_in = nn.Linear(model_width, expanded_width)
        self.fc_out = nn.Linear(expanded_width, model_width)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return the feed-forward transform of ``hidden_states`` (same trailing width)."""
        expanded = self.act(self.fc_in(hidden_states))
        return self.dropout(self.fc_out(expanded))
88
+
89
+
90
class MossTTSNanoGPT2Attention(nn.Module):
    """Causal multi-head self-attention with eager, SDPA and FlashAttention-2 backends.

    Projected tensors are kept in ``(batch, seq, heads, head_dim)`` layout (or
    ``(tokens, heads, head_dim)`` for rank-2 inputs). The eager and SDPA paths
    transpose to ``(batch, heads, seq, head_dim)`` internally. Cached keys and
    values are stored untransposed and concatenated along dim 1.

    NOTE(review): only the eager backend applies ``scale_attn_weights`` and
    ``scale_attn_by_inverse_layer_idx``; the SDPA and FlashAttention calls use
    the library default scaling. Confirm that shipped configs keep the defaults.
    """

    def __init__(self, config: GPT2Config, layer_idx: int, attn_implementation: str) -> None:
        """
        Args:
            config: Model configuration (hidden size, head count, dropout rates, ...).
            layer_idx: Index of the owning block; used by inverse-layer scaling.
            attn_implementation: ``"flash_attention_2"``, ``"sdpa"``; any other
                value selects the eager implementation.

        Raises:
            ValueError: If the hidden size is not divisible by the head count, or
                the configured ``position_embedding_type`` is unsupported.
        """
        super().__init__()
        hidden_size = int(config.hidden_size)
        num_heads = int(config.num_attention_heads)
        if hidden_size % num_heads != 0:
            raise ValueError(f"hidden_size={hidden_size} must be divisible by num_attention_heads={num_heads}")

        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.embed_dim = hidden_size
        self.layer_idx = layer_idx
        self.attn_implementation = attn_implementation
        self.attn_dropout = float(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.scale_attn_weights = bool(getattr(config, "scale_attn_weights", True))
        self.scale_attn_by_inverse_layer_idx = bool(getattr(config, "scale_attn_by_inverse_layer_idx", False))
        self.position_embedding_type = str(getattr(config, "position_embedding_type", "absolute")).lower()
        if self.position_embedding_type not in {"absolute", "rope"}:
            raise ValueError(f"Unsupported position_embedding_type={self.position_embedding_type!r}")

        self.c_attn = nn.Linear(hidden_size, 3 * hidden_size)
        self.c_proj = nn.Linear(hidden_size, hidden_size)
        self.rotary_emb = None
        if self.position_embedding_type == "rope":
            self.rotary_emb = MossTTSNanoGPT2RotaryEmbedding(
                self.head_dim,
                base=float(getattr(config, "rope_base", 10000.0)),
            )

    def _split_heads(self, tensor: torch.Tensor) -> torch.Tensor:
        """View the trailing hidden dimension as ``(num_heads, head_dim)``.

        Raises:
            ValueError: If ``tensor`` is neither rank 2 nor rank 3.
        """
        if tensor.ndim == 3:
            batch_size, seq_len, _ = tensor.shape
            return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim)
        if tensor.ndim == 2:
            total_tokens, _ = tensor.shape
            return tensor.view(total_tokens, self.num_heads, self.head_dim)
        raise ValueError(f"Unsupported tensor rank for attention split: {tensor.ndim}")

    def _merge_heads(self, tensor: torch.Tensor) -> torch.Tensor:
        """Inverse of ``_split_heads``: fold ``(heads, head_dim)`` back into the hidden dim.

        Raises:
            ValueError: If ``tensor`` is neither rank 3 nor rank 4.
        """
        if tensor.ndim == 4:
            batch_size, seq_len, _, _ = tensor.shape
            return tensor.reshape(batch_size, seq_len, self.embed_dim)
        if tensor.ndim == 3:
            total_tokens, _, _ = tensor.shape
            return tensor.reshape(total_tokens, self.embed_dim)
        raise ValueError(f"Unsupported tensor rank for attention merge: {tensor.ndim}")

    def _causal_attention_mask(
        self,
        attention_mask: Optional[torch.Tensor],
        query_length: int,
        key_length: int,
        device: torch.device,
    ) -> torch.Tensor:
        """Build a boolean "may attend" mask, True where attention is allowed.

        Queries are aligned to the END of the key sequence, so with a KV cache
        the i-th new query sees every cached key plus new keys up to itself.

        Args:
            attention_mask: Optional per-key padding mask, indexed as
                ``(batch, key_length)``; truthy entries are real tokens.
            query_length: Number of query positions.
            key_length: Number of key positions (cache + new tokens).
            device: Device on which to build the mask.

        Returns:
            Mask of shape ``(1, 1, query_length, key_length)`` when no padding
            mask is given, otherwise ``(batch, 1, query_length, key_length)``.
        """
        query_positions = torch.arange(query_length, device=device, dtype=torch.long)
        query_positions = query_positions + max(key_length - query_length, 0)
        key_positions = torch.arange(key_length, device=device, dtype=torch.long)
        causal = key_positions.unsqueeze(0) <= query_positions.unsqueeze(1)
        causal = causal.unsqueeze(0).unsqueeze(0)
        if attention_mask is None:
            return causal
        key_mask = attention_mask[:, None, None, :].to(dtype=torch.bool)
        return causal & key_mask

    def _eager_attention(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Reference softmax attention; returns ``(batch, q_len, heads, head_dim)``."""
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        scale = 1.0
        if self.scale_attn_weights:
            scale /= self.head_dim ** 0.5
        if self.scale_attn_by_inverse_layer_idx:
            scale /= float(self.layer_idx + 1)

        scores = torch.matmul(query, key.transpose(-1, -2)) * scale
        causal_mask = self._causal_attention_mask(
            attention_mask=attention_mask,
            query_length=query.shape[-2],
            key_length=key.shape[-2],
            device=query.device,
        )
        # Use the dtype minimum rather than -inf so a fully masked row yields a
        # uniform distribution instead of NaN.
        scores = scores.masked_fill(~causal_mask, torch.finfo(scores.dtype).min)
        probs = torch.softmax(scores, dim=-1)
        if self.training and self.attn_dropout > 0:
            probs = torch.dropout(probs, self.attn_dropout, train=True)
        output = torch.matmul(probs, value)
        return output.transpose(1, 2).contiguous()

    def _sdpa_attention(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Attention via ``torch.nn.functional.scaled_dot_product_attention``.

        Returns a tensor of shape ``(batch, q_len, heads, head_dim)``. Outputs at
        padded query positions are zeroed.
        """
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)
        query_length = query.shape[-2]
        key_length = key.shape[-2]

        mask = None
        query_attention_mask = None
        has_padded_queries = False
        if attention_mask is not None:
            mask = self._causal_attention_mask(
                attention_mask=attention_mask,
                query_length=query_length,
                key_length=key_length,
                device=query.device,
            )
            query_attention_mask = attention_mask[:, -query_length:].to(dtype=torch.bool, device=query.device)
            # Evaluate once: this forces a host sync and is needed again after the kernel call.
            has_padded_queries = not bool(query_attention_mask.all())
            if has_padded_queries:
                # SDPA can produce NaNs when a query row is fully masked. For padded query positions,
                # keep a single aligned key visible, then zero the query output after attention.
                mask = mask.expand(query.shape[0], -1, -1, -1).clone()
                invalid_batch, invalid_query = torch.nonzero(~query_attention_mask, as_tuple=True)
                aligned_key = invalid_query + max(key_length - query_length, 0)
                mask[invalid_batch, :, invalid_query, aligned_key] = True
        elif query_length != key_length:
            # `is_causal=True` is aligned to the top-left corner for non-square shapes,
            # which would restrict a cached decode step to the first key(s). Build the
            # end-aligned mask explicitly, as the eager path does.
            mask = self._causal_attention_mask(
                attention_mask=None,
                query_length=query_length,
                key_length=key_length,
                device=query.device,
            )

        output = torch.nn.functional.scaled_dot_product_attention(
            query,
            key,
            value,
            attn_mask=mask,
            dropout_p=self.attn_dropout if self.training else 0.0,
            is_causal=mask is None,
        )
        if has_padded_queries:
            output = output.masked_fill(~query_attention_mask[:, None, :, None], 0.0)
        return output.transpose(1, 2).contiguous()

    def _flash_attention(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        packed_metadata: Optional[PackedSequenceMetadata],
    ) -> torch.Tensor:
        """Attention via the ``flash_attn`` package.

        Three cases are handled: packed sequences described by
        ``packed_metadata``; dense batches without padding; padded batches, which
        are unpadded, run through the varlen kernel and re-padded.

        Raises:
            ImportError: If ``flash_attn`` could not be imported.
            ValueError: If the inputs are not CUDA fp16/bf16 tensors.
        """
        if not _FLASH_ATTN_AVAILABLE:
            raise ImportError("flash_attn is not installed, but attn_implementation='flash_attention_2' was requested.")
        if query.device.type != "cuda":
            raise ValueError("flash_attention_2 requires CUDA tensors.")
        if query.dtype not in (torch.float16, torch.bfloat16):
            raise ValueError(
                f"flash_attention_2 requires fp16/bf16 tensors, but received dtype={query.dtype}."
            )

        dropout_p = self.attn_dropout if self.training else 0.0
        if packed_metadata is not None:
            if packed_metadata.indices is not None:
                # Gather only the real tokens out of the flattened (batch * seq) layout.
                query = query.reshape(-1, self.num_heads, self.head_dim).index_select(0, packed_metadata.indices)
                key = key.reshape(-1, self.num_heads, self.head_dim).index_select(0, packed_metadata.indices)
                value = value.reshape(-1, self.num_heads, self.head_dim).index_select(0, packed_metadata.indices)
            output = flash_attn_varlen_func(
                query,
                key,
                value,
                packed_metadata.cu_seqlens,
                packed_metadata.cu_seqlens,
                packed_metadata.max_seqlen,
                packed_metadata.max_seqlen,
                dropout_p=dropout_p,
                causal=True,
            )
            if packed_metadata.indices is None:
                return output
            return pad_input(
                output,
                packed_metadata.indices,
                packed_metadata.batch_size,
                packed_metadata.seq_len,
            )

        if attention_mask is None or bool(attention_mask.all()):
            return flash_attn_func(
                query,
                key,
                value,
                dropout_p=dropout_p,
                causal=True,
            )

        # flash-attn releases differ in how many values `unpad_input` returns (4 or 5);
        # star-unpacking keeps this working with both.
        unpadded_query, indices, cu_seqlens, max_seqlen, *_ = unpad_input(query, attention_mask)
        unpadded_key, *_ = unpad_input(key, attention_mask)
        unpadded_value, *_ = unpad_input(value, attention_mask)
        output = flash_attn_varlen_func(
            unpadded_query,
            unpadded_key,
            unpadded_value,
            cu_seqlens,
            cu_seqlens,
            max_seqlen,
            max_seqlen,
            dropout_p=dropout_p,
            causal=True,
        )
        return pad_input(output, indices, query.shape[0], query.shape[1])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        packed_metadata: Optional[PackedSequenceMetadata] = None,
        layer_past: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
        """Run self-attention over ``hidden_states``.

        Args:
            hidden_states: Input activations whose last dim is the hidden size.
            attention_mask: Optional padding mask over key positions.
            position_ids: Positions for RoPE; required when RoPE is enabled.
            packed_metadata: Packing description, used only by the flash path.
            layer_past: Cached ``(key, value)`` to prepend along dim 1.
            use_cache: Whether to return the updated ``(key, value)`` pair.

        Returns:
            ``(attn_output, present)`` where ``present`` is None unless
            ``use_cache`` is true.

        Raises:
            ValueError: If RoPE is enabled and ``position_ids`` is None.
        """
        qkv = self.c_attn(hidden_states)
        query, key, value = qkv.split(self.embed_dim, dim=-1)
        query = self._split_heads(query)
        key = self._split_heads(key)
        value = self._split_heads(value)

        if self.rotary_emb is not None:
            if position_ids is None:
                raise ValueError("position_ids must be provided when position_embedding_type='rope'.")
            cos, sin = self.rotary_emb(
                position_ids.to(device=query.device),
                device=query.device,
                dtype=query.dtype,
            )
            query = apply_rotary_pos_emb(query, cos, sin)
            key = apply_rotary_pos_emb(key, cos, sin)

        if layer_past is not None:
            # Cached keys are already rotated, so only the new keys go through RoPE above.
            past_key, past_value = layer_past
            key = torch.cat([past_key.to(device=key.device, dtype=key.dtype), key], dim=1)
            value = torch.cat([past_value.to(device=value.device, dtype=value.dtype), value], dim=1)

        present = (key, value) if use_cache else None

        # The flash path is only used without a cache; cached steps fall through
        # to SDPA or eager depending on the configured implementation.
        if self.attn_implementation == "flash_attention_2" and layer_past is None:
            attn_output = self._flash_attention(
                query=query,
                key=key,
                value=value,
                attention_mask=attention_mask,
                packed_metadata=packed_metadata,
            )
        elif self.attn_implementation == "sdpa":
            attn_output = self._sdpa_attention(
                query=query,
                key=key,
                value=value,
                attention_mask=attention_mask,
            )
        else:
            attn_output = self._eager_attention(
                query=query,
                key=key,
                value=value,
                attention_mask=attention_mask,
            )

        attn_output = self._merge_heads(attn_output)
        attn_output = self.c_proj(attn_output)
        return self.resid_dropout(attn_output), present
354
+
355
+
356
class MossTTSNanoGPT2Block(nn.Module):
    """Pre-norm transformer block: LayerNorm -> attention and LayerNorm -> MLP, each with a residual."""

    def __init__(self, config: GPT2Config, layer_idx: int, attn_implementation: str) -> None:
        """Create the two LayerNorms, the attention module and the MLP for layer ``layer_idx``."""
        super().__init__()
        width = int(config.hidden_size)
        norm_eps = config.layer_norm_epsilon
        self.ln_1 = nn.LayerNorm(width, eps=norm_eps)
        self.attn = MossTTSNanoGPT2Attention(config, layer_idx=layer_idx, attn_implementation=attn_implementation)
        self.ln_2 = nn.LayerNorm(width, eps=norm_eps)
        self.mlp = MossTTSNanoGPT2MLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        packed_metadata: Optional[PackedSequenceMetadata] = None,
        layer_past: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
        """Apply the block and return ``(hidden_states, present)``.

        Every argument except ``hidden_states`` is forwarded unchanged to the
        attention module; ``present`` is whatever that module returns as its cache.
        """
        normed_for_attention = self.ln_1(hidden_states)
        attention_delta, present = self.attn(
            normed_for_attention,
            attention_mask=attention_mask,
            position_ids=position_ids,
            packed_metadata=packed_metadata,
            layer_past=layer_past,
            use_cache=use_cache,
        )
        after_attention = hidden_states + attention_delta
        mlp_delta = self.mlp(self.ln_2(after_attention))
        return after_attention + mlp_delta, present
385
+
386
+
387
class MossTTSNanoGPT2Model(nn.Module):
    """GPT-2 style decoder stack that consumes pre-computed input embeddings.

    Supports absolute (learned) or rotary position embeddings, an optional
    key/value cache, and sequence packing described by ``cu_seqlens``.
    """

    def __init__(self, config: GPT2Config, attn_implementation: str = "eager") -> None:
        """
        Args:
            config: Model configuration.
            attn_implementation: Attention backend handed to every block.

        Raises:
            ValueError: If ``config.position_embedding_type`` is unsupported.
        """
        super().__init__()
        self.config = config
        self.attn_implementation = attn_implementation
        self.position_embedding_type = str(getattr(config, "position_embedding_type", "absolute")).lower()
        if self.position_embedding_type not in {"absolute", "rope"}:
            raise ValueError(f"Unsupported position_embedding_type={self.position_embedding_type!r}")
        hidden_size = int(config.hidden_size)
        self.wte = nn.Embedding(config.vocab_size, hidden_size)
        # With RoPE, positions are injected inside attention, so no learned table is needed.
        self.wpe = nn.Embedding(config.n_positions, hidden_size) if self.position_embedding_type == "absolute" else nn.Identity()
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList(
            [MossTTSNanoGPT2Block(config, layer_idx=index, attn_implementation=attn_implementation) for index in range(config.n_layer)]
        )
        self.ln_f = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.gradient_checkpointing = False
        self._reset_parameters()

    def _reset_parameters(self) -> None:
        """Initialise Linear/Embedding weights from N(0, initializer_range) and reset LayerNorms."""
        init_std = float(self.config.initializer_range)
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=init_std)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=init_std)
            elif isinstance(module, nn.LayerNorm):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)

    @staticmethod
    def _normalize_num_sequences(
        cu_seqlens: torch.Tensor,
        num_sequences: Optional[torch.Tensor],
        device: torch.device,
    ) -> torch.Tensor:
        """Return the number of packed sequences per batch row as a 1-D tensor.

        When ``num_sequences`` is None the count is inferred as the number of
        strictly increasing steps in each row of ``cu_seqlens``.
        """
        if cu_seqlens.ndim == 1:
            cu_seqlens = cu_seqlens.unsqueeze(0)
        if num_sequences is None:
            counts = []
            for boundary in cu_seqlens:
                diffs = boundary[1:] - boundary[:-1]
                counts.append(int((diffs > 0).sum().item()))
            return torch.tensor(counts, dtype=torch.int32, device=device)
        if num_sequences.ndim == 0:
            return num_sequences.unsqueeze(0)
        return num_sequences

    @staticmethod
    def build_packed_position_ids(
        attention_mask: Optional[torch.Tensor],
        cu_seqlens: torch.Tensor,
        num_sequences: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Build position ids that restart at 0 for every packed segment.

        The result has shape ``(batch, cu_seqlens.shape[-1] - 1)``; positions
        outside any segment, or masked out by ``attention_mask``, are 0.
        """
        if cu_seqlens.ndim == 1:
            cu_seqlens = cu_seqlens.unsqueeze(0)
        batch_size, seq_len = cu_seqlens.shape[0], cu_seqlens.shape[1] - 1
        device = cu_seqlens.device
        position_ids = torch.zeros((batch_size, seq_len), dtype=torch.long, device=device)
        counts = MossTTSNanoGPT2Model._normalize_num_sequences(cu_seqlens, num_sequences, device=device)
        for batch_index in range(batch_size):
            sequence_count = int(counts[batch_index].item())
            boundaries = cu_seqlens[batch_index, : sequence_count + 1].tolist()
            for start, end in zip(boundaries[:-1], boundaries[1:]):
                start = int(start)
                end = int(end)
                if end > start:
                    position_ids[batch_index, start:end] = torch.arange(end - start, device=device)
        if attention_mask is not None:
            position_ids = position_ids * attention_mask.to(dtype=position_ids.dtype)
        return position_ids

    @staticmethod
    def build_packed_metadata(
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        num_sequences: Optional[torch.Tensor],
    ) -> PackedSequenceMetadata:
        """Translate per-row segment boundaries into flat varlen-attention metadata.

        Raises:
            ValueError: If no non-empty segment is described.
        """
        if cu_seqlens.ndim == 1:
            cu_seqlens = cu_seqlens.unsqueeze(0)
        device = hidden_states.device
        counts = MossTTSNanoGPT2Model._normalize_num_sequences(cu_seqlens, num_sequences, device=device)
        flat_indices = []
        cumulative = [0]
        max_seqlen = 0
        seq_len = hidden_states.shape[1]

        for batch_index in range(hidden_states.shape[0]):
            sequence_count = int(counts[batch_index].item())
            boundaries = cu_seqlens[batch_index, : sequence_count + 1].tolist()
            for start, end in zip(boundaries[:-1], boundaries[1:]):
                start = int(start)
                end = int(end)
                if end <= start:
                    continue
                # Index into the flattened (batch * seq_len) token layout.
                segment_indices = batch_index * seq_len + torch.arange(start, end, device=device)
                flat_indices.append(segment_indices)
                cumulative.append(cumulative[-1] + (end - start))
                max_seqlen = max(max_seqlen, end - start)

        if not flat_indices:
            raise ValueError("cu_seqlens did not describe any non-empty packed sequences.")

        indices = torch.cat(flat_indices, dim=0)
        return PackedSequenceMetadata(
            cu_seqlens=torch.tensor(cumulative, dtype=torch.int32, device=device),
            max_seqlen=max_seqlen,
            indices=indices,
            batch_size=hidden_states.shape[0],
            seq_len=hidden_states.shape[1],
        )

    @staticmethod
    def _checkpointed_block_forward(block: nn.Module, packed_metadata: Optional[PackedSequenceMetadata]):
        """Return a callable that runs ``block`` without a cache, for activation checkpointing.

        The block is bound here, per layer. Checkpointing re-invokes the callable
        during backward, after the layer loop has finished; a closure over the
        loop variable would then recompute every layer with the LAST block.
        """

        def run_block(hidden_states, attention_mask, position_ids):
            output, _ = block(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                packed_metadata=packed_metadata,
                layer_past=None,
                use_cache=False,
            )
            return output

        return run_block

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: bool = True,
        cu_seqlens: Optional[torch.Tensor] = None,
        num_sequences: Optional[torch.Tensor] = None,
    ) -> BaseModelOutputWithPast:
        """Run the decoder stack over ``inputs_embeds``.

        Args:
            input_ids: Ignored; accepted for signature compatibility.
            past_key_values: Per-layer ``(key, value)`` cache, keys laid out with
                the sequence on dim 1.
            attention_mask: Padding mask covering cached plus new positions.
                When None, all cached and new positions are treated as valid.
            position_ids: Explicit positions; derived from the mask (or from
                ``cu_seqlens``) when None.
            inputs_embeds: Required input embeddings, ``(batch, seq, hidden)``.
            use_cache: Whether to return updated key/value caches.
            output_attentions: Ignored; attention maps are never returned.
            output_hidden_states: Whether to collect every layer's input plus the
                final normalised output.
            return_dict: Return a ``BaseModelOutputWithPast`` instead of a tuple.
            cu_seqlens: Packed-segment boundaries; enables sequence packing.
            num_sequences: Optional number of packed segments per batch row.

        Raises:
            ValueError: If ``inputs_embeds`` is missing, if caching is combined
                with packing, or with gradient checkpointing during training.
        """
        del input_ids, output_attentions

        if inputs_embeds is None:
            raise ValueError("inputs_embeds must be provided.")

        use_cache = bool(use_cache)
        if use_cache and cu_seqlens is not None:
            raise ValueError("use_cache=True is not supported together with cu_seqlens packing.")

        hidden_states = inputs_embeds
        batch_size, query_length = hidden_states.shape[0], hidden_states.shape[1]
        if attention_mask is None:
            # The default mask must span cached + new tokens. Otherwise derived positions
            # restart at 0 on every decode step and the mask no longer matches the key length.
            past_length = 0
            if past_key_values is not None and len(past_key_values) > 0 and past_key_values[0] is not None:
                past_length = past_key_values[0][0].shape[1]
            attention_mask = torch.ones(
                (batch_size, past_length + query_length),
                dtype=torch.bool,
                device=hidden_states.device,
            )
        else:
            attention_mask = attention_mask.to(dtype=torch.bool, device=hidden_states.device)
        # Mask restricted to the positions being computed in this call.
        query_attention_mask = attention_mask[:, -query_length:]

        packed_metadata = None
        if position_ids is None:
            if cu_seqlens is not None:
                position_ids = self.build_packed_position_ids(
                    attention_mask=attention_mask,
                    cu_seqlens=cu_seqlens.to(device=hidden_states.device),
                    num_sequences=num_sequences.to(device=hidden_states.device) if num_sequences is not None else None,
                )
            else:
                # Position = number of valid tokens seen so far; padded slots are pinned to 0.
                position_ids = attention_mask.long().cumsum(dim=-1) - 1
                position_ids = position_ids.masked_fill(~attention_mask, 0)
                position_ids = position_ids[:, -query_length:]

        if cu_seqlens is not None and self.attn_implementation == "flash_attention_2":
            packed_metadata = self.build_packed_metadata(
                hidden_states=hidden_states,
                cu_seqlens=cu_seqlens.to(device=hidden_states.device),
                num_sequences=num_sequences.to(device=hidden_states.device) if num_sequences is not None else None,
            )

        if self.position_embedding_type == "absolute":
            hidden_states = hidden_states + self.wpe(position_ids)
        hidden_states = self.drop(hidden_states)
        hidden_states = hidden_states * query_attention_mask.unsqueeze(-1).to(dtype=hidden_states.dtype)

        all_hidden_states = () if output_hidden_states else None
        presents = [] if use_cache else None
        for layer_index, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                if use_cache:
                    raise ValueError("use_cache=True is not supported when gradient checkpointing is enabled during training.")

                hidden_states = torch.utils.checkpoint.checkpoint(
                    self._checkpointed_block_forward(block, packed_metadata),
                    hidden_states,
                    attention_mask,
                    position_ids,
                    use_reentrant=False,
                )
                present = None
            else:
                hidden_states, present = block(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    packed_metadata=packed_metadata,
                    layer_past=None if past_key_values is None else past_key_values[layer_index],
                    use_cache=use_cache,
                )
            # Keep padded positions at exactly zero after every layer.
            hidden_states = hidden_states * query_attention_mask.unsqueeze(-1).to(dtype=hidden_states.dtype)
            if presents is not None:
                presents.append(present)

        hidden_states = self.ln_f(hidden_states)
        hidden_states = hidden_states * query_attention_mask.unsqueeze(-1).to(dtype=hidden_states.dtype)
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return (hidden_states, tuple(presents) if presents is not None else None, all_hidden_states, None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=tuple(presents) if presents is not None else None,
            hidden_states=all_hidden_states,
            attentions=None,
        )
models/MOSS-TTS-Nano-100M/languages.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Chinese
2
+ English
3
+ German
4
+ Spanish
5
+ French
6
+ Japanese
7
+ Italian
8
+ Hebrew
9
+ Korean
10
+ Russian
11
+ Persian
12
+ Arabic
13
+ Polish
14
+ Portuguese
15
+ Czech
16
+ Danish
17
+ Swedish
18
+ Hungarian
19
+ Greek
models/MOSS-TTS-Nano-100M/modeling_moss_tts_nano.py ADDED
The diff for this file is too large to render. See raw diff
 
models/MOSS-TTS-Nano-100M/prompting.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Sequence
4
+
5
+ from .configuration_moss_tts_nano import MossTTSNanoConfig
6
+
7
+
8
# Role header that opens the user turn.
USER_ROLE_PREFIX = "user\n"

# Start of the user instruction block, up to the point where the reference is inserted.
USER_TEMPLATE_REFERENCE_PREFIX = "<user_inst>\n- Reference(s):\n"

# Remaining template fields, all left as "None", ending where the text to speak is inserted.
USER_TEMPLATE_AFTER_REFERENCE = "".join(
    [
        "\n- Instruction:\nNone\n",
        "- Tokens:\nNone\n",
        "- Quality:\nNone\n",
        "- Sound Event:\nNone\n",
        "- Ambient Sound:\nNone\n",
        "- Language:\nNone\n",
        "- Text:\n",
    ]
)

# Full user-side prefix for the case where no reference is supplied.
USER_TEMPLATE_PREFIX = USER_TEMPLATE_REFERENCE_PREFIX + "None" + USER_TEMPLATE_AFTER_REFERENCE
USER_TEMPLATE_SUFFIX = "\n</user_inst>"

# Separator between the user turn and the assistant turn, then the assistant role header.
ASSISTANT_TURN_PREFIX = "\n"
ASSISTANT_ROLE_PREFIX = "assistant\n"
26
+
27
+
28
def encode_text(tokenizer, text: str) -> List[int]:
    """Encode ``text`` to token ids without special tokens.

    Tokenizers whose ``encode`` rejects ``add_special_tokens`` (signalled by a
    ``TypeError``) are retried with the text alone.
    """
    try:
        token_ids = list(tokenizer.encode(text, add_special_tokens=False))
    except TypeError:
        token_ids = list(tokenizer.encode(text))
    return token_ids
33
+
34
+
35
def decode_text(tokenizer, token_ids: Sequence[int]) -> str:
    """Decode ``token_ids`` to a string, keeping special tokens and raw spacing.

    The most specific ``decode`` signature is tried first; each ``TypeError``
    drops one keyword argument until the bare ``decode(ids)`` call is reached.
    """
    keyword_fallbacks = (
        {"skip_special_tokens": False, "clean_up_tokenization_spaces": False},
        {"skip_special_tokens": False},
    )
    for decode_kwargs in keyword_fallbacks:
        try:
            return str(tokenizer.decode(list(token_ids), **decode_kwargs))
        except TypeError:
            continue
    return str(tokenizer.decode(list(token_ids)))
49
+
50
+
51
def build_user_prompt_prefix(tokenizer, config: MossTTSNanoConfig) -> List[int]:
    """Token ids opening the user turn, up to where the reference is inserted.

    Layout: ``im_start`` token, the user role header, then the instruction
    block header ending at "- Reference(s):".
    """
    token_ids = [config.im_start_token_id]
    token_ids.extend(encode_text(tokenizer, USER_ROLE_PREFIX))
    token_ids.extend(encode_text(tokenizer, USER_TEMPLATE_REFERENCE_PREFIX))
    return token_ids
56
+
57
+
58
def build_user_prompt_after_reference(tokenizer) -> List[int]:
    """Token ids for the template fields that follow the reference, ending at "- Text:"."""
    after_reference_ids = encode_text(tokenizer, USER_TEMPLATE_AFTER_REFERENCE)
    return after_reference_ids
60
+
61
+
62
def build_assistant_prompt_prefix(tokenizer, config: MossTTSNanoConfig) -> List[int]:
    """Token ids that close the user turn and open the assistant turn.

    Layout: instruction-block suffix, ``im_end`` token, turn separator,
    ``im_start`` token, assistant role header.
    """
    token_ids = encode_text(tokenizer, USER_TEMPLATE_SUFFIX)
    token_ids = token_ids + [config.im_end_token_id]
    token_ids = token_ids + encode_text(tokenizer, ASSISTANT_TURN_PREFIX)
    token_ids = token_ids + [config.im_start_token_id]
    token_ids = token_ids + encode_text(tokenizer, ASSISTANT_ROLE_PREFIX)
    return token_ids
70
+
71
+
72
def build_prompt_prefix(tokenizer, config: MossTTSNanoConfig) -> List[int]:
    """Token ids for everything before the text to speak, with the reference set to "None"."""
    opening_ids = build_user_prompt_prefix(tokenizer, config)
    no_reference_ids = encode_text(tokenizer, "None")
    remaining_field_ids = build_user_prompt_after_reference(tokenizer)
    return opening_ids + no_reference_ids + remaining_field_ids
78
+
79
+
80
def build_prompt_suffix(tokenizer, config: MossTTSNanoConfig) -> List[int]:
    """Token ids that follow the text to speak; same as ``build_assistant_prompt_prefix``."""
    suffix_ids = build_assistant_prompt_prefix(tokenizer, config)
    return suffix_ids
82
+
83
+
84
def build_prompt_token_ids(
    tokenizer,
    config: MossTTSNanoConfig,
    text_token_ids: Sequence[int],
) -> List[int]:
    """Assemble the full prompt: prefix, the already-tokenized text, then suffix.

    Each entry of ``text_token_ids`` is coerced to ``int``.
    """
    prefix_ids = build_prompt_prefix(tokenizer, config)
    text_ids = list(map(int, text_token_ids))
    suffix_ids = build_prompt_suffix(tokenizer, config)
    return prefix_ids + text_ids + suffix_ids
models/MOSS-TTS-Nano-100M/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24003f2f11ac8a2cbf70514db2d8f1c02fb451aa6b3c0bffc9da09f31cd7caa5
3
+ size 234693095
models/MOSS-TTS-Nano-100M/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/wittin/MOSS-TTS-Nano-100M
models/MOSS-TTS-Nano-100M/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
models/MOSS-TTS-Nano-100M/tokenization_moss_tts_nano.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import shutil
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import sentencepiece as spm
8
+ from transformers import PreTrainedTokenizer
9
+
10
+
11
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
12
+
13
+
14
class MossTTSNanoSentencePieceTokenizer(PreTrainedTokenizer):
    """Slow tokenizer backed directly by a SentencePiece model file.

    No special tokens are inserted around sequences: ``build_inputs_with_special_tokens``
    simply concatenates its inputs, and the special-token mask is all zeros.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        unk_token: str = "<unk>",
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        pad_token: str = "<pad>",
        sp_model_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ) -> None:
        """
        Args:
            vocab_file: Path to the SentencePiece ``.model`` file.
            unk_token, bos_token, eos_token, pad_token: Special token strings.
            sp_model_kwargs: Extra keyword arguments for ``SentencePieceProcessor``.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        self.vocab_file = str(vocab_file)
        self.sp_model_kwargs = {} if sp_model_kwargs is None else dict(sp_model_kwargs)
        # The SentencePiece model is loaded before the base-class initialiser runs.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of pieces in the SentencePiece model (added tokens excluded)."""
        return int(self.sp_model.get_piece_size())

    def get_vocab(self) -> dict[str, int]:
        """Return the piece-to-id mapping, updated with any added tokens."""
        vocab = {self.sp_model.id_to_piece(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> list[str]:
        """Split ``text`` into SentencePiece pieces."""
        return list(self.sp_model.encode(text, out_type=str))

    def _convert_token_to_id(self, token: str) -> int:
        """Map a piece string to its id."""
        return int(self.sp_model.piece_to_id(token))

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id to its piece string."""
        return str(self.sp_model.id_to_piece(int(index)))

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Join a list of pieces back into text using the SentencePiece decoder."""
        return str(self.sp_model.decode(tokens))

    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
        """Write ``tokenizer.model`` into ``save_directory`` and return its path.

        The original model file is copied when it still exists. If it has been
        moved or deleted since loading, the in-memory model is serialized
        instead, so saving does not fail.
        """
        save_dir = Path(save_directory)
        save_dir.mkdir(parents=True, exist_ok=True)
        out_name = "tokenizer.model" if filename_prefix is None else f"{filename_prefix}-tokenizer.model"
        out_path = save_dir / out_name
        source_path = Path(self.vocab_file)
        if not source_path.is_file():
            # Source file is gone: fall back to the model held in memory.
            out_path.write_bytes(self.sp_model.serialized_model_proto())
        elif source_path.resolve() != out_path.resolve():
            # Skip the copy when saving over the file we loaded from.
            shutil.copyfile(self.vocab_file, out_path)
        return (str(out_path),)

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: list[int],
        token_ids_1: list[int] | None = None,
    ) -> list[int]:
        """Concatenate the sequence(s) without adding any special tokens."""
        if token_ids_1 is None:
            return list(token_ids_0)
        return list(token_ids_0) + list(token_ids_1)

    def get_special_tokens_mask(
        self,
        token_ids_0: list[int],
        token_ids_1: list[int] | None = None,
        already_has_special_tokens: bool = False,
    ) -> list[int]:
        """Return a 0/1 mask marking special tokens.

        Because this tokenizer adds none, the mask is all zeros unless the caller
        states the ids already contain special tokens, in which case the base
        class decides.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * (len(token_ids_0) + len(token_ids_1))

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: list[int],
        token_ids_1: list[int] | None = None,
    ) -> list[int]:
        """Return all-zero token type ids covering both sequences."""
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * (len(token_ids_0) + len(token_ids_1))
104
+
105
+
106
+ __all__ = ["MossTTSNanoSentencePieceTokenizer"]
models/MOSS-TTS-Nano-100M/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c353ee1479b536bf414c1b247f5542b6607fb8ae91320e5af1781fee200fddff
3
+ size 470897
models/MOSS-TTS-Nano-100M/tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<pad>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "additional_special_tokens": [],
37
+ "auto_map": {
38
+ "AutoTokenizer": [
39
+ "tokenization_moss_tts_nano.MossTTSNanoSentencePieceTokenizer",
40
+ null
41
+ ]
42
+ },
43
+ "backend": "custom",
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "eos_token": "</s>",
47
+ "extra_special_tokens": {},
48
+ "model_max_length": 16384,
49
+ "pad_token": "<pad>",
50
+ "tokenizer_class": "MossTTSNanoSentencePieceTokenizer",
51
+ "unk_token": "<unk>"
52
+ }