square-zero-labs Xenova HF Staff commited on
Commit
cb23798
·
0 Parent(s):

Duplicate from onnx-community/Supertonic-TTS-ONNX

Browse files

Co-authored-by: Joshua <Xenova@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ onnx/latent_denoiser.onnx_data filter=lfs diff=lfs merge=lfs -text
37
+ onnx/text_encoder.onnx_data filter=lfs diff=lfs merge=lfs -text
38
+ onnx/voice_decoder.onnx_data filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: openrail
3
+ base_model:
4
+ - Supertone/supertonic
5
+ library_name: transformers.js
6
+ language:
7
+ - en
8
+ pipeline_tag: text-to-speech
9
+ ---
10
+
11
+ ## Usage
12
+
13
+ ### Transformers.js
14
+
15
+ If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
16
+ ```bash
17
+ npm i @huggingface/transformers
18
+ ```
19
+
20
+ You can then generate audio as follows:
21
+ ```js
22
+ import { pipeline } from '@huggingface/transformers';
23
+
24
+ const tts = await pipeline('text-to-speech', 'onnx-community/Supertonic-TTS-ONNX');
25
+
26
+ const input_text = 'This is really cool!';
27
+ const audio = await tts(input_text, {
28
+ speaker_embeddings: 'https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F1.bin',
29
+ num_inference_steps: 5, // Higher = better quality (typically 1-50)
30
+ speed: 1.05, // Higher = faster speech (typically 0.8-1.2)
31
+ });
32
+ await audio.save('output.wav'); // or `audio.toBlob()`;
33
+ ```
34
+
35
+ ### ONNXRuntime
36
+
37
+ First, let's create a helper class, `SupertonicTTS`:
38
+
39
+ ```py
40
+ import os
41
+ import numpy as np
42
+ import onnxruntime as ort
43
+ from transformers import AutoTokenizer
44
+
45
class SupertonicTTS:
    """ONNX inference wrapper for the Supertonic text-to-speech model.

    Pipeline: tokenize -> text encoder (hidden states + durations) ->
    iterative latent denoising -> voice decoder -> per-item padding trim.
    """

    SAMPLE_RATE = 44100
    CHUNK_COMPRESS_FACTOR = 6
    BASE_CHUNK_SIZE = 512
    LATENT_DIM = 24
    STYLE_DIM = 128
    # Number of audio samples covered by one latent frame.
    LATENT_SIZE = BASE_CHUNK_SIZE * CHUNK_COMPRESS_FACTOR

    def __init__(self, model_path):
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

        # One ONNX session per pipeline stage.
        onnx_dir = os.path.join(self.model_path, "onnx")
        self.text_encoder = ort.InferenceSession(os.path.join(onnx_dir, "text_encoder.onnx"))
        self.latent_denoiser = ort.InferenceSession(os.path.join(onnx_dir, "latent_denoiser.onnx"))
        self.voice_decoder = ort.InferenceSession(os.path.join(onnx_dir, "voice_decoder.onnx"))

    def _load_style(self, voice: str) -> np.ndarray:
        """Read the speaker-style embedding stored at ``voices/<voice>.bin``.

        Returns a float32 array reshaped to (1, -1, STYLE_DIM).
        Raises ValueError when no such voice file exists.
        """
        path = os.path.join(self.model_path, "voices", f"{voice}.bin")
        if not os.path.exists(path):
            raise ValueError(f"Voice '{voice}' not found.")

        flat = np.fromfile(path, dtype=np.float32)
        return flat.reshape(1, -1, self.STYLE_DIM)

    def generate(self, text: list[str], *, voice: str = "M1", speed: float = 1.0, steps: int = 5) -> list[np.ndarray]:
        """Synthesize speech for a batch of prompts.

        Args:
            text: list of prompt strings (processed as one padded batch).
            voice: name of a speaker embedding under ``voices/``.
            speed: playback-rate factor; higher values shorten the output.
            steps: number of denoising iterations.

        Returns:
            One waveform array per input prompt, padding removed.
        """
        # Tokenize the whole batch to a common padded length.
        encoded = self.tokenizer(text, return_tensors="np", padding=True, truncation=True)
        input_ids = encoded["input_ids"]
        attn_mask = encoded["attention_mask"]
        n = input_ids.shape[0]

        # Broadcast the chosen speaker style across the batch.
        style = self._load_style(voice).repeat(n, axis=0)

        # Encode text; the encoder also predicts per-utterance durations
        # (presumably in seconds — scaled to samples below; confirm with model docs).
        last_hidden_state, raw_durations = self.text_encoder.run(
            None,
            {"input_ids": input_ids, "attention_mask": attn_mask, "style": style},
        )
        durations = (raw_durations / speed * self.SAMPLE_RATE).astype(np.int64)

        # Ceil-divide sample counts into latent frames, then build masked
        # Gaussian noise as the starting latents (padding frames zeroed).
        latent_lengths = -(-durations // self.LATENT_SIZE)
        max_len = latent_lengths.max()
        latent_mask = (np.arange(max_len) < latent_lengths[:, None]).astype(np.int64)
        noise_channels = self.LATENT_DIM * self.CHUNK_COMPRESS_FACTOR
        latents = np.random.randn(n, noise_channels, max_len).astype(np.float32)
        latents *= latent_mask[:, None, :]

        # Iterative denoising: each pass refines the latents in place.
        total_steps = np.full(n, steps, dtype=np.float32)
        for t in range(steps):
            feeds = {
                "noisy_latents": latents,
                "latent_mask": latent_mask,
                "style": style,
                "encoder_outputs": last_hidden_state,
                "attention_mask": attn_mask,
                "timestep": np.full(n, t, dtype=np.float32),
                "num_inference_steps": total_steps,
            }
            latents = self.latent_denoiser.run(None, feeds)[0]

        # Decode latents into audio, then trim each item back to its true length.
        waveforms = self.voice_decoder.run(None, {"latents": latents})[0]
        sample_counts = latent_mask.sum(axis=1) * self.LATENT_SIZE
        return [waveforms[i, :count] for i, count in enumerate(sample_counts)]
120
+ ```
121
+
122
+ Next, clone this repository using whichever method you prefer (`git clone`, `huggingface_hub`, etc.)
123
+ ```py
124
+ # (Optional) Download model files (or use existing local directory)
125
+ from huggingface_hub import snapshot_download
126
+ model_id = "onnx-community/Supertonic-TTS-ONNX"
127
+ local_dir = "supertonic"
128
+ snapshot_download(model_id, local_dir=local_dir)
129
+ ```
130
+
131
+ We can then use the model as follows:
132
+
133
+ ```py
134
+ # Initialize TTS
135
+ tts = SupertonicTTS(local_dir)
136
+
137
+ # Generate audio
138
+ prompts = [
139
+ "Once upon a time, there was a brave knight.",
140
+ "Refactoring code makes it much easier to read!",
141
+ "I love this!"
142
+ ]
143
+ audio_data = tts.generate(prompts, voice="M1", speed=1.0, steps=10)
144
+
145
+ # (Optional) Save to files
146
+ import soundfile as sf
147
+ for i, audio in enumerate(audio_data):
148
+ filename = f"output_{i}.wav"
149
+ sf.write(filename, audio, tts.SAMPLE_RATE)
150
+ print(f"Saved {filename}")
151
+ ```
config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_chunk_size": 512,
3
+ "chunk_compress_factor": 6,
4
+ "latent_dim": 24,
5
+ "model_type": "supertonic",
6
+ "sampling_rate": 44100,
7
+ "style_dim": 128,
8
+ "transformers.js_config": {
9
+ "dtype": "fp32",
10
+ "use_external_data_format": true
11
+ }
12
+ }
onnx/latent_denoiser.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a639a8c05c9be111848562c5cf10ea2697a589c6341830aac479d0ce7b75aa9
3
+ size 398102
onnx/latent_denoiser.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde4abf1136defce235bc446eaab4954a57721ae8d5a4754cdd337bf191b612f
3
+ size 132098880
onnx/text_encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50a03d29d5dc95918eeff578f542b814f3cf5a741f927116f5a8462a76ff6898
3
+ size 433169
onnx/text_encoder.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6415854f135a318909dc716e90f83a391d9a91bd9da09bdb6d6763d6b0a6c102
3
+ size 28426752
onnx/voice_decoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83c104006dabcd6b568c0d5acb6fec18f65609d2391dd2c459e4440e85027669
3
+ size 59921
onnx/voice_decoder.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea52402c9ba5131ee2b3901a86db2f0b435b322169cd75157e053493d967d17f
3
+ size 101353472
tokenizer.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [],
6
+ "normalizer": {
7
+ "type": "Sequence",
8
+ "normalizers": [
9
+ {
10
+ "type": "NFKD"
11
+ },
12
+ {
13
+ "type": "Replace",
14
+ "pattern": {
15
+ "Regex": "\\s+"
16
+ },
17
+ "content": " "
18
+ },
19
+ {
20
+ "type": "Replace",
21
+ "pattern": {
22
+ "Regex": "[\u2013\u2014]"
23
+ },
24
+ "content": "-"
25
+ },
26
+ {
27
+ "type": "Replace",
28
+ "pattern": {
29
+ "Regex": "[^ -\"$-.0-;?A-Za-z£́]"
30
+ },
31
+ "content": ""
32
+ }
33
+ ]
34
+ },
35
+ "pre_tokenizer": {
36
+ "type": "FixedLength",
37
+ "length": 1
38
+ },
39
+ "post_processor": null,
40
+ "decoder": {
41
+ "type": "Fuse"
42
+ },
43
+ "model": {
44
+ "type": "WordLevel",
45
+ "vocab": {
46
+ " ": 0,
47
+ "!": 1,
48
+ "\"": 2,
49
+ "$": 3,
50
+ "%": 4,
51
+ "&": 5,
52
+ "'": 6,
53
+ "(": 7,
54
+ ")": 8,
55
+ "*": 9,
56
+ "+": 10,
57
+ ",": 11,
58
+ "-": 12,
59
+ ".": 13,
60
+ "0": 14,
61
+ "1": 15,
62
+ "2": 16,
63
+ "3": 17,
64
+ "4": 18,
65
+ "5": 19,
66
+ "6": 20,
67
+ "7": 21,
68
+ "8": 22,
69
+ "9": 23,
70
+ ":": 24,
71
+ ";": 25,
72
+ "?": 26,
73
+ "A": 27,
74
+ "B": 28,
75
+ "C": 29,
76
+ "D": 30,
77
+ "E": 31,
78
+ "F": 32,
79
+ "G": 33,
80
+ "H": 34,
81
+ "I": 35,
82
+ "J": 36,
83
+ "K": 37,
84
+ "L": 38,
85
+ "M": 39,
86
+ "N": 40,
87
+ "O": 41,
88
+ "P": 42,
89
+ "Q": 43,
90
+ "R": 44,
91
+ "S": 45,
92
+ "T": 46,
93
+ "U": 47,
94
+ "V": 48,
95
+ "W": 49,
96
+ "X": 50,
97
+ "Y": 51,
98
+ "Z": 52,
99
+ "a": 53,
100
+ "b": 54,
101
+ "c": 55,
102
+ "d": 56,
103
+ "e": 57,
104
+ "f": 58,
105
+ "g": 59,
106
+ "h": 60,
107
+ "i": 61,
108
+ "j": 62,
109
+ "k": 63,
110
+ "l": 64,
111
+ "m": 65,
112
+ "n": 66,
113
+ "o": 67,
114
+ "p": 68,
115
+ "q": 69,
116
+ "r": 70,
117
+ "s": 71,
118
+ "t": 72,
119
+ "u": 73,
120
+ "v": 74,
121
+ "w": 75,
122
+ "x": 76,
123
+ "y": 77,
124
+ "z": 78,
125
+ "£": 79,
126
+ "\u0301": 80
127
+ },
128
+ "unk_token": "\u0301"
129
+ }
130
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "PreTrainedTokenizerFast",
3
+ "model_max_length": 1000,
4
+ "pad_token": " ",
5
+ "pad_token_id": 0
6
+ }
voices/F1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ef84e3421e4f80994a5a40a18ba39ba9fc48175c41ae6cf3e56418820872dbf
3
+ size 51712
voices/F2.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1949cf0e066c4278980d2b835cf334dab0f8f781704c9116bf48a072278f7c72
3
+ size 51712
voices/F3.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38ee1d62ad8a02877ab0d08b501742b76cf3586ed888514df1a7f27cc0f8d171
3
+ size 51712
voices/F4.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63890c361868a296c51f9aee114f51e0a9a92c3f46a91582539545f7ab408a72
3
+ size 51712
voices/F5.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:793223d8d11e0ee49721842ebdc7bd46b4487579588f646953e75ad3fc8ffb9c
3
+ size 51712
voices/M1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d53fbaaccf39a358010dcc5f289fc1d5cb350fe5f518be35f62cc518d794892
3
+ size 51712
voices/M2.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e02979a394f89002d920f0bcc006206d4cd8da90e8cc82d0532831a5bb20e79
3
+ size 51712
voices/M3.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:470d2b6b77239628ce90ba879ca5366fb5e6103fdd7e7053954a7b6d5dc2142a
3
+ size 51712
voices/M4.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4700e92c614fd34971a8ed9c8140c2f2162ab8ef3067f8e1e7ef67c3e6488fb7
3
+ size 51712
voices/M5.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40fbc4093d113ef261cbc7bfe3f080dd813d3168347d682c78b1ca71a07da1f
3
+ size 51712