Lamapi committed on
Commit
bb23613
·
verified ·
1 Parent(s): c43ad90

Upload 6 files

Browse files
onnx/duration_predictor.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3eb91414d5ff8a7a239b7fe9e34e7e2bf8a8140d8375ffb14718b1c639325db
3
+ size 3700147
onnx/text_encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7befd5ea8c3119769e8a6c1486c4edc6a3bc8365c67621c881bbb774b9902ff
3
+ size 36416150
onnx/tts.json ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tts_version": "v1.7.3",
3
+ "split": "opensource-multilingual",
4
+ "ttl": {
5
+ "latent_dim": 24,
6
+ "chunk_compress_factor": 6,
7
+ "batch_expander": {
8
+ "n_batch_expand": 6
9
+ },
10
+ "normalizer": {
11
+ "scale": 0.25
12
+ },
13
+ "text_encoder": {
14
+ "n_langs": 0,
15
+ "lang_emb_dim": 0,
16
+ "text_embedder": {
17
+ "char_emb_dim": 256
18
+ },
19
+ "convnext": {
20
+ "idim": 256,
21
+ "ksz": 5,
22
+ "intermediate_dim": 1024,
23
+ "num_layers": 6,
24
+ "dilation_lst": [
25
+ 1,
26
+ 1,
27
+ 2,
28
+ 2,
29
+ 4,
30
+ 4
31
+ ]
32
+ },
33
+ "attn_encoder": {
34
+ "hidden_channels": 256,
35
+ "filter_channels": 1024,
36
+ "n_heads": 4,
37
+ "n_layers": 4,
38
+ "p_dropout": 0.0
39
+ },
40
+ "proj_out": {
41
+ "idim": 256,
42
+ "odim": 256
43
+ }
44
+ },
45
+ "flow_matching": {
46
+ "sig_min": 1e-08
47
+ },
48
+ "style_encoder": {
49
+ "proj_in": {
50
+ "ldim": 24,
51
+ "chunk_compress_factor": 6,
52
+ "odim": 256
53
+ },
54
+ "convnext": {
55
+ "idim": 256,
56
+ "ksz": 5,
57
+ "intermediate_dim": 1024,
58
+ "num_layers": 6,
59
+ "dilation_lst": [
60
+ 1,
61
+ 1,
62
+ 1,
63
+ 1,
64
+ 1,
65
+ 1
66
+ ]
67
+ },
68
+ "style_token_layer": {
69
+ "input_dim": 256,
70
+ "n_style": 50,
71
+ "style_key_dim": 256,
72
+ "style_value_dim": 256,
73
+ "prototype_dim": 256,
74
+ "n_units": 256,
75
+ "n_heads": 2
76
+ }
77
+ },
78
+ "speech_prompted_text_encoder": {
79
+ "text_dim": 256,
80
+ "style_dim": 256,
81
+ "n_units": 256,
82
+ "n_heads": 2
83
+ },
84
+ "uncond_masker": {
85
+ "prob_both_uncond": 0.04,
86
+ "prob_text_uncond": 0.01,
87
+ "std": 0.1,
88
+ "text_dim": 256,
89
+ "n_style": 50,
90
+ "style_key_dim": 256,
91
+ "style_value_dim": 256
92
+ },
93
+ "vector_field": {
94
+ "n_langs": 0,
95
+ "lang_emb_dim": 0,
96
+ "proj_in": {
97
+ "ldim": 24,
98
+ "chunk_compress_factor": 6,
99
+ "odim": 512
100
+ },
101
+ "time_encoder": {
102
+ "time_dim": 64,
103
+ "hdim": 256
104
+ },
105
+ "main_blocks": {
106
+ "n_blocks": 4,
107
+ "time_cond_layer": {
108
+ "idim": 512,
109
+ "time_dim": 64
110
+ },
111
+ "style_cond_layer": {
112
+ "idim": 512,
113
+ "style_dim": 256
114
+ },
115
+ "text_cond_layer": {
116
+ "idim": 512,
117
+ "text_dim": 256,
118
+ "n_heads": 8,
119
+ "n_units": 512,
120
+ "use_residual": true,
121
+ "rotary_base": 10000,
122
+ "rotary_scale": 10
123
+ },
124
+ "convnext_0": {
125
+ "idim": 512,
126
+ "ksz": 5,
127
+ "intermediate_dim": 2048,
128
+ "num_layers": 4,
129
+ "dilation_lst": [
130
+ 1,
131
+ 2,
132
+ 4,
133
+ 8
134
+ ]
135
+ },
136
+ "convnext_1": {
137
+ "idim": 512,
138
+ "ksz": 5,
139
+ "intermediate_dim": 2048,
140
+ "num_layers": 1,
141
+ "dilation_lst": [
142
+ 1
143
+ ]
144
+ },
145
+ "convnext_2": {
146
+ "idim": 512,
147
+ "ksz": 5,
148
+ "intermediate_dim": 2048,
149
+ "num_layers": 1,
150
+ "dilation_lst": [
151
+ 1
152
+ ]
153
+ }
154
+ },
155
+ "last_convnext": {
156
+ "idim": 512,
157
+ "ksz": 5,
158
+ "intermediate_dim": 2048,
159
+ "num_layers": 4,
160
+ "dilation_lst": [
161
+ 1,
162
+ 1,
163
+ 1,
164
+ 1
165
+ ]
166
+ },
167
+ "proj_out": {
168
+ "idim": 512,
169
+ "chunk_compress_factor": 6,
170
+ "ldim": 24
171
+ }
172
+ }
173
+ },
174
+ "ae": {
175
+ "sample_rate": 44100,
176
+ "n_delay": 0,
177
+ "base_chunk_size": 512,
178
+ "chunk_compress_factor": 1,
179
+ "ldim": 24,
180
+ "encoder": {
181
+ "spec_processor": {
182
+ "n_fft": 2048,
183
+ "win_length": 2048,
184
+ "hop_length": 512,
185
+ "n_mels": 228,
186
+ "sample_rate": 44100,
187
+ "eps": 1e-05,
188
+ "norm_mean": 0.0,
189
+ "norm_std": 1.0
190
+ },
191
+ "ksz_init": 7,
192
+ "ksz": 7,
193
+ "num_layers": 10,
194
+ "dilation_lst": [
195
+ 1,
196
+ 1,
197
+ 1,
198
+ 1,
199
+ 1,
200
+ 1,
201
+ 1,
202
+ 1,
203
+ 1,
204
+ 1
205
+ ],
206
+ "intermediate_dim": 2048,
207
+ "idim": 1253,
208
+ "hdim": 512,
209
+ "odim": 24
210
+ },
211
+ "decoder": {
212
+ "ksz_init": 7,
213
+ "ksz": 7,
214
+ "num_layers": 10,
215
+ "dilation_lst": [
216
+ 1,
217
+ 2,
218
+ 4,
219
+ 1,
220
+ 2,
221
+ 4,
222
+ 1,
223
+ 1,
224
+ 1,
225
+ 1
226
+ ],
227
+ "intermediate_dim": 2048,
228
+ "idim": 24,
229
+ "hdim": 512,
230
+ "head": {
231
+ "idim": 512,
232
+ "hdim": 2048,
233
+ "odim": 512,
234
+ "ksz": 3
235
+ }
236
+ }
237
+ },
238
+ "dp": {
239
+ "latent_dim": 24,
240
+ "chunk_compress_factor": 6,
241
+ "normalizer": {
242
+ "scale": 1.0
243
+ },
244
+ "sentence_encoder": {
245
+ "char_emb_dim": 64,
246
+ "text_embedder": {
247
+ "char_emb_dim": 64
248
+ },
249
+ "convnext": {
250
+ "idim": 64,
251
+ "ksz": 5,
252
+ "intermediate_dim": 256,
253
+ "num_layers": 6,
254
+ "dilation_lst": [
255
+ 1,
256
+ 1,
257
+ 1,
258
+ 1,
259
+ 1,
260
+ 1
261
+ ]
262
+ },
263
+ "attn_encoder": {
264
+ "hidden_channels": 64,
265
+ "filter_channels": 256,
266
+ "n_heads": 2,
267
+ "n_layers": 2,
268
+ "p_dropout": 0.0
269
+ },
270
+ "proj_out": {
271
+ "idim": 64,
272
+ "odim": 64
273
+ }
274
+ },
275
+ "style_encoder": {
276
+ "proj_in": {
277
+ "ldim": 24,
278
+ "chunk_compress_factor": 6,
279
+ "odim": 64
280
+ },
281
+ "convnext": {
282
+ "idim": 64,
283
+ "ksz": 5,
284
+ "intermediate_dim": 256,
285
+ "num_layers": 4,
286
+ "dilation_lst": [
287
+ 1,
288
+ 1,
289
+ 1,
290
+ 1
291
+ ]
292
+ },
293
+ "style_token_layer": {
294
+ "input_dim": 64,
295
+ "n_style": 8,
296
+ "style_key_dim": 0,
297
+ "style_value_dim": 16,
298
+ "prototype_dim": 64,
299
+ "n_units": 64,
300
+ "n_heads": 2
301
+ }
302
+ },
303
+ "predictor": {
304
+ "sentence_dim": 64,
305
+ "n_style": 8,
306
+ "style_dim": 16,
307
+ "hdim": 128,
308
+ "n_layer": 2
309
+ }
310
+ }
311
+ }
onnx/unicode_indexer.json ADDED
The diff for this file is too large to render. See raw diff
 
onnx/vector_estimator.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:883ac868ea0275ef0e991524dc64f16b3c0376efd7c320af6b53f5b780d7c61c
3
+ size 256534781
onnx/vocoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:085de76dd8e8d5836d6ca66826601f615939218f90e519f70ee8a36ed2a4c4ba
3
+ size 101424195