mimba committed on
Commit
f861007
·
verified ·
1 Parent(s): 20befc3

Delete onnx

Browse files
onnx/duration_predictor.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b861580c56a0cba2a2b82aa697ecb3c5a163c3240c60a0ddfac369d21d054092
3
- size 1500789
 
 
 
 
onnx/m.txt DELETED
File without changes
onnx/text_encoder.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba0c8ea74aeb5df00d21a89b8d47c71317f47120232e3deef95024dba37dbd88
3
- size 27348373
 
 
 
 
onnx/tts.json DELETED
@@ -1,316 +0,0 @@
1
- {
2
- "tts_version": "v1.5.0",
3
- "split": "opensource-en",
4
- "ttl_ckpt_path": "unknown.pt",
5
- "dp_ckpt_path": "unknown.pt",
6
- "ae_ckpt_path": "unknown.pt",
7
- "ttl_train": "unknown",
8
- "dp_train": "unknown",
9
- "ae_train": "unknown",
10
- "ttl": {
11
- "latent_dim": 24,
12
- "chunk_compress_factor": 6,
13
- "batch_expander": {
14
- "n_batch_expand": 6
15
- },
16
- "normalizer": {
17
- "scale": 0.25
18
- },
19
- "text_encoder": {
20
- "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
21
- "text_embedder": {
22
- "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
23
- "char_emb_dim": 256
24
- },
25
- "convnext": {
26
- "idim": 256,
27
- "ksz": 5,
28
- "intermediate_dim": 1024,
29
- "num_layers": 6,
30
- "dilation_lst": [
31
- 1,
32
- 1,
33
- 1,
34
- 1,
35
- 1,
36
- 1
37
- ]
38
- },
39
- "attn_encoder": {
40
- "hidden_channels": 256,
41
- "filter_channels": 1024,
42
- "n_heads": 4,
43
- "n_layers": 4,
44
- "p_dropout": 0.0
45
- },
46
- "proj_out": {
47
- "idim": 256,
48
- "odim": 256
49
- }
50
- },
51
- "flow_matching": {
52
- "sig_min": 0
53
- },
54
- "style_encoder": {
55
- "proj_in": {
56
- "ldim": 24,
57
- "chunk_compress_factor": 6,
58
- "odim": 256
59
- },
60
- "convnext": {
61
- "idim": 256,
62
- "ksz": 5,
63
- "intermediate_dim": 1024,
64
- "num_layers": 6,
65
- "dilation_lst": [
66
- 1,
67
- 1,
68
- 1,
69
- 1,
70
- 1,
71
- 1
72
- ]
73
- },
74
- "style_token_layer": {
75
- "input_dim": 256,
76
- "n_style": 50,
77
- "style_key_dim": 256,
78
- "style_value_dim": 256,
79
- "prototype_dim": 256,
80
- "n_units": 256,
81
- "n_heads": 2
82
- }
83
- },
84
- "speech_prompted_text_encoder": {
85
- "text_dim": 256,
86
- "style_dim": 256,
87
- "n_units": 256,
88
- "n_heads": 2
89
- },
90
- "uncond_masker": {
91
- "prob_both_uncond": 0.04,
92
- "prob_text_uncond": 0.01,
93
- "std": 0.1,
94
- "text_dim": 256,
95
- "n_style": 50,
96
- "style_key_dim": 256,
97
- "style_value_dim": 256
98
- },
99
- "vector_field": {
100
- "proj_in": {
101
- "ldim": 24,
102
- "chunk_compress_factor": 6,
103
- "odim": 512
104
- },
105
- "time_encoder": {
106
- "time_dim": 64,
107
- "hdim": 256
108
- },
109
- "main_blocks": {
110
- "n_blocks": 4,
111
- "time_cond_layer": {
112
- "idim": 512,
113
- "time_dim": 64
114
- },
115
- "style_cond_layer": {
116
- "idim": 512,
117
- "style_dim": 256
118
- },
119
- "text_cond_layer": {
120
- "idim": 512,
121
- "text_dim": 256,
122
- "n_heads": 4,
123
- "use_residual": true,
124
- "rotary_base": 10000,
125
- "rotary_scale": 10
126
- },
127
- "convnext_0": {
128
- "idim": 512,
129
- "ksz": 5,
130
- "intermediate_dim": 1024,
131
- "num_layers": 4,
132
- "dilation_lst": [
133
- 1,
134
- 2,
135
- 4,
136
- 8
137
- ]
138
- },
139
- "convnext_1": {
140
- "idim": 512,
141
- "ksz": 5,
142
- "intermediate_dim": 1024,
143
- "num_layers": 1,
144
- "dilation_lst": [
145
- 1
146
- ]
147
- },
148
- "convnext_2": {
149
- "idim": 512,
150
- "ksz": 5,
151
- "intermediate_dim": 1024,
152
- "num_layers": 1,
153
- "dilation_lst": [
154
- 1
155
- ]
156
- }
157
- },
158
- "last_convnext": {
159
- "idim": 512,
160
- "ksz": 5,
161
- "intermediate_dim": 1024,
162
- "num_layers": 4,
163
- "dilation_lst": [
164
- 1,
165
- 1,
166
- 1,
167
- 1
168
- ]
169
- },
170
- "proj_out": {
171
- "idim": 512,
172
- "chunk_compress_factor": 6,
173
- "ldim": 24
174
- }
175
- }
176
- },
177
- "ae": {
178
- "sample_rate": 44100,
179
- "n_delay": 0,
180
- "base_chunk_size": 512,
181
- "chunk_compress_factor": 1,
182
- "ldim": 24,
183
- "encoder": {
184
- "spec_processor": {
185
- "n_fft": 2048,
186
- "win_length": 2048,
187
- "hop_length": 512,
188
- "n_mels": 228,
189
- "sample_rate": 44100,
190
- "eps": 1e-05,
191
- "norm_mean": 0.0,
192
- "norm_std": 1.0
193
- },
194
- "ksz_init": 7,
195
- "ksz": 7,
196
- "num_layers": 10,
197
- "dilation_lst": [
198
- 1,
199
- 1,
200
- 1,
201
- 1,
202
- 1,
203
- 1,
204
- 1,
205
- 1,
206
- 1,
207
- 1
208
- ],
209
- "intermediate_dim": 2048,
210
- "idim": 1253,
211
- "hdim": 512,
212
- "odim": 24
213
- },
214
- "decoder": {
215
- "ksz_init": 7,
216
- "ksz": 7,
217
- "num_layers": 10,
218
- "dilation_lst": [
219
- 1,
220
- 2,
221
- 4,
222
- 1,
223
- 2,
224
- 4,
225
- 1,
226
- 1,
227
- 1,
228
- 1
229
- ],
230
- "intermediate_dim": 2048,
231
- "idim": 24,
232
- "hdim": 512,
233
- "head": {
234
- "idim": 512,
235
- "hdim": 2048,
236
- "odim": 512,
237
- "ksz": 3
238
- }
239
- }
240
- },
241
- "dp": {
242
- "latent_dim": 24,
243
- "chunk_compress_factor": 6,
244
- "normalizer": {
245
- "scale": 1.0
246
- },
247
- "sentence_encoder": {
248
- "char_emb_dim": 64,
249
- "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
250
- "text_embedder": {
251
- "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
252
- "char_emb_dim": 64
253
- },
254
- "convnext": {
255
- "idim": 64,
256
- "ksz": 5,
257
- "intermediate_dim": 256,
258
- "num_layers": 6,
259
- "dilation_lst": [
260
- 1,
261
- 1,
262
- 1,
263
- 1,
264
- 1,
265
- 1
266
- ]
267
- },
268
- "attn_encoder": {
269
- "hidden_channels": 64,
270
- "filter_channels": 256,
271
- "n_heads": 2,
272
- "n_layers": 2,
273
- "p_dropout": 0.0
274
- },
275
- "proj_out": {
276
- "idim": 64,
277
- "odim": 64
278
- }
279
- },
280
- "style_encoder": {
281
- "proj_in": {
282
- "ldim": 24,
283
- "chunk_compress_factor": 6,
284
- "odim": 64
285
- },
286
- "convnext": {
287
- "idim": 64,
288
- "ksz": 5,
289
- "intermediate_dim": 256,
290
- "num_layers": 4,
291
- "dilation_lst": [
292
- 1,
293
- 1,
294
- 1,
295
- 1
296
- ]
297
- },
298
- "style_token_layer": {
299
- "input_dim": 64,
300
- "n_style": 8,
301
- "style_key_dim": 0,
302
- "style_value_dim": 16,
303
- "prototype_dim": 64,
304
- "n_units": 64,
305
- "n_heads": 2
306
- }
307
- },
308
- "predictor": {
309
- "sentence_dim": 64,
310
- "n_style": 8,
311
- "style_dim": 16,
312
- "hdim": 128,
313
- "n_layer": 2
314
- }
315
- }
316
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
onnx/tts.yml DELETED
@@ -1,223 +0,0 @@
1
- tts_version: "v1.5.0"
2
-
3
- split: "opensource-en"
4
-
5
- ttl_ckpt_path: "unknown.pt"
6
-
7
- dp_ckpt_path: "unknown.pt"
8
-
9
- ae_ckpt_path: "unknown.pt"
10
-
11
- ttl_train: "unknown"
12
-
13
- dp_train: "unknown"
14
-
15
- ae_train: "unknown"
16
-
17
- ttl:
18
- latent_dim: 24
19
- chunk_compress_factor: 6
20
- batch_expander:
21
- n_batch_expand: 6
22
- normalizer:
23
- scale: 0.25
24
- text_encoder:
25
- char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
26
- text_embedder:
27
- char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
28
- char_emb_dim: 256
29
- convnext:
30
- idim: 256
31
- ksz: 5
32
- intermediate_dim: 1024
33
- num_layers: 6
34
- dilation_lst: [1, 1, 1, 1, 1, 1]
35
- attn_encoder:
36
- hidden_channels: 256
37
- filter_channels: 1024
38
- n_heads: 4
39
- n_layers: 4
40
- p_dropout: 0.0
41
- proj_out:
42
- idim: 256
43
- odim: 256
44
- flow_matching:
45
- sig_min: 0
46
- style_encoder:
47
- proj_in:
48
- ldim: 24
49
- chunk_compress_factor: 6
50
- odim: 256
51
- convnext:
52
- idim: 256
53
- ksz: 5
54
- intermediate_dim: 1024
55
- num_layers: 6
56
- dilation_lst: [1, 1, 1, 1, 1, 1]
57
- style_token_layer:
58
- input_dim: 256
59
- n_style: 50
60
- style_key_dim: 256
61
- style_value_dim: 256
62
- prototype_dim: 256
63
- n_units: 256
64
- n_heads: 2
65
- speech_prompted_text_encoder:
66
- text_dim: 256
67
- style_dim: 256
68
- n_units: 256
69
- n_heads: 2
70
- uncond_masker:
71
- prob_both_uncond: 0.04
72
- prob_text_uncond: 0.01
73
- std: 0.1
74
- text_dim: 256
75
- n_style: 50
76
- style_key_dim: 256
77
- style_value_dim: 256
78
- vector_field:
79
- proj_in:
80
- ldim: 24
81
- chunk_compress_factor: 6
82
- odim: 512
83
- time_encoder:
84
- time_dim: 64
85
- hdim: 256
86
- main_blocks:
87
- n_blocks: 4
88
- time_cond_layer:
89
- idim: 512
90
- time_dim: 64
91
- style_cond_layer:
92
- idim: 512
93
- style_dim: 256
94
- text_cond_layer:
95
- idim: 512
96
- text_dim: 256
97
- n_heads: 4
98
- use_residual: True
99
- rotary_base: 10000
100
- rotary_scale: 10
101
- convnext_0:
102
- idim: 512
103
- ksz: 5
104
- intermediate_dim: 1024
105
- num_layers: 4
106
- dilation_lst: [1, 2, 4, 8]
107
- convnext_1:
108
- idim: 512
109
- ksz: 5
110
- intermediate_dim: 1024
111
- num_layers: 1
112
- dilation_lst: [1]
113
- convnext_2:
114
- idim: 512
115
- ksz: 5
116
- intermediate_dim: 1024
117
- num_layers: 1
118
- dilation_lst: [1]
119
- last_convnext:
120
- idim: 512
121
- ksz: 5
122
- intermediate_dim: 1024
123
- num_layers: 4
124
- dilation_lst: [1, 1, 1, 1]
125
- proj_out:
126
- idim: 512
127
- chunk_compress_factor: 6
128
- ldim: 24
129
-
130
- ae:
131
- sample_rate: 44100
132
- n_delay: 0
133
- base_chunk_size: 512
134
- chunk_compress_factor: 1
135
- ldim: 24
136
- encoder:
137
- spec_processor:
138
- n_fft: 2048
139
- win_length: 2048
140
- hop_length: 512
141
- n_mels: 228
142
- sample_rate: 44100
143
- eps: 1e-05
144
- norm_mean: 0.0
145
- norm_std: 1.0
146
- ksz_init: 7
147
- ksz: 7
148
- num_layers: 10
149
- dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
150
- intermediate_dim: 2048
151
- idim: 1253
152
- hdim: 512
153
- odim: 24
154
- decoder:
155
- ksz_init: 7
156
- ksz: 7
157
- num_layers: 10
158
- dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
159
- intermediate_dim: 2048
160
- idim: 24
161
- hdim: 512
162
- head:
163
- idim: 512
164
- hdim: 2048
165
- odim: 512
166
- ksz: 3
167
-
168
- dp:
169
- latent_dim: 24
170
- chunk_compress_factor: 6
171
- normalizer:
172
- scale: 1.0
173
- sentence_encoder:
174
- char_emb_dim: 64
175
- char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
176
- text_embedder:
177
- char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
178
- char_emb_dim: 64
179
- convnext:
180
- idim: 64
181
- ksz: 5
182
- intermediate_dim: 256
183
- num_layers: 6
184
- dilation_lst: [1, 1, 1, 1, 1, 1]
185
- attn_encoder:
186
- hidden_channels: 64
187
- filter_channels: 256
188
- n_heads: 2
189
- n_layers: 2
190
- p_dropout: 0.0
191
- proj_out:
192
- idim: 64
193
- odim: 64
194
- style_encoder:
195
- proj_in:
196
- ldim: 24
197
- chunk_compress_factor: 6
198
- odim: 64
199
- convnext:
200
- idim: 64
201
- ksz: 5
202
- intermediate_dim: 256
203
- num_layers: 4
204
- dilation_lst: [1, 1, 1, 1]
205
- style_token_layer:
206
- input_dim: 64
207
- n_style: 8
208
- style_key_dim: 0
209
- style_value_dim: 16
210
- prototype_dim: 64
211
- n_units: 64
212
- n_heads: 2
213
- predictor:
214
- sentence_dim: 64
215
- n_style: 8
216
- style_dim: 16
217
- hdim: 128
218
- n_layer: 2
219
-
220
- unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
221
- unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
222
- window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
223
- filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
onnx/unicode_indexer.json DELETED
The diff for this file is too large to render. See raw diff
 
onnx/vector_estimator.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3f82ecd2e9decc4e2236048b03628a1c1d5f14a792ba274a59b7325107aa6a6
3
- size 132471364
 
 
 
 
onnx/vocoder.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:19bd51f47a186069c752403518a40f7ea4c647455056d2511f7249691ecddf7c
3
- size 101405066