kybird commited on
Commit
96af89d
·
verified ·
1 Parent(s): dab8940

Upload supertonic/tts.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. supertonic/tts.json +316 -0
supertonic/tts.json ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tts_version": "v1.6.0",
3
+ "split": "opensource-multilingual",
4
+ "ttl_ckpt_path": "unknown.pt",
5
+ "dp_ckpt_path": "unknown.pt",
6
+ "ae_ckpt_path": "unknown.pt",
7
+ "ttl_train": "unknown",
8
+ "dp_train": "unknown",
9
+ "ae_train": "unknown",
10
+ "ttl": {
11
+ "latent_dim": 24,
12
+ "chunk_compress_factor": 6,
13
+ "batch_expander": {
14
+ "n_batch_expand": 6
15
+ },
16
+ "normalizer": {
17
+ "scale": 0.25
18
+ },
19
+ "text_encoder": {
20
+ "char_dict_path": "resources/metadata/char_dict/opensource-multilingual2/char_dict.json",
21
+ "text_embedder": {
22
+ "char_dict_path": "resources/metadata/char_dict/opensource-multilingual2/char_dict.json",
23
+ "char_emb_dim": 256
24
+ },
25
+ "convnext": {
26
+ "idim": 256,
27
+ "ksz": 5,
28
+ "intermediate_dim": 1024,
29
+ "num_layers": 6,
30
+ "dilation_lst": [
31
+ 1,
32
+ 1,
33
+ 1,
34
+ 1,
35
+ 1,
36
+ 1
37
+ ]
38
+ },
39
+ "attn_encoder": {
40
+ "hidden_channels": 256,
41
+ "filter_channels": 1024,
42
+ "n_heads": 4,
43
+ "n_layers": 4,
44
+ "p_dropout": 0.1
45
+ },
46
+ "proj_out": {
47
+ "idim": 256,
48
+ "odim": 256
49
+ }
50
+ },
51
+ "flow_matching": {
52
+ "sig_min": 0
53
+ },
54
+ "style_encoder": {
55
+ "proj_in": {
56
+ "ldim": 24,
57
+ "chunk_compress_factor": 6,
58
+ "odim": 256
59
+ },
60
+ "convnext": {
61
+ "idim": 256,
62
+ "ksz": 5,
63
+ "intermediate_dim": 1024,
64
+ "num_layers": 6,
65
+ "dilation_lst": [
66
+ 1,
67
+ 1,
68
+ 1,
69
+ 1,
70
+ 1,
71
+ 1
72
+ ]
73
+ },
74
+ "style_token_layer": {
75
+ "input_dim": 256,
76
+ "n_style": 50,
77
+ "style_key_dim": 256,
78
+ "style_value_dim": 256,
79
+ "prototype_dim": 256,
80
+ "n_units": 256,
81
+ "n_heads": 2
82
+ }
83
+ },
84
+ "speech_prompted_text_encoder": {
85
+ "text_dim": 256,
86
+ "style_dim": 256,
87
+ "n_units": 256,
88
+ "n_heads": 2
89
+ },
90
+ "uncond_masker": {
91
+ "prob_both_uncond": 0.04,
92
+ "prob_text_uncond": 0.01,
93
+ "std": 0.1,
94
+ "text_dim": 256,
95
+ "n_style": 50,
96
+ "style_key_dim": 256,
97
+ "style_value_dim": 256
98
+ },
99
+ "vector_field": {
100
+ "proj_in": {
101
+ "ldim": 24,
102
+ "chunk_compress_factor": 6,
103
+ "odim": 512
104
+ },
105
+ "time_encoder": {
106
+ "time_dim": 64,
107
+ "hdim": 256
108
+ },
109
+ "main_blocks": {
110
+ "n_blocks": 4,
111
+ "time_cond_layer": {
112
+ "idim": 512,
113
+ "time_dim": 64
114
+ },
115
+ "style_cond_layer": {
116
+ "idim": 512,
117
+ "style_dim": 256
118
+ },
119
+ "text_cond_layer": {
120
+ "idim": 512,
121
+ "text_dim": 256,
122
+ "n_heads": 4,
123
+ "use_residual": true,
124
+ "rotary_base": 10000,
125
+ "rotary_scale": 10
126
+ },
127
+ "convnext_0": {
128
+ "idim": 512,
129
+ "ksz": 5,
130
+ "intermediate_dim": 1024,
131
+ "num_layers": 4,
132
+ "dilation_lst": [
133
+ 1,
134
+ 2,
135
+ 4,
136
+ 8
137
+ ]
138
+ },
139
+ "convnext_1": {
140
+ "idim": 512,
141
+ "ksz": 5,
142
+ "intermediate_dim": 1024,
143
+ "num_layers": 1,
144
+ "dilation_lst": [
145
+ 1
146
+ ]
147
+ },
148
+ "convnext_2": {
149
+ "idim": 512,
150
+ "ksz": 5,
151
+ "intermediate_dim": 1024,
152
+ "num_layers": 1,
153
+ "dilation_lst": [
154
+ 1
155
+ ]
156
+ }
157
+ },
158
+ "last_convnext": {
159
+ "idim": 512,
160
+ "ksz": 5,
161
+ "intermediate_dim": 1024,
162
+ "num_layers": 4,
163
+ "dilation_lst": [
164
+ 1,
165
+ 1,
166
+ 1,
167
+ 1
168
+ ]
169
+ },
170
+ "proj_out": {
171
+ "idim": 512,
172
+ "chunk_compress_factor": 6,
173
+ "ldim": 24
174
+ }
175
+ }
176
+ },
177
+ "ae": {
178
+ "sample_rate": 44100,
179
+ "n_delay": 0,
180
+ "base_chunk_size": 512,
181
+ "chunk_compress_factor": 1,
182
+ "ldim": 24,
183
+ "encoder": {
184
+ "spec_processor": {
185
+ "n_fft": 2048,
186
+ "win_length": 2048,
187
+ "hop_length": 512,
188
+ "n_mels": 228,
189
+ "sample_rate": 44100,
190
+ "eps": 1e-05,
191
+ "norm_mean": 0.0,
192
+ "norm_std": 1.0
193
+ },
194
+ "ksz_init": 7,
195
+ "ksz": 7,
196
+ "num_layers": 10,
197
+ "dilation_lst": [
198
+ 1,
199
+ 1,
200
+ 1,
201
+ 1,
202
+ 1,
203
+ 1,
204
+ 1,
205
+ 1,
206
+ 1,
207
+ 1
208
+ ],
209
+ "intermediate_dim": 2048,
210
+ "idim": 1253,
211
+ "hdim": 512,
212
+ "odim": 24
213
+ },
214
+ "decoder": {
215
+ "ksz_init": 7,
216
+ "ksz": 7,
217
+ "num_layers": 10,
218
+ "dilation_lst": [
219
+ 1,
220
+ 2,
221
+ 4,
222
+ 1,
223
+ 2,
224
+ 4,
225
+ 1,
226
+ 1,
227
+ 1,
228
+ 1
229
+ ],
230
+ "intermediate_dim": 2048,
231
+ "idim": 24,
232
+ "hdim": 512,
233
+ "head": {
234
+ "idim": 512,
235
+ "hdim": 2048,
236
+ "odim": 512,
237
+ "ksz": 3
238
+ }
239
+ }
240
+ },
241
+ "dp": {
242
+ "latent_dim": 24,
243
+ "chunk_compress_factor": 6,
244
+ "normalizer": {
245
+ "scale": 1.0
246
+ },
247
+ "sentence_encoder": {
248
+ "char_emb_dim": 64,
249
+ "char_dict_path": "resources/metadata/char_dict/opensource-multilingual2/char_dict.json",
250
+ "text_embedder": {
251
+ "char_dict_path": "resources/metadata/char_dict/opensource-multilingual2/char_dict.json",
252
+ "char_emb_dim": 64
253
+ },
254
+ "convnext": {
255
+ "idim": 64,
256
+ "ksz": 5,
257
+ "intermediate_dim": 256,
258
+ "num_layers": 6,
259
+ "dilation_lst": [
260
+ 1,
261
+ 1,
262
+ 1,
263
+ 1,
264
+ 1,
265
+ 1
266
+ ]
267
+ },
268
+ "attn_encoder": {
269
+ "hidden_channels": 64,
270
+ "filter_channels": 256,
271
+ "n_heads": 2,
272
+ "n_layers": 2,
273
+ "p_dropout": 0.0
274
+ },
275
+ "proj_out": {
276
+ "idim": 64,
277
+ "odim": 64
278
+ }
279
+ },
280
+ "style_encoder": {
281
+ "proj_in": {
282
+ "ldim": 24,
283
+ "chunk_compress_factor": 6,
284
+ "odim": 64
285
+ },
286
+ "convnext": {
287
+ "idim": 64,
288
+ "ksz": 5,
289
+ "intermediate_dim": 256,
290
+ "num_layers": 4,
291
+ "dilation_lst": [
292
+ 1,
293
+ 1,
294
+ 1,
295
+ 1
296
+ ]
297
+ },
298
+ "style_token_layer": {
299
+ "input_dim": 64,
300
+ "n_style": 8,
301
+ "style_key_dim": 0,
302
+ "style_value_dim": 16,
303
+ "prototype_dim": 64,
304
+ "n_units": 64,
305
+ "n_heads": 2
306
+ }
307
+ },
308
+ "predictor": {
309
+ "sentence_dim": 64,
310
+ "n_style": 8,
311
+ "style_dim": 16,
312
+ "hdim": 128,
313
+ "n_layer": 2
314
+ }
315
+ }
316
+ }