Approximetal committed on
Commit
f99f75d
·
verified ·
1 Parent(s): 16fa5ac

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +24 -0
  2. pretrained_models/ckpts/prosody_encoder/expressivity_encoder_key.txt +403 -0
  3. pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json +822 -0
  4. pretrained_models/ckpts/prosody_encoder/pretssel_model.pt +3 -0
  5. pretrained_models/ckpts/prosody_encoder/prosody_UnitY2_keys.txt +1737 -0
  6. pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt +3 -0
  7. pretrained_models/ckpts/prosody_encoder/prosody_encoder_pretssel.pt +3 -0
  8. pretrained_models/ckpts/vocos-mel-24khz/.gitattributes +34 -0
  9. pretrained_models/ckpts/vocos-mel-24khz/README.md +71 -0
  10. pretrained_models/ckpts/vocos-mel-24khz/config.yaml +24 -0
  11. pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin +3 -0
  12. pretrained_models/data/multilingual_grl/vocab.txt +898 -0
  13. pretrained_models/data/multilingual_prosody/vocab.txt +898 -0
  14. pretrained_models/data/test_examples/en.wav +3 -0
  15. pretrained_models/data/test_examples/es.wav +3 -0
  16. pretrained_models/data/test_examples/pt.wav +3 -0
  17. pretrained_models/denoiser_model.onnx +3 -0
  18. pretrained_models/espeak-ng-data/af_dict +3 -0
  19. pretrained_models/espeak-ng-data/am_dict +0 -0
  20. pretrained_models/espeak-ng-data/an_dict +0 -0
  21. pretrained_models/espeak-ng-data/ar_dict +3 -0
  22. pretrained_models/espeak-ng-data/as_dict +0 -0
  23. pretrained_models/espeak-ng-data/az_dict +0 -0
  24. pretrained_models/espeak-ng-data/ba_dict +0 -0
  25. pretrained_models/espeak-ng-data/be_dict +0 -0
  26. pretrained_models/espeak-ng-data/bg_dict +0 -0
  27. pretrained_models/espeak-ng-data/bn_dict +0 -0
  28. pretrained_models/espeak-ng-data/bpy_dict +0 -0
  29. pretrained_models/espeak-ng-data/bs_dict +0 -0
  30. pretrained_models/espeak-ng-data/ca_dict +3 -0
  31. pretrained_models/espeak-ng-data/chr_dict +0 -0
  32. pretrained_models/espeak-ng-data/cmn_dict +3 -0
  33. pretrained_models/espeak-ng-data/cs_dict +0 -0
  34. pretrained_models/espeak-ng-data/cv_dict +0 -0
  35. pretrained_models/espeak-ng-data/cy_dict +0 -0
  36. pretrained_models/espeak-ng-data/da_dict +3 -0
  37. pretrained_models/espeak-ng-data/de_dict +0 -0
  38. pretrained_models/espeak-ng-data/el_dict +0 -0
  39. pretrained_models/espeak-ng-data/en_dict +3 -0
  40. pretrained_models/espeak-ng-data/eo_dict +0 -0
  41. pretrained_models/espeak-ng-data/es_dict +0 -0
  42. pretrained_models/espeak-ng-data/et_dict +0 -0
  43. pretrained_models/espeak-ng-data/eu_dict +0 -0
  44. pretrained_models/espeak-ng-data/fa_dict +3 -0
  45. pretrained_models/espeak-ng-data/fi_dict +0 -0
  46. pretrained_models/espeak-ng-data/fo_dict +3 -0
  47. pretrained_models/espeak-ng-data/fr_dict +0 -0
  48. pretrained_models/espeak-ng-data/ga_dict +0 -0
  49. pretrained_models/espeak-ng-data/gd_dict +0 -0
  50. pretrained_models/espeak-ng-data/gn_dict +0 -0
.gitattributes CHANGED
@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ pretrained_models/data/test_examples/en.wav filter=lfs diff=lfs merge=lfs -text
37
+ pretrained_models/data/test_examples/es.wav filter=lfs diff=lfs merge=lfs -text
38
+ pretrained_models/data/test_examples/pt.wav filter=lfs diff=lfs merge=lfs -text
39
+ pretrained_models/espeak-ng-data/af_dict filter=lfs diff=lfs merge=lfs -text
40
+ pretrained_models/espeak-ng-data/ar_dict filter=lfs diff=lfs merge=lfs -text
41
+ pretrained_models/espeak-ng-data/ca_dict filter=lfs diff=lfs merge=lfs -text
42
+ pretrained_models/espeak-ng-data/cmn_dict filter=lfs diff=lfs merge=lfs -text
43
+ pretrained_models/espeak-ng-data/da_dict filter=lfs diff=lfs merge=lfs -text
44
+ pretrained_models/espeak-ng-data/en_dict filter=lfs diff=lfs merge=lfs -text
45
+ pretrained_models/espeak-ng-data/fa_dict filter=lfs diff=lfs merge=lfs -text
46
+ pretrained_models/espeak-ng-data/fo_dict filter=lfs diff=lfs merge=lfs -text
47
+ pretrained_models/espeak-ng-data/hu_dict filter=lfs diff=lfs merge=lfs -text
48
+ pretrained_models/espeak-ng-data/ia_dict filter=lfs diff=lfs merge=lfs -text
49
+ pretrained_models/espeak-ng-data/it_dict filter=lfs diff=lfs merge=lfs -text
50
+ pretrained_models/espeak-ng-data/lb_dict filter=lfs diff=lfs merge=lfs -text
51
+ pretrained_models/espeak-ng-data/phondata filter=lfs diff=lfs merge=lfs -text
52
+ pretrained_models/espeak-ng-data/ru_dict filter=lfs diff=lfs merge=lfs -text
53
+ pretrained_models/espeak-ng-data/ta_dict filter=lfs diff=lfs merge=lfs -text
54
+ pretrained_models/espeak-ng-data/ur_dict filter=lfs diff=lfs merge=lfs -text
55
+ pretrained_models/espeak-ng-data/yue_dict filter=lfs diff=lfs merge=lfs -text
56
+ pretrained_models/espeak-ng-lib/libespeak-ng.so filter=lfs diff=lfs merge=lfs -text
57
+ pretrained_models/espeak-ng-lib/libespeak-ng.so.1 filter=lfs diff=lfs merge=lfs -text
58
+ pretrained_models/espeak-ng-lib/libespeak-ng.so.1.1.51 filter=lfs diff=lfs merge=lfs -text
59
+ pretrained_models/whisperx-vad-segmentation.bak filter=lfs diff=lfs merge=lfs -text
pretrained_models/ckpts/prosody_encoder/expressivity_encoder_key.txt ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ encoder.pos_emb_alpha
2
+ encoder.embed_tokens.weight
3
+ encoder.embed_positions._float_tensor
4
+ encoder.fft_layers.0.self_attn.k_proj.weight
5
+ encoder.fft_layers.0.self_attn.k_proj.bias
6
+ encoder.fft_layers.0.self_attn.v_proj.weight
7
+ encoder.fft_layers.0.self_attn.v_proj.bias
8
+ encoder.fft_layers.0.self_attn.q_proj.weight
9
+ encoder.fft_layers.0.self_attn.q_proj.bias
10
+ encoder.fft_layers.0.self_attn.out_proj.weight
11
+ encoder.fft_layers.0.self_attn.out_proj.bias
12
+ encoder.fft_layers.0.layer_norm.weight
13
+ encoder.fft_layers.0.layer_norm.bias
14
+ encoder.fft_layers.0.ffn.ffn.0.weight
15
+ encoder.fft_layers.0.ffn.ffn.0.bias
16
+ encoder.fft_layers.0.ffn.ffn.2.weight
17
+ encoder.fft_layers.0.ffn.ffn.2.bias
18
+ encoder.fft_layers.0.ffn.layer_norm.weight
19
+ encoder.fft_layers.0.ffn.layer_norm.bias
20
+ encoder.fft_layers.0.film.s_gamma
21
+ encoder.fft_layers.0.film.s_beta
22
+ encoder.fft_layers.0.film.proj.weight
23
+ encoder.fft_layers.0.film.proj.bias
24
+ encoder.fft_layers.1.self_attn.k_proj.weight
25
+ encoder.fft_layers.1.self_attn.k_proj.bias
26
+ encoder.fft_layers.1.self_attn.v_proj.weight
27
+ encoder.fft_layers.1.self_attn.v_proj.bias
28
+ encoder.fft_layers.1.self_attn.q_proj.weight
29
+ encoder.fft_layers.1.self_attn.q_proj.bias
30
+ encoder.fft_layers.1.self_attn.out_proj.weight
31
+ encoder.fft_layers.1.self_attn.out_proj.bias
32
+ encoder.fft_layers.1.layer_norm.weight
33
+ encoder.fft_layers.1.layer_norm.bias
34
+ encoder.fft_layers.1.ffn.ffn.0.weight
35
+ encoder.fft_layers.1.ffn.ffn.0.bias
36
+ encoder.fft_layers.1.ffn.ffn.2.weight
37
+ encoder.fft_layers.1.ffn.ffn.2.bias
38
+ encoder.fft_layers.1.ffn.layer_norm.weight
39
+ encoder.fft_layers.1.ffn.layer_norm.bias
40
+ encoder.fft_layers.1.film.s_gamma
41
+ encoder.fft_layers.1.film.s_beta
42
+ encoder.fft_layers.1.film.proj.weight
43
+ encoder.fft_layers.1.film.proj.bias
44
+ encoder.fft_layers.2.self_attn.k_proj.weight
45
+ encoder.fft_layers.2.self_attn.k_proj.bias
46
+ encoder.fft_layers.2.self_attn.v_proj.weight
47
+ encoder.fft_layers.2.self_attn.v_proj.bias
48
+ encoder.fft_layers.2.self_attn.q_proj.weight
49
+ encoder.fft_layers.2.self_attn.q_proj.bias
50
+ encoder.fft_layers.2.self_attn.out_proj.weight
51
+ encoder.fft_layers.2.self_attn.out_proj.bias
52
+ encoder.fft_layers.2.layer_norm.weight
53
+ encoder.fft_layers.2.layer_norm.bias
54
+ encoder.fft_layers.2.ffn.ffn.0.weight
55
+ encoder.fft_layers.2.ffn.ffn.0.bias
56
+ encoder.fft_layers.2.ffn.ffn.2.weight
57
+ encoder.fft_layers.2.ffn.ffn.2.bias
58
+ encoder.fft_layers.2.ffn.layer_norm.weight
59
+ encoder.fft_layers.2.ffn.layer_norm.bias
60
+ encoder.fft_layers.2.film.s_gamma
61
+ encoder.fft_layers.2.film.s_beta
62
+ encoder.fft_layers.2.film.proj.weight
63
+ encoder.fft_layers.2.film.proj.bias
64
+ encoder.fft_layers.3.self_attn.k_proj.weight
65
+ encoder.fft_layers.3.self_attn.k_proj.bias
66
+ encoder.fft_layers.3.self_attn.v_proj.weight
67
+ encoder.fft_layers.3.self_attn.v_proj.bias
68
+ encoder.fft_layers.3.self_attn.q_proj.weight
69
+ encoder.fft_layers.3.self_attn.q_proj.bias
70
+ encoder.fft_layers.3.self_attn.out_proj.weight
71
+ encoder.fft_layers.3.self_attn.out_proj.bias
72
+ encoder.fft_layers.3.layer_norm.weight
73
+ encoder.fft_layers.3.layer_norm.bias
74
+ encoder.fft_layers.3.ffn.ffn.0.weight
75
+ encoder.fft_layers.3.ffn.ffn.0.bias
76
+ encoder.fft_layers.3.ffn.ffn.2.weight
77
+ encoder.fft_layers.3.ffn.ffn.2.bias
78
+ encoder.fft_layers.3.ffn.layer_norm.weight
79
+ encoder.fft_layers.3.ffn.layer_norm.bias
80
+ encoder.fft_layers.3.film.s_gamma
81
+ encoder.fft_layers.3.film.s_beta
82
+ encoder.fft_layers.3.film.proj.weight
83
+ encoder.fft_layers.3.film.proj.bias
84
+ prosody_encoder.blocks.0.conv.weight
85
+ prosody_encoder.blocks.0.conv.bias
86
+ prosody_encoder.blocks.0.norm.weight
87
+ prosody_encoder.blocks.0.norm.bias
88
+ prosody_encoder.blocks.1.tdnn1.conv.weight
89
+ prosody_encoder.blocks.1.tdnn1.conv.bias
90
+ prosody_encoder.blocks.1.tdnn1.norm.weight
91
+ prosody_encoder.blocks.1.tdnn1.norm.bias
92
+ prosody_encoder.blocks.1.res2net_block.blocks.0.conv.weight
93
+ prosody_encoder.blocks.1.res2net_block.blocks.0.conv.bias
94
+ prosody_encoder.blocks.1.res2net_block.blocks.0.norm.weight
95
+ prosody_encoder.blocks.1.res2net_block.blocks.0.norm.bias
96
+ prosody_encoder.blocks.1.res2net_block.blocks.1.conv.weight
97
+ prosody_encoder.blocks.1.res2net_block.blocks.1.conv.bias
98
+ prosody_encoder.blocks.1.res2net_block.blocks.1.norm.weight
99
+ prosody_encoder.blocks.1.res2net_block.blocks.1.norm.bias
100
+ prosody_encoder.blocks.1.res2net_block.blocks.2.conv.weight
101
+ prosody_encoder.blocks.1.res2net_block.blocks.2.conv.bias
102
+ prosody_encoder.blocks.1.res2net_block.blocks.2.norm.weight
103
+ prosody_encoder.blocks.1.res2net_block.blocks.2.norm.bias
104
+ prosody_encoder.blocks.1.res2net_block.blocks.3.conv.weight
105
+ prosody_encoder.blocks.1.res2net_block.blocks.3.conv.bias
106
+ prosody_encoder.blocks.1.res2net_block.blocks.3.norm.weight
107
+ prosody_encoder.blocks.1.res2net_block.blocks.3.norm.bias
108
+ prosody_encoder.blocks.1.res2net_block.blocks.4.conv.weight
109
+ prosody_encoder.blocks.1.res2net_block.blocks.4.conv.bias
110
+ prosody_encoder.blocks.1.res2net_block.blocks.4.norm.weight
111
+ prosody_encoder.blocks.1.res2net_block.blocks.4.norm.bias
112
+ prosody_encoder.blocks.1.res2net_block.blocks.5.conv.weight
113
+ prosody_encoder.blocks.1.res2net_block.blocks.5.conv.bias
114
+ prosody_encoder.blocks.1.res2net_block.blocks.5.norm.weight
115
+ prosody_encoder.blocks.1.res2net_block.blocks.5.norm.bias
116
+ prosody_encoder.blocks.1.res2net_block.blocks.6.conv.weight
117
+ prosody_encoder.blocks.1.res2net_block.blocks.6.conv.bias
118
+ prosody_encoder.blocks.1.res2net_block.blocks.6.norm.weight
119
+ prosody_encoder.blocks.1.res2net_block.blocks.6.norm.bias
120
+ prosody_encoder.blocks.1.tdnn2.conv.weight
121
+ prosody_encoder.blocks.1.tdnn2.conv.bias
122
+ prosody_encoder.blocks.1.tdnn2.norm.weight
123
+ prosody_encoder.blocks.1.tdnn2.norm.bias
124
+ prosody_encoder.blocks.1.se_block.conv1.weight
125
+ prosody_encoder.blocks.1.se_block.conv1.bias
126
+ prosody_encoder.blocks.1.se_block.conv2.weight
127
+ prosody_encoder.blocks.1.se_block.conv2.bias
128
+ prosody_encoder.blocks.2.tdnn1.conv.weight
129
+ prosody_encoder.blocks.2.tdnn1.conv.bias
130
+ prosody_encoder.blocks.2.tdnn1.norm.weight
131
+ prosody_encoder.blocks.2.tdnn1.norm.bias
132
+ prosody_encoder.blocks.2.res2net_block.blocks.0.conv.weight
133
+ prosody_encoder.blocks.2.res2net_block.blocks.0.conv.bias
134
+ prosody_encoder.blocks.2.res2net_block.blocks.0.norm.weight
135
+ prosody_encoder.blocks.2.res2net_block.blocks.0.norm.bias
136
+ prosody_encoder.blocks.2.res2net_block.blocks.1.conv.weight
137
+ prosody_encoder.blocks.2.res2net_block.blocks.1.conv.bias
138
+ prosody_encoder.blocks.2.res2net_block.blocks.1.norm.weight
139
+ prosody_encoder.blocks.2.res2net_block.blocks.1.norm.bias
140
+ prosody_encoder.blocks.2.res2net_block.blocks.2.conv.weight
141
+ prosody_encoder.blocks.2.res2net_block.blocks.2.conv.bias
142
+ prosody_encoder.blocks.2.res2net_block.blocks.2.norm.weight
143
+ prosody_encoder.blocks.2.res2net_block.blocks.2.norm.bias
144
+ prosody_encoder.blocks.2.res2net_block.blocks.3.conv.weight
145
+ prosody_encoder.blocks.2.res2net_block.blocks.3.conv.bias
146
+ prosody_encoder.blocks.2.res2net_block.blocks.3.norm.weight
147
+ prosody_encoder.blocks.2.res2net_block.blocks.3.norm.bias
148
+ prosody_encoder.blocks.2.res2net_block.blocks.4.conv.weight
149
+ prosody_encoder.blocks.2.res2net_block.blocks.4.conv.bias
150
+ prosody_encoder.blocks.2.res2net_block.blocks.4.norm.weight
151
+ prosody_encoder.blocks.2.res2net_block.blocks.4.norm.bias
152
+ prosody_encoder.blocks.2.res2net_block.blocks.5.conv.weight
153
+ prosody_encoder.blocks.2.res2net_block.blocks.5.conv.bias
154
+ prosody_encoder.blocks.2.res2net_block.blocks.5.norm.weight
155
+ prosody_encoder.blocks.2.res2net_block.blocks.5.norm.bias
156
+ prosody_encoder.blocks.2.res2net_block.blocks.6.conv.weight
157
+ prosody_encoder.blocks.2.res2net_block.blocks.6.conv.bias
158
+ prosody_encoder.blocks.2.res2net_block.blocks.6.norm.weight
159
+ prosody_encoder.blocks.2.res2net_block.blocks.6.norm.bias
160
+ prosody_encoder.blocks.2.tdnn2.conv.weight
161
+ prosody_encoder.blocks.2.tdnn2.conv.bias
162
+ prosody_encoder.blocks.2.tdnn2.norm.weight
163
+ prosody_encoder.blocks.2.tdnn2.norm.bias
164
+ prosody_encoder.blocks.2.se_block.conv1.weight
165
+ prosody_encoder.blocks.2.se_block.conv1.bias
166
+ prosody_encoder.blocks.2.se_block.conv2.weight
167
+ prosody_encoder.blocks.2.se_block.conv2.bias
168
+ prosody_encoder.blocks.3.tdnn1.conv.weight
169
+ prosody_encoder.blocks.3.tdnn1.conv.bias
170
+ prosody_encoder.blocks.3.tdnn1.norm.weight
171
+ prosody_encoder.blocks.3.tdnn1.norm.bias
172
+ prosody_encoder.blocks.3.res2net_block.blocks.0.conv.weight
173
+ prosody_encoder.blocks.3.res2net_block.blocks.0.conv.bias
174
+ prosody_encoder.blocks.3.res2net_block.blocks.0.norm.weight
175
+ prosody_encoder.blocks.3.res2net_block.blocks.0.norm.bias
176
+ prosody_encoder.blocks.3.res2net_block.blocks.1.conv.weight
177
+ prosody_encoder.blocks.3.res2net_block.blocks.1.conv.bias
178
+ prosody_encoder.blocks.3.res2net_block.blocks.1.norm.weight
179
+ prosody_encoder.blocks.3.res2net_block.blocks.1.norm.bias
180
+ prosody_encoder.blocks.3.res2net_block.blocks.2.conv.weight
181
+ prosody_encoder.blocks.3.res2net_block.blocks.2.conv.bias
182
+ prosody_encoder.blocks.3.res2net_block.blocks.2.norm.weight
183
+ prosody_encoder.blocks.3.res2net_block.blocks.2.norm.bias
184
+ prosody_encoder.blocks.3.res2net_block.blocks.3.conv.weight
185
+ prosody_encoder.blocks.3.res2net_block.blocks.3.conv.bias
186
+ prosody_encoder.blocks.3.res2net_block.blocks.3.norm.weight
187
+ prosody_encoder.blocks.3.res2net_block.blocks.3.norm.bias
188
+ prosody_encoder.blocks.3.res2net_block.blocks.4.conv.weight
189
+ prosody_encoder.blocks.3.res2net_block.blocks.4.conv.bias
190
+ prosody_encoder.blocks.3.res2net_block.blocks.4.norm.weight
191
+ prosody_encoder.blocks.3.res2net_block.blocks.4.norm.bias
192
+ prosody_encoder.blocks.3.res2net_block.blocks.5.conv.weight
193
+ prosody_encoder.blocks.3.res2net_block.blocks.5.conv.bias
194
+ prosody_encoder.blocks.3.res2net_block.blocks.5.norm.weight
195
+ prosody_encoder.blocks.3.res2net_block.blocks.5.norm.bias
196
+ prosody_encoder.blocks.3.res2net_block.blocks.6.conv.weight
197
+ prosody_encoder.blocks.3.res2net_block.blocks.6.conv.bias
198
+ prosody_encoder.blocks.3.res2net_block.blocks.6.norm.weight
199
+ prosody_encoder.blocks.3.res2net_block.blocks.6.norm.bias
200
+ prosody_encoder.blocks.3.tdnn2.conv.weight
201
+ prosody_encoder.blocks.3.tdnn2.conv.bias
202
+ prosody_encoder.blocks.3.tdnn2.norm.weight
203
+ prosody_encoder.blocks.3.tdnn2.norm.bias
204
+ prosody_encoder.blocks.3.se_block.conv1.weight
205
+ prosody_encoder.blocks.3.se_block.conv1.bias
206
+ prosody_encoder.blocks.3.se_block.conv2.weight
207
+ prosody_encoder.blocks.3.se_block.conv2.bias
208
+ prosody_encoder.mfa.conv.weight
209
+ prosody_encoder.mfa.conv.bias
210
+ prosody_encoder.mfa.norm.weight
211
+ prosody_encoder.mfa.norm.bias
212
+ prosody_encoder.asp.tdnn.conv.weight
213
+ prosody_encoder.asp.tdnn.conv.bias
214
+ prosody_encoder.asp.tdnn.norm.weight
215
+ prosody_encoder.asp.tdnn.norm.bias
216
+ prosody_encoder.asp.conv.weight
217
+ prosody_encoder.asp.conv.bias
218
+ prosody_encoder.asp_norm.weight
219
+ prosody_encoder.asp_norm.bias
220
+ prosody_encoder.fc.weight
221
+ prosody_encoder.fc.bias
222
+ enc_emb_proj.weight
223
+ enc_emb_proj.bias
224
+ embed_lang.weight
225
+ decoder.pos_emb_alpha
226
+ decoder.var_adaptor.duration_predictor.conv1.0.weight
227
+ decoder.var_adaptor.duration_predictor.conv1.0.bias
228
+ decoder.var_adaptor.duration_predictor.ln1.weight
229
+ decoder.var_adaptor.duration_predictor.ln1.bias
230
+ decoder.var_adaptor.duration_predictor.conv2.0.weight
231
+ decoder.var_adaptor.duration_predictor.conv2.0.bias
232
+ decoder.var_adaptor.duration_predictor.ln2.weight
233
+ decoder.var_adaptor.duration_predictor.ln2.bias
234
+ decoder.var_adaptor.duration_predictor.proj.weight
235
+ decoder.var_adaptor.duration_predictor.proj.bias
236
+ decoder.var_adaptor.duration_predictor.film.s_gamma
237
+ decoder.var_adaptor.duration_predictor.film.s_beta
238
+ decoder.var_adaptor.duration_predictor.film.proj.weight
239
+ decoder.var_adaptor.duration_predictor.film.proj.bias
240
+ decoder.var_adaptor.pitch_predictor.conv1.0.weight
241
+ decoder.var_adaptor.pitch_predictor.conv1.0.bias
242
+ decoder.var_adaptor.pitch_predictor.ln1.weight
243
+ decoder.var_adaptor.pitch_predictor.ln1.bias
244
+ decoder.var_adaptor.pitch_predictor.conv2.0.weight
245
+ decoder.var_adaptor.pitch_predictor.conv2.0.bias
246
+ decoder.var_adaptor.pitch_predictor.ln2.weight
247
+ decoder.var_adaptor.pitch_predictor.ln2.bias
248
+ decoder.var_adaptor.pitch_predictor.proj.weight
249
+ decoder.var_adaptor.pitch_predictor.proj.bias
250
+ decoder.var_adaptor.pitch_predictor.film.s_gamma
251
+ decoder.var_adaptor.pitch_predictor.film.s_beta
252
+ decoder.var_adaptor.pitch_predictor.film.proj.weight
253
+ decoder.var_adaptor.pitch_predictor.film.proj.bias
254
+ decoder.var_adaptor.embed_pitch.weight
255
+ decoder.var_adaptor.embed_pitch.bias
256
+ decoder.var_adaptor.vuv_predictor.conv1.0.weight
257
+ decoder.var_adaptor.vuv_predictor.conv1.0.bias
258
+ decoder.var_adaptor.vuv_predictor.ln1.weight
259
+ decoder.var_adaptor.vuv_predictor.ln1.bias
260
+ decoder.var_adaptor.vuv_predictor.conv2.0.weight
261
+ decoder.var_adaptor.vuv_predictor.conv2.0.bias
262
+ decoder.var_adaptor.vuv_predictor.ln2.weight
263
+ decoder.var_adaptor.vuv_predictor.ln2.bias
264
+ decoder.var_adaptor.vuv_predictor.proj.weight
265
+ decoder.var_adaptor.vuv_predictor.proj.bias
266
+ decoder.var_adaptor.vuv_predictor.film.s_gamma
267
+ decoder.var_adaptor.vuv_predictor.film.s_beta
268
+ decoder.var_adaptor.vuv_predictor.film.proj.weight
269
+ decoder.var_adaptor.vuv_predictor.film.proj.bias
270
+ decoder.var_adaptor.energy_predictor.conv1.0.weight
271
+ decoder.var_adaptor.energy_predictor.conv1.0.bias
272
+ decoder.var_adaptor.energy_predictor.ln1.weight
273
+ decoder.var_adaptor.energy_predictor.ln1.bias
274
+ decoder.var_adaptor.energy_predictor.conv2.0.weight
275
+ decoder.var_adaptor.energy_predictor.conv2.0.bias
276
+ decoder.var_adaptor.energy_predictor.ln2.weight
277
+ decoder.var_adaptor.energy_predictor.ln2.bias
278
+ decoder.var_adaptor.energy_predictor.proj.weight
279
+ decoder.var_adaptor.energy_predictor.proj.bias
280
+ decoder.var_adaptor.energy_predictor.film.s_gamma
281
+ decoder.var_adaptor.energy_predictor.film.s_beta
282
+ decoder.var_adaptor.energy_predictor.film.proj.weight
283
+ decoder.var_adaptor.energy_predictor.film.proj.bias
284
+ decoder.var_adaptor.embed_energy.weight
285
+ decoder.var_adaptor.embed_energy.bias
286
+ decoder.embed_positions._float_tensor
287
+ decoder.fft_layers.0.self_attn.k_proj.weight
288
+ decoder.fft_layers.0.self_attn.k_proj.bias
289
+ decoder.fft_layers.0.self_attn.v_proj.weight
290
+ decoder.fft_layers.0.self_attn.v_proj.bias
291
+ decoder.fft_layers.0.self_attn.q_proj.weight
292
+ decoder.fft_layers.0.self_attn.q_proj.bias
293
+ decoder.fft_layers.0.self_attn.out_proj.weight
294
+ decoder.fft_layers.0.self_attn.out_proj.bias
295
+ decoder.fft_layers.0.layer_norm.weight
296
+ decoder.fft_layers.0.layer_norm.bias
297
+ decoder.fft_layers.0.ffn.ffn.0.weight
298
+ decoder.fft_layers.0.ffn.ffn.0.bias
299
+ decoder.fft_layers.0.ffn.ffn.2.weight
300
+ decoder.fft_layers.0.ffn.ffn.2.bias
301
+ decoder.fft_layers.0.ffn.layer_norm.weight
302
+ decoder.fft_layers.0.ffn.layer_norm.bias
303
+ decoder.fft_layers.0.film.s_gamma
304
+ decoder.fft_layers.0.film.s_beta
305
+ decoder.fft_layers.0.film.proj.weight
306
+ decoder.fft_layers.0.film.proj.bias
307
+ decoder.fft_layers.1.self_attn.k_proj.weight
308
+ decoder.fft_layers.1.self_attn.k_proj.bias
309
+ decoder.fft_layers.1.self_attn.v_proj.weight
310
+ decoder.fft_layers.1.self_attn.v_proj.bias
311
+ decoder.fft_layers.1.self_attn.q_proj.weight
312
+ decoder.fft_layers.1.self_attn.q_proj.bias
313
+ decoder.fft_layers.1.self_attn.out_proj.weight
314
+ decoder.fft_layers.1.self_attn.out_proj.bias
315
+ decoder.fft_layers.1.layer_norm.weight
316
+ decoder.fft_layers.1.layer_norm.bias
317
+ decoder.fft_layers.1.ffn.ffn.0.weight
318
+ decoder.fft_layers.1.ffn.ffn.0.bias
319
+ decoder.fft_layers.1.ffn.ffn.2.weight
320
+ decoder.fft_layers.1.ffn.ffn.2.bias
321
+ decoder.fft_layers.1.ffn.layer_norm.weight
322
+ decoder.fft_layers.1.ffn.layer_norm.bias
323
+ decoder.fft_layers.1.film.s_gamma
324
+ decoder.fft_layers.1.film.s_beta
325
+ decoder.fft_layers.1.film.proj.weight
326
+ decoder.fft_layers.1.film.proj.bias
327
+ decoder.fft_layers.2.self_attn.k_proj.weight
328
+ decoder.fft_layers.2.self_attn.k_proj.bias
329
+ decoder.fft_layers.2.self_attn.v_proj.weight
330
+ decoder.fft_layers.2.self_attn.v_proj.bias
331
+ decoder.fft_layers.2.self_attn.q_proj.weight
332
+ decoder.fft_layers.2.self_attn.q_proj.bias
333
+ decoder.fft_layers.2.self_attn.out_proj.weight
334
+ decoder.fft_layers.2.self_attn.out_proj.bias
335
+ decoder.fft_layers.2.layer_norm.weight
336
+ decoder.fft_layers.2.layer_norm.bias
337
+ decoder.fft_layers.2.ffn.ffn.0.weight
338
+ decoder.fft_layers.2.ffn.ffn.0.bias
339
+ decoder.fft_layers.2.ffn.ffn.2.weight
340
+ decoder.fft_layers.2.ffn.ffn.2.bias
341
+ decoder.fft_layers.2.ffn.layer_norm.weight
342
+ decoder.fft_layers.2.ffn.layer_norm.bias
343
+ decoder.fft_layers.2.film.s_gamma
344
+ decoder.fft_layers.2.film.s_beta
345
+ decoder.fft_layers.2.film.proj.weight
346
+ decoder.fft_layers.2.film.proj.bias
347
+ decoder.fft_layers.3.self_attn.k_proj.weight
348
+ decoder.fft_layers.3.self_attn.k_proj.bias
349
+ decoder.fft_layers.3.self_attn.v_proj.weight
350
+ decoder.fft_layers.3.self_attn.v_proj.bias
351
+ decoder.fft_layers.3.self_attn.q_proj.weight
352
+ decoder.fft_layers.3.self_attn.q_proj.bias
353
+ decoder.fft_layers.3.self_attn.out_proj.weight
354
+ decoder.fft_layers.3.self_attn.out_proj.bias
355
+ decoder.fft_layers.3.layer_norm.weight
356
+ decoder.fft_layers.3.layer_norm.bias
357
+ decoder.fft_layers.3.ffn.ffn.0.weight
358
+ decoder.fft_layers.3.ffn.ffn.0.bias
359
+ decoder.fft_layers.3.ffn.ffn.2.weight
360
+ decoder.fft_layers.3.ffn.ffn.2.bias
361
+ decoder.fft_layers.3.ffn.layer_norm.weight
362
+ decoder.fft_layers.3.ffn.layer_norm.bias
363
+ decoder.fft_layers.3.film.s_gamma
364
+ decoder.fft_layers.3.film.s_beta
365
+ decoder.fft_layers.3.film.proj.weight
366
+ decoder.fft_layers.3.film.proj.bias
367
+ decoder.out_proj.weight
368
+ decoder.out_proj.bias
369
+ decoder.postnet.convolutions.0.0.weight
370
+ decoder.postnet.convolutions.0.0.bias
371
+ decoder.postnet.convolutions.0.1.weight
372
+ decoder.postnet.convolutions.0.1.bias
373
+ decoder.postnet.convolutions.0.1.running_mean
374
+ decoder.postnet.convolutions.0.1.running_var
375
+ decoder.postnet.convolutions.0.1.num_batches_tracked
376
+ decoder.postnet.convolutions.1.0.weight
377
+ decoder.postnet.convolutions.1.0.bias
378
+ decoder.postnet.convolutions.1.1.weight
379
+ decoder.postnet.convolutions.1.1.bias
380
+ decoder.postnet.convolutions.1.1.running_mean
381
+ decoder.postnet.convolutions.1.1.running_var
382
+ decoder.postnet.convolutions.1.1.num_batches_tracked
383
+ decoder.postnet.convolutions.2.0.weight
384
+ decoder.postnet.convolutions.2.0.bias
385
+ decoder.postnet.convolutions.2.1.weight
386
+ decoder.postnet.convolutions.2.1.bias
387
+ decoder.postnet.convolutions.2.1.running_mean
388
+ decoder.postnet.convolutions.2.1.running_var
389
+ decoder.postnet.convolutions.2.1.num_batches_tracked
390
+ decoder.postnet.convolutions.3.0.weight
391
+ decoder.postnet.convolutions.3.0.bias
392
+ decoder.postnet.convolutions.3.1.weight
393
+ decoder.postnet.convolutions.3.1.bias
394
+ decoder.postnet.convolutions.3.1.running_mean
395
+ decoder.postnet.convolutions.3.1.running_var
396
+ decoder.postnet.convolutions.3.1.num_batches_tracked
397
+ decoder.postnet.convolutions.4.0.weight
398
+ decoder.postnet.convolutions.4.0.bias
399
+ decoder.postnet.convolutions.4.1.weight
400
+ decoder.postnet.convolutions.4.1.bias
401
+ decoder.postnet.convolutions.4.1.running_mean
402
+ decoder.postnet.convolutions.4.1.running_var
403
+ decoder.postnet.convolutions.4.1.num_batches_tracked
pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json ADDED
@@ -0,0 +1,822 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name": null,
3
+ "common": {
4
+ "_name": null,
5
+ "no_progress_bar": false,
6
+ "log_interval": 100,
7
+ "log_format": "simple",
8
+ "log_file": null,
9
+ "aim_repo": null,
10
+ "aim_run_hash": null,
11
+ "tensorboard_logdir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
12
+ "wandb_project": null,
13
+ "azureml_logging": false,
14
+ "seed": 1,
15
+ "cpu": false,
16
+ "tpu": false,
17
+ "bf16": false,
18
+ "fp16": false,
19
+ "memory_efficient_fp16": false,
20
+ "fp16_no_flatten_grads": false,
21
+ "fp16_init_scale": 128,
22
+ "fp16_scale_window": null,
23
+ "fp16_scale_tolerance": 0.0,
24
+ "on_cpu_convert_precision": false,
25
+ "min_loss_scale": 0.0001,
26
+ "threshold_loss_scale": null,
27
+ "amp": false,
28
+ "amp_batch_retries": 2,
29
+ "amp_init_scale": 128,
30
+ "amp_scale_window": null,
31
+ "user_dir": null,
32
+ "empty_cache_freq": 0,
33
+ "all_gather_list_size": 9999999,
34
+ "model_parallel_size": 1,
35
+ "quantization_config_path": null,
36
+ "profile": false,
37
+ "reset_logging": false,
38
+ "suppress_crashes": false,
39
+ "use_plasma_view": false,
40
+ "plasma_path": "/tmp/plasma",
41
+ "log_nvidia_smi": false,
42
+ "use_tutel_moe": false
43
+ },
44
+ "common_eval": {
45
+ "_name": null,
46
+ "path": null,
47
+ "post_process": null,
48
+ "quiet": false,
49
+ "model_overrides": "{}",
50
+ "results_path": null,
51
+ "is_moe": false,
52
+ "moe_generation": false
53
+ },
54
+ "distributed_training": {
55
+ "_name": null,
56
+ "distributed_world_size": 16,
57
+ "distributed_num_procs": 8,
58
+ "distributed_rank": 0,
59
+ "distributed_backend": "nccl",
60
+ "distributed_init_method": "tcp://learnfair0791:15129",
61
+ "distributed_port": 15129,
62
+ "device_id": 0,
63
+ "distributed_no_spawn": false,
64
+ "ddp_backend": "legacy_ddp",
65
+ "ddp_comm_hook": "none",
66
+ "bucket_cap_mb": 25,
67
+ "fix_batches_to_gpus": false,
68
+ "find_unused_parameters": true,
69
+ "gradient_as_bucket_view": false,
70
+ "fast_stat_sync": false,
71
+ "heartbeat_timeout": -1,
72
+ "broadcast_buffers": false,
73
+ "slowmo_momentum": null,
74
+ "slowmo_base_algorithm": "localsgd",
75
+ "localsgd_frequency": 3,
76
+ "nprocs_per_node": 8,
77
+ "pipeline_model_parallel": false,
78
+ "pipeline_balance": null,
79
+ "pipeline_devices": null,
80
+ "pipeline_chunks": 0,
81
+ "pipeline_encoder_balance": null,
82
+ "pipeline_encoder_devices": null,
83
+ "pipeline_decoder_balance": null,
84
+ "pipeline_decoder_devices": null,
85
+ "pipeline_checkpoint": "never",
86
+ "zero_sharding": "none",
87
+ "fp16": false,
88
+ "bf16": false,
89
+ "memory_efficient_fp16": false,
90
+ "tpu": false,
91
+ "no_reshard_after_forward": false,
92
+ "fp32_reduce_scatter": false,
93
+ "cpu_offload": false,
94
+ "use_sharded_state": false,
95
+ "not_fsdp_flatten_parameters": false,
96
+ "freeze_up_to_layer": null
97
+ },
98
+ "dataset": {
99
+ "_name": null,
100
+ "num_workers": 0,
101
+ "num_workers_valid": 0,
102
+ "skip_invalid_size_inputs_valid_test": true,
103
+ "max_tokens": 300000,
104
+ "batch_size": null,
105
+ "required_batch_size_multiple": 8,
106
+ "required_seq_len_multiple": 1,
107
+ "dataset_impl": null,
108
+ "data_buffer_size": 10,
109
+ "train_subset": "train_wenet_cmn_9_10,train_wenet_cmn_8_10,train_wenet_cmn_7_10,train_wenet_cmn_6_10,train_wenet_cmn_5_10,train_wenet_cmn_4_10,train_wenet_cmn_3_10,train_wenet_cmn_2_10,train_wenet_cmn_1_10,train_wenet_cmn_0_10,train_mls_en_9_10,train_mls_en_8_10,train_mls_en_7_10,train_mls_en_6_10,train_mls_en_5_10,train_mls_en_4_10,train_mls_en_3_10,train_mls_en_2_10,train_mls_en_1_10,train_mls_en_0_10,train_mls_deu,train_mls_fra,train_mls_spa,train_cv12_cmn,train_mls_ita,train_cv12_cmn_2,train_vl107_cmn",
110
+ "valid_subset": "dev_all",
111
+ "combine_valid_subsets": null,
112
+ "ignore_unused_valid_subsets": false,
113
+ "validate_interval": 1,
114
+ "validate_interval_updates": 5000,
115
+ "validate_after_updates": 0,
116
+ "fixed_validation_seed": null,
117
+ "disable_validation": false,
118
+ "max_tokens_valid": 300000,
119
+ "batch_size_valid": null,
120
+ "max_valid_steps": null,
121
+ "curriculum": 0,
122
+ "gen_subset": "test",
123
+ "num_shards": 1,
124
+ "shard_id": 0,
125
+ "grouped_shuffling": false,
126
+ "update_epoch_batch_itr": false,
127
+ "update_ordered_indices_seed": false
128
+ },
129
+ "optimization": {
130
+ "_name": null,
131
+ "max_epoch": 0,
132
+ "max_update": 500000,
133
+ "stop_time_hours": 0.0,
134
+ "clip_norm": 1.0,
135
+ "clip_norm_type": "l2",
136
+ "sentence_avg": false,
137
+ "update_freq": [
138
+ 4
139
+ ],
140
+ "lr": [
141
+ 0.0001
142
+ ],
143
+ "stop_min_lr": -1.0,
144
+ "use_bmuf": false,
145
+ "skip_remainder_batch": false
146
+ },
147
+ "checkpoint": {
148
+ "_name": null,
149
+ "save_dir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
150
+ "restore_file": "checkpoint_last.pt",
151
+ "continue_once": null,
152
+ "finetune_from_model": null,
153
+ "ignore_suffix": false,
154
+ "reset_dataloader": true,
155
+ "reset_lr_scheduler": false,
156
+ "reset_meters": false,
157
+ "reset_optimizer": false,
158
+ "optimizer_overrides": "{}",
159
+ "save_interval": 1,
160
+ "save_interval_updates": 10000,
161
+ "keep_interval_updates": 1,
162
+ "keep_interval_updates_pattern": -1,
163
+ "keep_last_epochs": -1,
164
+ "keep_best_checkpoints": 10,
165
+ "no_save": false,
166
+ "no_epoch_checkpoints": true,
167
+ "no_last_checkpoints": false,
168
+ "no_best_checkpoints": false,
169
+ "no_save_optimizer_state": false,
170
+ "no_save_optimizer_state_on_training_finished": false,
171
+ "synchronize_checkpoints_before_copy": false,
172
+ "symlink_best_and_last_checkpoints": false,
173
+ "best_checkpoint_metric": "mse_loss",
174
+ "maximize_best_checkpoint_metric": false,
175
+ "patience": 20,
176
+ "checkpoint_suffix": "",
177
+ "checkpoint_shard_count": 1,
178
+ "load_checkpoint_on_all_dp_ranks": false,
179
+ "write_checkpoints_asynchronously": false,
180
+ "s3_upload_path": null,
181
+ "replication_count": 1,
182
+ "model_parallel_size": 1
183
+ },
184
+ "bmuf": {
185
+ "_name": null,
186
+ "block_lr": 1.0,
187
+ "block_momentum": 0.875,
188
+ "global_sync_iter": 50,
189
+ "warmup_iterations": 500,
190
+ "use_nbm": false,
191
+ "average_sync": false,
192
+ "distributed_world_size": 16
193
+ },
194
+ "generation": {
195
+ "_name": null,
196
+ "beam": 5,
197
+ "beam_mt": 0,
198
+ "nbest": 1,
199
+ "max_len_a": 0.0,
200
+ "max_len_b": 200,
201
+ "max_len_a_mt": 0.0,
202
+ "max_len_b_mt": 200,
203
+ "min_len": 1,
204
+ "match_source_len": false,
205
+ "unnormalized": false,
206
+ "no_early_stop": false,
207
+ "no_beamable_mm": false,
208
+ "lenpen": 1.0,
209
+ "lenpen_mt": 1.0,
210
+ "unkpen": 0.0,
211
+ "blankpen": 0.0,
212
+ "replace_unk": null,
213
+ "sacrebleu": false,
214
+ "score_reference": false,
215
+ "prefix_size": 0,
216
+ "no_repeat_ngram_size": 0,
217
+ "sampling": false,
218
+ "sampling_topk": -1,
219
+ "sampling_topp": -1.0,
220
+ "constraints": null,
221
+ "temperature": 1.0,
222
+ "diverse_beam_groups": -1,
223
+ "diverse_beam_strength": 0.5,
224
+ "diversity_rate": -1.0,
225
+ "print_alignment": null,
226
+ "print_step": false,
227
+ "lm_path": null,
228
+ "lm_weight": 0.0,
229
+ "iter_decode_eos_penalty": 0.0,
230
+ "iter_decode_max_iter": 10,
231
+ "iter_decode_force_max_iter": false,
232
+ "iter_decode_with_beam": 1,
233
+ "iter_decode_with_external_reranker": false,
234
+ "retain_iter_history": false,
235
+ "retain_dropout": false,
236
+ "retain_dropout_modules": null,
237
+ "decoding_format": null,
238
+ "no_seed_provided": false,
239
+ "eos_token": null
240
+ },
241
+ "eval_lm": {
242
+ "_name": null,
243
+ "output_word_probs": false,
244
+ "output_word_stats": false,
245
+ "context_window": 0,
246
+ "softmax_batch": 9223372036854775807,
247
+ "stats_path": null,
248
+ "max_valid_steps": null
249
+ },
250
+ "interactive": {
251
+ "_name": null,
252
+ "buffer_size": 0,
253
+ "input": "-"
254
+ },
255
+ "model": {
256
+ "no_progress_bar": false,
257
+ "log_interval": 100,
258
+ "log_format": "simple",
259
+ "log_file": null,
260
+ "aim_repo": null,
261
+ "aim_run_hash": null,
262
+ "tensorboard_logdir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
263
+ "wandb_project": null,
264
+ "azureml_logging": false,
265
+ "seed": 1,
266
+ "cpu": false,
267
+ "tpu": false,
268
+ "bf16": false,
269
+ "fp16": false,
270
+ "memory_efficient_fp16": false,
271
+ "fp16_no_flatten_grads": false,
272
+ "fp16_init_scale": 128,
273
+ "fp16_scale_window": null,
274
+ "fp16_scale_tolerance": 0.0,
275
+ "on_cpu_convert_precision": false,
276
+ "min_loss_scale": 0.0001,
277
+ "threshold_loss_scale": null,
278
+ "amp": false,
279
+ "amp_batch_retries": 2,
280
+ "amp_init_scale": 128,
281
+ "amp_scale_window": null,
282
+ "user_dir": null,
283
+ "empty_cache_freq": 0,
284
+ "all_gather_list_size": 9999999,
285
+ "model_parallel_size": 1,
286
+ "quantization_config_path": null,
287
+ "profile": false,
288
+ "reset_logging": false,
289
+ "suppress_crashes": false,
290
+ "use_plasma_view": false,
291
+ "plasma_path": "/tmp/plasma",
292
+ "log_nvidia_smi": false,
293
+ "use_tutel_moe": false,
294
+ "tokenizer": null,
295
+ "bpe": null,
296
+ "optimizer": "adam",
297
+ "lr_scheduler": "fixed",
298
+ "simul_type": null,
299
+ "criterion": "nar_prosody2vec",
300
+ "scoring": "bleu",
301
+ "task": "prosody2vec",
302
+ "num_workers": 0,
303
+ "num_workers_valid": 0,
304
+ "skip_invalid_size_inputs_valid_test": true,
305
+ "max_tokens": 300000,
306
+ "batch_size": null,
307
+ "required_batch_size_multiple": 8,
308
+ "required_seq_len_multiple": 1,
309
+ "dataset_impl": null,
310
+ "data_buffer_size": 10,
311
+ "train_subset": "train_wenet_cmn_9_10,train_wenet_cmn_8_10,train_wenet_cmn_7_10,train_wenet_cmn_6_10,train_wenet_cmn_5_10,train_wenet_cmn_4_10,train_wenet_cmn_3_10,train_wenet_cmn_2_10,train_wenet_cmn_1_10,train_wenet_cmn_0_10,train_mls_en_9_10,train_mls_en_8_10,train_mls_en_7_10,train_mls_en_6_10,train_mls_en_5_10,train_mls_en_4_10,train_mls_en_3_10,train_mls_en_2_10,train_mls_en_1_10,train_mls_en_0_10,train_mls_deu,train_mls_fra,train_mls_spa,train_cv12_cmn,train_mls_ita,train_cv12_cmn_2,train_vl107_cmn",
312
+ "valid_subset": "dev_all",
313
+ "combine_valid_subsets": null,
314
+ "ignore_unused_valid_subsets": false,
315
+ "validate_interval": 1,
316
+ "validate_interval_updates": 5000,
317
+ "validate_after_updates": 0,
318
+ "fixed_validation_seed": null,
319
+ "disable_validation": false,
320
+ "max_tokens_valid": "300000",
321
+ "batch_size_valid": null,
322
+ "max_valid_steps": null,
323
+ "curriculum": 0,
324
+ "gen_subset": "test",
325
+ "num_shards": 1,
326
+ "shard_id": 0,
327
+ "grouped_shuffling": false,
328
+ "update_epoch_batch_itr": false,
329
+ "update_ordered_indices_seed": false,
330
+ "distributed_world_size": 16,
331
+ "distributed_num_procs": 8,
332
+ "distributed_rank": 0,
333
+ "distributed_backend": "nccl",
334
+ "distributed_init_method": null,
335
+ "distributed_port": 15129,
336
+ "device_id": 0,
337
+ "distributed_no_spawn": false,
338
+ "ddp_backend": "legacy_ddp",
339
+ "ddp_comm_hook": "none",
340
+ "bucket_cap_mb": 25,
341
+ "fix_batches_to_gpus": false,
342
+ "find_unused_parameters": true,
343
+ "gradient_as_bucket_view": false,
344
+ "fast_stat_sync": false,
345
+ "heartbeat_timeout": -1,
346
+ "broadcast_buffers": false,
347
+ "slowmo_momentum": null,
348
+ "slowmo_base_algorithm": "localsgd",
349
+ "localsgd_frequency": 3,
350
+ "nprocs_per_node": 8,
351
+ "pipeline_model_parallel": false,
352
+ "pipeline_balance": null,
353
+ "pipeline_devices": null,
354
+ "pipeline_chunks": 0,
355
+ "pipeline_encoder_balance": null,
356
+ "pipeline_encoder_devices": null,
357
+ "pipeline_decoder_balance": null,
358
+ "pipeline_decoder_devices": null,
359
+ "pipeline_checkpoint": "never",
360
+ "zero_sharding": "none",
361
+ "no_reshard_after_forward": false,
362
+ "fp32_reduce_scatter": false,
363
+ "cpu_offload": false,
364
+ "use_sharded_state": false,
365
+ "not_fsdp_flatten_parameters": false,
366
+ "freeze_up_to_layer": null,
367
+ "arch": "nar_p2v",
368
+ "max_epoch": 0,
369
+ "max_update": 500000,
370
+ "stop_time_hours": 0,
371
+ "clip_norm": 1.0,
372
+ "clip_norm_type": "l2",
373
+ "sentence_avg": false,
374
+ "update_freq": [
375
+ 4
376
+ ],
377
+ "lr": [
378
+ 0.0001
379
+ ],
380
+ "stop_min_lr": -1.0,
381
+ "use_bmuf": false,
382
+ "skip_remainder_batch": false,
383
+ "save_dir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
384
+ "restore_file": "checkpoint_last.pt",
385
+ "continue_once": null,
386
+ "finetune_from_model": null,
387
+ "ignore_suffix": false,
388
+ "reset_dataloader": true,
389
+ "reset_lr_scheduler": false,
390
+ "reset_meters": false,
391
+ "reset_optimizer": false,
392
+ "optimizer_overrides": "{}",
393
+ "save_interval": 1,
394
+ "save_interval_updates": 10000,
395
+ "keep_interval_updates": 1,
396
+ "keep_interval_updates_pattern": -1,
397
+ "keep_last_epochs": -1,
398
+ "keep_best_checkpoints": 10,
399
+ "no_save": false,
400
+ "no_epoch_checkpoints": true,
401
+ "no_last_checkpoints": false,
402
+ "no_best_checkpoints": false,
403
+ "no_save_optimizer_state": false,
404
+ "no_save_optimizer_state_on_training_finished": false,
405
+ "synchronize_checkpoints_before_copy": false,
406
+ "symlink_best_and_last_checkpoints": false,
407
+ "best_checkpoint_metric": "mse_loss",
408
+ "maximize_best_checkpoint_metric": false,
409
+ "patience": 20,
410
+ "checkpoint_suffix": "",
411
+ "checkpoint_shard_count": 1,
412
+ "load_checkpoint_on_all_dp_ranks": false,
413
+ "write_checkpoints_asynchronously": false,
414
+ "s3_upload_path": null,
415
+ "replication_count": 1,
416
+ "store_ema": false,
417
+ "ema_decay": 0.9999,
418
+ "ema_start_update": 0,
419
+ "ema_seed_model": null,
420
+ "ema_update_freq": 1,
421
+ "ema_fp32": false,
422
+ "load_prosody_encoder_from": null,
423
+ "freeze_prosody_encoder": false,
424
+ "unit_encoder_arch": "daft_exprt_encoder",
425
+ "prosody_encoder_arch": "ecapa_tdnn2",
426
+ "decoder_arch": "daft_exprt_decoder",
427
+ "data": "/large_experiments/seamless/ust/mjhwang/data/denoise_prosody2vec/mls_multilingual_6lang_xlsr_10k_noiseaug",
428
+ "config_yaml": "config_v2.yaml",
429
+ "max_source_positions": 300000,
430
+ "max_target_positions": 300000,
431
+ "n_frames_per_step": 1,
432
+ "eos_prob_threshold": 0.5,
433
+ "eval_inference": true,
434
+ "eval_tb_nsample": 8,
435
+ "eval_bleu": false,
436
+ "vocoder": "griffin_lim",
437
+ "spec_bwd_max_iter": 8,
438
+ "jit_data_offloading": true,
439
+ "jit_data_root": "/scratch/slurm_tmpdir/${SLURM_JOB_ID}",
440
+ "adam_betas": "(0.9, 0.98)",
441
+ "adam_eps": 1e-08,
442
+ "weight_decay": 0.0,
443
+ "use_old_adam": false,
444
+ "fp16_adam_stats": false,
445
+ "block_wise": false,
446
+ "force_anneal": null,
447
+ "lr_shrink": 0.1,
448
+ "warmup_updates": 1000,
449
+ "ctc_weight": 0.0,
450
+ "forward_sum_weight": 1.0,
451
+ "bin_loss_start_ratio": 0.1,
452
+ "bin_loss_warmup_steps": 6000,
453
+ "film_regul_weight": 0.001,
454
+ "pros_consist_weight": 0.0,
455
+ "denoise_target": true,
456
+ "snr_threshold": 2000000000000000.0,
457
+ "pad": 1,
458
+ "eos": 2,
459
+ "unk": 3,
460
+ "use_spkr_emb": 0,
461
+ "use_lang_emb": 1,
462
+ "prosody_embed_dim": 512,
463
+ "use_ucmvn": 0,
464
+ "use_spec_augment": 1,
465
+ "use_prosody_layernorm": 1,
466
+ "var_pred_hidden_dim": 512,
467
+ "var_pred_kernel_size": 5,
468
+ "var_pred_n_bins": -1,
469
+ "add_variance_parallel": 1,
470
+ "use_film_decoder": 1,
471
+ "predict_var_vuv": 1,
472
+ "predict_vuv_logit": 1,
473
+ "predict_frm_f0_vuv": 0,
474
+ "no_seed_provided": false,
475
+ "speaker_embed_dim": 192,
476
+ "use_utterance_speaker_embed": false,
477
+ "lang_embed_dim": 64,
478
+ "_name": "nar_p2v",
479
+ "lang_to_id": {
480
+ "cmn": 0,
481
+ "deu": 1,
482
+ "eng": 2,
483
+ "fra": 3,
484
+ "ita": 4,
485
+ "spa": 5
486
+ },
487
+ "pitch_min": 0.0,
488
+ "pitch_max": 6.858574643755327,
489
+ "energy_min": 0.0,
490
+ "energy_max": 6.360039234161377,
491
+ "speaker_emb_path": null,
492
+ "input_feat_per_channel": 80,
493
+ "input_channels": 1,
494
+ "speaker_to_id": null,
495
+ "dropout": 0.2,
496
+ "fft_hidden_dim": 1024,
497
+ "fft_kernel_size": 9,
498
+ "attention_dropout": 0.0,
499
+ "encoder_layers": 4,
500
+ "encoder_embed_dim": 256,
501
+ "encoder_attention_heads": 2,
502
+ "output_frame_dim": 80,
503
+ "prosody_channels": [
504
+ 512,
505
+ 512,
506
+ 512,
507
+ 512,
508
+ 1536
509
+ ],
510
+ "prosody_kernel_sizes": [
511
+ 5,
512
+ 3,
513
+ 3,
514
+ 3,
515
+ 1
516
+ ],
517
+ "prosody_dilations": [
518
+ 1,
519
+ 2,
520
+ 3,
521
+ 4,
522
+ 1
523
+ ],
524
+ "prosody_attention_channels": 128,
525
+ "prosody_res2net_scale": 8,
526
+ "prosody_se_channels": 128,
527
+ "prosody_global_context": true,
528
+ "prosody_groups": [
529
+ 1,
530
+ 1,
531
+ 1,
532
+ 1,
533
+ 1
534
+ ],
535
+ "decoder_layers": 4,
536
+ "decoder_embed_dim": 256,
537
+ "decoder_attention_heads": 2,
538
+ "var_pred_dropout": 0.5,
539
+ "add_postnet": true,
540
+ "postnet_dropout": 0.5,
541
+ "postnet_layers": 5,
542
+ "postnet_conv_dim": 512,
543
+ "postnet_conv_kernel_size": 5,
544
+ "upsampling": "gaussian"
545
+ },
546
+ "task": {
547
+ "no_progress_bar": false,
548
+ "log_interval": 100,
549
+ "log_format": "simple",
550
+ "log_file": null,
551
+ "aim_repo": null,
552
+ "aim_run_hash": null,
553
+ "tensorboard_logdir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
554
+ "wandb_project": null,
555
+ "azureml_logging": false,
556
+ "seed": 1,
557
+ "cpu": false,
558
+ "tpu": false,
559
+ "bf16": false,
560
+ "fp16": false,
561
+ "memory_efficient_fp16": false,
562
+ "fp16_no_flatten_grads": false,
563
+ "fp16_init_scale": 128,
564
+ "fp16_scale_window": null,
565
+ "fp16_scale_tolerance": 0.0,
566
+ "on_cpu_convert_precision": false,
567
+ "min_loss_scale": 0.0001,
568
+ "threshold_loss_scale": null,
569
+ "amp": false,
570
+ "amp_batch_retries": 2,
571
+ "amp_init_scale": 128,
572
+ "amp_scale_window": null,
573
+ "user_dir": null,
574
+ "empty_cache_freq": 0,
575
+ "all_gather_list_size": 9999999,
576
+ "model_parallel_size": 1,
577
+ "quantization_config_path": null,
578
+ "profile": false,
579
+ "reset_logging": false,
580
+ "suppress_crashes": false,
581
+ "use_plasma_view": false,
582
+ "plasma_path": "/tmp/plasma",
583
+ "log_nvidia_smi": false,
584
+ "use_tutel_moe": false,
585
+ "tokenizer": null,
586
+ "bpe": null,
587
+ "optimizer": "adam",
588
+ "lr_scheduler": "fixed",
589
+ "simul_type": null,
590
+ "criterion": "nar_prosody2vec",
591
+ "scoring": "bleu",
592
+ "task": "prosody2vec",
593
+ "num_workers": 0,
594
+ "num_workers_valid": 0,
595
+ "skip_invalid_size_inputs_valid_test": true,
596
+ "max_tokens": 300000,
597
+ "batch_size": null,
598
+ "required_batch_size_multiple": 8,
599
+ "required_seq_len_multiple": 1,
600
+ "dataset_impl": null,
601
+ "data_buffer_size": 10,
602
+ "train_subset": "train_wenet_cmn_9_10,train_wenet_cmn_8_10,train_wenet_cmn_7_10,train_wenet_cmn_6_10,train_wenet_cmn_5_10,train_wenet_cmn_4_10,train_wenet_cmn_3_10,train_wenet_cmn_2_10,train_wenet_cmn_1_10,train_wenet_cmn_0_10,train_mls_en_9_10,train_mls_en_8_10,train_mls_en_7_10,train_mls_en_6_10,train_mls_en_5_10,train_mls_en_4_10,train_mls_en_3_10,train_mls_en_2_10,train_mls_en_1_10,train_mls_en_0_10,train_mls_deu,train_mls_fra,train_mls_spa,train_cv12_cmn,train_mls_ita,train_cv12_cmn_2,train_vl107_cmn",
603
+ "valid_subset": "dev_all",
604
+ "combine_valid_subsets": null,
605
+ "ignore_unused_valid_subsets": false,
606
+ "validate_interval": 1,
607
+ "validate_interval_updates": 5000,
608
+ "validate_after_updates": 0,
609
+ "fixed_validation_seed": null,
610
+ "disable_validation": false,
611
+ "max_tokens_valid": "300000",
612
+ "batch_size_valid": null,
613
+ "max_valid_steps": null,
614
+ "curriculum": 0,
615
+ "gen_subset": "test",
616
+ "num_shards": 1,
617
+ "shard_id": 0,
618
+ "grouped_shuffling": false,
619
+ "update_epoch_batch_itr": false,
620
+ "update_ordered_indices_seed": false,
621
+ "distributed_world_size": 16,
622
+ "distributed_num_procs": 8,
623
+ "distributed_rank": 0,
624
+ "distributed_backend": "nccl",
625
+ "distributed_init_method": null,
626
+ "distributed_port": 15129,
627
+ "device_id": 0,
628
+ "distributed_no_spawn": false,
629
+ "ddp_backend": "legacy_ddp",
630
+ "ddp_comm_hook": "none",
631
+ "bucket_cap_mb": 25,
632
+ "fix_batches_to_gpus": false,
633
+ "find_unused_parameters": true,
634
+ "gradient_as_bucket_view": false,
635
+ "fast_stat_sync": false,
636
+ "heartbeat_timeout": -1,
637
+ "broadcast_buffers": false,
638
+ "slowmo_momentum": null,
639
+ "slowmo_base_algorithm": "localsgd",
640
+ "localsgd_frequency": 3,
641
+ "nprocs_per_node": 8,
642
+ "pipeline_model_parallel": false,
643
+ "pipeline_balance": null,
644
+ "pipeline_devices": null,
645
+ "pipeline_chunks": 0,
646
+ "pipeline_encoder_balance": null,
647
+ "pipeline_encoder_devices": null,
648
+ "pipeline_decoder_balance": null,
649
+ "pipeline_decoder_devices": null,
650
+ "pipeline_checkpoint": "never",
651
+ "zero_sharding": "none",
652
+ "no_reshard_after_forward": false,
653
+ "fp32_reduce_scatter": false,
654
+ "cpu_offload": false,
655
+ "use_sharded_state": false,
656
+ "not_fsdp_flatten_parameters": false,
657
+ "freeze_up_to_layer": null,
658
+ "arch": "nar_p2v",
659
+ "max_epoch": 0,
660
+ "max_update": 500000,
661
+ "stop_time_hours": 0,
662
+ "clip_norm": 1.0,
663
+ "clip_norm_type": "l2",
664
+ "sentence_avg": false,
665
+ "update_freq": [
666
+ 4
667
+ ],
668
+ "lr": [
669
+ 0.0001
670
+ ],
671
+ "stop_min_lr": -1.0,
672
+ "use_bmuf": false,
673
+ "skip_remainder_batch": false,
674
+ "save_dir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
675
+ "restore_file": "checkpoint_last.pt",
676
+ "continue_once": null,
677
+ "finetune_from_model": null,
678
+ "ignore_suffix": false,
679
+ "reset_dataloader": true,
680
+ "reset_lr_scheduler": false,
681
+ "reset_meters": false,
682
+ "reset_optimizer": false,
683
+ "optimizer_overrides": "{}",
684
+ "save_interval": 1,
685
+ "save_interval_updates": 10000,
686
+ "keep_interval_updates": 1,
687
+ "keep_interval_updates_pattern": -1,
688
+ "keep_last_epochs": -1,
689
+ "keep_best_checkpoints": 10,
690
+ "no_save": false,
691
+ "no_epoch_checkpoints": true,
692
+ "no_last_checkpoints": false,
693
+ "no_best_checkpoints": false,
694
+ "no_save_optimizer_state": false,
695
+ "no_save_optimizer_state_on_training_finished": false,
696
+ "synchronize_checkpoints_before_copy": false,
697
+ "symlink_best_and_last_checkpoints": false,
698
+ "best_checkpoint_metric": "mse_loss",
699
+ "maximize_best_checkpoint_metric": false,
700
+ "patience": 20,
701
+ "checkpoint_suffix": "",
702
+ "checkpoint_shard_count": 1,
703
+ "load_checkpoint_on_all_dp_ranks": false,
704
+ "write_checkpoints_asynchronously": false,
705
+ "s3_upload_path": null,
706
+ "replication_count": 1,
707
+ "store_ema": false,
708
+ "ema_decay": 0.9999,
709
+ "ema_start_update": 0,
710
+ "ema_seed_model": null,
711
+ "ema_update_freq": 1,
712
+ "ema_fp32": false,
713
+ "load_prosody_encoder_from": null,
714
+ "freeze_prosody_encoder": false,
715
+ "unit_encoder_arch": "daft_exprt_encoder",
716
+ "prosody_encoder_arch": "ecapa_tdnn2",
717
+ "decoder_arch": "daft_exprt_decoder",
718
+ "data": "/large_experiments/seamless/ust/mjhwang/data/denoise_prosody2vec/mls_multilingual_6lang_xlsr_10k_noiseaug",
719
+ "config_yaml": "config_v2.yaml",
720
+ "max_source_positions": 300000,
721
+ "max_target_positions": 300000,
722
+ "n_frames_per_step": 1,
723
+ "eos_prob_threshold": 0.5,
724
+ "eval_inference": true,
725
+ "eval_tb_nsample": 8,
726
+ "eval_bleu": false,
727
+ "vocoder": "griffin_lim",
728
+ "spec_bwd_max_iter": 8,
729
+ "jit_data_offloading": true,
730
+ "jit_data_root": "/scratch/slurm_tmpdir/${SLURM_JOB_ID}",
731
+ "adam_betas": "(0.9, 0.98)",
732
+ "adam_eps": 1e-08,
733
+ "weight_decay": 0.0,
734
+ "use_old_adam": false,
735
+ "fp16_adam_stats": false,
736
+ "block_wise": false,
737
+ "force_anneal": null,
738
+ "lr_shrink": 0.1,
739
+ "warmup_updates": 1000,
740
+ "ctc_weight": 0.0,
741
+ "forward_sum_weight": 1.0,
742
+ "bin_loss_start_ratio": 0.1,
743
+ "bin_loss_warmup_steps": 6000,
744
+ "film_regul_weight": 0.001,
745
+ "pros_consist_weight": 0.0,
746
+ "denoise_target": true,
747
+ "snr_threshold": 2000000000000000.0,
748
+ "pad": 1,
749
+ "eos": 2,
750
+ "unk": 3,
751
+ "use_spkr_emb": 0,
752
+ "use_lang_emb": 1,
753
+ "prosody_embed_dim": 512,
754
+ "use_ucmvn": 0,
755
+ "use_spec_augment": 1,
756
+ "use_prosody_layernorm": 1,
757
+ "var_pred_hidden_dim": 512,
758
+ "var_pred_kernel_size": 5,
759
+ "var_pred_n_bins": -1,
760
+ "add_variance_parallel": 1,
761
+ "use_film_decoder": 1,
762
+ "predict_var_vuv": 1,
763
+ "predict_vuv_logit": 1,
764
+ "predict_frm_f0_vuv": 0,
765
+ "no_seed_provided": false,
766
+ "speaker_embed_dim": 192,
767
+ "use_utterance_speaker_embed": false,
768
+ "lang_embed_dim": 64,
769
+ "_name": "prosody2vec"
770
+ },
771
+ "criterion": {
772
+ "_name": "nar_prosody2vec",
773
+ "ctc_weight": 0.0,
774
+ "forward_sum_weight": 1.0,
775
+ "bin_loss_start_ratio": 0.1,
776
+ "bin_loss_warmup_steps": 6000,
777
+ "film_regul_weight": 0.001,
778
+ "pros_consist_weight": 0.0,
779
+ "denoise_target": true,
780
+ "snr_threshold": 2000000000000000.0
781
+ },
782
+ "optimizer": {
783
+ "_name": "adam",
784
+ "adam_betas": "(0.9, 0.98)",
785
+ "adam_eps": 1e-08,
786
+ "weight_decay": 0.0,
787
+ "use_old_adam": false,
788
+ "fp16_adam_stats": false,
789
+ "tpu": false,
790
+ "lr": [
791
+ 0.0001
792
+ ],
793
+ "block_wise": false
794
+ },
795
+ "lr_scheduler": {
796
+ "_name": "fixed",
797
+ "force_anneal": null,
798
+ "lr_shrink": 0.1,
799
+ "warmup_updates": 1000,
800
+ "lr": [
801
+ 0.0001
802
+ ]
803
+ },
804
+ "scoring": {
805
+ "_name": "bleu",
806
+ "pad": 1,
807
+ "eos": 2,
808
+ "unk": 3
809
+ },
810
+ "bpe": null,
811
+ "tokenizer": null,
812
+ "ema": {
813
+ "_name": null,
814
+ "store_ema": false,
815
+ "ema_decay": 0.9999,
816
+ "ema_start_update": 0,
817
+ "ema_seed_model": null,
818
+ "ema_update_freq": 1,
819
+ "ema_fp32": false
820
+ },
821
+ "simul_type": null
822
+ }
pretrained_models/ckpts/prosody_encoder/pretssel_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8cc4824bf5506ce2a34b3d32d237451244ed97e89395ef7881edb1d3b72f1fa
3
+ size 267306349
pretrained_models/ckpts/prosody_encoder/prosody_UnitY2_keys.txt ADDED
@@ -0,0 +1,1737 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ speech_encoder_frontend.model_dim_proj.weight
2
+ speech_encoder_frontend.model_dim_proj.bias
3
+ speech_encoder.inner.layers.0.self_attn_layer_norm.weight
4
+ speech_encoder.inner.layers.0.self_attn_layer_norm.bias
5
+ speech_encoder.inner.layers.0.self_attn.k_proj.weight
6
+ speech_encoder.inner.layers.0.self_attn.k_proj.bias
7
+ speech_encoder.inner.layers.0.self_attn.v_proj.weight
8
+ speech_encoder.inner.layers.0.self_attn.v_proj.bias
9
+ speech_encoder.inner.layers.0.self_attn.q_proj.weight
10
+ speech_encoder.inner.layers.0.self_attn.q_proj.bias
11
+ speech_encoder.inner.layers.0.self_attn.output_proj.weight
12
+ speech_encoder.inner.layers.0.self_attn.output_proj.bias
13
+ speech_encoder.inner.layers.0.self_attn.sdpa.rel_k_embed.weight
14
+ speech_encoder.inner.layers.0.conv_layer_norm.weight
15
+ speech_encoder.inner.layers.0.conv_layer_norm.bias
16
+ speech_encoder.inner.layers.0.conv.pointwise_conv1.weight
17
+ speech_encoder.inner.layers.0.conv.depthwise_conv.weight
18
+ speech_encoder.inner.layers.0.conv.layer_norm.weight
19
+ speech_encoder.inner.layers.0.conv.layer_norm.bias
20
+ speech_encoder.inner.layers.0.conv.pointwise_conv2.weight
21
+ speech_encoder.inner.layers.0.ffn1_layer_norm.weight
22
+ speech_encoder.inner.layers.0.ffn1_layer_norm.bias
23
+ speech_encoder.inner.layers.0.ffn1.inner_proj.weight
24
+ speech_encoder.inner.layers.0.ffn1.inner_proj.bias
25
+ speech_encoder.inner.layers.0.ffn1.output_proj.weight
26
+ speech_encoder.inner.layers.0.ffn1.output_proj.bias
27
+ speech_encoder.inner.layers.0.ffn2_layer_norm.weight
28
+ speech_encoder.inner.layers.0.ffn2_layer_norm.bias
29
+ speech_encoder.inner.layers.0.ffn2.inner_proj.weight
30
+ speech_encoder.inner.layers.0.ffn2.inner_proj.bias
31
+ speech_encoder.inner.layers.0.ffn2.output_proj.weight
32
+ speech_encoder.inner.layers.0.ffn2.output_proj.bias
33
+ speech_encoder.inner.layers.0.layer_norm.weight
34
+ speech_encoder.inner.layers.0.layer_norm.bias
35
+ speech_encoder.inner.layers.1.self_attn_layer_norm.weight
36
+ speech_encoder.inner.layers.1.self_attn_layer_norm.bias
37
+ speech_encoder.inner.layers.1.self_attn.k_proj.weight
38
+ speech_encoder.inner.layers.1.self_attn.k_proj.bias
39
+ speech_encoder.inner.layers.1.self_attn.v_proj.weight
40
+ speech_encoder.inner.layers.1.self_attn.v_proj.bias
41
+ speech_encoder.inner.layers.1.self_attn.q_proj.weight
42
+ speech_encoder.inner.layers.1.self_attn.q_proj.bias
43
+ speech_encoder.inner.layers.1.self_attn.output_proj.weight
44
+ speech_encoder.inner.layers.1.self_attn.output_proj.bias
45
+ speech_encoder.inner.layers.1.self_attn.sdpa.rel_k_embed.weight
46
+ speech_encoder.inner.layers.1.conv_layer_norm.weight
47
+ speech_encoder.inner.layers.1.conv_layer_norm.bias
48
+ speech_encoder.inner.layers.1.conv.pointwise_conv1.weight
49
+ speech_encoder.inner.layers.1.conv.depthwise_conv.weight
50
+ speech_encoder.inner.layers.1.conv.layer_norm.weight
51
+ speech_encoder.inner.layers.1.conv.layer_norm.bias
52
+ speech_encoder.inner.layers.1.conv.pointwise_conv2.weight
53
+ speech_encoder.inner.layers.1.ffn1_layer_norm.weight
54
+ speech_encoder.inner.layers.1.ffn1_layer_norm.bias
55
+ speech_encoder.inner.layers.1.ffn1.inner_proj.weight
56
+ speech_encoder.inner.layers.1.ffn1.inner_proj.bias
57
+ speech_encoder.inner.layers.1.ffn1.output_proj.weight
58
+ speech_encoder.inner.layers.1.ffn1.output_proj.bias
59
+ speech_encoder.inner.layers.1.ffn2_layer_norm.weight
60
+ speech_encoder.inner.layers.1.ffn2_layer_norm.bias
61
+ speech_encoder.inner.layers.1.ffn2.inner_proj.weight
62
+ speech_encoder.inner.layers.1.ffn2.inner_proj.bias
63
+ speech_encoder.inner.layers.1.ffn2.output_proj.weight
64
+ speech_encoder.inner.layers.1.ffn2.output_proj.bias
65
+ speech_encoder.inner.layers.1.layer_norm.weight
66
+ speech_encoder.inner.layers.1.layer_norm.bias
67
+ speech_encoder.inner.layers.2.self_attn_layer_norm.weight
68
+ speech_encoder.inner.layers.2.self_attn_layer_norm.bias
69
+ speech_encoder.inner.layers.2.self_attn.k_proj.weight
70
+ speech_encoder.inner.layers.2.self_attn.k_proj.bias
71
+ speech_encoder.inner.layers.2.self_attn.v_proj.weight
72
+ speech_encoder.inner.layers.2.self_attn.v_proj.bias
73
+ speech_encoder.inner.layers.2.self_attn.q_proj.weight
74
+ speech_encoder.inner.layers.2.self_attn.q_proj.bias
75
+ speech_encoder.inner.layers.2.self_attn.output_proj.weight
76
+ speech_encoder.inner.layers.2.self_attn.output_proj.bias
77
+ speech_encoder.inner.layers.2.self_attn.sdpa.rel_k_embed.weight
78
+ speech_encoder.inner.layers.2.conv_layer_norm.weight
79
+ speech_encoder.inner.layers.2.conv_layer_norm.bias
80
+ speech_encoder.inner.layers.2.conv.pointwise_conv1.weight
81
+ speech_encoder.inner.layers.2.conv.depthwise_conv.weight
82
+ speech_encoder.inner.layers.2.conv.layer_norm.weight
83
+ speech_encoder.inner.layers.2.conv.layer_norm.bias
84
+ speech_encoder.inner.layers.2.conv.pointwise_conv2.weight
85
+ speech_encoder.inner.layers.2.ffn1_layer_norm.weight
86
+ speech_encoder.inner.layers.2.ffn1_layer_norm.bias
87
+ speech_encoder.inner.layers.2.ffn1.inner_proj.weight
88
+ speech_encoder.inner.layers.2.ffn1.inner_proj.bias
89
+ speech_encoder.inner.layers.2.ffn1.output_proj.weight
90
+ speech_encoder.inner.layers.2.ffn1.output_proj.bias
91
+ speech_encoder.inner.layers.2.ffn2_layer_norm.weight
92
+ speech_encoder.inner.layers.2.ffn2_layer_norm.bias
93
+ speech_encoder.inner.layers.2.ffn2.inner_proj.weight
94
+ speech_encoder.inner.layers.2.ffn2.inner_proj.bias
95
+ speech_encoder.inner.layers.2.ffn2.output_proj.weight
96
+ speech_encoder.inner.layers.2.ffn2.output_proj.bias
97
+ speech_encoder.inner.layers.2.layer_norm.weight
98
+ speech_encoder.inner.layers.2.layer_norm.bias
99
+ speech_encoder.inner.layers.3.self_attn_layer_norm.weight
100
+ speech_encoder.inner.layers.3.self_attn_layer_norm.bias
101
+ speech_encoder.inner.layers.3.self_attn.k_proj.weight
102
+ speech_encoder.inner.layers.3.self_attn.k_proj.bias
103
+ speech_encoder.inner.layers.3.self_attn.v_proj.weight
104
+ speech_encoder.inner.layers.3.self_attn.v_proj.bias
105
+ speech_encoder.inner.layers.3.self_attn.q_proj.weight
106
+ speech_encoder.inner.layers.3.self_attn.q_proj.bias
107
+ speech_encoder.inner.layers.3.self_attn.output_proj.weight
108
+ speech_encoder.inner.layers.3.self_attn.output_proj.bias
109
+ speech_encoder.inner.layers.3.self_attn.sdpa.rel_k_embed.weight
110
+ speech_encoder.inner.layers.3.conv_layer_norm.weight
111
+ speech_encoder.inner.layers.3.conv_layer_norm.bias
112
+ speech_encoder.inner.layers.3.conv.pointwise_conv1.weight
113
+ speech_encoder.inner.layers.3.conv.depthwise_conv.weight
114
+ speech_encoder.inner.layers.3.conv.layer_norm.weight
115
+ speech_encoder.inner.layers.3.conv.layer_norm.bias
116
+ speech_encoder.inner.layers.3.conv.pointwise_conv2.weight
117
+ speech_encoder.inner.layers.3.ffn1_layer_norm.weight
118
+ speech_encoder.inner.layers.3.ffn1_layer_norm.bias
119
+ speech_encoder.inner.layers.3.ffn1.inner_proj.weight
120
+ speech_encoder.inner.layers.3.ffn1.inner_proj.bias
121
+ speech_encoder.inner.layers.3.ffn1.output_proj.weight
122
+ speech_encoder.inner.layers.3.ffn1.output_proj.bias
123
+ speech_encoder.inner.layers.3.ffn2_layer_norm.weight
124
+ speech_encoder.inner.layers.3.ffn2_layer_norm.bias
125
+ speech_encoder.inner.layers.3.ffn2.inner_proj.weight
126
+ speech_encoder.inner.layers.3.ffn2.inner_proj.bias
127
+ speech_encoder.inner.layers.3.ffn2.output_proj.weight
128
+ speech_encoder.inner.layers.3.ffn2.output_proj.bias
129
+ speech_encoder.inner.layers.3.layer_norm.weight
130
+ speech_encoder.inner.layers.3.layer_norm.bias
131
+ speech_encoder.inner.layers.4.self_attn_layer_norm.weight
132
+ speech_encoder.inner.layers.4.self_attn_layer_norm.bias
133
+ speech_encoder.inner.layers.4.self_attn.k_proj.weight
134
+ speech_encoder.inner.layers.4.self_attn.k_proj.bias
135
+ speech_encoder.inner.layers.4.self_attn.v_proj.weight
136
+ speech_encoder.inner.layers.4.self_attn.v_proj.bias
137
+ speech_encoder.inner.layers.4.self_attn.q_proj.weight
138
+ speech_encoder.inner.layers.4.self_attn.q_proj.bias
139
+ speech_encoder.inner.layers.4.self_attn.output_proj.weight
140
+ speech_encoder.inner.layers.4.self_attn.output_proj.bias
141
+ speech_encoder.inner.layers.4.self_attn.sdpa.rel_k_embed.weight
142
+ speech_encoder.inner.layers.4.conv_layer_norm.weight
143
+ speech_encoder.inner.layers.4.conv_layer_norm.bias
144
+ speech_encoder.inner.layers.4.conv.pointwise_conv1.weight
145
+ speech_encoder.inner.layers.4.conv.depthwise_conv.weight
146
+ speech_encoder.inner.layers.4.conv.layer_norm.weight
147
+ speech_encoder.inner.layers.4.conv.layer_norm.bias
148
+ speech_encoder.inner.layers.4.conv.pointwise_conv2.weight
149
+ speech_encoder.inner.layers.4.ffn1_layer_norm.weight
150
+ speech_encoder.inner.layers.4.ffn1_layer_norm.bias
151
+ speech_encoder.inner.layers.4.ffn1.inner_proj.weight
152
+ speech_encoder.inner.layers.4.ffn1.inner_proj.bias
153
+ speech_encoder.inner.layers.4.ffn1.output_proj.weight
154
+ speech_encoder.inner.layers.4.ffn1.output_proj.bias
155
+ speech_encoder.inner.layers.4.ffn2_layer_norm.weight
156
+ speech_encoder.inner.layers.4.ffn2_layer_norm.bias
157
+ speech_encoder.inner.layers.4.ffn2.inner_proj.weight
158
+ speech_encoder.inner.layers.4.ffn2.inner_proj.bias
159
+ speech_encoder.inner.layers.4.ffn2.output_proj.weight
160
+ speech_encoder.inner.layers.4.ffn2.output_proj.bias
161
+ speech_encoder.inner.layers.4.layer_norm.weight
162
+ speech_encoder.inner.layers.4.layer_norm.bias
163
+ speech_encoder.inner.layers.5.self_attn_layer_norm.weight
164
+ speech_encoder.inner.layers.5.self_attn_layer_norm.bias
165
+ speech_encoder.inner.layers.5.self_attn.k_proj.weight
166
+ speech_encoder.inner.layers.5.self_attn.k_proj.bias
167
+ speech_encoder.inner.layers.5.self_attn.v_proj.weight
168
+ speech_encoder.inner.layers.5.self_attn.v_proj.bias
169
+ speech_encoder.inner.layers.5.self_attn.q_proj.weight
170
+ speech_encoder.inner.layers.5.self_attn.q_proj.bias
171
+ speech_encoder.inner.layers.5.self_attn.output_proj.weight
172
+ speech_encoder.inner.layers.5.self_attn.output_proj.bias
173
+ speech_encoder.inner.layers.5.self_attn.sdpa.rel_k_embed.weight
174
+ speech_encoder.inner.layers.5.conv_layer_norm.weight
175
+ speech_encoder.inner.layers.5.conv_layer_norm.bias
176
+ speech_encoder.inner.layers.5.conv.pointwise_conv1.weight
177
+ speech_encoder.inner.layers.5.conv.depthwise_conv.weight
178
+ speech_encoder.inner.layers.5.conv.layer_norm.weight
179
+ speech_encoder.inner.layers.5.conv.layer_norm.bias
180
+ speech_encoder.inner.layers.5.conv.pointwise_conv2.weight
181
+ speech_encoder.inner.layers.5.ffn1_layer_norm.weight
182
+ speech_encoder.inner.layers.5.ffn1_layer_norm.bias
183
+ speech_encoder.inner.layers.5.ffn1.inner_proj.weight
184
+ speech_encoder.inner.layers.5.ffn1.inner_proj.bias
185
+ speech_encoder.inner.layers.5.ffn1.output_proj.weight
186
+ speech_encoder.inner.layers.5.ffn1.output_proj.bias
187
+ speech_encoder.inner.layers.5.ffn2_layer_norm.weight
188
+ speech_encoder.inner.layers.5.ffn2_layer_norm.bias
189
+ speech_encoder.inner.layers.5.ffn2.inner_proj.weight
190
+ speech_encoder.inner.layers.5.ffn2.inner_proj.bias
191
+ speech_encoder.inner.layers.5.ffn2.output_proj.weight
192
+ speech_encoder.inner.layers.5.ffn2.output_proj.bias
193
+ speech_encoder.inner.layers.5.layer_norm.weight
194
+ speech_encoder.inner.layers.5.layer_norm.bias
195
+ speech_encoder.inner.layers.6.self_attn_layer_norm.weight
196
+ speech_encoder.inner.layers.6.self_attn_layer_norm.bias
197
+ speech_encoder.inner.layers.6.self_attn.k_proj.weight
198
+ speech_encoder.inner.layers.6.self_attn.k_proj.bias
199
+ speech_encoder.inner.layers.6.self_attn.v_proj.weight
200
+ speech_encoder.inner.layers.6.self_attn.v_proj.bias
201
+ speech_encoder.inner.layers.6.self_attn.q_proj.weight
202
+ speech_encoder.inner.layers.6.self_attn.q_proj.bias
203
+ speech_encoder.inner.layers.6.self_attn.output_proj.weight
204
+ speech_encoder.inner.layers.6.self_attn.output_proj.bias
205
+ speech_encoder.inner.layers.6.self_attn.sdpa.rel_k_embed.weight
206
+ speech_encoder.inner.layers.6.conv_layer_norm.weight
207
+ speech_encoder.inner.layers.6.conv_layer_norm.bias
208
+ speech_encoder.inner.layers.6.conv.pointwise_conv1.weight
209
+ speech_encoder.inner.layers.6.conv.depthwise_conv.weight
210
+ speech_encoder.inner.layers.6.conv.layer_norm.weight
211
+ speech_encoder.inner.layers.6.conv.layer_norm.bias
212
+ speech_encoder.inner.layers.6.conv.pointwise_conv2.weight
213
+ speech_encoder.inner.layers.6.ffn1_layer_norm.weight
214
+ speech_encoder.inner.layers.6.ffn1_layer_norm.bias
215
+ speech_encoder.inner.layers.6.ffn1.inner_proj.weight
216
+ speech_encoder.inner.layers.6.ffn1.inner_proj.bias
217
+ speech_encoder.inner.layers.6.ffn1.output_proj.weight
218
+ speech_encoder.inner.layers.6.ffn1.output_proj.bias
219
+ speech_encoder.inner.layers.6.ffn2_layer_norm.weight
220
+ speech_encoder.inner.layers.6.ffn2_layer_norm.bias
221
+ speech_encoder.inner.layers.6.ffn2.inner_proj.weight
222
+ speech_encoder.inner.layers.6.ffn2.inner_proj.bias
223
+ speech_encoder.inner.layers.6.ffn2.output_proj.weight
224
+ speech_encoder.inner.layers.6.ffn2.output_proj.bias
225
+ speech_encoder.inner.layers.6.layer_norm.weight
226
+ speech_encoder.inner.layers.6.layer_norm.bias
227
+ speech_encoder.inner.layers.7.self_attn_layer_norm.weight
228
+ speech_encoder.inner.layers.7.self_attn_layer_norm.bias
229
+ speech_encoder.inner.layers.7.self_attn.k_proj.weight
230
+ speech_encoder.inner.layers.7.self_attn.k_proj.bias
231
+ speech_encoder.inner.layers.7.self_attn.v_proj.weight
232
+ speech_encoder.inner.layers.7.self_attn.v_proj.bias
233
+ speech_encoder.inner.layers.7.self_attn.q_proj.weight
234
+ speech_encoder.inner.layers.7.self_attn.q_proj.bias
235
+ speech_encoder.inner.layers.7.self_attn.output_proj.weight
236
+ speech_encoder.inner.layers.7.self_attn.output_proj.bias
237
+ speech_encoder.inner.layers.7.self_attn.sdpa.rel_k_embed.weight
238
+ speech_encoder.inner.layers.7.conv_layer_norm.weight
239
+ speech_encoder.inner.layers.7.conv_layer_norm.bias
240
+ speech_encoder.inner.layers.7.conv.pointwise_conv1.weight
241
+ speech_encoder.inner.layers.7.conv.depthwise_conv.weight
242
+ speech_encoder.inner.layers.7.conv.layer_norm.weight
243
+ speech_encoder.inner.layers.7.conv.layer_norm.bias
244
+ speech_encoder.inner.layers.7.conv.pointwise_conv2.weight
245
+ speech_encoder.inner.layers.7.ffn1_layer_norm.weight
246
+ speech_encoder.inner.layers.7.ffn1_layer_norm.bias
247
+ speech_encoder.inner.layers.7.ffn1.inner_proj.weight
248
+ speech_encoder.inner.layers.7.ffn1.inner_proj.bias
249
+ speech_encoder.inner.layers.7.ffn1.output_proj.weight
250
+ speech_encoder.inner.layers.7.ffn1.output_proj.bias
251
+ speech_encoder.inner.layers.7.ffn2_layer_norm.weight
252
+ speech_encoder.inner.layers.7.ffn2_layer_norm.bias
253
+ speech_encoder.inner.layers.7.ffn2.inner_proj.weight
254
+ speech_encoder.inner.layers.7.ffn2.inner_proj.bias
255
+ speech_encoder.inner.layers.7.ffn2.output_proj.weight
256
+ speech_encoder.inner.layers.7.ffn2.output_proj.bias
257
+ speech_encoder.inner.layers.7.layer_norm.weight
258
+ speech_encoder.inner.layers.7.layer_norm.bias
259
+ speech_encoder.inner.layers.8.self_attn_layer_norm.weight
260
+ speech_encoder.inner.layers.8.self_attn_layer_norm.bias
261
+ speech_encoder.inner.layers.8.self_attn.k_proj.weight
262
+ speech_encoder.inner.layers.8.self_attn.k_proj.bias
263
+ speech_encoder.inner.layers.8.self_attn.v_proj.weight
264
+ speech_encoder.inner.layers.8.self_attn.v_proj.bias
265
+ speech_encoder.inner.layers.8.self_attn.q_proj.weight
266
+ speech_encoder.inner.layers.8.self_attn.q_proj.bias
267
+ speech_encoder.inner.layers.8.self_attn.output_proj.weight
268
+ speech_encoder.inner.layers.8.self_attn.output_proj.bias
269
+ speech_encoder.inner.layers.8.self_attn.sdpa.rel_k_embed.weight
270
+ speech_encoder.inner.layers.8.conv_layer_norm.weight
271
+ speech_encoder.inner.layers.8.conv_layer_norm.bias
272
+ speech_encoder.inner.layers.8.conv.pointwise_conv1.weight
273
+ speech_encoder.inner.layers.8.conv.depthwise_conv.weight
274
+ speech_encoder.inner.layers.8.conv.layer_norm.weight
275
+ speech_encoder.inner.layers.8.conv.layer_norm.bias
276
+ speech_encoder.inner.layers.8.conv.pointwise_conv2.weight
277
+ speech_encoder.inner.layers.8.ffn1_layer_norm.weight
278
+ speech_encoder.inner.layers.8.ffn1_layer_norm.bias
279
+ speech_encoder.inner.layers.8.ffn1.inner_proj.weight
280
+ speech_encoder.inner.layers.8.ffn1.inner_proj.bias
281
+ speech_encoder.inner.layers.8.ffn1.output_proj.weight
282
+ speech_encoder.inner.layers.8.ffn1.output_proj.bias
283
+ speech_encoder.inner.layers.8.ffn2_layer_norm.weight
284
+ speech_encoder.inner.layers.8.ffn2_layer_norm.bias
285
+ speech_encoder.inner.layers.8.ffn2.inner_proj.weight
286
+ speech_encoder.inner.layers.8.ffn2.inner_proj.bias
287
+ speech_encoder.inner.layers.8.ffn2.output_proj.weight
288
+ speech_encoder.inner.layers.8.ffn2.output_proj.bias
289
+ speech_encoder.inner.layers.8.layer_norm.weight
290
+ speech_encoder.inner.layers.8.layer_norm.bias
291
+ speech_encoder.inner.layers.9.self_attn_layer_norm.weight
292
+ speech_encoder.inner.layers.9.self_attn_layer_norm.bias
293
+ speech_encoder.inner.layers.9.self_attn.k_proj.weight
294
+ speech_encoder.inner.layers.9.self_attn.k_proj.bias
295
+ speech_encoder.inner.layers.9.self_attn.v_proj.weight
296
+ speech_encoder.inner.layers.9.self_attn.v_proj.bias
297
+ speech_encoder.inner.layers.9.self_attn.q_proj.weight
298
+ speech_encoder.inner.layers.9.self_attn.q_proj.bias
299
+ speech_encoder.inner.layers.9.self_attn.output_proj.weight
300
+ speech_encoder.inner.layers.9.self_attn.output_proj.bias
301
+ speech_encoder.inner.layers.9.self_attn.sdpa.rel_k_embed.weight
302
+ speech_encoder.inner.layers.9.conv_layer_norm.weight
303
+ speech_encoder.inner.layers.9.conv_layer_norm.bias
304
+ speech_encoder.inner.layers.9.conv.pointwise_conv1.weight
305
+ speech_encoder.inner.layers.9.conv.depthwise_conv.weight
306
+ speech_encoder.inner.layers.9.conv.layer_norm.weight
307
+ speech_encoder.inner.layers.9.conv.layer_norm.bias
308
+ speech_encoder.inner.layers.9.conv.pointwise_conv2.weight
309
+ speech_encoder.inner.layers.9.ffn1_layer_norm.weight
310
+ speech_encoder.inner.layers.9.ffn1_layer_norm.bias
311
+ speech_encoder.inner.layers.9.ffn1.inner_proj.weight
312
+ speech_encoder.inner.layers.9.ffn1.inner_proj.bias
313
+ speech_encoder.inner.layers.9.ffn1.output_proj.weight
314
+ speech_encoder.inner.layers.9.ffn1.output_proj.bias
315
+ speech_encoder.inner.layers.9.ffn2_layer_norm.weight
316
+ speech_encoder.inner.layers.9.ffn2_layer_norm.bias
317
+ speech_encoder.inner.layers.9.ffn2.inner_proj.weight
318
+ speech_encoder.inner.layers.9.ffn2.inner_proj.bias
319
+ speech_encoder.inner.layers.9.ffn2.output_proj.weight
320
+ speech_encoder.inner.layers.9.ffn2.output_proj.bias
321
+ speech_encoder.inner.layers.9.layer_norm.weight
322
+ speech_encoder.inner.layers.9.layer_norm.bias
323
+ speech_encoder.inner.layers.10.self_attn_layer_norm.weight
324
+ speech_encoder.inner.layers.10.self_attn_layer_norm.bias
325
+ speech_encoder.inner.layers.10.self_attn.k_proj.weight
326
+ speech_encoder.inner.layers.10.self_attn.k_proj.bias
327
+ speech_encoder.inner.layers.10.self_attn.v_proj.weight
328
+ speech_encoder.inner.layers.10.self_attn.v_proj.bias
329
+ speech_encoder.inner.layers.10.self_attn.q_proj.weight
330
+ speech_encoder.inner.layers.10.self_attn.q_proj.bias
331
+ speech_encoder.inner.layers.10.self_attn.output_proj.weight
332
+ speech_encoder.inner.layers.10.self_attn.output_proj.bias
333
+ speech_encoder.inner.layers.10.self_attn.sdpa.rel_k_embed.weight
334
+ speech_encoder.inner.layers.10.conv_layer_norm.weight
335
+ speech_encoder.inner.layers.10.conv_layer_norm.bias
336
+ speech_encoder.inner.layers.10.conv.pointwise_conv1.weight
337
+ speech_encoder.inner.layers.10.conv.depthwise_conv.weight
338
+ speech_encoder.inner.layers.10.conv.layer_norm.weight
339
+ speech_encoder.inner.layers.10.conv.layer_norm.bias
340
+ speech_encoder.inner.layers.10.conv.pointwise_conv2.weight
341
+ speech_encoder.inner.layers.10.ffn1_layer_norm.weight
342
+ speech_encoder.inner.layers.10.ffn1_layer_norm.bias
343
+ speech_encoder.inner.layers.10.ffn1.inner_proj.weight
344
+ speech_encoder.inner.layers.10.ffn1.inner_proj.bias
345
+ speech_encoder.inner.layers.10.ffn1.output_proj.weight
346
+ speech_encoder.inner.layers.10.ffn1.output_proj.bias
347
+ speech_encoder.inner.layers.10.ffn2_layer_norm.weight
348
+ speech_encoder.inner.layers.10.ffn2_layer_norm.bias
349
+ speech_encoder.inner.layers.10.ffn2.inner_proj.weight
350
+ speech_encoder.inner.layers.10.ffn2.inner_proj.bias
351
+ speech_encoder.inner.layers.10.ffn2.output_proj.weight
352
+ speech_encoder.inner.layers.10.ffn2.output_proj.bias
353
+ speech_encoder.inner.layers.10.layer_norm.weight
354
+ speech_encoder.inner.layers.10.layer_norm.bias
355
+ speech_encoder.inner.layers.11.self_attn_layer_norm.weight
356
+ speech_encoder.inner.layers.11.self_attn_layer_norm.bias
357
+ speech_encoder.inner.layers.11.self_attn.k_proj.weight
358
+ speech_encoder.inner.layers.11.self_attn.k_proj.bias
359
+ speech_encoder.inner.layers.11.self_attn.v_proj.weight
360
+ speech_encoder.inner.layers.11.self_attn.v_proj.bias
361
+ speech_encoder.inner.layers.11.self_attn.q_proj.weight
362
+ speech_encoder.inner.layers.11.self_attn.q_proj.bias
363
+ speech_encoder.inner.layers.11.self_attn.output_proj.weight
364
+ speech_encoder.inner.layers.11.self_attn.output_proj.bias
365
+ speech_encoder.inner.layers.11.self_attn.sdpa.rel_k_embed.weight
366
+ speech_encoder.inner.layers.11.conv_layer_norm.weight
367
+ speech_encoder.inner.layers.11.conv_layer_norm.bias
368
+ speech_encoder.inner.layers.11.conv.pointwise_conv1.weight
369
+ speech_encoder.inner.layers.11.conv.depthwise_conv.weight
370
+ speech_encoder.inner.layers.11.conv.layer_norm.weight
371
+ speech_encoder.inner.layers.11.conv.layer_norm.bias
372
+ speech_encoder.inner.layers.11.conv.pointwise_conv2.weight
373
+ speech_encoder.inner.layers.11.ffn1_layer_norm.weight
374
+ speech_encoder.inner.layers.11.ffn1_layer_norm.bias
375
+ speech_encoder.inner.layers.11.ffn1.inner_proj.weight
376
+ speech_encoder.inner.layers.11.ffn1.inner_proj.bias
377
+ speech_encoder.inner.layers.11.ffn1.output_proj.weight
378
+ speech_encoder.inner.layers.11.ffn1.output_proj.bias
379
+ speech_encoder.inner.layers.11.ffn2_layer_norm.weight
380
+ speech_encoder.inner.layers.11.ffn2_layer_norm.bias
381
+ speech_encoder.inner.layers.11.ffn2.inner_proj.weight
382
+ speech_encoder.inner.layers.11.ffn2.inner_proj.bias
383
+ speech_encoder.inner.layers.11.ffn2.output_proj.weight
384
+ speech_encoder.inner.layers.11.ffn2.output_proj.bias
385
+ speech_encoder.inner.layers.11.layer_norm.weight
386
+ speech_encoder.inner.layers.11.layer_norm.bias
387
+ speech_encoder.inner.layers.12.self_attn_layer_norm.weight
388
+ speech_encoder.inner.layers.12.self_attn_layer_norm.bias
389
+ speech_encoder.inner.layers.12.self_attn.k_proj.weight
390
+ speech_encoder.inner.layers.12.self_attn.k_proj.bias
391
+ speech_encoder.inner.layers.12.self_attn.v_proj.weight
392
+ speech_encoder.inner.layers.12.self_attn.v_proj.bias
393
+ speech_encoder.inner.layers.12.self_attn.q_proj.weight
394
+ speech_encoder.inner.layers.12.self_attn.q_proj.bias
395
+ speech_encoder.inner.layers.12.self_attn.output_proj.weight
396
+ speech_encoder.inner.layers.12.self_attn.output_proj.bias
397
+ speech_encoder.inner.layers.12.self_attn.sdpa.rel_k_embed.weight
398
+ speech_encoder.inner.layers.12.conv_layer_norm.weight
399
+ speech_encoder.inner.layers.12.conv_layer_norm.bias
400
+ speech_encoder.inner.layers.12.conv.pointwise_conv1.weight
401
+ speech_encoder.inner.layers.12.conv.depthwise_conv.weight
402
+ speech_encoder.inner.layers.12.conv.layer_norm.weight
403
+ speech_encoder.inner.layers.12.conv.layer_norm.bias
404
+ speech_encoder.inner.layers.12.conv.pointwise_conv2.weight
405
+ speech_encoder.inner.layers.12.ffn1_layer_norm.weight
406
+ speech_encoder.inner.layers.12.ffn1_layer_norm.bias
407
+ speech_encoder.inner.layers.12.ffn1.inner_proj.weight
408
+ speech_encoder.inner.layers.12.ffn1.inner_proj.bias
409
+ speech_encoder.inner.layers.12.ffn1.output_proj.weight
410
+ speech_encoder.inner.layers.12.ffn1.output_proj.bias
411
+ speech_encoder.inner.layers.12.ffn2_layer_norm.weight
412
+ speech_encoder.inner.layers.12.ffn2_layer_norm.bias
413
+ speech_encoder.inner.layers.12.ffn2.inner_proj.weight
414
+ speech_encoder.inner.layers.12.ffn2.inner_proj.bias
415
+ speech_encoder.inner.layers.12.ffn2.output_proj.weight
416
+ speech_encoder.inner.layers.12.ffn2.output_proj.bias
417
+ speech_encoder.inner.layers.12.layer_norm.weight
418
+ speech_encoder.inner.layers.12.layer_norm.bias
419
+ speech_encoder.inner.layers.13.self_attn_layer_norm.weight
420
+ speech_encoder.inner.layers.13.self_attn_layer_norm.bias
421
+ speech_encoder.inner.layers.13.self_attn.k_proj.weight
422
+ speech_encoder.inner.layers.13.self_attn.k_proj.bias
423
+ speech_encoder.inner.layers.13.self_attn.v_proj.weight
424
+ speech_encoder.inner.layers.13.self_attn.v_proj.bias
425
+ speech_encoder.inner.layers.13.self_attn.q_proj.weight
426
+ speech_encoder.inner.layers.13.self_attn.q_proj.bias
427
+ speech_encoder.inner.layers.13.self_attn.output_proj.weight
428
+ speech_encoder.inner.layers.13.self_attn.output_proj.bias
429
+ speech_encoder.inner.layers.13.self_attn.sdpa.rel_k_embed.weight
430
+ speech_encoder.inner.layers.13.conv_layer_norm.weight
431
+ speech_encoder.inner.layers.13.conv_layer_norm.bias
432
+ speech_encoder.inner.layers.13.conv.pointwise_conv1.weight
433
+ speech_encoder.inner.layers.13.conv.depthwise_conv.weight
434
+ speech_encoder.inner.layers.13.conv.layer_norm.weight
435
+ speech_encoder.inner.layers.13.conv.layer_norm.bias
436
+ speech_encoder.inner.layers.13.conv.pointwise_conv2.weight
437
+ speech_encoder.inner.layers.13.ffn1_layer_norm.weight
438
+ speech_encoder.inner.layers.13.ffn1_layer_norm.bias
439
+ speech_encoder.inner.layers.13.ffn1.inner_proj.weight
440
+ speech_encoder.inner.layers.13.ffn1.inner_proj.bias
441
+ speech_encoder.inner.layers.13.ffn1.output_proj.weight
442
+ speech_encoder.inner.layers.13.ffn1.output_proj.bias
443
+ speech_encoder.inner.layers.13.ffn2_layer_norm.weight
444
+ speech_encoder.inner.layers.13.ffn2_layer_norm.bias
445
+ speech_encoder.inner.layers.13.ffn2.inner_proj.weight
446
+ speech_encoder.inner.layers.13.ffn2.inner_proj.bias
447
+ speech_encoder.inner.layers.13.ffn2.output_proj.weight
448
+ speech_encoder.inner.layers.13.ffn2.output_proj.bias
449
+ speech_encoder.inner.layers.13.layer_norm.weight
450
+ speech_encoder.inner.layers.13.layer_norm.bias
451
+ speech_encoder.inner.layers.14.self_attn_layer_norm.weight
452
+ speech_encoder.inner.layers.14.self_attn_layer_norm.bias
453
+ speech_encoder.inner.layers.14.self_attn.k_proj.weight
454
+ speech_encoder.inner.layers.14.self_attn.k_proj.bias
455
+ speech_encoder.inner.layers.14.self_attn.v_proj.weight
456
+ speech_encoder.inner.layers.14.self_attn.v_proj.bias
457
+ speech_encoder.inner.layers.14.self_attn.q_proj.weight
458
+ speech_encoder.inner.layers.14.self_attn.q_proj.bias
459
+ speech_encoder.inner.layers.14.self_attn.output_proj.weight
460
+ speech_encoder.inner.layers.14.self_attn.output_proj.bias
461
+ speech_encoder.inner.layers.14.self_attn.sdpa.rel_k_embed.weight
462
+ speech_encoder.inner.layers.14.conv_layer_norm.weight
463
+ speech_encoder.inner.layers.14.conv_layer_norm.bias
464
+ speech_encoder.inner.layers.14.conv.pointwise_conv1.weight
465
+ speech_encoder.inner.layers.14.conv.depthwise_conv.weight
466
+ speech_encoder.inner.layers.14.conv.layer_norm.weight
467
+ speech_encoder.inner.layers.14.conv.layer_norm.bias
468
+ speech_encoder.inner.layers.14.conv.pointwise_conv2.weight
469
+ speech_encoder.inner.layers.14.ffn1_layer_norm.weight
470
+ speech_encoder.inner.layers.14.ffn1_layer_norm.bias
471
+ speech_encoder.inner.layers.14.ffn1.inner_proj.weight
472
+ speech_encoder.inner.layers.14.ffn1.inner_proj.bias
473
+ speech_encoder.inner.layers.14.ffn1.output_proj.weight
474
+ speech_encoder.inner.layers.14.ffn1.output_proj.bias
475
+ speech_encoder.inner.layers.14.ffn2_layer_norm.weight
476
+ speech_encoder.inner.layers.14.ffn2_layer_norm.bias
477
+ speech_encoder.inner.layers.14.ffn2.inner_proj.weight
478
+ speech_encoder.inner.layers.14.ffn2.inner_proj.bias
479
+ speech_encoder.inner.layers.14.ffn2.output_proj.weight
480
+ speech_encoder.inner.layers.14.ffn2.output_proj.bias
481
+ speech_encoder.inner.layers.14.layer_norm.weight
482
+ speech_encoder.inner.layers.14.layer_norm.bias
483
+ speech_encoder.inner.layers.15.self_attn_layer_norm.weight
484
+ speech_encoder.inner.layers.15.self_attn_layer_norm.bias
485
+ speech_encoder.inner.layers.15.self_attn.k_proj.weight
486
+ speech_encoder.inner.layers.15.self_attn.k_proj.bias
487
+ speech_encoder.inner.layers.15.self_attn.v_proj.weight
488
+ speech_encoder.inner.layers.15.self_attn.v_proj.bias
489
+ speech_encoder.inner.layers.15.self_attn.q_proj.weight
490
+ speech_encoder.inner.layers.15.self_attn.q_proj.bias
491
+ speech_encoder.inner.layers.15.self_attn.output_proj.weight
492
+ speech_encoder.inner.layers.15.self_attn.output_proj.bias
493
+ speech_encoder.inner.layers.15.self_attn.sdpa.rel_k_embed.weight
494
+ speech_encoder.inner.layers.15.conv_layer_norm.weight
495
+ speech_encoder.inner.layers.15.conv_layer_norm.bias
496
+ speech_encoder.inner.layers.15.conv.pointwise_conv1.weight
497
+ speech_encoder.inner.layers.15.conv.depthwise_conv.weight
498
+ speech_encoder.inner.layers.15.conv.layer_norm.weight
499
+ speech_encoder.inner.layers.15.conv.layer_norm.bias
500
+ speech_encoder.inner.layers.15.conv.pointwise_conv2.weight
501
+ speech_encoder.inner.layers.15.ffn1_layer_norm.weight
502
+ speech_encoder.inner.layers.15.ffn1_layer_norm.bias
503
+ speech_encoder.inner.layers.15.ffn1.inner_proj.weight
504
+ speech_encoder.inner.layers.15.ffn1.inner_proj.bias
505
+ speech_encoder.inner.layers.15.ffn1.output_proj.weight
506
+ speech_encoder.inner.layers.15.ffn1.output_proj.bias
507
+ speech_encoder.inner.layers.15.ffn2_layer_norm.weight
508
+ speech_encoder.inner.layers.15.ffn2_layer_norm.bias
509
+ speech_encoder.inner.layers.15.ffn2.inner_proj.weight
510
+ speech_encoder.inner.layers.15.ffn2.inner_proj.bias
511
+ speech_encoder.inner.layers.15.ffn2.output_proj.weight
512
+ speech_encoder.inner.layers.15.ffn2.output_proj.bias
513
+ speech_encoder.inner.layers.15.layer_norm.weight
514
+ speech_encoder.inner.layers.15.layer_norm.bias
515
+ speech_encoder.inner.layers.16.self_attn_layer_norm.weight
516
+ speech_encoder.inner.layers.16.self_attn_layer_norm.bias
517
+ speech_encoder.inner.layers.16.self_attn.k_proj.weight
518
+ speech_encoder.inner.layers.16.self_attn.k_proj.bias
519
+ speech_encoder.inner.layers.16.self_attn.v_proj.weight
520
+ speech_encoder.inner.layers.16.self_attn.v_proj.bias
521
+ speech_encoder.inner.layers.16.self_attn.q_proj.weight
522
+ speech_encoder.inner.layers.16.self_attn.q_proj.bias
523
+ speech_encoder.inner.layers.16.self_attn.output_proj.weight
524
+ speech_encoder.inner.layers.16.self_attn.output_proj.bias
525
+ speech_encoder.inner.layers.16.self_attn.sdpa.rel_k_embed.weight
526
+ speech_encoder.inner.layers.16.conv_layer_norm.weight
527
+ speech_encoder.inner.layers.16.conv_layer_norm.bias
528
+ speech_encoder.inner.layers.16.conv.pointwise_conv1.weight
529
+ speech_encoder.inner.layers.16.conv.depthwise_conv.weight
530
+ speech_encoder.inner.layers.16.conv.layer_norm.weight
531
+ speech_encoder.inner.layers.16.conv.layer_norm.bias
532
+ speech_encoder.inner.layers.16.conv.pointwise_conv2.weight
533
+ speech_encoder.inner.layers.16.ffn1_layer_norm.weight
534
+ speech_encoder.inner.layers.16.ffn1_layer_norm.bias
535
+ speech_encoder.inner.layers.16.ffn1.inner_proj.weight
536
+ speech_encoder.inner.layers.16.ffn1.inner_proj.bias
537
+ speech_encoder.inner.layers.16.ffn1.output_proj.weight
538
+ speech_encoder.inner.layers.16.ffn1.output_proj.bias
539
+ speech_encoder.inner.layers.16.ffn2_layer_norm.weight
540
+ speech_encoder.inner.layers.16.ffn2_layer_norm.bias
541
+ speech_encoder.inner.layers.16.ffn2.inner_proj.weight
542
+ speech_encoder.inner.layers.16.ffn2.inner_proj.bias
543
+ speech_encoder.inner.layers.16.ffn2.output_proj.weight
544
+ speech_encoder.inner.layers.16.ffn2.output_proj.bias
545
+ speech_encoder.inner.layers.16.layer_norm.weight
546
+ speech_encoder.inner.layers.16.layer_norm.bias
547
+ speech_encoder.inner.layers.17.self_attn_layer_norm.weight
548
+ speech_encoder.inner.layers.17.self_attn_layer_norm.bias
549
+ speech_encoder.inner.layers.17.self_attn.k_proj.weight
550
+ speech_encoder.inner.layers.17.self_attn.k_proj.bias
551
+ speech_encoder.inner.layers.17.self_attn.v_proj.weight
552
+ speech_encoder.inner.layers.17.self_attn.v_proj.bias
553
+ speech_encoder.inner.layers.17.self_attn.q_proj.weight
554
+ speech_encoder.inner.layers.17.self_attn.q_proj.bias
555
+ speech_encoder.inner.layers.17.self_attn.output_proj.weight
556
+ speech_encoder.inner.layers.17.self_attn.output_proj.bias
557
+ speech_encoder.inner.layers.17.self_attn.sdpa.rel_k_embed.weight
558
+ speech_encoder.inner.layers.17.conv_layer_norm.weight
559
+ speech_encoder.inner.layers.17.conv_layer_norm.bias
560
+ speech_encoder.inner.layers.17.conv.pointwise_conv1.weight
561
+ speech_encoder.inner.layers.17.conv.depthwise_conv.weight
562
+ speech_encoder.inner.layers.17.conv.layer_norm.weight
563
+ speech_encoder.inner.layers.17.conv.layer_norm.bias
564
+ speech_encoder.inner.layers.17.conv.pointwise_conv2.weight
565
+ speech_encoder.inner.layers.17.ffn1_layer_norm.weight
566
+ speech_encoder.inner.layers.17.ffn1_layer_norm.bias
567
+ speech_encoder.inner.layers.17.ffn1.inner_proj.weight
568
+ speech_encoder.inner.layers.17.ffn1.inner_proj.bias
569
+ speech_encoder.inner.layers.17.ffn1.output_proj.weight
570
+ speech_encoder.inner.layers.17.ffn1.output_proj.bias
571
+ speech_encoder.inner.layers.17.ffn2_layer_norm.weight
572
+ speech_encoder.inner.layers.17.ffn2_layer_norm.bias
573
+ speech_encoder.inner.layers.17.ffn2.inner_proj.weight
574
+ speech_encoder.inner.layers.17.ffn2.inner_proj.bias
575
+ speech_encoder.inner.layers.17.ffn2.output_proj.weight
576
+ speech_encoder.inner.layers.17.ffn2.output_proj.bias
577
+ speech_encoder.inner.layers.17.layer_norm.weight
578
+ speech_encoder.inner.layers.17.layer_norm.bias
579
+ speech_encoder.inner.layers.18.self_attn_layer_norm.weight
580
+ speech_encoder.inner.layers.18.self_attn_layer_norm.bias
581
+ speech_encoder.inner.layers.18.self_attn.k_proj.weight
582
+ speech_encoder.inner.layers.18.self_attn.k_proj.bias
583
+ speech_encoder.inner.layers.18.self_attn.v_proj.weight
584
+ speech_encoder.inner.layers.18.self_attn.v_proj.bias
585
+ speech_encoder.inner.layers.18.self_attn.q_proj.weight
586
+ speech_encoder.inner.layers.18.self_attn.q_proj.bias
587
+ speech_encoder.inner.layers.18.self_attn.output_proj.weight
588
+ speech_encoder.inner.layers.18.self_attn.output_proj.bias
589
+ speech_encoder.inner.layers.18.self_attn.sdpa.rel_k_embed.weight
590
+ speech_encoder.inner.layers.18.conv_layer_norm.weight
591
+ speech_encoder.inner.layers.18.conv_layer_norm.bias
592
+ speech_encoder.inner.layers.18.conv.pointwise_conv1.weight
593
+ speech_encoder.inner.layers.18.conv.depthwise_conv.weight
594
+ speech_encoder.inner.layers.18.conv.layer_norm.weight
595
+ speech_encoder.inner.layers.18.conv.layer_norm.bias
596
+ speech_encoder.inner.layers.18.conv.pointwise_conv2.weight
597
+ speech_encoder.inner.layers.18.ffn1_layer_norm.weight
598
+ speech_encoder.inner.layers.18.ffn1_layer_norm.bias
599
+ speech_encoder.inner.layers.18.ffn1.inner_proj.weight
600
+ speech_encoder.inner.layers.18.ffn1.inner_proj.bias
601
+ speech_encoder.inner.layers.18.ffn1.output_proj.weight
602
+ speech_encoder.inner.layers.18.ffn1.output_proj.bias
603
+ speech_encoder.inner.layers.18.ffn2_layer_norm.weight
604
+ speech_encoder.inner.layers.18.ffn2_layer_norm.bias
605
+ speech_encoder.inner.layers.18.ffn2.inner_proj.weight
606
+ speech_encoder.inner.layers.18.ffn2.inner_proj.bias
607
+ speech_encoder.inner.layers.18.ffn2.output_proj.weight
608
+ speech_encoder.inner.layers.18.ffn2.output_proj.bias
609
+ speech_encoder.inner.layers.18.layer_norm.weight
610
+ speech_encoder.inner.layers.18.layer_norm.bias
611
+ speech_encoder.inner.layers.19.self_attn_layer_norm.weight
612
+ speech_encoder.inner.layers.19.self_attn_layer_norm.bias
613
+ speech_encoder.inner.layers.19.self_attn.k_proj.weight
614
+ speech_encoder.inner.layers.19.self_attn.k_proj.bias
615
+ speech_encoder.inner.layers.19.self_attn.v_proj.weight
616
+ speech_encoder.inner.layers.19.self_attn.v_proj.bias
617
+ speech_encoder.inner.layers.19.self_attn.q_proj.weight
618
+ speech_encoder.inner.layers.19.self_attn.q_proj.bias
619
+ speech_encoder.inner.layers.19.self_attn.output_proj.weight
620
+ speech_encoder.inner.layers.19.self_attn.output_proj.bias
621
+ speech_encoder.inner.layers.19.self_attn.sdpa.rel_k_embed.weight
622
+ speech_encoder.inner.layers.19.conv_layer_norm.weight
623
+ speech_encoder.inner.layers.19.conv_layer_norm.bias
624
+ speech_encoder.inner.layers.19.conv.pointwise_conv1.weight
625
+ speech_encoder.inner.layers.19.conv.depthwise_conv.weight
626
+ speech_encoder.inner.layers.19.conv.layer_norm.weight
627
+ speech_encoder.inner.layers.19.conv.layer_norm.bias
628
+ speech_encoder.inner.layers.19.conv.pointwise_conv2.weight
629
+ speech_encoder.inner.layers.19.ffn1_layer_norm.weight
630
+ speech_encoder.inner.layers.19.ffn1_layer_norm.bias
631
+ speech_encoder.inner.layers.19.ffn1.inner_proj.weight
632
+ speech_encoder.inner.layers.19.ffn1.inner_proj.bias
633
+ speech_encoder.inner.layers.19.ffn1.output_proj.weight
634
+ speech_encoder.inner.layers.19.ffn1.output_proj.bias
635
+ speech_encoder.inner.layers.19.ffn2_layer_norm.weight
636
+ speech_encoder.inner.layers.19.ffn2_layer_norm.bias
637
+ speech_encoder.inner.layers.19.ffn2.inner_proj.weight
638
+ speech_encoder.inner.layers.19.ffn2.inner_proj.bias
639
+ speech_encoder.inner.layers.19.ffn2.output_proj.weight
640
+ speech_encoder.inner.layers.19.ffn2.output_proj.bias
641
+ speech_encoder.inner.layers.19.layer_norm.weight
642
+ speech_encoder.inner.layers.19.layer_norm.bias
643
+ speech_encoder.inner.layers.20.self_attn_layer_norm.weight
644
+ speech_encoder.inner.layers.20.self_attn_layer_norm.bias
645
+ speech_encoder.inner.layers.20.self_attn.k_proj.weight
646
+ speech_encoder.inner.layers.20.self_attn.k_proj.bias
647
+ speech_encoder.inner.layers.20.self_attn.v_proj.weight
648
+ speech_encoder.inner.layers.20.self_attn.v_proj.bias
649
+ speech_encoder.inner.layers.20.self_attn.q_proj.weight
650
+ speech_encoder.inner.layers.20.self_attn.q_proj.bias
651
+ speech_encoder.inner.layers.20.self_attn.output_proj.weight
652
+ speech_encoder.inner.layers.20.self_attn.output_proj.bias
653
+ speech_encoder.inner.layers.20.self_attn.sdpa.rel_k_embed.weight
654
+ speech_encoder.inner.layers.20.conv_layer_norm.weight
655
+ speech_encoder.inner.layers.20.conv_layer_norm.bias
656
+ speech_encoder.inner.layers.20.conv.pointwise_conv1.weight
657
+ speech_encoder.inner.layers.20.conv.depthwise_conv.weight
658
+ speech_encoder.inner.layers.20.conv.layer_norm.weight
659
+ speech_encoder.inner.layers.20.conv.layer_norm.bias
660
+ speech_encoder.inner.layers.20.conv.pointwise_conv2.weight
661
+ speech_encoder.inner.layers.20.ffn1_layer_norm.weight
662
+ speech_encoder.inner.layers.20.ffn1_layer_norm.bias
663
+ speech_encoder.inner.layers.20.ffn1.inner_proj.weight
664
+ speech_encoder.inner.layers.20.ffn1.inner_proj.bias
665
+ speech_encoder.inner.layers.20.ffn1.output_proj.weight
666
+ speech_encoder.inner.layers.20.ffn1.output_proj.bias
667
+ speech_encoder.inner.layers.20.ffn2_layer_norm.weight
668
+ speech_encoder.inner.layers.20.ffn2_layer_norm.bias
669
+ speech_encoder.inner.layers.20.ffn2.inner_proj.weight
670
+ speech_encoder.inner.layers.20.ffn2.inner_proj.bias
671
+ speech_encoder.inner.layers.20.ffn2.output_proj.weight
672
+ speech_encoder.inner.layers.20.ffn2.output_proj.bias
673
+ speech_encoder.inner.layers.20.layer_norm.weight
674
+ speech_encoder.inner.layers.20.layer_norm.bias
675
+ speech_encoder.inner.layers.21.self_attn_layer_norm.weight
676
+ speech_encoder.inner.layers.21.self_attn_layer_norm.bias
677
+ speech_encoder.inner.layers.21.self_attn.k_proj.weight
678
+ speech_encoder.inner.layers.21.self_attn.k_proj.bias
679
+ speech_encoder.inner.layers.21.self_attn.v_proj.weight
680
+ speech_encoder.inner.layers.21.self_attn.v_proj.bias
681
+ speech_encoder.inner.layers.21.self_attn.q_proj.weight
682
+ speech_encoder.inner.layers.21.self_attn.q_proj.bias
683
+ speech_encoder.inner.layers.21.self_attn.output_proj.weight
684
+ speech_encoder.inner.layers.21.self_attn.output_proj.bias
685
+ speech_encoder.inner.layers.21.self_attn.sdpa.rel_k_embed.weight
686
+ speech_encoder.inner.layers.21.conv_layer_norm.weight
687
+ speech_encoder.inner.layers.21.conv_layer_norm.bias
688
+ speech_encoder.inner.layers.21.conv.pointwise_conv1.weight
689
+ speech_encoder.inner.layers.21.conv.depthwise_conv.weight
690
+ speech_encoder.inner.layers.21.conv.layer_norm.weight
691
+ speech_encoder.inner.layers.21.conv.layer_norm.bias
692
+ speech_encoder.inner.layers.21.conv.pointwise_conv2.weight
693
+ speech_encoder.inner.layers.21.ffn1_layer_norm.weight
694
+ speech_encoder.inner.layers.21.ffn1_layer_norm.bias
695
+ speech_encoder.inner.layers.21.ffn1.inner_proj.weight
696
+ speech_encoder.inner.layers.21.ffn1.inner_proj.bias
697
+ speech_encoder.inner.layers.21.ffn1.output_proj.weight
698
+ speech_encoder.inner.layers.21.ffn1.output_proj.bias
699
+ speech_encoder.inner.layers.21.ffn2_layer_norm.weight
700
+ speech_encoder.inner.layers.21.ffn2_layer_norm.bias
701
+ speech_encoder.inner.layers.21.ffn2.inner_proj.weight
702
+ speech_encoder.inner.layers.21.ffn2.inner_proj.bias
703
+ speech_encoder.inner.layers.21.ffn2.output_proj.weight
704
+ speech_encoder.inner.layers.21.ffn2.output_proj.bias
705
+ speech_encoder.inner.layers.21.layer_norm.weight
706
+ speech_encoder.inner.layers.21.layer_norm.bias
707
+ speech_encoder.inner.layers.22.self_attn_layer_norm.weight
708
+ speech_encoder.inner.layers.22.self_attn_layer_norm.bias
709
+ speech_encoder.inner.layers.22.self_attn.k_proj.weight
710
+ speech_encoder.inner.layers.22.self_attn.k_proj.bias
711
+ speech_encoder.inner.layers.22.self_attn.v_proj.weight
712
+ speech_encoder.inner.layers.22.self_attn.v_proj.bias
713
+ speech_encoder.inner.layers.22.self_attn.q_proj.weight
714
+ speech_encoder.inner.layers.22.self_attn.q_proj.bias
715
+ speech_encoder.inner.layers.22.self_attn.output_proj.weight
716
+ speech_encoder.inner.layers.22.self_attn.output_proj.bias
717
+ speech_encoder.inner.layers.22.self_attn.sdpa.rel_k_embed.weight
718
+ speech_encoder.inner.layers.22.conv_layer_norm.weight
719
+ speech_encoder.inner.layers.22.conv_layer_norm.bias
720
+ speech_encoder.inner.layers.22.conv.pointwise_conv1.weight
721
+ speech_encoder.inner.layers.22.conv.depthwise_conv.weight
722
+ speech_encoder.inner.layers.22.conv.layer_norm.weight
723
+ speech_encoder.inner.layers.22.conv.layer_norm.bias
724
+ speech_encoder.inner.layers.22.conv.pointwise_conv2.weight
725
+ speech_encoder.inner.layers.22.ffn1_layer_norm.weight
726
+ speech_encoder.inner.layers.22.ffn1_layer_norm.bias
727
+ speech_encoder.inner.layers.22.ffn1.inner_proj.weight
728
+ speech_encoder.inner.layers.22.ffn1.inner_proj.bias
729
+ speech_encoder.inner.layers.22.ffn1.output_proj.weight
730
+ speech_encoder.inner.layers.22.ffn1.output_proj.bias
731
+ speech_encoder.inner.layers.22.ffn2_layer_norm.weight
732
+ speech_encoder.inner.layers.22.ffn2_layer_norm.bias
733
+ speech_encoder.inner.layers.22.ffn2.inner_proj.weight
734
+ speech_encoder.inner.layers.22.ffn2.inner_proj.bias
735
+ speech_encoder.inner.layers.22.ffn2.output_proj.weight
736
+ speech_encoder.inner.layers.22.ffn2.output_proj.bias
737
+ speech_encoder.inner.layers.22.layer_norm.weight
738
+ speech_encoder.inner.layers.22.layer_norm.bias
739
+ speech_encoder.inner.layers.23.self_attn_layer_norm.weight
740
+ speech_encoder.inner.layers.23.self_attn_layer_norm.bias
741
+ speech_encoder.inner.layers.23.self_attn.k_proj.weight
742
+ speech_encoder.inner.layers.23.self_attn.k_proj.bias
743
+ speech_encoder.inner.layers.23.self_attn.v_proj.weight
744
+ speech_encoder.inner.layers.23.self_attn.v_proj.bias
745
+ speech_encoder.inner.layers.23.self_attn.q_proj.weight
746
+ speech_encoder.inner.layers.23.self_attn.q_proj.bias
747
+ speech_encoder.inner.layers.23.self_attn.output_proj.weight
748
+ speech_encoder.inner.layers.23.self_attn.output_proj.bias
749
+ speech_encoder.inner.layers.23.self_attn.sdpa.rel_k_embed.weight
750
+ speech_encoder.inner.layers.23.conv_layer_norm.weight
751
+ speech_encoder.inner.layers.23.conv_layer_norm.bias
752
+ speech_encoder.inner.layers.23.conv.pointwise_conv1.weight
753
+ speech_encoder.inner.layers.23.conv.depthwise_conv.weight
754
+ speech_encoder.inner.layers.23.conv.layer_norm.weight
755
+ speech_encoder.inner.layers.23.conv.layer_norm.bias
756
+ speech_encoder.inner.layers.23.conv.pointwise_conv2.weight
757
+ speech_encoder.inner.layers.23.ffn1_layer_norm.weight
758
+ speech_encoder.inner.layers.23.ffn1_layer_norm.bias
759
+ speech_encoder.inner.layers.23.ffn1.inner_proj.weight
760
+ speech_encoder.inner.layers.23.ffn1.inner_proj.bias
761
+ speech_encoder.inner.layers.23.ffn1.output_proj.weight
762
+ speech_encoder.inner.layers.23.ffn1.output_proj.bias
763
+ speech_encoder.inner.layers.23.ffn2_layer_norm.weight
764
+ speech_encoder.inner.layers.23.ffn2_layer_norm.bias
765
+ speech_encoder.inner.layers.23.ffn2.inner_proj.weight
766
+ speech_encoder.inner.layers.23.ffn2.inner_proj.bias
767
+ speech_encoder.inner.layers.23.ffn2.output_proj.weight
768
+ speech_encoder.inner.layers.23.ffn2.output_proj.bias
769
+ speech_encoder.inner.layers.23.layer_norm.weight
770
+ speech_encoder.inner.layers.23.layer_norm.bias
771
+ speech_encoder.inner_layer_norm.weight
772
+ speech_encoder.inner_layer_norm.bias
773
+ speech_encoder_frontend.post_extract_layer_norm.weight
774
+ speech_encoder_frontend.post_extract_layer_norm.bias
775
+ speech_encoder.proj1.weight
776
+ speech_encoder.proj1.bias
777
+ speech_encoder.proj2.weight
778
+ speech_encoder.proj2.bias
779
+ speech_encoder.adaptor_layers.0.self_attn.k_proj.weight
780
+ speech_encoder.adaptor_layers.0.self_attn.k_proj.bias
781
+ speech_encoder.adaptor_layers.0.self_attn.v_proj.weight
782
+ speech_encoder.adaptor_layers.0.self_attn.v_proj.bias
783
+ speech_encoder.adaptor_layers.0.self_attn.q_proj.weight
784
+ speech_encoder.adaptor_layers.0.self_attn.q_proj.bias
785
+ speech_encoder.adaptor_layers.0.self_attn.output_proj.weight
786
+ speech_encoder.adaptor_layers.0.self_attn.output_proj.bias
787
+ speech_encoder.adaptor_layers.0.self_attn_layer_norm.weight
788
+ speech_encoder.adaptor_layers.0.self_attn_layer_norm.bias
789
+ speech_encoder.adaptor_layers.0.ffn.inner_proj.weight
790
+ speech_encoder.adaptor_layers.0.ffn.inner_proj.bias
791
+ speech_encoder.adaptor_layers.0.ffn.output_proj.weight
792
+ speech_encoder.adaptor_layers.0.ffn.output_proj.bias
793
+ speech_encoder.adaptor_layers.0.ffn_layer_norm.weight
794
+ speech_encoder.adaptor_layers.0.ffn_layer_norm.bias
795
+ speech_encoder.adaptor_layers.0.residual_layer_norm.weight
796
+ speech_encoder.adaptor_layers.0.residual_layer_norm.bias
797
+ speech_encoder.adaptor_layers.0.residual_conv.weight
798
+ speech_encoder.adaptor_layers.0.residual_conv.bias
799
+ speech_encoder.adaptor_layers.0.self_attn_conv.weight
800
+ speech_encoder.adaptor_layers.0.self_attn_conv.bias
801
+ speech_encoder.layer_norm.weight
802
+ speech_encoder.layer_norm.bias
803
+ text_decoder_frontend.embed.weight
804
+ text_decoder.layers.0.self_attn.k_proj.weight
805
+ text_decoder.layers.0.self_attn.k_proj.bias
806
+ text_decoder.layers.0.self_attn.v_proj.weight
807
+ text_decoder.layers.0.self_attn.v_proj.bias
808
+ text_decoder.layers.0.self_attn.q_proj.weight
809
+ text_decoder.layers.0.self_attn.q_proj.bias
810
+ text_decoder.layers.0.self_attn.output_proj.weight
811
+ text_decoder.layers.0.self_attn.output_proj.bias
812
+ text_decoder.layers.0.self_attn_layer_norm.weight
813
+ text_decoder.layers.0.self_attn_layer_norm.bias
814
+ text_decoder.layers.0.encoder_decoder_attn.k_proj.weight
815
+ text_decoder.layers.0.encoder_decoder_attn.k_proj.bias
816
+ text_decoder.layers.0.encoder_decoder_attn.v_proj.weight
817
+ text_decoder.layers.0.encoder_decoder_attn.v_proj.bias
818
+ text_decoder.layers.0.encoder_decoder_attn.q_proj.weight
819
+ text_decoder.layers.0.encoder_decoder_attn.q_proj.bias
820
+ text_decoder.layers.0.encoder_decoder_attn.output_proj.weight
821
+ text_decoder.layers.0.encoder_decoder_attn.output_proj.bias
822
+ text_decoder.layers.0.encoder_decoder_attn_layer_norm.weight
823
+ text_decoder.layers.0.encoder_decoder_attn_layer_norm.bias
824
+ text_decoder.layers.0.ffn.inner_proj.weight
825
+ text_decoder.layers.0.ffn.inner_proj.bias
826
+ text_decoder.layers.0.ffn.output_proj.weight
827
+ text_decoder.layers.0.ffn.output_proj.bias
828
+ text_decoder.layers.0.ffn_layer_norm.weight
829
+ text_decoder.layers.0.ffn_layer_norm.bias
830
+ text_decoder.layers.1.self_attn.k_proj.weight
831
+ text_decoder.layers.1.self_attn.k_proj.bias
832
+ text_decoder.layers.1.self_attn.v_proj.weight
833
+ text_decoder.layers.1.self_attn.v_proj.bias
834
+ text_decoder.layers.1.self_attn.q_proj.weight
835
+ text_decoder.layers.1.self_attn.q_proj.bias
836
+ text_decoder.layers.1.self_attn.output_proj.weight
837
+ text_decoder.layers.1.self_attn.output_proj.bias
838
+ text_decoder.layers.1.self_attn_layer_norm.weight
839
+ text_decoder.layers.1.self_attn_layer_norm.bias
840
+ text_decoder.layers.1.encoder_decoder_attn.k_proj.weight
841
+ text_decoder.layers.1.encoder_decoder_attn.k_proj.bias
842
+ text_decoder.layers.1.encoder_decoder_attn.v_proj.weight
843
+ text_decoder.layers.1.encoder_decoder_attn.v_proj.bias
844
+ text_decoder.layers.1.encoder_decoder_attn.q_proj.weight
845
+ text_decoder.layers.1.encoder_decoder_attn.q_proj.bias
846
+ text_decoder.layers.1.encoder_decoder_attn.output_proj.weight
847
+ text_decoder.layers.1.encoder_decoder_attn.output_proj.bias
848
+ text_decoder.layers.1.encoder_decoder_attn_layer_norm.weight
849
+ text_decoder.layers.1.encoder_decoder_attn_layer_norm.bias
850
+ text_decoder.layers.1.ffn.inner_proj.weight
851
+ text_decoder.layers.1.ffn.inner_proj.bias
852
+ text_decoder.layers.1.ffn.output_proj.weight
853
+ text_decoder.layers.1.ffn.output_proj.bias
854
+ text_decoder.layers.1.ffn_layer_norm.weight
855
+ text_decoder.layers.1.ffn_layer_norm.bias
856
+ text_decoder.layers.2.self_attn.k_proj.weight
857
+ text_decoder.layers.2.self_attn.k_proj.bias
858
+ text_decoder.layers.2.self_attn.v_proj.weight
859
+ text_decoder.layers.2.self_attn.v_proj.bias
860
+ text_decoder.layers.2.self_attn.q_proj.weight
861
+ text_decoder.layers.2.self_attn.q_proj.bias
862
+ text_decoder.layers.2.self_attn.output_proj.weight
863
+ text_decoder.layers.2.self_attn.output_proj.bias
864
+ text_decoder.layers.2.self_attn_layer_norm.weight
865
+ text_decoder.layers.2.self_attn_layer_norm.bias
866
+ text_decoder.layers.2.encoder_decoder_attn.k_proj.weight
867
+ text_decoder.layers.2.encoder_decoder_attn.k_proj.bias
868
+ text_decoder.layers.2.encoder_decoder_attn.v_proj.weight
869
+ text_decoder.layers.2.encoder_decoder_attn.v_proj.bias
870
+ text_decoder.layers.2.encoder_decoder_attn.q_proj.weight
871
+ text_decoder.layers.2.encoder_decoder_attn.q_proj.bias
872
+ text_decoder.layers.2.encoder_decoder_attn.output_proj.weight
873
+ text_decoder.layers.2.encoder_decoder_attn.output_proj.bias
874
+ text_decoder.layers.2.encoder_decoder_attn_layer_norm.weight
875
+ text_decoder.layers.2.encoder_decoder_attn_layer_norm.bias
876
+ text_decoder.layers.2.ffn.inner_proj.weight
877
+ text_decoder.layers.2.ffn.inner_proj.bias
878
+ text_decoder.layers.2.ffn.output_proj.weight
879
+ text_decoder.layers.2.ffn.output_proj.bias
880
+ text_decoder.layers.2.ffn_layer_norm.weight
881
+ text_decoder.layers.2.ffn_layer_norm.bias
882
+ text_decoder.layers.3.self_attn.k_proj.weight
883
+ text_decoder.layers.3.self_attn.k_proj.bias
884
+ text_decoder.layers.3.self_attn.v_proj.weight
885
+ text_decoder.layers.3.self_attn.v_proj.bias
886
+ text_decoder.layers.3.self_attn.q_proj.weight
887
+ text_decoder.layers.3.self_attn.q_proj.bias
888
+ text_decoder.layers.3.self_attn.output_proj.weight
889
+ text_decoder.layers.3.self_attn.output_proj.bias
890
+ text_decoder.layers.3.self_attn_layer_norm.weight
891
+ text_decoder.layers.3.self_attn_layer_norm.bias
892
+ text_decoder.layers.3.encoder_decoder_attn.k_proj.weight
893
+ text_decoder.layers.3.encoder_decoder_attn.k_proj.bias
894
+ text_decoder.layers.3.encoder_decoder_attn.v_proj.weight
895
+ text_decoder.layers.3.encoder_decoder_attn.v_proj.bias
896
+ text_decoder.layers.3.encoder_decoder_attn.q_proj.weight
897
+ text_decoder.layers.3.encoder_decoder_attn.q_proj.bias
898
+ text_decoder.layers.3.encoder_decoder_attn.output_proj.weight
899
+ text_decoder.layers.3.encoder_decoder_attn.output_proj.bias
900
+ text_decoder.layers.3.encoder_decoder_attn_layer_norm.weight
901
+ text_decoder.layers.3.encoder_decoder_attn_layer_norm.bias
902
+ text_decoder.layers.3.ffn.inner_proj.weight
903
+ text_decoder.layers.3.ffn.inner_proj.bias
904
+ text_decoder.layers.3.ffn.output_proj.weight
905
+ text_decoder.layers.3.ffn.output_proj.bias
906
+ text_decoder.layers.3.ffn_layer_norm.weight
907
+ text_decoder.layers.3.ffn_layer_norm.bias
908
+ text_decoder.layers.4.self_attn.k_proj.weight
909
+ text_decoder.layers.4.self_attn.k_proj.bias
910
+ text_decoder.layers.4.self_attn.v_proj.weight
911
+ text_decoder.layers.4.self_attn.v_proj.bias
912
+ text_decoder.layers.4.self_attn.q_proj.weight
913
+ text_decoder.layers.4.self_attn.q_proj.bias
914
+ text_decoder.layers.4.self_attn.output_proj.weight
915
+ text_decoder.layers.4.self_attn.output_proj.bias
916
+ text_decoder.layers.4.self_attn_layer_norm.weight
917
+ text_decoder.layers.4.self_attn_layer_norm.bias
918
+ text_decoder.layers.4.encoder_decoder_attn.k_proj.weight
919
+ text_decoder.layers.4.encoder_decoder_attn.k_proj.bias
920
+ text_decoder.layers.4.encoder_decoder_attn.v_proj.weight
921
+ text_decoder.layers.4.encoder_decoder_attn.v_proj.bias
922
+ text_decoder.layers.4.encoder_decoder_attn.q_proj.weight
923
+ text_decoder.layers.4.encoder_decoder_attn.q_proj.bias
924
+ text_decoder.layers.4.encoder_decoder_attn.output_proj.weight
925
+ text_decoder.layers.4.encoder_decoder_attn.output_proj.bias
926
+ text_decoder.layers.4.encoder_decoder_attn_layer_norm.weight
927
+ text_decoder.layers.4.encoder_decoder_attn_layer_norm.bias
928
+ text_decoder.layers.4.ffn.inner_proj.weight
929
+ text_decoder.layers.4.ffn.inner_proj.bias
930
+ text_decoder.layers.4.ffn.output_proj.weight
931
+ text_decoder.layers.4.ffn.output_proj.bias
932
+ text_decoder.layers.4.ffn_layer_norm.weight
933
+ text_decoder.layers.4.ffn_layer_norm.bias
934
+ text_decoder.layers.5.self_attn.k_proj.weight
935
+ text_decoder.layers.5.self_attn.k_proj.bias
936
+ text_decoder.layers.5.self_attn.v_proj.weight
937
+ text_decoder.layers.5.self_attn.v_proj.bias
938
+ text_decoder.layers.5.self_attn.q_proj.weight
939
+ text_decoder.layers.5.self_attn.q_proj.bias
940
+ text_decoder.layers.5.self_attn.output_proj.weight
941
+ text_decoder.layers.5.self_attn.output_proj.bias
942
+ text_decoder.layers.5.self_attn_layer_norm.weight
943
+ text_decoder.layers.5.self_attn_layer_norm.bias
944
+ text_decoder.layers.5.encoder_decoder_attn.k_proj.weight
945
+ text_decoder.layers.5.encoder_decoder_attn.k_proj.bias
946
+ text_decoder.layers.5.encoder_decoder_attn.v_proj.weight
947
+ text_decoder.layers.5.encoder_decoder_attn.v_proj.bias
948
+ text_decoder.layers.5.encoder_decoder_attn.q_proj.weight
949
+ text_decoder.layers.5.encoder_decoder_attn.q_proj.bias
950
+ text_decoder.layers.5.encoder_decoder_attn.output_proj.weight
951
+ text_decoder.layers.5.encoder_decoder_attn.output_proj.bias
952
+ text_decoder.layers.5.encoder_decoder_attn_layer_norm.weight
953
+ text_decoder.layers.5.encoder_decoder_attn_layer_norm.bias
954
+ text_decoder.layers.5.ffn.inner_proj.weight
955
+ text_decoder.layers.5.ffn.inner_proj.bias
956
+ text_decoder.layers.5.ffn.output_proj.weight
957
+ text_decoder.layers.5.ffn.output_proj.bias
958
+ text_decoder.layers.5.ffn_layer_norm.weight
959
+ text_decoder.layers.5.ffn_layer_norm.bias
960
+ text_decoder.layers.6.self_attn.k_proj.weight
961
+ text_decoder.layers.6.self_attn.k_proj.bias
962
+ text_decoder.layers.6.self_attn.v_proj.weight
963
+ text_decoder.layers.6.self_attn.v_proj.bias
964
+ text_decoder.layers.6.self_attn.q_proj.weight
965
+ text_decoder.layers.6.self_attn.q_proj.bias
966
+ text_decoder.layers.6.self_attn.output_proj.weight
967
+ text_decoder.layers.6.self_attn.output_proj.bias
968
+ text_decoder.layers.6.self_attn_layer_norm.weight
969
+ text_decoder.layers.6.self_attn_layer_norm.bias
970
+ text_decoder.layers.6.encoder_decoder_attn.k_proj.weight
971
+ text_decoder.layers.6.encoder_decoder_attn.k_proj.bias
972
+ text_decoder.layers.6.encoder_decoder_attn.v_proj.weight
973
+ text_decoder.layers.6.encoder_decoder_attn.v_proj.bias
974
+ text_decoder.layers.6.encoder_decoder_attn.q_proj.weight
975
+ text_decoder.layers.6.encoder_decoder_attn.q_proj.bias
976
+ text_decoder.layers.6.encoder_decoder_attn.output_proj.weight
977
+ text_decoder.layers.6.encoder_decoder_attn.output_proj.bias
978
+ text_decoder.layers.6.encoder_decoder_attn_layer_norm.weight
979
+ text_decoder.layers.6.encoder_decoder_attn_layer_norm.bias
980
+ text_decoder.layers.6.ffn.inner_proj.weight
981
+ text_decoder.layers.6.ffn.inner_proj.bias
982
+ text_decoder.layers.6.ffn.output_proj.weight
983
+ text_decoder.layers.6.ffn.output_proj.bias
984
+ text_decoder.layers.6.ffn_layer_norm.weight
985
+ text_decoder.layers.6.ffn_layer_norm.bias
986
+ text_decoder.layers.7.self_attn.k_proj.weight
987
+ text_decoder.layers.7.self_attn.k_proj.bias
988
+ text_decoder.layers.7.self_attn.v_proj.weight
989
+ text_decoder.layers.7.self_attn.v_proj.bias
990
+ text_decoder.layers.7.self_attn.q_proj.weight
991
+ text_decoder.layers.7.self_attn.q_proj.bias
992
+ text_decoder.layers.7.self_attn.output_proj.weight
993
+ text_decoder.layers.7.self_attn.output_proj.bias
994
+ text_decoder.layers.7.self_attn_layer_norm.weight
995
+ text_decoder.layers.7.self_attn_layer_norm.bias
996
+ text_decoder.layers.7.encoder_decoder_attn.k_proj.weight
997
+ text_decoder.layers.7.encoder_decoder_attn.k_proj.bias
998
+ text_decoder.layers.7.encoder_decoder_attn.v_proj.weight
999
+ text_decoder.layers.7.encoder_decoder_attn.v_proj.bias
1000
+ text_decoder.layers.7.encoder_decoder_attn.q_proj.weight
1001
+ text_decoder.layers.7.encoder_decoder_attn.q_proj.bias
1002
+ text_decoder.layers.7.encoder_decoder_attn.output_proj.weight
1003
+ text_decoder.layers.7.encoder_decoder_attn.output_proj.bias
1004
+ text_decoder.layers.7.encoder_decoder_attn_layer_norm.weight
1005
+ text_decoder.layers.7.encoder_decoder_attn_layer_norm.bias
1006
+ text_decoder.layers.7.ffn.inner_proj.weight
1007
+ text_decoder.layers.7.ffn.inner_proj.bias
1008
+ text_decoder.layers.7.ffn.output_proj.weight
1009
+ text_decoder.layers.7.ffn.output_proj.bias
1010
+ text_decoder.layers.7.ffn_layer_norm.weight
1011
+ text_decoder.layers.7.ffn_layer_norm.bias
1012
+ text_decoder.layers.8.self_attn.k_proj.weight
1013
+ text_decoder.layers.8.self_attn.k_proj.bias
1014
+ text_decoder.layers.8.self_attn.v_proj.weight
1015
+ text_decoder.layers.8.self_attn.v_proj.bias
1016
+ text_decoder.layers.8.self_attn.q_proj.weight
1017
+ text_decoder.layers.8.self_attn.q_proj.bias
1018
+ text_decoder.layers.8.self_attn.output_proj.weight
1019
+ text_decoder.layers.8.self_attn.output_proj.bias
1020
+ text_decoder.layers.8.self_attn_layer_norm.weight
1021
+ text_decoder.layers.8.self_attn_layer_norm.bias
1022
+ text_decoder.layers.8.encoder_decoder_attn.k_proj.weight
1023
+ text_decoder.layers.8.encoder_decoder_attn.k_proj.bias
1024
+ text_decoder.layers.8.encoder_decoder_attn.v_proj.weight
1025
+ text_decoder.layers.8.encoder_decoder_attn.v_proj.bias
1026
+ text_decoder.layers.8.encoder_decoder_attn.q_proj.weight
1027
+ text_decoder.layers.8.encoder_decoder_attn.q_proj.bias
1028
+ text_decoder.layers.8.encoder_decoder_attn.output_proj.weight
1029
+ text_decoder.layers.8.encoder_decoder_attn.output_proj.bias
1030
+ text_decoder.layers.8.encoder_decoder_attn_layer_norm.weight
1031
+ text_decoder.layers.8.encoder_decoder_attn_layer_norm.bias
1032
+ text_decoder.layers.8.ffn.inner_proj.weight
1033
+ text_decoder.layers.8.ffn.inner_proj.bias
1034
+ text_decoder.layers.8.ffn.output_proj.weight
1035
+ text_decoder.layers.8.ffn.output_proj.bias
1036
+ text_decoder.layers.8.ffn_layer_norm.weight
1037
+ text_decoder.layers.8.ffn_layer_norm.bias
1038
+ text_decoder.layers.9.self_attn.k_proj.weight
1039
+ text_decoder.layers.9.self_attn.k_proj.bias
1040
+ text_decoder.layers.9.self_attn.v_proj.weight
1041
+ text_decoder.layers.9.self_attn.v_proj.bias
1042
+ text_decoder.layers.9.self_attn.q_proj.weight
1043
+ text_decoder.layers.9.self_attn.q_proj.bias
1044
+ text_decoder.layers.9.self_attn.output_proj.weight
1045
+ text_decoder.layers.9.self_attn.output_proj.bias
1046
+ text_decoder.layers.9.self_attn_layer_norm.weight
1047
+ text_decoder.layers.9.self_attn_layer_norm.bias
1048
+ text_decoder.layers.9.encoder_decoder_attn.k_proj.weight
1049
+ text_decoder.layers.9.encoder_decoder_attn.k_proj.bias
1050
+ text_decoder.layers.9.encoder_decoder_attn.v_proj.weight
1051
+ text_decoder.layers.9.encoder_decoder_attn.v_proj.bias
1052
+ text_decoder.layers.9.encoder_decoder_attn.q_proj.weight
1053
+ text_decoder.layers.9.encoder_decoder_attn.q_proj.bias
1054
+ text_decoder.layers.9.encoder_decoder_attn.output_proj.weight
1055
+ text_decoder.layers.9.encoder_decoder_attn.output_proj.bias
1056
+ text_decoder.layers.9.encoder_decoder_attn_layer_norm.weight
1057
+ text_decoder.layers.9.encoder_decoder_attn_layer_norm.bias
1058
+ text_decoder.layers.9.ffn.inner_proj.weight
1059
+ text_decoder.layers.9.ffn.inner_proj.bias
1060
+ text_decoder.layers.9.ffn.output_proj.weight
1061
+ text_decoder.layers.9.ffn.output_proj.bias
1062
+ text_decoder.layers.9.ffn_layer_norm.weight
1063
+ text_decoder.layers.9.ffn_layer_norm.bias
1064
+ text_decoder.layers.10.self_attn.k_proj.weight
1065
+ text_decoder.layers.10.self_attn.k_proj.bias
1066
+ text_decoder.layers.10.self_attn.v_proj.weight
1067
+ text_decoder.layers.10.self_attn.v_proj.bias
1068
+ text_decoder.layers.10.self_attn.q_proj.weight
1069
+ text_decoder.layers.10.self_attn.q_proj.bias
1070
+ text_decoder.layers.10.self_attn.output_proj.weight
1071
+ text_decoder.layers.10.self_attn.output_proj.bias
1072
+ text_decoder.layers.10.self_attn_layer_norm.weight
1073
+ text_decoder.layers.10.self_attn_layer_norm.bias
1074
+ text_decoder.layers.10.encoder_decoder_attn.k_proj.weight
1075
+ text_decoder.layers.10.encoder_decoder_attn.k_proj.bias
1076
+ text_decoder.layers.10.encoder_decoder_attn.v_proj.weight
1077
+ text_decoder.layers.10.encoder_decoder_attn.v_proj.bias
1078
+ text_decoder.layers.10.encoder_decoder_attn.q_proj.weight
1079
+ text_decoder.layers.10.encoder_decoder_attn.q_proj.bias
1080
+ text_decoder.layers.10.encoder_decoder_attn.output_proj.weight
1081
+ text_decoder.layers.10.encoder_decoder_attn.output_proj.bias
1082
+ text_decoder.layers.10.encoder_decoder_attn_layer_norm.weight
1083
+ text_decoder.layers.10.encoder_decoder_attn_layer_norm.bias
1084
+ text_decoder.layers.10.ffn.inner_proj.weight
1085
+ text_decoder.layers.10.ffn.inner_proj.bias
1086
+ text_decoder.layers.10.ffn.output_proj.weight
1087
+ text_decoder.layers.10.ffn.output_proj.bias
1088
+ text_decoder.layers.10.ffn_layer_norm.weight
1089
+ text_decoder.layers.10.ffn_layer_norm.bias
1090
+ text_decoder.layers.11.self_attn.k_proj.weight
1091
+ text_decoder.layers.11.self_attn.k_proj.bias
1092
+ text_decoder.layers.11.self_attn.v_proj.weight
1093
+ text_decoder.layers.11.self_attn.v_proj.bias
1094
+ text_decoder.layers.11.self_attn.q_proj.weight
1095
+ text_decoder.layers.11.self_attn.q_proj.bias
1096
+ text_decoder.layers.11.self_attn.output_proj.weight
1097
+ text_decoder.layers.11.self_attn.output_proj.bias
1098
+ text_decoder.layers.11.self_attn_layer_norm.weight
1099
+ text_decoder.layers.11.self_attn_layer_norm.bias
1100
+ text_decoder.layers.11.encoder_decoder_attn.k_proj.weight
1101
+ text_decoder.layers.11.encoder_decoder_attn.k_proj.bias
1102
+ text_decoder.layers.11.encoder_decoder_attn.v_proj.weight
1103
+ text_decoder.layers.11.encoder_decoder_attn.v_proj.bias
1104
+ text_decoder.layers.11.encoder_decoder_attn.q_proj.weight
1105
+ text_decoder.layers.11.encoder_decoder_attn.q_proj.bias
1106
+ text_decoder.layers.11.encoder_decoder_attn.output_proj.weight
1107
+ text_decoder.layers.11.encoder_decoder_attn.output_proj.bias
1108
+ text_decoder.layers.11.encoder_decoder_attn_layer_norm.weight
1109
+ text_decoder.layers.11.encoder_decoder_attn_layer_norm.bias
1110
+ text_decoder.layers.11.ffn.inner_proj.weight
1111
+ text_decoder.layers.11.ffn.inner_proj.bias
1112
+ text_decoder.layers.11.ffn.output_proj.weight
1113
+ text_decoder.layers.11.ffn.output_proj.bias
1114
+ text_decoder.layers.11.ffn_layer_norm.weight
1115
+ text_decoder.layers.11.ffn_layer_norm.bias
1116
+ text_decoder.layers.12.self_attn.k_proj.weight
1117
+ text_decoder.layers.12.self_attn.k_proj.bias
1118
+ text_decoder.layers.12.self_attn.v_proj.weight
1119
+ text_decoder.layers.12.self_attn.v_proj.bias
1120
+ text_decoder.layers.12.self_attn.q_proj.weight
1121
+ text_decoder.layers.12.self_attn.q_proj.bias
1122
+ text_decoder.layers.12.self_attn.output_proj.weight
1123
+ text_decoder.layers.12.self_attn.output_proj.bias
1124
+ text_decoder.layers.12.self_attn_layer_norm.weight
1125
+ text_decoder.layers.12.self_attn_layer_norm.bias
1126
+ text_decoder.layers.12.encoder_decoder_attn.k_proj.weight
1127
+ text_decoder.layers.12.encoder_decoder_attn.k_proj.bias
1128
+ text_decoder.layers.12.encoder_decoder_attn.v_proj.weight
1129
+ text_decoder.layers.12.encoder_decoder_attn.v_proj.bias
1130
+ text_decoder.layers.12.encoder_decoder_attn.q_proj.weight
1131
+ text_decoder.layers.12.encoder_decoder_attn.q_proj.bias
1132
+ text_decoder.layers.12.encoder_decoder_attn.output_proj.weight
1133
+ text_decoder.layers.12.encoder_decoder_attn.output_proj.bias
1134
+ text_decoder.layers.12.encoder_decoder_attn_layer_norm.weight
1135
+ text_decoder.layers.12.encoder_decoder_attn_layer_norm.bias
1136
+ text_decoder.layers.12.ffn.inner_proj.weight
1137
+ text_decoder.layers.12.ffn.inner_proj.bias
1138
+ text_decoder.layers.12.ffn.output_proj.weight
1139
+ text_decoder.layers.12.ffn.output_proj.bias
1140
+ text_decoder.layers.12.ffn_layer_norm.weight
1141
+ text_decoder.layers.12.ffn_layer_norm.bias
1142
+ text_decoder.layers.13.self_attn.k_proj.weight
1143
+ text_decoder.layers.13.self_attn.k_proj.bias
1144
+ text_decoder.layers.13.self_attn.v_proj.weight
1145
+ text_decoder.layers.13.self_attn.v_proj.bias
1146
+ text_decoder.layers.13.self_attn.q_proj.weight
1147
+ text_decoder.layers.13.self_attn.q_proj.bias
1148
+ text_decoder.layers.13.self_attn.output_proj.weight
1149
+ text_decoder.layers.13.self_attn.output_proj.bias
1150
+ text_decoder.layers.13.self_attn_layer_norm.weight
1151
+ text_decoder.layers.13.self_attn_layer_norm.bias
1152
+ text_decoder.layers.13.encoder_decoder_attn.k_proj.weight
1153
+ text_decoder.layers.13.encoder_decoder_attn.k_proj.bias
1154
+ text_decoder.layers.13.encoder_decoder_attn.v_proj.weight
1155
+ text_decoder.layers.13.encoder_decoder_attn.v_proj.bias
1156
+ text_decoder.layers.13.encoder_decoder_attn.q_proj.weight
1157
+ text_decoder.layers.13.encoder_decoder_attn.q_proj.bias
1158
+ text_decoder.layers.13.encoder_decoder_attn.output_proj.weight
1159
+ text_decoder.layers.13.encoder_decoder_attn.output_proj.bias
1160
+ text_decoder.layers.13.encoder_decoder_attn_layer_norm.weight
1161
+ text_decoder.layers.13.encoder_decoder_attn_layer_norm.bias
1162
+ text_decoder.layers.13.ffn.inner_proj.weight
1163
+ text_decoder.layers.13.ffn.inner_proj.bias
1164
+ text_decoder.layers.13.ffn.output_proj.weight
1165
+ text_decoder.layers.13.ffn.output_proj.bias
1166
+ text_decoder.layers.13.ffn_layer_norm.weight
1167
+ text_decoder.layers.13.ffn_layer_norm.bias
1168
+ text_decoder.layers.14.self_attn.k_proj.weight
1169
+ text_decoder.layers.14.self_attn.k_proj.bias
1170
+ text_decoder.layers.14.self_attn.v_proj.weight
1171
+ text_decoder.layers.14.self_attn.v_proj.bias
1172
+ text_decoder.layers.14.self_attn.q_proj.weight
1173
+ text_decoder.layers.14.self_attn.q_proj.bias
1174
+ text_decoder.layers.14.self_attn.output_proj.weight
1175
+ text_decoder.layers.14.self_attn.output_proj.bias
1176
+ text_decoder.layers.14.self_attn_layer_norm.weight
1177
+ text_decoder.layers.14.self_attn_layer_norm.bias
1178
+ text_decoder.layers.14.encoder_decoder_attn.k_proj.weight
1179
+ text_decoder.layers.14.encoder_decoder_attn.k_proj.bias
1180
+ text_decoder.layers.14.encoder_decoder_attn.v_proj.weight
1181
+ text_decoder.layers.14.encoder_decoder_attn.v_proj.bias
1182
+ text_decoder.layers.14.encoder_decoder_attn.q_proj.weight
1183
+ text_decoder.layers.14.encoder_decoder_attn.q_proj.bias
1184
+ text_decoder.layers.14.encoder_decoder_attn.output_proj.weight
1185
+ text_decoder.layers.14.encoder_decoder_attn.output_proj.bias
1186
+ text_decoder.layers.14.encoder_decoder_attn_layer_norm.weight
1187
+ text_decoder.layers.14.encoder_decoder_attn_layer_norm.bias
1188
+ text_decoder.layers.14.ffn.inner_proj.weight
1189
+ text_decoder.layers.14.ffn.inner_proj.bias
1190
+ text_decoder.layers.14.ffn.output_proj.weight
1191
+ text_decoder.layers.14.ffn.output_proj.bias
1192
+ text_decoder.layers.14.ffn_layer_norm.weight
1193
+ text_decoder.layers.14.ffn_layer_norm.bias
1194
+ text_decoder.layers.15.self_attn.k_proj.weight
1195
+ text_decoder.layers.15.self_attn.k_proj.bias
1196
+ text_decoder.layers.15.self_attn.v_proj.weight
1197
+ text_decoder.layers.15.self_attn.v_proj.bias
1198
+ text_decoder.layers.15.self_attn.q_proj.weight
1199
+ text_decoder.layers.15.self_attn.q_proj.bias
1200
+ text_decoder.layers.15.self_attn.output_proj.weight
1201
+ text_decoder.layers.15.self_attn.output_proj.bias
1202
+ text_decoder.layers.15.self_attn_layer_norm.weight
1203
+ text_decoder.layers.15.self_attn_layer_norm.bias
1204
+ text_decoder.layers.15.encoder_decoder_attn.k_proj.weight
1205
+ text_decoder.layers.15.encoder_decoder_attn.k_proj.bias
1206
+ text_decoder.layers.15.encoder_decoder_attn.v_proj.weight
1207
+ text_decoder.layers.15.encoder_decoder_attn.v_proj.bias
1208
+ text_decoder.layers.15.encoder_decoder_attn.q_proj.weight
1209
+ text_decoder.layers.15.encoder_decoder_attn.q_proj.bias
1210
+ text_decoder.layers.15.encoder_decoder_attn.output_proj.weight
1211
+ text_decoder.layers.15.encoder_decoder_attn.output_proj.bias
1212
+ text_decoder.layers.15.encoder_decoder_attn_layer_norm.weight
1213
+ text_decoder.layers.15.encoder_decoder_attn_layer_norm.bias
1214
+ text_decoder.layers.15.ffn.inner_proj.weight
1215
+ text_decoder.layers.15.ffn.inner_proj.bias
1216
+ text_decoder.layers.15.ffn.output_proj.weight
1217
+ text_decoder.layers.15.ffn.output_proj.bias
1218
+ text_decoder.layers.15.ffn_layer_norm.weight
1219
+ text_decoder.layers.15.ffn_layer_norm.bias
1220
+ text_decoder.layers.16.self_attn.k_proj.weight
1221
+ text_decoder.layers.16.self_attn.k_proj.bias
1222
+ text_decoder.layers.16.self_attn.v_proj.weight
1223
+ text_decoder.layers.16.self_attn.v_proj.bias
1224
+ text_decoder.layers.16.self_attn.q_proj.weight
1225
+ text_decoder.layers.16.self_attn.q_proj.bias
1226
+ text_decoder.layers.16.self_attn.output_proj.weight
1227
+ text_decoder.layers.16.self_attn.output_proj.bias
1228
+ text_decoder.layers.16.self_attn_layer_norm.weight
1229
+ text_decoder.layers.16.self_attn_layer_norm.bias
1230
+ text_decoder.layers.16.encoder_decoder_attn.k_proj.weight
1231
+ text_decoder.layers.16.encoder_decoder_attn.k_proj.bias
1232
+ text_decoder.layers.16.encoder_decoder_attn.v_proj.weight
1233
+ text_decoder.layers.16.encoder_decoder_attn.v_proj.bias
1234
+ text_decoder.layers.16.encoder_decoder_attn.q_proj.weight
1235
+ text_decoder.layers.16.encoder_decoder_attn.q_proj.bias
1236
+ text_decoder.layers.16.encoder_decoder_attn.output_proj.weight
1237
+ text_decoder.layers.16.encoder_decoder_attn.output_proj.bias
1238
+ text_decoder.layers.16.encoder_decoder_attn_layer_norm.weight
1239
+ text_decoder.layers.16.encoder_decoder_attn_layer_norm.bias
1240
+ text_decoder.layers.16.ffn.inner_proj.weight
1241
+ text_decoder.layers.16.ffn.inner_proj.bias
1242
+ text_decoder.layers.16.ffn.output_proj.weight
1243
+ text_decoder.layers.16.ffn.output_proj.bias
1244
+ text_decoder.layers.16.ffn_layer_norm.weight
1245
+ text_decoder.layers.16.ffn_layer_norm.bias
1246
+ text_decoder.layers.17.self_attn.k_proj.weight
1247
+ text_decoder.layers.17.self_attn.k_proj.bias
1248
+ text_decoder.layers.17.self_attn.v_proj.weight
1249
+ text_decoder.layers.17.self_attn.v_proj.bias
1250
+ text_decoder.layers.17.self_attn.q_proj.weight
1251
+ text_decoder.layers.17.self_attn.q_proj.bias
1252
+ text_decoder.layers.17.self_attn.output_proj.weight
1253
+ text_decoder.layers.17.self_attn.output_proj.bias
1254
+ text_decoder.layers.17.self_attn_layer_norm.weight
1255
+ text_decoder.layers.17.self_attn_layer_norm.bias
1256
+ text_decoder.layers.17.encoder_decoder_attn.k_proj.weight
1257
+ text_decoder.layers.17.encoder_decoder_attn.k_proj.bias
1258
+ text_decoder.layers.17.encoder_decoder_attn.v_proj.weight
1259
+ text_decoder.layers.17.encoder_decoder_attn.v_proj.bias
1260
+ text_decoder.layers.17.encoder_decoder_attn.q_proj.weight
1261
+ text_decoder.layers.17.encoder_decoder_attn.q_proj.bias
1262
+ text_decoder.layers.17.encoder_decoder_attn.output_proj.weight
1263
+ text_decoder.layers.17.encoder_decoder_attn.output_proj.bias
1264
+ text_decoder.layers.17.encoder_decoder_attn_layer_norm.weight
1265
+ text_decoder.layers.17.encoder_decoder_attn_layer_norm.bias
1266
+ text_decoder.layers.17.ffn.inner_proj.weight
1267
+ text_decoder.layers.17.ffn.inner_proj.bias
1268
+ text_decoder.layers.17.ffn.output_proj.weight
1269
+ text_decoder.layers.17.ffn.output_proj.bias
1270
+ text_decoder.layers.17.ffn_layer_norm.weight
1271
+ text_decoder.layers.17.ffn_layer_norm.bias
1272
+ text_decoder.layers.18.self_attn.k_proj.weight
1273
+ text_decoder.layers.18.self_attn.k_proj.bias
1274
+ text_decoder.layers.18.self_attn.v_proj.weight
1275
+ text_decoder.layers.18.self_attn.v_proj.bias
1276
+ text_decoder.layers.18.self_attn.q_proj.weight
1277
+ text_decoder.layers.18.self_attn.q_proj.bias
1278
+ text_decoder.layers.18.self_attn.output_proj.weight
1279
+ text_decoder.layers.18.self_attn.output_proj.bias
1280
+ text_decoder.layers.18.self_attn_layer_norm.weight
1281
+ text_decoder.layers.18.self_attn_layer_norm.bias
1282
+ text_decoder.layers.18.encoder_decoder_attn.k_proj.weight
1283
+ text_decoder.layers.18.encoder_decoder_attn.k_proj.bias
1284
+ text_decoder.layers.18.encoder_decoder_attn.v_proj.weight
1285
+ text_decoder.layers.18.encoder_decoder_attn.v_proj.bias
1286
+ text_decoder.layers.18.encoder_decoder_attn.q_proj.weight
1287
+ text_decoder.layers.18.encoder_decoder_attn.q_proj.bias
1288
+ text_decoder.layers.18.encoder_decoder_attn.output_proj.weight
1289
+ text_decoder.layers.18.encoder_decoder_attn.output_proj.bias
1290
+ text_decoder.layers.18.encoder_decoder_attn_layer_norm.weight
1291
+ text_decoder.layers.18.encoder_decoder_attn_layer_norm.bias
1292
+ text_decoder.layers.18.ffn.inner_proj.weight
1293
+ text_decoder.layers.18.ffn.inner_proj.bias
1294
+ text_decoder.layers.18.ffn.output_proj.weight
1295
+ text_decoder.layers.18.ffn.output_proj.bias
1296
+ text_decoder.layers.18.ffn_layer_norm.weight
1297
+ text_decoder.layers.18.ffn_layer_norm.bias
1298
+ text_decoder.layers.19.self_attn.k_proj.weight
1299
+ text_decoder.layers.19.self_attn.k_proj.bias
1300
+ text_decoder.layers.19.self_attn.v_proj.weight
1301
+ text_decoder.layers.19.self_attn.v_proj.bias
1302
+ text_decoder.layers.19.self_attn.q_proj.weight
1303
+ text_decoder.layers.19.self_attn.q_proj.bias
1304
+ text_decoder.layers.19.self_attn.output_proj.weight
1305
+ text_decoder.layers.19.self_attn.output_proj.bias
1306
+ text_decoder.layers.19.self_attn_layer_norm.weight
1307
+ text_decoder.layers.19.self_attn_layer_norm.bias
1308
+ text_decoder.layers.19.encoder_decoder_attn.k_proj.weight
1309
+ text_decoder.layers.19.encoder_decoder_attn.k_proj.bias
1310
+ text_decoder.layers.19.encoder_decoder_attn.v_proj.weight
1311
+ text_decoder.layers.19.encoder_decoder_attn.v_proj.bias
1312
+ text_decoder.layers.19.encoder_decoder_attn.q_proj.weight
1313
+ text_decoder.layers.19.encoder_decoder_attn.q_proj.bias
1314
+ text_decoder.layers.19.encoder_decoder_attn.output_proj.weight
1315
+ text_decoder.layers.19.encoder_decoder_attn.output_proj.bias
1316
+ text_decoder.layers.19.encoder_decoder_attn_layer_norm.weight
1317
+ text_decoder.layers.19.encoder_decoder_attn_layer_norm.bias
1318
+ text_decoder.layers.19.ffn.inner_proj.weight
1319
+ text_decoder.layers.19.ffn.inner_proj.bias
1320
+ text_decoder.layers.19.ffn.output_proj.weight
1321
+ text_decoder.layers.19.ffn.output_proj.bias
1322
+ text_decoder.layers.19.ffn_layer_norm.weight
1323
+ text_decoder.layers.19.ffn_layer_norm.bias
1324
+ text_decoder.layers.20.self_attn.k_proj.weight
1325
+ text_decoder.layers.20.self_attn.k_proj.bias
1326
+ text_decoder.layers.20.self_attn.v_proj.weight
1327
+ text_decoder.layers.20.self_attn.v_proj.bias
1328
+ text_decoder.layers.20.self_attn.q_proj.weight
1329
+ text_decoder.layers.20.self_attn.q_proj.bias
1330
+ text_decoder.layers.20.self_attn.output_proj.weight
1331
+ text_decoder.layers.20.self_attn.output_proj.bias
1332
+ text_decoder.layers.20.self_attn_layer_norm.weight
1333
+ text_decoder.layers.20.self_attn_layer_norm.bias
1334
+ text_decoder.layers.20.encoder_decoder_attn.k_proj.weight
1335
+ text_decoder.layers.20.encoder_decoder_attn.k_proj.bias
1336
+ text_decoder.layers.20.encoder_decoder_attn.v_proj.weight
1337
+ text_decoder.layers.20.encoder_decoder_attn.v_proj.bias
1338
+ text_decoder.layers.20.encoder_decoder_attn.q_proj.weight
1339
+ text_decoder.layers.20.encoder_decoder_attn.q_proj.bias
1340
+ text_decoder.layers.20.encoder_decoder_attn.output_proj.weight
1341
+ text_decoder.layers.20.encoder_decoder_attn.output_proj.bias
1342
+ text_decoder.layers.20.encoder_decoder_attn_layer_norm.weight
1343
+ text_decoder.layers.20.encoder_decoder_attn_layer_norm.bias
1344
+ text_decoder.layers.20.ffn.inner_proj.weight
1345
+ text_decoder.layers.20.ffn.inner_proj.bias
1346
+ text_decoder.layers.20.ffn.output_proj.weight
1347
+ text_decoder.layers.20.ffn.output_proj.bias
1348
+ text_decoder.layers.20.ffn_layer_norm.weight
1349
+ text_decoder.layers.20.ffn_layer_norm.bias
1350
+ text_decoder.layers.21.self_attn.k_proj.weight
1351
+ text_decoder.layers.21.self_attn.k_proj.bias
1352
+ text_decoder.layers.21.self_attn.v_proj.weight
1353
+ text_decoder.layers.21.self_attn.v_proj.bias
1354
+ text_decoder.layers.21.self_attn.q_proj.weight
1355
+ text_decoder.layers.21.self_attn.q_proj.bias
1356
+ text_decoder.layers.21.self_attn.output_proj.weight
1357
+ text_decoder.layers.21.self_attn.output_proj.bias
1358
+ text_decoder.layers.21.self_attn_layer_norm.weight
1359
+ text_decoder.layers.21.self_attn_layer_norm.bias
1360
+ text_decoder.layers.21.encoder_decoder_attn.k_proj.weight
1361
+ text_decoder.layers.21.encoder_decoder_attn.k_proj.bias
1362
+ text_decoder.layers.21.encoder_decoder_attn.v_proj.weight
1363
+ text_decoder.layers.21.encoder_decoder_attn.v_proj.bias
1364
+ text_decoder.layers.21.encoder_decoder_attn.q_proj.weight
1365
+ text_decoder.layers.21.encoder_decoder_attn.q_proj.bias
1366
+ text_decoder.layers.21.encoder_decoder_attn.output_proj.weight
1367
+ text_decoder.layers.21.encoder_decoder_attn.output_proj.bias
1368
+ text_decoder.layers.21.encoder_decoder_attn_layer_norm.weight
1369
+ text_decoder.layers.21.encoder_decoder_attn_layer_norm.bias
1370
+ text_decoder.layers.21.ffn.inner_proj.weight
1371
+ text_decoder.layers.21.ffn.inner_proj.bias
1372
+ text_decoder.layers.21.ffn.output_proj.weight
1373
+ text_decoder.layers.21.ffn.output_proj.bias
1374
+ text_decoder.layers.21.ffn_layer_norm.weight
1375
+ text_decoder.layers.21.ffn_layer_norm.bias
1376
+ text_decoder.layers.22.self_attn.k_proj.weight
1377
+ text_decoder.layers.22.self_attn.k_proj.bias
1378
+ text_decoder.layers.22.self_attn.v_proj.weight
1379
+ text_decoder.layers.22.self_attn.v_proj.bias
1380
+ text_decoder.layers.22.self_attn.q_proj.weight
1381
+ text_decoder.layers.22.self_attn.q_proj.bias
1382
+ text_decoder.layers.22.self_attn.output_proj.weight
1383
+ text_decoder.layers.22.self_attn.output_proj.bias
1384
+ text_decoder.layers.22.self_attn_layer_norm.weight
1385
+ text_decoder.layers.22.self_attn_layer_norm.bias
1386
+ text_decoder.layers.22.encoder_decoder_attn.k_proj.weight
1387
+ text_decoder.layers.22.encoder_decoder_attn.k_proj.bias
1388
+ text_decoder.layers.22.encoder_decoder_attn.v_proj.weight
1389
+ text_decoder.layers.22.encoder_decoder_attn.v_proj.bias
1390
+ text_decoder.layers.22.encoder_decoder_attn.q_proj.weight
1391
+ text_decoder.layers.22.encoder_decoder_attn.q_proj.bias
1392
+ text_decoder.layers.22.encoder_decoder_attn.output_proj.weight
1393
+ text_decoder.layers.22.encoder_decoder_attn.output_proj.bias
1394
+ text_decoder.layers.22.encoder_decoder_attn_layer_norm.weight
1395
+ text_decoder.layers.22.encoder_decoder_attn_layer_norm.bias
1396
+ text_decoder.layers.22.ffn.inner_proj.weight
1397
+ text_decoder.layers.22.ffn.inner_proj.bias
1398
+ text_decoder.layers.22.ffn.output_proj.weight
1399
+ text_decoder.layers.22.ffn.output_proj.bias
1400
+ text_decoder.layers.22.ffn_layer_norm.weight
1401
+ text_decoder.layers.22.ffn_layer_norm.bias
1402
+ text_decoder.layers.23.self_attn.k_proj.weight
1403
+ text_decoder.layers.23.self_attn.k_proj.bias
1404
+ text_decoder.layers.23.self_attn.v_proj.weight
1405
+ text_decoder.layers.23.self_attn.v_proj.bias
1406
+ text_decoder.layers.23.self_attn.q_proj.weight
1407
+ text_decoder.layers.23.self_attn.q_proj.bias
1408
+ text_decoder.layers.23.self_attn.output_proj.weight
1409
+ text_decoder.layers.23.self_attn.output_proj.bias
1410
+ text_decoder.layers.23.self_attn_layer_norm.weight
1411
+ text_decoder.layers.23.self_attn_layer_norm.bias
1412
+ text_decoder.layers.23.encoder_decoder_attn.k_proj.weight
1413
+ text_decoder.layers.23.encoder_decoder_attn.k_proj.bias
1414
+ text_decoder.layers.23.encoder_decoder_attn.v_proj.weight
1415
+ text_decoder.layers.23.encoder_decoder_attn.v_proj.bias
1416
+ text_decoder.layers.23.encoder_decoder_attn.q_proj.weight
1417
+ text_decoder.layers.23.encoder_decoder_attn.q_proj.bias
1418
+ text_decoder.layers.23.encoder_decoder_attn.output_proj.weight
1419
+ text_decoder.layers.23.encoder_decoder_attn.output_proj.bias
1420
+ text_decoder.layers.23.encoder_decoder_attn_layer_norm.weight
1421
+ text_decoder.layers.23.encoder_decoder_attn_layer_norm.bias
1422
+ text_decoder.layers.23.ffn.inner_proj.weight
1423
+ text_decoder.layers.23.ffn.inner_proj.bias
1424
+ text_decoder.layers.23.ffn.output_proj.weight
1425
+ text_decoder.layers.23.ffn.output_proj.bias
1426
+ text_decoder.layers.23.ffn_layer_norm.weight
1427
+ text_decoder.layers.23.ffn_layer_norm.bias
1428
+ text_decoder.layer_norm.weight
1429
+ text_decoder.layer_norm.bias
1430
+ final_proj.weight
1431
+ t2u_model.encoder.layers.0.self_attn.k_proj.weight
1432
+ t2u_model.encoder.layers.0.self_attn.k_proj.bias
1433
+ t2u_model.encoder.layers.0.self_attn.v_proj.weight
1434
+ t2u_model.encoder.layers.0.self_attn.v_proj.bias
1435
+ t2u_model.encoder.layers.0.self_attn.q_proj.weight
1436
+ t2u_model.encoder.layers.0.self_attn.q_proj.bias
1437
+ t2u_model.encoder.layers.0.self_attn.output_proj.weight
1438
+ t2u_model.encoder.layers.0.self_attn.output_proj.bias
1439
+ t2u_model.encoder.layers.0.self_attn_layer_norm.weight
1440
+ t2u_model.encoder.layers.0.self_attn_layer_norm.bias
1441
+ t2u_model.encoder.layers.0.ffn.inner_proj.weight
1442
+ t2u_model.encoder.layers.0.ffn.inner_proj.bias
1443
+ t2u_model.encoder.layers.0.ffn.output_proj.weight
1444
+ t2u_model.encoder.layers.0.ffn.output_proj.bias
1445
+ t2u_model.encoder.layers.0.ffn_layer_norm.weight
1446
+ t2u_model.encoder.layers.0.ffn_layer_norm.bias
1447
+ t2u_model.encoder.layers.1.self_attn.k_proj.weight
1448
+ t2u_model.encoder.layers.1.self_attn.k_proj.bias
1449
+ t2u_model.encoder.layers.1.self_attn.v_proj.weight
1450
+ t2u_model.encoder.layers.1.self_attn.v_proj.bias
1451
+ t2u_model.encoder.layers.1.self_attn.q_proj.weight
1452
+ t2u_model.encoder.layers.1.self_attn.q_proj.bias
1453
+ t2u_model.encoder.layers.1.self_attn.output_proj.weight
1454
+ t2u_model.encoder.layers.1.self_attn.output_proj.bias
1455
+ t2u_model.encoder.layers.1.self_attn_layer_norm.weight
1456
+ t2u_model.encoder.layers.1.self_attn_layer_norm.bias
1457
+ t2u_model.encoder.layers.1.ffn.inner_proj.weight
1458
+ t2u_model.encoder.layers.1.ffn.inner_proj.bias
1459
+ t2u_model.encoder.layers.1.ffn.output_proj.weight
1460
+ t2u_model.encoder.layers.1.ffn.output_proj.bias
1461
+ t2u_model.encoder.layers.1.ffn_layer_norm.weight
1462
+ t2u_model.encoder.layers.1.ffn_layer_norm.bias
1463
+ t2u_model.encoder.layers.2.self_attn.k_proj.weight
1464
+ t2u_model.encoder.layers.2.self_attn.k_proj.bias
1465
+ t2u_model.encoder.layers.2.self_attn.v_proj.weight
1466
+ t2u_model.encoder.layers.2.self_attn.v_proj.bias
1467
+ t2u_model.encoder.layers.2.self_attn.q_proj.weight
1468
+ t2u_model.encoder.layers.2.self_attn.q_proj.bias
1469
+ t2u_model.encoder.layers.2.self_attn.output_proj.weight
1470
+ t2u_model.encoder.layers.2.self_attn.output_proj.bias
1471
+ t2u_model.encoder.layers.2.self_attn_layer_norm.weight
1472
+ t2u_model.encoder.layers.2.self_attn_layer_norm.bias
1473
+ t2u_model.encoder.layers.2.ffn.inner_proj.weight
1474
+ t2u_model.encoder.layers.2.ffn.inner_proj.bias
1475
+ t2u_model.encoder.layers.2.ffn.output_proj.weight
1476
+ t2u_model.encoder.layers.2.ffn.output_proj.bias
1477
+ t2u_model.encoder.layers.2.ffn_layer_norm.weight
1478
+ t2u_model.encoder.layers.2.ffn_layer_norm.bias
1479
+ t2u_model.encoder.layers.3.self_attn.k_proj.weight
1480
+ t2u_model.encoder.layers.3.self_attn.k_proj.bias
1481
+ t2u_model.encoder.layers.3.self_attn.v_proj.weight
1482
+ t2u_model.encoder.layers.3.self_attn.v_proj.bias
1483
+ t2u_model.encoder.layers.3.self_attn.q_proj.weight
1484
+ t2u_model.encoder.layers.3.self_attn.q_proj.bias
1485
+ t2u_model.encoder.layers.3.self_attn.output_proj.weight
1486
+ t2u_model.encoder.layers.3.self_attn.output_proj.bias
1487
+ t2u_model.encoder.layers.3.self_attn_layer_norm.weight
1488
+ t2u_model.encoder.layers.3.self_attn_layer_norm.bias
1489
+ t2u_model.encoder.layers.3.ffn.inner_proj.weight
1490
+ t2u_model.encoder.layers.3.ffn.inner_proj.bias
1491
+ t2u_model.encoder.layers.3.ffn.output_proj.weight
1492
+ t2u_model.encoder.layers.3.ffn.output_proj.bias
1493
+ t2u_model.encoder.layers.3.ffn_layer_norm.weight
1494
+ t2u_model.encoder.layers.3.ffn_layer_norm.bias
1495
+ t2u_model.encoder.layer_norm.weight
1496
+ t2u_model.encoder.layer_norm.bias
1497
+ t2u_model.decoder_frontend.pos_emb_alpha
1498
+ t2u_model.decoder_frontend.pos_emb_alpha_char
1499
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.conv1.0.weight
1500
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.conv1.0.bias
1501
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.ln1.weight
1502
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.ln1.bias
1503
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.conv2.0.weight
1504
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.conv2.0.bias
1505
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.ln2.weight
1506
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.ln2.bias
1507
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.proj.weight
1508
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.proj.bias
1509
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.film.s_gamma
1510
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.film.s_beta
1511
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.film.proj.weight
1512
+ t2u_model.decoder_frontend.variance_adaptor.duration_predictor.film.proj.bias
1513
+ t2u_model.decoder.layers.0.self_attn.k_proj.weight
1514
+ t2u_model.decoder.layers.0.self_attn.k_proj.bias
1515
+ t2u_model.decoder.layers.0.self_attn.v_proj.weight
1516
+ t2u_model.decoder.layers.0.self_attn.v_proj.bias
1517
+ t2u_model.decoder.layers.0.self_attn.q_proj.weight
1518
+ t2u_model.decoder.layers.0.self_attn.q_proj.bias
1519
+ t2u_model.decoder.layers.0.self_attn.output_proj.weight
1520
+ t2u_model.decoder.layers.0.self_attn.output_proj.bias
1521
+ t2u_model.decoder.layers.0.self_attn_layer_norm.weight
1522
+ t2u_model.decoder.layers.0.self_attn_layer_norm.bias
1523
+ t2u_model.decoder.layers.0.conv1d.conv1.weight
1524
+ t2u_model.decoder.layers.0.conv1d.conv1.bias
1525
+ t2u_model.decoder.layers.0.conv1d.conv2.weight
1526
+ t2u_model.decoder.layers.0.conv1d.conv2.bias
1527
+ t2u_model.decoder.layers.0.conv1d_layer_norm.weight
1528
+ t2u_model.decoder.layers.0.conv1d_layer_norm.bias
1529
+ t2u_model.decoder.layers.0.film.s_gamma
1530
+ t2u_model.decoder.layers.0.film.s_beta
1531
+ t2u_model.decoder.layers.0.film.proj.weight
1532
+ t2u_model.decoder.layers.0.film.proj.bias
1533
+ t2u_model.decoder.layers.1.self_attn.k_proj.weight
1534
+ t2u_model.decoder.layers.1.self_attn.k_proj.bias
1535
+ t2u_model.decoder.layers.1.self_attn.v_proj.weight
1536
+ t2u_model.decoder.layers.1.self_attn.v_proj.bias
1537
+ t2u_model.decoder.layers.1.self_attn.q_proj.weight
1538
+ t2u_model.decoder.layers.1.self_attn.q_proj.bias
1539
+ t2u_model.decoder.layers.1.self_attn.output_proj.weight
1540
+ t2u_model.decoder.layers.1.self_attn.output_proj.bias
1541
+ t2u_model.decoder.layers.1.self_attn_layer_norm.weight
1542
+ t2u_model.decoder.layers.1.self_attn_layer_norm.bias
1543
+ t2u_model.decoder.layers.1.conv1d.conv1.weight
1544
+ t2u_model.decoder.layers.1.conv1d.conv1.bias
1545
+ t2u_model.decoder.layers.1.conv1d.conv2.weight
1546
+ t2u_model.decoder.layers.1.conv1d.conv2.bias
1547
+ t2u_model.decoder.layers.1.conv1d_layer_norm.weight
1548
+ t2u_model.decoder.layers.1.conv1d_layer_norm.bias
1549
+ t2u_model.decoder.layers.1.film.s_gamma
1550
+ t2u_model.decoder.layers.1.film.s_beta
1551
+ t2u_model.decoder.layers.1.film.proj.weight
1552
+ t2u_model.decoder.layers.1.film.proj.bias
1553
+ t2u_model.decoder.layers.2.self_attn.k_proj.weight
1554
+ t2u_model.decoder.layers.2.self_attn.k_proj.bias
1555
+ t2u_model.decoder.layers.2.self_attn.v_proj.weight
1556
+ t2u_model.decoder.layers.2.self_attn.v_proj.bias
1557
+ t2u_model.decoder.layers.2.self_attn.q_proj.weight
1558
+ t2u_model.decoder.layers.2.self_attn.q_proj.bias
1559
+ t2u_model.decoder.layers.2.self_attn.output_proj.weight
1560
+ t2u_model.decoder.layers.2.self_attn.output_proj.bias
1561
+ t2u_model.decoder.layers.2.self_attn_layer_norm.weight
1562
+ t2u_model.decoder.layers.2.self_attn_layer_norm.bias
1563
+ t2u_model.decoder.layers.2.conv1d.conv1.weight
1564
+ t2u_model.decoder.layers.2.conv1d.conv1.bias
1565
+ t2u_model.decoder.layers.2.conv1d.conv2.weight
1566
+ t2u_model.decoder.layers.2.conv1d.conv2.bias
1567
+ t2u_model.decoder.layers.2.conv1d_layer_norm.weight
1568
+ t2u_model.decoder.layers.2.conv1d_layer_norm.bias
1569
+ t2u_model.decoder.layers.2.film.s_gamma
1570
+ t2u_model.decoder.layers.2.film.s_beta
1571
+ t2u_model.decoder.layers.2.film.proj.weight
1572
+ t2u_model.decoder.layers.2.film.proj.bias
1573
+ t2u_model.decoder.layers.3.self_attn.k_proj.weight
1574
+ t2u_model.decoder.layers.3.self_attn.k_proj.bias
1575
+ t2u_model.decoder.layers.3.self_attn.v_proj.weight
1576
+ t2u_model.decoder.layers.3.self_attn.v_proj.bias
1577
+ t2u_model.decoder.layers.3.self_attn.q_proj.weight
1578
+ t2u_model.decoder.layers.3.self_attn.q_proj.bias
1579
+ t2u_model.decoder.layers.3.self_attn.output_proj.weight
1580
+ t2u_model.decoder.layers.3.self_attn.output_proj.bias
1581
+ t2u_model.decoder.layers.3.self_attn_layer_norm.weight
1582
+ t2u_model.decoder.layers.3.self_attn_layer_norm.bias
1583
+ t2u_model.decoder.layers.3.conv1d.conv1.weight
1584
+ t2u_model.decoder.layers.3.conv1d.conv1.bias
1585
+ t2u_model.decoder.layers.3.conv1d.conv2.weight
1586
+ t2u_model.decoder.layers.3.conv1d.conv2.bias
1587
+ t2u_model.decoder.layers.3.conv1d_layer_norm.weight
1588
+ t2u_model.decoder.layers.3.conv1d_layer_norm.bias
1589
+ t2u_model.decoder.layers.3.film.s_gamma
1590
+ t2u_model.decoder.layers.3.film.s_beta
1591
+ t2u_model.decoder.layers.3.film.proj.weight
1592
+ t2u_model.decoder.layers.3.film.proj.bias
1593
+ t2u_model.decoder.layer_norm.weight
1594
+ t2u_model.decoder.layer_norm.bias
1595
+ t2u_model.decoder_frontend.embed_char.weight
1596
+ t2u_model.decoder_frontend.embed.weight
1597
+ t2u_model.final_proj.weight
1598
+ t2u_model.prosody_proj.weight
1599
+ t2u_model.prosody_proj.bias
1600
+ prosody_encoder_model.blocks.0.conv.weight
1601
+ prosody_encoder_model.blocks.0.conv.bias
1602
+ prosody_encoder_model.blocks.0.norm.weight
1603
+ prosody_encoder_model.blocks.0.norm.bias
1604
+ prosody_encoder_model.blocks.1.tdnn1.conv.weight
1605
+ prosody_encoder_model.blocks.1.tdnn1.conv.bias
1606
+ prosody_encoder_model.blocks.1.tdnn1.norm.weight
1607
+ prosody_encoder_model.blocks.1.tdnn1.norm.bias
1608
+ prosody_encoder_model.blocks.1.res2net_block.blocks.0.conv.weight
1609
+ prosody_encoder_model.blocks.1.res2net_block.blocks.0.conv.bias
1610
+ prosody_encoder_model.blocks.1.res2net_block.blocks.0.norm.weight
1611
+ prosody_encoder_model.blocks.1.res2net_block.blocks.0.norm.bias
1612
+ prosody_encoder_model.blocks.1.res2net_block.blocks.1.conv.weight
1613
+ prosody_encoder_model.blocks.1.res2net_block.blocks.1.conv.bias
1614
+ prosody_encoder_model.blocks.1.res2net_block.blocks.1.norm.weight
1615
+ prosody_encoder_model.blocks.1.res2net_block.blocks.1.norm.bias
1616
+ prosody_encoder_model.blocks.1.res2net_block.blocks.2.conv.weight
1617
+ prosody_encoder_model.blocks.1.res2net_block.blocks.2.conv.bias
1618
+ prosody_encoder_model.blocks.1.res2net_block.blocks.2.norm.weight
1619
+ prosody_encoder_model.blocks.1.res2net_block.blocks.2.norm.bias
1620
+ prosody_encoder_model.blocks.1.res2net_block.blocks.3.conv.weight
1621
+ prosody_encoder_model.blocks.1.res2net_block.blocks.3.conv.bias
1622
+ prosody_encoder_model.blocks.1.res2net_block.blocks.3.norm.weight
1623
+ prosody_encoder_model.blocks.1.res2net_block.blocks.3.norm.bias
1624
+ prosody_encoder_model.blocks.1.res2net_block.blocks.4.conv.weight
1625
+ prosody_encoder_model.blocks.1.res2net_block.blocks.4.conv.bias
1626
+ prosody_encoder_model.blocks.1.res2net_block.blocks.4.norm.weight
1627
+ prosody_encoder_model.blocks.1.res2net_block.blocks.4.norm.bias
1628
+ prosody_encoder_model.blocks.1.res2net_block.blocks.5.conv.weight
1629
+ prosody_encoder_model.blocks.1.res2net_block.blocks.5.conv.bias
1630
+ prosody_encoder_model.blocks.1.res2net_block.blocks.5.norm.weight
1631
+ prosody_encoder_model.blocks.1.res2net_block.blocks.5.norm.bias
1632
+ prosody_encoder_model.blocks.1.res2net_block.blocks.6.conv.weight
1633
+ prosody_encoder_model.blocks.1.res2net_block.blocks.6.conv.bias
1634
+ prosody_encoder_model.blocks.1.res2net_block.blocks.6.norm.weight
1635
+ prosody_encoder_model.blocks.1.res2net_block.blocks.6.norm.bias
1636
+ prosody_encoder_model.blocks.1.tdnn2.conv.weight
1637
+ prosody_encoder_model.blocks.1.tdnn2.conv.bias
1638
+ prosody_encoder_model.blocks.1.tdnn2.norm.weight
1639
+ prosody_encoder_model.blocks.1.tdnn2.norm.bias
1640
+ prosody_encoder_model.blocks.1.se_block.conv1.weight
1641
+ prosody_encoder_model.blocks.1.se_block.conv1.bias
1642
+ prosody_encoder_model.blocks.1.se_block.conv2.weight
1643
+ prosody_encoder_model.blocks.1.se_block.conv2.bias
1644
+ prosody_encoder_model.blocks.2.tdnn1.conv.weight
1645
+ prosody_encoder_model.blocks.2.tdnn1.conv.bias
1646
+ prosody_encoder_model.blocks.2.tdnn1.norm.weight
1647
+ prosody_encoder_model.blocks.2.tdnn1.norm.bias
1648
+ prosody_encoder_model.blocks.2.res2net_block.blocks.0.conv.weight
1649
+ prosody_encoder_model.blocks.2.res2net_block.blocks.0.conv.bias
1650
+ prosody_encoder_model.blocks.2.res2net_block.blocks.0.norm.weight
1651
+ prosody_encoder_model.blocks.2.res2net_block.blocks.0.norm.bias
1652
+ prosody_encoder_model.blocks.2.res2net_block.blocks.1.conv.weight
1653
+ prosody_encoder_model.blocks.2.res2net_block.blocks.1.conv.bias
1654
+ prosody_encoder_model.blocks.2.res2net_block.blocks.1.norm.weight
1655
+ prosody_encoder_model.blocks.2.res2net_block.blocks.1.norm.bias
1656
+ prosody_encoder_model.blocks.2.res2net_block.blocks.2.conv.weight
1657
+ prosody_encoder_model.blocks.2.res2net_block.blocks.2.conv.bias
1658
+ prosody_encoder_model.blocks.2.res2net_block.blocks.2.norm.weight
1659
+ prosody_encoder_model.blocks.2.res2net_block.blocks.2.norm.bias
1660
+ prosody_encoder_model.blocks.2.res2net_block.blocks.3.conv.weight
1661
+ prosody_encoder_model.blocks.2.res2net_block.blocks.3.conv.bias
1662
+ prosody_encoder_model.blocks.2.res2net_block.blocks.3.norm.weight
1663
+ prosody_encoder_model.blocks.2.res2net_block.blocks.3.norm.bias
1664
+ prosody_encoder_model.blocks.2.res2net_block.blocks.4.conv.weight
1665
+ prosody_encoder_model.blocks.2.res2net_block.blocks.4.conv.bias
1666
+ prosody_encoder_model.blocks.2.res2net_block.blocks.4.norm.weight
1667
+ prosody_encoder_model.blocks.2.res2net_block.blocks.4.norm.bias
1668
+ prosody_encoder_model.blocks.2.res2net_block.blocks.5.conv.weight
1669
+ prosody_encoder_model.blocks.2.res2net_block.blocks.5.conv.bias
1670
+ prosody_encoder_model.blocks.2.res2net_block.blocks.5.norm.weight
1671
+ prosody_encoder_model.blocks.2.res2net_block.blocks.5.norm.bias
1672
+ prosody_encoder_model.blocks.2.res2net_block.blocks.6.conv.weight
1673
+ prosody_encoder_model.blocks.2.res2net_block.blocks.6.conv.bias
1674
+ prosody_encoder_model.blocks.2.res2net_block.blocks.6.norm.weight
1675
+ prosody_encoder_model.blocks.2.res2net_block.blocks.6.norm.bias
1676
+ prosody_encoder_model.blocks.2.tdnn2.conv.weight
1677
+ prosody_encoder_model.blocks.2.tdnn2.conv.bias
1678
+ prosody_encoder_model.blocks.2.tdnn2.norm.weight
1679
+ prosody_encoder_model.blocks.2.tdnn2.norm.bias
1680
+ prosody_encoder_model.blocks.2.se_block.conv1.weight
1681
+ prosody_encoder_model.blocks.2.se_block.conv1.bias
1682
+ prosody_encoder_model.blocks.2.se_block.conv2.weight
1683
+ prosody_encoder_model.blocks.2.se_block.conv2.bias
1684
+ prosody_encoder_model.blocks.3.tdnn1.conv.weight
1685
+ prosody_encoder_model.blocks.3.tdnn1.conv.bias
1686
+ prosody_encoder_model.blocks.3.tdnn1.norm.weight
1687
+ prosody_encoder_model.blocks.3.tdnn1.norm.bias
1688
+ prosody_encoder_model.blocks.3.res2net_block.blocks.0.conv.weight
1689
+ prosody_encoder_model.blocks.3.res2net_block.blocks.0.conv.bias
1690
+ prosody_encoder_model.blocks.3.res2net_block.blocks.0.norm.weight
1691
+ prosody_encoder_model.blocks.3.res2net_block.blocks.0.norm.bias
1692
+ prosody_encoder_model.blocks.3.res2net_block.blocks.1.conv.weight
1693
+ prosody_encoder_model.blocks.3.res2net_block.blocks.1.conv.bias
1694
+ prosody_encoder_model.blocks.3.res2net_block.blocks.1.norm.weight
1695
+ prosody_encoder_model.blocks.3.res2net_block.blocks.1.norm.bias
1696
+ prosody_encoder_model.blocks.3.res2net_block.blocks.2.conv.weight
1697
+ prosody_encoder_model.blocks.3.res2net_block.blocks.2.conv.bias
1698
+ prosody_encoder_model.blocks.3.res2net_block.blocks.2.norm.weight
1699
+ prosody_encoder_model.blocks.3.res2net_block.blocks.2.norm.bias
1700
+ prosody_encoder_model.blocks.3.res2net_block.blocks.3.conv.weight
1701
+ prosody_encoder_model.blocks.3.res2net_block.blocks.3.conv.bias
1702
+ prosody_encoder_model.blocks.3.res2net_block.blocks.3.norm.weight
1703
+ prosody_encoder_model.blocks.3.res2net_block.blocks.3.norm.bias
1704
+ prosody_encoder_model.blocks.3.res2net_block.blocks.4.conv.weight
1705
+ prosody_encoder_model.blocks.3.res2net_block.blocks.4.conv.bias
1706
+ prosody_encoder_model.blocks.3.res2net_block.blocks.4.norm.weight
1707
+ prosody_encoder_model.blocks.3.res2net_block.blocks.4.norm.bias
1708
+ prosody_encoder_model.blocks.3.res2net_block.blocks.5.conv.weight
1709
+ prosody_encoder_model.blocks.3.res2net_block.blocks.5.conv.bias
1710
+ prosody_encoder_model.blocks.3.res2net_block.blocks.5.norm.weight
1711
+ prosody_encoder_model.blocks.3.res2net_block.blocks.5.norm.bias
1712
+ prosody_encoder_model.blocks.3.res2net_block.blocks.6.conv.weight
1713
+ prosody_encoder_model.blocks.3.res2net_block.blocks.6.conv.bias
1714
+ prosody_encoder_model.blocks.3.res2net_block.blocks.6.norm.weight
1715
+ prosody_encoder_model.blocks.3.res2net_block.blocks.6.norm.bias
1716
+ prosody_encoder_model.blocks.3.tdnn2.conv.weight
1717
+ prosody_encoder_model.blocks.3.tdnn2.conv.bias
1718
+ prosody_encoder_model.blocks.3.tdnn2.norm.weight
1719
+ prosody_encoder_model.blocks.3.tdnn2.norm.bias
1720
+ prosody_encoder_model.blocks.3.se_block.conv1.weight
1721
+ prosody_encoder_model.blocks.3.se_block.conv1.bias
1722
+ prosody_encoder_model.blocks.3.se_block.conv2.weight
1723
+ prosody_encoder_model.blocks.3.se_block.conv2.bias
1724
+ prosody_encoder_model.mfa.conv.weight
1725
+ prosody_encoder_model.mfa.conv.bias
1726
+ prosody_encoder_model.mfa.norm.weight
1727
+ prosody_encoder_model.mfa.norm.bias
1728
+ prosody_encoder_model.asp.tdnn.conv.weight
1729
+ prosody_encoder_model.asp.tdnn.conv.bias
1730
+ prosody_encoder_model.asp.tdnn.norm.weight
1731
+ prosody_encoder_model.asp.tdnn.norm.bias
1732
+ prosody_encoder_model.asp.conv.weight
1733
+ prosody_encoder_model.asp.conv.bias
1734
+ prosody_encoder_model.asp_norm.weight
1735
+ prosody_encoder_model.asp_norm.bias
1736
+ prosody_encoder_model.fc.weight
1737
+ prosody_encoder_model.fc.bias
pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf01eaec68b9f8a078ac80550a9ad7de3857fb52f3aac126e5de31aa036bd015
3
+ size 14402800
pretrained_models/ckpts/prosody_encoder/prosody_encoder_pretssel.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:832519d7d58c8d18ad1e751bf1f66ecfb4135f1aeeec741a5fc14ef5a1d0b5ff
3
+ size 28757068
pretrained_models/ckpts/vocos-mel-24khz/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
pretrained_models/ckpts/vocos-mel-24khz/README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ # Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis
6
+
7
+ [Audio samples](https://charactr-platform.github.io/vocos/) |
8
+ Paper [[abs]](https://arxiv.org/abs/2306.00814) [[pdf]](https://arxiv.org/pdf/2306.00814.pdf)
9
+
10
+ Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative
11
+ Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical
12
+ GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral
13
+ coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
14
+
15
+ ## Installation
16
+
17
+ To use Vocos only in inference mode, install it using:
18
+
19
+ ```bash
20
+ pip install vocos
21
+ ```
22
+
23
+ If you wish to train the model, install it with additional dependencies:
24
+
25
+ ```bash
26
+ pip install vocos[train]
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ### Reconstruct audio from mel-spectrogram
32
+
33
+ ```python
34
+ import torch
35
+
36
+ from vocos import Vocos
37
+
38
+ vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
39
+
40
+ mel = torch.randn(1, 100, 256) # B, C, T
41
+ audio = vocos.decode(mel)
42
+ ```
43
+
44
+ Copy-synthesis from a file:
45
+
46
+ ```python
47
+ import torchaudio
48
+
49
+ y, sr = torchaudio.load(YOUR_AUDIO_FILE)
50
+ if y.size(0) > 1: # mix to mono
51
+     y = y.mean(dim=0, keepdim=True)
52
+ y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=24000)
53
+ y_hat = vocos(y)
54
+ ```
55
+
56
+ ## Citation
57
+
58
+ If this code contributes to your research, please cite our work:
59
+
60
+ ```
61
+ @article{siuzdak2023vocos,
62
+ title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
63
+ author={Siuzdak, Hubert},
64
+ journal={arXiv preprint arXiv:2306.00814},
65
+ year={2023}
66
+ }
67
+ ```
68
+
69
+ ## License
70
+
71
+ The code in this repository is released under the MIT license.
pretrained_models/ckpts/vocos-mel-24khz/config.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ feature_extractor:
2
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
3
+   init_args:
4
+     sample_rate: 24000
5
+     n_fft: 1024
6
+     hop_length: 256
7
+     n_mels: 100
8
+     padding: center
9
+
10
+ backbone:
11
+   class_path: vocos.models.VocosBackbone
12
+   init_args:
13
+     input_channels: 100
14
+     dim: 512
15
+     intermediate_dim: 1536
16
+     num_layers: 8
17
+
18
+ head:
19
+   class_path: vocos.heads.ISTFTHead
20
+   init_args:
21
+     dim: 512
22
+     n_fft: 1024
23
+     hop_length: 256
24
+     padding: center
pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97ec976ad1fd67a33ab2682d29c0ac7df85234fae875aefcc5fb215681a91b2a
3
+ size 54365991
pretrained_models/data/multilingual_grl/vocab.txt ADDED
@@ -0,0 +1,898 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ !
3
+ "
4
+ #1
5
+ #2
6
+ #3
7
+ #4
8
+ (zh)a1
9
+ (zh)a2
10
+ (zh)a3
11
+ (zh)a4
12
+ (zh)a5
13
+ (zh)ai1
14
+ (zh)ai2
15
+ (zh)ai3
16
+ (zh)ai4
17
+ (zh)ai5
18
+ (zh)an1
19
+ (zh)an2
20
+ (zh)an3
21
+ (zh)an4
22
+ (zh)an5
23
+ (zh)ang1
24
+ (zh)ang2
25
+ (zh)ang3
26
+ (zh)ang4
27
+ (zh)ang5
28
+ (zh)ao1
29
+ (zh)ao2
30
+ (zh)ao3
31
+ (zh)ao4
32
+ (zh)ao5
33
+ (zh)b
34
+ (zh)c
35
+ (zh)ch
36
+ (zh)d
37
+ (zh)e1
38
+ (zh)e2
39
+ (zh)e3
40
+ (zh)e4
41
+ (zh)e5
42
+ (zh)ei1
43
+ (zh)ei2
44
+ (zh)ei3
45
+ (zh)ei4
46
+ (zh)ei5
47
+ (zh)en1
48
+ (zh)en2
49
+ (zh)en3
50
+ (zh)en4
51
+ (zh)en5
52
+ (zh)eng1
53
+ (zh)eng2
54
+ (zh)eng3
55
+ (zh)eng4
56
+ (zh)eng5
57
+ (zh)er1
58
+ (zh)er2
59
+ (zh)er3
60
+ (zh)er4
61
+ (zh)er5
62
+ (zh)f
63
+ (zh)g
64
+ (zh)h
65
+ (zh)i1
66
+ (zh)i2
67
+ (zh)i3
68
+ (zh)i4
69
+ (zh)i5
70
+ (zh)ia1
71
+ (zh)ia2
72
+ (zh)ia3
73
+ (zh)ia4
74
+ (zh)ia5
75
+ (zh)ian1
76
+ (zh)ian2
77
+ (zh)ian3
78
+ (zh)ian4
79
+ (zh)ian5
80
+ (zh)iang1
81
+ (zh)iang2
82
+ (zh)iang3
83
+ (zh)iang4
84
+ (zh)iang5
85
+ (zh)iao1
86
+ (zh)iao2
87
+ (zh)iao3
88
+ (zh)iao4
89
+ (zh)iao5
90
+ (zh)ie1
91
+ (zh)ie2
92
+ (zh)ie3
93
+ (zh)ie4
94
+ (zh)ie5
95
+ (zh)in1
96
+ (zh)in2
97
+ (zh)in3
98
+ (zh)in4
99
+ (zh)in5
100
+ (zh)ing1
101
+ (zh)ing2
102
+ (zh)ing3
103
+ (zh)ing4
104
+ (zh)ing5
105
+ (zh)iong1
106
+ (zh)iong2
107
+ (zh)iong3
108
+ (zh)iong4
109
+ (zh)iong5
110
+ (zh)iou1
111
+ (zh)iou2
112
+ (zh)iou3
113
+ (zh)iou4
114
+ (zh)iou5
115
+ (zh)j
116
+ (zh)k
117
+ (zh)l
118
+ (zh)m
119
+ (zh)n
120
+ (zh)o1
121
+ (zh)o2
122
+ (zh)o3
123
+ (zh)o4
124
+ (zh)o5
125
+ (zh)ong1
126
+ (zh)ong2
127
+ (zh)ong3
128
+ (zh)ong4
129
+ (zh)ong5
130
+ (zh)ou1
131
+ (zh)ou2
132
+ (zh)ou3
133
+ (zh)ou4
134
+ (zh)ou5
135
+ (zh)p
136
+ (zh)q
137
+ (zh)r
138
+ (zh)s
139
+ (zh)sh
140
+ (zh)t
141
+ (zh)u1
142
+ (zh)u2
143
+ (zh)u3
144
+ (zh)u4
145
+ (zh)u5
146
+ (zh)ua1
147
+ (zh)ua2
148
+ (zh)ua3
149
+ (zh)ua4
150
+ (zh)ua5
151
+ (zh)uai1
152
+ (zh)uai2
153
+ (zh)uai3
154
+ (zh)uai4
155
+ (zh)uai5
156
+ (zh)uan1
157
+ (zh)uan2
158
+ (zh)uan3
159
+ (zh)uan4
160
+ (zh)uan5
161
+ (zh)uang1
162
+ (zh)uang2
163
+ (zh)uang3
164
+ (zh)uang4
165
+ (zh)uang5
166
+ (zh)uei1
167
+ (zh)uei2
168
+ (zh)uei3
169
+ (zh)uei4
170
+ (zh)uei5
171
+ (zh)uen1
172
+ (zh)uen2
173
+ (zh)uen3
174
+ (zh)uen4
175
+ (zh)uen5
176
+ (zh)ueng1
177
+ (zh)ueng2
178
+ (zh)ueng3
179
+ (zh)ueng4
180
+ (zh)ueng5
181
+ (zh)uo1
182
+ (zh)uo2
183
+ (zh)uo3
184
+ (zh)uo4
185
+ (zh)uo5
186
+ (zh)v1
187
+ (zh)v2
188
+ (zh)v3
189
+ (zh)v4
190
+ (zh)v5
191
+ (zh)van1
192
+ (zh)van2
193
+ (zh)van3
194
+ (zh)van4
195
+ (zh)van5
196
+ (zh)ve1
197
+ (zh)ve2
198
+ (zh)ve3
199
+ (zh)ve4
200
+ (zh)ve5
201
+ (zh)vn1
202
+ (zh)vn2
203
+ (zh)vn3
204
+ (zh)vn4
205
+ (zh)vn5
206
+ (zh)w
207
+ (zh)x
208
+ (zh)y
209
+ (zh)z
210
+ (zh)zh
211
+ (de)a
212
+ (de)aɪ
213
+ (de)aʊ
214
+ (de)b
215
+ (de)bʲ
216
+ (de)c
217
+ (de)d
218
+ (de)dʑ
219
+ (de)dʒ
220
+ (de)e
221
+ (de)eː
222
+ (de)f
223
+ (de)h
224
+ (de)i
225
+ (de)iː
226
+ (de)j
227
+ (de)k
228
+ (de)kʲ
229
+ (de)l
230
+ (de)m
231
+ (de)mʲ
232
+ (de)n
233
+ (de)o
234
+ (de)oɪ
235
+ (de)oː
236
+ (de)p
237
+ (de)pf
238
+ (de)pʲ
239
+ (de)r
240
+ (de)s
241
+ (de)t
242
+ (de)ts
243
+ (de)tɕ
244
+ (de)tʃ
245
+ (de)tʲ
246
+ (de)u
247
+ (de)uː
248
+ (de)v
249
+ (de)vʲ
250
+ (de)w
251
+ (de)x
252
+ (de)y
253
+ (de)yː
254
+ (de)z
255
+ (de)ç
256
+ (de)ð
257
+ (de)øː
258
+ (de)ŋ
259
+ (de)œ
260
+ (de)ɑ
261
+ (de)ɑː
262
+ (de)ɔ
263
+ (de)ɔø
264
+ (de)ɔː
265
+ (de)ɕ
266
+ (de)ə
267
+ (de)ɛ
268
+ (de)ɛɪ
269
+ (de)ɛː
270
+ (de)ɜ
271
+ (de)ɡ
272
+ (de)ɡʲ
273
+ (de)ɣ
274
+ (de)ɨ
275
+ (de)ɪ
276
+ (de)ɲ
277
+ (de)ɲʲ
278
+ (de)ɾ
279
+ (de)ʃ
280
+ (de)ʊ
281
+ (de)ʑ
282
+ (de)ʒ
283
+ (de)θ
284
+ (el)
285
+ (en)a
286
+ (en)aɪ
287
+ (en)aɪə
288
+ (en)aɪɚ
289
+ (en)aʊ
290
+ (en)aː
291
+ (en)b
292
+ (en)bʲ
293
+ (en)c
294
+ (en)d
295
+ (en)dʑ
296
+ (en)dʒ
297
+ (en)e
298
+ (en)eə
299
+ (en)eɪ
300
+ (en)f
301
+ (en)h
302
+ (en)i
303
+ (en)iə
304
+ (en)iː
305
+ (en)iːː
306
+ (en)j
307
+ (en)k
308
+ (en)l
309
+ (en)m
310
+ (en)n
311
+ (en)nʲ
312
+ (en)o
313
+ (en)oʊ
314
+ (en)oː
315
+ (en)oːɹ
316
+ (en)p
317
+ (en)q
318
+ (en)r
319
+ (en)s
320
+ (en)t
321
+ (en)tɕ
322
+ (en)tʃ
323
+ (en)u
324
+ (en)uː
325
+ (en)v
326
+ (en)w
327
+ (en)x
328
+ (en)z
329
+ (en)æ
330
+ (en)ææ
331
+ (en)ç
332
+ (en)ð
333
+ (en)ŋ
334
+ (en)ɐ
335
+ (en)ɐɐ
336
+ (en)ɑ
337
+ (en)ɑː
338
+ (en)ɑːɹ
339
+ (en)ɒ
340
+ (en)ɔ
341
+ (en)ɔɪ
342
+ (en)ɔː
343
+ (en)ɔːɹ
344
+ (en)ɕ
345
+ (en)ə
346
+ (en)əl
347
+ (en)əʊ
348
+ (en)ɚ
349
+ (en)ɛ
350
+ (en)ɛɹ
351
+ (en)ɛː
352
+ (en)ɜː
353
+ (en)ɡ
354
+ (en)ɡʲ
355
+ (en)ɣ
356
+ (en)ɨ
357
+ (en)ɪ
358
+ (en)ɪɹ
359
+ (en)ɪː
360
+ (en)ɬ
361
+ (en)ɲ
362
+ (en)ɲʲ
363
+ (en)ɹ
364
+ (en)ɾ
365
+ (en)ʁ
366
+ (en)ʃ
367
+ (en)ʊ
368
+ (en)ʊə
369
+ (en)ʊɹ
370
+ (en)ʌ
371
+ (en)ʍ
372
+ (en)ʒ
373
+ (en)ʔ
374
+ (en)θ
375
+ (en)ᵻ
376
+ (es)a
377
+ (es)aɪ
378
+ (es)aʊ
379
+ (es)b
380
+ (es)c
381
+ (es)d
382
+ (es)dʒ
383
+ (es)e
384
+ (es)eɪ
385
+ (es)eʊ
386
+ (es)f
387
+ (es)h
388
+ (es)i
389
+ (es)iː
390
+ (es)j
391
+ (es)k
392
+ (es)l
393
+ (es)m
394
+ (es)n
395
+ (es)o
396
+ (es)oɪ
397
+ (es)p
398
+ (es)pː
399
+ (es)r
400
+ (es)s
401
+ (es)t
402
+ (es)ts
403
+ (es)tʃ
404
+ (es)u
405
+ (es)v
406
+ (es)w
407
+ (es)x
408
+ (es)z
409
+ (es)ð
410
+ (es)ŋ
411
+ (es)ə
412
+ (es)ɛ
413
+ (es)ɟ
414
+ (es)ɡ
415
+ (es)ɣ
416
+ (es)ɫ
417
+ (es)ɲ
418
+ (es)ɾ
419
+ (es)ʃ
420
+ (es)ʎ
421
+ (es)ʝ
422
+ (es)β
423
+ (es)θ
424
+ (fr)a
425
+ (fr)aɪ
426
+ (fr)aʊ
427
+ (fr)aː
428
+ (fr)b
429
+ (fr)c
430
+ (fr)d
431
+ (fr)dʒ
432
+ (fr)e
433
+ (fr)eʊ
434
+ (fr)f
435
+ (fr)h
436
+ (fr)i
437
+ (fr)iʰr
438
+ (fr)iː
439
+ (fr)j
440
+ (fr)k
441
+ (fr)l
442
+ (fr)m
443
+ (fr)n
444
+ (fr)o
445
+ (fr)oː
446
+ (fr)p
447
+ (fr)r
448
+ (fr)s
449
+ (fr)t
450
+ (fr)tʃ
451
+ (fr)u
452
+ (fr)uː
453
+ (fr)v
454
+ (fr)w
455
+ (fr)x
456
+ (fr)y
457
+ (fr)yː
458
+ (fr)z
459
+ (fr)ç
460
+ (fr)ð
461
+ (fr)ø
462
+ (fr)øː
463
+ (fr)ŋ
464
+ (fr)œ
465
+ (fr)ɑ
466
+ (fr)ɔ
467
+ (fr)ə
468
+ (fr)ɛ
469
+ (fr)ɡ
470
+ (fr)ɣ
471
+ (fr)ɪ
472
+ (fr)ɪː
473
+ (fr)ɲ
474
+ (fr)ʁ
475
+ (fr)ʃ
476
+ (fr)ʎ
477
+ (fr)ʒ
478
+ (fr)ʰl
479
+ (fr)θ
480
+ (id)a
481
+ (id)aɪ
482
+ (id)aʊ
483
+ (id)b
484
+ (id)d
485
+ (id)dʒ
486
+ (id)e
487
+ (id)f
488
+ (id)h
489
+ (id)i
490
+ (id)j
491
+ (id)k
492
+ (id)l
493
+ (id)m
494
+ (id)n
495
+ (id)o
496
+ (id)p
497
+ (id)r
498
+ (id)s
499
+ (id)t
500
+ (id)tʃ
501
+ (id)u
502
+ (id)v
503
+ (id)w
504
+ (id)x
505
+ (id)z
506
+ (id)ç
507
+ (id)ŋ
508
+ (id)ɔ
509
+ (id)ə
510
+ (id)ɛ
511
+ (id)ɡ
512
+ (id)ɲ
513
+ (id)ɹ
514
+ (id)ʔ
515
+ (id)χ
516
+ (it)a
517
+ (it)aɪ
518
+ (it)aʊ
519
+ (it)aː
520
+ (it)b
521
+ (it)bː
522
+ (it)c
523
+ (it)d
524
+ (it)dz
525
+ (it)dzː
526
+ (it)dʒ
527
+ (it)dʒː
528
+ (it)dː
529
+ (it)e
530
+ (it)eɪ
531
+ (it)eʊ
532
+ (it)eː
533
+ (it)f
534
+ (it)fː
535
+ (it)h
536
+ (it)i
537
+ (it)iː
538
+ (it)j
539
+ (it)k
540
+ (it)kː
541
+ (it)l
542
+ (it)m
543
+ (it)mː
544
+ (it)n
545
+ (it)o
546
+ (it)oɪ
547
+ (it)oː
548
+ (it)p
549
+ (it)pː
550
+ (it)r
551
+ (it)s
552
+ (it)ss
553
+ (it)t
554
+ (it)ts
555
+ (it)tsː
556
+ (it)tʃ
557
+ (it)tʃː
558
+ (it)tː
559
+ (it)u
560
+ (it)uɪ
561
+ (it)uː
562
+ (it)v
563
+ (it)vʲ
564
+ (it)vː
565
+ (it)w
566
+ (it)y
567
+ (it)z
568
+ (it)ŋ
569
+ (it)ɔ
570
+ (it)ɔː
571
+ (it)ə
572
+ (it)əː
573
+ (it)ɛ
574
+ (it)ɛɪ
575
+ (it)ɛː
576
+ (it)ɟ
577
+ (it)ɡ
578
+ (it)ɡː
579
+ (it)ɪ
580
+ (it)ɪː
581
+ (it)ɲ
582
+ (it)ɹ
583
+ (it)ɾ
584
+ (it)ʃ
585
+ (it)ʊ
586
+ (it)ʊː
587
+ (it)ʎ
588
+ (it)ʒ
589
+ (it)ʝ
590
+ (it)ː
591
+ (it)θ
592
+ (it)θː
593
+ (pl)
594
+ (pt)a
595
+ (pt)aɪ
596
+ (pt)aʊ
597
+ (pt)aː
598
+ (pt)b
599
+ (pt)c
600
+ (pt)d
601
+ (pt)dʒ
602
+ (pt)e
603
+ (pt)eɪ
604
+ (pt)eʊ
605
+ (pt)f
606
+ (pt)h
607
+ (pt)i
608
+ (pt)iʊ
609
+ (pt)iː
610
+ (pt)j
611
+ (pt)k
612
+ (pt)l
613
+ (pt)m
614
+ (pt)n
615
+ (pt)o
616
+ (pt)oɪ
617
+ (pt)oː
618
+ (pt)p
619
+ (pt)r
620
+ (pt)s
621
+ (pt)t
622
+ (pt)ts
623
+ (pt)tʃ
624
+ (pt)u
625
+ (pt)uɪ
626
+ (pt)uː
627
+ (pt)v
628
+ (pt)w
629
+ (pt)x
630
+ (pt)y
631
+ (pt)z
632
+ (pt)æ
633
+ (pt)ç
634
+ (pt)ð
635
+ (pt)ŋ
636
+ (pt)ɐ
637
+ (pt)ɑ
638
+ (pt)ɔ
639
+ (pt)ɔɪ
640
+ (pt)ə
641
+ (pt)ɛ
642
+ (pt)ɛɪ
643
+ (pt)ɛʊ
644
+ (pt)ɡ
645
+ (pt)ɣ
646
+ (pt)ɪ
647
+ (pt)ɲ
648
+ (pt)ɹ
649
+ (pt)ɾ
650
+ (pt)ʃ
651
+ (pt)ʊ
652
+ (pt)ʎ
653
+ (pt)ʒ
654
+ (pt)θ
655
+ (ru)a
656
+ (ru)b
657
+ (ru)bʲ
658
+ (ru)c
659
+ (ru)d
660
+ (ru)dʒʲ
661
+ (ru)dʲ
662
+ (ru)e
663
+ (ru)eː
664
+ (ru)f
665
+ (ru)fʲ
666
+ (ru)i
667
+ (ru)iː
668
+ (ru)j
669
+ (ru)ja
670
+ (ru)ju
671
+ (ru)k
672
+ (ru)kʲ
673
+ (ru)l
674
+ (ru)m
675
+ (ru)mʲ
676
+ (ru)n
677
+ (ru)nʲ
678
+ (ru)o
679
+ (ru)p
680
+ (ru)pʲ
681
+ (ru)r
682
+ (ru)rʲ
683
+ (ru)s
684
+ (ru)sʲ
685
+ (ru)t
686
+ (ru)ts
687
+ (ru)tʃʲ
688
+ (ru)tʲ
689
+ (ru)u
690
+ (ru)v
691
+ (ru)vʲ
692
+ (ru)w
693
+ (ru)x
694
+ (ru)y
695
+ (ru)z
696
+ (ru)ç
697
+ (ru)ð
698
+ (ru)ŋ
699
+ (ru)ɑ
700
+ (ru)ɔ
701
+ (ru)ɕ
702
+ (ru)ə
703
+ (ru)ɛ
704
+ (ru)ɡ
705
+ (ru)ɡʲ
706
+ (ru)ɣ
707
+ (ru)ɪ
708
+ (ru)ɭ
709
+ (ru)ɭʲ
710
+ (ru)ɵ
711
+ (ru)ʃ
712
+ (ru)ʌ
713
+ (ru)ʑ
714
+ (ru)ʒ
715
+ (ru)θ
716
+ (vi)a
717
+ (vi)a2
718
+ (vi)a4
719
+ (vi)a5
720
+ (vi)a6
721
+ (vi)aɜ
722
+ (vi)aɪ4
723
+ (vi)aʊɜ
724
+ (vi)aː
725
+ (vi)aː2
726
+ (vi)aː4
727
+ (vi)aː5
728
+ (vi)aː6
729
+ (vi)aːɜ
730
+ (vi)aːɪ
731
+ (vi)b
732
+ (vi)c
733
+ (vi)d
734
+ (vi)e
735
+ (vi)e1
736
+ (vi)e2
737
+ (vi)e4
738
+ (vi)e5
739
+ (vi)e6
740
+ (vi)e7
741
+ (vi)eɜ
742
+ (vi)f
743
+ (vi)h
744
+ (vi)i
745
+ (vi)i2
746
+ (vi)i4
747
+ (vi)i5
748
+ (vi)i6
749
+ (vi)iə
750
+ (vi)iə2
751
+ (vi)iə4
752
+ (vi)iə5
753
+ (vi)iə6
754
+ (vi)iəɜ
755
+ (vi)iɛ
756
+ (vi)iɛ1
757
+ (vi)iɛ2
758
+ (vi)iɛ4
759
+ (vi)iɛ5
760
+ (vi)iɛ6
761
+ (vi)iɛɜ
762
+ (vi)iɜ
763
+ (vi)j
764
+ (vi)k
765
+ (vi)kh
766
+ (vi)l
767
+ (vi)m
768
+ (vi)n
769
+ (vi)o
770
+ (vi)o1
771
+ (vi)o2
772
+ (vi)o4
773
+ (vi)o5
774
+ (vi)o6
775
+ (vi)oɜ
776
+ (vi)p
777
+ (vi)s
778
+ (vi)t
779
+ (vi)tʃ
780
+ (vi)u
781
+ (vi)u2
782
+ (vi)u4
783
+ (vi)u5
784
+ (vi)u6
785
+ (vi)uə
786
+ (vi)uə2
787
+ (vi)uə4
788
+ (vi)uə5
789
+ (vi)uə6
790
+ (vi)uəɜ
791
+ (vi)uɜ
792
+ (vi)v
793
+ (vi)w
794
+ (vi)x
795
+ (vi)y
796
+ (vi)y2
797
+ (vi)y4
798
+ (vi)y5
799
+ (vi)y6
800
+ (vi)yə
801
+ (vi)yə2
802
+ (vi)yə4
803
+ (vi)yə5
804
+ (vi)yə6
805
+ (vi)yə7
806
+ (vi)yəɜ
807
+ (vi)yɜ
808
+ (vi)z
809
+ (vi)ð
810
+ (vi)ŋ
811
+ (vi)ɔ
812
+ (vi)ɔ2
813
+ (vi)ɔ4
814
+ (vi)ɔ5
815
+ (vi)ɔ6
816
+ (vi)ɔɜ
817
+ (vi)ɗ
818
+ (vi)ə
819
+ (vi)ə1
820
+ (vi)ə2
821
+ (vi)ə4
822
+ (vi)ə5
823
+ (vi)ə6
824
+ (vi)əɜ
825
+ (vi)əɪ
826
+ (vi)əɪ2
827
+ (vi)əɪ4
828
+ (vi)əɪ5
829
+ (vi)əɪ6
830
+ (vi)əɪɜ
831
+ (vi)əː
832
+ (vi)əː2
833
+ (vi)əː4
834
+ (vi)əː5
835
+ (vi)əː6
836
+ (vi)əːɜ
837
+ (vi)əːʊ
838
+ (vi)əːʊɜ
839
+ (vi)ɛ
840
+ (vi)ɛ2
841
+ (vi)ɛ4
842
+ (vi)ɛ5
843
+ (vi)ɛ6
844
+ (vi)ɛɜ
845
+ (vi)ɡ
846
+ (vi)ɣ
847
+ (vi)ɲ
848
+ (vi)ʐ
849
+ (vi)ʒ
850
+ ,
851
+ .
852
+ 1
853
+ ?
854
+ ^
855
+ _
856
+ a
857
+
858
+ b
859
+ c
860
+ d
861
+
862
+ e
863
+ f
864
+ i
865
+ j
866
+ k
867
+ l
868
+ m
869
+ n
870
+ o
871
+
872
+ p
873
+ r
874
+ s
875
+ t
876
+ ts
877
+
878
+ u
879
+ v
880
+ w
881
+ x
882
+ z
883
+ ¡
884
+ ç
885
+ ð
886
+ ŋ
887
+ ɔ
888
+ ɛ
889
+ ɡ
890
+ ɣ
891
+ ɲ
892
+ ʃ
893
+ ʎ
894
+ ̃
895
+ ̩
896
+ ̪
897
+ θ
898
+
pretrained_models/data/multilingual_prosody/vocab.txt ADDED
@@ -0,0 +1,898 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ !
3
+ "
4
+ #1
5
+ #2
6
+ #3
7
+ #4
8
+ (zh)a1
9
+ (zh)a2
10
+ (zh)a3
11
+ (zh)a4
12
+ (zh)a5
13
+ (zh)ai1
14
+ (zh)ai2
15
+ (zh)ai3
16
+ (zh)ai4
17
+ (zh)ai5
18
+ (zh)an1
19
+ (zh)an2
20
+ (zh)an3
21
+ (zh)an4
22
+ (zh)an5
23
+ (zh)ang1
24
+ (zh)ang2
25
+ (zh)ang3
26
+ (zh)ang4
27
+ (zh)ang5
28
+ (zh)ao1
29
+ (zh)ao2
30
+ (zh)ao3
31
+ (zh)ao4
32
+ (zh)ao5
33
+ (zh)b
34
+ (zh)c
35
+ (zh)ch
36
+ (zh)d
37
+ (zh)e1
38
+ (zh)e2
39
+ (zh)e3
40
+ (zh)e4
41
+ (zh)e5
42
+ (zh)ei1
43
+ (zh)ei2
44
+ (zh)ei3
45
+ (zh)ei4
46
+ (zh)ei5
47
+ (zh)en1
48
+ (zh)en2
49
+ (zh)en3
50
+ (zh)en4
51
+ (zh)en5
52
+ (zh)eng1
53
+ (zh)eng2
54
+ (zh)eng3
55
+ (zh)eng4
56
+ (zh)eng5
57
+ (zh)er1
58
+ (zh)er2
59
+ (zh)er3
60
+ (zh)er4
61
+ (zh)er5
62
+ (zh)f
63
+ (zh)g
64
+ (zh)h
65
+ (zh)i1
66
+ (zh)i2
67
+ (zh)i3
68
+ (zh)i4
69
+ (zh)i5
70
+ (zh)ia1
71
+ (zh)ia2
72
+ (zh)ia3
73
+ (zh)ia4
74
+ (zh)ia5
75
+ (zh)ian1
76
+ (zh)ian2
77
+ (zh)ian3
78
+ (zh)ian4
79
+ (zh)ian5
80
+ (zh)iang1
81
+ (zh)iang2
82
+ (zh)iang3
83
+ (zh)iang4
84
+ (zh)iang5
85
+ (zh)iao1
86
+ (zh)iao2
87
+ (zh)iao3
88
+ (zh)iao4
89
+ (zh)iao5
90
+ (zh)ie1
91
+ (zh)ie2
92
+ (zh)ie3
93
+ (zh)ie4
94
+ (zh)ie5
95
+ (zh)in1
96
+ (zh)in2
97
+ (zh)in3
98
+ (zh)in4
99
+ (zh)in5
100
+ (zh)ing1
101
+ (zh)ing2
102
+ (zh)ing3
103
+ (zh)ing4
104
+ (zh)ing5
105
+ (zh)iong1
106
+ (zh)iong2
107
+ (zh)iong3
108
+ (zh)iong4
109
+ (zh)iong5
110
+ (zh)iou1
111
+ (zh)iou2
112
+ (zh)iou3
113
+ (zh)iou4
114
+ (zh)iou5
115
+ (zh)j
116
+ (zh)k
117
+ (zh)l
118
+ (zh)m
119
+ (zh)n
120
+ (zh)o1
121
+ (zh)o2
122
+ (zh)o3
123
+ (zh)o4
124
+ (zh)o5
125
+ (zh)ong1
126
+ (zh)ong2
127
+ (zh)ong3
128
+ (zh)ong4
129
+ (zh)ong5
130
+ (zh)ou1
131
+ (zh)ou2
132
+ (zh)ou3
133
+ (zh)ou4
134
+ (zh)ou5
135
+ (zh)p
136
+ (zh)q
137
+ (zh)r
138
+ (zh)s
139
+ (zh)sh
140
+ (zh)t
141
+ (zh)u1
142
+ (zh)u2
143
+ (zh)u3
144
+ (zh)u4
145
+ (zh)u5
146
+ (zh)ua1
147
+ (zh)ua2
148
+ (zh)ua3
149
+ (zh)ua4
150
+ (zh)ua5
151
+ (zh)uai1
152
+ (zh)uai2
153
+ (zh)uai3
154
+ (zh)uai4
155
+ (zh)uai5
156
+ (zh)uan1
157
+ (zh)uan2
158
+ (zh)uan3
159
+ (zh)uan4
160
+ (zh)uan5
161
+ (zh)uang1
162
+ (zh)uang2
163
+ (zh)uang3
164
+ (zh)uang4
165
+ (zh)uang5
166
+ (zh)uei1
167
+ (zh)uei2
168
+ (zh)uei3
169
+ (zh)uei4
170
+ (zh)uei5
171
+ (zh)uen1
172
+ (zh)uen2
173
+ (zh)uen3
174
+ (zh)uen4
175
+ (zh)uen5
176
+ (zh)ueng1
177
+ (zh)ueng2
178
+ (zh)ueng3
179
+ (zh)ueng4
180
+ (zh)ueng5
181
+ (zh)uo1
182
+ (zh)uo2
183
+ (zh)uo3
184
+ (zh)uo4
185
+ (zh)uo5
186
+ (zh)v1
187
+ (zh)v2
188
+ (zh)v3
189
+ (zh)v4
190
+ (zh)v5
191
+ (zh)van1
192
+ (zh)van2
193
+ (zh)van3
194
+ (zh)van4
195
+ (zh)van5
196
+ (zh)ve1
197
+ (zh)ve2
198
+ (zh)ve3
199
+ (zh)ve4
200
+ (zh)ve5
201
+ (zh)vn1
202
+ (zh)vn2
203
+ (zh)vn3
204
+ (zh)vn4
205
+ (zh)vn5
206
+ (zh)w
207
+ (zh)x
208
+ (zh)y
209
+ (zh)z
210
+ (zh)zh
211
+ (de)a
212
+ (de)aɪ
213
+ (de)aʊ
214
+ (de)b
215
+ (de)bʲ
216
+ (de)c
217
+ (de)d
218
+ (de)dʑ
219
+ (de)dʒ
220
+ (de)e
221
+ (de)eː
222
+ (de)f
223
+ (de)h
224
+ (de)i
225
+ (de)iː
226
+ (de)j
227
+ (de)k
228
+ (de)kʲ
229
+ (de)l
230
+ (de)m
231
+ (de)mʲ
232
+ (de)n
233
+ (de)o
234
+ (de)oɪ
235
+ (de)oː
236
+ (de)p
237
+ (de)pf
238
+ (de)pʲ
239
+ (de)r
240
+ (de)s
241
+ (de)t
242
+ (de)ts
243
+ (de)tɕ
244
+ (de)tʃ
245
+ (de)tʲ
246
+ (de)u
247
+ (de)uː
248
+ (de)v
249
+ (de)vʲ
250
+ (de)w
251
+ (de)x
252
+ (de)y
253
+ (de)yː
254
+ (de)z
255
+ (de)ç
256
+ (de)ð
257
+ (de)øː
258
+ (de)ŋ
259
+ (de)œ
260
+ (de)ɑ
261
+ (de)ɑː
262
+ (de)ɔ
263
+ (de)ɔø
264
+ (de)ɔː
265
+ (de)ɕ
266
+ (de)ə
267
+ (de)ɛ
268
+ (de)ɛɪ
269
+ (de)ɛː
270
+ (de)ɜ
271
+ (de)ɡ
272
+ (de)ɡʲ
273
+ (de)ɣ
274
+ (de)ɨ
275
+ (de)ɪ
276
+ (de)ɲ
277
+ (de)ɲʲ
278
+ (de)ɾ
279
+ (de)ʃ
280
+ (de)ʊ
281
+ (de)ʑ
282
+ (de)ʒ
283
+ (de)θ
284
+ (el)
285
+ (en)a
286
+ (en)aɪ
287
+ (en)aɪə
288
+ (en)aɪɚ
289
+ (en)aʊ
290
+ (en)aː
291
+ (en)b
292
+ (en)bʲ
293
+ (en)c
294
+ (en)d
295
+ (en)dʑ
296
+ (en)dʒ
297
+ (en)e
298
+ (en)eə
299
+ (en)eɪ
300
+ (en)f
301
+ (en)h
302
+ (en)i
303
+ (en)iə
304
+ (en)iː
305
+ (en)iːː
306
+ (en)j
307
+ (en)k
308
+ (en)l
309
+ (en)m
310
+ (en)n
311
+ (en)nʲ
312
+ (en)o
313
+ (en)oʊ
314
+ (en)oː
315
+ (en)oːɹ
316
+ (en)p
317
+ (en)q
318
+ (en)r
319
+ (en)s
320
+ (en)t
321
+ (en)tɕ
322
+ (en)tʃ
323
+ (en)u
324
+ (en)uː
325
+ (en)v
326
+ (en)w
327
+ (en)x
328
+ (en)z
329
+ (en)æ
330
+ (en)ææ
331
+ (en)ç
332
+ (en)ð
333
+ (en)ŋ
334
+ (en)ɐ
335
+ (en)ɐɐ
336
+ (en)ɑ
337
+ (en)ɑː
338
+ (en)ɑːɹ
339
+ (en)ɒ
340
+ (en)ɔ
341
+ (en)ɔɪ
342
+ (en)ɔː
343
+ (en)ɔːɹ
344
+ (en)ɕ
345
+ (en)ə
346
+ (en)əl
347
+ (en)əʊ
348
+ (en)ɚ
349
+ (en)ɛ
350
+ (en)ɛɹ
351
+ (en)ɛː
352
+ (en)ɜː
353
+ (en)ɡ
354
+ (en)ɡʲ
355
+ (en)ɣ
356
+ (en)ɨ
357
+ (en)ɪ
358
+ (en)ɪɹ
359
+ (en)ɪː
360
+ (en)ɬ
361
+ (en)ɲ
362
+ (en)ɲʲ
363
+ (en)ɹ
364
+ (en)ɾ
365
+ (en)ʁ
366
+ (en)ʃ
367
+ (en)ʊ
368
+ (en)ʊə
369
+ (en)ʊɹ
370
+ (en)ʌ
371
+ (en)ʍ
372
+ (en)ʒ
373
+ (en)ʔ
374
+ (en)θ
375
+ (en)ᵻ
376
+ (es)a
377
+ (es)aɪ
378
+ (es)aʊ
379
+ (es)b
380
+ (es)c
381
+ (es)d
382
+ (es)dʒ
383
+ (es)e
384
+ (es)eɪ
385
+ (es)eʊ
386
+ (es)f
387
+ (es)h
388
+ (es)i
389
+ (es)iː
390
+ (es)j
391
+ (es)k
392
+ (es)l
393
+ (es)m
394
+ (es)n
395
+ (es)o
396
+ (es)oɪ
397
+ (es)p
398
+ (es)pː
399
+ (es)r
400
+ (es)s
401
+ (es)t
402
+ (es)ts
403
+ (es)tʃ
404
+ (es)u
405
+ (es)v
406
+ (es)w
407
+ (es)x
408
+ (es)z
409
+ (es)ð
410
+ (es)ŋ
411
+ (es)ə
412
+ (es)ɛ
413
+ (es)ɟ
414
+ (es)ɡ
415
+ (es)ɣ
416
+ (es)ɫ
417
+ (es)ɲ
418
+ (es)ɾ
419
+ (es)ʃ
420
+ (es)ʎ
421
+ (es)ʝ
422
+ (es)β
423
+ (es)θ
424
+ (fr)a
425
+ (fr)aɪ
426
+ (fr)aʊ
427
+ (fr)aː
428
+ (fr)b
429
+ (fr)c
430
+ (fr)d
431
+ (fr)dʒ
432
+ (fr)e
433
+ (fr)eʊ
434
+ (fr)f
435
+ (fr)h
436
+ (fr)i
437
+ (fr)iʰr
438
+ (fr)iː
439
+ (fr)j
440
+ (fr)k
441
+ (fr)l
442
+ (fr)m
443
+ (fr)n
444
+ (fr)o
445
+ (fr)oː
446
+ (fr)p
447
+ (fr)r
448
+ (fr)s
449
+ (fr)t
450
+ (fr)tʃ
451
+ (fr)u
452
+ (fr)uː
453
+ (fr)v
454
+ (fr)w
455
+ (fr)x
456
+ (fr)y
457
+ (fr)yː
458
+ (fr)z
459
+ (fr)ç
460
+ (fr)ð
461
+ (fr)ø
462
+ (fr)øː
463
+ (fr)ŋ
464
+ (fr)œ
465
+ (fr)ɑ
466
+ (fr)ɔ
467
+ (fr)ə
468
+ (fr)ɛ
469
+ (fr)ɡ
470
+ (fr)ɣ
471
+ (fr)ɪ
472
+ (fr)ɪː
473
+ (fr)ɲ
474
+ (fr)ʁ
475
+ (fr)ʃ
476
+ (fr)ʎ
477
+ (fr)ʒ
478
+ (fr)ʰl
479
+ (fr)θ
480
+ (id)a
481
+ (id)aɪ
482
+ (id)aʊ
483
+ (id)b
484
+ (id)d
485
+ (id)dʒ
486
+ (id)e
487
+ (id)f
488
+ (id)h
489
+ (id)i
490
+ (id)j
491
+ (id)k
492
+ (id)l
493
+ (id)m
494
+ (id)n
495
+ (id)o
496
+ (id)p
497
+ (id)r
498
+ (id)s
499
+ (id)t
500
+ (id)tʃ
501
+ (id)u
502
+ (id)v
503
+ (id)w
504
+ (id)x
505
+ (id)z
506
+ (id)ç
507
+ (id)ŋ
508
+ (id)ɔ
509
+ (id)ə
510
+ (id)ɛ
511
+ (id)ɡ
512
+ (id)ɲ
513
+ (id)ɹ
514
+ (id)ʔ
515
+ (id)χ
516
+ (it)a
517
+ (it)aɪ
518
+ (it)aʊ
519
+ (it)aː
520
+ (it)b
521
+ (it)bː
522
+ (it)c
523
+ (it)d
524
+ (it)dz
525
+ (it)dzː
526
+ (it)dʒ
527
+ (it)dʒː
528
+ (it)dː
529
+ (it)e
530
+ (it)eɪ
531
+ (it)eʊ
532
+ (it)eː
533
+ (it)f
534
+ (it)fː
535
+ (it)h
536
+ (it)i
537
+ (it)iː
538
+ (it)j
539
+ (it)k
540
+ (it)kː
541
+ (it)l
542
+ (it)m
543
+ (it)mː
544
+ (it)n
545
+ (it)o
546
+ (it)oɪ
547
+ (it)oː
548
+ (it)p
549
+ (it)pː
550
+ (it)r
551
+ (it)s
552
+ (it)ss
553
+ (it)t
554
+ (it)ts
555
+ (it)tsː
556
+ (it)tʃ
557
+ (it)tʃː
558
+ (it)tː
559
+ (it)u
560
+ (it)uɪ
561
+ (it)uː
562
+ (it)v
563
+ (it)vʲ
564
+ (it)vː
565
+ (it)w
566
+ (it)y
567
+ (it)z
568
+ (it)ŋ
569
+ (it)ɔ
570
+ (it)ɔː
571
+ (it)ə
572
+ (it)əː
573
+ (it)ɛ
574
+ (it)ɛɪ
575
+ (it)ɛː
576
+ (it)ɟ
577
+ (it)ɡ
578
+ (it)ɡː
579
+ (it)ɪ
580
+ (it)ɪː
581
+ (it)ɲ
582
+ (it)ɹ
583
+ (it)ɾ
584
+ (it)ʃ
585
+ (it)ʊ
586
+ (it)ʊː
587
+ (it)ʎ
588
+ (it)ʒ
589
+ (it)ʝ
590
+ (it)ː
591
+ (it)θ
592
+ (it)θː
593
+ (pl)
594
+ (pt)a
595
+ (pt)aɪ
596
+ (pt)aʊ
597
+ (pt)aː
598
+ (pt)b
599
+ (pt)c
600
+ (pt)d
601
+ (pt)dʒ
602
+ (pt)e
603
+ (pt)eɪ
604
+ (pt)eʊ
605
+ (pt)f
606
+ (pt)h
607
+ (pt)i
608
+ (pt)iʊ
609
+ (pt)iː
610
+ (pt)j
611
+ (pt)k
612
+ (pt)l
613
+ (pt)m
614
+ (pt)n
615
+ (pt)o
616
+ (pt)oɪ
617
+ (pt)oː
618
+ (pt)p
619
+ (pt)r
620
+ (pt)s
621
+ (pt)t
622
+ (pt)ts
623
+ (pt)tʃ
624
+ (pt)u
625
+ (pt)uɪ
626
+ (pt)uː
627
+ (pt)v
628
+ (pt)w
629
+ (pt)x
630
+ (pt)y
631
+ (pt)z
632
+ (pt)æ
633
+ (pt)ç
634
+ (pt)ð
635
+ (pt)ŋ
636
+ (pt)ɐ
637
+ (pt)ɑ
638
+ (pt)ɔ
639
+ (pt)ɔɪ
640
+ (pt)ə
641
+ (pt)ɛ
642
+ (pt)ɛɪ
643
+ (pt)ɛʊ
644
+ (pt)ɡ
645
+ (pt)ɣ
646
+ (pt)ɪ
647
+ (pt)ɲ
648
+ (pt)ɹ
649
+ (pt)ɾ
650
+ (pt)ʃ
651
+ (pt)ʊ
652
+ (pt)ʎ
653
+ (pt)ʒ
654
+ (pt)θ
655
+ (ru)a
656
+ (ru)b
657
+ (ru)bʲ
658
+ (ru)c
659
+ (ru)d
660
+ (ru)dʒʲ
661
+ (ru)dʲ
662
+ (ru)e
663
+ (ru)eː
664
+ (ru)f
665
+ (ru)fʲ
666
+ (ru)i
667
+ (ru)iː
668
+ (ru)j
669
+ (ru)ja
670
+ (ru)ju
671
+ (ru)k
672
+ (ru)kʲ
673
+ (ru)l
674
+ (ru)m
675
+ (ru)mʲ
676
+ (ru)n
677
+ (ru)nʲ
678
+ (ru)o
679
+ (ru)p
680
+ (ru)pʲ
681
+ (ru)r
682
+ (ru)rʲ
683
+ (ru)s
684
+ (ru)sʲ
685
+ (ru)t
686
+ (ru)ts
687
+ (ru)tʃʲ
688
+ (ru)tʲ
689
+ (ru)u
690
+ (ru)v
691
+ (ru)vʲ
692
+ (ru)w
693
+ (ru)x
694
+ (ru)y
695
+ (ru)z
696
+ (ru)ç
697
+ (ru)ð
698
+ (ru)ŋ
699
+ (ru)ɑ
700
+ (ru)ɔ
701
+ (ru)ɕ
702
+ (ru)ə
703
+ (ru)ɛ
704
+ (ru)ɡ
705
+ (ru)ɡʲ
706
+ (ru)ɣ
707
+ (ru)ɪ
708
+ (ru)ɭ
709
+ (ru)ɭʲ
710
+ (ru)ɵ
711
+ (ru)ʃ
712
+ (ru)ʌ
713
+ (ru)ʑ
714
+ (ru)ʒ
715
+ (ru)θ
716
+ (vi)a
717
+ (vi)a2
718
+ (vi)a4
719
+ (vi)a5
720
+ (vi)a6
721
+ (vi)aɜ
722
+ (vi)aɪ4
723
+ (vi)aʊɜ
724
+ (vi)aː
725
+ (vi)aː2
726
+ (vi)aː4
727
+ (vi)aː5
728
+ (vi)aː6
729
+ (vi)aːɜ
730
+ (vi)aːɪ
731
+ (vi)b
732
+ (vi)c
733
+ (vi)d
734
+ (vi)e
735
+ (vi)e1
736
+ (vi)e2
737
+ (vi)e4
738
+ (vi)e5
739
+ (vi)e6
740
+ (vi)e7
741
+ (vi)eɜ
742
+ (vi)f
743
+ (vi)h
744
+ (vi)i
745
+ (vi)i2
746
+ (vi)i4
747
+ (vi)i5
748
+ (vi)i6
749
+ (vi)iə
750
+ (vi)iə2
751
+ (vi)iə4
752
+ (vi)iə5
753
+ (vi)iə6
754
+ (vi)iəɜ
755
+ (vi)iɛ
756
+ (vi)iɛ1
757
+ (vi)iɛ2
758
+ (vi)iɛ4
759
+ (vi)iɛ5
760
+ (vi)iɛ6
761
+ (vi)iɛɜ
762
+ (vi)iɜ
763
+ (vi)j
764
+ (vi)k
765
+ (vi)kh
766
+ (vi)l
767
+ (vi)m
768
+ (vi)n
769
+ (vi)o
770
+ (vi)o1
771
+ (vi)o2
772
+ (vi)o4
773
+ (vi)o5
774
+ (vi)o6
775
+ (vi)oɜ
776
+ (vi)p
777
+ (vi)s
778
+ (vi)t
779
+ (vi)tʃ
780
+ (vi)u
781
+ (vi)u2
782
+ (vi)u4
783
+ (vi)u5
784
+ (vi)u6
785
+ (vi)uə
786
+ (vi)uə2
787
+ (vi)uə4
788
+ (vi)uə5
789
+ (vi)uə6
790
+ (vi)uəɜ
791
+ (vi)uɜ
792
+ (vi)v
793
+ (vi)w
794
+ (vi)x
795
+ (vi)y
796
+ (vi)y2
797
+ (vi)y4
798
+ (vi)y5
799
+ (vi)y6
800
+ (vi)yə
801
+ (vi)yə2
802
+ (vi)yə4
803
+ (vi)yə5
804
+ (vi)yə6
805
+ (vi)yə7
806
+ (vi)yəɜ
807
+ (vi)yɜ
808
+ (vi)z
809
+ (vi)ð
810
+ (vi)ŋ
811
+ (vi)ɔ
812
+ (vi)ɔ2
813
+ (vi)ɔ4
814
+ (vi)ɔ5
815
+ (vi)ɔ6
816
+ (vi)ɔɜ
817
+ (vi)ɗ
818
+ (vi)ə
819
+ (vi)ə1
820
+ (vi)ə2
821
+ (vi)ə4
822
+ (vi)ə5
823
+ (vi)ə6
824
+ (vi)əɜ
825
+ (vi)əɪ
826
+ (vi)əɪ2
827
+ (vi)əɪ4
828
+ (vi)əɪ5
829
+ (vi)əɪ6
830
+ (vi)əɪɜ
831
+ (vi)əː
832
+ (vi)əː2
833
+ (vi)əː4
834
+ (vi)əː5
835
+ (vi)əː6
836
+ (vi)əːɜ
837
+ (vi)əːʊ
838
+ (vi)əːʊɜ
839
+ (vi)ɛ
840
+ (vi)ɛ2
841
+ (vi)ɛ4
842
+ (vi)ɛ5
843
+ (vi)ɛ6
844
+ (vi)ɛɜ
845
+ (vi)ɡ
846
+ (vi)ɣ
847
+ (vi)ɲ
848
+ (vi)ʐ
849
+ (vi)ʒ
850
+ ,
851
+ .
852
+ 1
853
+ ?
854
+ ^
855
+ _
856
+ a
857
+
858
+ b
859
+ c
860
+ d
861
+
862
+ e
863
+ f
864
+ i
865
+ j
866
+ k
867
+ l
868
+ m
869
+ n
870
+ o
871
+
872
+ p
873
+ r
874
+ s
875
+ t
876
+ ts
877
+
878
+ u
879
+ v
880
+ w
881
+ x
882
+ z
883
+ ¡
884
+ ç
885
+ ð
886
+ ŋ
887
+ ɔ
888
+ ɛ
889
+ ɡ
890
+ ɣ
891
+ ɲ
892
+ ʃ
893
+ ʎ
894
+ ̃
895
+ ̩
896
+ ̪
897
+ θ
898
+
pretrained_models/data/test_examples/en.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4f229a11a19c2dd55681cd11730bbd53bdd818aaf436cd903de38ba5db47edc
3
+ size 167724
pretrained_models/data/test_examples/es.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13d54a63fa9b49e4d73b6ae5bc8d5daead63573ea1993f7752b7813f713850ab
3
+ size 182444
pretrained_models/data/test_examples/pt.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a14340794c868cebd286893c87452fe1ea8cb9374729b637b54279bdeb992133
3
+ size 177964
pretrained_models/denoiser_model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5eb64fa2e4154c83f8e4935e82871c850c154387ee892e0ab65fe179e7d8c9
3
+ size 16104687
pretrained_models/espeak-ng-data/af_dict ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25729d3bf4c4a0f08da60aea9eb5a0cf352630f83fab1ab0c3955b7740da1776
3
+ size 121473
pretrained_models/espeak-ng-data/am_dict ADDED
Binary file (63.9 kB). View file
 
pretrained_models/espeak-ng-data/an_dict ADDED
Binary file (6.69 kB). View file
 
pretrained_models/espeak-ng-data/ar_dict ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72316426e797777fe4df9420935a3b6a79b37d7e3f3948537ba71cd7b21b2541
3
+ size 478165
pretrained_models/espeak-ng-data/as_dict ADDED
Binary file (5.01 kB). View file
 
pretrained_models/espeak-ng-data/az_dict ADDED
Binary file (43.8 kB). View file
 
pretrained_models/espeak-ng-data/ba_dict ADDED
Binary file (2.1 kB). View file
 
pretrained_models/espeak-ng-data/be_dict ADDED
Binary file (2.65 kB). View file
 
pretrained_models/espeak-ng-data/bg_dict ADDED
Binary file (87.1 kB). View file
 
pretrained_models/espeak-ng-data/bn_dict ADDED
Binary file (90 kB). View file
 
pretrained_models/espeak-ng-data/bpy_dict ADDED
Binary file (5.23 kB). View file
 
pretrained_models/espeak-ng-data/bs_dict ADDED
Binary file (47.1 kB). View file
 
pretrained_models/espeak-ng-data/ca_dict ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59f0f94d03cb6341a1b952ff735d8f0d0cb10136a593462558db7cfc8ee5318d
3
+ size 310331
pretrained_models/espeak-ng-data/chr_dict ADDED
Binary file (2.86 kB). View file
 
pretrained_models/espeak-ng-data/cmn_dict ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a41ddab7213d0284a984def6b40c72cbc40b346d3220db08c6ae324f46d59aa
3
+ size 1566347
pretrained_models/espeak-ng-data/cs_dict ADDED
Binary file (49.6 kB). View file
 
pretrained_models/espeak-ng-data/cv_dict ADDED
Binary file (1.34 kB). View file
 
pretrained_models/espeak-ng-data/cy_dict ADDED
Binary file (43.1 kB). View file
 
pretrained_models/espeak-ng-data/da_dict ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b56272524c3e749ef31f857794f78d9728ffa9fdc62e7b2017965fefa5ff8b0f
3
+ size 245287
pretrained_models/espeak-ng-data/de_dict ADDED
Binary file (69.2 kB). View file
 
pretrained_models/espeak-ng-data/el_dict ADDED
Binary file (72.8 kB). View file
 
pretrained_models/espeak-ng-data/en_dict ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1907005e442ac080b7d8065771c1e2233336cc7ca50734cc7d1317703d9123c4
3
+ size 167080
pretrained_models/espeak-ng-data/eo_dict ADDED
Binary file (4.67 kB). View file
 
pretrained_models/espeak-ng-data/es_dict ADDED
Binary file (49.3 kB). View file
 
pretrained_models/espeak-ng-data/et_dict ADDED
Binary file (44.3 kB). View file
 
pretrained_models/espeak-ng-data/eu_dict ADDED
Binary file (48.8 kB). View file
 
pretrained_models/espeak-ng-data/fa_dict ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0ce04da3d570261be7c7525525c6cc4d71b5875919149d1512cb817c3f0c356
3
+ size 293235
pretrained_models/espeak-ng-data/fi_dict ADDED
Binary file (43.9 kB). View file
 
pretrained_models/espeak-ng-data/fo_dict ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a07227c011f0160afeafaed52efacb13306a0a4f49f1e24801e62b59ba4b0bc
3
+ size 5669268
pretrained_models/espeak-ng-data/fr_dict ADDED
Binary file (63.7 kB). View file
 
pretrained_models/espeak-ng-data/ga_dict ADDED
Binary file (52.7 kB). View file
 
pretrained_models/espeak-ng-data/gd_dict ADDED
Binary file (49.1 kB). View file
 
pretrained_models/espeak-ng-data/gn_dict ADDED
Binary file (3.25 kB). View file