Airenas committed on
Commit
088c366
·
verified ·
1 Parent(s): 05f8488

feat: add model files

Browse files
Files changed (28) hide show
  1. README.md +354 -0
  2. exp/tts_stats_raw_phn_espeak_ng_lt/train/feats_stats.npz +3 -0
  3. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/config.yaml +301 -0
  4. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/backward_time.png +0 -0
  5. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/clip.png +0 -0
  6. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/decoder_alpha.png +0 -0
  7. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/duration_loss.png +0 -0
  8. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/encoder_alpha.png +0 -0
  9. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/energy_loss.png +0 -0
  10. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/forward_time.png +0 -0
  11. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/gpu_max_cached_mem_GB.png +0 -0
  12. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/grad_norm.png +0 -0
  13. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/iter_time.png +0 -0
  14. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/l1_loss.png +0 -0
  15. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/loss.png +0 -0
  16. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/loss_scale.png +0 -0
  17. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/optim0_lr0.png +0 -0
  18. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/optim_step_time.png +0 -0
  19. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/pitch_loss.png +0 -0
  20. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/train_time.png +0 -0
  21. exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/train.loss.ave_5best.pth +3 -0
  22. exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_lengths_stats.npz +3 -0
  23. exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz +3 -0
  24. exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_lengths_stats.npz +3 -0
  25. exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz +3 -0
  26. exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_lengths_stats.npz +3 -0
  27. exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz +3 -0
  28. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,357 @@
1
  ---
2
  license: cc-by-4.0
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-4.0
3
+ tags:
4
+ - espnet
5
+ - fastspeech2
6
+ - acoustic-model
7
+ - audio
8
+ - text-to-speech
9
+ language: lt
10
+ datasets:
11
+ - SING
12
  ---
13
+
14
+ # Sintezės akustinis modelis
15
+
16
+ ## Apie
17
+
18
+ - Garsynas: SIN(tezės) G(arsynas)
19
+ - Balsas: vyriškas
20
+ - Modelio tipas: FastSpeech2 (ESPnet)
21
+
22
+ Modelis parengtas naudojant [ESPnet](https://github.com/airenas/espnet/tree/master/egs2/sing/tts1) repozitoriją.
23
+
24
+ ## Naudojimo pavyzdys sintezėje su ESPnet
25
+
26
+ Žr.: https://colab.research.google.com/github/airenas/espnet/blob/master/egs2/sing/tts1/tts_jupyter_demo.ipynb
27
+
28
+ ---
29
+
30
+ ## English
31
+
32
+ # Acoustic model for speech synthesis
33
+
34
+ ## About
35
+
36
+ - Speech corpus: SIN(tezės) G(arsynas)
37
+ - Voice: male
38
+ - Model type: FastSpeech2 (ESPnet)
39
+
40
+ The model was prepared using the [ESPnet](https://github.com/airenas/espnet/tree/master/egs2/sing/tts1) repository.
41
+
42
+ ## Example usage for synthesis with ESPnet
43
+
44
+ See: https://colab.research.google.com/github/airenas/espnet/blob/master/egs2/sing/tts1/tts_jupyter_demo.ipynb
45
+
46
+ ## TTS config
47
+
48
+ <details><summary>expand</summary>
49
+
50
+ ```yaml
51
+ config: conf/tuning/train_fastspeech2.yaml
52
+ print_config: false
53
+ log_level: INFO
54
+ drop_last_iter: false
55
+ dry_run: false
56
+ iterator_type: sequence
57
+ valid_iterator_type: null
58
+ output_dir: exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt
59
+ ngpu: 1
60
+ seed: 0
61
+ num_workers: 1
62
+ num_att_plot: 3
63
+ dist_backend: nccl
64
+ dist_init_method: env://
65
+ dist_world_size: null
66
+ dist_rank: null
67
+ local_rank: 0
68
+ dist_master_addr: null
69
+ dist_master_port: null
70
+ dist_launcher: null
71
+ multiprocessing_distributed: false
72
+ unused_parameters: false
73
+ sharded_ddp: false
74
+ use_deepspeed: false
75
+ deepspeed_config: null
76
+ gradient_as_bucket_view: true
77
+ ddp_comm_hook: null
78
+ cudnn_enabled: true
79
+ cudnn_benchmark: false
80
+ cudnn_deterministic: true
81
+ use_tf32: false
82
+ collect_stats: false
83
+ write_collected_feats: false
84
+ max_epoch: 1000
85
+ patience: null
86
+ val_scheduler_criterion:
87
+ - valid
88
+ - loss
89
+ early_stopping_criterion:
90
+ - valid
91
+ - loss
92
+ - min
93
+ best_model_criterion:
94
+ - - valid
95
+ - loss
96
+ - min
97
+ - - train
98
+ - loss
99
+ - min
100
+ keep_nbest_models: 5
101
+ nbest_averaging_interval: 0
102
+ grad_clip: 1.0
103
+ grad_clip_type: 2.0
104
+ grad_noise: false
105
+ accum_grad: 8
106
+ no_forward_run: false
107
+ resume: true
108
+ train_dtype: float32
109
+ use_amp: false
110
+ log_interval: null
111
+ use_matplotlib: true
112
+ use_tensorboard: true
113
+ create_graph_in_tensorboard: false
114
+ use_wandb: false
115
+ wandb_project: null
116
+ wandb_id: null
117
+ wandb_entity: null
118
+ wandb_name: null
119
+ wandb_model_log_interval: -1
120
+ detect_anomaly: false
121
+ use_adapter: false
122
+ adapter: lora
123
+ save_strategy: all
124
+ adapter_conf: {}
125
+ pretrain_path: null
126
+ init_param: []
127
+ ignore_init_mismatch: false
128
+ freeze_param: []
129
+ num_iters_per_epoch: 800
130
+ batch_size: 20
131
+ valid_batch_size: null
132
+ batch_bins: 3000000
133
+ valid_batch_bins: null
134
+ category_sample_size: 10
135
+ upsampling_factor: 0.5
136
+ category_upsampling_factor: 0.5
137
+ dataset_upsampling_factor: 0.5
138
+ dataset_scaling_factor: 1.2
139
+ max_batch_size: null
140
+ min_batch_size: 1
141
+ train_shape_file:
142
+ - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/text_shape.phn
143
+ - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/speech_shape
144
+ valid_shape_file:
145
+ - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/text_shape.phn
146
+ - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/speech_shape
147
+ batch_type: numel
148
+ valid_batch_type: null
149
+ fold_length:
150
+ - 150
151
+ - 204800
152
+ sort_in_batch: descending
153
+ shuffle_within_batch: false
154
+ sort_batch: descending
155
+ multiple_iterator: false
156
+ chunk_length: 500
157
+ chunk_shift_ratio: 0.5
158
+ num_cache_chunks: 1024
159
+ chunk_excluded_key_prefixes: []
160
+ chunk_default_fs: null
161
+ chunk_max_abs_length: null
162
+ chunk_discard_short_samples: true
163
+ train_data_path_and_name_and_type:
164
+ - - dump/raw/tr_no_dev/text
165
+ - text
166
+ - text
167
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
168
+ - durations
169
+ - text_int
170
+ - - dump/raw/tr_no_dev/wav.scp
171
+ - speech
172
+ - sound
173
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/collect_feats/pitch.scp
174
+ - pitch
175
+ - npy
176
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/collect_feats/energy.scp
177
+ - energy
178
+ - npy
179
+ valid_data_path_and_name_and_type:
180
+ - - dump/raw/dev/text
181
+ - text
182
+ - text
183
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
184
+ - durations
185
+ - text_int
186
+ - - dump/raw/dev/wav.scp
187
+ - speech
188
+ - sound
189
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/collect_feats/pitch.scp
190
+ - pitch
191
+ - npy
192
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/collect_feats/energy.scp
193
+ - energy
194
+ - npy
195
+ multi_task_dataset: false
196
+ allow_variable_data_keys: false
197
+ max_cache_size: 0.0
198
+ max_cache_fd: 32
199
+ allow_multi_rates: false
200
+ valid_max_cache_size: null
201
+ exclude_weight_decay: false
202
+ exclude_weight_decay_conf: {}
203
+ optim: adam
204
+ optim_conf:
205
+ lr: 1.0
206
+ scheduler: noamlr
207
+ scheduler_conf:
208
+ model_size: 384
209
+ warmup_steps: 4000
210
+ token_list:
211
+ - <blank>
212
+ - <unk>
213
+ - <space>
214
+ - ˈ
215
+ - ʲ
216
+ - e
217
+ - a
218
+ - ː
219
+ - i
220
+ - u
221
+ - t
222
+ - o
223
+ - s
224
+ - r
225
+ - k
226
+ - n
227
+ - m
228
+ - ɪ
229
+ - p
230
+ - d
231
+ - v
232
+ - j
233
+ - ʂ
234
+ - ɭ
235
+ - ɡ
236
+ - .
237
+ - ','
238
+ - b
239
+ - ɕ
240
+ - l
241
+ - ̩
242
+ - ʒ
243
+ - ʃ
244
+ - ɑ
245
+ - z
246
+ - '"'
247
+ - ʑ
248
+ - '?'
249
+ - f
250
+ - '!'
251
+ - ʊ
252
+ - ':'
253
+ - x
254
+ - h
255
+ - ɛ
256
+ - ˌ
257
+ - (
258
+ - )
259
+ - ;
260
+ - ɔ
261
+ - <sos/eos>
262
+ odim: null
263
+ model_conf: {}
264
+ use_preprocessor: true
265
+ token_type: phn
266
+ bpemodel: null
267
+ non_linguistic_symbols: null
268
+ cleaner: null
269
+ g2p: espeak_ng_lt
270
+ feats_extract: fbank
271
+ feats_extract_conf:
272
+ n_fft: 1024
273
+ hop_length: 256
274
+ win_length: null
275
+ fs: 22050
276
+ fmin: 80
277
+ fmax: 7600
278
+ n_mels: 80
279
+ normalize: global_mvn
280
+ normalize_conf:
281
+ stats_file: exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz
282
+ tts: fastspeech2
283
+ tts_conf:
284
+ adim: 384
285
+ aheads: 2
286
+ elayers: 4
287
+ eunits: 1536
288
+ dlayers: 4
289
+ dunits: 1536
290
+ positionwise_layer_type: conv1d
291
+ positionwise_conv_kernel_size: 3
292
+ duration_predictor_layers: 2
293
+ duration_predictor_chans: 256
294
+ duration_predictor_kernel_size: 3
295
+ postnet_layers: 5
296
+ postnet_filts: 5
297
+ postnet_chans: 256
298
+ use_masking: true
299
+ use_scaled_pos_enc: true
300
+ encoder_normalize_before: true
301
+ decoder_normalize_before: true
302
+ reduction_factor: 1
303
+ init_type: xavier_uniform
304
+ init_enc_alpha: 1.0
305
+ init_dec_alpha: 1.0
306
+ transformer_enc_dropout_rate: 0.2
307
+ transformer_enc_positional_dropout_rate: 0.2
308
+ transformer_enc_attn_dropout_rate: 0.2
309
+ transformer_dec_dropout_rate: 0.2
310
+ transformer_dec_positional_dropout_rate: 0.2
311
+ transformer_dec_attn_dropout_rate: 0.2
312
+ pitch_predictor_layers: 5
313
+ pitch_predictor_chans: 256
314
+ pitch_predictor_kernel_size: 5
315
+ pitch_predictor_dropout: 0.5
316
+ pitch_embed_kernel_size: 1
317
+ pitch_embed_dropout: 0.0
318
+ stop_gradient_from_pitch_predictor: true
319
+ energy_predictor_layers: 2
320
+ energy_predictor_chans: 256
321
+ energy_predictor_kernel_size: 3
322
+ energy_predictor_dropout: 0.5
323
+ energy_embed_kernel_size: 1
324
+ energy_embed_dropout: 0.0
325
+ stop_gradient_from_energy_predictor: false
326
+ pitch_extract: dio
327
+ pitch_extract_conf:
328
+ fs: 22050
329
+ n_fft: 1024
330
+ hop_length: 256
331
+ f0max: 330
332
+ f0min: 65
333
+ reduction_factor: 1
334
+ pitch_normalize: global_mvn
335
+ pitch_normalize_conf:
336
+ stats_file: exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz
337
+ energy_extract: energy
338
+ energy_extract_conf:
339
+ fs: 22050
340
+ n_fft: 1024
341
+ hop_length: 256
342
+ win_length: null
343
+ reduction_factor: 1
344
+ energy_normalize: global_mvn
345
+ energy_normalize_conf:
346
+ stats_file: exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz
347
+ required:
348
+ - output_dir
349
+ - token_list
350
+ version: '202511'
351
+ distributed: false
352
+
353
+ ```
354
+
355
+ </details>
356
+
357
+
exp/tts_stats_raw_phn_espeak_ng_lt/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b50d857b80b28c85acd46a59a19931b62c3fc8f1810f40f34a0425e9a2066e3a
3
+ size 1402
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/config.yaml ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 1
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ gradient_as_bucket_view: true
27
+ ddp_comm_hook: null
28
+ cudnn_enabled: true
29
+ cudnn_benchmark: false
30
+ cudnn_deterministic: true
31
+ use_tf32: false
32
+ collect_stats: false
33
+ write_collected_feats: false
34
+ max_epoch: 1000
35
+ patience: null
36
+ val_scheduler_criterion:
37
+ - valid
38
+ - loss
39
+ early_stopping_criterion:
40
+ - valid
41
+ - loss
42
+ - min
43
+ best_model_criterion:
44
+ - - valid
45
+ - loss
46
+ - min
47
+ - - train
48
+ - loss
49
+ - min
50
+ keep_nbest_models: 5
51
+ nbest_averaging_interval: 0
52
+ grad_clip: 1.0
53
+ grad_clip_type: 2.0
54
+ grad_noise: false
55
+ accum_grad: 8
56
+ no_forward_run: false
57
+ resume: true
58
+ train_dtype: float32
59
+ use_amp: false
60
+ log_interval: null
61
+ use_matplotlib: true
62
+ use_tensorboard: true
63
+ create_graph_in_tensorboard: false
64
+ use_wandb: false
65
+ wandb_project: null
66
+ wandb_id: null
67
+ wandb_entity: null
68
+ wandb_name: null
69
+ wandb_model_log_interval: -1
70
+ detect_anomaly: false
71
+ use_adapter: false
72
+ adapter: lora
73
+ save_strategy: all
74
+ adapter_conf: {}
75
+ pretrain_path: null
76
+ init_param: []
77
+ ignore_init_mismatch: false
78
+ freeze_param: []
79
+ num_iters_per_epoch: 800
80
+ batch_size: 20
81
+ valid_batch_size: null
82
+ batch_bins: 3000000
83
+ valid_batch_bins: null
84
+ category_sample_size: 10
85
+ upsampling_factor: 0.5
86
+ category_upsampling_factor: 0.5
87
+ dataset_upsampling_factor: 0.5
88
+ dataset_scaling_factor: 1.2
89
+ max_batch_size: null
90
+ min_batch_size: 1
91
+ train_shape_file:
92
+ - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/text_shape.phn
93
+ - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/speech_shape
94
+ valid_shape_file:
95
+ - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/text_shape.phn
96
+ - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/speech_shape
97
+ batch_type: numel
98
+ valid_batch_type: null
99
+ fold_length:
100
+ - 150
101
+ - 204800
102
+ sort_in_batch: descending
103
+ shuffle_within_batch: false
104
+ sort_batch: descending
105
+ multiple_iterator: false
106
+ chunk_length: 500
107
+ chunk_shift_ratio: 0.5
108
+ num_cache_chunks: 1024
109
+ chunk_excluded_key_prefixes: []
110
+ chunk_default_fs: null
111
+ chunk_max_abs_length: null
112
+ chunk_discard_short_samples: true
113
+ train_data_path_and_name_and_type:
114
+ - - dump/raw/tr_no_dev/text
115
+ - text
116
+ - text
117
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
118
+ - durations
119
+ - text_int
120
+ - - dump/raw/tr_no_dev/wav.scp
121
+ - speech
122
+ - sound
123
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/collect_feats/pitch.scp
124
+ - pitch
125
+ - npy
126
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/collect_feats/energy.scp
127
+ - energy
128
+ - npy
129
+ valid_data_path_and_name_and_type:
130
+ - - dump/raw/dev/text
131
+ - text
132
+ - text
133
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
134
+ - durations
135
+ - text_int
136
+ - - dump/raw/dev/wav.scp
137
+ - speech
138
+ - sound
139
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/collect_feats/pitch.scp
140
+ - pitch
141
+ - npy
142
+ - - exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/collect_feats/energy.scp
143
+ - energy
144
+ - npy
145
+ multi_task_dataset: false
146
+ allow_variable_data_keys: false
147
+ max_cache_size: 0.0
148
+ max_cache_fd: 32
149
+ allow_multi_rates: false
150
+ valid_max_cache_size: null
151
+ exclude_weight_decay: false
152
+ exclude_weight_decay_conf: {}
153
+ optim: adam
154
+ optim_conf:
155
+ lr: 1.0
156
+ scheduler: noamlr
157
+ scheduler_conf:
158
+ model_size: 384
159
+ warmup_steps: 4000
160
+ token_list:
161
+ - <blank>
162
+ - <unk>
163
+ - <space>
164
+ - ˈ
165
+ - ʲ
166
+ - e
167
+ - a
168
+ - ː
169
+ - i
170
+ - u
171
+ - t
172
+ - o
173
+ - s
174
+ - r
175
+ - k
176
+ - n
177
+ - m
178
+ - ɪ
179
+ - p
180
+ - d
181
+ - v
182
+ - j
183
+ - ʂ
184
+ - ɭ
185
+ - ɡ
186
+ - .
187
+ - ','
188
+ - b
189
+ - ɕ
190
+ - l
191
+ - ̩
192
+ - ʒ
193
+ - ʃ
194
+ - ɑ
195
+ - z
196
+ - '"'
197
+ - ʑ
198
+ - '?'
199
+ - f
200
+ - '!'
201
+ - ʊ
202
+ - ':'
203
+ - x
204
+ - h
205
+ - ɛ
206
+ - ˌ
207
+ - (
208
+ - )
209
+ - ;
210
+ - ɔ
211
+ - <sos/eos>
212
+ odim: null
213
+ model_conf: {}
214
+ use_preprocessor: true
215
+ token_type: phn
216
+ bpemodel: null
217
+ non_linguistic_symbols: null
218
+ cleaner: null
219
+ g2p: espeak_ng_lt
220
+ feats_extract: fbank
221
+ feats_extract_conf:
222
+ n_fft: 1024
223
+ hop_length: 256
224
+ win_length: null
225
+ fs: 22050
226
+ fmin: 80
227
+ fmax: 7600
228
+ n_mels: 80
229
+ normalize: global_mvn
230
+ normalize_conf:
231
+ stats_file: exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz
232
+ tts: fastspeech2
233
+ tts_conf:
234
+ adim: 384
235
+ aheads: 2
236
+ elayers: 4
237
+ eunits: 1536
238
+ dlayers: 4
239
+ dunits: 1536
240
+ positionwise_layer_type: conv1d
241
+ positionwise_conv_kernel_size: 3
242
+ duration_predictor_layers: 2
243
+ duration_predictor_chans: 256
244
+ duration_predictor_kernel_size: 3
245
+ postnet_layers: 5
246
+ postnet_filts: 5
247
+ postnet_chans: 256
248
+ use_masking: true
249
+ use_scaled_pos_enc: true
250
+ encoder_normalize_before: true
251
+ decoder_normalize_before: true
252
+ reduction_factor: 1
253
+ init_type: xavier_uniform
254
+ init_enc_alpha: 1.0
255
+ init_dec_alpha: 1.0
256
+ transformer_enc_dropout_rate: 0.2
257
+ transformer_enc_positional_dropout_rate: 0.2
258
+ transformer_enc_attn_dropout_rate: 0.2
259
+ transformer_dec_dropout_rate: 0.2
260
+ transformer_dec_positional_dropout_rate: 0.2
261
+ transformer_dec_attn_dropout_rate: 0.2
262
+ pitch_predictor_layers: 5
263
+ pitch_predictor_chans: 256
264
+ pitch_predictor_kernel_size: 5
265
+ pitch_predictor_dropout: 0.5
266
+ pitch_embed_kernel_size: 1
267
+ pitch_embed_dropout: 0.0
268
+ stop_gradient_from_pitch_predictor: true
269
+ energy_predictor_layers: 2
270
+ energy_predictor_chans: 256
271
+ energy_predictor_kernel_size: 3
272
+ energy_predictor_dropout: 0.5
273
+ energy_embed_kernel_size: 1
274
+ energy_embed_dropout: 0.0
275
+ stop_gradient_from_energy_predictor: false
276
+ pitch_extract: dio
277
+ pitch_extract_conf:
278
+ fs: 22050
279
+ n_fft: 1024
280
+ hop_length: 256
281
+ f0max: 330
282
+ f0min: 65
283
+ reduction_factor: 1
284
+ pitch_normalize: global_mvn
285
+ pitch_normalize_conf:
286
+ stats_file: exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz
287
+ energy_extract: energy
288
+ energy_extract_conf:
289
+ fs: 22050
290
+ n_fft: 1024
291
+ hop_length: 256
292
+ win_length: null
293
+ reduction_factor: 1
294
+ energy_normalize: global_mvn
295
+ energy_normalize_conf:
296
+ stats_file: exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz
297
+ required:
298
+ - output_dir
299
+ - token_list
300
+ version: '202511'
301
+ distributed: false
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/backward_time.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/clip.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/decoder_alpha.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/duration_loss.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/encoder_alpha.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/energy_loss.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/forward_time.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/grad_norm.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/iter_time.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/l1_loss.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/loss.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/loss_scale.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/optim0_lr0.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/optim_step_time.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/pitch_loss.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/images/train_time.png ADDED
exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a3c181c209e3fadeb004adf851e3ce43784b59512bb267994cfb7f3383dccbc
3
+ size 148684879
exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48c48db32707b063b0e62c7506981fb8e9112666183210985c516535cab80b1c
3
+ size 778
exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4658b0470ada9ec1649501126ae81603e6086fa03f1dd5c43624b53b87b0d14
3
+ size 770
exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:676a2849d09bd6464161b2dad4d89073d8e85e3fadc816d32124be8d4d2b195d
3
+ size 778
exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b50d857b80b28c85acd46a59a19931b62c3fc8f1810f40f34a0425e9a2066e3a
3
+ size 1402
exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48c48db32707b063b0e62c7506981fb8e9112666183210985c516535cab80b1c
3
+ size 778
exp/tts_train_raw_phn_espeak_ng_lt/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e56f61925f66152923a8bc1dd153bb755b727694148fcc806d7031c9b95a3117
3
+ size 770
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202511'
2
+ files:
3
+ model_file: exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/train.loss.ave_5best.pth
4
+ python: 3.11.14 (main, Oct 21 2025, 18:31:21) [GCC 11.2.0]
5
+ timestamp: 1770196381.722465
6
+ torch: 2.9.1+cu128
7
+ yaml_files:
8
+ train_config: exp/tts_train_fastspeech2_raw_phn_espeak_ng_lt/config.yaml