niobures committed on
Commit
07d6458
·
verified ·
1 Parent(s): b170ec5

tts_fa_fastpitch_hifigan-v2.0

Browse files
Files changed (35) hide show
  1. .gitattributes +3 -0
  2. tts_fa_fastpitch_hifigan-v2.0/.gitattributes +38 -0
  3. tts_fa_fastpitch_hifigan-v2.0/README.md +65 -0
  4. tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/.ipynb_checkpoints/fastpitch_align_22050_grapheme-checkpoint.yaml +248 -0
  5. tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/ds_for_fastpitch_align.yaml +47 -0
  6. tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/fastpitch_align_22050_grapheme.yaml +248 -0
  7. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/.ipynb_checkpoints/hifigan-checkpoint.yaml +99 -0
  8. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/hifigan.yaml +99 -0
  9. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/.ipynb_checkpoints/v1-checkpoint.yaml +7 -0
  10. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/v1.yaml +7 -0
  11. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/.ipynb_checkpoints/train_ds_finetune-checkpoint.yaml +15 -0
  12. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/train_ds_finetune.yaml +15 -0
  13. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds_finetune.yaml +15 -0
  14. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/v1.yaml +7 -0
  15. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/val_ds_finetune.yaml +15 -0
  16. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/.ipynb_checkpoints/val_ds_finetune-checkpoint.yaml +15 -0
  17. tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/val_ds_finetune.yaml +15 -0
  18. tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.ckpt +3 -0
  19. tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo +3 -0
  20. tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo +3 -0
  21. tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0688-epoch-12-last.ckpt +3 -0
  22. tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo +3 -0
  23. tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/fastpitch.onnx +3 -0
  24. tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/hifigan.onnx +3 -0
  25. tts_fa_fastpitch_hifigan-v2.0/persian-dict/persian-v5.0.dict +0 -0
  26. tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/generate_mels-checkpoint.py +181 -0
  27. tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/hifigan_finetune-checkpoint.py +32 -0
  28. tts_fa_fastpitch_hifigan-v2.0/scripts/extract_sup_data.py +83 -0
  29. tts_fa_fastpitch_hifigan-v2.0/scripts/fastpitch.py +35 -0
  30. tts_fa_fastpitch_hifigan-v2.0/scripts/generate_mels.py +181 -0
  31. tts_fa_fastpitch_hifigan-v2.0/scripts/hifigan_finetune.py +32 -0
  32. tts_fa_fastpitch_hifigan-v2.0/source.txt +1 -0
  33. tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-from-scratch-finetuning-hifigan.ipynb +0 -0
  34. tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-inference-only.ipynb +0 -0
  35. tts_fa_fastpitch_hifigan-v2.0/tts_nemo_fastpitch_hifigan_convert_to_onnx.ipynb +884 -0
.gitattributes CHANGED
@@ -48,3 +48,6 @@ hifigan_for_sherpa/pretrained/UNIVERSAL_V1/g_02500000 filter=lfs diff=lfs merge=
48
  hifigan_for_sherpa/pretrained/VCTK_V1/generator_v1 filter=lfs diff=lfs merge=lfs -text
49
  hifigan_for_sherpa/pretrained/VCTK_V2/generator_v2 filter=lfs diff=lfs merge=lfs -text
50
  hifigan_for_sherpa/pretrained/VCTK_V3/generator_v3 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
48
  hifigan_for_sherpa/pretrained/VCTK_V1/generator_v1 filter=lfs diff=lfs merge=lfs -text
49
  hifigan_for_sherpa/pretrained/VCTK_V2/generator_v2 filter=lfs diff=lfs merge=lfs -text
50
  hifigan_for_sherpa/pretrained/VCTK_V3/generator_v3 filter=lfs diff=lfs merge=lfs -text
51
+ tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo filter=lfs diff=lfs merge=lfs -text
52
+ tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo filter=lfs diff=lfs merge=lfs -text
53
+ tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo filter=lfs diff=lfs merge=lfs -text
tts_fa_fastpitch_hifigan-v2.0/.gitattributes ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ models/FastPitch--val_loss-0.7236-epoch-50.nemo filter=lfs diff=lfs merge=lfs -text
37
+ models/HifiGan--val_loss-0.6090-epoch-39-last.nemo filter=lfs diff=lfs merge=lfs -text
38
+ models/HifiGan--val_loss-2.0733-epoch-12-last.nemo filter=lfs diff=lfs merge=lfs -text
tts_fa_fastpitch_hifigan-v2.0/README.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ ## FastPitch and HifiGan v2.0
5
+
6
+ v2.0 of phonemizer and tokenizer. The tokenizer `DOES SUPPORT` pauses, emotion tokens, etc.
7
+
8
+
9
+ ### Install NeMo
10
+ ```bash
11
+ apt-get update && apt-get install -y libsndfile1 ffmpeg
12
+ pip install Cython packaging
13
+ rm -rf /usr/lib/python3.10/site-packages/blinker*
14
+ rm -rf /usr/local/lib/python3.10/dist-packages/blinker*
15
+ pip install --ignore-installed blinker
16
+ pip install --upgrade --force-reinstall blinker
17
+
18
+ git clone https://github.com/SadeghKrmi/NeMo.git
19
+ cd NeMo
20
+ pip install -e '.[all]'
21
+ ```
22
+
23
+
24
+ ### deterministic split
25
+ Run the deterministic-train-test-split.py to split the train/test
26
+
27
+
28
+ ### Extract the supportive data
29
+ using the following scripts, extract pitch statistics
30
+ ```bash
31
+ tar -xzf dataset_splits.tar.gz
32
+
33
+ cd extract-supportive-data
34
+ HYDRA_FULL_ERROR=1 python3 ./scripts/extract_sup_data.py \
35
+ --config-path ../config/fastpitch/ \
36
+ --config-name ds_for_fastpitch_align.yaml \
37
+ manifest_filepath=./dataset_splits/train/train.jsonl \
38
+ sup_data_path=sup_data \
39
+ phoneme_dict_path=./persian-dict/persian-v4.0.dict \
40
+ ++dataloader_params.num_workers=8
41
+ ```
42
+
43
+ #### dataset sup pitch stats
44
+ PITCH_MEAN=98.72935485839844, PITCH_STD=29.40760040283203
45
+ PITCH_MIN=65.4063949584961, PITCH_MAX=2093.004638671875
46
+
47
+ ### zip and download
48
+ ```bash
49
+ tar -czf sup_data.tar.gz sup_data
50
+ ```
51
+
52
+
53
+ ### Training FastPitch
54
+ training for about 800 epochs, with CosineAnnealing sched. and `max_steps` 200,000 for lr to decay over time.
55
+
56
+ val_loss didn't decrease lower than about 0.77xx
57
+
58
+ `val_loss = mel_loss + dur_loss + pitch_loss + energy_loss`
59
+
60
+ ### Training HiFiGAN
61
+ training for about 40 epochs; stopped the training based on quality checking by listening to the audio
62
+
63
+
64
+
65
+
tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/.ipynb_checkpoints/fastpitch_align_22050_grapheme-checkpoint.yaml ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This config contains the default values for training FastPitch model with aligner using 22KHz sampling
2
+ # rate. If you want to train model on other dataset, you can change config values according to your dataset.
3
+ # Most dataset-specific arguments are in the head of the config file, see below.
4
+
5
+ name: FastPitch
6
+
7
+ train_dataset: ???
8
+ validation_datasets: ???
9
+ sup_data_path: ???
10
+ sup_data_types: [ "align_prior_matrix", "pitch" ]
11
+
12
+ phoneme_dict_path: ???
13
+
14
+ # Default values from librosa.pyin
15
+ pitch_fmin: 65.4063949584961
16
+ pitch_fmax: 2093.004638671875
17
+
18
+ # these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
19
+ # by running `scripts/dataset_processing/tts/extract_sup_data.py`
20
+ pitch_mean: 103.01591491699219 # e.g. 132.524658203125 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
21
+ pitch_std: 30.397296905517578 # e.g. 37.389366149902 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
22
+
23
+ # Default values for dataset with sample_rate=22050
24
+ sample_rate: 22050
25
+ n_mel_channels: 80
26
+ n_window_size: 1024
27
+ n_window_stride: 256
28
+ n_fft: 1024
29
+ lowfreq: 0
30
+ highfreq: null
31
+ window: hann
32
+
33
+ model:
34
+ learn_alignment: true
35
+ bin_loss_warmup_epochs: 100
36
+
37
+ n_speakers: 1
38
+ max_token_duration: 75
39
+ symbols_embedding_dim: 384
40
+ pitch_embedding_kernel_size: 3
41
+
42
+ pitch_fmin: ${pitch_fmin}
43
+ pitch_fmax: ${pitch_fmax}
44
+
45
+ pitch_mean: ${pitch_mean}
46
+ pitch_std: ${pitch_std}
47
+
48
+ sample_rate: ${sample_rate}
49
+ n_mel_channels: ${n_mel_channels}
50
+ n_window_size: ${n_window_size}
51
+ n_window_stride: ${n_window_stride}
52
+ n_fft: ${n_fft}
53
+ lowfreq: ${lowfreq}
54
+ highfreq: ${highfreq}
55
+ window: ${window}
56
+
57
+ # text_normalizer:
58
+ # _target_: nemo_text_processing.text_normalization.normalize.Normalizer
59
+ # lang: de
60
+ # input_case: cased
61
+
62
+ # text_normalizer_call_kwargs:
63
+ # verbose: false
64
+ # punct_pre_process: true
65
+ # punct_post_process: true
66
+
67
+ text_tokenizer:
68
+ _target_: nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer.PersianPhonemesTokenizer
69
+ punct: true
70
+ use_emotion_tokens: true
71
+ use_pause_tokens: true
72
+ use_speed_tokens: true
73
+ g2p:
74
+ _target_: nemo.collections.tts.g2p.models.fa_ir_persian.g2p.PersianG2p
75
+ phoneme_dict: ${phoneme_dict_path}
76
+
77
+ train_ds:
78
+ dataset:
79
+ _target_: nemo.collections.tts.data.dataset.TTSDataset
80
+ manifest_filepath: ${train_dataset}
81
+ sample_rate: ${model.sample_rate}
82
+ sup_data_path: ${sup_data_path}
83
+ sup_data_types: ${sup_data_types}
84
+ n_fft: ${model.n_fft}
85
+ win_length: ${model.n_window_size}
86
+ hop_length: ${model.n_window_stride}
87
+ window: ${model.window}
88
+ n_mels: ${model.n_mel_channels}
89
+ lowfreq: ${model.lowfreq}
90
+ highfreq: ${model.highfreq}
91
+ max_duration: 25 # change to null to include longer audios.
92
+ min_duration: 0.1
93
+ ignore_file: null
94
+ trim: true
95
+ trim_top_db: 50
96
+ trim_frame_length: ${model.n_window_size}
97
+ trim_hop_length: ${model.n_window_stride}
98
+ pitch_fmin: ${model.pitch_fmin}
99
+ pitch_fmax: ${model.pitch_fmax}
100
+ pitch_norm: true
101
+ pitch_mean: ${model.pitch_mean}
102
+ pitch_std: ${model.pitch_std}
103
+
104
+ dataloader_params:
105
+ drop_last: false
106
+ shuffle: true
107
+ batch_size: 32
108
+ num_workers: 12
109
+ pin_memory: true
110
+
111
+ validation_ds:
112
+ dataset:
113
+ _target_: nemo.collections.tts.data.dataset.TTSDataset
114
+ manifest_filepath: ${validation_datasets}
115
+ sample_rate: ${model.sample_rate}
116
+ sup_data_path: ${sup_data_path}
117
+ sup_data_types: ${sup_data_types}
118
+ n_fft: ${model.n_fft}
119
+ win_length: ${model.n_window_size}
120
+ hop_length: ${model.n_window_stride}
121
+ window: ${model.window}
122
+ n_mels: ${model.n_mel_channels}
123
+ lowfreq: ${model.lowfreq}
124
+ highfreq: ${model.highfreq}
125
+ max_duration: 25 # change to null to include longer audios.
126
+ min_duration: 0.1
127
+ ignore_file: null
128
+ trim: true
129
+ trim_top_db: 50
130
+ trim_frame_length: ${model.n_window_size}
131
+ trim_hop_length: ${model.n_window_stride}
132
+ pitch_fmin: ${model.pitch_fmin}
133
+ pitch_fmax: ${model.pitch_fmax}
134
+ pitch_norm: true
135
+ pitch_mean: ${model.pitch_mean}
136
+ pitch_std: ${model.pitch_std}
137
+
138
+ dataloader_params:
139
+ drop_last: false
140
+ shuffle: false
141
+ batch_size: 32
142
+ num_workers: 8
143
+ pin_memory: true
144
+
145
+ preprocessor:
146
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
147
+ features: ${model.n_mel_channels}
148
+ lowfreq: ${model.lowfreq}
149
+ highfreq: ${model.highfreq}
150
+ n_fft: ${model.n_fft}
151
+ n_window_size: ${model.n_window_size}
152
+ window_size: false
153
+ n_window_stride: ${model.n_window_stride}
154
+ window_stride: false
155
+ pad_to: 1
156
+ pad_value: 0
157
+ sample_rate: ${model.sample_rate}
158
+ window: ${model.window}
159
+ normalize: null
160
+ preemph: null
161
+ dither: 0.0
162
+ frame_splicing: 1
163
+ log: true
164
+ log_zero_guard_type: add
165
+ log_zero_guard_value: 1e-05
166
+ mag_power: 1.0
167
+
168
+ input_fft: #n_embed and padding_idx are added by the model
169
+ _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
170
+ n_layer: 6
171
+ n_head: 1
172
+ d_model: ${model.symbols_embedding_dim}
173
+ d_head: 64
174
+ d_inner: 1536
175
+ kernel_size: 3
176
+ dropout: 0.1
177
+ dropatt: 0.1
178
+ dropemb: 0.0
179
+ d_embed: ${model.symbols_embedding_dim}
180
+
181
+ output_fft:
182
+ _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
183
+ n_layer: 6
184
+ n_head: 1
185
+ d_model: ${model.symbols_embedding_dim}
186
+ d_head: 64
187
+ d_inner: 1536
188
+ kernel_size: 3
189
+ dropout: 0.1
190
+ dropatt: 0.1
191
+ dropemb: 0.0
192
+
193
+ alignment_module:
194
+ _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
195
+ n_text_channels: ${model.symbols_embedding_dim}
196
+
197
+ duration_predictor:
198
+ _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
199
+ input_size: ${model.symbols_embedding_dim}
200
+ kernel_size: 3
201
+ filter_size: 256
202
+ dropout: 0.1
203
+ n_layers: 2
204
+
205
+ pitch_predictor:
206
+ _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
207
+ input_size: ${model.symbols_embedding_dim}
208
+ kernel_size: 3
209
+ filter_size: 256
210
+ dropout: 0.1
211
+ n_layers: 2
212
+
213
+ optim:
214
+ name: adamw
215
+ lr: 1e-3
216
+ betas: [0.9, 0.98]
217
+ weight_decay: 1e-3
218
+
219
+ sched:
220
+ name: CosineAnnealing
221
+ warmup_steps: 2000
222
+ last_epoch: -1
223
+ min_lr: 1e-6
224
+
225
+ trainer:
226
+ num_nodes: 1
227
+ devices: -1 # specify all GPUs regardless of its availability
228
+ accelerator: gpu
229
+ strategy: ddp
230
+ precision: 16
231
+ max_epochs: 1500
232
+ accumulate_grad_batches: 1
233
+ gradient_clip_val: 1000.0
234
+ enable_checkpointing: false # Provided by exp_manager
235
+ logger: false # Provided by exp_manager
236
+ log_every_n_steps: 100
237
+ check_val_every_n_epoch: 5
238
+ benchmark: false
239
+
240
+ exp_manager:
241
+ exp_dir: null
242
+ name: ${name}
243
+ create_tensorboard_logger: true
244
+ create_checkpoint_callback: true
245
+ checkpoint_callback_params:
246
+ monitor: val_loss
247
+ resume_if_exists: false
248
+ resume_ignore_no_checkpoint: false
tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/ds_for_fastpitch_align.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "ds_for_fastpitch_align"
2
+
3
+ manifest_filepath: ???
4
+ sup_data_path: ???
5
+ sup_data_types: [ "align_prior_matrix", "pitch" ]
6
+ phoneme_dict_path: ???
7
+
8
+
9
+ dataset:
10
+ _target_: nemo.collections.tts.data.dataset.TTSDataset
11
+ manifest_filepath: ${manifest_filepath}
12
+ sample_rate: 22050
13
+ sup_data_path: ${sup_data_path}
14
+ sup_data_types: ${sup_data_types}
15
+ n_fft: 1024
16
+ win_length: 1024
17
+ hop_length: 256
18
+ window: "hann"
19
+ n_mels: 80
20
+ lowfreq: 0
21
+ highfreq: 8000
22
+ max_duration: null
23
+ min_duration: 0.1
24
+ ignore_file: null
25
+ trim: false
26
+ pitch_fmin: 65.40639132514966
27
+ pitch_fmax: 2093.004522404789
28
+
29
+ # text_normalizer:
30
+ # _target_: nemo_text_processing.text_normalization.normalize.Normalizer
31
+ # lang: en
32
+ # input_case: cased
33
+
34
+ # text_normalizer_call_kwargs:
35
+ # verbose: false
36
+ # punct_pre_process: true
37
+ # punct_post_process: true
38
+
39
+ text_tokenizer:
40
+ _target_: nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer.PersianPhonemesTokenizer
41
+ punct: true
42
+ use_emotion_tokens: true
43
+ use_pause_tokens: true
44
+ use_speed_tokens: true
45
+ g2p:
46
+ _target_: nemo.collections.tts.g2p.models.fa_ir_persian.g2p.PersianG2p
47
+ phoneme_dict: ${phoneme_dict_path}
tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/fastpitch_align_22050_grapheme.yaml ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This config contains the default values for training FastPitch model with aligner using 22KHz sampling
2
+ # rate. If you want to train model on other dataset, you can change config values according to your dataset.
3
+ # Most dataset-specific arguments are in the head of the config file, see below.
4
+
5
+ name: FastPitch
6
+
7
+ train_dataset: ???
8
+ validation_datasets: ???
9
+ sup_data_path: ???
10
+ sup_data_types: [ "align_prior_matrix", "pitch" ]
11
+
12
+ phoneme_dict_path: ???
13
+
14
+ # Default values from librosa.pyin
15
+ pitch_fmin: 65.4063949584961
16
+ pitch_fmax: 2093.004638671875
17
+
18
+ # these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
19
+ # by running `scripts/dataset_processing/tts/extract_sup_data.py`
20
+ pitch_mean: 103.01591491699219 # e.g. 132.524658203125 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
21
+ pitch_std: 30.397296905517578 # e.g. 37.389366149902 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
22
+
23
+ # Default values for dataset with sample_rate=22050
24
+ sample_rate: 22050
25
+ n_mel_channels: 80
26
+ n_window_size: 1024
27
+ n_window_stride: 256
28
+ n_fft: 1024
29
+ lowfreq: 0
30
+ highfreq: null
31
+ window: hann
32
+
33
+ model:
34
+ learn_alignment: true
35
+ bin_loss_warmup_epochs: 100
36
+
37
+ n_speakers: 1
38
+ max_token_duration: 75
39
+ symbols_embedding_dim: 384
40
+ pitch_embedding_kernel_size: 3
41
+
42
+ pitch_fmin: ${pitch_fmin}
43
+ pitch_fmax: ${pitch_fmax}
44
+
45
+ pitch_mean: ${pitch_mean}
46
+ pitch_std: ${pitch_std}
47
+
48
+ sample_rate: ${sample_rate}
49
+ n_mel_channels: ${n_mel_channels}
50
+ n_window_size: ${n_window_size}
51
+ n_window_stride: ${n_window_stride}
52
+ n_fft: ${n_fft}
53
+ lowfreq: ${lowfreq}
54
+ highfreq: ${highfreq}
55
+ window: ${window}
56
+
57
+ # text_normalizer:
58
+ # _target_: nemo_text_processing.text_normalization.normalize.Normalizer
59
+ # lang: de
60
+ # input_case: cased
61
+
62
+ # text_normalizer_call_kwargs:
63
+ # verbose: false
64
+ # punct_pre_process: true
65
+ # punct_post_process: true
66
+
67
+ text_tokenizer:
68
+ _target_: nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer.PersianPhonemesTokenizer
69
+ punct: true
70
+ use_emotion_tokens: true
71
+ use_pause_tokens: true
72
+ use_speed_tokens: true
73
+ g2p:
74
+ _target_: nemo.collections.tts.g2p.models.fa_ir_persian.g2p.PersianG2p
75
+ phoneme_dict: ${phoneme_dict_path}
76
+
77
+ train_ds:
78
+ dataset:
79
+ _target_: nemo.collections.tts.data.dataset.TTSDataset
80
+ manifest_filepath: ${train_dataset}
81
+ sample_rate: ${model.sample_rate}
82
+ sup_data_path: ${sup_data_path}
83
+ sup_data_types: ${sup_data_types}
84
+ n_fft: ${model.n_fft}
85
+ win_length: ${model.n_window_size}
86
+ hop_length: ${model.n_window_stride}
87
+ window: ${model.window}
88
+ n_mels: ${model.n_mel_channels}
89
+ lowfreq: ${model.lowfreq}
90
+ highfreq: ${model.highfreq}
91
+ max_duration: 25 # change to null to include longer audios.
92
+ min_duration: 0.1
93
+ ignore_file: null
94
+ trim: true
95
+ trim_top_db: 50
96
+ trim_frame_length: ${model.n_window_size}
97
+ trim_hop_length: ${model.n_window_stride}
98
+ pitch_fmin: ${model.pitch_fmin}
99
+ pitch_fmax: ${model.pitch_fmax}
100
+ pitch_norm: true
101
+ pitch_mean: ${model.pitch_mean}
102
+ pitch_std: ${model.pitch_std}
103
+
104
+ dataloader_params:
105
+ drop_last: false
106
+ shuffle: true
107
+ batch_size: 32
108
+ num_workers: 12
109
+ pin_memory: true
110
+
111
+ validation_ds:
112
+ dataset:
113
+ _target_: nemo.collections.tts.data.dataset.TTSDataset
114
+ manifest_filepath: ${validation_datasets}
115
+ sample_rate: ${model.sample_rate}
116
+ sup_data_path: ${sup_data_path}
117
+ sup_data_types: ${sup_data_types}
118
+ n_fft: ${model.n_fft}
119
+ win_length: ${model.n_window_size}
120
+ hop_length: ${model.n_window_stride}
121
+ window: ${model.window}
122
+ n_mels: ${model.n_mel_channels}
123
+ lowfreq: ${model.lowfreq}
124
+ highfreq: ${model.highfreq}
125
+ max_duration: 25 # change to null to include longer audios.
126
+ min_duration: 0.1
127
+ ignore_file: null
128
+ trim: true
129
+ trim_top_db: 50
130
+ trim_frame_length: ${model.n_window_size}
131
+ trim_hop_length: ${model.n_window_stride}
132
+ pitch_fmin: ${model.pitch_fmin}
133
+ pitch_fmax: ${model.pitch_fmax}
134
+ pitch_norm: true
135
+ pitch_mean: ${model.pitch_mean}
136
+ pitch_std: ${model.pitch_std}
137
+
138
+ dataloader_params:
139
+ drop_last: false
140
+ shuffle: false
141
+ batch_size: 32
142
+ num_workers: 8
143
+ pin_memory: true
144
+
145
+ preprocessor:
146
+ _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
147
+ features: ${model.n_mel_channels}
148
+ lowfreq: ${model.lowfreq}
149
+ highfreq: ${model.highfreq}
150
+ n_fft: ${model.n_fft}
151
+ n_window_size: ${model.n_window_size}
152
+ window_size: false
153
+ n_window_stride: ${model.n_window_stride}
154
+ window_stride: false
155
+ pad_to: 1
156
+ pad_value: 0
157
+ sample_rate: ${model.sample_rate}
158
+ window: ${model.window}
159
+ normalize: null
160
+ preemph: null
161
+ dither: 0.0
162
+ frame_splicing: 1
163
+ log: true
164
+ log_zero_guard_type: add
165
+ log_zero_guard_value: 1e-05
166
+ mag_power: 1.0
167
+
168
+ input_fft: #n_embed and padding_idx are added by the model
169
+ _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
170
+ n_layer: 6
171
+ n_head: 1
172
+ d_model: ${model.symbols_embedding_dim}
173
+ d_head: 64
174
+ d_inner: 1536
175
+ kernel_size: 3
176
+ dropout: 0.1
177
+ dropatt: 0.1
178
+ dropemb: 0.0
179
+ d_embed: ${model.symbols_embedding_dim}
180
+
181
+ output_fft:
182
+ _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
183
+ n_layer: 6
184
+ n_head: 1
185
+ d_model: ${model.symbols_embedding_dim}
186
+ d_head: 64
187
+ d_inner: 1536
188
+ kernel_size: 3
189
+ dropout: 0.1
190
+ dropatt: 0.1
191
+ dropemb: 0.0
192
+
193
+ alignment_module:
194
+ _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
195
+ n_text_channels: ${model.symbols_embedding_dim}
196
+
197
+ duration_predictor:
198
+ _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
199
+ input_size: ${model.symbols_embedding_dim}
200
+ kernel_size: 3
201
+ filter_size: 256
202
+ dropout: 0.1
203
+ n_layers: 2
204
+
205
+ pitch_predictor:
206
+ _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
207
+ input_size: ${model.symbols_embedding_dim}
208
+ kernel_size: 3
209
+ filter_size: 256
210
+ dropout: 0.1
211
+ n_layers: 2
212
+
213
+ optim:
214
+ name: adamw
215
+ lr: 1e-3
216
+ betas: [0.9, 0.98]
217
+ weight_decay: 1e-3
218
+
219
+ sched:
220
+ name: CosineAnnealing
221
+ warmup_steps: 2000
222
+ last_epoch: -1
223
+ min_lr: 1e-6
224
+
225
+ trainer:
226
+ num_nodes: 1
227
+ devices: -1 # specify all GPUs regardless of its availability
228
+ accelerator: gpu
229
+ strategy: ddp
230
+ precision: 16
231
+ max_epochs: 1500
232
+ accumulate_grad_batches: 1
233
+ gradient_clip_val: 1000.0
234
+ enable_checkpointing: false # Provided by exp_manager
235
+ logger: false # Provided by exp_manager
236
+ log_every_n_steps: 100
237
+ check_val_every_n_epoch: 5
238
+ benchmark: false
239
+
240
+ exp_manager:
241
+ exp_dir: null
242
+ name: ${name}
243
+ create_tensorboard_logger: true
244
+ create_checkpoint_callback: true
245
+ checkpoint_callback_params:
246
+ monitor: val_loss
247
+ resume_if_exists: false
248
+ resume_ignore_no_checkpoint: false
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/.ipynb_checkpoints/hifigan-checkpoint.yaml ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This config contains the default values for training HiFi-GAN model on LJSpeech dataset.
2
+ # If you want to train model on other dataset, you can change config values according to your dataset.
3
+ # Most dataset-specific arguments are in the head of the config file, see below.
4
+
5
+ name: "HifiGan"
6
+
7
+ train_dataset: ???
8
+ validation_datasets: ???
9
+
10
+ # Default values for dataset with sample_rate=22050
11
+ sample_rate: 22050
12
+ n_mel_channels: 80
13
+ n_window_size: 1024
14
+ n_window_stride: 256
15
+ n_fft: 1024
16
+ lowfreq: 0
17
+ highfreq: 8000
18
+ window: hann
19
+
20
+ train_n_segments: 8192
21
+ train_max_duration: null
22
+ train_min_duration: 0.75
23
+
24
+ val_n_segments: 66048
25
+ val_max_duration: null
26
+ val_min_duration: 0.75
27
+
28
+ defaults:
29
+ - model/generator: v1
30
+ - model/train_ds: train_ds
31
+ - model/validation_ds: val_ds
32
+
33
+ model:
34
+ preprocessor:
35
+ _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
36
+ nfilt: ${n_mel_channels}
37
+ lowfreq: ${lowfreq}
38
+ highfreq: ${highfreq}
39
+ n_fft: ${n_fft}
40
+ n_window_size: ${n_window_size}
41
+ n_window_stride: ${n_window_stride}
42
+ pad_to: 0
43
+ pad_value: -11.52
44
+ sample_rate: ${sample_rate}
45
+ window: ${window}
46
+ normalize: null
47
+ preemph: null
48
+ dither: 0.0
49
+ frame_splicing: 1
50
+ log: true
51
+ log_zero_guard_type: clamp
52
+ log_zero_guard_value: 1e-05
53
+ mag_power: 1.0
54
+ use_grads: false
55
+ exact_pad: true
56
+
57
+ optim:
58
+ _target_: torch.optim.AdamW
59
+ lr: 0.0002
60
+ betas: [0.8, 0.99]
61
+
62
+ sched:
63
+ name: CosineAnnealing
64
+ min_lr: 1e-5
65
+ warmup_ratio: 0.02
66
+
67
+ max_steps: 50000
68
+ l1_loss_factor: 45
69
+ denoise_strength: 0.0025
70
+
71
+ trainer:
72
+ num_nodes: 1
73
+ devices: 1
74
+ accelerator: gpu
75
+ strategy: ddp_find_unused_parameters_true
76
+ precision: 32
77
+ max_steps: ${model.max_steps}
78
+ accumulate_grad_batches: 1
79
+ enable_checkpointing: False # Provided by exp_manager
80
+ logger: false # Provided by exp_manager
81
+ log_every_n_steps: 100
82
+ check_val_every_n_epoch: 10
83
+ benchmark: false
84
+
85
+ exp_manager:
86
+ exp_dir: null
87
+ name: ${name}
88
+ create_tensorboard_logger: true
89
+ create_checkpoint_callback: true
90
+ checkpoint_callback_params:
91
+ monitor: val_loss
92
+ mode: min
93
+ create_wandb_logger: false
94
+ wandb_logger_kwargs:
95
+ name: null
96
+ project: null
97
+ entity: null
98
+ resume_if_exists: false
99
+ resume_ignore_no_checkpoint: false
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/hifigan.yaml ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This config contains the default values for training HiFi-GAN model on LJSpeech dataset.
2
+ # If you want to train model on other dataset, you can change config values according to your dataset.
3
+ # Most dataset-specific arguments are in the head of the config file, see below.
4
+
5
+ name: "HifiGan"
6
+
7
+ train_dataset: ???
8
+ validation_datasets: ???
9
+
10
+ # Default values for dataset with sample_rate=22050
11
+ sample_rate: 22050
12
+ n_mel_channels: 80
13
+ n_window_size: 1024
14
+ n_window_stride: 256
15
+ n_fft: 1024
16
+ lowfreq: 0
17
+ highfreq: 8000
18
+ window: hann
19
+
20
+ train_n_segments: 8192
21
+ train_max_duration: null
22
+ train_min_duration: 0.75
23
+
24
+ val_n_segments: 66048
25
+ val_max_duration: null
26
+ val_min_duration: 3
27
+
28
+ defaults:
29
+ - model/generator: v1
30
+ - model/train_ds: train_ds
31
+ - model/validation_ds: val_ds
32
+
33
+ model:
34
+ preprocessor:
35
+ _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
36
+ nfilt: ${n_mel_channels}
37
+ lowfreq: ${lowfreq}
38
+ highfreq: ${highfreq}
39
+ n_fft: ${n_fft}
40
+ n_window_size: ${n_window_size}
41
+ n_window_stride: ${n_window_stride}
42
+ pad_to: 0
43
+ pad_value: -11.52
44
+ sample_rate: ${sample_rate}
45
+ window: ${window}
46
+ normalize: null
47
+ preemph: null
48
+ dither: 0.0
49
+ frame_splicing: 1
50
+ log: true
51
+ log_zero_guard_type: clamp
52
+ log_zero_guard_value: 1e-05
53
+ mag_power: 1.0
54
+ use_grads: false
55
+ exact_pad: true
56
+
57
+ optim:
58
+ _target_: torch.optim.AdamW
59
+ lr: 0.0002
60
+ betas: [0.8, 0.99]
61
+
62
+ sched:
63
+ name: CosineAnnealing
64
+ min_lr: 1e-5
65
+ warmup_ratio: 0.02
66
+
67
+ max_steps: 2500000
68
+ l1_loss_factor: 45
69
+ denoise_strength: 0.0025
70
+
71
+ trainer:
72
+ num_nodes: 1
73
+ devices: 1
74
+ accelerator: gpu
75
+ strategy: ddp_find_unused_parameters_true
76
+ precision: 32
77
+ max_steps: ${model.max_steps}
78
+ accumulate_grad_batches: 1
79
+ enable_checkpointing: False # Provided by exp_manager
80
+ logger: false # Provided by exp_manager
81
+ log_every_n_steps: 100
82
+ check_val_every_n_epoch: 10
83
+ benchmark: false
84
+
85
+ exp_manager:
86
+ exp_dir: null
87
+ name: ${name}
88
+ create_tensorboard_logger: true
89
+ create_checkpoint_callback: true
90
+ checkpoint_callback_params:
91
+ monitor: val_loss
92
+ mode: min
93
+ create_wandb_logger: false
94
+ wandb_logger_kwargs:
95
+ name: null
96
+ project: null
97
+ entity: null
98
+ resume_if_exists: false
99
+ resume_ignore_no_checkpoint: false
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/.ipynb_checkpoints/v1-checkpoint.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ _target_: nemo.collections.tts.modules.hifigan_modules.Generator
2
+ resblock: 1
3
+ upsample_rates: [8,8,2,2]
4
+ upsample_kernel_sizes: [16,16,4,4]
5
+ upsample_initial_channel: 512
6
+ resblock_kernel_sizes: [3,7,11]
7
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/v1.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ _target_: nemo.collections.tts.modules.hifigan_modules.Generator
2
+ resblock: 1
3
+ upsample_rates: [8,8,2,2]
4
+ upsample_kernel_sizes: [16,16,4,4]
5
+ upsample_initial_channel: 512
6
+ resblock_kernel_sizes: [3,7,11]
7
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/.ipynb_checkpoints/train_ds_finetune-checkpoint.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ _target_: "nemo.collections.tts.data.dataset.VocoderDataset"
3
+ manifest_filepath: ${train_dataset}
4
+ sample_rate: ${sample_rate}
5
+ n_segments: ${train_n_segments}
6
+ max_duration: ${train_max_duration}
7
+ min_duration: ${train_min_duration}
8
+ load_precomputed_mel: true
9
+ hop_length: ${n_window_stride}
10
+ dataloader_params:
11
+ drop_last: false
12
+ shuffle: true
13
+ batch_size: 32
14
+ num_workers: 4
15
+ pin_memory: true
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/train_ds_finetune.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ _target_: "nemo.collections.tts.data.dataset.VocoderDataset"
3
+ manifest_filepath: ${train_dataset}
4
+ sample_rate: ${sample_rate}
5
+ n_segments: ${train_n_segments}
6
+ max_duration: ${train_max_duration}
7
+ min_duration: ${train_min_duration}
8
+ load_precomputed_mel: true
9
+ hop_length: ${n_window_stride}
10
+ dataloader_params:
11
+ drop_last: false
12
+ shuffle: true
13
+ batch_size: 32
14
+ num_workers: 4
15
+ pin_memory: true
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds_finetune.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ _target_: "nemo.collections.tts.data.dataset.VocoderDataset"
3
+ manifest_filepath: ${train_dataset}
4
+ sample_rate: ${sample_rate}
5
+ n_segments: ${train_n_segments}
6
+ max_duration: ${train_max_duration}
7
+ min_duration: ${train_min_duration}
8
+ load_precomputed_mel: true
9
+ hop_length: ${n_window_stride}
10
+ dataloader_params:
11
+ drop_last: false
12
+ shuffle: true
13
+ batch_size: 16
14
+ num_workers: 4
15
+ pin_memory: true
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/v1.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ _target_: nemo.collections.tts.modules.hifigan_modules.Generator
2
+ resblock: 1
3
+ upsample_rates: [8,8,2,2]
4
+ upsample_kernel_sizes: [16,16,4,4]
5
+ upsample_initial_channel: 512
6
+ resblock_kernel_sizes: [3,7,11]
7
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/val_ds_finetune.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ _target_: "nemo.collections.tts.data.dataset.VocoderDataset"
3
+ manifest_filepath: ${validation_datasets}
4
+ sample_rate: ${sample_rate}
5
+ n_segments: ${val_n_segments}
6
+ max_duration: ${val_max_duration}
7
+ min_duration: ${val_min_duration}
8
+ load_precomputed_mel: true
9
+ hop_length: ${n_window_stride}
10
+ dataloader_params:
11
+ drop_last: false
12
+ shuffle: false
13
+ batch_size: 16
14
+ num_workers: 4
15
+ pin_memory: true
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/.ipynb_checkpoints/val_ds_finetune-checkpoint.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ _target_: "nemo.collections.tts.data.dataset.VocoderDataset"
3
+ manifest_filepath: ${validation_datasets}
4
+ sample_rate: ${sample_rate}
5
+ n_segments: ${val_n_segments}
6
+ max_duration: ${val_max_duration}
7
+ min_duration: ${val_min_duration}
8
+ load_precomputed_mel: true
9
+ hop_length: ${n_window_stride}
10
+ dataloader_params:
11
+ drop_last: false
12
+ shuffle: false
13
+ batch_size: 16
14
+ num_workers: 4
15
+ pin_memory: true
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/val_ds_finetune.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ _target_: "nemo.collections.tts.data.dataset.VocoderDataset"
3
+ manifest_filepath: ${validation_datasets}
4
+ sample_rate: ${sample_rate}
5
+ n_segments: ${val_n_segments}
6
+ max_duration: ${val_max_duration}
7
+ min_duration: ${val_min_duration}
8
+ load_precomputed_mel: true
9
+ hop_length: ${n_window_stride}
10
+ dataloader_params:
11
+ drop_last: false
12
+ shuffle: false
13
+ batch_size: 16
14
+ num_workers: 4
15
+ pin_memory: true
tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eea3093c753874952bab5719b9d82c664b0c1c7bc4116a3034d657659269e3bb
3
+ size 549427880
tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbff27139ad11c3e742378596421b00c15f5dce664255119b4e1652a4e73d64c
3
+ size 184258560
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8199747904ba8c35f64f1d3b1a1a4f62c303fd2f0238c2148f2750833563aa8a
3
+ size 339210240
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0688-epoch-12-last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24e7ddabf6058bef99b570a56131368bdbe39b2c9b095c5be8b1d6d7c8c5adcf
3
+ size 1016835427
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0e087e923573c910b0a6c86e02c355bf3a85c82017cdcce82f99ff47a2a8577
3
+ size 339210240
tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/fastpitch.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0108a874da58f4cd5c99c7819b89cff32cc841a6d23f3f4fae4e901aeb315000
3
+ size 179344671
tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/hifigan.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f5898c7b64e7c64c8421210f17f498c7b13e4d121d6f33f09d226cb6956f970
3
+ size 55760326
tts_fa_fastpitch_hifigan-v2.0/persian-dict/persian-v5.0.dict ADDED
The diff for this file is too large to render. See raw diff
 
tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/generate_mels-checkpoint.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ This script is to generate mel spectrograms from a Fastpitch model checkpoint. Please see general usage below. It runs
17
+ on GPUs by default, but you can add `--num-workers 5 --cpu` as an option to run on CPUs.
18
+
19
+ $ python scripts/dataset_processing/tts/generate_mels.py \
20
+ --fastpitch-model-ckpt ./models/fastpitch/multi_spk/FastPitch--val_loss\=1.4473-epoch\=209.ckpt \
21
+ --input-json-manifests /home/xueyang/HUI-Audio-Corpus-German-clean/test_manifest_text_normed_phonemes.json
22
+ --output-json-manifest-root /home/xueyang/experiments/multi_spk_tts_de
23
+ """
24
+
25
+ import argparse
26
+ import json
27
+ from pathlib import Path
28
+
29
+ import numpy as np
30
+ import soundfile as sf
31
+ import torch
32
+ from joblib import Parallel, delayed
33
+ from tqdm import tqdm
34
+
35
+ from nemo.collections.tts.models import FastPitchModel
36
+ from nemo.collections.tts.parts.utils.tts_dataset_utils import (
37
+ BetaBinomialInterpolator,
38
+ beta_binomial_prior_distribution,
39
+ )
40
+ from nemo.utils import logging
41
+
42
+
43
+ def get_args():
44
+ parser = argparse.ArgumentParser(
45
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
46
+ description="Generate mel spectrograms with pretrained FastPitch model, and create manifests for finetuning Hifigan.",
47
+ )
48
+ parser.add_argument(
49
+ "--fastpitch-model-ckpt",
50
+ required=True,
51
+ type=Path,
52
+ help="Specify a full path of a fastpitch model checkpoint with the suffix of either .ckpt or .nemo.",
53
+ )
54
+ parser.add_argument(
55
+ "--input-json-manifests",
56
+ nargs="+",
57
+ required=True,
58
+ type=Path,
59
+ help="Specify a full path of a JSON manifest. You could add multiple manifests.",
60
+ )
61
+ parser.add_argument(
62
+ "--output-json-manifest-root",
63
+ required=True,
64
+ type=Path,
65
+ help="Specify a full path of output root that would contain new manifests.",
66
+ )
67
+ parser.add_argument(
68
+ "--num-workers",
69
+ default=-1,
70
+ type=int,
71
+ help="Specify the max number of concurrently Python workers processes. "
72
+ "If -1 all CPUs are used. If 1 no parallel computing is used.",
73
+ )
74
+ parser.add_argument("--cpu", action='store_true', default=False, help="Generate mel spectrograms using CPUs.")
75
+ args = parser.parse_args()
76
+ return args
77
+
78
+
79
+ def __load_wav(audio_file):
80
+ with sf.SoundFile(audio_file, 'r') as f:
81
+ samples = f.read(dtype='float32')
82
+ return samples.transpose()
83
+
84
+
85
+ def __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root):
86
+ # Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
87
+ audio = __load_wav(entry["audio_filepath"])
88
+ audio = torch.from_numpy(audio).unsqueeze(0).to(device)
89
+ audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)
90
+
91
+ if spec_model.fastpitch.speaker_emb is not None and "speaker" in entry:
92
+ speaker = torch.tensor([entry['speaker']]).to(device)
93
+ else:
94
+ speaker = None
95
+
96
+ with torch.no_grad():
97
+ if "normalized_text" in entry:
98
+ text = spec_model.parse(entry["normalized_text"], normalize=False)
99
+ else:
100
+ text = spec_model.parse(entry['text'])
101
+
102
+ text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
103
+ spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)
104
+ mel_len = spect.shape[-1]
105
+
106
+ # Generate attention prior and spectrogram inputs for HiFi-GAN
107
+ if use_beta_binomial_interpolator:
108
+ beta_binomial_interpolator = BetaBinomialInterpolator()
109
+ attn_prior = (
110
+ torch.from_numpy(beta_binomial_interpolator(mel_len, text_len.item()))
111
+ .unsqueeze(0)
112
+ .to(text.device)
113
+ )
114
+ else:
115
+ attn_prior = (
116
+ torch.from_numpy(beta_binomial_prior_distribution(text_len.item(), mel_len))
117
+ .unsqueeze(0)
118
+ .to(text.device)
119
+ )
120
+
121
+ spectrogram = spec_model.forward(
122
+ text=text, input_lens=text_len, spec=spect, mel_lens=spect_len, attn_prior=attn_prior, speaker=speaker,
123
+ )[0]
124
+
125
+ save_path = mel_root / f"{Path(entry['audio_filepath']).stem}.npy"
126
+ np.save(save_path, spectrogram[0].to('cpu').numpy())
127
+ entry["mel_filepath"] = str(save_path)
128
+
129
+ return entry
130
+
131
+
132
+ def main():
133
+ args = get_args()
134
+ ckpt_path = args.fastpitch_model_ckpt
135
+ input_manifest_filepaths = args.input_json_manifests
136
+ output_json_manifest_root = args.output_json_manifest_root
137
+
138
+ mel_root = output_json_manifest_root / "mels"
139
+ mel_root.mkdir(exist_ok=True, parents=True)
140
+
141
+ # load pretrained FastPitch model checkpoint
142
+ suffix = ckpt_path.suffix
143
+ if suffix == ".nemo":
144
+ spec_model = FastPitchModel.restore_from(ckpt_path).eval()
145
+ elif suffix == ".ckpt":
146
+ spec_model = FastPitchModel.load_from_checkpoint(ckpt_path).eval()
147
+ else:
148
+ raise ValueError(f"Unsupported suffix: {suffix}")
149
+ if not args.cpu:
150
+ spec_model.cuda()
151
+ device = spec_model.device
152
+
153
+ use_beta_binomial_interpolator = spec_model.cfg.train_ds.dataset.get("use_beta_binomial_interpolator", False)
154
+
155
+ for manifest in input_manifest_filepaths:
156
+ logging.info(f"Processing {manifest}.")
157
+ entries = []
158
+ with open(manifest, "r") as fjson:
159
+ for line in fjson:
160
+ entries.append(json.loads(line.strip()))
161
+
162
+ if device == "cpu":
163
+ new_entries = Parallel(n_jobs=args.num_workers)(
164
+ delayed(__generate_mels)(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
165
+ for entry in entries
166
+ )
167
+ else:
168
+ new_entries = []
169
+ for entry in tqdm(entries):
170
+ new_entry = __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
171
+ new_entries.append(new_entry)
172
+
173
+ mel_manifest_path = output_json_manifest_root / f"{manifest.stem}_mel{manifest.suffix}"
174
+ with open(mel_manifest_path, "w") as fmel:
175
+ for entry in new_entries:
176
+ fmel.write(json.dumps(entry) + "\n")
177
+ logging.info(f"Processing {manifest} is complete --> {mel_manifest_path}")
178
+
179
+
180
+ if __name__ == "__main__":
181
+ main()
tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/hifigan_finetune-checkpoint.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import lightning.pytorch as pl
16
+
17
+ from nemo.collections.tts.models import HifiGanModel
18
+ from nemo.core.config import hydra_runner
19
+ from nemo.utils.exp_manager import exp_manager
20
+
21
+
22
+ @hydra_runner(config_path="conf/hifigan", config_name="hifigan_44100")
23
+ def main(cfg):
24
+ trainer = pl.Trainer(**cfg.trainer)
25
+ exp_manager(trainer, cfg.get("exp_manager", None))
26
+ model = HifiGanModel(cfg=cfg.model, trainer=trainer)
27
+ model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
28
+ trainer.fit(model)
29
+
30
+
31
+ if __name__ == '__main__':
32
+ main() # noqa pylint: disable=no-value-for-parameter
tts_fa_fastpitch_hifigan-v2.0/scripts/extract_sup_data.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+ from hydra.utils import instantiate
18
+ from tqdm import tqdm
19
+
20
+ from nemo.core.config import hydra_runner
21
+
22
+
23
+ def get_pitch_stats(pitch_list):
24
+ pitch_tensor = torch.cat(pitch_list)
25
+ pitch_mean, pitch_std = pitch_tensor.mean().item(), pitch_tensor.std().item()
26
+ pitch_min, pitch_max = pitch_tensor.min().item(), pitch_tensor.max().item()
27
+ print(f"PITCH_MEAN={pitch_mean}, PITCH_STD={pitch_std}")
28
+ print(f"PITCH_MIN={pitch_min}, PITCH_MAX={pitch_max}")
29
+
30
+
31
+ def preprocess_ds_for_fastpitch_align(dataloader):
32
+ pitch_list = []
33
+ for batch in tqdm(dataloader, total=len(dataloader)):
34
+ audios, audio_lengths, tokens, tokens_lengths, align_prior_matrices, pitches, pitches_lengths, *_ = batch
35
+ pitch = pitches.squeeze(0)
36
+ pitch_list.append(pitch[pitch != 0])
37
+
38
+ get_pitch_stats(pitch_list)
39
+
40
+
41
+ def preprocess_ds_for_mixer_tts_x(dataloader):
42
+ pitch_list = []
43
+ for batch in tqdm(dataloader, total=len(dataloader)):
44
+ (
45
+ audios,
46
+ audio_lengths,
47
+ tokens,
48
+ tokens_lengths,
49
+ align_prior_matrices,
50
+ pitches,
51
+ pitches_lengths,
52
+ lm_tokens,
53
+ ) = batch
54
+
55
+ pitch = pitches.squeeze(0)
56
+ pitch_list.append(pitch[pitch != 0])
57
+
58
+ get_pitch_stats(pitch_list)
59
+
60
+
61
+ CFG_NAME2FUNC = {
62
+ "ds_for_fastpitch_align": preprocess_ds_for_fastpitch_align,
63
+ "ds_for_mixer_tts": preprocess_ds_for_fastpitch_align,
64
+ "ds_for_mixer_tts_x": preprocess_ds_for_mixer_tts_x,
65
+ }
66
+
67
+
68
+ @hydra_runner(config_path='ljspeech/ds_conf', config_name='ds_for_fastpitch_align')
69
+ def main(cfg):
70
+ dataset = instantiate(cfg.dataset)
71
+ dataloader = torch.utils.data.DataLoader(
72
+ dataset=dataset,
73
+ batch_size=1,
74
+ collate_fn=dataset._collate_fn,
75
+ num_workers=cfg.get("dataloader_params", {}).get("num_workers", 4),
76
+ )
77
+
78
+ print(f"Processing {cfg.manifest_filepath}:")
79
+ CFG_NAME2FUNC[cfg.name](dataloader)
80
+
81
+
82
+ if __name__ == '__main__':
83
+ main() # noqa pylint: disable=no-value-for-parameter
tts_fa_fastpitch_hifigan-v2.0/scripts/fastpitch.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import lightning.pytorch as pl
16
+
17
+ from nemo.collections.common.callbacks import LogEpochTimeCallback
18
+ from nemo.collections.tts.models import FastPitchModel
19
+ from nemo.core.config import hydra_runner
20
+ from nemo.utils.exp_manager import exp_manager
21
+
22
+
23
+ @hydra_runner(config_path="conf", config_name="fastpitch_align_v1.05")
24
+ def main(cfg):
25
+ trainer = pl.Trainer(**cfg.trainer)
26
+ exp_manager(trainer, cfg.get("exp_manager", None))
27
+ model = FastPitchModel(cfg=cfg.model, trainer=trainer)
28
+ lr_logger = pl.callbacks.LearningRateMonitor()
29
+ epoch_time_logger = LogEpochTimeCallback()
30
+ trainer.callbacks.extend([lr_logger, epoch_time_logger])
31
+ trainer.fit(model)
32
+
33
+
34
+ if __name__ == '__main__':
35
+ main() # noqa pylint: disable=no-value-for-parameter
tts_fa_fastpitch_hifigan-v2.0/scripts/generate_mels.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ This script is to generate mel spectrograms from a Fastpitch model checkpoint. Please see general usage below. It runs
17
+ on GPUs by default, but you can add `--num-workers 5 --cpu` as an option to run on CPUs.
18
+
19
+ $ python scripts/dataset_processing/tts/generate_mels.py \
20
+ --fastpitch-model-ckpt ./models/fastpitch/multi_spk/FastPitch--val_loss\=1.4473-epoch\=209.ckpt \
21
+ --input-json-manifests /home/xueyang/HUI-Audio-Corpus-German-clean/test_manifest_text_normed_phonemes.json
22
+ --output-json-manifest-root /home/xueyang/experiments/multi_spk_tts_de
23
+ """
24
+
25
+ import argparse
26
+ import json
27
+ from pathlib import Path
28
+
29
+ import numpy as np
30
+ import soundfile as sf
31
+ import torch
32
+ from joblib import Parallel, delayed
33
+ from tqdm import tqdm
34
+
35
+ from nemo.collections.tts.models import FastPitchModel
36
+ from nemo.collections.tts.parts.utils.tts_dataset_utils import (
37
+ BetaBinomialInterpolator,
38
+ beta_binomial_prior_distribution,
39
+ )
40
+ from nemo.utils import logging
41
+
42
+
43
+ def get_args():
44
+ parser = argparse.ArgumentParser(
45
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
46
+ description="Generate mel spectrograms with pretrained FastPitch model, and create manifests for finetuning Hifigan.",
47
+ )
48
+ parser.add_argument(
49
+ "--fastpitch-model-ckpt",
50
+ required=True,
51
+ type=Path,
52
+ help="Specify a full path of a fastpitch model checkpoint with the suffix of either .ckpt or .nemo.",
53
+ )
54
+ parser.add_argument(
55
+ "--input-json-manifests",
56
+ nargs="+",
57
+ required=True,
58
+ type=Path,
59
+ help="Specify a full path of a JSON manifest. You could add multiple manifests.",
60
+ )
61
+ parser.add_argument(
62
+ "--output-json-manifest-root",
63
+ required=True,
64
+ type=Path,
65
+ help="Specify a full path of output root that would contain new manifests.",
66
+ )
67
+ parser.add_argument(
68
+ "--num-workers",
69
+ default=-1,
70
+ type=int,
71
+ help="Specify the max number of concurrently Python workers processes. "
72
+ "If -1 all CPUs are used. If 1 no parallel computing is used.",
73
+ )
74
+ parser.add_argument("--cpu", action='store_true', default=False, help="Generate mel spectrograms using CPUs.")
75
+ args = parser.parse_args()
76
+ return args
77
+
78
+
79
+ def __load_wav(audio_file):
80
+ with sf.SoundFile(audio_file, 'r') as f:
81
+ samples = f.read(dtype='float32')
82
+ return samples.transpose()
83
+
84
+
85
+ def __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root):
86
+ # Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
87
+ audio = __load_wav(entry["audio_filepath"])
88
+ audio = torch.from_numpy(audio).unsqueeze(0).to(device)
89
+ audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)
90
+
91
+ if spec_model.fastpitch.speaker_emb is not None and "speaker" in entry:
92
+ speaker = torch.tensor([entry['speaker']]).to(device)
93
+ else:
94
+ speaker = None
95
+
96
+ with torch.no_grad():
97
+ if "normalized_text" in entry:
98
+ text = spec_model.parse(entry["normalized_text"], normalize=False)
99
+ else:
100
+ text = spec_model.parse(entry['text'])
101
+
102
+ text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
103
+ spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)
104
+ mel_len = spect.shape[-1]
105
+
106
+ # Generate attention prior and spectrogram inputs for HiFi-GAN
107
+ if use_beta_binomial_interpolator:
108
+ beta_binomial_interpolator = BetaBinomialInterpolator()
109
+ attn_prior = (
110
+ torch.from_numpy(beta_binomial_interpolator(mel_len, text_len.item()))
111
+ .unsqueeze(0)
112
+ .to(text.device)
113
+ )
114
+ else:
115
+ attn_prior = (
116
+ torch.from_numpy(beta_binomial_prior_distribution(text_len.item(), mel_len))
117
+ .unsqueeze(0)
118
+ .to(text.device)
119
+ )
120
+
121
+ spectrogram = spec_model.forward(
122
+ text=text, input_lens=text_len, spec=spect, mel_lens=spect_len, attn_prior=attn_prior, speaker=speaker,
123
+ )[0]
124
+
125
+ save_path = mel_root / f"{Path(entry['audio_filepath']).stem}.npy"
126
+ np.save(save_path, spectrogram[0].to('cpu').numpy())
127
+ entry["mel_filepath"] = str(save_path)
128
+
129
+ return entry
130
+
131
+
132
+ def main():
133
+ args = get_args()
134
+ ckpt_path = args.fastpitch_model_ckpt
135
+ input_manifest_filepaths = args.input_json_manifests
136
+ output_json_manifest_root = args.output_json_manifest_root
137
+
138
+ mel_root = output_json_manifest_root / "mels"
139
+ mel_root.mkdir(exist_ok=True, parents=True)
140
+
141
+ # load pretrained FastPitch model checkpoint
142
+ suffix = ckpt_path.suffix
143
+ if suffix == ".nemo":
144
+ spec_model = FastPitchModel.restore_from(ckpt_path).eval()
145
+ elif suffix == ".ckpt":
146
+ spec_model = FastPitchModel.load_from_checkpoint(ckpt_path).eval()
147
+ else:
148
+ raise ValueError(f"Unsupported suffix: {suffix}")
149
+ if not args.cpu:
150
+ spec_model.cuda()
151
+ device = spec_model.device
152
+
153
+ use_beta_binomial_interpolator = spec_model.cfg.train_ds.dataset.get("use_beta_binomial_interpolator", False)
154
+
155
+ for manifest in input_manifest_filepaths:
156
+ logging.info(f"Processing {manifest}.")
157
+ entries = []
158
+ with open(manifest, "r") as fjson:
159
+ for line in fjson:
160
+ entries.append(json.loads(line.strip()))
161
+
162
+ if device == "cpu":
163
+ new_entries = Parallel(n_jobs=args.num_workers)(
164
+ delayed(__generate_mels)(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
165
+ for entry in entries
166
+ )
167
+ else:
168
+ new_entries = []
169
+ for entry in tqdm(entries):
170
+ new_entry = __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
171
+ new_entries.append(new_entry)
172
+
173
+ mel_manifest_path = output_json_manifest_root / f"{manifest.stem}_mel{manifest.suffix}"
174
+ with open(mel_manifest_path, "w") as fmel:
175
+ for entry in new_entries:
176
+ fmel.write(json.dumps(entry) + "\n")
177
+ logging.info(f"Processing {manifest} is complete --> {mel_manifest_path}")
178
+
179
+
180
+ if __name__ == "__main__":
181
+ main()
tts_fa_fastpitch_hifigan-v2.0/scripts/hifigan_finetune.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import lightning.pytorch as pl
16
+
17
+ from nemo.collections.tts.models import HifiGanModel
18
+ from nemo.core.config import hydra_runner
19
+ from nemo.utils.exp_manager import exp_manager
20
+
21
+
22
+ @hydra_runner(config_path="conf/hifigan", config_name="hifigan_44100")
23
+ def main(cfg):
24
+ trainer = pl.Trainer(**cfg.trainer)
25
+ exp_manager(trainer, cfg.get("exp_manager", None))
26
+ model = HifiGanModel(cfg=cfg.model, trainer=trainer)
27
+ model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
28
+ trainer.fit(model)
29
+
30
+
31
+ if __name__ == '__main__':
32
+ main() # noqa pylint: disable=no-value-for-parameter
tts_fa_fastpitch_hifigan-v2.0/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/SadeghK/tts_fa_fastpitch_hifigan-v2.0
tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-from-scratch-finetuning-hifigan.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-inference-only.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
tts_fa_fastpitch_hifigan-v2.0/tts_nemo_fastpitch_hifigan_convert_to_onnx.ipynb ADDED
@@ -0,0 +1,884 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {
21
+ "id": "4kodC7VXOd56"
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "# !python -m pip install --upgrade pip\n",
26
+ "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n",
27
+ "!pip install Cython packaging\n",
28
+ "!rm -rf /usr/lib/python3.10/site-packages/blinker*\n",
29
+ "!rm -rf /usr/local/lib/python3.10/dist-packages/blinker*\n",
30
+ "!pip install --ignore-installed blinker\n",
31
+ "!pip install --upgrade --force-reinstall blinker\n",
32
+ "# !pip install dask-cuda==24.8.2\n",
33
+ "\n",
34
+ "!mkdir -p /workspace/tts-nemo/\n",
35
+ "%cd /workspace/tts-nemo/\n",
36
+ "!git clone https://github.com/SadeghKrmi/NeMo.git\n",
37
+ "\n",
38
+ "# to install and enable editing without re-installation\n",
39
+ "!cd NeMo && pip install -e '.[all]'\n",
40
+ "\n",
41
+ "# install without editing possibility\n",
42
+ "# !cd NeMo && pip install '.[all]'"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "source": [
48
+ "from google.colab import drive\n",
49
+ "drive.mount('/content/drive')"
50
+ ],
51
+ "metadata": {
52
+ "colab": {
53
+ "base_uri": "https://localhost:8080/"
54
+ },
55
+ "id": "LKzWYURw4S5i",
56
+ "outputId": "d0dbbac6-1391-4116-de27-19b0fd39805b"
57
+ },
58
+ "execution_count": 1,
59
+ "outputs": [
60
+ {
61
+ "output_type": "stream",
62
+ "name": "stdout",
63
+ "text": [
64
+ "Mounted at /content/drive\n"
65
+ ]
66
+ }
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "source": [
72
+ "!ls -l /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo\n",
73
+ "!ls -l /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo\n",
74
+ "\n"
75
+ ],
76
+ "metadata": {
77
+ "colab": {
78
+ "base_uri": "https://localhost:8080/"
79
+ },
80
+ "id": "lN8KV1CanbX1",
81
+ "outputId": "6c1a9459-bc49-43c1-a220-64e1c2e175aa"
82
+ },
83
+ "execution_count": 4,
84
+ "outputs": [
85
+ {
86
+ "output_type": "stream",
87
+ "name": "stdout",
88
+ "text": [
89
+ "-rw------- 1 root root 184258560 Aug 13 08:13 /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo\n",
90
+ "-rw------- 1 root root 339210240 Aug 15 12:11 /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo\n"
91
+ ]
92
+ }
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "source": [
98
+ "!ls /content/drive/MyDrive/cNotebooks/perTTS/tts-nemo-fastpitch-hifigan-convert-to-onnx/"
99
+ ],
100
+ "metadata": {
101
+ "colab": {
102
+ "base_uri": "https://localhost:8080/"
103
+ },
104
+ "id": "6hYdqdhxQscq",
105
+ "outputId": "4813f43b-5712-44c4-b9d6-51c12f4f729d"
106
+ },
107
+ "execution_count": null,
108
+ "outputs": [
109
+ {
110
+ "output_type": "stream",
111
+ "name": "stdout",
112
+ "text": [
113
+ "FastPitch--val_loss-0.7796-epoch-800-last.nemo\n",
114
+ "HifiGan--val_loss-0.6090-epoch-39-last.nemo\n",
115
+ "persian-dict\n",
116
+ "tts_nemo_fastpitch_hifigan_convert_to_onnx.ipynb\n"
117
+ ]
118
+ }
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "source": [
124
+ "!pip install num2fawords -q"
125
+ ],
126
+ "metadata": {
127
+ "id": "KNqfXdJ1poZ2"
128
+ },
129
+ "execution_count": 6,
130
+ "outputs": []
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "source": [
135
+ "from nemo.collections.tts.g2p.models.fa_ir_persian.g2p import PersianG2p\n",
136
+ "from nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer import PersianPhonemesTokenizer\n",
137
+ "\n",
138
+ "# test Persian Grapheme-to-phoneme module\n",
139
+ "g2p = PersianG2p(\n",
140
+ " phoneme_dict=\"./persian-v6.0.dict\",\n",
141
+ ")\n",
142
+ "\n",
143
+ "# Text tokenizer\n",
144
+ "# text_tokenizer = PersianPhonemesTokenizer(punct=True,chars=True,pad_with_space=True,g2p=g2p)\n",
145
+ "\n",
146
+ "text_tokenizer = PersianPhonemesTokenizer(\n",
147
+ " g2p=g2p,\n",
148
+ " use_emotion_tokens=True,\n",
149
+ " use_pause_tokens=True,\n",
150
+ " use_speed_tokens=True\n",
151
+ ")\n",
152
+ "\n",
153
+ "text = 'و تاریخ میلادی سال ۶۲۲ را نشان میداد.'\n",
154
+ "ids = text_tokenizer.encode(text)\n",
155
+ "print(ids)"
156
+ ],
157
+ "metadata": {
158
+ "id": "mQxZY4z4OiGx",
159
+ "colab": {
160
+ "base_uri": "https://localhost:8080/"
161
+ },
162
+ "outputId": "0d3abeae-4778-407e-9325-495e26743ab3"
163
+ },
164
+ "execution_count": 7,
165
+ "outputs": [
166
+ {
167
+ "output_type": "stream",
168
+ "name": "stdout",
169
+ "text": [
170
+ "[0, 28, 53, 0, 3, 43, 11, 50, 8, 0, 26, 50, 25, 43, 9, 50, 0, 14, 43, 25, 0, 15, 54, 15, 16, 53, 9, 0, 47, 0, 1, 50, 14, 3, 0, 47, 0, 9, 47, 0, 11, 43, 0, 27, 54, 15, 43, 27, 0, 26, 50, 9, 43, 9, 69, 0]\n"
171
+ ]
172
+ }
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "markdown",
177
+ "source": [
178
+ "### FastPitch Export"
179
+ ],
180
+ "metadata": {
181
+ "id": "oc6P0je3TFe-"
182
+ }
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "source": [
187
+ "import nemo.collections.tts as nemo_tts\n",
188
+ "import torch\n",
189
+ "\n",
190
+ "# Load model\n",
191
+ "fastpitch_model = nemo_tts.models.FastPitchModel.restore_from(\"/content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo\")\n",
192
+ "fastpitch_model.eval()\n",
193
+ "\n",
194
+ "# Get the actual vocabulary size from the model\n",
195
+ "vocab_size = fastpitch_model.fastpitch.encoder.word_emb.num_embeddings\n",
196
+ "print(f\"Model vocabulary size: {vocab_size}\")\n",
197
+ "\n",
198
+ "# Method 1: Try with correct forward signature\n",
199
+ "class FastPitchWrapper1(torch.nn.Module):\n",
200
+ " def __init__(self, model):\n",
201
+ " super().__init__()\n",
202
+ " self.model = model\n",
203
+ "\n",
204
+ " def forward(self, text, input_lens):\n",
205
+ " return self.model.forward(text=text, input_lens=input_lens, pace=1.0)\n",
206
+ "\n",
207
+ "# Method 2: Try with generate_spectrogram\n",
208
+ "class FastPitchWrapper2(torch.nn.Module):\n",
209
+ " def __init__(self, model):\n",
210
+ " super().__init__()\n",
211
+ " self.model = model\n",
212
+ "\n",
213
+ " def forward(self, tokens):\n",
214
+ " return self.model.generate_spectrogram(tokens=tokens, speaker=None, pace=1.0)\n",
215
+ "\n",
216
+ "# Generate dummy data with valid token range (excluding padding token if it's 0)\n",
217
+ "padding_idx = getattr(fastpitch_model.fastpitch.encoder, 'padding_idx', 0)\n",
218
+ "valid_token_range = (1, vocab_size - 1) if padding_idx == 0 else (0, vocab_size - 1)\n",
219
+ "\n",
220
+ "dummy_text = torch.randint(valid_token_range[0], valid_token_range[1] + 1, (1, 50), dtype=torch.long)\n",
221
+ "dummy_input_lens = torch.tensor([50], dtype=torch.long)\n",
222
+ "\n",
223
+ "for i, (wrapper_class, args) in enumerate([(FastPitchWrapper1, (dummy_text, dummy_input_lens)),\n",
224
+ " (FastPitchWrapper2, (dummy_text,))], 1):\n",
225
+ " # try:\n",
226
+ " wrapper = wrapper_class(fastpitch_model)\n",
227
+ " with torch.no_grad():\n",
228
+ " output = wrapper(*args)\n",
229
+ "\n",
230
+ " print(f\"Method {i} works! Trying ONNX export...\")\n",
231
+ "\n",
232
+ " # Export to ONNX\n",
233
+ " input_names = ['text', 'input_lens'] if i == 1 else ['tokens']\n",
234
+ " torch.onnx.export(\n",
235
+ " wrapper,\n",
236
+ " args,\n",
237
+ " f\"fastpitch_method{i}.onnx\",\n",
238
+ " export_params=True,\n",
239
+ " opset_version=14,\n",
240
+ " input_names=input_names,\n",
241
+ " output_names=['mel_spec'],\n",
242
+ " dynamic_axes={\n",
243
+ " input_names[0]: {0: 'batch_size', 1: 'text_length'},\n",
244
+ " **(({input_names[1]: {0: 'batch_size'}} if len(input_names) > 1 else {})),\n",
245
+ " 'mel_spec': {0: 'batch_size', 2: 'mel_length'}\n",
246
+ " }\n",
247
+ " )\n",
248
+ " print(f\"Method {i} ONNX export successful!\")\n",
249
+ " break\n",
250
+ "\n",
251
+ " # except Exception as e:\n",
252
+ " # print(f\"Method {i} failed: {e}\")\n",
253
+ " # continue"
254
+ ],
255
+ "metadata": {
256
+ "colab": {
257
+ "base_uri": "https://localhost:8080/"
258
+ },
259
+ "id": "l45qnF6tSn6e",
260
+ "outputId": "c02529da-e241-4e5d-faa1-6fc67e314fae"
261
+ },
262
+ "execution_count": 16,
263
+ "outputs": [
264
+ {
265
+ "output_type": "stream",
266
+ "name": "stderr",
267
+ "text": [
268
+ "[NeMo W 2025-08-15 12:26:10 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
269
+ " Train config : \n",
270
+ " dataset:\n",
271
+ " _target_: nemo.collections.tts.data.dataset.TTSDataset\n",
272
+ " manifest_filepath: ./dataset_splits/train/train.jsonl\n",
273
+ " sample_rate: 22050\n",
274
+ " sup_data_path: sup_data\n",
275
+ " sup_data_types:\n",
276
+ " - align_prior_matrix\n",
277
+ " - pitch\n",
278
+ " n_fft: 1024\n",
279
+ " win_length: 1024\n",
280
+ " hop_length: 256\n",
281
+ " window: hann\n",
282
+ " n_mels: 80\n",
283
+ " lowfreq: 0\n",
284
+ " highfreq: null\n",
285
+ " max_duration: 20\n",
286
+ " min_duration: 0.1\n",
287
+ " ignore_file: null\n",
288
+ " trim: true\n",
289
+ " trim_top_db: 50\n",
290
+ " trim_frame_length: 1024\n",
291
+ " trim_hop_length: 256\n",
292
+ " pitch_fmin: 65.4063949584961\n",
293
+ " pitch_fmax: 2093.004638671875\n",
294
+ " pitch_norm: true\n",
295
+ " pitch_mean: 103.01591491699219\n",
296
+ " pitch_std: 30.397296905517578\n",
297
+ " dataloader_params:\n",
298
+ " drop_last: false\n",
299
+ " shuffle: true\n",
300
+ " batch_size: 64\n",
301
+ " num_workers: 12\n",
302
+ " pin_memory: true\n",
303
+ " \n",
304
+ "[NeMo W 2025-08-15 12:26:10 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
305
+ " Validation config : \n",
306
+ " dataset:\n",
307
+ " _target_: nemo.collections.tts.data.dataset.TTSDataset\n",
308
+ " manifest_filepath: ./dataset_splits/test/test.jsonl\n",
309
+ " sample_rate: 22050\n",
310
+ " sup_data_path: sup_data\n",
311
+ " sup_data_types:\n",
312
+ " - align_prior_matrix\n",
313
+ " - pitch\n",
314
+ " n_fft: 1024\n",
315
+ " win_length: 1024\n",
316
+ " hop_length: 256\n",
317
+ " window: hann\n",
318
+ " n_mels: 80\n",
319
+ " lowfreq: 0\n",
320
+ " highfreq: null\n",
321
+ " max_duration: 20\n",
322
+ " min_duration: 0.1\n",
323
+ " ignore_file: null\n",
324
+ " trim: true\n",
325
+ " trim_top_db: 50\n",
326
+ " trim_frame_length: 1024\n",
327
+ " trim_hop_length: 256\n",
328
+ " pitch_fmin: 65.4063949584961\n",
329
+ " pitch_fmax: 2093.004638671875\n",
330
+ " pitch_norm: true\n",
331
+ " pitch_mean: 103.01591491699219\n",
332
+ " pitch_std: 30.397296905517578\n",
333
+ " dataloader_params:\n",
334
+ " drop_last: false\n",
335
+ " shuffle: false\n",
336
+ " batch_size: 24\n",
337
+ " num_workers: 8\n",
338
+ " pin_memory: true\n",
339
+ " \n"
340
+ ]
341
+ },
342
+ {
343
+ "output_type": "stream",
344
+ "name": "stdout",
345
+ "text": [
346
+ "[NeMo I 2025-08-15 12:26:10 nemo_logging:393] PADDING: 1\n",
347
+ "[NeMo I 2025-08-15 12:26:11 nemo_logging:393] Model FastPitchModel was successfully restored from /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo.\n",
348
+ "Model vocabulary size: 94\n",
349
+ "Method 1 works! Trying ONNX export...\n",
350
+ "Method 1 ONNX export successful!\n"
351
+ ]
352
+ }
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "markdown",
357
+ "source": [
358
+ "### HiFiGAN Export"
359
+ ],
360
+ "metadata": {
361
+ "id": "aKuYMvWBTCSa"
362
+ }
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "source": [
367
+ "# Load HiFiGAN model\n",
368
+ "hifigan_model = nemo_tts.models.HifiGanModel.restore_from(\"/content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo\")\n",
369
+ "hifigan_model.eval()\n",
370
+ "\n",
371
+ "# HiFiGAN might also need wrapper if it has the same issue\n",
372
+ "class HiFiGANWrapper(torch.nn.Module):\n",
373
+ " def __init__(self, model):\n",
374
+ " super().__init__()\n",
375
+ " self.model = model\n",
376
+ "\n",
377
+ " def forward(self, mel_spec):\n",
378
+ " return self.model.forward(spec=mel_spec)\n",
379
+ "\n",
380
+ "wrapped_hifigan = HiFiGANWrapper(hifigan_model)\n",
381
+ "\n",
382
+ "# Export HiFiGAN\n",
383
+ "# dummy_mel = torch.randn(1, 80, 100)\n",
384
+ "dummy_mel = torch.randn(1, 80, 100)\n",
385
+ "torch.onnx.export(\n",
386
+ " wrapped_hifigan,\n",
387
+ " dummy_mel,\n",
388
+ " \"hifigan_fixed.onnx\",\n",
389
+ " export_params=True,\n",
390
+ " opset_version=14,\n",
391
+ " do_constant_folding=True,\n",
392
+ " input_names=['mel_spec'],\n",
393
+ " output_names=['audio'],\n",
394
+ " dynamic_axes={\n",
395
+ " 'mel_spec': {0: 'batch_size', 2: 'mel_length'},\n",
396
+ " 'audio': {0: 'batch_size', 1: 'audio_length'}\n",
397
+ " },\n",
398
+ " # 'optimize_for_mobile' is not a torch.onnx.export argument; removed to avoid TypeError\n",
399
+ " training=torch.onnx.TrainingMode.EVAL\n",
400
+ ")"
401
+ ],
402
+ "metadata": {
403
+ "colab": {
404
+ "base_uri": "https://localhost:8080/"
405
+ },
406
+ "id": "UMVexvWMSqnJ",
407
+ "outputId": "0572f245-08d4-42ab-cfd1-fe46c98037ae"
408
+ },
409
+ "execution_count": 18,
410
+ "outputs": [
411
+ {
412
+ "output_type": "stream",
413
+ "name": "stderr",
414
+ "text": [
415
+ "[NeMo W 2025-08-15 12:26:59 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
416
+ " Train config : \n",
417
+ " dataset:\n",
418
+ " _target_: nemo.collections.tts.data.dataset.VocoderDataset\n",
419
+ " manifest_filepath: ./mels/train_mel.jsonl\n",
420
+ " sample_rate: 22050\n",
421
+ " n_segments: 8192\n",
422
+ " max_duration: null\n",
423
+ " min_duration: 0.75\n",
424
+ " load_precomputed_mel: true\n",
425
+ " hop_length: 256\n",
426
+ " dataloader_params:\n",
427
+ " drop_last: false\n",
428
+ " shuffle: true\n",
429
+ " batch_size: 32\n",
430
+ " num_workers: 4\n",
431
+ " pin_memory: true\n",
432
+ " \n",
433
+ "[NeMo W 2025-08-15 12:26:59 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
434
+ " Validation config : \n",
435
+ " dataset:\n",
436
+ " _target_: nemo.collections.tts.data.dataset.VocoderDataset\n",
437
+ " manifest_filepath: ./mels/test_mel.jsonl\n",
438
+ " sample_rate: 22050\n",
439
+ " n_segments: 1024\n",
440
+ " max_duration: null\n",
441
+ " min_duration: 3\n",
442
+ " load_precomputed_mel: true\n",
443
+ " hop_length: 256\n",
444
+ " dataloader_params:\n",
445
+ " drop_last: false\n",
446
+ " shuffle: false\n",
447
+ " batch_size: 16\n",
448
+ " num_workers: 4\n",
449
+ " pin_memory: true\n",
450
+ " \n"
451
+ ]
452
+ },
453
+ {
454
+ "output_type": "stream",
455
+ "name": "stdout",
456
+ "text": [
457
+ "[NeMo I 2025-08-15 12:26:59 nemo_logging:393] PADDING: 0\n",
458
+ "[NeMo I 2025-08-15 12:26:59 nemo_logging:393] STFT using exact pad\n",
459
+ "[NeMo I 2025-08-15 12:26:59 nemo_logging:393] PADDING: 0\n",
460
+ "[NeMo I 2025-08-15 12:26:59 nemo_logging:393] STFT using exact pad\n",
461
+ "[NeMo I 2025-08-15 12:27:01 nemo_logging:393] Model HifiGanModel was successfully restored from /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo.\n"
462
+ ]
463
+ }
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "markdown",
468
+ "source": [
469
+ "### Run ONNX Models on CPU"
470
+ ],
471
+ "metadata": {
472
+ "id": "bS_mgW0HTZ07"
473
+ }
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "source": [
478
+ "!pip install onnxruntime numpy librosa soundfile -q"
479
+ ],
480
+ "metadata": {
481
+ "id": "XdiQ2-wnTayc",
482
+ "colab": {
483
+ "base_uri": "https://localhost:8080/"
484
+ },
485
+ "outputId": "6ab61a4a-ef9e-4fda-ad5b-b6e447ddc4f3"
486
+ },
487
+ "execution_count": 20,
488
+ "outputs": [
489
+ {
490
+ "output_type": "stream",
491
+ "name": "stdout",
492
+ "text": [
493
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.5/16.5 MB\u001b[0m \u001b[31m42.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
494
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
495
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
496
+ "\u001b[?25h"
497
+ ]
498
+ }
499
+ ]
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "source": [
504
+ "import onnxruntime as ort\n",
505
+ "import numpy as np\n",
506
+ "import torch\n",
507
+ "import librosa\n",
508
+ "import soundfile as sf\n",
509
+ "from typing import List, Optional\n",
510
+ "\n",
511
+ "# Import the same tokenizer used during training\n",
512
+ "from nemo.collections.tts.g2p.models.fa_ir_persian.g2p import PersianG2p\n",
513
+ "from nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer import PersianPhonemesTokenizer\n",
514
+ "\n",
515
+ "class PersianTTSInferencePipeline:\n",
516
+ " def __init__(self, fastpitch_path: str, hifigan_path: str,\n",
517
+ " persian_dict_path: str = \"./persian-v6.0.dict\"):\n",
518
+ " \"\"\"\n",
519
+ " Initialize Persian TTS inference pipeline with ONNX models\n",
520
+ "\n",
521
+ " Args:\n",
522
+ " fastpitch_path: Path to FastPitch ONNX model\n",
523
+ " hifigan_path: Path to HiFiGAN ONNX model\n",
524
+ " persian_dict_path: Path to Persian phoneme dictionary\n",
525
+ " \"\"\"\n",
526
+ " # Load ONNX models\n",
527
+ " self.fastpitch_session = ort.InferenceSession(\n",
528
+ " fastpitch_path,\n",
529
+ " providers=['CPUExecutionProvider']\n",
530
+ " )\n",
531
+ " self.hifigan_session = ort.InferenceSession(\n",
532
+ " hifigan_path,\n",
533
+ " providers=['CPUExecutionProvider']\n",
534
+ " )\n",
535
+ "\n",
536
+ " # Initialize Persian tokenizer (SAME as training)\n",
537
+ " print(\"Initializing Persian tokenizer...\")\n",
538
+ " self.g2p = PersianG2p(phoneme_dict=persian_dict_path)\n",
539
+ " self.text_tokenizer = PersianPhonemesTokenizer(\n",
540
+ " g2p=self.g2p, # use the instance's G2P (was the notebook-global g2p by mistake)\n",
541
+ " use_emotion_tokens=True,\n",
542
+ " use_pause_tokens=True,\n",
543
+ " use_speed_tokens=True\n",
544
+ " )\n",
545
+ "\n",
546
+ " # Get input/output names\n",
547
+ " self.fp_input_names = [inp.name for inp in self.fastpitch_session.get_inputs()]\n",
548
+ " self.fp_output_names = [out.name for out in self.fastpitch_session.get_outputs()]\n",
549
+ " self.hg_input_names = [inp.name for inp in self.hifigan_session.get_inputs()]\n",
550
+ " self.hg_output_names = [out.name for out in self.hifigan_session.get_outputs()]\n",
551
+ "\n",
552
+ " print(f\"FastPitch inputs: {self.fp_input_names}\")\n",
553
+ " print(f\"FastPitch outputs: {self.fp_output_names}\")\n",
554
+ " print(f\"HiFiGAN inputs: {self.hg_input_names}\")\n",
555
+ " print(f\"HiFiGAN outputs: {self.hg_output_names}\")\n",
556
+ "\n",
557
+ " # Test tokenizer\n",
558
+ " test_text = 'مدل تبدیل متن به گفتار پارسی'\n",
559
+ " test_ids = self.text_tokenizer.encode(test_text)\n",
560
+ " print(f\"Test tokenization: '{test_text}' -> {test_ids[:10]}...\")\n",
561
+ "\n",
562
+ " def text_to_tokens(self, text: str) -> tuple:\n",
563
+ " \"\"\"\n",
564
+ " Convert Persian text to phoneme tokens using the same tokenizer as training\n",
565
+ "\n",
566
+ " Args:\n",
567
+ " text: Input Persian text string\n",
568
+ "\n",
569
+ " Returns:\n",
570
+ " tokens: numpy array of phoneme token indices\n",
571
+ " token_lengths: numpy array of sequence length\n",
572
+ " \"\"\"\n",
573
+ " # Use the exact same tokenizer as training\n",
574
+ " token_ids = self.text_tokenizer.encode(text)\n",
575
+ "\n",
576
+ " # Convert to numpy arrays with batch dimension\n",
577
+ " tokens = np.array([token_ids], dtype=np.int64) # Shape: (1, seq_len)\n",
578
+ " token_lengths = np.array([len(token_ids)], dtype=np.int64) # Shape: (1,)\n",
579
+ "\n",
580
+ " print(f\"Text: '{text}'\")\n",
581
+ " print(f\"Tokens length: {len(token_ids)}\")\n",
582
+ " print(f\"First 20 tokens: {token_ids[:20]}\")\n",
583
+ " print(f\"Token range: [{min(token_ids)}, {max(token_ids)}]\")\n",
584
+ "\n",
585
+ " return tokens, token_lengths\n",
586
+ "\n",
587
+ " def generate_mel_spectrogram(self, text: str) -> np.ndarray:\n",
588
+ " \"\"\"\n",
589
+ " Generate mel spectrogram from Persian text using FastPitch\n",
590
+ "\n",
591
+ " Args:\n",
592
+ " text: Input Persian text string\n",
593
+ "\n",
594
+ " Returns:\n",
595
+ " mel_spec: Generated mel spectrogram\n",
596
+ " \"\"\"\n",
597
+ " # Convert text to phoneme tokens\n",
598
+ " tokens, token_lengths = self.text_to_tokens(text)\n",
599
+ "\n",
600
+ " # Prepare inputs based on your model's input names\n",
601
+ " if len(self.fp_input_names) == 1:\n",
602
+ " # If using generate_spectrogram wrapper (Method 2)\n",
603
+ " inputs = {self.fp_input_names[0]: tokens}\n",
604
+ " else:\n",
605
+ " # If using forward wrapper (Method 1)\n",
606
+ " inputs = {\n",
607
+ " self.fp_input_names[0]: tokens, # text or tokens\n",
608
+ " self.fp_input_names[1]: token_lengths # input_lens\n",
609
+ " }\n",
610
+ "\n",
611
+ " print(f\"FastPitch inputs: {list(inputs.keys())}\")\n",
612
+ " for key, val in inputs.items():\n",
613
+ " print(f\" {key}: shape {val.shape}, dtype {val.dtype}\")\n",
614
+ "\n",
615
+ " # Run FastPitch inference\n",
616
+ " mel_outputs = self.fastpitch_session.run(self.fp_output_names, inputs)\n",
617
+ " mel_spec = mel_outputs[0] # First output should be mel spectrogram\n",
618
+ "\n",
619
+ " print(f\"Generated mel spectrogram shape: {mel_spec.shape}\")\n",
620
+ " print(f\"Mel range: [{mel_spec.min():.4f}, {mel_spec.max():.4f}]\")\n",
621
+ "\n",
622
+ " return mel_spec\n",
623
+ "\n",
624
+ " def generate_audio(self, mel_spec: np.ndarray, sample_rate: int = 22050) -> np.ndarray:\n",
625
+ " \"\"\"\n",
626
+ " Generate audio from mel spectrogram using HiFiGAN\n",
627
+ "\n",
628
+ " Args:\n",
629
+ " mel_spec: Input mel spectrogram\n",
630
+ " sample_rate: Audio sample rate\n",
631
+ "\n",
632
+ " Returns:\n",
633
+ " audio: Generated audio waveform\n",
634
+ " \"\"\"\n",
635
+ " # Prepare inputs for HiFiGAN\n",
636
+ " inputs = {self.hg_input_names[0]: mel_spec}\n",
637
+ "\n",
638
+ " print(f\"HiFiGAN input shape: {mel_spec.shape}\")\n",
639
+ "\n",
640
+ " # Run HiFiGAN inference\n",
641
+ " audio_outputs = self.hifigan_session.run(self.hg_output_names, inputs)\n",
642
+ " audio = audio_outputs[0] # First output should be audio\n",
643
+ "\n",
644
+ " # Remove batch dimension and ensure proper shape\n",
645
+ " if audio.ndim > 1:\n",
646
+ " audio = audio.squeeze()\n",
647
+ "\n",
648
+ " print(f\"Generated audio shape: {audio.shape}\")\n",
649
+ " print(f\"Audio range: [{audio.min():.4f}, {audio.max():.4f}]\")\n",
650
+ " print(f\"Audio RMS: {np.sqrt(np.mean(audio**2)):.4f}\")\n",
651
+ "\n",
652
+ " return audio\n",
653
+ "\n",
654
+ " def text_to_speech(self, text: str, output_path: Optional[str] = None,\n",
655
+ " sample_rate: int = 22050) -> np.ndarray:\n",
656
+ " \"\"\"\n",
657
+ " Complete Persian text-to-speech pipeline\n",
658
+ "\n",
659
+ " Args:\n",
660
+ " text: Input Persian text string\n",
661
+ " output_path: Optional path to save audio file\n",
662
+ " sample_rate: Audio sample rate\n",
663
+ "\n",
664
+ " Returns:\n",
665
+ " audio: Generated audio waveform\n",
666
+ " \"\"\"\n",
667
+ " print(f\"🎙️ Generating Persian speech for: '{text}'\")\n",
668
+ " print(\"=\" * 60)\n",
669
+ "\n",
670
+ " # Step 1: Generate mel spectrogram\n",
671
+ " print(\"📊 Generating mel spectrogram...\")\n",
672
+ " mel_spec = self.generate_mel_spectrogram(text)\n",
673
+ "\n",
674
+ " # Step 2: Generate audio from mel spectrogram\n",
675
+ " print(\"🔊 Generating audio...\")\n",
676
+ " audio = self.generate_audio(mel_spec, sample_rate)\n",
677
+ "\n",
678
+ " # Step 3: Save audio if path provided\n",
679
+ " if output_path:\n",
680
+ " sf.write(output_path, audio, sample_rate)\n",
681
+ " print(f\"💾 Audio saved to: {output_path}\")\n",
682
+ "\n",
683
+ " print(\"✅ Persian TTS generation completed!\")\n",
684
+ " return audio\n",
685
+ "\n",
686
+ " def test_tokenizer_consistency(self):\n",
687
+ " \"\"\"Test that tokenizer works consistently\"\"\"\n",
688
+ " test_texts = [\n",
689
+ " 'سلام دنیا',\n",
690
+ " 'مدل تبدیل متن به گفتار پارسی',\n",
691
+ " 'این یک تست است',\n",
692
+ " 'پردازش زبان طبیعی'\n",
693
+ " ]\n",
694
+ "\n",
695
+ " print(\"🧪 Testing tokenizer consistency:\")\n",
696
+ " for text in test_texts:\n",
697
+ " tokens = self.text_tokenizer.encode(text)\n",
698
+ " decoded = self.text_tokenizer.decode(tokens)\n",
699
+ " print(f\" '{text}' -> {len(tokens)} tokens -> '{decoded}'\")\n",
700
+ "\n",
701
+ " def compare_with_training_tokenizer(self, text: str):\n",
702
+ " \"\"\"Compare tokenizer output with training setup\"\"\"\n",
703
+ " print(f\"🔍 Tokenizer comparison for: '{text}'\")\n",
704
+ "\n",
705
+ " # Your training tokenizer\n",
706
+ " tokens = self.text_tokenizer.encode(text)\n",
707
+ "\n",
708
+ " # Print detailed tokenization info\n",
709
+ " print(f\"Phoneme tokens: {tokens}\")\n",
710
+ " print(f\"Token count: {len(tokens)}\")\n",
711
+ " print(f\"Vocabulary size range: [0, {max(tokens)}]\")\n",
712
+ "\n",
713
+ " # Try to decode back\n",
714
+ " try:\n",
715
+ " decoded = self.text_tokenizer.decode(tokens)\n",
716
+ " print(f\"Decoded back: '{decoded}'\")\n",
717
+ " except:\n",
718
+ " print(\"Could not decode tokens back to text\")\n",
719
+ "\n",
720
+ " return tokens\n",
721
+ "\n",
722
+ "# Example usage for Persian TTS\n",
723
+ "def main():\n",
724
+ " # Initialize the Persian TTS pipeline\n",
725
+ " persian_tts = PersianTTSInferencePipeline(\n",
726
+ " fastpitch_path=\"fastpitch_method1.onnx\", # matches the filename written by the FastPitch export cell\n",
727
+ " hifigan_path=\"hifigan_fixed.onnx\", # matches the filename written by the HiFiGAN export cell\n",
728
+ " persian_dict_path=\"/content/drive/MyDrive/cNotebooks/perTTS/tts-nemo-fastpitch-hifigan-convert-to-onnx/persian-dict/persian-v4.0.dict\"\n",
729
+ " )\n",
730
+ "\n",
731
+ " # Test tokenizer first\n",
732
+ " persian_tts.test_tokenizer_consistency()\n",
733
+ "\n",
734
+ " # Generate speech for Persian text\n",
735
+ " persian_texts = [\n",
736
+ " 'سلام دنیا',\n",
737
+ " 'مدل تبدیل متن به گفتار پارسی',\n",
738
+ " 'این یک تست از سیستم تولید گفتار است',\n",
739
+ " 'پردازش زبان طبیعی فارسی'\n",
740
+ " ]\n",
741
+ "\n",
742
+ " for i, text in enumerate(persian_texts):\n",
743
+ " print(f\"\\n{'='*80}\")\n",
744
+ " try:\n",
745
+ " audio = persian_tts.text_to_speech(\n",
746
+ " text=text,\n",
747
+ " output_path=f\"persian_output_{i+1}.wav\",\n",
748
+ " sample_rate=22050\n",
749
+ " )\n",
750
+ " print(f\"✅ Successfully generated audio for text {i+1}\")\n",
751
+ "\n",
752
+ " except Exception as e:\n",
753
+ " print(f\"❌ Failed to generate audio for text {i+1}: {e}\")\n",
754
+ " # Debug the tokenization for this text\n",
755
+ " persian_tts.compare_with_training_tokenizer(text)\n",
756
+ "\n",
757
+ " return persian_tts\n",
758
+ "\n",
759
+ "if __name__ == \"__main__\":\n",
760
+ " tts_pipeline = main()"
761
+ ],
762
+ "metadata": {
763
+ "colab": {
764
+ "base_uri": "https://localhost:8080/"
765
+ },
766
+ "id": "ZlNsGG8hTcRx",
767
+ "outputId": "8bce8a81-78ac-4c6b-e376-bcbc55b7a064"
768
+ },
769
+ "execution_count": 23,
770
+ "outputs": [
771
+ {
772
+ "output_type": "stream",
773
+ "name": "stdout",
774
+ "text": [
775
+ "Initializing Persian tokenizer...\n",
776
+ "FastPitch inputs: ['text']\n",
777
+ "FastPitch outputs: ['mel_spec', 'seq_lens', 'durs_predicted', 'log_durs_predicted', 'res']\n",
778
+ "HiFiGAN inputs: ['mel_spec']\n",
779
+ "HiFiGAN outputs: ['audio']\n",
780
+ "Test tokenization: 'مدل تبدیل متن به گفتار پارسی' -> [0, 26, 55, 9, 54, 25, 0, 3, 53, 1]...\n",
781
+ "🧪 Testing tokenizer consistency:\n",
782
+ " 'سلام دنیا' -> 13 tokens -> ' |s|a|l|Λ|m| |d|o|n|y|Λ| '\n",
783
+ " 'مدل تبدیل متن به گفتار پارسی' -> 35 tokens -> ' |m|o|d|e|l| |t|a|b|d|i|l| |m|a|t|n| |b|E| |g|o|f|t|Λ|r| |p|Λ|r|s|i| '\n",
784
+ " 'این یک تست است' -> 17 tokens -> ' |I|n| |y|e|k| |t|e|s|t| |ą|s|t| '\n",
785
+ " 'پردازش زبان طبیعی' -> 23 tokens -> ' |p|a|r|d|Λ|z|e|S| |z|a|b|Λ|n| |T|a|b|i|ʔ|i| '\n",
786
+ "\n",
787
+ "================================================================================\n",
788
+ "🎙️ Generating Persian speech for: 'سلام دنیا'\n",
789
+ "============================================================\n",
790
+ "📊 Generating mel spectrogram...\n",
791
+ "Text: 'سلام دنیا'\n",
792
+ "Tokens length: 13\n",
793
+ "First 20 tokens: [0, 14, 53, 25, 43, 26, 0, 9, 55, 27, 32, 43, 0]\n",
794
+ "Token range: [0, 55]\n",
795
+ "FastPitch inputs: ['text']\n",
796
+ " text: shape (1, 13), dtype int64\n",
797
+ "Generated mel spectrogram shape: (1, 80, 106)\n",
798
+ "Mel range: [-11.0657, -1.1308]\n",
799
+ "🔊 Generating audio...\n",
800
+ "HiFiGAN input shape: (1, 80, 106)\n",
801
+ "Generated audio shape: (27136,)\n",
802
+ "Audio range: [-0.1950, 0.1185]\n",
803
+ "Audio RMS: 0.0259\n",
804
+ "💾 Audio saved to: persian_output_1.wav\n",
805
+ "✅ Persian TTS generation completed!\n",
806
+ "✅ Successfully generated audio for text 1\n",
807
+ "\n",
808
+ "================================================================================\n",
809
+ "🎙️ Generating Persian speech for: 'مدل تبدیل متن به گفتار پارسی'\n",
810
+ "============================================================\n",
811
+ "📊 Generating mel spectrogram...\n",
812
+ "Text: 'مدل تبدیل متن به گفتار پارسی'\n",
813
+ "Tokens length: 35\n",
814
+ "First 20 tokens: [0, 26, 55, 9, 54, 25, 0, 3, 53, 1, 9, 50, 25, 0, 26, 53, 3, 27, 0, 1]\n",
815
+ "Token range: [0, 55]\n",
816
+ "FastPitch inputs: ['text']\n",
817
+ " text: shape (1, 35), dtype int64\n",
818
+ "Generated mel spectrogram shape: (1, 80, 240)\n",
819
+ "Mel range: [-10.2846, 0.0889]\n",
820
+ "🔊 Generating audio...\n",
821
+ "HiFiGAN input shape: (1, 80, 240)\n",
822
+ "Generated audio shape: (61440,)\n",
823
+ "Audio range: [-0.4547, 0.4433]\n",
824
+ "Audio RMS: 0.0730\n",
825
+ "💾 Audio saved to: persian_output_2.wav\n",
826
+ "✅ Persian TTS generation completed!\n",
827
+ "✅ Successfully generated audio for text 2\n",
828
+ "\n",
829
+ "================================================================================\n",
830
+ "🎙️ Generating Persian speech for: 'این یک تست از سیستم تولید گفتار است'\n",
831
+ "============================================================\n",
832
+ "📊 Generating mel spectrogram...\n",
833
+ "Text: 'این یک تست از سیستم تولید گفتار است'\n",
834
+ "Tokens length: 40\n",
835
+ "First 20 tokens: [0, 51, 27, 0, 32, 54, 24, 0, 3, 54, 14, 3, 0, 44, 12, 0, 14, 50, 14, 3]\n",
836
+ "Token range: [0, 55]\n",
837
+ "FastPitch inputs: ['text']\n",
838
+ " text: shape (1, 40), dtype int64\n",
839
+ "Generated mel spectrogram shape: (1, 80, 275)\n",
840
+ "Mel range: [-10.2355, 0.9884]\n",
841
+ "🔊 Generating audio...\n",
842
+ "HiFiGAN input shape: (1, 80, 275)\n",
843
+ "Generated audio shape: (70400,)\n",
844
+ "Audio range: [-0.6646, 0.4960]\n",
845
+ "Audio RMS: 0.1006\n",
846
+ "💾 Audio saved to: persian_output_3.wav\n",
847
+ "✅ Persian TTS generation completed!\n",
848
+ "✅ Successfully generated audio for text 3\n",
849
+ "\n",
850
+ "================================================================================\n",
851
+ "🎙️ Generating Persian speech for: 'پردازش زبان طبیعی فارسی'\n",
852
+ "============================================================\n",
853
+ "📊 Generating mel spectrogram...\n",
854
+ "Text: 'پردازش زبان طبیعی فارسی'\n",
855
+ "Tokens length: 29\n",
856
+ "First 20 tokens: [0, 2, 53, 11, 9, 43, 12, 54, 15, 0, 12, 53, 1, 43, 27, 0, 18, 53, 1, 50]\n",
857
+ "Token range: [0, 54]\n",
858
+ "FastPitch inputs: ['text']\n",
859
+ " text: shape (1, 29), dtype int64\n",
860
+ "Generated mel spectrogram shape: (1, 80, 214)\n",
861
+ "Mel range: [-11.0496, -0.7565]\n",
862
+ "🔊 Generating audio...\n",
863
+ "HiFiGAN input shape: (1, 80, 214)\n",
864
+ "Generated audio shape: (54784,)\n",
865
+ "Audio range: [-0.2387, 0.2220]\n",
866
+ "Audio RMS: 0.0293\n",
867
+ "💾 Audio saved to: persian_output_4.wav\n",
868
+ "✅ Persian TTS generation completed!\n",
869
+ "✅ Successfully generated audio for text 4\n"
870
+ ]
871
+ }
872
+ ]
873
+ },
874
+ {
875
+ "cell_type": "code",
876
+ "source": [],
877
+ "metadata": {
878
+ "id": "rLrOthW4VUqZ"
879
+ },
880
+ "execution_count": null,
881
+ "outputs": []
882
+ }
883
+ ]
884
+ }