tts_fa_fastpitch_hifigan-v2.0
Browse files- .gitattributes +3 -0
- tts_fa_fastpitch_hifigan-v2.0/.gitattributes +38 -0
- tts_fa_fastpitch_hifigan-v2.0/README.md +65 -0
- tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/.ipynb_checkpoints/fastpitch_align_22050_grapheme-checkpoint.yaml +248 -0
- tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/ds_for_fastpitch_align.yaml +47 -0
- tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/fastpitch_align_22050_grapheme.yaml +248 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/.ipynb_checkpoints/hifigan-checkpoint.yaml +99 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/hifigan.yaml +99 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/.ipynb_checkpoints/v1-checkpoint.yaml +7 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/v1.yaml +7 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/.ipynb_checkpoints/train_ds_finetune-checkpoint.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/train_ds_finetune.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds_finetune.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/v1.yaml +7 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/val_ds_finetune.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/.ipynb_checkpoints/val_ds_finetune-checkpoint.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/val_ds_finetune.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.ckpt +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0688-epoch-12-last.ckpt +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/fastpitch.onnx +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/hifigan.onnx +3 -0
- tts_fa_fastpitch_hifigan-v2.0/persian-dict/persian-v5.0.dict +0 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/generate_mels-checkpoint.py +181 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/hifigan_finetune-checkpoint.py +32 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/extract_sup_data.py +83 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/fastpitch.py +35 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/generate_mels.py +181 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/hifigan_finetune.py +32 -0
- tts_fa_fastpitch_hifigan-v2.0/source.txt +1 -0
- tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-from-scratch-finetuning-hifigan.ipynb +0 -0
- tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-inference-only.ipynb +0 -0
- tts_fa_fastpitch_hifigan-v2.0/tts_nemo_fastpitch_hifigan_convert_to_onnx.ipynb +884 -0
.gitattributes
CHANGED
|
@@ -48,3 +48,6 @@ hifigan_for_sherpa/pretrained/UNIVERSAL_V1/g_02500000 filter=lfs diff=lfs merge=
|
|
| 48 |
hifigan_for_sherpa/pretrained/VCTK_V1/generator_v1 filter=lfs diff=lfs merge=lfs -text
|
| 49 |
hifigan_for_sherpa/pretrained/VCTK_V2/generator_v2 filter=lfs diff=lfs merge=lfs -text
|
| 50 |
hifigan_for_sherpa/pretrained/VCTK_V3/generator_v3 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
hifigan_for_sherpa/pretrained/VCTK_V1/generator_v1 filter=lfs diff=lfs merge=lfs -text
|
| 49 |
hifigan_for_sherpa/pretrained/VCTK_V2/generator_v2 filter=lfs diff=lfs merge=lfs -text
|
| 50 |
hifigan_for_sherpa/pretrained/VCTK_V3/generator_v3 filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo filter=lfs diff=lfs merge=lfs -text
|
tts_fa_fastpitch_hifigan-v2.0/.gitattributes
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
models/FastPitch--val_loss-0.7236-epoch-50.nemo filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
models/HifiGan--val_loss-0.6090-epoch-39-last.nemo filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
models/HifiGan--val_loss-2.0733-epoch-12-last.nemo filter=lfs diff=lfs merge=lfs -text
|
tts_fa_fastpitch_hifigan-v2.0/README.md
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
---
|
| 4 |
+
## FastPitch and HifiGan v2.0
|
| 5 |
+
|
| 6 |
+
v2.0 of the phonemizer and tokenizer. The tokenizer `DOES SUPPORT` pauses, emotion tokens, etc.
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
### Install NeMo
|
| 10 |
+
```bash
|
| 11 |
+
apt-get update && apt-get install -y libsndfile1 ffmpeg
|
| 12 |
+
pip install Cython packaging
|
| 13 |
+
rm -rf /usr/lib/python3.10/site-packages/blinker*
|
| 14 |
+
rm -rf /usr/local/lib/python3.10/dist-packages/blinker*
|
| 15 |
+
pip install --ignore-installed blinker
|
| 16 |
+
pip install --upgrade --force-reinstall blinker
|
| 17 |
+
|
| 18 |
+
git clone https://github.com/SadeghKrmi/NeMo.git
|
| 19 |
+
cd NeMo
|
| 20 |
+
pip install -e '.[all]'
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
### deterministic split
|
| 25 |
+
Run the deterministic-train-test-split.py to split the train/test
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
### Extract the supportive data
|
| 29 |
+
using the following scripts, extract pitch statistics
|
| 30 |
+
```bash
|
| 31 |
+
tar -xzf dataset_splits.tar.gz
|
| 32 |
+
|
| 33 |
+
cd extract-supportive-data
|
| 34 |
+
HYDRA_FULL_ERROR=1 python3 ./scripts/extract_sup_data.py \
|
| 35 |
+
--config-path ../config/fastpitch/ \
|
| 36 |
+
--config-name ds_for_fastpitch_align.yaml \
|
| 37 |
+
manifest_filepath=./dataset_splits/train/train.jsonl \
|
| 38 |
+
sup_data_path=sup_data \
|
| 39 |
+
phoneme_dict_path=./persian-dict/persian-v4.0.dict \
|
| 40 |
+
++dataloader_params.num_workers=8
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
#### dataset sup pitch stats
|
| 44 |
+
PITCH_MEAN=98.72935485839844, PITCH_STD=29.40760040283203
|
| 45 |
+
PITCH_MIN=65.4063949584961, PITCH_MAX=2093.004638671875
|
| 46 |
+
|
| 47 |
+
### zip and download
|
| 48 |
+
```bash
|
| 49 |
+
tar -czf sup_data.tar.gz sup_data
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
### Training FastPitch
|
| 54 |
+
training for about 800 epochs, with CosineAnnealing sched. and `max_steps` 200,000 for lr to decay over time.
|
| 55 |
+
|
| 56 |
+
val_loss didn't decrease below about 0.77xx
|
| 57 |
+
|
| 58 |
+
`val_loss = mel_loss + dur_loss + pitch_loss + energy_loss`
|
| 59 |
+
|
| 60 |
+
### Training HiFiGAN
|
| 61 |
+
training for about 40 epochs; stopped the training based on quality checks done by listening to generated audio samples
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/.ipynb_checkpoints/fastpitch_align_22050_grapheme-checkpoint.yaml
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config contains the default values for training FastPitch model with aligner using 22KHz sampling
|
| 2 |
+
# rate. If you want to train model on other dataset, you can change config values according to your dataset.
|
| 3 |
+
# Most dataset-specific arguments are in the head of the config file, see below.
|
| 4 |
+
|
| 5 |
+
name: FastPitch
|
| 6 |
+
|
| 7 |
+
train_dataset: ???
|
| 8 |
+
validation_datasets: ???
|
| 9 |
+
sup_data_path: ???
|
| 10 |
+
sup_data_types: [ "align_prior_matrix", "pitch" ]
|
| 11 |
+
|
| 12 |
+
phoneme_dict_path: ???
|
| 13 |
+
|
| 14 |
+
# Default values from librosa.pyin
|
| 15 |
+
pitch_fmin: 65.4063949584961
|
| 16 |
+
pitch_fmax: 2093.004638671875
|
| 17 |
+
|
| 18 |
+
# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
|
| 19 |
+
# by running `scripts/dataset_processing/tts/extract_sup_data.py`
|
| 20 |
+
pitch_mean: 103.01591491699219 # e.g. 132.524658203125 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
|
| 21 |
+
pitch_std: 30.397296905517578 # e.g. 37.389366149902 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
|
| 22 |
+
|
| 23 |
+
# Default values for dataset with sample_rate=22050
|
| 24 |
+
sample_rate: 22050
|
| 25 |
+
n_mel_channels: 80
|
| 26 |
+
n_window_size: 1024
|
| 27 |
+
n_window_stride: 256
|
| 28 |
+
n_fft: 1024
|
| 29 |
+
lowfreq: 0
|
| 30 |
+
highfreq: null
|
| 31 |
+
window: hann
|
| 32 |
+
|
| 33 |
+
model:
|
| 34 |
+
learn_alignment: true
|
| 35 |
+
bin_loss_warmup_epochs: 100
|
| 36 |
+
|
| 37 |
+
n_speakers: 1
|
| 38 |
+
max_token_duration: 75
|
| 39 |
+
symbols_embedding_dim: 384
|
| 40 |
+
pitch_embedding_kernel_size: 3
|
| 41 |
+
|
| 42 |
+
pitch_fmin: ${pitch_fmin}
|
| 43 |
+
pitch_fmax: ${pitch_fmax}
|
| 44 |
+
|
| 45 |
+
pitch_mean: ${pitch_mean}
|
| 46 |
+
pitch_std: ${pitch_std}
|
| 47 |
+
|
| 48 |
+
sample_rate: ${sample_rate}
|
| 49 |
+
n_mel_channels: ${n_mel_channels}
|
| 50 |
+
n_window_size: ${n_window_size}
|
| 51 |
+
n_window_stride: ${n_window_stride}
|
| 52 |
+
n_fft: ${n_fft}
|
| 53 |
+
lowfreq: ${lowfreq}
|
| 54 |
+
highfreq: ${highfreq}
|
| 55 |
+
window: ${window}
|
| 56 |
+
|
| 57 |
+
# text_normalizer:
|
| 58 |
+
# _target_: nemo_text_processing.text_normalization.normalize.Normalizer
|
| 59 |
+
# lang: de
|
| 60 |
+
# input_case: cased
|
| 61 |
+
|
| 62 |
+
# text_normalizer_call_kwargs:
|
| 63 |
+
# verbose: false
|
| 64 |
+
# punct_pre_process: true
|
| 65 |
+
# punct_post_process: true
|
| 66 |
+
|
| 67 |
+
text_tokenizer:
|
| 68 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer.PersianPhonemesTokenizer
|
| 69 |
+
punct: true
|
| 70 |
+
use_emotion_tokens: true
|
| 71 |
+
use_pause_tokens: true
|
| 72 |
+
use_speed_tokens: true
|
| 73 |
+
g2p:
|
| 74 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.g2p.PersianG2p
|
| 75 |
+
phoneme_dict: ${phoneme_dict_path}
|
| 76 |
+
|
| 77 |
+
train_ds:
|
| 78 |
+
dataset:
|
| 79 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 80 |
+
manifest_filepath: ${train_dataset}
|
| 81 |
+
sample_rate: ${model.sample_rate}
|
| 82 |
+
sup_data_path: ${sup_data_path}
|
| 83 |
+
sup_data_types: ${sup_data_types}
|
| 84 |
+
n_fft: ${model.n_fft}
|
| 85 |
+
win_length: ${model.n_window_size}
|
| 86 |
+
hop_length: ${model.n_window_stride}
|
| 87 |
+
window: ${model.window}
|
| 88 |
+
n_mels: ${model.n_mel_channels}
|
| 89 |
+
lowfreq: ${model.lowfreq}
|
| 90 |
+
highfreq: ${model.highfreq}
|
| 91 |
+
max_duration: 25 # change to null to include longer audios.
|
| 92 |
+
min_duration: 0.1
|
| 93 |
+
ignore_file: null
|
| 94 |
+
trim: true
|
| 95 |
+
trim_top_db: 50
|
| 96 |
+
trim_frame_length: ${model.n_window_size}
|
| 97 |
+
trim_hop_length: ${model.n_window_stride}
|
| 98 |
+
pitch_fmin: ${model.pitch_fmin}
|
| 99 |
+
pitch_fmax: ${model.pitch_fmax}
|
| 100 |
+
pitch_norm: true
|
| 101 |
+
pitch_mean: ${model.pitch_mean}
|
| 102 |
+
pitch_std: ${model.pitch_std}
|
| 103 |
+
|
| 104 |
+
dataloader_params:
|
| 105 |
+
drop_last: false
|
| 106 |
+
shuffle: true
|
| 107 |
+
batch_size: 32
|
| 108 |
+
num_workers: 12
|
| 109 |
+
pin_memory: true
|
| 110 |
+
|
| 111 |
+
validation_ds:
|
| 112 |
+
dataset:
|
| 113 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 114 |
+
manifest_filepath: ${validation_datasets}
|
| 115 |
+
sample_rate: ${model.sample_rate}
|
| 116 |
+
sup_data_path: ${sup_data_path}
|
| 117 |
+
sup_data_types: ${sup_data_types}
|
| 118 |
+
n_fft: ${model.n_fft}
|
| 119 |
+
win_length: ${model.n_window_size}
|
| 120 |
+
hop_length: ${model.n_window_stride}
|
| 121 |
+
window: ${model.window}
|
| 122 |
+
n_mels: ${model.n_mel_channels}
|
| 123 |
+
lowfreq: ${model.lowfreq}
|
| 124 |
+
highfreq: ${model.highfreq}
|
| 125 |
+
max_duration: 25 # change to null to include longer audios.
|
| 126 |
+
min_duration: 0.1
|
| 127 |
+
ignore_file: null
|
| 128 |
+
trim: true
|
| 129 |
+
trim_top_db: 50
|
| 130 |
+
trim_frame_length: ${model.n_window_size}
|
| 131 |
+
trim_hop_length: ${model.n_window_stride}
|
| 132 |
+
pitch_fmin: ${model.pitch_fmin}
|
| 133 |
+
pitch_fmax: ${model.pitch_fmax}
|
| 134 |
+
pitch_norm: true
|
| 135 |
+
pitch_mean: ${model.pitch_mean}
|
| 136 |
+
pitch_std: ${model.pitch_std}
|
| 137 |
+
|
| 138 |
+
dataloader_params:
|
| 139 |
+
drop_last: false
|
| 140 |
+
shuffle: false
|
| 141 |
+
batch_size: 32
|
| 142 |
+
num_workers: 8
|
| 143 |
+
pin_memory: true
|
| 144 |
+
|
| 145 |
+
preprocessor:
|
| 146 |
+
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
|
| 147 |
+
features: ${model.n_mel_channels}
|
| 148 |
+
lowfreq: ${model.lowfreq}
|
| 149 |
+
highfreq: ${model.highfreq}
|
| 150 |
+
n_fft: ${model.n_fft}
|
| 151 |
+
n_window_size: ${model.n_window_size}
|
| 152 |
+
window_size: false
|
| 153 |
+
n_window_stride: ${model.n_window_stride}
|
| 154 |
+
window_stride: false
|
| 155 |
+
pad_to: 1
|
| 156 |
+
pad_value: 0
|
| 157 |
+
sample_rate: ${model.sample_rate}
|
| 158 |
+
window: ${model.window}
|
| 159 |
+
normalize: null
|
| 160 |
+
preemph: null
|
| 161 |
+
dither: 0.0
|
| 162 |
+
frame_splicing: 1
|
| 163 |
+
log: true
|
| 164 |
+
log_zero_guard_type: add
|
| 165 |
+
log_zero_guard_value: 1e-05
|
| 166 |
+
mag_power: 1.0
|
| 167 |
+
|
| 168 |
+
input_fft: #n_embed and padding_idx are added by the model
|
| 169 |
+
_target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
|
| 170 |
+
n_layer: 6
|
| 171 |
+
n_head: 1
|
| 172 |
+
d_model: ${model.symbols_embedding_dim}
|
| 173 |
+
d_head: 64
|
| 174 |
+
d_inner: 1536
|
| 175 |
+
kernel_size: 3
|
| 176 |
+
dropout: 0.1
|
| 177 |
+
dropatt: 0.1
|
| 178 |
+
dropemb: 0.0
|
| 179 |
+
d_embed: ${model.symbols_embedding_dim}
|
| 180 |
+
|
| 181 |
+
output_fft:
|
| 182 |
+
_target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
|
| 183 |
+
n_layer: 6
|
| 184 |
+
n_head: 1
|
| 185 |
+
d_model: ${model.symbols_embedding_dim}
|
| 186 |
+
d_head: 64
|
| 187 |
+
d_inner: 1536
|
| 188 |
+
kernel_size: 3
|
| 189 |
+
dropout: 0.1
|
| 190 |
+
dropatt: 0.1
|
| 191 |
+
dropemb: 0.0
|
| 192 |
+
|
| 193 |
+
alignment_module:
|
| 194 |
+
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
|
| 195 |
+
n_text_channels: ${model.symbols_embedding_dim}
|
| 196 |
+
|
| 197 |
+
duration_predictor:
|
| 198 |
+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
|
| 199 |
+
input_size: ${model.symbols_embedding_dim}
|
| 200 |
+
kernel_size: 3
|
| 201 |
+
filter_size: 256
|
| 202 |
+
dropout: 0.1
|
| 203 |
+
n_layers: 2
|
| 204 |
+
|
| 205 |
+
pitch_predictor:
|
| 206 |
+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
|
| 207 |
+
input_size: ${model.symbols_embedding_dim}
|
| 208 |
+
kernel_size: 3
|
| 209 |
+
filter_size: 256
|
| 210 |
+
dropout: 0.1
|
| 211 |
+
n_layers: 2
|
| 212 |
+
|
| 213 |
+
optim:
|
| 214 |
+
name: adamw
|
| 215 |
+
lr: 1e-3
|
| 216 |
+
betas: [0.9, 0.98]
|
| 217 |
+
weight_decay: 1e-3
|
| 218 |
+
|
| 219 |
+
sched:
|
| 220 |
+
name: CosineAnnealing
|
| 221 |
+
warmup_steps: 2000
|
| 222 |
+
last_epoch: -1
|
| 223 |
+
min_lr: 1e-6
|
| 224 |
+
|
| 225 |
+
trainer:
|
| 226 |
+
num_nodes: 1
|
| 227 |
+
devices: -1 # specify all GPUs regardless of its availability
|
| 228 |
+
accelerator: gpu
|
| 229 |
+
strategy: ddp
|
| 230 |
+
precision: 16
|
| 231 |
+
max_epochs: 1500
|
| 232 |
+
accumulate_grad_batches: 1
|
| 233 |
+
gradient_clip_val: 1000.0
|
| 234 |
+
enable_checkpointing: false # Provided by exp_manager
|
| 235 |
+
logger: false # Provided by exp_manager
|
| 236 |
+
log_every_n_steps: 100
|
| 237 |
+
check_val_every_n_epoch: 5
|
| 238 |
+
benchmark: false
|
| 239 |
+
|
| 240 |
+
exp_manager:
|
| 241 |
+
exp_dir: null
|
| 242 |
+
name: ${name}
|
| 243 |
+
create_tensorboard_logger: true
|
| 244 |
+
create_checkpoint_callback: true
|
| 245 |
+
checkpoint_callback_params:
|
| 246 |
+
monitor: val_loss
|
| 247 |
+
resume_if_exists: false
|
| 248 |
+
resume_ignore_no_checkpoint: false
|
tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/ds_for_fastpitch_align.yaml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: "ds_for_fastpitch_align"
|
| 2 |
+
|
| 3 |
+
manifest_filepath: ???
|
| 4 |
+
sup_data_path: ???
|
| 5 |
+
sup_data_types: [ "align_prior_matrix", "pitch" ]
|
| 6 |
+
phoneme_dict_path: ???
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
dataset:
|
| 10 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 11 |
+
manifest_filepath: ${manifest_filepath}
|
| 12 |
+
sample_rate: 22050
|
| 13 |
+
sup_data_path: ${sup_data_path}
|
| 14 |
+
sup_data_types: ${sup_data_types}
|
| 15 |
+
n_fft: 1024
|
| 16 |
+
win_length: 1024
|
| 17 |
+
hop_length: 256
|
| 18 |
+
window: "hann"
|
| 19 |
+
n_mels: 80
|
| 20 |
+
lowfreq: 0
|
| 21 |
+
highfreq: 8000
|
| 22 |
+
max_duration: null
|
| 23 |
+
min_duration: 0.1
|
| 24 |
+
ignore_file: null
|
| 25 |
+
trim: false
|
| 26 |
+
pitch_fmin: 65.40639132514966
|
| 27 |
+
pitch_fmax: 2093.004522404789
|
| 28 |
+
|
| 29 |
+
# text_normalizer:
|
| 30 |
+
# _target_: nemo_text_processing.text_normalization.normalize.Normalizer
|
| 31 |
+
# lang: en
|
| 32 |
+
# input_case: cased
|
| 33 |
+
|
| 34 |
+
# text_normalizer_call_kwargs:
|
| 35 |
+
# verbose: false
|
| 36 |
+
# punct_pre_process: true
|
| 37 |
+
# punct_post_process: true
|
| 38 |
+
|
| 39 |
+
text_tokenizer:
|
| 40 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer.PersianPhonemesTokenizer
|
| 41 |
+
punct: true
|
| 42 |
+
use_emotion_tokens: true
|
| 43 |
+
use_pause_tokens: true
|
| 44 |
+
use_speed_tokens: true
|
| 45 |
+
g2p:
|
| 46 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.g2p.PersianG2p
|
| 47 |
+
phoneme_dict: ${phoneme_dict_path}
|
tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/fastpitch_align_22050_grapheme.yaml
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config contains the default values for training FastPitch model with aligner using 22KHz sampling
|
| 2 |
+
# rate. If you want to train model on other dataset, you can change config values according to your dataset.
|
| 3 |
+
# Most dataset-specific arguments are in the head of the config file, see below.
|
| 4 |
+
|
| 5 |
+
name: FastPitch
|
| 6 |
+
|
| 7 |
+
train_dataset: ???
|
| 8 |
+
validation_datasets: ???
|
| 9 |
+
sup_data_path: ???
|
| 10 |
+
sup_data_types: [ "align_prior_matrix", "pitch" ]
|
| 11 |
+
|
| 12 |
+
phoneme_dict_path: ???
|
| 13 |
+
|
| 14 |
+
# Default values from librosa.pyin
|
| 15 |
+
pitch_fmin: 65.4063949584961
|
| 16 |
+
pitch_fmax: 2093.004638671875
|
| 17 |
+
|
| 18 |
+
# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
|
| 19 |
+
# by running `scripts/dataset_processing/tts/extract_sup_data.py`
|
| 20 |
+
pitch_mean: 103.01591491699219 # e.g. 132.524658203125 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
|
| 21 |
+
pitch_std: 30.397296905517578 # e.g. 37.389366149902 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
|
| 22 |
+
|
| 23 |
+
# Default values for dataset with sample_rate=22050
|
| 24 |
+
sample_rate: 22050
|
| 25 |
+
n_mel_channels: 80
|
| 26 |
+
n_window_size: 1024
|
| 27 |
+
n_window_stride: 256
|
| 28 |
+
n_fft: 1024
|
| 29 |
+
lowfreq: 0
|
| 30 |
+
highfreq: null
|
| 31 |
+
window: hann
|
| 32 |
+
|
| 33 |
+
model:
|
| 34 |
+
learn_alignment: true
|
| 35 |
+
bin_loss_warmup_epochs: 100
|
| 36 |
+
|
| 37 |
+
n_speakers: 1
|
| 38 |
+
max_token_duration: 75
|
| 39 |
+
symbols_embedding_dim: 384
|
| 40 |
+
pitch_embedding_kernel_size: 3
|
| 41 |
+
|
| 42 |
+
pitch_fmin: ${pitch_fmin}
|
| 43 |
+
pitch_fmax: ${pitch_fmax}
|
| 44 |
+
|
| 45 |
+
pitch_mean: ${pitch_mean}
|
| 46 |
+
pitch_std: ${pitch_std}
|
| 47 |
+
|
| 48 |
+
sample_rate: ${sample_rate}
|
| 49 |
+
n_mel_channels: ${n_mel_channels}
|
| 50 |
+
n_window_size: ${n_window_size}
|
| 51 |
+
n_window_stride: ${n_window_stride}
|
| 52 |
+
n_fft: ${n_fft}
|
| 53 |
+
lowfreq: ${lowfreq}
|
| 54 |
+
highfreq: ${highfreq}
|
| 55 |
+
window: ${window}
|
| 56 |
+
|
| 57 |
+
# text_normalizer:
|
| 58 |
+
# _target_: nemo_text_processing.text_normalization.normalize.Normalizer
|
| 59 |
+
# lang: de
|
| 60 |
+
# input_case: cased
|
| 61 |
+
|
| 62 |
+
# text_normalizer_call_kwargs:
|
| 63 |
+
# verbose: false
|
| 64 |
+
# punct_pre_process: true
|
| 65 |
+
# punct_post_process: true
|
| 66 |
+
|
| 67 |
+
text_tokenizer:
|
| 68 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer.PersianPhonemesTokenizer
|
| 69 |
+
punct: true
|
| 70 |
+
use_emotion_tokens: true
|
| 71 |
+
use_pause_tokens: true
|
| 72 |
+
use_speed_tokens: true
|
| 73 |
+
g2p:
|
| 74 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.g2p.PersianG2p
|
| 75 |
+
phoneme_dict: ${phoneme_dict_path}
|
| 76 |
+
|
| 77 |
+
train_ds:
|
| 78 |
+
dataset:
|
| 79 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 80 |
+
manifest_filepath: ${train_dataset}
|
| 81 |
+
sample_rate: ${model.sample_rate}
|
| 82 |
+
sup_data_path: ${sup_data_path}
|
| 83 |
+
sup_data_types: ${sup_data_types}
|
| 84 |
+
n_fft: ${model.n_fft}
|
| 85 |
+
win_length: ${model.n_window_size}
|
| 86 |
+
hop_length: ${model.n_window_stride}
|
| 87 |
+
window: ${model.window}
|
| 88 |
+
n_mels: ${model.n_mel_channels}
|
| 89 |
+
lowfreq: ${model.lowfreq}
|
| 90 |
+
highfreq: ${model.highfreq}
|
| 91 |
+
max_duration: 25 # change to null to include longer audios.
|
| 92 |
+
min_duration: 0.1
|
| 93 |
+
ignore_file: null
|
| 94 |
+
trim: true
|
| 95 |
+
trim_top_db: 50
|
| 96 |
+
trim_frame_length: ${model.n_window_size}
|
| 97 |
+
trim_hop_length: ${model.n_window_stride}
|
| 98 |
+
pitch_fmin: ${model.pitch_fmin}
|
| 99 |
+
pitch_fmax: ${model.pitch_fmax}
|
| 100 |
+
pitch_norm: true
|
| 101 |
+
pitch_mean: ${model.pitch_mean}
|
| 102 |
+
pitch_std: ${model.pitch_std}
|
| 103 |
+
|
| 104 |
+
dataloader_params:
|
| 105 |
+
drop_last: false
|
| 106 |
+
shuffle: true
|
| 107 |
+
batch_size: 32
|
| 108 |
+
num_workers: 12
|
| 109 |
+
pin_memory: true
|
| 110 |
+
|
| 111 |
+
validation_ds:
|
| 112 |
+
dataset:
|
| 113 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 114 |
+
manifest_filepath: ${validation_datasets}
|
| 115 |
+
sample_rate: ${model.sample_rate}
|
| 116 |
+
sup_data_path: ${sup_data_path}
|
| 117 |
+
sup_data_types: ${sup_data_types}
|
| 118 |
+
n_fft: ${model.n_fft}
|
| 119 |
+
win_length: ${model.n_window_size}
|
| 120 |
+
hop_length: ${model.n_window_stride}
|
| 121 |
+
window: ${model.window}
|
| 122 |
+
n_mels: ${model.n_mel_channels}
|
| 123 |
+
lowfreq: ${model.lowfreq}
|
| 124 |
+
highfreq: ${model.highfreq}
|
| 125 |
+
max_duration: 25 # change to null to include longer audios.
|
| 126 |
+
min_duration: 0.1
|
| 127 |
+
ignore_file: null
|
| 128 |
+
trim: true
|
| 129 |
+
trim_top_db: 50
|
| 130 |
+
trim_frame_length: ${model.n_window_size}
|
| 131 |
+
trim_hop_length: ${model.n_window_stride}
|
| 132 |
+
pitch_fmin: ${model.pitch_fmin}
|
| 133 |
+
pitch_fmax: ${model.pitch_fmax}
|
| 134 |
+
pitch_norm: true
|
| 135 |
+
pitch_mean: ${model.pitch_mean}
|
| 136 |
+
pitch_std: ${model.pitch_std}
|
| 137 |
+
|
| 138 |
+
dataloader_params:
|
| 139 |
+
drop_last: false
|
| 140 |
+
shuffle: false
|
| 141 |
+
batch_size: 32
|
| 142 |
+
num_workers: 8
|
| 143 |
+
pin_memory: true
|
| 144 |
+
|
| 145 |
+
preprocessor:
|
| 146 |
+
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
|
| 147 |
+
features: ${model.n_mel_channels}
|
| 148 |
+
lowfreq: ${model.lowfreq}
|
| 149 |
+
highfreq: ${model.highfreq}
|
| 150 |
+
n_fft: ${model.n_fft}
|
| 151 |
+
n_window_size: ${model.n_window_size}
|
| 152 |
+
window_size: false
|
| 153 |
+
n_window_stride: ${model.n_window_stride}
|
| 154 |
+
window_stride: false
|
| 155 |
+
pad_to: 1
|
| 156 |
+
pad_value: 0
|
| 157 |
+
sample_rate: ${model.sample_rate}
|
| 158 |
+
window: ${model.window}
|
| 159 |
+
normalize: null
|
| 160 |
+
preemph: null
|
| 161 |
+
dither: 0.0
|
| 162 |
+
frame_splicing: 1
|
| 163 |
+
log: true
|
| 164 |
+
log_zero_guard_type: add
|
| 165 |
+
log_zero_guard_value: 1e-05
|
| 166 |
+
mag_power: 1.0
|
| 167 |
+
|
| 168 |
+
input_fft: #n_embed and padding_idx are added by the model
|
| 169 |
+
_target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
|
| 170 |
+
n_layer: 6
|
| 171 |
+
n_head: 1
|
| 172 |
+
d_model: ${model.symbols_embedding_dim}
|
| 173 |
+
d_head: 64
|
| 174 |
+
d_inner: 1536
|
| 175 |
+
kernel_size: 3
|
| 176 |
+
dropout: 0.1
|
| 177 |
+
dropatt: 0.1
|
| 178 |
+
dropemb: 0.0
|
| 179 |
+
d_embed: ${model.symbols_embedding_dim}
|
| 180 |
+
|
| 181 |
+
output_fft:
|
| 182 |
+
_target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
|
| 183 |
+
n_layer: 6
|
| 184 |
+
n_head: 1
|
| 185 |
+
d_model: ${model.symbols_embedding_dim}
|
| 186 |
+
d_head: 64
|
| 187 |
+
d_inner: 1536
|
| 188 |
+
kernel_size: 3
|
| 189 |
+
dropout: 0.1
|
| 190 |
+
dropatt: 0.1
|
| 191 |
+
dropemb: 0.0
|
| 192 |
+
|
| 193 |
+
alignment_module:
|
| 194 |
+
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
|
| 195 |
+
n_text_channels: ${model.symbols_embedding_dim}
|
| 196 |
+
|
| 197 |
+
duration_predictor:
|
| 198 |
+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
|
| 199 |
+
input_size: ${model.symbols_embedding_dim}
|
| 200 |
+
kernel_size: 3
|
| 201 |
+
filter_size: 256
|
| 202 |
+
dropout: 0.1
|
| 203 |
+
n_layers: 2
|
| 204 |
+
|
| 205 |
+
pitch_predictor:
|
| 206 |
+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
|
| 207 |
+
input_size: ${model.symbols_embedding_dim}
|
| 208 |
+
kernel_size: 3
|
| 209 |
+
filter_size: 256
|
| 210 |
+
dropout: 0.1
|
| 211 |
+
n_layers: 2
|
| 212 |
+
|
| 213 |
+
optim:
|
| 214 |
+
name: adamw
|
| 215 |
+
lr: 1e-3
|
| 216 |
+
betas: [0.9, 0.98]
|
| 217 |
+
weight_decay: 1e-3
|
| 218 |
+
|
| 219 |
+
sched:
|
| 220 |
+
name: CosineAnnealing
|
| 221 |
+
warmup_steps: 2000
|
| 222 |
+
last_epoch: -1
|
| 223 |
+
min_lr: 1e-6
|
| 224 |
+
|
| 225 |
+
trainer:
|
| 226 |
+
num_nodes: 1
|
| 227 |
+
devices: -1 # specify all GPUs regardless of its availability
|
| 228 |
+
accelerator: gpu
|
| 229 |
+
strategy: ddp
|
| 230 |
+
precision: 16
|
| 231 |
+
max_epochs: 1500
|
| 232 |
+
accumulate_grad_batches: 1
|
| 233 |
+
gradient_clip_val: 1000.0
|
| 234 |
+
enable_checkpointing: false # Provided by exp_manager
|
| 235 |
+
logger: false # Provided by exp_manager
|
| 236 |
+
log_every_n_steps: 100
|
| 237 |
+
check_val_every_n_epoch: 5
|
| 238 |
+
benchmark: false
|
| 239 |
+
|
| 240 |
+
exp_manager:
|
| 241 |
+
exp_dir: null
|
| 242 |
+
name: ${name}
|
| 243 |
+
create_tensorboard_logger: true
|
| 244 |
+
create_checkpoint_callback: true
|
| 245 |
+
checkpoint_callback_params:
|
| 246 |
+
monitor: val_loss
|
| 247 |
+
resume_if_exists: false
|
| 248 |
+
resume_ignore_no_checkpoint: false
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/.ipynb_checkpoints/hifigan-checkpoint.yaml
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config contains the default values for training HiFi-GAN model on LJSpeech dataset.
|
| 2 |
+
# If you want to train model on other dataset, you can change config values according to your dataset.
|
| 3 |
+
# Most dataset-specific arguments are in the head of the config file, see below.
|
| 4 |
+
|
| 5 |
+
name: "HifiGan"
|
| 6 |
+
|
| 7 |
+
train_dataset: ???
|
| 8 |
+
validation_datasets: ???
|
| 9 |
+
|
| 10 |
+
# Default values for dataset with sample_rate=22050
|
| 11 |
+
sample_rate: 22050
|
| 12 |
+
n_mel_channels: 80
|
| 13 |
+
n_window_size: 1024
|
| 14 |
+
n_window_stride: 256
|
| 15 |
+
n_fft: 1024
|
| 16 |
+
lowfreq: 0
|
| 17 |
+
highfreq: 8000
|
| 18 |
+
window: hann
|
| 19 |
+
|
| 20 |
+
train_n_segments: 8192
|
| 21 |
+
train_max_duration: null
|
| 22 |
+
train_min_duration: 0.75
|
| 23 |
+
|
| 24 |
+
val_n_segments: 66048
|
| 25 |
+
val_max_duration: null
|
| 26 |
+
val_min_duration: 0.75
|
| 27 |
+
|
| 28 |
+
defaults:
|
| 29 |
+
- model/generator: v1
|
| 30 |
+
- model/train_ds: train_ds
|
| 31 |
+
- model/validation_ds: val_ds
|
| 32 |
+
|
| 33 |
+
model:
|
| 34 |
+
preprocessor:
|
| 35 |
+
_target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
|
| 36 |
+
nfilt: ${n_mel_channels}
|
| 37 |
+
lowfreq: ${lowfreq}
|
| 38 |
+
highfreq: ${highfreq}
|
| 39 |
+
n_fft: ${n_fft}
|
| 40 |
+
n_window_size: ${n_window_size}
|
| 41 |
+
n_window_stride: ${n_window_stride}
|
| 42 |
+
pad_to: 0
|
| 43 |
+
pad_value: -11.52
|
| 44 |
+
sample_rate: ${sample_rate}
|
| 45 |
+
window: ${window}
|
| 46 |
+
normalize: null
|
| 47 |
+
preemph: null
|
| 48 |
+
dither: 0.0
|
| 49 |
+
frame_splicing: 1
|
| 50 |
+
log: true
|
| 51 |
+
log_zero_guard_type: clamp
|
| 52 |
+
log_zero_guard_value: 1e-05
|
| 53 |
+
mag_power: 1.0
|
| 54 |
+
use_grads: false
|
| 55 |
+
exact_pad: true
|
| 56 |
+
|
| 57 |
+
optim:
|
| 58 |
+
_target_: torch.optim.AdamW
|
| 59 |
+
lr: 0.0002
|
| 60 |
+
betas: [0.8, 0.99]
|
| 61 |
+
|
| 62 |
+
sched:
|
| 63 |
+
name: CosineAnnealing
|
| 64 |
+
min_lr: 1e-5
|
| 65 |
+
warmup_ratio: 0.02
|
| 66 |
+
|
| 67 |
+
max_steps: 50000
|
| 68 |
+
l1_loss_factor: 45
|
| 69 |
+
denoise_strength: 0.0025
|
| 70 |
+
|
| 71 |
+
trainer:
|
| 72 |
+
num_nodes: 1
|
| 73 |
+
devices: 1
|
| 74 |
+
accelerator: gpu
|
| 75 |
+
strategy: ddp_find_unused_parameters_true
|
| 76 |
+
precision: 32
|
| 77 |
+
max_steps: ${model.max_steps}
|
| 78 |
+
accumulate_grad_batches: 1
|
| 79 |
+
enable_checkpointing: False # Provided by exp_manager
|
| 80 |
+
logger: false # Provided by exp_manager
|
| 81 |
+
log_every_n_steps: 100
|
| 82 |
+
check_val_every_n_epoch: 10
|
| 83 |
+
benchmark: false
|
| 84 |
+
|
| 85 |
+
exp_manager:
|
| 86 |
+
exp_dir: null
|
| 87 |
+
name: ${name}
|
| 88 |
+
create_tensorboard_logger: true
|
| 89 |
+
create_checkpoint_callback: true
|
| 90 |
+
checkpoint_callback_params:
|
| 91 |
+
monitor: val_loss
|
| 92 |
+
mode: min
|
| 93 |
+
create_wandb_logger: false
|
| 94 |
+
wandb_logger_kwargs:
|
| 95 |
+
name: null
|
| 96 |
+
project: null
|
| 97 |
+
entity: null
|
| 98 |
+
resume_if_exists: false
|
| 99 |
+
resume_ignore_no_checkpoint: false
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/hifigan.yaml
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config contains the default values for training HiFi-GAN model on LJSpeech dataset.
|
| 2 |
+
# If you want to train model on other dataset, you can change config values according to your dataset.
|
| 3 |
+
# Most dataset-specific arguments are in the head of the config file, see below.
|
| 4 |
+
|
| 5 |
+
name: "HifiGan"
|
| 6 |
+
|
| 7 |
+
train_dataset: ???
|
| 8 |
+
validation_datasets: ???
|
| 9 |
+
|
| 10 |
+
# Default values for dataset with sample_rate=22050
|
| 11 |
+
sample_rate: 22050
|
| 12 |
+
n_mel_channels: 80
|
| 13 |
+
n_window_size: 1024
|
| 14 |
+
n_window_stride: 256
|
| 15 |
+
n_fft: 1024
|
| 16 |
+
lowfreq: 0
|
| 17 |
+
highfreq: 8000
|
| 18 |
+
window: hann
|
| 19 |
+
|
| 20 |
+
train_n_segments: 8192
|
| 21 |
+
train_max_duration: null
|
| 22 |
+
train_min_duration: 0.75
|
| 23 |
+
|
| 24 |
+
val_n_segments: 66048
|
| 25 |
+
val_max_duration: null
|
| 26 |
+
val_min_duration: 3
|
| 27 |
+
|
| 28 |
+
defaults:
|
| 29 |
+
- model/generator: v1
|
| 30 |
+
- model/train_ds: train_ds
|
| 31 |
+
- model/validation_ds: val_ds
|
| 32 |
+
|
| 33 |
+
model:
|
| 34 |
+
preprocessor:
|
| 35 |
+
_target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
|
| 36 |
+
nfilt: ${n_mel_channels}
|
| 37 |
+
lowfreq: ${lowfreq}
|
| 38 |
+
highfreq: ${highfreq}
|
| 39 |
+
n_fft: ${n_fft}
|
| 40 |
+
n_window_size: ${n_window_size}
|
| 41 |
+
n_window_stride: ${n_window_stride}
|
| 42 |
+
pad_to: 0
|
| 43 |
+
pad_value: -11.52
|
| 44 |
+
sample_rate: ${sample_rate}
|
| 45 |
+
window: ${window}
|
| 46 |
+
normalize: null
|
| 47 |
+
preemph: null
|
| 48 |
+
dither: 0.0
|
| 49 |
+
frame_splicing: 1
|
| 50 |
+
log: true
|
| 51 |
+
log_zero_guard_type: clamp
|
| 52 |
+
log_zero_guard_value: 1e-05
|
| 53 |
+
mag_power: 1.0
|
| 54 |
+
use_grads: false
|
| 55 |
+
exact_pad: true
|
| 56 |
+
|
| 57 |
+
optim:
|
| 58 |
+
_target_: torch.optim.AdamW
|
| 59 |
+
lr: 0.0002
|
| 60 |
+
betas: [0.8, 0.99]
|
| 61 |
+
|
| 62 |
+
sched:
|
| 63 |
+
name: CosineAnnealing
|
| 64 |
+
min_lr: 1e-5
|
| 65 |
+
warmup_ratio: 0.02
|
| 66 |
+
|
| 67 |
+
max_steps: 2500000
|
| 68 |
+
l1_loss_factor: 45
|
| 69 |
+
denoise_strength: 0.0025
|
| 70 |
+
|
| 71 |
+
trainer:
|
| 72 |
+
num_nodes: 1
|
| 73 |
+
devices: 1
|
| 74 |
+
accelerator: gpu
|
| 75 |
+
strategy: ddp_find_unused_parameters_true
|
| 76 |
+
precision: 32
|
| 77 |
+
max_steps: ${model.max_steps}
|
| 78 |
+
accumulate_grad_batches: 1
|
| 79 |
+
enable_checkpointing: False # Provided by exp_manager
|
| 80 |
+
logger: false # Provided by exp_manager
|
| 81 |
+
log_every_n_steps: 100
|
| 82 |
+
check_val_every_n_epoch: 10
|
| 83 |
+
benchmark: false
|
| 84 |
+
|
| 85 |
+
exp_manager:
|
| 86 |
+
exp_dir: null
|
| 87 |
+
name: ${name}
|
| 88 |
+
create_tensorboard_logger: true
|
| 89 |
+
create_checkpoint_callback: true
|
| 90 |
+
checkpoint_callback_params:
|
| 91 |
+
monitor: val_loss
|
| 92 |
+
mode: min
|
| 93 |
+
create_wandb_logger: false
|
| 94 |
+
wandb_logger_kwargs:
|
| 95 |
+
name: null
|
| 96 |
+
project: null
|
| 97 |
+
entity: null
|
| 98 |
+
resume_if_exists: false
|
| 99 |
+
resume_ignore_no_checkpoint: false
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/.ipynb_checkpoints/v1-checkpoint.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: nemo.collections.tts.modules.hifigan_modules.Generator
|
| 2 |
+
resblock: 1
|
| 3 |
+
upsample_rates: [8,8,2,2]
|
| 4 |
+
upsample_kernel_sizes: [16,16,4,4]
|
| 5 |
+
upsample_initial_channel: 512
|
| 6 |
+
resblock_kernel_sizes: [3,7,11]
|
| 7 |
+
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/v1.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: nemo.collections.tts.modules.hifigan_modules.Generator
|
| 2 |
+
resblock: 1
|
| 3 |
+
upsample_rates: [8,8,2,2]
|
| 4 |
+
upsample_kernel_sizes: [16,16,4,4]
|
| 5 |
+
upsample_initial_channel: 512
|
| 6 |
+
resblock_kernel_sizes: [3,7,11]
|
| 7 |
+
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/.ipynb_checkpoints/train_ds_finetune-checkpoint.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${train_dataset}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${train_n_segments}
|
| 6 |
+
max_duration: ${train_max_duration}
|
| 7 |
+
min_duration: ${train_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 32
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/train_ds_finetune.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${train_dataset}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${train_n_segments}
|
| 6 |
+
max_duration: ${train_max_duration}
|
| 7 |
+
min_duration: ${train_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 32
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds_finetune.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${train_dataset}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${train_n_segments}
|
| 6 |
+
max_duration: ${train_max_duration}
|
| 7 |
+
min_duration: ${train_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 16
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/v1.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: nemo.collections.tts.modules.hifigan_modules.Generator
|
| 2 |
+
resblock: 1
|
| 3 |
+
upsample_rates: [8,8,2,2]
|
| 4 |
+
upsample_kernel_sizes: [16,16,4,4]
|
| 5 |
+
upsample_initial_channel: 512
|
| 6 |
+
resblock_kernel_sizes: [3,7,11]
|
| 7 |
+
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/val_ds_finetune.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${validation_datasets}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${val_n_segments}
|
| 6 |
+
max_duration: ${val_max_duration}
|
| 7 |
+
min_duration: ${val_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: false
|
| 13 |
+
batch_size: 16
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/.ipynb_checkpoints/val_ds_finetune-checkpoint.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${validation_datasets}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${val_n_segments}
|
| 6 |
+
max_duration: ${val_max_duration}
|
| 7 |
+
min_duration: ${val_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: false
|
| 13 |
+
batch_size: 16
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/val_ds_finetune.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${validation_datasets}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${val_n_segments}
|
| 6 |
+
max_duration: ${val_max_duration}
|
| 7 |
+
min_duration: ${val_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: false
|
| 13 |
+
batch_size: 16
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eea3093c753874952bab5719b9d82c664b0c1c7bc4116a3034d657659269e3bb
|
| 3 |
+
size 549427880
|
tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cbff27139ad11c3e742378596421b00c15f5dce664255119b4e1652a4e73d64c
|
| 3 |
+
size 184258560
|
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8199747904ba8c35f64f1d3b1a1a4f62c303fd2f0238c2148f2750833563aa8a
|
| 3 |
+
size 339210240
|
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0688-epoch-12-last.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24e7ddabf6058bef99b570a56131368bdbe39b2c9b095c5be8b1d6d7c8c5adcf
|
| 3 |
+
size 1016835427
|
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0e087e923573c910b0a6c86e02c355bf3a85c82017cdcce82f99ff47a2a8577
|
| 3 |
+
size 339210240
|
tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/fastpitch.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0108a874da58f4cd5c99c7819b89cff32cc841a6d23f3f4fae4e901aeb315000
|
| 3 |
+
size 179344671
|
tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/hifigan.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f5898c7b64e7c64c8421210f17f498c7b13e4d121d6f33f09d226cb6956f970
|
| 3 |
+
size 55760326
|
tts_fa_fastpitch_hifigan-v2.0/persian-dict/persian-v5.0.dict
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/generate_mels-checkpoint.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
This script is to generate mel spectrograms from a Fastpitch model checkpoint. Please see general usage below. It runs
|
| 17 |
+
on GPUs by default, but you can add `--num-workers 5 --cpu` as an option to run on CPUs.
|
| 18 |
+
|
| 19 |
+
$ python scripts/dataset_processing/tts/generate_mels.py \
|
| 20 |
+
--fastpitch-model-ckpt ./models/fastpitch/multi_spk/FastPitch--val_loss\=1.4473-epoch\=209.ckpt \
|
| 21 |
+
--input-json-manifests /home/xueyang/HUI-Audio-Corpus-German-clean/test_manifest_text_normed_phonemes.json
|
| 22 |
+
--output-json-manifest-root /home/xueyang/experiments/multi_spk_tts_de
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import argparse
|
| 26 |
+
import json
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
|
| 29 |
+
import numpy as np
|
| 30 |
+
import soundfile as sf
|
| 31 |
+
import torch
|
| 32 |
+
from joblib import Parallel, delayed
|
| 33 |
+
from tqdm import tqdm
|
| 34 |
+
|
| 35 |
+
from nemo.collections.tts.models import FastPitchModel
|
| 36 |
+
from nemo.collections.tts.parts.utils.tts_dataset_utils import (
|
| 37 |
+
BetaBinomialInterpolator,
|
| 38 |
+
beta_binomial_prior_distribution,
|
| 39 |
+
)
|
| 40 |
+
from nemo.utils import logging
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_args():
|
| 44 |
+
parser = argparse.ArgumentParser(
|
| 45 |
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
| 46 |
+
description="Generate mel spectrograms with pretrained FastPitch model, and create manifests for finetuning Hifigan.",
|
| 47 |
+
)
|
| 48 |
+
parser.add_argument(
|
| 49 |
+
"--fastpitch-model-ckpt",
|
| 50 |
+
required=True,
|
| 51 |
+
type=Path,
|
| 52 |
+
help="Specify a full path of a fastpitch model checkpoint with the suffix of either .ckpt or .nemo.",
|
| 53 |
+
)
|
| 54 |
+
parser.add_argument(
|
| 55 |
+
"--input-json-manifests",
|
| 56 |
+
nargs="+",
|
| 57 |
+
required=True,
|
| 58 |
+
type=Path,
|
| 59 |
+
help="Specify a full path of a JSON manifest. You could add multiple manifests.",
|
| 60 |
+
)
|
| 61 |
+
parser.add_argument(
|
| 62 |
+
"--output-json-manifest-root",
|
| 63 |
+
required=True,
|
| 64 |
+
type=Path,
|
| 65 |
+
help="Specify a full path of output root that would contain new manifests.",
|
| 66 |
+
)
|
| 67 |
+
parser.add_argument(
|
| 68 |
+
"--num-workers",
|
| 69 |
+
default=-1,
|
| 70 |
+
type=int,
|
| 71 |
+
help="Specify the max number of concurrently Python workers processes. "
|
| 72 |
+
"If -1 all CPUs are used. If 1 no parallel computing is used.",
|
| 73 |
+
)
|
| 74 |
+
parser.add_argument("--cpu", action='store_true', default=False, help="Generate mel spectrograms using CPUs.")
|
| 75 |
+
args = parser.parse_args()
|
| 76 |
+
return args
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def __load_wav(audio_file):
|
| 80 |
+
with sf.SoundFile(audio_file, 'r') as f:
|
| 81 |
+
samples = f.read(dtype='float32')
|
| 82 |
+
return samples.transpose()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root):
|
| 86 |
+
# Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
|
| 87 |
+
audio = __load_wav(entry["audio_filepath"])
|
| 88 |
+
audio = torch.from_numpy(audio).unsqueeze(0).to(device)
|
| 89 |
+
audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)
|
| 90 |
+
|
| 91 |
+
if spec_model.fastpitch.speaker_emb is not None and "speaker" in entry:
|
| 92 |
+
speaker = torch.tensor([entry['speaker']]).to(device)
|
| 93 |
+
else:
|
| 94 |
+
speaker = None
|
| 95 |
+
|
| 96 |
+
with torch.no_grad():
|
| 97 |
+
if "normalized_text" in entry:
|
| 98 |
+
text = spec_model.parse(entry["normalized_text"], normalize=False)
|
| 99 |
+
else:
|
| 100 |
+
text = spec_model.parse(entry['text'])
|
| 101 |
+
|
| 102 |
+
text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
|
| 103 |
+
spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)
|
| 104 |
+
mel_len = spect.shape[-1]
|
| 105 |
+
|
| 106 |
+
# Generate attention prior and spectrogram inputs for HiFi-GAN
|
| 107 |
+
if use_beta_binomial_interpolator:
|
| 108 |
+
beta_binomial_interpolator = BetaBinomialInterpolator()
|
| 109 |
+
attn_prior = (
|
| 110 |
+
torch.from_numpy(beta_binomial_interpolator(mel_len, text_len.item()))
|
| 111 |
+
.unsqueeze(0)
|
| 112 |
+
.to(text.device)
|
| 113 |
+
)
|
| 114 |
+
else:
|
| 115 |
+
attn_prior = (
|
| 116 |
+
torch.from_numpy(beta_binomial_prior_distribution(text_len.item(), mel_len))
|
| 117 |
+
.unsqueeze(0)
|
| 118 |
+
.to(text.device)
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
spectrogram = spec_model.forward(
|
| 122 |
+
text=text, input_lens=text_len, spec=spect, mel_lens=spect_len, attn_prior=attn_prior, speaker=speaker,
|
| 123 |
+
)[0]
|
| 124 |
+
|
| 125 |
+
save_path = mel_root / f"{Path(entry['audio_filepath']).stem}.npy"
|
| 126 |
+
np.save(save_path, spectrogram[0].to('cpu').numpy())
|
| 127 |
+
entry["mel_filepath"] = str(save_path)
|
| 128 |
+
|
| 129 |
+
return entry
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def main():
|
| 133 |
+
args = get_args()
|
| 134 |
+
ckpt_path = args.fastpitch_model_ckpt
|
| 135 |
+
input_manifest_filepaths = args.input_json_manifests
|
| 136 |
+
output_json_manifest_root = args.output_json_manifest_root
|
| 137 |
+
|
| 138 |
+
mel_root = output_json_manifest_root / "mels"
|
| 139 |
+
mel_root.mkdir(exist_ok=True, parents=True)
|
| 140 |
+
|
| 141 |
+
# load pretrained FastPitch model checkpoint
|
| 142 |
+
suffix = ckpt_path.suffix
|
| 143 |
+
if suffix == ".nemo":
|
| 144 |
+
spec_model = FastPitchModel.restore_from(ckpt_path).eval()
|
| 145 |
+
elif suffix == ".ckpt":
|
| 146 |
+
spec_model = FastPitchModel.load_from_checkpoint(ckpt_path).eval()
|
| 147 |
+
else:
|
| 148 |
+
raise ValueError(f"Unsupported suffix: {suffix}")
|
| 149 |
+
if not args.cpu:
|
| 150 |
+
spec_model.cuda()
|
| 151 |
+
device = spec_model.device
|
| 152 |
+
|
| 153 |
+
use_beta_binomial_interpolator = spec_model.cfg.train_ds.dataset.get("use_beta_binomial_interpolator", False)
|
| 154 |
+
|
| 155 |
+
for manifest in input_manifest_filepaths:
|
| 156 |
+
logging.info(f"Processing {manifest}.")
|
| 157 |
+
entries = []
|
| 158 |
+
with open(manifest, "r") as fjson:
|
| 159 |
+
for line in fjson:
|
| 160 |
+
entries.append(json.loads(line.strip()))
|
| 161 |
+
|
| 162 |
+
if device == "cpu":
|
| 163 |
+
new_entries = Parallel(n_jobs=args.num_workers)(
|
| 164 |
+
delayed(__generate_mels)(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
|
| 165 |
+
for entry in entries
|
| 166 |
+
)
|
| 167 |
+
else:
|
| 168 |
+
new_entries = []
|
| 169 |
+
for entry in tqdm(entries):
|
| 170 |
+
new_entry = __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
|
| 171 |
+
new_entries.append(new_entry)
|
| 172 |
+
|
| 173 |
+
mel_manifest_path = output_json_manifest_root / f"{manifest.stem}_mel{manifest.suffix}"
|
| 174 |
+
with open(mel_manifest_path, "w") as fmel:
|
| 175 |
+
for entry in new_entries:
|
| 176 |
+
fmel.write(json.dumps(entry) + "\n")
|
| 177 |
+
logging.info(f"Processing {manifest} is complete --> {mel_manifest_path}")
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
if __name__ == "__main__":
|
| 181 |
+
main()
|
tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/hifigan_finetune-checkpoint.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
import lightning.pytorch as pl
|
| 16 |
+
|
| 17 |
+
from nemo.collections.tts.models import HifiGanModel
|
| 18 |
+
from nemo.core.config import hydra_runner
|
| 19 |
+
from nemo.utils.exp_manager import exp_manager
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@hydra_runner(config_path="conf/hifigan", config_name="hifigan_44100")
|
| 23 |
+
def main(cfg):
|
| 24 |
+
trainer = pl.Trainer(**cfg.trainer)
|
| 25 |
+
exp_manager(trainer, cfg.get("exp_manager", None))
|
| 26 |
+
model = HifiGanModel(cfg=cfg.model, trainer=trainer)
|
| 27 |
+
model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
|
| 28 |
+
trainer.fit(model)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
if __name__ == '__main__':
|
| 32 |
+
main() # noqa pylint: disable=no-value-for-parameter
|
tts_fa_fastpitch_hifigan-v2.0/scripts/extract_sup_data.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
import torch
|
| 17 |
+
from hydra.utils import instantiate
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
|
| 20 |
+
from nemo.core.config import hydra_runner
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def get_pitch_stats(pitch_list):
|
| 24 |
+
pitch_tensor = torch.cat(pitch_list)
|
| 25 |
+
pitch_mean, pitch_std = pitch_tensor.mean().item(), pitch_tensor.std().item()
|
| 26 |
+
pitch_min, pitch_max = pitch_tensor.min().item(), pitch_tensor.max().item()
|
| 27 |
+
print(f"PITCH_MEAN={pitch_mean}, PITCH_STD={pitch_std}")
|
| 28 |
+
print(f"PITCH_MIN={pitch_min}, PITCH_MAX={pitch_max}")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def preprocess_ds_for_fastpitch_align(dataloader):
|
| 32 |
+
pitch_list = []
|
| 33 |
+
for batch in tqdm(dataloader, total=len(dataloader)):
|
| 34 |
+
audios, audio_lengths, tokens, tokens_lengths, align_prior_matrices, pitches, pitches_lengths, *_ = batch
|
| 35 |
+
pitch = pitches.squeeze(0)
|
| 36 |
+
pitch_list.append(pitch[pitch != 0])
|
| 37 |
+
|
| 38 |
+
get_pitch_stats(pitch_list)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def preprocess_ds_for_mixer_tts_x(dataloader):
|
| 42 |
+
pitch_list = []
|
| 43 |
+
for batch in tqdm(dataloader, total=len(dataloader)):
|
| 44 |
+
(
|
| 45 |
+
audios,
|
| 46 |
+
audio_lengths,
|
| 47 |
+
tokens,
|
| 48 |
+
tokens_lengths,
|
| 49 |
+
align_prior_matrices,
|
| 50 |
+
pitches,
|
| 51 |
+
pitches_lengths,
|
| 52 |
+
lm_tokens,
|
| 53 |
+
) = batch
|
| 54 |
+
|
| 55 |
+
pitch = pitches.squeeze(0)
|
| 56 |
+
pitch_list.append(pitch[pitch != 0])
|
| 57 |
+
|
| 58 |
+
get_pitch_stats(pitch_list)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
CFG_NAME2FUNC = {
|
| 62 |
+
"ds_for_fastpitch_align": preprocess_ds_for_fastpitch_align,
|
| 63 |
+
"ds_for_mixer_tts": preprocess_ds_for_fastpitch_align,
|
| 64 |
+
"ds_for_mixer_tts_x": preprocess_ds_for_mixer_tts_x,
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@hydra_runner(config_path='ljspeech/ds_conf', config_name='ds_for_fastpitch_align')
|
| 69 |
+
def main(cfg):
|
| 70 |
+
dataset = instantiate(cfg.dataset)
|
| 71 |
+
dataloader = torch.utils.data.DataLoader(
|
| 72 |
+
dataset=dataset,
|
| 73 |
+
batch_size=1,
|
| 74 |
+
collate_fn=dataset._collate_fn,
|
| 75 |
+
num_workers=cfg.get("dataloader_params", {}).get("num_workers", 4),
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
print(f"Processing {cfg.manifest_filepath}:")
|
| 79 |
+
CFG_NAME2FUNC[cfg.name](dataloader)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
if __name__ == '__main__':
|
| 83 |
+
main() # noqa pylint: disable=no-value-for-parameter
|
tts_fa_fastpitch_hifigan-v2.0/scripts/fastpitch.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
import lightning.pytorch as pl
|
| 16 |
+
|
| 17 |
+
from nemo.collections.common.callbacks import LogEpochTimeCallback
|
| 18 |
+
from nemo.collections.tts.models import FastPitchModel
|
| 19 |
+
from nemo.core.config import hydra_runner
|
| 20 |
+
from nemo.utils.exp_manager import exp_manager
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@hydra_runner(config_path="conf", config_name="fastpitch_align_v1.05")
def main(cfg):
    """Train a FastPitch model from the given Hydra configuration.

    Sets up a Lightning trainer and the NeMo experiment manager, builds the
    model, attaches learning-rate and epoch-time logging callbacks, and runs
    training.
    """
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = FastPitchModel(cfg=cfg.model, trainer=trainer)
    # Log the learning rate and wall-clock time per epoch during training.
    extra_callbacks = [
        pl.callbacks.LearningRateMonitor(),
        LogEpochTimeCallback(),
    ]
    trainer.callbacks.extend(extra_callbacks)
    trainer.fit(model)


if __name__ == '__main__':
    main()  # noqa pylint: disable=no-value-for-parameter
|
tts_fa_fastpitch_hifigan-v2.0/scripts/generate_mels.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
This script is to generate mel spectrograms from a Fastpitch model checkpoint. Please see general usage below. It runs
|
| 17 |
+
on GPUs by default, but you can add `--num-workers 5 --cpu` as an option to run on CPUs.
|
| 18 |
+
|
| 19 |
+
$ python scripts/dataset_processing/tts/generate_mels.py \
|
| 20 |
+
--fastpitch-model-ckpt ./models/fastpitch/multi_spk/FastPitch--val_loss\=1.4473-epoch\=209.ckpt \
|
| 21 |
+
--input-json-manifests /home/xueyang/HUI-Audio-Corpus-German-clean/test_manifest_text_normed_phonemes.json
|
| 22 |
+
--output-json-manifest-root /home/xueyang/experiments/multi_spk_tts_de
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import argparse
|
| 26 |
+
import json
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
|
| 29 |
+
import numpy as np
|
| 30 |
+
import soundfile as sf
|
| 31 |
+
import torch
|
| 32 |
+
from joblib import Parallel, delayed
|
| 33 |
+
from tqdm import tqdm
|
| 34 |
+
|
| 35 |
+
from nemo.collections.tts.models import FastPitchModel
|
| 36 |
+
from nemo.collections.tts.parts.utils.tts_dataset_utils import (
|
| 37 |
+
BetaBinomialInterpolator,
|
| 38 |
+
beta_binomial_prior_distribution,
|
| 39 |
+
)
|
| 40 |
+
from nemo.utils import logging
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_args():
    """Parse command-line arguments for mel-spectrogram generation.

    Returns:
        argparse.Namespace with fields: fastpitch_model_ckpt (Path),
        input_json_manifests (list[Path]), output_json_manifest_root (Path),
        num_workers (int, -1 means all CPUs) and cpu (bool).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate mel spectrograms with pretrained FastPitch model, and create manifests for finetuning Hifigan.",
    )
    parser.add_argument(
        "--fastpitch-model-ckpt",
        required=True,
        type=Path,
        help="Specify a full path of a fastpitch model checkpoint with the suffix of either .ckpt or .nemo.",
    )
    parser.add_argument(
        "--input-json-manifests",
        nargs="+",
        required=True,
        type=Path,
        help="Specify a full path of a JSON manifest. You could add multiple manifests.",
    )
    parser.add_argument(
        "--output-json-manifest-root",
        required=True,
        type=Path,
        help="Specify a full path of output root that would contain new manifests.",
    )
    parser.add_argument(
        "--num-workers",
        default=-1,
        type=int,
        help="Specify the max number of concurrently Python workers processes. "
        "If -1 all CPUs are used. If 1 no parallel computing is used.",
    )
    parser.add_argument("--cpu", action='store_true', default=False, help="Generate mel spectrograms using CPUs.")
    return parser.parse_args()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def __load_wav(audio_file):
    """Read an audio file with soundfile and return float32 samples.

    The sample array is transposed before being returned so that, for
    multi-channel audio, channels come first.
    """
    with sf.SoundFile(audio_file, 'r') as sound:
        data = sound.read(dtype='float32')
    return data.transpose()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root):
    """Generate one mel spectrogram for a manifest entry and save it as .npy.

    Uses the ground-truth audio (teacher forcing via an attention prior) so
    the produced mel matches the audio exactly, which is required for
    HiFi-GAN finetuning. Mutates ``entry`` by adding a ``mel_filepath`` key
    and returns it.
    """
    # Load the waveform and move it onto the model's device.
    samples = __load_wav(entry["audio_filepath"])
    audio = torch.from_numpy(samples).unsqueeze(0).to(device)
    audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)

    # Speaker conditioning only applies when the model has a speaker
    # embedding table and the entry carries a speaker id.
    speaker = None
    if spec_model.fastpitch.speaker_emb is not None and "speaker" in entry:
        speaker = torch.tensor([entry['speaker']]).to(device)

    with torch.no_grad():
        # Prefer pre-normalized text when the manifest provides it.
        if "normalized_text" in entry:
            text = spec_model.parse(entry["normalized_text"], normalize=False)
        else:
            text = spec_model.parse(entry['text'])

        text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
        spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)
        mel_len = spect.shape[-1]

        # Attention prior for aligning text tokens to mel frames.
        if use_beta_binomial_interpolator:
            interpolator = BetaBinomialInterpolator()
            prior = interpolator(mel_len, text_len.item())
        else:
            prior = beta_binomial_prior_distribution(text_len.item(), mel_len)
        attn_prior = torch.from_numpy(prior).unsqueeze(0).to(text.device)

        spectrogram = spec_model.forward(
            text=text, input_lens=text_len, spec=spect, mel_lens=spect_len, attn_prior=attn_prior, speaker=speaker,
        )[0]

    # Persist the mel next to its manifest root and record its path.
    save_path = mel_root / f"{Path(entry['audio_filepath']).stem}.npy"
    np.save(save_path, spectrogram[0].to('cpu').numpy())
    entry["mel_filepath"] = str(save_path)

    return entry
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def main():
    """Generate mel spectrograms for every input manifest.

    Loads a FastPitch checkpoint (.nemo or .ckpt), runs mel generation for
    each manifest entry (in parallel on CPU, sequentially with a progress
    bar otherwise), and writes a new ``*_mel`` manifest per input file into
    the output root.
    """
    args = get_args()
    ckpt_path = args.fastpitch_model_ckpt
    input_manifest_filepaths = args.input_json_manifests
    output_json_manifest_root = args.output_json_manifest_root

    mel_root = output_json_manifest_root / "mels"
    mel_root.mkdir(exist_ok=True, parents=True)

    # Restore the pretrained FastPitch model from either checkpoint format.
    suffix = ckpt_path.suffix
    if suffix == ".nemo":
        spec_model = FastPitchModel.restore_from(ckpt_path).eval()
    elif suffix == ".ckpt":
        spec_model = FastPitchModel.load_from_checkpoint(ckpt_path).eval()
    else:
        raise ValueError(f"Unsupported suffix: {suffix}")
    if not args.cpu:
        spec_model.cuda()
    device = spec_model.device

    use_beta_binomial_interpolator = spec_model.cfg.train_ds.dataset.get("use_beta_binomial_interpolator", False)

    for manifest in input_manifest_filepaths:
        logging.info(f"Processing {manifest}.")
        with open(manifest, "r") as fjson:
            entries = [json.loads(line.strip()) for line in fjson]

        if device == "cpu":
            # On CPU, fan out across worker processes with joblib.
            new_entries = Parallel(n_jobs=args.num_workers)(
                delayed(__generate_mels)(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
                for entry in entries
            )
        else:
            # On GPU, process sequentially with a progress bar.
            new_entries = []
            for entry in tqdm(entries):
                new_entries.append(
                    __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
                )

        # Emit the companion manifest that now includes mel_filepath fields.
        mel_manifest_path = output_json_manifest_root / f"{manifest.stem}_mel{manifest.suffix}"
        with open(mel_manifest_path, "w") as fmel:
            for entry in new_entries:
                fmel.write(json.dumps(entry) + "\n")
        logging.info(f"Processing {manifest} is complete --> {mel_manifest_path}")


if __name__ == "__main__":
    main()
|
tts_fa_fastpitch_hifigan-v2.0/scripts/hifigan_finetune.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
import lightning.pytorch as pl
|
| 16 |
+
|
| 17 |
+
from nemo.collections.tts.models import HifiGanModel
|
| 18 |
+
from nemo.core.config import hydra_runner
|
| 19 |
+
from nemo.utils.exp_manager import exp_manager
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@hydra_runner(config_path="conf/hifigan", config_name="hifigan_44100")
def main(cfg):
    """Finetune a HiFi-GAN vocoder from the given Hydra configuration.

    Sets up the Lightning trainer and NeMo experiment manager, builds the
    model, optionally restores weights from a pretrained checkpoint named in
    the config, then runs training.
    """
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = HifiGanModel(cfg=cfg.model, trainer=trainer)
    # Warm-start from a pretrained checkpoint when the config requests one.
    model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
    trainer.fit(model)


if __name__ == '__main__':
    main()  # noqa pylint: disable=no-value-for-parameter
|
tts_fa_fastpitch_hifigan-v2.0/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/SadeghK/tts_fa_fastpitch_hifigan-v2.0
|
tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-from-scratch-finetuning-hifigan.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-inference-only.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tts_fa_fastpitch_hifigan-v2.0/tts_nemo_fastpitch_hifigan_convert_to_onnx.ipynb
ADDED
|
@@ -0,0 +1,884 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": null,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"id": "4kodC7VXOd56"
|
| 22 |
+
},
|
| 23 |
+
"outputs": [],
|
| 24 |
+
"source": [
|
| 25 |
+
"# !python -m pip install --upgrade pip\n",
|
| 26 |
+
"!apt-get update && apt-get install -y libsndfile1 ffmpeg\n",
|
| 27 |
+
"!pip install Cython packaging\n",
|
| 28 |
+
"!rm -rf /usr/lib/python3.10/site-packages/blinker*\n",
|
| 29 |
+
"!rm -rf /usr/local/lib/python3.10/dist-packages/blinker*\n",
|
| 30 |
+
"!pip install --ignore-installed blinker\n",
|
| 31 |
+
"!pip install --upgrade --force-reinstall blinker\n",
|
| 32 |
+
"# !pip install dask-cuda==24.8.2\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"!mkdir -p /workspace/tts-nemo/\n",
|
| 35 |
+
"!cd /workspace/tts-nemo/\n",
|
| 36 |
+
"!git clone https://github.com/SadeghKrmi/NeMo.git\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"# to install and enable editing without re-installation\n",
|
| 39 |
+
"!cd NeMo && pip install -e '.[all]'\n",
|
| 40 |
+
"\n",
|
| 41 |
+
"# install without editing possibility\n",
|
| 42 |
+
"# !cd NeMo && pip install '.[all]'"
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"cell_type": "code",
|
| 47 |
+
"source": [
|
| 48 |
+
"from google.colab import drive\n",
|
| 49 |
+
"drive.mount('/content/drive')"
|
| 50 |
+
],
|
| 51 |
+
"metadata": {
|
| 52 |
+
"colab": {
|
| 53 |
+
"base_uri": "https://localhost:8080/"
|
| 54 |
+
},
|
| 55 |
+
"id": "LKzWYURw4S5i",
|
| 56 |
+
"outputId": "d0dbbac6-1391-4116-de27-19b0fd39805b"
|
| 57 |
+
},
|
| 58 |
+
"execution_count": 1,
|
| 59 |
+
"outputs": [
|
| 60 |
+
{
|
| 61 |
+
"output_type": "stream",
|
| 62 |
+
"name": "stdout",
|
| 63 |
+
"text": [
|
| 64 |
+
"Mounted at /content/drive\n"
|
| 65 |
+
]
|
| 66 |
+
}
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"cell_type": "code",
|
| 71 |
+
"source": [
|
| 72 |
+
"!ls -l /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo\n",
|
| 73 |
+
"!ls -l /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo\n",
|
| 74 |
+
"\n"
|
| 75 |
+
],
|
| 76 |
+
"metadata": {
|
| 77 |
+
"colab": {
|
| 78 |
+
"base_uri": "https://localhost:8080/"
|
| 79 |
+
},
|
| 80 |
+
"id": "lN8KV1CanbX1",
|
| 81 |
+
"outputId": "6c1a9459-bc49-43c1-a220-64e1c2e175aa"
|
| 82 |
+
},
|
| 83 |
+
"execution_count": 4,
|
| 84 |
+
"outputs": [
|
| 85 |
+
{
|
| 86 |
+
"output_type": "stream",
|
| 87 |
+
"name": "stdout",
|
| 88 |
+
"text": [
|
| 89 |
+
"-rw------- 1 root root 184258560 Aug 13 08:13 /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo\n",
|
| 90 |
+
"-rw------- 1 root root 339210240 Aug 15 12:11 /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo\n"
|
| 91 |
+
]
|
| 92 |
+
}
|
| 93 |
+
]
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"cell_type": "code",
|
| 97 |
+
"source": [
|
| 98 |
+
"!ls /content/drive/MyDrive/cNotebooks/perTTS/tts-nemo-fastpitch-hifigan-convert-to-onnx/"
|
| 99 |
+
],
|
| 100 |
+
"metadata": {
|
| 101 |
+
"colab": {
|
| 102 |
+
"base_uri": "https://localhost:8080/"
|
| 103 |
+
},
|
| 104 |
+
"id": "6hYdqdhxQscq",
|
| 105 |
+
"outputId": "4813f43b-5712-44c4-b9d6-51c12f4f729d"
|
| 106 |
+
},
|
| 107 |
+
"execution_count": null,
|
| 108 |
+
"outputs": [
|
| 109 |
+
{
|
| 110 |
+
"output_type": "stream",
|
| 111 |
+
"name": "stdout",
|
| 112 |
+
"text": [
|
| 113 |
+
"FastPitch--val_loss-0.7796-epoch-800-last.nemo\n",
|
| 114 |
+
"HifiGan--val_loss-0.6090-epoch-39-last.nemo\n",
|
| 115 |
+
"persian-dict\n",
|
| 116 |
+
"tts_nemo_fastpitch_hifigan_convert_to_onnx.ipynb\n"
|
| 117 |
+
]
|
| 118 |
+
}
|
| 119 |
+
]
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"cell_type": "code",
|
| 123 |
+
"source": [
|
| 124 |
+
"!pip install num2fawords -q"
|
| 125 |
+
],
|
| 126 |
+
"metadata": {
|
| 127 |
+
"id": "KNqfXdJ1poZ2"
|
| 128 |
+
},
|
| 129 |
+
"execution_count": 6,
|
| 130 |
+
"outputs": []
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"cell_type": "code",
|
| 134 |
+
"source": [
|
| 135 |
+
"from nemo.collections.tts.g2p.models.fa_ir_persian.g2p import PersianG2p\n",
|
| 136 |
+
"from nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer import PersianPhonemesTokenizer\n",
|
| 137 |
+
"\n",
|
| 138 |
+
"# test Persian Grapheme-to-phoneme module\n",
|
| 139 |
+
"g2p = PersianG2p(\n",
|
| 140 |
+
" phoneme_dict=\"./persian-v6.0.dict\",\n",
|
| 141 |
+
")\n",
|
| 142 |
+
"\n",
|
| 143 |
+
"# Text tokenizer\n",
|
| 144 |
+
"# text_tokenizer = PersianPhonemesTokenizer(punct=True,chars=True,pad_with_space=True,g2p=g2p)\n",
|
| 145 |
+
"\n",
|
| 146 |
+
"text_tokenizer = PersianPhonemesTokenizer(\n",
|
| 147 |
+
" g2p=g2p,\n",
|
| 148 |
+
" use_emotion_tokens=True,\n",
|
| 149 |
+
" use_pause_tokens=True,\n",
|
| 150 |
+
" use_speed_tokens=True\n",
|
| 151 |
+
")\n",
|
| 152 |
+
"\n",
|
| 153 |
+
"text = 'و تاریخ میلادی سال ۶۲۲ را نشان میداد.'\n",
|
| 154 |
+
"ids = text_tokenizer.encode(text)\n",
|
| 155 |
+
"print(ids)"
|
| 156 |
+
],
|
| 157 |
+
"metadata": {
|
| 158 |
+
"id": "mQxZY4z4OiGx",
|
| 159 |
+
"colab": {
|
| 160 |
+
"base_uri": "https://localhost:8080/"
|
| 161 |
+
},
|
| 162 |
+
"outputId": "0d3abeae-4778-407e-9325-495e26743ab3"
|
| 163 |
+
},
|
| 164 |
+
"execution_count": 7,
|
| 165 |
+
"outputs": [
|
| 166 |
+
{
|
| 167 |
+
"output_type": "stream",
|
| 168 |
+
"name": "stdout",
|
| 169 |
+
"text": [
|
| 170 |
+
"[0, 28, 53, 0, 3, 43, 11, 50, 8, 0, 26, 50, 25, 43, 9, 50, 0, 14, 43, 25, 0, 15, 54, 15, 16, 53, 9, 0, 47, 0, 1, 50, 14, 3, 0, 47, 0, 9, 47, 0, 11, 43, 0, 27, 54, 15, 43, 27, 0, 26, 50, 9, 43, 9, 69, 0]\n"
|
| 171 |
+
]
|
| 172 |
+
}
|
| 173 |
+
]
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"cell_type": "markdown",
|
| 177 |
+
"source": [
|
| 178 |
+
"### FastPitch Export"
|
| 179 |
+
],
|
| 180 |
+
"metadata": {
|
| 181 |
+
"id": "oc6P0je3TFe-"
|
| 182 |
+
}
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"cell_type": "code",
|
| 186 |
+
"source": [
|
| 187 |
+
"import nemo.collections.tts as nemo_tts\n",
|
| 188 |
+
"import torch\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"# Load model\n",
|
| 191 |
+
"fastpitch_model = nemo_tts.models.FastPitchModel.restore_from(\"/content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo\")\n",
|
| 192 |
+
"fastpitch_model.eval()\n",
|
| 193 |
+
"\n",
|
| 194 |
+
"# Get the actual vocabulary size from the model\n",
|
| 195 |
+
"vocab_size = fastpitch_model.fastpitch.encoder.word_emb.num_embeddings\n",
|
| 196 |
+
"print(f\"Model vocabulary size: {vocab_size}\")\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"# Method 1: Try with correct forward signature\n",
|
| 199 |
+
"class FastPitchWrapper1(torch.nn.Module):\n",
|
| 200 |
+
" def __init__(self, model):\n",
|
| 201 |
+
" super().__init__()\n",
|
| 202 |
+
" self.model = model\n",
|
| 203 |
+
"\n",
|
| 204 |
+
" def forward(self, text, input_lens):\n",
|
| 205 |
+
" return self.model.forward(text=text, input_lens=input_lens, pace=1.0)\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"# Method 2: Try with generate_spectrogram\n",
|
| 208 |
+
"class FastPitchWrapper2(torch.nn.Module):\n",
|
| 209 |
+
" def __init__(self, model):\n",
|
| 210 |
+
" super().__init__()\n",
|
| 211 |
+
" self.model = model\n",
|
| 212 |
+
"\n",
|
| 213 |
+
" def forward(self, tokens):\n",
|
| 214 |
+
" return self.model.generate_spectrogram(tokens=tokens, speaker=None, pace=1.0)\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"# Generate dummy data with valid token range (excluding padding token if it's 0)\n",
|
| 217 |
+
"padding_idx = getattr(fastpitch_model.fastpitch.encoder, 'padding_idx', 0)\n",
|
| 218 |
+
"valid_token_range = (1, vocab_size - 1) if padding_idx == 0 else (0, vocab_size - 1)\n",
|
| 219 |
+
"\n",
|
| 220 |
+
"dummy_text = torch.randint(valid_token_range[0], valid_token_range[1] + 1, (1, 50), dtype=torch.long)\n",
|
| 221 |
+
"dummy_input_lens = torch.tensor([50], dtype=torch.long)\n",
|
| 222 |
+
"\n",
|
| 223 |
+
"for i, (wrapper_class, args) in enumerate([(FastPitchWrapper1, (dummy_text, dummy_input_lens)),\n",
|
| 224 |
+
" (FastPitchWrapper2, (dummy_text,))], 1):\n",
|
| 225 |
+
" # try:\n",
|
| 226 |
+
" wrapper = wrapper_class(fastpitch_model)\n",
|
| 227 |
+
" with torch.no_grad():\n",
|
| 228 |
+
" output = wrapper(*args)\n",
|
| 229 |
+
"\n",
|
| 230 |
+
" print(f\"Method {i} works! Trying ONNX export...\")\n",
|
| 231 |
+
"\n",
|
| 232 |
+
" # Export to ONNX\n",
|
| 233 |
+
" input_names = ['text', 'input_lens'] if i == 1 else ['tokens']\n",
|
| 234 |
+
" torch.onnx.export(\n",
|
| 235 |
+
" wrapper,\n",
|
| 236 |
+
" args,\n",
|
| 237 |
+
" f\"fastpitch_method{i}.onnx\",\n",
|
| 238 |
+
" export_params=True,\n",
|
| 239 |
+
" opset_version=14,\n",
|
| 240 |
+
" input_names=input_names,\n",
|
| 241 |
+
" output_names=['mel_spec'],\n",
|
| 242 |
+
" dynamic_axes={\n",
|
| 243 |
+
" input_names[0]: {0: 'batch_size', 1: 'text_length'},\n",
|
| 244 |
+
" **(({input_names[1]: {0: 'batch_size'}} if len(input_names) > 1 else {})),\n",
|
| 245 |
+
" 'mel_spec': {0: 'batch_size', 2: 'mel_length'}\n",
|
| 246 |
+
" }\n",
|
| 247 |
+
" )\n",
|
| 248 |
+
" print(f\"Method {i} ONNX export successful!\")\n",
|
| 249 |
+
" break\n",
|
| 250 |
+
"\n",
|
| 251 |
+
" # except Exception as e:\n",
|
| 252 |
+
" # print(f\"Method {i} failed: {e}\")\n",
|
| 253 |
+
" # continue"
|
| 254 |
+
],
|
| 255 |
+
"metadata": {
|
| 256 |
+
"colab": {
|
| 257 |
+
"base_uri": "https://localhost:8080/"
|
| 258 |
+
},
|
| 259 |
+
"id": "l45qnF6tSn6e",
|
| 260 |
+
"outputId": "c02529da-e241-4e5d-faa1-6fc67e314fae"
|
| 261 |
+
},
|
| 262 |
+
"execution_count": 16,
|
| 263 |
+
"outputs": [
|
| 264 |
+
{
|
| 265 |
+
"output_type": "stream",
|
| 266 |
+
"name": "stderr",
|
| 267 |
+
"text": [
|
| 268 |
+
"[NeMo W 2025-08-15 12:26:10 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
|
| 269 |
+
" Train config : \n",
|
| 270 |
+
" dataset:\n",
|
| 271 |
+
" _target_: nemo.collections.tts.data.dataset.TTSDataset\n",
|
| 272 |
+
" manifest_filepath: ./dataset_splits/train/train.jsonl\n",
|
| 273 |
+
" sample_rate: 22050\n",
|
| 274 |
+
" sup_data_path: sup_data\n",
|
| 275 |
+
" sup_data_types:\n",
|
| 276 |
+
" - align_prior_matrix\n",
|
| 277 |
+
" - pitch\n",
|
| 278 |
+
" n_fft: 1024\n",
|
| 279 |
+
" win_length: 1024\n",
|
| 280 |
+
" hop_length: 256\n",
|
| 281 |
+
" window: hann\n",
|
| 282 |
+
" n_mels: 80\n",
|
| 283 |
+
" lowfreq: 0\n",
|
| 284 |
+
" highfreq: null\n",
|
| 285 |
+
" max_duration: 20\n",
|
| 286 |
+
" min_duration: 0.1\n",
|
| 287 |
+
" ignore_file: null\n",
|
| 288 |
+
" trim: true\n",
|
| 289 |
+
" trim_top_db: 50\n",
|
| 290 |
+
" trim_frame_length: 1024\n",
|
| 291 |
+
" trim_hop_length: 256\n",
|
| 292 |
+
" pitch_fmin: 65.4063949584961\n",
|
| 293 |
+
" pitch_fmax: 2093.004638671875\n",
|
| 294 |
+
" pitch_norm: true\n",
|
| 295 |
+
" pitch_mean: 103.01591491699219\n",
|
| 296 |
+
" pitch_std: 30.397296905517578\n",
|
| 297 |
+
" dataloader_params:\n",
|
| 298 |
+
" drop_last: false\n",
|
| 299 |
+
" shuffle: true\n",
|
| 300 |
+
" batch_size: 64\n",
|
| 301 |
+
" num_workers: 12\n",
|
| 302 |
+
" pin_memory: true\n",
|
| 303 |
+
" \n",
|
| 304 |
+
"[NeMo W 2025-08-15 12:26:10 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
|
| 305 |
+
" Validation config : \n",
|
| 306 |
+
" dataset:\n",
|
| 307 |
+
" _target_: nemo.collections.tts.data.dataset.TTSDataset\n",
|
| 308 |
+
" manifest_filepath: ./dataset_splits/test/test.jsonl\n",
|
| 309 |
+
" sample_rate: 22050\n",
|
| 310 |
+
" sup_data_path: sup_data\n",
|
| 311 |
+
" sup_data_types:\n",
|
| 312 |
+
" - align_prior_matrix\n",
|
| 313 |
+
" - pitch\n",
|
| 314 |
+
" n_fft: 1024\n",
|
| 315 |
+
" win_length: 1024\n",
|
| 316 |
+
" hop_length: 256\n",
|
| 317 |
+
" window: hann\n",
|
| 318 |
+
" n_mels: 80\n",
|
| 319 |
+
" lowfreq: 0\n",
|
| 320 |
+
" highfreq: null\n",
|
| 321 |
+
" max_duration: 20\n",
|
| 322 |
+
" min_duration: 0.1\n",
|
| 323 |
+
" ignore_file: null\n",
|
| 324 |
+
" trim: true\n",
|
| 325 |
+
" trim_top_db: 50\n",
|
| 326 |
+
" trim_frame_length: 1024\n",
|
| 327 |
+
" trim_hop_length: 256\n",
|
| 328 |
+
" pitch_fmin: 65.4063949584961\n",
|
| 329 |
+
" pitch_fmax: 2093.004638671875\n",
|
| 330 |
+
" pitch_norm: true\n",
|
| 331 |
+
" pitch_mean: 103.01591491699219\n",
|
| 332 |
+
" pitch_std: 30.397296905517578\n",
|
| 333 |
+
" dataloader_params:\n",
|
| 334 |
+
" drop_last: false\n",
|
| 335 |
+
" shuffle: false\n",
|
| 336 |
+
" batch_size: 24\n",
|
| 337 |
+
" num_workers: 8\n",
|
| 338 |
+
" pin_memory: true\n",
|
| 339 |
+
" \n"
|
| 340 |
+
]
|
| 341 |
+
},
|
| 342 |
+
{
|
| 343 |
+
"output_type": "stream",
|
| 344 |
+
"name": "stdout",
|
| 345 |
+
"text": [
|
| 346 |
+
"[NeMo I 2025-08-15 12:26:10 nemo_logging:393] PADDING: 1\n",
|
| 347 |
+
"[NeMo I 2025-08-15 12:26:11 nemo_logging:393] Model FastPitchModel was successfully restored from /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo.\n",
|
| 348 |
+
"Model vocabulary size: 94\n",
|
| 349 |
+
"Method 1 works! Trying ONNX export...\n",
|
| 350 |
+
"Method 1 ONNX export successful!\n"
|
| 351 |
+
]
|
| 352 |
+
}
|
| 353 |
+
]
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"cell_type": "markdown",
|
| 357 |
+
"source": [
|
| 358 |
+
"### HiFiGAN Export"
|
| 359 |
+
],
|
| 360 |
+
"metadata": {
|
| 361 |
+
"id": "aKuYMvWBTCSa"
|
| 362 |
+
}
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"cell_type": "code",
|
| 366 |
+
"source": [
|
| 367 |
+
"# Load HiFiGAN model\n",
|
| 368 |
+
"hifigan_model = nemo_tts.models.HifiGanModel.restore_from(\"/content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo\")\n",
|
| 369 |
+
"hifigan_model.eval()\n",
|
| 370 |
+
"\n",
|
| 371 |
+
"# HiFiGAN might also need wrapper if it has the same issue\n",
|
| 372 |
+
"class HiFiGANWrapper(torch.nn.Module):\n",
|
| 373 |
+
" def __init__(self, model):\n",
|
| 374 |
+
" super().__init__()\n",
|
| 375 |
+
" self.model = model\n",
|
| 376 |
+
"\n",
|
| 377 |
+
" def forward(self, mel_spec):\n",
|
| 378 |
+
" return self.model.forward(spec=mel_spec)\n",
|
| 379 |
+
"\n",
|
| 380 |
+
"wrapped_hifigan = HiFiGANWrapper(hifigan_model)\n",
|
| 381 |
+
"\n",
|
| 382 |
+
"# Export HiFiGAN\n",
|
| 383 |
+
"# dummy_mel = torch.randn(1, 80, 100)\n",
|
| 384 |
+
"dummy_mel = torch.randn(1, 80, 100)\n",
|
| 385 |
+
"torch.onnx.export(\n",
|
| 386 |
+
" wrapped_hifigan,\n",
|
| 387 |
+
" dummy_mel,\n",
|
| 388 |
+
" \"hifigan_fixed.onnx\",\n",
|
| 389 |
+
" export_params=True,\n",
|
| 390 |
+
" opset_version=14,\n",
|
| 391 |
+
" do_constant_folding=True,\n",
|
| 392 |
+
" input_names=['mel_spec'],\n",
|
| 393 |
+
" output_names=['audio'],\n",
|
| 394 |
+
" dynamic_axes={\n",
|
| 395 |
+
" 'mel_spec': {0: 'batch_size', 2: 'mel_length'},\n",
|
| 396 |
+
" 'audio': {0: 'batch_size', 1: 'audio_length'}\n",
|
| 397 |
+
" },\n",
|
| 398 |
+
" # optimize_for_mobile=False, # not a valid torch.onnx.export kwarg (belongs to torch.utils.mobile_optimizer); removed to avoid TypeError\n",
|
| 399 |
+
" training=torch.onnx.TrainingMode.EVAL\n",
|
| 400 |
+
")"
|
| 401 |
+
],
|
| 402 |
+
"metadata": {
|
| 403 |
+
"colab": {
|
| 404 |
+
"base_uri": "https://localhost:8080/"
|
| 405 |
+
},
|
| 406 |
+
"id": "UMVexvWMSqnJ",
|
| 407 |
+
"outputId": "0572f245-08d4-42ab-cfd1-fe46c98037ae"
|
| 408 |
+
},
|
| 409 |
+
"execution_count": 18,
|
| 410 |
+
"outputs": [
|
| 411 |
+
{
|
| 412 |
+
"output_type": "stream",
|
| 413 |
+
"name": "stderr",
|
| 414 |
+
"text": [
|
| 415 |
+
"[NeMo W 2025-08-15 12:26:59 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
|
| 416 |
+
" Train config : \n",
|
| 417 |
+
" dataset:\n",
|
| 418 |
+
" _target_: nemo.collections.tts.data.dataset.VocoderDataset\n",
|
| 419 |
+
" manifest_filepath: ./mels/train_mel.jsonl\n",
|
| 420 |
+
" sample_rate: 22050\n",
|
| 421 |
+
" n_segments: 8192\n",
|
| 422 |
+
" max_duration: null\n",
|
| 423 |
+
" min_duration: 0.75\n",
|
| 424 |
+
" load_precomputed_mel: true\n",
|
| 425 |
+
" hop_length: 256\n",
|
| 426 |
+
" dataloader_params:\n",
|
| 427 |
+
" drop_last: false\n",
|
| 428 |
+
" shuffle: true\n",
|
| 429 |
+
" batch_size: 32\n",
|
| 430 |
+
" num_workers: 4\n",
|
| 431 |
+
" pin_memory: true\n",
|
| 432 |
+
" \n",
|
| 433 |
+
"[NeMo W 2025-08-15 12:26:59 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
|
| 434 |
+
" Validation config : \n",
|
| 435 |
+
" dataset:\n",
|
| 436 |
+
" _target_: nemo.collections.tts.data.dataset.VocoderDataset\n",
|
| 437 |
+
" manifest_filepath: ./mels/test_mel.jsonl\n",
|
| 438 |
+
" sample_rate: 22050\n",
|
| 439 |
+
" n_segments: 1024\n",
|
| 440 |
+
" max_duration: null\n",
|
| 441 |
+
" min_duration: 3\n",
|
| 442 |
+
" load_precomputed_mel: true\n",
|
| 443 |
+
" hop_length: 256\n",
|
| 444 |
+
" dataloader_params:\n",
|
| 445 |
+
" drop_last: false\n",
|
| 446 |
+
" shuffle: false\n",
|
| 447 |
+
" batch_size: 16\n",
|
| 448 |
+
" num_workers: 4\n",
|
| 449 |
+
" pin_memory: true\n",
|
| 450 |
+
" \n"
|
| 451 |
+
]
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"output_type": "stream",
|
| 455 |
+
"name": "stdout",
|
| 456 |
+
"text": [
|
| 457 |
+
"[NeMo I 2025-08-15 12:26:59 nemo_logging:393] PADDING: 0\n",
|
| 458 |
+
"[NeMo I 2025-08-15 12:26:59 nemo_logging:393] STFT using exact pad\n",
|
| 459 |
+
"[NeMo I 2025-08-15 12:26:59 nemo_logging:393] PADDING: 0\n",
|
| 460 |
+
"[NeMo I 2025-08-15 12:26:59 nemo_logging:393] STFT using exact pad\n",
|
| 461 |
+
"[NeMo I 2025-08-15 12:27:01 nemo_logging:393] Model HifiGanModel was successfully restored from /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo.\n"
|
| 462 |
+
]
|
| 463 |
+
}
|
| 464 |
+
]
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"cell_type": "markdown",
|
| 468 |
+
"source": [
|
| 469 |
+
"### Run ONNX Models on CPU"
|
| 470 |
+
],
|
| 471 |
+
"metadata": {
|
| 472 |
+
"id": "bS_mgW0HTZ07"
|
| 473 |
+
}
|
| 474 |
+
},
|
| 475 |
+
{
|
| 476 |
+
"cell_type": "code",
|
| 477 |
+
"source": [
|
| 478 |
+
"!pip install onnxruntime numpy librosa soundfile -q"
|
| 479 |
+
],
|
| 480 |
+
"metadata": {
|
| 481 |
+
"id": "XdiQ2-wnTayc",
|
| 482 |
+
"colab": {
|
| 483 |
+
"base_uri": "https://localhost:8080/"
|
| 484 |
+
},
|
| 485 |
+
"outputId": "6ab61a4a-ef9e-4fda-ad5b-b6e447ddc4f3"
|
| 486 |
+
},
|
| 487 |
+
"execution_count": 20,
|
| 488 |
+
"outputs": [
|
| 489 |
+
{
|
| 490 |
+
"output_type": "stream",
|
| 491 |
+
"name": "stdout",
|
| 492 |
+
"text": [
|
| 493 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.5/16.5 MB\u001b[0m \u001b[31m42.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 494 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 495 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 496 |
+
"\u001b[?25h"
|
| 497 |
+
]
|
| 498 |
+
}
|
| 499 |
+
]
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"cell_type": "code",
|
| 503 |
+
"source": [
|
| 504 |
+
"import onnxruntime as ort\n",
|
| 505 |
+
"import numpy as np\n",
|
| 506 |
+
"import torch\n",
|
| 507 |
+
"import librosa\n",
|
| 508 |
+
"import soundfile as sf\n",
|
| 509 |
+
"from typing import List, Optional\n",
|
| 510 |
+
"\n",
|
| 511 |
+
"# Import the same tokenizer used during training\n",
|
| 512 |
+
"from nemo.collections.tts.g2p.models.fa_ir_persian.g2p import PersianG2p\n",
|
| 513 |
+
"from nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer import PersianPhonemesTokenizer\n",
|
| 514 |
+
"\n",
|
| 515 |
+
"class PersianTTSInferencePipeline:\n",
|
| 516 |
+
" def __init__(self, fastpitch_path: str, hifigan_path: str,\n",
|
| 517 |
+
" persian_dict_path: str = \"./persian-v6.0.dict\"):\n",
|
| 518 |
+
" \"\"\"\n",
|
| 519 |
+
" Initialize Persian TTS inference pipeline with ONNX models\n",
|
| 520 |
+
"\n",
|
| 521 |
+
" Args:\n",
|
| 522 |
+
" fastpitch_path: Path to FastPitch ONNX model\n",
|
| 523 |
+
" hifigan_path: Path to HiFiGAN ONNX model\n",
|
| 524 |
+
" persian_dict_path: Path to Persian phoneme dictionary\n",
|
| 525 |
+
" \"\"\"\n",
|
| 526 |
+
" # Load ONNX models\n",
|
| 527 |
+
" self.fastpitch_session = ort.InferenceSession(\n",
|
| 528 |
+
" fastpitch_path,\n",
|
| 529 |
+
" providers=['CPUExecutionProvider']\n",
|
| 530 |
+
" )\n",
|
| 531 |
+
" self.hifigan_session = ort.InferenceSession(\n",
|
| 532 |
+
" hifigan_path,\n",
|
| 533 |
+
" providers=['CPUExecutionProvider']\n",
|
| 534 |
+
" )\n",
|
| 535 |
+
"\n",
|
| 536 |
+
" # Initialize Persian tokenizer (SAME as training)\n",
|
| 537 |
+
" print(\"Initializing Persian tokenizer...\")\n",
|
| 538 |
+
" self.g2p = PersianG2p(phoneme_dict=persian_dict_path)\n",
|
| 539 |
+
" self.text_tokenizer = PersianPhonemesTokenizer(\n",
|
| 540 |
+
" g2p=self.g2p, # use the instance attribute created above, not an external global\n",
|
| 541 |
+
" use_emotion_tokens=True,\n",
|
| 542 |
+
" use_pause_tokens=True,\n",
|
| 543 |
+
" use_speed_tokens=True\n",
|
| 544 |
+
" )\n",
|
| 545 |
+
"\n",
|
| 546 |
+
" # Get input/output names\n",
|
| 547 |
+
" self.fp_input_names = [inp.name for inp in self.fastpitch_session.get_inputs()]\n",
|
| 548 |
+
" self.fp_output_names = [out.name for out in self.fastpitch_session.get_outputs()]\n",
|
| 549 |
+
" self.hg_input_names = [inp.name for inp in self.hifigan_session.get_inputs()]\n",
|
| 550 |
+
" self.hg_output_names = [out.name for out in self.hifigan_session.get_outputs()]\n",
|
| 551 |
+
"\n",
|
| 552 |
+
" print(f\"FastPitch inputs: {self.fp_input_names}\")\n",
|
| 553 |
+
" print(f\"FastPitch outputs: {self.fp_output_names}\")\n",
|
| 554 |
+
" print(f\"HiFiGAN inputs: {self.hg_input_names}\")\n",
|
| 555 |
+
" print(f\"HiFiGAN outputs: {self.hg_output_names}\")\n",
|
| 556 |
+
"\n",
|
| 557 |
+
" # Test tokenizer\n",
|
| 558 |
+
" test_text = 'مدل تبدیل متن به گفتار پارسی'\n",
|
| 559 |
+
" test_ids = self.text_tokenizer.encode(test_text)\n",
|
| 560 |
+
" print(f\"Test tokenization: '{test_text}' -> {test_ids[:10]}...\")\n",
|
| 561 |
+
"\n",
|
| 562 |
+
" def text_to_tokens(self, text: str) -> tuple:\n",
|
| 563 |
+
" \"\"\"\n",
|
| 564 |
+
" Convert Persian text to phoneme tokens using the same tokenizer as training\n",
|
| 565 |
+
"\n",
|
| 566 |
+
" Args:\n",
|
| 567 |
+
" text: Input Persian text string\n",
|
| 568 |
+
"\n",
|
| 569 |
+
" Returns:\n",
|
| 570 |
+
" tokens: numpy array of phoneme token indices\n",
|
| 571 |
+
" token_lengths: numpy array of sequence length\n",
|
| 572 |
+
" \"\"\"\n",
|
| 573 |
+
" # Use the exact same tokenizer as training\n",
|
| 574 |
+
" token_ids = self.text_tokenizer.encode(text)\n",
|
| 575 |
+
"\n",
|
| 576 |
+
" # Convert to numpy arrays with batch dimension\n",
|
| 577 |
+
" tokens = np.array([token_ids], dtype=np.int64) # Shape: (1, seq_len)\n",
|
| 578 |
+
" token_lengths = np.array([len(token_ids)], dtype=np.int64) # Shape: (1,)\n",
|
| 579 |
+
"\n",
|
| 580 |
+
" print(f\"Text: '{text}'\")\n",
|
| 581 |
+
" print(f\"Tokens length: {len(token_ids)}\")\n",
|
| 582 |
+
" print(f\"First 20 tokens: {token_ids[:20]}\")\n",
|
| 583 |
+
" print(f\"Token range: [{min(token_ids)}, {max(token_ids)}]\")\n",
|
| 584 |
+
"\n",
|
| 585 |
+
" return tokens, token_lengths\n",
|
| 586 |
+
"\n",
|
| 587 |
+
" def generate_mel_spectrogram(self, text: str) -> np.ndarray:\n",
|
| 588 |
+
" \"\"\"\n",
|
| 589 |
+
" Generate mel spectrogram from Persian text using FastPitch\n",
|
| 590 |
+
"\n",
|
| 591 |
+
" Args:\n",
|
| 592 |
+
" text: Input Persian text string\n",
|
| 593 |
+
"\n",
|
| 594 |
+
" Returns:\n",
|
| 595 |
+
" mel_spec: Generated mel spectrogram\n",
|
| 596 |
+
" \"\"\"\n",
|
| 597 |
+
" # Convert text to phoneme tokens\n",
|
| 598 |
+
" tokens, token_lengths = self.text_to_tokens(text)\n",
|
| 599 |
+
"\n",
|
| 600 |
+
" # Prepare inputs based on your model's input names\n",
|
| 601 |
+
" if len(self.fp_input_names) == 1:\n",
|
| 602 |
+
" # If using generate_spectrogram wrapper (Method 2)\n",
|
| 603 |
+
" inputs = {self.fp_input_names[0]: tokens}\n",
|
| 604 |
+
" else:\n",
|
| 605 |
+
" # If using forward wrapper (Method 1)\n",
|
| 606 |
+
" inputs = {\n",
|
| 607 |
+
" self.fp_input_names[0]: tokens, # text or tokens\n",
|
| 608 |
+
" self.fp_input_names[1]: token_lengths # input_lens\n",
|
| 609 |
+
" }\n",
|
| 610 |
+
"\n",
|
| 611 |
+
" print(f\"FastPitch inputs: {list(inputs.keys())}\")\n",
|
| 612 |
+
" for key, val in inputs.items():\n",
|
| 613 |
+
" print(f\" {key}: shape {val.shape}, dtype {val.dtype}\")\n",
|
| 614 |
+
"\n",
|
| 615 |
+
" # Run FastPitch inference\n",
|
| 616 |
+
" mel_outputs = self.fastpitch_session.run(self.fp_output_names, inputs)\n",
|
| 617 |
+
" mel_spec = mel_outputs[0] # First output should be mel spectrogram\n",
|
| 618 |
+
"\n",
|
| 619 |
+
" print(f\"Generated mel spectrogram shape: {mel_spec.shape}\")\n",
|
| 620 |
+
" print(f\"Mel range: [{mel_spec.min():.4f}, {mel_spec.max():.4f}]\")\n",
|
| 621 |
+
"\n",
|
| 622 |
+
" return mel_spec\n",
|
| 623 |
+
"\n",
|
| 624 |
+
" def generate_audio(self, mel_spec: np.ndarray, sample_rate: int = 22050) -> np.ndarray:\n",
|
| 625 |
+
" \"\"\"\n",
|
| 626 |
+
" Generate audio from mel spectrogram using HiFiGAN\n",
|
| 627 |
+
"\n",
|
| 628 |
+
" Args:\n",
|
| 629 |
+
" mel_spec: Input mel spectrogram\n",
|
| 630 |
+
" sample_rate: Audio sample rate\n",
|
| 631 |
+
"\n",
|
| 632 |
+
" Returns:\n",
|
| 633 |
+
" audio: Generated audio waveform\n",
|
| 634 |
+
" \"\"\"\n",
|
| 635 |
+
" # Prepare inputs for HiFiGAN\n",
|
| 636 |
+
" inputs = {self.hg_input_names[0]: mel_spec}\n",
|
| 637 |
+
"\n",
|
| 638 |
+
" print(f\"HiFiGAN input shape: {mel_spec.shape}\")\n",
|
| 639 |
+
"\n",
|
| 640 |
+
" # Run HiFiGAN inference\n",
|
| 641 |
+
" audio_outputs = self.hifigan_session.run(self.hg_output_names, inputs)\n",
|
| 642 |
+
" audio = audio_outputs[0] # First output should be audio\n",
|
| 643 |
+
"\n",
|
| 644 |
+
" # Remove batch dimension and ensure proper shape\n",
|
| 645 |
+
" if audio.ndim > 1:\n",
|
| 646 |
+
" audio = audio.squeeze()\n",
|
| 647 |
+
"\n",
|
| 648 |
+
" print(f\"Generated audio shape: {audio.shape}\")\n",
|
| 649 |
+
" print(f\"Audio range: [{audio.min():.4f}, {audio.max():.4f}]\")\n",
|
| 650 |
+
" print(f\"Audio RMS: {np.sqrt(np.mean(audio**2)):.4f}\")\n",
|
| 651 |
+
"\n",
|
| 652 |
+
" return audio\n",
|
| 653 |
+
"\n",
|
| 654 |
+
" def text_to_speech(self, text: str, output_path: Optional[str] = None,\n",
|
| 655 |
+
" sample_rate: int = 22050) -> np.ndarray:\n",
|
| 656 |
+
" \"\"\"\n",
|
| 657 |
+
" Complete Persian text-to-speech pipeline\n",
|
| 658 |
+
"\n",
|
| 659 |
+
" Args:\n",
|
| 660 |
+
" text: Input Persian text string\n",
|
| 661 |
+
" output_path: Optional path to save audio file\n",
|
| 662 |
+
" sample_rate: Audio sample rate\n",
|
| 663 |
+
"\n",
|
| 664 |
+
" Returns:\n",
|
| 665 |
+
" audio: Generated audio waveform\n",
|
| 666 |
+
" \"\"\"\n",
|
| 667 |
+
" print(f\"🎙️ Generating Persian speech for: '{text}'\")\n",
|
| 668 |
+
" print(\"=\" * 60)\n",
|
| 669 |
+
"\n",
|
| 670 |
+
" # Step 1: Generate mel spectrogram\n",
|
| 671 |
+
" print(\"📊 Generating mel spectrogram...\")\n",
|
| 672 |
+
" mel_spec = self.generate_mel_spectrogram(text)\n",
|
| 673 |
+
"\n",
|
| 674 |
+
" # Step 2: Generate audio from mel spectrogram\n",
|
| 675 |
+
" print(\"🔊 Generating audio...\")\n",
|
| 676 |
+
" audio = self.generate_audio(mel_spec, sample_rate)\n",
|
| 677 |
+
"\n",
|
| 678 |
+
" # Step 3: Save audio if path provided\n",
|
| 679 |
+
" if output_path:\n",
|
| 680 |
+
" sf.write(output_path, audio, sample_rate)\n",
|
| 681 |
+
" print(f\"💾 Audio saved to: {output_path}\")\n",
|
| 682 |
+
"\n",
|
| 683 |
+
" print(\"✅ Persian TTS generation completed!\")\n",
|
| 684 |
+
" return audio\n",
|
| 685 |
+
"\n",
|
| 686 |
+
" def test_tokenizer_consistency(self):\n",
|
| 687 |
+
" \"\"\"Test that tokenizer works consistently\"\"\"\n",
|
| 688 |
+
" test_texts = [\n",
|
| 689 |
+
" 'سلام دنیا',\n",
|
| 690 |
+
" 'مدل تبدیل متن به گفتار پارسی',\n",
|
| 691 |
+
" 'این یک تست است',\n",
|
| 692 |
+
" 'پردازش زبان طبیعی'\n",
|
| 693 |
+
" ]\n",
|
| 694 |
+
"\n",
|
| 695 |
+
" print(\"🧪 Testing tokenizer consistency:\")\n",
|
| 696 |
+
" for text in test_texts:\n",
|
| 697 |
+
" tokens = self.text_tokenizer.encode(text)\n",
|
| 698 |
+
" decoded = self.text_tokenizer.decode(tokens)\n",
|
| 699 |
+
" print(f\" '{text}' -> {len(tokens)} tokens -> '{decoded}'\")\n",
|
| 700 |
+
"\n",
|
| 701 |
+
" def compare_with_training_tokenizer(self, text: str):\n",
|
| 702 |
+
" \"\"\"Compare tokenizer output with training setup\"\"\"\n",
|
| 703 |
+
" print(f\"🔍 Tokenizer comparison for: '{text}'\")\n",
|
| 704 |
+
"\n",
|
| 705 |
+
" # Your training tokenizer\n",
|
| 706 |
+
" tokens = self.text_tokenizer.encode(text)\n",
|
| 707 |
+
"\n",
|
| 708 |
+
" # Print detailed tokenization info\n",
|
| 709 |
+
" print(f\"Phoneme tokens: {tokens}\")\n",
|
| 710 |
+
" print(f\"Token count: {len(tokens)}\")\n",
|
| 711 |
+
" print(f\"Vocabulary size range: [0, {max(tokens)}]\")\n",
|
| 712 |
+
"\n",
|
| 713 |
+
" # Try to decode back\n",
|
| 714 |
+
" try:\n",
|
| 715 |
+
" decoded = self.text_tokenizer.decode(tokens)\n",
|
| 716 |
+
" print(f\"Decoded back: '{decoded}'\")\n",
|
| 717 |
+
" except Exception: # avoid bare except: don't swallow KeyboardInterrupt/SystemExit\n",
|
| 718 |
+
" print(\"Could not decode tokens back to text\")\n",
|
| 719 |
+
"\n",
|
| 720 |
+
" return tokens\n",
|
| 721 |
+
"\n",
|
| 722 |
+
"# Example usage for Persian TTS\n",
|
| 723 |
+
"def main():\n",
|
| 724 |
+
" # Initialize the Persian TTS pipeline\n",
|
| 725 |
+
" persian_tts = PersianTTSInferencePipeline(\n",
|
| 726 |
+
" fastpitch_path=\"fastpitch.onnx\", # Your exported ONNX model\n",
|
| 727 |
+
" hifigan_path=\"hifigan.onnx\", # Your exported ONNX model\n",
|
| 728 |
+
" persian_dict_path=\"/content/drive/MyDrive/cNotebooks/perTTS/tts-nemo-fastpitch-hifigan-convert-to-onnx/persian-dict/persian-v4.0.dict\"\n",
|
| 729 |
+
" )\n",
|
| 730 |
+
"\n",
|
| 731 |
+
" # Test tokenizer first\n",
|
| 732 |
+
" persian_tts.test_tokenizer_consistency()\n",
|
| 733 |
+
"\n",
|
| 734 |
+
" # Generate speech for Persian text\n",
|
| 735 |
+
" persian_texts = [\n",
|
| 736 |
+
" 'سلام دنیا',\n",
|
| 737 |
+
" 'مدل تبدیل متن به گفتار پارسی',\n",
|
| 738 |
+
" 'این یک تست از سیستم تولید گفتار است',\n",
|
| 739 |
+
" 'پردازش زبان طبیعی فارسی'\n",
|
| 740 |
+
" ]\n",
|
| 741 |
+
"\n",
|
| 742 |
+
" for i, text in enumerate(persian_texts):\n",
|
| 743 |
+
" print(f\"\\n{'='*80}\")\n",
|
| 744 |
+
" try:\n",
|
| 745 |
+
" audio = persian_tts.text_to_speech(\n",
|
| 746 |
+
" text=text,\n",
|
| 747 |
+
" output_path=f\"persian_output_{i+1}.wav\",\n",
|
| 748 |
+
" sample_rate=22050\n",
|
| 749 |
+
" )\n",
|
| 750 |
+
" print(f\"✅ Successfully generated audio for text {i+1}\")\n",
|
| 751 |
+
"\n",
|
| 752 |
+
" except Exception as e:\n",
|
| 753 |
+
" print(f\"❌ Failed to generate audio for text {i+1}: {e}\")\n",
|
| 754 |
+
" # Debug the tokenization for this text\n",
|
| 755 |
+
" persian_tts.compare_with_training_tokenizer(text)\n",
|
| 756 |
+
"\n",
|
| 757 |
+
" return persian_tts\n",
|
| 758 |
+
"\n",
|
| 759 |
+
"if __name__ == \"__main__\":\n",
|
| 760 |
+
" tts_pipeline = main()"
|
| 761 |
+
],
|
| 762 |
+
"metadata": {
|
| 763 |
+
"colab": {
|
| 764 |
+
"base_uri": "https://localhost:8080/"
|
| 765 |
+
},
|
| 766 |
+
"id": "ZlNsGG8hTcRx",
|
| 767 |
+
"outputId": "8bce8a81-78ac-4c6b-e376-bcbc55b7a064"
|
| 768 |
+
},
|
| 769 |
+
"execution_count": 23,
|
| 770 |
+
"outputs": [
|
| 771 |
+
{
|
| 772 |
+
"output_type": "stream",
|
| 773 |
+
"name": "stdout",
|
| 774 |
+
"text": [
|
| 775 |
+
"Initializing Persian tokenizer...\n",
|
| 776 |
+
"FastPitch inputs: ['text']\n",
|
| 777 |
+
"FastPitch outputs: ['mel_spec', 'seq_lens', 'durs_predicted', 'log_durs_predicted', 'res']\n",
|
| 778 |
+
"HiFiGAN inputs: ['mel_spec']\n",
|
| 779 |
+
"HiFiGAN outputs: ['audio']\n",
|
| 780 |
+
"Test tokenization: 'مدل تبدیل متن به گفتار پارسی' -> [0, 26, 55, 9, 54, 25, 0, 3, 53, 1]...\n",
|
| 781 |
+
"🧪 Testing tokenizer consistency:\n",
|
| 782 |
+
" 'سلام دنیا' -> 13 tokens -> ' |s|a|l|Λ|m| |d|o|n|y|Λ| '\n",
|
| 783 |
+
" 'مدل تبدیل متن به گفتار پارسی' -> 35 tokens -> ' |m|o|d|e|l| |t|a|b|d|i|l| |m|a|t|n| |b|E| |g|o|f|t|Λ|r| |p|Λ|r|s|i| '\n",
|
| 784 |
+
" 'این یک تست است' -> 17 tokens -> ' |I|n| |y|e|k| |t|e|s|t| |ą|s|t| '\n",
|
| 785 |
+
" 'پردازش زبان طبیعی' -> 23 tokens -> ' |p|a|r|d|Λ|z|e|S| |z|a|b|Λ|n| |T|a|b|i|ʔ|i| '\n",
|
| 786 |
+
"\n",
|
| 787 |
+
"================================================================================\n",
|
| 788 |
+
"🎙️ Generating Persian speech for: 'سلام دنیا'\n",
|
| 789 |
+
"============================================================\n",
|
| 790 |
+
"📊 Generating mel spectrogram...\n",
|
| 791 |
+
"Text: 'سلام دنیا'\n",
|
| 792 |
+
"Tokens length: 13\n",
|
| 793 |
+
"First 20 tokens: [0, 14, 53, 25, 43, 26, 0, 9, 55, 27, 32, 43, 0]\n",
|
| 794 |
+
"Token range: [0, 55]\n",
|
| 795 |
+
"FastPitch inputs: ['text']\n",
|
| 796 |
+
" text: shape (1, 13), dtype int64\n",
|
| 797 |
+
"Generated mel spectrogram shape: (1, 80, 106)\n",
|
| 798 |
+
"Mel range: [-11.0657, -1.1308]\n",
|
| 799 |
+
"🔊 Generating audio...\n",
|
| 800 |
+
"HiFiGAN input shape: (1, 80, 106)\n",
|
| 801 |
+
"Generated audio shape: (27136,)\n",
|
| 802 |
+
"Audio range: [-0.1950, 0.1185]\n",
|
| 803 |
+
"Audio RMS: 0.0259\n",
|
| 804 |
+
"💾 Audio saved to: persian_output_1.wav\n",
|
| 805 |
+
"✅ Persian TTS generation completed!\n",
|
| 806 |
+
"✅ Successfully generated audio for text 1\n",
|
| 807 |
+
"\n",
|
| 808 |
+
"================================================================================\n",
|
| 809 |
+
"🎙️ Generating Persian speech for: 'مدل تبدیل متن به گفتار پارسی'\n",
|
| 810 |
+
"============================================================\n",
|
| 811 |
+
"📊 Generating mel spectrogram...\n",
|
| 812 |
+
"Text: 'مدل تبدیل متن به گفتار پارسی'\n",
|
| 813 |
+
"Tokens length: 35\n",
|
| 814 |
+
"First 20 tokens: [0, 26, 55, 9, 54, 25, 0, 3, 53, 1, 9, 50, 25, 0, 26, 53, 3, 27, 0, 1]\n",
|
| 815 |
+
"Token range: [0, 55]\n",
|
| 816 |
+
"FastPitch inputs: ['text']\n",
|
| 817 |
+
" text: shape (1, 35), dtype int64\n",
|
| 818 |
+
"Generated mel spectrogram shape: (1, 80, 240)\n",
|
| 819 |
+
"Mel range: [-10.2846, 0.0889]\n",
|
| 820 |
+
"🔊 Generating audio...\n",
|
| 821 |
+
"HiFiGAN input shape: (1, 80, 240)\n",
|
| 822 |
+
"Generated audio shape: (61440,)\n",
|
| 823 |
+
"Audio range: [-0.4547, 0.4433]\n",
|
| 824 |
+
"Audio RMS: 0.0730\n",
|
| 825 |
+
"💾 Audio saved to: persian_output_2.wav\n",
|
| 826 |
+
"✅ Persian TTS generation completed!\n",
|
| 827 |
+
"✅ Successfully generated audio for text 2\n",
|
| 828 |
+
"\n",
|
| 829 |
+
"================================================================================\n",
|
| 830 |
+
"🎙️ Generating Persian speech for: 'این یک تست از سیستم تولید گفتار است'\n",
|
| 831 |
+
"============================================================\n",
|
| 832 |
+
"📊 Generating mel spectrogram...\n",
|
| 833 |
+
"Text: 'این یک تست از سیستم تولید گفتار است'\n",
|
| 834 |
+
"Tokens length: 40\n",
|
| 835 |
+
"First 20 tokens: [0, 51, 27, 0, 32, 54, 24, 0, 3, 54, 14, 3, 0, 44, 12, 0, 14, 50, 14, 3]\n",
|
| 836 |
+
"Token range: [0, 55]\n",
|
| 837 |
+
"FastPitch inputs: ['text']\n",
|
| 838 |
+
" text: shape (1, 40), dtype int64\n",
|
| 839 |
+
"Generated mel spectrogram shape: (1, 80, 275)\n",
|
| 840 |
+
"Mel range: [-10.2355, 0.9884]\n",
|
| 841 |
+
"🔊 Generating audio...\n",
|
| 842 |
+
"HiFiGAN input shape: (1, 80, 275)\n",
|
| 843 |
+
"Generated audio shape: (70400,)\n",
|
| 844 |
+
"Audio range: [-0.6646, 0.4960]\n",
|
| 845 |
+
"Audio RMS: 0.1006\n",
|
| 846 |
+
"💾 Audio saved to: persian_output_3.wav\n",
|
| 847 |
+
"✅ Persian TTS generation completed!\n",
|
| 848 |
+
"✅ Successfully generated audio for text 3\n",
|
| 849 |
+
"\n",
|
| 850 |
+
"================================================================================\n",
|
| 851 |
+
"🎙️ Generating Persian speech for: 'پردازش زبان طبیعی فارسی'\n",
|
| 852 |
+
"============================================================\n",
|
| 853 |
+
"📊 Generating mel spectrogram...\n",
|
| 854 |
+
"Text: 'پردازش زبان طبیعی فارسی'\n",
|
| 855 |
+
"Tokens length: 29\n",
|
| 856 |
+
"First 20 tokens: [0, 2, 53, 11, 9, 43, 12, 54, 15, 0, 12, 53, 1, 43, 27, 0, 18, 53, 1, 50]\n",
|
| 857 |
+
"Token range: [0, 54]\n",
|
| 858 |
+
"FastPitch inputs: ['text']\n",
|
| 859 |
+
" text: shape (1, 29), dtype int64\n",
|
| 860 |
+
"Generated mel spectrogram shape: (1, 80, 214)\n",
|
| 861 |
+
"Mel range: [-11.0496, -0.7565]\n",
|
| 862 |
+
"🔊 Generating audio...\n",
|
| 863 |
+
"HiFiGAN input shape: (1, 80, 214)\n",
|
| 864 |
+
"Generated audio shape: (54784,)\n",
|
| 865 |
+
"Audio range: [-0.2387, 0.2220]\n",
|
| 866 |
+
"Audio RMS: 0.0293\n",
|
| 867 |
+
"💾 Audio saved to: persian_output_4.wav\n",
|
| 868 |
+
"✅ Persian TTS generation completed!\n",
|
| 869 |
+
"✅ Successfully generated audio for text 4\n"
|
| 870 |
+
]
|
| 871 |
+
}
|
| 872 |
+
]
|
| 873 |
+
},
|
| 874 |
+
{
|
| 875 |
+
"cell_type": "code",
|
| 876 |
+
"source": [],
|
| 877 |
+
"metadata": {
|
| 878 |
+
"id": "rLrOthW4VUqZ"
|
| 879 |
+
},
|
| 880 |
+
"execution_count": null,
|
| 881 |
+
"outputs": []
|
| 882 |
+
}
|
| 883 |
+
]
|
| 884 |
+
}
|