tts_fa_fastpitch_hifigan-v2.0
Browse files- .gitattributes +3 -0
- tts_fa_fastpitch_hifigan-v2.0/.gitattributes +38 -0
- tts_fa_fastpitch_hifigan-v2.0/README.md +65 -0
- tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/.ipynb_checkpoints/fastpitch_align_22050_grapheme-checkpoint.yaml +248 -0
- tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/ds_for_fastpitch_align.yaml +47 -0
- tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/fastpitch_align_22050_grapheme.yaml +248 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/.ipynb_checkpoints/hifigan-checkpoint.yaml +99 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/hifigan.yaml +99 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/.ipynb_checkpoints/v1-checkpoint.yaml +7 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/v1.yaml +7 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/.ipynb_checkpoints/train_ds_finetune-checkpoint.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/train_ds_finetune.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds_finetune.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/v1.yaml +7 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/val_ds_finetune.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/.ipynb_checkpoints/val_ds_finetune-checkpoint.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/val_ds_finetune.yaml +15 -0
- tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.ckpt +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0688-epoch-12-last.ckpt +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/fastpitch.onnx +3 -0
- tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/hifigan.onnx +3 -0
- tts_fa_fastpitch_hifigan-v2.0/persian-dict/persian-v5.0.dict +0 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/generate_mels-checkpoint.py +181 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/hifigan_finetune-checkpoint.py +32 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/extract_sup_data.py +83 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/fastpitch.py +35 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/generate_mels.py +181 -0
- tts_fa_fastpitch_hifigan-v2.0/scripts/hifigan_finetune.py +32 -0
- tts_fa_fastpitch_hifigan-v2.0/source.txt +1 -0
- tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-from-scratch-finetuning-hifigan.ipynb +0 -0
- tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-inference-only.ipynb +0 -0
- tts_fa_fastpitch_hifigan-v2.0/tts_nemo_fastpitch_hifigan_convert_to_onnx.ipynb +884 -0
.gitattributes
CHANGED
|
@@ -48,3 +48,6 @@ hifigan_for_sherpa/pretrained/UNIVERSAL_V1/g_02500000 filter=lfs diff=lfs merge=
|
|
| 48 |
hifigan_for_sherpa/pretrained/VCTK_V1/generator_v1 filter=lfs diff=lfs merge=lfs -text
|
| 49 |
hifigan_for_sherpa/pretrained/VCTK_V2/generator_v2 filter=lfs diff=lfs merge=lfs -text
|
| 50 |
hifigan_for_sherpa/pretrained/VCTK_V3/generator_v3 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
hifigan_for_sherpa/pretrained/VCTK_V1/generator_v1 filter=lfs diff=lfs merge=lfs -text
|
| 49 |
hifigan_for_sherpa/pretrained/VCTK_V2/generator_v2 filter=lfs diff=lfs merge=lfs -text
|
| 50 |
hifigan_for_sherpa/pretrained/VCTK_V3/generator_v3 filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo filter=lfs diff=lfs merge=lfs -text
|
tts_fa_fastpitch_hifigan-v2.0/.gitattributes
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
models/FastPitch--val_loss-0.7236-epoch-50.nemo filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
models/HifiGan--val_loss-0.6090-epoch-39-last.nemo filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
models/HifiGan--val_loss-2.0733-epoch-12-last.nemo filter=lfs diff=lfs merge=lfs -text
|
tts_fa_fastpitch_hifigan-v2.0/README.md
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
---
|
| 4 |
+
## FastPitch and HifiGan v2.0
|
| 5 |
+
|
| 6 |
+
v2.0 of the phonemizer and tokenizer. The tokenizer `DOES SUPPORT` pauses, emotion tokens, etc.
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
### Install NeMo
|
| 10 |
+
```bash
|
| 11 |
+
apt-get update && apt-get install -y libsndfile1 ffmpeg
|
| 12 |
+
pip install Cython packaging
|
| 13 |
+
rm -rf /usr/lib/python3.10/site-packages/blinker*
|
| 14 |
+
rm -rf /usr/local/lib/python3.10/dist-packages/blinker*
|
| 15 |
+
pip install --ignore-installed blinker
|
| 16 |
+
pip install --upgrade --force-reinstall blinker
|
| 17 |
+
|
| 18 |
+
git clone https://github.com/SadeghKrmi/NeMo.git
|
| 19 |
+
cd NeMo
|
| 20 |
+
pip install -e '.[all]'
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
### deterministic split
|
| 25 |
+
Run the deterministic-train-test-split.py to split the train/test
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
### Extract the supportive data
|
| 29 |
+
using the following scripts, extract pitch statistics
|
| 30 |
+
```bash
|
| 31 |
+
tar -xzf dataset_splits.tar.gz
|
| 32 |
+
|
| 33 |
+
cd extract-supportive-data
|
| 34 |
+
HYDRA_FULL_ERROR=1 python3 ./scripts/extract_sup_data.py \
|
| 35 |
+
--config-path ../config/fastpitch/ \
|
| 36 |
+
--config-name ds_for_fastpitch_align.yaml \
|
| 37 |
+
manifest_filepath=./dataset_splits/train/train.jsonl \
|
| 38 |
+
sup_data_path=sup_data \
|
| 39 |
+
phoneme_dict_path=./persian-dict/persian-v4.0.dict \
|
| 40 |
+
++dataloader_params.num_workers=8
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
#### dataset sup pitch stats
|
| 44 |
+
PITCH_MEAN=98.72935485839844, PITCH_STD=29.40760040283203
|
| 45 |
+
PITCH_MIN=65.4063949584961, PITCH_MAX=2093.004638671875
|
| 46 |
+
|
| 47 |
+
### zip and download
|
| 48 |
+
```bash
|
| 49 |
+
tar -czf sup_data.tar.gz sup_data
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
### Training FastPitch
|
| 54 |
+
training for about 800 epochs, with CosineAnnealing sched. and `max_steps` 200,000 for lr to decay over time.
|
| 55 |
+
|
| 56 |
+
val_loss didn't decrease below about 0.77xx
|
| 57 |
+
|
| 58 |
+
`val_loss = mel_loss + dur_loss + pitch_loss + energy_loss`
|
| 59 |
+
|
| 60 |
+
### Training HiFiGAN
|
| 61 |
+
training for about 40 epochs; stopped the training based on quality checks done by listening to generated audio samples
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/.ipynb_checkpoints/fastpitch_align_22050_grapheme-checkpoint.yaml
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config contains the default values for training FastPitch model with aligner using 22KHz sampling
|
| 2 |
+
# rate. If you want to train model on other dataset, you can change config values according to your dataset.
|
| 3 |
+
# Most dataset-specific arguments are in the head of the config file, see below.
|
| 4 |
+
|
| 5 |
+
name: FastPitch
|
| 6 |
+
|
| 7 |
+
train_dataset: ???
|
| 8 |
+
validation_datasets: ???
|
| 9 |
+
sup_data_path: ???
|
| 10 |
+
sup_data_types: [ "align_prior_matrix", "pitch" ]
|
| 11 |
+
|
| 12 |
+
phoneme_dict_path: ???
|
| 13 |
+
|
| 14 |
+
# Default values from librosa.pyin
|
| 15 |
+
pitch_fmin: 65.4063949584961
|
| 16 |
+
pitch_fmax: 2093.004638671875
|
| 17 |
+
|
| 18 |
+
# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
|
| 19 |
+
# by running `scripts/dataset_processing/tts/extract_sup_data.py`
|
| 20 |
+
pitch_mean: 103.01591491699219 # e.g. 132.524658203125 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
|
| 21 |
+
pitch_std: 30.397296905517578 # e.g. 37.389366149902 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
|
| 22 |
+
|
| 23 |
+
# Default values for dataset with sample_rate=22050
|
| 24 |
+
sample_rate: 22050
|
| 25 |
+
n_mel_channels: 80
|
| 26 |
+
n_window_size: 1024
|
| 27 |
+
n_window_stride: 256
|
| 28 |
+
n_fft: 1024
|
| 29 |
+
lowfreq: 0
|
| 30 |
+
highfreq: null
|
| 31 |
+
window: hann
|
| 32 |
+
|
| 33 |
+
model:
|
| 34 |
+
learn_alignment: true
|
| 35 |
+
bin_loss_warmup_epochs: 100
|
| 36 |
+
|
| 37 |
+
n_speakers: 1
|
| 38 |
+
max_token_duration: 75
|
| 39 |
+
symbols_embedding_dim: 384
|
| 40 |
+
pitch_embedding_kernel_size: 3
|
| 41 |
+
|
| 42 |
+
pitch_fmin: ${pitch_fmin}
|
| 43 |
+
pitch_fmax: ${pitch_fmax}
|
| 44 |
+
|
| 45 |
+
pitch_mean: ${pitch_mean}
|
| 46 |
+
pitch_std: ${pitch_std}
|
| 47 |
+
|
| 48 |
+
sample_rate: ${sample_rate}
|
| 49 |
+
n_mel_channels: ${n_mel_channels}
|
| 50 |
+
n_window_size: ${n_window_size}
|
| 51 |
+
n_window_stride: ${n_window_stride}
|
| 52 |
+
n_fft: ${n_fft}
|
| 53 |
+
lowfreq: ${lowfreq}
|
| 54 |
+
highfreq: ${highfreq}
|
| 55 |
+
window: ${window}
|
| 56 |
+
|
| 57 |
+
# text_normalizer:
|
| 58 |
+
# _target_: nemo_text_processing.text_normalization.normalize.Normalizer
|
| 59 |
+
# lang: de
|
| 60 |
+
# input_case: cased
|
| 61 |
+
|
| 62 |
+
# text_normalizer_call_kwargs:
|
| 63 |
+
# verbose: false
|
| 64 |
+
# punct_pre_process: true
|
| 65 |
+
# punct_post_process: true
|
| 66 |
+
|
| 67 |
+
text_tokenizer:
|
| 68 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer.PersianPhonemesTokenizer
|
| 69 |
+
punct: true
|
| 70 |
+
use_emotion_tokens: true
|
| 71 |
+
use_pause_tokens: true
|
| 72 |
+
use_speed_tokens: true
|
| 73 |
+
g2p:
|
| 74 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.g2p.PersianG2p
|
| 75 |
+
phoneme_dict: ${phoneme_dict_path}
|
| 76 |
+
|
| 77 |
+
train_ds:
|
| 78 |
+
dataset:
|
| 79 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 80 |
+
manifest_filepath: ${train_dataset}
|
| 81 |
+
sample_rate: ${model.sample_rate}
|
| 82 |
+
sup_data_path: ${sup_data_path}
|
| 83 |
+
sup_data_types: ${sup_data_types}
|
| 84 |
+
n_fft: ${model.n_fft}
|
| 85 |
+
win_length: ${model.n_window_size}
|
| 86 |
+
hop_length: ${model.n_window_stride}
|
| 87 |
+
window: ${model.window}
|
| 88 |
+
n_mels: ${model.n_mel_channels}
|
| 89 |
+
lowfreq: ${model.lowfreq}
|
| 90 |
+
highfreq: ${model.highfreq}
|
| 91 |
+
max_duration: 25 # change to null to include longer audios.
|
| 92 |
+
min_duration: 0.1
|
| 93 |
+
ignore_file: null
|
| 94 |
+
trim: true
|
| 95 |
+
trim_top_db: 50
|
| 96 |
+
trim_frame_length: ${model.n_window_size}
|
| 97 |
+
trim_hop_length: ${model.n_window_stride}
|
| 98 |
+
pitch_fmin: ${model.pitch_fmin}
|
| 99 |
+
pitch_fmax: ${model.pitch_fmax}
|
| 100 |
+
pitch_norm: true
|
| 101 |
+
pitch_mean: ${model.pitch_mean}
|
| 102 |
+
pitch_std: ${model.pitch_std}
|
| 103 |
+
|
| 104 |
+
dataloader_params:
|
| 105 |
+
drop_last: false
|
| 106 |
+
shuffle: true
|
| 107 |
+
batch_size: 32
|
| 108 |
+
num_workers: 12
|
| 109 |
+
pin_memory: true
|
| 110 |
+
|
| 111 |
+
validation_ds:
|
| 112 |
+
dataset:
|
| 113 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 114 |
+
manifest_filepath: ${validation_datasets}
|
| 115 |
+
sample_rate: ${model.sample_rate}
|
| 116 |
+
sup_data_path: ${sup_data_path}
|
| 117 |
+
sup_data_types: ${sup_data_types}
|
| 118 |
+
n_fft: ${model.n_fft}
|
| 119 |
+
win_length: ${model.n_window_size}
|
| 120 |
+
hop_length: ${model.n_window_stride}
|
| 121 |
+
window: ${model.window}
|
| 122 |
+
n_mels: ${model.n_mel_channels}
|
| 123 |
+
lowfreq: ${model.lowfreq}
|
| 124 |
+
highfreq: ${model.highfreq}
|
| 125 |
+
max_duration: 25 # change to null to include longer audios.
|
| 126 |
+
min_duration: 0.1
|
| 127 |
+
ignore_file: null
|
| 128 |
+
trim: true
|
| 129 |
+
trim_top_db: 50
|
| 130 |
+
trim_frame_length: ${model.n_window_size}
|
| 131 |
+
trim_hop_length: ${model.n_window_stride}
|
| 132 |
+
pitch_fmin: ${model.pitch_fmin}
|
| 133 |
+
pitch_fmax: ${model.pitch_fmax}
|
| 134 |
+
pitch_norm: true
|
| 135 |
+
pitch_mean: ${model.pitch_mean}
|
| 136 |
+
pitch_std: ${model.pitch_std}
|
| 137 |
+
|
| 138 |
+
dataloader_params:
|
| 139 |
+
drop_last: false
|
| 140 |
+
shuffle: false
|
| 141 |
+
batch_size: 32
|
| 142 |
+
num_workers: 8
|
| 143 |
+
pin_memory: true
|
| 144 |
+
|
| 145 |
+
preprocessor:
|
| 146 |
+
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
|
| 147 |
+
features: ${model.n_mel_channels}
|
| 148 |
+
lowfreq: ${model.lowfreq}
|
| 149 |
+
highfreq: ${model.highfreq}
|
| 150 |
+
n_fft: ${model.n_fft}
|
| 151 |
+
n_window_size: ${model.n_window_size}
|
| 152 |
+
window_size: false
|
| 153 |
+
n_window_stride: ${model.n_window_stride}
|
| 154 |
+
window_stride: false
|
| 155 |
+
pad_to: 1
|
| 156 |
+
pad_value: 0
|
| 157 |
+
sample_rate: ${model.sample_rate}
|
| 158 |
+
window: ${model.window}
|
| 159 |
+
normalize: null
|
| 160 |
+
preemph: null
|
| 161 |
+
dither: 0.0
|
| 162 |
+
frame_splicing: 1
|
| 163 |
+
log: true
|
| 164 |
+
log_zero_guard_type: add
|
| 165 |
+
log_zero_guard_value: 1e-05
|
| 166 |
+
mag_power: 1.0
|
| 167 |
+
|
| 168 |
+
input_fft: #n_embed and padding_idx are added by the model
|
| 169 |
+
_target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
|
| 170 |
+
n_layer: 6
|
| 171 |
+
n_head: 1
|
| 172 |
+
d_model: ${model.symbols_embedding_dim}
|
| 173 |
+
d_head: 64
|
| 174 |
+
d_inner: 1536
|
| 175 |
+
kernel_size: 3
|
| 176 |
+
dropout: 0.1
|
| 177 |
+
dropatt: 0.1
|
| 178 |
+
dropemb: 0.0
|
| 179 |
+
d_embed: ${model.symbols_embedding_dim}
|
| 180 |
+
|
| 181 |
+
output_fft:
|
| 182 |
+
_target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
|
| 183 |
+
n_layer: 6
|
| 184 |
+
n_head: 1
|
| 185 |
+
d_model: ${model.symbols_embedding_dim}
|
| 186 |
+
d_head: 64
|
| 187 |
+
d_inner: 1536
|
| 188 |
+
kernel_size: 3
|
| 189 |
+
dropout: 0.1
|
| 190 |
+
dropatt: 0.1
|
| 191 |
+
dropemb: 0.0
|
| 192 |
+
|
| 193 |
+
alignment_module:
|
| 194 |
+
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
|
| 195 |
+
n_text_channels: ${model.symbols_embedding_dim}
|
| 196 |
+
|
| 197 |
+
duration_predictor:
|
| 198 |
+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
|
| 199 |
+
input_size: ${model.symbols_embedding_dim}
|
| 200 |
+
kernel_size: 3
|
| 201 |
+
filter_size: 256
|
| 202 |
+
dropout: 0.1
|
| 203 |
+
n_layers: 2
|
| 204 |
+
|
| 205 |
+
pitch_predictor:
|
| 206 |
+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
|
| 207 |
+
input_size: ${model.symbols_embedding_dim}
|
| 208 |
+
kernel_size: 3
|
| 209 |
+
filter_size: 256
|
| 210 |
+
dropout: 0.1
|
| 211 |
+
n_layers: 2
|
| 212 |
+
|
| 213 |
+
optim:
|
| 214 |
+
name: adamw
|
| 215 |
+
lr: 1e-3
|
| 216 |
+
betas: [0.9, 0.98]
|
| 217 |
+
weight_decay: 1e-3
|
| 218 |
+
|
| 219 |
+
sched:
|
| 220 |
+
name: CosineAnnealing
|
| 221 |
+
warmup_steps: 2000
|
| 222 |
+
last_epoch: -1
|
| 223 |
+
min_lr: 1e-6
|
| 224 |
+
|
| 225 |
+
trainer:
|
| 226 |
+
num_nodes: 1
|
| 227 |
+
devices: -1 # specify all GPUs regardless of its availability
|
| 228 |
+
accelerator: gpu
|
| 229 |
+
strategy: ddp
|
| 230 |
+
precision: 16
|
| 231 |
+
max_epochs: 1500
|
| 232 |
+
accumulate_grad_batches: 1
|
| 233 |
+
gradient_clip_val: 1000.0
|
| 234 |
+
enable_checkpointing: false # Provided by exp_manager
|
| 235 |
+
logger: false # Provided by exp_manager
|
| 236 |
+
log_every_n_steps: 100
|
| 237 |
+
check_val_every_n_epoch: 5
|
| 238 |
+
benchmark: false
|
| 239 |
+
|
| 240 |
+
exp_manager:
|
| 241 |
+
exp_dir: null
|
| 242 |
+
name: ${name}
|
| 243 |
+
create_tensorboard_logger: true
|
| 244 |
+
create_checkpoint_callback: true
|
| 245 |
+
checkpoint_callback_params:
|
| 246 |
+
monitor: val_loss
|
| 247 |
+
resume_if_exists: false
|
| 248 |
+
resume_ignore_no_checkpoint: false
|
tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/ds_for_fastpitch_align.yaml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: "ds_for_fastpitch_align"
|
| 2 |
+
|
| 3 |
+
manifest_filepath: ???
|
| 4 |
+
sup_data_path: ???
|
| 5 |
+
sup_data_types: [ "align_prior_matrix", "pitch" ]
|
| 6 |
+
phoneme_dict_path: ???
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
dataset:
|
| 10 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 11 |
+
manifest_filepath: ${manifest_filepath}
|
| 12 |
+
sample_rate: 22050
|
| 13 |
+
sup_data_path: ${sup_data_path}
|
| 14 |
+
sup_data_types: ${sup_data_types}
|
| 15 |
+
n_fft: 1024
|
| 16 |
+
win_length: 1024
|
| 17 |
+
hop_length: 256
|
| 18 |
+
window: "hann"
|
| 19 |
+
n_mels: 80
|
| 20 |
+
lowfreq: 0
|
| 21 |
+
highfreq: 8000
|
| 22 |
+
max_duration: null
|
| 23 |
+
min_duration: 0.1
|
| 24 |
+
ignore_file: null
|
| 25 |
+
trim: false
|
| 26 |
+
pitch_fmin: 65.40639132514966
|
| 27 |
+
pitch_fmax: 2093.004522404789
|
| 28 |
+
|
| 29 |
+
# text_normalizer:
|
| 30 |
+
# _target_: nemo_text_processing.text_normalization.normalize.Normalizer
|
| 31 |
+
# lang: en
|
| 32 |
+
# input_case: cased
|
| 33 |
+
|
| 34 |
+
# text_normalizer_call_kwargs:
|
| 35 |
+
# verbose: false
|
| 36 |
+
# punct_pre_process: true
|
| 37 |
+
# punct_post_process: true
|
| 38 |
+
|
| 39 |
+
text_tokenizer:
|
| 40 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer.PersianPhonemesTokenizer
|
| 41 |
+
punct: true
|
| 42 |
+
use_emotion_tokens: true
|
| 43 |
+
use_pause_tokens: true
|
| 44 |
+
use_speed_tokens: true
|
| 45 |
+
g2p:
|
| 46 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.g2p.PersianG2p
|
| 47 |
+
phoneme_dict: ${phoneme_dict_path}
|
tts_fa_fastpitch_hifigan-v2.0/config/fastpitch/fastpitch_align_22050_grapheme.yaml
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config contains the default values for training FastPitch model with aligner using 22KHz sampling
|
| 2 |
+
# rate. If you want to train model on other dataset, you can change config values according to your dataset.
|
| 3 |
+
# Most dataset-specific arguments are in the head of the config file, see below.
|
| 4 |
+
|
| 5 |
+
name: FastPitch
|
| 6 |
+
|
| 7 |
+
train_dataset: ???
|
| 8 |
+
validation_datasets: ???
|
| 9 |
+
sup_data_path: ???
|
| 10 |
+
sup_data_types: [ "align_prior_matrix", "pitch" ]
|
| 11 |
+
|
| 12 |
+
phoneme_dict_path: ???
|
| 13 |
+
|
| 14 |
+
# Default values from librosa.pyin
|
| 15 |
+
pitch_fmin: 65.4063949584961
|
| 16 |
+
pitch_fmax: 2093.004638671875
|
| 17 |
+
|
| 18 |
+
# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
|
| 19 |
+
# by running `scripts/dataset_processing/tts/extract_sup_data.py`
|
| 20 |
+
pitch_mean: 103.01591491699219 # e.g. 132.524658203125 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
|
| 21 |
+
pitch_std: 30.397296905517578 # e.g. 37.389366149902 for https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1
|
| 22 |
+
|
| 23 |
+
# Default values for dataset with sample_rate=22050
|
| 24 |
+
sample_rate: 22050
|
| 25 |
+
n_mel_channels: 80
|
| 26 |
+
n_window_size: 1024
|
| 27 |
+
n_window_stride: 256
|
| 28 |
+
n_fft: 1024
|
| 29 |
+
lowfreq: 0
|
| 30 |
+
highfreq: null
|
| 31 |
+
window: hann
|
| 32 |
+
|
| 33 |
+
model:
|
| 34 |
+
learn_alignment: true
|
| 35 |
+
bin_loss_warmup_epochs: 100
|
| 36 |
+
|
| 37 |
+
n_speakers: 1
|
| 38 |
+
max_token_duration: 75
|
| 39 |
+
symbols_embedding_dim: 384
|
| 40 |
+
pitch_embedding_kernel_size: 3
|
| 41 |
+
|
| 42 |
+
pitch_fmin: ${pitch_fmin}
|
| 43 |
+
pitch_fmax: ${pitch_fmax}
|
| 44 |
+
|
| 45 |
+
pitch_mean: ${pitch_mean}
|
| 46 |
+
pitch_std: ${pitch_std}
|
| 47 |
+
|
| 48 |
+
sample_rate: ${sample_rate}
|
| 49 |
+
n_mel_channels: ${n_mel_channels}
|
| 50 |
+
n_window_size: ${n_window_size}
|
| 51 |
+
n_window_stride: ${n_window_stride}
|
| 52 |
+
n_fft: ${n_fft}
|
| 53 |
+
lowfreq: ${lowfreq}
|
| 54 |
+
highfreq: ${highfreq}
|
| 55 |
+
window: ${window}
|
| 56 |
+
|
| 57 |
+
# text_normalizer:
|
| 58 |
+
# _target_: nemo_text_processing.text_normalization.normalize.Normalizer
|
| 59 |
+
# lang: de
|
| 60 |
+
# input_case: cased
|
| 61 |
+
|
| 62 |
+
# text_normalizer_call_kwargs:
|
| 63 |
+
# verbose: false
|
| 64 |
+
# punct_pre_process: true
|
| 65 |
+
# punct_post_process: true
|
| 66 |
+
|
| 67 |
+
text_tokenizer:
|
| 68 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer.PersianPhonemesTokenizer
|
| 69 |
+
punct: true
|
| 70 |
+
use_emotion_tokens: true
|
| 71 |
+
use_pause_tokens: true
|
| 72 |
+
use_speed_tokens: true
|
| 73 |
+
g2p:
|
| 74 |
+
_target_: nemo.collections.tts.g2p.models.fa_ir_persian.g2p.PersianG2p
|
| 75 |
+
phoneme_dict: ${phoneme_dict_path}
|
| 76 |
+
|
| 77 |
+
train_ds:
|
| 78 |
+
dataset:
|
| 79 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 80 |
+
manifest_filepath: ${train_dataset}
|
| 81 |
+
sample_rate: ${model.sample_rate}
|
| 82 |
+
sup_data_path: ${sup_data_path}
|
| 83 |
+
sup_data_types: ${sup_data_types}
|
| 84 |
+
n_fft: ${model.n_fft}
|
| 85 |
+
win_length: ${model.n_window_size}
|
| 86 |
+
hop_length: ${model.n_window_stride}
|
| 87 |
+
window: ${model.window}
|
| 88 |
+
n_mels: ${model.n_mel_channels}
|
| 89 |
+
lowfreq: ${model.lowfreq}
|
| 90 |
+
highfreq: ${model.highfreq}
|
| 91 |
+
max_duration: 25 # change to null to include longer audios.
|
| 92 |
+
min_duration: 0.1
|
| 93 |
+
ignore_file: null
|
| 94 |
+
trim: true
|
| 95 |
+
trim_top_db: 50
|
| 96 |
+
trim_frame_length: ${model.n_window_size}
|
| 97 |
+
trim_hop_length: ${model.n_window_stride}
|
| 98 |
+
pitch_fmin: ${model.pitch_fmin}
|
| 99 |
+
pitch_fmax: ${model.pitch_fmax}
|
| 100 |
+
pitch_norm: true
|
| 101 |
+
pitch_mean: ${model.pitch_mean}
|
| 102 |
+
pitch_std: ${model.pitch_std}
|
| 103 |
+
|
| 104 |
+
dataloader_params:
|
| 105 |
+
drop_last: false
|
| 106 |
+
shuffle: true
|
| 107 |
+
batch_size: 32
|
| 108 |
+
num_workers: 12
|
| 109 |
+
pin_memory: true
|
| 110 |
+
|
| 111 |
+
validation_ds:
|
| 112 |
+
dataset:
|
| 113 |
+
_target_: nemo.collections.tts.data.dataset.TTSDataset
|
| 114 |
+
manifest_filepath: ${validation_datasets}
|
| 115 |
+
sample_rate: ${model.sample_rate}
|
| 116 |
+
sup_data_path: ${sup_data_path}
|
| 117 |
+
sup_data_types: ${sup_data_types}
|
| 118 |
+
n_fft: ${model.n_fft}
|
| 119 |
+
win_length: ${model.n_window_size}
|
| 120 |
+
hop_length: ${model.n_window_stride}
|
| 121 |
+
window: ${model.window}
|
| 122 |
+
n_mels: ${model.n_mel_channels}
|
| 123 |
+
lowfreq: ${model.lowfreq}
|
| 124 |
+
highfreq: ${model.highfreq}
|
| 125 |
+
max_duration: 25 # change to null to include longer audios.
|
| 126 |
+
min_duration: 0.1
|
| 127 |
+
ignore_file: null
|
| 128 |
+
trim: true
|
| 129 |
+
trim_top_db: 50
|
| 130 |
+
trim_frame_length: ${model.n_window_size}
|
| 131 |
+
trim_hop_length: ${model.n_window_stride}
|
| 132 |
+
pitch_fmin: ${model.pitch_fmin}
|
| 133 |
+
pitch_fmax: ${model.pitch_fmax}
|
| 134 |
+
pitch_norm: true
|
| 135 |
+
pitch_mean: ${model.pitch_mean}
|
| 136 |
+
pitch_std: ${model.pitch_std}
|
| 137 |
+
|
| 138 |
+
dataloader_params:
|
| 139 |
+
drop_last: false
|
| 140 |
+
shuffle: false
|
| 141 |
+
batch_size: 32
|
| 142 |
+
num_workers: 8
|
| 143 |
+
pin_memory: true
|
| 144 |
+
|
| 145 |
+
preprocessor:
|
| 146 |
+
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
|
| 147 |
+
features: ${model.n_mel_channels}
|
| 148 |
+
lowfreq: ${model.lowfreq}
|
| 149 |
+
highfreq: ${model.highfreq}
|
| 150 |
+
n_fft: ${model.n_fft}
|
| 151 |
+
n_window_size: ${model.n_window_size}
|
| 152 |
+
window_size: false
|
| 153 |
+
n_window_stride: ${model.n_window_stride}
|
| 154 |
+
window_stride: false
|
| 155 |
+
pad_to: 1
|
| 156 |
+
pad_value: 0
|
| 157 |
+
sample_rate: ${model.sample_rate}
|
| 158 |
+
window: ${model.window}
|
| 159 |
+
normalize: null
|
| 160 |
+
preemph: null
|
| 161 |
+
dither: 0.0
|
| 162 |
+
frame_splicing: 1
|
| 163 |
+
log: true
|
| 164 |
+
log_zero_guard_type: add
|
| 165 |
+
log_zero_guard_value: 1e-05
|
| 166 |
+
mag_power: 1.0
|
| 167 |
+
|
| 168 |
+
input_fft: #n_embed and padding_idx are added by the model
|
| 169 |
+
_target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
|
| 170 |
+
n_layer: 6
|
| 171 |
+
n_head: 1
|
| 172 |
+
d_model: ${model.symbols_embedding_dim}
|
| 173 |
+
d_head: 64
|
| 174 |
+
d_inner: 1536
|
| 175 |
+
kernel_size: 3
|
| 176 |
+
dropout: 0.1
|
| 177 |
+
dropatt: 0.1
|
| 178 |
+
dropemb: 0.0
|
| 179 |
+
d_embed: ${model.symbols_embedding_dim}
|
| 180 |
+
|
| 181 |
+
output_fft:
|
| 182 |
+
_target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
|
| 183 |
+
n_layer: 6
|
| 184 |
+
n_head: 1
|
| 185 |
+
d_model: ${model.symbols_embedding_dim}
|
| 186 |
+
d_head: 64
|
| 187 |
+
d_inner: 1536
|
| 188 |
+
kernel_size: 3
|
| 189 |
+
dropout: 0.1
|
| 190 |
+
dropatt: 0.1
|
| 191 |
+
dropemb: 0.0
|
| 192 |
+
|
| 193 |
+
alignment_module:
|
| 194 |
+
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
|
| 195 |
+
n_text_channels: ${model.symbols_embedding_dim}
|
| 196 |
+
|
| 197 |
+
duration_predictor:
|
| 198 |
+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
|
| 199 |
+
input_size: ${model.symbols_embedding_dim}
|
| 200 |
+
kernel_size: 3
|
| 201 |
+
filter_size: 256
|
| 202 |
+
dropout: 0.1
|
| 203 |
+
n_layers: 2
|
| 204 |
+
|
| 205 |
+
pitch_predictor:
|
| 206 |
+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
|
| 207 |
+
input_size: ${model.symbols_embedding_dim}
|
| 208 |
+
kernel_size: 3
|
| 209 |
+
filter_size: 256
|
| 210 |
+
dropout: 0.1
|
| 211 |
+
n_layers: 2
|
| 212 |
+
|
| 213 |
+
optim:
|
| 214 |
+
name: adamw
|
| 215 |
+
lr: 1e-3
|
| 216 |
+
betas: [0.9, 0.98]
|
| 217 |
+
weight_decay: 1e-3
|
| 218 |
+
|
| 219 |
+
sched:
|
| 220 |
+
name: CosineAnnealing
|
| 221 |
+
warmup_steps: 2000
|
| 222 |
+
last_epoch: -1
|
| 223 |
+
min_lr: 1e-6
|
| 224 |
+
|
| 225 |
+
trainer:
|
| 226 |
+
num_nodes: 1
|
| 227 |
+
devices: -1 # specify all GPUs regardless of its availability
|
| 228 |
+
accelerator: gpu
|
| 229 |
+
strategy: ddp
|
| 230 |
+
precision: 16
|
| 231 |
+
max_epochs: 1500
|
| 232 |
+
accumulate_grad_batches: 1
|
| 233 |
+
gradient_clip_val: 1000.0
|
| 234 |
+
enable_checkpointing: false # Provided by exp_manager
|
| 235 |
+
logger: false # Provided by exp_manager
|
| 236 |
+
log_every_n_steps: 100
|
| 237 |
+
check_val_every_n_epoch: 5
|
| 238 |
+
benchmark: false
|
| 239 |
+
|
| 240 |
+
exp_manager:
|
| 241 |
+
exp_dir: null
|
| 242 |
+
name: ${name}
|
| 243 |
+
create_tensorboard_logger: true
|
| 244 |
+
create_checkpoint_callback: true
|
| 245 |
+
checkpoint_callback_params:
|
| 246 |
+
monitor: val_loss
|
| 247 |
+
resume_if_exists: false
|
| 248 |
+
resume_ignore_no_checkpoint: false
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/.ipynb_checkpoints/hifigan-checkpoint.yaml
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config contains the default values for training HiFi-GAN model on LJSpeech dataset.
|
| 2 |
+
# If you want to train model on other dataset, you can change config values according to your dataset.
|
| 3 |
+
# Most dataset-specific arguments are in the head of the config file, see below.
|
| 4 |
+
|
| 5 |
+
name: "HifiGan"
|
| 6 |
+
|
| 7 |
+
train_dataset: ???
|
| 8 |
+
validation_datasets: ???
|
| 9 |
+
|
| 10 |
+
# Default values for dataset with sample_rate=22050
|
| 11 |
+
sample_rate: 22050
|
| 12 |
+
n_mel_channels: 80
|
| 13 |
+
n_window_size: 1024
|
| 14 |
+
n_window_stride: 256
|
| 15 |
+
n_fft: 1024
|
| 16 |
+
lowfreq: 0
|
| 17 |
+
highfreq: 8000
|
| 18 |
+
window: hann
|
| 19 |
+
|
| 20 |
+
train_n_segments: 8192
|
| 21 |
+
train_max_duration: null
|
| 22 |
+
train_min_duration: 0.75
|
| 23 |
+
|
| 24 |
+
val_n_segments: 66048
|
| 25 |
+
val_max_duration: null
|
| 26 |
+
val_min_duration: 0.75
|
| 27 |
+
|
| 28 |
+
defaults:
|
| 29 |
+
- model/generator: v1
|
| 30 |
+
- model/train_ds: train_ds
|
| 31 |
+
- model/validation_ds: val_ds
|
| 32 |
+
|
| 33 |
+
model:
|
| 34 |
+
preprocessor:
|
| 35 |
+
_target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
|
| 36 |
+
nfilt: ${n_mel_channels}
|
| 37 |
+
lowfreq: ${lowfreq}
|
| 38 |
+
highfreq: ${highfreq}
|
| 39 |
+
n_fft: ${n_fft}
|
| 40 |
+
n_window_size: ${n_window_size}
|
| 41 |
+
n_window_stride: ${n_window_stride}
|
| 42 |
+
pad_to: 0
|
| 43 |
+
pad_value: -11.52
|
| 44 |
+
sample_rate: ${sample_rate}
|
| 45 |
+
window: ${window}
|
| 46 |
+
normalize: null
|
| 47 |
+
preemph: null
|
| 48 |
+
dither: 0.0
|
| 49 |
+
frame_splicing: 1
|
| 50 |
+
log: true
|
| 51 |
+
log_zero_guard_type: clamp
|
| 52 |
+
log_zero_guard_value: 1e-05
|
| 53 |
+
mag_power: 1.0
|
| 54 |
+
use_grads: false
|
| 55 |
+
exact_pad: true
|
| 56 |
+
|
| 57 |
+
optim:
|
| 58 |
+
_target_: torch.optim.AdamW
|
| 59 |
+
lr: 0.0002
|
| 60 |
+
betas: [0.8, 0.99]
|
| 61 |
+
|
| 62 |
+
sched:
|
| 63 |
+
name: CosineAnnealing
|
| 64 |
+
min_lr: 1e-5
|
| 65 |
+
warmup_ratio: 0.02
|
| 66 |
+
|
| 67 |
+
max_steps: 50000
|
| 68 |
+
l1_loss_factor: 45
|
| 69 |
+
denoise_strength: 0.0025
|
| 70 |
+
|
| 71 |
+
trainer:
|
| 72 |
+
num_nodes: 1
|
| 73 |
+
devices: 1
|
| 74 |
+
accelerator: gpu
|
| 75 |
+
strategy: ddp_find_unused_parameters_true
|
| 76 |
+
precision: 32
|
| 77 |
+
max_steps: ${model.max_steps}
|
| 78 |
+
accumulate_grad_batches: 1
|
| 79 |
+
enable_checkpointing: False # Provided by exp_manager
|
| 80 |
+
logger: false # Provided by exp_manager
|
| 81 |
+
log_every_n_steps: 100
|
| 82 |
+
check_val_every_n_epoch: 10
|
| 83 |
+
benchmark: false
|
| 84 |
+
|
| 85 |
+
exp_manager:
|
| 86 |
+
exp_dir: null
|
| 87 |
+
name: ${name}
|
| 88 |
+
create_tensorboard_logger: true
|
| 89 |
+
create_checkpoint_callback: true
|
| 90 |
+
checkpoint_callback_params:
|
| 91 |
+
monitor: val_loss
|
| 92 |
+
mode: min
|
| 93 |
+
create_wandb_logger: false
|
| 94 |
+
wandb_logger_kwargs:
|
| 95 |
+
name: null
|
| 96 |
+
project: null
|
| 97 |
+
entity: null
|
| 98 |
+
resume_if_exists: false
|
| 99 |
+
resume_ignore_no_checkpoint: false
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/hifigan.yaml
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This config contains the default values for training HiFi-GAN model on LJSpeech dataset.
|
| 2 |
+
# If you want to train model on other dataset, you can change config values according to your dataset.
|
| 3 |
+
# Most dataset-specific arguments are in the head of the config file, see below.
|
| 4 |
+
|
| 5 |
+
name: "HifiGan"
|
| 6 |
+
|
| 7 |
+
train_dataset: ???
|
| 8 |
+
validation_datasets: ???
|
| 9 |
+
|
| 10 |
+
# Default values for dataset with sample_rate=22050
|
| 11 |
+
sample_rate: 22050
|
| 12 |
+
n_mel_channels: 80
|
| 13 |
+
n_window_size: 1024
|
| 14 |
+
n_window_stride: 256
|
| 15 |
+
n_fft: 1024
|
| 16 |
+
lowfreq: 0
|
| 17 |
+
highfreq: 8000
|
| 18 |
+
window: hann
|
| 19 |
+
|
| 20 |
+
train_n_segments: 8192
|
| 21 |
+
train_max_duration: null
|
| 22 |
+
train_min_duration: 0.75
|
| 23 |
+
|
| 24 |
+
val_n_segments: 66048
|
| 25 |
+
val_max_duration: null
|
| 26 |
+
val_min_duration: 3
|
| 27 |
+
|
| 28 |
+
defaults:
|
| 29 |
+
- model/generator: v1
|
| 30 |
+
- model/train_ds: train_ds
|
| 31 |
+
- model/validation_ds: val_ds
|
| 32 |
+
|
| 33 |
+
model:
|
| 34 |
+
preprocessor:
|
| 35 |
+
_target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
|
| 36 |
+
nfilt: ${n_mel_channels}
|
| 37 |
+
lowfreq: ${lowfreq}
|
| 38 |
+
highfreq: ${highfreq}
|
| 39 |
+
n_fft: ${n_fft}
|
| 40 |
+
n_window_size: ${n_window_size}
|
| 41 |
+
n_window_stride: ${n_window_stride}
|
| 42 |
+
pad_to: 0
|
| 43 |
+
pad_value: -11.52
|
| 44 |
+
sample_rate: ${sample_rate}
|
| 45 |
+
window: ${window}
|
| 46 |
+
normalize: null
|
| 47 |
+
preemph: null
|
| 48 |
+
dither: 0.0
|
| 49 |
+
frame_splicing: 1
|
| 50 |
+
log: true
|
| 51 |
+
log_zero_guard_type: clamp
|
| 52 |
+
log_zero_guard_value: 1e-05
|
| 53 |
+
mag_power: 1.0
|
| 54 |
+
use_grads: false
|
| 55 |
+
exact_pad: true
|
| 56 |
+
|
| 57 |
+
optim:
|
| 58 |
+
_target_: torch.optim.AdamW
|
| 59 |
+
lr: 0.0002
|
| 60 |
+
betas: [0.8, 0.99]
|
| 61 |
+
|
| 62 |
+
sched:
|
| 63 |
+
name: CosineAnnealing
|
| 64 |
+
min_lr: 1e-5
|
| 65 |
+
warmup_ratio: 0.02
|
| 66 |
+
|
| 67 |
+
max_steps: 2500000
|
| 68 |
+
l1_loss_factor: 45
|
| 69 |
+
denoise_strength: 0.0025
|
| 70 |
+
|
| 71 |
+
trainer:
|
| 72 |
+
num_nodes: 1
|
| 73 |
+
devices: 1
|
| 74 |
+
accelerator: gpu
|
| 75 |
+
strategy: ddp_find_unused_parameters_true
|
| 76 |
+
precision: 32
|
| 77 |
+
max_steps: ${model.max_steps}
|
| 78 |
+
accumulate_grad_batches: 1
|
| 79 |
+
enable_checkpointing: False # Provided by exp_manager
|
| 80 |
+
logger: false # Provided by exp_manager
|
| 81 |
+
log_every_n_steps: 100
|
| 82 |
+
check_val_every_n_epoch: 10
|
| 83 |
+
benchmark: false
|
| 84 |
+
|
| 85 |
+
exp_manager:
|
| 86 |
+
exp_dir: null
|
| 87 |
+
name: ${name}
|
| 88 |
+
create_tensorboard_logger: true
|
| 89 |
+
create_checkpoint_callback: true
|
| 90 |
+
checkpoint_callback_params:
|
| 91 |
+
monitor: val_loss
|
| 92 |
+
mode: min
|
| 93 |
+
create_wandb_logger: false
|
| 94 |
+
wandb_logger_kwargs:
|
| 95 |
+
name: null
|
| 96 |
+
project: null
|
| 97 |
+
entity: null
|
| 98 |
+
resume_if_exists: false
|
| 99 |
+
resume_ignore_no_checkpoint: false
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/.ipynb_checkpoints/v1-checkpoint.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: nemo.collections.tts.modules.hifigan_modules.Generator
|
| 2 |
+
resblock: 1
|
| 3 |
+
upsample_rates: [8,8,2,2]
|
| 4 |
+
upsample_kernel_sizes: [16,16,4,4]
|
| 5 |
+
upsample_initial_channel: 512
|
| 6 |
+
resblock_kernel_sizes: [3,7,11]
|
| 7 |
+
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/generator/v1.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: nemo.collections.tts.modules.hifigan_modules.Generator
|
| 2 |
+
resblock: 1
|
| 3 |
+
upsample_rates: [8,8,2,2]
|
| 4 |
+
upsample_kernel_sizes: [16,16,4,4]
|
| 5 |
+
upsample_initial_channel: 512
|
| 6 |
+
resblock_kernel_sizes: [3,7,11]
|
| 7 |
+
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/.ipynb_checkpoints/train_ds_finetune-checkpoint.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${train_dataset}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${train_n_segments}
|
| 6 |
+
max_duration: ${train_max_duration}
|
| 7 |
+
min_duration: ${train_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 32
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds/train_ds_finetune.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${train_dataset}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${train_n_segments}
|
| 6 |
+
max_duration: ${train_max_duration}
|
| 7 |
+
min_duration: ${train_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 32
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/train_ds_finetune.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${train_dataset}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${train_n_segments}
|
| 6 |
+
max_duration: ${train_max_duration}
|
| 7 |
+
min_duration: ${train_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 16
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/v1.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: nemo.collections.tts.modules.hifigan_modules.Generator
|
| 2 |
+
resblock: 1
|
| 3 |
+
upsample_rates: [8,8,2,2]
|
| 4 |
+
upsample_kernel_sizes: [16,16,4,4]
|
| 5 |
+
upsample_initial_channel: 512
|
| 6 |
+
resblock_kernel_sizes: [3,7,11]
|
| 7 |
+
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/val_ds_finetune.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${validation_datasets}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${val_n_segments}
|
| 6 |
+
max_duration: ${val_max_duration}
|
| 7 |
+
min_duration: ${val_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: false
|
| 13 |
+
batch_size: 16
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/.ipynb_checkpoints/val_ds_finetune-checkpoint.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${validation_datasets}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${val_n_segments}
|
| 6 |
+
max_duration: ${val_max_duration}
|
| 7 |
+
min_duration: ${val_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: false
|
| 13 |
+
batch_size: 16
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/config/hifigan/model/validation_ds/val_ds_finetune.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
_target_: "nemo.collections.tts.data.dataset.VocoderDataset"
|
| 3 |
+
manifest_filepath: ${validation_datasets}
|
| 4 |
+
sample_rate: ${sample_rate}
|
| 5 |
+
n_segments: ${val_n_segments}
|
| 6 |
+
max_duration: ${val_max_duration}
|
| 7 |
+
min_duration: ${val_min_duration}
|
| 8 |
+
load_precomputed_mel: true
|
| 9 |
+
hop_length: ${n_window_stride}
|
| 10 |
+
dataloader_params:
|
| 11 |
+
drop_last: false
|
| 12 |
+
shuffle: false
|
| 13 |
+
batch_size: 16
|
| 14 |
+
num_workers: 4
|
| 15 |
+
pin_memory: true
|
tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eea3093c753874952bab5719b9d82c664b0c1c7bc4116a3034d657659269e3bb
|
| 3 |
+
size 549427880
|
tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cbff27139ad11c3e742378596421b00c15f5dce664255119b4e1652a4e73d64c
|
| 3 |
+
size 184258560
|
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8199747904ba8c35f64f1d3b1a1a4f62c303fd2f0238c2148f2750833563aa8a
|
| 3 |
+
size 339210240
|
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0688-epoch-12-last.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24e7ddabf6058bef99b570a56131368bdbe39b2c9b095c5be8b1d6d7c8c5adcf
|
| 3 |
+
size 1016835427
|
tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0e087e923573c910b0a6c86e02c355bf3a85c82017cdcce82f99ff47a2a8577
|
| 3 |
+
size 339210240
|
tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/fastpitch.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0108a874da58f4cd5c99c7819b89cff32cc841a6d23f3f4fae4e901aeb315000
|
| 3 |
+
size 179344671
|
tts_fa_fastpitch_hifigan-v2.0/models/onnx-models/hifigan.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f5898c7b64e7c64c8421210f17f498c7b13e4d121d6f33f09d226cb6956f970
|
| 3 |
+
size 55760326
|
tts_fa_fastpitch_hifigan-v2.0/persian-dict/persian-v5.0.dict
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/generate_mels-checkpoint.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
This script is to generate mel spectrograms from a Fastpitch model checkpoint. Please see general usage below. It runs
|
| 17 |
+
on GPUs by default, but you can add `--num-workers 5 --cpu` as an option to run on CPUs.
|
| 18 |
+
|
| 19 |
+
$ python scripts/dataset_processing/tts/generate_mels.py \
|
| 20 |
+
--fastpitch-model-ckpt ./models/fastpitch/multi_spk/FastPitch--val_loss\=1.4473-epoch\=209.ckpt \
|
| 21 |
+
--input-json-manifests /home/xueyang/HUI-Audio-Corpus-German-clean/test_manifest_text_normed_phonemes.json
|
| 22 |
+
--output-json-manifest-root /home/xueyang/experiments/multi_spk_tts_de
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import argparse
|
| 26 |
+
import json
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
|
| 29 |
+
import numpy as np
|
| 30 |
+
import soundfile as sf
|
| 31 |
+
import torch
|
| 32 |
+
from joblib import Parallel, delayed
|
| 33 |
+
from tqdm import tqdm
|
| 34 |
+
|
| 35 |
+
from nemo.collections.tts.models import FastPitchModel
|
| 36 |
+
from nemo.collections.tts.parts.utils.tts_dataset_utils import (
|
| 37 |
+
BetaBinomialInterpolator,
|
| 38 |
+
beta_binomial_prior_distribution,
|
| 39 |
+
)
|
| 40 |
+
from nemo.utils import logging
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_args():
|
| 44 |
+
parser = argparse.ArgumentParser(
|
| 45 |
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
| 46 |
+
description="Generate mel spectrograms with pretrained FastPitch model, and create manifests for finetuning Hifigan.",
|
| 47 |
+
)
|
| 48 |
+
parser.add_argument(
|
| 49 |
+
"--fastpitch-model-ckpt",
|
| 50 |
+
required=True,
|
| 51 |
+
type=Path,
|
| 52 |
+
help="Specify a full path of a fastpitch model checkpoint with the suffix of either .ckpt or .nemo.",
|
| 53 |
+
)
|
| 54 |
+
parser.add_argument(
|
| 55 |
+
"--input-json-manifests",
|
| 56 |
+
nargs="+",
|
| 57 |
+
required=True,
|
| 58 |
+
type=Path,
|
| 59 |
+
help="Specify a full path of a JSON manifest. You could add multiple manifests.",
|
| 60 |
+
)
|
| 61 |
+
parser.add_argument(
|
| 62 |
+
"--output-json-manifest-root",
|
| 63 |
+
required=True,
|
| 64 |
+
type=Path,
|
| 65 |
+
help="Specify a full path of output root that would contain new manifests.",
|
| 66 |
+
)
|
| 67 |
+
parser.add_argument(
|
| 68 |
+
"--num-workers",
|
| 69 |
+
default=-1,
|
| 70 |
+
type=int,
|
| 71 |
+
help="Specify the max number of concurrently Python workers processes. "
|
| 72 |
+
"If -1 all CPUs are used. If 1 no parallel computing is used.",
|
| 73 |
+
)
|
| 74 |
+
parser.add_argument("--cpu", action='store_true', default=False, help="Generate mel spectrograms using CPUs.")
|
| 75 |
+
args = parser.parse_args()
|
| 76 |
+
return args
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def __load_wav(audio_file):
|
| 80 |
+
with sf.SoundFile(audio_file, 'r') as f:
|
| 81 |
+
samples = f.read(dtype='float32')
|
| 82 |
+
return samples.transpose()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root):
|
| 86 |
+
# Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
|
| 87 |
+
audio = __load_wav(entry["audio_filepath"])
|
| 88 |
+
audio = torch.from_numpy(audio).unsqueeze(0).to(device)
|
| 89 |
+
audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)
|
| 90 |
+
|
| 91 |
+
if spec_model.fastpitch.speaker_emb is not None and "speaker" in entry:
|
| 92 |
+
speaker = torch.tensor([entry['speaker']]).to(device)
|
| 93 |
+
else:
|
| 94 |
+
speaker = None
|
| 95 |
+
|
| 96 |
+
with torch.no_grad():
|
| 97 |
+
if "normalized_text" in entry:
|
| 98 |
+
text = spec_model.parse(entry["normalized_text"], normalize=False)
|
| 99 |
+
else:
|
| 100 |
+
text = spec_model.parse(entry['text'])
|
| 101 |
+
|
| 102 |
+
text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
|
| 103 |
+
spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)
|
| 104 |
+
mel_len = spect.shape[-1]
|
| 105 |
+
|
| 106 |
+
# Generate attention prior and spectrogram inputs for HiFi-GAN
|
| 107 |
+
if use_beta_binomial_interpolator:
|
| 108 |
+
beta_binomial_interpolator = BetaBinomialInterpolator()
|
| 109 |
+
attn_prior = (
|
| 110 |
+
torch.from_numpy(beta_binomial_interpolator(mel_len, text_len.item()))
|
| 111 |
+
.unsqueeze(0)
|
| 112 |
+
.to(text.device)
|
| 113 |
+
)
|
| 114 |
+
else:
|
| 115 |
+
attn_prior = (
|
| 116 |
+
torch.from_numpy(beta_binomial_prior_distribution(text_len.item(), mel_len))
|
| 117 |
+
.unsqueeze(0)
|
| 118 |
+
.to(text.device)
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
spectrogram = spec_model.forward(
|
| 122 |
+
text=text, input_lens=text_len, spec=spect, mel_lens=spect_len, attn_prior=attn_prior, speaker=speaker,
|
| 123 |
+
)[0]
|
| 124 |
+
|
| 125 |
+
save_path = mel_root / f"{Path(entry['audio_filepath']).stem}.npy"
|
| 126 |
+
np.save(save_path, spectrogram[0].to('cpu').numpy())
|
| 127 |
+
entry["mel_filepath"] = str(save_path)
|
| 128 |
+
|
| 129 |
+
return entry
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def main():
|
| 133 |
+
args = get_args()
|
| 134 |
+
ckpt_path = args.fastpitch_model_ckpt
|
| 135 |
+
input_manifest_filepaths = args.input_json_manifests
|
| 136 |
+
output_json_manifest_root = args.output_json_manifest_root
|
| 137 |
+
|
| 138 |
+
mel_root = output_json_manifest_root / "mels"
|
| 139 |
+
mel_root.mkdir(exist_ok=True, parents=True)
|
| 140 |
+
|
| 141 |
+
# load pretrained FastPitch model checkpoint
|
| 142 |
+
suffix = ckpt_path.suffix
|
| 143 |
+
if suffix == ".nemo":
|
| 144 |
+
spec_model = FastPitchModel.restore_from(ckpt_path).eval()
|
| 145 |
+
elif suffix == ".ckpt":
|
| 146 |
+
spec_model = FastPitchModel.load_from_checkpoint(ckpt_path).eval()
|
| 147 |
+
else:
|
| 148 |
+
raise ValueError(f"Unsupported suffix: {suffix}")
|
| 149 |
+
if not args.cpu:
|
| 150 |
+
spec_model.cuda()
|
| 151 |
+
device = spec_model.device
|
| 152 |
+
|
| 153 |
+
use_beta_binomial_interpolator = spec_model.cfg.train_ds.dataset.get("use_beta_binomial_interpolator", False)
|
| 154 |
+
|
| 155 |
+
for manifest in input_manifest_filepaths:
|
| 156 |
+
logging.info(f"Processing {manifest}.")
|
| 157 |
+
entries = []
|
| 158 |
+
with open(manifest, "r") as fjson:
|
| 159 |
+
for line in fjson:
|
| 160 |
+
entries.append(json.loads(line.strip()))
|
| 161 |
+
|
| 162 |
+
if device == "cpu":
|
| 163 |
+
new_entries = Parallel(n_jobs=args.num_workers)(
|
| 164 |
+
delayed(__generate_mels)(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
|
| 165 |
+
for entry in entries
|
| 166 |
+
)
|
| 167 |
+
else:
|
| 168 |
+
new_entries = []
|
| 169 |
+
for entry in tqdm(entries):
|
| 170 |
+
new_entry = __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
|
| 171 |
+
new_entries.append(new_entry)
|
| 172 |
+
|
| 173 |
+
mel_manifest_path = output_json_manifest_root / f"{manifest.stem}_mel{manifest.suffix}"
|
| 174 |
+
with open(mel_manifest_path, "w") as fmel:
|
| 175 |
+
for entry in new_entries:
|
| 176 |
+
fmel.write(json.dumps(entry) + "\n")
|
| 177 |
+
logging.info(f"Processing {manifest} is complete --> {mel_manifest_path}")
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
if __name__ == "__main__":
|
| 181 |
+
main()
|
tts_fa_fastpitch_hifigan-v2.0/scripts/.ipynb_checkpoints/hifigan_finetune-checkpoint.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
import lightning.pytorch as pl
|
| 16 |
+
|
| 17 |
+
from nemo.collections.tts.models import HifiGanModel
|
| 18 |
+
from nemo.core.config import hydra_runner
|
| 19 |
+
from nemo.utils.exp_manager import exp_manager
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@hydra_runner(config_path="conf/hifigan", config_name="hifigan_44100")
|
| 23 |
+
def main(cfg):
|
| 24 |
+
trainer = pl.Trainer(**cfg.trainer)
|
| 25 |
+
exp_manager(trainer, cfg.get("exp_manager", None))
|
| 26 |
+
model = HifiGanModel(cfg=cfg.model, trainer=trainer)
|
| 27 |
+
model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
|
| 28 |
+
trainer.fit(model)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
if __name__ == '__main__':
|
| 32 |
+
main() # noqa pylint: disable=no-value-for-parameter
|
tts_fa_fastpitch_hifigan-v2.0/scripts/extract_sup_data.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
import torch
|
| 17 |
+
from hydra.utils import instantiate
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
|
| 20 |
+
from nemo.core.config import hydra_runner
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def get_pitch_stats(pitch_list):
|
| 24 |
+
pitch_tensor = torch.cat(pitch_list)
|
| 25 |
+
pitch_mean, pitch_std = pitch_tensor.mean().item(), pitch_tensor.std().item()
|
| 26 |
+
pitch_min, pitch_max = pitch_tensor.min().item(), pitch_tensor.max().item()
|
| 27 |
+
print(f"PITCH_MEAN={pitch_mean}, PITCH_STD={pitch_std}")
|
| 28 |
+
print(f"PITCH_MIN={pitch_min}, PITCH_MAX={pitch_max}")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def preprocess_ds_for_fastpitch_align(dataloader):
|
| 32 |
+
pitch_list = []
|
| 33 |
+
for batch in tqdm(dataloader, total=len(dataloader)):
|
| 34 |
+
audios, audio_lengths, tokens, tokens_lengths, align_prior_matrices, pitches, pitches_lengths, *_ = batch
|
| 35 |
+
pitch = pitches.squeeze(0)
|
| 36 |
+
pitch_list.append(pitch[pitch != 0])
|
| 37 |
+
|
| 38 |
+
get_pitch_stats(pitch_list)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def preprocess_ds_for_mixer_tts_x(dataloader):
|
| 42 |
+
pitch_list = []
|
| 43 |
+
for batch in tqdm(dataloader, total=len(dataloader)):
|
| 44 |
+
(
|
| 45 |
+
audios,
|
| 46 |
+
audio_lengths,
|
| 47 |
+
tokens,
|
| 48 |
+
tokens_lengths,
|
| 49 |
+
align_prior_matrices,
|
| 50 |
+
pitches,
|
| 51 |
+
pitches_lengths,
|
| 52 |
+
lm_tokens,
|
| 53 |
+
) = batch
|
| 54 |
+
|
| 55 |
+
pitch = pitches.squeeze(0)
|
| 56 |
+
pitch_list.append(pitch[pitch != 0])
|
| 57 |
+
|
| 58 |
+
get_pitch_stats(pitch_list)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
CFG_NAME2FUNC = {
|
| 62 |
+
"ds_for_fastpitch_align": preprocess_ds_for_fastpitch_align,
|
| 63 |
+
"ds_for_mixer_tts": preprocess_ds_for_fastpitch_align,
|
| 64 |
+
"ds_for_mixer_tts_x": preprocess_ds_for_mixer_tts_x,
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@hydra_runner(config_path='ljspeech/ds_conf', config_name='ds_for_fastpitch_align')
|
| 69 |
+
def main(cfg):
|
| 70 |
+
dataset = instantiate(cfg.dataset)
|
| 71 |
+
dataloader = torch.utils.data.DataLoader(
|
| 72 |
+
dataset=dataset,
|
| 73 |
+
batch_size=1,
|
| 74 |
+
collate_fn=dataset._collate_fn,
|
| 75 |
+
num_workers=cfg.get("dataloader_params", {}).get("num_workers", 4),
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
print(f"Processing {cfg.manifest_filepath}:")
|
| 79 |
+
CFG_NAME2FUNC[cfg.name](dataloader)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
if __name__ == '__main__':
|
| 83 |
+
main() # noqa pylint: disable=no-value-for-parameter
|
tts_fa_fastpitch_hifigan-v2.0/scripts/fastpitch.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
import lightning.pytorch as pl
|
| 16 |
+
|
| 17 |
+
from nemo.collections.common.callbacks import LogEpochTimeCallback
|
| 18 |
+
from nemo.collections.tts.models import FastPitchModel
|
| 19 |
+
from nemo.core.config import hydra_runner
|
| 20 |
+
from nemo.utils.exp_manager import exp_manager
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@hydra_runner(config_path="conf", config_name="fastpitch_align_v1.05")
def main(cfg):
    """Train a FastPitch model from the given Hydra configuration.

    Sets up a Lightning trainer and the NeMo experiment manager, builds the
    model, attaches learning-rate and epoch-time logging callbacks, and runs
    training.
    """
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = FastPitchModel(cfg=cfg.model, trainer=trainer)
    # Log the learning rate and wall-clock time per epoch during training.
    extra_callbacks = [
        pl.callbacks.LearningRateMonitor(),
        LogEpochTimeCallback(),
    ]
    trainer.callbacks.extend(extra_callbacks)
    trainer.fit(model)


if __name__ == '__main__':
    main()  # noqa pylint: disable=no-value-for-parameter
|
tts_fa_fastpitch_hifigan-v2.0/scripts/generate_mels.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
This script is to generate mel spectrograms from a Fastpitch model checkpoint. Please see general usage below. It runs
|
| 17 |
+
on GPUs by default, but you can add `--num-workers 5 --cpu` as an option to run on CPUs.
|
| 18 |
+
|
| 19 |
+
$ python scripts/dataset_processing/tts/generate_mels.py \
|
| 20 |
+
--fastpitch-model-ckpt ./models/fastpitch/multi_spk/FastPitch--val_loss\=1.4473-epoch\=209.ckpt \
|
| 21 |
+
--input-json-manifests /home/xueyang/HUI-Audio-Corpus-German-clean/test_manifest_text_normed_phonemes.json
|
| 22 |
+
--output-json-manifest-root /home/xueyang/experiments/multi_spk_tts_de
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import argparse
|
| 26 |
+
import json
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
|
| 29 |
+
import numpy as np
|
| 30 |
+
import soundfile as sf
|
| 31 |
+
import torch
|
| 32 |
+
from joblib import Parallel, delayed
|
| 33 |
+
from tqdm import tqdm
|
| 34 |
+
|
| 35 |
+
from nemo.collections.tts.models import FastPitchModel
|
| 36 |
+
from nemo.collections.tts.parts.utils.tts_dataset_utils import (
|
| 37 |
+
BetaBinomialInterpolator,
|
| 38 |
+
beta_binomial_prior_distribution,
|
| 39 |
+
)
|
| 40 |
+
from nemo.utils import logging
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_args():
    """Parse command-line arguments for mel-spectrogram generation.

    Returns:
        argparse.Namespace with fields: fastpitch_model_ckpt (Path),
        input_json_manifests (list[Path]), output_json_manifest_root (Path),
        num_workers (int, -1 means all CPUs) and cpu (bool).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Generate mel spectrograms with pretrained FastPitch model, and create manifests for finetuning Hifigan.",
    )
    parser.add_argument(
        "--fastpitch-model-ckpt",
        required=True,
        type=Path,
        help="Specify a full path of a fastpitch model checkpoint with the suffix of either .ckpt or .nemo.",
    )
    parser.add_argument(
        "--input-json-manifests",
        nargs="+",
        required=True,
        type=Path,
        help="Specify a full path of a JSON manifest. You could add multiple manifests.",
    )
    parser.add_argument(
        "--output-json-manifest-root",
        required=True,
        type=Path,
        help="Specify a full path of output root that would contain new manifests.",
    )
    parser.add_argument(
        "--num-workers",
        default=-1,
        type=int,
        help="Specify the max number of concurrently Python workers processes. "
        "If -1 all CPUs are used. If 1 no parallel computing is used.",
    )
    parser.add_argument("--cpu", action='store_true', default=False, help="Generate mel spectrograms using CPUs.")
    return parser.parse_args()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def __load_wav(audio_file):
    """Read an audio file with soundfile and return float32 samples.

    The sample array is transposed before being returned so that, for
    multi-channel audio, channels come first.
    """
    with sf.SoundFile(audio_file, 'r') as sound:
        data = sound.read(dtype='float32')
    return data.transpose()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root):
    """Generate one mel spectrogram for a manifest entry and save it as .npy.

    Uses the ground-truth audio (teacher forcing via an attention prior) so
    the produced mel matches the audio exactly, which is required for
    HiFi-GAN finetuning. Mutates ``entry`` by adding a ``mel_filepath`` key
    and returns it.
    """
    # Load the waveform and move it onto the model's device.
    samples = __load_wav(entry["audio_filepath"])
    audio = torch.from_numpy(samples).unsqueeze(0).to(device)
    audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)

    # Speaker conditioning only applies when the model has a speaker
    # embedding table and the entry carries a speaker id.
    speaker = None
    if spec_model.fastpitch.speaker_emb is not None and "speaker" in entry:
        speaker = torch.tensor([entry['speaker']]).to(device)

    with torch.no_grad():
        # Prefer pre-normalized text when the manifest provides it.
        if "normalized_text" in entry:
            text = spec_model.parse(entry["normalized_text"], normalize=False)
        else:
            text = spec_model.parse(entry['text'])

        text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)
        spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)
        mel_len = spect.shape[-1]

        # Attention prior for aligning text tokens to mel frames.
        if use_beta_binomial_interpolator:
            interpolator = BetaBinomialInterpolator()
            prior = interpolator(mel_len, text_len.item())
        else:
            prior = beta_binomial_prior_distribution(text_len.item(), mel_len)
        attn_prior = torch.from_numpy(prior).unsqueeze(0).to(text.device)

        spectrogram = spec_model.forward(
            text=text, input_lens=text_len, spec=spect, mel_lens=spect_len, attn_prior=attn_prior, speaker=speaker,
        )[0]

    # Persist the mel next to its manifest root and record its path.
    save_path = mel_root / f"{Path(entry['audio_filepath']).stem}.npy"
    np.save(save_path, spectrogram[0].to('cpu').numpy())
    entry["mel_filepath"] = str(save_path)

    return entry
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def main():
    """Generate mel spectrograms for every input manifest.

    Loads a FastPitch checkpoint (.nemo or .ckpt), runs mel generation for
    each manifest entry (in parallel on CPU, sequentially with a progress
    bar otherwise), and writes a new ``*_mel`` manifest per input file into
    the output root.
    """
    args = get_args()
    ckpt_path = args.fastpitch_model_ckpt
    input_manifest_filepaths = args.input_json_manifests
    output_json_manifest_root = args.output_json_manifest_root

    mel_root = output_json_manifest_root / "mels"
    mel_root.mkdir(exist_ok=True, parents=True)

    # Restore the pretrained FastPitch model from either checkpoint format.
    suffix = ckpt_path.suffix
    if suffix == ".nemo":
        spec_model = FastPitchModel.restore_from(ckpt_path).eval()
    elif suffix == ".ckpt":
        spec_model = FastPitchModel.load_from_checkpoint(ckpt_path).eval()
    else:
        raise ValueError(f"Unsupported suffix: {suffix}")
    if not args.cpu:
        spec_model.cuda()
    device = spec_model.device

    use_beta_binomial_interpolator = spec_model.cfg.train_ds.dataset.get("use_beta_binomial_interpolator", False)

    for manifest in input_manifest_filepaths:
        logging.info(f"Processing {manifest}.")
        with open(manifest, "r") as fjson:
            entries = [json.loads(line.strip()) for line in fjson]

        if device == "cpu":
            # On CPU, fan out across worker processes with joblib.
            new_entries = Parallel(n_jobs=args.num_workers)(
                delayed(__generate_mels)(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
                for entry in entries
            )
        else:
            # On GPU, process sequentially with a progress bar.
            new_entries = []
            for entry in tqdm(entries):
                new_entries.append(
                    __generate_mels(entry, spec_model, device, use_beta_binomial_interpolator, mel_root)
                )

        # Emit the companion manifest that now includes mel_filepath fields.
        mel_manifest_path = output_json_manifest_root / f"{manifest.stem}_mel{manifest.suffix}"
        with open(mel_manifest_path, "w") as fmel:
            for entry in new_entries:
                fmel.write(json.dumps(entry) + "\n")
        logging.info(f"Processing {manifest} is complete --> {mel_manifest_path}")


if __name__ == "__main__":
    main()
|
tts_fa_fastpitch_hifigan-v2.0/scripts/hifigan_finetune.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
import lightning.pytorch as pl
|
| 16 |
+
|
| 17 |
+
from nemo.collections.tts.models import HifiGanModel
|
| 18 |
+
from nemo.core.config import hydra_runner
|
| 19 |
+
from nemo.utils.exp_manager import exp_manager
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@hydra_runner(config_path="conf/hifigan", config_name="hifigan_44100")
def main(cfg):
    """Finetune a HiFi-GAN vocoder from the given Hydra configuration.

    Sets up the Lightning trainer and NeMo experiment manager, builds the
    model, optionally restores weights from a pretrained checkpoint named in
    the config, then runs training.
    """
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = HifiGanModel(cfg=cfg.model, trainer=trainer)
    # Warm-start from a pretrained checkpoint when the config requests one.
    model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
    trainer.fit(model)


if __name__ == '__main__':
    main()  # noqa pylint: disable=no-value-for-parameter
|
tts_fa_fastpitch_hifigan-v2.0/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/SadeghK/tts_fa_fastpitch_hifigan-v2.0
|
tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-from-scratch-finetuning-hifigan.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tts_fa_fastpitch_hifigan-v2.0/tts-nemo-fastpitch-hifigan-inference-only.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tts_fa_fastpitch_hifigan-v2.0/tts_nemo_fastpitch_hifigan_convert_to_onnx.ipynb
ADDED
|
@@ -0,0 +1,884 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": null,
|
| 20 |
+
"metadata": {
|
| 21 |
+
"id": "4kodC7VXOd56"
|
| 22 |
+
},
|
| 23 |
+
"outputs": [],
|
| 24 |
+
"source": [
|
| 25 |
+
"# !python -m pip install --upgrade pip\n",
|
| 26 |
+
"!apt-get update && apt-get install -y libsndfile1 ffmpeg\n",
|
| 27 |
+
"!pip install Cython packaging\n",
|
| 28 |
+
"!rm -rf /usr/lib/python3.10/site-packages/blinker*\n",
|
| 29 |
+
"!rm -rf /usr/local/lib/python3.10/dist-packages/blinker*\n",
|
| 30 |
+
"!pip install --ignore-installed blinker\n",
|
| 31 |
+
"!pip install --upgrade --force-reinstall blinker\n",
|
| 32 |
+
"# !pip install dask-cuda==24.8.2\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"!mkdir -p /workspace/tts-nemo/\n",
|
| 35 |
+
"!cd /workspace/tts-nemo/\n",
|
| 36 |
+
"!git clone https://github.com/SadeghKrmi/NeMo.git\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"# to install and enable editing without re-installation\n",
|
| 39 |
+
"!cd NeMo && pip install -e '.[all]'\n",
|
| 40 |
+
"\n",
|
| 41 |
+
"# install without editing possibility\n",
|
| 42 |
+
"# !cd NeMo && pip install '.[all]'"
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"cell_type": "code",
|
| 47 |
+
"source": [
|
| 48 |
+
"from google.colab import drive\n",
|
| 49 |
+
"drive.mount('/content/drive')"
|
| 50 |
+
],
|
| 51 |
+
"metadata": {
|
| 52 |
+
"colab": {
|
| 53 |
+
"base_uri": "https://localhost:8080/"
|
| 54 |
+
},
|
| 55 |
+
"id": "LKzWYURw4S5i",
|
| 56 |
+
"outputId": "d0dbbac6-1391-4116-de27-19b0fd39805b"
|
| 57 |
+
},
|
| 58 |
+
"execution_count": 1,
|
| 59 |
+
"outputs": [
|
| 60 |
+
{
|
| 61 |
+
"output_type": "stream",
|
| 62 |
+
"name": "stdout",
|
| 63 |
+
"text": [
|
| 64 |
+
"Mounted at /content/drive\n"
|
| 65 |
+
]
|
| 66 |
+
}
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"cell_type": "code",
|
| 71 |
+
"source": [
|
| 72 |
+
"!ls -l /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo\n",
|
| 73 |
+
"!ls -l /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo\n",
|
| 74 |
+
"\n"
|
| 75 |
+
],
|
| 76 |
+
"metadata": {
|
| 77 |
+
"colab": {
|
| 78 |
+
"base_uri": "https://localhost:8080/"
|
| 79 |
+
},
|
| 80 |
+
"id": "lN8KV1CanbX1",
|
| 81 |
+
"outputId": "6c1a9459-bc49-43c1-a220-64e1c2e175aa"
|
| 82 |
+
},
|
| 83 |
+
"execution_count": 4,
|
| 84 |
+
"outputs": [
|
| 85 |
+
{
|
| 86 |
+
"output_type": "stream",
|
| 87 |
+
"name": "stdout",
|
| 88 |
+
"text": [
|
| 89 |
+
"-rw------- 1 root root 184258560 Aug 13 08:13 /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo\n",
|
| 90 |
+
"-rw------- 1 root root 339210240 Aug 15 12:11 /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo\n"
|
| 91 |
+
]
|
| 92 |
+
}
|
| 93 |
+
]
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"cell_type": "code",
|
| 97 |
+
"source": [
|
| 98 |
+
"!ls /content/drive/MyDrive/cNotebooks/perTTS/tts-nemo-fastpitch-hifigan-convert-to-onnx/"
|
| 99 |
+
],
|
| 100 |
+
"metadata": {
|
| 101 |
+
"colab": {
|
| 102 |
+
"base_uri": "https://localhost:8080/"
|
| 103 |
+
},
|
| 104 |
+
"id": "6hYdqdhxQscq",
|
| 105 |
+
"outputId": "4813f43b-5712-44c4-b9d6-51c12f4f729d"
|
| 106 |
+
},
|
| 107 |
+
"execution_count": null,
|
| 108 |
+
"outputs": [
|
| 109 |
+
{
|
| 110 |
+
"output_type": "stream",
|
| 111 |
+
"name": "stdout",
|
| 112 |
+
"text": [
|
| 113 |
+
"FastPitch--val_loss-0.7796-epoch-800-last.nemo\n",
|
| 114 |
+
"HifiGan--val_loss-0.6090-epoch-39-last.nemo\n",
|
| 115 |
+
"persian-dict\n",
|
| 116 |
+
"tts_nemo_fastpitch_hifigan_convert_to_onnx.ipynb\n"
|
| 117 |
+
]
|
| 118 |
+
}
|
| 119 |
+
]
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"cell_type": "code",
|
| 123 |
+
"source": [
|
| 124 |
+
"!pip install num2fawords -q"
|
| 125 |
+
],
|
| 126 |
+
"metadata": {
|
| 127 |
+
"id": "KNqfXdJ1poZ2"
|
| 128 |
+
},
|
| 129 |
+
"execution_count": 6,
|
| 130 |
+
"outputs": []
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"cell_type": "code",
|
| 134 |
+
"source": [
|
| 135 |
+
"from nemo.collections.tts.g2p.models.fa_ir_persian.g2p import PersianG2p\n",
|
| 136 |
+
"from nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer import PersianPhonemesTokenizer\n",
|
| 137 |
+
"\n",
|
| 138 |
+
"# test Persian Grapheme-to-phoneme module\n",
|
| 139 |
+
"g2p = PersianG2p(\n",
|
| 140 |
+
" phoneme_dict=\"./persian-v6.0.dict\",\n",
|
| 141 |
+
")\n",
|
| 142 |
+
"\n",
|
| 143 |
+
"# Text tokenizer\n",
|
| 144 |
+
"# text_tokenizer = PersianPhonemesTokenizer(punct=True,chars=True,pad_with_space=True,g2p=g2p)\n",
|
| 145 |
+
"\n",
|
| 146 |
+
"text_tokenizer = PersianPhonemesTokenizer(\n",
|
| 147 |
+
" g2p=g2p,\n",
|
| 148 |
+
" use_emotion_tokens=True,\n",
|
| 149 |
+
" use_pause_tokens=True,\n",
|
| 150 |
+
" use_speed_tokens=True\n",
|
| 151 |
+
")\n",
|
| 152 |
+
"\n",
|
| 153 |
+
"text = 'و تاریخ میلادی سال ۶۲۲ را نشان میداد.'\n",
|
| 154 |
+
"ids = text_tokenizer.encode(text)\n",
|
| 155 |
+
"print(ids)"
|
| 156 |
+
],
|
| 157 |
+
"metadata": {
|
| 158 |
+
"id": "mQxZY4z4OiGx",
|
| 159 |
+
"colab": {
|
| 160 |
+
"base_uri": "https://localhost:8080/"
|
| 161 |
+
},
|
| 162 |
+
"outputId": "0d3abeae-4778-407e-9325-495e26743ab3"
|
| 163 |
+
},
|
| 164 |
+
"execution_count": 7,
|
| 165 |
+
"outputs": [
|
| 166 |
+
{
|
| 167 |
+
"output_type": "stream",
|
| 168 |
+
"name": "stdout",
|
| 169 |
+
"text": [
|
| 170 |
+
"[0, 28, 53, 0, 3, 43, 11, 50, 8, 0, 26, 50, 25, 43, 9, 50, 0, 14, 43, 25, 0, 15, 54, 15, 16, 53, 9, 0, 47, 0, 1, 50, 14, 3, 0, 47, 0, 9, 47, 0, 11, 43, 0, 27, 54, 15, 43, 27, 0, 26, 50, 9, 43, 9, 69, 0]\n"
|
| 171 |
+
]
|
| 172 |
+
}
|
| 173 |
+
]
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"cell_type": "markdown",
|
| 177 |
+
"source": [
|
| 178 |
+
"### FastPitch Export"
|
| 179 |
+
],
|
| 180 |
+
"metadata": {
|
| 181 |
+
"id": "oc6P0je3TFe-"
|
| 182 |
+
}
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"cell_type": "code",
|
| 186 |
+
"source": [
|
| 187 |
+
"import nemo.collections.tts as nemo_tts\n",
|
| 188 |
+
"import torch\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"# Load model\n",
|
| 191 |
+
"fastpitch_model = nemo_tts.models.FastPitchModel.restore_from(\"/content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo\")\n",
|
| 192 |
+
"fastpitch_model.eval()\n",
|
| 193 |
+
"\n",
|
| 194 |
+
"# Get the actual vocabulary size from the model\n",
|
| 195 |
+
"vocab_size = fastpitch_model.fastpitch.encoder.word_emb.num_embeddings\n",
|
| 196 |
+
"print(f\"Model vocabulary size: {vocab_size}\")\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"# Method 1: Try with correct forward signature\n",
|
| 199 |
+
"class FastPitchWrapper1(torch.nn.Module):\n",
|
| 200 |
+
" def __init__(self, model):\n",
|
| 201 |
+
" super().__init__()\n",
|
| 202 |
+
" self.model = model\n",
|
| 203 |
+
"\n",
|
| 204 |
+
" def forward(self, text, input_lens):\n",
|
| 205 |
+
" return self.model.forward(text=text, input_lens=input_lens, pace=1.0)\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"# Method 2: Try with generate_spectrogram\n",
|
| 208 |
+
"class FastPitchWrapper2(torch.nn.Module):\n",
|
| 209 |
+
" def __init__(self, model):\n",
|
| 210 |
+
" super().__init__()\n",
|
| 211 |
+
" self.model = model\n",
|
| 212 |
+
"\n",
|
| 213 |
+
" def forward(self, tokens):\n",
|
| 214 |
+
" return self.model.generate_spectrogram(tokens=tokens, speaker=None, pace=1.0)\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"# Generate dummy data with valid token range (excluding padding token if it's 0)\n",
|
| 217 |
+
"padding_idx = getattr(fastpitch_model.fastpitch.encoder, 'padding_idx', 0)\n",
|
| 218 |
+
"valid_token_range = (1, vocab_size - 1) if padding_idx == 0 else (0, vocab_size - 1)\n",
|
| 219 |
+
"\n",
|
| 220 |
+
"dummy_text = torch.randint(valid_token_range[0], valid_token_range[1] + 1, (1, 50), dtype=torch.long)\n",
|
| 221 |
+
"dummy_input_lens = torch.tensor([50], dtype=torch.long)\n",
|
| 222 |
+
"\n",
|
| 223 |
+
"for i, (wrapper_class, args) in enumerate([(FastPitchWrapper1, (dummy_text, dummy_input_lens)),\n",
|
| 224 |
+
" (FastPitchWrapper2, (dummy_text,))], 1):\n",
|
| 225 |
+
" # try:\n",
|
| 226 |
+
" wrapper = wrapper_class(fastpitch_model)\n",
|
| 227 |
+
" with torch.no_grad():\n",
|
| 228 |
+
" output = wrapper(*args)\n",
|
| 229 |
+
"\n",
|
| 230 |
+
" print(f\"Method {i} works! Trying ONNX export...\")\n",
|
| 231 |
+
"\n",
|
| 232 |
+
" # Export to ONNX\n",
|
| 233 |
+
" input_names = ['text', 'input_lens'] if i == 1 else ['tokens']\n",
|
| 234 |
+
" torch.onnx.export(\n",
|
| 235 |
+
" wrapper,\n",
|
| 236 |
+
" args,\n",
|
| 237 |
+
" f\"fastpitch_method{i}.onnx\",\n",
|
| 238 |
+
" export_params=True,\n",
|
| 239 |
+
" opset_version=14,\n",
|
| 240 |
+
" input_names=input_names,\n",
|
| 241 |
+
" output_names=['mel_spec'],\n",
|
| 242 |
+
" dynamic_axes={\n",
|
| 243 |
+
" input_names[0]: {0: 'batch_size', 1: 'text_length'},\n",
|
| 244 |
+
" **(({input_names[1]: {0: 'batch_size'}} if len(input_names) > 1 else {})),\n",
|
| 245 |
+
" 'mel_spec': {0: 'batch_size', 2: 'mel_length'}\n",
|
| 246 |
+
" }\n",
|
| 247 |
+
" )\n",
|
| 248 |
+
" print(f\"Method {i} ONNX export successful!\")\n",
|
| 249 |
+
" break\n",
|
| 250 |
+
"\n",
|
| 251 |
+
" # except Exception as e:\n",
|
| 252 |
+
" # print(f\"Method {i} failed: {e}\")\n",
|
| 253 |
+
" # continue"
|
| 254 |
+
],
|
| 255 |
+
"metadata": {
|
| 256 |
+
"colab": {
|
| 257 |
+
"base_uri": "https://localhost:8080/"
|
| 258 |
+
},
|
| 259 |
+
"id": "l45qnF6tSn6e",
|
| 260 |
+
"outputId": "c02529da-e241-4e5d-faa1-6fc67e314fae"
|
| 261 |
+
},
|
| 262 |
+
"execution_count": 16,
|
| 263 |
+
"outputs": [
|
| 264 |
+
{
|
| 265 |
+
"output_type": "stream",
|
| 266 |
+
"name": "stderr",
|
| 267 |
+
"text": [
|
| 268 |
+
"[NeMo W 2025-08-15 12:26:10 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
|
| 269 |
+
" Train config : \n",
|
| 270 |
+
" dataset:\n",
|
| 271 |
+
" _target_: nemo.collections.tts.data.dataset.TTSDataset\n",
|
| 272 |
+
" manifest_filepath: ./dataset_splits/train/train.jsonl\n",
|
| 273 |
+
" sample_rate: 22050\n",
|
| 274 |
+
" sup_data_path: sup_data\n",
|
| 275 |
+
" sup_data_types:\n",
|
| 276 |
+
" - align_prior_matrix\n",
|
| 277 |
+
" - pitch\n",
|
| 278 |
+
" n_fft: 1024\n",
|
| 279 |
+
" win_length: 1024\n",
|
| 280 |
+
" hop_length: 256\n",
|
| 281 |
+
" window: hann\n",
|
| 282 |
+
" n_mels: 80\n",
|
| 283 |
+
" lowfreq: 0\n",
|
| 284 |
+
" highfreq: null\n",
|
| 285 |
+
" max_duration: 20\n",
|
| 286 |
+
" min_duration: 0.1\n",
|
| 287 |
+
" ignore_file: null\n",
|
| 288 |
+
" trim: true\n",
|
| 289 |
+
" trim_top_db: 50\n",
|
| 290 |
+
" trim_frame_length: 1024\n",
|
| 291 |
+
" trim_hop_length: 256\n",
|
| 292 |
+
" pitch_fmin: 65.4063949584961\n",
|
| 293 |
+
" pitch_fmax: 2093.004638671875\n",
|
| 294 |
+
" pitch_norm: true\n",
|
| 295 |
+
" pitch_mean: 103.01591491699219\n",
|
| 296 |
+
" pitch_std: 30.397296905517578\n",
|
| 297 |
+
" dataloader_params:\n",
|
| 298 |
+
" drop_last: false\n",
|
| 299 |
+
" shuffle: true\n",
|
| 300 |
+
" batch_size: 64\n",
|
| 301 |
+
" num_workers: 12\n",
|
| 302 |
+
" pin_memory: true\n",
|
| 303 |
+
" \n",
|
| 304 |
+
"[NeMo W 2025-08-15 12:26:10 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
|
| 305 |
+
" Validation config : \n",
|
| 306 |
+
" dataset:\n",
|
| 307 |
+
" _target_: nemo.collections.tts.data.dataset.TTSDataset\n",
|
| 308 |
+
" manifest_filepath: ./dataset_splits/test/test.jsonl\n",
|
| 309 |
+
" sample_rate: 22050\n",
|
| 310 |
+
" sup_data_path: sup_data\n",
|
| 311 |
+
" sup_data_types:\n",
|
| 312 |
+
" - align_prior_matrix\n",
|
| 313 |
+
" - pitch\n",
|
| 314 |
+
" n_fft: 1024\n",
|
| 315 |
+
" win_length: 1024\n",
|
| 316 |
+
" hop_length: 256\n",
|
| 317 |
+
" window: hann\n",
|
| 318 |
+
" n_mels: 80\n",
|
| 319 |
+
" lowfreq: 0\n",
|
| 320 |
+
" highfreq: null\n",
|
| 321 |
+
" max_duration: 20\n",
|
| 322 |
+
" min_duration: 0.1\n",
|
| 323 |
+
" ignore_file: null\n",
|
| 324 |
+
" trim: true\n",
|
| 325 |
+
" trim_top_db: 50\n",
|
| 326 |
+
" trim_frame_length: 1024\n",
|
| 327 |
+
" trim_hop_length: 256\n",
|
| 328 |
+
" pitch_fmin: 65.4063949584961\n",
|
| 329 |
+
" pitch_fmax: 2093.004638671875\n",
|
| 330 |
+
" pitch_norm: true\n",
|
| 331 |
+
" pitch_mean: 103.01591491699219\n",
|
| 332 |
+
" pitch_std: 30.397296905517578\n",
|
| 333 |
+
" dataloader_params:\n",
|
| 334 |
+
" drop_last: false\n",
|
| 335 |
+
" shuffle: false\n",
|
| 336 |
+
" batch_size: 24\n",
|
| 337 |
+
" num_workers: 8\n",
|
| 338 |
+
" pin_memory: true\n",
|
| 339 |
+
" \n"
|
| 340 |
+
]
|
| 341 |
+
},
|
| 342 |
+
{
|
| 343 |
+
"output_type": "stream",
|
| 344 |
+
"name": "stdout",
|
| 345 |
+
"text": [
|
| 346 |
+
"[NeMo I 2025-08-15 12:26:10 nemo_logging:393] PADDING: 1\n",
|
| 347 |
+
"[NeMo I 2025-08-15 12:26:11 nemo_logging:393] Model FastPitchModel was successfully restored from /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/FastPitch--val_loss-0.7236-epoch-50.nemo.\n",
|
| 348 |
+
"Model vocabulary size: 94\n",
|
| 349 |
+
"Method 1 works! Trying ONNX export...\n",
|
| 350 |
+
"Method 1 ONNX export successful!\n"
|
| 351 |
+
]
|
| 352 |
+
}
|
| 353 |
+
]
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"cell_type": "markdown",
|
| 357 |
+
"source": [
|
| 358 |
+
"### HiFiGAN Export"
|
| 359 |
+
],
|
| 360 |
+
"metadata": {
|
| 361 |
+
"id": "aKuYMvWBTCSa"
|
| 362 |
+
}
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"cell_type": "code",
|
| 366 |
+
"source": [
|
| 367 |
+
"# Load HiFiGAN model\n",
|
| 368 |
+
"hifigan_model = nemo_tts.models.HifiGanModel.restore_from(\"/content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo\")\n",
|
| 369 |
+
"hifigan_model.eval()\n",
|
| 370 |
+
"\n",
|
| 371 |
+
"# HiFiGAN might also need wrapper if it has the same issue\n",
|
| 372 |
+
"class HiFiGANWrapper(torch.nn.Module):\n",
|
| 373 |
+
" def __init__(self, model):\n",
|
| 374 |
+
" super().__init__()\n",
|
| 375 |
+
" self.model = model\n",
|
| 376 |
+
"\n",
|
| 377 |
+
" def forward(self, mel_spec):\n",
|
| 378 |
+
" return self.model.forward(spec=mel_spec)\n",
|
| 379 |
+
"\n",
|
| 380 |
+
"wrapped_hifigan = HiFiGANWrapper(hifigan_model)\n",
|
| 381 |
+
"\n",
|
| 382 |
+
"# Export HiFiGAN\n",
|
| 383 |
+
"# dummy_mel = torch.randn(1, 80, 100)\n",
|
| 384 |
+
"dummy_mel = torch.randn(1, 80, 100)\n",
|
| 385 |
+
"torch.onnx.export(\n",
|
| 386 |
+
" wrapped_hifigan,\n",
|
| 387 |
+
" dummy_mel,\n",
|
| 388 |
+
" \"hifigan_fixed.onnx\",\n",
|
| 389 |
+
" export_params=True,\n",
|
| 390 |
+
" opset_version=14,\n",
|
| 391 |
+
" do_constant_folding=True,\n",
|
| 392 |
+
" input_names=['mel_spec'],\n",
|
| 393 |
+
" output_names=['audio'],\n",
|
| 394 |
+
" dynamic_axes={\n",
|
| 395 |
+
" 'mel_spec': {0: 'batch_size', 2: 'mel_length'},\n",
|
| 396 |
+
" 'audio': {0: 'batch_size', 1: 'audio_length'}\n",
|
| 397 |
+
" },\n",
|
| 398 |
+
" # optimize_for_mobile=False, # not a valid torch.onnx.export kwarg (belongs to torch.utils.mobile_optimizer); removed to avoid TypeError\n",
|
| 399 |
+
" training=torch.onnx.TrainingMode.EVAL\n",
|
| 400 |
+
")"
|
| 401 |
+
],
|
| 402 |
+
"metadata": {
|
| 403 |
+
"colab": {
|
| 404 |
+
"base_uri": "https://localhost:8080/"
|
| 405 |
+
},
|
| 406 |
+
"id": "UMVexvWMSqnJ",
|
| 407 |
+
"outputId": "0572f245-08d4-42ab-cfd1-fe46c98037ae"
|
| 408 |
+
},
|
| 409 |
+
"execution_count": 18,
|
| 410 |
+
"outputs": [
|
| 411 |
+
{
|
| 412 |
+
"output_type": "stream",
|
| 413 |
+
"name": "stderr",
|
| 414 |
+
"text": [
|
| 415 |
+
"[NeMo W 2025-08-15 12:26:59 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
|
| 416 |
+
" Train config : \n",
|
| 417 |
+
" dataset:\n",
|
| 418 |
+
" _target_: nemo.collections.tts.data.dataset.VocoderDataset\n",
|
| 419 |
+
" manifest_filepath: ./mels/train_mel.jsonl\n",
|
| 420 |
+
" sample_rate: 22050\n",
|
| 421 |
+
" n_segments: 8192\n",
|
| 422 |
+
" max_duration: null\n",
|
| 423 |
+
" min_duration: 0.75\n",
|
| 424 |
+
" load_precomputed_mel: true\n",
|
| 425 |
+
" hop_length: 256\n",
|
| 426 |
+
" dataloader_params:\n",
|
| 427 |
+
" drop_last: false\n",
|
| 428 |
+
" shuffle: true\n",
|
| 429 |
+
" batch_size: 32\n",
|
| 430 |
+
" num_workers: 4\n",
|
| 431 |
+
" pin_memory: true\n",
|
| 432 |
+
" \n",
|
| 433 |
+
"[NeMo W 2025-08-15 12:26:59 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
|
| 434 |
+
" Validation config : \n",
|
| 435 |
+
" dataset:\n",
|
| 436 |
+
" _target_: nemo.collections.tts.data.dataset.VocoderDataset\n",
|
| 437 |
+
" manifest_filepath: ./mels/test_mel.jsonl\n",
|
| 438 |
+
" sample_rate: 22050\n",
|
| 439 |
+
" n_segments: 1024\n",
|
| 440 |
+
" max_duration: null\n",
|
| 441 |
+
" min_duration: 3\n",
|
| 442 |
+
" load_precomputed_mel: true\n",
|
| 443 |
+
" hop_length: 256\n",
|
| 444 |
+
" dataloader_params:\n",
|
| 445 |
+
" drop_last: false\n",
|
| 446 |
+
" shuffle: false\n",
|
| 447 |
+
" batch_size: 16\n",
|
| 448 |
+
" num_workers: 4\n",
|
| 449 |
+
" pin_memory: true\n",
|
| 450 |
+
" \n"
|
| 451 |
+
]
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"output_type": "stream",
|
| 455 |
+
"name": "stdout",
|
| 456 |
+
"text": [
|
| 457 |
+
"[NeMo I 2025-08-15 12:26:59 nemo_logging:393] PADDING: 0\n",
|
| 458 |
+
"[NeMo I 2025-08-15 12:26:59 nemo_logging:393] STFT using exact pad\n",
|
| 459 |
+
"[NeMo I 2025-08-15 12:26:59 nemo_logging:393] PADDING: 0\n",
|
| 460 |
+
"[NeMo I 2025-08-15 12:26:59 nemo_logging:393] STFT using exact pad\n",
|
| 461 |
+
"[NeMo I 2025-08-15 12:27:01 nemo_logging:393] Model HifiGanModel was successfully restored from /content/drive/MyDrive/vast-ai-gcloud-drive/tts-v3/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo.\n"
|
| 462 |
+
]
|
| 463 |
+
}
|
| 464 |
+
]
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"cell_type": "markdown",
|
| 468 |
+
"source": [
|
| 469 |
+
"### Run ONNX Models on CPU"
|
| 470 |
+
],
|
| 471 |
+
"metadata": {
|
| 472 |
+
"id": "bS_mgW0HTZ07"
|
| 473 |
+
}
|
| 474 |
+
},
|
| 475 |
+
{
|
| 476 |
+
"cell_type": "code",
|
| 477 |
+
"source": [
|
| 478 |
+
"!pip install onnxruntime numpy librosa soundfile -q"
|
| 479 |
+
],
|
| 480 |
+
"metadata": {
|
| 481 |
+
"id": "XdiQ2-wnTayc",
|
| 482 |
+
"colab": {
|
| 483 |
+
"base_uri": "https://localhost:8080/"
|
| 484 |
+
},
|
| 485 |
+
"outputId": "6ab61a4a-ef9e-4fda-ad5b-b6e447ddc4f3"
|
| 486 |
+
},
|
| 487 |
+
"execution_count": 20,
|
| 488 |
+
"outputs": [
|
| 489 |
+
{
|
| 490 |
+
"output_type": "stream",
|
| 491 |
+
"name": "stdout",
|
| 492 |
+
"text": [
|
| 493 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.5/16.5 MB\u001b[0m \u001b[31m42.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 494 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 495 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 496 |
+
"\u001b[?25h"
|
| 497 |
+
]
|
| 498 |
+
}
|
| 499 |
+
]
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"cell_type": "code",
|
| 503 |
+
"source": [
|
| 504 |
+
"import onnxruntime as ort\n",
|
| 505 |
+
"import numpy as np\n",
|
| 506 |
+
"import torch\n",
|
| 507 |
+
"import librosa\n",
|
| 508 |
+
"import soundfile as sf\n",
|
| 509 |
+
"from typing import List, Optional\n",
|
| 510 |
+
"\n",
|
| 511 |
+
"# Import the same tokenizer used during training\n",
|
| 512 |
+
"from nemo.collections.tts.g2p.models.fa_ir_persian.g2p import PersianG2p\n",
|
| 513 |
+
"from nemo.collections.tts.g2p.models.fa_ir_persian.tokenizer import PersianPhonemesTokenizer\n",
|
| 514 |
+
"\n",
|
| 515 |
+
"class PersianTTSInferencePipeline:\n",
|
| 516 |
+
" def __init__(self, fastpitch_path: str, hifigan_path: str,\n",
|
| 517 |
+
" persian_dict_path: str = \"./persian-v6.0.dict\"):\n",
|
| 518 |
+
" \"\"\"\n",
|
| 519 |
+
" Initialize Persian TTS inference pipeline with ONNX models\n",
|
| 520 |
+
"\n",
|
| 521 |
+
" Args:\n",
|
| 522 |
+
" fastpitch_path: Path to FastPitch ONNX model\n",
|
| 523 |
+
" hifigan_path: Path to HiFiGAN ONNX model\n",
|
| 524 |
+
" persian_dict_path: Path to Persian phoneme dictionary\n",
|
| 525 |
+
" \"\"\"\n",
|
| 526 |
+
" # Load ONNX models\n",
|
| 527 |
+
" self.fastpitch_session = ort.InferenceSession(\n",
|
| 528 |
+
" fastpitch_path,\n",
|
| 529 |
+
" providers=['CPUExecutionProvider']\n",
|
| 530 |
+
" )\n",
|
| 531 |
+
" self.hifigan_session = ort.InferenceSession(\n",
|
| 532 |
+
" hifigan_path,\n",
|
| 533 |
+
" providers=['CPUExecutionProvider']\n",
|
| 534 |
+
" )\n",
|
| 535 |
+
"\n",
|
| 536 |
+
" # Initialize Persian tokenizer (SAME as training)\n",
|
| 537 |
+
" print(\"Initializing Persian tokenizer...\")\n",
|
| 538 |
+
" self.g2p = PersianG2p(phoneme_dict=persian_dict_path)\n",
|
| 539 |
+
" self.text_tokenizer = PersianPhonemesTokenizer(\n",
|
| 540 |
+
" g2p=self.g2p, # use the instance attribute created above, not an external global\n",
|
| 541 |
+
" use_emotion_tokens=True,\n",
|
| 542 |
+
" use_pause_tokens=True,\n",
|
| 543 |
+
" use_speed_tokens=True\n",
|
| 544 |
+
" )\n",
|
| 545 |
+
"\n",
|
| 546 |
+
" # Get input/output names\n",
|
| 547 |
+
" self.fp_input_names = [inp.name for inp in self.fastpitch_session.get_inputs()]\n",
|
| 548 |
+
" self.fp_output_names = [out.name for out in self.fastpitch_session.get_outputs()]\n",
|
| 549 |
+
" self.hg_input_names = [inp.name for inp in self.hifigan_session.get_inputs()]\n",
|
| 550 |
+
" self.hg_output_names = [out.name for out in self.hifigan_session.get_outputs()]\n",
|
| 551 |
+
"\n",
|
| 552 |
+
" print(f\"FastPitch inputs: {self.fp_input_names}\")\n",
|
| 553 |
+
" print(f\"FastPitch outputs: {self.fp_output_names}\")\n",
|
| 554 |
+
" print(f\"HiFiGAN inputs: {self.hg_input_names}\")\n",
|
| 555 |
+
" print(f\"HiFiGAN outputs: {self.hg_output_names}\")\n",
|
| 556 |
+
"\n",
|
| 557 |
+
" # Test tokenizer\n",
|
| 558 |
+
" test_text = 'مدل تبدیل متن به گفتار پارسی'\n",
|
| 559 |
+
" test_ids = self.text_tokenizer.encode(test_text)\n",
|
| 560 |
+
" print(f\"Test tokenization: '{test_text}' -> {test_ids[:10]}...\")\n",
|
| 561 |
+
"\n",
|
| 562 |
+
" def text_to_tokens(self, text: str) -> tuple:\n",
|
| 563 |
+
" \"\"\"\n",
|
| 564 |
+
" Convert Persian text to phoneme tokens using the same tokenizer as training\n",
|
| 565 |
+
"\n",
|
| 566 |
+
" Args:\n",
|
| 567 |
+
" text: Input Persian text string\n",
|
| 568 |
+
"\n",
|
| 569 |
+
" Returns:\n",
|
| 570 |
+
" tokens: numpy array of phoneme token indices\n",
|
| 571 |
+
" token_lengths: numpy array of sequence length\n",
|
| 572 |
+
" \"\"\"\n",
|
| 573 |
+
" # Use the exact same tokenizer as training\n",
|
| 574 |
+
" token_ids = self.text_tokenizer.encode(text)\n",
|
| 575 |
+
"\n",
|
| 576 |
+
" # Convert to numpy arrays with batch dimension\n",
|
| 577 |
+
" tokens = np.array([token_ids], dtype=np.int64) # Shape: (1, seq_len)\n",
|
| 578 |
+
" token_lengths = np.array([len(token_ids)], dtype=np.int64) # Shape: (1,)\n",
|
| 579 |
+
"\n",
|
| 580 |
+
" print(f\"Text: '{text}'\")\n",
|
| 581 |
+
" print(f\"Tokens length: {len(token_ids)}\")\n",
|
| 582 |
+
" print(f\"First 20 tokens: {token_ids[:20]}\")\n",
|
| 583 |
+
" print(f\"Token range: [{min(token_ids)}, {max(token_ids)}]\")\n",
|
| 584 |
+
"\n",
|
| 585 |
+
" return tokens, token_lengths\n",
|
| 586 |
+
"\n",
|
| 587 |
+
" def generate_mel_spectrogram(self, text: str) -> np.ndarray:\n",
|
| 588 |
+
" \"\"\"\n",
|
| 589 |
+
" Generate mel spectrogram from Persian text using FastPitch\n",
|
| 590 |
+
"\n",
|
| 591 |
+
" Args:\n",
|
| 592 |
+
" text: Input Persian text string\n",
|
| 593 |
+
"\n",
|
| 594 |
+
" Returns:\n",
|
| 595 |
+
" mel_spec: Generated mel spectrogram\n",
|
| 596 |
+
" \"\"\"\n",
|
| 597 |
+
" # Convert text to phoneme tokens\n",
|
| 598 |
+
" tokens, token_lengths = self.text_to_tokens(text)\n",
|
| 599 |
+
"\n",
|
| 600 |
+
" # Prepare inputs based on your model's input names\n",
|
| 601 |
+
" if len(self.fp_input_names) == 1:\n",
|
| 602 |
+
" # If using generate_spectrogram wrapper (Method 2)\n",
|
| 603 |
+
" inputs = {self.fp_input_names[0]: tokens}\n",
|
| 604 |
+
" else:\n",
|
| 605 |
+
" # If using forward wrapper (Method 1)\n",
|
| 606 |
+
" inputs = {\n",
|
| 607 |
+
" self.fp_input_names[0]: tokens, # text or tokens\n",
|
| 608 |
+
" self.fp_input_names[1]: token_lengths # input_lens\n",
|
| 609 |
+
" }\n",
|
| 610 |
+
"\n",
|
| 611 |
+
" print(f\"FastPitch inputs: {list(inputs.keys())}\")\n",
|
| 612 |
+
" for key, val in inputs.items():\n",
|
| 613 |
+
" print(f\" {key}: shape {val.shape}, dtype {val.dtype}\")\n",
|
| 614 |
+
"\n",
|
| 615 |
+
" # Run FastPitch inference\n",
|
| 616 |
+
" mel_outputs = self.fastpitch_session.run(self.fp_output_names, inputs)\n",
|
| 617 |
+
" mel_spec = mel_outputs[0] # First output should be mel spectrogram\n",
|
| 618 |
+
"\n",
|
| 619 |
+
" print(f\"Generated mel spectrogram shape: {mel_spec.shape}\")\n",
|
| 620 |
+
" print(f\"Mel range: [{mel_spec.min():.4f}, {mel_spec.max():.4f}]\")\n",
|
| 621 |
+
"\n",
|
| 622 |
+
" return mel_spec\n",
|
| 623 |
+
"\n",
|
| 624 |
+
" def generate_audio(self, mel_spec: np.ndarray, sample_rate: int = 22050) -> np.ndarray:\n",
|
| 625 |
+
" \"\"\"\n",
|
| 626 |
+
" Generate audio from mel spectrogram using HiFiGAN\n",
|
| 627 |
+
"\n",
|
| 628 |
+
" Args:\n",
|
| 629 |
+
" mel_spec: Input mel spectrogram\n",
|
| 630 |
+
" sample_rate: Audio sample rate\n",
|
| 631 |
+
"\n",
|
| 632 |
+
" Returns:\n",
|
| 633 |
+
" audio: Generated audio waveform\n",
|
| 634 |
+
" \"\"\"\n",
|
| 635 |
+
" # Prepare inputs for HiFiGAN\n",
|
| 636 |
+
" inputs = {self.hg_input_names[0]: mel_spec}\n",
|
| 637 |
+
"\n",
|
| 638 |
+
" print(f\"HiFiGAN input shape: {mel_spec.shape}\")\n",
|
| 639 |
+
"\n",
|
| 640 |
+
" # Run HiFiGAN inference\n",
|
| 641 |
+
" audio_outputs = self.hifigan_session.run(self.hg_output_names, inputs)\n",
|
| 642 |
+
" audio = audio_outputs[0] # First output should be audio\n",
|
| 643 |
+
"\n",
|
| 644 |
+
" # Remove batch dimension and ensure proper shape\n",
|
| 645 |
+
" if audio.ndim > 1:\n",
|
| 646 |
+
" audio = audio.squeeze()\n",
|
| 647 |
+
"\n",
|
| 648 |
+
" print(f\"Generated audio shape: {audio.shape}\")\n",
|
| 649 |
+
" print(f\"Audio range: [{audio.min():.4f}, {audio.max():.4f}]\")\n",
|
| 650 |
+
" print(f\"Audio RMS: {np.sqrt(np.mean(audio**2)):.4f}\")\n",
|
| 651 |
+
"\n",
|
| 652 |
+
" return audio\n",
|
| 653 |
+
"\n",
|
| 654 |
+
" def text_to_speech(self, text: str, output_path: Optional[str] = None,\n",
|
| 655 |
+
" sample_rate: int = 22050) -> np.ndarray:\n",
|
| 656 |
+
" \"\"\"\n",
|
| 657 |
+
" Complete Persian text-to-speech pipeline\n",
|
| 658 |
+
"\n",
|
| 659 |
+
" Args:\n",
|
| 660 |
+
" text: Input Persian text string\n",
|
| 661 |
+
" output_path: Optional path to save audio file\n",
|
| 662 |
+
" sample_rate: Audio sample rate\n",
|
| 663 |
+
"\n",
|
| 664 |
+
" Returns:\n",
|
| 665 |
+
" audio: Generated audio waveform\n",
|
| 666 |
+
" \"\"\"\n",
|
| 667 |
+
" print(f\"🎙️ Generating Persian speech for: '{text}'\")\n",
|
| 668 |
+
" print(\"=\" * 60)\n",
|
| 669 |
+
"\n",
|
| 670 |
+
" # Step 1: Generate mel spectrogram\n",
|
| 671 |
+
" print(\"📊 Generating mel spectrogram...\")\n",
|
| 672 |
+
" mel_spec = self.generate_mel_spectrogram(text)\n",
|
| 673 |
+
"\n",
|
| 674 |
+
" # Step 2: Generate audio from mel spectrogram\n",
|
| 675 |
+
" print(\"🔊 Generating audio...\")\n",
|
| 676 |
+
" audio = self.generate_audio(mel_spec, sample_rate)\n",
|
| 677 |
+
"\n",
|
| 678 |
+
" # Step 3: Save audio if path provided\n",
|
| 679 |
+
" if output_path:\n",
|
| 680 |
+
" sf.write(output_path, audio, sample_rate)\n",
|
| 681 |
+
" print(f\"💾 Audio saved to: {output_path}\")\n",
|
| 682 |
+
"\n",
|
| 683 |
+
" print(\"✅ Persian TTS generation completed!\")\n",
|
| 684 |
+
" return audio\n",
|
| 685 |
+
"\n",
|
| 686 |
+
" def test_tokenizer_consistency(self):\n",
|
| 687 |
+
" \"\"\"Test that tokenizer works consistently\"\"\"\n",
|
| 688 |
+
" test_texts = [\n",
|
| 689 |
+
" 'سلام دنیا',\n",
|
| 690 |
+
" 'مدل تبدیل متن به گفتار پارسی',\n",
|
| 691 |
+
" 'این یک تست است',\n",
|
| 692 |
+
" 'پردازش زبان طبیعی'\n",
|
| 693 |
+
" ]\n",
|
| 694 |
+
"\n",
|
| 695 |
+
" print(\"🧪 Testing tokenizer consistency:\")\n",
|
| 696 |
+
" for text in test_texts:\n",
|
| 697 |
+
" tokens = self.text_tokenizer.encode(text)\n",
|
| 698 |
+
" decoded = self.text_tokenizer.decode(tokens)\n",
|
| 699 |
+
" print(f\" '{text}' -> {len(tokens)} tokens -> '{decoded}'\")\n",
|
| 700 |
+
"\n",
|
| 701 |
+
" def compare_with_training_tokenizer(self, text: str):\n",
|
| 702 |
+
" \"\"\"Compare tokenizer output with training setup\"\"\"\n",
|
| 703 |
+
" print(f\"🔍 Tokenizer comparison for: '{text}'\")\n",
|
| 704 |
+
"\n",
|
| 705 |
+
" # Your training tokenizer\n",
|
| 706 |
+
" tokens = self.text_tokenizer.encode(text)\n",
|
| 707 |
+
"\n",
|
| 708 |
+
" # Print detailed tokenization info\n",
|
| 709 |
+
" print(f\"Phoneme tokens: {tokens}\")\n",
|
| 710 |
+
" print(f\"Token count: {len(tokens)}\")\n",
|
| 711 |
+
" print(f\"Vocabulary size range: [0, {max(tokens)}]\")\n",
|
| 712 |
+
"\n",
|
| 713 |
+
" # Try to decode back\n",
|
| 714 |
+
" try:\n",
|
| 715 |
+
" decoded = self.text_tokenizer.decode(tokens)\n",
|
| 716 |
+
" print(f\"Decoded back: '{decoded}'\")\n",
|
| 717 |
+
" except Exception: # avoid bare except: don't swallow KeyboardInterrupt/SystemExit\n",
|
| 718 |
+
" print(\"Could not decode tokens back to text\")\n",
|
| 719 |
+
"\n",
|
| 720 |
+
" return tokens\n",
|
| 721 |
+
"\n",
|
| 722 |
+
"# Example usage for Persian TTS\n",
|
| 723 |
+
"def main():\n",
|
| 724 |
+
" # Initialize the Persian TTS pipeline\n",
|
| 725 |
+
" persian_tts = PersianTTSInferencePipeline(\n",
|
| 726 |
+
" fastpitch_path=\"fastpitch.onnx\", # Your exported ONNX model\n",
|
| 727 |
+
" hifigan_path=\"hifigan.onnx\", # Your exported ONNX model\n",
|
| 728 |
+
" persian_dict_path=\"/content/drive/MyDrive/cNotebooks/perTTS/tts-nemo-fastpitch-hifigan-convert-to-onnx/persian-dict/persian-v4.0.dict\"\n",
|
| 729 |
+
" )\n",
|
| 730 |
+
"\n",
|
| 731 |
+
" # Test tokenizer first\n",
|
| 732 |
+
" persian_tts.test_tokenizer_consistency()\n",
|
| 733 |
+
"\n",
|
| 734 |
+
" # Generate speech for Persian text\n",
|
| 735 |
+
" persian_texts = [\n",
|
| 736 |
+
" 'سلام دنیا',\n",
|
| 737 |
+
" 'مدل تبدیل متن به گفتار پارسی',\n",
|
| 738 |
+
" 'این یک تست از سیستم تولید گفتار است',\n",
|
| 739 |
+
" 'پردازش زبان طبیعی فارسی'\n",
|
| 740 |
+
" ]\n",
|
| 741 |
+
"\n",
|
| 742 |
+
" for i, text in enumerate(persian_texts):\n",
|
| 743 |
+
" print(f\"\\n{'='*80}\")\n",
|
| 744 |
+
" try:\n",
|
| 745 |
+
" audio = persian_tts.text_to_speech(\n",
|
| 746 |
+
" text=text,\n",
|
| 747 |
+
" output_path=f\"persian_output_{i+1}.wav\",\n",
|
| 748 |
+
" sample_rate=22050\n",
|
| 749 |
+
" )\n",
|
| 750 |
+
" print(f\"✅ Successfully generated audio for text {i+1}\")\n",
|
| 751 |
+
"\n",
|
| 752 |
+
" except Exception as e:\n",
|
| 753 |
+
" print(f\"❌ Failed to generate audio for text {i+1}: {e}\")\n",
|
| 754 |
+
" # Debug the tokenization for this text\n",
|
| 755 |
+
" persian_tts.compare_with_training_tokenizer(text)\n",
|
| 756 |
+
"\n",
|
| 757 |
+
" return persian_tts\n",
|
| 758 |
+
"\n",
|
| 759 |
+
"if __name__ == \"__main__\":\n",
|
| 760 |
+
" tts_pipeline = main()"
|
| 761 |
+
],
|
| 762 |
+
"metadata": {
|
| 763 |
+
"colab": {
|
| 764 |
+
"base_uri": "https://localhost:8080/"
|
| 765 |
+
},
|
| 766 |
+
"id": "ZlNsGG8hTcRx",
|
| 767 |
+
"outputId": "8bce8a81-78ac-4c6b-e376-bcbc55b7a064"
|
| 768 |
+
},
|
| 769 |
+
"execution_count": 23,
|
| 770 |
+
"outputs": [
|
| 771 |
+
{
|
| 772 |
+
"output_type": "stream",
|
| 773 |
+
"name": "stdout",
|
| 774 |
+
"text": [
|
| 775 |
+
"Initializing Persian tokenizer...\n",
|
| 776 |
+
"FastPitch inputs: ['text']\n",
|
| 777 |
+
"FastPitch outputs: ['mel_spec', 'seq_lens', 'durs_predicted', 'log_durs_predicted', 'res']\n",
|
| 778 |
+
"HiFiGAN inputs: ['mel_spec']\n",
|
| 779 |
+
"HiFiGAN outputs: ['audio']\n",
|
| 780 |
+
"Test tokenization: 'مدل تبدیل متن به گفتار پارسی' -> [0, 26, 55, 9, 54, 25, 0, 3, 53, 1]...\n",
|
| 781 |
+
"🧪 Testing tokenizer consistency:\n",
|
| 782 |
+
" 'سلام دنیا' -> 13 tokens -> ' |s|a|l|Λ|m| |d|o|n|y|Λ| '\n",
|
| 783 |
+
" 'مدل تبدیل متن به گفتار پارسی' -> 35 tokens -> ' |m|o|d|e|l| |t|a|b|d|i|l| |m|a|t|n| |b|E| |g|o|f|t|Λ|r| |p|Λ|r|s|i| '\n",
|
| 784 |
+
" 'این یک تست است' -> 17 tokens -> ' |I|n| |y|e|k| |t|e|s|t| |ą|s|t| '\n",
|
| 785 |
+
" 'پردازش زبان طبیعی' -> 23 tokens -> ' |p|a|r|d|Λ|z|e|S| |z|a|b|Λ|n| |T|a|b|i|ʔ|i| '\n",
|
| 786 |
+
"\n",
|
| 787 |
+
"================================================================================\n",
|
| 788 |
+
"🎙️ Generating Persian speech for: 'سلام دنیا'\n",
|
| 789 |
+
"============================================================\n",
|
| 790 |
+
"📊 Generating mel spectrogram...\n",
|
| 791 |
+
"Text: 'سلام دنیا'\n",
|
| 792 |
+
"Tokens length: 13\n",
|
| 793 |
+
"First 20 tokens: [0, 14, 53, 25, 43, 26, 0, 9, 55, 27, 32, 43, 0]\n",
|
| 794 |
+
"Token range: [0, 55]\n",
|
| 795 |
+
"FastPitch inputs: ['text']\n",
|
| 796 |
+
" text: shape (1, 13), dtype int64\n",
|
| 797 |
+
"Generated mel spectrogram shape: (1, 80, 106)\n",
|
| 798 |
+
"Mel range: [-11.0657, -1.1308]\n",
|
| 799 |
+
"🔊 Generating audio...\n",
|
| 800 |
+
"HiFiGAN input shape: (1, 80, 106)\n",
|
| 801 |
+
"Generated audio shape: (27136,)\n",
|
| 802 |
+
"Audio range: [-0.1950, 0.1185]\n",
|
| 803 |
+
"Audio RMS: 0.0259\n",
|
| 804 |
+
"💾 Audio saved to: persian_output_1.wav\n",
|
| 805 |
+
"✅ Persian TTS generation completed!\n",
|
| 806 |
+
"✅ Successfully generated audio for text 1\n",
|
| 807 |
+
"\n",
|
| 808 |
+
"================================================================================\n",
|
| 809 |
+
"🎙️ Generating Persian speech for: 'مدل تبدیل متن به گفتار پارسی'\n",
|
| 810 |
+
"============================================================\n",
|
| 811 |
+
"📊 Generating mel spectrogram...\n",
|
| 812 |
+
"Text: 'مدل تبدیل متن به گفتار پارسی'\n",
|
| 813 |
+
"Tokens length: 35\n",
|
| 814 |
+
"First 20 tokens: [0, 26, 55, 9, 54, 25, 0, 3, 53, 1, 9, 50, 25, 0, 26, 53, 3, 27, 0, 1]\n",
|
| 815 |
+
"Token range: [0, 55]\n",
|
| 816 |
+
"FastPitch inputs: ['text']\n",
|
| 817 |
+
" text: shape (1, 35), dtype int64\n",
|
| 818 |
+
"Generated mel spectrogram shape: (1, 80, 240)\n",
|
| 819 |
+
"Mel range: [-10.2846, 0.0889]\n",
|
| 820 |
+
"🔊 Generating audio...\n",
|
| 821 |
+
"HiFiGAN input shape: (1, 80, 240)\n",
|
| 822 |
+
"Generated audio shape: (61440,)\n",
|
| 823 |
+
"Audio range: [-0.4547, 0.4433]\n",
|
| 824 |
+
"Audio RMS: 0.0730\n",
|
| 825 |
+
"💾 Audio saved to: persian_output_2.wav\n",
|
| 826 |
+
"✅ Persian TTS generation completed!\n",
|
| 827 |
+
"✅ Successfully generated audio for text 2\n",
|
| 828 |
+
"\n",
|
| 829 |
+
"================================================================================\n",
|
| 830 |
+
"🎙️ Generating Persian speech for: 'این یک تست از سیستم تولید گفتار است'\n",
|
| 831 |
+
"============================================================\n",
|
| 832 |
+
"📊 Generating mel spectrogram...\n",
|
| 833 |
+
"Text: 'این یک تست از سیستم تولید گفتار است'\n",
|
| 834 |
+
"Tokens length: 40\n",
|
| 835 |
+
"First 20 tokens: [0, 51, 27, 0, 32, 54, 24, 0, 3, 54, 14, 3, 0, 44, 12, 0, 14, 50, 14, 3]\n",
|
| 836 |
+
"Token range: [0, 55]\n",
|
| 837 |
+
"FastPitch inputs: ['text']\n",
|
| 838 |
+
" text: shape (1, 40), dtype int64\n",
|
| 839 |
+
"Generated mel spectrogram shape: (1, 80, 275)\n",
|
| 840 |
+
"Mel range: [-10.2355, 0.9884]\n",
|
| 841 |
+
"🔊 Generating audio...\n",
|
| 842 |
+
"HiFiGAN input shape: (1, 80, 275)\n",
|
| 843 |
+
"Generated audio shape: (70400,)\n",
|
| 844 |
+
"Audio range: [-0.6646, 0.4960]\n",
|
| 845 |
+
"Audio RMS: 0.1006\n",
|
| 846 |
+
"💾 Audio saved to: persian_output_3.wav\n",
|
| 847 |
+
"✅ Persian TTS generation completed!\n",
|
| 848 |
+
"✅ Successfully generated audio for text 3\n",
|
| 849 |
+
"\n",
|
| 850 |
+
"================================================================================\n",
|
| 851 |
+
"🎙️ Generating Persian speech for: 'پردازش زبان طبیعی فارسی'\n",
|
| 852 |
+
"============================================================\n",
|
| 853 |
+
"📊 Generating mel spectrogram...\n",
|
| 854 |
+
"Text: 'پردازش زبان طبیعی فارسی'\n",
|
| 855 |
+
"Tokens length: 29\n",
|
| 856 |
+
"First 20 tokens: [0, 2, 53, 11, 9, 43, 12, 54, 15, 0, 12, 53, 1, 43, 27, 0, 18, 53, 1, 50]\n",
|
| 857 |
+
"Token range: [0, 54]\n",
|
| 858 |
+
"FastPitch inputs: ['text']\n",
|
| 859 |
+
" text: shape (1, 29), dtype int64\n",
|
| 860 |
+
"Generated mel spectrogram shape: (1, 80, 214)\n",
|
| 861 |
+
"Mel range: [-11.0496, -0.7565]\n",
|
| 862 |
+
"🔊 Generating audio...\n",
|
| 863 |
+
"HiFiGAN input shape: (1, 80, 214)\n",
|
| 864 |
+
"Generated audio shape: (54784,)\n",
|
| 865 |
+
"Audio range: [-0.2387, 0.2220]\n",
|
| 866 |
+
"Audio RMS: 0.0293\n",
|
| 867 |
+
"💾 Audio saved to: persian_output_4.wav\n",
|
| 868 |
+
"✅ Persian TTS generation completed!\n",
|
| 869 |
+
"✅ Successfully generated audio for text 4\n"
|
| 870 |
+
]
|
| 871 |
+
}
|
| 872 |
+
]
|
| 873 |
+
},
|
| 874 |
+
{
|
| 875 |
+
"cell_type": "code",
|
| 876 |
+
"source": [],
|
| 877 |
+
"metadata": {
|
| 878 |
+
"id": "rLrOthW4VUqZ"
|
| 879 |
+
},
|
| 880 |
+
"execution_count": null,
|
| 881 |
+
"outputs": []
|
| 882 |
+
}
|
| 883 |
+
]
|
| 884 |
+
}
|