niobures committed
Commit 9506d83 · verified · 1 parent: 797b3d2

StyleTTS (ar, en, fr, ms, ru, uk, vi)

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +25 -0
  2. ar/StyleTTS2-LibriTTS-arabic/.gitattributes +36 -0
  3. ar/StyleTTS2-LibriTTS-arabic/README.md +142 -0
  4. ar/StyleTTS2-LibriTTS-arabic/config.yml +114 -0
  5. ar/StyleTTS2-LibriTTS-arabic/model.pth +3 -0
  6. ar/StyleTTS2-LibriTTS-arabic/source.txt +1 -0
  7. ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav +3 -0
  8. en/StyleTTS2-lite/.gitattributes +36 -0
  9. en/StyleTTS2-lite/.gitignore +8 -0
  10. en/StyleTTS2-lite/Audio/10_michael.wav +3 -0
  11. en/StyleTTS2-lite/Audio/11_fenrir.wav +3 -0
  12. en/StyleTTS2-lite/Audio/12_puck.wav +3 -0
  13. en/StyleTTS2-lite/Audio/13_echo.wav +3 -0
  14. en/StyleTTS2-lite/Audio/14_eric.wav +3 -0
  15. en/StyleTTS2-lite/Audio/15_liam.wav +3 -0
  16. en/StyleTTS2-lite/Audio/16_onyx.wav +3 -0
  17. en/StyleTTS2-lite/Audio/17_santa.wav +3 -0
  18. en/StyleTTS2-lite/Audio/18_adam.wav +3 -0
  19. en/StyleTTS2-lite/Audio/1_heart.wav +3 -0
  20. en/StyleTTS2-lite/Audio/2_belle.wav +3 -0
  21. en/StyleTTS2-lite/Audio/3_kore.wav +3 -0
  22. en/StyleTTS2-lite/Audio/4_sarah.wav +3 -0
  23. en/StyleTTS2-lite/Audio/5_nova.wav +3 -0
  24. en/StyleTTS2-lite/Audio/6_sky.wav +3 -0
  25. en/StyleTTS2-lite/Audio/7_alloy.wav +3 -0
  26. en/StyleTTS2-lite/Audio/8_jessica.wav +3 -0
  27. en/StyleTTS2-lite/Audio/9_river.wav +3 -0
  28. en/StyleTTS2-lite/LICENSE +21 -0
  29. en/StyleTTS2-lite/Models/base_model.pth +3 -0
  30. en/StyleTTS2-lite/Models/config.yaml +79 -0
  31. en/StyleTTS2-lite/Models/inference/model.pth +3 -0
  32. en/StyleTTS2-lite/Modules/__init__.py +1 -0
  33. en/StyleTTS2-lite/Modules/hifigan.py +477 -0
  34. en/StyleTTS2-lite/Modules/utils.py +14 -0
  35. en/StyleTTS2-lite/README.md +88 -0
  36. en/StyleTTS2-lite/inference.py +301 -0
  37. en/StyleTTS2-lite/meldataset.py +307 -0
  38. en/StyleTTS2-lite/models.py +532 -0
  39. en/StyleTTS2-lite/requirements.txt +10 -0
  40. en/StyleTTS2-lite/run.ipynb +176 -0
  41. en/StyleTTS2-lite/source.txt +1 -0
  42. en/StyleTTS2/.gitattributes +35 -0
  43. en/StyleTTS2/Multi0/config.yml +112 -0
  44. en/StyleTTS2/Multi0/config_30_e934.yml +22 -0
  45. en/StyleTTS2/Multi0/config_40_1c872.yml +22 -0
  46. en/StyleTTS2/Multi0/epoch_2nd_23_9ab0.pth +3 -0
  47. en/StyleTTS2/Multi0/epoch_2nd_30_e934.pth +3 -0
  48. en/StyleTTS2/Multi0/epoch_2nd_40_1c872.pth +3 -0
  49. en/StyleTTS2/Multi0/ref_audio.zip +3 -0
  50. en/StyleTTS2/README.md +7 -0
.gitattributes CHANGED
@@ -33,3 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/1_heart.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/10_michael.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/11_fenrir.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/12_puck.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/13_echo.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/14_eric.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/15_liam.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/16_onyx.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/17_santa.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/18_adam.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/2_belle.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/3_kore.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/4_sarah.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/5_nova.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/6_sky.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/7_alloy.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/8_jessica.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/9_river.wav filter=lfs diff=lfs merge=lfs -text
+ ru/StyleTTS_prokopenko_v1/voices/prokopenko/reference.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/3.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/vn_1.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/vn_2.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/vn_3.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/vn_4.wav filter=lfs diff=lfs merge=lfs -text
ar/StyleTTS2-LibriTTS-arabic/.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ synthesized_audio.wav filter=lfs diff=lfs merge=lfs -text
ar/StyleTTS2-LibriTTS-arabic/README.md ADDED
@@ -0,0 +1,142 @@
1
+ ---
2
+ language: ar
3
+ tags:
4
+ - text-to-speech
5
+ - tts
6
+ - arabic
7
+ - styletts2
8
+ - pl-bert
9
+ license: mit
10
+ hardware: H100
11
+ ---
12
+
13
+ # Model Card for Arabic StyleTTS2
14
+
15
+ This is an Arabic text-to-speech model based on the StyleTTS2 architecture and adapted specifically for Arabic synthesis. It produces good-quality Arabic speech, though not yet state-of-the-art, and further experimentation is needed to optimize it for Arabic. All training objectives from the original StyleTTS2 were kept, except the WavLM objectives, which were removed because they were designed primarily for English speech.
16
+
17
+ ## Example
18
+
19
+ Here is an example output from the model:
20
+
21
+ #### Sample 1
22
+ <audio controls>
23
+ <source src="https://huggingface.co/fadi77/StyleTTS2-LibriTTS-arabic/resolve/main/synthesized_audio.wav" type="audio/wav">
24
+ Your browser does not support the audio element.
25
+ </audio>
26
+
27
+ ## Efficiency and Performance
28
+
29
+ A key strength of this model lies in its efficiency and performance characteristics:
30
+
31
+ - **Compact Architecture**: Achieves impressive quality with <100M parameters
32
+ - **Limited Training Data**: Trained on only 22 hours of single-speaker audio
33
+ - **Transfer Learning**: Successfully fine-tuned from LibriTTS multi-speaker model to single-speaker Arabic
34
+ - **Resource Efficient**: Good quality achieved despite limited computational resources
35
+
36
+ Note: According to the StyleTTS2 authors, performance should improve further when a single-speaker model is trained from scratch rather than fine-tuned. We did not attempt this because of computational resource constraints, so there is likely room for even better results with more extensive training.
37
+
38
+
39
+ ## Model Details
40
+
41
+ ### Model Description
42
+
43
+ This model is a modified version of StyleTTS2, specifically adapted for Arabic text-to-speech synthesis. It incorporates a custom-trained PL-BERT model for Arabic language understanding and removes the WavLM adversarial training component (which was primarily designed for English).
44
+
45
+ - **Developed by:** Fadi (GitHub: Fadi987)
46
+ - **Model type:** Text-to-Speech (StyleTTS2 architecture)
47
+ - **Language(s):** Arabic
48
+ - **Finetuned from model:** [yl4579/StyleTTS2-LibriTTS](https://huggingface.co/yl4579/StyleTTS2-LibriTTS)
49
+
50
+ ### Model Sources
51
+
52
+ - **Repository:** [Fadi987/StyleTTS2](https://github.com/Fadi987/StyleTTS2)
53
+ - **Paper:** [StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models](https://arxiv.org/abs/2306.07691)
54
+ - **PL-BERT Model:** [fadi77/pl-bert](https://huggingface.co/fadi77/pl-bert)
55
+
56
+ ## Uses
57
+
58
+ ### Direct Use
59
+
60
+ The model can be used for generating Arabic speech from text. To use the model:
61
+
62
+ 1. Clone the StyleTTS2 repository:
63
+ ```bash
64
+ git clone https://github.com/Fadi987/StyleTTS2
65
+ cd StyleTTS2
66
+ ```
67
+
68
+ 2. Install `espeak-ng` for phonemization backend:
69
+ ```bash
70
+ # For macOS
71
+ brew install espeak-ng
72
+
73
+ # For Ubuntu/Debian
74
+ sudo apt-get install espeak-ng
75
+
76
+ # For Windows
77
+ # Download and install espeak-ng from: https://github.com/espeak-ng/espeak-ng/releases
78
+ ```
79
+
80
+ 3. Install Python dependencies:
81
+ ```bash
82
+ pip install -r requirements.txt
83
+ ```
84
+
85
+ 4. Download the `model.pth` and `config.yml` files from this repository
86
+
87
+ 5. Run inference using:
88
+ ```bash
89
+ python inference.py --config config.yml --model model.pth --text "الإِتْقَانُ يَحْتَاجُ إِلَى الْعَمَلِ وَالْمُثَابَرَة"
90
+ ```
91
+
92
+ Make sure to use properly diacritized Arabic text for best results.
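+
+ As an alternative to manually downloading the files in step 4, the checkpoint and config can also be fetched programmatically. The snippet below is only a sketch using the `huggingface_hub` package (not part of this repository; install it with `pip install huggingface_hub`):
+
+ ```python
+ # Download model.pth and config.yml from this model's Hugging Face repository.
+ from huggingface_hub import hf_hub_download
+
+ model_path = hf_hub_download(repo_id="fadi77/StyleTTS2-LibriTTS-arabic", filename="model.pth")
+ config_path = hf_hub_download(repo_id="fadi77/StyleTTS2-LibriTTS-arabic", filename="config.yml")
+ print(model_path, config_path)
+ ```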
93
+
94
+ ### Out-of-Scope Use
95
+
96
+ The model is specifically designed for Arabic text-to-speech synthesis and may not perform well for:
97
+ - Other languages
98
+ - Heavy dialect variations
99
+ - Non-diacritized Arabic text
100
+
101
+ ## Training Details
102
+
103
+ ### Training Data
104
+
105
+ - Training was performed on approximately 22 hours of Arabic audiobook data
106
+ - Dataset: [fadi77/arabic-audiobook-dataset-24khz](https://huggingface.co/datasets/fadi77/arabic-audiobook-dataset-24khz)
107
+ - The PL-BERT component was trained on fully diacritized Wikipedia Arabic text
108
+
109
+ ### Training Hyperparameters
110
+
111
+ - **Number of epochs:** 20
112
+ - **Diffusion training:** Started from epoch 5
113
+
114
+ ### Objectives
115
+ - **Training objectives:** All original StyleTTS2 objectives maintained, except WavLM adversarial training
116
+ - **Validation objectives:** Identical to original StyleTTS2 validation process
117
+
118
+ ### Compute Infrastructure
119
+ - **Hardware Type:** NVIDIA H100 GPU
120
+
121
+ ### Notable Modifications from Original StyleTTS2 in Architecture and Objectives
122
+ The architecture of the model follows that of StyleTTS2 with the following exceptions:
123
+ - Removed WavLM adversarial training component
124
+ - Custom PL-BERT trained for Arabic language
125
+
126
+
127
+ ## Citation
128
+
129
+ **BibTeX:**
130
+ ```bibtex
131
+ @article{styletts2,
132
+ title={StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models},
133
+ author={Li, Yinghao Aaron and Han, Cong and Raghavan, Vinay S. and Mischler, Gavin and Mesgarani, Nima},
134
+ journal={arXiv preprint arXiv:2306.07691},
135
+ year={2023}
136
+ }
137
+ ```
138
+
139
+ ## Model Card Contact
140
+
141
+ GitHub: [@Fadi987](https://github.com/Fadi987)
142
+ Hugging Face: [@fadi77](https://huggingface.co/fadi77)
ar/StyleTTS2-LibriTTS-arabic/config.yml ADDED
@@ -0,0 +1,114 @@
1
+ log_dir: "/style_tts2/Models/FineTune.AudioBook"
2
+ log_interval: 10
3
+ device: "cuda"
4
+ epochs: 25 # number of finetuning epochs
5
+ batch_size: 6
6
+ max_len: 300 # maximum number of frames
7
+ pretrained_model_repo: "yl4579/StyleTTS2-LibriTTS"
8
+ pretrained_model_filename: "Models/LibriTTS/epochs_2nd_00020.pth"
9
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
10
+ load_only_params: true # set to true if do not want to load epoch numbers and optimizer parameters
11
+
12
+ F0_path: "/root/Utils/JDC/bst.t7"
13
+ ASR_config: "/root/Utils/ASR/config.yml"
14
+ ASR_path: "/root/Utils/ASR/epoch_00080.pth"
15
+ PLBERT_repo_id: "fadi77/pl-bert"
16
+ PLBERT_dirname: "models/mlm_only_with_diacritics"
17
+
18
+ data_params:
19
+ train_data: "Data/youtube_train_list.txt"
20
+ val_data: "Data/youtube_val_list.txt"
21
+ root_path: "Youtube/wavs"
22
+ OOD_data: "Data/youtube_train_list.txt"
23
+ min_length: 50 # sample until texts with this size are obtained for OOD texts
24
+
25
+ preprocess_params:
26
+ sr: 24000
27
+ spect_params:
28
+ n_fft: 2048
29
+ win_length: 1200
30
+ hop_length: 300
31
+
32
+ model_params:
33
+ multispeaker: false
34
+
35
+ dim_in: 64
36
+ hidden_dim: 512
37
+ max_conv_dim: 512
38
+ n_layer: 3
39
+ n_mels: 80
40
+
41
+ n_token: 178 # number of phoneme tokens
42
+ max_dur: 50 # maximum duration of a single phoneme
43
+ style_dim: 128 # style vector size
44
+
45
+ dropout: 0.2
46
+
47
+ # config for decoder
48
+ decoder:
49
+ type: 'hifigan' # either hifigan or istftnet
50
+ resblock_kernel_sizes: [3,7,11]
51
+ upsample_rates : [10,5,3,2]
52
+ upsample_initial_channel: 512
53
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
54
+ upsample_kernel_sizes: [20,10,6,4]
55
+
56
+ # speech language model config
57
+ slm:
58
+ model: 'microsoft/wavlm-base-plus'
59
+ sr: 16000 # sampling rate of SLM
60
+ hidden: 768 # hidden size of SLM
61
+ nlayers: 13 # number of layers of SLM
62
+ initial_channel: 64 # initial channels of SLM discriminator head
63
+
64
+ # style diffusion model config
65
+ diffusion:
66
+ embedding_mask_proba: 0.1
67
+ # transformer config
68
+ transformer:
69
+ num_layers: 3
70
+ num_heads: 8
71
+ head_features: 64
72
+ multiplier: 2
73
+
74
+ # diffusion distribution config
75
+ dist:
76
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
77
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
78
+ mean: -3.0
79
+ std: 1.0
80
+
81
+ loss_params:
82
+ lambda_mel: 5. # mel reconstruction loss
83
+ lambda_gen: 1. # generator loss
84
+ lambda_slm: 1. # slm feature matching loss
85
+
86
+ lambda_mono: 1. # monotonic alignment loss (TMA)
87
+ lambda_s2s: 1. # sequence-to-sequence loss (TMA)
88
+
89
+ lambda_F0: 1. # F0 reconstruction loss
90
+ lambda_norm: 1. # norm reconstruction loss
91
+ lambda_dur: 1. # duration loss
92
+ lambda_ce: 20. # duration predictor probability output CE loss
93
+ lambda_sty: 1. # style reconstruction loss
94
+ lambda_diff: 1. # score matching loss
95
+
96
+ # Note: Current values for training are only adequate for second stage finetuning.
97
+ diffusion_training_epoch: 5
98
+ joint_training_epoch: 100
99
+
100
+ # Note: Current values for learning rates are very low. This is only adequate for second stage finetuning.
101
+ optimizer_params:
102
+ lr: 0.0001 # general learning rate
103
+ bert_lr: 0.00001 # learning rate for PLBERT
104
+ ft_lr: 0.0001 # learning rate for acoustic modules
105
+
106
+ slmadv_params:
107
+ min_len: 400 # minimum length of samples
108
+ max_len: 500 # maximum length of samples
109
+ batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
110
+ skip_update: 10 # update the discriminator every this iterations of generator update
111
+ thresh: 5 # gradient norm above which the gradient is scaled
112
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
113
+ sig: 1.5 # sigma for differentiable duration modeling
114
+
ar/StyleTTS2-LibriTTS-arabic/model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59d2323412f0c55c774b5675b45e5c12659c0d9e0f9e7012eecc6b7dd845b132
3
+ size 2201968238
ar/StyleTTS2-LibriTTS-arabic/source.txt ADDED
@@ -0,0 +1 @@
1
+ https://huggingface.co/fadi77/StyleTTS2-LibriTTS-arabic
ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f60e90523d734eff1b9f4b95cca49f22277df5cb4acd0bd347fde18f1c3b0469
3
+ size 1795058
en/StyleTTS2-lite/.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
en/StyleTTS2-lite/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ Modules/__pycache__/__init__.cpython-311.pyc
2
+ Modules/__pycache__/hifigan.cpython-311.pyc
3
+ Modules/__pycache__/utils.cpython-311.pyc
4
+ Modules/__pycache__/__init__.cpython-311.pyc
5
+ Modules/__pycache__/hifigan.cpython-311.pyc
6
+ Modules/__pycache__/utils.cpython-311.pyc
7
+ __pycache__/inference.cpython-311.pyc
8
+ __pycache__/models.cpython-311.pyc
en/StyleTTS2-lite/Audio/10_michael.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:733023e56be0434c66ac3b855c9aaac29d64f3a060c295a75e700ecfd34c16f0
3
+ size 620444
en/StyleTTS2-lite/Audio/11_fenrir.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abde72631473e48455d54cf585a0b1f229e6e77e9748ed1acef5678a40b08c08
3
+ size 537644
en/StyleTTS2-lite/Audio/12_puck.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:409cc59612472a0d4bb717613f539dafdb334411ed651ab6988f7fca8b922905
3
+ size 619244
en/StyleTTS2-lite/Audio/13_echo.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6925e6737a67fcbf8dce32d22d29d086d81627b82c6edbfc92b3706f27479ff
3
+ size 524444
en/StyleTTS2-lite/Audio/14_eric.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97b8bbf6a880e46730387ee7bb4bfba6c049ed58c4ec8680ec44f83df669eff1
3
+ size 573644
en/StyleTTS2-lite/Audio/15_liam.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95842cfe6d1093deb37447b0e5993b6c18f7e5591c3fb1fb3dd230641925de44
3
+ size 541244
en/StyleTTS2-lite/Audio/16_onyx.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25487ea7634b470392d787bfefb79da0a6a56dc26087ab27b62fa70aac43554d
3
+ size 514844
en/StyleTTS2-lite/Audio/17_santa.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80bc56619904ccbd93ed813fc54491f7b83eb8b8fd6c8a1626bd9177f96a23cd
3
+ size 583244
en/StyleTTS2-lite/Audio/18_adam.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b84a1b122273a45d98b5cbf725f4633e4cccb4a0788b8a46cc9faa4b8612419b
3
+ size 517244
en/StyleTTS2-lite/Audio/1_heart.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:978b285ff24f274a1f4fe4551b0d57a5df704ca5ce83284e839ffe96c2dc3dfd
3
+ size 547244
en/StyleTTS2-lite/Audio/2_belle.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459a64fa12dfb530320e8dab2f4057d7868ae4c020b447e8df3402149fa2be59
3
+ size 357644
en/StyleTTS2-lite/Audio/3_kore.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e55fc5c463d01d46c090be5457c59727ee52f2ecbeba8be9b38862850418c0c3
3
+ size 276044
en/StyleTTS2-lite/Audio/4_sarah.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ae7416f410104b0cedc1cc9c7365a89fd16a1599733f8f416e7618943d0acb8
3
+ size 640844
en/StyleTTS2-lite/Audio/5_nova.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:252c20a3f55bfe0ea7f42fbd638f6d4113ade7918630d1d37e166e11143f74f8
3
+ size 336044
en/StyleTTS2-lite/Audio/6_sky.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc985eb31aa7e2088f852c55282ec6ff72365486478a627bcd56ce2387a8d5b2
3
+ size 502844
en/StyleTTS2-lite/Audio/7_alloy.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd7868816449f2139e21661dcbc13d3d553c558627d4c50fada1f7c22ce7f86c
3
+ size 632444
en/StyleTTS2-lite/Audio/8_jessica.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8d7573154905c901281e767f25be2dbceae731c891da409f5b7c0be3096bd5d
3
+ size 477644
en/StyleTTS2-lite/Audio/9_river.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75a3b2fc9d4e93ded21f28cccc6ae7bf7a39bf04fed7f2d4d36e59db0792eedd
3
+ size 472844
en/StyleTTS2-lite/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Aaron (Yinghao) Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
en/StyleTTS2-lite/Models/base_model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:821deb4efee549b7024f37236e86b4bcb023870baf0ddb9f407fb514253340d1
3
+ size 1692092384
en/StyleTTS2-lite/Models/config.yaml ADDED
@@ -0,0 +1,79 @@
1
+ log_dir: ./Models/Finetune
2
+ save_freq: 1
3
+ log_interval: 10
4
+ device: cuda
5
+ epochs: 50
6
+ batch_size: 2
7
+ max_len: 310 # maximum number of frames
8
+ pretrained_model: ./Models/Finetune/base_model.pth
9
+ load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters
10
+ debug: true
11
+
12
+ data_params:
13
+ train_data: ../../Data_Speech/LibriTTS/train.txt
14
+ val_data: ../../Data_Speech/LibriTTS/val.txt
15
+ root_path: ../../Data_Speech/
16
+
17
+ symbol: #Total 178 symbols
18
+ pad: "$"
19
+ punctuation: ';:,.!?¡¿—…"«»“” '
20
+ letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
21
+ letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
22
+ extend: "" #ADD MORE SYMBOLS HERE
23
+
24
+ preprocess_params:
25
+ sr: 24000
26
+ spect_params:
27
+ n_fft: 2048
28
+ win_length: 1200
29
+ hop_length: 300
30
+
31
+ training_strats:
32
+ #All modules: 'decoder', 'predictor', 'text_encoder', 'style_encoder', 'text_aligner', 'pitch_extractor', 'mpd', 'msd'
33
+ freeze_modules: [''] # Not updated when training.
34
+ ignore_modules: [''] # Not loaded => fresh start. IMPORTANT: 'text_aligner' and 'pitch_extractor' are pretrained utility modules; DO NOT ignore them.
35
+
36
+ model_params:
37
+ dim_in: 64
38
+ hidden_dim: 512
39
+ max_conv_dim: 512
40
+ n_layer: 3
41
+ n_mels: 80
42
+ max_dur: 50 # maximum duration of a single phoneme
43
+ style_dim: 128 # style vector size
44
+ dropout: 0.2
45
+
46
+ ASR_params:
47
+ input_dim: 80
48
+ hidden_dim: 256
49
+ n_layers: 6
50
+ token_embedding_dim: 512
51
+
52
+ JDC_params:
53
+ num_class: 1
54
+ seq_len: 192
55
+
56
+ # config for decoder
57
+ decoder:
58
+ type: hifigan # either hifigan or istftnet
59
+ resblock_kernel_sizes: [3,7,11]
60
+ upsample_rates : [10,5,3,2]
61
+ upsample_initial_channel: 512
62
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
63
+ upsample_kernel_sizes: [20,10,6,4]
64
+
65
+ loss_params:
66
+ lambda_mel: 5. # mel reconstruction loss
67
+ lambda_gen: 1. # generator loss
68
+
69
+ lambda_mono: 1. # monotonic alignment loss (TMA)
70
+ lambda_s2s: 1. # sequence-to-sequence loss (TMA)
71
+
72
+ lambda_F0: 1. # F0 reconstruction loss
73
+ lambda_norm: 1. # norm reconstruction loss
74
+ lambda_dur: 1. # duration loss
75
+ lambda_ce: 20. # duration predictor probability output CE loss
76
+
77
+ optimizer_params:
78
+ lr: 0.0001 # general learning rate
79
+ ft_lr: 0.00001 # learning rate for acoustic modules
en/StyleTTS2-lite/Models/inference/model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2763d7b6c5477502d3f2a870eda76bbedae671f0107b15a1060fb4e6771ed634
3
+ size 359997166
en/StyleTTS2-lite/Modules/__init__.py ADDED
@@ -0,0 +1 @@
1
+
en/StyleTTS2-lite/Modules/hifigan.py ADDED
@@ -0,0 +1,477 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+ from .utils import init_weights, get_padding
7
+
8
+ import math
9
+ import random
10
+ import numpy as np
11
+
12
+ LRELU_SLOPE = 0.1
13
+
14
+ class AdaIN1d(nn.Module):
15
+ def __init__(self, style_dim, num_features):
16
+ super().__init__()
17
+ self.norm = nn.InstanceNorm1d(num_features, affine=False)
18
+ self.fc = nn.Linear(style_dim, num_features*2)
19
+
20
+ def forward(self, x, s):
21
+ h = self.fc(s)
22
+ h = h.view(h.size(0), h.size(1), 1)
23
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
24
+ return (1 + gamma) * self.norm(x) + beta
25
+
26
+ class AdaINResBlock1(torch.nn.Module):
27
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
28
+ super(AdaINResBlock1, self).__init__()
29
+ self.convs1 = nn.ModuleList([
30
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
31
+ padding=get_padding(kernel_size, dilation[0]))),
32
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
33
+ padding=get_padding(kernel_size, dilation[1]))),
34
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
35
+ padding=get_padding(kernel_size, dilation[2])))
36
+ ])
37
+ self.convs1.apply(init_weights)
38
+
39
+ self.convs2 = nn.ModuleList([
40
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
41
+ padding=get_padding(kernel_size, 1))),
42
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
43
+ padding=get_padding(kernel_size, 1))),
44
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
45
+ padding=get_padding(kernel_size, 1)))
46
+ ])
47
+ self.convs2.apply(init_weights)
48
+
49
+ self.adain1 = nn.ModuleList([
50
+ AdaIN1d(style_dim, channels),
51
+ AdaIN1d(style_dim, channels),
52
+ AdaIN1d(style_dim, channels),
53
+ ])
54
+
55
+ self.adain2 = nn.ModuleList([
56
+ AdaIN1d(style_dim, channels),
57
+ AdaIN1d(style_dim, channels),
58
+ AdaIN1d(style_dim, channels),
59
+ ])
60
+
61
+ self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
62
+ self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
63
+
64
+
65
+ def forward(self, x, s):
66
+ for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
67
+ xt = n1(x, s)
68
+ xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2) # Snake1D
69
+ xt = c1(xt)
70
+ xt = n2(xt, s)
71
+ xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2) # Snake1D
72
+ xt = c2(xt)
73
+ x = xt + x
74
+ return x
75
+
76
+ def remove_weight_norm(self):
77
+ for l in self.convs1:
78
+ remove_weight_norm(l)
79
+ for l in self.convs2:
80
+ remove_weight_norm(l)
81
+
82
+ class SineGen(torch.nn.Module):
83
+ """ Definition of sine generator
84
+ SineGen(samp_rate, harmonic_num = 0,
85
+ sine_amp = 0.1, noise_std = 0.003,
86
+ voiced_threshold = 0,
87
+ flag_for_pulse=False)
88
+ samp_rate: sampling rate in Hz
89
+ harmonic_num: number of harmonic overtones (default 0)
90
+ sine_amp: amplitude of sine-waveform (default 0.1)
+ noise_std: std of Gaussian noise (default 0.003)
+ voiced_threshold: F0 threshold for U/V classification (default 0)
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
94
+ Note: when flag_for_pulse is True, the first time step of a voiced
95
+ segment is always sin(np.pi) or cos(0)
96
+ """
97
+
98
+ def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
99
+ sine_amp=0.1, noise_std=0.003,
100
+ voiced_threshold=0,
101
+ flag_for_pulse=False):
102
+ super(SineGen, self).__init__()
103
+ self.sine_amp = sine_amp
104
+ self.noise_std = noise_std
105
+ self.harmonic_num = harmonic_num
106
+ self.dim = self.harmonic_num + 1
107
+ self.sampling_rate = samp_rate
108
+ self.voiced_threshold = voiced_threshold
109
+ self.flag_for_pulse = flag_for_pulse
110
+ self.upsample_scale = upsample_scale
111
+
112
+ def _f02uv(self, f0):
113
+ # generate uv signal
114
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
115
+ return uv
116
+
117
+ def _f02sine(self, f0_values):
118
+ """ f0_values: (batchsize, length, dim)
119
+ where dim indicates fundamental tone and overtones
120
+ """
121
+ # convert to F0 in rad. The integer part n can be ignored
122
+ # because 2 * np.pi * n doesn't affect phase
123
+ rad_values = (f0_values / self.sampling_rate) % 1
124
+
125
+ # initial phase noise (no noise for fundamental component)
126
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
127
+ device=f0_values.device)
128
+ rand_ini[:, 0] = 0
129
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
130
+
131
+ # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
132
+ if not self.flag_for_pulse:
133
+ # # for normal case
134
+
135
+ # # To prevent torch.cumsum numerical overflow,
136
+ # # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
137
+ # # Buffer tmp_over_one_idx indicates the time step to add -1.
138
+ # # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
139
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
140
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
141
+ # cumsum_shift = torch.zeros_like(rad_values)
142
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
143
+
144
+ # phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
145
+ rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
146
+ scale_factor=1/self.upsample_scale,
147
+ mode="linear").transpose(1, 2)
148
+
149
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
150
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
151
+ # cumsum_shift = torch.zeros_like(rad_values)
152
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
153
+
154
+ phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
155
+ phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
156
+ scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
157
+ sines = torch.sin(phase)
158
+
159
+ else:
160
+ # If necessary, make sure that the first time step of every
161
+ # voiced segments is sin(pi) or cos(0)
162
+ # This is used for pulse-train generation
163
+
164
+ # identify the last time step in unvoiced segments
165
+ uv = self._f02uv(f0_values)
166
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
167
+ uv_1[:, -1, :] = 1
168
+ u_loc = (uv < 1) * (uv_1 > 0)
169
+
170
+ # get the instantaneous phase
171
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
172
+ # different batch needs to be processed differently
173
+ for idx in range(f0_values.shape[0]):
174
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
175
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
176
+ # stores the accumulation of i.phase within
177
+ # each voiced segments
178
+ tmp_cumsum[idx, :, :] = 0
179
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
180
+
181
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
182
+ # within the previous voiced segment.
183
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
184
+
185
+ # get the sines
186
+ sines = torch.cos(i_phase * 2 * np.pi)
187
+ return sines
188
+
189
+ def forward(self, f0):
190
+ """ sine_tensor, uv = forward(f0)
191
+ input F0: tensor(batchsize=1, length, dim=1)
192
+ f0 for unvoiced steps should be 0
193
+ output sine_tensor: tensor(batchsize=1, length, dim)
194
+ output uv: tensor(batchsize=1, length, 1)
195
+ """
196
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
197
+ device=f0.device)
198
+ # fundamental component
199
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
200
+
201
+ # generate sine waveforms
202
+ sine_waves = self._f02sine(fn) * self.sine_amp
203
+
204
+ # generate uv signal
205
+ # uv = torch.ones(f0.shape)
206
+ # uv = uv * (f0 > self.voiced_threshold)
207
+ uv = self._f02uv(f0)
208
+
209
+ # noise: for unvoiced should be similar to sine_amp
210
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
211
+ # . for voiced regions is self.noise_std
212
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
213
+ noise = noise_amp * torch.randn_like(sine_waves)
214
+
215
+ # first: set the unvoiced part to 0 by uv
216
+ # then: additive noise
217
+ sine_waves = sine_waves * uv + noise
218
+ return sine_waves, uv, noise
219
+
220
+
221
+ class SourceModuleHnNSF(torch.nn.Module):
222
+ """ SourceModule for hn-nsf
223
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
224
+ add_noise_std=0.003, voiced_threshod=0)
225
+ sampling_rate: sampling_rate in Hz
226
+ harmonic_num: number of harmonic above F0 (default: 0)
227
+ sine_amp: amplitude of sine source signal (default: 0.1)
228
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
229
+ note that amplitude of noise in unvoiced is decided
230
+ by sine_amp
231
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
232
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
233
+ F0_sampled (batchsize, length, 1)
234
+ Sine_source (batchsize, length, 1)
235
+ noise_source (batchsize, length 1)
236
+ uv (batchsize, length, 1)
237
+ """
238
+
239
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
240
+ add_noise_std=0.003, voiced_threshod=0):
241
+ super(SourceModuleHnNSF, self).__init__()
242
+
243
+ self.sine_amp = sine_amp
244
+ self.noise_std = add_noise_std
245
+
246
+ # to produce sine waveforms
247
+ self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
248
+ sine_amp, add_noise_std, voiced_threshod)
249
+
250
+ # to merge source harmonics into a single excitation
251
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
252
+ self.l_tanh = torch.nn.Tanh()
253
+
254
+ def forward(self, x):
255
+ """
256
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
257
+ F0_sampled (batchsize, length, 1)
258
+ Sine_source (batchsize, length, 1)
259
+ noise_source (batchsize, length 1)
260
+ """
261
+ # source for harmonic branch
262
+ with torch.no_grad():
263
+ sine_wavs, uv, _ = self.l_sin_gen(x)
264
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
265
+
266
+ # source for noise branch, in the same shape as uv
267
+ noise = torch.randn_like(uv) * self.sine_amp / 3
268
+ return sine_merge, noise, uv
269
+ def padDiff(x):
270
+ return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0)
271
+
272
+ class Generator(torch.nn.Module):
273
+ def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes):
274
+ super(Generator, self).__init__()
275
+ self.num_kernels = len(resblock_kernel_sizes)
276
+ self.num_upsamples = len(upsample_rates)
277
+ resblock = AdaINResBlock1
278
+
279
+ self.m_source = SourceModuleHnNSF(
280
+ sampling_rate=24000,
281
+ upsample_scale=np.prod(upsample_rates),
282
+ harmonic_num=8, voiced_threshod=10)
283
+
284
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
285
+ self.noise_convs = nn.ModuleList()
286
+ self.ups = nn.ModuleList()
287
+ self.noise_res = nn.ModuleList()
288
+
289
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
290
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
291
+
292
+ self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
293
+ upsample_initial_channel//(2**(i+1)),
294
+ k, u, padding=(u//2 + u%2), output_padding=u%2)))
295
+
296
+ if i + 1 < len(upsample_rates): #
297
+ stride_f0 = np.prod(upsample_rates[i + 1:])
298
+ self.noise_convs.append(Conv1d(
299
+ 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
300
+ self.noise_res.append(resblock(c_cur, 7, [1,3,5], style_dim))
301
+ else:
302
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
303
+ self.noise_res.append(resblock(c_cur, 11, [1,3,5], style_dim))
304
+
305
+ self.resblocks = nn.ModuleList()
306
+
307
+ self.alphas = nn.ParameterList()
308
+ self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1)))
309
+
310
+ for i in range(len(self.ups)):
311
+ ch = upsample_initial_channel//(2**(i+1))
312
+ self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
313
+
314
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
315
+ self.resblocks.append(resblock(ch, k, d, style_dim))
316
+
317
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
318
+ self.ups.apply(init_weights)
319
+ self.conv_post.apply(init_weights)
320
+
321
+ def forward(self, x, s, f0):
322
+
323
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
324
+
325
+ har_source, noi_source, uv = self.m_source(f0)
326
+ har_source = har_source.transpose(1, 2)
327
+
328
+ for i in range(self.num_upsamples):
329
+ x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
330
+ x_source = self.noise_convs[i](har_source)
331
+ x_source = self.noise_res[i](x_source, s)
332
+
333
+ x = self.ups[i](x)
334
+ x = x + x_source
335
+
336
+ xs = None
337
+ for j in range(self.num_kernels):
338
+ if xs is None:
339
+ xs = self.resblocks[i*self.num_kernels+j](x, s)
340
+ else:
341
+ xs += self.resblocks[i*self.num_kernels+j](x, s)
342
+ x = xs / self.num_kernels
343
+ x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2)
344
+ x = self.conv_post(x)
345
+ x = torch.tanh(x)
346
+
347
+ return x
348
+
349
+ def remove_weight_norm(self):
350
+ print('Removing weight norm...')
351
+ for l in self.ups:
352
+ remove_weight_norm(l)
353
+ for l in self.resblocks:
354
+ l.remove_weight_norm()
355
+ # note: this Generator defines no conv_pre layer, so only conv_post carries weight norm
356
+ remove_weight_norm(self.conv_post)
357
+
358
+
359
+ class AdainResBlk1d(nn.Module):
360
+ def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
361
+ upsample='none', dropout_p=0.0):
362
+ super().__init__()
363
+ self.actv = actv
364
+ self.upsample_type = upsample
365
+ self.upsample = UpSample1d(upsample)
366
+ self.learned_sc = dim_in != dim_out
367
+ self._build_weights(dim_in, dim_out, style_dim)
368
+ self.dropout = nn.Dropout(dropout_p)
369
+
370
+ if upsample == 'none':
371
+ self.pool = nn.Identity()
372
+ else:
373
+ self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
374
+
375
+
376
+ def _build_weights(self, dim_in, dim_out, style_dim):
377
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
378
+ self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
379
+ self.norm1 = AdaIN1d(style_dim, dim_in)
380
+ self.norm2 = AdaIN1d(style_dim, dim_out)
381
+ if self.learned_sc:
382
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
383
+
384
+ def _shortcut(self, x):
385
+ x = self.upsample(x)
386
+ if self.learned_sc:
387
+ x = self.conv1x1(x)
388
+ return x
389
+
390
+ def _residual(self, x, s):
391
+ x = self.norm1(x, s)
392
+ x = self.actv(x)
393
+ x = self.pool(x)
394
+ x = self.conv1(self.dropout(x))
395
+ x = self.norm2(x, s)
396
+ x = self.actv(x)
397
+ x = self.conv2(self.dropout(x))
398
+ return x
399
+
400
+ def forward(self, x, s):
401
+ out = self._residual(x, s)
402
+ out = (out + self._shortcut(x)) / math.sqrt(2)
403
+ return out
404
+
405
+ class UpSample1d(nn.Module):
406
+ def __init__(self, layer_type):
407
+ super().__init__()
408
+ self.layer_type = layer_type
409
+
410
+ def forward(self, x):
411
+ if self.layer_type == 'none':
412
+ return x
413
+ else:
414
+ return F.interpolate(x, scale_factor=2, mode='nearest')
415
+
416
+ class Decoder(nn.Module):
417
+ def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
418
+ resblock_kernel_sizes = [3,7,11],
419
+ upsample_rates = [10,5,3,2],
420
+ upsample_initial_channel=512,
421
+ resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]],
422
+ upsample_kernel_sizes=[20,10,6,4]):
423
+ super().__init__()
424
+
425
+ self.decode = nn.ModuleList()
426
+
427
+ self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
428
+
429
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
430
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
431
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
432
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
433
+
434
+ self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
435
+
436
+ self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
437
+
438
+ self.asr_res = nn.Sequential(
439
+ weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
440
+ )
441
+
442
+
443
+ self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
444
+
445
+
446
+ def forward(self, asr, F0_curve, N, s):
447
+ if self.training:
448
+ downlist = [0, 3, 7]
449
+ F0_down = downlist[random.randint(0, 2)]
450
+ downlist = [0, 3, 7, 15]
451
+ N_down = downlist[random.randint(0, 3)]
452
+ if F0_down:
453
+ F0_curve = nn.functional.conv1d(F0_curve.unsqueeze(1), torch.ones(1, 1, F0_down).to(asr.device), padding=F0_down//2).squeeze(1) / F0_down
454
+ if N_down:
455
+ N = nn.functional.conv1d(N.unsqueeze(1), torch.ones(1, 1, N_down).to(asr.device), padding=N_down//2).squeeze(1) / N_down
456
+
457
+
458
+ F0 = self.F0_conv(F0_curve.unsqueeze(1))
459
+ N = self.N_conv(N.unsqueeze(1))
460
+
461
+ x = torch.cat([asr, F0, N], axis=1)
462
+ x = self.encode(x, s)
463
+
464
+ asr_res = self.asr_res(asr)
465
+
466
+ res = True
467
+ for block in self.decode:
468
+ if res:
469
+ x = torch.cat([x, asr_res, F0, N], axis=1)
470
+ x = block(x, s)
471
+ if block.upsample_type != "none":
472
+ res = False
473
+
474
+ x = self.generator(x, s, F0_curve)
475
+ return x
476
+
477
+
en/StyleTTS2-lite/Modules/utils.py ADDED
@@ -0,0 +1,14 @@
1
+ from torch.nn.utils import weight_norm  # required by apply_weight_norm below
+
+ def init_weights(m, mean=0.0, std=0.01):
2
+ classname = m.__class__.__name__
3
+ if classname.find("Conv") != -1:
4
+ m.weight.data.normal_(mean, std)
5
+
6
+
7
+ def apply_weight_norm(m):
8
+ classname = m.__class__.__name__
9
+ if classname.find("Conv") != -1:
10
+ weight_norm(m)
11
+
12
+
13
+ def get_padding(kernel_size, dilation=1):
14
+ return int((kernel_size*dilation - dilation)/2)
en/StyleTTS2-lite/README.md ADDED
@@ -0,0 +1,88 @@
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ base_model:
6
+ - yl4579/StyleTTS2-LibriTTS
7
+ pipeline_tag: text-to-speech
8
+ ---
9
+
10
+ # StyleTTS 2 - lite
11
+
12
+ ## Online Demo
13
+ Explore the model on Hugging Face Spaces:
14
+ https://huggingface.co/spaces/dangtr0408/StyleTTS2-lite-space
15
+
16
+ ## Fine-tune
17
+ https://github.com/dangtr0408/StyleTTS2-lite
18
+
19
+ ## Training Details
20
+
21
+ 1. **Base Checkpoint:** Initialized from the official StyleTTS 2 weights pre-trained on LibriTTS.
+ 2. **Removed Components:** PL-BERT, Diffusion, Prosodic Encoder, SLM, and Spectral Normalization.
+ 3. **Training Data:** LibriTTS corpus.
+ 4. **Training Schedule:** Trained for 100,000 steps.
25
+
26
+ ## Model Architecture
27
+
28
+ | Component | Parameters |
29
+ | -------------- | ------------- |
30
+ | Decoder | 54,289,492 |
+ | Predictor | 16,194,612 |
+ | Style Encoder | 13,845,440 |
+ | Text Encoder | 5,612,320 |
+ | **Total** | **89,941,576** |
35
+
36
+ ## Prerequisites
37
+
38
+ - **Python:** Version 3.7 or higher
39
+ - **Git:** To clone the repository
40
+
41
+ ## Installation & Setup
42
+
43
+ 1. Clone the repository
44
+
45
+ ```bash
46
+
47
+ git clone https://huggingface.co/dangtr0408/StyleTTS2-lite
48
+
49
+ cd StyleTTS2-lite
50
+
51
+ ```
52
+
53
+ 2. Install dependencies:
54
+
55
+ ```bash
56
+
57
+ pip install -r requirements.txt
58
+
59
+ ```
60
+
61
+
62
+
63
+ 3. On **Linux**, manually install espeak-ng:
64
+
65
+ ```bash
66
+
67
+ sudo apt-get install espeak-ng
68
+
69
+ ```
70
+
71
+ ## Usage Example
72
+
73
+ See the run.ipynb file; a minimal loading sketch is also shown below.
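+
+ The sketch below only constructs the model using the `StyleTTS2` class from `inference.py` in this repository; reference-audio style extraction and the actual synthesis call are demonstrated in run.ipynb. The checkpoint and config paths are assumptions based on this repository's layout.
+
+ ```python
+ # Build the inference model from the shipped config and checkpoint (paths assumed).
+ from inference import StyleTTS2
+
+ model = StyleTTS2(config_path="Models/config.yaml",
+                   models_path="Models/inference/model.pth")
+ # See run.ipynb for computing a speaker style from reference audio and generating speech.
+ ```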
74
+
75
+ ## Disclaimer
76
+
77
+ **Before using these pre-trained models, you agree to inform listeners that the speech samples are synthesized by the pre-trained models unless you have permission to use the voice you synthesize. That is, you agree to clone only voices whose speakers have granted permission, directly or by license, before making the synthesized voices public; otherwise, you must publicly announce that the voices are synthesized.**
78
+
79
+
80
+ ## References
81
+
82
+ - [yl4579/StyleTTS2](https://arxiv.org/abs/2306.07691)
83
+
84
+ - [jik876/hifi-gan](https://github.com/jik876/hifi-gan)
85
+
86
+ ## License
87
+
88
+ **Code: MIT License**
en/StyleTTS2-lite/inference.py ADDED
@@ -0,0 +1,301 @@
1
+ import re
2
+ import yaml
3
+ from munch import Munch
4
+ import numpy as np
5
+ import librosa
6
+ import noisereduce as nr
7
+ from meldataset import TextCleaner
8
+ import torch
9
+ import torchaudio
10
+ from nltk.tokenize import word_tokenize
11
+ import nltk
12
+ nltk.download('punkt_tab')
13
+
14
+ from models import ProsodyPredictor, TextEncoder, StyleEncoder
15
+ from Modules.hifigan import Decoder
16
+
17
+ class Preprocess:
18
+ def __text_normalize(self, text):
19
+ punctuation = [",", "、", "،", ";", "(", ".", "。", "…", "!", "–", ":", "?"]
20
+ map_to = "."
21
+ punctuation_pattern = re.compile(f"[{''.join(re.escape(p) for p in punctuation)}]")
22
+ #replace punctuation that acts like a comma or period
23
+ text = punctuation_pattern.sub(map_to, text)
24
+ #replace consecutive whitespace chars with a single space and strip leading/trailing spaces
25
+ text = re.sub(r'\s+', ' ', text).strip()
26
+ return text
27
+ def __merge_fragments(self, texts, n):
28
+ merged = []
29
+ i = 0
30
+ while i < len(texts):
31
+ fragment = texts[i]
32
+ j = i + 1
33
+ while len(fragment.split()) < n and j < len(texts):
34
+ fragment += ", " + texts[j]
35
+ j += 1
36
+ merged.append(fragment)
37
+ i = j
38
+ if len(merged[-1].split()) < n and len(merged) > 1: #handle last sentence
39
+ merged[-2] = merged[-2] + ", " + merged[-1]
40
+ del merged[-1]
41
+ else:
42
+ merged[-1] = merged[-1]
43
+ return merged
44
+ def wave_preprocess(self, wave):
45
+ to_mel = torchaudio.transforms.MelSpectrogram(n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
46
+ mean, std = -4, 4
47
+ wave_tensor = torch.from_numpy(wave).float()
48
+ mel_tensor = to_mel(wave_tensor)
49
+ mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
50
+ return mel_tensor
51
+ def text_preprocess(self, text, n_merge=12):
52
+ text_norm = self.__text_normalize(text).split(".")#split by sentences.
53
+ text_norm = [s.strip() for s in text_norm]
54
+ text_norm = list(filter(lambda x: x != '', text_norm)) #filter empty index
55
+ text_norm = self.__merge_fragments(text_norm, n=n_merge) #merge if a sentence has fewer than n words
56
+ return text_norm
57
+ def length_to_mask(self, lengths):
58
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
59
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
60
+ return mask
61
+
62
+ #For inference only
63
+ class StyleTTS2(torch.nn.Module):
64
+ def __init__(self, config_path, models_path):
65
+ super().__init__()
66
+ self.register_buffer("get_device", torch.empty(0))
67
+ self.preprocess = Preprocess()
68
+ self.ref_s = None
69
+ config = yaml.safe_load(open(config_path, "r", encoding="utf-8"))
70
+
71
+ try:
72
+ symbols = (
73
+ list(config['symbol']['pad']) +
74
+ list(config['symbol']['punctuation']) +
75
+ list(config['symbol']['letters']) +
76
+ list(config['symbol']['letters_ipa']) +
77
+ list(config['symbol']['extend'])
78
+ )
79
+ symbol_dict = {}
80
+ for i in range(len((symbols))):
81
+ symbol_dict[symbols[i]] = i
82
+
83
+ n_token = len(symbol_dict) + 1
84
+ print("\nFound:", n_token, "symbols")
85
+ except Exception as e:
86
+ print(f"\nERROR: Cannot find {e} in config file!\nYour config file is likely outdated, please download updated version from the repository.")
87
+ raise SystemExit(1)
88
+
89
+ args = self.__recursive_munch(config['model_params'])
90
+ args['n_token'] = n_token
91
+
92
+ self.cleaner = TextCleaner(symbol_dict, debug=False)
93
+
94
+ assert args.decoder.type in ['hifigan'], 'Decoder type unknown'
95
+
96
+ self.decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
97
+ resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
98
+ upsample_rates = args.decoder.upsample_rates,
99
+ upsample_initial_channel=args.decoder.upsample_initial_channel,
100
+ resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
101
+ upsample_kernel_sizes=args.decoder.upsample_kernel_sizes)
102
+ self.predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
103
+ self.text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
104
+ self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
105
+
106
+ self.__load_models(models_path)
107
+
108
+ def __recursive_munch(self, d):
109
+ if isinstance(d, dict):
110
+ return Munch((k, self.__recursive_munch(v)) for k, v in d.items())
111
+ elif isinstance(d, list):
112
+ return [self.__recursive_munch(v) for v in d]
113
+ else:
114
+ return d
115
+
116
+ def __replace_outliers_zscore(self, tensor, threshold=3.0, factor=0.95):
117
+ mean = tensor.mean()
118
+ std = tensor.std()
119
+ z = (tensor - mean) / std
120
+
121
+ # Identify outliers
122
+ outlier_mask = torch.abs(z) > threshold
123
+ # Compute replacement value, respecting sign
124
+ sign = torch.sign(tensor - mean)
125
+ replacement = mean + sign * (threshold * std * factor)
126
+
127
+ result = tensor.clone()
128
+ result[outlier_mask] = replacement[outlier_mask]
129
+
130
+ return result
131
+
132
+ def __load_models(self, models_path):
133
+ module_params = []
134
+ model = {'decoder':self.decoder, 'predictor':self.predictor, 'text_encoder':self.text_encoder, 'style_encoder':self.style_encoder}
135
+
136
+ params_whole = torch.load(models_path, map_location='cpu')
137
+ params = params_whole['net']
138
+ params = {key: value for key, value in params.items() if key in model.keys()}
139
+
140
+ for key in model:
141
+ try:
142
+ model[key].load_state_dict(params[key])
143
+ except:
144
+ from collections import OrderedDict
145
+ state_dict = params[key]
146
+ new_state_dict = OrderedDict()
147
+ for k, v in state_dict.items():
148
+ name = k[7:] # remove `module.`
149
+ new_state_dict[name] = v
150
+ model[key].load_state_dict(new_state_dict, strict=False)
151
+
152
+ total_params = sum(p.numel() for p in model[key].parameters())
153
+ print(key,":",total_params)
154
+ module_params.append(total_params)
155
+
156
+ print('\nTotal',":",sum(module_params))
157
+
158
+ def __compute_style(self, path, denoise, split_dur):
159
+ device = self.get_device.device
160
+ denoise = min(denoise, 1)
161
+ if split_dur != 0: split_dur = max(int(split_dur), 1)
162
+ max_samples = 24000*20 #max 20 seconds ref audio
163
+ print("Computing the style for:", path)
164
+
165
+ wave, sr = librosa.load(path, sr=24000)
166
+ audio, index = librosa.effects.trim(wave, top_db=30)
167
+ if sr != 24000:
168
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
169
+ if len(audio) > max_samples:
170
+ audio = audio[:max_samples]
171
+
172
+ if denoise > 0.0:
173
+ audio_denoise = nr.reduce_noise(y=audio, sr=sr, n_fft=2048, win_length=1200, hop_length=300)
174
+ audio = audio*(1-denoise) + audio_denoise*denoise
175
+
176
+ with torch.no_grad():
177
+ if split_dur>0 and len(audio)/sr>=4: #Only effective if audio length is >= 4s
178
+ #This option splits the ref audio into multiple parts, computes a style for each part, and averages them
179
+ count = 0
180
+ ref_s = None
181
+ jump = sr*split_dur
182
+ total_len = len(audio)
183
+
184
+ #Need to init before the loop
185
+ mel_tensor = self.preprocess.wave_preprocess(audio[0:jump]).to(device)
186
+ ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
187
+ count += 1
188
+ for i in range(jump, total_len, jump):
189
+ if i+jump >= total_len:
190
+ left_dur = (total_len-i)/sr
191
+ if left_dur >= 1: #Still count the final chunk if its leftover duration is >= 1s
192
+ mel_tensor = self.preprocess.wave_preprocess(audio[i:total_len]).to(device)
193
+ ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
194
+ count += 1
195
+ continue
196
+ mel_tensor = self.preprocess.wave_preprocess(audio[i:i+jump]).to(device)
197
+ ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
198
+ count += 1
199
+ ref_s /= count
200
+ else:
201
+ mel_tensor = self.preprocess.wave_preprocess(audio).to(device)
202
+ ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
203
+
204
+ return ref_s
205
+
206
+ def __inference(self, phonem, ref_s, speed=1, prev_d_mean=0, t=0.1):
207
+ device = self.get_device.device
208
+ speed = min(max(speed, 0.0001), 2) #clamp speed to (0, 2]
209
+
210
+ phonem = ' '.join(word_tokenize(phonem))
211
+ tokens = self.cleaner(phonem)
212
+ tokens.insert(0, 0)
213
+ tokens.append(0)
214
+ tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
215
+
216
+ with torch.no_grad():
217
+ input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
218
+ text_mask = self.preprocess.length_to_mask(input_lengths).to(device)
219
+
220
+ # encode
221
+ t_en = self.text_encoder(tokens, input_lengths, text_mask)
222
+ s = ref_s.to(device)
223
+
224
+ # calculate alignment
225
+ d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask)
226
+ x, _ = self.predictor.lstm(d)
227
+ duration = self.predictor.duration_proj(x)
228
+ duration = torch.sigmoid(duration).sum(axis=-1)
229
+
230
+ if prev_d_mean != 0:#Stabilize speaking speed between splits
231
+ dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device)
232
+ else:
233
+ dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
234
+ duration = duration*(1-t) + dur_stats*t
235
+ duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outliers
236
+
237
+ duration /= speed
238
+
239
+ pred_dur = torch.round(duration.squeeze()).clamp(min=1)
240
+ pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
241
+ c_frame = 0
242
+ for i in range(pred_aln_trg.size(0)):
243
+ pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
244
+ c_frame += int(pred_dur[i].data)
245
+ alignment = pred_aln_trg.unsqueeze(0).to(device)
246
+
247
+ # encode prosody
248
+ en = (d.transpose(-1, -2) @ alignment)
249
+ F0_pred, N_pred = self.predictor.F0Ntrain(en, s)
250
+ asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
251
+
252
+ out = self.decoder(asr, F0_pred, N_pred, s)
253
+
254
+ return out.squeeze().cpu().numpy(), duration.mean()
255
+
256
+ def get_styles(self, speaker, denoise=0.3, avg_style=True, load_styles=False):
257
+ if not load_styles:
258
+ if avg_style: split_dur = 3
259
+ else: split_dur = 0
260
+ self.ref_s = self.__compute_style(speaker['path'], denoise=denoise, split_dur=split_dur)
261
+ else:
262
+ if self.ref_s is None:
263
+ raise Exception("Have to compute or load the styles first!")
264
+ style = {
265
+ 'style': self.ref_s,
266
+ 'path': speaker['path'],
267
+ 'speed': speaker['speed'],
268
+ }
269
+ return style
270
+
271
+ def save_styles(self, save_dir):
272
+ if self.ref_s is not None:
273
+ torch.save(self.ref_s, save_dir)
274
+ print("Saved styles!")
275
+ else:
276
+ raise Exception("Have to compute the styles before saving it.")
277
+
278
+ def load_styles(self, save_dir):
279
+ try:
280
+ self.ref_s = torch.load(save_dir)
281
+ print("Loaded styles!")
282
+ except Exception as e:
283
+ print(e)
284
+
285
+ def generate(self, phonem, style, stabilize=True, n_merge=16):
286
+ if stabilize: smooth_value=0.2
287
+ else: smooth_value=0
288
+
289
+ list_wav = []
290
+ prev_d_mean = 0
291
+
292
+ print("Generating Audio...")
293
+ text_norm = self.preprocess.text_preprocess(phonem, n_merge=n_merge)
294
+ for sentence in text_norm:
295
+ wav, prev_d_mean = self.__inference(sentence, style['style'], speed=style['speed'], prev_d_mean=prev_d_mean, t=smooth_value)
296
+ wav = wav[4000:-4000] #Remove weird pulse and silent tokens
297
+ list_wav.append(wav)
298
+
299
+ final_wav = np.concatenate(list_wav)
300
+ final_wav = np.concatenate([np.zeros([4000]), final_wav, np.zeros([4000])], axis=0) # add padding
301
+ return final_wav
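One capability of the StyleTTS2 class above that the bundled run.ipynb does not demonstrate is caching a computed reference style with save_styles/load_styles, so the style encoder only has to process the reference audio once. A minimal sketch, assuming the config, model, and audio paths used in run.ipynb below; the cache filename is illustrative:

from inference import StyleTTS2

model = StyleTTS2("Models/config.yaml", "Models/inference/model.pth").eval()
speaker = {"path": "./Audio/1_heart.wav", "speed": 1.0}

# First run: compute the style from the reference audio, then cache it to disk.
style = model.get_styles(speaker, denoise=0.3, avg_style=True)
model.save_styles("style_cache.pt")  # illustrative cache path

# Later runs: reload the cached style instead of recomputing it.
model.load_styles("style_cache.pt")
style = model.get_styles(speaker, load_styles=True)
# style['style'] now holds the cached tensor; pass it to model.generate(phonemes, style) as in run.ipynb.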
en/StyleTTS2-lite/meldataset.py ADDED
@@ -0,0 +1,307 @@
1
+ #coding: utf-8
2
+ import os.path as osp
3
+ import random
4
+ import numpy as np
5
+ import random
6
+ import soundfile as sf
7
+ import librosa
8
+
9
+ import torch
10
+ import torchaudio
11
+ import torch.utils.data
12
+ import torch.distributed as dist
13
+ from multiprocessing import Pool
14
+
15
+ import logging
16
+ logger = logging.getLogger(__name__)
17
+ logger.setLevel(logging.DEBUG)
18
+
19
+ import pandas as pd
20
+
21
+ class TextCleaner:
22
+ def __init__(self, symbol_dict, debug=True):
23
+ self.word_index_dictionary = symbol_dict
24
+ self.debug = debug
25
+ def __call__(self, text):
26
+ indexes = []
27
+ for char in text:
28
+ try:
29
+ indexes.append(self.word_index_dictionary[char])
30
+ except KeyError as e:
31
+ if self.debug:
32
+ print("\nWARNING UNKNOWN IPA CHARACTERS/LETTERS: ", char)
33
+ print("To ignore set 'debug' to false in the config")
34
+ continue
35
+ return indexes
36
+
37
+ np.random.seed(1)
38
+ random.seed(1)
39
+ SPECT_PARAMS = {
40
+ "n_fft": 2048,
41
+ "win_length": 1200,
42
+ "hop_length": 300
43
+ }
44
+ MEL_PARAMS = {
45
+ "n_mels": 80,
46
+ }
47
+
48
+ to_mel = torchaudio.transforms.MelSpectrogram(
49
+ n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
50
+ mean, std = -4, 4
51
+
52
+ def preprocess(wave):
53
+ wave_tensor = torch.from_numpy(wave).float()
54
+ mel_tensor = to_mel(wave_tensor)
55
+ mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
56
+ return mel_tensor
57
+
58
+ class FilePathDataset(torch.utils.data.Dataset):
59
+ def __init__(self,
60
+ data_list,
61
+ root_path,
62
+ symbol_dict,
63
+ sr=24000,
64
+ data_augmentation=False,
65
+ validation=False,
66
+ debug=True
67
+ ):
68
+
69
+ _data_list = [l.strip().split('|') for l in data_list]
70
+ self.data_list = _data_list #[data if len(data) == 3 else (*data, 0) for data in _data_list] #append speakerid=0 for all
71
+ self.text_cleaner = TextCleaner(symbol_dict, debug)
72
+ self.sr = sr
73
+
74
+ self.df = pd.DataFrame(self.data_list)
75
+
76
+ self.to_melspec = torchaudio.transforms.MelSpectrogram(**MEL_PARAMS)
77
+
78
+ self.mean, self.std = -4, 4
79
+ self.data_augmentation = data_augmentation and (not validation)
80
+ self.max_mel_length = 192
81
+
82
+ self.root_path = root_path
83
+
84
+ def __len__(self):
85
+ return len(self.data_list)
86
+
87
+ def __getitem__(self, idx):
88
+ data = self.data_list[idx]
89
+ path = data[0]
90
+
91
+ wave, text_tensor = self._load_tensor(data)
92
+
93
+ mel_tensor = preprocess(wave).squeeze()
94
+
95
+ acoustic_feature = mel_tensor.squeeze()
96
+ length_feature = acoustic_feature.size(1)
97
+ acoustic_feature = acoustic_feature[:, :(length_feature - length_feature % 2)]
98
+
99
+ return acoustic_feature, text_tensor, path, wave
100
+
101
+ def _load_tensor(self, data):
102
+ wave_path, text = data
103
+ wave, sr = sf.read(osp.join(self.root_path, wave_path))
104
+ if wave.shape[-1] == 2:
105
+ wave = wave[:, 0].squeeze()
106
+ if sr != 24000:
107
+ wave = librosa.resample(wave, orig_sr=sr, target_sr=24000)
108
+ print(wave_path, sr)
109
+
110
+ # Adding half a second padding.
111
+ wave = np.concatenate([np.zeros([12000]), wave, np.zeros([12000])], axis=0)
112
+
113
+ text = self.text_cleaner(text)
114
+
115
+ text.insert(0, 0)
116
+ text.append(0)
117
+
118
+ text = torch.LongTensor(text)
119
+
120
+ return wave, text
121
+
122
+ def _load_data(self, data):
123
+ wave, text_tensor = self._load_tensor(data)
124
+ mel_tensor = preprocess(wave).squeeze()
125
+
126
+ mel_length = mel_tensor.size(1)
127
+ if mel_length > self.max_mel_length:
128
+ random_start = np.random.randint(0, mel_length - self.max_mel_length)
129
+ mel_tensor = mel_tensor[:, random_start:random_start + self.max_mel_length]
130
+
131
+ return mel_tensor
132
+
133
+
134
+ class Collater(object):
135
+ """
136
+ Args:
137
+ adaptive_batch_size (bool): if true, decrease batch size when long data comes.
138
+ """
139
+
140
+ def __init__(self, return_wave=False):
141
+ self.text_pad_index = 0
142
+ self.min_mel_length = 192
143
+ self.max_mel_length = 192
144
+ self.return_wave = return_wave
145
+
146
+
147
+ def __call__(self, batch):
148
+ batch_size = len(batch)
149
+
150
+ # sort by mel length
151
+ lengths = [b[0].shape[1] for b in batch]
152
+ batch_indexes = np.argsort(lengths)[::-1]
153
+ batch = [batch[bid] for bid in batch_indexes]
154
+
155
+ nmels = batch[0][0].size(0)
156
+ max_mel_length = max([b[0].shape[1] for b in batch])
157
+ max_text_length = max([b[1].shape[0] for b in batch])
158
+
159
+ mels = torch.zeros((batch_size, nmels, max_mel_length)).float()
160
+ texts = torch.zeros((batch_size, max_text_length)).long()
161
+
162
+ input_lengths = torch.zeros(batch_size).long()
163
+ output_lengths = torch.zeros(batch_size).long()
164
+ paths = ['' for _ in range(batch_size)]
165
+ waves = [None for _ in range(batch_size)]
166
+
167
+ for bid, (mel, text, path, wave) in enumerate(batch):
168
+ mel_size = mel.size(1)
169
+ text_size = text.size(0)
170
+ mels[bid, :, :mel_size] = mel
171
+ texts[bid, :text_size] = text
172
+ input_lengths[bid] = text_size
173
+ output_lengths[bid] = mel_size
174
+ paths[bid] = path
175
+
176
+ waves[bid] = wave
177
+
178
+ return waves, texts, input_lengths, mels, output_lengths
179
+
180
+
181
+ def get_length(wave_path, root_path):
182
+ info = sf.info(osp.join(root_path, wave_path))
183
+ return info.frames * (24000 / info.samplerate)
184
+
185
+ def build_dataloader(path_list,
186
+ root_path,
187
+ symbol_dict,
188
+ validation=False,
189
+ batch_size=4,
190
+ num_workers=1,
191
+ device='cpu',
192
+ collate_config={},
193
+ dataset_config={}):
194
+
195
+ dataset = FilePathDataset(path_list, root_path, symbol_dict, validation=validation, **dataset_config)
196
+ collate_fn = Collater(**collate_config)
197
+
198
+ print("Getting sample lengths...")
199
+
200
+ num_processes = num_workers * 2
201
+ if num_processes != 0:
202
+ list_of_tuples = [(d[0], root_path) for d in dataset.data_list]
203
+ with Pool(processes=num_processes) as pool:
204
+ sample_lengths = pool.starmap(get_length, list_of_tuples, chunksize=16)
205
+ else:
206
+ sample_lengths = []
207
+ for d in dataset.data_list:
208
+ sample_lengths.append(get_length(d[0], root_path))
209
+
210
+ data_loader = torch.utils.data.DataLoader(
211
+ dataset,
212
+ num_workers=num_workers,
213
+ batch_sampler=BatchSampler(
214
+ sample_lengths,
215
+ batch_size,
216
+ shuffle=(not validation),
217
+ drop_last=(not validation),
218
+ num_replicas=1,
219
+ rank=0,
220
+ ),
221
+ collate_fn=collate_fn,
222
+ pin_memory=(device != "cpu"),
223
+ )
224
+
225
+ return data_loader
226
+
227
+ #https://github.com/duerig/StyleTTS2/
228
+ class BatchSampler(torch.utils.data.Sampler):
229
+ def __init__(
230
+ self,
231
+ sample_lengths,
232
+ batch_sizes,
233
+ num_replicas=None,
234
+ rank=None,
235
+ shuffle=True,
236
+ drop_last=False,
237
+ ):
238
+ self.batch_sizes = batch_sizes
239
+ if num_replicas is None:
240
+ self.num_replicas = dist.get_world_size()
241
+ else:
242
+ self.num_replicas = num_replicas
243
+ if rank is None:
244
+ self.rank = dist.get_rank()
245
+ else:
246
+ self.rank = rank
247
+ self.shuffle = shuffle
248
+ self.drop_last = drop_last
249
+
250
+ self.time_bins = {}
251
+ self.epoch = 0
252
+ self.total_len = 0
253
+ self.last_bin = None
254
+
255
+ for i in range(len(sample_lengths)):
256
+ bin_num = self.get_time_bin(sample_lengths[i])
257
+ if bin_num != -1:
258
+ if bin_num not in self.time_bins:
259
+ self.time_bins[bin_num] = []
260
+ self.time_bins[bin_num].append(i)
261
+
262
+ for key in self.time_bins.keys():
263
+ val = self.time_bins[key]
264
+ total_batch = self.batch_sizes * self.num_replicas
265
+ self.total_len += len(val) // total_batch
266
+ if not self.drop_last and len(val) % total_batch != 0:
267
+ self.total_len += 1
268
+
269
+ def __iter__(self):
270
+ sampler_order = list(self.time_bins.keys())
271
+ sampler_indices = []
272
+
273
+ if self.shuffle:
274
+ sampler_indices = torch.randperm(len(sampler_order)).tolist()
275
+ else:
276
+ sampler_indices = list(range(len(sampler_order)))
277
+
278
+ for index in sampler_indices:
279
+ key = sampler_order[index]
280
+ current_bin = self.time_bins[key]
281
+ dist_sampler = torch.utils.data.distributed.DistributedSampler(  # renamed to avoid shadowing torch.distributed (imported as dist)
282
+ current_bin,
283
+ num_replicas=self.num_replicas,
284
+ rank=self.rank,
285
+ shuffle=self.shuffle,
286
+ drop_last=self.drop_last,
287
+ )
288
+ dist_sampler.set_epoch(self.epoch)
289
+ sampler = torch.utils.data.sampler.BatchSampler(
290
+ dist_sampler, self.batch_sizes, self.drop_last
291
+ )
292
+ for item_list in sampler:
293
+ self.last_bin = key
294
+ yield [current_bin[i] for i in item_list]
295
+
296
+ def __len__(self):
297
+ return self.total_len
298
+
299
+ def set_epoch(self, epoch):
300
+ self.epoch = epoch
301
+
302
+ def get_time_bin(self, sample_count):
303
+ result = -1
304
+ frames = sample_count // 300
305
+ if frames >= 20:
306
+ result = (frames - 20) // 20
307
+ return result
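The duration bucketing in BatchSampler.get_time_bin above works on the sample counts returned by get_length (already scaled to 24 kHz): one mel frame per 300 samples, clips shorter than 20 frames (0.25 s) are dropped, and every further 20 frames opens a new bucket so that each batch holds clips of similar length. A standalone re-implementation for illustration only, not part of the repository:

def time_bin(sample_count):
    frames = sample_count // 300            # hop_length = 300 at 24 kHz
    return (frames - 20) // 20 if frames >= 20 else -1

assert time_bin(int(0.2 * 24000)) == -1    # 16 frames: too short, clip is skipped
assert time_bin(int(3.0 * 24000)) == 11    # 240 frames -> bucket 11
assert time_bin(int(5.5 * 24000)) == 21    # 440 frames -> bucket 21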
en/StyleTTS2-lite/models.py ADDED
@@ -0,0 +1,532 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch.nn.utils import weight_norm
6
+
7
+ from munch import Munch
8
+
9
+ class LearnedDownSample(nn.Module):
10
+ def __init__(self, layer_type, dim_in):
11
+ super().__init__()
12
+ self.layer_type = layer_type
13
+
14
+ if self.layer_type == 'none':
15
+ self.conv = nn.Identity()
16
+ elif self.layer_type == 'timepreserve':
17
+ self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0))
18
+ elif self.layer_type == 'half':
19
+ self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1)
20
+ else:
21
+ raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
22
+
23
+ def forward(self, x):
24
+ return self.conv(x)
25
+
26
+ class LearnedUpSample(nn.Module):
27
+ def __init__(self, layer_type, dim_in):
28
+ super().__init__()
29
+ self.layer_type = layer_type
30
+
31
+ if self.layer_type == 'none':
32
+ self.conv = nn.Identity()
33
+ elif self.layer_type == 'timepreserve':
34
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
35
+ elif self.layer_type == 'half':
36
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
37
+ else:
38
+ raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
39
+
40
+
41
+ def forward(self, x):
42
+ return self.conv(x)
43
+
44
+ class DownSample(nn.Module):
45
+ def __init__(self, layer_type):
46
+ super().__init__()
47
+ self.layer_type = layer_type
48
+
49
+ def forward(self, x):
50
+ if self.layer_type == 'none':
51
+ return x
52
+ elif self.layer_type == 'timepreserve':
53
+ return F.avg_pool2d(x, (2, 1))
54
+ elif self.layer_type == 'half':
55
+ if x.shape[-1] % 2 != 0:
56
+ x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
57
+ return F.avg_pool2d(x, 2)
58
+ else:
59
+ raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
60
+
61
+
62
+ class UpSample(nn.Module):
63
+ def __init__(self, layer_type):
64
+ super().__init__()
65
+ self.layer_type = layer_type
66
+
67
+ def forward(self, x):
68
+ if self.layer_type == 'none':
69
+ return x
70
+ elif self.layer_type == 'timepreserve':
71
+ return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
72
+ elif self.layer_type == 'half':
73
+ return F.interpolate(x, scale_factor=2, mode='nearest')
74
+ else:
75
+ raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
76
+
77
+
78
+ class ResBlk(nn.Module):
79
+ def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
80
+ normalize=False, downsample='none'):
81
+ super().__init__()
82
+ self.actv = actv
83
+ self.normalize = normalize
84
+ self.downsample = DownSample(downsample)
85
+ self.downsample_res = LearnedDownSample(downsample, dim_in)
86
+ self.learned_sc = dim_in != dim_out
87
+ self._build_weights(dim_in, dim_out)
88
+
89
+ def _build_weights(self, dim_in, dim_out):
90
+ self.conv1 = nn.Conv2d(dim_in, dim_in, 3, 1, 1)
91
+ self.conv2 = nn.Conv2d(dim_in, dim_out, 3, 1, 1)
92
+ if self.normalize:
93
+ self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
94
+ self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
95
+ if self.learned_sc:
96
+ self.conv1x1 = nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False)
97
+
98
+ def _shortcut(self, x):
99
+ if self.learned_sc:
100
+ x = self.conv1x1(x)
101
+ if self.downsample:
102
+ x = self.downsample(x)
103
+ return x
104
+
105
+ def _residual(self, x):
106
+ if self.normalize:
107
+ x = self.norm1(x)
108
+ x = self.actv(x)
109
+ x = self.conv1(x)
110
+ x = self.downsample_res(x)
111
+ if self.normalize:
112
+ x = self.norm2(x)
113
+ x = self.actv(x)
114
+ x = self.conv2(x)
115
+ return x
116
+
117
+ def forward(self, x):
118
+ x = self._shortcut(x) + self._residual(x)
119
+ return x / math.sqrt(2) # unit variance
120
+
121
+ class StyleEncoder(nn.Module):
122
+ def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
123
+ super().__init__()
124
+ blocks = []
125
+ blocks += [nn.Conv2d(1, dim_in, 3, 1, 1)]
126
+
127
+ repeat_num = 4
128
+ for _ in range(repeat_num):
129
+ dim_out = min(dim_in*2, max_conv_dim)
130
+ blocks += [ResBlk(dim_in, dim_out, downsample='half')]
131
+ dim_in = dim_out
132
+
133
+ blocks += [nn.LeakyReLU(0.2)]
134
+ blocks += [nn.Conv2d(dim_out, dim_out, 5, 1, 0)]
135
+ blocks += [nn.AdaptiveAvgPool2d(1)]
136
+ blocks += [nn.LeakyReLU(0.2)]
137
+ self.shared = nn.Sequential(*blocks)
138
+
139
+ self.unshared = nn.Linear(dim_out, style_dim)
140
+
141
+ def forward(self, x):
142
+ h = self.shared(x)
143
+ h = h.view(h.size(0), -1)
144
+ s = self.unshared(h)
145
+
146
+ return s
147
+
148
+ class LinearNorm(torch.nn.Module):
149
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
150
+ super(LinearNorm, self).__init__()
151
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
152
+
153
+ torch.nn.init.xavier_uniform_(
154
+ self.linear_layer.weight,
155
+ gain=torch.nn.init.calculate_gain(w_init_gain))
156
+
157
+ def forward(self, x):
158
+ return self.linear_layer(x)
159
+
160
+ class ResBlk1d(nn.Module):
161
+ def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
162
+ normalize=False, downsample='none', dropout_p=0.2):
163
+ super().__init__()
164
+ self.actv = actv
165
+ self.normalize = normalize
166
+ self.downsample_type = downsample
167
+ self.learned_sc = dim_in != dim_out
168
+ self._build_weights(dim_in, dim_out)
169
+ self.dropout_p = dropout_p
170
+
171
+ if self.downsample_type == 'none':
172
+ self.pool = nn.Identity()
173
+ else:
174
+ self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
175
+
176
+ def _build_weights(self, dim_in, dim_out):
177
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
178
+ self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
179
+ if self.normalize:
180
+ self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
181
+ self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
182
+ if self.learned_sc:
183
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
184
+
185
+ def downsample(self, x):
186
+ if self.downsample_type == 'none':
187
+ return x
188
+ else:
189
+ if x.shape[-1] % 2 != 0:
190
+ x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
191
+ return F.avg_pool1d(x, 2)
192
+
193
+ def _shortcut(self, x):
194
+ if self.learned_sc:
195
+ x = self.conv1x1(x)
196
+ x = self.downsample(x)
197
+ return x
198
+
199
+ def _residual(self, x):
200
+ if self.normalize:
201
+ x = self.norm1(x)
202
+ x = self.actv(x)
203
+ x = F.dropout(x, p=self.dropout_p, training=self.training)
204
+
205
+ x = self.conv1(x)
206
+ x = self.pool(x)
207
+ if self.normalize:
208
+ x = self.norm2(x)
209
+
210
+ x = self.actv(x)
211
+ x = F.dropout(x, p=self.dropout_p, training=self.training)
212
+
213
+ x = self.conv2(x)
214
+ return x
215
+
216
+ def forward(self, x):
217
+ x = self._shortcut(x) + self._residual(x)
218
+ return x / math.sqrt(2) # unit variance
219
+
220
+ class LayerNorm(nn.Module):
221
+ def __init__(self, channels, eps=1e-5):
222
+ super().__init__()
223
+ self.channels = channels
224
+ self.eps = eps
225
+
226
+ self.gamma = nn.Parameter(torch.ones(channels))
227
+ self.beta = nn.Parameter(torch.zeros(channels))
228
+
229
+ def forward(self, x):
230
+ x = x.transpose(1, -1)
231
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
232
+ return x.transpose(1, -1)
233
+
234
+ class TextEncoder(nn.Module):
235
+ def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
236
+ super().__init__()
237
+ self.embedding = nn.Embedding(n_symbols, channels)
238
+
239
+ padding = (kernel_size - 1) // 2
240
+ self.cnn = nn.ModuleList()
241
+ for _ in range(depth):
242
+ self.cnn.append(nn.Sequential(
243
+ weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
244
+ LayerNorm(channels),
245
+ actv,
246
+ nn.Dropout(0.2),
247
+ ))
248
+ # self.cnn = nn.Sequential(*self.cnn)
249
+
250
+ self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
251
+
252
+ def forward(self, x, input_lengths, m):
253
+ x = self.embedding(x) # [B, T, emb]
254
+ x = x.transpose(1, 2) # [B, emb, T]
255
+ m = m.to(input_lengths.device).unsqueeze(1)
256
+ x.masked_fill_(m, 0.0)
257
+
258
+ for c in self.cnn:
259
+ x = c(x)
260
+ x.masked_fill_(m, 0.0)
261
+
262
+ x = x.transpose(1, 2) # [B, T, chn]
263
+
264
+ input_lengths = input_lengths.cpu().numpy()
265
+ x = nn.utils.rnn.pack_padded_sequence(
266
+ x, input_lengths, batch_first=True, enforce_sorted=False)
267
+
268
+ self.lstm.flatten_parameters()
269
+ x, _ = self.lstm(x)
270
+ x, _ = nn.utils.rnn.pad_packed_sequence(
271
+ x, batch_first=True)
272
+
273
+ x = x.transpose(-1, -2)
274
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
275
+
276
+ x_pad[:, :, :x.shape[-1]] = x
277
+ x = x_pad.to(x.device)
278
+
279
+ x.masked_fill_(m, 0.0)
280
+
281
+ return x
282
+
283
+ def inference(self, x):
284
+ x = self.embedding(x)
285
+ x = x.transpose(1, 2)
286
+ for c in self.cnn: x = c(x)  # self.cnn is a ModuleList, so apply each conv block in turn
287
+ x = x.transpose(1, 2)
288
+ self.lstm.flatten_parameters()
289
+ x, _ = self.lstm(x)
290
+ return x
291
+
292
+ def length_to_mask(self, lengths):
293
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
294
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
295
+ return mask
296
+
297
+
298
+
299
+ class AdaIN1d(nn.Module):
300
+ def __init__(self, style_dim, num_features):
301
+ super().__init__()
302
+ self.norm = nn.InstanceNorm1d(num_features, affine=False)
303
+ self.fc = nn.Linear(style_dim, num_features*2)
304
+
305
+ def forward(self, x, s):
306
+ h = self.fc(s)
307
+ h = h.view(h.size(0), h.size(1), 1)
308
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
309
+ return (1 + gamma) * self.norm(x) + beta
310
+
311
+ class UpSample1d(nn.Module):
312
+ def __init__(self, layer_type):
313
+ super().__init__()
314
+ self.layer_type = layer_type
315
+
316
+ def forward(self, x):
317
+ if self.layer_type == 'none':
318
+ return x
319
+ else:
320
+ return F.interpolate(x, scale_factor=2, mode='nearest')
321
+
322
+ class AdainResBlk1d(nn.Module):
323
+ def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
324
+ upsample='none', dropout_p=0.0):
325
+ super().__init__()
326
+ self.actv = actv
327
+ self.upsample_type = upsample
328
+ self.upsample = UpSample1d(upsample)
329
+ self.learned_sc = dim_in != dim_out
330
+ self._build_weights(dim_in, dim_out, style_dim)
331
+ self.dropout = nn.Dropout(dropout_p)
332
+
333
+ if upsample == 'none':
334
+ self.pool = nn.Identity()
335
+ else:
336
+ self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
337
+
338
+
339
+ def _build_weights(self, dim_in, dim_out, style_dim):
340
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
341
+ self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
342
+ self.norm1 = AdaIN1d(style_dim, dim_in)
343
+ self.norm2 = AdaIN1d(style_dim, dim_out)
344
+ if self.learned_sc:
345
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
346
+
347
+ def _shortcut(self, x):
348
+ x = self.upsample(x)
349
+ if self.learned_sc:
350
+ x = self.conv1x1(x)
351
+ return x
352
+
353
+ def _residual(self, x, s):
354
+ x = self.norm1(x, s)
355
+ x = self.actv(x)
356
+ x = self.pool(x)
357
+ x = self.conv1(self.dropout(x))
358
+ x = self.norm2(x, s)
359
+ x = self.actv(x)
360
+ x = self.conv2(self.dropout(x))
361
+ return x
362
+
363
+ def forward(self, x, s):
364
+ out = self._residual(x, s)
365
+ out = (out + self._shortcut(x)) / math.sqrt(2)
366
+ return out
367
+
368
+ class AdaLayerNorm(nn.Module):
369
+ def __init__(self, style_dim, channels, eps=1e-5):
370
+ super().__init__()
371
+ self.channels = channels
372
+ self.eps = eps
373
+
374
+ self.fc = nn.Linear(style_dim, channels*2)
375
+
376
+ def forward(self, x, s):
377
+ x = x.transpose(-1, -2)
378
+ x = x.transpose(1, -1)
379
+
380
+ h = self.fc(s)
381
+ h = h.view(h.size(0), h.size(1), 1)
382
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
383
+ gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
384
+
385
+
386
+ x = F.layer_norm(x, (self.channels,), eps=self.eps)
387
+ x = (1 + gamma) * x + beta
388
+ return x.transpose(1, -1).transpose(-1, -2)
389
+
390
+ class ProsodyPredictor(nn.Module):
391
+
392
+ def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
393
+ super().__init__()
394
+
395
+ self.text_encoder = DurationEncoder(sty_dim=style_dim,
396
+ d_model=d_hid,
397
+ nlayers=nlayers,
398
+ dropout=dropout)
399
+
400
+ self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
401
+ self.duration_proj = LinearNorm(d_hid, max_dur)
402
+
403
+ self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
404
+ self.F0 = nn.ModuleList()
405
+ self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
406
+ self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
407
+ self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
408
+
409
+ self.N = nn.ModuleList()
410
+ self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
411
+ self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
412
+ self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
413
+
414
+ self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
415
+ self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
416
+
417
+
418
+ def forward(self, texts, style, text_lengths, alignment, m):
419
+ d = self.text_encoder(texts, style, text_lengths, m)
420
+
421
+ batch_size = d.shape[0]
422
+ text_size = d.shape[1]
423
+
424
+ # predict duration
425
+ input_lengths = text_lengths.cpu().numpy()
426
+ x = nn.utils.rnn.pack_padded_sequence(
427
+ d, input_lengths, batch_first=True, enforce_sorted=False)
428
+
429
+ m = m.to(text_lengths.device).unsqueeze(1)
430
+
431
+ self.lstm.flatten_parameters()
432
+ x, _ = self.lstm(x)
433
+ x, _ = nn.utils.rnn.pad_packed_sequence(
434
+ x, batch_first=True)
435
+
436
+ x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])
437
+
438
+ x_pad[:, :x.shape[1], :] = x
439
+ x = x_pad.to(x.device)
440
+
441
+ duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))
442
+
443
+ en = (d.transpose(-1, -2) @ alignment)
444
+
445
+ return duration.squeeze(-1), en
446
+
447
+ def F0Ntrain(self, x, s):
448
+ x, _ = self.shared(x.transpose(-1, -2))
449
+
450
+ F0 = x.transpose(-1, -2)
451
+ for block in self.F0:
452
+ F0 = block(F0, s)
453
+ F0 = self.F0_proj(F0)
454
+
455
+ N = x.transpose(-1, -2)
456
+ for block in self.N:
457
+ N = block(N, s)
458
+ N = self.N_proj(N)
459
+
460
+ return F0.squeeze(1), N.squeeze(1)
461
+
462
+ def length_to_mask(self, lengths):
463
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
464
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
465
+ return mask
466
+
467
+ class DurationEncoder(nn.Module):
468
+
469
+ def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
470
+ super().__init__()
471
+ self.lstms = nn.ModuleList()
472
+ for _ in range(nlayers):
473
+ self.lstms.append(nn.LSTM(d_model + sty_dim,
474
+ d_model // 2,
475
+ num_layers=1,
476
+ batch_first=True,
477
+ bidirectional=True,
478
+ dropout=dropout))
479
+ self.lstms.append(AdaLayerNorm(sty_dim, d_model))
480
+
481
+
482
+ self.dropout = dropout
483
+ self.d_model = d_model
484
+ self.sty_dim = sty_dim
485
+
486
+ def forward(self, x, style, text_lengths, m):
487
+ masks = m.to(text_lengths.device)
488
+
489
+ x = x.permute(2, 0, 1)
490
+ s = style.expand(x.shape[0], x.shape[1], -1)
491
+ x = torch.cat([x, s], axis=-1)
492
+ x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
493
+
494
+ x = x.transpose(0, 1)
495
+ input_lengths = text_lengths.cpu().numpy()
496
+ x = x.transpose(-1, -2)
497
+
498
+ for block in self.lstms:
499
+ if isinstance(block, AdaLayerNorm):
500
+ x = block(x.transpose(-1, -2), style).transpose(-1, -2)
501
+ x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
502
+ x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
503
+ else:
504
+ x = x.transpose(-1, -2)
505
+ x = nn.utils.rnn.pack_padded_sequence(
506
+ x, input_lengths, batch_first=True, enforce_sorted=False)
507
+ block.flatten_parameters()
508
+ x, _ = block(x)
509
+ x, _ = nn.utils.rnn.pad_packed_sequence(
510
+ x, batch_first=True)
511
+ x = F.dropout(x, p=self.dropout, training=self.training)
512
+ x = x.transpose(-1, -2)
513
+
514
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
515
+
516
+ x_pad[:, :, :x.shape[-1]] = x
517
+ x = x_pad.to(x.device)
518
+
519
+ return x.transpose(-1, -2)
520
+
521
+ def inference(self, x, style):
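+ # Note: this method references self.embedding / self.pos_encoder / self.transformer_encoder, which are not defined in this class; it is not called by the inference path in inference.py.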
522
+ x = self.embedding(x.transpose(-1, -2)) * math.sqrt(self.d_model)
523
+ style = style.expand(x.shape[0], x.shape[1], -1)
524
+ x = torch.cat([x, style], axis=-1)
525
+ src = self.pos_encoder(x)
526
+ output = self.transformer_encoder(src).transpose(0, 1)
527
+ return output
528
+
529
+ def length_to_mask(self, lengths):
530
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
531
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
532
+ return mask
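For orientation, the acoustic style path at inference time feeds a normalized log-mel spectrogram (80 bins, hop 300 at 24 kHz) through StyleEncoder to obtain one fixed-size style vector per utterance. A small shape sketch using the class defaults shown above; the released checkpoints instead use the dimensions from Models/config.yaml:

import torch
from models import StyleEncoder

enc = StyleEncoder(dim_in=48, style_dim=48, max_conv_dim=384)  # class defaults; config values may differ
mel = torch.randn(1, 1, 80, 200)  # [batch, 1, n_mels, frames]; 200 frames is ~2.5 s at hop 300
with torch.no_grad():
    s = enc(mel)
print(s.shape)  # torch.Size([1, 48]): one style vector per utterance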
en/StyleTTS2-lite/requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch
2
+ torchaudio
3
+ numpy
4
+ PyYAML
5
+ munch
6
+ nltk
7
+ librosa
8
+ noisereduce
9
+ phonemizer
10
+ espeakng-loader
en/StyleTTS2-lite/run.ipynb ADDED
@@ -0,0 +1,176 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "5a3ddcc8",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from inference import StyleTTS2\n",
11
+ "\n",
12
+ "import librosa\n",
13
+ "import IPython.display as ipd\n",
14
+ "import torch.cuda\n",
15
+ "\n",
16
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "id": "092cfb69",
22
+ "metadata": {},
23
+ "source": [
24
+ "### Load G2P"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "markdown",
29
+ "id": "a152ec13",
30
+ "metadata": {},
31
+ "source": [
32
+ "If you did not use eSpeak for your language, please add your own G2P."
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "ca224f37",
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "import sys\n",
43
+ "import phonemizer\n",
44
+ "if sys.platform.startswith(\"win\"):\n",
45
+ " try:\n",
46
+ " from phonemizer.backend.espeak.wrapper import EspeakWrapper\n",
47
+ " import espeakng_loader\n",
48
+ " EspeakWrapper.set_library(espeakng_loader.get_library_path())\n",
49
+ " except Exception as e:\n",
50
+ " print(e)\n",
51
+ "\n",
52
+ "def get_phoneme(text, lang):\n",
53
+ " try:\n",
54
+ " my_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True, language_switch='remove-flags')\n",
55
+ " return my_phonemizer.phonemize([text])[0]\n",
56
+ " except Exception as e:\n",
57
+ " print(e)"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "id": "7b9cecbe",
63
+ "metadata": {},
64
+ "source": [
65
+ "### Load models"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": null,
71
+ "id": "e7b9c01d",
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "config_path = \"Models/config.yaml\"\n",
76
+ "models_path = \"Models/inference/model.pth\""
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "id": "b803110e",
82
+ "metadata": {},
83
+ "source": [
84
+ "### Synthesize speech\n",
85
+ "\n",
86
+ "Little Note: Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed."
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "id": "78396f70",
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "speaker = {\n",
97
+ " \"path\": \"./Audio/1_heart.wav\", #Ref audio path\n",
98
+ " \"speed\": 1.0, #Speaking speed\n",
99
+ "}\n",
100
+ "\n",
101
+ "max_samples = 24000*20 #max 20 seconds ref audio\n",
102
+ "print(speaker['path'])\n",
103
+ "wave, sr = librosa.load(speaker['path'], sr=24000)\n",
104
+ "audio, index = librosa.effects.trim(wave, top_db=30)\n",
105
+ "if sr != 24000: audio = librosa.resample(audio, sr, 24000)\n",
106
+ "if len(audio) > max_samples: audio = audio[:max_samples]\n",
107
+ "display(ipd.Audio(audio, rate=24000, normalize=True))"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "id": "395959f1",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "text = '''\n",
118
+ "Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n",
119
+ "Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n",
120
+ "'''"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "16194211",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "model = StyleTTS2(config_path, models_path).eval().to(device)\n",
131
+ "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n",
132
+ "stabilize = False #BOOL Stabilize speaking speed.\n",
133
+ "denoise = 0.3 #FLOAT Adjust the strength of the denoiser. Value range is [0, 1]\n",
134
+ "n_merge = 16 #INT Avoid short sentences by merging when a sentence has fewer than n words"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "id": "980c6fbb",
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "with torch.no_grad():\n",
145
+ " phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
146
+ "\n",
147
+ " styles = model.get_styles(speaker, denoise, avg_style)\n",
148
+ " r = model.generate(phonemes, styles, stabilize, n_merge)\n",
149
+ "\n",
150
+ "print('Synthesized:')\n",
151
+ "display(ipd.Audio(r, rate=24000, normalize=True))"
152
+ ]
153
+ }
154
+ ],
155
+ "metadata": {
156
+ "kernelspec": {
157
+ "display_name": "base",
158
+ "language": "python",
159
+ "name": "python3"
160
+ },
161
+ "language_info": {
162
+ "codemirror_mode": {
163
+ "name": "ipython",
164
+ "version": 3
165
+ },
166
+ "file_extension": ".py",
167
+ "mimetype": "text/x-python",
168
+ "name": "python",
169
+ "nbconvert_exporter": "python",
170
+ "pygments_lexer": "ipython3",
171
+ "version": "3.11.7"
172
+ }
173
+ },
174
+ "nbformat": 4,
175
+ "nbformat_minor": 5
176
+ }
en/StyleTTS2-lite/source.txt ADDED
@@ -0,0 +1 @@
1
+ https://huggingface.co/dangtr0408/StyleTTS2-lite
en/StyleTTS2/.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
en/StyleTTS2/Multi0/config.yml ADDED
@@ -0,0 +1,112 @@
1
+ ASR_config: Utils/ASR/config.yml
2
+ ASR_path: Utils/ASR/epoch_00080.pth
3
+ F0_path: Utils/JDC/bst.t7
4
+ PLBERT_dir: Utils/PLBERT/
5
+ batch_size: 6
6
+ data_params:
7
+ OOD_data: Data/OOD_texts.txt
8
+ min_length: 50
9
+ root_path: /root/StyleTTS2/Omni1_data
10
+ train_data: Data/train_list.txt
11
+ val_data: Data/val_list.txt
12
+ device: cuda
13
+ epochs_1st: 200
14
+ epochs_2nd: 60
15
+ first_stage_path: first_stage.pth
16
+ log_dir: Models/Omni1
17
+ log_interval: 10
18
+ loss_params:
19
+ TMA_epoch: 50
20
+ diff_epoch: 14
21
+ joint_epoch: 19
22
+ lambda_F0: 1.0
23
+ lambda_ce: 20.0
24
+ lambda_diff: 1.0
25
+ lambda_dur: 1.0
26
+ lambda_gen: 1.0
27
+ lambda_mel: 5.0
28
+ lambda_mono: 1.0
29
+ lambda_norm: 1.0
30
+ lambda_s2s: 1.0
31
+ lambda_slm: 1.0
32
+ lambda_sty: 1.0
33
+ max_len: 400
34
+ model_params:
35
+ decoder:
36
+ gen_istft_hop_size: 5
37
+ gen_istft_n_fft: 20
38
+ resblock_dilation_sizes:
39
+ - - 1
40
+ - 3
41
+ - 5
42
+ - - 1
43
+ - 3
44
+ - 5
45
+ - - 1
46
+ - 3
47
+ - 5
48
+ resblock_kernel_sizes:
49
+ - 3
50
+ - 7
51
+ - 11
52
+ type: istftnet
53
+ upsample_initial_channel: 512
54
+ upsample_kernel_sizes:
55
+ - 20
56
+ - 12
57
+ upsample_rates:
58
+ - 10
59
+ - 6
60
+ diffusion:
61
+ dist:
62
+ estimate_sigma_data: true
63
+ mean: -3.0
64
+ sigma_data: 0.3141927569675583
65
+ std: 1.0
66
+ embedding_mask_proba: 0.1
67
+ transformer:
68
+ head_features: 64
69
+ multiplier: 2
70
+ num_heads: 8
71
+ num_layers: 3
72
+ dim_in: 64
73
+ dropout: 0.2
74
+ hidden_dim: 512
75
+ max_conv_dim: 512
76
+ max_dur: 50
77
+ multispeaker: true
78
+ n_layer: 3
79
+ n_mels: 80
80
+ n_token: 178
81
+ slm:
82
+ hidden: 768
83
+ initial_channel: 64
84
+ model: microsoft/wavlm-base-plus
85
+ nlayers: 13
86
+ sr: 16000
87
+ style_dim: 128
88
+ optimizer_params:
89
+ bert_lr: 1.0e-05
90
+ ft_lr: 1.0e-05
91
+ lr: 0.0001
92
+ preprocess_params:
93
+ spect_params:
94
+ hop_length: 300
95
+ n_fft: 2048
96
+ win_length: 1200
97
+ sr: 24000
98
+ pretrained_model: /root/StyleTTS2/Models/Omni1/epoch_2nd_pretrained.pth
99
+ resume: true
100
+ save_freq: 1
101
+ saver_freq_steps: 150
102
+ saver_max_ckpts: 5
103
+ saver_mode: ITER
104
+ second_stage_load_pretrained: true
105
+ slmadv_params:
106
+ batch_percentage: 0.5
107
+ iter: 10
108
+ max_len: 400
109
+ min_len: 160
110
+ scale: 0.01
111
+ sig: 1.5
112
+ thresh: 5
en/StyleTTS2/Multi0/config_30_e934.yml ADDED
@@ -0,0 +1,22 @@
1
+ {ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
2
+ PLBERT_dir: Utils/PLBERT/, batch_size: 12, data_params: {OOD_data: Data/OOD_texts.txt,
3
+ min_length: 50, root_path: /root/StyleTTS2/Multi0_data, train_data: Data/train_list.txt,
4
+ val_data: Data/val_list.txt}, device: cuda, epochs_1st: 200, epochs_2nd: 60, first_stage_path: first_stage.pth,
5
+ log_dir: Models/Multi0, log_interval: 10, loss_params: {TMA_epoch: 50, diff_epoch: 14,
6
+ joint_epoch: 19, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0, lambda_dur: 1.0,
7
+ lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0, lambda_s2s: 1.0,
8
+ lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 400, model_params: {decoder: {gen_istft_hop_size: 5,
9
+ gen_istft_n_fft: 20, resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3,
10
+ 5]], resblock_kernel_sizes: [3, 7, 11], type: istftnet, upsample_initial_channel: 512,
11
+ upsample_kernel_sizes: [20, 12], upsample_rates: [10, 6]}, diffusion: {dist: {
12
+ estimate_sigma_data: true, mean: -3.0, sigma_data: 0.31839087134423844, std: 1.0},
13
+ embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2, num_heads: 8,
14
+ num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512,
15
+ max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178, slm: {
16
+ hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13,
17
+ sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05,
18
+ lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048,
19
+ win_length: 1200}, sr: 24000}, pretrained_model: '', resume: true, save_freq: 1,
20
+ saver_freq_steps: 150, saver_max_ckpts: 5, saver_mode: ITER, second_stage_load_pretrained: true,
21
+ slmadv_params: {batch_percentage: 0.5, iter: 10, max_len: 400, min_len: 160, scale: 0.01,
22
+ sig: 1.5, thresh: 5}}
en/StyleTTS2/Multi0/config_40_1c872.yml ADDED
@@ -0,0 +1,22 @@
1
+ {ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
2
+ PLBERT_dir: Utils/PLBERT/, batch_size: 6, data_params: {OOD_data: Data/OOD_texts.txt,
3
+ min_length: 50, root_path: /root/StyleTTS2/Multi0_data, train_data: Data/train_list.txt,
4
+ val_data: Data/val_list.txt}, device: cuda, epochs_1st: 200, epochs_2nd: 60, first_stage_path: first_stage.pth,
5
+ log_dir: Models/Multi0, log_interval: 10, loss_params: {TMA_epoch: 50, diff_epoch: 14,
6
+ joint_epoch: 19, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0, lambda_dur: 1.0,
7
+ lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0, lambda_s2s: 1.0,
8
+ lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 400, model_params: {decoder: {gen_istft_hop_size: 5,
9
+ gen_istft_n_fft: 20, resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3,
10
+ 5]], resblock_kernel_sizes: [3, 7, 11], type: istftnet, upsample_initial_channel: 512,
11
+ upsample_kernel_sizes: [20, 12], upsample_rates: [10, 6]}, diffusion: {dist: {
12
+ estimate_sigma_data: true, mean: -3.0, sigma_data: 0.2969374090377316, std: 1.0},
13
+ embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2, num_heads: 8,
14
+ num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512,
15
+ max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178, slm: {
16
+ hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13,
17
+ sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05,
18
+ lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048,
19
+ win_length: 1200}, sr: 24000}, pretrained_model: '', resume: true, save_freq: 1,
20
+ saver_freq_steps: 150, saver_max_ckpts: 5, saver_mode: ITER, second_stage_load_pretrained: true,
21
+ slmadv_params: {batch_percentage: 0.5, iter: 10, max_len: 400, min_len: 160, scale: 0.01,
22
+ sig: 1.5, thresh: 5}}
en/StyleTTS2/Multi0/epoch_2nd_23_9ab0.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbe9be5c4c2df12b5ddb65cce7e45849d3ed674db1fcb89eb7f1bafc65f05ade
3
+ size 2132412506
en/StyleTTS2/Multi0/epoch_2nd_30_e934.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7af95c2c61a778fec6ad5cec95497daaf9bf3dd6cec6db7f02f4fe90e3e5657a
3
+ size 2132412506
en/StyleTTS2/Multi0/epoch_2nd_40_1c872.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:065b610ae5fd9fc73eea396761d42a99a4770a243aca76aa7db4ff9bd13d81ac
3
+ size 2132415942
en/StyleTTS2/Multi0/ref_audio.zip ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa342abae6a7d06b84508e828c1082aa0fc6d484bd709cb40650d879c31c5f16
3
+ size 4766523
en/StyleTTS2/README.md ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ datasets:
3
+ - therealvul/StyleTTS2MLP
4
+ language:
5
+ - en
6
+ ---
7
+ This repository contains StyleTTS2 models trained on Pony Preservation Project data.