diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..c1e0835d46fea948d70f9a73c35dfe74381f7e1f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/1_heart.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/10_michael.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/11_fenrir.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/12_puck.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/13_echo.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/14_eric.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/15_liam.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/16_onyx.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/17_santa.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/18_adam.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/2_belle.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/3_kore.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/4_sarah.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/5_nova.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/6_sky.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/7_alloy.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/8_jessica.wav filter=lfs diff=lfs merge=lfs -text +en/StyleTTS2-lite/Audio/9_river.wav filter=lfs diff=lfs merge=lfs -text +ru/StyleTTS_prokopenko_v1/voices/prokopenko/reference.wav filter=lfs diff=lfs merge=lfs -text +vi,en/StyleTTS2-lite-vi/reference_audio/3.wav filter=lfs diff=lfs merge=lfs -text +vi,en/StyleTTS2-lite-vi/reference_audio/vn_1.wav filter=lfs diff=lfs merge=lfs -text +vi,en/StyleTTS2-lite-vi/reference_audio/vn_2.wav filter=lfs diff=lfs merge=lfs -text +vi,en/StyleTTS2-lite-vi/reference_audio/vn_3.wav filter=lfs diff=lfs merge=lfs -text +vi,en/StyleTTS2-lite-vi/reference_audio/vn_4.wav filter=lfs diff=lfs merge=lfs -text diff --git a/ar/StyleTTS2-LibriTTS-arabic/.gitattributes b/ar/StyleTTS2-LibriTTS-arabic/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..db8aca817a0ff6ef348c9ac764595e938510b22e --- /dev/null +++ b/ar/StyleTTS2-LibriTTS-arabic/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs 
diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +synthesized_audio.wav filter=lfs diff=lfs merge=lfs -text diff --git a/ar/StyleTTS2-LibriTTS-arabic/README.md b/ar/StyleTTS2-LibriTTS-arabic/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f68b01bdff5f4a9206c2d4ccb505b7e9470e1721 --- /dev/null +++ b/ar/StyleTTS2-LibriTTS-arabic/README.md @@ -0,0 +1,142 @@ +--- +language: ar +tags: +- text-to-speech +- tts +- arabic +- styletts2 +- pl-bert +license: mit +hardware: H100 +--- + +# Model Card for Arabic StyleTTS2 + +This is an Arabic text-to-speech model based on StyleTTS2 architecture, specifically adapted for Arabic language synthesis. The model achieves good quality Arabic speech synthesis, though not yet state-of-the-art, and further experimentation is needed to optimize performance for Arabic language specifically. All training objectives from the original StyleTTS2 were maintained, except for the WavLM objectives which were removed as they were primarily designed for English speech. + +## Example + +Here is an example output from the model: + +#### Sample 1 + + +## Efficiency and Performance + +A key strength of this model lies in its efficiency and performance characteristics: + +- **Compact Architecture**: Achieves impressive quality with <100M parameters +- **Limited Training Data**: Trained on only 22 hours of single-speaker audio +- **Transfer Learning**: Successfully fine-tuned from LibriTTS multi-speaker model to single-speaker Arabic +- **Resource Efficient**: Good quality achieved despite limited computational resources + +Note: According to the StyleTTS2 authors, performance should improve further when training a single-speaker model from scratch rather than fine-tuning. This wasn't attempted in our case due to computational resource constraints, suggesting potential for even better results with more extensive training. + + +## Model Details + +### Model Description + +This model is a modified version of StyleTTS2, specifically adapted for Arabic text-to-speech synthesis. It incorporates a custom-trained PL-BERT model for Arabic language understanding and removes the WavLM adversarial training component (which was primarily designed for English). + +- **Developed by:** Fadi (GitHub: Fadi987) +- **Model type:** Text-to-Speech (StyleTTS2 architecture) +- **Language(s):** Arabic +- **Finetuned from model:** [yl4579/StyleTTS2-LibriTTS](https://huggingface.co/yl4579/StyleTTS2-LibriTTS) + +### Model Sources + +- **Repository:** [Fadi987/StyleTTS2](https://github.com/Fadi987/StyleTTS2) +- **Paper:** [StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models](https://arxiv.org/abs/2306.07691) +- **PL-BERT Model:** [fadi77/pl-bert](https://huggingface.co/fadi77/pl-bert) + +## Uses + +### Direct Use + +The model can be used for generating Arabic speech from text. To use the model: + +1. 
Clone the StyleTTS2 repository: +```bash +git clone https://github.com/Fadi987/StyleTTS2 +cd StyleTTS2 +``` + +2. Install `espeak-ng` as the phonemization backend: +```bash +# For macOS +brew install espeak-ng + +# For Ubuntu/Debian +sudo apt-get install espeak-ng + +# For Windows +# Download and install espeak-ng from: https://github.com/espeak-ng/espeak-ng/releases +``` + +3. Install Python dependencies: +```bash +pip install -r requirements.txt +``` + +4. Download the `model.pth` and `config.yml` files from this repository. + +5. Run inference using: +```bash +python inference.py --config config.yml --model model.pth --text "الإِتْقَانُ يَحْتَاجُ إِلَى الْعَمَلِ وَالْمُثَابَرَة" +``` + +Make sure to use properly diacritized Arabic text for best results. + +### Out-of-Scope Use + +The model is specifically designed for Arabic text-to-speech synthesis and may not perform well for: +- Other languages +- Heavy dialect variations +- Non-diacritized Arabic text + +## Training Details + +### Training Data + +- Training was performed on approximately 22 hours of Arabic audiobook data +- Dataset: [fadi77/arabic-audiobook-dataset-24khz](https://huggingface.co/datasets/fadi77/arabic-audiobook-dataset-24khz) +- The PL-BERT component was trained on fully diacritized Wikipedia Arabic text + +### Training Hyperparameters + +- **Number of epochs:** 20 +- **Diffusion training:** Started from epoch 5 + +### Objectives +- **Training objectives:** All original StyleTTS2 objectives maintained, except WavLM adversarial training +- **Validation objectives:** Identical to original StyleTTS2 validation process + +### Compute Infrastructure +- **Hardware Type:** NVIDIA H100 GPU + +### Notable Modifications from Original StyleTTS2 in Architecture and Objectives +The architecture of the model follows that of StyleTTS2 with the following exceptions: + - Removed WavLM adversarial training component + - Custom PL-BERT trained for Arabic language + + +## Citation + +**BibTeX:** +```bibtex +@article{styletts2, + title={StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models}, + author={Li, Yinghao Aaron and Han, Cong and Raghavan, Vinay S. and Mischler, Gavin and Mesgarani, Nima}, + journal={arXiv preprint arXiv:2306.07691}, + year={2023} +} +``` + +## Model Card Contact + +GitHub: [@Fadi987](https://github.com/Fadi987) +Hugging Face: [@fadi77](https://huggingface.co/fadi77) \ No newline at end of file diff --git a/ar/StyleTTS2-LibriTTS-arabic/config.yml b/ar/StyleTTS2-LibriTTS-arabic/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..4672f889aed5193420f7c450d1907f80fa40d7b9 --- /dev/null +++ b/ar/StyleTTS2-LibriTTS-arabic/config.yml @@ -0,0 +1,114 @@ +log_dir: "/style_tts2/Models/FineTune.AudioBook" +log_interval: 10 +device: "cuda" +epochs: 25 # number of finetuning epochs +batch_size: 6 +max_len: 300 # maximum number of frames +pretrained_model_repo: "yl4579/StyleTTS2-LibriTTS" +pretrained_model_filename: "Models/LibriTTS/epochs_2nd_00020.pth" +second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage +load_only_params: true # set to true if you do not want to load epoch numbers and optimizer parameters + +F0_path: "/root/Utils/JDC/bst.t7" +ASR_config: "/root/Utils/ASR/config.yml" +ASR_path: "/root/Utils/ASR/epoch_00080.pth" +PLBERT_repo_id: "fadi77/pl-bert" +PLBERT_dirname: "models/mlm_only_with_diacritics" +
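+# The paths above point to StyleTTS2's auxiliary pretrained models: the JDC pitch
+# extractor (F0_path), the ASR-based text aligner (ASR_config/ASR_path), and the
+# Arabic PL-BERT fetched from the Hugging Face Hub (PLBERT_repo_id/PLBERT_dirname).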
+data_params: + train_data: "Data/youtube_train_list.txt" + val_data: "Data/youtube_val_list.txt" + root_path: "Youtube/wavs" + OOD_data: "Data/youtube_train_list.txt" + min_length: 50 # sample until texts with this size are obtained for OOD texts + +preprocess_params: + sr: 24000 + spect_params: + n_fft: 2048 + win_length: 1200 + hop_length: 300 + +model_params: + multispeaker: false + + dim_in: 64 + hidden_dim: 512 + max_conv_dim: 512 + n_layer: 3 + n_mels: 80 + + n_token: 178 # number of phoneme tokens + max_dur: 50 # maximum duration of a single phoneme + style_dim: 128 # style vector size + + dropout: 0.2 + + # config for decoder + decoder: + type: 'hifigan' # either hifigan or istftnet + resblock_kernel_sizes: [3,7,11] + upsample_rates : [10,5,3,2] + upsample_initial_channel: 512 + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + upsample_kernel_sizes: [20,10,6,4] + + # speech language model config + slm: + model: 'microsoft/wavlm-base-plus' + sr: 16000 # sampling rate of SLM + hidden: 768 # hidden size of SLM + nlayers: 13 # number of layers of SLM + initial_channel: 64 # initial channels of SLM discriminator head + + # style diffusion model config + diffusion: + embedding_mask_proba: 0.1 + # transformer config + transformer: + num_layers: 3 + num_heads: 8 + head_features: 64 + multiplier: 2 + + # diffusion distribution config + dist: + sigma_data: 0.2 # placeholder used when estimate_sigma_data is set to false + estimate_sigma_data: true # estimate sigma_data from the current batch if set to true + mean: -3.0 + std: 1.0 + +loss_params: + lambda_mel: 5. # mel reconstruction loss + lambda_gen: 1. # generator loss + lambda_slm: 1. # slm feature matching loss + + lambda_mono: 1. # monotonic alignment loss (TMA) + lambda_s2s: 1. # sequence-to-sequence loss (TMA) + + lambda_F0: 1. # F0 reconstruction loss + lambda_norm: 1. # norm reconstruction loss + lambda_dur: 1. # duration loss + lambda_ce: 20. # duration predictor probability output CE loss + lambda_sty: 1. # style reconstruction loss + lambda_diff: 1. # score matching loss + + # Note: Current values for training are only adequate for second stage finetuning. + diffusion_training_epoch: 5 + joint_training_epoch: 100 + +# Note: Current values for learning rates are very low. This is only adequate for second stage finetuning.
+optimizer_params: + lr: 0.0001 # general learning rate + bert_lr: 0.00001 # learning rate for PLBERT + ft_lr: 0.0001 # learning rate for acoustic modules + +slmadv_params: + min_len: 400 # minimum length of samples + max_len: 500 # maximum length of samples + batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size + skip_update: 10 # update the discriminator every this iterations of generator update + thresh: 5 # gradient norm above which the gradient is scaled + scale: 0.01 # gradient scaling factor for predictors from SLM discriminators + sig: 1.5 # sigma for differentiable duration modeling + diff --git a/ar/StyleTTS2-LibriTTS-arabic/model.pth b/ar/StyleTTS2-LibriTTS-arabic/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fe9a87f5f710163e417366fb485541c805aa7b4 --- /dev/null +++ b/ar/StyleTTS2-LibriTTS-arabic/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59d2323412f0c55c774b5675b45e5c12659c0d9e0f9e7012eecc6b7dd845b132 +size 2201968238 diff --git a/ar/StyleTTS2-LibriTTS-arabic/source.txt b/ar/StyleTTS2-LibriTTS-arabic/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..2749916b58fcb117748c7f2ffba4c02b56ff9313 --- /dev/null +++ b/ar/StyleTTS2-LibriTTS-arabic/source.txt @@ -0,0 +1 @@ +https://huggingface.co/fadi77/StyleTTS2-LibriTTS-arabic \ No newline at end of file diff --git a/ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav b/ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav new file mode 100644 index 0000000000000000000000000000000000000000..7e7b90d254370ca9f72106f1dba49759436e5e13 --- /dev/null +++ b/ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f60e90523d734eff1b9f4b95cca49f22277df5cb4acd0bd347fde18f1c3b0469 +size 1795058 diff --git a/en/StyleTTS2-lite/.gitattributes b/en/StyleTTS2-lite/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..2b96bd1447866b09b8a8d094f3a63f1d142c18bb --- /dev/null +++ b/en/StyleTTS2-lite/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text 
+*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text diff --git a/en/StyleTTS2-lite/.gitignore b/en/StyleTTS2-lite/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..134537e0d10897ca0121496fa8a8bd8d5513fdfd --- /dev/null +++ b/en/StyleTTS2-lite/.gitignore @@ -0,0 +1,8 @@ +Modules/__pycache__/__init__.cpython-311.pyc +Modules/__pycache__/hifigan.cpython-311.pyc +Modules/__pycache__/utils.cpython-311.pyc +Modules/__pycache__/__init__.cpython-311.pyc +Modules/__pycache__/hifigan.cpython-311.pyc +Modules/__pycache__/utils.cpython-311.pyc +__pycache__/inference.cpython-311.pyc +__pycache__/models.cpython-311.pyc diff --git a/en/StyleTTS2-lite/Audio/10_michael.wav b/en/StyleTTS2-lite/Audio/10_michael.wav new file mode 100644 index 0000000000000000000000000000000000000000..c4dbe364d7a7af5376fc642866ad5e9659a1da03 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/10_michael.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:733023e56be0434c66ac3b855c9aaac29d64f3a060c295a75e700ecfd34c16f0 +size 620444 diff --git a/en/StyleTTS2-lite/Audio/11_fenrir.wav b/en/StyleTTS2-lite/Audio/11_fenrir.wav new file mode 100644 index 0000000000000000000000000000000000000000..00009b3d815ea80f3ca82aa7936adedd1c2c1842 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/11_fenrir.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abde72631473e48455d54cf585a0b1f229e6e77e9748ed1acef5678a40b08c08 +size 537644 diff --git a/en/StyleTTS2-lite/Audio/12_puck.wav b/en/StyleTTS2-lite/Audio/12_puck.wav new file mode 100644 index 0000000000000000000000000000000000000000..e804562e277303b08373facc84fc028ba1bbcdc4 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/12_puck.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:409cc59612472a0d4bb717613f539dafdb334411ed651ab6988f7fca8b922905 +size 619244 diff --git a/en/StyleTTS2-lite/Audio/13_echo.wav b/en/StyleTTS2-lite/Audio/13_echo.wav new file mode 100644 index 0000000000000000000000000000000000000000..2fc64bb40433a9eb036d66195371b3c9a8675e20 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/13_echo.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6925e6737a67fcbf8dce32d22d29d086d81627b82c6edbfc92b3706f27479ff +size 524444 diff --git a/en/StyleTTS2-lite/Audio/14_eric.wav b/en/StyleTTS2-lite/Audio/14_eric.wav new file mode 100644 index 0000000000000000000000000000000000000000..0ed81b6fe81a6476ac011a9b6913370afb50a4ac --- /dev/null +++ b/en/StyleTTS2-lite/Audio/14_eric.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97b8bbf6a880e46730387ee7bb4bfba6c049ed58c4ec8680ec44f83df669eff1 +size 573644 diff --git a/en/StyleTTS2-lite/Audio/15_liam.wav b/en/StyleTTS2-lite/Audio/15_liam.wav new file mode 100644 index 0000000000000000000000000000000000000000..94cc37b80892b6542b88e1342aa6cbf1374acdb1 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/15_liam.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95842cfe6d1093deb37447b0e5993b6c18f7e5591c3fb1fb3dd230641925de44 +size 541244 diff --git a/en/StyleTTS2-lite/Audio/16_onyx.wav b/en/StyleTTS2-lite/Audio/16_onyx.wav new file mode 100644 index 0000000000000000000000000000000000000000..73806fae59a0e01f5d955f1a6e9e845bf7c4e660 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/16_onyx.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:25487ea7634b470392d787bfefb79da0a6a56dc26087ab27b62fa70aac43554d +size 514844 diff --git a/en/StyleTTS2-lite/Audio/17_santa.wav b/en/StyleTTS2-lite/Audio/17_santa.wav new file mode 100644 index 0000000000000000000000000000000000000000..ca7ff306be7d2d3dd6568f479a8939f32de793cd --- /dev/null +++ b/en/StyleTTS2-lite/Audio/17_santa.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80bc56619904ccbd93ed813fc54491f7b83eb8b8fd6c8a1626bd9177f96a23cd +size 583244 diff --git a/en/StyleTTS2-lite/Audio/18_adam.wav b/en/StyleTTS2-lite/Audio/18_adam.wav new file mode 100644 index 0000000000000000000000000000000000000000..041b5ec69696f44072af1f3ec812dc3776c10470 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/18_adam.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b84a1b122273a45d98b5cbf725f4633e4cccb4a0788b8a46cc9faa4b8612419b +size 517244 diff --git a/en/StyleTTS2-lite/Audio/1_heart.wav b/en/StyleTTS2-lite/Audio/1_heart.wav new file mode 100644 index 0000000000000000000000000000000000000000..cb710714304fd01c010df235086f59dff3688c7a --- /dev/null +++ b/en/StyleTTS2-lite/Audio/1_heart.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978b285ff24f274a1f4fe4551b0d57a5df704ca5ce83284e839ffe96c2dc3dfd +size 547244 diff --git a/en/StyleTTS2-lite/Audio/2_belle.wav b/en/StyleTTS2-lite/Audio/2_belle.wav new file mode 100644 index 0000000000000000000000000000000000000000..a60f00cf9c7836061a705128f4e2ea97c35b514d --- /dev/null +++ b/en/StyleTTS2-lite/Audio/2_belle.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:459a64fa12dfb530320e8dab2f4057d7868ae4c020b447e8df3402149fa2be59 +size 357644 diff --git a/en/StyleTTS2-lite/Audio/3_kore.wav b/en/StyleTTS2-lite/Audio/3_kore.wav new file mode 100644 index 0000000000000000000000000000000000000000..4345b7b294d9c8eb518965fc780d0db096f9c834 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/3_kore.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e55fc5c463d01d46c090be5457c59727ee52f2ecbeba8be9b38862850418c0c3 +size 276044 diff --git a/en/StyleTTS2-lite/Audio/4_sarah.wav b/en/StyleTTS2-lite/Audio/4_sarah.wav new file mode 100644 index 0000000000000000000000000000000000000000..f66e1ef54aecdb651f3f32a4697c557c4174b11a --- /dev/null +++ b/en/StyleTTS2-lite/Audio/4_sarah.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ae7416f410104b0cedc1cc9c7365a89fd16a1599733f8f416e7618943d0acb8 +size 640844 diff --git a/en/StyleTTS2-lite/Audio/5_nova.wav b/en/StyleTTS2-lite/Audio/5_nova.wav new file mode 100644 index 0000000000000000000000000000000000000000..a57d0e083f1c4d8820b43504d4cfaa9171d5fb61 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/5_nova.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:252c20a3f55bfe0ea7f42fbd638f6d4113ade7918630d1d37e166e11143f74f8 +size 336044 diff --git a/en/StyleTTS2-lite/Audio/6_sky.wav b/en/StyleTTS2-lite/Audio/6_sky.wav new file mode 100644 index 0000000000000000000000000000000000000000..80179e732265cf3b9130a832bd91a0f3dca5d66a --- /dev/null +++ b/en/StyleTTS2-lite/Audio/6_sky.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc985eb31aa7e2088f852c55282ec6ff72365486478a627bcd56ce2387a8d5b2 +size 502844 diff --git a/en/StyleTTS2-lite/Audio/7_alloy.wav b/en/StyleTTS2-lite/Audio/7_alloy.wav new file mode 100644 index 0000000000000000000000000000000000000000..a81738fa8b6ff0ab939631d999fa4c4e0a1a74b3 --- /dev/null +++ b/en/StyleTTS2-lite/Audio/7_alloy.wav 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd7868816449f2139e21661dcbc13d3d553c558627d4c50fada1f7c22ce7f86c +size 632444 diff --git a/en/StyleTTS2-lite/Audio/8_jessica.wav b/en/StyleTTS2-lite/Audio/8_jessica.wav new file mode 100644 index 0000000000000000000000000000000000000000..829db10fd7ff6b541d5585cf421f0a869b63aeed --- /dev/null +++ b/en/StyleTTS2-lite/Audio/8_jessica.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8d7573154905c901281e767f25be2dbceae731c891da409f5b7c0be3096bd5d +size 477644 diff --git a/en/StyleTTS2-lite/Audio/9_river.wav b/en/StyleTTS2-lite/Audio/9_river.wav new file mode 100644 index 0000000000000000000000000000000000000000..8946e917739c9b9a7e92b90c7e3ab8348756e40a --- /dev/null +++ b/en/StyleTTS2-lite/Audio/9_river.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75a3b2fc9d4e93ded21f28cccc6ae7bf7a39bf04fed7f2d4d36e59db0792eedd +size 472844 diff --git a/en/StyleTTS2-lite/LICENSE b/en/StyleTTS2-lite/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6585d82c2a0f68f31ba4e2264d2d4beb57bda33f --- /dev/null +++ b/en/StyleTTS2-lite/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Aaron (Yinghao) Li + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/en/StyleTTS2-lite/Models/base_model.pth b/en/StyleTTS2-lite/Models/base_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..860eacec1470716ace6171a56d35e65f5f8c23db --- /dev/null +++ b/en/StyleTTS2-lite/Models/base_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:821deb4efee549b7024f37236e86b4bcb023870baf0ddb9f407fb514253340d1 +size 1692092384 diff --git a/en/StyleTTS2-lite/Models/config.yaml b/en/StyleTTS2-lite/Models/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..920d284e55be3d1cc6a55b770c9673d91996412c --- /dev/null +++ b/en/StyleTTS2-lite/Models/config.yaml @@ -0,0 +1,79 @@ +log_dir: ./Models/Finetune +save_freq: 1 +log_interval: 10 +device: cuda +epochs: 50 +batch_size: 2 +max_len: 310 # maximum number of frames +pretrained_model: ./Models/Finetune/base_model.pth +load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters +debug: true + +data_params: + train_data: ../../Data_Speech/LibriTTS/train.txt + val_data: ../../Data_Speech/LibriTTS/val.txt + root_path: ../../Data_Speech/ + +symbol: #Total 178 symbols + pad: "$" + punctuation: ';:,.!?¡¿—…"«»“” ' + letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" + extend: "" #ADD MORE SYMBOLS HERE + +preprocess_params: + sr: 24000 + spect_params: + n_fft: 2048 + win_length: 1200 + hop_length: 300 + +training_strats: + #All modules: 'decoder', 'predictor', 'text_encoder', 'style_encoder', 'text_aligner', 'pitch_extractor', 'mpd', 'msd' + freeze_modules: [''] # Not updated when training. + ignore_modules: [''] # Not loaded => fresh start. IMPORTANT: 'text_aligner' and 'pitch_extractor' are pretrained utility models; DO NOT ignore them. + +model_params: + dim_in: 64 + hidden_dim: 512 + max_conv_dim: 512 + n_layer: 3 + n_mels: 80 + max_dur: 50 # maximum duration of a single phoneme + style_dim: 128 # style vector size + dropout: 0.2 + + ASR_params: + input_dim: 80 + hidden_dim: 256 + n_layers: 6 + token_embedding_dim: 512 + + JDC_params: + num_class: 1 + seq_len: 192 + + # config for decoder + decoder: + type: hifigan # either hifigan or istftnet + resblock_kernel_sizes: [3,7,11] + upsample_rates : [10,5,3,2] + upsample_initial_channel: 512 + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + upsample_kernel_sizes: [20,10,6,4] + +loss_params: + lambda_mel: 5. # mel reconstruction loss + lambda_gen: 1. # generator loss + + lambda_mono: 1. # monotonic alignment loss (TMA) + lambda_s2s: 1. # sequence-to-sequence loss (TMA) + + lambda_F0: 1. # F0 reconstruction loss + lambda_norm: 1. # norm reconstruction loss + lambda_dur: 1. # duration loss + lambda_ce: 20.
# duration predictor probability output CE loss + +optimizer_params: + lr: 0.0001 # general learning rate + ft_lr: 0.00001 # learning rate for acoustic modules \ No newline at end of file diff --git a/en/StyleTTS2-lite/Models/inference/model.pth b/en/StyleTTS2-lite/Models/inference/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..21458618df3cd59ecb1560039c78a66c3013ebb4 --- /dev/null +++ b/en/StyleTTS2-lite/Models/inference/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2763d7b6c5477502d3f2a870eda76bbedae671f0107b15a1060fb4e6771ed634 +size 359997166 diff --git a/en/StyleTTS2-lite/Modules/__init__.py b/en/StyleTTS2-lite/Modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d3f5a12faa99758192ecc4ed3fc22c9249232e86 --- /dev/null +++ b/en/StyleTTS2-lite/Modules/__init__.py @@ -0,0 +1 @@ + diff --git a/en/StyleTTS2-lite/Modules/hifigan.py b/en/StyleTTS2-lite/Modules/hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..5ad62b7611d7137895b7ab70e214c99f2b6741e1 --- /dev/null +++ b/en/StyleTTS2-lite/Modules/hifigan.py @@ -0,0 +1,477 @@ +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .utils import init_weights, get_padding + +import math +import random +import numpy as np + +LRELU_SLOPE = 0.1 + +class AdaIN1d(nn.Module): + def __init__(self, style_dim, num_features): + super().__init__() + self.norm = nn.InstanceNorm1d(num_features, affine=False) + self.fc = nn.Linear(style_dim, num_features*2) + + def forward(self, x, s): + h = self.fc(s) + h = h.view(h.size(0), h.size(1), 1) + gamma, beta = torch.chunk(h, chunks=2, dim=1) + return (1 + gamma) * self.norm(x) + beta + +class AdaINResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64): + super(AdaINResBlock1, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + self.adain1 = nn.ModuleList([ + AdaIN1d(style_dim, channels), + AdaIN1d(style_dim, channels), + AdaIN1d(style_dim, channels), + ]) + + self.adain2 = nn.ModuleList([ + AdaIN1d(style_dim, channels), + AdaIN1d(style_dim, channels), + AdaIN1d(style_dim, channels), + ]) + + self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))]) + self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))]) + + + def forward(self, x, s): + for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2): + xt = n1(x, 
s) + xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2) # Snake1D + xt = c1(xt) + xt = n2(xt, s) + xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2) # Snake1D + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + +class SineGen(torch.nn.Module): + """ Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__(self, samp_rate, upsample_scale, harmonic_num=0, + sine_amp=0.1, noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + self.flag_for_pulse = flag_for_pulse + self.upsample_scale = upsample_scale + + def _f02uv(self, f0): + # generate uv signal + uv = (f0 > self.voiced_threshold).type(torch.float32) + return uv + + def _f02sine(self, f0_values): + """ f0_values: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \ + device=f0_values.device) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + if not self.flag_for_pulse: +# # for normal case + +# # To prevent torch.cumsum numerical overflow, +# # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. +# # Buffer tmp_over_one_idx indicates the time step to add -1. 
+# # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi +# tmp_over_one = torch.cumsum(rad_values, 1) % 1 +# tmp_over_one_idx = (padDiff(tmp_over_one)) < 0 +# cumsum_shift = torch.zeros_like(rad_values) +# cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + +# phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi + rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2), + scale_factor=1/self.upsample_scale, + mode="linear").transpose(1, 2) + +# tmp_over_one = torch.cumsum(rad_values, 1) % 1 +# tmp_over_one_idx = (padDiff(tmp_over_one)) < 0 +# cumsum_shift = torch.zeros_like(rad_values) +# cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi + phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale, + scale_factor=self.upsample_scale, mode="linear").transpose(1, 2) + sines = torch.sin(phase) + + else: + # If necessary, make sure that the first time step of every + # voiced segments is sin(pi) or cos(0) + # This is used for pulse-train generation + + # identify the last time step in unvoiced segments + uv = self._f02uv(f0_values) + uv_1 = torch.roll(uv, shifts=-1, dims=1) + uv_1[:, -1, :] = 1 + u_loc = (uv < 1) * (uv_1 > 0) + + # get the instantanouse phase + tmp_cumsum = torch.cumsum(rad_values, dim=1) + # different batch needs to be processed differently + for idx in range(f0_values.shape[0]): + temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] + temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] + # stores the accumulation of i.phase within + # each voiced segments + tmp_cumsum[idx, :, :] = 0 + tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum + + # rad_values - tmp_cumsum: remove the accumulation of i.phase + # within the previous voiced segment. + i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) + + # get the sines + sines = torch.cos(i_phase * 2 * np.pi) + return sines + + def forward(self, f0): + """ sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, + device=f0.device) + # fundamental component + fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)) + + # generate sine waveforms + sine_waves = self._f02sine(fn) * self.sine_amp + + # generate uv signal + # uv = torch.ones(f0.shape) + # uv = uv * (f0 > self.voiced_threshold) + uv = self._f02uv(f0) + + # noise: for unvoiced should be similar to sine_amp + # std = self.sine_amp/3 -> max value ~ self.sine_amp + # . 
for voiced regions is self.noise_std + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + # first: set the unvoiced part to 0 by uv + # then: additive noise + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """ SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num, + sine_amp, add_noise_std, voiced_threshod) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x): + """ + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + """ + # source for harmonic branch + with torch.no_grad(): + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + # source for noise branch, in the same shape as uv + noise = torch.randn_like(uv) * self.sine_amp / 3 + return sine_merge, noise, uv +def padDiff(x): + return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0) + +class Generator(torch.nn.Module): + def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + resblock = AdaINResBlock1 + + self.m_source = SourceModuleHnNSF( + sampling_rate=24000, + upsample_scale=np.prod(upsample_rates), + harmonic_num=8, voiced_threshod=10) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.noise_convs = nn.ModuleList() + self.ups = nn.ModuleList() + self.noise_res = nn.ModuleList() + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + + self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i), + upsample_initial_channel//(2**(i+1)), + k, u, padding=(u//2 + u%2), output_padding=u%2))) + + if i + 1 < len(upsample_rates): # + stride_f0 = np.prod(upsample_rates[i + 1:]) + self.noise_convs.append(Conv1d( + 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2)) + self.noise_res.append(resblock(c_cur, 7, [1,3,5], style_dim)) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + self.noise_res.append(resblock(c_cur, 11, [1,3,5], style_dim)) + + self.resblocks = nn.ModuleList() + + 
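+ # The "alphas" below parameterize per-channel Snake activations: the forward
+ # pass applies x + (1/a) * sin^2(a * x), a periodic activation suited to audio.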
self.alphas = nn.ParameterList() + self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1))) + + for i in range(len(self.ups)): + ch = upsample_initial_channel//(2**(i+1)) + self.alphas.append(nn.Parameter(torch.ones(1, ch, 1))) + + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(resblock(ch, k, d, style_dim)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x, s, f0): + + f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t + + har_source, noi_source, uv = self.m_source(f0) + har_source = har_source.transpose(1, 2) + + for i in range(self.num_upsamples): + x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2) + x_source = self.noise_convs[i](har_source) + x_source = self.noise_res[i](x_source, s) + + x = self.ups[i](x) + x = x + x_source + + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x, s) + else: + xs += self.resblocks[i*self.num_kernels+j](x, s) + x = xs / self.num_kernels + x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_post) # this Generator defines no conv_pre (unlike vanilla HiFi-GAN), so only conv_post is unwrapped + + +class AdainResBlk1d(nn.Module): + def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2), + upsample='none', dropout_p=0.0): + super().__init__() + self.actv = actv + self.upsample_type = upsample + self.upsample = UpSample1d(upsample) + self.learned_sc = dim_in != dim_out + self._build_weights(dim_in, dim_out, style_dim) + self.dropout = nn.Dropout(dropout_p) + + if upsample == 'none': + self.pool = nn.Identity() + else: + self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1)) + + + def _build_weights(self, dim_in, dim_out, style_dim): + self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1)) + self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1)) + self.norm1 = AdaIN1d(style_dim, dim_in) + self.norm2 = AdaIN1d(style_dim, dim_out) + if self.learned_sc: + self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False)) + + def _shortcut(self, x): + x = self.upsample(x) + if self.learned_sc: + x = self.conv1x1(x) + return x + + def _residual(self, x, s): + x = self.norm1(x, s) + x = self.actv(x) + x = self.pool(x) + x = self.conv1(self.dropout(x)) + x = self.norm2(x, s) + x = self.actv(x) + x = self.conv2(self.dropout(x)) + return x + + def forward(self, x, s): + out = self._residual(x, s) + out = (out + self._shortcut(x)) / math.sqrt(2) + return out + +class UpSample1d(nn.Module): + def __init__(self, layer_type): + super().__init__() + self.layer_type = layer_type + + def forward(self, x): + if self.layer_type == 'none': + return x + else: + return F.interpolate(x, scale_factor=2, mode='nearest') + +class Decoder(nn.Module): + def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80, + resblock_kernel_sizes = [3,7,11], + upsample_rates = [10,5,3,2], + upsample_initial_channel=512, + resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]], + upsample_kernel_sizes=[20,10,6,4]): + super().__init__() + + self.decode = nn.ModuleList() + + self.encode =
AdainResBlk1d(dim_in + 2, 1024, style_dim) + + self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim)) + self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim)) + self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim)) + self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True)) + + self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1)) + + self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1)) + + self.asr_res = nn.Sequential( + weight_norm(nn.Conv1d(512, 64, kernel_size=1)), + ) + + + self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes) + + + def forward(self, asr, F0_curve, N, s): + if self.training: + downlist = [0, 3, 7] + F0_down = downlist[random.randint(0, 2)] + downlist = [0, 3, 7, 15] + N_down = downlist[random.randint(0, 3)] + if F0_down: + F0_curve = nn.functional.conv1d(F0_curve.unsqueeze(1), torch.ones(1, 1, F0_down).to(asr.device), padding=F0_down//2).squeeze(1) / F0_down + if N_down: + N = nn.functional.conv1d(N.unsqueeze(1), torch.ones(1, 1, N_down).to(asr.device), padding=N_down//2).squeeze(1) / N_down + + + F0 = self.F0_conv(F0_curve.unsqueeze(1)) + N = self.N_conv(N.unsqueeze(1)) + + x = torch.cat([asr, F0, N], axis=1) + x = self.encode(x, s) + + asr_res = self.asr_res(asr) + + res = True + for block in self.decode: + if res: + x = torch.cat([x, asr_res, F0, N], axis=1) + x = block(x, s) + if block.upsample_type != "none": + res = False + + x = self.generator(x, s, F0_curve) + return x + + \ No newline at end of file diff --git a/en/StyleTTS2-lite/Modules/utils.py b/en/StyleTTS2-lite/Modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c2fd9817caffbb9bf4c616b481cf84aee0362f6b --- /dev/null +++ b/en/StyleTTS2-lite/Modules/utils.py @@ -0,0 +1,16 @@ +from torch.nn.utils import weight_norm # needed by apply_weight_norm below + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) \ No newline at end of file diff --git a/en/StyleTTS2-lite/README.md b/en/StyleTTS2-lite/README.md new file mode 100644 index 0000000000000000000000000000000000000000..251601030c6b420c57917de21108620d1e2dc67f --- /dev/null +++ b/en/StyleTTS2-lite/README.md @@ -0,0 +1,88 @@ +--- +license: mit +language: +- en +base_model: +- yl4579/StyleTTS2-LibriTTS +pipeline_tag: text-to-speech +--- + +# StyleTTS 2 - lite + +## Online Demo +Explore the model on Hugging Face Spaces: +https://huggingface.co/spaces/dangtr0408/StyleTTS2-lite-space + +## Fine-tune +https://github.com/dangtr0408/StyleTTS2-lite + +## Training Details + +1. **Base Checkpoint:** Initialized from the official StyleTTS 2 weights pre-trained on LibriTTS. +2. **Removed Components:** PLBert, Diffusion, Prosodic Encoder, SLM, and Spectral Normalization. +3. **Training Data:** LibriTTS corpus. +4. **Training Schedule:** Trained for 100,000 steps.
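+
+As a quick cross-check of the parameter table in the next section, a minimal sketch along these lines (hypothetical, not part of this repo) tallies per-module counts straight from the released checkpoint, assuming the "net" layout that `inference.py` expects in `__load_models`:
+
+```python
+import torch
+
+# "net" maps module names to their state dicts in this checkpoint layout.
+params = torch.load("Models/inference/model.pth", map_location="cpu")["net"]
+
+total = 0
+for name in ("decoder", "predictor", "text_encoder", "style_encoder"):
+    # State-dict tensors can include buffers and weight-norm factors, so the
+    # numbers may differ slightly from parameters()-based counts.
+    n = sum(t.numel() for t in params[name].values())
+    total += n
+    print(f"{name}: {n:,}")
+print(f"total: {total:,}")
+```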
+ +## Model Architecture + +| Component | Parameters | +| -------------- | ------------- | +| Decoder | 54,289,492 | +| Predictor | 16,194,612 | +| Style Encoder | 13,845,440 | +| Text Encoder | 5,612,320 | +| **Total** | **89,941,576** | + +## Prerequisites + +- **Python:** Version 3.7 or higher +- **Git:** To clone the repository + +## Installation & Setup + +1. Clone the repository: + +```bash + +git clone https://huggingface.co/dangtr0408/StyleTTS2-lite + +cd StyleTTS2-lite + +``` + +2. Install dependencies: + +```bash + +pip install -r requirements.txt + +``` + + + +3. On **Linux**, manually install espeak-ng: + +```bash + +sudo apt-get install espeak-ng + +``` + +## Usage Example + +See the run.ipynb file. + +## Disclaimer + +**Before using these pre-trained models, you agree to inform the listeners that the speech samples are synthesized by the pre-trained models, unless you have the permission to use the voice you synthesize. That is, you agree to only use voices whose speakers grant the permission to have their voice cloned, either directly or by license before making synthesized voices public, or you have to publicly announce that these voices are synthesized if you do not have the permission to use these voices.** + + +## References + +- [yl4579/StyleTTS2](https://arxiv.org/abs/2306.07691) + +- [jik876/hifi-gan](https://github.com/jik876/hifi-gan) + +## License + +**Code: MIT License** \ No newline at end of file diff --git a/en/StyleTTS2-lite/inference.py b/en/StyleTTS2-lite/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..cbf0a574164acf1aa01e7b7c2dcea8ea74b4384b --- /dev/null +++ b/en/StyleTTS2-lite/inference.py @@ -0,0 +1,301 @@ +import re +import yaml +from munch import Munch +import numpy as np +import librosa +import noisereduce as nr +from meldataset import TextCleaner +import torch +import torchaudio +from nltk.tokenize import word_tokenize +import nltk +nltk.download('punkt_tab') + +from models import ProsodyPredictor, TextEncoder, StyleEncoder +from Modules.hifigan import Decoder + +class Preprocess: + def __text_normalize(self, text): + punctuation = [",", "、", "،", ";", "(", ".", "。", "…", "!", "–", ":", "?"] + map_to = "." + punctuation_pattern = re.compile(f"[{''.join(re.escape(p) for p in punctuation)}]") + #replace punctuation that acts like a comma or period + text = punctuation_pattern.sub(map_to, text) + #replace consecutive whitespace chars with a single space and strip leading/trailing spaces + text = re.sub(r'\s+', ' ', text).strip() + return text + def __merge_fragments(self, texts, n): + merged = [] + i = 0 + while i < len(texts): + fragment = texts[i] + j = i + 1 + while len(fragment.split()) < n and j < len(texts): + fragment += ", " + texts[j] + j += 1 + merged.append(fragment) + i = j + if len(merged[-1].split()) < n and len(merged) > 1: #handle last sentence + merged[-2] = merged[-2] + ", " + merged[-1] + del merged[-1] + else: + merged[-1] = merged[-1] + return merged + def wave_preprocess(self, wave): + to_mel = torchaudio.transforms.MelSpectrogram(n_mels=80, n_fft=2048, win_length=1200, hop_length=300) + mean, std = -4, 4 + wave_tensor = torch.from_numpy(wave).float() + mel_tensor = to_mel(wave_tensor) + mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std + return mel_tensor + def text_preprocess(self, text, n_merge=12): + text_norm = self.__text_normalize(text).split(".")#split by sentences.
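+ # Keep the text as a list of sentence fragments so that long inputs can be
+ # synthesized chunk by chunk in generate() and concatenated afterwards.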
+ text_norm = [s.strip() for s in text_norm] + text_norm = list(filter(lambda x: x != '', text_norm)) #filter out empty entries + text_norm = self.__merge_fragments(text_norm, n=n_merge) #merge if a sentence has fewer than n words + return text_norm + def length_to_mask(self, lengths): + mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths) + mask = torch.gt(mask+1, lengths.unsqueeze(1)) + return mask + +#For inference only +class StyleTTS2(torch.nn.Module): + def __init__(self, config_path, models_path): + super().__init__() + self.register_buffer("get_device", torch.empty(0)) + self.preprocess = Preprocess() + self.ref_s = None + config = yaml.safe_load(open(config_path, "r", encoding="utf-8")) + + try: + symbols = ( + list(config['symbol']['pad']) + + list(config['symbol']['punctuation']) + + list(config['symbol']['letters']) + + list(config['symbol']['letters_ipa']) + + list(config['symbol']['extend']) + ) + symbol_dict = {} + for i in range(len(symbols)): + symbol_dict[symbols[i]] = i + + n_token = len(symbol_dict) + 1 + print("\nFound:", n_token, "symbols") + except Exception as e: + print(f"\nERROR: Cannot find {e} in config file!\nYour config file is likely outdated, please download an updated version from the repository.") + raise SystemExit(1) + + args = self.__recursive_munch(config['model_params']) + args['n_token'] = n_token + + self.cleaner = TextCleaner(symbol_dict, debug=False) + + assert args.decoder.type in ['hifigan'], 'Decoder type unknown' + + self.decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels, + resblock_kernel_sizes = args.decoder.resblock_kernel_sizes, + upsample_rates = args.decoder.upsample_rates, + upsample_initial_channel=args.decoder.upsample_initial_channel, + resblock_dilation_sizes=args.decoder.resblock_dilation_sizes, + upsample_kernel_sizes=args.decoder.upsample_kernel_sizes) + self.predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout) + self.text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token) + self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim) # acoustic style encoder + + self.__load_models(models_path) + + def __recursive_munch(self, d): + if isinstance(d, dict): + return Munch((k, self.__recursive_munch(v)) for k, v in d.items()) + elif isinstance(d, list): + return [self.__recursive_munch(v) for v in d] + else: + return d + + def __replace_outliers_zscore(self, tensor, threshold=3.0, factor=0.95): + mean = tensor.mean() + std = tensor.std() + z = (tensor - mean) / std + + # Identify outliers + outlier_mask = torch.abs(z) > threshold + # Compute replacement value, respecting sign + sign = torch.sign(tensor - mean) + replacement = mean + sign * (threshold * std * factor) + + result = tensor.clone() + result[outlier_mask] = replacement[outlier_mask] + + return result + + def __load_models(self, models_path): + module_params = [] + model = {'decoder':self.decoder, 'predictor':self.predictor, 'text_encoder':self.text_encoder, 'style_encoder':self.style_encoder} + + params_whole = torch.load(models_path, map_location='cpu') + params = params_whole['net'] + params = {key: value for key, value in params.items() if key in model.keys()} + + for key in model: + try: + model[key].load_state_dict(params[key]) + except: + from collections import OrderedDict + state_dict = params[key] +
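+ # Checkpoints saved through nn.DataParallel prefix every key with "module.";
+ # strip that prefix so the keys match the plain single-device modules below.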
new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] # remove `module.` + new_state_dict[name] = v + model[key].load_state_dict(new_state_dict, strict=False) + + total_params = sum(p.numel() for p in model[key].parameters()) + print(key,":",total_params) + module_params.append(total_params) + + print('\nTotal',":",sum(module_params)) + + def __compute_style(self, path, denoise, split_dur): + device = self.get_device.device + denoise = min(denoise, 1) + if split_dur != 0: split_dur = max(int(split_dur), 1) + max_samples = 24000*20 #max 20 seconds ref audio + print("Computing the style for:", path) + + wave, sr = librosa.load(path, sr=24000) + audio, index = librosa.effects.trim(wave, top_db=30) + if sr != 24000: + audio = librosa.resample(audio, sr, 24000) + if len(audio) > max_samples: + audio = audio[:max_samples] + + if denoise > 0.0: + audio_denoise = nr.reduce_noise(y=audio, sr=sr, n_fft=2048, win_length=1200, hop_length=300) + audio = audio*(1-denoise) + audio_denoise*denoise + + with torch.no_grad(): + if split_dur>0 and len(audio)/sr>=4: #Only effective if audio length is >= 4s + #This option will split the ref audio to multiple parts, calculate styles and average them + count = 0 + ref_s = None + jump = sr*split_dur + total_len = len(audio) + + #Need to init before the loop + mel_tensor = self.preprocess.wave_preprocess(audio[0:jump]).to(device) + ref_s = self.style_encoder(mel_tensor.unsqueeze(1)) + count += 1 + for i in range(jump, total_len, jump): + if i+jump >= total_len: + left_dur = (total_len-i)/sr + if left_dur >= 1: #Still count if left over dur is >= 1s + mel_tensor = self.preprocess.wave_preprocess(audio[i:total_len]).to(device) + ref_s += self.style_encoder(mel_tensor.unsqueeze(1)) + count += 1 + continue + mel_tensor = self.preprocess.wave_preprocess(audio[i:i+jump]).to(device) + ref_s += self.style_encoder(mel_tensor.unsqueeze(1)) + count += 1 + ref_s /= count + else: + mel_tensor = self.preprocess.wave_preprocess(audio).to(device) + ref_s = self.style_encoder(mel_tensor.unsqueeze(1)) + + return ref_s + + def __inference(self, phonem, ref_s, speed=1, prev_d_mean=0, t=0.1): + device = self.get_device.device + speed = min(max(speed, 0.0001), 2) #speed range [0, 2] + + phonem = ' '.join(word_tokenize(phonem)) + tokens = self.cleaner(phonem) + tokens.insert(0, 0) + tokens.append(0) + tokens = torch.LongTensor(tokens).to(device).unsqueeze(0) + + with torch.no_grad(): + input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device) + text_mask = self.preprocess.length_to_mask(input_lengths).to(device) + + # encode + t_en = self.text_encoder(tokens, input_lengths, text_mask) + s = ref_s.to(device) + + # cal alignment + d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask) + x, _ = self.predictor.lstm(d) + duration = self.predictor.duration_proj(x) + duration = torch.sigmoid(duration).sum(axis=-1) + + if prev_d_mean != 0:#Stabilize speaking speed between splits + dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device) + else: + dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device) + duration = duration*(1-t) + dur_stats*t + duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outlier + + duration /= speed + + pred_dur = torch.round(duration.squeeze()).clamp(min=1) + pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data)) + c_frame = 0 + for i in range(pred_aln_trg.size(0)): + pred_aln_trg[i, c_frame:c_frame + 
int(pred_dur[i].data)] = 1 + c_frame += int(pred_dur[i].data) + alignment = pred_aln_trg.unsqueeze(0).to(device) + + # encode prosody + en = (d.transpose(-1, -2) @ alignment) + F0_pred, N_pred = self.predictor.F0Ntrain(en, s) + asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device)) + + out = self.decoder(asr, F0_pred, N_pred, s) + + return out.squeeze().cpu().numpy(), duration.mean() + + def get_styles(self, speaker, denoise=0.3, avg_style=True, load_styles=False): + if not load_styles: + if avg_style: split_dur = 3 + else: split_dur = 0 + self.ref_s = self.__compute_style(speaker['path'], denoise=denoise, split_dur=split_dur) + else: + if self.ref_s is None: + raise Exception("Have to compute or load the styles first!") + style = { + 'style': self.ref_s, + 'path': speaker['path'], + 'speed': speaker['speed'], + } + return style + + def save_styles(self, save_dir): + if self.ref_s is not None: + torch.save(self.ref_s, save_dir) + print("Saved styles!") + else: + raise Exception("Have to compute the styles before saving it.") + + def load_styles(self, save_dir): + try: + self.ref_s = torch.load(save_dir) + print("Loaded styles!") + except Exception as e: + print(e) + + def generate(self, phonem, style, stabilize=True, n_merge=16): + if stabilize: smooth_value=0.2 + else: smooth_value=0 + + list_wav = [] + prev_d_mean = 0 + + print("Generating Audio...") + text_norm = self.preprocess.text_preprocess(phonem, n_merge=n_merge) + for sentence in text_norm: + wav, prev_d_mean = self.__inference(sentence, style['style'], speed=style['speed'], prev_d_mean=prev_d_mean, t=smooth_value) + wav = wav[4000:-4000] #Remove weird pulse and silent tokens + list_wav.append(wav) + + final_wav = np.concatenate(list_wav) + final_wav = np.concatenate([np.zeros([4000]), final_wav, np.zeros([4000])], axis=0) # add padding + return final_wav \ No newline at end of file diff --git a/en/StyleTTS2-lite/meldataset.py b/en/StyleTTS2-lite/meldataset.py new file mode 100644 index 0000000000000000000000000000000000000000..128873f8f960e5d98a7d2fbe8874bcb1fa07677c --- /dev/null +++ b/en/StyleTTS2-lite/meldataset.py @@ -0,0 +1,307 @@ +#coding: utf-8 +import os.path as osp +import random +import numpy as np +import random +import soundfile as sf +import librosa + +import torch +import torchaudio +import torch.utils.data +import torch.distributed as dist +from multiprocessing import Pool + +import logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +import pandas as pd + +class TextCleaner: + def __init__(self, symbol_dict, debug=True): + self.word_index_dictionary = symbol_dict + self.debug = debug + def __call__(self, text): + indexes = [] + for char in text: + try: + indexes.append(self.word_index_dictionary[char]) + except KeyError as e: + if self.debug: + print("\nWARNING UNKNOWN IPA CHARACTERS/LETTERS: ", char) + print("To ignore set 'debug' to false in the config") + continue + return indexes + +np.random.seed(1) +random.seed(1) +SPECT_PARAMS = { + "n_fft": 2048, + "win_length": 1200, + "hop_length": 300 +} +MEL_PARAMS = { + "n_mels": 80, +} + +to_mel = torchaudio.transforms.MelSpectrogram( + n_mels=80, n_fft=2048, win_length=1200, hop_length=300) +mean, std = -4, 4 + +def preprocess(wave): + wave_tensor = torch.from_numpy(wave).float() + mel_tensor = to_mel(wave_tensor) + mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std + return mel_tensor + +class FilePathDataset(torch.utils.data.Dataset): + def __init__(self, + data_list, + root_path, + symbol_dict, + sr=24000, + 
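+                 # note: data_augmentation is forced off when validation=True (see the assignment below)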
data_augmentation=False, + validation=False, + debug=True + ): + + _data_list = [l.strip().split('|') for l in data_list] + self.data_list = _data_list #[data if len(data) == 3 else (*data, 0) for data in _data_list] #append speakerid=0 for all + self.text_cleaner = TextCleaner(symbol_dict, debug) + self.sr = sr + + self.df = pd.DataFrame(self.data_list) + + self.to_melspec = torchaudio.transforms.MelSpectrogram(**MEL_PARAMS) + + self.mean, self.std = -4, 4 + self.data_augmentation = data_augmentation and (not validation) + self.max_mel_length = 192 + + self.root_path = root_path + + def __len__(self): + return len(self.data_list) + + def __getitem__(self, idx): + data = self.data_list[idx] + path = data[0] + + wave, text_tensor = self._load_tensor(data) + + mel_tensor = preprocess(wave).squeeze() + + acoustic_feature = mel_tensor.squeeze() + length_feature = acoustic_feature.size(1) + acoustic_feature = acoustic_feature[:, :(length_feature - length_feature % 2)] + + return acoustic_feature, text_tensor, path, wave + + def _load_tensor(self, data): + wave_path, text = data + wave, sr = sf.read(osp.join(self.root_path, wave_path)) + if wave.shape[-1] == 2: + wave = wave[:, 0].squeeze() + if sr != 24000: + wave = librosa.resample(wave, orig_sr=sr, target_sr=24000) + print(wave_path, sr) + + # Adding half a second padding. + wave = np.concatenate([np.zeros([12000]), wave, np.zeros([12000])], axis=0) + + text = self.text_cleaner(text) + + text.insert(0, 0) + text.append(0) + + text = torch.LongTensor(text) + + return wave, text + + def _load_data(self, data): + wave, text_tensor = self._load_tensor(data) + mel_tensor = preprocess(wave).squeeze() + + mel_length = mel_tensor.size(1) + if mel_length > self.max_mel_length: + random_start = np.random.randint(0, mel_length - self.max_mel_length) + mel_tensor = mel_tensor[:, random_start:random_start + self.max_mel_length] + + return mel_tensor + + +class Collater(object): + """ + Args: + adaptive_batch_size (bool): if true, decrease batch size when long data comes. 
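+        return_wave (bool): kept for compatibility; raw waveforms are always returned.
+        Note: adaptive batch sizing is not implemented in this version; length
+        bucketing is handled by BatchSampler further below. Each batch unpacks as
+        (waves, texts, input_lengths, mels, output_lengths).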
+ """ + + def __init__(self, return_wave=False): + self.text_pad_index = 0 + self.min_mel_length = 192 + self.max_mel_length = 192 + self.return_wave = return_wave + + + def __call__(self, batch): + batch_size = len(batch) + + # sort by mel length + lengths = [b[0].shape[1] for b in batch] + batch_indexes = np.argsort(lengths)[::-1] + batch = [batch[bid] for bid in batch_indexes] + + nmels = batch[0][0].size(0) + max_mel_length = max([b[0].shape[1] for b in batch]) + max_text_length = max([b[1].shape[0] for b in batch]) + + mels = torch.zeros((batch_size, nmels, max_mel_length)).float() + texts = torch.zeros((batch_size, max_text_length)).long() + + input_lengths = torch.zeros(batch_size).long() + output_lengths = torch.zeros(batch_size).long() + paths = ['' for _ in range(batch_size)] + waves = [None for _ in range(batch_size)] + + for bid, (mel, text, path, wave) in enumerate(batch): + mel_size = mel.size(1) + text_size = text.size(0) + mels[bid, :, :mel_size] = mel + texts[bid, :text_size] = text + input_lengths[bid] = text_size + output_lengths[bid] = mel_size + paths[bid] = path + + waves[bid] = wave + + return waves, texts, input_lengths, mels, output_lengths + + +def get_length(wave_path, root_path): + info = sf.info(osp.join(root_path, wave_path)) + return info.frames * (24000 / info.samplerate) + +def build_dataloader(path_list, + root_path, + symbol_dict, + validation=False, + batch_size=4, + num_workers=1, + device='cpu', + collate_config={}, + dataset_config={}): + + dataset = FilePathDataset(path_list, root_path, symbol_dict, validation=validation, **dataset_config) + collate_fn = Collater(**collate_config) + + print("Getting sample lengths...") + + num_processes = num_workers * 2 + if num_processes != 0: + list_of_tuples = [(d[0], root_path) for d in dataset.data_list] + with Pool(processes=num_processes) as pool: + sample_lengths = pool.starmap(get_length, list_of_tuples, chunksize=16) + else: + sample_lengths = [] + for d in dataset.data_list: + sample_lengths.append(get_length(d[0], root_path)) + + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=num_workers, + batch_sampler=BatchSampler( + sample_lengths, + batch_size, + shuffle=(not validation), + drop_last=(not validation), + num_replicas=1, + rank=0, + ), + collate_fn=collate_fn, + pin_memory=(device != "cpu"), + ) + + return data_loader + +#https://github.com/duerig/StyleTTS2/ +class BatchSampler(torch.utils.data.Sampler): + def __init__( + self, + sample_lengths, + batch_sizes, + num_replicas=None, + rank=None, + shuffle=True, + drop_last=False, + ): + self.batch_sizes = batch_sizes + if num_replicas is None: + self.num_replicas = dist.get_world_size() + else: + self.num_replicas = num_replicas + if rank is None: + self.rank = dist.get_rank() + else: + self.rank = rank + self.shuffle = shuffle + self.drop_last = drop_last + + self.time_bins = {} + self.epoch = 0 + self.total_len = 0 + self.last_bin = None + + for i in range(len(sample_lengths)): + bin_num = self.get_time_bin(sample_lengths[i]) + if bin_num != -1: + if bin_num not in self.time_bins: + self.time_bins[bin_num] = [] + self.time_bins[bin_num].append(i) + + for key in self.time_bins.keys(): + val = self.time_bins[key] + total_batch = self.batch_sizes * num_replicas + self.total_len += len(val) // total_batch + if not self.drop_last and len(val) % total_batch != 0: + self.total_len += 1 + + def __iter__(self): + sampler_order = list(self.time_bins.keys()) + sampler_indices = [] + + if self.shuffle: + sampler_indices = 
torch.randperm(len(sampler_order)).tolist() + else: + sampler_indices = list(range(len(sampler_order))) + + for index in sampler_indices: + key = sampler_order[index] + current_bin = self.time_bins[key] + dist = torch.utils.data.distributed.DistributedSampler( + current_bin, + num_replicas=self.num_replicas, + rank=self.rank, + shuffle=self.shuffle, + drop_last=self.drop_last, + ) + dist.set_epoch(self.epoch) + sampler = torch.utils.data.sampler.BatchSampler( + dist, self.batch_sizes, self.drop_last + ) + for item_list in sampler: + self.last_bin = key + yield [current_bin[i] for i in item_list] + + def __len__(self): + return self.total_len + + def set_epoch(self, epoch): + self.epoch = epoch + + def get_time_bin(self, sample_count): + result = -1 + frames = sample_count // 300 + if frames >= 20: + result = (frames - 20) // 20 + return result \ No newline at end of file diff --git a/en/StyleTTS2-lite/models.py b/en/StyleTTS2-lite/models.py new file mode 100644 index 0000000000000000000000000000000000000000..f5af8b0c8c8db61a9b7d8bd38a22f8c30bd7cf2c --- /dev/null +++ b/en/StyleTTS2-lite/models.py @@ -0,0 +1,532 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm + +from munch import Munch + +class LearnedDownSample(nn.Module): + def __init__(self, layer_type, dim_in): + super().__init__() + self.layer_type = layer_type + + if self.layer_type == 'none': + self.conv = nn.Identity() + elif self.layer_type == 'timepreserve': + self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)) + elif self.layer_type == 'half': + self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1) + else: + raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type) + + def forward(self, x): + return self.conv(x) + +class LearnedUpSample(nn.Module): + def __init__(self, layer_type, dim_in): + super().__init__() + self.layer_type = layer_type + + if self.layer_type == 'none': + self.conv = nn.Identity() + elif self.layer_type == 'timepreserve': + self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0)) + elif self.layer_type == 'half': + self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1) + else: + raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type) + + + def forward(self, x): + return self.conv(x) + +class DownSample(nn.Module): + def __init__(self, layer_type): + super().__init__() + self.layer_type = layer_type + + def forward(self, x): + if self.layer_type == 'none': + return x + elif self.layer_type == 'timepreserve': + return F.avg_pool2d(x, (2, 1)) + elif self.layer_type == 'half': + if x.shape[-1] % 2 != 0: + x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1) + return F.avg_pool2d(x, 2) + else: + raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type) + + +class UpSample(nn.Module): + def __init__(self, layer_type): + super().__init__() + self.layer_type = layer_type + + def forward(self, x): + if self.layer_type == 'none': + return x + elif self.layer_type == 'timepreserve': + return F.interpolate(x, scale_factor=(2, 1), mode='nearest') + elif self.layer_type == 'half': + return F.interpolate(x, scale_factor=2, mode='nearest') + 
else: + raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type) + + +class ResBlk(nn.Module): + def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2), + normalize=False, downsample='none'): + super().__init__() + self.actv = actv + self.normalize = normalize + self.downsample = DownSample(downsample) + self.downsample_res = LearnedDownSample(downsample, dim_in) + self.learned_sc = dim_in != dim_out + self._build_weights(dim_in, dim_out) + + def _build_weights(self, dim_in, dim_out): + self.conv1 = nn.Conv2d(dim_in, dim_in, 3, 1, 1) + self.conv2 = nn.Conv2d(dim_in, dim_out, 3, 1, 1) + if self.normalize: + self.norm1 = nn.InstanceNorm2d(dim_in, affine=True) + self.norm2 = nn.InstanceNorm2d(dim_in, affine=True) + if self.learned_sc: + self.conv1x1 = nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False) + + def _shortcut(self, x): + if self.learned_sc: + x = self.conv1x1(x) + if self.downsample: + x = self.downsample(x) + return x + + def _residual(self, x): + if self.normalize: + x = self.norm1(x) + x = self.actv(x) + x = self.conv1(x) + x = self.downsample_res(x) + if self.normalize: + x = self.norm2(x) + x = self.actv(x) + x = self.conv2(x) + return x + + def forward(self, x): + x = self._shortcut(x) + self._residual(x) + return x / math.sqrt(2) # unit variance + +class StyleEncoder(nn.Module): + def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384): + super().__init__() + blocks = [] + blocks += [nn.Conv2d(1, dim_in, 3, 1, 1)] + + repeat_num = 4 + for _ in range(repeat_num): + dim_out = min(dim_in*2, max_conv_dim) + blocks += [ResBlk(dim_in, dim_out, downsample='half')] + dim_in = dim_out + + blocks += [nn.LeakyReLU(0.2)] + blocks += [nn.Conv2d(dim_out, dim_out, 5, 1, 0)] + blocks += [nn.AdaptiveAvgPool2d(1)] + blocks += [nn.LeakyReLU(0.2)] + self.shared = nn.Sequential(*blocks) + + self.unshared = nn.Linear(dim_out, style_dim) + + def forward(self, x): + h = self.shared(x) + h = h.view(h.size(0), -1) + s = self.unshared(h) + + return s + +class LinearNorm(torch.nn.Module): + def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): + super(LinearNorm, self).__init__() + self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) + + torch.nn.init.xavier_uniform_( + self.linear_layer.weight, + gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, x): + return self.linear_layer(x) + +class ResBlk1d(nn.Module): + def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2), + normalize=False, downsample='none', dropout_p=0.2): + super().__init__() + self.actv = actv + self.normalize = normalize + self.downsample_type = downsample + self.learned_sc = dim_in != dim_out + self._build_weights(dim_in, dim_out) + self.dropout_p = dropout_p + + if self.downsample_type == 'none': + self.pool = nn.Identity() + else: + self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1)) + + def _build_weights(self, dim_in, dim_out): + self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1)) + self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1)) + if self.normalize: + self.norm1 = nn.InstanceNorm1d(dim_in, affine=True) + self.norm2 = nn.InstanceNorm1d(dim_in, affine=True) + if self.learned_sc: + self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False)) + + def downsample(self, x): + if self.downsample_type == 'none': + return x + else: + if x.shape[-1] % 2 != 0: + x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1) + return F.avg_pool1d(x, 2) + + 
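+    # The shortcut path below mirrors the residual branch's downsampling, so the two
+    # can be summed in forward() and rescaled by 1/sqrt(2) to preserve unit variance.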
def _shortcut(self, x): + if self.learned_sc: + x = self.conv1x1(x) + x = self.downsample(x) + return x + + def _residual(self, x): + if self.normalize: + x = self.norm1(x) + x = self.actv(x) + x = F.dropout(x, p=self.dropout_p, training=self.training) + + x = self.conv1(x) + x = self.pool(x) + if self.normalize: + x = self.norm2(x) + + x = self.actv(x) + x = F.dropout(x, p=self.dropout_p, training=self.training) + + x = self.conv2(x) + return x + + def forward(self, x): + x = self._shortcut(x) + self._residual(x) + return x / math.sqrt(2) # unit variance + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + +class TextEncoder(nn.Module): + def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)): + super().__init__() + self.embedding = nn.Embedding(n_symbols, channels) + + padding = (kernel_size - 1) // 2 + self.cnn = nn.ModuleList() + for _ in range(depth): + self.cnn.append(nn.Sequential( + weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)), + LayerNorm(channels), + actv, + nn.Dropout(0.2), + )) + # self.cnn = nn.Sequential(*self.cnn) + + self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True) + + def forward(self, x, input_lengths, m): + x = self.embedding(x) # [B, T, emb] + x = x.transpose(1, 2) # [B, emb, T] + m = m.to(input_lengths.device).unsqueeze(1) + x.masked_fill_(m, 0.0) + + for c in self.cnn: + x = c(x) + x.masked_fill_(m, 0.0) + + x = x.transpose(1, 2) # [B, T, chn] + + input_lengths = input_lengths.cpu().numpy() + x = nn.utils.rnn.pack_padded_sequence( + x, input_lengths, batch_first=True, enforce_sorted=False) + + self.lstm.flatten_parameters() + x, _ = self.lstm(x) + x, _ = nn.utils.rnn.pad_packed_sequence( + x, batch_first=True) + + x = x.transpose(-1, -2) + x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]]) + + x_pad[:, :, :x.shape[-1]] = x + x = x_pad.to(x.device) + + x.masked_fill_(m, 0.0) + + return x + + def inference(self, x): + x = self.embedding(x) + x = x.transpose(1, 2) + x = self.cnn(x) + x = x.transpose(1, 2) + self.lstm.flatten_parameters() + x, _ = self.lstm(x) + return x + + def length_to_mask(self, lengths): + mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths) + mask = torch.gt(mask+1, lengths.unsqueeze(1)) + return mask + + + +class AdaIN1d(nn.Module): + def __init__(self, style_dim, num_features): + super().__init__() + self.norm = nn.InstanceNorm1d(num_features, affine=False) + self.fc = nn.Linear(style_dim, num_features*2) + + def forward(self, x, s): + h = self.fc(s) + h = h.view(h.size(0), h.size(1), 1) + gamma, beta = torch.chunk(h, chunks=2, dim=1) + return (1 + gamma) * self.norm(x) + beta + +class UpSample1d(nn.Module): + def __init__(self, layer_type): + super().__init__() + self.layer_type = layer_type + + def forward(self, x): + if self.layer_type == 'none': + return x + else: + return F.interpolate(x, scale_factor=2, mode='nearest') + +class AdainResBlk1d(nn.Module): + def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2), + upsample='none', dropout_p=0.0): + super().__init__() + self.actv = actv + self.upsample_type = upsample + self.upsample = 
UpSample1d(upsample) + self.learned_sc = dim_in != dim_out + self._build_weights(dim_in, dim_out, style_dim) + self.dropout = nn.Dropout(dropout_p) + + if upsample == 'none': + self.pool = nn.Identity() + else: + self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1)) + + + def _build_weights(self, dim_in, dim_out, style_dim): + self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1)) + self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1)) + self.norm1 = AdaIN1d(style_dim, dim_in) + self.norm2 = AdaIN1d(style_dim, dim_out) + if self.learned_sc: + self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False)) + + def _shortcut(self, x): + x = self.upsample(x) + if self.learned_sc: + x = self.conv1x1(x) + return x + + def _residual(self, x, s): + x = self.norm1(x, s) + x = self.actv(x) + x = self.pool(x) + x = self.conv1(self.dropout(x)) + x = self.norm2(x, s) + x = self.actv(x) + x = self.conv2(self.dropout(x)) + return x + + def forward(self, x, s): + out = self._residual(x, s) + out = (out + self._shortcut(x)) / math.sqrt(2) + return out + +class AdaLayerNorm(nn.Module): + def __init__(self, style_dim, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.fc = nn.Linear(style_dim, channels*2) + + def forward(self, x, s): + x = x.transpose(-1, -2) + x = x.transpose(1, -1) + + h = self.fc(s) + h = h.view(h.size(0), h.size(1), 1) + gamma, beta = torch.chunk(h, chunks=2, dim=1) + gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1) + + + x = F.layer_norm(x, (self.channels,), eps=self.eps) + x = (1 + gamma) * x + beta + return x.transpose(1, -1).transpose(-1, -2) + +class ProsodyPredictor(nn.Module): + + def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1): + super().__init__() + + self.text_encoder = DurationEncoder(sty_dim=style_dim, + d_model=d_hid, + nlayers=nlayers, + dropout=dropout) + + self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True) + self.duration_proj = LinearNorm(d_hid, max_dur) + + self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True) + self.F0 = nn.ModuleList() + self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout)) + self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout)) + self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout)) + + self.N = nn.ModuleList() + self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout)) + self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout)) + self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout)) + + self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0) + self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0) + + + def forward(self, texts, style, text_lengths, alignment, m): + d = self.text_encoder(texts, style, text_lengths, m) + + batch_size = d.shape[0] + text_size = d.shape[1] + + # predict duration + input_lengths = text_lengths.cpu().numpy() + x = nn.utils.rnn.pack_padded_sequence( + d, input_lengths, batch_first=True, enforce_sorted=False) + + m = m.to(text_lengths.device).unsqueeze(1) + + self.lstm.flatten_parameters() + x, _ = self.lstm(x) + x, _ = nn.utils.rnn.pad_packed_sequence( + x, batch_first=True) + + x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]]) + + x_pad[:, :x.shape[1], :] = x + x = x_pad.to(x.device) + + duration = 
self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training)) + + en = (d.transpose(-1, -2) @ alignment) + + return duration.squeeze(-1), en + + def F0Ntrain(self, x, s): + x, _ = self.shared(x.transpose(-1, -2)) + + F0 = x.transpose(-1, -2) + for block in self.F0: + F0 = block(F0, s) + F0 = self.F0_proj(F0) + + N = x.transpose(-1, -2) + for block in self.N: + N = block(N, s) + N = self.N_proj(N) + + return F0.squeeze(1), N.squeeze(1) + + def length_to_mask(self, lengths): + mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths) + mask = torch.gt(mask+1, lengths.unsqueeze(1)) + return mask + +class DurationEncoder(nn.Module): + + def __init__(self, sty_dim, d_model, nlayers, dropout=0.1): + super().__init__() + self.lstms = nn.ModuleList() + for _ in range(nlayers): + self.lstms.append(nn.LSTM(d_model + sty_dim, + d_model // 2, + num_layers=1, + batch_first=True, + bidirectional=True, + dropout=dropout)) + self.lstms.append(AdaLayerNorm(sty_dim, d_model)) + + + self.dropout = dropout + self.d_model = d_model + self.sty_dim = sty_dim + + def forward(self, x, style, text_lengths, m): + masks = m.to(text_lengths.device) + + x = x.permute(2, 0, 1) + s = style.expand(x.shape[0], x.shape[1], -1) + x = torch.cat([x, s], axis=-1) + x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0) + + x = x.transpose(0, 1) + input_lengths = text_lengths.cpu().numpy() + x = x.transpose(-1, -2) + + for block in self.lstms: + if isinstance(block, AdaLayerNorm): + x = block(x.transpose(-1, -2), style).transpose(-1, -2) + x = torch.cat([x, s.permute(1, -1, 0)], axis=1) + x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0) + else: + x = x.transpose(-1, -2) + x = nn.utils.rnn.pack_padded_sequence( + x, input_lengths, batch_first=True, enforce_sorted=False) + block.flatten_parameters() + x, _ = block(x) + x, _ = nn.utils.rnn.pad_packed_sequence( + x, batch_first=True) + x = F.dropout(x, p=self.dropout, training=self.training) + x = x.transpose(-1, -2) + + x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]]) + + x_pad[:, :, :x.shape[-1]] = x + x = x_pad.to(x.device) + + return x.transpose(-1, -2) + + def inference(self, x, style): + x = self.embedding(x.transpose(-1, -2)) * math.sqrt(self.d_model) + style = style.expand(x.shape[0], x.shape[1], -1) + x = torch.cat([x, style], axis=-1) + src = self.pos_encoder(x) + output = self.transformer_encoder(src).transpose(0, 1) + return output + + def length_to_mask(self, lengths): + mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths) + mask = torch.gt(mask+1, lengths.unsqueeze(1)) + return mask \ No newline at end of file diff --git a/en/StyleTTS2-lite/requirements.txt b/en/StyleTTS2-lite/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab81f27db9222d271b07031a9db204fad84e02b0 --- /dev/null +++ b/en/StyleTTS2-lite/requirements.txt @@ -0,0 +1,10 @@ +torch +torchaudio +numpy +PyYAML +munch +nltk +librosa +noisereduce +phonemizer +espeakng-loader \ No newline at end of file diff --git a/en/StyleTTS2-lite/run.ipynb b/en/StyleTTS2-lite/run.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7c148873edd6eb2ac9b38817dcce8678b057f7b0 --- /dev/null +++ b/en/StyleTTS2-lite/run.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5a3ddcc8", + "metadata": {}, + "outputs": [], + "source": [ + "from inference import StyleTTS2\n", + "\n", + "import librosa\n", + "import 
IPython.display as ipd\n", + "import torch.cuda\n", + "\n", + "device = 'cuda' if torch.cuda.is_available() else 'cpu'" + ] + }, + { + "cell_type": "markdown", + "id": "092cfb69", + "metadata": {}, + "source": [ + "### Load G2P" + ] + }, + { + "cell_type": "markdown", + "id": "a152ec13", + "metadata": {}, + "source": [ + "If you did not use eSpeak for your language, please add your own G2P." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca224f37", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import phonemizer\n", + "if sys.platform.startswith(\"win\"):\n", + " try:\n", + " from phonemizer.backend.espeak.wrapper import EspeakWrapper\n", + " import espeakng_loader\n", + " EspeakWrapper.set_library(espeakng_loader.get_library_path())\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "def get_phoneme(text, lang):\n", + " try:\n", + " my_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True, language_switch='remove-flags')\n", + " return my_phonemizer.phonemize([text])[0]\n", + " except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "7b9cecbe", + "metadata": {}, + "source": [ + "### Load models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7b9c01d", + "metadata": {}, + "outputs": [], + "source": [ + "config_path = \"Models/config.yaml\"\n", + "models_path = \"Models/inference/model.pth\"" + ] + }, + { + "cell_type": "markdown", + "id": "b803110e", + "metadata": {}, + "source": [ + "### Synthesize speech\n", + "\n", + "Little Note: Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78396f70", + "metadata": {}, + "outputs": [], + "source": [ + "speaker = {\n", + " \"path\": \"./Audio/1_heart.wav\", #Ref audio path\n", + " \"speed\": 1.0, #Speaking speed\n", + "}\n", + "\n", + "max_samples = 24000*20 #max 20 seconds ref audio\n", + "print(speaker['path'])\n", + "wave, sr = librosa.load(speaker['path'], sr=24000)\n", + "audio, index = librosa.effects.trim(wave, top_db=30)\n", + "if sr != 24000: audio = librosa.resample(audio, sr, 24000)\n", + "if len(audio) > max_samples: audio = audio[:max_samples]\n", + "display(ipd.Audio(audio, rate=24000, normalize=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "395959f1", + "metadata": {}, + "outputs": [], + "source": [ + "text = '''\n", + "Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n", + "Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16194211", + "metadata": {}, + "outputs": [], + "source": [ + "model = StyleTTS2(config_path, models_path).eval().to(device)\n", + "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n", + "stabilize = False #BOOL Stabilize speaking speed.\n", + "denoise = 0.3 #FLOAT Adjust the strength of the denoiser. 
Value range is [0, 1]\n", + "n_merge = 16 #INT Avoid short sentences by merging when a sentence has fewer than n words" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "980c6fbb", + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " phonemes = get_phoneme(text=text, lang=\"en-us\")\n", + "\n", + " styles = model.get_styles(speaker, denoise, avg_style)\n", + " r = model.generate(phonemes, styles, stabilize, n_merge)\n", + "\n", + "print('Synthesized:')\n", + "display(ipd.Audio(r, rate=24000, normalize=True))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/en/StyleTTS2-lite/source.txt b/en/StyleTTS2-lite/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c854ac84b5abb38481db029e9e07e91c7ecaffc --- /dev/null +++ b/en/StyleTTS2-lite/source.txt @@ -0,0 +1 @@ +https://huggingface.co/dangtr0408/StyleTTS2-lite \ No newline at end of file diff --git a/en/StyleTTS2/.gitattributes b/en/StyleTTS2/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..dab9a4e17afd2ef39d90ccb0b40ef2786fe77422 --- /dev/null +++ b/en/StyleTTS2/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/en/StyleTTS2/Multi0/config.yml b/en/StyleTTS2/Multi0/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..362d107fa43ba3beea7f31ff25e7e3505f667355 --- /dev/null +++ b/en/StyleTTS2/Multi0/config.yml @@ -0,0 +1,112 @@ +ASR_config: Utils/ASR/config.yml +ASR_path: Utils/ASR/epoch_00080.pth +F0_path: Utils/JDC/bst.t7 +PLBERT_dir: Utils/PLBERT/ +batch_size: 6 +data_params: + OOD_data: Data/OOD_texts.txt 
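+  # OOD texts are sampled for the SLM adversarial objective (see slmadv_params below)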
+ min_length: 50 + root_path: /root/StyleTTS2/Omni1_data + train_data: Data/train_list.txt + val_data: Data/val_list.txt +device: cuda +epochs_1st: 200 +epochs_2nd: 60 +first_stage_path: first_stage.pth +log_dir: Models/Omni1 +log_interval: 10 +loss_params: + TMA_epoch: 50 + diff_epoch: 14 + joint_epoch: 19 + lambda_F0: 1.0 + lambda_ce: 20.0 + lambda_diff: 1.0 + lambda_dur: 1.0 + lambda_gen: 1.0 + lambda_mel: 5.0 + lambda_mono: 1.0 + lambda_norm: 1.0 + lambda_s2s: 1.0 + lambda_slm: 1.0 + lambda_sty: 1.0 +max_len: 400 +model_params: + decoder: + gen_istft_hop_size: 5 + gen_istft_n_fft: 20 + resblock_dilation_sizes: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + resblock_kernel_sizes: + - 3 + - 7 + - 11 + type: istftnet + upsample_initial_channel: 512 + upsample_kernel_sizes: + - 20 + - 12 + upsample_rates: + - 10 + - 6 + diffusion: + dist: + estimate_sigma_data: true + mean: -3.0 + sigma_data: 0.3141927569675583 + std: 1.0 + embedding_mask_proba: 0.1 + transformer: + head_features: 64 + multiplier: 2 + num_heads: 8 + num_layers: 3 + dim_in: 64 + dropout: 0.2 + hidden_dim: 512 + max_conv_dim: 512 + max_dur: 50 + multispeaker: true + n_layer: 3 + n_mels: 80 + n_token: 178 + slm: + hidden: 768 + initial_channel: 64 + model: microsoft/wavlm-base-plus + nlayers: 13 + sr: 16000 + style_dim: 128 +optimizer_params: + bert_lr: 1.0e-05 + ft_lr: 1.0e-05 + lr: 0.0001 +preprocess_params: + spect_params: + hop_length: 300 + n_fft: 2048 + win_length: 1200 + sr: 24000 +pretrained_model: /root/StyleTTS2/Models/Omni1/epoch_2nd_pretrained.pth +resume: true +save_freq: 1 +saver_freq_steps: 150 +saver_max_ckpts: 5 +saver_mode: ITER +second_stage_load_pretrained: true +slmadv_params: + batch_percentage: 0.5 + iter: 10 + max_len: 400 + min_len: 160 + scale: 0.01 + sig: 1.5 + thresh: 5 diff --git a/en/StyleTTS2/Multi0/config_30_e934.yml b/en/StyleTTS2/Multi0/config_30_e934.yml new file mode 100644 index 0000000000000000000000000000000000000000..80ff142816ed6a7c89eab8b39f8920c55336f23f --- /dev/null +++ b/en/StyleTTS2/Multi0/config_30_e934.yml @@ -0,0 +1,22 @@ +{ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7, + PLBERT_dir: Utils/PLBERT/, batch_size: 12, data_params: {OOD_data: Data/OOD_texts.txt, + min_length: 50, root_path: /root/StyleTTS2/Multi0_data, train_data: Data/train_list.txt, + val_data: Data/val_list.txt}, device: cuda, epochs_1st: 200, epochs_2nd: 60, first_stage_path: first_stage.pth, + log_dir: Models/Multi0, log_interval: 10, loss_params: {TMA_epoch: 50, diff_epoch: 14, + joint_epoch: 19, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0, lambda_dur: 1.0, + lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0, lambda_s2s: 1.0, + lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 400, model_params: {decoder: {gen_istft_hop_size: 5, + gen_istft_n_fft: 20, resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, + 5]], resblock_kernel_sizes: [3, 7, 11], type: istftnet, upsample_initial_channel: 512, + upsample_kernel_sizes: [20, 12], upsample_rates: [10, 6]}, diffusion: {dist: { + estimate_sigma_data: true, mean: -3.0, sigma_data: 0.31839087134423844, std: 1.0}, + embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2, num_heads: 8, + num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512, + max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178, slm: { + hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13, + sr: 16000}, style_dim: 128}, 
optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05, + lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048, + win_length: 1200}, sr: 24000}, pretrained_model: '', resume: true, save_freq: 1, + saver_freq_steps: 150, saver_max_ckpts: 5, saver_mode: ITER, second_stage_load_pretrained: true, + slmadv_params: {batch_percentage: 0.5, iter: 10, max_len: 400, min_len: 160, scale: 0.01, + sig: 1.5, thresh: 5}} diff --git a/en/StyleTTS2/Multi0/config_40_1c872.yml b/en/StyleTTS2/Multi0/config_40_1c872.yml new file mode 100644 index 0000000000000000000000000000000000000000..2cf05e14bf46f77e7701da54863459007d307a70 --- /dev/null +++ b/en/StyleTTS2/Multi0/config_40_1c872.yml @@ -0,0 +1,22 @@ +{ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7, + PLBERT_dir: Utils/PLBERT/, batch_size: 6, data_params: {OOD_data: Data/OOD_texts.txt, + min_length: 50, root_path: /root/StyleTTS2/Multi0_data, train_data: Data/train_list.txt, + val_data: Data/val_list.txt}, device: cuda, epochs_1st: 200, epochs_2nd: 60, first_stage_path: first_stage.pth, + log_dir: Models/Multi0, log_interval: 10, loss_params: {TMA_epoch: 50, diff_epoch: 14, + joint_epoch: 19, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0, lambda_dur: 1.0, + lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0, lambda_s2s: 1.0, + lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 400, model_params: {decoder: {gen_istft_hop_size: 5, + gen_istft_n_fft: 20, resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, + 5]], resblock_kernel_sizes: [3, 7, 11], type: istftnet, upsample_initial_channel: 512, + upsample_kernel_sizes: [20, 12], upsample_rates: [10, 6]}, diffusion: {dist: { + estimate_sigma_data: true, mean: -3.0, sigma_data: 0.2969374090377316, std: 1.0}, + embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2, num_heads: 8, + num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512, + max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178, slm: { + hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13, + sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05, + lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048, + win_length: 1200}, sr: 24000}, pretrained_model: '', resume: true, save_freq: 1, + saver_freq_steps: 150, saver_max_ckpts: 5, saver_mode: ITER, second_stage_load_pretrained: true, + slmadv_params: {batch_percentage: 0.5, iter: 10, max_len: 400, min_len: 160, scale: 0.01, + sig: 1.5, thresh: 5}} diff --git a/en/StyleTTS2/Multi0/epoch_2nd_23_9ab0.pth b/en/StyleTTS2/Multi0/epoch_2nd_23_9ab0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b848e5ecf112633dec05dc0cf4eceea61c80dcb9 --- /dev/null +++ b/en/StyleTTS2/Multi0/epoch_2nd_23_9ab0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe9be5c4c2df12b5ddb65cce7e45849d3ed674db1fcb89eb7f1bafc65f05ade +size 2132412506 diff --git a/en/StyleTTS2/Multi0/epoch_2nd_30_e934.pth b/en/StyleTTS2/Multi0/epoch_2nd_30_e934.pth new file mode 100644 index 0000000000000000000000000000000000000000..49e867447ea0848db3531080dbd2bbc7418bb562 --- /dev/null +++ b/en/StyleTTS2/Multi0/epoch_2nd_30_e934.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7af95c2c61a778fec6ad5cec95497daaf9bf3dd6cec6db7f02f4fe90e3e5657a +size 2132412506 diff --git a/en/StyleTTS2/Multi0/epoch_2nd_40_1c872.pth b/en/StyleTTS2/Multi0/epoch_2nd_40_1c872.pth new 
file mode 100644 index 0000000000000000000000000000000000000000..b671a74550df6f3551e43201aef6b3a22ae3bddc --- /dev/null +++ b/en/StyleTTS2/Multi0/epoch_2nd_40_1c872.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:065b610ae5fd9fc73eea396761d42a99a4770a243aca76aa7db4ff9bd13d81ac +size 2132415942 diff --git a/en/StyleTTS2/Multi0/ref_audio.zip b/en/StyleTTS2/Multi0/ref_audio.zip new file mode 100644 index 0000000000000000000000000000000000000000..0ee34c08b9649f98823c328837f71b60d9941ad5 --- /dev/null +++ b/en/StyleTTS2/Multi0/ref_audio.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa342abae6a7d06b84508e828c1082aa0fc6d484bd709cb40650d879c31c5f16 +size 4766523 diff --git a/en/StyleTTS2/README.md b/en/StyleTTS2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6ff1af9223ec0e97215e9f34fdeac1b9a18841b --- /dev/null +++ b/en/StyleTTS2/README.md @@ -0,0 +1,7 @@ +--- +datasets: +- therealvul/StyleTTS2MLP +language: +- en +--- +This repository contains StyleTTS2 models trained on Pony Preservation Project data \ No newline at end of file diff --git a/en/StyleTTS2/Twilight0/config.yml b/en/StyleTTS2/Twilight0/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..389b1c0c7346198dc08a56ca2d51a26a8565e463 --- /dev/null +++ b/en/StyleTTS2/Twilight0/config.yml @@ -0,0 +1,118 @@ +log_dir: "Models/Twilight0" +first_stage_path: "epoch_1st_00066.pth" +save_freq: 1 +log_interval: 10 +device: "cuda" +epochs_1st: 200 # number of epochs for first stage training (pre-training) +epochs_2nd: 100 # number of peochs for second stage training (joint training) +batch_size: 2 +segmented_batch_size: [3, 2, 2] +max_len: 175 # maximum number of frames +pretrained_model: "Models/Twilight0/epoch_2nd_00006.pth.bak" +#pretrained_model: "Models/Twilight0/epoch_1st_00067.pth" +second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage +load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters + +F0_path: "Utils/JDC/bst.t7" +ASR_config: "Utils/ASR/config.yml" +ASR_path: "Utils/ASR/epoch_00080.pth" +PLBERT_dir: 'Utils/PLBERT/' + +data_params: + train_data: "Data/train_list_small.txt" + val_data: "Data/val_list_small.txt" + root_path: "twilight_data" + OOD_data: "Data/OOD_texts.txt" + min_length: 50 # sample until texts with this size are obtained for OOD texts + +preprocess_params: + sr: 24000 + spect_params: + n_fft: 2048 + win_length: 1200 + hop_length: 300 + +model_params: + multispeaker: false + + dim_in: 64 + hidden_dim: 512 + max_conv_dim: 512 + n_layer: 3 + n_mels: 80 + + n_token: 178 # number of phoneme tokens + max_dur: 50 # maximum duration of a single phoneme + style_dim: 128 # style vector size + + dropout: 0.2 + + # config for decoder + decoder: + type: 'istftnet' # either hifigan or istftnet + resblock_kernel_sizes: [3,7,11] + upsample_rates : [10, 6] + upsample_initial_channel: 512 + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + upsample_kernel_sizes: [20, 12] + gen_istft_n_fft: 20 + gen_istft_hop_size: 5 + + # speech language model config + slm: + model: 'microsoft/wavlm-base-plus' + sr: 16000 # sampling rate of SLM + hidden: 768 # hidden size of SLM + nlayers: 13 # number of layers of SLM + initial_channel: 64 # initial channels of SLM discriminator head + + # style diffusion model config + diffusion: + embedding_mask_proba: 0.1 + # transformer config + transformer: + num_layers: 3 + num_heads: 8 + head_features: 64 + 
multiplier: 2 + + # diffusion distribution config + dist: + sigma_data: 0.681720565168225 # placeholder for estimate_sigma_data set to false + estimate_sigma_data: true # estimate sigma_data from the current batch if set to true + mean: -3.0 + std: 1.0 + +loss_params: + lambda_mel: 5. # mel reconstruction loss + lambda_gen: 1. # generator loss + lambda_slm: 1. # slm feature matching loss + + lambda_mono: 1. # monotonic alignment loss (1st stage, TMA) + lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA) + TMA_epoch: 50 # TMA starting epoch (1st stage) + + lambda_F0: 1. # F0 reconstruction loss (2nd stage) + lambda_norm: 1. # norm reconstruction loss (2nd stage) + lambda_dur: 1. # duration loss (2nd stage) + lambda_ce: 20. # duration predictor probability output CE loss (2nd stage) + lambda_sty: 1. # style reconstruction loss (2nd stage) + lambda_diff: 1. # score matching loss (2nd stage) + + diff_epoch: 8 # style diffusion starting epoch (2nd stage) + joint_epoch: 9 # joint training starting epoch (2nd stage) + +optimizer_params: + lr: 0.0001 # general learning rate + bert_lr: 0.00001 # learning rate for PLBERT + ft_lr: 0.0001 # learning rate for acoustic modules + +slmadv_params: + min_len: 100 # minimum length of samples + max_len: 500 # maximum length of samples + batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size + iter: 10 # update the discriminator every this iterations of generator update + thresh: 5 # gradient norm above which the gradient is scaled + scale: 0.01 # gradient scaling factor for predictors from SLM discriminators + sig: 1.5 # sigma for differentiable duration modeling + diff --git a/en/StyleTTS2/Twilight0/epoch_2nd_00007.pth b/en/StyleTTS2/Twilight0/epoch_2nd_00007.pth new file mode 100644 index 0000000000000000000000000000000000000000..32ba24c9f4aa02c701646cdd69e02b8a1da3da45 --- /dev/null +++ b/en/StyleTTS2/Twilight0/epoch_2nd_00007.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a33f933917e96e6ce9a05d2aceee9b6c48df0ecae7da8ba7a15bf6d7ede695b +size 2091494993 diff --git a/en/StyleTTS2/Twilight0/epoch_2nd_00008.pth b/en/StyleTTS2/Twilight0/epoch_2nd_00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea78dd31baeb498eba7d5de83e0efb66d1d14e2b --- /dev/null +++ b/en/StyleTTS2/Twilight0/epoch_2nd_00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7213e945587ebedbd9a71c5eb17f7e41f68ee0dc9528e10073494f5f6fbb60cc +size 2091494993 diff --git a/en/StyleTTS2/Twilight0/epoch_2nd_00009.pth b/en/StyleTTS2/Twilight0/epoch_2nd_00009.pth new file mode 100644 index 0000000000000000000000000000000000000000..b1149dde343f2957960924b274ebbae8e74f2b15 --- /dev/null +++ b/en/StyleTTS2/Twilight0/epoch_2nd_00009.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf0148ac0fa7a7a8056b4b0a366ff779191a13c874770994f8e4ce751f46b60 +size 2091494993 diff --git a/en/StyleTTS2/Unfinished/epoch_1st_1_ae9c.pth b/en/StyleTTS2/Unfinished/epoch_1st_1_ae9c.pth new file mode 100644 index 0000000000000000000000000000000000000000..d336a47a62d5c312b00318c036738331f622dcc3 --- /dev/null +++ b/en/StyleTTS2/Unfinished/epoch_1st_1_ae9c.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a99698dc093c625985c7aa3c1330ab0119258a1b1ac27a7a942c66646d788b0c +size 2177267182 diff --git a/en/StyleTTS2/Unfinished/epoch_1st_50_27678.pth b/en/StyleTTS2/Unfinished/epoch_1st_50_27678.pth new file mode 100644 index 
0000000000000000000000000000000000000000..d8dfabcb2d892b26563038d0c6a6e5da1b507f81 --- /dev/null +++ b/en/StyleTTS2/Unfinished/epoch_1st_50_27678.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52ec21f64b374fab11e9b3ab3410e9f643c6c2c407e84ba1c7bccfb5619b6cde +size 2239703042 diff --git a/en/StyleTTS2/Unfinished/epoch_1st_60_2ff94.pth b/en/StyleTTS2/Unfinished/epoch_1st_60_2ff94.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2b9179b11dc8c929e0615455e3f1cf50cf69ea9 --- /dev/null +++ b/en/StyleTTS2/Unfinished/epoch_1st_60_2ff94.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3770cb4bdb9f949a98c654bd7590f4bcb9a3a30900d7d5a7171732c7c4ad1a11 +size 2239703042 diff --git a/en/StyleTTS2/Unfinished/epoch_2nd_22_bcac.pth b/en/StyleTTS2/Unfinished/epoch_2nd_22_bcac.pth new file mode 100644 index 0000000000000000000000000000000000000000..32481fd2c376f90926ffed9aad69486571d31b53 --- /dev/null +++ b/en/StyleTTS2/Unfinished/epoch_2nd_22_bcac.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2714c7d8bc912627a65f50a293144a0bb034fad0080556b4f16d1b42aff5ce8 +size 1585682547 diff --git a/en/StyleTTS2/Unfinished/epoch_2nd_24_13146.pth b/en/StyleTTS2/Unfinished/epoch_2nd_24_13146.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fbdeaee56e1950dd25596d6be8c4348744f8068 --- /dev/null +++ b/en/StyleTTS2/Unfinished/epoch_2nd_24_13146.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bc31ecfb23005f6822eb510932b08c6e69883a03fcc5e0d53683abb934c2045 +size 2132415878 diff --git a/en/StyleTTS2/Unfinished/epoch_2nd_28_12c00.pth b/en/StyleTTS2/Unfinished/epoch_2nd_28_12c00.pth new file mode 100644 index 0000000000000000000000000000000000000000..8b2c262f72078ad1e1a892a84bcad44419dcb96e --- /dev/null +++ b/en/StyleTTS2/Unfinished/epoch_2nd_28_12c00.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8915877d23d8018e05707a9ed51290986b18586dc12ba567832e3475ac049b22 +size 1585684720 diff --git a/en/StyleTTS2/Unfinished/epoch_2nd_30_178c2.pth b/en/StyleTTS2/Unfinished/epoch_2nd_30_178c2.pth new file mode 100644 index 0000000000000000000000000000000000000000..2458ed797822b9a5740122415d33b5bd14b3615e --- /dev/null +++ b/en/StyleTTS2/Unfinished/epoch_2nd_30_178c2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1d5b7ab4b54b9879a23d2d9162d27893631a7ae984f1c4f310411ec2fc58f6 +size 2132415878 diff --git a/en/StyleTTS2/Unfinished/epoch_2nd_40_268fe.pth b/en/StyleTTS2/Unfinished/epoch_2nd_40_268fe.pth new file mode 100644 index 0000000000000000000000000000000000000000..789c5b9eec99be9514efce76c58c64ebe70961fc --- /dev/null +++ b/en/StyleTTS2/Unfinished/epoch_2nd_40_268fe.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5213efb9403b9ca67ce931e0360784ef27ac844ba50fae9623fabfa777375a4a +size 2132415878 diff --git a/en/StyleTTS2/epoch_1st_00012.pth b/en/StyleTTS2/epoch_1st_00012.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ea455b251ec755ec68fb272c5729f2ec311ee2a --- /dev/null +++ b/en/StyleTTS2/epoch_1st_00012.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e458a11c6eb6be2828ef328555ea1af76dbaa50f6e7b8590ba66a8db9ed1725 +size 1344741563 diff --git a/en/StyleTTS2/source.txt b/en/StyleTTS2/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9b018805be0cae443dc10eed1f062525b0d403d --- /dev/null +++ 
b/en/StyleTTS2/source.txt @@ -0,0 +1 @@ +https://huggingface.co/therealvul/StyleTTS2 \ No newline at end of file diff --git a/fr/StyleTTS2_French/.gitattributes b/fr/StyleTTS2_French/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/fr/StyleTTS2_French/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/fr/StyleTTS2_French/README.md b/fr/StyleTTS2_French/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7be5fc7f47d5db027d120b8024982df93db95b74 --- /dev/null +++ b/fr/StyleTTS2_French/README.md @@ -0,0 +1,3 @@ +--- +license: mit +--- diff --git a/fr/StyleTTS2_French/config.yml b/fr/StyleTTS2_French/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..49ece97f705a60a9e28ff38a1b44264627b59d79 --- /dev/null +++ b/fr/StyleTTS2_French/config.yml @@ -0,0 +1,22 @@ +{ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7, + PLBERT_dir: Utils/PLBERT/, batch_size: 2, data_params: {OOD_data: Data/OOD_texts.txt, + min_length: 50, root_path: wavs, train_data: train_list.txt, + val_data: val_list.txt}, device: cuda, + epochs_1st: 100, epochs_2nd: 60, first_stage_path: first_stage.pth, load_only_params: false, + log_dir: Models, log_interval: 10, loss_params: {TMA_epoch: 49, diff_epoch: 19, + joint_epoch: 51, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0, lambda_dur: 1.0, + lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0, lambda_s2s: 1.0, + lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 300, model_params: {decoder: {gen_istft_hop_size: 5, + gen_istft_n_fft: 20, resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, + 5]], resblock_kernel_sizes: [3, 7, 11], type: istftnet, upsample_initial_channel: 512, + upsample_kernel_sizes: [20, 12], upsample_rates: [10, 6]}, diffusion: {dist: { + estimate_sigma_data: true, mean: -3.0, sigma_data: 0.3284805215512056, std: 1.0}, + 
embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2, num_heads: 8, + num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512, + max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178, slm: { + hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13, + sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05, + lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048, + win_length: 1200}, sr: 24000}, pretrained_model: "", + save_freq: 10, second_stage_load_pretrained: true, slmadv_params: {batch_percentage: 0.5, + iter: 10, max_len: 500, min_len: 400, scale: 0.01, sig: 1.5, thresh: 5}} diff --git a/fr/StyleTTS2_French/epoch_2nd_00050.pth b/fr/StyleTTS2_French/epoch_2nd_00050.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fce5891dfb2924076af4fa24ae776f9ba7676ad --- /dev/null +++ b/fr/StyleTTS2_French/epoch_2nd_00050.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb5420222205db28df1799f463799821f3eff80dcc99061655af1b17652e34e1 +size 1585676736 diff --git a/fr/StyleTTS2_French/source.txt b/fr/StyleTTS2_French/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..61ac3538b156b2673d587f205c950c58ed26c3c1 --- /dev/null +++ b/fr/StyleTTS2_French/source.txt @@ -0,0 +1 @@ +https://huggingface.co/Scralius/StyleTTS2_French \ No newline at end of file diff --git a/ms/StyleTTS2-MS/.gitattributes b/ms/StyleTTS2-MS/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/ms/StyleTTS2-MS/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/ms/StyleTTS2-MS/README.md b/ms/StyleTTS2-MS/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5e32c8921b95a263fdd82353dd3330bacc58b33a --- /dev/null +++ b/ms/StyleTTS2-MS/README.md @@ -0,0 +1,23 @@ +--- +datasets: +- mesolitica/TTS 
+language: +- ms +--- + +# StyleTTS2 MS + +Forked from https://github.com/mesolitica/StyleTTS2-MS; trained on the first stage only. + +## Pre-trained modules + +1. Forked the original [yl4579/AuxiliaryASR](https://github.com/yl4579/AuxiliaryASR) at [mesolitica/AuxiliaryASR-Phonemizer](https://github.com/mesolitica/AuxiliaryASR-Phonemizer) to use an `ms` phonemizer, and trained it on the [mesolitica/tts-combine-annotated](https://huggingface.co/datasets/mesolitica/tts-combine-annotated) dataset. +2. Forked the original [PL-BERT](https://arxiv.org/abs/2301.08810) at [malaysia-ai/PL-BERT-MS](https://github.com/malaysia-ai/PL-BERT-MS) to use a custom word tokenizer, and pretrained it on Malay Wikipedia and local news. + +## Checkpoints + +We uploaded full checkpoints with optimizer states at [checkpoints-first-stage](checkpoints-first-stage). + +## Dataset + +We trained on [mesolitica/TTS](https://huggingface.co/datasets/mesolitica/TTS). \ No newline at end of file diff --git a/ms/StyleTTS2-MS/checkpoints-first-stage/config.yml b/ms/StyleTTS2-MS/checkpoints-first-stage/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..941b76757b53f53f67069adfee20ccca9c2f925f --- /dev/null +++ b/ms/StyleTTS2-MS/checkpoints-first-stage/config.yml @@ -0,0 +1,113 @@ +log_dir: "Models/Multispeakers" +first_stage_path: "first_stage.pth" +save_freq: 1 +log_interval: 10 +device: "cuda" +epochs_1st: 50 # number of epochs for first stage training (pre-training) +epochs_2nd: 30 # number of epochs for second stage training (joint training) +batch_size: 10 +max_len: 300 # maximum number of frames +pretrained_model: "/ephemeral/epoch_1st_00001.pth" +second_stage_load_pretrained: true # set to true if the pre-trained model is for the 2nd stage +load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters + +F0_path: "Utils/JDC/bst.t7" +ASR_config: "Utils/ASR/config.yml" +ASR_path: "Utils/ASR/epoch_00020.pth" +PLBERT_dir: 'Utils/PLBERT/' + +data_params: + train_data: "/ephemeral/train_list.txt" + val_data: "/ephemeral/val_list.txt" + root_path: "" + OOD_data: "Data/OOD_texts.txt" + min_length: 50 # sample until texts of this length are obtained for OOD texts + +preprocess_params: + sr: 24000 + spect_params: + n_fft: 2048 + win_length: 1200 + hop_length: 300 + +model_params: + multispeaker: true + + dim_in: 64 + hidden_dim: 512 + max_conv_dim: 512 + n_layer: 3 + n_mels: 80 + + n_token: 178 # number of phoneme tokens + max_dur: 50 # maximum duration of a single phoneme + style_dim: 128 # style vector size + + dropout: 0.2 + + # config for decoder + decoder: + type: 'hifigan' # either hifigan or istftnet + resblock_kernel_sizes: [3,7,11] + upsample_rates : [10,5,3,2] + upsample_initial_channel: 512 + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + upsample_kernel_sizes: [20,10,6,4] + + # speech language model config + slm: + model: 'microsoft/wavlm-base-plus' + sr: 16000 # sampling rate of SLM + hidden: 768 # hidden size of SLM + nlayers: 13 # number of layers of SLM + initial_channel: 64 # initial channels of SLM discriminator head + + # style diffusion model config + diffusion: + embedding_mask_proba: 0.1 + # transformer config + transformer: + num_layers: 3 + num_heads: 8 + head_features: 64 + multiplier: 2 + + # diffusion distribution config + dist: + sigma_data: 0.2 # placeholder when estimate_sigma_data is set to false + estimate_sigma_data: true # estimate sigma_data from the current batch if set to true + mean: -3.0 + std: 1.0 + +loss_params: + lambda_mel: 5.
# mel reconstruction loss + lambda_gen: 1. # generator loss + lambda_slm: 1. # slm feature matching loss + + lambda_mono: 1. # monotonic alignment loss (1st stage, TMA) + lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA) + TMA_epoch: 2 # TMA starting epoch (1st stage) + + lambda_F0: 1. # F0 reconstruction loss (2nd stage) + lambda_norm: 1. # norm reconstruction loss (2nd stage) + lambda_dur: 1. # duration loss (2nd stage) + lambda_ce: 20. # duration predictor probability output CE loss (2nd stage) + lambda_sty: 1. # style reconstruction loss (2nd stage) + lambda_diff: 1. # score matching loss (2nd stage) + + diff_epoch: 10 # style diffusion starting epoch (2nd stage) + joint_epoch: 15 # joint training starting epoch (2nd stage) + +optimizer_params: + lr: 0.0001 # general learning rate + bert_lr: 0.00001 # learning rate for PLBERT + ft_lr: 0.00001 # learning rate for acoustic modules + +slmadv_params: + min_len: 400 # minimum length of samples + max_len: 500 # maximum length of samples + batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size + iter: 20 # update the discriminator once every this many generator updates + thresh: 5 # gradient norm above which the gradient is scaled + scale: 0.01 # gradient scaling factor for predictors from SLM discriminators + sig: 1.5 # sigma for differentiable duration modeling diff --git a/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00001.pth b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00001.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e405b654e8f11660ac536a2ea4f4138854e4cd7 --- /dev/null +++ b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00001.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b10c228157fe7d4a4b82250edad12a800f339808697a303b359e5e3171a36a8 +size 1356576201 diff --git a/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00002.pth b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3a56969adef565bfd7a5a3148bd3b5ff7769dd0 --- /dev/null +++ b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cf59ed53a77e54b45972db0adfc1010ee07866e5096a1b21f888245917912e4 +size 1748794795 diff --git a/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00003.pth b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00003.pth new file mode 100644 index 0000000000000000000000000000000000000000..507a29f0b3d51af65278b21f089a0c73cb29d8b5 --- /dev/null +++ b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00003.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6358966d9a37574ffb8cb38d1319e1283c7354c3ce8fb87b7091c91f86f2270 +size 1748794795 diff --git a/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00004.pth b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00004.pth new file mode 100644 index 0000000000000000000000000000000000000000..7acec207d528ed3025b45283a156a371418ca10b --- /dev/null +++ b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00004.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f606850978a5ce5af40c9143880bc848ca3b2a8d188e444ed6c98b159cdabed7 +size 1748794795 diff --git a/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00006.pth b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00006.pth new file mode 100644 index 0000000000000000000000000000000000000000..b8928c3ea09c151d0543c7506ea3c5b357907d0b ---
/dev/null +++ b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00006.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de977308263b78e6a60dbe6de2350e14001c277621360e731a6334c4024791ac +size 1748794795 diff --git a/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00008.pth b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..1eefb029bc1e7bc499bfb9799212e2159c469e84 --- /dev/null +++ b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9712df724df11b573842f8b269dd6e8fc74c230d3494148d13b30d51e3003b19 +size 1748794795 diff --git a/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00009.pth b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00009.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd7266d1718c84cc2d6dfbf1696194853e0f4acd --- /dev/null +++ b/ms/StyleTTS2-MS/checkpoints-first-stage/epoch_1st_00009.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddee4a0ac8b2da6abe73376366f6022850dcca5dfede95638b4b92717b4e674d +size 1748794795 diff --git a/ms/StyleTTS2-MS/source.txt b/ms/StyleTTS2-MS/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..aafaa179605b005b5fe64202f60c0b9622f7fd96 --- /dev/null +++ b/ms/StyleTTS2-MS/source.txt @@ -0,0 +1 @@ +https://huggingface.co/mesolitica/StyleTTS2-MS \ No newline at end of file diff --git a/ru/StyleTTS_prokopenko_v1/.gitattributes b/ru/StyleTTS_prokopenko_v1/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..9a39d3ec281fdfab271d9dc400eff5d933657d6c --- /dev/null +++ b/ru/StyleTTS_prokopenko_v1/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +voices/prokopenko/reference.wav filter=lfs diff=lfs merge=lfs -text diff --git a/ru/StyleTTS_prokopenko_v1/README.md b/ru/StyleTTS_prokopenko_v1/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..c6d938de9442d3f7a38485c22fa839197628294e --- /dev/null +++ b/ru/StyleTTS_prokopenko_v1/README.md @@ -0,0 +1,5 @@ +--- +language: +- ru +--- +Place the `voices` folder into the StyleTTS root directory and select this voice reference. diff --git a/ru/StyleTTS_prokopenko_v1/config.yml b/ru/StyleTTS_prokopenko_v1/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..c01c1f85124c92605025f96c1ec448b24353d234 --- /dev/null +++ b/ru/StyleTTS_prokopenko_v1/config.yml @@ -0,0 +1,20 @@ +{ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7, + PLBERT_dir: Utils/PLBERT, batch_size: 1, data_params: {OOD_data: Data/OOD_texts.txt, + min_length: 50, root_path: training/prokopenko_v3/audio, train_data: training/prokopenko_v3/train_phoneme.txt, + val_data: training/prokopenko_v3/validation_phoneme.txt}, device: cuda, epochs: 20, + load_only_params: true, log_dir: training/prokopenko_v3/models, log_interval: 10, + loss_params: {diff_epoch: 0, joint_epoch: 0, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0, + lambda_dur: 1.0, lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0, + lambda_s2s: 1.0, lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 100, model_params: { + decoder: {resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]], resblock_kernel_sizes: [ 3, 7, 11], type: hifigan, upsample_initial_channel: 512, upsample_kernel_sizes: [ 20, 10, 6, 4], upsample_rates: [10, 5, 3, 2]}, diffusion: {dist: {estimate_sigma_data: true, mean: -3.0, sigma_data: 0.18500752085259445, std: 1.0}, embedding_mask_proba: 0.1, + transformer: {head_features: 64, multiplier: 2, num_heads: 8, num_layers: 3}}, + dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512, max_dur: 50, multispeaker: true, + n_layer: 3, n_mels: 80, n_token: 178, slm: {hidden: 768, initial_channel: 64, + model: microsoft/wavlm-base-plus, nlayers: 13, sr: 16000}, style_dim: 128}, + optimizer_params: {bert_lr: 1.0e-05, ft_lr: 0.0001, lr: 0.0001}, preprocess_params: { spect_params: {hop_length: 300, n_fft: 2048, win_length: 1200}, sr: 24000}, pretrained_model: models/prokopenko_v2/epoch_2nd_00018.pth, + save_freq: 1, second_stage_load_pretrained: true, slmadv_params: {batch_percentage: 0.5, + iter: 10, max_len: 500, min_len: 400, scale: 0.01, sig: 1.5, thresh: 5}} diff --git a/ru/StyleTTS_prokopenko_v1/epoch_2nd_00047.pth b/ru/StyleTTS_prokopenko_v1/epoch_2nd_00047.pth new file mode 100644 index 0000000000000000000000000000000000000000..5ba759a3c32d721209fb59b1239911599a1e0856 --- /dev/null +++ b/ru/StyleTTS_prokopenko_v1/epoch_2nd_00047.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12d297bd707547dbd9404ec6e6b282aca78f51debc7b1795212d2a222f7b0f44 +size 2242832422 diff --git a/ru/StyleTTS_prokopenko_v1/source.txt b/ru/StyleTTS_prokopenko_v1/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..28ca450d2ea2bf733aa60e7b0ca6a92e73d473e2 --- /dev/null +++ b/ru/StyleTTS_prokopenko_v1/source.txt @@ -0,0 +1 @@ +https://huggingface.co/snzhkhd/StyleTTS_prokopenko_v1 \ No newline at end of file diff --git a/ru/StyleTTS_prokopenko_v1/voices/prokopenko/reference.wav b/ru/StyleTTS_prokopenko_v1/voices/prokopenko/reference.wav new file mode 100644 index 0000000000000000000000000000000000000000..79793661d50d70e3e884d46bdf573ae2a7b902ba --- /dev/null +++ b/ru/StyleTTS_prokopenko_v1/voices/prokopenko/reference.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid
sha256:6ef76b2be85e68db784c43f1ad109a37449be494820d02c65052bd7e0d2d0fc4 +size 984044 diff --git a/uk/styletts2_ukrainian_multispeaker_hifigan/.gitattributes b/uk/styletts2_ukrainian_multispeaker_hifigan/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_hifigan/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/uk/styletts2_ukrainian_multispeaker_hifigan/README.md b/uk/styletts2_ukrainian_multispeaker_hifigan/README.md new file mode 100644 index 0000000000000000000000000000000000000000..631d26c3c6e16453e6dabac95ba9072506460092 --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_hifigan/README.md @@ -0,0 +1,6 @@ +--- +license: mit +language: +- uk +pipeline_tag: text-to-speech +--- \ No newline at end of file diff --git a/uk/styletts2_ukrainian_multispeaker_hifigan/config.yml b/uk/styletts2_ukrainian_multispeaker_hifigan/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..e61fd97d94b1ed79707a21681f430aa833bc6956 --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_hifigan/config.yml @@ -0,0 +1,53 @@ +plbert_params: + vocab_size: 198 + hidden_size: 768 + num_attention_heads: 12 + intermediate_size: 2048 + max_position_embeddings: 512 + num_hidden_layers: 12 + dropout: 0.1 + + +model_params: + multispeaker: true + vocab: "$-´;:,.!?¡¿—…\"«»“” ()†/=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzéýíó'̯'͡ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲ'̩'ᵻ" + + + dim_in: 64 + hidden_dim: 512 + max_conv_dim: 512 + n_layer: 3 + n_mels: 80 + + n_token: 181 # number of phoneme tokens + max_dur: 50 # maximum duration of a single phoneme + style_dim: 128 # style vector size + + dropout: 0.0 + + # config for decoder + decoder: + type: 'hifigan' # either hifigan or istftnet + resblock_kernel_sizes: [3,7,11] + upsample_rates : [10,5,3,2] + upsample_initial_channel: 512 + 
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + upsample_kernel_sizes: [20,10,6,4] + + + # style diffusion model config + diffusion: + embedding_mask_proba: 0.1 + # transformer config + transformer: + num_layers: 3 + num_heads: 8 + head_features: 64 + multiplier: 2 + + # diffusion distribution config + dist: + sigma_data: 0.19988229232390187 + mean: -4.0 + std: 4.0 + diff --git a/uk/styletts2_ukrainian_multispeaker_hifigan/pytorch_model.bin b/uk/styletts2_ukrainian_multispeaker_hifigan/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..528a51c7a07f6f48bd018047e4c654dcceca651b --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_hifigan/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfd0e06ecc528d25c5d6371c0a5dbfa742b008d0f83d843738e411efeb03f851 +size 766654558 diff --git a/uk/styletts2_ukrainian_multispeaker_hifigan/source.txt b/uk/styletts2_ukrainian_multispeaker_hifigan/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa253b81e34c7adffd181365d00d14dee232c08d --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_hifigan/source.txt @@ -0,0 +1 @@ +https://huggingface.co/patriotyk/styletts2_ukrainian_multispeaker_hifigan \ No newline at end of file diff --git a/uk/styletts2_ukrainian_multispeaker_istftnet/.gitattributes b/uk/styletts2_ukrainian_multispeaker_istftnet/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_istftnet/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/uk/styletts2_ukrainian_multispeaker_istftnet/README.md b/uk/styletts2_ukrainian_multispeaker_istftnet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f057ca1d44b88c22f654ce8dc04cf66515f32e9e --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_istftnet/README.md @@ -0,0 +1,8 @@ +--- +license: mit +language: +- uk +tags: 
+- tts +- text-to-speech +--- \ No newline at end of file diff --git a/uk/styletts2_ukrainian_multispeaker_istftnet/config.yml b/uk/styletts2_ukrainian_multispeaker_istftnet/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..61a4a56a76a4174cc913996a1db084a0f37be50c --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_istftnet/config.yml @@ -0,0 +1,60 @@ +plbert_params: + vocab_size: 198 + hidden_size: 768 + num_attention_heads: 12 + intermediate_size: 2048 + max_position_embeddings: 512 + num_hidden_layers: 12 + dropout: 0.1 + + +model_params: + multispeaker: true + vocab: "$-´;:,.!?¡¿—…\"«»“” ()†/=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzéýíó'̯'͡ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲ'̩'ᵻ" + + dim_in: 64 + hidden_dim: 512 + max_conv_dim: 512 + n_layer: 3 + n_mels: 80 + + n_token: 181 # number of phoneme tokens + max_dur: 50 # maximum duration of a single phoneme + style_dim: 128 # style vector size + + dropout: 0.0 + + # config for decoder + decoder: + type: 'istftnet' # either hifigan or istftnet + resblock_kernel_sizes: [3,7,11] + upsample_rates : [10, 6] + upsample_initial_channel: 512 + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + upsample_kernel_sizes: [20, 12] + gen_istft_n_fft: 20 + gen_istft_hop_size: 5 + + + # style diffusion model config + diffusion: + embedding_mask_proba: 0.1 + # transformer config + transformer: + num_layers: 3 + num_heads: 8 + head_features: 64 + multiplier: 2 + + # diffusion distribution config + dist: + sigma_data: 0.18 + mean: -4.0 + std: 4.0 + + +asr_params: + input_dim: 80 + hidden_dim: 256 + n_token: 181 + token_embedding_dim: 512 \ No newline at end of file diff --git a/uk/styletts2_ukrainian_multispeaker_istftnet/pytorch_model.bin b/uk/styletts2_ukrainian_multispeaker_istftnet/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..06210a6a6de81bff4d6b5fa922642e3e0e5d9672 --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_istftnet/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da193f600e7cca59cd6527165d1b65ca82269b7c0d239f35ee5e3501e6066a4e +size 592273364 diff --git a/uk/styletts2_ukrainian_multispeaker_istftnet/source.txt b/uk/styletts2_ukrainian_multispeaker_istftnet/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4ecb41eb8ac26a2e9f8c2b429c0d958886c976d --- /dev/null +++ b/uk/styletts2_ukrainian_multispeaker_istftnet/source.txt @@ -0,0 +1 @@ +https://huggingface.co/patriotyk/styletts2_ukrainian_multispeaker_istftnet \ No newline at end of file diff --git a/uk/styletts2_ukrainian_single/.gitattributes b/uk/styletts2_ukrainian_single/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/uk/styletts2_ukrainian_single/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs 
merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/uk/styletts2_ukrainian_single/README.md b/uk/styletts2_ukrainian_single/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4a9c52a637312f6c45d6509f1c568ee28472fdad --- /dev/null +++ b/uk/styletts2_ukrainian_single/README.md @@ -0,0 +1,12 @@ +--- +license: mit +language: +- uk +pipeline_tag: text-to-speech +datasets: +- patriotyk/filatov_24000 +tags: +- text-to-speech +- tts +- styletts2 +--- \ No newline at end of file diff --git a/uk/styletts2_ukrainian_single/config.yml b/uk/styletts2_ukrainian_single/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..3e4c13ce5aa18ad9df49ef3d1c4f77e269719ffd --- /dev/null +++ b/uk/styletts2_ukrainian_single/config.yml @@ -0,0 +1,54 @@ +plbert_params: + vocab_size: 198 + hidden_size: 768 + num_attention_heads: 12 + intermediate_size: 2048 + max_position_embeddings: 512 + num_hidden_layers: 12 + dropout: 0.1 + + +model_params: + multispeaker: false + vocab: "$-´;:,.!?¡¿—…\"«»“” ()†/=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzéýíó'̯'͡ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲ'̩'ᵻ" + + dim_in: 64 + hidden_dim: 512 + max_conv_dim: 512 + n_layer: 3 + n_mels: 80 + + n_token: 181 # number of phoneme tokens + max_dur: 50 # maximum duration of a single phoneme + style_dim: 128 # style vector size + + dropout: 0.0 + + # config for decoder + decoder: + type: 'istftnet' # either hifigan or istftnet + resblock_kernel_sizes: [3,7,11] + upsample_rates : [10, 6] + upsample_initial_channel: 512 + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + upsample_kernel_sizes: [20, 12] + gen_istft_n_fft: 20 + gen_istft_hop_size: 5 + + + # style diffusion model config + diffusion: + embedding_mask_proba: 0.1 + # transformer config + transformer: + num_layers: 3 + num_heads: 8 + head_features: 64 + multiplier: 2 + + # diffusion distribution config + dist: + sigma_data: 0.18 + mean: -4.0 + std: 4.0 + \ No newline at end of file diff --git a/uk/styletts2_ukrainian_single/model.onnx b/uk/styletts2_ukrainian_single/model.onnx new file mode 100644 index 0000000000000000000000000000000000000000..d88d009ee64b631c35c58fd8b0ea07b08bb98957 --- /dev/null +++ b/uk/styletts2_ukrainian_single/model.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3dbd52d5a2372edfc20fce54ccac8ab951c95143832e409a252ae61df1a6413 +size 327779591 diff --git a/uk/styletts2_ukrainian_single/pytorch_model.bin b/uk/styletts2_ukrainian_single/pytorch_model.bin new file mode 100644 index 
0000000000000000000000000000000000000000..48194f64afd88f8dac4429e38aef3bf155358261 --- /dev/null +++ b/uk/styletts2_ukrainian_single/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25e78d882ec4ee5a8a361749004edf6914137760f2be33a71ea24ce22da1a24a +size 748848243 diff --git a/uk/styletts2_ukrainian_single/source.txt b/uk/styletts2_ukrainian_single/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..be5e6cba0aafc620abeb28cd396055f41f278010 --- /dev/null +++ b/uk/styletts2_ukrainian_single/source.txt @@ -0,0 +1 @@ +https://huggingface.co/patriotyk/styletts2_ukrainian_single \ No newline at end of file diff --git a/uk/styletts2_ukrainian_single/style.pt b/uk/styletts2_ukrainian_single/style.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd22f366d224cf7b8c8ed2027ca75423ec9b890a --- /dev/null +++ b/uk/styletts2_ukrainian_single/style.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f181646626df52fdcf749e93a311686ffb2eaeae8112be0005a8d6efa7dc5cc9 +size 2204 diff --git a/vi,en/StyleTTS2-lite-vi/.gitattributes b/vi,en/StyleTTS2-lite-vi/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..2b96bd1447866b09b8a8d094f3a63f1d142c18bb --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text diff --git a/vi,en/StyleTTS2-lite-vi/.gitignore b/vi,en/StyleTTS2-lite-vi/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..134537e0d10897ca0121496fa8a8bd8d5513fdfd --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/.gitignore @@ -0,0 +1,8 @@ +Modules/__pycache__/__init__.cpython-311.pyc +Modules/__pycache__/hifigan.cpython-311.pyc +Modules/__pycache__/utils.cpython-311.pyc +Modules/__pycache__/__init__.cpython-311.pyc +Modules/__pycache__/hifigan.cpython-311.pyc +Modules/__pycache__/utils.cpython-311.pyc +__pycache__/inference.cpython-311.pyc 
+__pycache__/models.cpython-311.pyc diff --git a/vi,en/StyleTTS2-lite-vi/LICENSE b/vi,en/StyleTTS2-lite-vi/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6585d82c2a0f68f31ba4e2264d2d4beb57bda33f --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Aaron (Yinghao) Li + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vi,en/StyleTTS2-lite-vi/Models/base_model_120k_vi.pth b/vi,en/StyleTTS2-lite-vi/Models/base_model_120k_vi.pth new file mode 100644 index 0000000000000000000000000000000000000000..6b70b2f33b4e6a123024c06c4e29ecb36ac66050 --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/Models/base_model_120k_vi.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7b4c56f020c5407c93c056e4306b077f2e5cbc707cce07f29fed40375d91ca5 +size 1692240736 diff --git a/vi,en/StyleTTS2-lite-vi/Models/config.yaml b/vi,en/StyleTTS2-lite-vi/Models/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ec4973f9dd38ece607884fa4d82891b7b83e8c8 --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/Models/config.yaml @@ -0,0 +1,80 @@ +log_dir: ./Models/Finetune +save_freq: 1 +log_interval: 10 +device: cuda +epochs: 50 +batch_size: 2 +max_len: 310 # maximum number of frames +pretrained_model: ./Models/Finetune/base_model.pth +load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters +debug: true + +data_params: + train_data: "../../Data_Speech/viVoice/train.txt" + val_data: "../../Data_Speech/combine/combine_val.txt" + root_path: "../../Data_Speech/" + +symbol: # Total 189 symbols + pad: "$" + punctuation: ';:,.!?¡¿—…"«»“” ' + letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" + extend: "∫̆ăη͡123456" # ADD MORE SYMBOLS HERE + +preprocess_params: + sr: 24000 + spect_params: + n_fft: 2048 + win_length: 1200 + hop_length: 300 + +training_strats: + # All modules: 'decoder', 'predictor', 'text_encoder', 'style_encoder', 'text_aligner', 'pitch_extractor', 'mpd', 'msd' + freeze_modules: [''] # Not updated during training. + ignore_modules: [''] # Not loaded => fresh start. IMPORTANT: 'text_aligner' and 'pitch_extractor' are pretrained utility modules; DO NOT ignore them.
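+  # Illustrative example (not part of the original config): to fine-tune with the
+  # vocoder frozen and the duration predictor re-initialized, one could set:
+  #   freeze_modules: ['decoder']
+  #   ignore_modules: ['predictor']
+  # Module names come from the list above; adjust to your own experiment.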
+ +model_params: + dim_in: 64 + hidden_dim: 512 + max_conv_dim: 512 + n_layer: 3 + n_mels: 80 + max_dur: 50 # maximum duration of a single phoneme + style_dim: 128 # style vector size + + dropout: 0.2 + + ASR_params: + input_dim: 80 + hidden_dim: 256 + n_layers: 6 + token_embedding_dim: 512 + + JDC_params: + num_class: 1 + seq_len: 192 + + # config for decoder + decoder: + type: 'hifigan' # either hifigan or istftnet + resblock_kernel_sizes: [3,7,11] + upsample_rates : [10,5,3,2] + upsample_initial_channel: 512 + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + upsample_kernel_sizes: [20,10,6,4] + +loss_params: + lambda_mel: 5. # mel reconstruction loss + lambda_gen: 1. # generator loss + + lambda_mono: 1. # monotonic alignment loss (TMA) + lambda_s2s: 1. # sequence-to-sequence loss (TMA) + + lambda_F0: 1. # F0 reconstruction loss + lambda_norm: 1. # norm reconstruction loss + lambda_dur: 1. # duration loss + lambda_ce: 20. # duration predictor probability output CE loss + +optimizer_params: + lr: 0.0001 # general learning rate + ft_lr: 0.00001 # learning rate for acoustic modules \ No newline at end of file diff --git a/vi,en/StyleTTS2-lite-vi/Models/inference/model.pth b/vi,en/StyleTTS2-lite-vi/Models/inference/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..169c1f4f4d2e2fcfc8e8ebcbbfa3dd692898629e --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/Models/inference/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:997e420474c1be8d1f09a70689c444105d47574a7be65ec221d61c5c2caaf8c0 +size 360061639 diff --git a/vi,en/StyleTTS2-lite-vi/Modules/__init__.py b/vi,en/StyleTTS2-lite-vi/Modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d3f5a12faa99758192ecc4ed3fc22c9249232e86 --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/Modules/__init__.py @@ -0,0 +1 @@ + diff --git a/vi,en/StyleTTS2-lite-vi/Modules/hifigan.py b/vi,en/StyleTTS2-lite-vi/Modules/hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..5ad62b7611d7137895b7ab70e214c99f2b6741e1 --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/Modules/hifigan.py @@ -0,0 +1,477 @@ +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .utils import init_weights, get_padding + +import math +import random +import numpy as np + +LRELU_SLOPE = 0.1 + +class AdaIN1d(nn.Module): + def __init__(self, style_dim, num_features): + super().__init__() + self.norm = nn.InstanceNorm1d(num_features, affine=False) + self.fc = nn.Linear(style_dim, num_features*2) + + def forward(self, x, s): + h = self.fc(s) + h = h.view(h.size(0), h.size(1), 1) + gamma, beta = torch.chunk(h, chunks=2, dim=1) + return (1 + gamma) * self.norm(x) + beta + +class AdaINResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64): + super(AdaINResBlock1, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + 
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + self.adain1 = nn.ModuleList([ + AdaIN1d(style_dim, channels), + AdaIN1d(style_dim, channels), + AdaIN1d(style_dim, channels), + ]) + + self.adain2 = nn.ModuleList([ + AdaIN1d(style_dim, channels), + AdaIN1d(style_dim, channels), + AdaIN1d(style_dim, channels), + ]) + + self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))]) + self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))]) + + + def forward(self, x, s): + for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2): + xt = n1(x, s) + xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2) # Snake1D + xt = c1(xt) + xt = n2(xt, s) + xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2) # Snake1D + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + +class SineGen(torch.nn.Module): + """ Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-waveform (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_threshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SineGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__(self, samp_rate, upsample_scale, harmonic_num=0, + sine_amp=0.1, noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + self.flag_for_pulse = flag_for_pulse + self.upsample_scale = upsample_scale + + def _f02uv(self, f0): + # generate uv signal + uv = (f0 > self.voiced_threshold).type(torch.float32) + return uv + + def _f02sine(self, f0_values): + """ f0_values: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. The integer part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \ + device=f0_values.device) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + if not self.flag_for_pulse: +# # for normal case + +# # To prevent torch.cumsum numerical overflow, +# # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. +# # Buffer tmp_over_one_idx indicates the time step to add -1.
+# # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi +# tmp_over_one = torch.cumsum(rad_values, 1) % 1 +# tmp_over_one_idx = (padDiff(tmp_over_one)) < 0 +# cumsum_shift = torch.zeros_like(rad_values) +# cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + +# phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi + rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2), + scale_factor=1/self.upsample_scale, + mode="linear").transpose(1, 2) + +# tmp_over_one = torch.cumsum(rad_values, 1) % 1 +# tmp_over_one_idx = (padDiff(tmp_over_one)) < 0 +# cumsum_shift = torch.zeros_like(rad_values) +# cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi + phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale, + scale_factor=self.upsample_scale, mode="linear").transpose(1, 2) + sines = torch.sin(phase) + + else: + # If necessary, make sure that the first time step of every + # voiced segment is sin(pi) or cos(0) + # This is used for pulse-train generation + + # identify the last time step in unvoiced segments + uv = self._f02uv(f0_values) + uv_1 = torch.roll(uv, shifts=-1, dims=1) + uv_1[:, -1, :] = 1 + u_loc = (uv < 1) * (uv_1 > 0) + + # get the instantaneous phase + tmp_cumsum = torch.cumsum(rad_values, dim=1) + # different batch needs to be processed differently + for idx in range(f0_values.shape[0]): + temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] + temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] + # stores the accumulation of i.phase within + # each voiced segment + tmp_cumsum[idx, :, :] = 0 + tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum + + # rad_values - tmp_cumsum: remove the accumulation of i.phase + # within the previous voiced segment. + i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) + + # get the sines + sines = torch.cos(i_phase * 2 * np.pi) + return sines + + def forward(self, f0): + """ sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, + device=f0.device) + # fundamental component + fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)) + + # generate sine waveforms + sine_waves = self._f02sine(fn) * self.sine_amp + + # generate uv signal + # uv = torch.ones(f0.shape) + # uv = uv * (f0 > self.voiced_threshold) + uv = self._f02uv(f0) + + # noise: for unvoiced should be similar to sine_amp + # std = self.sine_amp/3 -> max value ~ self.sine_amp + #
for voiced regions is self.noise_std + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + # first: set the unvoiced part to 0 by uv + # then: additive noise + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """ SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threshold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num, + sine_amp, add_noise_std, voiced_threshod) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x): + """ + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + """ + # source for harmonic branch + with torch.no_grad(): + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + # source for noise branch, in the same shape as uv + noise = torch.randn_like(uv) * self.sine_amp / 3 + return sine_merge, noise, uv + + +def padDiff(x): + return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0) + +class Generator(torch.nn.Module): + def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + resblock = AdaINResBlock1 + + self.m_source = SourceModuleHnNSF( + sampling_rate=24000, + upsample_scale=np.prod(upsample_rates), + harmonic_num=8, voiced_threshod=10) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.noise_convs = nn.ModuleList() + self.ups = nn.ModuleList() + self.noise_res = nn.ModuleList() + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + + self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i), + upsample_initial_channel//(2**(i+1)), + k, u, padding=(u//2 + u%2), output_padding=u%2))) + + if i + 1 < len(upsample_rates): # + stride_f0 = np.prod(upsample_rates[i + 1:]) + self.noise_convs.append(Conv1d( + 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2)) + self.noise_res.append(resblock(c_cur, 7, [1,3,5], style_dim)) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + self.noise_res.append(resblock(c_cur, 11, [1,3,5], style_dim)) + + self.resblocks = nn.ModuleList() + + 
# learnable per-stage Snake activation slopes; forward() applies x + (1/a) * sin(a*x)**2 + self.alphas = nn.ParameterList() + self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1))) + + for i in range(len(self.ups)): + ch = upsample_initial_channel//(2**(i+1)) + self.alphas.append(nn.Parameter(torch.ones(1, ch, 1))) + + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(resblock(ch, k, d, style_dim)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x, s, f0): + + f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t + + har_source, noi_source, uv = self.m_source(f0) + har_source = har_source.transpose(1, 2) + + for i in range(self.num_upsamples): + x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2) + x_source = self.noise_convs[i](har_source) + x_source = self.noise_res[i](x_source, s) + + x = self.ups[i](x) + x = x + x_source + + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x, s) + else: + xs += self.resblocks[i*self.num_kernels+j](x, s) + x = xs / self.num_kernels + x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + # fix: this Generator defines no conv_pre layer, so only conv_post carries weight norm + remove_weight_norm(self.conv_post) + + +class AdainResBlk1d(nn.Module): + def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2), + upsample='none', dropout_p=0.0): + super().__init__() + self.actv = actv + self.upsample_type = upsample + self.upsample = UpSample1d(upsample) + self.learned_sc = dim_in != dim_out + self._build_weights(dim_in, dim_out, style_dim) + self.dropout = nn.Dropout(dropout_p) + + if upsample == 'none': + self.pool = nn.Identity() + else: + self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1)) + + + def _build_weights(self, dim_in, dim_out, style_dim): + self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1)) + self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1)) + self.norm1 = AdaIN1d(style_dim, dim_in) + self.norm2 = AdaIN1d(style_dim, dim_out) + if self.learned_sc: + self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False)) + + def _shortcut(self, x): + x = self.upsample(x) + if self.learned_sc: + x = self.conv1x1(x) + return x + + def _residual(self, x, s): + x = self.norm1(x, s) + x = self.actv(x) + x = self.pool(x) + x = self.conv1(self.dropout(x)) + x = self.norm2(x, s) + x = self.actv(x) + x = self.conv2(self.dropout(x)) + return x + + def forward(self, x, s): + out = self._residual(x, s) + out = (out + self._shortcut(x)) / math.sqrt(2) + return out + +class UpSample1d(nn.Module): + def __init__(self, layer_type): + super().__init__() + self.layer_type = layer_type + + def forward(self, x): + if self.layer_type == 'none': + return x + else: + return F.interpolate(x, scale_factor=2, mode='nearest') + +class Decoder(nn.Module): + def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80, + resblock_kernel_sizes = [3,7,11], + upsample_rates = [10,5,3,2], + upsample_initial_channel=512, + resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]], + upsample_kernel_sizes=[20,10,6,4]): + super().__init__() + + self.decode = nn.ModuleList() + + self.encode =
AdainResBlk1d(dim_in + 2, 1024, style_dim) + + self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim)) + self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim)) + self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim)) + self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True)) + + self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1)) + + self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1)) + + self.asr_res = nn.Sequential( + weight_norm(nn.Conv1d(512, 64, kernel_size=1)), + ) + + + self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes) + + + def forward(self, asr, F0_curve, N, s): + if self.training: + downlist = [0, 3, 7] + F0_down = downlist[random.randint(0, 2)] + downlist = [0, 3, 7, 15] + N_down = downlist[random.randint(0, 3)] + if F0_down: + F0_curve = nn.functional.conv1d(F0_curve.unsqueeze(1), torch.ones(1, 1, F0_down).to(asr.device), padding=F0_down//2).squeeze(1) / F0_down + if N_down: + N = nn.functional.conv1d(N.unsqueeze(1), torch.ones(1, 1, N_down).to(asr.device), padding=N_down//2).squeeze(1) / N_down + + + F0 = self.F0_conv(F0_curve.unsqueeze(1)) + N = self.N_conv(N.unsqueeze(1)) + + x = torch.cat([asr, F0, N], axis=1) + x = self.encode(x, s) + + asr_res = self.asr_res(asr) + + res = True + for block in self.decode: + if res: + x = torch.cat([x, asr_res, F0, N], axis=1) + x = block(x, s) + if block.upsample_type != "none": + res = False + + x = self.generator(x, s, F0_curve) + return x + + \ No newline at end of file diff --git a/vi,en/StyleTTS2-lite-vi/Modules/utils.py b/vi,en/StyleTTS2-lite-vi/Modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c2fd9817caffbb9bf4c616b481cf84aee0362f6b --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/Modules/utils.py @@ -0,0 +1,17 @@ +from torch.nn.utils import weight_norm # needed by apply_weight_norm below + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) \ No newline at end of file diff --git a/vi,en/StyleTTS2-lite-vi/README.md b/vi,en/StyleTTS2-lite-vi/README.md new file mode 100644 index 0000000000000000000000000000000000000000..92262afdcacc7e8bfd252f38ee0f03848b284d44 --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/README.md @@ -0,0 +1,101 @@ +--- +license: mit +datasets: +- capleaf/viVoice +language: +- vi +- en +base_model: +- yl4579/StyleTTS2-LibriTTS +pipeline_tag: text-to-speech +--- + +# StyleTTS 2 - lite - vi (Vietnamese) + + +## Online Demo +Explore the model on Hugging Face Spaces: +https://huggingface.co/spaces/dangtr0408/StyleTTS2-lite-vi-space + +## Fine-tune +https://github.com/dangtr0408/StyleTTS2-lite + +## Training Details + +1. **Base Checkpoint:** + - Initialized from the official StyleTTS 2 LibriTTS weights. +2. **Token Extension:** + - Expanded the token set to 189 symbols to ensure full Vietnamese IPA compatibility. +3. **Training Data:** + - **FonosVietnam** (extracted from the viVoice corpus) + - **VoizFM** (extracted from the viVoice corpus) +4. **Training Schedule:** + - Trained for 120,000 steps.
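+For a quick orientation, the following is a minimal, hedged inference sketch assembled from the calls that `app.py` (included later in this diff) makes against `inference.StyleTTS2`. The checkpoint and reference-audio paths are assumptions based on the files shipped in this repository, and the positional arguments to `generate` (notably the value `18` and the default speaker tag) simply mirror `app.py`; see `run.ipynb` for the authoritative usage.
+
+```python
+import torch
+import numpy as np
+import soundfile as sf
+from inference import StyleTTS2  # provided by this repository
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# Assumed paths: config.yaml sits in Models/, the inference checkpoint in Models/inference/.
+model = StyleTTS2("Models/config.yaml", "Models/inference/model.pth").eval().to(device)
+
+# One entry per speaker tag; vn_1.wav is one of the bundled reference clips.
+speakers = {"id_1": {"path": "reference_audio/vn_1.wav", "lang": "vi", "speed": 1.0}}
+
+with torch.no_grad():
+    styles = model.get_styles(speakers, 0.6, True)  # positional: (speakers, denoise, avg_style)
+    r = model.generate("Xin chào các bạn.", styles, True, 18, "[id_1]")  # as in app.py
+
+r = r / np.abs(r).max()  # peak-normalize, as app.py does
+sf.write("output.wav", r, samplerate=24000)
+```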
+ +## Model Architecture + +| Component | Parameters | +| -------------- | ------------- | +| Decoder | 54,289,492 | +| Predictor | 16,194,612 | +| Style Encoder | 13,845,440 | +| Text Encoder | 5,612,320 | +| **Total** | **89,941,576** | + +## Prerequisites + +- **Python:** Version 3.7 or higher +- **Git:** To clone the repository + +## Installation & Setup + +1. Clone the repository: + +```bash + +git clone https://huggingface.co/dangtr0408/StyleTTS2-lite-vi + +cd StyleTTS2-lite-vi + +``` + +2. Install dependencies: + +```bash + +pip install -r requirements.txt + +``` + + + +3. On **Linux**, manually install espeak: + +```bash + +sudo apt-get install espeak-ng + +``` + +## Usage Example + +See the run.ipynb file. + +## Disclaimer + +***Before using these pre-trained models, you agree to inform the listeners that the speech samples are synthesized by the pre-trained models, unless you have the permission to use the voice you synthesize. That is, you agree to only use voices whose speakers grant the permission to have their voice cloned, either directly or by license before making synthesized voices public, or you have to publicly announce that these voices are synthesized if you do not have the permission to use these voices.*** + + +## References + +- [yl4579/StyleTTS2](https://arxiv.org/abs/2306.07691) + +- [jik876/hifi-gan](https://github.com/jik876/hifi-gan) + +- [capleaf/viVoice](https://huggingface.co/datasets/capleaf/viVoice) + +## License + +**Code: MIT License** + +**Model: CC-BY-NC-SA-4.0** \ No newline at end of file diff --git a/vi,en/StyleTTS2-lite-vi/app.py b/vi,en/StyleTTS2-lite-vi/app.py new file mode 100644 index 0000000000000000000000000000000000000000..ddce346846145d3ccad1014f01508abcdb379b68 --- /dev/null +++ b/vi,en/StyleTTS2-lite-vi/app.py @@ -0,0 +1,126 @@ +import gradio as gr +import os +import soundfile as sf +import numpy as np +import torch +import traceback +from inference import StyleTTS2 +repo_dir = './' +device = 'cuda' if torch.cuda.is_available() else 'cpu' +config_path = os.path.join(repo_dir, "Models", "config.yaml") +models_path = os.path.join(repo_dir, "Models", "model.pth") +model = StyleTTS2(config_path, models_path).eval().to(device) +voice_path = os.path.join(repo_dir, "reference_audio") +eg_voices = [os.path.join(voice_path,"vn_1.wav"), os.path.join(voice_path,"vn_2.wav")] +eg_texts = [ + "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.", + "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.", +] + + +# Core inference function +def main(reference_paths, text_prompt, denoise, avg_style, stabilize): + try: + speakers = {} + for i, path in enumerate(reference_paths, 1): + speaker_id = f"id_{i}" + speakers[speaker_id] = { + "path": path, + "lang": "vi", + "speed": 1.0 + } + + with torch.no_grad(): + styles = model.get_styles(speakers, denoise, avg_style) + r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]") + r = r / np.abs(r).max() + + sf.write("output.wav", r, samplerate=24000) + return "output.wav", "Audio generated successfully!" + + except Exception as e: + error_message = traceback.format_exc() + return None, error_message + +def on_file_upload(file_list): + if not file_list: + return None, "No file uploaded yet."
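+    # Deduplicate uploads by file name: a later upload replaces an earlier one with
+    # the same basename, and each kept file is then mapped to an [id_N] speaker tag
+    # that prompts can reference (see eg_texts above).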
+ + unique_files = {} + for file_path in file_list: + file_name = os.path.basename(file_path) + unique_files[file_name] = file_path #update and remove duplicate + + uploaded_infos = [] + uploaded_file_names = list(unique_files.keys()) + for i in range(len(uploaded_file_names)): + uploaded_infos.append(f"[id_{i+1}]: {uploaded_file_names[i]}") + + summary = "\n".join(uploaded_infos) + return list(unique_files.values()), f"Current reference audios:\n{summary}" + +def gen_example(reference_paths, text_prompt): + output, status = main(reference_paths, text_prompt, 0.6, True, True) + return output, reference_paths, status + + +# Gradio UI +with gr.Blocks() as demo: + gr.HTML("