niobures committed
Commit 9506d83 · verified · 1 parent: 797b3d2

StyleTTS (ar, en, fr, ms, ru, uk, vi)

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +25 -0
  2. ar/StyleTTS2-LibriTTS-arabic/.gitattributes +36 -0
  3. ar/StyleTTS2-LibriTTS-arabic/README.md +142 -0
  4. ar/StyleTTS2-LibriTTS-arabic/config.yml +114 -0
  5. ar/StyleTTS2-LibriTTS-arabic/model.pth +3 -0
  6. ar/StyleTTS2-LibriTTS-arabic/source.txt +1 -0
  7. ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav +3 -0
  8. en/StyleTTS2-lite/.gitattributes +36 -0
  9. en/StyleTTS2-lite/.gitignore +8 -0
  10. en/StyleTTS2-lite/Audio/10_michael.wav +3 -0
  11. en/StyleTTS2-lite/Audio/11_fenrir.wav +3 -0
  12. en/StyleTTS2-lite/Audio/12_puck.wav +3 -0
  13. en/StyleTTS2-lite/Audio/13_echo.wav +3 -0
  14. en/StyleTTS2-lite/Audio/14_eric.wav +3 -0
  15. en/StyleTTS2-lite/Audio/15_liam.wav +3 -0
  16. en/StyleTTS2-lite/Audio/16_onyx.wav +3 -0
  17. en/StyleTTS2-lite/Audio/17_santa.wav +3 -0
  18. en/StyleTTS2-lite/Audio/18_adam.wav +3 -0
  19. en/StyleTTS2-lite/Audio/1_heart.wav +3 -0
  20. en/StyleTTS2-lite/Audio/2_belle.wav +3 -0
  21. en/StyleTTS2-lite/Audio/3_kore.wav +3 -0
  22. en/StyleTTS2-lite/Audio/4_sarah.wav +3 -0
  23. en/StyleTTS2-lite/Audio/5_nova.wav +3 -0
  24. en/StyleTTS2-lite/Audio/6_sky.wav +3 -0
  25. en/StyleTTS2-lite/Audio/7_alloy.wav +3 -0
  26. en/StyleTTS2-lite/Audio/8_jessica.wav +3 -0
  27. en/StyleTTS2-lite/Audio/9_river.wav +3 -0
  28. en/StyleTTS2-lite/LICENSE +21 -0
  29. en/StyleTTS2-lite/Models/base_model.pth +3 -0
  30. en/StyleTTS2-lite/Models/config.yaml +79 -0
  31. en/StyleTTS2-lite/Models/inference/model.pth +3 -0
  32. en/StyleTTS2-lite/Modules/__init__.py +1 -0
  33. en/StyleTTS2-lite/Modules/hifigan.py +477 -0
  34. en/StyleTTS2-lite/Modules/utils.py +14 -0
  35. en/StyleTTS2-lite/README.md +88 -0
  36. en/StyleTTS2-lite/inference.py +301 -0
  37. en/StyleTTS2-lite/meldataset.py +307 -0
  38. en/StyleTTS2-lite/models.py +532 -0
  39. en/StyleTTS2-lite/requirements.txt +10 -0
  40. en/StyleTTS2-lite/run.ipynb +176 -0
  41. en/StyleTTS2-lite/source.txt +1 -0
  42. en/StyleTTS2/.gitattributes +35 -0
  43. en/StyleTTS2/Multi0/config.yml +112 -0
  44. en/StyleTTS2/Multi0/config_30_e934.yml +22 -0
  45. en/StyleTTS2/Multi0/config_40_1c872.yml +22 -0
  46. en/StyleTTS2/Multi0/epoch_2nd_23_9ab0.pth +3 -0
  47. en/StyleTTS2/Multi0/epoch_2nd_30_e934.pth +3 -0
  48. en/StyleTTS2/Multi0/epoch_2nd_40_1c872.pth +3 -0
  49. en/StyleTTS2/Multi0/ref_audio.zip +3 -0
  50. en/StyleTTS2/README.md +7 -0
.gitattributes CHANGED
@@ -33,3 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/1_heart.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/10_michael.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/11_fenrir.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/12_puck.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/13_echo.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/14_eric.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/15_liam.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/16_onyx.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/17_santa.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/18_adam.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/2_belle.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/3_kore.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/4_sarah.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/5_nova.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/6_sky.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/7_alloy.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/8_jessica.wav filter=lfs diff=lfs merge=lfs -text
+ en/StyleTTS2-lite/Audio/9_river.wav filter=lfs diff=lfs merge=lfs -text
+ ru/StyleTTS_prokopenko_v1/voices/prokopenko/reference.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/3.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/vn_1.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/vn_2.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/vn_3.wav filter=lfs diff=lfs merge=lfs -text
+ vi,en/StyleTTS2-lite-vi/reference_audio/vn_4.wav filter=lfs diff=lfs merge=lfs -text
ar/StyleTTS2-LibriTTS-arabic/.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ synthesized_audio.wav filter=lfs diff=lfs merge=lfs -text
ar/StyleTTS2-LibriTTS-arabic/README.md ADDED
@@ -0,0 +1,142 @@
1
+ ---
2
+ language: ar
3
+ tags:
4
+ - text-to-speech
5
+ - tts
6
+ - arabic
7
+ - styletts2
8
+ - pl-bert
9
+ license: mit
10
+ hardware: H100
11
+ ---
12
+
13
+ # Model Card for Arabic StyleTTS2
14
+
15
+ This is an Arabic text-to-speech model based on the StyleTTS2 architecture and adapted specifically for Arabic synthesis. It produces good-quality Arabic speech, though not yet state-of-the-art, and further experimentation is needed to optimize it for Arabic. All training objectives from the original StyleTTS2 were kept, except the WavLM objectives, which were removed because they were designed primarily for English speech.
16
+
17
+ ## Example
18
+
19
+ Here is an example output from the model:
20
+
21
+ #### Sample 1
22
+ <audio controls>
23
+ <source src="https://huggingface.co/fadi77/StyleTTS2-LibriTTS-arabic/resolve/main/synthesized_audio.wav" type="audio/wav">
24
+ Your browser does not support the audio element.
25
+ </audio>
26
+
27
+ ## Efficiency and Performance
28
+
29
+ A key strength of this model lies in its efficiency and performance characteristics:
30
+
31
+ - **Compact Architecture**: Achieves impressive quality with <100M parameters
32
+ - **Limited Training Data**: Trained on only 22 hours of single-speaker audio
33
+ - **Transfer Learning**: Successfully fine-tuned from LibriTTS multi-speaker model to single-speaker Arabic
34
+ - **Resource Efficient**: Good quality achieved despite limited computational resources
35
+
36
+ Note: According to the StyleTTS2 authors, performance should improve further when a single-speaker model is trained from scratch rather than fine-tuned. We did not attempt this because of computational resource constraints, so there is likely room for even better results with more extensive training.
37
+
38
+
39
+ ## Model Details
40
+
41
+ ### Model Description
42
+
43
+ This model is a modified version of StyleTTS2, specifically adapted for Arabic text-to-speech synthesis. It incorporates a custom-trained PL-BERT model for Arabic language understanding and removes the WavLM adversarial training component (which was primarily designed for English).
44
+
45
+ - **Developed by:** Fadi (GitHub: Fadi987)
46
+ - **Model type:** Text-to-Speech (StyleTTS2 architecture)
47
+ - **Language(s):** Arabic
48
+ - **Finetuned from model:** [yl4579/StyleTTS2-LibriTTS](https://huggingface.co/yl4579/StyleTTS2-LibriTTS)
49
+
50
+ ### Model Sources
51
+
52
+ - **Repository:** [Fadi987/StyleTTS2](https://github.com/Fadi987/StyleTTS2)
53
+ - **Paper:** [StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models](https://arxiv.org/abs/2306.07691)
54
+ - **PL-BERT Model:** [fadi77/pl-bert](https://huggingface.co/fadi77/pl-bert)
55
+
56
+ ## Uses
57
+
58
+ ### Direct Use
59
+
60
+ The model can be used for generating Arabic speech from text. To use the model:
61
+
62
+ 1. Clone the StyleTTS2 repository:
63
+ ```bash
64
+ git clone https://github.com/Fadi987/StyleTTS2
65
+ cd StyleTTS2
66
+ ```
67
+
68
+ 2. Install `espeak-ng` for phonemization backend:
69
+ ```bash
70
+ # For macOS
71
+ brew install espeak-ng
72
+
73
+ # For Ubuntu/Debian
74
+ sudo apt-get install espeak-ng
75
+
76
+ # For Windows
77
+ # Download and install espeak-ng from: https://github.com/espeak-ng/espeak-ng/releases
78
+ ```
79
+
80
+ 3. Install Python dependencies:
81
+ ```bash
82
+ pip install -r requirements.txt
83
+ ```
84
+
85
+ 4. Download the `model.pth` and `config.yml` files from this repository
86
+
87
+ 5. Run inference using:
88
+ ```bash
89
+ python inference.py --config config.yml --model model.pth --text "الإِتْقَانُ يَحْتَاجُ إِلَى الْعَمَلِ وَالْمُثَابَرَة"
90
+ ```
91
+
92
+ Make sure to use properly diacritized Arabic text for best results.
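+
+ As an alternative to manually downloading the files in step 4, the checkpoint and config can also be fetched programmatically. The snippet below is only a sketch using the `huggingface_hub` package (not part of this repository; install it with `pip install huggingface_hub`):
+
+ ```python
+ # Download model.pth and config.yml from this model's Hugging Face repository.
+ from huggingface_hub import hf_hub_download
+
+ model_path = hf_hub_download(repo_id="fadi77/StyleTTS2-LibriTTS-arabic", filename="model.pth")
+ config_path = hf_hub_download(repo_id="fadi77/StyleTTS2-LibriTTS-arabic", filename="config.yml")
+ print(model_path, config_path)
+ ```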
93
+
94
+ ### Out-of-Scope Use
95
+
96
+ The model is specifically designed for Arabic text-to-speech synthesis and may not perform well for:
97
+ - Other languages
98
+ - Heavy dialect variations
99
+ - Non-diacritized Arabic text
100
+
101
+ ## Training Details
102
+
103
+ ### Training Data
104
+
105
+ - Training was performed on approximately 22 hours of Arabic audiobook data
106
+ - Dataset: [fadi77/arabic-audiobook-dataset-24khz](https://huggingface.co/datasets/fadi77/arabic-audiobook-dataset-24khz)
107
+ - The PL-BERT component was trained on fully diacritized Wikipedia Arabic text
108
+
109
+ ### Training Hyperparameters
110
+
111
+ - **Number of epochs:** 20
112
+ - **Diffusion training:** Started from epoch 5
113
+
114
+ ### Objectives
115
+ - **Training objectives:** All original StyleTTS2 objectives maintained, except WavLM adversarial training
116
+ - **Validation objectives:** Identical to original StyleTTS2 validation process
117
+
118
+ ### Compute Infrastructure
119
+ - **Hardware Type:** NVIDIA H100 GPU
120
+
121
+ ### Notable Modifications from Original StyleTTS2 in Architecture and Objectives
122
+ The architecture of the model follows that of StyleTTS2 with the following exceptions:
123
+ - Removed WavLM adversarial training component
124
+ - Custom PL-BERT trained for Arabic language
125
+
126
+
127
+ ## Citation
128
+
129
+ **BibTeX:**
130
+ ```bibtex
131
+ @article{styletts2,
132
+ title={StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models},
133
+ author={Li, Yinghao Aaron and Han, Cong and Raghavan, Vinay S. and Mischler, Gavin and Mesgarani, Nima},
134
+ journal={arXiv preprint arXiv:2306.07691},
135
+ year={2023}
136
+ }
137
+ ```
138
+
139
+ ## Model Card Contact
140
+
141
+ GitHub: [@Fadi987](https://github.com/Fadi987)
142
+ Hugging Face: [@fadi77](https://huggingface.co/fadi77)
ar/StyleTTS2-LibriTTS-arabic/config.yml ADDED
@@ -0,0 +1,114 @@
1
+ log_dir: "/style_tts2/Models/FineTune.AudioBook"
2
+ log_interval: 10
3
+ device: "cuda"
4
+ epochs: 25 # number of finetuning epochs
5
+ batch_size: 6
6
+ max_len: 300 # maximum number of frames
7
+ pretrained_model_repo: "yl4579/StyleTTS2-LibriTTS"
8
+ pretrained_model_filename: "Models/LibriTTS/epochs_2nd_00020.pth"
9
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
10
+ load_only_params: true # set to true if do not want to load epoch numbers and optimizer parameters
11
+
12
+ F0_path: "/root/Utils/JDC/bst.t7"
13
+ ASR_config: "/root/Utils/ASR/config.yml"
14
+ ASR_path: "/root/Utils/ASR/epoch_00080.pth"
15
+ PLBERT_repo_id: "fadi77/pl-bert"
16
+ PLBERT_dirname: "models/mlm_only_with_diacritics"
17
+
18
+ data_params:
19
+ train_data: "Data/youtube_train_list.txt"
20
+ val_data: "Data/youtube_val_list.txt"
21
+ root_path: "Youtube/wavs"
22
+ OOD_data: "Data/youtube_train_list.txt"
23
+ min_length: 50 # sample until texts with this size are obtained for OOD texts
24
+
25
+ preprocess_params:
26
+ sr: 24000
27
+ spect_params:
28
+ n_fft: 2048
29
+ win_length: 1200
30
+ hop_length: 300
31
+
32
+ model_params:
33
+ multispeaker: false
34
+
35
+ dim_in: 64
36
+ hidden_dim: 512
37
+ max_conv_dim: 512
38
+ n_layer: 3
39
+ n_mels: 80
40
+
41
+ n_token: 178 # number of phoneme tokens
42
+ max_dur: 50 # maximum duration of a single phoneme
43
+ style_dim: 128 # style vector size
44
+
45
+ dropout: 0.2
46
+
47
+ # config for decoder
48
+ decoder:
49
+ type: 'hifigan' # either hifigan or istftnet
50
+ resblock_kernel_sizes: [3,7,11]
51
+ upsample_rates : [10,5,3,2]
52
+ upsample_initial_channel: 512
53
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
54
+ upsample_kernel_sizes: [20,10,6,4]
55
+
56
+ # speech language model config
57
+ slm:
58
+ model: 'microsoft/wavlm-base-plus'
59
+ sr: 16000 # sampling rate of SLM
60
+ hidden: 768 # hidden size of SLM
61
+ nlayers: 13 # number of layers of SLM
62
+ initial_channel: 64 # initial channels of SLM discriminator head
63
+
64
+ # style diffusion model config
65
+ diffusion:
66
+ embedding_mask_proba: 0.1
67
+ # transformer config
68
+ transformer:
69
+ num_layers: 3
70
+ num_heads: 8
71
+ head_features: 64
72
+ multiplier: 2
73
+
74
+ # diffusion distribution config
75
+ dist:
76
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
77
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
78
+ mean: -3.0
79
+ std: 1.0
80
+
81
+ loss_params:
82
+ lambda_mel: 5. # mel reconstruction loss
83
+ lambda_gen: 1. # generator loss
84
+ lambda_slm: 1. # slm feature matching loss
85
+
86
+ lambda_mono: 1. # monotonic alignment loss (TMA)
87
+ lambda_s2s: 1. # sequence-to-sequence loss (TMA)
88
+
89
+ lambda_F0: 1. # F0 reconstruction loss
90
+ lambda_norm: 1. # norm reconstruction loss
91
+ lambda_dur: 1. # duration loss
92
+ lambda_ce: 20. # duration predictor probability output CE loss
93
+ lambda_sty: 1. # style reconstruction loss
94
+ lambda_diff: 1. # score matching loss
95
+
96
+ # Note: Current values for training are only adequate for second stage finetuning.
97
+ diffusion_training_epoch: 5
98
+ joint_training_epoch: 100
99
+
100
+ # Note: Current values for learning rates are very low. This is only adequate for second stage finetuning.
101
+ optimizer_params:
102
+ lr: 0.0001 # general learning rate
103
+ bert_lr: 0.00001 # learning rate for PLBERT
104
+ ft_lr: 0.0001 # learning rate for acoustic modules
105
+
106
+ slmadv_params:
107
+ min_len: 400 # minimum length of samples
108
+ max_len: 500 # maximum length of samples
109
+ batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
110
+ skip_update: 10 # update the discriminator every this iterations of generator update
111
+ thresh: 5 # gradient norm above which the gradient is scaled
112
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
113
+ sig: 1.5 # sigma for differentiable duration modeling
114
+
ar/StyleTTS2-LibriTTS-arabic/model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59d2323412f0c55c774b5675b45e5c12659c0d9e0f9e7012eecc6b7dd845b132
3
+ size 2201968238
ar/StyleTTS2-LibriTTS-arabic/source.txt ADDED
@@ -0,0 +1 @@
1
+ https://huggingface.co/fadi77/StyleTTS2-LibriTTS-arabic
ar/StyleTTS2-LibriTTS-arabic/synthesized_audio.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f60e90523d734eff1b9f4b95cca49f22277df5cb4acd0bd347fde18f1c3b0469
3
+ size 1795058
en/StyleTTS2-lite/.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
en/StyleTTS2-lite/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ Modules/__pycache__/__init__.cpython-311.pyc
2
+ Modules/__pycache__/hifigan.cpython-311.pyc
3
+ Modules/__pycache__/utils.cpython-311.pyc
4
+ Modules/__pycache__/__init__.cpython-311.pyc
5
+ Modules/__pycache__/hifigan.cpython-311.pyc
6
+ Modules/__pycache__/utils.cpython-311.pyc
7
+ __pycache__/inference.cpython-311.pyc
8
+ __pycache__/models.cpython-311.pyc
en/StyleTTS2-lite/Audio/10_michael.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:733023e56be0434c66ac3b855c9aaac29d64f3a060c295a75e700ecfd34c16f0
3
+ size 620444
en/StyleTTS2-lite/Audio/11_fenrir.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abde72631473e48455d54cf585a0b1f229e6e77e9748ed1acef5678a40b08c08
3
+ size 537644
en/StyleTTS2-lite/Audio/12_puck.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:409cc59612472a0d4bb717613f539dafdb334411ed651ab6988f7fca8b922905
3
+ size 619244
en/StyleTTS2-lite/Audio/13_echo.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6925e6737a67fcbf8dce32d22d29d086d81627b82c6edbfc92b3706f27479ff
3
+ size 524444
en/StyleTTS2-lite/Audio/14_eric.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97b8bbf6a880e46730387ee7bb4bfba6c049ed58c4ec8680ec44f83df669eff1
3
+ size 573644
en/StyleTTS2-lite/Audio/15_liam.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95842cfe6d1093deb37447b0e5993b6c18f7e5591c3fb1fb3dd230641925de44
3
+ size 541244
en/StyleTTS2-lite/Audio/16_onyx.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25487ea7634b470392d787bfefb79da0a6a56dc26087ab27b62fa70aac43554d
3
+ size 514844
en/StyleTTS2-lite/Audio/17_santa.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80bc56619904ccbd93ed813fc54491f7b83eb8b8fd6c8a1626bd9177f96a23cd
3
+ size 583244
en/StyleTTS2-lite/Audio/18_adam.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b84a1b122273a45d98b5cbf725f4633e4cccb4a0788b8a46cc9faa4b8612419b
3
+ size 517244
en/StyleTTS2-lite/Audio/1_heart.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:978b285ff24f274a1f4fe4551b0d57a5df704ca5ce83284e839ffe96c2dc3dfd
3
+ size 547244
en/StyleTTS2-lite/Audio/2_belle.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459a64fa12dfb530320e8dab2f4057d7868ae4c020b447e8df3402149fa2be59
3
+ size 357644
en/StyleTTS2-lite/Audio/3_kore.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e55fc5c463d01d46c090be5457c59727ee52f2ecbeba8be9b38862850418c0c3
3
+ size 276044
en/StyleTTS2-lite/Audio/4_sarah.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ae7416f410104b0cedc1cc9c7365a89fd16a1599733f8f416e7618943d0acb8
3
+ size 640844
en/StyleTTS2-lite/Audio/5_nova.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:252c20a3f55bfe0ea7f42fbd638f6d4113ade7918630d1d37e166e11143f74f8
3
+ size 336044
en/StyleTTS2-lite/Audio/6_sky.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc985eb31aa7e2088f852c55282ec6ff72365486478a627bcd56ce2387a8d5b2
3
+ size 502844
en/StyleTTS2-lite/Audio/7_alloy.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd7868816449f2139e21661dcbc13d3d553c558627d4c50fada1f7c22ce7f86c
3
+ size 632444
en/StyleTTS2-lite/Audio/8_jessica.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8d7573154905c901281e767f25be2dbceae731c891da409f5b7c0be3096bd5d
3
+ size 477644
en/StyleTTS2-lite/Audio/9_river.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75a3b2fc9d4e93ded21f28cccc6ae7bf7a39bf04fed7f2d4d36e59db0792eedd
3
+ size 472844
en/StyleTTS2-lite/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Aaron (Yinghao) Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
en/StyleTTS2-lite/Models/base_model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:821deb4efee549b7024f37236e86b4bcb023870baf0ddb9f407fb514253340d1
3
+ size 1692092384
en/StyleTTS2-lite/Models/config.yaml ADDED
@@ -0,0 +1,79 @@
1
+ log_dir: ./Models/Finetune
2
+ save_freq: 1
3
+ log_interval: 10
4
+ device: cuda
5
+ epochs: 50
6
+ batch_size: 2
7
+ max_len: 310 # maximum number of frames
8
+ pretrained_model: ./Models/Finetune/base_model.pth
9
+ load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters
10
+ debug: true
11
+
12
+ data_params:
13
+ train_data: ../../Data_Speech/LibriTTS/train.txt
14
+ val_data: ../../Data_Speech/LibriTTS/val.txt
15
+ root_path: ../../Data_Speech/
16
+
17
+ symbol: #Total 178 symbols
18
+ pad: "$"
19
+ punctuation: ';:,.!?¡¿—…"«»“” '
20
+ letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
21
+ letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
22
+ extend: "" #ADD MORE SYMBOLS HERE
23
+
24
+ preprocess_params:
25
+ sr: 24000
26
+ spect_params:
27
+ n_fft: 2048
28
+ win_length: 1200
29
+ hop_length: 300
30
+
31
+ training_strats:
32
+ #All modules: 'decoder', 'predictor', 'text_encoder', 'style_encoder', 'text_aligner', 'pitch_extractor', 'mpd', 'msd'
33
+ freeze_modules: [''] # Not updated when training.
34
+ ignore_modules: [''] # Not loaded => fresh start. IMPORTANT: 'text_aligner' and 'pitch_extractor' are pretrained utility modules; DO NOT ignore them.
35
+
36
+ model_params:
37
+ dim_in: 64
38
+ hidden_dim: 512
39
+ max_conv_dim: 512
40
+ n_layer: 3
41
+ n_mels: 80
42
+ max_dur: 50 # maximum duration of a single phoneme
43
+ style_dim: 128 # style vector size
44
+ dropout: 0.2
45
+
46
+ ASR_params:
47
+ input_dim: 80
48
+ hidden_dim: 256
49
+ n_layers: 6
50
+ token_embedding_dim: 512
51
+
52
+ JDC_params:
53
+ num_class: 1
54
+ seq_len: 192
55
+
56
+ # config for decoder
57
+ decoder:
58
+ type: hifigan # either hifigan or istftnet
59
+ resblock_kernel_sizes: [3,7,11]
60
+ upsample_rates : [10,5,3,2]
61
+ upsample_initial_channel: 512
62
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
63
+ upsample_kernel_sizes: [20,10,6,4]
64
+
65
+ loss_params:
66
+ lambda_mel: 5. # mel reconstruction loss
67
+ lambda_gen: 1. # generator loss
68
+
69
+ lambda_mono: 1. # monotonic alignment loss (TMA)
70
+ lambda_s2s: 1. # sequence-to-sequence loss (TMA)
71
+
72
+ lambda_F0: 1. # F0 reconstruction loss
73
+ lambda_norm: 1. # norm reconstruction loss
74
+ lambda_dur: 1. # duration loss
75
+ lambda_ce: 20. # duration predictor probability output CE loss
76
+
77
+ optimizer_params:
78
+ lr: 0.0001 # general learning rate
79
+ ft_lr: 0.00001 # learning rate for acoustic modules
en/StyleTTS2-lite/Models/inference/model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2763d7b6c5477502d3f2a870eda76bbedae671f0107b15a1060fb4e6771ed634
3
+ size 359997166
en/StyleTTS2-lite/Modules/__init__.py ADDED
@@ -0,0 +1 @@
1
+
en/StyleTTS2-lite/Modules/hifigan.py ADDED
@@ -0,0 +1,477 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+ from .utils import init_weights, get_padding
7
+
8
+ import math
9
+ import random
10
+ import numpy as np
11
+
12
+ LRELU_SLOPE = 0.1
13
+
14
+ class AdaIN1d(nn.Module):
15
+ def __init__(self, style_dim, num_features):
16
+ super().__init__()
17
+ self.norm = nn.InstanceNorm1d(num_features, affine=False)
18
+ self.fc = nn.Linear(style_dim, num_features*2)
19
+
20
+ def forward(self, x, s):
21
+ h = self.fc(s)
22
+ h = h.view(h.size(0), h.size(1), 1)
23
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
24
+ return (1 + gamma) * self.norm(x) + beta
25
+
26
+ class AdaINResBlock1(torch.nn.Module):
27
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
28
+ super(AdaINResBlock1, self).__init__()
29
+ self.convs1 = nn.ModuleList([
30
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
31
+ padding=get_padding(kernel_size, dilation[0]))),
32
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
33
+ padding=get_padding(kernel_size, dilation[1]))),
34
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
35
+ padding=get_padding(kernel_size, dilation[2])))
36
+ ])
37
+ self.convs1.apply(init_weights)
38
+
39
+ self.convs2 = nn.ModuleList([
40
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
41
+ padding=get_padding(kernel_size, 1))),
42
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
43
+ padding=get_padding(kernel_size, 1))),
44
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
45
+ padding=get_padding(kernel_size, 1)))
46
+ ])
47
+ self.convs2.apply(init_weights)
48
+
49
+ self.adain1 = nn.ModuleList([
50
+ AdaIN1d(style_dim, channels),
51
+ AdaIN1d(style_dim, channels),
52
+ AdaIN1d(style_dim, channels),
53
+ ])
54
+
55
+ self.adain2 = nn.ModuleList([
56
+ AdaIN1d(style_dim, channels),
57
+ AdaIN1d(style_dim, channels),
58
+ AdaIN1d(style_dim, channels),
59
+ ])
60
+
61
+ self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
62
+ self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
63
+
64
+
65
+ def forward(self, x, s):
66
+ for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
67
+ xt = n1(x, s)
68
+ xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2) # Snake1D
69
+ xt = c1(xt)
70
+ xt = n2(xt, s)
71
+ xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2) # Snake1D
72
+ xt = c2(xt)
73
+ x = xt + x
74
+ return x
75
+
76
+ def remove_weight_norm(self):
77
+ for l in self.convs1:
78
+ remove_weight_norm(l)
79
+ for l in self.convs2:
80
+ remove_weight_norm(l)
81
+
82
+ class SineGen(torch.nn.Module):
83
+ """ Definition of sine generator
84
+ SineGen(samp_rate, harmonic_num = 0,
85
+ sine_amp = 0.1, noise_std = 0.003,
86
+ voiced_threshold = 0,
87
+ flag_for_pulse=False)
88
+ samp_rate: sampling rate in Hz
89
+ harmonic_num: number of harmonic overtones (default 0)
90
+ sine_amp: amplitude of sine-waveform (default 0.1)
+ noise_std: std of Gaussian noise (default 0.003)
+ voiced_threshold: F0 threshold for U/V classification (default 0)
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
94
+ Note: when flag_for_pulse is True, the first time step of a voiced
95
+ segment is always sin(np.pi) or cos(0)
96
+ """
97
+
98
+ def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
99
+ sine_amp=0.1, noise_std=0.003,
100
+ voiced_threshold=0,
101
+ flag_for_pulse=False):
102
+ super(SineGen, self).__init__()
103
+ self.sine_amp = sine_amp
104
+ self.noise_std = noise_std
105
+ self.harmonic_num = harmonic_num
106
+ self.dim = self.harmonic_num + 1
107
+ self.sampling_rate = samp_rate
108
+ self.voiced_threshold = voiced_threshold
109
+ self.flag_for_pulse = flag_for_pulse
110
+ self.upsample_scale = upsample_scale
111
+
112
+ def _f02uv(self, f0):
113
+ # generate uv signal
114
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
115
+ return uv
116
+
117
+ def _f02sine(self, f0_values):
118
+ """ f0_values: (batchsize, length, dim)
119
+ where dim indicates fundamental tone and overtones
120
+ """
121
+ # convert to F0 in rad. The integer part n can be ignored
122
+ # because 2 * np.pi * n doesn't affect phase
123
+ rad_values = (f0_values / self.sampling_rate) % 1
124
+
125
+ # initial phase noise (no noise for fundamental component)
126
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
127
+ device=f0_values.device)
128
+ rand_ini[:, 0] = 0
129
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
130
+
131
+ # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
132
+ if not self.flag_for_pulse:
133
+ # # for normal case
134
+
135
+ # # To prevent torch.cumsum numerical overflow,
136
+ # # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
137
+ # # Buffer tmp_over_one_idx indicates the time step to add -1.
138
+ # # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
139
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
140
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
141
+ # cumsum_shift = torch.zeros_like(rad_values)
142
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
143
+
144
+ # phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
145
+ rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
146
+ scale_factor=1/self.upsample_scale,
147
+ mode="linear").transpose(1, 2)
148
+
149
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
150
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
151
+ # cumsum_shift = torch.zeros_like(rad_values)
152
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
153
+
154
+ phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
155
+ phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
156
+ scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
157
+ sines = torch.sin(phase)
158
+
159
+ else:
160
+ # If necessary, make sure that the first time step of every
161
+ # voiced segments is sin(pi) or cos(0)
162
+ # This is used for pulse-train generation
163
+
164
+ # identify the last time step in unvoiced segments
165
+ uv = self._f02uv(f0_values)
166
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
167
+ uv_1[:, -1, :] = 1
168
+ u_loc = (uv < 1) * (uv_1 > 0)
169
+
170
+ # get the instantaneous phase
171
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
172
+ # different batch needs to be processed differently
173
+ for idx in range(f0_values.shape[0]):
174
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
175
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
176
+ # stores the accumulation of i.phase within
177
+ # each voiced segments
178
+ tmp_cumsum[idx, :, :] = 0
179
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
180
+
181
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
182
+ # within the previous voiced segment.
183
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
184
+
185
+ # get the sines
186
+ sines = torch.cos(i_phase * 2 * np.pi)
187
+ return sines
188
+
189
+ def forward(self, f0):
190
+ """ sine_tensor, uv = forward(f0)
191
+ input F0: tensor(batchsize=1, length, dim=1)
192
+ f0 for unvoiced steps should be 0
193
+ output sine_tensor: tensor(batchsize=1, length, dim)
194
+ output uv: tensor(batchsize=1, length, 1)
195
+ """
196
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
197
+ device=f0.device)
198
+ # fundamental component
199
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
200
+
201
+ # generate sine waveforms
202
+ sine_waves = self._f02sine(fn) * self.sine_amp
203
+
204
+ # generate uv signal
205
+ # uv = torch.ones(f0.shape)
206
+ # uv = uv * (f0 > self.voiced_threshold)
207
+ uv = self._f02uv(f0)
208
+
209
+ # noise: for unvoiced should be similar to sine_amp
210
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
211
+ # . for voiced regions is self.noise_std
212
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
213
+ noise = noise_amp * torch.randn_like(sine_waves)
214
+
215
+ # first: set the unvoiced part to 0 by uv
216
+ # then: additive noise
217
+ sine_waves = sine_waves * uv + noise
218
+ return sine_waves, uv, noise
219
+
220
+
221
+ class SourceModuleHnNSF(torch.nn.Module):
222
+ """ SourceModule for hn-nsf
223
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
224
+ add_noise_std=0.003, voiced_threshod=0)
225
+ sampling_rate: sampling_rate in Hz
226
+ harmonic_num: number of harmonic above F0 (default: 0)
227
+ sine_amp: amplitude of sine source signal (default: 0.1)
228
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
229
+ note that amplitude of noise in unvoiced is decided
230
+ by sine_amp
231
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
232
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
233
+ F0_sampled (batchsize, length, 1)
234
+ Sine_source (batchsize, length, 1)
235
+ noise_source (batchsize, length 1)
236
+ uv (batchsize, length, 1)
237
+ """
238
+
239
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
240
+ add_noise_std=0.003, voiced_threshod=0):
241
+ super(SourceModuleHnNSF, self).__init__()
242
+
243
+ self.sine_amp = sine_amp
244
+ self.noise_std = add_noise_std
245
+
246
+ # to produce sine waveforms
247
+ self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
248
+ sine_amp, add_noise_std, voiced_threshod)
249
+
250
+ # to merge source harmonics into a single excitation
251
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
252
+ self.l_tanh = torch.nn.Tanh()
253
+
254
+ def forward(self, x):
255
+ """
256
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
257
+ F0_sampled (batchsize, length, 1)
258
+ Sine_source (batchsize, length, 1)
259
+ noise_source (batchsize, length 1)
260
+ """
261
+ # source for harmonic branch
262
+ with torch.no_grad():
263
+ sine_wavs, uv, _ = self.l_sin_gen(x)
264
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
265
+
266
+ # source for noise branch, in the same shape as uv
267
+ noise = torch.randn_like(uv) * self.sine_amp / 3
268
+ return sine_merge, noise, uv
269
+ def padDiff(x):
270
+ return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0)
271
+
272
+ class Generator(torch.nn.Module):
273
+ def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes):
274
+ super(Generator, self).__init__()
275
+ self.num_kernels = len(resblock_kernel_sizes)
276
+ self.num_upsamples = len(upsample_rates)
277
+ resblock = AdaINResBlock1
278
+
279
+ self.m_source = SourceModuleHnNSF(
280
+ sampling_rate=24000,
281
+ upsample_scale=np.prod(upsample_rates),
282
+ harmonic_num=8, voiced_threshod=10)
283
+
284
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
285
+ self.noise_convs = nn.ModuleList()
286
+ self.ups = nn.ModuleList()
287
+ self.noise_res = nn.ModuleList()
288
+
289
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
290
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
291
+
292
+ self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
293
+ upsample_initial_channel//(2**(i+1)),
294
+ k, u, padding=(u//2 + u%2), output_padding=u%2)))
295
+
296
+ if i + 1 < len(upsample_rates): #
297
+ stride_f0 = np.prod(upsample_rates[i + 1:])
298
+ self.noise_convs.append(Conv1d(
299
+ 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
300
+ self.noise_res.append(resblock(c_cur, 7, [1,3,5], style_dim))
301
+ else:
302
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
303
+ self.noise_res.append(resblock(c_cur, 11, [1,3,5], style_dim))
304
+
305
+ self.resblocks = nn.ModuleList()
306
+
307
+ self.alphas = nn.ParameterList()
308
+ self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1)))
309
+
310
+ for i in range(len(self.ups)):
311
+ ch = upsample_initial_channel//(2**(i+1))
312
+ self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
313
+
314
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
315
+ self.resblocks.append(resblock(ch, k, d, style_dim))
316
+
317
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
318
+ self.ups.apply(init_weights)
319
+ self.conv_post.apply(init_weights)
320
+
321
+ def forward(self, x, s, f0):
322
+
323
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
324
+
325
+ har_source, noi_source, uv = self.m_source(f0)
326
+ har_source = har_source.transpose(1, 2)
327
+
328
+ for i in range(self.num_upsamples):
329
+ x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
330
+ x_source = self.noise_convs[i](har_source)
331
+ x_source = self.noise_res[i](x_source, s)
332
+
333
+ x = self.ups[i](x)
334
+ x = x + x_source
335
+
336
+ xs = None
337
+ for j in range(self.num_kernels):
338
+ if xs is None:
339
+ xs = self.resblocks[i*self.num_kernels+j](x, s)
340
+ else:
341
+ xs += self.resblocks[i*self.num_kernels+j](x, s)
342
+ x = xs / self.num_kernels
343
+ x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2)
344
+ x = self.conv_post(x)
345
+ x = torch.tanh(x)
346
+
347
+ return x
348
+
349
+ def remove_weight_norm(self):
350
+ print('Removing weight norm...')
351
+ for l in self.ups:
352
+ remove_weight_norm(l)
353
+ for l in self.resblocks:
354
+ l.remove_weight_norm()
355
+ # note: this Generator defines no conv_pre layer, so only conv_post carries weight norm
356
+ remove_weight_norm(self.conv_post)
357
+
358
+
359
+ class AdainResBlk1d(nn.Module):
360
+ def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
361
+ upsample='none', dropout_p=0.0):
362
+ super().__init__()
363
+ self.actv = actv
364
+ self.upsample_type = upsample
365
+ self.upsample = UpSample1d(upsample)
366
+ self.learned_sc = dim_in != dim_out
367
+ self._build_weights(dim_in, dim_out, style_dim)
368
+ self.dropout = nn.Dropout(dropout_p)
369
+
370
+ if upsample == 'none':
371
+ self.pool = nn.Identity()
372
+ else:
373
+ self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
374
+
375
+
376
+ def _build_weights(self, dim_in, dim_out, style_dim):
377
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
378
+ self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
379
+ self.norm1 = AdaIN1d(style_dim, dim_in)
380
+ self.norm2 = AdaIN1d(style_dim, dim_out)
381
+ if self.learned_sc:
382
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
383
+
384
+ def _shortcut(self, x):
385
+ x = self.upsample(x)
386
+ if self.learned_sc:
387
+ x = self.conv1x1(x)
388
+ return x
389
+
390
+ def _residual(self, x, s):
391
+ x = self.norm1(x, s)
392
+ x = self.actv(x)
393
+ x = self.pool(x)
394
+ x = self.conv1(self.dropout(x))
395
+ x = self.norm2(x, s)
396
+ x = self.actv(x)
397
+ x = self.conv2(self.dropout(x))
398
+ return x
399
+
400
+ def forward(self, x, s):
401
+ out = self._residual(x, s)
402
+ out = (out + self._shortcut(x)) / math.sqrt(2)
403
+ return out
404
+
405
+ class UpSample1d(nn.Module):
406
+ def __init__(self, layer_type):
407
+ super().__init__()
408
+ self.layer_type = layer_type
409
+
410
+ def forward(self, x):
411
+ if self.layer_type == 'none':
412
+ return x
413
+ else:
414
+ return F.interpolate(x, scale_factor=2, mode='nearest')
415
+
416
+ class Decoder(nn.Module):
417
+ def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
418
+ resblock_kernel_sizes = [3,7,11],
419
+ upsample_rates = [10,5,3,2],
420
+ upsample_initial_channel=512,
421
+ resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]],
422
+ upsample_kernel_sizes=[20,10,6,4]):
423
+ super().__init__()
424
+
425
+ self.decode = nn.ModuleList()
426
+
427
+ self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
428
+
429
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
430
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
431
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
432
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
433
+
434
+ self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
435
+
436
+ self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
437
+
438
+ self.asr_res = nn.Sequential(
439
+ weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
440
+ )
441
+
442
+
443
+ self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
444
+
445
+
446
+ def forward(self, asr, F0_curve, N, s):
447
+ if self.training:
448
+ downlist = [0, 3, 7]
449
+ F0_down = downlist[random.randint(0, 2)]
450
+ downlist = [0, 3, 7, 15]
451
+ N_down = downlist[random.randint(0, 3)]
452
+ if F0_down:
453
+ F0_curve = nn.functional.conv1d(F0_curve.unsqueeze(1), torch.ones(1, 1, F0_down).to(asr.device), padding=F0_down//2).squeeze(1) / F0_down
454
+ if N_down:
455
+ N = nn.functional.conv1d(N.unsqueeze(1), torch.ones(1, 1, N_down).to(asr.device), padding=N_down//2).squeeze(1) / N_down
456
+
457
+
458
+ F0 = self.F0_conv(F0_curve.unsqueeze(1))
459
+ N = self.N_conv(N.unsqueeze(1))
460
+
461
+ x = torch.cat([asr, F0, N], axis=1)
462
+ x = self.encode(x, s)
463
+
464
+ asr_res = self.asr_res(asr)
465
+
466
+ res = True
467
+ for block in self.decode:
468
+ if res:
469
+ x = torch.cat([x, asr_res, F0, N], axis=1)
470
+ x = block(x, s)
471
+ if block.upsample_type != "none":
472
+ res = False
473
+
474
+ x = self.generator(x, s, F0_curve)
475
+ return x
476
+
477
+
en/StyleTTS2-lite/Modules/utils.py ADDED
@@ -0,0 +1,14 @@
1
+ from torch.nn.utils import weight_norm  # required by apply_weight_norm below
+
+ def init_weights(m, mean=0.0, std=0.01):
2
+ classname = m.__class__.__name__
3
+ if classname.find("Conv") != -1:
4
+ m.weight.data.normal_(mean, std)
5
+
6
+
7
+ def apply_weight_norm(m):
8
+ classname = m.__class__.__name__
9
+ if classname.find("Conv") != -1:
10
+ weight_norm(m)
11
+
12
+
13
+ def get_padding(kernel_size, dilation=1):
14
+ return int((kernel_size*dilation - dilation)/2)
en/StyleTTS2-lite/README.md ADDED
@@ -0,0 +1,88 @@
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ base_model:
6
+ - yl4579/StyleTTS2-LibriTTS
7
+ pipeline_tag: text-to-speech
8
+ ---
9
+
10
+ # StyleTTS 2 - lite
11
+
12
+ ## Online Demo
13
+ Explore the model on Hugging Face Spaces:
14
+ https://huggingface.co/spaces/dangtr0408/StyleTTS2-lite-space
15
+
16
+ ## Fine-tune
17
+ https://github.com/dangtr0408/StyleTTS2-lite
18
+
19
+ ## Training Details
20
+
21
+ 1. **Base Checkpoint:** Initialized from the official StyleTTS 2 weights pre-trained on LibriTTS.
+ 2. **Removed Components:** PL-BERT, Diffusion, Prosodic Encoder, SLM, and Spectral Normalization.
+ 3. **Training Data:** LibriTTS corpus.
+ 4. **Training Schedule:** Trained for 100,000 steps.
25
+
26
+ ## Model Architecture
27
+
28
+ | Component | Parameters |
29
+ | -------------- | ------------- |
30
+ | Decoder | 54,289,492 |
+ | Predictor | 16,194,612 |
+ | Style Encoder | 13,845,440 |
+ | Text Encoder | 5,612,320 |
+ | **Total** | **89,941,576** |
35
+
36
+ ## Prerequisites
37
+
38
+ - **Python:** Version 3.7 or higher
39
+ - **Git:** To clone the repository
40
+
41
+ ## Installation & Setup
42
+
43
+ 1. Clone the repository
44
+
45
+ ```bash
46
+
47
+ git clone https://huggingface.co/dangtr0408/StyleTTS2-lite
48
+
49
+ cd StyleTTS2-lite
50
+
51
+ ```
52
+
53
+ 2. Install dependencies:
54
+
55
+ ```bash
56
+
57
+ pip install -r requirements.txt
58
+
59
+ ```
60
+
61
+
62
+
63
+ 3. On **Linux**, manually install espeak-ng:
64
+
65
+ ```bash
66
+
67
+ sudo apt-get install espeak-ng
68
+
69
+ ```
70
+
71
+ ## Usage Example
72
+
73
+ See the run.ipynb file; a minimal loading sketch is also shown below.
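+
+ The sketch below only constructs the model using the `StyleTTS2` class from `inference.py` in this repository; reference-audio style extraction and the actual synthesis call are demonstrated in run.ipynb. The checkpoint and config paths are assumptions based on this repository's layout.
+
+ ```python
+ # Build the inference model from the shipped config and checkpoint (paths assumed).
+ from inference import StyleTTS2
+
+ model = StyleTTS2(config_path="Models/config.yaml",
+                   models_path="Models/inference/model.pth")
+ # See run.ipynb for computing a speaker style from reference audio and generating speech.
+ ```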
74
+
75
+ ## Disclaimer
76
+
77
+ **Before using these pre-trained models, you agree to inform listeners that the speech samples are synthesized by the pre-trained models unless you have permission to use the voice you synthesize. That is, you agree to clone only voices whose speakers have granted permission, directly or by license, before making the synthesized voices public; otherwise, you must publicly announce that the voices are synthesized.**
78
+
79
+
80
+ ## References
81
+
82
+ - [yl4579/StyleTTS2](https://arxiv.org/abs/2306.07691)
83
+
84
+ - [jik876/hifi-gan](https://github.com/jik876/hifi-gan)
85
+
86
+ ## License
87
+
88
+ **Code: MIT License**
en/StyleTTS2-lite/inference.py ADDED
@@ -0,0 +1,301 @@
1
+ import re
2
+ import yaml
3
+ from munch import Munch
4
+ import numpy as np
5
+ import librosa
6
+ import noisereduce as nr
7
+ from meldataset import TextCleaner
8
+ import torch
9
+ import torchaudio
10
+ from nltk.tokenize import word_tokenize
11
+ import nltk
12
+ nltk.download('punkt_tab')
13
+
14
+ from models import ProsodyPredictor, TextEncoder, StyleEncoder
15
+ from Modules.hifigan import Decoder
16
+
17
+ class Preprocess:
18
+ def __text_normalize(self, text):
19
+ punctuation = [",", "、", "،", ";", "(", ".", "。", "…", "!", "–", ":", "?"]
20
+ map_to = "."
21
+ punctuation_pattern = re.compile(f"[{''.join(re.escape(p) for p in punctuation)}]")
22
+ #replace punctuation that acts like a comma or period
23
+ text = punctuation_pattern.sub(map_to, text)
24
+ #replace consecutive whitespace chars with a single space and strip leading/trailing spaces
25
+ text = re.sub(r'\s+', ' ', text).strip()
26
+ return text
27
+ def __merge_fragments(self, texts, n):
28
+ merged = []
29
+ i = 0
30
+ while i < len(texts):
31
+ fragment = texts[i]
32
+ j = i + 1
33
+ while len(fragment.split()) < n and j < len(texts):
34
+ fragment += ", " + texts[j]
35
+ j += 1
36
+ merged.append(fragment)
37
+ i = j
38
+ if len(merged[-1].split()) < n and len(merged) > 1: #handle last sentence
39
+ merged[-2] = merged[-2] + ", " + merged[-1]
40
+ del merged[-1]
41
+ else:
42
+ merged[-1] = merged[-1]
43
+ return merged
44
+ def wave_preprocess(self, wave):
45
+ to_mel = torchaudio.transforms.MelSpectrogram(n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
46
+ mean, std = -4, 4
47
+ wave_tensor = torch.from_numpy(wave).float()
48
+ mel_tensor = to_mel(wave_tensor)
49
+ mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
50
+ return mel_tensor
51
+ def text_preprocess(self, text, n_merge=12):
52
+ text_norm = self.__text_normalize(text).split(".")#split by sentences.
53
+ text_norm = [s.strip() for s in text_norm]
54
+ text_norm = list(filter(lambda x: x != '', text_norm)) #filter empty index
55
+ text_norm = self.__merge_fragments(text_norm, n=n_merge) #merge if a sentence has fewer than n words
56
+ return text_norm
57
+ def length_to_mask(self, lengths):
58
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
59
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
60
+ return mask
61
+
62
+ #For inference only
63
+ class StyleTTS2(torch.nn.Module):
64
+ def __init__(self, config_path, models_path):
65
+ super().__init__()
66
+ self.register_buffer("get_device", torch.empty(0))
67
+ self.preprocess = Preprocess()
68
+ self.ref_s = None
69
+ config = yaml.safe_load(open(config_path, "r", encoding="utf-8"))
70
+
71
+ try:
72
+ symbols = (
73
+ list(config['symbol']['pad']) +
74
+ list(config['symbol']['punctuation']) +
75
+ list(config['symbol']['letters']) +
76
+ list(config['symbol']['letters_ipa']) +
77
+ list(config['symbol']['extend'])
78
+ )
79
+ symbol_dict = {}
80
+ for i in range(len((symbols))):
81
+ symbol_dict[symbols[i]] = i
82
+
83
+ n_token = len(symbol_dict) + 1
84
+ print("\nFound:", n_token, "symbols")
85
+ except Exception as e:
86
+ print(f"\nERROR: Cannot find {e} in config file!\nYour config file is likely outdated, please download updated version from the repository.")
87
+ raise SystemExit(1)
88
+
89
+ args = self.__recursive_munch(config['model_params'])
90
+ args['n_token'] = n_token
91
+
92
+ self.cleaner = TextCleaner(symbol_dict, debug=False)
93
+
94
+ assert args.decoder.type in ['hifigan'], 'Decoder type unknown'
95
+
96
+ self.decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
97
+ resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
98
+ upsample_rates = args.decoder.upsample_rates,
99
+ upsample_initial_channel=args.decoder.upsample_initial_channel,
100
+ resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
101
+ upsample_kernel_sizes=args.decoder.upsample_kernel_sizes)
102
+ self.predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
103
+ self.text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
104
+ self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
105
+
106
+ self.__load_models(models_path)
107
+
108
+ def __recursive_munch(self, d):
109
+ if isinstance(d, dict):
110
+ return Munch((k, self.__recursive_munch(v)) for k, v in d.items())
111
+ elif isinstance(d, list):
112
+ return [self.__recursive_munch(v) for v in d]
113
+ else:
114
+ return d
115
+
116
+ def __replace_outliers_zscore(self, tensor, threshold=3.0, factor=0.95):
117
+ mean = tensor.mean()
118
+ std = tensor.std()
119
+ z = (tensor - mean) / std
120
+
121
+ # Identify outliers
122
+ outlier_mask = torch.abs(z) > threshold
123
+ # Compute replacement value, respecting sign
124
+ sign = torch.sign(tensor - mean)
125
+ replacement = mean + sign * (threshold * std * factor)
126
+
127
+ result = tensor.clone()
128
+ result[outlier_mask] = replacement[outlier_mask]
129
+
130
+ return result
131
+
132
+ def __load_models(self, models_path):
133
+ module_params = []
134
+ model = {'decoder':self.decoder, 'predictor':self.predictor, 'text_encoder':self.text_encoder, 'style_encoder':self.style_encoder}
135
+
136
+ params_whole = torch.load(models_path, map_location='cpu')
137
+ params = params_whole['net']
138
+ params = {key: value for key, value in params.items() if key in model.keys()}
139
+
140
+ for key in model:
141
+ try:
142
+ model[key].load_state_dict(params[key])
143
+ except:
144
+ from collections import OrderedDict
145
+ state_dict = params[key]
146
+ new_state_dict = OrderedDict()
147
+ for k, v in state_dict.items():
148
+ name = k[7:] # remove `module.`
149
+ new_state_dict[name] = v
150
+ model[key].load_state_dict(new_state_dict, strict=False)
151
+
152
+ total_params = sum(p.numel() for p in model[key].parameters())
153
+ print(key,":",total_params)
154
+ module_params.append(total_params)
155
+
156
+ print('\nTotal',":",sum(module_params))
157
+
158
+ def __compute_style(self, path, denoise, split_dur):
159
+ device = self.get_device.device
160
+ denoise = min(denoise, 1)
161
+ if split_dur != 0: split_dur = max(int(split_dur), 1)
162
+ max_samples = 24000*20 #max 20 seconds ref audio
163
+ print("Computing the style for:", path)
164
+
165
+ wave, sr = librosa.load(path, sr=24000)
166
+ audio, index = librosa.effects.trim(wave, top_db=30)
167
+ if sr != 24000:
168
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
169
+ if len(audio) > max_samples:
170
+ audio = audio[:max_samples]
171
+
172
+ if denoise > 0.0:
173
+ audio_denoise = nr.reduce_noise(y=audio, sr=sr, n_fft=2048, win_length=1200, hop_length=300)
174
+ audio = audio*(1-denoise) + audio_denoise*denoise
175
+
176
+ with torch.no_grad():
177
+ if split_dur>0 and len(audio)/sr>=4: #Only effective if audio length is >= 4s
178
+ #This option splits the ref audio into multiple parts, computes a style for each part, and averages them
179
+ count = 0
180
+ ref_s = None
181
+ jump = sr*split_dur
182
+ total_len = len(audio)
183
+
184
+ #Need to init before the loop
185
+ mel_tensor = self.preprocess.wave_preprocess(audio[0:jump]).to(device)
186
+ ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
187
+ count += 1
188
+ for i in range(jump, total_len, jump):
189
+ if i+jump >= total_len:
190
+ left_dur = (total_len-i)/sr
191
+ if left_dur >= 1: #Still count the final chunk if its leftover duration is >= 1s
192
+ mel_tensor = self.preprocess.wave_preprocess(audio[i:total_len]).to(device)
193
+ ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
194
+ count += 1
195
+ continue
196
+ mel_tensor = self.preprocess.wave_preprocess(audio[i:i+jump]).to(device)
197
+ ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
198
+ count += 1
199
+ ref_s /= count
200
+ else:
201
+ mel_tensor = self.preprocess.wave_preprocess(audio).to(device)
202
+ ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
203
+
204
+ return ref_s
205
+
206
+ def __inference(self, phonem, ref_s, speed=1, prev_d_mean=0, t=0.1):
207
+ device = self.get_device.device
208
+ speed = min(max(speed, 0.0001), 2) #clamp speed to (0, 2]
209
+
210
+ phonem = ' '.join(word_tokenize(phonem))
211
+ tokens = self.cleaner(phonem)
212
+ tokens.insert(0, 0)
213
+ tokens.append(0)
214
+ tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
215
+
216
+ with torch.no_grad():
217
+ input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
218
+ text_mask = self.preprocess.length_to_mask(input_lengths).to(device)
219
+
220
+ # encode
221
+ t_en = self.text_encoder(tokens, input_lengths, text_mask)
222
+ s = ref_s.to(device)
223
+
224
+ # calculate alignment
225
+ d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask)
226
+ x, _ = self.predictor.lstm(d)
227
+ duration = self.predictor.duration_proj(x)
228
+ duration = torch.sigmoid(duration).sum(axis=-1)
229
+
230
+ if prev_d_mean != 0:#Stabilize speaking speed between splits
231
+ dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device)
232
+ else:
233
+ dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
234
+ duration = duration*(1-t) + dur_stats*t
235
+ duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outliers
236
+
237
+ duration /= speed
238
+
239
+ pred_dur = torch.round(duration.squeeze()).clamp(min=1)
240
+ pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
241
+ c_frame = 0
242
+ for i in range(pred_aln_trg.size(0)):
243
+ pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
244
+ c_frame += int(pred_dur[i].data)
245
+ alignment = pred_aln_trg.unsqueeze(0).to(device)
246
+
247
+ # encode prosody
248
+ en = (d.transpose(-1, -2) @ alignment)
249
+ F0_pred, N_pred = self.predictor.F0Ntrain(en, s)
250
+ asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
251
+
252
+ out = self.decoder(asr, F0_pred, N_pred, s)
253
+
254
+ return out.squeeze().cpu().numpy(), duration.mean()
255
+
256
+ def get_styles(self, speaker, denoise=0.3, avg_style=True, load_styles=False):
257
+ if not load_styles:
258
+ if avg_style: split_dur = 3
259
+ else: split_dur = 0
260
+ self.ref_s = self.__compute_style(speaker['path'], denoise=denoise, split_dur=split_dur)
261
+ else:
262
+ if self.ref_s is None:
263
+ raise Exception("Have to compute or load the styles first!")
264
+ style = {
265
+ 'style': self.ref_s,
266
+ 'path': speaker['path'],
267
+ 'speed': speaker['speed'],
268
+ }
269
+ return style
270
+
271
+ def save_styles(self, save_dir):
272
+ if self.ref_s is not None:
273
+ torch.save(self.ref_s, save_dir)
274
+ print("Saved styles!")
275
+ else:
276
+ raise Exception("Have to compute the styles before saving it.")
277
+
278
+ def load_styles(self, save_dir):
279
+ try:
280
+ self.ref_s = torch.load(save_dir)
281
+ print("Loaded styles!")
282
+ except Exception as e:
283
+ print(e)
284
+
285
+ def generate(self, phonem, style, stabilize=True, n_merge=16):
286
+ if stabilize: smooth_value=0.2
287
+ else: smooth_value=0
288
+
289
+ list_wav = []
290
+ prev_d_mean = 0
291
+
292
+ print("Generating Audio...")
293
+ text_norm = self.preprocess.text_preprocess(phonem, n_merge=n_merge)
294
+ for sentence in text_norm:
295
+ wav, prev_d_mean = self.__inference(sentence, style['style'], speed=style['speed'], prev_d_mean=prev_d_mean, t=smooth_value)
296
+ wav = wav[4000:-4000] #Remove weird pulse and silent tokens
297
+ list_wav.append(wav)
298
+
299
+ final_wav = np.concatenate(list_wav)
300
+ final_wav = np.concatenate([np.zeros([4000]), final_wav, np.zeros([4000])], axis=0) # add padding
301
+ return final_wav
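One capability of the StyleTTS2 class above that the bundled run.ipynb does not demonstrate is caching a computed reference style with save_styles/load_styles, so the style encoder only has to process the reference audio once. A minimal sketch, assuming the config, model, and audio paths used in run.ipynb below; the cache filename is illustrative:

from inference import StyleTTS2

model = StyleTTS2("Models/config.yaml", "Models/inference/model.pth").eval()
speaker = {"path": "./Audio/1_heart.wav", "speed": 1.0}

# First run: compute the style from the reference audio, then cache it to disk.
style = model.get_styles(speaker, denoise=0.3, avg_style=True)
model.save_styles("style_cache.pt")  # illustrative cache path

# Later runs: reload the cached style instead of recomputing it.
model.load_styles("style_cache.pt")
style = model.get_styles(speaker, load_styles=True)
# style['style'] now holds the cached tensor; pass it to model.generate(phonemes, style) as in run.ipynb.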
en/StyleTTS2-lite/meldataset.py ADDED
@@ -0,0 +1,307 @@
1
+ #coding: utf-8
2
+ import os.path as osp
3
+ import random
4
+ import numpy as np
5
+ import random
6
+ import soundfile as sf
7
+ import librosa
8
+
9
+ import torch
10
+ import torchaudio
11
+ import torch.utils.data
12
+ import torch.distributed as dist
13
+ from multiprocessing import Pool
14
+
15
+ import logging
16
+ logger = logging.getLogger(__name__)
17
+ logger.setLevel(logging.DEBUG)
18
+
19
+ import pandas as pd
20
+
21
+ class TextCleaner:
22
+ def __init__(self, symbol_dict, debug=True):
23
+ self.word_index_dictionary = symbol_dict
24
+ self.debug = debug
25
+ def __call__(self, text):
26
+ indexes = []
27
+ for char in text:
28
+ try:
29
+ indexes.append(self.word_index_dictionary[char])
30
+ except KeyError as e:
31
+ if self.debug:
32
+ print("\nWARNING UNKNOWN IPA CHARACTERS/LETTERS: ", char)
33
+ print("To ignore set 'debug' to false in the config")
34
+ continue
35
+ return indexes
36
+
37
+ np.random.seed(1)
38
+ random.seed(1)
39
+ SPECT_PARAMS = {
40
+ "n_fft": 2048,
41
+ "win_length": 1200,
42
+ "hop_length": 300
43
+ }
44
+ MEL_PARAMS = {
45
+ "n_mels": 80,
46
+ }
47
+
48
+ to_mel = torchaudio.transforms.MelSpectrogram(
49
+ n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
50
+ mean, std = -4, 4
51
+
52
+ def preprocess(wave):
53
+ wave_tensor = torch.from_numpy(wave).float()
54
+ mel_tensor = to_mel(wave_tensor)
55
+ mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
56
+ return mel_tensor
57
+
58
+ class FilePathDataset(torch.utils.data.Dataset):
59
+ def __init__(self,
60
+ data_list,
61
+ root_path,
62
+ symbol_dict,
63
+ sr=24000,
64
+ data_augmentation=False,
65
+ validation=False,
66
+ debug=True
67
+ ):
68
+
69
+ _data_list = [l.strip().split('|') for l in data_list]
70
+ self.data_list = _data_list #[data if len(data) == 3 else (*data, 0) for data in _data_list] #append speakerid=0 for all
71
+ self.text_cleaner = TextCleaner(symbol_dict, debug)
72
+ self.sr = sr
73
+
74
+ self.df = pd.DataFrame(self.data_list)
75
+
76
+ self.to_melspec = torchaudio.transforms.MelSpectrogram(**MEL_PARAMS)
77
+
78
+ self.mean, self.std = -4, 4
79
+ self.data_augmentation = data_augmentation and (not validation)
80
+ self.max_mel_length = 192
81
+
82
+ self.root_path = root_path
83
+
84
+ def __len__(self):
85
+ return len(self.data_list)
86
+
87
+ def __getitem__(self, idx):
88
+ data = self.data_list[idx]
89
+ path = data[0]
90
+
91
+ wave, text_tensor = self._load_tensor(data)
92
+
93
+ mel_tensor = preprocess(wave).squeeze()
94
+
95
+ acoustic_feature = mel_tensor.squeeze()
96
+ length_feature = acoustic_feature.size(1)
97
+ acoustic_feature = acoustic_feature[:, :(length_feature - length_feature % 2)]
98
+
99
+ return acoustic_feature, text_tensor, path, wave
100
+
101
+ def _load_tensor(self, data):
102
+ wave_path, text = data
103
+ wave, sr = sf.read(osp.join(self.root_path, wave_path))
104
+ if wave.shape[-1] == 2:
105
+ wave = wave[:, 0].squeeze()
106
+ if sr != 24000:
107
+ wave = librosa.resample(wave, orig_sr=sr, target_sr=24000)
108
+ print(wave_path, sr)
109
+
110
+ # Adding half a second padding.
111
+ wave = np.concatenate([np.zeros([12000]), wave, np.zeros([12000])], axis=0)
112
+
113
+ text = self.text_cleaner(text)
114
+
115
+ text.insert(0, 0)
116
+ text.append(0)
117
+
118
+ text = torch.LongTensor(text)
119
+
120
+ return wave, text
121
+
122
+ def _load_data(self, data):
123
+ wave, text_tensor = self._load_tensor(data)
124
+ mel_tensor = preprocess(wave).squeeze()
125
+
126
+ mel_length = mel_tensor.size(1)
127
+ if mel_length > self.max_mel_length:
128
+ random_start = np.random.randint(0, mel_length - self.max_mel_length)
129
+ mel_tensor = mel_tensor[:, random_start:random_start + self.max_mel_length]
130
+
131
+ return mel_tensor
132
+
133
+
134
+ class Collater(object):
135
+ """
136
+ Args:
137
+ adaptive_batch_size (bool): if true, decrease batch size when long data comes.
138
+ """
139
+
140
+ def __init__(self, return_wave=False):
141
+ self.text_pad_index = 0
142
+ self.min_mel_length = 192
143
+ self.max_mel_length = 192
144
+ self.return_wave = return_wave
145
+
146
+
147
+ def __call__(self, batch):
148
+ batch_size = len(batch)
149
+
150
+ # sort by mel length
151
+ lengths = [b[0].shape[1] for b in batch]
152
+ batch_indexes = np.argsort(lengths)[::-1]
153
+ batch = [batch[bid] for bid in batch_indexes]
154
+
155
+ nmels = batch[0][0].size(0)
156
+ max_mel_length = max([b[0].shape[1] for b in batch])
157
+ max_text_length = max([b[1].shape[0] for b in batch])
158
+
159
+ mels = torch.zeros((batch_size, nmels, max_mel_length)).float()
160
+ texts = torch.zeros((batch_size, max_text_length)).long()
161
+
162
+ input_lengths = torch.zeros(batch_size).long()
163
+ output_lengths = torch.zeros(batch_size).long()
164
+ paths = ['' for _ in range(batch_size)]
165
+ waves = [None for _ in range(batch_size)]
166
+
167
+ for bid, (mel, text, path, wave) in enumerate(batch):
168
+ mel_size = mel.size(1)
169
+ text_size = text.size(0)
170
+ mels[bid, :, :mel_size] = mel
171
+ texts[bid, :text_size] = text
172
+ input_lengths[bid] = text_size
173
+ output_lengths[bid] = mel_size
174
+ paths[bid] = path
175
+
176
+ waves[bid] = wave
177
+
178
+ return waves, texts, input_lengths, mels, output_lengths
179
+
180
+
181
+ def get_length(wave_path, root_path):
182
+ info = sf.info(osp.join(root_path, wave_path))
183
+ return info.frames * (24000 / info.samplerate)
184
+
185
+ def build_dataloader(path_list,
186
+ root_path,
187
+ symbol_dict,
188
+ validation=False,
189
+ batch_size=4,
190
+ num_workers=1,
191
+ device='cpu',
192
+ collate_config={},
193
+ dataset_config={}):
194
+
195
+ dataset = FilePathDataset(path_list, root_path, symbol_dict, validation=validation, **dataset_config)
196
+ collate_fn = Collater(**collate_config)
197
+
198
+ print("Getting sample lengths...")
199
+
200
+ num_processes = num_workers * 2
201
+ if num_processes != 0:
202
+ list_of_tuples = [(d[0], root_path) for d in dataset.data_list]
203
+ with Pool(processes=num_processes) as pool:
204
+ sample_lengths = pool.starmap(get_length, list_of_tuples, chunksize=16)
205
+ else:
206
+ sample_lengths = []
207
+ for d in dataset.data_list:
208
+ sample_lengths.append(get_length(d[0], root_path))
209
+
210
+ data_loader = torch.utils.data.DataLoader(
211
+ dataset,
212
+ num_workers=num_workers,
213
+ batch_sampler=BatchSampler(
214
+ sample_lengths,
215
+ batch_size,
216
+ shuffle=(not validation),
217
+ drop_last=(not validation),
218
+ num_replicas=1,
219
+ rank=0,
220
+ ),
221
+ collate_fn=collate_fn,
222
+ pin_memory=(device != "cpu"),
223
+ )
224
+
225
+ return data_loader
226
+
227
+ #https://github.com/duerig/StyleTTS2/
228
+ class BatchSampler(torch.utils.data.Sampler):
229
+ def __init__(
230
+ self,
231
+ sample_lengths,
232
+ batch_sizes,
233
+ num_replicas=None,
234
+ rank=None,
235
+ shuffle=True,
236
+ drop_last=False,
237
+ ):
238
+ self.batch_sizes = batch_sizes
239
+ if num_replicas is None:
240
+ self.num_replicas = dist.get_world_size()
241
+ else:
242
+ self.num_replicas = num_replicas
243
+ if rank is None:
244
+ self.rank = dist.get_rank()
245
+ else:
246
+ self.rank = rank
247
+ self.shuffle = shuffle
248
+ self.drop_last = drop_last
249
+
250
+ self.time_bins = {}
251
+ self.epoch = 0
252
+ self.total_len = 0
253
+ self.last_bin = None
254
+
255
+ for i in range(len(sample_lengths)):
256
+ bin_num = self.get_time_bin(sample_lengths[i])
257
+ if bin_num != -1:
258
+ if bin_num not in self.time_bins:
259
+ self.time_bins[bin_num] = []
260
+ self.time_bins[bin_num].append(i)
261
+
262
+ for key in self.time_bins.keys():
263
+ val = self.time_bins[key]
264
+ total_batch = self.batch_sizes * self.num_replicas
265
+ self.total_len += len(val) // total_batch
266
+ if not self.drop_last and len(val) % total_batch != 0:
267
+ self.total_len += 1
268
+
269
+ def __iter__(self):
270
+ sampler_order = list(self.time_bins.keys())
271
+ sampler_indices = []
272
+
273
+ if self.shuffle:
274
+ sampler_indices = torch.randperm(len(sampler_order)).tolist()
275
+ else:
276
+ sampler_indices = list(range(len(sampler_order)))
277
+
278
+ for index in sampler_indices:
279
+ key = sampler_order[index]
280
+ current_bin = self.time_bins[key]
281
+ dist_sampler = torch.utils.data.distributed.DistributedSampler(  # renamed to avoid shadowing torch.distributed (imported as dist)
282
+ current_bin,
283
+ num_replicas=self.num_replicas,
284
+ rank=self.rank,
285
+ shuffle=self.shuffle,
286
+ drop_last=self.drop_last,
287
+ )
288
+ dist_sampler.set_epoch(self.epoch)
289
+ sampler = torch.utils.data.sampler.BatchSampler(
290
+ dist_sampler, self.batch_sizes, self.drop_last
291
+ )
292
+ for item_list in sampler:
293
+ self.last_bin = key
294
+ yield [current_bin[i] for i in item_list]
295
+
296
+ def __len__(self):
297
+ return self.total_len
298
+
299
+ def set_epoch(self, epoch):
300
+ self.epoch = epoch
301
+
302
+ def get_time_bin(self, sample_count):
303
+ result = -1
304
+ frames = sample_count // 300
305
+ if frames >= 20:
306
+ result = (frames - 20) // 20
307
+ return result
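The duration bucketing in BatchSampler.get_time_bin above works on the sample counts returned by get_length (already scaled to 24 kHz): one mel frame per 300 samples, clips shorter than 20 frames (0.25 s) are dropped, and every further 20 frames opens a new bucket so that each batch holds clips of similar length. A standalone re-implementation for illustration only, not part of the repository:

def time_bin(sample_count):
    frames = sample_count // 300            # hop_length = 300 at 24 kHz
    return (frames - 20) // 20 if frames >= 20 else -1

assert time_bin(int(0.2 * 24000)) == -1    # 16 frames: too short, clip is skipped
assert time_bin(int(3.0 * 24000)) == 11    # 240 frames -> bucket 11
assert time_bin(int(5.5 * 24000)) == 21    # 440 frames -> bucket 21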
en/StyleTTS2-lite/models.py ADDED
@@ -0,0 +1,532 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch.nn.utils import weight_norm
6
+
7
+ from munch import Munch
8
+
9
+ class LearnedDownSample(nn.Module):
10
+ def __init__(self, layer_type, dim_in):
11
+ super().__init__()
12
+ self.layer_type = layer_type
13
+
14
+ if self.layer_type == 'none':
15
+ self.conv = nn.Identity()
16
+ elif self.layer_type == 'timepreserve':
17
+ self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0))
18
+ elif self.layer_type == 'half':
19
+ self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1)
20
+ else:
21
+ raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
22
+
23
+ def forward(self, x):
24
+ return self.conv(x)
25
+
26
+ class LearnedUpSample(nn.Module):
27
+ def __init__(self, layer_type, dim_in):
28
+ super().__init__()
29
+ self.layer_type = layer_type
30
+
31
+ if self.layer_type == 'none':
32
+ self.conv = nn.Identity()
33
+ elif self.layer_type == 'timepreserve':
34
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
35
+ elif self.layer_type == 'half':
36
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
37
+ else:
38
+ raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
39
+
40
+
41
+ def forward(self, x):
42
+ return self.conv(x)
43
+
44
+ class DownSample(nn.Module):
45
+ def __init__(self, layer_type):
46
+ super().__init__()
47
+ self.layer_type = layer_type
48
+
49
+ def forward(self, x):
50
+ if self.layer_type == 'none':
51
+ return x
52
+ elif self.layer_type == 'timepreserve':
53
+ return F.avg_pool2d(x, (2, 1))
54
+ elif self.layer_type == 'half':
55
+ if x.shape[-1] % 2 != 0:
56
+ x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
57
+ return F.avg_pool2d(x, 2)
58
+ else:
59
+ raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
60
+
61
+
62
+ class UpSample(nn.Module):
63
+ def __init__(self, layer_type):
64
+ super().__init__()
65
+ self.layer_type = layer_type
66
+
67
+ def forward(self, x):
68
+ if self.layer_type == 'none':
69
+ return x
70
+ elif self.layer_type == 'timepreserve':
71
+ return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
72
+ elif self.layer_type == 'half':
73
+ return F.interpolate(x, scale_factor=2, mode='nearest')
74
+ else:
75
+ raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
76
+
77
+
78
+ class ResBlk(nn.Module):
79
+ def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
80
+ normalize=False, downsample='none'):
81
+ super().__init__()
82
+ self.actv = actv
83
+ self.normalize = normalize
84
+ self.downsample = DownSample(downsample)
85
+ self.downsample_res = LearnedDownSample(downsample, dim_in)
86
+ self.learned_sc = dim_in != dim_out
87
+ self._build_weights(dim_in, dim_out)
88
+
89
+ def _build_weights(self, dim_in, dim_out):
90
+ self.conv1 = nn.Conv2d(dim_in, dim_in, 3, 1, 1)
91
+ self.conv2 = nn.Conv2d(dim_in, dim_out, 3, 1, 1)
92
+ if self.normalize:
93
+ self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
94
+ self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
95
+ if self.learned_sc:
96
+ self.conv1x1 = nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False)
97
+
98
+ def _shortcut(self, x):
99
+ if self.learned_sc:
100
+ x = self.conv1x1(x)
101
+ if self.downsample:
102
+ x = self.downsample(x)
103
+ return x
104
+
105
+ def _residual(self, x):
106
+ if self.normalize:
107
+ x = self.norm1(x)
108
+ x = self.actv(x)
109
+ x = self.conv1(x)
110
+ x = self.downsample_res(x)
111
+ if self.normalize:
112
+ x = self.norm2(x)
113
+ x = self.actv(x)
114
+ x = self.conv2(x)
115
+ return x
116
+
117
+ def forward(self, x):
118
+ x = self._shortcut(x) + self._residual(x)
119
+ return x / math.sqrt(2) # unit variance
120
+
121
+ class StyleEncoder(nn.Module):
122
+ def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
123
+ super().__init__()
124
+ blocks = []
125
+ blocks += [nn.Conv2d(1, dim_in, 3, 1, 1)]
126
+
127
+ repeat_num = 4
128
+ for _ in range(repeat_num):
129
+ dim_out = min(dim_in*2, max_conv_dim)
130
+ blocks += [ResBlk(dim_in, dim_out, downsample='half')]
131
+ dim_in = dim_out
132
+
133
+ blocks += [nn.LeakyReLU(0.2)]
134
+ blocks += [nn.Conv2d(dim_out, dim_out, 5, 1, 0)]
135
+ blocks += [nn.AdaptiveAvgPool2d(1)]
136
+ blocks += [nn.LeakyReLU(0.2)]
137
+ self.shared = nn.Sequential(*blocks)
138
+
139
+ self.unshared = nn.Linear(dim_out, style_dim)
140
+
141
+ def forward(self, x):
142
+ h = self.shared(x)
143
+ h = h.view(h.size(0), -1)
144
+ s = self.unshared(h)
145
+
146
+ return s
147
+
148
+ class LinearNorm(torch.nn.Module):
149
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
150
+ super(LinearNorm, self).__init__()
151
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
152
+
153
+ torch.nn.init.xavier_uniform_(
154
+ self.linear_layer.weight,
155
+ gain=torch.nn.init.calculate_gain(w_init_gain))
156
+
157
+ def forward(self, x):
158
+ return self.linear_layer(x)
159
+
160
+ class ResBlk1d(nn.Module):
161
+ def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
162
+ normalize=False, downsample='none', dropout_p=0.2):
163
+ super().__init__()
164
+ self.actv = actv
165
+ self.normalize = normalize
166
+ self.downsample_type = downsample
167
+ self.learned_sc = dim_in != dim_out
168
+ self._build_weights(dim_in, dim_out)
169
+ self.dropout_p = dropout_p
170
+
171
+ if self.downsample_type == 'none':
172
+ self.pool = nn.Identity()
173
+ else:
174
+ self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
175
+
176
+ def _build_weights(self, dim_in, dim_out):
177
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
178
+ self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
179
+ if self.normalize:
180
+ self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
181
+ self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
182
+ if self.learned_sc:
183
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
184
+
185
+ def downsample(self, x):
186
+ if self.downsample_type == 'none':
187
+ return x
188
+ else:
189
+ if x.shape[-1] % 2 != 0:
190
+ x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
191
+ return F.avg_pool1d(x, 2)
192
+
193
+ def _shortcut(self, x):
194
+ if self.learned_sc:
195
+ x = self.conv1x1(x)
196
+ x = self.downsample(x)
197
+ return x
198
+
199
+ def _residual(self, x):
200
+ if self.normalize:
201
+ x = self.norm1(x)
202
+ x = self.actv(x)
203
+ x = F.dropout(x, p=self.dropout_p, training=self.training)
204
+
205
+ x = self.conv1(x)
206
+ x = self.pool(x)
207
+ if self.normalize:
208
+ x = self.norm2(x)
209
+
210
+ x = self.actv(x)
211
+ x = F.dropout(x, p=self.dropout_p, training=self.training)
212
+
213
+ x = self.conv2(x)
214
+ return x
215
+
216
+ def forward(self, x):
217
+ x = self._shortcut(x) + self._residual(x)
218
+ return x / math.sqrt(2) # unit variance
219
+
220
+ class LayerNorm(nn.Module):
221
+ def __init__(self, channels, eps=1e-5):
222
+ super().__init__()
223
+ self.channels = channels
224
+ self.eps = eps
225
+
226
+ self.gamma = nn.Parameter(torch.ones(channels))
227
+ self.beta = nn.Parameter(torch.zeros(channels))
228
+
229
+ def forward(self, x):
230
+ x = x.transpose(1, -1)
231
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
232
+ return x.transpose(1, -1)
233
+
234
+ class TextEncoder(nn.Module):
235
+ def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
236
+ super().__init__()
237
+ self.embedding = nn.Embedding(n_symbols, channels)
238
+
239
+ padding = (kernel_size - 1) // 2
240
+ self.cnn = nn.ModuleList()
241
+ for _ in range(depth):
242
+ self.cnn.append(nn.Sequential(
243
+ weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
244
+ LayerNorm(channels),
245
+ actv,
246
+ nn.Dropout(0.2),
247
+ ))
248
+ # self.cnn = nn.Sequential(*self.cnn)
249
+
250
+ self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
251
+
252
+ def forward(self, x, input_lengths, m):
253
+ x = self.embedding(x) # [B, T, emb]
254
+ x = x.transpose(1, 2) # [B, emb, T]
255
+ m = m.to(input_lengths.device).unsqueeze(1)
256
+ x.masked_fill_(m, 0.0)
257
+
258
+ for c in self.cnn:
259
+ x = c(x)
260
+ x.masked_fill_(m, 0.0)
261
+
262
+ x = x.transpose(1, 2) # [B, T, chn]
263
+
264
+ input_lengths = input_lengths.cpu().numpy()
265
+ x = nn.utils.rnn.pack_padded_sequence(
266
+ x, input_lengths, batch_first=True, enforce_sorted=False)
267
+
268
+ self.lstm.flatten_parameters()
269
+ x, _ = self.lstm(x)
270
+ x, _ = nn.utils.rnn.pad_packed_sequence(
271
+ x, batch_first=True)
272
+
273
+ x = x.transpose(-1, -2)
274
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
275
+
276
+ x_pad[:, :, :x.shape[-1]] = x
277
+ x = x_pad.to(x.device)
278
+
279
+ x.masked_fill_(m, 0.0)
280
+
281
+ return x
282
+
283
+ def inference(self, x):
284
+ x = self.embedding(x)
285
+ x = x.transpose(1, 2)
286
+ for c in self.cnn: x = c(x)  # self.cnn is a ModuleList, so apply each conv block in turn
287
+ x = x.transpose(1, 2)
288
+ self.lstm.flatten_parameters()
289
+ x, _ = self.lstm(x)
290
+ return x
291
+
292
+ def length_to_mask(self, lengths):
293
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
294
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
295
+ return mask
296
+
297
+
298
+
299
+ class AdaIN1d(nn.Module):
300
+ def __init__(self, style_dim, num_features):
301
+ super().__init__()
302
+ self.norm = nn.InstanceNorm1d(num_features, affine=False)
303
+ self.fc = nn.Linear(style_dim, num_features*2)
304
+
305
+ def forward(self, x, s):
306
+ h = self.fc(s)
307
+ h = h.view(h.size(0), h.size(1), 1)
308
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
309
+ return (1 + gamma) * self.norm(x) + beta
310
+
311
+ class UpSample1d(nn.Module):
312
+ def __init__(self, layer_type):
313
+ super().__init__()
314
+ self.layer_type = layer_type
315
+
316
+ def forward(self, x):
317
+ if self.layer_type == 'none':
318
+ return x
319
+ else:
320
+ return F.interpolate(x, scale_factor=2, mode='nearest')
321
+
322
+ class AdainResBlk1d(nn.Module):
323
+ def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
324
+ upsample='none', dropout_p=0.0):
325
+ super().__init__()
326
+ self.actv = actv
327
+ self.upsample_type = upsample
328
+ self.upsample = UpSample1d(upsample)
329
+ self.learned_sc = dim_in != dim_out
330
+ self._build_weights(dim_in, dim_out, style_dim)
331
+ self.dropout = nn.Dropout(dropout_p)
332
+
333
+ if upsample == 'none':
334
+ self.pool = nn.Identity()
335
+ else:
336
+ self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
337
+
338
+
339
+ def _build_weights(self, dim_in, dim_out, style_dim):
340
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
341
+ self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
342
+ self.norm1 = AdaIN1d(style_dim, dim_in)
343
+ self.norm2 = AdaIN1d(style_dim, dim_out)
344
+ if self.learned_sc:
345
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
346
+
347
+ def _shortcut(self, x):
348
+ x = self.upsample(x)
349
+ if self.learned_sc:
350
+ x = self.conv1x1(x)
351
+ return x
352
+
353
+ def _residual(self, x, s):
354
+ x = self.norm1(x, s)
355
+ x = self.actv(x)
356
+ x = self.pool(x)
357
+ x = self.conv1(self.dropout(x))
358
+ x = self.norm2(x, s)
359
+ x = self.actv(x)
360
+ x = self.conv2(self.dropout(x))
361
+ return x
362
+
363
+ def forward(self, x, s):
364
+ out = self._residual(x, s)
365
+ out = (out + self._shortcut(x)) / math.sqrt(2)
366
+ return out
367
+
368
+ class AdaLayerNorm(nn.Module):
369
+ def __init__(self, style_dim, channels, eps=1e-5):
370
+ super().__init__()
371
+ self.channels = channels
372
+ self.eps = eps
373
+
374
+ self.fc = nn.Linear(style_dim, channels*2)
375
+
376
+ def forward(self, x, s):
377
+ x = x.transpose(-1, -2)
378
+ x = x.transpose(1, -1)
379
+
380
+ h = self.fc(s)
381
+ h = h.view(h.size(0), h.size(1), 1)
382
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
383
+ gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
384
+
385
+
386
+ x = F.layer_norm(x, (self.channels,), eps=self.eps)
387
+ x = (1 + gamma) * x + beta
388
+ return x.transpose(1, -1).transpose(-1, -2)
389
+
390
+ class ProsodyPredictor(nn.Module):
391
+
392
+ def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
393
+ super().__init__()
394
+
395
+ self.text_encoder = DurationEncoder(sty_dim=style_dim,
396
+ d_model=d_hid,
397
+ nlayers=nlayers,
398
+ dropout=dropout)
399
+
400
+ self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
401
+ self.duration_proj = LinearNorm(d_hid, max_dur)
402
+
403
+ self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
404
+ self.F0 = nn.ModuleList()
405
+ self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
406
+ self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
407
+ self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
408
+
409
+ self.N = nn.ModuleList()
410
+ self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
411
+ self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
412
+ self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
413
+
414
+ self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
415
+ self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
416
+
417
+
418
+ def forward(self, texts, style, text_lengths, alignment, m):
419
+ d = self.text_encoder(texts, style, text_lengths, m)
420
+
421
+ batch_size = d.shape[0]
422
+ text_size = d.shape[1]
423
+
424
+ # predict duration
425
+ input_lengths = text_lengths.cpu().numpy()
426
+ x = nn.utils.rnn.pack_padded_sequence(
427
+ d, input_lengths, batch_first=True, enforce_sorted=False)
428
+
429
+ m = m.to(text_lengths.device).unsqueeze(1)
430
+
431
+ self.lstm.flatten_parameters()
432
+ x, _ = self.lstm(x)
433
+ x, _ = nn.utils.rnn.pad_packed_sequence(
434
+ x, batch_first=True)
435
+
436
+ x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])
437
+
438
+ x_pad[:, :x.shape[1], :] = x
439
+ x = x_pad.to(x.device)
440
+
441
+ duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))
442
+
443
+ en = (d.transpose(-1, -2) @ alignment)
444
+
445
+ return duration.squeeze(-1), en
446
+
447
+ def F0Ntrain(self, x, s):
448
+ x, _ = self.shared(x.transpose(-1, -2))
449
+
450
+ F0 = x.transpose(-1, -2)
451
+ for block in self.F0:
452
+ F0 = block(F0, s)
453
+ F0 = self.F0_proj(F0)
454
+
455
+ N = x.transpose(-1, -2)
456
+ for block in self.N:
457
+ N = block(N, s)
458
+ N = self.N_proj(N)
459
+
460
+ return F0.squeeze(1), N.squeeze(1)
461
+
462
+ def length_to_mask(self, lengths):
463
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
464
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
465
+ return mask
466
+
467
+ class DurationEncoder(nn.Module):
468
+
469
+ def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
470
+ super().__init__()
471
+ self.lstms = nn.ModuleList()
472
+ for _ in range(nlayers):
473
+ self.lstms.append(nn.LSTM(d_model + sty_dim,
474
+ d_model // 2,
475
+ num_layers=1,
476
+ batch_first=True,
477
+ bidirectional=True,
478
+ dropout=dropout))
479
+ self.lstms.append(AdaLayerNorm(sty_dim, d_model))
480
+
481
+
482
+ self.dropout = dropout
483
+ self.d_model = d_model
484
+ self.sty_dim = sty_dim
485
+
486
+ def forward(self, x, style, text_lengths, m):
487
+ masks = m.to(text_lengths.device)
488
+
489
+ x = x.permute(2, 0, 1)
490
+ s = style.expand(x.shape[0], x.shape[1], -1)
491
+ x = torch.cat([x, s], axis=-1)
492
+ x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
493
+
494
+ x = x.transpose(0, 1)
495
+ input_lengths = text_lengths.cpu().numpy()
496
+ x = x.transpose(-1, -2)
497
+
498
+ for block in self.lstms:
499
+ if isinstance(block, AdaLayerNorm):
500
+ x = block(x.transpose(-1, -2), style).transpose(-1, -2)
501
+ x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
502
+ x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
503
+ else:
504
+ x = x.transpose(-1, -2)
505
+ x = nn.utils.rnn.pack_padded_sequence(
506
+ x, input_lengths, batch_first=True, enforce_sorted=False)
507
+ block.flatten_parameters()
508
+ x, _ = block(x)
509
+ x, _ = nn.utils.rnn.pad_packed_sequence(
510
+ x, batch_first=True)
511
+ x = F.dropout(x, p=self.dropout, training=self.training)
512
+ x = x.transpose(-1, -2)
513
+
514
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
515
+
516
+ x_pad[:, :, :x.shape[-1]] = x
517
+ x = x_pad.to(x.device)
518
+
519
+ return x.transpose(-1, -2)
520
+
521
+ def inference(self, x, style):
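+ # Note: this method references self.embedding / self.pos_encoder / self.transformer_encoder, which are not defined in this class; it is not called by the inference path in inference.py.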
522
+ x = self.embedding(x.transpose(-1, -2)) * math.sqrt(self.d_model)
523
+ style = style.expand(x.shape[0], x.shape[1], -1)
524
+ x = torch.cat([x, style], axis=-1)
525
+ src = self.pos_encoder(x)
526
+ output = self.transformer_encoder(src).transpose(0, 1)
527
+ return output
528
+
529
+ def length_to_mask(self, lengths):
530
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
531
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
532
+ return mask
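For orientation, the acoustic style path at inference time feeds a normalized log-mel spectrogram (80 bins, hop 300 at 24 kHz) through StyleEncoder to obtain one fixed-size style vector per utterance. A small shape sketch using the class defaults shown above; the released checkpoints instead use the dimensions from Models/config.yaml:

import torch
from models import StyleEncoder

enc = StyleEncoder(dim_in=48, style_dim=48, max_conv_dim=384)  # class defaults; config values may differ
mel = torch.randn(1, 1, 80, 200)  # [batch, 1, n_mels, frames]; 200 frames is ~2.5 s at hop 300
with torch.no_grad():
    s = enc(mel)
print(s.shape)  # torch.Size([1, 48]): one style vector per utterance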
en/StyleTTS2-lite/requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch
2
+ torchaudio
3
+ numpy
4
+ PyYAML
5
+ munch
6
+ nltk
7
+ librosa
8
+ noisereduce
9
+ phonemizer
10
+ espeakng-loader
en/StyleTTS2-lite/run.ipynb ADDED
@@ -0,0 +1,176 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "5a3ddcc8",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from inference import StyleTTS2\n",
11
+ "\n",
12
+ "import librosa\n",
13
+ "import IPython.display as ipd\n",
14
+ "import torch.cuda\n",
15
+ "\n",
16
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "id": "092cfb69",
22
+ "metadata": {},
23
+ "source": [
24
+ "### Load G2P"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "markdown",
29
+ "id": "a152ec13",
30
+ "metadata": {},
31
+ "source": [
32
+ "If you did not use eSpeak for your language, please add your own G2P."
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "ca224f37",
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "import sys\n",
43
+ "import phonemizer\n",
44
+ "if sys.platform.startswith(\"win\"):\n",
45
+ " try:\n",
46
+ " from phonemizer.backend.espeak.wrapper import EspeakWrapper\n",
47
+ " import espeakng_loader\n",
48
+ " EspeakWrapper.set_library(espeakng_loader.get_library_path())\n",
49
+ " except Exception as e:\n",
50
+ " print(e)\n",
51
+ "\n",
52
+ "def get_phoneme(text, lang):\n",
53
+ " try:\n",
54
+ " my_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True, language_switch='remove-flags')\n",
55
+ " return my_phonemizer.phonemize([text])[0]\n",
56
+ " except Exception as e:\n",
57
+ " print(e)"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "id": "7b9cecbe",
63
+ "metadata": {},
64
+ "source": [
65
+ "### Load models"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": null,
71
+ "id": "e7b9c01d",
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "config_path = \"Models/config.yaml\"\n",
76
+ "models_path = \"Models/inference/model.pth\""
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "id": "b803110e",
82
+ "metadata": {},
83
+ "source": [
84
+ "### Synthesize speech\n",
85
+ "\n",
86
+ "Little Note: Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed."
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "id": "78396f70",
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "speaker = {\n",
97
+ " \"path\": \"./Audio/1_heart.wav\", #Ref audio path\n",
98
+ " \"speed\": 1.0, #Speaking speed\n",
99
+ "}\n",
100
+ "\n",
101
+ "max_samples = 24000*20 #max 20 seconds ref audio\n",
102
+ "print(speaker['path'])\n",
103
+ "wave, sr = librosa.load(speaker['path'], sr=24000)\n",
104
+ "audio, index = librosa.effects.trim(wave, top_db=30)\n",
105
+ "if sr != 24000: audio = librosa.resample(audio, sr, 24000)\n",
106
+ "if len(audio) > max_samples: audio = audio[:max_samples]\n",
107
+ "display(ipd.Audio(audio, rate=24000, normalize=True))"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "id": "395959f1",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "text = '''\n",
118
+ "Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n",
119
+ "Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n",
120
+ "'''"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "16194211",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "model = StyleTTS2(config_path, models_path).eval().to(device)\n",
131
+ "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n",
132
+ "stabilize = False #BOOL Stabilize speaking speed.\n",
133
+ "denoise = 0.3 #FLOAT Adjust the strength of the denoiser. Value range is [0, 1]\n",
134
+ "n_merge = 16 #INT Avoid short sentences by merging when a sentence has fewer than n words"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "id": "980c6fbb",
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "with torch.no_grad():\n",
145
+ " phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
146
+ "\n",
147
+ " styles = model.get_styles(speaker, denoise, avg_style)\n",
148
+ " r = model.generate(phonemes, styles, stabilize, n_merge)\n",
149
+ "\n",
150
+ "print('Synthesized:')\n",
151
+ "display(ipd.Audio(r, rate=24000, normalize=True))"
152
+ ]
153
+ }
154
+ ],
155
+ "metadata": {
156
+ "kernelspec": {
157
+ "display_name": "base",
158
+ "language": "python",
159
+ "name": "python3"
160
+ },
161
+ "language_info": {
162
+ "codemirror_mode": {
163
+ "name": "ipython",
164
+ "version": 3
165
+ },
166
+ "file_extension": ".py",
167
+ "mimetype": "text/x-python",
168
+ "name": "python",
169
+ "nbconvert_exporter": "python",
170
+ "pygments_lexer": "ipython3",
171
+ "version": "3.11.7"
172
+ }
173
+ },
174
+ "nbformat": 4,
175
+ "nbformat_minor": 5
176
+ }
en/StyleTTS2-lite/source.txt ADDED
@@ -0,0 +1 @@
1
+ https://huggingface.co/dangtr0408/StyleTTS2-lite
en/StyleTTS2/.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
en/StyleTTS2/Multi0/config.yml ADDED
@@ -0,0 +1,112 @@
1
+ ASR_config: Utils/ASR/config.yml
2
+ ASR_path: Utils/ASR/epoch_00080.pth
3
+ F0_path: Utils/JDC/bst.t7
4
+ PLBERT_dir: Utils/PLBERT/
5
+ batch_size: 6
6
+ data_params:
7
+ OOD_data: Data/OOD_texts.txt
8
+ min_length: 50
9
+ root_path: /root/StyleTTS2/Omni1_data
10
+ train_data: Data/train_list.txt
11
+ val_data: Data/val_list.txt
12
+ device: cuda
13
+ epochs_1st: 200
14
+ epochs_2nd: 60
15
+ first_stage_path: first_stage.pth
16
+ log_dir: Models/Omni1
17
+ log_interval: 10
18
+ loss_params:
19
+ TMA_epoch: 50
20
+ diff_epoch: 14
21
+ joint_epoch: 19
22
+ lambda_F0: 1.0
23
+ lambda_ce: 20.0
24
+ lambda_diff: 1.0
25
+ lambda_dur: 1.0
26
+ lambda_gen: 1.0
27
+ lambda_mel: 5.0
28
+ lambda_mono: 1.0
29
+ lambda_norm: 1.0
30
+ lambda_s2s: 1.0
31
+ lambda_slm: 1.0
32
+ lambda_sty: 1.0
33
+ max_len: 400
34
+ model_params:
35
+ decoder:
36
+ gen_istft_hop_size: 5
37
+ gen_istft_n_fft: 20
38
+ resblock_dilation_sizes:
39
+ - - 1
40
+ - 3
41
+ - 5
42
+ - - 1
43
+ - 3
44
+ - 5
45
+ - - 1
46
+ - 3
47
+ - 5
48
+ resblock_kernel_sizes:
49
+ - 3
50
+ - 7
51
+ - 11
52
+ type: istftnet
53
+ upsample_initial_channel: 512
54
+ upsample_kernel_sizes:
55
+ - 20
56
+ - 12
57
+ upsample_rates:
58
+ - 10
59
+ - 6
60
+ diffusion:
61
+ dist:
62
+ estimate_sigma_data: true
63
+ mean: -3.0
64
+ sigma_data: 0.3141927569675583
65
+ std: 1.0
66
+ embedding_mask_proba: 0.1
67
+ transformer:
68
+ head_features: 64
69
+ multiplier: 2
70
+ num_heads: 8
71
+ num_layers: 3
72
+ dim_in: 64
73
+ dropout: 0.2
74
+ hidden_dim: 512
75
+ max_conv_dim: 512
76
+ max_dur: 50
77
+ multispeaker: true
78
+ n_layer: 3
79
+ n_mels: 80
80
+ n_token: 178
81
+ slm:
82
+ hidden: 768
83
+ initial_channel: 64
84
+ model: microsoft/wavlm-base-plus
85
+ nlayers: 13
86
+ sr: 16000
87
+ style_dim: 128
88
+ optimizer_params:
89
+ bert_lr: 1.0e-05
90
+ ft_lr: 1.0e-05
91
+ lr: 0.0001
92
+ preprocess_params:
93
+ spect_params:
94
+ hop_length: 300
95
+ n_fft: 2048
96
+ win_length: 1200
97
+ sr: 24000
98
+ pretrained_model: /root/StyleTTS2/Models/Omni1/epoch_2nd_pretrained.pth
99
+ resume: true
100
+ save_freq: 1
101
+ saver_freq_steps: 150
102
+ saver_max_ckpts: 5
103
+ saver_mode: ITER
104
+ second_stage_load_pretrained: true
105
+ slmadv_params:
106
+ batch_percentage: 0.5
107
+ iter: 10
108
+ max_len: 400
109
+ min_len: 160
110
+ scale: 0.01
111
+ sig: 1.5
112
+ thresh: 5
en/StyleTTS2/Multi0/config_30_e934.yml ADDED
@@ -0,0 +1,22 @@
1
+ {ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
2
+ PLBERT_dir: Utils/PLBERT/, batch_size: 12, data_params: {OOD_data: Data/OOD_texts.txt,
3
+ min_length: 50, root_path: /root/StyleTTS2/Multi0_data, train_data: Data/train_list.txt,
4
+ val_data: Data/val_list.txt}, device: cuda, epochs_1st: 200, epochs_2nd: 60, first_stage_path: first_stage.pth,
5
+ log_dir: Models/Multi0, log_interval: 10, loss_params: {TMA_epoch: 50, diff_epoch: 14,
6
+ joint_epoch: 19, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0, lambda_dur: 1.0,
7
+ lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0, lambda_s2s: 1.0,
8
+ lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 400, model_params: {decoder: {gen_istft_hop_size: 5,
9
+ gen_istft_n_fft: 20, resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3,
10
+ 5]], resblock_kernel_sizes: [3, 7, 11], type: istftnet, upsample_initial_channel: 512,
11
+ upsample_kernel_sizes: [20, 12], upsample_rates: [10, 6]}, diffusion: {dist: {
12
+ estimate_sigma_data: true, mean: -3.0, sigma_data: 0.31839087134423844, std: 1.0},
13
+ embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2, num_heads: 8,
14
+ num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512,
15
+ max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178, slm: {
16
+ hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13,
17
+ sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05,
18
+ lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048,
19
+ win_length: 1200}, sr: 24000}, pretrained_model: '', resume: true, save_freq: 1,
20
+ saver_freq_steps: 150, saver_max_ckpts: 5, saver_mode: ITER, second_stage_load_pretrained: true,
21
+ slmadv_params: {batch_percentage: 0.5, iter: 10, max_len: 400, min_len: 160, scale: 0.01,
22
+ sig: 1.5, thresh: 5}}
en/StyleTTS2/Multi0/config_40_1c872.yml ADDED
@@ -0,0 +1,22 @@
1
+ {ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
2
+ PLBERT_dir: Utils/PLBERT/, batch_size: 6, data_params: {OOD_data: Data/OOD_texts.txt,
3
+ min_length: 50, root_path: /root/StyleTTS2/Multi0_data, train_data: Data/train_list.txt,
4
+ val_data: Data/val_list.txt}, device: cuda, epochs_1st: 200, epochs_2nd: 60, first_stage_path: first_stage.pth,
5
+ log_dir: Models/Multi0, log_interval: 10, loss_params: {TMA_epoch: 50, diff_epoch: 14,
6
+ joint_epoch: 19, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0, lambda_dur: 1.0,
7
+ lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0, lambda_s2s: 1.0,
8
+ lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 400, model_params: {decoder: {gen_istft_hop_size: 5,
9
+ gen_istft_n_fft: 20, resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3,
10
+ 5]], resblock_kernel_sizes: [3, 7, 11], type: istftnet, upsample_initial_channel: 512,
11
+ upsample_kernel_sizes: [20, 12], upsample_rates: [10, 6]}, diffusion: {dist: {
12
+ estimate_sigma_data: true, mean: -3.0, sigma_data: 0.2969374090377316, std: 1.0},
13
+ embedding_mask_proba: 0.1, transformer: {head_features: 64, multiplier: 2, num_heads: 8,
14
+ num_layers: 3}}, dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512,
15
+ max_dur: 50, multispeaker: true, n_layer: 3, n_mels: 80, n_token: 178, slm: {
16
+ hidden: 768, initial_channel: 64, model: microsoft/wavlm-base-plus, nlayers: 13,
17
+ sr: 16000}, style_dim: 128}, optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05,
18
+ lr: 0.0001}, preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048,
19
+ win_length: 1200}, sr: 24000}, pretrained_model: '', resume: true, save_freq: 1,
20
+ saver_freq_steps: 150, saver_max_ckpts: 5, saver_mode: ITER, second_stage_load_pretrained: true,
21
+ slmadv_params: {batch_percentage: 0.5, iter: 10, max_len: 400, min_len: 160, scale: 0.01,
22
+ sig: 1.5, thresh: 5}}
en/StyleTTS2/Multi0/epoch_2nd_23_9ab0.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbe9be5c4c2df12b5ddb65cce7e45849d3ed674db1fcb89eb7f1bafc65f05ade
3
+ size 2132412506
en/StyleTTS2/Multi0/epoch_2nd_30_e934.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7af95c2c61a778fec6ad5cec95497daaf9bf3dd6cec6db7f02f4fe90e3e5657a
3
+ size 2132412506
en/StyleTTS2/Multi0/epoch_2nd_40_1c872.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:065b610ae5fd9fc73eea396761d42a99a4770a243aca76aa7db4ff9bd13d81ac
3
+ size 2132415942
en/StyleTTS2/Multi0/ref_audio.zip ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa342abae6a7d06b84508e828c1082aa0fc6d484bd709cb40650d879c31c5f16
3
+ size 4766523
en/StyleTTS2/README.md ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ datasets:
3
+ - therealvul/StyleTTS2MLP
4
+ language:
5
+ - en
6
+ ---
7
+ This repository contains StyleTTS2 models trained on Pony Preservation Project data.