Add LibriTTS Vocos model
Browse files- .gitattributes +2 -0
- README.md +26 -3
- Vocos/LibriTTS/Data/OOD_texts2.txt +3 -0
- Vocos/LibriTTS/Data/train_libritts.txt +3 -0
- Vocos/LibriTTS/Data/valid_libritts.txt +3 -0
- Vocos/LibriTTS/LibriTTS_vocos_first_stage.pth +3 -0
- Vocos/LibriTTS/config_libritts_vocos.yml +19 -0
- Vocos/LibriTTS/epoch_2nd_00029.pth +3 -0
- Vocos/LibriTTS/tensorboard/events.out.tfevents.1740379572.gpu-1.2895635.0 +3 -0
- Vocos/LibriTTS/tensorboard/events.out.tfevents.1740450365.gpu-1.3390091.0 +3 -0
- Vocos/LibriTTS/tensorboard/events.out.tfevents.1740450615.gpu-1.3400827.0 +3 -0
- Vocos/LibriTTS/tensorboard/events.out.tfevents.1741224850.gpu-1.3455425.0 +3 -0
- Vocos/LibriTTS/tensorboard/events.out.tfevents.1741224909.gpu-1.3456868.0 +3 -0
- Vocos/LibriTTS/tensorboard/events.out.tfevents.1741249732.gpu-1.3929847.0 +3 -0
- Vocos/LibriTTS/train.log +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.txt filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.log filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,26 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Vocos LibriTTS Model
|
| 2 |
+
|
| 3 |
+
This model was trained using the train-clean-100 and train-clean-360 subsets of the LibriTTS dataset.
|
| 4 |
+
|
| 5 |
+
## Model Information
|
| 6 |
+
|
| 7 |
+
- **Model Architecture**: Vocos
|
| 8 |
+
- **Training Data**: LibriTTS train-clean-100 + train-clean-360
|
| 9 |
+
- **License**: MIT
|
| 10 |
+
|
| 11 |
+
## Repository
|
| 12 |
+
|
| 13 |
+
The training and inference code can be found at: [StyleTTS2-Vocos](https://github.com/5Hyeons/StyleTTS2-Vocos)
|
| 14 |
+
|
| 15 |
+
### Folder Structure
|
| 16 |
+
```
|
| 17 |
+
StyleTTS2
|
| 18 |
+
├── README.md
|
| 19 |
+
└── Vocos
|
| 20 |
+
    └── LibriTTS
|
| 21 |
+
        └── [checkpoint files]
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
## License
|
| 25 |
+
|
| 26 |
+
This model is released under the MIT License. This is one of the most permissive open-source licenses, allowing for both commercial and non-commercial use, modification, and distribution.
|
Vocos/LibriTTS/Data/OOD_texts2.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83c1886c0f8c25b91d0069c2d75f5911e3bfa2766d97098779cb685cd9a23749
|
| 3 |
+
size 42723963
|
Vocos/LibriTTS/Data/train_libritts.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e9da2bab6aaa3dfaea1b70c7adc915bde6e04826c1b02fba9bf7b5ceffc8fa0d
|
| 3 |
+
size 31124170
|
Vocos/LibriTTS/Data/valid_libritts.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:67b42ece5bcb11f9078b79f26832ae17936b78d97a326b99978062600da3aaa2
|
| 3 |
+
size 319831
|
Vocos/LibriTTS/LibriTTS_vocos_first_stage.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:308e539a889a0152a89d946265ecd5993ff3ab21650f22c2d7515abf167f788f
|
| 3 |
+
size 2163605292
|
Vocos/LibriTTS/config_libritts_vocos.yml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
|
| 2 |
+
PLBERT_dir: Utils/PLBERT/, batch_size: 16, data_params: {OOD_data: Data/OOD_texts.txt,
|
| 3 |
+
min_length: 50, root_path: /data/LibriTTS, train_data: Data/train_libritts.txt,
|
| 4 |
+
val_data: Data/valid_libritts.txt}, device: cuda, epochs_1st: 50, epochs_2nd: 30,
|
| 5 |
+
first_stage_path: LibriTTS_vocos_first_stage.pth, load_only_params: false, log_dir: /data/ckpts/stts2/LibriTTS_vocos,
|
| 6 |
+
log_interval: 10, loss_params: {TMA_epoch: 5, diff_epoch: 10, joint_epoch: 15, lambda_F0: 1.0,
|
| 7 |
+
lambda_ce: 20.0, lambda_diff: 1.0, lambda_dur: 1.0, lambda_gen: 1.0, lambda_mel: 5.0,
|
| 8 |
+
lambda_mono: 1.0, lambda_norm: 1.0, lambda_s2s: 1.0, lambda_slm: 1.0, lambda_sty: 1.0},
|
| 9 |
+
max_len: 300, model_params: {decoder: {gen_istft_hop_size: 300, gen_istft_n_fft: 1200,
|
| 10 |
+
intermediate_dim: 1536, num_layers: 8, type: vocos}, diffusion: {dist: {estimate_sigma_data: true,
|
| 11 |
+
mean: -3.0, sigma_data: 0.23322181252793212, std: 1.0}, embedding_mask_proba: 0.1,
|
| 12 |
+
transformer: {head_features: 64, multiplier: 2, num_heads: 8, num_layers: 3}},
|
| 13 |
+
dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512, max_dur: 50, multispeaker: true,
|
| 14 |
+
n_layer: 3, n_mels: 80, n_token: 178, slm: {hidden: 768, initial_channel: 64,
|
| 15 |
+
model: microsoft/wavlm-base-plus, nlayers: 13, sr: 16000}, style_dim: 128},
|
| 16 |
+
optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05, lr: 0.0001}, preprocess_params: {
|
| 17 |
+
spect_params: {hop_length: 300, n_fft: 2048, win_length: 1200}, sr: 24000}, pretrained_model: '',
|
| 18 |
+
save_freq: 1, second_stage_load_pretrained: true, slmadv_params: {batch_percentage: 0.5,
|
| 19 |
+
iter: 20, max_len: 500, min_len: 400, scale: 0.01, sig: 1.5, thresh: 5}}
|
Vocos/LibriTTS/epoch_2nd_00029.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e8a5436301091400054f348d0aff74081d433c9402fc56bd71bad21f74e7996a
|
| 3 |
+
size 2480791363
|
Vocos/LibriTTS/tensorboard/events.out.tfevents.1740379572.gpu-1.2895635.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cba9acbcf3eacdf270df91d837d7cb403b973c87924ff9aab5411322cb7cc0c
|
| 3 |
+
size 6041600
|
Vocos/LibriTTS/tensorboard/events.out.tfevents.1740450365.gpu-1.3390091.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb0c308bfa92b0c6ff4bc86a9284bfdf6d287f12f4880e66cc992730c1b934a9
|
| 3 |
+
size 88
|
Vocos/LibriTTS/tensorboard/events.out.tfevents.1740450615.gpu-1.3400827.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e18f377d0ae08dd42391967d2d9c2a685abd7e98ef21c2de7440d3a85b52afaf
|
| 3 |
+
size 73350099
|
Vocos/LibriTTS/tensorboard/events.out.tfevents.1741224850.gpu-1.3455425.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:972ef9eb2d0d05e73a6373e5db26211d96f9cb7825585e66254f40212716d709
|
| 3 |
+
size 88
|
Vocos/LibriTTS/tensorboard/events.out.tfevents.1741224909.gpu-1.3456868.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:444728d61889724bd6265749d0a378b0ce99f78158a7b599baa91e05fccc0521
|
| 3 |
+
size 4741948
|
Vocos/LibriTTS/tensorboard/events.out.tfevents.1741249732.gpu-1.3929847.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a31ec93df37c599f68259abd22392729eb5c081112642f1bd156de19a2e2686
|
| 3 |
+
size 78370138
|
Vocos/LibriTTS/train.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebf58487b746afa5729e48134ab118e58c749d0805a25b0e3a2c4ea28f0354a2
|
| 3 |
+
size 15614787
|