Update
Browse files- Experiments/nohup.out +2 -2
- Experiments/run/events.out.tfevents.1706462806.edresson-train-80.145564.0 +2 -2
- Experiments/{runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt β run/events.out.tfevents.1706899297.edresson-train-80-3.1052.0} +2 -2
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/best_model.pth +2 -2
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/best_model_195001.pth} +2 -2
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/config.json +6 -6
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/language_ids.json +0 -0
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/speakers.pth +0 -0
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/train_syntacc_baseline.py +3 -1
- Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/trainer_0_log.txt +2 -2
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py +0 -352
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json +0 -496
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json +0 -15
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py +0 -352
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json +0 -496
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json +0 -15
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth +0 -3
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt +0 -3
- Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/{checkpoint_185000.pth β checkpoint_195000.pth} +1 -1
- Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/trainer_0_log.txt +2 -2
- Experiments/train_syntacc_baseline.py +1 -1
Experiments/nohup.out
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f3ff491be1a22770ad6be06a4ab637e3ee1fdd7ab56a46d56b6ee5ce294191a
|
| 3 |
+
size 19098782
|
Experiments/run/events.out.tfevents.1706462806.edresson-train-80.145564.0
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:edf473f639006f00be06083dcda982e19ad249445299bba3ccfa9d3c3be668c9
|
| 3 |
+
size 603478571
|
Experiments/{runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt β run/events.out.tfevents.1706899297.edresson-train-80-3.1052.0}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f92cb9921885f7784782d7c4cf4983bd9ebf92511857b363ad6c4a213d77e7fb
|
| 3 |
+
size 1426573
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/best_model.pth
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a8ca0385eb8c2d74471a308ead9447f46334969a793ff980a527783b55f6571
|
| 3 |
+
size 347720178
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/best_model_195001.pth}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a8ca0385eb8c2d74471a308ead9447f46334969a793ff980a527783b55f6571
|
| 3 |
+
size 347720178
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/config.json
RENAMED
|
@@ -397,16 +397,16 @@
|
|
| 397 |
],
|
| 398 |
"use_sdp": true,
|
| 399 |
"noise_scale": 1.0,
|
| 400 |
-
"inference_noise_scale": 0.
|
| 401 |
"length_scale": 1,
|
| 402 |
"noise_scale_dp": 1.0,
|
| 403 |
-
"inference_noise_scale_dp":
|
| 404 |
"max_inference_len": null,
|
| 405 |
"init_discriminator": true,
|
| 406 |
"use_spectral_norm_disriminator": false,
|
| 407 |
"use_speaker_embedding": false,
|
| 408 |
"num_speakers": 0,
|
| 409 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-
|
| 410 |
"d_vector_file": [
|
| 411 |
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
| 412 |
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
|
@@ -429,7 +429,7 @@
|
|
| 429 |
"use_language_embedding": true,
|
| 430 |
"embedded_language_dim": 4,
|
| 431 |
"num_languages": 0,
|
| 432 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-
|
| 433 |
"use_speaker_encoder_as_loss": false,
|
| 434 |
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
| 435 |
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
|
@@ -472,9 +472,9 @@
|
|
| 472 |
"r": 1,
|
| 473 |
"num_speakers": 0,
|
| 474 |
"use_speaker_embedding": false,
|
| 475 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-
|
| 476 |
"speaker_embedding_channels": 256,
|
| 477 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-
|
| 478 |
"use_language_embedding": true,
|
| 479 |
"use_d_vector_file": true,
|
| 480 |
"d_vector_file": [
|
|
|
|
| 397 |
],
|
| 398 |
"use_sdp": true,
|
| 399 |
"noise_scale": 1.0,
|
| 400 |
+
"inference_noise_scale": 0.33,
|
| 401 |
"length_scale": 1,
|
| 402 |
"noise_scale_dp": 1.0,
|
| 403 |
+
"inference_noise_scale_dp": 0.33,
|
| 404 |
"max_inference_len": null,
|
| 405 |
"init_discriminator": true,
|
| 406 |
"use_spectral_norm_disriminator": false,
|
| 407 |
"use_speaker_embedding": false,
|
| 408 |
"num_speakers": 0,
|
| 409 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/speakers.pth",
|
| 410 |
"d_vector_file": [
|
| 411 |
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
| 412 |
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
|
|
|
| 429 |
"use_language_embedding": true,
|
| 430 |
"embedded_language_dim": 4,
|
| 431 |
"num_languages": 0,
|
| 432 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/language_ids.json",
|
| 433 |
"use_speaker_encoder_as_loss": false,
|
| 434 |
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
| 435 |
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
|
|
|
| 472 |
"r": 1,
|
| 473 |
"num_speakers": 0,
|
| 474 |
"use_speaker_embedding": false,
|
| 475 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/speakers.pth",
|
| 476 |
"speaker_embedding_channels": 256,
|
| 477 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a/language_ids.json",
|
| 478 |
"use_language_embedding": true,
|
| 479 |
"use_d_vector_file": true,
|
| 480 |
"d_vector_file": [
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/language_ids.json
RENAMED
|
File without changes
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/speakers.pth
RENAMED
|
File without changes
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/train_syntacc_baseline.py
RENAMED
|
@@ -28,7 +28,7 @@ RUN_NAME = "YourTTS-Baseline-PT"
|
|
| 28 |
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
| 29 |
|
| 30 |
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 31 |
-
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-
|
| 32 |
|
| 33 |
# This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
|
| 34 |
SKIP_TRAIN_EPOCH = False
|
|
@@ -221,6 +221,8 @@ audio_config = VitsAudioConfig(
|
|
| 221 |
|
| 222 |
# Init VITSArgs setting the arguments that are needed for the YourTTS model
|
| 223 |
model_args = VitsArgs(
|
|
|
|
|
|
|
| 224 |
spec_segment_size=62,
|
| 225 |
hidden_channels=192,
|
| 226 |
hidden_channels_ffn_text_encoder=768,
|
|
|
|
| 28 |
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
| 29 |
|
| 30 |
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 31 |
+
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/checkpoint_195000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 32 |
|
| 33 |
# This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
|
| 34 |
SKIP_TRAIN_EPOCH = False
|
|
|
|
| 221 |
|
| 222 |
# Init VITSArgs setting the arguments that are needed for the YourTTS model
|
| 223 |
model_args = VitsArgs(
|
| 224 |
+
inference_noise_scale=0.33,
|
| 225 |
+
inference_noise_scale_dp=0.33,
|
| 226 |
spec_segment_size=62,
|
| 227 |
hidden_channels=192,
|
| 228 |
hidden_channels_ffn_text_encoder=768,
|
Experiments/runs/{YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9 β YourTTS-Baseline-PT-February-02-2024_03+41PM-a1d8f544a}/trainer_0_log.txt
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:800fa1ba79843ee3494b41dbc8ffa45c6f147a7eb369e72260cbc0a5ce75dd72
|
| 3 |
+
size 135592
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
|
| 3 |
-
size 1043220702
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
|
| 3 |
-
size 1043220702
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a71ead47e605fc525b264ad882fd54630c15a42eb69aaf88993d26d5ea84ae3b
|
| 3 |
-
size 1043220766
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:96e16ee83729813041c17f6edf8a702bdf59e7afe345cfad1fe65dd4ba0b1fce
|
| 3 |
-
size 1043220766
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py
DELETED
|
@@ -1,352 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
|
| 3 |
-
import torch
|
| 4 |
-
from trainer import Trainer, TrainerArgs
|
| 5 |
-
|
| 6 |
-
from TTS.bin.compute_embeddings import compute_embeddings
|
| 7 |
-
from TTS.bin.resample import resample_files
|
| 8 |
-
from TTS.config.shared_configs import BaseDatasetConfig
|
| 9 |
-
from TTS.tts.configs.vits_config import VitsConfig
|
| 10 |
-
from TTS.tts.datasets import load_tts_samples
|
| 11 |
-
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
|
| 12 |
-
from TTS.utils.downloaders import download_libri_tts
|
| 13 |
-
from torch.utils.data import DataLoader
|
| 14 |
-
from TTS.utils.samplers import PerfectBatchSampler
|
| 15 |
-
torch.set_num_threads(24)
|
| 16 |
-
|
| 17 |
-
# pylint: disable=W0105
|
| 18 |
-
"""
|
| 19 |
-
This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
|
| 20 |
-
YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
|
| 21 |
-
"""
|
| 22 |
-
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
|
| 23 |
-
|
| 24 |
-
# Name of the run for the Trainer
|
| 25 |
-
RUN_NAME = "YourTTS-Baseline-PT"
|
| 26 |
-
|
| 27 |
-
# Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
|
| 28 |
-
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
| 29 |
-
|
| 30 |
-
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 31 |
-
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 32 |
-
|
| 33 |
-
# This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
|
| 34 |
-
SKIP_TRAIN_EPOCH = False
|
| 35 |
-
|
| 36 |
-
# Set here the batch size to be used in training and evaluation
|
| 37 |
-
BATCH_SIZE = 26
|
| 38 |
-
|
| 39 |
-
# Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
|
| 40 |
-
# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
|
| 41 |
-
SAMPLE_RATE = 16000
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
DASHBOARD_LOGGER="tensorboard"
|
| 45 |
-
LOGGER_URI = None
|
| 46 |
-
|
| 47 |
-
DASHBOARD_LOGGER = "clearml"
|
| 48 |
-
LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
# Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
|
| 53 |
-
MAX_AUDIO_LEN_IN_SECONDS = float("inf")
|
| 54 |
-
|
| 55 |
-
# Define here the datasets config
|
| 56 |
-
brpb_train_config = BaseDatasetConfig(
|
| 57 |
-
formatter="coqui",
|
| 58 |
-
dataset_name="mupe",
|
| 59 |
-
meta_file_train="metadata_coqui_brpb.csv",
|
| 60 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 61 |
-
language="brpb"
|
| 62 |
-
)
|
| 63 |
-
|
| 64 |
-
brba_train_config = BaseDatasetConfig(
|
| 65 |
-
formatter="coqui",
|
| 66 |
-
dataset_name="mupe",
|
| 67 |
-
meta_file_train="metadata_coqui_brba.csv",
|
| 68 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 69 |
-
language="brba"
|
| 70 |
-
)
|
| 71 |
-
|
| 72 |
-
brportugal_train_config = BaseDatasetConfig(
|
| 73 |
-
formatter="coqui",
|
| 74 |
-
dataset_name="mupe",
|
| 75 |
-
meta_file_train="metadata_coqui_brportugal.csv",
|
| 76 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 77 |
-
language="brportugal"
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
brsp_train_config = BaseDatasetConfig(
|
| 81 |
-
formatter="coqui",
|
| 82 |
-
dataset_name="mupe",
|
| 83 |
-
meta_file_train="metadata_coqui_brsp.csv",
|
| 84 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 85 |
-
language="brsp"
|
| 86 |
-
)
|
| 87 |
-
|
| 88 |
-
brpe_train_config = BaseDatasetConfig(
|
| 89 |
-
formatter="coqui",
|
| 90 |
-
dataset_name="mupe",
|
| 91 |
-
meta_file_train="metadata_coqui_brpe.csv",
|
| 92 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 93 |
-
language="brpe"
|
| 94 |
-
)
|
| 95 |
-
|
| 96 |
-
brmg_train_config = BaseDatasetConfig(
|
| 97 |
-
formatter="coqui",
|
| 98 |
-
dataset_name="mupe",
|
| 99 |
-
meta_file_train="metadata_coqui_brmg.csv",
|
| 100 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 101 |
-
language="brmg"
|
| 102 |
-
)
|
| 103 |
-
|
| 104 |
-
brrj_train_config = BaseDatasetConfig(
|
| 105 |
-
formatter="coqui",
|
| 106 |
-
dataset_name="mupe",
|
| 107 |
-
meta_file_train="metadata_coqui_brrj.csv",
|
| 108 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 109 |
-
language="brrj"
|
| 110 |
-
)
|
| 111 |
-
|
| 112 |
-
brce_train_config = BaseDatasetConfig(
|
| 113 |
-
formatter="coqui",
|
| 114 |
-
dataset_name="mupe",
|
| 115 |
-
meta_file_train="metadata_coqui_brce.csv",
|
| 116 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 117 |
-
language="brce"
|
| 118 |
-
)
|
| 119 |
-
|
| 120 |
-
brrs_train_config = BaseDatasetConfig(
|
| 121 |
-
formatter="coqui",
|
| 122 |
-
dataset_name="mupe",
|
| 123 |
-
meta_file_train="metadata_coqui_brrs.csv",
|
| 124 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 125 |
-
language="brrs"
|
| 126 |
-
)
|
| 127 |
-
|
| 128 |
-
bralemanha_train_config = BaseDatasetConfig(
|
| 129 |
-
formatter="coqui",
|
| 130 |
-
dataset_name="mupe",
|
| 131 |
-
meta_file_train="metadata_coqui_bralemanha.csv",
|
| 132 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 133 |
-
language="bralemanha"
|
| 134 |
-
)
|
| 135 |
-
|
| 136 |
-
brgo_train_config = BaseDatasetConfig(
|
| 137 |
-
formatter="coqui",
|
| 138 |
-
dataset_name="mupe",
|
| 139 |
-
meta_file_train="metadata_coqui_brgo.csv",
|
| 140 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 141 |
-
language="brgo"
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
-
bral_train_config = BaseDatasetConfig(
|
| 145 |
-
formatter="coqui",
|
| 146 |
-
dataset_name="mupe",
|
| 147 |
-
meta_file_train="metadata_coqui_bral.csv",
|
| 148 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 149 |
-
language="bral"
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
brpr_train_config = BaseDatasetConfig(
|
| 153 |
-
formatter="coqui",
|
| 154 |
-
dataset_name="mupe",
|
| 155 |
-
meta_file_train="metadata_coqui_brpr.csv",
|
| 156 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 157 |
-
language="brpr"
|
| 158 |
-
)
|
| 159 |
-
|
| 160 |
-
bres_train_config = BaseDatasetConfig(
|
| 161 |
-
formatter="coqui",
|
| 162 |
-
dataset_name="mupe",
|
| 163 |
-
meta_file_train="metadata_coqui_bres.csv",
|
| 164 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 165 |
-
language="bres"
|
| 166 |
-
)
|
| 167 |
-
|
| 168 |
-
brpi_train_config = BaseDatasetConfig(
|
| 169 |
-
formatter="coqui",
|
| 170 |
-
dataset_name="mupe",
|
| 171 |
-
meta_file_train="metadata_coqui_brpi.csv",
|
| 172 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 173 |
-
language="brpi"
|
| 174 |
-
)
|
| 175 |
-
|
| 176 |
-
# bres_train_config, brpi_train_config no files found
|
| 177 |
-
DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
### Extract speaker embeddings
|
| 181 |
-
SPEAKER_ENCODER_CHECKPOINT_PATH = (
|
| 182 |
-
"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
|
| 183 |
-
)
|
| 184 |
-
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
|
| 185 |
-
|
| 186 |
-
D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
|
| 187 |
-
|
| 188 |
-
# Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
|
| 189 |
-
for dataset_conf in DATASETS_CONFIG_LIST:
|
| 190 |
-
# Check if the embeddings weren't already computed, if not compute it
|
| 191 |
-
embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
|
| 192 |
-
if not os.path.isfile(embeddings_file):
|
| 193 |
-
print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
|
| 194 |
-
compute_embeddings(
|
| 195 |
-
SPEAKER_ENCODER_CHECKPOINT_PATH,
|
| 196 |
-
SPEAKER_ENCODER_CONFIG_PATH,
|
| 197 |
-
embeddings_file,
|
| 198 |
-
old_speakers_file=None,
|
| 199 |
-
config_dataset_path=None,
|
| 200 |
-
formatter_name=dataset_conf.formatter,
|
| 201 |
-
dataset_name=dataset_conf.dataset_name,
|
| 202 |
-
dataset_path=dataset_conf.path,
|
| 203 |
-
meta_file_train=dataset_conf.meta_file_train,
|
| 204 |
-
meta_file_val=dataset_conf.meta_file_val,
|
| 205 |
-
disable_cuda=False,
|
| 206 |
-
no_eval=False,
|
| 207 |
-
)
|
| 208 |
-
D_VECTOR_FILES.append(embeddings_file)
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
# Audio config used in training.
|
| 212 |
-
audio_config = VitsAudioConfig(
|
| 213 |
-
sample_rate=SAMPLE_RATE,
|
| 214 |
-
hop_length=256,
|
| 215 |
-
win_length=1024,
|
| 216 |
-
fft_size=1024,
|
| 217 |
-
mel_fmin=0.0,
|
| 218 |
-
mel_fmax=None,
|
| 219 |
-
num_mels=80,
|
| 220 |
-
)
|
| 221 |
-
|
| 222 |
-
# Init VITSArgs setting the arguments that are needed for the YourTTS model
|
| 223 |
-
model_args = VitsArgs(
|
| 224 |
-
spec_segment_size=62,
|
| 225 |
-
hidden_channels=192,
|
| 226 |
-
hidden_channels_ffn_text_encoder=768,
|
| 227 |
-
num_heads_text_encoder=2,
|
| 228 |
-
num_layers_text_encoder=10,
|
| 229 |
-
kernel_size_text_encoder=3,
|
| 230 |
-
dropout_p_text_encoder=0.1,
|
| 231 |
-
d_vector_file=D_VECTOR_FILES,
|
| 232 |
-
use_d_vector_file=True,
|
| 233 |
-
d_vector_dim=512,
|
| 234 |
-
speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
|
| 235 |
-
speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
|
| 236 |
-
resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
|
| 237 |
-
# Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
|
| 238 |
-
use_speaker_encoder_as_loss=False,
|
| 239 |
-
# Useful parameters to enable multilingual training
|
| 240 |
-
use_language_embedding=True,
|
| 241 |
-
embedded_language_dim=4,
|
| 242 |
-
use_adaptive_weight_text_encoder=False,
|
| 243 |
-
use_perfect_class_batch_sampler=True,
|
| 244 |
-
perfect_class_batch_sampler_key="language"
|
| 245 |
-
)
|
| 246 |
-
|
| 247 |
-
# General training config, here you can change the batch size and others useful parameters
|
| 248 |
-
config = VitsConfig(
|
| 249 |
-
output_path=OUT_PATH,
|
| 250 |
-
model_args=model_args,
|
| 251 |
-
run_name=RUN_NAME,
|
| 252 |
-
project_name="SYNTACC",
|
| 253 |
-
run_description="""
|
| 254 |
-
- YourTTS with SYNTACC text encoder
|
| 255 |
-
""",
|
| 256 |
-
dashboard_logger=DASHBOARD_LOGGER,
|
| 257 |
-
logger_uri=LOGGER_URI,
|
| 258 |
-
audio=audio_config,
|
| 259 |
-
batch_size=BATCH_SIZE,
|
| 260 |
-
batch_group_size=48,
|
| 261 |
-
eval_batch_size=BATCH_SIZE,
|
| 262 |
-
num_loader_workers=8,
|
| 263 |
-
eval_split_max_size=256,
|
| 264 |
-
print_step=50,
|
| 265 |
-
plot_step=100,
|
| 266 |
-
log_model_step=1000,
|
| 267 |
-
save_step=5000,
|
| 268 |
-
save_n_checkpoints=2,
|
| 269 |
-
save_checkpoints=True,
|
| 270 |
-
# target_loss="loss_1",
|
| 271 |
-
print_eval=False,
|
| 272 |
-
use_phonemes=False,
|
| 273 |
-
phonemizer="espeak",
|
| 274 |
-
phoneme_language="en",
|
| 275 |
-
compute_input_seq_cache=True,
|
| 276 |
-
add_blank=True,
|
| 277 |
-
text_cleaner="multilingual_cleaners",
|
| 278 |
-
characters=CharactersConfig(
|
| 279 |
-
characters_class="TTS.tts.models.vits.VitsCharacters",
|
| 280 |
-
pad="_",
|
| 281 |
-
eos="&",
|
| 282 |
-
bos="*",
|
| 283 |
-
blank=None,
|
| 284 |
-
characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
| 285 |
-
punctuations="\u2014!'(),-.:;?\u00bf ",
|
| 286 |
-
phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
| 287 |
-
is_unique=True,
|
| 288 |
-
is_sorted=True,
|
| 289 |
-
),
|
| 290 |
-
phoneme_cache_path=None,
|
| 291 |
-
precompute_num_workers=12,
|
| 292 |
-
start_by_longest=True,
|
| 293 |
-
datasets=DATASETS_CONFIG_LIST,
|
| 294 |
-
cudnn_benchmark=False,
|
| 295 |
-
max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
|
| 296 |
-
mixed_precision=False,
|
| 297 |
-
test_sentences=[
|
| 298 |
-
#GUSTAVO: apenas pessoas do treino
|
| 299 |
-
["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
|
| 300 |
-
["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
|
| 301 |
-
["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
|
| 302 |
-
["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
|
| 303 |
-
["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
|
| 304 |
-
["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
|
| 305 |
-
["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
|
| 306 |
-
["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
|
| 307 |
-
["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
|
| 308 |
-
["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
|
| 309 |
-
["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
|
| 310 |
-
["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
|
| 311 |
-
["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
|
| 312 |
-
# ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
|
| 313 |
-
# ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
|
| 314 |
-
],
|
| 315 |
-
# Enable the weighted sampler
|
| 316 |
-
use_weighted_sampler=True,
|
| 317 |
-
# Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
|
| 318 |
-
# weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
|
| 319 |
-
weighted_sampler_attrs={"language": 1.0},
|
| 320 |
-
weighted_sampler_multipliers={
|
| 321 |
-
# "speaker_name": {
|
| 322 |
-
# you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch.
|
| 323 |
-
# It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
|
| 324 |
-
# The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
|
| 325 |
-
# 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
|
| 326 |
-
# }
|
| 327 |
-
},
|
| 328 |
-
# It defines the Speaker Consistency Loss (SCL) Ξ± to 9 like the YourTTS paper
|
| 329 |
-
speaker_encoder_loss_alpha=9.0,
|
| 330 |
-
)
|
| 331 |
-
|
| 332 |
-
# Load all the datasets samples and split traning and evaluation sets
|
| 333 |
-
train_samples, eval_samples = load_tts_samples(
|
| 334 |
-
config.datasets,
|
| 335 |
-
eval_split=True,
|
| 336 |
-
eval_split_max_size=config.eval_split_max_size,
|
| 337 |
-
eval_split_size=config.eval_split_size,
|
| 338 |
-
)
|
| 339 |
-
|
| 340 |
-
# Init the model
|
| 341 |
-
model = Vits.init_from_config(config)
|
| 342 |
-
|
| 343 |
-
# Init the trainer and π
|
| 344 |
-
trainer = Trainer(
|
| 345 |
-
TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
|
| 346 |
-
config,
|
| 347 |
-
output_path=OUT_PATH,
|
| 348 |
-
model=model,
|
| 349 |
-
train_samples=train_samples,
|
| 350 |
-
eval_samples=eval_samples,
|
| 351 |
-
)
|
| 352 |
-
trainer.fit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json
DELETED
|
@@ -1,496 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"output_path": "/raid/datasets/MUPE/Experiments/runs",
|
| 3 |
-
"logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
|
| 4 |
-
"run_name": "YourTTS-Baseline-PT",
|
| 5 |
-
"project_name": "SYNTACC",
|
| 6 |
-
"run_description": "\n - YourTTS with SYNTACC text encoder\n ",
|
| 7 |
-
"print_step": 50,
|
| 8 |
-
"plot_step": 100,
|
| 9 |
-
"model_param_stats": false,
|
| 10 |
-
"wandb_entity": null,
|
| 11 |
-
"dashboard_logger": "clearml",
|
| 12 |
-
"save_on_interrupt": true,
|
| 13 |
-
"log_model_step": 1000,
|
| 14 |
-
"save_step": 5000,
|
| 15 |
-
"save_n_checkpoints": 2,
|
| 16 |
-
"save_checkpoints": true,
|
| 17 |
-
"save_all_best": false,
|
| 18 |
-
"save_best_after": 10000,
|
| 19 |
-
"target_loss": null,
|
| 20 |
-
"print_eval": false,
|
| 21 |
-
"test_delay_epochs": 0,
|
| 22 |
-
"run_eval": true,
|
| 23 |
-
"run_eval_steps": null,
|
| 24 |
-
"distributed_backend": "nccl",
|
| 25 |
-
"distributed_url": "tcp://localhost:54321",
|
| 26 |
-
"mixed_precision": false,
|
| 27 |
-
"precision": "fp16",
|
| 28 |
-
"epochs": 1000,
|
| 29 |
-
"batch_size": 26,
|
| 30 |
-
"eval_batch_size": 26,
|
| 31 |
-
"grad_clip": [
|
| 32 |
-
1000,
|
| 33 |
-
1000
|
| 34 |
-
],
|
| 35 |
-
"scheduler_after_epoch": true,
|
| 36 |
-
"lr": 0.001,
|
| 37 |
-
"optimizer": "AdamW",
|
| 38 |
-
"optimizer_params": {
|
| 39 |
-
"betas": [
|
| 40 |
-
0.8,
|
| 41 |
-
0.99
|
| 42 |
-
],
|
| 43 |
-
"eps": 1e-09,
|
| 44 |
-
"weight_decay": 0.01
|
| 45 |
-
},
|
| 46 |
-
"lr_scheduler": null,
|
| 47 |
-
"lr_scheduler_params": {},
|
| 48 |
-
"use_grad_scaler": false,
|
| 49 |
-
"allow_tf32": false,
|
| 50 |
-
"cudnn_enable": true,
|
| 51 |
-
"cudnn_deterministic": false,
|
| 52 |
-
"cudnn_benchmark": false,
|
| 53 |
-
"training_seed": 54321,
|
| 54 |
-
"model": "vits",
|
| 55 |
-
"num_loader_workers": 8,
|
| 56 |
-
"num_eval_loader_workers": 0,
|
| 57 |
-
"use_noise_augment": false,
|
| 58 |
-
"audio": {
|
| 59 |
-
"fft_size": 1024,
|
| 60 |
-
"sample_rate": 16000,
|
| 61 |
-
"win_length": 1024,
|
| 62 |
-
"hop_length": 256,
|
| 63 |
-
"num_mels": 80,
|
| 64 |
-
"mel_fmin": 0.0,
|
| 65 |
-
"mel_fmax": null
|
| 66 |
-
},
|
| 67 |
-
"use_phonemes": false,
|
| 68 |
-
"phonemizer": "espeak",
|
| 69 |
-
"phoneme_language": "en",
|
| 70 |
-
"compute_input_seq_cache": true,
|
| 71 |
-
"text_cleaner": "multilingual_cleaners",
|
| 72 |
-
"enable_eos_bos_chars": false,
|
| 73 |
-
"test_sentences_file": "",
|
| 74 |
-
"phoneme_cache_path": null,
|
| 75 |
-
"characters": {
|
| 76 |
-
"characters_class": "TTS.tts.models.vits.VitsCharacters",
|
| 77 |
-
"vocab_dict": null,
|
| 78 |
-
"pad": "_",
|
| 79 |
-
"eos": "&",
|
| 80 |
-
"bos": "*",
|
| 81 |
-
"blank": null,
|
| 82 |
-
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
| 83 |
-
"punctuations": "\u2014!'(),-.:;?\u00bf ",
|
| 84 |
-
"phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
| 85 |
-
"is_unique": true,
|
| 86 |
-
"is_sorted": true
|
| 87 |
-
},
|
| 88 |
-
"add_blank": true,
|
| 89 |
-
"batch_group_size": 48,
|
| 90 |
-
"loss_masking": null,
|
| 91 |
-
"min_audio_len": 1,
|
| 92 |
-
"max_audio_len": Infinity,
|
| 93 |
-
"min_text_len": 1,
|
| 94 |
-
"max_text_len": Infinity,
|
| 95 |
-
"compute_f0": false,
|
| 96 |
-
"compute_energy": false,
|
| 97 |
-
"compute_linear_spec": true,
|
| 98 |
-
"precompute_num_workers": 12,
|
| 99 |
-
"start_by_longest": true,
|
| 100 |
-
"shuffle": false,
|
| 101 |
-
"drop_last": false,
|
| 102 |
-
"datasets": [
|
| 103 |
-
{
|
| 104 |
-
"formatter": "coqui",
|
| 105 |
-
"dataset_name": "mupe",
|
| 106 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 107 |
-
"meta_file_train": "metadata_coqui_brpb.csv",
|
| 108 |
-
"ignored_speakers": null,
|
| 109 |
-
"language": "brpb",
|
| 110 |
-
"phonemizer": "",
|
| 111 |
-
"meta_file_val": "",
|
| 112 |
-
"meta_file_attn_mask": ""
|
| 113 |
-
},
|
| 114 |
-
{
|
| 115 |
-
"formatter": "coqui",
|
| 116 |
-
"dataset_name": "mupe",
|
| 117 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 118 |
-
"meta_file_train": "metadata_coqui_brba.csv",
|
| 119 |
-
"ignored_speakers": null,
|
| 120 |
-
"language": "brba",
|
| 121 |
-
"phonemizer": "",
|
| 122 |
-
"meta_file_val": "",
|
| 123 |
-
"meta_file_attn_mask": ""
|
| 124 |
-
},
|
| 125 |
-
{
|
| 126 |
-
"formatter": "coqui",
|
| 127 |
-
"dataset_name": "mupe",
|
| 128 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 129 |
-
"meta_file_train": "metadata_coqui_brportugal.csv",
|
| 130 |
-
"ignored_speakers": null,
|
| 131 |
-
"language": "brportugal",
|
| 132 |
-
"phonemizer": "",
|
| 133 |
-
"meta_file_val": "",
|
| 134 |
-
"meta_file_attn_mask": ""
|
| 135 |
-
},
|
| 136 |
-
{
|
| 137 |
-
"formatter": "coqui",
|
| 138 |
-
"dataset_name": "mupe",
|
| 139 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 140 |
-
"meta_file_train": "metadata_coqui_brsp.csv",
|
| 141 |
-
"ignored_speakers": null,
|
| 142 |
-
"language": "brsp",
|
| 143 |
-
"phonemizer": "",
|
| 144 |
-
"meta_file_val": "",
|
| 145 |
-
"meta_file_attn_mask": ""
|
| 146 |
-
},
|
| 147 |
-
{
|
| 148 |
-
"formatter": "coqui",
|
| 149 |
-
"dataset_name": "mupe",
|
| 150 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 151 |
-
"meta_file_train": "metadata_coqui_brpe.csv",
|
| 152 |
-
"ignored_speakers": null,
|
| 153 |
-
"language": "brpe",
|
| 154 |
-
"phonemizer": "",
|
| 155 |
-
"meta_file_val": "",
|
| 156 |
-
"meta_file_attn_mask": ""
|
| 157 |
-
},
|
| 158 |
-
{
|
| 159 |
-
"formatter": "coqui",
|
| 160 |
-
"dataset_name": "mupe",
|
| 161 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 162 |
-
"meta_file_train": "metadata_coqui_brmg.csv",
|
| 163 |
-
"ignored_speakers": null,
|
| 164 |
-
"language": "brmg",
|
| 165 |
-
"phonemizer": "",
|
| 166 |
-
"meta_file_val": "",
|
| 167 |
-
"meta_file_attn_mask": ""
|
| 168 |
-
},
|
| 169 |
-
{
|
| 170 |
-
"formatter": "coqui",
|
| 171 |
-
"dataset_name": "mupe",
|
| 172 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 173 |
-
"meta_file_train": "metadata_coqui_brrj.csv",
|
| 174 |
-
"ignored_speakers": null,
|
| 175 |
-
"language": "brrj",
|
| 176 |
-
"phonemizer": "",
|
| 177 |
-
"meta_file_val": "",
|
| 178 |
-
"meta_file_attn_mask": ""
|
| 179 |
-
},
|
| 180 |
-
{
|
| 181 |
-
"formatter": "coqui",
|
| 182 |
-
"dataset_name": "mupe",
|
| 183 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 184 |
-
"meta_file_train": "metadata_coqui_brce.csv",
|
| 185 |
-
"ignored_speakers": null,
|
| 186 |
-
"language": "brce",
|
| 187 |
-
"phonemizer": "",
|
| 188 |
-
"meta_file_val": "",
|
| 189 |
-
"meta_file_attn_mask": ""
|
| 190 |
-
},
|
| 191 |
-
{
|
| 192 |
-
"formatter": "coqui",
|
| 193 |
-
"dataset_name": "mupe",
|
| 194 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 195 |
-
"meta_file_train": "metadata_coqui_brrs.csv",
|
| 196 |
-
"ignored_speakers": null,
|
| 197 |
-
"language": "brrs",
|
| 198 |
-
"phonemizer": "",
|
| 199 |
-
"meta_file_val": "",
|
| 200 |
-
"meta_file_attn_mask": ""
|
| 201 |
-
},
|
| 202 |
-
{
|
| 203 |
-
"formatter": "coqui",
|
| 204 |
-
"dataset_name": "mupe",
|
| 205 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 206 |
-
"meta_file_train": "metadata_coqui_bralemanha.csv",
|
| 207 |
-
"ignored_speakers": null,
|
| 208 |
-
"language": "bralemanha",
|
| 209 |
-
"phonemizer": "",
|
| 210 |
-
"meta_file_val": "",
|
| 211 |
-
"meta_file_attn_mask": ""
|
| 212 |
-
},
|
| 213 |
-
{
|
| 214 |
-
"formatter": "coqui",
|
| 215 |
-
"dataset_name": "mupe",
|
| 216 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 217 |
-
"meta_file_train": "metadata_coqui_brgo.csv",
|
| 218 |
-
"ignored_speakers": null,
|
| 219 |
-
"language": "brgo",
|
| 220 |
-
"phonemizer": "",
|
| 221 |
-
"meta_file_val": "",
|
| 222 |
-
"meta_file_attn_mask": ""
|
| 223 |
-
},
|
| 224 |
-
{
|
| 225 |
-
"formatter": "coqui",
|
| 226 |
-
"dataset_name": "mupe",
|
| 227 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 228 |
-
"meta_file_train": "metadata_coqui_bral.csv",
|
| 229 |
-
"ignored_speakers": null,
|
| 230 |
-
"language": "bral",
|
| 231 |
-
"phonemizer": "",
|
| 232 |
-
"meta_file_val": "",
|
| 233 |
-
"meta_file_attn_mask": ""
|
| 234 |
-
},
|
| 235 |
-
{
|
| 236 |
-
"formatter": "coqui",
|
| 237 |
-
"dataset_name": "mupe",
|
| 238 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 239 |
-
"meta_file_train": "metadata_coqui_brpr.csv",
|
| 240 |
-
"ignored_speakers": null,
|
| 241 |
-
"language": "brpr",
|
| 242 |
-
"phonemizer": "",
|
| 243 |
-
"meta_file_val": "",
|
| 244 |
-
"meta_file_attn_mask": ""
|
| 245 |
-
}
|
| 246 |
-
],
|
| 247 |
-
"test_sentences": [
|
| 248 |
-
[
|
| 249 |
-
"Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
|
| 250 |
-
"EDILEINE_FONSECA",
|
| 251 |
-
null,
|
| 252 |
-
"brsp"
|
| 253 |
-
],
|
| 254 |
-
[
|
| 255 |
-
"Quem semeia ventos, colhe tempestades.",
|
| 256 |
-
"JOSE_PAULO_DE_ARAUJO",
|
| 257 |
-
null,
|
| 258 |
-
"brpb"
|
| 259 |
-
],
|
| 260 |
-
[
|
| 261 |
-
"O olho do dono \u00e9 que engorda o gado.",
|
| 262 |
-
"VITOR_RAFAEL_OLIVEIRA_ALVES",
|
| 263 |
-
null,
|
| 264 |
-
"brba"
|
| 265 |
-
],
|
| 266 |
-
[
|
| 267 |
-
"\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
|
| 268 |
-
"MARIA_AURORA_FELIX",
|
| 269 |
-
null,
|
| 270 |
-
"brportugal"
|
| 271 |
-
],
|
| 272 |
-
[
|
| 273 |
-
"Quem espera sempre alcan\u00e7a.",
|
| 274 |
-
"ANTONIO_DE_AMORIM_COSTA",
|
| 275 |
-
null,
|
| 276 |
-
"brpe"
|
| 277 |
-
],
|
| 278 |
-
[
|
| 279 |
-
"Cada macaco no seu galho.",
|
| 280 |
-
"ALCIDES_DE_LIMA",
|
| 281 |
-
null,
|
| 282 |
-
"brmg"
|
| 283 |
-
],
|
| 284 |
-
[
|
| 285 |
-
"Em terra de cego, quem tem um olho \u00e9 rei.",
|
| 286 |
-
"ALUISIO_SOARES_DE_SOUSA",
|
| 287 |
-
null,
|
| 288 |
-
"brrj"
|
| 289 |
-
],
|
| 290 |
-
[
|
| 291 |
-
"A ocasi\u00e3o faz o ladr\u00e3o.",
|
| 292 |
-
"FRANCISCO_JOSE_MOREIRA_MOTA",
|
| 293 |
-
null,
|
| 294 |
-
"brce"
|
| 295 |
-
],
|
| 296 |
-
[
|
| 297 |
-
"De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
|
| 298 |
-
"EVALDO_ANDRADA_CORREA",
|
| 299 |
-
null,
|
| 300 |
-
"brrs"
|
| 301 |
-
],
|
| 302 |
-
[
|
| 303 |
-
"Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
|
| 304 |
-
"DORIS_ALEXANDER",
|
| 305 |
-
null,
|
| 306 |
-
"bralemanha"
|
| 307 |
-
],
|
| 308 |
-
[
|
| 309 |
-
"Quem n\u00e3o arrisca, n\u00e3o petisca.",
|
| 310 |
-
"DONALDO_LUIZ_DE_ALMEIDA",
|
| 311 |
-
null,
|
| 312 |
-
"brgo"
|
| 313 |
-
],
|
| 314 |
-
[
|
| 315 |
-
"A uni\u00e3o faz a for\u00e7a.",
|
| 316 |
-
"GERONCIO_HENRIQUE_NETO",
|
| 317 |
-
null,
|
| 318 |
-
"bral"
|
| 319 |
-
],
|
| 320 |
-
[
|
| 321 |
-
"Em boca fechada n\u00e3o entra mosquito.",
|
| 322 |
-
"MALU_NATEL_FREIRE_WEBER",
|
| 323 |
-
null,
|
| 324 |
-
"brpr"
|
| 325 |
-
]
|
| 326 |
-
],
|
| 327 |
-
"eval_split_max_size": 256,
|
| 328 |
-
"eval_split_size": 0.01,
|
| 329 |
-
"use_speaker_weighted_sampler": false,
|
| 330 |
-
"speaker_weighted_sampler_alpha": 1.0,
|
| 331 |
-
"use_language_weighted_sampler": false,
|
| 332 |
-
"language_weighted_sampler_alpha": 1.0,
|
| 333 |
-
"use_length_weighted_sampler": false,
|
| 334 |
-
"length_weighted_sampler_alpha": 1.0,
|
| 335 |
-
"model_args": {
|
| 336 |
-
"num_chars": 266,
|
| 337 |
-
"out_channels": 513,
|
| 338 |
-
"spec_segment_size": 62,
|
| 339 |
-
"hidden_channels": 192,
|
| 340 |
-
"use_adaptive_weight_text_encoder": false,
|
| 341 |
-
"use_perfect_class_batch_sampler": true,
|
| 342 |
-
"perfect_class_batch_sampler_key": "language",
|
| 343 |
-
"hidden_channels_ffn_text_encoder": 768,
|
| 344 |
-
"num_heads_text_encoder": 2,
|
| 345 |
-
"num_layers_text_encoder": 10,
|
| 346 |
-
"kernel_size_text_encoder": 3,
|
| 347 |
-
"dropout_p_text_encoder": 0.1,
|
| 348 |
-
"dropout_p_duration_predictor": 0.5,
|
| 349 |
-
"kernel_size_posterior_encoder": 5,
|
| 350 |
-
"dilation_rate_posterior_encoder": 1,
|
| 351 |
-
"num_layers_posterior_encoder": 16,
|
| 352 |
-
"kernel_size_flow": 5,
|
| 353 |
-
"dilation_rate_flow": 1,
|
| 354 |
-
"num_layers_flow": 4,
|
| 355 |
-
"resblock_type_decoder": "2",
|
| 356 |
-
"resblock_kernel_sizes_decoder": [
|
| 357 |
-
3,
|
| 358 |
-
7,
|
| 359 |
-
11
|
| 360 |
-
],
|
| 361 |
-
"resblock_dilation_sizes_decoder": [
|
| 362 |
-
[
|
| 363 |
-
1,
|
| 364 |
-
3,
|
| 365 |
-
5
|
| 366 |
-
],
|
| 367 |
-
[
|
| 368 |
-
1,
|
| 369 |
-
3,
|
| 370 |
-
5
|
| 371 |
-
],
|
| 372 |
-
[
|
| 373 |
-
1,
|
| 374 |
-
3,
|
| 375 |
-
5
|
| 376 |
-
]
|
| 377 |
-
],
|
| 378 |
-
"upsample_rates_decoder": [
|
| 379 |
-
8,
|
| 380 |
-
8,
|
| 381 |
-
2,
|
| 382 |
-
2
|
| 383 |
-
],
|
| 384 |
-
"upsample_initial_channel_decoder": 512,
|
| 385 |
-
"upsample_kernel_sizes_decoder": [
|
| 386 |
-
16,
|
| 387 |
-
16,
|
| 388 |
-
4,
|
| 389 |
-
4
|
| 390 |
-
],
|
| 391 |
-
"periods_multi_period_discriminator": [
|
| 392 |
-
2,
|
| 393 |
-
3,
|
| 394 |
-
5,
|
| 395 |
-
7,
|
| 396 |
-
11
|
| 397 |
-
],
|
| 398 |
-
"use_sdp": true,
|
| 399 |
-
"noise_scale": 1.0,
|
| 400 |
-
"inference_noise_scale": 0.667,
|
| 401 |
-
"length_scale": 1,
|
| 402 |
-
"noise_scale_dp": 1.0,
|
| 403 |
-
"inference_noise_scale_dp": 1.0,
|
| 404 |
-
"max_inference_len": null,
|
| 405 |
-
"init_discriminator": true,
|
| 406 |
-
"use_spectral_norm_disriminator": false,
|
| 407 |
-
"use_speaker_embedding": false,
|
| 408 |
-
"num_speakers": 0,
|
| 409 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
|
| 410 |
-
"d_vector_file": [
|
| 411 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
| 412 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
| 413 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
| 414 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
| 415 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
| 416 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
| 417 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
| 418 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
| 419 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
| 420 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
| 421 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
| 422 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
| 423 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
| 424 |
-
],
|
| 425 |
-
"speaker_embedding_channels": 256,
|
| 426 |
-
"use_d_vector_file": true,
|
| 427 |
-
"d_vector_dim": 512,
|
| 428 |
-
"detach_dp_input": true,
|
| 429 |
-
"use_language_embedding": true,
|
| 430 |
-
"embedded_language_dim": 4,
|
| 431 |
-
"num_languages": 0,
|
| 432 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
|
| 433 |
-
"use_speaker_encoder_as_loss": false,
|
| 434 |
-
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
| 435 |
-
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
| 436 |
-
"condition_dp_on_speaker": true,
|
| 437 |
-
"freeze_encoder": false,
|
| 438 |
-
"freeze_DP": false,
|
| 439 |
-
"freeze_PE": false,
|
| 440 |
-
"freeze_flow_decoder": false,
|
| 441 |
-
"freeze_waveform_decoder": false,
|
| 442 |
-
"encoder_sample_rate": null,
|
| 443 |
-
"interpolate_z": true,
|
| 444 |
-
"reinit_DP": false,
|
| 445 |
-
"reinit_text_encoder": false
|
| 446 |
-
},
|
| 447 |
-
"lr_gen": 0.0002,
|
| 448 |
-
"lr_disc": 0.0002,
|
| 449 |
-
"lr_scheduler_gen": "ExponentialLR",
|
| 450 |
-
"lr_scheduler_gen_params": {
|
| 451 |
-
"gamma": 0.999875,
|
| 452 |
-
"last_epoch": -1
|
| 453 |
-
},
|
| 454 |
-
"lr_scheduler_disc": "ExponentialLR",
|
| 455 |
-
"lr_scheduler_disc_params": {
|
| 456 |
-
"gamma": 0.999875,
|
| 457 |
-
"last_epoch": -1
|
| 458 |
-
},
|
| 459 |
-
"kl_loss_alpha": 1.0,
|
| 460 |
-
"disc_loss_alpha": 1.0,
|
| 461 |
-
"gen_loss_alpha": 1.0,
|
| 462 |
-
"feat_loss_alpha": 1.0,
|
| 463 |
-
"mel_loss_alpha": 45.0,
|
| 464 |
-
"dur_loss_alpha": 1.0,
|
| 465 |
-
"speaker_encoder_loss_alpha": 9.0,
|
| 466 |
-
"return_wav": true,
|
| 467 |
-
"use_weighted_sampler": true,
|
| 468 |
-
"weighted_sampler_attrs": {
|
| 469 |
-
"language": 1.0
|
| 470 |
-
},
|
| 471 |
-
"weighted_sampler_multipliers": {},
|
| 472 |
-
"r": 1,
|
| 473 |
-
"num_speakers": 0,
|
| 474 |
-
"use_speaker_embedding": false,
|
| 475 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
|
| 476 |
-
"speaker_embedding_channels": 256,
|
| 477 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
|
| 478 |
-
"use_language_embedding": true,
|
| 479 |
-
"use_d_vector_file": true,
|
| 480 |
-
"d_vector_file": [
|
| 481 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
| 482 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
| 483 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
| 484 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
| 485 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
| 486 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
| 487 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
| 488 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
| 489 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
| 490 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
| 491 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
| 492 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
| 493 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
| 494 |
-
],
|
| 495 |
-
"d_vector_dim": 512
|
| 496 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"bral": 0,
|
| 3 |
-
"bralemanha": 1,
|
| 4 |
-
"brba": 2,
|
| 5 |
-
"brce": 3,
|
| 6 |
-
"brgo": 4,
|
| 7 |
-
"brmg": 5,
|
| 8 |
-
"brpb": 6,
|
| 9 |
-
"brpe": 7,
|
| 10 |
-
"brportugal": 8,
|
| 11 |
-
"brpr": 9,
|
| 12 |
-
"brrj": 10,
|
| 13 |
-
"brrs": 11,
|
| 14 |
-
"brsp": 12
|
| 15 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
|
| 3 |
-
size 3296
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py
DELETED
|
@@ -1,352 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
|
| 3 |
-
import torch
|
| 4 |
-
from trainer import Trainer, TrainerArgs
|
| 5 |
-
|
| 6 |
-
from TTS.bin.compute_embeddings import compute_embeddings
|
| 7 |
-
from TTS.bin.resample import resample_files
|
| 8 |
-
from TTS.config.shared_configs import BaseDatasetConfig
|
| 9 |
-
from TTS.tts.configs.vits_config import VitsConfig
|
| 10 |
-
from TTS.tts.datasets import load_tts_samples
|
| 11 |
-
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
|
| 12 |
-
from TTS.utils.downloaders import download_libri_tts
|
| 13 |
-
from torch.utils.data import DataLoader
|
| 14 |
-
from TTS.utils.samplers import PerfectBatchSampler
|
| 15 |
-
torch.set_num_threads(24)
|
| 16 |
-
|
| 17 |
-
# pylint: disable=W0105
|
| 18 |
-
"""
|
| 19 |
-
This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
|
| 20 |
-
YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
|
| 21 |
-
"""
|
| 22 |
-
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
|
| 23 |
-
|
| 24 |
-
# Name of the run for the Trainer
|
| 25 |
-
RUN_NAME = "YourTTS-Baseline-PT"
|
| 26 |
-
|
| 27 |
-
# Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
|
| 28 |
-
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
| 29 |
-
|
| 30 |
-
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 31 |
-
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 32 |
-
|
| 33 |
-
# This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
|
| 34 |
-
SKIP_TRAIN_EPOCH = False
|
| 35 |
-
|
| 36 |
-
# Set here the batch size to be used in training and evaluation
|
| 37 |
-
BATCH_SIZE = 26
|
| 38 |
-
|
| 39 |
-
# Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
|
| 40 |
-
# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
|
| 41 |
-
SAMPLE_RATE = 16000
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
DASHBOARD_LOGGER="tensorboard"
|
| 45 |
-
LOGGER_URI = None
|
| 46 |
-
|
| 47 |
-
DASHBOARD_LOGGER = "clearml"
|
| 48 |
-
LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
# Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
|
| 53 |
-
MAX_AUDIO_LEN_IN_SECONDS = float("inf")
|
| 54 |
-
|
| 55 |
-
# Define here the datasets config
|
| 56 |
-
brpb_train_config = BaseDatasetConfig(
|
| 57 |
-
formatter="coqui",
|
| 58 |
-
dataset_name="mupe",
|
| 59 |
-
meta_file_train="metadata_coqui_brpb.csv",
|
| 60 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 61 |
-
language="brpb"
|
| 62 |
-
)
|
| 63 |
-
|
| 64 |
-
brba_train_config = BaseDatasetConfig(
|
| 65 |
-
formatter="coqui",
|
| 66 |
-
dataset_name="mupe",
|
| 67 |
-
meta_file_train="metadata_coqui_brba.csv",
|
| 68 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 69 |
-
language="brba"
|
| 70 |
-
)
|
| 71 |
-
|
| 72 |
-
brportugal_train_config = BaseDatasetConfig(
|
| 73 |
-
formatter="coqui",
|
| 74 |
-
dataset_name="mupe",
|
| 75 |
-
meta_file_train="metadata_coqui_brportugal.csv",
|
| 76 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 77 |
-
language="brportugal"
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
brsp_train_config = BaseDatasetConfig(
|
| 81 |
-
formatter="coqui",
|
| 82 |
-
dataset_name="mupe",
|
| 83 |
-
meta_file_train="metadata_coqui_brsp.csv",
|
| 84 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 85 |
-
language="brsp"
|
| 86 |
-
)
|
| 87 |
-
|
| 88 |
-
brpe_train_config = BaseDatasetConfig(
|
| 89 |
-
formatter="coqui",
|
| 90 |
-
dataset_name="mupe",
|
| 91 |
-
meta_file_train="metadata_coqui_brpe.csv",
|
| 92 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 93 |
-
language="brpe"
|
| 94 |
-
)
|
| 95 |
-
|
| 96 |
-
brmg_train_config = BaseDatasetConfig(
|
| 97 |
-
formatter="coqui",
|
| 98 |
-
dataset_name="mupe",
|
| 99 |
-
meta_file_train="metadata_coqui_brmg.csv",
|
| 100 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 101 |
-
language="brmg"
|
| 102 |
-
)
|
| 103 |
-
|
| 104 |
-
brrj_train_config = BaseDatasetConfig(
|
| 105 |
-
formatter="coqui",
|
| 106 |
-
dataset_name="mupe",
|
| 107 |
-
meta_file_train="metadata_coqui_brrj.csv",
|
| 108 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 109 |
-
language="brrj"
|
| 110 |
-
)
|
| 111 |
-
|
| 112 |
-
brce_train_config = BaseDatasetConfig(
|
| 113 |
-
formatter="coqui",
|
| 114 |
-
dataset_name="mupe",
|
| 115 |
-
meta_file_train="metadata_coqui_brce.csv",
|
| 116 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 117 |
-
language="brce"
|
| 118 |
-
)
|
| 119 |
-
|
| 120 |
-
brrs_train_config = BaseDatasetConfig(
|
| 121 |
-
formatter="coqui",
|
| 122 |
-
dataset_name="mupe",
|
| 123 |
-
meta_file_train="metadata_coqui_brrs.csv",
|
| 124 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 125 |
-
language="brrs"
|
| 126 |
-
)
|
| 127 |
-
|
| 128 |
-
bralemanha_train_config = BaseDatasetConfig(
|
| 129 |
-
formatter="coqui",
|
| 130 |
-
dataset_name="mupe",
|
| 131 |
-
meta_file_train="metadata_coqui_bralemanha.csv",
|
| 132 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 133 |
-
language="bralemanha"
|
| 134 |
-
)
|
| 135 |
-
|
| 136 |
-
brgo_train_config = BaseDatasetConfig(
|
| 137 |
-
formatter="coqui",
|
| 138 |
-
dataset_name="mupe",
|
| 139 |
-
meta_file_train="metadata_coqui_brgo.csv",
|
| 140 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 141 |
-
language="brgo"
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
-
bral_train_config = BaseDatasetConfig(
|
| 145 |
-
formatter="coqui",
|
| 146 |
-
dataset_name="mupe",
|
| 147 |
-
meta_file_train="metadata_coqui_bral.csv",
|
| 148 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 149 |
-
language="bral"
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
brpr_train_config = BaseDatasetConfig(
|
| 153 |
-
formatter="coqui",
|
| 154 |
-
dataset_name="mupe",
|
| 155 |
-
meta_file_train="metadata_coqui_brpr.csv",
|
| 156 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 157 |
-
language="brpr"
|
| 158 |
-
)
|
| 159 |
-
|
| 160 |
-
bres_train_config = BaseDatasetConfig(
|
| 161 |
-
formatter="coqui",
|
| 162 |
-
dataset_name="mupe",
|
| 163 |
-
meta_file_train="metadata_coqui_bres.csv",
|
| 164 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 165 |
-
language="bres"
|
| 166 |
-
)
|
| 167 |
-
|
| 168 |
-
brpi_train_config = BaseDatasetConfig(
|
| 169 |
-
formatter="coqui",
|
| 170 |
-
dataset_name="mupe",
|
| 171 |
-
meta_file_train="metadata_coqui_brpi.csv",
|
| 172 |
-
path="/raid/datasets/MUPE/dataset/mupe/",
|
| 173 |
-
language="brpi"
|
| 174 |
-
)
|
| 175 |
-
|
| 176 |
-
# bres_train_config, brpi_train_config no files found
|
| 177 |
-
DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
### Extract speaker embeddings
|
| 181 |
-
SPEAKER_ENCODER_CHECKPOINT_PATH = (
|
| 182 |
-
"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
|
| 183 |
-
)
|
| 184 |
-
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
|
| 185 |
-
|
| 186 |
-
D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
|
| 187 |
-
|
| 188 |
-
# Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
|
| 189 |
-
for dataset_conf in DATASETS_CONFIG_LIST:
|
| 190 |
-
# Check if the embeddings weren't already computed, if not compute it
|
| 191 |
-
embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
|
| 192 |
-
if not os.path.isfile(embeddings_file):
|
| 193 |
-
print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
|
| 194 |
-
compute_embeddings(
|
| 195 |
-
SPEAKER_ENCODER_CHECKPOINT_PATH,
|
| 196 |
-
SPEAKER_ENCODER_CONFIG_PATH,
|
| 197 |
-
embeddings_file,
|
| 198 |
-
old_speakers_file=None,
|
| 199 |
-
config_dataset_path=None,
|
| 200 |
-
formatter_name=dataset_conf.formatter,
|
| 201 |
-
dataset_name=dataset_conf.dataset_name,
|
| 202 |
-
dataset_path=dataset_conf.path,
|
| 203 |
-
meta_file_train=dataset_conf.meta_file_train,
|
| 204 |
-
meta_file_val=dataset_conf.meta_file_val,
|
| 205 |
-
disable_cuda=False,
|
| 206 |
-
no_eval=False,
|
| 207 |
-
)
|
| 208 |
-
D_VECTOR_FILES.append(embeddings_file)
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
# Audio config used in training.
|
| 212 |
-
audio_config = VitsAudioConfig(
|
| 213 |
-
sample_rate=SAMPLE_RATE,
|
| 214 |
-
hop_length=256,
|
| 215 |
-
win_length=1024,
|
| 216 |
-
fft_size=1024,
|
| 217 |
-
mel_fmin=0.0,
|
| 218 |
-
mel_fmax=None,
|
| 219 |
-
num_mels=80,
|
| 220 |
-
)
|
| 221 |
-
|
| 222 |
-
# Init VITSArgs setting the arguments that are needed for the YourTTS model
|
| 223 |
-
model_args = VitsArgs(
|
| 224 |
-
spec_segment_size=62,
|
| 225 |
-
hidden_channels=192,
|
| 226 |
-
hidden_channels_ffn_text_encoder=768,
|
| 227 |
-
num_heads_text_encoder=2,
|
| 228 |
-
num_layers_text_encoder=10,
|
| 229 |
-
kernel_size_text_encoder=3,
|
| 230 |
-
dropout_p_text_encoder=0.1,
|
| 231 |
-
d_vector_file=D_VECTOR_FILES,
|
| 232 |
-
use_d_vector_file=True,
|
| 233 |
-
d_vector_dim=512,
|
| 234 |
-
speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
|
| 235 |
-
speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
|
| 236 |
-
resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
|
| 237 |
-
# Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
|
| 238 |
-
use_speaker_encoder_as_loss=False,
|
| 239 |
-
# Useful parameters to enable multilingual training
|
| 240 |
-
use_language_embedding=True,
|
| 241 |
-
embedded_language_dim=4,
|
| 242 |
-
use_adaptive_weight_text_encoder=False,
|
| 243 |
-
use_perfect_class_batch_sampler=True,
|
| 244 |
-
perfect_class_batch_sampler_key="language"
|
| 245 |
-
)
|
| 246 |
-
|
| 247 |
-
# General training config; here you can change the batch size and other useful parameters
|
| 248 |
-
config = VitsConfig(
|
| 249 |
-
output_path=OUT_PATH,
|
| 250 |
-
model_args=model_args,
|
| 251 |
-
run_name=RUN_NAME,
|
| 252 |
-
project_name="SYNTACC",
|
| 253 |
-
run_description="""
|
| 254 |
-
- YourTTS with SYNTACC text encoder
|
| 255 |
-
""",
|
| 256 |
-
dashboard_logger=DASHBOARD_LOGGER,
|
| 257 |
-
logger_uri=LOGGER_URI,
|
| 258 |
-
audio=audio_config,
|
| 259 |
-
batch_size=BATCH_SIZE,
|
| 260 |
-
batch_group_size=48,
|
| 261 |
-
eval_batch_size=BATCH_SIZE,
|
| 262 |
-
num_loader_workers=8,
|
| 263 |
-
eval_split_max_size=256,
|
| 264 |
-
print_step=50,
|
| 265 |
-
plot_step=100,
|
| 266 |
-
log_model_step=1000,
|
| 267 |
-
save_step=5000,
|
| 268 |
-
save_n_checkpoints=2,
|
| 269 |
-
save_checkpoints=True,
|
| 270 |
-
# target_loss="loss_1",
|
| 271 |
-
print_eval=False,
|
| 272 |
-
use_phonemes=False,
|
| 273 |
-
phonemizer="espeak",
|
| 274 |
-
phoneme_language="en",
|
| 275 |
-
compute_input_seq_cache=True,
|
| 276 |
-
add_blank=True,
|
| 277 |
-
text_cleaner="multilingual_cleaners",
|
| 278 |
-
characters=CharactersConfig(
|
| 279 |
-
characters_class="TTS.tts.models.vits.VitsCharacters",
|
| 280 |
-
pad="_",
|
| 281 |
-
eos="&",
|
| 282 |
-
bos="*",
|
| 283 |
-
blank=None,
|
| 284 |
-
characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
| 285 |
-
punctuations="\u2014!'(),-.:;?\u00bf ",
|
| 286 |
-
phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
| 287 |
-
is_unique=True,
|
| 288 |
-
is_sorted=True,
|
| 289 |
-
),
|
| 290 |
-
phoneme_cache_path=None,
|
| 291 |
-
precompute_num_workers=12,
|
| 292 |
-
start_by_longest=True,
|
| 293 |
-
datasets=DATASETS_CONFIG_LIST,
|
| 294 |
-
cudnn_benchmark=False,
|
| 295 |
-
max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
|
| 296 |
-
mixed_precision=False,
|
| 297 |
-
test_sentences=[
|
| 298 |
-
#GUSTAVO: apenas pessoas do treino
|
| 299 |
-
["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
|
| 300 |
-
["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
|
| 301 |
-
["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
|
| 302 |
-
["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
|
| 303 |
-
["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
|
| 304 |
-
["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
|
| 305 |
-
["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
|
| 306 |
-
["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
|
| 307 |
-
["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
|
| 308 |
-
["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
|
| 309 |
-
["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
|
| 310 |
-
["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
|
| 311 |
-
["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
|
| 312 |
-
# ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
|
| 313 |
-
# ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
|
| 314 |
-
],
|
| 315 |
-
# Enable the weighted sampler
|
| 316 |
-
use_weighted_sampler=True,
|
| 317 |
-
# Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
|
| 318 |
-
# weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
|
| 319 |
-
weighted_sampler_attrs={"language": 1.0},
|
| 320 |
-
weighted_sampler_multipliers={
|
| 321 |
-
# "speaker_name": {
|
| 322 |
-
# You can force the batching scheme to give a higher weight to a certain speaker, and then this speaker will appear more frequently in the batch.
|
| 323 |
-
# It will speed up the speaker adaptation process. Consider the CML train dataset and "new_speaker" as the name of the speaker that you want to adapt.
|
| 324 |
-
# The line below will make the balancer consider the "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers present in the CML dataset.
|
| 325 |
-
# 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
|
| 326 |
-
# }
|
| 327 |
-
},
|
| 328 |
-
# It sets the Speaker Consistency Loss (SCL) α to 9, as in the YourTTS paper
|
| 329 |
-
speaker_encoder_loss_alpha=9.0,
|
| 330 |
-
)
|
| 331 |
-
|
| 332 |
-
# Load all the dataset samples and split them into training and evaluation sets
|
| 333 |
-
train_samples, eval_samples = load_tts_samples(
|
| 334 |
-
config.datasets,
|
| 335 |
-
eval_split=True,
|
| 336 |
-
eval_split_max_size=config.eval_split_max_size,
|
| 337 |
-
eval_split_size=config.eval_split_size,
|
| 338 |
-
)
|
| 339 |
-
|
| 340 |
-
# Init the model
|
| 341 |
-
model = Vits.init_from_config(config)
|
| 342 |
-
|
| 343 |
-
# Init the trainer and 🚀
|
| 344 |
-
trainer = Trainer(
|
| 345 |
-
TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
|
| 346 |
-
config,
|
| 347 |
-
output_path=OUT_PATH,
|
| 348 |
-
model=model,
|
| 349 |
-
train_samples=train_samples,
|
| 350 |
-
eval_samples=eval_samples,
|
| 351 |
-
)
|
| 352 |
-
trainer.fit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
|
| 3 |
-
size 1043216142
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
|
| 3 |
-
size 1043216142
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:5a584eb832a857f9a11180b34a84b81117d8690ed1e5fa39e4ff711cf6ffd7f7
|
| 3 |
-
size 1043220766
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:367ac46477805942658a7a78e8cf473409537967f9382a46249a8d11521ed3f9
|
| 3 |
-
size 1043220766
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json
DELETED
|
@@ -1,496 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"output_path": "/raid/datasets/MUPE/Experiments/runs",
|
| 3 |
-
"logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
|
| 4 |
-
"run_name": "YourTTS-Baseline-PT",
|
| 5 |
-
"project_name": "SYNTACC",
|
| 6 |
-
"run_description": "\n - YourTTS with SYNTACC text encoder\n ",
|
| 7 |
-
"print_step": 50,
|
| 8 |
-
"plot_step": 100,
|
| 9 |
-
"model_param_stats": false,
|
| 10 |
-
"wandb_entity": null,
|
| 11 |
-
"dashboard_logger": "clearml",
|
| 12 |
-
"save_on_interrupt": true,
|
| 13 |
-
"log_model_step": 1000,
|
| 14 |
-
"save_step": 5000,
|
| 15 |
-
"save_n_checkpoints": 2,
|
| 16 |
-
"save_checkpoints": true,
|
| 17 |
-
"save_all_best": false,
|
| 18 |
-
"save_best_after": 10000,
|
| 19 |
-
"target_loss": null,
|
| 20 |
-
"print_eval": false,
|
| 21 |
-
"test_delay_epochs": 0,
|
| 22 |
-
"run_eval": true,
|
| 23 |
-
"run_eval_steps": null,
|
| 24 |
-
"distributed_backend": "nccl",
|
| 25 |
-
"distributed_url": "tcp://localhost:54321",
|
| 26 |
-
"mixed_precision": false,
|
| 27 |
-
"precision": "fp16",
|
| 28 |
-
"epochs": 1000,
|
| 29 |
-
"batch_size": 26,
|
| 30 |
-
"eval_batch_size": 26,
|
| 31 |
-
"grad_clip": [
|
| 32 |
-
1000,
|
| 33 |
-
1000
|
| 34 |
-
],
|
| 35 |
-
"scheduler_after_epoch": true,
|
| 36 |
-
"lr": 0.001,
|
| 37 |
-
"optimizer": "AdamW",
|
| 38 |
-
"optimizer_params": {
|
| 39 |
-
"betas": [
|
| 40 |
-
0.8,
|
| 41 |
-
0.99
|
| 42 |
-
],
|
| 43 |
-
"eps": 1e-09,
|
| 44 |
-
"weight_decay": 0.01
|
| 45 |
-
},
|
| 46 |
-
"lr_scheduler": null,
|
| 47 |
-
"lr_scheduler_params": {},
|
| 48 |
-
"use_grad_scaler": false,
|
| 49 |
-
"allow_tf32": false,
|
| 50 |
-
"cudnn_enable": true,
|
| 51 |
-
"cudnn_deterministic": false,
|
| 52 |
-
"cudnn_benchmark": false,
|
| 53 |
-
"training_seed": 54321,
|
| 54 |
-
"model": "vits",
|
| 55 |
-
"num_loader_workers": 8,
|
| 56 |
-
"num_eval_loader_workers": 0,
|
| 57 |
-
"use_noise_augment": false,
|
| 58 |
-
"audio": {
|
| 59 |
-
"fft_size": 1024,
|
| 60 |
-
"sample_rate": 16000,
|
| 61 |
-
"win_length": 1024,
|
| 62 |
-
"hop_length": 256,
|
| 63 |
-
"num_mels": 80,
|
| 64 |
-
"mel_fmin": 0.0,
|
| 65 |
-
"mel_fmax": null
|
| 66 |
-
},
|
| 67 |
-
"use_phonemes": false,
|
| 68 |
-
"phonemizer": "espeak",
|
| 69 |
-
"phoneme_language": "en",
|
| 70 |
-
"compute_input_seq_cache": true,
|
| 71 |
-
"text_cleaner": "multilingual_cleaners",
|
| 72 |
-
"enable_eos_bos_chars": false,
|
| 73 |
-
"test_sentences_file": "",
|
| 74 |
-
"phoneme_cache_path": null,
|
| 75 |
-
"characters": {
|
| 76 |
-
"characters_class": "TTS.tts.models.vits.VitsCharacters",
|
| 77 |
-
"vocab_dict": null,
|
| 78 |
-
"pad": "_",
|
| 79 |
-
"eos": "&",
|
| 80 |
-
"bos": "*",
|
| 81 |
-
"blank": null,
|
| 82 |
-
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
| 83 |
-
"punctuations": "\u2014!'(),-.:;?\u00bf ",
|
| 84 |
-
"phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
| 85 |
-
"is_unique": true,
|
| 86 |
-
"is_sorted": true
|
| 87 |
-
},
|
| 88 |
-
"add_blank": true,
|
| 89 |
-
"batch_group_size": 48,
|
| 90 |
-
"loss_masking": null,
|
| 91 |
-
"min_audio_len": 1,
|
| 92 |
-
"max_audio_len": Infinity,
|
| 93 |
-
"min_text_len": 1,
|
| 94 |
-
"max_text_len": Infinity,
|
| 95 |
-
"compute_f0": false,
|
| 96 |
-
"compute_energy": false,
|
| 97 |
-
"compute_linear_spec": true,
|
| 98 |
-
"precompute_num_workers": 12,
|
| 99 |
-
"start_by_longest": true,
|
| 100 |
-
"shuffle": false,
|
| 101 |
-
"drop_last": false,
|
| 102 |
-
"datasets": [
|
| 103 |
-
{
|
| 104 |
-
"formatter": "coqui",
|
| 105 |
-
"dataset_name": "mupe",
|
| 106 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 107 |
-
"meta_file_train": "metadata_coqui_brpb.csv",
|
| 108 |
-
"ignored_speakers": null,
|
| 109 |
-
"language": "brpb",
|
| 110 |
-
"phonemizer": "",
|
| 111 |
-
"meta_file_val": "",
|
| 112 |
-
"meta_file_attn_mask": ""
|
| 113 |
-
},
|
| 114 |
-
{
|
| 115 |
-
"formatter": "coqui",
|
| 116 |
-
"dataset_name": "mupe",
|
| 117 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 118 |
-
"meta_file_train": "metadata_coqui_brba.csv",
|
| 119 |
-
"ignored_speakers": null,
|
| 120 |
-
"language": "brba",
|
| 121 |
-
"phonemizer": "",
|
| 122 |
-
"meta_file_val": "",
|
| 123 |
-
"meta_file_attn_mask": ""
|
| 124 |
-
},
|
| 125 |
-
{
|
| 126 |
-
"formatter": "coqui",
|
| 127 |
-
"dataset_name": "mupe",
|
| 128 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 129 |
-
"meta_file_train": "metadata_coqui_brportugal.csv",
|
| 130 |
-
"ignored_speakers": null,
|
| 131 |
-
"language": "brportugal",
|
| 132 |
-
"phonemizer": "",
|
| 133 |
-
"meta_file_val": "",
|
| 134 |
-
"meta_file_attn_mask": ""
|
| 135 |
-
},
|
| 136 |
-
{
|
| 137 |
-
"formatter": "coqui",
|
| 138 |
-
"dataset_name": "mupe",
|
| 139 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 140 |
-
"meta_file_train": "metadata_coqui_brsp.csv",
|
| 141 |
-
"ignored_speakers": null,
|
| 142 |
-
"language": "brsp",
|
| 143 |
-
"phonemizer": "",
|
| 144 |
-
"meta_file_val": "",
|
| 145 |
-
"meta_file_attn_mask": ""
|
| 146 |
-
},
|
| 147 |
-
{
|
| 148 |
-
"formatter": "coqui",
|
| 149 |
-
"dataset_name": "mupe",
|
| 150 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 151 |
-
"meta_file_train": "metadata_coqui_brpe.csv",
|
| 152 |
-
"ignored_speakers": null,
|
| 153 |
-
"language": "brpe",
|
| 154 |
-
"phonemizer": "",
|
| 155 |
-
"meta_file_val": "",
|
| 156 |
-
"meta_file_attn_mask": ""
|
| 157 |
-
},
|
| 158 |
-
{
|
| 159 |
-
"formatter": "coqui",
|
| 160 |
-
"dataset_name": "mupe",
|
| 161 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 162 |
-
"meta_file_train": "metadata_coqui_brmg.csv",
|
| 163 |
-
"ignored_speakers": null,
|
| 164 |
-
"language": "brmg",
|
| 165 |
-
"phonemizer": "",
|
| 166 |
-
"meta_file_val": "",
|
| 167 |
-
"meta_file_attn_mask": ""
|
| 168 |
-
},
|
| 169 |
-
{
|
| 170 |
-
"formatter": "coqui",
|
| 171 |
-
"dataset_name": "mupe",
|
| 172 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 173 |
-
"meta_file_train": "metadata_coqui_brrj.csv",
|
| 174 |
-
"ignored_speakers": null,
|
| 175 |
-
"language": "brrj",
|
| 176 |
-
"phonemizer": "",
|
| 177 |
-
"meta_file_val": "",
|
| 178 |
-
"meta_file_attn_mask": ""
|
| 179 |
-
},
|
| 180 |
-
{
|
| 181 |
-
"formatter": "coqui",
|
| 182 |
-
"dataset_name": "mupe",
|
| 183 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 184 |
-
"meta_file_train": "metadata_coqui_brce.csv",
|
| 185 |
-
"ignored_speakers": null,
|
| 186 |
-
"language": "brce",
|
| 187 |
-
"phonemizer": "",
|
| 188 |
-
"meta_file_val": "",
|
| 189 |
-
"meta_file_attn_mask": ""
|
| 190 |
-
},
|
| 191 |
-
{
|
| 192 |
-
"formatter": "coqui",
|
| 193 |
-
"dataset_name": "mupe",
|
| 194 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 195 |
-
"meta_file_train": "metadata_coqui_brrs.csv",
|
| 196 |
-
"ignored_speakers": null,
|
| 197 |
-
"language": "brrs",
|
| 198 |
-
"phonemizer": "",
|
| 199 |
-
"meta_file_val": "",
|
| 200 |
-
"meta_file_attn_mask": ""
|
| 201 |
-
},
|
| 202 |
-
{
|
| 203 |
-
"formatter": "coqui",
|
| 204 |
-
"dataset_name": "mupe",
|
| 205 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 206 |
-
"meta_file_train": "metadata_coqui_bralemanha.csv",
|
| 207 |
-
"ignored_speakers": null,
|
| 208 |
-
"language": "bralemanha",
|
| 209 |
-
"phonemizer": "",
|
| 210 |
-
"meta_file_val": "",
|
| 211 |
-
"meta_file_attn_mask": ""
|
| 212 |
-
},
|
| 213 |
-
{
|
| 214 |
-
"formatter": "coqui",
|
| 215 |
-
"dataset_name": "mupe",
|
| 216 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 217 |
-
"meta_file_train": "metadata_coqui_brgo.csv",
|
| 218 |
-
"ignored_speakers": null,
|
| 219 |
-
"language": "brgo",
|
| 220 |
-
"phonemizer": "",
|
| 221 |
-
"meta_file_val": "",
|
| 222 |
-
"meta_file_attn_mask": ""
|
| 223 |
-
},
|
| 224 |
-
{
|
| 225 |
-
"formatter": "coqui",
|
| 226 |
-
"dataset_name": "mupe",
|
| 227 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 228 |
-
"meta_file_train": "metadata_coqui_bral.csv",
|
| 229 |
-
"ignored_speakers": null,
|
| 230 |
-
"language": "bral",
|
| 231 |
-
"phonemizer": "",
|
| 232 |
-
"meta_file_val": "",
|
| 233 |
-
"meta_file_attn_mask": ""
|
| 234 |
-
},
|
| 235 |
-
{
|
| 236 |
-
"formatter": "coqui",
|
| 237 |
-
"dataset_name": "mupe",
|
| 238 |
-
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
| 239 |
-
"meta_file_train": "metadata_coqui_brpr.csv",
|
| 240 |
-
"ignored_speakers": null,
|
| 241 |
-
"language": "brpr",
|
| 242 |
-
"phonemizer": "",
|
| 243 |
-
"meta_file_val": "",
|
| 244 |
-
"meta_file_attn_mask": ""
|
| 245 |
-
}
|
| 246 |
-
],
|
| 247 |
-
"test_sentences": [
|
| 248 |
-
[
|
| 249 |
-
"Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
|
| 250 |
-
"EDILEINE_FONSECA",
|
| 251 |
-
null,
|
| 252 |
-
"brsp"
|
| 253 |
-
],
|
| 254 |
-
[
|
| 255 |
-
"Quem semeia ventos, colhe tempestades.",
|
| 256 |
-
"JOSE_PAULO_DE_ARAUJO",
|
| 257 |
-
null,
|
| 258 |
-
"brpb"
|
| 259 |
-
],
|
| 260 |
-
[
|
| 261 |
-
"O olho do dono \u00e9 que engorda o gado.",
|
| 262 |
-
"VITOR_RAFAEL_OLIVEIRA_ALVES",
|
| 263 |
-
null,
|
| 264 |
-
"brba"
|
| 265 |
-
],
|
| 266 |
-
[
|
| 267 |
-
"\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
|
| 268 |
-
"MARIA_AURORA_FELIX",
|
| 269 |
-
null,
|
| 270 |
-
"brportugal"
|
| 271 |
-
],
|
| 272 |
-
[
|
| 273 |
-
"Quem espera sempre alcan\u00e7a.",
|
| 274 |
-
"ANTONIO_DE_AMORIM_COSTA",
|
| 275 |
-
null,
|
| 276 |
-
"brpe"
|
| 277 |
-
],
|
| 278 |
-
[
|
| 279 |
-
"Cada macaco no seu galho.",
|
| 280 |
-
"ALCIDES_DE_LIMA",
|
| 281 |
-
null,
|
| 282 |
-
"brmg"
|
| 283 |
-
],
|
| 284 |
-
[
|
| 285 |
-
"Em terra de cego, quem tem um olho \u00e9 rei.",
|
| 286 |
-
"ALUISIO_SOARES_DE_SOUSA",
|
| 287 |
-
null,
|
| 288 |
-
"brrj"
|
| 289 |
-
],
|
| 290 |
-
[
|
| 291 |
-
"A ocasi\u00e3o faz o ladr\u00e3o.",
|
| 292 |
-
"FRANCISCO_JOSE_MOREIRA_MOTA",
|
| 293 |
-
null,
|
| 294 |
-
"brce"
|
| 295 |
-
],
|
| 296 |
-
[
|
| 297 |
-
"De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
|
| 298 |
-
"EVALDO_ANDRADA_CORREA",
|
| 299 |
-
null,
|
| 300 |
-
"brrs"
|
| 301 |
-
],
|
| 302 |
-
[
|
| 303 |
-
"Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
|
| 304 |
-
"DORIS_ALEXANDER",
|
| 305 |
-
null,
|
| 306 |
-
"bralemanha"
|
| 307 |
-
],
|
| 308 |
-
[
|
| 309 |
-
"Quem n\u00e3o arrisca, n\u00e3o petisca.",
|
| 310 |
-
"DONALDO_LUIZ_DE_ALMEIDA",
|
| 311 |
-
null,
|
| 312 |
-
"brgo"
|
| 313 |
-
],
|
| 314 |
-
[
|
| 315 |
-
"A uni\u00e3o faz a for\u00e7a.",
|
| 316 |
-
"GERONCIO_HENRIQUE_NETO",
|
| 317 |
-
null,
|
| 318 |
-
"bral"
|
| 319 |
-
],
|
| 320 |
-
[
|
| 321 |
-
"Em boca fechada n\u00e3o entra mosquito.",
|
| 322 |
-
"MALU_NATEL_FREIRE_WEBER",
|
| 323 |
-
null,
|
| 324 |
-
"brpr"
|
| 325 |
-
]
|
| 326 |
-
],
|
| 327 |
-
"eval_split_max_size": 256,
|
| 328 |
-
"eval_split_size": 0.01,
|
| 329 |
-
"use_speaker_weighted_sampler": false,
|
| 330 |
-
"speaker_weighted_sampler_alpha": 1.0,
|
| 331 |
-
"use_language_weighted_sampler": false,
|
| 332 |
-
"language_weighted_sampler_alpha": 1.0,
|
| 333 |
-
"use_length_weighted_sampler": false,
|
| 334 |
-
"length_weighted_sampler_alpha": 1.0,
|
| 335 |
-
"model_args": {
|
| 336 |
-
"num_chars": 266,
|
| 337 |
-
"out_channels": 513,
|
| 338 |
-
"spec_segment_size": 62,
|
| 339 |
-
"hidden_channels": 192,
|
| 340 |
-
"use_adaptive_weight_text_encoder": false,
|
| 341 |
-
"use_perfect_class_batch_sampler": true,
|
| 342 |
-
"perfect_class_batch_sampler_key": "language",
|
| 343 |
-
"hidden_channels_ffn_text_encoder": 768,
|
| 344 |
-
"num_heads_text_encoder": 2,
|
| 345 |
-
"num_layers_text_encoder": 10,
|
| 346 |
-
"kernel_size_text_encoder": 3,
|
| 347 |
-
"dropout_p_text_encoder": 0.1,
|
| 348 |
-
"dropout_p_duration_predictor": 0.5,
|
| 349 |
-
"kernel_size_posterior_encoder": 5,
|
| 350 |
-
"dilation_rate_posterior_encoder": 1,
|
| 351 |
-
"num_layers_posterior_encoder": 16,
|
| 352 |
-
"kernel_size_flow": 5,
|
| 353 |
-
"dilation_rate_flow": 1,
|
| 354 |
-
"num_layers_flow": 4,
|
| 355 |
-
"resblock_type_decoder": "2",
|
| 356 |
-
"resblock_kernel_sizes_decoder": [
|
| 357 |
-
3,
|
| 358 |
-
7,
|
| 359 |
-
11
|
| 360 |
-
],
|
| 361 |
-
"resblock_dilation_sizes_decoder": [
|
| 362 |
-
[
|
| 363 |
-
1,
|
| 364 |
-
3,
|
| 365 |
-
5
|
| 366 |
-
],
|
| 367 |
-
[
|
| 368 |
-
1,
|
| 369 |
-
3,
|
| 370 |
-
5
|
| 371 |
-
],
|
| 372 |
-
[
|
| 373 |
-
1,
|
| 374 |
-
3,
|
| 375 |
-
5
|
| 376 |
-
]
|
| 377 |
-
],
|
| 378 |
-
"upsample_rates_decoder": [
|
| 379 |
-
8,
|
| 380 |
-
8,
|
| 381 |
-
2,
|
| 382 |
-
2
|
| 383 |
-
],
|
| 384 |
-
"upsample_initial_channel_decoder": 512,
|
| 385 |
-
"upsample_kernel_sizes_decoder": [
|
| 386 |
-
16,
|
| 387 |
-
16,
|
| 388 |
-
4,
|
| 389 |
-
4
|
| 390 |
-
],
|
| 391 |
-
"periods_multi_period_discriminator": [
|
| 392 |
-
2,
|
| 393 |
-
3,
|
| 394 |
-
5,
|
| 395 |
-
7,
|
| 396 |
-
11
|
| 397 |
-
],
|
| 398 |
-
"use_sdp": true,
|
| 399 |
-
"noise_scale": 1.0,
|
| 400 |
-
"inference_noise_scale": 0.667,
|
| 401 |
-
"length_scale": 1,
|
| 402 |
-
"noise_scale_dp": 1.0,
|
| 403 |
-
"inference_noise_scale_dp": 1.0,
|
| 404 |
-
"max_inference_len": null,
|
| 405 |
-
"init_discriminator": true,
|
| 406 |
-
"use_spectral_norm_disriminator": false,
|
| 407 |
-
"use_speaker_embedding": false,
|
| 408 |
-
"num_speakers": 0,
|
| 409 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
|
| 410 |
-
"d_vector_file": [
|
| 411 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
| 412 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
| 413 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
| 414 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
| 415 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
| 416 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
| 417 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
| 418 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
| 419 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
| 420 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
| 421 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
| 422 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
| 423 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
| 424 |
-
],
|
| 425 |
-
"speaker_embedding_channels": 256,
|
| 426 |
-
"use_d_vector_file": true,
|
| 427 |
-
"d_vector_dim": 512,
|
| 428 |
-
"detach_dp_input": true,
|
| 429 |
-
"use_language_embedding": true,
|
| 430 |
-
"embedded_language_dim": 4,
|
| 431 |
-
"num_languages": 0,
|
| 432 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
|
| 433 |
-
"use_speaker_encoder_as_loss": false,
|
| 434 |
-
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
| 435 |
-
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
| 436 |
-
"condition_dp_on_speaker": true,
|
| 437 |
-
"freeze_encoder": false,
|
| 438 |
-
"freeze_DP": false,
|
| 439 |
-
"freeze_PE": false,
|
| 440 |
-
"freeze_flow_decoder": false,
|
| 441 |
-
"freeze_waveform_decoder": false,
|
| 442 |
-
"encoder_sample_rate": null,
|
| 443 |
-
"interpolate_z": true,
|
| 444 |
-
"reinit_DP": false,
|
| 445 |
-
"reinit_text_encoder": false
|
| 446 |
-
},
|
| 447 |
-
"lr_gen": 0.0002,
|
| 448 |
-
"lr_disc": 0.0002,
|
| 449 |
-
"lr_scheduler_gen": "ExponentialLR",
|
| 450 |
-
"lr_scheduler_gen_params": {
|
| 451 |
-
"gamma": 0.999875,
|
| 452 |
-
"last_epoch": -1
|
| 453 |
-
},
|
| 454 |
-
"lr_scheduler_disc": "ExponentialLR",
|
| 455 |
-
"lr_scheduler_disc_params": {
|
| 456 |
-
"gamma": 0.999875,
|
| 457 |
-
"last_epoch": -1
|
| 458 |
-
},
|
| 459 |
-
"kl_loss_alpha": 1.0,
|
| 460 |
-
"disc_loss_alpha": 1.0,
|
| 461 |
-
"gen_loss_alpha": 1.0,
|
| 462 |
-
"feat_loss_alpha": 1.0,
|
| 463 |
-
"mel_loss_alpha": 45.0,
|
| 464 |
-
"dur_loss_alpha": 1.0,
|
| 465 |
-
"speaker_encoder_loss_alpha": 9.0,
|
| 466 |
-
"return_wav": true,
|
| 467 |
-
"use_weighted_sampler": true,
|
| 468 |
-
"weighted_sampler_attrs": {
|
| 469 |
-
"language": 1.0
|
| 470 |
-
},
|
| 471 |
-
"weighted_sampler_multipliers": {},
|
| 472 |
-
"r": 1,
|
| 473 |
-
"num_speakers": 0,
|
| 474 |
-
"use_speaker_embedding": false,
|
| 475 |
-
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
|
| 476 |
-
"speaker_embedding_channels": 256,
|
| 477 |
-
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
|
| 478 |
-
"use_language_embedding": true,
|
| 479 |
-
"use_d_vector_file": true,
|
| 480 |
-
"d_vector_file": [
|
| 481 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
| 482 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
| 483 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
| 484 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
| 485 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
| 486 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
| 487 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
| 488 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
| 489 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
| 490 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
| 491 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
| 492 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
| 493 |
-
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
| 494 |
-
],
|
| 495 |
-
"d_vector_dim": 512
|
| 496 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"bral": 0,
|
| 3 |
-
"bralemanha": 1,
|
| 4 |
-
"brba": 2,
|
| 5 |
-
"brce": 3,
|
| 6 |
-
"brgo": 4,
|
| 7 |
-
"brmg": 5,
|
| 8 |
-
"brpb": 6,
|
| 9 |
-
"brpe": 7,
|
| 10 |
-
"brportugal": 8,
|
| 11 |
-
"brpr": 9,
|
| 12 |
-
"brrj": 10,
|
| 13 |
-
"brrs": 11,
|
| 14 |
-
"brsp": 12
|
| 15 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
|
| 3 |
-
size 3296
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:5ddf81cb4061c7e47bd824c3ebb109cc02bc31ab79ee21e4e69d60d32aca454b
|
| 3 |
-
size 1794644
|
|
|
|
|
|
|
|
|
|
|
|
Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/{checkpoint_185000.pth β checkpoint_195000.pth}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1044066458
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c552bdeff67502deab77d3f587269e090fac00dc991bcfba8dedfa21594d471
|
| 3 |
size 1044066458
|
Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/trainer_0_log.txt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:327601981f984533599c289f977acc81f9d7479999f14235302e6ad1a171d710
|
| 3 |
+
size 3401880
|
Experiments/train_syntacc_baseline.py
CHANGED
|
@@ -28,7 +28,7 @@ RUN_NAME = "YourTTS-Baseline-PT"
|
|
| 28 |
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
| 29 |
|
| 30 |
# If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 31 |
-
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-
|
| 32 |
|
| 33 |
# This parameter is useful for debugging: it skips the training epochs and just does the evaluation and produces the test sentences
|
| 34 |
SKIP_TRAIN_EPOCH = False
|
|
|
|
| 28 |
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
| 29 |
|
| 30 |
# If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 31 |
+
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT_continue-January-28-2024_02+26PM-8a499b88c/checkpoint_195000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
| 32 |
|
| 33 |
# This parameter is useful for debugging: it skips the training epochs and just does the evaluation and produces the test sentences
|
| 34 |
SKIP_TRAIN_EPOCH = False
|