Upload folder using huggingface_hub
Browse files
- .gitattributes +1 -0
- best_model.pth +3 -0
- best_model_1009517.pth +3 -0
- checkpoint_1111000.pth +3 -0
- checkpoint_1112000.pth +3 -0
- checkpoint_1113000.pth +3 -0
- checkpoint_1113117.pth +3 -0
- events.out.tfevents.1693935825.ip-172-16-76-92.ec2.internal.52882.0 +3 -0
- train_vits.py +92 -0
- trainer_0_log.txt +3 -0
- trainer_1_log.txt +0 -0
- trainer_2_log.txt +0 -0
- trainer_3_log.txt +0 -0
- trainer_4_log.txt +0 -0
- trainer_5_log.txt +0 -0
- trainer_6_log.txt +0 -0
- trainer_7_log.txt +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
trainer_0_log.txt filter=lfs diff=lfs merge=lfs -text
|
best_model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6d473e8956c52204d2c1dfc2b43299a6ed3f3064dba5ad39682cbbfc629e8d4
|
| 3 |
+
size 997871045
|
best_model_1009517.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6d473e8956c52204d2c1dfc2b43299a6ed3f3064dba5ad39682cbbfc629e8d4
|
| 3 |
+
size 997871045
|
checkpoint_1111000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6e42647d7d4f6a03d61e08961e753fa5366d693c49eb35a734edb5caae8a1ddd
|
| 3 |
+
size 997871109
|
checkpoint_1112000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae36115905bed58d939e8f79733ce590d2dce0b357e933a8c88782ea173debf5
|
| 3 |
+
size 997871109
|
checkpoint_1113000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5b8b41c60301c234f7031bcdf80cae317859aaf6815ce7983d4248a98678c66
|
| 3 |
+
size 997871109
|
checkpoint_1113117.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c96fda4ad9778107ba5e80c5cf3306f5a050b588ad850cb9c9b31014de70bf1f
|
| 3 |
+
size 997871109
|
events.out.tfevents.1693935825.ip-172-16-76-92.ec2.internal.52882.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e085546e402a09aa9a8ae7d884d1cd03586e3a88a316c36ab8aa199a263c12c
|
| 3 |
+
size 10044566854
|
train_vits.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
from trainer import Trainer, TrainerArgs
|
| 4 |
+
|
| 5 |
+
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
| 6 |
+
from TTS.tts.configs.vits_config import VitsConfig
|
| 7 |
+
from TTS.tts.datasets import load_tts_samples
|
| 8 |
+
from TTS.tts.models.vits import Vits, VitsAudioConfig
|
| 9 |
+
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
| 10 |
+
from TTS.utils.audio import AudioProcessor
|
| 11 |
+
|
| 12 |
+
#output_path = os.path.dirname(os.path.abspath(__file__))
|
| 13 |
+
##########################################
|
| 14 |
+
#Change this to your output directory (used for the phoneme cache and passed to the config and trainer)
|
| 15 |
+
##########################################
|
| 16 |
+
output_path = os.path.dirname(os.path.abspath(__file__))
|
| 17 |
+
dataset_config = BaseDatasetConfig(
|
| 18 |
+
##########################################
|
| 19 |
+
#Change this to your dataset directory
|
| 20 |
+
##########################################
|
| 21 |
+
formatter="ljspeech", meta_file_train="metadata.csv", path="/home/ec2-user/SageMaker/tts-sage/recipes/ljspeech/vits_tts/adam"
|
| 22 |
+
|
| 23 |
+
)
|
| 24 |
+
audio_config = VitsAudioConfig(
|
| 25 |
+
sample_rate=48000, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
config = VitsConfig(
|
| 29 |
+
audio=audio_config,
|
| 30 |
+
run_name="tts-adam-48k",
|
| 31 |
+
batch_size=7,
|
| 32 |
+
eval_batch_size=12,
|
| 33 |
+
batch_group_size=4,
|
| 34 |
+
# num_loader_workers=8,
|
| 35 |
+
num_loader_workers=4,
|
| 36 |
+
num_eval_loader_workers=4,
|
| 37 |
+
run_eval=True,
|
| 38 |
+
test_delay_epochs=-1,
|
| 39 |
+
epochs=100000,
|
| 40 |
+
save_step=1000,
|
| 41 |
+
save_checkpoints=True,
|
| 42 |
+
save_n_checkpoints=4,
|
| 43 |
+
save_best_after=1000,
|
| 44 |
+
#text_cleaner="english_cleaners",
|
| 45 |
+
text_cleaner="multilingual_cleaners",
|
| 46 |
+
use_phonemes=True,
|
| 47 |
+
phoneme_language="en-us",
|
| 48 |
+
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
| 49 |
+
compute_input_seq_cache=True,
|
| 50 |
+
print_step=25,
|
| 51 |
+
print_eval=True,
|
| 52 |
+
mixed_precision=True,
|
| 53 |
+
output_path=output_path,
|
| 54 |
+
datasets=[dataset_config],
|
| 55 |
+
cudnn_benchmark=False,
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
# INITIALIZE THE AUDIO PROCESSOR
|
| 59 |
+
# Audio processor is used for feature extraction and audio I/O.
|
| 60 |
+
# It mainly serves the dataloader and the training loggers.
|
| 61 |
+
ap = AudioProcessor.init_from_config(config)
|
| 62 |
+
|
| 63 |
+
# INITIALIZE THE TOKENIZER
|
| 64 |
+
# Tokenizer is used to convert text to sequences of token IDs.
|
| 65 |
+
# config is updated with the default characters if not defined in the config.
|
| 66 |
+
tokenizer, config = TTSTokenizer.init_from_config(config)
|
| 67 |
+
|
| 68 |
+
# LOAD DATA SAMPLES
|
| 69 |
+
# Each sample is a list of `[text, audio_file_path, speaker_name]`
|
| 70 |
+
# You can define your custom sample loader returning the list of samples.
|
| 71 |
+
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
| 72 |
+
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
| 73 |
+
train_samples, eval_samples = load_tts_samples(
|
| 74 |
+
dataset_config,
|
| 75 |
+
eval_split=True,
|
| 76 |
+
eval_split_max_size=config.eval_split_max_size,
|
| 77 |
+
eval_split_size=config.eval_split_size,
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
# init model
|
| 81 |
+
model = Vits(config, ap, tokenizer, speaker_manager=None)
|
| 82 |
+
|
| 83 |
+
# init the trainer and begin
|
| 84 |
+
trainer = Trainer(
|
| 85 |
+
TrainerArgs(),
|
| 86 |
+
config,
|
| 87 |
+
output_path,
|
| 88 |
+
model=model,
|
| 89 |
+
train_samples=train_samples,
|
| 90 |
+
eval_samples=eval_samples,
|
| 91 |
+
)
|
| 92 |
+
trainer.fit()
|
trainer_0_log.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:865cd234a1230128eeb78c0d507eba4019732b0bbdc5360225011581c43dd60f
|
| 3 |
+
size 18198855
|
trainer_1_log.txt
ADDED
|
File without changes
|
trainer_2_log.txt
ADDED
|
File without changes
|
trainer_3_log.txt
ADDED
|
File without changes
|
trainer_4_log.txt
ADDED
|
File without changes
|
trainer_5_log.txt
ADDED
|
File without changes
|
trainer_6_log.txt
ADDED
|
File without changes
|
trainer_7_log.txt
ADDED
|
File without changes
|