In [1]:
!pip install coqui-tts > /dev/null
!pip install packaging==21.0 > /dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.10.1 requires cubinlinker, which is not installed.
cudf 24.10.1 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.10.1 requires libcudf==24.10.*, which is not installed.
cudf 24.10.1 requires ptxcompiler, which is not installed.
cuml 24.10.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 24.10.0 requires cuvs==24.10.*, which is not installed.
cuml 24.10.0 requires nvidia-cublas, which is not installed.
cuml 24.10.0 requires nvidia-cufft, which is not installed.
cuml 24.10.0 requires nvidia-curand, which is not installed.
cuml 24.10.0 requires nvidia-cusolver, which is not installed.
cuml 24.10.0 requires nvidia-cusparse, which is not installed.
dask-cudf 24.10.1 requires cupy-cuda11x>=12.0.0, which is not installed.
pylibcudf 24.10.1 requires libcudf==24.10.*, wh

In [2]:
import os, math

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig
from TTS.tts.models.xtts import XttsAudioConfig
from TTS.utils.manage import ModelManager

In [3]:
# Logging parameters
RUN_NAME = "viXTTS-FT-Code"
PROJECT_NAME = "XTTS_trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

# Set here the path that the checkpoints will be saved. Default: ./run/training/
OUT_PATH = "/kaggle/working/finetuned"

# Define the path where viXTTS files will be downloaded
CHECKPOINTS_OUT_PATH = os.path.join("/kaggle/temp/viXTTS_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)

In [4]:
# DVAE files
DVAE_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/dvae.pth"
MEL_NORM_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/mel_stats.pth"

# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=False)

 > Downloading DVAE files!


In [5]:
# Download viXTTS checkpoint if needed
TOKENIZER_FILE_LINK = "https://huggingface.co/capleaf/viXTTS/resolve/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://huggingface.co/capleaf/viXTTS/resolve/main/model.pth"
# TOKENIZER_FILE_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/vocab.json"
# XTTS_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth"

# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file

# download viXTTS files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
    print(" > Downloading viXTTS files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=False
    )

 > Downloading viXTTS files!


In [6]:
# init args and config
model_args = GPTArgs(
    max_conditioning_length=132300,  # 6 secs
    min_conditioning_length=66150,  # 3 secs
    debug_loading_failures=False,
    max_wav_length=255995,  # ~11.6 seconds
    max_text_length=200,
    mel_norm_file=MEL_NORM_FILE,
    dvae_checkpoint=DVAE_CHECKPOINT,
    xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
    tokenizer_file=TOKENIZER_FILE,
    gpt_num_audio_tokens=1026,
    gpt_start_audio_token=1024,
    gpt_stop_audio_token=1025,
    gpt_use_masking_gt_prompt_approach=True,
    gpt_use_perceiver_resampler=True,
)

In [7]:
# Training Parameters
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
START_WITH_EVAL = False  # if True it will start with evaluation
BATCH_SIZE = 4  # set here the batch size
GRAD_ACUMM_STEPS = math.ceil(252 / BATCH_SIZE)  # set here the grad accumulation steps
# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.

In [8]:
# Training sentences generations
SPEAKER_REFERENCE_LINK = "https://huggingface.co/capleaf/viXTTS/resolve/main/vi_sample.wav"
SPEAKER_REFERENCE = [
    os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(SPEAKER_REFERENCE_LINK))  # speaker reference to be used in training test sentences
]

if not os.path.isfile(SPEAKER_REFERENCE[0]) or not os.path.isfile(SPEAKER_REFERENCE[0]):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [SPEAKER_REFERENCE_LINK], CHECKPOINTS_OUT_PATH, progress_bar=False
    )
LANGUAGE = 'en'

 > Downloading XTTS v2.0 files!


In [9]:
!dir viXTTS_original_model_files

  pid, fd = os.forkpty()


dir: cannot access 'viXTTS_original_model_files': No such file or directory


In [10]:
# define audio config
audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
# training parameters config
config = GPTTrainerConfig(
    precision="fp16",
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name=PROJECT_NAME,
    run_description="""
        GPT XTTS training
        """,
    dashboard_logger=DASHBOARD_LOGGER,
    logger_uri=LOGGER_URI,
    audio=audio_config,
    epochs=1,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1,
    save_step=1000,
    save_n_checkpoints=1,
    save_checkpoints=True,
    # target_loss="loss",
    print_eval=False,
    # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
    optimizer="AdamW",
    optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
    optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
    lr=1e-5,  # learning rate
    lr_scheduler="MultiStepLR",
    # it was adjusted accordly for the new step scheme
    lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
    test_sentences=[
        # {
        #     "text": "My favorite programming languages is C++ and Java.",
        #     "speaker_wav": SPEAKER_REFERENCE,
        #     "language": LANGUAGE,
        # },
        # {
        #     "text": "I am learning HTML, CSS and JavaScript.",
        #     "speaker_wav": SPEAKER_REFERENCE,
        #     "language": LANGUAGE,
        # },
    ],
)

In [11]:
# Define here the dataset that you want to use for the fine-tuning on.
cs50_config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="cs-50",
    path="/kaggle/input/cs50-dataset/",
    meta_file_train="/kaggle/input/cs50-dataset/metadata.csv",
    language="en",
)

In [12]:
# load training samples
DATASETS_CONFIG_LIST = [cs50_config_dataset]
train_samples, eval_samples = load_tts_samples(
    DATASETS_CONFIG_LIST,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)
import random

# Set a consistent seed
random.seed(42)

# Shuffle the list
random.shuffle(train_samples)
print("No. Train", len(train_samples))
print("No. Eval", len(eval_samples))

No. Train 24007
No. Eval 242


In [13]:
# init the model from config
model = GPTTrainer.init_from_config(config)

In [14]:
# init the trainer and ðŸš€
trainer = Trainer(
    TrainerArgs(
        restore_path=None,  # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter
        skip_train_epoch=False,
        start_with_eval=START_WITH_EVAL,
        grad_accum_steps=GRAD_ACUMM_STEPS,
        # use_accelerate=True, cause NaN loss
        # small_run=20
    ),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
try:
    trainer.fit()
except Exception as e:
    print(e)

fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 4
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/kaggle/working/finetuned/viXTTS-FT-Code-December-30-2024_02+18AM-0000000

 > Model has 520210334 parameters

[4m[1m > EPOCH: 0/1[0m
 --> /kaggle/working/finetuned/viXTTS-FT-Code-December-30-2024_02+18AM-0000000

[1m > TRAINING (2024-12-30 02:18:31) [0m
  self.pid = os.fork()

[1m   --> TIME: 2024-12-30 02:18:37 -- STEP: 0/

In [15]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

In [16]:
from huggingface_hub import HfApi

api = HfApi(token=HF_TOKEN)

# Upload all the content from the local folder to your remote Space.
# By default, files are uploaded at the root of the repo
repo_id = "thng292/viXTTS-ft-code-test"
try:
    api.create_repo(repo_id, private=True)
except Exception as e:
    print(e)
    pass

409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-677211ac-238f389e186a26876f72fac4;df039791-dd04-4fd6-891f-68be2f37d1c6)

You already created this model repo


In [17]:
api.upload_folder(
    folder_path=OUT_PATH,
    repo_id=repo_id,
)

checkpoint_6000.pth:   0%|          | 0.00/5.63G [00:00<?, ?B/s]

best_model_6002.pth:   0%|          | 0.00/5.63G [00:00<?, ?B/s]

best_model_6002.pth:   0%|          | 0.00/5.63G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1735525110.768dcf3dda93.23.0:   0%|          | 0.00/30.2k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/thng292/viXTTS-ft-code-test/commit/ff9cc4853b31675fed729fa393f2d21cc8e82d34', commit_message='Upload folder using huggingface_hub', commit_description='', oid='ff9cc4853b31675fed729fa393f2d21cc8e82d34', pr_url=None, repo_url=RepoUrl('https://huggingface.co/thng292/viXTTS-ft-code-test', endpoint='https://huggingface.co', repo_type='model', repo_id='thng292/viXTTS-ft-code-test'), pr_revision=None, pr_num=None)