# File size: 6,752 Bytes (f0fc7d1)
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import librosa
import soundfile as sf
from pathlib import Path
# Restrict CUDA to devices 0 and 1 for this process and any children it starts.
# NOTE(review): this is assigned after `import torch`; presumably it still
# takes effect because CUDA is not initialised at import time — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# ─────────────────────────────────────────
# AUDIO PREPROCESSING
# ─────────────────────────────────────────
def preprocess_audio(dataset_path, target_sr=22050):
    """Convert every WAV in ``<dataset_path>/wavs`` to mono at ``target_sr`` Hz, in place.

    Once all files are converted a ``.preprocessed`` marker file is created in
    ``dataset_path``. If the marker already exists the function returns
    immediately without reading any audio. If no ``*.wav`` file is found the
    marker is *not* written, so a later run (after the data is in place) still
    performs the conversion.

    Args:
        dataset_path: Dataset root that contains the ``wavs`` sub-directory.
        target_sr: Sample rate in Hz that every file is resampled to.
    """
    root = Path(dataset_path)
    marker = root / ".preprocessed"
    if marker.exists():
        print("✅ Audio allaqachon tayyor.")
        return

    wavs_dir = root / "wavs"
    # Listing happens only after the marker check; sorted for a stable order.
    wav_files = sorted(wavs_dir.glob("*.wav"))
    if not wav_files:
        print(f"⚠️ wav fayllar topilmadi: {wavs_dir}")
        return

    print(f"{len(wav_files)} ta wav qayta ishlanmoqda...")
    for wav_path in wav_files:
        audio, _ = librosa.load(str(wav_path), sr=target_sr, mono=True)
        # Write next to the original and swap it in afterwards, so an
        # interrupted run never leaves a half-written WAV behind.
        tmp_path = wav_path.with_name(wav_path.name + ".tmp")
        # The temporary name has no ".wav" suffix, so the container format
        # must be given explicitly.
        sf.write(str(tmp_path), audio, target_sr, format="WAV")
        os.replace(tmp_path, wav_path)

    marker.touch()
    print(f"✅ Barcha wav mono + {target_sr} Hz ga o'tkazildi.")
# Dataset root; `train` reads this module-level name, so it must stay global.
dataset_path = "/content/drive/MyDrive/tts/dataset_final"

# Preprocess only in the launching process. Workers started with `mp.spawn`
# re-import this module under a different `__name__`, and they must not repeat
# the in-place audio conversion.
if __name__ == "__main__":
    preprocess_audio(dataset_path)
# ─────────────────────────────────────────
# TRAIN FUNCTION — launched separately for each GPU
# ─────────────────────────────────────────
def train(rank: int, world_size: int) -> None:
    """Train the VITS model in one process bound to CUDA device ``rank``.

    With ``world_size > 1`` an NCCL process group is created before training
    and torn down afterwards, even if training raises. With ``world_size == 1``
    no process group is created and the trainer runs without DDP.

    The dataset location is read from the module-level ``dataset_path``.

    Args:
        rank: Index of this process; also used as the CUDA device index
            (rank 0 → GPU 0, rank 1 → GPU 1).
        world_size: Total number of training processes.
    """
    distributed = world_size > 1
    if distributed:
        # Rendezvous settings for the default process group.
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        # NOTE(review): the Coqui Trainer may also initialise the process
        # group itself when `use_ddp=True` — confirm this explicit call does
        # not lead to a double initialisation.
        dist.init_process_group("nccl", rank=rank, world_size=world_size)

    try:
        torch.cuda.set_device(rank)
        print(f"✅ GPU {rank}/{world_size} ishga tushdi: {torch.cuda.get_device_name(rank)}")

        # TTS / trainer imports are kept local to this function.
        from TTS.tts.configs.shared_configs import CharactersConfig, BaseDatasetConfig
        from TTS.tts.configs.vits_config import VitsConfig
        from TTS.tts.datasets import load_tts_samples
        from TTS.tts.models.vits import Vits
        from TTS.utils.audio import AudioProcessor
        from TTS.tts.utils.text.tokenizer import TTSTokenizer
        from trainer import Trainer, TrainerArgs

        # -- CONFIG --
        config = VitsConfig(
            run_name="Xurmo Media 20",
            batch_size=16,  # per process; the effective batch is 16 * world_size
            eval_batch_size=8,
            num_loader_workers=2,
            num_eval_loader_workers=2,
            epochs=1000,
            text_cleaner="multilingual_cleaners",
            use_phonemes=False,
            mixed_precision=True,  # FP16 (author's note: about 2x faster on a T4)
            run_eval=True,
            save_step=1000,
            save_n_checkpoints=3,
            print_step=50,
            # NOTE(review): this Drive path differs from the `output_path`
            # passed to `Trainer` below — confirm which one is intended.
            output_path="/content/drive/MyDrive/tts/output",
            characters=CharactersConfig(
                # Latin letters, the apostrophe and digits. The Uzbek digraphs
                # (O', G', Sh, Ch) are spelled with these same characters;
                # listing them separately only added duplicate entries.
                characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'0123456789",
                punctuations="!,.? ",
                pad="<PAD>",
                eos="<EOS>",
                bos="<BOS>",
                blank="<BLNK>",
            ),
        )
        config.audio.sample_rate = 22050
        config.audio.do_trim_silence = True
        config.audio.resample = False

        # Typographic apostrophes and the backtick all map to the plain ASCII
        # apostrophe, in a single pass per line.
        apostrophe_table = str.maketrans({
            "\u2018": "'",
            "\u2019": "'",
            "\u02bc": "'",
            "\u0060": "'",
        })

        # -- FORMATTER --
        def formatter(root_path, meta_file, **kwargs):
            """Read ``<id>|<text>`` rows from ``meta_file`` into sample dicts.

            Blank rows, rows with fewer than two ``|``-separated columns and
            rows whose ``wavs/<id>.wav`` file does not exist are skipped.
            """
            txt_file = os.path.join(root_path, meta_file)
            items = []
            with open(txt_file, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    cols = line.split("|")
                    if len(cols) < 2:
                        continue
                    wav_file = os.path.join(root_path, "wavs", cols[0].strip() + ".wav")
                    if not os.path.exists(wav_file):
                        continue
                    items.append({
                        "text": cols[1].strip().translate(apostrophe_table),
                        "audio_file": wav_file,
                        "root_path": root_path,
                        "speaker_name": "xurmo media",
                        "language": "uz",
                    })
            if rank == 0:
                print(f"✅ {len(items)} ta sample yuklandi.")
            return items

        # -- DATASET --
        dataset_config = BaseDatasetConfig(
            dataset_name="uzbek_tts",
            path=dataset_path,
            meta_file_train="metadata.csv",
            meta_file_val="",
            language="uz",
        )
        train_samples, eval_samples = load_tts_samples(
            [dataset_config],
            eval_split=True,
            eval_split_size=0.1,
            formatter=formatter,
        )

        # -- MODEL --
        tokenizer, config = TTSTokenizer.init_from_config(config)
        ap = AudioProcessor.init_from_config(config)
        model = Vits(config, ap, tokenizer, speaker_manager=None)

        # -- TRAINER: rank is forwarded; DDP only when several processes run --
        trainer_args = TrainerArgs(
            rank=rank,
            group_id=f"group_{rank}",
            use_ddp=distributed,
            grad_accum_steps=1,  # author's note: must be 1 for the VITS GAN
        )
        trainer = Trainer(
            trainer_args,
            config,
            output_path="/kaggle/working/output",
            model=model,
            train_samples=train_samples,
            eval_samples=eval_samples,
        )

        if rank == 0:
            # The summary is derived from the live config, so it stays correct
            # for the single-GPU fallback as well.
            inner = 36
            rows = [
                "Colab T4 O'QITISH",
                f"Har GPU batch  : {config.batch_size}",
                f"Effective batch: {config.batch_size * world_size}",
                f"Epochs         : {config.epochs}",
            ]
            rule = "═" * (inner + 2)
            print("╔" + rule + "╗")
            for row in rows:
                print(f"║ {row:<{inner}} ║")
            print("╚" + rule + "╝")

        trainer.fit()
    finally:
        # Always release the process group, including when training raised.
        if distributed and dist.is_initialized():
            dist.destroy_process_group()
# ─────────────────────────────────────────
# LAUNCH
# ─────────────────────────────────────────
def main():
    """Count the visible CUDA devices and start one training process per device.

    Exits with an error message when no GPU is visible. With exactly one GPU,
    training runs directly in this process; with two or more, one worker per
    GPU is started through ``mp.spawn``.

    Raises:
        SystemExit: If ``torch.cuda.device_count()`` reports zero devices.
    """
    world_size = torch.cuda.device_count()
    print(f"🖥️ Topilgan GPU: {world_size} ta")

    if world_size == 0:
        # Without a GPU the run would only crash later inside `train`.
        raise SystemExit("❌ GPU topilmadi! O'qitish uchun kamida 1 ta CUDA GPU kerak.")

    if world_size == 1:
        print("⚠️ Faqat 1 GPU topildi! Kaggle Settings → Accelerator → GPU T4 x2 tanlang.")
        # Still train, on the single available GPU.
        train(0, 1)
        return

    # One worker per GPU; `mp.spawn` passes the rank as the first argument.
    mp.spawn(
        train,
        args=(world_size,),
        nprocs=world_size,
        join=True,
    )


if __name__ == "__main__":
    main()