# File size: 7,005 Bytes (revision 5ec8604)
import os
import torch
import librosa
import soundfile as sf
from pathlib import Path
# ─────────────────────────────────────────
# AUDIO PREPROCESSING
# ─────────────────────────────────────────
def preprocess_audio(dataset_path, target_sr=22050):
    """Convert every wav under ``<dataset_path>/wavs`` to mono at ``target_sr`` Hz.

    Files are rewritten in place. After a complete pass an empty marker file
    ``<dataset_path>/.preprocessed`` is created; when that marker already
    exists the function returns immediately without touching any audio.

    NOTE: the marker does not record ``target_sr``, so calling again with a
    different rate is a no-op until the marker file is deleted.

    Args:
        dataset_path: Dataset root directory that contains the ``wavs`` folder.
        target_sr: Sample rate, in Hz, that every file is resampled to.

    Returns:
        None.
    """
    dataset_root = Path(dataset_path)
    marker = dataset_root / ".preprocessed"

    # Check the marker first so an already-converted dataset costs no
    # directory scan.
    if marker.exists():
        print("✅ Audio allaqachon tayyor.")
        return

    wav_files = sorted((dataset_root / "wavs").glob("*.wav"))
    print(f"{len(wav_files)} ta wav qayta ishlanmoqda...")
    for wav_path in wav_files:
        # Loading with sr/mono set asks librosa for resampled, single-channel
        # audio; the rate it reports back is therefore not needed.
        audio, _ = librosa.load(str(wav_path), sr=target_sr, mono=True)
        sf.write(str(wav_path), audio, target_sr)

    # Written only after the whole loop finished, so an interrupted run is
    # simply repeated next time.
    marker.touch()
    print(f"✅ Barcha wav mono + {target_sr} Hz ga o'tkazildi.")
# Dataset root on Google Drive; preprocess_audio reads its "wavs" sub-folder
# and the dataset config further down points at the same directory.
dataset_path = "/content/drive/MyDrive/tts/dataset_final"
# Directory passed to both the model config and the trainer as output location.
output_dir = "/content/drive/MyDrive/tts/output"
# Runs at import time, before the TTS imports below: converts the wav files
# (or returns at once when the ".preprocessed" marker is already present).
preprocess_audio(dataset_path)
# ─────────────────────────────────────────
# IMPORT
# ─────────────────────────────────────────
from TTS.tts.configs.shared_configs import CharactersConfig, BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.datasets import formatters
from trainer import Trainer, TrainerArgs
# ─────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────
# VITS training configuration: character-based input (no phonemizer),
# checkpoints and logs written under output_dir.
config = VitsConfig(
    run_name="Xurmo_Media_20",
    batch_size=16,
    eval_batch_size=8,
    num_loader_workers=2,
    num_eval_loader_workers=2,
    epochs=1000,
    text_cleaner="multilingual_cleaners",
    use_phonemes=False,
    mixed_precision=True,
    run_eval=True,
    save_step=1000,
    save_n_checkpoints=3,
    print_step=50,
    output_path=output_dir,
    characters=CharactersConfig(
        # Latin letters, digits, the ASCII apostrophe and U+02BB (the modifier
        # letter used in the Uzbek letters O'/G'). U+02BB is written as an
        # escape so the vocabulary survives any re-encoding of this file.
        # The letters repeated in the digraph part (O, o, G, g, Sh, sh, Ch,
        # ch) add no new symbols; they are kept so the string differs from
        # the previous one only by the encoding repair.
        # NOTE(review): presumably duplicates are collapsed by the library
        # when the vocabulary is built; confirm against the installed version.
        characters=(
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
            "O\u02bbo\u02bbG\u02bbg\u02bbShshChch'0123456789"
        ),
        punctuations="!,.? ",
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank="<BLNK>",
    ),
)
# Audio settings. 22050 Hz is the same rate preprocess_audio uses by default.
config.audio.sample_rate = 22050
config.audio.do_trim_silence = True
# NOTE(review): presumably safe to disable because the wav files are already
# converted to 22050 Hz on disk; confirm if the preprocessing rate changes.
config.audio.resample = False
# ─────────────────────────────────────────
# FORMATTER
# ─────────────────────────────────────────
def uzbek_formatter(root_path, meta_file, **kwargs):
    """Load an ``id|text`` metadata file into a list of sample dicts.

    Every non-empty line of ``<root_path>/<meta_file>`` is split on ``|``.
    The first column is the wav name without extension, looked up as
    ``<root_path>/wavs/<id>.wav``; the second column is the transcript.
    Typographic apostrophe variants (U+2018, U+2019, U+02BC and the backtick)
    are folded into the ASCII apostrophe; U+02BB is left untouched.

    Lines with fewer than two columns, an empty transcript, or a missing wav
    file are skipped, and the number of skipped lines is printed.

    Args:
        root_path: Dataset root directory containing ``meta_file`` and ``wavs``.
        meta_file: Metadata file name, relative to ``root_path``.
        **kwargs: Accepted and ignored, so callers may pass extra options.

    Returns:
        A list of dicts with the keys ``text``, ``audio_file``, ``root_path``,
        ``speaker_name`` and ``language``.
    """
    apostrophes = str.maketrans(
        {"\u2018": "'", "\u2019": "'", "\u02bc": "'", "\u0060": "'"}
    )
    wavs_dir = os.path.join(root_path, "wavs")
    items = []
    skipped = 0

    # "utf-8-sig" reads plain UTF-8 as before but also drops a leading BOM,
    # which would otherwise become part of the first id and hide its wav.
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            cols = line.split("|")
            if len(cols) < 2:
                skipped += 1
                continue
            wav_file = os.path.join(wavs_dir, cols[0].strip() + ".wav")
            text = cols[1].strip().translate(apostrophes)
            if not text or not os.path.exists(wav_file):
                skipped += 1
                continue
            items.append({
                "text": text,
                "audio_file": wav_file,
                "root_path": root_path,
                "speaker_name": "xurmo media",
                "language": "uz",
            })

    if skipped:
        print(
            f"⚠️ {skipped} ta qator o'tkazib yuborildi "
            "(wav topilmadi yoki format noto'g'ri)."
        )
    print(f"✅ {len(items)} ta sample yuklandi.")
    return items
# ─────────────────────────────────────────
# DATASET
# ─────────────────────────────────────────
# Dataset description. The formatter name is left empty because the formatter
# function itself is handed to load_tts_samples below.
dataset_config = BaseDatasetConfig(
    formatter="",
    dataset_name="uzbek_tts",
    path=dataset_path,
    meta_file_train="metadata.csv",
    meta_file_val="",
    language="uz",
)
# No validation metadata file is configured, so an evaluation split is
# requested here instead (eval_split=True, eval_split_size=0.1).
# NOTE(review): presumably 0.1 means 10% of the samples; confirm against the
# installed TTS version.
train_samples, eval_samples = load_tts_samples(
    dataset_config,  # passed directly, not wrapped in a list
    eval_split=True,
    eval_split_size=0.1,
    formatter=uzbek_formatter,  # the function object, not its name as a string
)
# ─────────────────────────────────────────
# MODEL
# ─────────────────────────────────────────
# Build the tokenizer from the config. init_from_config also returns a config
# object, which replaces the module-level `config` from here on.
tokenizer, config = TTSTokenizer.init_from_config(config)
# Audio processor built from that same (returned) config.
ap = AudioProcessor.init_from_config(config)
# No speaker manager is supplied; the formatter assigns one fixed speaker name
# to every sample.
model = Vits(config, ap, tokenizer, speaker_manager=None)
# ─────────────────────────────────────────
# RESUME — locate the latest checkpoint
# ─────────────────────────────────────────
# Checkpoint pinned by hand. It is used unless automatic discovery (below)
# is enabled and finds something newer.
restore_path = "/content/drive/MyDrive/tts/output/Xurmo_Media_20-May-19-2026_11+39AM-0000000/checkpoint_4000.pth"

# Automatic discovery is currently disabled. To enable it, assign a list of
# run directories (pathlib.Path objects, oldest first); the newest *.pth in
# the last directory then overrides the pinned checkpoint.
run_dirs = None
if run_dirs:
    checkpoints = sorted(run_dirs[-1].glob("*.pth"), key=os.path.getmtime)
    if checkpoints:
        restore_path = str(checkpoints[-1])

# Never hand a non-existent file to the trainer: fall back to a fresh run and
# say so, instead of failing later or reporting the wrong mode.
if restore_path and not os.path.isfile(restore_path):
    print(f"⚠️ Checkpoint topilmadi: {restore_path}")
    restore_path = None

# Report what will actually happen, based on the final value of restore_path.
if restore_path:
    print(f"Resume: {restore_path}")
else:
    print("Yangi training boshlanadi.")
# ─────────────────────────────────────────
# TRAINER — no DDP, plain single-GPU
# ─────────────────────────────────────────
# restore_path may be falsy when no checkpoint is available; pass an empty
# string in that case.
# NOTE(review): TrainerArgs.restore_path is presumably a str field whose
# "unset" value is ""; confirm against the installed trainer version.
trainer_args = TrainerArgs(
    restore_path=restore_path or "",
    # use_ddp is left at its default (False per the original note), so it is
    # not spelled out here.
)
trainer = Trainer(
    trainer_args,
    config,
    output_path=output_dir,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Start-up banner. Values are read from the config so the banner cannot drift
# from the settings that are actually used.
resume_label = "HA" if restore_path else "YO'Q"
banner_rows = [
    "1x GPU O'QITISH",
    f"Batch size : {config.batch_size}",
    f"Epochs     : {config.epochs}",
    f"Resume     : {resume_label}",
]
# One space of padding on each side of the widest row.
inner_width = max(len(row) for row in banner_rows) + 2
print()
print("╔" + "═" * inner_width + "╗")
for row in banner_rows:
    print("║ " + row.ljust(inner_width - 1) + "║")
print("╚" + "═" * inner_width + "╝")
print()

trainer.fit()