# rvc_api/lib/rvc/train.py
import glob
import json
import os
import shutil
import time
from random import shuffle
from typing import List, Optional, Union
import faiss
import numpy as np
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torchaudio
import tqdm
from sklearn.cluster import MiniBatchKMeans
from torch.cuda.amp import GradScaler, autocast
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from . import commons, utils
from .checkpoints import save
from .config import DatasetMetadata, TrainConfig
from .data_utils import (DistributedBucketSampler, TextAudioCollate,
TextAudioCollateMultiNSFsid, TextAudioLoader,
TextAudioLoaderMultiNSFsid)
from .losses import discriminator_loss, feature_loss, generator_loss, kl_loss
from .mel_processing import mel_spectrogram_torch, spec_to_mel_torch
from .models import (MultiPeriodDiscriminator, SynthesizerTrnMs256NSFSid,
SynthesizerTrnMs256NSFSidNono)
from .preprocessing.extract_feature import (MODELS_DIR, get_embedder,
load_embedder)


def is_audio_file(file: str):
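    """Return True if the path has a supported audio file extension."""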
if "." not in file:
return False
ext = os.path.splitext(file)[1]
return ext.lower() in [
".wav",
".flac",
".ogg",
".mp3",
".m4a",
".wma",
".aiff",
]


def glob_dataset(
glob_str: str,
speaker_id: int,
multiple_speakers: bool = False,
recursive: bool = True,
training_dir: str = ".",
):
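    """Collect (audio_path, speaker_id) pairs from a comma-separated list of
    globs or directories. With multiple_speakers, each subdirectory of a
    dataset directory is treated as one speaker, and the name-to-ID mapping
    is dumped to speaker_info.json in training_dir.
    """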
globs = glob_str.split(",")
speaker_count = 0
datasets_speakers = []
speaker_to_id_mapping = {}
for glob_str in globs:
if os.path.isdir(glob_str):
if multiple_speakers:
# Multispeaker format:
# dataset_path/
# - speakername/
# - {wav name here}.wav
# - ...
# - next_speakername/
# - {wav name here}.wav
# - ...
# - ...
print("Multispeaker dataset enabled; Processing speakers.")
for dir in tqdm.tqdm(os.listdir(glob_str)):
print("Speaker ID " + str(speaker_count) + ": " + dir)
speaker_to_id_mapping[dir] = speaker_count
speaker_path = glob_str + "/" + dir
for audio in tqdm.tqdm(os.listdir(speaker_path)):
if is_audio_file(glob_str + "/" + dir + "/" + audio):
datasets_speakers.append((glob_str + "/" + dir + "/" + audio, speaker_count))
speaker_count += 1
with open(os.path.join(training_dir, "speaker_info.json"), "w") as outfile:
print("Dumped speaker info to {}".format(os.path.join(training_dir, "speaker_info.json")))
json.dump(speaker_to_id_mapping, outfile)
continue # Skip the normal speaker extend
glob_str = os.path.join(glob_str, "**", "*")
print("Single speaker dataset enabled; Processing speaker as ID " + str(speaker_id) + ".")
datasets_speakers.extend(
[
(file, speaker_id)
for file in glob.iglob(glob_str, recursive=recursive)
if is_audio_file(file)
]
)
return sorted(datasets_speakers)


def create_dataset_meta(training_dir: str, f0: bool):
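    """Write meta.json mapping each utterance (kept only if present in every
    required directory) to its ground-truth wav, feature file, speaker ID,
    and, when f0 is enabled, its coarse/continuous pitch files.
    """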
gt_wavs_dir = os.path.join(training_dir, "0_gt_wavs")
co256_dir = os.path.join(training_dir, "3_feature256")
def list_data(dir: str):
files = []
for subdir in os.listdir(dir):
speaker_dir = os.path.join(dir, subdir)
for name in os.listdir(speaker_dir):
files.append(os.path.join(subdir, name.split(".")[0]))
return files
names = set(list_data(gt_wavs_dir)) & set(list_data(co256_dir))
if f0:
f0_dir = os.path.join(training_dir, "2a_f0")
f0nsf_dir = os.path.join(training_dir, "2b_f0nsf")
names = names & set(list_data(f0_dir)) & set(list_data(f0nsf_dir))
meta = {
"files": {},
}
for name in names:
speaker_id = os.path.dirname(name).split("_")[0]
speaker_id = int(speaker_id) if speaker_id.isdecimal() else 0
if f0:
gt_wav_path = os.path.join(gt_wavs_dir, f"{name}.wav")
co256_path = os.path.join(co256_dir, f"{name}.npy")
f0_path = os.path.join(f0_dir, f"{name}.wav.npy")
f0nsf_path = os.path.join(f0nsf_dir, f"{name}.wav.npy")
meta["files"][name] = {
"gt_wav": gt_wav_path,
"co256": co256_path,
"f0": f0_path,
"f0nsf": f0nsf_path,
"speaker_id": speaker_id,
}
else:
gt_wav_path = os.path.join(gt_wavs_dir, f"{name}.wav")
co256_path = os.path.join(co256_dir, f"{name}.npy")
meta["files"][name] = {
"gt_wav": gt_wav_path,
"co256": co256_path,
"speaker_id": speaker_id,
}
with open(os.path.join(training_dir, "meta.json"), "w") as f:
json.dump(meta, f, indent=2)


def change_speaker(net_g, speaker_info, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths):
"""
random change formant
inspired by https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/data/audio/audio_utils_1.py#L179
"""
N = phone.shape[0]
device = phone.device
dtype = phone.dtype
f0_bin = 256
f0_max = 1100.0
f0_min = 50.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
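    # HTK mel scale: mel = 1127 * ln(1 + f / 700); the f0 range is expressed
    # in mel so it can be quantized into f0_bin coarse pitch bins below.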
pitch_median = torch.median(pitchf, 1).values
lo = 75. + 25. * (pitch_median >= 200).to(dtype=dtype)
hi = 250. + 150. * (pitch_median >= 200).to(dtype=dtype)
pitch_median = torch.clip(pitch_median, lo, hi).unsqueeze(1)
    shift_pitch = torch.exp2((1. - 2. * torch.rand(N)) / 4).unsqueeze(1).to(device, dtype)  # randomly shift the pitch within a half-octave range
new_sid = np.random.choice(np.arange(len(speaker_info))[speaker_info > 0], size=N)
rel_pitch = pitchf / pitch_median
new_pitch_median = torch.from_numpy(speaker_info[new_sid]).to(device, dtype).unsqueeze(1) * shift_pitch
new_pitchf = new_pitch_median * rel_pitch
new_sid = torch.from_numpy(new_sid).to(device)
new_pitch = 1127. * torch.log(1. + new_pitchf / 700.)
    new_pitch = (new_pitch - f0_mel_min) * (f0_bin - 2.) / (f0_mel_max - f0_mel_min) + 1.  # quantize the shifted f0 (in mel) into coarse pitch bins
new_pitch = torch.clip(new_pitch, 1, f0_bin - 1).to(dtype=torch.int)
aug_wave = net_g.infer(phone, phone_lengths, new_pitch, new_pitchf, new_sid)[0]
aug_wave_16k = torchaudio.functional.resample(aug_wave, net_g.sr, 16000, rolloff=0.99).squeeze(1)
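    # one spec frame corresponds to 160 samples at the 16 kHz embedder rate,
    # so positions past spec_lengths * 160 are masked as padding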
padding_mask = torch.arange(aug_wave_16k.shape[1]).unsqueeze(0).to(device) > (spec_lengths.unsqueeze(1) * 160).to(device)
inputs = {
"source": aug_wave_16k.to(device, dtype),
"padding_mask": padding_mask.to(device),
"output_layer": embedding_output_layer
}
logits = embedder.extract_features(**inputs)
if phone.shape[-1] == 768:
feats = logits[0]
else:
feats = embedder.final_proj(logits[0])
feats = torch.repeat_interleave(feats, 2, 1)
new_phone = torch.zeros(phone.shape).to(device, dtype)
new_phone[:, :feats.shape[1]] = feats[:, :phone.shape[1]]
return new_phone.to(device), aug_wave


def change_speaker_nono(net_g, embedder, embedding_output_layer, phone, phone_lengths, spec_lengths):
"""
random change formant
inspired by https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/data/audio/audio_utils_1.py#L179
"""
N = phone.shape[0]
device = phone.device
dtype = phone.dtype
new_sid = np.random.randint(net_g.spk_embed_dim, size=N)
new_sid = torch.from_numpy(new_sid).to(device)
aug_wave = net_g.infer(phone, phone_lengths, new_sid)[0]
aug_wave_16k = torchaudio.functional.resample(aug_wave, net_g.sr, 16000, rolloff=0.99).squeeze(1)
padding_mask = torch.arange(aug_wave_16k.shape[1]).unsqueeze(0).to(device) > (spec_lengths.unsqueeze(1) * 160).to(device)
inputs = {
"source": aug_wave_16k.to(device, dtype),
"padding_mask": padding_mask.to(device),
"output_layer": embedding_output_layer
}
logits = embedder.extract_features(**inputs)
if phone.shape[-1] == 768:
feats = logits[0]
else:
feats = embedder.final_proj(logits[0])
feats = torch.repeat_interleave(feats, 2, 1)
new_phone = torch.zeros(phone.shape).to(device, dtype)
new_phone[:, :feats.shape[1]] = feats[:, :phone.shape[1]]
return new_phone.to(device), aug_wave


def train_index(
training_dir: str,
model_name: str,
out_dir: str,
emb_ch: int,
num_cpu_process: int,
maximum_index_size: Optional[int],
):
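    """Build one faiss retrieval index per speaker from the features in
    3_feature256. Feature sets larger than maximum_index_size are first
    reduced to that many MiniBatchKMeans centroids; an IVF index (with PQ
    refinement for >= 1M vectors) is then trained and written to disk.
    """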
checkpoint_path = os.path.join(out_dir, model_name)
feature_256_dir = os.path.join(training_dir, "3_feature256")
index_dir = os.path.join(os.path.dirname(checkpoint_path), f"{model_name}_index")
os.makedirs(index_dir, exist_ok=True)
for speaker_id in tqdm.tqdm(
sorted([dir for dir in os.listdir(feature_256_dir) if dir.isdecimal()])
):
feature_256_spk_dir = os.path.join(feature_256_dir, speaker_id)
speaker_id = int(speaker_id)
npys = []
for name in [
os.path.join(feature_256_spk_dir, file)
for file in os.listdir(feature_256_spk_dir)
if file.endswith(".npy")
]:
            phone = np.load(name)  # name already includes the directory path
npys.append(phone)
# shuffle big_npy to prevent reproducing the sound source
big_npy = np.concatenate(npys, 0)
big_npy_idx = np.arange(big_npy.shape[0])
np.random.shuffle(big_npy_idx)
big_npy = big_npy[big_npy_idx]
        if maximum_index_size is not None and big_npy.shape[0] > maximum_index_size:
kmeans = MiniBatchKMeans(
n_clusters=maximum_index_size,
batch_size=256 * num_cpu_process,
init="random",
compute_labels=False,
)
kmeans.fit(big_npy)
big_npy = kmeans.cluster_centers_
        # recommended parameters from https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
        emb_ch = big_npy.shape[1]  # use the actual feature dimension, overriding the argument
emb_ch_half = emb_ch // 2
n_ivf = int(8 * np.sqrt(big_npy.shape[0]))
if big_npy.shape[0] >= 1_000_000:
index = faiss.index_factory(
emb_ch, f"IVF{n_ivf},PQ{emb_ch_half}x4fsr,RFlat"
)
else:
index = faiss.index_factory(emb_ch, f"IVF{n_ivf},Flat")
index.train(big_npy)
batch_size_add = 8192
for i in range(0, big_npy.shape[0], batch_size_add):
index.add(big_npy[i : i + batch_size_add])
np.save(
os.path.join(index_dir, f"{model_name}.{speaker_id}.big.npy"),
big_npy,
)
faiss.write_index(
index,
os.path.join(index_dir, f"{model_name}.{speaker_id}.index"),
)


def train_model(
gpus: List[int],
config: TrainConfig,
training_dir: str,
model_name: str,
out_dir: str,
    sample_rate: str,  # e.g. "40k"
f0: bool,
batch_size: int,
augment: bool,
augment_path: Optional[str],
speaker_info_path: Optional[str],
cache_batch: bool,
total_epoch: int,
save_every_epoch: int,
save_wav_with_checkpoint: bool,
pretrain_g: str,
pretrain_d: str,
embedder_name: str,
embedding_output_layer: int,
save_only_last: bool = False,
device: Optional[Union[str, torch.device]] = None,
):
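    """Train a model on the prepared dataset in training_dir.

    Sets MASTER_ADDR/MASTER_PORT and CUDA_VISIBLE_DEVICES, then either calls
    training_runner directly (when an explicit device such as "mps" is given)
    or spawns one process per GPU via mp.spawn, restoring the previous cuDNN
    flags and environment afterwards.
    """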
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(utils.find_empty_port())
deterministic = torch.backends.cudnn.deterministic
benchmark = torch.backends.cudnn.benchmark
PREV_CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(gpu) for gpu in gpus])
start = time.perf_counter()
    # On macOS (MPS), mp.spawn runs into trouble, so call training_runner directly.
if device is not None:
training_runner(
0, # rank
1, # world size
config,
training_dir,
model_name,
out_dir,
sample_rate,
f0,
batch_size,
augment,
augment_path,
speaker_info_path,
cache_batch,
total_epoch,
save_every_epoch,
save_wav_with_checkpoint,
pretrain_g,
pretrain_d,
embedder_name,
embedding_output_layer,
save_only_last,
device,
)
else:
mp.spawn(
training_runner,
nprocs=len(gpus),
args=(
len(gpus),
config,
training_dir,
model_name,
out_dir,
sample_rate,
f0,
batch_size,
augment,
augment_path,
speaker_info_path,
cache_batch,
total_epoch,
save_every_epoch,
save_wav_with_checkpoint,
pretrain_g,
pretrain_d,
embedder_name,
embedding_output_layer,
save_only_last,
device,
),
)
end = time.perf_counter()
print(f"Time: {end - start}")
if PREV_CUDA_VISIBLE_DEVICES is None:
del os.environ["CUDA_VISIBLE_DEVICES"]
else:
os.environ["CUDA_VISIBLE_DEVICES"] = PREV_CUDA_VISIBLE_DEVICES
torch.backends.cudnn.deterministic = deterministic
torch.backends.cudnn.benchmark = benchmark


def training_runner(
rank: int,
    world_size: int,
config: TrainConfig,
training_dir: str,
model_name: str,
out_dir: str,
    sample_rate: str,  # e.g. "40k"
f0: bool,
batch_size: int,
augment: bool,
augment_path: Optional[str],
speaker_info_path: Optional[str],
cache_in_gpu: bool,
total_epoch: int,
save_every_epoch: int,
save_wav_with_checkpoint: bool,
pretrain_g: str,
pretrain_d: str,
embedder_name: str,
embedding_output_layer: int,
save_only_last: bool = False,
device: Optional[Union[str, torch.device]] = None,
):
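    """Run the GAN training loop for one process of a world_size-process group.

    Builds the dataset/loader, constructs generator and discriminator,
    restores the latest state or adapts the pretrained weights, optionally
    applies speaker/formant augmentation, and (on the main process) logs to
    TensorBoard and saves states, sample wavs, and checkpoints.
    """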
config.train.batch_size = batch_size
log_dir = os.path.join(training_dir, "logs")
state_dir = os.path.join(training_dir, "state")
training_files_path = os.path.join(training_dir, "meta.json")
training_meta = DatasetMetadata.parse_file(training_files_path)
embedder_out_channels = config.model.emb_channels
is_multi_process = world_size > 1
    if device is not None:
        if isinstance(device, str):
            device = torch.device(device)
global_step = 0
is_main_process = rank == 0
if is_main_process:
os.makedirs(log_dir, exist_ok=True)
os.makedirs(state_dir, exist_ok=True)
writer = SummaryWriter(log_dir=log_dir)
if torch.cuda.is_available():
torch.cuda.empty_cache()
if not dist.is_initialized():
dist.init_process_group(
backend="gloo", init_method="env://", rank=rank, world_size=world_size
)
if is_multi_process:
torch.cuda.set_device(rank)
torch.manual_seed(config.train.seed)
if f0:
train_dataset = TextAudioLoaderMultiNSFsid(training_meta, config.data)
else:
train_dataset = TextAudioLoader(training_meta, config.data)
train_sampler = DistributedBucketSampler(
train_dataset,
config.train.batch_size * world_size,
[100, 200, 300, 400, 500, 600, 700, 800, 900],
num_replicas=world_size,
rank=rank,
shuffle=True,
)
if f0:
collate_fn = TextAudioCollateMultiNSFsid()
else:
collate_fn = TextAudioCollate()
train_loader = DataLoader(
train_dataset,
num_workers=4,
shuffle=False,
pin_memory=True,
collate_fn=collate_fn,
batch_sampler=train_sampler,
persistent_workers=True,
prefetch_factor=8,
)
speaker_info = None
if os.path.exists(os.path.join(training_dir, "speaker_info.json")):
with open(os.path.join(training_dir, "speaker_info.json"), "r") as f:
speaker_info = json.load(f)
config.model.spk_embed_dim = len(speaker_info)
if f0:
net_g = SynthesizerTrnMs256NSFSid(
config.data.filter_length // 2 + 1,
config.train.segment_size // config.data.hop_length,
**config.model.dict(),
is_half=False, # config.train.fp16_run,
sr=int(sample_rate[:-1] + "000"),
)
else:
net_g = SynthesizerTrnMs256NSFSidNono(
config.data.filter_length // 2 + 1,
config.train.segment_size // config.data.hop_length,
**config.model.dict(),
is_half=False, # config.train.fp16_run,
sr=int(sample_rate[:-1] + "000"),
)
if is_multi_process:
net_g = net_g.cuda(rank)
else:
net_g = net_g.to(device=device)
if config.version == "v1":
periods = [2, 3, 5, 7, 11, 17]
elif config.version == "v2":
periods = [2, 3, 5, 7, 11, 17, 23, 37]
net_d = MultiPeriodDiscriminator(config.model.use_spectral_norm, periods=periods)
if is_multi_process:
net_d = net_d.cuda(rank)
else:
net_d = net_d.to(device=device)
optim_g = torch.optim.AdamW(
net_g.parameters(),
config.train.learning_rate,
betas=config.train.betas,
eps=config.train.eps,
)
optim_d = torch.optim.AdamW(
net_d.parameters(),
config.train.learning_rate,
betas=config.train.betas,
eps=config.train.eps,
)
last_d_state = utils.latest_checkpoint_path(state_dir, "D_*.pth")
last_g_state = utils.latest_checkpoint_path(state_dir, "G_*.pth")
if last_d_state is None or last_g_state is None:
epoch = 1
global_step = 0
if os.path.exists(pretrain_g) and os.path.exists(pretrain_d):
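            # Adapt the pretrained generator to this model's shapes: broadcast the
            # mean speaker embedding when spk_embed_dim differs, and bilinearly
            # interpolate enc_p.emb_phone when emb_channels differs.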
net_g_state = torch.load(pretrain_g, map_location="cpu")["model"]
emb_spk_size = (config.model.spk_embed_dim, config.model.gin_channels)
emb_phone_size = (config.model.hidden_channels, config.model.emb_channels)
if emb_spk_size != net_g_state["emb_g.weight"].size():
                original_weight = net_g_state["emb_g.weight"]
                net_g_state["emb_g.weight"] = original_weight.mean(
                    dim=0, keepdim=True
                ) * torch.ones(
                    emb_spk_size, device=original_weight.device, dtype=original_weight.dtype
                )
if emb_phone_size != net_g_state["enc_p.emb_phone.weight"].size():
# interpolate
orig_shape = net_g_state["enc_p.emb_phone.weight"].size()
if net_g_state["enc_p.emb_phone.weight"].dtype == torch.half:
net_g_state["enc_p.emb_phone.weight"] = (
F.interpolate(
net_g_state["enc_p.emb_phone.weight"]
.float()
.unsqueeze(0)
.unsqueeze(0),
size=emb_phone_size,
mode="bilinear",
)
.half()
.squeeze(0)
.squeeze(0)
)
else:
net_g_state["enc_p.emb_phone.weight"] = (
F.interpolate(
net_g_state["enc_p.emb_phone.weight"]
.unsqueeze(0)
.unsqueeze(0),
size=emb_phone_size,
mode="bilinear",
)
.squeeze(0)
.squeeze(0)
)
print(
"interpolated pretrained state enc_p.emb_phone from",
orig_shape,
"to",
emb_phone_size,
)
if is_multi_process:
net_g.module.load_state_dict(net_g_state)
else:
net_g.load_state_dict(net_g_state)
del net_g_state
if is_multi_process:
net_d.module.load_state_dict(
torch.load(pretrain_d, map_location="cpu")["model"]
)
else:
net_d.load_state_dict(
torch.load(pretrain_d, map_location="cpu")["model"]
)
if is_main_process:
print(f"loaded pretrained {pretrain_g} {pretrain_d}")
else:
_, _, _, epoch = utils.load_checkpoint(last_d_state, net_d, optim_d)
_, _, _, epoch = utils.load_checkpoint(last_g_state, net_g, optim_g)
if is_main_process:
print(f"loaded last state {last_d_state} {last_g_state}")
epoch += 1
global_step = (epoch - 1) * len(train_loader)
if augment:
# load embedder
embedder_filepath, _, embedder_load_from = get_embedder(embedder_name)
if embedder_load_from == "local":
embedder_filepath = os.path.join(
MODELS_DIR, "embeddings", embedder_filepath
)
embedder, _ = load_embedder(embedder_filepath, device)
if not config.train.fp16_run:
embedder = embedder.float()
        if augment_path is not None:
state_dict = torch.load(augment_path, map_location="cpu")
if state_dict["f0"] == 1:
augment_net_g = SynthesizerTrnMs256NSFSid(
**state_dict["params"], is_half=config.train.fp16_run
)
augment_speaker_info = np.load(speaker_info_path)
else:
augment_net_g = SynthesizerTrnMs256NSFSidNono(
**state_dict["params"], is_half=config.train.fp16_run
)
augment_net_g.load_state_dict(state_dict["weight"], strict=False)
augment_net_g.eval().to(device)
else:
augment_net_g = net_g
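        # Per-speaker median F0 over voiced frames, used as the pitch statistic
        # when re-synthesizing a batch as a different speaker during augmentation.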
if f0:
medians = [[] for _ in range(augment_net_g.spk_embed_dim)]
for file in training_meta.files.values():
f0f = np.load(file.f0nsf)
if np.any(f0f > 0):
medians[file.speaker_id].append(np.median(f0f[f0f > 0]))
augment_speaker_info = np.array([np.median(x) if len(x) else 0. for x in medians])
np.save(os.path.join(training_dir, "speaker_info.npy"), augment_speaker_info)
if is_multi_process:
net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank])
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
optim_g, gamma=config.train.lr_decay, last_epoch=epoch - 2
)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
optim_d, gamma=config.train.lr_decay, last_epoch=epoch - 2
)
scaler = GradScaler(enabled=config.train.fp16_run)
cache = []
progress_bar = tqdm.tqdm(range((total_epoch - epoch + 1) * len(train_loader)))
progress_bar.set_postfix(epoch=epoch)
step = -1 + len(train_loader) * (epoch - 1)
for epoch in range(epoch, total_epoch + 1):
train_loader.batch_sampler.set_epoch(epoch)
net_g.train()
net_d.train()
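        # Once every batch has been cached on-device (cache_in_gpu), reuse the
        # cache in shuffled order instead of iterating the DataLoader again.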
use_cache = len(cache) == len(train_loader)
data = cache if use_cache else enumerate(train_loader)
if is_main_process:
lr = optim_g.param_groups[0]["lr"]
if use_cache:
shuffle(cache)
for batch_idx, batch in data:
step += 1
progress_bar.update(1)
if f0:
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = batch
else:
(
phone,
phone_lengths,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = batch
if not use_cache:
phone, phone_lengths = (
phone.to(device=device, non_blocking=True),
phone_lengths.to(device=device, non_blocking=True),
)
if f0:
pitch, pitchf = (
pitch.to(device=device, non_blocking=True),
pitchf.to(device=device, non_blocking=True),
)
sid = sid.to(device=device, non_blocking=True)
spec, spec_lengths = (
spec.to(device=device, non_blocking=True),
spec_lengths.to(device=device, non_blocking=True),
)
wave, wave_lengths = (
wave.to(device=device, non_blocking=True),
wave_lengths.to(device=device, non_blocking=True),
)
if cache_in_gpu:
if f0:
cache.append(
(
batch_idx,
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
),
)
)
else:
cache.append(
(
batch_idx,
(
phone,
phone_lengths,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
),
)
)
with autocast(enabled=config.train.fp16_run):
if augment:
with torch.no_grad():
if type(augment_net_g) == SynthesizerTrnMs256NSFSid:
new_phone, aug_wave = change_speaker(augment_net_g, augment_speaker_info, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths)
else:
new_phone, aug_wave = change_speaker_nono(augment_net_g, embedder, embedding_output_layer, phone, phone_lengths, spec_lengths)
                    weight = np.power(.5, step / len(train_loader))  # early in training, keep the original phone embedding mostly intact
phone = phone * weight + new_phone * (1. - weight)
if f0:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(
phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid
)
else:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
mel = spec_to_mel_torch(
spec,
config.data.filter_length,
config.data.n_mel_channels,
config.data.sampling_rate,
config.data.mel_fmin,
config.data.mel_fmax,
)
y_mel = commons.slice_segments(
mel, ids_slice, config.train.segment_size // config.data.hop_length
)
with autocast(enabled=False):
y_hat_mel = mel_spectrogram_torch(
y_hat.float().squeeze(1),
config.data.filter_length,
config.data.n_mel_channels,
config.data.sampling_rate,
config.data.hop_length,
config.data.win_length,
config.data.mel_fmin,
config.data.mel_fmax,
)
                if config.train.fp16_run and device != torch.device("mps"):
y_hat_mel = y_hat_mel.half()
wave_slice = commons.slice_segments(
wave, ids_slice * config.data.hop_length, config.train.segment_size
) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave_slice, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
y_d_hat_r, y_d_hat_g
)
optim_d.zero_grad()
scaler.scale(loss_disc).backward()
scaler.unscale_(optim_d)
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=config.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave_slice, y_hat)
with autocast(enabled=False):
loss_mel = F.l1_loss(y_mel, y_hat_mel) * config.train.c_mel
loss_kl = (
kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl
)
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if is_main_process:
progress_bar.set_postfix(
epoch=epoch,
loss_g=float(loss_gen_all) if loss_gen_all is not None else 0.0,
loss_d=float(loss_disc) if loss_disc is not None else 0.0,
lr=float(lr) if lr is not None else 0.0,
use_cache=use_cache,
)
if global_step % config.train.log_interval == 0:
lr = optim_g.param_groups[0]["lr"]
                    # Clamp outliers so the TensorBoard loss curves stay readable
                    if loss_mel > 50:
                        loss_mel = 50
                    if loss_kl > 5:
                        loss_kl = 5
scalar_dict = {
"loss/g/total": loss_gen_all,
"loss/d/total": loss_disc,
"learning_rate": lr,
"grad_norm_d": grad_norm_d,
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{
"loss/g/fm": loss_fm,
"loss/g/mel": loss_mel,
"loss/g/kl": loss_kl,
}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{
"loss/d_r/{}".format(i): v
for i, v in enumerate(losses_disc_r)
}
)
scalar_dict.update(
{
"loss/d_g/{}".format(i): v
for i, v in enumerate(losses_disc_g)
}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
global_step += 1
if is_main_process and save_every_epoch != 0 and epoch % save_every_epoch == 0:
if save_only_last:
old_g_path = os.path.join(
state_dir, f"G_{epoch - save_every_epoch}.pth"
)
old_d_path = os.path.join(
state_dir, f"D_{epoch - save_every_epoch}.pth"
)
old_wav_path = os.path.join(
state_dir, f"wav_sample_{epoch - save_every_epoch}"
)
if os.path.exists(old_g_path):
os.remove(old_g_path)
if os.path.exists(old_d_path):
os.remove(old_d_path)
if os.path.exists(old_wav_path):
shutil.rmtree(old_wav_path)
if save_wav_with_checkpoint:
with autocast(enabled=config.train.fp16_run):
with torch.no_grad():
if f0:
pred_wave = net_g.infer(phone, phone_lengths, pitch, pitchf, sid)[0]
else:
pred_wave = net_g.infer(phone, phone_lengths, sid)[0]
os.makedirs(os.path.join(state_dir, f"wav_sample_{epoch}"), exist_ok=True)
for i in range(pred_wave.shape[0]):
torchaudio.save(filepath=os.path.join(state_dir, f"wav_sample_{epoch}", f"{i:02}_y_true.wav"), src=wave[i].detach().cpu().float(), sample_rate=int(sample_rate[:-1] + "000"))
torchaudio.save(filepath=os.path.join(state_dir, f"wav_sample_{epoch}", f"{i:02}_y_pred.wav"), src=pred_wave[i].detach().cpu().float(), sample_rate=int(sample_rate[:-1] + "000"))
if augment:
torchaudio.save(filepath=os.path.join(state_dir, f"wav_sample_{epoch}", f"{i:02}_y_aug.wav"), src=aug_wave[i].detach().cpu().float(), sample_rate=int(sample_rate[:-1] + "000"))
utils.save_state(
net_g,
optim_g,
config.train.learning_rate,
epoch,
os.path.join(state_dir, f"G_{epoch}.pth"),
)
utils.save_state(
net_d,
optim_d,
config.train.learning_rate,
epoch,
os.path.join(state_dir, f"D_{epoch}.pth"),
)
save(
net_g,
config.version,
sample_rate,
f0,
embedder_name,
embedder_out_channels,
embedding_output_layer,
os.path.join(training_dir, "checkpoints", f"{model_name}-{epoch}.pth"),
epoch,
speaker_info
)
scheduler_g.step()
scheduler_d.step()
if is_main_process:
print("Training is done. The program is closed.")
save(
net_g,
config.version,
sample_rate,
f0,
embedder_name,
embedder_out_channels,
embedding_output_layer,
os.path.join(out_dir, f"{model_name}.pth"),
epoch,
speaker_info
)
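

# A minimal invocation sketch (hypothetical paths and values; the real entry
# point that builds TrainConfig and runs preprocessing lives elsewhere in this
# package, and TrainConfig.parse_file is assumed to exist by analogy with
# DatasetMetadata.parse_file above):
#
#   config = TrainConfig.parse_file("configs/48k.json")
#   train_model(
#       gpus=[0],
#       config=config,
#       training_dir="training/my_model",
#       model_name="my_model",
#       out_dir="models",
#       sample_rate="48k",
#       f0=True,
#       batch_size=8,
#       augment=False,
#       augment_path=None,
#       speaker_info_path=None,
#       cache_batch=True,
#       total_epoch=100,
#       save_every_epoch=10,
#       save_wav_with_checkpoint=False,
#       pretrain_g="models/pretrained/G48k.pth",
#       pretrain_d="models/pretrained/D48k.pth",
#       embedder_name="hubert_base",
#       embedding_output_layer=12,
#   )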