Spaces:
Runtime error
Runtime error
Update train_ms.py
Browse files- train_ms.py +28 -19
train_ms.py
CHANGED
|
@@ -4,6 +4,7 @@ import argparse
|
|
| 4 |
import itertools
|
| 5 |
import math
|
| 6 |
import torch
|
|
|
|
| 7 |
from torch import nn, optim
|
| 8 |
from torch.nn import functional as F
|
| 9 |
from torch.utils.data import DataLoader
|
|
@@ -38,12 +39,8 @@ from text.symbols import symbols
|
|
| 38 |
|
| 39 |
torch.backends.cudnn.benchmark = True
|
| 40 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 41 |
-
torch.backends.cudnn.allow_tf32 = True
|
| 42 |
torch.set_float32_matmul_precision('medium')
|
| 43 |
-
torch.backends.cuda.sdp_kernel("flash")
|
| 44 |
-
torch.backends.cuda.enable_flash_sdp(True)
|
| 45 |
-
torch.backends.cuda.enable_mem_efficient_sdp(True) # Not avaliable if torch version is lower than 2.0
|
| 46 |
-
torch.backends.cuda.enable_math_sdp(True)
|
| 47 |
global_step = 0
|
| 48 |
|
| 49 |
|
|
@@ -56,6 +53,10 @@ def main():
|
|
| 56 |
os.environ['MASTER_PORT'] = '65280'
|
| 57 |
|
| 58 |
hps = utils.get_hparams()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
|
| 60 |
|
| 61 |
|
|
@@ -68,7 +69,7 @@ def run(rank, n_gpus, hps):
|
|
| 68 |
writer = SummaryWriter(log_dir=hps.model_dir)
|
| 69 |
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
|
| 70 |
|
| 71 |
-
dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
|
| 72 |
torch.manual_seed(hps.train.seed)
|
| 73 |
torch.cuda.set_device(rank)
|
| 74 |
|
|
@@ -81,9 +82,8 @@ def run(rank, n_gpus, hps):
|
|
| 81 |
rank=rank,
|
| 82 |
shuffle=True)
|
| 83 |
collate_fn = TextAudioSpeakerCollate()
|
| 84 |
-
train_loader = DataLoader(train_dataset, num_workers=
|
| 85 |
-
collate_fn=collate_fn, batch_sampler=train_sampler
|
| 86 |
-
persistent_workers=True,prefetch_factor=4) #256G Memory suitable loader.
|
| 87 |
if rank == 0:
|
| 88 |
eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
|
| 89 |
eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False,
|
|
@@ -155,20 +155,29 @@ def run(rank, n_gpus, hps):
|
|
| 155 |
net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
|
| 156 |
if net_dur_disc is not None:
|
| 157 |
net_dur_disc = DDP(net_dur_disc, device_ids=[rank], find_unused_parameters=True)
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
print(e)
|
| 170 |
epoch_str = 1
|
| 171 |
global_step = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
|
| 174 |
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
|
|
|
|
| 4 |
import itertools
|
| 5 |
import math
|
| 6 |
import torch
|
| 7 |
+
import shutil
|
| 8 |
from torch import nn, optim
|
| 9 |
from torch.nn import functional as F
|
| 10 |
from torch.utils.data import DataLoader
|
|
|
|
| 39 |
|
| 40 |
torch.backends.cudnn.benchmark = True
|
| 41 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 42 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 43 |
torch.set_float32_matmul_precision('medium')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
global_step = 0
|
| 45 |
|
| 46 |
|
|
|
|
| 53 |
os.environ['MASTER_PORT'] = '65280'
|
| 54 |
|
| 55 |
hps = utils.get_hparams()
|
| 56 |
+
if not hps.cont:
|
| 57 |
+
shutil.copy('./pretrained_models/D_0.pth','./logs/OUTPUT_MODEL/D_0.pth')
|
| 58 |
+
shutil.copy('./pretrained_models/G_0.pth','./logs/OUTPUT_MODEL/G_0.pth')
|
| 59 |
+
shutil.copy('./pretrained_models/DUR_0.pth','./logs/OUTPUT_MODEL/DUR_0.pth')
|
| 60 |
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
|
| 61 |
|
| 62 |
|
|
|
|
| 69 |
writer = SummaryWriter(log_dir=hps.model_dir)
|
| 70 |
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
|
| 71 |
|
| 72 |
+
dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank)
|
| 73 |
torch.manual_seed(hps.train.seed)
|
| 74 |
torch.cuda.set_device(rank)
|
| 75 |
|
|
|
|
| 82 |
rank=rank,
|
| 83 |
shuffle=True)
|
| 84 |
collate_fn = TextAudioSpeakerCollate()
|
| 85 |
+
train_loader = DataLoader(train_dataset, num_workers=2, shuffle=False, pin_memory=True,
|
| 86 |
+
collate_fn=collate_fn, batch_sampler=train_sampler)
|
|
|
|
| 87 |
if rank == 0:
|
| 88 |
eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
|
| 89 |
eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False,
|
|
|
|
| 155 |
net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
|
| 156 |
if net_dur_disc is not None:
|
| 157 |
net_dur_disc = DDP(net_dur_disc, device_ids=[rank], find_unused_parameters=True)
|
| 158 |
+
|
| 159 |
+
pretrain_dir = None
|
| 160 |
+
if pretrain_dir is None:
|
| 161 |
+
try:
|
| 162 |
+
if net_dur_disc is not None:
|
| 163 |
+
_, optim_dur_disc, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "DUR_*.pth"), net_dur_disc, optim_dur_disc, skip_optimizer=not hps.cont)
|
| 164 |
+
_, optim_g, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g,
|
| 165 |
+
optim_g, skip_optimizer=not hps.cont)
|
| 166 |
+
_, optim_d, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d,
|
| 167 |
+
optim_d, skip_optimizer=not hps.cont)
|
| 168 |
|
| 169 |
+
epoch_str = max(epoch_str, 1)
|
| 170 |
+
global_step = (epoch_str - 1) * len(train_loader)
|
| 171 |
+
except Exception as e:
|
| 172 |
print(e)
|
| 173 |
epoch_str = 1
|
| 174 |
global_step = 0
|
| 175 |
+
else:
|
| 176 |
+
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(pretrain_dir, "G_*.pth"), net_g,
|
| 177 |
+
optim_g, True)
|
| 178 |
+
_, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(pretrain_dir, "D_*.pth"), net_d,
|
| 179 |
+
optim_d, True)
|
| 180 |
+
|
| 181 |
|
| 182 |
|
| 183 |
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
|