v2
Browse files- .ipynb_checkpoints/models-checkpoint.py +2 -1
- .ipynb_checkpoints/optimizers-checkpoint.py +73 -0
- Configs/.ipynb_checkpoints/config_ft_single-checkpoint.yml +5 -5
- Configs/config_ft_single.yml +4 -4
- Demo/.ipynb_checkpoints/Inference_pod_90h_30k-checkpoint.ipynb +423 -31
- __pycache__/models.cpython-310.pyc +0 -0
- logs/pod_90h_30k_second_v2/.ipynb_checkpoints/train-checkpoint.log +0 -0
- logs/pod_90h_30k_second_v2/config_ft_single.yml +22 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00000.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00001.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00002.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00003.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00004.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00006.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00007.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00008.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00009.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00010.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00011.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00012.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00013.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00014.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00015.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00016.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00017.pth +3 -0
- logs/pod_90h_30k_second_v2/epoch_2nd_00018.pth +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758267.7f09b0e2c0b0.17026.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758489.7f09b0e2c0b0.18353.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758524.7f09b0e2c0b0.18773.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758552.7f09b0e2c0b0.19160.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758602.7f09b0e2c0b0.19654.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763142.7f09b0e2c0b0.41611.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763329.7f09b0e2c0b0.42740.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763548.7f09b0e2c0b0.44123.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749789808.7f09b0e2c0b0.1500.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749790964.7f09b0e2c0b0.2345.0 +3 -0
- logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749791414.7f09b0e2c0b0.1465.0 +3 -0
- logs/pod_90h_30k_second_v2/train.log +0 -0
- models.py +2 -1
.ipynb_checkpoints/models-checkpoint.py
CHANGED
|
@@ -703,8 +703,9 @@ def load_checkpoint(model, optimizer, path, load_only_params=True, ignore_module
|
|
| 703 |
_ = [model[key].eval() for key in model]
|
| 704 |
|
| 705 |
if not load_only_params:
|
| 706 |
-
epoch = state["epoch"]
|
| 707 |
iters = state["iters"]
|
|
|
|
| 708 |
optimizer.load_state_dict(state["optimizer"])
|
| 709 |
else:
|
| 710 |
epoch = 0
|
|
|
|
| 703 |
_ = [model[key].eval() for key in model]
|
| 704 |
|
| 705 |
if not load_only_params:
|
| 706 |
+
epoch = state["epoch"] + 1
|
| 707 |
iters = state["iters"]
|
| 708 |
+
print('Load checkpoint from %s, epoch %d, iters %d' % (path, epoch, iters))
|
| 709 |
optimizer.load_state_dict(state["optimizer"])
|
| 710 |
else:
|
| 711 |
epoch = 0
|
.ipynb_checkpoints/optimizers-checkpoint.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#coding:utf-8
|
| 2 |
+
import os, sys
|
| 3 |
+
import os.path as osp
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from torch import nn
|
| 7 |
+
from torch.optim import Optimizer
|
| 8 |
+
from functools import reduce
|
| 9 |
+
from torch.optim import AdamW
|
| 10 |
+
|
| 11 |
+
class MultiOptimizer:
    """Drive several named optimizers (and their LR schedulers) as one unit.

    Attributes:
        optimizers: mapping ``name -> optimizer``.
        schedulers: mapping ``name -> scheduler``.
        keys: optimizer names, in the mapping's iteration order.
        param_groups: flat list of every optimizer's ``param_groups`` entries
            (the group objects themselves are shared, not copied).
    """

    def __init__(self, optimizers=None, schedulers=None):
        """Store the optimizer/scheduler mappings.

        Args:
            optimizers: mapping ``name -> optimizer``; ``None`` means empty.
            schedulers: mapping ``name -> scheduler``; ``None`` means empty.
        """
        # Fresh dicts per instance: a ``{}`` default in the signature would be
        # shared by every MultiOptimizer created without arguments.
        self.optimizers = {} if optimizers is None else optimizers
        self.schedulers = {} if schedulers is None else schedulers
        self.keys = list(self.optimizers.keys())
        # Flatten without ``reduce`` so an empty mapping yields ``[]`` instead
        # of raising TypeError.
        self.param_groups = [
            group
            for optimizer in self.optimizers.values()
            for group in optimizer.param_groups
        ]

    def state_dict(self):
        """Return ``[(name, optimizer.state_dict()), ...]`` in ``keys`` order."""
        return [(key, self.optimizers[key].state_dict()) for key in self.keys]

    def load_state_dict(self, state_dict):
        """Restore each optimizer from ``(name, state)`` pairs.

        Loading is best-effort: an entry that fails to load (unknown name,
        incompatible state, ...) is reported on stdout and skipped so the
        remaining optimizers are still restored.
        """
        for key, val in state_dict:
            try:
                self.optimizers[key].load_state_dict(val)
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt / SystemExit still propagate.
                print("Unloaded %s" % key)

    def step(self, key=None, scaler=None):
        """Step one optimizer (``key``) or all of them (``key=None``).

        Args:
            key: name of the optimizer to step; ``None`` steps every one.
            scaler: optional object exposing ``step(optimizer)`` and
                ``update()`` (presumably a torch AMP ``GradScaler`` -- confirm
                against the caller).
        """
        keys = [key] if key is not None else self.keys
        if scaler is None:
            for name in keys:
                self.optimizers[name].step()
            return
        for name in keys:
            scaler.step(self.optimizers[name])
        # PyTorch's AMP guidance is to call ``update()`` once per iteration,
        # after every optimizer has stepped; updating between optimizers can
        # change the scale while later gradients are still scaled by the old one.
        if keys:
            scaler.update()

    def _step(self, key, scaler=None):
        """Step the single optimizer ``key``, updating ``scaler`` if given."""
        if scaler is not None:
            scaler.step(self.optimizers[key])
            scaler.update()
        else:
            self.optimizers[key].step()

    def zero_grad(self, key=None):
        """Clear gradients of one optimizer (``key``) or of all of them."""
        names = [key] if key is not None else self.keys
        for name in names:
            self.optimizers[name].zero_grad()

    def scheduler(self, *args, key=None):
        """Advance one scheduler (``key``) or all, forwarding ``*args`` to ``step``."""
        names = [key] if key is not None else self.keys
        for name in names:
            self.schedulers[name].step(*args)
|
| 52 |
+
|
| 53 |
+
def define_scheduler(optimizer, params):
    """Build a ``OneCycleLR`` scheduler for ``optimizer``.

    Args:
        optimizer: the optimizer whose learning rate is scheduled.
        params: mapping of optional overrides; ``None`` or an empty mapping
            selects every default. Recognised keys and their defaults:
            ``max_lr`` (2e-4), ``epochs`` (200), ``steps_per_epoch`` (1000),
            ``pct_start`` (0.0).

    Returns:
        A ``torch.optim.lr_scheduler.OneCycleLR`` instance.
    """
    params = params or {}
    # div_factor and final_div_factor are both pinned to 1; per OneCycleLR's
    # documented formulas (initial_lr = max_lr / div_factor,
    # min_lr = initial_lr / final_div_factor) the start and end learning
    # rates therefore both equal max_lr.
    return torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=params.get('max_lr', 2e-4),
        epochs=params.get('epochs', 200),
        steps_per_epoch=params.get('steps_per_epoch', 1000),
        pct_start=params.get('pct_start', 0.0),
        div_factor=1,
        final_div_factor=1,
    )
|
| 64 |
+
|
| 65 |
+
def build_optimizer(parameters_dict, scheduler_params_dict, lr,
                    weight_decay=1e-4, betas=(0.0, 0.99), eps=1e-9):
    """Create one AdamW optimizer and one scheduler per named parameter set.

    Args:
        parameters_dict: mapping ``name -> parameters`` handed to ``AdamW``.
        scheduler_params_dict: mapping ``name -> scheduler params``; it must
            contain every name in ``parameters_dict`` (a missing name raises
            ``KeyError``).
        lr: learning rate passed to every ``AdamW`` instance.
        weight_decay: AdamW weight decay (default matches the previous
            hard-coded value).
        betas: AdamW betas (default matches the previous hard-coded value).
        eps: AdamW epsilon (default matches the previous hard-coded value).

    Returns:
        A ``MultiOptimizer`` wrapping the optimizers and their schedulers.
    """
    optim = {
        key: AdamW(params, lr=lr, weight_decay=weight_decay, betas=betas, eps=eps)
        for key, params in parameters_dict.items()
    }
    schedulers = {
        key: define_scheduler(opt, scheduler_params_dict[key])
        for key, opt in optim.items()
    }
    return MultiOptimizer(optim, schedulers)
|
Configs/.ipynb_checkpoints/config_ft_single-checkpoint.yml
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# ─── GLOBAL ──────────────────────────────────────────────────────────
|
| 2 |
-
log_dir: logs/
|
| 3 |
device: "cuda"
|
| 4 |
|
| 5 |
batch_size: 12 # 40 GB A100, fp16
|
|
@@ -11,7 +11,7 @@ save_freq: 1
|
|
| 11 |
log_interval: 50
|
| 12 |
|
| 13 |
# leave blank on first run
|
| 14 |
-
pretrained_model: "
|
| 15 |
second_stage_load_pretrained: true
|
| 16 |
load_only_params: false
|
| 17 |
|
|
@@ -48,14 +48,14 @@ loss_params:
|
|
| 48 |
lambda_sty: 1. # style reconstruction loss (2nd stage)
|
| 49 |
lambda_diff: 1. # score matching loss (2nd stage)
|
| 50 |
|
| 51 |
-
diff_epoch:
|
| 52 |
-
joint_epoch:
|
| 53 |
|
| 54 |
# ─── OPTIMISER ──────────────────────────────────────────────────────
|
| 55 |
optimizer_params:
|
| 56 |
lr: 0.0001
|
| 57 |
bert_lr: 0.00001
|
| 58 |
-
ft_lr: 0.
|
| 59 |
grad_accum_steps: 2
|
| 60 |
|
| 61 |
# ─── MODEL (core network & sub-modules) ─────────────────────────────
|
|
|
|
| 1 |
# ─── GLOBAL ──────────────────────────────────────────────────────────
|
| 2 |
+
log_dir: logs/pod_90h_30k_second_v2
|
| 3 |
device: "cuda"
|
| 4 |
|
| 5 |
batch_size: 12 # 40 GB A100, fp16
|
|
|
|
| 11 |
log_interval: 50
|
| 12 |
|
| 13 |
# leave blank on first run
|
| 14 |
+
pretrained_model: "/workspace/styletts2/logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth"
|
| 15 |
second_stage_load_pretrained: true
|
| 16 |
load_only_params: false
|
| 17 |
|
|
|
|
| 48 |
lambda_sty: 1. # style reconstruction loss (2nd stage)
|
| 49 |
lambda_diff: 1. # score matching loss (2nd stage)
|
| 50 |
|
| 51 |
+
diff_epoch: 1 # style diffusion starting epoch (2nd stage)
|
| 52 |
+
joint_epoch: 5 # joint training starting epoch (2nd stage)
|
| 53 |
|
| 54 |
# ─── OPTIMISER ──────────────────────────────────────────────────────
|
| 55 |
optimizer_params:
|
| 56 |
lr: 0.0001
|
| 57 |
bert_lr: 0.00001
|
| 58 |
+
ft_lr: 0.00001
|
| 59 |
grad_accum_steps: 2
|
| 60 |
|
| 61 |
# ─── MODEL (core network & sub-modules) ─────────────────────────────
|
Configs/config_ft_single.yml
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# ─── GLOBAL ──────────────────────────────────────────────────────────
|
| 2 |
-
log_dir: logs/
|
| 3 |
device: "cuda"
|
| 4 |
|
| 5 |
batch_size: 12 # 40 GB A100, fp16
|
|
@@ -11,7 +11,7 @@ save_freq: 1
|
|
| 11 |
log_interval: 50
|
| 12 |
|
| 13 |
# leave blank on first run
|
| 14 |
-
pretrained_model: "
|
| 15 |
second_stage_load_pretrained: true
|
| 16 |
load_only_params: false
|
| 17 |
|
|
@@ -48,8 +48,8 @@ loss_params:
|
|
| 48 |
lambda_sty: 1. # style reconstruction loss (2nd stage)
|
| 49 |
lambda_diff: 1. # score matching loss (2nd stage)
|
| 50 |
|
| 51 |
-
diff_epoch:
|
| 52 |
-
joint_epoch:
|
| 53 |
|
| 54 |
# ─── OPTIMISER ──────────────────────────────────────────────────────
|
| 55 |
optimizer_params:
|
|
|
|
| 1 |
# ─── GLOBAL ──────────────────────────────────────────────────────────
|
| 2 |
+
log_dir: logs/pod_90h_30k_second_v2
|
| 3 |
device: "cuda"
|
| 4 |
|
| 5 |
batch_size: 12 # 40 GB A100, fp16
|
|
|
|
| 11 |
log_interval: 50
|
| 12 |
|
| 13 |
# leave blank on first run
|
| 14 |
+
pretrained_model: "/workspace/styletts2/logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth"
|
| 15 |
second_stage_load_pretrained: true
|
| 16 |
load_only_params: false
|
| 17 |
|
|
|
|
| 48 |
lambda_sty: 1. # style reconstruction loss (2nd stage)
|
| 49 |
lambda_diff: 1. # score matching loss (2nd stage)
|
| 50 |
|
| 51 |
+
diff_epoch: 1 # style diffusion starting epoch (2nd stage)
|
| 52 |
+
joint_epoch: 5 # joint training starting epoch (2nd stage)
|
| 53 |
|
| 54 |
# ─── OPTIMISER ──────────────────────────────────────────────────────
|
| 55 |
optimizer_params:
|
Demo/.ipynb_checkpoints/Inference_pod_90h_30k-checkpoint.ipynb
CHANGED
|
@@ -20,7 +20,7 @@
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"cell_type": "code",
|
| 23 |
-
"execution_count":
|
| 24 |
"id": "96e173bf",
|
| 25 |
"metadata": {},
|
| 26 |
"outputs": [],
|
|
@@ -39,20 +39,227 @@
|
|
| 39 |
},
|
| 40 |
{
|
| 41 |
"cell_type": "code",
|
| 42 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
"id": "da84c60f",
|
| 44 |
"metadata": {},
|
| 45 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
"source": [
|
| 47 |
"%cd .."
|
| 48 |
]
|
| 49 |
},
|
| 50 |
{
|
| 51 |
"cell_type": "code",
|
| 52 |
-
"execution_count":
|
| 53 |
"id": "5a3ddcc8",
|
| 54 |
"metadata": {},
|
| 55 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
"source": [
|
| 57 |
"# load packages\n",
|
| 58 |
"import time\n",
|
|
@@ -77,7 +284,7 @@
|
|
| 77 |
},
|
| 78 |
{
|
| 79 |
"cell_type": "code",
|
| 80 |
-
"execution_count":
|
| 81 |
"id": "00ee05e1",
|
| 82 |
"metadata": {},
|
| 83 |
"outputs": [],
|
|
@@ -113,7 +320,7 @@
|
|
| 113 |
},
|
| 114 |
{
|
| 115 |
"cell_type": "code",
|
| 116 |
-
"execution_count":
|
| 117 |
"id": "bbdc04c0",
|
| 118 |
"metadata": {},
|
| 119 |
"outputs": [],
|
|
@@ -121,6 +328,111 @@
|
|
| 121 |
"device = 'cuda' if torch.cuda.is_available() else 'cpu'"
|
| 122 |
]
|
| 123 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
{
|
| 125 |
"cell_type": "markdown",
|
| 126 |
"id": "7b9cecbe",
|
|
@@ -131,7 +443,7 @@
|
|
| 131 |
},
|
| 132 |
{
|
| 133 |
"cell_type": "code",
|
| 134 |
-
"execution_count":
|
| 135 |
"id": "64fc4c0f",
|
| 136 |
"metadata": {},
|
| 137 |
"outputs": [],
|
|
@@ -143,12 +455,12 @@
|
|
| 143 |
},
|
| 144 |
{
|
| 145 |
"cell_type": "code",
|
| 146 |
-
"execution_count":
|
| 147 |
"id": "48e7b644",
|
| 148 |
"metadata": {},
|
| 149 |
"outputs": [],
|
| 150 |
"source": [
|
| 151 |
-
"config = yaml.safe_load(open(\"
|
| 152 |
"\n",
|
| 153 |
"# load pretrained ASR model\n",
|
| 154 |
"ASR_config = config.get('ASR_config', False)\n",
|
|
@@ -167,10 +479,21 @@
|
|
| 167 |
},
|
| 168 |
{
|
| 169 |
"cell_type": "code",
|
| 170 |
-
"execution_count":
|
| 171 |
"id": "ffc18cf7",
|
| 172 |
"metadata": {},
|
| 173 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
"source": [
|
| 175 |
"model_params = recursive_munch(config['model_params'])\n",
|
| 176 |
"model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
|
|
@@ -180,21 +503,41 @@
|
|
| 180 |
},
|
| 181 |
{
|
| 182 |
"cell_type": "code",
|
| 183 |
-
"execution_count":
|
| 184 |
"id": "64529d5c",
|
| 185 |
"metadata": {},
|
| 186 |
"outputs": [],
|
| 187 |
"source": [
|
| 188 |
-
"params_whole = torch.load(\"
|
| 189 |
"params = params_whole['net']"
|
| 190 |
]
|
| 191 |
},
|
| 192 |
{
|
| 193 |
"cell_type": "code",
|
| 194 |
-
"execution_count":
|
| 195 |
"id": "895d9706",
|
| 196 |
"metadata": {},
|
| 197 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
"source": [
|
| 199 |
"for key in model:\n",
|
| 200 |
" if key in params:\n",
|
|
@@ -217,7 +560,7 @@
|
|
| 217 |
},
|
| 218 |
{
|
| 219 |
"cell_type": "code",
|
| 220 |
-
"execution_count":
|
| 221 |
"id": "c1a59db2",
|
| 222 |
"metadata": {},
|
| 223 |
"outputs": [],
|
|
@@ -227,7 +570,7 @@
|
|
| 227 |
},
|
| 228 |
{
|
| 229 |
"cell_type": "code",
|
| 230 |
-
"execution_count":
|
| 231 |
"id": "e30985ab",
|
| 232 |
"metadata": {},
|
| 233 |
"outputs": [],
|
|
@@ -250,7 +593,7 @@
|
|
| 250 |
},
|
| 251 |
{
|
| 252 |
"cell_type": "code",
|
| 253 |
-
"execution_count":
|
| 254 |
"id": "ca57469c",
|
| 255 |
"metadata": {},
|
| 256 |
"outputs": [],
|
|
@@ -335,7 +678,7 @@
|
|
| 335 |
},
|
| 336 |
{
|
| 337 |
"cell_type": "code",
|
| 338 |
-
"execution_count":
|
| 339 |
"id": "cace9787",
|
| 340 |
"metadata": {},
|
| 341 |
"outputs": [],
|
|
@@ -454,6 +797,22 @@
|
|
| 454 |
" display(ipd.Audio(path, rate=24000, normalize=False))"
|
| 455 |
]
|
| 456 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
{
|
| 458 |
"cell_type": "markdown",
|
| 459 |
"id": "141e91b3",
|
|
@@ -470,7 +829,7 @@
|
|
| 470 |
},
|
| 471 |
{
|
| 472 |
"cell_type": "code",
|
| 473 |
-
"execution_count":
|
| 474 |
"id": "81addda4",
|
| 475 |
"metadata": {},
|
| 476 |
"outputs": [],
|
|
@@ -481,9 +840,46 @@
|
|
| 481 |
{
|
| 482 |
"cell_type": "code",
|
| 483 |
"execution_count": null,
|
| 484 |
-
"id": "
|
| 485 |
"metadata": {},
|
| 486 |
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
"source": [
|
| 488 |
"texts = {}\n",
|
| 489 |
"texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
|
|
@@ -913,9 +1309,7 @@
|
|
| 913 |
"cell_type": "code",
|
| 914 |
"execution_count": null,
|
| 915 |
"id": "6d0a3825",
|
| 916 |
-
"metadata": {
|
| 917 |
-
"scrolled": false
|
| 918 |
-
},
|
| 919 |
"outputs": [],
|
| 920 |
"source": [
|
| 921 |
"path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
|
|
@@ -1110,9 +1504,7 @@
|
|
| 1110 |
"cell_type": "code",
|
| 1111 |
"execution_count": null,
|
| 1112 |
"id": "44a4cea1",
|
| 1113 |
-
"metadata": {
|
| 1114 |
-
"scrolled": false
|
| 1115 |
-
},
|
| 1116 |
"outputs": [],
|
| 1117 |
"source": [
|
| 1118 |
"start = time.time()\n",
|
|
@@ -1133,9 +1525,9 @@
|
|
| 1133 |
],
|
| 1134 |
"metadata": {
|
| 1135 |
"kernelspec": {
|
| 1136 |
-
"display_name": "
|
| 1137 |
"language": "python",
|
| 1138 |
-
"name": "
|
| 1139 |
},
|
| 1140 |
"language_info": {
|
| 1141 |
"codemirror_mode": {
|
|
@@ -1147,7 +1539,7 @@
|
|
| 1147 |
"name": "python",
|
| 1148 |
"nbconvert_exporter": "python",
|
| 1149 |
"pygments_lexer": "ipython3",
|
| 1150 |
-
"version": "3.
|
| 1151 |
}
|
| 1152 |
},
|
| 1153 |
"nbformat": 4,
|
|
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"cell_type": "code",
|
| 23 |
+
"execution_count": 1,
|
| 24 |
"id": "96e173bf",
|
| 25 |
"metadata": {},
|
| 26 |
"outputs": [],
|
|
|
|
| 39 |
},
|
| 40 |
{
|
| 41 |
"cell_type": "code",
|
| 42 |
+
"execution_count": 4,
|
| 43 |
+
"id": "2458c639-10a0-4b57-8602-22bc893c5176",
|
| 44 |
+
"metadata": {
|
| 45 |
+
"scrolled": true
|
| 46 |
+
},
|
| 47 |
+
"outputs": [
|
| 48 |
+
{
|
| 49 |
+
"name": "stdout",
|
| 50 |
+
"output_type": "stream",
|
| 51 |
+
"text": [
|
| 52 |
+
"Collecting git+https://github.com/resemble-ai/monotonic_align.git (from -r requirements.txt (line 17))\n",
|
| 53 |
+
" Cloning https://github.com/resemble-ai/monotonic_align.git to /tmp/pip-req-build-ps9pa2ga\n",
|
| 54 |
+
" Running command git clone --filter=blob:none --quiet https://github.com/resemble-ai/monotonic_align.git /tmp/pip-req-build-ps9pa2ga\n",
|
| 55 |
+
" Resolved https://github.com/resemble-ai/monotonic_align.git to commit c6e5e6cb19882164027eb6e35118e841eed9298e\n",
|
| 56 |
+
" Installing build dependencies ... \u001b[?25ldone\n",
|
| 57 |
+
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
|
| 58 |
+
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
|
| 59 |
+
"\u001b[?25hCollecting SoundFile (from -r requirements.txt (line 1))\n",
|
| 60 |
+
" Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)\n",
|
| 61 |
+
"Requirement already satisfied: torchaudio in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 2)) (2.6.0+cu126)\n",
|
| 62 |
+
"Collecting munch (from -r requirements.txt (line 3))\n",
|
| 63 |
+
" Using cached munch-4.0.0-py2.py3-none-any.whl.metadata (5.9 kB)\n",
|
| 64 |
+
"Requirement already satisfied: torch in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 4)) (2.6.0+cu126)\n",
|
| 65 |
+
"Collecting pydub (from -r requirements.txt (line 5))\n",
|
| 66 |
+
" Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n",
|
| 67 |
+
"Requirement already satisfied: pyyaml in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 6)) (6.0.2)\n",
|
| 68 |
+
"Collecting librosa (from -r requirements.txt (line 7))\n",
|
| 69 |
+
" Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)\n",
|
| 70 |
+
"Collecting nltk (from -r requirements.txt (line 8))\n",
|
| 71 |
+
" Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)\n",
|
| 72 |
+
"Collecting matplotlib (from -r requirements.txt (line 9))\n",
|
| 73 |
+
" Downloading matplotlib-3.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
|
| 74 |
+
"Collecting accelerate (from -r requirements.txt (line 10))\n",
|
| 75 |
+
" Using cached accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)\n",
|
| 76 |
+
"Collecting transformers (from -r requirements.txt (line 11))\n",
|
| 77 |
+
" Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)\n",
|
| 78 |
+
"Collecting einops (from -r requirements.txt (line 12))\n",
|
| 79 |
+
" Using cached einops-0.8.1-py3-none-any.whl.metadata (13 kB)\n",
|
| 80 |
+
"Collecting einops-exts (from -r requirements.txt (line 13))\n",
|
| 81 |
+
" Using cached einops_exts-0.0.4-py3-none-any.whl.metadata (621 bytes)\n",
|
| 82 |
+
"Requirement already satisfied: tqdm in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 14)) (4.67.1)\n",
|
| 83 |
+
"Collecting typing (from -r requirements.txt (line 15))\n",
|
| 84 |
+
" Using cached typing-3.7.4.3.tar.gz (78 kB)\n",
|
| 85 |
+
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
| 86 |
+
"\u001b[?25hRequirement already satisfied: typing-extensions in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 16)) (4.13.2)\n",
|
| 87 |
+
"Collecting cffi>=1.0 (from SoundFile->-r requirements.txt (line 1))\n",
|
| 88 |
+
" Downloading cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
|
| 89 |
+
"Requirement already satisfied: numpy in /venv/main/lib/python3.12/site-packages (from SoundFile->-r requirements.txt (line 1)) (2.1.2)\n",
|
| 90 |
+
"Requirement already satisfied: filelock in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.18.0)\n",
|
| 91 |
+
"Requirement already satisfied: setuptools in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (70.2.0)\n",
|
| 92 |
+
"Requirement already satisfied: sympy==1.13.1 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (1.13.1)\n",
|
| 93 |
+
"Requirement already satisfied: networkx in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.3)\n",
|
| 94 |
+
"Requirement already satisfied: jinja2 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.1.4)\n",
|
| 95 |
+
"Requirement already satisfied: fsspec in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (2025.3.2)\n",
|
| 96 |
+
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.77)\n",
|
| 97 |
+
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.77)\n",
|
| 98 |
+
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.80)\n",
|
| 99 |
+
"Requirement already satisfied: nvidia-cudnn-cu12==9.5.1.17 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (9.5.1.17)\n",
|
| 100 |
+
"Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.4.1)\n",
|
| 101 |
+
"Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (11.3.0.4)\n",
|
| 102 |
+
"Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (10.3.7.77)\n",
|
| 103 |
+
"Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (11.7.1.2)\n",
|
| 104 |
+
"Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.5.4.2)\n",
|
| 105 |
+
"Requirement already satisfied: nvidia-cusparselt-cu12==0.6.3 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (0.6.3)\n",
|
| 106 |
+
"Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (2.21.5)\n",
|
| 107 |
+
"Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.77)\n",
|
| 108 |
+
"Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.85)\n",
|
| 109 |
+
"Requirement already satisfied: triton==3.2.0 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.2.0)\n",
|
| 110 |
+
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /venv/main/lib/python3.12/site-packages (from sympy==1.13.1->torch->-r requirements.txt (line 4)) (1.3.0)\n",
|
| 111 |
+
"Collecting audioread>=2.1.9 (from librosa->-r requirements.txt (line 7))\n",
|
| 112 |
+
" Using cached audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)\n",
|
| 113 |
+
"Collecting numba>=0.51.0 (from librosa->-r requirements.txt (line 7))\n",
|
| 114 |
+
" Downloading numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)\n",
|
| 115 |
+
"Collecting scipy>=1.6.0 (from librosa->-r requirements.txt (line 7))\n",
|
| 116 |
+
" Downloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n",
|
| 117 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.0/62.0 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
| 118 |
+
"\u001b[?25hCollecting scikit-learn>=1.1.0 (from librosa->-r requirements.txt (line 7))\n",
|
| 119 |
+
" Downloading scikit_learn-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)\n",
|
| 120 |
+
"Collecting joblib>=1.0 (from librosa->-r requirements.txt (line 7))\n",
|
| 121 |
+
" Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)\n",
|
| 122 |
+
"Requirement already satisfied: decorator>=4.3.0 in /venv/main/lib/python3.12/site-packages (from librosa->-r requirements.txt (line 7)) (5.2.1)\n",
|
| 123 |
+
"Collecting pooch>=1.1 (from librosa->-r requirements.txt (line 7))\n",
|
| 124 |
+
" Using cached pooch-1.8.2-py3-none-any.whl.metadata (10 kB)\n",
|
| 125 |
+
"Collecting soxr>=0.3.2 (from librosa->-r requirements.txt (line 7))\n",
|
| 126 |
+
" Downloading soxr-0.5.0.post1-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
|
| 127 |
+
"Collecting lazy_loader>=0.1 (from librosa->-r requirements.txt (line 7))\n",
|
| 128 |
+
" Using cached lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)\n",
|
| 129 |
+
"Collecting msgpack>=1.0 (from librosa->-r requirements.txt (line 7))\n",
|
| 130 |
+
" Downloading msgpack-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)\n",
|
| 131 |
+
"Collecting click (from nltk->-r requirements.txt (line 8))\n",
|
| 132 |
+
" Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)\n",
|
| 133 |
+
"Collecting regex>=2021.8.3 (from nltk->-r requirements.txt (line 8))\n",
|
| 134 |
+
" Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
|
| 135 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.5/40.5 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 136 |
+
"\u001b[?25hCollecting contourpy>=1.0.1 (from matplotlib->-r requirements.txt (line 9))\n",
|
| 137 |
+
" Downloading contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)\n",
|
| 138 |
+
"Collecting cycler>=0.10 (from matplotlib->-r requirements.txt (line 9))\n",
|
| 139 |
+
" Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n",
|
| 140 |
+
"Collecting fonttools>=4.22.0 (from matplotlib->-r requirements.txt (line 9))\n",
|
| 141 |
+
" Downloading fonttools-4.58.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (106 kB)\n",
|
| 142 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m106.3/106.3 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
| 143 |
+
"\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib->-r requirements.txt (line 9))\n",
|
| 144 |
+
" Downloading kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)\n",
|
| 145 |
+
"Requirement already satisfied: packaging>=20.0 in /venv/main/lib/python3.12/site-packages (from matplotlib->-r requirements.txt (line 9)) (25.0)\n",
|
| 146 |
+
"Requirement already satisfied: pillow>=8 in /venv/main/lib/python3.12/site-packages (from matplotlib->-r requirements.txt (line 9)) (11.0.0)\n",
|
| 147 |
+
"Collecting pyparsing>=2.3.1 (from matplotlib->-r requirements.txt (line 9))\n",
|
| 148 |
+
" Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)\n",
|
| 149 |
+
"Requirement already satisfied: python-dateutil>=2.7 in /venv/main/lib/python3.12/site-packages (from matplotlib->-r requirements.txt (line 9)) (2.9.0.post0)\n",
|
| 150 |
+
"Requirement already satisfied: psutil in /venv/main/lib/python3.12/site-packages (from accelerate->-r requirements.txt (line 10)) (7.0.0)\n",
|
| 151 |
+
"Requirement already satisfied: huggingface-hub>=0.21.0 in /venv/main/lib/python3.12/site-packages (from accelerate->-r requirements.txt (line 10)) (0.30.2)\n",
|
| 152 |
+
"Collecting safetensors>=0.4.3 (from accelerate->-r requirements.txt (line 10))\n",
|
| 153 |
+
" Using cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
|
| 154 |
+
"Requirement already satisfied: requests in /venv/main/lib/python3.12/site-packages (from transformers->-r requirements.txt (line 11)) (2.32.3)\n",
|
| 155 |
+
"Collecting tokenizers<0.22,>=0.21 (from transformers->-r requirements.txt (line 11))\n",
|
| 156 |
+
" Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)\n",
|
| 157 |
+
"Collecting pycparser (from cffi>=1.0->SoundFile->-r requirements.txt (line 1))\n",
|
| 158 |
+
" Using cached pycparser-2.22-py3-none-any.whl.metadata (943 bytes)\n",
|
| 159 |
+
"Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.51.0->librosa->-r requirements.txt (line 7))\n",
|
| 160 |
+
" Downloading llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)\n",
|
| 161 |
+
"Requirement already satisfied: platformdirs>=2.5.0 in /venv/main/lib/python3.12/site-packages (from pooch>=1.1->librosa->-r requirements.txt (line 7)) (4.3.7)\n",
|
| 162 |
+
"Requirement already satisfied: six>=1.5 in /venv/main/lib/python3.12/site-packages (from python-dateutil>=2.7->matplotlib->-r requirements.txt (line 9)) (1.17.0)\n",
|
| 163 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (3.4.1)\n",
|
| 164 |
+
"Requirement already satisfied: idna<4,>=2.5 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (3.10)\n",
|
| 165 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (2.4.0)\n",
|
| 166 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (2025.4.26)\n",
|
| 167 |
+
"Collecting threadpoolctl>=3.1.0 (from scikit-learn>=1.1.0->librosa->-r requirements.txt (line 7))\n",
|
| 168 |
+
" Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)\n",
|
| 169 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /venv/main/lib/python3.12/site-packages (from jinja2->torch->-r requirements.txt (line 4)) (2.1.5)\n",
|
| 170 |
+
"Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl (1.3 MB)\n",
|
| 171 |
+
"Using cached munch-4.0.0-py2.py3-none-any.whl (9.9 kB)\n",
|
| 172 |
+
"Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
|
| 173 |
+
"Using cached librosa-0.11.0-py3-none-any.whl (260 kB)\n",
|
| 174 |
+
"Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)\n",
|
| 175 |
+
"Downloading matplotlib-3.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)\n",
|
| 176 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.6/8.6 MB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
| 177 |
+
"\u001b[?25hUsing cached accelerate-1.7.0-py3-none-any.whl (362 kB)\n",
|
| 178 |
+
"Using cached transformers-4.52.4-py3-none-any.whl (10.5 MB)\n",
|
| 179 |
+
"Using cached einops-0.8.1-py3-none-any.whl (64 kB)\n",
|
| 180 |
+
"Using cached einops_exts-0.0.4-py3-none-any.whl (3.9 kB)\n",
|
| 181 |
+
"Using cached audioread-3.0.1-py3-none-any.whl (23 kB)\n",
|
| 182 |
+
"Downloading cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (479 kB)\n",
|
| 183 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.4/479.4 kB\u001b[0m \u001b[31m169.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 184 |
+
"\u001b[?25hDownloading contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (323 kB)\n",
|
| 185 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m323.7/323.7 kB\u001b[0m \u001b[31m127.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 186 |
+
"\u001b[?25hUsing cached cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
|
| 187 |
+
"Downloading fonttools-4.58.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)\n",
|
| 188 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
| 189 |
+
"\u001b[?25hUsing cached joblib-1.5.1-py3-none-any.whl (307 kB)\n",
|
| 190 |
+
"Downloading kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)\n",
|
| 191 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m185.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 192 |
+
"\u001b[?25hUsing cached lazy_loader-0.4-py3-none-any.whl (12 kB)\n",
|
| 193 |
+
"Downloading msgpack-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (401 kB)\n",
|
| 194 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m401.4/401.4 kB\u001b[0m \u001b[31m192.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 195 |
+
"\u001b[?25hDownloading numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.9 MB)\n",
|
| 196 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.9/3.9 MB\u001b[0m \u001b[31m42.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
| 197 |
+
"\u001b[?25hUsing cached pooch-1.8.2-py3-none-any.whl (64 kB)\n",
|
| 198 |
+
"Using cached pyparsing-3.2.3-py3-none-any.whl (111 kB)\n",
|
| 199 |
+
"Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)\n",
|
| 200 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m796.9/796.9 kB\u001b[0m \u001b[31m125.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 201 |
+
"\u001b[?25hUsing cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)\n",
|
| 202 |
+
"Downloading scikit_learn-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.5 MB)\n",
|
| 203 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.5/12.5 MB\u001b[0m \u001b[31m43.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
| 204 |
+
"\u001b[?25hDownloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.3 MB)\n",
|
| 205 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m37.3/37.3 MB\u001b[0m \u001b[31m26.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
| 206 |
+
"\u001b[?25hDownloading soxr-0.5.0.post1-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (248 kB)\n",
|
| 207 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m248.5/248.5 kB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 208 |
+
"\u001b[?25hUsing cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n",
|
| 209 |
+
"Using cached click-8.2.1-py3-none-any.whl (102 kB)\n",
|
| 210 |
+
"Downloading llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.4 MB)\n",
|
| 211 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.4/42.4 MB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
| 212 |
+
"\u001b[?25hUsing cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)\n",
|
| 213 |
+
"Using cached pycparser-2.22-py3-none-any.whl (117 kB)\n",
|
| 214 |
+
"Building wheels for collected packages: typing, monotonic_align\n",
|
| 215 |
+
" Building wheel for typing (setup.py) ... \u001b[?25ldone\n",
|
| 216 |
+
"\u001b[?25h Created wheel for typing: filename=typing-3.7.4.3-py3-none-any.whl size=26304 sha256=7bd8523fe1f7cb4e20da87ee646956891addbdea2d87074f6bbf77fe282e8720\n",
|
| 217 |
+
" Stored in directory: /root/.cache/pip/wheels/12/98/52/2bffe242a9a487f00886e43b8ed8dac46456702e11a0d6abef\n",
|
| 218 |
+
" Building wheel for monotonic_align (pyproject.toml) ... \u001b[?25ldone\n",
|
| 219 |
+
"\u001b[?25h Created wheel for monotonic_align: filename=monotonic_align-1.2-cp312-cp312-linux_x86_64.whl size=1543517 sha256=dc9566d3e5a0656ebf939e760d934e0926d435f336db84e0019c7391576cd4cc\n",
|
| 220 |
+
" Stored in directory: /tmp/pip-ephem-wheel-cache-0gzg26zy/wheels/76/0a/37/00634137cd000799e060087bd1cb49a060ac6a48fc42a15488\n",
|
| 221 |
+
"Successfully built typing monotonic_align\n",
|
| 222 |
+
"Installing collected packages: pydub, typing, threadpoolctl, soxr, scipy, safetensors, regex, pyparsing, pycparser, munch, msgpack, monotonic_align, llvmlite, lazy_loader, kiwisolver, joblib, fonttools, einops, cycler, contourpy, click, audioread, scikit-learn, pooch, numba, nltk, matplotlib, einops-exts, cffi, tokenizers, SoundFile, transformers, librosa, accelerate\n",
|
| 223 |
+
"Successfully installed SoundFile-0.13.1 accelerate-1.7.0 audioread-3.0.1 cffi-1.17.1 click-8.2.1 contourpy-1.3.2 cycler-0.12.1 einops-0.8.1 einops-exts-0.0.4 fonttools-4.58.2 joblib-1.5.1 kiwisolver-1.4.8 lazy_loader-0.4 librosa-0.11.0 llvmlite-0.44.0 matplotlib-3.10.3 monotonic_align-1.2 msgpack-1.1.0 munch-4.0.0 nltk-3.9.1 numba-0.61.2 pooch-1.8.2 pycparser-2.22 pydub-0.25.1 pyparsing-3.2.3 regex-2024.11.6 safetensors-0.5.3 scikit-learn-1.7.0 scipy-1.15.3 soxr-0.5.0.post1 threadpoolctl-3.6.0 tokenizers-0.21.1 transformers-4.52.4 typing-3.7.4.3\n"
|
| 224 |
+
]
|
| 225 |
+
}
|
| 226 |
+
],
|
| 227 |
+
"source": [
|
| 228 |
+
"!pip install -r requirements.txt"
|
| 229 |
+
]
|
| 230 |
+
},
|
| 231 |
+
{
|
| 232 |
+
"cell_type": "code",
|
| 233 |
+
"execution_count": 2,
|
| 234 |
"id": "da84c60f",
|
| 235 |
"metadata": {},
|
| 236 |
+
"outputs": [
|
| 237 |
+
{
|
| 238 |
+
"name": "stdout",
|
| 239 |
+
"output_type": "stream",
|
| 240 |
+
"text": [
|
| 241 |
+
"/workspace/styletts2\n"
|
| 242 |
+
]
|
| 243 |
+
}
|
| 244 |
+
],
|
| 245 |
"source": [
|
| 246 |
"%cd .."
|
| 247 |
]
|
| 248 |
},
|
| 249 |
{
|
| 250 |
"cell_type": "code",
|
| 251 |
+
"execution_count": 5,
|
| 252 |
"id": "5a3ddcc8",
|
| 253 |
"metadata": {},
|
| 254 |
+
"outputs": [
|
| 255 |
+
{
|
| 256 |
+
"name": "stdout",
|
| 257 |
+
"output_type": "stream",
|
| 258 |
+
"text": [
|
| 259 |
+
"177\n"
|
| 260 |
+
]
|
| 261 |
+
}
|
| 262 |
+
],
|
| 263 |
"source": [
|
| 264 |
"# load packages\n",
|
| 265 |
"import time\n",
|
|
|
|
| 284 |
},
|
| 285 |
{
|
| 286 |
"cell_type": "code",
|
| 287 |
+
"execution_count": 6,
|
| 288 |
"id": "00ee05e1",
|
| 289 |
"metadata": {},
|
| 290 |
"outputs": [],
|
|
|
|
| 320 |
},
|
| 321 |
{
|
| 322 |
"cell_type": "code",
|
| 323 |
+
"execution_count": 7,
|
| 324 |
"id": "bbdc04c0",
|
| 325 |
"metadata": {},
|
| 326 |
"outputs": [],
|
|
|
|
| 328 |
"device = 'cuda' if torch.cuda.is_available() else 'cpu'"
|
| 329 |
]
|
| 330 |
},
|
| 331 |
+
{
|
| 332 |
+
"cell_type": "code",
|
| 333 |
+
"execution_count": 10,
|
| 334 |
+
"id": "bc8a517e-915c-427f-a3e0-b96310317bec",
|
| 335 |
+
"metadata": {
|
| 336 |
+
"scrolled": true
|
| 337 |
+
},
|
| 338 |
+
"outputs": [
|
| 339 |
+
{
|
| 340 |
+
"name": "stdout",
|
| 341 |
+
"output_type": "stream",
|
| 342 |
+
"text": [
|
| 343 |
+
"Requirement already satisfied: phonemizer in /venv/main/lib/python3.12/site-packages (3.3.0)\n",
|
| 344 |
+
"Requirement already satisfied: joblib in /venv/main/lib/python3.12/site-packages (from phonemizer) (1.5.1)\n",
|
| 345 |
+
"Requirement already satisfied: segments in /venv/main/lib/python3.12/site-packages (from phonemizer) (2.3.0)\n",
|
| 346 |
+
"Requirement already satisfied: attrs>=18.1 in /venv/main/lib/python3.12/site-packages (from phonemizer) (25.3.0)\n",
|
| 347 |
+
"Requirement already satisfied: dlinfo in /venv/main/lib/python3.12/site-packages (from phonemizer) (2.0.0)\n",
|
| 348 |
+
"Requirement already satisfied: typing-extensions in /venv/main/lib/python3.12/site-packages (from phonemizer) (4.13.2)\n",
|
| 349 |
+
"Requirement already satisfied: regex in /venv/main/lib/python3.12/site-packages (from segments->phonemizer) (2024.11.6)\n",
|
| 350 |
+
"Requirement already satisfied: csvw>=1.5.6 in /venv/main/lib/python3.12/site-packages (from segments->phonemizer) (3.5.1)\n",
|
| 351 |
+
"Requirement already satisfied: isodate in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (0.7.2)\n",
|
| 352 |
+
"Requirement already satisfied: python-dateutil in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (2.9.0.post0)\n",
|
| 353 |
+
"Requirement already satisfied: rfc3986<2 in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (1.5.0)\n",
|
| 354 |
+
"Requirement already satisfied: uritemplate>=3.0.0 in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (4.2.0)\n",
|
| 355 |
+
"Requirement already satisfied: babel in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (2.17.0)\n",
|
| 356 |
+
"Requirement already satisfied: requests in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (2.32.3)\n",
|
| 357 |
+
"Requirement already satisfied: language-tags in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (1.2.0)\n",
|
| 358 |
+
"Requirement already satisfied: rdflib in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (7.1.4)\n",
|
| 359 |
+
"Requirement already satisfied: colorama in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (0.4.6)\n",
|
| 360 |
+
"Requirement already satisfied: jsonschema in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (4.24.0)\n",
|
| 361 |
+
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /venv/main/lib/python3.12/site-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (2025.4.1)\n",
|
| 362 |
+
"Requirement already satisfied: referencing>=0.28.4 in /venv/main/lib/python3.12/site-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (0.36.2)\n",
|
| 363 |
+
"Requirement already satisfied: rpds-py>=0.7.1 in /venv/main/lib/python3.12/site-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (0.25.1)\n",
|
| 364 |
+
"Requirement already satisfied: six>=1.5 in /venv/main/lib/python3.12/site-packages (from python-dateutil->csvw>=1.5.6->segments->phonemizer) (1.17.0)\n",
|
| 365 |
+
"Requirement already satisfied: pyparsing<4,>=2.1.0 in /venv/main/lib/python3.12/site-packages (from rdflib->csvw>=1.5.6->segments->phonemizer) (3.2.3)\n",
|
| 366 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /venv/main/lib/python3.12/site-packages (from requests->csvw>=1.5.6->segments->phonemizer) (3.4.1)\n",
|
| 367 |
+
"Requirement already satisfied: idna<4,>=2.5 in /venv/main/lib/python3.12/site-packages (from requests->csvw>=1.5.6->segments->phonemizer) (3.10)\n",
|
| 368 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /venv/main/lib/python3.12/site-packages (from requests->csvw>=1.5.6->segments->phonemizer) (2.4.0)\n",
|
| 369 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /venv/main/lib/python3.12/site-packages (from requests->csvw>=1.5.6->segments->phonemizer) (2025.4.26)\n"
|
| 370 |
+
]
|
| 371 |
+
}
|
| 372 |
+
],
|
| 373 |
+
"source": [
|
| 374 |
+
"!pip install phonemizer"
|
| 375 |
+
]
|
| 376 |
+
},
|
| 377 |
+
{
|
| 378 |
+
"cell_type": "code",
|
| 379 |
+
"execution_count": 13,
|
| 380 |
+
"id": "48f471f2-ae4a-489e-9d6b-11caff294cf6",
|
| 381 |
+
"metadata": {
|
| 382 |
+
"scrolled": true
|
| 383 |
+
},
|
| 384 |
+
"outputs": [
|
| 385 |
+
{
|
| 386 |
+
"name": "stdout",
|
| 387 |
+
"output_type": "stream",
|
| 388 |
+
"text": [
|
| 389 |
+
"Reading package lists... Done\n",
|
| 390 |
+
"Building dependency tree... Done\n",
|
| 391 |
+
"Reading state information... Done\n",
|
| 392 |
+
"The following additional packages will be installed:\n",
|
| 393 |
+
" espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0\n",
|
| 394 |
+
"The following NEW packages will be installed:\n",
|
| 395 |
+
" espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0\n",
|
| 396 |
+
"0 upgraded, 5 newly installed, 0 to remove and 42 not upgraded.\n",
|
| 397 |
+
"Need to get 5128 kB of archives.\n",
|
| 398 |
+
"After this operation, 13.7 MB of additional disk space will be used.\n",
|
| 399 |
+
"Get:1 http://archive.ubuntu.com/ubuntu noble/main amd64 libpcaudio0 amd64 1.2-2build3 [9144 B]\n",
|
| 400 |
+
"Get:2 http://archive.ubuntu.com/ubuntu noble/main amd64 libsonic0 amd64 0.2.0-13build1 [10.3 kB]\n",
|
| 401 |
+
"Get:3 http://archive.ubuntu.com/ubuntu noble/main amd64 espeak-ng-data amd64 1.51+dfsg-12build1 [4538 kB]\n",
|
| 402 |
+
"Get:4 http://archive.ubuntu.com/ubuntu noble/main amd64 libespeak-ng1 amd64 1.51+dfsg-12build1 [206 kB]\n",
|
| 403 |
+
"Get:5 http://archive.ubuntu.com/ubuntu noble/universe amd64 espeak-ng amd64 1.51+dfsg-12build1 [364 kB]\n",
|
| 404 |
+
"Fetched 5128 kB in 2s (3310 kB/s) \n",
|
| 405 |
+
"debconf: delaying package configuration, since apt-utils is not installed\n",
|
| 406 |
+
"Selecting previously unselected package libpcaudio0:amd64.\n",
|
| 407 |
+
"(Reading database ... 41253 files and directories currently installed.)\n",
|
| 408 |
+
"Preparing to unpack .../libpcaudio0_1.2-2build3_amd64.deb ...\n",
|
| 409 |
+
"Unpacking libpcaudio0:amd64 (1.2-2build3) ...\n",
|
| 410 |
+
"Selecting previously unselected package libsonic0:amd64.\n",
|
| 411 |
+
"Preparing to unpack .../libsonic0_0.2.0-13build1_amd64.deb ...\n",
|
| 412 |
+
"Unpacking libsonic0:amd64 (0.2.0-13build1) ...\n",
|
| 413 |
+
"Selecting previously unselected package espeak-ng-data:amd64.\n",
|
| 414 |
+
"Preparing to unpack .../espeak-ng-data_1.51+dfsg-12build1_amd64.deb ...\n",
|
| 415 |
+
"Unpacking espeak-ng-data:amd64 (1.51+dfsg-12build1) ...\n",
|
| 416 |
+
"Selecting previously unselected package libespeak-ng1:amd64.\n",
|
| 417 |
+
"Preparing to unpack .../libespeak-ng1_1.51+dfsg-12build1_amd64.deb ...\n",
|
| 418 |
+
"Unpacking libespeak-ng1:amd64 (1.51+dfsg-12build1) ...\n",
|
| 419 |
+
"Selecting previously unselected package espeak-ng.\n",
|
| 420 |
+
"Preparing to unpack .../espeak-ng_1.51+dfsg-12build1_amd64.deb ...\n",
|
| 421 |
+
"Unpacking espeak-ng (1.51+dfsg-12build1) ...\n",
|
| 422 |
+
"Setting up libpcaudio0:amd64 (1.2-2build3) ...\n",
|
| 423 |
+
"Setting up libsonic0:amd64 (0.2.0-13build1) ...\n",
|
| 424 |
+
"Setting up espeak-ng-data:amd64 (1.51+dfsg-12build1) ...\n",
|
| 425 |
+
"Setting up libespeak-ng1:amd64 (1.51+dfsg-12build1) ...\n",
|
| 426 |
+
"Setting up espeak-ng (1.51+dfsg-12build1) ...\n",
|
| 427 |
+
"Processing triggers for man-db (2.12.0-4build2) ...\n",
|
| 428 |
+
"Processing triggers for libc-bin (2.39-0ubuntu8.4) ...\n"
|
| 429 |
+
]
|
| 430 |
+
}
|
| 431 |
+
],
|
| 432 |
+
"source": [
|
| 433 |
+
"!sudo apt-get install -y -V espeak-ng"
|
| 434 |
+
]
|
| 435 |
+
},
|
| 436 |
{
|
| 437 |
"cell_type": "markdown",
|
| 438 |
"id": "7b9cecbe",
|
|
|
|
| 443 |
},
|
| 444 |
{
|
| 445 |
"cell_type": "code",
|
| 446 |
+
"execution_count": 14,
|
| 447 |
"id": "64fc4c0f",
|
| 448 |
"metadata": {},
|
| 449 |
"outputs": [],
|
|
|
|
| 455 |
},
|
| 456 |
{
|
| 457 |
"cell_type": "code",
|
| 458 |
+
"execution_count": 15,
|
| 459 |
"id": "48e7b644",
|
| 460 |
"metadata": {},
|
| 461 |
"outputs": [],
|
| 462 |
"source": [
|
| 463 |
+
"config = yaml.safe_load(open(\"logs/pod_90h_30k_second_lr1/config_ft_single.yml\"))\n",
|
| 464 |
"\n",
|
| 465 |
"# load pretrained ASR model\n",
|
| 466 |
"ASR_config = config.get('ASR_config', False)\n",
|
|
|
|
| 479 |
},
|
| 480 |
{
|
| 481 |
"cell_type": "code",
|
| 482 |
+
"execution_count": 16,
|
| 483 |
"id": "ffc18cf7",
|
| 484 |
"metadata": {},
|
| 485 |
+
"outputs": [
|
| 486 |
+
{
|
| 487 |
+
"name": "stderr",
|
| 488 |
+
"output_type": "stream",
|
| 489 |
+
"text": [
|
| 490 |
+
"/venv/main/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
|
| 491 |
+
" WeightNorm.apply(module, name, dim)\n",
|
| 492 |
+
"/venv/main/lib/python3.12/site-packages/torch/nn/modules/rnn.py:123: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
|
| 493 |
+
" warnings.warn(\n"
|
| 494 |
+
]
|
| 495 |
+
}
|
| 496 |
+
],
|
| 497 |
"source": [
|
| 498 |
"model_params = recursive_munch(config['model_params'])\n",
|
| 499 |
"model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
|
|
|
|
| 503 |
},
|
| 504 |
{
|
| 505 |
"cell_type": "code",
|
| 506 |
+
"execution_count": 18,
|
| 507 |
"id": "64529d5c",
|
| 508 |
"metadata": {},
|
| 509 |
"outputs": [],
|
| 510 |
"source": [
|
| 511 |
+
"params_whole = torch.load(\"logs/pod_90h_30k_second_lr1/epoch_2nd_00018.pth\", map_location='cpu')\n",
|
| 512 |
"params = params_whole['net']"
|
| 513 |
]
|
| 514 |
},
|
| 515 |
{
|
| 516 |
"cell_type": "code",
|
| 517 |
+
"execution_count": 19,
|
| 518 |
"id": "895d9706",
|
| 519 |
"metadata": {},
|
| 520 |
+
"outputs": [
|
| 521 |
+
{
|
| 522 |
+
"name": "stdout",
|
| 523 |
+
"output_type": "stream",
|
| 524 |
+
"text": [
|
| 525 |
+
"bert loaded\n",
|
| 526 |
+
"bert_encoder loaded\n",
|
| 527 |
+
"predictor loaded\n",
|
| 528 |
+
"decoder loaded\n",
|
| 529 |
+
"text_encoder loaded\n",
|
| 530 |
+
"predictor_encoder loaded\n",
|
| 531 |
+
"style_encoder loaded\n",
|
| 532 |
+
"diffusion loaded\n",
|
| 533 |
+
"text_aligner loaded\n",
|
| 534 |
+
"pitch_extractor loaded\n",
|
| 535 |
+
"mpd loaded\n",
|
| 536 |
+
"msd loaded\n",
|
| 537 |
+
"wd loaded\n"
|
| 538 |
+
]
|
| 539 |
+
}
|
| 540 |
+
],
|
| 541 |
"source": [
|
| 542 |
"for key in model:\n",
|
| 543 |
" if key in params:\n",
|
|
|
|
| 560 |
},
|
| 561 |
{
|
| 562 |
"cell_type": "code",
|
| 563 |
+
"execution_count": 20,
|
| 564 |
"id": "c1a59db2",
|
| 565 |
"metadata": {},
|
| 566 |
"outputs": [],
|
|
|
|
| 570 |
},
|
| 571 |
{
|
| 572 |
"cell_type": "code",
|
| 573 |
+
"execution_count": 21,
|
| 574 |
"id": "e30985ab",
|
| 575 |
"metadata": {},
|
| 576 |
"outputs": [],
|
|
|
|
| 593 |
},
|
| 594 |
{
|
| 595 |
"cell_type": "code",
|
| 596 |
+
"execution_count": 22,
|
| 597 |
"id": "ca57469c",
|
| 598 |
"metadata": {},
|
| 599 |
"outputs": [],
|
|
|
|
| 678 |
},
|
| 679 |
{
|
| 680 |
"cell_type": "code",
|
| 681 |
+
"execution_count": 23,
|
| 682 |
"id": "cace9787",
|
| 683 |
"metadata": {},
|
| 684 |
"outputs": [],
|
|
|
|
| 797 |
" display(ipd.Audio(path, rate=24000, normalize=False))"
|
| 798 |
]
|
| 799 |
},
|
| 800 |
+
{
|
| 801 |
+
"cell_type": "code",
|
| 802 |
+
"execution_count": null,
|
| 803 |
+
"id": "62747cbb-bb33-4be4-8275-8c292e306987",
|
| 804 |
+
"metadata": {},
|
| 805 |
+
"outputs": [],
|
| 806 |
+
"source": []
|
| 807 |
+
},
|
| 808 |
+
{
|
| 809 |
+
"cell_type": "code",
|
| 810 |
+
"execution_count": null,
|
| 811 |
+
"id": "ec8fb32f-91dd-4fca-a7c6-7f156449c296",
|
| 812 |
+
"metadata": {},
|
| 813 |
+
"outputs": [],
|
| 814 |
+
"source": []
|
| 815 |
+
},
|
| 816 |
{
|
| 817 |
"cell_type": "markdown",
|
| 818 |
"id": "141e91b3",
|
|
|
|
| 829 |
},
|
| 830 |
{
|
| 831 |
"cell_type": "code",
|
| 832 |
+
"execution_count": 25,
|
| 833 |
"id": "81addda4",
|
| 834 |
"metadata": {},
|
| 835 |
"outputs": [],
|
|
|
|
| 840 |
{
|
| 841 |
"cell_type": "code",
|
| 842 |
"execution_count": null,
|
| 843 |
+
"id": "c0deea36-de7c-4b65-bbc4-8e00697c6796",
|
| 844 |
"metadata": {},
|
| 845 |
"outputs": [],
|
| 846 |
+
"source": []
|
| 847 |
+
},
|
| 848 |
+
{
|
| 849 |
+
"cell_type": "code",
|
| 850 |
+
"execution_count": null,
|
| 851 |
+
"id": "41b18368-2fcb-4bc8-8963-00734227267c",
|
| 852 |
+
"metadata": {},
|
| 853 |
+
"outputs": [],
|
| 854 |
+
"source": []
|
| 855 |
+
},
|
| 856 |
+
{
|
| 857 |
+
"cell_type": "code",
|
| 858 |
+
"execution_count": 26,
|
| 859 |
+
"id": "be1b2a11",
|
| 860 |
+
"metadata": {
|
| 861 |
+
"scrolled": true
|
| 862 |
+
},
|
| 863 |
+
"outputs": [
|
| 864 |
+
{
|
| 865 |
+
"ename": "LookupError",
|
| 866 |
+
"evalue": "\n**********************************************************************\n Resource \u001b[93mpunkt_tab\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('punkt_tab')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mtokenizers/punkt_tab/english/\u001b[0m\n\n Searched in:\n - '/root/nltk_data'\n - '/venv/main/nltk_data'\n - '/venv/main/share/nltk_data'\n - '/venv/main/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/local/lib/nltk_data'\n**********************************************************************\n",
|
| 867 |
+
"output_type": "error",
|
| 868 |
+
"traceback": [
|
| 869 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 870 |
+
"\u001b[31mLookupError\u001b[39m Traceback (most recent call last)",
|
| 871 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[26]\u001b[39m\u001b[32m, line 8\u001b[39m\n\u001b[32m 5\u001b[39m texts[\u001b[33m'\u001b[39m\u001b[33mSurprised\u001b[39m\u001b[33m'\u001b[39m] = \u001b[33m\"\u001b[39m\u001b[33mI can\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k,v \u001b[38;5;129;01min\u001b[39;00m texts.items():\n\u001b[32m----> \u001b[39m\u001b[32m8\u001b[39m wav = \u001b[43minference\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mref_s\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdiffusion_steps\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.3\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbeta\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.7\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43membedding_scale\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 9\u001b[39m \u001b[38;5;28mprint\u001b[39m(k + \u001b[33m\"\u001b[39m\u001b[33m: \u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 10\u001b[39m display(ipd.Audio(wav, rate=\u001b[32m24000\u001b[39m, normalize=\u001b[38;5;28;01mFalse\u001b[39;00m))\n",
|
| 872 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[22]\u001b[39m\u001b[32m, line 4\u001b[39m, in \u001b[36minference\u001b[39m\u001b[34m(text, ref_s, alpha, beta, diffusion_steps, embedding_scale)\u001b[39m\n\u001b[32m 2\u001b[39m text = text.strip()\n\u001b[32m 3\u001b[39m ps = global_phonemizer.phonemize([text])\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m ps = \u001b[43mword_tokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mps\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 5\u001b[39m ps = \u001b[33m'\u001b[39m\u001b[33m \u001b[39m\u001b[33m'\u001b[39m.join(ps)\n\u001b[32m 6\u001b[39m tokens = textclenaer(ps)\n",
|
| 873 |
+
"\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/__init__.py:142\u001b[39m, in \u001b[36mword_tokenize\u001b[39m\u001b[34m(text, language, preserve_line)\u001b[39m\n\u001b[32m 127\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mword_tokenize\u001b[39m(text, language=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m, preserve_line=\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[32m 128\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 129\u001b[39m \u001b[33;03m Return a tokenized copy of *text*,\u001b[39;00m\n\u001b[32m 130\u001b[39m \u001b[33;03m using NLTK's recommended word tokenizer\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 140\u001b[39m \u001b[33;03m :type preserve_line: bool\u001b[39;00m\n\u001b[32m 141\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m142\u001b[39m sentences = [text] \u001b[38;5;28;01mif\u001b[39;00m preserve_line \u001b[38;5;28;01melse\u001b[39;00m \u001b[43msent_tokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlanguage\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 143\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m [\n\u001b[32m 144\u001b[39m token \u001b[38;5;28;01mfor\u001b[39;00m sent \u001b[38;5;129;01min\u001b[39;00m sentences \u001b[38;5;28;01mfor\u001b[39;00m token \u001b[38;5;129;01min\u001b[39;00m _treebank_word_tokenizer.tokenize(sent)\n\u001b[32m 145\u001b[39m ]\n",
|
| 874 |
+
"\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/__init__.py:119\u001b[39m, in \u001b[36msent_tokenize\u001b[39m\u001b[34m(text, language)\u001b[39m\n\u001b[32m 109\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34msent_tokenize\u001b[39m(text, language=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 110\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 111\u001b[39m \u001b[33;03m Return a sentence-tokenized copy of *text*,\u001b[39;00m\n\u001b[32m 112\u001b[39m \u001b[33;03m using NLTK's recommended sentence tokenizer\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 117\u001b[39m \u001b[33;03m :param language: the model name in the Punkt corpus\u001b[39;00m\n\u001b[32m 118\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m119\u001b[39m tokenizer = \u001b[43m_get_punkt_tokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlanguage\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 120\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m tokenizer.tokenize(text)\n",
|
| 875 |
+
"\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/__init__.py:105\u001b[39m, in \u001b[36m_get_punkt_tokenizer\u001b[39m\u001b[34m(language)\u001b[39m\n\u001b[32m 96\u001b[39m \u001b[38;5;129m@functools\u001b[39m.lru_cache\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_get_punkt_tokenizer\u001b[39m(language=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 98\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 99\u001b[39m \u001b[33;03m A constructor for the PunktTokenizer that utilizes\u001b[39;00m\n\u001b[32m 100\u001b[39m \u001b[33;03m a lru cache for performance.\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 103\u001b[39m \u001b[33;03m :type language: str\u001b[39;00m\n\u001b[32m 104\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m105\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPunktTokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlanguage\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 876 |
+
"\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/punkt.py:1744\u001b[39m, in \u001b[36mPunktTokenizer.__init__\u001b[39m\u001b[34m(self, lang)\u001b[39m\n\u001b[32m 1742\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, lang=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 1743\u001b[39m PunktSentenceTokenizer.\u001b[34m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1744\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mload_lang\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlang\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 877 |
+
"\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/punkt.py:1749\u001b[39m, in \u001b[36mPunktTokenizer.load_lang\u001b[39m\u001b[34m(self, lang)\u001b[39m\n\u001b[32m 1746\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mload_lang\u001b[39m(\u001b[38;5;28mself\u001b[39m, lang=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 1747\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnltk\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdata\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m find\n\u001b[32m-> \u001b[39m\u001b[32m1749\u001b[39m lang_dir = \u001b[43mfind\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtokenizers/punkt_tab/\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mlang\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m/\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 1750\u001b[39m \u001b[38;5;28mself\u001b[39m._params = load_punkt_params(lang_dir)\n\u001b[32m 1751\u001b[39m \u001b[38;5;28mself\u001b[39m._lang = lang\n",
|
| 878 |
+
"\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/data.py:579\u001b[39m, in \u001b[36mfind\u001b[39m\u001b[34m(resource_name, paths)\u001b[39m\n\u001b[32m 577\u001b[39m sep = \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m70\u001b[39m\n\u001b[32m 578\u001b[39m resource_not_found = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00msep\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mmsg\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00msep\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m--> \u001b[39m\u001b[32m579\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mLookupError\u001b[39;00m(resource_not_found)\n",
|
| 879 |
+
"\u001b[31mLookupError\u001b[39m: \n**********************************************************************\n Resource \u001b[93mpunkt_tab\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('punkt_tab')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mtokenizers/punkt_tab/english/\u001b[0m\n\n Searched in:\n - '/root/nltk_data'\n - '/venv/main/nltk_data'\n - '/venv/main/share/nltk_data'\n - '/venv/main/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/local/lib/nltk_data'\n**********************************************************************\n"
|
| 880 |
+
]
|
| 881 |
+
}
|
| 882 |
+
],
|
| 883 |
"source": [
|
| 884 |
"texts = {}\n",
|
| 885 |
"texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
|
|
|
|
| 1309 |
"cell_type": "code",
|
| 1310 |
"execution_count": null,
|
| 1311 |
"id": "6d0a3825",
|
| 1312 |
+
"metadata": {},
|
|
|
|
|
|
|
| 1313 |
"outputs": [],
|
| 1314 |
"source": [
|
| 1315 |
"path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
|
|
|
|
| 1504 |
"cell_type": "code",
|
| 1505 |
"execution_count": null,
|
| 1506 |
"id": "44a4cea1",
|
| 1507 |
+
"metadata": {},
|
|
|
|
|
|
|
| 1508 |
"outputs": [],
|
| 1509 |
"source": [
|
| 1510 |
"start = time.time()\n",
|
|
|
|
| 1525 |
],
|
| 1526 |
"metadata": {
|
| 1527 |
"kernelspec": {
|
| 1528 |
+
"display_name": "Python3 (main venv)",
|
| 1529 |
"language": "python",
|
| 1530 |
+
"name": "main"
|
| 1531 |
},
|
| 1532 |
"language_info": {
|
| 1533 |
"codemirror_mode": {
|
|
|
|
| 1539 |
"name": "python",
|
| 1540 |
"nbconvert_exporter": "python",
|
| 1541 |
"pygments_lexer": "ipython3",
|
| 1542 |
+
"version": "3.12.3"
|
| 1543 |
}
|
| 1544 |
},
|
| 1545 |
"nbformat": 4,
|
__pycache__/models.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/models.cpython-310.pyc and b/__pycache__/models.cpython-310.pyc differ
|
|
|
logs/pod_90h_30k_second_v2/.ipynb_checkpoints/train-checkpoint.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
logs/pod_90h_30k_second_v2/config_ft_single.yml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
|
| 2 |
+
PLBERT_dir: Utils/PLBERT/, batch_size: 12, data_params: {OOD_data: /workspace/styletts2/data/OOD_texts.txt,
|
| 3 |
+
min_length: 50, root_path: /workspace, train_data: /workspace/styletts2/data/train_list.txt,
|
| 4 |
+
val_data: /workspace/styletts2/data/val_list.txt}, device: cuda, epochs_1st: 25,
|
| 5 |
+
epochs_2nd: 20, first_stage_path: /workspace/styletts2/stage1_final.pth, load_only_params: false,
|
| 6 |
+
log_dir: logs/pod_90h_30k_second_v2, log_interval: 50, loss_params: {TMA_epoch: 14,
|
| 7 |
+
diff_epoch: 1, joint_epoch: 5, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0,
|
| 8 |
+
lambda_dur: 1.0, lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0,
|
| 9 |
+
lambda_s2s: 1.0, lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 300, model_params: {
|
| 10 |
+
decoder: {resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]], resblock_kernel_sizes: [
|
| 11 |
+
3, 7, 11], type: hifigan, upsample_initial_channel: 512, upsample_kernel_sizes: [
|
| 12 |
+
20, 10, 6, 4], upsample_rates: [10, 5, 3, 2]}, diffusion: {dist: {estimate_sigma_data: true,
|
| 13 |
+
mean: -3.0, sigma_data: 0.3631309394446902, std: 1.0}, embedding_mask_proba: 0.1,
|
| 14 |
+
transformer: {head_features: 64, multiplier: 2, num_heads: 8, num_layers: 3}},
|
| 15 |
+
dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512, max_dur: 50, multispeaker: true,
|
| 16 |
+
n_layer: 3, n_mels: 80, n_token: 178, slm: {hidden: 768, initial_channel: 64,
|
| 17 |
+
model: microsoft/wavlm-base-plus, nlayers: 13, sr: 16000}, style_dim: 128},
|
| 18 |
+
optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05, grad_accum_steps: 2, lr: 0.0001},
|
| 19 |
+
preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048, win_length: 1200},
|
| 20 |
+
sr: 24000}, pretrained_model: /workspace/styletts2/logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth,
|
| 21 |
+
save_freq: 1, second_stage_load_pretrained: true, slmadv_params: {batch_percentage: 0.5,
|
| 22 |
+
iter: 20, max_len: 500, min_len: 400, scale: 0.01, sig: 1.5, thresh: 5}}
|
logs/pod_90h_30k_second_v2/epoch_2nd_00000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0928157cdc46e1d7e76b85e06e5264bd3cde20091d3042dc4665caf2bfe02526
|
| 3 |
+
size 1055973030
|
logs/pod_90h_30k_second_v2/epoch_2nd_00001.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e0e105ad5e3a8c28cd29e0b3f02eb057076d1a8be9eb2b683f1533635e8401e
|
| 3 |
+
size 1589850598
|
logs/pod_90h_30k_second_v2/epoch_2nd_00002.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:26a7b2881b5543005c7581da0d2055750057dea608d01ea788adb1fafc3945f4
|
| 3 |
+
size 1589850598
|
logs/pod_90h_30k_second_v2/epoch_2nd_00003.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0065304e4bf5ed0559cb9a45c2d4bdd0e609e89d6e21474ffc4b1cf33922571
|
| 3 |
+
size 1589850598
|
logs/pod_90h_30k_second_v2/epoch_2nd_00004.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:256ee7584931924080191070f86a26ee9d32828465a52a33b0a7a94fdd109eb7
|
| 3 |
+
size 1589850598
|
logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74fc75095ee7d28d84006849e9103d2640292e34f57fe58638cf0dc355e7ed0b
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00006.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:637f68e2b35a7b60431920cfc9a9494d15ca971d3edab962b11c54c02c8728ad
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00007.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f05c6d4bba2e93159df4e9b2bb8d3552e695476d173062467cadb57694875f49
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00008.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d97bc932af34f115414e10ac988870e48e50921b0832d14ea142deb89e9d99e
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00009.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b08801054bf85f09cbc03240bdf5e1ec1d834d97d0933d4df29385dc3808d3f
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00010.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ae7cbe8be0a7f72aa499f92cbc17f6e7a379314675a5d4eae526eff30f1af72
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00011.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:873f640d25ba128814ef68d1c63ff537bbf9d32e3d3c263584c9feeede06c5b5
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00012.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a26b0348252ea0c3525df5b6f9f88c51291d7a191d5502fa7a604ebbeb89dcca
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00013.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e4b6e188d004bbb60c34f4807cb2c2a6593242deab4ec59e7c672a5485fe9988
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00014.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2252d3258328915ef20c3569e7297723537e691479690ebd026fc426dbcf2384
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00015.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43544b5d118dbdaacb98500b2f68c8e355ad53222189948e9ee2890e2fec4430
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00016.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:edfe04c4831cd8d61fafc6bb413572806f30a8346c033fcae3bedeb05f94e9ae
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00017.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8684fe24d5bd9a1ae3e24529c19147503367aa61df2e550f34902b87179e1e10
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/epoch_2nd_00018.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71b08d3be791931c8c1534e2349e2d37dada50a72e700685fd8e4cf6b2d6e381
|
| 3 |
+
size 2144951284
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758267.7f09b0e2c0b0.17026.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d06bd4591368eb36ccaedb27cf696641c63f468363381dee869e800814cb7a9
|
| 3 |
+
size 1984
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758489.7f09b0e2c0b0.18353.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f67fd20bb1e7c93e62e1f9c7195dc5abcc7781698b70c679dc8ec3a16e4301e8
|
| 3 |
+
size 88
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758524.7f09b0e2c0b0.18773.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6f25e7eb7c86858883dd108c75d5748b4c35d1c2523cd938aadd0c1d6aeb8b1
|
| 3 |
+
size 88
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758552.7f09b0e2c0b0.19160.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2256999fdd31fea21179c68e82673acd9189401e9d6ae24fed4e3c2eba5c6fd
|
| 3 |
+
size 88
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758602.7f09b0e2c0b0.19654.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de73c06f010bf27d9dddc26d28870a7b5cbeb43e7b1d8be67585861622895a2d
|
| 3 |
+
size 6761012
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763142.7f09b0e2c0b0.41611.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:048f50f6a49f65fd244e93aed39d98e13e34c77499fc09d3bde9c1e80833a4f9
|
| 3 |
+
size 716
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763329.7f09b0e2c0b0.42740.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2622d704b1a3eb973e99275d7ad036d28e17836bc139b3fe77060ca5332b2c96
|
| 3 |
+
size 1984
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763548.7f09b0e2c0b0.44123.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:453ad43bcbf767292cc38b1b68b95bce85be1f0554f5773779d1c6d851056587
|
| 3 |
+
size 1953908
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749789808.7f09b0e2c0b0.1500.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f8e754635df505b43fc9ce9b08109bdcc715d84659fc6e8fa4dc93ba2727991
|
| 3 |
+
size 1344
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749790964.7f09b0e2c0b0.2345.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:704b38d288963faaa1da5a8302dc10f92e904b92545dd17d2c38f520d9bbea01
|
| 3 |
+
size 88
|
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749791414.7f09b0e2c0b0.1465.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6022662a58865c2f726d48b227e4a35f5643c01cdd8781d20152470fc8905558
|
| 3 |
+
size 3881480
|
logs/pod_90h_30k_second_v2/train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models.py
CHANGED
|
@@ -703,8 +703,9 @@ def load_checkpoint(model, optimizer, path, load_only_params=True, ignore_module
|
|
| 703 |
_ = [model[key].eval() for key in model]
|
| 704 |
|
| 705 |
if not load_only_params:
|
| 706 |
-
epoch = state["epoch"]
|
| 707 |
iters = state["iters"]
|
|
|
|
| 708 |
optimizer.load_state_dict(state["optimizer"])
|
| 709 |
else:
|
| 710 |
epoch = 0
|
|
|
|
| 703 |
_ = [model[key].eval() for key in model]
|
| 704 |
|
| 705 |
if not load_only_params:
|
| 706 |
+
epoch = state["epoch"] + 1
|
| 707 |
iters = state["iters"]
|
| 708 |
+
print('Load checkpoint from %s, epoch %d, iters %d' % (path, epoch, iters))
|
| 709 |
optimizer.load_state_dict(state["optimizer"])
|
| 710 |
else:
|
| 711 |
epoch = 0
|