geneing
/

Kokoro

Text-to-Speech

English

Model card Files and versions

xet

Community

geneing committed on Jan 10, 2025

Commit

2d2f498

1 Parent(s): 8fdffc3

Fixed missing change. Updated models_onnx from models from upstream.

Browse files

Files changed (2) hide show

kokoro.py +1 -1
models_onnx.py +4 -224

kokoro.py CHANGED Viewed

@@ -116,7 +116,7 @@ def forward(model, tokens, ref_s, speed):
     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
-    bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
     d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
     s = ref_s[:, 128:]
     d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
+    bert_dur = model.bert(tokens)
     d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
     s = ref_s[:, 128:]
     d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

models_onnx.py CHANGED Viewed

@@ -1,6 +1,5 @@
 # https://github.com/yl4579/StyleTTS2/blob/main/models.py
-from ast import Tuple
-from istftnet import Decoder
 from munch import Munch
 from pathlib import Path
 from plbert import load_plbert
@@ -12,118 +11,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-class LearnedDownSample(nn.Module):
-    def __init__(self, layer_type, dim_in):
-        super().__init__()
-        self.layer_type = layer_type
-        if self.layer_type == 'none':
-            self.conv = nn.Identity()
-        elif self.layer_type == 'timepreserve':
-            self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
-        elif self.layer_type == 'half':
-            self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
-        else:
-            raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-    def forward(self, x):
-        return self.conv(x)
-class LearnedUpSample(nn.Module):
-    def __init__(self, layer_type, dim_in):
-        super().__init__()
-        self.layer_type = layer_type
-        if self.layer_type == 'none':
-            self.conv = nn.Identity()
-        elif self.layer_type == 'timepreserve':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
-        elif self.layer_type == 'half':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-    def forward(self, x):
-        return self.conv(x)
-class DownSample(nn.Module):
-    def __init__(self, layer_type):
-        super().__init__()
-        self.layer_type = layer_type
-    def forward(self, x):
-        if self.layer_type == 'none':
-            return x
-        elif self.layer_type == 'timepreserve':
-            return F.avg_pool2d(x, (2, 1))
-        elif self.layer_type == 'half':
-            if x.shape[-1] % 2 != 0:
-                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
-            return F.avg_pool2d(x, 2)
-        else:
-            raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-class UpSample(nn.Module):
-    def __init__(self, layer_type):
-        super().__init__()
-        self.layer_type = layer_type
-    def forward(self, x):
-        if self.layer_type == 'none':
-            return x
-        elif self.layer_type == 'timepreserve':
-            return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
-        elif self.layer_type == 'half':
-            return F.interpolate(x, scale_factor=2, mode='nearest')
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-class ResBlk(nn.Module):
-    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
-                 normalize=False, downsample='none'):
-        super().__init__()
-        self.actv = actv
-        self.normalize = normalize
-        self.downsample = DownSample(downsample)
-        self.downsample_res = LearnedDownSample(downsample, dim_in)
-        self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out)
-    def _build_weights(self, dim_in, dim_out):
-        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
-        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
-        if self.normalize:
-            self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
-            self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
-        if self.learned_sc:
-            self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
-    def _shortcut(self, x):
-        if self.learned_sc:
-            x = self.conv1x1(x)
-        if self.downsample:
-            x = self.downsample(x)
-        return x
-    def _residual(self, x):
-        if self.normalize:
-            x = self.norm1(x)
-        x = self.actv(x)
-        x = self.conv1(x)
-        x = self.downsample_res(x)
-        if self.normalize:
-            x = self.norm2(x)
-        x = self.actv(x)
-        x = self.conv2(x)
-        return x
-    def forward(self, x):
-        x = self._shortcut(x) + self._residual(x)
-        return x / np.sqrt(2)  # unit variance
 class LinearNorm(torch.nn.Module):
     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
         super(LinearNorm, self).__init__()
@@ -136,98 +23,6 @@ class LinearNorm(torch.nn.Module):
     def forward(self, x):
         return self.linear_layer(x)
-class Discriminator2d(nn.Module):
-    def __init__(self, dim_in=48, num_domains=1, max_conv_dim=384, repeat_num=4):
-        super().__init__()
-        blocks = []
-        blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
-        for lid in range(repeat_num):
-            dim_out = min(dim_in*2, max_conv_dim)
-            blocks += [ResBlk(dim_in, dim_out, downsample='half')]
-            dim_in = dim_out
-        blocks += [nn.LeakyReLU(0.2)]
-        blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
-        blocks += [nn.LeakyReLU(0.2)]
-        blocks += [nn.AdaptiveAvgPool2d(1)]
-        blocks += [spectral_norm(nn.Conv2d(dim_out, num_domains, 1, 1, 0))]
-        self.main = nn.Sequential(*blocks)
-    def get_feature(self, x):
-        features = []
-        for l in self.main:
-            x = l(x)
-            features.append(x)
-        out = features[-1]
-        out = out.view(out.size(0), -1)  # (batch, num_domains)
-        return out, features
-    def forward(self, x):
-        out, features = self.get_feature(x)
-        out = out.squeeze()  # (batch)
-        return out, features
-class ResBlk1d(nn.Module):
-    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
-                 normalize=False, downsample='none', dropout_p=0.2):
-        super().__init__()
-        self.actv = actv
-        self.normalize = normalize
-        self.downsample_type = downsample
-        self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out)
-        self.dropout_p = dropout_p
-        if self.downsample_type == 'none':
-            self.pool = nn.Identity()
-        else:
-            self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
-    def _build_weights(self, dim_in, dim_out):
-        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
-        self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
-        if self.normalize:
-            self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
-            self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
-        if self.learned_sc:
-            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
-    def downsample(self, x):
-        if self.downsample_type == 'none':
-            return x
-        else:
-            if x.shape[-1] % 2 != 0:
-                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
-            return F.avg_pool1d(x, 2)
-    def _shortcut(self, x):
-        if self.learned_sc:
-            x = self.conv1x1(x)
-        x = self.downsample(x)
-        return x
-    def _residual(self, x):
-        if self.normalize:
-            x = self.norm1(x)
-        x = self.actv(x)
-        x = F.dropout(x, p=self.dropout_p, training=self.training)
-        x = self.conv1(x)
-        x = self.pool(x)
-        if self.normalize:
-            x = self.norm2(x)
-        x = self.actv(x)
-        x = F.dropout(x, p=self.dropout_p, training=self.training)
-        x = self.conv2(x)
-        return x
-    def forward(self, x):
-        x = self._shortcut(x) + self._residual(x)
-        return x / np.sqrt(2)  # unit variance
 class LayerNorm(nn.Module):
     def __init__(self, channels, eps=1e-5):
         super().__init__()
@@ -312,19 +107,6 @@ class TextEncoder(nn.Module):
         return mask
-class AdaIN1d(nn.Module):
-    def __init__(self, style_dim, num_features):
-        super().__init__()
-        self.norm = nn.InstanceNorm1d(num_features, affine=False)
-        self.fc = nn.Linear(style_dim, num_features*2)
-    def forward(self, x, s):
-        h = self.fc(s)
-        h = h.view(h.size(0), h.size(1), 1)
-        gamma, beta = torch.chunk(h, chunks=2, dim=1)
-        return (1 + gamma) * self.norm(x) + beta
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
@@ -406,6 +188,7 @@ class AdaLayerNorm(nn.Module):
 class ProsodyPredictor(nn.Module):
     def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
         super().__init__()
@@ -418,7 +201,6 @@ class ProsodyPredictor(nn.Module):
         self.duration_proj = LinearNorm(d_hid, max_dur)
         self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
         self.F0 = nn.ModuleList()
         self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
         self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
@@ -462,6 +244,7 @@ class ProsodyPredictor(nn.Module):
         return duration.squeeze(-1), en
     def F0Ntrain(self, x: torch.Tensor, s: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         x1 = x.transpose(-1, -2)
         x2, _temp = self.shared(x1)
@@ -574,6 +357,7 @@ def recursive_munch(d):
     else:
         return d
 def build_model(path: str, device: str):
     config = Path(__file__).parent / 'config.json'
     assert config.exists(), f'Config path incorrect: config.json not found at {config}'
@@ -587,17 +371,14 @@ def build_model(path: str, device: str):
             resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
             upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
             gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size)
     text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
     predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
     bert = load_plbert()
     bert_encoder = nn.Linear(bert.config.hidden_size, args.hidden_dim)
     for parent in [bert, bert_encoder, predictor, decoder, text_encoder]:
         for child in parent.children():
             if isinstance(child, nn.RNNBase):
                 child.flatten_parameters()
     model = Munch(
         bert=bert.to(device).eval(),
         bert_encoder=bert_encoder.to(device).eval(),
@@ -605,7 +386,6 @@ def build_model(path: str, device: str):
         decoder=decoder.to(device).eval(),
         text_encoder=text_encoder.to(device).eval(),
     )
     for key, state_dict in torch.load(path, map_location='cpu', weights_only=True)['net'].items():
         assert key in model, key
         try:

 # https://github.com/yl4579/StyleTTS2/blob/main/models.py
+from istftnet import AdaIN1d, Decoder
 from munch import Munch
 from pathlib import Path
 from plbert import load_plbert
 import torch.nn as nn
 import torch.nn.functional as F
 class LinearNorm(torch.nn.Module):
     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
         super(LinearNorm, self).__init__()
     def forward(self, x):
         return self.linear_layer(x)
 class LayerNorm(nn.Module):
     def __init__(self, channels, eps=1e-5):
         super().__init__()
         return mask
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
 class ProsodyPredictor(nn.Module):
     def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
         super().__init__()
         self.duration_proj = LinearNorm(d_hid, max_dur)
         self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
         self.F0 = nn.ModuleList()
         self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
         self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
         return duration.squeeze(-1), en
     def F0Ntrain(self, x: torch.Tensor, s: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         x1 = x.transpose(-1, -2)
         x2, _temp = self.shared(x1)
     else:
         return d
 def build_model(path: str, device: str):
     config = Path(__file__).parent / 'config.json'
     assert config.exists(), f'Config path incorrect: config.json not found at {config}'
             resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
             upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
             gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size)
     text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
     predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
     bert = load_plbert()
     bert_encoder = nn.Linear(bert.config.hidden_size, args.hidden_dim)
     for parent in [bert, bert_encoder, predictor, decoder, text_encoder]:
         for child in parent.children():
             if isinstance(child, nn.RNNBase):
                 child.flatten_parameters()
     model = Munch(
         bert=bert.to(device).eval(),
         bert_encoder=bert_encoder.to(device).eval(),
         decoder=decoder.to(device).eval(),
         text_encoder=text_encoder.to(device).eval(),
     )
     for key, state_dict in torch.load(path, map_location='cpu', weights_only=True)['net'].items():
         assert key in model, key
         try: