Working on ONNX export.
- models.py  +45 -18
- test.ipynb  +0 -0
models.py (CHANGED)

@@ -272,9 +272,8 @@ class TextEncoder(nn.Module):
 
         x = x.transpose(1, 2)  # [B, T, chn]
 
-        input_lengths = input_lengths.cpu().numpy()
         x = nn.utils.rnn.pack_padded_sequence(
-            x, input_lengths, batch_first=True, enforce_sorted=False)
+            x, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
 
         self.lstm.flatten_parameters()
         x, _ = self.lstm(x)
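
Note on this change: nn.utils.rnn.pack_padded_sequence accepts the lengths as a CPU int64 tensor directly, while a detour through NumPy detaches the values from the graph, so a traced export bakes the example lengths in as constants. A minimal sketch of the pattern under torch.onnx.export (toy module, shapes, and output path are illustrative, not the repo's TextEncoder; how well packed sequences export also depends on the exporter and opset version):

    import torch
    import torch.nn as nn

    class PackedLSTM(nn.Module):
        # Toy stand-in for the LSTM stage above (illustrative only).
        def __init__(self):
            super().__init__()
            self.lstm = nn.LSTM(80, 64, batch_first=True, bidirectional=True)

        def forward(self, x, lengths):
            # Lengths stay a CPU tensor; calling .numpy() here would
            # freeze them into the traced graph.
            x = nn.utils.rnn.pack_padded_sequence(
                x, lengths.cpu(), batch_first=True, enforce_sorted=False)
            x, _ = self.lstm(x)
            x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
            return x

    torch.onnx.export(
        PackedLSTM().eval(),
        (torch.randn(2, 40, 80), torch.tensor([40, 25])),
        "packed_lstm.onnx",  # hypothetical output path
        input_names=["x", "lengths"],
        dynamic_axes={"x": {0: "batch", 1: "time"}, "lengths": {0: "batch"}},
    )
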
@@ -292,12 +291,19 @@ class TextEncoder(nn.Module):
         return x
 
     def inference(self, x):
-        x = self.embedding(x)
-        x = x.transpose(1, 2)
-
-
+        x = self.embedding(x)  # [B, T, emb]
+        x = x.transpose(1, 2)  # [B, emb, T]
+
+        for c in self.cnn:
+            x = c(x)
+
+        x = x.transpose(1, 2)  # [B, T, chn]
+
         self.lstm.flatten_parameters()
         x, _ = self.lstm(x)
+
+        x = x.transpose(-1, -2)
+
         return x
 
     def length_to_mask(self, lengths):
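
The rewritten inference now mirrors forward minus the masking and packed-sequence machinery, which keeps the traced graph free of length-dependent ops. For export, a method other than forward is usually exposed through a small wrapper so the tracer picks it up; a sketch, assuming an already-constructed text_encoder (wrapper name, token shape, and output path are hypothetical):

    import torch
    import torch.nn as nn

    class TextEncoderInference(nn.Module):
        # Expose inference() as forward() for the tracer (sketch).
        def __init__(self, encoder: nn.Module):
            super().__init__()
            self.encoder = encoder

        def forward(self, tokens: torch.Tensor) -> torch.Tensor:
            return self.encoder.inference(tokens)

    # Hypothetical usage:
    # tokens = torch.zeros(1, 50, dtype=torch.long)
    # torch.onnx.export(TextEncoderInference(text_encoder).eval(), (tokens,),
    #                   "text_encoder.onnx", input_names=["tokens"],
    #                   dynamic_axes={"tokens": {1: "time"}})
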
@@ -433,7 +439,7 @@ class ProsodyPredictor(nn.Module):
         text_size = d.shape[1]
 
         # predict duration
-        input_lengths = text_lengths.cpu().numpy()
+        input_lengths = text_lengths
         x = nn.utils.rnn.pack_padded_sequence(
             d, input_lengths, batch_first=True, enforce_sorted=False)
 
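
As in the TextEncoder change above, input_lengths stays a tensor here rather than a NumPy array: any value that leaves tensor-land is recorded as a constant by the tracer instead of remaining a graph input.
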
@@ -456,8 +462,14 @@ class ProsodyPredictor(nn.Module):
         return duration.squeeze(-1), en
 
     def F0Ntrain(self, x, s):
-        x, _ = self.shared(x.transpose(-1, -2))
-
+        x1 = x.transpose(-1, -2)
+        torch._check(x1.dim() == 3, lambda: f"Expected a 3D tensor, got {x1.dim()}D")
+        torch._check(x1.shape[1] > 1, lambda: f"Expected dim 1 > 1, got {x1.size(1)}")
+        torch._check(x1.shape[2] > 1, lambda: f"Expected dim 2 > 1, got {x1.size(2)}")
+        torch._check(x.shape[2] > 0, lambda: f"Expected dim 2 > 0, got {x.size(2)}")
+        x, _ = self.shared(x1)
+        # torch._check(x.shape[2] > 0, lambda: f"Expected dim 2 > 0, got {x.size(2)}")
+
         F0 = x.transpose(-1, -2)
         for block in self.F0:
             F0 = block(F0, s)
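
torch._check (a private API, so subject to change) raises in eager mode when the predicate is false and, under torch.export/Dynamo, records the predicate as a constraint on symbolic shapes; guards like shape[1] > 1 stop the exporter from specializing a dynamic dimension to the example input's size. Its second argument must be a callable returning the message string, which is why the lambda: print(...) form is corrected above. A self-contained sketch (PyTorch 2.1+ assumed for torch.export):

    import torch
    from torch.export import Dim, export

    class Doubler(torch.nn.Module):
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # Recorded as a shape constraint under export; raises in
            # eager mode if violated.
            torch._check(x.shape[0] > 1, lambda: f"expected batch > 1, got {x.size(0)}")
            return x * 2

    batch = Dim("batch", min=2)
    ep = export(Doubler(), (torch.randn(4, 8),), dynamic_shapes={"x": {0: batch}})
    print(ep.graph)
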
@@ -503,7 +515,6 @@ class DurationEncoder(nn.Module):
         x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
 
         x = x.transpose(0, 1)
-        input_lengths = text_lengths.cpu().numpy()
         x = x.transpose(-1, -2)
 
         for block in self.lstms:
@@ -513,8 +524,9 @@ class DurationEncoder(nn.Module):
                 x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
             else:
                 x = x.transpose(-1, -2)
+
                 x = nn.utils.rnn.pack_padded_sequence(
-                    x, input_lengths, batch_first=True, enforce_sorted=False)
+                    x, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
                 block.flatten_parameters()
                 x, _ = block(x)
                 x, _ = nn.utils.rnn.pad_packed_sequence(
@@ -529,13 +541,28 @@ class DurationEncoder(nn.Module):
 
         return x.transpose(-1, -2)
 
-    def inference(self, x, style):
-
-
-
-
-
-
+    def inference(self, x: torch.Tensor, style: torch.Tensor) -> torch.Tensor:
+
+        x = x.permute(2, 0, 1)
+        s = style.expand(x.shape[0], x.shape[1], -1)
+        x = torch.cat([x, s], axis=-1)
+
+        x = x.transpose(0, 1)
+        x = x.transpose(-1, -2)
+
+        for block in self.lstms:
+            if isinstance(block, AdaLayerNorm):
+                x = block(x.transpose(-1, -2), style).transpose(-1, -2)
+                x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
+            else:
+                x = x.transpose(-1, -2)
+
+                block.flatten_parameters()
+                x, _ = block(x)
+
+                x = F.dropout(x, p=self.dropout, training=self.training)
+                x = x.transpose(-1, -2)
+        return x.transpose(-1, -2)
 
     def length_to_mask(self, lengths):
         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
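
The isinstance(block, AdaLayerNorm) branch looks like data-dependent control flow, but it is resolved in Python while tracing: self.lstms is a fixed ModuleList, so each loop iteration bakes exactly one branch into the graph. A toy illustration of the same pattern (hypothetical Norm/Recurrent classes, not the repo's):

    import torch
    import torch.nn as nn

    class Norm(nn.Module):
        def __init__(self):
            super().__init__()
            self.ln = nn.LayerNorm(8)

        def forward(self, x):
            return self.ln(x)

    class Recurrent(nn.Module):
        def __init__(self):
            super().__init__()
            self.lstm = nn.LSTM(8, 8, batch_first=True)

        def forward(self, x):
            return self.lstm(x)[0]

    class Stack(nn.Module):
        def __init__(self):
            super().__init__()
            self.blocks = nn.ModuleList([Norm(), Recurrent(), Norm()])

        def forward(self, x):
            for block in self.blocks:
                # Evaluated in Python during tracing; the exported graph
                # is a straight line with no branches.
                if isinstance(block, Norm):
                    x = block(x)
                else:
                    x = block(x) + x  # residual on recurrent blocks (toy)
            return x

    traced = torch.jit.trace(Stack().eval(), torch.randn(1, 5, 8))
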
test.ipynb (ADDED)

The diff for this file is too large to render.
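
The added notebook presumably exercises the export; a typical smoke test against the produced graph looks like this (hypothetical file name and shapes; onnxruntime assumed installed):

    import numpy as np
    import onnxruntime as ort

    # "text_encoder.onnx" is a hypothetical artifact of the export above.
    sess = ort.InferenceSession("text_encoder.onnx",
                                providers=["CPUExecutionProvider"])
    tokens = np.zeros((1, 50), dtype=np.int64)
    (out,) = sess.run(None, {"tokens": tokens})
    print(out.shape)
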