Working on ONNX export.
- models.py  +45 -18
- test.ipynb  +0 -0
models.py (CHANGED)

@@ -272,9 +272,8 @@ class TextEncoder(nn.Module):
 
         x = x.transpose(1, 2)  # [B, T, chn]
 
-        input_lengths = input_lengths.cpu().numpy()
         x = nn.utils.rnn.pack_padded_sequence(
-            x, input_lengths, batch_first=True, enforce_sorted=False)
+            x, input_lengths.cpu(), batch_first=True, enforce_sorted=False)
 
         self.lstm.flatten_parameters()
         x, _ = self.lstm(x)
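
Note on this change: nn.utils.rnn.pack_padded_sequence accepts the lengths as a CPU int64 tensor directly, while a detour through NumPy detaches the values from the graph, so a traced export bakes the example lengths in as constants. A minimal sketch of the pattern under torch.onnx.export (toy module, shapes, and output path are illustrative, not the repo's TextEncoder; how well packed sequences export also depends on the exporter and opset version):

    import torch
    import torch.nn as nn

    class PackedLSTM(nn.Module):
        # Toy stand-in for the LSTM stage above (illustrative only).
        def __init__(self):
            super().__init__()
            self.lstm = nn.LSTM(80, 64, batch_first=True, bidirectional=True)

        def forward(self, x, lengths):
            # Lengths stay a CPU tensor; calling .numpy() here would
            # freeze them into the traced graph.
            x = nn.utils.rnn.pack_padded_sequence(
                x, lengths.cpu(), batch_first=True, enforce_sorted=False)
            x, _ = self.lstm(x)
            x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
            return x

    torch.onnx.export(
        PackedLSTM().eval(),
        (torch.randn(2, 40, 80), torch.tensor([40, 25])),
        "packed_lstm.onnx",  # hypothetical output path
        input_names=["x", "lengths"],
        dynamic_axes={"x": {0: "batch", 1: "time"}, "lengths": {0: "batch"}},
    )
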
@@ -292,12 +291,19 @@ class TextEncoder(nn.Module):
         return x
 
     def inference(self, x):
-        x = self.embedding(x)
-        x = x.transpose(1, 2)
-
-
+        x = self.embedding(x)  # [B, T, emb]
+        x = x.transpose(1, 2)  # [B, emb, T]
+
+        for c in self.cnn:
+            x = c(x)
+
+        x = x.transpose(1, 2)  # [B, T, chn]
+
         self.lstm.flatten_parameters()
         x, _ = self.lstm(x)
+
+        x = x.transpose(-1, -2)
+
         return x
 
     def length_to_mask(self, lengths):
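
The rewritten inference now mirrors forward minus the masking and packed-sequence machinery, which keeps the traced graph free of length-dependent ops. For export, a method other than forward is usually exposed through a small wrapper so the tracer picks it up; a sketch, assuming an already-constructed text_encoder (wrapper name, token shape, and output path are hypothetical):

    import torch
    import torch.nn as nn

    class TextEncoderInference(nn.Module):
        # Expose inference() as forward() for the tracer (sketch).
        def __init__(self, encoder: nn.Module):
            super().__init__()
            self.encoder = encoder

        def forward(self, tokens: torch.Tensor) -> torch.Tensor:
            return self.encoder.inference(tokens)

    # Hypothetical usage:
    # tokens = torch.zeros(1, 50, dtype=torch.long)
    # torch.onnx.export(TextEncoderInference(text_encoder).eval(), (tokens,),
    #                   "text_encoder.onnx", input_names=["tokens"],
    #                   dynamic_axes={"tokens": {1: "time"}})
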
@@ -433,7 +439,7 @@ class ProsodyPredictor(nn.Module):
         text_size = d.shape[1]
 
         # predict duration
-        input_lengths = text_lengths.cpu().numpy()
+        input_lengths = text_lengths
         x = nn.utils.rnn.pack_padded_sequence(
             d, input_lengths, batch_first=True, enforce_sorted=False)
 
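
As in the TextEncoder change above, input_lengths stays a tensor here rather than a NumPy array: any value that leaves tensor-land is recorded as a constant by the tracer instead of remaining a graph input.
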
@@ -456,8 +462,14 @@ class ProsodyPredictor(nn.Module):
         return duration.squeeze(-1), en
 
     def F0Ntrain(self, x, s):
-        x, _ = self.shared(x.transpose(-1, -2))
-
+        x1 = x.transpose(-1, -2)
+        torch._check(x1.dim() == 3, lambda: f"Expected a 3D tensor, got {x1.dim()}D")
+        torch._check(x1.shape[1] > 1, lambda: f"Expected dim 1 > 1, got {x1.size(1)}")
+        torch._check(x1.shape[2] > 1, lambda: f"Expected dim 2 > 1, got {x1.size(2)}")
+        torch._check(x.shape[2] > 0, lambda: f"Expected dim 2 > 0, got {x.size(2)}")
+        x, _ = self.shared(x1)
+        # torch._check(x.shape[2] > 0, lambda: f"Expected dim 2 > 0, got {x.size(2)}")
+
         F0 = x.transpose(-1, -2)
         for block in self.F0:
             F0 = block(F0, s)
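
torch._check (a private API, so subject to change) raises in eager mode when the predicate is false and, under torch.export/Dynamo, records the predicate as a constraint on symbolic shapes; guards like shape[1] > 1 stop the exporter from specializing a dynamic dimension to the example input's size. Its second argument must be a callable returning the message string, which is why the lambda: print(...) form is corrected above. A self-contained sketch (PyTorch 2.1+ assumed for torch.export):

    import torch
    from torch.export import Dim, export

    class Doubler(torch.nn.Module):
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # Recorded as a shape constraint under export; raises in
            # eager mode if violated.
            torch._check(x.shape[0] > 1, lambda: f"expected batch > 1, got {x.size(0)}")
            return x * 2

    batch = Dim("batch", min=2)
    ep = export(Doubler(), (torch.randn(4, 8),), dynamic_shapes={"x": {0: batch}})
    print(ep.graph)
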
@@ -503,7 +515,6 @@ class DurationEncoder(nn.Module):
         x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
 
         x = x.transpose(0, 1)
-        input_lengths = text_lengths.cpu().numpy()
         x = x.transpose(-1, -2)
 
         for block in self.lstms:
@@ -513,8 +524,9 @@ class DurationEncoder(nn.Module):
                 x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
             else:
                 x = x.transpose(-1, -2)
+
                 x = nn.utils.rnn.pack_padded_sequence(
-                    x, input_lengths, batch_first=True, enforce_sorted=False)
+                    x, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
                 block.flatten_parameters()
                 x, _ = block(x)
                 x, _ = nn.utils.rnn.pad_packed_sequence(
@@ -529,13 +541,28 @@ class DurationEncoder(nn.Module):
 
         return x.transpose(-1, -2)
 
-    def inference(self, x, style):
-
-
-
-
-
-
+    def inference(self, x: torch.Tensor, style: torch.Tensor) -> torch.Tensor:
+
+        x = x.permute(2, 0, 1)
+        s = style.expand(x.shape[0], x.shape[1], -1)
+        x = torch.cat([x, s], axis=-1)
+
+        x = x.transpose(0, 1)
+        x = x.transpose(-1, -2)
+
+        for block in self.lstms:
+            if isinstance(block, AdaLayerNorm):
+                x = block(x.transpose(-1, -2), style).transpose(-1, -2)
+                x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
+            else:
+                x = x.transpose(-1, -2)
+
+                block.flatten_parameters()
+                x, _ = block(x)
+
+                x = F.dropout(x, p=self.dropout, training=self.training)
+                x = x.transpose(-1, -2)
+        return x.transpose(-1, -2)
 
     def length_to_mask(self, lengths):
         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
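
The isinstance(block, AdaLayerNorm) branch looks like data-dependent control flow, but it is resolved in Python while tracing: self.lstms is a fixed ModuleList, so each loop iteration bakes exactly one branch into the graph. A toy illustration of the same pattern (hypothetical Norm/Recurrent classes, not the repo's):

    import torch
    import torch.nn as nn

    class Norm(nn.Module):
        def __init__(self):
            super().__init__()
            self.ln = nn.LayerNorm(8)

        def forward(self, x):
            return self.ln(x)

    class Recurrent(nn.Module):
        def __init__(self):
            super().__init__()
            self.lstm = nn.LSTM(8, 8, batch_first=True)

        def forward(self, x):
            return self.lstm(x)[0]

    class Stack(nn.Module):
        def __init__(self):
            super().__init__()
            self.blocks = nn.ModuleList([Norm(), Recurrent(), Norm()])

        def forward(self, x):
            for block in self.blocks:
                # Evaluated in Python during tracing; the exported graph
                # is a straight line with no branches.
                if isinstance(block, Norm):
                    x = block(x)
                else:
                    x = block(x) + x  # residual on recurrent blocks (toy)
            return x

    traced = torch.jit.trace(Stack().eval(), torch.randn(1, 5, 8))
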
test.ipynb (ADDED)

The diff for this file is too large to render.
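
The added notebook presumably exercises the export; a typical smoke test against the produced graph looks like this (hypothetical file name and shapes; onnxruntime assumed installed):

    import numpy as np
    import onnxruntime as ort

    # "text_encoder.onnx" is a hypothetical artifact of the export above.
    sess = ort.InferenceSession("text_encoder.onnx",
                                providers=["CPUExecutionProvider"])
    tokens = np.zeros((1, 50), dtype=np.int64)
    (out,) = sess.run(None, {"tokens": tokens})
    print(out.shape)
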