debug static on sound prompts
Browse files- assets/ocr.txt +11 -0
- audiocraft/builders.py +1 -1
- audiocraft/conv.py +12 -52
- audiocraft/lm.py +4 -5
- demo.py +1 -1
assets/ocr.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIHAI VODA VITEASUL
|
| 2 |
+
|
| 3 |
+
1593 - 1601
|
| 4 |
+
|
| 5 |
+
MIHAIL DOMNUL TAREI ROMANESTI STRALUCITOR IN NOROCIRE SI IN NENOROCIRE SI VIRTUOS IN AMANDOUA IN VARSTA DE 43 ANI!
|
| 6 |
+
|
| 7 |
+
PRECUVÂNTARE
|
| 8 |
+
LA I-ia EDIȚIUNE, TIPĂRITĂ LA 1877.
|
| 9 |
+
Îndeplinesc astăzi una din cele mai vii şi mai stăruitoare ale mele dorinţe: aceia de a scoate la lumină Istoria Românilor sub Mihaiu Vodă Viteazul, lucrarea de căpetenie a eminentului şi mult-deplânsului nostru istoric Nicolae Bălcescu, care tot de odată este și o scriere de frunte în literele româneşti.
|
| 10 |
+
De două-zeci şi cinci de ani, de când Nicolae Bălcescu a murit, înstrăinat de prea iubita şi prea dorita sa țară, eu unul n'am pregetat un moment de a căuta mijloc spre a face cunoscută publicului românesc această frumoasă operă, în care îşi pironise mintea şi puterile sale un om de un rar talent, pe care, de copil încă, mă deprinsesem a-l respecta, a-l iubi, a-l admira.
|
| 11 |
+
Cinci-spre-zece ani din aceştia, am păstrat cu sfințenie la mine manuscriptele lui, cercându-mă de câte ori mi-a stat în putere, a da publicităţei cel puţin o parte din ele. Dar, spre ruşinarea noastră de până acum, a trecut un pătrar de secol de la moartea Bălcescului mai nainte ca să poată fi pus sub ochii națiunei române, tot ceiace dansul lucrase intru cea mai mare a ei onoare!
|
audiocraft/builders.py
CHANGED
|
@@ -10,7 +10,7 @@ from .lm import LMModel
|
|
| 10 |
from .seanet import SEANetDecoder
|
| 11 |
from .vq import ResidualVectorQuantizer
|
| 12 |
|
| 13 |
-
N_REPEAT =
|
| 14 |
|
| 15 |
def _shift(x):
|
| 16 |
n = x.shape[0]
|
|
|
|
| 10 |
from .seanet import SEANetDecoder
|
| 11 |
from .vq import ResidualVectorQuantizer
|
| 12 |
|
| 13 |
+
N_REPEAT = 4 # num (virtual batch_size) clones of audio sounds
|
| 14 |
|
| 15 |
def _shift(x):
|
| 16 |
n = x.shape[0]
|
audiocraft/conv.py
CHANGED
|
@@ -30,18 +30,7 @@ def apply_parametrization_norm(module: nn.Module, norm: str = 'none'):
|
|
| 30 |
return module
|
| 31 |
|
| 32 |
|
| 33 |
-
|
| 34 |
-
"""Return the proper normalization module. If causal is True, this will ensure the returned
|
| 35 |
-
module is causal, or return an error if the normalization doesn't support causal evaluation.
|
| 36 |
-
"""
|
| 37 |
-
assert norm in CONV_NORMALIZATIONS
|
| 38 |
-
if norm == 'time_group_norm':
|
| 39 |
-
if causal:
|
| 40 |
-
raise ValueError("GroupNorm doesn't support causal evaluation.")
|
| 41 |
-
assert isinstance(module, nn.modules.conv._ConvNd)
|
| 42 |
-
return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
|
| 43 |
-
else:
|
| 44 |
-
return nn.Identity()
|
| 45 |
|
| 46 |
|
| 47 |
def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
|
|
@@ -52,22 +41,6 @@ def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
|
|
| 52 |
ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
|
| 53 |
return ideal_length - length
|
| 54 |
|
| 55 |
-
|
| 56 |
-
def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0):
|
| 57 |
-
"""Pad for a convolution to make sure that the last window is full.
|
| 58 |
-
Extra padding is added at the end. This is required to ensure that we can rebuild
|
| 59 |
-
an output of the same length, as otherwise, even with padding, some time steps
|
| 60 |
-
might get removed.
|
| 61 |
-
For instance, with total padding = 4, kernel size = 4, stride = 2:
|
| 62 |
-
0 0 1 2 3 4 5 0 0 # (0s are padding)
|
| 63 |
-
1 2 3 # (output frames of a convolution, last 0 is never used)
|
| 64 |
-
0 0 1 2 3 4 5 0 # (output of tr. conv., but pos. 5 is going to get removed as padding)
|
| 65 |
-
1 2 3 4 # once you removed padding, we are missing one time step !
|
| 66 |
-
"""
|
| 67 |
-
extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
|
| 68 |
-
return F.pad(x, (0, extra_padding))
|
| 69 |
-
|
| 70 |
-
|
| 71 |
def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
|
| 72 |
"""Tiny wrapper around F.pad, just to allow for reflect padding on small input.
|
| 73 |
If this is the case, we insert extra 0 padding to the right before the reflection happen.
|
|
@@ -98,40 +71,27 @@ def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
|
|
| 98 |
|
| 99 |
|
| 100 |
class NormConv1d(nn.Module):
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
def __init__(self, *args, causal: bool = False, norm: str = 'none',
|
| 105 |
-
norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
|
| 106 |
super().__init__()
|
| 107 |
-
self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
|
| 108 |
-
self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
|
| 109 |
-
self.norm_type = norm
|
| 110 |
-
|
| 111 |
def forward(self, x):
|
| 112 |
-
|
| 113 |
-
x = self.norm(x)
|
| 114 |
-
return x
|
| 115 |
|
| 116 |
|
| 117 |
|
| 118 |
|
| 119 |
|
| 120 |
class NormConvTranspose1d(nn.Module):
|
| 121 |
-
"""Wrapper around ConvTranspose1d and normalization applied to this conv
|
| 122 |
-
to provide a uniform interface across normalization approaches.
|
| 123 |
-
"""
|
| 124 |
def __init__(self, *args, causal: bool = False, norm: str = 'none',
|
| 125 |
norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
|
| 126 |
super().__init__()
|
| 127 |
self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
|
| 128 |
-
|
| 129 |
-
self.norm_type = norm
|
| 130 |
-
|
| 131 |
def forward(self, x):
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
return x
|
| 135 |
|
| 136 |
|
| 137 |
|
|
@@ -155,9 +115,9 @@ class StreamableConv1d(nn.Module):
|
|
| 155 |
pad_mode='reflect'):
|
| 156 |
super().__init__()
|
| 157 |
# warn user on unusual setup between dilation and stride
|
| 158 |
-
if stride > 1 and dilation > 1:
|
| 159 |
-
|
| 160 |
-
|
| 161 |
self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
|
| 162 |
dilation=dilation, groups=groups, bias=bias, causal=causal,
|
| 163 |
norm=norm, norm_kwargs=norm_kwargs)
|
|
|
|
| 30 |
return module
|
| 31 |
|
| 32 |
|
| 33 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
|
|
|
|
| 41 |
ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
|
| 42 |
return ideal_length - length
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
|
| 45 |
"""Tiny wrapper around F.pad, just to allow for reflect padding on small input.
|
| 46 |
If this is the case, we insert extra 0 padding to the right before the reflection happen.
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
class NormConv1d(nn.Module):
|
| 74 |
+
def __init__(self, *args,
|
| 75 |
+
causal = False, norm = 'none',
|
| 76 |
+
norm_kwargs = {}, **kwargs):
|
|
|
|
|
|
|
| 77 |
super().__init__()
|
| 78 |
+
self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm) # norm = weight_norm
|
|
|
|
|
|
|
|
|
|
| 79 |
def forward(self, x):
|
| 80 |
+
return self.conv(x)
|
|
|
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
|
| 84 |
|
| 85 |
|
| 86 |
class NormConvTranspose1d(nn.Module):
|
|
|
|
|
|
|
|
|
|
| 87 |
def __init__(self, *args, causal: bool = False, norm: str = 'none',
|
| 88 |
norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
|
| 89 |
super().__init__()
|
| 90 |
self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
|
| 91 |
+
|
|
|
|
|
|
|
| 92 |
def forward(self, x):
|
| 93 |
+
return self.convtr(x)
|
| 94 |
+
|
|
|
|
| 95 |
|
| 96 |
|
| 97 |
|
|
|
|
| 115 |
pad_mode='reflect'):
|
| 116 |
super().__init__()
|
| 117 |
# warn user on unusual setup between dilation and stride
|
| 118 |
+
# if stride > 1 and dilation > 1:
|
| 119 |
+
# warnings.warn("StreamableConv1d has been initialized with stride > 1 and dilation > 1"
|
| 120 |
+
# f" (kernel_size={kernel_size} stride={stride}, dilation={dilation}).")
|
| 121 |
self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
|
| 122 |
dilation=dilation, groups=groups, bias=bias, causal=causal,
|
| 123 |
norm=norm, norm_kwargs=norm_kwargs)
|
audiocraft/lm.py
CHANGED
|
@@ -19,7 +19,7 @@ class LMModel(nn.Module):
|
|
| 19 |
self.condition_provider = T5Conditioner(name='t5-large',
|
| 20 |
output_dim=dim)
|
| 21 |
self.card = card # 2048 ?
|
| 22 |
-
self.n_draw =
|
| 23 |
# the batch is more expensive than n_draw as it re-runs the model bs times
|
| 24 |
# n_draw just draws more phonemes from the multinomial - after running the lm
|
| 25 |
embed_dim = self.card + 1
|
|
@@ -56,7 +56,7 @@ class LMModel(nn.Module):
|
|
| 56 |
|
| 57 |
|
| 58 |
# SAMPLE TOP K
|
| 59 |
-
k = 400
|
| 60 |
p = torch.softmax(logits, dim=3)
|
| 61 |
top_k_value, _ = torch.topk(p, k, dim=3) # [3, 4, 1, k]
|
| 62 |
min_value_top_k = top_k_value[:, :, :, -1:]
|
|
@@ -67,7 +67,7 @@ class LMModel(nn.Module):
|
|
| 67 |
p = p.reshape(bs * self.n_q, 2048)
|
| 68 |
out = torch.multinomial(p, # p=[bs,2048], out=[bs, num_samples]
|
| 69 |
num_samples=self.n_draw,
|
| 70 |
-
replacement=
|
| 71 |
# print('DRAW','c', out)
|
| 72 |
return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2) # [bs=3not6, self.n_draw, 4]
|
| 73 |
|
|
@@ -138,8 +138,7 @@ class LMModel(nn.Module):
|
|
| 138 |
pass #print('No delete anti-diag')
|
| 139 |
|
| 140 |
out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
|
| 141 |
-
|
| 142 |
-
print('\n_____ALIGN____\n', out_codes[1, 2, :, 4:max_tokens+4]) # do we pass 2048 to Seanet - There wil result in AtenIndexingError as it has no 2048
|
| 143 |
# EXTRACT COLUMNS AS ALIGN IS ALREADY DONE by FILLING DIAGONALLY
|
| 144 |
out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens) # [bs, 4, duration*n_draw] DISCARD FILL 2048
|
| 145 |
|
|
|
|
| 19 |
self.condition_provider = T5Conditioner(name='t5-large',
|
| 20 |
output_dim=dim)
|
| 21 |
self.card = card # 2048 ?
|
| 22 |
+
self.n_draw = 6 # replicate so many times the generation of each text in batch
|
| 23 |
# the batch is more expensive than n_draw as it re-runs the model bs times
|
| 24 |
# n_draw just draws more phonemes from the multinomial - after running the lm
|
| 25 |
embed_dim = self.card + 1
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
# SAMPLE TOP K
|
| 59 |
+
k = 400 # 450 is nice sound still train honk is clear!
|
| 60 |
p = torch.softmax(logits, dim=3)
|
| 61 |
top_k_value, _ = torch.topk(p, k, dim=3) # [3, 4, 1, k]
|
| 62 |
min_value_top_k = top_k_value[:, :, :, -1:]
|
|
|
|
| 67 |
p = p.reshape(bs * self.n_q, 2048)
|
| 68 |
out = torch.multinomial(p, # p=[bs,2048], out=[bs, num_samples]
|
| 69 |
num_samples=self.n_draw,
|
| 70 |
+
replacement=False) # [bs*4, self.n_draw]
|
| 71 |
# print('DRAW','c', out)
|
| 72 |
return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2) # [bs=3not6, self.n_draw, 4]
|
| 73 |
|
|
|
|
| 138 |
pass #print('No delete anti-diag')
|
| 139 |
|
| 140 |
out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
|
| 141 |
+
# END LOOP
|
|
|
|
| 142 |
# EXTRACT COLUMNS AS ALIGN IS ALREADY DONE by FILLING DIAGONALLY
|
| 143 |
out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens) # [bs, 4, duration*n_draw] DISCARD FILL 2048
|
| 144 |
|
demo.py
CHANGED
|
@@ -74,7 +74,7 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
|
|
| 74 |
background = sound_gen.generate(soundscape,
|
| 75 |
duration=len(x)/24000 + .74, # sound duration in seconds
|
| 76 |
).detach().cpu().numpy() # bs, 11400 @.74s
|
| 77 |
-
x = .5 * x + .
|
| 78 |
return x
|
| 79 |
|
| 80 |
soundfile.write(f'demo.wav', tts_entry(), 24000)
|
|
|
|
| 74 |
background = sound_gen.generate(soundscape,
|
| 75 |
duration=len(x)/24000 + .74, # sound duration in seconds
|
| 76 |
).detach().cpu().numpy() # bs, 11400 @.74s
|
| 77 |
+
x = .5 * x + .47 * background[:len(x)]
|
| 78 |
return x
|
| 79 |
|
| 80 |
soundfile.write(f'demo.wav', tts_entry(), 24000)
|