debug static on sound prompts
Browse files- assets/ocr.txt +11 -0
- audiocraft/builders.py +1 -1
- audiocraft/conv.py +12 -52
- audiocraft/lm.py +4 -5
- demo.py +1 -1
assets/ocr.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIHAI VODA VITEASUL
|
| 2 |
+
|
| 3 |
+
1593 - 1601
|
| 4 |
+
|
| 5 |
+
MIHAIL DOMNUL TAREI ROMANESTI STRALUCITOR IN NOROCIRE SI IN NENOROCIRE SI VIRTUOS IN AMANDOUA IN VARSTA DE 43 ANI!
|
| 6 |
+
|
| 7 |
+
PRECUVÂNTARE
|
| 8 |
+
LA I-ia EDIȚIUNE, TIPĂRITĂ LA 1877.
|
| 9 |
+
Îndeplinesc astăzi una din cele mai vii şi mai stăruitoare ale mele dorinţe: aceia de a scoate la lumină Istoria Românilor sub Mihaiu Vodă Viteazul, lucrarea de căpetenie a eminentului şi mult-deplânsului nostru istoric Nicolae Bălcescu, care tot de odată este și o scriere de frunte în literele româneşti.
|
| 10 |
+
De două-zeci şi cinci de ani, de când Nicolae Bălcescu a murit, înstrăinat de prea iubita şi prea dorita sa țară, eu unul n'am pregetat un moment de a căuta mijloc spre a face cunoscută publicului românesc această frumoasă operă, în care îşi pironise mintea şi puterile sale un om de un rar talent, pe care, de copil încă, mă deprinsesem a-l respecta, a-l iubi, a-l admira.
|
| 11 |
+
Cinci-spre-zece ani din aceştia, am păstrat cu sfințenie la mine manuscriptele lui, cercându-mă de câte ori mi-a stat în putere, a da publicităţei cel puţin o parte din ele. Dar, spre ruşinarea noastră de până acum, a trecut un pătrar de secol de la moartea Bălcescului mai nainte ca să poată fi pus sub ochii națiunei române, tot ceiace dansul lucrase intru cea mai mare a ei onoare!
|
audiocraft/builders.py
CHANGED
|
@@ -10,7 +10,7 @@ from .lm import LMModel
|
|
| 10 |
from .seanet import SEANetDecoder
|
| 11 |
from .vq import ResidualVectorQuantizer
|
| 12 |
|
| 13 |
-
N_REPEAT =
|
| 14 |
|
| 15 |
def _shift(x):
|
| 16 |
n = x.shape[0]
|
|
|
|
| 10 |
from .seanet import SEANetDecoder
|
| 11 |
from .vq import ResidualVectorQuantizer
|
| 12 |
|
| 13 |
+
N_REPEAT = 4 # num (virtual batch_size) clones of audio sounds
|
| 14 |
|
| 15 |
def _shift(x):
|
| 16 |
n = x.shape[0]
|
audiocraft/conv.py
CHANGED
|
@@ -30,18 +30,7 @@ def apply_parametrization_norm(module: nn.Module, norm: str = 'none'):
|
|
| 30 |
return module
|
| 31 |
|
| 32 |
|
| 33 |
-
|
| 34 |
-
"""Return the proper normalization module. If causal is True, this will ensure the returned
|
| 35 |
-
module is causal, or return an error if the normalization doesn't support causal evaluation.
|
| 36 |
-
"""
|
| 37 |
-
assert norm in CONV_NORMALIZATIONS
|
| 38 |
-
if norm == 'time_group_norm':
|
| 39 |
-
if causal:
|
| 40 |
-
raise ValueError("GroupNorm doesn't support causal evaluation.")
|
| 41 |
-
assert isinstance(module, nn.modules.conv._ConvNd)
|
| 42 |
-
return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
|
| 43 |
-
else:
|
| 44 |
-
return nn.Identity()
|
| 45 |
|
| 46 |
|
| 47 |
def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
|
|
@@ -52,22 +41,6 @@ def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
|
|
| 52 |
ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
|
| 53 |
return ideal_length - length
|
| 54 |
|
| 55 |
-
|
| 56 |
-
def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0):
|
| 57 |
-
"""Pad for a convolution to make sure that the last window is full.
|
| 58 |
-
Extra padding is added at the end. This is required to ensure that we can rebuild
|
| 59 |
-
an output of the same length, as otherwise, even with padding, some time steps
|
| 60 |
-
might get removed.
|
| 61 |
-
For instance, with total padding = 4, kernel size = 4, stride = 2:
|
| 62 |
-
0 0 1 2 3 4 5 0 0 # (0s are padding)
|
| 63 |
-
1 2 3 # (output frames of a convolution, last 0 is never used)
|
| 64 |
-
0 0 1 2 3 4 5 0 # (output of tr. conv., but pos. 5 is going to get removed as padding)
|
| 65 |
-
1 2 3 4 # once you removed padding, we are missing one time step !
|
| 66 |
-
"""
|
| 67 |
-
extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
|
| 68 |
-
return F.pad(x, (0, extra_padding))
|
| 69 |
-
|
| 70 |
-
|
| 71 |
def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
|
| 72 |
"""Tiny wrapper around F.pad, just to allow for reflect padding on small input.
|
| 73 |
If this is the case, we insert extra 0 padding to the right before the reflection happen.
|
|
@@ -98,40 +71,27 @@ def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
|
|
| 98 |
|
| 99 |
|
| 100 |
class NormConv1d(nn.Module):
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
def __init__(self, *args, causal: bool = False, norm: str = 'none',
|
| 105 |
-
norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
|
| 106 |
super().__init__()
|
| 107 |
-
self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
|
| 108 |
-
self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
|
| 109 |
-
self.norm_type = norm
|
| 110 |
-
|
| 111 |
def forward(self, x):
|
| 112 |
-
|
| 113 |
-
x = self.norm(x)
|
| 114 |
-
return x
|
| 115 |
|
| 116 |
|
| 117 |
|
| 118 |
|
| 119 |
|
| 120 |
class NormConvTranspose1d(nn.Module):
|
| 121 |
-
"""Wrapper around ConvTranspose1d and normalization applied to this conv
|
| 122 |
-
to provide a uniform interface across normalization approaches.
|
| 123 |
-
"""
|
| 124 |
def __init__(self, *args, causal: bool = False, norm: str = 'none',
|
| 125 |
norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
|
| 126 |
super().__init__()
|
| 127 |
self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
|
| 128 |
-
|
| 129 |
-
self.norm_type = norm
|
| 130 |
-
|
| 131 |
def forward(self, x):
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
return x
|
| 135 |
|
| 136 |
|
| 137 |
|
|
@@ -155,9 +115,9 @@ class StreamableConv1d(nn.Module):
|
|
| 155 |
pad_mode='reflect'):
|
| 156 |
super().__init__()
|
| 157 |
# warn user on unusual setup between dilation and stride
|
| 158 |
-
if stride > 1 and dilation > 1:
|
| 159 |
-
|
| 160 |
-
|
| 161 |
self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
|
| 162 |
dilation=dilation, groups=groups, bias=bias, causal=causal,
|
| 163 |
norm=norm, norm_kwargs=norm_kwargs)
|
|
|
|
| 30 |
return module
|
| 31 |
|
| 32 |
|
| 33 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
|
|
|
|
| 41 |
ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
|
| 42 |
return ideal_length - length
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
|
| 45 |
"""Tiny wrapper around F.pad, just to allow for reflect padding on small input.
|
| 46 |
If this is the case, we insert extra 0 padding to the right before the reflection happen.
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
class NormConv1d(nn.Module):
|
| 74 |
+
def __init__(self, *args,
|
| 75 |
+
causal = False, norm = 'none',
|
| 76 |
+
norm_kwargs = {}, **kwargs):
|
|
|
|
|
|
|
| 77 |
super().__init__()
|
| 78 |
+
self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm) # norm = weight_norm
|
|
|
|
|
|
|
|
|
|
| 79 |
def forward(self, x):
|
| 80 |
+
return self.conv(x)
|
|
|
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
|
| 84 |
|
| 85 |
|
| 86 |
class NormConvTranspose1d(nn.Module):
|
|
|
|
|
|
|
|
|
|
| 87 |
def __init__(self, *args, causal: bool = False, norm: str = 'none',
|
| 88 |
norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
|
| 89 |
super().__init__()
|
| 90 |
self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
|
| 91 |
+
|
|
|
|
|
|
|
| 92 |
def forward(self, x):
|
| 93 |
+
return self.convtr(x)
|
| 94 |
+
|
|
|
|
| 95 |
|
| 96 |
|
| 97 |
|
|
|
|
| 115 |
pad_mode='reflect'):
|
| 116 |
super().__init__()
|
| 117 |
# warn user on unusual setup between dilation and stride
|
| 118 |
+
# if stride > 1 and dilation > 1:
|
| 119 |
+
# warnings.warn("StreamableConv1d has been initialized with stride > 1 and dilation > 1"
|
| 120 |
+
# f" (kernel_size={kernel_size} stride={stride}, dilation={dilation}).")
|
| 121 |
self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
|
| 122 |
dilation=dilation, groups=groups, bias=bias, causal=causal,
|
| 123 |
norm=norm, norm_kwargs=norm_kwargs)
|
audiocraft/lm.py
CHANGED
|
@@ -19,7 +19,7 @@ class LMModel(nn.Module):
|
|
| 19 |
self.condition_provider = T5Conditioner(name='t5-large',
|
| 20 |
output_dim=dim)
|
| 21 |
self.card = card # 2048 ?
|
| 22 |
-
self.n_draw =
|
| 23 |
# the batch is more expensive than n_draw as it re-runs the model bs times
|
| 24 |
# n_draw just draws more phonemes from the multinomial - after running the lm
|
| 25 |
embed_dim = self.card + 1
|
|
@@ -56,7 +56,7 @@ class LMModel(nn.Module):
|
|
| 56 |
|
| 57 |
|
| 58 |
# SAMPLE TOP K
|
| 59 |
-
k = 400
|
| 60 |
p = torch.softmax(logits, dim=3)
|
| 61 |
top_k_value, _ = torch.topk(p, k, dim=3) # [3, 4, 1, k]
|
| 62 |
min_value_top_k = top_k_value[:, :, :, -1:]
|
|
@@ -67,7 +67,7 @@ class LMModel(nn.Module):
|
|
| 67 |
p = p.reshape(bs * self.n_q, 2048)
|
| 68 |
out = torch.multinomial(p, # p=[bs,2048], out=[bs, num_samples]
|
| 69 |
num_samples=self.n_draw,
|
| 70 |
-
replacement=
|
| 71 |
# print('DRAW','c', out)
|
| 72 |
return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2) # [bs=3not6, self.n_draw, 4]
|
| 73 |
|
|
@@ -138,8 +138,7 @@ class LMModel(nn.Module):
|
|
| 138 |
pass #print('No delete anti-diag')
|
| 139 |
|
| 140 |
out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
|
| 141 |
-
|
| 142 |
-
print('\n_____ALIGN____\n', out_codes[1, 2, :, 4:max_tokens+4]) # do we pass 2048 to Seanet - There wil result in AtenIndexingError as it has no 2048
|
| 143 |
# EXTRACT COLUMNS AS ALIGN IS ALREADY DONE by FILLING DIAGONALLY
|
| 144 |
out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens) # [bs, 4, duration*n_draw] DISCARD FILL 2048
|
| 145 |
|
|
|
|
| 19 |
self.condition_provider = T5Conditioner(name='t5-large',
|
| 20 |
output_dim=dim)
|
| 21 |
self.card = card # 2048 ?
|
| 22 |
+
self.n_draw = 6 # replicate so many times the generation of each text in batch
|
| 23 |
# the batch is more expensive than n_draw as it re-runs the model bs times
|
| 24 |
# n_draw just draws more phonemes from the multinomial - after running the lm
|
| 25 |
embed_dim = self.card + 1
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
# SAMPLE TOP K
|
| 59 |
+
k = 400 # 450 is nice sound still train honk is clear!
|
| 60 |
p = torch.softmax(logits, dim=3)
|
| 61 |
top_k_value, _ = torch.topk(p, k, dim=3) # [3, 4, 1, k]
|
| 62 |
min_value_top_k = top_k_value[:, :, :, -1:]
|
|
|
|
| 67 |
p = p.reshape(bs * self.n_q, 2048)
|
| 68 |
out = torch.multinomial(p, # p=[bs,2048], out=[bs, num_samples]
|
| 69 |
num_samples=self.n_draw,
|
| 70 |
+
replacement=False) # [bs*4, self.n_draw]
|
| 71 |
# print('DRAW','c', out)
|
| 72 |
return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2) # [bs=3not6, self.n_draw, 4]
|
| 73 |
|
|
|
|
| 138 |
pass #print('No delete anti-diag')
|
| 139 |
|
| 140 |
out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
|
| 141 |
+
# END LOOP
|
|
|
|
| 142 |
# EXTRACT COLUMNS AS ALIGN IS ALREADY DONE by FILLING DIAGONALLY
|
| 143 |
out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens) # [bs, 4, duration*n_draw] DISCARD FILL 2048
|
| 144 |
|
demo.py
CHANGED
|
@@ -74,7 +74,7 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
|
|
| 74 |
background = sound_gen.generate(soundscape,
|
| 75 |
duration=len(x)/24000 + .74, # sound duration in seconds
|
| 76 |
).detach().cpu().numpy() # bs, 11400 @.74s
|
| 77 |
-
x = .5 * x + .
|
| 78 |
return x
|
| 79 |
|
| 80 |
soundfile.write(f'demo.wav', tts_entry(), 24000)
|
|
|
|
| 74 |
background = sound_gen.generate(soundscape,
|
| 75 |
duration=len(x)/24000 + .74, # sound duration in seconds
|
| 76 |
).detach().cpu().numpy() # bs, 11400 @.74s
|
| 77 |
+
x = .5 * x + .47 * background[:len(x)]
|
| 78 |
return x
|
| 79 |
|
| 80 |
soundfile.write(f'demo.wav', tts_entry(), 24000)
|