Dionyssos committed on
Commit
8e60374
·
1 Parent(s): bc08da5

debug static on sound prompts

Browse files
Files changed (5) hide show
  1. assets/ocr.txt +11 -0
  2. audiocraft/builders.py +1 -1
  3. audiocraft/conv.py +12 -52
  4. audiocraft/lm.py +4 -5
  5. demo.py +1 -1
assets/ocr.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIHAI VODA VITEASUL
2
+
3
+ 1593 - 1601
4
+
5
+ MIHAIL DOMNUL TAREI ROMANESIT STRALUCITOR IN NOROCIRE SI IN NENOROCIRE SI VIRTUOS IN AMANDOUA IN VARSTA DE 43 ANI!
6
+
7
+ PRECUVÂNTARE
8
+ LA I-ia EDIȚIUNE, TIPĂRITĂ LA 1877.
9
+ Indeplinesc astăzi una din cele mai vii şi mai stăruitoare ale mele dorinţe: aceia de a scoate la lumină Istoria Românilor sub Mihaiu Vodă Viteazul, lucrarea de căpetenie a eminentului şi mult-deplânsului nostru istoric Nicolae Bălcescu, care tot de odată este și o scriere de frunte în literele româneşti.
10
+ De două-zeci şi cinci de ani, de când Nicolae Bălcescu a murit, înstrăinat de prea iubita şi prea dorita sa țară, eu unul n'am pregetat un moment de a căuta mijloc spre a face cunoscută publicului românesc această frumoasă operă, în care îşi pironise mintea şi puterile sale un om de un rar talent, pe care, de copil încă, mă deprinsesem a-l respecta, a-l iubi, a-l admira.
11
+ Cinci-spre-zece ani din aceştia, am păstrat cu sfințenie la mine manuscriptele lui, cercându-mă de câte ori mi-a stat în putere, a da publicităţei cel puţin o parte din ele. Dar, spre ruşinarea noastră de până acum, a trecut un pătrar de se- col de la moartea Bălcescului mai nainte ca să poată fi pus sub ochii națiunei române, tot ceiace dansul lucrase intru cea mai mare aei onoare!
audiocraft/builders.py CHANGED
@@ -10,7 +10,7 @@ from .lm import LMModel
10
  from .seanet import SEANetDecoder
11
  from .vq import ResidualVectorQuantizer
12
 
13
- N_REPEAT = 2 # num (virtual batch_size) clones of audio sounds
14
 
15
  def _shift(x):
16
  n = x.shape[0]
 
10
  from .seanet import SEANetDecoder
11
  from .vq import ResidualVectorQuantizer
12
 
13
+ N_REPEAT = 4 # num (virtual batch_size) clones of audio sounds
14
 
15
  def _shift(x):
16
  n = x.shape[0]
audiocraft/conv.py CHANGED
@@ -30,18 +30,7 @@ def apply_parametrization_norm(module: nn.Module, norm: str = 'none'):
30
  return module
31
 
32
 
33
- def get_norm_module(module: nn.Module, causal: bool = False, norm: str = 'none', **norm_kwargs):
34
- """Return the proper normalization module. If causal is True, this will ensure the returned
35
- module is causal, or return an error if the normalization doesn't support causal evaluation.
36
- """
37
- assert norm in CONV_NORMALIZATIONS
38
- if norm == 'time_group_norm':
39
- if causal:
40
- raise ValueError("GroupNorm doesn't support causal evaluation.")
41
- assert isinstance(module, nn.modules.conv._ConvNd)
42
- return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
43
- else:
44
- return nn.Identity()
45
 
46
 
47
  def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
@@ -52,22 +41,6 @@ def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
52
  ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
53
  return ideal_length - length
54
 
55
-
56
- def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0):
57
- """Pad for a convolution to make sure that the last window is full.
58
- Extra padding is added at the end. This is required to ensure that we can rebuild
59
- an output of the same length, as otherwise, even with padding, some time steps
60
- might get removed.
61
- For instance, with total padding = 4, kernel size = 4, stride = 2:
62
- 0 0 1 2 3 4 5 0 0 # (0s are padding)
63
- 1 2 3 # (output frames of a convolution, last 0 is never used)
64
- 0 0 1 2 3 4 5 0 # (output of tr. conv., but pos. 5 is going to get removed as padding)
65
- 1 2 3 4 # once you removed padding, we are missing one time step !
66
- """
67
- extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
68
- return F.pad(x, (0, extra_padding))
69
-
70
-
71
  def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
72
  """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
73
  If this is the case, we insert extra 0 padding to the right before the reflection happen.
@@ -98,40 +71,27 @@ def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
98
 
99
 
100
  class NormConv1d(nn.Module):
101
- """Wrapper around Conv1d and normalization applied to this conv
102
- to provide a uniform interface across normalization approaches.
103
- """
104
- def __init__(self, *args, causal: bool = False, norm: str = 'none',
105
- norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
106
  super().__init__()
107
- self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
108
- self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
109
- self.norm_type = norm
110
-
111
  def forward(self, x):
112
- x = self.conv(x)
113
- x = self.norm(x)
114
- return x
115
 
116
 
117
 
118
 
119
 
120
  class NormConvTranspose1d(nn.Module):
121
- """Wrapper around ConvTranspose1d and normalization applied to this conv
122
- to provide a uniform interface across normalization approaches.
123
- """
124
  def __init__(self, *args, causal: bool = False, norm: str = 'none',
125
  norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
126
  super().__init__()
127
  self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
128
- self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
129
- self.norm_type = norm
130
-
131
  def forward(self, x):
132
- x = self.convtr(x)
133
- x = self.norm(x)
134
- return x
135
 
136
 
137
 
@@ -155,9 +115,9 @@ class StreamableConv1d(nn.Module):
155
  pad_mode='reflect'):
156
  super().__init__()
157
  # warn user on unusual setup between dilation and stride
158
- if stride > 1 and dilation > 1:
159
- warnings.warn("StreamableConv1d has been initialized with stride > 1 and dilation > 1"
160
- f" (kernel_size={kernel_size} stride={stride}, dilation={dilation}).")
161
  self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
162
  dilation=dilation, groups=groups, bias=bias, causal=causal,
163
  norm=norm, norm_kwargs=norm_kwargs)
 
30
  return module
31
 
32
 
33
+
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
 
41
  ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
42
  return ideal_length - length
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
45
  """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
46
  If this is the case, we insert extra 0 padding to the right before the reflection happen.
 
71
 
72
 
73
  class NormConv1d(nn.Module):
74
+ def __init__(self, *args,
75
+ causal = False, norm = 'none',
76
+ norm_kwargs = {}, **kwargs):
 
 
77
  super().__init__()
78
+ self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm) # norm = weight_norm
 
 
 
79
  def forward(self, x):
80
+ return self.conv(x)
 
 
81
 
82
 
83
 
84
 
85
 
86
  class NormConvTranspose1d(nn.Module):
 
 
 
87
  def __init__(self, *args, causal: bool = False, norm: str = 'none',
88
  norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
89
  super().__init__()
90
  self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
91
+
 
 
92
  def forward(self, x):
93
+ return self.convtr(x)
94
+
 
95
 
96
 
97
 
 
115
  pad_mode='reflect'):
116
  super().__init__()
117
  # warn user on unusual setup between dilation and stride
118
+ # if stride > 1 and dilation > 1:
119
+ # warnings.warn("StreamableConv1d has been initialized with stride > 1 and dilation > 1"
120
+ # f" (kernel_size={kernel_size} stride={stride}, dilation={dilation}).")
121
  self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
122
  dilation=dilation, groups=groups, bias=bias, causal=causal,
123
  norm=norm, norm_kwargs=norm_kwargs)
audiocraft/lm.py CHANGED
@@ -19,7 +19,7 @@ class LMModel(nn.Module):
19
  self.condition_provider = T5Conditioner(name='t5-large',
20
  output_dim=dim)
21
  self.card = card # 2048 ?
22
- self.n_draw = 3 # replicate so many times the generation of each text in batch
23
  # the batch is more expensive than n_draw as it re-runs the model bs times
24
  # n_draw just draws more phonemes from the multinomial - after running the lm
25
  embed_dim = self.card + 1
@@ -56,7 +56,7 @@ class LMModel(nn.Module):
56
 
57
 
58
  # SAMPLE TOP K
59
- k = 400 # 450 is nice sound still train honk is clear!
60
  p = torch.softmax(logits, dim=3)
61
  top_k_value, _ = torch.topk(p, k, dim=3) # [3, 4, 1, k]
62
  min_value_top_k = top_k_value[:, :, :, -1:]
@@ -67,7 +67,7 @@ class LMModel(nn.Module):
67
  p = p.reshape(bs * self.n_q, 2048)
68
  out = torch.multinomial(p, # p=[bs,2048], out=[bs, num_samples]
69
  num_samples=self.n_draw,
70
- replacement=True) # [bs*4, self.n_draw]
71
  # print('DRAW','c', out)
72
  return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2) # [bs=3not6, self.n_draw, 4]
73
 
@@ -138,8 +138,7 @@ class LMModel(nn.Module):
138
  pass #print('No delete anti-diag')
139
 
140
  out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
141
-
142
- print('\n_____ALIGN____\n', out_codes[1, 2, :, 4:max_tokens+4]) # do we pass 2048 to Seanet - There wil result in AtenIndexingError as it has no 2048
143
  # EXTRACT COLUMNS AS ALIGN IS ALREADY DONE by FILLING DIAGONALLY
144
  out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens) # [bs, 4, duration*n_draw] DISCARD FILL 2048
145
 
 
19
  self.condition_provider = T5Conditioner(name='t5-large',
20
  output_dim=dim)
21
  self.card = card # 2048 ?
22
+ self.n_draw = 6 # replicate so many times the generation of each text in batch
23
  # the batch is more expensive than n_draw as it re-runs the model bs times
24
  # n_draw just draws more phonemes from the multinomial - after running the lm
25
  embed_dim = self.card + 1
 
56
 
57
 
58
  # SAMPLE TOP K
59
+ k = 400 # 450 is nice sound still train honk is clear!
60
  p = torch.softmax(logits, dim=3)
61
  top_k_value, _ = torch.topk(p, k, dim=3) # [3, 4, 1, k]
62
  min_value_top_k = top_k_value[:, :, :, -1:]
 
67
  p = p.reshape(bs * self.n_q, 2048)
68
  out = torch.multinomial(p, # p=[bs,2048], out=[bs, num_samples]
69
  num_samples=self.n_draw,
70
+ replacement=False) # [bs*4, self.n_draw]
71
  # print('DRAW','c', out)
72
  return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2) # [bs=3not6, self.n_draw, 4]
73
 
 
138
  pass #print('No delete anti-diag')
139
 
140
  out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
141
+ # END LOOP
 
142
  # EXTRACT COLUMNS AS ALIGN IS ALREADY DONE by FILLING DIAGONALLY
143
  out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens) # [bs, 4, duration*n_draw] DISCARD FILL 2048
144
 
demo.py CHANGED
@@ -74,7 +74,7 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
74
  background = sound_gen.generate(soundscape,
75
  duration=len(x)/24000 + .74, # sound duration in seconds
76
  ).detach().cpu().numpy() # bs, 11400 @.74s
77
- x = .5 * x + .5 * background[:len(x)]
78
  return x
79
 
80
  soundfile.write(f'demo.wav', tts_entry(), 24000)
 
74
  background = sound_gen.generate(soundscape,
75
  duration=len(x)/24000 + .74, # sound duration in seconds
76
  ).detach().cpu().numpy() # bs, 11400 @.74s
77
+ x = .5 * x + .47 * background[:len(x)]
78
  return x
79
 
80
  soundfile.write(f'demo.wav', tts_entry(), 24000)