IvA committed on
Commit
ec70542
·
1 Parent(s): 4560afd
Files changed (1) hide show
  1. README.md +9 -12
README.md CHANGED
@@ -50,13 +50,12 @@ class Voc(Wav2Vec2PreTrainedModel):
50
  groups=512 if upsample_channel_wise_bug else 1,
51
  stride=2, bias=False)
52
  self.frame_rate = 12.5
53
- self.encode_buffer = None # holds raw audio chunk if incomplete < 1920 samples
54
 
55
- @torch.no_grad()
56
  def _flush(self):
57
  '''stream buffers have tensors of old batch size! Voc()._flush() to clean buffers
58
  '''
59
- self.encode_buffer = None # holds unused (incomplete windows of len < 1920) - we need 1920 to produce 1 token
60
  if self.downsample.previous is not None:
61
  self.downsample.previous = None
62
  if self.upsample.partial is not None:
@@ -145,7 +144,7 @@ class SEANetResnetBlock(nn.Module):
145
  self.block = nn.Sequential(*block)
146
 
147
  def forward(self, x):
148
- return x + self.block(x) # BufferConv1d assures at least 1 kernel exists (zero-pad or previous buffer)
149
 
150
 
151
  class SEANetEncoder(nn.Module):
@@ -161,7 +160,7 @@ class SEANetEncoder(nn.Module):
161
  super().__init__()
162
  self.ratios = list(reversed(ratios))
163
  del ratios
164
- mult = 1 # incremented in each loop iteration
165
  model=[
166
  BufferConv1d(
167
  channels,
@@ -368,7 +367,7 @@ class VocAttention(nn.Module):
368
  if x.shape[1] > 1:
369
  x = x.mean(1, keepdims=True)
370
  x = torch.matmul(x, self.fused_proj)
371
- return x # FFN broadcasts to x.shape[1]
372
 
373
 
374
  class VocTransformerLayer(nn.Module):
@@ -401,13 +400,11 @@ class VocTransformer(nn.Module):
401
 
402
  device = 'cpu' #'cuda:0'
403
  model = Voc.from_pretrained('ivao0/voc').to(device)
404
- model.n_q = 16
405
- x, _ = soundfile.read(
406
- hf_hub_download(repo_id='ivao0/voc', filename='true.wav')
407
- ) # 24 KHz
408
  x = torch.from_numpy(x[None, None, :]).to(dtype=torch.float, device=device)
409
- codes = model.encode(x) # [bs, len(_acoustic_books) + 1, T] If len(x) < 1920 audio samples -> codes is torch.empty
410
  y = model.decode(codes) # audio signal 24KHz
411
  soundfile.write('reconstruct.wav', y[0, 0, :].cpu().numpy(), 24000)
412
- model._flush() # Call before encode()/decode() with a different batch size
 
413
  ```
 
50
  groups=512 if upsample_channel_wise_bug else 1,
51
  stride=2, bias=False)
52
  self.frame_rate = 12.5
53
+ self.encode_buffer = None
54
 
 
55
  def _flush(self):
56
  '''stream buffers have tensors of old batch size! Voc()._flush() to clean buffers
57
  '''
58
+ self.encode_buffer = None # holds unused (incomplete windows of len < 1920) - we need 1920 to produce 1 token
59
  if self.downsample.previous is not None:
60
  self.downsample.previous = None
61
  if self.upsample.partial is not None:
 
144
  self.block = nn.Sequential(*block)
145
 
146
  def forward(self, x):
147
+ return x + self.block(x)
148
 
149
 
150
  class SEANetEncoder(nn.Module):
 
160
  super().__init__()
161
  self.ratios = list(reversed(ratios))
162
  del ratios
163
+ mult = 1
164
  model=[
165
  BufferConv1d(
166
  channels,
 
367
  if x.shape[1] > 1:
368
  x = x.mean(1, keepdims=True)
369
  x = torch.matmul(x, self.fused_proj)
370
+ return x # FFN broadcasts to x.shape[1]=2
371
 
372
 
373
  class VocTransformerLayer(nn.Module):
 
400
 
401
  device = 'cpu' #'cuda:0'
402
  model = Voc.from_pretrained('ivao0/voc').to(device)
403
+ x, _ = soundfile.read(hf_hub_download(repo_id='ivao0/voc', filename='true.wav')) # 24 KHz
 
 
 
404
  x = torch.from_numpy(x[None, None, :]).to(dtype=torch.float, device=device)
405
+ codes = model.encode(x) # [bs, len(_acoustic_books) + 1, T]
406
  y = model.decode(codes) # audio signal 24KHz
407
  soundfile.write('reconstruct.wav', y[0, 0, :].cpu().numpy(), 24000)
408
+ model._flush() # Call before encode()/decode() with a different batch size
409
+
410
  ```