ivao0
/

voc

@@ -50,13 +50,12 @@ class Voc(Wav2Vec2PreTrainedModel):
                                               groups=512 if upsample_channel_wise_bug else 1,
                                               stride=2, bias=False)
         self.frame_rate = 12.5
-        self.encode_buffer = None  # holds raw audio chunk if incomplete < 1920 samples
-    @torch.no_grad()
     def _flush(self):
         '''Stream buffers hold tensors of the old batch size; call Voc()._flush() to clear them.
         '''
-        self.encode_buffer = None    # holds unused (incomplete windows of len < 1920) - we need 1920 to produce 1 token
         if self.downsample.previous is not None:
             self.downsample.previous = None
         if self.upsample.partial is not None:
@@ -145,7 +144,7 @@ class SEANetResnetBlock(nn.Module):
         self.block = nn.Sequential(*block)
     def forward(self, x):
-        return x + self.block(x)  # BufferConv1d ensures at least 1 kernel exists (0-pad or previous)
 class SEANetEncoder(nn.Module):
@@ -161,7 +160,7 @@ class SEANetEncoder(nn.Module):
         super().__init__()
         self.ratios = list(reversed(ratios))
         del ratios
-        mult = 1  # incr. each of for
         model=[
             BufferConv1d(
                 channels,
@@ -368,7 +367,7 @@ class VocAttention(nn.Module):
         if x.shape[1] > 1:
             x = x.mean(1, keepdims=True)
         x = torch.matmul(x, self.fused_proj)
-        return x  # FFN broadcasts to x.shape[1]
 class VocTransformerLayer(nn.Module):
@@ -401,13 +400,11 @@ class VocTransformer(nn.Module):
 device = 'cpu'  #'cuda:0'
 model = Voc.from_pretrained('ivao0/voc').to(device)
-model.n_q = 16
-x, _ = soundfile.read(
-    hf_hub_download(repo_id='ivao0/voc', filename='true.wav')
-)  # 24 KHz
 x = torch.from_numpy(x[None, None, :]).to(dtype=torch.float, device=device)
-codes = model.encode(x) # [bs, len(_acoustic_books) + 1, T] If len(x) < 1920 audio samples -> codes is torch.empty
 y = model.decode(codes) # audio signal 24KHz
 soundfile.write('reconstruct.wav', y[0, 0, :].cpu().numpy(), 24000)
-model._flush()  # Call before using encode()/decode() with a different batch size
 ```

                                               groups=512 if upsample_channel_wise_bug else 1,
                                               stride=2, bias=False)
         self.frame_rate = 12.5
+        self.encode_buffer = None
     def _flush(self):
         '''Stream buffers hold tensors of the old batch size; call Voc()._flush() to clear them.
         '''
+        self.encode_buffer = None # holds unused samples (an incomplete window of len < 1920) - we need 1920 samples to produce 1 token
         if self.downsample.previous is not None:
             self.downsample.previous = None
         if self.upsample.partial is not None:
         self.block = nn.Sequential(*block)
     def forward(self, x):
+        return x + self.block(x)
 class SEANetEncoder(nn.Module):
         super().__init__()
         self.ratios = list(reversed(ratios))
         del ratios
+        mult = 1
         model=[
             BufferConv1d(
                 channels,
         if x.shape[1] > 1:
             x = x.mean(1, keepdims=True)
         x = torch.matmul(x, self.fused_proj)
+        return x  # FFN broadcasts to x.shape[1]=2
 class VocTransformerLayer(nn.Module):
 device = 'cpu'  #'cuda:0'
 model = Voc.from_pretrained('ivao0/voc').to(device)
+x, _ = soundfile.read(hf_hub_download(repo_id='ivao0/voc', filename='true.wav'))  # 24 KHz
 x = torch.from_numpy(x[None, None, :]).to(dtype=torch.float, device=device)
+codes = model.encode(x) # [bs, len(_acoustic_books) + 1, T]
 y = model.decode(codes) # audio signal 24KHz
 soundfile.write('reconstruct.wav', y[0, 0, :].cpu().numpy(), 24000)
+model._flush()  # Call before using encode()/decode() with a different batch size
 ```