IvA committed on
Commit ·
ec70542
1
Parent(s): 4560afd
fx
Browse files
README.md
CHANGED
|
@@ -50,13 +50,12 @@ class Voc(Wav2Vec2PreTrainedModel):
|
|
| 50 |
groups=512 if upsample_channel_wise_bug else 1,
|
| 51 |
stride=2, bias=False)
|
| 52 |
self.frame_rate = 12.5
|
| 53 |
-
self.encode_buffer = None
|
| 54 |
|
| 55 |
-
@torch.no_grad()
|
| 56 |
def _flush(self):
|
| 57 |
'''stream buffers have tensors of old batch size! Voc()._flush() to clean buffers
|
| 58 |
'''
|
| 59 |
-
self.encode_buffer = None
|
| 60 |
if self.downsample.previous is not None:
|
| 61 |
self.downsample.previous = None
|
| 62 |
if self.upsample.partial is not None:
|
|
@@ -145,7 +144,7 @@ class SEANetResnetBlock(nn.Module):
|
|
| 145 |
self.block = nn.Sequential(*block)
|
| 146 |
|
| 147 |
def forward(self, x):
|
| 148 |
-
return x + self.block(x)
|
| 149 |
|
| 150 |
|
| 151 |
class SEANetEncoder(nn.Module):
|
|
@@ -161,7 +160,7 @@ class SEANetEncoder(nn.Module):
|
|
| 161 |
super().__init__()
|
| 162 |
self.ratios = list(reversed(ratios))
|
| 163 |
del ratios
|
| 164 |
-
mult = 1
|
| 165 |
model=[
|
| 166 |
BufferConv1d(
|
| 167 |
channels,
|
|
@@ -368,7 +367,7 @@ class VocAttention(nn.Module):
|
|
| 368 |
if x.shape[1] > 1:
|
| 369 |
x = x.mean(1, keepdims=True)
|
| 370 |
x = torch.matmul(x, self.fused_proj)
|
| 371 |
-
return x # FFN broadcasts to x.shape[1]
|
| 372 |
|
| 373 |
|
| 374 |
class VocTransformerLayer(nn.Module):
|
|
@@ -401,13 +400,11 @@ class VocTransformer(nn.Module):
|
|
| 401 |
|
| 402 |
device = 'cpu' #'cuda:0'
|
| 403 |
model = Voc.from_pretrained('ivao0/voc').to(device)
|
| 404 |
-
|
| 405 |
-
x, _ = soundfile.read(
|
| 406 |
-
hf_hub_download(repo_id='ivao0/voc', filename='true.wav')
|
| 407 |
-
) # 24 KHz
|
| 408 |
x = torch.from_numpy(x[None, None, :]).to(dtype=torch.float, device=device)
|
| 409 |
-
codes = model.encode(x) # [bs, len(_acoustic_books) + 1, T]
|
| 410 |
y = model.decode(codes) # audio signal 24KHz
|
| 411 |
soundfile.write('reconstruct.wav', y[0, 0, :].cpu().numpy(), 24000)
|
| 412 |
-
model._flush() # For
|
|
|
|
| 413 |
```
|
|
|
|
| 50 |
groups=512 if upsample_channel_wise_bug else 1,
|
| 51 |
stride=2, bias=False)
|
| 52 |
self.frame_rate = 12.5
|
| 53 |
+
self.encode_buffer = None
|
| 54 |
|
|
|
|
| 55 |
def _flush(self):
|
| 56 |
'''stream buffers have tensors of old batch size! Voc()._flush() to clean buffers
|
| 57 |
'''
|
| 58 |
+
self.encode_buffer = None # holds unused input (an incomplete window of len < 1920) - a full window of 1920 is needed to produce 1 token
|
| 59 |
if self.downsample.previous is not None:
|
| 60 |
self.downsample.previous = None
|
| 61 |
if self.upsample.partial is not None:
|
|
|
|
| 144 |
self.block = nn.Sequential(*block)
|
| 145 |
|
| 146 |
def forward(self, x):
|
| 147 |
+
return x + self.block(x)
|
| 148 |
|
| 149 |
|
| 150 |
class SEANetEncoder(nn.Module):
|
|
|
|
| 160 |
super().__init__()
|
| 161 |
self.ratios = list(reversed(ratios))
|
| 162 |
del ratios
|
| 163 |
+
mult = 1
|
| 164 |
model=[
|
| 165 |
BufferConv1d(
|
| 166 |
channels,
|
|
|
|
| 367 |
if x.shape[1] > 1:
|
| 368 |
x = x.mean(1, keepdims=True)
|
| 369 |
x = torch.matmul(x, self.fused_proj)
|
| 370 |
+
return x # FFN broadcasts to x.shape[1]=2
|
| 371 |
|
| 372 |
|
| 373 |
class VocTransformerLayer(nn.Module):
|
|
|
|
| 400 |
|
| 401 |
device = 'cpu' #'cuda:0'
|
| 402 |
model = Voc.from_pretrained('ivao0/voc').to(device)
|
| 403 |
+
x, _ = soundfile.read(hf_hub_download(repo_id='ivao0/voc', filename='true.wav')) # 24 KHz
|
|
|
|
|
|
|
|
|
|
| 404 |
x = torch.from_numpy(x[None, None, :]).to(dtype=torch.float, device=device)
|
| 405 |
+
codes = model.encode(x) # [bs, len(_acoustic_books) + 1, T]
|
| 406 |
y = model.decode(codes) # audio signal 24KHz
|
| 407 |
soundfile.write('reconstruct.wav', y[0, 0, :].cpu().numpy(), 24000)
|
| 408 |
+
model._flush() # Call before encode()/decode() with a different batch size
|
| 409 |
+
|
| 410 |
```
|