timofeiiz committed on
Commit
9a78ded
·
verified ·
1 Parent(s): 9e7c4ef

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +1 -4
  2. config.json +8 -3
  3. config_model.py +19 -0
  4. model.py +13 -22
README.md CHANGED
@@ -1,4 +1 @@
1
- ---
2
- license: mit
3
- ---
4
- Soundstream implementation. Sample rate is 16000.
 
1
+ Soundstream implementation. Sample rate 16000.
 
 
 
config.json CHANGED
@@ -1,6 +1,11 @@
1
  {
 
2
  "channels": 32,
3
- "codebook_size": 1024,
4
  "latent_dim": 512,
5
- "num_quantizers": 8
6
- }
 
 
 
 
 
 
1
  {
2
+ "model_type": "soundstream",
3
  "channels": 32,
 
4
  "latent_dim": 512,
5
+ "codebook_size": 1024,
6
+ "num_quantizers": 8,
7
+ "auto_map": {
8
+ "AutoConfig": "config_model.SoundStreamConfig",
9
+ "AutoModel": "model.SoundStreamCodec"
10
+ }
11
+ }
config_model.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
+ class SoundStreamConfig(PretrainedConfig):
5
+ model_type = "soundstream"
6
+
7
+ def __init__(
8
+ self,
9
+ channels: int = 32,
10
+ latent_dim: int = 512,
11
+ codebook_size: int = 1024,
12
+ num_quantizers: int = 8,
13
+ **kwargs,
14
+ ):
15
+ super().__init__(**kwargs)
16
+ self.channels = channels
17
+ self.latent_dim = latent_dim
18
+ self.codebook_size = codebook_size
19
+ self.num_quantizers = num_quantizers
model.py CHANGED
@@ -1,7 +1,9 @@
1
  import torch
2
  import torch.nn.functional as F
3
  from torch import nn
4
- from huggingface_hub import PyTorchModelHubMixin
 
 
5
 
6
 
7
  class CausalConv1d(nn.Module):
@@ -102,7 +104,6 @@ class Encoder(nn.Module):
102
 
103
  def __init__(self, channels: int = 16, dim: int = 512):
104
  super().__init__()
105
- # NB: attribute name "encoder" matches training checkpoint keys
106
  self.encoder = nn.Sequential(
107
  CausalConv1d(kernel_size=7, in_channels=1, out_channels=channels),
108
  EncoderBlock(channels=2 * channels, s=2),
@@ -120,7 +121,6 @@ class Decoder(nn.Module):
120
 
121
  def __init__(self, channels: int = 16, dim: int = 512):
122
  super().__init__()
123
- # NB: attribute name "decoder" matches training checkpoint keys
124
  self.decoder = nn.Sequential(
125
  CausalConv1d(kernel_size=7, in_channels=dim, out_channels=16 * channels),
126
  DecoderBlock(channels=16 * channels, s=5),
@@ -325,32 +325,23 @@ class ResidualVectorQuantizer(nn.Module):
325
  return quantized
326
 
327
 
328
- class SoundStreamCodec(
329
- nn.Module,
330
- PyTorchModelHubMixin,
331
- library_name="soundstream-impl",
332
- license="mit",
333
- ):
334
- def __init__(
335
- self,
336
- channels: int = 32,
337
- latent_dim: int = 512,
338
- codebook_size: int = 1024,
339
- num_quantizers: int = 8,
340
- ):
341
- super().__init__()
342
  self.strides = (2, 4, 5, 5)
343
  self.downsampling_factor = 1
344
  for s in self.strides:
345
  self.downsampling_factor *= s
346
 
347
- self.encoder = Encoder(channels=channels, dim=latent_dim)
348
  self.quantizer = ResidualVectorQuantizer(
349
- latent_dim=latent_dim,
350
- codebook_size=codebook_size,
351
- num_quantizers=num_quantizers,
352
  )
353
- self.decoder = Decoder(channels=channels, dim=latent_dim)
354
 
355
  def forward(self, audio, **kwargs):
356
  original_length = audio.size(-1)
 
1
  import torch
2
  import torch.nn.functional as F
3
  from torch import nn
4
+ from transformers import PreTrainedModel
5
+
6
+ from .config_model import SoundStreamConfig
7
 
8
 
9
  class CausalConv1d(nn.Module):
 
104
 
105
  def __init__(self, channels: int = 16, dim: int = 512):
106
  super().__init__()
 
107
  self.encoder = nn.Sequential(
108
  CausalConv1d(kernel_size=7, in_channels=1, out_channels=channels),
109
  EncoderBlock(channels=2 * channels, s=2),
 
121
 
122
  def __init__(self, channels: int = 16, dim: int = 512):
123
  super().__init__()
 
124
  self.decoder = nn.Sequential(
125
  CausalConv1d(kernel_size=7, in_channels=dim, out_channels=16 * channels),
126
  DecoderBlock(channels=16 * channels, s=5),
 
325
  return quantized
326
 
327
 
328
+ class SoundStreamCodec(PreTrainedModel):
329
+ config_class = SoundStreamConfig
330
+
331
+ def __init__(self, config):
332
+ super().__init__(config)
 
 
 
 
 
 
 
 
 
333
  self.strides = (2, 4, 5, 5)
334
  self.downsampling_factor = 1
335
  for s in self.strides:
336
  self.downsampling_factor *= s
337
 
338
+ self.encoder = Encoder(channels=config.channels, dim=config.latent_dim)
339
  self.quantizer = ResidualVectorQuantizer(
340
+ latent_dim=config.latent_dim,
341
+ codebook_size=config.codebook_size,
342
+ num_quantizers=config.num_quantizers,
343
  )
344
+ self.decoder = Decoder(channels=config.channels, dim=config.latent_dim)
345
 
346
  def forward(self, audio, **kwargs):
347
  original_length = audio.size(-1)