Instructions to use timofeiiz/soundstream-impl with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use timofeiiz/soundstream-impl with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("timofeiiz/soundstream-impl", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
Upload folder using huggingface_hub
Browse files
- README.md +1 -4
- config.json +8 -3
- config_model.py +19 -0
- model.py +13 -22
README.md
CHANGED
|
@@ -1,4 +1 @@
|
|
| 1 |
-
|
| 2 |
-
license: mit
|
| 3 |
-
---
|
| 4 |
-
Soundstream implementation. Sample rate is 16000.
|
|
|
|
| 1 |
+
Soundstream implementation. Sample rate 16000.
|
|
|
|
|
|
|
|
|
config.json
CHANGED
|
@@ -1,6 +1,11 @@
|
|
| 1 |
{
|
|
|
|
| 2 |
"channels": 32,
|
| 3 |
-
"codebook_size": 1024,
|
| 4 |
"latent_dim": 512,
|
| 5 |
-
"
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
{
|
| 2 |
+
"model_type": "soundstream",
|
| 3 |
"channels": 32,
|
|
|
|
| 4 |
"latent_dim": 512,
|
| 5 |
+
"codebook_size": 1024,
|
| 6 |
+
"num_quantizers": 8,
|
| 7 |
+
"auto_map": {
|
| 8 |
+
"AutoConfig": "config_model.SoundStreamConfig",
|
| 9 |
+
"AutoModel": "model.SoundStreamCodec"
|
| 10 |
+
}
|
| 11 |
+
}
|
config_model.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import PretrainedConfig
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class SoundStreamConfig(PretrainedConfig):
|
| 5 |
+
model_type = "soundstream"
|
| 6 |
+
|
| 7 |
+
def __init__(
|
| 8 |
+
self,
|
| 9 |
+
channels: int = 32,
|
| 10 |
+
latent_dim: int = 512,
|
| 11 |
+
codebook_size: int = 1024,
|
| 12 |
+
num_quantizers: int = 8,
|
| 13 |
+
**kwargs,
|
| 14 |
+
):
|
| 15 |
+
super().__init__(**kwargs)
|
| 16 |
+
self.channels = channels
|
| 17 |
+
self.latent_dim = latent_dim
|
| 18 |
+
self.codebook_size = codebook_size
|
| 19 |
+
self.num_quantizers = num_quantizers
|
model.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
import torch.nn.functional as F
|
| 3 |
from torch import nn
|
| 4 |
-
from
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
class CausalConv1d(nn.Module):
|
|
@@ -102,7 +104,6 @@ class Encoder(nn.Module):
|
|
| 102 |
|
| 103 |
def __init__(self, channels: int = 16, dim: int = 512):
|
| 104 |
super().__init__()
|
| 105 |
-
# NB: attribute name "encoder" matches training checkpoint keys
|
| 106 |
self.encoder = nn.Sequential(
|
| 107 |
CausalConv1d(kernel_size=7, in_channels=1, out_channels=channels),
|
| 108 |
EncoderBlock(channels=2 * channels, s=2),
|
|
@@ -120,7 +121,6 @@ class Decoder(nn.Module):
|
|
| 120 |
|
| 121 |
def __init__(self, channels: int = 16, dim: int = 512):
|
| 122 |
super().__init__()
|
| 123 |
-
# NB: attribute name "decoder" matches training checkpoint keys
|
| 124 |
self.decoder = nn.Sequential(
|
| 125 |
CausalConv1d(kernel_size=7, in_channels=dim, out_channels=16 * channels),
|
| 126 |
DecoderBlock(channels=16 * channels, s=5),
|
|
@@ -325,32 +325,23 @@ class ResidualVectorQuantizer(nn.Module):
|
|
| 325 |
return quantized
|
| 326 |
|
| 327 |
|
| 328 |
-
class SoundStreamCodec(
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
):
|
| 334 |
-
def __init__(
|
| 335 |
-
self,
|
| 336 |
-
channels: int = 32,
|
| 337 |
-
latent_dim: int = 512,
|
| 338 |
-
codebook_size: int = 1024,
|
| 339 |
-
num_quantizers: int = 8,
|
| 340 |
-
):
|
| 341 |
-
super().__init__()
|
| 342 |
self.strides = (2, 4, 5, 5)
|
| 343 |
self.downsampling_factor = 1
|
| 344 |
for s in self.strides:
|
| 345 |
self.downsampling_factor *= s
|
| 346 |
|
| 347 |
-
self.encoder = Encoder(channels=channels, dim=latent_dim)
|
| 348 |
self.quantizer = ResidualVectorQuantizer(
|
| 349 |
-
latent_dim=latent_dim,
|
| 350 |
-
codebook_size=codebook_size,
|
| 351 |
-
num_quantizers=num_quantizers,
|
| 352 |
)
|
| 353 |
-
self.decoder = Decoder(channels=channels, dim=latent_dim)
|
| 354 |
|
| 355 |
def forward(self, audio, **kwargs):
|
| 356 |
original_length = audio.size(-1)
|
|
|
|
| 1 |
import torch
|
| 2 |
import torch.nn.functional as F
|
| 3 |
from torch import nn
|
| 4 |
+
from transformers import PreTrainedModel
|
| 5 |
+
|
| 6 |
+
from .config_model import SoundStreamConfig
|
| 7 |
|
| 8 |
|
| 9 |
class CausalConv1d(nn.Module):
|
|
|
|
| 104 |
|
| 105 |
def __init__(self, channels: int = 16, dim: int = 512):
|
| 106 |
super().__init__()
|
|
|
|
| 107 |
self.encoder = nn.Sequential(
|
| 108 |
CausalConv1d(kernel_size=7, in_channels=1, out_channels=channels),
|
| 109 |
EncoderBlock(channels=2 * channels, s=2),
|
|
|
|
| 121 |
|
| 122 |
def __init__(self, channels: int = 16, dim: int = 512):
|
| 123 |
super().__init__()
|
|
|
|
| 124 |
self.decoder = nn.Sequential(
|
| 125 |
CausalConv1d(kernel_size=7, in_channels=dim, out_channels=16 * channels),
|
| 126 |
DecoderBlock(channels=16 * channels, s=5),
|
|
|
|
| 325 |
return quantized
|
| 326 |
|
| 327 |
|
| 328 |
+
class SoundStreamCodec(PreTrainedModel):
|
| 329 |
+
config_class = SoundStreamConfig
|
| 330 |
+
|
| 331 |
+
def __init__(self, config):
|
| 332 |
+
super().__init__(config)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
self.strides = (2, 4, 5, 5)
|
| 334 |
self.downsampling_factor = 1
|
| 335 |
for s in self.strides:
|
| 336 |
self.downsampling_factor *= s
|
| 337 |
|
| 338 |
+
self.encoder = Encoder(channels=config.channels, dim=config.latent_dim)
|
| 339 |
self.quantizer = ResidualVectorQuantizer(
|
| 340 |
+
latent_dim=config.latent_dim,
|
| 341 |
+
codebook_size=config.codebook_size,
|
| 342 |
+
num_quantizers=config.num_quantizers,
|
| 343 |
)
|
| 344 |
+
self.decoder = Decoder(channels=config.channels, dim=config.latent_dim)
|
| 345 |
|
| 346 |
def forward(self, audio, **kwargs):
|
| 347 |
original_length = audio.size(-1)
|