Spaces:
Sleeping
Sleeping
v1
Browse files- __pycache__/model.cpython-310.pyc +0 -0
- app.py +30 -2
- cvae/__init__.py +7 -0
- cvae/__pycache__/__init__.cpython-310.pyc +0 -0
- cvae/__pycache__/__init__.cpython-311.pyc +0 -0
- cvae/__pycache__/blocks.cpython-310.pyc +0 -0
- cvae/__pycache__/blocks.cpython-311.pyc +0 -0
- cvae/__pycache__/models.cpython-310.pyc +0 -0
- cvae/__pycache__/models.cpython-311.pyc +0 -0
- cvae/blocks.py +59 -0
- cvae/models.py +167 -0
- epoch=17-step=650718.ckpt +3 -0
- model.py +28 -0
__pycache__/model.cpython-310.pyc
ADDED
|
Binary file (1.96 kB). View file
|
|
|
app.py
CHANGED
|
@@ -1,4 +1,32 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
x = st.slider('Select a value')
|
| 4 |
-
st.write(x, 'squared is', x * x)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from model import generate
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
# Streamlit re-runs this script on every interaction, so the last generated
# clip is kept in session state. The placeholder must be all zeros: np.empty
# returns uninitialized memory, which made the `.any()` check below
# nondeterministic and could play garbage before anything was generated.
if "result" not in st.session_state:
    st.session_state["result"] = np.zeros(16000*4)

st.title("Sound Exploration")

col1, col2 = st.columns(2)

with col1:
    instrument = st.selectbox(
        'Which intrument do you want ?',
        ('🎸 Bass', '🎺 Brass', '🪈 Flute', '🪕 Guitar', '🎹 Keyboard', '🔨 Mallet', 'Organ', 'Reed', '🎻 String', 'Synth lead', '🎙️ Vocal')
    )

with col2:
    instrument_t = st.selectbox(
        'Which type intrument do you want ?',
        ('📯 Acoustic', '🎙️ Electronic', '🎛️ Synthetic')
    )

with st.expander("Magical parameters 🪄"):
    # NOTE: p1 is read but not forwarded to generate() yet.
    p1 = st.slider('p1', 0., 1., step=0.001)

if st.button("Generate ✨", type="primary"):
    st.session_state["result"] = generate([instrument, instrument_t])

# Only show the player once a non-silent clip exists.
if st.session_state["result"].any():
    st.audio(st.session_state["result"], sample_rate=16000)
|
| 32 |
|
|
|
|
|
|
cvae/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .models import (
|
| 2 |
+
Encoder, Decoder, VAE, CVAE
|
| 3 |
+
)
|
| 4 |
+
|
| 5 |
+
from .blocks import (
|
| 6 |
+
UpResConvBlock, DownResConvBlock
|
| 7 |
+
)
|
cvae/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (316 Bytes). View file
|
|
|
cvae/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (430 Bytes). View file
|
|
|
cvae/__pycache__/blocks.cpython-310.pyc
ADDED
|
Binary file (2.01 kB). View file
|
|
|
cvae/__pycache__/blocks.cpython-311.pyc
ADDED
|
Binary file (4.32 kB). View file
|
|
|
cvae/__pycache__/models.cpython-310.pyc
ADDED
|
Binary file (6.09 kB). View file
|
|
|
cvae/__pycache__/models.cpython-311.pyc
ADDED
|
Binary file (12 kB). View file
|
|
|
cvae/blocks.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch import nn
|
| 2 |
+
|
| 3 |
+
class UpResConvBlock(nn.Module):
    """Residual 1-D convolution block that doubles the temporal length.

    Both branches start with a x2 upsampling. The main branch then applies two
    conv -> GroupNorm -> GELU stages; the residual branch is a bias-free 1x1
    convolution that only adapts the channel count. forward() returns their sum.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels of the output tensor.
        kernel_size: Kernel width of the main-branch convolutions. Should be
            odd so the padding below keeps both branches the same length.
    """

    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()

        # Without padding, any kernel_size > 1 shortens the main branch and the
        # sum in forward() fails. kernel_size == 1 yields padding 0, i.e. the
        # previous behaviour, and parameter names/shapes are unchanged.
        padding = kernel_size // 2

        self.residual = nn.Sequential(
            nn.Upsample(scale_factor=2),
            nn.Conv1d(in_channels, out_channels, 1, 1, bias=False),
        )

        self.main = nn.Sequential(
            nn.Upsample(scale_factor=2),
            nn.Conv1d(in_channels, out_channels, kernel_size, 1, padding=padding),
            nn.GroupNorm(1, out_channels),
            nn.GELU(),
            nn.Conv1d(out_channels, out_channels, kernel_size, 1, padding=padding),
            nn.GroupNorm(1, out_channels),
            nn.GELU()
        )

    def forward(self, x):
        """Return main(x) + residual(x); the length is twice that of ``x``."""
        return self.main(x) + self.residual(x)
|
| 24 |
+
|
| 25 |
+
class DownResConvBlock(nn.Module):
    """Residual 1-D convolution block that halves the temporal length.

    The main branch is a stride-2 convolution followed by a stride-1
    convolution, each with GroupNorm and GELU. The residual branch is a
    bias-free 1x1 stride-2 convolution. forward() returns their sum.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels of the output tensor.
        kernel_size: Kernel width of the main-branch convolutions. Should be
            odd so the padding below keeps both branches the same length.
    """

    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()

        # Without padding, any kernel_size > 1 makes the main branch shorter
        # than the residual one and the sum in forward() fails.
        # kernel_size == 1 yields padding 0, i.e. the previous behaviour.
        padding = kernel_size // 2

        self.residual = nn.Conv1d(in_channels, out_channels, 1, 2, bias=False)

        self.main = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size, 2, padding=padding),
            nn.GroupNorm(1, out_channels),
            nn.GELU(),
            nn.Conv1d(out_channels, out_channels, kernel_size, 1, padding=padding),
            nn.GroupNorm(1, out_channels),
            nn.GELU()
        )

    def forward(self, x):
        """Return main(x) + residual(x); the length is half that of ``x``."""
        return self.main(x) + self.residual(x)
|
| 42 |
+
|
| 43 |
+
class ResConvBlock(nn.Module):
    """Residual 1-D convolution block that keeps the temporal length.

    The main branch applies two conv -> GroupNorm -> GELU stages. The residual
    branch is the identity when the channel counts match, otherwise a
    bias-free 1x1 convolution. forward() returns their sum.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels of the output tensor.
        kernel_size: Kernel width of the main-branch convolutions. Should be
            odd so the padding below preserves the input length.
    """

    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()

        # Without padding, any kernel_size > 1 shortens the main branch and the
        # sum in forward() fails. kernel_size == 1 yields padding 0, i.e. the
        # previous behaviour.
        padding = kernel_size // 2

        if in_channels == out_channels:
            self.residual = nn.Identity()
        else:
            self.residual = nn.Conv1d(in_channels, out_channels, 1, bias=False)

        self.main = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding),
            nn.GroupNorm(1, out_channels),
            nn.GELU(),
            nn.Conv1d(out_channels, out_channels, kernel_size, padding=padding),
            nn.GroupNorm(1, out_channels),
            nn.GELU()
        )

    def forward(self, x):
        """Return main(x) + residual(x); the length equals that of ``x``."""
        return self.main(x) + self.residual(x)
|
cvae/models.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch import nn, Tensor
|
| 3 |
+
from torch.optim import Optimizer
|
| 4 |
+
from .blocks import UpResConvBlock, DownResConvBlock
|
| 5 |
+
import lightning as L
|
| 6 |
+
from auraloss.freq import MultiResolutionSTFTLoss
|
| 7 |
+
|
| 8 |
+
class Encoder(nn.Module):
    """Convolutional encoder that outputs the mean and log-variance of a latent.

    The network is a 1x1 convolution with GELU, one ``DownResConvBlock`` per
    entry of ``channels`` (each halving the length), then a flatten and a
    linear layer producing ``2 * out_features`` values per sample.

    Args:
        in_channels: Number of channels of the input signal.
        in_features: Length of the input signal; must be divisible by
            ``2 ** len(channels)``.
        out_features: Size of the latent vector.
        channels: Channel width of each downsampling stage (required, despite
            the ``None`` default kept for signature compatibility).

    Raises:
        ValueError: If ``channels`` is ``None`` or empty.
        AssertionError: If ``in_features`` is not divisible by the total
            downscale factor.
    """

    def __init__(self,
                 in_channels: int,
                 in_features: int,
                 out_features: int,
                 channels: list = None,
                 ) -> None:
        super().__init__()

        # The None default used to fail with an opaque TypeError on len(None).
        if not channels:
            raise ValueError("channels must be a non-empty sequence of channel widths")
        # Copy so tuples work too and the caller's object is never aliased.
        widths = list(channels)
        downscale = 2 ** len(widths)

        assert in_features % downscale == 0, f"in_features ({in_features}) must be a multiple of downscale factor ({downscale})"

        modules = [
            nn.Conv1d(in_channels, widths[0], 1),
            nn.GELU()
        ]

        # One block per width; the last block maps widths[-1] -> widths[-1].
        for in_channel, out_channel in zip(widths, widths[1:] + [widths[-1]]):
            modules.append(DownResConvBlock(in_channel, out_channel, 1))

        # Exact integer size of the signal after all halvings.
        n_features = in_features // downscale

        modules += [
            nn.Flatten(),
            nn.Linear(n_features * widths[-1], 2 * out_features)
        ]

        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Return ``(mean, logvar)``, the two halves of the network output along dim 1."""
        mean, logvar = self.net(x).chunk(2, dim=1)
        return mean, logvar
|
| 41 |
+
|
| 42 |
+
class Decoder(nn.Module):
    """Convolutional decoder that maps a latent vector back to a signal.

    The network is a linear layer reshaped to ``(channels[0], n_features)``,
    one ``UpResConvBlock`` per entry of ``channels`` (each doubling the
    length), then a 1x1 convolution with GELU. forward() applies ``tanh`` on
    top of that.

    Args:
        out_channels: Number of channels of the generated signal.
        in_features: Size of the input latent vector.
        out_features: Target length of the generated signal. The output has
            exactly this length only when it is divisible by
            ``2 ** len(channels)``.
        channels: Channel width of each upsampling stage (required, despite
            the ``None`` default kept for signature compatibility).

    Raises:
        ValueError: If ``channels`` is ``None`` or empty.
    """

    def __init__(self,
                 out_channels: int,
                 in_features: int,
                 out_features: int,
                 channels: list = None,
                 ) -> None:
        super().__init__()

        # The None default used to fail with an opaque TypeError on len(None).
        if not channels:
            raise ValueError("channels must be a non-empty sequence of channel widths")
        # Copy so tuples work too and the caller's object is never aliased.
        widths = list(channels)

        # Length of the signal before the upsampling stages (floor division,
        # same result as the former int(out_features / 2**n)).
        n_features = out_features // 2 ** len(widths)

        modules = [
            nn.Linear(in_features, n_features * widths[0]),
            nn.Unflatten(-1, (widths[0], n_features))
        ]

        # One block per width; the last block maps widths[-1] -> widths[-1].
        for in_channel, out_channel in zip(widths, widths[1:] + [widths[-1]]):
            modules.append(UpResConvBlock(in_channel, out_channel, 1))

        modules += [
            nn.Conv1d(widths[-1], out_channels, 1),
            nn.GELU()
        ]

        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Decode latent ``x`` and squash the result with ``tanh``."""
        # NOTE(review): the trailing GELU bounds the pre-tanh activation from
        # below (about -0.17), so the output cannot cover the negative half of
        # [-1, 1]. Left unchanged to stay compatible with trained checkpoints.
        return torch.tanh(self.net(x))
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class VAE(L.LightningModule):
    """Variational auto-encoder built from ``Encoder`` and ``Decoder``.

    Args:
        io_channels: Number of channels of the input/output signal.
        io_features: Length of the input/output signal.
        latent_features: Size of the latent vector.
        channels: Channel widths of the encoder stages; the decoder uses them
            in reverse order.
        learning_rate: Learning rate passed to AdamW.
    """

    def __init__(self, io_channels: int, io_features: int, latent_features: int, channels: list, learning_rate: float):
        super().__init__()
        self.encoder = Encoder(io_channels, io_features, latent_features, channels)
        # Use a reversed copy: the former in-place channels.reverse() silently
        # modified the list owned by the caller.
        decoder_channels = list(reversed(channels))
        self.decoder = Decoder(io_channels, latent_features, io_features, decoder_channels)
        self.latent_features = latent_features
        self.audio_loss_func = MultiResolutionSTFTLoss()
        self.learning_rate = learning_rate

    @torch.no_grad()
    def sample(self, eps=None):
        """Decode a latent vector; draw one from the prior when ``eps`` is None."""
        if eps is None:
            # The KL term in loss_function targets a standard normal prior, so
            # sample from it (not from uniform [0, 1)), on the model's device.
            eps = torch.randn((1, self.latent_features), device=self.device)
        return self.decoder(eps)

    def loss_function(self, x, x_hat, mean, logvar):
        """Return reconstruction loss plus KL divergence to the standard normal.

        Args:
            x: Reference signal.
            x_hat: Reconstruction produced by the decoder.
            mean: Latent mean from the encoder.
            logvar: Latent log-variance from the encoder.
        """
        # auraloss expects (input, target), i.e. prediction first; its spectral
        # convergence term is normalised by the target magnitude.
        audio_loss = self.audio_loss_func(x_hat, x)
        # Summed over every element of the batch (not averaged).
        kld_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
        return audio_loss + kld_loss

    def reparameterize(self, mean, logvar):
        """Return ``mean + eps * std`` with ``eps`` drawn from a standard normal."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps * std + mean

    def forward(self, x):
        """Encode ``x``, sample a latent and return ``(x_hat, mean, logvar)``."""
        mean, logvar = self.encoder(x)
        z = self.reparameterize(mean, logvar)
        return self.decoder(z), mean, logvar

    def training_step(self, batch: Tensor, batch_idx: int, log: bool = True) -> Tensor:
        """Compute the loss for one batch, logging it as ``train_loss`` when ``log`` is set."""
        x_hat, mean, logvar = self.forward(batch)
        loss = self.loss_function(batch, x_hat, mean, logvar)
        if log:
            self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self) -> Optimizer:
        """Return an AdamW optimizer over all parameters."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class CVAE(L.LightningModule):
    """Conditional variational auto-encoder built from ``Encoder`` and ``Decoder``.

    The class label is one-hot encoded. For encoding it is projected to a
    signal-length vector and concatenated to the input as an extra channel;
    for decoding the one-hot vector is concatenated to the latent.

    Args:
        io_channels: Number of channels of the input/output signal.
        io_features: Length of the input/output signal.
        latent_features: Size of the latent vector (without the class part).
        channels: Channel widths of the encoder stages; the decoder uses them
            in reverse order.
        num_classes: Number of distinct class labels.
        learning_rate: Learning rate passed to AdamW.
    """

    def __init__(self, io_channels: int, io_features: int, latent_features: int, channels: list, num_classes: int, learning_rate: float):
        super().__init__()
        self.class_embedder = nn.Linear(num_classes, io_features)
        self.data_embedder = nn.Conv1d(io_channels, io_channels, kernel_size=1)
        # +1 channel for the class embedding concatenated in forward().
        self.encoder = Encoder(io_channels+1, io_features, latent_features, channels)
        # Use a reversed copy: the former in-place channels.reverse() silently
        # modified the list owned by the caller.
        decoder_channels = list(reversed(channels))
        self.decoder = Decoder(io_channels, latent_features+num_classes, io_features, decoder_channels)
        self.num_classes = num_classes
        self.latent_features = latent_features
        self.audio_loss_func = MultiResolutionSTFTLoss()
        self.learning_rate = learning_rate

    @torch.no_grad()
    def sample(self, c, eps=None):
        """Decode one sample for class index ``c``.

        Args:
            c: Integer tensor holding a single class index (one-hot encoded
                here and given a leading batch dimension).
            eps: Optional latent of shape ``(1, latent_features)``; drawn from
                the prior when omitted.
        """
        c = nn.functional.one_hot(c, num_classes=self.num_classes).float().unsqueeze(0)
        if eps is None:
            # The KL term in loss_function targets a standard normal prior, so
            # sample from it (not from uniform [0, 1)), and on the same device
            # as ``c`` so the concatenation below cannot mix devices.
            eps = torch.randn((1, self.latent_features), device=c.device)
        z = torch.cat([eps, c], dim=1)
        return self.decoder(z)

    def loss_function(self, x, x_hat, mean, logvar):
        """Return reconstruction loss plus KL divergence to the standard normal.

        Args:
            x: Reference signal.
            x_hat: Reconstruction produced by the decoder.
            mean: Latent mean from the encoder.
            logvar: Latent log-variance from the encoder.
        """
        # auraloss expects (input, target), i.e. prediction first; its spectral
        # convergence term is normalised by the target magnitude.
        audio_loss = self.audio_loss_func(x_hat, x)
        # Summed over every element of the batch (not averaged).
        kld_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
        return audio_loss + kld_loss

    def reparameterize(self, mean, logvar):
        """Return ``mean + eps * std`` with ``eps`` drawn from a standard normal."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps * std + mean

    def forward(self, x, c):
        """Encode ``x`` conditioned on ``c`` and return ``(x_hat, mean, logvar)``."""
        c = nn.functional.one_hot(c, num_classes=self.num_classes).float()
        # Class embedding becomes one extra channel next to the embedded input.
        c_embedding = self.class_embedder(c).unsqueeze(1)
        x_embedding = self.data_embedder(x)
        x = torch.cat([x_embedding, c_embedding], dim=1)
        mean, logvar = self.encoder(x)
        z = self.reparameterize(mean, logvar)
        # The decoder is conditioned on the raw one-hot label.
        z = torch.cat([z, c], dim=1)
        return self.decoder(z), mean, logvar

    def training_step(self, batch: Tensor, batch_idx: int, log: bool = True) -> Tensor:
        """Compute the loss for one ``(x, c)`` batch, logging it as ``train_loss`` when ``log`` is set."""
        x, c = batch
        x_hat, mean, logvar = self.forward(x, c)
        loss = self.loss_function(x, x_hat, mean, logvar)
        if log:
            self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self) -> Optimizer:
        """Return an AdamW optimizer over all parameters."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
|
epoch=17-step=650718.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cbccc3cf4b4a124831ab6fc7f23b4270ed90fcb41e1e87277ec4155787362c8
|
| 3 |
+
size 651547328
|
model.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cvae import CVAE
|
| 2 |
+
import torch
|
| 3 |
+
from typing import Sequence
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
# Class labels. A label's position in this list is the class index handed to
# the model (see choice_to_tensor), so the order must not change.
instruments = [
    'bass_acoustic', 'brass_acoustic', 'flute_acoustic', 'guitar_acoustic',
    'keyboard_acoustic', 'mallet_acoustic', 'organ_acoustic', 'reed_acoustic',
    'string_acoustic', 'synth_lead_acoustic', 'vocal_acoustic',
    'bass_synthetic', 'brass_synthetic', 'flute_synthetic', 'guitar_synthetic',
    'keyboard_synthetic', 'mallet_synthetic', 'organ_synthetic', 'reed_synthetic',
    'string_synthetic', 'synth_lead_synthetic', 'vocal_synthetic',
    'bass_electronic', 'brass_electronic', 'flute_electronic', 'guitar_electronic',
    'keyboard_electronic', 'mallet_electronic', 'organ_electronic', 'reed_electronic',
    'string_electronic', 'synth_lead_electronic', 'vocal_electronic',
]

# Load on the GPU when one is present, otherwise on the CPU. Without an
# explicit map_location the load depends on the device the checkpoint was
# saved from.
_device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = CVAE.load_from_checkpoint(
    'epoch=17-step=650718.ckpt',
    map_location=_device,
    io_channels=1,
    io_features=16000*4,
    latent_features=5,
    channels=[32, 64, 128, 256, 512],
    num_classes=len(instruments),
    learning_rate=1e-5
)
model.to(_device)
# Inference only: make sure the module is not left in training mode.
model.eval()
|
| 17 |
+
|
| 18 |
+
def format(text):
    """Turn a UI label into the lowercase, underscore-joined key used in ``instruments``.

    Leading emoji are dropped and every remaining word is kept, so
    ``'🎸 Bass'`` becomes ``'bass'`` and ``'Synth lead'`` becomes
    ``'synth_lead'``. The previous version kept only the last word, which
    mapped ``'Synth lead'`` to ``'lead'`` and made the later lookup fail.

    Args:
        text: Label as shown in the select box.

    Returns:
        The ASCII-letter words of ``text``, lowercased and joined with ``_``.
    """
    # Emoji and variation selectors are not ASCII letters, so they are skipped.
    words = re.findall(r"[A-Za-z]+", text)
    return "_".join(words).lower()
|
| 21 |
+
|
| 22 |
+
def choice_to_tensor(choice: Sequence[str]) -> torch.Tensor:
    """Convert UI selections into the class-index tensor for the model.

    Each entry of ``choice`` is normalised with ``format``; the results are
    joined with ``_`` and looked up in ``instruments``.

    Raises:
        ValueError: If the joined label is not present in ``instruments``.
    """
    label = '_'.join(map(format, choice))
    class_index = instruments.index(label)
    return torch.tensor(class_index)
|
| 25 |
+
|
| 26 |
+
def generate(choice: Sequence[str], params: Sequence[int]=None):
    """Generate one audio clip for the selected instrument.

    Args:
        choice: UI selections, converted to a class index by
            ``choice_to_tensor``.
        params: Optional latent values (one per latent dimension). When
            omitted or empty, a latent is drawn from a standard normal.

    Returns:
        The first batch element of the model output as a NumPy array.
    """
    # Follow the device the model actually lives on instead of assuming CUDA,
    # so the app also runs on CPU-only hosts.
    device = model.device

    if params is not None and len(params) > 0:
        # Force float: integer params would otherwise produce an integer tensor.
        noise = torch.tensor(params, dtype=torch.float32, device=device).unsqueeze(0)
    else:
        noise = torch.randn(1, model.latent_features, device=device)

    label = choice_to_tensor(choice).to(device)
    return model.sample(eps=noise, c=label).cpu().numpy()[0]
|