add vae and slicer modules
Browse files — create evaluate module as template
update app file to slice given songs
- __pycache__/config.cpython-312.pyc +0 -0
- __pycache__/slicer_module.cpython-312.pyc +0 -0
- __pycache__/vae_module.cpython-312.pyc +0 -0
- app.py +34 -4
- config.py +14 -0
- evaluate_module.py +1 -0
- requirements.txt +4 -0
- slicer_module.py +30 -0
- vae_model.pth +3 -0
- vae_model_state_dict.pth +3 -0
- vae_module.py +131 -0
__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (943 Bytes). View file
|
|
|
__pycache__/slicer_module.cpython-312.pyc
ADDED
|
Binary file (1.44 kB). View file
|
|
|
__pycache__/vae_module.cpython-312.pyc
ADDED
|
Binary file (7.56 kB). View file
|
|
|
app.py
CHANGED
|
@@ -1,7 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
def greet(name):
|
| 4 |
-
return "Hello " + name + "!!"
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import torch
|
| 4 |
import gradio as gr
|
| 5 |
+
from vae_module import VAE, Encoder, Decoder, loss_function
|
| 6 |
+
from config import config
|
| 7 |
+
from slicer_module import get_slices
|
| 8 |
+
from diffusers import UNet2DConditionModel, DDPMScheduler
|
| 9 |
|
|
|
|
|
|
|
| 10 |
|
| 11 |
+
vae = VAE()
|
| 12 |
+
vae.load_state_dict(torch.load('vae_model_state_dict.pth', map_location=torch.device('cpu')))
|
| 13 |
+
vae.to(config.device)
|
| 14 |
+
vae.eval()
|
| 15 |
+
|
| 16 |
+
model = UNet2DConditionModel.from_pretrained(config.hub_model_id, subfolder="unet")
|
| 17 |
+
noise_scheduler = DDPMScheduler.from_pretrained(config.hub_model_id, subfolder="scheduler")
|
| 18 |
+
|
| 19 |
+
def generate_new_track(audio_paths):
|
| 20 |
+
|
| 21 |
+
for i, audio_path in enumerate(audio_paths):
|
| 22 |
+
get_slices(audio_path)
|
| 23 |
+
|
| 24 |
+
return
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Define the Gradio interface
|
| 28 |
+
interface = gr.Interface(
|
| 29 |
+
fn=generate_new_track,
|
| 30 |
+
inputs=gr.Files(file_count="multiple", label="Upload Your Audio Files"),
|
| 31 |
+
outputs=gr.Audio(type="filepath", label="Generated Track"),
|
| 32 |
+
title="AMUSE: Music Generation",
|
| 33 |
+
description="Upload audio files and generate new tracks based on them using AMUSE."
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# Launch the interface
|
| 37 |
+
interface.launch()
|
config.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
class Config:
|
| 3 |
+
def __init__(self):
|
| 4 |
+
self.image_size: int = 512 # the generated image resolution
|
| 5 |
+
self.sample_rate: int = 44100 # the sample rate of the audio
|
| 6 |
+
self.guidance_scale: float = 1 # the guidance scale for the diffusion process
|
| 7 |
+
self.mixed_precision: str = "fp16" # `no` for float32, `fp16` for automatic mixed precision
|
| 8 |
+
self.hub_model_id: str = "alppo/amuse" # the name of the repository to create on the HF Hub
|
| 9 |
+
self.hub_dataset_id: str = "alppo/music" # the name of the dataset to create on the HF Hub
|
| 10 |
+
self.seed: int = 0
|
| 11 |
+
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
config = Config()
|
evaluate_module.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# TODO: implement the evaluate function
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
diffusers
|
| 2 |
+
torch
|
| 3 |
+
librosa
|
| 4 |
+
soundfile
|
slicer_module.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import librosa
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
|
| 5 |
+
def get_slices(file_path, sample_rate=44100, slice_duration=10, output_dir='slices'):
|
| 6 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 7 |
+
|
| 8 |
+
audio, sr = librosa.load(file_path, sr=sample_rate)
|
| 9 |
+
|
| 10 |
+
slice_samples = slice_duration * sample_rate
|
| 11 |
+
|
| 12 |
+
num_slices = len(audio) // slice_samples
|
| 13 |
+
for i in range(num_slices):
|
| 14 |
+
start_sample = i * slice_samples
|
| 15 |
+
end_sample = start_sample + slice_samples
|
| 16 |
+
audio_slice = audio[start_sample:end_sample]
|
| 17 |
+
|
| 18 |
+
# save it into /slices
|
| 19 |
+
output_file = os.path.join(output_dir, f'slice_{i:04d}.wav')
|
| 20 |
+
sf.write(output_file, audio_slice, sample_rate)
|
| 21 |
+
|
| 22 |
+
# handle last slice
|
| 23 |
+
if len(audio) % slice_samples != 0:
|
| 24 |
+
start_sample = num_slices * slice_samples
|
| 25 |
+
audio_slice = audio[start_sample:]
|
| 26 |
+
output_file = os.path.join(output_dir, f'slice_{num_slices:04d}.wav')
|
| 27 |
+
sf.write(output_file, audio_slice, sample_rate)
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
|
| 30 |
+
get_slices('rock_song_009.wav')
|
vae_model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8e271f76eea5c196e3d2a3b69b3a605610ae53fab76f1b2d451d96d13dfcfcd
|
| 3 |
+
size 277897466
|
vae_model_state_dict.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:131c1e7d7707a044b41f794bd18297549553f29ccdf693f0a4eeaed177e28006
|
| 3 |
+
size 277883050
|
vae_module.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
|
| 5 |
+
class ResidualBlock(nn.Module):
|
| 6 |
+
def __init__(self, channels):
|
| 7 |
+
super(ResidualBlock, self).__init__()
|
| 8 |
+
self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
|
| 9 |
+
self.bn1 = nn.BatchNorm2d(channels)
|
| 10 |
+
self.relu = nn.ReLU(inplace=True)
|
| 11 |
+
self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
|
| 12 |
+
self.bn2 = nn.BatchNorm2d(channels)
|
| 13 |
+
|
| 14 |
+
def forward(self, x):
|
| 15 |
+
residual = x
|
| 16 |
+
out = self.conv1(x)
|
| 17 |
+
out = self.bn1(out)
|
| 18 |
+
out = self.relu(out)
|
| 19 |
+
out = self.conv2(out)
|
| 20 |
+
out = self.bn2(out)
|
| 21 |
+
out += residual
|
| 22 |
+
out = self.relu(out)
|
| 23 |
+
return out
|
| 24 |
+
|
| 25 |
+
class Encoder(nn.Module):
|
| 26 |
+
def __init__(self, input_channels=1, hidden_dims=[64, 128, 256, 512, 1024], latent_dim=32):
|
| 27 |
+
super(Encoder, self).__init__()
|
| 28 |
+
self.hidden_dims = hidden_dims
|
| 29 |
+
|
| 30 |
+
# Build Encoder with Residual Blocks
|
| 31 |
+
modules = []
|
| 32 |
+
for h_dim in hidden_dims:
|
| 33 |
+
modules.append(
|
| 34 |
+
nn.Sequential(
|
| 35 |
+
nn.Conv2d(input_channels, h_dim, kernel_size=3, stride=2, padding=1),
|
| 36 |
+
nn.BatchNorm2d(h_dim),
|
| 37 |
+
nn.LeakyReLU(),
|
| 38 |
+
ResidualBlock(h_dim) # Adding a residual block
|
| 39 |
+
)
|
| 40 |
+
)
|
| 41 |
+
input_channels = h_dim
|
| 42 |
+
|
| 43 |
+
self.encoder = nn.Sequential(*modules)
|
| 44 |
+
self.fc_mu = nn.Linear(hidden_dims[-1]*hidden_dims[-3], latent_dim)
|
| 45 |
+
self.fc_var = nn.Linear(hidden_dims[-1]*hidden_dims[-3], latent_dim)
|
| 46 |
+
|
| 47 |
+
def forward(self, x):
|
| 48 |
+
for layer in self.encoder:
|
| 49 |
+
x = layer(x)
|
| 50 |
+
x = torch.flatten(x, start_dim=1)
|
| 51 |
+
mu = self.fc_mu(x)
|
| 52 |
+
log_var = self.fc_var(x)
|
| 53 |
+
return mu, log_var
|
| 54 |
+
|
| 55 |
+
class Decoder(nn.Module):
|
| 56 |
+
def __init__(self, latent_dim=32, output_channels=1, hidden_dims=[64, 128, 256, 512, 1024]):
|
| 57 |
+
super(Decoder, self).__init__()
|
| 58 |
+
self.hidden_dims = hidden_dims
|
| 59 |
+
# Reversing the order for the decoder
|
| 60 |
+
hidden_dims = hidden_dims[::-1]
|
| 61 |
+
self.decoder_input = nn.Linear(latent_dim, hidden_dims[0]*hidden_dims[2])
|
| 62 |
+
|
| 63 |
+
# Build Decoder with Residual Blocks
|
| 64 |
+
modules = []
|
| 65 |
+
for i in range(len(hidden_dims) - 1):
|
| 66 |
+
modules.append(
|
| 67 |
+
nn.Sequential(
|
| 68 |
+
nn.ConvTranspose2d(hidden_dims[i], hidden_dims[i+1], kernel_size=3, stride=2, padding=1, output_padding=1),
|
| 69 |
+
nn.BatchNorm2d(hidden_dims[i+1]),
|
| 70 |
+
nn.LeakyReLU(),
|
| 71 |
+
ResidualBlock(hidden_dims[i+1]) # Adding a residual block
|
| 72 |
+
)
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
self.decoder = nn.Sequential(*modules)
|
| 76 |
+
self.final_layer = nn.Sequential(
|
| 77 |
+
nn.ConvTranspose2d(hidden_dims[-1], hidden_dims[-1], kernel_size=3, stride=2, padding=1, output_padding=1),
|
| 78 |
+
nn.BatchNorm2d(hidden_dims[-1]),
|
| 79 |
+
nn.LeakyReLU(),
|
| 80 |
+
nn.Conv2d(hidden_dims[-1], output_channels, kernel_size=3, padding=1),
|
| 81 |
+
nn.Sigmoid()
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
def forward(self, z):
|
| 85 |
+
z = self.decoder_input(z)
|
| 86 |
+
z = z.view(-1, 1024, 16, 16)
|
| 87 |
+
for layer in self.decoder:
|
| 88 |
+
z = layer(z)
|
| 89 |
+
result = self.final_layer(z)
|
| 90 |
+
return result
|
| 91 |
+
|
| 92 |
+
class VAE(nn.Module):
|
| 93 |
+
def __init__(self,
|
| 94 |
+
input_channels=1,
|
| 95 |
+
latent_dim=32,
|
| 96 |
+
hidden_dims=None):
|
| 97 |
+
super(VAE, self).__init__()
|
| 98 |
+
|
| 99 |
+
if hidden_dims is None:
|
| 100 |
+
hidden_dims = [64, 128, 256, 512, 1024]
|
| 101 |
+
|
| 102 |
+
self.encoder = Encoder(input_channels=input_channels,
|
| 103 |
+
hidden_dims=hidden_dims,
|
| 104 |
+
latent_dim=latent_dim)
|
| 105 |
+
|
| 106 |
+
self.decoder = Decoder(latent_dim=latent_dim,
|
| 107 |
+
output_channels=input_channels,
|
| 108 |
+
hidden_dims=hidden_dims)
|
| 109 |
+
|
| 110 |
+
def encode(self, input):
|
| 111 |
+
mu, log_var = self.encoder(input)
|
| 112 |
+
return mu, log_var
|
| 113 |
+
|
| 114 |
+
def reparameterize(self, mu, log_var):
|
| 115 |
+
std = torch.exp(0.5 * log_var)
|
| 116 |
+
eps = torch.randn_like(std)
|
| 117 |
+
return mu + eps * std
|
| 118 |
+
|
| 119 |
+
def decode(self, z):
|
| 120 |
+
return self.decoder(z)
|
| 121 |
+
|
| 122 |
+
def forward(self, input):
|
| 123 |
+
mu, log_var = self.encode(input)
|
| 124 |
+
z = self.reparameterize(mu, log_var)
|
| 125 |
+
return self.decode(z), mu, log_var
|
| 126 |
+
|
| 127 |
+
# Loss function for VAE
|
| 128 |
+
def loss_function(recon_x, x, mu, log_var):
|
| 129 |
+
BCE = F.binary_cross_entropy(recon_x, x, reduction='sum')
|
| 130 |
+
KLD = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
|
| 131 |
+
return BCE + KLD
|