alppo committed on
Commit
0cc41af
·
1 Parent(s): a3e78b9

add vae and slicer modules

Browse files

create evaluate module as template
update app file to slice given songs

__pycache__/config.cpython-312.pyc ADDED
Binary file (943 Bytes). View file
 
__pycache__/slicer_module.cpython-312.pyc ADDED
Binary file (1.44 kB). View file
 
__pycache__/vae_module.cpython-312.pyc ADDED
Binary file (7.56 kB). View file
 
app.py CHANGED
@@ -1,7 +1,37 @@
 
 
 
1
  import gradio as gr
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import torch
import gradio as gr
from vae_module import VAE, Encoder, Decoder, loss_function
from config import config
from slicer_module import get_slices
from diffusers import UNet2DConditionModel, DDPMScheduler


# Load the VAE weights on CPU first, then move to the configured device.
vae = VAE()
vae.load_state_dict(torch.load('vae_model_state_dict.pth', map_location=torch.device('cpu')))
vae.to(config.device)
vae.eval()

# Diffusion backbone and noise scheduler pulled from the HF Hub repository.
model = UNet2DConditionModel.from_pretrained(config.hub_model_id, subfolder="unet")
noise_scheduler = DDPMScheduler.from_pretrained(config.hub_model_id, subfolder="scheduler")


def generate_new_track(audio_paths):
    """Slice each uploaded audio file into fixed-length chunks.

    NOTE(review): placeholder implementation — it only slices the inputs and
    returns None, while the Gradio output component expects an audio filepath.
    Actual track generation (VAE + diffusion) is still TODO.
    """
    for audio_path in audio_paths:  # index from enumerate was unused; iterate directly
        get_slices(audio_path)

    return None


# Define the Gradio interface
interface = gr.Interface(
    fn=generate_new_track,
    inputs=gr.Files(file_count="multiple", label="Upload Your Audio Files"),
    outputs=gr.Audio(type="filepath", label="Generated Track"),
    title="AMUSE: Music Generation",
    description="Upload audio files and generate new tracks based on them using AMUSE."
)

# Launch the interface
interface.launch()
config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch


class Config:
    """Central configuration container for the AMUSE app.

    Instantiated once at module import time as the shared `config` object.
    """

    def __init__(self):
        self.image_size: int = 512  # the generated image resolution
        self.sample_rate: int = 44100  # the sample rate of the audio
        # Fix: value was the int `1` despite the `float` annotation.
        self.guidance_scale: float = 1.0  # the guidance scale for the diffusion process
        self.mixed_precision: str = "fp16"  # `no` for float32, `fp16` for automatic mixed precision
        self.hub_model_id: str = "alppo/amuse"  # the name of the repository to create on the HF Hub
        self.hub_dataset_id: str = "alppo/music"  # the name of the dataset to create on the HF Hub
        self.seed: int = 0
        # Prefer GPU when available; everything downstream reads this field.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"


config = Config()
evaluate_module.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # TODO: implement the evaluate function
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ diffusers
2
+ torch
3
+ librosa
4
+ soundfile
slicer_module.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import librosa
import soundfile as sf

def get_slices(file_path, sample_rate=44100, slice_duration=10, output_dir='slices'):
    """Split an audio file into consecutive `slice_duration`-second WAV slices.

    The file is loaded (and resampled) at `sample_rate`, cut into full-length
    slices written to `output_dir` as `slice_0000.wav`, `slice_0001.wav`, ...,
    and any shorter trailing remainder is written as one final slice.
    """
    os.makedirs(output_dir, exist_ok=True)

    audio, sr = librosa.load(file_path, sr=sample_rate)

    # Fix: cast to int so fractional durations (e.g. 2.5 s) don't produce a
    # float slice length, which would raise when used as a slice index.
    slice_samples = int(slice_duration * sample_rate)
    if slice_samples <= 0:
        raise ValueError("slice_duration * sample_rate must be at least 1 sample")

    num_slices = len(audio) // slice_samples
    for i in range(num_slices):
        start_sample = i * slice_samples
        end_sample = start_sample + slice_samples
        audio_slice = audio[start_sample:end_sample]

        # save it into /slices
        output_file = os.path.join(output_dir, f'slice_{i:04d}.wav')
        sf.write(output_file, audio_slice, sample_rate)

    # handle last slice (trailing remainder shorter than slice_duration)
    if len(audio) % slice_samples != 0:
        start_sample = num_slices * slice_samples
        audio_slice = audio[start_sample:]
        output_file = os.path.join(output_dir, f'slice_{num_slices:04d}.wav')
        sf.write(output_file, audio_slice, sample_rate)

if __name__ == "__main__":
    get_slices('rock_song_009.wav')
vae_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8e271f76eea5c196e3d2a3b69b3a605610ae53fab76f1b2d451d96d13dfcfcd
3
+ size 277897466
vae_model_state_dict.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:131c1e7d7707a044b41f794bd18297549553f29ccdf693f0a4eeaed177e28006
3
+ size 277883050
vae_module.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class ResidualBlock(nn.Module):
    """Identity-skip block: conv -> BN -> ReLU -> conv -> BN, add input, ReLU.

    Channel count is unchanged (3x3 convs with padding 1), so the skip
    connection is a plain addition.
    """

    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        # Main branch as one chained expression, then the residual add.
        out = self.bn2(self.conv2(self.relu(self.bn1(self.conv1(x)))))
        out += x
        return self.relu(out)
24
+
25
class Encoder(nn.Module):
    """Convolutional encoder producing the latent Gaussian parameters.

    Each stage halves the spatial resolution (stride-2 conv) and is refined
    by a ResidualBlock; the flattened features feed two linear heads that
    return (mu, log_var).
    """

    def __init__(self, input_channels=1, hidden_dims=[64, 128, 256, 512, 1024], latent_dim=32):
        super(Encoder, self).__init__()
        self.hidden_dims = hidden_dims

        # Build the downsampling stages with residual refinement.
        stages = []
        in_ch = input_channels
        for out_ch in hidden_dims:
            stages.append(
                nn.Sequential(
                    nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=2, padding=1),
                    nn.BatchNorm2d(out_ch),
                    nn.LeakyReLU(),
                    ResidualBlock(out_ch),
                )
            )
            in_ch = out_ch

        self.encoder = nn.Sequential(*stages)
        # NOTE(review): hidden_dims[-1]*hidden_dims[-3] equals the flattened
        # feature size (1024*16*16) only for the default dims with a 512x512
        # input — this layer assumes that input size; confirm before reusing.
        flat_features = hidden_dims[-1] * hidden_dims[-3]
        self.fc_mu = nn.Linear(flat_features, latent_dim)
        self.fc_var = nn.Linear(flat_features, latent_dim)

    def forward(self, x):
        # nn.Sequential applies the stages in order, same as an explicit loop.
        features = self.encoder(x)
        features = torch.flatten(features, start_dim=1)
        return self.fc_mu(features), self.fc_var(features)
54
+
55
class Decoder(nn.Module):
    """Mirror of the Encoder: expand a latent vector back to an image.

    A linear layer expands the latent, it is reshaped to a small feature map,
    transposed-conv stages upsample it, and a final Sigmoid maps the output
    into [0, 1].
    """

    def __init__(self, latent_dim=32, output_channels=1, hidden_dims=[64, 128, 256, 512, 1024]):
        super(Decoder, self).__init__()
        self.hidden_dims = hidden_dims
        # Reversing the order for the decoder
        hidden_dims = hidden_dims[::-1]
        self.decoder_input = nn.Linear(latent_dim, hidden_dims[0]*hidden_dims[2])

        # Fix: forward() previously hard-coded z.view(-1, 1024, 16, 16), which
        # only matches the default hidden_dims (1024*256 == 1024*16*16) and
        # silently breaks any other configuration. Derive the unflatten shape:
        # channels = reversed hidden_dims[0], spatial side = sqrt(hidden_dims[2]).
        self._unflatten_channels = hidden_dims[0]
        self._unflatten_size = int(round(hidden_dims[2] ** 0.5))

        # Build Decoder with Residual Blocks
        modules = []
        for i in range(len(hidden_dims) - 1):
            modules.append(
                nn.Sequential(
                    nn.ConvTranspose2d(hidden_dims[i], hidden_dims[i+1], kernel_size=3, stride=2, padding=1, output_padding=1),
                    nn.BatchNorm2d(hidden_dims[i+1]),
                    nn.LeakyReLU(),
                    ResidualBlock(hidden_dims[i+1])  # residual refinement after upsampling
                )
            )

        self.decoder = nn.Sequential(*modules)
        # Extra upsampling stage, then a 3x3 conv + Sigmoid to the output channels.
        self.final_layer = nn.Sequential(
            nn.ConvTranspose2d(hidden_dims[-1], hidden_dims[-1], kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(hidden_dims[-1]),
            nn.LeakyReLU(),
            nn.Conv2d(hidden_dims[-1], output_channels, kernel_size=3, padding=1),
            nn.Sigmoid()
        )

    def forward(self, z):
        z = self.decoder_input(z)
        # (1024, 16, 16) for the default hidden_dims — now derived, not hard-coded.
        z = z.view(-1, self._unflatten_channels, self._unflatten_size, self._unflatten_size)
        for layer in self.decoder:
            z = layer(z)
        result = self.final_layer(z)
        return result
91
+
92
class VAE(nn.Module):
    """Variational autoencoder pairing the Encoder and Decoder defined above.

    forward() returns (reconstruction, mu, log_var) so callers can compute
    the ELBO via loss_function.
    """

    def __init__(self,
                 input_channels=1,
                 latent_dim=32,
                 hidden_dims=None):
        super(VAE, self).__init__()

        # Default channel progression when none is supplied.
        dims = [64, 128, 256, 512, 1024] if hidden_dims is None else hidden_dims

        self.encoder = Encoder(input_channels=input_channels,
                               hidden_dims=dims,
                               latent_dim=latent_dim)

        self.decoder = Decoder(latent_dim=latent_dim,
                               output_channels=input_channels,
                               hidden_dims=dims)

    def encode(self, input):
        # Encoder already returns the (mu, log_var) pair.
        return self.encoder(input)

    def reparameterize(self, mu, log_var):
        # z = mu + sigma * eps with sigma = exp(log_var / 2).
        std = (0.5 * log_var).exp()
        return mu + torch.randn_like(std) * std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, input):
        mu, log_var = self.encode(input)
        z = self.reparameterize(mu, log_var)
        return self.decode(z), mu, log_var
126
+
127
# Loss function for VAE
def loss_function(recon_x, x, mu, log_var):
    """Summed BCE reconstruction term plus the analytic Gaussian KL term."""
    recon_term = F.binary_cross_entropy(recon_x, x, reduction='sum')
    # KL(q || N(0, I)) = 0.5 * sum(mu^2 + exp(log_var) - log_var - 1)
    kl_term = 0.5 * torch.sum(mu.pow(2) + log_var.exp() - log_var - 1)
    return recon_term + kl_term