add description
- __pycache__/generator_module.cpython-312.pyc +0 -0
- app.py +30 -19
- generator_module.py +8 -13
__pycache__/generator_module.cpython-312.pyc
CHANGED
Binary files a/__pycache__/generator_module.cpython-312.pyc and b/__pycache__/generator_module.cpython-312.pyc differ
app.py
CHANGED
@@ -11,7 +11,6 @@ from mel_module import Mel
 from generator_module import Generator
 import shutil
 
-
 slices_folder = 'slices'
 
 if os.path.exists(slices_folder): # delete previous tracks
@@ -25,30 +24,25 @@ vae.eval()
 model = UNet2DConditionModel.from_pretrained(config.hub_model_id, subfolder="unet")
 noise_scheduler = DDPMScheduler.from_pretrained(config.hub_model_id, subfolder="scheduler")
 
-
-
-def generate_new_track(audio_paths):
-
+def generate_new_track(audio_paths, progress=gr.Progress(track_tqdm=True)):
     for i, audio_path in enumerate(audio_paths):
-        print(audio_paths,audio_path)
+        print(audio_paths, audio_path)
         get_slices(audio_path)
 
     embedding = get_embedding()
-    print("sample latent",embedding.shape)
-    generator = Generator(config, model, noise_scheduler, vae, embedding)
+    print("sample latent", embedding.shape)
+
+    generator = Generator(config, model, noise_scheduler, vae, embedding, progress_callback=progress)
     generator.generate()
 
     return config.generated_track_path
 
-
-
-
 def get_embedding(): # returns the middle point of the given audio files' latent representations
     latents = []
     slices_dir = 'slices'
-
+
     for slice_file in os.listdir(slices_dir):
-        if slice_file.endswith('.wav'): # make sure the file is audio
+        if slice_file.endswith('.wav'): # make sure the file is audio
             mel = Mel(os.path.join(slices_dir, slice_file))
             spectrogram = mel.get_spectrogram()
             tensor = torch.tensor(spectrogram).float().unsqueeze(0).unsqueeze(0)
@@ -59,24 +53,41 @@ def get_embedding(): # returns the middle point of the given audio files' latent representations
         normalized_tensor = 2 * ((latent - min_val) / (max_val - min_val)) - 1
         latent = normalized_tensor.unsqueeze(0)
         latents.append(latent)
-
+
     if not latents:
         return None
-
+
     latents_tensor = torch.cat(latents, dim=0)
     mean_latent = latents_tensor.mean(dim=0, keepdim=True)
     return mean_latent
 
-
 
-# Define the Gradio interface
 interface = gr.Interface(
     fn=generate_new_track,
     inputs=gr.Files(file_count="multiple", label="Upload Your Audio Files"),
     outputs=gr.Audio(type="filepath", label="Generated Track"),
     title="AMUSE: Music Generation",
-
+    description=(
+        "<h3>Welcome to the AMUSE music generation app</h3>"
+        "<p>Here's how it works:</p>"
+        "<ol>"
+        "<li><strong>Upload Your Audio Files:</strong> Provide audio files from which your musical taste will be extracted, "
+        "and a new track will be generated accordingly. The audio files must be in .wav format!</li>"
+        "<li><strong>Process:</strong> The app slices the audio, extracts features, and generates a new track using a VAE and a diffusion model.</li>"
+        "<li><strong>Progress:</strong> The progress bar shows the generation process in real time. Generation takes a significant amount of time, "
+        "so on the free tier you can leave the site and come back later to see the result.</li>"
+        "<li><strong>Download:</strong> Once the track is generated, you can download it directly.</li>"
+        "</ol>"
+        "<h4>Notes:</h4>"
+        "<ul>"
+        "<li>As noted above, generating a new track takes a significant amount of time on the free tier of HF Spaces. "
+        "So, submit your tracks and forget about them for a little while :) Then come back to hear the new track.</li>"
+        "<li>For the best results, make sure your audio files are clean and of good quality (sample rate 44100 Hz, .wav format).</li>"
+        "</ul>"
 )
+)
+
+
+
 
-# Launch the interface
 interface.launch()
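The `progress=gr.Progress(track_tqdm=True)` default added to `generate_new_track` is the standard Gradio pattern for surfacing long-running work: with `track_tqdm=True`, any `tqdm` loop executed inside the handler is mirrored onto the UI's progress bar. A minimal self-contained sketch of the same pattern (the handler name and step count are illustrative, not part of this repo):

```python
import time

import gradio as gr
from tqdm import tqdm

def slow_task(n_steps, progress=gr.Progress(track_tqdm=True)):
    # track_tqdm=True mirrors this console tqdm loop onto the web UI bar.
    for _ in tqdm(range(int(n_steps)), desc="denoising"):
        time.sleep(0.1)  # stand-in for one diffusion step
    return f"done after {int(n_steps)} steps"

demo = gr.Interface(fn=slow_task, inputs=gr.Number(value=20), outputs="text")
demo.launch()
```

The same tracker object can also be handed further down the stack, which is what this commit does by passing it to `Generator` as `progress_callback`.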
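For reference, the unchanged core of `get_embedding` computes the "middle point" as the element-wise mean of the stacked per-slice latents. A toy version with random stand-ins (the latent shape here is an assumption, not taken from the repo):

```python
import torch

# One normalized latent per .wav slice; [1, 2, 64, 64] is an assumed shape.
latents = [torch.randn(1, 2, 64, 64) for _ in range(5)]

latents_tensor = torch.cat(latents, dim=0)              # [5, 2, 64, 64]
mean_latent = latents_tensor.mean(dim=0, keepdim=True)  # [1, 2, 64, 64]
print(mean_latent.shape)  # torch.Size([1, 2, 64, 64])
```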
generator_module.py
CHANGED
@@ -7,12 +7,13 @@ import soundfile as sf
 from mel_module import Mel
 
 class Generator:
-    def __init__(self, config, unet, scheduler, vae, embedding):
+    def __init__(self, config, unet, scheduler, vae, embedding, progress_callback=None):
         self.config = config
         self.unet = unet
         self.scheduler = scheduler
         self.vae = vae
         self.embedding = embedding
+        self.progress_callback = progress_callback
 
     def tensor_to_mel(self, tensor):
         denormalize = transforms.Normalize(
@@ -29,7 +30,7 @@ class Generator:
         mu, log_var = self.vae.encode(uncond_image)
         uncond_latent = torch.cat((mu, log_var), dim=1)
         uncond_latent = uncond_latent.unsqueeze(0)
-        print("uncond",uncond_latent.shape)
+        print("uncond", uncond_latent.shape)
 
         embeddings = torch.cat([uncond_latent, self.embedding])
 
@@ -41,24 +42,18 @@
             device=self.config.device,
         )
 
-        for t in self.scheduler.timesteps:
+        total_steps = len(self.scheduler.timesteps)
+
+        for i, t in enumerate(self.progress_callback.tqdm(self.scheduler.timesteps)):
             image_model_input = torch.cat([noise] * 2)
-            # torch.Size([2, 1, 512, 512])
             image_model_input = self.scheduler.scale_model_input(image_model_input, timestep=t)
-            # torch.Size([2, 1, 512, 512])
 
             with torch.no_grad():
                 noise_pred = self.unet(image_model_input, t, encoder_hidden_states=embeddings).sample
             noise_pred_uncond, noise_pred_img = noise_pred.chunk(2)
-            # torch.Size([1, 1, 512, 512])
-            # torch.Size([1, 1, 512, 512])
             noise_pred = noise_pred_uncond + self.config.guidance_scale * (noise_pred_img - noise_pred_uncond)
-            # torch.Size([1, 1, 512, 512])
-            # compute the previous noisy sample x_t -> x_t-1
             noise = self.scheduler.step(noise_pred, t, noise).prev_sample
 
-        image_tensor =
-        mel = tensor_to_mel(image_tensor)
+        image_tensor = noise.squeeze(1)  # [1, 512, 512]
+        mel = self.tensor_to_mel(image_tensor)
         mel.save_audio()
-
-
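One caveat with the new signature: `progress_callback` defaults to `None`, yet `generate` calls `self.progress_callback.tqdm(...)` unconditionally, so constructing `Generator` without a callback would raise `AttributeError`. The app itself always passes one, but a defensive helper (a sketch only, not part of this commit) could fall back to a console bar:

```python
from tqdm.auto import tqdm

def progress_iter(timesteps, progress_callback=None):
    """Iterate timesteps via the Gradio tracker when given, else plain tqdm."""
    if progress_callback is not None:
        return progress_callback.tqdm(timesteps)  # gr.Progress exposes .tqdm()
    return tqdm(timesteps, desc="denoising")
```

Inside `generate`, the loop header would then read `for i, t in enumerate(progress_iter(self.scheduler.timesteps, self.progress_callback)):`.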
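The unchanged guidance line in the loop is standard classifier-free guidance: the batched UNet call (`torch.cat([noise] * 2)`) predicts unconditional and conditional noise in one pass, and the final prediction extrapolates from the unconditional estimate toward the conditional one. A toy shape check (the guidance scale and tensor sizes are illustrative; the real value lives in `config.guidance_scale`):

```python
import torch

guidance_scale = 7.5  # illustrative; the app reads config.guidance_scale
noise_pred = torch.randn(2, 1, 512, 512)  # stacked [uncond, cond] predictions

noise_pred_uncond, noise_pred_img = noise_pred.chunk(2)  # two [1, 1, 512, 512] halves
guided = noise_pred_uncond + guidance_scale * (noise_pred_img - noise_pred_uncond)
print(guided.shape)  # torch.Size([1, 1, 512, 512])
```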