add description
- __pycache__/generator_module.cpython-312.pyc +0 -0
- app.py +30 -19
- generator_module.py +8 -13
__pycache__/generator_module.cpython-312.pyc
CHANGED
Binary files a/__pycache__/generator_module.cpython-312.pyc and b/__pycache__/generator_module.cpython-312.pyc differ
app.py
CHANGED
@@ -11,7 +11,6 @@ from mel_module import Mel
 from generator_module import Generator
 import shutil
 
-
 slices_folder = 'slices'
 
 if os.path.exists(slices_folder): # delete previous tracks
@@ -25,30 +24,25 @@ vae.eval()
 model = UNet2DConditionModel.from_pretrained(config.hub_model_id, subfolder="unet")
 noise_scheduler = DDPMScheduler.from_pretrained(config.hub_model_id, subfolder="scheduler")
 
-
-
-def generate_new_track(audio_paths):
-
+def generate_new_track(audio_paths, progress=gr.Progress(track_tqdm=True)):
     for i, audio_path in enumerate(audio_paths):
-        print(audio_paths,audio_path)
+        print(audio_paths, audio_path)
         get_slices(audio_path)
 
     embedding = get_embedding()
-    print("sample latent",embedding.shape)
-    generator = Generator(config, model, noise_scheduler, vae, embedding)
+    print("sample latent", embedding.shape)
+
+    generator = Generator(config, model, noise_scheduler, vae, embedding, progress_callback=progress)
     generator.generate()
 
     return config.generated_track_path
 
-
-
-
 def get_embedding(): # returns the middle point of the given audio files' latent representations
     latents = []
     slices_dir = 'slices'
-
+
     for slice_file in os.listdir(slices_dir):
-        if slice_file.endswith('.wav'): # make sure the file is audio
+        if slice_file.endswith('.wav'): # make sure the file is audio
             mel = Mel(os.path.join(slices_dir, slice_file))
             spectrogram = mel.get_spectrogram()
             tensor = torch.tensor(spectrogram).float().unsqueeze(0).unsqueeze(0)
@@ -59,24 +53,41 @@ def get_embedding(): # returns the middle point of the given audio files' latent representations
         normalized_tensor = 2 * ((latent - min_val) / (max_val - min_val)) - 1
         latent = normalized_tensor.unsqueeze(0)
         latents.append(latent)
-
+
     if not latents:
         return None
-
+
     latents_tensor = torch.cat(latents, dim=0)
     mean_latent = latents_tensor.mean(dim=0, keepdim=True)
     return mean_latent
 
-
 
-# Define the Gradio interface
 interface = gr.Interface(
     fn=generate_new_track,
     inputs=gr.Files(file_count="multiple", label="Upload Your Audio Files"),
     outputs=gr.Audio(type="filepath", label="Generated Track"),
     title="AMUSE: Music Generation",
-
+    description=(
+        "<h3>Welcome to the AMUSE music generation app</h3>"
+        "<p>Here's how it works:</p>"
+        "<ol>"
+        "<li><strong>Upload Your Audio Files:</strong> Provide audio files from which your musical taste will be extracted, "
+        "and a new track will be generated accordingly. The audio files must be in .wav format!</li>"
+        "<li><strong>Process:</strong> The app slices the audio, extracts features, and generates a new track using a VAE and a diffusion model.</li>"
+        "<li><strong>Progress:</strong> The progress bar shows the generation process in real time. Generation takes a significant amount of time, "
+        "so on the free tier you can leave the site and come back later to see the result.</li>"
+        "<li><strong>Download:</strong> Once the track is generated, you can download it directly.</li>"
+        "</ol>"
+        "<h4>Notes:</h4>"
+        "<ul>"
+        "<li>As noted above, generating a new track takes a significant amount of time on the free tier of HF Spaces. "
+        "So, submit your tracks and forget about them for a little while :) Then come back to hear the new track.</li>"
+        "<li>For the best results, make sure your audio files are clean and of good quality (sample rate 44100 Hz, .wav format).</li>"
+        "</ul>"
 )
+)
+
+
+
 
-# Launch the interface
 interface.launch()
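The `progress=gr.Progress(track_tqdm=True)` default added to `generate_new_track` is the standard Gradio pattern for surfacing long-running work: with `track_tqdm=True`, any `tqdm` loop executed inside the handler is mirrored onto the UI's progress bar. A minimal self-contained sketch of the same pattern (the handler name and step count are illustrative, not part of this repo):

```python
import time

import gradio as gr
from tqdm import tqdm

def slow_task(n_steps, progress=gr.Progress(track_tqdm=True)):
    # track_tqdm=True mirrors this console tqdm loop onto the web UI bar.
    for _ in tqdm(range(int(n_steps)), desc="denoising"):
        time.sleep(0.1)  # stand-in for one diffusion step
    return f"done after {int(n_steps)} steps"

demo = gr.Interface(fn=slow_task, inputs=gr.Number(value=20), outputs="text")
demo.launch()
```

The same tracker object can also be handed further down the stack, which is what this commit does by passing it to `Generator` as `progress_callback`.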
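For reference, the unchanged core of `get_embedding` computes the "middle point" as the element-wise mean of the stacked per-slice latents. A toy version with random stand-ins (the latent shape here is an assumption, not taken from the repo):

```python
import torch

# One normalized latent per .wav slice; [1, 2, 64, 64] is an assumed shape.
latents = [torch.randn(1, 2, 64, 64) for _ in range(5)]

latents_tensor = torch.cat(latents, dim=0)              # [5, 2, 64, 64]
mean_latent = latents_tensor.mean(dim=0, keepdim=True)  # [1, 2, 64, 64]
print(mean_latent.shape)  # torch.Size([1, 2, 64, 64])
```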
generator_module.py
CHANGED
@@ -7,12 +7,13 @@ import soundfile as sf
 from mel_module import Mel
 
 class Generator:
-    def __init__(self, config, unet, scheduler, vae, embedding):
+    def __init__(self, config, unet, scheduler, vae, embedding, progress_callback=None):
         self.config = config
         self.unet = unet
         self.scheduler = scheduler
         self.vae = vae
         self.embedding = embedding
+        self.progress_callback = progress_callback
 
     def tensor_to_mel(self, tensor):
         denormalize = transforms.Normalize(
@@ -29,7 +30,7 @@ class Generator:
         mu, log_var = self.vae.encode(uncond_image)
         uncond_latent = torch.cat((mu, log_var), dim=1)
         uncond_latent = uncond_latent.unsqueeze(0)
-        print("uncond",uncond_latent.shape)
+        print("uncond", uncond_latent.shape)
 
         embeddings = torch.cat([uncond_latent, self.embedding])
 
@@ -41,24 +42,18 @@
             device=self.config.device,
         )
 
-        for t in self.scheduler.timesteps:
+        total_steps = len(self.scheduler.timesteps)
+
+        for i, t in enumerate(self.progress_callback.tqdm(self.scheduler.timesteps)):
             image_model_input = torch.cat([noise] * 2)
-            # torch.Size([2, 1, 512, 512])
             image_model_input = self.scheduler.scale_model_input(image_model_input, timestep=t)
-            # torch.Size([2, 1, 512, 512])
 
             with torch.no_grad():
                 noise_pred = self.unet(image_model_input, t, encoder_hidden_states=embeddings).sample
             noise_pred_uncond, noise_pred_img = noise_pred.chunk(2)
-            # torch.Size([1, 1, 512, 512])
-            # torch.Size([1, 1, 512, 512])
             noise_pred = noise_pred_uncond + self.config.guidance_scale * (noise_pred_img - noise_pred_uncond)
-            # torch.Size([1, 1, 512, 512])
-            # compute the previous noisy sample x_t -> x_t-1
             noise = self.scheduler.step(noise_pred, t, noise).prev_sample
 
-        image_tensor =
-        mel = tensor_to_mel(image_tensor)
+        image_tensor = noise.squeeze(1)  # [1, 512, 512]
+        mel = self.tensor_to_mel(image_tensor)
         mel.save_audio()
-
-
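One caveat with the new signature: `progress_callback` defaults to `None`, yet `generate` calls `self.progress_callback.tqdm(...)` unconditionally, so constructing `Generator` without a callback would raise `AttributeError`. The app itself always passes one, but a defensive helper (a sketch only, not part of this commit) could fall back to a console bar:

```python
from tqdm.auto import tqdm

def progress_iter(timesteps, progress_callback=None):
    """Iterate timesteps via the Gradio tracker when given, else plain tqdm."""
    if progress_callback is not None:
        return progress_callback.tqdm(timesteps)  # gr.Progress exposes .tqdm()
    return tqdm(timesteps, desc="denoising")
```

Inside `generate`, the loop header would then read `for i, t in enumerate(progress_iter(self.scheduler.timesteps, self.progress_callback)):`.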
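The unchanged guidance line in the loop is standard classifier-free guidance: the batched UNet call (`torch.cat([noise] * 2)`) predicts unconditional and conditional noise in one pass, and the final prediction extrapolates from the unconditional estimate toward the conditional one. A toy shape check (the guidance scale and tensor sizes are illustrative; the real value lives in `config.guidance_scale`):

```python
import torch

guidance_scale = 7.5  # illustrative; the app reads config.guidance_scale
noise_pred = torch.randn(2, 1, 512, 512)  # stacked [uncond, cond] predictions

noise_pred_uncond, noise_pred_img = noise_pred.chunk(2)  # two [1, 1, 512, 512] halves
guided = noise_pred_uncond + guidance_scale * (noise_pred_img - noise_pred_uncond)
print(guided.shape)  # torch.Size([1, 1, 512, 512])
```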