alppo committed
Commit 6f849d1 · 1 Parent(s): 4fba802

add description

__pycache__/generator_module.cpython-312.pyc CHANGED
Binary files a/__pycache__/generator_module.cpython-312.pyc and b/__pycache__/generator_module.cpython-312.pyc differ
 
app.py CHANGED
@@ -11,7 +11,6 @@ from mel_module import Mel
 from generator_module import Generator
 import shutil
 
-
 slices_folder = 'slices'
 
 if os.path.exists(slices_folder): # delete previous tracks
@@ -25,30 +24,25 @@ vae.eval()
 model = UNet2DConditionModel.from_pretrained(config.hub_model_id, subfolder="unet")
 noise_scheduler = DDPMScheduler.from_pretrained(config.hub_model_id, subfolder="scheduler")
 
-
-
-def generate_new_track(audio_paths):
-
+def generate_new_track(audio_paths, progress=gr.Progress(track_tqdm=True)):
     for i, audio_path in enumerate(audio_paths):
-        print(audio_paths,audio_path)
+        print(audio_paths, audio_path)
         get_slices(audio_path)
 
     embedding = get_embedding()
-    print("sample latent",embedding.shape)
-    generator = Generator(config, model, noise_scheduler, vae, embedding)
+    print("sample latent", embedding.shape)
+
+    generator = Generator(config, model, noise_scheduler, vae, embedding, progress_callback=progress)
     generator.generate()
 
     return config.generated_track_path
 
-
-
-
 def get_embedding(): # returns middle point of given audio files latent representations
     latents = []
     slices_dir = 'slices'
-
+
     for slice_file in os.listdir(slices_dir):
-        if slice_file.endswith('.wav'): # make sure the file is audio
+        if slice_file.endswith('.wav'): # make sure the file is audio
             mel = Mel(os.path.join(slices_dir, slice_file))
             spectrogram = mel.get_spectrogram()
             tensor = torch.tensor(spectrogram).float().unsqueeze(0).unsqueeze(0)
@@ -59,24 +53,41 @@ def get_embedding(): # returns middle point of given audio files latent represen
         normalized_tensor = 2 * ((latent - min_val) / (max_val - min_val)) - 1
         latent = normalized_tensor.unsqueeze(0)
         latents.append(latent)
-
+
     if not latents:
         return None
-
+
     latents_tensor = torch.cat(latents, dim=0)
     mean_latent = latents_tensor.mean(dim=0, keepdim=True)
     return mean_latent
 
-
 
-# Define the Gradio interface
 interface = gr.Interface(
     fn=generate_new_track,
     inputs=gr.Files(file_count="multiple", label="Upload Your Audio Files"),
     outputs=gr.Audio(type="filepath", label="Generated Track"),
     title="AMUSE: Music Generation",
-    description="Upload audio files and generate new tracks based on them using AMUSE."
+    description = (
+        "<h3>Welcome to the AMUSE music generation app</h3>"
+        "<p>Here's how it works:</p>"
+        "<ol>"
+        "<li><strong>Upload Your Audio Files:</strong> Provide audio files from which the taste will be extracted, "
+        "and a new track will be generated accordingly. The audio files should be in .wav format!</li>"
+        "<li><strong>Process:</strong> The app slices the audio, extracts features, and generates a new track using a VAE and a diffusion model.</li>"
+        "<li><strong>Progress:</strong> The progress bar will show the generation process in real-time. Note that this takes a significant amount of time, "
+        "so you may leave the site in the free version and come back later to see the result.</li>"
+        "<li><strong>Download:</strong> Once the track is generated, you can download it directly.</li>"
+        "</ol>"
+        "<h4>Notes:</h4>"
+        "<ul>"
+        "<li>As mentioned earlier, it takes a significant amount of time to generate a new track in the free version of HF Spaces. "
+        "So, submit your tracks and forget about it for a little while :) Then come back to see the new track.</li>"
+        "<li>Ensure your audio files are clean and of good quality for the best results (sample rate: 44100 and .wav format).</li>"
+        "</ul>"
+    )
 )
+
+
+
 
-# Launch the interface
 interface.launch()
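For context on the progress wiring added above: Gradio injects a live `gr.Progress` tracker into any handler that declares one as a default argument, and `progress.tqdm(...)` mirrors each iteration of a wrapped iterable to the UI's progress bar. A minimal sketch of the same pattern, independent of this app (the `slow_job` function and its step count are made up for illustration):

```python
import time
import gradio as gr

def slow_job(files, progress=gr.Progress(track_tqdm=True)):
    # Gradio replaces the default argument with a live tracker at call time;
    # progress.tqdm wraps an iterable and reports each step to the UI.
    for _ in progress.tqdm(range(10), desc="Generating"):
        time.sleep(0.5)  # stand-in for one slow unit of work
    return "done"

demo = gr.Interface(fn=slow_job, inputs=gr.Files(), outputs="text")

if __name__ == "__main__":
    demo.launch()
```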
generator_module.py CHANGED
@@ -7,12 +7,13 @@ import soundfile as sf
 from mel_module import Mel
 
 class Generator:
-    def __init__(self, config, unet, scheduler, vae, embedding):
+    def __init__(self, config, unet, scheduler, vae, embedding, progress_callback=None):
         self.config = config
         self.unet = unet
         self.scheduler = scheduler
         self.vae = vae
         self.embedding = embedding
+        self.progress_callback = progress_callback
 
     def tensor_to_mel(self, tensor):
         denormalize = transforms.Normalize(
@@ -29,7 +30,7 @@ class Generator:
         mu, log_var = self.vae.encode(uncond_image)
         uncond_latent = torch.cat((mu, log_var), dim=1)
         uncond_latent = uncond_latent.unsqueeze(0)
-        print("uncond",uncond_latent.shape)
+        print("uncond", uncond_latent.shape)
 
         embeddings = torch.cat([uncond_latent, self.embedding])
 
@@ -41,24 +42,18 @@ class Generator:
             device=self.config.device,
         )
 
-        for t in tqdm(self.scheduler.timesteps):
+        total_steps = len(self.scheduler.timesteps)
+
+        for i, t in enumerate(self.progress_callback.tqdm(self.scheduler.timesteps)):
             image_model_input = torch.cat([noise] * 2)
-            # torch.Size([2, 1, 512, 512])
             image_model_input = self.scheduler.scale_model_input(image_model_input, timestep=t)
-            # torch.Size([2, 1, 512, 512])
 
             with torch.no_grad():
                 noise_pred = self.unet(image_model_input, t, encoder_hidden_states=embeddings).sample
             noise_pred_uncond, noise_pred_img = noise_pred.chunk(2)
-            # torch.Size([1, 1, 512, 512])
-            # torch.Size([1, 1, 512, 512])
             noise_pred = noise_pred_uncond + self.config.guidance_scale * (noise_pred_img - noise_pred_uncond)
-            # torch.Size([1, 1, 512, 512])
-            # compute the previous noisy sample x_t -> x_t-1
             noise = self.scheduler.step(noise_pred, t, noise).prev_sample
 
-        image_tensor = image.squeeze(1) # [1, 512, 512]
-        mel = tensor_to_mel(image_tensor)
+        image_tensor = noise.squeeze(1) # [1, 512, 512]
+        mel = self.tensor_to_mel(image_tensor)
         mel.save_audio()
-
-
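One caveat in the loop above: `progress_callback` defaults to `None`, yet `generate()` calls `self.progress_callback.tqdm(...)` unconditionally, so constructing a `Generator` without a callback would raise `AttributeError`. A defensive fallback (a sketch, not part of this commit; the helper name is hypothetical) could degrade to a plain console bar:

```python
from tqdm.auto import tqdm

def _iter_timesteps(timesteps, progress_callback=None):
    # Use the Gradio progress tracker when one was supplied;
    # otherwise fall back to an ordinary console tqdm bar.
    if progress_callback is not None:
        return progress_callback.tqdm(timesteps)
    return tqdm(timesteps)
```

The loop would then read `for i, t in enumerate(_iter_timesteps(self.scheduler.timesteps, self.progress_callback)):`, keeping the class usable both inside and outside the Gradio app.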