Spaces:

teticio
/

audio-diffusion

Runtime error

App Files Files Community

teticio committed on Nov 7, 2022

Commit

5bc60f9

1 Parent(s): c51d0e3

add sample_rate and n_fft params

Browse files

Files changed (4) hide show

README.md +3 -1
scripts/audio_to_images.py +3 -1
scripts/train_unconditional.py +5 -1
scripts/train_vae.py +15 -3

README.md CHANGED Viewed

@@ -71,7 +71,9 @@ python scripts/audio_to_images.py \
   --output_dir data/audio-diffusion-256 \
   --push_to_hub teticio/audio-diffusion-256
 ```
 ## Train model
 #### Run training on local machine.
 ```bash

   --output_dir data/audio-diffusion-256 \
   --push_to_hub teticio/audio-diffusion-256
 ```
+Note that the default `sample_rate` is 22050 and audio files will be resampled if they are at a different rate. If you change this value, you may find that the results in the `test_mel.ipynb` notebook are not good (for example, if `sample_rate` is 48000) and that it is necessary to adjust `n_fft` (for example, to 2000 instead of the default value of 2048; alternatively, you can resample to a `sample_rate` of 44100). Make sure you use the same parameters for training and inference. You should also bear in mind that not all resolutions work with the neural network architecture as currently configured — you should be safe if you stick to powers of 2.
 ## Train model
 #### Run training on local machine.
 ```bash

scripts/audio_to_images.py CHANGED Viewed

@@ -19,7 +19,8 @@ def main(args):
     mel = Mel(x_res=args.resolution[0],
               y_res=args.resolution[1],
               hop_length=args.hop_length,
-              sample_rate=args.sample_rate)
     os.makedirs(args.output_dir, exist_ok=True)
     audio_files = [
         os.path.join(root, file) for root, _, files in os.walk(args.input_dir)
@@ -86,6 +87,7 @@ if __name__ == "__main__":
     parser.add_argument("--hop_length", type=int, default=512)
     parser.add_argument("--push_to_hub", type=str, default=None)
     parser.add_argument("--sample_rate", type=int, default=22050)
     args = parser.parse_args()
     if args.input_dir is None:

     mel = Mel(x_res=args.resolution[0],
               y_res=args.resolution[1],
               hop_length=args.hop_length,
+              sample_rate=args.sample_rate,
+              n_fft=args.n_fft)
     os.makedirs(args.output_dir, exist_ok=True)
     audio_files = [
         os.path.join(root, file) for root, _, files in os.walk(args.input_dir)
     parser.add_argument("--hop_length", type=int, default=512)
     parser.add_argument("--push_to_hub", type=str, default=None)
     parser.add_argument("--sample_rate", type=int, default=22050)
+    parser.add_argument("--n_fft", type=int, default=2048)
     args = parser.parse_args()
     if args.input_dir is None:

scripts/train_unconditional.py CHANGED Viewed

@@ -173,7 +173,9 @@ def main(args):
     mel = Mel(x_res=resolution[1],
               y_res=resolution[0],
-              hop_length=args.hop_length)
     global_step = 0
     for epoch in range(args.num_epochs):
@@ -362,6 +364,8 @@ if __name__ == "__main__":
             "and an Nvidia Ampere GPU."),
     )
     parser.add_argument("--hop_length", type=int, default=512)
     parser.add_argument("--from_pretrained", type=str, default=None)
     parser.add_argument("--start_epoch", type=int, default=0)
     parser.add_argument("--num_train_steps", type=int, default=1000)

     mel = Mel(x_res=resolution[1],
               y_res=resolution[0],
+              hop_length=args.hop_length,
+              sample_rate=args.sample_rate,
+              n_fft=args.n_fft)
     global_step = 0
     for epoch in range(args.num_epochs):
             "and an Nvidia Ampere GPU."),
     )
     parser.add_argument("--hop_length", type=int, default=512)
+    parser.add_argument("--sample_rate", type=int, default=22050)
+    parser.add_argument("--n_fft", type=int, default=2048)
     parser.add_argument("--from_pretrained", type=str, default=None)
     parser.add_argument("--start_epoch", type=int, default=0)
     parser.add_argument("--num_train_steps", type=int, default=1000)

scripts/train_vae.py CHANGED Viewed

@@ -60,10 +60,16 @@ class AudioDiffusionDataModule(pl.LightningDataModule):
 class ImageLogger(Callback):
-    def __init__(self, every=1000, hop_length=512):
         super().__init__()
         self.every = every
         self.hop_length = hop_length
     @rank_zero_only
     def log_images_and_audios(self, pl_module, batch):
@@ -76,7 +82,9 @@ class ImageLogger(Callback):
         channels = image_shape[1]
         mel = Mel(x_res=image_shape[2],
                   y_res=image_shape[3],
-                  hop_length=self.hop_length)
         for k in images:
             images[k] = images[k].detach().cpu()
@@ -145,6 +153,8 @@ if __name__ == "__main__":
                         type=int,
                         default=1)
     parser.add_argument("--hop_length", type=int, default=512)
     parser.add_argument("--save_images_batches", type=int, default=1000)
     parser.add_argument("--max_epochs", type=int, default=100)
     args = parser.parse_args()
@@ -166,7 +176,9 @@ if __name__ == "__main__":
         resume_from_checkpoint=args.resume_from_checkpoint,
         callbacks=[
             ImageLogger(every=args.save_images_batches,
-                        hop_length=args.hop_length),
             HFModelCheckpoint(ldm_config=config,
                               hf_checkpoint=args.hf_checkpoint_dir,
                               dirpath=args.ldm_checkpoint_dir,

 class ImageLogger(Callback):
+    def __init__(self,
+                 every=1000,
+                 hop_length=512,
+                 sample_rate=22050,
+                 n_fft=2048):
         super().__init__()
         self.every = every
         self.hop_length = hop_length
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
     @rank_zero_only
     def log_images_and_audios(self, pl_module, batch):
         channels = image_shape[1]
         mel = Mel(x_res=image_shape[2],
                   y_res=image_shape[3],
+                  hop_length=self.hop_length,
+                  sample_rate=self.sample_rate,
+                  n_fft=self.n_fft)
         for k in images:
             images[k] = images[k].detach().cpu()
                         type=int,
                         default=1)
     parser.add_argument("--hop_length", type=int, default=512)
+    parser.add_argument("--sample_rate", type=int, default=22050)
+    parser.add_argument("--n_fft", type=int, default=2048)
     parser.add_argument("--save_images_batches", type=int, default=1000)
     parser.add_argument("--max_epochs", type=int, default=100)
     args = parser.parse_args()
         resume_from_checkpoint=args.resume_from_checkpoint,
         callbacks=[
             ImageLogger(every=args.save_images_batches,
+                        hop_length=args.hop_length,
+                        sample_rate=args.sample_rate,
+                        n_fft=args.n_fft),
             HFModelCheckpoint(ldm_config=config,
                               hf_checkpoint=args.hf_checkpoint_dir,
                               dirpath=args.ldm_checkpoint_dir,