Spaces:

teticio
/

audio-diffusion

Runtime error

App Files Community

teticio committed on Oct 14, 2022

Commit

8f292f9

1 Parent(s): 9a9737e

update train_unconditional for latent diffusion

Browse files

Files changed (3) hide show

README.md +2 -2
scripts/train_unconditional.py +16 -19
scripts/train_vae.py +0 -2

README.md CHANGED Viewed

@@ -89,7 +89,7 @@ accelerate launch --config_file config/accelerate_local.yaml \
   scripts/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
-  --output_dir latent-audio-diffusion-256 \
   --num_epochs 100 \
   --train_batch_size 2 \
   --eval_batch_size 2 \
@@ -98,7 +98,7 @@ accelerate launch --config_file config/accelerate_local.yaml \
   --lr_warmup_steps 500 \
   --mixed_precision no \
   --push_to_hub True \
-  --hub_model_id latent-audio-diffusion-256 \
   --hub_token $(cat $HOME/.huggingface/token)
 ```
 #### Run training on SageMaker.

   scripts/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
+  --output_dir audio-diffusion-256 \
   --num_epochs 100 \
   --train_batch_size 2 \
   --eval_batch_size 2 \
   --lr_warmup_steps 500 \
   --mixed_precision no \
   --push_to_hub True \
+  --hub_model_id audio-diffusion-256 \
   --hub_token $(cat $HOME/.huggingface/token)
 ```
 #### Run training on SageMaker.

scripts/train_unconditional.py CHANGED Viewed

@@ -48,8 +48,9 @@ def main(args):
         model = DDPMPipeline.from_pretrained(args.from_pretrained).unet
     else:
         model = UNet2DModel(
-            in_channels=1,
-            out_channels=1,
             layers_per_block=2,
             block_out_channels=(128, 128, 256, 256, 512, 512),
             down_block_types=(
@@ -114,7 +115,7 @@ def main(args):
     def transforms(examples):
         if args.vae is not None:
             images = [
-                augmentations(image).convert("RGB")
                 for image in examples["image"]
             ]
         else:
@@ -173,6 +174,13 @@ def main(args):
         model.train()
         for step, batch in enumerate(train_dataloader):
             clean_images = batch["input"]
             # Sample noise that we'll add to the images
             noise = torch.randn(clean_images.shape).to(clean_images.device)
             bsz = clean_images.shape[0]
@@ -184,11 +192,6 @@ def main(args):
                 device=clean_images.device,
             ).long()
-            if args.vae is not None:
-                with torch.no_grad():
-                    clean_images = vqvae.encode(
-                        clean_images).latent_dist.sample()
             # Add noise to the clean images according to the noise magnitude at each timestep
             # (this is the forward diffusion process)
             noisy_images = noise_scheduler.add_noise(clean_images, noise,
@@ -196,8 +199,7 @@ def main(args):
             with accelerator.accumulate(model):
                 # Predict the noise residual
-                images = model(noisy_images, timesteps)["sample"]
-                noise_pred = vqvae.decode(images)["sample"]
                 loss = F.mse_loss(noise_pred, noise)
                 accelerator.backward(loss)
@@ -209,13 +211,6 @@ def main(args):
                     ema_model.step(model)
                 optimizer.zero_grad()
-            if args.vae is not None:
-                with torch.no_grad():
-                    images = [
-                        image.convert('L')
-                        for image in vqvae.decode(images)["sample"]
-                    ]
             if accelerator.sync_gradients:
                 progress_bar.update(1)
                 global_step += 1
@@ -239,14 +234,16 @@ def main(args):
                 if args.vae is not None:
                     pipeline = LDMPipeline(
                         unet=accelerator.unwrap_model(
-                            ema_model.averaged_model if args.use_ema else model),
                         vqvae=vqvae,
                         scheduler=noise_scheduler,
                     )
                 else:
                     pipeline = DDPMPipeline(
                         unet=accelerator.unwrap_model(
-                            ema_model.averaged_model if args.use_ema else model),
                         scheduler=noise_scheduler,
                     )

         model = DDPMPipeline.from_pretrained(args.from_pretrained).unet
     else:
         model = UNet2DModel(
+            sample_size=args.resolution if args.vae is None else 64,
+            in_channels=1 if args.vae is None else 3,
+            out_channels=1 if args.vae is None else 3,
             layers_per_block=2,
             block_out_channels=(128, 128, 256, 256, 512, 512),
             down_block_types=(
     def transforms(examples):
         if args.vae is not None:
             images = [
+                augmentations(image.convert("RGB"))
                 for image in examples["image"]
             ]
         else:
         model.train()
         for step, batch in enumerate(train_dataloader):
             clean_images = batch["input"]
+            if args.vae is not None:
+                vqvae.to(clean_images.device)
+                with torch.no_grad():
+                    clean_images = vqvae.encode(
+                        clean_images).latent_dist.sample()
             # Sample noise that we'll add to the images
             noise = torch.randn(clean_images.shape).to(clean_images.device)
             bsz = clean_images.shape[0]
                 device=clean_images.device,
             ).long()
             # Add noise to the clean images according to the noise magnitude at each timestep
             # (this is the forward diffusion process)
             noisy_images = noise_scheduler.add_noise(clean_images, noise,
             with accelerator.accumulate(model):
                 # Predict the noise residual
+                noise_pred = model(noisy_images, timesteps)["sample"]
                 loss = F.mse_loss(noise_pred, noise)
                 accelerator.backward(loss)
                     ema_model.step(model)
                 optimizer.zero_grad()
             if accelerator.sync_gradients:
                 progress_bar.update(1)
                 global_step += 1
                 if args.vae is not None:
                     pipeline = LDMPipeline(
                         unet=accelerator.unwrap_model(
+                            ema_model.averaged_model if args.use_ema else model
+                        ),
                         vqvae=vqvae,
                         scheduler=noise_scheduler,
                     )
                 else:
                     pipeline = DDPMPipeline(
                         unet=accelerator.unwrap_model(
+                            ema_model.averaged_model if args.use_ema else model
+                        ),
                         scheduler=noise_scheduler,
                     )

scripts/train_vae.py CHANGED Viewed

@@ -4,9 +4,7 @@
 # TODO
 # grayscale
-# add vae to train_uncond (no_grad)
 # update README
-# merge in changes to train_unconditional
 import os
 import argparse

 # TODO
 # grayscale
 # update README
 import os
 import argparse