fix pipelines
- audiodiffusion/__init__.py +16 -11
- notebooks/test_vae.ipynb +2 -2
- scripts/train_unconditional.py +8 -8
audiodiffusion/__init__.py
CHANGED
@@ -5,7 +5,7 @@ import numpy as np
 from PIL import Image
 from tqdm.auto import tqdm
 from librosa.beat import beat_track
-from diffusers import
+from diffusers import DiffusionPipeline

 from .mel import Mel

@@ -42,13 +42,14 @@ class AudioDiffusion:
             hop_length=hop_length,
             top_db=top_db)
         self.model_id = model_id
-        self.
+        self.pipe = DiffusionPipeline.from_pretrained(self.model_id)
         if cuda:
-            self.
+            self.pipe.to("cuda")
         self.progress_bar = progress_bar or (lambda _: _)

     def generate_spectrogram_and_audio(
             self,
+            steps: int = None,
             generator: torch.Generator = None
     ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
         """Generate random mel spectrogram and convert to audio.

@@ -60,7 +61,10 @@ class AudioDiffusion:
             PIL Image: mel spectrogram
             (float, np.ndarray): sample rate and raw audio
         """
-        images = self.
+        images = self.pipe(output_type="numpy",
+                           generator=generator,
+                           num_inference_steps=self.pipe.scheduler.
+                           num_train_timesteps)["sample"]
         images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
         image = Image.fromarray(images[0][0])
         audio = self.mel.image_to_audio(image)

@@ -95,16 +99,17 @@ class AudioDiffusion:
             (float, np.ndarray): sample rate and raw audio
         """

-        # It would be better to derive a class from
+        # It would be better to derive a class from DiffusionPipeline
         # but currently the return type ImagePipelineOutput cannot be imported.
         if steps is None:
-            steps = self.
-
+            steps = self.pipe.scheduler.num_train_timesteps
+        # Unfortunately, the schedule is set up in the constructor.
+        scheduler = self.pipe.scheduler.__class__(num_train_timesteps=steps)
         scheduler.set_timesteps(steps)
         mask = None
         images = noise = torch.randn(
-            (1, self.
-            self.
+            (1, self.pipe.unet.in_channels, self.pipe.unet.sample_size,
+             self.pipe.unet.sample_size),
            generator=generator)

        if audio_file is not None or raw_audio is not None:

@@ -129,10 +134,10 @@ class AudioDiffusion:
                torch.tensor(input_image[np.newaxis, np.newaxis, :]), noise,
                torch.tensor(scheduler.timesteps[start_step:]))

-        images = images.to(self.
+        images = images.to(self.pipe.device)
        for step, t in enumerate(
                self.progress_bar(scheduler.timesteps[start_step:])):
-            model_output = self.
+            model_output = self.pipe.unet(images, t)['sample']
            images = scheduler.step(model_output,
                                    t,
                                    images,
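Note on the scheduler hunk: a diffusers scheduler fixes num_train_timesteps in its constructor, so sampling with a different step count means building a fresh instance of the same scheduler class rather than mutating the pipeline's own. A minimal sketch of the pattern, using DDPMScheduler as a stand-in for whatever class the loaded checkpoint carries:

from diffusers import DDPMScheduler

# Stand-in for self.pipe.scheduler.__class__; the checkpoint decides the
# real class. Re-instantiating leaves the pipeline's scheduler untouched.
steps = 50
scheduler = DDPMScheduler(num_train_timesteps=steps)
scheduler.set_timesteps(steps)
print(len(scheduler.timesteps))  # 50 timesteps, in descending order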
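For reference, a minimal usage sketch of the reworked class. The model_id is an example checkpoint name from this project, and the constructor's other defaults are assumed:

from audiodiffusion import AudioDiffusion

# Example checkpoint name (assumption); any audio-diffusion model repo works.
audio_diffusion = AudioDiffusion(model_id="teticio/audio-diffusion-256",
                                 cuda=False)
image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()
image.save("spectrogram.png")  # the mel spectrogram as a PIL image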
notebooks/test_vae.ipynb
CHANGED
@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "
+   "id": "bcbbe26c",
    "metadata": {},
    "outputs": [],
    "source": [

@@ -57,7 +57,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ds = load_dataset('teticio/audio-diffusion-
+    "ds = load_dataset('teticio/audio-diffusion-256')"
    ]
   },
   {
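The notebook cell now points at the 256x256 dataset. A sketch of inspecting what it loads; the split and column names are assumptions about how the preprocessed mel spectrograms are stored:

from datasets import load_dataset

ds = load_dataset("teticio/audio-diffusion-256")
print(ds)              # DatasetDict with its splits and features
print(ds["train"][0])  # "train" split assumed; one spectrogram image per row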
scripts/train_unconditional.py
CHANGED
@@ -231,15 +231,14 @@ def main(args):

         # Generate sample images for visual inspection
         if accelerator.is_main_process:
-            if
+            if (
+                    epoch + 1
+            ) % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                 if args.vae is not None:
-                    pipeline = LDMPipeline(
-
-
-
-                        vqvae=vqvae,
-                        scheduler=noise_scheduler,
-                    )
+                    pipeline = LDMPipeline(unet=accelerator.unwrap_model(
+                        ema_model.averaged_model if args.use_ema else model),
+                                           vqvae=vqvae,
+                                           scheduler=noise_scheduler)
                 else:
                     pipeline = DDPMPipeline(
                         unet=accelerator.unwrap_model(

@@ -269,6 +268,7 @@ def main(args):
                 generator=generator,
                 batch_size=args.eval_batch_size,
                 output_type="numpy",
+                num_inference_steps=args.num_train_steps,
             )["sample"]

             # denormalize the images and save to tensorboard
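The reshuffled LDMPipeline call above moves the unwrapped (optionally EMA-averaged) UNet into the constructor. As a self-contained sketch of assembling that kind of pipeline from parts (the model configs here are placeholders, not this script's hyperparameters):

from diffusers import DDPMScheduler, LDMPipeline, UNet2DModel, VQModel

# Placeholder configs; the training script passes its own trained modules.
unet = UNet2DModel(sample_size=64, in_channels=3, out_channels=3)
vqvae = VQModel()
scheduler = DDPMScheduler(num_train_timesteps=1000)
pipeline = LDMPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)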
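The last hunk pins the evaluation sampler to the training schedule length; without it, DDPMPipeline falls back to its own default step count, which need not match args.num_train_steps. A sketch of the same call against a public checkpoint (the checkpoint name is only an example, and the ["sample"] indexing matches the diffusers version this repo targets):

from diffusers import DDPMPipeline

pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")  # example checkpoint
# An explicit step count keeps sampling consistent with the training schedule.
images = pipe(batch_size=1,
              output_type="numpy",
              num_inference_steps=1000)["sample"]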