use new models for now
notebooks/audio_diffusion_pipeline.ipynb
CHANGED
@@ -46,7 +46,7 @@
 "from datasets import load_dataset\n",
 "from IPython.display import Audio\n",
 "from librosa.beat import beat_track\n",
-"from diffusers import DiffusionPipeline
+"from diffusers import DiffusionPipeline"
 ]
 },
 {
@@ -56,8 +56,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"mel = Mel()\n",
-"sample_rate = mel.get_sample_rate()\n",
 "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
 "generator = torch.Generator(device=device)"
 ]
@@ -91,7 +89,7 @@
 "\n",
 "#@markdown teticio/audio-diffusion-instrumental-hiphop-256 - trained on instrumental hiphop\n",
 "\n",
-"model_id = \"teticio/audio-diffusion-256\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"audio-diffusion-instrumenal-hiphop-256\", \"teticio/audio-diffusion-ddim-256\"]"
+"model_id = \"teticio/audio-diffusion-256-new\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"audio-diffusion-instrumenal-hiphop-256\", \"teticio/audio-diffusion-ddim-256\"]"
 ]
 },
 {
@@ -101,7 +99,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)"
+"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)\n",
+"mel = audio_diffusion.mel\n",
+"sample_rate = mel.get_sample_rate()"
 ]
 },
 {
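
For anyone applying these first hunks by hand, the updated setup boils down to the sketch below. Everything here is taken from the diff itself; the only assumption is that the `-new` model ids this commit pins are live on the Hub.

import torch
from diffusers import DiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
generator = torch.Generator(device=device)

# The pipeline now carries its own Mel converter, so there is no longer
# a hand-built Mel() instance; sample_rate comes from the loaded model.
model_id = "teticio/audio-diffusion-256-new"  # id as pinned by this commit
audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)
mel = audio_diffusion.mel
sample_rate = mel.get_sample_rate()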
@@ -150,7 +150,7 @@
 " seed = generator.seed()\n",
 " print(f'Seed = {seed}')\n",
 " generator.manual_seed(seed)\n",
-" output = audio_diffusion(
+" output = audio_diffusion(generator=generator)\n",
 " image = output.images[0]\n",
 " audio = output.audios[0, 0]\n",
 " display(image)\n",
@@ -187,7 +187,7 @@
 "source": [
 "seed = 2391504374279719 #@param {type:\"integer\"}\n",
 "generator.manual_seed(seed)\n",
-"output = audio_diffusion(
+"output = audio_diffusion(generator=generator)\n",
 "image = output.images[0]\n",
 "audio = output.audios[0, 0]\n",
 "display(image)\n",
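
Continuing from that setup, the generation cells now pass only the generator; the output object exposes both the spectrogram image and the decoded audio, as the surrounding context lines show. A minimal sketch (the Audio/display calls are assumed to match the unchanged notebook lines around the hunk):

from IPython.display import Audio, display

seed = 2391504374279719  # seed from the notebook cell above
generator.manual_seed(seed)

output = audio_diffusion(generator=generator)
image = output.images[0]     # PIL image of the mel spectrogram
audio = output.audios[0, 0]  # mono waveform as a numpy array
display(image)
display(Audio(audio, rate=sample_rate))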
@@ -206,7 +206,7 @@
 "start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
 "track = loop_it(audio, sample_rate, loops=1)\n",
 "for variation in range(12):\n",
-" output = audio_diffusion(
+" output = audio_diffusion(raw_audio=audio, start_step=start_step)\n",
 " image2 = output.images[0]\n",
 " audio2 = output.audios[0, 0]\n",
 " display(image2)\n",
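
The variations loop re-noises the previous output and denoises it again from start_step; a sketch under the same assumptions as above:

start_step = 500  # notebook slider: 0 to 1000 in steps of 10

for variation in range(12):
    # start_step controls how much of the schedule is re-run on the
    # re-noised input, trading fidelity to the original for variation.
    output = audio_diffusion(raw_audio=audio, start_step=start_step)
    display(output.images[0])
    display(Audio(output.audios[0, 0], rate=sample_rate))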
@@ -235,8 +235,7 @@
 "overlap_samples = overlap_secs * sample_rate\n",
 "track = audio\n",
 "for variation in range(12):\n",
-" output = audio_diffusion(
-" raw_audio=audio[-overlap_samples:],\n",
+" output = audio_diffusion(raw_audio=audio[-overlap_samples:],\n",
 " start_step=start_step,\n",
 " mask_start_secs=overlap_secs)\n",
 " image2 = output.images[0]\n",
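
The extension loop conditions each new slice on the tail of the previous audio and pins that overlap with mask_start_secs so the join stays continuous. In the sketch below, overlap_secs is given a hypothetical value and the final concatenation is an assumption; the real stitching lives outside this hunk:

import numpy as np

overlap_secs = 2  # hypothetical; the notebook sets this elsewhere
overlap_samples = overlap_secs * sample_rate
track = audio

for variation in range(12):
    # Freeze the first overlap_secs of the new slice so it reproduces
    # the tail of the previous one exactly.
    output = audio_diffusion(raw_audio=audio[-overlap_samples:],
                             start_step=start_step,
                             mask_start_secs=overlap_secs)
    audio = output.audios[0, 0]
    track = np.concatenate([track, audio[overlap_samples:]])  # assumed stitch

The hunk at old line 306 below applies the same idea when closing a loop, first rescaling the generated overlap by the ratio of peak levels so the seam does not jump in volume.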
@@ -306,8 +305,7 @@
 " # Normalize and re-insert generated audio\n",
 " audio[:overlap_samples] = audio2[-overlap_samples:] * np.max(\n",
 " audio[:overlap_samples]) / np.max(audio2[-overlap_samples:])\n",
-" output = audio_diffusion(
-" raw_audio=audio,\n",
+" output = audio_diffusion(raw_audio=audio,\n",
 " start_step=start_step,\n",
 " generator=generator,\n",
 " mask_start_secs=overlap_secs * not_first)\n",
@@ -334,8 +332,7 @@
 "source": [
 "sample = 3 #@param {type:\"integer\"}\n",
 "raw_audio = track_audio[sample * stride:sample * stride + slice_size]\n",
-"output = audio_diffusion(
-" raw_audio=raw_audio,\n",
+"output = audio_diffusion(raw_audio=raw_audio,\n",
 " mask_start_secs=1,\n",
 " mask_end_secs=1,\n",
 " step_generator=torch.Generator(device=device))\n",
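
The in-fill cell masks one second at each end of a slice and regenerates only the middle; step_generator supplies the per-step noise independently of the main generator. track_audio, stride and slice_size are defined in earlier, unchanged cells, so treat them here as a full track and a fixed slicing scheme:

import torch

sample = 3
raw_audio = track_audio[sample * stride:sample * stride + slice_size]

output = audio_diffusion(raw_audio=raw_audio,
                         mask_start_secs=1,   # keep first second intact
                         mask_end_secs=1,     # keep last second intact
                         step_generator=torch.Generator(device=device))
display(output.images[0])
display(Audio(output.audios[0, 0], rate=sample_rate))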
@@ -359,7 +356,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"audio_diffusion = DiffusionPipeline.from_pretrained('teticio/audio-diffusion-ddim-256').to(device)"
+"audio_diffusion = DiffusionPipeline.from_pretrained('teticio/audio-diffusion-ddim-256-new').to(device)\n",
+"mel = audio_diffusion.mel\n",
+"sample_rate = mel.get_sample_rate()"
 ]
 },
 {
@@ -381,7 +380,7 @@
 " seed = generator.seed()\n",
 " print(f'Seed = {seed}')\n",
 " generator.manual_seed(seed)\n",
-" output = audio_diffusion(
+" output = audio_diffusion(generator=generator)\n",
 " image = output.images[0]\n",
 " audio = output.audios[0, 0]\n",
 " display(image)\n",
@@ -410,7 +409,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"output = audio_diffusion(
+"output = audio_diffusion(steps=1000, generator=generator, eta=1)\n",
 "image = output.images[0]\n",
 "audio = output.audios[0, 0]\n",
 "display(image)\n",
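
For the DDIM model the pattern is identical, and the updated cell shows the full-length stochastic variant: steps=1000 with eta=1 re-runs the whole schedule with noise injected at every step (eta=0 would be the deterministic DDIM default). A sketch, again assuming the `-new` id is live:

audio_diffusion = DiffusionPipeline.from_pretrained(
    "teticio/audio-diffusion-ddim-256-new").to(device)
mel = audio_diffusion.mel
sample_rate = mel.get_sample_rate()

# Full 1000-step sampling with eta=1, i.e. maximally stochastic DDIM.
output = audio_diffusion(steps=1000, generator=generator, eta=1)
display(output.images[0])
display(Audio(output.audios[0, 0], rate=sample_rate))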
@@ -509,7 +508,6 @@
 "source": [
 "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n",
 "output = audio_diffusion(\n",
-" mel=mel,\n",
 " noise=audio_diffusion.slerp(noise, noise2, alpha),\n",
 " generator=generator)\n",
 "audio = output.audios[0, 0]\n",
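
With mel gone from the call signature, the interpolation cell passes only the noise tensors and the generator. noise and noise2 are captured from two earlier seeded generations (cells outside this hunk):

alpha = 0.5  # 0 = first generation, 1 = second

# Spherical interpolation between the two noise vectors; sweeping alpha
# morphs smoothly between the two generations.
output = audio_diffusion(
    noise=audio_diffusion.slerp(noise, noise2, alpha),
    generator=generator)
display(Audio(output.audios[0, 0], rate=sample_rate))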
@@ -534,7 +532,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"model_id = \"teticio/latent-audio-diffusion-ddim-256\" #@param [\"teticio/latent-audio-diffusion-256\", \"teticio/latent-audio-diffusion-ddim-256\"]"
+"model_id = \"teticio/latent-audio-diffusion-ddim-256-new\" #@param [\"teticio/latent-audio-diffusion-256\", \"teticio/latent-audio-diffusion-ddim-256\"]"
 ]
 },
 {
@@ -544,7 +542,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)"
+"audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)\n",
+"mel = audio_diffusion.mel\n",
+"sample_rate = mel.get_sample_rate()"
 ]
 },
 {
@@ -556,7 +556,7 @@
 "source": [
 "seed = 3412253600050855 #@param {type:\"integer\"}\n",
 "generator.manual_seed(seed)\n",
-"output = audio_diffusion(
+"output = audio_diffusion(generator=generator)\n",
 "image = output.images[0]\n",
 "audio = output.audios[0, 0]\n",
 "display(image)\n",
@@ -572,7 +572,7 @@
 "source": [
 "seed2 = 7016114633369557 #@param {type:\"integer\"}\n",
 "generator.manual_seed(seed2)\n",
-"output = audio_diffusion(
+"output = audio_diffusion(generator=generator)\n",
 "image2 = output.images[0]\n",
 "audio2 = output.audios[0, 0]\n",
 "display(image2)\n",
@@ -628,7 +628,6 @@
 "source": [
 "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n",
 "output = audio_diffusion(\n",
-" mel=mel,\n",
 " noise=audio_diffusion.slerp(latents, latents2, alpha),\n",
 " generator=generator)\n",
 "audio3 = output.audios[0, 0]\n",
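
The latent-diffusion section follows the same shape end to end; as a closing sketch, loading the latent model and interpolating in latent space (latents and latents2 come from earlier cells outside these hunks):

model_id = "teticio/latent-audio-diffusion-ddim-256-new"  # id pinned by this commit
audio_diffusion = DiffusionPipeline.from_pretrained(model_id).to(device)
mel = audio_diffusion.mel
sample_rate = mel.get_sample_rate()

# Interpolate between two latents instead of raw noise; slerp keeps the
# interpolant on the same hypersphere as its endpoints.
alpha = 0.5
output = audio_diffusion(
    noise=audio_diffusion.slerp(latents, latents2, alpha),
    generator=generator)
audio3 = output.audios[0, 0]
display(Audio(audio3, rate=sample_rate))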