Spaces:
Runtime error
Runtime error
normalize in remix
Browse files- audiodiffusion/__init__.py +1 -1
- notebooks/test_model.ipynb +25 -32
audiodiffusion/__init__.py
CHANGED
|
@@ -92,7 +92,7 @@ class AudioDiffusion:
|
|
| 92 |
images = noise = torch.randn(
|
| 93 |
(1, self.ddpm.unet.in_channels, self.ddpm.unet.sample_size,
|
| 94 |
self.ddpm.unet.sample_size),
|
| 95 |
-
generator=generator
|
| 96 |
)
|
| 97 |
|
| 98 |
if audio_file is not None or raw_audio is not None:
|
|
|
|
| 92 |
images = noise = torch.randn(
|
| 93 |
(1, self.ddpm.unet.in_channels, self.ddpm.unet.sample_size,
|
| 94 |
self.ddpm.unet.sample_size),
|
| 95 |
+
generator=generator
|
| 96 |
)
|
| 97 |
|
| 98 |
if audio_file is not None or raw_audio is not None:
|
notebooks/test_model.ipynb
CHANGED
|
@@ -87,6 +87,16 @@
|
|
| 87 |
"audio_diffusion = AudioDiffusion(model_id=model_id)"
|
| 88 |
]
|
| 89 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
{
|
| 91 |
"cell_type": "markdown",
|
| 92 |
"id": "011fb5a1",
|
|
@@ -171,7 +181,7 @@
|
|
| 171 |
},
|
| 172 |
{
|
| 173 |
"cell_type": "markdown",
|
| 174 |
-
"id": "
|
| 175 |
"metadata": {},
|
| 176 |
"source": [
|
| 177 |
"### Generate continuations (\"out-painting\")"
|
|
@@ -180,7 +190,7 @@
|
|
| 180 |
{
|
| 181 |
"cell_type": "code",
|
| 182 |
"execution_count": null,
|
| 183 |
-
"id": "
|
| 184 |
"metadata": {},
|
| 185 |
"outputs": [],
|
| 186 |
"source": [
|
|
@@ -230,7 +240,7 @@
|
|
| 230 |
" from google.colab import files\n",
|
| 231 |
" audio_file = list(files.upload().keys())[0]\n",
|
| 232 |
"except:\n",
|
| 233 |
-
" audio_file = \"/home/teticio/Music/
|
| 234 |
]
|
| 235 |
},
|
| 236 |
{
|
|
@@ -244,43 +254,44 @@
|
|
| 244 |
"source": [
|
| 245 |
"start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
|
| 246 |
"overlap_secs = 2 #@param {type:\"integer\"}\n",
|
| 247 |
-
"mel = Mel(x_res=256, y_res=256)\n",
|
| 248 |
"mel.load_audio(audio_file)\n",
|
| 249 |
"overlap_samples = overlap_secs * mel.get_sample_rate()\n",
|
| 250 |
-
"slice_size =
|
| 251 |
"stride = slice_size - overlap_samples\n",
|
| 252 |
"generator = torch.Generator()\n",
|
| 253 |
"seed = generator.seed()\n",
|
| 254 |
"track = np.array([])\n",
|
| 255 |
"for sample in range(len(mel.audio) // stride):\n",
|
| 256 |
" generator.manual_seed(seed)\n",
|
| 257 |
-
" audio = mel.audio[sample * stride:sample * stride + slice_size]\n",
|
|
|
|
| 258 |
" if len(track) > 0:\n",
|
| 259 |
-
"
|
|
|
|
|
|
|
| 260 |
" _, (sample_rate,\n",
|
| 261 |
" audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
|
| 262 |
" raw_audio=audio,\n",
|
| 263 |
" start_step=start_step,\n",
|
| 264 |
" generator=generator,\n",
|
| 265 |
-
" mask_start_secs=
|
| 266 |
-
" display(Audio(audio
|
| 267 |
-
" display(Audio(audio2, rate=sample_rate))\n",
|
| 268 |
" track = np.concatenate([track, audio2[overlap_samples:]])"
|
| 269 |
]
|
| 270 |
},
|
| 271 |
{
|
| 272 |
"cell_type": "code",
|
| 273 |
"execution_count": null,
|
| 274 |
-
"id": "
|
| 275 |
"metadata": {},
|
| 276 |
"outputs": [],
|
| 277 |
"source": [
|
| 278 |
-
"
|
| 279 |
]
|
| 280 |
},
|
| 281 |
{
|
| 282 |
"cell_type": "markdown",
|
| 283 |
-
"id": "
|
| 284 |
"metadata": {},
|
| 285 |
"source": [
|
| 286 |
"### Fill the gap (\"in-painting\")"
|
|
@@ -289,7 +300,7 @@
|
|
| 289 |
{
|
| 290 |
"cell_type": "code",
|
| 291 |
"execution_count": null,
|
| 292 |
-
"id": "
|
| 293 |
"metadata": {},
|
| 294 |
"outputs": [],
|
| 295 |
"source": [
|
|
@@ -313,16 +324,6 @@
|
|
| 313 |
"### Compare results with random sample from training set"
|
| 314 |
]
|
| 315 |
},
|
| 316 |
-
{
|
| 317 |
-
"cell_type": "code",
|
| 318 |
-
"execution_count": null,
|
| 319 |
-
"id": "f028a3c8",
|
| 320 |
-
"metadata": {},
|
| 321 |
-
"outputs": [],
|
| 322 |
-
"source": [
|
| 323 |
-
"mel = Mel(x_res=256, y_res=256)"
|
| 324 |
-
]
|
| 325 |
-
},
|
| 326 |
{
|
| 327 |
"cell_type": "code",
|
| 328 |
"execution_count": null,
|
|
@@ -354,14 +355,6 @@
|
|
| 354 |
"audio = mel.image_to_audio(image)\n",
|
| 355 |
"Audio(data=audio, rate=mel.get_sample_rate())"
|
| 356 |
]
|
| 357 |
-
},
|
| 358 |
-
{
|
| 359 |
-
"cell_type": "code",
|
| 360 |
-
"execution_count": null,
|
| 361 |
-
"id": "d32afb5e",
|
| 362 |
-
"metadata": {},
|
| 363 |
-
"outputs": [],
|
| 364 |
-
"source": []
|
| 365 |
}
|
| 366 |
],
|
| 367 |
"metadata": {
|
|
|
|
| 87 |
"audio_diffusion = AudioDiffusion(model_id=model_id)"
|
| 88 |
]
|
| 89 |
},
|
| 90 |
+
{
|
| 91 |
+
"cell_type": "code",
|
| 92 |
+
"execution_count": null,
|
| 93 |
+
"id": "6e16ed0e",
|
| 94 |
+
"metadata": {},
|
| 95 |
+
"outputs": [],
|
| 96 |
+
"source": [
|
| 97 |
+
"mel = Mel(x_res=256, y_res=256)"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
{
|
| 101 |
"cell_type": "markdown",
|
| 102 |
"id": "011fb5a1",
|
|
|
|
| 181 |
},
|
| 182 |
{
|
| 183 |
"cell_type": "markdown",
|
| 184 |
+
"id": "c3b05163",
|
| 185 |
"metadata": {},
|
| 186 |
"source": [
|
| 187 |
"### Generate continuations (\"out-painting\")"
|
|
|
|
| 190 |
{
|
| 191 |
"cell_type": "code",
|
| 192 |
"execution_count": null,
|
| 193 |
+
"id": "4add9643",
|
| 194 |
"metadata": {},
|
| 195 |
"outputs": [],
|
| 196 |
"source": [
|
|
|
|
| 240 |
" from google.colab import files\n",
|
| 241 |
" audio_file = list(files.upload().keys())[0]\n",
|
| 242 |
"except:\n",
|
| 243 |
+
" audio_file = \"/home/teticio/Music/liked/El Michels Affair - Glaciers Of Ice.mp3\""
|
| 244 |
]
|
| 245 |
},
|
| 246 |
{
|
|
|
|
| 254 |
"source": [
|
| 255 |
"start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
|
| 256 |
"overlap_secs = 2 #@param {type:\"integer\"}\n",
|
|
|
|
| 257 |
"mel.load_audio(audio_file)\n",
|
| 258 |
"overlap_samples = overlap_secs * mel.get_sample_rate()\n",
|
| 259 |
+
"slice_size = mel.x_res * mel.hop_length\n",
|
| 260 |
"stride = slice_size - overlap_samples\n",
|
| 261 |
"generator = torch.Generator()\n",
|
| 262 |
"seed = generator.seed()\n",
|
| 263 |
"track = np.array([])\n",
|
| 264 |
"for sample in range(len(mel.audio) // stride):\n",
|
| 265 |
" generator.manual_seed(seed)\n",
|
| 266 |
+
" audio = np.array(mel.audio[sample * stride:sample * stride + slice_size])\n",
|
| 267 |
+
" display(Audio(audio, rate=sample_rate))\n",
|
| 268 |
" if len(track) > 0:\n",
|
| 269 |
+
" # Normalize and re-insert generated audio\n",
|
| 270 |
+
" audio[:overlap_samples] = audio2[-overlap_samples:] * np.max(\n",
|
| 271 |
+
" audio[:overlap_samples]) / np.max(audio2[-overlap_samples:])\n",
|
| 272 |
" _, (sample_rate,\n",
|
| 273 |
" audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
|
| 274 |
" raw_audio=audio,\n",
|
| 275 |
" start_step=start_step,\n",
|
| 276 |
" generator=generator,\n",
|
| 277 |
+
" mask_start_secs=overlap_secs if len(track) > 0 else 0)\n",
|
| 278 |
+
" display(Audio(audio2, rate=sample_rate))\n",
|
|
|
|
| 279 |
" track = np.concatenate([track, audio2[overlap_samples:]])"
|
| 280 |
]
|
| 281 |
},
|
| 282 |
{
|
| 283 |
"cell_type": "code",
|
| 284 |
"execution_count": null,
|
| 285 |
+
"id": "6e54802a",
|
| 286 |
"metadata": {},
|
| 287 |
"outputs": [],
|
| 288 |
"source": [
|
| 289 |
+
"Audio(track, rate=sample_rate)"
|
| 290 |
]
|
| 291 |
},
|
| 292 |
{
|
| 293 |
"cell_type": "markdown",
|
| 294 |
+
"id": "2147bddb",
|
| 295 |
"metadata": {},
|
| 296 |
"source": [
|
| 297 |
"### Fill the gap (\"in-painting\")"
|
|
|
|
| 300 |
{
|
| 301 |
"cell_type": "code",
|
| 302 |
"execution_count": null,
|
| 303 |
+
"id": "c9de4e17",
|
| 304 |
"metadata": {},
|
| 305 |
"outputs": [],
|
| 306 |
"source": [
|
|
|
|
| 324 |
"### Compare results with random sample from training set"
|
| 325 |
]
|
| 326 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
{
|
| 328 |
"cell_type": "code",
|
| 329 |
"execution_count": null,
|
|
|
|
| 355 |
"audio = mel.image_to_audio(image)\n",
|
| 356 |
"Audio(data=audio, rate=mel.get_sample_rate())"
|
| 357 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
}
|
| 359 |
],
|
| 360 |
"metadata": {
|