Files changed (1)
  1. app.py +21 -110
app.py CHANGED
@@ -1,4 +1,3 @@
-# Will be fixed soon, but meanwhile:
 import os
 if os.getenv('SPACES_ZERO_GPU') == "true":
     os.environ['SPACES_ZERO_GPU'] = "1"
@@ -6,27 +5,20 @@ if os.getenv('SPACES_ZERO_GPU') == "true":
 import gradio as gr
 import random
 import torch
-import os
+import numpy as np
 from torch import inference_mode
 from typing import Optional, List
-import numpy as np
 from models import load_model
 import utils
 import spaces
-import huggingface_hub
-from inversion_utils import inversion_forward_process, inversion_reverse_process
-

 LDM2 = "cvssp/audioldm2"
 MUSIC = "cvssp/audioldm2-music"
 LDM2_LARGE = "cvssp/audioldm2-large"
-STABLEAUD = "stabilityai/stable-audio-open-1.0"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ldm2 = load_model(model_id=LDM2, device=device)
 ldm2_large = load_model(model_id=LDM2_LARGE, device=device)
 ldm2_music = load_model(model_id=MUSIC, device=device)
-ldm_stableaud = load_model(model_id=STABLEAUD, device=device, token=os.getenv('PRIV_TOKEN'))
-

 def randomize_seed_fn(seed, randomize_seed):
     if randomize_seed:
@@ -34,14 +26,10 @@ def randomize_seed_fn(seed, randomize_seed):
     torch.manual_seed(seed)
     return seed

-
 def invert(ldm_stable, x0, prompt_src, num_diffusion_steps, cfg_scale_src, duration, save_compute):
-    # ldm_stable.model.scheduler.set_timesteps(num_diffusion_steps, device=device)
-
     with inference_mode():
         w0 = ldm_stable.vae_encode(x0)

-    # find Zs and wts - forward process
     _, zs, wts, extra_info = inversion_forward_process(ldm_stable, w0, etas=1,
                                                        prompts=[prompt_src],
                                                        cfg_scales=[cfg_scale_src],
@@ -51,9 +39,7 @@ def invert(ldm_stable, x0, prompt_src, num_diffusion_steps, cfg_scale_src, durat
                                                        save_compute=save_compute)
     return zs, wts, extra_info

-
 def sample(ldm_stable, zs, wts, extra_info, prompt_tar, tstart, cfg_scale_tar, duration, save_compute):
-    # reverse process (via Zs and wT)
     tstart = torch.tensor(tstart, dtype=torch.int)
     w0, _ = inversion_reverse_process(ldm_stable, xT=wts, tstart=tstart,
                                       etas=1., prompts=[prompt_tar],
@@ -63,22 +49,17 @@ def sample(ldm_stable, zs, wts, extra_info, prompt_tar, tstart, cfg_scale_tar, d
                                       extra_info=extra_info,
                                       save_compute=save_compute)

-    # vae decode image
     with inference_mode():
         x0_dec = ldm_stable.vae_decode(w0)

-    if 'stable-audio' not in ldm_stable.model_id:
-        if x0_dec.dim() < 4:
-            x0_dec = x0_dec[None, :, :, :]
+    if x0_dec.dim() < 4:
+        x0_dec = x0_dec[None, :, :, :]

-        with torch.no_grad():
-            audio = ldm_stable.decode_to_mel(x0_dec)
-    else:
-        audio = x0_dec.squeeze(0).T
+    with torch.no_grad():
+        audio = ldm_stable.decode_to_mel(x0_dec)

     return (ldm_stable.get_sr(), audio.squeeze().cpu().numpy())

-
 def get_duration(input_audio,
                  model_id: str,
                  do_inversion: bool,
@@ -91,60 +72,37 @@
                  cfg_scale_tar: float = 12,
                  t_start: int = 45,
                  randomize_seed: bool = True,
-                 save_compute: bool = True,
-                 oauth_token: Optional[gr.OAuthToken] = None):
+                 save_compute: bool = True):
     if model_id == LDM2:
        factor = 1
     elif model_id == LDM2_LARGE:
        factor = 2.5
-    elif model_id == STABLEAUD:
-        factor = 3.2
     else:  # MUSIC
        factor = 1

     forwards = 0
     if do_inversion or randomize_seed:
-        forwards = steps if source_prompt == "" else steps * 2  # x2 when there is a prompt text
+        forwards = steps if source_prompt == "" else steps * 2
     forwards += int(t_start / 100 * steps) * 2

     duration = min(utils.get_duration(input_audio), utils.MAX_DURATION)
-    time_for_maxlength = factor * forwards * 0.15  # 0.25 is the time per forward pass
+    time_for_maxlength = factor * forwards * 0.15

-    if model_id != STABLEAUD:
-        time_for_maxlength = time_for_maxlength / utils.MAX_DURATION * duration
+    time_for_maxlength = time_for_maxlength / utils.MAX_DURATION * duration

     print('expected time:', time_for_maxlength)
     spare_time = 5
     return max(10, time_for_maxlength + spare_time)

-
-def verify_model_params(model_id: str, input_audio, src_prompt: str, tar_prompt: str, cfg_scale_src: float,
-                        oauth_token: gr.OAuthToken | None):
+def verify_model_params(model_id: str, input_audio, src_prompt: str, tar_prompt: str, cfg_scale_src: float):
     if input_audio is None:
         raise gr.Error('Input audio missing!')

     if tar_prompt == "":
         raise gr.Error("Please provide a target prompt to edit the audio.")

-    if src_prompt != "":
-        if model_id == STABLEAUD and cfg_scale_src != 1:
-            gr.Info("Consider using Source Guidance Scale=1 for Stable Audio Open 1.0.")
-        elif model_id != STABLEAUD and cfg_scale_src != 3:
-            gr.Info(f"Consider using Source Guidance Scale=3 for {model_id}.")
-
-    if model_id == STABLEAUD:
-        if oauth_token is None:
-            raise gr.Error("You must be logged in to use Stable Audio Open 1.0. Please log in and try again.")
-        try:
-            huggingface_hub.get_hf_file_metadata(huggingface_hub.hf_hub_url(STABLEAUD, 'transformer/config.json'),
-                                                 token=oauth_token.token)
-            print('Has Access')
-        # except huggingface_hub.utils._errors.GatedRepoError:
-        except huggingface_hub.errors.GatedRepoError:
-            raise gr.Error("You need to accept the license agreement to use Stable Audio Open 1.0. "
-                           "Visit the <a href='https://huggingface.co/stabilityai/stable-audio-open-1.0'>"
-                           "model page</a> to get access.")
-
+    if src_prompt != "" and cfg_scale_src != 3:
+        gr.Info(f"Consider using Source Guidance Scale=3 for {model_id}.")

 @spaces.GPU(duration=get_duration)
 def edit(input_audio,
@@ -159,32 +117,28 @@ def edit(input_audio,
          cfg_scale_tar: float = 12,
          t_start: int = 45,
          randomize_seed: bool = True,
-         save_compute: bool = True,
-         oauth_token: Optional[gr.OAuthToken] = None):
+         save_compute: bool = True):
     print(model_id)
     if model_id == LDM2:
         ldm_stable = ldm2
     elif model_id == LDM2_LARGE:
         ldm_stable = ldm2_large
-    elif model_id == STABLEAUD:
-        ldm_stable = ldm_stableaud
     else:  # MUSIC
         ldm_stable = ldm2_music

     ldm_stable.model.scheduler.set_timesteps(steps, device=device)

-    # If the inversion was done for a different model, we need to re-run the inversion
     if not do_inversion and (saved_inv_model is None or saved_inv_model != model_id):
         do_inversion = True

     if input_audio is None:
         raise gr.Error('Input audio missing!')
     x0, _, duration = utils.load_audio(input_audio, ldm_stable.get_fn_STFT(), device=device,
-                                       stft=('stable-audio' not in ldm_stable.model_id), model_sr=ldm_stable.get_sr())
+                                       stft=True, model_sr=ldm_stable.get_sr())
     if wts is None or zs is None:
         do_inversion = True

-    if do_inversion or randomize_seed:  # always re-run inversion
+    if do_inversion or randomize_seed:
         zs_tensor, wts_tensor, extra_info_list = invert(ldm_stable=ldm_stable, x0=x0, prompt_src=source_prompt,
                                                         num_diffusion_steps=steps,
                                                         cfg_scale_src=cfg_scale_src,
@@ -205,8 +159,6 @@ def edit(input_audio,
                                                         save_compute=save_compute)

     return output, wts.cpu(), zs.cpu(), [e.cpu() for e in extra_info if e is not None], saved_inv_model, do_inversion
-    # return output, wtszs_file, saved_inv_model, do_inversion
-

 def get_example():
     case = [
@@ -226,14 +178,6 @@
          '27s',
          'Examples/Beethoven_piano.mp3',
          ],
-        ['Examples/Beethoven.mp3',
-         '',
-         'Heavy Rock.',
-         40,
-         'stabilityai/stable-audio-open-1.0',
-         '27s',
-         'Examples/Beethoven_rock.mp3',
-         ],
         ['Examples/ModalJazz.mp3',
          'Trumpets playing alongside a piano, bass and drums in an upbeat old-timey cool jazz song.',
          'A banjo playing alongside a piano, bass and drums in an upbeat old-timey cool country song.',
@@ -241,13 +185,6 @@
          'cvssp/audioldm2-music',
          '106s',
          'Examples/ModalJazz_banjo.mp3',],
-        ['Examples/Shadows.mp3',
-         '',
-         '8-bit arcade game soundtrack.',
-         40,
-         'stabilityai/stable-audio-open-1.0',
-         '34s',
-         'Examples/Shadows_arcade.mp3',],
         ['Examples/Cat.mp3',
          '',
          'A dog barking.',
@@ -258,14 +195,13 @@
     ]
     return case

-
 intro = """
 <h1 style="font-weight: 1000; text-align: center; margin: 0px;"> ZETA Editing 🎧 </h1>
 <h2 style="font-weight: 1000; text-align: center; margin: 0px;">
 Zero-Shot Text-Based Audio Editing Using DDPM Inversion 🎛️ </h2>
 <h3 style="margin-top: 0px; margin-bottom: 10px; text-align: center;">
-<a href="https://arxiv.org/abs/2402.10009">[Paper]</a>&nbsp;|&nbsp;
-<a href="https://hilamanor.github.io/AudioEditing/">[Project page]</a>&nbsp;|&nbsp;
+<a href="https://arxiv.org/abs/2402.10009">[Paper]</a> |
+<a href="https://hilamanor.github.io/AudioEditing/">[Project page]</a> |
 <a href="https://github.com/HilaManor/AudioEditingCode">[Code]</a>
 </h3>

@@ -275,22 +211,6 @@ For faster inference without waiting in queue, you may duplicate the space and u
 <img style="margin-top: 0em; margin-bottom: 0em; display:inline" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" >
 </a>
 </p>
-<p style="margin: 0px;">
-<b>NEW - 15.10.24:</b> You can now edit using <b>Stable Audio Open 1.0</b>.
-You must be <b>logged in</b> after accepting the
-<b><a href="https://huggingface.co/stabilityai/stable-audio-open-1.0">license agreement</a></b> to use it.</br>
-</p>
-<ul style="padding-left:40px; line-height:normal;">
-<li style="margin: 0px;">Prompts behave differently - e.g.,
-try "8-bit arcade" directly instead of "a recording of...". Check out the new examples below!</li>
-<li style="margin: 0px;">Try to play around <code>T-start=40%</code>.</li>
-<li style="margin: 0px;">Under "More Options": Use <code>Source Guidance Scale=1</code>,
-and you can try fewer timesteps (even 20!).</li>
-<li style="margin: 0px;">Stable Audio Open is a general-audio model.
-For better music editing, duplicate the space and change to a
-<a href="https://huggingface.co/models?other=base_model:finetune:stabilityai/stable-audio-open-1.0">
-fine-tuned model for music</a>.</li>
-</ul>
 <p>
 <b>NEW - 15.10.24:</b> Parallel editing is enabled by default.
 To disable, uncheck <code>Efficient editing</code> under "More Options".
@@ -298,7 +218,6 @@ Saves a bit of time.
 </p>
 """

-
 help = """
 <div style="font-size:medium">
 <b>Instructions:</b><br>
@@ -319,21 +238,17 @@ to <code style="display:inline; background-color: lightgrey;">None</code>.
 </li>
 </ul>
 </div>
-
 """

 css = '.gradio-container {max-width: 1000px !important; padding-top: 1.5rem !important;}' \
       '.audio-upload .wrap {min-height: 0px;}'

-# with gr.Blocks(css='style.css') as demo:
 with gr.Blocks(css=css) as demo:
     def reset_do_inversion(do_inversion_user, do_inversion):
-        # do_inversion = gr.State(value=True)
         do_inversion = True
         do_inversion_user = True
         return do_inversion_user, do_inversion

-    # handle the case where the user clicked the button but the inversion was not done
     def clear_do_inversion_user(do_inversion_user):
         do_inversion_user = False
         return do_inversion_user
@@ -350,7 +265,7 @@ with gr.Blocks(css=css) as demo:
     zs = gr.State()
     extra_info = gr.State()
     saved_inv_model = gr.State()
-    do_inversion = gr.State(value=True)  # To save some runtime when editing the same thing over and over
+    do_inversion = gr.State(value=True)
     do_inversion_user = gr.State(value=False)

     with gr.Group():
@@ -371,15 +286,12 @@
         t_start = gr.Slider(minimum=15, maximum=85, value=45, step=1, label="T-start (%)", interactive=True, scale=3,
                             info="Lower T-start -> closer to original audio. Higher T-start -> stronger edit.")
         model_id = gr.Dropdown(label="Model Version",
-                               choices=[LDM2,
-                                        LDM2_LARGE,
-                                        MUSIC,
-                                        STABLEAUD],
+                               choices=[LDM2, LDM2_LARGE, MUSIC],
                                info="Choose a checkpoint suitable for your audio and edit",
                                value="cvssp/audioldm2-music", interactive=True, type="value", scale=2)
+
     with gr.Row():
         submit = gr.Button("Edit", variant="primary", scale=3)
-        gr.LoginButton(value="Login to HF (For Stable Audio)", scale=1)

     with gr.Accordion("More Options", open=False):
         with gr.Row():
@@ -435,7 +347,6 @@ with gr.Blocks(css=css) as demo:
         outputs=[do_inversion_user, do_inversion]
     )

-    # If sources changed we have to rerun inversion
     gr.on(
         triggers=[input_audio.change, src_prompt.change, model_id.change, cfg_scale_src.change,
                   steps.change, save_compute.change],
@@ -452,4 +363,4 @@
     )

     demo.queue()
-    demo.launch(state_session_capacity=15)
+    demo.launch(state_session_capacity=15)