Spaces:
Runtime error
Runtime error
dropdown>radio, t-start percentage, intro text change
Browse files
app.py
CHANGED
|
@@ -20,7 +20,7 @@ LDM2_LARGE = "cvssp/audioldm2-large"
|
|
| 20 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 21 |
ldm2 = load_model(model_id=LDM2, device=device)
|
| 22 |
ldm2_large = load_model(model_id=LDM2_LARGE, device=device)
|
| 23 |
-
ldm2_music = load_model(model_id=
|
| 24 |
|
| 25 |
|
| 26 |
def randomize_seed_fn(seed, randomize_seed):
|
|
@@ -46,7 +46,6 @@ def invert(ldm_stable, x0, prompt_src, num_diffusion_steps, cfg_scale_src): # ,
|
|
| 46 |
return zs, wts
|
| 47 |
|
| 48 |
|
| 49 |
-
|
| 50 |
def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # , ldm_stable):
|
| 51 |
# reverse process (via Zs and wT)
|
| 52 |
tstart = torch.tensor(tstart, dtype=torch.int)
|
|
@@ -71,14 +70,16 @@ def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # ,
|
|
| 71 |
|
| 72 |
return f.name
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
| 82 |
|
| 83 |
def edit(input_audio,
|
| 84 |
model_id: str,
|
|
@@ -89,7 +90,7 @@ def edit(input_audio,
|
|
| 89 |
steps=200,
|
| 90 |
cfg_scale_src=3.5,
|
| 91 |
cfg_scale_tar=12,
|
| 92 |
-
t_start=
|
| 93 |
randomize_seed=True):
|
| 94 |
|
| 95 |
# global ldm_stable, current_loaded_model
|
|
@@ -104,10 +105,8 @@ def edit(input_audio,
|
|
| 104 |
ldm_stable = ldm2
|
| 105 |
elif model_id == LDM2_LARGE:
|
| 106 |
ldm_stable = ldm2_large
|
| 107 |
-
else:
|
| 108 |
ldm_stable = ldm2_music
|
| 109 |
-
|
| 110 |
-
|
| 111 |
|
| 112 |
# If the inversion was done for a different model, we need to re-run the inversion
|
| 113 |
if not do_inversion and (saved_inv_model is None or saved_inv_model != model_id):
|
|
@@ -123,25 +122,22 @@ def edit(input_audio,
|
|
| 123 |
zs = gr.State(value=zs_tensor)
|
| 124 |
saved_inv_model = model_id
|
| 125 |
do_inversion = False
|
| 126 |
-
|
| 127 |
# make sure t_start is in the right limit
|
| 128 |
-
t_start = change_tstart_range(t_start, steps)
|
| 129 |
|
| 130 |
-
output = sample(ldm_stable, zs.value, wts.value, steps, prompt_tar=target_prompt,
|
| 131 |
-
cfg_scale_tar=cfg_scale_tar)
|
| 132 |
|
| 133 |
return output, wts, zs, saved_inv_model, do_inversion
|
| 134 |
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
def get_example():
|
| 140 |
case = [
|
| 141 |
['Examples/Beethoven.wav',
|
| 142 |
'',
|
| 143 |
'A recording of an arcade game soundtrack.',
|
| 144 |
-
|
| 145 |
'cvssp/audioldm2-music',
|
| 146 |
'27s',
|
| 147 |
'Examples/Beethoven_arcade.wav',
|
|
@@ -149,7 +145,7 @@ def get_example():
|
|
| 149 |
['Examples/Beethoven.wav',
|
| 150 |
'A high quality recording of wind instruments and strings playing.',
|
| 151 |
'A high quality recording of a piano playing.',
|
| 152 |
-
|
| 153 |
'cvssp/audioldm2-music',
|
| 154 |
'27s',
|
| 155 |
'Examples/Beethoven_piano.wav',
|
|
@@ -157,14 +153,14 @@ def get_example():
|
|
| 157 |
['Examples/ModalJazz.wav',
|
| 158 |
'Trumpets playing alongside a piano, bass and drums in an upbeat old-timey cool jazz song.',
|
| 159 |
'A banjo playing alongside a piano, bass and drums in an upbeat old-timey cool country song.',
|
| 160 |
-
|
| 161 |
'cvssp/audioldm2-music',
|
| 162 |
'106s',
|
| 163 |
'Examples/ModalJazz_banjo.wav',],
|
| 164 |
['Examples/Cat.wav',
|
| 165 |
'',
|
| 166 |
'A dog barking.',
|
| 167 |
-
|
| 168 |
'cvssp/audioldm2-large',
|
| 169 |
'10s',
|
| 170 |
'Examples/Cat_dog.wav',]
|
|
@@ -173,15 +169,15 @@ def get_example():
|
|
| 173 |
|
| 174 |
|
| 175 |
intro = """
|
| 176 |
-
<h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;">
|
| 177 |
-
<h2 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> Audio
|
| 178 |
<h3 style="margin-bottom: 10px; text-align: center;">
|
| 179 |
<a href="https://arxiv.org/abs/2402.10009">[Paper]</a> |
|
| 180 |
<a href="https://hilamanor.github.io/AudioEditing/">[Project page]</a> |
|
| 181 |
<a href="https://github.com/HilaManor/AudioEditingCode">[Code]</a>
|
| 182 |
</h3>
|
| 183 |
<p style="font-size:large">
|
| 184 |
-
Demo for the method introduced in:
|
| 185 |
<b> <a href="https://arxiv.org/abs/2402.10009" style="text-decoration: underline;" target="_blank"> Zero-Shot Unsupervised and Text-Based Audio Editing Using DDPM Inversion </a> </b>
|
| 186 |
</p>
|
| 187 |
<p style="font-size:larger">
|
|
@@ -228,22 +224,24 @@ with gr.Blocks(css='style.css') as demo:
|
|
| 228 |
output_audio = gr.Audio(label="Edited Audio", interactive=False, scale=1)
|
| 229 |
|
| 230 |
with gr.Row():
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
|
| 235 |
with gr.Row():
|
| 236 |
with gr.Column():
|
| 237 |
submit = gr.Button("Edit")
|
| 238 |
|
| 239 |
-
with gr.Row():
|
| 240 |
-
t_start = gr.Slider(minimum=10, maximum=240, value=30, step=1, label="T-start", interactive=True, scale=3,
|
| 241 |
-
info="Higher T-start -> stronger edit. Lower T-start -> closer to original audio")
|
| 242 |
-
model_id = gr.Dropdown(label="AudioLDM2 Version", choices=["cvssp/audioldm2",
|
| 243 |
-
"cvssp/audioldm2-large",
|
| 244 |
-
"cvssp/audioldm2-music"],
|
| 245 |
-
info="Choose a checkpoint suitable for your intended audio and edit",
|
| 246 |
-
value="cvssp/audioldm2-music", interactive=True, type="value", scale=2)
|
| 247 |
with gr.Accordion("More Options", open=False):
|
| 248 |
with gr.Row():
|
| 249 |
src_prompt = gr.Textbox(label="Source Prompt", lines=2, interactive=True, info= "Optional: Describe the original audio input",
|
|
|
|
| 20 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 21 |
ldm2 = load_model(model_id=LDM2, device=device)
|
| 22 |
ldm2_large = load_model(model_id=LDM2_LARGE, device=device)
|
| 23 |
+
ldm2_music = load_model(model_id=MUSIC, device=device)
|
| 24 |
|
| 25 |
|
| 26 |
def randomize_seed_fn(seed, randomize_seed):
|
|
|
|
| 46 |
return zs, wts
|
| 47 |
|
| 48 |
|
|
|
|
| 49 |
def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # , ldm_stable):
|
| 50 |
# reverse process (via Zs and wT)
|
| 51 |
tstart = torch.tensor(tstart, dtype=torch.int)
|
|
|
|
| 70 |
|
| 71 |
return f.name
|
| 72 |
|
| 73 |
+
|
| 74 |
+
# def change_tstart_range(t_start, steps):
|
| 75 |
+
# maximum = int(0.8 * steps)
|
| 76 |
+
# minimum = int(0.15 * steps)
|
| 77 |
+
# if t_start > maximum:
|
| 78 |
+
# t_start = maximum
|
| 79 |
+
# elif t_start < minimum:
|
| 80 |
+
# t_start = minimum
|
| 81 |
+
# return t_start
|
| 82 |
+
|
| 83 |
|
| 84 |
def edit(input_audio,
|
| 85 |
model_id: str,
|
|
|
|
| 90 |
steps=200,
|
| 91 |
cfg_scale_src=3.5,
|
| 92 |
cfg_scale_tar=12,
|
| 93 |
+
t_start=45,
|
| 94 |
randomize_seed=True):
|
| 95 |
|
| 96 |
# global ldm_stable, current_loaded_model
|
|
|
|
| 105 |
ldm_stable = ldm2
|
| 106 |
elif model_id == LDM2_LARGE:
|
| 107 |
ldm_stable = ldm2_large
|
| 108 |
+
else: # MUSIC
|
| 109 |
ldm_stable = ldm2_music
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# If the inversion was done for a different model, we need to re-run the inversion
|
| 112 |
if not do_inversion and (saved_inv_model is None or saved_inv_model != model_id):
|
|
|
|
| 122 |
zs = gr.State(value=zs_tensor)
|
| 123 |
saved_inv_model = model_id
|
| 124 |
do_inversion = False
|
| 125 |
+
|
| 126 |
# make sure t_start is in the right limit
|
| 127 |
+
# t_start = change_tstart_range(t_start, steps)
|
| 128 |
|
| 129 |
+
output = sample(ldm_stable, zs.value, wts.value, steps, prompt_tar=target_prompt,
|
| 130 |
+
tstart=int(t_start / 100 * steps), cfg_scale_tar=cfg_scale_tar)
|
| 131 |
|
| 132 |
return output, wts, zs, saved_inv_model, do_inversion
|
| 133 |
|
| 134 |
|
|
|
|
|
|
|
|
|
|
| 135 |
def get_example():
|
| 136 |
case = [
|
| 137 |
['Examples/Beethoven.wav',
|
| 138 |
'',
|
| 139 |
'A recording of an arcade game soundtrack.',
|
| 140 |
+
45,
|
| 141 |
'cvssp/audioldm2-music',
|
| 142 |
'27s',
|
| 143 |
'Examples/Beethoven_arcade.wav',
|
|
|
|
| 145 |
['Examples/Beethoven.wav',
|
| 146 |
'A high quality recording of wind instruments and strings playing.',
|
| 147 |
'A high quality recording of a piano playing.',
|
| 148 |
+
45,
|
| 149 |
'cvssp/audioldm2-music',
|
| 150 |
'27s',
|
| 151 |
'Examples/Beethoven_piano.wav',
|
|
|
|
| 153 |
['Examples/ModalJazz.wav',
|
| 154 |
'Trumpets playing alongside a piano, bass and drums in an upbeat old-timey cool jazz song.',
|
| 155 |
'A banjo playing alongside a piano, bass and drums in an upbeat old-timey cool country song.',
|
| 156 |
+
45,
|
| 157 |
'cvssp/audioldm2-music',
|
| 158 |
'106s',
|
| 159 |
'Examples/ModalJazz_banjo.wav',],
|
| 160 |
['Examples/Cat.wav',
|
| 161 |
'',
|
| 162 |
'A dog barking.',
|
| 163 |
+
75,
|
| 164 |
'cvssp/audioldm2-large',
|
| 165 |
'10s',
|
| 166 |
'Examples/Cat_dog.wav',]
|
|
|
|
| 169 |
|
| 170 |
|
| 171 |
intro = """
|
| 172 |
+
<h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> ZETA Editing 🎧 </h1>
|
| 173 |
+
<h2 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> Zero-Shot Text-Based Audio Editing Using DDPM Inversion 🎛️ </h2>
|
| 174 |
<h3 style="margin-bottom: 10px; text-align: center;">
|
| 175 |
<a href="https://arxiv.org/abs/2402.10009">[Paper]</a> |
|
| 176 |
<a href="https://hilamanor.github.io/AudioEditing/">[Project page]</a> |
|
| 177 |
<a href="https://github.com/HilaManor/AudioEditingCode">[Code]</a>
|
| 178 |
</h3>
|
| 179 |
<p style="font-size:large">
|
| 180 |
+
Demo for the text-based editing method introduced in:
|
| 181 |
<b> <a href="https://arxiv.org/abs/2402.10009" style="text-decoration: underline;" target="_blank"> Zero-Shot Unsupervised and Text-Based Audio Editing Using DDPM Inversion </a> </b>
|
| 182 |
</p>
|
| 183 |
<p style="font-size:larger">
|
|
|
|
| 224 |
output_audio = gr.Audio(label="Edited Audio", interactive=False, scale=1)
|
| 225 |
|
| 226 |
with gr.Row():
|
| 227 |
+
tar_prompt = gr.Textbox(label="Prompt", info="Describe your desired edited output", placeholder="a recording of a happy upbeat arcade game soundtrack",
|
| 228 |
+
lines=2, interactive=True)
|
| 229 |
+
|
| 230 |
+
with gr.Row():
|
| 231 |
+
t_start = gr.Slider(minimum=15, maximum=85, value=45, step=1, label="T-start (%)", interactive=True, scale=3,
|
| 232 |
+
info="Higher T-start -> stronger edit. Lower T-start -> closer to original audio.")
|
| 233 |
+
# model_id = gr.Dropdown(label="AudioLDM2 Version",
|
| 234 |
+
model_id = gr.Radio(label="AudioLDM2 Version",
|
| 235 |
+
choices=["cvssp/audioldm2",
|
| 236 |
+
"cvssp/audioldm2-large",
|
| 237 |
+
"cvssp/audioldm2-music"],
|
| 238 |
+
info="Choose a checkpoint suitable for your intended audio and edit",
|
| 239 |
+
value="cvssp/audioldm2-music", interactive=True, type="value", scale=2)
|
| 240 |
|
| 241 |
with gr.Row():
|
| 242 |
with gr.Column():
|
| 243 |
submit = gr.Button("Edit")
|
| 244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
with gr.Accordion("More Options", open=False):
|
| 246 |
with gr.Row():
|
| 247 |
src_prompt = gr.Textbox(label="Source Prompt", lines=2, interactive=True, info= "Optional: Describe the original audio input",
|