Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -127,6 +127,92 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
|
|
| 127 |
|
| 128 |
return "output.wav", input_spec_image_path, output_spec_image_path
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
css="""
|
| 131 |
div#col-container{
|
| 132 |
margin: 0 auto;
|
|
@@ -185,14 +271,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 185 |
input_spectrogram = gr.Image(label="Input Spectrogram")
|
| 186 |
output_spectrogram = gr.Image(label="Output Spectrogram")
|
| 187 |
|
| 188 |
-
|
| 189 |
-
examples = [
|
| 190 |
-
"Rolling thunder with lightning strikes",
|
| 191 |
-
"Two gunshots followed by birds chirping",
|
| 192 |
-
"A train whistle blowing in the distance"
|
| 193 |
-
],
|
| 194 |
-
inputs = [prompt_img2img]
|
| 195 |
-
)
|
| 196 |
|
| 197 |
submit_btn_img2img.click(
|
| 198 |
fn = infer_img2img,
|
|
@@ -200,4 +279,23 @@ with gr.Blocks(css=css) as demo:
|
|
| 200 |
outputs = [audio_out_img2img, input_spectrogram, output_spectrogram]
|
| 201 |
)
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
demo.queue().launch(show_api=False, show_error=True)
|
|
|
|
| 127 |
|
| 128 |
return "output.wav", input_spec_image_path, output_spec_image_path
|
| 129 |
|
| 130 |
+
def infer_inp(prompt, audio_path, progress=gr.Progress(track_tqdm=True)):
    """Regenerate a fixed time window of an audio clip from a text prompt.

    The clip is converted to a mel spectrogram, a band of spectrogram
    columns is masked, the diffusion pipeline is run on the spectrogram
    with that mask, and the resulting spectrogram is turned back into a
    waveform by the vocoder.

    Args:
        prompt: Text describing the sound to generate. When empty, a
            built-in demo prompt is used instead.
        audio_path: Path of the reference audio file to load.
        progress: Gradio progress tracker; not used directly, it is
            declared so tqdm progress is mirrored in the UI.

    Returns:
        The path of the written audio file, ``"output.wav"``.
    """
    pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
    dtype = torch.float16
    device = "cuda"

    # Demo prompt kept as a fallback so an empty textbox still produces audio.
    default_prompt = "A siren ringing with a vehicle speeding closer"
    prompt = (prompt or "").strip() or default_prompt

    # Columns [width_start, width_start + width) of the spectrogram are masked.
    width_start, width = 256, 512
    seed = 42

    # Resolve the model id to a local snapshot unless it already is a directory.
    if not os.path.isdir(pretrained_model_name_or_path):
        pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)

    vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
    vocoder = vocoder.to(device=device, dtype=dtype)

    # NOTE(review): the call below passes mask_image/height/width to an
    # Img2Img pipeline. Confirm this class honors a mask; diffusers documents
    # mask_image on StableDiffusionInpaintPipeline.
    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
    pipe = pipe.to(device)

    # Loading
    audio, _ = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)
    norm_spec = normalize_spectrogram(spec)
    norm_spec = pad_spec(norm_spec, 1024)
    # Normalize to [-1, 1], because the pipeline does not normalize
    # torch.Tensor input.
    norm_spec = normalize(norm_spec)

    # Add Mask: 1 inside the window to regenerate, 0 elsewhere.
    mask = torch.zeros_like(norm_spec)[:1, ...]
    mask[:, :, width_start:width_start + width] = 1
    mask, _ = prepare_mask_and_masked_image(norm_spec, mask)

    # Generation (seeded so repeated runs use the same generator state).
    generator = torch.Generator(device=device).manual_seed(seed)

    with torch.autocast("cuda"):
        output_spec = pipe(
            prompt=prompt,
            image=norm_spec,
            mask_image=mask,
            num_inference_steps=100,
            generator=generator,
            height=256,
            width=1024,
            output_type="pt",
        ).images[0]

    color_output_spec_image = image_add_color(torch_to_pil(output_spec))

    # Spectrogram -> waveform.
    denorm_spec = denormalize_spectrogram(output_spec)
    denorm_spec_audio = vocoder.inference(denorm_spec)

    # Ensure correct shape: (1, N) -> (N,)
    denorm_spec_audio = denorm_spec_audio.flatten()

    # Peak-normalize to [-1, 1]; skip when the signal is all zeros so a
    # silent result does not turn into NaNs.
    peak = np.max(np.abs(denorm_spec_audio))
    if peak > 0:
        denorm_spec_audio = denorm_spec_audio / peak

    # Save as WAV
    sf.write("output.wav", denorm_spec_audio, 16000)

    # Save the generated spectrogram image for inspection.
    output_spec_image_path = "output_spectrogram.png"
    color_output_spec_image.save(output_spec_image_path)

    # The click handler wires a single audio output, so return only the path.
    return "output.wav"
|
| 215 |
+
|
| 216 |
css="""
|
| 217 |
div#col-container{
|
| 218 |
margin: 0 auto;
|
|
|
|
| 271 |
input_spectrogram = gr.Image(label="Input Spectrogram")
|
| 272 |
output_spectrogram = gr.Image(label="Output Spectrogram")
|
| 273 |
|
| 274 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
submit_btn_img2img.click(
|
| 277 |
fn = infer_img2img,
|
|
|
|
| 279 |
outputs = [audio_out_img2img, input_spectrogram, output_spectrogram]
|
| 280 |
)
|
| 281 |
|
| 282 |
+
with gr.Tab("Audio InPainting"):
    # Inputs: text prompt plus a reference clip passed to the handler as a file path.
    prompt_inp = gr.Textbox(label="Prompt")
    audio_in_inp = gr.Audio(label="Audio Reference", type="filepath")
    submit_btn_inp = gr.Button("Submit")
    audio_out_inp = gr.Audio(label="Audio Result")

    # These image slots are declared but not listed in the click outputs below.
    with gr.Accordion("Compare Spectrograms", open=False):
        with gr.Column():
            input_spectrogram_inp = gr.Image(label="Input Spectrogram")
            output_spectrogram_inp = gr.Image(label="Output Spectrogram")

    # infer_inp receives (prompt, audio path); its result fills the audio player.
    submit_btn_inp.click(
        fn = infer_inp,
        inputs = [prompt_inp, audio_in_inp],
        outputs = [audio_out_inp]
    )
|
| 300 |
+
|
| 301 |
demo.queue().launch(show_api=False, show_error=True)
|