Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch, os
|
| 3 |
-
from torchvision import transforms
|
| 4 |
import numpy as np
|
| 5 |
from PIL import Image
|
| 6 |
import matplotlib.pyplot as plt
|
|
@@ -128,10 +127,7 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
|
|
| 128 |
|
| 129 |
return "output.wav", input_spec_image_path, output_spec_image_path
|
| 130 |
|
| 131 |
-
def infer_inp(prompt, audio_path,
|
| 132 |
-
|
| 133 |
-
if spec_with_mask:
|
| 134 |
-
print(spec_with_mask)
|
| 135 |
|
| 136 |
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
|
| 137 |
dtype = torch.float16
|
|
@@ -146,7 +142,7 @@ def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqd
|
|
| 146 |
pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
|
| 147 |
pipe = pipe.to(device)
|
| 148 |
|
| 149 |
-
width_start, width =
|
| 150 |
prompt = "A siren ringing with a vehicle speeding closer"
|
| 151 |
seed = 42
|
| 152 |
|
|
@@ -160,24 +156,11 @@ def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqd
|
|
| 160 |
raw_image = image_add_color(torch_to_pil(norm_spec))
|
| 161 |
|
| 162 |
# Add Mask
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
# Load the mask image (input from user)
|
| 168 |
-
mask_pil = spec_with_mask['layers'][0]
|
| 169 |
-
|
| 170 |
-
# Convert to tensor and normalize
|
| 171 |
-
mask_tensor = transforms.ToTensor()(mask_pil) # Shape: (1, H, W), values in [0, 1]
|
| 172 |
-
|
| 173 |
-
# Ensure the shape matches expected input (add batch dimension if needed)
|
| 174 |
-
mask_tensor = mask_tensor[:1, :, :] # Keep only one channel (grayscale)
|
| 175 |
-
mask_tensor = mask_tensor.to(device, dtype) # Send to correct device and dtype
|
| 176 |
|
| 177 |
-
|
| 178 |
-
mask_image = torch_to_pil(mask_tensor)
|
| 179 |
-
|
| 180 |
-
mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask_tensor)
|
| 181 |
masked_spec_image = torch_to_pil(masked_spec)
|
| 182 |
|
| 183 |
# color masked spec and paint masked area to black
|
|
@@ -221,47 +204,15 @@ def infer_inp(prompt, audio_path, spec_with_mask, progress=gr.Progress(track_tqd
|
|
| 221 |
sf.write("output.wav", denorm_spec_audio, 16000)
|
| 222 |
|
| 223 |
# Save input spectrogram image
|
| 224 |
-
#input_spec_image_path = "input_spectrogram.png"
|
| 225 |
-
#raw_image.save(input_spec_image_path)
|
| 226 |
-
|
| 227 |
-
# Save concatenated spectrogram image
|
| 228 |
-
#output_spec_image_path = "output_spectrogram.png"
|
| 229 |
-
#denorm_spec_audio.save(output_spec_image_path)
|
| 230 |
-
|
| 231 |
-
return "output.wav"
|
| 232 |
-
|
| 233 |
-
def create_transparent_layer(image_path):
    """Create a fully transparent PNG with the same size as the background image.

    Args:
        image_path: Path of the background image whose dimensions are copied.

    Returns:
        The relative path the layer was saved to, always ``"layer_one.png"``.
    """
    # Only the dimensions are needed, so read them inside a context manager:
    # this closes the file handle that Image.open would otherwise leave open.
    with Image.open(image_path) as background:
        size = background.size

    # RGBA with alpha 0 everywhere -> completely transparent canvas.
    transparent_layer = Image.new("RGBA", size, (0, 0, 0, 0))

    layer_path = "layer_one.png"
    transparent_layer.save(layer_path)
    return layer_path
|
| 241 |
-
|
| 242 |
-
def load_spec_for_manual_masking(audio_path):
|
| 243 |
-
# Loading
|
| 244 |
-
audio, sampling_rate = load_wav(audio_path)
|
| 245 |
-
audio, spec = get_mel_spectrogram_from_audio(audio)
|
| 246 |
-
norm_spec = normalize_spectrogram(spec)
|
| 247 |
-
norm_spec = pad_spec(norm_spec, 1024)
|
| 248 |
-
norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input
|
| 249 |
-
|
| 250 |
-
raw_image = image_add_color(torch_to_pil(norm_spec))
|
| 251 |
-
|
| 252 |
input_spec_image_path = "input_spectrogram.png"
|
| 253 |
raw_image.save(input_spec_image_path)
|
| 254 |
|
| 255 |
-
#
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
return
|
| 260 |
-
|
| 261 |
-
"layers": [layer_one_path],
|
| 262 |
-
"composite": None
|
| 263 |
-
}
|
| 264 |
-
|
| 265 |
|
| 266 |
css="""
|
| 267 |
div#col-container{
|
|
@@ -332,8 +283,8 @@ with gr.Blocks(css=css) as demo:
|
|
| 332 |
with gr.Tab("Audio InPainting"):
|
| 333 |
prompt_inp = gr.Textbox(label="Prompt")
|
| 334 |
audio_in_inp = gr.Audio(label="Audio Reference", type="filepath")
|
| 335 |
-
|
| 336 |
-
|
| 337 |
submit_btn_inp = gr.Button("Submit")
|
| 338 |
audio_out_inp = gr.Audio(label="Audio Ressult")
|
| 339 |
|
|
@@ -341,23 +292,11 @@ with gr.Blocks(css=css) as demo:
|
|
| 341 |
with gr.Column():
|
| 342 |
input_spectrogram_inp = gr.Image(label="Input Spectrogram")
|
| 343 |
output_spectrogram_inp = gr.Image(label="Output Spectrogram")
|
| 344 |
-
|
| 345 |
-
audio_in_inp.upload(
|
| 346 |
-
fn = load_spec_for_manual_masking,
|
| 347 |
-
inputs = [audio_in_inp],
|
| 348 |
-
outputs = [spec_for_mask]
|
| 349 |
-
)
|
| 350 |
-
|
| 351 |
-
spec_for_mask.clear(
|
| 352 |
-
fn = load_spec_for_manual_masking,
|
| 353 |
-
inputs = [audio_in_inp],
|
| 354 |
-
outputs = [spec_for_mask]
|
| 355 |
-
)
|
| 356 |
|
| 357 |
submit_btn_inp.click(
|
| 358 |
fn = infer_inp,
|
| 359 |
-
inputs = [prompt_inp, audio_in_inp,
|
| 360 |
-
outputs = [audio_out_inp]
|
| 361 |
)
|
| 362 |
|
| 363 |
# Enable the request queue on the Blocks app, then start serving it.
# show_api=False / show_error=True are forwarded to launch() unchanged.
demo.queue().launch(show_api=False, show_error=True)
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch, os
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
from PIL import Image
|
| 5 |
import matplotlib.pyplot as plt
|
|
|
|
| 127 |
|
| 128 |
return "output.wav", input_spec_image_path, output_spec_image_path
|
| 129 |
|
| 130 |
+
def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
|
| 133 |
dtype = torch.float16
|
|
|
|
| 142 |
pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
|
| 143 |
pipe = pipe.to(device)
|
| 144 |
|
| 145 |
+
width_start, width = mask_start_point, mask_end_point-mask_start_point
|
| 146 |
prompt = "A siren ringing with a vehicle speeding closer"
|
| 147 |
seed = 42
|
| 148 |
|
|
|
|
| 156 |
raw_image = image_add_color(torch_to_pil(norm_spec))
|
| 157 |
|
| 158 |
# Add Mask
|
| 159 |
+
mask = torch.zeros_like(norm_spec)[:1,...]
|
| 160 |
+
mask[:, :, width_start:width_start+width] = 1
|
| 161 |
+
mask_image = torch_to_pil(mask)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
+
mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
|
|
|
|
|
|
|
|
|
|
| 164 |
masked_spec_image = torch_to_pil(masked_spec)
|
| 165 |
|
| 166 |
# color masked spec and paint masked area to black
|
|
|
|
| 204 |
sf.write("output.wav", denorm_spec_audio, 16000)
|
| 205 |
|
| 206 |
# Save input spectrogram image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
input_spec_image_path = "input_spectrogram.png"
|
| 208 |
raw_image.save(input_spec_image_path)
|
| 209 |
|
| 210 |
+
# Save output spectrogram image
|
| 211 |
+
output_spec_image_path = "output_spectrogram.png"
|
| 212 |
+
color_output_spec_image.save(output_spec_image_path)
|
| 213 |
+
|
| 214 |
+
return "output.wav", input_spec_image_path, color_output_spec_image
|
| 215 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
css="""
|
| 218 |
div#col-container{
|
|
|
|
| 283 |
with gr.Tab("Audio InPainting"):
|
| 284 |
prompt_inp = gr.Textbox(label="Prompt")
|
| 285 |
audio_in_inp = gr.Audio(label="Audio Reference", type="filepath")
|
| 286 |
+
mask_start_point = gr.Slider(label="Mask Start point", minimum=0, maximum=1024, step=1, value=256)
|
| 287 |
+
mask_end_point = gr.Slider(label="Mask End point", minimum=0, maximum=1024, step=1, value=768)
|
| 288 |
submit_btn_inp = gr.Button("Submit")
|
| 289 |
audio_out_inp = gr.Audio(label="Audio Ressult")
|
| 290 |
|
|
|
|
| 292 |
with gr.Column():
|
| 293 |
input_spectrogram_inp = gr.Image(label="Input Spectrogram")
|
| 294 |
output_spectrogram_inp = gr.Image(label="Output Spectrogram")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
# Run in-painting when the tab's Submit button is clicked.
# infer_inp returns three values (audio path, input spectrogram, output
# spectrogram), so the two images are sent to this tab's own
# input_spectrogram_inp / output_spectrogram_inp components.
submit_btn_inp.click(
    fn=infer_inp,
    inputs=[prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
    outputs=[audio_out_inp, input_spectrogram_inp, output_spectrogram_inp],
)
|
| 301 |
|
| 302 |
# Enable the request queue on the Blocks app, then start serving it.
# show_api=False / show_error=True are forwarded to launch() unchanged.
demo.queue().launch(show_api=False, show_error=True)
|