Kaworu17 committed on
Commit
b781380
·
verified ·
1 Parent(s): e3cc8bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -119
app.py CHANGED
@@ -6,7 +6,6 @@ import time
6
  import gradio as gr
7
  from typing import List, Optional, Tuple, Union
8
 
9
- import gradio as gr
10
  import matplotlib.pyplot as plt
11
  import numpy as np
12
  import torch
@@ -42,19 +41,6 @@ NOISES = {
42
 
43
 
44
  def mix_at_snr(clean, noise, snr, eps=1e-10):
45
- """Mix clean and noise signal at a given SNR.
46
-
47
- Args:
48
- clean: 1D Tensor with the clean signal to mix.
49
- noise: 1D Tensor of shape.
50
- snr: Signal to noise ratio.
51
-
52
- Returns:
53
- clean: 1D Tensor with gain changed according to the snr.
54
- noise: 1D Tensor with the combined noise channels.
55
- mix: 1D Tensor with added clean and noise signals.
56
-
57
- """
58
  clean = torch.as_tensor(clean).mean(0, keepdim=True)
59
  noise = torch.as_tensor(noise).mean(0, keepdim=True)
60
  if noise.shape[1] < clean.shape[1]:
@@ -77,21 +63,17 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
77
  return clean, noise, mixture
78
 
79
 
80
- def load_audio_gradio(
81
- audio_or_file: Union[None, str, Tuple[int, np.ndarray]], sr: int
82
- ) -> Optional[Tuple[Tensor, AudioMetaData]]:
83
  if audio_or_file is None:
84
  return None
85
  if isinstance(audio_or_file, str):
86
  if audio_or_file.lower() == "none":
87
  return None
88
- # First try default format
89
  audio, meta = load_audio(audio_or_file, sr)
90
  else:
91
  meta = AudioMetaData(-1, -1, -1, -1, "")
92
  assert isinstance(audio_or_file, (tuple, list))
93
  meta.sample_rate, audio_np = audio_or_file
94
- # Gradio documentation says, the shape is [samples, 2], but apparently sometimes its not.
95
  audio_np = audio_np.reshape(audio_np.shape[0], -1).T
96
  if audio_np.dtype == np.int16:
97
  audio_np = (audio_np / (1 << 15)).astype(np.float32)
@@ -109,7 +91,7 @@ def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: Optional[str]
109
  snr = int(snr)
110
  noise_fn = NOISES[noise_type]
111
  meta = AudioMetaData(-1, -1, -1, -1, "")
112
- max_s = 10 # limit to 10 seconds
113
  if speech_upl is not None:
114
  sample, meta = load_audio(speech_upl, sr)
115
  max_len = max_s * sr
@@ -120,13 +102,11 @@ def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: Optional[str]
120
  sample, meta = load_audio("samples/p232_013_clean.wav", sr)
121
  sample = sample[..., : max_s * sr]
122
  if sample.dim() > 1 and sample.shape[0] > 1:
123
- assert (
124
- sample.shape[1] > sample.shape[0]
125
- ), f"Expecting channels first, but got {sample.shape}"
126
  sample = sample.mean(dim=0, keepdim=True)
127
  logger.info(f"Loaded sample with shape {sample.shape}")
128
  if noise_fn is not None:
129
- noise, _ = load_audio(noise_fn, sr) # type: ignore
130
  logger.info(f"Loaded noise with shape {noise.shape}")
131
  _, _, sample = mix_at_snr(sample, noise, snr)
132
  logger.info("Start denoising audio")
@@ -155,24 +135,47 @@ def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: Optional[str]
155
  return noisy_wav, noisy_im, enhanced_wav, enh_im
156
 
157
 
158
- def specshow(
159
- spec,
160
- ax=None,
161
- title=None,
162
- xlabel=None,
163
- ylabel=None,
164
- sr=48000,
165
- n_fft=None,
166
- hop=None,
167
- t=None,
168
- f=None,
169
- vmin=-100,
170
- vmax=0,
171
- xlim=None,
172
- ylim=None,
173
- cmap="inferno",
174
- ):
175
- """Plots a spectrogram of shape [F, T]"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  spec_np = spec.cpu().numpy() if isinstance(spec, torch.Tensor) else spec
177
  if ax is not None:
178
  set_title = ax.set_title
@@ -188,18 +191,11 @@ def specshow(
188
  set_xlim = plt.xlim
189
  set_ylim = plt.ylim
190
  if n_fft is None:
191
- if spec.shape[0] % 2 == 0:
192
- n_fft = spec.shape[0] * 2
193
- else:
194
- n_fft = (spec.shape[0] - 1) * 2
195
  hop = hop or n_fft // 4
196
- if t is None:
197
- t = np.arange(0, spec_np.shape[-1]) * hop / sr
198
- if f is None:
199
- f = np.arange(0, spec_np.shape[0]) * sr // 2 / (n_fft // 2) / 1000
200
- im = ax.pcolormesh(
201
- t, f, spec_np, rasterized=True, shading="auto", vmin=vmin, vmax=vmax, cmap=cmap
202
- )
203
  if title is not None:
204
  set_title(title)
205
  if xlabel is not None:
@@ -213,15 +209,7 @@ def specshow(
213
  return im
214
 
215
 
216
- def spec_im(
217
- audio: torch.Tensor,
218
- figsize=(15, 5),
219
- colorbar=False,
220
- colorbar_format=None,
221
- figure=None,
222
- labels=True,
223
- **kwargs,
224
- ) -> Image:
225
  audio = torch.as_tensor(audio)
226
  if labels:
227
  kwargs.setdefault("xlabel", "Time [s]")
@@ -233,7 +221,6 @@ def spec_im(
233
  spec = spec.div_(w.pow(2).sum())
234
  spec = torch.view_as_complex(spec).abs().clamp_min(1e-12).log10().mul(10)
235
  kwargs.setdefault("vmax", max(0.0, spec.max().item()))
236
-
237
  if figure is None:
238
  figure = plt.figure(figsize=figsize)
239
  figure.set_tight_layout(True)
@@ -252,85 +239,41 @@ def spec_im(
252
  return Image.frombytes("RGB", figure.canvas.get_width_height(), figure.canvas.tostring_rgb())
253
 
254
 
255
- def cleanup_tmp(filter: List[str] = [], hours_keep=2):
256
- filter.append("p232")
257
- logger.info(f"Filter: {filter}")
258
- # Cleanup some old wav files
259
- if os.path.exists("/tmp"):
260
- for f in glob.glob("/tmp/*"):
261
- print(f"Got file {f}")
262
- is_old = (time.time() - os.path.getmtime(f)) / 3600 > hours_keep
263
- filtered = any(filt in f for filt in filter if filt is not None)
264
- if is_old and not filtered:
265
- try:
266
- os.remove(f)
267
- logger.info(f"Removed file {f}")
268
- except Exception as e:
269
- logger.warning(f"failed to remove file {f}: {e}")
270
-
271
-
272
- def toggle(choice):
273
- if choice == "mic":
274
- return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
275
- else:
276
- return gr.update(visible=False, value=None), gr.update(visible=True, value=None)
277
-
278
-
279
  with gr.Blocks() as demo:
280
  with gr.Row():
281
- gr.Markdown(
282
- """
283
  ## DeepFilterNet2 Demo\
284
-
285
  This demo denoises audio files using DeepFilterNet. Try it with your own voice!
286
- """
287
- )
288
  with gr.Row():
289
  with gr.Column():
290
- radio = gr.Radio(
291
- ["mic", "file"], value="file", label="How would you like to upload your audio?"
292
- )
293
  mic_input = gr.Mic(label="Input", type="filepath", visible=False)
294
  audio_file = gr.Audio(type="filepath", label="Input", visible=True)
295
  inputs = [
296
  audio_file,
297
- gr.Dropdown(
298
- label="Add background noise",
299
- choices=list(NOISES.keys()),
300
- value="None",
301
- ),
302
- gr.Dropdown(
303
- label="Noise Level (SNR)",
304
- choices=["-5", "0", "10", "20"],
305
- value="10",
306
- ),
307
  mic_input,
308
  ]
309
  btn = gr.Button("Generate")
310
  with gr.Column():
311
  outputs = [
312
- # gr.Video(type="filepath", label="Noisy audio"),
313
  gr.Audio(type="filepath", label="Noisy audio"),
314
  gr.Image(label="Noisy spectrogram"),
315
- # gr.Video(type="filepath", label="Enhanced audio"),
316
  gr.Audio(type="filepath", label="Enhanced audio"),
317
  gr.Image(label="Enhanced spectrogram"),
318
  ]
319
  btn.click(fn=demo_fn, inputs=inputs, outputs=outputs, api_name='denoise')
320
  radio.change(toggle, radio, [mic_input, audio_file])
321
- gr.Examples(
322
- [
323
- ["./samples/p232_013_clean.wav", "Kitchen", "10"],
324
- ["./samples/p232_013_clean.wav", "Cafe", "10"],
325
- ["./samples/p232_019_clean.wav", "Cafe", "10"],
326
- ["./samples/p232_019_clean.wav", "River", "10"],
327
- ],
328
- fn=demo_fn,
329
- inputs=inputs,
330
- outputs=outputs,
331
- cache_examples=True,
332
- ),
333
  gr.Markdown(open("usage.md").read())
334
 
335
  cleanup_tmp()
336
  demo.launch()
 
 
6
  import gradio as gr
7
  from typing import List, Optional, Tuple, Union
8
 
 
9
  import matplotlib.pyplot as plt
10
  import numpy as np
11
  import torch
 
41
 
42
 
43
  def mix_at_snr(clean, noise, snr, eps=1e-10):
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  clean = torch.as_tensor(clean).mean(0, keepdim=True)
45
  noise = torch.as_tensor(noise).mean(0, keepdim=True)
46
  if noise.shape[1] < clean.shape[1]:
 
63
  return clean, noise, mixture
64
 
65
 
66
+ def load_audio_gradio(audio_or_file: Union[None, str, Tuple[int, np.ndarray]], sr: int) -> Optional[Tuple[Tensor, AudioMetaData]]:
 
 
67
  if audio_or_file is None:
68
  return None
69
  if isinstance(audio_or_file, str):
70
  if audio_or_file.lower() == "none":
71
  return None
 
72
  audio, meta = load_audio(audio_or_file, sr)
73
  else:
74
  meta = AudioMetaData(-1, -1, -1, -1, "")
75
  assert isinstance(audio_or_file, (tuple, list))
76
  meta.sample_rate, audio_np = audio_or_file
 
77
  audio_np = audio_np.reshape(audio_np.shape[0], -1).T
78
  if audio_np.dtype == np.int16:
79
  audio_np = (audio_np / (1 << 15)).astype(np.float32)
 
91
  snr = int(snr)
92
  noise_fn = NOISES[noise_type]
93
  meta = AudioMetaData(-1, -1, -1, -1, "")
94
+ max_s = 10
95
  if speech_upl is not None:
96
  sample, meta = load_audio(speech_upl, sr)
97
  max_len = max_s * sr
 
102
  sample, meta = load_audio("samples/p232_013_clean.wav", sr)
103
  sample = sample[..., : max_s * sr]
104
  if sample.dim() > 1 and sample.shape[0] > 1:
105
+ assert sample.shape[1] > sample.shape[0], f"Expecting channels first, but got {sample.shape}"
 
 
106
  sample = sample.mean(dim=0, keepdim=True)
107
  logger.info(f"Loaded sample with shape {sample.shape}")
108
  if noise_fn is not None:
109
+ noise, _ = load_audio(noise_fn, sr)
110
  logger.info(f"Loaded noise with shape {noise.shape}")
111
  _, _, sample = mix_at_snr(sample, noise, snr)
112
  logger.info("Start denoising audio")
 
135
  return noisy_wav, noisy_im, enhanced_wav, enh_im
136
 
137
 
138
def denoise_api(audio_file_path: str, noise_type: str = "None", snr: int = 10):
    """Denoise an audio file, optionally mixing in background noise first.

    Args:
        audio_file_path: Path of the audio file handed to ``load_audio``.
        noise_type: Key into ``NOISES``. The literal string ``"None"`` skips
            the noise-mixing step entirely.
        snr: Signal-to-noise ratio forwarded to ``mix_at_snr``. It is coerced
            with ``int()`` (as ``demo_fn`` does) so string values such as
            ``"10"`` are accepted too.

    Returns:
        A dict with the keys ``"enhanced_audio"`` (path of the enhanced wav),
        ``"noisy_audio"`` (path of the wav that was fed to the model),
        ``"snr"`` (the integer SNR used) and ``"status"`` (always ``"done"``).

    Raises:
        KeyError: If ``noise_type`` is neither ``"None"`` nor a key of ``NOISES``.
        ValueError: If ``snr`` cannot be converted to ``int``.
    """
    snr = int(snr)
    sr = config("sr", 48000, int, section="df")
    sample, _ = load_audio(audio_file_path, sr)

    noise = None if noise_type == "None" else load_audio(NOISES[noise_type], sr)[0]
    if noise is not None:
        _, _, sample_mix = mix_at_snr(sample, noise, snr)
    else:
        sample_mix = sample

    enhanced = enhance(model, df, sample_mix)

    # Only the unique file names are needed; close the handles right away so
    # no descriptor stays open while save_audio writes to the same paths.
    with tempfile.NamedTemporaryFile(suffix="noisy.wav", delete=False) as tmp:
        noisy_wav = tmp.name
    with tempfile.NamedTemporaryFile(suffix="enhanced.wav", delete=False) as tmp:
        enhanced_wav = tmp.name

    save_audio(noisy_wav, sample_mix, sr)
    save_audio(enhanced_wav, enhanced, sr)
    return {
        "enhanced_audio": enhanced_wav,
        # Expose the noisy file as well; otherwise it is written and orphaned.
        "noisy_audio": noisy_wav,
        "snr": snr,
        "status": "done",
    }
153
+
154
+
155
def toggle(choice):
    """Build the visibility updates for the microphone and file inputs.

    Args:
        choice: Selected radio value; ``"mic"`` shows the microphone input,
            anything else shows the file input.

    Returns:
        A ``(mic_update, file_update)`` tuple of ``gr.update`` results. Exactly
        one of the two is visible and both have their value reset to ``None``.
    """
    use_mic = choice == "mic"
    mic_update = gr.update(visible=use_mic, value=None)
    file_update = gr.update(visible=not use_mic, value=None)
    return mic_update, file_update
160
+
161
+
162
def cleanup_tmp(filter: Optional[List[str]] = None, hours_keep=2):
    """Delete stale entries directly under ``/tmp``, sparing protected names.

    Args:
        filter: Substrings that protect a path from deletion. ``None`` entries
            are ignored. The argument is never modified; ``"p232"`` is always
            added to the protected set internally.
        hours_keep: Entries modified less than this many hours ago are kept.

    Failures to stat or remove an entry are logged as warnings and do not
    abort the cleanup.
    """
    # Work on a copy: the previous mutable default (`[]`) was appended to on
    # every call, so it grew without bound and also mutated the caller's list.
    protected = [] if filter is None else list(filter)
    protected.append("p232")
    logger.info(f"Filter: {protected}")

    if not os.path.exists("/tmp"):
        return

    for f in glob.glob("/tmp/*"):
        print(f"Got file {f}")
        if any(filt in f for filt in protected if filt is not None):
            continue
        try:
            # /tmp is shared, so the entry may vanish between glob and stat;
            # keep the mtime lookup inside the try as well.
            is_old = (time.time() - os.path.getmtime(f)) / 3600 > hours_keep
            if is_old:
                os.remove(f)
                logger.info(f"Removed file {f}")
        except OSError as e:
            logger.warning(f"failed to remove file {f}: {e}")
176
+
177
+
178
+ def specshow(spec, ax=None, title=None, xlabel=None, ylabel=None, sr=48000, n_fft=None, hop=None, t=None, f=None, vmin=-100, vmax=0, xlim=None, ylim=None, cmap="inferno"):
179
  spec_np = spec.cpu().numpy() if isinstance(spec, torch.Tensor) else spec
180
  if ax is not None:
181
  set_title = ax.set_title
 
191
  set_xlim = plt.xlim
192
  set_ylim = plt.ylim
193
  if n_fft is None:
194
+ n_fft = spec.shape[0] * 2 if spec.shape[0] % 2 == 0 else (spec.shape[0] - 1) * 2
 
 
 
195
  hop = hop or n_fft // 4
196
+ t = np.arange(0, spec_np.shape[-1]) * hop / sr if t is None else t
197
+ f = np.arange(0, spec_np.shape[0]) * sr // 2 / (n_fft // 2) / 1000 if f is None else f
198
+ im = ax.pcolormesh(t, f, spec_np, rasterized=True, shading="auto", vmin=vmin, vmax=vmax, cmap=cmap)
 
 
 
 
199
  if title is not None:
200
  set_title(title)
201
  if xlabel is not None:
 
209
  return im
210
 
211
 
212
+ def spec_im(audio: torch.Tensor, figsize=(15, 5), colorbar=False, colorbar_format=None, figure=None, labels=True, **kwargs) -> Image:
 
 
 
 
 
 
 
 
213
  audio = torch.as_tensor(audio)
214
  if labels:
215
  kwargs.setdefault("xlabel", "Time [s]")
 
221
  spec = spec.div_(w.pow(2).sum())
222
  spec = torch.view_as_complex(spec).abs().clamp_min(1e-12).log10().mul(10)
223
  kwargs.setdefault("vmax", max(0.0, spec.max().item()))
 
224
  if figure is None:
225
  figure = plt.figure(figsize=figsize)
226
  figure.set_tight_layout(True)
 
239
  return Image.frombytes("RGB", figure.canvas.get_width_height(), figure.canvas.tostring_rgb())
240
 
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  with gr.Blocks() as demo:
243
  with gr.Row():
244
+ gr.Markdown("""
 
245
  ## DeepFilterNet2 Demo\
 
246
  This demo denoises audio files using DeepFilterNet. Try it with your own voice!
247
+ """)
 
248
  with gr.Row():
249
  with gr.Column():
250
+ radio = gr.Radio(["mic", "file"], value="file", label="How would you like to upload your audio?")
 
 
251
  mic_input = gr.Mic(label="Input", type="filepath", visible=False)
252
  audio_file = gr.Audio(type="filepath", label="Input", visible=True)
253
  inputs = [
254
  audio_file,
255
+ gr.Dropdown(label="Add background noise", choices=list(NOISES.keys()), value="None"),
256
+ gr.Dropdown(label="Noise Level (SNR)", choices=["-5", "0", "10", "20"], value="10"),
 
 
 
 
 
 
 
 
257
  mic_input,
258
  ]
259
  btn = gr.Button("Generate")
260
  with gr.Column():
261
  outputs = [
 
262
  gr.Audio(type="filepath", label="Noisy audio"),
263
  gr.Image(label="Noisy spectrogram"),
 
264
  gr.Audio(type="filepath", label="Enhanced audio"),
265
  gr.Image(label="Enhanced spectrogram"),
266
  ]
267
  btn.click(fn=demo_fn, inputs=inputs, outputs=outputs, api_name='denoise')
268
  radio.change(toggle, radio, [mic_input, audio_file])
269
+ gr.Examples([
270
+ ["./samples/p232_013_clean.wav", "Kitchen", "10"],
271
+ ["./samples/p232_013_clean.wav", "Cafe", "10"],
272
+ ["./samples/p232_019_clean.wav", "Cafe", "10"],
273
+ ["./samples/p232_019_clean.wav", "River", "10"],
274
+ ], fn=demo_fn, inputs=inputs, outputs=outputs, cache_examples=True)
 
 
 
 
 
 
275
  gr.Markdown(open("usage.md").read())
276
 
277
  cleanup_tmp()
278
  demo.launch()
279
+