Spaces:

hshr
/

DeepFilterNet

Runtime error

Hendrik Schroeter committed on Jun 7, 2022

Commit

4f235e3

unverified ·

1 Parent(s): 897b496

Force mono audio with max len of 10s

Files changed (2) hide show

app.py CHANGED Viewed

@@ -41,7 +41,7 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
     if noise.shape[1] < clean.shape[1]:
         noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
     max_start = int(noise.shape[1] - clean.shape[1])
-    start = torch.randint(0, max_start, ()).item()
     logger.debug(f"start: {start}, {clean.shape}")
     noise = noise[:, start : start + clean.shape[1]]
     E_speech = torch.mean(clean.pow(2)) + eps
@@ -92,6 +92,7 @@ def mix_and_denoise(
     if noise_fn is None:
         noise_fn = "samples/dkitchen.wav"
     meta = AudioMetaData(-1, -1, -1, -1, "")
     if speech_rec is None and speech_upl is None:
         speech, meta = load_audio("samples/p232_013_clean.wav", sr)
     elif speech_upl is not None:
@@ -100,6 +101,12 @@ def mix_and_denoise(
         tmp = load_audio_gradio(speech_rec, sr)
         assert tmp is not None
         speech, meta = tmp
     logger.info(f"Loaded speech with shape {speech.shape}")
     noise, _ = load_audio(noise_fn, sr)  # type: ignore
     if meta.sample_rate != sr:

     if noise.shape[1] < clean.shape[1]:
         noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
     max_start = int(noise.shape[1] - clean.shape[1])
+    start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
     logger.debug(f"start: {start}, {clean.shape}")
     noise = noise[:, start : start + clean.shape[1]]
     E_speech = torch.mean(clean.pow(2)) + eps
     if noise_fn is None:
         noise_fn = "samples/dkitchen.wav"
     meta = AudioMetaData(-1, -1, -1, -1, "")
+    max_s = 10  # limit to 10 seconds
     if speech_rec is None and speech_upl is None:
         speech, meta = load_audio("samples/p232_013_clean.wav", sr)
     elif speech_upl is not None:
         tmp = load_audio_gradio(speech_rec, sr)
         assert tmp is not None
         speech, meta = tmp
+    if speech.dim() > 1 and speech.shape[0] > 1:
+        assert (
+            speech.shape[1] > speech.shape[0]
+        ), f"Expecting channels first, but got {speech.shape}"
+        speech = speech.mean(dim=0, keepdim=True)
+        speech = speech[..., : max_s * sr]
     logger.info(f"Loaded speech with shape {speech.shape}")
     noise, _ = load_audio(noise_fn, sr)  # type: ignore
     if meta.sample_rate != sr:

usage.md CHANGED Viewed

@@ -4,6 +4,7 @@ This demo takes a speech sample and a noise sample and mixes them at the provide
 You can either record a speech sample or alternatively provide one via upload.
 Furthermore, you may upload a noise sample which will be mixed with the speech sample.
 If no samples are provided, a default will be used.
 DeepFilterNet [(link)](https://github.com/Rikorose/DeepFilterNet) is used to denoise the noisy mixture.

 You can either record a speech sample or alternatively provide one via upload.
 Furthermore, you may upload a noise sample which will be mixed with the speech sample.
 If no samples are provided, a default will be used.
+Long audio samples will be trimmed to 10s.
 DeepFilterNet [(link)](https://github.com/Rikorose/DeepFilterNet) is used to denoise the noisy mixture.