Spaces:
Runtime error
Runtime error
Hendrik Schroeter
committed on
Force mono audio with max len of 10s
Browse files
app.py
CHANGED
|
@@ -41,7 +41,7 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
|
|
| 41 |
if noise.shape[1] < clean.shape[1]:
|
| 42 |
noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
|
| 43 |
max_start = int(noise.shape[1] - clean.shape[1])
|
| 44 |
-
start = torch.randint(0, max_start, ()).item()
|
| 45 |
logger.debug(f"start: {start}, {clean.shape}")
|
| 46 |
noise = noise[:, start : start + clean.shape[1]]
|
| 47 |
E_speech = torch.mean(clean.pow(2)) + eps
|
|
@@ -92,6 +92,7 @@ def mix_and_denoise(
|
|
| 92 |
if noise_fn is None:
|
| 93 |
noise_fn = "samples/dkitchen.wav"
|
| 94 |
meta = AudioMetaData(-1, -1, -1, -1, "")
|
|
|
|
| 95 |
if speech_rec is None and speech_upl is None:
|
| 96 |
speech, meta = load_audio("samples/p232_013_clean.wav", sr)
|
| 97 |
elif speech_upl is not None:
|
|
@@ -100,6 +101,12 @@ def mix_and_denoise(
|
|
| 100 |
tmp = load_audio_gradio(speech_rec, sr)
|
| 101 |
assert tmp is not None
|
| 102 |
speech, meta = tmp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
logger.info(f"Loaded speech with shape {speech.shape}")
|
| 104 |
noise, _ = load_audio(noise_fn, sr) # type: ignore
|
| 105 |
if meta.sample_rate != sr:
|
|
|
|
| 41 |
if noise.shape[1] < clean.shape[1]:
|
| 42 |
noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
|
| 43 |
max_start = int(noise.shape[1] - clean.shape[1])
|
| 44 |
+
start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
|
| 45 |
logger.debug(f"start: {start}, {clean.shape}")
|
| 46 |
noise = noise[:, start : start + clean.shape[1]]
|
| 47 |
E_speech = torch.mean(clean.pow(2)) + eps
|
|
|
|
| 92 |
if noise_fn is None:
|
| 93 |
noise_fn = "samples/dkitchen.wav"
|
| 94 |
meta = AudioMetaData(-1, -1, -1, -1, "")
|
| 95 |
+
max_s = 10 # limit to 10 seconds
|
| 96 |
if speech_rec is None and speech_upl is None:
|
| 97 |
speech, meta = load_audio("samples/p232_013_clean.wav", sr)
|
| 98 |
elif speech_upl is not None:
|
|
|
|
| 101 |
tmp = load_audio_gradio(speech_rec, sr)
|
| 102 |
assert tmp is not None
|
| 103 |
speech, meta = tmp
|
| 104 |
+
if speech.dim() > 1 and speech.shape[0] > 1:
|
| 105 |
+
assert (
|
| 106 |
+
speech.shape[1] > speech.shape[0]
|
| 107 |
+
), f"Expecting channels first, but got {speech.shape}"
|
| 108 |
+
speech = speech.mean(dim=0, keepdim=True)
|
| 109 |
+
speech = speech[..., : max_s * sr]
|
| 110 |
logger.info(f"Loaded speech with shape {speech.shape}")
|
| 111 |
noise, _ = load_audio(noise_fn, sr) # type: ignore
|
| 112 |
if meta.sample_rate != sr:
|
usage.md
CHANGED
|
@@ -4,6 +4,7 @@ This demo takes a speech sample and a noise sample and mixes them at the provide
|
|
| 4 |
You can either record a speech sample or alternatively provide one via upload.
|
| 5 |
Furthermore, you may upload a noise sample which will be mixed with the speech sample.
|
| 6 |
If no samples are provided, a default will be used.
|
|
|
|
| 7 |
|
| 8 |
DeepFilterNet [(link)](https://github.com/Rikorose/DeepFilterNet) is used to denoise the noisy mixture.
|
| 9 |
|
|
|
|
| 4 |
You can either record a speech sample or alternatively provide one via upload.
|
| 5 |
Furthermore, you may upload a noise sample which will be mixed with the speech sample.
|
| 6 |
If no samples are provided, a default will be used.
|
| 7 |
+
Speech samples longer than 10 s will be trimmed to their first 10 s.
|
| 8 |
|
| 9 |
DeepFilterNet [(link)](https://github.com/Rikorose/DeepFilterNet) is used to denoise the noisy mixture.
|
| 10 |
|