DeepFilterNet2

Running

App Files Files Community

ongudidan committed on Oct 3, 2025

Commit

cb1e8bf

verified ·

1 Parent(s): a030856

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -53

app.py CHANGED Viewed

@@ -8,12 +8,6 @@ from typing import List, Optional, Tuple, Union
 import subprocess
 # import os
-# import torch
-# import numpy as np
-# import tempfile
-# from typing import Optional
-# import gradio as gr
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
@@ -109,90 +103,82 @@ def load_audio_gradio(
     return audio, meta
-def chunk_audio(sample: torch.Tensor, chunk_size: int):
-    """Yield chunks of audio of size `chunk_size`."""
-    total_len = sample.shape[-1]
-    for start in range(0, total_len, chunk_size):
-        end = min(start + chunk_size, total_len)
-        yield sample[..., start:end], start, total_len
-def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: Optional[str] = None, progress=gr.Progress()):
     if mic_input:
         speech_upl = mic_input
     sr = config("sr", 48000, int, section="df")
     snr = int(snr)
     noise_fn = NOISES[noise_type]
     meta = AudioMetaData(-1, -1, -1, -1, "")
-    max_s = 3600  # 1 hour
-    chunk_s = 10  # process in 10-second chunks
-    chunk_len = chunk_s * sr
-    # Load audio
-    speech_upl = ensure_wav(speech_upl)
-    sample, meta = load_audio(speech_upl, sr)
-    # Limit to max_s
-    if sample.shape[-1] > max_s * sr:
-        start_idx = torch.randint(0, sample.shape[-1] - max_s*sr, ()).item()
-        sample = sample[..., start_idx:start_idx + max_s*sr]
-    # Convert to mono if needed
     if sample.dim() > 1 and sample.shape[0] > 1:
         sample = sample.mean(dim=0, keepdim=True)
-    # Mix noise if applicable
     if noise_fn is not None:
-        noise, _ = load_audio(noise_fn, sr)
         _, _, sample = mix_at_snr(sample, noise, snr)
-    # Prepare output tensor
-    enhanced_chunks = []
-    # Process audio in chunks
-    for i, (chunk, start, total_len) in enumerate(chunk_audio(sample, chunk_len)):
-        # Denoise the chunk
-        enhanced_chunk = enhance(model, df, chunk)
-        enhanced_chunks.append(enhanced_chunk)
-        # Update progress
-        progress((start + chunk.shape[-1]) / total_len * 100, desc="Denoising audio...")
-    # Concatenate all chunks
-    enhanced = torch.cat(enhanced_chunks, dim=-1)
-    # Optional: apply fade or limiter
     lim = torch.linspace(0.0, 1.0, int(sr * 0.15)).unsqueeze(0)
     lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - lim.shape[1])), dim=1)
     enhanced = enhanced * lim
-    # Resample if needed
     if meta.sample_rate != sr:
         enhanced = resample(enhanced, sr, meta.sample_rate)
         sample = resample(sample, sr, meta.sample_rate)
         sr = meta.sample_rate
-    # Save outputs
     noisy_wav = tempfile.NamedTemporaryFile(suffix="noisy.wav", delete=False).name
-    enhanced_wav = tempfile.NamedTemporaryFile(suffix="enhanced.wav", delete=False).name
     save_audio(noisy_wav, sample, sr)
     save_audio(enhanced_wav, enhanced, sr)
-    # Spectrograms
     ax_noisy.clear()
     ax_enh.clear()
     noisy_im = spec_im(sample, sr=sr, figure=fig_noisy, ax=ax_noisy)
     enh_im = spec_im(enhanced, sr=sr, figure=fig_enh, ax=ax_enh)
-    cleanup_tmp([speech_upl, noisy_wav, enhanced_wav])
-    return noisy_wav, noisy_im, enhanced_wav, enh_im
 def specshow(
     spec,
@@ -373,7 +359,7 @@ with gr.Blocks() as demo:
 cleanup_tmp()
 # demo.launch(enable_queue=True)
-demo.launch()
-# demo.queue().launch()

 import subprocess
 # import os
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
     return audio, meta
+def ensure_wav(filepath: str) -> str:
+    """Convert an MP3 file to WAV using ffmpeg; paths with any other extension are returned unchanged."""
+    if filepath.lower().endswith(".mp3"):
+        wav_path = filepath.rsplit(".", 1)[0] + ".wav"
+        subprocess.run(["ffmpeg", "-y", "-i", filepath, wav_path], check=True)
+        return wav_path
+    return filepath
+def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: Optional[str] = None):
     if mic_input:
         speech_upl = mic_input
     sr = config("sr", 48000, int, section="df")
+    logger.info(f"Got parameters speech_upl: {speech_upl}, noise: {noise_type}, snr: {snr}")
     snr = int(snr)
     noise_fn = NOISES[noise_type]
     meta = AudioMetaData(-1, -1, -1, -1, "")
+    max_s = 3600  # allow up to 1 hour (3600 seconds)
+    if speech_upl is not None:
+        # ✅ Ensure compatible WAV input
+        speech_upl = ensure_wav(speech_upl)
+        sample, meta = load_audio(speech_upl, sr)
+        max_len = max_s * sr
+        if sample.shape[-1] > max_len:
+            start = torch.randint(0, sample.shape[-1] - max_len, ()).item()
+            sample = sample[..., start : start + max_len]
+    else:
+        sample, meta = load_audio("samples/p232_013_clean.wav", sr)
+        sample = sample[..., : max_s * sr]
     if sample.dim() > 1 and sample.shape[0] > 1:
+        assert sample.shape[1] > sample.shape[0], f"Expecting channels first, but got {sample.shape}"
         sample = sample.mean(dim=0, keepdim=True)
+    logger.info(f"Loaded sample with shape {sample.shape}")
     if noise_fn is not None:
+        noise, _ = load_audio(noise_fn, sr)  # type: ignore
+        logger.info(f"Loaded noise with shape {noise.shape}")
         _, _, sample = mix_at_snr(sample, noise, snr)
+    logger.info("Start denoising audio")
+    enhanced = enhance(model, df, sample)
+    logger.info("Denoising finished")
     lim = torch.linspace(0.0, 1.0, int(sr * 0.15)).unsqueeze(0)
     lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - lim.shape[1])), dim=1)
     enhanced = enhanced * lim
     if meta.sample_rate != sr:
         enhanced = resample(enhanced, sr, meta.sample_rate)
         sample = resample(sample, sr, meta.sample_rate)
         sr = meta.sample_rate
     noisy_wav = tempfile.NamedTemporaryFile(suffix="noisy.wav", delete=False).name
     save_audio(noisy_wav, sample, sr)
+    enhanced_wav = tempfile.NamedTemporaryFile(suffix="enhanced.wav", delete=False).name
     save_audio(enhanced_wav, enhanced, sr)
+    logger.info(f"saved audios: {noisy_wav}, {enhanced_wav}")
     ax_noisy.clear()
     ax_enh.clear()
     noisy_im = spec_im(sample, sr=sr, figure=fig_noisy, ax=ax_noisy)
     enh_im = spec_im(enhanced, sr=sr, figure=fig_enh, ax=ax_enh)
+    filter = [speech_upl, noisy_wav, enhanced_wav]
+    if mic_input is not None and mic_input != "":
+        filter.append(mic_input)
+    cleanup_tmp(filter)
+    return noisy_wav, noisy_im, enhanced_wav, enh_im
 def specshow(
     spec,
 cleanup_tmp()
 # demo.launch(enable_queue=True)
+# demo.launch()
+demo.queue().launch()