DeepFilterNet3

Runtime error

App Files Files Community

humairmunirawn committed on Mar 12

Commit

3132eeb

verified ·

1 Parent(s): 3f3d91b

Update app.py

Browse files

Files changed (1) hide show

app.py +258 -19

app.py CHANGED Viewed

@@ -4,18 +4,20 @@ import os
 import tempfile
 import time
 from typing import List, Optional, Tuple, Union
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
 from loguru import logger
 from PIL import Image
 from torch import Tensor
-from torchaudio.backend.common import AudioMetaData
 from df import config
-from df.enhance import enhance, init_df, load_audio, save_audio
 from df.io import resample
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -40,6 +42,138 @@ NOISES = {
 }
 def mix_at_snr(clean, noise, snr, eps=1e-10):
     """Mix clean and noise signal at a given SNR.
@@ -67,7 +201,7 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
     K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
     noise = noise / K
     mixture = clean + noise
-    logger.debug("mixture: {mixture.shape}")
     assert torch.isfinite(mixture).all()
     max_m = mixture.abs().max()
     if max_m > 1:
@@ -79,36 +213,77 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
 def load_audio_gradio(
     audio_or_file: Union[None, str, Tuple[int, np.ndarray]], sr: int
 ) -> Optional[Tuple[Tensor, AudioMetaData]]:
     if audio_or_file is None:
         return None
     if isinstance(audio_or_file, str):
         if audio_or_file.lower() == "none":
             return None
-        # First try default format
         audio, meta = load_audio(audio_or_file, sr)
     else:
-        meta = AudioMetaData(-1, -1, -1, -1, "")
         assert isinstance(audio_or_file, (tuple, list))
-        meta.sample_rate, audio_np = audio_or_file
-        # Gradio documentation says, the shape is [samples, 2], but apparently sometimes its not.
         audio_np = audio_np.reshape(audio_np.shape[0], -1).T
         if audio_np.dtype == np.int16:
             audio_np = (audio_np / (1 << 15)).astype(np.float32)
         elif audio_np.dtype == np.int32:
             audio_np = (audio_np / (1 << 31)).astype(np.float32)
-        audio = resample(torch.from_numpy(audio_np), meta.sample_rate, sr)
     return audio, meta
 def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: Optional[str] = None):
     if mic_input:
         speech_upl = mic_input
     sr = config("sr", 48000, int, section="df")
     logger.info(f"Got parameters speech_upl: {speech_upl}, noise: {noise_type}, snr: {snr}")
     snr = int(snr)
     noise_fn = NOISES[noise_type]
     meta = AudioMetaData(-1, -1, -1, -1, "")
     max_s = 10  # limit to 10 seconds
     if speech_upl is not None:
         sample, meta = load_audio(speech_upl, sr)
         max_len = max_s * sr
@@ -118,39 +293,56 @@ def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: Optional[str]
     else:
         sample, meta = load_audio("samples/p232_013_clean.wav", sr)
         sample = sample[..., : max_s * sr]
     if sample.dim() > 1 and sample.shape[0] > 1:
         assert (
             sample.shape[1] > sample.shape[0]
         ), f"Expecting channels first, but got {sample.shape}"
         sample = sample.mean(dim=0, keepdim=True)
     logger.info(f"Loaded sample with shape {sample.shape}")
     if noise_fn is not None:
-        noise, _ = load_audio(noise_fn, sr)  # type: ignore
         logger.info(f"Loaded noise with shape {noise.shape}")
         _, _, sample = mix_at_snr(sample, noise, snr)
     logger.info("Start denoising audio")
     enhanced = enhance(model, df, sample)
     logger.info("Denoising finished")
     lim = torch.linspace(0.0, 1.0, int(sr * 0.15)).unsqueeze(0)
     lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - lim.shape[1])), dim=1)
     enhanced = enhanced * lim
     if meta.sample_rate != sr:
-        enhanced = resample(enhanced, sr, meta.sample_rate)
-        sample = resample(sample, sr, meta.sample_rate)
         sr = meta.sample_rate
-    noisy_wav = tempfile.NamedTemporaryFile(suffix="noisy.wav", delete=False).name
     save_audio(noisy_wav, sample, sr)
-    enhanced_wav = tempfile.NamedTemporaryFile(suffix="enhanced.wav", delete=False).name
     save_audio(enhanced_wav, enhanced, sr)
     logger.info(f"saved audios: {noisy_wav}, {enhanced_wav}")
     ax_noisy.clear()
     ax_enh.clear()
     noisy_im = spec_im(sample, sr=sr, figure=fig_noisy, ax=ax_noisy)
     enh_im = spec_im(enhanced, sr=sr, figure=fig_enh, ax=ax_enh)
     filter = [speech_upl, noisy_wav, enhanced_wav]
     if mic_input is not None and mic_input != "":
         filter.append(mic_input)
     cleanup_tmp(filter)
     return noisy_wav, noisy_im, enhanced_wav, enh_im
@@ -186,19 +378,25 @@ def specshow(
         set_ylabel = plt.ylabel
         set_xlim = plt.xlim
         set_ylim = plt.ylim
     if n_fft is None:
         if spec.shape[0] % 2 == 0:
             n_fft = spec.shape[0] * 2
         else:
             n_fft = (spec.shape[0] - 1) * 2
     hop = hop or n_fft // 4
     if t is None:
         t = np.arange(0, spec_np.shape[-1]) * hop / sr
     if f is None:
         f = np.arange(0, spec_np.shape[0]) * sr // 2 / (n_fft // 2) / 1000
     im = ax.pcolormesh(
         t, f, spec_np, rasterized=True, shading="auto", vmin=vmin, vmax=vmax, cmap=cmap
     )
     if title is not None:
         set_title(title)
     if xlabel is not None:
@@ -209,6 +407,7 @@ def specshow(
         set_xlim(xlim)
     if ylim is not None:
         set_ylim(ylim)
     return im
@@ -221,24 +420,44 @@ def spec_im(
     labels=True,
     **kwargs,
 ) -> Image:
     audio = torch.as_tensor(audio)
     if labels:
         kwargs.setdefault("xlabel", "Time [s]")
         kwargs.setdefault("ylabel", "Frequency [Hz]")
     n_fft = kwargs.setdefault("n_fft", 1024)
     hop = kwargs.setdefault("hop", 512)
     w = torch.hann_window(n_fft, device=audio.device)
     spec = torch.stft(audio, n_fft, hop, window=w, return_complex=False)
     spec = spec.div_(w.pow(2).sum())
     spec = torch.view_as_complex(spec).abs().clamp_min(1e-12).log10().mul(10)
     kwargs.setdefault("vmax", max(0.0, spec.max().item()))
     if figure is None:
         figure = plt.figure(figsize=figsize)
         figure.set_tight_layout(True)
     if spec.dim() > 2:
         spec = spec.squeeze(0)
     im = specshow(spec, **kwargs)
     if colorbar:
         ckwargs = {}
         if "ax" in kwargs:
@@ -247,13 +466,21 @@ def spec_im(
                     colorbar_format = "%+2.0f dB"
             ckwargs = {"ax": kwargs["ax"]}
         plt.colorbar(im, format=colorbar_format, **ckwargs)
     figure.canvas.draw()
     return Image.frombytes("RGB", figure.canvas.get_width_height(), figure.canvas.tostring_rgb())
 def cleanup_tmp(filter: List[str] = [], hours_keep=2):
     filter.append("p232")
     logger.info(f"Filter: {filter}")
     # Cleanup some old wav files
     if os.path.exists("/tmp"):
         for f in glob.glob("/tmp/*"):
@@ -269,21 +496,31 @@ def cleanup_tmp(filter: List[str] = [], hours_keep=2):
 def toggle(choice):
     if choice == "mic":
         return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
     else:
         return gr.update(visible=False, value=None), gr.update(visible=True, value=None)
 with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown(
             """
-            ## DeepFilterNet2 Demo\
             This demo denoises audio files using DeepFilterNet. Try it with your own voice!
             """
         )
     with gr.Row():
         with gr.Column():
             radio = gr.Radio(
@@ -306,17 +543,18 @@ with gr.Blocks() as demo:
                 mic_input,
             ]
             btn = gr.Button("Generate")
         with gr.Column():
             outputs = [
-                # gr.Video(type="filepath", label="Noisy audio"),
                 gr.Audio(type="filepath", label="Noisy audio"),
                 gr.Image(label="Noisy spectrogram"),
-                # gr.Video(type="filepath", label="Enhanced audio"),
                 gr.Audio(type="filepath", label="Enhanced audio"),
                 gr.Image(label="Enhanced spectrogram"),
             ]
     btn.click(fn=demo_fn, inputs=inputs, outputs=outputs, api_name='denoise')
     radio.change(toggle, radio, [mic_input, audio_file])
     gr.Examples(
         [
             ["./samples/p232_013_clean.wav", "Kitchen", "10"],
@@ -328,8 +566,9 @@ with gr.Blocks() as demo:
         inputs=inputs,
         outputs=outputs,
         cache_examples=True,
-    ),
     gr.Markdown(open("usage.md").read())
 cleanup_tmp()
-demo.launch(enable_queue=True)

 import tempfile
 import time
 from typing import List, Optional, Tuple, Union
+from dataclasses import dataclass
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
+import soundfile as sf
 from loguru import logger
 from PIL import Image
 from torch import Tensor
+from scipy import signal
 from df import config
+from df.enhance import enhance, init_df
 from df.io import resample
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 }
+@dataclass
+class AudioMetaData:
+    """Simple audio metadata container to replace torchaudio.backend.common.AudioMetaData"""
+    # Sample rate of the audio; load_audio overwrites it with the target rate after resampling.
+    sample_rate: int
+    # Frame count; load_audio fills it from sf.info(...).frames, placeholders use -1.
+    num_frames: int
+    # Channel count; load_audio derives it from the loaded array, placeholders use -1.
+    num_channels: int
+    # load_audio always stores -1 here because it does not read the bit depth.
+    bits_per_sample: int
+    # load_audio stores sf.info(...).format here; placeholders use "".
+    encoding: str
+def load_audio(file_path: str, sr: int) -> Tuple[Tensor, AudioMetaData]:
+    """Load audio file using soundfile and resample if necessary.
+    Args:
+        file_path: Path to audio file
+        sr: Target sample rate
+    Returns:
+        audio: Torch tensor of shape [channels, samples]
+        meta: AudioMetaData with file information; ``sample_rate`` is replaced
+            by ``sr`` when the audio was resampled, while ``num_frames`` keeps
+            the frame count reported for the file
+    Raises:
+        Exception: Any error raised while reading or resampling is logged
+            and re-raised unchanged.
+    """
+    try:
+        # Read audio using soundfile
+        audio_np, sample_rate = sf.read(file_path, dtype='float32')
+        # Handle mono/stereo
+        if audio_np.ndim == 1:
+            audio_np = audio_np[np.newaxis, :]  # Add channel dimension
+            num_channels = 1
+        else:
+            audio_np = audio_np.T  # Convert [samples, channels] to [channels, samples]
+            num_channels = audio_np.shape[0]
+        # Get file info for metadata
+        info = sf.info(file_path)
+        num_frames = info.frames
+        # Create metadata
+        meta = AudioMetaData(
+            sample_rate=sample_rate,
+            num_frames=num_frames,
+            num_channels=num_channels,
+            bits_per_sample=-1,  # Not directly available from soundfile
+            encoding=info.format
+        )
+        # Convert to torch tensor
+        audio = torch.from_numpy(audio_np).float()
+        # Resample if necessary
+        if sample_rate != sr:
+            audio = resample_audio(audio, sample_rate, sr)
+            # NOTE(review): once this is overwritten, callers that compare
+            # meta.sample_rate with sr (demo_fn does) can no longer see the
+            # file's original rate — confirm this is intended.
+            meta.sample_rate = sr
+        return audio, meta
+    except Exception as e:
+        logger.error(f"Error loading audio file {file_path}: {e}")
+        raise
+def save_audio(file_path: str, audio: Tensor, sr: int) -> None:
+    """Save audio tensor to file using soundfile.
+    The samples are cast to float32 and clipped to [-1.0, 1.0] before writing.
+    Args:
+        file_path: Output file path
+        audio: Audio tensor of shape [channels, samples] or [samples]
+        sr: Sample rate
+    Raises:
+        Exception: Any error raised while converting or writing is logged
+            and re-raised unchanged.
+    """
+    try:
+        # Convert tensor to numpy
+        audio_np = audio.cpu().numpy()
+        # Handle tensor shape
+        # (3-D input is presumably [1, channels, samples]; only axis 0 is dropped)
+        if audio_np.ndim == 3:
+            audio_np = audio_np.squeeze(0)
+        # Convert [channels, samples] to [samples, channels] for soundfile
+        if audio_np.ndim == 2:
+            audio_np = audio_np.T
+        # Ensure float32
+        audio_np = audio_np.astype(np.float32)
+        # Clip to valid range
+        audio_np = np.clip(audio_np, -1.0, 1.0)
+        # Save using soundfile
+        sf.write(file_path, audio_np, sr)
+        logger.info(f"Saved audio to {file_path}")
+    except Exception as e:
+        logger.error(f"Error saving audio to {file_path}: {e}")
+        raise
+def resample_audio(audio: Tensor, sr_orig: int, sr_target: int) -> Tensor:
+    """Resample audio using scipy.signal.resample_poly.
+    Args:
+        audio: Audio tensor of shape [channels, samples]
+        sr_orig: Original sample rate
+        sr_target: Target sample rate
+    Returns:
+        Resampled audio tensor
+    """
+    if sr_orig == sr_target:
+        return audio
+    # Convert to numpy for resampling
+    audio_np = audio.cpu().numpy()
+    # Calculate gcd for polyphase resampling
+    from math import gcd
+    g = gcd(sr_orig, sr_target)
+    up = sr_target // g
+    down = sr_orig // g
+    logger.debug(f"Resampling from {sr_orig} to {sr_target} (up={up}, down={down})")
+    # Resample each channel
+    if audio_np.ndim == 2:
+        resampled = np.zeros((audio_np.shape[0], int(audio_np.shape[1] * sr_target / sr_orig)))
+        for ch in range(audio_np.shape[0]):
+            resampled[ch] = signal.resample_poly(audio_np[ch], up, down)
+    else:
+        resampled = signal.resample_poly(audio_np, up, down)
+    return torch.from_numpy(resampled).float()
 def mix_at_snr(clean, noise, snr, eps=1e-10):
     """Mix clean and noise signal at a given SNR.
     K = torch.sqrt((E_noise / E_speech) * 10 ** (snr / 10) + eps)
     noise = noise / K
     mixture = clean + noise
+    logger.debug(f"mixture: {mixture.shape}")
     assert torch.isfinite(mixture).all()
     max_m = mixture.abs().max()
     if max_m > 1:
 def load_audio_gradio(
     audio_or_file: Union[None, str, Tuple[int, np.ndarray]], sr: int
 ) -> Optional[Tuple[Tensor, AudioMetaData]]:
+    """Load audio from file or gradio microphone input.
+    Args:
+        audio_or_file: Path to audio file, (sample_rate, samples) tuple from
+            gradio mic, or None; the string "none" (any case) is treated
+            like None
+        sr: Target sample rate
+    Returns:
+        Tuple of (audio tensor, metadata) or None. For tuple input the
+        metadata holds placeholder values except ``sample_rate``, which is
+        set to ``sr``.
+    """
     if audio_or_file is None:
         return None
     if isinstance(audio_or_file, str):
         if audio_or_file.lower() == "none":
             return None
+        # Load from file path
         audio, meta = load_audio(audio_or_file, sr)
     else:
+        # Handle gradio microphone input
+        meta = AudioMetaData(
+            sample_rate=-1,
+            num_frames=-1,
+            num_channels=-1,
+            bits_per_sample=-1,
+            encoding=""
+        )
         assert isinstance(audio_or_file, (tuple, list))
+        sample_rate, audio_np = audio_or_file
+        # Normalise to [channels, samples]: a 1-D [samples] array becomes
+        # [1, samples], a [samples, channels] array becomes [channels, samples]
         audio_np = audio_np.reshape(audio_np.shape[0], -1).T
+        # Scale int16/int32 PCM by its full-scale value; any other dtype is
+        # passed through unscaled and only cast to float32 below
         if audio_np.dtype == np.int16:
             audio_np = (audio_np / (1 << 15)).astype(np.float32)
         elif audio_np.dtype == np.int32:
             audio_np = (audio_np / (1 << 31)).astype(np.float32)
+        audio = torch.from_numpy(audio_np).float()
+        # Resample if necessary
+        if sample_rate != sr:
+            audio = resample_audio(audio, sample_rate, sr)
+        # The returned audio is always at sr here, so the metadata reports sr
+        meta.sample_rate = sr
     return audio, meta
 def demo_fn(speech_upl: str, noise_type: str, snr: int, mic_input: Optional[str] = None):
+    """Main demo function for audio denoising.
+    Args:
+        speech_upl: Path to uploaded speech file
+        noise_type: Type of noise to add
+        snr: Signal-to-noise ratio
+        mic_input: Path to microphone input file
+    Returns:
+        Tuple of (noisy_audio_path, noisy_spectrogram, enhanced_audio_path, enhanced_spectrogram)
+    """
     if mic_input:
         speech_upl = mic_input
     sr = config("sr", 48000, int, section="df")
     logger.info(f"Got parameters speech_upl: {speech_upl}, noise: {noise_type}, snr: {snr}")
     snr = int(snr)
     noise_fn = NOISES[noise_type]
     meta = AudioMetaData(-1, -1, -1, -1, "")
     max_s = 10  # limit to 10 seconds
     if speech_upl is not None:
         sample, meta = load_audio(speech_upl, sr)
         max_len = max_s * sr
     else:
         sample, meta = load_audio("samples/p232_013_clean.wav", sr)
         sample = sample[..., : max_s * sr]
     if sample.dim() > 1 and sample.shape[0] > 1:
         assert (
             sample.shape[1] > sample.shape[0]
         ), f"Expecting channels first, but got {sample.shape}"
         sample = sample.mean(dim=0, keepdim=True)
     logger.info(f"Loaded sample with shape {sample.shape}")
     if noise_fn is not None:
+        noise, _ = load_audio(noise_fn, sr)
         logger.info(f"Loaded noise with shape {noise.shape}")
         _, _, sample = mix_at_snr(sample, noise, snr)
     logger.info("Start denoising audio")
     enhanced = enhance(model, df, sample)
     logger.info("Denoising finished")
+    # Apply fade-in limiter
     lim = torch.linspace(0.0, 1.0, int(sr * 0.15)).unsqueeze(0)
     lim = torch.cat((lim, torch.ones(1, enhanced.shape[1] - lim.shape[1])), dim=1)
     enhanced = enhanced * lim
+    # Resample back to original sample rate if needed
     if meta.sample_rate != sr:
+        enhanced = resample_audio(enhanced, sr, meta.sample_rate)
+        sample = resample_audio(sample, sr, meta.sample_rate)
         sr = meta.sample_rate
+    # Save audio files
+    noisy_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
     save_audio(noisy_wav, sample, sr)
+    enhanced_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
     save_audio(enhanced_wav, enhanced, sr)
     logger.info(f"saved audios: {noisy_wav}, {enhanced_wav}")
+    # Generate spectrograms
     ax_noisy.clear()
     ax_enh.clear()
     noisy_im = spec_im(sample, sr=sr, figure=fig_noisy, ax=ax_noisy)
     enh_im = spec_im(enhanced, sr=sr, figure=fig_enh, ax=ax_enh)
+    # Cleanup temporary files (except the ones we want to return)
     filter = [speech_upl, noisy_wav, enhanced_wav]
     if mic_input is not None and mic_input != "":
         filter.append(mic_input)
     cleanup_tmp(filter)
     return noisy_wav, noisy_im, enhanced_wav, enh_im
         set_ylabel = plt.ylabel
         set_xlim = plt.xlim
         set_ylim = plt.ylim
     if n_fft is None:
         if spec.shape[0] % 2 == 0:
             n_fft = spec.shape[0] * 2
         else:
             n_fft = (spec.shape[0] - 1) * 2
     hop = hop or n_fft // 4
     if t is None:
         t = np.arange(0, spec_np.shape[-1]) * hop / sr
     if f is None:
         f = np.arange(0, spec_np.shape[0]) * sr // 2 / (n_fft // 2) / 1000
     im = ax.pcolormesh(
         t, f, spec_np, rasterized=True, shading="auto", vmin=vmin, vmax=vmax, cmap=cmap
     )
     if title is not None:
         set_title(title)
     if xlabel is not None:
         set_xlim(xlim)
     if ylim is not None:
         set_ylim(ylim)
     return im
     labels=True,
     **kwargs,
 ) -> Image:
+    """Convert audio to spectrogram image.
+    Args:
+        audio: Audio tensor
+        figsize: Figure size
+        colorbar: Whether to show colorbar
+        colorbar_format: Format for colorbar
+        figure: Matplotlib figure to use
+        labels: Whether to show axis labels
+        **kwargs: Additional arguments for specshow
+    Returns:
+        PIL Image of the spectrogram
+    """
     audio = torch.as_tensor(audio)
     if labels:
         kwargs.setdefault("xlabel", "Time [s]")
         kwargs.setdefault("ylabel", "Frequency [Hz]")
     n_fft = kwargs.setdefault("n_fft", 1024)
     hop = kwargs.setdefault("hop", 512)
     w = torch.hann_window(n_fft, device=audio.device)
     spec = torch.stft(audio, n_fft, hop, window=w, return_complex=False)
     spec = spec.div_(w.pow(2).sum())
     spec = torch.view_as_complex(spec).abs().clamp_min(1e-12).log10().mul(10)
     kwargs.setdefault("vmax", max(0.0, spec.max().item()))
     if figure is None:
         figure = plt.figure(figsize=figsize)
         figure.set_tight_layout(True)
     if spec.dim() > 2:
         spec = spec.squeeze(0)
     im = specshow(spec, **kwargs)
     if colorbar:
         ckwargs = {}
         if "ax" in kwargs:
                     colorbar_format = "%+2.0f dB"
             ckwargs = {"ax": kwargs["ax"]}
         plt.colorbar(im, format=colorbar_format, **ckwargs)
     figure.canvas.draw()
     return Image.frombytes("RGB", figure.canvas.get_width_height(), figure.canvas.tostring_rgb())
 def cleanup_tmp(filter: List[str] = [], hours_keep=2):
+    """Clean up old temporary files.
+    Args:
+        filter: List of file paths to keep (not delete)
+        hours_keep: Number of hours to keep files
+    """
     filter.append("p232")
     logger.info(f"Filter: {filter}")
     # Cleanup some old wav files
     if os.path.exists("/tmp"):
         for f in glob.glob("/tmp/*"):
 def toggle(choice):
+    """Toggle between microphone and file input.
+    Args:
+        choice: "mic" shows the first component; any other value shows the second
+    Returns:
+        Pair of gr.update objects, exactly one with visible=True; both
+        reset the component value to None
+    """
     if choice == "mic":
         return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
     else:
         return gr.update(visible=False, value=None), gr.update(visible=True, value=None)
+# Create Gradio interface
 with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown(
             """
+            ## DeepFilterNet2 Demo
             This demo denoises audio files using DeepFilterNet. Try it with your own voice!
             """
         )
     with gr.Row():
         with gr.Column():
             radio = gr.Radio(
                 mic_input,
             ]
             btn = gr.Button("Generate")
         with gr.Column():
             outputs = [
                 gr.Audio(type="filepath", label="Noisy audio"),
                 gr.Image(label="Noisy spectrogram"),
                 gr.Audio(type="filepath", label="Enhanced audio"),
                 gr.Image(label="Enhanced spectrogram"),
             ]
     btn.click(fn=demo_fn, inputs=inputs, outputs=outputs, api_name='denoise')
     radio.change(toggle, radio, [mic_input, audio_file])
     gr.Examples(
         [
             ["./samples/p232_013_clean.wav", "Kitchen", "10"],
         inputs=inputs,
         outputs=outputs,
         cache_examples=True,
+    )
     gr.Markdown(open("usage.md").read())
 cleanup_tmp()
+demo.launch(enable_queue=True)