Spaces:
Running on Zero
Running on Zero
Upload 9 files
Browse files- app.py +150 -0
- exp/30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_peak_GAN_tel_mic/g_01134000.pth +3 -0
- models/codec_module_time_d4.py +168 -0
- models/generator_SEMamba_time_d4.py +91 -0
- models/mamba_block2_SEMamba.py +81 -0
- models/stfts.py +95 -0
- recipes/USEMamba_30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_001.yaml +44 -0
- requirements.txt +14 -0
- utils/util.py +37 -0
app.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
import shlex
|
| 10 |
+
import subprocess
|
| 11 |
+
import spaces
|
| 12 |
+
import gradio as gr
|
| 13 |
+
|
| 14 |
+
def install_mamba():
    """Install the prebuilt mamba-ssm CUDA wheel at runtime.

    The Space image has no CUDA build toolchain, so a matching prebuilt wheel
    (cu122 / torch 2.3 / cp310) is installed instead of building from source.

    Raises:
        subprocess.CalledProcessError: if pip fails. Failing fast here is
        deliberate — otherwise the later `from mamba_ssm import Mamba` fails
        with a much more confusing error.
    """
    subprocess.run(
        shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.2/mamba_ssm-2.2.2+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"),
        check=True,
    )
install_mamba()
|
| 17 |
+
|
| 18 |
+
ABOUT = """
|
| 19 |
+
# RE-USE: A universal speech enhancement model for diverse degradations, sampling rates, and languages.
|
| 20 |
+
Upload or record a noisy clip, then click **Enhance** to listen to the result and view its spectrogram.
|
| 21 |
+
(ref: https://huggingface.co/spaces/rc19477/Speech_Enhancement_Mamba)
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import torch
import torchaudio
import librosa
import librosa.display  # explicit: bare `import librosa` does not expose `.display` on older librosa versions
import matplotlib.pyplot as plt
import numpy as np

from models.stfts import mag_phase_stft, mag_phase_istft
# BUG FIX: this commit ships models/generator_SEMamba_time_d4.py; the previous
# module name "generator_SEMamba_MPSEnet_time_d4" does not exist in the repo
# and crashed the app at import time.
from models.generator_SEMamba_time_d4 import SEMamba
from utils.util import load_config, pad_or_trim_to_match
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def make_even(value):
    """Round *value* to the nearest integer, then bump odd results up by one.

    Used to keep derived STFT sizes (n_fft / hop / win) even after scaling
    them to the input sampling rate.
    """
    rounded = int(round(value))
    if rounded % 2:
        rounded += 1
    return rounded
|
| 37 |
+
|
| 38 |
+
device = "cuda"

# Load the recipe used at training time; its STFT settings are the reference
# values that get rescaled per input sampling rate inside `enhance`.
cfg1 = load_config('recipes/USEMamba_30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_001.yaml')
stft_cfg = cfg1['stft_cfg']
n_fft, hop_size, win_size = stft_cfg['n_fft'], stft_cfg['hop_size'], stft_cfg['win_size']
compress_factor = cfg1['model_cfg']['compress_factor']
sampling_rate = stft_cfg['sampling_rate']

# Build the generator once at startup and restore the released checkpoint.
USE_model = SEMamba(cfg1).to(device)
checkpoint_file = "exp/30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_peak_GAN_tel_mic/g_01134000.pth"
state_dict = torch.load(checkpoint_file, map_location=device)
USE_model.load_state_dict(state_dict['generator'])
USE_model.eval()
|
| 49 |
+
|
| 50 |
+
@spaces.GPU
def enhance(filepath, low_pass_sampling_rate, target_sampling_rate):
    """Enhance one audio file and return paths + a spectrogram figure.

    Args:
        filepath: path to the input audio file.
        low_pass_sampling_rate: optional textbox string; when non-empty the
            signal is first resampled down to this rate (pre-low-pass) before
            bandwidth extension.
        target_sampling_rate: optional textbox string; when non-empty the
            signal is resampled to this rate before enhancement.

    Returns:
        ("original.wav", "enhanced.wav", matplotlib Figure) — file paths for
        the Gradio audio players plus the noisy/enhanced spectrogram plot.
    """
    # BUG FIX: `librosa.display` is not guaranteed to be importable as an
    # attribute of `librosa` on older versions; import it explicitly.
    import librosa.display

    with torch.no_grad():
        noisy_wav, noisy_sr = torchaudio.load(filepath)
        torchaudio.save("original.wav", noisy_wav.cpu(), noisy_sr)
        original_noisy_wav = noisy_wav
        original_sr = noisy_sr

        # ROBUSTNESS: truthiness instead of `!= ''` also guards against None.
        if target_sampling_rate:
            if low_pass_sampling_rate:
                opts = {"res_type": "kaiser_best"}
                noisy_wav = torch.tensor(librosa.resample(noisy_wav.cpu().numpy(), orig_sr=noisy_sr, target_sr=int(low_pass_sampling_rate), **opts))
                noisy_sr = int(low_pass_sampling_rate)
            opts = {"res_type": "kaiser_best"}
            noisy_wav = librosa.resample(noisy_wav.cpu().numpy(), orig_sr=noisy_sr, target_sr=int(target_sampling_rate), **opts)
            noisy_sr = int(target_sampling_rate)

        noisy_wav = torch.FloatTensor(noisy_wav).to(device)

        # Sampling-frequency-independent mode: rescale the training-time STFT
        # parameters to the current rate, keeping them even for the FFT.
        n_fft_scaled = make_even(n_fft * noisy_sr // sampling_rate)
        hop_size_scaled = make_even(hop_size * noisy_sr // sampling_rate)
        win_size_scaled = make_even(win_size * noisy_sr // sampling_rate)

        noisy_mag, noisy_pha, noisy_com = mag_phase_stft(
            noisy_wav,
            n_fft=n_fft_scaled,
            hop_size=hop_size_scaled,
            win_size=win_size_scaled,
            compress_factor=compress_factor,
            center=True,
            addeps=False
        )
        amp_g, pha_g, _ = USE_model(noisy_mag, noisy_pha)

        audio_g = mag_phase_istft(amp_g, pha_g, n_fft_scaled, hop_size_scaled, win_size_scaled, compress_factor)
        audio_g = pad_or_trim_to_match(noisy_wav.detach(), audio_g, pad_value=1e-8)  # Align lengths using epsilon padding
        assert audio_g.shape == noisy_wav.shape, audio_g.shape

        # write file
        torchaudio.save("enhanced.wav", audio_g.cpu(), noisy_sr)

        # spectrograms (fixed analysis settings, independent of the model STFT)
        fig, axs = plt.subplots(1, 2, figsize=(16, 4))

        # noisy (plotted at the ORIGINAL rate, before any resampling)
        D_noisy = librosa.stft(original_noisy_wav[0].cpu().numpy(), n_fft=512, hop_length=256)
        S_noisy = librosa.amplitude_to_db(np.abs(D_noisy), ref=np.max)
        librosa.display.specshow(S_noisy, sr=original_sr, hop_length=256, x_axis="time", y_axis="hz", ax=axs[0], vmax=0)
        axs[0].set_title("Noisy Spectrogram")

        # enhanced
        D_clean = librosa.stft(audio_g.cpu().numpy(), n_fft=512, hop_length=256)
        S_clean = librosa.amplitude_to_db(np.abs(D_clean), ref=np.max)
        librosa.display.specshow(S_clean[0], sr=noisy_sr, hop_length=256, x_axis="time", y_axis="hz", ax=axs[1], vmax=0)
        axs[1].set_title("Enhanced Spectrogram")

        plt.tight_layout()
        return "original.wav", "enhanced.wav", fig
|
| 108 |
+
|
| 109 |
+
with gr.Blocks() as demo:
    gr.Markdown(ABOUT)
    gr.Markdown("**Note 1**: For bandwidth extension, the performance may be affected by the characteristics of the input data, particularly the cutoff pattern. A simple solution is to apply low-pass filtering beforehand.")
    gr.Markdown("**Note 2**: When processing long input audio, out-of-memory (OOM) errors may occur. To address this, use the chunk-wise inference implementation provided on the Hugging Face.")

    with gr.Row():
        with gr.Column():
            # Tabs separate the audio-upload and video-upload entry points.
            with gr.Tabs():
                with gr.TabItem("Audio Upload"):
                    # gr.Audio works great for standard audio files
                    input_audio = gr.Audio(label="Input Audio", type="filepath")

                with gr.TabItem("Video Upload (.mp4, .mov)"):
                    # gr.File handles .mp4 and .mov without errors
                    input_video = gr.File(label="Input Video", file_types=[".mp4", ".mov"])

            target_sampling_rate = gr.Textbox(label="(Optional) Enter target sampling rate for bandwidth extension:")
            low_pass_sampling_rate = gr.Textbox(label="(Optional) Enter target sampling rate for pre-low-pass filtering before bandwidth extension:")

            enhance_btn = gr.Button("Enhance")

    with gr.Row():
        input_audio_player = gr.Audio(label="Original Input Audio", type="filepath")
        output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
    plot_output = gr.Plot(label="Spectrograms")

    def unified_enhance(audio_path, video_path, lp_sr, target_sr):
        """Dispatch whichever tab was used (audio beats video) to `enhance`."""
        if audio_path:
            final_path = audio_path
        else:
            # BUG FIX: depending on the Gradio version, gr.File may deliver a
            # filepath string or a tempfile wrapper exposing `.name`; the old
            # code passed the wrapper straight through.
            final_path = getattr(video_path, "name", video_path)
        return enhance(final_path, lp_sr, target_sr)

    enhance_btn.click(
        fn=unified_enhance,
        inputs=[input_audio, input_video, low_pass_sampling_rate, target_sampling_rate],
        outputs=[input_audio_player, output_audio, plot_output]
    )

demo.queue().launch(share=True)
|
exp/30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_peak_GAN_tel_mic/g_01134000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e27db9e1de904eb59fc627dea72c69da7ca25650a3e704b4096f89812b395fe5
|
| 3 |
+
size 38982886
|
models/codec_module_time_d4.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import numpy as np
|
| 12 |
+
from einops import rearrange
|
| 13 |
+
|
| 14 |
+
def get_padding_2d(kernel_size, dilation=(1, 1)):
    """
    Compute "same"-style padding for a 2D dilated convolution.

    Args:
    - kernel_size (tuple): (height, width) of the convolutional kernel.
    - dilation (tuple, optional): (height, width) dilation rates. Defaults to (1, 1).

    Returns:
    - tuple: (pad_height, pad_width).
    """
    pad_h = (dilation[0] * (kernel_size[0] - 1)) // 2
    pad_w = (dilation[1] * (kernel_size[1] - 1)) // 2
    return (pad_h, pad_w)
|
| 27 |
+
|
| 28 |
+
class SPConvTranspose2d(nn.Module):
    """
    Sub-pixel "transposed" convolution: a regular conv produces r * out_channels
    feature maps, which are then interleaved along the width axis to upsample
    the last dimension by a factor of r (pixel-shuffle style).

    Attribute names (pad1, conv) are part of the checkpoint state_dict and
    must not change.
    """

    def __init__(self, in_channels, out_channels, kernel_size, r=1):
        super(SPConvTranspose2d, self).__init__()
        # Pad only the width axis by 1 on each side so the (1, 3) kernel keeps W.
        self.pad1 = nn.ConstantPad2d((1, 1, 0, 0), value=0.)
        self.out_channels = out_channels
        self.conv = nn.Conv2d(in_channels, out_channels * r, kernel_size=kernel_size, stride=(1, 1))
        self.r = r

    def forward(self, x):
        conv_out = self.conv(self.pad1(x))
        bsz, total_ch, height, width = conv_out.shape
        groups = total_ch // self.r
        # Split channels into (r, groups) and interleave the r copies along width.
        conv_out = conv_out.reshape(bsz, self.r, groups, height, width)
        conv_out = conv_out.permute(0, 2, 3, 4, 1)
        return conv_out.reshape(bsz, groups, height, width * self.r)
|
| 44 |
+
|
| 45 |
+
class DenseBlock(nn.Module):
    """
    Densely connected stack of dilated convolutions: layer i receives the
    channel-wise concatenation of the input and all previous layer outputs,
    with time-axis dilation doubling each layer (1, 2, 4, ...).
    """

    def __init__(self, cfg, kernel_size=(3, 3), depth=4):
        super(DenseBlock, self).__init__()
        self.cfg = cfg
        self.depth = depth
        self.hid_feature = cfg['model_cfg']['hid_feature']
        hid = self.hid_feature

        layers = []
        for i in range(depth):
            dil = 2 ** i
            layers.append(nn.Sequential(
                # input channels grow by `hid` each layer due to dense concat
                nn.Conv2d(hid * (i + 1), hid, kernel_size,
                          dilation=(dil, 1), padding=get_padding_2d(kernel_size, (dil, 1))),
                nn.InstanceNorm2d(hid, affine=True),
                nn.PReLU(hid),
            ))
        self.dense_block = nn.ModuleList(layers)

    def forward(self, x):
        stacked = x
        out = x
        for layer in self.dense_block:
            out = layer(stacked)
            stacked = torch.cat([out, stacked], dim=1)
        return out
|
| 72 |
+
|
| 73 |
+
class DenseEncoder(nn.Module):
    """
    Encoder: 1x1 projection to the hidden width, a dense dilated block, then
    a strided conv that downsamples (stride (4, 2) on the last two axes).

    Attribute names (dense_conv_1, dense_block, dense_conv_2) are part of the
    checkpoint state_dict and must not change.
    """

    def __init__(self, cfg):
        super(DenseEncoder, self).__init__()
        self.cfg = cfg
        self.input_channel = cfg['model_cfg']['input_channel']
        self.hid_feature = cfg['model_cfg']['hid_feature']
        hid = self.hid_feature

        self.dense_conv_1 = nn.Sequential(
            nn.Conv2d(self.input_channel, hid, (1, 1)),
            nn.InstanceNorm2d(hid, affine=True),
            nn.PReLU(hid),
        )

        self.dense_block = DenseBlock(cfg, depth=4)

        self.dense_conv_2 = nn.Sequential(
            nn.Conv2d(hid, hid, (1, 3), stride=(4, 2)),
            nn.InstanceNorm2d(hid, affine=True),
            nn.PReLU(hid),
        )

    def forward(self, x):
        # project -> dense features -> strided downsample
        return self.dense_conv_2(self.dense_block(self.dense_conv_1(x)))
|
| 102 |
+
|
| 103 |
+
class MagDecoder(nn.Module):
    """
    Magnitude decoder: dense block, two sub-pixel upsampling stages (one per
    spatial axis), and a 1x1 output conv.

    Attribute names are part of the checkpoint state_dict and must not change.
    """

    def __init__(self, cfg):
        super(MagDecoder, self).__init__()
        self.dense_block = DenseBlock(cfg, depth=4)
        self.hid_feature = cfg['model_cfg']['hid_feature']
        self.output_channel = cfg['model_cfg']['output_channel']
        self.n_fft = cfg['stft_cfg']['n_fft']
        self.beta = cfg['model_cfg']['beta']
        hid = self.hid_feature

        self.up_conv1 = nn.Sequential(
            SPConvTranspose2d(hid, hid, (1, 3), 2),
            nn.InstanceNorm2d(hid, affine=True),
            nn.PReLU(hid),
        )

        self.up_conv2 = nn.Sequential(
            SPConvTranspose2d(hid, hid, (1, 3), 4),
            nn.InstanceNorm2d(hid, affine=True),
            nn.PReLU(hid),
        )

        self.final_conv = nn.Conv2d(hid, self.output_channel, (1, 1))

    def forward(self, x):
        feats = self.dense_block(x)
        feats = self.up_conv1(feats)
        # up_conv2 upsamples the other spatial axis: swap dims 2/3, apply, swap back
        feats = self.up_conv2(feats.transpose(2, 3)).transpose(2, 3)
        return self.final_conv(feats)
|
| 135 |
+
|
| 136 |
+
class PhaseDecoder(nn.Module):
    """
    Phase decoder: mirrors MagDecoder's upsampling path, but emits a phase
    spectrum via atan2 of separate real/imag 1x1 conv heads.

    Attribute names are part of the checkpoint state_dict and must not change.
    """

    def __init__(self, cfg):
        super(PhaseDecoder, self).__init__()
        self.dense_block = DenseBlock(cfg, depth=4)
        self.hid_feature = cfg['model_cfg']['hid_feature']
        self.output_channel = cfg['model_cfg']['output_channel']
        hid = self.hid_feature

        self.up_conv1 = nn.Sequential(
            SPConvTranspose2d(hid, hid, (1, 3), 2),
            nn.InstanceNorm2d(hid, affine=True),
            nn.PReLU(hid),
        )

        self.up_conv2 = nn.Sequential(
            SPConvTranspose2d(hid, hid, (1, 3), 4),
            nn.InstanceNorm2d(hid, affine=True),
            nn.PReLU(hid),
        )

        self.phase_conv_r = nn.Conv2d(hid, self.output_channel, (1, 1))
        self.phase_conv_i = nn.Conv2d(hid, self.output_channel, (1, 1))

    def forward(self, x):
        feats = self.dense_block(x)
        feats = self.up_conv1(feats)
        # up_conv2 upsamples the other spatial axis: swap dims 2/3, apply, swap back
        feats = self.up_conv2(feats.transpose(2, 3)).transpose(2, 3)
        real = self.phase_conv_r(feats)
        imag = self.phase_conv_i(feats)
        # atan2 keeps the output in (-pi, pi], a valid phase
        return torch.atan2(imag, real)
|
models/generator_SEMamba_time_d4.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
from einops import rearrange
|
| 12 |
+
from .mamba_block2_SEMamba import TFMambaBlock
|
| 13 |
+
from .codec_module_time_d4 import DenseEncoder, MagDecoder, PhaseDecoder
|
| 14 |
+
|
| 15 |
+
class SEMamba(nn.Module):
    """
    SEMamba model for speech enhancement using Mamba blocks.

    This model uses a dense encoder, multiple Mamba blocks, and separate magnitude
    and phase decoders to process noisy magnitude and phase inputs.
    """
    def __init__(self, cfg):
        """
        Initialize the SEMamba model.

        Args:
        - cfg: Configuration object containing model parameters.
        """
        super(SEMamba, self).__init__()
        self.cfg = cfg
        self.num_tscblocks = cfg['model_cfg']['num_tfmamba'] if cfg['model_cfg']['num_tfmamba'] is not None else 4 # default tfmamba: 4

        # Initialize dense encoder
        self.dense_encoder = DenseEncoder(cfg)

        # Initialize Mamba blocks
        self.TSMamba = nn.ModuleList([TFMambaBlock(cfg) for _ in range(self.num_tscblocks)])

        # Initialize decoders
        self.mask_decoder = MagDecoder(cfg)
        self.phase_decoder = PhaseDecoder(cfg)

    def forward(self, noisy_mag, noisy_pha):
        """
        Forward pass for the SEMamba model.

        Args:
        - noisy_mag (torch.Tensor): Noisy magnitude input tensor [B, F, T].
        - noisy_pha (torch.Tensor): Noisy phase input tensor [B, F, T].

        Returns:
        - denoised_mag (torch.Tensor): Denoised magnitude tensor [B, F, T].
        - denoised_pha (torch.Tensor): Denoised phase tensor [B, F, T].
        - denoised_com (torch.Tensor): Denoised complex tensor [B, F, T, 2].
        """
        # Reshape inputs
        noisy_mag = rearrange(noisy_mag, 'b f t -> b t f').unsqueeze(1) # [B, 1, T, F]
        noisy_pha = rearrange(noisy_pha, 'b f t -> b t f').unsqueeze(1) # [B, 1, T, F]

        # Concatenate magnitude and phase inputs
        x = torch.cat((noisy_mag, noisy_pha), dim=1) # [B, 2, T, F]

        # Prevent unpredictable errors
        # NOTE(review): zero-pad 2 extra columns (freq) and 2 extra rows (time)
        # before the strided encoder so arbitrary T/F survive the down/upsample
        # round trip; the crop after the decoders removes the excess. The crop
        # below assumes the decoders return at least the padded T and F —
        # presumably guaranteed by the encoder/decoder stride pairing; confirm.
        B, C, T, F = x.shape
        zeros = torch.zeros(B, C, T, 2, device=x.device)
        x = torch.cat((x, zeros), dim=-1)
        zeros = torch.zeros(B, C, 2, F+2, device=x.device)
        x = torch.cat((x, zeros), dim=-2)

        # Encode input
        x = self.dense_encoder(x)

        # Apply Mamba blocks
        for block in self.TSMamba:
            x = block(x)

        # Decode output
        denoised_mag = rearrange(self.mask_decoder(x), 'b c t f -> b f t c').squeeze(-1)
        denoised_pha = rearrange(self.phase_decoder(x), 'b c t f -> b f t c').squeeze(-1)

        # Prevent unpredictable errors
        # Crop back to the caller's original (F, T) after the padding above.
        denoised_mag = denoised_mag[:, :F, :T]
        denoised_pha = denoised_pha[:, :F, :T]

        # Combine denoised magnitude and phase into a complex representation
        denoised_com = torch.stack(
            (denoised_mag * torch.cos(denoised_pha), denoised_mag * torch.sin(denoised_pha)),
            dim=-1
        )

        return denoised_mag, denoised_pha, denoised_com
|
models/mamba_block2_SEMamba.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
from torch.nn import init
|
| 13 |
+
from torch.nn.parameter import Parameter
|
| 14 |
+
from functools import partial
|
| 15 |
+
from einops import rearrange
|
| 16 |
+
from mamba_ssm import Mamba
|
| 17 |
+
|
| 18 |
+
class MambaBlock(nn.Module):
    """
    Bidirectional Mamba over a [B, T, D] sequence: residual forward and
    backward passes are concatenated, projected back to D, and layer-normed.

    Attribute names (forward_blocks, backward_blocks, output_proj, norm) are
    part of the checkpoint state_dict and must not change.
    """

    def __init__(self, d_model, cfg):
        super(MambaBlock, self).__init__()

        model_cfg = cfg['model_cfg']
        d_state = model_cfg['d_state']    # 16
        d_conv = model_cfg['d_conv']      # 4
        expand = model_cfg['expand']      # 4

        self.forward_blocks = Mamba(d_model=d_model, d_state=d_state, d_conv=d_conv, expand=expand)
        self.backward_blocks = Mamba(d_model=d_model, d_state=d_state, d_conv=d_conv, expand=expand)
        self.output_proj = nn.Linear(2 * d_model, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        # x: [B, T, D]
        fwd = x + self.forward_blocks(x)

        # Backward direction: run on the time-reversed sequence, then undo the flip.
        x_rev = torch.flip(x, dims=[1])
        bwd = torch.flip(x_rev + self.backward_blocks(x_rev), dims=[1])

        combined = torch.cat((fwd, bwd), dim=-1)
        return self.norm(self.output_proj(combined))
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class TFMambaBlock(nn.Module):
    """
    Temporal-Frequency Mamba block.

    Applies one bidirectional MambaBlock along the time axis (treating each
    frequency bin as an independent sequence) and one along the frequency axis
    (treating each frame as a sequence), each with a residual connection.
    """

    def __init__(self, cfg):
        super(TFMambaBlock, self).__init__()
        self.cfg = cfg
        self.hid_feature = cfg['model_cfg']['hid_feature']

        # One Mamba per axis; attribute names are checkpoint state_dict keys.
        self.time_mamba = MambaBlock(d_model=self.hid_feature, cfg=cfg)
        self.freq_mamba = MambaBlock(d_model=self.hid_feature, cfg=cfg)

    def forward(self, x):
        """
        Args:
            x (Tensor): input of shape (batch, channels, time, freq).

        Returns:
            Tensor: same shape, after time- then frequency-axis Mamba passes.
        """
        batch, ch, t_len, f_len = x.size()

        # Time axis: fold freq into the batch dimension -> [B*F, T, C]
        seq = x.permute(0, 3, 2, 1).reshape(batch * f_len, t_len, ch)
        seq = self.time_mamba(seq) + seq

        # Frequency axis: fold time into the batch dimension -> [B*T, F, C]
        seq = seq.reshape(batch, f_len, t_len, ch).permute(0, 2, 1, 3).reshape(batch * t_len, f_len, ch)
        seq = self.freq_mamba(seq) + seq

        # Restore (batch, channels, time, freq)
        return seq.reshape(batch, t_len, f_len, ch).permute(0, 3, 1, 2)
|
models/stfts.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
|
| 12 |
+
def decompress_signed_log1p(y):
    """Invert the signed log1p compression: sign(y) * (exp(|y|) - 1)."""
    return y.sign() * y.abs().expm1()
|
| 14 |
+
|
| 15 |
+
RELU = nn.ReLU()
|
| 16 |
+
|
| 17 |
+
def mag_phase_stft(y, n_fft, hop_size, win_size, compress_factor=1.0, center=True, addeps=False):
    """
    Compute magnitude and phase using STFT.

    Args:
        y (torch.Tensor): Input audio signal.
        n_fft (int): FFT size.
        hop_size (int): Hop size.
        win_size (int): Window size.
        compress_factor (float or str, optional): Magnitude compression. A float
            applies power-law compression mag**compress_factor; the strings
            'log1p', 'relu_log1p', 'signed_log1p' apply log1p compression.
            Defaults to 1.0 (no compression).
        center (bool, optional): Whether to center the signal before padding. Defaults to True.
        addeps (bool, optional): Whether to add a small epsilon when deriving
            magnitude and phase (numerically safer gradients). Defaults to False.

    Returns:
        tuple: Magnitude, phase, and complex representation of the STFT.
    """
    eps = 1e-10
    hann_window = torch.hann_window(win_size).to(y.device)
    stft_spec = torch.stft(
        y, n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window,
        center=center,
        pad_mode='reflect',
        normalized=False,
        return_complex=True)

    # IDIOM: `not addeps` instead of `addeps == False`
    if not addeps:
        mag = torch.abs(stft_spec)
        pha = torch.angle(stft_spec)
    else:
        real_part = stft_spec.real
        imag_part = stft_spec.imag
        mag = torch.sqrt(real_part.pow(2) + imag_part.pow(2) + eps)
        pha = torch.atan2(imag_part + eps, real_part + eps)

    # Compress the magnitude
    if compress_factor in ['log1p', 'relu_log1p', 'signed_log1p']:
        mag = torch.log1p(mag)
    else:
        mag = torch.pow(mag, compress_factor)

    com = torch.stack((mag * torch.cos(pha), mag * torch.sin(pha)), dim=-1)
    return mag, pha, com
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def mag_phase_istft(mag, pha, n_fft, hop_size, win_size, compress_factor=1.0, center=True):
    """
    Inverse STFT: undo the magnitude compression applied by mag_phase_stft,
    rebuild the complex spectrum from magnitude and phase, and synthesize audio.

    Args:
        mag (torch.Tensor): (compressed) magnitude of the STFT.
        pha (torch.Tensor): Phase of the STFT.
        n_fft (int): FFT size.
        hop_size (int): Hop size.
        win_size (int): Window size.
        compress_factor (float or str, optional): Must match the value used for
            the forward transform. Defaults to 1.0.
        center (bool, optional): Whether the forward transform was centered. Defaults to True.

    Returns:
        torch.Tensor: Reconstructed audio signal.
    """
    # Undo magnitude compression (inverse of mag_phase_stft's compression step).
    if compress_factor == 'log1p':
        mag = torch.expm1(mag)
    elif compress_factor == 'signed_log1p':
        mag = decompress_signed_log1p(mag)
    elif compress_factor == 'relu_log1p':
        mag = torch.expm1(torch.relu(mag))
    else:
        # ReLU guards against small negative magnitudes from the network.
        mag = torch.pow(torch.relu(mag), 1.0 / compress_factor)

    spec = torch.complex(mag * torch.cos(pha), mag * torch.sin(pha))
    window = torch.hann_window(win_size).to(spec.device)
    return torch.istft(
        spec,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center)
|
recipes/USEMamba_30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_001.yaml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment Settings
|
| 2 |
+
# These settings specify the hardware and distributed setup for the model training.
|
| 3 |
+
# Adjust `num_gpus` and `dist_config` according to your distributed training environment.
|
| 4 |
+
env_setting:
|
| 5 |
+
num_gpus: 8 # Number of GPUs. Now we don't support CPU mode.
|
| 6 |
+
  num_workers: 20 # Number of worker threads for data loading.
|
| 7 |
+
  persistent_workers: True # If you have large RAM, set this to True.
|
| 8 |
+
prefetch_factor: 8 # null
|
| 9 |
+
seed: 1234 # Seed for random number generators to ensure reproducibility.
|
| 10 |
+
stdout_interval: 5000
|
| 11 |
+
checkpoint_interval: 5000 # save model to ckpt every N steps
|
| 12 |
+
validation_interval: 5000
|
| 13 |
+
dist_cfg:
|
| 14 |
+
dist_backend: nccl # Distributed training backend, 'nccl' for NVIDIA GPUs.
|
| 15 |
+
dist_url: tcp://localhost:19478 # URL for initializing distributed training.
|
| 16 |
+
world_size: 1 # Total number of processes in the distributed training.
|
| 17 |
+
pin_memory: True # If you have large RAM, turn this to be True
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# STFT Configuration
|
| 21 |
+
# Configuration for Short-Time Fourier Transform (STFT), crucial for audio processing models.
|
| 22 |
+
stft_cfg:
|
| 23 |
+
sampling_rate: 8000 # Audio sampling rate in Hz.
|
| 24 |
+
n_fft: 320 # FFT components for transforming audio signals.
|
| 25 |
+
hop_size: 40 # Samples between successive frames.
|
| 26 |
+
win_size: 320 # Window size used in FFT.
|
| 27 |
+
  sfi: True # Sampling Frequency Independent
|
| 28 |
+
|
| 29 |
+
# Model Configuration
|
| 30 |
+
# Defines the architecture specifics of the model, including layer configurations and feature compression.
|
| 31 |
+
model_cfg:
|
| 32 |
+
hid_feature: 64 # Channels in dense layers.
|
| 33 |
+
compress_factor: relu_log1p # Compression factor applied to extracted features.
|
| 34 |
+
num_tfmamba: 30 # Number of Time-Frequency Mamba (TFMamba) blocks in the model.
|
| 35 |
+
d_state: 16 # Dimensionality of the state vector in Mamba blocks.
|
| 36 |
+
d_conv: 4 # Convolutional layer dimensionality within Mamba blocks.
|
| 37 |
+
expand: 4 # Expansion factor for the layers within the Mamba blocks.
|
| 38 |
+
norm_epsilon: 0.00001 # Numerical stability in normalization layers within the Mamba blocks.
|
| 39 |
+
beta: 2.0 # Hyperparameter for the Learnable Sigmoid function.
|
| 40 |
+
input_channel: 2 # Magnitude and Phase
|
| 41 |
+
output_channel: 1 # Single Channel Speech Enhancement
|
| 42 |
+
inner_mamba_nlayer: 1 # Number of layer of Mamba in Bidirectional Mamba
|
| 43 |
+
nonlinear: None # last activation function for the mag encoder. 'softplus' or 'relu'
|
| 44 |
+
mapping: True # Otherwise, this should be masking model
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
packaging
|
| 2 |
+
librosa
|
| 3 |
+
soundfile
|
| 4 |
+
pyyaml
|
| 5 |
+
argparse
|
| 6 |
+
tensorboard
|
| 7 |
+
pesq
|
| 8 |
+
einops
|
| 9 |
+
matplotlib
|
| 10 |
+
torch==2.6.0
|
| 11 |
+
torchaudio==2.6.0
|
| 12 |
+
numpy==1.26.4
|
| 13 |
+
resampy
|
| 14 |
+
transformers==4.33.3
|
utils/util.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
import yaml
|
| 10 |
+
import torch
|
| 11 |
+
import os
|
| 12 |
+
import shutil
|
| 13 |
+
import torch.nn.functional as F
|
| 14 |
+
|
| 15 |
+
def load_config(config_path):
    """Read a YAML configuration file and return its parsed contents."""
    with open(config_path, 'r') as cfg_file:
        parsed = yaml.safe_load(cfg_file)
    return parsed
|
| 19 |
+
|
| 20 |
+
def pad_or_trim_to_match(reference: torch.Tensor, target: torch.Tensor, pad_value: float = 1e-6) -> torch.Tensor:
    """
    Make ``target`` match ``reference`` along dim=1 by trimming or right-padding.

    Padding is done with ``F.pad``, which is differentiable, so gradient flow
    through ``target`` is preserved (same guarantee as the original
    allocate-and-copy implementation, in a single idiomatic call).

    Args:
        reference (torch.Tensor): Tensor of shape (B, ref_len) defining the
            desired length along dim=1.
        target (torch.Tensor): Tensor of shape (B, tgt_len) to adjust.
        pad_value (float, optional): Fill value for appended samples.
            Defaults to 1e-6 (near-silence rather than exact zero).

    Returns:
        torch.Tensor: ``target`` itself when lengths already match, a trimmed
        slice when too long, or a new right-padded tensor when too short.
    """
    ref_len = reference.shape[1]
    tgt_len = target.shape[1]

    if tgt_len == ref_len:
        return target
    if tgt_len > ref_len:
        return target[:, :ref_len]

    # Right-pad the last dimension; F.pad keeps autograd tracking intact.
    return F.pad(target, (0, ref_len - tgt_len), value=pad_value)
|