init

- app.py +87 -0
- nnet/ResNet34.py +213 -0
- nnet/__init__.py +0 -0
- nnet/cnns.py +186 -0
- nnet/norm.py +59 -0
- nnet/pooling.py +100 -0
- nnet/speaker_encoder.py +47 -0
- nnet/spex_plus.py +247 -0
- requirement.txt +5 -0
- utils/__init__.py +0 -0
- utils/audio.py +124 -0
- utils/dataset copy.py +284 -0
- utils/dataset.py +402 -0
- utils/load_obj.py +18 -0
- utils/logger.py +22 -0
- utils/sisdr.py +23 -0
- utils/timer.py +17 -0
app.py
ADDED
@@ -0,0 +1,87 @@
import os  # needed for os.path.join below; missing from the original commit
import gradio as gr
import torch as th
import numpy as np
from nnet.spex_plus import SpEx_Plus
from utils.logger import get_logger
from utils.audio import WaveReader, write_wav

logger = get_logger(__name__)

class NnetComputer(object):
    def __init__(self, cpt_dir, gpuid, nnet_conf):
        self.device = th.device("cuda:{}".format(gpuid)) if gpuid >= 0 else th.device("cpu")
        nnet = self._load_nnet(cpt_dir, nnet_conf)
        self.nnet = nnet.to(self.device) if gpuid >= 0 else nnet
        # set eval mode
        self.nnet.eval()

    def _load_nnet(self, cpt_dir, nnet_conf):
        nnet = SpEx_Plus(**nnet_conf)
        cpt_fname = os.path.join(cpt_dir, "59.pt.tar")
        cpt = th.load(cpt_fname, map_location="cpu")
        nnet.load_state_dict(cpt["model_state_dict"])
        logger.info("Load checkpoint from {}, epoch {:d}".format(
            cpt_fname, cpt["epoch"]))
        return nnet

    def compute(self, samps, aux_samps, aux_samps_len):
        with th.no_grad():
            raw = th.tensor(samps, dtype=th.float32, device=self.device)
            aux = th.tensor(aux_samps, dtype=th.float32, device=self.device)
            aux_len = th.tensor(aux_samps_len, dtype=th.float32, device=self.device)
            aux = aux.unsqueeze(0)
            sps, sps2, sps3, spk_pred = self.nnet(raw, aux, aux_len)
            sp_samps = np.squeeze(sps.detach().cpu().numpy())
            return sp_samps

def compute_output(input_audio, use_gpu, checkpoint, output_dir):
    # Prepare mix_input and aux_input based on the input_audio
    mix_input = {}  # Modify this to include your mix_input
    aux_input = {}  # Modify this to include your aux_input

    # Set GPU index based on the user's choice
    gpu_index = 0 if use_gpu else -1

    # Run the computation
    nnet_conf = {
        "L1": int(0.0025 * 16000),
        "L2": int(0.01 * 16000),
        "L3": int(0.02 * 16000),
        "N": 256,
        "B": 8,
        "O": 256,
        "P": 512,
        "Q": 3,
        "num_spks": 395,
        "spk_embed_dim": 256,
        "causal": False
    }
    sample_rate = 16000  # matches the 16 kHz rate hard-coded in nnet_conf; the original referenced an undefined args.sample_rate
    computer = NnetComputer(checkpoint, gpu_index, nnet_conf)
    for key, mix_samps in mix_input.items():  # the original iterated the dict directly, which yields keys only
        aux_samps = aux_input[key]
        logger.info("Compute on utterance {}...".format(key))
        samps = computer.compute(mix_samps, aux_samps, len(aux_samps))
        norm = np.linalg.norm(mix_samps, np.inf)
        samps = samps[:mix_samps.size]
        # Normalize the output to the peak level of the mixture
        samps = samps * norm / np.max(np.abs(samps))
        # Write output to the specified directory
        write_wav(os.path.join(output_dir, "{}.wav".format(key)), samps, sample_rate=sample_rate)
    logger.info("Compute over {:d} utterances".format(len(mix_input)))

# Define the Gradio interface
inputs = [
    gr.Audio(label="Input Audio"),
    gr.Checkbox(label="Use GPU"),
    gr.Textbox(label="Checkpoint Directory"),  # gr.TextInput does not exist; Textbox is the Gradio component
    gr.Textbox(label="Output Directory")
]
output = gr.Interface(
    fn=compute_output,
    inputs=inputs,
    outputs=None,
    title="Audio Processing with Neural Network",
    description="Process audio input using a neural network model.",
    theme="compact"
)
output.launch()
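For reference, a minimal offline sketch of driving NnetComputer directly from wav files, bypassing the Gradio stub above; it assumes NnetComputer from app.py is in scope, a checkpoint directory exists, and the paths and 16 kHz rate are placeholders, not part of the commit:

# Hypothetical offline usage; "exp/checkpoints", "mix.wav", "enroll.wav" are placeholders.
from utils.audio import read_wav, write_wav

nnet_conf = {"L1": 40, "L2": 160, "L3": 320, "N": 256, "B": 8, "O": 256,
             "P": 512, "Q": 3, "num_spks": 395, "spk_embed_dim": 256, "causal": False}
computer = NnetComputer("exp/checkpoints", -1, nnet_conf)  # gpuid=-1 runs on CPU
mix = read_wav("mix.wav")      # mixture waveform, 16 kHz assumed
aux = read_wav("enroll.wav")   # enrollment utterance of the target speaker
est = computer.compute(mix, aux, len(aux))
write_wav("est.wav", est[:mix.size], sample_rate=16000)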
nnet/ResNet34.py
ADDED
@@ -0,0 +1,213 @@
#! /usr/bin/python
# -*- encoding: utf-8 -*-
'''
Fast ResNet
https://arxiv.org/pdf/2003.11982.pdf
'''

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
try:
    from .pooling import *
except:
    from pooling import *

class SEBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
        super(SEBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.se = SELayer(planes, reduction)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out


class SEBottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
        super(SEBottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.se = SELayer(planes * 4, reduction)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)
        out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class SELayer(nn.Module):
    def __init__(self, channel, reduction=8):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y


class ResNetSE(nn.Module):
    def __init__(self, block, layers, num_filters, embedding_dim, n_mels=80, pooling_type="TSP", **kwargs):
        super(ResNetSE, self).__init__()

        self.inplanes = num_filters[0]
        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=(1, 1), padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(num_filters[0])
        self.relu = nn.ReLU(inplace=True)

        self.layer1 = self._make_layer(block, num_filters[0], layers[0])
        self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2))
        self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2))
        self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(2, 2))

        out_dim = num_filters[3] * block.expansion * (n_mels // 8)

        if pooling_type == "Temporal_Average_Pooling" or pooling_type == "TAP":
            self.pooling = Temporal_Average_Pooling()
            self.bn2 = nn.BatchNorm1d(out_dim)
            self.fc = nn.Linear(out_dim, embedding_dim)
            self.bn3 = nn.BatchNorm1d(embedding_dim)

        elif pooling_type == "Temporal_Statistics_Pooling" or pooling_type == "TSP":
            self.pooling = Temporal_Statistics_Pooling()
            self.bn2 = nn.BatchNorm1d(out_dim * 2)
            self.fc = nn.Linear(out_dim * 2, embedding_dim)
            self.bn3 = nn.BatchNorm1d(embedding_dim)

        elif pooling_type == "Self_Attentive_Pooling" or pooling_type == "SAP":
            self.pooling = Self_Attentive_Pooling(out_dim)
            self.bn2 = nn.BatchNorm1d(out_dim)
            self.fc = nn.Linear(out_dim, embedding_dim)
            self.bn3 = nn.BatchNorm1d(embedding_dim)

        elif pooling_type == "Attentive_Statistics_Pooling" or pooling_type == "ASP":
            self.pooling = Attentive_Statistics_Pooling(out_dim)
            self.bn2 = nn.BatchNorm1d(out_dim * 2)
            self.fc = nn.Linear(out_dim * 2, embedding_dim)
            self.bn3 = nn.BatchNorm1d(embedding_dim)

        else:
            raise ValueError('{} pooling type is not defined'.format(pooling_type))

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = x.reshape(x.shape[0], -1, x.shape[-1])

        x = self.pooling(x)
        x = self.bn2(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        x = self.bn3(x)
        return x


def Speaker_Encoder(embedding_dim=256, **kwargs):
    # Number of filters
    num_filters = [32, 64, 128, 256]
    model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], num_filters, embedding_dim, **kwargs)
    return model

if __name__ == '__main__':
    model = Speaker_Encoder()
    total = sum([param.nelement() for param in model.parameters()])
    print(total / 1e6)
    data = torch.randn(10, 80, 100)
    out = model(data)
    print(data.shape)
    print(out.shape)
nnet/__init__.py
ADDED
File without changes
nnet/cnns.py
ADDED
@@ -0,0 +1,186 @@
#!/usr/bin/env python

import torch as th
import torch.nn as nn

from .norm import ChannelwiseLayerNorm, GlobalLayerNorm

class Conv1D(nn.Conv1d):
    """
    1D Conv based on nn.Conv1d for 2D or 3D tensor
    Input: 2D or 3D tensor with [N, L_in] or [N, C_in, L_in]
    Output: Default 3D tensor with [N, C_out, L_out]
    If C_out=1 and squeeze is true, return 2D tensor
    """

    def __init__(self, *args, **kwargs):
        super(Conv1D, self).__init__(*args, **kwargs)

    def forward(self, x, squeeze=False):
        if x.dim() not in [2, 3]:
            raise RuntimeError("{} requires a 2/3D tensor input".format(
                self.__class__.__name__))  # nn.Module has no __name__; use the class name
        x = super().forward(x if x.dim() == 3 else th.unsqueeze(x, 1))
        if squeeze:
            x = th.squeeze(x)
        return x


class ConvTrans1D(nn.ConvTranspose1d):
    """
    1D Transposed Conv based on nn.ConvTranspose1d for 2D or 3D tensor
    Input: 2D or 3D tensor with [N, L_in] or [N, C_in, L_in]
    Output: 2D tensor with [N, L_out]
    """

    def __init__(self, *args, **kwargs):
        super(ConvTrans1D, self).__init__(*args, **kwargs)

    def forward(self, x):
        if x.dim() not in [2, 3]:
            raise RuntimeError("{} requires a 2/3D tensor input".format(
                self.__class__.__name__))
        x = super().forward(x if x.dim() == 3 else th.unsqueeze(x, 1))
        # squeeze the channel dimension 1 after reconstructing the signal
        return th.squeeze(x, 1)

class TCNBlock(nn.Module):
    """
    Temporal convolutional network block,
    1x1Conv - PReLU - Norm - DConv - PReLU - Norm - SConv
    Input: 3D tensor with [N, C_in, L_in]
    Output: 3D tensor with [N, C_out, L_out]
    """

    def __init__(self,
                 in_channels=256,
                 conv_channels=512,
                 kernel_size=3,
                 dilation=1,
                 causal=False):
        super(TCNBlock, self).__init__()
        self.conv1x1 = Conv1D(in_channels, conv_channels, 1)
        self.prelu1 = nn.PReLU()
        self.norm1 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
            ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
        dconv_pad = (dilation * (kernel_size - 1)) // 2 if not causal else (
            dilation * (kernel_size - 1))
        self.dconv = nn.Conv1d(
            conv_channels,
            conv_channels,
            kernel_size,
            groups=conv_channels,
            padding=dconv_pad,
            dilation=dilation,
            bias=True)
        self.prelu2 = nn.PReLU()
        self.norm2 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
            ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
        self.sconv = nn.Conv1d(conv_channels, in_channels, 1, bias=True)
        self.causal = causal
        self.dconv_pad = dconv_pad

    def forward(self, x):
        y = self.conv1x1(x)
        y = self.norm1(self.prelu1(y))
        y = self.dconv(y)
        if self.causal:
            y = y[:, :, :-self.dconv_pad]
        y = self.norm2(self.prelu2(y))
        y = self.sconv(y)
        y += x
        return y

class TCNBlock_Spk(nn.Module):
    """
    Temporal convolutional network block,
    1x1Conv - PReLU - Norm - DConv - PReLU - Norm - SConv
    The first tcn block takes additional speaker embedding as inputs
    Input: 3D tensor with [N, C_in, L_in]
    Input Speaker Embedding: 2D tensor with [N, D]
    Output: 3D tensor with [N, C_out, L_out]
    """

    def __init__(self,
                 in_channels=256,
                 spk_embed_dim=100,
                 conv_channels=512,
                 kernel_size=3,
                 dilation=1,
                 causal=False):
        super(TCNBlock_Spk, self).__init__()
        self.conv1x1 = Conv1D(in_channels + spk_embed_dim, conv_channels, 1)
        self.prelu1 = nn.PReLU()
        self.norm1 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
            ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
        dconv_pad = (dilation * (kernel_size - 1)) // 2 if not causal else (
            dilation * (kernel_size - 1))
        self.dconv = nn.Conv1d(
            conv_channels,
            conv_channels,
            kernel_size,
            groups=conv_channels,
            padding=dconv_pad,
            dilation=dilation,
            bias=True)
        self.prelu2 = nn.PReLU()
        self.norm2 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
            ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
        self.sconv = nn.Conv1d(conv_channels, in_channels, 1, bias=True)
        self.causal = causal
        self.dconv_pad = dconv_pad
        self.dilation = dilation

    def forward(self, x, aux):
        # Repeatedly concatenate the speaker embedding aux to each frame of the representation x
        T = x.shape[-1]
        aux = th.unsqueeze(aux, -1)
        aux = aux.repeat(1, 1, T)
        y = th.cat([x, aux], 1)
        y = self.conv1x1(y)
        y = self.norm1(self.prelu1(y))
        y = self.dconv(y)
        if self.causal:
            y = y[:, :, :-self.dconv_pad]
        y = self.norm2(self.prelu2(y))
        y = self.sconv(y)
        y += x
        return y

class ResBlock(nn.Module):
    """
    Resnet block for speaker encoder to obtain speaker embedding
    ref to
    https://github.com/fatchord/WaveRNN/blob/master/models/fatchord_version.py
    and
    https://github.com/Jungjee/RawNet/blob/master/PyTorch/model_RawNet.py
    """
    def __init__(self, in_dims, out_dims):
        super(ResBlock, self).__init__()
        self.conv1 = nn.Conv1d(in_dims, out_dims, kernel_size=1, bias=False)
        self.conv2 = nn.Conv1d(out_dims, out_dims, kernel_size=1, bias=False)
        self.batch_norm1 = nn.BatchNorm1d(out_dims)
        self.batch_norm2 = nn.BatchNorm1d(out_dims)
        self.prelu1 = nn.PReLU()
        self.prelu2 = nn.PReLU()
        self.maxpool = nn.MaxPool1d(3)
        if in_dims != out_dims:
            self.downsample = True
            self.conv_downsample = nn.Conv1d(in_dims, out_dims, kernel_size=1, bias=False)
        else:
            self.downsample = False

    def forward(self, x):
        y = self.conv1(x)
        y = self.batch_norm1(y)
        y = self.prelu1(y)
        y = self.conv2(y)
        y = self.batch_norm2(y)
        if self.downsample:
            y += self.conv_downsample(x)
        else:
            y += x
        y = self.prelu2(y)
        return self.maxpool(y)
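A quick shape check for the TCN blocks above, as a sketch; the sizes mirror the docstrings ([N, C, T] with the SpEx+ defaults) and the tensors are random placeholders:

# Illustrative smoke test for the TCN blocks; all sizes are placeholders.
import torch as th
from nnet.cnns import TCNBlock, TCNBlock_Spk

x = th.randn(2, 256, 100)    # [N, C_in, T]
spk = th.randn(2, 256)       # [N, spk_embed_dim]
blk = TCNBlock(in_channels=256, conv_channels=512, kernel_size=3, dilation=2)
blk_spk = TCNBlock_Spk(in_channels=256, spk_embed_dim=256, conv_channels=512)
print(blk(x).shape)          # torch.Size([2, 256, 100]) -- residual path keeps the shape
print(blk_spk(x, spk).shape) # torch.Size([2, 256, 100])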
nnet/norm.py
ADDED
@@ -0,0 +1,59 @@
#!/usr/bin/env python

import torch as th
import torch.nn as nn

class ChannelwiseLayerNorm(nn.LayerNorm):
    """
    Channel-wise layer normalization based on nn.LayerNorm
    Input: 3D tensor with [batch_size(N), channel_size(C), frame_num(T)]
    Output: 3D tensor with same shape
    """

    def __init__(self, *args, **kwargs):
        super(ChannelwiseLayerNorm, self).__init__(*args, **kwargs)

    def forward(self, x):
        if x.dim() != 3:
            raise RuntimeError("{} requires a 3D tensor input".format(
                self.__class__.__name__))  # nn.Module has no __name__ attribute
        x = th.transpose(x, 1, 2)
        x = super().forward(x)
        x = th.transpose(x, 1, 2)
        return x

class GlobalLayerNorm(nn.Module):
    """
    Global layer normalization
    Input: 3D tensor with [batch_size(N), channel_size(C), frame_num(T)]
    Output: 3D tensor with same shape
    """

    def __init__(self, dim, eps=1e-05, elementwise_affine=True):
        super(GlobalLayerNorm, self).__init__()
        self.eps = eps
        self.normalized_dim = dim
        self.elementwise_affine = elementwise_affine
        if elementwise_affine:
            self.beta = nn.Parameter(th.zeros(dim, 1))
            self.gamma = nn.Parameter(th.ones(dim, 1))
        else:
            # register under the names actually used in forward (the original registered "weight"/"bias")
            self.register_parameter("gamma", None)
            self.register_parameter("beta", None)

    def forward(self, x):
        if x.dim() != 3:
            raise RuntimeError("{} requires a 3D tensor input".format(
                self.__class__.__name__))
        # calculate the mean and variance over the channel and time dimensions
        mean = th.mean(x, (1, 2), keepdim=True)
        var = th.mean((x - mean)**2, (1, 2), keepdim=True)
        if self.elementwise_affine:
            x = self.gamma * (x - mean) / th.sqrt(var + self.eps) + self.beta
        else:
            x = (x - mean) / th.sqrt(var + self.eps)
        return x

    def extra_repr(self):
        return "{normalized_dim}, eps={eps}, " \
               "elementwise_affine={elementwise_affine}".format(**self.__dict__)
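As a sanity check, a short sketch contrasting the two norms; it relies only on the shapes documented above and uses random data:

# Illustrative check: both norms preserve the [N, C, T] shape;
# gLN normalizes over (C, T) jointly, cLN over C per frame.
import torch as th
from nnet.norm import ChannelwiseLayerNorm, GlobalLayerNorm

x = th.randn(4, 256, 100) * 3.0 + 1.0
gln = GlobalLayerNorm(256, elementwise_affine=False)
cln = ChannelwiseLayerNorm(256)
y = gln(x)
print(y.shape, float(y.mean()), float(y.var()))  # shape kept; ~0 mean, ~1 variance
print(cln(x).shape)                              # torch.Size([4, 256, 100])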
nnet/pooling.py
ADDED
@@ -0,0 +1,100 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from speechbrain.lobes.models.ECAPA_TDNN import AttentiveStatisticsPooling

class Temporal_Average_Pooling(nn.Module):
    def __init__(self, **kwargs):
        """TAP
        Paper: Multi-Task Learning with High-Order Statistics for X-vector based Text-Independent Speaker Verification
        Link: https://arxiv.org/pdf/1903.12058.pdf
        """
        super(Temporal_Average_Pooling, self).__init__()

    def forward(self, x):
        """Computes Temporal Average Pooling Module
        Args:
            x (torch.Tensor): Input tensor (#batch, channels, frames).
        Returns:
            torch.Tensor: Output tensor (#batch, channels)
        """
        x = torch.mean(x, axis=2)
        return x


class Temporal_Statistics_Pooling(nn.Module):
    def __init__(self, **kwargs):
        """TSP
        Paper: X-vectors: Robust DNN Embeddings for Speaker Recognition
        Link: http://www.danielpovey.com/files/2018_icassp_xvectors.pdf
        """
        super(Temporal_Statistics_Pooling, self).__init__()

    def forward(self, x):
        """Computes Temporal Statistics Pooling Module
        Args:
            x (torch.Tensor): Input tensor (#batch, channels, frames).
        Returns:
            torch.Tensor: Output tensor (#batch, channels*2)
        """
        mean = torch.mean(x, axis=2)
        var = torch.var(x, axis=2)
        x = torch.cat((mean, var), axis=1)
        return x


''' Self attentive weighted mean pooling.
'''
class Self_Attentive_Pooling(nn.Module):
    def __init__(self, dim, **kwargs):
        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
        # attention dim = 128
        super(Self_Attentive_Pooling, self).__init__()
        self.linear1 = nn.Conv1d(dim, dim, kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(dim, dim, kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
        alpha = torch.tanh(self.linear1(x))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        return mean


''' Attentive weighted mean and standard deviation pooling.
'''
class Attentive_Statistics_Pooling(nn.Module):
    def __init__(self, dim, **kwargs):
        # Use AttentiveStatisticsPooling and BatchNorm1d from speechbrain
        super(Attentive_Statistics_Pooling, self).__init__()
        self.pooling = AttentiveStatisticsPooling(dim)

    def forward(self, x):
        x = self.pooling(x)
        return x

# class Attentive_Statistics_Pooling(nn.Module):
#     def __init__(self, dim, **kwargs):
#         # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
#         # attention dim = 128
#         super(Attentive_Statistics_Pooling, self).__init__()
#         self.linear1 = nn.Conv1d(dim, dim, kernel_size=1)  # equals W and b in the paper
#         self.linear2 = nn.Conv1d(dim, dim, kernel_size=1)  # equals V and k in the paper
#
#     def forward(self, x):
#         # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
#         alpha = torch.tanh(self.linear1(x))
#         alpha = torch.softmax(self.linear2(alpha), dim=2)
#         mean = torch.sum(alpha * x, dim=2)
#         residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2
#         std = torch.sqrt(residuals.clamp(min=1e-9))
#         return torch.cat([mean, std], dim=1)


if __name__ == "__main__":
    data = torch.randn(10, 128, 100)
    pooling = Self_Attentive_Pooling(128)
    out = pooling(data)
    print(data.shape)
    print(out.shape)
nnet/speaker_encoder.py
ADDED
@@ -0,0 +1,47 @@
import torch
import torchaudio
import torch.nn as nn
from torch.nn import functional as F
from .ResNet34 import Speaker_Encoder


class Speaker_Model(torch.nn.Module):
    # class Speaker_Model(LightningModule):
    def __init__(self, pooling_type, spk_embed_dim, sample_rate, n_mels):
        super().__init__()
        # self.save_hyperparameters()

        self.pooling_type = pooling_type
        self.spk_embed_dim = spk_embed_dim
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        sr = self.sample_rate

        self.mel_trans = torch.nn.Sequential(
            PreEmphasis(),
            torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=512,
                                                 win_length=sr * 25 // 1000, hop_length=sr * 10 // 1000,
                                                 window_fn=torch.hamming_window, n_mels=self.n_mels)
        )
        self.instancenorm = nn.InstanceNorm1d(self.n_mels)

        self.hparams = {'embedding_dim': self.spk_embed_dim, 'pooling_type': self.pooling_type, 'n_mels': self.n_mels}

        self.speaker_encoder = Speaker_Encoder(**dict(self.hparams))

class PreEmphasis(torch.nn.Module):
    def __init__(self, coef: float = 0.97):
        super().__init__()
        self.coef = coef
        # make kernel
        # In PyTorch, the convolution operation uses cross-correlation, so the filter is flipped.
        self.register_buffer(
            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
        )

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        assert len(inputs.size()) == 2, 'The number of dimensions of inputs tensor must be 2!'
        # reflect padding to match lengths of in/out
        inputs = inputs.unsqueeze(1)
        inputs = F.pad(inputs, (1, 0), 'reflect')
        return F.conv1d(inputs, self.flipped_filter).squeeze(1)
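Note that this copy of Speaker_Model defines no forward pass (the variant embedded in nnet/spex_plus.py adds extract_speaker_embedding); the following is only a sketch of the mel front end it builds, with illustrative sizes at an assumed 16 kHz:

# Illustrative front-end shapes; one second of random audio stands in for speech.
import torch
from nnet.speaker_encoder import Speaker_Model

model = Speaker_Model(pooling_type='ASP', spk_embed_dim=256, sample_rate=16000, n_mels=80)
wav = torch.randn(2, 16000)                # [batch, samples]
mel = model.mel_trans(wav)                 # roughly [2, 80, 101] with the 10 ms hop
emb = model.speaker_encoder(model.instancenorm((mel + 1e-6).log()))
print(mel.shape, emb.shape)                # embedding is [2, 256]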
nnet/spex_plus.py
ADDED
@@ -0,0 +1,247 @@
#!/usr/bin/env python

import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .norm import ChannelwiseLayerNorm, GlobalLayerNorm
from .cnns import Conv1D, ConvTrans1D, TCNBlock, TCNBlock_Spk, ResBlock

import torchaudio
from .ResNet34 import Speaker_Encoder
# from .sunine.trainer.utils import PreEmphasis


# Two possibilities to consider: in the frequency domain there is unlikely to be any
# notion of multiple time scales, so the speaker branch surely takes the spectrum
# directly -- but what about the speech branch?
# Watch the dimension order: is it B x N x T or B x T x N?

class SpEx_Plus(nn.Module):
    def __init__(self,
                 L1=20,
                 L2=80,
                 L3=160,
                 N=256,
                 B=8,
                 O=256,
                 P=512,
                 Q=3,
                 num_spks=101,
                 spk_embed_dim=256,
                 sample_rate=16000,
                 n_mels=80,
                 causal=False,
                 ):
        super(SpEx_Plus, self).__init__()
        # n x S => n x N x T, S = 4s*8000 = 32000
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.L1 = L1
        self.L2 = L2
        self.L3 = L3
        self.encoder_1d_short = Conv1D(1, N, L1, stride=L1 // 2, padding=0)
        self.encoder_1d_middle = Conv1D(1, N, L2, stride=L1 // 2, padding=0)
        self.encoder_1d_long = Conv1D(1, N, L3, stride=L1 // 2, padding=0)
        # before repeat blocks, always cLN
        self.ln = ChannelwiseLayerNorm(3 * N)
        # n x N x T => n x O x T
        self.proj = Conv1D(3 * N, O, 1)
        self.conv_block_1 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1)
        self.conv_block_1_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal)
        self.conv_block_2 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1)
        self.conv_block_2_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal)
        self.conv_block_3 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1)
        self.conv_block_3_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal)
        self.conv_block_4 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1)
        self.conv_block_4_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal)
        # n x O x T => n x N x T
        self.mask1 = Conv1D(O, N, 1)
        self.mask2 = Conv1D(O, N, 1)
        self.mask3 = Conv1D(O, N, 1)
        # using ConvTrans1D: n x N x T => n x 1 x To
        # To = (T - 1) * L // 2 + L
        #############################################################
        self.decoder_1d_short = ConvTrans1D(N, 1, kernel_size=L1, stride=L1 // 2, bias=True)
        self.decoder_1d_middle = ConvTrans1D(N, 1, kernel_size=L2, stride=L1 // 2, bias=True)
        self.decoder_1d_long = ConvTrans1D(N, 1, kernel_size=L3, stride=L1 // 2, bias=True)
        self.num_spks = num_spks
        # self.spk_encoder = nn.Sequential(
        #     ChannelwiseLayerNorm(3*N),
        #     Conv1D(3*N, O, 1),
        #     ResBlock(O, O),
        #     ResBlock(O, P),
        #     ResBlock(P, P),
        #     Conv1D(P, spk_embed_dim, 1),
        # )

        # self.pred_linear = nn.Linear(spk_embed_dim, num_spks)

        # switched to a pretrained speaker model
        # Two possibilities to consider: the frequency domain is unlikely to have multiple
        # time scales, so the speaker branch surely takes the spectrum directly -- what about the speech branch?
        # /work105/youzhenghai/model/resnet_asp_aam_adamw_welr
        # import ..sunine/trainer/speaker encoder
        # no need to care about **kwargs; just locate self.hparams and adapt it following main_infer
        #############################################################

        # # 1. Acoustic Feature
        # self.mel_trans = th.nn.Sequential(
        #     PreEmphasis(),
        #     torchaudio.transforms.MelSpectrogram(sample_rate=self.sample_rate, n_fft=512,
        #         win_length=400, hop_length=160, window_fn=th.hamming_window, n_mels=self.n_mels)
        # )

        # self.instancenorm = nn.InstanceNorm1d(self.n_mels)

        # # set the hyperparameters at the call site; remember to pass them in as arguments later
        # self.hparams = {'embedding_dim': spk_embed_dim, 'pooling_type': 'ASP', 'n_mels': self.n_mels}
        # # call with **self.hparams
        # self.speaker_encoder = Speaker_Encoder(**self.hparams)
        self.speaker_embedding_extracter = Speaker_Model(pooling_type='ASP', spk_embed_dim=spk_embed_dim, sample_rate=self.sample_rate, n_mels=self.n_mels)
        self.pred_linear = nn.Linear(spk_embed_dim, num_spks)

        #############################################################

        # # 3. Loss / Classifier
        # if not self.hparams.evaluate:
        #     LossFunction = importlib.import_module('trainer.loss.'+self.hparams.loss_type).__getattribute__('LossFunction')
        #     self.loss = LossFunction(**dict(self.hparams))


    def _build_stacks(self, num_blocks, **block_kwargs):
        """
        Stack B numbers of TCN blocks; the first TCN block takes the speaker embedding
        """
        blocks = [
            TCNBlock(**block_kwargs, dilation=(2**b))
            for b in range(1, num_blocks)
        ]
        return nn.Sequential(*blocks)
    # Watch the dimension order: is it B x N x T or B x T x N?


    def forward(self, x, aux, aux_len):
        if x.dim() >= 3:
            raise RuntimeError(
                "{} accepts 1/2D tensor as input, but got {:d}".format(
                    self.__class__.__name__, x.dim()))
        # when inference, only one utt
        if x.dim() == 1:
            x = th.unsqueeze(x, 0)

        # n x 1 x S => n x N x T
        w1 = F.relu(self.encoder_1d_short(x))
        T = w1.shape[-1]
        xlen1 = x.shape[-1]
        xlen2 = (T - 1) * (self.L1 // 2) + self.L2
        xlen3 = (T - 1) * (self.L1 // 2) + self.L3
        w2 = F.relu(self.encoder_1d_middle(F.pad(x, (0, xlen2 - xlen1), "constant", 0)))
        w3 = F.relu(self.encoder_1d_long(F.pad(x, (0, xlen3 - xlen1), "constant", 0)))

        # n x 3N x T
        y = self.ln(th.cat([w1, w2, w3], 1))
        # n x O x T
        y = self.proj(y)

        # speaker encoder (share params from speech encoder)
        # aux_w1 = F.relu(self.encoder_1d_short(aux))
        # aux_T_shape = aux_w1.shape[-1]
        # aux_len1 = aux.shape[-1]
        # aux_len2 = (aux_T_shape - 1) * (self.L1 // 2) + self.L2
        # aux_len3 = (aux_T_shape - 1) * (self.L1 // 2) + self.L3
        # aux_w2 = F.relu(self.encoder_1d_middle(F.pad(aux, (0, aux_len2 - aux_len1), "constant", 0)))
        # aux_w3 = F.relu(self.encoder_1d_long(F.pad(aux, (0, aux_len3 - aux_len1), "constant", 0)))

        # spk_encoder + mean pooling
        # aux = self.spk_encoder(th.cat([aux_w1, aux_w2, aux_w3], 1))
        # aux_T = (aux_len - self.L1) // (self.L1 // 2) + 1
        # aux_T = ((aux_T // 3) // 3) // 3
        # aux = th.sum(aux, -1)/aux_T.view(-1,1).float()

        # spk_encoder + TAP pooling
        aux = self.speaker_embedding_extracter(aux)

        # aux = torch.mean(aux, axis=0)
        # aux = aux.cpu().detach().numpy()
        # no reshape needed: N x D is already the correct shape
        # aux = aux.reshape(-1, self.hparams.nPerSpeaker, self.spk_embed_dim)
        # loss, acc = self.loss(x, label)
        # return loss.mean(), acc
        # consider whether the loss should also live here

        y = self.conv_block_1(y, aux)
        y = self.conv_block_1_other(y)
        y = self.conv_block_2(y, aux)
        y = self.conv_block_2_other(y)
        y = self.conv_block_3(y, aux)
        y = self.conv_block_3_other(y)
        y = self.conv_block_4(y, aux)
        y = self.conv_block_4_other(y)

        # n x N x T
        m1 = F.relu(self.mask1(y))
        m2 = F.relu(self.mask2(y))
        m3 = F.relu(self.mask3(y))
        S1 = w1 * m1
        S2 = w2 * m2
        S3 = w3 * m3

        return self.decoder_1d_short(S1), self.decoder_1d_middle(S2)[:, :xlen1], self.decoder_1d_long(S3)[:, :xlen1], self.pred_linear(aux)

class PreEmphasis(th.nn.Module):
    def __init__(self, coef: float = 0.97):
        super().__init__()
        self.coef = coef
        # make kernel
        # In PyTorch, the convolution operation uses cross-correlation, so the filter is flipped.
        self.register_buffer(
            'flipped_filter', th.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
        )

    def forward(self, inputs: th.Tensor) -> th.Tensor:
        assert len(inputs.size()) == 2, 'The number of dimensions of inputs tensor must be 2!'
        # reflect padding to match lengths of in/out
        inputs = inputs.unsqueeze(1)
        inputs = F.pad(inputs, (1, 0), 'reflect')
        return F.conv1d(inputs, self.flipped_filter).squeeze(1)


class Speaker_Model(nn.Module):
    # class Speaker_Model(LightningModule):
    def __init__(self, pooling_type, spk_embed_dim, sample_rate, n_mels):
        super().__init__()
        # self.save_hyperparameters()

        self.pooling_type = pooling_type
        self.spk_embed_dim = spk_embed_dim
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        sr = self.sample_rate

        self.mel_trans = th.nn.Sequential(
            PreEmphasis(),
            torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=512,
                                                 win_length=sr * 25 // 1000, hop_length=sr * 10 // 1000,
                                                 window_fn=th.hamming_window, n_mels=self.n_mels)
        )
        self.instancenorm = nn.InstanceNorm1d(self.n_mels)

        self.hparams = {'embedding_dim': self.spk_embed_dim, 'pooling_type': self.pooling_type, 'n_mels': self.n_mels}

        self.speaker_encoder = Speaker_Encoder(**dict(self.hparams))

    def extract_speaker_embedding(self, data):
        x = data.reshape(-1, data.size()[-1])
        x = self.mel_trans(x) + 1e-6
        x = x.log()
        x = self.instancenorm(x)
        x = self.speaker_encoder(x)
        return x

    def forward(self, x):
        x = self.extract_speaker_embedding(x)
        return x
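A minimal forward-pass sketch for SpEx_Plus with the defaults above; the 16 kHz lengths are illustrative and the weights are untrained:

# Illustrative end-to-end shape check; random waveforms stand in for real audio.
import torch as th
from nnet.spex_plus import SpEx_Plus

net = SpEx_Plus(num_spks=101)         # defaults: L1=20, N=256, B=8, ...
mix = th.randn(2, 32000)              # [batch, samples], 2 s at 16 kHz
aux = th.randn(2, 16000)              # enrollment utterances
s1, s2, s3, spk_logits = net(mix, aux, th.tensor([16000, 16000]))
print(s1.shape, spk_logits.shape)     # s1: [2, 32000]; logits: [2, 101]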
requirement.txt
ADDED
@@ -0,0 +1,5 @@
torch==1.8.0
torchaudio==0.8.0
speechbrain==0.5.10
soundfile
gradio
utils/__init__.py
ADDED
File without changes
utils/audio.py
ADDED
@@ -0,0 +1,124 @@
#!/usr/bin/env python

import os
import numpy as np
import soundfile as sf

def write_wav(fname, samps, sample_rate=16000, normalize=True):
    """
    Write wav files in float32, support single/multi-channel
    """
    # The wham and whamr mixture and clean data are float32, so scipy.io.wavfile
    # cannot be used to read/write int16; we switched to soundfile. Although the
    # reference speech is int16, soundfile can still read it and outputs floats.
    fdir = os.path.dirname(fname)
    if fdir and not os.path.exists(fdir):
        os.makedirs(fdir)
    sf.write(fname, samps, sample_rate, subtype='FLOAT')


def read_wav(fname, normalize=True, return_rate=False):
    """
    Read wave files (support multi-channel)
    """
    # See the note in write_wav: soundfile is used instead of scipy.io.wavfile.
    samps, samp_rate = sf.read(fname)
    if return_rate:
        return samp_rate, samps
    return samps

def parse_scripts(scp_path, value_processor=lambda x: x, num_tokens=2):
    """
    Parse kaldi's script(.scp) file
    If num_tokens >= 2, the function will check the token number
    """
    scp_dict = dict()
    line = 0
    with open(scp_path, "r") as f:
        for raw_line in f:
            scp_tokens = raw_line.strip().split()
            line += 1
            if (num_tokens >= 2 and len(scp_tokens) != num_tokens) or len(scp_tokens) < 2:
                raise RuntimeError(
                    "For {}, format error in line[{:d}]: {}".format(
                        scp_path, line, raw_line))
            if num_tokens == 2:
                key, value = scp_tokens
            else:
                key, value = scp_tokens[0], scp_tokens[1:]
            if key in scp_dict:
                raise ValueError("Duplicated key \'{0}\' exists in {1}".format(
                    key, scp_path))
            scp_dict[key] = value_processor(value)
    return scp_dict


class Reader(object):
    """
    Basic Reader Class
    """

    def __init__(self, scp_path, value_processor=lambda x: x):
        self.index_dict = parse_scripts(
            scp_path, value_processor=value_processor, num_tokens=2)
        self.index_keys = list(self.index_dict.keys())

    def _load(self, key):
        # return path
        return self.index_dict[key]

    # number of utterances
    def __len__(self):
        return len(self.index_dict)

    # avoid key error
    def __contains__(self, key):
        return key in self.index_dict

    # sequential index
    def __iter__(self):
        for key in self.index_keys:
            yield key, self._load(key)

    # random index, support str/int as index
    def __getitem__(self, index):
        if type(index) not in [int, str]:
            raise IndexError("Unsupported index type: {}".format(type(index)))
        if type(index) == int:
            # from int index to key
            num_utts = len(self.index_keys)
            if index >= num_utts or index < 0:
                raise KeyError(
                    "Integer index out of range, {:d} vs {:d}".format(
                        index, num_utts))
            index = self.index_keys[index]
        if index not in self.index_dict:
            raise KeyError("Missing utterance {}!".format(index))
        return self._load(index)


class WaveReader(Reader):
    """
    Sequential/Random Reader for single channel wave
    Format of wav.scp follows Kaldi's definition:
        key1 /path/to/wav
        ...
    """

    def __init__(self, wav_scp, sample_rate=None, normalize=True):
        super(WaveReader, self).__init__(wav_scp)
        self.samp_rate = sample_rate
        self.normalize = normalize

    def _load(self, key):
        # return C x N or N
        samp_rate, samps = read_wav(
            self.index_dict[key], normalize=self.normalize, return_rate=True)
        # if a samp_rate was given, check it
        if self.samp_rate is not None and samp_rate != self.samp_rate:
            raise RuntimeError("SampleRate mismatch: {:d} vs {:d}".format(
                samp_rate, self.samp_rate))
        return samps
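A short usage sketch for WaveReader; the wav.scp path and utterance keys are hypothetical placeholders:

# Hypothetical wav.scp usage; "data/wav.scp" is a placeholder path.
from utils.audio import WaveReader

reader = WaveReader("data/wav.scp", sample_rate=16000)  # raises on rate mismatch
print(len(reader))             # number of utterances listed in the scp
for key, samps in reader:      # sequential iteration yields (key, samples)
    print(key, samps.shape)
    break
samps = reader[0]              # integer and string indexing both work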
utils/dataset copy.py
ADDED
@@ -0,0 +1,284 @@
#!/usr/bin/env python

import random
import torch as th
import numpy as np

from torch.utils.data.dataloader import default_collate
import torch.utils.data as dat
from torch.nn.utils.rnn import pad_sequence

from .audio import WaveReader

import soundfile as sf

# random_seed = 1453
# random.seed(random_seed)

def make_dataloader(train=True,
                    utt_scp_file=None,
                    spk_list=None,
                    sample_rate=16000,
                    num_workers=4,
                    chunk_size=32000,
                    batch_size=16):
    dataset = Dataset(utt_scp_file=utt_scp_file,
                      spk_list=spk_list,
                      chunk_size=chunk_size,
                      sample_rate=sample_rate)
    return DataLoader(dataset,
                      train=train,
                      chunk_size=chunk_size,
                      batch_size=batch_size,
                      num_workers=num_workers)

class Dataset(object):
    """
    Per Utterance Loader
    """
    def __init__(self, utt_scp_file="", spk_list=None, chunk_size=32000, sample_rate=8000):
        self.sample_rate = sample_rate
        self.spk_list = self._load_spk(spk_list)

        self.seg_least = int(chunk_size // 2)

        # self.mix = WaveReader(mix_scp, sample_rate=sample_rate)
        # self.ref = WaveReader(ref_scp, sample_rate=sample_rate)
        # self.aux = WaveReader(aux_scp, sample_rate=sample_rate)

        with open(utt_scp_file, 'r') as f:
            lines = f.readlines()
        self.data = []
        for line in lines:
            parts = line.strip().split()
            sentence_id = parts[0]
            sentence_path = parts[1]
            data_len = parts[2]
            spk_id = (sentence_id.split('-')[0])[1:5]
            self.data.append((sentence_id, spk_id, sentence_path, data_len))

        if not self.data:
            raise ValueError("No valid lines found in the input file.")
        self.total_lines = len(self.data)

    def _load_spk(self, spk_list_path):
        if spk_list_path is None:
            return []
        lines = open(spk_list_path).readlines()
        new_lines = []
        for line in lines:
            new_lines.append(line.strip())

        return new_lines

    def __len__(self):
        return len(self.data)

    def _get_segment_start_stop(self, seg_len, length):
        if seg_len is not None:
            start = random.randint(0, length - seg_len)
            stop = start + seg_len
        else:
            start = 0
            stop = None
        return start, stop

    def _mix(self, sources_list):
        # if self.seg_len:
        #     mix_length = self.seg_len
        # else:
        #     mix_length = self.common_length
        mix_length = self.common_length
        mixture = np.zeros(mix_length)
        for i, _ in enumerate(sources_list):
            mixture += sources_list[i]

        return mixture

    def __getitem__(self, idx):
        source_id, source_spk, source_path, all_source_length = self.data[idx]
        all_source_length = int(all_source_length)
        spk_idx = self.spk_list.index(source_spk)

        other_counter = 0
        while True:
            random_idx = np.random.randint(0, self.total_lines)
            if self.data[random_idx][1] != source_spk:
                other_id, other_spk, other_path, other_length = self.data[random_idx]
                other_length = int(other_length)
                if other_length > self.seg_least:
                    break
            other_counter += 1
            if other_counter >= self.total_lines:
                raise ValueError("All data are too short to mix")

        enroll_counter = 0
        while True:
            random_idx = np.random.randint(0, self.total_lines)
            if self.data[random_idx][1] == source_spk:
                enroll_id, enroll_spk, enroll_path, all_enroll_length = self.data[random_idx]
                all_enroll_length = int(all_enroll_length)
                if all_enroll_length > self.seg_least:
                    break
            enroll_counter += 1
            if enroll_counter >= self.total_lines:
                raise ValueError("All data are too short to enroll")
        # lengths = [all_source_length, other_length]

        if all_source_length >= other_length:
            self.common_length = other_length
            start, stop = self._get_segment_start_stop(other_length, all_source_length)
            source_tmp, _ = sf.read(source_path, dtype="float32", start=start, stop=stop)
            other_tmp, _ = sf.read(other_path, dtype="float32")
        elif all_source_length <= other_length:
            self.common_length = all_source_length
            start, stop = self._get_segment_start_stop(all_source_length, other_length)
            source_tmp, _ = sf.read(source_path, dtype="float32")
            other_tmp, _ = sf.read(other_path, dtype="float32", start=start, stop=stop)

        source = source_tmp[:, np.random.randint(0, source_tmp.shape[1])]

        other = other_tmp[:, np.random.randint(0, other_tmp.shape[1])]

        mixture = self._mix([source, other])
        mixture = mixture.astype(np.float32)

        enroll_tmp, _ = sf.read(enroll_path, dtype="float32")
        enroll = enroll_tmp[:, np.random.randint(0, enroll_tmp.shape[1])]

        return {
            "mix": mixture,
            "ref": source,
            "aux": enroll,
            "aux_len": len(enroll),
            "spk_idx": spk_idx
        }

class ChunkSplitter(object):
    """
    Split utterance into small chunks
    """
    def __init__(self, chunk_size, train=True, least=16000):
        self.chunk_size = chunk_size
        self.least = least
        self.train = train

    def _make_chunk(self, eg, s):
        """
        Make a chunk instance, which contains:
            "mix": ndarray,
            "ref": [ndarray...]
        """
        chunk = dict()
        chunk["mix"] = eg["mix"][s:s + self.chunk_size]
        chunk["ref"] = eg["ref"][s:s + self.chunk_size]
        chunk["aux"] = eg["aux"]
        chunk["aux_len"] = eg["aux_len"]
        chunk["valid_len"] = int(self.chunk_size)
        chunk["spk_idx"] = eg["spk_idx"]
        return chunk

    def split(self, eg):
        N = eg["mix"].size
        # too short, throw away
        if N < self.least:
            return []
        chunks = []
        # padding zeros
        if N < self.chunk_size:
            P = self.chunk_size - N
            chunk = dict()
            chunk["mix"] = np.pad(eg["mix"], (0, P), "constant")
            chunk["ref"] = np.pad(eg["ref"], (0, P), "constant")
            chunk["aux"] = eg["aux"]
            chunk["aux_len"] = eg["aux_len"]
chunk["aux_len"] = eg["aux_len"]
|
| 203 |
+
chunk["valid_len"] = int(N)
|
| 204 |
+
chunk["spk_idx"] = eg["spk_idx"]
|
| 205 |
+
chunks.append(chunk)
|
| 206 |
+
else:
|
| 207 |
+
# random select start point for training
|
| 208 |
+
s = random.randint(0, N % self.least) if self.train else 0
|
| 209 |
+
while True:
|
| 210 |
+
if s + self.chunk_size > N:
|
| 211 |
+
break
|
| 212 |
+
chunk = self._make_chunk(eg, s)
|
| 213 |
+
chunks.append(chunk)
|
| 214 |
+
s += self.least
|
| 215 |
+
return chunks
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
class DataLoader(object):
|
| 219 |
+
"""
|
| 220 |
+
Online dataloader for chunk-level
|
| 221 |
+
"""
|
| 222 |
+
def __init__(self,
|
| 223 |
+
dataset,
|
| 224 |
+
num_workers=4,
|
| 225 |
+
chunk_size=32000,
|
| 226 |
+
batch_size=16,
|
| 227 |
+
train=True):
|
| 228 |
+
self.batch_size = batch_size
|
| 229 |
+
self.train = train
|
| 230 |
+
self.splitter = ChunkSplitter(chunk_size,
|
| 231 |
+
train=train,
|
| 232 |
+
least=chunk_size // 2)
|
| 233 |
+
# just return batch of egs, support multiple workers
|
| 234 |
+
self.eg_loader = dat.DataLoader(dataset,
|
| 235 |
+
batch_size=batch_size // 2,
|
| 236 |
+
num_workers=num_workers,
|
| 237 |
+
shuffle=train,
|
| 238 |
+
collate_fn=self._collate)
|
| 239 |
+
|
| 240 |
+
def _collate(self, batch):
|
| 241 |
+
"""
|
| 242 |
+
Online split utterances
|
| 243 |
+
"""
|
| 244 |
+
chunk = []
|
| 245 |
+
for eg in batch:
|
| 246 |
+
chunk += self.splitter.split(eg)
|
| 247 |
+
return chunk
|
| 248 |
+
|
| 249 |
+
def _pad_aux(self, chunk_list):
|
| 250 |
+
lens_list = []
|
| 251 |
+
for chunk_item in chunk_list:
|
| 252 |
+
lens_list.append(chunk_item['aux_len'])
|
| 253 |
+
max_len = np.max(lens_list)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
for idx in range(len(chunk_list)):
|
| 257 |
+
P = max_len - len(chunk_list[idx]["aux"])
|
| 258 |
+
chunk_list[idx]["aux"] = np.pad(chunk_list[idx]["aux"], (0, P), "constant")
|
| 259 |
+
|
| 260 |
+
return chunk_list
|
| 261 |
+
|
| 262 |
+
def _merge(self, chunk_list):
|
| 263 |
+
"""
|
| 264 |
+
Merge chunk list into mini-batch
|
| 265 |
+
"""
|
| 266 |
+
N = len(chunk_list)
|
| 267 |
+
if self.train:
|
| 268 |
+
random.shuffle(chunk_list)
|
| 269 |
+
blist = []
|
| 270 |
+
for s in range(0, N - self.batch_size + 1, self.batch_size):
|
| 271 |
+
# padding aux info
|
| 272 |
+
#self._pad_aux(chunk_list[s:s + self.batch_size])
|
| 273 |
+
batch = default_collate(self._pad_aux(chunk_list[s:s + self.batch_size]))
|
| 274 |
+
blist.append(batch)
|
| 275 |
+
rn = N % self.batch_size
|
| 276 |
+
return blist, chunk_list[-rn:] if rn else []
|
| 277 |
+
|
| 278 |
+
def __iter__(self):
|
| 279 |
+
chunk_list = []
|
| 280 |
+
for chunks in self.eg_loader:
|
| 281 |
+
chunk_list += chunks
|
| 282 |
+
batch, chunk_list = self._merge(chunk_list)
|
| 283 |
+
for obj in batch:
|
| 284 |
+
yield obj
|
utils/dataset.py
ADDED
@@ -0,0 +1,402 @@
#!/usr/bin/env python

import random
import torch as th
import numpy as np

from torch.utils.data.dataloader import default_collate
import torch.utils.data as dat
from torch.nn.utils.rnn import pad_sequence

from .audio import WaveReader

import soundfile as sf

# random_seed = 1453
# random.seed(random_seed)

# "aux_len": all_enroll_length,

EPS = 1e-10

def make_dataloader(train=True,
                    mix_scp_file=None,
                    enroll_scp_file=None,
                    noise_scp_file=None,
                    spk_list=None,
                    sample_rate=16000,
                    num_workers=4,
                    chunk_size=32000,
                    batch_size=16):
    dataset = Dataset(mix_scp_file=mix_scp_file,
                      enroll_scp_file=enroll_scp_file,
                      noise_scp_file=noise_scp_file,
                      spk_list=spk_list,
                      chunk_size=chunk_size,
                      sample_rate=sample_rate)
    return DataLoader(dataset,
                      train=train,
                      chunk_size=chunk_size,
                      batch_size=batch_size,
                      num_workers=num_workers)

class Dataset(object):
    """
    Per Utterance Loader
    """
    def __init__(self, mix_scp_file="", enroll_scp_file="", noise_scp_file="", spk_list=None, chunk_size=32000, sample_rate=8000):
        self.sample_rate = sample_rate
        self.spk_list = self._load_spk(spk_list)
        # a sampled segment must be at least half a chunk long
        self.seg_least = int(chunk_size // 2)

        # each scp line: <utt_id> <wav_path> <num_samples>
        with open(mix_scp_file, 'r') as f:
            lines = f.readlines()
        self.data = []

        for line in lines:
            parts = line.strip().split()
            sentence_id = parts[0]
            sentence_path = parts[1]
            data_len = parts[2]
            spk_id = (sentence_id.split('-')[0])[1:5]
            self.data.append((sentence_id, spk_id, sentence_path, data_len))

        with open(enroll_scp_file, 'r') as f:
            enroll_lines = f.readlines()
        self.enroll_data = []

        for line in enroll_lines:
            parts = line.strip().split()
            sentence_id = parts[0]
            sentence_path = parts[1]
            data_len = parts[2]
            spk_id = (sentence_id.split('-')[0])[1:5]
            self.enroll_data.append((sentence_id, spk_id, sentence_path, data_len))

        with open(noise_scp_file, 'r') as f:
            noise_lines = f.readlines()
        self.noise_data = []

        for line in noise_lines:
            parts = line.strip().split()
            sentence_id = parts[0]
            sentence_path = parts[1]
            data_len = parts[2]
            # spk_id = (sentence_id.split('-')[0])[1:5]
            self.noise_data.append((sentence_id, sentence_path, data_len))

        self.total_lines = len(self.data)
        self.total_enroll = self._enroll_data_len()
        self.total_noise = self._noise_data_len()

        if not self.data:
            raise ValueError("No valid lines found in the input file.")

    def _load_spk(self, spk_list_path):
        if spk_list_path is None:
            return []
        lines = open(spk_list_path).readlines()
        new_lines = []
        for line in lines:
            new_lines.append(line.strip())
        return new_lines

    def __len__(self):
        return len(self.data)

    def _enroll_data_len(self):
        return len(self.enroll_data)

    def _noise_data_len(self):
        return len(self.noise_data)

    def _get_segment_start_stop(self, seg_len, length):
        if seg_len is not None:
            start = random.randint(0, length - seg_len)
            stop = start + seg_len
        else:
            start = 0
            stop = None
        return start, stop

    def _mix(self, sources_list):
        # if self.seg_len:
        #     mix_length = self.seg_len
        # else:
        #     mix_length = self.common_length
        mix_length = self.common_length
        mixture = np.zeros(mix_length)
        for i, _ in enumerate(sources_list):
            mixture += sources_list[i]
        return mixture

    def __getitem__(self, idx):
        source_id, source_spk, source_path, all_source_length = self.data[idx]
        all_source_length = int(all_source_length)
        spk_idx = self.spk_list.index(source_spk)

        # pick an interfering utterance from a different speaker
        other_counter = 0
        while True:
            random_idx = np.random.randint(0, self.total_lines)
            if self.data[random_idx][1] != source_spk:
                other_id, other_spk, other_path, other_length = self.data[random_idx]
                other_length = int(other_length)
                if other_length > self.seg_least:
                    break
            other_counter += 1
            if other_counter >= self.total_lines:
                raise ValueError("All utterances are too short to mix")

        # crop the longer utterance to the length of the shorter one
        if all_source_length >= other_length:
            self.common_length = other_length
            start, stop = self._get_segment_start_stop(self.common_length, all_source_length)
            source_tmp, _ = sf.read(source_path, dtype="float32", start=start, stop=stop)
            other_tmp, _ = sf.read(other_path, dtype="float32")
        else:
            self.common_length = all_source_length
            start, stop = self._get_segment_start_stop(self.common_length, other_length)
            source_tmp, _ = sf.read(source_path, dtype="float32")
            other_tmp, _ = sf.read(other_path, dtype="float32", start=start, stop=stop)

        # pick a noise clip that is long enough to cover the mixture
        noise_counter = 0
        while True:
            random_idx = np.random.randint(0, self.total_noise)

            noise_id, noise_path, all_noise_length = self.noise_data[random_idx]
            all_noise_length = int(all_noise_length)

            if all_noise_length >= self.common_length:
                break
            noise_counter += 1
            if noise_counter >= self.total_noise:
                raise ValueError("No noise clip is long enough for mixing")

        # pick an enrollment utterance from the same speaker
        enroll_counter = 0
        while True:
            random_idx = np.random.randint(0, self.total_enroll)
            if self.enroll_data[random_idx][1] == source_spk:
                enroll_id, enroll_spk, enroll_path, all_enroll_length = self.enroll_data[random_idx]
                all_enroll_length = int(all_enroll_length)
                break

            enroll_counter += 1
            if enroll_counter >= self.total_enroll:
                raise ValueError("No enrollment utterance found for this speaker")

        # randomly pick one channel of each multi-channel recording
        source = source_tmp[:, np.random.randint(0, source_tmp.shape[1])]
        other = other_tmp[:, np.random.randint(0, other_tmp.shape[1])]

        noise_start, noise_stop = self._get_segment_start_stop(self.common_length, all_noise_length)
        noise, _ = sf.read(noise_path, dtype="float32", start=noise_start, stop=noise_stop)  # single channel?
        # noise = noise_tmp[:, np.random.randint(0, noise_tmp.shape[1])]
        # other_noise = self._mix([other, noise])
        desired_snr = np.random.uniform(-4, 4)  # target SNR in dB
        current_snr = 10 * np.log10(np.mean(source ** 2) / (np.mean(noise ** 2) + EPS) + EPS)
        scale_factor = 10 ** ((current_snr - desired_snr) / 20)
        scaled_noise = noise * scale_factor

        # sanity check of the resulting SNR (unused)
        snr = 10 * np.log10(np.mean(source ** 2) / (np.mean(scaled_noise ** 2) + EPS) + EPS)
        mixture = self._mix([source, other, scaled_noise])

        mixture = mixture.astype(np.float32)

        enroll_tmp, _ = sf.read(enroll_path, dtype="float32")
        enroll = enroll_tmp[:, np.random.randint(0, enroll_tmp.shape[1])]

        return {
            "mix": mixture,
            "ref": source,
            "aux": enroll,
            "aux_len": all_enroll_length,
            "spk_idx": spk_idx
        }

class ChunkSplitter(object):
    """
    Split utterance into small chunks
    """
    def __init__(self, chunk_size, train=True, least=16000):
        self.chunk_size = chunk_size
        self.least = least
        self.train = train

    def _make_chunk(self, eg, s):
        """
        Make a chunk instance, which contains:
            "mix": ndarray,
            "ref": [ndarray...]
        """
        chunk = dict()
        chunk["mix"] = eg["mix"][s:s + self.chunk_size]
        chunk["ref"] = eg["ref"][s:s + self.chunk_size]
        chunk["aux"] = eg["aux"]
        chunk["aux_len"] = eg["aux_len"]
        chunk["valid_len"] = int(self.chunk_size)
        chunk["spk_idx"] = eg["spk_idx"]
        return chunk

    def split(self, eg):
        N = eg["mix"].size
        # too short, throw away
        if N < self.least:
            return []
        chunks = []
        # padding zeros
        if N < self.chunk_size:
            P = self.chunk_size - N
            chunk = dict()
            chunk["mix"] = np.pad(eg["mix"], (0, P), "constant")
            chunk["ref"] = np.pad(eg["ref"], (0, P), "constant")
            chunk["aux"] = eg["aux"]
            chunk["aux_len"] = eg["aux_len"]
            chunk["valid_len"] = int(N)
            chunk["spk_idx"] = eg["spk_idx"]
            chunks.append(chunk)
        # else:
        #     # random select start point for training
        #     s = random.randint(0, N % self.least) if self.train else 0
        #     while True:
        #         if s + self.chunk_size > N:
        #             break
        #         chunk = self._make_chunk(eg, s)
        #         chunks.append(chunk)
        #         s += self.least
        #     return chunks

        else:
            if self.train:
                # randomly select a single start point for training
                s = random.randint(0, N - self.chunk_size)
                chunk = self._make_chunk(eg, s)
                chunks.append(chunk)
            else:
                s = 0
                while True:
                    if s + self.chunk_size > N:
                        break
                    chunk = self._make_chunk(eg, s)
                    chunks.append(chunk)
                    s += self.least
        return chunks

class DataLoader(object):
    """
    Online dataloader for chunk-level
    """
    def __init__(self,
                 dataset,
                 num_workers=4,
                 chunk_size=32000,
                 batch_size=16,
                 train=True):
        self.batch_size = batch_size
        self.train = train
        self.splitter = ChunkSplitter(chunk_size,
                                      train=train,
                                      least=chunk_size // 2)
        # just return batch of egs, support multiple workers
        self.eg_loader = dat.DataLoader(dataset,
                                        batch_size=batch_size // 2,
                                        num_workers=num_workers,
                                        shuffle=train,
                                        collate_fn=self._collate)

    def _collate(self, batch):
        """
        Online split utterances
        """
        chunk = []
        for eg in batch:
            chunk += self.splitter.split(eg)
        return chunk

    def _pad_aux(self, chunk_list):
        lens_list = []
        for chunk_item in chunk_list:
            lens_list.append(chunk_item['aux_len'])
        max_len = np.max(lens_list)
        # pad 0
        for idx in range(len(chunk_list)):
            P = max_len - len(chunk_list[idx]["aux"])
            chunk_list[idx]["aux"] = np.pad(chunk_list[idx]["aux"], (0, P), "constant")
        # # pad circle
        # for idx in range(len(chunk_list)):
        #     P = max_len - len(chunk_list[idx]["aux"])
        #     original_aux_len = len(chunk_list[idx]["aux"])
        #     # cyclically repeat the original content to fill the padding
        #     for i in range(P):
        #         chunk_list[idx]["aux"].append(chunk_list[idx]["aux"][i % original_aux_len])

        return chunk_list

    def _merge(self, chunk_list):
        """
        Merge chunk list into mini-batch
        """
        N = len(chunk_list)
        if self.train:
            random.shuffle(chunk_list)
        blist = []
        for s in range(0, N - self.batch_size + 1, self.batch_size):
            # padding aux info
            # self._pad_aux(chunk_list[s:s + self.batch_size])
            batch = default_collate(self._pad_aux(chunk_list[s:s + self.batch_size]))
            blist.append(batch)
        rn = N % self.batch_size
        return blist, chunk_list[-rn:] if rn else []

    def __iter__(self):
        chunk_list = []
        for chunks in self.eg_loader:
            chunk_list += chunks
            batch, chunk_list = self._merge(chunk_list)
            for obj in batch:
                yield obj



# def snr_xy(x, y):
#     return 10 * np.log10(np.mean(x ** 2) / (np.mean(y ** 2) + EPS) + EPS)

# def main(args):
#     wham_noise_dir = args.wham_dir
#     # Get train dir
#     subdir = os.path.join(wham_noise_dir, 'tr')
#     # List files in that dir
#     sound_paths = glob.glob(os.path.join(subdir, '**/*.wav'),
#                             recursive=True)
#     # Avoid running this script if it has already been run
#     if len(sound_paths) == 60000:
#         print("It appears that augmented files have already been generated.\n"
#               "Skipping data augmentation.")
#         return
#     elif len(sound_paths) != 20000:
#         print("It appears that augmented files have not been generated properly\n"
#               "Resuming augmentation.")
#         originals = [x for x in sound_paths if 'sp' not in x]
#         to_be_removed_08 = [x.replace('sp08', '') for x in sound_paths if 'sp08' in x]
#         to_be_removed_12 = [x.replace('sp12', '') for x in sound_paths if 'sp12' in x]
#         sound_paths_08 = list(set(originals) - set(to_be_removed_08))
#         sound_paths_12 = list(set(originals) - set(to_be_removed_12))
#         augment_noise(sound_paths_08, 0.8)
#         augment_noise(sound_paths_12, 1.2)
#     else:
#         print(f'Augmenting {subdir} files')
#         # Transform audio speed
#         augment_noise(sound_paths, 0.8)
#         augment_noise(sound_paths, 1.2)
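
The __getitem__ above draws a target SNR uniformly from [-4, 4] dB and rescales the noise amplitude to reach it: scaling by 10 ** ((current - desired) / 20) shifts the power ratio by exactly (current - desired) dB. A standalone sanity check of that arithmetic on synthetic signals (not project data):

import numpy as np

EPS = 1e-10
rng = np.random.default_rng(0)
source = rng.standard_normal(16000)
noise = 0.1 * rng.standard_normal(16000)

desired_snr = 5.0  # dB
current_snr = 10 * np.log10(np.mean(source ** 2) / (np.mean(noise ** 2) + EPS) + EPS)
scaled_noise = noise * 10 ** ((current_snr - desired_snr) / 20)
# the rescaled noise now sits at the desired SNR relative to the source
check = 10 * np.log10(np.mean(source ** 2) / (np.mean(scaled_noise ** 2) + EPS) + EPS)
print(round(check, 3))  # ~5.0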
utils/load_obj.py
ADDED
@@ -0,0 +1,18 @@
#!/usr/bin/env python

import torch as th

def load_obj(obj, device):
    """
    Offload tensor object in obj to cuda device
    """

    def cuda(obj):
        return obj.to(device) if isinstance(obj, th.Tensor) else obj

    if isinstance(obj, dict):
        return {key: load_obj(obj[key], device) for key in obj}
    elif isinstance(obj, list):
        return [load_obj(val, device) for val in obj]
    else:
        return cuda(obj)
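
A short usage sketch: load_obj recurses through dicts and lists, so a collated batch like those produced by utils/dataset.py can be moved to a device in one call, while non-tensor entries pass through unchanged. The batch below is a toy example, not project data.

import torch as th
from utils.load_obj import load_obj

batch = {
    "mix": th.zeros(4, 32000),
    "ref": th.zeros(4, 32000),
    "ids": ["a", "b", "c", "d"],  # left untouched
}
device = th.device("cuda:0" if th.cuda.is_available() else "cpu")
batch = load_obj(batch, device)
print(batch["mix"].device)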
utils/logger.py
ADDED
@@ -0,0 +1,22 @@
#!/usr/bin/env python

import logging

def get_logger(
        name,
        format_str="%(asctime)s [%(pathname)s:%(lineno)s - %(levelname)s ] %(message)s",
        date_format="%Y-%m-%d %H:%M:%S",
        file=False):
    """
    Get python logger instance
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    # file or console
    handler = logging.StreamHandler() if not file else logging.FileHandler(name)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt=format_str, datefmt=date_format)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger
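
Usage sketch. One quirk worth noting: with file=True the logger name doubles as the output file path passed to logging.FileHandler, so a writable path must be supplied (the path below is a placeholder).

from utils.logger import get_logger

console_logger = get_logger(__name__)
console_logger.info("training started")

file_logger = get_logger("train.log", file=True)  # placeholder path
file_logger.info("epoch 1 done")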
utils/sisdr.py
ADDED
@@ -0,0 +1,23 @@
#!/usr/bin/env python

import numpy as np

def sisdr(x, s, remove_dc=True):
    """
    Compute SI-SDR
    x: extracted signal
    s: reference signal (ground truth)
    """

    def vec_l2norm(x):
        return np.linalg.norm(x, 2)

    if remove_dc:
        x_zm = x - np.mean(x)
        s_zm = s - np.mean(s)
        t = np.inner(x_zm, s_zm) * s_zm / vec_l2norm(s_zm)**2
        n = x_zm - t
    else:
        t = np.inner(x, s) * s / vec_l2norm(s)**2
        n = x - t
    return 20 * np.log10(vec_l2norm(t) / vec_l2norm(n))
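
A quick sanity check on synthetic signals (not project data): a near-perfect estimate yields a very large SI-SDR, while a noisy estimate lands near the plain SNR of the added noise.

import numpy as np
from utils.sisdr import sisdr

rng = np.random.default_rng(0)
s = rng.standard_normal(16000)
print(sisdr(s + 1e-6 * rng.standard_normal(16000), s))  # very large, ~120 dB
print(sisdr(s + 0.3 * rng.standard_normal(16000), s))   # roughly 10 dB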
utils/timer.py
ADDED
@@ -0,0 +1,17 @@
#!/usr/bin/env python

import time

class Timer(object):
    """
    A timer to record the elapsed time
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.start = time.time()

    def elapsed(self):
        # elapsed time in minutes, not seconds
        return (time.time() - self.start) / 60