davesalvi commited on
Commit
36c50ca
·
1 Parent(s): acf7345

add rawnet2 code

Browse files
Files changed (4) hide show
  1. config/rawnet_config.yaml +50 -0
  2. src/audio_utils.py +41 -0
  3. src/rawnet_model.py +558 -0
  4. src/utils.py +247 -0
config/rawnet_config.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+
3
+ seed: 1234
4
+ num_thread: 6
5
+ prefetch_factor: 2
6
+
7
+ num_epochs: 150
8
+ early_stopping: 10
9
+ lr: 0.0001
10
+ weight_decay: 0.0001
11
+ batch_size: 128
12
+ #batch_size: 32
13
+
14
+ T_max: 100
15
+ eta_min: 0.00001
16
+
17
+ save_model_folder: 'checkpoints/rawnet2_model/'
18
+ save_results_folder: 'results/'
19
+ model_pretrained: 'RAWNET_ASVSPOOF.pth'
20
+
21
+ #amsgrad: 1
22
+ win_len: 3.0
23
+
24
+ training_asvspoof: True
25
+ training_FoR: True
26
+ training_InTheWild: True
27
+
28
+ train_model: True
29
+ eval_model: True
30
+
31
+ #model-related
32
+ model:
33
+ first_conv: 1024 # no. of filter coefficients
34
+ in_channels: 1
35
+ filts: [20, [20, 20], [20, 128], [128, 128]] # no. of filters channel in residual blocks
36
+ blocks: [2, 4]
37
+ nb_fc_node: 1024
38
+ gru_node: 1024
39
+ nb_gru_layer: 3
40
+ nb_classes: 2
41
+
42
+ old_model:
43
+ first_conv: 1024 # no. of filter coefficients
44
+ in_channels: 1
45
+ filts: [20, [20, 20], [20, 128], [128, 128]] # no. of filters channel in residual blocks
46
+ blocks: [2, 4]
47
+ nb_fc_node: 1024
48
+ gru_node: 1024
49
+ nb_gru_layer: 3
50
+ nb_classes: 2
src/audio_utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import warnings
4
+ import soundfile as sf
5
+
6
+ warnings.filterwarnings("ignore")
7
+
8
def read_audio(audio_path, dur=180, fs=16000, trim=False, int_type=False, windowing=False, freq_min=None, freq_max=6000):
    """Load an audio file and optionally trim, quantize and sub-window it.

    .wav files are read with soundfile and resampled to `fs` if needed;
    any other path is assumed to be a NumPy dump (np.load) already at 16 kHz.

    :param audio_path: path to a .wav file or a NumPy array dump
    :param dur: nominal maximum duration in seconds (sizes the windowing mask)
    :param fs: target sampling frequency for .wav input
    :param trim: if True, strip leading/trailing silence (librosa, top_db=20)
    :param int_type: if True, scale floats by 32768 and cast to int32
    :param windowing: if True, keep a periodic subset of samples (see NOTE)
    :param freq_min: unused — NOTE(review): dead parameter, confirm before removing
    :param freq_max: unused — NOTE(review): dead parameter, confirm before removing
    :return: tuple (samples, sampling frequency)
    """
    if audio_path.endswith('.wav'):
        X, fs_orig = sf.read(audio_path)
        # X, fs_orig = librosa.load(audio_path, sr=None, duration=dur)
        if fs_orig != fs:
            X = librosa.resample(X, orig_sr=fs_orig, target_sr=fs)
    else:
        # non-wav input: NumPy dump assumed to already be at 16 kHz
        X = np.load(audio_path)
        fs = 16000

    if trim:
        X = librosa.effects.trim(X, top_db=20)[0]
    # from float to int
    # NOTE(review): 32768 is the int16 full-scale factor but the cast is to
    # int32 — confirm the intended bit depth.
    if int_type:
        X = (X * 32768).astype(np.int32)
    if windowing:
        win_len = 3  # in seconds
        mask = np.zeros(dur*fs).astype(bool)
        # NOTE(review): each mask span is fs samples long, so this keeps only
        # the FIRST 1 second of every 3-second (win_len) window — confirm
        # whether win_len*fs was intended instead.
        for ii in range(mask.shape[0]//(win_len*fs)):
            mask[ii*win_len*fs:ii*win_len*fs+fs] = True
        mask = mask[:X.shape[0]]
        X = X[mask]

        # NOTE(review): this OVERWRITES the source file with the windowed
        # signal — a destructive side effect inside a "read" function
        # (presumably caching); confirm intent.
        sf.write(audio_path, X, fs)

    return X, fs
35
+
36
+
37
def mix_tracks(audio1, audio2):
    """Average two signals sample-by-sample over their common length.

    :param audio1: first signal (np.array)
    :param audio2: second signal (np.array)
    :return: mixture (audio1 + audio2) / 2, truncated to the shorter input
    """
    common = min(len(audio1), len(audio2))
    return (audio1[:common] + audio2[:common]) / 2
src/rawnet_model.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn.functional as F
2
+ from collections import OrderedDict
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch import Tensor
6
+ import sys
7
+ from src.audio_utils import *
8
+ import random
9
+ import pandas as pd
10
+ import pdb
11
+
12
class SincConv(nn.Module):
    """Fixed (non-learnable) sinc band-pass filterbank applied to raw audio.

    Band edges are spaced uniformly on the mel scale between 0 and fs/2.
    The impulse responses are Hamming-windowed ideal band-pass filters.
    """

    @staticmethod
    def to_mel(hz):
        """Convert frequency in Hz to the mel scale."""
        return 2595 * np.log10(1 + hz / 700)

    @staticmethod
    def to_hz(mel):
        """Convert a mel value back to Hz."""
        return 700 * (10 ** (mel / 2595) - 1)

    def __init__(self, device, out_channels, kernel_size, in_channels=1, sample_rate=16000,
                 stride=1, padding=0, dilation=1, bias=False, groups=1):
        """
        :param device: torch device the filters are moved to in forward()
        :param out_channels: number of band-pass filters
        :param kernel_size: filter length in samples (bumped to odd if even)
        :param in_channels: must be 1 (raw waveform)
        :param sample_rate: sampling frequency in Hz
        :raises ValueError: for in_channels != 1, bias=True or groups > 1
        """
        super(SincConv, self).__init__()

        if in_channels != 1:
            msg = "SincConv only support one input channel (here, in_channels = {%i})" % (in_channels)
            raise ValueError(msg)

        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.sample_rate = sample_rate

        # Forcing the filters to be odd (i.e, perfectly symmetrics)
        if kernel_size % 2 == 0:
            self.kernel_size = self.kernel_size + 1

        self.device = device
        self.stride = stride
        self.padding = padding
        self.dilation = dilation

        if bias:
            raise ValueError('SincConv does not support bias.')
        if groups > 1:
            raise ValueError('SincConv does not support groups.')

        # initialize filterbanks using Mel scale
        NFFT = 512
        f = int(self.sample_rate / 2) * np.linspace(0, 1, int(NFFT / 2) + 1)
        fmel = self.to_mel(f)  # Hz to mel conversion
        fmelmax = np.max(fmel)
        fmelmin = np.min(fmel)
        filbandwidthsmel = np.linspace(fmelmin, fmelmax, self.out_channels + 1)
        filbandwidthsf = self.to_hz(filbandwidthsmel)  # Mel to Hz conversion
        self.mel = filbandwidthsf
        self.hsupp = torch.arange(-(self.kernel_size - 1) / 2, (self.kernel_size - 1) / 2 + 1)
        self.band_pass = torch.zeros(self.out_channels, self.kernel_size)

        # PERF: the band edges are fixed, so build the impulse responses ONCE
        # here instead of recomputing them on every forward pass (the original
        # rebuilt the whole filterbank per call with identical results).
        window = Tensor(np.hamming(self.kernel_size))
        for i in range(len(self.mel) - 1):
            fmin = self.mel[i]
            fmax = self.mel[i + 1]
            hHigh = (2 * fmax / self.sample_rate) * np.sinc(2 * fmax * self.hsupp / self.sample_rate)
            hLow = (2 * fmin / self.sample_rate) * np.sinc(2 * fmin * self.hsupp / self.sample_rate)
            hideal = hHigh - hLow
            # ideal band-pass = high-pass sinc minus low-pass sinc, windowed
            self.band_pass[i, :] = window * Tensor(hideal)

    def forward(self, x):
        """Apply the fixed filterbank.

        :param x: waveform batch, shape (batch, 1, samples)
        :return: filtered signal, shape (batch, out_channels, frames)
        """
        band_pass_filter = self.band_pass.to(self.device)

        # kept as an attribute for parity with the original implementation
        self.filters = (band_pass_filter).view(self.out_channels, 1, self.kernel_size)

        return F.conv1d(x, self.filters, stride=self.stride,
                        padding=self.padding, dilation=self.dilation,
                        bias=None, groups=1)
77
+
78
+
79
class Residual_block(nn.Module):
    """Pre-activation 1-D residual block used by RawNet2.

    BN + LeakyReLU (skipped when `first`), two 3-tap convs, a 1x1 conv on
    the skip path when the channel count changes, then MaxPool(3).
    """

    def __init__(self, nb_filts, first=False):
        """
        :param nb_filts: [in_channels, out_channels] for the conv layers
        :param first: if True, skip the leading BN+LeakyReLU (block directly
            follows the sinc front-end)
        """
        super(Residual_block, self).__init__()
        self.first = first

        if not self.first:
            self.bn1 = nn.BatchNorm1d(num_features=nb_filts[0])

        self.lrelu = nn.LeakyReLU(negative_slope=0.3)

        self.conv1 = nn.Conv1d(in_channels=nb_filts[0],
                               out_channels=nb_filts[1],
                               kernel_size=3,
                               padding=1,
                               stride=1)

        self.bn2 = nn.BatchNorm1d(num_features=nb_filts[1])
        self.conv2 = nn.Conv1d(in_channels=nb_filts[1],
                               out_channels=nb_filts[1],
                               padding=1,
                               kernel_size=3,
                               stride=1)

        # 1x1 conv to match channel counts on the skip path when they differ
        if nb_filts[0] != nb_filts[1]:
            self.downsample = True
            self.conv_downsample = nn.Conv1d(in_channels=nb_filts[0],
                                             out_channels=nb_filts[1],
                                             padding=0,
                                             kernel_size=1,
                                             stride=1)

        else:
            self.downsample = False
        self.mp = nn.MaxPool1d(3)

    def forward(self, x):
        """Apply the residual block; time axis shrinks by 3 (MaxPool)."""
        identity = x
        if not self.first:
            out = self.bn1(x)
            out = self.lrelu(out)
        else:
            out = x

        # NOTE(review): conv1 is applied to the raw input x, so the
        # bn1/lrelu result computed above is DISCARDED. This matches the
        # official RawNet2 ASVspoof baseline code (and therefore any
        # pretrained checkpoint such as RAWNET_ASVSPOOF.pth) — preserved
        # deliberately; confirm before "fixing" to conv1(out).
        out = self.conv1(x)
        out = self.bn2(out)
        out = self.lrelu(out)
        out = self.conv2(out)

        if self.downsample:
            identity = self.conv_downsample(identity)

        out += identity
        out = self.mp(out)
        return out
133
+
134
+
135
+ class RawNet(nn.Module):
136
    def __init__(self, d_args, device):
        """Build the RawNet2 architecture from a config dictionary.

        :param d_args: model config with keys first_conv, in_channels, filts,
            nb_fc_node, gru_node, nb_gru_layer, nb_classes (see
            rawnet_config.yaml, `model` section)
        :param device: torch device passed to the sinc front-end
        """
        super(RawNet, self).__init__()

        self.device = device

        # Fixed sinc filterbank applied directly to the raw waveform
        self.Sinc_conv = SincConv(device=self.device,
                                  out_channels=d_args['filts'][0],
                                  kernel_size=d_args['first_conv'],
                                  in_channels=d_args['in_channels']
                                  )

        # self.Sinc_conv = SincConv(out_channels=d_args['filts'][0],
        #                           kernel_size=d_args['first_conv'])

        self.first_bn = nn.BatchNorm1d(num_features=d_args['filts'][0])
        self.selu = nn.SELU(inplace=True)
        # Six residual blocks: two narrow (filts[1]) then four wide (filts[2])
        self.block0 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][1], first=True))
        self.block1 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][1]))
        self.block2 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][2]))
        # NOTE(review): this mutates the caller's config dict IN PLACE so that
        # blocks 3-5 take the widened channel count as input — confirm callers
        # do not reuse d_args afterwards.
        d_args['filts'][2][0] = d_args['filts'][2][1]
        self.block3 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][2]))
        self.block4 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][2]))
        self.block5 = nn.Sequential(Residual_block(nb_filts=d_args['filts'][2]))
        self.avgpool = nn.AdaptiveAvgPool1d(1)

        # One attention gate (single Linear layer) per residual block
        self.fc_attention0 = self._make_attention_fc(in_features=d_args['filts'][1][-1],
                                                     l_out_features=d_args['filts'][1][-1])
        self.fc_attention1 = self._make_attention_fc(in_features=d_args['filts'][1][-1],
                                                     l_out_features=d_args['filts'][1][-1])
        self.fc_attention2 = self._make_attention_fc(in_features=d_args['filts'][2][-1],
                                                     l_out_features=d_args['filts'][2][-1])
        self.fc_attention3 = self._make_attention_fc(in_features=d_args['filts'][2][-1],
                                                     l_out_features=d_args['filts'][2][-1])
        self.fc_attention4 = self._make_attention_fc(in_features=d_args['filts'][2][-1],
                                                     l_out_features=d_args['filts'][2][-1])
        self.fc_attention5 = self._make_attention_fc(in_features=d_args['filts'][2][-1],
                                                     l_out_features=d_args['filts'][2][-1])

        # GRU back-end pooling the frame sequence into an utterance embedding
        self.bn_before_gru = nn.BatchNorm1d(num_features=d_args['filts'][2][-1])
        self.gru = nn.GRU(input_size=d_args['filts'][2][-1],
                          hidden_size=d_args['gru_node'],
                          num_layers=d_args['nb_gru_layer'],
                          batch_first=True)

        self.fc1_gru = nn.Linear(in_features=d_args['gru_node'],
                                 out_features=d_args['nb_fc_node'])

        self.fc2_gru = nn.Linear(in_features=d_args['nb_fc_node'],
                                 out_features=d_args['nb_classes'], bias=True)

        self.sig = nn.Sigmoid()
        self.logsoftmax = nn.LogSoftmax(dim=1)
188
+ #
189
+ # def forward(self, x, y=None):
190
+ #
191
+ # nb_samp = x.shape[0]
192
+ # len_seq = x.shape[1]
193
+ # x = x.view(nb_samp, 1, len_seq)
194
+ #
195
+ # x = self.Sinc_conv(x)
196
+ # x = F.max_pool1d(torch.abs(x), 3)
197
+ # x = self.first_bn(x)
198
+ # x = self.selu(x)
199
+ #
200
+ # x0 = self.block0(x)
201
+ # y0 = self.avgpool(x0).view(x0.size(0), -1) # torch.Size([batch, filter])
202
+ # y0 = self.fc_attention0(y0)
203
+ # y0 = self.sig(y0).view(y0.size(0), y0.size(1), -1) # torch.Size([batch, filter, 1])
204
+ # x = x0 * y0 + y0 # (batch, filter, time) x (batch, filter, 1)
205
+ #
206
+ # x1 = self.block1(x)
207
+ # y1 = self.avgpool(x1).view(x1.size(0), -1) # torch.Size([batch, filter])
208
+ # y1 = self.fc_attention1(y1)
209
+ # y1 = self.sig(y1).view(y1.size(0), y1.size(1), -1) # torch.Size([batch, filter, 1])
210
+ # x = x1 * y1 + y1 # (batch, filter, time) x (batch, filter, 1)
211
+ #
212
+ # x2 = self.block2(x)
213
+ # y2 = self.avgpool(x2).view(x2.size(0), -1) # torch.Size([batch, filter])
214
+ # y2 = self.fc_attention2(y2)
215
+ # y2 = self.sig(y2).view(y2.size(0), y2.size(1), -1) # torch.Size([batch, filter, 1])
216
+ # x = x2 * y2 + y2 # (batch, filter, time) x (batch, filter, 1)
217
+ #
218
+ # x3 = self.block3(x)
219
+ # y3 = self.avgpool(x3).view(x3.size(0), -1) # torch.Size([batch, filter])
220
+ # y3 = self.fc_attention3(y3)
221
+ # y3 = self.sig(y3).view(y3.size(0), y3.size(1), -1) # torch.Size([batch, filter, 1])
222
+ # x = x3 * y3 + y3 # (batch, filter, time) x (batch, filter, 1)
223
+ #
224
+ # x4 = self.block4(x)
225
+ # y4 = self.avgpool(x4).view(x4.size(0), -1) # torch.Size([batch, filter])
226
+ # y4 = self.fc_attention4(y4)
227
+ # y4 = self.sig(y4).view(y4.size(0), y4.size(1), -1) # torch.Size([batch, filter, 1])
228
+ # x = x4 * y4 + y4 # (batch, filter, time) x (batch, filter, 1)
229
+ #
230
+ # x5 = self.block5(x)
231
+ # y5 = self.avgpool(x5).view(x5.size(0), -1) # torch.Size([batch, filter])
232
+ # y5 = self.fc_attention5(y5)
233
+ # y5 = self.sig(y5).view(y5.size(0), y5.size(1), -1) # torch.Size([batch, filter, 1])
234
+ # x = x5 * y5 + y5 # (batch, filter, time) x (batch, filter, 1)
235
+ #
236
+ # x = self.bn_before_gru(x)
237
+ # x = self.selu(x)
238
+ # x = x.permute(0, 2, 1) # (batch, filt, time) >> (batch, time, filt)
239
+ # self.gru.flatten_parameters()
240
+ # x, _ = self.gru(x)
241
+ # x = x[:, -1, :]
242
+ # x = self.fc1_gru(x)
243
+ # x = self.fc2_gru(x)
244
+ # output = self.logsoftmax(x)
245
+ #
246
+ # return output
247
+
248
+ def forward(self, x):
249
+ # Pass through Residual Part
250
+ x = self._forward_residual_part(x)
251
+
252
+ # pdb.set_trace()
253
+
254
+ # Pass through Processing Part
255
+ x = self._forward_processing_part(x)
256
+
257
+ output = self.logsoftmax(x)
258
+ return output
259
+
260
+
261
+
262
+ def _forward_residual_part(self, x):
263
+ nb_samp = x.shape[0]
264
+ len_seq = x.shape[1]
265
+ x = x.view(nb_samp, 1, len_seq)
266
+
267
+ # pdb.set_trace()
268
+
269
+ x = self.Sinc_conv(x)
270
+ x = F.max_pool1d(torch.abs(x), 3)
271
+ x = self.first_bn(x)
272
+ x = self.selu(x)
273
+
274
+ x0 = self.block0(x)
275
+ y0 = self.avgpool(x0).view(x0.size(0), -1)
276
+ y0 = self.fc_attention0(y0)
277
+ y0 = self.sig(y0).view(y0.size(0), y0.size(1), -1)
278
+ x = x0 * y0 + y0
279
+
280
+ x1 = self.block1(x)
281
+ y1 = self.avgpool(x1).view(x1.size(0), -1)
282
+ y1 = self.fc_attention1(y1)
283
+ y1 = self.sig(y1).view(y1.size(0), y1.size(1), -1)
284
+ x = x1 * y1 + y1
285
+
286
+ x2 = self.block2(x)
287
+ y2 = self.avgpool(x2).view(x2.size(0), -1)
288
+ y2 = self.fc_attention2(y2)
289
+ y2 = self.sig(y2).view(y2.size(0), y2.size(1), -1)
290
+ x = x2 * y2 + y2
291
+
292
+ x3 = self.block3(x)
293
+ y3 = self.avgpool(x3).view(x3.size(0), -1)
294
+ y3 = self.fc_attention3(y3)
295
+ y3 = self.sig(y3).view(y3.size(0), y3.size(1), -1)
296
+ x = x3 * y3 + y3
297
+
298
+ x4 = self.block4(x)
299
+ y4 = self.avgpool(x4).view(x4.size(0), -1)
300
+ y4 = self.fc_attention4(y4)
301
+ y4 = self.sig(y4).view(y4.size(0), y4.size(1), -1)
302
+ x = x4 * y4 + y4
303
+
304
+ x5 = self.block5(x)
305
+ y5 = self.avgpool(x5).view(x5.size(0), -1)
306
+ y5 = self.fc_attention5(y5)
307
+ y5 = self.sig(y5).view(y5.size(0), y5.size(1), -1)
308
+ x = x5 * y5 + y5
309
+
310
+ return x
311
+
312
+ def _forward_processing_part(self, x):
313
+ x = self.bn_before_gru(x)
314
+ x = self.selu(x)
315
+ x = x.permute(0, 2, 1)
316
+ self.gru.flatten_parameters()
317
+ x, _ = self.gru(x)
318
+ x = x[:, -1, :]
319
+ x = self.fc1_gru(x)
320
+ x = self.fc2_gru(x)
321
+ return x
322
+
323
+
324
+ def freeze_processing_part(self):
325
+ for param in self.bn_before_gru.parameters():
326
+ param.requires_grad = False
327
+ for param in self.gru.parameters():
328
+ param.requires_grad = False
329
+ for param in self.fc1_gru.parameters():
330
+ param.requires_grad = False
331
+ for param in self.fc2_gru.parameters():
332
+ param.requires_grad = False
333
+
334
+ def unfreeze_processing_part(self):
335
+ for param in self.bn_before_gru.parameters():
336
+ param.requires_grad = True
337
+ for param in self.gru.parameters():
338
+ param.requires_grad = True
339
+ for param in self.fc1_gru.parameters():
340
+ param.requires_grad = True
341
+ for param in self.fc2_gru.parameters():
342
+ param.requires_grad = True
343
+
344
+ def freeze_residual_part(self):
345
+ for param in self.Sinc_conv.parameters():
346
+ param.requires_grad = False
347
+ for param in self.first_bn.parameters():
348
+ param.requires_grad = False
349
+ for param in self.block0.parameters():
350
+ param.requires_grad = False
351
+ for param in self.block1.parameters():
352
+ param.requires_grad = False
353
+ for param in self.block2.parameters():
354
+ param.requires_grad = False
355
+ for param in self.block3.parameters():
356
+ param.requires_grad = False
357
+ for param in self.block4.parameters():
358
+ param.requires_grad = False
359
+ for param in self.block5.parameters():
360
+ param.requires_grad = False
361
+ for param in self.fc_attention0.parameters():
362
+ param.requires_grad = False
363
+ for param in self.fc_attention1.parameters():
364
+ param.requires_grad = False
365
+ for param in self.fc_attention2.parameters():
366
+ param.requires_grad = False
367
+ for param in self.fc_attention3.parameters():
368
+ param.requires_grad = False
369
+ for param in self.fc_attention4.parameters():
370
+ param.requires_grad = False
371
+ for param in self.fc_attention5.parameters():
372
+ param.requires_grad = False
373
+
374
+ def unfreeze_residual_part(self):
375
+ for param in self.Sinc_conv.parameters():
376
+ param.requires_grad = True
377
+ for param in self.first_bn.parameters():
378
+ param.requires_grad = True
379
+ for param in self.block0.parameters():
380
+ param.requires_grad = True
381
+ for param in self.block1.parameters():
382
+ param.requires_grad = True
383
+ for param in self.block2.parameters():
384
+ param.requires_grad = True
385
+ for param in self.block3.parameters():
386
+ param.requires_grad = True
387
+ for param in self.block4.parameters():
388
+ param.requires_grad = True
389
+ for param in self.block5.parameters():
390
+ param.requires_grad = True
391
+ for param in self.fc_attention0.parameters():
392
+ param.requires_grad = True
393
+ for param in self.fc_attention1.parameters():
394
+ param.requires_grad = True
395
+ for param in self.fc_attention2.parameters():
396
+ param.requires_grad = True
397
+ for param in self.fc_attention3.parameters():
398
+ param.requires_grad = True
399
+ for param in self.fc_attention4.parameters():
400
+ param.requires_grad = True
401
+ for param in self.fc_attention5.parameters():
402
+ param.requires_grad = True
403
+
404
+
405
+ def get_embeddings(self, x):
406
+ nb_samp = x.shape[0]
407
+ len_seq = x.shape[1]
408
+ x = x.view(nb_samp, 1, len_seq)
409
+
410
+ x = self.Sinc_conv(x)
411
+ x = F.max_pool1d(torch.abs(x), 3)
412
+ x = self.first_bn(x)
413
+ x = self.selu(x)
414
+
415
+ x0 = self.block0(x)
416
+ y0 = self.avgpool(x0).view(x0.size(0), -1)
417
+ y0 = self.fc_attention0(y0)
418
+ y0 = self.sig(y0).view(y0.size(0), y0.size(1), -1)
419
+ x = x0 * y0 + y0
420
+
421
+ x1 = self.block1(x)
422
+ y1 = self.avgpool(x1).view(x1.size(0), -1)
423
+ y1 = self.fc_attention1(y1)
424
+ y1 = self.sig(y1).view(y1.size(0), y1.size(1), -1)
425
+ x = x1 * y1 + y1
426
+
427
+ x2 = self.block2(x)
428
+ y2 = self.avgpool(x2).view(x2.size(0), -1)
429
+ y2 = self.fc_attention2(y2)
430
+ y2 = self.sig(y2).view(y2.size(0), y2.size(1), -1)
431
+ x = x2 * y2 + y2
432
+
433
+ x3 = self.block3(x)
434
+ y3 = self.avgpool(x3).view(x3.size(0), -1)
435
+ y3 = self.fc_attention3(y3)
436
+ y3 = self.sig(y3).view(y3.size(0), y3.size(1), -1)
437
+ x = x3 * y3 + y3
438
+
439
+ x4 = self.block4(x)
440
+ y4 = self.avgpool(x4).view(x4.size(0), -1)
441
+ y4 = self.fc_attention4(y4)
442
+ y4 = self.sig(y4).view(y4.size(0), y4.size(1), -1)
443
+ x = x4 * y4 + y4
444
+
445
+ x5 = self.block5(x)
446
+ y5 = self.avgpool(x5).view(x5.size(0), -1)
447
+ y5 = self.fc_attention5(y5)
448
+ y5 = self.sig(y5).view(y5.size(0), y5.size(1), -1)
449
+ x = x5 * y5 + y5
450
+
451
+ x = self.bn_before_gru(x)
452
+ x = self.selu(x)
453
+ x = x.permute(0, 2, 1) # (batch, filt, time) >> (batch, time, filt)
454
+ self.gru.flatten_parameters()
455
+ x, _ = self.gru(x)
456
+
457
+ embeddings = x[:, -1, :] # Extract the embeddings from the GRU output
458
+
459
+ return embeddings
460
+
461
+
462
+ def _make_attention_fc(self, in_features, l_out_features):
463
+
464
+ l_fc = []
465
+
466
+ l_fc.append(nn.Linear(in_features=in_features,
467
+ out_features=l_out_features))
468
+
469
+ return nn.Sequential(*l_fc)
470
+
471
    def _make_layer(self, nb_blocks, nb_filts, first=False):
        """Stack nb_blocks Residual_blocks into a Sequential.

        Only the first block may receive first=True; after it, the input
        channel count is updated so subsequent blocks are channel-matched.

        NOTE(review): this mutates the caller's nb_filts list IN PLACE
        (nb_filts[0] = nb_filts[1] after the first block) — confirm callers
        expect that side effect.

        :param nb_blocks: number of residual blocks to stack
        :param nb_filts: [in_channels, out_channels]; element 0 is updated in place
        :param first: forwarded to the first block only
        :return: nn.Sequential of Residual_block modules
        """
        layers = []
        # def __init__(self, nb_filts, first = False):
        for i in range(nb_blocks):
            first = first if i == 0 else False
            layers.append(Residual_block(nb_filts=nb_filts,
                                         first=first))
            if i == 0: nb_filts[0] = nb_filts[1]

        return nn.Sequential(*layers)
481
+
482
+ def summary(self, input_size, batch_size=-1, device="cuda", print_fn=None):
483
+ if print_fn == None: printfn = print
484
+ model = self
485
+
486
+ def register_hook(module):
487
+ def hook(module, input, output):
488
+ class_name = str(module.__class__).split(".")[-1].split("'")[0]
489
+ module_idx = len(summary)
490
+
491
+ m_key = "%s-%i" % (class_name, module_idx + 1)
492
+ summary[m_key] = OrderedDict()
493
+ summary[m_key]["input_shape"] = list(input[0].size())
494
+ summary[m_key]["input_shape"][0] = batch_size
495
+ if isinstance(output, (list, tuple)):
496
+ summary[m_key]["output_shape"] = [
497
+ [-1] + list(o.size())[1:] for o in output
498
+ ]
499
+ else:
500
+ summary[m_key]["output_shape"] = list(output.size())
501
+ if len(summary[m_key]["output_shape"]) != 0:
502
+ summary[m_key]["output_shape"][0] = batch_size
503
+
504
+ params = 0
505
+ if hasattr(module, "weight") and hasattr(module.weight, "size"):
506
+ params += torch.prod(torch.LongTensor(list(module.weight.size())))
507
+ summary[m_key]["trainable"] = module.weight.requires_grad
508
+ if hasattr(module, "bias") and hasattr(module.bias, "size"):
509
+ params += torch.prod(torch.LongTensor(list(module.bias.size())))
510
+ summary[m_key]["nb_params"] = params
511
+
512
+ if (
513
+ not isinstance(module, nn.Sequential)
514
+ and not isinstance(module, nn.ModuleList)
515
+ and not (module == model)
516
+ ):
517
+ hooks.append(module.register_forward_hook(hook))
518
+
519
+ device = device.lower()
520
+ assert device in [
521
+ "cuda",
522
+ "cpu",
523
+ ], "Input device is not valid, please specify 'cuda' or 'cpu'"
524
+
525
+ if device == "cuda" and torch.cuda.is_available():
526
+ dtype = torch.cuda.FloatTensor
527
+ else:
528
+ dtype = torch.FloatTensor
529
+ if isinstance(input_size, tuple):
530
+ input_size = [input_size]
531
+ x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size]
532
+ summary = OrderedDict()
533
+ hooks = []
534
+ model.apply(register_hook)
535
+ model(*x)
536
+ for h in hooks:
537
+ h.remove()
538
+
539
+ print_fn("----------------------------------------------------------------")
540
+ line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
541
+ print_fn(line_new)
542
+ print_fn("================================================================")
543
+ total_params = 0
544
+ total_output = 0
545
+ trainable_params = 0
546
+ for layer in summary:
547
+ # input_shape, output_shape, trainable, nb_params
548
+ line_new = "{:>20} {:>25} {:>15}".format(
549
+ layer,
550
+ str(summary[layer]["output_shape"]),
551
+ "{0:,}".format(summary[layer]["nb_params"]),
552
+ )
553
+ total_params += summary[layer]["nb_params"]
554
+ total_output += np.prod(summary[layer]["output_shape"])
555
+ if "trainable" in summary[layer]:
556
+ if summary[layer]["trainable"] == True:
557
+ trainable_params += summary[layer]["nb_params"]
558
+ print_fn(line_new)
src/utils.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import random
4
+ import GPUtil
5
+ import yaml
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ from sklearn.metrics import roc_curve, auc, confusion_matrix
9
+ import pandas as pd
10
+ import torch.nn as nn
11
+
12
+
13
def set_gpu(id=-1):
    """
    Select the CUDA device exposed through CUDA_VISIBLE_DEVICES.

    Pass None for CPU-only, -1 to auto-pick the GPU with the most free
    memory (via GPUtil), or an explicit GPU index.

    :param id: if specified, corresponds to the GPU index desired.
    :return: chosen GPU index (nothing is returned on the CPU-only path)
    """
    if id is None:
        # CPU only
        print('GPU not selected')
        os.environ["CUDA_VISIBLE_DEVICES"] = str(-1)
    else:
        # -1 means: automatically pick the GPU with the most free memory
        chosen = GPUtil.getFirstAvailable(order='memory')[0] if id == -1 else id
        try:
            gpu_name = GPUtil.getGPUs()[chosen].name
        except IndexError:
            print('The selected GPU does not exist. Switching to the most available one.')
            chosen = GPUtil.getFirstAvailable(order='memory')[0]
            gpu_name = GPUtil.getGPUs()[chosen].name
        print('GPU selected: %d - %s' % (chosen, gpu_name))
        os.environ["CUDA_VISIBLE_DEVICES"] = str(chosen)
        return chosen
35
+
36
+
37
def prepare_asvspoof_data(config):
    """Convert ASVspoof 2019 LA / 2021 DF protocol files into CSVs of
    (path, label) pairs, with labels mapped bonafide -> 0, spoof -> 1.

    Writes three CSVs: train (2019 LA train), dev (2019 LA dev) and eval
    (2021 DF eval), to the paths given in config['df_train_path'],
    config['df_dev_path'] and config['df_eval_path'].

    NOTE(review): dataset locations are hard-coded to a specific NAS mount
    (/nas/public/dataset/...) — this only runs on that cluster.

    :param config: dict with keys df_train_path, df_dev_path, df_eval_path
    """

    data_dir_2019 = '/nas/public/dataset/asvspoof2019/LA/ASVspoof2019_LA_cm_protocols'
    data_eval_2021 = '/nas/public/dataset/asvspoof2021/DF_cm_eval_labels.txt'
    files = [os.path.join(data_dir_2019, 'ASVspoof2019.LA.cm.train.trn.txt'),
             os.path.join(data_dir_2019, 'ASVspoof2019.LA.cm.dev.trl.txt'), data_eval_2021]

    audio_dir_2019 = '/nas/public/dataset/asvspoof2019/LA'
    audio_dir_2021 = '/nas/public/dataset/asvspoof2021/ASVspoof2021_DF_eval/flac/'
    set_dirs = [os.path.join(audio_dir_2019, 'ASVspoof2019_LA_train/flac/'),
                os.path.join(audio_dir_2019, 'ASVspoof2019_LA_dev/flac/'), audio_dir_2021]

    save_paths = [config['df_train_path'], config['df_dev_path'], config['df_eval_path']]

    for file_path, set_dir, save_path in zip(files, set_dirs, save_paths):

        # protocol files are space-separated with no header
        txt_file = pd.read_csv(file_path, sep=' ', header=None)
        txt_file = txt_file.replace({'bonafide': 0, 'spoof': 1})

        # column 1 holds the utterance id; turn it into a full .flac path
        txt_file.iloc[:,1] = set_dir + txt_file.iloc[:,1].astype(str) + '.flac'

        # the 2021 eval protocol carries its label in column 5, 2019 in column 4
        if not file_path == data_eval_2021:
            df = txt_file[[1, 4]]
            df = df.rename({1: 'path', 4: 'label'}, axis='columns')
        else:
            df = txt_file[[1, 5]]
            df = df.rename({1: 'path', 5: 'label'}, axis='columns')

        df.to_csv(save_path)
66
+
67
+
68
def init_weights(module):
    """Xavier-initialise Linear layers (uniform weights, bias = 0.01).

    Intended to be passed to nn.Module.apply(); modules that are not
    nn.Linear are left untouched.

    :param module: any submodule visited by apply()
    """
    if not isinstance(module, nn.Linear):
        return
    torch.nn.init.xavier_uniform_(module.weight)
    module.bias.data.fill_(0.01)
72
+
73
+
74
def read_yaml(config_path):
    """
    Read YAML file.

    :param config_path: path to the YAML config file.
    :type config_path: str
    :return: dictionary correspondent to YAML content
    :rtype dict
    """
    with open(config_path, 'r') as yaml_file:
        return yaml.safe_load(yaml_file)
86
+
87
+
88
def sigmoid(x, factor=1):
    """
    Compute the logistic sigmoid 1 / (1 + exp(-factor * x)).

    :param x: input signal
    :param factor: slope parameter
    :return: sigmoid(x)
    :rtype np.array
    """
    return 1.0 / (1.0 + np.exp(-factor * x))
99
+
100
+
101
def plot_roc_curve(labels, pred, legend=None):
    """
    Plot ROC curve.

    Prints TPR@low-FPR, AUC, EER and the Youden-optimal threshold, draws the
    ROC curve on the current matplotlib figure, and returns the threshold
    maximising (TPR - FPR).

    :param labels: groundtruth labels
    :type labels: list
    :param pred: predicted score
    :type pred: list
    :param legend: if True, add legend to the plot
    :type legend: bool
    :return: threshold maximising TPR - FPR
    """
    # labels and pred bust be given in (N, ) shape

    def tpr5(y_true, y_pred):
        # NOTE(review): despite the name "tpr5", this reports TPR at the first
        # FPR >= 0.1 (10%), not 5% — confirm which operating point is intended.
        # Sorting fpr/tpr is a no-op since roc_curve returns them non-decreasing.
        fpr, tpr, thr = roc_curve(y_true, y_pred)
        fp_sort = sorted(fpr)
        tp_sort = sorted(tpr)
        tpr_ind = [i for (i, val) in enumerate(fp_sort) if val >= 0.1][0]
        tpr01 = tp_sort[tpr_ind]
        return tpr01

    lw = 3  # line width


    fpr, tpr, thres = roc_curve(labels, pred)
    rocauc = auc(fpr, tpr)
    fnr = 1 - tpr
    # EER approximated as the FPR where |FNR - FPR| is smallest
    eer = fpr[np.nanargmin(np.absolute(fnr - fpr))]
    # Youden's J statistic: threshold maximising TPR - FPR
    optimal_index = np.argmax(tpr - fpr)
    optimal_threshold = thres[optimal_index]

    print('TPR5 = {:.3f}'.format(tpr5(labels, pred)))
    print('AUC = {:.3f}'.format(rocauc))
    print('EER = {:.3f}'.format(eer))
    print('Best Thres. = {:.3f}'.format(optimal_threshold))
    print()
    if legend:
        plt.plot(fpr, tpr, lw=lw, label='$\mathrm{' + legend + ' - AUC = %0.2f}$' % rocauc)
    else:
        plt.plot(fpr, tpr, lw=lw, label='$\mathrm{AUC = %0.2f}$' % rocauc)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')  # chance diagonal
    plt.xlim([-0.02, 1.0])
    plt.ylim([0.0, 1.03])
    plt.xlabel(r'$\mathrm{False\;Positive\;Rate}$', fontsize=18)
    plt.ylabel(r'$\mathrm{True\;Positive\;Rate}$', fontsize=18)
    plt.legend(loc="lower right", fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.grid(True)
    # plt.show()

    return optimal_threshold
154
+
155
def plot_confusion_matrix(y_true, y_pred, normalize=False, cmap=plt.cm.Blues):
    """
    Plot confusion matrix.

    Classes are fixed to Real/Fake (binary deepfake-detection task).

    :param y_true: ground-truth labels
    :type y_true: list
    :param y_pred: predicted labels
    :type y_pred: list
    :param normalize: if set to True, normalise the confusion matrix.
    :type normalize: bool
    :param cmap: matplotlib cmap to be used for plot
    :type cmap:
    :return: the matplotlib Axes holding the plot
    """
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    # classes = classes[unique_labels(y_true, y_pred)]
    classes = ['$\it{Real}$','$\it{Fake}$']
    if normalize:
        # row-normalise: each true-class row sums to 1
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm)

    fsize = 25  # fontsize
    fig, ax = plt.subplots()
    # NOTE(review): clim=(0,1) only makes sense when normalize=True; with raw
    # counts the colour map saturates — confirm intent.
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap, clim=(0,1))
    cbar = ax.figure.colorbar(im, ax=ax)
    cbar.ax.tick_params(labelsize=fsize)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           )
    # NOTE(review): sklearn's confusion_matrix puts TRUE labels on rows
    # (imshow y-axis) and PREDICTED on columns, so these axis labels appear
    # swapped — confirm against how the figure is read downstream.
    ax.set_xlabel('$\mathrm{True\;label}$', fontsize=fsize)
    ax.set_ylabel('$\mathrm{Predicted\;label}$', fontsize=fsize)
    ax.set_xticklabels(classes, fontsize=fsize)
    ax.set_yticklabels(classes, fontsize=fsize)
    # Rotate the tick labels and set their alignment.
    # plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
    #          rotation_mode="anchor")
    # Loop over data dimensions and create text annotations.
    fmt = '.3f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # white text on dark cells, black on light ones
            ax.text(j, i, format('$\mathrm{' + str(format(cm[i, j], fmt)) + '}$'),
                    ha="center", va="center",
                    fontsize=fsize,
                    color="white" if np.array(cm[i, j]) > thresh else "black")
    fig.tight_layout()
    # plt.show()

    return ax
205
+
206
+
207
def reconstruct_from_pred(pred_array, win_len, hop_size, fs=16000):
    """
    Expand window-level predictions back to a per-sample score array.

    Each prediction is painted over the samples its window covered; where
    windows overlap, the non-zero contributions are averaged.

    :param pred_array: aggregated prediction array
    :type pred_array: list
    :param win_len: length of the window used for aggregation
    :type win_len: int
    :param hop_size: length of the hop used for aggregation
    :type hop_size: int
    :param fs: sampling frequency
    :type fs: int
    :return: reconstructed array
    """

    preds = np.array(pred_array)
    n_samples = int((len(preds) - 1) * hop_size * fs + win_len * fs)

    # one row per window, painted over the samples that window covered
    canvas = np.zeros((len(preds), n_samples))
    for idx, score in enumerate(preds):
        start = int(idx * hop_size * fs)
        stop = int((idx * hop_size + win_len) * fs)
        canvas[idx, start:stop] = score

    # zeros mark samples a window did not cover; treat them as NaN so the
    # mean only averages actual predictions (a literal 0 score is skipped too)
    return np.nanmean(np.where(canvas != 0, canvas, np.nan), 0)
233
+
234
+
235
def seed_everything(seed: int):
    """
    Seed every RNG used by the project (random, NumPy, PyTorch CPU and CUDA)
    and configure cuDNN for reproducible runs.

    :param seed: seed value
    :type seed: int
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # seed every visible GPU, not only the current one
    torch.backends.cudnn.deterministic = True
    # BUG FIX: benchmark must be False for reproducibility — with True,
    # cuDNN auto-tunes and may select non-deterministic algorithms, which
    # defeats deterministic=True above (see PyTorch reproducibility notes).
    torch.backends.cudnn.benchmark = False