Spaces:

xiaoxuezi
/

speaker_verification

Runtime error

App Files Files Community

xiaoxuezi committed on May 19, 2022

Commit

875baeb

1 Parent(s): c608f1c

2

Browse files

Files changed (19) hide show

lossfunction/.DS_Store +0 -0
lossfunction/AdditiveAngularMargin.py +50 -0
lossfunction/Unetloss.py +87 -0
lossfunction/__init__.py +7 -0
lossfunction/aamsoftmax.py +67 -0
lossfunction/aamsoftmaxproto.py +29 -0
lossfunction/amsoftmax.py +39 -0
lossfunction/angleproto.py +41 -0
lossfunction/ge2e.py +58 -0
lossfunction/proto.py +48 -0
lossfunction/softmax.py +22 -0
lossfunction/softmaxproto.py +37 -0
lossfunction/triplet.py +101 -0
net/.DS_Store +0 -0
net/ECAPATDNN.py +955 -0
net/ECAPA_TDNN.py +246 -0
net/ECAPA_TDNN_br.py +171 -0
net/__init__.py +16 -0
utils/.DS_Store +0 -0

lossfunction/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

lossfunction/AdditiveAngularMargin.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy
+import math
+from utils.acc import accuracy
+class AdditiveAngularMargin(nn.Module):
+    """Additive angular margin softmax loss on cosine logits.
+    The input embeddings and the class weight columns are L2-normalised, the
+    margin is added to the angle of the target class, and the scaled result
+    is scored with log-softmax + NLL.
+    Arguments
+    ---------
+    feature_dim : int
+        Size of the input embeddings (rows of the weight matrix).
+    n_classes : int
+        Number of classes (columns of the weight matrix).
+    margin : float
+        Angular margin m added to the target angle.
+    scale : float
+        Factor applied to the cosine logits before the softmax.
+    easy_margin : bool
+        If True the margin is only applied where cos(theta) > 0; otherwise
+        the fallback ``cos(theta) - mm`` is used once theta + m exceeds pi.
+    """
+    def __init__(self,
+                 feature_dim=256,
+                 n_classes=1000,
+                 margin=0.2,
+                 scale=30,
+                 easy_margin=False):
+        super(AdditiveAngularMargin, self).__init__()
+        self.margin = margin
+        self.scale = scale
+        self.easy_margin = easy_margin
+        self.w = nn.Parameter(torch.FloatTensor(feature_dim, n_classes))
+        nn.init.xavier_normal_(self.w)
+        self.cos_m = math.cos(self.margin)
+        self.sin_m = math.sin(self.margin)
+        # threshold / fallback offset used when theta + m would leave [0, pi]
+        self.th = math.cos(math.pi - self.margin)
+        self.mm = math.sin(math.pi - self.margin) * self.margin
+        self.nll_loss = nn.NLLLoss()
+        self.n_classes = n_classes
+        self.test_normalize = True
+    def forward(self, logits, targets):
+        """Return ``(loss, top-1 precision)`` for a batch.
+        Arguments
+        ---------
+        logits : torch.Tensor
+            Embeddings; multiplied with the (feature_dim, n_classes) weight,
+            so the second dimension must equal ``feature_dim``.
+        targets : torch.Tensor
+            Integer class indices, one per row of ``logits``.
+        """
+        logits = F.normalize(logits, p=2, dim=1, eps=1e-8)
+        wn = F.normalize(self.w, p=2, dim=0, eps=1e-8)
+        cosine = logits @ wn
+        # Rounding can push cosine**2 marginally above 1; without the clamp
+        # the square root would return NaN and poison the whole loss.
+        sine = torch.sqrt((1.0 - torch.square(cosine)).clamp(0, 1))
+        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
+        if self.easy_margin:
+            phi = torch.where(cosine > 0, phi, cosine)
+        else:
+            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
+        target_one_hot = F.one_hot(targets, self.n_classes)
+        # margin-adjusted logit for the target class, plain cosine elsewhere
+        outputs = (target_one_hot * phi) + ((1.0 - target_one_hot) * cosine)
+        outputs = self.scale * outputs
+        pred = F.log_softmax(outputs, dim=-1)
+        nloss = self.nll_loss(pred, targets)
+        prec1 = accuracy(pred.detach(), targets.detach(), topk=(1,))[0]
+        return nloss, prec1

lossfunction/Unetloss.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from lossfunction.softmaxproto import SoftmaxProto
+import torch.nn as nn
+import lossfunction.softmax as softmax
+import torch
+import torch.nn.functional as F
+import numpy
+class Unetloss(nn.Module):
+    """Speaker-embedding loss plus a weighted spectrogram reconstruction loss.
+    Arguments
+    ---------
+    nOut : int
+        Embedding size handed to ``SoftmaxProto``.
+    nClasses : int
+        Number of classes handed to ``SoftmaxProto``.
+    """
+    def __init__(self, nOut, nClasses):
+        super(Unetloss, self).__init__()
+        self.test_normalize = True
+        self.softmax = SoftmaxProto(nOut, nClasses)
+        self.mseloss = nn.MSELoss()
+        print('Initialised Unet Loss')
+    def forward(self, emb, spectrogram, x, label=None):
+        """Return ``(loss, prec1)``.
+        ``loss`` is the embedding loss plus 10x the MSE between
+        ``spectrogram`` and ``x``. ``prec1`` is None when the embedding loss
+        module does not report a precision.
+        """
+        result = self.softmax(emb, label)
+        # SoftmaxProto.forward may return just the loss tensor; unpacking a
+        # 0-dim tensor into two names raises, so accept both return shapes.
+        if isinstance(result, tuple):
+            nlossE, prec1 = result
+        else:
+            nlossE, prec1 = result, None
+        nlossS = self.mseloss(spectrogram, x)
+        # magnitudes observed by the author: nlossE 13.1695, nlossS 0.8902
+        return nlossE+10*nlossS, prec1
+class UnetMaskloss(nn.Module):
+    """Composite loss over six embeddings / spectrograms per sample.
+    Slots 0-2 form the "anchor" group and slots 3-5 the "positive" group.
+    The loss combines within-group MSE terms, a prototypical cross-entropy
+    between the two group means, a softmax classification loss over all
+    embeddings, and the matching MSE terms on the spectrograms.
+    Arguments
+    ---------
+    nOut : int
+        Embedding size handed to ``Softmax``.
+    nClasses : int
+        Number of classes handed to ``Softmax``.
+    """
+    def __init__(self, nOut, nClasses):
+        super(UnetMaskloss, self).__init__()
+        self.test_normalize = True
+        self.softmax = softmax.Softmax(nOut, nClasses)
+        self.mseloss = nn.MSELoss(reduction='sum')
+        self.criterion = torch.nn.CrossEntropyLoss()
+        print('Initialised UnetMask Loss')
+    def forward(self, emb, spectrogram, label=None):
+        """Return ``(loss, prec1)``; ``prec1`` comes from the softmax branch.
+        ``emb`` is indexed as ``emb[:, k, :]`` and ``spectrogram`` as
+        ``spectrogram[:, k, :, :]`` for k in 0..5.
+        """
+        # Slots 0..5 are read below, so fewer than six entries would fail
+        # later with a less helpful IndexError.
+        assert emb.size()[1] >= 6
+        nlossEd1 = self.mseloss(emb[:, 0, :], emb[:, 1, :])+self.mseloss(emb[:, 0, :], emb[:, 2, :])
+        nlossEd2 = self.mseloss(emb[:, 3, :], emb[:, 4, :])+self.mseloss(emb[:, 3, :], emb[:, 5, :])
+        emb_anchor = torch.mean(emb[:, 0:3, :], 1)
+        emb_positive = torch.mean(emb[:, 3:6, :], 1)
+        stepsize = emb_anchor.size()[0]
+        # NOTE(review): this relies on pairwise_distance reducing over dim 1 of
+        # the broadcast 3-D input - confirm against the installed PyTorch.
+        output = -1 * (F.pairwise_distance(emb_positive.unsqueeze(-1), emb_anchor.unsqueeze(-1).transpose(0, 2)) ** 2)
+        # Row i should match column i; build the targets on the same device
+        # as the scores instead of assuming CUDA.
+        label0 = torch.arange(stepsize, device=output.device)
+        nlossEP = self.criterion(output, label0)
+        nlossEC, prec1 = self.softmax(emb.reshape(-1, emb.size()[-1]), label.repeat_interleave(emb.size()[1]))
+        nlossSd1 = self.mseloss(spectrogram[:, 0, :, :], spectrogram[:, 1, :, :]) + self.mseloss(spectrogram[:, 0, :, :], spectrogram[:, 2, :, :])
+        nlossSd2 = self.mseloss(spectrogram[:, 3, :, :], spectrogram[:, 4, :, :]) + self.mseloss(
+            spectrogram[:, 3, :, :], spectrogram[:, 5, :, :])
+        spec_anchor = torch.mean(spectrogram[:, 0:3, :, :], 1)
+        spec_positive = torch.mean(spectrogram[:, 3:6, :, :], 1)
+        nlossS = self.mseloss(spec_anchor, spec_positive)
+        # magnitudes observed by the author:
+        # nlossEd1: 3.9563, nlossEd2: 3.5833, nlossEP: 0.6218, nlossEC: 8.7362,
+        # nlossSd1: 3.4339, nlossSd2: 30.1156, nlossS: 2.2820
+        loss = 100*(nlossEd1+nlossEd2)+10*nlossEP+nlossEC+nlossSd1+nlossSd2+10*nlossS
+        return loss, prec1
+if __name__ == "__main__":
+    # Scratch check of F.pairwise_distance on two random integer tensors.
+    # a = torch.tensor([[[1, 2], [3, 4]], [[1, 2], [3, 4]]])
+    # b = torch.tensor([[[2, 3], [4, 5]], [[1, 2], [3, 4]]])
+    a = torch.randint(10,(1,2,3))
+    b = torch.randint(10,(1,2,3))
+    print(a)
+    print(b)
+    # show both shapes (the second argument used to repeat a.shape)
+    print(a.shape,b.shape)
+    # loss_fn = torch.nn.MSELoss(reduce=False, size_average=True)
+    # input = torch.autograd.Variable(torch.from_numpy(a))
+    # target = torch.autograd.Variable(torch.from_numpy(b))
+    # loss = loss_fn(input.float(), target.float())
+    # print(loss)
+    distance = F.pairwise_distance(a, b)
+    print(distance.shape)
+    print(distance)

lossfunction/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from .softmaxproto import SoftmaxProto
+from .Unetloss import Unetloss, UnetMaskloss
+from .softmax import Softmax
+from .proto import proto
+from .AdditiveAngularMargin import AdditiveAngularMargin
+from .aamsoftmax import AamSoftmax
+from .aamsoftmaxproto import AamSoftmaxProto

lossfunction/aamsoftmax.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+from utils.acc import accuracy
+class AamSoftmax(nn.Module):
+    """Additive angular margin softmax loss.
+    Arguments
+    ---------
+    nOut : int
+        Embedding size; checked against the input in ``forward``.
+    nClasses : int
+        Number of classes (rows of the weight matrix).
+    margin : float
+        Angular margin m added to the target angle.
+    scale : float
+        Factor applied to the logits before the cross-entropy.
+    easy_margin : bool
+        Selects which condition decides where the margin is applied.
+    """
+    def __init__(self, nOut, nClasses, margin=0.2, scale=30, easy_margin=False, **kwargs):
+        super(AamSoftmax, self).__init__()
+        self.test_normalize = True
+        self.m = margin
+        self.s = scale
+        self.in_feats = nOut
+        self.easy_margin = easy_margin
+        self.weight = torch.nn.Parameter(torch.FloatTensor(nClasses, nOut), requires_grad=True)
+        nn.init.xavier_normal_(self.weight, gain=1)
+        self.ce = nn.CrossEntropyLoss()
+        # constants of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
+        self.cos_m = math.cos(self.m)
+        self.sin_m = math.sin(self.m)
+        # keep cos(theta + m) monotonically decreasing for theta in [0, 180] degrees
+        self.th = math.cos(math.pi - self.m)
+        self.mm = math.sin(math.pi - self.m) * self.m
+        print('Initialised AAMSoftmax margin %.3f scale %.3f'%(self.m,self.s))
+    def forward(self, x, label=None):
+        """Return ``(loss, top-1 precision)`` for embeddings ``x`` and class indices ``label``."""
+        assert x.size()[0] == label.size()[0]
+        assert x.size()[1] == self.in_feats
+        # cos(theta) between each embedding and each class weight
+        cos_theta = F.linear(F.normalize(x), F.normalize(self.weight))
+        sin_theta = torch.sqrt((1.0 - cos_theta * cos_theta).clamp(0, 1))
+        # cos(theta + m)
+        cos_theta_m = cos_theta * self.cos_m - sin_theta * self.sin_m
+        if self.easy_margin:
+            apply_margin = cos_theta > 0
+            fallback = cos_theta
+        else:
+            apply_margin = (cos_theta - self.th) > 0
+            fallback = cos_theta - self.mm
+        cos_theta_m = torch.where(apply_margin, cos_theta_m, fallback)
+        # 1 at the target class of every row, 0 elsewhere
+        target_mask = torch.zeros_like(cos_theta)
+        target_mask.scatter_(1, label.view(-1, 1), 1)
+        logits = (target_mask * cos_theta_m) + ((1.0 - target_mask) * cos_theta)
+        logits = logits * self.s
+        loss = self.ce(logits, label)
+        prec1 = accuracy(logits.detach(), label.detach(), topk=(1,))[0]
+        return loss, prec1
+if __name__ == "__main__":
+    # Smoke test: 32 random 512-dim embeddings with labels from 1000 classes.
+    feats = torch.randn(32, 512)
+    targets = torch.randint(1000, size=(32,))
+    print(feats.shape, targets.shape)
+    criterion = AamSoftmax(512, 1000)
+    nloss, prec1 = criterion(feats, targets)
+    print(nloss, prec1)

lossfunction/aamsoftmaxproto.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import torch
+import torch.nn as nn
+import lossfunction.aamsoftmax as aamsoftmax
+import lossfunction.angleproto as angleproto
+class AamSoftmaxProto(nn.Module):
+    """Sum of an AAM-softmax classification loss and an angular prototypical loss.
+    ``forward`` expects exactly two embeddings per row (asserted on dim 1).
+    """
+    def __init__(self, nOut, nClasses, margin, scale):
+        super(AamSoftmaxProto, self).__init__()
+        self.test_normalize = True
+        self.aamsoftmax = aamsoftmax.AamSoftmax(nOut, nClasses, margin, scale)
+        self.angleproto = angleproto.AngleProto()
+        print('Initialised AamSoftmaxPrototypical Loss')
+    def forward(self, x, label=None):
+        """Return ``(classification loss + prototypical loss, prec1)``.
+        ``prec1`` is the precision reported by the AAM-softmax branch.
+        """
+        assert x.size()[1] == 2
+        # flatten the pairs and repeat every label once per embedding
+        flat_emb = x.reshape(-1, x.size()[-1])
+        flat_label = label.repeat_interleave(2)
+        nlossS, prec1 = self.aamsoftmax(flat_emb, flat_label)
+        nlossP, _ = self.angleproto(x, None)
+        # magnitudes observed by the author: lossP 0.6678, nlossS 13.6913
+        total = nlossS + nlossP
+        return total, prec1

lossfunction/amsoftmax.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import torch
+import torch.nn as nn
+from utils.acc import accuracy
+class AmSoftmax(nn.Module):
+    """Additive margin softmax loss: ``s * (cos(theta) - m)`` on the target class.
+    Arguments
+    ---------
+    nOut : int
+        Embedding size; checked against the input in ``forward``.
+    nClasses : int
+        Number of classes (columns of the weight matrix).
+    margin : float
+        Value subtracted from the target-class cosine.
+    scale : float
+        Factor applied to the logits before the cross-entropy.
+    """
+    def __init__(self, nOut, nClasses, margin=0.3, scale=15, **kwargs):
+        super(AmSoftmax, self).__init__()
+        self.test_normalize = True
+        self.m = margin
+        self.s = scale
+        self.in_feats = nOut
+        self.W = torch.nn.Parameter(torch.randn(nOut, nClasses), requires_grad=True)
+        self.ce = nn.CrossEntropyLoss()
+        nn.init.xavier_normal_(self.W, gain=1)
+        print('Initialised AMSoftmax m=%.3f s=%.3f'%(self.m,self.s))
+    def forward(self, x, label=None):
+        """Return ``(loss, top-1 precision)`` for embeddings ``x`` and class indices ``label``."""
+        assert x.size()[0] == label.size()[0]
+        assert x.size()[1] == self.in_feats
+        # L2-normalise embeddings (rows) and class weights (columns)
+        x_norm = torch.norm(x, p=2, dim=1, keepdim=True).clamp(min=1e-12)
+        x_norm = torch.div(x, x_norm)
+        w_norm = torch.norm(self.W, p=2, dim=0, keepdim=True).clamp(min=1e-12)
+        w_norm = torch.div(self.W, w_norm)
+        costh = torch.mm(x_norm, w_norm)
+        # Margin m at the target class of each row, 0 elsewhere. Built next to
+        # costh so there is no CPU round trip and no assumption about which
+        # CUDA device is current.
+        delt_costh = torch.zeros_like(costh).scatter_(1, label.view(-1, 1), self.m)
+        costh_m = costh - delt_costh
+        costh_m_s = self.s * costh_m
+        loss    = self.ce(costh_m_s, label)
+        prec1   = accuracy(costh_m_s.detach(), label.detach(), topk=(1,))[0]
+        return loss, prec1

lossfunction/angleproto.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy
+from utils.acc import accuracy
+class AngleProto(nn.Module):
+    """Angular prototypical loss with a learnable scale and bias, plus an MSE term.
+    Arguments
+    ---------
+    init_w : float
+        Initial value of the learnable scale applied to the cosine matrix.
+    init_b : float
+        Initial value of the learnable bias added to the cosine matrix.
+    """
+    def __init__(self, init_w=10.0, init_b=-5.0):
+        super(AngleProto, self).__init__()
+        self.test_normalize = True
+        self.w = nn.Parameter(torch.tensor(init_w))
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.criterion  = torch.nn.CrossEntropyLoss()
+        self.mse = torch.nn.MSELoss()
+        print('Initialised AngleProto')
+    def forward(self, x, label=None):
+        """Return ``(loss, top-1 precision)``.
+        ``x`` is indexed as ``x[:, k, :]``: entry 0 of every row is the query
+        and the mean of the remaining entries is the prototype. The ``label``
+        argument is ignored; targets are the row indices.
+        """
+        assert x.size()[1] >= 2
+        out_anchor      = torch.mean(x[:,1:,:],1)
+        out_positive    = x[:,0,:]
+        stepsize        = out_anchor.size()[0]
+        # (batch, batch) matrix: cosine of every query against every prototype
+        cos_sim_matrix  = F.cosine_similarity(out_positive.unsqueeze(-1),out_anchor.unsqueeze(-1).transpose(0,2))
+        # Keep the scale positive. The clamp result used to be discarded, so
+        # it had no effect; use the clamped value explicitly.
+        w = torch.clamp(self.w, min=1e-6)
+        cos_sim_matrix = cos_sim_matrix * w + self.b
+        # Query i should match prototype i; build the targets on the input's
+        # device instead of assuming CUDA.
+        label   = torch.arange(stepsize, device=x.device)
+        nloss   = self.criterion(cos_sim_matrix, label) + self.mse(out_positive, out_anchor)
+        prec1   = accuracy(cos_sim_matrix.detach(), label.detach(), topk=(1,))[0]
+        return nloss, prec1

lossfunction/ge2e.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy
+from utils.acc import accuracy
+class Ge2e(nn.Module):
+    """GE2E-style loss: each utterance is scored against every row's centroid.
+    Arguments
+    ---------
+    init_w : float
+        Initial value of the learnable scale applied to the cosine scores.
+    init_b : float
+        Initial value of the learnable bias added to the cosine scores.
+    """
+    def __init__(self, init_w=10.0, init_b=-5.0, **kwargs):
+        super(Ge2e, self).__init__()
+        self.test_normalize = True
+        self.w = nn.Parameter(torch.tensor(init_w))
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.criterion  = torch.nn.CrossEntropyLoss()
+        print('Initialised GE2E')
+    def forward(self, x, label=None):
+        """Return ``(loss, top-1 precision)``.
+        ``x`` is indexed as ``x[:, k, :]`` with at least two entries per row.
+        The ``label`` argument is ignored; targets are the row indices.
+        """
+        assert x.size()[1] >= 2
+        gsize = x.size()[1]
+        centroids = torch.mean(x, 1)
+        stepsize = x.size()[0]
+        cos_sim_matrix = []
+        for ii in range(0,gsize):
+            idx = [*range(0,gsize)]
+            idx.remove(ii)
+            # centroid of the same row with utterance ii left out
+            exc_centroids = torch.mean(x[:,idx,:], 1)
+            cos_sim_diag    = F.cosine_similarity(x[:,ii,:],exc_centroids)
+            cos_sim         = F.cosine_similarity(x[:,ii,:].unsqueeze(-1),centroids.unsqueeze(-1).transpose(0,2))
+            # on the diagonal, use the leave-one-out centroid instead
+            cos_sim[range(0,stepsize),range(0,stepsize)] = cos_sim_diag
+            cos_sim_matrix.append(torch.clamp(cos_sim,1e-6))
+        cos_sim_matrix = torch.stack(cos_sim_matrix,dim=1)
+        # Keep the scale positive. The clamp result used to be discarded, so
+        # it had no effect; use the clamped value explicitly.
+        w = torch.clamp(self.w, min=1e-6)
+        cos_sim_matrix = cos_sim_matrix * w + self.b
+        # Targets live on the input's device instead of assuming CUDA; every
+        # row index is repeated once per utterance.
+        label = torch.arange(stepsize, device=x.device)
+        targets = torch.repeat_interleave(label,repeats=gsize,dim=0)
+        scores = cos_sim_matrix.view(-1,stepsize)
+        nloss = self.criterion(scores, targets)
+        prec1 = accuracy(scores.detach(), targets.detach(), topk=(1,))[0]
+        return nloss, prec1
+if __name__ == "__main__":
+    # Smoke test; use the GPU when there is one instead of requiring CUDA.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    x = torch.randn(32, 10, 512).to(device)
+    y = torch.randint(1000, size=(32,)).to(device)
+    print(x.shape, y.shape)
+    loss = Ge2e()
+    nloss, prec1 = loss(x, y)
+    print(nloss, prec1)

lossfunction/proto.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy
+from utils.acc import accuracy
+class proto(nn.Module):
+    """Prototypical loss based on negative squared pairwise distance."""
+    def __init__(self, **kwargs):
+        super(proto, self).__init__()
+        self.test_normalize = False
+        self.criterion  = torch.nn.CrossEntropyLoss()
+        print('Initialised Prototypical Loss')
+    def forward(self, x, label=None):
+        """Return ``(loss, top-1 precision)``.
+        ``x`` is indexed as ``x[:, k, :]``: entry 0 of every row is the query
+        and the mean of the remaining entries is the prototype. The ``label``
+        argument is ignored; targets are the row indices.
+        """
+        assert x.size()[1] >= 2
+        out_anchor      = torch.mean(x[:, 1:, :], 1)
+        out_positive    = x[:, 0, :]
+        stepsize        = out_anchor.size()[0]
+        # Shapes such as (10, 512, 1) and (1, 512, 10) broadcast into a score
+        # matrix; training pulls matching pairs (the diagonal) together.
+        # NOTE(review): this relies on pairwise_distance reducing over dim 1 of
+        # the broadcast 3-D input - confirm against the installed PyTorch.
+        output  = -1 * (F.pairwise_distance(out_positive.unsqueeze(-1), out_anchor.unsqueeze(-1).transpose(0,2))**2)
+        # Query i should match prototype i; build the targets on the same
+        # device as the scores instead of assuming CUDA.
+        label   = torch.arange(stepsize, device=output.device)
+        nloss   = self.criterion(output, label)
+        prec1   = accuracy(output.detach(), label.detach(), topk=(1,))[0]
+        return nloss, prec1
+if __name__ == "__main__":
+    # Scratch check of F.pairwise_distance on two random integer tensors.
+    # x = torch.randn(10, 10, 512)
+    # loss = LossFunction()
+    # nloss, prec1 = loss(x)
+    # print(nloss, prec1)
+    left = torch.randint(10, (10,512,10))
+    right = torch.randint(10, (10,512,10))
+    dist = F.pairwise_distance(left, right)
+    print(dist)
+    print(dist.shape)

lossfunction/softmax.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch
+import torch.nn as nn
+from utils.acc import accuracy
+class Softmax(nn.Module):
+    """Linear classifier followed by cross-entropy.
+    Arguments
+    ---------
+    nOut : int
+        Input feature size of the linear layer.
+    nClasses : int
+        Number of output classes.
+    """
+    def __init__(self, nOut, nClasses):
+        super(Softmax, self).__init__()
+        self.test_normalize = True
+        self.criterion = torch.nn.CrossEntropyLoss()
+        self.fc = nn.Linear(nOut, nClasses)
+        print('Initialised Softmax Loss')
+    def forward(self, x, label=None):
+        """Return ``(loss, top-1 precision)`` for features ``x`` and class indices ``label``."""
+        logits = self.fc(x)
+        top1 = accuracy(logits.detach(), label.detach(), topk=(1,))[0]
+        return self.criterion(logits, label), top1

lossfunction/softmaxproto.py ADDED Viewed

	@@ -0,0 +1,37 @@

+#! /usr/bin/python
+# -*- encoding: utf-8 -*-
+import torch
+import torch.nn as nn
+import lossfunction.softmax as softmax
+import lossfunction.angleproto as angleproto
+class SoftmaxProto(nn.Module):
+    """Sum of a softmax classification loss and an angular prototypical loss.
+    Arguments
+    ---------
+    nOut : int
+        Embedding size handed to ``Softmax``.
+    nClasses : int
+        Number of classes handed to ``Softmax``.
+    """
+    def __init__(self, nOut, nClasses):
+        super(SoftmaxProto, self).__init__()
+        self.test_normalize = True
+        self.softmax = softmax.Softmax(nOut, nClasses)
+        self.angleproto = angleproto.AngleProto()
+        print('Initialised SoftmaxPrototypical Loss')
+    def forward(self, x, label=None):
+        """Return the summed loss tensor only (no precision value).
+        NOTE(review): the other loss modules return ``(loss, prec1)`` and the
+        tuple form of the return is commented out here - confirm which shape
+        callers expect before changing it.
+        """
+        emb_dim = x.size()[-1]
+        if x.size()[1] != 2:
+            # 2 is nPerSpeaker: regroup the embeddings into pairs
+            x = x.reshape(-1, 2, emb_dim).squeeze(1)
+        assert x.size()[1] == 2
+        flat_emb = x.reshape(-1, x.size()[-1])
+        flat_label = label.repeat_interleave(2)
+        nlossS, prec1 = self.softmax(flat_emb, flat_label)
+        nlossP, _ = self.angleproto(x, None)
+        # magnitudes observed by the author: lossP 0.6678, nlossS 13.6913
+        # return nlossS + nlossP, prec1
+        return nlossS + nlossP

lossfunction/triplet.py ADDED Viewed

	@@ -0,0 +1,101 @@

+#! /usr/bin/python
+# -*- encoding: utf-8 -*-
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy
+from tuneThreshold import tuneThresholdfromScore
+import random
+class LossFunction(nn.Module):
+    """Triplet loss with in-batch negative mining.
+    Arguments
+    ---------
+    hard_rank : int
+        If negative, semi-hard mining is used. Otherwise negatives are drawn
+        from the ``hard_rank + 1`` most similar candidates.
+    hard_prob : float
+        Probability of using rank-based selection instead of a uniformly
+        random negative (only consulted when ``hard_rank >= 0``).
+    margin : float
+        Margin of the triplet loss; also the width of the semi-hard band.
+    """
+    def __init__(self, hard_rank=0, hard_prob=0, margin=0, **kwargs):
+        super(LossFunction, self).__init__()
+        self.test_normalize = True
+        self.hard_rank  = hard_rank
+        self.hard_prob  = hard_prob
+        self.margin     = margin
+        print('Initialised Triplet Loss')
+    def forward(self, x, label=None):
+        """Return ``(loss, errors[1])``.
+        ``x`` must hold exactly two embeddings per row (anchor, positive).
+        ``errors`` is whatever ``tuneThresholdfromScore`` returns for the
+        positive and negative distances; ``label`` is unused.
+        """
+        assert x.size()[1] == 2
+        out_anchor      = F.normalize(x[:,0,:], p=2, dim=1)
+        out_positive    = F.normalize(x[:,1,:], p=2, dim=1)
+        # similarity matrix (negative squared distance) used only for mining
+        output      = -1 * (F.pairwise_distance(out_anchor.unsqueeze(-1),out_positive.unsqueeze(-1).transpose(0,2))**2)
+        negidx      = self.mineHardNegative(output.detach())
+        out_negative = out_positive[negidx,:]
+        labelnp     = numpy.array([1]*len(out_positive)+[0]*len(out_negative))
+        ## calculate distances
+        pos_dist    = F.pairwise_distance(out_anchor,out_positive)
+        neg_dist    = F.pairwise_distance(out_anchor,out_negative)
+        ## loss function
+        nloss   = torch.mean(F.relu(torch.pow(pos_dist, 2) - torch.pow(neg_dist, 2) + self.margin))
+        scores = -1 * torch.cat([pos_dist,neg_dist],dim=0).detach().cpu().numpy()
+        errors = tuneThresholdfromScore(scores, labelnp, [])
+        return nloss, errors[1]
+    ## ===== ===== ===== ===== ===== ===== ===== =====
+    ## Hard negative mining
+    ## ===== ===== ===== ===== ===== ===== ===== =====
+    def mineHardNegative(self, output):
+        """Pick one negative column index for every row of ``output``.
+        ``output`` is a square similarity matrix whose diagonal entry of row
+        ``idx`` is the positive pair. Returns a list of index tensors.
+        """
+        negidx = []
+        for idx, similarity in enumerate(output):
+            simval, simidx = torch.sort(similarity,descending=True)
+            if self.hard_rank < 0:
+                ## Semi hard negative mining
+                semihardidx = simidx[(similarity[idx] - self.margin < simval) & (simval < similarity[idx])]
+                if len(semihardidx) == 0:
+                    negidx.append(random.choice(simidx))
+                else:
+                    negidx.append(random.choice(semihardidx))
+            else:
+                ## Rank based negative mining
+                simidx = simidx[simidx!=idx]
+                if random.random() < self.hard_prob:
+                    # hard_rank may exceed the number of candidates; cap it so
+                    # the lookup cannot run past the end of simidx
+                    top = min(self.hard_rank, len(simidx) - 1)
+                    negidx.append(simidx[random.randint(0, top)])
+                else:
+                    negidx.append(random.choice(simidx))
+        return negidx
+if __name__=="__main__":
+   # Smoke test: 32 random (anchor, positive) pairs of 512-dim embeddings.
+   pairs = torch.randn(32, 2, 512)
+   criterion = LossFunction()
+   nloss, errors = criterion(pairs)
+   print(nloss, errors)

net/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

net/ECAPATDNN.py ADDED Viewed

	@@ -0,0 +1,955 @@

+"""A popular speaker recognition and diarization model.
+Authors
+ * Hwidong Na 2020
+"""
+# import os
+import torch  # noqa: F401
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+import torchaudio
+def length_to_mask(length, max_len=None, dtype=None, device=None):
+    """Build a per-sequence mask that is set for positions below the length.
+    Reference: https://discuss.pytorch.org/t/how-to-generate-variable-length-mask/23397/3
+    Arguments
+    ---------
+    length : torch.Tensor
+        1D tensor with the length of every sequence in the batch.
+    max_len : int
+        Width of the mask (its second dimension). Defaults to the largest
+        value in ``length``.
+    dtype : torch.dtype, default: None
+        Dtype of the returned mask. Defaults to the dtype of ``length``.
+    device: torch.device, default: None
+        Device of the returned mask. Defaults to the device of ``length``.
+    Returns
+    -------
+    mask : tensor
+        Tensor of shape (len(length), max_len).
+    Example
+    -------
+    >>> length=torch.Tensor([1,2,3])
+    >>> mask=length_to_mask(length)
+    >>> mask
+    tensor([[1., 0., 0.],
+            [1., 1., 0.],
+            [1., 1., 1.]])
+    """
+    assert length.dim() == 1
+    if max_len is None:
+        max_len = length.max().long().item()
+    # position index of every column, repeated for each sequence
+    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
+    positions = positions.expand(len(length), max_len)
+    inside = positions < length.unsqueeze(1)
+    out_dtype = length.dtype if dtype is None else dtype
+    out_device = length.device if device is None else device
+    return torch.as_tensor(inside, dtype=out_dtype, device=out_device)
+def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
+    """Compute the left/right padding sizes for a 1d convolution.
+    Arguments
+    ---------
+    L_in : int
+        Length of the input along the padded axis.
+    stride: int
+        Stride of the convolution.
+    kernel_size : int
+        Kernel size of the convolution.
+    dilation : int
+        Dilation of the convolution.
+    Returns
+    -------
+    padding : list of int
+        ``[left, right]`` number of elements to add on each side.
+    """
+    if stride > 1:
+        # With striding the padding depends on the kernel size only; the
+        # output-length values formerly computed here were never used.
+        return [kernel_size // 2, kernel_size // 2]
+    # Unpadded output length, then split the shortfall evenly over both sides.
+    L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
+    return [(L_in - L_out) // 2, (L_in - L_out) // 2]
+class _Conv1d(nn.Module):
+    """This class implements 1d convolution with explicit padding handling.
+    Arguments
+    ---------
+    out_channels : int
+        It is the number of output channels.
+    kernel_size : int
+        Kernel size of the convolutional filters.
+    input_shape : tuple
+        The shape of the input. Alternatively use ``in_channels``.
+    in_channels : int
+        The number of input channels. Alternatively use ``input_shape``.
+    stride : int
+        Stride factor of the convolutional filters. When the stride factor > 1,
+        a decimation in time is performed.
+    dilation : int
+        Dilation factor of the convolutional filters.
+    padding : str
+        (same, valid, causal). If "valid", no padding is performed.
+        If "same" and stride is 1, output shape is the same as the input shape.
+        "causal" results in causal (dilated) convolutions.
+    groups : int
+        Passed through to ``nn.Conv1d``.
+    bias : bool
+        Passed through to ``nn.Conv1d``.
+    padding_mode : str
+        This flag specifies the type of padding. See torch.nn documentation
+        for more information.
+    skip_transpose : bool
+        If False, uses batch x time x channel convention of SpeakerRec.
+        If True, uses batch x channel x time convention.
+    Example
+    -------
+    >>> inp_tensor = torch.rand([10, 40, 16])
+    >>> cnn_1d = _Conv1d(
+    ...     input_shape=inp_tensor.shape, out_channels=8, kernel_size=5
+    ... )
+    >>> out_tensor = cnn_1d(inp_tensor)
+    >>> out_tensor.shape
+    torch.Size([10, 40, 8])
+    """
+    def __init__(
+        self,
+        out_channels,
+        kernel_size,
+        input_shape=None,
+        in_channels=None,
+        stride=1,
+        dilation=1,
+        padding="same",
+        groups=1,
+        bias=True,
+        padding_mode="reflect",
+        skip_transpose=False,
+    ):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+        self.padding = padding
+        self.padding_mode = padding_mode
+        # set to True by _check_input_shape when the declared input is 2d
+        self.unsqueeze = False
+        self.skip_transpose = skip_transpose
+        if input_shape is None and in_channels is None:
+            raise ValueError("Must provide one of input_shape or in_channels")
+        # NOTE: the odd-kernel-size check lives in _check_input_shape, so it
+        # only runs when in_channels is inferred from input_shape.
+        if in_channels is None:
+            in_channels = self._check_input_shape(input_shape)
+        # padding=0 here: padding is applied manually in forward()
+        self.conv = nn.Conv1d(
+            in_channels,
+            out_channels,
+            self.kernel_size,
+            stride=self.stride,
+            dilation=self.dilation,
+            padding=0,
+            groups=groups,
+            bias=bias,
+        )
+    def forward(self, x):
+        """Returns the output of the convolution.
+        Arguments
+        ---------
+        x : torch.Tensor (batch, time, channel)
+            input to convolve. 2d or 3d tensors are expected. With
+            ``skip_transpose=True`` the layout is (batch, channel, time).
+        """
+        # bring the input to (batch, channel, time) for nn.Conv1d
+        if not self.skip_transpose:
+            x = x.transpose(1, -1)
+        if self.unsqueeze:
+            x = x.unsqueeze(1)
+        if self.padding == "same":
+            x = self._manage_padding(
+                x, self.kernel_size, self.dilation, self.stride
+            )
+        elif self.padding == "causal":
+            # pad on the left only so that no future frame is seen
+            num_pad = (self.kernel_size - 1) * self.dilation
+            x = F.pad(x, (num_pad, 0))
+        elif self.padding == "valid":
+            pass
+        else:
+            raise ValueError(
+                "Padding must be 'same', 'valid' or 'causal'. Got "
+                + self.padding
+            )
+        wx = self.conv(x)
+        # undo the layout changes made above
+        if self.unsqueeze:
+            wx = wx.squeeze(1)
+        if not self.skip_transpose:
+            wx = wx.transpose(1, -1)
+        return wx
+    def _manage_padding(
+        self, x, kernel_size: int, dilation: int, stride: int,
+    ):
+        """This function pads the time axis (using ``self.padding_mode``)
+        with the amounts computed by ``get_padding_elem``.
+        Arguments
+        ---------
+        x : torch.Tensor
+            Input tensor.
+        kernel_size : int
+            Size of kernel.
+        dilation : int
+            Dilation used.
+        stride : int
+            Stride.
+        """
+        # Detecting input shape
+        L_in = x.shape[-1]
+        # Time padding
+        padding = get_padding_elem(L_in, stride, kernel_size, dilation)
+        # Applying padding
+        x = F.pad(x, padding, mode=self.padding_mode)
+        return x
+    def _check_input_shape(self, shape):
+        """Checks the input shape and returns the number of input channels.
+        Also sets ``self.unsqueeze`` for 2d shapes and raises ValueError for
+        unsupported ranks or an even kernel size.
+        """
+        if len(shape) == 2:
+            self.unsqueeze = True
+            in_channels = 1
+        elif self.skip_transpose:
+            in_channels = shape[1]
+        elif len(shape) == 3:
+            in_channels = shape[2]
+        else:
+            raise ValueError(
+                "conv1d expects 2d, 3d inputs. Got " + str(len(shape))
+            )
+        # Kernel size must be odd
+        if self.kernel_size % 2 == 0:
+            raise ValueError(
+                "The field kernel size must be an odd number. Got %s."
+                % (self.kernel_size)
+            )
+        return in_channels
+class _BatchNorm1d(nn.Module):
+    """Applies 1d batch normalization to the input tensor.
+    Arguments
+    ---------
+    input_shape : tuple
+        The expected shape of the input. Alternatively, use ``input_size``.
+    input_size : int
+        The expected size of the input. Alternatively, use ``input_shape``.
+    eps : float
+        This value is added to std deviation estimation to improve the numerical
+        stability.
+    momentum : float
+        It is a value used for the running_mean and running_var computation.
+    affine : bool
+        When set to True, the affine parameters are learned.
+    track_running_stats : bool
+        When set to True, this module tracks the running mean and variance,
+        and when set to False, this module does not track such statistics.
+    combine_batch_time : bool
+        When true, it combines batch and time axis.
+    skip_transpose : bool
+        When True, the input is normalized as given (the feature size is
+        read from ``input_shape[1]``); when False, dimensions 1 and -1 are
+        swapped around the normalization.
+    Example
+    -------
+    >>> input = torch.randn(100, 10)
+    >>> norm = BatchNorm1d(input_shape=input.shape)
+    >>> output = norm(input)
+    >>> output.shape
+    torch.Size([100, 10])
+    """
+    def __init__(
+        self,
+        input_shape=None,
+        input_size=None,
+        eps=1e-05,
+        momentum=0.1,
+        affine=True,
+        track_running_stats=True,
+        combine_batch_time=False,
+        skip_transpose=False,
+    ):
+        super().__init__()
+        self.combine_batch_time = combine_batch_time
+        self.skip_transpose = skip_transpose
+        # infer the feature size from the shape: dim 1 in channel-first
+        # layout, last dim otherwise
+        if input_size is None and skip_transpose:
+            input_size = input_shape[1]
+        elif input_size is None:
+            input_size = input_shape[-1]
+        self.norm = nn.BatchNorm1d(
+            input_size,
+            eps=eps,
+            momentum=momentum,
+            affine=affine,
+            track_running_stats=track_running_stats,
+        )
+    def forward(self, x):
+        """Returns the normalized input tensor.
+        Arguments
+        ---------
+        x : torch.Tensor (batch, time, [channels])
+            input to normalize. 2d or 3d tensors are expected in input
+            4d tensors can be used when combine_batch_time=True.
+        """
+        shape_or = x.shape
+        if self.combine_batch_time:
+            # merge the first two axes so they are normalized together
+            if x.ndim == 3:
+                x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
+            else:
+                x = x.reshape(
+                    shape_or[0] * shape_or[1], shape_or[3], shape_or[2]
+                )
+        elif not self.skip_transpose:
+            x = x.transpose(-1, 1)
+        x_n = self.norm(x)
+        # restore the original layout
+        if self.combine_batch_time:
+            x_n = x_n.reshape(shape_or)
+        elif not self.skip_transpose:
+            x_n = x_n.transpose(1, -1)
+        return x_n
+class Linear(torch.nn.Module):
+    """Affine layer y = wx + b applied to the last input dimension.
+    Arguments
+    ---------
+    n_neurons : int
+        Number of output neurons (size of the last output dimension).
+    input_shape: tuple
+        Shape of the input tensor; used to infer the input size.
+    input_size: int
+        Size of the last input dimension. Takes precedence over
+        ``input_shape``.
+    bias : bool
+        If True, the additive bias b is adopted.
+    combine_dims : bool
+        If True and the input is 4D, combine 3rd and 4th dimensions of input.
+    Example
+    -------
+    >>> inputs = torch.rand(10, 50, 40)
+    >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100)
+    >>> output = lin_t(inputs)
+    >>> output.shape
+    torch.Size([10, 50, 100])
+    """
+    def __init__(
+        self,
+        n_neurons,
+        input_shape=None,
+        input_size=None,
+        bias=True,
+        combine_dims=False,
+    ):
+        super().__init__()
+        self.combine_dims = combine_dims
+        if input_size is None:
+            if input_shape is None:
+                raise ValueError("Expected one of input_shape or input_size")
+            merge_last_two = len(input_shape) == 4 and self.combine_dims
+            if merge_last_two:
+                input_size = input_shape[2] * input_shape[3]
+            else:
+                input_size = input_shape[-1]
+        # Weights are initialized following pytorch approach
+        self.w = nn.Linear(input_size, n_neurons, bias=bias)
+    def forward(self, x):
+        """Apply the linear transformation to ``x``.
+        Arguments
+        ---------
+        x : torch.Tensor
+            Input to transform linearly.
+        """
+        if self.combine_dims and x.ndim == 4:
+            dim0, dim1, dim2, dim3 = x.shape
+            x = x.reshape(dim0, dim1, dim2 * dim3)
+        return self.w(x)
+class Conv1d(_Conv1d):
+    def __init__(self, *args, **kwargs):
+        super().__init__(skip_transpose=True, *args, **kwargs)
+class BatchNorm1d(_BatchNorm1d):
+    def __init__(self, *args, **kwargs):
+        super().__init__(skip_transpose=True, *args, **kwargs)
+class TDNNBlock(nn.Module):
+    """An implementation of TDNN.
+    Arguments
+    ----------
+    in_channels : int
+        Number of input channels.
+    out_channels : int
+        The number of output channels.
+    kernel_size : int
+        The kernel size of the TDNN blocks.
+    dilation : int
+        The dilation of the Res2Net block.
+    activation : torch class
+        A class for constructing the activation layers.
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> layer = TDNNBlock(64, 64, kernel_size=3, dilation=1)
+    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        dilation,
+        activation=nn.ReLU,
+    ):
+        super(TDNNBlock, self).__init__()
+        self.conv = Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            dilation=dilation,
+        )
+        self.activation = activation()
+        self.norm = BatchNorm1d(input_size=out_channels)
+    def forward(self, x):
+        return self.norm(self.activation(self.conv(x)))
+class Res2NetBlock(torch.nn.Module):
+    """An implementation of Res2NetBlock w/ dilation.
+    Arguments
+    ---------
+    in_channels : int
+        The number of channels expected in the input.
+    out_channels : int
+        The number of output channels.
+    scale : int
+        The scale of the Res2Net block.
+    kernel_size: int
+        The kernel size of the Res2Net block.
+    dilation : int
+        The dilation of the Res2Net block.
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
+    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+    def __init__(
+        self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1
+    ):
+        super(Res2NetBlock, self).__init__()
+        assert in_channels % scale == 0
+        assert out_channels % scale == 0
+        in_channel = in_channels // scale
+        hidden_channel = out_channels // scale
+        self.blocks = nn.ModuleList(
+            [
+                TDNNBlock(
+                    in_channel,
+                    hidden_channel,
+                    kernel_size=kernel_size,
+                    dilation=dilation,
+                )
+                for i in range(scale - 1)
+            ]
+        )
+        self.scale = scale
+    def forward(self, x):
+        y = []
+        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
+            if i == 0:
+                y_i = x_i
+            elif i == 1:
+                y_i = self.blocks[i - 1](x_i)
+            else:
+                y_i = self.blocks[i - 1](x_i + y_i)
+            y.append(y_i)
+        y = torch.cat(y, dim=1)
+        return y
+class SEBlock(nn.Module):
+    """An implementation of squeeze-and-excitation block.
+    Arguments
+    ---------
+    in_channels : int
+        The number of input channels.
+    se_channels : int
+        The number of output channels after squeeze.
+    out_channels : int
+        The number of output channels.
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> se_layer = SEBlock(64, 16, 64)
+    >>> lengths = torch.rand((8,))
+    >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 120, 64])
+    """
+    def __init__(self, in_channels, se_channels, out_channels):
+        super(SEBlock, self).__init__()
+        self.conv1 = Conv1d(
+            in_channels=in_channels, out_channels=se_channels, kernel_size=1
+        )
+        self.relu = torch.nn.ReLU(inplace=True)
+        self.conv2 = Conv1d(
+            in_channels=se_channels, out_channels=out_channels, kernel_size=1
+        )
+        self.sigmoid = torch.nn.Sigmoid()
+    def forward(self, x, lengths=None):
+        L = x.shape[-1]
+        if lengths is not None:
+            mask = length_to_mask(lengths * L, max_len=L, device=x.device)
+            mask = mask.unsqueeze(1)
+            total = mask.sum(dim=2, keepdim=True)
+            s = (x * mask).sum(dim=2, keepdim=True) / total
+        else:
+            s = x.mean(dim=2, keepdim=True)
+        s = self.relu(self.conv1(s))
+        s = self.sigmoid(self.conv2(s))
+        return s * x
+class AttentiveStatisticsPooling(nn.Module):
+    """This class implements an attentive statistic pooling layer for each channel.
+    It returns the concatenated mean and std of the input tensor.
+    Arguments
+    ---------
+    channels: int
+        The number of input channels.
+    attention_channels: int
+        The number of attention channels.
+    Example
+    -------
+    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
+    >>> asp_layer = AttentiveStatisticsPooling(64)
+    >>> lengths = torch.rand((8,))
+    >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
+    >>> out_tensor.shape
+    torch.Size([8, 1, 128])
+    """
+    def __init__(self, channels, attention_channels=128, global_context=True):
+        super().__init__()
+        self.eps = 1e-12
+        self.global_context = global_context
+        if global_context:
+            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
+        else:
+            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
+        self.tanh = nn.Tanh()
+        self.conv = Conv1d(
+            in_channels=attention_channels, out_channels=channels, kernel_size=1
+        )
+    def forward(self, x, lengths=None):
+        """Calculates mean and std for a batch (input tensor).
+        Arguments
+        ---------
+        x : torch.Tensor
+            Tensor of shape [N, C, L].
+        """
+        L = x.shape[-1]
+        def _compute_statistics(x, m, dim=2, eps=self.eps):
+            mean = (m * x).sum(dim)
+            std = torch.sqrt(
+                (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)
+            )
+            return mean, std
+        if lengths is None:
+            lengths = torch.ones(x.shape[0], device=x.device)
+        # Make binary mask of shape [N, 1, L]
+        mask = length_to_mask(lengths * L, max_len=L, device=x.device)  # mask生成的是一种全1的（N，L）
+        mask = mask.unsqueeze(1)
+        # Expand the temporal context of the pooling layer by allowing the
+        # self-attention to look at global properties of the utterance.
+        if self.global_context:
+            # torch.std is unstable for backward computation
+            # https://github.com/pytorch/pytorch/issues/4320
+            total = mask.sum(dim=2, keepdim=True).float()
+            mean, std = _compute_statistics(x, mask / total)
+            mean = mean.unsqueeze(2).repeat(1, 1, L)
+            std = std.unsqueeze(2).repeat(1, 1, L)
+            attn = torch.cat([x, mean, std], dim=1)
+        else:
+            attn = x
+        # Apply layers
+        attn = self.conv(self.tanh(self.tdnn(attn)))
+        # Filter out zero-paddings
+        attn = attn.masked_fill(mask == 0, float("-inf"))
+        attn = F.softmax(attn, dim=2)
+        mean, std = _compute_statistics(x, attn)
+        # Append mean and std of the batch
+        pooled_stats = torch.cat((mean, std), dim=1)
+        pooled_stats = pooled_stats.unsqueeze(2)
+        return pooled_stats
+class SERes2NetBlock(nn.Module):
+    """An implementation of building block in ECAPA-TDNN, i.e.,
+    TDNN-Res2Net-TDNN-SEBlock.
+    Arguments
+    ----------
+    out_channels: int
+        The number of output channels.
+    res2net_scale: int
+        The scale of the Res2Net block.
+    kernel_size: int
+        The kernel size of the TDNN blocks.
+    dilation: int
+        The dilation of the Res2Net block.
+    activation : torch class
+        A class for constructing the activation layers.
+    Example
+    -------
+    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
+    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
+    >>> out = conv(x).transpose(1, 2)
+    >>> out.shape
+    torch.Size([8, 120, 64])
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        res2net_scale=8,
+        se_channels=128,
+        kernel_size=1,
+        dilation=1,
+        activation=torch.nn.ReLU,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.tdnn1 = TDNNBlock(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            activation=activation,
+        )
+        self.res2net_block = Res2NetBlock(
+            out_channels, out_channels, res2net_scale, kernel_size, dilation
+        )
+        self.tdnn2 = TDNNBlock(
+            out_channels,
+            out_channels,
+            kernel_size=1,
+            dilation=1,
+            activation=activation,
+        )
+        self.se_block = SEBlock(out_channels, se_channels, out_channels)
+        self.shortcut = None
+        if in_channels != out_channels:
+            self.shortcut = Conv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+            )
+    def forward(self, x, lengths=None):
+        residual = x
+        if self.shortcut:
+            residual = self.shortcut(x)
+        x = self.tdnn1(x)
+        x = self.res2net_block(x)
+        x = self.tdnn2(x)
+        x = self.se_block(x, lengths)
+        return x + residual
+class ECAPATDNN(torch.nn.Module):
+    """An implementation of the speaker embedding model in a paper.
+    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
+    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
+    Arguments
+    ---------
+    device : str
+        Device used, e.g., "cpu" or "cuda".
+    activation : torch class
+        A class for constructing the activation layers.
+    channels : list of ints
+        Output channels for TDNN/SERes2Net layer.
+    kernel_sizes : list of ints
+        List of kernel sizes for each layer.
+    dilations : list of ints
+        List of dilations for kernels in each layer.
+    lin_neurons : int
+        Number of neurons in linear layers.
+    Example
+    -------
+    >>> input_feats = torch.rand([5, 120, 80])
+    >>> compute_embedding = ECAPATDNN(80, lin_neurons=192)
+    >>> outputs = compute_embedding(input_feats)
+    >>> outputs.shape
+    torch.Size([5, 1, 192])
+    """
+    def __init__(
+        self,
+        input_size,
+        device="cpu",
+        lin_neurons=192,
+        activation=torch.nn.ReLU,
+        channels=[512, 512, 512, 512, 1536],
+        kernel_sizes=[5, 3, 3, 3, 1],
+        dilations=[1, 2, 3, 4, 1],
+        attention_channels=128,
+        res2net_scale=8,
+        se_channels=128,
+        global_context=True,
+    ):
+        super().__init__()
+        assert len(channels) == len(kernel_sizes)
+        assert len(channels) == len(dilations)
+        self.channels = channels
+        self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
+                                                            hop_length=160, f_min=0.0, f_max=8000, pad=0, n_mels=80)
+        self.instancenorm = nn.InstanceNorm1d(40)
+        self.blocks = nn.ModuleList()
+        # The initial TDNN layer
+        self.blocks.append(
+            TDNNBlock(
+                input_size,
+                channels[0],
+                kernel_sizes[0],
+                dilations[0],
+                activation,
+            )
+        )
+        # SE-Res2Net layers
+        for i in range(1, len(channels) - 1):
+            self.blocks.append(
+                SERes2NetBlock(
+                    channels[i - 1],
+                    channels[i],
+                    res2net_scale=res2net_scale,
+                    se_channels=se_channels,
+                    kernel_size=kernel_sizes[i],
+                    dilation=dilations[i],
+                    activation=activation,
+                )
+            )
+        # Multi-layer feature aggregation
+        self.mfa = TDNNBlock(
+            channels[-1],
+            channels[-1],
+            kernel_sizes[-1],
+            dilations[-1],
+            activation,
+        )
+        # Attentive Statistical Pooling
+        self.asp = AttentiveStatisticsPooling(
+            channels[-1],
+            attention_channels=attention_channels,
+            global_context=global_context,
+        )
+        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)
+        # Final linear transformation
+        self.fc = Conv1d(
+            in_channels=channels[-1] * 2,
+            out_channels=lin_neurons,
+            kernel_size=1,
+        )
+    def forward(self, x, lengths=None):
+        """Returns the embedding vector.
+        Arguments
+        ---------
+        x : torch.Tensor
+            Tensor of shape (batch, channel, time).
+        """
+        # Minimize transpose for efficiency
+        x = self.torchfb(x) + 1e-6
+        x = x.log()
+        x = self.instancenorm(x)
+        xl = []
+        for layer in self.blocks:
+            try:
+                x = layer(x, lengths=lengths)
+            except TypeError:
+                x = layer(x)
+            xl.append(x)
+        # Multi-layer feature aggregation
+        x = torch.cat(xl[1:], dim=1)
+        x = self.mfa(x)
+        # Attentive Statistical Pooling
+        x = self.asp(x, lengths=lengths)
+        x = self.asp_bn(x)
+        # Final linear transformation
+        x = self.fc(x)
+        x = x.transpose(1, 2).squeeze(1)
+        return x
+class Classifier(torch.nn.Module):
+    """This class implements the cosine similarity on the top of features.
+    Arguments
+    ---------
+    device : str
+        Device used, e.g., "cpu" or "cuda".
+    lin_blocks : int
+        Number of linear layers.
+    lin_neurons : int
+        Number of neurons in linear layers.
+    out_neurons : int
+        Number of classes.
+    Example
+    -------
+    >>> classify = Classifier(input_size=2, lin_neurons=2, out_neurons=2)
+    >>> outputs = torch.tensor([ [1., -1.], [-9., 1.], [0.9, 0.1], [0.1, 0.9] ])
+    >>> outupts = outputs.unsqueeze(1)
+    >>> cos = classify(outputs)
+    >>> (cos < -1.0).long().sum()
+    tensor(0)
+    >>> (cos > 1.0).long().sum()
+    tensor(0)
+    """
+    def __init__(
+        self,
+        input_size,
+        device="cpu",
+        lin_blocks=0,
+        lin_neurons=192,
+        out_neurons=1211,
+    ):
+        super().__init__()
+        self.blocks = nn.ModuleList()
+        for block_index in range(lin_blocks):
+            self.blocks.extend(
+                [
+                    _BatchNorm1d(input_size),
+                    Linear(input_size=input_size, n_neurons=lin_neurons),
+                ]
+            )
+            input_size = lin_neurons
+        # Final Layer
+        self.weight = nn.Parameter(
+            torch.FloatTensor(out_neurons, input_size, device=device)
+        )
+        nn.init.xavier_uniform_(self.weight)
+    def forward(self, x):
+        """Returns the output probabilities over speakers.
+        Arguments
+        ---------
+        x : torch.Tensor
+            Torch tensor.
+        """
+        for layer in self.blocks:
+            x = layer(x)
+        # Need to be normalized
+        x = F.linear(F.normalize(x.squeeze(1)), F.normalize(self.weight))
+        return x.unsqueeze(1)
+if __name__ == '__main__':
+    # Smoke test: push a batch of 32 all-zero waveforms (32240 samples each)
+    # through the model and print the shape of the resulting embeddings.
+    x = torch.zeros(32, 32240)
+    model = ECAPATDNN(80, lin_neurons=192)
+    out = model(x)
+    print(out.shape)  # expected: torch.Size([32, 192])

net/ECAPA_TDNN.py ADDED Viewed

	@@ -0,0 +1,246 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+from torchinfo import summary
+''' Res2Conv1d + ReLU + BatchNorm1d
+'''
+class Res2Conv1dReluBn(nn.Module):
+    '''
+    in_channels == out_channels == channels
+    '''
+    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False, scale=4):
+        super().__init__()
+        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
+        self.scale = scale
+        self.width = channels // scale
+        self.nums = scale if scale == 1 else scale - 1
+        self.convs = []
+        self.bns = []
+        for i in range(self.nums):
+            self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
+            self.bns.append(nn.BatchNorm1d(self.width))
+        self.convs = nn.ModuleList(self.convs)
+        self.bns = nn.ModuleList(self.bns)
+    def forward(self, x):
+        out = []
+        spx = torch.split(x, self.width, 1)
+        for i in range(self.nums):
+            if i == 0:
+                sp = spx[i]
+            else:
+                sp = sp + spx[i]
+            # Order: conv -> relu -> bn
+            sp = self.convs[i](sp)
+            sp = self.bns[i](F.relu(sp))
+            out.append(sp)
+        if self.scale != 1:
+            out.append(spx[self.nums])
+        out = torch.cat(out, dim=1)
+        return out
+''' Conv1d + ReLU + BatchNorm1d
+'''
+class Conv1dReluBn(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
+        super().__init__()
+        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
+        self.bn = nn.BatchNorm1d(out_channels)
+    def forward(self, x):
+        return self.bn(F.relu(self.conv(x)))
+''' The SE connection of 1D case.
+'''
+class SE_Connect(nn.Module):
+    def __init__(self, channels, s=2):
+        super().__init__()
+        assert channels % s == 0, "{} % {} != 0".format(channels, s)
+        self.linear1 = nn.Linear(channels, channels // s)
+        self.linear2 = nn.Linear(channels // s, channels)
+    def forward(self, x):
+        out = x.mean(dim=2)
+        out = F.relu(self.linear1(out))
+        out = torch.sigmoid(self.linear2(out))
+        out = x * out.unsqueeze(2)
+        return out
+''' SE-Res2Block.
+    Note: the block adds its own input back to its output (residual connection) in forward().
+'''
+class SE_Res2Block(nn.Module):
+    def __init__(self, channels, kernel_size, stride, padding, dilation, scale):
+         super().__init__()
+         self.block = nn.Sequential(
+            Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
+            Res2Conv1dReluBn(channels, kernel_size, stride, padding, dilation, scale=scale),
+            Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
+            SE_Connect(channels)
+         )
+    def forward(self, x):
+        out = self.block(x)
+        return out + x
+''' Attentive weighted mean and standard deviation pooling.
+'''
+class AttentiveStatsPool(nn.Module):
+    def __init__(self, in_dim, bottleneck_dim):
+        super().__init__()
+        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
+        self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper
+        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper
+    def forward(self, x):
+        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
+        alpha = torch.tanh(self.linear1(x))
+        alpha = torch.softmax(self.linear2(alpha), dim=2)
+        mean = torch.sum(alpha * x, dim=2)
+        residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2
+        std = torch.sqrt(residuals.clamp(min=1e-9))
+        return torch.cat([mean, std], dim=1)
+''' Implementation of
+    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification".
+    Note that we DON'T concatenate the last frame-wise layer with non-weighted mean and standard deviation,
+    because it brings little improvement but significantly increases model parameters.
+    As a result, this implementation basically equals the A.2 of Table 2 in the paper.
+'''
+class ECAPA_TDNN(nn.Module):
+    def __init__(self, in_channels=80, channels=512, embd_dim=192):
+        super().__init__()
+        self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
+                                                            hop_length=160, f_min=0.0, f_max=8000, pad=0, n_mels=80)
+        self.instancenorm = nn.InstanceNorm1d(80)
+        self.layer1 = Conv1dReluBn(in_channels, channels, kernel_size=5, padding=2)
+        self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
+        self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
+        self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)
+        cat_channels = channels * 3
+        self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)
+        self.pooling = AttentiveStatsPool(cat_channels, 128)
+        self.bn1 = nn.BatchNorm1d(cat_channels * 2)
+        self.linear = nn.Linear(cat_channels * 2, embd_dim)
+        self.bn2 = nn.BatchNorm1d(embd_dim)
+    def forward(self, x):
+        x = self.torchfb(x) + 1e-6
+        x = x.log()
+        x = self.instancenorm(x)
+        # print(x.shape)
+        # x = x.transpose(1, 2)
+        out1 = self.layer1(x)
+        out2 = self.layer2(out1) + out1
+        out3 = self.layer3(out1 + out2) + out1 + out2
+        out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3
+        out = torch.cat([out2, out3, out4], dim=1)
+        out = F.relu(self.conv(out))
+        # print(out.shape)
+        out = self.bn1(self.pooling(out))
+        # print(out.shape)
+        out = self.bn2(self.linear(out))
+        return out
+class ECAPA_TDNN_ks5(nn.Module):
+    def __init__(self, in_channels=80, channels=512, embd_dim=192):
+        super().__init__()
+        self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
+                                                            hop_length=160, f_min=0.0, f_max=8000, pad=0, n_mels=80)
+        self.instancenorm = nn.InstanceNorm1d(40)
+        self.layer1 = Conv1dReluBn(in_channels, channels, kernel_size=7, padding=3)
+        self.layer2 = SE_Res2Block(channels, kernel_size=5, stride=1, padding=4, dilation=2, scale=8)
+        self.layer3 = SE_Res2Block(channels, kernel_size=5, stride=1, padding=6, dilation=3, scale=8)
+        self.layer4 = SE_Res2Block(channels, kernel_size=5, stride=1, padding=8, dilation=4, scale=8)
+        cat_channels = channels * 3
+        self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)
+        self.pooling = AttentiveStatsPool(cat_channels, 128)
+        self.bn1 = nn.BatchNorm1d(cat_channels * 2)
+        self.linear = nn.Linear(cat_channels * 2, embd_dim)
+        self.bn2 = nn.BatchNorm1d(embd_dim)
+    def forward(self, x):
+        x = self.torchfb(x) + 1e-6
+        x = x.log()
+        x = self.instancenorm(x)
+        # print(x.shape)
+        # x = x.transpose(1, 2)
+        out1 = self.layer1(x)
+        out2 = self.layer2(out1) + out1
+        out3 = self.layer3(out1 + out2) + out1 + out2
+        out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3
+        out = torch.cat([out2, out3, out4], dim=1)
+        out = F.relu(self.conv(out))
+        out = self.bn1(self.pooling(out))
+        out = self.bn2(self.linear(out))
+        return out
+class ECAPA_TDNN_L2(nn.Module):
+    def __init__(self, in_channels=80, channels=512, embd_dim=192):
+        super().__init__()
+        self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
+                                                            hop_length=160, f_min=0.0, f_max=8000, pad=0, n_mels=80)
+        self.instancenorm = nn.InstanceNorm1d(40)
+        self.layer1 = Conv1dReluBn(in_channels, channels, kernel_size=5, padding=2)
+        self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
+        self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
+        self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)
+        cat_channels = channels * 3
+        self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)
+        self.pooling = AttentiveStatsPool(cat_channels, 128)
+        self.bn1 = nn.BatchNorm1d(cat_channels * 2)
+        self.linear = nn.Linear(cat_channels * 2, embd_dim)
+        self.bn2 = nn.BatchNorm1d(embd_dim)
+    def forward(self, x):
+        x = self.torchfb(x) + 1e-6
+        x = x.log()
+        x = self.instancenorm(x)
+        # print(x.shape)
+        # x = x.transpose(1, 2)
+        out1 = self.layer1(x)
+        out2 = self.layer2(out1) + out1
+        out3 = self.layer3(out1 + out2) + out1 + out2
+        out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3
+        out = torch.cat([out2, out3, out4], dim=1)
+        out = F.relu(self.conv(out))
+        out = self.bn1(self.pooling(out))
+        out = self.bn2(self.linear(out))
+        out_l2 = out / torch.norm(out, dim=1, keepdim=True)
+        return out_l2*512
+if __name__ == '__main__':
+    # Input size: batch_size * seq_len * feat_dim  32240 => 202, 35760=>224
+    x = torch.zeros(32, 35760).cuda()
+    model = ECAPA_TDNN(in_channels=80, channels=512, embd_dim=192)
+    # print(model)
+    summary(model, input_size=(tuple(x.shape)))
+    out = model(x)
+    print(out.shape)    # should be [2, 192]

net/ECAPA_TDNN_br.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+from torchinfo import summary
+''' Res2Conv1d + BatchNorm1d + ReLU
+'''
+class Res2Conv1dReluBn(nn.Module):
+    '''
+    in_channels == out_channels == channels
+    '''
+    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False, scale=4):
+        super().__init__()
+        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
+        self.scale = scale
+        self.width = channels // scale
+        self.nums = scale if scale == 1 else scale - 1
+        self.convs = []
+        self.bns = []
+        for i in range(self.nums):
+            self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
+            self.bns.append(nn.BatchNorm1d(self.width))
+        self.convs = nn.ModuleList(self.convs)
+        self.bns = nn.ModuleList(self.bns)
+    def forward(self, x):
+        out = []
+        spx = torch.split(x, self.width, 1)
+        for i in range(self.nums):
+            if i == 0:
+                sp = spx[i]
+            else:
+                sp = sp + spx[i]
+            # Order: conv -> relu -> bn
+            sp = self.convs[i](sp)
+            sp = F.relu(self.bns[i](sp))
+            out.append(sp)
+        if self.scale != 1:
+            out.append(spx[self.nums])
+        out = torch.cat(out, dim=1)
+        return out
+''' Conv1d + BatchNorm1d + ReLU
+'''
+class Conv1dReluBn(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
+        super().__init__()
+        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
+        self.bn = nn.BatchNorm1d(out_channels)
+    def forward(self, x):
+        return F.relu(self.bn(self.conv(x)))
+''' The SE connection of 1D case.
+'''
+class SE_Connect(nn.Module):
+    def __init__(self, channels, s=2):
+        super().__init__()
+        assert channels % s == 0, "{} % {} != 0".format(channels, s)
+        self.linear1 = nn.Linear(channels, channels // s)
+        self.linear2 = nn.Linear(channels // s, channels)
+    def forward(self, x):
+        out = x.mean(dim=2)
+        out = F.relu(self.linear1(out))
+        out = torch.sigmoid(self.linear2(out))
+        out = x * out.unsqueeze(2)
+        return out
+''' SE-Res2Block.
+    Note: the block adds its own input back to its output (residual connection) in forward().
+'''
+class SE_Res2Block(nn.Module):
+    def __init__(self, channels, kernel_size, stride, padding, dilation, scale):
+         super().__init__()
+         self.block = nn.Sequential(
+            Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
+            Res2Conv1dReluBn(channels, kernel_size, stride, padding, dilation, scale=scale),
+            Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
+            SE_Connect(channels)
+         )
+    def forward(self, x):
+        out = self.block(x)
+        return out + x
+''' Attentive weighted mean and standard deviation pooling.
+'''
+class AttentiveStatsPool(nn.Module):
+    def __init__(self, in_dim, bottleneck_dim):
+        super().__init__()
+        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
+        self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper
+        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper
+    def forward(self, x):
+        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
+        alpha = torch.tanh(self.linear1(x))
+        alpha = torch.softmax(self.linear2(alpha), dim=2)
+        mean = torch.sum(alpha * x, dim=2)
+        residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2
+        std = torch.sqrt(residuals.clamp(min=1e-9))
+        return torch.cat([mean, std], dim=1)
+''' Implementation of
+    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification".
+    Note that we DON'T concatenate the last frame-wise layer with non-weighted mean and standard deviation,
+    because it brings little improvement but significantly increases model parameters.
+    As a result, this implementation basically equals the A.2 of Table 2 in the paper.
+'''
+class ECAPA_TDNN_br(nn.Module):
+    """ECAPA-TDNN variant that takes a raw waveform and returns an embedding.
+
+    ``forward`` computes a log mel spectrogram, instance-normalises it, runs it
+    through one Conv1dReluBn layer and three SE_Res2Block layers, concatenates
+    the three block outputs, applies attentive statistics pooling and projects
+    the result to ``embd_dim`` features.
+
+    Args:
+        in_channels: number of mel bins produced by the front end and consumed
+            by the first convolution (default 80).
+        channels: width of the frame-level layers.
+        embd_dim: size of the returned embedding.
+    """
+
+    def __init__(self, in_channels=80, channels=512, embd_dim=192):
+        super().__init__()
+        # The number of mel bins must equal the channel count expected by
+        # layer1, so it follows `in_channels` instead of being hard-coded.
+        self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
+                                                            hop_length=160, f_min=0.0, f_max=8000, pad=0,
+                                                            n_mels=in_channels)
+        # num_features matches the mel-bin dimension being normalised
+        # (previously 40, which disagreed with the 80 bins actually produced).
+        self.instancenorm = nn.InstanceNorm1d(in_channels)
+        self.layer1 = Conv1dReluBn(in_channels, channels, kernel_size=5, padding=2)
+        self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
+        self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
+        self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)
+        cat_channels = channels * 3
+        self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)
+        self.pooling = AttentiveStatsPool(cat_channels, 128)
+        # Pooling concatenates mean and std, hence twice the channel count.
+        self.bn1 = nn.BatchNorm1d(cat_channels * 2)
+        self.linear = nn.Linear(cat_channels * 2, embd_dim)
+        self.bn2 = nn.BatchNorm1d(embd_dim)
+
+    def forward(self, x):
+        """Map a waveform batch to embeddings.
+
+        ``x`` is expected to be (batch, num_samples), as in the module's
+        ``__main__`` smoke test; the result is (batch, embd_dim).
+        """
+        # Small offset keeps log() finite for all-zero (silent) frames.
+        x = self.torchfb(x) + 1e-6
+        x = x.log()
+        x = self.instancenorm(x)
+        out1 = self.layer1(x)
+        # NOTE: SE_Res2Block.forward already adds its own input, so the explicit
+        # "+ out1" terms below add the skip path on top of that. Kept unchanged
+        # to preserve the existing behaviour of this variant.
+        out2 = self.layer2(out1) + out1
+        out3 = self.layer3(out1 + out2) + out1 + out2
+        out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3
+        out = torch.cat([out2, out3, out4], dim=1)
+        out = F.relu(self.conv(out))
+        out = self.bn1(self.pooling(out))
+        out = self.bn2(self.linear(out))
+        return out
+if __name__ == '__main__':
+    # Smoke test: push a dummy batch through the model and print the output shape.
+    # Model and input are placed on the same device explicitly; fall back to
+    # CPU when CUDA is unavailable instead of crashing on .cuda().
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    # Input size: batch_size * num_samples (raw waveform; the model computes
+    # the mel features itself).
+    x = torch.zeros(32, 32240, device=device)
+    model = ECAPA_TDNN_br(in_channels=80, channels=512, embd_dim=192).to(device)
+    summary(model, input_size=tuple(x.shape))
+    out = model(x)
+    print(out.shape)    # should be [32, 192]

net/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from .VGGVox import Vgg
+from .vggvox1 import vgg
+from .u_net import UNetVgg, UNetVggMask
+from .ECAPA_TDNN import ECAPA_TDNN, ECAPA_TDNN_ks5, ECAPA_TDNN_L2
+from .ECAPATDNN import ECAPATDNN
+from .hrnet import hrnet
+from .VGG_TDNN import Vggtdnn
+from .ResNetSE34V2 import MainModel as ResNetSE34V2
+from .ECAPA_TDNN_br import ECAPA_TDNN_br
+from .hrtdnn import hrtdnn
+from .ResTDNN import MainModel as ResTDNN
+from .TDNN_VGG import TDNN_VGG
+from .ResNet_TDNN import MainModel as ResNet_TDNN
+from .TDNN_ResNet import TDNN_ResNet
+from .hr_tdnn import hr_tdnn
+from .swin_transformer import SwinTransformer

utils/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file