Spaces:
Sleeping
Sleeping
| import torch | |
| from torch import nn | |
| import torchaudio as audio | |
| from torch import Tensor | |
class StutterNet(nn.Module):
    '''TDNN classifier after Sheikh et al., "StutterNet: Stuttering
    Detection Using Time Delay Neural Network" (2021).

    Five 1-D convolutions (each followed by ReLU and batch norm) are
    applied along the time axis, the result is summarised by its mean and
    standard deviation over time, and two linear layers feed two 6-way
    output heads.

    No feature extraction happens inside the module: ``forward`` expects
    features shaped ``(batch, n_mels, time)``.

    Args:
        n_mels (int, optional): number of input feature channels.
        dropout (float, optional): drop probability applied after each of
            the two hidden linear layers.
        use_batchnorm (bool, optional): accepted for interface
            compatibility but currently ignored; batch norm is always
            applied after every TDNN layer.
        scale (float, optional): width multiplier for the TDNN layers.
    '''

    def __init__(self, n_mels=40,
                 dropout=0.0, use_batchnorm=False, scale=1):
        super(StutterNet, self).__init__()
        self.n_mels = n_mels

        narrow = int(512 * scale)
        wide = int(1536 * scale)
        pooled = int(1500 * scale)

        self.tdnn_1 = nn.Conv1d(n_mels, narrow, 5, dilation=1)
        self.tdnn_2 = nn.Conv1d(narrow, wide, 5, dilation=2)
        self.tdnn_3 = nn.Conv1d(wide, narrow, 7, dilation=3)
        self.tdnn_4 = nn.Conv1d(narrow, narrow, 1)
        self.tdnn_5 = nn.Conv1d(narrow, pooled, 1)
        # The pooled vector is [mean, std], i.e. exactly twice the channel
        # count of tdnn_5.  Deriving the width from `pooled` keeps the two
        # in sync for fractional scales, where int(3000 * scale) may differ
        # from 2 * int(1500 * scale).
        self.fc_1 = nn.Linear(2 * pooled, 512)
        self.relu = nn.ReLU()
        self.bn_1 = nn.BatchNorm1d(narrow)
        self.bn_2 = nn.BatchNorm1d(wide)
        self.bn_3 = nn.BatchNorm1d(narrow)
        self.bn_4 = nn.BatchNorm1d(narrow)
        self.bn_5 = nn.BatchNorm1d(pooled)
        nn.init.xavier_uniform_(self.fc_1.weight)
        self.dropout_1 = nn.Dropout(dropout)
        self.fc_2 = nn.Linear(512, 512)
        # Previously fc_1 was initialised a second time here and fc_2 was
        # left with the default initialisation.
        nn.init.xavier_uniform_(self.fc_2.weight)
        self.dropout_2 = nn.Dropout(dropout)
        self.binary_head = nn.Linear(512, 6)
        self.class_head = nn.Linear(512, 6)
        # Not used in forward(); the heads return raw scores.
        self.sig = nn.Sigmoid()

    def forward(self, x):
        '''Run the network on a batch of feature sequences.

        Args:
            x: tensor of shape ``(batch, n_mels, time)``.  The dilated
                convolutions shorten the time axis by 30 frames, and the
                (unbiased) standard deviation needs at least two remaining
                frames, so ``time`` should be at least 32.

        Returns:
            Tensor of shape ``(batch, 12)``: the 6 ``binary_head`` scores
            followed by the 6 ``class_head`` scores, without a sigmoid.
        '''
        tdnn_stack = (
            (self.tdnn_1, self.bn_1),
            (self.tdnn_2, self.bn_2),
            (self.tdnn_3, self.bn_3),
            (self.tdnn_4, self.bn_4),
            (self.tdnn_5, self.bn_5),
        )
        for conv, norm in tdnn_stack:
            # Order is conv -> ReLU -> batch norm.
            x = norm(self.relu(conv(x)))

        # Statistics pooling over the time axis.
        mean = torch.mean(x, -1)
        std = torch.std(x, -1)
        x = torch.cat((mean, std), 1)

        x = self.dropout_1(self.fc_1(x))
        x = self.dropout_2(self.fc_2(x))

        binary = self.binary_head(x)
        classes = self.class_head(x)
        return torch.cat((binary, classes), dim=-1)
class ResBlock1d(nn.Module):
    '''Residual block built from 1-D convolutions.

    The main path is ``conv_1 -> bn_1`` followed by ``depth - 1`` further
    convolutions (each followed by dropout and, if ``use_batchnorm`` is
    set, batch norm).  ReLU is applied after every follow-up convolution
    except the last one.  The block input, optionally passed through a
    stride-2 convolution, is added to the main-path output.

    Args:
        input_dims: number of input channels.
        output_dims: number of output channels.
        depth: total number of convolutions on the main path.
        kernel_size: kernel size of the follow-up convolutions
            (``conv_1`` and the shortcut always use 3).
        use_batchnorm: add batch norm after each follow-up convolution
            (``bn_1`` after ``conv_1`` is always present).
        downsample: halve the time axis with stride 2 on ``conv_1`` and a
            stride-2 convolution on the shortcut.
        dropout: drop probability after each follow-up convolution.
    '''

    def __init__(self, input_dims, output_dims, depth=2, kernel_size=3,
                 use_batchnorm=False, downsample=False, dropout=0.0):
        super().__init__()
        self.depth = depth
        self.use_batchnorm = use_batchnorm
        self.up = None

        in_channels = int(input_dims)
        n_followups = depth - 1

        if downsample:
            # Shortcut projection; only exists on downsampling blocks.
            self.down = nn.Conv1d(in_channels, int(output_dims), 3, 2,
                                  padding=1)
        self.downsample = downsample

        first_stride = 2 if downsample else 1
        self.conv_1 = nn.Conv1d(in_channels, output_dims, 3,
                                stride=first_stride, padding=1)
        self.convs = nn.ModuleList([
            nn.Conv1d(output_dims, output_dims, kernel_size, padding='same')
            for _ in range(n_followups)
        ])
        self.bn_1 = nn.BatchNorm1d(output_dims)
        self.bn = None
        if use_batchnorm:
            self.bn = nn.ModuleList([
                nn.BatchNorm1d(output_dims) for _ in range(n_followups)
            ])
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''Return ``shortcut(x) + main_path(x)``.'''
        shortcut = self.down(x) if self.downsample else x

        out = self.bn_1(self.conv_1(x))
        final_idx = self.depth - 2
        for idx in range(self.depth - 1):
            out = self.dropout(self.convs[idx](out))
            if self.use_batchnorm:
                out = self.bn[idx](out)
            # No activation after the last follow-up convolution.
            if idx != final_idx:
                out = self.relu(out)

        return shortcut + out
class ResNet1D(nn.Module):
    '''1-D residual network over feature sequences.

    A stem convolution is followed by four stages of three ``ResBlock1d``
    blocks each (base widths 64, 128, 256 and 512, multiplied by
    ``scale``).  The first block of every stage downsamples the time
    axis.  A ReLU follows every block.  The final feature map is
    summarised by its mean and standard deviation over time and passed to
    a single linear classifier.

    Args:
        n_mels (int, optional): number of input feature channels.
        n_classes (int, optional): size of the output layer.
        kernel_size (int, optional): kernel size forwarded to every
            ``ResBlock1d``.
        dropout (float, optional): drop probability forwarded to every
            ``ResBlock1d``.
        use_batchnorm (bool, optional): forwarded to every ``ResBlock1d``.
        scale (float, optional): width multiplier.
    '''

    def __init__(self, n_mels=100, n_classes=12, kernel_size=3,
                 dropout=0.0, use_batchnorm=False, scale=1):
        super(ResNet1D, self).__init__()
        self.n_mels = n_mels

        widths = [int(base * scale) for base in (64, 128, 256, 512)]
        self.tdnn_1 = nn.Conv1d(n_mels, widths[0], 3, padding=1, bias=False)

        # Blocks are registered as res_<stage>_<index> (res_1_1 ... res_4_3),
        # the same attribute names as before, so state-dict keys are
        # unchanged.
        block_names = []
        in_channels = widths[0]
        for stage, out_channels in enumerate(widths, start=1):
            for index in (1, 2, 3):
                first = index == 1
                block = ResBlock1d(
                    in_channels if first else out_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    downsample=first,
                    use_batchnorm=use_batchnorm,
                    dropout=dropout,
                )
                name = f'res_{stage}_{index}'
                setattr(self, name, block)
                block_names.append(name)
            in_channels = out_channels
        self._block_names = tuple(block_names)

        self.relu = nn.ReLU()
        # The pooled vector is [mean, std] of the last stage, i.e. exactly
        # 2 * widths[-1] features.  int(1024 * scale) can differ from that
        # for fractional scales (e.g. scale=0.3 gives 307 vs 306).
        self.fc = nn.Linear(2 * widths[-1], n_classes)

    def forward(self, x):
        '''Classify a batch of feature sequences.

        Args:
            x: tensor of shape ``(batch, n_mels, time)``.  Four stride-2
                stages shrink the time axis, and the (unbiased) standard
                deviation needs at least two remaining frames.

        Returns:
            Tensor of shape ``(batch, n_classes)`` with raw scores.
        '''
        x = self.tdnn_1(x)
        for name in self._block_names:
            x = self.relu(getattr(self, name)(x))

        # Statistics pooling over the time axis.
        mean = torch.mean(x, -1)
        std = torch.std(x, -1)
        x = torch.cat((mean, std), 1)
        return self.fc(x)
| from torch import Tensor | |
| '''credit: https://github.com/roman-vygon/BCResNet''' | |
class SubSpectralNorm(nn.Module):
    '''Batch normalisation applied separately to frequency sub-bands.

    The frequency axis is split into ``S`` equal groups; each
    (channel, group) pair is normalised with its own batch-norm
    statistics and affine parameters.

    Args:
        C: number of input channels.
        S: number of frequency sub-bands.  The frequency size of the
            input must be divisible by ``S``.
        eps: numerical-stability term used by the underlying batch norm.
    '''

    def __init__(self, C, S, eps=1e-5):
        super(SubSpectralNorm, self).__init__()
        self.S = S
        self.eps = eps
        # eps was previously stored but never handed to the batch norm.
        self.bn = nn.BatchNorm2d(C * S, eps=eps)

    def forward(self, x):
        '''Normalise ``x`` of shape ``(N, C, F, T)``; the shape is kept.'''
        N, C, F, T = x.size()
        # Fold the sub-band index into the channel axis so BatchNorm2d
        # keeps separate statistics per (channel, sub-band).  reshape()
        # behaves like view() on contiguous tensors and also accepts
        # non-contiguous input.
        x = x.reshape(N, C * self.S, F // self.S, T)
        x = self.bn(x)
        return x.reshape(N, C, F, T)
class BroadcastedBlock(nn.Module):
    '''Residual block that keeps the channel count (``planes``) fixed.

    A depthwise convolution along frequency plus sub-spectral norm yields
    a 2-D feature map.  That map is averaged over frequency, processed by
    a depthwise convolution along time, and added back to the 2-D map and
    the block input (the frequency-averaged branch broadcasts over the
    frequency axis in the sum).

    Args:
        planes: number of input and output channels.
        dilation: dilation passed to both depthwise convolutions.
        stride: stride passed to both depthwise convolutions.
        temp_pad: padding of the time-axis depthwise convolution.
    '''

    def __init__(
        self,
        planes: int,
        dilation=1,
        stride=1,
        temp_pad=(0, 1),
    ) -> None:
        super().__init__()
        # Settings shared by the two depthwise convolutions.
        depthwise = dict(groups=planes, dilation=dilation, stride=stride,
                         bias=False)

        self.freq_dw_conv = nn.Conv2d(planes, planes, kernel_size=(3, 1),
                                      padding=(1, 0), **depthwise)
        self.ssn1 = SubSpectralNorm(planes, 5)
        self.temp_dw_conv = nn.Conv2d(planes, planes, kernel_size=(1, 3),
                                      padding=temp_pad, **depthwise)
        self.bn = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.channel_drop = nn.Dropout2d(p=0.5)
        self.swish = nn.SiLU()
        self.conv1x1 = nn.Conv2d(planes, planes, kernel_size=(1, 1),
                                 bias=False)

    def forward(self, x: Tensor) -> Tensor:
        '''Apply the block to ``x`` of shape ``(N, planes, F, T)``.'''
        # Frequency branch (2-D features).
        freq_feat = self.ssn1(self.freq_dw_conv(x))

        # Temporal branch on the frequency-averaged features.
        temporal = freq_feat.mean(2, keepdim=True)
        temporal = self.temp_dw_conv(temporal)
        temporal = self.swish(self.bn(temporal))
        temporal = self.channel_drop(self.conv1x1(temporal))

        # Temporal branch + block input + frequency branch.
        return self.relu(temporal + x + freq_feat)
class TransitionBlock(nn.Module):
    '''Block that changes the channel count from ``inplanes`` to ``planes``.

    A 1x1 convolution first maps the input to ``planes`` channels.  A
    depthwise convolution along frequency plus sub-spectral norm then
    yields a 2-D feature map.  That map is averaged over frequency,
    processed by a depthwise convolution along time, and added back to
    the 2-D map.  Unlike ``BroadcastedBlock``, the block input is not
    added to the result.

    Args:
        inplanes: number of input channels.
        planes: number of output channels.
        dilation: dilation passed to both depthwise convolutions.
        stride: stride passed to both depthwise convolutions.
        temp_pad: padding of the time-axis depthwise convolution.
    '''

    def __init__(
        self,
        inplanes: int,
        planes: int,
        dilation=1,
        stride=1,
        temp_pad=(0, 1),
    ) -> None:
        super().__init__()
        # Settings shared by the two depthwise convolutions.
        depthwise = dict(groups=planes, stride=stride, dilation=dilation,
                         bias=False)

        self.freq_dw_conv = nn.Conv2d(planes, planes, kernel_size=(3, 1),
                                      padding=(1, 0), **depthwise)
        self.ssn = SubSpectralNorm(planes, 5)
        self.temp_dw_conv = nn.Conv2d(planes, planes, kernel_size=(1, 3),
                                      padding=temp_pad, **depthwise)
        self.bn1 = nn.BatchNorm2d(planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.channel_drop = nn.Dropout2d(p=0.5)
        self.swish = nn.SiLU()
        self.conv1x1_1 = nn.Conv2d(inplanes, planes, kernel_size=(1, 1),
                                   bias=False)
        self.conv1x1_2 = nn.Conv2d(planes, planes, kernel_size=(1, 1),
                                   bias=False)

    def forward(self, x: Tensor) -> Tensor:
        '''Apply the block to ``x`` of shape ``(N, inplanes, F, T)``.'''
        # Channel projection, then the frequency branch (2-D features).
        projected = self.relu(self.bn1(self.conv1x1_1(x)))
        freq_feat = self.ssn(self.freq_dw_conv(projected))

        # Temporal branch on the frequency-averaged features.
        temporal = freq_feat.mean(2, keepdim=True)
        temporal = self.temp_dw_conv(temporal)
        temporal = self.swish(self.bn2(temporal))
        temporal = self.channel_drop(self.conv1x1_2(temporal))

        return self.relu(freq_feat + temporal)
class BCResNet(torch.nn.Module):
    '''Broadcasted-residual network built from ``TransitionBlock`` and
    ``BroadcastedBlock`` stages.

    Args:
        n_classes: number of output scores per example.  Defaults to 12,
            the value that was previously hard-coded.

    Input is ``(N, 1, F, T)``.  ``conv1`` and the transition blocks of
    stages 2 and 3 each halve the frequency axis, and ``conv2`` removes
    four more bins, so e.g. ``F = 40`` shrinks 20 -> 10 -> 5 -> 1.  Every
    block uses 5 sub-bands for its sub-spectral norm, so the frequency
    size entering each block must be divisible by 5.
    '''

    def __init__(self, n_classes: int = 12):
        super(BCResNet, self).__init__()
        self.n_classes = n_classes
        self.conv1 = nn.Conv2d(1, 16, 5, stride=(2, 1), padding=(2, 2))
        self.block1_1 = TransitionBlock(16, 8)
        self.block1_2 = BroadcastedBlock(8)
        self.block2_1 = TransitionBlock(8, 12, stride=(2, 1), dilation=(1, 2), temp_pad=(0, 2))
        self.block2_2 = BroadcastedBlock(12, dilation=(1, 2), temp_pad=(0, 2))
        self.block3_1 = TransitionBlock(12, 16, stride=(2, 1), dilation=(1, 4), temp_pad=(0, 4))
        self.block3_2 = BroadcastedBlock(16, dilation=(1, 4), temp_pad=(0, 4))
        self.block3_3 = BroadcastedBlock(16, dilation=(1, 4), temp_pad=(0, 4))
        self.block3_4 = BroadcastedBlock(16, dilation=(1, 4), temp_pad=(0, 4))
        self.block4_1 = TransitionBlock(16, 20, dilation=(1, 8), temp_pad=(0, 8))
        self.block4_2 = BroadcastedBlock(20, dilation=(1, 8), temp_pad=(0, 8))
        self.block4_3 = BroadcastedBlock(20, dilation=(1, 8), temp_pad=(0, 8))
        self.block4_4 = BroadcastedBlock(20, dilation=(1, 8), temp_pad=(0, 8))
        self.conv2 = nn.Conv2d(20, 20, 5, groups=20, padding=(0, 2))
        self.conv3 = nn.Conv2d(20, 32, 1, bias=False)
        self.conv4 = nn.Conv2d(32, n_classes, 1, bias=False)

    def forward(self, x):
        '''Return raw scores of shape ``(N, n_classes)`` for ``x`` of shape
        ``(N, 1, F, T)``.'''
        backbone = (
            self.conv1,
            self.block1_1, self.block1_2,
            self.block2_1, self.block2_2,
            self.block3_1, self.block3_2, self.block3_3, self.block3_4,
            self.block4_1, self.block4_2, self.block4_3, self.block4_4,
            self.conv2,
            self.conv3,
        )
        out = x
        for layer in backbone:
            out = layer(out)

        # Average over the time axis, then map to class scores.
        out = out.mean(-1, keepdim=True)
        out = self.conv4(out)
        # NOTE(review): if the frequency axis has not collapsed to size 1
        # by this point, this reshape folds the leftover frequency bins
        # into the batch dimension instead of raising.
        return out.reshape((-1, self.n_classes))