| """ |
| Implementation of model from: |
| Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using |
| Convolutional Recurrent Neural Networks" (2019) |
| Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d |
| """ |
| import torch |
| from torch import nn |
|
|
|
|
class JDCNet(nn.Module):
    """
    Joint Detection and Classification Network for singing voice melody.

    A shared convolutional front-end feeds two heads: a pitch classifier
    over ``num_class`` bins and a voicing detector, each with its own
    bidirectional LSTM.

    Args:
        num_class: number of pitch classes predicted per frame.
        leaky_relu_slope: negative slope for every LeakyReLU in the net.
    """
    def __init__(self, num_class=722, leaky_relu_slope=0.01):
        super().__init__()
        self.num_class = num_class

        # Shared front-end: 1 -> 64 channels, spatial size preserved (3x3, pad 1).
        self.conv_block = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(num_features=64),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.Conv2d(64, 64, 3, padding=1, bias=False),
        )

        # Each ResBlock halves the frequency axis via MaxPool2d((1, 2)).
        self.res_block1 = ResBlock(in_channels=64, out_channels=128)
        self.res_block2 = ResBlock(in_channels=128, out_channels=192)
        self.res_block3 = ResBlock(in_channels=192, out_channels=256)

        # Final shared pooling: another 4x reduction on the frequency axis.
        self.pool_block = nn.Sequential(
            nn.BatchNorm2d(num_features=256),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.MaxPool2d(kernel_size=(1, 4)),
            nn.Dropout(p=0.5),
        )

        # Pool the intermediate feature maps down to the same frequency size
        # as poolblock_out so all four can be concatenated channel-wise for
        # the detector branch (kernel sizes assume the input frequency axis
        # reduces consistently -- e.g. 80 bins -- TODO confirm with caller).
        self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40))
        self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20))
        self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10))

        # 640 = 64 + 128 + 192 + 256 concatenated channels.
        self.detector_conv = nn.Sequential(
            nn.Conv2d(640, 256, 1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.Dropout(p=0.5),
        )

        # Single-layer bidirectional LSTMs over (b, seq_len, 512) features.
        # NOTE: the original passed dropout=0.3, but PyTorch only applies
        # LSTM dropout *between* layers, so with num_layers=1 it had no
        # effect and merely raised a UserWarning; it is omitted here
        # (behavior-identical, not part of the state_dict).
        self.bilstm_classifier = nn.LSTM(
            input_size=512, hidden_size=256,
            batch_first=True, bidirectional=True)

        self.bilstm_detector = nn.LSTM(
            input_size=512, hidden_size=256,
            batch_first=True, bidirectional=True)

        # Frame-wise heads applied to flattened LSTM outputs (2 * 256 = 512).
        self.classifier = nn.Linear(in_features=512, out_features=self.num_class)
        self.detector = nn.Linear(in_features=512, out_features=2)

        self.apply(self.init_weights)

    def forward(self, x):
        """
        Run both heads on a batch of spectrogram slices.

        Args:
            x: input of shape (b, 1, seq_len, freq); freq must be such that
               the pooled branches all reach 2 frequency bins (256 ch * 2
               bins = the 512-dim LSTM input) -- presumably 80; verify
               against the caller.

        Returns:
            Tuple ``(classification, detection)``:
                classification: (b, seq_len, num_class) pitch logits.
                detection: (b, seq_len) -- see NOTE below on the summed
                    logits (the original docstring claimed (b, seq_len, 2),
                    which does not match the code).
        """
        seq_len = x.shape[-2]

        convblock_out = self.conv_block(x)
        resblock1_out = self.res_block1(convblock_out)
        resblock2_out = self.res_block2(resblock1_out)
        resblock3_out = self.res_block3(resblock2_out)
        poolblock_out = self.pool_block(resblock3_out)

        # Classifier branch: (b, 256, seq_len, 2) -> (b, seq_len, 512).
        classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512))
        classifier_out, _ = self.bilstm_classifier(classifier_out)

        # Frame-wise linear head, then restore the (b, seq_len, C) layout.
        classifier_out = classifier_out.contiguous().view((-1, 512))
        classifier_out = self.classifier(classifier_out)
        classifier_out = classifier_out.view((-1, seq_len, self.num_class))

        # Detector branch: fuse features from several depths, pooled to a
        # common frequency resolution, concatenated along channels.
        mp1_out = self.maxpool1(convblock_out)
        mp2_out = self.maxpool2(resblock1_out)
        mp3_out = self.maxpool3(resblock2_out)

        concat_out = torch.cat((mp1_out, mp2_out, mp3_out, poolblock_out), dim=1)
        detector_out = self.detector_conv(concat_out)

        detector_out = detector_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512))
        detector_out, _ = self.bilstm_detector(detector_out)

        detector_out = detector_out.contiguous().view((-1, 512))
        detector_out = self.detector(detector_out)
        # NOTE(review): summing the two voicing logits collapses the
        # detection output from (b, seq_len, 2) to (b, seq_len); the
        # paper's detection head is 2-way (voiced/unvoiced). Kept as-is so
        # callers' shapes are unchanged -- confirm this is intentional.
        detector_out = detector_out.view((-1, seq_len, 2)).sum(dim=-1)

        return classifier_out, detector_out

    @staticmethod
    def init_weights(m):
        """Layer-type-dependent initialization, applied via ``self.apply``."""
        if isinstance(m, nn.Linear):
            nn.init.kaiming_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Conv2d):
            nn.init.xavier_normal_(m.weight)
        elif isinstance(m, (nn.LSTM, nn.LSTMCell)):
            # Orthogonal init for weight matrices, normal for bias vectors.
            for p in m.parameters():
                if p.dim() >= 2:
                    nn.init.orthogonal_(p)
                else:
                    nn.init.normal_(p)
|
|
|
|
class ResBlock(nn.Module):
    """Pre-activation residual block with frequency pooling.

    Applies BN -> LeakyReLU -> MaxPool2d((1, 2)) (halving the last axis),
    then a two-conv main path added to a shortcut. When the channel count
    changes, the shortcut is projected with a 1x1 convolution; otherwise
    it is the pooled input itself.
    """

    def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01):
        super().__init__()
        # A projection is only needed when the channel count changes.
        self.downsample = in_channels != out_channels

        # Pre-activation followed by pooling on the frequency axis.
        self.pre_conv = nn.Sequential(
            nn.BatchNorm2d(num_features=in_channels),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.MaxPool2d(kernel_size=(1, 2)),
        )

        # Main path: two 3x3 convolutions with BN + activation in between.
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
        )

        # 1x1 shortcut projection; None when channels already match.
        self.conv1by1 = (
            nn.Conv2d(in_channels, out_channels, 1, bias=False)
            if self.downsample
            else None
        )

    def forward(self, x):
        pooled = self.pre_conv(x)
        shortcut = self.conv1by1(pooled) if self.downsample else pooled
        return self.conv(pooled) + shortcut