huazai676 commited on
Commit
ba80248
·
verified ·
1 Parent(s): 0adc7af

Upload 6 files

Browse files
StutterNet/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .models import *
2
+ from .io import *
3
+ from .losses import *
4
+ from .metrics import *
5
+ from .train import *
StutterNet/io.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import pandas as pd
4
+ import librosa
5
+ import torchaudio as audio
6
+
7
class SEP28KDataset(torch.utils.data.Dataset):
    """SEP-28k Dataset.

    Wraps pre-sliced waveform clips and their multi-label annotations,
    forcing every clip to exactly 3 s at 16 kHz (48000 samples).
    """

    def __init__(self, x, y, unsqueeze=False, transform=None):
        """
        Args:
            x (hdf5): hdf5 data one of 'Xtrain', 'Xtest', or 'Xvalid'
            y (hdf5): hdf5 file one of 'Ytrain', 'Ytest', or 'Yvalid'
            unsqueeze (bool, Optional): Whether or not to unsqueeze the feature.
                May be required for models that require image-like inputs.
            transform (callable, Optional): Optional transform to be applied
                on a sample.
        """
        self.data = x
        self.labels = y
        self.unsqueeze = unsqueeze
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # load sliced clip and pad/truncate to exactly 3 s @ 16 kHz
        wav = self.data[idx]
        wav = self.pad_trunc(wav, 3000, 16000).astype('float32')
        wav = torch.tensor(wav)

        # get labels
        labels = self.labels[idx].astype('float32')

        if self.transform is not None:
            wav = self.transform(wav)

        if self.unsqueeze:
            # add a leading channel axis for image-like models
            wav = torch.unsqueeze(wav, 0)

        # BUG FIX: wav is already a tensor here; the original
        # torch.tensor(wav).clone().detach() re-wrapped it (which warns
        # and copies twice). clone().detach() alone is the supported idiom.
        return wav.clone().detach(), torch.tensor(labels)

    @staticmethod
    def pad_trunc(sig, max_ms, sr):
        """Pad (with zeros at a random offset) or truncate a 1-D signal
        to exactly ``sr // 1000 * max_ms`` samples.

        Args:
            sig (np.ndarray): 1-D waveform.
            max_ms (int): target duration in milliseconds.
            sr (int): sample rate in Hz.

        Returns:
            np.ndarray: signal of length ``sr // 1000 * max_ms``.
        """
        sig_len = sig.shape[0]
        max_len = sr // 1000 * max_ms

        if sig_len > max_len:
            # Truncate the signal to the given length.
            # BUG FIX: sig is 1-D (sig_len = sig.shape[0]); the original
            # 2-D slice sig[:, :max_len] raised IndexError on this path.
            sig = sig[:max_len]
        elif sig_len < max_len:
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = np.random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len

            # Pad with 0s
            pad_begin = np.zeros(pad_begin_len)
            pad_end = np.zeros(pad_end_len)

            sig = np.concatenate((pad_begin, sig, pad_end), 0)

        return sig
StutterNet/losses.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from torchvision.ops import sigmoid_focal_loss
4
+
5
class CCCLoss(nn.Module):
    '''Concordance correlation coefficient (CCC) loss.

    Computes ``1 - CCC`` per target dimension, where
    ``CCC = 2*cov(y, y_hat) / (var(y) + var(y_hat) + (mu_y - mu_y_hat)^2)``
    (Lin, 1989). Inputs are (batch, n_targets); statistics are taken over
    the batch dimension.
    '''
    def __init__(self, eps=1e-7):
        '''
        Args:
            eps (float, optional): stabilizing term added to the denominator
        '''
        super(CCCLoss, self).__init__()
        self.eps = eps

    def forward(self, y_hat, y):
        '''
        Args:
            y_hat (Tensor): predictions, shape (batch, n_targets)
            y (Tensor): targets, shape (batch, n_targets)

        Returns:
            Tensor: per-target ``1 - CCC``, shape (n_targets,)
        '''
        gold_mean = torch.mean(y.T, dim=-1, keepdim=True)
        pred_mean = torch.mean(y_hat.T, dim=-1, keepdim=True)
        # BUG FIX: covariance is the *mean* of the centered products; the
        # original kept the raw per-sample products, so "CCC" was computed
        # element-wise instead of per target dimension.
        covariance = torch.mean((y.T - gold_mean) * (y_hat.T - pred_mean),
                                dim=-1, keepdim=True)
        gold_var = torch.mean(torch.square(y.T - gold_mean), dim=-1, keepdim=True)
        pred_var = torch.mean(torch.square(y_hat.T - pred_mean), dim=-1, keepdim=True)
        ccc = 2 * covariance / (gold_var + pred_var
                                + torch.square(gold_mean - pred_mean) + self.eps)
        return torch.mean(1 - ccc, dim=-1)
23
+
24
class SigmoidFocalLoss(nn.Module):
    '''Thin ``nn.Module`` wrapper around torchvision's ``sigmoid_focal_loss``.'''

    def __init__(self, reduction=None):
        '''
        Args:
            reduction (str, optional): reduction forwarded verbatim to
                ``sigmoid_focal_loss`` (e.g. 'none', 'mean', 'sum').
        '''
        super(SigmoidFocalLoss, self).__init__()
        self.reduction = reduction

    def forward(self, y_hat, y):
        '''Return the focal loss of logits ``y_hat`` against targets ``y``.'''
        return sigmoid_focal_loss(y_hat, y, reduction=self.reduction)
32
+
33
class StutterLoss(nn.Module):
    '''SEP-28k joint loss: CCC on the binary targets plus sigmoid focal
    loss on the disfluency-class targets.

    Expects predictions/targets of shape (batch, 12): the first 6 columns
    are disfluency-class logits/labels, the last 6 binary ones.
    '''
    def __init__(self, alpha=1, beta=1, stutter_weights=None, reduction='mean'):
        '''
        Args:
            alpha (float, optional): weight of the CCC (stutter) term
            beta (float, optional): weight of the focal (disfluency) term
            stutter_weights (Tensor, optional): per-class weights applied
                to the disfluency term; when given, reduction should be
                'none' so per-class losses survive to be weighted
            reduction (str, optional): reduction for the focal loss
        '''
        super(StutterLoss, self).__init__()
        self.stutter_loss = CCCLoss()
        self.disfluency_loss = SigmoidFocalLoss(reduction=reduction)
        self.alpha = alpha
        self.beta = beta
        self.stutter_weights = stutter_weights
        if isinstance(self.stutter_weights, torch.Tensor):
            # keep as a (1, n_classes) row vector for the matmul below
            self.stutter_weights = self.stutter_weights.reshape((1, -1))

    def forward(self, y_hat, y):
        '''expects (batch, 12) predictions and targets'''
        y_class, y_bin = torch.split(y, [6, 6], dim=-1)
        y_hat_class, y_hat_bin = torch.split(y_hat, [6, 6], dim=-1)
        disfluency_loss = self.disfluency_loss(y_hat_class, y_class)
        stutter_loss = torch.mean(self.stutter_loss(y_hat_bin, y_bin))
        if not isinstance(self.stutter_weights, torch.Tensor):
            return self.alpha * stutter_loss + self.beta * torch.mean(disfluency_loss, dim=0)
        # BUG FIX: the original *called* the already-computed loss tensor
        # (`disfluency_loss(y_hat_class, y_class)`), which raised TypeError.
        # Weight the per-class mean losses instead.
        # NOTE(review): assumes reduction='none' on this path so that
        # disfluency_loss is (batch, 6) -- confirm against callers.
        return self.alpha * stutter_loss + self.beta * (
            self.stutter_weights @ torch.mean(disfluency_loss, dim=0))
StutterNet/metrics.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.metrics import f1_score
2
+ import numpy as np
3
+
4
+ #TODO: implement as nn.Module subclass
5
+
6
def f1(y_hat, y):
    """Sample-averaged F1 of multi-label logits ``y_hat`` against binary
    targets ``y`` (both torch tensors), thresholding sigmoid(logits) at 0.5.
    """
    # BUG FIX: `sigmoid` was never defined or imported in this module
    # (it lives in train.py), so calling f1() raised NameError.
    probs = 1.0 / (1.0 + np.exp(-y_hat.cpu().detach().numpy()))
    per_class_score = f1_score(y.cpu().detach().numpy().astype('int'),
                               (probs > 0.5).astype('int'),
                               average='samples', zero_division=1)
    # f1_score with average='samples' already returns a scalar; np.mean is
    # a harmless no-op kept for interface stability
    return np.mean(per_class_score)
11
+
12
def accuracy(outputs, labels):
    """Mean per-class accuracy of multi-label logits ``outputs`` against
    binary targets ``labels`` (both torch tensors), thresholding
    sigmoid(logits) at 0.5.
    """
    # BUG FIX: `sigmoid` was never defined or imported in this module
    # (it lives in train.py), so calling accuracy() raised NameError.
    y_hat = (1.0 / (1.0 + np.exp(-outputs.cpu().detach().numpy())) > 0.5).astype('int')
    y = labels.cpu().detach().numpy().astype('int')
    batch_size = y.shape[0]
    # fraction of correct predictions per class, then averaged over classes
    per_class_acc = np.sum(y == y_hat, axis=0) / batch_size
    return np.mean(per_class_acc)
StutterNet/models.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torchaudio as audio
4
+ from torch import Tensor
5
+
6
class StutterNet(nn.Module):
    def __init__(self, n_mels=40,
                 dropout=0.0, use_batchnorm=False, scale=1):
        '''Implementation of StutterNet
        from Sheikh et al. "StutterNet:
        Stuttering Detection Using
        Time Delay Neural Network" 2021

        Args:
            n_mels (int, optional): number of mel filter banks
            dropout (float, optional): dropout probability applied after the
                two hidden linear layers
            use_batchnorm (bool, optional): accepted for interface
                compatibility; batchnorm is always applied in the TDNN stack
            scale (float, optional): width scale factor
        '''
        super(StutterNet, self).__init__()

        self.n_mels = n_mels

        # dilated TDNN (1-D conv) feature extractor
        self.tdnn_1 = nn.Conv1d(n_mels, int(512*scale), 5, dilation=1)
        self.tdnn_2 = nn.Conv1d(int(512*scale), int(1536*scale), 5, dilation=2)
        self.tdnn_3 = nn.Conv1d(int(1536*scale), int(512*scale), 7, dilation=3)
        self.tdnn_4 = nn.Conv1d(int(512*scale), int(512*scale), 1)
        self.tdnn_5 = nn.Conv1d(int(512*scale), int(1500*scale), 1)
        # BUG FIX: fc_1 input must equal cat(mean, std) of the tdnn_5
        # output, i.e. 2*int(1500*scale); the original int(3000*scale)
        # differs for fractional scales (e.g. scale=0.333 -> 999 vs 998).
        self.fc_1 = nn.Linear(2*int(1500*scale), 512)
        self.relu = nn.ReLU()
        self.bn_1 = nn.BatchNorm1d(int(512*scale))
        self.bn_2 = nn.BatchNorm1d(int(1536*scale))
        self.bn_3 = nn.BatchNorm1d(int(512*scale))
        self.bn_4 = nn.BatchNorm1d(int(512*scale))
        self.bn_5 = nn.BatchNorm1d(int(1500*scale))

        nn.init.xavier_uniform_(self.fc_1.weight)
        self.dropout_1 = nn.Dropout(dropout)
        self.fc_2 = nn.Linear(512, 512)
        # BUG FIX: fc_2 was never initialized -- the original
        # xavier-initialized fc_1 twice (copy-paste error).
        nn.init.xavier_uniform_(self.fc_2.weight)
        self.dropout_2 = nn.Dropout(dropout)

        # two 6-way output heads: binary (stutter) and class (disfluency)
        self.binary_head = nn.Linear(512, 6)
        self.class_head = nn.Linear(512, 6)

        self.sig = nn.Sigmoid()

    def forward(self, x):
        '''Forward pass.

        Args:
            x (Tensor): (batch, n_mels, time) features; time must exceed the
                TDNN receptive field (30 frames of total reduction).

        Returns:
            Tensor: (batch, 12) logits -- 6 binary then 6 class outputs.
        '''
        batch_size = x.shape[0]

        # TDNN stack: conv -> ReLU -> batchnorm
        x = self.bn_1(self.relu(self.tdnn_1(x)))
        x = self.bn_2(self.relu(self.tdnn_2(x)))
        x = self.bn_3(self.relu(self.tdnn_3(x)))
        x = self.bn_4(self.relu(self.tdnn_4(x)))
        x = self.bn_5(self.relu(self.tdnn_5(x)))

        # statistics pooling over time: concat of mean and std
        mean = torch.mean(x, -1)
        std = torch.std(x, -1)
        x = torch.cat((mean, std), 1)
        # NOTE(review): no activation between fc_1 and fc_2, matching the
        # original implementation -- confirm this is intended
        x = self.dropout_1(self.fc_1(x))
        x = self.dropout_2(self.fc_2(x))

        binary = self.binary_head(x)
        classes = self.class_head(x)

        return torch.cat((binary, classes), dim=-1)
95
+
96
class ResBlock1d(nn.Module):
    '''1-D residual block: an entry conv (optionally strided for
    downsampling) followed by ``depth - 1`` same-padded convs, with an
    identity (or strided-conv) skip connection added at the end. No
    activation follows the final conv -- callers apply it after the sum.'''

    def __init__(self, input_dims, output_dims, depth=2, kernel_size=3,
                 use_batchnorm=False, downsample=False, dropout=0.0):
        super(ResBlock1d, self).__init__()

        self.depth = depth
        self.use_batchnorm = use_batchnorm
        self.up = None

        stride = 1
        if downsample:
            # strided conv projects the skip path to the new width/length
            self.down = nn.Conv1d(int(input_dims), int(output_dims), 3, 2, padding=1)
            stride = 2

        self.downsample = downsample

        self.conv_1 = nn.Conv1d(int(input_dims), output_dims, 3,
                                stride=stride, padding=1)
        self.convs = nn.ModuleList([
            nn.Conv1d(output_dims, output_dims, kernel_size, padding='same')
            for _ in range(depth - 1)])

        self.bn_1 = nn.BatchNorm1d(output_dims)
        self.bn = None
        if use_batchnorm:
            self.bn = nn.ModuleList([
                nn.BatchNorm1d(output_dims) for _ in range(depth - 1)])

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''Apply the block; returns a tensor with ``output_dims`` channels.'''
        residual = self.down(x) if self.downsample else x

        out = self.bn_1(self.conv_1(x))
        for i, conv in enumerate(self.convs):
            out = self.dropout(conv(out))
            if self.use_batchnorm:
                out = self.bn[i](out)
            # no activation after the last conv of the stack
            if i != self.depth - 2:
                out = self.relu(out)

        return residual + out
154
+
155
class ResNet1D(nn.Module):
    def __init__(self, n_mels=100, n_classes=12, kernel_size=3,
                 dropout=0.0, use_batchnorm=False, scale=1):
        '''1-D ResNet classifier over framed audio features: an entry conv,
        four stages of three ResBlock1d each (the first block of every
        stage downsamples time by 2), statistics pooling (mean + std over
        time), and a linear classifier.

        (Docstring fixed -- it was copy-pasted from StutterNet.)

        Args:
            n_mels (int, optional): number of mel filter banks
            n_classes (int, optional): number of classes in output layer
            kernel_size (int, optional): kernel size inside residual blocks
            dropout (float, optional): dropout used inside residual blocks
            use_batchnorm (bool, optional): whether to batchnorm inside the
                residual blocks
            scale (float, optional): width scale factor
        '''
        super(ResNet1D, self).__init__()

        self.n_mels = n_mels

        self.tdnn_1 = nn.Conv1d(n_mels, int(64*scale), 3, padding=1, bias=False)

        # stage 1: 64*scale channels
        self.res_1_1 = ResBlock1d(int(64*scale), int(64*scale), kernel_size=kernel_size, downsample=True, use_batchnorm=use_batchnorm, dropout=dropout)
        self.res_1_2 = ResBlock1d(int(64*scale), int(64*scale), kernel_size=kernel_size, downsample=False, use_batchnorm=use_batchnorm, dropout=dropout)
        self.res_1_3 = ResBlock1d(int(64*scale), int(64*scale), kernel_size=kernel_size, downsample=False, use_batchnorm=use_batchnorm, dropout=dropout)

        # stage 2: 128*scale channels
        self.res_2_1 = ResBlock1d(int(64*scale), int(128*scale), kernel_size=kernel_size, downsample=True, use_batchnorm=use_batchnorm, dropout=dropout)
        self.res_2_2 = ResBlock1d(int(128*scale), int(128*scale), kernel_size=kernel_size, downsample=False, use_batchnorm=use_batchnorm, dropout=dropout)
        self.res_2_3 = ResBlock1d(int(128*scale), int(128*scale), kernel_size=kernel_size, downsample=False, use_batchnorm=use_batchnorm, dropout=dropout)

        # stage 3: 256*scale channels
        self.res_3_1 = ResBlock1d(int(128*scale), int(256*scale), kernel_size=kernel_size, downsample=True, use_batchnorm=use_batchnorm, dropout=dropout)
        self.res_3_2 = ResBlock1d(int(256*scale), int(256*scale), kernel_size=kernel_size, downsample=False, use_batchnorm=use_batchnorm, dropout=dropout)
        self.res_3_3 = ResBlock1d(int(256*scale), int(256*scale), kernel_size=kernel_size, downsample=False, use_batchnorm=use_batchnorm, dropout=dropout)

        # stage 4: 512*scale channels
        self.res_4_1 = ResBlock1d(int(256*scale), int(512*scale), kernel_size=kernel_size, downsample=True, use_batchnorm=use_batchnorm, dropout=dropout)
        self.res_4_2 = ResBlock1d(int(512*scale), int(512*scale), kernel_size=kernel_size, downsample=False, use_batchnorm=use_batchnorm, dropout=dropout)
        self.res_4_3 = ResBlock1d(int(512*scale), int(512*scale), kernel_size=kernel_size, downsample=False, use_batchnorm=use_batchnorm, dropout=dropout)

        self.relu = nn.ReLU()
        # BUG FIX: fc input must equal cat(mean, std) of int(512*scale)
        # channels, i.e. 2*int(512*scale); the original int(1024*scale)
        # differs for fractional scales (e.g. scale=0.333 -> 341 vs 340).
        self.fc = nn.Linear(2*int(512*scale), n_classes)

    def forward(self, x):
        '''(batch, n_mels, time) -> (batch, n_classes) logits.'''
        x = self.tdnn_1(x)

        x = self.relu(self.res_1_1(x))
        x = self.relu(self.res_1_2(x))
        x = self.relu(self.res_1_3(x))

        x = self.relu(self.res_2_1(x))
        x = self.relu(self.res_2_2(x))
        x = self.relu(self.res_2_3(x))

        x = self.relu(self.res_3_1(x))
        x = self.relu(self.res_3_2(x))
        x = self.relu(self.res_3_3(x))

        x = self.relu(self.res_4_1(x))
        x = self.relu(self.res_4_2(x))
        x = self.relu(self.res_4_3(x))

        # statistics pooling over time: concat of mean and std
        mean = torch.mean(x, -1)
        std = torch.std(x, -1)
        x = torch.cat((mean, std), 1)
        return self.fc(x)
248
+
249
+ from torch import Tensor
250
+
251
+ '''credit: https://github.com/roman-vygon/BCResNet'''
252
+
253
class SubSpectralNorm(nn.Module):
    '''Sub-spectral normalization: batch-normalizes each of ``S``
    frequency sub-bands with its own statistics by folding the band index
    into the channel axis. Requires F divisible by S.'''

    def __init__(self, C, S, eps=1e-5):
        super(SubSpectralNorm, self).__init__()
        self.S = S
        self.eps = eps
        self.bn = nn.BatchNorm2d(C * S)

    def forward(self, x):
        # x: (N, C, F, T) features; split F into S bands folded into C
        batch, channels, freq, time = x.size()
        grouped = x.view(batch, channels * self.S, freq // self.S, time)
        normed = self.bn(grouped)
        return normed.view(batch, channels, freq, time)
269
+
270
+
271
class BroadcastedBlock(nn.Module):
    '''BC-ResNet "normal" broadcasted residual block: a frequency
    depthwise conv path (f2), frequency average pooling, a temporal
    depthwise conv path (f1), and a broadcast residual sum of identity,
    f2 and f1 outputs.'''

    def __init__(
            self,
            planes: int,
            dilation=1,
            stride=1,
            temp_pad=(0, 1),
    ) -> None:
        super(BroadcastedBlock, self).__init__()

        # f2: depthwise conv along frequency
        self.freq_dw_conv = nn.Conv2d(planes, planes, kernel_size=(3, 1), padding=(1, 0), groups=planes,
                                      dilation=dilation,
                                      stride=stride, bias=False)
        self.ssn1 = SubSpectralNorm(planes, 5)
        # f1: depthwise conv along time
        self.temp_dw_conv = nn.Conv2d(planes, planes, kernel_size=(1, 3), padding=temp_pad, groups=planes,
                                      dilation=dilation, stride=stride, bias=False)
        self.bn = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.channel_drop = nn.Dropout2d(p=0.5)
        self.swish = nn.SiLU()
        self.conv1x1 = nn.Conv2d(planes, planes, kernel_size=(1, 1), bias=False)

    def forward(self, x: Tensor) -> Tensor:
        # f2 branch
        freq_feat = self.ssn1(self.freq_dw_conv(x))

        # frequency average pooling, then the f1 branch
        pooled = freq_feat.mean(2, keepdim=True)
        temp_feat = self.channel_drop(
            self.conv1x1(self.swish(self.bn(self.temp_dw_conv(pooled)))))

        # broadcast residual: f1 output + identity + f2 output
        return self.relu(temp_feat + x + freq_feat)
318
+
319
+
320
class TransitionBlock(nn.Module):
    '''BC-ResNet transition block: a 1x1 channel projection followed by the
    frequency depthwise path (f2), frequency average pooling, the temporal
    depthwise path (f1), and a broadcast sum of the two branch outputs
    (no identity connection -- the channel count changes).'''

    def __init__(
            self,
            inplanes: int,
            planes: int,
            dilation=1,
            stride=1,
            temp_pad=(0, 1),
    ) -> None:
        super(TransitionBlock, self).__init__()

        self.freq_dw_conv = nn.Conv2d(planes, planes, kernel_size=(3, 1), padding=(1, 0), groups=planes,
                                      stride=stride,
                                      dilation=dilation, bias=False)
        self.ssn = SubSpectralNorm(planes, 5)
        self.temp_dw_conv = nn.Conv2d(planes, planes, kernel_size=(1, 3), padding=temp_pad, groups=planes,
                                      dilation=dilation, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.channel_drop = nn.Dropout2d(p=0.5)
        self.swish = nn.SiLU()
        self.conv1x1_1 = nn.Conv2d(inplanes, planes, kernel_size=(1, 1), bias=False)
        self.conv1x1_2 = nn.Conv2d(planes, planes, kernel_size=(1, 1), bias=False)

    def forward(self, x: Tensor) -> Tensor:
        # f2: 1x1 projection (conv-bn-relu) then frequency depthwise conv
        proj = self.relu(self.bn1(self.conv1x1_1(x)))
        freq_feat = self.ssn(self.freq_dw_conv(proj))

        # frequency average pooling feeds the f1 branch
        pooled = freq_feat.mean(2, keepdim=True)
        temp_feat = self.channel_drop(
            self.conv1x1_2(self.swish(self.bn2(self.temp_dw_conv(pooled)))))

        # broadcast sum of the two branches
        return self.relu(freq_feat + temp_feat)
371
+
372
class BCResNet(torch.nn.Module):
    '''BC-ResNet for (N, 1, F, T) spectrogram input: an entry conv that
    halves frequency, four stages of transition + broadcasted blocks,
    depthwise/pointwise head convs, temporal average pooling, and a
    (N, 12) logit output.'''

    def __init__(self):
        super(BCResNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, 5, stride=(2, 1), padding=(2, 2))
        self.block1_1 = TransitionBlock(16, 8)
        self.block1_2 = BroadcastedBlock(8)

        self.block2_1 = TransitionBlock(8, 12, stride=(2, 1), dilation=(1, 2), temp_pad=(0, 2))
        self.block2_2 = BroadcastedBlock(12, dilation=(1, 2), temp_pad=(0, 2))

        self.block3_1 = TransitionBlock(12, 16, stride=(2, 1), dilation=(1, 4), temp_pad=(0, 4))
        self.block3_2 = BroadcastedBlock(16, dilation=(1, 4), temp_pad=(0, 4))
        self.block3_3 = BroadcastedBlock(16, dilation=(1, 4), temp_pad=(0, 4))
        self.block3_4 = BroadcastedBlock(16, dilation=(1, 4), temp_pad=(0, 4))

        self.block4_1 = TransitionBlock(16, 20, dilation=(1, 8), temp_pad=(0, 8))
        self.block4_2 = BroadcastedBlock(20, dilation=(1, 8), temp_pad=(0, 8))
        self.block4_3 = BroadcastedBlock(20, dilation=(1, 8), temp_pad=(0, 8))
        self.block4_4 = BroadcastedBlock(20, dilation=(1, 8), temp_pad=(0, 8))

        self.conv2 = nn.Conv2d(20, 20, 5, groups=20, padding=(0, 2))
        self.conv3 = nn.Conv2d(20, 32, 1, bias=False)
        self.conv4 = nn.Conv2d(32, 12, 1, bias=False)

    def forward(self, x):
        out = self.conv1(x)

        # the residual stages run strictly in sequence
        for block in (self.block1_1, self.block1_2,
                      self.block2_1, self.block2_2,
                      self.block3_1, self.block3_2, self.block3_3, self.block3_4,
                      self.block4_1, self.block4_2, self.block4_3, self.block4_4):
            out = block(out)

        out = self.conv2(out)
        out = self.conv3(out)
        out = out.mean(-1, keepdim=True)  # temporal average pooling
        out = self.conv4(out)

        return out.reshape((-1, 12))
StutterNet/train.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import numpy as np
4
+ import argparse
5
+
6
def sigmoid(x):
    '''Elementwise logistic function 1 / (1 + e^-x) (numpy).'''
    z = np.exp(-x)
    return 1 / (1 + z)
8
+
9
def parser():
    '''Build and evaluate the command-line argument parser.

    #TODO: no options are defined yet; this currently just validates argv.
    '''
    arg_parser = argparse.ArgumentParser()
    return arg_parser.parse_args()
13
+
14
def train(net, trainloader, criterion, batch_size, target_names,
          validationloader=None, optimizer=None,
          scheduler=None, epochs=50, logdir=None, metrics=None,
          verbose=True, tuner=False, checkpoint_dir=None):
    ''' training loop function for simple
    supervised learning task.

    Trains `net` with `criterion`, logging per-step train loss and a
    per-epoch sklearn classification report (train, and validation if
    provided) to tensorboard.

    Args:
        net (torch.nn.Module): network to train
        trainloader (torch.utils.data.DataLoader):
            train data loader
        criterion (torch.nn.object): criterion with which
            to optimize the provided network
        batch_size (int): batch of trainloader and validationloader
        target_names (list of str): label names passed to sklearn's
            classification_report (12 labels expected -- see buffers below)
        validationloader (torch.utils.data.DataLoader, optional):
            validation data loader
        optimizer (torch.optim.Optimizer, optional):
            optimizer function, defaults to torch.nn.optim.Adam w/ amsgrad
        scheduler (torch.optim.lr_scheduler, optional):
            learning rate scheduler object
        epochs (int, optional): number of epochs to train network,
            defaults to 50
        logdir (string, optional): path to tensorboard log directory,
            if None logging default to ./runs/ directory
        metrics (list of tuples, optional): metrics to be logged with
            name and metric being the first and second element of the
            each tuple respectively
        verbose (bool, optional): whether or not to print information
            to console
        tuner (bool, optional): whether to employ ray tune
        checkpoint_dir (str, optional): path to a torch checkpoint dict
            with keys 'epoch', 'state_dict' and 'optimizer' to resume from
    '''
    # local imports keep heavyweight/optional deps out of module import time
    from torch.utils.tensorboard import SummaryWriter
    from sklearn.metrics import classification_report
    writer = SummaryWriter(log_dir=logdir)

    if (verbose):
        # keras Progbar is only needed for the console progress display
        from tensorflow.keras.utils import Progbar

    if (optimizer is None):
        optimizer = torch.optim.Adam(net.parameters(), lr=1e-4, amsgrad=True)

    start_epoch = 0

    if (checkpoint_dir is not None):
        # resume model/optimizer state from a saved checkpoint dict
        # state, optim_state = torch.load(os.path.join(
        #     checkpoint_dir, "checkpoint"))
        state = torch.load(checkpoint_dir)
        start_epoch = state['epoch']
        net.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])

    assert epochs > 0, "Assertion failed. epochs must be greater than 0!"

    steps = 0  # global step counter used as tensorboard x-axis

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # get device

    net.train(True)

    # net.to(device)

    if (tuner):
        from ray import tune
        import os

    for i in range(start_epoch, start_epoch + epochs):
        num_batches = len(trainloader)
        # NOTE(review): assumes every batch is full -- a trailing partial
        # batch leaves zero rows at the end of the report buffers; confirm
        # the loaders drop the last batch
        num_samples = num_batches * batch_size

        if (verbose):
            print("\nepoch {}/{}".format(i+1, start_epoch+epochs))
            pbar = Progbar(target=num_batches)

        # if (metrics is not None):
        #     train_metrics = [0 for metric in metrics]

        # epoch-level buffers for the classification report;
        # NOTE(review): label width hard-coded to 12 -- should match
        # len(target_names)
        y_true = np.zeros((num_samples, 12))
        y_pred = np.zeros((num_samples, 12))
        idx = 0

        for j, data in enumerate(iter(trainloader)):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)
            # inputs, labels = data[0].to(device), [data[1][0].to(device), data[1][1].to(device)]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            train_loss = criterion(outputs, labels)
            train_loss.backward()
            optimizer.step()

            # accumulate this batch's targets/logits for the epoch report
            y_true[idx:idx+outputs.shape[0], :] = labels.detach().cpu().numpy()
            y_pred[idx:idx+outputs.shape[0], :] = outputs.detach().cpu().numpy()

            idx += outputs.shape[0]

            # NOTE(review): scheduler is stepped per *batch*, not per epoch
            # -- appropriate for e.g. OneCycleLR; confirm for the scheduler
            # actually passed in
            if (scheduler is not None):
                scheduler.step()

            if (verbose):
                pbar.update(j, values=[("loss",
                                        train_loss.detach().cpu().numpy().item())])

            steps += 1

            writer.add_scalar('Loss/train',
                              train_loss.detach().cpu().numpy().item(), steps)

        # if (metrics is not None):
        #     for (j, metric) in enumerate(metrics):
        #         # train_metrics[j] += metric[1](outputs, labels).detach().cpu().numpy()
        #         train_metrics[j] += metric[1](outputs, labels)

        # epoch train report; `sigmoid` is the module-level helper above,
        # logits are thresholded at probability 0.5
        rep = classification_report(y_true.astype('int'),
                                    (sigmoid(y_pred) > 0.5).astype('int'), target_names=target_names,
                                    output_dict=True)

        # log every per-label / averaged statistic from the report dict
        for k in rep.keys():
            for j in rep[k].keys():
                writer.add_scalar(j + '/' + k + '/train',
                                  rep[k][j], steps)

        # if (metrics is not None):
        #     for (j, metric) in enumerate(metrics):
        #         # writer.add_scalar(metric[0] + '/train',
        #         #     train_metrics[j] / num_samples, steps)
        #         writer.add_scalar(metric[0] + '/train',
        #                           train_metrics[j] / num_batches, steps)

        if (validationloader is not None):
            net.train(False)  # switch to eval mode for validation
            val_loss = 0
            # if (metrics is not None):
            #     val_metrics = [0 for metric in metrics]
            num_val_batches = len(validationloader)
            num_val_samples = num_val_batches * batch_size

            y_val_true = np.zeros((num_val_samples, 12))
            y_val_pred = np.zeros((num_val_samples, 12))

            idx = 0

            # NOTE(review): not wrapped in torch.no_grad(), so validation
            # builds autograd graphs it never uses -- consider adding
            for data in iter(validationloader):
                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = data[0].to(device), data[1].to(device)
                # inputs, labels = data[0].to(device), [data[1][0].to(device), data[1][1].to(device)]

                outputs = net(inputs)
                val_loss += criterion(outputs, labels).detach().cpu().numpy()

                y_val_true[idx:idx+outputs.shape[0], :] = labels.detach().cpu().numpy()
                y_val_pred[idx:idx+outputs.shape[0], :] = outputs.detach().cpu().numpy()

                idx += outputs.shape[0]

                # if (metrics is not None):
                #     for (j, metric) in enumerate(metrics):
                #         # val_metrics[j] += metric[1](outputs, labels).detach().cpu().numpy()
                #         val_metrics[j] += metric[1](outputs, labels)

            val_loss /= (num_val_batches) # assume all validation set used
            # scheduler.step(val_loss)

            # validation report: logged as a dict and printed human-readable
            rep = classification_report(y_val_true.astype('int'),
                                        (sigmoid(y_val_pred) > 0.5).astype('int'), target_names=target_names,
                                        output_dict=True)
            print(classification_report(y_val_true.astype('int'),
                                        (sigmoid(y_val_pred) > 0.5).astype('int'), target_names=target_names))
            # output_dict=False)
            #print(rep2)

            for k in rep.keys():
                for j in rep[k].keys():
                    writer.add_scalar(j + '/' + k + '/valid',
                                      rep[k][j], steps)

            writer.add_scalar('Loss/valid', val_loss, steps)

            # if (metrics is not None):
            #     for (j, metric) in enumerate(metrics):
            #         # writer.add_scalar(metric[0] + '/valid',
            #         #     val_metrics[j] / num_val_samples, steps)
            #         writer.add_scalar(metric[0] + '/valid',
            #                           val_metrics[j] / num_val_batches, steps)

            # if (tuner):
            #     with tune.checkpoint_dir(i+1) as checkpoint_dir:
            #         path = os.path.join(checkpoint_dir, "checkpoint")
            #         torch.save((net.state_dict(), optimizer.state_dict()), path)

            #     tune.report(loss=val_loss, accuracy=val_metrics[0] / num_val_samples, iters=i+1)

            if (verbose):
                pbar.update(num_batches, values=[("val_loss",val_loss.item())])
            net.train(True)  # back to training mode for the next epoch
        else:
            if (verbose):
                pbar.update(num_batches, values=None)
215
+
216
# Script entry point: currently only parses (empty) CLI arguments; the
# actual train() invocation is still TODO.
if __name__ == "__main__":
    args = parser() # get arguments

    # TODO: implement args such that we can train from the command line
    # NOTE(review): the commented call below predates the current train()
    # signature (it is missing target_names) -- update when implementing.
    #train(args.net, args.trainloader, args.criterion, args.batch_size,
    #    args.validationloader, args.optimizer,
    #    args.scheduler, args.epochs, args.logdir, args.metrics,
    #    args.verbose, args.tuner, args.checkpoint_dir):