Spaces:

uwx
/

waveformer

Runtime error

App Files Files Community

bandhav committed on Nov 2, 2022

Commit

e6a6383

1 Parent(s): 0e59911

Base code

Browse files

Files changed (11) hide show

app.py +62 -0
default_config.json +60 -0
requirements.txt +9 -0
src/__init__.py +0 -0
src/helpers/__init__.py +0 -0
src/helpers/utils.py +205 -0
src/training/__init__.py +0 -0
src/training/dcc_tf.py +486 -0
src/training/eval.py +214 -0
src/training/synthetic_dataset.py +168 -0
src/training/train.py +311 -0

app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import argparse
+import os
+import wget
+import torch
+import torchaudio
+import gradio as gr
+from src.helpers import utils
+from src.training.dcc_tf import Net as Waveformer
+# Sound classes offered in the UI (41 entries, equal to `label_len` in
+# default_config.json). The position of a name in this list is the index
+# that `waveformer` sets in the query vector, so the order matters --
+# presumably it has to match the label order used at training time.
+TARGETS = [
+    "Acoustic_guitar", "Applause", "Bark", "Bass_drum",
+    "Burping_or_eructation", "Bus", "Cello", "Chime", "Clarinet",
+    "Computer_keyboard", "Cough", "Cowbell", "Double_bass",
+    "Drawer_open_or_close", "Electric_piano", "Fart", "Finger_snapping",
+    "Fireworks", "Flute", "Glockenspiel", "Gong", "Gunshot_or_gunfire",
+    "Harmonica", "Hi-hat", "Keys_jangling", "Knock", "Laughter", "Meow",
+    "Microwave_oven", "Oboe", "Saxophone", "Scissors", "Shatter",
+    "Snare_drum", "Squeak", "Tambourine", "Tearing", "Telephone",
+    "Trumpet", "Violin_or_fiddle", "Writing"
+]
+if not os.path.exists('default_config.json'):
+    config_url = 'https://targetsound.cs.washington.edu/files/default_config.json'
+    print("Downloading model configuration from %s:" % config_url)
+    wget.download(config_url)
+if not os.path.exists('default_ckpt.pt'):
+    ckpt_url = 'https://targetsound.cs.washington.edu/files/default_ckpt.pt'
+    print("\nDownloading the checkpoint from %s:" % ckpt_url)
+    wget.download(ckpt_url)
+# Instantiate model
+params = utils.Params('default_config.json')
+model = Waveformer(**params.model_params)
+utils.load_checkpoint('default_ckpt.pt', model)
+model.eval()
+def waveformer(audio, label_choices):
+    # Read input audio
+    fs, mixture = audio
+    if fs != 44100:
+        raise ValueError(fs)
+    mixture = torch.from_numpy(mixture).unsqueeze(0)
+    # Construct the query vector
+    if len(label_choices) == 0:
+         raise ValueError(label_choices)
+    query = torch.zeros(1, len(TARGETS))
+    for t in label_choices:
+        query[0, TARGETS.index(t)] = 1.
+    with torch.no_grad():
+        output = model(mixture, query)
+    return fs, output.squeeze(0).numpy()
+# Gradio UI: an audio input plus one checkbox per entry of TARGETS, wired
+# to `waveformer`; the returned (sample_rate, samples) tuple is rendered
+# as audio. Note that the app is launched at import time.
+label_checkbox = gr.CheckboxGroup(choices=TARGETS)
+demo = gr.Interface(fn=waveformer, inputs=['audio', label_checkbox], outputs="audio")
+demo.launch()

default_config.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+    "model": "src.training.dcc_tf",
+    "model_params":
+    {
+        "label_len": 41,
+        "L": 32,
+        "enc_dim": 512,
+        "num_enc_layers": 10,
+        "dec_dim": 256,
+        "num_dec_layers": 1,
+        "dec_buf_len": 13,
+        "dec_chunk_size": 13,
+        "out_buf_len": 4,
+        "use_pos_enc": "true"
+    },
+    "train_data":
+    {
+        "input_dir": "data/FSDSoundScapes",
+        "dset": "train",
+        "sr": 44100,
+        "resample_rate": null,
+	"max_num_targets":3
+    },
+    "val_data":
+    {
+        "input_dir": "data/FSDSoundScapes",
+        "dset": "val",
+        "sr": 44100,
+        "resample_rate": null,
+	"max_num_targets":3
+    },
+    "test_data":
+    {
+        "input_dir": "data/FSDSoundScapes",
+        "dset": "test",
+        "sr": 44100,
+        "resample_rate": null,
+	"max_num_targets":3
+    },
+    "optim":
+    {
+        "lr": 5e-4,
+        "weight_decay": 0.0
+    },
+    "lr_sched":
+    {
+        "mode": "max",
+        "factor": 0.1,
+        "patience": 5,
+        "min_lr": 5e-6,
+        "threshold": 0.1,
+        "threshold_mode": "abs"
+    },
+    "base_metric": "scale_invariant_signal_noise_ratio",
+    "fix_lr_epochs": 50,
+    "epochs": 150,
+    "batch_size": 16,
+    "eval_batch_size": 64,
+    "n_workers": 16
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+### Requirements
+librosa
+torch
+torchaudio
+soundfile
+numpy
+speechbrain
+wget
+gradio
+torchmetrics
+pandas
+matplotlib
+tqdm
+tensorboard

src/__init__.py ADDED Viewed

File without changes

src/helpers/__init__.py ADDED Viewed

File without changes

src/helpers/utils.py ADDED Viewed

	@@ -0,0 +1,205 @@

+"""A collection of useful helper functions"""
+import os
+import logging
+import json
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+import pandas as pd
+from torchmetrics.functional import(
+    scale_invariant_signal_noise_ratio as si_snr,
+    signal_noise_ratio as snr,
+    signal_distortion_ratio as sdr,
+    scale_invariant_signal_distortion_ratio as si_sdr)
+import matplotlib.pyplot as plt
+class Params():
+    """Class that loads hyperparameters from a json file.
+    Example:
+    ```
+    params = Params(json_path)
+    print(params.learning_rate)
+    params.learning_rate = 0.5  # change the value of learning_rate in params
+    ```
+    """
+    def __init__(self, json_path):
+        with open(json_path) as f:
+            params = json.load(f)
+            self.__dict__.update(params)
+    def save(self, json_path):
+        with open(json_path, 'w') as f:
+            json.dump(self.__dict__, f, indent=4)
+    def update(self, json_path):
+        """Loads parameters from json file"""
+        with open(json_path) as f:
+            params = json.load(f)
+            self.__dict__.update(params)
+    @property
+    def dict(self):
+        """Gives dict-like access to Params instance by `params.dict['learning_rate']"""
+        return self.__dict__
+def save_graph(train_metrics, test_metrics, save_dir):
+    metrics = [snr, si_snr]
+    results = {'train_loss': train_metrics['loss'],
+               'test_loss' : test_metrics['loss']}
+    for m_fn in metrics:
+        results["train_"+m_fn.__name__] = train_metrics[m_fn.__name__]
+        results["test_"+m_fn.__name__] = test_metrics[m_fn.__name__]
+    results_pd = pd.DataFrame(results)
+    results_pd.to_csv(os.path.join(save_dir, 'results.csv'))
+    fig, temp_ax = plt.subplots(2, 3, figsize=(15,10))
+    axs=[]
+    for i in temp_ax:
+        for j in i:
+            axs.append(j)
+    x = range(len(train_metrics['loss']))
+    axs[0].plot(x, train_metrics['loss'], label='train')
+    axs[0].plot(x, test_metrics['loss'], label='test')
+    axs[0].set(ylabel='Loss')
+    axs[0].set(xlabel='Epoch')
+    axs[0].set_title('loss',fontweight='bold')
+    axs[0].legend()
+    for i in range(len(metrics)):
+        axs[i+1].plot(x, train_metrics[metrics[i].__name__], label='train')
+        axs[i+1].plot(x, test_metrics[metrics[i].__name__], label='test')
+        axs[i+1].set(xlabel='Epoch')
+        axs[i+1].set_title(metrics[i].__name__,fontweight='bold')
+        axs[i+1].legend()
+    plt.tight_layout()
+    plt.savefig(os.path.join(save_dir, 'results.png'))
+    plt.close(fig)
+def set_logger(log_path):
+    """Set the logger to log info in terminal and file `log_path`.
+    In general, it is useful to have a logger so that every output to the terminal is saved
+    in a permanent file. Here we save it to `model_dir/train.log`.
+    Example:
+    ```
+    logging.info("Starting training...")
+    ```
+    Args:
+        log_path: (string) where to log
+    """
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    logger.handlers.clear()
+    # Logging to a file
+    file_handler = logging.FileHandler(log_path)
+    file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
+    logger.addHandler(file_handler)
+    # Logging to console
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(logging.Formatter('%(message)s'))
+    logger.addHandler(stream_handler)
+def load_checkpoint(checkpoint, model, optim=None, lr_sched=None, data_parallel=False):
+    """Loads model parameters (state_dict) from file_path.
+    Args:
+        checkpoint: (string) filename which needs to be loaded
+        model: (torch.nn.Module) model for which the parameters are loaded
+        data_parallel: (bool) if the model is a data parallel model
+    """
+    if not os.path.exists(checkpoint):
+        raise("File doesn't exist {}".format(checkpoint))
+    state_dict = torch.load(checkpoint)
+    if data_parallel:
+        state_dict['model_state_dict'] = {
+            'module.' + k: state_dict['model_state_dict'][k]
+            for k in state_dict['model_state_dict'].keys()}
+    model.load_state_dict(state_dict['model_state_dict'])
+    if optim is not None:
+        optim.load_state_dict(state_dict['optim_state_dict'])
+    if lr_sched is not None:
+        lr_sched.load_state_dict(state_dict['lr_sched_state_dict'])
+    return state_dict['epoch'], state_dict['train_metrics'], \
+           state_dict['val_metrics']
+def save_checkpoint(checkpoint, epoch, model, optim=None, lr_sched=None,
+                    train_metrics=None, val_metrics=None, data_parallel=False):
+    """Saves model parameters (state_dict) to file_path.
+    Args:
+        checkpoint: (string) filename which needs to be loaded
+        model: (torch.nn.Module) model for which the parameters are loaded
+        data_parallel: (bool) if the model is a data parallel model
+    """
+    if os.path.exists(checkpoint):
+        raise("File already exists {}".format(checkpoint))
+    model_state_dict = model.state_dict()
+    if data_parallel:
+        model_state_dict = {
+            k.partition('module.')[2]:
+            model_state_dict[k] for k in model_state_dict.keys()}
+    optim_state_dict = None if not optim else optim.state_dict()
+    lr_sched_state_dict = None if not lr_sched else lr_sched.state_dict()
+    state_dict = {
+        'epoch': epoch,
+        'model_state_dict': model_state_dict,
+        'optim_state_dict': optim_state_dict,
+        'lr_sched_state_dict': lr_sched_state_dict,
+        'train_metrics': train_metrics,
+        'val_metrics': val_metrics
+    }
+    torch.save(state_dict, checkpoint)
+def model_size(model):
+    """
+    Returns size of the `model` in millions of parameters.
+    """
+    num_train_params = sum(
+        p.numel() for p in model.parameters() if p.requires_grad)
+    return num_train_params / 1e6
+def run_time(model, inputs, profiling=False):
+    """
+    Returns runtime of a model in ms.
+    """
+    # Warmup
+    for _ in range(100):
+        output = model(*inputs)
+    with profile(activities=[ProfilerActivity.CPU],
+                 record_shapes=True) as prof:
+        with record_function("model_inference"):
+            output = model(*inputs)
+    # Print profiling results
+    if profiling:
+        print(prof.key_averages().table(sort_by="self_cpu_time_total",
+                                        row_limit=20))
+    # Return runtime in ms
+    return prof.profiler.self_cpu_time_total / 1000
+def format_lr_info(optimizer):
+    lr_info = ""
+    for i, pg in enumerate(optimizer.param_groups):
+        lr_info += " {group %d: params=%.5fM lr=%.1E}" % (
+            i, sum([p.numel() for p in pg['params']]) / (1024 ** 2), pg['lr'])
+    return lr_info

src/training/__init__.py ADDED Viewed

File without changes

src/training/dcc_tf.py ADDED Viewed

	@@ -0,0 +1,486 @@

+import math
+from collections import OrderedDict
+from typing import Optional
+from torch import Tensor
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchmetrics.functional import(
+    scale_invariant_signal_noise_ratio as si_snr,
+    signal_noise_ratio as snr,
+    signal_distortion_ratio as sdr,
+    scale_invariant_signal_distortion_ratio as si_sdr)
+from speechbrain.lobes.models.transformer.Transformer import PositionalEncoding
+def mod_pad(x, chunk_size, pad):
+    # Mod pad the input to perform integer number of
+    # inferences
+    mod = 0
+    if (x.shape[-1] % chunk_size) != 0:
+        mod = chunk_size - (x.shape[-1] % chunk_size)
+    x = F.pad(x, (0, mod))
+    x = F.pad(x, pad)
+    return x, mod
+class LayerNormPermuted(nn.LayerNorm):
+    def __init__(self, *args, **kwargs):
+        super(LayerNormPermuted, self).__init__(*args, **kwargs)
+    def forward(self, x):
+        """
+        Args:
+            x: [B, C, T]
+        """
+        x = x.permute(0, 2, 1) # [B, T, C]
+        x = super().forward(x)
+        x = x.permute(0, 2, 1) # [B, C, T]
+        return x
+class DepthwiseSeparableConv(nn.Module):
+    """
+    Depthwise separable convolutions
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride,
+                 padding, dilation):
+        super(DepthwiseSeparableConv, self).__init__()
+        self.layers = nn.Sequential(
+            nn.Conv1d(in_channels, in_channels, kernel_size, stride,
+                      padding, groups=in_channels, dilation=dilation),
+            LayerNormPermuted(in_channels),
+            nn.ReLU(),
+            nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1,
+                      padding=0),
+            LayerNormPermuted(out_channels),
+            nn.ReLU(),
+        )
+    def forward(self, x):
+        return self.layers(x)
+class DilatedCausalConvEncoder(nn.Module):
+    """
+    A dilated causal convolution based encoder for encoding
+    time domain audio input into latent space.
+    """
+    def __init__(self, channels, num_layers, kernel_size=3):
+        super(DilatedCausalConvEncoder, self).__init__()
+        self.channels = channels
+        self.num_layers = num_layers
+        self.kernel_size = kernel_size
+        # Compute buffer lengths for each layer
+        # buf_length[i] = (kernel_size - 1) * dilation[i]
+        self.buf_lengths = [(kernel_size - 1) * 2**i
+                            for i in range(num_layers)]
+        # Compute buffer start indices for each layer
+        self.buf_indices = [0]
+        for i in range(num_layers - 1):
+            self.buf_indices.append(
+                self.buf_indices[-1] + self.buf_lengths[i])
+        # Dilated causal conv layers aggregate previous context to obtain
+        # contexful encoded input.
+        _dcc_layers = OrderedDict()
+        for i in range(num_layers):
+            dcc_layer = DepthwiseSeparableConv(
+                channels, channels, kernel_size=3, stride=1,
+                padding=0, dilation=2**i)
+            _dcc_layers.update({'dcc_%d' % i: dcc_layer})
+        self.dcc_layers = nn.Sequential(_dcc_layers)
+    def init_ctx_buf(self, batch_size, device):
+        """
+        Returns an initialized context buffer for a given batch size.
+        """
+        return torch.zeros(
+            (batch_size, self.channels,
+                 (self.kernel_size - 1) * (2**self.num_layers - 1)),
+            device=device)
+    def forward(self, x, ctx_buf):
+        """
+        Encodes input audio `x` into latent space, and aggregates
+        contextual information in `ctx_buf`. Also generates new context
+        buffer with updated context.
+        Args:
+            x: [B, in_channels, T]
+                Input multi-channel audio.
+            ctx_buf: {[B, channels, self.buf_length[0]], ...}
+                A list of tensors holding context for each dilation
+                causal conv layer. (len(ctx_buf) == self.num_layers)
+        Returns:
+            ctx_buf: {[B, channels, self.buf_length[0]], ...}
+                Updated context buffer with output as the
+                last element.
+        """
+        T = x.shape[-1] # Sequence length
+        for i in range(self.num_layers):
+            buf_start_idx = self.buf_indices[i]
+            buf_end_idx = self.buf_indices[i] + self.buf_lengths[i]
+            # DCC input: concatenation of current output and context
+            dcc_in = torch.cat(
+                (ctx_buf[..., buf_start_idx:buf_end_idx], x), dim=-1)
+            # Push current output to the context buffer
+            ctx_buf[..., buf_start_idx:buf_end_idx] = \
+                dcc_in[..., -self.buf_lengths[i]:]
+            # Residual connection
+            x = x + self.dcc_layers[i](dcc_in)
+        return x, ctx_buf
+class CausalTransformerDecoderLayer(torch.nn.TransformerDecoderLayer):
+    """
+    Adapted from:
+    "https://github.com/alexmt-scale/causal-transformer-decoder/blob/"
+    "0caf6ad71c46488f76d89845b0123d2550ef792f/"
+    "causal_transformer_decoder/model.py#L77"
+    """
+    def forward(
+        self,
+        tgt: Tensor,
+        memory: Optional[Tensor] = None,
+        chunk_size: int = 1
+    ) -> Tensor:
+        tgt_last_tok = tgt[:, -chunk_size:, :]
+        # self attention part
+        tmp_tgt, sa_map = self.self_attn(
+            tgt_last_tok,
+            tgt,
+            tgt,
+            attn_mask=None,  # not needed because we only care about the last token
+            key_padding_mask=None,
+        )
+        tgt_last_tok = tgt_last_tok + self.dropout1(tmp_tgt)
+        tgt_last_tok = self.norm1(tgt_last_tok)
+        # encoder-decoder attention
+        if memory is not None:
+            tmp_tgt, ca_map = self.multihead_attn(
+                tgt_last_tok,
+                memory,
+                memory,
+                attn_mask=None, # Attend to the entire chunk
+                key_padding_mask=None,
+            )
+            tgt_last_tok = tgt_last_tok + self.dropout2(tmp_tgt)
+            tgt_last_tok = self.norm2(tgt_last_tok)
+        # final feed-forward network
+        tmp_tgt = self.linear2(
+            self.dropout(self.activation(self.linear1(tgt_last_tok)))
+        )
+        tgt_last_tok = tgt_last_tok + self.dropout3(tmp_tgt)
+        tgt_last_tok = self.norm3(tgt_last_tok)
+        return tgt_last_tok, sa_map, ca_map
+class CausalTransformerDecoder(nn.Module):
+    """
+    A casual transformer decoder which decodes input vectors using
+    precisely `ctx_len` past vectors in the sequence, and using no future
+    vectors at all.
+    """
+    def __init__(self, model_dim, ctx_len, chunk_size, num_layers,
+                 nhead, use_pos_enc, ff_dim):
+        super(CausalTransformerDecoder, self).__init__()
+        self.num_layers = num_layers
+        self.model_dim = model_dim
+        self.ctx_len = ctx_len
+        self.chunk_size = chunk_size
+        self.nhead = nhead
+        self.use_pos_enc = use_pos_enc
+        self.unfold = nn.Unfold(kernel_size=(ctx_len + chunk_size, 1), stride=chunk_size)
+        self.pos_enc = PositionalEncoding(model_dim, max_len=200)
+        self.tf_dec_layers = nn.ModuleList([CausalTransformerDecoderLayer(
+            d_model=model_dim, nhead=nhead, dim_feedforward=ff_dim,
+            batch_first=True) for _ in range(num_layers)])
+    def init_ctx_buf(self, batch_size, device):
+        return torch.zeros(
+            (batch_size, self.num_layers + 1, self.ctx_len, self.model_dim),
+            device=device)
+    def _causal_unfold(self, x):
+        """
+        Unfolds the sequence into a batch of sequences
+        prepended with `ctx_len` previous values.
+        Args:
+            x: [B, ctx_len + L, C]
+            ctx_len: int
+        Returns:
+            [B * L, ctx_len + 1, C]
+        """
+        B, T, C = x.shape
+        x = x.permute(0, 2, 1) # [B, C, ctx_len + L]
+        x = self.unfold(x.unsqueeze(-1)) # [B, C * (ctx_len + chunk_size), -1]
+        x = x.permute(0, 2, 1)
+        x = x.reshape(B, -1, C, self.ctx_len + self.chunk_size)
+        x = x.reshape(-1, C, self.ctx_len + self.chunk_size)
+        x = x.permute(0, 2, 1)
+        return x
+    def forward(self, tgt, mem, ctx_buf, probe=False):
+        """
+        Args:
+            x: [B, model_dim, T]
+            ctx_buf: [B, num_layers, model_dim, ctx_len]
+        """
+        mem, _ = mod_pad(mem, self.chunk_size, (0, 0))
+        tgt, mod = mod_pad(tgt, self.chunk_size, (0, 0))
+        # Input sequence length
+        B, C, T = tgt.shape
+        tgt = tgt.permute(0, 2, 1)
+        mem = mem.permute(0, 2, 1)
+        # Prepend mem with the context
+        mem = torch.cat((ctx_buf[:, 0, :, :], mem), dim=1)
+        ctx_buf[:, 0, :, :] = mem[:, -self.ctx_len:, :]
+        mem_ctx = self._causal_unfold(mem)
+        if self.use_pos_enc:
+            mem_ctx = mem_ctx + self.pos_enc(mem_ctx)
+        # Attention chunk size: required to ensure the model
+        # wouldn't trigger an out-of-memory error when working
+        # on long sequences.
+        K = 1000
+        for i, tf_dec_layer in enumerate(self.tf_dec_layers):
+            # Update the tgt with context
+            tgt = torch.cat((ctx_buf[:, i + 1, :, :], tgt), dim=1)
+            ctx_buf[:, i + 1, :, :] = tgt[:, -self.ctx_len:, :]
+            # Compute encoded output
+            tgt_ctx = self._causal_unfold(tgt)
+            if self.use_pos_enc and i == 0:
+                tgt_ctx = tgt_ctx + self.pos_enc(tgt_ctx)
+            tgt = torch.zeros_like(tgt_ctx)[:, -self.chunk_size:, :]
+            for i in range(int(math.ceil(tgt.shape[0] / K))):
+                tgt[i*K:(i+1)*K], _sa_map, _ca_map = tf_dec_layer(
+                    tgt_ctx[i*K:(i+1)*K], mem_ctx[i*K:(i+1)*K],
+                    self.chunk_size)
+            tgt = tgt.reshape(B, T, C)
+        tgt = tgt.permute(0, 2, 1)
+        if mod != 0:
+            tgt = tgt[..., :-mod]
+        return tgt, ctx_buf
+class MaskNet(nn.Module):
+    def __init__(self, enc_dim, num_enc_layers, dec_dim, dec_buf_len,
+                 dec_chunk_size, num_dec_layers, use_pos_enc, skip_connection, proj):
+        super(MaskNet, self).__init__()
+        self.skip_connection = skip_connection
+        self.proj = proj
+        # Encoder based on dilated causal convolutions.
+        self.encoder = DilatedCausalConvEncoder(channels=enc_dim,
+                                                num_layers=num_enc_layers)
+        # Project between encoder and decoder dimensions
+        self.proj_e2d_e = nn.Sequential(
+            nn.Conv1d(enc_dim, dec_dim, kernel_size=1, stride=1, padding=0,
+                      groups=dec_dim),
+            nn.ReLU())
+        self.proj_e2d_l = nn.Sequential(
+            nn.Conv1d(enc_dim, dec_dim, kernel_size=1, stride=1, padding=0,
+                      groups=dec_dim),
+            nn.ReLU())
+        self.proj_d2e = nn.Sequential(
+            nn.Conv1d(dec_dim, enc_dim, kernel_size=1, stride=1, padding=0,
+                      groups=dec_dim),
+            nn.ReLU())
+        # Transformer decoder that operates on chunks of size
+        # buffer size.
+        self.decoder = CausalTransformerDecoder(
+            model_dim=dec_dim, ctx_len=dec_buf_len, chunk_size=dec_chunk_size,
+            num_layers=num_dec_layers, nhead=8, use_pos_enc=use_pos_enc,
+            ff_dim=2 * dec_dim)
+    def forward(self, x, l, enc_buf, dec_buf):
+        """
+        Generates a mask based on encoded input `e` and the one-hot
+        label `label`.
+        Args:
+            x: [B, C, T]
+                Input audio sequence
+            l: [B, C]
+                Label embedding
+            ctx_buf: {[B, C, <receptive field of the layer>], ...}
+                List of context buffers maintained by DCC encoder
+        """
+        # Enocder the label integrated input
+        e, enc_buf = self.encoder(x, enc_buf)
+        # Label integration
+        l = l.unsqueeze(2) * e
+        # Project to `dec_dim` dimensions
+        if self.proj:
+            e = self.proj_e2d_e(e)
+            m = self.proj_e2d_l(l)
+            # Cross-attention to predict the mask
+            m, dec_buf = self.decoder(m, e, dec_buf)
+        else:
+            # Cross-attention to predict the mask
+            m, dec_buf = self.decoder(l, e, dec_buf)
+        # Project mask to encoder dimensions
+        if self.proj:
+            m = self.proj_d2e(m)
+        # Final mask after residual connection
+        if self.skip_connection:
+            m = l + m
+        return m, enc_buf, dec_buf
+class Net(nn.Module):
+    """
+    Label-conditioned sound extraction network: an input convolution maps
+    the waveform to a latent sequence, `MaskNet` predicts a mask from that
+    latent and the label embedding, and a transposed convolution maps the
+    masked latent back to a waveform.
+    """
+    def __init__(self, label_len, L=8,
+                 enc_dim=512, num_enc_layers=10,
+                 dec_dim=256, dec_buf_len=100, num_dec_layers=2,
+                 dec_chunk_size=72, out_buf_len=2,
+                 use_pos_enc=True, skip_connection=True, proj=True, lookahead=True):
+        super(Net, self).__init__()
+        self.L = L
+        self.out_buf_len = out_buf_len
+        self.enc_dim = enc_dim
+        self.lookahead = lookahead
+        # Input conv to convert input audio to a latent representation.
+        # The stride is L samples; with lookahead the kernel spans 3 * L
+        # samples instead of L.
+        kernel_size = 3 * L if lookahead else L
+        self.in_conv = nn.Sequential(
+            nn.Conv1d(in_channels=1,
+                      out_channels=enc_dim, kernel_size=kernel_size, stride=L,
+                      padding=0, bias=False),
+            nn.ReLU())
+        # Label embedding layer
+        self.label_embedding = nn.Sequential(
+            nn.Linear(label_len, 512),
+            nn.LayerNorm(512),
+            nn.ReLU(),
+            nn.Linear(512, enc_dim),
+            nn.LayerNorm(enc_dim),
+            nn.ReLU())
+        # Mask generator
+        self.mask_gen = MaskNet(
+            enc_dim=enc_dim, num_enc_layers=num_enc_layers,
+            dec_dim=dec_dim, dec_buf_len=dec_buf_len,
+            dec_chunk_size=dec_chunk_size, num_dec_layers=num_dec_layers,
+            use_pos_enc=use_pos_enc, skip_connection=skip_connection, proj=proj)
+        # Output conv layer
+        self.out_conv = nn.Sequential(
+            nn.ConvTranspose1d(
+                in_channels=enc_dim, out_channels=1,
+                kernel_size=(out_buf_len + 1) * L,
+                stride=L,
+                padding=out_buf_len * L, bias=False),
+            nn.Tanh())
+    def init_buffers(self, batch_size, device):
+        """
+        Returns freshly initialized encoder, decoder and output buffers
+        for `batch_size` sequences on `device`. The output buffer is an
+        all-zero [batch_size, enc_dim, out_buf_len] tensor.
+        """
+        enc_buf = self.mask_gen.encoder.init_ctx_buf(batch_size, device)
+        dec_buf = self.mask_gen.decoder.init_ctx_buf(batch_size, device)
+        out_buf = torch.zeros(batch_size, self.enc_dim, self.out_buf_len,
+                              device=device)
+        return enc_buf, dec_buf, out_buf
+    def forward(self, x, label, init_enc_buf=None, init_dec_buf=None,
+                init_out_buf=None, pad=True):
+        """
+        Extracts the audio corresponding to the `label` from the given
+        mixture `x`.
+        Args:
+            x: [B, 1, T]
+                input audio mixture (the input conv has one channel)
+            label: [B, num_labels]
+                label vector fed to the label embedding
+            init_enc_buf, init_dec_buf, init_out_buf:
+                buffers from a previous call; either all three are given
+                or all three are None, in which case fresh buffers are
+                created with `init_buffers`
+            pad: (bool) whether to pad `x` to a multiple of L (plus L
+                samples on both sides when lookahead is enabled)
+        Returns:
+            The extracted audio alone when `init_enc_buf` is None,
+            otherwise the tuple
+            (extracted audio, enc_buf, dec_buf, out_buf).
+        """
+        mod = 0
+        if pad:
+            pad_size = (self.L, self.L) if self.lookahead else (0, 0)
+            x, mod = mod_pad(x, chunk_size=self.L, pad=pad_size)
+        if init_enc_buf is None or init_dec_buf is None or init_out_buf is None:
+            # A partial set of buffers is rejected
+            assert init_enc_buf is None and \
+                   init_dec_buf is None and \
+                   init_out_buf is None, \
+                "Both buffers have to initialized, or " \
+                "both of them have to be None."
+            enc_buf, dec_buf, out_buf = self.init_buffers(
+                x.shape[0], x.device)
+        else:
+            enc_buf, dec_buf, out_buf = \
+                init_enc_buf, init_dec_buf, init_out_buf
+        # Generate latent space representation of the input
+        x = self.in_conv(x)
+        # Generate label embedding
+        l = self.label_embedding(label) # [B, label_len] --> [B, channels]
+        # Generate mask corresponding to the label
+        m, enc_buf, dec_buf = self.mask_gen(x, l, enc_buf, dec_buf)
+        # Apply mask and decode
+        x = x * m
+        # Prepend the latent frames kept from the previous call and keep
+        # the last `out_buf_len` frames for the next one
+        x = torch.cat((out_buf, x), dim=-1)
+        out_buf = x[..., -self.out_buf_len:]
+        x = self.out_conv(x)
+        # Remove mod padding, if present.
+        if mod != 0:
+            x = x[:, :, :-mod]
+        if init_enc_buf is None:
+            return x
+        else:
+            return x, enc_buf, dec_buf, out_buf
+# Define optimizer, loss and metrics
+def optimizer(model, data_parallel=False, **kwargs):
+    return optim.Adam(model.parameters(), **kwargs)
+def loss(pred, tgt):
+    return -0.9 * snr(pred, tgt).mean() - 0.1 * si_snr(pred, tgt).mean()
+def metrics(mixed, output, gt):
+    """ Function to compute metrics """
+    metrics = {}
+    def metric_i(metric, src, pred, tgt):
+        _vals = []
+        for s, t, p in zip(src, tgt, pred):
+            _vals.append((metric(p, t) - metric(s, t)).cpu().item())
+        return _vals
+    for m_fn in [snr, si_snr]:
+        metrics[m_fn.__name__] = metric_i(m_fn,
+                                          mixed[:, :gt.shape[1], :],
+                                          output,
+                                          gt)
+    return metrics

src/training/eval.py ADDED Viewed

	@@ -0,0 +1,214 @@

+"""
+Test script to evaluate the model.
+"""
+import argparse
+import importlib
+import multiprocessing
+import os, glob
+import logging
+import numpy as np
+import torch
+import pandas as pd
+import torch.nn as nn
+from torch.utils.tensorboard import SummaryWriter
+from torch.profiler import profile, record_function, ProfilerActivity
+from tqdm import tqdm  # pylint: disable=unused-import
+from torchmetrics.functional import(
+    scale_invariant_signal_noise_ratio as si_snr,
+    signal_noise_ratio as snr,
+    signal_distortion_ratio as sdr,
+    scale_invariant_signal_distortion_ratio as si_sdr)
+from src.helpers import utils
+from src.training.synthetic_dataset import FSDSoundScapesDataset, tensorboard_add_metrics
+from src.training.synthetic_dataset import tensorboard_add_sample
+def test_epoch(model: nn.Module, device: torch.device,
+               test_loader: torch.utils.data.dataloader.DataLoader,
+               n_items: int, loss_fn, metrics_fn,
+               profiling: bool = False, epoch: int = 0,
+               writer: SummaryWriter = None, data_params = None) -> dict:
+    """
+    Evaluate the network on `test_loader` without gradient tracking.
+    Args:
+        model: network to evaluate; it is switched to eval mode
+        device: device the batches are moved to
+        test_loader: yields (mixed, label, gt) batches
+        n_items: stop after this many batches (None: use all batches)
+        loss_fn: called as loss_fn(output, gt)
+        metrics_fn: called as metrics_fn(mixed, output, gt), returns a
+            dict of lists
+        profiling: log the profiler table of every batch
+        epoch: step used for the tensorboard entries
+        writer: optional tensorboard writer
+        data_params: forwarded to `tensorboard_add_sample`
+    Returns:
+        Dict mapping every metric name (plus 'loss' and 'runtime') to its
+        mean over all collected values.
+    """
+    model.eval()
+    metrics = {}
+    with torch.no_grad():
+        for batch_idx, (mixed, label, gt) in \
+                enumerate(tqdm(test_loader, desc='Test', ncols=100)):
+            mixed = mixed.to(device)
+            label = label.to(device)
+            gt = gt.to(device)
+            # Run through the model; the profiler is always active because
+            # its total CPU time is reported as the 'runtime' metric
+            with profile(activities=[ProfilerActivity.CPU],
+                         record_shapes=True) as prof:
+                with record_function("model_inference"):
+                    output = model(mixed, label)
+            if profiling:
+                logging.info(
+                    prof.key_averages().table(sort_by="self_cpu_time_total",
+                                              row_limit=20))
+            # Compute loss
+            loss = loss_fn(output, gt)
+            # Compute metrics
+            metrics_batch = metrics_fn(mixed, output, gt)
+            metrics_batch['loss'] = [loss.item()]
+            metrics_batch['runtime'] = [prof.profiler.self_cpu_time_total/1000]
+            # Accumulate the per-batch lists
+            for k in metrics_batch.keys():
+                if not k in metrics:
+                    metrics[k] = metrics_batch[k]
+                else:
+                    metrics[k] += metrics_batch[k]
+            if writer is not None:
+                # Audio samples are logged for the first batch only
+                if batch_idx == 0:
+                    tensorboard_add_sample(
+                        writer, tag='Test',
+                        sample=(mixed[:8], label[:8], gt[:8], output[:8]),
+                        step=epoch, params=data_params)
+                tensorboard_add_metrics(
+                    writer, tag='Test', metrics=metrics_batch, label=label,
+                    step=epoch)
+            if n_items is not None and batch_idx == (n_items - 1):
+                break
+        avg_metrics = {k: np.mean(metrics[k]) for k in metrics.keys()}
+        avg_metrics_str = "Test:"
+        for m in avg_metrics.keys():
+            avg_metrics_str += ' %s=%.04f' % (m, avg_metrics[m])
+        logging.info(avg_metrics_str)
+        return avg_metrics
+def evaluate(network, args: argparse.Namespace):
+    """
+    Evaluate the model on a given dataset.
+    """
+    # Load dataset
+    data_test = FSDSoundScapesDataset(**args.test_data)
+    logging.info("Loaded test dataset at %s containing %d elements" %
+                 (args.test_data['input_dir'], len(data_test)))
+    # Set up the device and workers.
+    use_cuda = args.use_cuda and torch.cuda.is_available()
+    if use_cuda:
+        gpu_ids = args.gpu_ids if args.gpu_ids is not None\
+                        else range(torch.cuda.device_count())
+        device_ids = [_ for _ in gpu_ids]
+        data_parallel = len(device_ids) > 1
+        device = 'cuda:%d' % device_ids[0]
+        torch.cuda.set_device(device_ids[0])
+        logging.info("Using CUDA devices: %s" % str(device_ids))
+    else:
+        data_parallel = False
+        device = torch.device('cpu')
+        logging.info("Using device: CPU")
+    # Set multiprocessing params
+    num_workers = min(multiprocessing.cpu_count(), args.n_workers)
+    kwargs = {
+        'num_workers': num_workers,
+        'pin_memory': True
+    } if use_cuda else {}
+    # Set up data loader
+    test_loader = torch.utils.data.DataLoader(data_test,
+                                              batch_size=args.eval_batch_size,
+                                              **kwargs)
+    # Set up model
+    model = network.Net(**args.model_params)
+    if use_cuda and data_parallel:
+        model = nn.DataParallel(model, device_ids=device_ids)
+        logging.info("Using data parallel model")
+    model.to(device)
+    # Load weights
+    if args.pretrain_path == "best":
+        ckpts = glob.glob(os.path.join(args.exp_dir, '*.pt'))
+        ckpts.sort(
+            key=lambda _: int(os.path.splitext(os.path.basename(_))[0]))
+        val_metrics = torch.load(ckpts[-1])['val_metrics'][args.base_metric]
+        best_epoch = max(range(len(val_metrics)), key=val_metrics.__getitem__)
+        args.pretrain_path = os.path.join(args.exp_dir, '%d.pt' % best_epoch)
+        logging.info(
+            "Found 'best' validation %s=%.02f at %s" %
+            (args.base_metric, val_metrics[best_epoch], args.pretrain_path))
+    if args.pretrain_path != "":
+        utils.load_checkpoint(
+            args.pretrain_path, model, data_parallel=data_parallel)
+        logging.info("Loaded pretrain weights from %s" % args.pretrain_path)
+    # Evaluate
+    try:
+        return test_epoch(
+            model, device, test_loader, args.n_items, network.loss,
+            network.metrics, args.profiling)
+    except KeyboardInterrupt:
+        print("Interrupted")
+    except Exception as _:  # pylint: disable=broad-except
+        import traceback  # pylint: disable=import-outside-toplevel
+        traceback.print_exc()
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    # Data Params
+    parser.add_argument('experiments', nargs='+', type=str,
+                        default=None,
+                        help="List of experiments to evaluate. "
+                        "Provide only one experiment when providing "
+                        "pretrained path. If pretrianed path is not "
+                        "provided, epoch with best validation metric "
+                        "is used for evaluation.")
+    parser.add_argument('--results', type=str, default="",
+                        help="Path to the CSV file to store results.")
+    # System params
+    parser.add_argument('--n_items', type=int, default=None,
+                        help="Number of items to test.")
+    parser.add_argument('--pretrain_path', type=str, default="best",
+                        help="Path to pretrained weights")
+    parser.add_argument('--profiling', dest='profiling', action='store_true',
+                        help="Enable or disable profiling.")
+    parser.add_argument('--use_cuda', dest='use_cuda', action='store_true',
+                        help="Whether to use cuda")
+    parser.add_argument('--gpu_ids', nargs='+', type=int, default=None,
+                        help="List of GPU ids used for training. "
+                        "Eg., --gpu_ids 2 4. All GPUs are used by default.")
+    args = parser.parse_args()
+    results = []
+    for exp_dir in args.experiments:
+        eval_args = argparse.Namespace(**vars(args))
+        eval_args.exp_dir = exp_dir
+        utils.set_logger(os.path.join(exp_dir, 'eval.log'))
+        logging.info("Evaluating %s ..." % exp_dir)
+        # Load model and training params
+        params = utils.Params(os.path.join(exp_dir, 'config.json'))
+        for k, v in params.__dict__.items():
+            vars(eval_args)[k] = v
+        network = importlib.import_module(eval_args.model)
+        logging.info("Imported the model from '%s'." % eval_args.model)
+        curr_res = evaluate(network, eval_args)
+        curr_res['experiment'] = os.path.basename(exp_dir)
+        results.append(curr_res)
+        del eval_args
+    if args.results != "":
+        print("Writing results to %s" % args.results)
+        pd.DataFrame(results).to_csv(args.results, index=False)

src/training/synthetic_dataset.py ADDED Viewed

	@@ -0,0 +1,168 @@

+"""
+Torch dataset object for synthetically rendered spatial data.
+"""
+import os
+import json
+import random
+from pathlib import Path
+import logging
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import scaper
+import torch
+import torchaudio
+import torchaudio.transforms as AT
+from random import randrange
+class FSDSoundScapesDataset(torch.utils.data.Dataset):  # type: ignore
+    """
+    Base class for FSD Sound Scapes dataset
+    """
+    _labels = [
+    "Acoustic_guitar", "Applause", "Bark", "Bass_drum",
+    "Burping_or_eructation", "Bus", "Cello", "Chime", "Clarinet",
+    "Computer_keyboard", "Cough", "Cowbell", "Double_bass",
+    "Drawer_open_or_close", "Electric_piano", "Fart", "Finger_snapping",
+    "Fireworks", "Flute", "Glockenspiel", "Gong", "Gunshot_or_gunfire",
+    "Harmonica", "Hi-hat", "Keys_jangling", "Knock", "Laughter", "Meow",
+    "Microwave_oven", "Oboe", "Saxophone", "Scissors", "Shatter",
+    "Snare_drum", "Squeak", "Tambourine", "Tearing", "Telephone",
+    "Trumpet", "Violin_or_fiddle", "Writing"]
+    def __init__(self, input_dir, dset='', sr=None,
+                 resample_rate=None, max_num_targets=1):
+        assert dset in ['train', 'val', 'test'], \
+            "`dset` must be one of ['train', 'val', 'test']"
+        self.dset = dset
+        self.max_num_targets = max_num_targets
+        self.fg_dir = os.path.join(input_dir, 'FSDKaggle2018/%s' % dset)
+        if dset in ['train', 'val']:
+            self.bg_dir = os.path.join(
+                input_dir,
+                'TAU-acoustic-sounds/'
+                'TAU-urban-acoustic-scenes-2019-development')
+        else:
+            self.bg_dir = os.path.join(
+                input_dir,
+                'TAU-acoustic-sounds/'
+                'TAU-urban-acoustic-scenes-2019-evaluation')
+        logging.info("Loading %s dataset: fg_dir=%s bg_dir=%s" %
+                     (dset, self.fg_dir, self.bg_dir))
+        self.samples = sorted(list(
+            Path(os.path.join(input_dir, 'jams', dset)).glob('[0-9]*')))
+        jamsfile = os.path.join(self.samples[0], 'mixture.jams')
+        _, jams, _, _ = scaper.generate_from_jams(
+            jamsfile, fg_path=self.fg_dir, bg_path=self.bg_dir)
+        _sr = jams['annotations'][0]['sandbox']['scaper']['sr']
+        assert _sr == sr, "Sampling rate provided does not match the data"
+        if resample_rate is not None:
+            self.resampler = AT.Resample(sr, resample_rate)
+            self.sr = resample_rate
+        else:
+            self.resampler = lambda a: a
+            self.sr = sr
+    def _get_label_vector(self, labels):
+        """
+        Generates a multi-hot vector corresponding to `labels`.
+        """
+        vector = torch.zeros(len(FSDSoundScapesDataset._labels))
+        for label in labels:
+            idx = FSDSoundScapesDataset._labels.index(label)
+            assert vector[idx] == 0, "Repeated labels"
+            vector[idx] = 1
+        return vector
+    def __len__(self):
+        return len(self.samples)
+    def __getitem__(self, idx):
+        sample_path = self.samples[idx]
+        jamsfile = os.path.join(sample_path, 'mixture.jams')
+        mixture, jams, ann_list, event_audio_list = scaper.generate_from_jams(
+            jamsfile, fg_path=self.fg_dir, bg_path=self.bg_dir)
+        isolated_events = {}
+        for e, a in zip(ann_list, event_audio_list[1:]):
+            # 0th event is background
+            isolated_events[e[2]] = a
+        gt_events = list(pd.read_csv(
+            os.path.join(sample_path, 'gt_events.csv'), sep='\t')['label'])
+        mixture = torch.from_numpy(mixture).permute(1, 0)
+        mixture = self.resampler(mixture.to(torch.float))
+        if self.dset == 'train':
+            labels = random.sample(gt_events, randrange(1,self.max_num_targets+1))
+        elif self.dset == 'val':
+            labels = gt_events[:idx%self.max_num_targets+1]
+        elif self.dset == 'test':
+            labels = gt_events[:self.max_num_targets]
+        label_vector = self._get_label_vector(labels)
+        gt = torch.zeros_like(
+            torch.from_numpy(event_audio_list[1]).permute(1, 0))
+        for l in labels:
+            gt = gt + torch.from_numpy(isolated_events[l]).permute(1, 0)
+        gt = self.resampler(gt.to(torch.float))
+        return mixture, label_vector, gt #, jams
+def tensorboard_add_sample(writer, tag, sample, step, params):
+    """
+    Adds a sample of FSDSynthDataset to tensorboard.
+    """
+    if params['resample_rate'] is not None:
+        sr = params['resample_rate']
+    else:
+        sr = params['sr']
+    resample_rate = 16000 if sr > 16000 else sr
+    m, l, gt, o = sample
+    m, gt, o = (
+        torchaudio.functional.resample(_, sr, resample_rate).cpu()
+        for _ in (m, gt, o))
+    def _add_audio(a, audio_tag, axis, plt_title):
+        for i, ch in enumerate(a):
+            axis.plot(ch, label='mic %d' % i)
+            writer.add_audio(
+                '%s/mic %d' % (audio_tag, i), ch.unsqueeze(0), step, resample_rate)
+        axis.set_title(plt_title)
+        axis.legend()
+    for b in range(m.shape[0]):
+        label = []
+        for i in range(len(l[b, :])):
+            if l[b, i] == 1:
+                label.append(FSDSoundScapesDataset._labels[i])
+        # Add waveforms
+        rows = 3 # input, output, gt
+        fig = plt.figure(figsize=(10, 2 * rows))
+        axes = fig.subplots(rows, 1, sharex=True)
+        _add_audio(m[b], '%s/sample_%d/0_input' % (tag, b), axes[0], "Mixed")
+        _add_audio(o[b], '%s/sample_%d/1_output' % (tag, b), axes[1], "Output (%s)" % label)
+        _add_audio(gt[b], '%s/sample_%d/2_gt' % (tag, b), axes[2], "GT (%s)" % label)
+        writer.add_figure('%s/sample_%d/waveform' % (tag, b), fig, step)
+def tensorboard_add_metrics(writer, tag, metrics, label, step):
+    """
+    Add metrics to tensorboard.
+    """
+    vals = np.asarray(metrics['scale_invariant_signal_noise_ratio'])
+    writer.add_histogram('%s/%s' % (tag, 'SI-SNRi'), vals, step)
+    label_names = [FSDSoundScapesDataset._labels[torch.argmax(_)] for _ in label]
+    for l, v in zip(label_names, vals):
+        writer.add_histogram('%s/%s' % (tag, l), v, step)

src/training/train.py ADDED Viewed

	@@ -0,0 +1,311 @@

+"""
+The main training script for training on synthetic data
+"""
+import argparse
+import multiprocessing
+import os
+import logging
+from pathlib import Path
+import random
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.tensorboard import SummaryWriter
+from tqdm import tqdm  # pylint: disable=unused-import
+from torchmetrics.functional import(
+    scale_invariant_signal_noise_ratio as si_snr,
+    signal_noise_ratio as snr,
+    signal_distortion_ratio as sdr,
+    scale_invariant_signal_distortion_ratio as si_sdr)
+from src.helpers import utils
+from src.training.eval import test_epoch
+from src.training.synthetic_dataset import FSDSoundScapesDataset as Dataset
+from src.training.synthetic_dataset import tensorboard_add_sample
+def train_epoch(model: nn.Module, device: torch.device,
+                optimizer: optim.Optimizer,
+                train_loader: torch.utils.data.dataloader.DataLoader,
+                n_items: int, epoch: int = 0,
+                writer: SummaryWriter = None, data_params = None) -> float:
+    """
+    Train a single epoch.
+    """
+    # Set the model to training.
+    model.train()
+    # Training loop
+    losses = []
+    metrics = {}
+    with tqdm(total=len(train_loader), desc='Train', ncols=100) as t:
+        for batch_idx, (mixed, label, gt) in enumerate(train_loader):
+            mixed = mixed.to(device)
+            label = label.to(device)
+            gt = gt.to(device)
+            # Reset grad
+            optimizer.zero_grad()
+            # Run through the model
+            output = model(mixed, label)
+            # Compute loss
+            loss = network.loss(output, gt)
+            losses.append(loss.item())
+            # Backpropagation
+            loss.backward()
+            # Gradient clipping
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+            # Update the weights
+            optimizer.step()
+            metrics_batch = network.metrics(mixed.detach(), output.detach(),
+                                            gt.detach())
+            for k in metrics_batch.keys():
+                if not k in metrics:
+                    metrics[k] = metrics_batch[k]
+                else:
+                    metrics[k] += metrics_batch[k]
+            if writer is not None and batch_idx == 0:
+                tensorboard_add_sample(
+                    writer, tag='Train',
+                    sample=(mixed.detach()[:8], label.detach()[:8],
+                            gt.detach()[:8], output.detach()[:8]),
+                    step=epoch, params=data_params)
+            # Show current loss in the progress meter
+            t.set_postfix(loss='%.05f'%loss.item())
+            t.update()
+            if n_items is not None and batch_idx == n_items:
+                break
+    avg_metrics = {k: np.mean(metrics[k]) for k in metrics.keys()}
+    avg_metrics['loss'] = np.mean(losses)
+    avg_metrics_str = "Train:"
+    for m in avg_metrics.keys():
+        avg_metrics_str += ' %s=%.04f' % (m, avg_metrics[m])
+    logging.info(avg_metrics_str)
+    return avg_metrics
+def train(args: argparse.Namespace):
+    """
+    Train the network.
+    """
+    # Load dataset
+    data_train = Dataset(**args.train_data)
+    logging.info("Loaded train dataset at %s containing %d elements" %
+                 (args.train_data['input_dir'], len(data_train)))
+    data_val = Dataset(**args.val_data)
+    logging.info("Loaded test dataset at %s containing %d elements" %
+                 (args.val_data['input_dir'], len(data_val)))
+    # Set up the device and workers.
+    use_cuda = args.use_cuda and torch.cuda.is_available()
+    if use_cuda:
+        gpu_ids = args.gpu_ids if args.gpu_ids is not None\
+                        else range(torch.cuda.device_count())
+        device_ids = [_ for _ in gpu_ids]
+        data_parallel = len(device_ids) > 1
+        device = 'cuda:%d' % device_ids[0]
+        torch.cuda.set_device(device_ids[0])
+        logging.info("Using CUDA devices: %s" % str(device_ids))
+    else:
+        data_parallel = False
+        device = torch.device('cpu')
+        logging.info("Using device: CPU")
+    # Set multiprocessing params
+    num_workers = min(multiprocessing.cpu_count(), args.n_workers)
+    kwargs = {
+        'num_workers': num_workers,
+        'pin_memory': True
+    } if use_cuda else {}
+    # Set up data loaders
+    #print(args.batch_size, args.eval_batch_size)
+    train_loader = torch.utils.data.DataLoader(data_train,
+                                               batch_size=args.batch_size,
+                                               shuffle=True, **kwargs)
+    val_loader = torch.utils.data.DataLoader(data_val,
+                                             batch_size=args.eval_batch_size,
+                                             **kwargs)
+    # Set up model
+    model = network.Net(**args.model_params)
+    # Add graph to tensorboard with example train samples
+    # _mixed, _label, _ = next(iter(val_loader))
+    # args.writer.add_graph(model, (_mixed, _label))
+    if use_cuda and data_parallel:
+        model = nn.DataParallel(model, device_ids=device_ids)
+        logging.info("Using data parallel model")
+    model.to(device)
+    # Set up the optimizer
+    logging.info("Initializing optimizer with %s" % str(args.optim))
+    optimizer = network.optimizer(model, **args.optim, data_parallel=data_parallel)
+    logging.info('Learning rates initialized to:' + utils.format_lr_info(optimizer))
+    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
+        optimizer, **args.lr_sched)
+    logging.info("Initialized LR scheduler with params: fix_lr_epochs=%d %s"
+                 % (args.fix_lr_epochs, str(args.lr_sched)))
+    base_metric = args.base_metric
+    train_metrics = {}
+    val_metrics = {}
+    # Load the model if `args.start_epoch` is greater than 0. This will load the
+    # model from epoch = `args.start_epoch - 1`
+    assert args.start_epoch >=0, "start_epoch must be greater than 0."
+    if args.start_epoch > 0:
+        checkpoint_path = os.path.join(args.exp_dir,
+                                       '%d.pt' % (args.start_epoch - 1))
+        _, train_metrics, val_metrics = utils.load_checkpoint(
+            checkpoint_path, model, optim=optimizer, lr_sched=lr_scheduler,
+            data_parallel=data_parallel)
+        logging.info("Loaded checkpoint from %s" % checkpoint_path)
+        logging.info("Learning rates restored to:" + utils.format_lr_info(optimizer))
+    # Training loop
+    try:
+        torch.autograd.set_detect_anomaly(args.detect_anomaly)
+        for epoch in range(args.start_epoch, args.epochs + 1):
+            logging.info("Epoch %d:" % epoch)
+            checkpoint_file = os.path.join(args.exp_dir, '%d.pt' % epoch)
+            assert not os.path.exists(checkpoint_file), \
+                "Checkpoint file %s already exists" % checkpoint_file
+            #print("---- begin trianivg")
+            curr_train_metrics = train_epoch(model, device, optimizer,
+                                             train_loader, args.n_train_items,
+                                             epoch=epoch, writer=args.writer,
+                                             data_params=args.train_data)
+            #raise KeyboardInterrupt
+            curr_test_metrics = test_epoch(model, device, val_loader,
+                                           args.n_test_items, network.loss,
+                                           network.metrics, epoch=epoch,
+                                           writer=args.writer,
+                                           data_params=args.val_data)
+            # LR scheduler
+            if epoch >= args.fix_lr_epochs:
+                lr_scheduler.step(curr_test_metrics[base_metric])
+                logging.info(
+                    "LR after scheduling step: %s" %
+                    [_['lr'] for _ in optimizer.param_groups])
+            # Write metrics to tensorboard
+            args.writer.add_scalars('Train', curr_train_metrics, epoch)
+            args.writer.add_scalars('Val', curr_test_metrics, epoch)
+            args.writer.flush()
+            for k in curr_train_metrics.keys():
+                if not k in train_metrics:
+                    train_metrics[k] = [curr_train_metrics[k]]
+                else:
+                    train_metrics[k].append(curr_train_metrics[k])
+            for k in curr_test_metrics.keys():
+                if not k in val_metrics:
+                    val_metrics[k] = [curr_test_metrics[k]]
+                else:
+                    val_metrics[k].append(curr_test_metrics[k])
+            if max(val_metrics[base_metric]) == val_metrics[base_metric][-1]:
+                logging.info("Found best validation %s!" % base_metric)
+            utils.save_checkpoint(
+                checkpoint_file, epoch, model, optimizer, lr_scheduler,
+                train_metrics, val_metrics, data_parallel)
+            logging.info("Saved checkpoint at %s" % checkpoint_file)
+            utils.save_graph(train_metrics, val_metrics, args.exp_dir)
+        return train_metrics, val_metrics
+    except KeyboardInterrupt:
+        print("Interrupted")
+    except Exception as _:  # pylint: disable=broad-except
+        import traceback  # pylint: disable=import-outside-toplevel
+        traceback.print_exc()
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    # Data Params
+    parser.add_argument('exp_dir', type=str,
+                        default='./experiments/fsd_mask_label_mult',
+                        help="Path to save checkpoints and logs.")
+    parser.add_argument('--n_train_items', type=int, default=None,
+                        help="Number of items to train on in each epoch")
+    parser.add_argument('--n_test_items', type=int, default=None,
+                        help="Number of items to test.")
+    parser.add_argument('--start_epoch', type=int, default=0,
+                        help="Start epoch")
+    parser.add_argument('--pretrain_path', type=str,
+                        help="Path to pretrained weights")
+    parser.add_argument('--use_cuda', dest='use_cuda', action='store_true',
+                        help="Whether to use cuda")
+    parser.add_argument('--gpu_ids', nargs='+', type=int, default=None,
+                        help="List of GPU ids used for training. "
+                        "Eg., --gpu_ids 2 4. All GPUs are used by default.")
+    parser.add_argument('--detect_anomaly', dest='detect_anomaly',
+                        action='store_true',
+                        help="Whether to use cuda")
+    parser.add_argument('--wandb', dest='wandb', action='store_true',
+                        help="Whether to sync tensorboard to wandb")
+    args = parser.parse_args()
+    # Set the random seed for reproducible experiments
+    torch.manual_seed(230)
+    random.seed(230)
+    np.random.seed(230)
+    if args.use_cuda:
+        torch.cuda.manual_seed(230)
+    # Set up checkpoints
+    if not os.path.exists(args.exp_dir):
+        os.makedirs(args.exp_dir)
+    utils.set_logger(os.path.join(args.exp_dir, 'train.log'))
+    # Load model and training params
+    params = utils.Params(os.path.join(args.exp_dir, 'config.json'))
+    for k, v in params.__dict__.items():
+        vars(args)[k] = v
+    # Initialize tensorboard writer
+    tensorboard_dir = os.path.join(args.exp_dir, 'tensorboard')
+    args.writer = SummaryWriter(tensorboard_dir, purge_step=args.start_epoch)
+    if args.wandb:
+        import wandb
+        wandb.init(
+            project='Semaudio', sync_tensorboard=True,
+            dir=tensorboard_dir, name=os.path.basename(args.exp_dir))
+    exec("import %s as network" % args.model)
+    logging.info("Imported the model from '%s'." % args.model)
+    train(args)
+    args.writer.close()
+    if args.wandb:
+        wandb.finish()