NeMo / nemo /collections /asr /data /audio_to_audio.py
camenduru's picture
thanks to NVIDIA ❤
7934b29
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import math
import random
from collections import OrderedDict, namedtuple
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Tuple, Type, Union
import librosa
import numpy as np
import torch
from nemo.collections.asr.parts.preprocessing.segment import AudioSegment
from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType
from nemo.collections.common.parts.preprocessing import collections
from nemo.collections.common.parts.utils import flatten
from nemo.core.classes import Dataset
from nemo.core.neural_types import AudioSignal, EncodedRepresentation, LengthsType, NeuralType
from nemo.utils import logging
# Public names of this module, i.e., the names exported by a star-import.
__all__ = [
'AudioToTargetDataset',
'AudioToTargetWithReferenceDataset',
'AudioToTargetWithEmbeddingDataset',
]
def _audio_collate_fn(batch: List[dict]) -> Tuple[torch.Tensor]:
    """Collate a batch of items returned by __getitem__.

    Examples for each signal are zero padded to the same length
    (batch_length), which is determined by the longest example.
    Lengths of the original signals are returned in the output.
    The dictionaries in `batch` are not modified.

    Args:
        batch: List of dictionaries. Each element of the list
            has the following format
            ```
            {
                'signal_0': 1D or 2D tensor,
                'signal_1': 1D or 2D tensor,
                ...
                'signal_N': 1D or 2D tensor,
            }
            ```
            1D tensors have shape (num_samples,) and 2D tensors
            have shape (num_channels, num_samples)

    Returns:
        A tuple containing signal tensor and signal length tensor (in samples)
        for each signal.
        The output has the following format:
        ```
        (signal_0, signal_0_length, signal_1, signal_1_length, ..., signal_N, signal_N_length)
        ```
        Note that the output format is obtained by interleaving signals and their length.

    Raises:
        RuntimeError: If a signal which needs to be padded is neither 1D nor 2D.
    """
    # Signal names are taken from the first item of the batch
    signals = batch[0].keys()

    batched = tuple()

    for signal in signals:
        # Length is measured along the last (time) axis
        signal_length = [b[signal].shape[-1] for b in batch]
        # Batch length is determined by the longest signal in the batch
        batch_length = max(signal_length)
        b_signal = []
        for s_len, b in zip(signal_length, batch):
            current = b[signal]
            # check if padding is necessary
            if s_len < batch_length:
                if current.ndim == 1:
                    # single-channel signal
                    pad = (0, batch_length - s_len)
                elif current.ndim == 2:
                    # multi-channel signal, pad only the time axis
                    pad = (0, batch_length - s_len, 0, 0)
                else:
                    # Report the shape of the tensor (the key `signal` is only its name)
                    raise RuntimeError(
                        f'Signal {signal} has unsupported dimensions {current.shape}. Currently, only 1D and 2D arrays are supported.'
                    )
                # Pad into a local variable to keep the input dictionary untouched
                current = torch.nn.functional.pad(current, pad)
            # append the current padded signal
            b_signal.append(current)
        # (signal_batched, signal_length)
        batched += (torch.stack(b_signal), torch.tensor(signal_length, dtype=torch.int32))

    # Currently, outputs are expected to be in a tuple, where each element must correspond
    # to the output type in the OrderedDict returned by output_types.
    #
    # Therefore, we return batched signals by interleaving signals and their length:
    #   (signal_0, signal_0_length, signal_1, signal_1_length, ...)
    return batched
@dataclass
class SignalSetup:
    """Description of a group of signals to be loaded by an audio processor.

    Holds the names of the signals together with the optional duration and
    optional channel selectors used when loading them.
    """

    signals: List[str]  # signal names
    duration: Optional[Union[float, list]] = None  # duration for each signal
    channel_selectors: Optional[List[ChannelSelectorType]] = None  # channel selector for loading each signal
class ASRAudioProcessor:
    """Class that processes an example from Audio collection and returns
    a dictionary with prepared signals.

    For example, the output dictionary may be the following
    ```
    {
        'input_signal': input_signal_tensor,
        'target_signal': target_signal_tensor,
        'reference_signal': reference_signal_tensor,
        'embedding_vector': embedding_vector
    }
    ```
    Keys in the output dictionary are ordered with synchronous signals given first,
    followed by asynchronous signals and embedding.

    Args:
        sample_rate: sample rate used for all audio signals
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
    """

    def __init__(
        self, sample_rate: float, random_offset: bool,
    ):
        self.sample_rate = sample_rate
        self.random_offset = random_offset

        # Signal groups are configured after construction through the property setters
        self.sync_setup = None
        self.async_setup = None
        self.embedding_setup = None

    @property
    def sample_rate(self) -> float:
        """Return the sample rate used for all audio signals.
        """
        return self._sample_rate

    @sample_rate.setter
    def sample_rate(self, value: float):
        """Set the sample rate.

        Raises:
            ValueError: If `value` is not positive.
        """
        if value <= 0:
            raise ValueError(f'Sample rate must be positive, received {value}')

        self._sample_rate = value

    @property
    def random_offset(self) -> bool:
        """Return `True` if the offset is randomized when loading a subsegment.
        """
        return self._random_offset

    @random_offset.setter
    def random_offset(self, value: bool):
        self._random_offset = value

    @property
    def sync_setup(self) -> SignalSetup:
        """Return the current setup for synchronous signals.

        Returns:
            A dataclass containing the list of signals, their
            duration and channel selectors.
        """
        return self._sync_setup

    @sync_setup.setter
    def sync_setup(self, value: Optional[SignalSetup]):
        """Setup signals to be loaded synchronously.

        Args:
            value: An instance of SignalSetup with the following fields
                - signals: list of signals (keys of example.audio_signals) which will be loaded
                           synchronously with the same start time and duration.
                - duration: Duration for each signal to be loaded.
                            If duration is set to None, the whole file will be loaded.
                - channel_selectors: A list of channel selector for each signal. If channel selector
                                     is None, all channels in the audio file will be loaded.

        Raises:
            ValueError: If `value` is neither `None` nor an instance of SignalSetup.
        """
        if value is None or isinstance(value, SignalSetup):
            self._sync_setup = value
        else:
            raise ValueError(f'Unexpected type {type(value)} for value {value}.')

    @property
    def async_setup(self) -> SignalSetup:
        """Return the current setup for asynchronous signals.

        Returns:
            A dataclass containing the list of signals, their
            duration and channel selectors.
        """
        return self._async_setup

    @async_setup.setter
    def async_setup(self, value: Optional[SignalSetup]):
        """Setup signals to be loaded asynchronously.

        Args:
            value: An instance of SignalSetup with the following fields
                - signals: list of signals (keys of example.audio_signals) which will be loaded
                           asynchronously with signals possibly having different start and duration
                - duration: Duration for each signal to be loaded.
                            If duration is set to None, the whole file will be loaded.
                - channel_selectors: A list of channel selector for each signal. If channel selector
                                     is None, all channels in the audio file will be loaded.

        Raises:
            ValueError: If `value` is neither `None` nor an instance of SignalSetup.
        """
        if value is None or isinstance(value, SignalSetup):
            self._async_setup = value
        else:
            raise ValueError(f'Unexpected type {type(value)} for value {value}.')

    @property
    def embedding_setup(self) -> SignalSetup:
        """Return the current setup for signals corresponding to an embedding vector.
        """
        return self._embedding_setup

    @embedding_setup.setter
    def embedding_setup(self, value: Optional[SignalSetup]):
        """Setup signals corresponding to an embedding vector.

        Args:
            value: An instance of SignalSetup with the following fields
                - signals: list of signals (keys of example.audio_signals) which will be loaded
                           as embedding vectors.

        Raises:
            ValueError: If `value` is neither `None` nor an instance of SignalSetup.
        """
        if value is None or isinstance(value, SignalSetup):
            self._embedding_setup = value
        else:
            raise ValueError(f'Unexpected type {type(value)} for value {value}.')

    def process(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Process an example from a collection of audio examples.

        Args:
            example: an example from Audio collection.

        Returns:
            An ordered dictionary of signals and their tensors.
            For example, the output dictionary may be the following
            ```
            {
                'input_signal': input_signal_tensor,
                'target_signal': target_signal_tensor,
                'reference_signal': reference_signal_tensor,
                'embedding_vector': embedding_vector
            }
            ```
            Keys in the output dictionary are ordered with synchronous signals given first,
            followed by asynchronous signals and embedding.
        """
        audio = self.load_audio(example=example)
        audio = self.process_audio(audio=audio)
        return audio

    def load_audio(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Given an example, load audio from `example.audio_files` and prepare
        the output dictionary.

        Args:
            example: An example from an audio collection

        Returns:
            An ordered dictionary of signals and their tensors.
            For example, the output dictionary may be the following
            ```
            {
                'input_signal': input_signal_tensor,
                'target_signal': target_signal_tensor,
                'reference_signal': reference_signal_tensor,
                'embedding_vector': embedding_vector
            }
            ```
            Keys in the output dictionary are ordered with synchronous signals given first,
            followed by asynchronous signals and embedding.

        Raises:
            RuntimeError: If no setup is configured, so that nothing was loaded.
        """
        output = OrderedDict()

        if self.sync_setup is not None:
            # Load all signals with the same start and duration
            sync_signals = self.load_sync_signals(example)
            output.update(sync_signals)

        if self.async_setup is not None:
            # Load each signal independently
            async_signals = self.load_async_signals(example)
            output.update(async_signals)

        # Load embedding vector
        if self.embedding_setup is not None:
            embedding = self.load_embedding(example)
            output.update(embedding)

        if not output:
            raise RuntimeError('Output dictionary is empty. Please use `_setup` methods to setup signals to be loaded')

        return output

    def process_audio(self, audio: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Process audio signals available in the input dictionary.

        Args:
            audio: A dictionary containing loaded signals `signal: tensor`

        Returns:
            The input dictionary, returned unchanged.
        """
        # Currently, not doing any processing of the loaded signals.
        return audio

    def load_sync_signals(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Load signals with the same start and duration.

        Args:
            example: an example from audio collection

        Returns:
            An ordered dictionary of signals and their tensors.
        """
        output = OrderedDict()
        sync_audio_files = [example.audio_files[s] for s in self.sync_setup.signals]

        sync_samples = self.get_samples_synchronized(
            audio_files=sync_audio_files,
            channel_selectors=self.sync_setup.channel_selectors,
            sample_rate=self.sample_rate,
            duration=self.sync_setup.duration,
            fixed_offset=example.offset,
            random_offset=self.random_offset,
        )

        for signal, samples in zip(self.sync_setup.signals, sync_samples):
            output[signal] = torch.tensor(samples)

        return output

    def load_async_signals(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Load each async signal independently, no constraints on starting
        from the same time.

        Args:
            example: an example from audio collection

        Returns:
            An ordered dictionary of signals and their tensors.
        """
        output = OrderedDict()
        durations = self.async_setup.duration
        channel_selectors = self.async_setup.channel_selectors

        for idx, signal in enumerate(self.async_setup.signals):
            # A list provides one duration per signal, while `None` (load the
            # complete file) or a single number is applied to every signal.
            duration = durations[idx] if isinstance(durations, (list, tuple)) else durations
            # Without channel selectors, all channels are loaded
            channel_selector = None if channel_selectors is None else channel_selectors[idx]

            samples = self.get_samples(
                audio_file=example.audio_files[signal],
                sample_rate=self.sample_rate,
                duration=duration,
                channel_selector=channel_selector,
                fixed_offset=example.offset,
                random_offset=self.random_offset,
            )
            output[signal] = torch.tensor(samples)

        return output

    @classmethod
    def get_samples(
        cls,
        audio_file: str,
        sample_rate: int,
        duration: Optional[float] = None,
        channel_selector: ChannelSelectorType = None,
        fixed_offset: float = 0,
        random_offset: bool = False,
    ) -> np.ndarray:
        """Get samples from an audio file.
        For a single-channel signal, the output is shape (num_samples,).
        For a multi-channel signal, the output is shape (num_channels, num_samples).

        Args:
            audio_file: path to an audio file
            sample_rate: desired sample rate for output samples
            duration: Optional desired duration of output samples.
                    If `None`, the complete file will be loaded.
                    If set, a segment of `duration` seconds will be loaded.
            channel_selector: Optional channel selector, for selecting a subset of channels.
            fixed_offset: Optional fixed offset when loading samples.
            random_offset: If `True`, offset will be randomized when loading a short segment
                        from a file. The value is randomized between fixed_offset and
                        max_offset (set depending on the duration and fixed_offset).

        Returns:
            Numpy array with samples from audio file.
            The array has shape (num_samples,) for a single-channel signal
            or (num_channels, num_samples) for a multi-channel signal.
        """
        # Reuse the synchronized loader with a single file
        output = cls.get_samples_synchronized(
            audio_files=[audio_file],
            sample_rate=sample_rate,
            duration=duration,
            channel_selectors=[channel_selector],
            fixed_offset=fixed_offset,
            random_offset=random_offset,
        )

        return output[0]

    @classmethod
    def get_samples_synchronized(
        cls,
        audio_files: List[str],
        sample_rate: int,
        duration: Optional[float] = None,
        channel_selectors: Optional[List[ChannelSelectorType]] = None,
        fixed_offset: float = 0,
        random_offset: bool = False,
    ) -> List[np.ndarray]:
        """Get samples from multiple files with the same start and end point.

        Args:
            audio_files: list of paths to audio files
            sample_rate: desired sample rate for output samples
            duration: Optional desired duration of output samples.
                    If `None`, the complete files will be loaded.
                    If set, a segment of `duration` seconds will be loaded from
                    all files. Segment is synchronized across files, so that
                    start and end points are the same.
            channel_selectors: Optional channel selector for each signal, for selecting
                            a subset of channels.
            fixed_offset: Optional fixed offset when loading samples.
            random_offset: If `True`, offset will be randomized when loading a short segment
                        from a file. The value is randomized between fixed_offset and
                        max_offset (set depending on the duration and fixed_offset).

        Returns:
            List with the same size as `audio_files` but containing numpy arrays
            with samples from each audio file.
            Each array has shape (num_samples,) or (num_channels, num_samples), for single-
            or multi-channel signal, respectively.
            For example, if `audio_files = [path/to/file_1.wav, path/to/file_2.wav]`,
            the output will be a list `output = [samples_file_1, samples_file_2]`.

        Raises:
            ValueError: If `duration` is set and `fixed_offset` is not smaller than
                        the duration of the shortest file.
        """
        if channel_selectors is None:
            channel_selectors = [None] * len(audio_files)

        if duration is None:
            # Load complete files starting from a fixed offset
            offset = fixed_offset  # fixed offset
            num_samples = None  # no constrain on the number of samples

        else:
            # Fixed duration of the output
            audio_durations = cls.get_duration(audio_files)
            min_audio_duration = min(audio_durations)
            available_duration = min_audio_duration - fixed_offset

            if available_duration <= 0:
                raise ValueError(f'Fixed offset {fixed_offset}s is larger than shortest file {min_audio_duration}s.')

            if duration + fixed_offset > min_audio_duration:
                # The shortest file is shorter than the requested duration
                logging.warning(
                    f'Shortest file ({min_audio_duration}s) is less than the desired duration {duration}s + fixed offset {fixed_offset}s. Returned signals will be shortened to {available_duration} seconds.'
                )
                offset = fixed_offset
                duration = available_duration
            elif random_offset:
                # Randomize offset based on the shortest file
                max_offset = min_audio_duration - duration
                offset = random.uniform(fixed_offset, max_offset)
            else:
                # Fixed offset
                offset = fixed_offset

            # Fixed number of samples
            num_samples = math.floor(duration * sample_rate)

        output = []

        # Prepare segments
        for idx, audio_file in enumerate(audio_files):
            segment_samples = cls.get_samples_from_file(
                audio_file=audio_file,
                sample_rate=sample_rate,
                offset=offset,
                num_samples=num_samples,
                channel_selector=channel_selectors[idx],
            )
            output.append(segment_samples)

        return output

    @classmethod
    def get_samples_from_file(
        cls,
        audio_file: Union[str, List[str]],
        sample_rate: int,
        offset: float,
        num_samples: Optional[int] = None,
        channel_selector: Optional[ChannelSelectorType] = None,
    ) -> np.ndarray:
        """Get samples from a single or multiple files.
        If loading samples from multiple files, they will
        be concatenated along the channel dimension.

        Args:
            audio_file: path or a list of paths.
            sample_rate: sample rate of the loaded samples
            offset: fixed offset in seconds
            num_samples: Optional, number of samples to load.
                         If `None`, all available samples will be loaded.
            channel_selector: Select a subset of available channels.

        Returns:
            An array with shape (samples,) or (channels, samples).
            If `audio_file` is `None`, an empty list is returned.

        Raises:
            RuntimeError: If `audio_file` is not a string, a list or `None`.
        """
        if isinstance(audio_file, str):
            # Load samples from a single file
            segment_samples = cls.get_segment_from_file(
                audio_file=audio_file,
                sample_rate=sample_rate,
                offset=offset,
                num_samples=num_samples,
                channel_selector=channel_selector,
            )
        elif isinstance(audio_file, list):
            # Load samples from multiple files and form a multi-channel signal
            segment_samples = []
            for a_file in audio_file:
                a_file_samples = cls.get_segment_from_file(
                    audio_file=a_file,
                    sample_rate=sample_rate,
                    offset=offset,
                    num_samples=num_samples,
                    channel_selector=channel_selector,
                )
                segment_samples.append(a_file_samples)
            segment_samples = cls.list_to_multichannel(segment_samples)
        elif audio_file is None:
            # Support for inference, when the target signal is `None`
            segment_samples = []
        else:
            raise RuntimeError(f'Unexpected audio_file type {type(audio_file)}')
        return segment_samples

    @staticmethod
    def get_segment_from_file(
        audio_file: str,
        sample_rate: int,
        offset: float,
        num_samples: Optional[int] = None,
        channel_selector: Optional[ChannelSelectorType] = None,
    ) -> np.ndarray:
        """Get a segment of samples from a single audio file.

        Args:
            audio_file: path to an audio file
            sample_rate: sample rate of the loaded samples
            offset: fixed offset in seconds
            num_samples: Optional, number of samples to load.
                         If `None`, all available samples will be loaded.
            channel_selector: Select a subset of available channels.

        Returns:
            An array with shape (samples,) or (channels, samples)

        Raises:
            RuntimeError: If the loaded samples are neither 1D nor 2D.
        """
        if num_samples is None:
            segment = AudioSegment.from_file(
                audio_file=audio_file, target_sr=sample_rate, offset=offset, channel_selector=channel_selector,
            )

        else:
            segment = AudioSegment.segment_from_file(
                audio_file=audio_file,
                target_sr=sample_rate,
                n_segments=num_samples,
                offset=offset,
                channel_selector=channel_selector,
            )

        if segment.samples.ndim == 1:
            # Single-channel signal
            return segment.samples
        elif segment.samples.ndim == 2:
            # Use multi-channel format as (channels, samples)
            return segment.samples.T
        else:
            raise RuntimeError(f'Unexpected samples shape: {segment.samples.shape}')

    @staticmethod
    def list_to_multichannel(signal: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray:
        """Convert a list of signals into a multi-channel signal by concatenating
        the elements of the list along the channel dimension.

        If input is not a list, it is returned unmodified.

        Args:
            signal: list of arrays

        Returns:
            Numpy array obtained by concatenating the elements of the list
            along the channel dimension (axis=0).

        Raises:
            RuntimeError: If the first element of the list is neither 1D nor 2D.
        """
        if not isinstance(signal, list):
            # Nothing to do there
            return signal
        elif len(signal) == 0:
            # Nothing to do, return as is
            return signal
        elif len(signal) == 1:
            # Nothing to concatenate, return the original format
            return signal[0]

        # If multiple signals are provided in a list, we concatenate them along the channel dimension
        if signal[0].ndim == 1:
            # Single-channel individual files
            mc_signal = np.stack(signal, axis=0)
        elif signal[0].ndim == 2:
            # Multi-channel individual files
            mc_signal = np.concatenate(signal, axis=0)
        else:
            raise RuntimeError(f'Unexpected target with {signal[0].ndim} dimensions.')

        return mc_signal

    @staticmethod
    def get_duration(audio_files: List[str]) -> List[float]:
        """Get duration for each audio file in `audio_files`.

        Args:
            audio_files: list of paths to audio files

        Returns:
            List of durations in seconds.
        """
        # NOTE(review): newer librosa releases deprecate the `filename` keyword in
        # favour of `path` -- confirm against the pinned librosa version before changing.
        duration = [librosa.get_duration(filename=f) for f in flatten(audio_files)]
        return duration

    def load_embedding(self, example: collections.Audio.OUTPUT_TYPE) -> Dict[str, torch.Tensor]:
        """Given an example, load embedding from `example.audio_files[embedding]`
        and return it in a dictionary.

        Args:
            example: An example from audio collection

        Returns:
            A dictionary of embedding keys and their tensors.
        """
        output = OrderedDict()
        for signal in self.embedding_setup.signals:
            embedding_file = example.audio_files[signal]
            embedding = self.load_embedding_vector(embedding_file)
            output[signal] = torch.tensor(embedding)
        return output

    @staticmethod
    def load_embedding_vector(filepath: str) -> np.ndarray:
        """Load an embedding vector from a file.

        Args:
            filepath: path to a file storing a vector.
                    Currently, it is assumed the file is a npy file.

        Returns:
            Array loaded from filepath.

        Raises:
            RuntimeError: If `filepath` does not end with `.npy`.
        """
        if filepath.endswith('.npy'):
            with open(filepath, 'rb') as f:
                embedding = np.load(f)
        else:
            raise RuntimeError(f'Unknown embedding file format in file: {filepath}')

        return embedding
class BaseAudioDataset(Dataset):
    """Common functionality shared by the audio datasets in this module.

    Items are produced by passing an example from the collection through
    the audio processor.

    Args:
        collection: Collection of audio examples prepared from manifest files.
        audio_processor: Used to process every example from the collection.
                         A callable with `process` method. For reference,
                         please check ASRAudioProcessor.
        output_type: Type used to wrap the collated batch in `_collate_fn`.
    """

    @property
    @abc.abstractmethod
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.
        """

    def __init__(self, collection: collections.Audio, audio_processor: Callable, output_type: Type[namedtuple]):
        """Store the collection, the audio processor and the batch output type.
        """
        super().__init__()

        self.output_type = output_type
        self.audio_processor = audio_processor
        self.collection = collection

    def num_channels(self, signal_key) -> int:
        """Returns the number of channels for a particular signal in
        items prepared by this dataset.

        The first item of the dataset is loaded and the selected tensor is
        inspected: a one-dimensional tensor has a single channel, while for a
        two-dimensional tensor the number of channels is the size of its
        first axis (shape[0]).

        NOTE:
            This assumes that all examples have the same number of channels.

        Args:
            signal_key: string, used to select a signal from the dictionary
                        output by __getitem__

        Returns:
            Number of channels for the selected signal.

        Raises:
            RuntimeError: If the selected tensor is neither 1D nor 2D.
        """
        # Assumption: whole dataset has the same number of channels
        item = self.__getitem__(0)
        num_dims = item[signal_key].ndim

        if num_dims == 1:
            return 1

        if num_dims == 2:
            return item[signal_key].shape[0]

        raise RuntimeError(
            f'Unexpected number of dimension for signal {signal_key} with shape {item[signal_key].shape}'
        )

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        """Return a single example from the dataset.

        Args:
            index: integer index of an example in the collection

        Returns:
            Dictionary providing mapping from signal to its tensor.
            For example:
            ```
            {
                'input_signal': input_signal_tensor,
                'target_signal': target_signal_tensor,
            }
            ```
        """
        return self.audio_processor.process(example=self.collection[index])

    def __len__(self) -> int:
        """Return the number of examples in the dataset.
        """
        num_examples = len(self.collection)
        return num_examples

    def _collate_fn(self, batch) -> Tuple[torch.Tensor]:
        """Collate items in a batch and wrap the result in `output_type`.
        """
        collated = _audio_collate_fn(batch)
        return self.output_type(*collated)
# Batch container for AudioToTargetDataset: each signal is followed by its length.
AudioToTargetExample = namedtuple(
    'AudioToTargetExample', 'input_signal input_length target_signal target_length',
)
class AudioToTargetDataset(BaseAudioDataset):
    """A dataset for audio-to-audio tasks where the goal is to use
    an input signal to recover the corresponding target signal.

    Each line of the manifest file is expected to have the following format
    ```
    {
        'input_key': 'path/to/input.wav',
        'target_key': 'path/to/path_to_target.wav',
        'duration': duration_of_input,
    }
    ```

    Additionally, multiple audio files may be provided for each key in the manifest, for example,
    ```
    {
        'input_key': 'path/to/input.wav',
        'target_key': ['path/to/path_to_target_ch0.wav', 'path/to/path_to_target_ch1.wav'],
        'duration': duration_of_input,
    }
    ```

    Keys for input and target signals can be configured in the constructor (`input_key` and `target_key`).

    Args:
        manifest_filepath: Path to manifest file in a format described above.
        sample_rate: Sample rate for loaded audio signals.
        input_key: Key pointing to input audio files in the manifest
        target_key: Key pointing to target audio files in manifest
        audio_duration: Optional duration of each item returned by __getitem__.
                        If `None`, complete audio will be loaded.
                        If set, a subsegment will be loaded synchronously from
                        input and target, i.e., with the same start and end point.
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        max_duration: If audio exceeds this length, do not include in dataset.
        min_duration: If audio is less than this length, do not include in dataset.
        max_utts: Limit number of utterances.
        input_channel_selector: Optional, select subset of channels from each input audio file.
                                If `None`, all channels will be loaded.
        target_channel_selector: Optional, select subset of channels from each target audio file.
                                 If `None`, all channels will be loaded.
    """

    def __init__(
        self,
        manifest_filepath: str,
        sample_rate: int,
        input_key: str,
        target_key: str,
        audio_duration: Optional[float] = None,
        random_offset: bool = False,
        max_duration: Optional[float] = None,
        min_duration: Optional[float] = None,
        max_utts: Optional[int] = None,
        input_channel_selector: Optional[int] = None,
        target_channel_selector: Optional[int] = None,
    ):
        # Examples are read from the manifest, with signal names mapped to manifest keys
        collection = collections.AudioCollection(
            manifest_files=manifest_filepath,
            audio_to_manifest_key={'input_signal': input_key, 'target_signal': target_key},
            min_duration=min_duration,
            max_duration=max_duration,
            max_number=max_utts,
        )

        # Input and target are loaded synchronously
        synchronized = SignalSetup(
            signals=['input_signal', 'target_signal'],
            duration=audio_duration,
            channel_selectors=[input_channel_selector, target_channel_selector],
        )
        audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,)
        audio_processor.sync_setup = synchronized

        super().__init__(collection=collection, audio_processor=audio_processor, output_type=AudioToTargetExample)

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.

        Returns:
            Ordered dictionary in the following form:
            ```
            {
                'input_signal': batched single- or multi-channel format,
                'input_length': batched original length of each input signal
                'target_signal': batched single- or multi-channel format,
                'target_length': batched original length of each target signal
            }
            ```
        """
        sc_audio_type = NeuralType(('B', 'T'), AudioSignal())
        mc_audio_type = NeuralType(('B', 'C', 'T'), AudioSignal())

        # Entries are added in the order expected for the collated batch
        types = OrderedDict()
        types['input_signal'] = sc_audio_type if self.num_channels('input_signal') == 1 else mc_audio_type
        types['input_length'] = NeuralType(('B',), LengthsType())
        types['target_signal'] = sc_audio_type if self.num_channels('target_signal') == 1 else mc_audio_type
        types['target_length'] = NeuralType(('B',), LengthsType())
        return types
# Batch container for AudioToTargetWithReferenceDataset: each signal is followed by its length.
AudioToTargetWithReferenceExample = namedtuple(
    'AudioToTargetWithReferenceExample',
    'input_signal input_length target_signal target_length reference_signal reference_length',
)
class AudioToTargetWithReferenceDataset(BaseAudioDataset):
    """A dataset for audio-to-audio tasks where the goal is to use
    an input signal to recover the corresponding target signal and an
    additional reference signal is available.

    This can be used, for example, when a reference signal is
    available from
    - enrollment utterance for the target signal
    - echo reference from playback
    - reference from another sensor that correlates with the target signal

    Each line of the manifest file is expected to have the following format
    ```
    {
        'input_key': 'path/to/input.wav',
        'target_key': 'path/to/path_to_target.wav',
        'reference_key': 'path/to/path_to_reference.wav',
        'duration': duration_of_input,
    }
    ```

    Keys for input, target and reference signals can be configured in the constructor.

    Args:
        manifest_filepath: Path to manifest file in a format described above.
        sample_rate: Sample rate for loaded audio signals.
        input_key: Key pointing to input audio files in the manifest
        target_key: Key pointing to target audio files in manifest
        reference_key: Key pointing to reference audio files in manifest
        audio_duration: Optional duration of each item returned by __getitem__.
                        If `None`, complete audio will be loaded.
                        If set, a subsegment will be loaded synchronously from
                        input and target, i.e., with the same start and end point.
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        max_duration: If audio exceeds this length, do not include in dataset.
        min_duration: If audio is less than this length, do not include in dataset.
        max_utts: Limit number of utterances.
        input_channel_selector: Optional, select subset of channels from each input audio file.
                                If `None`, all channels will be loaded.
        target_channel_selector: Optional, select subset of channels from each target audio file.
                                 If `None`, all channels will be loaded.
        reference_channel_selector: Optional, select subset of channels from each reference audio file.
                                    If `None`, all channels will be loaded.
        reference_is_synchronized: If True, it is assumed that the reference signal is synchronized
                                   with the input signal, so the same subsegment will be loaded as for
                                   input and target. If False, reference signal will be loaded independently
                                   from input and target.
        reference_duration: Optional, can be used to set a fixed duration of the reference utterance. If `None`,
                            complete audio file will be loaded.
    """

    def __init__(
        self,
        manifest_filepath: str,
        sample_rate: int,
        input_key: str,
        target_key: str,
        reference_key: str,
        audio_duration: Optional[float] = None,
        random_offset: bool = False,
        max_duration: Optional[float] = None,
        min_duration: Optional[float] = None,
        max_utts: Optional[int] = None,
        input_channel_selector: Optional[int] = None,
        target_channel_selector: Optional[int] = None,
        reference_channel_selector: Optional[int] = None,
        reference_is_synchronized: bool = True,
        reference_duration: Optional[float] = None,
    ):
        # Examples are read from the manifest, with signal names mapped to manifest keys
        collection = collections.AudioCollection(
            manifest_files=manifest_filepath,
            audio_to_manifest_key={
                'input_signal': input_key,
                'target_signal': target_key,
                'reference_signal': reference_key,
            },
            min_duration=min_duration,
            max_duration=max_duration,
            max_number=max_utts,
        )

        audio_processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset,)

        # Input and target are always loaded synchronously
        sync_signals = ['input_signal', 'target_signal']
        sync_channel_selectors = [input_channel_selector, target_channel_selector]

        if reference_is_synchronized:
            # Reference shares the start and duration with input and target
            sync_signals.append('reference_signal')
            sync_channel_selectors.append(reference_channel_selector)
        else:
            # Reference is loaded independently, with its own duration
            audio_processor.async_setup = SignalSetup(
                signals=['reference_signal'],
                duration=[reference_duration],
                channel_selectors=[reference_channel_selector],
            )

        audio_processor.sync_setup = SignalSetup(
            signals=sync_signals, duration=audio_duration, channel_selectors=sync_channel_selectors,
        )

        super().__init__(
            collection=collection, audio_processor=audio_processor, output_type=AudioToTargetWithReferenceExample
        )

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Returns definitions of module output ports.

        Returns:
            Ordered dictionary in the following form:
            ```
            {
                'input_signal': batched single- or multi-channel format,
                'input_length': batched original length of each input signal
                'target_signal': batched single- or multi-channel format,
                'target_length': batched original length of each target signal
                'reference_signal': single- or multi-channel format,
                'reference_length': original length of each reference signal
            }
            ```
        """
        sc_audio_type = NeuralType(('B', 'T'), AudioSignal())
        mc_audio_type = NeuralType(('B', 'C', 'T'), AudioSignal())

        # Entries are added in the order expected for the collated batch
        types = OrderedDict()
        types['input_signal'] = sc_audio_type if self.num_channels('input_signal') == 1 else mc_audio_type
        types['input_length'] = NeuralType(('B',), LengthsType())
        types['target_signal'] = sc_audio_type if self.num_channels('target_signal') == 1 else mc_audio_type
        types['target_length'] = NeuralType(('B',), LengthsType())
        types['reference_signal'] = sc_audio_type if self.num_channels('reference_signal') == 1 else mc_audio_type
        types['reference_length'] = NeuralType(('B',), LengthsType())
        return types
# Batch container for AudioToTargetWithEmbeddingDataset: each signal is followed by its length.
AudioToTargetWithEmbeddingExample = namedtuple(
    'AudioToTargetWithEmbeddingExample',
    'input_signal input_length target_signal target_length embedding_vector embedding_length',
)
class AudioToTargetWithEmbeddingDataset(BaseAudioDataset):
    """Audio-to-audio dataset with an additional embedding per example.

    The goal of the task is to use an input signal, together with an
    additional embedding, to recover the corresponding target signal.
    The embedding is assumed to be in the form of a vector.

    Each line of the manifest file is expected to have the following format
    ```
    {
        input_key: 'path/to/input.wav',
        target_key: 'path/to/path_to_target.wav',
        embedding_key: 'path/to/path_to_embedding.npy',
        'duration': duration_of_input,
    }
    ```

    Keys for input, target and embedding signals can be configured in the constructor.

    Args:
        manifest_filepath: Path to manifest file in a format described above.
        sample_rate: Sample rate for loaded audio signals.
        input_key: Key pointing to input audio files in the manifest.
        target_key: Key pointing to target audio files in the manifest.
        embedding_key: Key pointing to embedding files in the manifest.
        audio_duration: Optional duration of each item returned by __getitem__.
                        If `None`, complete audio will be loaded.
                        If set, a subsegment will be loaded synchronously from
                        input and target, i.e., with the same start and end point.
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        max_duration: If audio exceeds this length, do not include in dataset.
        min_duration: If audio is less than this length, do not include in dataset.
        max_utts: Limit number of utterances.
        input_channel_selector: Optional, select subset of channels from each input audio file.
                                If `None`, all channels will be loaded.
        target_channel_selector: Optional, select subset of channels from each target audio file.
                                 If `None`, all channels will be loaded.
    """

    def __init__(
        self,
        manifest_filepath: str,
        sample_rate: int,
        input_key: str,
        target_key: str,
        embedding_key: str,
        audio_duration: Optional[float] = None,
        random_offset: bool = False,
        max_duration: Optional[float] = None,
        min_duration: Optional[float] = None,
        max_utts: Optional[int] = None,
        input_channel_selector: Optional[int] = None,
        target_channel_selector: Optional[int] = None,
    ):
        # Map the dataset's signal names to the user-configured manifest keys.
        examples = collections.AudioCollection(
            manifest_files=manifest_filepath,
            audio_to_manifest_key={
                'input_signal': input_key,
                'target_signal': target_key,
                'embedding_vector': embedding_key,
            },
            min_duration=min_duration,
            max_duration=max_duration,
            max_number=max_utts,
        )

        processor = ASRAudioProcessor(sample_rate=sample_rate, random_offset=random_offset)

        # Input and target share one setup, so they use the same duration.
        synced_signals = ['input_signal', 'target_signal']
        synced_selectors = [input_channel_selector, target_channel_selector]
        processor.sync_setup = SignalSetup(
            signals=synced_signals, duration=audio_duration, channel_selectors=synced_selectors,
        )

        # The embedding is registered separately, with no duration or channel selection.
        processor.embedding_setup = SignalSetup(signals=['embedding_vector'])

        super().__init__(
            collection=examples, audio_processor=processor, output_type=AudioToTargetWithEmbeddingExample,
        )

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Describe the neural types of the batched outputs of this dataset.

        Audio entries are typed as single-channel `('B', 'T')` when
        `self.num_channels` reports exactly one channel for that signal,
        and as multi-channel `('B', 'C', 'T')` otherwise. The embedding
        is always typed as `('B', 'D')`.

        Returns:
            Ordered dictionary in the following form:
            ```
            {
                'input_signal': batched single- or multi-channel format,
                'input_length': batched original length of each input signal
                'target_signal': batched single- or multi-channel format,
                'target_length': batched original length of each target signal
                'embedding_vector': batched embedded vector format,
                'embedding_length': batched original length of each embedding vector
            }
            ```
        """
        # One shared type object per audio layout.
        single_channel = NeuralType(('B', 'T'), AudioSignal())
        multi_channel = NeuralType(('B', 'C', 'T'), AudioSignal())

        types = OrderedDict()
        audio_entries = (('input_signal', 'input_length'), ('target_signal', 'target_length'))
        for signal_name, length_name in audio_entries:
            if self.num_channels(signal_name) == 1:
                types[signal_name] = single_channel
            else:
                types[signal_name] = multi_channel
            types[length_name] = NeuralType(('B',), LengthsType())

        types['embedding_vector'] = NeuralType(('B', 'D'), EncodedRepresentation())
        types['embedding_length'] = NeuralType(('B',), LengthsType())
        return types