| | from __future__ import annotations
|
| |
|
| | from pathlib import Path
|
| |
|
| | import torch
|
| | import numpy as np
|
| | import numpy.typing as npt
|
| | from omegaconf import DictConfig
|
| |
|
| | from osuT5.dataset.data_utils import load_audio_file
|
| |
|
| |
|
class Preprocessor(object):
    """Preprocess audio data into fixed-length, flattened-frame sequences."""

    def __init__(self, args: DictConfig):
        """Read sequence geometry from the config.

        Args:
            args: Config providing data.src_seq_len, data.hop_length,
                data.sample_rate and data.sequence_stride.
        """
        # One frame is reserved (src_seq_len - 1) — presumably for a special
        # token in the downstream model; TODO confirm against the consumer.
        self.frame_seq_len = args.data.src_seq_len - 1
        self.frame_size = args.data.hop_length
        self.sample_rate = args.data.sample_rate
        # Raw audio samples that make up one sequence of flattened frames.
        self.samples_per_sequence = self.frame_seq_len * self.frame_size
        # Hop between consecutive sequences, given in the config as a
        # fraction of the sequence length.
        self.sequence_stride = int(self.samples_per_sequence * args.data.sequence_stride)

    def load(self, path: Path) -> npt.ArrayLike:
        """Load an audio file as audio frames. Convert stereo to mono, normalize.

        Args:
            path: Path to audio file.

        Returns:
            samples: Audio time-series.
        """
        return load_audio_file(path, self.sample_rate)

    def segment(self, samples: npt.ArrayLike) -> torch.Tensor:
        """Segment audio samples into overlapping sequences.

        The input is zero-padded at the end so the final sequence is
        complete; audio shorter than one sequence is padded up to exactly
        one sequence instead of crashing in the strided view.

        Args:
            samples: 1-D audio time-series.

        Returns:
            sequences: Tensor of shape (num sequences, samples per sequence),
                dtype float32.
        """
        samples = np.asarray(samples)
        excess = len(samples) - self.samples_per_sequence
        if excess >= 0:
            # Pad only up to the next stride boundary. The previous form,
            # `stride - excess % stride`, padded a full extra stride (one
            # spurious all-padding sequence) when already aligned.
            pad = (-excess) % self.sequence_stride
        else:
            # Shorter than a single sequence: pad to exactly one.
            pad = -excess
        if pad:
            samples = np.pad(samples, [0, pad])
        sequences = self.window(samples, self.samples_per_sequence, self.sequence_stride)
        return torch.from_numpy(sequences).to(torch.float32)

    @staticmethod
    def window(a, w, o, copy=False):
        """Return a sliding-window view over a 1-D array.

        Args:
            a: 1-D numpy array (the stride arithmetic assumes ndim == 1).
            w: Window length in samples.
            o: Offset (hop) between the starts of consecutive windows.
            copy: If True, return a contiguous copy instead of a view.

        Returns:
            Array of shape (num windows, w); a view into ``a`` unless
            ``copy`` is True.
        """
        sh = (a.size - w + 1, w)
        # Duplicating the single stride makes successive rows start one
        # sample apart; slicing by ``o`` then keeps every o-th window.
        st = a.strides * 2
        view = np.lib.stride_tricks.as_strided(a, strides=st, shape=sh)[0::o]
        if copy:
            return view.copy()
        else:
            return view
|
| |
|