|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import importlib |
|
|
|
|
|
import numpy as np |
|
|
import pytest |
|
|
import torch |
|
|
from einops import rearrange |
|
|
|
|
|
from nemo.collections.audio.modules.transforms import AudioToSpectrogram, SpectrogramToAudio |
|
|
|
|
|
|
|
|
class TestAudioSpectrogram:
    """Unit tests for the `AudioToSpectrogram` and `SpectrogramToAudio` transforms.

    The tests cover:
      * consistency between batched and per-example processing (output
        lengths and values) for both transforms,
      * round-trip reconstruction (analysis followed by synthesis),
      * agreement with a reference built on `torch.stft` / `torch.istft`,
      * rejection of invalid constructor arguments and invalid inputs.
    """

    @pytest.mark.unit
    @pytest.mark.parametrize('fft_length', [64, 512])
    @pytest.mark.parametrize('num_channels', [1, 3])
    def test_audio_to_spec(self, fft_length: int, num_channels: int):
        """Test output length for audio to spectrogram.

        Create zero-padded batches of signals with random lengths and check
        that, for every example, processing it inside the batch gives the same
        length and the same spectrogram as processing it on its own.

        Args:
            fft_length: FFT length passed to the transform.
            num_channels: number of channels in the generated signals.
        """
        hop_lengths = [fft_length // 2, fft_length // 3, fft_length // 4]
        batch_size = 4
        num_examples = 20
        random_seed = 42
        atol = 1e-6

        _rng = np.random.default_rng(seed=random_seed)

        for n in range(num_examples):
            # Generate a batch of time-domain signals with different lengths,
            # zero-padded up to the longest one in the batch.
            input_length = _rng.integers(low=fft_length, high=100 * fft_length, size=batch_size)
            x = _rng.normal(size=(batch_size, num_channels, np.max(input_length)))
            x = torch.tensor(x)
            for b in range(batch_size):
                x[b, :, input_length[b] :] = 0

            for hop_length in hop_lengths:
                audio2spec = AudioToSpectrogram(fft_length=fft_length, hop_length=hop_length)

                # Transform the whole (padded) batch at once.
                batch_spec, batch_spec_len = audio2spec(input=x, input_length=torch.tensor(input_length))

                for b in range(batch_size):
                    # Transform only the current example, trimmed to its true length.
                    b_spec, b_spec_len = audio2spec(input=x[b : b + 1, :, : input_length[b]])
                    actual_len = b_spec.size(-1)

                    # The reported length must match the actual number of frames,
                    # both for the single-example call and for the batched call.
                    # NOTE: `input_length` holds one entry per batch item, so the
                    # messages index it with `b`, not with the example counter `n`.
                    assert (
                        actual_len == b_spec_len
                    ), f'Output length not matching for example ({n}, {b}) with length {input_length[b]} (hop_length={hop_length}): true {actual_len} vs calculated {b_spec_len}.'

                    assert (
                        actual_len == batch_spec_len[b]
                    ), f'Output length not matching for example ({n}, {b}) with length {input_length[b]} (hop_length={hop_length}): true {actual_len} vs calculated batch len {batch_spec_len[b]}.'

                    # The valid part of the batched output must match the single-example output.
                    assert torch.allclose(
                        batch_spec[b, ..., :actual_len], b_spec, atol=atol
                    ), f'Spectrograms not matching for example ({n}, {b}) with length {input_length[b]} (hop_length={hop_length})'

    @pytest.mark.unit
    @pytest.mark.parametrize('fft_length', [64, 512])
    @pytest.mark.parametrize('num_channels', [1, 3])
    def test_spec_to_audio(self, fft_length: int, num_channels: int):
        """Test output length for spectrogram to audio.

        Create zero-padded batches of random complex spectrograms with random
        numbers of frames and check that, for every example, processing it
        inside the batch gives the same length and (up to a possibly excluded
        tail) the same signal as processing it on its own.

        Args:
            fft_length: FFT length passed to the transform.
            num_channels: number of channels in the generated spectrograms.
        """
        hop_lengths = [fft_length // 2, fft_length // 3, fft_length // 4]
        batch_size = 4
        num_examples = 20
        random_seed = 42
        atol = 1e-6

        _rng = np.random.default_rng(seed=random_seed)

        for n in range(num_examples):
            # Generate a batch of complex spectrograms with different numbers
            # of frames, zero-padded up to the longest one in the batch.
            input_length = _rng.integers(low=10, high=100, size=batch_size)
            input_shape = (batch_size, num_channels, fft_length // 2 + 1, np.max(input_length))
            spec = _rng.normal(size=input_shape) + 1j * _rng.normal(size=input_shape)
            spec = torch.tensor(spec)
            # Make the first and the last bin of the frequency axis real-valued
            # (for a one-sided spectrum of an even-length FFT these are the DC
            # and Nyquist bins).
            spec[..., 0, :] = spec[..., 0, :].real
            spec[..., -1, :] = spec[..., -1, :].real
            for b in range(batch_size):
                spec[b, ..., input_length[b] :] = 0

            for hop_length in hop_lengths:
                spec2audio = SpectrogramToAudio(fft_length=fft_length, hop_length=hop_length)

                # Transform the whole (padded) batch at once.
                batch_x, batch_x_len = spec2audio(input=spec, input_length=torch.tensor(input_length))

                for b in range(batch_size):
                    # Transform only the current example, trimmed to its true number of frames.
                    b_x, b_x_len = spec2audio(input=spec[b : b + 1, ..., : input_length[b]])

                    actual_len = b_x.size(-1)

                    # The reported length must match the actual number of samples,
                    # both for the single-example call and for the batched call.
                    assert (
                        b_x_len == actual_len
                    ), f'Output length not matching for example ({n}, {b}) with {input_length[b]} frames (hop_length={hop_length}): true {actual_len} vs calculated {b_x_len}.'

                    assert (
                        batch_x_len[b] == actual_len
                    ), f'Output length not matching for example ({n}, {b}) with {input_length[b]} frames (hop_length={hop_length}): true {actual_len} vs calculated batch {batch_x_len[b]}.'

                    # For examples shorter than the padded batch, the last
                    # `fft_length // 2 - hop_length` samples are excluded from
                    # the comparison.
                    # NOTE(review): presumably the tail of a padded example is
                    # affected by overlap-add with the padded frames -- confirm
                    # against the SpectrogramToAudio implementation.
                    if input_length[b] < spec.size(-1):
                        tail_length = max(fft_length // 2 - hop_length, 0)
                    else:
                        tail_length = 0
                    valid_len = actual_len - tail_length
                    batch_x_valid = batch_x[b, :, :valid_len]
                    b_x_valid = b_x[..., :valid_len]
                    assert torch.allclose(
                        batch_x_valid, b_x_valid, atol=atol
                    ), f'Signals not matching for example ({n}, {b}) with length {input_length[b]} (hop_length={hop_length}): max abs diff {torch.max(torch.abs(batch_x_valid-b_x_valid))} at {torch.argmax(torch.abs(batch_x_valid-b_x_valid))}'

    @pytest.mark.unit
    @pytest.mark.parametrize('fft_length', [128, 1024])
    @pytest.mark.parametrize('num_channels', [1, 4])
    @pytest.mark.parametrize('magnitude_power', [0.5, 1, 2])
    @pytest.mark.parametrize('scale', [0.1, 1.0])
    def test_audio_to_spectrogram_reconstruction(
        self, fft_length: int, num_channels: int, magnitude_power: float, scale: float
    ):
        """Test analysis and synthesis transform result in a perfect reconstruction.

        Args:
            fft_length: FFT length passed to both transforms.
            num_channels: number of channels in the generated signals.
            magnitude_power: `magnitude_power` passed to both transforms.
            scale: `scale` passed to both transforms.
        """
        batch_size = 4
        num_samples = fft_length * 50
        num_examples = 25
        random_seed = 42
        atol = 1e-6

        _rng = np.random.default_rng(seed=random_seed)

        hop_lengths = [fft_length // 2, fft_length // 4]

        for hop_length in hop_lengths:
            audio2spec = AudioToSpectrogram(
                fft_length=fft_length, hop_length=hop_length, magnitude_power=magnitude_power, scale=scale
            )
            spec2audio = SpectrogramToAudio(
                fft_length=fft_length, hop_length=hop_length, magnitude_power=magnitude_power, scale=scale
            )

            for n in range(num_examples):
                x = _rng.normal(size=(batch_size, num_channels, num_samples))

                # Analysis followed by synthesis; the output length is not checked here.
                x_spec, x_spec_length = audio2spec(input=torch.Tensor(x))
                x_hat, _ = spec2audio(input=x_spec, input_length=x_spec_length)

                assert np.allclose(
                    x_hat.cpu().detach().numpy(), x, atol=atol
                ), f'Reconstructed not matching for example {n} (hop length {hop_length})'

    @pytest.mark.unit
    @pytest.mark.parametrize('fft_length', [128, 512])
    @pytest.mark.parametrize('num_channels', [1, 4])
    @pytest.mark.parametrize('magnitude_power', [0.5, 1])
    @pytest.mark.parametrize('scale', [0.1, 1.0])
    def test_match_reference_implementation(
        self, fft_length: int, num_channels: int, magnitude_power: float, scale: float
    ):
        """Test analysis and synthesis transforms match reference implementation.

        The reference applies `torch.stft` / `torch.istft` with a Hann window
        to every (batch, channel) pair, followed by the same magnitude
        compression and scaling that is configured on the tested transforms.

        Args:
            fft_length: FFT length passed to the transforms and the reference.
            num_channels: number of channels in the generated signals.
            magnitude_power: exponent applied to the spectrogram magnitude.
            scale: multiplicative factor applied to the spectrogram.
        """
        batch_size = 4
        num_samples = fft_length * 50
        num_examples = 8
        random_seed = 42
        atol = 1e-6

        _rng = np.random.default_rng(seed=random_seed)

        hop_lengths = [fft_length // 2, fft_length // 4]

        # Window used by the reference; it does not depend on the hop length.
        ref_window = torch.hann_window(fft_length)

        for hop_length in hop_lengths:
            audio2spec = AudioToSpectrogram(
                fft_length=fft_length, hop_length=hop_length, magnitude_power=magnitude_power, scale=scale
            )
            spec2audio = SpectrogramToAudio(
                fft_length=fft_length, hop_length=hop_length, magnitude_power=magnitude_power, scale=scale
            )

            # The reference helpers read `hop_length` from the enclosing scope
            # and are only called within the current loop iteration.
            def audio2spec_ref(x):
                """Reference analysis: per-channel STFT, then magnitude power and scale."""
                x_spec = []
                for b in range(batch_size):
                    for c in range(num_channels):
                        x_spec_bc = torch.stft(
                            input=x[b, c, :],
                            n_fft=fft_length,
                            hop_length=hop_length,
                            win_length=fft_length,
                            window=ref_window,
                            center=True,
                            pad_mode='constant',
                            normalized=False,
                            onesided=True,
                            return_complex=True,
                        )
                        x_spec.append(x_spec_bc)
                x_spec = torch.stack(x_spec, dim=0)
                x_spec = rearrange(x_spec, '(B C) F N -> B C F N', B=batch_size, C=num_channels)

                # Compress the magnitude while keeping the phase unchanged.
                x_spec = (
                    torch.pow(x_spec.abs(), magnitude_power) * torch.exp(1j * x_spec.angle())
                    if magnitude_power != 1
                    else x_spec
                )
                x_spec = x_spec * scale if scale != 1 else x_spec
                return x_spec

            def spec2audio_ref(x_spec):
                """Reference synthesis: undo scale and magnitude power, then per-channel iSTFT."""
                x_spec = x_spec / scale if scale != 1 else x_spec
                # Expand the magnitude while keeping the phase unchanged.
                x_spec = (
                    torch.pow(x_spec.abs(), 1 / magnitude_power) * torch.exp(1j * x_spec.angle())
                    if magnitude_power != 1
                    else x_spec
                )

                x = []
                for b in range(batch_size):
                    for c in range(num_channels):
                        x_bc = torch.istft(
                            input=x_spec[b, c, ...],
                            n_fft=fft_length,
                            hop_length=hop_length,
                            win_length=fft_length,
                            window=ref_window,
                            center=True,
                            normalized=False,
                            onesided=True,
                            return_complex=False,
                        )
                        x.append(x_bc)
                x = torch.stack(x, dim=0)
                x = rearrange(x, '(B C) T -> B C T', B=batch_size, C=num_channels)
                return x

            for n in range(num_examples):
                x = _rng.normal(size=(batch_size, num_channels, num_samples))
                # `torch.Tensor(...)` converts the float64 samples to the default float dtype.
                x_tensor = torch.Tensor(x)

                # Analysis: tested transform vs. reference.
                x_spec, x_spec_length = audio2spec(input=x_tensor)
                x_spec_ref = audio2spec_ref(x_tensor)

                assert torch.allclose(
                    x_spec, x_spec_ref, atol=atol
                ), f'Analysis not matching for example {n} (hop length {hop_length})'

                # Synthesis: tested transform vs. reference.
                x_hat, _ = spec2audio(input=x_spec, input_length=x_spec_length)
                x_hat_ref = spec2audio_ref(x_spec_ref)

                assert torch.allclose(
                    x_hat, x_hat_ref, atol=atol
                ), f'Synthesis not matching for example {n} (hop length {hop_length})'

    @pytest.mark.unit
    @pytest.mark.parametrize('fft_length', [13, 63])
    def test_invalid_length(self, fft_length: int):
        """Test initializing transforms with invalid length.

        Both transforms are expected to raise `ValueError` for the
        parametrized FFT lengths.

        Args:
            fft_length: FFT length that the transforms are expected to reject.
        """
        with pytest.raises(ValueError):
            AudioToSpectrogram(fft_length=fft_length, hop_length=fft_length // 2)
        with pytest.raises(ValueError):
            SpectrogramToAudio(fft_length=fft_length, hop_length=fft_length // 2)

    @pytest.mark.unit
    @pytest.mark.parametrize('fft_length', [32])
    def test_invalid_compression(self, fft_length: int):
        """Test initializing transforms with invalid compression.

        Both transforms are expected to raise `ValueError` when either
        `magnitude_power` or `scale` is zero or negative.

        Args:
            fft_length: FFT length used to construct the transforms.
        """
        hop_length = fft_length // 2
        # Each entry is one invalid setting that must be rejected on its own.
        invalid_settings = (
            {'magnitude_power': 0.0},
            {'magnitude_power': -1.0},
            {'scale': 0.0},
            {'scale': -1.0},
        )

        for invalid_kwargs in invalid_settings:
            for transform_cls in (AudioToSpectrogram, SpectrogramToAudio):
                with pytest.raises(ValueError):
                    transform_cls(fft_length=fft_length, hop_length=hop_length, **invalid_kwargs)

    @pytest.mark.unit
    @pytest.mark.parametrize('fft_length', [32])
    def test_invalid_spec_to_audio_input(self, fft_length: int):
        """Test invalid input for spec to audio transform.

        A complex-valued input must be accepted, while a real-valued input of
        the same shape is expected to raise `ValueError`.

        Args:
            fft_length: FFT length used to construct the transform.
        """
        s2a = SpectrogramToAudio(fft_length=fft_length, hop_length=fft_length // 2)

        # Complex-valued input: must run without raising.
        s2a(input=torch.randn(1, 1, fft_length // 2 + 1, 100, dtype=torch.cfloat))

        # Real-valued input (default dtype of `torch.randn`): must be rejected.
        with pytest.raises(ValueError):
            s2a(input=torch.randn(1, 1, fft_length // 2 + 1, 100))
|
|
|