|
|
|
|
|
import librosa |
|
|
import soundfile as sf |
|
|
import numpy as np |
|
|
import torch |
|
|
import random |
|
|
|
|
|
def pad_audio(x, max_len=48000):
    """Force a 1-D waveform to exactly `max_len` samples.

    Longer signals are truncated; shorter signals are repeat-padded
    (tiled) until they reach `max_len`.

    Args:
        x: 1-D numpy array of audio samples.
        max_len: target length in samples (default 48000, i.e. 3 s at 16 kHz).

    Returns:
        1-D numpy array of length exactly `max_len`.

    Raises:
        ValueError: if `x` is empty (nothing to tile).
    """
    x_len = x.shape[0]

    if x_len >= max_len:
        return x[:max_len]

    if x_len == 0:
        # Previously this fell through to a ZeroDivisionError; fail clearly.
        raise ValueError('cannot pad an empty signal')

    # Tile enough copies to cover max_len, then trim the excess.
    num_repeats = max_len // x_len + 1
    return np.tile(x, num_repeats)[:max_len]
|
|
|
|
|
|
|
|
def preprocess(audio_file, target_sr=16000, win_dur=4):
    """Load an audio file and cut it into 5 random fixed-length windows.

    The file is down-mixed to mono, resampled to `target_sr` if needed, and
    5 windows of `win_dur` seconds are sampled at random start positions.
    Windows clipped at the end of the track are repeat-padded.

    Args:
        audio_file: path to an audio file readable by soundfile.
        target_sr: sample rate the windows are produced at (default 16 kHz).
        win_dur: window duration in seconds (default 4).

    Returns:
        (tensor, sr_orig): a float32 tensor of shape (5, win_dur * target_sr)
        and the file's ORIGINAL sample rate.
    """
    print(f'Preprocessing {audio_file}')

    y, sr_orig = sf.read(audio_file)

    # Down-mix multi-channel audio to mono by averaging channels.
    if y.ndim > 1:
        y = np.mean(y, axis=1)

    # Resample only when the file is not already at the target rate.
    if sr_orig != target_sr:
        y = librosa.resample(y, orig_sr=sr_orig, target_sr=target_sr)
    # Always define sr (the old code only set it inside the resample branch,
    # which would NameError on files already at target_sr).
    sr = target_sr

    num_eval = 5
    win_len = int(win_dur * sr)
    last_sample = len(y) - win_len

    # Random window starts; max(0, ...) guards tracks shorter than one window.
    start_sample_list = [random.randint(0, max(0, last_sample)) for _ in range(num_eval)]

    frames = []
    for start_sample in start_sample_list:
        start = int(start_sample)
        frame = y[start:start + win_len]
        # Repeat-pad a window that ran past the end of the track.
        if len(frame) < win_len:
            frame = pad_audio(frame, win_len)
        frames.append(frame)

    tensor = torch.from_numpy(np.stack(frames, axis=0)).float()

    print(f'preprocessed track - shape {tensor.shape}')
    # NOTE(review): the tensor is at target_sr but callers receive the
    # ORIGINAL rate — kept for interface compatibility; confirm intended.
    return tensor, sr_orig
|
|
|
|
|
|
|
|
def _extract_windows(y, start_samples, win_len):
    """Cut fixed-length windows from `y` at the given start samples.

    Windows clipped at the end of the track are repeat-padded via
    `pad_audio`. Returns a (len(start_samples), win_len) numpy array.
    """
    frames = []
    for start in start_samples:
        begin = int(start)
        frame = y[begin:begin + win_len]
        if len(frame) < win_len:
            frame = pad_audio(frame, win_len)
        frames.append(frame)
    return np.stack(frames, axis=0)


def preprocess_FS(audio_file, win_dur=4):
    """Load an audio file and window it at three sample rates in parallel.

    The mono signal is resampled to 16 kHz, 22.05 kHz and 24 kHz; 5 random
    windows of `win_dur` seconds are drawn at the 24 kHz rate and the same
    time positions are mapped onto the other two rates, so all three
    tensors cover identical stretches of audio.

    Args:
        audio_file: path to an audio file readable by soundfile.
        win_dur: window duration in seconds (default 4).

    Returns:
        (tensor_16, tensor_22, tensor_24): float32 tensors of shape
        (5, win_dur * sr) for sr = 16000, 22050, 24000 respectively.
    """
    print(f'Preprocessing {audio_file}')

    y, sr_orig = sf.read(audio_file)

    # Down-mix multi-channel audio to mono by averaging channels.
    if y.ndim > 1:
        y = np.mean(y, axis=1)

    y_16 = librosa.resample(y, orig_sr=sr_orig, target_sr=16000)
    y_22 = librosa.resample(y, orig_sr=sr_orig, target_sr=22050)
    y_24 = librosa.resample(y, orig_sr=sr_orig, target_sr=24000)

    num_eval = 5
    win_len_16 = int(win_dur * 16000)
    win_len_22 = int(win_dur * 22050)
    win_len_24 = int(win_dur * 24000)

    # Draw random starts at 24 kHz; max(0, ...) guards short tracks.
    last_sample_24 = len(y_24) - win_len_24
    start_sample_list_24 = [random.randint(0, max(0, last_sample_24)) for _ in range(num_eval)]
    # Map the 24 kHz start positions onto the other rates so the three
    # tensors describe the same time windows.
    start_sample_list_16 = [int(s * 16000 // 24000) for s in start_sample_list_24]
    start_sample_list_22 = [int(s * 22050 // 24000) for s in start_sample_list_24]

    # The crop-and-pad loop was previously triplicated; shared helper now.
    tensor_16 = torch.from_numpy(_extract_windows(y_16, start_sample_list_16, win_len_16)).float()
    tensor_22 = torch.from_numpy(_extract_windows(y_22, start_sample_list_22, win_len_22)).float()
    tensor_24 = torch.from_numpy(_extract_windows(y_24, start_sample_list_24, win_len_24)).float()

    print(f'preprocessed track - shape {tensor_16.shape}')
    return tensor_16, tensor_22, tensor_24
|
|
|
|
|
|
|
|
def preprocess_old(audio_file):
    """Legacy preprocessing: 13 MFCCs at the file's native sample rate.

    Loads `audio_file` without resampling (sr=None), computes 13
    mel-frequency cepstral coefficients, and returns them as a torch
    tensor with a leading batch dimension, i.e. shape (1, 13, frames).
    """
    signal, rate = librosa.load(audio_file, sr=None)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)
    # unsqueeze(0) adds the batch axis (equivalent to indexing with [None]).
    return torch.from_numpy(coeffs).unsqueeze(0)
|
|
|