# ispl_safe / preprocess.py
# Author: davesalvi — commit 399daf2 ("MOE FS")
import librosa
import soundfile as sf
import numpy as np
import torch
import random
def pad_audio(x, max_len=48000):
    """Force a 1-D signal to exactly ``max_len`` samples.

    Signals longer than ``max_len`` are truncated; shorter ones are
    repeat-padded (the signal is tiled end-to-end, then trimmed).

    Parameters
    ----------
    x : np.ndarray
        1-D audio signal.
    max_len : int
        Target length in samples (default 48000 — 3 s at 16 kHz).

    Returns
    -------
    np.ndarray
        Signal of length exactly ``max_len``.

    Raises
    ------
    ValueError
        If ``x`` is empty — there is nothing to repeat. (The original
        code crashed here with a ZeroDivisionError instead.)
    """
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]
    if x_len == 0:
        raise ValueError("cannot pad an empty signal")
    # Tile enough full copies to cover max_len, then trim the excess.
    # (The original tiled into a throwaway 2-D array; a 1-D tile is
    # equivalent and direct.)
    num_repeats = max_len // x_len + 1
    return np.tile(x, num_repeats)[:max_len]
def preprocess(audio_file, target_sr=16000, win_dur=4):
    """Load an audio file and extract 5 random fixed-length windows.

    The file is mixed down to mono if needed and resampled to
    ``target_sr``. Five window start points are drawn uniformly at
    random (non-deterministic); windows that run past the end of the
    signal are repeat-padded via ``pad_audio``.

    Parameters
    ----------
    audio_file : str or file-like
        Path accepted by ``soundfile.read``.
    target_sr : int
        Sample rate the windows are produced at (default 16 kHz).
    win_dur : float
        Window duration in seconds (default 4).

    Returns
    -------
    (torch.Tensor, int)
        Float tensor of shape (5, win_dur * target_sr), and the file's
        ORIGINAL sample rate — note this is not the tensor's rate, which
        is always ``target_sr``.
    """
    print(f'Preprocessing {audio_file}')
    # Load the audio file
    y, sr_orig = sf.read(audio_file)
    # If stereo, convert to mono
    if y.ndim > 1:
        y = np.mean(y, axis=1)
    if sr_orig != target_sr:
        y = librosa.resample(y, orig_sr=sr_orig, target_sr=target_sr)
    # BUGFIX: assign unconditionally. Previously `sr` was only set on
    # the resample branch, so a file already at target_sr raised a
    # NameError at the win_len computation below.
    sr = target_sr
    # Evaluate N windows of the audio file
    num_eval = 5
    win_len = int(win_dur * sr)
    last_sample = len(y) - win_len
    start_sample_list = [random.randint(0, max(0, last_sample)) for _ in range(num_eval)]
    frames = []
    for start_sample in start_sample_list:
        start = int(start_sample)
        end = start + win_len
        frame = y[start:end]
        if len(frame) < win_len:
            # Signal shorter than one window: repeat-pad to win_len.
            frame = pad_audio(frame, win_len)
        frames.append(frame)
    y_win = np.stack(frames, axis=0)
    tensor = torch.from_numpy(y_win).float()
    print(f'preprocessed track - shape {tensor.shape}')
    return tensor, sr_orig
def _slice_windows(y, start_samples, win_len):
    """Cut fixed-length windows from ``y`` at the given start samples.

    Windows that would run past the end of ``y`` (or when ``y`` itself
    is shorter than ``win_len``) are repeat-padded via ``pad_audio``.
    Returns a list of 1-D arrays, each of length ``win_len``.
    """
    frames = []
    for start_sample in start_samples:
        start = int(start_sample)
        frame = y[start:start + win_len]
        if len(frame) < win_len:
            frame = pad_audio(frame, win_len)
        frames.append(frame)
    return frames


def preprocess_FS(audio_file, win_dur=4):
    """Load an audio file and window it at three sample rates at once.

    The signal is mixed down to mono and resampled to 16 kHz, 22.05 kHz
    and 24 kHz. Five random window positions are drawn on the 24 kHz
    signal and mapped proportionally to the other two rates, so all
    three tensors cover (approximately) the same time spans.

    Parameters
    ----------
    audio_file : str or file-like
        Path accepted by ``soundfile.read``.
    win_dur : float
        Window duration in seconds (default 4).

    Returns
    -------
    (torch.Tensor, torch.Tensor, torch.Tensor)
        Float tensors of shapes (5, win_dur*16000), (5, win_dur*22050)
        and (5, win_dur*24000), in that order.
    """
    print(f'Preprocessing {audio_file}')
    # Load the audio file
    y, sr_orig = sf.read(audio_file)
    # If stereo, convert to mono
    if y.ndim > 1:
        y = np.mean(y, axis=1)
    y_16 = librosa.resample(y, orig_sr=sr_orig, target_sr=16000)
    y_22 = librosa.resample(y, orig_sr=sr_orig, target_sr=22050)
    y_24 = librosa.resample(y, orig_sr=sr_orig, target_sr=24000)
    # Evaluate N windows of the audio file
    num_eval = 5
    win_len_16 = int(win_dur * 16000)
    win_len_22 = int(win_dur * 22050)
    win_len_24 = int(win_dur * 24000)
    # Draw the window positions once, on the 24 kHz signal, ...
    last_sample_24 = len(y_24) - win_len_24
    start_sample_list_24 = [random.randint(0, max(0, last_sample_24)) for _ in range(num_eval)]
    # ... then rescale them so each rate's windows start at the same
    # point in time.
    start_sample_list_16 = [int(s * 16000 // 24000) for s in start_sample_list_24]
    start_sample_list_22 = [int(s * 22050 // 24000) for s in start_sample_list_24]
    # One shared helper replaces the three previously-duplicated loops.
    y_win_16 = np.stack(_slice_windows(y_16, start_sample_list_16, win_len_16), axis=0)
    y_win_22 = np.stack(_slice_windows(y_22, start_sample_list_22, win_len_22), axis=0)
    y_win_24 = np.stack(_slice_windows(y_24, start_sample_list_24, win_len_24), axis=0)
    tensor_16 = torch.from_numpy(y_win_16).float()
    tensor_22 = torch.from_numpy(y_win_22).float()
    tensor_24 = torch.from_numpy(y_win_24).float()
    print(f'preprocessed track - shape {tensor_16.shape}')
    return tensor_16, tensor_22, tensor_24
def preprocess_old(audio_file):
    """Legacy preprocessing: 13 MFCCs at the file's native sample rate.

    Returns a float64 tensor of shape (1, 13, T), where T is the number
    of MFCC frames produced by librosa's defaults.
    """
    # sr=None keeps the file's own sample rate instead of resampling.
    signal, rate = librosa.load(audio_file, sr=None)
    features = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)
    # Prepend a singleton batch dimension.
    return torch.from_numpy(features)[None]