import librosa
import soundfile as sf
import numpy as np
import torch
import random


def pad_audio(x, max_len=48000):
    """Force a 1-D signal to exactly `max_len` samples.

    Long signals are truncated; short signals are repeated end-to-end
    (looped) and then trimmed, so the pad content is the signal itself
    rather than silence.

    Args:
        x: 1-D numpy array of samples (must be non-empty when padding
           is needed, otherwise division by zero).
        max_len: target length in samples.

    Returns:
        1-D numpy array of length `max_len`.
    """
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]
    # Repeat the clip enough times to cover max_len, then trim.
    num_repeats = int(max_len / x_len) + 1
    return np.tile(x, num_repeats)[:max_len]


def _stack_windows(y, start_samples, win_len):
    """Cut fixed-length windows from `y` at the given start samples.

    Windows that run past the end of `y` are looped/padded back up to
    `win_len` via `pad_audio`. Returns a (num_windows, win_len) array.
    """
    frames = []
    for start_sample in start_samples:
        start = int(start_sample)
        frame = y[start:start + win_len]
        if len(frame) < win_len:
            frame = pad_audio(frame, win_len)
        frames.append(frame)
    return np.stack(frames, axis=0)


def preprocess(audio_file, target_sr=16000, win_dur=4):
    """Load an audio file and extract random fixed-duration windows.

    The file is downmixed to mono, resampled to `target_sr` if needed,
    and 5 windows of `win_dur` seconds are cut at random start offsets.

    Args:
        audio_file: path to an audio file readable by soundfile.
        target_sr: sample rate to resample to (Hz).
        win_dur: window duration in seconds.

    Returns:
        (tensor, sr_orig): a float32 torch tensor of shape
        (5, win_dur * target_sr) and the file's ORIGINAL sample rate
        (not the resampled one).
    """
    print(f'Preprocessing {audio_file}')
    # Load the audio file
    y, sr_orig = sf.read(audio_file)
    # If stereo, convert to mono
    if y.ndim > 1:
        y = np.mean(y, axis=1)
    if sr_orig != target_sr:
        y = librosa.resample(y, orig_sr=sr_orig, target_sr=target_sr)
    sr = target_sr

    # Evaluate N windows of the audio file at random start positions.
    num_eval = 5
    win_len = int(win_dur * sr)
    last_sample = len(y) - win_len
    start_sample_list = [random.randint(0, max(0, last_sample))
                         for _ in range(num_eval)]

    y_win = _stack_windows(y, start_sample_list, win_len)
    tensor = torch.from_numpy(y_win).float()
    print(f'preprocessed track - shape {tensor.shape}')
    return tensor, sr_orig


def preprocess_FS(audio_file, win_dur=4):
    """Extract time-aligned random windows at 16 kHz, 22.05 kHz and 24 kHz.

    The file is downmixed to mono and resampled to all three rates.
    Random window starts are drawn on the 24 kHz signal and scaled to
    the other rates so the three tensors cover the same audio content.

    Args:
        audio_file: path to an audio file readable by soundfile.
        win_dur: window duration in seconds.

    Returns:
        (tensor_16, tensor_22, tensor_24): float32 torch tensors of
        shapes (5, win_dur*16000), (5, win_dur*22050), (5, win_dur*24000).
    """
    print(f'Preprocessing {audio_file}')
    # Load the audio file
    y, sr_orig = sf.read(audio_file)
    # If stereo, convert to mono
    if y.ndim > 1:
        y = np.mean(y, axis=1)

    y_16 = librosa.resample(y, orig_sr=sr_orig, target_sr=16000)
    y_22 = librosa.resample(y, orig_sr=sr_orig, target_sr=22050)
    y_24 = librosa.resample(y, orig_sr=sr_orig, target_sr=24000)

    # Evaluate N windows of the audio file
    num_eval = 5
    win_len_16 = int(win_dur * 16000)
    win_len_22 = int(win_dur * 22050)
    win_len_24 = int(win_dur * 24000)

    # Draw starts on the 24 kHz signal, then rescale to the other rates
    # so all three window sets are time-aligned.
    last_sample_24 = len(y_24) - win_len_24
    start_sample_list_24 = [random.randint(0, max(0, last_sample_24))
                            for _ in range(num_eval)]
    start_sample_list_16 = [int(x * 16000 // 24000) for x in start_sample_list_24]
    start_sample_list_22 = [int(x * 22050 // 24000) for x in start_sample_list_24]

    y_win_16 = _stack_windows(y_16, start_sample_list_16, win_len_16)
    y_win_22 = _stack_windows(y_22, start_sample_list_22, win_len_22)
    y_win_24 = _stack_windows(y_24, start_sample_list_24, win_len_24)

    tensor_16 = torch.from_numpy(y_win_16).float()
    tensor_22 = torch.from_numpy(y_win_22).float()
    tensor_24 = torch.from_numpy(y_win_24).float()
    print(f'preprocessed track - shape {tensor_16.shape}')
    return tensor_16, tensor_22, tensor_24


def preprocess_old(audio_file):
    """Legacy preprocessing: load at native sample rate and return MFCCs.

    Returns a torch tensor of shape (1, 13, num_frames).
    """
    # Load the audio file
    y, sr = librosa.load(audio_file, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    tensor = torch.from_numpy(mfccs)[None]
    return tensor