File size: 4,040 Bytes
b1fba4c 7e35931 637ed97 b1fba4c f20a679 b1fba4c 637ed97 d50110e 637ed97 7e35931 6bb7746 bc8746b d50110e 637ed97 d50110e f20a679 9a0f548 637ed97 bc8746b 637ed97 8d4eda1 637ed97 6bb7746 637ed97 399daf2 d1028ac b1fba4c 637ed97 b1fba4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import librosa
import soundfile as sf
import numpy as np
import torch
import random
def pad_audio(x, max_len=48000):
    """Trim or repeat-pad a 1-D audio signal to exactly `max_len` samples.

    Parameters
    ----------
    x : np.ndarray
        1-D array of audio samples. Must be non-empty.
    max_len : int
        Target length in samples (default 48000, i.e. 3 s at 16 kHz or
        2 s at 24 kHz — depends on the caller's sample rate).

    Returns
    -------
    np.ndarray
        `x` truncated to `max_len` if it is long enough, otherwise `x`
        repeated end-to-end and cut to `max_len`.

    Raises
    ------
    ValueError
        If `x` is empty (the original code raised an opaque
        ZeroDivisionError in that case).
    """
    x_len = x.shape[0]
    if x_len == 0:
        raise ValueError('cannot pad an empty signal')
    if x_len >= max_len:
        return x[:max_len]
    # Repeat the signal just enough times to cover max_len, then trim.
    # The original tiled into a 2-D (1, n*x_len) array and indexed it back
    # down; a 1-D tile is equivalent and avoids the extra axis.
    num_repeats = -(-max_len // x_len)  # ceil division
    return np.tile(x, num_repeats)[:max_len]
def preprocess(audio_file, target_sr=16000, win_dur=4):
    """Load an audio file and cut 5 random fixed-length windows from it.

    The file is read with soundfile, downmixed to mono if needed, and
    resampled to `target_sr`. Five window start positions are drawn
    uniformly at random; windows that run past the end of the signal are
    repeat-padded by `pad_audio`.

    Parameters
    ----------
    audio_file : str or path-like
        Path accepted by soundfile.read.
    target_sr : int
        Sample rate the signal is resampled to (default 16000).
    win_dur : int or float
        Window duration in seconds (default 4).

    Returns
    -------
    (torch.Tensor, int)
        Float tensor of shape (5, target_sr * win_dur) and the file's
        ORIGINAL sample rate (not target_sr).
    """
    print(f'Preprocessing {audio_file}')
    # Load the audio file
    y, sr_orig = sf.read(audio_file)
    # If stereo, convert to mono by averaging the channels.
    if y.ndim > 1:
        y = np.mean(y, axis=1)
    if sr_orig != target_sr:
        y = librosa.resample(y, orig_sr=sr_orig, target_sr=target_sr)
    # BUG FIX: `sr` was previously assigned only inside the resample branch,
    # so a file already at target_sr raised NameError below. After this
    # point the signal is at target_sr in either case.
    sr = target_sr
    # Evaluate N random windows of the audio file.
    num_eval = 5
    win_len = int(win_dur * sr)
    last_sample = len(y) - win_len
    # max(0, ...) clamps start positions for files shorter than one window.
    start_sample_list = [random.randint(0, max(0, last_sample)) for _ in range(num_eval)]
    frames = []
    for start_sample in start_sample_list:
        start = int(start_sample)
        frame = y[start:start + win_len]
        if len(frame) < win_len:
            # Window ran off the end of the signal — repeat-pad it.
            frame = pad_audio(frame, win_len)
        frames.append(frame)
    y_win = np.stack(frames, axis=0)  # (num_eval, win_len)
    tensor = torch.from_numpy(y_win).float()
    print(f'preprocessed track - shape {tensor.shape}')
    return tensor, sr_orig
def _window_frames(y, start_sample_list, win_len):
    """Cut one `win_len`-sample window from `y` at each start position;
    windows running past the end of `y` are repeat-padded by pad_audio."""
    frames = []
    for start_sample in start_sample_list:
        start = int(start_sample)
        frame = y[start:start + win_len]
        if len(frame) < win_len:
            frame = pad_audio(frame, win_len)
        frames.append(frame)
    return frames


def preprocess_FS(audio_file, win_dur=4):
    """Load an audio file and return 5 random windows at three sample rates.

    The signal is downmixed to mono and resampled to 16 kHz, 22.05 kHz and
    24 kHz. Five random window starts are drawn on the 24 kHz signal and
    scaled proportionally to the other two rates, so all three tensors
    cover (approximately) the same stretches of audio.

    Parameters
    ----------
    audio_file : str or path-like
        Path accepted by soundfile.read.
    win_dur : int or float
        Window duration in seconds (default 4).

    Returns
    -------
    (torch.Tensor, torch.Tensor, torch.Tensor)
        Float tensors of shape (5, win_dur*16000), (5, win_dur*22050)
        and (5, win_dur*24000), in that order.
    """
    print(f'Preprocessing {audio_file}')
    # Load the audio file
    y, sr_orig = sf.read(audio_file)
    # If stereo, convert to mono by averaging the channels.
    if y.ndim > 1:
        y = np.mean(y, axis=1)
    # Same content at the three target rates.
    rates = (16000, 22050, 24000)
    resampled = {sr: librosa.resample(y, orig_sr=sr_orig, target_sr=sr) for sr in rates}
    num_eval = 5
    win_lens = {sr: int(win_dur * sr) for sr in rates}
    # Draw starts on the 24 kHz signal; map them proportionally to the
    # other rates so the windows line up in time across sample rates.
    last_sample_24 = len(resampled[24000]) - win_lens[24000]
    starts_24 = [random.randint(0, max(0, last_sample_24)) for _ in range(num_eval)]
    starts = {
        16000: [int(s * 16000 // 24000) for s in starts_24],
        22050: [int(s * 22050 // 24000) for s in starts_24],
        24000: starts_24,
    }
    # One identical windowing pass per rate (was three copy-pasted loops).
    tensors = {}
    for sr in rates:
        frames = _window_frames(resampled[sr], starts[sr], win_lens[sr])
        tensors[sr] = torch.from_numpy(np.stack(frames, axis=0)).float()
    print(f'preprocessed track - shape {tensors[16000].shape}')
    return tensors[16000], tensors[22050], tensors[24000]
def preprocess_old(audio_file):
    """Legacy preprocessing: MFCC features for the whole file.

    Loads the file at its native sample rate, computes 13 MFCCs, and
    returns them as a tensor with a leading batch dimension of 1.
    """
    # sr=None keeps the file's original sampling rate.
    signal, sample_rate = librosa.load(audio_file, sr=None)
    mfcc_features = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    # Prepend the batch axis (equivalent to indexing with [None]).
    return torch.from_numpy(mfcc_features).unsqueeze(0)
|