# File size: 4,040 Bytes


import librosa
import soundfile as sf
import numpy as np
import torch
import random

def pad_audio(x, max_len=48000):
    """Force a signal to exactly ``max_len`` samples by cropping or looping it.

    Args:
        x: Array-like exposing ``.shape``; its first axis is treated as the
            sample axis. Callers in this module pass 1-D NumPy arrays.
        max_len: Desired number of samples.

    Returns:
        ``x[:max_len]`` when ``x`` already has at least ``max_len`` samples.
        Otherwise a 1-D array of length ``max_len`` made of back-to-back
        copies of ``x`` (the last copy is cut short). An empty ``x`` has
        nothing to repeat, so ``max_len`` zeros in the dtype of ``x`` are
        returned instead.
    """
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]

    if x_len == 0:
        # Repeating an empty signal is impossible (and used to divide by
        # zero); fall back to silence of the requested length.
        return np.zeros(max_len, dtype=np.asarray(x).dtype)

    # One copy more than the integer quotient is always enough to reach
    # max_len; the surplus is trimmed by the slice below.
    num_repeats = max_len // x_len + 1
    padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
    return padded_x


def preprocess(audio_file, target_sr=16000, win_dur=4, num_eval=5):
    """Load an audio file and cut random fixed-length windows out of it.

    The file is read with ``soundfile``; if the result has more than one
    dimension it is averaged over axis 1 (channels) to get a mono signal.
    The signal is resampled with ``librosa`` when the file's rate differs
    from ``target_sr``. ``num_eval`` windows of ``win_dur`` seconds are then
    taken at random start positions drawn with the ``random`` module (seed
    it for reproducible windows). Windows that come out shorter than
    requested -- i.e. when the whole signal is shorter than one window --
    are extended with ``pad_audio``.

    Args:
        audio_file: Anything ``soundfile.read`` accepts.
        target_sr: Sampling rate in Hz of the returned windows.
        win_dur: Window duration in seconds.
        num_eval: Number of windows to extract.

    Returns:
        A tuple ``(tensor, sr_orig)``. ``tensor`` is a float32 tensor of
        shape ``(num_eval, int(win_dur * target_sr))``. ``sr_orig`` is the
        sampling rate the file was stored with -- NOT ``target_sr``.

    Raises:
        ValueError: If ``num_eval`` is smaller than 1 or the file contains
            no audio samples.
    """
    if num_eval < 1:
        raise ValueError(f'num_eval must be at least 1, got {num_eval}')

    print(f'Preprocessing {audio_file}')

    y, sr_orig = sf.read(audio_file)

    # If stereo, convert to mono
    if y.ndim > 1:
        y = np.mean(y, axis=1)

    if len(y) == 0:
        # Without this check the failure would surface later as an opaque
        # error from the padding step.
        raise ValueError(f'{audio_file} contains no audio samples')

    if sr_orig != target_sr:
        y = librosa.resample(y, orig_sr=sr_orig, target_sr=target_sr)

    win_len = int(win_dur * target_sr)
    # Latest start that still leaves room for a full window; clamped to 0
    # for signals shorter than one window.
    last_start = max(0, len(y) - win_len)
    starts = [random.randint(0, last_start) for _ in range(num_eval)]

    frames = []
    for start in starts:
        frame = y[start:start + win_len]
        if len(frame) < win_len:
            frame = pad_audio(frame, win_len)
        frames.append(frame)

    tensor = torch.from_numpy(np.stack(frames, axis=0)).float()

    print(f'preprocessed track - shape {tensor.shape}')
    return tensor, sr_orig


def _windows_to_tensor(signal, starts, win_len):
    """Stack one ``win_len``-sample window per start index into a float32 tensor."""
    frames = []
    for start in starts:
        frame = signal[start:start + win_len]
        # A window can come out short when the signal itself is shorter
        # than win_len, or by a sample or so when a rescaled start index
        # lands near the end of a resampled signal.
        if len(frame) < win_len:
            frame = pad_audio(frame, win_len)
        frames.append(frame)
    return torch.from_numpy(np.stack(frames, axis=0)).float()


def preprocess_FS(audio_file, win_dur=4, num_eval=5):
    """Cut the same random windows from 16 kHz, 22.05 kHz and 24 kHz copies of a file.

    The file is read with ``soundfile``; if the result has more than one
    dimension it is averaged over axis 1 (channels) to get a mono signal,
    which is then resampled with ``librosa`` to 16000, 22050 and 24000 Hz.
    Window start positions are drawn at random (``random`` module) on the
    24 kHz signal and rescaled proportionally to the other two rates, so
    that row ``i`` of each returned tensor starts at approximately the same
    instant of the recording.

    Args:
        audio_file: Anything ``soundfile.read`` accepts.
        win_dur: Window duration in seconds.
        num_eval: Number of windows to extract per sampling rate.

    Returns:
        A tuple ``(tensor_16, tensor_22, tensor_24)`` of float32 tensors,
        each of shape ``(num_eval, int(win_dur * rate))`` for its rate.

    Raises:
        ValueError: If ``num_eval`` is smaller than 1 or the file contains
            no audio samples.
    """
    if num_eval < 1:
        raise ValueError(f'num_eval must be at least 1, got {num_eval}')

    print(f'Preprocessing {audio_file}')

    y, sr_orig = sf.read(audio_file)

    # If stereo, convert to mono
    if y.ndim > 1:
        y = np.mean(y, axis=1)

    if len(y) == 0:
        # Without this check the failure would surface later as an opaque
        # error from the padding step.
        raise ValueError(f'{audio_file} contains no audio samples')

    y_16 = librosa.resample(y, orig_sr=sr_orig, target_sr=16000)
    y_22 = librosa.resample(y, orig_sr=sr_orig, target_sr=22050)
    y_24 = librosa.resample(y, orig_sr=sr_orig, target_sr=24000)

    win_len_16 = int(win_dur * 16000)
    win_len_22 = int(win_dur * 22050)
    win_len_24 = int(win_dur * 24000)

    # Starts are sampled once, on the 24 kHz signal, and mapped to the
    # other rates with integer arithmetic to keep the windows aligned.
    last_start_24 = max(0, len(y_24) - win_len_24)
    starts_24 = [random.randint(0, last_start_24) for _ in range(num_eval)]
    starts_16 = [s * 16000 // 24000 for s in starts_24]
    starts_22 = [s * 22050 // 24000 for s in starts_24]

    tensor_16 = _windows_to_tensor(y_16, starts_16, win_len_16)
    tensor_22 = _windows_to_tensor(y_22, starts_22, win_len_22)
    tensor_24 = _windows_to_tensor(y_24, starts_24, win_len_24)

    print(f'preprocessed track - shape {tensor_16.shape}')
    return tensor_16, tensor_22, tensor_24


def preprocess_old(audio_file):
    """Turn an audio file into a batch-of-one tensor of 13 MFCCs.

    The file is loaded through ``librosa.load`` with ``sr=None`` (librosa's
    documented way of keeping the file's native sampling rate), 13 MFCCs
    are computed from it, and a leading dimension of size 1 is added to
    the resulting array.

    Args:
        audio_file: Anything ``librosa.load`` accepts.

    Returns:
        The MFCC matrix as a torch tensor with one extra leading axis.
    """
    waveform, native_sr = librosa.load(audio_file, sr=None)
    coeffs = librosa.feature.mfcc(y=waveform, sr=native_sr, n_mfcc=13)
    # unsqueeze(0) prepends the singleton dimension.
    return torch.from_numpy(coeffs).unsqueeze(0)