|
|
|
|
|
import librosa |
|
|
import soundfile as sf |
|
|
import numpy as np |
|
|
import torch |
|
|
import random |
|
|
|
|
|
def pad_audio(x, max_len=48000):
    """Force a 1-D waveform to exactly `max_len` samples.

    Longer signals are truncated; shorter signals are repeat-padded
    (tiled) until they reach `max_len`.

    Args:
        x: 1-D numpy array of audio samples.
        max_len: target length in samples (default 48000, i.e. 3 s at 16 kHz).

    Returns:
        1-D numpy array of length exactly `max_len`.

    Raises:
        ValueError: if `x` is empty (nothing to tile).
    """
    x_len = x.shape[0]

    if x_len >= max_len:
        return x[:max_len]

    if x_len == 0:
        # Previously this fell through to a ZeroDivisionError; fail clearly.
        raise ValueError('cannot pad an empty signal')

    # Tile enough copies to cover max_len, then trim the excess.
    num_repeats = max_len // x_len + 1
    return np.tile(x, num_repeats)[:max_len]
|
|
|
|
|
|
|
|
def preprocess(audio_file, target_sr=16000, win_dur=4):
    """Load an audio file and cut it into 5 random fixed-length windows.

    The file is down-mixed to mono, resampled to `target_sr` if needed, and
    5 windows of `win_dur` seconds are sampled at random start positions.
    Windows clipped at the end of the track are repeat-padded.

    Args:
        audio_file: path to an audio file readable by soundfile.
        target_sr: sample rate the windows are produced at (default 16 kHz).
        win_dur: window duration in seconds (default 4).

    Returns:
        (tensor, sr_orig): a float32 tensor of shape (5, win_dur * target_sr)
        and the file's ORIGINAL sample rate.
    """
    print(f'Preprocessing {audio_file}')

    y, sr_orig = sf.read(audio_file)

    # Down-mix multi-channel audio to mono by averaging channels.
    if y.ndim > 1:
        y = np.mean(y, axis=1)

    # Resample only when the file is not already at the target rate.
    if sr_orig != target_sr:
        y = librosa.resample(y, orig_sr=sr_orig, target_sr=target_sr)
    # Always define sr (the old code only set it inside the resample branch,
    # which would NameError on files already at target_sr).
    sr = target_sr

    num_eval = 5
    win_len = int(win_dur * sr)
    last_sample = len(y) - win_len

    # Random window starts; max(0, ...) guards tracks shorter than one window.
    start_sample_list = [random.randint(0, max(0, last_sample)) for _ in range(num_eval)]

    frames = []
    for start_sample in start_sample_list:
        start = int(start_sample)
        frame = y[start:start + win_len]
        # Repeat-pad a window that ran past the end of the track.
        if len(frame) < win_len:
            frame = pad_audio(frame, win_len)
        frames.append(frame)

    tensor = torch.from_numpy(np.stack(frames, axis=0)).float()

    print(f'preprocessed track - shape {tensor.shape}')
    # NOTE(review): the tensor is at target_sr but callers receive the
    # ORIGINAL rate — kept for interface compatibility; confirm intended.
    return tensor, sr_orig
|
|
|
|
|
|
|
|
def _extract_windows(y, start_samples, win_len):
    """Cut fixed-length windows from `y` at the given start samples.

    Windows clipped at the end of the track are repeat-padded via
    `pad_audio`. Returns a (len(start_samples), win_len) numpy array.
    """
    frames = []
    for start in start_samples:
        begin = int(start)
        frame = y[begin:begin + win_len]
        if len(frame) < win_len:
            frame = pad_audio(frame, win_len)
        frames.append(frame)
    return np.stack(frames, axis=0)


def preprocess_FS(audio_file, win_dur=4):
    """Load an audio file and window it at three sample rates in parallel.

    The mono signal is resampled to 16 kHz, 22.05 kHz and 24 kHz; 5 random
    windows of `win_dur` seconds are drawn at the 24 kHz rate and the same
    time positions are mapped onto the other two rates, so all three
    tensors cover identical stretches of audio.

    Args:
        audio_file: path to an audio file readable by soundfile.
        win_dur: window duration in seconds (default 4).

    Returns:
        (tensor_16, tensor_22, tensor_24): float32 tensors of shape
        (5, win_dur * sr) for sr = 16000, 22050, 24000 respectively.
    """
    print(f'Preprocessing {audio_file}')

    y, sr_orig = sf.read(audio_file)

    # Down-mix multi-channel audio to mono by averaging channels.
    if y.ndim > 1:
        y = np.mean(y, axis=1)

    y_16 = librosa.resample(y, orig_sr=sr_orig, target_sr=16000)
    y_22 = librosa.resample(y, orig_sr=sr_orig, target_sr=22050)
    y_24 = librosa.resample(y, orig_sr=sr_orig, target_sr=24000)

    num_eval = 5
    win_len_16 = int(win_dur * 16000)
    win_len_22 = int(win_dur * 22050)
    win_len_24 = int(win_dur * 24000)

    # Draw random starts at 24 kHz; max(0, ...) guards short tracks.
    last_sample_24 = len(y_24) - win_len_24
    start_sample_list_24 = [random.randint(0, max(0, last_sample_24)) for _ in range(num_eval)]
    # Map the 24 kHz start positions onto the other rates so the three
    # tensors describe the same time windows.
    start_sample_list_16 = [int(s * 16000 // 24000) for s in start_sample_list_24]
    start_sample_list_22 = [int(s * 22050 // 24000) for s in start_sample_list_24]

    # The crop-and-pad loop was previously triplicated; shared helper now.
    tensor_16 = torch.from_numpy(_extract_windows(y_16, start_sample_list_16, win_len_16)).float()
    tensor_22 = torch.from_numpy(_extract_windows(y_22, start_sample_list_22, win_len_22)).float()
    tensor_24 = torch.from_numpy(_extract_windows(y_24, start_sample_list_24, win_len_24)).float()

    print(f'preprocessed track - shape {tensor_16.shape}')
    return tensor_16, tensor_22, tensor_24
|
|
|
|
|
|
|
|
def preprocess_old(audio_file):
    """Legacy preprocessing: 13 MFCCs at the file's native sample rate.

    Loads `audio_file` without resampling (sr=None), computes 13
    mel-frequency cepstral coefficients, and returns them as a torch
    tensor with a leading batch dimension, i.e. shape (1, 13, frames).
    """
    signal, rate = librosa.load(audio_file, sr=None)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)
    # unsqueeze(0) adds the batch axis (equivalent to indexing with [None]).
    return torch.from_numpy(coeffs).unsqueeze(0)
|
|
|