from typing import Dict

from numpy import ndarray
import resampy
import torch
import torchcrepe
import numpy as np

from TorchJAEKWON.DataProcess.Util.UtilAudio import UtilAudio


class UtilAudioPlus(UtilAudio):
    """UtilAudio extension adding CREPE-based pitch (F0) extraction helpers."""

    def get_pitch_crepe(self,
                        wav: ndarray,  # mono 1d array
                        sample_rate: float,
                        hop_size: int,
                        spec_time_bin_length: int,
                        f0_min: float = 50.0,
                        f0_max: float = 1100.0,
                        threshold: float = 0.05,
                        device=torch.device("cuda")) -> Dict[str, ndarray]:
        """Estimate F0 with torchcrepe and align it to a spectrogram frame grid.

        The waveform is resampled to 16 kHz, analysed by torchcrepe with a hop
        of 80 samples, and frames rejected by the periodicity / silence
        thresholds are dropped. The remaining frames are linearly interpolated
        onto ``spec_time_bin_length`` frames spaced ``hop_size / sample_rate``
        seconds apart, so rejected regions are filled by interpolation.

        Args:
            wav: Mono 1-D waveform.
            sample_rate: Sample rate of ``wav``.
            hop_size: Hop of the target frame grid, in samples at ``sample_rate``.
            spec_time_bin_length: Number of frames in the target grid.
            f0_min: Lower F0 bound passed to ``torchcrepe.predict``.
            f0_max: Upper F0 bound passed to ``torchcrepe.predict``.
            threshold: Periodicity threshold passed to ``torchcrepe.threshold.At``.
            device: Torch device the prediction runs on.

        Returns:
            A dict with ``'f0'`` (interpolated F0 per target frame) and
            ``'pitch'`` (the result of ``f0_to_coarse`` on that F0). Both are
            numpy arrays, including when no frame survives the thresholds
            (then ``'f0'`` is all zeros).
        """
        crepe_sample_rate = 16000
        crepe_hop = 80  # 80 / 16000 = 5 ms between CREPE frames

        wav16k = resampy.resample(wav, sample_rate, crepe_sample_rate)
        wav16k_torch = torch.FloatTensor(wav16k).unsqueeze(0).to(device)

        f0, pd = torchcrepe.predict(wav16k_torch,
                                    crepe_sample_rate,
                                    crepe_hop,
                                    f0_min,
                                    f0_max,
                                    pad=True,
                                    model='full',
                                    batch_size=1024,
                                    device=device,
                                    return_periodicity=True)

        # Smooth the periodicity, then reject silent and low-periodicity frames.
        pd = torchcrepe.filter.median(pd, 3)
        pd = torchcrepe.threshold.Silence(-60.)(pd, wav16k_torch, crepe_sample_rate, crepe_hop)
        f0 = torchcrepe.threshold.At(threshold)(f0, pd)
        f0 = torchcrepe.filter.mean(f0, 3)

        # Rejected frames are NaN at this point; turn them into zeros.
        f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)

        # torch.nonzero on a 1-D tensor returns shape (N, 1). Squeeze only the
        # last axis so a single surviving frame still gives a 1-D index; a bare
        # squeeze() would collapse it to 0-d and break np.interp below.
        nzindex = torch.nonzero(f0[0]).squeeze(-1)
        f0 = torch.index_select(f0[0], dim=0, index=nzindex).cpu().numpy()

        time_org = (crepe_hop / crepe_sample_rate) * nzindex.cpu().numpy()
        time_frame = np.arange(spec_time_bin_length) * hop_size / sample_rate

        if f0.shape[0] == 0:
            # Nothing survived the thresholds: return zeros as a numpy array so
            # the result type matches the interpolated branch.
            f0 = np.zeros(time_frame.shape[0])
            print('f0 all zero!')
        else:
            f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])

        pitch_coarse = self.f0_to_coarse(f0)
        return {'f0': f0, 'pitch': pitch_coarse}

    def f0_to_coarse(self,
                     f0: ndarray,
                     f0_bin: int = 256,
                     f0_min: float = 50.0,
                     f0_max: float = 1100.0) -> ndarray:
        """Quantise F0 values into integer bins on a mel-like log scale.

        F0 is mapped through ``1127 * log(1 + f / 700)``. Values with a positive
        mapped value are scaled linearly so ``f0_min`` lands on 1 and ``f0_max``
        on ``f0_bin - 1``; everything is then clipped to ``[1, f0_bin - 1]`` and
        rounded. Zero F0 therefore ends up in bin 1.

        Args:
            f0: F0 values as a numpy array or a torch tensor.
            f0_bin: Number of bins; outputs lie in ``[1, f0_bin - 1]``.
            f0_min: F0 mapped to the lowest bin.
            f0_max: F0 mapped to the highest bin.

        Returns:
            Integer bin indices, of the same array kind (numpy or torch) as
            ``f0``.
        """
        is_torch = isinstance(f0, torch.Tensor)
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)

        if is_torch:
            f0_mel = 1127 * (1 + f0 / 700).log()
        else:
            f0_mel = 1127 * np.log(1 + f0 / 700)

        positive = f0_mel > 0
        f0_mel[positive] = (f0_mel[positive] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1

        if is_torch:
            f0_coarse = (f0_mel + 0.5).long()
        else:
            f0_coarse = np.rint(f0_mel).astype(int)

        # Internal sanity check; the upper bound follows f0_bin instead of
        # assuming the default of 256 bins.
        assert f0_coarse.max() <= f0_bin - 1 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
        return f0_coarse