File size: 2,659 Bytes
dfd1909 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | from typing import Dict
from numpy import ndarray
import resampy
import torch
import torchcrepe
import numpy as np
from TorchJAEKWON.DataProcess.Util.UtilAudio import UtilAudio
class UtilAudioPlus(UtilAudio):
def get_pitch_crepe(self,
wav:ndarray, #mono 1d array
sample_rate:float,
hop_size:int,
spec_time_bin_length:int,
f0_min:float = 50.0,
f0_max:float = 1100.0,
threshold:float=0.05,
device = torch.device("cuda")) -> Dict[str,ndarray]:
wav16k = resampy.resample(wav, sample_rate, 16000)
wav16k_torch = torch.FloatTensor(wav16k).unsqueeze(0).to(device)
f0, pd = torchcrepe.predict(wav16k_torch, 16000, 80, f0_min, f0_max, pad=True, model='full', batch_size=1024, device=device, return_periodicity=True)
pd = torchcrepe.filter.median(pd, 3)
pd = torchcrepe.threshold.Silence(-60.)(pd, wav16k_torch, 16000, 80)
f0 = torchcrepe.threshold.At(threshold)(f0, pd)
f0 = torchcrepe.filter.mean(f0, 3)
f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)
nzindex = torch.nonzero(f0[0]).squeeze()
f0 = torch.index_select(f0[0], dim=0, index=nzindex).cpu().numpy()
time_org = 0.005 * nzindex.cpu().numpy()
time_frame = np.arange(spec_time_bin_length) * hop_size / sample_rate
if f0.shape[0] == 0:
f0 = torch.FloatTensor(time_frame.shape[0]).fill_(0)
print('f0 all zero!')
else:
f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
pitch_coarse = self.f0_to_coarse(f0)
return {'f0':f0, 'pitch':pitch_coarse}
def f0_to_coarse(self,
f0:ndarray,
f0_bin:int = 256,
f0_min:float = 50.0,
f0_max:float = 1100.0) -> ndarray:
is_torch = isinstance(f0, torch.Tensor)
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
return f0_coarse |