File size: 2,659 Bytes
dfd1909
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from typing import Dict
from numpy import ndarray

import resampy
import torch
import torchcrepe
import numpy as np

from TorchJAEKWON.DataProcess.Util.UtilAudio import UtilAudio

class UtilAudioPlus(UtilAudio):
    """Extension of ``UtilAudio`` that adds CREPE-based pitch extraction."""

    def get_pitch_crepe(self,
                        wav:ndarray, #mono 1d array
                        sample_rate:float,
                        hop_size:int,
                        spec_time_bin_length:int,
                        f0_min:float = 50.0,
                        f0_max:float = 1100.0,
                        threshold:float=0.05,
                        device = torch.device("cuda")) -> Dict[str,ndarray]:
        """Estimate an f0 contour with torchcrepe and align it to a frame grid.

        The waveform is resampled to 16 kHz, analysed by torchcrepe with an
        80-sample hop, cleaned up (median-filtered periodicity, silence and
        periodicity thresholding, mean-filtered f0), and the remaining voiced
        frames are linearly interpolated onto the requested frame timeline.

        Args:
            wav: Mono 1-D waveform.
            sample_rate: Sample rate of ``wav``.
            hop_size: Hop, in samples at ``sample_rate``, of the output frames.
            spec_time_bin_length: Number of output frames.
            f0_min: Lower pitch bound passed to ``torchcrepe.predict``.
            f0_max: Upper pitch bound passed to ``torchcrepe.predict``.
            threshold: Periodicity threshold passed to
                ``torchcrepe.threshold.At``.
            device: Torch device the audio tensor and the model are placed on.

        Returns:
            Dict with ``'f0'`` (array of length ``spec_time_bin_length``; all
            zeros when no voiced frame is found) and ``'pitch'`` (the result of
            ``f0_to_coarse`` applied to that array).
        """
        crepe_sample_rate = 16000
        crepe_hop_size = 80

        wav16k = resampy.resample(wav, sample_rate, crepe_sample_rate)
        wav16k_torch = torch.FloatTensor(wav16k).unsqueeze(0).to(device)

        f0, pd = torchcrepe.predict(wav16k_torch, crepe_sample_rate, crepe_hop_size, f0_min, f0_max, pad=True, model='full', batch_size=1024, device=device, return_periodicity=True)

        pd = torchcrepe.filter.median(pd, 3)
        pd = torchcrepe.threshold.Silence(-60.)(pd, wav16k_torch, crepe_sample_rate, crepe_hop_size)
        f0 = torchcrepe.threshold.At(threshold)(f0, pd)
        f0 = torchcrepe.filter.mean(f0, 3)

        # Frames rejected by the thresholds are NaN; treat them as 0 (unvoiced).
        f0 = torch.where(torch.isnan(f0), torch.zeros_like(f0), f0)

        # squeeze(-1) rather than squeeze(): with exactly one voiced frame a
        # bare squeeze() collapses the index to 0-d and np.interp then fails.
        voiced_index = torch.nonzero(f0[0]).squeeze(-1)
        time_frame = np.arange(spec_time_bin_length) * hop_size / sample_rate

        if voiced_index.numel() == 0:
            # Keep the fallback a NumPy array (same dtype as np.interp output)
            # so both branches honour the declared return type.
            f0 = np.zeros_like(time_frame)
            print('f0 all zero!')
        else:
            voiced_f0 = torch.index_select(f0[0], dim=0, index=voiced_index).cpu().numpy()
            # Time stamp (seconds) of every voiced CREPE frame.
            time_org = (crepe_hop_size / crepe_sample_rate) * voiced_index.cpu().numpy()
            f0 = np.interp(time_frame, time_org, voiced_f0, left=voiced_f0[0], right=voiced_f0[-1])

        # NOTE(review): f0_to_coarse is called with its own default f0 range,
        # not the f0_min/f0_max given to this method — confirm this is intended.
        pitch_coarse = self.f0_to_coarse(f0)
        return {'f0':f0, 'pitch':pitch_coarse}

    def f0_to_coarse(self,
                     f0:ndarray,
                     f0_bin:int = 256,
                     f0_min:float = 50.0,
                     f0_max:float = 1100.0) -> ndarray:
        """Quantise f0 values (Hz) into integer bins on a mel-like scale.

        Values are mapped with ``1127 * ln(1 + f / 700)``, rescaled so that
        ``f0_min`` lands on bin 1 and ``f0_max`` on bin ``f0_bin - 1``, clamped
        to that range and rounded. An f0 of 0 ends up in bin 1.

        Args:
            f0: f0 values as a NumPy array or a torch tensor.
            f0_bin: Number of bins; outputs lie in ``[1, f0_bin - 1]``.
            f0_min: Frequency mapped to bin 1.
            f0_max: Frequency mapped to bin ``f0_bin - 1``.

        Returns:
            Integer bins with the same shape and container type as ``f0``.

        Raises:
            AssertionError: If a result falls outside ``[1, f0_bin - 1]``
                (e.g. when ``f0`` contains NaN).
        """
        is_torch = isinstance(f0, torch.Tensor)
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)

        # Only rescale positive entries; zeros are pushed to bin 1 by the clamp.
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1

        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
        f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)

        # Skip the range check on empty input, where max()/min() would raise;
        # the upper bound follows f0_bin instead of a hard-coded 255.
        num_values = f0_coarse.numel() if is_torch else f0_coarse.size
        if num_values > 0:
            assert f0_coarse.max() <= f0_bin - 1 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
        return f0_coarse