File size: 2,855 Bytes
79cf5f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pathlib

import numpy as np
import parselmouth


def norm_f0(f0):
    """Map f0 values into the log2 domain.

    :param f0: scalar or array-like of f0 values.
    :return: ``np.log2(f0)``; a zero input maps to ``-inf``.
    """
    return np.log2(f0)


def denorm_f0(f0, uv, pitch_padding=None):
    """Convert log2-domain f0 back to linear scale and zero out masked positions.

    :param f0: log2-domain values; the result is computed as ``2 ** f0``.
    :param uv: optional mask; positions where ``uv > 0`` are set to 0.
    :param pitch_padding: optional index/mask; the selected positions are set to 0.
    :return: the linear-scale values with the masked positions zeroed.
    """
    linear = 2 ** f0

    # Gather every mask that was supplied, then zero them out in one pass.
    zero_masks = []
    if uv is not None:
        zero_masks.append(uv > 0)
    if pitch_padding is not None:
        zero_masks.append(pitch_padding)
    for mask in zero_masks:
        linear[mask] = 0
    return linear


def interp_f0(f0, uv=None):
    """Fill unvoiced frames of an f0 curve by interpolating in the log2 domain.

    :param f0: array of f0 values.
    :param uv: optional boolean mask of unvoiced frames; when omitted, frames
        where ``f0 == 0`` are treated as unvoiced.
    :return: tuple ``(f0, uv)`` of the filled curve (linear scale) and the
        unvoiced mask that was used. If every frame is unvoiced, the unvoiced
        positions are set to ``-inf`` in the log domain before denormalizing.
    """
    if uv is None:
        uv = f0 == 0
    # Unvoiced frames are typically 0, so log2 yields -inf there. Those values
    # are overwritten below, so the divide-by-zero warning is just noise.
    with np.errstate(divide='ignore'):
        f0 = norm_f0(f0)
    # Count once with a vectorized call instead of iterating the mask in Python.
    n_unvoiced = np.count_nonzero(uv)
    if n_unvoiced == len(f0):
        f0[uv] = -np.inf
    elif n_unvoiced > 0:
        # np.interp holds the edge values, so leading/trailing unvoiced frames
        # take the nearest voiced value.
        f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
    return denorm_f0(f0, uv=None), uv


def resample_align_curve(
        points: np.ndarray, original_timestep: float, target_timestep: float, align_length: int
) -> np.ndarray:
    """Resample a uniformly sampled curve to a new timestep and fit it to a fixed length.

    The curve is linearly interpolated at multiples of ``target_timestep`` over
    ``[0, (len(points) - 1) * original_timestep)``, cast back to ``points.dtype``,
    then truncated or padded with its last value to ``align_length`` samples.

    :param points: curve samples spaced ``original_timestep`` apart.
    :param original_timestep: spacing of ``points``.
    :param target_timestep: spacing of the returned curve.
    :param align_length: number of samples in the returned curve.
    :return: array of length ``align_length`` with the dtype of ``points``.
    """
    t_max = (len(points) - 1) * original_timestep
    curve_interp = np.interp(
        np.arange(0, t_max, target_timestep),
        original_timestep * np.arange(len(points)),
        points,
    ).astype(points.dtype)
    delta_l = align_length - len(curve_interp)
    if delta_l < 0:
        return curve_interp[:align_length]
    if delta_l > 0:
        # A single-point curve spans zero time, so the resampled curve is empty
        # and has no last element; pad with the only known value instead.
        fill_value = curve_interp[-1] if len(curve_interp) > 0 else points[-1]
        padding = np.full(delta_l, fill_value=fill_value, dtype=curve_interp.dtype)
        curve_interp = np.concatenate((curve_interp, padding), axis=0)
    return curve_interp


def get_pitch_parselmouth(wav_data, hop_size, audio_sample_rate, interp_uv=True):
    """Extract f0 from a waveform using parselmouth's ``to_pitch_ac``.

    :param wav_data: waveform samples passed to ``parselmouth.Sound``.
    :param hop_size: hop length in samples; determines the analysis time step.
    :param audio_sample_rate: sampling rate of ``wav_data``.
    :param interp_uv: when true, unvoiced frames are filled via ``interp_f0``.
    :return: tuple ``(time_step, f0, uv)`` where ``uv`` marks frames whose
        extracted frequency is 0.
    """
    time_step = hop_size / audio_sample_rate
    pitch_floor, pitch_ceiling = 65., 1100.

    sound = parselmouth.Sound(wav_data, sampling_frequency=audio_sample_rate)
    # noinspection PyArgumentList
    pitch = sound.to_pitch_ac(
        time_step=time_step,
        voicing_threshold=0.6,
        pitch_floor=pitch_floor,
        pitch_ceiling=pitch_ceiling,
    )
    f0 = pitch.selected_array["frequency"]

    # A frequency of exactly 0 is used as the unvoiced marker.
    uv = f0 == 0
    if not interp_uv:
        return time_step, f0, uv
    f0, uv = interp_f0(f0, uv)
    return time_step, f0, uv


# Module-level cache for the RMVPE model: stays None until the first
# get_pitch_rmvpe() call constructs it, and is reused by later calls.
rmvpe = None


def get_pitch_rmvpe(wav_data, hop_size, audio_sample_rate, interp_uv=True):
    """Extract f0 from a waveform using the RMVPE model and align it to hop frames.

    The model is created on first use and cached in the module-level ``rmvpe``.
    Its output is always interpolated over unvoiced frames before resampling;
    when ``interp_uv`` is false the unvoiced frames are zeroed again afterwards.

    :param wav_data: waveform samples; ``wav_data.shape[0]`` is used as the sample count.
    :param hop_size: hop length in samples of the target frame grid.
    :param audio_sample_rate: sampling rate of ``wav_data``.
    :param interp_uv: keep interpolated values on unvoiced frames when true.
    :return: tuple ``(time_step, f0, uv)`` resampled to
        ``ceil(len(wav_data) / hop_size)`` frames.
    """
    global rmvpe
    if rmvpe is None:
        # Imported here so the dependency is only needed when this extractor is used.
        from rmvpe import RMVPE
        model_path = pathlib.Path(__file__).parent / 'assets' / 'rmvpe' / 'model.pt'
        rmvpe = RMVPE(model_path)

    raw_f0 = rmvpe.infer_from_audio(wav_data, sample_rate=audio_sample_rate)
    dense_f0, raw_uv = interp_f0(raw_f0, raw_f0 == 0)

    time_step = hop_size / audio_sample_rate
    n_frames = (wav_data.shape[0] + hop_size - 1) // hop_size  # ceiling division
    # NOTE(review): 0.01 is passed as the source timestep, i.e. the model output
    # is presumably one frame per 10 ms - confirm against the RMVPE implementation.
    source_step = 0.01
    f0_res = resample_align_curve(dense_f0, source_step, time_step, n_frames)
    uv_curve = resample_align_curve(raw_uv.astype(np.float32), source_step, time_step, n_frames)
    # The mask is resampled as a float curve and thresholded back to boolean.
    uv_res = uv_curve > 0.5

    if not interp_uv:
        f0_res[uv_res] = 0
    return time_step, f0_res, uv_res


def get_pitch(algorithm, wav_data, hop_size, audio_sample_rate, interp_uv=True):
    """Run the f0 extractor selected by name.

    :param algorithm: ``'parselmouth'`` or ``'rmvpe'``.
    :param wav_data: waveform samples forwarded to the extractor.
    :param hop_size: hop length in samples forwarded to the extractor.
    :param audio_sample_rate: sampling rate forwarded to the extractor.
    :param interp_uv: forwarded to the extractor as a keyword argument.
    :return: whatever the selected extractor returns (``(time_step, f0, uv)``).
    :raises ValueError: if ``algorithm`` is not a known extractor name.
    """
    if algorithm == 'parselmouth':
        extractor = get_pitch_parselmouth
    elif algorithm == 'rmvpe':
        extractor = get_pitch_rmvpe
    else:
        raise ValueError(f" [x] Unknown f0 extractor: {algorithm}")
    return extractor(wav_data, hop_size, audio_sample_rate, interp_uv=interp_uv)