|
|
import librosa |
|
|
import numpy as np |
|
|
import torch |
|
|
|
|
|
from .constants import * |
|
|
|
|
|
|
|
|
def to_local_average_f0(hidden, center=None, thred=0.03):
    """Turn per-class activations into an f0 track via a local weighted mean.

    For every position along the first two axes, a window of class bins
    around ``center`` (4 bins below through 4 bins above, clipped to
    ``[0, N_CLASS)``) is selected and the activation-weighted average of
    the bins' cent values is converted with ``10 * 2 ** (cents / 1200)``.

    Args:
        hidden: Activation tensor whose axis 2 is the class axis.
            Presumably shaped ``(1, frames, N_CLASS)`` -- TODO confirm
            against the caller; only a size-1 leading axis is squeezed.
        center: Optional integer tensor of window centres, broadcastable
            against ``hidden`` with a trailing singleton axis. When
            ``None`` the per-position argmax of ``hidden`` is used.
        thred: Positions whose peak activation is below this value are
            reported as ``0``.

    Returns:
        A NumPy array holding the f0 values, with the leading axis
        squeezed out when it has size 1.
    """
    # Class indices laid out so they broadcast along the class axis.
    bins = torch.arange(N_CLASS, device=hidden.device)[None, None, :]
    bin_cents = bins * 20 + CONST

    if center is None:
        center = torch.argmax(hidden, dim=2, keepdim=True)

    # Half-open window [center - 4, center + 5), kept inside the class range.
    lower = torch.clamp(center - 4, min=0)
    upper = torch.clamp(center + 5, max=N_CLASS)
    in_window = (bins >= lower) & (bins < upper)

    windowed = hidden * in_window
    numerator = (windowed * bin_cents).sum(dim=2)
    denominator = windowed.sum(dim=2)
    # Adding the boolean mask swaps a zero denominator for 1, so an
    # all-zero window yields 0 cents instead of NaN.
    cents = numerator / (denominator + (denominator == 0))

    f0 = 10 * 2 ** (cents / 1200)

    # Zero out positions whose strongest activation falls below the threshold.
    unvoiced = hidden.max(dim=2).values < thred
    f0 = f0 * ~unvoiced

    return f0.squeeze(0).cpu().numpy()
|
|
|
|
|
|
|
|
def to_viterbi_f0(hidden, thred=0.03):
    """Decode an f0 track using a Viterbi-smoothed choice of centre bins.

    The activations are normalised per frame, the most likely class path
    is found with ``librosa.sequence.viterbi`` under a fixed transition
    matrix, and that path is passed as ``center`` to
    ``to_local_average_f0`` for the final weighted average.

    Args:
        hidden: Activation tensor; its leading axis is squeezed, so it is
            presumably shaped ``(1, frames, N_CLASS)`` -- TODO confirm
            against the caller.
        thred: Voicing threshold forwarded to ``to_local_average_f0``.

    Returns:
        Whatever ``to_local_average_f0`` returns for the decoded centres
        (a NumPy array of f0 values).
    """
    # Build the transition matrix once and cache it on the function object.
    if not hasattr(to_viterbi_f0, 'transition'):
        xx, yy = np.meshgrid(range(N_CLASS), range(N_CLASS))
        # Triangular kernel: weight decreases linearly with bin distance
        # and is zero for jumps of 30 bins or more.
        transition = np.maximum(30 - abs(xx - yy), 0)
        # Normalise each row so it sums to 1.
        transition = transition / transition.sum(axis=1, keepdims=True)
        to_viterbi_f0.transition = transition

    # Transpose so classes are rows and frames are columns.
    prob = hidden.squeeze(0).cpu().numpy()
    prob = prob.T

    # Normalise each frame to sum to 1. A frame whose activations sum to
    # zero would otherwise divide by zero and inject NaNs into the Viterbi
    # decoder; the boolean term replaces such a sum with 1, leaving that
    # frame all-zero (same guard as used for the weight sum in
    # to_local_average_f0).
    frame_sum = prob.sum(axis=0)
    prob = prob / (frame_sum + (frame_sum == 0))

    path = librosa.sequence.viterbi(prob, to_viterbi_f0.transition).astype(np.int64)

    # Reshape the path to (1, frames, 1) so it broadcasts against the
    # class-index tensor inside to_local_average_f0.
    center = torch.from_numpy(path).unsqueeze(0).unsqueeze(-1).to(hidden.device)

    return to_local_average_f0(hidden, center=center, thred=thred)
|
|
|