Xinsheng-Wang's picture
Upload folder using huggingface_hub
c7f3ffb verified
import numpy as np
import torch
def regulate_real_note_itv(note_itv, note_bd, word_bd, word_durs, hop_size, audio_sample_rate):
    """Snap note intervals to word boundaries and convert them to seconds.

    Args:
        note_itv: (num_notes, 2) array of note [start, end] frame indices.
        note_bd: frame-level 0/1 note-boundary array; num_notes == sum(note_bd) + 1.
        word_bd: frame-level 0/1 word-boundary array; must contain no more
            boundaries than note_bd.
        word_durs: (num_words,) per-word durations (presumably in seconds —
            confirm against caller); num_words == sum(word_bd) + 1.
        hop_size: hop length in samples, used to convert frames to samples.
        audio_sample_rate: sample rate in Hz, used to convert samples to seconds.

    Returns:
        note_itv_secs: (num_notes, 2) note intervals in seconds; endpoints that
            land exactly on a word boundary are replaced by the word's exact
            start/end time from word_durs.
        note2words: (num_notes,) 1-based word index assigned to each note
            ("mel2ph fashion"); stays 1 for notes whose endpoints hit no
            word boundary.
    """
    # regulate note_itv in seconds according to the correspondence between note_bd and word_bd
    assert note_itv.shape[0] == np.sum(note_bd) + 1
    assert np.sum(word_bd) <= np.sum(note_bd)
    assert word_durs.shape[0] == np.sum(word_bd) + 1, f"{word_durs.shape[0]} {np.sum(word_bd) + 1}"
    # mark each word-boundary frame with its 1-based boundary index
    word_bd = np.cumsum(word_bd) * word_bd  # [0,1,0,0,1,0,0,0] -> [0,1,0,0,2,0,0,0]
    word_itv = np.zeros((word_durs.shape[0], 2))  # (num_words, 2) word [start, end] times
    word_offsets = np.cumsum(word_durs)
    note2words = np.zeros(note_itv.shape[0], dtype=int)
    # build word intervals from cumulative durations: word idx ends where word idx+1 starts
    for idx in range(len(word_offsets) - 1):
        word_itv[idx, 1] = word_itv[idx + 1, 0] = word_offsets[idx]
    word_itv[-1, 1] = word_offsets[-1]
    note_itv_secs = note_itv * hop_size / audio_sample_rate  # frames -> seconds
    for idx, itv in enumerate(note_itv):
        start_idx, end_idx = itv
        if word_bd[start_idx] > 0:
            # note starts on the k-th word boundary -> snap to word k's start time
            word_dur_idx = word_bd[start_idx]
            note_itv_secs[idx, 0] = word_itv[word_dur_idx, 0]
            note2words[idx] = word_dur_idx
        if word_bd[end_idx] > 0:
            # note ends on the k-th word boundary -> snap to word (k-1)'s end time;
            # this word assignment overrides the start-based one when both endpoints
            # hit a boundary
            word_dur_idx = word_bd[end_idx] - 1
            note_itv_secs[idx, 1] = word_itv[word_dur_idx, 1]
            note2words[idx] = word_dur_idx
    note2words += 1  # mel2ph fashion: start from 1
    return note_itv_secs, note2words
def regulate_ill_slur(notes, note_itv, note2words):
    """Merge ill-formed slurs: consecutive notes with the same pitch that belong
    to the same word are collapsed into one note spanning their union interval.

    Args:
        notes: (num_notes,) array of note pitches.
        note_itv: (num_notes, 2) array of note [start, end] values.
        note2words: (num_notes,) word index for each note.

    Returns:
        Tuple (notes, note_itv, note2words) as numpy arrays with the same
        dtypes as the inputs, with repeated same-word, same-pitch notes merged.
    """
    out_notes, out_itv, out_words = [], [], []
    total = len(notes)
    start = 0
    while start < total:
        # advance `stop` past the run of notes mapped to the same word as `start`
        stop = start
        while stop < total and note2words[start] == note2words[stop]:
            stop += 1
        # always keep the first note of the run
        out_notes.append(notes[start])
        out_itv.append(note_itv[start].tolist())
        out_words.append(note2words[start])
        for j in range(start + 1, stop):
            if notes[j] == notes[j - 1]:
                # same pitch within the same word: absorb into the previous note
                out_itv[-1][1] = note_itv[j][1]
            else:
                out_notes.append(notes[j])
                out_itv.append(note_itv[j].tolist())
                out_words.append(note2words[j])
        start = stop
    out_notes = np.array(out_notes, dtype=notes.dtype)
    out_itv = np.array(out_itv, dtype=note_itv.dtype)
    out_words = np.array(out_words, dtype=note2words.dtype)
    return out_notes, out_itv, out_words
def bd_to_idxs(bd):
    """Return the positions of all boundary markers (entries equal to 1) in *bd*.

    Args:
        bd: sequence of 0/1 boundary flags, shape [T].

    Returns:
        List of indices where bd[idx] == 1, in increasing order.
    """
    # Idiomatic comprehension replaces the manual range(len(...))/append loop.
    return [idx for idx, flag in enumerate(bd) if flag == 1]
def bd_to_durs(bd):
    """Convert a 0/1 boundary sequence of length T into segment durations.

    Segments run from position 0 to the first boundary, between consecutive
    boundaries, and from the last boundary to T; the durations sum to len(bd).

    Args:
        bd: sequence of 0/1 boundary flags, shape [T].

    Returns:
        List of segment lengths (one more entry than there are boundaries).
    """
    boundary_positions = [pos for pos, flag in enumerate(bd) if flag == 1]
    # pad with the implicit start (0) and end (T) edges, then take differences
    edges = [0] + boundary_positions + [len(bd)]
    return [right - left for left, right in zip(edges[:-1], edges[1:])]
def get_mel_len(wav_len, hop_size):
    """Return the number of mel frames for *wav_len* samples: ceil(wav_len / hop_size).

    Uses the negate-floor-negate trick, which equals (wav_len + hop_size - 1) // hop_size
    for any integer wav_len and positive hop_size.
    """
    return -(-wav_len // hop_size)
def mel2token_to_dur(mel2token, T_txt=None, max_dur=None):
    """Convert a frame-to-token alignment into per-token frame counts.

    Args:
        mel2token: [T] or [B, T] of 1-based token indices per frame (0 = padding);
            torch tensor or any sequence convertible by torch.LongTensor.
        T_txt: number of tokens; defaults to mel2token.max().
        max_dur: if given, clamp every duration to this maximum.

    Returns:
        Durations of shape [T_txt] or [B, T_txt]; a numpy array if the input
        was not a torch tensor, otherwise a tensor.
    """
    was_tensor = isinstance(mel2token, torch.Tensor)
    if not was_tensor:
        mel2token = torch.LongTensor(mel2token)
    if T_txt is None:
        T_txt = mel2token.max()
    # remember whether a batch axis had to be added so it can be stripped again
    squeezed = mel2token.dim() == 1
    if squeezed:
        mel2token = mel2token.unsqueeze(0)
    n_batch = mel2token.shape[0]
    # count frames per token id; slot 0 collects padding and is dropped below
    dur = mel2token.new_zeros(n_batch, T_txt + 1)
    dur = dur.scatter_add(1, mel2token, torch.ones_like(mel2token))
    dur = dur[:, 1:]
    if max_dur is not None:
        dur = dur.clamp(max=max_dur)
    if not was_tensor:
        dur = dur.numpy()
    if squeezed:
        dur = dur[0]
    return dur
def align_word(word_durs, mel_len, hop_size, audio_sample_rate):
    """Build a frame-level word alignment from per-word durations in seconds.

    Args:
        word_durs: per-word durations in seconds.
        mel_len: total number of mel frames.
        hop_size: hop length in samples.
        audio_sample_rate: sample rate in Hz.

    Returns:
        Tuple (mel2word, dur_word): mel2word[t] is the 1-based index of the word
        covering frame t (0 where no word covers it); dur_word is the list of
        per-word frame counts derived via mel2token_to_dur.
    """
    mel2word = np.zeros([mel_len], int)
    elapsed = 0
    for w_idx, dur in enumerate(word_durs):
        # round each second offset to the nearest frame index
        first_frame = int(elapsed * audio_sample_rate / hop_size + 0.5)
        last_frame = int((elapsed + dur) * audio_sample_rate / hop_size + 0.5)
        mel2word[first_frame:last_frame] = w_idx + 1
        elapsed = elapsed + dur
    dur_word = mel2token_to_dur(mel2word)
    return mel2word, dur_word.tolist()