import numpy as np
import torch


def regulate_real_note_itv(note_itv, note_bd, word_bd, word_durs, hop_size, audio_sample_rate):
    """Regulate note intervals (in seconds) to snap onto word boundaries.

    Args:
        note_itv: (n_notes, 2) array of [start, end] frame indices per note
            (n_notes == sum(note_bd) + 1, enforced by assertion).
        note_bd: 0/1 note-boundary indicator over frames.
        word_bd: 0/1 word-boundary indicator over frames
            (e.g. [0,1,0,0,1,0,0,0]); must have no more boundaries than note_bd.
        word_durs: per-word durations in seconds; length == sum(word_bd) + 1.
        hop_size: STFT hop size in samples (frame -> seconds conversion).
        audio_sample_rate: audio sample rate in Hz.

    Returns:
        (note_itv_secs, note2words):
            note_itv_secs: (n_notes, 2) note intervals in seconds, with note
                edges that coincide with a word boundary replaced by the exact
                word start/end time derived from word_durs.
            note2words: (n_notes,) mapping note index -> word index, 1-based
                ("mel2ph fashion").
    """
    # regulate note_itv in seconds according to the correspondence between note_bd and word_bd
    assert note_itv.shape[0] == np.sum(note_bd) + 1
    assert np.sum(word_bd) <= np.sum(note_bd)
    assert word_durs.shape[0] == np.sum(word_bd) + 1, f"{word_durs.shape[0]} {np.sum(word_bd) + 1}"
    # Tag each boundary frame with its 1-based word-boundary ordinal:
    # [0,1,0,0,1,0,0,0] -> [0,1,0,0,2,0,0,0]
    word_bd = np.cumsum(word_bd) * word_bd
    word_itv = np.zeros((word_durs.shape[0], 2))
    word_offsets = np.cumsum(word_durs)
    note2words = np.zeros(note_itv.shape[0], dtype=int)
    # Build [start, end] time intervals per word: each offset ends one word
    # and starts the next.
    for idx in range(len(word_offsets) - 1):
        word_itv[idx, 1] = word_itv[idx + 1, 0] = word_offsets[idx]
    word_itv[-1, 1] = word_offsets[-1]
    note_itv_secs = note_itv * hop_size / audio_sample_rate
    for idx, itv in enumerate(note_itv):
        start_idx, end_idx = itv
        if word_bd[start_idx] > 0:
            # Note starts on a word boundary: snap its start to that word's start.
            word_dur_idx = word_bd[start_idx]
            note_itv_secs[idx, 0] = word_itv[word_dur_idx, 0]
            note2words[idx] = word_dur_idx
        if word_bd[end_idx] > 0:
            # Note ends on a word boundary: snap its end to the previous word's end.
            word_dur_idx = word_bd[end_idx] - 1
            note_itv_secs[idx, 1] = word_itv[word_dur_idx, 1]
            note2words[idx] = word_dur_idx
    note2words += 1  # mel2ph fashion: start from 1
    return note_itv_secs, note2words


def regulate_ill_slur(notes, note_itv, note2words):
    """Merge consecutive equal-pitch notes that belong to the same word.

    Within each run of notes mapped to one word, a note repeating the previous
    pitch is treated as an ill-formed slur: its interval is fused into the
    previous note's interval instead of being kept as a separate note.

    Args:
        notes: (n_notes,) note pitches.
        note_itv: (n_notes, 2) note intervals (rows converted via .tolist()).
        note2words: (n_notes,) note -> word mapping (same-word runs are
            contiguous).

    Returns:
        (res_notes, res_note_itv, res_note2words): arrays with the same dtypes
        as the inputs, with same-pitch same-word notes merged.
    """
    res_note2words = []
    res_note_itv = []
    res_notes = []
    note_idx = 0
    note_idx_end = 0
    while True:
        if note_idx > len(notes) - 1:
            break
        # Advance note_idx_end past the run of notes sharing this note's word.
        while note_idx <= note_idx_end < len(notes) and note2words[note_idx] == note2words[note_idx_end]:
            note_idx_end += 1
        # First note of the run is always kept.
        res_note2words.append(note2words[note_idx])
        res_note_itv.append(note_itv[note_idx].tolist())
        res_notes.append(notes[note_idx])
        for idx in range(note_idx + 1, note_idx_end):
            if notes[idx] == notes[idx - 1]:
                # Same pitch as predecessor within the same word: extend the
                # previous kept interval instead of emitting a new note.
                res_note_itv[-1][1] = note_itv[idx][1]
            else:
                res_note_itv.append(note_itv[idx].tolist())
                res_note2words.append(note2words[idx])
                res_notes.append(notes[idx])
        note_idx = note_idx_end
    res_notes = np.array(res_notes, dtype=notes.dtype)
    res_note_itv = np.array(res_note_itv, dtype=note_itv.dtype)
    res_note2words = np.array(res_note2words, dtype=note2words.dtype)
    return res_notes, res_note_itv, res_note2words


def bd_to_idxs(bd):
    """Return the indices of all 1-entries in boundary indicator `bd` [T]."""
    return [idx for idx, v in enumerate(bd) if v == 1]


def bd_to_durs(bd):
    """Convert boundary indicator `bd` [T] into segment durations.

    Each boundary (value 1) closes a segment; a final segment runs from the
    last boundary to the end, so len(result) == number of boundaries + 1.
    """
    last_idx = 0
    durs = []
    for idx in range(len(bd)):
        if bd[idx] == 1:
            durs.append(idx - last_idx)
            last_idx = idx
    durs.append(len(bd) - last_idx)
    return durs


def get_mel_len(wav_len, hop_size):
    """Number of mel frames for `wav_len` samples (ceil division by hop_size)."""
    return (wav_len + hop_size - 1) // hop_size


def mel2token_to_dur(mel2token, T_txt=None, max_dur=None):
    """Count frames per token from a frame->token alignment.

    Args:
        mel2token: [T] or [B, T] 1-based token index per frame (0 = padding);
            torch tensor or array-like (converted to LongTensor).
        T_txt: number of tokens; defaults to mel2token.max().
        max_dur: optional clamp on per-token duration.

    Returns:
        [T_txt] or [B, T_txt] per-token frame counts, matching the input's
        batch-ness and torch/numpy type.
    """
    is_torch = isinstance(mel2token, torch.Tensor)
    has_batch_dim = True
    if not is_torch:
        mel2token = torch.LongTensor(mel2token)
    if T_txt is None:
        T_txt = mel2token.max()
    if len(mel2token.shape) == 1:
        mel2token = mel2token[None, ...]
        has_batch_dim = False
    B, _ = mel2token.shape
    # Histogram token indices via scatter_add; slot 0 collects padding and is
    # dropped below.
    dur = mel2token.new_zeros(B, T_txt + 1).scatter_add(1, mel2token, torch.ones_like(mel2token))
    dur = dur[:, 1:]
    if max_dur is not None:
        dur = dur.clamp(max=max_dur)
    if not is_torch:
        dur = dur.numpy()
    if not has_batch_dim:
        dur = dur[0]
    return dur


def align_word(word_durs, mel_len, hop_size, audio_sample_rate):
    """Align word durations (seconds) to mel frames.

    Args:
        word_durs: per-word durations in seconds.
        mel_len: total number of mel frames.
        hop_size: STFT hop size in samples.
        audio_sample_rate: audio sample rate in Hz.

    Returns:
        (mel2word, dur_word): mel2word [mel_len] holds the 1-based word index
        per frame (0 where no word covers the frame); dur_word is the per-word
        frame count as a list.
    """
    mel2word = np.zeros([mel_len], int)
    start_time = 0
    for i_word in range(len(word_durs)):
        # Round second boundaries to the nearest frame.
        start_frame = int(start_time * audio_sample_rate / hop_size + 0.5)
        end_frame = int((start_time + word_durs[i_word]) * audio_sample_rate / hop_size + 0.5)
        mel2word[start_frame:end_frame] = i_word + 1
        start_time = start_time + word_durs[i_word]
    dur_word = mel2token_to_dur(mel2word)
    return mel2word, dur_word.tolist()