Spaces:
Running on Zero
Running on Zero
| import numpy as np | |
| import torch | |
def regulate_real_note_itv(note_itv, note_bd, word_bd, word_durs, hop_size, audio_sample_rate):
    """Snap note intervals to word boundaries and map notes to words.

    Args:
        note_itv: (num_notes, 2) array of per-note [start, end] frame indices.
            # assumes integer frame indices usable to index word_bd — TODO confirm
        note_bd: per-frame binary note-boundary indicator.
        word_bd: per-frame binary word-boundary indicator; every word boundary
            is expected to coincide with a note boundary (second assert).
        word_durs: per-word durations in seconds; length == num word boundaries + 1.
        hop_size: hop size in samples per frame.
        audio_sample_rate: sample rate in Hz.

    Returns:
        note_itv_secs: (num_notes, 2) note intervals in seconds; a note edge
            that falls on a word boundary is snapped to that word's exact
            start/end time derived from word_durs.
        note2words: (num_notes,) 1-based word index per note (mel2ph fashion).
    """
    # regulate note_itv in seconds according to the correspondence between note_bd and word_bd
    assert note_itv.shape[0] == np.sum(note_bd) + 1
    assert np.sum(word_bd) <= np.sum(note_bd)
    assert word_durs.shape[0] == np.sum(word_bd) + 1, f"{word_durs.shape[0]} {np.sum(word_bd) + 1}"
    # replace each 1 with its ordinal boundary number, keep zeros elsewhere
    word_bd = np.cumsum(word_bd) * word_bd  # [0,1,0,0,1,0,0,0] -> [0,1,0,0,2,0,0,0]
    word_itv = np.zeros((word_durs.shape[0], 2))
    word_offsets = np.cumsum(word_durs)
    note2words = np.zeros(note_itv.shape[0], dtype=int)
    # build per-word [start, end] times in seconds from cumulative durations
    for idx in range(len(word_offsets) - 1):
        word_itv[idx, 1] = word_itv[idx + 1, 0] = word_offsets[idx]
    word_itv[-1, 1] = word_offsets[-1]
    # default conversion: frames -> seconds (overridden below at word boundaries)
    note_itv_secs = note_itv * hop_size / audio_sample_rate
    for idx, itv in enumerate(note_itv):
        start_idx, end_idx = itv
        # NOTE(review): if end_idx can equal len(word_bd) this indexing would
        # raise — presumably note boundaries stay within the frame range; verify.
        if word_bd[start_idx] > 0:
            # note starts at word boundary k -> snap to start of word k
            word_dur_idx = word_bd[start_idx]
            note_itv_secs[idx, 0] = word_itv[word_dur_idx, 0]
            note2words[idx] = word_dur_idx
        if word_bd[end_idx] > 0:
            # note ends at word boundary k -> snap to end of word k-1
            word_dur_idx = word_bd[end_idx] - 1
            note_itv_secs[idx, 1] = word_itv[word_dur_idx, 1]
            note2words[idx] = word_dur_idx
    # NOTE(review): a note with neither edge on a word boundary keeps the
    # default 0 here (-> 1 after the shift below), which only matches the
    # first word — confirm downstream consumers tolerate/expect this.
    note2words += 1  # mel2ph fashion: start from 1
    return note_itv_secs, note2words
def regulate_ill_slur(notes, note_itv, note2words):
    """Merge consecutive equal pitches that belong to the same word.

    Within each run of notes mapped to one word, a note repeating the pitch
    of the note right before it is treated as an ill-formed slur: its
    interval is fused into the previous kept note instead of being emitted.

    Args:
        notes: 1-D array of note pitches.
        note_itv: (num_notes, 2) array of note intervals.
        note2words: 1-D array mapping each note to a word index.

    Returns:
        Tuple (notes, note_itv, note2words) with slurred repeats merged,
        each as a numpy array preserving the input dtypes.
    """
    merged_notes, merged_itv, merged_words = [], [], []
    total = len(notes)
    seg_start = 0
    while seg_start < total:
        # locate the end of the run of notes sharing this note's word
        seg_end = seg_start + 1
        while seg_end < total and note2words[seg_start] == note2words[seg_end]:
            seg_end += 1
        # the first note of the run is always kept
        merged_words.append(note2words[seg_start])
        merged_itv.append(note_itv[seg_start].tolist())
        merged_notes.append(notes[seg_start])
        for j in range(seg_start + 1, seg_end):
            if notes[j] == notes[j - 1]:
                # same pitch slurred inside the word: stretch the kept interval
                merged_itv[-1][1] = note_itv[j][1]
            else:
                merged_itv.append(note_itv[j].tolist())
                merged_words.append(note2words[j])
                merged_notes.append(notes[j])
        seg_start = seg_end
    merged_notes = np.array(merged_notes, dtype=notes.dtype)
    merged_itv = np.array(merged_itv, dtype=note_itv.dtype)
    merged_words = np.array(merged_words, dtype=note2words.dtype)
    return merged_notes, merged_itv, merged_words
def bd_to_idxs(bd):
    """Return the positions of all boundary markers (entries == 1) in bd.

    Args:
        bd: 1-D binary boundary indicator of length T.

    Returns:
        List of indices where bd equals 1, in ascending order.
    """
    return [pos for pos, flag in enumerate(bd) if flag == 1]
def bd_to_durs(bd):
    """Convert a binary boundary sequence into segment lengths.

    Segments run from position 0 to the first marker, between consecutive
    markers, and from the last marker to the end, so the result has
    (number of markers + 1) entries summing to len(bd).

    Args:
        bd: 1-D binary boundary indicator of length T.

    Returns:
        List of segment durations in frames.
    """
    edges = [0] + [pos for pos, flag in enumerate(bd) if flag == 1] + [len(bd)]
    return [right - left for left, right in zip(edges[:-1], edges[1:])]
def get_mel_len(wav_len, hop_size):
    """Return the number of mel frames covering wav_len samples: ceil(wav_len / hop_size)."""
    full_frames, remainder = divmod(wav_len, hop_size)
    return full_frames + 1 if remainder else full_frames
def mel2token_to_dur(mel2token, T_txt=None, max_dur=None):
    """Count how many mel frames map to each token.

    mel2token holds 1-based token indices per frame (0 acts as padding and
    is discarded). Accepts a torch tensor or anything torch.LongTensor can
    consume (list, numpy array), batched [B, T] or unbatched [T]; the
    returned durations mirror the input's type and batch shape.

    Args:
        mel2token: [T] or [B, T] frame-to-token index map (1-based).
        T_txt: number of tokens; defaults to mel2token.max().
        max_dur: optional upper clamp applied to every duration.

    Returns:
        dur: [T_txt] or [B, T_txt] frame count per token.
    """
    was_tensor = isinstance(mel2token, torch.Tensor)
    if not was_tensor:
        mel2token = torch.LongTensor(mel2token)
    if T_txt is None:
        T_txt = mel2token.max()
    batched = len(mel2token.shape) != 1
    if not batched:
        mel2token = mel2token[None, ...]
    n_batch = mel2token.shape[0]
    # bucket-count frames per token id; slot 0 collects padding and is dropped
    ones = torch.ones_like(mel2token)
    dur = mel2token.new_zeros(n_batch, T_txt + 1).scatter_add(1, mel2token, ones)
    dur = dur[:, 1:]
    if max_dur is not None:
        dur = dur.clamp(max=max_dur)
    if not was_tensor:
        dur = dur.numpy()
    if not batched:
        dur = dur[0]
    return dur
def align_word(word_durs, mel_len, hop_size, audio_sample_rate):
    """Rasterize word durations (seconds) onto mel frames.

    Each frame in mel2word receives the 1-based index of the word covering
    it (0 where no word lands, e.g. trailing frames). Word edges are the
    cumulative times rounded to the nearest frame.

    Args:
        word_durs: iterable of per-word durations in seconds.
        mel_len: total number of mel frames to fill.
        hop_size: hop size in samples per frame.
        audio_sample_rate: sample rate in Hz.

    Returns:
        mel2word: [mel_len] int array of 1-based word indices per frame.
        dur_word: list of per-word frame counts (via mel2token_to_dur).
    """
    mel2word = np.zeros([mel_len], int)
    elapsed = 0
    for word_idx, dur in enumerate(word_durs):
        first_frame = int(elapsed * audio_sample_rate / hop_size + 0.5)
        last_frame = int((elapsed + dur) * audio_sample_rate / hop_size + 0.5)
        mel2word[first_frame:last_frame] = word_idx + 1
        elapsed = elapsed + dur
    dur_word = mel2token_to_dur(mel2word)
    return mel2word, dur_word.tolist()