File size: 4,195 Bytes
c7f3ffb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
import torch

def regulate_real_note_itv(note_itv, note_bd, word_bd, word_durs, hop_size, audio_sample_rate):
    # regulate note_itv in seconds according to the correspondence between note_bd and word_bd
    """Snap note intervals to word boundaries and convert them to seconds.

    Whenever a note's start (or end) frame coincides with a word boundary, the
    note's start (end) time is replaced by the exact word start (end) time
    derived from ``word_durs``, and the note is mapped to that word.

    Args:
        note_itv: array of shape [num_notes, 2] holding (start_frame, end_frame)
            per note. NOTE(review): entries are used directly as indices into
            ``word_bd`` below, so they are assumed to be integer frame indices
            — TODO confirm at the call site.
        note_bd: binary per-frame note-boundary indicator; num_notes must equal
            sum(note_bd) + 1 (asserted).
        word_bd: binary per-frame word-boundary indicator; must have no more
            boundaries than note_bd (asserted), i.e. every word boundary is
            assumed to also be a note boundary.
        word_durs: word durations in seconds, one per word
            (len == sum(word_bd) + 1, asserted).
        hop_size: frame hop size in samples.
        audio_sample_rate: sample rate in Hz.

    Returns:
        (note_itv_secs, note2words):
            note_itv_secs: [num_notes, 2] note intervals in seconds, with
                word-aligned starts/ends snapped to exact word times.
            note2words: [num_notes] 1-based word index for each note that
                touches a word boundary (mel2ph fashion); notes strictly
                inside a word keep the value left from initialization + 1,
                i.e. 1 by default.
    """
    assert note_itv.shape[0] == np.sum(note_bd) + 1
    assert np.sum(word_bd) <= np.sum(note_bd)
    assert word_durs.shape[0] == np.sum(word_bd) + 1, f"{word_durs.shape[0]} {np.sum(word_bd) + 1}"
    # Tag each boundary frame with its 1-based word count so a single lookup
    # tells us *which* word starts at that frame.
    word_bd = np.cumsum(word_bd) * word_bd  # [0,1,0,0,1,0,0,0] -> [0,1,0,0,2,0,0,0]
    # word_itv[k] = (start_sec, end_sec) of word k, built from cumulative durations.
    word_itv = np.zeros((word_durs.shape[0], 2))
    word_offsets = np.cumsum(word_durs)
    note2words = np.zeros(note_itv.shape[0], dtype=int)
    for idx in range(len(word_offsets) - 1):
        # End of word idx is simultaneously the start of word idx+1.
        word_itv[idx, 1] = word_itv[idx + 1, 0] = word_offsets[idx]
    word_itv[-1, 1] = word_offsets[-1]
    # Frame indices -> seconds.
    note_itv_secs = note_itv * hop_size / audio_sample_rate
    for idx, itv in enumerate(note_itv):
        start_idx, end_idx = itv
        if word_bd[start_idx] > 0:
            # Note starts exactly at the start of word (word_bd value): snap
            # its onset to that word's start time.
            word_dur_idx = word_bd[start_idx]
            note_itv_secs[idx, 0] = word_itv[word_dur_idx, 0]
            note2words[idx] = word_dur_idx
        if word_bd[end_idx] > 0:
            # Note ends at a word boundary: snap its offset to the end of the
            # *previous* word (hence the -1).
            word_dur_idx = word_bd[end_idx] - 1
            note_itv_secs[idx, 1] = word_itv[word_dur_idx, 1]
            note2words[idx] = word_dur_idx
    note2words += 1  # mel2ph fashion: start from 1
    return note_itv_secs, note2words

def regulate_ill_slur(notes, note_itv, note2words):
    """Merge consecutive identical pitches that belong to the same word.

    Walks the notes word-group by word-group (consecutive entries sharing the
    same ``note2words`` value). Within a group, a note equal in pitch to its
    immediate predecessor is fused into it by extending the previous interval's
    end; distinct pitches are kept as separate notes. This removes "ill slurs"
    — spurious note splits inside one word at the same pitch.

    Args:
        notes: 1-D pitch array, one entry per note.
        note_itv: [num_notes, 2] note intervals (rows convertible via .tolist()).
        note2words: 1-D array mapping each note to its word index.

    Returns:
        (res_notes, res_note_itv, res_note2words): the merged arrays, with the
        same dtypes as the inputs.
    """
    res_note2words = []
    res_note_itv = []
    res_notes = []
    note_idx = 0
    # note_idx_end is intentionally NOT reset each iteration: it is a second
    # pointer that only ever moves forward past the current word group.
    note_idx_end = 0
    while True:
        if note_idx > len(notes) - 1:
            break
        # Advance note_idx_end to one past the last note of the current word group.
        while note_idx <= note_idx_end < len(notes) and note2words[note_idx] == note2words[note_idx_end]:
            note_idx_end += 1
        # The group's first note is always emitted as-is.
        res_note2words.append(note2words[note_idx])
        res_note_itv.append(note_itv[note_idx].tolist())
        res_notes.append(notes[note_idx])
        for idx in range(note_idx+1, note_idx_end):
            if notes[idx] == notes[idx-1]:
                # Same pitch as the previous note in this word: fuse by
                # extending the last emitted interval's end time.
                res_note_itv[-1][1] = note_itv[idx][1]
            else:
                res_note_itv.append(note_itv[idx].tolist())
                res_note2words.append(note2words[idx])
                res_notes.append(notes[idx])
        # Jump straight to the next word group.
        note_idx = note_idx_end
    res_notes = np.array(res_notes, dtype=notes.dtype)
    res_note_itv = np.array(res_note_itv, dtype=note_itv.dtype)
    res_note2words = np.array(res_note2words, dtype=note2words.dtype)
    return res_notes, res_note_itv, res_note2words

def bd_to_idxs(bd):
    """Return the positions of all boundary markers (entries equal to 1) in bd.

    Args:
        bd: 1-D binary boundary sequence of length T.

    Returns:
        List of indices i where bd[i] == 1, in ascending order.
    """
    return [pos for pos, flag in enumerate(bd) if flag == 1]

def bd_to_durs(bd):
    """Convert a binary boundary sequence into segment durations.

    Each boundary (value 1) closes the current segment and opens a new one;
    the final segment runs to the end of the sequence. A boundary at index 0
    therefore produces a leading zero-length segment.

    Args:
        bd: 1-D binary boundary sequence of length T.

    Returns:
        List of segment lengths summing to len(bd).
    """
    durs = []
    seg_start = 0
    for pos, flag in enumerate(bd):
        if flag == 1:
            durs.append(pos - seg_start)
            seg_start = pos
    # Close the trailing segment.
    durs.append(len(bd) - seg_start)
    return durs

def get_mel_len(wav_len, hop_size):
    """Return the number of mel frames covering wav_len samples (ceil division)."""
    return -(wav_len // -hop_size)

def mel2token_to_dur(mel2token, T_txt=None, max_dur=None):
    """Count how many mel frames are assigned to each token.

    ``mel2token`` holds 1-based token indices per frame (0 means "no token"
    and is discarded). Accepts a torch tensor or anything convertible via
    torch.LongTensor, with or without a leading batch dimension; the output
    mirrors the input's type and batch shape.

    Args:
        mel2token: [T] or [B, T] frame-to-token assignment (1-based; 0 = pad).
        T_txt: number of tokens; defaults to mel2token.max().
        max_dur: if given, durations are clipped to this value.

    Returns:
        [T_txt] or [B, T_txt] per-token frame counts.
    """
    is_torch = isinstance(mel2token, torch.Tensor)
    if not is_torch:
        mel2token = torch.LongTensor(mel2token)
    if T_txt is None:
        T_txt = mel2token.max()
    has_batch_dim = mel2token.dim() != 1
    if not has_batch_dim:
        mel2token = mel2token.unsqueeze(0)
    n_batch = mel2token.shape[0]
    # Histogram via scatter_add: slot 0 collects the padding frames, then is dropped.
    ones = torch.ones_like(mel2token)
    dur = mel2token.new_zeros(n_batch, T_txt + 1).scatter_add(1, mel2token, ones)[:, 1:]
    if max_dur is not None:
        dur = dur.clamp(max=max_dur)
    if not is_torch:
        dur = dur.numpy()
    return dur if has_batch_dim else dur[0]

def align_word(word_durs, mel_len, hop_size, audio_sample_rate):
    """Build a frame-to-word alignment from word durations in seconds.

    Args:
        word_durs: per-word durations in seconds.
        mel_len: total number of mel frames.
        hop_size: frame hop size in samples.
        audio_sample_rate: sample rate in Hz.

    Returns:
        (mel2word, dur_word): mel2word is an int array of length mel_len with
        1-based word indices per frame (0 = unassigned); dur_word is the list
        of frame counts per word derived from it.
    """
    mel2word = np.zeros([mel_len], int)
    t_start = 0
    for w_idx, w_dur in enumerate(word_durs):
        t_end = t_start + w_dur
        # Round second offsets to the nearest frame index.
        f_start = int(t_start * audio_sample_rate / hop_size + 0.5)
        f_end = int(t_end * audio_sample_rate / hop_size + 0.5)
        mel2word[f_start:f_end] = w_idx + 1
        t_start = t_end

    dur_word = mel2token_to_dur(mel2word)
    return mel2word, dur_word.tolist()