Spaces:
Running on Zero
Running on Zero
File size: 2,642 Bytes
61e6f25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | import numpy as np
def align_lrc_put_to_front(tokenizer, lrc_start_times, lrc_lines, total_lens):
    """Pack all lyric tokens back-to-back at the start of a fixed-size buffer.

    Every entry of ``lrc_lines`` is split on ``"|"``. Each resulting segment
    is encoded with ``tokenizer.encode`` and terminated with the ``"<SEP>"``
    id taken from ``tokenizer.phone2id``. Segments are written one after
    another starting at index 0; the rest of the buffer stays zero. No
    punctuation normalization is applied to the segments.

    Args:
        tokenizer: Object providing ``encode(text)`` and a ``phone2id``
            mapping that contains ``"<SEP>"``. ``encode`` must return a list,
            because its result is concatenated with a list.
        lrc_start_times: Not read by this function.
        lrc_lines: Iterable of lyric strings, optionally containing ``"|"``
            separators.
        total_lens: Length of the returned token buffer.

    Returns:
        A tuple ``(lrc_token, text)``: ``lrc_token`` is an ``np.int64`` array
        of length ``total_lens``; ``text`` is every segment followed by the
        literal ``"<SEP>"``, concatenated in order.

    Raises:
        AssertionError: If the tokens do not fit into ``total_lens`` slots.
    """
    lrc_text_list = []
    lrc_token = np.zeros(total_lens, dtype=np.int64)
    token_start = 0

    for line in lrc_lines:
        for segment in line.split("|"):
            one_line_token = np.array(
                tokenizer.encode(segment) + [tokenizer.phone2id["<SEP>"]]
            )
            token_end = token_start + len(one_line_token)

            # Raised explicitly instead of via `assert`, so the check still
            # runs under `python -O`. Without it NumPy would silently
            # broadcast a length-1 token array into an empty out-of-range
            # slice. The exception type is kept for existing callers.
            # Message (Chinese): "the length of lrc_token exceeds the vocal
            # latent".
            if token_end > len(lrc_token):
                raise AssertionError("lrc_token 的长度超过了 vocal latent")

            lrc_token[token_start:token_end] = one_line_token
            lrc_text_list.extend((segment, "<SEP>"))
            token_start = token_end

    return lrc_token, "".join(lrc_text_list)
def align_lrc_sentence_level(
    tokenizer, lrc_start_times, lrc_lines, total_lens, vae_frame_rate
):
    """Place each lyric line's tokens at the frame given by its start time.

    For every ``(start_time, line)`` pair the line is normalized: ``"|"``
    becomes a space, every listed punctuation mark becomes ``","``, and
    leading/trailing punctuation and spaces are stripped. It is then encoded
    with ``tokenizer.encode`` and terminated with the ``"<SEP>"`` id from
    ``tokenizer.phone2id``. The tokens are written starting at
    ``int(start_time * vae_frame_rate)``, or directly after the previous
    line's tokens when that frame is already occupied.

    Args:
        tokenizer: Object providing ``encode(text)`` and a ``phone2id``
            mapping that contains ``"<SEP>"``. ``encode`` must return a list,
            because its result is concatenated with a list.
        lrc_start_times: Start time of each line; multiplied by
            ``vae_frame_rate`` to obtain a frame index.
        lrc_lines: Lyric strings, paired positionally with
            ``lrc_start_times`` via ``zip`` (extra items on either side are
            ignored).
        total_lens: Length of the returned token buffer.
        vae_frame_rate: Factor converting a start time into a frame index.

    Returns:
        A tuple ``(lrc_token, text)``: ``lrc_token`` is an ``np.int64`` array
        of length ``total_lens`` with zeros wherever no token was written;
        ``text`` is every normalized line followed by the literal
        ``"<SEP>"``, concatenated in order.

    Raises:
        AssertionError: If a line's tokens would run past ``total_lens``.
    """
    # BUG (note carried over from the original author): Only the prompt and
    # the two segments to be generated have start timestamps, the generated
    # content and the prompt do not contain anything like <SEP>.
    puncts = ",。!?、;:,.!?;:"
    # One translation table built once replaces the per-line chain of
    # `.replace()` calls; every listed mark maps to ",".
    to_comma = str.maketrans(puncts, "," * len(puncts))
    strip_chars = puncts + " "

    lrc_text_list = []
    lrc_token = np.zeros(total_lens, dtype=np.int64)
    token_start = 0

    for lrc_start_time, raw_line in zip(lrc_start_times, lrc_lines):
        one_line_lrc = (
            raw_line.replace("|", " ").translate(to_comma).strip(strip_chars)
        )
        one_line_token = np.array(
            tokenizer.encode(one_line_lrc) + [tokenizer.phone2id["<SEP>"]]
        )

        # Handle postponement: never start before the end of the previous
        # line's tokens, even if the timestamp says so.
        start_frame = max(int(lrc_start_time * vae_frame_rate), token_start)
        end_frame = start_frame + len(one_line_token)

        # Raised explicitly instead of via `assert`, so the check still runs
        # under `python -O`. The exception type is kept for existing callers.
        if end_frame > len(lrc_token):
            raise AssertionError(
                "The length of the lrc_token exceeds that of the vocal latent"
            )

        lrc_token[start_frame:end_frame] = one_line_token
        lrc_text_list.extend((one_line_lrc, "<SEP>"))
        token_start = end_frame

    return lrc_token, "".join(lrc_text_list)
|