File size: 2,642 Bytes
61e6f25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import numpy as np


def align_lrc_put_to_front(tokenizer, lrc_start_times, lrc_lines, total_lens):
    """Pack every lyric segment back-to-back at the front of a token array.

    Each entry of ``lrc_lines`` is split on ``"|"``. Every resulting segment
    is encoded with ``tokenizer.encode`` and followed by the ``"<SEP>"`` id
    taken from ``tokenizer.phone2id``. The encoded segments are written
    contiguously starting at index 0; the remainder of the array stays 0.

    Args:
        tokenizer: Object providing ``encode(text)`` (must return a list,
            since it is concatenated with a list) and a ``phone2id`` mapping
            that contains ``"<SEP>"``.
        lrc_start_times: Unused by this function; kept so the signature
            parallels ``align_lrc_sentence_level``.
        lrc_lines: Iterable of lyric strings; ``"|"`` separates segments
            inside one string.
        total_lens: Length of the returned token array.

    Returns:
        A tuple ``(lrc_token, text)``: ``lrc_token`` is an ``np.int64`` array
        of length ``total_lens``; ``text`` is all segments concatenated, each
        followed by the literal ``"<SEP>"``.

    Raises:
        AssertionError: If the encoded segments do not fit in ``total_lens``
            slots.
    """
    lrc_text_list = []
    lrc_token = np.zeros(total_lens, dtype=np.int64)

    token_start = 0
    for raw_line in lrc_lines:
        # Unlike align_lrc_sentence_level, punctuation is left untouched here;
        # only the "|" separators are interpreted.
        for one_line_lrc in raw_line.split("|"):
            lrc_text_list.append(one_line_lrc)
            one_line_token = tokenizer.encode(one_line_lrc)
            lrc_text_list.append("<SEP>")
            one_line_token = one_line_token + [tokenizer.phone2id["<SEP>"]]

            one_line_token = np.array(one_line_token)
            token_end = token_start + len(one_line_token)
            # Raised explicitly (instead of a bare ``assert``) so the check
            # still runs under ``python -O``. Without it the slice below is
            # silently truncated and NumPy fails with an opaque broadcast
            # error. Message means: "the length of lrc_token exceeds the
            # vocal latent".
            if token_end > len(lrc_token):
                raise AssertionError("lrc_token 的长度超过了 vocal latent")
            lrc_token[token_start:token_end] = one_line_token
            token_start = token_end
    return lrc_token, "".join(lrc_text_list)


def align_lrc_sentence_level(
    tokenizer, lrc_start_times, lrc_lines, total_lens, vae_frame_rate
):
    """Place each lyric line in a token array at the frame of its start time.

    For every ``(start_time, line)`` pair the line is normalised (``"|"``
    becomes a space, listed punctuation becomes ``","``, and leading/trailing
    punctuation and spaces are stripped), encoded with ``tokenizer.encode``
    and followed by the ``"<SEP>"`` id from ``tokenizer.phone2id``. The
    tokens are written starting at ``int(start_time * vae_frame_rate)``, or
    directly after the previous line if that position is already occupied.
    Unwritten positions stay 0.

    Args:
        tokenizer: Object providing ``encode(text)`` (must return a list,
            since it is concatenated with a list) and a ``phone2id`` mapping
            that contains ``"<SEP>"``.
        lrc_start_times: Start time of each lyric line; multiplied by
            ``vae_frame_rate`` to obtain a frame index.
        lrc_lines: Lyric strings, paired positionally with
            ``lrc_start_times`` via ``zip`` (extra items in the longer
            sequence are silently ignored).
        total_lens: Length of the returned token array.
        vae_frame_rate: Frames per unit of ``lrc_start_times``.

    Returns:
        A tuple ``(lrc_token, text)``: ``lrc_token`` is an ``np.int64`` array
        of length ``total_lens``; ``text`` is the normalised lines
        concatenated, each followed by the literal ``"<SEP>"``.

    Raises:
        AssertionError: If a line's tokens would run past ``total_lens``.
    """
    # Known issue (note from the original author): only the prompt and the
    # two segments to be generated have start timestamps, the generated
    # content and the prompt do not contain anything like <SEP>.
    lrc_text_list = []
    lrc_token = np.zeros(total_lens, dtype=np.int64)

    # Every listed punctuation mark maps to ",". Because "," is itself part
    # of the strip set, one translate pass followed by one strip gives the
    # same result as stripping after each individual replacement.
    punct_chars = ",。!?、;:,.!?;:"
    punct_to_comma = str.maketrans(dict.fromkeys(punct_chars, ","))
    strip_chars = punct_chars + " "

    token_start = 0
    for lrc_start_time, one_line_lrc in zip(lrc_start_times, lrc_lines):
        one_line_lrc = one_line_lrc.replace("|", " ")
        one_line_lrc = one_line_lrc.translate(punct_to_comma).strip(strip_chars)

        lrc_text_list.append(one_line_lrc)
        one_line_token = tokenizer.encode(one_line_lrc)
        lrc_text_list.append("<SEP>")
        one_line_token = one_line_token + [tokenizer.phone2id["<SEP>"]]

        one_line_token = np.array(one_line_token)

        timestamp_cal_start_frame = int(lrc_start_time * vae_frame_rate)

        # Postponement: never overwrite the previous line. If this line's
        # timestamp falls inside already-written tokens, start right after.
        timestamp_cal_start_frame = max(timestamp_cal_start_frame, token_start)
        token_end = timestamp_cal_start_frame + len(one_line_token)

        # Raised explicitly (instead of a bare ``assert``) so the check still
        # runs under ``python -O``. Without it the slice below is silently
        # truncated and NumPy fails with an opaque broadcast error.
        if token_end > len(lrc_token):
            raise AssertionError(
                "The length of the lrc_token exceeds that of the vocal latent"
            )
        lrc_token[timestamp_cal_start_frame:token_end] = one_line_token
        token_start = token_end
    return lrc_token, "".join(lrc_text_list)