Spaces:
Running on Zero
Running on Zero
File size: 1,091 Bytes
import json
class CNENTokenizer:
    """Phoneme-level tokenizer built around the ``chn_eng_g2p`` function.

    The vocabulary is read from ``vocab.json`` and every id in it is shifted
    up by one so that id 0 can be used for padding.  Two further special
    tokens, ``<PUNCT>`` and ``<SEP>``, are appended after the padding entry
    has been registered.

    Attributes:
        phone2id: Mapping from phone string (or special token) to integer id.
        id2phone: Reverse mapping of ``phone2id``.
        pad_token_id: Id of ``<PAD>`` (always 0).
        punct_token_id: Id of ``<PUNCT>`` (punctuation marks token).
        sep_token_id: Id of ``<SEP>`` (sentence separation token).
        tokenizer: The ``chn_eng_g2p`` callable used by ``encode``.
    """

    def __init__(self):
        """Load the vocabulary, register special tokens and bind the g2p function.

        NOTE(review): the vocabulary path is relative to the current working
        directory, so construction presumably has to happen from the
        repository root -- confirm against the launcher.
        """
        vocab_path = "./src/YingMusicSinger/utils/f5_tts/g2p/g2p/vocab.json"
        with open(vocab_path, "r", encoding="utf-8") as vocab_file:
            raw_vocab = json.load(vocab_file)["vocab"]

        # Shift every vocabulary id by one so that 0 stays free for <PAD>.
        shifted_vocab = {}
        for symbol, raw_id in raw_vocab.items():
            shifted_vocab[symbol] = int(raw_id) + 1
        self.phone2id: dict = shifted_vocab

        self.pad_token_id = 0
        self.phone2id["<PAD>"] = self.pad_token_id

        # Punctuation marks token: takes the current size of the table as id.
        self.punct_token_id = len(self.phone2id)
        self.phone2id["<PUNCT>"] = self.punct_token_id

        # Sentence separation token: same scheme, one slot further.
        self.sep_token_id = len(self.phone2id)
        self.phone2id["<SEP>"] = self.sep_token_id

        # Reverse lookup used by decode().
        reverse_table = {}
        for symbol, token_id in self.phone2id.items():
            reverse_table[token_id] = symbol
        self.id2phone = reverse_table

        # Imported here rather than at module level, as in the original code.
        from src.YingMusicSinger.utils.f5_tts.g2p.g2p_generation import chn_eng_g2p

        self.tokenizer = chn_eng_g2p

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        The g2p function returns a ``(phones, ids)`` pair; only the ids are
        used.  Each id is incremented by one, the same offset that
        ``__init__`` applies to the vocabulary.

        Args:
            text: Input passed unchanged to the g2p function.

        Returns:
            A list with every id from the g2p function incremented by one.
        """
        _phones, raw_ids = self.tokenizer(text)
        shifted_ids = []
        for raw_id in raw_ids:
            shifted_ids.append(raw_id + 1)
        return shifted_ids

    def decode(self, token):
        """Turn a sequence of token ids back into a ``|``-separated phone string.

        Args:
            token: Iterable of ids present in ``id2phone``.

        Returns:
            The phones for the given ids joined with ``"|"``.

        Raises:
            KeyError: If an id is not present in ``id2phone``.
        """
        phones = [self.id2phone[token_id] for token_id in token]
        return "|".join(phones)
|