Spaces:

LEMAS-Project
/

LEMAS-TTS

Running on Zero

File size: 10,844 Bytes

34fb334

import os, re, regex
import langid
import uroman as ur
import jieba, zhconv
from num2words import num2words

# Point jieba at the dictionary file infer/text_norm/jieba_dict.txt, located
# relative to the directory that contains this module.
jieba.set_dictionary(dictionary_path=os.path.join(os.path.dirname(__file__) + "/../infer/text_norm/jieba_dict.txt"))
# from pypinyin.core import Pinyin
from pypinyin import pinyin, lazy_pinyin, Style

from .text_norm.txt2pinyin import _PAUSE_SYMBOL, get_phoneme_from_char_and_pinyin
from .text_norm.cn_tn import NSWNormalizer
from .text_norm.tokenizer import TextTokenizer, txt2phone
from pypinyin.contrib.tone_convert import to_initials, to_finals_tone3
from pypinyin_dict.phrase_pinyin_data import large_pinyin  # large_pinyin  #  cc_cedict
# Load the large_pinyin phrase data imported above at module import time
# (presumably this extends pypinyin's phrase table - TODO confirm).
large_pinyin.load()

class TextNorm():
    def __init__(self, dtype="phone"):
        """Load the pinyin lexicon and create one espeak tokenizer per language.

        Args:
            dtype: Stored unchanged on ``self.dtype``; it is not interpreted
                in this constructor.

        Raises:
            OSError: If the pinyin lexicon file cannot be opened.
        """
        lexicon_path = os.path.join(
            os.path.dirname(__file__), "..", "infer", "text_norm", "pinyin-lexicon-r.txt"
        )
        # Read through a context manager so the handle is always closed, and
        # skip blank lines, which have no key to index.
        self.cmn_dict = {}
        with open(lexicon_path, "r", encoding="utf-8") as lexicon_file:
            for line in lexicon_file:
                fields = line.split()
                if fields:
                    self.cmn_dict[fields[0]] = fields[1:]

        langid.set_languages(['es','pt','zh','en','de','fr','it','ru', 'vi','id','th','ja','ko','ar'])
        # Language code -> espeak voice name. "zh"/"cmn" and "ar" currently
        # have no espeak tokenizer here.
        langs = {"en":"en-us", "it":"it", "es":"es", "pt":"pt-br", "fr":"fr-fr", "de":"de", "ru":"ru", "vi":"vi", "id":"id", "th":"th", "ja":"ja", "ko":"ko"}
        text_tokenizer = {}
        for code, voice in langs.items():
            tokenizer = TextTokenizer(language=voice, backend="espeak")
            # A "cmn" key, if ever added, is reported as "zh".
            reported = "zh" if code == "cmn" else code
            text_tokenizer[code] = (reported, tokenizer)
        self.text_tokenizer = text_tokenizer
        self.cn_tn = NSWNormalizer()
        self.dtype = dtype

    def detect_lang(self, text):
        """Return the language code that langid assigns to ``text``.

        Args:
            text: The string to classify.

        Returns:
            The first element of the ``langid.classify`` result, i.e. the
            language code, matching how the other methods of this class read
            that result.
        """
        # classify() returns a (language, score) pair. Unpack the pair itself;
        # unpacking its first element would split the code into characters.
        lang, _ = langid.classify(text)
        return lang

    def sil_type(self, time_s):
        if round(time_s) < 0.4:
            return ""
        elif round(time_s) >= 0.4 and round(time_s) < 0.8:
            return "#1"
        elif round(time_s) >= 0.8 and round(time_s) < 1.5:
            return "#2"
        elif round(time_s) >= 1.5 and round(time_s) < 3.0:
            return "#3"
        elif round(time_s) >= 3.0:
            return "#4"


    def add_sil_raw(self, sub_list, start_time, end_time, target_transcript):
        txt = []
        txt_list = [x["word"] for x in sub_list]
        sil = self.sil_type(sub_list[0]["start"])
        if len(sil) > 0:
            txt.append(sil)
        txt.append(txt_list[0])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                txt.append(target_transcript)
                target_transcript = ""
            else:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
                if len(sil) > 0:
                    txt.append(sil)
                txt.append(txt_list[i])
        return ' '.join(txt)

    def add_sil(self, sub_list, start_time, end_time, target_transcript, src_lang, tar_lang):
        txts = []
        txt_list = [x["word"] for x in sub_list]
        sil = self.sil_type(sub_list[0]["start"])
        if len(sil) > 0:
            txts.append([src_lang, sil])

        if sub_list[0]["start"] < start_time:
            txts.append([src_lang, txt_list[0]])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                txts.append([tar_lang, target_transcript])
                target_transcript = ""
            else:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])
                
        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]
        
        return target_txt

    def replace_numbers_with_words(self, sentence, lang="en"):
        sentence = re.sub(r'(\d+)', r' \1 ', sentence) # add spaces around numbers
        
        def replace_with_words(match):
            num = match.group(0)
            try:
                return num2words(num, lang=lang) # Convert numbers to words
            except:
                return num # In case num2words fails (unlikely with digits but just to be safe)
        return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers


    def get_prompt(self, sub_list, start_time, end_time, src_lang):
        txts = []
        txt_list = [x["word"] for x in sub_list]

        if start_time <= sub_list[0]["start"]:
            sil = self.sil_type(sub_list[0]["start"])
            if len(sil) > 0:
                txts.append([src_lang, sil])
            txts.append([src_lang, txt_list[0]])
        
        for i in range(1, len(sub_list)):
            # if sub_list[i]["start"] <= start_time and sub_list[i]["end"] <= end_time:
            #     txts.append([tar_lang, target_transcript])
            #     target_transcript = ""
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])

        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]
        return target_txt


    def txt2pinyin(self, text):
        """Normalise text and convert it into parallel text and phoneme lists.

        The input is split on ``#<digit>`` marks. The marks "#1".."#4" are
        copied to both outputs unchanged; every other segment is normalised
        with ``self.cn_tn``, segmented with jieba, and handled per token:
        pause symbols, tokens containing CJK characters, and tokens containing
        Latin letters or "#1".."#4". Tokens matching none of these are dropped.

        Args:
            text: Input string, optionally containing "#1".."#4" marks.

        Returns:
            A ``(txts, phonemes)`` tuple of lists.
        """
        txts, phonemes = [], []
        segments = re.split(r"(#\d)", text)
        print("before norm: ", segments)
        for segment in segments:
            if segment in {'#1', '#2', '#3', '#4'}:
                txts.append(segment)
                phonemes.append(segment)
                continue
            segment = self.cn_tn.normalize(segment.strip())

            text_list = list(jieba.cut(segment))
            print("jieba cut: ", segment, text_list)
            for words in text_list:
                if words in _PAUSE_SYMBOL:
                    phonemes.append(_PAUSE_SYMBOL[words])
                    # Attach the punctuation to the preceding text item. When
                    # the text starts with punctuation there is none yet, so
                    # start a new item instead of indexing an empty list.
                    if txts:
                        txts[-1] += words
                    else:
                        txts.append(words)
                elif re.search("[\u4e00-\u9fa5]+", words):
                    syllables = lazy_pinyin(words, style=Style.TONE3, tone_sandhi=True, neutral_tone_with_five=True)
                    new_pinyin = []
                    for syllable in syllables:
                        syllable = "".join(syllable)
                        if "#" not in syllable:
                            new_pinyin.append(syllable)
                        else:
                            # A syllable containing "#" is not passed on as
                            # pinyin; the whole token is appended instead.
                            phonemes.append(words)
                    phoneme = get_phoneme_from_char_and_pinyin(words, new_pinyin)
                    phonemes += phoneme
                    txts += list(words)
                elif re.search(r"[a-zA-Z]", words) or re.search(r"#[1-4]", words):
                    phonemes.append(words.upper())
                    txts.append(words.upper())
        return txts, phonemes


    def txt2pin_phns(self, text):
        text = re.sub(r'(?<! )(' + r'[^\w\s]' + r')', r' \1', text)
        text = re.sub(r'\s+', ' ', text).strip()
        
        # print(text.split(" "))
        res_list = []
        for txt in text.split(" "):
            if txt in self.cmn_dict:
                # res_list +=  ["(zh)" + x for x in self.cmn_dict[txt]]
                res_list.append("(zh)")
                res_list.append(to_initials(txt, strict=False))
                res_list.append(to_finals_tone3(txt, neutral_tone_with_five=True))
            elif txt == '':
                continue
            elif txt[0] in {"#1", "#2", "#3", "#4"} or not bool(regex.search(r'\p{L}', txt[0][0])): 
                if len(res_list) > 0 and res_list[-1] == "_":
                    res_list.pop()
                res_list += [txt]
                continue
            else:
                if len(res_list) > 0 and res_list[-1] == "_":
                    res_list.pop()
                lang = langid.classify(txt)[0]
                lang = lang if lang in self.text_tokenizer else "en"
                tokenizer = self.text_tokenizer[lang][1]
                ipa = tokenizer.backend.phonemize([txt], separator=tokenizer.separator, strip=True, njobs=1)
                phns = ipa[0] if ipa[0][0] == "(" else f"({lang})_" + ipa[0]
                res_list += phns.replace("_", "|_|").split("|")

                # lang = phns.split(")")[0][1:]
                # phns = phns[len(lang)+3:].replace("_", "|_|")
                # phns = phns.split("|")
                # for i in range(len(phns)):
                #     if phns[i] not in {"#1", "#2", "#3", "#4", "_", ",", ".", "?", "!"}: 
                #         phns[i] = f"({lang})" + phns[i]
                # res_list += phns
            res_list.append("_")
        res = "|".join(res_list)
        res = re.sub(r'(\|_)+', '|_', res)
        return res


    def text2phn(self, sentence, lang=None):
        """Convert a sentence to a phone string.

        Sentences containing CJK characters go through ``txt2pinyin`` and
        ``txt2pin_phns`` (IPA mixed with pinyin). All others are phonemized
        whole with the tokenizer for ``lang``.

        Args:
            sentence: Input text.
            lang: Language code; detected with langid when falsy.

        Returns:
            The phone string. On the non-CJK path it is prefixed with
            "(<lang>)|" unless it already starts with "(".
        """
        if not lang:
            lang = langid.classify(sentence)[0]
        if re.search("[\u4e00-\u9fa5]+", sentence):
            _, phones = self.txt2pinyin(sentence)
            transcript_norm = " ".join(phones)
            return self.txt2pin_phns(transcript_norm)  # IPA mix Pinyin

        # Languages without a tokenizer fall back to English, as in
        # txt2pin_phns, instead of raising KeyError.
        if lang not in self.text_tokenizer:
            lang = "en"
        # NOTE(review): the sentence is phonemized as given. A num2words pass
        # used to be computed here but its result was never used, so it has
        # been dropped; confirm digits are meant to reach the phonemizer.
        cleaned = sentence.strip().replace(".", ",").replace("。", ",")
        phones = txt2phone(self.text_tokenizer[lang][1], cleaned)
        # startswith() tolerates an empty phone string.
        return phones if phones.startswith("(") else f"({lang})|" + phones


    def text2norm(self, sentence, lang=None):
        if not lang:
            lang = langid.classify(sentence)[0]
        if re.search("[\u4e00-\u9fa5]+", sentence):
            txts, phones = self.txt2pinyin(sentence)
            transcript_norm = " ".join(phones)
        else:
            transcript = self.replace_numbers_with_words(sentence, lang=lang).split(' ')
            transcript_norm = sentence
        return (lang, transcript_norm)