import os
import re

import regex
import langid
import uroman as ur
import jieba
import zhconv
from num2words import num2words
from pypinyin import pinyin, lazy_pinyin, Style
from pypinyin.contrib.tone_convert import to_initials, to_finals_tone3
from pypinyin_dict.phrase_pinyin_data import large_pinyin  # large_pinyin # cc_cedict

from .text_norm.txt2pinyin import _PAUSE_SYMBOL, get_phoneme_from_char_and_pinyin
from .text_norm.cn_tn import NSWNormalizer
from .text_norm.tokenizer import TextTokenizer, txt2phone

# Module-level side effects the rest of the file depends on: jieba must use the
# project dictionary, and pypinyin needs the large phrase table loaded.
jieba.set_dictionary(dictionary_path=os.path.join(os.path.dirname(__file__) + "/../infer/text_norm/jieba_dict.txt"))
large_pinyin.load()


class TextNorm():
    """Text normalization / phonemization front-end.

    Converts raw transcripts (Chinese, English, and a dozen other languages)
    into phoneme strings: pinyin-based phones for Chinese text, espeak IPA for
    everything else, with silence-duration pause marks ``#1``..``#4``.
    """

    def __init__(self, dtype="phone"):
        # Pinyin lexicon: one line per syllable -> "syllable phone1 phone2 ...".
        # FIX: open the file with a context manager (original leaked the handle).
        lexicon_path = os.path.join(os.path.dirname(__file__) + '/../infer/text_norm/pinyin-lexicon-r.txt')
        with open(lexicon_path, 'r', encoding="utf-8") as f:
            cmn_lexicon = [x.strip().split() for x in f]
        self.cmn_dict = {x[0]: x[1:] for x in cmn_lexicon}

        # Restrict langid to the languages we can actually handle downstream.
        langid.set_languages(['es', 'pt', 'zh', 'en', 'de', 'fr', 'it', 'ru',
                              'vi', 'id', 'th', 'ja', 'ko', 'ar'])
        # langid code -> espeak voice name.
        langs = {"en": "en-us", "it": "it", "es": "es", "pt": "pt-br",
                 "fr": "fr-fr", "de": "de", "ru": "ru", "vi": "vi",
                 "id": "id", "th": "th", "ja": "ja", "ko": "ko"}
        # "zh":"cmn", "cmn":"cmn", "ar":"ar-sa"}
        text_tokenizer = {}
        for k, v in langs.items():
            tokenizer = TextTokenizer(language=v, backend="espeak")
            lang = "zh" if k == "cmn" else k
            text_tokenizer[k] = (lang, tokenizer)
        self.text_tokenizer = text_tokenizer
        self.cn_tn = NSWNormalizer()
        self.dtype = dtype

    def detect_lang(self, text):
        """Return the langid language code (e.g. ``"en"``) for *text*.

        BUG FIX: ``langid.classify()`` returns a ``(lang, score)`` tuple; the
        original ``lang, _ = langid.classify(text)[0]`` unpacked the two
        characters of the language *string* (yielding ``"e"`` for ``"en"``).
        """
        lang, _score = langid.classify(text)
        return lang

    def sil_type(self, time_s):
        """Map a silence duration in seconds to a pause mark.

        Returns "" (<0.4s), "#1" (<0.8s), "#2" (<1.5s), "#3" (<3.0s), "#4" (>=3.0s).

        BUG FIX: the original compared ``round(time_s)`` — an integer — against
        fractional thresholds, which made the "#1" band unreachable and
        misclassified every sub-second pause.
        """
        if time_s < 0.4:
            return ""
        elif time_s < 0.8:
            return "#1"
        elif time_s < 1.5:
            return "#2"
        elif time_s < 3.0:
            return "#3"
        else:
            return "#4"

    def add_sil_raw(self, sub_list, start_time, end_time, target_transcript):
        """Rebuild a transcript with pause marks, splicing *target_transcript*
        in place of the words whose timing falls inside [start_time, end_time].

        *sub_list* is a list of dicts with "word", "start", "end" keys.
        Returns a single space-joined string.
        """
        txt = []
        txt_list = [x["word"] for x in sub_list]
        # Leading silence before the first word.
        sil = self.sil_type(sub_list[0]["start"])
        if len(sil) > 0:
            txt.append(sil)
        txt.append(txt_list[0])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                # Word lies inside the replacement window: emit the target
                # transcript once, then blank it so it is not repeated.
                txt.append(target_transcript)
                target_transcript = ""
            else:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i - 1]["end"])
                if len(sil) > 0:
                    txt.append(sil)
                txt.append(txt_list[i])
        return ' '.join(txt)

    def add_sil(self, sub_list, start_time, end_time, target_transcript, src_lang, tar_lang):
        """Like add_sil_raw, but returns ``[[lang, text], ...]`` runs, with the
        spliced *target_transcript* tagged *tar_lang* and everything else
        tagged *src_lang*. Consecutive same-language runs are merged.
        """
        txts = []
        txt_list = [x["word"] for x in sub_list]
        sil = self.sil_type(sub_list[0]["start"])
        if len(sil) > 0:
            txts.append([src_lang, sil])
        # Keep the first word only if it starts before the replacement window.
        if sub_list[0]["start"] < start_time:
            txts.append([src_lang, txt_list[0]])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                txts.append([tar_lang, target_transcript])
                target_transcript = ""
            else:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i - 1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])
        # FIX: guard against an empty accumulation (original IndexError'd).
        if not txts:
            return []
        # Merge consecutive entries that share a language tag.
        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]
        return target_txt

    def replace_numbers_with_words(self, sentence, lang="en"):
        """Expand digit runs in *sentence* into words for *lang* via num2words."""
        sentence = re.sub(r'(\d+)', r' \1 ', sentence)  # add spaces around numbers

        def replace_with_words(match):
            num = match.group(0)
            try:
                return num2words(num, lang=lang)  # Convert numbers to words
            # FIX: narrowed from a bare ``except`` (which would also swallow
            # KeyboardInterrupt); still best-effort — keep the digits on failure.
            except Exception:
                return num
        return re.sub(r'\b\d+\b', replace_with_words, sentence)  # Regular expression that matches numbers

    def get_prompt(self, sub_list, start_time, end_time, src_lang):
        """Collect the words of *sub_list* lying inside [start_time, end_time]
        (with pause marks) as merged ``[[lang, text], ...]`` runs, all tagged
        *src_lang*.
        """
        txts = []
        txt_list = [x["word"] for x in sub_list]
        if start_time <= sub_list[0]["start"]:
            sil = self.sil_type(sub_list[0]["start"])
            if len(sil) > 0:
                txts.append([src_lang, sil])
            txts.append([src_lang, txt_list[0]])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i - 1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])
        # FIX: guard against an empty selection (original IndexError'd).
        if not txts:
            return []
        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]
        return target_txt

    def txt2pinyin(self, text):
        """Convert mixed Chinese/Latin *text* into parallel (chars, phonemes).

        The text is split on pause marks ``#1``..``#4`` (kept as-is), each
        segment is normalized (NSWNormalizer) and segmented with jieba; Chinese
        words go through lazy_pinyin (TONE3, sandhi) and the pinyin lexicon,
        Latin words are upper-cased and passed through.
        """
        txts, phonemes = [], []
        texts = re.split(r"(#\d)", text)
        print("before norm: ", texts)
        for text in texts:
            if text in {'#1', '#2', '#3', '#4'}:
                txts.append(text)
                phonemes.append(text)
                continue
            text = self.cn_tn.normalize(text.strip())
            text_list = list(jieba.cut(text))
            print("jieba cut: ", text, text_list)
            for words in text_list:
                if words in _PAUSE_SYMBOL:
                    phonemes.append(_PAUSE_SYMBOL[words])
                    # FIX: original did ``txts[-1] += words`` unconditionally
                    # and IndexError'd when the text began with a pause symbol.
                    if txts:
                        txts[-1] += words
                    else:
                        txts.append(words)
                elif re.search("[\u4e00-\u9fa5]+", words):
                    pinyin = lazy_pinyin(words, style=Style.TONE3, tone_sandhi=True,
                                         neutral_tone_with_five=True)
                    new_pinyin = []
                    for x in pinyin:
                        x = "".join(x)
                        if "#" not in x:
                            new_pinyin.append(x)
                        else:
                            # Un-pinyin-able piece: fall back to the raw chars.
                            phonemes.append(words)
                            continue
                    phoneme = get_phoneme_from_char_and_pinyin(words, new_pinyin)
                    phonemes += phoneme
                    txts += list(words)
                elif re.search(r"[a-zA-Z]", words) or re.search(r"#[1-4]", words):
                    phonemes.append(words.upper())
                    txts.append(words.upper())
        return txts, phonemes

    def txt2pin_phns(self, text):
        """Turn a space-joined phone string (pinyin syllables, pause marks and
        embedded foreign words) into a "|"-separated sequence where pinyin
        tokens pass through and foreign words are espeak-phonemized with a
        ``(lang)`` prefix.

        NOTE(review): the original source was garbled at the head of this
        method — the ``re.sub`` pattern and the loop header / pinyin test were
        lost in transit. The tail below is the surviving original code; the
        tokenization and the pinyin membership test are conservative
        reconstructions (``self.cmn_dict`` is otherwise unused, so it is very
        likely the pinyin lookup). Recover the exact lines from version
        control before relying on this method.
        """
        # text = re.sub(r'(?...', ..., text)   # lost pattern — TODO recover
        res_list = []
        for txt in text.strip().split():  # assumed token source — TODO confirm
            # assumed pinyin / pause-mark test — TODO confirm against VCS
            if txt in self.cmn_dict or re.match(r"#[1-4]$", txt):
                if len(res_list) > 0 and res_list[-1] == "_":
                    res_list.pop()
                res_list += [txt]
                continue
            else:
                if len(res_list) > 0 and res_list[-1] == "_":
                    res_list.pop()
                lang = langid.classify(txt)[0]
                lang = lang if lang in self.text_tokenizer else "en"
                tokenizer = self.text_tokenizer[lang][1]
                ipa = tokenizer.backend.phonemize([txt], separator=tokenizer.separator,
                                                  strip=True, njobs=1)
                # Prefix the language tag unless espeak already emitted one.
                phns = ipa[0] if ipa[0][0] == "(" else f"({lang})_" + ipa[0]
                res_list += phns.replace("_", "|_|").split("|")
            res_list.append("_")
        res = "|".join(res_list)
        res = re.sub(r'(\|_)+', '|_', res)  # collapse repeated separators
        return res

    def text2phn(self, sentence, lang=None):
        """Phonemize *sentence*: pinyin-mixed phones for Chinese input,
        pure espeak IPA (with ``(lang)|`` prefix) otherwise.
        """
        if not lang:
            lang = langid.classify(sentence)[0]
        if re.search("[\u4e00-\u9fa5]+", sentence):
            txts, phones = self.txt2pinyin(sentence)
            transcript_norm = " ".join(phones)
            phones = self.txt2pin_phns(transcript_norm)  # IPA mix Pinyin
        else:
            # NOTE(review): the number-expanded transcript is computed but
            # never used — the original likely meant to feed it into
            # transcript_norm; preserved as-is to keep behavior identical.
            transcript = self.replace_numbers_with_words(sentence, lang=lang).split(' ')
            transcript_norm = sentence
            # FIX: fall back to "en" for detected languages with no tokenizer
            # (e.g. "ar"), matching txt2pin_phns, instead of KeyError-ing.
            lang = lang if lang in self.text_tokenizer else "en"
            # All IPA
            phones = txt2phone(self.text_tokenizer[lang][1],
                               transcript_norm.strip().replace(".", ",").replace("。", ","))
            phones = f"({lang})|" + phones if phones[0] != "(" else phones
        return phones

    def text2norm(self, sentence, lang=None):
        """Return ``(lang, normalized_transcript)`` without phonemizing:
        pinyin phones joined by spaces for Chinese input, the raw sentence
        otherwise.
        """
        if not lang:
            lang = langid.classify(sentence)[0]
        if re.search("[\u4e00-\u9fa5]+", sentence):
            txts, phones = self.txt2pinyin(sentence)
            transcript_norm = " ".join(phones)
        else:
            # NOTE(review): computed-and-discarded, same as in text2phn —
            # preserved for behavioral parity.
            transcript = self.replace_numbers_with_words(sentence, lang=lang).split(' ')
            transcript_norm = sentence
        return (lang, transcript_norm)