# Text normalization and phonemization utilities.
import os, re, regex
import langid
import uroman as ur
import jieba, zhconv
from num2words import num2words

jieba.set_dictionary(dictionary_path=os.path.join(os.path.dirname(__file__), "..", "infer", "text_norm", "jieba_dict.txt"))

from pypinyin import pinyin, lazy_pinyin, Style
from pypinyin.contrib.tone_convert import to_initials, to_finals_tone3
from pypinyin_dict.phrase_pinyin_data import large_pinyin  # alternative: cc_cedict
from .text_norm.txt2pinyin import _PAUSE_SYMBOL, get_phoneme_from_char_and_pinyin
from .text_norm.cn_tn import NSWNormalizer
from .text_norm.tokenizer import TextTokenizer, txt2phone

# Load the large phrase-pinyin dictionary for better polyphone disambiguation.
large_pinyin.load()
class TextNorm:
    def __init__(self, dtype="phone"):
        # Mandarin pinyin-syllable -> phoneme lexicon.
        lexicon_path = os.path.join(os.path.dirname(__file__), "..", "infer", "text_norm", "pinyin-lexicon-r.txt")
        with open(lexicon_path, "r", encoding="utf-8") as f:
            cmn_lexicon = [x.strip().split() for x in f.readlines()]
        self.cmn_dict = {x[0]: x[1:] for x in cmn_lexicon}

        # Restrict langid to the languages we can actually phonemize.
        langid.set_languages(['es', 'pt', 'zh', 'en', 'de', 'fr', 'it', 'ru', 'vi', 'id', 'th', 'ja', 'ko', 'ar'])
        langs = {"en": "en-us", "it": "it", "es": "es", "pt": "pt-br", "fr": "fr-fr", "de": "de",
                 "ru": "ru", "vi": "vi", "id": "id", "th": "th", "ja": "ja", "ko": "ko"}  # "zh"/"cmn" is handled via pinyin instead of espeak
        text_tokenizer = {}
        for k, v in langs.items():
            tokenizer = TextTokenizer(language=v, backend="espeak")
            lang = "zh" if k == "cmn" else k
            text_tokenizer[k] = (lang, tokenizer)
        self.text_tokenizer = text_tokenizer
        self.cn_tn = NSWNormalizer()
        self.dtype = dtype
    def detect_lang(self, text):
        # langid.classify returns a (lang, score) tuple, not a list of tuples.
        lang, _ = langid.classify(text)
        return lang
    def sil_type(self, time_s):
        """Map a silence duration in seconds to a pause marker (#1-#4)."""
        # Compare the raw duration; rounding to an integer would collapse the
        # sub-second thresholds and make "#1" unreachable.
        if time_s < 0.4:
            return ""
        elif time_s < 0.8:
            return "#1"
        elif time_s < 1.5:
            return "#2"
        elif time_s < 3.0:
            return "#3"
        else:
            return "#4"
    def add_sil_raw(self, sub_list, start_time, end_time, target_transcript):
        """Interleave pause markers between words and splice target_transcript
        in place of the words that fall inside [start_time, end_time]."""
        txt = []
        txt_list = [x["word"] for x in sub_list]
        sil = self.sil_type(sub_list[0]["start"])
        if len(sil) > 0:
            txt.append(sil)
        txt.append(txt_list[0])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                txt.append(target_transcript)
                target_transcript = ""  # only insert the transcript once
            else:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i - 1]["end"])
                if len(sil) > 0:
                    txt.append(sil)
                txt.append(txt_list[i])
        return ' '.join(txt)
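    # sub_list is expected to look like whisper-style word timestamps (field
    # names assumed from usage above), e.g.
    # [{"word": "hello", "start": 0.0, "end": 0.5}, {"word": "world", "start": 1.2, "end": 1.6}]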
    def add_sil(self, sub_list, start_time, end_time, target_transcript, src_lang, tar_lang):
        """Like add_sil_raw, but keeps (lang, text) pairs so source words and the
        spliced-in target transcript can carry different language tags."""
        txts = []
        txt_list = [x["word"] for x in sub_list]
        sil = self.sil_type(sub_list[0]["start"])
        if len(sil) > 0:
            txts.append([src_lang, sil])
        if sub_list[0]["start"] < start_time:
            txts.append([src_lang, txt_list[0]])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                txts.append([tar_lang, target_transcript])
                target_transcript = ""  # only insert the transcript once
            else:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i - 1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])
        # Merge consecutive chunks that share the same language tag.
        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]
        return target_txt
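    # Example (assumed shapes): with src_lang="en" and tar_lang="zh", the result
    # is a list of merged language-tagged chunks such as
    # [["en", "hello #1"], ["zh", " 你好"]].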
    def replace_numbers_with_words(self, sentence, lang="en"):
        sentence = re.sub(r'(\d+)', r' \1 ', sentence)  # add spaces around numbers

        def replace_with_words(match):
            num = match.group(0)
            try:
                return num2words(num, lang=lang)  # convert digits to words
            except Exception:
                return num  # in case num2words fails (e.g. unsupported lang)

        return re.sub(r'\b\d+\b', replace_with_words, sentence)
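    # Example (note the extra spaces introduced around digits):
    # replace_numbers_with_words("I have 3 cats") -> "I have  three  cats"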
    def get_prompt(self, sub_list, start_time, end_time, src_lang):
        """Collect the source-language words (with pause markers) that fall
        inside [start_time, end_time], as merged (lang, text) pairs."""
        txts = []
        txt_list = [x["word"] for x in sub_list]
        if start_time <= sub_list[0]["start"]:
            sil = self.sil_type(sub_list[0]["start"])
            if len(sil) > 0:
                txts.append([src_lang, sil])
            txts.append([src_lang, txt_list[0]])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i - 1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])
        if not txts:  # no word fell inside the window
            return []
        # Merge consecutive chunks that share the same language tag.
        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]
        return target_txt
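    # Example (assumed): returns the prompt words inside the window as merged
    # chunks, e.g. [["en", "#1 hello world"]]; [] when no word is in the window.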
    def txt2pinyin(self, text):
        """Convert mixed Chinese/Latin text (possibly containing #1-#4 pause
        markers) into a character list plus a pinyin-based phoneme list."""
        txts, phonemes = [], []
        texts = re.split(r"(#\d)", text)
        print("before norm: ", texts)
        for text in texts:
            if text in {'#1', '#2', '#3', '#4'}:
                txts.append(text)
                phonemes.append(text)
                continue
            text = self.cn_tn.normalize(text.strip())
            text_list = list(jieba.cut(text))
            print("jieba cut: ", text, text_list)
            for words in text_list:
                if words in _PAUSE_SYMBOL:
                    # Punctuation mapped to a pause symbol; attach it to the previous token.
                    phonemes.append(_PAUSE_SYMBOL[words])
                    if txts:
                        txts[-1] += words
                    else:
                        txts.append(words)
                elif re.search("[\u4e00-\u9fa5]+", words):
                    pinyin = lazy_pinyin(words, style=Style.TONE3, tone_sandhi=True, neutral_tone_with_five=True)
                    new_pinyin = []
                    for x in pinyin:
                        x = "".join(x)
                        if "#" not in x:
                            new_pinyin.append(x)
                        else:
                            # lazy_pinyin leaked a marker; keep the raw word instead.
                            phonemes.append(words)
                            continue
                    phoneme = get_phoneme_from_char_and_pinyin(words, new_pinyin)
                    phonemes += phoneme
                    txts += list(words)
                elif re.search(r"[a-zA-Z]", words) or re.search(r"#[1-4]", words):
                    phonemes.append(words.upper())
                    txts.append(words.upper())
        return txts, phonemes
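    # Example (assumed output shape): txt2pinyin("你好#1world") would yield roughly
    # txts = ['你', '好', '#1', 'WORLD'] and phonemes built from the pinyin
    # 'ni3'/'hao3' via get_phoneme_from_char_and_pinyin, plus '#1' and 'WORLD'.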
    def txt2pin_phns(self, text):
        """Convert a space-separated mix of pinyin syllables, pause markers and
        foreign words into a '|'-separated phone string with language tags."""
        # Ensure punctuation is separated from the preceding token by a space.
        text = re.sub(r'(?<! )([^\w\s])', r' \1', text)
        text = re.sub(r'\s+', ' ', text).strip()
        res_list = []
        for txt in text.split(" "):
            if txt in self.cmn_dict:
                # A known pinyin syllable: split into initial + tone3 final.
                res_list.append("(zh)")
                res_list.append(to_initials(txt, strict=False))
                res_list.append(to_finals_tone3(txt, neutral_tone_with_five=True))
            elif txt == '':
                continue
            elif txt in {"#1", "#2", "#3", "#4"} or not regex.search(r'\p{L}', txt[0]):
                # Pause markers and punctuation pass through; drop a trailing "_" first.
                if len(res_list) > 0 and res_list[-1] == "_":
                    res_list.pop()
                res_list += [txt]
                continue
            else:
                # Foreign word: phonemize to IPA with the per-language espeak backend.
                if len(res_list) > 0 and res_list[-1] == "_":
                    res_list.pop()
                lang = langid.classify(txt)[0]
                lang = lang if lang in self.text_tokenizer else "en"
                tokenizer = self.text_tokenizer[lang][1]
                ipa = tokenizer.backend.phonemize([txt], separator=tokenizer.separator, strip=True, njobs=1)
                phns = ipa[0] if ipa[0][0] == "(" else f"({lang})_" + ipa[0]
                res_list += phns.replace("_", "|_|").split("|")
            res_list.append("_")
        res = "|".join(res_list)
        res = re.sub(r'(\|_)+', '|_', res)  # collapse repeated separators
        return res
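    # Example (assumed lexicon contents): txt2pin_phns("ni3 hao3 #2 hello") would
    # produce roughly "(zh)|n|i3|_|(zh)|h|ao3|#2|(en)|_|həlˈoʊ|_"; the exact IPA
    # depends on the espeak backend.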
    def text2phn(self, sentence, lang=None):
        if not lang:
            lang = langid.classify(sentence)[0]
        if re.search("[\u4e00-\u9fa5]+", sentence):
            # Chinese (or mixed) text: go through pinyin, then mix IPA with pinyin phones.
            txts, phones = self.txt2pinyin(sentence)
            transcript_norm = " ".join(phones)
            phones = self.txt2pin_phns(transcript_norm)  # IPA mixed with pinyin
        else:
            # Non-Chinese text: spell out digits, then phonemize everything to IPA.
            transcript_norm = self.replace_numbers_with_words(sentence, lang=lang)
            lang = lang if lang in self.text_tokenizer else "en"
            phones = txt2phone(self.text_tokenizer[lang][1], transcript_norm.strip().replace(".", ",").replace("。", ","))
            phones = f"({lang})|" + phones if phones[0] != "(" else phones
        return phones
    def text2norm(self, sentence, lang=None):
        if not lang:
            lang = langid.classify(sentence)[0]
        if re.search("[\u4e00-\u9fa5]+", sentence):
            txts, phones = self.txt2pinyin(sentence)
            transcript_norm = " ".join(phones)
        else:
            # Spell out digits so the normalized transcript matches what gets phonemized.
            transcript_norm = self.replace_numbers_with_words(sentence, lang=lang)
        return (lang, transcript_norm)
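

# Minimal usage sketch, assuming the lexicon/dictionary files under
# ../infer/text_norm/ exist on disk and the espeak backend is installed
# (TextNorm() cannot be constructed otherwise).
if __name__ == "__main__":
    tn = TextNorm(dtype="phone")
    print(tn.detect_lang("hello world"))   # e.g. "en"
    print(tn.sil_type(1.0))                # "#2"
    print(tn.text2norm("I have 3 cats"))   # e.g. ("en", "I have  three  cats")
    print(tn.text2phn("你好#1world"))       # mixed pinyin/IPA phone string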