Spaces: Running on Zero
| import re | |
| import regex | |
| import inflect | |
| import unicodedata | |
| from lingua import Language, LanguageDetectorBuilder | |
| from builtins import str as unicode | |
| from tn.chinese.normalizer import Normalizer as ZhNormalizer | |
| from tn.english.normalizer import Normalizer as EnNormalizer | |
| from fireredtts.modules.text_normalizer.regex_common import * | |
| from fireredtts.modules.text_normalizer.utils import * | |
def preprocess_text(sentence):
    """Clean and canonicalize raw input text before TTS normalization.

    Pipeline: drop undecodable bytes, remove Unicode format/private-use
    codepoints, strip accents, remove kaomoji, convert full-width chars to
    half-width (keeping 。,: for after number TN), collapse whitespace and
    ellipses, then reduce punctuation to the set accepted by
    ``is_valid_char``.

    Args:
        sentence: raw input string.

    Returns:
        Cleaned string; a space between Chinese characters is converted
        to a "," pause marker.
    """
    # Round-trip through UTF-8, silently dropping undecodable sequences.
    sentence = bytes(sentence, "utf-8").decode("utf-8", "ignore")
    # Remove format characters (category Cf) except ZWJ (U+200D).
    sentence = regex.sub("[\p{Cf}--[\u200d]]", "", sentence, flags=regex.V1)
    # Remove private-use characters.
    sentence = regex.sub("\p{Co}", "", sentence)
    sentence = sentence.replace("\u00a0", " ")  # NBSP -> plain space
    sentence = sentence.replace("\ufffd", "")  # drop U+FFFD replacement char
    # Line/paragraph separators -> newline (turned into "," below).
    sentence = regex.sub("\p{Zl}", "\n", sentence)
    sentence = regex.sub("\p{Zp}", "\n", sentence)
    sentence = unicode(sentence)
    # Strip accents: NFD-decompose, then drop combining marks (Mn).
    sentence = "".join(
        char
        for char in unicodedata.normalize("NFD", sentence)
        if unicodedata.category(char) != "Mn"
    )
    sentence = strip_kaomoji(sentence)
    # Full to half-width with exemption (to be converted after number TN): 。,:
    sentence = f2b(sentence, exemption="。,:")
    # Clean spaces: newlines/tabs become commas; collapse ellipses and runs
    # of spaces.
    sentence = sentence.replace("\n", ",")
    sentence = sentence.replace("\t", ",")
    sentence = sentence.replace("\r", ",")
    sentence = re.sub(r"[。.]{3,}", "…", sentence)
    sentence = re.sub(r"[…⋯]{1,}", "…", sentence)
    sentence = re.sub(r"[ ]+", " ", sentence)
    sentence = sentence.strip()
    # Punctuation reduction: remap via `symbol_reduction`, turn a space
    # between Chinese characters into ",", and drop invalid characters.
    result = ""
    for idx, char in enumerate(sentence):
        if char in symbol_reduction:
            char = symbol_reduction[char]
        if char == " ":
            if idx == 0:
                continue
            # BUGFIX: bound-check idx + 1. `.strip()` above only removes
            # literal trailing whitespace; `symbol_reduction` may map the
            # *last* character to a space, which previously raised
            # IndexError on `sentence[idx + 1]`.
            if (
                idx + 1 < len(sentence)
                and is_chinese(sentence[idx + 1])
                and (is_chinese(sentence[idx - 1]) or sentence[idx - 1] in '") ')
            ):
                result += ","
            else:
                result += " "
            continue
        if is_valid_char(char):
            result += char
    result = re.sub(r"[ ]+", " ", result)
    return result
def rettt(sentence):
    """Expand common English abbreviations into speakable words.

    Handles months, weekdays, titles (Mr./Mrs./Ms.), St./Rd., Latin
    abbreviations (i.e., e.g., a.m./p.m., B.C./A.D.), and "@".

    Args:
        sentence: input text (already preprocessed).

    Returns:
        The text with abbreviations expanded; unmatched text is unchanged.
    """
    # Handle abbreviations for all languages.
    sentence = sentence.replace("&nd", "and")
    # Months.
    sentence = sentence.replace("Jan.", "january")
    # BUGFIX: output said "febrary" — corrected spelling.
    sentence = sentence.replace("Feb.", "february")
    sentence = sentence.replace("Mar.", "march")
    sentence = sentence.replace("Apr.", "april")
    sentence = sentence.replace("May.", "may")
    sentence = sentence.replace("Jun.", "june")
    sentence = sentence.replace("Jul.", "july")
    sentence = sentence.replace("Aug.", "august")
    # "Sept." must be tried before "Sep." (longer match first).
    sentence = sentence.replace("Sept.", "september")
    sentence = sentence.replace("Sep.", "september")
    sentence = sentence.replace("Oct.", "october")
    sentence = sentence.replace("Nov.", "november")
    sentence = sentence.replace("Dec.", "december")
    # Weekdays.
    sentence = sentence.replace("Mon.", "monday")
    sentence = sentence.replace("Tues.", "tuesday")
    sentence = sentence.replace("Wed.", "wednesday")
    sentence = sentence.replace("Thur.", "thursday")
    sentence = sentence.replace("Fri.", "friday")
    sentence = sentence.replace("Sat.", "saturday")
    # Skip when the whole input is exactly "Sun." (ambiguous on its own).
    if sentence != "Sun.":
        sentence = sentence.replace("Sun.", "sunday")
    # "St." followed by a capitalized name reads "saint"; otherwise "street".
    sentence = re.sub(r" St\. ([A-Z])", r" saint \1", sentence)
    sentence = re.sub(r" St\.", " street", sentence)
    sentence = re.sub(r" Rd\.", " road", sentence)
    # Spellable abbreviations — "_" keeps letters apart until the later
    # "_" -> " " cleanup in TextNormalizer.tn.
    sentence = re.sub(r"[Aa]\.[Mm]\.", "A_M", sentence)
    sentence = re.sub(r"[Pp]\.[Mm]\.", "P_M", sentence)
    sentence = re.sub(r"[Bb]\.[Cc]\.", "B_C", sentence)
    # BUGFIX: class was [Ad] (matched "A"/"d" but not "a") — now [Aa].
    sentence = re.sub(r"[Aa]\.[Dd]\.", "A_D", sentence)
    # Titles.
    sentence = sentence.replace("Mr.", "mister")
    sentence = sentence.replace("Ms.", "miss")
    sentence = sentence.replace("Mrs.", "misses")
    sentence = sentence.replace("Ph.D", "P_H_D")
    # Latin / internet shorthand ("btw." before "btw": longer match first).
    sentence = sentence.replace("i.e.", "that is")
    sentence = sentence.replace("e.g.", "for example")
    sentence = sentence.replace("btw.", "by the way")
    sentence = sentence.replace("btw", "by the way")
    sentence = sentence.replace("b.t.w.", "by the way")
    sentence = sentence.replace("@", " at ")
    return sentence
class TextNormalizer:
    """Bilingual (Chinese/English) text-normalization front-end for TTS.

    Detects the language with lingua (restricted to zh/en), routes the text
    through the matching WeTextProcessing normalizer, and applies shared
    post-processing (spacing, capital-word spelling, punctuation cleanup).

    NOTE(review): block nesting below was reconstructed from a source whose
    indentation was lost in extraction — confirm against the original repo.
    """

    def __init__(self):
        # Detector deliberately limited to the two supported languages.
        self.language_detector = LanguageDetectorBuilder.from_languages(
            Language.ENGLISH, Language.CHINESE
        ).build()
        self.zh_normalizer = ZhNormalizer()
        self.en_normalizer = EnNormalizer()
        # Fallback digit-to-words converter for the English branch.
        self.inflect_parser = inflect.engine()
        self.lang2token = {Language.ENGLISH: "en", Language.CHINESE: "zh"}

    def tn(self, text):
        """Normalize `text` for TTS.

        Returns:
            (normalized_text, lang_token) where lang_token is "en" or "zh"
            ("zh" is the fallback for undetected languages).
        """
        text = preprocess_text(text)
        text = rettt(text)  # regex replacements
        # for non chinese languages
        language = self.language_detector.detect_language_of(text)
        # enforce chinese if text contains any chinese character
        if contains_chinese(text):
            language = Language.CHINESE
        text_lang = self.lang2token.get(language, "zh")
        # NOTE(review): this override runs AFTER text_lang was computed, so
        # an all-caps/digit string gets the zh normalizer while text_lang
        # may still say "en" — looks intentional but confirm.
        if is_upper_eng_and_digit(text):
            language = Language.CHINESE
        if language == Language.CHINESE:
            text = self.zh_normalizer.normalize(text)
            # print("---text after zh_normalizer:", text)
            # Full-width punctuation for the zh branch.
            text = text.replace("\n", "")
            text = text.replace(",", ",")
            text = text.replace(".", "。")
            text = re.sub(r"[,,]+$", "。", text)
            # print("---text after zh_normalizer 2:", text)
        else:
            # Keep only ASCII the English normalizer understands.
            text = re.sub(r"[^ 0-9A-Za-z\[\]'.,:?!_\-]", "", text)
            text = self.en_normalizer.normalize(text)
            # fallback number normalization for digits the normalizer left
            pieces = re.split(r"(\d+)", text)
            text = "".join(
                [
                    self.inflect_parser.number_to_words(p) if p.isnumeric() else p
                    for p in pieces
                    if len(p) > 0
                ]
            )
        # cleanup
        text = text.replace("_", " ")
        text = re.sub(r"[ ]+", " ", text)
        # spell capital words, e.g. "ABC" -> "A B C"
        pieces = re.split(r"([A-Z]{2,4}|[ ])", text)
        for idx, p in enumerate(pieces):
            if re.match("[A-Z]{2,4}", p):
                pieces[idx] = " ".join(p)
        text = " ".join([p for p in pieces if p != " "])
        # post TN full to half
        # text = text.replace("。", ".")
        # text = text.replace(",", ",")
        # text = text.replace(":", ":")
        # model limitations
        text = text.lower().strip()
        text = text.replace('"', "")
        text = text.replace("·", " ")
        # text = re.sub("[…~!,&*%$#^:;!:;]+", ",", text)
        text = re.sub("[…~!&*%$#^:;!:;]+", ",", text)
        text = re.sub("[,]+", ",", text)
        text = re.sub(r"[,. ]+$", ".", text)
        # Ensure terminal punctuation ("。." artifact fixed just below).
        if len(text) > 0 and text[-1] not in ".?":
            text = text + "."
        text = text.replace("。.", "。")
        return text, text_lang