Spaces:
Running
on
Zero
Running
on
Zero
| from fireredtts.modules.text_normalizer.regex_common import * | |
| from sentencex import segment | |
| import re | |
# Lookup table from punctuation/symbol variants to the reduced character each
# one is replaced with. How the table is applied is not visible in this part
# of the file.
symbol_reduction = {
    # Quotation-like marks -> ASCII double quote.
    "「": '"',
    "」": '"',
    "`": '"',
    "〝": '"',
    "〞": '"',
    "‟": '"',
    "„": '"',
    # Bracket variants -> ASCII parentheses.
    "{": "(",
    "}": ")",
    "【": "(",
    "】": ")",
    "〖": "(",
    "〗": ")",
    "〔": "(",
    "〕": ")",
    "〘": "(",
    "〙": ")",
    "《": "(",
    "》": ")",
    "⦅": "(",
    "⦆": ")",
    "〚": "(",
    "〛": ")",
    "『": '"',
    "』": '"',
    # NOTE(review): the next four keys repeat entries that already appear
    # above with the same values, so as written they have no effect. They may
    # originally have been width variants of those characters — confirm
    # against the upstream file.
    "「": '"',
    "」": '"',
    "{": "(",
    "}": ")",
    "〈": "(",
    "〉": ")",
    # Bullets, wavy lines and tildes.
    "•": "·",
    "‧": "·",
    "〰": "…",
    "﹏": "…",
    "〜": "~",
    # NOTE(review): several entries in this table ("~", "+", "、", "。", ">",
    # "=", "%") map a character to itself as written; the "、" and "。" ones
    # look deliberate, the ASCII ones may be damaged full-width keys — TODO
    # confirm.
    "~": "~",
    "+": "+",
    "、": "、",
    "。": "。",
    # Vertical / small presentation forms of punctuation -> plain punctuation.
    "︐": ",",
    "﹐": ",",
    "︑": "、",
    "﹑": "、",
    "︒": "。",
    "︓": ":",
    "﹕": ":",
    "︔": ";",
    "﹔": ";",
    "︕": "!",
    "﹗": "!",
    "︖": "?",
    "﹖": "?",
    "﹙": "(",
    "﹚": ")",
    "﹪": "%",
    "﹠": "&",
    ">": ">",
    # A vertical bar is turned into an enumeration comma.
    "|": "、",
    "=": "=",
    # Hyphen and dash variants -> ASCII hyphen-minus.
    "‐": "-",
    "‑": "-",
    "‒": "-",
    "–": "-",
    "—": "-",
    "―": "-",
    "%": "%",
    "μ": "u",
}
# "Strong" break points: one of the listed sentence-closing punctuation or
# closing-bracket characters, a carriage return or line feed, or a space
# followed by a period.
#
# The pattern is a raw string so that the regex escapes (\), \], \., \r, \n)
# reach the regex engine exactly as written; in a normal string literal
# "\)", "\]" and "\." are invalid string escapes, which Python reports with a
# SyntaxWarning from 3.12 on.
# NOTE(review): ";", "!" and "?" each appear twice inside the character class.
# That is harmless, but one of each pair may originally have been a full-width
# form — confirm against the upstream file.
strong_break = re.compile(r"([。”;;!!:…??)\)\]』】」}~\r\n]| \.)", re.UNICODE)
# "Weak" break points: a single character class made of the symbol and
# supplementary-plane code-point ranges listed below, plus tab, comma, period
# and space.
weak_break = re.compile(
    "".join(
        (
            "[",
            # Code-point ranges written with 8-digit escapes.
            "\U00002702-\U000027b0",
            "\U0001f926-\U0001f937",
            "\U00010000-\U0001fbff",
            "\U00030000-\U0010ffff",
            # Ranges and single code points written with 4-digit escapes.
            "\u2640-\u2642",
            "\u2600-\u2b55",
            "\u23cf\u23e9\u231a\ufe0f\u3030",
            # Plain separators.
            "\t,,. ",
            "]",
        )
    ),
    flags=re.UNICODE,
)
def contains_chinese(text):
    """Return True when ``chinese_regex`` finds a match anywhere in *text*."""
    found = chinese_regex.search(text)
    return bool(found)
def strip_kaomoji(text):
    """Return *text* with every ``kaomoji_regex`` match replaced by one space."""
    filler = " "
    return kaomoji_regex.sub(filler, text)
def is_chinese(char):
    """Match *char* against ``chinese_char_regex`` from its first character.

    The result of ``.match`` is returned unchanged (not coerced to ``bool``),
    so callers get a truthy/falsy value rather than a strict boolean.
    """
    hit = chinese_char_regex.match(char)
    return hit
def is_eng_and_digit(char):
    """Match *char* against ``eng_and_digit_char_regex`` from its first character.

    The result of ``.match`` is returned unchanged (not coerced to ``bool``).
    """
    hit = eng_and_digit_char_regex.match(char)
    return hit
def is_upper_eng_and_digit(text):
    """Match *text* against ``upper_eng_and_digit_regex`` from its start.

    The result of ``.match`` is returned unchanged (not coerced to ``bool``).
    """
    hit = upper_eng_and_digit_regex.match(text)
    return hit
def is_valid_char(char):
    """Match *char* against ``valid_char_regex`` from its first character.

    The result of ``.match`` is returned unchanged (not coerced to ``bool``).
    """
    hit = valid_char_regex.match(char)
    return hit
def is_digit(text):
    """Match *text* against ``digit_regex`` from its start.

    The result of ``.match`` is returned unchanged (not coerced to ``bool``).
    """
    hit = digit_regex.match(text)
    return hit
def f2b(ustr, exemption="。,:"):
    """Convert full-width characters in *ustr* to their half-width forms.

    Args:
        ustr: The string to convert.
        exemption: Characters that are copied through untouched even when
            they fall inside the full-width range.

    Returns:
        A new string in which U+3000 becomes an ASCII space and every
        non-exempt code point in U+FF01..U+FF5E is shifted down by 0xFEE0;
        all other characters are kept as they are.
    """

    def _narrow(ch):
        code = ord(ch)
        # The ideographic space is checked first, so it is converted even
        # when it is listed in ``exemption``.
        if code == 0x3000:
            return " "
        if ch in exemption:
            return ch
        if 0xFF01 <= code <= 0xFF5E:
            return chr(code - 0xFEE0)
        return ch

    return "".join(map(_narrow, ustr))
def zh_text_split(text, length=80):
    """Split *text* into consecutive chunks of at most *length* characters.

    Each cut is placed just after the last break character found inside the
    next *length* characters. A ``strong_break`` hit is preferred, then a
    ``weak_break`` hit, provided the resulting chunk is at least
    ``length // 3`` characters long. If neither qualifies, the later of the
    two hits is used when it yields at least 3 characters; otherwise the text
    is cut hard at *length*.

    Args:
        text: The string to split.
        length: Maximum chunk size. The same limit is applied to every chunk.

    Returns:
        A list of chunks. ``[]`` when *length* is 0, the individual characters
        when *length* is 1, and ``[text]`` when *text* already fits. For
        ``length >= 1`` the chunks concatenate back to *text*.

    Raises:
        ValueError: If *length* is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")
    if length == 0:
        return []
    if length == 1:
        return list(text)

    min_cut = length // 3
    chunks = []
    # Iterate instead of recursing so that long inputs cannot exhaust the
    # recursion limit and every chunk honours the caller's ``length``.
    while len(text) > length:
        # Search the reversed window so the first hit is the LAST break in
        # the window. (Against reversed text the " \." alternative of
        # ``strong_break`` therefore matches a period followed by a space.)
        window = text[:length][::-1]
        strong_hit = strong_break.search(window)
        weak_hit = weak_break.search(window)
        # Convert the reversed start index into an exclusive end index in the
        # forward text; 0 means "no break found".
        strong_end = length - strong_hit.start() if strong_hit else 0
        weak_end = length - weak_hit.start() if weak_hit else 0

        # The explicit truthiness checks keep a missing hit (0) from being
        # accepted when ``min_cut`` is itself 0, which would never advance.
        if strong_end and strong_end >= min_cut:
            cut = strong_end
        elif weak_end and weak_end >= min_cut:
            cut = weak_end
        else:
            fallback = max(strong_end, weak_end)
            cut = fallback if fallback >= 3 else length

        chunks.append(text[:cut])
        text = text[cut:]

    chunks.append(text)
    return chunks
def text_split(text):
    """Split *text* into a list of sentence-level substrings.

    Text for which ``contains_chinese`` is true is segmented with
    ``segment("zh", ...)``, and any resulting piece longer than 50 characters
    is further broken up by ``zh_text_split(..., length=50)``. All other text
    is segmented with ``segment("en", ...)`` and returned as is.
    """
    if not contains_chinese(text):
        return list(segment("en", text))

    pieces = []
    for sentence in segment("zh", text):
        if len(sentence) > 50:
            pieces.extend(zh_text_split(sentence, length=50))
        else:
            pieces.append(sentence)
    return pieces