Spaces:

FireRedTeam
/

FireRedTTS-1S

Running on Zero

FireRedTTS-1S / fireredtts /modules /text_normalizer /utils.py

Shen Feiyu

add 1s

faadabf 4 months ago

3.82 kB

	from fireredtts.modules.text_normalizer.regex_common import *
	from sentencex import segment
	import re


	# Table that collapses punctuation variants (CJK brackets and quotes,
	# small-form / presentation-form marks, assorted dashes, full-width signs)
	# onto a smaller canonical set of symbols.
	# Every key is a single character; the pipe key used to be spelled "\|",
	# which Python reads as the two-character string backslash + pipe (and
	# reports as an invalid escape sequence), so a plain "|" was never mapped.
	# NOTE(review): the code that applies this table is not part of this file.
	symbol_reduction = {
	    "「": '"',
	    "」": '"',
	    "｀": '"',
	    "〝": '"',
	    "〞": '"',
	    "‟": '"',
	    "„": '"',
	    "｛": "(",
	    "｝": ")",
	    "【": "(",
	    "】": ")",
	    "〖": "(",
	    "〗": ")",
	    "〔": "(",
	    "〕": ")",
	    "〘": "(",
	    "〙": ")",
	    "《": "(",
	    "》": ")",
	    "｟": "(",
	    "｠": ")",
	    "〚": "(",
	    "〛": ")",
	    "『": '"',
	    "』": '"',
	    "｢": '"',
	    "｣": '"',
	    "{": "(",
	    "}": ")",
	    "〈": "(",
	    "〉": ")",
	    "•": "·",
	    "‧": "·",
	    "〰": "…",
	    "﹏": "…",
	    "〜": "~",
	    "～": "~",
	    "＋": "+",
	    "､": "、",
	    "｡": "。",
	    "︐": "，",
	    "﹐": "，",
	    "︑": "、",
	    "﹑": "、",
	    "︒": "。",
	    "︓": "：",
	    "﹕": "：",
	    "︔": "；",
	    "﹔": "；",
	    "︕": "！",
	    "﹗": "！",
	    "︖": "？",
	    "﹖": "？",
	    "﹙": "(",
	    "﹚": ")",
	    "﹪": "%",
	    "﹠": "&",
	    "＞": ">",
	    "|": "、",
	    "＝": "=",
	    "‐": "-",
	    "‑": "-",
	    "‒": "-",
	    "–": "-",
	    "—": "-",
	    "―": "-",
	    "％": "%",
	    "μ": "u",
	}


	# "Strong" break: one sentence-final or closing punctuation character, a
	# CR/LF, or the two-character sequence space + dot. zh_text_split searches
	# a *reversed* window, so " ." corresponds to ". " in reading order.
	# The "|" must be an unescaped alternation: written as "\|" it is a literal
	# pipe and the pattern would only match "<break char>| .". A raw string
	# also avoids the invalid string escapes "\)", "\]" and "\.".
	strong_break = re.compile(r"([。”;；!！：…?？）\)\]』】」}~\r\n]| \.)", re.UNICODE)

	# "Weak" break: a single character that is either inside one of the listed
	# code-point ranges (symbol / pictograph blocks and the supplementary
	# planes), a tab, a full-width or ASCII comma, a dot, or a space.
	weak_break = re.compile(
	    "["
	    "\U00002702-\U000027b0\U0001f926-\U0001f937\U00010000-\U0001fbff\U00030000-\U0010ffff"
	    "\u2640-\u2642\u2600-\u2b55\u23cf\u23e9\u231a\ufe0f\u3030"
	    "\t，,. ]",
	    re.UNICODE,
	)


	def contains_chinese(text):
	    """Return True when ``chinese_regex`` finds a match anywhere in ``text``.

	    ``chinese_regex`` is not defined in this file; it arrives through the
	    star import from ``regex_common``.
	    """
	    found = chinese_regex.search(text)
	    return found is not None


	def strip_kaomoji(text):
	    """Return ``text`` with every ``kaomoji_regex`` match replaced by one space.

	    ``kaomoji_regex`` is not defined in this file; it arrives through the
	    star import from ``regex_common``.
	    """
	    cleaned = kaomoji_regex.sub(" ", text)
	    return cleaned


	def is_chinese(char):
	    """Return the result of ``chinese_char_regex.match(char)``.

	    The value is whatever ``match`` yields (presumably a match object on
	    success and None otherwise), not a bool. The pattern itself is defined
	    outside this file.
	    """
	    hit = chinese_char_regex.match(char)
	    return hit


	def is_eng_and_digit(char):
	    """Return the result of ``eng_and_digit_char_regex.match(char)``.

	    The value is whatever ``match`` yields (presumably a match object on
	    success and None otherwise), not a bool. The pattern itself is defined
	    outside this file.
	    """
	    hit = eng_and_digit_char_regex.match(char)
	    return hit


	def is_upper_eng_and_digit(text):
	    """Return the result of ``upper_eng_and_digit_regex.match(text)``.

	    The value is whatever ``match`` yields (presumably a match object on
	    success and None otherwise), not a bool. The pattern itself is defined
	    outside this file.
	    """
	    hit = upper_eng_and_digit_regex.match(text)
	    return hit


	def is_valid_char(char):
	    """Return the result of ``valid_char_regex.match(char)``.

	    The value is whatever ``match`` yields (presumably a match object on
	    success and None otherwise), not a bool. The pattern itself is defined
	    outside this file.
	    """
	    hit = valid_char_regex.match(char)
	    return hit


	def is_digit(text):
	    """Return the result of ``digit_regex.match(text)``.

	    The value is whatever ``match`` yields (presumably a match object on
	    success and None otherwise), not a bool. The pattern itself is defined
	    outside this file.
	    """
	    hit = digit_regex.match(text)
	    return hit


	def f2b(ustr, exemption="。，："):
	    """Convert full-width characters in ``ustr`` to their half-width forms.

	    Args:
	        ustr: Text to convert.
	        exemption: Characters that are copied through unchanged even when
	            they lie in the full-width range.

	    Returns:
	        A new string in which U+3000 becomes an ASCII space and every
	        non-exempt character in U+FF01..U+FF5E is shifted down by 0xFEE0;
	        all other characters are kept as they are.
	    """

	    def _narrow(ch):
	        code = ord(ch)
	        # The ideographic space is handled before the exemption lookup.
	        if code == 0x3000:
	            return " "
	        if ch not in exemption and 0xFF01 <= code <= 0xFF5E:
	            return chr(code - 0xFEE0)
	        return ch

	    return "".join(map(_narrow, ustr))


	def zh_text_split(text, length=80):
	    """Split ``text`` into chunks of at most ``length`` characters.

	    Each chunk is cut at the last break found inside the first ``length``
	    characters of the remaining text, in this order of preference:

	    1. the last ``strong_break`` match, if it lies at or beyond
	       ``length // 3``;
	    2. otherwise the last ``weak_break`` match, under the same condition;
	    3. otherwise the later of the two matches if it leaves a chunk of at
	       least 3 characters;
	    4. otherwise a hard cut at ``length``.

	    Args:
	        text: The string to split.
	        length: Maximum chunk size in characters.

	    Returns:
	        A list of chunks whose concatenation equals ``text``. A
	        non-positive ``length`` yields an empty list, and ``length == 1``
	        yields one chunk per character.
	    """
	    if length <= 0:
	        return []
	    if length == 1:
	        # Previously iterated over the integer ``length`` (TypeError).
	        return list(text)

	    threshold = length // 3
	    chunks = []
	    rest = text
	    # Iterative so that ``length`` applies to every chunk (the recursive
	    # form fell back to the default of 80 after the first cut) and very
	    # long inputs cannot exhaust the recursion limit.
	    while len(rest) > length:
	        # Search the reversed window so the first hit is the last break.
	        window = rest[:length][::-1]
	        strong_hit = strong_break.search(window)
	        weak_hit = weak_break.search(window)
	        # Convert the reversed offset into an exclusive end index that
	        # keeps the break character in the chunk; 0 means "no match".
	        strong_end = length - strong_hit.start() if strong_hit else 0
	        weak_end = length - weak_hit.start() if weak_hit else 0

	        # The ``> 0`` guards matter when ``threshold`` is 0 (length == 2):
	        # a zero-width cut would never make progress.
	        if strong_end > 0 and strong_end >= threshold:
	            cut = strong_end
	        elif weak_end > 0 and weak_end >= threshold:
	            cut = weak_end
	        else:
	            best = max(strong_end, weak_end)
	            cut = best if best >= 3 else length

	        chunks.append(rest[:cut])
	        rest = rest[cut:]

	    chunks.append(rest)
	    return chunks


	def text_split(text):
	    """Split ``text`` into a list of substrings.

	    Text for which ``contains_chinese`` is false is handed to
	    ``segment("en", ...)`` and returned as-is. Otherwise it is handed to
	    ``segment("zh", ...)``, and every resulting piece longer than 50
	    characters is further broken up with ``zh_text_split(..., length=50)``.

	    NOTE(review): ``segment`` comes from the third-party ``sentencex``
	    package; it is assumed to yield sentence strings in order — confirm.
	    """
	    if not contains_chinese(text):
	        return list(segment("en", text))

	    pieces = []
	    for sentence in segment("zh", text):
	        if len(sentence) > 50:
	            pieces.extend(zh_text_split(sentence, length=50))
	        else:
	            pieces.append(sentence)
	    return pieces