import re

from opencc import OpenCC

# Shared OpenCC converters, constructed once at import time so that each
# normalization call reuses them instead of rebuilding a converter.
# 't2s' and 's2t' are OpenCC configuration names (Traditional -> Simplified
# and Simplified -> Traditional Chinese, respectively).
t2s_converter = OpenCC('t2s')
s2t_converter = OpenCC('s2t')
# Matches one or more consecutive code points in U+1F600-U+1F64F (the Unicode
# "Emoticons" block). Emoji outside that range are NOT matched by this pattern.
EMOJI_PATTERN = re.compile("[\U0001F600-\U0001F64F]+", flags=re.UNICODE)
# Character-level normalization table for ``str.translate``:
#   * '-' and the ideographic space (U+3000) are replaced by an ASCII space;
#   * ASCII and full-width punctuation is deleted (mapped to ``None``).
# Full-width characters are spelled as \u escapes so that they cannot be
# silently folded into their ASCII look-alikes (which would produce duplicate
# dict keys and leave the full-width forms unhandled).
TRANSLATION_TABLE = str.maketrans({
    # Mapped to a space.
    '-': ' ',
    '\u3000': ' ',   # IDEOGRAPHIC SPACE
    # ASCII punctuation, deleted.
    ',': None,
    '.': None,
    '!': None,
    '?': None,
    ';': None,
    ':': None,
    # CJK / full-width punctuation, deleted.
    '\uff0c': None,  # FULLWIDTH COMMA
    '\u3002': None,  # IDEOGRAPHIC FULL STOP
    '\uff01': None,  # FULLWIDTH EXCLAMATION MARK
    '\uff1f': None,  # FULLWIDTH QUESTION MARK
    '\uff1b': None,  # FULLWIDTH SEMICOLON
    '\uff1a': None,  # FULLWIDTH COLON
    '\u2026': None,  # HORIZONTAL ELLIPSIS
})
# Matches a parenthesised "(...)" or square-bracketed "[...]" span, non-greedily.
# NOTE(review): despite its name this does not match backslashes, and nothing in
# the visible code uses it -- confirm external callers before renaming/removing.
BACKSLASH_PATTERN = re.compile(r'\(.*?\)|\[.*?\]')

# Matches a whitespace run that does not begin at the very start of the string
# and is not immediately followed by the end of the string, so interior runs
# can be collapsed to a single space. Because of backtracking, a leading or
# trailing run is only partly excluded: its first (leading) or last (trailing)
# character is left alone and the remainder of the run still matches.
# Raw string: '\s' in a plain string literal is an invalid escape sequence.
SPACE_PATTERN = re.compile(r'(?<!^)\s+(?!$)')
def normalize_text(text: str, language: str, strip: bool = True) -> str:
    """Normalize *text*: remove punctuation and emoticons, tidy whitespace, lowercase.

    The steps run in this order:

    1. ``str.translate`` with ``TRANSLATION_TABLE`` (punctuation removal and
       space mapping).
    2. Delete every ``EMOJI_PATTERN`` match.
    3. Replace every ``SPACE_PATTERN`` match with a single space.
    4. If ``strip`` is true, strip leading/trailing whitespace.
    5. Lowercase the result.
    6. Convert the Chinese script according to ``language``.

    Args:
        text: The string to normalize.
        language: Language code. ``"zh"`` runs the result through
            ``t2s_converter``; ``"yue"`` runs it through ``s2t_converter``;
            any other value skips script conversion.
        strip: Whether to strip leading and trailing whitespace.

    Returns:
        The normalized string.
    """
    # Punctuation is handled first so that characters mapped to spaces take
    # part in the whitespace collapsing below.
    text = text.translate(TRANSLATION_TABLE)

    # Removing emoticons before collapsing whitespace avoids leaving a double
    # space where an emoticon sat between two spaces.
    text = EMOJI_PATTERN.sub('', text)
    text = SPACE_PATTERN.sub(' ', text)

    if strip:
        text = text.strip()

    text = text.lower()

    # The two language codes are mutually exclusive, hence ``elif``.
    if language == "zh":
        text = t2s_converter.convert(text)
    elif language == "yue":
        text = s2t_converter.convert(text)

    return text