| import re |
| from opencc import OpenCC |
|
|
|
|
# OpenCC converters are created once at import time and shared by every call
# to normalize_text ("s2t" and "t2s" are OpenCC configuration names).
s2t_converter = OpenCC("s2t")
t2s_converter = OpenCC("t2s")
|
|
|
|
# One or more consecutive code points in U+1F600..U+1F64F; nothing outside
# that range is matched.
EMOJI_PATTERN = re.compile("[\U0001F600-\U0001F64F]+", flags=re.UNICODE)
|
|
| |
# Single-pass punctuation table for str.translate: a value of None deletes the
# character, a string value replaces it.
#
# Non-ASCII marks are written as \u escapes on purpose. Spelled as literals,
# the fullwidth forms are easy to confuse with (or get normalised into) their
# ASCII twins, which silently produces duplicate dict keys and leaves the
# fullwidth punctuation in the text.
TRANSLATION_TABLE = str.maketrans({
    # Replaced by a space so the words on either side stay separated.
    '-': ' ',
    '\u3000': ' ',    # ideographic space
    # ASCII punctuation: deleted.
    ',': None,
    '.': None,
    '!': None,
    '?': None,
    ';': None,
    ':': None,
    # CJK / fullwidth punctuation: deleted.
    '\uff0c': None,   # fullwidth comma
    '\u3002': None,   # ideographic full stop
    '\uff01': None,   # fullwidth exclamation mark
    '\uff1f': None,   # fullwidth question mark
    '\uff1b': None,   # fullwidth semicolon
    '\uff1a': None,   # fullwidth colon
    '\u2026': None,   # horizontal ellipsis
})
|
|
| |
# Shortest "(...)" or "[...]" span, delimiters included.
# NOTE(review): despite the name, no backslash is matched here; the name is
# kept because other code may import it.
BACKSLASH_PATTERN = re.compile(
    r'\(.*?\)'   # parenthesised span
    r'|'
    r'\[.*?\]'   # square-bracketed span
)
|
|
# Run of whitespace to be collapsed into a single space by the caller.
# The lookbehind stops a match from starting at the very beginning of the
# string and the lookahead stops it from finishing at the very end, so
# whitespace touching either edge is never fully consumed.
# Written as a raw string: "\s" in a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
SPACE_PATTERN = re.compile(r'(?<!^)\s+(?!$)')
|
|
|
|
def normalize_text(text, language, strip=True):
    """Normalize *text*: drop punctuation and lowercase it where applicable.

    The steps run in this order:

    1. Translate through ``TRANSLATION_TABLE`` (punctuation removed or
       replaced by a space).
    2. Remove everything matched by ``EMOJI_PATTERN``.
    3. Replace each ``SPACE_PATTERN`` match with a single space.
    4. If ``strip`` is true, strip leading and trailing whitespace.
    5. Lowercase the result.
    6. For ``language == "zh"`` pass the result through ``t2s_converter``;
       for ``language == "yue"`` pass it through ``s2t_converter``
       (presumably Traditional->Simplified and Simplified->Traditional
       respectively; confirm against the OpenCC configuration names).
       Any other language value skips this step.

    Args:
        text: The string to normalize.
        language: Language code; only ``"zh"`` and ``"yue"`` are special-cased.
        strip: Whether to strip surrounding whitespace (default ``True``).

    Returns:
        The normalized string.
    """
    cleaned = EMOJI_PATTERN.sub('', text.translate(TRANSLATION_TABLE))
    cleaned = SPACE_PATTERN.sub(' ', cleaned)
    if strip:
        cleaned = cleaned.strip()
    cleaned = cleaned.lower()

    # Script conversion is the last step and depends only on the language code.
    if language == "zh":
        return t2s_converter.convert(cleaned)
    if language == "yue":
        return s2t_converter.convert(cleaned)
    return cleaned
|
|