| |
| import traceback |
| import os |
| import sys |
| import re |
| import re |
|
|
|
|
|
|
|
|
class TextNormalizer:
    """Normalize Chinese/English text and unify its punctuation.

    The normalizer backends are imported and built by :meth:`load`, not by
    ``__init__``; :meth:`infer` returns ``""`` until :meth:`load` has run.
    """

    def __init__(self):
        # Backends stay None until load() is called; infer() checks for that.
        self.zh_normalizer = None
        self.en_normalizer = None
        # Punctuation replacement table applied by infer().
        # Non-ASCII keys are spelled as escapes so they cannot be damaged by a
        # re-encoding of this file (the previous literals were mis-encoded and
        # collapsed into duplicate dict keys).
        # NOTE(review): the bracket/quote pairs below were reconstructed from
        # the mis-encoded literals and their mapped values -- confirm against
        # the project history.
        self.char_rep_map = {
            "\uff1a": ",",  # fullwidth colon
            "\uff1b": ",",  # fullwidth semicolon
            ";": ",",
            "\uff0c": ",",  # fullwidth comma
            "\u3002": ".",  # ideographic full stop
            "\uff01": "!",  # fullwidth exclamation mark
            "\uff1f": "?",  # fullwidth question mark
            "\n": ".",
            "\u00b7": ",",  # middle dot
            "\u3001": ",",  # ideographic comma
            "...": "\u2026",  # three ASCII dots -> horizontal ellipsis
            "\u2026\u2026": "\u2026",  # doubled ellipsis -> single ellipsis
            "$": ".",
            "\u201c": "'",  # left double quotation mark
            "\u201d": "'",  # right double quotation mark
            '"': "'",
            "\u2018": "'",  # left single quotation mark
            "\u2019": "'",  # right single quotation mark
            "\uff08": "'",  # fullwidth left parenthesis
            "\uff09": "'",  # fullwidth right parenthesis
            "(": "'",
            ")": "'",
            "\u300a": "'",  # left double angle bracket
            "\u300b": "'",  # right double angle bracket
            "\u3010": "'",  # left black lenticular bracket
            "\u3011": "'",  # right black lenticular bracket
            "[": "'",
            "]": "'",
            "\u2014": "-",  # em dash
            "\uff5e": "-",  # fullwidth tilde
            "~": "-",
            "\u300c": "'",  # left corner bracket
            "\u300d": "'",  # right corner bracket
            ":": ",",
        }

    def match_email(self, email):
        """Return True if *email* has the shape ``alnum@alnum.letters``.

        Only ASCII letters and digits are accepted around the ``@`` and only
        ASCII letters after the single dot, so addresses containing dashes,
        underscores, extra dots or multi-label domains do not match.
        """
        pattern = r'^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$'
        return re.match(pattern, email) is not None

    def use_chinese(self, s):
        """Return True if *s* should be handled by the Chinese normalizer.

        That is the case when *s* contains a character in U+4E00-U+9FFF,
        contains no ASCII letter at all, or matches :meth:`match_email`.
        """
        has_chinese = bool(re.search(r'[\u4e00-\u9fff]', s))
        has_alpha = bool(re.search(r'[a-zA-Z]', s))
        return has_chinese or not has_alpha or self.match_email(s)

    def load(self):
        """Import the ``tn`` package and build both normalizer backends.

        The imports are function-local, so ``tn`` is only required once this
        method is called. Raises whatever the import or the constructors raise
        (e.g. ``ImportError`` when ``tn`` is not installed).
        """
        # NOTE(review): ``tn`` is presumably the WeTextProcessing distribution.
        from tn.chinese.normalizer import Normalizer as NormalizerZh
        from tn.english.normalizer import Normalizer as NormalizerEn

        self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False)
        self.en_normalizer = NormalizerEn()

    def _replace_chars(self, text):
        """Return *text* with every ``char_rep_map`` key replaced by its value."""
        if not self.char_rep_map:
            # An empty alternation would match the empty string everywhere.
            return text
        # Longest keys first, so a multi-character key always wins over any
        # shorter key that happens to be its prefix.
        keys = sorted(self.char_rep_map, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(key) for key in keys))
        return pattern.sub(lambda m: self.char_rep_map[m.group()], text)

    def infer(self, text):
        """Normalize *text* and map its punctuation through ``char_rep_map``.

        The backend is chosen with :meth:`use_chinese` on the raw input. The
        punctuation table is applied to the backend's *output*: it rewrites
        characters such as ``$`` and ``:``, so applying it first would change
        what the backend sees.

        Returns:
            The normalized string; ``""`` (after printing a message) when
            :meth:`load` has not been called; ``""`` (after printing the
            traceback) when the backend raises.
        """
        if not self.zh_normalizer or not self.en_normalizer:
            print("Error, text normalizer is not initialized !!!")
            return ""
        try:
            normalizer = self.zh_normalizer if self.use_chinese(text) else self.en_normalizer
            result = normalizer.normalize(text)
        except Exception:
            # Deliberate best effort: report the failure and hand back an
            # empty string instead of propagating the backend error.
            print(traceback.format_exc())
            return ""
        return self._replace_chars(result)
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: normalize one short mixed digit/CJK sample and print it.
    text_normalizer = TextNormalizer()
    # infer() returns "" until the backends exist, so build them first.
    text_normalizer.load()
    # Sample is "2.5" followed by U+5E73 U+65B9 U+7535 U+7EBF, written as
    # escapes because the previous literal had been mis-encoded.
    print(text_normalizer.infer("2.5\u5e73\u65b9\u7535\u7ebf"))