Spaces:
Runtime error
Runtime error
| import re | |
| import unicodedata | |
class TextNormalizer:
    """Canonicalize free-form text with a fixed sequence of regex rewrites.

    The patterns are compiled once as class attributes and shared by all
    instances; ``normalize`` applies them in a fixed order.
    """

    # "http://", "https://" or "www." followed by a run of non-whitespace.
    url_pattern = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
    # "@" followed by word characters.
    # NOTE: this also matches the "@domain" part of an e-mail address
    # ("a@b.com" becomes "a<USER>.com").
    mention_pattern = re.compile(r"@\w+")
    # Any run of whitespace characters.
    whitespace_pattern = re.compile(r"\s+")
    # The same ASCII letter three or more times in a row.
    repeated_latin_pattern = re.compile(r"([A-Za-z])\1{2,}")
    # Invisible characters: U+200B..U+200F, U+2060 and U+FEFF.
    zero_width_pattern = re.compile(r"[\u200b-\u200f\u2060\ufeff]")
    # The same character out of "!", "?", ".", "," three or more times in a row.
    repeated_punctuation_pattern = re.compile(r"([!?.,])\1{2,}")

    def normalize(self, text: str) -> str:
        """Return a normalized copy of ``text``.

        Steps, in order: NFKC normalization, removal of zero-width
        characters, URLs replaced by ``<URL>``, mentions replaced by
        ``<USER>``, runs of three or more identical ASCII letters or
        punctuation marks shortened to two, whitespace runs collapsed to a
        single space, and leading/trailing whitespace trimmed.

        Args:
            text: The string to normalize.

        Returns:
            The normalized string, without leading or trailing whitespace.
        """
        normalized = unicodedata.normalize("NFKC", text)
        normalized = self.zero_width_pattern.sub("", normalized)
        # URLs are replaced before letter/punctuation squeezing so that the
        # squeezing rules never rewrite the inside of a URL.
        normalized = self.url_pattern.sub("<URL>", normalized)
        normalized = self.mention_pattern.sub("<USER>", normalized)
        normalized = self.repeated_latin_pattern.sub(r"\1\1", normalized)
        normalized = self.repeated_punctuation_pattern.sub(r"\1\1", normalized)
        normalized = self.whitespace_pattern.sub(" ", normalized)
        # Trim last: str.strip() does not treat zero-width characters (e.g. a
        # leading BOM) as whitespace, so trimming before they are removed
        # would leave a stray space at the edge of the result.
        return normalized.strip()