File size: 954 Bytes
16f57d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import re
import unicodedata


class TextNormalizer:
    """Canonicalize free-form text with a fixed sequence of regex rewrites.

    ``normalize`` applies, in order: Unicode NFKC normalization, removal of
    zero-width/invisible characters, URL and mention masking, squeezing of
    repeated Latin letters and punctuation, whitespace collapsing, and a
    final trim of leading/trailing whitespace.
    """

    # "http://", "https://" or bare "www." links, up to the next whitespace.
    url_pattern = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
    # "@" followed by one or more word characters.
    # NOTE(review): this also matches the "@domain" part of an email
    # address ("a@b.com" -> "a<USER>.com") -- confirm that is intended.
    mention_pattern = re.compile(r"@\w+")
    # Any run of whitespace (spaces, tabs, newlines, ...).
    whitespace_pattern = re.compile(r"\s+")
    # The same ASCII letter three or more times in a row.
    repeated_latin_pattern = re.compile(r"([A-Za-z])\1{2,}")
    # U+200B..U+200F, U+2060 and U+FEFF: invisible formatting characters.
    zero_width_pattern = re.compile(r"[\u200b-\u200f\u2060\ufeff]")
    # The same one of "!", "?", ".", "," three or more times in a row.
    repeated_punctuation_pattern = re.compile(r"([!?.,])\1{2,}")

    def normalize(self, text: str) -> str:
        """Return the normalized form of *text*.

        URLs become ``<URL>``, mentions become ``<USER>``, runs of three or
        more identical ASCII letters or ``!?.,`` characters are shortened to
        two, every whitespace run becomes a single space, and the result has
        no leading or trailing whitespace.
        """
        normalized = unicodedata.normalize("NFKC", text)
        normalized = self.zero_width_pattern.sub("", normalized)
        normalized = self.url_pattern.sub("<URL>", normalized)
        normalized = self.mention_pattern.sub("<USER>", normalized)
        normalized = self.repeated_latin_pattern.sub(r"\1\1", normalized)
        normalized = self.repeated_punctuation_pattern.sub(r"\1\1", normalized)
        normalized = self.whitespace_pattern.sub(" ", normalized)
        # Trim last: zero-width characters are not whitespace to str.strip(),
        # so stripping before they are removed would leave any whitespace
        # they were shielding at the edges of the result.
        return normalized.strip()