File size: 519 Bytes
633bb91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import unicodedata


class Normalizer:
    """Text normalizer that applies Unicode NFKC normalization."""

    def __init__(self) -> None:
        """Create a normalizer; it holds no configuration or state."""

    def normalize_text(self, text: str) -> str:
        """Return *text* converted to Unicode normalization form NFKC.

        NFKC folds compatibility characters into their canonical
        equivalents (e.g. full-width letters become half-width ones).

        Lowercasing, punctuation removal and whitespace collapsing are
        intentionally not performed here: case, punctuation and spacing
        of the input are otherwise left as they are.
        """
        return unicodedata.normalize("NFKC", text)