File size: 406 Bytes
5f2a5b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import re

# Runs of three or more identical non-digit characters ("soooo", "___").
# Digits are excluded on purpose so numeric tokens such as "1000" stay intact.
_REPEAT_PATTERN = re.compile(r"(\D)\1{2,}")

# Anything that is not a word character, whitespace, or an ASCII apostrophe.
_PUNCT_PATTERN = re.compile(r"[^\w\s']")

# Typographic single quotes are folded into the ASCII apostrophe so that
# contractions written as "don’t" are kept whole, just like "don't".
_APOSTROPHES = str.maketrans({"\u2018": "'", "\u2019": "'"})


def normalize(text: str) -> list[str]:
    """Lower-case *text* and split it into cleaned-up tokens.

    Processing steps, in order:

    1. Lower-case the text and map typographic apostrophes to ``'``.
    2. Replace every character that is not a word character, whitespace,
       or an apostrophe with a space.
    3. Split on whitespace.
    4. Within each token, shorten any run of three or more identical
       non-digit characters to exactly two (``"sooooo"`` -> ``"soo"``).
       Digit runs are left alone so numbers are not altered.
    5. Strip leading/trailing apostrophes and drop tokens that end up empty.

    Args:
        text: Raw input string.

    Returns:
        The normalized tokens in their original order; an empty list when
        nothing survives normalization.
    """
    text = text.lower().translate(_APOSTROPHES)
    text = _PUNCT_PATTERN.sub(" ", text)

    # Collapse repeats first, then strip quotes: a token made only of
    # apostrophes becomes empty and is filtered out below.
    cleaned = (
        _REPEAT_PATTERN.sub(r"\1\1", token).strip("'")
        for token in text.split()
    )
    return [token for token in cleaned if token]