Spaces:
Sleeping
Sleeping
File size: 406 Bytes
5f2a5b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
import re
# Any character other than a word character, whitespace, or an apostrophe.
_NON_TOKEN_CHARS = re.compile(r"[^\w\s']")

# A character followed by two or more copies of itself (a run of length >= 3).
_REPEAT_PATTERN = re.compile(r'(.)\1{2,}')


def normalize(text: str) -> list[str]:
    """Lowercase *text* and split it into cleaned-up tokens.

    Processing steps, in order:

    1. Lowercase the whole string.
    2. Replace every character that is not a word character (``\\w``, which
       includes digits and ``_``), whitespace, or an apostrophe with a space.
    3. Split on whitespace.
    4. In each token, collapse any run of three or more identical characters
       down to exactly two (``"sooooo"`` -> ``"soo"``); runs of two are kept.
    5. Strip leading and trailing apostrophes; inner ones are preserved
       (``"don't"`` stays ``"don't"``).
    6. Drop tokens that end up empty.

    Returns:
        The normalized tokens in their original order; an empty list when
        *text* contains no token characters.

    >>> normalize("Sooooo GOOD, isn't it?!")
    ['soo', 'good', "isn't", 'it']
    """
    lowered = text.lower()
    cleaned = _NON_TOKEN_CHARS.sub(" ", lowered)

    norm_tokens = []
    for token in cleaned.split():
        # Collapse before stripping so a run of quotes such as "'''" is
        # reduced and then removed entirely by the strip below.
        token = _REPEAT_PATTERN.sub(r'\1\1', token).strip("'")
        if token:
            norm_tokens.append(token)
    return norm_tokens
|