Spaces:
Sleeping
Sleeping
import re
# A character followed by two or more copies of itself, i.e. a run of 3+.
# Compiled once at import time instead of on every call to normalize().
_REPEAT_PATTERN = re.compile(r'(.)\1{2,}')


def normalize(text: str) -> list[str]:
    """Lowercase *text* and split it into cleaned tokens.

    Steps, in order:

    1. Lowercase the whole string.
    2. Replace every character that is not a word character (``\\w``, which
       includes digits and ``_``), whitespace, or an apostrophe with a space.
    3. Split on whitespace.
    4. Within each token, squeeze any run of three or more identical
       characters down to exactly two (``"sooo"`` -> ``"soo"``).
    5. Strip leading and trailing apostrophes; interior ones are kept
       (``"don't"`` stays intact).
    6. Drop tokens that end up empty.

    Args:
        text: The raw input string.

    Returns:
        The normalized tokens in their original order. Empty or
        punctuation-only input yields an empty list.

    >>> normalize("Sooooo coool!!!")
    ['soo', 'cool']
    """
    # Punctuation becomes a space (rather than being deleted) so that words
    # joined only by punctuation, e.g. "a,b", still split into two tokens.
    cleaned = re.sub(r"[^\w\s']", " ", text.lower())

    # NOTE: the squeeze applies to any character, digits included, so
    # "1000" becomes "100". Kept as-is to preserve existing behaviour.
    squeezed = (
        _REPEAT_PATTERN.sub(r'\1\1', token).strip("'")
        for token in cleaned.split()
    )

    # Tokens made solely of apostrophes are empty after stripping; skip them.
    return [token for token in squeezed if token]