| """Light text normalization. Intentionally non-destructive: punctuation is |
| collapsed, not stripped, so VADER and downstream feature extractors still see |
| exclamation marks, question marks, and repeated chars that carry signal. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
| import unicodedata |
|
|
# Any run of whitespace (spaces, tabs, newlines, ...).
_WS = re.compile(r"\s+")

# Three or more of the same mark among ! ? . , in a row.
_REPEAT_PUNCT = re.compile(r"([!?.,])\1{2,}")

# Four or more of the same character in a row (elongations like "sooooo").
# Digits are excluded so numeric content such as "100000" is not rewritten
# to "100"; "\n" is excluded to keep the newline behaviour of the former ".".
_REPEAT_CHARS = re.compile(r"([^\d\n])\1{3,}")

# NOTE(review): the literal characters originally inside the three classes
# below were mis-encoded (each had turned into the Greek letter beta), which
# made _DASHES rewrite beta to "-" and left real typographic dashes and quotes
# untouched. The code points are reconstructed from the shape of the original
# classes and written as escapes so the file's encoding can no longer corrupt
# them - confirm the exact sets against upstream history.

# U+2010..U+2015 (hyphen through horizontal bar) plus U+2212 (minus sign).
_DASHES = re.compile(r"[\u2010-\u2015\u2212]")

# Typographic single quotes U+2018..U+201B.
_QUOTES = re.compile(r"[\u2018-\u201B]")

# Typographic double quotes U+201C..U+201F.
_DQUOTES = re.compile(r"[\u201C-\u201F]")
|
|
|
|
def clean(text: str | None) -> str:
    """Return a lowercased, lightly normalized copy of *text*.

    Steps, in order: NFKC normalization, replacement of ``_DASHES`` matches
    with ``-``, ``_QUOTES`` matches with ``'`` and ``_DQUOTES`` matches with
    ``"``, lowercasing, collapsing of ``_REPEAT_PUNCT`` runs to a single mark
    and of ``_REPEAT_CHARS`` runs to two characters, and finally squeezing
    whitespace runs to one space and trimming both ends.

    Args:
        text: Input value. ``None`` yields ``""``; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The normalized, lowercased string.
    """
    if text is None:
        return ""
    t = unicodedata.normalize("NFKC", str(text))
    t = _DASHES.sub("-", t)
    t = _QUOTES.sub("'", t)
    t = _DQUOTES.sub('"', t)
    # Lowercase *before* collapsing repeats: the repeat patterns are
    # case-sensitive, so a mixed-case elongation such as "SOOOooo" would
    # otherwise survive uncollapsed and only become a long run after
    # lowercasing, making clean() non-idempotent.
    t = t.lower()
    t = _REPEAT_PUNCT.sub(r"\1", t)
    t = _REPEAT_CHARS.sub(r"\1\1", t)
    return _WS.sub(" ", t).strip()
|
|
|
|
def clean_preserve_case(text: str) -> str:
    """Normalize *text* like ``clean()`` but leave letter case untouched.

    Keeping case is useful for NER and for treating capitals as an emotion
    cue. ``None`` yields ``""``; any other value is passed through ``str()``
    and NFKC-normalized before the substitutions run.
    """
    if text is None:
        return ""

    # Substitutions are applied strictly in this order; whitespace squeezing
    # goes last so it also tidies up anything the earlier steps leave behind.
    substitutions = (
        (_DASHES, "-"),
        (_QUOTES, "'"),
        (_DQUOTES, '"'),
        (_REPEAT_PUNCT, r"\1"),
        (_REPEAT_CHARS, r"\1\1"),
        (_WS, " "),
    )

    result = unicodedata.normalize("NFKC", str(text))
    for pattern, replacement in substitutions:
        result = pattern.sub(replacement, result)
    return result.strip()
|
|