# l2-demo/src/common/text.py
# imanerd -- initial deploy (98b332e)
"""Light text normalization. Intentionally non-destructive: punctuation is
collapsed, not stripped, so VADER and downstream feature extractors still see
exclamation marks, question marks, and repeated chars that carry signal.
"""
from __future__ import annotations
import re
import unicodedata
# Any run of whitespace (spaces, tabs, newlines, ...) -> one space.
_WS = re.compile(r"\s+")

# Three or more identical marks collapse to one: "!!!!" -> "!".
# A doubled mark ("!!", "??") is below the threshold and is left untouched.
_REPEAT_PUNCT = re.compile(r"([!?.,])\1{2,}")

# Four or more of the same character shrink to two: "sooooo" -> "soo".
# Digits are excluded so numbers survive intact ("10000" must not turn into
# "100"); newlines are excluded as before and are handled by _WS instead.
_REPEAT_CHARS = re.compile(r"([^\d\n])\1{3,}")

# Code points are written as \u escapes so the character classes cannot be
# silently corrupted when the file is saved or transferred with the wrong
# encoding.
# U+2010..U+2015 (hyphen .. horizontal bar) and U+2212 (minus sign) -> "-"
_DASHES = re.compile(r"[\u2010-\u2015\u2212]")
# U+2018..U+201B: curly / low-9 / reversed single quotes -> '
_QUOTES = re.compile(r"[\u2018\u2019\u201a\u201b]")
# U+201C..U+201F: curly / low-9 / reversed double quotes -> "
_DQUOTES = re.compile(r"[\u201c\u201d\u201e\u201f]")

# Substitutions applied in order after NFKC normalization. Punctuation runs
# are collapsed before generic character runs so "...." ends up as "." rather
# than "..".
_SUBSTITUTIONS = (
    (_DASHES, "-"),
    (_QUOTES, "'"),
    (_DQUOTES, '"'),
    (_REPEAT_PUNCT, r"\1"),
    (_REPEAT_CHARS, r"\1\1"),
)


def clean(text: str) -> str:
    """Return the normalized, lower-cased form of *text*.

    Applies exactly the same normalization as ``clean_preserve_case`` and
    then lower-cases the result.

    Args:
        text: Value to normalize. ``None`` is tolerated and yields ``""``;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The normalized text in lower case.
    """
    return clean_preserve_case(text).lower()


def clean_preserve_case(text: str) -> str:
    """Same as clean() but keeps case. Useful for NER and caps-as-emotion.

    Steps, in order: NFKC normalization; Unicode dashes -> ``-``; curly
    single/double quotes -> ``'`` / ``"``; runs of 3+ identical ``! ? . ,``
    -> one mark; runs of 4+ of any other non-digit character -> two; every
    whitespace run -> a single space, with leading/trailing space stripped.

    Args:
        text: Value to normalize. ``None`` is tolerated and yields ``""``;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The normalized text with its original letter case.
    """
    if text is None:
        return ""
    normalized = unicodedata.normalize("NFKC", str(text))
    for pattern, replacement in _SUBSTITUTIONS:
        normalized = pattern.sub(replacement, normalized)
    return _WS.sub(" ", normalized).strip()