StressDetect / utils /text_preprocessing.py
Ace-119's picture
Add sentiment analysis, text preprocessing, and RL reward signal
fb3e070
"""
utils/text_preprocessing.py
============================
Industry-grade text preprocessing pipeline for stress detection.
Operations (applied in order)
------------------------------
1. HTML entity unescaping (``&`` β†’ ``&``, ``'`` β†’ ``'``)
2. HTML tag stripping (``<b>hello</b>`` β†’ ``hello``)
3. Emoji-to-text mapping (``😰`` β†’ ``anxious``)
4. URL removal (``https://…`` β†’ space)
5. E-mail removal (``user@host.com`` β†’ space)
6. Repeated-char compression (``"soooo"`` β†’ ``"soo"``)
7. Unicode NFKC normalisation (ligatures, half-width chars, etc.)
8. Whitespace normalisation (collapse runs, strip leading/trailing)
Design notes
------------
- All operations are conservative: they strip noise without destroying
meaningful content.
- No external dependencies beyond the Python standard library.
- ``clean_text`` is the primary public function. It is called by both the
training pipeline (``training/train.py``) and the inference API
(``api/main.py``) to ensure **identical preprocessing** at train and
inference time β€” a critical requirement for consistent predictions.
"""
from __future__ import annotations
import html
import re
import unicodedata
# ---------------------------------------------------------------------------
# Compiled regex patterns
# ---------------------------------------------------------------------------
# Capped tag length (<200 chars) to guard against ReDoS on malformed HTML.
_HTML_TAG_RE = re.compile(r"<[^>]{0,200}>")
_URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
_EMAIL_RE = re.compile(
r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"
)
# Compress 4+ repetitions of the same character ("sooooo" β†’ "soo")
_REPEATED_CHAR_RE = re.compile(r"(.)\1{3,}")
_WHITESPACE_RE = re.compile(r"\s+")
# ---------------------------------------------------------------------------
# Emoji β†’ text mapping
# Covers the most common emojis appearing in stress-related social-media text.
# Each emoji is replaced by a semantically equivalent English word so that the
# model's hash-based tokeniser can process it.
# ---------------------------------------------------------------------------
_EMOJI_TEXT: dict[str, str] = {
# Positive / happy
"😊": "happy",
"πŸ™‚": "happy",
"πŸ˜€": "happy",
"😁": "happy",
"πŸ˜„": "happy",
"πŸ˜ƒ": "happy",
"πŸ˜†": "laughing",
"πŸ˜‚": "laughing",
"🀣": "laughing",
"πŸ₯³": "happy",
"πŸŽ‰": "happy",
"😌": "calm",
"😍": "love",
"πŸ₯°": "love",
# Sad / emotional
"πŸ₯²": "emotional",
"😒": "sad",
"😭": "crying",
"πŸ˜”": "sad",
"😞": "sad",
"😟": "sad",
"πŸ’”": "heartbroken",
# Stress / anxiety
"😰": "anxious",
"😨": "scared",
"😱": "terrified",
"πŸ˜“": "stressed",
"πŸ˜₯": "upset",
"🀯": "overwhelmed",
"😀": "frustrated",
"😑": "angry",
"🀬": "angry",
# Tired / sick
"😫": "exhausted",
"😩": "tired",
"😴": "tired",
"πŸ€•": "hurt",
"πŸ€’": "sick",
# Other
"πŸ˜•": "confused",
"😢": "speechless",
"πŸ₯Ί": "hopeful",
"❀️": "love",
"πŸ’ͺ": "strong",
"πŸ™": "grateful",
"πŸ‘": "good",
"πŸ‘Ž": "bad",
"πŸ†˜": "help",
"⚑": "stressed",
"πŸ’€": "dying",
"πŸ”₯": "intense",
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def clean_text(text: str, *, normalize_repeated: bool = True) -> str:
"""Clean and normalise text for model input.
Parameters
----------
text : str
Raw user input (social-media post, journal entry, chat message …).
normalize_repeated : bool
When ``True`` (default), compress long repeated-character runs
(e.g. ``"soooooo"`` β†’ ``"soo"``).
Returns
-------
str
Cleaned text ready for tokenisation, or an empty string when the
input is empty or not a string.
"""
if not text or not isinstance(text, str):
return ""
# 1. HTML entity unescaping
text = html.unescape(text)
# 2. Strip HTML/XML tags
text = _HTML_TAG_RE.sub(" ", text)
# 3. Emoji β†’ text (iterate over a snapshot so replacements don't loop)
for emoji_char, replacement in _EMOJI_TEXT.items():
if emoji_char in text:
text = text.replace(emoji_char, f" {replacement} ")
# 4. Remove URLs
text = _URL_RE.sub(" ", text)
# 5. Remove e-mail addresses
text = _EMAIL_RE.sub(" ", text)
# 6. Repeated-character compression
if normalize_repeated:
text = _REPEATED_CHAR_RE.sub(r"\1\1", text)
# 7. Unicode NFKC normalisation
text = unicodedata.normalize("NFKC", text)
# 8. Collapse whitespace
text = _WHITESPACE_RE.sub(" ", text).strip()
return text