"""
utils/text_preprocessing.py
============================
Industry-grade text preprocessing pipeline for stress detection.
Operations (applied in order)
------------------------------
1. HTML entity unescaping (``&amp;`` → ``&``, ``&#39;`` → ``'``)
2. HTML tag stripping (``<b>hello</b>`` → ``hello``)
3. Emoji-to-text mapping (``😰`` → ``anxious``)
4. URL removal (``https://…`` → space)
5. E-mail removal (``user@host.com`` → space)
6. Repeated-char compression (``"soooo"`` → ``"soo"``)
7. Unicode NFKC normalisation (ligatures, half-width chars, etc.)
8. Whitespace normalisation (collapse runs, strip leading/trailing)
Design notes
------------
- All operations are conservative: they strip noise without destroying
meaningful content.
- No external dependencies beyond the Python standard library.
- ``clean_text`` is the primary public function. It is called by both the
training pipeline (``training/train.py``) and the inference API
(``api/main.py``) to ensure **identical preprocessing** at train and
    inference time — a critical requirement for consistent predictions.
"""
from __future__ import annotations
import html
import re
import unicodedata
# ---------------------------------------------------------------------------
# Compiled regex patterns (module-level so they are compiled exactly once)
# ---------------------------------------------------------------------------
# Capped tag length (<200 chars) to guard against ReDoS on malformed HTML.
_HTML_TAG_RE = re.compile(r"<[^>]{0,200}>")
_URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
_EMAIL_RE = re.compile(
    r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"
)
# Compress 4+ repetitions of the same character ("sooooo" -> "soo")
_REPEATED_CHAR_RE = re.compile(r"(.)\1{3,}")
_WHITESPACE_RE = re.compile(r"\s+")
# ---------------------------------------------------------------------------
# Emoji -> text mapping
# Covers the most common emojis appearing in stress-related social-media text.
# Each emoji is replaced by a semantically equivalent English word so that the
# model's hash-based tokeniser can process it.
#
# NOTE(review): the key literals below look encoding-corrupted (mojibake) —
# several entries collapse to the *same* key, and Python silently keeps only
# the last duplicate. Recover the original UTF-8 emojis from version control
# and verify the mapping before relying on it.
# ---------------------------------------------------------------------------
_EMOJI_TEXT: dict[str, str] = {
    # Positive / happy
    "π": "happy",
    "π": "happy",
    "π": "happy",
    "π": "happy",
    "π": "happy",
    "π": "happy",
    "π": "laughing",
    "π": "laughing",
    "π€£": "laughing",
    "π₯³": "happy",
    "π": "happy",
    "π": "calm",
    "π": "love",
    "π₯°": "love",
    # Sad / emotional
    "π₯²": "emotional",
    "π’": "sad",
    "π": "crying",
    "π": "sad",
    "π": "sad",
    "π": "sad",
    "π": "heartbroken",
    # Stress / anxiety
    "π°": "anxious",
    "π¨": "scared",
    "π±": "terrified",
    "π": "stressed",
    "π₯": "upset",
    "π€―": "overwhelmed",
    "π€": "frustrated",
    "π‘": "angry",
    "π€¬": "angry",
    # Tired / sick
    "π«": "exhausted",
    "π©": "tired",
    "π΄": "tired",
    "π€": "hurt",
    "π€": "sick",
    # Other
    "π": "confused",
    "πΆ": "speechless",
    "π₯Ί": "hopeful",
    "β€οΈ": "love",
    "πͺ": "strong",
    "π": "grateful",
    "π": "good",
    "π": "bad",
    "π": "help",
    "β‘": "stressed",
    "π": "dying",
    "π₯": "intense",
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def clean_text(text: str, *, normalize_repeated: bool = True) -> str:
    """Clean and normalise text for model input.

    Applies, in order: HTML entity unescaping, HTML tag stripping,
    emoji-to-text mapping, URL removal, e-mail removal, optional
    repeated-character compression, Unicode NFKC normalisation, and
    whitespace collapsing.

    Parameters
    ----------
    text : str
        Raw user input (social-media post, journal entry, chat message ...).
    normalize_repeated : bool
        When ``True`` (default), compress runs of four or more identical
        characters down to two (e.g. ``"soooooo"`` -> ``"soo"``).

    Returns
    -------
    str
        Cleaned text ready for tokenisation, or an empty string when the
        input is empty or not a string.
    """
    # Type check FIRST: evaluating truthiness of an arbitrary non-string
    # object (e.g. a numpy array) can raise, and the documented contract
    # is to return "" for any non-string input.
    if not isinstance(text, str) or not text:
        return ""
    # 1. HTML entity unescaping ("&amp;" -> "&")
    text = html.unescape(text)
    # 2. Strip HTML/XML tags; replaced with a space so adjacent words
    #    don't fuse ("<b>hi</b>there" -> " hi there")
    text = _HTML_TAG_RE.sub(" ", text)
    # 3. Emoji -> text; padded with spaces so consecutive emojis map to
    #    separate tokens
    for emoji_char, replacement in _EMOJI_TEXT.items():
        if emoji_char in text:
            text = text.replace(emoji_char, f" {replacement} ")
    # 4. Remove URLs
    text = _URL_RE.sub(" ", text)
    # 5. Remove e-mail addresses
    text = _EMAIL_RE.sub(" ", text)
    # 6. Repeated-character compression
    if normalize_repeated:
        text = _REPEATED_CHAR_RE.sub(r"\1\1", text)
    # 7. Unicode NFKC normalisation (ligatures, full-width chars, etc.)
    text = unicodedata.normalize("NFKC", text)
    # 8. Collapse whitespace runs and strip leading/trailing space
    return _WHITESPACE_RE.sub(" ", text).strip()