File size: 4,952 Bytes
fb3e070
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
utils/text_preprocessing.py
============================
Industry-grade text preprocessing pipeline for stress detection.

Operations (applied in order)
------------------------------
1. HTML entity unescaping  (``&amp;`` → ``&``, ``&#39;`` → ``'``)
2. HTML tag stripping       (``<b>hello</b>`` β†’ ``hello``)
3. Emoji-to-text mapping    (``😰`` β†’ ``anxious``)
4. URL removal              (``https://…`` β†’ space)
5. E-mail removal           (``user@host.com`` β†’ space)
6. Repeated-char compression (``"soooo"`` β†’ ``"soo"``)
7. Unicode NFKC normalisation (ligatures, half-width chars, etc.)
8. Whitespace normalisation  (collapse runs, strip leading/trailing)

Design notes
------------
- All operations are conservative: they strip noise without destroying
  meaningful content.
- No external dependencies beyond the Python standard library.
- ``clean_text`` is the primary public function.  It is called by both the
  training pipeline (``training/train.py``) and the inference API
  (``api/main.py``) to ensure **identical preprocessing** at train and
  inference time β€” a critical requirement for consistent predictions.
"""

from __future__ import annotations

import html
import re
import unicodedata

# ---------------------------------------------------------------------------
# Compiled regex patterns (compiled once at import time; reused per call)
# ---------------------------------------------------------------------------

# Capped tag length (<200 chars) to guard against ReDoS on malformed HTML.
# Matches anything of the form "<...>" with no '>' inside the brackets.
_HTML_TAG_RE: re.Pattern[str] = re.compile(r"<[^>]{0,200}>")
# Matches http(s) URLs and bare "www." links up to the next whitespace.
_URL_RE: re.Pattern[str] = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
# Simple e-mail matcher: local part @ domain with a 2+-letter TLD.
# \b anchors avoid eating surrounding punctuation.
_EMAIL_RE: re.Pattern[str] = re.compile(
    r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"
)
# Compress 4+ repetitions of the same character ("sooooo" → "soo").
# Group 1 captures the char; the substitution keeps exactly two copies.
_REPEATED_CHAR_RE: re.Pattern[str] = re.compile(r"(.)\1{3,}")
# Any run of whitespace (spaces, tabs, newlines) — collapsed to one space.
_WHITESPACE_RE: re.Pattern[str] = re.compile(r"\s+")

# ---------------------------------------------------------------------------
# Emoji β†’ text mapping
# Covers the most common emojis appearing in stress-related social-media text.
# Each emoji is replaced by a semantically equivalent English word so that the
# model's hash-based tokeniser can process it.
# ---------------------------------------------------------------------------

_EMOJI_TEXT: dict[str, str] = {
    # Positive / happy
    "😊": "happy",
    "πŸ™‚": "happy",
    "πŸ˜€": "happy",
    "😁": "happy",
    "πŸ˜„": "happy",
    "πŸ˜ƒ": "happy",
    "πŸ˜†": "laughing",
    "πŸ˜‚": "laughing",
    "🀣": "laughing",
    "πŸ₯³": "happy",
    "πŸŽ‰": "happy",
    "😌": "calm",
    "😍": "love",
    "πŸ₯°": "love",
    # Sad / emotional
    "πŸ₯²": "emotional",
    "😒": "sad",
    "😭": "crying",
    "πŸ˜”": "sad",
    "😞": "sad",
    "😟": "sad",
    "πŸ’”": "heartbroken",
    # Stress / anxiety
    "😰": "anxious",
    "😨": "scared",
    "😱": "terrified",
    "πŸ˜“": "stressed",
    "πŸ˜₯": "upset",
    "🀯": "overwhelmed",
    "😀": "frustrated",
    "😑": "angry",
    "🀬": "angry",
    # Tired / sick
    "😫": "exhausted",
    "😩": "tired",
    "😴": "tired",
    "πŸ€•": "hurt",
    "πŸ€’": "sick",
    # Other
    "πŸ˜•": "confused",
    "😢": "speechless",
    "πŸ₯Ί": "hopeful",
    "❀️": "love",
    "πŸ’ͺ": "strong",
    "πŸ™": "grateful",
    "πŸ‘": "good",
    "πŸ‘Ž": "bad",
    "πŸ†˜": "help",
    "⚑": "stressed",
    "πŸ’€": "dying",
    "πŸ”₯": "intense",
}


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def clean_text(text: str, *, normalize_repeated: bool = True) -> str:
    """Clean and normalise text for model input.

    Applies, in order: HTML entity unescaping, tag stripping, emoji-to-word
    substitution, URL and e-mail removal, optional repeated-character
    compression, Unicode NFKC normalisation, and whitespace collapsing.

    Parameters
    ----------
    text : str
        Raw user input (social-media post, journal entry, chat message …).
    normalize_repeated : bool
        When ``True`` (default), compress long repeated-character runs
        (e.g. ``"soooooo"`` → ``"soo"``).

    Returns
    -------
    str
        Cleaned text ready for tokenisation, or an empty string when the
        input is empty or not a string.
    """
    # Guard: reject non-strings and falsy values (None, "") up front.
    if not isinstance(text, str) or not text:
        return ""

    # 1-2. Decode HTML entities, then strip any tags.  Unescaping first
    # means encoded markup like "&lt;b&gt;" is also removed by the tag pass.
    cleaned = html.unescape(text)
    cleaned = _HTML_TAG_RE.sub(" ", cleaned)

    # 3. Map known emojis to English words, padded with spaces so each one
    # tokenises as a separate word.  ``str.replace`` is a no-op for absent
    # keys, so no membership test is needed.
    for symbol, word in _EMOJI_TEXT.items():
        cleaned = cleaned.replace(symbol, f" {word} ")

    # 4-5. Drop URLs first, then e-mail addresses.
    cleaned = _EMAIL_RE.sub(" ", _URL_RE.sub(" ", cleaned))

    # 6. Optionally squash 4+ repeats of one character down to two.
    if normalize_repeated:
        cleaned = _REPEATED_CHAR_RE.sub(r"\1\1", cleaned)

    # 7. NFKC folds compatibility forms (ligatures, full-width chars, …).
    cleaned = unicodedata.normalize("NFKC", cleaned)

    # 8. Collapse whitespace runs and trim the ends.
    return _WHITESPACE_RE.sub(" ", cleaned).strip()