# NOTE(review): removed "Spaces: Running Running" — Hugging Face Spaces UI
# status banner accidentally pasted into the source file; not Python code.
| """ | |
| utils/text_preprocessing.py | |
| ============================ | |
| Industry-grade text preprocessing pipeline for stress detection. | |
| Operations (applied in order) | |
| ------------------------------ | |
| 1. HTML entity unescaping (``&`` β ``&``, ``'`` β ``'``) | |
| 2. HTML tag stripping (``<b>hello</b>`` β ``hello``) | |
| 3. Emoji-to-text mapping (``π°`` β ``anxious``) | |
| 4. URL removal (``https://β¦`` β space) | |
| 5. E-mail removal (``user@host.com`` β space) | |
| 6. Repeated-char compression (``"soooo"`` β ``"soo"``) | |
| 7. Unicode NFKC normalisation (ligatures, half-width chars, etc.) | |
| 8. Whitespace normalisation (collapse runs, strip leading/trailing) | |
| Design notes | |
| ------------ | |
| - All operations are conservative: they strip noise without destroying | |
| meaningful content. | |
| - No external dependencies beyond the Python standard library. | |
| - ``clean_text`` is the primary public function. It is called by both the | |
| training pipeline (``training/train.py``) and the inference API | |
| (``api/main.py``) to ensure **identical preprocessing** at train and | |
| inference time β a critical requirement for consistent predictions. | |
| """ | |
| from __future__ import annotations | |
| import html | |
| import re | |
| import unicodedata | |
# ---------------------------------------------------------------------------
# Compiled regex patterns
# ---------------------------------------------------------------------------
# Capped tag length (<200 chars) to guard against ReDoS on malformed HTML.
_HTML_TAG_RE = re.compile(r"<[^>]{0,200}>")
_URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
_EMAIL_RE = re.compile(
    r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"
)
# Compress 4+ repetitions of the same character ("sooooo" → "soo").
# The replacement keeps exactly two, so intentional emphasis survives in a
# normalised form without exploding the vocabulary.
_REPEATED_CHAR_RE = re.compile(r"(.)\1{3,}")
_WHITESPACE_RE = re.compile(r"\s+")

# ---------------------------------------------------------------------------
# Emoji → text mapping
# Covers the most common emojis appearing in stress-related social-media text.
# Each emoji is replaced by a semantically equivalent English word so that the
# model's hash-based tokeniser can process it.
#
# NOTE(review): the emoji keys in the checked-in copy were mojibake (UTF-8
# bytes decoded as ISO-8859-7, e.g. "π°" for 😰).  They have been restored to
# real emoji here; entries whose trailing bytes were unrecoverable were
# reconstructed from their mapped word — confirm against the original source.
# ---------------------------------------------------------------------------
_EMOJI_TEXT: dict[str, str] = {
    # Positive / happy
    "😀": "happy",
    "😃": "happy",
    "😄": "happy",
    "😁": "happy",
    "🙂": "happy",
    "😊": "happy",
    "😂": "laughing",
    "😆": "laughing",
    "🤣": "laughing",
    "🥳": "happy",
    "😇": "happy",
    "😌": "calm",
    "😍": "love",
    "🥰": "love",
    # Sad / emotional
    "🥲": "emotional",
    "😢": "sad",
    "😭": "crying",
    "😞": "sad",
    "😔": "sad",
    "😟": "sad",
    "💔": "heartbroken",
    # Stress / anxiety
    "😰": "anxious",
    "😨": "scared",
    "😱": "terrified",
    "😖": "stressed",
    "😥": "upset",
    "🤯": "overwhelmed",
    "😤": "frustrated",
    "😡": "angry",
    "🤬": "angry",
    # Tired / sick
    "😫": "exhausted",
    "😩": "tired",
    "😴": "tired",
    "🤕": "hurt",
    "🤒": "sick",
    # Other
    "😕": "confused",
    "😶": "speechless",
    "🥺": "hopeful",
    "❤️": "love",
    "💪": "strong",
    "🙏": "grateful",
    "👍": "good",
    "👎": "bad",
    "🆘": "help",
    "⚡": "stressed",
    "💀": "dying",
    "🔥": "intense",
}


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def clean_text(text: str, *, normalize_repeated: bool = True) -> str:
    """Clean and normalise text for model input.

    Parameters
    ----------
    text : str
        Raw user input (social-media post, journal entry, chat message …).
    normalize_repeated : bool
        When ``True`` (default), compress long repeated-character runs
        (e.g. ``"soooooo"`` → ``"soo"``).

    Returns
    -------
    str
        Cleaned text ready for tokenisation, or an empty string when the
        input is empty or not a string.
    """
    if not text or not isinstance(text, str):
        return ""
    # 1. HTML entity unescaping (must precede tag stripping so that encoded
    #    tags like &lt;b&gt; are also removed).
    text = html.unescape(text)
    # 2. Strip HTML/XML tags
    text = _HTML_TAG_RE.sub(" ", text)
    # 3. Emoji → text.  Replacements are padded with spaces so adjacent
    #    emojis become separate tokens; step 8 collapses the extra runs.
    for emoji_char, replacement in _EMOJI_TEXT.items():
        if emoji_char in text:
            text = text.replace(emoji_char, f" {replacement} ")
    # 4. Remove URLs
    text = _URL_RE.sub(" ", text)
    # 5. Remove e-mail addresses
    text = _EMAIL_RE.sub(" ", text)
    # 6. Repeated-character compression
    if normalize_repeated:
        text = _REPEATED_CHAR_RE.sub(r"\1\1", text)
    # 7. Unicode NFKC normalisation (ligatures, full-width forms, etc.)
    text = unicodedata.normalize("NFKC", text)
    # 8. Collapse whitespace runs and trim the ends
    text = _WHITESPACE_RE.sub(" ", text).strip()
    return text