File size: 6,651 Bytes
4c423a1
 
 
 
 
 
 
 
 
 
 
 
 
22d72ea
4c423a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""
MindWatch — Data Preprocessing Module
Handles text cleaning, normalization, and tokenization for social media text.
"""

import re
import string
import nltk
from typing import List, Optional

# Make sure the NLTK resources this module relies on are present, downloading
# them quietly when they are not (handled gracefully for HF Spaces).
# Both resources get the same guard: a lookup that fails with OSError is
# treated exactly like a missing resource (LookupError).
for _resource_path, _package_id in (
    ("tokenizers/punkt_tab", "punkt_tab"),   # used by word_tokenize
    ("corpora/stopwords", "stopwords"),      # used to build STOPWORDS
):
    try:
        nltk.data.find(_resource_path)
    except (LookupError, OSError):
        nltk.download(_package_id, quiet=True)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK's English stopword list, held as a set for fast membership tests
# in tokenize().
STOPWORDS = set(stopwords.words("english"))

# Contractions map for expansion: contraction -> expanded form.
# All keys are lower-case and use the ASCII apostrophe (').
# "can't" and "won't" are irregular and therefore listed explicitly; the
# generic "n't" entry maps the suffix of any other negative contraction to
# " not" (e.g. "hadn't" -> "had not").
CONTRACTIONS = {
    "can't": "cannot", "won't": "will not", "n't": " not",
    "i'm": "i am", "i've": "i have", "i'll": "i will",
    "i'd": "i would", "it's": "it is", "that's": "that is",
    "there's": "there is", "they're": "they are", "we're": "we are",
    "you're": "you are", "he's": "he is", "she's": "she is",
    "let's": "let us", "who's": "who is", "what's": "what is",
    "here's": "here is", "where's": "where is", "don't": "do not",
    "doesn't": "does not", "didn't": "did not", "isn't": "is not",
    "aren't": "are not", "wasn't": "was not", "weren't": "were not",
    "hasn't": "has not", "haven't": "have not", "hadn't": "had not",
    "couldn't": "could not", "wouldn't": "would not", "shouldn't": "should not",
    "mustn't": "must not", "needn't": "need not",
}

# Mental-health-specific terms to preserve (not remove as stopwords).
# tokenize() keeps a token found here even when it is also in STOPWORDS, so
# negations and distress vocabulary survive stopword filtering.
# NOTE(review): NLTK's word_tokenize usually splits contractions into two
# tokens ("do" + "n't"), so the "can't" / "won't" / "don't" entries may never
# match a token -- confirm against the tokenizer in use.
PRESERVE_WORDS = {
    "not", "no", "never", "nothing", "nobody", "nowhere", "neither",
    "nor", "hardly", "barely", "alone", "empty", "hopeless", "worthless",
    "helpless", "useless", "pointless", "tired", "exhausted", "can't",
    "cannot", "won't", "don't",
}


def remove_urls(text: str) -> str:
    """Strip web links from *text*.

    Every run of non-whitespace characters that starts with ``http`` or
    ``www.`` is deleted; surrounding whitespace is left as it was.
    """
    link_pattern = re.compile(r"http\S+|www\.\S+")
    return link_pattern.sub("", text)


def remove_html_tags(text: str) -> str:
    """Delete anything that looks like an HTML tag.

    A tag is ``<`` followed by one or more characters other than ``>`` and
    then a closing ``>``; the text between tags is kept.
    """
    tag_pattern = re.compile(r"<[^>]+>")
    return tag_pattern.sub("", text)


def remove_mentions_hashtags(text: str) -> str:
    """Drop ``@mentions`` entirely and turn ``#hashtags`` into plain words.

    The ``#`` sign is removed while the tag text itself stays in place.
    """
    without_mentions = re.sub(r"@\w+", "", text)
    # Keep the captured tag text, discard only the leading '#'.
    return re.sub(r"#(\w+)", r"\1", without_mentions)


# Compiled once at import time.  The ranges are deliberately narrow: a single
# catch-all span such as U+24C2..U+1F251 would also cover CJK ideographs,
# Kana, Hangul and many other scripts, deleting ordinary non-English text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U00002600-\U000027BF"  # miscellaneous symbols & dingbats
    "\U00002B50\U00002B55"   # star, heavy circle
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text: str) -> str:
    """Remove emoji characters.

    Only code points in emoji/pictograph blocks (plus the emoji variation
    selector) are stripped; letters of any script are left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with every run of emoji characters deleted.  Whitespace
        around a removed emoji is not collapsed.
    """
    return _EMOJI_PATTERN.sub("", text)


def expand_contractions(text: str) -> str:
    """Expand contractions in text.

    Every key of ``CONTRACTIONS`` is matched case-insensitively, and both the
    ASCII apostrophe (') and the typographic one (U+2019) are accepted, so
    "I'm", "DON'T" and "can\u2019t" are expanded just like their lower-case
    ASCII spellings.  The capitalisation of the contraction is carried over
    to the expansion ("I'm" -> "I am", "DON'T" -> "DO NOT").

    Args:
        text: Input string.

    Returns:
        ``text`` with every recognised contraction replaced by its expansion.
    """
    apostrophe = "['\u2019]"
    # Longest keys first so a specific entry is always preferred over a
    # shorter one that could start at the same position.
    ordered_keys = sorted(CONTRACTIONS, key=len, reverse=True)
    alternation = "|".join(
        apostrophe.join(re.escape(part) for part in key.split("'"))
        for key in ordered_keys
    )
    # re.ASCII restricts case-insensitive matching to ASCII letters, which
    # guarantees the lower-cased match is one of the (ASCII) table keys.
    pattern = re.compile(alternation, flags=re.IGNORECASE | re.ASCII)

    def _expand(match):
        found = match.group(0)
        expanded = CONTRACTIONS[found.lower().replace("\u2019", "'")]
        if found.isupper():
            return expanded.upper()
        if found[0].isupper():
            return expanded[0].upper() + expanded[1:]
        return expanded

    return pattern.sub(_expand, text)


def normalize_text(text: str) -> str:
    """Normalize casing, whitespace and character elongation.

    The text is lower-cased and stripped, every run of whitespace becomes a
    single space, and any non-digit character repeated three or more times in
    a row is cut down to two (e.g. "soooo" -> "soo", "!!!" -> "!!").

    Digits are exempt from the collapse so that numbers keep their value
    ("1000" stays "1000").

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    text = text.lower().strip()
    text = re.sub(r"\s+", " ", text)
    # \D instead of "." : collapsing repeated digits would corrupt numbers.
    text = re.sub(r"(\D)\1{2,}", r"\1\1", text)
    return text


def remove_special_chars(text: str, keep_punctuation: bool = False) -> str:
    """Strip every character that is not an ASCII letter, digit or whitespace.

    When ``keep_punctuation`` is true, the basic punctuation marks
    ``. , ! ? '`` are retained as well.
    """
    if keep_punctuation:
        unwanted = r"[^a-zA-Z0-9\s.,!?']"
    else:
        unwanted = r"[^a-zA-Z0-9\s]"
    return re.sub(unwanted, "", text)


def tokenize(text: str, remove_stopwords: bool = False) -> List[str]:
    """Tokenize text with optional stopword removal (preserves mental-health terms).

    Stopword filtering compares the lower-cased token against ``STOPWORDS``
    and ``PRESERVE_WORDS``, so capitalised stopwords such as "The" are
    filtered the same way as "the".  Tokens that are kept are returned with
    their original casing.

    Args:
        text: Text to split with NLTK's ``word_tokenize``.
        remove_stopwords: When true, drop tokens that are in ``STOPWORDS``
            unless they are also listed in ``PRESERVE_WORDS``.

    Returns:
        The list of tokens, in order of appearance.
    """
    tokens = word_tokenize(text)
    if not remove_stopwords:
        return tokens

    kept = []
    for token in tokens:
        lowered = token.lower()
        # A preserved term always wins over the stopword list.
        if lowered in PRESERVE_WORDS or lowered not in STOPWORDS:
            kept.append(token)
    return kept


def preprocess_text(
    text: str,
    remove_stops: bool = False,
    keep_punctuation: bool = True,
) -> str:
    """
    Full preprocessing pipeline for social media text.

    Steps:
        1. Remove HTML tags
        2. Remove URLs
        3. Remove mentions, convert hashtags
        4. Remove emojis
        5. Lower-case and expand contractions
        6. Normalize text
        7. Remove special characters
        8. (Optional) Remove stopwords

    Args:
        text: Raw input text.  Empty or non-string input yields "".
        remove_stops: When true, tokenize the cleaned text and drop
            stopwords (terms in ``PRESERVE_WORDS`` are kept).
        keep_punctuation: Passed to ``remove_special_chars``; when true the
            marks ``. , ! ? '`` survive cleaning.

    Returns:
        The cleaned, lower-cased text with surrounding whitespace stripped.
    """
    if not text or not isinstance(text, str):
        return ""

    # Tags are stripped before URLs: the URL pattern runs up to the next
    # whitespace, so on '<a href="http://x">link</a>' it would swallow the
    # closing '>' and leave an unterminated tag behind.
    text = remove_html_tags(text)
    text = remove_urls(text)
    text = remove_mentions_hashtags(text)
    text = remove_emojis(text)
    # The contraction table is keyed in lower case; lower-casing first makes
    # sure "I'm" / "Don't" are expanded.  The result is lower-cased by
    # normalize_text anyway, so this does not change the output otherwise.
    text = expand_contractions(text.lower())
    text = normalize_text(text)
    text = remove_special_chars(text, keep_punctuation=keep_punctuation)

    if remove_stops:
        text = " ".join(tokenize(text, remove_stopwords=True))

    return text.strip()


def extract_linguistic_features(text: str) -> dict:
    """
    Extract linguistic features relevant to mental health analysis.
    These act as auxiliary signals for the multi-signal model.

    Word-based features are computed on the output of ``preprocess_text``
    (alphabetic tokens only); the punctuation counts are taken from the raw
    input.  ``None`` or any other non-string input is treated as empty text
    and yields all-zero features instead of raising.

    Args:
        text: Raw input text.

    Returns:
        A dict with the keys ``word_count``, ``avg_word_length``,
        ``negation_ratio``, ``first_person_ratio``, ``absolutist_ratio``,
        ``question_marks`` and ``exclamation_marks``.  Ratios are relative to
        the number of alphabetic tokens and are 0 when there are none.
    """
    # preprocess_text already tolerates non-string input; do the same here so
    # the raw-text punctuation counts below cannot fail on None.
    if not isinstance(text, str):
        text = ""

    clean = preprocess_text(text)
    words = [token for token in tokenize(clean) if token.isalpha()]

    # Negation words
    negation_words = {"not", "no", "never", "nothing", "nobody", "none",
                      "neither", "nor", "cannot", "hardly", "barely"}
    # First-person pronouns (self-focus indicator)
    first_person = {"i", "me", "my", "mine", "myself"}
    # Absolutist words (linked to anxiety/depression research)
    absolutist = {"always", "never", "completely", "nothing", "everything",
                  "totally", "entirely", "absolutely", "constantly"}

    negation_count = sum(1 for w in words if w in negation_words)
    fp_count = sum(1 for w in words if w in first_person)
    abs_count = sum(1 for w in words if w in absolutist)

    # Denominator of at least 1 keeps the ratios defined for empty input.
    denominator = len(words) or 1

    return {
        "word_count": len(words),
        "avg_word_length": sum(len(w) for w in words) / denominator if words else 0.0,
        "negation_ratio": negation_count / denominator,
        "first_person_ratio": fp_count / denominator,
        "absolutist_ratio": abs_count / denominator,
        "question_marks": text.count("?"),
        "exclamation_marks": text.count("!"),
    }


if __name__ == "__main__":
    # Manual smoke test: show the cleaned text and the extracted features
    # for a handful of representative posts.
    demo_posts = [
        "I feel completely exhausted and nothing seems to work anymore.",
        "I can't sleep again tonight. Everything feels pointless. #depression",
        "Had a great day with friends! 😊 https://t.co/example",
        "I don't see any point in continuing anymore...",
    ]
    for post in demo_posts:
        print(f"Original : {post}")
        cleaned_post = preprocess_text(post)
        print(f"Cleaned  : {cleaned_post}")
        post_features = extract_linguistic_features(post)
        print(f"Features : {post_features}")
        print()