| """ |
| MindWatch — Data Preprocessing Module |
| Handles text cleaning, normalization, and tokenization for social media text. |
| """ |
|
|
| import re |
| import string |
| import nltk |
| from typing import List, Optional |
|
|
| |
# Make sure the NLTK resources this module needs are available locally.
# Each entry pairs the path handed to nltk.data.find() with the package name
# handed to nltk.download(). Both lookups treat LookupError and OSError the
# same way: a failed lookup triggers a quiet download of that package.
for _resource_path, _package_name in (
    ("tokenizers/punkt_tab", "punkt_tab"),
    ("corpora/stopwords", "stopwords"),
):
    try:
        nltk.data.find(_resource_path)
    except (LookupError, OSError):
        nltk.download(_package_name, quiet=True)
|
|
| from nltk.corpus import stopwords |
| from nltk.tokenize import word_tokenize |
|
|
# English stopword list from the NLTK corpus, held as a set for fast
# membership tests.
STOPWORDS = {*stopwords.words("english")}
|
|
| |
# Lower-case contraction -> expansion lookup.
# Insertion order matters to code that applies the entries one after another:
# the generic "n't" suffix rule sits third, right after "can't" and "won't".
CONTRACTIONS = {
    # Irregular negations, then the generic negation suffix.
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    # First person.
    "i'm": "i am",
    "i've": "i have",
    "i'll": "i will",
    "i'd": "i would",
    # "is" / "are" / "us" forms.
    "it's": "it is",
    "that's": "that is",
    "there's": "there is",
    "they're": "they are",
    "we're": "we are",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "here's": "here is",
    "where's": "where is",
    # Explicit negated auxiliaries.
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "couldn't": "could not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "mustn't": "must not",
    "needn't": "need not",
}
|
|
| |
# Words that stopword filtering must always keep, even when they also appear
# in the stopword list.
PRESERVE_WORDS = {
    # Negations and near-negations.
    "not",
    "no",
    "never",
    "nothing",
    "nobody",
    "nowhere",
    "neither",
    "nor",
    "hardly",
    "barely",
    "cannot",
    "can't",
    "won't",
    "don't",
    # Emotional-state vocabulary.
    "alone",
    "empty",
    "hopeless",
    "worthless",
    "helpless",
    "useless",
    "pointless",
    "tired",
    "exhausted",
}
|
|
|
|
def remove_urls(text: str) -> str:
    """Remove URLs from text.

    Strips ``http://`` / ``https://`` links and bare ``www.`` addresses
    (matched case-insensitively). Each alternative has to start at a word
    boundary, so ordinary words that merely contain the trigger letters,
    such as the interjection ``awww...`` or the word ``httpd``, are left
    untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched URL replaced by the empty string.
        Whitespace around a removed URL is not collapsed.
    """
    return re.sub(r"\bhttps?://\S+|\bwww\.\S+", "", text, flags=re.IGNORECASE)
|
|
|
|
def remove_html_tags(text: str) -> str:
    """Remove HTML tags.

    Only spans that look like markup are removed:

    * comments (``<!-- ... -->``), matched as a whole even when they contain
      ``>`` or line breaks;
    * opening, closing, and declaration-style tags, i.e. ``<`` optionally
      followed by ``/``, ``!`` or ``?`` and then an ASCII letter.

    A ``<`` followed by anything else (``<3``, ``x < y``) is treated as plain
    text, so a later unrelated ``>`` no longer swallows the words in between.

    Args:
        text: Input string.

    Returns:
        ``text`` with the matched tags replaced by the empty string; the
        content between tags is kept.
    """
    return re.sub(
        r"<!--.*?-->|<[!?/]?[A-Za-z][^>]*>",
        "",
        text,
        flags=re.DOTALL,
    )
|
|
|
|
def remove_mentions_hashtags(text: str) -> str:
    """Drop @mentions entirely and unwrap #hashtags into plain words.

    A mention is ``@`` followed by word characters and is deleted. A hashtag
    is ``#`` followed by word characters; only the ``#`` is dropped.
    """
    without_mentions = re.sub(r"@\w+", "", text)
    return re.sub(r"#(\w+)", r"\1", without_mentions)
|
|
|
|
# Compiled once at import time. The ranges are deliberately narrow: a single
# wide span from U+24C2 up to U+1F251 would also cover CJK ideographs, kana
# and Hangul, and would therefore delete ordinary non-Latin text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed symbols, flag letters
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled M
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji among CJK symbols
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text: str) -> str:
    """Remove emoji characters.

    Deletes every run of characters matched by ``_EMOJI_PATTERN``. Text in
    other scripts (Chinese, Japanese, Korean, ...) is left in place.

    Args:
        text: Input string.

    Returns:
        ``text`` without the matched emoji; nothing is inserted in their
        place, so surrounding whitespace is unchanged.
    """
    return _EMOJI_PATTERN.sub("", text)
|
|
|
|
# Either the ASCII apostrophe or the typographic one (U+2019).
_APOSTROPHE_CLASS = "['\u2019]"

# Whole-word, case-insensitive matcher for every CONTRACTIONS key except the
# generic "n't" suffix. Longer keys are tried first.
_WORD_CONTRACTION_RE = re.compile(
    r"\b(?:"
    + "|".join(
        _APOSTROPHE_CLASS.join(re.escape(part) for part in key.split("'"))
        for key in sorted(
            (k for k in CONTRACTIONS if k != "n't"), key=len, reverse=True
        )
    )
    + r")\b",
    flags=re.IGNORECASE,
)

# Generic negation suffix ("ain't" -> "ai not"); it has to match inside a
# word, so only the trailing boundary is required.
_NEGATION_SUFFIX_RE = re.compile(r"n['\u2019]t\b", flags=re.IGNORECASE)


def expand_contractions(text: str) -> str:
    """Expand contractions in text.

    Known contractions from ``CONTRACTIONS`` are matched as whole words and
    without regard to case, with either ``'`` or the typographic apostrophe,
    so ``"Can't"``, ``"I'm"`` and ``"don\u2019t"`` are all expanded. Matching
    whole words prevents a key from firing inside a longer word. Any
    remaining ``n't`` suffix is then rewritten with the generic ``"n't"``
    rule, if ``CONTRACTIONS`` defines one.

    Args:
        text: Input string.

    Returns:
        ``text`` with contractions replaced by their expansions. When the
        contraction started with an upper-case letter, the first letter of
        the expansion is upper-cased as well.
    """

    def _expand(match):
        found = match.group(0)
        expansion = CONTRACTIONS[found.lower().replace("\u2019", "'")]
        if expansion and found[0].isupper():
            expansion = expansion[0].upper() + expansion[1:]
        return expansion

    text = _WORD_CONTRACTION_RE.sub(_expand, text)

    suffix_expansion = CONTRACTIONS.get("n't")
    if suffix_expansion is not None:
        # A callable replacement keeps the expansion literal (no escape
        # processing of the replacement string).
        text = _NEGATION_SUFFIX_RE.sub(lambda _m: suffix_expansion, text)
    return text
|
|
|
|
def normalize_text(text: str) -> str:
    """Normalize whitespace and casing.

    Lower-cases the text, trims it, collapses every whitespace run to one
    space, and shortens elongated character runs.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    text = text.lower().strip()
    text = re.sub(r"\s+", " ", text)
    # Squeeze runs of three or more identical characters down to two
    # ("soooo" -> "soo", "!!!" -> "!!"). Digits are excluded because
    # shortening a digit run changes the number ("1000" would become "100").
    text = re.sub(r"(\D)\1{2,}", r"\1\1", text)
    return text
|
|
|
|
def remove_special_chars(text: str, keep_punctuation: bool = False) -> str:
    """Strip every character outside the allowed ASCII set.

    ASCII letters, digits and whitespace are always kept. When
    ``keep_punctuation`` is true, the characters ``. , ! ? '`` are kept too.
    """
    disallowed = (
        r"[^a-zA-Z0-9\s.,!?']" if keep_punctuation else r"[^a-zA-Z0-9\s]"
    )
    return re.compile(disallowed).sub("", text)
|
|
|
|
def tokenize(text: str, remove_stopwords: bool = False) -> List[str]:
    """Tokenize text with optional stopword removal (preserves mental-health terms).

    Args:
        text: Input string, split with NLTK's ``word_tokenize``.
        remove_stopwords: When true, drop tokens found in ``STOPWORDS``
            unless they are also in ``PRESERVE_WORDS``.

    Returns:
        The tokens in their original order and original casing.
    """
    tokens = word_tokenize(text)
    if not remove_stopwords:
        return tokens
    # Compare on the lower-cased token so that capitalised stopwords
    # ("The", "And") are filtered too and capitalised protected words
    # ("Not") stay protected.
    return [
        token
        for token in tokens
        if token.lower() in PRESERVE_WORDS or token.lower() not in STOPWORDS
    ]
|
|
|
|
def preprocess_text(
    text: str,
    remove_stops: bool = False,
    keep_punctuation: bool = True,
) -> str:
    """
    Full preprocessing pipeline for social media text.

    Steps:
    1. Remove HTML tags
    2. Remove URLs
    3. Remove mentions, convert hashtags
    4. Remove emojis
    5. Normalize text (lower-case, whitespace, elongated characters)
    6. Expand contractions
    7. Remove special characters
    8. (Optional) Remove stopwords

    Args:
        text: Raw input. Anything that is not a non-empty ``str`` yields ``""``.
        remove_stops: When true, tokenize the cleaned text, drop stopwords,
            and re-join the remaining tokens with single spaces.
        keep_punctuation: Forwarded to ``remove_special_chars``.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Check the type first: truth-testing an arbitrary object can raise.
    if not isinstance(text, str) or not text:
        return ""

    # Tags go before URLs: a URL match runs to the next whitespace, so a link
    # inside an attribute (<a href="http://...">text</a>) would otherwise
    # take the closing markup and the link text with it.
    text = remove_html_tags(text)
    text = remove_urls(text)
    text = remove_mentions_hashtags(text)
    text = remove_emojis(text)
    # Lower-case before expanding: the contraction table is lower-case, so
    # this lets "Can't" and "I'm" be expanded like "can't" and "i'm".
    text = normalize_text(text)
    text = expand_contractions(text)
    text = remove_special_chars(text, keep_punctuation=keep_punctuation)

    if remove_stops:
        tokens = tokenize(text, remove_stopwords=True)
        text = " ".join(tokens)

    return text.strip()
|
|
|
|
def extract_linguistic_features(text: str) -> dict:
    """
    Extract linguistic features relevant to mental health analysis.
    These act as auxiliary signals for the multi-signal model.

    Args:
        text: Raw input text. Non-string input is treated as an empty string.

    Returns:
        A dict with these keys:

        * ``word_count``: number of purely alphabetic tokens in the cleaned text
        * ``avg_word_length``: mean length of those tokens (0 when there are none)
        * ``negation_ratio``: share of tokens that are negation words
        * ``first_person_ratio``: share of tokens that are first-person pronouns
        * ``absolutist_ratio``: share of tokens that are absolutist words
        * ``question_marks`` / ``exclamation_marks``: counts of ``?`` and ``!``
          in the raw, uncleaned text
    """
    if not isinstance(text, str):
        # preprocess_text() maps non-string input to ""; do the same here so
        # the raw-text punctuation counts below do not raise.
        text = ""

    clean = preprocess_text(text)
    tokens = tokenize(clean)
    words = [t for t in tokens if t.isalpha()]

    # Negation markers.
    negation_words = {"not", "no", "never", "nothing", "nobody", "none",
                      "neither", "nor", "cannot", "hardly", "barely"}
    negation_count = sum(1 for w in words if w in negation_words)

    # First-person singular pronouns.
    first_person = {"i", "me", "my", "mine", "myself"}
    fp_count = sum(1 for w in words if w in first_person)

    # Absolutist vocabulary.
    absolutist = {"always", "never", "completely", "nothing", "everything",
                  "totally", "entirely", "absolutely", "constantly"}
    abs_count = sum(1 for w in words if w in absolutist)

    # Denominator for the ratios; 1 avoids dividing by zero on empty input.
    word_count = len(words) if words else 1

    return {
        "word_count": len(words),
        "avg_word_length": sum(len(w) for w in words) / word_count if words else 0,
        "negation_ratio": negation_count / word_count,
        "first_person_ratio": fp_count / word_count,
        "absolutist_ratio": abs_count / word_count,
        # Counted on the raw text, so punctuation inside URLs is included.
        "question_marks": text.count("?"),
        "exclamation_marks": text.count("!"),
    }
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: show the cleaned text and the extracted features for
    # a few hand-written posts.
    demo_posts = (
        "I feel completely exhausted and nothing seems to work anymore.",
        "I can't sleep again tonight. Everything feels pointless. #depression",
        "Had a great day with friends! 😊 https://t.co/example",
        "I don't see any point in continuing anymore...",
    )
    for post in demo_posts:
        print(f"Original : {post}")
        print(f"Cleaned : {preprocess_text(post)}")
        print(f"Features : {extract_linguistic_features(post)}")
        print()
|
|