| """ |
MindWatch — Dataset Loader
Loads, merges, and prepares multiple NLP datasets for mental health classification.

Datasets used:
1. dair-ai/emotion — 6 emotions (HuggingFace)
2. go_emotions (simplified) — 27 emotions plus neutral (HuggingFace)
3. Built-in curated samples — Reddit-style mental health posts
| """ |
|
|
| import pandas as pd |
| import numpy as np |
| from typing import Tuple |
| from datasets import load_dataset |
| from utils.preprocessing import preprocess_text |
| from tqdm import tqdm |
|
|
| |
# Collapses the six emotion names used for dair-ai/emotion into the four
# distress categories this module trains on.
EMOTION_TO_DISTRESS = dict(
    # Negative emotions each get their own distress category.
    sadness="depression",
    anger="stress",
    fear="anxiety",
    # Positive / neutral emotions are all treated as "normal".
    joy="normal",
    love="normal",
    surprise="normal",
)
|
|
| |
# Collapses GoEmotions emotion names into the four distress categories.
# NOTE(review): keys are grouped by category for readability; their order is
# not the dataset's class-id order, so do not index into this dict by label id.
GO_EMOTION_MAP = {
    emotion: category
    for category, emotions in (
        ("depression", ("sadness", "grief", "disappointment", "remorse")),
        ("stress", ("anger", "annoyance", "disgust", "embarrassment")),
        ("anxiety", ("fear", "nervousness", "confusion")),
        (
            "normal",
            (
                "joy",
                "amusement",
                "love",
                "admiration",
                "approval",
                "caring",
                "curiosity",
                "desire",
                "excitement",
                "gratitude",
                "optimism",
                "pride",
                "relief",
                "surprise",
                "realization",
            ),
        ),
        ("stress", ("disapproval",)),
        ("normal", ("neutral",)),
    )
    for emotion in emotions
}
|
|
| |
# Built-in (text, distress_label) pairs written in the style of Reddit mental
# health posts: 15 hand-written examples for each of the four categories.
CURATED_SAMPLES = [
    # --- depression ---
    ("I feel completely exhausted and nothing seems to work anymore.", "depression"),
    ("I can't remember the last time I felt happy about anything.", "depression"),
    ("Everything feels so heavy. I just want to stay in bed forever.", "depression"),
    ("I don't enjoy the things I used to love. Music, games, friends... nothing.", "depression"),
    ("What's the point of trying when everything always falls apart?", "depression"),
    ("I've been crying every night for weeks and I can't stop.", "depression"),
    ("I feel like I'm just existing, not living. There's no color in my world.", "depression"),
    ("Nobody would even notice if I disappeared tomorrow.", "depression"),
    ("I haven't showered in days. I just don't have the energy.", "depression"),
    ("My therapist says I'm making progress but I feel worse than ever.", "depression"),
    ("I smile at work but inside I feel absolutely empty.", "depression"),
    ("I can't eat. I can't sleep. I can't think straight anymore.", "depression"),
    ("Life feels like a punishment I didn't earn.", "depression"),
    ("I've lost interest in everything. Even getting out of bed feels impossible.", "depression"),
    ("The darkness never seems to end no matter what I do.", "depression"),

    # --- anxiety ---
    ("My heart is racing and I can't catch my breath for no reason.", "anxiety"),
    ("I keep thinking something terrible is about to happen.", "anxiety"),
    ("I woke up at 3am with a panic attack again. Third time this week.", "anxiety"),
    ("I'm terrified of going to work tomorrow. What if I mess everything up?", "anxiety"),
    ("My hands won't stop shaking. I feel like I'm losing control.", "anxiety"),
    ("I can't stop overthinking every single conversation I had today.", "anxiety"),
    ("What if everyone secretly hates me and they're just being polite?", "anxiety"),
    ("I feel like I'm on the edge of a cliff and about to fall.", "anxiety"),
    ("I check the locks five times before bed. I know it's irrational but I can't stop.", "anxiety"),
    ("The thought of meeting new people makes me physically sick.", "anxiety"),
    ("I replay embarrassing moments from years ago and feel the shame all over again.", "anxiety"),
    ("My chest feels tight and I'm convinced something is wrong with my heart.", "anxiety"),
    ("I catastrophize everything. A headache becomes a brain tumor in my mind.", "anxiety"),
    ("Social situations drain me completely. I need days to recover.", "anxiety"),
    ("I've been avoiding phone calls because they trigger my anxiety.", "anxiety"),

    # --- stress ---
    ("Work deadlines are piling up and I can barely keep up.", "stress"),
    ("I haven't had a day off in weeks. I'm running on fumes.", "stress"),
    ("My boss expects me to do the work of three people.", "stress"),
    ("Finals are next week and I haven't started studying. I'm freaking out.", "stress"),
    ("The bills keep coming and I don't know how to pay them all.", "stress"),
    ("I snapped at my kids today because I'm so overwhelmed with everything.", "stress"),
    ("I feel like I'm juggling a hundred things and dropping them all.", "stress"),
    ("Moving to a new city with no support system is breaking me.", "stress"),
    ("I grind my teeth at night from all the stress. My jaw hurts constantly.", "stress"),
    ("Every day feels like putting out fires. There's no time to breathe.", "stress"),
    ("My relationship is falling apart and work is terrible. I can't handle both.", "stress"),
    ("I've been stress eating and gained 20 pounds this semester.", "stress"),
    ("The pressure to perform is crushing me. I feel like I'll crack.", "stress"),
    ("I have so many responsibilities I don't even know where to start.", "stress"),
    # Em dash restored here; the source carried a mis-decoded character.
    ("Traffic, deadlines, bills, family drama — it never stops.", "stress"),

    # --- normal ---
    ("Had a great day hiking with friends! The sunset was beautiful.", "normal"),
    ("Just finished reading a really good book. Highly recommend it.", "normal"),
    ("Excited about my new job starting next month!", "normal"),
    ("Made pasta from scratch today and it turned out amazing.", "normal"),
    ("Spent the afternoon playing video games. Pretty relaxing day.", "normal"),
    ("The weather is finally getting warmer. Love spring!", "normal"),
    ("My dog learned a new trick today. So proud of the little guy.", "normal"),
    ("Going to a concert this weekend. Can't wait!", "normal"),
    ("Just had a productive morning. Feeling good about this week.", "normal"),
    ("Called my mom today. It's always nice to hear her voice.", "normal"),
    # Em dash restored here as well.
    ("Started a new hobby — painting. Not great at it yet but it's fun.", "normal"),
    ("The coffee shop near my apartment has the best lattes.", "normal"),
    ("Finished a 5K run today. Personal best time!", "normal"),
    ("Movie night with friends was exactly what I needed.", "normal"),
    ("Learning to cook has been one of the best decisions I've made.", "normal"),
]
|
|
|
|
def load_emotion_dataset(max_samples: int = 10000) -> pd.DataFrame:
    """Fetch the dair-ai/emotion train split and relabel it with distress categories.

    Args:
        max_samples: Soft cap on the number of rows. When truthy and exceeded,
            each distress category is down-sampled to at most
            ``max_samples // 4`` rows (seeded, so the draw is repeatable).

    Returns:
        DataFrame with exactly two columns, ``text`` and ``distress_label``.
    """
    print("Loading dair-ai/emotion dataset...")
    frame = pd.DataFrame(load_dataset("dair-ai/emotion", split="train"))

    # Integer class id -> emotion name, then emotion name -> distress category.
    id_to_emotion = {0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"}
    frame["distress_label"] = frame["label"].map(id_to_emotion).map(EMOTION_TO_DISTRESS)
    # Rows whose id or emotion had no mapping come out as NaN; discard them.
    frame = frame.dropna(subset=["distress_label"])

    if max_samples and len(frame) > max_samples:
        per_category = max_samples // 4

        def _take(group: pd.DataFrame) -> pd.DataFrame:
            # Small categories are kept whole rather than over-sampled.
            return group.sample(min(len(group), per_category), random_state=42)

        categories = frame["distress_label"]
        frame = pd.concat(
            [_take(frame[categories == name]) for name in categories.unique()],
            ignore_index=True,
        )

    return frame[["text", "distress_label"]].reset_index(drop=True)
|
|
|
|
# GoEmotions class names in class-id order (alphabetical, with "neutral" last),
# as listed on the dataset card. Used only as a fallback when the loaded
# dataset object does not expose its own label names.
_GO_EMOTIONS_LABEL_NAMES = (
    "admiration",
    "amusement",
    "anger",
    "annoyance",
    "approval",
    "caring",
    "confusion",
    "curiosity",
    "desire",
    "disappointment",
    "disapproval",
    "disgust",
    "embarrassment",
    "excitement",
    "fear",
    "gratitude",
    "grief",
    "joy",
    "love",
    "nervousness",
    "optimism",
    "pride",
    "realization",
    "relief",
    "remorse",
    "sadness",
    "surprise",
    "neutral",
)


def load_go_emotions_dataset(max_samples: int = 10000) -> pd.DataFrame:
    """Load GoEmotions (simplified) and map to distress categories.

    Only rows carrying exactly one label are kept, since multi-label rows have
    no single distress category. Each label id is decoded with the dataset's
    own class-name list and then translated through ``GO_EMOTION_MAP``.

    Args:
        max_samples: Soft cap on the number of rows. When truthy and exceeded,
            each distress category is down-sampled to at most
            ``max_samples // 4`` rows (seeded, so the draw is repeatable).

    Returns:
        DataFrame with columns ``text`` and ``distress_label``; the columns are
        present even when no row survives the filtering.
    """
    print("Loading GoEmotions dataset...")
    ds = load_dataset("go_emotions", "simplified", split="train")

    # Label ids must be decoded with the dataset's class-id order. The key order
    # of GO_EMOTION_MAP is grouped by category and does NOT match it, so using
    # it as the id -> name table would mislabel nearly every row.
    try:
        label_names = list(ds.features["labels"].feature.names)
    except (AttributeError, KeyError, TypeError):
        label_names = list(_GO_EMOTIONS_LABEL_NAMES)

    df = pd.DataFrame(ds)

    rows = []
    pairs = zip(df["text"], df["labels"])
    for text, labels in tqdm(pairs, total=len(df), desc="Mapping GoEmotions"):
        if len(labels) != 1:
            continue
        idx = labels[0]
        if not 0 <= idx < len(label_names):
            continue
        distress = GO_EMOTION_MAP.get(label_names[idx])
        if distress:
            rows.append({"text": text, "distress_label": distress})

    # Explicit columns keep the schema stable when `rows` is empty.
    result = pd.DataFrame(rows, columns=["text", "distress_label"])

    if max_samples and len(result) > max_samples:
        per_category = max_samples // 4
        samples = []
        for label in result["distress_label"].unique():
            group = result[result["distress_label"] == label]
            samples.append(group.sample(min(len(group), per_category), random_state=42))
        result = pd.concat(samples, ignore_index=True)

    return result.reset_index(drop=True)
|
|
|
|
def load_curated_samples() -> pd.DataFrame:
    """Return the module's built-in curated samples as a two-column DataFrame.

    Returns:
        DataFrame with columns ``text`` and ``distress_label``, one row per
        entry of ``CURATED_SAMPLES``.
    """
    print("Loading curated mental health samples...")
    column_names = ["text", "distress_label"]
    curated = pd.DataFrame(data=CURATED_SAMPLES, columns=column_names)
    return curated
|
|
|
|
def load_all_datasets(
    max_emotion: int = 8000,
    max_go_emotions: int = 8000,
    preprocess: bool = True,
) -> Tuple[pd.DataFrame, dict]:
    """
    Load and merge all available datasets.

    The two HuggingFace sources are best-effort: if either fails to load, the
    error is reported and the merge continues with whatever is available. The
    built-in curated samples are always included.

    Args:
        max_emotion: Sample cap passed to ``load_emotion_dataset``.
        max_go_emotions: Sample cap passed to ``load_go_emotions_dataset``.
        preprocess: When True, ``clean_text`` is produced by ``preprocess_text``
            and rows whose cleaned text is 10 characters or shorter are dropped.
            When False, ``clean_text`` is a copy of the raw ``text`` so the
            returned schema is the same either way.

    Returns:
        df: Merged DataFrame with columns
            ['text', 'distress_label', 'source', 'clean_text', 'label_id']
        label_map: Dict mapping label names to integer IDs
    """
    dfs = []

    # (source tag, display name, loader, sample cap) for each remote dataset.
    remote_sources = (
        ("emotion", "Emotion dataset", load_emotion_dataset, max_emotion),
        ("go_emotions", "GoEmotions dataset", load_go_emotions_dataset, max_go_emotions),
    )
    for source, title, loader, cap in remote_sources:
        try:
            part = loader(max_samples=cap)
            part["source"] = source
        except Exception as e:
            # Deliberately broad: a failure of one source must not abort the merge.
            print(f" [failed] {title} failed: {e}")
            continue
        dfs.append(part)
        print(f" [ok] {title}: {len(part)} samples")

    df_curated = load_curated_samples()
    df_curated["source"] = "curated"
    dfs.append(df_curated)
    print(f" [ok] Curated samples: {len(df_curated)} samples")

    # Merge; when the same text appears in several sources the first one wins.
    df = pd.concat(dfs, ignore_index=True)
    df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)

    if preprocess:
        print("Preprocessing text...")
        df["clean_text"] = df["text"].apply(preprocess_text)
        df = df[df["clean_text"].str.len() > 10].reset_index(drop=True)
    else:
        # Keep the documented column present even when cleaning is skipped.
        df["clean_text"] = df["text"]

    # Sorting the label names keeps the integer ids stable across runs.
    df = df.dropna(subset=["distress_label"]).reset_index(drop=True)
    label_map = {label: idx for idx, label in enumerate(sorted(df["distress_label"].unique()))}
    df["label_id"] = df["distress_label"].map(label_map)

    print(f"\nFinal dataset: {len(df)} samples")
    print(f" Label distribution:\n{df['distress_label'].value_counts().to_string()}")
    print(f" Label map: {label_map}")

    return df, label_map
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: load with reduced caps and show the first ten rows.
    df, label_map = load_all_datasets(max_go_emotions=2000, max_emotion=2000)
    preview = df.head(10).to_string()
    print(f"\nSample data:\n{preview}")
|
|