mindwatch / data /dataset_loader.py
priyadip's picture
Fix pandas 2.x groupby.apply() column loss and NaN in distress_label sort
86e7a33 verified
"""
MindWatch β€” Dataset Loader
Loads, merges, and prepares multiple NLP datasets for mental health classification.
Datasets used:
1. dair-ai/emotion β€” 6 emotions (HuggingFace)
2. go_emotions (simplified) β€” 27 emotions (HuggingFace)
3. Built-in curated samples β€” Reddit-style mental health posts
"""
import pandas as pd
import numpy as np
from typing import Tuple
from datasets import load_dataset
from utils.preprocessing import preprocess_text
from tqdm import tqdm
# Mapping from fine-grained emotion labels -> our 4 target classes.
# dair-ai/emotion label ids: 0=sadness, 1=joy, 2=love, 3=anger, 4=fear, 5=surprise
EMOTION_TO_DISTRESS = {
    emotion: distress
    # Each entry pairs one target class with the source emotions folded into it.
    for distress, emotions in (
        ("depression", ("sadness",)),
        ("stress", ("anger",)),
        ("anxiety", ("fear",)),
        ("normal", ("joy", "love", "surprise")),
    )
    for emotion in emotions
}
# GoEmotions -> distress mapping.
#
# Keys are listed in the official GoEmotions label-id order (alphabetical, with
# "neutral" last; the id is shown at the right), so that list(GO_EMOTION_MAP)
# lines up with the dataset's integer label ids. Keep this order when editing.
GO_EMOTION_MAP = {
    "admiration": "normal",          # 0
    "amusement": "normal",           # 1
    "anger": "stress",               # 2
    "annoyance": "stress",           # 3
    "approval": "normal",            # 4
    "caring": "normal",              # 5
    "confusion": "anxiety",          # 6
    "curiosity": "normal",           # 7
    "desire": "normal",              # 8
    "disappointment": "depression",  # 9
    "disapproval": "stress",         # 10
    "disgust": "stress",             # 11
    "embarrassment": "stress",       # 12
    "excitement": "normal",          # 13
    "fear": "anxiety",               # 14
    "gratitude": "normal",           # 15
    "grief": "depression",           # 16
    "joy": "normal",                 # 17
    "love": "normal",                # 18
    "nervousness": "anxiety",        # 19
    "optimism": "normal",            # 20
    "pride": "normal",               # 21
    "realization": "normal",         # 22
    "relief": "normal",              # 23
    "remorse": "depression",         # 24
    "sadness": "depression",         # 25
    "surprise": "normal",            # 26
    "neutral": "normal",             # 27
}
# Curated Reddit-style mental health samples for enrichment.
# Each entry is a (text, distress_label) pair; there are 15 samples per class.
CURATED_SAMPLES = [
    # Depression
    ("I feel completely exhausted and nothing seems to work anymore.", "depression"),
    ("I can't remember the last time I felt happy about anything.", "depression"),
    ("Everything feels so heavy. I just want to stay in bed forever.", "depression"),
    ("I don't enjoy the things I used to love. Music, games, friends... nothing.", "depression"),
    ("What's the point of trying when everything always falls apart?", "depression"),
    ("I've been crying every night for weeks and I can't stop.", "depression"),
    ("I feel like I'm just existing, not living. There's no color in my world.", "depression"),
    ("Nobody would even notice if I disappeared tomorrow.", "depression"),
    ("I haven't showered in days. I just don't have the energy.", "depression"),
    ("My therapist says I'm making progress but I feel worse than ever.", "depression"),
    ("I smile at work but inside I feel absolutely empty.", "depression"),
    ("I can't eat. I can't sleep. I can't think straight anymore.", "depression"),
    ("Life feels like a punishment I didn't earn.", "depression"),
    ("I've lost interest in everything. Even getting out of bed feels impossible.", "depression"),
    ("The darkness never seems to end no matter what I do.", "depression"),
    # Anxiety
    ("My heart is racing and I can't catch my breath for no reason.", "anxiety"),
    ("I keep thinking something terrible is about to happen.", "anxiety"),
    ("I woke up at 3am with a panic attack again. Third time this week.", "anxiety"),
    ("I'm terrified of going to work tomorrow. What if I mess everything up?", "anxiety"),
    ("My hands won't stop shaking. I feel like I'm losing control.", "anxiety"),
    ("I can't stop overthinking every single conversation I had today.", "anxiety"),
    ("What if everyone secretly hates me and they're just being polite?", "anxiety"),
    ("I feel like I'm on the edge of a cliff and about to fall.", "anxiety"),
    ("I check the locks five times before bed. I know it's irrational but I can't stop.", "anxiety"),
    ("The thought of meeting new people makes me physically sick.", "anxiety"),
    ("I replay embarrassing moments from years ago and feel the shame all over again.", "anxiety"),
    ("My chest feels tight and I'm convinced something is wrong with my heart.", "anxiety"),
    ("I catastrophize everything. A headache becomes a brain tumor in my mind.", "anxiety"),
    ("Social situations drain me completely. I need days to recover.", "anxiety"),
    ("I've been avoiding phone calls because they trigger my anxiety.", "anxiety"),
    # Stress
    ("Work deadlines are piling up and I can barely keep up.", "stress"),
    ("I haven't had a day off in weeks. I'm running on fumes.", "stress"),
    ("My boss expects me to do the work of three people.", "stress"),
    ("Finals are next week and I haven't started studying. I'm freaking out.", "stress"),
    ("The bills keep coming and I don't know how to pay them all.", "stress"),
    ("I snapped at my kids today because I'm so overwhelmed with everything.", "stress"),
    ("I feel like I'm juggling a hundred things and dropping them all.", "stress"),
    ("Moving to a new city with no support system is breaking me.", "stress"),
    ("I grind my teeth at night from all the stress. My jaw hurts constantly.", "stress"),
    ("Every day feels like putting out fires. There's no time to breathe.", "stress"),
    ("My relationship is falling apart and work is terrible. I can't handle both.", "stress"),
    ("I've been stress eating and gained 20 pounds this semester.", "stress"),
    ("The pressure to perform is crushing me. I feel like I'll crack.", "stress"),
    ("I have so many responsibilities I don't even know where to start.", "stress"),
    # Em dash restored here; the previous text carried a mis-encoded byte sequence.
    ("Traffic, deadlines, bills, family drama — it never stops.", "stress"),
    # Normal
    ("Had a great day hiking with friends! The sunset was beautiful.", "normal"),
    ("Just finished reading a really good book. Highly recommend it.", "normal"),
    ("Excited about my new job starting next month!", "normal"),
    ("Made pasta from scratch today and it turned out amazing.", "normal"),
    ("Spent the afternoon playing video games. Pretty relaxing day.", "normal"),
    ("The weather is finally getting warmer. Love spring!", "normal"),
    ("My dog learned a new trick today. So proud of the little guy.", "normal"),
    ("Going to a concert this weekend. Can't wait!", "normal"),
    ("Just had a productive morning. Feeling good about this week.", "normal"),
    ("Called my mom today. It's always nice to hear her voice.", "normal"),
    # Em dash restored here as well.
    ("Started a new hobby — painting. Not great at it yet but it's fun.", "normal"),
    ("The coffee shop near my apartment has the best lattes.", "normal"),
    ("Finished a 5K run today. Personal best time!", "normal"),
    ("Movie night with friends was exactly what I needed.", "normal"),
    ("Learning to cook has been one of the best decisions I've made.", "normal"),
]
def load_emotion_dataset(max_samples: int = 10000) -> pd.DataFrame:
    """Fetch the dair-ai/emotion train split and relabel it with distress classes.

    Args:
        max_samples: Soft cap on the number of rows. When the mapped data is
            larger than this, every distress class is down-sampled to at most
            ``max_samples // 4`` rows using a fixed seed. A falsy value
            disables the cap.

    Returns:
        DataFrame with the columns ``text`` and ``distress_label``.
    """
    print("Loading dair-ai/emotion dataset...")
    frame = pd.DataFrame(load_dataset("dair-ai/emotion", split="train"))

    # Integer label id -> emotion name, then emotion name -> distress class.
    id_to_emotion = dict(
        enumerate(["sadness", "joy", "love", "anger", "fear", "surprise"])
    )
    frame["emotion"] = frame["label"].map(id_to_emotion)
    frame["distress_label"] = frame["emotion"].map(EMOTION_TO_DISTRESS)
    frame = frame.dropna(subset=["distress_label"])

    if max_samples and len(frame) > max_samples:
        per_class = max_samples // 4
        class_names = frame["distress_label"].unique()
        subsets = [frame[frame["distress_label"] == name] for name in class_names]
        # Seeded sampling keeps the subset reproducible between runs.
        frame = pd.concat(
            [
                subset.sample(min(len(subset), per_class), random_state=42)
                for subset in subsets
            ],
            ignore_index=True,
        )

    return frame[["text", "distress_label"]].reset_index(drop=True)
# Official GoEmotions label order (position == integer label id). Used only as a
# fallback when the loaded dataset does not expose its own label names.
_GO_EMOTIONS_LABELS = (
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral",
)


def load_go_emotions_dataset(max_samples: int = 10000) -> pd.DataFrame:
    """Load GoEmotions (simplified) and map to distress categories.

    Only examples carrying exactly one label are kept; multi-label examples
    are skipped. Each kept example's integer label id is translated to its
    emotion name and then to a distress class through ``GO_EMOTION_MAP``.

    Args:
        max_samples: Soft cap on the number of rows. When more rows than this
            survive the mapping, every distress class is down-sampled to at
            most ``max_samples // 4`` rows using a fixed seed. A falsy value
            disables the cap.

    Returns:
        DataFrame with the columns ``text`` and ``distress_label`` (possibly
        empty, but always with both columns).
    """
    print("Loading GoEmotions dataset...")
    ds = load_dataset("go_emotions", "simplified", split="train")

    # Resolve label ids through the dataset's own ClassLabel names rather than
    # the key order of GO_EMOTION_MAP, which is not guaranteed to match the
    # dataset's id order (a mismatch silently mislabels every example).
    try:
        label_names = list(ds.features["labels"].feature.names)
    except (AttributeError, KeyError, TypeError):
        label_names = list(_GO_EMOTIONS_LABELS)

    df = pd.DataFrame(ds)
    rows = []
    for text, labels in tqdm(
        zip(df["text"], df["labels"]), total=len(df), desc="Mapping GoEmotions"
    ):
        if len(labels) != 1:
            continue
        idx = labels[0]
        if not 0 <= idx < len(label_names):
            continue
        distress = GO_EMOTION_MAP.get(label_names[idx])
        if distress:
            rows.append({"text": text, "distress_label": distress})

    # Explicit columns keep the schema stable even when nothing was mapped.
    result = pd.DataFrame(rows, columns=["text", "distress_label"])

    if max_samples and len(result) > max_samples:
        per_class = max_samples // 4
        samples = []
        for label in result["distress_label"].unique():
            group = result[result["distress_label"] == label]
            samples.append(group.sample(min(len(group), per_class), random_state=42))
        result = pd.concat(samples, ignore_index=True)

    return result.reset_index(drop=True)
def load_curated_samples() -> pd.DataFrame:
    """Return the built-in CURATED_SAMPLES as a two-column DataFrame.

    Returns:
        DataFrame with the columns ``text`` and ``distress_label``, one row
        per curated (text, label) pair.
    """
    print("Loading curated mental health samples...")
    column_names = ["text", "distress_label"]
    curated_frame = pd.DataFrame(CURATED_SAMPLES, columns=column_names)
    return curated_frame
def load_all_datasets(
    max_emotion: int = 8000,
    max_go_emotions: int = 8000,
    preprocess: bool = True,
) -> Tuple[pd.DataFrame, dict]:
    """
    Load and merge all available datasets.

    The two HuggingFace datasets are loaded best-effort: if one of them fails,
    the failure is reported on stdout and loading continues with the remaining
    sources. The built-in curated samples are always included.

    Args:
        max_emotion: Sample cap passed to ``load_emotion_dataset``.
        max_go_emotions: Sample cap passed to ``load_go_emotions_dataset``.
        preprocess: When True, ``clean_text`` holds ``preprocess_text(text)``
            and rows whose cleaned text is 10 characters or shorter are
            dropped. When False, ``clean_text`` is a copy of the raw ``text``
            and no length filter is applied.

    Returns:
        df: Merged DataFrame with columns
            ['text', 'distress_label', 'source', 'clean_text', 'label_id']
        label_map: Dict mapping label names to integer IDs (labels sorted
            alphabetically, IDs assigned from 0)
    """
    dfs = []

    # 1-2. Optional HuggingFace sources: (display name, source tag, loader, cap)
    optional_sources = (
        ("Emotion", "emotion", load_emotion_dataset, max_emotion),
        ("GoEmotions", "go_emotions", load_go_emotions_dataset, max_go_emotions),
    )
    for display_name, source_tag, loader, cap in optional_sources:
        try:
            part = loader(max_samples=cap)
            part["source"] = source_tag
            dfs.append(part)
            print(f" ✓ {display_name} dataset: {len(part)} samples")
        except Exception as e:
            # Deliberately broad: a missing or unreachable dataset must not
            # abort the whole load, so report it and move on.
            print(f" ✗ {display_name} dataset failed: {e}")

    # 3. Curated samples
    df_curated = load_curated_samples()
    df_curated["source"] = "curated"
    dfs.append(df_curated)
    print(f" ✓ Curated samples: {len(df_curated)} samples")

    # Merge all; for a text seen in several sources the first occurrence wins.
    df = pd.concat(dfs, ignore_index=True)
    df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)

    # Preprocess
    if preprocess:
        print("Preprocessing text...")
        df["clean_text"] = df["text"].apply(preprocess_text)
        df = df[df["clean_text"].str.len() > 10].reset_index(drop=True)
    else:
        # Keep the documented schema: callers always get a 'clean_text' column.
        df["clean_text"] = df["text"]

    # Label encoding. Rows without a label are dropped first so that sorting
    # the unique labels never has to compare NaN with strings.
    df = df.dropna(subset=["distress_label"]).reset_index(drop=True)
    label_map = {
        label: idx for idx, label in enumerate(sorted(df["distress_label"].unique()))
    }
    df["label_id"] = df["distress_label"].map(label_map)

    print(f"\n📊 Final dataset: {len(df)} samples")
    print(f" Label distribution:\n{df['distress_label'].value_counts().to_string()}")
    print(f" Label map: {label_map}")
    return df, label_map
if __name__ == "__main__":
    # Smoke run: load a reduced dataset and print its first ten rows.
    df, label_map = load_all_datasets(max_emotion=2000, max_go_emotions=2000)
    preview = df.head(10).to_string()
    print(f"\nSample data:\n{preview}")