EmpathRAG / src /data /preprocess.py
Mukul Rayana
Day 1: data pipeline, session tracker, query router, adversarial probes, Colab training notebooks
bc3ba9e
import re
import pandas as pd
from datasets import Dataset
# Emotion label mapping: 27 GoEmotions labels collapsed to 5 coarse classes
LABEL_MAP = {
# Distress
"grief": 0, "remorse": 0, "fear": 0, "sadness": 0,
# Anxiety
"nervousness": 1, "confusion": 1, "embarrassment": 1,
# Frustration
"anger": 2, "annoyance": 2, "disappointment": 2, "disgust": 2,
# Neutral
"neutral": 3,
# Hopeful
"optimism": 4, "relief": 4, "gratitude": 4, "joy": 4,
"love": 4, "admiration": 4, "amusement": 4, "approval": 4,
"caring": 4, "curiosity": 4, "desire": 4, "excitement": 4,
"pride": 4, "realization": 4, "surprise": 4,
}
LABEL_NAMES = ["distress", "anxiety", "frustration", "neutral", "hopeful"]
def clean_text(text: str) -> str:
"""Remove Reddit artefacts and normalise whitespace."""
text = re.sub(r"u/\w+", "", text)
text = re.sub(r"r/\w+", "", text)
text = re.sub(r"http\S+", "", text)
text = re.sub(r"\[deleted\]|\[removed\]", "", text)
text = re.sub(r"[^\x00-\x7F]+", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def token_length(text: str, tokenizer) -> int:
return len(tokenizer.encode(text, add_special_tokens=False))
def filter_by_length(texts, tokenizer, min_tok=20, max_tok=512):
return [t for t in texts if min_tok <= token_length(t, tokenizer) <= max_tok]
def map_goemotions_label(label_ids: list, id2label: dict) -> int:
"""Return first matched coarse label, else neutral (3)."""
for lid in label_ids:
name = id2label[lid]
if name in LABEL_MAP:
return LABEL_MAP[name]
return 3