Spaces:

MukulRay
/

EmpathRAG

Running

Mukul Rayana

Day 1: data pipeline, session tracker, query router, adversarial probes, Colab training notebooks

bc3ba9e about 1 month ago

1.64 kB

	import re
	import pandas as pd
	from datasets import Dataset

	# Emotion label mapping: the 28 GoEmotions labels (27 emotions + "neutral")
	# collapsed to 5 coarse classes. Each value is an index into LABEL_NAMES.
	LABEL_MAP = {
	    # 0 - distress
	    "grief": 0, "remorse": 0, "fear": 0, "sadness": 0,
	    # 1 - anxiety
	    "nervousness": 1, "confusion": 1, "embarrassment": 1,
	    # 2 - frustration
	    # "disapproval" was previously absent from the map; it is grouped here
	    # with the other negative-appraisal labels.
	    "anger": 2, "annoyance": 2, "disappointment": 2, "disapproval": 2,
	    "disgust": 2,
	    # 3 - neutral
	    "neutral": 3,
	    # 4 - hopeful
	    "optimism": 4, "relief": 4, "gratitude": 4, "joy": 4,
	    "love": 4, "admiration": 4, "amusement": 4, "approval": 4,
	    "caring": 4, "curiosity": 4, "desire": 4, "excitement": 4,
	    "pride": 4, "realization": 4, "surprise": 4,
	}
	# Coarse class names; the list position is the class id used in LABEL_MAP.
	LABEL_NAMES = ["distress", "anxiety", "frustration", "neutral", "hopeful"]


	def clean_text(text: str) -> str:
	    """Remove Reddit artefacts and normalise whitespace.

	    Strips, in order: user/subreddit mentions (``u/name``, ``r/name``, with
	    an optional leading slash), anything starting with ``http`` up to the
	    next whitespace, the ``[deleted]`` / ``[removed]`` placeholders, and
	    runs of non-ASCII characters (replaced by a single space). Whitespace
	    is then collapsed to single spaces and trimmed.

	    Args:
	        text: Raw post or comment text.

	    Returns:
	        The cleaned, ASCII-only string; may be empty.
	    """
	    # The lookbehind stops the pattern from firing inside ordinary words
	    # such as "either/or", where a bare r"r/\w+" would eat "r/or".
	    text = re.sub(r"(?<!\w)/?[ur]/\w+", "", text)
	    text = re.sub(r"http\S+", "", text)
	    # Unescaped "|" is alternation: either placeholder on its own is removed.
	    text = re.sub(r"\[deleted\]|\[removed\]", "", text)
	    text = re.sub(r"[^\x00-\x7F]+", " ", text)
	    return re.sub(r"\s+", " ", text).strip()


	def token_length(text: str, tokenizer) -> int:
	    """Count the tokens ``tokenizer`` produces for ``text``.

	    The text is encoded with ``add_special_tokens=False``, so only tokens
	    derived from the text itself are counted.
	    """
	    token_ids = tokenizer.encode(text, add_special_tokens=False)
	    return len(token_ids)


	def filter_by_length(texts, tokenizer, min_tok=20, max_tok=512):
	    """Keep only the texts whose token count is within the given bounds.

	    Both bounds are inclusive and the input order is preserved. Token
	    counts come from ``token_length``.
	    """
	    kept = []
	    for candidate in texts:
	        n_tokens = token_length(candidate, tokenizer)
	        if min_tok <= n_tokens <= max_tok:
	            kept.append(candidate)
	    return kept


	def map_goemotions_label(label_ids: list, id2label: dict) -> int:
	    """Map a list of fine-grained label ids to one coarse class id.

	    Ids are resolved to names through ``id2label`` in the order given; the
	    first name present in ``LABEL_MAP`` decides the result. If no name
	    matches (including an empty ``label_ids``), 3 (neutral) is returned.
	    """
	    # Lazy generators: ids after the first match are never looked up.
	    names = (id2label[lid] for lid in label_ids)
	    matches = (LABEL_MAP[name] for name in names if name in LABEL_MAP)
	    return next(matches, 3)