Mukul Rayana
Day 1: data pipeline, session tracker, query router, adversarial probes, Colab training notebooks
bc3ba9e | import re | |
| import pandas as pd | |
| from datasets import Dataset | |
# Emotion label mapping: the GoEmotions label set (27 emotions plus "neutral")
# collapsed to 5 coarse classes. Each value is an index into LABEL_NAMES.
LABEL_MAP = {
    # Distress
    "grief": 0, "remorse": 0, "fear": 0, "sadness": 0,
    # Anxiety
    "nervousness": 1, "confusion": 1, "embarrassment": 1,
    # Frustration
    # "disapproval" is listed explicitly so it is not silently treated as
    # neutral by lookups that default unknown names to class 3.
    "anger": 2, "annoyance": 2, "disappointment": 2, "disapproval": 2,
    "disgust": 2,
    # Neutral
    "neutral": 3,
    # Hopeful
    "optimism": 4, "relief": 4, "gratitude": 4, "joy": 4,
    "love": 4, "admiration": 4, "amusement": 4, "approval": 4,
    "caring": 4, "curiosity": 4, "desire": 4, "excitement": 4,
    "pride": 4, "realization": 4, "surprise": 4,
}

# Human-readable names of the coarse classes, ordered by class index.
LABEL_NAMES = ["distress", "anxiety", "frustration", "neutral", "hopeful"]
# Compiled once at import time; clean_text is expected to run per example.
# The negative lookbehind stops "u/" / "r/" inside ordinary words or paths
# ("either/or", "her/his") from being mistaken for a mention, and the optional
# leading slash removes the "/u/name" and "/r/name" spellings in full.
_USER_MENTION_RE = re.compile(r"(?<![\w/])/?u/[\w-]+")
_SUBREDDIT_MENTION_RE = re.compile(r"(?<![\w/])/?r/\w+")
_URL_RE = re.compile(r"http\S+")
_PLACEHOLDER_RE = re.compile(r"\[deleted\]|\[removed\]")
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Remove Reddit artefacts and normalise whitespace.

    Steps, in order: drop user mentions (``u/name``, ``/u/name``), subreddit
    mentions (``r/name``, ``/r/name``), anything starting with ``http`` up to
    the next whitespace, and the ``[deleted]`` / ``[removed]`` placeholders;
    replace each run of non-ASCII characters with a single space; finally
    collapse whitespace runs to one space and strip both ends.

    Args:
        text: Raw comment text.

    Returns:
        The cleaned text; may be the empty string.
    """
    text = _USER_MENTION_RE.sub("", text)
    text = _SUBREDDIT_MENTION_RE.sub("", text)
    text = _URL_RE.sub("", text)
    text = _PLACEHOLDER_RE.sub("", text)
    # Non-ASCII runs become a space (not "") so adjacent words stay separated.
    text = _NON_ASCII_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()
def token_length(text: str, tokenizer) -> int:
    """Return how many tokens ``tokenizer.encode`` produces for *text*.

    The tokenizer is called with ``add_special_tokens=False`` (presumably so
    that only tokens from the text itself are counted).
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)
def filter_by_length(texts, tokenizer, min_tok=20, max_tok=512):
    """Return the texts whose token count lies in ``[min_tok, max_tok]``.

    Token counts come from ``token_length``; both bounds are inclusive and
    the input order is preserved.
    """
    kept = []
    for candidate in texts:
        n_tokens = token_length(candidate, tokenizer)
        # Skip anything shorter than the minimum or longer than the maximum.
        if n_tokens < min_tok or n_tokens > max_tok:
            continue
        kept.append(candidate)
    return kept
def map_goemotions_label(label_ids: list, id2label: dict) -> int:
    """Map fine-grained label ids to a single coarse class index.

    Ids are resolved to names through *id2label* in the given order; the
    coarse class of the first name present in ``LABEL_MAP`` is returned.
    If no name matches (or *label_ids* is empty) the result is 3 (neutral).
    An id missing from *id2label* raises the mapping's lookup error.
    """
    # Both generators are lazy, so lookups stop at the first match.
    names = (id2label[label_id] for label_id in label_ids)
    coarse = (LABEL_MAP[name] for name in names if name in LABEL_MAP)
    return next(coarse, 3)