File size: 5,458 Bytes
"""Build a 150-row dev set, stratified across the five intent classes.

Outputs two files:
  dev_unlabeled.csv   text-only sheet (plus an empty 'label' column) for
                      hand-labeling
  dev_gold_auto.csv   heuristic auto-gold so train.py has something to evaluate
                      against immediately. Replace by writing dev_gold.csv with
                      the same rows + a 'label' column; train.py prefers
                      dev_gold.csv when it exists.

The auto-gold uses tighter rules than weak_labels.py so it isn't a perfect
mirror of the training labels. Where the tight rules and the weak label
disagree, the row is marked needs_review=True.
"""
from __future__ import annotations
import csv
import re
from pathlib import Path
import pandas as pd
# Base directory: the grandparent of the directory that contains this file
# (parents[0] is the containing directory itself).
ROOT = Path(__file__).resolve().parents[2]
# Directory main() reads its input from and writes both dev CSVs into.
ART = ROOT / "artifacts" / "part2"
# Weakly-labeled rows that main() samples the dev set from
# (presumably written by weak_labels.py -- confirm against that script).
WEAK_CSV = ART / "labeled.csv"
# Upper bound on the rows main() takes for each intent class.
PER_CLASS = 30
# random_state for the shuffle in main(), so reruns pick the same rows.
SEED = 42
# Tighter rules used to produce auto-gold. Where a keyword is ambiguous on its
# own (e.g. "schedule", "i need to", an emotion word) the rule also requires a
# second signal, which makes it stricter than the single-keyword weak rules.
FUTURE_TIME = re.compile(
    r"\b(tomorrow|tonight|monday|tuesday|wednesday|thursday|friday|saturday|sunday|"
    r"next (week|month|year|monday|tuesday|wednesday|thursday|friday|saturday|sunday)|"
    r"at \d{1,2}(:\d{2})?\s*(am|pm)?|in the morning|in the evening|later today)\b",
    re.I,
)
REMINDER_KW = re.compile(r"\b(remind me|don'?t forget|reminder|schedule)\b", re.I)
# Subset of REMINDER_KW that is unambiguous without a time anchor.
_EXPLICIT_REMINDER = re.compile(r"\b(remind me|don'?t forget)\b", re.I)
EMO_STRONG = re.compile(
    r"\b(sad|anxious|anxiety|stressed|depress(ed|ing)?|lonely|overwhelmed|"
    r"crying|miserable|heart ?broken|burn(ed|t) out|exhausted|hurt|grieving)\b",
    re.I,
)
EMO_RECEIVE = re.compile(
    r"\b(sorry to hear|that sucks|hugs|sending love|i'm here for you|here for you)\b",
    re.I,
)
# First-person marker required alongside an EMO_STRONG hit.
_FIRST_PERSON = re.compile(r"\b(i|me|my|i'm|im)\b", re.I)
IMPER_VERBS = {
    "send", "call", "book", "buy", "finish", "email", "text", "pick",
    "grab", "remind", "schedule", "cancel", "pay", "submit", "order",
    "fix", "review", "check", "update",
}
NEED_KW = re.compile(r"\b(i need to|i have to|gotta|got to|i must|i should)\b", re.I)
# Deadline cue required alongside a NEED_KW hit.
_DEADLINE = re.compile(r"\b(today|tomorrow|tonight|before|by)\b", re.I)
SMALL_KW = re.compile(
    r"\b(lol|haha+|how are you|how's it going|good morning|good night|"
    r"hey there|hi there|hello|see you|take care|bye|thanks|thank you|"
    r"no problem|you're welcome|what's up|nice to meet you)\b",
    re.I,
)
# Separators used to isolate the first token of an utterance.
_TOKEN_SPLIT = re.compile(r"[\s,.!?]+")


def auto_gold(text: str) -> tuple[str, str]:
    """Return (label, reasoning). Tighter than weak_labels.label_one.

    Rules are tried in a fixed order and the first one that fires wins:
    reminder, emotional-support, action-item, small-talk, then "unknown".

    Args:
        text: The utterance to label. Anything that is not a ``str`` (for
            example the NaN pandas produces for a blank CSV cell, or None)
            is treated like an empty utterance instead of raising.

    Returns:
        A ``(label, reasoning)`` pair. ``label`` is one of "reminder",
        "emotional-support", "action-item", "small-talk" or "unknown";
        ``reasoning`` is a short description of the rule that fired.
    """
    # Blank CSV cells arrive as float NaN, which has no .strip().
    if not isinstance(text, str):
        return "unknown", "empty"
    t = text.strip()
    if not t:
        return "unknown", "empty"

    # reminder: an explicit "remind me" / "don't forget" is unambiguous on its
    # own; any other reminder keyword also needs a future-time anchor
    if _EXPLICIT_REMINDER.search(t):
        return "reminder", "explicit remind-me phrase"
    if REMINDER_KW.search(t) and FUTURE_TIME.search(t):
        return "reminder", "reminder kw + future time"

    # emotional support: either a strong feeling word together with a
    # first-person marker, or a consoling phrase aimed at the other person
    if EMO_STRONG.search(t) and _FIRST_PERSON.search(t):
        return "emotional-support", "first-person emotion word"
    if EMO_RECEIVE.search(t):
        return "emotional-support", "consoling phrase"

    # action-item: sentence-initial imperative verb, not a question.
    # If the text starts with a separator the first token is "" and this
    # rule simply does not fire.
    first = _TOKEN_SPLIT.split(t.lower(), maxsplit=1)[0]
    if first in IMPER_VERBS and not t.endswith("?"):
        return "action-item", f"imperative verb '{first}'"
    if NEED_KW.search(t) and _DEADLINE.search(t):
        return "action-item", "need-to + deadline"

    # small-talk: explicit greeting/closing/filler tokens, or very short ack
    if SMALL_KW.search(t):
        return "small-talk", "greeting/closing keyword"
    if len(t.split()) <= 3:
        return "small-talk", "very short utterance"

    return "unknown", "no rule fired"
def main():
    """Sample the stratified dev set and write both dev CSVs into ART.

    Reads WEAK_CSV (needs 'text' and 'label' columns), shuffles it with SEED,
    takes up to PER_CLASS rows per intent class, then writes:

    * dev_unlabeled.csv -- the sampled text with an empty 'label' column
    * dev_gold_auto.csv -- text, auto_gold() label, the weak label, the rule
      that fired, and a needs_review flag set where the two labels disagree

    A short summary (row count, agreement rate, auto-gold label distribution)
    is printed to stdout.
    """
    df = pd.read_csv(WEAK_CSV)
    # Full shuffle with a fixed seed so reruns pick the same rows.
    df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
    # Blank text cells load as NaN; they cannot be hand-labeled or auto-labeled,
    # so skip them. Done after the shuffle so the row order is unaffected.
    df = df[df["text"].notna()]

    picks = []
    for cls in ["reminder", "emotional-support", "action-item", "small-talk", "unknown"]:
        sub = df[df["label"] == cls].head(PER_CLASS)
        if len(sub) < PER_CLASS:
            # head() silently returns fewer rows; make the shortfall visible
            print(f"warning: only {len(sub)} rows available for class '{cls}' "
                  f"(wanted {PER_CLASS})")
        picks.append(sub)
    sample = pd.concat(picks, ignore_index=True)

    # unlabeled sheet for humans
    unlabeled = sample[["text"]].copy()
    unlabeled["label"] = ""  # to be filled in
    unlabeled.to_csv(ART / "dev_unlabeled.csv", index=False, quoting=csv.QUOTE_MINIMAL)

    # auto-gold for now
    gold_rows = []
    for text, weak in zip(sample["text"], sample["label"]):
        gl, why = auto_gold(text)
        gold_rows.append((text, gl, weak, why, gl != weak))
    gold = pd.DataFrame(
        gold_rows,
        columns=["text", "label", "weak_label", "auto_gold_reason", "needs_review"],
    )
    # Keep the flag boolean even when there are no rows, so `~` below is valid.
    gold["needs_review"] = gold["needs_review"].astype(bool)
    gold.to_csv(ART / "dev_gold_auto.csv", index=False, quoting=csv.QUOTE_MINIMAL)

    print(f"sampled {len(sample)} rows (up to {PER_CLASS} per class)")
    print(f"wrote {ART / 'dev_unlabeled.csv'} (hand-label here)")
    print(f"wrote {ART / 'dev_gold_auto.csv'} (auto-gold, used by train.py)")
    print()
    agree = int((~gold["needs_review"]).sum())
    total = len(gold)
    # Avoid printing NaN when nothing was sampled.
    pct = 100 * (agree / total) if total else 0.0
    print(f"auto-gold vs weak-label agreement: "
          f"{agree}/{total} "
          f"({pct:.1f}%)")
    print()
    print("auto-gold distribution:")
    for cls, n in gold["label"].value_counts().items():
        print(f" {cls:<20s} {n}")


if __name__ == "__main__":
    main()
|