File size: 5,458 Bytes
98b332e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""Build a 150-row dev set, stratified across the five intent classes.

Outputs two files:

  dev_unlabeled.csv   text-only sheet for hand-labeling
  dev_gold_auto.csv   heuristic auto-gold so train.py has something to evaluate
                      against immediately. Replace by writing dev_gold.csv with
                      the same rows + a 'label' column; train.py prefers
                      dev_gold.csv when it exists.

The auto-gold uses tighter rules than weak_labels.py so it isn't a perfect
mirror of the training labels. Where the tight rules and the weak label
disagree, the row is marked needs_review=True.
"""

from __future__ import annotations

import csv
import re
from pathlib import Path

import pandas as pd

# Directory two levels above the folder that contains this file
# (presumably the repository root -- confirm against the project layout).
ROOT = Path(__file__).resolve().parents[2]
# Folder that holds the input CSV and receives both output CSVs.
ART = ROOT / "artifacts" / "part2"
# Input sheet read by main(); main() uses its 'text' and 'label' columns.
WEAK_CSV = ART / "labeled.csv"

# Upper bound on the number of rows main() takes for each label value.
PER_CLASS = 30
# random_state passed to the shuffle in main() so the sample is reproducible.
SEED = 42

# Tighter rules used to produce auto-gold. Most labels are only assigned when
# two signals agree (e.g. reminder keyword + future-time anchor, emotion word +
# first-person pronoun, need-to phrase + deadline); a few phrases that are
# unambiguous on their own are allowed to fire alone. See auto_gold() for how
# the patterns are combined.

# Future-time anchors: relative days, weekday names, "next <unit>", clock
# times such as "at 5", "at 5:30 pm", and parts of the day.
FUTURE_TIME = re.compile(
    r"\b(tomorrow|tonight|monday|tuesday|wednesday|thursday|friday|saturday|sunday|"
    r"next (week|month|year|monday|tuesday|wednesday|thursday|friday|saturday|sunday)|"
    r"at \d{1,2}(:\d{2})?\s*(am|pm)?|in the morning|in the evening|later today)\b",
    re.I,
)
# Explicit reminder vocabulary ("dont forget" without the apostrophe also matches).
REMINDER_KW = re.compile(r"\b(remind me|don'?t forget|reminder|schedule)\b", re.I)

# Strong feeling words; auto_gold() additionally requires a first-person pronoun.
EMO_STRONG = re.compile(
    r"\b(sad|anxious|anxiety|stressed|depress(ed|ing)?|lonely|overwhelmed|"
    r"crying|miserable|heart ?broken|burn(ed|t) out|exhausted|hurt|grieving)\b",
    re.I,
)
# Consoling phrases addressed to the other person.
EMO_RECEIVE = re.compile(r"\b(sorry to hear|that sucks|hugs|sending love|i'm here for you|here for you)\b", re.I)

# Verbs that mark an action item when they are the first token of the text.
IMPER_VERBS = {
    "send", "call", "book", "buy", "finish", "email", "text", "pick",
    "grab", "remind", "schedule", "cancel", "pay", "submit", "order",
    "fix", "review", "check", "update",
}
# Obligation phrases; auto_gold() pairs them with a deadline word.
NEED_KW = re.compile(r"\b(i need to|i have to|gotta|got to|i must|i should)\b", re.I)

# Greetings, closings and filler. Laughter is matched as two or more "ha"
# syllables plus optional trailing "a"s, so "haha", "hahaha" and "hahaaa" all
# count; the earlier `haha+` form only stretched the final "a" and therefore
# missed "hahaha".
SMALL_KW = re.compile(
    r"\b(lol|(?:ha){2,}a*|how are you|how's it going|good morning|good night|"
    r"hey there|hi there|hello|see you|take care|bye|thanks|thank you|"
    r"no problem|you're welcome|what's up|nice to meet you)\b",
    re.I,
)


def auto_gold(text: str) -> tuple[str, str]:
    """Assign a heuristic gold label to a single utterance.

    Intended to be tighter than weak_labels.label_one. Rules are tried in a
    fixed order (reminder, emotional-support, action-item, small-talk) and the
    first one that fires wins; anything left over is "unknown".

    Args:
        text: The raw utterance. Non-string values -- for example the float
            NaN that pandas produces for an empty CSV cell, or None -- are
            treated like an empty string instead of raising.

    Returns:
        A ``(label, reasoning)`` pair, where ``reasoning`` is a short
        description of the rule that fired.
    """
    # Rows read from CSV may carry NaN/None rather than a string; calling
    # .strip() on those would raise AttributeError and abort the whole run.
    if not isinstance(text, str):
        return "unknown", "empty"

    t = text.strip()
    if not t:
        return "unknown", "empty"

    # reminder: need an explicit reminder keyword AND a future-time anchor,
    # OR an explicit "remind me" / "don't forget" which is unambiguous on its own
    if re.search(r"\b(remind me|don'?t forget)\b", t, re.I):
        return "reminder", "explicit remind-me phrase"
    if REMINDER_KW.search(t) and FUTURE_TIME.search(t):
        return "reminder", "reminder kw + future time"

    # emotional support: either a strong feeling word together with a
    # first-person pronoun, or a consoling phrase aimed at the other person
    if EMO_STRONG.search(t) and re.search(r"\b(i|me|my|i'm|im)\b", t, re.I):
        return "emotional-support", "first-person emotion word"
    if EMO_RECEIVE.search(t):
        return "emotional-support", "consoling phrase"

    # action-item: sentence-initial imperative verb, not a question.
    # The first token is whatever precedes the first whitespace/punctuation run.
    first = re.split(r"[\s,.!?]+", t.lower(), maxsplit=1)[0]
    if first in IMPER_VERBS and not t.endswith("?"):
        return "action-item", f"imperative verb '{first}'"
    if NEED_KW.search(t) and re.search(r"\b(today|tomorrow|tonight|before|by)\b", t, re.I):
        return "action-item", "need-to + deadline"

    # small-talk: explicit greeting/closing/filler tokens, or very short ack
    if SMALL_KW.search(t):
        return "small-talk", "greeting/closing keyword"
    if len(t.split()) <= 3:
        return "small-talk", "very short utterance"

    return "unknown", "no rule fired"


def main():
    """Sample the dev set from the weak-label CSV and write both output sheets.

    Reads WEAK_CSV, shuffles it with a fixed seed, keeps up to PER_CLASS rows
    for each of the five labels, then writes:

      * dev_unlabeled.csv -- the sampled text with an empty 'label' column
      * dev_gold_auto.csv -- the auto_gold() label next to the weak label,
        the rule that fired, and a needs_review flag set where the two differ

    A short summary (agreement rate and auto-gold label counts) is printed.
    """
    labeled = pd.read_csv(WEAK_CSV)

    # Seeded permutation of the row positions, applied to the frame.
    order = pd.Series(range(len(labeled))).sample(frac=1, random_state=SEED).tolist()
    shuffled = labeled.iloc[order].reset_index(drop=True)

    # First PER_CLASS shuffled rows of every class, concatenated in class order.
    classes = ("reminder", "emotional-support", "action-item", "small-talk", "unknown")
    sample = pd.concat(
        [shuffled[shuffled["label"] == name].head(PER_CLASS) for name in classes],
        ignore_index=True,
    )

    # Sheet for human annotators: text plus a blank label column to fill in.
    unlabeled_path = ART / "dev_unlabeled.csv"
    sample[["text"]].assign(label="").to_csv(
        unlabeled_path, index=False, quoting=csv.QUOTE_MINIMAL
    )

    # Provisional gold labels; flag every row where they differ from the weak label.
    gold_rows = []
    for text, weak_label in zip(sample["text"], sample["label"]):
        auto_label, reason = auto_gold(text)
        gold_rows.append((text, auto_label, weak_label, reason, auto_label != weak_label))

    gold = pd.DataFrame(
        gold_rows,
        columns=["text", "label", "weak_label", "auto_gold_reason", "needs_review"],
    )
    gold_path = ART / "dev_gold_auto.csv"
    gold.to_csv(gold_path, index=False, quoting=csv.QUOTE_MINIMAL)

    agrees = ~gold["needs_review"]
    print(f"sampled {len(sample)} rows ({PER_CLASS} per class)")
    print(f"wrote {unlabeled_path} (hand-label here)")
    print(f"wrote {gold_path} (auto-gold, used by train.py)")
    print()
    print(
        f"auto-gold vs weak-label agreement: "
        f"{agrees.sum()}/{len(gold)} "
        f"({100 * agrees.mean():.1f}%)"
    )
    print()
    print("auto-gold distribution:")
    for name, count in gold["label"].value_counts().items():
        print(f"  {name:<20s} {count}")


# Build the dev-set files only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()