Spaces:

imanerd
/

l2-demo

Sleeping

File size: 5,458 Bytes

98b332e

"""Build a 150-row dev set, stratified across the five intent classes.

Outputs two files:

  dev_unlabeled.csv   text-only sheet for hand-labeling
  dev_gold_auto.csv   heuristic auto-gold so train.py has something to evaluate
                      against immediately. Replace by writing dev_gold.csv with
                      the same rows + a 'label' column; train.py prefers
                      dev_gold.csv when it exists.

The auto-gold uses tighter rules than weak_labels.py so it isn't a perfect
mirror of the training labels. Where the tight rules and the weak label
disagree, the row is marked needs_review=True.
"""

from __future__ import annotations

import csv
import re
from pathlib import Path

import pandas as pd

# Project root: the third ancestor of this file's resolved path, i.e. two
# directories above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[2]
# Folder that holds the weak-label input and receives the dev-set outputs.
ART = ROOT.joinpath("artifacts", "part2")
# Weakly labeled rows the dev set is sampled from.
WEAK_CSV = ART.joinpath("labeled.csv")

# Maximum number of rows drawn for each intent class.
PER_CLASS = 30
# Fixed seed so the shuffle (and therefore the sample) is reproducible.
SEED = 42

# tighter rules used to produce auto-gold. each rule needs at least two signals
# to fire, which makes it stricter than the single-keyword weak rules.

FUTURE_TIME = re.compile(
    r"\b(tomorrow|tonight|monday|tuesday|wednesday|thursday|friday|saturday|sunday|"
    r"next (week|month|year|monday|tuesday|wednesday|thursday|friday|saturday|sunday)|"
    r"at \d{1,2}(:\d{2})?\s*(am|pm)?|in the morning|in the evening|later today)\b",
    re.I,
)
REMINDER_KW = re.compile(r"\b(remind me|don'?t forget|reminder|schedule)\b", re.I)

EMO_STRONG = re.compile(
    r"\b(sad|anxious|anxiety|stressed|depress(ed|ing)?|lonely|overwhelmed|"
    r"crying|miserable|heart ?broken|burn(ed|t) out|exhausted|hurt|grieving)\b",
    re.I,
)
EMO_RECEIVE = re.compile(r"\b(sorry to hear|that sucks|hugs|sending love|i'm here for you|here for you)\b", re.I)

IMPER_VERBS = {
    "send", "call", "book", "buy", "finish", "email", "text", "pick",
    "grab", "remind", "schedule", "cancel", "pay", "submit", "order",
    "fix", "review", "check", "update",
}
NEED_KW = re.compile(r"\b(i need to|i have to|gotta|got to|i must|i should)\b", re.I)

SMALL_KW = re.compile(
    r"\b(lol|haha+|how are you|how's it going|good morning|good night|"
    r"hey there|hi there|hello|see you|take care|bye|thanks|thank you|"
    r"no problem|you're welcome|what's up|nice to meet you)\b",
    re.I,
)


def auto_gold(text: str) -> tuple[str, str]:
    """Assign a heuristic gold label to one utterance.

    Tighter than weak_labels.label_one. Rules are tried in a fixed order and
    the first one that matches wins: reminder, emotional-support, action-item,
    small-talk, and finally "unknown".

    Args:
        text: The utterance to label. Anything that is not a string (``None``,
            or the float NaN pandas produces for a blank CSV cell) is treated
            like empty text instead of raising.

    Returns:
        A ``(label, reasoning)`` pair, where ``reasoning`` is a short
        description of the rule that fired.
    """
    # Blank CSV cells arrive as float NaN, which has no .strip(); treat any
    # non-string as empty so a single bad row cannot crash the whole build.
    if not isinstance(text, str):
        return "unknown", "empty"

    t = text.strip()
    if not t:
        return "unknown", "empty"

    # reminder: an explicit "remind me" / "don't forget" is unambiguous on its
    # own; any other reminder keyword also needs a future-time anchor
    if re.search(r"\b(remind me|don'?t forget)\b", t, re.I):
        return "reminder", "explicit remind-me phrase"
    if REMINDER_KW.search(t) and FUTURE_TIME.search(t):
        return "reminder", "reminder kw + future time"

    # emotional support: either a strong feeling word together with a
    # first-person pronoun, or a consoling phrase aimed at the other person
    if EMO_STRONG.search(t) and re.search(r"\b(i|me|my|i'm|im)\b", t, re.I):
        return "emotional-support", "first-person emotion word"
    if EMO_RECEIVE.search(t):
        return "emotional-support", "consoling phrase"

    # action-item: sentence-initial imperative verb, not a question.
    # `first` is the lower-cased text up to the first whitespace/punctuation run.
    first = re.split(r"[\s,.!?]+", t.lower(), maxsplit=1)[0]
    if first in IMPER_VERBS and not t.endswith("?"):
        return "action-item", f"imperative verb '{first}'"
    # ...or a first-person obligation phrase combined with a deadline word
    if NEED_KW.search(t) and re.search(r"\b(today|tomorrow|tonight|before|by)\b", t, re.I):
        return "action-item", "need-to + deadline"

    # small-talk: explicit greeting/closing/filler tokens, or very short ack
    if SMALL_KW.search(t):
        return "small-talk", "greeting/closing keyword"
    if len(t.split()) <= 3:
        return "small-talk", "very short utterance"

    return "unknown", "no rule fired"


def main():
    """Sample the dev set from the weak-label CSV and write both output sheets.

    Reads WEAK_CSV, shuffles it with a fixed seed, keeps up to PER_CLASS rows
    for each intent class, writes dev_unlabeled.csv and dev_gold_auto.csv into
    ART, and prints a short summary of the result.
    """
    weak = pd.read_csv(WEAK_CSV)

    # reproducible shuffle: permute the row positions with a fixed seed so the
    # head(PER_CLASS) below takes a random but repeatable slice of each class
    order = pd.Series(range(len(weak))).sample(frac=1, random_state=SEED).tolist()
    shuffled = weak.iloc[order].reset_index(drop=True)

    intents = ["reminder", "emotional-support", "action-item", "small-talk", "unknown"]
    per_intent = [
        shuffled[shuffled["label"] == intent].head(PER_CLASS) for intent in intents
    ]
    sample = pd.concat(per_intent, ignore_index=True)

    unlabeled_path = ART / "dev_unlabeled.csv"
    gold_path = ART / "dev_gold_auto.csv"

    # sheet for human annotators: the text plus an empty label column to fill in
    unlabeled = sample[["text"]].copy()
    unlabeled["label"] = ""
    unlabeled.to_csv(unlabeled_path, index=False, quoting=csv.QUOTE_MINIMAL)

    # provisional gold: heuristic label next to the weak label, flagged for
    # review wherever the two disagree
    records = []
    for _, row in sample.iterrows():
        auto_label, reason = auto_gold(row["text"])
        disagrees = auto_label != row["label"]
        records.append((row["text"], auto_label, row["label"], reason, disagrees))

    gold = pd.DataFrame(
        records,
        columns=["text", "label", "weak_label", "auto_gold_reason", "needs_review"],
    )
    gold.to_csv(gold_path, index=False, quoting=csv.QUOTE_MINIMAL)

    print(f"sampled {len(sample)} rows ({PER_CLASS} per class)")
    print(f"wrote {unlabeled_path} (hand-label here)")
    print(f"wrote {gold_path} (auto-gold, used by train.py)")
    print()

    agrees = ~gold["needs_review"]
    print(
        f"auto-gold vs weak-label agreement: "
        f"{agrees.sum()}/{len(gold)} "
        f"({100 * agrees.mean():.1f}%)"
    )
    print()

    print("auto-gold distribution:")
    for label, count in gold["label"].value_counts().items():
        print(f"  {label:<20s} {count}")


# Build the dev set only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()