Spaces:

imanerd
/

l2-demo

Sleeping

App Files Files Community

l2-demo / src /part2_intent /dev_set.py

imanerd

initial deploy

98b332e 5 days ago

raw

history blame contribute delete

5.46 kB

	"""Build a 150-row dev set, stratified across the five intent classes.

	Outputs two files:

	    dev_unlabeled.csv   text-only sheet for hand-labeling
	    dev_gold_auto.csv   heuristic auto-gold so train.py has something to
	                        evaluate against immediately. Replace by writing
	                        dev_gold.csv with the same rows + a 'label' column;
	                        train.py prefers dev_gold.csv when it exists.

	The auto-gold uses tighter rules than weak_labels.py so it isn't a perfect
	mirror of the training labels. Where the tight rules and the weak label
	disagree, the row is marked needs_review=True.
	"""

	from __future__ import annotations

	import csv
	import re
	from pathlib import Path

	import pandas as pd

	# Third ancestor of this file's resolved path, i.e. two directory levels
	# above the folder that contains this module.
	ROOT: Path = Path(__file__).resolve().parents[2]
	# Directory that main() reads the weak labels from and writes both dev
	# sheets into.
	ART: Path = ROOT.joinpath("artifacts", "part2")
	# Input CSV; main() expects it to provide "text" and "label" columns.
	WEAK_CSV: Path = ART.joinpath("labeled.csv")

	# Maximum number of rows drawn for each intent class.
	PER_CLASS: int = 30
	# Fixed seed for the shuffle so repeated runs pick the same rows.
	SEED: int = 42

	# Building blocks for the tighter auto-gold rules applied by auto_gold().
	# Several of those rules require two of these signals at once, which makes
	# them stricter than single-keyword weak rules.
	#
	# NOTE: alternatives are separated by a bare `|`. An escaped `\|` matches a
	# literal pipe character, which would stop every pattern below from firing
	# on ordinary text.

	# A day name, a relative day, or a clock time that anchors the text in the
	# future ("tomorrow", "next friday", "at 5:30 pm", "later today").
	FUTURE_TIME = re.compile(
	    r"\b(tomorrow|tonight|monday|tuesday|wednesday|thursday|friday|saturday|sunday|"
	    r"next (week|month|year|monday|tuesday|wednesday|thursday|friday|saturday|sunday)|"
	    r"at \d{1,2}(:\d{2})?\s*(am|pm)?|in the morning|in the evening|later today)\b",
	    re.I,
	)
	# Explicit reminder vocabulary; the apostrophe in "don't" is optional.
	REMINDER_KW = re.compile(r"\b(remind me|don'?t forget|reminder|schedule)\b", re.I)

	# Strong negative-feeling words.
	EMO_STRONG = re.compile(
	    r"\b(sad|anxious|anxiety|stressed|depress(ed|ing)?|lonely|overwhelmed|"
	    r"crying|miserable|heart ?broken|burn(ed|t) out|exhausted|hurt|grieving)\b",
	    re.I,
	)
	# Consoling phrases addressed to someone else.
	EMO_RECEIVE = re.compile(
	    r"\b(sorry to hear|that sucks|hugs|sending love|i'm here for you|here for you)\b",
	    re.I,
	)

	# Verbs that mark an imperative when they are the first token of the text.
	IMPER_VERBS = {
	    "send", "call", "book", "buy", "finish", "email", "text", "pick",
	    "grab", "remind", "schedule", "cancel", "pay", "submit", "order",
	    "fix", "review", "check", "update",
	}
	# First-person obligation phrases.
	NEED_KW = re.compile(r"\b(i need to|i have to|gotta|got to|i must|i should)\b", re.I)

	# Greetings, closings, thanks and filler tokens.
	SMALL_KW = re.compile(
	    r"\b(lol|haha+|how are you|how's it going|good morning|good night|"
	    r"hey there|hi there|hello|see you|take care|bye|thanks|thank you|"
	    r"no problem|you're welcome|what's up|nice to meet you)\b",
	    re.I,
	)


	# Patterns used only by auto_gold(), compiled once at import time.
	# Alternatives use a bare `|`; an escaped `\|` would match a literal pipe.
	_EXPLICIT_REMIND = re.compile(r"\b(remind me|don'?t forget)\b", re.I)
	_FIRST_PERSON = re.compile(r"\b(i|me|my|i'm|im)\b", re.I)
	_DEADLINE = re.compile(r"\b(today|tomorrow|tonight|before|by)\b", re.I)


	def auto_gold(text: str) -> tuple[str, str]:
	    """Return ``(label, reasoning)`` for one utterance using the tight rules.

	    Rules are tried in a fixed order and the first one that fires wins:
	    reminder, emotional-support, action-item, small-talk. If none fires the
	    label is ``"unknown"``.

	    Args:
	        text: The utterance to label. Non-string values (for example the NaN
	            that pandas yields for an empty CSV cell) are treated like an
	            empty string instead of raising.

	    Returns:
	        A ``(label, reasoning)`` pair, where ``reasoning`` is a short
	        description of the rule that produced the label.
	    """
	    if not isinstance(text, str):
	        return "unknown", "empty"

	    t = text.strip()
	    if not t:
	        return "unknown", "empty"

	    # reminder: an explicit "remind me" / "don't forget" is unambiguous on its
	    # own; any other reminder keyword also needs a future-time anchor.
	    if _EXPLICIT_REMIND.search(t):
	        return "reminder", "explicit remind-me phrase"
	    if REMINDER_KW.search(t) and FUTURE_TIME.search(t):
	        return "reminder", "reminder kw + future time"

	    # emotional support: a strong feeling word together with a first-person
	    # pronoun, or a consoling phrase aimed at the other person.
	    if EMO_STRONG.search(t) and _FIRST_PERSON.search(t):
	        return "emotional-support", "first-person emotion word"
	    if EMO_RECEIVE.search(t):
	        return "emotional-support", "consoling phrase"

	    # action-item: the first token is an imperative verb and the text does not
	    # end with a question mark, or an obligation phrase plus a deadline word.
	    first = re.split(r"[\s,.!?]+", t.lower(), maxsplit=1)[0]
	    if first in IMPER_VERBS and not t.endswith("?"):
	        return "action-item", f"imperative verb '{first}'"
	    if NEED_KW.search(t) and _DEADLINE.search(t):
	        return "action-item", "need-to + deadline"

	    # small-talk: a greeting/closing/filler keyword, or at most three
	    # whitespace-separated tokens.
	    if SMALL_KW.search(t):
	        return "small-talk", "greeting/closing keyword"
	    if len(t.split()) <= 3:
	        return "small-talk", "very short utterance"

	    return "unknown", "no rule fired"


	def main():
	    """Sample the dev set from the weak-label CSV and write both output sheets.

	    Reads WEAK_CSV, shuffles it with a fixed seed, keeps up to PER_CLASS rows
	    for each of the five intent labels, then writes dev_unlabeled.csv and
	    dev_gold_auto.csv into ART and prints a short summary.
	    """
	    weak = pd.read_csv(WEAK_CSV)

	    # Draw a seeded permutation of row positions and reorder the frame by it.
	    order = pd.Series(range(len(weak))).sample(frac=1, random_state=SEED).tolist()
	    shuffled = weak.iloc[order].reset_index(drop=True)

	    # Stratify: the first PER_CLASS shuffled rows of every class, in this order.
	    classes = ["reminder", "emotional-support", "action-item", "small-talk", "unknown"]
	    sample = pd.concat(
	        [shuffled[shuffled["label"] == cls].head(PER_CLASS) for cls in classes],
	        ignore_index=True,
	    )

	    unlabeled_path = ART / "dev_unlabeled.csv"
	    gold_path = ART / "dev_gold_auto.csv"

	    # Sheet for human annotators: the text plus an empty label column.
	    blank_sheet = sample[["text"]].copy()
	    blank_sheet["label"] = ""
	    blank_sheet.to_csv(unlabeled_path, index=False, quoting=csv.QUOTE_MINIMAL)

	    # Heuristic gold labels; a row needs review when the tight rule and the
	    # weak label disagree.
	    gold_rows = []
	    for text, weak_label in zip(sample["text"], sample["label"]):
	        label, reason = auto_gold(text)
	        gold_rows.append((text, label, weak_label, reason, label != weak_label))

	    gold = pd.DataFrame(
	        gold_rows,
	        columns=["text", "label", "weak_label", "auto_gold_reason", "needs_review"],
	    )
	    gold.to_csv(gold_path, index=False, quoting=csv.QUOTE_MINIMAL)

	    print(f"sampled {len(sample)} rows ({PER_CLASS} per class)")
	    print(f"wrote {unlabeled_path} (hand-label here)")
	    print(f"wrote {gold_path} (auto-gold, used by train.py)")
	    print()

	    agreed = ~gold["needs_review"]
	    print(
	        f"auto-gold vs weak-label agreement: "
	        f"{agreed.sum()}/{len(gold)} "
	        f"({100 * agreed.mean():.1f}%)"
	    )
	    print()

	    print("auto-gold distribution:")
	    distribution = gold["label"].value_counts()
	    for cls, n in distribution.items():
	        print(f" {cls:<20s} {n}")


	if __name__ == "__main__":
	    main()