OffGridSchedula / training /gen_eval.py
ParetoOptimal's picture
Initial Commit
0366d65
Raw
History Blame Contribute Delete
17.2 kB
"""Build training/data/eval.jsonl β€” a held-out test set for scoring the fine-tune.
These examples are DISJOINT from training/data/dataset.jsonl (fresh names, dates,
wording) so the eval measures generalization, not memorization. Each record fixes
a `now` so relative-date answers ("tomorrow", "next Tuesday") are deterministic.
Anchor for relative dates: 2026-09-14 is a **Monday** (so tomorrow=Sep 15 Tue,
this Fri=Sep 18, this Sat=Sep 19, next Mon=Sep 21, next Tue=Sep 22, +3 days=Sep 17).
Run: python training/gen_eval.py -> writes training/data/eval.jsonl
Score: see training/eval.py (needs an INFERENCE_BASE_URL serving the model).
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from server.schema import ActionPlan # noqa: E402
# Fixed "current time" attached to the records that reference NOW, so that
# relative-date answers ("tomorrow", "next Tuesday") are deterministic.
NOW = "2026-09-14T09:00:00"  # Monday
def ev(title, start, end=None, location=None, reminder=None, attendees=None, notes=None):
    """Assemble one gold event as a plain dict.

    ``reminder`` is stored under the ``reminder_minutes`` key, and a missing or
    empty ``attendees`` becomes an empty list. Every other argument is copied
    through unchanged under a key of the same name.
    """
    event = {"title": title, "start": start, "end": end, "location": location}
    event["attendees"] = attendees if attendees else []
    event["reminder_minutes"] = reminder
    event["notes"] = notes
    return event
def plan(events=None, reply="", clarify=None, reasoning=None):
    """Assemble one gold action plan as a plain dict.

    ``reply`` is stored as ``reply_draft`` and ``clarify`` as
    ``needs_clarification``. ``conflicts`` and ``proposed_times`` are always
    empty lists, and a missing or empty ``events`` becomes an empty list.
    """
    gold = {"reasoning": reasoning}
    gold["events"] = events if events else []
    gold["conflicts"] = []
    gold["proposed_times"] = []
    gold["reply_draft"] = reply
    gold["needs_clarification"] = clarify
    return gold
# Held-out eval records. Each entry is a 5-tuple:
#   (id, category, now, thread, gold_plan)
# `thread` is the message exchange, one "Speaker: text" line per turn, and
# `gold_plan` is the expected answer built with plan() / ev().
# Dashes in the strings are real em dashes (U+2014); they are written verbatim
# to the output because main() dumps with ensure_ascii=False.
EVAL = [
    # --- A. explicit-date events (copy the stated date/time) ---
    ("e01", "events", NOW,
     "Teacher: Back-to-school night is Wednesday September 16 at 6:30pm in the gym\nMe: we'll be there",
     plan([ev("Back-to-school night", "2026-09-16T18:30:00", location="School gym", reminder=120)],
          "See you Wednesday at 6:30!")),
    ("e02", "events", NOW,
     "Dr. Okafor's office: Liam's cleaning is set for Tuesday October 6 at 3:30pm\nMe: thanks",
     plan([ev("Liam — dental cleaning", "2026-10-06T15:30:00", location="Dr. Okafor's office", reminder=120)],
          "Got it — Tuesday the 6th at 3:30.")),
    ("e03", "events", NOW,
     "Sam: dinner Friday September 25, 7-9pm at Trattoria Verde?\nMe: yes!",
     plan([ev("Dinner with Sam", "2026-09-25T19:00:00", "2026-09-25T21:00:00", "Trattoria Verde", 60)],
          "Friday at 7 — can't wait!")),
    ("e04", "events", NOW,
     "Coach: makeup game is Sunday Sept 27 at 1pm at Field 2\nMe: got it",
     plan([ev("Makeup game", "2026-09-27T13:00:00", location="Field 2", reminder=60)],
          "We'll be at Field 2 by 1!")),
    ("e05", "events", NOW,
     "Clinic: flu shot clinic Thursday October 1 at 8:15am\nMe: see you",
     plan([ev("Flu shot", "2026-10-01T08:15:00", location="Clinic", reminder=60)],
          "See you Thursday at 8:15.")),
    ("e06", "events", NOW,
     "PTA: book fair opens Monday Sept 28 at 9am\nMe: noted",
     plan([ev("Book fair", "2026-09-28T09:00:00", reminder=60)], "Thanks — added it!")),
    ("e15", "events", NOW,
     "Library: your book club meets Thursday October 8 at 7pm\nMe: see you there",
     plan([ev("Book club", "2026-10-08T19:00:00", location="Library", reminder=60)],
          "See you on the 8th at 7!")),
    # --- B. relative-date events (the hard skill) ---
    ("e07", "events", NOW,
     "Maya: study group tomorrow at 4pm at the library\nMe: I'll be there",
     plan([ev("Study group", "2026-09-15T16:00:00", location="Library", reminder=30)],
          "See you at 4 tomorrow!")),
    ("e08", "events", NOW,
     "Dad: family dinner this Friday 6:30pm at Grandma's\nMe: yum",
     plan([ev("Family dinner", "2026-09-18T18:30:00", location="Grandma's", reminder=60)],
          "See you Friday at 6:30!")),
    ("e09", "events", NOW,
     "Manager: our 1:1 moves to next Tuesday at 11am\nMe: ok",
     plan([ev("1:1 with manager", "2026-09-22T11:00:00", reminder=15)], "Got it — Tuesday at 11.")),
    ("e10", "events", NOW,
     "Prof: the problem set is due in three days at noon\nMe: on it",
     plan([ev("Problem set due", "2026-09-17T12:00:00", reminder=1440)], "It'll be in by Thursday noon!")),
    ("e11", "events", NOW,
     "Zoe: trivia night tonight at 8pm at The Tap\nMe: in!",
     plan([ev("Trivia night", "2026-09-14T20:00:00", location="The Tap", reminder=30)], "See you at 8!")),
    ("e12", "events", NOW,
     "Neighbor: garage sale this Saturday 7am to 1pm\nMe: I'll swing by",
     plan([ev("Garage sale", "2026-09-19T07:00:00", "2026-09-19T13:00:00")], "I'll stop by Saturday!")),
    ("e13", "events", NOW,
     "HR: onboarding session is next Monday at 10am\nMe: thanks",
     plan([ev("Onboarding session", "2026-09-21T10:00:00", reminder=60)], "Thanks — see you Monday at 10.")),
    ("e14", "events", NOW,
     "Landlord: apartment inspection on the 30th at 2pm\nMe: noted",
     plan([ev("Apartment inspection", "2026-09-30T14:00:00", reminder=120)], "Noted — the 30th at 2.")),
    ("e16", "events", NOW,
     "Boss: all-hands is a week from today at 3pm\nMe: noted",
     plan([ev("All-hands", "2026-09-21T15:00:00", reminder=30)], "Got it — added the all-hands.")),
    # --- C. no-event chitchat (must NOT invent events) ---
    ("c01", "no_event", NOW,
     "Friend: that movie was incredible, thanks for the rec!\nMe: right?? so good",
     plan([], "Right? One of the best this year!")),
    ("c02", "no_event", NOW,
     "Mom: just checking you got home safe\nMe: yep, all good!",
     plan([], "Home safe — thanks for checking!")),
    ("c03", "no_event", NOW,
     "Coworker: congrats on shipping the release!\nMe: thanks, big team effort",
     plan([], "Appreciate it — couldn't have done it without the team!")),
    ("c04", "no_event", NOW,
     "Sis: the cookies turned out amazing\nMe: yay! glad the recipe worked",
     plan([], "So glad — they're a keeper!")),
    ("c05", "no_event", NOW,
     "Coach: great hustle at practice today, everyone\nMe: thanks coach!",
     plan([], "Thanks, Coach — good session today!")),
    # --- D. needs clarification (ambiguous; ask, don't invent) ---
    ("q01", "clarify", NOW,
     "Alex: we should grab coffee soon!\nMe: yes, let's!",
     plan([], "Definitely! What day works for you?", clarify="When would you like to grab coffee?")),
    ("q02", "clarify", NOW,
     "Team: let's schedule the offsite sometime next month\nMe: sounds good",
     plan([], "Sounds good — which dates are we considering?",
          clarify="What date next month works for the offsite?")),
    ("q03", "clarify", NOW,
     "Jen: dinner this week? not sure which night works for you\nMe: let's find a time",
     plan([], "I'm free most nights — what works for you?",
          clarify="Which night this week works for dinner?")),
    ("q04", "clarify", NOW,
     "Coach: we'll add an extra practice, date still TBD\nMe: keep me posted",
     plan([], "Sounds good — let me know the day and time.",
          clarify="Which day and time is the extra practice?")),
    # --- E. multi-event threads ---
    ("m01", "multi", NOW,
     "Spouse: Friday Sept 18 — Mia has the dentist at 9am, then soccer at 4pm\nMe: adding both",
     plan([ev("Mia — dentist", "2026-09-18T09:00:00", reminder=60),
           ev("Mia — soccer", "2026-09-18T16:00:00", reminder=30)],
          "Both on the calendar — dentist at 9, soccer at 4.")),
    ("m02", "multi", NOW,
     "Office: standup is Wednesday Sept 16 at 9:30am and the sprint demo is Sept 16 at 2pm\nMe: blocked both",
     plan([ev("Standup", "2026-09-16T09:30:00", reminder=15),
           ev("Sprint demo", "2026-09-16T14:00:00", reminder=15)],
          "Blocked both — standup at 9:30, demo at 2.")),
    ("m03", "multi", NOW,
     "Camp: drop-off is tomorrow at 8am and pickup is tomorrow at 3pm\nMe: got it",
     plan([ev("Camp drop-off", "2026-09-15T08:00:00", reminder=30),
           ev("Camp pickup", "2026-09-15T15:00:00", reminder=30)],
          "Got it — drop-off 8am, pickup 3pm tomorrow.")),
]
# Expansion (2026-06-10): 28 examples left single binary cases dominating the score
# (each event = 4.5 recall points), so run-to-run training jitter swamped the gate.
# +32 examples with TWO additional weekday anchors. 2026-10-07 = Wednesday,
# 2026-11-13 = Friday. Convention reminder: "next <DOW>" = that DOW of NEXT week.
# NOW2 / NOW3 are the fixed `now` values attached to the expansion records, so
# relative dates are also exercised from non-Monday starting points.
NOW2 = "2026-10-07T08:30:00"  # Wednesday
NOW3 = "2026-11-13T17:00:00"  # Friday
# Expansion records anchored at NOW2 / NOW3. Each entry is a 5-tuple
# (id, category, now, thread, gold_plan), with gold plans built by plan() / ev().
# Dashes in the strings are real em dashes (U+2014); they are written verbatim
# to the output because main() dumps with ensure_ascii=False.
EVAL2 = [
    # --- explicit dates, varied formats ---
    ("e17", "events", NOW2,
     "Dr. Nguyen's office: confirming Quinn's filling on October 21 at 4:45pm\nMe: thanks",
     plan([ev("Quinn — filling", "2026-10-21T16:45:00", location="Dr. Nguyen's office", reminder=120)],
          "See you on the 21st at 4:45.")),
    ("e18", "events", NOW2,
     "Tessa: my housewarming is 11/14 at 6pm, 48 Birchwood Ln\nMe: wouldn't miss it",
     plan([ev("Tessa's housewarming", "2026-11-14T18:00:00", location="48 Birchwood Ln", reminder=120)],
          "Can't wait — see you the 14th!")),
    ("e19", "events", NOW2,
     "Choir: winter audition is December 3rd at 9:15am in room 12\nMe: signed up",
     plan([ev("Choir audition", "2026-12-03T09:15:00", location="Room 12", reminder=60)],
          "Signed up — Dec 3rd at 9:15.")),
    ("e20", "events", NOW2,
     "Parent council: meeting Thursday October 15 at 7pm in the cafeteria\nMe: I'll come",
     plan([ev("Parent council meeting", "2026-10-15T19:00:00", location="Cafeteria", reminder=60)],
          "See you Thursday the 15th.")),
    ("e21", "events", NOW2,
     "Mr. Alvarez: tutoring moved to 5:30 on the 28th\nMe: ok",
     plan([ev("Tutoring", "2026-10-28T17:30:00", reminder=30)], "Got it — the 28th at 5:30.")),
    ("e22", "events", NOW3,
     "Rec center: lifeguard recert is November 30 at 8am, bring your card\nMe: noted",
     plan([ev("Lifeguard recertification", "2026-11-30T08:00:00", location="Rec center",
              reminder=60, notes="Bring certification card")],
          "Noted — Nov 30 at 8.")),
    ("e23", "events", NOW3,
     "Wren: brunch December 5, noon, at Petit Four?\nMe: yes please",
     plan([ev("Brunch with Wren", "2026-12-05T12:00:00", location="Petit Four", reminder=60)],
          "Noon on the 5th — in!")),
    ("e24", "events", NOW3,
     "Garage: your inspection slot is Nov 18 at 7:45am\nMe: I'll drop it off",
     plan([ev("Car inspection", "2026-11-18T07:45:00", location="Garage", reminder=60)],
          "Dropping it off the 18th at 7:45.")),
    # --- "opens/starts/launches" phrasing (e06's failure shape) ---
    ("e25", "events", NOW2,
     "Library: the used book sale opens Friday October 16 at 9am\nMe: I'll be there early",
     plan([ev("Used book sale", "2026-10-16T09:00:00", location="Library", reminder=60)],
          "There at 9 on the 16th!")),
    ("e26", "events", NOW2,
     "City pool: winter session registration opens Tuesday October 13 at 6am\nMe: setting an alarm",
     plan([ev("Pool registration opens", "2026-10-13T06:00:00", reminder=30)],
          "Alarm set for the 13th at 6.")),
    ("e27", "events", NOW3,
     "Museum: the new dinosaur exhibit opens November 20 at 10am\nMe: kids will love that",
     plan([ev("Dinosaur exhibit opening", "2026-11-20T10:00:00", location="Museum", reminder=120)],
          "Taking the kids on the 20th!")),
    # --- relative dates from non-Monday anchors ---
    ("r01", "events", NOW2,
     "Sage: yoga tomorrow at 7:15am?\nMe: I'm in",
     plan([ev("Yoga with Sage", "2026-10-08T07:15:00", reminder=30)], "See you at 7:15!")),
    ("r02", "events", NOW3,
     "Theo: pancakes at ours tomorrow, 9am\nMe: yum, in",
     plan([ev("Pancakes at Theo's", "2026-11-14T09:00:00", location="Theo's", reminder=30)],
          "See you at 9!")),
    ("r03", "events", NOW3,
     "Bowling league: lanes tonight at 8\nMe: rolling in",
     plan([ev("Bowling league", "2026-11-13T20:00:00", reminder=30)], "See everyone at 8!")),
    ("r04", "events", NOW2,
     "Aunt Rosa: lunch this Sunday at 1?\nMe: lovely",
     plan([ev("Lunch with Aunt Rosa", "2026-10-11T13:00:00", reminder=60)], "Sunday at 1 — lovely!")),
    ("r05", "events", NOW2,
     "Barber: I can fit you in next Wednesday at 5:45\nMe: book it",
     plan([ev("Haircut", "2026-10-14T17:45:00", location="Barber", reminder=30)],
          "Booked — next Wednesday at 5:45.")),
    ("r06", "events", NOW3,
     "Ski club: first meetup is next Friday at 6:30pm\nMe: stoked",
     plan([ev("Ski club meetup", "2026-11-20T18:30:00", reminder=60)],
          "Stoked — see everyone the 20th.")),
    ("r07", "events", NOW2,
     "Prof Idris: revised draft due in five days, by 5pm\nMe: on it",
     plan([ev("Revised draft due", "2026-10-12T17:00:00", reminder=1440)],
          "It'll be in by Monday at 5.")),
    ("r08", "events", NOW2,
     "Nico: cabin trip a week from Saturday, leaving 8am\nMe: packing already",
     plan([ev("Cabin trip departure", "2026-10-17T08:00:00", reminder=120)],
          "Packed and ready for the 17th!")),
    # --- multi-event ---
    ("m04", "multi", NOW2,
     "Saturday Oct 10 plan — farmers market at 9, Quinn's game at 1, dinner with the Patels at 6\nMe: full day!",
     plan([ev("Farmers market", "2026-10-10T09:00:00", reminder=30),
           ev("Quinn's game", "2026-10-10T13:00:00", reminder=60),
           ev("Dinner with the Patels", "2026-10-10T18:00:00", reminder=60)],
          "All three added — market, game, dinner.")),
    ("m05", "multi", NOW3,
     "Clinic: flu shots Saturday Nov 21 at 10am, and your follow-up is Dec 1 at 2:30pm\nMe: both noted",
     plan([ev("Flu shot", "2026-11-21T10:00:00", location="Clinic", reminder=60),
           ev("Follow-up appointment", "2026-12-01T14:30:00", location="Clinic", reminder=120)],
          "Both on the calendar.")),
    ("m06", "multi", NOW2,
     "School: early release Friday Oct 9 at 12:15, then the fall social runs 5 to 7pm same day\nMe: busy friday",
     plan([ev("Early release", "2026-10-09T12:15:00", location="School", reminder=120),
           ev("Fall social", "2026-10-09T17:00:00", "2026-10-09T19:00:00", "School", 60)],
          "Got both — pickup 12:15, social at 5.")),
    ("m07", "multi", NOW3,
     "Vet: Biscuit's nail trim is Nov 17 at 3pm and his dental is Dec 9 at 8am\nMe: poor guy",
     plan([ev("Biscuit — nail trim", "2026-11-17T15:00:00", location="Vet", reminder=60),
           ev("Biscuit — dental", "2026-12-09T08:00:00", location="Vet", reminder=120)],
          "Both booked for Biscuit.")),
    # --- no-event ---
    ("c06", "no_event", NOW2,
     "Marisol: the recital photos came out so cute\nMe: omg yes, framing them",
     plan([], "Aren't they adorable? Definitely framing a few!")),
    ("c07", "no_event", NOW2,
     "Dev: ugh, standup ran 40 minutes today\nMe: brutal",
     plan([], "Brutal — hope tomorrow's is snappier.")),
    ("c08", "no_event", NOW3,
     "Neighbor: leaf pile in front of ours, kids are welcome to jump in\nMe: they'll be thrilled",
     plan([], "They'll be over in five minutes, thank you!")),
    ("c09", "no_event", NOW3,
     "Gran: loved the call yesterday, made my week\nMe: mine too",
     plan([], "Made mine too — talk soon, Gran!")),
    ("c10", "no_event", NOW2,
     "Courier: your parcel was left at the side door\nMe: got it, thanks",
     plan([], "Got it — thanks for the heads up!")),
    # --- clarify (announced but unconfirmed / choices) ---
    ("q05", "clarify", NOW2,
     "Room parent: bake sale is being planned, date to be announced\nMe: count me in for cookies",
     plan([], "I'm in for cookies — what date should I plan for?",
          clarify="What date is the bake sale?")),
    ("q06", "clarify", NOW2,
     "Piano teacher: we should add a makeup lesson — does Tuesday or Thursday suit?\nMe: let me check",
     plan([], "Let me check the calendar and get back to you.",
          clarify="Tuesday or Thursday — and what time works for the makeup lesson?")),
    ("q07", "clarify", NOW3,
     "Uncle Ray: thinking of visiting sometime around the holidays\nMe: that would be great",
     plan([], "We'd love that! Which dates are you thinking?",
          clarify="Which dates would Uncle Ray visit?")),
    ("q08", "clarify", NOW3,
     "HOA: the annual meeting will be rescheduled, new time to come\nMe: ok",
     plan([], "Thanks — I'll watch for the new time.",
          clarify="What is the new date and time of the annual meeting?")),
]
# main() iterates the combined list, so the expansion records follow the originals.
EVAL = EVAL + EVAL2
def main():
    """Validate every gold plan and write the valid records to data/eval.jsonl.

    Each record in EVAL is checked by constructing ``ActionPlan(**gold)``.
    Records whose gold plan fails that check are reported on stdout and left
    out of the output; the rest are written as one JSON object per line, in
    EVAL order, to a ``data/eval.jsonl`` file next to this script.

    Raises:
        AssertionError: if two records share the same id.
    """
    out = Path(__file__).with_name("data") / "eval.jsonl"
    ok, bad = [], 0
    seen_ids = set()
    for rid, cat, now, thread, gold in EVAL:
        # Raised explicitly rather than via `assert` so the duplicate-id check
        # still runs under `python -O`; exception type and message are unchanged.
        if rid in seen_ids:
            raise AssertionError(f"duplicate id {rid}")
        seen_ids.add(rid)
        try:
            ActionPlan(**gold)  # validate the gold answer
        except Exception as e:  # noqa: BLE001
            print(f" [{rid}] GOLD INVALID: {e}")
            bad += 1
            continue
        ok.append({"id": rid, "category": cat, "now": now, "thread": thread, "gold": gold})
    print(f"{len(ok)} valid / {len(EVAL)} ({bad} invalid)")
    # The data directory may not exist yet (e.g. on a fresh checkout); create it
    # so opening the output file for writing cannot fail on a missing parent.
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8") as f:
        for r in ok:
            # ensure_ascii=False keeps non-ASCII text (e.g. em dashes) readable.
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"wrote {out}")
# Generate the eval file only when run as a script, not when imported.
if __name__ == "__main__":
    main()