"""Build training/data/eval.jsonl — a held-out test set for scoring the fine-tune. These examples are DISJOINT from training/data/dataset.jsonl (fresh names, dates, wording) so the eval measures generalization, not memorization. Each record fixes a `now` so relative-date answers ("tomorrow", "next Tuesday") are deterministic. Anchor for relative dates: 2026-09-14 is a **Monday** (so tomorrow=Sep 15 Tue, this Fri=Sep 18, this Sat=Sep 19, next Mon=Sep 21, next Tue=Sep 22, +3 days=Sep 17). Run: python training/gen_eval.py -> writes training/data/eval.jsonl Score: see training/eval.py (needs an INFERENCE_BASE_URL serving the model). """ from __future__ import annotations import json import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from server.schema import ActionPlan # noqa: E402 NOW = "2026-09-14T09:00:00" # Monday def ev(title, start, end=None, location=None, reminder=None, attendees=None, notes=None): return { "title": title, "start": start, "end": end, "location": location, "attendees": attendees or [], "reminder_minutes": reminder, "notes": notes, } def plan(events=None, reply="", clarify=None, reasoning=None): return { "reasoning": reasoning, "events": events or [], "conflicts": [], "proposed_times": [], "reply_draft": reply, "needs_clarification": clarify, } # (id, category, now, thread, gold_plan) EVAL = [ # --- A. explicit-date events (copy the stated date/time) --- ("e01", "events", NOW, "Teacher: Back-to-school night is Wednesday September 16 at 6:30pm in the gym\nMe: we'll be there", plan([ev("Back-to-school night", "2026-09-16T18:30:00", location="School gym", reminder=120)], "See you Wednesday at 6:30!")), ("e02", "events", NOW, "Dr. Okafor's office: Liam's cleaning is set for Tuesday October 6 at 3:30pm\nMe: thanks", plan([ev("Liam — dental cleaning", "2026-10-06T15:30:00", location="Dr. Okafor's office", reminder=120)], "Got it — Tuesday the 6th at 3:30.")), ("e03", "events", NOW, "Sam: dinner Friday September 25, 7-9pm at Trattoria Verde?\nMe: yes!", plan([ev("Dinner with Sam", "2026-09-25T19:00:00", "2026-09-25T21:00:00", "Trattoria Verde", 60)], "Friday at 7 — can't wait!")), ("e04", "events", NOW, "Coach: makeup game is Sunday Sept 27 at 1pm at Field 2\nMe: got it", plan([ev("Makeup game", "2026-09-27T13:00:00", location="Field 2", reminder=60)], "We'll be at Field 2 by 1!")), ("e05", "events", NOW, "Clinic: flu shot clinic Thursday October 1 at 8:15am\nMe: see you", plan([ev("Flu shot", "2026-10-01T08:15:00", location="Clinic", reminder=60)], "See you Thursday at 8:15.")), ("e06", "events", NOW, "PTA: book fair opens Monday Sept 28 at 9am\nMe: noted", plan([ev("Book fair", "2026-09-28T09:00:00", reminder=60)], "Thanks — added it!")), ("e15", "events", NOW, "Library: your book club meets Thursday October 8 at 7pm\nMe: see you there", plan([ev("Book club", "2026-10-08T19:00:00", location="Library", reminder=60)], "See you on the 8th at 7!")), # --- B. relative-date events (the hard skill) --- ("e07", "events", NOW, "Maya: study group tomorrow at 4pm at the library\nMe: I'll be there", plan([ev("Study group", "2026-09-15T16:00:00", location="Library", reminder=30)], "See you at 4 tomorrow!")), ("e08", "events", NOW, "Dad: family dinner this Friday 6:30pm at Grandma's\nMe: yum", plan([ev("Family dinner", "2026-09-18T18:30:00", location="Grandma's", reminder=60)], "See you Friday at 6:30!")), ("e09", "events", NOW, "Manager: our 1:1 moves to next Tuesday at 11am\nMe: ok", plan([ev("1:1 with manager", "2026-09-22T11:00:00", reminder=15)], "Got it — Tuesday at 11.")), ("e10", "events", NOW, "Prof: the problem set is due in three days at noon\nMe: on it", plan([ev("Problem set due", "2026-09-17T12:00:00", reminder=1440)], "It'll be in by Thursday noon!")), ("e11", "events", NOW, "Zoe: trivia night tonight at 8pm at The Tap\nMe: in!", plan([ev("Trivia night", "2026-09-14T20:00:00", location="The Tap", reminder=30)], "See you at 8!")), ("e12", "events", NOW, "Neighbor: garage sale this Saturday 7am to 1pm\nMe: I'll swing by", plan([ev("Garage sale", "2026-09-19T07:00:00", "2026-09-19T13:00:00")], "I'll stop by Saturday!")), ("e13", "events", NOW, "HR: onboarding session is next Monday at 10am\nMe: thanks", plan([ev("Onboarding session", "2026-09-21T10:00:00", reminder=60)], "Thanks — see you Monday at 10.")), ("e14", "events", NOW, "Landlord: apartment inspection on the 30th at 2pm\nMe: noted", plan([ev("Apartment inspection", "2026-09-30T14:00:00", reminder=120)], "Noted — the 30th at 2.")), ("e16", "events", NOW, "Boss: all-hands is a week from today at 3pm\nMe: noted", plan([ev("All-hands", "2026-09-21T15:00:00", reminder=30)], "Got it — added the all-hands.")), # --- C. no-event chitchat (must NOT invent events) --- ("c01", "no_event", NOW, "Friend: that movie was incredible, thanks for the rec!\nMe: right?? so good", plan([], "Right? One of the best this year!")), ("c02", "no_event", NOW, "Mom: just checking you got home safe\nMe: yep, all good!", plan([], "Home safe — thanks for checking!")), ("c03", "no_event", NOW, "Coworker: congrats on shipping the release!\nMe: thanks, big team effort", plan([], "Appreciate it — couldn't have done it without the team!")), ("c04", "no_event", NOW, "Sis: the cookies turned out amazing\nMe: yay! glad the recipe worked", plan([], "So glad — they're a keeper!")), ("c05", "no_event", NOW, "Coach: great hustle at practice today, everyone\nMe: thanks coach!", plan([], "Thanks, Coach — good session today!")), # --- D. needs clarification (ambiguous; ask, don't invent) --- ("q01", "clarify", NOW, "Alex: we should grab coffee soon!\nMe: yes, let's!", plan([], "Definitely! What day works for you?", clarify="When would you like to grab coffee?")), ("q02", "clarify", NOW, "Team: let's schedule the offsite sometime next month\nMe: sounds good", plan([], "Sounds good — which dates are we considering?", clarify="What date next month works for the offsite?")), ("q03", "clarify", NOW, "Jen: dinner this week? not sure which night works for you\nMe: let's find a time", plan([], "I'm free most nights — what works for you?", clarify="Which night this week works for dinner?")), ("q04", "clarify", NOW, "Coach: we'll add an extra practice, date still TBD\nMe: keep me posted", plan([], "Sounds good — let me know the day and time.", clarify="Which day and time is the extra practice?")), # --- E. multi-event threads --- ("m01", "multi", NOW, "Spouse: Friday Sept 18 — Mia has the dentist at 9am, then soccer at 4pm\nMe: adding both", plan([ev("Mia — dentist", "2026-09-18T09:00:00", reminder=60), ev("Mia — soccer", "2026-09-18T16:00:00", reminder=30)], "Both on the calendar — dentist at 9, soccer at 4.")), ("m02", "multi", NOW, "Office: standup is Wednesday Sept 16 at 9:30am and the sprint demo is Sept 16 at 2pm\nMe: blocked both", plan([ev("Standup", "2026-09-16T09:30:00", reminder=15), ev("Sprint demo", "2026-09-16T14:00:00", reminder=15)], "Blocked both — standup at 9:30, demo at 2.")), ("m03", "multi", NOW, "Camp: drop-off is tomorrow at 8am and pickup is tomorrow at 3pm\nMe: got it", plan([ev("Camp drop-off", "2026-09-15T08:00:00", reminder=30), ev("Camp pickup", "2026-09-15T15:00:00", reminder=30)], "Got it — drop-off 8am, pickup 3pm tomorrow.")), ] # Expansion (2026-06-10): 28 examples left single binary cases dominating the score # (each event = 4.5 recall points), so run-to-run training jitter swamped the gate. # +32 examples with TWO additional weekday anchors. 2026-10-07 = Wednesday, # 2026-11-13 = Friday. Convention reminder: "next " = that DOW of NEXT week. NOW2 = "2026-10-07T08:30:00" # Wednesday NOW3 = "2026-11-13T17:00:00" # Friday EVAL2 = [ # --- explicit dates, varied formats --- ("e17", "events", NOW2, "Dr. Nguyen's office: confirming Quinn's filling on October 21 at 4:45pm\nMe: thanks", plan([ev("Quinn — filling", "2026-10-21T16:45:00", location="Dr. Nguyen's office", reminder=120)], "See you on the 21st at 4:45.")), ("e18", "events", NOW2, "Tessa: my housewarming is 11/14 at 6pm, 48 Birchwood Ln\nMe: wouldn't miss it", plan([ev("Tessa's housewarming", "2026-11-14T18:00:00", location="48 Birchwood Ln", reminder=120)], "Can't wait — see you the 14th!")), ("e19", "events", NOW2, "Choir: winter audition is December 3rd at 9:15am in room 12\nMe: signed up", plan([ev("Choir audition", "2026-12-03T09:15:00", location="Room 12", reminder=60)], "Signed up — Dec 3rd at 9:15.")), ("e20", "events", NOW2, "Parent council: meeting Thursday October 15 at 7pm in the cafeteria\nMe: I'll come", plan([ev("Parent council meeting", "2026-10-15T19:00:00", location="Cafeteria", reminder=60)], "See you Thursday the 15th.")), ("e21", "events", NOW2, "Mr. Alvarez: tutoring moved to 5:30 on the 28th\nMe: ok", plan([ev("Tutoring", "2026-10-28T17:30:00", reminder=30)], "Got it — the 28th at 5:30.")), ("e22", "events", NOW3, "Rec center: lifeguard recert is November 30 at 8am, bring your card\nMe: noted", plan([ev("Lifeguard recertification", "2026-11-30T08:00:00", location="Rec center", reminder=60, notes="Bring certification card")], "Noted — Nov 30 at 8.")), ("e23", "events", NOW3, "Wren: brunch December 5, noon, at Petit Four?\nMe: yes please", plan([ev("Brunch with Wren", "2026-12-05T12:00:00", location="Petit Four", reminder=60)], "Noon on the 5th — in!")), ("e24", "events", NOW3, "Garage: your inspection slot is Nov 18 at 7:45am\nMe: I'll drop it off", plan([ev("Car inspection", "2026-11-18T07:45:00", location="Garage", reminder=60)], "Dropping it off the 18th at 7:45.")), # --- "opens/starts/launches" phrasing (e06's failure shape) --- ("e25", "events", NOW2, "Library: the used book sale opens Friday October 16 at 9am\nMe: I'll be there early", plan([ev("Used book sale", "2026-10-16T09:00:00", location="Library", reminder=60)], "There at 9 on the 16th!")), ("e26", "events", NOW2, "City pool: winter session registration opens Tuesday October 13 at 6am\nMe: setting an alarm", plan([ev("Pool registration opens", "2026-10-13T06:00:00", reminder=30)], "Alarm set for the 13th at 6.")), ("e27", "events", NOW3, "Museum: the new dinosaur exhibit opens November 20 at 10am\nMe: kids will love that", plan([ev("Dinosaur exhibit opening", "2026-11-20T10:00:00", location="Museum", reminder=120)], "Taking the kids on the 20th!")), # --- relative dates from non-Monday anchors --- ("r01", "events", NOW2, "Sage: yoga tomorrow at 7:15am?\nMe: I'm in", plan([ev("Yoga with Sage", "2026-10-08T07:15:00", reminder=30)], "See you at 7:15!")), ("r02", "events", NOW3, "Theo: pancakes at ours tomorrow, 9am\nMe: yum, in", plan([ev("Pancakes at Theo's", "2026-11-14T09:00:00", location="Theo's", reminder=30)], "See you at 9!")), ("r03", "events", NOW3, "Bowling league: lanes tonight at 8\nMe: rolling in", plan([ev("Bowling league", "2026-11-13T20:00:00", reminder=30)], "See everyone at 8!")), ("r04", "events", NOW2, "Aunt Rosa: lunch this Sunday at 1?\nMe: lovely", plan([ev("Lunch with Aunt Rosa", "2026-10-11T13:00:00", reminder=60)], "Sunday at 1 — lovely!")), ("r05", "events", NOW2, "Barber: I can fit you in next Wednesday at 5:45\nMe: book it", plan([ev("Haircut", "2026-10-14T17:45:00", location="Barber", reminder=30)], "Booked — next Wednesday at 5:45.")), ("r06", "events", NOW3, "Ski club: first meetup is next Friday at 6:30pm\nMe: stoked", plan([ev("Ski club meetup", "2026-11-20T18:30:00", reminder=60)], "Stoked — see everyone the 20th.")), ("r07", "events", NOW2, "Prof Idris: revised draft due in five days, by 5pm\nMe: on it", plan([ev("Revised draft due", "2026-10-12T17:00:00", reminder=1440)], "It'll be in by Monday at 5.")), ("r08", "events", NOW2, "Nico: cabin trip a week from Saturday, leaving 8am\nMe: packing already", plan([ev("Cabin trip departure", "2026-10-17T08:00:00", reminder=120)], "Packed and ready for the 17th!")), # --- multi-event --- ("m04", "multi", NOW2, "Saturday Oct 10 plan — farmers market at 9, Quinn's game at 1, dinner with the Patels at 6\nMe: full day!", plan([ev("Farmers market", "2026-10-10T09:00:00", reminder=30), ev("Quinn's game", "2026-10-10T13:00:00", reminder=60), ev("Dinner with the Patels", "2026-10-10T18:00:00", reminder=60)], "All three added — market, game, dinner.")), ("m05", "multi", NOW3, "Clinic: flu shots Saturday Nov 21 at 10am, and your follow-up is Dec 1 at 2:30pm\nMe: both noted", plan([ev("Flu shot", "2026-11-21T10:00:00", location="Clinic", reminder=60), ev("Follow-up appointment", "2026-12-01T14:30:00", location="Clinic", reminder=120)], "Both on the calendar.")), ("m06", "multi", NOW2, "School: early release Friday Oct 9 at 12:15, then the fall social runs 5 to 7pm same day\nMe: busy friday", plan([ev("Early release", "2026-10-09T12:15:00", location="School", reminder=120), ev("Fall social", "2026-10-09T17:00:00", "2026-10-09T19:00:00", "School", 60)], "Got both — pickup 12:15, social at 5.")), ("m07", "multi", NOW3, "Vet: Biscuit's nail trim is Nov 17 at 3pm and his dental is Dec 9 at 8am\nMe: poor guy", plan([ev("Biscuit — nail trim", "2026-11-17T15:00:00", location="Vet", reminder=60), ev("Biscuit — dental", "2026-12-09T08:00:00", location="Vet", reminder=120)], "Both booked for Biscuit.")), # --- no-event --- ("c06", "no_event", NOW2, "Marisol: the recital photos came out so cute\nMe: omg yes, framing them", plan([], "Aren't they adorable? Definitely framing a few!")), ("c07", "no_event", NOW2, "Dev: ugh, standup ran 40 minutes today\nMe: brutal", plan([], "Brutal — hope tomorrow's is snappier.")), ("c08", "no_event", NOW3, "Neighbor: leaf pile in front of ours, kids are welcome to jump in\nMe: they'll be thrilled", plan([], "They'll be over in five minutes, thank you!")), ("c09", "no_event", NOW3, "Gran: loved the call yesterday, made my week\nMe: mine too", plan([], "Made mine too — talk soon, Gran!")), ("c10", "no_event", NOW2, "Courier: your parcel was left at the side door\nMe: got it, thanks", plan([], "Got it — thanks for the heads up!")), # --- clarify (announced but unconfirmed / choices) --- ("q05", "clarify", NOW2, "Room parent: bake sale is being planned, date to be announced\nMe: count me in for cookies", plan([], "I'm in for cookies — what date should I plan for?", clarify="What date is the bake sale?")), ("q06", "clarify", NOW2, "Piano teacher: we should add a makeup lesson — does Tuesday or Thursday suit?\nMe: let me check", plan([], "Let me check the calendar and get back to you.", clarify="Tuesday or Thursday — and what time works for the makeup lesson?")), ("q07", "clarify", NOW3, "Uncle Ray: thinking of visiting sometime around the holidays\nMe: that would be great", plan([], "We'd love that! Which dates are you thinking?", clarify="Which dates would Uncle Ray visit?")), ("q08", "clarify", NOW3, "HOA: the annual meeting will be rescheduled, new time to come\nMe: ok", plan([], "Thanks — I'll watch for the new time.", clarify="What is the new date and time of the annual meeting?")), ] EVAL = EVAL + EVAL2 def main(): out = Path(__file__).with_name("data") / "eval.jsonl" ok, bad = [], 0 seen_ids = set() for rid, cat, now, thread, gold in EVAL: assert rid not in seen_ids, f"duplicate id {rid}" seen_ids.add(rid) try: ActionPlan(**gold) # validate the gold answer except Exception as e: # noqa: BLE001 print(f" [{rid}] GOLD INVALID: {e}"); bad += 1; continue ok.append({"id": rid, "category": cat, "now": now, "thread": thread, "gold": gold}) print(f"{len(ok)} valid / {len(EVAL)} ({bad} invalid)") with out.open("w", encoding="utf-8") as f: for r in ok: f.write(json.dumps(r, ensure_ascii=False) + "\n") print(f"wrote {out}") if __name__ == "__main__": main()