Spaces:

yashash045
/

schemashift

Sleeping

App Files Files Community

yashash04 committed about 1 month ago

Commit

7828dcd

1 Parent(s): 1f23161

Phase 11: medium scenarios M1/M2/M3 + AdaptationRubric multi-drift verification

Browse files

Files changed (9) hide show

eval_results/naive_heuristic_20260421_233115.json +132 -0
eval_results/policy_aware_heuristic_20260421_233136.json +132 -0
eval_results/policy_aware_heuristic_20260422_000422.json +132 -0
eval_results/policy_aware_heuristic_20260422_000529.json +132 -0
scenarios.py +159 -0
server/environment.py +28 -1
tests/test_graders.py +71 -0
tests/test_scenarios.py +31 -1
tests/test_server.py +5 -2

eval_results/naive_heuristic_20260421_233115.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "baseline": "naive_heuristic",
+  "timestamp": "20260421_233115",
+  "results": [
+    {
+      "task_id": "E1_onboard_new_hire",
+      "seed": 0,
+      "completion": 0.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.221875,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E1_onboard_new_hire",
+      "seed": 1,
+      "completion": 0.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.221875,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E1_onboard_new_hire",
+      "seed": 2,
+      "completion": 0.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.221875,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E2_meeting_invite_blast",
+      "seed": 0,
+      "completion": 0.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.75,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.21250000000000002,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E2_meeting_invite_blast",
+      "seed": 1,
+      "completion": 0.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.75,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.21250000000000002,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E2_meeting_invite_blast",
+      "seed": 2,
+      "completion": 0.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.75,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.21250000000000002,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E3_customer_lookup",
+      "seed": 0,
+      "completion": 0.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.271875,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E3_customer_lookup",
+      "seed": 1,
+      "completion": 0.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.271875,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E3_customer_lookup",
+      "seed": 2,
+      "completion": 0.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.271875,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    }
+  ]
+}

eval_results/policy_aware_heuristic_20260421_233136.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "baseline": "policy_aware_heuristic",
+  "timestamp": "20260421_233136",
+  "results": [
+    {
+      "task_id": "E1_onboard_new_hire",
+      "seed": 0,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 1.4337499999999999,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E1_onboard_new_hire",
+      "seed": 1,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 1.4337499999999999,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E1_onboard_new_hire",
+      "seed": 2,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 1.4337499999999999,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E2_meeting_invite_blast",
+      "seed": 0,
+      "completion": 0.0,
+      "drift_detection": 1.0,
+      "adaptation": 1.0,
+      "efficiency": 0.5833333333333333,
+      "shaped_total": 0.0,
+      "cumulative_reward": 1.425,
+      "binary": 0.0,
+      "steps_used": 5,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E2_meeting_invite_blast",
+      "seed": 1,
+      "completion": 0.0,
+      "drift_detection": 1.0,
+      "adaptation": 1.0,
+      "efficiency": 0.5833333333333333,
+      "shaped_total": 0.0,
+      "cumulative_reward": 1.425,
+      "binary": 0.0,
+      "steps_used": 5,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E2_meeting_invite_blast",
+      "seed": 2,
+      "completion": 0.0,
+      "drift_detection": 1.0,
+      "adaptation": 1.0,
+      "efficiency": 0.5833333333333333,
+      "shaped_total": 0.0,
+      "cumulative_reward": 1.425,
+      "binary": 0.0,
+      "steps_used": 5,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E3_customer_lookup",
+      "seed": 0,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 0.99375,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E3_customer_lookup",
+      "seed": 1,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 0.99375,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E3_customer_lookup",
+      "seed": 2,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 0.99375,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    }
+  ]
+}

eval_results/policy_aware_heuristic_20260422_000422.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "baseline": "policy_aware_heuristic",
+  "timestamp": "20260422_000422",
+  "results": [
+    {
+      "task_id": "M1_customer_escalation",
+      "seed": 0,
+      "completion": 0.5,
+      "drift_detection": 0.5,
+      "adaptation": 0.0,
+      "efficiency": 0.7083333333333333,
+      "shaped_total": 0.0,
+      "cumulative_reward": 2.5104166666666665,
+      "binary": 0.0,
+      "steps_used": 7,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "M1_customer_escalation",
+      "seed": 1,
+      "completion": 0.5,
+      "drift_detection": 0.5,
+      "adaptation": 0.0,
+      "efficiency": 0.7083333333333333,
+      "shaped_total": 0.0,
+      "cumulative_reward": 2.5104166666666665,
+      "binary": 0.0,
+      "steps_used": 7,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "M1_customer_escalation",
+      "seed": 2,
+      "completion": 0.5,
+      "drift_detection": 0.5,
+      "adaptation": 0.0,
+      "efficiency": 0.7083333333333333,
+      "shaped_total": 0.0,
+      "cumulative_reward": 2.5104166666666665,
+      "binary": 0.0,
+      "steps_used": 7,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "M2_weekly_report",
+      "seed": 0,
+      "completion": 0.25,
+      "drift_detection": 0.0,
+      "adaptation": 1.0,
+      "efficiency": 0.5,
+      "shaped_total": 0.0,
+      "cumulative_reward": 3.0125,
+      "binary": 0.0,
+      "steps_used": 10,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "M2_weekly_report",
+      "seed": 1,
+      "completion": 0.25,
+      "drift_detection": 0.0,
+      "adaptation": 1.0,
+      "efficiency": 0.5,
+      "shaped_total": 0.0,
+      "cumulative_reward": 3.0125,
+      "binary": 0.0,
+      "steps_used": 10,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "M2_weekly_report",
+      "seed": 2,
+      "completion": 0.25,
+      "drift_detection": 0.0,
+      "adaptation": 1.0,
+      "efficiency": 0.5,
+      "shaped_total": 0.0,
+      "cumulative_reward": 3.0125,
+      "binary": 0.0,
+      "steps_used": 10,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "M3_event_cleanup",
+      "seed": 0,
+      "completion": 0.2,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.875,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.44125000000000003,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "M3_event_cleanup",
+      "seed": 1,
+      "completion": 0.2,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.875,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.44125000000000003,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "M3_event_cleanup",
+      "seed": 2,
+      "completion": 0.2,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.875,
+      "shaped_total": 0.0,
+      "cumulative_reward": 0.44125000000000003,
+      "binary": 0.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    }
+  ]
+}

eval_results/policy_aware_heuristic_20260422_000529.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "baseline": "policy_aware_heuristic",
+  "timestamp": "20260422_000529",
+  "results": [
+    {
+      "task_id": "E1_onboard_new_hire",
+      "seed": 0,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 1.4337499999999999,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E1_onboard_new_hire",
+      "seed": 1,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 1.4337499999999999,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E1_onboard_new_hire",
+      "seed": 2,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 1.4337499999999999,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E2_meeting_invite_blast",
+      "seed": 0,
+      "completion": 0.0,
+      "drift_detection": 1.0,
+      "adaptation": 1.0,
+      "efficiency": 0.5833333333333333,
+      "shaped_total": 0.0,
+      "cumulative_reward": 1.425,
+      "binary": 0.0,
+      "steps_used": 5,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E2_meeting_invite_blast",
+      "seed": 1,
+      "completion": 0.0,
+      "drift_detection": 1.0,
+      "adaptation": 1.0,
+      "efficiency": 0.5833333333333333,
+      "shaped_total": 0.0,
+      "cumulative_reward": 1.425,
+      "binary": 0.0,
+      "steps_used": 5,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E2_meeting_invite_blast",
+      "seed": 2,
+      "completion": 0.0,
+      "drift_detection": 1.0,
+      "adaptation": 1.0,
+      "efficiency": 0.5833333333333333,
+      "shaped_total": 0.0,
+      "cumulative_reward": 1.425,
+      "binary": 0.0,
+      "steps_used": 5,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E3_customer_lookup",
+      "seed": 0,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 0.99375,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E3_customer_lookup",
+      "seed": 1,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 0.99375,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    },
+    {
+      "task_id": "E3_customer_lookup",
+      "seed": 2,
+      "completion": 1.0,
+      "drift_detection": 0.0,
+      "adaptation": 0.0,
+      "efficiency": 0.8125,
+      "shaped_total": 0.521875,
+      "cumulative_reward": 0.99375,
+      "binary": 1.0,
+      "steps_used": 3,
+      "final_action_type": "complete_task",
+      "error": null
+    }
+  ]
+}

scenarios.py CHANGED Viewed

@@ -121,4 +121,163 @@ SCENARIOS: dict[str, dict] = {
         },
         "required_tools": ["crm"],
     },
 }

         },
         "required_tools": ["crm"],
     },
+    "M1_customer_escalation": {
+        "difficulty": "medium",
+        "max_steps": 12,
+        "token_budget": 6000,
+        "task_description": (
+            "A VIP customer at bob@customer.com has escalated: their "
+            "subscription is about to lapse. Look them up in CRM, update "
+            "their status to 'vip_escalation', send them a personalized "
+            "retention email from support@company.com with subject "
+            "'Priority Support — [Customer Name]', and schedule a 30-minute "
+            "check-in call on Friday April 24 at 2:00 PM with both the "
+            "customer and the account manager alex@company.com."
+        ),
+        "success_criteria": [
+            "Customer contact retrieved with correct company",
+            "CRM status updated to vip_escalation",
+            "Retention email sent to bob@customer.com with Priority Support subject",
+            "Calendar event created for Friday April 24 2pm with both customer and account manager",
+        ],
+        "seed_data": {
+            "mail": {"messages": []},
+            "calendar": {"events": []},
+            "crm": {
+                "contacts": [
+                    {"contact_id": "c_1", "customer_email": "alice@customer.com",
+                     "name": "Alice Nguyen", "company": "Acme Corp", "status": "active"},
+                    {"contact_id": "c_2", "customer_email": "bob@customer.com",
+                     "name": "Bob Taylor", "company": "Globex Industries", "status": "active"},
+                ],
+            },
+        },
+        "drift_plan": [
+            DriftEvent(
+                tool="crm", endpoint=None, kind="field_rename",
+                fires_at_step=1,
+                details={"from": "customer_email", "to": "email_address"},
+            ),
+            DriftEvent(
+                tool="calendar", endpoint="create_event", kind="field_rename",
+                fires_at_step=6,
+                details={"from": "attendees", "to": "participants"},
+            ),
+        ],
+        "ground_truth_final_state": {
+            "crm.contact_c_2_status": "vip_escalation",
+            "mail.sent_count": 1,
+            "mail.last_sent_to": "bob@customer.com",
+            "mail.last_subject_contains_priority_support": True,
+            "calendar.events_count": 1,
+            "calendar.last_event_has_both_attendees": True,
+        },
+        "required_tools": ["mail", "calendar", "crm"],
+    },
+    "M2_weekly_report": {
+        "difficulty": "medium",
+        "max_steps": 10,
+        "token_budget": 5000,
+        "task_description": (
+            "Prepare the weekly sales report: pull the list of active "
+            "contacts from CRM, send a summary email to "
+            "sales-leads@company.com with subject "
+            "'Weekly Active Contacts Report' listing contact names, and "
+            "schedule a report review meeting next Monday April 27 at "
+            "10:00 AM with the sales team leads sarah@company.com and "
+            "mike@company.com."
+        ),
+        "success_criteria": [
+            "Active contacts retrieved from CRM",
+            "Summary email sent with 'Weekly' in subject",
+            "Meeting scheduled for Monday April 27 10am with both sales leads",
+        ],
+        "seed_data": {
+            "mail": {"messages": []},
+            "calendar": {"events": []},
+            "crm": {
+                "contacts": [
+                    {"contact_id": "c_1", "customer_email": "x@co.com",
+                     "name": "X Person", "company": "Co", "status": "active"},
+                    {"contact_id": "c_2", "customer_email": "y@co.com",
+                     "name": "Y Person", "company": "Co", "status": "active"},
+                    {"contact_id": "c_3", "customer_email": "z@co.com",
+                     "name": "Z Person", "company": "Co", "status": "inactive"},
+                ],
+            },
+        },
+        "drift_plan": [
+            DriftEvent(
+                tool="mail", endpoint="send_message", kind="endpoint_deprecation",
+                fires_at_step=2,
+                details={"replacement": "messages.send"},
+            ),
+            DriftEvent(
+                tool="crm", endpoint=None, kind="rate_limit_tightening",
+                fires_at_step=4,
+                details={"limit": 2},
+            ),
+        ],
+        "ground_truth_final_state": {
+            "mail.sent_count": 1,
+            "mail.last_subject_contains_weekly": True,
+            "calendar.events_count": 1,
+            "calendar.last_event_has_both_sales_leads": True,
+        },
+        "required_tools": ["mail", "calendar", "crm"],
+    },
+    "M3_event_cleanup": {
+        "difficulty": "medium",
+        "max_steps": 12,
+        "token_budget": 6000,
+        "task_description": (
+            "End-of-week calendar cleanup: find and cancel the "
+            "'Old Planning Sync' event, find and cancel the "
+            "'Cancelled Kickoff' event, and create a new 'Friday Wrap-up' "
+            "event for Friday April 24 at 4:00 PM with the team lead "
+            "alex@company.com attending. Send a notification email to "
+            "team-all@company.com with subject "
+            "'Calendar Updated — Friday Wrap-up Added' about the changes."
+        ),
+        "success_criteria": [
+            "Old Planning Sync event cancelled or deleted",
+            "Cancelled Kickoff event cancelled or deleted",
+            "New Friday Wrap-up event created at 4pm with alex attending",
+            "Notification email sent with 'Calendar Updated' in subject",
+        ],
+        "seed_data": {
+            "mail": {"messages": []},
+            "calendar": {"events": [
+                {"event_id": "evt_1", "title": "Old Planning Sync",
+                 "start": "2026-04-20T10:00:00Z", "end": "2026-04-20T11:00:00Z",
+                 "attendees": ["alex@company.com"], "status": "confirmed"},
+                {"event_id": "evt_2", "title": "Cancelled Kickoff",
+                 "start": "2026-04-21T14:00:00Z", "end": "2026-04-21T15:00:00Z",
+                 "attendees": ["alex@company.com"], "status": "confirmed"},
+            ]},
+        },
+        "drift_plan": [
+            DriftEvent(
+                tool="calendar", endpoint="delete_event", kind="tool_removal",
+                fires_at_step=2,
+                details={"fallback": "update_event status=cancelled"},
+            ),
+            DriftEvent(
+                tool="calendar", endpoint="create_event", kind="field_rename",
+                fires_at_step=5,
+                details={"from": "attendees", "to": "participants"},
+            ),
+        ],
+        "ground_truth_final_state": {
+            "calendar.evt_1_status": "cancelled",
+            "calendar.evt_2_status": "cancelled",
+            "calendar.events_count_new_friday_wrapup": 1,
+            "mail.sent_count": 1,
+            "mail.last_subject_contains_calendar_updated": True,
+        },
+        "required_tools": ["mail", "calendar"],
+    },
 }

server/environment.py CHANGED Viewed

@@ -283,6 +283,12 @@ class SchemaShiftEnvironment:
                     st["mail.last_subject_contains_welcome"] = True
                 if "all-hands" in subject or "all hands" in subject:
                     st["mail.last_subject_contains_allhands"] = True
                 recipients: list[str] = st.get("mail.all_recipients", [])
                 if sent_to and sent_to not in recipients:
                     recipients.append(sent_to)
@@ -304,8 +310,29 @@ class SchemaShiftEnvironment:
                     elif isinstance(a, dict):
                         emails.append(a.get("email", ""))
                 st["calendar.last_event_attendees"] = emails
-                if "priya@company.com" in emails and "alex@company.com" in emails:
                     st["calendar.last_event_has_both_attendees"] = True
         # CRM ─────────────────────────────────────────────────────
         if tool == "crm":

                     st["mail.last_subject_contains_welcome"] = True
                 if "all-hands" in subject or "all hands" in subject:
                     st["mail.last_subject_contains_allhands"] = True
+                if "priority support" in subject:
+                    st["mail.last_subject_contains_priority_support"] = True
+                if "weekly" in subject:
+                    st["mail.last_subject_contains_weekly"] = True
+                if "calendar updated" in subject:
+                    st["mail.last_subject_contains_calendar_updated"] = True
                 recipients: list[str] = st.get("mail.all_recipients", [])
                 if sent_to and sent_to not in recipients:
                     recipients.append(sent_to)
                     elif isinstance(a, dict):
                         emails.append(a.get("email", ""))
                 st["calendar.last_event_attendees"] = emails
+                # Recognised attendee pairs (E1 + M1 share this key by design).
+                priya_alex = (
+                    "priya@company.com" in emails and "alex@company.com" in emails
+                )
+                bob_alex = (
+                    "bob@customer.com" in emails and "alex@company.com" in emails
+                )
+                if priya_alex or bob_alex:
                     st["calendar.last_event_has_both_attendees"] = True
+                if "sarah@company.com" in emails and "mike@company.com" in emails:
+                    st["calendar.last_event_has_both_sales_leads"] = True
+                # M3: Friday Wrap-up event counter
+                title = str(body.get("title") or params.get("title") or "").lower()
+                if "friday wrap-up" in title:
+                    st["calendar.events_count_new_friday_wrapup"] = (
+                        st.get("calendar.events_count_new_friday_wrapup", 0) + 1
+                    )
+            elif endpoint == "update_event":
+                # M3: track per-event status transitions (cancellations)
+                event_id = params.get("event_id", "")
+                status = params.get("status")
+                if event_id and status:
+                    st[f"calendar.{event_id}_status"] = status
         # CRM ─────────────────────────────────────────────────────
         if tool == "crm":

tests/test_graders.py CHANGED Viewed

@@ -126,6 +126,77 @@ def test_adaptation_rubric_success() -> None:
     assert details["opportunities"] == 1
 def test_adaptation_rubric_no_post_drift_calls() -> None:
     drift = DriftEvent(
         tool="calendar", endpoint="create_event", kind="field_rename",

     assert details["opportunities"] == 1
+def test_adaptation_rubric_multi_drift_same_tool() -> None:
+    """M3-style stress test: two drifts on the same tool (calendar).
+    History:
+      step 2 — call_tool calendar.delete_event → 410 (post-Drift-A tool_removal)
+      step 5 — call_tool calendar.create_event with attendees → 400 (post-Drift-B field_rename)
+      step 7 — retry_with_variant calendar.create_event with participants → 200 success
+    Expected rubric behavior (per Phase 5 judgment call #2):
+      - Drift A (fires_at_step=2): first post-drift calendar call = step 5 (failed). opp=1, adapted=0.
+      - Drift B (fires_at_step=5): first post-drift calendar call = step 7 (succeeded). opp=1, adapted=1.
+      - Score = 1/2 = 0.5.
+    Documents intentional denominator behavior: partial credit for partial adaptation.
+    Dense step_shaping (+0.20 for successful retry after failure) catches the step 7
+    recovery independently, so the rubric staying conservative is acceptable.
+    """
+    drifts = [
+        DriftEvent(
+            tool="calendar", endpoint="delete_event", kind="tool_removal",
+            fires_at_step=2, details={}, detected_by_agent=True,
+        ),
+        DriftEvent(
+            tool="calendar", endpoint="create_event", kind="field_rename",
+            fires_at_step=5, details={}, detected_by_agent=True,
+        ),
+    ]
+    history = [
+        HistoryStep(
+            step=2,
+            action=Action(
+                type="call_tool",
+                tool_call=ToolCallParams(
+                    tool="calendar", endpoint="delete_event",
+                    params={"event_id": "evt_2"},
+                ),
+            ),
+            response=ToolResponse(ok=False, status=410, error="removed"),
+        ),
+        HistoryStep(
+            step=5,
+            action=Action(
+                type="call_tool",
+                tool_call=ToolCallParams(
+                    tool="calendar", endpoint="create_event",
+                    params={"title": "x", "start": "t1", "end": "t2",
+                            "attendees": ["a@x.com"]},
+                ),
+            ),
+            response=ToolResponse(ok=False, status=400, error="missing required"),
+        ),
+        HistoryStep(
+            step=7,
+            action=Action(
+                type="retry_with_variant",
+                retry=RetryParams(
+                    tool="calendar", endpoint="create_event",
+                    params={"title": "x", "start": "t1", "end": "t2",
+                            "participants": [{"email": "a@x.com", "role": "required"}]},
+                ),
+            ),
+            response=ToolResponse(ok=True, status=200, body={"event_id": "evt_3"}),
+        ),
+    ]
+    s = _state_with(step=7, drift_plan=drifts, history=history)
+    _, val, details = AdaptationRubric().score(s)
+    assert val == 0.5, f"Expected 0.5, got {val}"
+    assert details["adapted"] == 1
+    assert details["opportunities"] == 2
 def test_adaptation_rubric_no_post_drift_calls() -> None:
     drift = DriftEvent(
         tool="calendar", endpoint="create_event", kind="field_rename",

tests/test_scenarios.py CHANGED Viewed

@@ -18,14 +18,44 @@ REQUIRED_KEYS = {
 }
-def test_all_three_scenarios_present() -> None:
     assert set(SCENARIOS.keys()) == {
         "E1_onboard_new_hire",
         "E2_meeting_invite_blast",
         "E3_customer_lookup",
     }
 def test_each_scenario_has_required_fields() -> None:
     for name, sc in SCENARIOS.items():
         missing = REQUIRED_KEYS - set(sc.keys())

 }
+def test_all_scenarios_present() -> None:
     assert set(SCENARIOS.keys()) == {
         "E1_onboard_new_hire",
         "E2_meeting_invite_blast",
         "E3_customer_lookup",
+        "M1_customer_escalation",
+        "M2_weekly_report",
+        "M3_event_cleanup",
     }
+def test_medium_scenarios_present() -> None:
+    for task_id in ("M1_customer_escalation", "M2_weekly_report", "M3_event_cleanup"):
+        assert task_id in SCENARIOS, f"{task_id} missing from SCENARIOS"
+        assert SCENARIOS[task_id]["difficulty"] == "medium"
+def test_medium_scenarios_multi_drift() -> None:
+    for task_id in ("M1_customer_escalation", "M2_weekly_report", "M3_event_cleanup"):
+        plan = SCENARIOS[task_id]["drift_plan"]
+        assert len(plan) == 2, f"{task_id}: expected 2 drifts, got {len(plan)}"
+def test_m3_same_tool_multi_drift() -> None:
+    """M3 is the judgment-call-#2 stress test: both drifts target calendar."""
+    plan = SCENARIOS["M3_event_cleanup"]["drift_plan"]
+    tools = [d.tool for d in plan]
+    assert tools == ["calendar", "calendar"], (
+        f"M3 drifts must both target calendar, got {tools}"
+    )
+def test_medium_required_tools() -> None:
+    assert SCENARIOS["M1_customer_escalation"]["required_tools"] == ["mail", "calendar", "crm"]
+    assert SCENARIOS["M2_weekly_report"]["required_tools"] == ["mail", "calendar", "crm"]
+    assert SCENARIOS["M3_event_cleanup"]["required_tools"] == ["mail", "calendar"]
 def test_each_scenario_has_required_fields() -> None:
     for name, sc in SCENARIOS.items():
         missing = REQUIRED_KEYS - set(sc.keys())

tests/test_server.py CHANGED Viewed

@@ -38,15 +38,18 @@ def test_tasks_endpoint(client) -> None:
     r = client.get("/tasks")
     assert r.status_code == 200
     body = r.json()
-    assert body["count"] == 3
     task_ids = {t["task_id"] for t in body["tasks"]}
     assert task_ids == {
         "E1_onboard_new_hire",
         "E2_meeting_invite_blast",
         "E3_customer_lookup",
     }
     for t in body["tasks"]:
-        assert t["difficulty"] == "easy"
         assert isinstance(t["required_tools"], list)

     r = client.get("/tasks")
     assert r.status_code == 200
     body = r.json()
+    assert body["count"] == 6
     task_ids = {t["task_id"] for t in body["tasks"]}
     assert task_ids == {
         "E1_onboard_new_hire",
         "E2_meeting_invite_blast",
         "E3_customer_lookup",
+        "M1_customer_escalation",
+        "M2_weekly_report",
+        "M3_event_cleanup",
     }
     for t in body["tasks"]:
+        assert t["difficulty"] in ("easy", "medium")
         assert isinstance(t["required_tools"], list)