yashash04 committed on
Commit
7828dcd
·
1 Parent(s): 1f23161

Phase 11: medium scenarios M1/M2/M3 + AdaptationRubric multi-drift verification

Browse files
eval_results/naive_heuristic_20260421_233115.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "baseline": "naive_heuristic",
3
+ "timestamp": "20260421_233115",
4
+ "results": [
5
+ {
6
+ "task_id": "E1_onboard_new_hire",
7
+ "seed": 0,
8
+ "completion": 0.0,
9
+ "drift_detection": 0.0,
10
+ "adaptation": 0.0,
11
+ "efficiency": 0.8125,
12
+ "shaped_total": 0.0,
13
+ "cumulative_reward": 0.221875,
14
+ "binary": 0.0,
15
+ "steps_used": 3,
16
+ "final_action_type": "complete_task",
17
+ "error": null
18
+ },
19
+ {
20
+ "task_id": "E1_onboard_new_hire",
21
+ "seed": 1,
22
+ "completion": 0.0,
23
+ "drift_detection": 0.0,
24
+ "adaptation": 0.0,
25
+ "efficiency": 0.8125,
26
+ "shaped_total": 0.0,
27
+ "cumulative_reward": 0.221875,
28
+ "binary": 0.0,
29
+ "steps_used": 3,
30
+ "final_action_type": "complete_task",
31
+ "error": null
32
+ },
33
+ {
34
+ "task_id": "E1_onboard_new_hire",
35
+ "seed": 2,
36
+ "completion": 0.0,
37
+ "drift_detection": 0.0,
38
+ "adaptation": 0.0,
39
+ "efficiency": 0.8125,
40
+ "shaped_total": 0.0,
41
+ "cumulative_reward": 0.221875,
42
+ "binary": 0.0,
43
+ "steps_used": 3,
44
+ "final_action_type": "complete_task",
45
+ "error": null
46
+ },
47
+ {
48
+ "task_id": "E2_meeting_invite_blast",
49
+ "seed": 0,
50
+ "completion": 0.0,
51
+ "drift_detection": 0.0,
52
+ "adaptation": 0.0,
53
+ "efficiency": 0.75,
54
+ "shaped_total": 0.0,
55
+ "cumulative_reward": 0.21250000000000002,
56
+ "binary": 0.0,
57
+ "steps_used": 3,
58
+ "final_action_type": "complete_task",
59
+ "error": null
60
+ },
61
+ {
62
+ "task_id": "E2_meeting_invite_blast",
63
+ "seed": 1,
64
+ "completion": 0.0,
65
+ "drift_detection": 0.0,
66
+ "adaptation": 0.0,
67
+ "efficiency": 0.75,
68
+ "shaped_total": 0.0,
69
+ "cumulative_reward": 0.21250000000000002,
70
+ "binary": 0.0,
71
+ "steps_used": 3,
72
+ "final_action_type": "complete_task",
73
+ "error": null
74
+ },
75
+ {
76
+ "task_id": "E2_meeting_invite_blast",
77
+ "seed": 2,
78
+ "completion": 0.0,
79
+ "drift_detection": 0.0,
80
+ "adaptation": 0.0,
81
+ "efficiency": 0.75,
82
+ "shaped_total": 0.0,
83
+ "cumulative_reward": 0.21250000000000002,
84
+ "binary": 0.0,
85
+ "steps_used": 3,
86
+ "final_action_type": "complete_task",
87
+ "error": null
88
+ },
89
+ {
90
+ "task_id": "E3_customer_lookup",
91
+ "seed": 0,
92
+ "completion": 0.0,
93
+ "drift_detection": 0.0,
94
+ "adaptation": 0.0,
95
+ "efficiency": 0.8125,
96
+ "shaped_total": 0.0,
97
+ "cumulative_reward": 0.271875,
98
+ "binary": 0.0,
99
+ "steps_used": 3,
100
+ "final_action_type": "complete_task",
101
+ "error": null
102
+ },
103
+ {
104
+ "task_id": "E3_customer_lookup",
105
+ "seed": 1,
106
+ "completion": 0.0,
107
+ "drift_detection": 0.0,
108
+ "adaptation": 0.0,
109
+ "efficiency": 0.8125,
110
+ "shaped_total": 0.0,
111
+ "cumulative_reward": 0.271875,
112
+ "binary": 0.0,
113
+ "steps_used": 3,
114
+ "final_action_type": "complete_task",
115
+ "error": null
116
+ },
117
+ {
118
+ "task_id": "E3_customer_lookup",
119
+ "seed": 2,
120
+ "completion": 0.0,
121
+ "drift_detection": 0.0,
122
+ "adaptation": 0.0,
123
+ "efficiency": 0.8125,
124
+ "shaped_total": 0.0,
125
+ "cumulative_reward": 0.271875,
126
+ "binary": 0.0,
127
+ "steps_used": 3,
128
+ "final_action_type": "complete_task",
129
+ "error": null
130
+ }
131
+ ]
132
+ }
eval_results/policy_aware_heuristic_20260421_233136.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "baseline": "policy_aware_heuristic",
3
+ "timestamp": "20260421_233136",
4
+ "results": [
5
+ {
6
+ "task_id": "E1_onboard_new_hire",
7
+ "seed": 0,
8
+ "completion": 1.0,
9
+ "drift_detection": 0.0,
10
+ "adaptation": 0.0,
11
+ "efficiency": 0.8125,
12
+ "shaped_total": 0.521875,
13
+ "cumulative_reward": 1.4337499999999999,
14
+ "binary": 1.0,
15
+ "steps_used": 3,
16
+ "final_action_type": "complete_task",
17
+ "error": null
18
+ },
19
+ {
20
+ "task_id": "E1_onboard_new_hire",
21
+ "seed": 1,
22
+ "completion": 1.0,
23
+ "drift_detection": 0.0,
24
+ "adaptation": 0.0,
25
+ "efficiency": 0.8125,
26
+ "shaped_total": 0.521875,
27
+ "cumulative_reward": 1.4337499999999999,
28
+ "binary": 1.0,
29
+ "steps_used": 3,
30
+ "final_action_type": "complete_task",
31
+ "error": null
32
+ },
33
+ {
34
+ "task_id": "E1_onboard_new_hire",
35
+ "seed": 2,
36
+ "completion": 1.0,
37
+ "drift_detection": 0.0,
38
+ "adaptation": 0.0,
39
+ "efficiency": 0.8125,
40
+ "shaped_total": 0.521875,
41
+ "cumulative_reward": 1.4337499999999999,
42
+ "binary": 1.0,
43
+ "steps_used": 3,
44
+ "final_action_type": "complete_task",
45
+ "error": null
46
+ },
47
+ {
48
+ "task_id": "E2_meeting_invite_blast",
49
+ "seed": 0,
50
+ "completion": 0.0,
51
+ "drift_detection": 1.0,
52
+ "adaptation": 1.0,
53
+ "efficiency": 0.5833333333333333,
54
+ "shaped_total": 0.0,
55
+ "cumulative_reward": 1.425,
56
+ "binary": 0.0,
57
+ "steps_used": 5,
58
+ "final_action_type": "complete_task",
59
+ "error": null
60
+ },
61
+ {
62
+ "task_id": "E2_meeting_invite_blast",
63
+ "seed": 1,
64
+ "completion": 0.0,
65
+ "drift_detection": 1.0,
66
+ "adaptation": 1.0,
67
+ "efficiency": 0.5833333333333333,
68
+ "shaped_total": 0.0,
69
+ "cumulative_reward": 1.425,
70
+ "binary": 0.0,
71
+ "steps_used": 5,
72
+ "final_action_type": "complete_task",
73
+ "error": null
74
+ },
75
+ {
76
+ "task_id": "E2_meeting_invite_blast",
77
+ "seed": 2,
78
+ "completion": 0.0,
79
+ "drift_detection": 1.0,
80
+ "adaptation": 1.0,
81
+ "efficiency": 0.5833333333333333,
82
+ "shaped_total": 0.0,
83
+ "cumulative_reward": 1.425,
84
+ "binary": 0.0,
85
+ "steps_used": 5,
86
+ "final_action_type": "complete_task",
87
+ "error": null
88
+ },
89
+ {
90
+ "task_id": "E3_customer_lookup",
91
+ "seed": 0,
92
+ "completion": 1.0,
93
+ "drift_detection": 0.0,
94
+ "adaptation": 0.0,
95
+ "efficiency": 0.8125,
96
+ "shaped_total": 0.521875,
97
+ "cumulative_reward": 0.99375,
98
+ "binary": 1.0,
99
+ "steps_used": 3,
100
+ "final_action_type": "complete_task",
101
+ "error": null
102
+ },
103
+ {
104
+ "task_id": "E3_customer_lookup",
105
+ "seed": 1,
106
+ "completion": 1.0,
107
+ "drift_detection": 0.0,
108
+ "adaptation": 0.0,
109
+ "efficiency": 0.8125,
110
+ "shaped_total": 0.521875,
111
+ "cumulative_reward": 0.99375,
112
+ "binary": 1.0,
113
+ "steps_used": 3,
114
+ "final_action_type": "complete_task",
115
+ "error": null
116
+ },
117
+ {
118
+ "task_id": "E3_customer_lookup",
119
+ "seed": 2,
120
+ "completion": 1.0,
121
+ "drift_detection": 0.0,
122
+ "adaptation": 0.0,
123
+ "efficiency": 0.8125,
124
+ "shaped_total": 0.521875,
125
+ "cumulative_reward": 0.99375,
126
+ "binary": 1.0,
127
+ "steps_used": 3,
128
+ "final_action_type": "complete_task",
129
+ "error": null
130
+ }
131
+ ]
132
+ }
eval_results/policy_aware_heuristic_20260422_000422.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "baseline": "policy_aware_heuristic",
3
+ "timestamp": "20260422_000422",
4
+ "results": [
5
+ {
6
+ "task_id": "M1_customer_escalation",
7
+ "seed": 0,
8
+ "completion": 0.5,
9
+ "drift_detection": 0.5,
10
+ "adaptation": 0.0,
11
+ "efficiency": 0.7083333333333333,
12
+ "shaped_total": 0.0,
13
+ "cumulative_reward": 2.5104166666666665,
14
+ "binary": 0.0,
15
+ "steps_used": 7,
16
+ "final_action_type": "complete_task",
17
+ "error": null
18
+ },
19
+ {
20
+ "task_id": "M1_customer_escalation",
21
+ "seed": 1,
22
+ "completion": 0.5,
23
+ "drift_detection": 0.5,
24
+ "adaptation": 0.0,
25
+ "efficiency": 0.7083333333333333,
26
+ "shaped_total": 0.0,
27
+ "cumulative_reward": 2.5104166666666665,
28
+ "binary": 0.0,
29
+ "steps_used": 7,
30
+ "final_action_type": "complete_task",
31
+ "error": null
32
+ },
33
+ {
34
+ "task_id": "M1_customer_escalation",
35
+ "seed": 2,
36
+ "completion": 0.5,
37
+ "drift_detection": 0.5,
38
+ "adaptation": 0.0,
39
+ "efficiency": 0.7083333333333333,
40
+ "shaped_total": 0.0,
41
+ "cumulative_reward": 2.5104166666666665,
42
+ "binary": 0.0,
43
+ "steps_used": 7,
44
+ "final_action_type": "complete_task",
45
+ "error": null
46
+ },
47
+ {
48
+ "task_id": "M2_weekly_report",
49
+ "seed": 0,
50
+ "completion": 0.25,
51
+ "drift_detection": 0.0,
52
+ "adaptation": 1.0,
53
+ "efficiency": 0.5,
54
+ "shaped_total": 0.0,
55
+ "cumulative_reward": 3.0125,
56
+ "binary": 0.0,
57
+ "steps_used": 10,
58
+ "final_action_type": "complete_task",
59
+ "error": null
60
+ },
61
+ {
62
+ "task_id": "M2_weekly_report",
63
+ "seed": 1,
64
+ "completion": 0.25,
65
+ "drift_detection": 0.0,
66
+ "adaptation": 1.0,
67
+ "efficiency": 0.5,
68
+ "shaped_total": 0.0,
69
+ "cumulative_reward": 3.0125,
70
+ "binary": 0.0,
71
+ "steps_used": 10,
72
+ "final_action_type": "complete_task",
73
+ "error": null
74
+ },
75
+ {
76
+ "task_id": "M2_weekly_report",
77
+ "seed": 2,
78
+ "completion": 0.25,
79
+ "drift_detection": 0.0,
80
+ "adaptation": 1.0,
81
+ "efficiency": 0.5,
82
+ "shaped_total": 0.0,
83
+ "cumulative_reward": 3.0125,
84
+ "binary": 0.0,
85
+ "steps_used": 10,
86
+ "final_action_type": "complete_task",
87
+ "error": null
88
+ },
89
+ {
90
+ "task_id": "M3_event_cleanup",
91
+ "seed": 0,
92
+ "completion": 0.2,
93
+ "drift_detection": 0.0,
94
+ "adaptation": 0.0,
95
+ "efficiency": 0.875,
96
+ "shaped_total": 0.0,
97
+ "cumulative_reward": 0.44125000000000003,
98
+ "binary": 0.0,
99
+ "steps_used": 3,
100
+ "final_action_type": "complete_task",
101
+ "error": null
102
+ },
103
+ {
104
+ "task_id": "M3_event_cleanup",
105
+ "seed": 1,
106
+ "completion": 0.2,
107
+ "drift_detection": 0.0,
108
+ "adaptation": 0.0,
109
+ "efficiency": 0.875,
110
+ "shaped_total": 0.0,
111
+ "cumulative_reward": 0.44125000000000003,
112
+ "binary": 0.0,
113
+ "steps_used": 3,
114
+ "final_action_type": "complete_task",
115
+ "error": null
116
+ },
117
+ {
118
+ "task_id": "M3_event_cleanup",
119
+ "seed": 2,
120
+ "completion": 0.2,
121
+ "drift_detection": 0.0,
122
+ "adaptation": 0.0,
123
+ "efficiency": 0.875,
124
+ "shaped_total": 0.0,
125
+ "cumulative_reward": 0.44125000000000003,
126
+ "binary": 0.0,
127
+ "steps_used": 3,
128
+ "final_action_type": "complete_task",
129
+ "error": null
130
+ }
131
+ ]
132
+ }
eval_results/policy_aware_heuristic_20260422_000529.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "baseline": "policy_aware_heuristic",
3
+ "timestamp": "20260422_000529",
4
+ "results": [
5
+ {
6
+ "task_id": "E1_onboard_new_hire",
7
+ "seed": 0,
8
+ "completion": 1.0,
9
+ "drift_detection": 0.0,
10
+ "adaptation": 0.0,
11
+ "efficiency": 0.8125,
12
+ "shaped_total": 0.521875,
13
+ "cumulative_reward": 1.4337499999999999,
14
+ "binary": 1.0,
15
+ "steps_used": 3,
16
+ "final_action_type": "complete_task",
17
+ "error": null
18
+ },
19
+ {
20
+ "task_id": "E1_onboard_new_hire",
21
+ "seed": 1,
22
+ "completion": 1.0,
23
+ "drift_detection": 0.0,
24
+ "adaptation": 0.0,
25
+ "efficiency": 0.8125,
26
+ "shaped_total": 0.521875,
27
+ "cumulative_reward": 1.4337499999999999,
28
+ "binary": 1.0,
29
+ "steps_used": 3,
30
+ "final_action_type": "complete_task",
31
+ "error": null
32
+ },
33
+ {
34
+ "task_id": "E1_onboard_new_hire",
35
+ "seed": 2,
36
+ "completion": 1.0,
37
+ "drift_detection": 0.0,
38
+ "adaptation": 0.0,
39
+ "efficiency": 0.8125,
40
+ "shaped_total": 0.521875,
41
+ "cumulative_reward": 1.4337499999999999,
42
+ "binary": 1.0,
43
+ "steps_used": 3,
44
+ "final_action_type": "complete_task",
45
+ "error": null
46
+ },
47
+ {
48
+ "task_id": "E2_meeting_invite_blast",
49
+ "seed": 0,
50
+ "completion": 0.0,
51
+ "drift_detection": 1.0,
52
+ "adaptation": 1.0,
53
+ "efficiency": 0.5833333333333333,
54
+ "shaped_total": 0.0,
55
+ "cumulative_reward": 1.425,
56
+ "binary": 0.0,
57
+ "steps_used": 5,
58
+ "final_action_type": "complete_task",
59
+ "error": null
60
+ },
61
+ {
62
+ "task_id": "E2_meeting_invite_blast",
63
+ "seed": 1,
64
+ "completion": 0.0,
65
+ "drift_detection": 1.0,
66
+ "adaptation": 1.0,
67
+ "efficiency": 0.5833333333333333,
68
+ "shaped_total": 0.0,
69
+ "cumulative_reward": 1.425,
70
+ "binary": 0.0,
71
+ "steps_used": 5,
72
+ "final_action_type": "complete_task",
73
+ "error": null
74
+ },
75
+ {
76
+ "task_id": "E2_meeting_invite_blast",
77
+ "seed": 2,
78
+ "completion": 0.0,
79
+ "drift_detection": 1.0,
80
+ "adaptation": 1.0,
81
+ "efficiency": 0.5833333333333333,
82
+ "shaped_total": 0.0,
83
+ "cumulative_reward": 1.425,
84
+ "binary": 0.0,
85
+ "steps_used": 5,
86
+ "final_action_type": "complete_task",
87
+ "error": null
88
+ },
89
+ {
90
+ "task_id": "E3_customer_lookup",
91
+ "seed": 0,
92
+ "completion": 1.0,
93
+ "drift_detection": 0.0,
94
+ "adaptation": 0.0,
95
+ "efficiency": 0.8125,
96
+ "shaped_total": 0.521875,
97
+ "cumulative_reward": 0.99375,
98
+ "binary": 1.0,
99
+ "steps_used": 3,
100
+ "final_action_type": "complete_task",
101
+ "error": null
102
+ },
103
+ {
104
+ "task_id": "E3_customer_lookup",
105
+ "seed": 1,
106
+ "completion": 1.0,
107
+ "drift_detection": 0.0,
108
+ "adaptation": 0.0,
109
+ "efficiency": 0.8125,
110
+ "shaped_total": 0.521875,
111
+ "cumulative_reward": 0.99375,
112
+ "binary": 1.0,
113
+ "steps_used": 3,
114
+ "final_action_type": "complete_task",
115
+ "error": null
116
+ },
117
+ {
118
+ "task_id": "E3_customer_lookup",
119
+ "seed": 2,
120
+ "completion": 1.0,
121
+ "drift_detection": 0.0,
122
+ "adaptation": 0.0,
123
+ "efficiency": 0.8125,
124
+ "shaped_total": 0.521875,
125
+ "cumulative_reward": 0.99375,
126
+ "binary": 1.0,
127
+ "steps_used": 3,
128
+ "final_action_type": "complete_task",
129
+ "error": null
130
+ }
131
+ ]
132
+ }
scenarios.py CHANGED
@@ -121,4 +121,163 @@ SCENARIOS: dict[str, dict] = {
121
  },
122
  "required_tools": ["crm"],
123
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  }
 
121
  },
122
  "required_tools": ["crm"],
123
  },
124
+
125
+ "M1_customer_escalation": {
126
+ "difficulty": "medium",
127
+ "max_steps": 12,
128
+ "token_budget": 6000,
129
+ "task_description": (
130
+ "A VIP customer at bob@customer.com has escalated: their "
131
+ "subscription is about to lapse. Look them up in CRM, update "
132
+ "their status to 'vip_escalation', send them a personalized "
133
+ "retention email from support@company.com with subject "
134
+ "'Priority Support — [Customer Name]', and schedule a 30-minute "
135
+ "check-in call on Friday April 24 at 2:00 PM with both the "
136
+ "customer and the account manager alex@company.com."
137
+ ),
138
+ "success_criteria": [
139
+ "Customer contact retrieved with correct company",
140
+ "CRM status updated to vip_escalation",
141
+ "Retention email sent to bob@customer.com with Priority Support subject",
142
+ "Calendar event created for Friday April 24 2pm with both customer and account manager",
143
+ ],
144
+ "seed_data": {
145
+ "mail": {"messages": []},
146
+ "calendar": {"events": []},
147
+ "crm": {
148
+ "contacts": [
149
+ {"contact_id": "c_1", "customer_email": "alice@customer.com",
150
+ "name": "Alice Nguyen", "company": "Acme Corp", "status": "active"},
151
+ {"contact_id": "c_2", "customer_email": "bob@customer.com",
152
+ "name": "Bob Taylor", "company": "Globex Industries", "status": "active"},
153
+ ],
154
+ },
155
+ },
156
+ "drift_plan": [
157
+ DriftEvent(
158
+ tool="crm", endpoint=None, kind="field_rename",
159
+ fires_at_step=1,
160
+ details={"from": "customer_email", "to": "email_address"},
161
+ ),
162
+ DriftEvent(
163
+ tool="calendar", endpoint="create_event", kind="field_rename",
164
+ fires_at_step=6,
165
+ details={"from": "attendees", "to": "participants"},
166
+ ),
167
+ ],
168
+ "ground_truth_final_state": {
169
+ "crm.contact_c_2_status": "vip_escalation",
170
+ "mail.sent_count": 1,
171
+ "mail.last_sent_to": "bob@customer.com",
172
+ "mail.last_subject_contains_priority_support": True,
173
+ "calendar.events_count": 1,
174
+ "calendar.last_event_has_both_attendees": True,
175
+ },
176
+ "required_tools": ["mail", "calendar", "crm"],
177
+ },
178
+
179
+ "M2_weekly_report": {
180
+ "difficulty": "medium",
181
+ "max_steps": 10,
182
+ "token_budget": 5000,
183
+ "task_description": (
184
+ "Prepare the weekly sales report: pull the list of active "
185
+ "contacts from CRM, send a summary email to "
186
+ "sales-leads@company.com with subject "
187
+ "'Weekly Active Contacts Report' listing contact names, and "
188
+ "schedule a report review meeting next Monday April 27 at "
189
+ "10:00 AM with the sales team leads sarah@company.com and "
190
+ "mike@company.com."
191
+ ),
192
+ "success_criteria": [
193
+ "Active contacts retrieved from CRM",
194
+ "Summary email sent with 'Weekly' in subject",
195
+ "Meeting scheduled for Monday April 27 10am with both sales leads",
196
+ ],
197
+ "seed_data": {
198
+ "mail": {"messages": []},
199
+ "calendar": {"events": []},
200
+ "crm": {
201
+ "contacts": [
202
+ {"contact_id": "c_1", "customer_email": "x@co.com",
203
+ "name": "X Person", "company": "Co", "status": "active"},
204
+ {"contact_id": "c_2", "customer_email": "y@co.com",
205
+ "name": "Y Person", "company": "Co", "status": "active"},
206
+ {"contact_id": "c_3", "customer_email": "z@co.com",
207
+ "name": "Z Person", "company": "Co", "status": "inactive"},
208
+ ],
209
+ },
210
+ },
211
+ "drift_plan": [
212
+ DriftEvent(
213
+ tool="mail", endpoint="send_message", kind="endpoint_deprecation",
214
+ fires_at_step=2,
215
+ details={"replacement": "messages.send"},
216
+ ),
217
+ DriftEvent(
218
+ tool="crm", endpoint=None, kind="rate_limit_tightening",
219
+ fires_at_step=4,
220
+ details={"limit": 2},
221
+ ),
222
+ ],
223
+ "ground_truth_final_state": {
224
+ "mail.sent_count": 1,
225
+ "mail.last_subject_contains_weekly": True,
226
+ "calendar.events_count": 1,
227
+ "calendar.last_event_has_both_sales_leads": True,
228
+ },
229
+ "required_tools": ["mail", "calendar", "crm"],
230
+ },
231
+
232
+ "M3_event_cleanup": {
233
+ "difficulty": "medium",
234
+ "max_steps": 12,
235
+ "token_budget": 6000,
236
+ "task_description": (
237
+ "End-of-week calendar cleanup: find and cancel the "
238
+ "'Old Planning Sync' event, find and cancel the "
239
+ "'Cancelled Kickoff' event, and create a new 'Friday Wrap-up' "
240
+ "event for Friday April 24 at 4:00 PM with the team lead "
241
+ "alex@company.com attending. Send a notification email to "
242
+ "team-all@company.com with subject "
243
+ "'Calendar Updated — Friday Wrap-up Added' about the changes."
244
+ ),
245
+ "success_criteria": [
246
+ "Old Planning Sync event cancelled or deleted",
247
+ "Cancelled Kickoff event cancelled or deleted",
248
+ "New Friday Wrap-up event created at 4pm with alex attending",
249
+ "Notification email sent with 'Calendar Updated' in subject",
250
+ ],
251
+ "seed_data": {
252
+ "mail": {"messages": []},
253
+ "calendar": {"events": [
254
+ {"event_id": "evt_1", "title": "Old Planning Sync",
255
+ "start": "2026-04-20T10:00:00Z", "end": "2026-04-20T11:00:00Z",
256
+ "attendees": ["alex@company.com"], "status": "confirmed"},
257
+ {"event_id": "evt_2", "title": "Cancelled Kickoff",
258
+ "start": "2026-04-21T14:00:00Z", "end": "2026-04-21T15:00:00Z",
259
+ "attendees": ["alex@company.com"], "status": "confirmed"},
260
+ ]},
261
+ },
262
+ "drift_plan": [
263
+ DriftEvent(
264
+ tool="calendar", endpoint="delete_event", kind="tool_removal",
265
+ fires_at_step=2,
266
+ details={"fallback": "update_event status=cancelled"},
267
+ ),
268
+ DriftEvent(
269
+ tool="calendar", endpoint="create_event", kind="field_rename",
270
+ fires_at_step=5,
271
+ details={"from": "attendees", "to": "participants"},
272
+ ),
273
+ ],
274
+ "ground_truth_final_state": {
275
+ "calendar.evt_1_status": "cancelled",
276
+ "calendar.evt_2_status": "cancelled",
277
+ "calendar.events_count_new_friday_wrapup": 1,
278
+ "mail.sent_count": 1,
279
+ "mail.last_subject_contains_calendar_updated": True,
280
+ },
281
+ "required_tools": ["mail", "calendar"],
282
+ },
283
  }
server/environment.py CHANGED
@@ -283,6 +283,12 @@ class SchemaShiftEnvironment:
283
  st["mail.last_subject_contains_welcome"] = True
284
  if "all-hands" in subject or "all hands" in subject:
285
  st["mail.last_subject_contains_allhands"] = True
 
 
 
 
 
 
286
  recipients: list[str] = st.get("mail.all_recipients", [])
287
  if sent_to and sent_to not in recipients:
288
  recipients.append(sent_to)
@@ -304,8 +310,29 @@ class SchemaShiftEnvironment:
304
  elif isinstance(a, dict):
305
  emails.append(a.get("email", ""))
306
  st["calendar.last_event_attendees"] = emails
307
- if "priya@company.com" in emails and "alex@company.com" in emails:
 
 
 
 
 
 
 
308
  st["calendar.last_event_has_both_attendees"] = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
  # CRM ─────────────────────────────────────────────────────
311
  if tool == "crm":
 
283
  st["mail.last_subject_contains_welcome"] = True
284
  if "all-hands" in subject or "all hands" in subject:
285
  st["mail.last_subject_contains_allhands"] = True
286
+ if "priority support" in subject:
287
+ st["mail.last_subject_contains_priority_support"] = True
288
+ if "weekly" in subject:
289
+ st["mail.last_subject_contains_weekly"] = True
290
+ if "calendar updated" in subject:
291
+ st["mail.last_subject_contains_calendar_updated"] = True
292
  recipients: list[str] = st.get("mail.all_recipients", [])
293
  if sent_to and sent_to not in recipients:
294
  recipients.append(sent_to)
 
310
  elif isinstance(a, dict):
311
  emails.append(a.get("email", ""))
312
  st["calendar.last_event_attendees"] = emails
313
+ # Recognised attendee pairs (E1 + M1 share this key by design).
314
+ priya_alex = (
315
+ "priya@company.com" in emails and "alex@company.com" in emails
316
+ )
317
+ bob_alex = (
318
+ "bob@customer.com" in emails and "alex@company.com" in emails
319
+ )
320
+ if priya_alex or bob_alex:
321
  st["calendar.last_event_has_both_attendees"] = True
322
+ if "sarah@company.com" in emails and "mike@company.com" in emails:
323
+ st["calendar.last_event_has_both_sales_leads"] = True
324
+ # M3: Friday Wrap-up event counter
325
+ title = str(body.get("title") or params.get("title") or "").lower()
326
+ if "friday wrap-up" in title:
327
+ st["calendar.events_count_new_friday_wrapup"] = (
328
+ st.get("calendar.events_count_new_friday_wrapup", 0) + 1
329
+ )
330
+ elif endpoint == "update_event":
331
+ # M3: track per-event status transitions (cancellations)
332
+ event_id = params.get("event_id", "")
333
+ status = params.get("status")
334
+ if event_id and status:
335
+ st[f"calendar.{event_id}_status"] = status
336
 
337
  # CRM ─────────────────────────────────────────────────────
338
  if tool == "crm":
tests/test_graders.py CHANGED
@@ -126,6 +126,77 @@ def test_adaptation_rubric_success() -> None:
126
  assert details["opportunities"] == 1
127
 
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  def test_adaptation_rubric_no_post_drift_calls() -> None:
130
  drift = DriftEvent(
131
  tool="calendar", endpoint="create_event", kind="field_rename",
 
126
  assert details["opportunities"] == 1
127
 
128
 
129
+ def test_adaptation_rubric_multi_drift_same_tool() -> None:
130
+ """M3-style stress test: two drifts on the same tool (calendar).
131
+
132
+ History:
133
+ step 2 — call_tool calendar.delete_event → 410 (post-Drift-A tool_removal)
134
+ step 5 — call_tool calendar.create_event with attendees → 400 (post-Drift-B field_rename)
135
+ step 7 — retry_with_variant calendar.create_event with participants → 200 success
136
+
137
+ Expected rubric behavior (per Phase 5 judgment call #2):
138
+ - Drift A (fires_at_step=2): first post-drift calendar call = step 5 (failed). opp=1, adapted=0.
139
+ - Drift B (fires_at_step=5): first post-drift calendar call = step 7 (succeeded). opp=1, adapted=1.
140
+ - Score = 1/2 = 0.5.
141
+
142
+ Documents intentional denominator behavior: partial credit for partial adaptation.
143
+ Dense step_shaping (+0.20 for successful retry after failure) catches the step 7
144
+ recovery independently, so the rubric staying conservative is acceptable.
145
+ """
146
+ drifts = [
147
+ DriftEvent(
148
+ tool="calendar", endpoint="delete_event", kind="tool_removal",
149
+ fires_at_step=2, details={}, detected_by_agent=True,
150
+ ),
151
+ DriftEvent(
152
+ tool="calendar", endpoint="create_event", kind="field_rename",
153
+ fires_at_step=5, details={}, detected_by_agent=True,
154
+ ),
155
+ ]
156
+ history = [
157
+ HistoryStep(
158
+ step=2,
159
+ action=Action(
160
+ type="call_tool",
161
+ tool_call=ToolCallParams(
162
+ tool="calendar", endpoint="delete_event",
163
+ params={"event_id": "evt_2"},
164
+ ),
165
+ ),
166
+ response=ToolResponse(ok=False, status=410, error="removed"),
167
+ ),
168
+ HistoryStep(
169
+ step=5,
170
+ action=Action(
171
+ type="call_tool",
172
+ tool_call=ToolCallParams(
173
+ tool="calendar", endpoint="create_event",
174
+ params={"title": "x", "start": "t1", "end": "t2",
175
+ "attendees": ["a@x.com"]},
176
+ ),
177
+ ),
178
+ response=ToolResponse(ok=False, status=400, error="missing required"),
179
+ ),
180
+ HistoryStep(
181
+ step=7,
182
+ action=Action(
183
+ type="retry_with_variant",
184
+ retry=RetryParams(
185
+ tool="calendar", endpoint="create_event",
186
+ params={"title": "x", "start": "t1", "end": "t2",
187
+ "participants": [{"email": "a@x.com", "role": "required"}]},
188
+ ),
189
+ ),
190
+ response=ToolResponse(ok=True, status=200, body={"event_id": "evt_3"}),
191
+ ),
192
+ ]
193
+ s = _state_with(step=7, drift_plan=drifts, history=history)
194
+ _, val, details = AdaptationRubric().score(s)
195
+ assert val == 0.5, f"Expected 0.5, got {val}"
196
+ assert details["adapted"] == 1
197
+ assert details["opportunities"] == 2
198
+
199
+
200
  def test_adaptation_rubric_no_post_drift_calls() -> None:
201
  drift = DriftEvent(
202
  tool="calendar", endpoint="create_event", kind="field_rename",
tests/test_scenarios.py CHANGED
@@ -18,14 +18,44 @@ REQUIRED_KEYS = {
18
  }
19
 
20
 
21
- def test_all_three_scenarios_present() -> None:
22
  assert set(SCENARIOS.keys()) == {
23
  "E1_onboard_new_hire",
24
  "E2_meeting_invite_blast",
25
  "E3_customer_lookup",
 
 
 
26
  }
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def test_each_scenario_has_required_fields() -> None:
30
  for name, sc in SCENARIOS.items():
31
  missing = REQUIRED_KEYS - set(sc.keys())
 
18
  }
19
 
20
 
21
+ def test_all_scenarios_present() -> None:
22
  assert set(SCENARIOS.keys()) == {
23
  "E1_onboard_new_hire",
24
  "E2_meeting_invite_blast",
25
  "E3_customer_lookup",
26
+ "M1_customer_escalation",
27
+ "M2_weekly_report",
28
+ "M3_event_cleanup",
29
  }
30
 
31
 
32
+ def test_medium_scenarios_present() -> None:
33
+ for task_id in ("M1_customer_escalation", "M2_weekly_report", "M3_event_cleanup"):
34
+ assert task_id in SCENARIOS, f"{task_id} missing from SCENARIOS"
35
+ assert SCENARIOS[task_id]["difficulty"] == "medium"
36
+
37
+
38
+ def test_medium_scenarios_multi_drift() -> None:
39
+ for task_id in ("M1_customer_escalation", "M2_weekly_report", "M3_event_cleanup"):
40
+ plan = SCENARIOS[task_id]["drift_plan"]
41
+ assert len(plan) == 2, f"{task_id}: expected 2 drifts, got {len(plan)}"
42
+
43
+
44
+ def test_m3_same_tool_multi_drift() -> None:
45
+ """M3 is the judgment-call-#2 stress test: both drifts target calendar."""
46
+ plan = SCENARIOS["M3_event_cleanup"]["drift_plan"]
47
+ tools = [d.tool for d in plan]
48
+ assert tools == ["calendar", "calendar"], (
49
+ f"M3 drifts must both target calendar, got {tools}"
50
+ )
51
+
52
+
53
+ def test_medium_required_tools() -> None:
54
+ assert SCENARIOS["M1_customer_escalation"]["required_tools"] == ["mail", "calendar", "crm"]
55
+ assert SCENARIOS["M2_weekly_report"]["required_tools"] == ["mail", "calendar", "crm"]
56
+ assert SCENARIOS["M3_event_cleanup"]["required_tools"] == ["mail", "calendar"]
57
+
58
+
59
  def test_each_scenario_has_required_fields() -> None:
60
  for name, sc in SCENARIOS.items():
61
  missing = REQUIRED_KEYS - set(sc.keys())
tests/test_server.py CHANGED
@@ -38,15 +38,18 @@ def test_tasks_endpoint(client) -> None:
38
  r = client.get("/tasks")
39
  assert r.status_code == 200
40
  body = r.json()
41
- assert body["count"] == 3
42
  task_ids = {t["task_id"] for t in body["tasks"]}
43
  assert task_ids == {
44
  "E1_onboard_new_hire",
45
  "E2_meeting_invite_blast",
46
  "E3_customer_lookup",
 
 
 
47
  }
48
  for t in body["tasks"]:
49
- assert t["difficulty"] == "easy"
50
  assert isinstance(t["required_tools"], list)
51
 
52
 
 
38
  r = client.get("/tasks")
39
  assert r.status_code == 200
40
  body = r.json()
41
+ assert body["count"] == 6
42
  task_ids = {t["task_id"] for t in body["tasks"]}
43
  assert task_ids == {
44
  "E1_onboard_new_hire",
45
  "E2_meeting_invite_blast",
46
  "E3_customer_lookup",
47
+ "M1_customer_escalation",
48
+ "M2_weekly_report",
49
+ "M3_event_cleanup",
50
  }
51
  for t in body["tasks"]:
52
+ assert t["difficulty"] in ("easy", "medium")
53
  assert isinstance(t["required_tools"], list)
54
 
55