Spaces:
Sleeping
Sleeping
Phase 11: medium scenarios M1/M2/M3 + AdaptationRubric multi-drift verification
Browse files- eval_results/naive_heuristic_20260421_233115.json +132 -0
- eval_results/policy_aware_heuristic_20260421_233136.json +132 -0
- eval_results/policy_aware_heuristic_20260422_000422.json +132 -0
- eval_results/policy_aware_heuristic_20260422_000529.json +132 -0
- scenarios.py +159 -0
- server/environment.py +28 -1
- tests/test_graders.py +71 -0
- tests/test_scenarios.py +31 -1
- tests/test_server.py +5 -2
eval_results/naive_heuristic_20260421_233115.json
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"baseline": "naive_heuristic",
|
| 3 |
+
"timestamp": "20260421_233115",
|
| 4 |
+
"results": [
|
| 5 |
+
{
|
| 6 |
+
"task_id": "E1_onboard_new_hire",
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"completion": 0.0,
|
| 9 |
+
"drift_detection": 0.0,
|
| 10 |
+
"adaptation": 0.0,
|
| 11 |
+
"efficiency": 0.8125,
|
| 12 |
+
"shaped_total": 0.0,
|
| 13 |
+
"cumulative_reward": 0.221875,
|
| 14 |
+
"binary": 0.0,
|
| 15 |
+
"steps_used": 3,
|
| 16 |
+
"final_action_type": "complete_task",
|
| 17 |
+
"error": null
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"task_id": "E1_onboard_new_hire",
|
| 21 |
+
"seed": 1,
|
| 22 |
+
"completion": 0.0,
|
| 23 |
+
"drift_detection": 0.0,
|
| 24 |
+
"adaptation": 0.0,
|
| 25 |
+
"efficiency": 0.8125,
|
| 26 |
+
"shaped_total": 0.0,
|
| 27 |
+
"cumulative_reward": 0.221875,
|
| 28 |
+
"binary": 0.0,
|
| 29 |
+
"steps_used": 3,
|
| 30 |
+
"final_action_type": "complete_task",
|
| 31 |
+
"error": null
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"task_id": "E1_onboard_new_hire",
|
| 35 |
+
"seed": 2,
|
| 36 |
+
"completion": 0.0,
|
| 37 |
+
"drift_detection": 0.0,
|
| 38 |
+
"adaptation": 0.0,
|
| 39 |
+
"efficiency": 0.8125,
|
| 40 |
+
"shaped_total": 0.0,
|
| 41 |
+
"cumulative_reward": 0.221875,
|
| 42 |
+
"binary": 0.0,
|
| 43 |
+
"steps_used": 3,
|
| 44 |
+
"final_action_type": "complete_task",
|
| 45 |
+
"error": null
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"task_id": "E2_meeting_invite_blast",
|
| 49 |
+
"seed": 0,
|
| 50 |
+
"completion": 0.0,
|
| 51 |
+
"drift_detection": 0.0,
|
| 52 |
+
"adaptation": 0.0,
|
| 53 |
+
"efficiency": 0.75,
|
| 54 |
+
"shaped_total": 0.0,
|
| 55 |
+
"cumulative_reward": 0.21250000000000002,
|
| 56 |
+
"binary": 0.0,
|
| 57 |
+
"steps_used": 3,
|
| 58 |
+
"final_action_type": "complete_task",
|
| 59 |
+
"error": null
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"task_id": "E2_meeting_invite_blast",
|
| 63 |
+
"seed": 1,
|
| 64 |
+
"completion": 0.0,
|
| 65 |
+
"drift_detection": 0.0,
|
| 66 |
+
"adaptation": 0.0,
|
| 67 |
+
"efficiency": 0.75,
|
| 68 |
+
"shaped_total": 0.0,
|
| 69 |
+
"cumulative_reward": 0.21250000000000002,
|
| 70 |
+
"binary": 0.0,
|
| 71 |
+
"steps_used": 3,
|
| 72 |
+
"final_action_type": "complete_task",
|
| 73 |
+
"error": null
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"task_id": "E2_meeting_invite_blast",
|
| 77 |
+
"seed": 2,
|
| 78 |
+
"completion": 0.0,
|
| 79 |
+
"drift_detection": 0.0,
|
| 80 |
+
"adaptation": 0.0,
|
| 81 |
+
"efficiency": 0.75,
|
| 82 |
+
"shaped_total": 0.0,
|
| 83 |
+
"cumulative_reward": 0.21250000000000002,
|
| 84 |
+
"binary": 0.0,
|
| 85 |
+
"steps_used": 3,
|
| 86 |
+
"final_action_type": "complete_task",
|
| 87 |
+
"error": null
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"task_id": "E3_customer_lookup",
|
| 91 |
+
"seed": 0,
|
| 92 |
+
"completion": 0.0,
|
| 93 |
+
"drift_detection": 0.0,
|
| 94 |
+
"adaptation": 0.0,
|
| 95 |
+
"efficiency": 0.8125,
|
| 96 |
+
"shaped_total": 0.0,
|
| 97 |
+
"cumulative_reward": 0.271875,
|
| 98 |
+
"binary": 0.0,
|
| 99 |
+
"steps_used": 3,
|
| 100 |
+
"final_action_type": "complete_task",
|
| 101 |
+
"error": null
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"task_id": "E3_customer_lookup",
|
| 105 |
+
"seed": 1,
|
| 106 |
+
"completion": 0.0,
|
| 107 |
+
"drift_detection": 0.0,
|
| 108 |
+
"adaptation": 0.0,
|
| 109 |
+
"efficiency": 0.8125,
|
| 110 |
+
"shaped_total": 0.0,
|
| 111 |
+
"cumulative_reward": 0.271875,
|
| 112 |
+
"binary": 0.0,
|
| 113 |
+
"steps_used": 3,
|
| 114 |
+
"final_action_type": "complete_task",
|
| 115 |
+
"error": null
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"task_id": "E3_customer_lookup",
|
| 119 |
+
"seed": 2,
|
| 120 |
+
"completion": 0.0,
|
| 121 |
+
"drift_detection": 0.0,
|
| 122 |
+
"adaptation": 0.0,
|
| 123 |
+
"efficiency": 0.8125,
|
| 124 |
+
"shaped_total": 0.0,
|
| 125 |
+
"cumulative_reward": 0.271875,
|
| 126 |
+
"binary": 0.0,
|
| 127 |
+
"steps_used": 3,
|
| 128 |
+
"final_action_type": "complete_task",
|
| 129 |
+
"error": null
|
| 130 |
+
}
|
| 131 |
+
]
|
| 132 |
+
}
|
eval_results/policy_aware_heuristic_20260421_233136.json
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"baseline": "policy_aware_heuristic",
|
| 3 |
+
"timestamp": "20260421_233136",
|
| 4 |
+
"results": [
|
| 5 |
+
{
|
| 6 |
+
"task_id": "E1_onboard_new_hire",
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"completion": 1.0,
|
| 9 |
+
"drift_detection": 0.0,
|
| 10 |
+
"adaptation": 0.0,
|
| 11 |
+
"efficiency": 0.8125,
|
| 12 |
+
"shaped_total": 0.521875,
|
| 13 |
+
"cumulative_reward": 1.4337499999999999,
|
| 14 |
+
"binary": 1.0,
|
| 15 |
+
"steps_used": 3,
|
| 16 |
+
"final_action_type": "complete_task",
|
| 17 |
+
"error": null
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"task_id": "E1_onboard_new_hire",
|
| 21 |
+
"seed": 1,
|
| 22 |
+
"completion": 1.0,
|
| 23 |
+
"drift_detection": 0.0,
|
| 24 |
+
"adaptation": 0.0,
|
| 25 |
+
"efficiency": 0.8125,
|
| 26 |
+
"shaped_total": 0.521875,
|
| 27 |
+
"cumulative_reward": 1.4337499999999999,
|
| 28 |
+
"binary": 1.0,
|
| 29 |
+
"steps_used": 3,
|
| 30 |
+
"final_action_type": "complete_task",
|
| 31 |
+
"error": null
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"task_id": "E1_onboard_new_hire",
|
| 35 |
+
"seed": 2,
|
| 36 |
+
"completion": 1.0,
|
| 37 |
+
"drift_detection": 0.0,
|
| 38 |
+
"adaptation": 0.0,
|
| 39 |
+
"efficiency": 0.8125,
|
| 40 |
+
"shaped_total": 0.521875,
|
| 41 |
+
"cumulative_reward": 1.4337499999999999,
|
| 42 |
+
"binary": 1.0,
|
| 43 |
+
"steps_used": 3,
|
| 44 |
+
"final_action_type": "complete_task",
|
| 45 |
+
"error": null
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"task_id": "E2_meeting_invite_blast",
|
| 49 |
+
"seed": 0,
|
| 50 |
+
"completion": 0.0,
|
| 51 |
+
"drift_detection": 1.0,
|
| 52 |
+
"adaptation": 1.0,
|
| 53 |
+
"efficiency": 0.5833333333333333,
|
| 54 |
+
"shaped_total": 0.0,
|
| 55 |
+
"cumulative_reward": 1.425,
|
| 56 |
+
"binary": 0.0,
|
| 57 |
+
"steps_used": 5,
|
| 58 |
+
"final_action_type": "complete_task",
|
| 59 |
+
"error": null
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"task_id": "E2_meeting_invite_blast",
|
| 63 |
+
"seed": 1,
|
| 64 |
+
"completion": 0.0,
|
| 65 |
+
"drift_detection": 1.0,
|
| 66 |
+
"adaptation": 1.0,
|
| 67 |
+
"efficiency": 0.5833333333333333,
|
| 68 |
+
"shaped_total": 0.0,
|
| 69 |
+
"cumulative_reward": 1.425,
|
| 70 |
+
"binary": 0.0,
|
| 71 |
+
"steps_used": 5,
|
| 72 |
+
"final_action_type": "complete_task",
|
| 73 |
+
"error": null
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"task_id": "E2_meeting_invite_blast",
|
| 77 |
+
"seed": 2,
|
| 78 |
+
"completion": 0.0,
|
| 79 |
+
"drift_detection": 1.0,
|
| 80 |
+
"adaptation": 1.0,
|
| 81 |
+
"efficiency": 0.5833333333333333,
|
| 82 |
+
"shaped_total": 0.0,
|
| 83 |
+
"cumulative_reward": 1.425,
|
| 84 |
+
"binary": 0.0,
|
| 85 |
+
"steps_used": 5,
|
| 86 |
+
"final_action_type": "complete_task",
|
| 87 |
+
"error": null
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"task_id": "E3_customer_lookup",
|
| 91 |
+
"seed": 0,
|
| 92 |
+
"completion": 1.0,
|
| 93 |
+
"drift_detection": 0.0,
|
| 94 |
+
"adaptation": 0.0,
|
| 95 |
+
"efficiency": 0.8125,
|
| 96 |
+
"shaped_total": 0.521875,
|
| 97 |
+
"cumulative_reward": 0.99375,
|
| 98 |
+
"binary": 1.0,
|
| 99 |
+
"steps_used": 3,
|
| 100 |
+
"final_action_type": "complete_task",
|
| 101 |
+
"error": null
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"task_id": "E3_customer_lookup",
|
| 105 |
+
"seed": 1,
|
| 106 |
+
"completion": 1.0,
|
| 107 |
+
"drift_detection": 0.0,
|
| 108 |
+
"adaptation": 0.0,
|
| 109 |
+
"efficiency": 0.8125,
|
| 110 |
+
"shaped_total": 0.521875,
|
| 111 |
+
"cumulative_reward": 0.99375,
|
| 112 |
+
"binary": 1.0,
|
| 113 |
+
"steps_used": 3,
|
| 114 |
+
"final_action_type": "complete_task",
|
| 115 |
+
"error": null
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"task_id": "E3_customer_lookup",
|
| 119 |
+
"seed": 2,
|
| 120 |
+
"completion": 1.0,
|
| 121 |
+
"drift_detection": 0.0,
|
| 122 |
+
"adaptation": 0.0,
|
| 123 |
+
"efficiency": 0.8125,
|
| 124 |
+
"shaped_total": 0.521875,
|
| 125 |
+
"cumulative_reward": 0.99375,
|
| 126 |
+
"binary": 1.0,
|
| 127 |
+
"steps_used": 3,
|
| 128 |
+
"final_action_type": "complete_task",
|
| 129 |
+
"error": null
|
| 130 |
+
}
|
| 131 |
+
]
|
| 132 |
+
}
|
eval_results/policy_aware_heuristic_20260422_000422.json
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"baseline": "policy_aware_heuristic",
|
| 3 |
+
"timestamp": "20260422_000422",
|
| 4 |
+
"results": [
|
| 5 |
+
{
|
| 6 |
+
"task_id": "M1_customer_escalation",
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"completion": 0.5,
|
| 9 |
+
"drift_detection": 0.5,
|
| 10 |
+
"adaptation": 0.0,
|
| 11 |
+
"efficiency": 0.7083333333333333,
|
| 12 |
+
"shaped_total": 0.0,
|
| 13 |
+
"cumulative_reward": 2.5104166666666665,
|
| 14 |
+
"binary": 0.0,
|
| 15 |
+
"steps_used": 7,
|
| 16 |
+
"final_action_type": "complete_task",
|
| 17 |
+
"error": null
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"task_id": "M1_customer_escalation",
|
| 21 |
+
"seed": 1,
|
| 22 |
+
"completion": 0.5,
|
| 23 |
+
"drift_detection": 0.5,
|
| 24 |
+
"adaptation": 0.0,
|
| 25 |
+
"efficiency": 0.7083333333333333,
|
| 26 |
+
"shaped_total": 0.0,
|
| 27 |
+
"cumulative_reward": 2.5104166666666665,
|
| 28 |
+
"binary": 0.0,
|
| 29 |
+
"steps_used": 7,
|
| 30 |
+
"final_action_type": "complete_task",
|
| 31 |
+
"error": null
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"task_id": "M1_customer_escalation",
|
| 35 |
+
"seed": 2,
|
| 36 |
+
"completion": 0.5,
|
| 37 |
+
"drift_detection": 0.5,
|
| 38 |
+
"adaptation": 0.0,
|
| 39 |
+
"efficiency": 0.7083333333333333,
|
| 40 |
+
"shaped_total": 0.0,
|
| 41 |
+
"cumulative_reward": 2.5104166666666665,
|
| 42 |
+
"binary": 0.0,
|
| 43 |
+
"steps_used": 7,
|
| 44 |
+
"final_action_type": "complete_task",
|
| 45 |
+
"error": null
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"task_id": "M2_weekly_report",
|
| 49 |
+
"seed": 0,
|
| 50 |
+
"completion": 0.25,
|
| 51 |
+
"drift_detection": 0.0,
|
| 52 |
+
"adaptation": 1.0,
|
| 53 |
+
"efficiency": 0.5,
|
| 54 |
+
"shaped_total": 0.0,
|
| 55 |
+
"cumulative_reward": 3.0125,
|
| 56 |
+
"binary": 0.0,
|
| 57 |
+
"steps_used": 10,
|
| 58 |
+
"final_action_type": "complete_task",
|
| 59 |
+
"error": null
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"task_id": "M2_weekly_report",
|
| 63 |
+
"seed": 1,
|
| 64 |
+
"completion": 0.25,
|
| 65 |
+
"drift_detection": 0.0,
|
| 66 |
+
"adaptation": 1.0,
|
| 67 |
+
"efficiency": 0.5,
|
| 68 |
+
"shaped_total": 0.0,
|
| 69 |
+
"cumulative_reward": 3.0125,
|
| 70 |
+
"binary": 0.0,
|
| 71 |
+
"steps_used": 10,
|
| 72 |
+
"final_action_type": "complete_task",
|
| 73 |
+
"error": null
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"task_id": "M2_weekly_report",
|
| 77 |
+
"seed": 2,
|
| 78 |
+
"completion": 0.25,
|
| 79 |
+
"drift_detection": 0.0,
|
| 80 |
+
"adaptation": 1.0,
|
| 81 |
+
"efficiency": 0.5,
|
| 82 |
+
"shaped_total": 0.0,
|
| 83 |
+
"cumulative_reward": 3.0125,
|
| 84 |
+
"binary": 0.0,
|
| 85 |
+
"steps_used": 10,
|
| 86 |
+
"final_action_type": "complete_task",
|
| 87 |
+
"error": null
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"task_id": "M3_event_cleanup",
|
| 91 |
+
"seed": 0,
|
| 92 |
+
"completion": 0.2,
|
| 93 |
+
"drift_detection": 0.0,
|
| 94 |
+
"adaptation": 0.0,
|
| 95 |
+
"efficiency": 0.875,
|
| 96 |
+
"shaped_total": 0.0,
|
| 97 |
+
"cumulative_reward": 0.44125000000000003,
|
| 98 |
+
"binary": 0.0,
|
| 99 |
+
"steps_used": 3,
|
| 100 |
+
"final_action_type": "complete_task",
|
| 101 |
+
"error": null
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"task_id": "M3_event_cleanup",
|
| 105 |
+
"seed": 1,
|
| 106 |
+
"completion": 0.2,
|
| 107 |
+
"drift_detection": 0.0,
|
| 108 |
+
"adaptation": 0.0,
|
| 109 |
+
"efficiency": 0.875,
|
| 110 |
+
"shaped_total": 0.0,
|
| 111 |
+
"cumulative_reward": 0.44125000000000003,
|
| 112 |
+
"binary": 0.0,
|
| 113 |
+
"steps_used": 3,
|
| 114 |
+
"final_action_type": "complete_task",
|
| 115 |
+
"error": null
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"task_id": "M3_event_cleanup",
|
| 119 |
+
"seed": 2,
|
| 120 |
+
"completion": 0.2,
|
| 121 |
+
"drift_detection": 0.0,
|
| 122 |
+
"adaptation": 0.0,
|
| 123 |
+
"efficiency": 0.875,
|
| 124 |
+
"shaped_total": 0.0,
|
| 125 |
+
"cumulative_reward": 0.44125000000000003,
|
| 126 |
+
"binary": 0.0,
|
| 127 |
+
"steps_used": 3,
|
| 128 |
+
"final_action_type": "complete_task",
|
| 129 |
+
"error": null
|
| 130 |
+
}
|
| 131 |
+
]
|
| 132 |
+
}
|
eval_results/policy_aware_heuristic_20260422_000529.json
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"baseline": "policy_aware_heuristic",
|
| 3 |
+
"timestamp": "20260422_000529",
|
| 4 |
+
"results": [
|
| 5 |
+
{
|
| 6 |
+
"task_id": "E1_onboard_new_hire",
|
| 7 |
+
"seed": 0,
|
| 8 |
+
"completion": 1.0,
|
| 9 |
+
"drift_detection": 0.0,
|
| 10 |
+
"adaptation": 0.0,
|
| 11 |
+
"efficiency": 0.8125,
|
| 12 |
+
"shaped_total": 0.521875,
|
| 13 |
+
"cumulative_reward": 1.4337499999999999,
|
| 14 |
+
"binary": 1.0,
|
| 15 |
+
"steps_used": 3,
|
| 16 |
+
"final_action_type": "complete_task",
|
| 17 |
+
"error": null
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"task_id": "E1_onboard_new_hire",
|
| 21 |
+
"seed": 1,
|
| 22 |
+
"completion": 1.0,
|
| 23 |
+
"drift_detection": 0.0,
|
| 24 |
+
"adaptation": 0.0,
|
| 25 |
+
"efficiency": 0.8125,
|
| 26 |
+
"shaped_total": 0.521875,
|
| 27 |
+
"cumulative_reward": 1.4337499999999999,
|
| 28 |
+
"binary": 1.0,
|
| 29 |
+
"steps_used": 3,
|
| 30 |
+
"final_action_type": "complete_task",
|
| 31 |
+
"error": null
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"task_id": "E1_onboard_new_hire",
|
| 35 |
+
"seed": 2,
|
| 36 |
+
"completion": 1.0,
|
| 37 |
+
"drift_detection": 0.0,
|
| 38 |
+
"adaptation": 0.0,
|
| 39 |
+
"efficiency": 0.8125,
|
| 40 |
+
"shaped_total": 0.521875,
|
| 41 |
+
"cumulative_reward": 1.4337499999999999,
|
| 42 |
+
"binary": 1.0,
|
| 43 |
+
"steps_used": 3,
|
| 44 |
+
"final_action_type": "complete_task",
|
| 45 |
+
"error": null
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"task_id": "E2_meeting_invite_blast",
|
| 49 |
+
"seed": 0,
|
| 50 |
+
"completion": 0.0,
|
| 51 |
+
"drift_detection": 1.0,
|
| 52 |
+
"adaptation": 1.0,
|
| 53 |
+
"efficiency": 0.5833333333333333,
|
| 54 |
+
"shaped_total": 0.0,
|
| 55 |
+
"cumulative_reward": 1.425,
|
| 56 |
+
"binary": 0.0,
|
| 57 |
+
"steps_used": 5,
|
| 58 |
+
"final_action_type": "complete_task",
|
| 59 |
+
"error": null
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"task_id": "E2_meeting_invite_blast",
|
| 63 |
+
"seed": 1,
|
| 64 |
+
"completion": 0.0,
|
| 65 |
+
"drift_detection": 1.0,
|
| 66 |
+
"adaptation": 1.0,
|
| 67 |
+
"efficiency": 0.5833333333333333,
|
| 68 |
+
"shaped_total": 0.0,
|
| 69 |
+
"cumulative_reward": 1.425,
|
| 70 |
+
"binary": 0.0,
|
| 71 |
+
"steps_used": 5,
|
| 72 |
+
"final_action_type": "complete_task",
|
| 73 |
+
"error": null
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"task_id": "E2_meeting_invite_blast",
|
| 77 |
+
"seed": 2,
|
| 78 |
+
"completion": 0.0,
|
| 79 |
+
"drift_detection": 1.0,
|
| 80 |
+
"adaptation": 1.0,
|
| 81 |
+
"efficiency": 0.5833333333333333,
|
| 82 |
+
"shaped_total": 0.0,
|
| 83 |
+
"cumulative_reward": 1.425,
|
| 84 |
+
"binary": 0.0,
|
| 85 |
+
"steps_used": 5,
|
| 86 |
+
"final_action_type": "complete_task",
|
| 87 |
+
"error": null
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"task_id": "E3_customer_lookup",
|
| 91 |
+
"seed": 0,
|
| 92 |
+
"completion": 1.0,
|
| 93 |
+
"drift_detection": 0.0,
|
| 94 |
+
"adaptation": 0.0,
|
| 95 |
+
"efficiency": 0.8125,
|
| 96 |
+
"shaped_total": 0.521875,
|
| 97 |
+
"cumulative_reward": 0.99375,
|
| 98 |
+
"binary": 1.0,
|
| 99 |
+
"steps_used": 3,
|
| 100 |
+
"final_action_type": "complete_task",
|
| 101 |
+
"error": null
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"task_id": "E3_customer_lookup",
|
| 105 |
+
"seed": 1,
|
| 106 |
+
"completion": 1.0,
|
| 107 |
+
"drift_detection": 0.0,
|
| 108 |
+
"adaptation": 0.0,
|
| 109 |
+
"efficiency": 0.8125,
|
| 110 |
+
"shaped_total": 0.521875,
|
| 111 |
+
"cumulative_reward": 0.99375,
|
| 112 |
+
"binary": 1.0,
|
| 113 |
+
"steps_used": 3,
|
| 114 |
+
"final_action_type": "complete_task",
|
| 115 |
+
"error": null
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"task_id": "E3_customer_lookup",
|
| 119 |
+
"seed": 2,
|
| 120 |
+
"completion": 1.0,
|
| 121 |
+
"drift_detection": 0.0,
|
| 122 |
+
"adaptation": 0.0,
|
| 123 |
+
"efficiency": 0.8125,
|
| 124 |
+
"shaped_total": 0.521875,
|
| 125 |
+
"cumulative_reward": 0.99375,
|
| 126 |
+
"binary": 1.0,
|
| 127 |
+
"steps_used": 3,
|
| 128 |
+
"final_action_type": "complete_task",
|
| 129 |
+
"error": null
|
| 130 |
+
}
|
| 131 |
+
]
|
| 132 |
+
}
|
scenarios.py
CHANGED
|
@@ -121,4 +121,163 @@ SCENARIOS: dict[str, dict] = {
|
|
| 121 |
},
|
| 122 |
"required_tools": ["crm"],
|
| 123 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
}
|
|
|
|
| 121 |
},
|
| 122 |
"required_tools": ["crm"],
|
| 123 |
},
|
| 124 |
+
|
| 125 |
+
"M1_customer_escalation": {
|
| 126 |
+
"difficulty": "medium",
|
| 127 |
+
"max_steps": 12,
|
| 128 |
+
"token_budget": 6000,
|
| 129 |
+
"task_description": (
|
| 130 |
+
"A VIP customer at bob@customer.com has escalated: their "
|
| 131 |
+
"subscription is about to lapse. Look them up in CRM, update "
|
| 132 |
+
"their status to 'vip_escalation', send them a personalized "
|
| 133 |
+
"retention email from support@company.com with subject "
|
| 134 |
+
"'Priority Support — [Customer Name]', and schedule a 30-minute "
|
| 135 |
+
"check-in call on Friday April 24 at 2:00 PM with both the "
|
| 136 |
+
"customer and the account manager alex@company.com."
|
| 137 |
+
),
|
| 138 |
+
"success_criteria": [
|
| 139 |
+
"Customer contact retrieved with correct company",
|
| 140 |
+
"CRM status updated to vip_escalation",
|
| 141 |
+
"Retention email sent to bob@customer.com with Priority Support subject",
|
| 142 |
+
"Calendar event created for Friday April 24 2pm with both customer and account manager",
|
| 143 |
+
],
|
| 144 |
+
"seed_data": {
|
| 145 |
+
"mail": {"messages": []},
|
| 146 |
+
"calendar": {"events": []},
|
| 147 |
+
"crm": {
|
| 148 |
+
"contacts": [
|
| 149 |
+
{"contact_id": "c_1", "customer_email": "alice@customer.com",
|
| 150 |
+
"name": "Alice Nguyen", "company": "Acme Corp", "status": "active"},
|
| 151 |
+
{"contact_id": "c_2", "customer_email": "bob@customer.com",
|
| 152 |
+
"name": "Bob Taylor", "company": "Globex Industries", "status": "active"},
|
| 153 |
+
],
|
| 154 |
+
},
|
| 155 |
+
},
|
| 156 |
+
"drift_plan": [
|
| 157 |
+
DriftEvent(
|
| 158 |
+
tool="crm", endpoint=None, kind="field_rename",
|
| 159 |
+
fires_at_step=1,
|
| 160 |
+
details={"from": "customer_email", "to": "email_address"},
|
| 161 |
+
),
|
| 162 |
+
DriftEvent(
|
| 163 |
+
tool="calendar", endpoint="create_event", kind="field_rename",
|
| 164 |
+
fires_at_step=6,
|
| 165 |
+
details={"from": "attendees", "to": "participants"},
|
| 166 |
+
),
|
| 167 |
+
],
|
| 168 |
+
"ground_truth_final_state": {
|
| 169 |
+
"crm.contact_c_2_status": "vip_escalation",
|
| 170 |
+
"mail.sent_count": 1,
|
| 171 |
+
"mail.last_sent_to": "bob@customer.com",
|
| 172 |
+
"mail.last_subject_contains_priority_support": True,
|
| 173 |
+
"calendar.events_count": 1,
|
| 174 |
+
"calendar.last_event_has_both_attendees": True,
|
| 175 |
+
},
|
| 176 |
+
"required_tools": ["mail", "calendar", "crm"],
|
| 177 |
+
},
|
| 178 |
+
|
| 179 |
+
"M2_weekly_report": {
|
| 180 |
+
"difficulty": "medium",
|
| 181 |
+
"max_steps": 10,
|
| 182 |
+
"token_budget": 5000,
|
| 183 |
+
"task_description": (
|
| 184 |
+
"Prepare the weekly sales report: pull the list of active "
|
| 185 |
+
"contacts from CRM, send a summary email to "
|
| 186 |
+
"sales-leads@company.com with subject "
|
| 187 |
+
"'Weekly Active Contacts Report' listing contact names, and "
|
| 188 |
+
"schedule a report review meeting next Monday April 27 at "
|
| 189 |
+
"10:00 AM with the sales team leads sarah@company.com and "
|
| 190 |
+
"mike@company.com."
|
| 191 |
+
),
|
| 192 |
+
"success_criteria": [
|
| 193 |
+
"Active contacts retrieved from CRM",
|
| 194 |
+
"Summary email sent with 'Weekly' in subject",
|
| 195 |
+
"Meeting scheduled for Monday April 27 10am with both sales leads",
|
| 196 |
+
],
|
| 197 |
+
"seed_data": {
|
| 198 |
+
"mail": {"messages": []},
|
| 199 |
+
"calendar": {"events": []},
|
| 200 |
+
"crm": {
|
| 201 |
+
"contacts": [
|
| 202 |
+
{"contact_id": "c_1", "customer_email": "x@co.com",
|
| 203 |
+
"name": "X Person", "company": "Co", "status": "active"},
|
| 204 |
+
{"contact_id": "c_2", "customer_email": "y@co.com",
|
| 205 |
+
"name": "Y Person", "company": "Co", "status": "active"},
|
| 206 |
+
{"contact_id": "c_3", "customer_email": "z@co.com",
|
| 207 |
+
"name": "Z Person", "company": "Co", "status": "inactive"},
|
| 208 |
+
],
|
| 209 |
+
},
|
| 210 |
+
},
|
| 211 |
+
"drift_plan": [
|
| 212 |
+
DriftEvent(
|
| 213 |
+
tool="mail", endpoint="send_message", kind="endpoint_deprecation",
|
| 214 |
+
fires_at_step=2,
|
| 215 |
+
details={"replacement": "messages.send"},
|
| 216 |
+
),
|
| 217 |
+
DriftEvent(
|
| 218 |
+
tool="crm", endpoint=None, kind="rate_limit_tightening",
|
| 219 |
+
fires_at_step=4,
|
| 220 |
+
details={"limit": 2},
|
| 221 |
+
),
|
| 222 |
+
],
|
| 223 |
+
"ground_truth_final_state": {
|
| 224 |
+
"mail.sent_count": 1,
|
| 225 |
+
"mail.last_subject_contains_weekly": True,
|
| 226 |
+
"calendar.events_count": 1,
|
| 227 |
+
"calendar.last_event_has_both_sales_leads": True,
|
| 228 |
+
},
|
| 229 |
+
"required_tools": ["mail", "calendar", "crm"],
|
| 230 |
+
},
|
| 231 |
+
|
| 232 |
+
"M3_event_cleanup": {
|
| 233 |
+
"difficulty": "medium",
|
| 234 |
+
"max_steps": 12,
|
| 235 |
+
"token_budget": 6000,
|
| 236 |
+
"task_description": (
|
| 237 |
+
"End-of-week calendar cleanup: find and cancel the "
|
| 238 |
+
"'Old Planning Sync' event, find and cancel the "
|
| 239 |
+
"'Cancelled Kickoff' event, and create a new 'Friday Wrap-up' "
|
| 240 |
+
"event for Friday April 24 at 4:00 PM with the team lead "
|
| 241 |
+
"alex@company.com attending. Send a notification email to "
|
| 242 |
+
"team-all@company.com with subject "
|
| 243 |
+
"'Calendar Updated — Friday Wrap-up Added' about the changes."
|
| 244 |
+
),
|
| 245 |
+
"success_criteria": [
|
| 246 |
+
"Old Planning Sync event cancelled or deleted",
|
| 247 |
+
"Cancelled Kickoff event cancelled or deleted",
|
| 248 |
+
"New Friday Wrap-up event created at 4pm with alex attending",
|
| 249 |
+
"Notification email sent with 'Calendar Updated' in subject",
|
| 250 |
+
],
|
| 251 |
+
"seed_data": {
|
| 252 |
+
"mail": {"messages": []},
|
| 253 |
+
"calendar": {"events": [
|
| 254 |
+
{"event_id": "evt_1", "title": "Old Planning Sync",
|
| 255 |
+
"start": "2026-04-20T10:00:00Z", "end": "2026-04-20T11:00:00Z",
|
| 256 |
+
"attendees": ["alex@company.com"], "status": "confirmed"},
|
| 257 |
+
{"event_id": "evt_2", "title": "Cancelled Kickoff",
|
| 258 |
+
"start": "2026-04-21T14:00:00Z", "end": "2026-04-21T15:00:00Z",
|
| 259 |
+
"attendees": ["alex@company.com"], "status": "confirmed"},
|
| 260 |
+
]},
|
| 261 |
+
},
|
| 262 |
+
"drift_plan": [
|
| 263 |
+
DriftEvent(
|
| 264 |
+
tool="calendar", endpoint="delete_event", kind="tool_removal",
|
| 265 |
+
fires_at_step=2,
|
| 266 |
+
details={"fallback": "update_event status=cancelled"},
|
| 267 |
+
),
|
| 268 |
+
DriftEvent(
|
| 269 |
+
tool="calendar", endpoint="create_event", kind="field_rename",
|
| 270 |
+
fires_at_step=5,
|
| 271 |
+
details={"from": "attendees", "to": "participants"},
|
| 272 |
+
),
|
| 273 |
+
],
|
| 274 |
+
"ground_truth_final_state": {
|
| 275 |
+
"calendar.evt_1_status": "cancelled",
|
| 276 |
+
"calendar.evt_2_status": "cancelled",
|
| 277 |
+
"calendar.events_count_new_friday_wrapup": 1,
|
| 278 |
+
"mail.sent_count": 1,
|
| 279 |
+
"mail.last_subject_contains_calendar_updated": True,
|
| 280 |
+
},
|
| 281 |
+
"required_tools": ["mail", "calendar"],
|
| 282 |
+
},
|
| 283 |
}
|
server/environment.py
CHANGED
|
@@ -283,6 +283,12 @@ class SchemaShiftEnvironment:
|
|
| 283 |
st["mail.last_subject_contains_welcome"] = True
|
| 284 |
if "all-hands" in subject or "all hands" in subject:
|
| 285 |
st["mail.last_subject_contains_allhands"] = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
recipients: list[str] = st.get("mail.all_recipients", [])
|
| 287 |
if sent_to and sent_to not in recipients:
|
| 288 |
recipients.append(sent_to)
|
|
@@ -304,8 +310,29 @@ class SchemaShiftEnvironment:
|
|
| 304 |
elif isinstance(a, dict):
|
| 305 |
emails.append(a.get("email", ""))
|
| 306 |
st["calendar.last_event_attendees"] = emails
|
| 307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
st["calendar.last_event_has_both_attendees"] = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
# CRM ─────────────────────────────────────────────────────
|
| 311 |
if tool == "crm":
|
|
|
|
| 283 |
st["mail.last_subject_contains_welcome"] = True
|
| 284 |
if "all-hands" in subject or "all hands" in subject:
|
| 285 |
st["mail.last_subject_contains_allhands"] = True
|
| 286 |
+
if "priority support" in subject:
|
| 287 |
+
st["mail.last_subject_contains_priority_support"] = True
|
| 288 |
+
if "weekly" in subject:
|
| 289 |
+
st["mail.last_subject_contains_weekly"] = True
|
| 290 |
+
if "calendar updated" in subject:
|
| 291 |
+
st["mail.last_subject_contains_calendar_updated"] = True
|
| 292 |
recipients: list[str] = st.get("mail.all_recipients", [])
|
| 293 |
if sent_to and sent_to not in recipients:
|
| 294 |
recipients.append(sent_to)
|
|
|
|
| 310 |
elif isinstance(a, dict):
|
| 311 |
emails.append(a.get("email", ""))
|
| 312 |
st["calendar.last_event_attendees"] = emails
|
| 313 |
+
# Recognised attendee pairs (E1 + M1 share this key by design).
|
| 314 |
+
priya_alex = (
|
| 315 |
+
"priya@company.com" in emails and "alex@company.com" in emails
|
| 316 |
+
)
|
| 317 |
+
bob_alex = (
|
| 318 |
+
"bob@customer.com" in emails and "alex@company.com" in emails
|
| 319 |
+
)
|
| 320 |
+
if priya_alex or bob_alex:
|
| 321 |
st["calendar.last_event_has_both_attendees"] = True
|
| 322 |
+
if "sarah@company.com" in emails and "mike@company.com" in emails:
|
| 323 |
+
st["calendar.last_event_has_both_sales_leads"] = True
|
| 324 |
+
# M3: Friday Wrap-up event counter
|
| 325 |
+
title = str(body.get("title") or params.get("title") or "").lower()
|
| 326 |
+
if "friday wrap-up" in title:
|
| 327 |
+
st["calendar.events_count_new_friday_wrapup"] = (
|
| 328 |
+
st.get("calendar.events_count_new_friday_wrapup", 0) + 1
|
| 329 |
+
)
|
| 330 |
+
elif endpoint == "update_event":
|
| 331 |
+
# M3: track per-event status transitions (cancellations)
|
| 332 |
+
event_id = params.get("event_id", "")
|
| 333 |
+
status = params.get("status")
|
| 334 |
+
if event_id and status:
|
| 335 |
+
st[f"calendar.{event_id}_status"] = status
|
| 336 |
|
| 337 |
# CRM ─────────────────────────────────────────────────────
|
| 338 |
if tool == "crm":
|
tests/test_graders.py
CHANGED
|
@@ -126,6 +126,77 @@ def test_adaptation_rubric_success() -> None:
|
|
| 126 |
assert details["opportunities"] == 1
|
| 127 |
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
def test_adaptation_rubric_no_post_drift_calls() -> None:
|
| 130 |
drift = DriftEvent(
|
| 131 |
tool="calendar", endpoint="create_event", kind="field_rename",
|
|
|
|
| 126 |
assert details["opportunities"] == 1
|
| 127 |
|
| 128 |
|
| 129 |
+
def test_adaptation_rubric_multi_drift_same_tool() -> None:
    """M3-style stress test: two drifts land on the same tool (calendar).

    Recorded history:
      step 2 — call_tool calendar.delete_event -> 410 (tool_removal, Drift A)
      step 5 — call_tool calendar.create_event with ``attendees`` -> 400
               (field_rename, Drift B)
      step 7 — retry_with_variant calendar.create_event with ``participants``
               -> 200 success

    Expected rubric behavior (per Phase 5 judgment call #2):
      - Drift A (fires_at_step=2): the first post-drift calendar call is taken
        to be step 5, which failed -> opportunities=1, adapted=0.
      - Drift B (fires_at_step=5): the first post-drift calendar call is
        step 7, which succeeded -> opportunities=1, adapted=1.
      - Score = 1/2 = 0.5.

    This pins the intentional denominator behavior: partial adaptation earns
    partial credit. The dense step_shaping reward (+0.20 for a successful
    retry after a failure) picks up the step 7 recovery independently, so a
    conservative rubric score is acceptable here.
    """
    # Drift plan: both events target the calendar tool.
    removal_drift = DriftEvent(
        tool="calendar", endpoint="delete_event", kind="tool_removal",
        fires_at_step=2, details={}, detected_by_agent=True,
    )
    rename_drift = DriftEvent(
        tool="calendar", endpoint="create_event", kind="field_rename",
        fires_at_step=5, details={}, detected_by_agent=True,
    )

    # Fields shared by the failing create_event call and its retry.
    event_fields = {"title": "x", "start": "t1", "end": "t2"}

    failed_delete = HistoryStep(
        step=2,
        action=Action(
            type="call_tool",
            tool_call=ToolCallParams(
                tool="calendar", endpoint="delete_event",
                params={"event_id": "evt_2"},
            ),
        ),
        response=ToolResponse(ok=False, status=410, error="removed"),
    )
    failed_create = HistoryStep(
        step=5,
        action=Action(
            type="call_tool",
            tool_call=ToolCallParams(
                tool="calendar", endpoint="create_event",
                params={**event_fields, "attendees": ["a@x.com"]},
            ),
        ),
        response=ToolResponse(ok=False, status=400, error="missing required"),
    )
    recovered_create = HistoryStep(
        step=7,
        action=Action(
            type="retry_with_variant",
            retry=RetryParams(
                tool="calendar", endpoint="create_event",
                params={
                    **event_fields,
                    "participants": [{"email": "a@x.com", "role": "required"}],
                },
            ),
        ),
        response=ToolResponse(ok=True, status=200, body={"event_id": "evt_3"}),
    )

    state = _state_with(
        step=7,
        drift_plan=[removal_drift, rename_drift],
        history=[failed_delete, failed_create, recovered_create],
    )
    _, val, details = AdaptationRubric().score(state)

    assert val == 0.5, f"Expected 0.5, got {val}"
    assert details["adapted"] == 1
    assert details["opportunities"] == 2
|
| 198 |
+
|
| 199 |
+
|
| 200 |
def test_adaptation_rubric_no_post_drift_calls() -> None:
|
| 201 |
drift = DriftEvent(
|
| 202 |
tool="calendar", endpoint="create_event", kind="field_rename",
|
tests/test_scenarios.py
CHANGED
|
@@ -18,14 +18,44 @@ REQUIRED_KEYS = {
|
|
| 18 |
}
|
| 19 |
|
| 20 |
|
| 21 |
-
def
|
| 22 |
assert set(SCENARIOS.keys()) == {
|
| 23 |
"E1_onboard_new_hire",
|
| 24 |
"E2_meeting_invite_blast",
|
| 25 |
"E3_customer_lookup",
|
|
|
|
|
|
|
|
|
|
| 26 |
}
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def test_each_scenario_has_required_fields() -> None:
|
| 30 |
for name, sc in SCENARIOS.items():
|
| 31 |
missing = REQUIRED_KEYS - set(sc.keys())
|
|
|
|
| 18 |
}
|
| 19 |
|
| 20 |
|
| 21 |
+
def test_all_scenarios_present() -> None:
    """SCENARIOS must contain exactly the three easy and three medium task ids."""
    expected_ids = {
        "E1_onboard_new_hire",
        "E2_meeting_invite_blast",
        "E3_customer_lookup",
        "M1_customer_escalation",
        "M2_weekly_report",
        "M3_event_cleanup",
    }
    registered_ids = set(SCENARIOS.keys())
    assert registered_ids == expected_ids
|
| 30 |
|
| 31 |
|
| 32 |
+
def test_medium_scenarios_present() -> None:
    """Each M-series scenario is registered and labelled with medium difficulty."""
    medium_ids = ("M1_customer_escalation", "M2_weekly_report", "M3_event_cleanup")
    for task_id in medium_ids:
        # Check membership first so a missing scenario fails with a clear message
        # rather than a KeyError on the lookup below.
        assert task_id in SCENARIOS, f"{task_id} missing from SCENARIOS"
        scenario = SCENARIOS[task_id]
        assert scenario["difficulty"] == "medium"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def test_medium_scenarios_multi_drift() -> None:
    """Every medium scenario's drift plan contains exactly two drift events."""
    medium_ids = ("M1_customer_escalation", "M2_weekly_report", "M3_event_cleanup")
    for task_id in medium_ids:
        scenario = SCENARIOS[task_id]
        plan = scenario["drift_plan"]
        assert len(plan) == 2, f"{task_id}: expected 2 drifts, got {len(plan)}"
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_m3_same_tool_multi_drift() -> None:
    """M3 is the judgment-call-#2 stress test: both of its drifts hit calendar."""
    tools = [event.tool for event in SCENARIOS["M3_event_cleanup"]["drift_plan"]]
    expected_tools = ["calendar", "calendar"]
    assert tools == expected_tools, (
        f"M3 drifts must both target calendar, got {tools}"
    )
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_medium_required_tools() -> None:
    """Each medium scenario declares its expected required_tools list, in order."""
    expected_tools = {
        "M1_customer_escalation": ["mail", "calendar", "crm"],
        "M2_weekly_report": ["mail", "calendar", "crm"],
        "M3_event_cleanup": ["mail", "calendar"],
    }
    # List equality is order-sensitive, so this also pins the declared order.
    for task_id, tools in expected_tools.items():
        assert SCENARIOS[task_id]["required_tools"] == tools
|
| 57 |
+
|
| 58 |
+
|
| 59 |
def test_each_scenario_has_required_fields() -> None:
|
| 60 |
for name, sc in SCENARIOS.items():
|
| 61 |
missing = REQUIRED_KEYS - set(sc.keys())
|
tests/test_server.py
CHANGED
|
@@ -38,15 +38,18 @@ def test_tasks_endpoint(client) -> None:
|
|
| 38 |
r = client.get("/tasks")
|
| 39 |
assert r.status_code == 200
|
| 40 |
body = r.json()
|
| 41 |
-
assert body["count"] ==
|
| 42 |
task_ids = {t["task_id"] for t in body["tasks"]}
|
| 43 |
assert task_ids == {
|
| 44 |
"E1_onboard_new_hire",
|
| 45 |
"E2_meeting_invite_blast",
|
| 46 |
"E3_customer_lookup",
|
|
|
|
|
|
|
|
|
|
| 47 |
}
|
| 48 |
for t in body["tasks"]:
|
| 49 |
-
assert t["difficulty"]
|
| 50 |
assert isinstance(t["required_tools"], list)
|
| 51 |
|
| 52 |
|
|
|
|
| 38 |
r = client.get("/tasks")
|
| 39 |
assert r.status_code == 200
|
| 40 |
body = r.json()
|
| 41 |
+
assert body["count"] == 6
|
| 42 |
task_ids = {t["task_id"] for t in body["tasks"]}
|
| 43 |
assert task_ids == {
|
| 44 |
"E1_onboard_new_hire",
|
| 45 |
"E2_meeting_invite_blast",
|
| 46 |
"E3_customer_lookup",
|
| 47 |
+
"M1_customer_escalation",
|
| 48 |
+
"M2_weekly_report",
|
| 49 |
+
"M3_event_cleanup",
|
| 50 |
}
|
| 51 |
for t in body["tasks"]:
|
| 52 |
+
assert t["difficulty"] in ("easy", "medium")
|
| 53 |
assert isinstance(t["required_tools"], list)
|
| 54 |
|
| 55 |
|