SidraMiconi committed on
Commit
a17a9f5
·
1 Parent(s): 6c88a2c

deploy SchemaShift

Browse files
Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+ RUN useradd -m -u 1000 user
3
+ USER user
4
+ ENV PATH="/home/user/.local/bin:$PATH"
5
+ WORKDIR /app
6
+ COPY --chown=user ./requirements.txt requirements.txt
7
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
8
+ COPY --chown=user . /app
9
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,10 @@
1
  ---
2
- title: Schemashift
3
- emoji: 👀
4
- colorFrom: indigo
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
8
  license: apache-2.0
9
- short_description: Executive assistant environment with schema drift workflows
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: SchemaShift EA Arena
3
+ emoji: 🔄
4
+ colorFrom: purple
5
+ colorTo: orange
6
  sdk: docker
7
  pinned: false
8
  license: apache-2.0
9
+ app_port: 7860
10
  ---
 
 
__pycache__/models.cpython-312.pyc ADDED
Binary file (2.05 kB). View file
 
__pycache__/schemashift_environment.cpython-312.pyc ADDED
Binary file (9.23 kB). View file
 
__pycache__/tasks.cpython-312.pyc ADDED
Binary file (12.3 kB). View file
 
__pycache__/tools.cpython-312.pyc ADDED
Binary file (18.1 kB). View file
 
__pycache__/verifier.cpython-312.pyc ADDED
Binary file (6.24 kB). View file
 
app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SchemaShift EA Arena — FastAPI server (HF Spaces flat structure)."""
2
+ from fastapi import FastAPI
3
+ from pydantic import BaseModel
4
+ from schemashift_environment import SchemaShiftEnvironment
5
+ from models import EAAction
6
+
7
# ASGI application object; the Dockerfile's uvicorn command serves it as "app:app".
app = FastAPI(title="SchemaShift EA Arena")
# Single module-level environment instance read by the /reset, /step and /state
# handlers, so every HTTP client operates on the same episode state.
env = SchemaShiftEnvironment()
9
+
10
+
11
class StepRequest(BaseModel):
    """Request body for POST /step."""

    # Raw action payload; the /step handler expands it into EAAction(**action).
    action: dict
13
+
14
+
15
@app.get("/health")
def health():
    """Report service liveness and the number of available tasks.

    Returns:
        dict with a fixed "status" and "environment" label plus "tasks",
        the current length of the task catalogue.
    """
    # Imported locally so the count is always derived from the catalogue
    # instead of a hard-coded number that can fall out of sync with it.
    from tasks import TASKS

    return {
        "status": "healthy",
        "environment": "schemashift-ea-arena",
        "tasks": len(TASKS),
    }
18
+
19
+
20
@app.post("/reset")
def reset():
    """Start the next episode on the shared environment.

    Returns:
        dict holding the serialized initial observation, a reward of 0.0 and
        done=False.
    """
    initial_observation = env.reset()
    payload = {"observation": initial_observation.model_dump()}
    payload["reward"] = 0.0
    payload["done"] = False
    return payload
24
+
25
+
26
@app.post("/step")
def step(req: StepRequest):
    """Apply one agent action to the shared environment.

    Args:
        req: Request body whose ``action`` dict is validated as an EAAction.

    Returns:
        dict with the serialized observation, its reward and its done flag.

    Raises:
        HTTPException: 422 when ``req.action`` is not a valid EAAction.
    """
    from fastapi import HTTPException
    from pydantic import ValidationError

    try:
        action = EAAction(**req.action)
    except ValidationError as exc:
        # A malformed payload is a client error; without this it would
        # propagate as an unhandled exception and be reported as HTTP 500.
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    obs = env.step(action)
    return {"observation": obs.model_dump(), "reward": obs.reward, "done": obs.done}
31
+
32
+
33
@app.get("/state")
def state():
    """Return the serialized episode state, or an error dict before any reset."""
    current = env.state
    return (
        current.model_dump()
        if current is not None
        else {"error": "No active episode. Call /reset first."}
    )
models.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SchemaShift EA Arena — Pydantic v2 models."""
2
+ from pydantic import BaseModel, Field
3
+ from typing import Optional
4
+
5
+
6
class EAAction(BaseModel):
    """One agent action: the tool to call, the operation, and its arguments."""

    # Key of the target tool; the environment treats tool "system" with
    # action "submit" as the request to end and score the episode.
    tool: str = ""
    # Operation name handed to the tool's execute().
    action: str = ""
    # Arguments handed to the tool's execute() alongside the operation name.
    parameters: dict = Field(default_factory=dict)
10
+
11
+
12
class EAObservation(BaseModel):
    """What the environment returns to the agent after reset() or step()."""

    # Whether the call that produced this observation succeeded.
    success: bool = False
    # Text payload: task text on reset, JSON-dumped tool result or verdict on step.
    output: str = ""
    # Error message, when there is one.
    error: Optional[str] = None
    # Score for this observation; the environment fills it in when the episode
    # is submitted, and uses -1.0 when step() is called before reset().
    reward: float = 0.0
    # True once the episode has ended.
    done: bool = False
    # Number of steps taken so far in the episode.
    step_count: int = 0
    # Description of the current task (populated by reset()).
    task_description: str = ""
    # Schema version reported by the tool that handled the call.
    schema_version: int = 1
    # True on the step in which the drift event was injected.
    drift_occurred: bool = False
22
+
23
+
24
class EpisodeState(BaseModel):
    """Bookkeeping the environment keeps for the episode in progress."""

    # Identifier and description copied from the selected task.
    task_id: str = ""
    task_description: str = ""
    # Steps taken so far and the step budget for this task.
    step_count: int = 0
    max_steps: int = 20
    # Set to True once the episode has been submitted and scored.
    completed: bool = False
    # Verdict dict produced by the verifier at submission time.
    verdict: dict = Field(default_factory=dict)
    # "tool.action" strings, one per call routed to a known tool.
    tools_used: list = Field(default_factory=list)
    # Failed calls whose result was flagged "policy_violated".
    policy_violations: int = 0
    # Calls to unknown tools, plus failed calls whose result carries no
    # "schema_version" key.
    invalid_calls: int = 0
    # "change" names of the drift events injected so far.
    drift_events: list = Field(default_factory=list)
    # True once a tool call has succeeded after drift was applied.
    recovered_from_drift: bool = False
    # "to" values of the emails found in the outbox at submission time.
    notifications_sent: list = Field(default_factory=list)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ openenv-core>=0.2.1
2
+ fastapi
3
+ uvicorn[standard]
4
+ pydantic>=2.0
schemashift_environment.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SchemaShift EA Arena Environment — reset/step/state with schema drift injection."""
2
+ import os, json, copy
3
+ from models import EAAction, EAObservation, EpisodeState
4
+ from tasks import TASKS
5
+ from tools import ALL_TOOLS, CalendarTool, EmailTool, BookingsTool, TravelTool, DocsTool, ExpensesTool, RoomsTool, TeamTool, IncidentsTool
6
+ from verifier import verify_episode
7
+
8
+
9
class SchemaShiftEnvironment:
    """Episode runner for the SchemaShift tasks (reset / step / state).

    Each reset() selects the next task from TASKS in round-robin order and
    seeds a fresh set of tools from the task's "seed" data.  step() routes one
    action to a tool, injects the task's drift event once the configured step
    is reached, and scores the episode through verify_episode() when the agent
    submits or the step budget is exhausted.
    """

    def __init__(self):
        self._state = None
        self._task = None
        self._task_index = 0
        self._tools = {}
        self._drift_applied = False

    def _setup_tools(self, seed):
        """Instantiate and seed one tool per list-valued key of ``seed``.

        The "policies" key is skipped, and both "emails" and "email" are
        registered under the name "email".
        """
        self._tools = {}
        tool_map = {
            "calendar": CalendarTool, "emails": EmailTool, "email": EmailTool,
            "bookings": BookingsTool, "travel": TravelTool, "docs": DocsTool,
            "expenses": ExpensesTool, "rooms": RoomsTool, "team": TeamTool,
            "incidents": IncidentsTool,
        }
        for key, data in seed.items():
            if key == "policies":
                continue
            cls = tool_map.get(key)
            if cls and isinstance(data, list):
                tool = cls()
                tool.seed(data)
                name = "email" if key == "emails" else key
                self._tools[name] = tool

    def reset(self):
        """Begin a new episode on the next task and return its first observation."""
        self._task = TASKS[self._task_index % len(TASKS)]
        self._task_index += 1
        self._drift_applied = False
        self._setup_tools(self._task.get("seed", {}))
        self._state = EpisodeState(
            task_id=self._task["id"],
            task_description=self._task["description"],
            max_steps=self._task.get("max_steps", 15),
        )
        return EAObservation(
            success=True,
            output=f"TASK: {self._task['title']}\n\n{self._task['description']}",
            task_description=self._task["description"],
            done=False,
            schema_version=1,
        )

    def _maybe_inject_drift(self):
        """Apply the task's drift event once the drift step has been reached.

        Returns:
            The drift event dict on the step it is applied, otherwise None.
            Nothing is applied when the drift's target tool is not registered.
        """
        drift_step = self._task.get("drift_at_step")
        if drift_step and self._state.step_count >= drift_step and not self._drift_applied:
            drift = self._task.get("drift_event", {})
            tool_name = drift.get("tool", "")
            if tool_name == "emails":
                tool_name = "email"
            tool = self._tools.get(tool_name)
            if tool:
                tool.apply_drift(drift)
                self._drift_applied = True
                self._state.drift_events.append(drift.get("change", "unknown"))
                return drift
        return None

    def step(self, action):
        """Execute one action and return the resulting observation.

        Args:
            action: An EAAction, or a dict with "tool", "action" and
                "parameters" keys.

        Returns:
            EAObservation for the call.  The observation is terminal
            (done=True) when the agent submits, when the step budget is
            exhausted, when reset() has not been called yet, or when the
            episode was already completed.
        """
        if self._state is None:
            return EAObservation(success=False, error="Call reset() first", reward=-1.0, done=True)

        if self._state.completed:
            # A finished episode must not keep mutating tools or be re-scored.
            return EAObservation(
                success=False,
                error="Episode already completed. Call reset() to start a new one.",
                done=True,
                step_count=self._state.step_count,
            )

        self._state.step_count += 1

        tool_name = action.tool if hasattr(action, 'tool') else action.get('tool', '')
        act = action.action if hasattr(action, 'action') else action.get('action', '')
        params = action.parameters if hasattr(action, 'parameters') else action.get('parameters', {})

        drift = self._maybe_inject_drift()
        drift_msg = ""
        if drift:
            dtype = drift.get("type", "")
            if dtype == "schema_change":
                drift_msg = f"\n⚠️ SCHEMA CHANGE: {drift.get('change', '')}. Check tool documentation."
            elif dtype == "policy_change":
                drift_msg = f"\n⚠️ POLICY CHANGE: {drift.get('change', '')}. Review updated policies."
            elif dtype == "actor_conflict":
                drift_msg = f"\n⚠️ NEW MESSAGE from {drift.get('actor', 'unknown')}: \"{drift.get('message', '')}\""

        if tool_name == "system" and act == "submit":
            return self._submit()

        tool = self._tools.get(tool_name)
        if not tool:
            self._state.invalid_calls += 1
            return EAObservation(
                success=False, error=f"Unknown tool: {tool_name}{drift_msg}",
                step_count=self._state.step_count,
                drift_occurred=bool(drift),
            )

        self._state.tools_used.append(f"{tool_name}.{act}")
        result = tool.execute(act, params)

        if not result.get("success", False):
            if result.get("policy_violated"):
                self._state.policy_violations += 1
            elif "schema_version" not in result:
                # Failures that report a schema version are schema-drift
                # rejections and are not counted as invalid calls.
                self._state.invalid_calls += 1

        if self._drift_applied and result.get("success"):
            self._state.recovered_from_drift = True

        output = json.dumps(result, indent=2) if isinstance(result, dict) else str(result)
        output += drift_msg

        done = self._state.step_count >= self._state.max_steps
        if done:
            return self._submit()

        return EAObservation(
            success=result.get("success", False),
            output=output,
            error=result.get("error"),
            step_count=self._state.step_count,
            schema_version=getattr(tool, '_schema_version', 1),
            drift_occurred=bool(drift),
        )

    def _submit(self):
        """Score the episode and return the terminal observation."""
        snapshots = {name: tool.snapshot() for name, tool in self._tools.items()}

        # Reuse the snapshot already taken instead of snapshotting the email
        # tool a second time.
        email_snap = snapshots.get("email")
        if isinstance(email_snap, dict):
            self._state.notifications_sent = [e.get("to", "") for e in email_snap.get("outbox", [])]

        reward, violations, verdict = verify_episode(
            task=self._task,
            snapshots=snapshots,
            policy_violations=self._state.policy_violations,
            invalid_calls=self._state.invalid_calls,
            tool_calls_made=self._state.step_count,
            drift_events_handled=len(self._state.drift_events),
            recovered_from_drift=self._state.recovered_from_drift,
        )

        self._state.completed = True
        self._state.verdict = verdict

        return EAObservation(
            success=True,
            output=json.dumps(verdict, indent=2),
            reward=reward,
            done=True,
            step_count=self._state.step_count,
        )

    @property
    def state(self):
        """Current EpisodeState, or None before the first reset()."""
        return self._state
tasks.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SchemaShift EA Arena — Task Templates
3
+
4
+ 12 tasks across 3 difficulty tiers with schema drift events.
5
+ Each task simulates a real executive assistant workflow where
6
+ APIs, forms, and policies change mid-episode.
7
+ """
8
+
9
+ TASKS = [
10
+ # ═══════════════════════════════════════════════════════════
11
+ # TIER 1: Simple (3-4 tool calls, 1 drift event)
12
+ # ═══════════════════════════════════════════════════════════
13
+ {
14
+ "id": "reschedule_dinner",
15
+ "title": "Reschedule dinner due to meeting conflict",
16
+ "description": (
17
+ "Your VP moved the board prep meeting to 6:30 PM tonight. "
18
+ "You have dinner with Alex at 7:00 PM at Lucia's. "
19
+ "Reschedule dinner to 8:30 PM, update the restaurant booking, "
20
+ "and email Alex about the change."
21
+ ),
22
+ "seed": {
23
+ "calendar": [
24
+ {"id": 1, "title": "Board Prep", "time": "15:00", "attendees": ["vp@company.com"], "status": "scheduled"},
25
+ {"id": 2, "title": "Dinner with Alex", "time": "19:00", "location": "Lucia's", "attendees": ["alex@friends.com"], "status": "scheduled"},
26
+ ],
27
+ "bookings": [
28
+ {"id": 101, "restaurant": "Lucia's", "time": "19:00", "party_size": 2, "status": "confirmed"},
29
+ ],
30
+ "emails": [],
31
+ "policies": {"max_booking_changes": 3},
32
+ },
33
+ "drift_at_step": 2,
34
+ "drift_event": {"type": "schema_change", "tool": "bookings", "change": "time_field_renamed", "old_field": "time", "new_field": "reservation_time"},
35
+ "target": {
36
+ "calendar": [{"id": 2, "time": "20:30", "status": "rescheduled"}],
37
+ "bookings": [{"id": 101, "reservation_time": "20:30", "status": "confirmed"}],
38
+ "emails": [{"to": "alex@friends.com", "contains": "reschedule"}],
39
+ },
40
+ "max_steps": 10,
41
+ },
42
+ {
43
+ "id": "book_travel_simple",
44
+ "title": "Book a flight for Monday meeting",
45
+ "description": (
46
+ "Book a flight from SFO to LAX for Monday morning. "
47
+ "The meeting is at 2 PM so arrive by noon. "
48
+ "Email travel@company.com with the booking confirmation."
49
+ ),
50
+ "seed": {
51
+ "calendar": [
52
+ {"id": 1, "title": "LA Client Meeting", "time": "14:00", "date": "2026-03-09", "location": "LA Office", "attendees": ["client@partner.com"], "status": "scheduled"},
53
+ ],
54
+ "travel": [
55
+ {"flight": "UA101", "from": "SFO", "to": "LAX", "depart": "07:00", "arrive": "08:30", "price": 189, "status": "available"},
56
+ {"flight": "UA205", "from": "SFO", "to": "LAX", "depart": "09:00", "arrive": "10:30", "price": 249, "status": "available"},
57
+ ],
58
+ "emails": [],
59
+ "policies": {"max_flight_cost": 300},
60
+ },
61
+ "drift_at_step": 2,
62
+ "drift_event": {"type": "policy_change", "tool": "travel", "change": "cost_limit_lowered", "old_limit": 300, "new_limit": 200},
63
+ "target": {
64
+ "travel": [{"flight": "UA101", "status": "booked"}],
65
+ "emails": [{"to": "travel@company.com", "contains": "booking"}],
66
+ },
67
+ "max_steps": 8,
68
+ },
69
+ {
70
+ "id": "reply_email_urgent",
71
+ "title": "Reply to urgent client email",
72
+ "description": (
73
+ "Client Sarah at sarah@bigcorp.com sent an urgent email asking "
74
+ "about the Q2 proposal deadline. The deadline is March 15. "
75
+ "Reply to her email with the deadline and CC your manager mgr@company.com."
76
+ ),
77
+ "seed": {
78
+ "emails": [
79
+ {"id": 1, "from": "sarah@bigcorp.com", "subject": "Q2 Proposal Deadline?", "body": "Hi, when is the Q2 proposal due? We need to plan resources.", "status": "unread"},
80
+ ],
81
+ "docs": [
82
+ {"id": "q2-proposal", "title": "Q2 Proposal", "deadline": "2026-03-15", "status": "draft"},
83
+ ],
84
+ "policies": {"reply_within_hours": 2, "cc_manager_on_client": True},
85
+ },
86
+ "drift_at_step": 1,
87
+ "drift_event": {"type": "schema_change", "tool": "emails", "change": "cc_field_renamed", "old_field": "cc", "new_field": "carbon_copy"},
88
+ "target": {
89
+ "emails": [{"to": "sarah@bigcorp.com", "contains": "March 15", "carbon_copy": "mgr@company.com"}],
90
+ },
91
+ "max_steps": 6,
92
+ },
93
+ {
94
+ "id": "cancel_meeting_notify",
95
+ "title": "Cancel tomorrow's standup and notify team",
96
+ "description": (
97
+ "Cancel tomorrow's team standup (event 1) because the CEO "
98
+ "called an all-hands at the same time. Email the team list: "
99
+ "dev1@company.com, dev2@company.com, dev3@company.com."
100
+ ),
101
+ "seed": {
102
+ "calendar": [
103
+ {"id": 1, "title": "Team Standup", "time": "09:00", "date": "2026-03-08", "attendees": ["dev1@company.com", "dev2@company.com", "dev3@company.com"], "status": "scheduled"},
104
+ {"id": 2, "title": "CEO All-Hands", "time": "09:00", "date": "2026-03-08", "attendees": ["all@company.com"], "status": "scheduled"},
105
+ ],
106
+ "emails": [],
107
+ "policies": {},
108
+ },
109
+ "drift_at_step": 2,
110
+ "drift_event": {"type": "schema_change", "tool": "calendar", "change": "status_values_changed", "old_values": ["scheduled", "cancelled"], "new_values": ["active", "removed"]},
111
+ "target": {
112
+ "calendar": [{"id": 1, "status": "removed"}],
113
+ "emails": [{"to": "dev1@company.com"}, {"to": "dev2@company.com"}, {"to": "dev3@company.com"}],
114
+ },
115
+ "max_steps": 10,
116
+ },
117
+
118
+ # ═══════════════════════════════════════════════════════════
119
+ # TIER 2: Medium (5-6 tool calls, 1 drift event)
120
+ # ═══════════════════════════════════════════════════════════
121
+ {
122
+ "id": "travel_with_approval",
123
+ "title": "Book international travel with manager approval",
124
+ "description": (
125
+ "Book a flight from SFO to London for the conference on March 20. "
126
+ "Budget is $2000. Book hotel for 3 nights near the venue. "
127
+ "Get manager approval (mgr@company.com) since international travel "
128
+ "requires it. Email travel@company.com with full itinerary."
129
+ ),
130
+ "seed": {
131
+ "travel": [
132
+ {"flight": "BA285", "from": "SFO", "to": "LHR", "depart": "19:00", "arrive": "13:00+1", "price": 1200, "status": "available"},
133
+ {"hotel": "Hilton Tower Bridge", "rate": 250, "nights": 3, "status": "available"},
134
+ ],
135
+ "calendar": [
136
+ {"id": 1, "title": "London Conference", "date": "2026-03-20", "location": "ExCeL London", "status": "scheduled"},
137
+ ],
138
+ "emails": [],
139
+ "policies": {"intl_travel_requires_approval": True, "max_hotel_rate": 300},
140
+ },
141
+ "drift_at_step": 3,
142
+ "drift_event": {"type": "policy_change", "tool": "travel", "change": "approval_requires_itemized", "new_requirement": "must include flight cost, hotel cost, and total in approval request"},
143
+ "target": {
144
+ "travel": [{"flight": "BA285", "status": "booked"}, {"hotel": "Hilton Tower Bridge", "status": "booked"}],
145
+ "emails": [{"to": "mgr@company.com", "contains": "approval"}, {"to": "travel@company.com", "contains": "itinerary"}],
146
+ },
147
+ "max_steps": 12,
148
+ },
149
+ {
150
+ "id": "conflict_resolution",
151
+ "title": "Resolve double-booked afternoon",
152
+ "description": (
153
+ "You have 3 meetings at 2 PM: client call (high priority), "
154
+ "team sync (can move), and 1:1 with intern (can move). "
155
+ "Keep the client call, move team sync to 3 PM, move 1:1 to 4 PM. "
156
+ "Email all affected attendees about changes."
157
+ ),
158
+ "seed": {
159
+ "calendar": [
160
+ {"id": 1, "title": "Client Call", "time": "14:00", "priority": "high", "attendees": ["client@partner.com"], "status": "scheduled"},
161
+ {"id": 2, "title": "Team Sync", "time": "14:00", "priority": "medium", "attendees": ["team@company.com"], "status": "scheduled"},
162
+ {"id": 3, "title": "1:1 with Intern", "time": "14:00", "priority": "low", "attendees": ["intern@company.com"], "status": "scheduled"},
163
+ ],
164
+ "emails": [],
165
+ "policies": {"notify_on_reschedule": True},
166
+ },
167
+ "drift_at_step": 3,
168
+ "drift_event": {"type": "actor_conflict", "tool": "calendar", "change": "attendee_requests_different_time", "actor": "team@company.com", "message": "3 PM doesn't work, can we do 3:30?"},
169
+ "target": {
170
+ "calendar": [
171
+ {"id": 1, "time": "14:00", "status": "scheduled"},
172
+ {"id": 2, "time": "15:30", "status": "rescheduled"},
173
+ {"id": 3, "time": "16:00", "status": "rescheduled"},
174
+ ],
175
+ "emails": [{"to": "team@company.com"}, {"to": "intern@company.com"}],
176
+ },
177
+ "max_steps": 12,
178
+ },
179
+ {
180
+ "id": "expense_report",
181
+ "title": "Submit expense report with receipt changes",
182
+ "description": (
183
+ "Submit expense report for last week's client dinner ($185) "
184
+ "and taxi ($42). Attach receipts, categorize correctly, "
185
+ "and email finance@company.com for approval."
186
+ ),
187
+ "seed": {
188
+ "expenses": [
189
+ {"id": 1, "type": "meal", "amount": 185, "description": "Client dinner at Nobu", "receipt": True, "status": "draft"},
190
+ {"id": 2, "type": "transport", "amount": 42, "description": "Taxi to restaurant", "receipt": True, "status": "draft"},
191
+ ],
192
+ "emails": [],
193
+ "policies": {"meal_limit": 200, "require_receipt_over": 25, "approval_required_over": 100},
194
+ },
195
+ "drift_at_step": 2,
196
+ "drift_event": {"type": "policy_change", "tool": "expenses", "change": "meal_limit_lowered", "old_limit": 200, "new_limit": 150, "action": "meals over new limit require VP approval"},
197
+ "target": {
198
+ "expenses": [{"id": 1, "status": "submitted"}, {"id": 2, "status": "submitted"}],
199
+ "emails": [{"to": "finance@company.com", "contains": "expense"}, {"to": "vp@company.com", "contains": "approval"}],
200
+ },
201
+ "max_steps": 10,
202
+ },
203
+ {
204
+ "id": "onboard_new_hire",
205
+ "title": "Onboard new team member",
206
+ "description": (
207
+ "New hire Jordan (jordan@company.com) starts Monday. "
208
+ "Schedule a welcome meeting at 10 AM with the team, "
209
+ "create their onboarding doc, add them to the team calendar, "
210
+ "and email IT (it@company.com) to set up their accounts."
211
+ ),
212
+ "seed": {
213
+ "calendar": [],
214
+ "docs": [],
215
+ "emails": [],
216
+ "team": [
217
+ {"name": "Jordan Lee", "email": "jordan@company.com", "role": "engineer", "start_date": "2026-03-09"},
218
+ ],
219
+ "policies": {"onboard_checklist": ["welcome_meeting", "onboarding_doc", "it_setup", "team_intro"]},
220
+ },
221
+ "drift_at_step": 3,
222
+ "drift_event": {"type": "schema_change", "tool": "docs", "change": "template_format_changed", "old_format": "markdown", "new_format": "json"},
223
+ "target": {
224
+ "calendar": [{"title_contains": "Welcome", "attendees_include": "jordan@company.com"}],
225
+ "docs": [{"title_contains": "Onboarding"}],
226
+ "emails": [{"to": "it@company.com", "contains": "account"}, {"to": "jordan@company.com", "contains": "welcome"}],
227
+ },
228
+ "max_steps": 12,
229
+ },
230
+
231
+ # ═══════════════════════════════════════════════════════════
232
+ # TIER 3: Complex (7+ tool calls, 1 drift event)
233
+ # ═══════════════════════════════════════════════════════════
234
+ {
235
+ "id": "full_day_reorg",
236
+ "title": "Reorganize entire day after CEO emergency",
237
+ "description": (
238
+ "CEO called emergency board meeting at 11 AM. Reorganize the day: "
239
+ "move the 11 AM team review to 2 PM, cancel the noon lunch with vendor "
240
+ "(email vendor@partner.com to apologize), keep the 3 PM client call, "
241
+ "book a conference room for the board meeting, and email all attendees "
242
+ "about every change."
243
+ ),
244
+ "seed": {
245
+ "calendar": [
246
+ {"id": 1, "title": "Team Review", "time": "11:00", "attendees": ["team@company.com"], "status": "scheduled"},
247
+ {"id": 2, "title": "Lunch with Vendor", "time": "12:00", "attendees": ["vendor@partner.com"], "status": "scheduled"},
248
+ {"id": 3, "title": "Client Call", "time": "15:00", "attendees": ["client@bigcorp.com"], "status": "scheduled"},
249
+ ],
250
+ "rooms": [
251
+ {"id": "conf-a", "name": "Board Room", "capacity": 20, "available": True},
252
+ {"id": "conf-b", "name": "Small Meeting", "capacity": 6, "available": True},
253
+ ],
254
+ "emails": [],
255
+ "policies": {"board_meeting_room_min_capacity": 15},
256
+ },
257
+ "drift_at_step": 3,
258
+ "drift_event": {"type": "schema_change", "tool": "rooms", "change": "booking_requires_purpose", "new_required_field": "purpose"},
259
+ "target": {
260
+ "calendar": [
261
+ {"id": 1, "time": "14:00", "status": "rescheduled"},
262
+ {"id": 2, "status": "cancelled"},
263
+ ],
264
+ "rooms": [{"id": "conf-a", "status": "booked", "purpose": "CEO Board Meeting"}],
265
+ "emails": [{"to": "vendor@partner.com", "contains": "cancel"}, {"to": "team@company.com", "contains": "moved"}],
266
+ },
267
+ "max_steps": 15,
268
+ },
269
+ {
270
+ "id": "multi_actor_conflict",
271
+ "title": "Handle conflicting requests from VP and client",
272
+ "description": (
273
+ "VP wants you to schedule a strategy session Thursday 2-4 PM. "
274
+ "Client just emailed requesting a demo at the same time. "
275
+ "The client is higher priority. Schedule the demo for Thursday 2-3 PM, "
276
+ "move VP strategy to Friday 2-4 PM, and email both explaining."
277
+ ),
278
+ "seed": {
279
+ "calendar": [],
280
+ "emails": [
281
+ {"id": 1, "from": "vp@company.com", "subject": "Strategy Session", "body": "Block Thursday 2-4 PM for strategy planning.", "status": "unread"},
282
+ {"id": 2, "from": "client@bigcorp.com", "subject": "Demo Request", "body": "Can we see the product demo Thursday 2 PM?", "status": "unread"},
283
+ ],
284
+ "policies": {"client_priority_over_internal": True},
285
+ },
286
+ "drift_at_step": 4,
287
+ "drift_event": {"type": "actor_conflict", "tool": "emails", "change": "vp_insists", "actor": "vp@company.com", "message": "Friday doesn't work. Can we do Thursday morning instead?"},
288
+ "target": {
289
+ "calendar": [
290
+ {"title_contains": "Demo", "time": "14:00", "day": "Thursday"},
291
+ {"title_contains": "Strategy", "time": "10:00", "day": "Thursday"},
292
+ ],
293
+ "emails": [{"to": "vp@company.com", "contains": "Thursday morning"}, {"to": "client@bigcorp.com", "contains": "demo confirmed"}],
294
+ },
295
+ "max_steps": 15,
296
+ },
297
+ {
298
+ "id": "trip_planning_drift",
299
+ "title": "Plan team offsite with multiple schema changes",
300
+ "description": (
301
+ "Plan a 2-day team offsite for 8 people in Napa Valley. "
302
+ "Book hotel, restaurant for team dinner, and transportation. "
303
+ "Budget: $5000 total. Email team@company.com with the itinerary "
304
+ "and finance@company.com for pre-approval."
305
+ ),
306
+ "seed": {
307
+ "travel": [
308
+ {"hotel": "Napa Inn", "rate": 180, "rooms": 4, "nights": 2, "status": "available"},
309
+ {"transport": "Van rental", "cost": 200, "capacity": 10, "status": "available"},
310
+ ],
311
+ "bookings": [
312
+ {"restaurant": "Bistro Don Giovanni", "party_size": 8, "time": "19:00", "cost_per_person": 65, "status": "available"},
313
+ ],
314
+ "emails": [],
315
+ "policies": {"offsite_requires_preapproval": True, "max_offsite_budget": 5000},
316
+ },
317
+ "drift_at_step": 3,
318
+ "drift_event": {"type": "policy_change", "tool": "travel", "change": "budget_cut", "old_budget": 5000, "new_budget": 4000},
319
+ "target": {
320
+ "travel": [{"hotel": "Napa Inn", "status": "booked"}, {"transport": "Van rental", "status": "booked"}],
321
+ "bookings": [{"restaurant": "Bistro Don Giovanni", "status": "booked"}],
322
+ "emails": [{"to": "team@company.com", "contains": "itinerary"}, {"to": "finance@company.com", "contains": "approval"}],
323
+ },
324
+ "max_steps": 15,
325
+ },
326
+ {
327
+ "id": "crisis_management",
328
+ "title": "Handle server outage during client demo",
329
+ "description": (
330
+ "The production server went down during a client demo. "
331
+ "Email the client (client@bigcorp.com) apologizing and offering "
332
+ "to reschedule. Escalate to engineering (eng@company.com) with urgency. "
333
+ "Cancel the next 2 non-critical meetings to free up time. "
334
+ "Schedule a post-mortem for tomorrow at 10 AM. "
335
+ "Email your VP (vp@company.com) with a status update."
336
+ ),
337
+ "seed": {
338
+ "calendar": [
339
+ {"id": 1, "title": "Client Demo", "time": "14:00", "status": "in_progress", "attendees": ["client@bigcorp.com"]},
340
+ {"id": 2, "title": "Team Sync", "time": "15:00", "priority": "low", "status": "scheduled"},
341
+ {"id": 3, "title": "1:1 with PM", "time": "16:00", "priority": "low", "status": "scheduled"},
342
+ {"id": 4, "title": "Board Prep", "time": "17:00", "priority": "high", "status": "scheduled"},
343
+ ],
344
+ "emails": [],
345
+ "incidents": [{"id": "INC-001", "severity": "P1", "status": "active", "service": "production-api"}],
346
+ "policies": {"p1_notify_vp": True, "p1_cancel_nonessential": True},
347
+ },
348
+ "drift_at_step": 4,
349
+ "drift_event": {"type": "schema_change", "tool": "calendar", "change": "cancel_requires_reason", "new_required_field": "cancellation_reason"},
350
+ "target": {
351
+ "calendar": [
352
+ {"id": 2, "status": "cancelled", "cancellation_reason_contains": "outage"},
353
+ {"id": 3, "status": "cancelled", "cancellation_reason_contains": "outage"},
354
+ {"title_contains": "Post-mortem", "time": "10:00"},
355
+ ],
356
+ "emails": [
357
+ {"to": "client@bigcorp.com", "contains": "apologize"},
358
+ {"to": "eng@company.com", "contains": "escalat"},
359
+ {"to": "vp@company.com", "contains": "status"},
360
+ ],
361
+ },
362
+ "max_steps": 18,
363
+ },
364
+ ]
tools.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SchemaShift EA Arena — Simulated enterprise tools with schema drift support."""
2
+ import copy, json
3
+
4
+
5
class BaseTool:
    """Base class for all tools. Supports schema drift.

    Records are kept in ``_data`` keyed by their "id" (or by list position
    when a seeded record has no "id").  apply_drift() mutates the tool's
    schema bookkeeping: field renames, extra required fields and policy
    overrides, bumping ``_schema_version`` for every recognised drift.
    """

    def __init__(self):
        self._data = {}
        self._schema_version = 1
        self._field_renames = {}  # old_field -> new_field
        self._required_fields = []
        self._policy_overrides = {}

    def seed(self, data):
        """Load ``data`` (a list of dicts) as this tool's records, deep-copied."""
        self._data = {item.get("id", i): copy.deepcopy(item) for i, item in enumerate(data)}

    def apply_drift(self, drift_event):
        """Apply one drift event, selected by substrings of its "change" name.

        Args:
            drift_event: dict with a "change" name plus change-specific keys
                ("old_field"/"new_field", "new_required_field", "new_values",
                "new_*" policy values, "new_format").

        Unrecognised changes are ignored and leave the schema version as is.
        """
        change = drift_event.get("change", "")
        if "field_renamed" in change:
            old = drift_event.get("old_field", "")
            new = drift_event.get("new_field", "")
            self._field_renames[old] = new
            for record in self._data.values():
                if old in record:
                    record[new] = record.pop(old)
            self._schema_version += 1
        elif "requires_itemized" in change:
            # Must be tested before the generic "requires_" branch below,
            # which would otherwise match first and make this one unreachable.
            self._required_fields.append("itemized")
            self._schema_version += 1
        elif "requires_" in change or "required_field" in change.replace("new_", ""):
            new_field = drift_event.get("new_required_field", "")
            if new_field:
                self._required_fields.append(new_field)
            self._schema_version += 1
        elif "values_changed" in change:
            # Record the new status vocabulary; CalendarTool.cancel_event
            # checks this override to decide which cancel status to write.
            self._policy_overrides["status_values"] = drift_event.get("new_values", True)
            self._schema_version += 1
        elif "limit" in change or "budget" in change or "lowered" in change:
            prefix = "new_"
            for key, value in drift_event.items():
                if key.startswith(prefix):
                    # Strip only the leading prefix, not every occurrence.
                    self._policy_overrides[key[len(prefix):]] = value
            self._schema_version += 1
        elif "format_changed" in change:
            self._policy_overrides["format"] = drift_event.get("new_format", "json")
            self._schema_version += 1

    def snapshot(self):
        """Return a deep copy of all records as a list."""
        return copy.deepcopy(list(self._data.values()))
48
+
49
+
50
class CalendarTool(BaseTool):
    """Simulated calendar: list, get, create, reschedule and cancel events."""

    # Required fields that describe a cancellation; they are enforced by
    # cancel_event only, so a "cancel requires reason" drift does not also
    # block creating new events.
    _CANCEL_ONLY_FIELDS = ("cancellation_reason",)

    def _missing_required(self, params, ignore=()):
        """Return the first required field absent from ``params``, or None."""
        for field in self._required_fields:
            if field not in ignore and field not in params:
                return field
        return None

    def _missing_field_error(self, field):
        """Build the failure result for a missing field.

        The result carries "schema_version", matching the shape of the other
        missing-field failures this tool returns.
        """
        return {
            "success": False,
            "error": f"Missing required field: {field}",
            "schema_version": self._schema_version,
        }

    def execute(self, action, params):
        """Run one calendar operation.

        Args:
            action: One of "list_events", "get_event", "create_event",
                "reschedule_event" or "cancel_event".
            params: Arguments for the operation (an "id" for everything except
                list/create; "time" for reschedule).

        Returns:
            dict with "success" and either the affected data or an "error".
        """
        if action == "list_events":
            return {"success": True, "events": self.snapshot()}

        if action == "get_event":
            eid = params.get("id")
            if eid in self._data:
                return {"success": True, "event": copy.deepcopy(self._data[eid])}
            return {"success": False, "error": f"Event {eid} not found"}

        if action == "create_event":
            missing = self._missing_required(params, ignore=self._CANCEL_ONLY_FIELDS)
            if missing is not None:
                return self._missing_field_error(missing)
            if "id" in params:
                eid = params["id"]
            else:
                # Only integer keys take part in id generation, so a stray
                # non-integer key cannot make max() raise TypeError.
                eid = max((k for k in self._data if isinstance(k, int)), default=0) + 1
            self._data[eid] = {**params, "id": eid, "status": params.get("status", "scheduled")}
            return {"success": True, "event": copy.deepcopy(self._data[eid])}

        if action == "reschedule_event":
            eid = params.get("id")
            if eid not in self._data:
                return {"success": False, "error": f"Event {eid} not found"}
            new_time = params.get("time") or params.get("reservation_time")
            if not new_time:
                # Without a new time nothing can be rescheduled; reporting
                # success here would mark the event "rescheduled" unchanged.
                return self._missing_field_error("time")
            time_field = self._field_renames.get("time", "time")
            self._data[eid][time_field] = new_time
            self._data[eid]["status"] = "rescheduled"
            return {"success": True, "event": copy.deepcopy(self._data[eid])}

        if action == "cancel_event":
            eid = params.get("id")
            if eid not in self._data:
                return {"success": False, "error": f"Event {eid} not found"}
            missing = self._missing_required(params, ignore=("itemized",))
            if missing is not None:
                return self._missing_field_error(missing)
            cancel_status = "cancelled"
            if self._policy_overrides.get("status_values"):
                cancel_status = "removed"
            self._data[eid]["status"] = params.get("status", cancel_status)
            if "cancellation_reason" in params:
                self._data[eid]["cancellation_reason"] = params["cancellation_reason"]
            return {"success": True, "event": copy.deepcopy(self._data[eid])}

        return {"success": False, "error": f"Unknown calendar action: {action}"}
91
+
92
+
93
class EmailTool(BaseTool):
    """Email tool with a seeded inbox (``self._data``) and an outbox of sent mail."""

    def __init__(self):
        """Initialise the base tool state and start with an empty outbox."""
        super().__init__()
        self._outbox = []

    def seed(self, data):
        """Load the inbox from *data*, an iterable of email dicts.

        Each email is deep-copied and keyed by its ``"id"``; emails without
        an id are keyed by their position in *data*.
        """
        self._data = {item.get("id", i): copy.deepcopy(item) for i, item in enumerate(data)}

    def execute(self, action, params):
        """Run one email *action* with *params* and return a result dict.

        Supported actions: ``list_emails``, ``read_email`` (marks the stored
        email as ``"read"``) and ``send`` (appends to the outbox).  Every
        result carries a boolean ``"success"``; failures add ``"error"``.
        """
        if action == "list_emails":
            return {"success": True, "emails": self.snapshot()}

        if action == "read_email":
            email_id = params.get("id")
            if email_id in self._data:
                self._data[email_id]["status"] = "read"
                return {"success": True, "email": copy.deepcopy(self._data[email_id])}
            return {"success": False, "error": f"Email {email_id} not found"}

        if action == "send":
            to = params.get("to", "")
            subject = params.get("subject", "")
            body = params.get("body", "")
            # The copy recipient may arrive as "cc" or "carbon_copy".
            cc = params.get("cc") or params.get("carbon_copy", "")
            if not to:
                return {"success": False, "error": "Missing 'to' field"}
            email = {"to": to, "subject": subject, "body": body, "status": "sent"}
            if cc:
                # Store under the drifted field name when "cc" was renamed.
                cc_field = self._field_renames.get("cc", "cc")
                email[cc_field] = cc
            self._outbox.append(email)
            return {"success": True, "output": f"Email sent to {to}"}

        return {"success": False, "error": f"Unknown email action: {action}"}

    def snapshot(self):
        """Return ``{"inbox": [...], "outbox": [...]}`` as detached copies.

        Both lists are deep-copied so callers cannot mutate the tool's
        internal state through the snapshot (the inbox used to be returned
        as live references while the outbox was already copied).
        """
        return {
            "inbox": copy.deepcopy(list(self._data.values())),
            "outbox": copy.deepcopy(self._outbox),
        }
127
+
128
+
129
class BookingsTool(BaseTool):
    """Bookings tool: list, fetch, update and create booking records."""

    def execute(self, action, params):
        """Run one bookings *action* with *params* and return a result dict.

        Supported actions: ``list_bookings``, ``get_booking``,
        ``update_booking`` (copies every param except ``"id"`` onto the
        stored record) and ``create_booking``.  Every result carries a
        boolean ``"success"``; failures add ``"error"``.  Records handed
        back are deep copies.
        """
        if action == "list_bookings":
            return {"success": True, "bookings": self.snapshot()}

        if action == "get_booking":
            booking_id = params.get("id")
            if booking_id in self._data:
                return {"success": True, "booking": copy.deepcopy(self._data[booking_id])}
            return {"success": False, "error": f"Booking {booking_id} not found"}

        if action == "update_booking":
            booking_id = params.get("id")
            if booking_id not in self._data:
                return {"success": False, "error": f"Booking {booking_id} not found"}
            for key, value in params.items():
                if key != "id":
                    self._data[booking_id][key] = value
            return {"success": True, "booking": copy.deepcopy(self._data[booking_id])}

        if action == "create_booking":
            if "id" in params:
                booking_id = params["id"]
            else:
                # Auto-number from the integer keys only.  The id is computed
                # lazily so that stores keyed by strings no longer raise
                # TypeError (max() over strings followed by "+ 1").
                booking_id = max((k for k in self._data if isinstance(k, int)), default=0) + 1
            self._data[booking_id] = {**params, "id": booking_id,
                                      "status": params.get("status", "confirmed")}
            return {"success": True, "booking": copy.deepcopy(self._data[booking_id])}

        return {"success": False, "error": f"Unknown bookings action: {action}"}
151
+
152
+
153
class TravelTool(BaseTool):
    """Travel tool: list travel options and book one of them."""

    def execute(self, action, params):
        """Run one travel *action* with *params* and return a result dict.

        Supported actions:

        * ``list_options`` -- return every stored option.
        * ``book`` -- book the option identified by ``params["id"]``,
          ``"flight"``, ``"hotel"`` or ``"transport"`` (first truthy value
          wins).  The option matches when its key, or its own ``flight`` /
          ``hotel`` / ``transport`` value, equals that identifier.  Booking
          fails with ``"policy_violated": True`` when the option's cost
          exceeds the ``limit`` / ``budget`` policy override.

        Every result carries a boolean ``"success"``; failures add ``"error"``.
        """
        if action == "list_options":
            return {"success": True, "options": self.snapshot()}

        if action == "book":
            item_id = (params.get("id") or params.get("flight")
                       or params.get("hotel") or params.get("transport"))
            if item_id is None:
                # Without an identifier, ``option.get("flight") == None`` is
                # true for every option lacking that key, which used to book
                # an arbitrary option.  Report "not found" instead.
                return {"success": False, "error": f"Travel option not found: {item_id}"}

            for key, option in self._data.items():
                is_match = (option.get("flight") == item_id
                            or option.get("hotel") == item_id
                            or option.get("transport") == item_id
                            or key == item_id)
                if not is_match:
                    continue
                # Cost is the first truthy of: price, rate * nights, cost.
                cost = (option.get("price")
                        or option.get("rate", 0) * option.get("nights", 1)
                        or option.get("cost", 0))
                # No (or a falsy) limit override means effectively unlimited.
                limit = (self._policy_overrides.get("limit")
                         or self._policy_overrides.get("budget")
                         or 99999)
                if cost > limit:
                    return {"success": False, "error": f"Cost ${cost} exceeds limit ${limit}",
                            "policy_violated": True}
                option["status"] = "booked"
                return {"success": True, "booking": copy.deepcopy(option)}

            return {"success": False, "error": f"Travel option not found: {item_id}"}

        return {"success": False, "error": f"Unknown travel action: {action}"}
171
+
172
+
173
class DocsTool(BaseTool):
    """Docs tool: list, fetch and create documents."""

    def execute(self, action, params):
        """Run one docs *action* with *params* and return a result dict.

        Supported actions: ``list_docs``, ``get_doc`` and ``create_doc``.
        New documents take their ``"format"`` from the ``format`` policy
        override (``"markdown"`` when unset) and get status ``"created"``.
        Every result carries a boolean ``"success"``; failures add ``"error"``.
        """
        if action == "list_docs":
            return {"success": True, "docs": self.snapshot()}

        if action == "get_doc":
            doc_id = params.get("id")
            if doc_id not in self._data:
                return {"success": False, "error": f"Doc {doc_id} not found"}
            return {"success": True, "doc": copy.deepcopy(self._data[doc_id])}

        if action == "create_doc":
            # Default id is derived from the current number of stored docs.
            doc_id = params.get("id", f"doc-{len(self._data)+1}")
            doc_format = self._policy_overrides.get("format", "markdown")
            record = dict(params)
            record["id"] = doc_id
            record["format"] = doc_format
            record["status"] = "created"
            self._data[doc_id] = record
            return {"success": True, "doc": copy.deepcopy(record)}

        return {"success": False, "error": f"Unknown docs action: {action}"}
188
+
189
+
190
class ExpensesTool(BaseTool):
    """Expenses tool: list expenses and submit an existing one."""

    def execute(self, action, params):
        """Run one expenses *action* with *params* and return a result dict.

        Supported actions: ``list_expenses`` and ``submit_expense``.  A meal
        expense whose amount exceeds the ``limit`` / ``meal_limit`` policy
        override is answered with ``"requires_approval": True`` and its
        stored status is left untouched; any other expense is marked
        ``"submitted"``.
        """
        if action == "list_expenses":
            return {"success": True, "expenses": self.snapshot()}

        if action != "submit_expense":
            return {"success": False, "error": f"Unknown expenses action: {action}"}

        expense_id = params.get("id")
        if expense_id not in self._data:
            return {"success": False, "error": f"Expense {expense_id} not found"}

        record = self._data[expense_id]
        amount = record.get("amount", 0)
        overrides = self._policy_overrides
        # No (or a falsy) limit override means effectively unlimited.
        limit = overrides.get("limit") or overrides.get("meal_limit") or 99999
        is_meal = record.get("type", "") == "meal"
        if is_meal and amount > limit:
            return {
                "success": True,
                "output": f"Submitted but requires VP approval (${amount} > ${limit} meal limit)",
                "requires_approval": True,
            }

        record["status"] = "submitted"
        return {"success": True, "expense": copy.deepcopy(record)}
207
+
208
+
209
class RoomsTool(BaseTool):
    """Rooms tool: list rooms and book one of them."""

    def execute(self, action, params):
        """Run one rooms *action* with *params* and return a result dict.

        Supported actions: ``list_rooms`` and ``book_room``.  Booking checks
        that every entry of ``self._required_fields`` is present in *params*,
        then marks the room ``"booked"`` and records ``"purpose"`` if given.
        """
        if action == "list_rooms":
            return {"success": True, "rooms": self.snapshot()}

        if action != "book_room":
            return {"success": False, "error": f"Unknown rooms action: {action}"}

        room_id = params.get("id")
        if room_id not in self._data:
            return {"success": False, "error": f"Room {room_id} not found"}

        for field in self._required_fields:
            if field in params:
                continue
            # Report the first missing field along with the schema version.
            return {
                "success": False,
                "error": f"Missing required field: {field}",
                "schema_version": self._schema_version,
            }

        room = self._data[room_id]
        room["status"] = "booked"
        if "purpose" in params:
            room["purpose"] = params["purpose"]
        return {"success": True, "room": copy.deepcopy(room)}
225
+
226
+
227
class TeamTool(BaseTool):
    """Team directory tool: list members and look one up by email."""

    def execute(self, action, params):
        """Run one team *action* with *params* and return a result dict.

        Supported actions: ``list_members`` and ``get_member`` (returns the
        first stored member whose ``"email"`` equals ``params["email"]``).
        """
        if action == "list_members":
            return {"success": True, "members": self.snapshot()}

        if action != "get_member":
            return {"success": False, "error": f"Unknown team action: {action}"}

        email = params.get("email")
        for member in self._data.values():
            if member.get("email") == email:
                return {"success": True, "member": copy.deepcopy(member)}
        return {"success": False, "error": f"Member not found: {email}"}
238
+
239
+
240
class IncidentsTool(BaseTool):
    """Incidents tool: list, fetch and escalate incidents."""

    def execute(self, action, params):
        """Run one incidents *action* with *params* and return a result dict.

        Supported actions: ``list_incidents``, ``get_incident`` and
        ``escalate`` (sets status ``"escalated"`` and stores ``params["to"]``,
        or an empty string, under ``"escalated_to"``).
        """
        if action == "list_incidents":
            return {"success": True, "incidents": self.snapshot()}

        if action not in ("get_incident", "escalate"):
            return {"success": False, "error": f"Unknown incidents action: {action}"}

        # Both remaining actions address a single incident by id.
        incident_id = params.get("id")
        if incident_id not in self._data:
            return {"success": False, "error": f"Incident {incident_id} not found"}

        incident = self._data[incident_id]
        if action == "escalate":
            incident["status"] = "escalated"
            incident["escalated_to"] = params.get("to", "")
        return {"success": True, "incident": copy.deepcopy(incident)}
257
+
258
+
259
# Tool registry: maps each tool name to the class that implements it.
ALL_TOOLS = dict(
    calendar=CalendarTool,
    email=EmailTool,
    bookings=BookingsTool,
    travel=TravelTool,
    docs=DocsTool,
    expenses=ExpensesTool,
    rooms=RoomsTool,
    team=TeamTool,
    incidents=IncidentsTool,
)
verifier.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SchemaShift EA Arena Verifier — deterministic scoring.
2
+
3
+ Score (100 points):
4
+ Task Completion: 30 pts (final state matches target)
5
+ Policy Compliance: 20 pts (no policy violations)
6
+ Notifications: 15 pts (all required emails sent)
7
+ Drift Recovery: 15 pts (adapted to schema changes)
8
+ Tool Efficiency: 10 pts (minimal tool calls)
9
+ Action Hygiene: 10 pts (no invalid calls)
10
+
11
+ Verdict: PASS ≥ 85 with no policy violations, otherwise HOLD ≥ 55, BLOCK < 55
12
+ """
13
+ import copy
14
+
15
+
16
def _as_text(value):
    """Return *value* as a string, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)


def _item_matches(expected, actual):
    """Return True when the *actual* record satisfies every rule in *expected*.

    Rule kinds, chosen by the key in *expected*:

    * ``<field>_contains`` -- case-insensitive substring test on the field;
      for ``title_contains`` the record's title and subject are both searched.
    * ``<field>_include`` -- membership test (``value in record[field]``).
    * ``contains`` -- case-insensitive substring test on body + subject.
    * anything else -- exact equality with the record's value for that key.
    """
    for key, wanted in expected.items():
        if key.endswith("_contains"):
            # Strip only the trailing suffix, not every occurrence.
            field = key[: -len("_contains")]
            if field == "title":
                haystack = _as_text(actual.get("title")) + _as_text(actual.get("subject"))
            else:
                haystack = _as_text(actual.get(field))
            if str(wanted).lower() not in haystack.lower():
                return False
        elif key.endswith("_include"):
            field = key[: -len("_include")]
            container = actual.get(field, [])
            try:
                if wanted not in container:
                    return False
            except TypeError:
                # Field is not a container (e.g. None): cannot include anything.
                return False
        elif key == "contains":
            haystack = _as_text(actual.get("body")) + _as_text(actual.get("subject"))
            if str(wanted).lower() not in haystack.lower():
                return False
        elif actual.get(key) != wanted:
            return False
    return True


def check_target_match(target, snapshots):
    """Return the fraction of expected target items found in the final state.

    Args:
        target: mapping of tool name to a list of expected-item rule dicts.
        snapshots: mapping of tool name to that tool's snapshot, either a
            list of records or a dict (for email, the ``"outbox"`` list is
            used; for any other dict, its values).

    Returns:
        ``matches / total`` as a float, or ``1.0`` when nothing is expected.

    A target keyed ``"emails"`` is checked against the ``"email"`` snapshot
    when no ``"emails"`` snapshot exists, so both spellings are scored.
    Non-dict records are ignored.
    """
    matched = 0
    total = 0

    for tool_name, expected_items in target.items():
        if tool_name in snapshots:
            actual = snapshots[tool_name]
        elif tool_name == "emails":
            actual = snapshots.get("email", [])
        else:
            actual = []

        if isinstance(actual, dict):
            if tool_name in ("email", "emails"):
                candidates = actual.get("outbox", [])
            else:
                candidates = list(actual.values())
        else:
            candidates = actual
        records = [rec for rec in (candidates or []) if isinstance(rec, dict)]

        for expected in expected_items or []:
            total += 1
            if any(_item_matches(expected, rec) for rec in records):
                matched += 1

    return (matched / total) if total > 0 else 1.0
62
+
63
+
64
def check_notifications(target, email_snapshot):
    """Return the fraction of required emails that were sent.

    Args:
        target: task target; required emails are read from ``"emails"``,
            falling back to ``"email"``.
        email_snapshot: the email tool snapshot, either a dict with an
            ``"outbox"`` list or a plain list of sent emails.  Any other
            value is treated as an empty outbox.

    Returns:
        ``1.0`` when no emails are required; otherwise the share of expected
        emails whose ``"to"`` equals the ``"to"`` of at least one sent email.

    Non-dict entries (in the outbox or among the expected emails) are
    skipped instead of raising, matching ``check_target_match``; a skipped
    expected entry still counts as not sent.
    """
    if "email" not in target and "emails" not in target:
        return 1.0

    expected_emails = target.get("emails", target.get("email", []))
    if not expected_emails:
        return 1.0

    if isinstance(email_snapshot, dict):
        outbox = email_snapshot.get("outbox", []) or []
    elif isinstance(email_snapshot, list):
        outbox = email_snapshot
    else:
        outbox = []

    recipients = [mail.get("to", "") for mail in outbox if isinstance(mail, dict)]
    sent = sum(
        1
        for expected in expected_emails
        if isinstance(expected, dict) and expected.get("to", "") in recipients
    )
    return sent / len(expected_emails)
87
+
88
+
89
def verify_episode(task, snapshots, policy_violations, invalid_calls,
                   tool_calls_made, drift_events_handled, recovered_from_drift):
    """Score an episode out of 100 points and derive a verdict.

    Args:
        task: task definition; ``"target"``, ``"drift_at_step"`` and
            ``"max_steps"`` (default 15) are read.
        snapshots: mapping of tool name to that tool's final snapshot.
        policy_violations: number of policy violations (10 points each,
            out of 20).
        invalid_calls: number of invalid tool calls (3 points each, out of 10).
        tool_calls_made: number of tool calls the agent made.
        drift_events_handled: number of drift events the agent handled.
        recovered_from_drift: whether the agent fully recovered from drift.

    Returns:
        A ``(reward, violations, verdict)`` tuple: *reward* is 1.0 for PASS,
        0.3 for HOLD and -0.5 for BLOCK; *violations* is a list of messages;
        *verdict* is a dict with the decision, score, grade, reward, a
        per-category breakdown and the same violations list.
    """
    violations = []
    target = task.get("target", {})

    # 1. Task Completion (30 pts)
    completion = check_target_match(target, snapshots)
    completion_pts = round(completion * 30, 1)

    # 2. Policy Compliance (20 pts)
    compliance_pts = max(0, 20 - policy_violations * 10)
    if policy_violations:
        violations.append(f"{policy_violations} policy violation(s)")

    # 3. Notifications (15 pts)
    notif_score = check_notifications(target, snapshots.get("email", []))
    notif_pts = round(notif_score * 15, 1)
    if notif_score < 1.0:
        violations.append("Missing notifications")

    # 4. Drift Recovery (15 pts)
    has_drift = task.get("drift_at_step") is not None
    if not has_drift:
        drift_pts = 15.0  # No drift = full credit
    elif recovered_from_drift:
        drift_pts = 15.0
    elif drift_events_handled > 0:
        drift_pts = 8.0  # Partial credit: drift was handled but not recovered from
    else:
        drift_pts = 0.0
        violations.append("Failed to recover from schema drift")

    # 5. Tool Efficiency (10 pts): full marks up to half of max_steps,
    #    falling linearly to zero at max_steps.
    max_steps = task.get("max_steps", 15)
    half_budget = max_steps * 0.5
    if half_budget > 0:
        efficiency = max(0, 1 - (tool_calls_made - half_budget) / half_budget)
    else:
        # A non-positive step budget would divide by zero (or invert the
        # scale); only an episode with no tool calls counts as efficient.
        efficiency = 0 if tool_calls_made else 1.0
    efficiency_pts = round(min(10, efficiency * 10), 1)

    # 6. Action Hygiene (10 pts)
    hygiene_pts = max(0, 10 - invalid_calls * 3)
    if invalid_calls:
        violations.append(f"{invalid_calls} invalid call(s)")

    # Total
    score = round(min(100, completion_pts + compliance_pts + notif_pts +
                      drift_pts + efficiency_pts + hygiene_pts), 1)

    # Verdict: PASS additionally requires a clean policy record.
    if score >= 85 and policy_violations == 0:
        decision = "PASS"
    elif score >= 55:
        decision = "HOLD"
    else:
        decision = "BLOCK"

    if score >= 90:
        grade = "A"
    elif score >= 80:
        grade = "B"
    elif score >= 70:
        grade = "C"
    elif score >= 60:
        grade = "D"
    else:
        grade = "F"

    reward = {"PASS": 1.0, "HOLD": 0.3}.get(decision, -0.5)

    return reward, violations, {
        "decision": decision, "score": score, "grade": grade, "reward": reward,
        "breakdown": {
            "task_completion": {"points": completion_pts, "max": 30, "match_rate": round(completion, 3)},
            "policy_compliance": {"points": compliance_pts, "max": 20, "violations": policy_violations},
            "notifications": {"points": notif_pts, "max": 15, "sent_rate": round(notif_score, 3)},
            "drift_recovery": {"points": drift_pts, "max": 15, "recovered": recovered_from_drift},
            "tool_efficiency": {"points": efficiency_pts, "max": 10, "calls": tool_calls_made},
            "action_hygiene": {"points": hygiene_pts, "max": 10, "invalid": invalid_calls},
        },
        "violations": violations,
    }