Spaces:
Sleeping
Sleeping
Commit ·
a17a9f5
1
Parent(s): 6c88a2c
deploy SchemaShift
Browse files- Dockerfile +9 -0
- README.md +5 -7
- __pycache__/models.cpython-312.pyc +0 -0
- __pycache__/schemashift_environment.cpython-312.pyc +0 -0
- __pycache__/tasks.cpython-312.pyc +0 -0
- __pycache__/tools.cpython-312.pyc +0 -0
- __pycache__/verifier.cpython-312.pyc +0 -0
- app.py +38 -0
- models.py +36 -0
- requirements.txt +4 -0
- schemashift_environment.py +162 -0
- tasks.py +364 -0
- tools.py +270 -0
- verifier.py +159 -0
Dockerfile
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
RUN useradd -m -u 1000 user
|
| 3 |
+
USER user
|
| 4 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
| 7 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 8 |
+
COPY --chown=user . /app
|
| 9 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,12 +1,10 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: apache-2.0
|
| 9 |
-
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: SchemaShift EA Arena
|
| 3 |
+
emoji: 🔄
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: orange
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: apache-2.0
|
| 9 |
+
app_port: 7860
|
| 10 |
---
|
|
|
|
|
|
__pycache__/models.cpython-312.pyc
ADDED
|
Binary file (2.05 kB). View file
|
|
|
__pycache__/schemashift_environment.cpython-312.pyc
ADDED
|
Binary file (9.23 kB). View file
|
|
|
__pycache__/tasks.cpython-312.pyc
ADDED
|
Binary file (12.3 kB). View file
|
|
|
__pycache__/tools.cpython-312.pyc
ADDED
|
Binary file (18.1 kB). View file
|
|
|
__pycache__/verifier.cpython-312.pyc
ADDED
|
Binary file (6.24 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SchemaShift EA Arena — FastAPI server (HF Spaces flat structure)."""
|
| 2 |
+
from fastapi import FastAPI
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from schemashift_environment import SchemaShiftEnvironment
|
| 5 |
+
from models import EAAction
|
| 6 |
+
|
| 7 |
+
app = FastAPI(title="SchemaShift EA Arena")
|
| 8 |
+
env = SchemaShiftEnvironment()
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Request body for POST /step: a JSON object with a single "action" key whose
# value is a free-form mapping; the /step handler turns it into an EAAction.
class StepRequest(BaseModel):
    action: dict  # raw action payload, expanded as keyword arguments for EAAction
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@app.get("/health")
def health():
    """Liveness probe: report a static status payload for the arena service."""
    payload = {
        "status": "healthy",
        "environment": "schemashift-ea-arena",
        "tasks": 12,
    }
    return payload
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@app.post("/reset")
def reset():
    """Start a new episode on the shared environment and return its first observation.

    The reward is always 0.0 and ``done`` is always False for a fresh episode.
    """
    first_observation = env.reset()
    return dict(
        observation=first_observation.model_dump(),
        reward=0.0,
        done=False,
    )
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@app.post("/step")
def step(req: StepRequest):
    """Apply one agent action to the shared environment.

    The raw ``action`` mapping from the request body is validated into an
    ``EAAction`` and passed to ``env.step``.

    Returns:
        A dict with the dumped observation plus its ``reward`` and ``done``
        fields lifted to the top level.

    Raises:
        HTTPException: 422 when the action payload does not validate as an
            ``EAAction`` (for example a non-string ``tool``).
    """
    # Local imports keep this handler self-contained; both packages are
    # already dependencies of this module.
    from fastapi import HTTPException
    from pydantic import ValidationError

    try:
        action = EAAction(**req.action)
    except ValidationError as exc:
        # A malformed payload is a client error; without this it would escape
        # the handler and be reported as a 500.
        raise HTTPException(
            status_code=422,
            # Context objects are dropped because they may not be JSON-serializable.
            detail=exc.errors(include_url=False, include_context=False),
        ) from exc

    obs = env.step(action)
    return {"observation": obs.model_dump(), "reward": obs.reward, "done": obs.done}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@app.get("/state")
def state():
    """Return the current episode state, or an error payload before the first reset."""
    current = env.state
    if current is not None:
        return current.model_dump()
    return {"error": "No active episode. Call /reset first."}
|
models.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SchemaShift EA Arena — Pydantic v2 models."""
|
| 2 |
+
from pydantic import BaseModel, Field
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# One agent action: which tool to call, which operation on it, and its arguments.
class EAAction(BaseModel):
    tool: str = ""  # tool name; the environment treats "system" + "submit" as end-of-episode
    action: str = ""  # operation name forwarded to the tool
    parameters: dict = Field(default_factory=dict)  # operation arguments
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# What the environment returns to the agent after reset() or step().
class EAObservation(BaseModel):
    # Outcome of the call.
    success: bool = False
    output: str = ""
    error: Optional[str] = None

    # Episode bookkeeping.
    reward: float = 0.0
    done: bool = False
    step_count: int = 0
    task_description: str = ""

    # Drift signalling.
    schema_version: int = 1
    drift_occurred: bool = False
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Mutable per-episode record kept by the environment and exposed via /state.
class EpisodeState(BaseModel):
    # Task identity and step budget.
    task_id: str = ""
    task_description: str = ""
    step_count: int = 0
    max_steps: int = 20

    # Set when the episode is submitted.
    completed: bool = False
    verdict: dict = Field(default_factory=dict)

    # Counters and logs accumulated while stepping.
    tools_used: list = Field(default_factory=list)
    policy_violations: int = 0
    invalid_calls: int = 0
    drift_events: list = Field(default_factory=list)
    recovered_from_drift: bool = False
    notifications_sent: list = Field(default_factory=list)
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core>=0.2.1
|
| 2 |
+
fastapi
|
| 3 |
+
uvicorn[standard]
|
| 4 |
+
pydantic>=2.0
|
schemashift_environment.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SchemaShift EA Arena Environment — reset/step/state with schema drift injection."""
|
| 2 |
+
import os, json, copy
|
| 3 |
+
from models import EAAction, EAObservation, EpisodeState
|
| 4 |
+
from tasks import TASKS
|
| 5 |
+
from tools import ALL_TOOLS, CalendarTool, EmailTool, BookingsTool, TravelTool, DocsTool, ExpensesTool, RoomsTool, TeamTool, IncidentsTool
|
| 6 |
+
from verifier import verify_episode
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class SchemaShiftEnvironment:
    """Episode driver for the SchemaShift EA Arena.

    Each ``reset()`` picks the next entry of ``TASKS`` (round-robin), seeds a
    fresh set of tool instances from the task's ``seed`` section, and starts a
    new ``EpisodeState``. ``step()`` routes agent actions to those tools,
    applies the task's drift event once its trigger step is reached, and scores
    the episode through ``verify_episode`` when the agent submits or the step
    budget is exhausted.
    """

    def __init__(self):
        self._state = None  # EpisodeState of the running episode, None before reset()
        self._task = None  # task dict currently being played
        self._task_index = 0  # position of the next task to hand out
        self._tools = {}  # tool name -> seeded tool instance
        self._drift_applied = False  # True once the task's drift event has fired

    def _setup_tools(self, seed):
        """Instantiate and seed one tool per recognised list-valued seed section.

        The ``"policies"`` section and any unrecognised or non-list section are
        ignored. The ``"emails"`` section is registered under the name
        ``"email"``.
        """
        tool_map = {
            "calendar": CalendarTool, "emails": EmailTool, "email": EmailTool,
            "bookings": BookingsTool, "travel": TravelTool, "docs": DocsTool,
            "expenses": ExpensesTool, "rooms": RoomsTool, "team": TeamTool,
            "incidents": IncidentsTool,
        }
        self._tools = {}
        for key, data in seed.items():
            if key == "policies":
                continue
            cls = tool_map.get(key)
            if not cls or not isinstance(data, list):
                continue
            tool = cls()
            tool.seed(data)
            self._tools["email" if key == "emails" else key] = tool

    def reset(self):
        """Start the next task and return the opening observation."""
        self._task = TASKS[self._task_index % len(TASKS)]
        self._task_index += 1
        self._drift_applied = False
        self._setup_tools(self._task.get("seed", {}))
        self._state = EpisodeState(
            task_id=self._task["id"],
            task_description=self._task["description"],
            max_steps=self._task.get("max_steps", 15),
        )
        return EAObservation(
            success=True,
            output=f"TASK: {self._task['title']}\n\n{self._task['description']}",
            task_description=self._task["description"],
            done=False,
            schema_version=1,
        )

    def _maybe_inject_drift(self):
        """Apply the task's drift event if it is due and has not fired yet.

        Returns:
            The drift dict when it was applied on this call, otherwise None.
            Nothing is applied when the targeted tool is not part of this
            episode's tool set.
        """
        drift_step = self._task.get("drift_at_step")
        if not drift_step or self._drift_applied:
            return None
        if self._state.step_count < drift_step:
            return None

        drift = self._task.get("drift_event", {})
        tool_name = drift.get("tool", "")
        if tool_name == "emails":
            # Tasks name the seed section; the tool is registered as "email".
            tool_name = "email"
        tool = self._tools.get(tool_name)
        if not tool:
            return None

        tool.apply_drift(drift)
        self._drift_applied = True
        self._state.drift_events.append(drift.get("change", "unknown"))
        return drift

    @staticmethod
    def _unpack_action(action):
        """Return ``(tool, action, parameters)`` from an object or a mapping."""
        def read(name, default):
            if hasattr(action, name):
                return getattr(action, name)
            return action.get(name, default)

        return read("tool", ""), read("action", ""), read("parameters", {})

    @staticmethod
    def _format_drift_message(drift):
        """Build the warning text appended to the output for a drift event."""
        dtype = drift.get("type", "")
        if dtype == "schema_change":
            return f"\n⚠️ SCHEMA CHANGE: {drift.get('change', '')}. Check tool documentation."
        if dtype == "policy_change":
            return f"\n⚠️ POLICY CHANGE: {drift.get('change', '')}. Review updated policies."
        if dtype == "actor_conflict":
            actor = drift.get("actor", "unknown")
            message = drift.get("message", "")
            return f'\n⚠️ NEW MESSAGE from {actor}: "{message}"'
        return ""

    def step(self, action):
        """Execute one agent action and return the resulting observation.

        ``action`` may be an object with ``tool`` / ``action`` / ``parameters``
        attributes or a mapping with those keys. The pair
        ``tool="system", action="submit"`` ends the episode. The episode is
        also ended and scored when the step budget is reached, including when
        the last action named an unknown tool.

        Calling ``step`` before ``reset`` or after the episode has completed
        returns a failed, ``done=True`` observation without touching state.
        """
        state = self._state
        if state is None:
            return EAObservation(success=False, error="Call reset() first", reward=-1.0, done=True)
        if state.completed:
            # A finished episode must not keep mutating tools or be re-scored.
            return EAObservation(
                success=False,
                error="Episode already completed. Call reset() to start a new episode.",
                done=True,
                step_count=state.step_count,
            )

        state.step_count += 1
        tool_name, act, params = self._unpack_action(action)

        drift = self._maybe_inject_drift()
        drift_msg = self._format_drift_message(drift) if drift else ""

        if tool_name == "system" and act == "submit":
            return self._submit()

        tool = self._tools.get(tool_name)
        if not tool:
            state.invalid_calls += 1
            if state.step_count >= state.max_steps:
                # Invalid calls consume the step budget like any other call.
                return self._submit()
            return EAObservation(
                success=False,
                error=f"Unknown tool: {tool_name}{drift_msg}",
                step_count=state.step_count,
                drift_occurred=bool(drift),
            )

        state.tools_used.append(f"{tool_name}.{act}")
        result = tool.execute(act, params)

        if not result.get("success", False):
            if result.get("policy_violated"):
                state.policy_violations += 1
            elif "schema_version" not in result:
                state.invalid_calls += 1

        if self._drift_applied and result.get("success"):
            state.recovered_from_drift = True

        output = json.dumps(result, indent=2) if isinstance(result, dict) else str(result)
        output += drift_msg

        if state.step_count >= state.max_steps:
            return self._submit()

        return EAObservation(
            success=result.get("success", False),
            output=output,
            error=result.get("error"),
            step_count=state.step_count,
            schema_version=getattr(tool, '_schema_version', 1),
            drift_occurred=bool(drift),
        )

    def _submit(self):
        """Snapshot every tool, score the episode and return the final observation."""
        snapshots = {name: tool.snapshot() for name, tool in self._tools.items()}

        # Record the recipients found in the email tool's outbox, if it exposes one.
        email_snap = snapshots.get("email")
        if isinstance(email_snap, dict):
            self._state.notifications_sent = [
                e.get("to", "") for e in email_snap.get("outbox", [])
            ]

        reward, violations, verdict = verify_episode(
            task=self._task,
            snapshots=snapshots,
            policy_violations=self._state.policy_violations,
            invalid_calls=self._state.invalid_calls,
            tool_calls_made=self._state.step_count,
            drift_events_handled=len(self._state.drift_events),
            recovered_from_drift=self._state.recovered_from_drift,
        )

        self._state.completed = True
        self._state.verdict = verdict

        return EAObservation(
            success=True,
            output=json.dumps(verdict, indent=2),
            reward=reward,
            done=True,
            step_count=self._state.step_count,
        )

    @property
    def state(self):
        """Current ``EpisodeState``, or None before the first ``reset()``."""
        return self._state
|
tasks.py
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SchemaShift EA Arena — Task Templates
|
| 3 |
+
|
| 4 |
+
12 tasks across 3 difficulty tiers with schema drift events.
|
| 5 |
+
Each task simulates a real executive assistant workflow where
|
| 6 |
+
APIs, forms, and policies change mid-episode.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
TASKS = [
|
| 10 |
+
# ═══════════════════════════════════════════════════════════
|
| 11 |
+
# TIER 1: Simple (3-4 tool calls, 1 drift event)
|
| 12 |
+
# ═══════════════════════════════════════════════════════════
|
| 13 |
+
{
|
| 14 |
+
"id": "reschedule_dinner",
|
| 15 |
+
"title": "Reschedule dinner due to meeting conflict",
|
| 16 |
+
"description": (
|
| 17 |
+
"Your VP moved the board prep meeting to 6:30 PM tonight. "
|
| 18 |
+
"You have dinner with Alex at 7:00 PM at Lucia's. "
|
| 19 |
+
"Reschedule dinner to 8:30 PM, update the restaurant booking, "
|
| 20 |
+
"and email Alex about the change."
|
| 21 |
+
),
|
| 22 |
+
"seed": {
|
| 23 |
+
"calendar": [
|
| 24 |
+
{"id": 1, "title": "Board Prep", "time": "15:00", "attendees": ["vp@company.com"], "status": "scheduled"},
|
| 25 |
+
{"id": 2, "title": "Dinner with Alex", "time": "19:00", "location": "Lucia's", "attendees": ["alex@friends.com"], "status": "scheduled"},
|
| 26 |
+
],
|
| 27 |
+
"bookings": [
|
| 28 |
+
{"id": 101, "restaurant": "Lucia's", "time": "19:00", "party_size": 2, "status": "confirmed"},
|
| 29 |
+
],
|
| 30 |
+
"emails": [],
|
| 31 |
+
"policies": {"max_booking_changes": 3},
|
| 32 |
+
},
|
| 33 |
+
"drift_at_step": 2,
|
| 34 |
+
"drift_event": {"type": "schema_change", "tool": "bookings", "change": "time_field_renamed", "old_field": "time", "new_field": "reservation_time"},
|
| 35 |
+
"target": {
|
| 36 |
+
"calendar": [{"id": 2, "time": "20:30", "status": "rescheduled"}],
|
| 37 |
+
"bookings": [{"id": 101, "reservation_time": "20:30", "status": "confirmed"}],
|
| 38 |
+
"emails": [{"to": "alex@friends.com", "contains": "reschedule"}],
|
| 39 |
+
},
|
| 40 |
+
"max_steps": 10,
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"id": "book_travel_simple",
|
| 44 |
+
"title": "Book a flight for Monday meeting",
|
| 45 |
+
"description": (
|
| 46 |
+
"Book a flight from SFO to LAX for Monday morning. "
|
| 47 |
+
"The meeting is at 2 PM so arrive by noon. "
|
| 48 |
+
"Email travel@company.com with the booking confirmation."
|
| 49 |
+
),
|
| 50 |
+
"seed": {
|
| 51 |
+
"calendar": [
|
| 52 |
+
{"id": 1, "title": "LA Client Meeting", "time": "14:00", "date": "2026-03-09", "location": "LA Office", "attendees": ["client@partner.com"], "status": "scheduled"},
|
| 53 |
+
],
|
| 54 |
+
"travel": [
|
| 55 |
+
{"flight": "UA101", "from": "SFO", "to": "LAX", "depart": "07:00", "arrive": "08:30", "price": 189, "status": "available"},
|
| 56 |
+
{"flight": "UA205", "from": "SFO", "to": "LAX", "depart": "09:00", "arrive": "10:30", "price": 249, "status": "available"},
|
| 57 |
+
],
|
| 58 |
+
"emails": [],
|
| 59 |
+
"policies": {"max_flight_cost": 300},
|
| 60 |
+
},
|
| 61 |
+
"drift_at_step": 2,
|
| 62 |
+
"drift_event": {"type": "policy_change", "tool": "travel", "change": "cost_limit_lowered", "old_limit": 300, "new_limit": 200},
|
| 63 |
+
"target": {
|
| 64 |
+
"travel": [{"flight": "UA101", "status": "booked"}],
|
| 65 |
+
"emails": [{"to": "travel@company.com", "contains": "booking"}],
|
| 66 |
+
},
|
| 67 |
+
"max_steps": 8,
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"id": "reply_email_urgent",
|
| 71 |
+
"title": "Reply to urgent client email",
|
| 72 |
+
"description": (
|
| 73 |
+
"Client Sarah at sarah@bigcorp.com sent an urgent email asking "
|
| 74 |
+
"about the Q2 proposal deadline. The deadline is March 15. "
|
| 75 |
+
"Reply to her email with the deadline and CC your manager mgr@company.com."
|
| 76 |
+
),
|
| 77 |
+
"seed": {
|
| 78 |
+
"emails": [
|
| 79 |
+
{"id": 1, "from": "sarah@bigcorp.com", "subject": "Q2 Proposal Deadline?", "body": "Hi, when is the Q2 proposal due? We need to plan resources.", "status": "unread"},
|
| 80 |
+
],
|
| 81 |
+
"docs": [
|
| 82 |
+
{"id": "q2-proposal", "title": "Q2 Proposal", "deadline": "2026-03-15", "status": "draft"},
|
| 83 |
+
],
|
| 84 |
+
"policies": {"reply_within_hours": 2, "cc_manager_on_client": True},
|
| 85 |
+
},
|
| 86 |
+
"drift_at_step": 1,
|
| 87 |
+
"drift_event": {"type": "schema_change", "tool": "emails", "change": "cc_field_renamed", "old_field": "cc", "new_field": "carbon_copy"},
|
| 88 |
+
"target": {
|
| 89 |
+
"emails": [{"to": "sarah@bigcorp.com", "contains": "March 15", "carbon_copy": "mgr@company.com"}],
|
| 90 |
+
},
|
| 91 |
+
"max_steps": 6,
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"id": "cancel_meeting_notify",
|
| 95 |
+
"title": "Cancel tomorrow's standup and notify team",
|
| 96 |
+
"description": (
|
| 97 |
+
"Cancel tomorrow's team standup (event 1) because the CEO "
|
| 98 |
+
"called an all-hands at the same time. Email the team list: "
|
| 99 |
+
"dev1@company.com, dev2@company.com, dev3@company.com."
|
| 100 |
+
),
|
| 101 |
+
"seed": {
|
| 102 |
+
"calendar": [
|
| 103 |
+
{"id": 1, "title": "Team Standup", "time": "09:00", "date": "2026-03-08", "attendees": ["dev1@company.com", "dev2@company.com", "dev3@company.com"], "status": "scheduled"},
|
| 104 |
+
{"id": 2, "title": "CEO All-Hands", "time": "09:00", "date": "2026-03-08", "attendees": ["all@company.com"], "status": "scheduled"},
|
| 105 |
+
],
|
| 106 |
+
"emails": [],
|
| 107 |
+
"policies": {},
|
| 108 |
+
},
|
| 109 |
+
"drift_at_step": 2,
|
| 110 |
+
"drift_event": {"type": "schema_change", "tool": "calendar", "change": "status_values_changed", "old_values": ["scheduled", "cancelled"], "new_values": ["active", "removed"]},
|
| 111 |
+
"target": {
|
| 112 |
+
"calendar": [{"id": 1, "status": "removed"}],
|
| 113 |
+
"emails": [{"to": "dev1@company.com"}, {"to": "dev2@company.com"}, {"to": "dev3@company.com"}],
|
| 114 |
+
},
|
| 115 |
+
"max_steps": 10,
|
| 116 |
+
},
|
| 117 |
+
|
| 118 |
+
# ═══════════════════════════════════════════════════════════
|
| 119 |
+
# TIER 2: Medium (5-6 tool calls, 1 drift event)
|
| 120 |
+
# ═══════════════════════════════════════════════════════════
|
| 121 |
+
{
|
| 122 |
+
"id": "travel_with_approval",
|
| 123 |
+
"title": "Book international travel with manager approval",
|
| 124 |
+
"description": (
|
| 125 |
+
"Book a flight from SFO to London for the conference on March 20. "
|
| 126 |
+
"Budget is $2000. Book hotel for 3 nights near the venue. "
|
| 127 |
+
"Get manager approval (mgr@company.com) since international travel "
|
| 128 |
+
"requires it. Email travel@company.com with full itinerary."
|
| 129 |
+
),
|
| 130 |
+
"seed": {
|
| 131 |
+
"travel": [
|
| 132 |
+
{"flight": "BA285", "from": "SFO", "to": "LHR", "depart": "19:00", "arrive": "13:00+1", "price": 1200, "status": "available"},
|
| 133 |
+
{"hotel": "Hilton Tower Bridge", "rate": 250, "nights": 3, "status": "available"},
|
| 134 |
+
],
|
| 135 |
+
"calendar": [
|
| 136 |
+
{"id": 1, "title": "London Conference", "date": "2026-03-20", "location": "ExCeL London", "status": "scheduled"},
|
| 137 |
+
],
|
| 138 |
+
"emails": [],
|
| 139 |
+
"policies": {"intl_travel_requires_approval": True, "max_hotel_rate": 300},
|
| 140 |
+
},
|
| 141 |
+
"drift_at_step": 3,
|
| 142 |
+
"drift_event": {"type": "policy_change", "tool": "travel", "change": "approval_requires_itemized", "new_requirement": "must include flight cost, hotel cost, and total in approval request"},
|
| 143 |
+
"target": {
|
| 144 |
+
"travel": [{"flight": "BA285", "status": "booked"}, {"hotel": "Hilton Tower Bridge", "status": "booked"}],
|
| 145 |
+
"emails": [{"to": "mgr@company.com", "contains": "approval"}, {"to": "travel@company.com", "contains": "itinerary"}],
|
| 146 |
+
},
|
| 147 |
+
"max_steps": 12,
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"id": "conflict_resolution",
|
| 151 |
+
"title": "Resolve double-booked afternoon",
|
| 152 |
+
"description": (
|
| 153 |
+
"You have 3 meetings at 2 PM: client call (high priority), "
|
| 154 |
+
"team sync (can move), and 1:1 with intern (can move). "
|
| 155 |
+
"Keep the client call, move team sync to 3 PM, move 1:1 to 4 PM. "
|
| 156 |
+
"Email all affected attendees about changes."
|
| 157 |
+
),
|
| 158 |
+
"seed": {
|
| 159 |
+
"calendar": [
|
| 160 |
+
{"id": 1, "title": "Client Call", "time": "14:00", "priority": "high", "attendees": ["client@partner.com"], "status": "scheduled"},
|
| 161 |
+
{"id": 2, "title": "Team Sync", "time": "14:00", "priority": "medium", "attendees": ["team@company.com"], "status": "scheduled"},
|
| 162 |
+
{"id": 3, "title": "1:1 with Intern", "time": "14:00", "priority": "low", "attendees": ["intern@company.com"], "status": "scheduled"},
|
| 163 |
+
],
|
| 164 |
+
"emails": [],
|
| 165 |
+
"policies": {"notify_on_reschedule": True},
|
| 166 |
+
},
|
| 167 |
+
"drift_at_step": 3,
|
| 168 |
+
"drift_event": {"type": "actor_conflict", "tool": "calendar", "change": "attendee_requests_different_time", "actor": "team@company.com", "message": "3 PM doesn't work, can we do 3:30?"},
|
| 169 |
+
"target": {
|
| 170 |
+
"calendar": [
|
| 171 |
+
{"id": 1, "time": "14:00", "status": "scheduled"},
|
| 172 |
+
{"id": 2, "time": "15:30", "status": "rescheduled"},
|
| 173 |
+
{"id": 3, "time": "16:00", "status": "rescheduled"},
|
| 174 |
+
],
|
| 175 |
+
"emails": [{"to": "team@company.com"}, {"to": "intern@company.com"}],
|
| 176 |
+
},
|
| 177 |
+
"max_steps": 12,
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"id": "expense_report",
|
| 181 |
+
"title": "Submit expense report with receipt changes",
|
| 182 |
+
"description": (
|
| 183 |
+
"Submit expense report for last week's client dinner ($185) "
|
| 184 |
+
"and taxi ($42). Attach receipts, categorize correctly, "
|
| 185 |
+
"and email finance@company.com for approval."
|
| 186 |
+
),
|
| 187 |
+
"seed": {
|
| 188 |
+
"expenses": [
|
| 189 |
+
{"id": 1, "type": "meal", "amount": 185, "description": "Client dinner at Nobu", "receipt": True, "status": "draft"},
|
| 190 |
+
{"id": 2, "type": "transport", "amount": 42, "description": "Taxi to restaurant", "receipt": True, "status": "draft"},
|
| 191 |
+
],
|
| 192 |
+
"emails": [],
|
| 193 |
+
"policies": {"meal_limit": 200, "require_receipt_over": 25, "approval_required_over": 100},
|
| 194 |
+
},
|
| 195 |
+
"drift_at_step": 2,
|
| 196 |
+
"drift_event": {"type": "policy_change", "tool": "expenses", "change": "meal_limit_lowered", "old_limit": 200, "new_limit": 150, "action": "meals over new limit require VP approval"},
|
| 197 |
+
"target": {
|
| 198 |
+
"expenses": [{"id": 1, "status": "submitted"}, {"id": 2, "status": "submitted"}],
|
| 199 |
+
"emails": [{"to": "finance@company.com", "contains": "expense"}, {"to": "vp@company.com", "contains": "approval"}],
|
| 200 |
+
},
|
| 201 |
+
"max_steps": 10,
|
| 202 |
+
},
|
| 203 |
+
{
|
| 204 |
+
"id": "onboard_new_hire",
|
| 205 |
+
"title": "Onboard new team member",
|
| 206 |
+
"description": (
|
| 207 |
+
"New hire Jordan (jordan@company.com) starts Monday. "
|
| 208 |
+
"Schedule a welcome meeting at 10 AM with the team, "
|
| 209 |
+
"create their onboarding doc, add them to the team calendar, "
|
| 210 |
+
"and email IT (it@company.com) to set up their accounts."
|
| 211 |
+
),
|
| 212 |
+
"seed": {
|
| 213 |
+
"calendar": [],
|
| 214 |
+
"docs": [],
|
| 215 |
+
"emails": [],
|
| 216 |
+
"team": [
|
| 217 |
+
{"name": "Jordan Lee", "email": "jordan@company.com", "role": "engineer", "start_date": "2026-03-09"},
|
| 218 |
+
],
|
| 219 |
+
"policies": {"onboard_checklist": ["welcome_meeting", "onboarding_doc", "it_setup", "team_intro"]},
|
| 220 |
+
},
|
| 221 |
+
"drift_at_step": 3,
|
| 222 |
+
"drift_event": {"type": "schema_change", "tool": "docs", "change": "template_format_changed", "old_format": "markdown", "new_format": "json"},
|
| 223 |
+
"target": {
|
| 224 |
+
"calendar": [{"title_contains": "Welcome", "attendees_include": "jordan@company.com"}],
|
| 225 |
+
"docs": [{"title_contains": "Onboarding"}],
|
| 226 |
+
"emails": [{"to": "it@company.com", "contains": "account"}, {"to": "jordan@company.com", "contains": "welcome"}],
|
| 227 |
+
},
|
| 228 |
+
"max_steps": 12,
|
| 229 |
+
},
|
| 230 |
+
|
| 231 |
+
# ═══════════════════════════════════════════════════════════
|
| 232 |
+
# TIER 3: Complex (7+ tool calls, 1 drift event)
|
| 233 |
+
# ═══════════════════════════════════════════════════════════
|
| 234 |
+
{
|
| 235 |
+
"id": "full_day_reorg",
|
| 236 |
+
"title": "Reorganize entire day after CEO emergency",
|
| 237 |
+
"description": (
|
| 238 |
+
"CEO called emergency board meeting at 11 AM. Reorganize the day: "
|
| 239 |
+
"move the 11 AM team review to 2 PM, cancel the noon lunch with vendor "
|
| 240 |
+
"(email vendor@partner.com to apologize), keep the 3 PM client call, "
|
| 241 |
+
"book a conference room for the board meeting, and email all attendees "
|
| 242 |
+
"about every change."
|
| 243 |
+
),
|
| 244 |
+
"seed": {
|
| 245 |
+
"calendar": [
|
| 246 |
+
{"id": 1, "title": "Team Review", "time": "11:00", "attendees": ["team@company.com"], "status": "scheduled"},
|
| 247 |
+
{"id": 2, "title": "Lunch with Vendor", "time": "12:00", "attendees": ["vendor@partner.com"], "status": "scheduled"},
|
| 248 |
+
{"id": 3, "title": "Client Call", "time": "15:00", "attendees": ["client@bigcorp.com"], "status": "scheduled"},
|
| 249 |
+
],
|
| 250 |
+
"rooms": [
|
| 251 |
+
{"id": "conf-a", "name": "Board Room", "capacity": 20, "available": True},
|
| 252 |
+
{"id": "conf-b", "name": "Small Meeting", "capacity": 6, "available": True},
|
| 253 |
+
],
|
| 254 |
+
"emails": [],
|
| 255 |
+
"policies": {"board_meeting_room_min_capacity": 15},
|
| 256 |
+
},
|
| 257 |
+
"drift_at_step": 3,
|
| 258 |
+
"drift_event": {"type": "schema_change", "tool": "rooms", "change": "booking_requires_purpose", "new_required_field": "purpose"},
|
| 259 |
+
"target": {
|
| 260 |
+
"calendar": [
|
| 261 |
+
{"id": 1, "time": "14:00", "status": "rescheduled"},
|
| 262 |
+
{"id": 2, "status": "cancelled"},
|
| 263 |
+
],
|
| 264 |
+
"rooms": [{"id": "conf-a", "status": "booked", "purpose": "CEO Board Meeting"}],
|
| 265 |
+
"emails": [{"to": "vendor@partner.com", "contains": "cancel"}, {"to": "team@company.com", "contains": "moved"}],
|
| 266 |
+
},
|
| 267 |
+
"max_steps": 15,
|
| 268 |
+
},
|
| 269 |
+
{
|
| 270 |
+
"id": "multi_actor_conflict",
|
| 271 |
+
"title": "Handle conflicting requests from VP and client",
|
| 272 |
+
"description": (
|
| 273 |
+
"VP wants you to schedule a strategy session Thursday 2-4 PM. "
|
| 274 |
+
"Client just emailed requesting a demo at the same time. "
|
| 275 |
+
"The client is higher priority. Schedule the demo for Thursday 2-3 PM, "
|
| 276 |
+
"move VP strategy to Friday 2-4 PM, and email both explaining."
|
| 277 |
+
),
|
| 278 |
+
"seed": {
|
| 279 |
+
"calendar": [],
|
| 280 |
+
"emails": [
|
| 281 |
+
{"id": 1, "from": "vp@company.com", "subject": "Strategy Session", "body": "Block Thursday 2-4 PM for strategy planning.", "status": "unread"},
|
| 282 |
+
{"id": 2, "from": "client@bigcorp.com", "subject": "Demo Request", "body": "Can we see the product demo Thursday 2 PM?", "status": "unread"},
|
| 283 |
+
],
|
| 284 |
+
"policies": {"client_priority_over_internal": True},
|
| 285 |
+
},
|
| 286 |
+
"drift_at_step": 4,
|
| 287 |
+
"drift_event": {"type": "actor_conflict", "tool": "emails", "change": "vp_insists", "actor": "vp@company.com", "message": "Friday doesn't work. Can we do Thursday morning instead?"},
|
| 288 |
+
"target": {
|
| 289 |
+
"calendar": [
|
| 290 |
+
{"title_contains": "Demo", "time": "14:00", "day": "Thursday"},
|
| 291 |
+
{"title_contains": "Strategy", "time": "10:00", "day": "Thursday"},
|
| 292 |
+
],
|
| 293 |
+
"emails": [{"to": "vp@company.com", "contains": "Thursday morning"}, {"to": "client@bigcorp.com", "contains": "demo confirmed"}],
|
| 294 |
+
},
|
| 295 |
+
"max_steps": 15,
|
| 296 |
+
},
|
| 297 |
+
{
|
| 298 |
+
"id": "trip_planning_drift",
|
| 299 |
+
"title": "Plan team offsite with multiple schema changes",
|
| 300 |
+
"description": (
|
| 301 |
+
"Plan a 2-day team offsite for 8 people in Napa Valley. "
|
| 302 |
+
"Book hotel, restaurant for team dinner, and transportation. "
|
| 303 |
+
"Budget: $5000 total. Email team@company.com with the itinerary "
|
| 304 |
+
"and finance@company.com for pre-approval."
|
| 305 |
+
),
|
| 306 |
+
"seed": {
|
| 307 |
+
"travel": [
|
| 308 |
+
{"hotel": "Napa Inn", "rate": 180, "rooms": 4, "nights": 2, "status": "available"},
|
| 309 |
+
{"transport": "Van rental", "cost": 200, "capacity": 10, "status": "available"},
|
| 310 |
+
],
|
| 311 |
+
"bookings": [
|
| 312 |
+
{"restaurant": "Bistro Don Giovanni", "party_size": 8, "time": "19:00", "cost_per_person": 65, "status": "available"},
|
| 313 |
+
],
|
| 314 |
+
"emails": [],
|
| 315 |
+
"policies": {"offsite_requires_preapproval": True, "max_offsite_budget": 5000},
|
| 316 |
+
},
|
| 317 |
+
"drift_at_step": 3,
|
| 318 |
+
"drift_event": {"type": "policy_change", "tool": "travel", "change": "budget_cut", "old_budget": 5000, "new_budget": 4000},
|
| 319 |
+
"target": {
|
| 320 |
+
"travel": [{"hotel": "Napa Inn", "status": "booked"}, {"transport": "Van rental", "status": "booked"}],
|
| 321 |
+
"bookings": [{"restaurant": "Bistro Don Giovanni", "status": "booked"}],
|
| 322 |
+
"emails": [{"to": "team@company.com", "contains": "itinerary"}, {"to": "finance@company.com", "contains": "approval"}],
|
| 323 |
+
},
|
| 324 |
+
"max_steps": 15,
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"id": "crisis_management",
|
| 328 |
+
"title": "Handle server outage during client demo",
|
| 329 |
+
"description": (
|
| 330 |
+
"The production server went down during a client demo. "
|
| 331 |
+
"Email the client (client@bigcorp.com) apologizing and offering "
|
| 332 |
+
"to reschedule. Escalate to engineering (eng@company.com) with urgency. "
|
| 333 |
+
"Cancel the next 2 non-critical meetings to free up time. "
|
| 334 |
+
"Schedule a post-mortem for tomorrow at 10 AM. "
|
| 335 |
+
"Email your VP (vp@company.com) with a status update."
|
| 336 |
+
),
|
| 337 |
+
"seed": {
|
| 338 |
+
"calendar": [
|
| 339 |
+
{"id": 1, "title": "Client Demo", "time": "14:00", "status": "in_progress", "attendees": ["client@bigcorp.com"]},
|
| 340 |
+
{"id": 2, "title": "Team Sync", "time": "15:00", "priority": "low", "status": "scheduled"},
|
| 341 |
+
{"id": 3, "title": "1:1 with PM", "time": "16:00", "priority": "low", "status": "scheduled"},
|
| 342 |
+
{"id": 4, "title": "Board Prep", "time": "17:00", "priority": "high", "status": "scheduled"},
|
| 343 |
+
],
|
| 344 |
+
"emails": [],
|
| 345 |
+
"incidents": [{"id": "INC-001", "severity": "P1", "status": "active", "service": "production-api"}],
|
| 346 |
+
"policies": {"p1_notify_vp": True, "p1_cancel_nonessential": True},
|
| 347 |
+
},
|
| 348 |
+
"drift_at_step": 4,
|
| 349 |
+
"drift_event": {"type": "schema_change", "tool": "calendar", "change": "cancel_requires_reason", "new_required_field": "cancellation_reason"},
|
| 350 |
+
"target": {
|
| 351 |
+
"calendar": [
|
| 352 |
+
{"id": 2, "status": "cancelled", "cancellation_reason_contains": "outage"},
|
| 353 |
+
{"id": 3, "status": "cancelled", "cancellation_reason_contains": "outage"},
|
| 354 |
+
{"title_contains": "Post-mortem", "time": "10:00"},
|
| 355 |
+
],
|
| 356 |
+
"emails": [
|
| 357 |
+
{"to": "client@bigcorp.com", "contains": "apologize"},
|
| 358 |
+
{"to": "eng@company.com", "contains": "escalat"},
|
| 359 |
+
{"to": "vp@company.com", "contains": "status"},
|
| 360 |
+
],
|
| 361 |
+
},
|
| 362 |
+
"max_steps": 18,
|
| 363 |
+
},
|
| 364 |
+
]
|
tools.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SchemaShift EA Arena — Simulated enterprise tools with schema drift support."""
|
| 2 |
+
import copy, json
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class BaseTool:
    """Base class for all simulated tools; stores records and schema-drift state.

    Attributes are private by convention and read directly by subclasses:
      _data             -- records keyed by their "id" (or position, see seed)
      _schema_version   -- starts at 1, bumped once per recognised drift event
      _field_renames    -- old field name -> new field name
      _required_fields  -- extra fields subclasses may demand from callers
      _policy_overrides -- policy values (limits, formats, ...) set by drift
    """

    _NEW_PREFIX = "new_"

    def __init__(self):
        self._data = {}
        self._schema_version = 1
        self._field_renames = {}  # old_field -> new_field
        self._required_fields = []
        self._policy_overrides = {}

    def seed(self, data):
        """Replace the stored records with deep copies of the items in *data*.

        Each record is keyed by its "id" value; a record without one is keyed
        by its position in *data*.
        """
        # NOTE(review): a positional fallback key can collide with an explicit
        # id (e.g. an id-less second item and an item with id 1); the later
        # record then overwrites the earlier one. Confirm seeds always
        # carry ids.
        self._data = {
            item.get("id", index): copy.deepcopy(item)
            for index, item in enumerate(data)
        }

    def apply_drift(self, drift_event):
        """Apply one schema-drift event, selected by substrings of its "change".

        Branches are tested in order and only the first match is applied:
          * "field_renamed"   -- rename "old_field" to "new_field" in every
                                 record and remember the mapping.
          * "requires_..." / "required_field" -- add "new_required_field" to
                                 the required fields; a "requires_itemized"
                                 change without that key adds "itemized".
          * "values_changed"  -- only bumps the schema version.
          * "limit" / "budget" / "lowered" -- copy every "new_<name>" entry of
                                 the event into the policy overrides as
                                 "<name>".
          * "format_changed"  -- set the "format" override ("new_format",
                                 default "json").
        Any recognised change bumps the schema version by one; an
        unrecognised change is ignored.
        """
        change = drift_event.get("change", "")

        if "field_renamed" in change:
            self._rename_field(
                drift_event.get("old_field", ""),
                drift_event.get("new_field", ""),
            )
        elif "requires_" in change or "required_field" in change.replace("new_", ""):
            required = drift_event.get("new_required_field", "")
            # "requires_itemized" always lands in this branch (it contains
            # "requires_"), so its implicit field has to be handled here.
            if not required and "requires_itemized" in change:
                required = "itemized"
            if required and required not in self._required_fields:
                self._required_fields.append(required)
        elif "values_changed" in change:
            # NOTE(review): nothing from the event is stored here; confirm
            # whether the new values were meant to become a policy override.
            pass
        elif "limit" in change or "budget" in change or "lowered" in change:
            prefix = self._NEW_PREFIX
            for key, value in drift_event.items():
                if key.startswith(prefix):
                    # Strip only the leading prefix so that names such as
                    # "new_brand_new_limit" keep their inner "new_".
                    self._policy_overrides[key[len(prefix):]] = value
        elif "format_changed" in change:
            self._policy_overrides["format"] = drift_event.get("new_format", "json")
        else:
            return

        self._schema_version += 1

    def _rename_field(self, old, new):
        """Rename field *old* to *new* in every record and record the mapping."""
        if not old or not new:
            # An incomplete event carries nothing to rename.
            return
        self._field_renames[old] = new
        for record in self._data.values():
            if old in record:
                record[new] = record.pop(old)

    def snapshot(self):
        """Return a deep copy of all stored records as a list."""
        return copy.deepcopy(list(self._data.values()))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class CalendarTool(BaseTool):
    """Simulated calendar supporting list/get/create/reschedule/cancel actions."""

    def execute(self, action, params):
        """Run one calendar *action* with *params* and return a result dict.

        Every result carries "success"; failures add "error" and, for a
        missing required field, the current "schema_version".
        """
        if action == "list_events":
            return {"success": True, "events": self.snapshot()}
        if action == "get_event":
            return self._get_event(params)
        if action == "create_event":
            return self._create_event(params)
        if action == "reschedule_event":
            return self._reschedule_event(params)
        if action == "cancel_event":
            return self._cancel_event(params)
        return {"success": False, "error": f"Unknown calendar action: {action}"}

    def _missing_field_error(self, field):
        """Build the failure result for an absent required field."""
        return {
            "success": False,
            "error": f"Missing required field: {field}",
            "schema_version": self._schema_version,
        }

    def _next_event_id(self):
        """Return one more than the largest integer key (1 when there is none)."""
        # Only integer keys take part, so string ids cannot break max().
        return max((key for key in self._data if isinstance(key, int)), default=0) + 1

    def _get_event(self, params):
        """Return a copy of the event whose id is params["id"]."""
        event_id = params.get("id")
        if event_id not in self._data:
            return {"success": False, "error": f"Event {event_id} not found"}
        return {"success": True, "event": copy.deepcopy(self._data[event_id])}

    def _create_event(self, params):
        """Store a new event built from *params* (status defaults to "scheduled")."""
        # NOTE(review): every drift-added required field is enforced here,
        # including ones that look cancel-specific (e.g. "cancellation_reason")
        # and "itemized", which the cancel path skips; confirm that is intended.
        for field in self._required_fields:
            if field not in params:
                return self._missing_field_error(field)
        # Generate an id only when the caller did not supply one.
        event_id = params["id"] if "id" in params else self._next_event_id()
        self._data[event_id] = {
            **params,
            "id": event_id,
            "status": params.get("status", "scheduled"),
        }
        return {"success": True, "event": copy.deepcopy(self._data[event_id])}

    def _reschedule_event(self, params):
        """Move an event to a new time and mark it "rescheduled"."""
        event_id = params.get("id")
        if event_id not in self._data:
            return {"success": False, "error": f"Event {event_id} not found"}
        time_field = self._field_renames.get("time", "time")
        # Accept the post-drift field name as well, so a caller that adapted
        # to the rename actually changes the time.
        new_time = (
            params.get("time")
            or params.get("reservation_time")
            or params.get(time_field)
        )
        event = self._data[event_id]
        if new_time:
            event[time_field] = new_time
        # The status is updated even when no new time was supplied.
        event["status"] = "rescheduled"
        return {"success": True, "event": copy.deepcopy(event)}

    def _cancel_event(self, params):
        """Cancel an event, enforcing drift-added required fields."""
        event_id = params.get("id")
        if event_id not in self._data:
            return {"success": False, "error": f"Event {event_id} not found"}
        for field in self._required_fields:
            # "itemized" is never demanded for cancellations.
            if field != "itemized" and field not in params:
                return self._missing_field_error(field)
        # A "status_values" policy override switches the default status.
        default_status = "removed" if self._policy_overrides.get("status_values") else "cancelled"
        event = self._data[event_id]
        event["status"] = params.get("status", default_status)
        if "cancellation_reason" in params:
            event["cancellation_reason"] = params["cancellation_reason"]
        return {"success": True, "event": copy.deepcopy(event)}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class EmailTool(BaseTool):
    """Simulated mailbox: seeded records form the inbox, sent mail the outbox."""

    def __init__(self):
        super().__init__()
        self._outbox = []

    def seed(self, data):
        """Load the inbox from *data* using the base-class keying rules."""
        super().seed(data)

    def execute(self, action, params):
        """Run one email *action* with *params* and return a result dict.

        Supported actions: "list_emails", "read_email" (marks the message
        "read") and "send" (appends to the outbox; requires "to").
        """
        if action == "list_emails":
            return {"success": True, "emails": self.snapshot()}
        if action == "read_email":
            return self._read_email(params)
        if action == "send":
            return self._send(params)
        return {"success": False, "error": f"Unknown email action: {action}"}

    def _read_email(self, params):
        """Mark the requested inbox message as read and return a copy of it."""
        email_id = params.get("id")
        if email_id not in self._data:
            return {"success": False, "error": f"Email {email_id} not found"}
        self._data[email_id]["status"] = "read"
        return {"success": True, "email": copy.deepcopy(self._data[email_id])}

    def _send(self, params):
        """Append a message to the outbox; fails when "to" is empty."""
        recipient = params.get("to", "")
        if not recipient:
            return {"success": False, "error": "Missing 'to' field"}
        message = {
            "to": recipient,
            "subject": params.get("subject", ""),
            "body": params.get("body", ""),
            "status": "sent",
        }
        cc_field = self._field_renames.get("cc", "cc")
        # Accept the post-drift field name too, so a caller that adapted to a
        # rename does not silently lose the copy recipients.
        cc = params.get("cc") or params.get("carbon_copy", "") or params.get(cc_field, "")
        if cc:
            message[cc_field] = cc
        self._outbox.append(message)
        return {"success": True, "output": f"Email sent to {recipient}"}

    def snapshot(self):
        """Return deep copies of the inbox and outbox as {"inbox", "outbox"}."""
        # Both halves are copied so callers cannot mutate tool state.
        return {
            "inbox": copy.deepcopy(list(self._data.values())),
            "outbox": copy.deepcopy(self._outbox),
        }
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class BookingsTool(BaseTool):
    """Simulated bookings store supporting list/get/update/create actions."""

    def execute(self, action, params):
        """Run one bookings *action* with *params* and return a result dict."""
        if action == "list_bookings":
            return {"success": True, "bookings": self.snapshot()}

        if action == "get_booking":
            booking_id = params.get("id")
            if booking_id not in self._data:
                return {"success": False, "error": f"Booking {booking_id} not found"}
            return {"success": True, "booking": copy.deepcopy(self._data[booking_id])}

        if action == "update_booking":
            booking_id = params.get("id")
            if booking_id not in self._data:
                return {"success": False, "error": f"Booking {booking_id} not found"}
            # Every parameter except the id is written onto the record.
            self._data[booking_id].update(
                {key: value for key, value in params.items() if key != "id"}
            )
            return {"success": True, "booking": copy.deepcopy(self._data[booking_id])}

        if action == "create_booking":
            if "id" in params:
                booking_id = params["id"]
            else:
                # Generate an id only when needed, and only from integer keys
                # so that string ids in the store cannot break max().
                booking_id = max(
                    (key for key in self._data if isinstance(key, int)), default=0
                ) + 1
            self._data[booking_id] = {
                **params,
                "id": booking_id,
                "status": params.get("status", "confirmed"),
            }
            return {"success": True, "booking": copy.deepcopy(self._data[booking_id])}

        return {"success": False, "error": f"Unknown bookings action: {action}"}
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class TravelTool(BaseTool):
    """Simulated travel catalogue supporting "list_options" and "book"."""

    _IDENTIFIER_KEYS = ("id", "flight", "hotel", "transport")

    def execute(self, action, params):
        """Run one travel *action* with *params* and return a result dict.

        "book" looks the option up by the first of "id", "flight", "hotel" or
        "transport" present in *params*, rejects it with "policy_violated"
        when its cost exceeds the "limit"/"budget" policy override, and
        otherwise marks it "booked".
        """
        if action == "list_options":
            return {"success": True, "options": self.snapshot()}
        if action != "book":
            return {"success": False, "error": f"Unknown travel action: {action}"}

        item_id = next(
            (params[key] for key in self._IDENTIFIER_KEYS if params.get(key) is not None),
            None,
        )
        option = self._find_option(item_id)
        if option is None:
            return {"success": False, "error": f"Travel option not found: {item_id}"}

        # Cost is the price, else rate * nights, else the flat cost.
        cost = (
            option.get("price")
            or option.get("rate", 0) * option.get("nights", 1)
            or option.get("cost", 0)
        )
        limit = (
            self._policy_overrides.get("limit")
            or self._policy_overrides.get("budget")
            or 99999
        )
        if cost > limit:
            return {
                "success": False,
                "error": f"Cost ${cost} exceeds limit ${limit}",
                "policy_violated": True,
            }
        option["status"] = "booked"
        return {"success": True, "booking": copy.deepcopy(option)}

    def _find_option(self, item_id):
        """Return the stored option identified by *item_id*, or None."""
        if item_id is None:
            # Without an identifier nothing may match; comparing None with a
            # missing "flight"/"hotel"/"transport" key would otherwise select
            # an arbitrary option.
            return None
        for key, option in self._data.items():
            if key == item_id or item_id in (
                option.get("flight"),
                option.get("hotel"),
                option.get("transport"),
            ):
                return option
        return None
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
class DocsTool(BaseTool):
    """Simulated document store supporting list/get/create actions."""

    def execute(self, action, params):
        """Run one docs *action* with *params* and return a result dict.

        New documents get the "format" policy override (default "markdown")
        and the status "created".
        """
        if action == "list_docs":
            return {"success": True, "docs": self.snapshot()}

        if action == "get_doc":
            doc_id = params.get("id")
            if doc_id not in self._data:
                return {"success": False, "error": f"Doc {doc_id} not found"}
            return {"success": True, "doc": copy.deepcopy(self._data[doc_id])}

        if action == "create_doc":
            doc_id = params.get("id", f"doc-{len(self._data)+1}")
            record = dict(params)
            record.update(
                id=doc_id,
                format=self._policy_overrides.get("format", "markdown"),
                status="created",
            )
            self._data[doc_id] = record
            return {"success": True, "doc": copy.deepcopy(record)}

        return {"success": False, "error": f"Unknown docs action: {action}"}
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
class ExpensesTool(BaseTool):
    """Simulated expense ledger supporting "list_expenses" and "submit_expense"."""

    def execute(self, action, params):
        """Run one expenses *action* with *params* and return a result dict.

        Submitting a "meal" expense above the "limit"/"meal_limit" policy
        override reports "requires_approval" and leaves the record unchanged;
        any other existing expense is marked "submitted".
        """
        if action == "list_expenses":
            return {"success": True, "expenses": self.snapshot()}
        if action != "submit_expense":
            return {"success": False, "error": f"Unknown expenses action: {action}"}

        expense_id = params.get("id")
        if expense_id not in self._data:
            return {"success": False, "error": f"Expense {expense_id} not found"}

        record = self._data[expense_id]
        amount = record.get("amount", 0)
        overrides = self._policy_overrides
        limit = overrides.get("limit") or overrides.get("meal_limit") or 99999
        over_meal_limit = record.get("type", "") == "meal" and amount > limit
        if over_meal_limit:
            return {
                "success": True,
                "output": f"Submitted but requires VP approval (${amount} > ${limit} meal limit)",
                "requires_approval": True,
            }

        record["status"] = "submitted"
        return {"success": True, "expense": copy.deepcopy(record)}
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
class RoomsTool(BaseTool):
    """Simulated room inventory supporting "list_rooms" and "book_room"."""

    def execute(self, action, params):
        """Run one rooms *action* with *params* and return a result dict.

        "book_room" fails for an unknown room or when a drift-added required
        field is absent from *params*; otherwise the room is marked "booked"
        and an optional "purpose" is stored on it.
        """
        if action == "list_rooms":
            return {"success": True, "rooms": self.snapshot()}
        if action != "book_room":
            return {"success": False, "error": f"Unknown rooms action: {action}"}

        room_id = params.get("id")
        if room_id not in self._data:
            return {"success": False, "error": f"Room {room_id} not found"}

        for field in self._required_fields:
            if field in params:
                continue
            return {
                "success": False,
                "error": f"Missing required field: {field}",
                "schema_version": self._schema_version,
            }

        room = self._data[room_id]
        room["status"] = "booked"
        if "purpose" in params:
            room["purpose"] = params["purpose"]
        return {"success": True, "room": copy.deepcopy(room)}
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class TeamTool(BaseTool):
    """Simulated team directory supporting "list_members" and "get_member"."""

    def execute(self, action, params):
        """Run one team *action* with *params* and return a result dict.

        "get_member" returns a copy of the first stored member whose "email"
        equals params["email"].
        """
        if action == "list_members":
            return {"success": True, "members": self.snapshot()}

        if action == "get_member":
            wanted = params.get("email")
            for member in self._data.values():
                if member.get("email") == wanted:
                    return {"success": True, "member": copy.deepcopy(member)}
            return {"success": False, "error": f"Member not found: {wanted}"}

        return {"success": False, "error": f"Unknown team action: {action}"}
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
class IncidentsTool(BaseTool):
    """Simulated incident tracker supporting list/get/escalate actions."""

    def execute(self, action, params):
        """Run one incidents *action* with *params* and return a result dict.

        "escalate" sets the incident's status to "escalated" and stores
        params["to"] (default "") as "escalated_to".
        """
        if action == "list_incidents":
            return {"success": True, "incidents": self.snapshot()}

        if action not in ("get_incident", "escalate"):
            return {"success": False, "error": f"Unknown incidents action: {action}"}

        incident_id = params.get("id")
        if incident_id not in self._data:
            return {"success": False, "error": f"Incident {incident_id} not found"}

        incident = self._data[incident_id]
        if action == "escalate":
            incident["status"] = "escalated"
            incident["escalated_to"] = params.get("to", "")
        return {"success": True, "incident": copy.deepcopy(incident)}
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# Registry mapping each tool name to the class that implements it.
ALL_TOOLS = dict(
    calendar=CalendarTool,
    email=EmailTool,
    bookings=BookingsTool,
    travel=TravelTool,
    docs=DocsTool,
    expenses=ExpensesTool,
    rooms=RoomsTool,
    team=TeamTool,
    incidents=IncidentsTool,
)
|
verifier.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SchemaShift EA Arena Verifier — deterministic scoring.
|
| 2 |
+
|
| 3 |
+
Score (100 points):
|
| 4 |
+
Task Completion: 30 pts (final state matches target)
|
| 5 |
+
Policy Compliance: 20 pts (no policy violations)
|
| 6 |
+
Notifications: 15 pts (all required emails sent)
|
| 7 |
+
Drift Recovery: 15 pts (adapted to schema changes)
|
| 8 |
+
Tool Efficiency: 10 pts (minimal tool calls)
|
| 9 |
+
Action Hygiene: 10 pts (no invalid calls)
|
| 10 |
+
|
| 11 |
+
Verdict: PASS ≥ 85 with zero policy violations, HOLD ≥ 55 (including ≥ 85 with violations), BLOCK < 55
|
| 12 |
+
"""
|
| 13 |
+
import copy
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
_EMAIL_TOOL_NAMES = ("email", "emails")


def _actual_items_for(tool_name, snapshots):
    """Return the list of actual records to compare against for *tool_name*."""
    actual = snapshots.get(tool_name)
    if actual is None and tool_name in _EMAIL_TOOL_NAMES:
        # Targets may say "emails" while the snapshot is keyed "email" (or the
        # reverse); fall back to whichever spelling is present.
        for alias in _EMAIL_TOOL_NAMES:
            if snapshots.get(alias) is not None:
                actual = snapshots[alias]
                break
    if actual is None:
        return []
    if isinstance(actual, dict):
        if tool_name in _EMAIL_TOOL_NAMES:
            # Only sent mail counts towards the target.
            return actual.get("outbox") or []
        return list(actual.values())
    return actual


def _record_matches(expected, record):
    """Return True when *record* satisfies every condition in *expected*.

    Condition keys:
      "<field>_contains" -- case-insensitive substring test on the field
                            ("title_contains" searches title + subject)
      "<field>_include"  -- membership test in the field's collection
      "contains"         -- case-insensitive substring test on body + subject
      anything else      -- exact equality with the record's value
    """
    for key, wanted in expected.items():
        if key.endswith("_contains"):
            field = key.replace("_contains", "")
            if field == "title":
                haystack = str(record.get("title") or "") + str(record.get("subject") or "")
            else:
                haystack = str(record.get(field, ""))
            # str() keeps non-string expectations (numbers, ...) from crashing.
            if str(wanted).lower() not in haystack.lower():
                return False
        elif key.endswith("_include"):
            container = record.get(key.replace("_include", "")) or []
            try:
                present = wanted in container
            except TypeError:
                # The stored value is not a container at all.
                present = False
            if not present:
                return False
        elif key == "contains":
            text = str(record.get("body", "")) + str(record.get("subject", ""))
            if str(wanted).lower() not in text.lower():
                return False
        elif record.get(key) != wanted:
            return False
    return True


def check_target_match(target, snapshots):
    """Return the fraction of target items found in the final snapshots.

    *target* maps a tool name to a list of expected-item dicts; *snapshots*
    maps a tool name to that tool's snapshot (a list of records, or for email
    a dict with an "outbox"). An expected item counts as matched when any
    dict record of that tool satisfies all of its conditions. Returns 1.0
    when the target lists no items.
    """
    matched = 0
    total = 0

    for tool_name, expected_items in target.items():
        records = [
            record
            for record in _actual_items_for(tool_name, snapshots)
            if isinstance(record, dict)
        ]
        for expected in expected_items:
            total += 1
            if not isinstance(expected, dict):
                # A malformed expectation can never be satisfied.
                continue
            if any(_record_matches(expected, record) for record in records):
                matched += 1

    return (matched / total) if total > 0 else 1.0
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def check_notifications(target, email_snapshot):
    """Return the fraction of expected emails whose recipient was mailed.

    Expected emails come from target["emails"] (or target["email"]).
    *email_snapshot* is either a dict with an "outbox" list or the outbox
    list itself; anything else is treated as an empty outbox. Only the "to"
    address is compared -- message content is not inspected here. Returns 1.0
    when the target expects no emails.
    """
    if "email" not in target and "emails" not in target:
        return 1.0

    expected_emails = target.get("emails", target.get("email", []))
    if not expected_emails:
        return 1.0

    if isinstance(email_snapshot, dict):
        outbox = email_snapshot.get("outbox") or []
    elif isinstance(email_snapshot, list):
        outbox = email_snapshot
    else:
        outbox = []

    # Entries that are not dicts cannot describe a message; skip them rather
    # than fail the whole scoring run.
    recipients = [message.get("to", "") for message in outbox if isinstance(message, dict)]

    sent = sum(
        1
        for expected in expected_emails
        if isinstance(expected, dict) and expected.get("to", "") in recipients
    )
    return sent / len(expected_emails)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def verify_episode(task, snapshots, policy_violations, invalid_calls,
                   tool_calls_made, drift_events_handled, recovered_from_drift):
    """Score an episode. Returns (reward, violations, verdict).

    The score (max 100) is the sum of six components:
      task completion   30 * fraction of target items matched
      policy compliance 20 minus 10 per policy violation (floor 0)
      notifications     15 * fraction of expected emails sent
      drift recovery    15 if recovered (or the task has no drift),
                        8 if drift events were handled, else 0
      tool efficiency   10 up to half of max_steps calls, falling linearly
                        to 0 at max_steps calls
      action hygiene    10 minus 3 per invalid call (floor 0)

    Decision: "PASS" needs score >= 85 and zero policy violations, "HOLD"
    needs score >= 55, anything else is "BLOCK". Rewards are 1.0 / 0.3 / -0.5
    respectively. The verdict dict carries the decision, score, letter grade,
    reward, per-component breakdown and the list of violation messages.
    """
    violations = []
    target = task.get("target", {})

    # 1. Task Completion (30 pts)
    completion = check_target_match(target, snapshots)
    completion_pts = round(completion * 30, 1)

    # 2. Policy Compliance (20 pts)
    compliance_pts = max(0, 20 - policy_violations * 10)
    if policy_violations:
        violations.append(f"{policy_violations} policy violation(s)")

    # 3. Notifications (15 pts)
    notif_score = check_notifications(target, snapshots.get("email", []))
    notif_pts = round(notif_score * 15, 1)
    if notif_score < 1.0:
        violations.append("Missing notifications")

    # 4. Drift Recovery (15 pts)
    if task.get("drift_at_step") is None:
        drift_pts = 15.0  # No drift = full credit
    elif recovered_from_drift:
        drift_pts = 15.0
    elif drift_events_handled > 0:
        drift_pts = 8.0
    else:
        drift_pts = 0.0
        violations.append("Failed to recover from schema drift")

    # 5. Tool Efficiency (10 pts)
    max_steps = task.get("max_steps", 15)
    if not isinstance(max_steps, (int, float)) or max_steps <= 0:
        # A missing/zero budget would divide by zero below; use the default.
        max_steps = 15
    half_budget = max_steps * 0.5
    efficiency = max(0, 1 - (tool_calls_made - half_budget) / half_budget)
    efficiency_pts = round(min(10, efficiency * 10), 1)

    # 6. Action Hygiene (10 pts)
    hygiene_pts = max(0, 10 - invalid_calls * 3)
    if invalid_calls:
        violations.append(f"{invalid_calls} invalid call(s)")

    # Total
    score = round(min(100, completion_pts + compliance_pts + notif_pts +
                      drift_pts + efficiency_pts + hygiene_pts), 1)

    # Verdict
    if score >= 85 and policy_violations == 0:
        decision = "PASS"
    elif score >= 55:
        decision = "HOLD"
    else:
        decision = "BLOCK"

    grade = "A" if score >= 90 else "B" if score >= 80 else "C" if score >= 70 else "D" if score >= 60 else "F"
    reward = 1.0 if decision == "PASS" else 0.3 if decision == "HOLD" else -0.5

    return reward, violations, {
        "decision": decision, "score": score, "grade": grade, "reward": reward,
        "breakdown": {
            "task_completion": {"points": completion_pts, "max": 30, "match_rate": round(completion, 3)},
            "policy_compliance": {"points": compliance_pts, "max": 20, "violations": policy_violations},
            "notifications": {"points": notif_pts, "max": 15, "sent_rate": round(notif_score, 3)},
            "drift_recovery": {"points": drift_pts, "max": 15, "recovered": recovered_from_drift},
            "tool_efficiency": {"points": efficiency_pts, "max": 10, "calls": tool_calls_made},
            "action_hygiene": {"points": hygiene_pts, "max": 10, "invalid": invalid_calls},
        },
        "violations": violations,
    }
|