Spaces:

SidraMiconi
/

schemashift

Sleeping

App Files Files Community

SidraMiconi committed on Mar 8

Commit

a17a9f5

1 Parent(s): 6c88a2c

deploy SchemaShift

Browse files

Files changed (14) hide show

Dockerfile +9 -0
README.md +5 -7
__pycache__/models.cpython-312.pyc +0 -0
__pycache__/schemashift_environment.cpython-312.pyc +0 -0
__pycache__/tasks.cpython-312.pyc +0 -0
__pycache__/tools.cpython-312.pyc +0 -0
__pycache__/verifier.cpython-312.pyc +0 -0
app.py +38 -0
models.py +36 -0
requirements.txt +4 -0
schemashift_environment.py +162 -0
tasks.py +364 -0
tools.py +270 -0
verifier.py +159 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,9 @@

+# Base image: slim Python 3.10.
+FROM python:3.10-slim
+# Create an unprivileged account with UID 1000 and run every later step as that user.
+RUN useradd -m -u 1000 user
+USER user
+# Put the user's ~/.local/bin on PATH; presumably this is where the pip step below
+# places console scripts such as uvicorn when run as a non-root user (confirm).
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+# requirements.txt is copied and installed before the rest of the sources;
+# presumably so the dependency layer is reused when only application code changes.
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+# Serve the FastAPI object `app` from app.py on all interfaces, port 7860
+# (the same port the README front matter declares as app_port).
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,12 +1,10 @@
 ---
-title: Schemashift
-emoji: 👀
-colorFrom: indigo
-colorTo: red
 sdk: docker
 pinned: false
 license: apache-2.0
-short_description: Executive assistant environment with schema drift workflows
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: SchemaShift EA Arena
+emoji: 🔄
+colorFrom: purple
+colorTo: orange
 sdk: docker
 pinned: false
 license: apache-2.0
+app_port: 7860
 ---

__pycache__/models.cpython-312.pyc ADDED Viewed

Binary file (2.05 kB). View file

__pycache__/schemashift_environment.cpython-312.pyc ADDED Viewed

Binary file (9.23 kB). View file

__pycache__/tasks.cpython-312.pyc ADDED Viewed

Binary file (12.3 kB). View file

__pycache__/tools.cpython-312.pyc ADDED Viewed

Binary file (18.1 kB). View file

__pycache__/verifier.cpython-312.pyc ADDED Viewed

Binary file (6.24 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,38 @@

+"""SchemaShift EA Arena — FastAPI server (HF Spaces flat structure)."""
+from fastapi import FastAPI
+from pydantic import BaseModel
+from schemashift_environment import SchemaShiftEnvironment
+from models import EAAction
+# ASGI application object; the Dockerfile's `uvicorn app:app` command serves it.
+app = FastAPI(title="SchemaShift EA Arena")
+# Single environment instance created at import time and used by every route handler in this module.
+env = SchemaShiftEnvironment()
+class StepRequest(BaseModel):
+    """Request body accepted by POST /step."""
+    # Raw action payload; the /step handler unpacks it as EAAction(**action).
+    action: dict
+@app.get("/health")
+def health():
+    """Report liveness together with the environment name and task count."""
+    payload = {
+        "status": "healthy",
+        "environment": "schemashift-ea-arena",
+        "tasks": 12,
+    }
+    return payload
+@app.post("/reset")
+def reset():
+    """Begin a new episode and return its opening observation with zero reward."""
+    first_observation = env.reset()
+    return dict(
+        observation=first_observation.model_dump(),
+        reward=0.0,
+        done=False,
+    )
+@app.post("/step")
+def step(req: StepRequest):
+    """Apply one agent action to the shared environment.
+
+    The request's ``action`` dict is validated into an ``EAAction``. A payload
+    that fails validation (for example a non-dict ``parameters``) is answered
+    with HTTP 422 instead of escaping the handler as an unhandled exception,
+    which the server would report as HTTP 500.
+
+    Returns a dict with the dumped observation plus its ``reward`` and ``done``.
+    """
+    # Imported here so the handler does not depend on edits to the module's import block.
+    from fastapi import HTTPException
+    from pydantic import ValidationError
+    try:
+        action = EAAction(**req.action)
+    except ValidationError as exc:
+        raise HTTPException(status_code=422, detail=str(exc)) from exc
+    obs = env.step(action)
+    return {"observation": obs.model_dump(), "reward": obs.reward, "done": obs.done}
+@app.get("/state")
+def state():
+    """Return the dumped episode state, or an error payload when no episode exists yet."""
+    current = env.state
+    if current is not None:
+        return current.model_dump()
+    return {"error": "No active episode. Call /reset first."}

models.py ADDED Viewed

	@@ -0,0 +1,36 @@

+"""SchemaShift EA Arena — Pydantic v2 models."""
+from pydantic import BaseModel, Field
+from typing import Optional
+class EAAction(BaseModel):
+    """One agent action: which tool to call, which operation, and its arguments."""
+    # Name the environment looks up in its tool registry; "system" together
+    # with action "submit" ends the episode instead of calling a tool.
+    tool: str = ""
+    # Operation name handed to the tool's execute() (e.g. "list_events").
+    action: str = ""
+    # Arguments handed to the tool's execute() alongside the operation name.
+    parameters: dict = Field(default_factory=dict)
+class EAObservation(BaseModel):
+    """What the environment returns from reset() and step()."""
+    # Whether the call succeeded (for tool calls, the tool result's "success" flag).
+    success: bool = False
+    # Human-readable payload: task text on reset, JSON-dumped tool result or verdict on step.
+    output: str = ""
+    # Error text when the call failed, otherwise None.
+    error: Optional[str] = None
+    # Score from the verifier on the terminal observation; -1.0 when step() is called before reset().
+    reward: float = 0.0
+    # True once the episode has ended.
+    done: bool = False
+    # Number of step() calls made so far in the episode.
+    step_count: int = 0
+    # Task description; filled in by reset().
+    task_description: str = ""
+    # Schema version reported by the tool that handled the call (1 when unknown).
+    schema_version: int = 1
+    # True on the step where the task's drift event was injected.
+    drift_occurred: bool = False
+class EpisodeState(BaseModel):
+    """Bookkeeping the environment keeps for the episode in progress."""
+    # Identifier and description copied from the task definition.
+    task_id: str = ""
+    task_description: str = ""
+    # step() calls made so far, and the cap at which the episode is auto-submitted.
+    step_count: int = 0
+    max_steps: int = 20
+    # Set to True when the episode has been scored.
+    completed: bool = False
+    # Verdict dict returned by the verifier at submission.
+    verdict: dict = Field(default_factory=dict)
+    # "<tool>.<action>" strings, one per tool call that reached a known tool.
+    tools_used: list = Field(default_factory=list)
+    # Failed tool calls whose result carried "policy_violated".
+    policy_violations: int = 0
+    # Unknown-tool calls plus failed calls whose result had no "schema_version" key.
+    invalid_calls: int = 0
+    # The "change" string of each drift event injected so far.
+    drift_events: list = Field(default_factory=list)
+    # True once a tool call has succeeded after drift was applied.
+    recovered_from_drift: bool = False
+    # "to" addresses read from the email tool's outbox at submission.
+    notifications_sent: list = Field(default_factory=list)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+openenv-core>=0.2.1
+fastapi
+uvicorn[standard]
+pydantic>=2.0

schemashift_environment.py ADDED Viewed

	@@ -0,0 +1,162 @@

+"""SchemaShift EA Arena Environment — reset/step/state with schema drift injection."""
+import os, json, copy
+from models import EAAction, EAObservation, EpisodeState
+from tasks import TASKS
+from tools import ALL_TOOLS, CalendarTool, EmailTool, BookingsTool, TravelTool, DocsTool, ExpensesTool, RoomsTool, TeamTool, IncidentsTool
+from verifier import verify_episode
+class SchemaShiftEnvironment:
+    """Episode-based environment that serves TASKS in order and injects schema drift.
+
+    ``reset()`` selects the next task and seeds its tools, ``step()`` routes an
+    action to a tool (or submits the episode for scoring), and ``state``
+    exposes the current ``EpisodeState``. At most one drift event fires per episode.
+    """
+    def __init__(self):
+        """Create an environment with no active episode."""
+        self._state = None  # EpisodeState of the current episode; None until reset()
+        self._task = None  # task dict from TASKS for the current episode
+        self._task_index = 0  # index of the next task to serve (wraps around)
+        self._tools = {}  # tool name -> seeded tool instance
+        self._drift_applied = False  # True once this episode's drift event has fired
+    def _setup_tools(self, seed):
+        """Instantiate and seed one tool per list-valued entry of ``seed``.
+
+        The "policies" entry and any key without a known tool class are skipped.
+        """
+        self._tools = {}
+        tool_map = {
+            "calendar": CalendarTool, "emails": EmailTool, "email": EmailTool,
+            "bookings": BookingsTool, "travel": TravelTool, "docs": DocsTool,
+            "expenses": ExpensesTool, "rooms": RoomsTool, "team": TeamTool,
+            "incidents": IncidentsTool,
+        }
+        for key, data in seed.items():
+            if key == "policies":
+                continue
+            cls = tool_map.get(key)
+            if cls and isinstance(data, list):
+                tool = cls()
+                tool.seed(data)
+                # Seeds use the plural "emails"; the tool is registered (and looked up) as "email".
+                name = "email" if key == "emails" else key
+                self._tools[name] = tool
+    def reset(self):
+        """Start the next task's episode and return its opening observation.
+
+        Tasks are served in list order and wrap around after the last one.
+        """
+        self._task = TASKS[self._task_index % len(TASKS)]
+        self._task_index += 1
+        self._drift_applied = False
+        self._setup_tools(self._task.get("seed", {}))
+        self._state = EpisodeState(
+            task_id=self._task["id"],
+            task_description=self._task["description"],
+            max_steps=self._task.get("max_steps", 15),
+        )
+        return EAObservation(
+            success=True,
+            output=f"TASK: {self._task['title']}\n\n{self._task['description']}",
+            task_description=self._task["description"],
+            done=False,
+            schema_version=1,
+        )
+    def _maybe_inject_drift(self):
+        """Fire the task's drift event once the step count reaches ``drift_at_step``.
+
+        Returns the drift event dict on the step where it fires, otherwise None.
+        """
+        drift_step = self._task.get("drift_at_step")
+        if not drift_step or self._drift_applied or self._state.step_count < drift_step:
+            return None
+        drift = self._task.get("drift_event", {})
+        tool_name = drift.get("tool", "")
+        if tool_name == "emails":
+            tool_name = "email"
+        tool = self._tools.get(tool_name)
+        if tool:
+            tool.apply_drift(drift)
+        # The event counts as applied even when its target tool is not seeded for this task.
+        self._drift_applied = True
+        self._state.drift_events.append(drift.get("change", "unknown"))
+        return drift
+    @staticmethod
+    def _drift_message(drift):
+        """Render the warning appended to the observation on the step where ``drift`` fires."""
+        if not drift:
+            return ""
+        dtype = drift.get("type", "")
+        if dtype == "schema_change":
+            return f"\n⚠️ SCHEMA CHANGE: {drift.get('change', '')}. Check tool documentation."
+        if dtype == "policy_change":
+            return f"\n⚠️ POLICY CHANGE: {drift.get('change', '')}. Review updated policies."
+        if dtype == "actor_conflict":
+            return f"\n⚠️ NEW MESSAGE from {drift.get('actor', 'unknown')}: \"{drift.get('message', '')}\""
+        return ""
+    def step(self, action):
+        """Execute one action and return the resulting observation.
+
+        ``action`` may be an ``EAAction`` or a plain dict with "tool", "action"
+        and "parameters" keys. The pair tool="system", action="submit" scores the
+        episode. The episode is also scored automatically once ``max_steps`` is
+        reached, whether or not the last action named a known tool.
+        """
+        if self._state is None:
+            return EAObservation(success=False, error="Call reset() first", reward=-1.0, done=True)
+        if self._state.completed:
+            # A scored episode is final: do not mutate tools or re-run the verifier.
+            return EAObservation(
+                success=False,
+                error="Episode already completed. Call reset() to start a new one.",
+                done=True,
+                step_count=self._state.step_count,
+            )
+        self._state.step_count += 1
+        tool_name = action.tool if hasattr(action, 'tool') else action.get('tool', '')
+        act = action.action if hasattr(action, 'action') else action.get('action', '')
+        params = action.parameters if hasattr(action, 'parameters') else action.get('parameters', {})
+        if params is None:
+            # A dict action may carry an explicit null; tools expect a mapping.
+            params = {}
+        drift = self._maybe_inject_drift()
+        drift_msg = self._drift_message(drift)
+        if tool_name == "system" and act == "submit":
+            return self._submit()
+        tool = self._tools.get(tool_name)
+        if not tool:
+            self._state.invalid_calls += 1
+            if self._state.step_count >= self._state.max_steps:
+                # Unknown-tool calls consume steps too, so they must also end the episode.
+                return self._submit()
+            return EAObservation(
+                success=False, error=f"Unknown tool: {tool_name}{drift_msg}",
+                step_count=self._state.step_count,
+                drift_occurred=bool(drift),
+            )
+        self._state.tools_used.append(f"{tool_name}.{act}")
+        result = tool.execute(act, params)
+        if not result.get("success", False):
+            if result.get("policy_violated"):
+                self._state.policy_violations += 1
+            elif "schema_version" not in result:
+                # Failures that report a schema_version are drift-induced and not counted as invalid.
+                self._state.invalid_calls += 1
+        if self._drift_applied and result.get("success"):
+            self._state.recovered_from_drift = True
+        output = json.dumps(result, indent=2) if isinstance(result, dict) else str(result)
+        output += drift_msg
+        if self._state.step_count >= self._state.max_steps:
+            return self._submit()
+        return EAObservation(
+            success=result.get("success", False),
+            output=output,
+            error=result.get("error"),
+            step_count=self._state.step_count,
+            schema_version=getattr(tool, '_schema_version', 1),
+            drift_occurred=bool(drift),
+        )
+    def _submit(self):
+        """Score the episode with ``verify_episode`` and return the terminal observation."""
+        snapshots = {name: tool.snapshot() for name, tool in self._tools.items()}
+        email_snap = snapshots.get("email")
+        if isinstance(email_snap, dict):
+            self._state.notifications_sent = [e.get("to", "") for e in email_snap.get("outbox", [])]
+        reward, violations, verdict = verify_episode(
+            task=self._task,
+            snapshots=snapshots,
+            policy_violations=self._state.policy_violations,
+            invalid_calls=self._state.invalid_calls,
+            tool_calls_made=self._state.step_count,
+            drift_events_handled=len(self._state.drift_events),
+            recovered_from_drift=self._state.recovered_from_drift,
+        )
+        self._state.completed = True
+        self._state.verdict = verdict
+        return EAObservation(
+            success=True,
+            output=json.dumps(verdict, indent=2),
+            reward=reward,
+            done=True,
+            step_count=self._state.step_count,
+        )
+    @property
+    def state(self):
+        """Current ``EpisodeState``, or None before the first ``reset()``."""
+        return self._state

tasks.py ADDED Viewed

	@@ -0,0 +1,364 @@

+"""
+SchemaShift EA Arena — Task Templates
+12 tasks across 3 difficulty tiers with schema drift events.
+Each task simulates a real executive assistant workflow where
+APIs, forms, and policies change mid-episode.
+"""
+TASKS = [
+    # ═══════════════════════════════════════════════════════════
+    #  TIER 1: Simple (3-4 tool calls, 1 drift event)
+    # ═══════════════════════════════════════════════════════════
+    {
+        "id": "reschedule_dinner",
+        "title": "Reschedule dinner due to meeting conflict",
+        "description": (
+            "Your VP moved the board prep meeting to 6:30 PM tonight. "
+            "You have dinner with Alex at 7:00 PM at Lucia's. "
+            "Reschedule dinner to 8:30 PM, update the restaurant booking, "
+            "and email Alex about the change."
+        ),
+        "seed": {
+            "calendar": [
+                {"id": 1, "title": "Board Prep", "time": "15:00", "attendees": ["vp@company.com"], "status": "scheduled"},
+                {"id": 2, "title": "Dinner with Alex", "time": "19:00", "location": "Lucia's", "attendees": ["alex@friends.com"], "status": "scheduled"},
+            ],
+            "bookings": [
+                {"id": 101, "restaurant": "Lucia's", "time": "19:00", "party_size": 2, "status": "confirmed"},
+            ],
+            "emails": [],
+            "policies": {"max_booking_changes": 3},
+        },
+        "drift_at_step": 2,
+        "drift_event": {"type": "schema_change", "tool": "bookings", "change": "time_field_renamed", "old_field": "time", "new_field": "reservation_time"},
+        "target": {
+            "calendar": [{"id": 2, "time": "20:30", "status": "rescheduled"}],
+            "bookings": [{"id": 101, "reservation_time": "20:30", "status": "confirmed"}],
+            "emails": [{"to": "alex@friends.com", "contains": "reschedule"}],
+        },
+        "max_steps": 10,
+    },
+    {
+        "id": "book_travel_simple",
+        "title": "Book a flight for Monday meeting",
+        "description": (
+            "Book a flight from SFO to LAX for Monday morning. "
+            "The meeting is at 2 PM so arrive by noon. "
+            "Email travel@company.com with the booking confirmation."
+        ),
+        "seed": {
+            "calendar": [
+                {"id": 1, "title": "LA Client Meeting", "time": "14:00", "date": "2026-03-09", "location": "LA Office", "attendees": ["client@partner.com"], "status": "scheduled"},
+            ],
+            "travel": [
+                {"flight": "UA101", "from": "SFO", "to": "LAX", "depart": "07:00", "arrive": "08:30", "price": 189, "status": "available"},
+                {"flight": "UA205", "from": "SFO", "to": "LAX", "depart": "09:00", "arrive": "10:30", "price": 249, "status": "available"},
+            ],
+            "emails": [],
+            "policies": {"max_flight_cost": 300},
+        },
+        "drift_at_step": 2,
+        "drift_event": {"type": "policy_change", "tool": "travel", "change": "cost_limit_lowered", "old_limit": 300, "new_limit": 200},
+        "target": {
+            "travel": [{"flight": "UA101", "status": "booked"}],
+            "emails": [{"to": "travel@company.com", "contains": "booking"}],
+        },
+        "max_steps": 8,
+    },
+    {
+        "id": "reply_email_urgent",
+        "title": "Reply to urgent client email",
+        "description": (
+            "Client Sarah at sarah@bigcorp.com sent an urgent email asking "
+            "about the Q2 proposal deadline. The deadline is March 15. "
+            "Reply to her email with the deadline and CC your manager mgr@company.com."
+        ),
+        "seed": {
+            "emails": [
+                {"id": 1, "from": "sarah@bigcorp.com", "subject": "Q2 Proposal Deadline?", "body": "Hi, when is the Q2 proposal due? We need to plan resources.", "status": "unread"},
+            ],
+            "docs": [
+                {"id": "q2-proposal", "title": "Q2 Proposal", "deadline": "2026-03-15", "status": "draft"},
+            ],
+            "policies": {"reply_within_hours": 2, "cc_manager_on_client": True},
+        },
+        "drift_at_step": 1,
+        "drift_event": {"type": "schema_change", "tool": "emails", "change": "cc_field_renamed", "old_field": "cc", "new_field": "carbon_copy"},
+        "target": {
+            "emails": [{"to": "sarah@bigcorp.com", "contains": "March 15", "carbon_copy": "mgr@company.com"}],
+        },
+        "max_steps": 6,
+    },
+    {
+        "id": "cancel_meeting_notify",
+        "title": "Cancel tomorrow's standup and notify team",
+        "description": (
+            "Cancel tomorrow's team standup (event 1) because the CEO "
+            "called an all-hands at the same time. Email the team list: "
+            "dev1@company.com, dev2@company.com, dev3@company.com."
+        ),
+        "seed": {
+            "calendar": [
+                {"id": 1, "title": "Team Standup", "time": "09:00", "date": "2026-03-08", "attendees": ["dev1@company.com", "dev2@company.com", "dev3@company.com"], "status": "scheduled"},
+                {"id": 2, "title": "CEO All-Hands", "time": "09:00", "date": "2026-03-08", "attendees": ["all@company.com"], "status": "scheduled"},
+            ],
+            "emails": [],
+            "policies": {},
+        },
+        "drift_at_step": 2,
+        "drift_event": {"type": "schema_change", "tool": "calendar", "change": "status_values_changed", "old_values": ["scheduled", "cancelled"], "new_values": ["active", "removed"]},
+        "target": {
+            "calendar": [{"id": 1, "status": "removed"}],
+            "emails": [{"to": "dev1@company.com"}, {"to": "dev2@company.com"}, {"to": "dev3@company.com"}],
+        },
+        "max_steps": 10,
+    },
+    # ═══════════════════════════════════════════════════════════
+    #  TIER 2: Medium (5-6 tool calls, 1 drift event)
+    # ═══════════════════════════════════════════════════════════
+    {
+        "id": "travel_with_approval",
+        "title": "Book international travel with manager approval",
+        "description": (
+            "Book a flight from SFO to London for the conference on March 20. "
+            "Budget is $2000. Book hotel for 3 nights near the venue. "
+            "Get manager approval (mgr@company.com) since international travel "
+            "requires it. Email travel@company.com with full itinerary."
+        ),
+        "seed": {
+            "travel": [
+                {"flight": "BA285", "from": "SFO", "to": "LHR", "depart": "19:00", "arrive": "13:00+1", "price": 1200, "status": "available"},
+                {"hotel": "Hilton Tower Bridge", "rate": 250, "nights": 3, "status": "available"},
+            ],
+            "calendar": [
+                {"id": 1, "title": "London Conference", "date": "2026-03-20", "location": "ExCeL London", "status": "scheduled"},
+            ],
+            "emails": [],
+            "policies": {"intl_travel_requires_approval": True, "max_hotel_rate": 300},
+        },
+        "drift_at_step": 3,
+        "drift_event": {"type": "policy_change", "tool": "travel", "change": "approval_requires_itemized", "new_requirement": "must include flight cost, hotel cost, and total in approval request"},
+        "target": {
+            "travel": [{"flight": "BA285", "status": "booked"}, {"hotel": "Hilton Tower Bridge", "status": "booked"}],
+            "emails": [{"to": "mgr@company.com", "contains": "approval"}, {"to": "travel@company.com", "contains": "itinerary"}],
+        },
+        "max_steps": 12,
+    },
+    {
+        "id": "conflict_resolution",
+        "title": "Resolve double-booked afternoon",
+        "description": (
+            "You have 3 meetings at 2 PM: client call (high priority), "
+            "team sync (can move), and 1:1 with intern (can move). "
+            "Keep the client call, move team sync to 3 PM, move 1:1 to 4 PM. "
+            "Email all affected attendees about changes."
+        ),
+        "seed": {
+            "calendar": [
+                {"id": 1, "title": "Client Call", "time": "14:00", "priority": "high", "attendees": ["client@partner.com"], "status": "scheduled"},
+                {"id": 2, "title": "Team Sync", "time": "14:00", "priority": "medium", "attendees": ["team@company.com"], "status": "scheduled"},
+                {"id": 3, "title": "1:1 with Intern", "time": "14:00", "priority": "low", "attendees": ["intern@company.com"], "status": "scheduled"},
+            ],
+            "emails": [],
+            "policies": {"notify_on_reschedule": True},
+        },
+        "drift_at_step": 3,
+        "drift_event": {"type": "actor_conflict", "tool": "calendar", "change": "attendee_requests_different_time", "actor": "team@company.com", "message": "3 PM doesn't work, can we do 3:30?"},
+        "target": {
+            "calendar": [
+                {"id": 1, "time": "14:00", "status": "scheduled"},
+                {"id": 2, "time": "15:30", "status": "rescheduled"},
+                {"id": 3, "time": "16:00", "status": "rescheduled"},
+            ],
+            "emails": [{"to": "team@company.com"}, {"to": "intern@company.com"}],
+        },
+        "max_steps": 12,
+    },
+    {
+        "id": "expense_report",
+        "title": "Submit expense report with receipt changes",
+        "description": (
+            "Submit expense report for last week's client dinner ($185) "
+            "and taxi ($42). Attach receipts, categorize correctly, "
+            "and email finance@company.com for approval."
+        ),
+        "seed": {
+            "expenses": [
+                {"id": 1, "type": "meal", "amount": 185, "description": "Client dinner at Nobu", "receipt": True, "status": "draft"},
+                {"id": 2, "type": "transport", "amount": 42, "description": "Taxi to restaurant", "receipt": True, "status": "draft"},
+            ],
+            "emails": [],
+            "policies": {"meal_limit": 200, "require_receipt_over": 25, "approval_required_over": 100},
+        },
+        "drift_at_step": 2,
+        "drift_event": {"type": "policy_change", "tool": "expenses", "change": "meal_limit_lowered", "old_limit": 200, "new_limit": 150, "action": "meals over new limit require VP approval"},
+        "target": {
+            "expenses": [{"id": 1, "status": "submitted"}, {"id": 2, "status": "submitted"}],
+            "emails": [{"to": "finance@company.com", "contains": "expense"}, {"to": "vp@company.com", "contains": "approval"}],
+        },
+        "max_steps": 10,
+    },
+    {
+        "id": "onboard_new_hire",
+        "title": "Onboard new team member",
+        "description": (
+            "New hire Jordan (jordan@company.com) starts Monday. "
+            "Schedule a welcome meeting at 10 AM with the team, "
+            "create their onboarding doc, add them to the team calendar, "
+            "and email IT (it@company.com) to set up their accounts."
+        ),
+        "seed": {
+            "calendar": [],
+            "docs": [],
+            "emails": [],
+            "team": [
+                {"name": "Jordan Lee", "email": "jordan@company.com", "role": "engineer", "start_date": "2026-03-09"},
+            ],
+            "policies": {"onboard_checklist": ["welcome_meeting", "onboarding_doc", "it_setup", "team_intro"]},
+        },
+        "drift_at_step": 3,
+        "drift_event": {"type": "schema_change", "tool": "docs", "change": "template_format_changed", "old_format": "markdown", "new_format": "json"},
+        "target": {
+            "calendar": [{"title_contains": "Welcome", "attendees_include": "jordan@company.com"}],
+            "docs": [{"title_contains": "Onboarding"}],
+            "emails": [{"to": "it@company.com", "contains": "account"}, {"to": "jordan@company.com", "contains": "welcome"}],
+        },
+        "max_steps": 12,
+    },
+    # ═══════════════════════════════════════════════════════════
+    #  TIER 3: Complex (7+ tool calls, 1 drift event)
+    # ═══════════════════════════════════════════════════════════
+    {
+        "id": "full_day_reorg",
+        "title": "Reorganize entire day after CEO emergency",
+        "description": (
+            "CEO called emergency board meeting at 11 AM. Reorganize the day: "
+            "move the 11 AM team review to 2 PM, cancel the noon lunch with vendor "
+            "(email vendor@partner.com to apologize), keep the 3 PM client call, "
+            "book a conference room for the board meeting, and email all attendees "
+            "about every change."
+        ),
+        "seed": {
+            "calendar": [
+                {"id": 1, "title": "Team Review", "time": "11:00", "attendees": ["team@company.com"], "status": "scheduled"},
+                {"id": 2, "title": "Lunch with Vendor", "time": "12:00", "attendees": ["vendor@partner.com"], "status": "scheduled"},
+                {"id": 3, "title": "Client Call", "time": "15:00", "attendees": ["client@bigcorp.com"], "status": "scheduled"},
+            ],
+            "rooms": [
+                {"id": "conf-a", "name": "Board Room", "capacity": 20, "available": True},
+                {"id": "conf-b", "name": "Small Meeting", "capacity": 6, "available": True},
+            ],
+            "emails": [],
+            "policies": {"board_meeting_room_min_capacity": 15},
+        },
+        "drift_at_step": 3,
+        "drift_event": {"type": "schema_change", "tool": "rooms", "change": "booking_requires_purpose", "new_required_field": "purpose"},
+        "target": {
+            "calendar": [
+                {"id": 1, "time": "14:00", "status": "rescheduled"},
+                {"id": 2, "status": "cancelled"},
+            ],
+            "rooms": [{"id": "conf-a", "status": "booked", "purpose": "CEO Board Meeting"}],
+            "emails": [{"to": "vendor@partner.com", "contains": "cancel"}, {"to": "team@company.com", "contains": "moved"}],
+        },
+        "max_steps": 15,
+    },
+    {
+        "id": "multi_actor_conflict",
+        "title": "Handle conflicting requests from VP and client",
+        "description": (
+            "VP wants you to schedule a strategy session Thursday 2-4 PM. "
+            "Client just emailed requesting a demo at the same time. "
+            "The client is higher priority. Schedule the demo for Thursday 2-3 PM, "
+            "move VP strategy to Friday 2-4 PM, and email both explaining."
+        ),
+        "seed": {
+            "calendar": [],
+            "emails": [
+                {"id": 1, "from": "vp@company.com", "subject": "Strategy Session", "body": "Block Thursday 2-4 PM for strategy planning.", "status": "unread"},
+                {"id": 2, "from": "client@bigcorp.com", "subject": "Demo Request", "body": "Can we see the product demo Thursday 2 PM?", "status": "unread"},
+            ],
+            "policies": {"client_priority_over_internal": True},
+        },
+        "drift_at_step": 4,
+        "drift_event": {"type": "actor_conflict", "tool": "emails", "change": "vp_insists", "actor": "vp@company.com", "message": "Friday doesn't work. Can we do Thursday morning instead?"},
+        "target": {
+            "calendar": [
+                {"title_contains": "Demo", "time": "14:00", "day": "Thursday"},
+                {"title_contains": "Strategy", "time": "10:00", "day": "Thursday"},
+            ],
+            "emails": [{"to": "vp@company.com", "contains": "Thursday morning"}, {"to": "client@bigcorp.com", "contains": "demo confirmed"}],
+        },
+        "max_steps": 15,
+    },
+    {
+        "id": "trip_planning_drift",
+        "title": "Plan team offsite with multiple schema changes",
+        "description": (
+            "Plan a 2-day team offsite for 8 people in Napa Valley. "
+            "Book hotel, restaurant for team dinner, and transportation. "
+            "Budget: $5000 total. Email team@company.com with the itinerary "
+            "and finance@company.com for pre-approval."
+        ),
+        "seed": {
+            "travel": [
+                {"hotel": "Napa Inn", "rate": 180, "rooms": 4, "nights": 2, "status": "available"},
+                {"transport": "Van rental", "cost": 200, "capacity": 10, "status": "available"},
+            ],
+            "bookings": [
+                {"restaurant": "Bistro Don Giovanni", "party_size": 8, "time": "19:00", "cost_per_person": 65, "status": "available"},
+            ],
+            "emails": [],
+            "policies": {"offsite_requires_preapproval": True, "max_offsite_budget": 5000},
+        },
+        "drift_at_step": 3,
+        "drift_event": {"type": "policy_change", "tool": "travel", "change": "budget_cut", "old_budget": 5000, "new_budget": 4000},
+        "target": {
+            "travel": [{"hotel": "Napa Inn", "status": "booked"}, {"transport": "Van rental", "status": "booked"}],
+            "bookings": [{"restaurant": "Bistro Don Giovanni", "status": "booked"}],
+            "emails": [{"to": "team@company.com", "contains": "itinerary"}, {"to": "finance@company.com", "contains": "approval"}],
+        },
+        "max_steps": 15,
+    },
+    {
+        "id": "crisis_management",
+        "title": "Handle server outage during client demo",
+        "description": (
+            "The production server went down during a client demo. "
+            "Email the client (client@bigcorp.com) apologizing and offering "
+            "to reschedule. Escalate to engineering (eng@company.com) with urgency. "
+            "Cancel the next 2 non-critical meetings to free up time. "
+            "Schedule a post-mortem for tomorrow at 10 AM. "
+            "Email your VP (vp@company.com) with a status update."
+        ),
+        "seed": {
+            "calendar": [
+                {"id": 1, "title": "Client Demo", "time": "14:00", "status": "in_progress", "attendees": ["client@bigcorp.com"]},
+                {"id": 2, "title": "Team Sync", "time": "15:00", "priority": "low", "status": "scheduled"},
+                {"id": 3, "title": "1:1 with PM", "time": "16:00", "priority": "low", "status": "scheduled"},
+                {"id": 4, "title": "Board Prep", "time": "17:00", "priority": "high", "status": "scheduled"},
+            ],
+            "emails": [],
+            "incidents": [{"id": "INC-001", "severity": "P1", "status": "active", "service": "production-api"}],
+            "policies": {"p1_notify_vp": True, "p1_cancel_nonessential": True},
+        },
+        "drift_at_step": 4,
+        "drift_event": {"type": "schema_change", "tool": "calendar", "change": "cancel_requires_reason", "new_required_field": "cancellation_reason"},
+        "target": {
+            "calendar": [
+                {"id": 2, "status": "cancelled", "cancellation_reason_contains": "outage"},
+                {"id": 3, "status": "cancelled", "cancellation_reason_contains": "outage"},
+                {"title_contains": "Post-mortem", "time": "10:00"},
+            ],
+            "emails": [
+                {"to": "client@bigcorp.com", "contains": "apologize"},
+                {"to": "eng@company.com", "contains": "escalat"},
+                {"to": "vp@company.com", "contains": "status"},
+            ],
+        },
+        "max_steps": 18,
+    },
+]

tools.py ADDED Viewed

	@@ -0,0 +1,270 @@

+"""SchemaShift EA Arena — Simulated enterprise tools with schema drift support."""
+import copy, json
+class BaseTool:
+    """Base class for all tools. Supports schema drift.
+
+    Records live in ``_data`` keyed by their "id" (or by list position when a
+    record has no "id"). Drift events mutate the records and/or the bookkeeping
+    attributes set in ``__init__``; every recognised event bumps ``_schema_version``.
+    """
+    def __init__(self):
+        self._data = {}
+        self._schema_version = 1
+        self._field_renames = {}  # old_field -> new_field
+        self._required_fields = []  # extra parameter names that drift made mandatory
+        self._policy_overrides = {}  # policy/setting name -> value introduced by drift
+    def seed(self, data):
+        """Replace the tool's records with deep copies of ``data`` (a list of dicts)."""
+        self._data = {item.get("id", i): copy.deepcopy(item) for i, item in enumerate(data)}
+    def apply_drift(self, drift_event):
+        """Apply one drift event, dispatching on substrings of its "change" name.
+
+        Handled changes: field renames, newly required fields (including
+        "requires_itemized"), changed status values, lowered limits / budget
+        cuts, and format changes. An unrecognised change is ignored and leaves
+        ``_schema_version`` untouched.
+        """
+        change = drift_event.get("change", "")
+        if "field_renamed" in change:
+            old = drift_event.get("old_field", "")
+            new = drift_event.get("new_field", "")
+            if old and new:
+                # Without both names a rename would move values under an empty key.
+                self._field_renames[old] = new
+                for record in self._data.values():
+                    if old in record:
+                        record[new] = record.pop(old)
+            self._schema_version += 1
+        elif "requires_itemized" in change:
+            # Checked before the generic "requires_" test below, which would
+            # otherwise match first and leave this branch unreachable.
+            if "itemized" not in self._required_fields:
+                self._required_fields.append("itemized")
+            self._schema_version += 1
+        elif "requires_" in change or "required_field" in change.replace("new_", ""):
+            new_field = drift_event.get("new_required_field", "")
+            if new_field:
+                self._required_fields.append(new_field)
+            self._schema_version += 1
+        elif "values_changed" in change:
+            # Record the new vocabulary so subclasses that consult
+            # _policy_overrides["status_values"] can see that it changed.
+            new_values = drift_event.get("new_values")
+            if new_values:
+                self._policy_overrides["status_values"] = list(new_values)
+            self._schema_version += 1
+        elif "limit" in change or "budget" in change or "lowered" in change:
+            for key, value in drift_event.items():
+                if key.startswith("new_"):
+                    # Strip only the leading "new_", not later occurrences inside the key.
+                    self._policy_overrides[key[len("new_"):]] = value
+            self._schema_version += 1
+        elif "format_changed" in change:
+            self._policy_overrides["format"] = drift_event.get("new_format", "json")
+            self._schema_version += 1
+    def snapshot(self):
+        """Return a deep copy of all records as a list, in insertion order."""
+        return copy.deepcopy(list(self._data.values()))
+class CalendarTool(BaseTool):
+    def execute(self, action, params):
+        if action == "list_events":
+            return {"success": True, "events": self.snapshot()}
+        elif action == "get_event":
+            eid = params.get("id")
+            if eid in self._data:
+                return {"success": True, "event": copy.deepcopy(self._data[eid])}
+            return {"success": False, "error": f"Event {eid} not found"}
+        elif action == "create_event":
+            for rf in self._required_fields:
+                if rf not in params:
+                    return {"success": False, "error": f"Missing required field: {rf}", "schema_version": self._schema_version}
+            eid = params.get("id", max(list(self._data.keys()) or [0]) + 1)
+            self._data[eid] = {**params, "id": eid, "status": params.get("status", "scheduled")}
+            return {"success": True, "event": copy.deepcopy(self._data[eid])}
+        elif action == "reschedule_event":
+            eid = params.get("id")
+            if eid not in self._data:
+                return {"success": False, "error": f"Event {eid} not found"}
+            new_time = params.get("time") or params.get("reservation_time")
+            if new_time:
+                time_field = self._field_renames.get("time", "time")
+                self._data[eid][time_field] = new_time
+            self._data[eid]["status"] = "rescheduled"
+            return {"success": True, "event": copy.deepcopy(self._data[eid])}
+        elif action == "cancel_event":
+            eid = params.get("id")
+            if eid not in self._data:
+                return {"success": False, "error": f"Event {eid} not found"}
+            for rf in self._required_fields:
+                if rf not in params and rf != "itemized":
+                    return {"success": False, "error": f"Missing required field: {rf}", "schema_version": self._schema_version}
+            cancel_status = "cancelled"
+            if self._policy_overrides.get("status_values"):
+                cancel_status = "removed"
+            self._data[eid]["status"] = params.get("status", cancel_status)
+            if "cancellation_reason" in params:
+                self._data[eid]["cancellation_reason"] = params["cancellation_reason"]
+            return {"success": True, "event": copy.deepcopy(self._data[eid])}
+        return {"success": False, "error": f"Unknown calendar action: {action}"}
+class EmailTool(BaseTool):
+    def __init__(self):
+        super().__init__()
+        self._outbox = []
+    def seed(self, data):
+        self._data = {item.get("id", i): copy.deepcopy(item) for i, item in enumerate(data)}
+    def execute(self, action, params):
+        if action == "list_emails":
+            return {"success": True, "emails": self.snapshot()}
+        elif action == "read_email":
+            eid = params.get("id")
+            if eid in self._data:
+                self._data[eid]["status"] = "read"
+                return {"success": True, "email": copy.deepcopy(self._data[eid])}
+            return {"success": False, "error": f"Email {eid} not found"}
+        elif action == "send":
+            to = params.get("to", "")
+            subject = params.get("subject", "")
+            body = params.get("body", "")
+            cc = params.get("cc") or params.get("carbon_copy", "")
+            if not to:
+                return {"success": False, "error": "Missing 'to' field"}
+            email = {"to": to, "subject": subject, "body": body, "status": "sent"}
+            if cc:
+                cc_field = self._field_renames.get("cc", "cc")
+                email[cc_field] = cc
+            self._outbox.append(email)
+            return {"success": True, "output": f"Email sent to {to}"}
+        return {"success": False, "error": f"Unknown email action: {action}"}
+    def snapshot(self):
+        return {"inbox": list(self._data.values()), "outbox": copy.deepcopy(self._outbox)}
+class BookingsTool(BaseTool):
+    def execute(self, action, params):
+        if action == "list_bookings":
+            return {"success": True, "bookings": self.snapshot()}
+        elif action == "get_booking":
+            bid = params.get("id")
+            if bid in self._data:
+                return {"success": True, "booking": copy.deepcopy(self._data[bid])}
+            return {"success": False, "error": f"Booking {bid} not found"}
+        elif action == "update_booking":
+            bid = params.get("id")
+            if bid not in self._data:
+                return {"success": False, "error": f"Booking {bid} not found"}
+            for k, v in params.items():
+                if k != "id":
+                    self._data[bid][k] = v
+            return {"success": True, "booking": copy.deepcopy(self._data[bid])}
+        elif action == "create_booking":
+            bid = params.get("id", max(list(self._data.keys()) or [0]) + 1)
+            self._data[bid] = {**params, "id": bid, "status": params.get("status", "confirmed")}
+            return {"success": True, "booking": copy.deepcopy(self._data[bid])}
+        return {"success": False, "error": f"Unknown bookings action: {action}"}
+class TravelTool(BaseTool):
+    def execute(self, action, params):
+        if action == "list_options":
+            return {"success": True, "options": self.snapshot()}
+        elif action == "book":
+            item_id = params.get("id") or params.get("flight") or params.get("hotel") or params.get("transport")
+            for k, v in self._data.items():
+                match = (v.get("flight") == item_id or v.get("hotel") == item_id or
+                         v.get("transport") == item_id or k == item_id)
+                if match:
+                    cost = v.get("price") or v.get("rate", 0) * v.get("nights", 1) or v.get("cost", 0)
+                    limit = self._policy_overrides.get("limit") or self._policy_overrides.get("budget") or 99999
+                    if cost > limit:
+                        return {"success": False, "error": f"Cost ${cost} exceeds limit ${limit}", "policy_violated": True}
+                    v["status"] = "booked"
+                    return {"success": True, "booking": copy.deepcopy(v)}
+            return {"success": False, "error": f"Travel option not found: {item_id}"}
+        return {"success": False, "error": f"Unknown travel action: {action}"}
+class DocsTool(BaseTool):
+    def execute(self, action, params):
+        if action == "list_docs":
+            return {"success": True, "docs": self.snapshot()}
+        elif action == "get_doc":
+            did = params.get("id")
+            if did in self._data:
+                return {"success": True, "doc": copy.deepcopy(self._data[did])}
+            return {"success": False, "error": f"Doc {did} not found"}
+        elif action == "create_doc":
+            did = params.get("id", f"doc-{len(self._data)+1}")
+            fmt = self._policy_overrides.get("format", "markdown")
+            self._data[did] = {**params, "id": did, "format": fmt, "status": "created"}
+            return {"success": True, "doc": copy.deepcopy(self._data[did])}
+        return {"success": False, "error": f"Unknown docs action: {action}"}
+class ExpensesTool(BaseTool):
+    def execute(self, action, params):
+        if action == "list_expenses":
+            return {"success": True, "expenses": self.snapshot()}
+        elif action == "submit_expense":
+            eid = params.get("id")
+            if eid not in self._data:
+                return {"success": False, "error": f"Expense {eid} not found"}
+            amount = self._data[eid].get("amount", 0)
+            limit = self._policy_overrides.get("limit") or self._policy_overrides.get("meal_limit") or 99999
+            expense_type = self._data[eid].get("type", "")
+            if expense_type == "meal" and amount > limit:
+                return {"success": True, "output": f"Submitted but requires VP approval (${amount} > ${limit} meal limit)",
+                        "requires_approval": True}
+            self._data[eid]["status"] = "submitted"
+            return {"success": True, "expense": copy.deepcopy(self._data[eid])}
+        return {"success": False, "error": f"Unknown expenses action: {action}"}
+class RoomsTool(BaseTool):
+    def execute(self, action, params):
+        if action == "list_rooms":
+            return {"success": True, "rooms": self.snapshot()}
+        elif action == "book_room":
+            rid = params.get("id")
+            if rid not in self._data:
+                return {"success": False, "error": f"Room {rid} not found"}
+            for rf in self._required_fields:
+                if rf not in params:
+                    return {"success": False, "error": f"Missing required field: {rf}", "schema_version": self._schema_version}
+            self._data[rid]["status"] = "booked"
+            if "purpose" in params:
+                self._data[rid]["purpose"] = params["purpose"]
+            return {"success": True, "room": copy.deepcopy(self._data[rid])}
+        return {"success": False, "error": f"Unknown rooms action: {action}"}
+class TeamTool(BaseTool):
+    def execute(self, action, params):
+        if action == "list_members":
+            return {"success": True, "members": self.snapshot()}
+        elif action == "get_member":
+            email = params.get("email")
+            for k, v in self._data.items():
+                if v.get("email") == email:
+                    return {"success": True, "member": copy.deepcopy(v)}
+            return {"success": False, "error": f"Member not found: {email}"}
+        return {"success": False, "error": f"Unknown team action: {action}"}
+class IncidentsTool(BaseTool):
+    def execute(self, action, params):
+        if action == "list_incidents":
+            return {"success": True, "incidents": self.snapshot()}
+        elif action == "get_incident":
+            iid = params.get("id")
+            if iid in self._data:
+                return {"success": True, "incident": copy.deepcopy(self._data[iid])}
+            return {"success": False, "error": f"Incident {iid} not found"}
+        elif action == "escalate":
+            iid = params.get("id")
+            if iid in self._data:
+                self._data[iid]["status"] = "escalated"
+                self._data[iid]["escalated_to"] = params.get("to", "")
+                return {"success": True, "incident": copy.deepcopy(self._data[iid])}
+            return {"success": False, "error": f"Incident {iid} not found"}
+        return {"success": False, "error": f"Unknown incidents action: {action}"}
+# Tool registry
+ALL_TOOLS = {
+    "calendar": CalendarTool,
+    "email": EmailTool,
+    "bookings": BookingsTool,
+    "travel": TravelTool,
+    "docs": DocsTool,
+    "expenses": ExpensesTool,
+    "rooms": RoomsTool,
+    "team": TeamTool,
+    "incidents": IncidentsTool,
+}

verifier.py ADDED Viewed

	@@ -0,0 +1,159 @@

+"""SchemaShift EA Arena Verifier — deterministic scoring.
+Score (100 points):
+  Task Completion:    30 pts  (final state matches target)
+  Policy Compliance:  20 pts  (no policy violations)
+  Notifications:      15 pts  (all required emails sent)
+  Drift Recovery:     15 pts  (adapted to schema changes)
+  Tool Efficiency:    10 pts  (minimal tool calls)
+  Action Hygiene:     10 pts  (no invalid calls)
+Verdict: PASS ≥ 85 with zero policy violations, HOLD ≥ 55, BLOCK < 55
+"""
+import copy
+def check_target_match(target, snapshots):
+    """Check how well final state matches target."""
+    matches = 0
+    total = 0
+    for tool_name, expected_items in target.items():
+        actual = snapshots.get(tool_name, [])
+        if isinstance(actual, dict):
+            actual_list = actual.get("outbox", []) if tool_name == "email" else list(actual.values())
+        else:
+            actual_list = actual
+        for exp in expected_items:
+            total += 1
+            found = False
+            for act in actual_list:
+                if not isinstance(act, dict):
+                    continue
+                match = True
+                for k, v in exp.items():
+                    if k.endswith("_contains"):
+                        real_key = k.replace("_contains", "")
+                        if real_key == "title":
+                            act_val = act.get("title", "") + act.get("subject", "")
+                        else:
+                            act_val = str(act.get(real_key, ""))
+                        if v.lower() not in act_val.lower():
+                            match = False; break
+                    elif k.endswith("_include"):
+                        real_key = k.replace("_include", "")
+                        act_val = act.get(real_key, [])
+                        if v not in act_val:
+                            match = False; break
+                    elif k == "contains":
+                        body = str(act.get("body", "")) + str(act.get("subject", ""))
+                        if v.lower() not in body.lower():
+                            match = False; break
+                    else:
+                        if act.get(k) != v:
+                            match = False; break
+                if match:
+                    found = True; break
+            if found:
+                matches += 1
+    return (matches / total) if total > 0 else 1.0
+def check_notifications(target, email_snapshot):
+    """Check if all required emails were sent."""
+    if "email" not in target and "emails" not in target:
+        return 1.0
+    expected_emails = target.get("emails", target.get("email", []))
+    outbox = []
+    if isinstance(email_snapshot, dict):
+        outbox = email_snapshot.get("outbox", [])
+    elif isinstance(email_snapshot, list):
+        outbox = email_snapshot
+    if not expected_emails:
+        return 1.0
+    sent = 0
+    for exp in expected_emails:
+        exp_to = exp.get("to", "")
+        for actual in outbox:
+            if actual.get("to", "") == exp_to:
+                sent += 1; break
+    return sent / len(expected_emails)
+def verify_episode(task, snapshots, policy_violations, invalid_calls,
+                   tool_calls_made, drift_events_handled, recovered_from_drift):
+    """Score an episode. Returns (reward, violations, verdict)."""
+    violations = []
+    target = task.get("target", {})
+    # 1. Task Completion (30 pts)
+    completion = check_target_match(target, snapshots)
+    completion_pts = round(completion * 30, 1)
+    # 2. Policy Compliance (20 pts)
+    compliance_pts = max(0, 20 - policy_violations * 10)
+    if policy_violations:
+        violations.append(f"{policy_violations} policy violation(s)")
+    # 3. Notifications (15 pts)
+    notif_score = check_notifications(target, snapshots.get("email", []))
+    notif_pts = round(notif_score * 15, 1)
+    if notif_score < 1.0:
+        violations.append("Missing notifications")
+    # 4. Drift Recovery (15 pts)
+    has_drift = task.get("drift_at_step") is not None
+    if has_drift:
+        if recovered_from_drift:
+            drift_pts = 15.0
+        elif drift_events_handled > 0:
+            drift_pts = 8.0
+        else:
+            drift_pts = 0.0
+            violations.append("Failed to recover from schema drift")
+    else:
+        drift_pts = 15.0  # No drift = full credit
+    # 5. Tool Efficiency (10 pts)
+    max_steps = task.get("max_steps", 15)
+    efficiency = max(0, 1 - (tool_calls_made - max_steps * 0.5) / (max_steps * 0.5))
+    efficiency_pts = round(min(10, efficiency * 10), 1)
+    # 6. Action Hygiene (10 pts)
+    hygiene_pts = max(0, 10 - invalid_calls * 3)
+    if invalid_calls:
+        violations.append(f"{invalid_calls} invalid call(s)")
+    # Total
+    score = round(min(100, completion_pts + compliance_pts + notif_pts +
+                       drift_pts + efficiency_pts + hygiene_pts), 1)
+    # Verdict
+    if score >= 85 and policy_violations == 0:
+        decision = "PASS"
+    elif score >= 55:
+        decision = "HOLD"
+    else:
+        decision = "BLOCK"
+    grade = "A" if score >= 90 else "B" if score >= 80 else "C" if score >= 70 else "D" if score >= 60 else "F"
+    reward = 1.0 if decision == "PASS" else 0.3 if decision == "HOLD" else -0.5
+    return reward, violations, {
+        "decision": decision, "score": score, "grade": grade, "reward": reward,
+        "breakdown": {
+            "task_completion": {"points": completion_pts, "max": 30, "match_rate": round(completion, 3)},
+            "policy_compliance": {"points": compliance_pts, "max": 20, "violations": policy_violations},
+            "notifications": {"points": notif_pts, "max": 15, "sent_rate": round(notif_score, 3)},
+            "drift_recovery": {"points": drift_pts, "max": 15, "recovered": recovered_from_drift},
+            "tool_efficiency": {"points": efficiency_pts, "max": 10, "calls": tool_calls_made},
+            "action_hygiene": {"points": hygiene_pts, "max": 10, "invalid": invalid_calls},
+        },
+        "violations": violations,
+    }