File size: 12,919 Bytes
542e1a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
"""Headless batch test β€” run the agent on multiple seeds and print results."""

import asyncio
import json
import os
import random
from datetime import date, timedelta

from dotenv import load_dotenv
from openai import OpenAI
import websockets

# Load variables from a local .env file into the process environment before
# anything reads them (main() reads GROQ_API_KEY via os.getenv).
load_dotenv()

# WebSocket endpoint that run_seed() connects to for reset/step messages.
WS_URL = "ws://localhost:8000/ws"
# Model name passed to the chat-completions call in run_seed().
MODEL = "llama-3.3-70b-versatile"
# First day of the date pool built by _seed_to_episode_today()
# (2026-01-05 is a Monday).
EPISODE_BASE_DATE = date(2026, 1, 5)
# Number of consecutive weeks included in that date pool.
EPISODE_WEEKS = 3
# Upper bound on agent-loop iterations (one model call each) per seed.
MAX_STEPS = 60

SYSTEM_PROMPT = """You are a calendar personal assistant managing a team's calendar.

Start by reading your inbox (read_inbox) and reviewing the calendar (list_events) to understand what needs to be done today.

IMPORTANT workflow:
1. Read your inbox to see pending requests from your boss and team.
2. Review the calendar to understand the current schedule.
3. BEFORE scheduling any meeting, call get_contact_preferences(person) to learn their constraints.
4. Use check_availability before scheduling β€” don't guess times.
5. Respect HARD constraints (must obey) and SOFT constraints (preferences).
6. After making changes, call check_constraint_violations to verify.
7. Keep checking your inbox β€” new messages may arrive while you work.
8. When attendees decline a meeting, read their feedback and adjust your proposal.
9. Personal events on the calendar are immovable β€” schedule work around them.
10. Reply to messages that need responses.
11. Inbox-driven requests are tracked in the inbox; if any task-style view omits them, use read_inbox as the source of truth.
12. Think step by step about what tools to call and in what order.
13. Pay attention to policy changes announced via interrupts. Rules may change mid-session.
14. If someone's availability changes, re-validate any meetings you already scheduled with that person.
15. Family may text mid-session about personal event changes. Re-check for new conflicts."""

# Function-calling schemas passed as `tools=` on every model call in run_seed().
# Each entry's "name" is forwarded unchanged as the "tool" field of the step
# instruction sent over the WebSocket. Descriptions are read by the model, so
# they must be clean UTF-8 (the em dash in get_constraints was previously
# mis-decoded into "β€”").
TOOLS = [
    {"type": "function", "function": {"name": "list_events", "description": "List all calendar events for a given date.", "parameters": {"type": "object", "properties": {"date": {"type": "string", "description": "Date string: 'today', 'tomorrow', 'next monday', or YYYY-MM-DD"}}, "required": []}}},
    {"type": "function", "function": {"name": "create_event", "description": "Create a calendar event.", "parameters": {"type": "object", "properties": {"title": {"type": "string"}, "date": {"type": "string", "description": "'today', 'tomorrow', 'next monday', or YYYY-MM-DD"}, "start_time": {"type": "string", "description": "HH:MM format"}, "duration_minutes": {"type": "integer", "default": 60}, "attendees": {"type": "string", "description": "Comma-separated names"}, "description": {"type": "string", "description": "Meeting agenda/description (required for meetings >30 min after policy update)"}}, "required": ["title", "date", "start_time"]}}},
    {"type": "function", "function": {"name": "delete_event", "description": "Delete a calendar event by title.", "parameters": {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}}},
    {"type": "function", "function": {"name": "edit_event", "description": "Edit an existing event. Only provided fields are changed.", "parameters": {"type": "object", "properties": {"title": {"type": "string", "description": "Current title of the event to edit"}, "new_title": {"type": "string"}, "new_date": {"type": "string"}, "new_start_time": {"type": "string", "description": "HH:MM"}, "new_duration_minutes": {"type": "integer"}, "new_attendees": {"type": "string", "description": "Comma-separated, replaces all"}, "new_description": {"type": "string", "description": "New description/agenda for the event"}}, "required": ["title"]}}},
    {"type": "function", "function": {"name": "find_free_slots", "description": "Find available time slots on a given date (8:00-18:00).", "parameters": {"type": "object", "properties": {"date": {"type": "string", "default": "today"}, "duration_minutes": {"type": "integer", "default": 60}}, "required": []}}},
    {"type": "function", "function": {"name": "check_conflicts", "description": "Check for scheduling conflicts on a date.", "parameters": {"type": "object", "properties": {"date": {"type": "string", "default": "today"}}, "required": []}}},
    {"type": "function", "function": {"name": "resolve_conflict", "description": "Resolve a conflict by moving an event to a new time.", "parameters": {"type": "object", "properties": {"event_title": {"type": "string"}, "new_start_time": {"type": "string", "description": "HH:MM format"}}, "required": ["event_title", "new_start_time"]}}},
    {"type": "function", "function": {"name": "send_notification", "description": "Send a notification to a person.", "parameters": {"type": "object", "properties": {"to": {"type": "string"}, "message": {"type": "string"}}, "required": ["to", "message"]}}},
    {"type": "function", "function": {"name": "check_availability", "description": "Check a person's availability on a given date.", "parameters": {"type": "object", "properties": {"person": {"type": "string"}, "date": {"type": "string", "default": "today"}}, "required": ["person"]}}},
    {"type": "function", "function": {"name": "get_constraints", "description": "Get scheduling constraints (hard and soft) that apply to the calendar. Note: individual people may have additional private constraints — use get_contact_preferences to discover them.", "parameters": {"type": "object", "properties": {}, "required": []}}},
    {"type": "function", "function": {"name": "get_contact_preferences", "description": "Get a person's scheduling preferences, private constraints, role, and preferred notification method. Some constraints are only visible through this tool.", "parameters": {"type": "object", "properties": {"person": {"type": "string", "description": "Person's name (e.g. Alice, Bob, Charlie, Dave, Eve)"}}, "required": ["person"]}}},
    {"type": "function", "function": {"name": "check_constraint_violations", "description": "Check the current calendar for all constraint violations.", "parameters": {"type": "object", "properties": {}, "required": []}}},
    {"type": "function", "function": {"name": "read_inbox", "description": "List inbox messages, filtered by all/unread/unreplied.", "parameters": {"type": "object", "properties": {"status": {"type": "string", "description": "Filter: 'all', 'unread', or 'unreplied'", "default": "all"}}, "required": []}}},
    {"type": "function", "function": {"name": "reply_message", "description": "Reply to an inbox message. Reply must address the sender's concern.", "parameters": {"type": "object", "properties": {"message_id": {"type": "string", "description": "ID of the message to reply to"}, "reply": {"type": "string", "description": "Your reply text (min 20 chars, must address the sender's ask)"}}, "required": ["message_id", "reply"]}}},
    {"type": "function", "function": {"name": "check_personal_calendar", "description": "Show personal (immovable) events that cannot be moved or deleted.", "parameters": {"type": "object", "properties": {}, "required": []}}},
]


def _seed_to_episode_today(seed: int) -> str:
    """Map *seed* to a reproducible ISO date string from the episode date pool.

    The pool holds five consecutive days for each of ``EPISODE_WEEKS`` weeks,
    counted from ``EPISODE_BASE_DATE``. A private ``random.Random(seed)``
    makes the choice, so the same seed always yields the same date and the
    global random state is left untouched.
    """
    candidate_days = [
        EPISODE_BASE_DATE + timedelta(weeks=week_idx, days=day_idx)
        for week_idx in range(EPISODE_WEEKS)
        for day_idx in range(5)
    ]
    picker = random.Random(seed)
    chosen_day = picker.choice(candidate_days)
    return chosen_day.isoformat()


def _message_role(message):
    """Return the chat role of a history entry (plain dict or SDK message object)."""
    if isinstance(message, dict):
        return message.get("role")
    return getattr(message, "role", None)


def _recent_history(history, size):
    """Return the last *size* history entries without leading orphaned tool results.

    A plain ``history[-size:]`` slice can begin with ``role == "tool"``
    messages whose originating assistant ``tool_calls`` message fell outside
    the window. OpenAI-compatible chat endpoints reject a tool message that
    does not follow its assistant message, so those leading entries are
    dropped.
    """
    window = history[-size:]
    first_kept = 0
    while first_kept < len(window) and _message_role(window[first_kept]) == "tool":
        first_kept += 1
    return window[first_kept:]


def _parse_tool_args(raw):
    """Decode model-generated tool arguments into ``(args, error)``.

    Exactly one element of the pair is meaningful: ``args`` is a dict when
    decoding succeeded (empty/``null`` input maps to ``{}``), otherwise it is
    ``None`` and ``error`` is a short human-readable reason.
    """
    if not raw:
        return {}, None
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError) as exc:  # JSONDecodeError subclasses ValueError
        return None, f"arguments are not valid JSON ({exc})"
    if parsed is None:
        return {}, None
    if not isinstance(parsed, dict):
        return None, f"arguments must be a JSON object, got {type(parsed).__name__}"
    return parsed, None


async def _send_tool_step(ws, tool_name, args):
    """Send one ``step`` action for *tool_name* and return the response's ``data`` dict."""
    instruction = json.dumps({"tool": tool_name, "args": args})
    await ws.send(json.dumps({"type": "step", "data": {"instruction": instruction}}))
    return json.loads(await ws.recv())["data"]


async def _fetch_final_calendar(ws, episode_today):
    """Collect ``list_events`` output for the ten weekdays starting at the episode's Monday.

    Days whose output contains "No events" are skipped; returns the remaining
    outputs joined by newlines, or "No events." when nothing is left.
    """
    today_dt = date.fromisoformat(episode_today)
    week_start = today_dt - timedelta(days=today_dt.weekday())
    calendar_lines = []
    for week in range(2):
        for day in range(5):
            day_iso = (week_start + timedelta(weeks=week, days=day)).isoformat()
            data = await _send_tool_step(ws, "list_events", {"date": day_iso})
            output = data["observation"]["output"]
            if "No events" not in output:
                calendar_lines.append(output)
    return "\n".join(calendar_lines) if calendar_lines else "No events."


async def run_seed(client: OpenAI, seed: int) -> dict:
    """Run one agent episode for *seed* against the WebSocket server and report it.

    Resets the episode with the seed, then loops up to ``MAX_STEPS`` times:
    each iteration asks the model for its next move (system prompt, latest
    state summary and a sliding window of recent history), forwards every
    requested tool call as a ``step`` action, and records the tool output.
    The loop stops early when the server reports ``done`` or the model call
    fails. Afterwards the calendar is fetched and a per-seed report is printed.

    Args:
        client: OpenAI-compatible client used for chat completions.
        seed: Seed sent with the ``reset`` message.

    Returns:
        Dict with keys ``seed``, ``date``, ``reward``, ``flags`` (sorted),
        ``steps`` and ``total_tasks``.

    Raises:
        Whatever the WebSocket connection or response decoding raises;
        model-call failures are caught and only end the episode loop.
    """
    episode_today = _seed_to_episode_today(seed)
    print(f"\n{'='*60}")
    print(f"SEED {seed} | Episode date: {episode_today}")
    print(f"{'='*60}")

    async with websockets.connect(WS_URL, ping_interval=20, ping_timeout=120) as ws:
        # Reset
        await ws.send(json.dumps({"type": "reset", "data": {"seed": seed}}))
        reset_resp = json.loads(await ws.recv())
        obs = reset_resp["data"]["observation"]

        # Sliding window: state summary + last N exchanges
        SLIDING_WINDOW = 6
        history = []
        latest_state_summary = obs.get("state_summary", obs["output"])
        total_tasks = len(obs.get("flags_found", [])) + obs.get("pending_tasks", 0)

        final_reward = 0
        final_flags = []
        steps_used = 0
        done = False

        for step in range(MAX_STEPS):
            steps_used = step + 1

            # Build messages: system + state summary + recent history. The
            # window is trimmed so it never starts with a tool result whose
            # assistant tool_calls message was cut off.
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": latest_state_summary},
            ] + _recent_history(history, SLIDING_WINDOW)

            try:
                # The client call is blocking; run it in a worker thread so
                # the event loop (and WebSocket keepalive) is not stalled.
                response = await asyncio.to_thread(
                    client.chat.completions.create,
                    model=MODEL, messages=messages, tools=TOOLS, tool_choice="auto",
                )
            except Exception as e:
                print(f"  Step {step+1}: API error: {e}")
                break

            msg = response.choices[0].message

            if not msg.tool_calls:
                # Plain-text reply: keep it and nudge the model back to tools.
                content = msg.content or ""
                print(f"  Step {step+1}: [text] {content[:80]}...")
                history.append(msg)
                history.append({"role": "user", "content": "Continue completing the remaining tasks. Use the tools available to you."})
                continue

            history.append(msg)
            for tool_call in msg.tool_calls:
                tool_name = tool_call.function.name
                args, arg_error = _parse_tool_args(tool_call.function.arguments)

                if arg_error is not None:
                    # Malformed arguments: do not abort the seed. Answer the
                    # call with an error result (every tool_call id still gets
                    # a reply) so the model can retry.
                    print(f"  Step {step+1}: {tool_name}: bad arguments: {arg_error}")
                    history.append({
                        "role": "tool",
                        "tool_call_id": tool_call.id,
                        "content": f"Error: {arg_error}. Call {tool_name} again with a valid JSON object of arguments.",
                    })
                    continue

                print(f"  Step {step+1}: {tool_name}({json.dumps(args, separators=(',',':'))})")

                data = await _send_tool_step(ws, tool_name, args)
                obs = data["observation"]
                final_reward = data.get("reward", 0)
                final_flags = obs.get("flags_found", [])
                total_tasks = len(final_flags) + obs.get("pending_tasks", 0)

                # Update state summary from latest observation
                latest_state_summary = obs.get("state_summary", latest_state_summary)

                history.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": obs["output"],
                })

                if data.get("done", False):
                    print(f"  >>> DONE at step {step+1}")
                    done = True
                    break

            if done:
                break

        # Fetch final calendar (while the connection is still open)
        calendar = await _fetch_final_calendar(ws, episode_today)

    print(f"\n  REWARD: {final_reward:.2f} ({len(final_flags)}/{total_tasks})")
    print(f"  FLAGS: {sorted(final_flags)}")
    print(f"  STEPS: {steps_used}")
    print(f"  CALENDAR:\n{calendar}")

    return {
        "seed": seed,
        "date": episode_today,
        "reward": final_reward,
        "flags": sorted(final_flags),
        "steps": steps_used,
        "total_tasks": total_tasks,
    }


async def main():
    """Run every configured seed in sequence and print a summary table.

    A seed whose run raises is reported and recorded with reward 0 instead of
    stopping the batch. After all seeds, one line per seed is printed followed
    by the average, minimum and maximum reward.
    """
    groq_client = OpenAI(
        base_url="https://api.groq.com/openai/v1",
        api_key=os.getenv("GROQ_API_KEY"),
    )
    seed_list = [1, 3, 5, 9, 12]

    outcomes = []
    for seed in seed_list:
        try:
            outcome = await run_seed(groq_client, seed)
        except Exception as exc:
            # Keep the batch going; the failure shows up as a zero-reward row.
            print(f"\nSEED {seed} FAILED: {exc}")
            outcome = {"seed": seed, "reward": 0, "flags": [], "error": str(exc)}
        outcomes.append(outcome)

    banner = "=" * 60
    print(f"\n{banner}")
    print("SUMMARY")
    print(banner)
    for entry in outcomes:
        found = len(entry.get("flags", []))
        task_total = entry.get("total_tasks", "?")
        episode_date = entry.get("date", "?")
        reward = entry.get("reward", 0)
        steps = entry.get("steps", "?")
        print(f"  Seed {entry['seed']:3d} | {episode_date:>10s} | Reward: {reward:.2f} ({found}/{task_total}) | Steps: {steps}")

    all_rewards = [entry.get("reward", 0) for entry in outcomes]
    mean_reward = sum(all_rewards) / len(all_rewards)
    print(f"\n  Average reward: {mean_reward:.2f}")
    print(f"  Min: {min(all_rewards):.2f} | Max: {max(all_rewards):.2f}")


# Script entry point: run the async batch only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())