File size: 8,793 Bytes
6762657
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af8810b
6762657
 
 
 
 
af8810b
 
6762657
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
"""Baseline inference script for CommitmentOS.

Uses an OpenAI-compatible LLM to play through all 15 scenarios.
Multi-turn: the agent gets the briefing, makes tool calls, then submits.

Required environment variables:
  API_BASE_URL  — OpenAI-compatible endpoint
  MODEL_NAME    — model identifier
  HF_TOKEN      — API key (OPENAI_API_KEY is used as a fallback)
  ENV_BASE_URL  — CommitmentOS server URL (default: HF Space)
"""

from __future__ import annotations

import json
import os
import sys
import time
from typing import Any, Dict, List

import requests
from openai import OpenAI
from dotenv import load_dotenv

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Load settings from a local .env file (via python-dotenv) before the
# os.getenv lookups below read them.
load_dotenv()

# OpenAI-compatible endpoint and model used by the baseline agent.
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
# HF_TOKEN takes precedence; OPENAI_API_KEY is the fallback. An empty string
# means "not configured" and makes main() exit with an error.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or ""
# Base URL of the CommitmentOS environment server (/reset, /step, /tasks).
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "https://jayant2304-commitment-os.hf.space")

# Maximum number of LLM-chosen actions per task; after that run_task forces
# a final submit_plan.
MAX_STEPS = 12

# System message sent as the first chat message of every task. It lists the
# tool-call JSON shapes the agent may emit (one object per turn) and the
# priority / notification rules it should follow.
# NOTE(review): rule 5 below contains what looks like a mis-encoded em dash;
# confirm against the original file before changing it, since this text is
# sent to the model verbatim.
SYSTEM_PROMPT = """You are an expert executive assistant AI. You manage calendars, emails, and dining reservations.

You will be given a scenario briefing describing a situation with calendar conflicts, emails, or planning tasks.

For each turn, you must respond with EXACTLY ONE JSON object choosing a tool to call:

Available tools:
- {"action_type": "view_calendar", "date": "2026-04-25"}
- {"action_type": "check_availability", "person": "Client_Jones"}
- {"action_type": "search_restaurants", "cuisine": "Italian", "max_price": 50, "dietary": "vegetarian", "max_distance_miles": 3.0, "near_airport": false}
- {"action_type": "schedule_meeting", "title": "Demo", "date": "2026-04-25", "time": "14:00", "duration_min": 60, "participants": ["Client_Jones"], "location": "Room A"}
- {"action_type": "reschedule_event", "event_id": "evt_1", "new_time": "15:00"}
- {"action_type": "cancel_event", "event_id": "evt_1"}
- {"action_type": "send_email", "to": "VP_Chen", "subject": "Meeting update", "body": "Hi, I need to reschedule..."}
- {"action_type": "book_restaurant", "restaurant_name": "Sky Lounge"}
- {"action_type": "submit_plan"}

IMPORTANT RULES:
1. Respond with ONLY a JSON object, no markdown, no explanation
2. Handle higher-priority items before lower-priority ones
3. When cancelling or rescheduling commitments, ALWAYS send an email to affected parties BEFORE submitting
4. Call submit_plan when you have resolved all issues
5. Never silently drop a commitment β€” always notify the affected person"""


# ---------------------------------------------------------------------------
# Logging helpers — exact format required by hackathon evaluator
# ---------------------------------------------------------------------------

def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens a task run.

    Args:
        task: Identifier of the task being run.
        env: Name of the environment the task belongs to.
        model: Identifier of the model playing the task.
    """
    fields = " ".join((f"task={task}", f"env={env}", f"model={model}"))
    # Flush immediately so the record is visible even if the run dies later.
    print("[START] " + fields, flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: str | None = None) -> None:
    err = error if error else "null"
    print(f"[STEP] step={step} action={action} reward={reward:.2f} done={'true' if done else 'false'} error={err}", flush=True)


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes a task run.

    Args:
        success: Whether the task counts as solved (printed as true/false).
        steps: Number of steps taken.
        score: Final score; printed with three decimals.
        rewards: Per-step rewards; printed comma-separated with two decimals.
    """
    outcome = "true" if success else "false"
    rewards_text = ",".join([format(value, ".2f") for value in rewards])
    print(
        f"[END] success={outcome} steps={steps} score={score:.3f} rewards={rewards_text}",
        flush=True,
    )


# ---------------------------------------------------------------------------
# Environment interaction
# ---------------------------------------------------------------------------

def env_reset(task_id: str) -> Dict[str, Any]:
    """Reset the environment to ``task_id`` and return the first observation.

    Sends a POST to ``{ENV_BASE_URL}/reset`` with ``task_id`` as a query
    parameter.

    Returns:
        The ``"observation"`` entry of the JSON response when present,
        otherwise the whole response payload.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.post(
        f"{ENV_BASE_URL}/reset",
        params={"task_id": task_id},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("observation", payload)


def env_step(action: Dict[str, Any]) -> Dict[str, Any]:
    """Send one action to the environment and return the resulting observation.

    Sends a POST to ``{ENV_BASE_URL}/step`` with ``{"action": action}`` as the
    JSON body.

    Returns:
        The ``"observation"`` entry of the response (or the whole payload when
        that key is absent), with ``"done"`` and ``"reward"`` filled in.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.post(
        f"{ENV_BASE_URL}/step",
        json={"action": action},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()
    observation = payload.get("observation", payload)
    # Top-level values win; otherwise keep what the observation already has,
    # falling back to "not done" / zero reward.
    for key, default in (("done", False), ("reward", 0.0)):
        observation[key] = payload.get(key, observation.get(key, default))
    return observation


def get_task_ids() -> List[str]:
    """Fetch all task ids from the environment, ordered easy, medium, hard.

    Sends a GET to ``{ENV_BASE_URL}/tasks`` and concatenates the lists found
    under the ``"easy"``, ``"medium"`` and ``"hard"`` keys; a missing key
    contributes nothing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(f"{ENV_BASE_URL}/tasks", timeout=30)
    response.raise_for_status()
    tasks_by_level = response.json()
    return [
        task_id
        for level in ("easy", "medium", "hard")
        for task_id in tasks_by_level.get(level, [])
    ]


# ---------------------------------------------------------------------------
# LLM call
# ---------------------------------------------------------------------------

def call_llm(client: OpenAI, messages: List[Dict[str, str]]) -> str:
    """Request the next assistant message and return its text.

    Args:
        client: OpenAI-compatible client to send the request with.
        messages: Chat history in ``{"role": ..., "content": ...}`` form.

    Returns:
        The first choice's message content with surrounding whitespace
        removed. An empty string is returned when the API reports no content
        (``content`` is nullable, e.g. for refusals or filtered replies), so
        callers always receive a ``str``.
    """
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=512,
        stream=False,
    )
    content = response.choices[0].message.content
    # content may be None; calling .strip() on it would raise AttributeError.
    return (content or "").strip()


def parse_action(text: str) -> Dict[str, Any]:
    text = text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1]) if len(lines) > 2 else lines[0]
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return {"action_type": "submit_plan"}


# ---------------------------------------------------------------------------
# Run one task
# ---------------------------------------------------------------------------

def run_task(client: OpenAI, task_id: str) -> Dict[str, Any]:
    """Play one scenario end to end and return its result summary.

    The agent is shown the briefing, calendar snapshot and inbox, then picks
    up to ``MAX_STEPS`` actions. If the environment has not reported ``done``
    by then, a final ``submit_plan`` is sent. Progress is reported through
    ``log_start`` / ``log_step`` / ``log_end``; ``log_end`` always runs.

    Args:
        client: OpenAI-compatible client used for the LLM calls.
        task_id: Identifier of the scenario to run.

    Returns:
        ``{"task_id": ..., "reward": ..., "success": ...}`` where ``reward``
        is the final reward clamped to ``[0.01, 0.99]`` and ``success`` is
        true when that score is above ``0.01``. Any exception during the run
        is logged as an error step and leaves the score at ``0.01``.
    """
    rewards: List[float] = []
    steps_taken = 0
    score = 0.01
    success = False
    # Bound up front so the post-loop check is safe even if the loop body
    # never executes (e.g. MAX_STEPS configured below 1).
    done = False

    try:
        # Emit [START] before touching the environment so a failed reset
        # still produces a complete START / STEP / END record.
        log_start(task=task_id, env="commitment-os", model=MODEL_NAME)
        obs = env_reset(task_id)

        briefing = obs.get("briefing", "")
        calendar = json.dumps(obs.get("calendar_snapshot", []), indent=2)
        inbox = json.dumps(obs.get("inbox", []), indent=2)

        messages: List[Dict[str, str]] = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"SCENARIO: {briefing}\n\nCALENDAR:\n{calendar}\n\nINBOX:\n{inbox}\n\nWhat is your first action?"},
        ]

        for step_num in range(1, MAX_STEPS + 1):
            llm_output = call_llm(client, messages)
            action = parse_action(llm_output)

            step_data = env_step(action)
            # "or 0.0" guards against an explicit null reward in the response.
            reward = float(step_data.get("reward", 0.0) or 0.0)
            done = step_data.get("done", False)
            steps_taken = step_num
            rewards.append(reward)

            action_str = json.dumps(action, separators=(",", ":"))
            log_step(step=step_num, action=action_str, reward=reward, done=done)

            if done:
                # The reward of the terminal step is the task score.
                score = max(0.01, min(0.99, reward))
                success = score > 0.01
                break

            # Feed the tool output back so the model can pick its next action.
            tool_result = step_data.get("tool_result", "")
            messages.append({"role": "assistant", "content": llm_output})
            messages.append({"role": "user", "content": f"TOOL RESULT: {tool_result}\n\nWhat is your next action?"})

        if not done:
            # Step budget exhausted: force a submission so the task is graded.
            step_data = env_step({"action_type": "submit_plan"})
            reward = float(step_data.get("reward", 0.0) or 0.0)
            steps_taken += 1
            rewards.append(reward)
            score = max(0.01, min(0.99, reward))
            success = score > 0.01
            log_step(step=steps_taken, action='{"action_type":"submit_plan"}', reward=reward, done=True)

    except Exception as exc:
        # Top-level boundary for one task: report the failure in the log
        # stream and let the remaining tasks run.
        steps_taken = max(steps_taken, 1)
        if not rewards:
            rewards.append(0.01)
        log_step(step=steps_taken, action="error", reward=0.01, done=True, error=str(exc))

    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return {"task_id": task_id, "reward": score, "success": success}


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    """Run every task once and print a one-line summary.

    Exits with status 1 (after printing an error to stderr) when no API key
    is configured. Otherwise runs all task ids returned by ``get_task_ids``
    in order and reports how many succeeded and the mean reward.
    """
    if not API_KEY:
        print("ERROR: Set HF_TOKEN or OPENAI_API_KEY environment variable", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    results: List[Dict[str, Any]] = [
        run_task(client, tid) for tid in get_task_ids()
    ]

    total = len(results)
    successes = len([outcome for outcome in results if outcome["success"]])
    if total > 0:
        mean_reward = sum(outcome["reward"] for outcome in results) / total
    else:
        # No tasks were returned; avoid dividing by zero.
        mean_reward = 0.0
    print(f"\n# Summary: {successes}/{total} tasks succeeded, mean_reward={mean_reward:.3f}", flush=True)


# Run the baseline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()