Spaces:
Sleeping
Sleeping
File size: 8,793 Bytes
"""Baseline inference script for CommitmentOS.
Uses an OpenAI-compatible LLM to play through all 15 scenarios.
Multi-turn: the agent gets the briefing, makes tool calls, then submits.
Required environment variables:
API_BASE_URL — OpenAI-compatible endpoint
MODEL_NAME — model identifier
HF_TOKEN — API key (also checked as OPENAI_API_KEY)
ENV_BASE_URL — CommitmentOS server URL (default: HF Space)
"""
from __future__ import annotations
import json
import os
import sys
import time
from typing import Any, Dict, List
import requests
from openai import OpenAI
from dotenv import load_dotenv
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Read settings from a local .env file (via python-dotenv) before querying the
# process environment below.
load_dotenv()

# OpenAI-compatible endpoint and model identifier used for inference.
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")

# HF_TOKEN wins when set; OPENAI_API_KEY is the fallback. An empty string
# means "no key configured" and is rejected in main().
API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or ""

# Request URLs are built as f"{ENV_BASE_URL}/<route>", so a trailing slash in
# the configured value would produce "//reset"-style paths; strip it here.
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "https://jayant2304-commitment-os.hf.space").rstrip("/")

# Maximum number of LLM-chosen actions per task before run_task() forces a
# final submit_plan.
MAX_STEPS = 12
# System message sent as the first chat turn of every task. It lists the tool
# call shapes the model may emit and the rules it must follow. Rule 5 used to
# contain a mis-encoded character where an em dash was intended.
SYSTEM_PROMPT = """You are an expert executive assistant AI. You manage calendars, emails, and dining reservations.
You will be given a scenario briefing describing a situation with calendar conflicts, emails, or planning tasks.
For each turn, you must respond with EXACTLY ONE JSON object choosing a tool to call:
Available tools:
- {"action_type": "view_calendar", "date": "2026-04-25"}
- {"action_type": "check_availability", "person": "Client_Jones"}
- {"action_type": "search_restaurants", "cuisine": "Italian", "max_price": 50, "dietary": "vegetarian", "max_distance_miles": 3.0, "near_airport": false}
- {"action_type": "schedule_meeting", "title": "Demo", "date": "2026-04-25", "time": "14:00", "duration_min": 60, "participants": ["Client_Jones"], "location": "Room A"}
- {"action_type": "reschedule_event", "event_id": "evt_1", "new_time": "15:00"}
- {"action_type": "cancel_event", "event_id": "evt_1"}
- {"action_type": "send_email", "to": "VP_Chen", "subject": "Meeting update", "body": "Hi, I need to reschedule..."}
- {"action_type": "book_restaurant", "restaurant_name": "Sky Lounge"}
- {"action_type": "submit_plan"}
IMPORTANT RULES:
1. Respond with ONLY a JSON object, no markdown, no explanation
2. Handle higher-priority items before lower-priority ones
3. When cancelling or rescheduling commitments, ALWAYS send an email to affected parties BEFORE submitting
4. Call submit_plan when you have resolved all issues
5. Never silently drop a commitment — always notify the affected person"""
# ---------------------------------------------------------------------------
# Logging helpers — exact format required by hackathon evaluator
# ---------------------------------------------------------------------------
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens a task run.

    Args:
        task: Identifier of the task being started.
        env: Name of the environment the task runs in.
        model: Identifier of the model driving the run.
    """
    fields = ("[START]", f"task={task}", f"env={env}", f"model={model}")
    # Flushed immediately so the record is not held back by stdout buffering.
    print(" ".join(fields), flush=True)
def log_step(step: int, action: str, reward: float, done: bool, error: str | None = None) -> None:
    """Print one ``[STEP]`` record describing a single environment step.

    Args:
        step: Step number to report.
        action: Action text to report, printed verbatim.
        reward: Reward for the step, printed with two decimals.
        done: Whether the episode finished on this step; printed as
            ``true``/``false``.
        error: Optional error text; ``None`` or an empty string prints as
            ``null``.
    """
    done_text = "true" if done else "false"
    error_text = error or "null"
    record = f"[STEP] step={step} action={action} reward={reward:.2f} done={done_text} error={error_text}"
    print(record, flush=True)
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes a task run.

    Args:
        success: Whether the task counts as solved; printed as
            ``true``/``false``.
        steps: Number of steps to report.
        score: Final score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals
            each (an empty list prints an empty field).
    """
    success_text = "true" if success else "false"
    rewards_csv = ",".join([format(value, ".2f") for value in rewards])
    record = f"[END] success={success_text} steps={steps} score={score:.3f} rewards={rewards_csv}"
    print(record, flush=True)
# ---------------------------------------------------------------------------
# Environment interaction
# ---------------------------------------------------------------------------
def env_reset(task_id: str) -> Dict[str, Any]:
    """Reset the environment server to the given task.

    Sends ``POST {ENV_BASE_URL}/reset`` with ``task_id`` as a query parameter.

    Args:
        task_id: Identifier of the task to load.

    Returns:
        The ``"observation"`` entry of the JSON response when present,
        otherwise the whole decoded response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    reset_url = f"{ENV_BASE_URL}/reset"
    response = requests.post(reset_url, params={"task_id": task_id}, timeout=30)
    response.raise_for_status()
    payload = response.json()
    # Some responses wrap the observation, others are the observation itself.
    observation = payload.get("observation", payload)
    return observation
def env_step(action: Dict[str, Any]) -> Dict[str, Any]:
    """Send one action to the environment server and return the observation.

    Sends ``POST {ENV_BASE_URL}/step`` with the body ``{"action": action}``.

    Args:
        action: The tool-call dictionary to execute.

    Returns:
        The observation dictionary (the ``"observation"`` entry of the
        response when present, otherwise the response itself), with ``"done"``
        and ``"reward"`` always set. Top-level response values take
        precedence, then values already in the observation, then the defaults
        ``False`` and ``0.0``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    step_url = f"{ENV_BASE_URL}/step"
    response = requests.post(step_url, json={"action": action}, timeout=30)
    response.raise_for_status()
    payload = response.json()
    observation = payload.get("observation", payload)
    # Normalise both status fields onto the observation so callers only need
    # to look in one place.
    for key, default in (("done", False), ("reward", 0.0)):
        observation[key] = payload.get(key, observation.get(key, default))
    return observation
def get_task_ids() -> List[str]:
    """Fetch the task identifiers exposed by the environment server.

    Sends ``GET {ENV_BASE_URL}/tasks`` and reads the ``"easy"``, ``"medium"``
    and ``"hard"`` entries of the JSON response.

    Returns:
        All task ids in easy, medium, hard order; a missing entry contributes
        nothing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(f"{ENV_BASE_URL}/tasks", timeout=30)
    response.raise_for_status()
    catalog = response.json()
    return [
        task_id
        for tier in ("easy", "medium", "hard")
        for task_id in catalog.get(tier, [])
    ]
# ---------------------------------------------------------------------------
# LLM call
# ---------------------------------------------------------------------------
def call_llm(client: OpenAI, messages: List[Dict[str, str]]) -> str:
    """Request one non-streaming chat completion and return the reply text.

    Args:
        client: OpenAI-compatible client used to issue the request.
        messages: Chat history as ``{"role": ..., "content": ...}`` entries.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the message carries no content.
    """
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=512,
        stream=False,
    )
    # The message content can be None (for example a refusal or a reply with
    # no text); calling .strip() on it directly would raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()
def parse_action(text: str) -> Dict[str, Any]:
    """Extract the action dictionary from raw model output.

    The model is told to answer with a bare JSON object, but replies often
    arrive wrapped in a Markdown code fence or surrounded by prose. Rather
    than assuming a fixed layout, this scans for the first ``{`` from which a
    complete JSON object can be decoded and returns that object.

    Args:
        text: Raw text returned by the model.

    Returns:
        The first JSON object found in ``text``. When no JSON object can be
        decoded (empty text, malformed JSON, or JSON that is not an object),
        returns ``{"action_type": "submit_plan"}`` so the episode still ends
        cleanly.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # whatever follows it (closing fence, trailing explanation, ...).
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # This brace did not open a valid object; try the next one.
        start = text.find("{", start + 1)
    return {"action_type": "submit_plan"}
# ---------------------------------------------------------------------------
# Run one task
# ---------------------------------------------------------------------------
def run_task(client: OpenAI, task_id: str) -> Dict[str, Any]:
    """Play one task end to end and emit its START/STEP/END log records.

    The environment is reset to ``task_id``, then the model is queried for up
    to ``MAX_STEPS`` actions. Each action is executed with ``env_step`` and
    its tool result is fed back as the next user message. If the environment
    has not reported ``done`` after the last step, a final ``submit_plan`` is
    sent on the model's behalf.

    Args:
        client: OpenAI-compatible client used for the model calls.
        task_id: Identifier of the task to run.

    Returns:
        ``{"task_id": ..., "reward": ..., "success": ...}`` where ``reward``
        is the final step's reward clamped to ``[0.01, 0.99]`` (``0.01`` when
        the run failed) and ``success`` is true when that score exceeds
        ``0.01``.
    """
    rewards: List[float] = []
    steps_taken = 0
    score = 0.01
    success = False

    # [END] is always emitted from the finally clause below, so [START] has to
    # be emitted before anything that can fail; otherwise a failed reset
    # leaves an [END] record without a matching [START].
    log_start(task=task_id, env="commitment-os", model=MODEL_NAME)
    try:
        obs = env_reset(task_id)

        briefing = obs.get("briefing", "")
        calendar = json.dumps(obs.get("calendar_snapshot", []), indent=2)
        inbox = json.dumps(obs.get("inbox", []), indent=2)

        messages: List[Dict[str, str]] = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"SCENARIO: {briefing}\n\nCALENDAR:\n{calendar}\n\nINBOX:\n{inbox}\n\nWhat is your first action?"},
        ]

        # Initialised so the forced-submit check below is well defined even if
        # the loop body never runs.
        done = False
        for step_num in range(1, MAX_STEPS + 1):
            llm_output = call_llm(client, messages)
            action = parse_action(llm_output)

            step_data = env_step(action)
            # `or 0.0` also covers an explicit null reward in the response.
            reward = float(step_data.get("reward", 0.0) or 0.0)
            done = step_data.get("done", False)

            steps_taken = step_num
            rewards.append(reward)

            action_str = json.dumps(action, separators=(",", ":"))
            log_step(step=step_num, action=action_str, reward=reward, done=done)

            if done:
                score = max(0.01, min(0.99, reward))
                success = score > 0.01
                break

            # Feed the tool output back so the model can pick its next action.
            tool_result = step_data.get("tool_result", "")
            messages.append({"role": "assistant", "content": llm_output})
            messages.append({"role": "user", "content": f"TOOL RESULT: {tool_result}\n\nWhat is your next action?"})

        if not done:
            # Step budget exhausted without a terminal step: submit on the
            # model's behalf so the episode still receives a final reward.
            step_data = env_step({"action_type": "submit_plan"})
            reward = float(step_data.get("reward", 0.0) or 0.0)
            steps_taken += 1
            rewards.append(reward)
            score = max(0.01, min(0.99, reward))
            success = score > 0.01
            log_step(step=steps_taken, action='{"action_type":"submit_plan"}', reward=reward, done=True)

    except Exception as exc:
        # Task-level boundary: report the failure as a terminal step and keep
        # going so the remaining tasks still run.
        steps_taken = max(steps_taken, 1)
        if not rewards:
            rewards.append(0.01)
        log_step(step=steps_taken, action="error", reward=0.01, done=True, error=str(exc))

    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return {"task_id": task_id, "reward": score, "success": success}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Run every task reported by the environment and print a summary line.

    Exits with status 1 (after an error message on stderr) when no API key is
    configured. Otherwise each task id from ``get_task_ids`` is run in order
    and the number of successes and the mean reward are printed.
    """
    if not API_KEY:
        print("ERROR: Set HF_TOKEN or OPENAI_API_KEY environment variable", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    outcomes: List[Dict[str, Any]] = [run_task(client, task_id) for task_id in get_task_ids()]

    total = len(outcomes)
    successes = len([outcome for outcome in outcomes if outcome["success"]])
    # Guard against an empty task list so the mean is still defined.
    if total > 0:
        mean_reward = sum(outcome["reward"] for outcome in outcomes) / total
    else:
        mean_reward = 0.0

    print(f"\n# Summary: {successes}/{total} tasks succeeded, mean_reward={mean_reward:.3f}", flush=True)


if __name__ == "__main__":
    main()
|