commitment-os / inference.py
jayantaggarwal-sketch
Sync latest code and non-binary artifacts
af8810b
"""Baseline inference script for CommitmentOS.
Uses an OpenAI-compatible LLM to play through all 15 scenarios.
Multi-turn: the agent gets the briefing, makes tool calls, then submits.
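Environment variables (only the API key has no default):
API_BASE_URL — OpenAI-compatible endpoint (default: https://api.openai.com/v1)
MODEL_NAME — model identifier (default: gpt-4o-mini)
HF_TOKEN — API key, required; OPENAI_API_KEY is used as a fallback
ENV_BASE_URL — CommitmentOS server URL (default: the hosted HF Space)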
"""
from __future__ import annotations
import json
import os
import sys
import time
from typing import Any, Dict, List
import requests
from openai import OpenAI
from dotenv import load_dotenv
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Load a local .env file (python-dotenv) before any setting below is read.
load_dotenv()

# LLM endpoint and model selection.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")

# HF_TOKEN takes precedence over OPENAI_API_KEY; an empty string means
# "no key configured" and makes main() exit with an error.
API_KEY = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY") or ""

# Base URL of the CommitmentOS environment server.
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "https://jayant2304-commitment-os.hf.space")

# Upper bound on LLM-driven steps per task; run_task submits the plan itself
# if the episode is still open after this many steps.
MAX_STEPS = 12
# System message placed first in every task conversation (see run_task).
# It lists the JSON shape of each tool call the model may emit and the rules
# it must follow; the text is sent verbatim to the model.
SYSTEM_PROMPT = """You are an expert executive assistant AI. You manage calendars, emails, and dining reservations.
You will be given a scenario briefing describing a situation with calendar conflicts, emails, or planning tasks.
For each turn, you must respond with EXACTLY ONE JSON object choosing a tool to call:
Available tools:
- {"action_type": "view_calendar", "date": "2026-04-25"}
- {"action_type": "check_availability", "person": "Client_Jones"}
- {"action_type": "search_restaurants", "cuisine": "Italian", "max_price": 50, "dietary": "vegetarian", "max_distance_miles": 3.0, "near_airport": false}
- {"action_type": "schedule_meeting", "title": "Demo", "date": "2026-04-25", "time": "14:00", "duration_min": 60, "participants": ["Client_Jones"], "location": "Room A"}
- {"action_type": "reschedule_event", "event_id": "evt_1", "new_time": "15:00"}
- {"action_type": "cancel_event", "event_id": "evt_1"}
- {"action_type": "send_email", "to": "VP_Chen", "subject": "Meeting update", "body": "Hi, I need to reschedule..."}
- {"action_type": "book_restaurant", "restaurant_name": "Sky Lounge"}
- {"action_type": "submit_plan"}
IMPORTANT RULES:
1. Respond with ONLY a JSON object, no markdown, no explanation
2. Handle higher-priority items before lower-priority ones
3. When cancelling or rescheduling commitments, ALWAYS send an email to affected parties BEFORE submitting
4. Call submit_plan when you have resolved all issues
5. Never silently drop a commitment — always notify the affected person"""
# ---------------------------------------------------------------------------
# Logging helpers — exact format required by hackathon evaluator
# ---------------------------------------------------------------------------
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens a task's log.

    Args:
        task: Identifier of the task being started.
        env: Name of the environment the task runs in.
        model: Identifier of the model playing the task.
    """
    record = "[START] task={} env={} model={}".format(task, env, model)
    print(record, flush=True)
def log_step(step: int, action: str, reward: float, done: bool, error: str | None = None) -> None:
err = error if error else "null"
print(f"[STEP] step={step} action={action} reward={reward:.2f} done={'true' if done else 'false'} error={err}", flush=True)
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes a task's log.

    Args:
        success: Whether the task counted as solved (printed as true/false).
        steps: Number of steps taken.
        score: Final score, printed with three decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals.
    """
    success_text = "true" if success else "false"
    reward_list = ",".join([format(value, ".2f") for value in rewards])
    record = f"[END] success={success_text} steps={steps} score={score:.3f} rewards={reward_list}"
    print(record, flush=True)
# ---------------------------------------------------------------------------
# Environment interaction
# ---------------------------------------------------------------------------
def env_reset(task_id: str) -> Dict[str, Any]:
    """Start a fresh episode for ``task_id`` on the environment server.

    Returns:
        The ``observation`` member of the JSON reply, or the whole reply when
        it has no such member.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    reset_url = f"{ENV_BASE_URL}/reset"
    response = requests.post(reset_url, params={"task_id": task_id}, timeout=30)
    response.raise_for_status()
    payload = response.json()
    return payload.get("observation", payload)
def env_step(action: Dict[str, Any]) -> Dict[str, Any]:
    """Send one action to the environment and return the resulting observation.

    The returned observation always carries ``done`` and ``reward`` keys: the
    top-level value of the reply wins, then the observation's own value, then
    the defaults ``False`` / ``0.0``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    step_url = f"{ENV_BASE_URL}/step"
    response = requests.post(step_url, json={"action": action}, timeout=30)
    response.raise_for_status()
    payload = response.json()
    observation = payload.get("observation", payload)
    for key, default in (("done", False), ("reward", 0.0)):
        observation[key] = payload.get(key, observation.get(key, default))
    return observation
def get_task_ids() -> List[str]:
    """Fetch every task id from the server, ordered easy, medium, then hard.

    Difficulty tiers missing from the reply contribute no ids.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(f"{ENV_BASE_URL}/tasks", timeout=30)
    response.raise_for_status()
    catalog = response.json()
    return [
        task_id
        for tier in ("easy", "medium", "hard")
        for task_id in catalog.get(tier, [])
    ]
# ---------------------------------------------------------------------------
# LLM call
# ---------------------------------------------------------------------------
def call_llm(client: OpenAI, messages: List[Dict[str, str]]) -> str:
    """Request one chat completion and return the reply text.

    Args:
        client: OpenAI-compatible client to send the request through.
        messages: Conversation so far, as role/content dictionaries.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the reply carries no text.
    """
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=512,
        stream=False,
    )
    # The API may return content=None (for example on a refusal); calling
    # .strip() on it would raise AttributeError. An empty string is handled
    # downstream by parse_action's fallback.
    content = response.choices[0].message.content
    return (content or "").strip()
def parse_action(text: str) -> Dict[str, Any]:
    """Extract the action object from a model reply.

    The reply is scanned for the first position at which a complete JSON
    object can be decoded. This accepts a bare object, an object wrapped in a
    Markdown code fence (with or without a closing fence), and an object
    surrounded by explanatory prose.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON object, or ``{"action_type": "submit_plan"}`` when
        the reply contains no decodable JSON object.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode tolerates trailing text after the JSON document.
            candidate, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    # Nothing usable in the reply: end the episode rather than send garbage.
    return {"action_type": "submit_plan"}
# ---------------------------------------------------------------------------
# Run one task
# ---------------------------------------------------------------------------
def run_task(client: OpenAI, task_id: str) -> Dict[str, Any]:
    """Play one task end to end and report its outcome.

    The model is shown the briefing, calendar and inbox, then asked for one
    action per turn for at most ``MAX_STEPS`` turns. If the environment has
    not reported ``done`` by then, ``submit_plan`` is sent on the model's
    behalf. Any exception is logged as an ``error`` step instead of being
    propagated.

    Args:
        client: OpenAI-compatible client used for the model calls.
        task_id: Identifier of the task to reset the environment to.

    Returns:
        A dict with ``task_id``, ``reward`` (the final reward clamped to
        [0.01, 0.99], or 0.01 if the task never finished) and ``success``
        (True when that score is above 0.01).
    """
    rewards: List[float] = []
    steps_taken = 0
    score = 0.01
    success = False
    # Bound up front so the post-loop check is valid even if the loop body
    # never executes.
    done = False

    # Emit [START] before touching the environment so that a failing reset
    # still yields a complete START / STEP / END sequence.
    log_start(task=task_id, env="commitment-os", model=MODEL_NAME)
    try:
        obs = env_reset(task_id)
        briefing = obs.get("briefing", "")
        calendar = json.dumps(obs.get("calendar_snapshot", []), indent=2)
        inbox = json.dumps(obs.get("inbox", []), indent=2)
        messages: List[Dict[str, str]] = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"SCENARIO: {briefing}\n\nCALENDAR:\n{calendar}\n\nINBOX:\n{inbox}\n\nWhat is your first action?"},
        ]
        for step_num in range(1, MAX_STEPS + 1):
            llm_output = call_llm(client, messages)
            action = parse_action(llm_output)
            step_data = env_step(action)
            # "or 0.0" also covers an explicit null reward in the reply.
            reward = float(step_data.get("reward", 0.0) or 0.0)
            done = step_data.get("done", False)
            steps_taken = step_num
            rewards.append(reward)
            action_str = json.dumps(action, separators=(",", ":"))
            log_step(step=step_num, action=action_str, reward=reward, done=done)
            if done:
                score = max(0.01, min(0.99, reward))
                success = score > 0.01
                break
            # Feed the tool output back so the model can choose its next move.
            tool_result = step_data.get("tool_result", "")
            messages.append({"role": "assistant", "content": llm_output})
            messages.append({"role": "user", "content": f"TOOL RESULT: {tool_result}\n\nWhat is your next action?"})
        if not done:
            # Step budget exhausted: force a submission to obtain a final reward.
            step_data = env_step({"action_type": "submit_plan"})
            reward = float(step_data.get("reward", 0.0) or 0.0)
            steps_taken += 1
            rewards.append(reward)
            score = max(0.01, min(0.99, reward))
            success = score > 0.01
            log_step(step=steps_taken, action='{"action_type":"submit_plan"}', reward=reward, done=True)
    except Exception as exc:
        steps_taken = max(steps_taken, 1)
        if not rewards:
            rewards.append(0.01)
        # Collapse whitespace so a multi-line message cannot break the
        # one-line [STEP] record; fall back to the exception class name when
        # the message is empty.
        error_text = " ".join(str(exc).split()) or type(exc).__name__
        log_step(step=steps_taken, action="error", reward=0.01, done=True, error=error_text)
    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
    return {"task_id": task_id, "reward": score, "success": success}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Run every task listed by the server and print a one-line summary.

    Exits with status 1 when no API key is configured.
    """
    if not API_KEY:
        print("ERROR: Set HF_TOKEN or OPENAI_API_KEY environment variable", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    outcomes: List[Dict[str, Any]] = [
        run_task(client, task_id) for task_id in get_task_ids()
    ]

    total = len(outcomes)
    successes = sum(bool(outcome["success"]) for outcome in outcomes)
    if total > 0:
        mean_reward = sum(outcome["reward"] for outcome in outcomes) / total
    else:
        mean_reward = 0.0
    print(f"\n# Summary: {successes}/{total} tasks succeeded, mean_reward={mean_reward:.3f}", flush=True)


# Run the baseline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()