Spaces:
Sleeping
Sleeping
| """Baseline inference script for CommitmentOS. | |
| Uses an OpenAI-compatible LLM to play through all 15 scenarios. | |
| Multi-turn: the agent gets the briefing, makes tool calls, then submits. | |
| Environment variables (only the API key is required; the others have defaults): | |
| API_BASE_URL — OpenAI-compatible endpoint | |
| MODEL_NAME — model identifier | |
| HF_TOKEN — API key (also checked as OPENAI_API_KEY) | |
| ENV_BASE_URL — CommitmentOS server URL (default: HF Space) | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import sys | |
| import time | |
| from typing import Any, Dict, List | |
| import requests | |
| from openai import OpenAI | |
| from dotenv import load_dotenv | |
| # --------------------------------------------------------------------------- | |
| # Configuration | |
| # --------------------------------------------------------------------------- | |
# Load a local .env file (if present) before the environment is read below.
load_dotenv()

# OpenAI-compatible chat endpoint and the model requested from it.
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
# HF_TOKEN wins over OPENAI_API_KEY; an empty string means "no key configured".
API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or ""
# Request URLs are built as f"{ENV_BASE_URL}/<route>", so a trailing slash in
# the configured value would produce "//<route>"; strip it once here.
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "https://jayant2304-commitment-os.hf.space").rstrip("/")
# Maximum number of LLM-chosen actions per task before a submit is forced.
MAX_STEPS = 12
| # System message placed first in every task's chat history. It lists the JSON | |
| # shape of each tool call the model may emit and the rules it must follow. | |
| # NOTE(review): blank lines inside this literal appear to have been lost in | |
| # this rendering of the file; confirm against the original before editing. | |
| SYSTEM_PROMPT = """You are an expert executive assistant AI. You manage calendars, emails, and dining reservations. | |
| You will be given a scenario briefing describing a situation with calendar conflicts, emails, or planning tasks. | |
| For each turn, you must respond with EXACTLY ONE JSON object choosing a tool to call: | |
| Available tools: | |
| - {"action_type": "view_calendar", "date": "2026-04-25"} | |
| - {"action_type": "check_availability", "person": "Client_Jones"} | |
| - {"action_type": "search_restaurants", "cuisine": "Italian", "max_price": 50, "dietary": "vegetarian", "max_distance_miles": 3.0, "near_airport": false} | |
| - {"action_type": "schedule_meeting", "title": "Demo", "date": "2026-04-25", "time": "14:00", "duration_min": 60, "participants": ["Client_Jones"], "location": "Room A"} | |
| - {"action_type": "reschedule_event", "event_id": "evt_1", "new_time": "15:00"} | |
| - {"action_type": "cancel_event", "event_id": "evt_1"} | |
| - {"action_type": "send_email", "to": "VP_Chen", "subject": "Meeting update", "body": "Hi, I need to reschedule..."} | |
| - {"action_type": "book_restaurant", "restaurant_name": "Sky Lounge"} | |
| - {"action_type": "submit_plan"} | |
| IMPORTANT RULES: | |
| 1. Respond with ONLY a JSON object, no markdown, no explanation | |
| 2. Handle higher-priority items before lower-priority ones | |
| 3. When cancelling or rescheduling commitments, ALWAYS send an email to affected parties BEFORE submitting | |
| 4. Call submit_plan when you have resolved all issues | |
| 5. Never silently drop a commitment — always notify the affected person""" | |
| # --------------------------------------------------------------------------- | |
| # Logging helpers — exact format required by hackathon evaluator | |
| # --------------------------------------------------------------------------- | |
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens one task's log.

    Args:
        task: Identifier of the task about to run.
        env: Name of the environment being exercised.
        model: Identifier of the model driving the episode.
    """
    record = f"[START] task={task} env={env} model={model}"
    # Flushed immediately so the record is visible even if the run dies later.
    print(record, flush=True)
def log_step(step: int, action: str, reward: float, done: bool, error: str | None = None) -> None:
    """Print one single-line ``[STEP]`` record.

    Args:
        step: 1-based index of the step within the task.
        action: Compact textual form of the action that was sent.
        reward: Reward returned for this step; printed with two decimals.
        done: Whether the episode ended on this step.
        error: Optional error description; ``None`` or empty prints ``null``.
    """
    # Exception messages can span several lines; collapse all whitespace runs
    # so the record always stays on exactly one line.
    flattened = " ".join(error.split()) if error else ""
    err = flattened or "null"
    done_str = "true" if done else "false"
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_str} error={err}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes one task's log.

    Args:
        success: Whether the task counted as solved.
        steps: Number of steps taken.
        score: Final score; printed with three decimals.
        rewards: Per-step rewards; printed comma-separated with two decimals.
    """
    outcome = "true" if success else "false"
    formatted = [f"{value:.2f}" for value in rewards]
    rewards_csv = ",".join(formatted)
    print(
        f"[END] success={outcome} steps={steps} score={score:.3f} rewards={rewards_csv}",
        flush=True,
    )
| # --------------------------------------------------------------------------- | |
| # Environment interaction | |
| # --------------------------------------------------------------------------- | |
def env_reset(task_id: str) -> Dict[str, Any]:
    """Start a fresh episode for ``task_id`` and return its first observation.

    POSTs to ``{ENV_BASE_URL}/reset`` with ``task_id`` as a query parameter.

    Returns:
        The ``"observation"`` member of the JSON reply when present,
        otherwise the whole reply.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.post(
        f"{ENV_BASE_URL}/reset",
        params={"task_id": task_id},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()
    if "observation" in payload:
        return payload["observation"]
    return payload
def env_step(action: Dict[str, Any]) -> Dict[str, Any]:
    """Send one action to the environment and return the resulting observation.

    POSTs ``{"action": action}`` to ``{ENV_BASE_URL}/step``.

    Returns:
        The ``"observation"`` member of the reply (or the whole reply when
        absent), with ``"done"`` and ``"reward"`` keys always set.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.post(
        f"{ENV_BASE_URL}/step",
        json={"action": action},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()
    observation = payload.get("observation", payload)
    # Top-level values win; otherwise keep what the observation already has,
    # and fall back to a neutral default when neither carries the key.
    for key, default in (("done", False), ("reward", 0.0)):
        observation[key] = payload.get(key, observation.get(key, default))
    return observation
def get_task_ids() -> List[str]:
    """Fetch every task id from ``{ENV_BASE_URL}/tasks``.

    Returns:
        The ids listed under the ``"easy"``, ``"medium"`` and ``"hard"`` keys
        of the reply, concatenated in that order. Missing keys contribute
        nothing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(f"{ENV_BASE_URL}/tasks", timeout=30)
    response.raise_for_status()
    catalogue = response.json()
    return [
        task_id
        for level in ("easy", "medium", "hard")
        for task_id in catalogue.get(level, [])
    ]
| # --------------------------------------------------------------------------- | |
| # LLM call | |
| # --------------------------------------------------------------------------- | |
def call_llm(client: OpenAI, messages: List[Dict[str, str]]) -> str:
    """Request one non-streaming chat completion and return its text.

    Args:
        client: OpenAI-compatible client to send the request through.
        messages: Chat history as role/content dictionaries.

    Returns:
        The first choice's message text with surrounding whitespace removed,
        or ``""`` when the reply carries no choices or no text content.
    """
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=512,
        stream=False,
    )
    if not response.choices:
        return ""
    # ``content`` is optional in the API (e.g. refusals or tool-call-only
    # replies), so guard against None before stripping.
    content = response.choices[0].message.content
    return (content or "").strip()
def parse_action(text: str) -> Dict[str, Any]:
    """Extract the action dictionary from a model reply.

    Accepts a bare JSON object, or an object wrapped in a Markdown code fence
    or surrounded by prose.

    Args:
        text: Raw model output.

    Returns:
        The decoded action. When no JSON object can be recovered, returns
        ``{"action_type": "submit_plan"}`` so the episode still terminates.
    """
    fallback: Dict[str, Any] = {"action_type": "submit_plan"}
    candidate = (text or "").strip()
    if not candidate:
        return fallback

    # Fast path: the whole reply is exactly one JSON document.
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Slow path: decode from each "{" so fences, language tags and prose
    # around the object are ignored. Only objects carrying "action_type" are
    # accepted, so a nested object from a truncated reply is never mistaken
    # for the action.
    decoder = json.JSONDecoder()
    start = candidate.find("{")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(candidate, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and "action_type" in parsed:
            return parsed
        start = candidate.find("{", start + 1)
    return fallback
| # --------------------------------------------------------------------------- | |
| # Run one task | |
| # --------------------------------------------------------------------------- | |
def run_task(client: OpenAI, task_id: str) -> Dict[str, Any]:
    """Play one task end to end and log it in START/STEP/END form.

    The model sees the briefing, calendar and inbox, then picks one action
    per turn for at most ``MAX_STEPS`` turns. If the episode has not ended by
    then, a ``submit_plan`` action is sent on its behalf. Any exception is
    logged as a final error step instead of propagating, so the remaining
    tasks can still run.

    Args:
        client: OpenAI-compatible client used for the model calls.
        task_id: Identifier of the task to reset the environment to.

    Returns:
        A dictionary with ``"task_id"``, ``"reward"`` (the final reward
        clamped to [0.01, 0.99]) and ``"success"`` (True when that score is
        above 0.01).
    """
    rewards: List[float] = []
    steps_taken = 0
    score = 0.01
    success = False

    # Logged before any network call so that a failed reset still produces a
    # complete START / STEP / END sequence.
    log_start(task=task_id, env="commitment-os", model=MODEL_NAME)
    try:
        obs = env_reset(task_id)
        briefing = obs.get("briefing", "")
        calendar = json.dumps(obs.get("calendar_snapshot", []), indent=2)
        inbox = json.dumps(obs.get("inbox", []), indent=2)
        messages: List[Dict[str, str]] = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"SCENARIO: {briefing}\n\nCALENDAR:\n{calendar}\n\nINBOX:\n{inbox}\n\nWhat is your first action?"},
        ]

        done = False
        for step_num in range(1, MAX_STEPS + 1):
            llm_output = call_llm(client, messages)
            action = parse_action(llm_output)
            step_data = env_step(action)
            # ``or 0.0`` covers a reward that is present but null.
            reward = float(step_data.get("reward", 0.0) or 0.0)
            done = step_data.get("done", False)
            steps_taken = step_num
            rewards.append(reward)
            action_str = json.dumps(action, separators=(",", ":"))
            log_step(step=step_num, action=action_str, reward=reward, done=done)
            if done:
                # The task score is the reward of the terminal step.
                score = max(0.01, min(0.99, reward))
                success = score > 0.01
                break
            tool_result = step_data.get("tool_result", "")
            messages.append({"role": "assistant", "content": llm_output})
            messages.append({"role": "user", "content": f"TOOL RESULT: {tool_result}\n\nWhat is your next action?"})

        if not done:
            # Step budget exhausted: submit on the model's behalf so the
            # episode is still graded.
            step_data = env_step({"action_type": "submit_plan"})
            reward = float(step_data.get("reward", 0.0) or 0.0)
            steps_taken += 1
            rewards.append(reward)
            score = max(0.01, min(0.99, reward))
            success = score > 0.01
            log_step(step=steps_taken, action='{"action_type":"submit_plan"}', reward=reward, done=True)
    except Exception as exc:
        # Record the failure as its own step so step numbers stay unique and
        # ``rewards`` has one entry per logged step.
        steps_taken += 1
        rewards.append(0.01)
        log_step(step=steps_taken, action="error", reward=0.01, done=True, error=str(exc))
    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
    return {"task_id": task_id, "reward": score, "success": success}
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
def main() -> None:
    """Run every task listed by the server and print a summary line.

    Exits with status 1 (after a message on stderr) when no API key is
    configured.
    """
    if not API_KEY:
        print("ERROR: Set HF_TOKEN or OPENAI_API_KEY environment variable", file=sys.stderr)
        sys.exit(1)

    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    results: List[Dict[str, Any]] = [
        run_task(llm_client, task_id) for task_id in get_task_ids()
    ]

    total = len(results)
    successes = len([outcome for outcome in results if outcome["success"]])
    if total > 0:
        mean_reward = sum(outcome["reward"] for outcome in results) / total
    else:
        # No tasks returned: report zero instead of dividing by zero.
        mean_reward = 0.0
    print(f"\n# Summary: {successes}/{total} tasks succeeded, mean_reward={mean_reward:.3f}", flush=True)


if __name__ == "__main__":
    main()