Spaces:

Jayant2304
/

commitment-os

Sleeping

jayantaggarwal-sketch

Sync latest code and non-binary artifacts

af8810b 27 days ago

8.79 kB

	"""Baseline inference script for CommitmentOS.

	Uses an OpenAI-compatible LLM to play through all 15 scenarios.
	Multi-turn: the agent gets the briefing, makes tool calls, then submits.

	Environment variables (read after loading a local .env file):
	API_BASE_URL — OpenAI-compatible endpoint (default: https://api.openai.com/v1)
	MODEL_NAME — model identifier (default: gpt-4o-mini)
	HF_TOKEN — API key, required (OPENAI_API_KEY is used as a fallback)
	ENV_BASE_URL — CommitmentOS server URL (default: HF Space)
	"""

	from __future__ import annotations

	import json
	import os
	import sys
	import time
	from typing import Any, Dict, List

	import requests
	from openai import OpenAI
	from dotenv import load_dotenv

	# ---------------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------------

	# Populate the process environment via python-dotenv before the os.getenv
	# lookups below are evaluated.
	load_dotenv()

	# OpenAI-compatible endpoint and the model identifier sent with each request.
	API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
	MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
	# API key: HF_TOKEN takes precedence, then OPENAI_API_KEY; "" when neither is set.
	API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or ""
	# Base URL of the CommitmentOS environment server (/reset, /step, /tasks).
	ENV_BASE_URL = os.getenv("ENV_BASE_URL", "https://jayant2304-commitment-os.hf.space")

	# Maximum number of LLM-chosen actions per task before submit_plan is forced.
	MAX_STEPS = 12

	# System message sent as the first chat message of every task. It lists the
	# JSON tool-call shapes the model may emit and the rules it must follow.
	SYSTEM_PROMPT = """You are an expert executive assistant AI. You manage calendars, emails, and dining reservations.

	You will be given a scenario briefing describing a situation with calendar conflicts, emails, or planning tasks.

	For each turn, you must respond with EXACTLY ONE JSON object choosing a tool to call:

	Available tools:
	- {"action_type": "view_calendar", "date": "2026-04-25"}
	- {"action_type": "check_availability", "person": "Client_Jones"}
	- {"action_type": "search_restaurants", "cuisine": "Italian", "max_price": 50, "dietary": "vegetarian", "max_distance_miles": 3.0, "near_airport": false}
	- {"action_type": "schedule_meeting", "title": "Demo", "date": "2026-04-25", "time": "14:00", "duration_min": 60, "participants": ["Client_Jones"], "location": "Room A"}
	- {"action_type": "reschedule_event", "event_id": "evt_1", "new_time": "15:00"}
	- {"action_type": "cancel_event", "event_id": "evt_1"}
	- {"action_type": "send_email", "to": "VP_Chen", "subject": "Meeting update", "body": "Hi, I need to reschedule..."}
	- {"action_type": "book_restaurant", "restaurant_name": "Sky Lounge"}
	- {"action_type": "submit_plan"}

	IMPORTANT RULES:
	1. Respond with ONLY a JSON object, no markdown, no explanation
	2. Handle higher-priority items before lower-priority ones
	3. When cancelling or rescheduling commitments, ALWAYS send an email to affected parties BEFORE submitting
	4. Call submit_plan when you have resolved all issues
	5. Never silently drop a commitment — always notify the affected person"""


	# ---------------------------------------------------------------------------
	# Logging helpers — exact format required by hackathon evaluator
	# ---------------------------------------------------------------------------

	def log_start(task: str, env: str, model: str) -> None:
	    """Print the ``[START]`` record that opens a task's log.

	    Args:
	        task: Identifier of the task being started.
	        env: Name of the environment the task runs in.
	        model: Identifier of the model playing the task.
	    """
	    line = f"[START] task={task} env={env} model={model}"
	    # Flush immediately so the record is visible even if the process dies later.
	    print(line, flush=True)


	def log_step(step: int, action: str, reward: float, done: bool, error: str | None = None) -> None:
	    """Print one ``[STEP]`` record describing a single environment step.

	    Args:
	        step: 1-based step number within the task.
	        action: The action taken, already serialized to a string.
	        reward: Reward reported for this step; printed with two decimals.
	        done: Whether the episode ended on this step (printed as true/false).
	        error: Error message for a failed step; ``None`` or ``""`` prints "null".
	    """
	    err = error if error else "null"
	    # Lower-case literals rather than Python's True/False repr.
	    done_str = "true" if done else "false"
	    print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_str} error={err}", flush=True)


	def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
	    """Print the ``[END]`` record that closes a task's log.

	    Args:
	        success: Whether the task counted as solved (printed as true/false).
	        steps: Number of steps taken.
	        score: Final score; printed with three decimals.
	        rewards: Per-step rewards; printed comma-separated with two decimals.
	    """
	    formatted = [f"{r:.2f}" for r in rewards]
	    rewards_str = ",".join(formatted)
	    print(f"[END] success={'true' if success else 'false'} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)


	# ---------------------------------------------------------------------------
	# Environment interaction
	# ---------------------------------------------------------------------------

	def env_reset(task_id: str) -> Dict[str, Any]:
	    """Reset the environment server to ``task_id`` and return the first observation.

	    POSTs to ``{ENV_BASE_URL}/reset`` with ``task_id`` as a query parameter.

	    Returns:
	        The ``"observation"`` entry of the JSON response when present,
	        otherwise the whole response body.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    response = requests.post(
	        f"{ENV_BASE_URL}/reset",
	        params={"task_id": task_id},
	        timeout=30,
	    )
	    response.raise_for_status()
	    payload = response.json()
	    # Some responses wrap the observation, others are the observation itself.
	    return payload.get("observation", payload)


	def env_step(action: Dict[str, Any]) -> Dict[str, Any]:
	    """Send one action to the environment server and return the observation.

	    POSTs ``{"action": action}`` as JSON to ``{ENV_BASE_URL}/step``.

	    Returns:
	        The observation dict (the ``"observation"`` entry of the response, or
	        the whole body when that key is absent), with ``"done"`` and
	        ``"reward"`` always set. Top-level response values win over values
	        already in the observation; the defaults are ``False`` and ``0.0``.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    response = requests.post(
	        f"{ENV_BASE_URL}/step",
	        json={"action": action},
	        timeout=30,
	    )
	    response.raise_for_status()
	    payload = response.json()
	    observation = payload.get("observation", payload)
	    # Copy the episode-level fields onto the observation so callers need
	    # to look in only one place.
	    for key, default in (("done", False), ("reward", 0.0)):
	        observation[key] = payload.get(key, observation.get(key, default))
	    return observation


	def get_task_ids() -> List[str]:
	    """Fetch every task id from ``{ENV_BASE_URL}/tasks``.

	    Returns:
	        Task ids from the response's ``"easy"``, ``"medium"`` and ``"hard"``
	        lists, concatenated in that order. A missing key contributes nothing.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    response = requests.get(f"{ENV_BASE_URL}/tasks", timeout=30)
	    response.raise_for_status()
	    catalog = response.json()
	    return [
	        task_id
	        for tier in ("easy", "medium", "hard")
	        for task_id in catalog.get(tier, [])
	    ]


	# ---------------------------------------------------------------------------
	# LLM call
	# ---------------------------------------------------------------------------

	def call_llm(client: OpenAI, messages: List[Dict[str, str]]) -> str:
	    """Request one chat completion and return the reply text.

	    Args:
	        client: OpenAI-compatible client used to issue the request.
	        messages: Conversation so far, as role/content dicts.

	    Returns:
	        The first choice's message content with surrounding whitespace
	        removed. Returns ``""`` when the response has no choices or the
	        content is ``None``, so callers always receive a ``str``.
	    """
	    response = client.chat.completions.create(
	        model=MODEL_NAME,
	        messages=messages,
	        temperature=0.2,
	        max_tokens=512,
	        stream=False,
	    )
	    # Some providers return no choices at all; treat that as an empty reply.
	    if not response.choices:
	        return ""
	    # The content field is nullable, so guard before calling .strip().
	    content = response.choices[0].message.content
	    return (content or "").strip()


	def parse_action(text: str) -> Dict[str, Any]:
	    """Extract the JSON action object from raw model output.

	    The whole text is tried as JSON first. If that does not yield an object,
	    the text is scanned for the first embedded JSON object. This covers
	    Markdown code fences (closed or not), explanatory prose around the
	    JSON, and an object wrapped in a list.

	    Args:
	        text: Raw reply from the model.

	    Returns:
	        The decoded action dict, or ``{"action_type": "submit_plan"}`` when
	        no JSON object can be recovered.
	    """
	    cleaned = text.strip()

	    try:
	        parsed = json.loads(cleaned)
	    except json.JSONDecodeError:
	        parsed = None
	    if isinstance(parsed, dict):
	        return parsed

	    # Try each "{" as a candidate start; raw_decode ignores trailing text
	    # such as a closing fence or a follow-up sentence.
	    decoder = json.JSONDecoder()
	    start = cleaned.find("{")
	    while start != -1:
	        try:
	            candidate, _ = decoder.raw_decode(cleaned[start:])
	        except json.JSONDecodeError:
	            candidate = None
	        if isinstance(candidate, dict):
	            return candidate
	        start = cleaned.find("{", start + 1)

	    # Nothing usable: end the episode rather than send a malformed action.
	    return {"action_type": "submit_plan"}


	# ---------------------------------------------------------------------------
	# Run one task
	# ---------------------------------------------------------------------------

	def run_task(client: OpenAI, task_id: str) -> Dict[str, Any]:
	    """Play one scenario end to end and report its final score.

	    Resets the environment, then alternates LLM calls and environment steps
	    for up to ``MAX_STEPS`` turns. If the episode has not ended by then, a
	    ``submit_plan`` action is sent. Any exception is logged as an error step
	    and the task finishes with the minimum score instead of propagating.

	    Args:
	        client: OpenAI-compatible client used for the LLM calls.
	        task_id: Identifier of the scenario to run.

	    Returns:
	        A dict with ``"task_id"``, ``"reward"`` (final score clamped to
	        [0.01, 0.99]) and ``"success"`` (True when the score exceeds 0.01).
	    """
	    rewards: List[float] = []
	    steps_taken = 0
	    score = 0.01
	    success = False
	    # Bound up front so the post-loop check is safe even if the loop body
	    # never runs (e.g. MAX_STEPS set to 0).
	    done = False

	    try:
	        # Emit [START] before any network call so a failed reset still yields
	        # a complete START/STEP/END record for this task.
	        log_start(task=task_id, env="commitment-os", model=MODEL_NAME)
	        obs = env_reset(task_id)

	        briefing = obs.get("briefing", "")
	        calendar = json.dumps(obs.get("calendar_snapshot", []), indent=2)
	        inbox = json.dumps(obs.get("inbox", []), indent=2)

	        messages: List[Dict[str, str]] = [
	            {"role": "system", "content": SYSTEM_PROMPT},
	            {"role": "user", "content": f"SCENARIO: {briefing}\n\nCALENDAR:\n{calendar}\n\nINBOX:\n{inbox}\n\nWhat is your first action?"},
	        ]

	        for step_num in range(1, MAX_STEPS + 1):
	            llm_output = call_llm(client, messages)
	            action = parse_action(llm_output)

	            step_data = env_step(action)
	            # "or 0.0" turns an explicit null reward into 0.0 before float().
	            reward = float(step_data.get("reward", 0.0) or 0.0)
	            done = step_data.get("done", False)
	            steps_taken = step_num
	            rewards.append(reward)

	            action_str = json.dumps(action, separators=(",", ":"))
	            log_step(step=step_num, action=action_str, reward=reward, done=done)

	            if done:
	                # The reward of the terminal step is the task score.
	                score = max(0.01, min(0.99, reward))
	                success = score > 0.01
	                break

	            # Feed the tool result back so the model can pick its next action.
	            tool_result = step_data.get("tool_result", "")
	            messages.append({"role": "assistant", "content": llm_output})
	            messages.append({"role": "user", "content": f"TOOL RESULT: {tool_result}\n\nWhat is your next action?"})

	        if not done:
	            # Step budget exhausted: force a submission to obtain a score.
	            step_data = env_step({"action_type": "submit_plan"})
	            reward = float(step_data.get("reward", 0.0) or 0.0)
	            steps_taken += 1
	            rewards.append(reward)
	            score = max(0.01, min(0.99, reward))
	            success = score > 0.01
	            log_step(step=steps_taken, action='{"action_type":"submit_plan"}', reward=reward, done=True)

	    except Exception as exc:
	        # Top-level boundary for one task: record the failure and keep going
	        # so the remaining tasks still run.
	        steps_taken = max(steps_taken, 1)
	        if not rewards:
	            rewards.append(0.01)
	        log_step(step=steps_taken, action="error", reward=0.01, done=True, error=str(exc))

	    finally:
	        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

	    return {"task_id": task_id, "reward": score, "success": success}


	# ---------------------------------------------------------------------------
	# Main
	# ---------------------------------------------------------------------------

	def main() -> None:
	    """Run every task returned by the environment server and print a summary.

	    Exits with status 1 (after a message on stderr) when no API key is
	    configured.
	    """
	    if not API_KEY:
	        print("ERROR: Set HF_TOKEN or OPENAI_API_KEY environment variable", file=sys.stderr)
	        sys.exit(1)

	    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
	    results: List[Dict[str, Any]] = [
	        run_task(llm_client, task_id) for task_id in get_task_ids()
	    ]

	    total = len(results)
	    successes = len([entry for entry in results if entry["success"]])
	    if total > 0:
	        mean_reward = sum(entry["reward"] for entry in results) / total
	    else:
	        # No tasks ran; avoid dividing by zero.
	        mean_reward = 0.0
	    print(f"\n# Summary: {successes}/{total} tasks succeeded, mean_reward={mean_reward:.3f}", flush=True)


	# Run the full baseline only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()