Spaces:

JOY0021
/

autonomy-calibration-benchmark

Paused

autonomy-calibration-benchmark / scripts /inference.py

Rhythm@28

deploy: final verified championship submission

ef737d3 22 days ago

5.8 kB

	"""
	inference.py — Research-Grade Evaluation Script
	Autonomy Calibration Benchmark (OpenEnv v2.0.0)
	─────────────────────────────────────────────────────────────────────────────
	Generates side-by-side comparison metrics between:
	1. Blind Baseline (Heuristic, no investigation)
	2. Smart Baseline (Deterministic investigation)
	3. (Optional) Trained Agent (Selective investigation) — not run by this
	   script; only baselines 1 and 2 are evaluated and reported here.

	Usage: python3 inference.py --episodes 10
	"""

	import os
	import json
	import argparse
	import requests
	import numpy as np
	from typing import List, Dict

	# Base URL of the environment API. Overridable through the API_BASE_URL
	# environment variable; trailing slashes are stripped so endpoint paths can
	# be appended as f"{API_BASE}/reset" without producing a double slash.
	API_BASE = os.environ.get("API_BASE_URL", "http://localhost:8000/api").rstrip("/")

	# Task identifiers sent to the /reset endpoint, in evaluation order.
	TASKS = [
	    "email_triage",
	    "devops_incident",
	    "financial_request",
	]

	class Evaluator:
	    """Run one baseline policy against the benchmark API and collect metrics.

	    The policy is selected by ``mode``:

	    * ``"blind"`` never chooses ``"investigate"``; it takes the first other
	      available action.
	    * ``"smart"`` chooses ``"investigate"`` once per episode when it is
	      offered, then takes the first other available action.
	    * any other value takes the first available action.

	    Per-task results are accumulated on the instance by :meth:`run_eval` and
	    reduced to averages by :meth:`get_summary`.
	    """

	    # An episode counts as "high ambiguity" when the ambiguity reported by
	    # the reset observation is strictly above this value.
	    HIGH_AMBIGUITY_THRESHOLD = 0.70
	    # A high-ambiguity episode counts as a failure when its final score is
	    # strictly below this value.
	    FAILURE_SCORE_THRESHOLD = 0.20

	    def __init__(self, mode: str):
	        """Create empty per-task accumulators for every task in ``TASKS``."""
	        self.mode = mode
	        # Final episode scores, one entry per episode.
	        self.results: Dict[str, List[float]] = {t: [] for t in TASKS}
	        # 1 if the episode used "investigate" at least once, else 0.
	        self.investigate_stats: Dict[str, List[int]] = {t: [] for t in TASKS}
	        # Count of high-ambiguity episodes that ended below the failure score.
	        self.high_ambiguity_failures: Dict[str, int] = {t: 0 for t in TASKS}
	        # Count of all high-ambiguity episodes.
	        self.total_high_ambiguity: Dict[str, int] = {t: 0 for t in TASKS}

	    def _reset(self, task: str, seed: int) -> dict:
	        """POST ``{"task", "seed"}`` to ``/reset`` and return the decoded JSON.

	        Raises:
	            requests.HTTPError: if the server answers with a 4xx/5xx status,
	                so an error payload is never mistaken for an observation.
	        """
	        r = requests.post(f"{API_BASE}/reset", json={"task": task, "seed": seed}, timeout=10)
	        r.raise_for_status()
	        return r.json()

	    def _step(self, action_type: str) -> dict:
	        """POST ``{"type": action_type}`` to ``/step`` and return the decoded JSON.

	        Raises:
	            requests.HTTPError: if the server answers with a 4xx/5xx status.
	        """
	        r = requests.post(f"{API_BASE}/step", json={"type": action_type}, timeout=10)
	        r.raise_for_status()
	        return r.json()

	    def select_action(self, obs: dict, investigated: bool) -> str:
	        """Pick the next action for the configured policy.

	        Args:
	            obs: Observation dict; only its ``"available_actions"`` list is read.
	            investigated: Whether ``"investigate"`` was already chosen earlier
	                in the current episode.

	        Returns:
	            One of the entries of ``obs["available_actions"]``.

	        Raises:
	            ValueError: if the observation offers no actions at all, or if the
	                blind policy is offered nothing but ``"investigate"``.
	        """
	        avail = obs.get("available_actions", [])
	        if not avail:
	            raise ValueError("observation has no available actions")
	        choices = [a for a in avail if a != "investigate"]

	        if self.mode == "blind":
	            # The blind baseline must never investigate, so there is no
	            # sensible fallback when nothing else is on offer.
	            if not choices:
	                raise ValueError("blind policy has no non-investigate action to choose")
	            return choices[0]

	        if self.mode == "smart":
	            # Investigate once, as early as it is offered.
	            if "investigate" in avail and not investigated:
	                return "investigate"
	            return choices[0] if choices else avail[0]

	        return avail[0]

	    def _run_episode(self, task: str, seed: int):
	        """Play one episode to completion.

	        Returns:
	            ``(score, investigated, high_ambiguity)`` where ``score`` is
	            ``info.episode_score`` of the last step response (0.0 if absent),
	            ``investigated`` tells whether "investigate" was chosen, and
	            ``high_ambiguity`` is derived from the reset observation.
	        """
	        obs = self._reset(task, seed)
	        ambiguity = obs.get("state", {}).get("ambiguity", 0.0)
	        high_ambiguity = ambiguity > self.HIGH_AMBIGUITY_THRESHOLD

	        investigated = False
	        done = False
	        res = {}
	        while not done:
	            action = self.select_action(obs, investigated)
	            if action == "investigate":
	                investigated = True
	            res = self._step(action)
	            done = res.get("done", False)
	            obs = res.get("observation", {})

	        score = res.get("info", {}).get("episode_score", 0.0)
	        return score, investigated, high_ambiguity

	    def run_eval(self, episodes_per_task=10):
	        """Run ``episodes_per_task`` episodes (seeds 0..n-1) for every task.

	        Results are appended to the per-task accumulators on this instance;
	        progress is printed to stdout.
	        """
	        print(f"\n🚀 Running evaluation: MODE = {self.mode.upper()}")
	        for task in self.results:
	            print(f" Evaluating {task}...", end="", flush=True)
	            for seed in range(episodes_per_task):
	                score, investigated, high_ambiguity = self._run_episode(task, seed)

	                self.results[task].append(score)
	                self.investigate_stats[task].append(1 if investigated else 0)

	                if high_ambiguity:
	                    self.total_high_ambiguity[task] += 1
	                    if score < self.FAILURE_SCORE_THRESHOLD:
	                        self.high_ambiguity_failures[task] += 1

	            print(" Done.")

	    def get_summary(self):
	        """Reduce the accumulated results to per-task averages.

	        Returns:
	            ``{task: {"avg_reward", "investigate_rate",
	            "failure_rate_ambiguous"}}``. Every value is a plain float and is
	            0.0 when there is nothing to average (no episodes, or no
	            high-ambiguity episodes), so the result never contains NaN.
	        """
	        summary = {}
	        for t, scores in self.results.items():
	            flags = self.investigate_stats[t]
	            ambiguous = self.total_high_ambiguity[t]
	            summary[t] = {
	                "avg_reward": float(np.mean(scores)) if scores else 0.0,
	                "investigate_rate": float(np.mean(flags)) if flags else 0.0,
	                "failure_rate_ambiguous": (
	                    self.high_ambiguity_failures[t] / ambiguous if ambiguous > 0 else 0.0
	                ),
	            }
	        return summary

	def print_final_report(blind: dict, smart: dict):
	    """Print a side-by-side table of the blind and smart baseline summaries.

	    Args:
	        blind: Per-task summary of the blind baseline, mapping each task name
	            to a dict with ``"avg_reward"``, ``"investigate_rate"`` and
	            ``"failure_rate_ambiguous"`` (rates are fractions in 0..1).
	        smart: Same structure for the smart baseline; it must contain every
	            task key present in ``blind``.

	    One group of three rows (BLIND, SMART, IMPROVE) is printed per task, in
	    the key order of ``blind``. IMPROVE is the relative change of the smart
	    average reward over the blind one, in percent; it is reported as 0 when
	    the blind average reward is not positive, because the ratio is undefined.
	    """
	    print("\n" + "="*80)
	    print("🏆 RESEARCH-GRADE EVALUATION REPORT")
	    print("="*80)
	    print(f"{'Task':<20} | {'Mode':<8} | {'Reward':<7} | {'Inv%':<6} | {'AmbFail%':<10}")
	    print("-" * 80)

	    for t in blind:
	        b = blind[t]
	        s = smart[t]
	        if b['avg_reward'] > 0:
	            delta = ((s['avg_reward'] - b['avg_reward']) / b['avg_reward']) * 100
	        else:
	            delta = 0

	        # Rates are stored as fractions; scale to percentages for display.
	        print(f"{t[:20]:<20} | {'BLIND':<8} | {b['avg_reward']:.4f} | {b['investigate_rate'] * 100:>5.0f}% | {b['failure_rate_ambiguous'] * 100:>9.0f}%")
	        print(f"{'':<20} | {'SMART':<8} | {s['avg_reward']:.4f} | {s['investigate_rate'] * 100:>5.0f}% | {s['failure_rate_ambiguous'] * 100:>9.0f}%")
	        print(f"{'':<20} | {'IMPROVE':<8} | {delta:>+6.1f}% | {'--':>6} | {'--':>10}")
	        print("-" * 80)

	def main():
	    """Evaluate the blind and smart baselines and print the comparison report.

	    Reads ``--episodes`` (episodes per task, default 10) from the command
	    line. Both baselines are run with the same episode count, so they see the
	    same seeds for every task.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--episodes", type=int, default=10)
	    args = parser.parse_args()
	    # Zero or negative counts would produce an empty evaluation and a
	    # meaningless report, so reject them up front.
	    if args.episodes < 1:
	        parser.error("--episodes must be a positive integer")

	    # Step 1: Evaluate Blind Baseline
	    blind_eval = Evaluator("blind")
	    blind_eval.run_eval(args.episodes)
	    blind_summary = blind_eval.get_summary()

	    # Step 2: Evaluate Smart Baseline (Upper Bound)
	    smart_eval = Evaluator("smart")
	    smart_eval.run_eval(args.episodes)
	    smart_summary = smart_eval.get_summary()

	    # Final Report
	    print_final_report(blind_summary, smart_summary)

	    print("\n💡 INSIGHT: The Smart baseline establishes the ceiling for 'Total Curiosity'.")
	    print(" The winning GRPO model learns to bridge this gap selectively.")


	if __name__ == "__main__":
	    main()