| """ |
| inference.py — Research-Grade Evaluation Script |
| Autonomy Calibration Benchmark (OpenEnv v2.0.0) |
| ───────────────────────────────────────────────────────────────────────────── |
| Generates side-by-side comparison metrics between: |
| 1. Blind Baseline (Heuristic, no investigation) |
| 2. Smart Baseline (Deterministic investigation) |
| 3. (Optional) Trained Agent (Selective investigation) |
| |
| Usage: python3 inference.py --episodes 10 |
| """ |
|
|
| import os |
| import json |
| import argparse |
| import requests |
| import numpy as np |
| from typing import List, Dict |
|
|
# Base URL of the environment API. Override with the API_BASE_URL environment
# variable; trailing slashes are stripped so endpoint paths can be appended
# with a single "/".
API_BASE = os.environ.get("API_BASE_URL", "http://localhost:8000/api").rstrip("/")

# Benchmark tasks, in the order they are evaluated and reported.
TASKS = [
    "email_triage",
    "devops_incident",
    "financial_request",
]
|
|
class Evaluator:
    """Run one baseline policy against the environment API and collect metrics.

    The policy is chosen by ``mode``:

    * ``"blind"`` takes the first offered action that is not
      ``"investigate"``.
    * ``"smart"`` plays ``"investigate"`` once per episode when it is offered,
      then takes the first offered action that is not ``"investigate"``.
    * any other mode takes the first offered action.

    ``run_eval`` accumulates per-task results on the instance and
    ``get_summary`` aggregates them.
    """

    # An episode counts as "high ambiguity" when the ambiguity reported by
    # the reset observation is strictly above this value.
    HIGH_AMBIGUITY_THRESHOLD = 0.70
    # A high-ambiguity episode counts as a failure when its final score is
    # strictly below this value.
    FAILURE_SCORE_THRESHOLD = 0.20

    def __init__(self, mode: str):
        """Create an evaluator for the given policy ``mode``."""
        self.mode = mode
        # Final episode score of every episode, keyed by task.
        self.results = {t: [] for t in TASKS}
        # One 1/0 flag per episode: was "investigate" played at least once?
        self.investigate_stats = {t: [] for t in TASKS}
        # Number of high-ambiguity episodes that ended below the failure cutoff.
        self.high_ambiguity_failures = {t: 0 for t in TASKS}
        # Number of episodes that started above the ambiguity threshold.
        self.total_high_ambiguity = {t: 0 for t in TASKS}

    def _reset(self, task: str, seed: int) -> dict:
        """Start a new episode via ``POST {API_BASE}/reset``.

        Returns:
            The decoded JSON body of the response.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        r = requests.post(
            f"{API_BASE}/reset", json={"task": task, "seed": seed}, timeout=10
        )
        # Fail loudly instead of treating an error body as an observation.
        r.raise_for_status()
        return r.json()

    def _step(self, action_type: str) -> dict:
        """Submit one action via ``POST {API_BASE}/step``.

        Returns:
            The decoded JSON body of the response.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        r = requests.post(
            f"{API_BASE}/step", json={"type": action_type}, timeout=10
        )
        # Fail loudly instead of treating an error body as a step result.
        r.raise_for_status()
        return r.json()

    def select_action(self, obs: dict, investigated: bool) -> str:
        """Pick the next action according to the evaluator's mode.

        Args:
            obs: Observation dict; its ``"available_actions"`` list is the set
                of actions to choose from.
            investigated: Whether ``"investigate"`` has already been played
                in the current episode.

        Returns:
            The name of the chosen action.

        Raises:
            IndexError: If the observation offers no actions at all.
        """
        avail = obs.get("available_actions", [])
        if not avail:
            raise IndexError("observation contains no available_actions")

        # Actions other than "investigate", in the order they were offered.
        decisive = [a for a in avail if a != "investigate"]

        if self.mode == "blind":
            # Fall back to the only offered action (same rule as "smart")
            # instead of crashing when nothing but "investigate" is available.
            return decisive[0] if decisive else avail[0]

        if self.mode == "smart":
            if "investigate" in avail and not investigated:
                return "investigate"
            return decisive[0] if decisive else avail[0]

        return avail[0]

    def run_eval(self, episodes_per_task=10):
        """Play ``episodes_per_task`` episodes of every task and record results.

        Episodes are seeded ``0 .. episodes_per_task - 1``. For each episode
        the final ``info.episode_score`` of the last step (0.0 when absent)
        and whether ``"investigate"`` was played are recorded. High-ambiguity
        episodes and their failures are counted per task.
        """
        print(f"\n🚀 Running evaluation: MODE = {self.mode.upper()}")
        for task in TASKS:
            print(f" Evaluating {task}...", end="", flush=True)
            for seed in range(episodes_per_task):
                obs = self._reset(task, seed)
                investigated = False
                done = False

                # Ambiguity is read once, from the reset response.
                ambiguity = obs.get("state", {}).get("ambiguity", 0.0)
                is_high_ambiguity = ambiguity > self.HIGH_AMBIGUITY_THRESHOLD
                if is_high_ambiguity:
                    self.total_high_ambiguity[task] += 1

                while not done:
                    action = self.select_action(obs, investigated)
                    if action == "investigate":
                        investigated = True

                    res = self._step(action)
                    done = res.get("done", False)
                    obs = res.get("observation", {})

                # The score is taken from the step that ended the episode.
                score = res.get("info", {}).get("episode_score", 0.0)
                self.results[task].append(score)
                self.investigate_stats[task].append(1 if investigated else 0)

                if is_high_ambiguity and score < self.FAILURE_SCORE_THRESHOLD:
                    self.high_ambiguity_failures[task] += 1

            print(" Done.")

    def get_summary(self):
        """Aggregate the recorded results into per-task metrics.

        Returns:
            A dict keyed by task name; each value holds ``avg_reward``,
            ``investigate_rate`` and ``failure_rate_ambiguous`` as floats.
            A metric with no underlying episodes is reported as 0.0.
        """
        summary = {}
        for t in TASKS:
            scores = self.results[t]
            flags = self.investigate_stats[t]
            n_ambiguous = self.total_high_ambiguity[t]
            failures = self.high_ambiguity_failures[t]
            summary[t] = {
                # np.mean([]) would yield nan plus a RuntimeWarning.
                "avg_reward": float(np.mean(scores)) if scores else 0.0,
                "investigate_rate": float(np.mean(flags)) if flags else 0.0,
                "failure_rate_ambiguous": (
                    failures / n_ambiguous if n_ambiguous > 0 else 0.0
                ),
            }
        return summary
|
|
def print_final_report(blind: dict, smart: dict):
    """Print a table comparing the blind and smart baselines per task.

    Args:
        blind: Summary of the blind baseline, keyed by task name. Each value
            must provide ``avg_reward``, ``investigate_rate`` and
            ``failure_rate_ambiguous``.
        smart: Summary of the smart baseline, same layout as ``blind``.

    For every task in ``TASKS`` three rows are printed: BLIND, SMART and
    IMPROVE. IMPROVE is the relative reward change of smart over blind in
    percent; it is shown as +0.0% when the blind reward is not positive,
    because the ratio is undefined there.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("🏆 RESEARCH-GRADE EVALUATION REPORT")
    print(rule)
    print(f"{'Task':<20} | {'Mode':<8} | {'Reward':<7} | {'Inv%':<6} | {'AmbFail%':<10}")
    print("-" * 80)

    for t in TASKS:
        b = blind[t]
        s = smart[t]
        if b['avg_reward'] > 0:
            delta = ((s['avg_reward'] - b['avg_reward']) / b['avg_reward']) * 100
        else:
            delta = 0

        # Reward is padded to 7 columns to match the header; a bare ".4f"
        # is 6 characters wide and shifts every following "|" by one.
        print(f"{t[:20]:<20} | {'BLIND':<8} | {b['avg_reward']:<7.4f} | {b['investigate_rate']*100:>5.0f}% | {b['failure_rate_ambiguous']*100:>9.0f}%")
        print(f"{'':<20} | {'SMART':<8} | {s['avg_reward']:<7.4f} | {s['investigate_rate']*100:>5.0f}% | {s['failure_rate_ambiguous']*100:>9.0f}%")
        print(f"{'':<20} | {'IMPROVE':<8} | {delta:>+6.1f}% | {'--':>6} | {'--':>10}")
        print("-" * 80)
|
|
if __name__ == "__main__":
    # CLI entry point: evaluate the blind baseline, then the smart baseline,
    # and print the side-by-side comparison.
    cli = argparse.ArgumentParser()
    cli.add_argument("--episodes", type=int, default=10)
    options = cli.parse_args()

    # Run order matters only for console output: blind first, smart second.
    summaries = {}
    for mode in ("blind", "smart"):
        evaluator = Evaluator(mode)
        evaluator.run_eval(options.episodes)
        summaries[mode] = evaluator.get_summary()

    print_final_report(summaries["blind"], summaries["smart"])

    print("\n💡 INSIGHT: The Smart baseline establishes the ceiling for 'Total Curiosity'.")
    print(" The winning GRPO model learns to bridge this gap selectively.")
|
|