File size: 5,804 Bytes
ef737d3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | """
inference.py β Research-Grade Evaluation Script
Autonomy Calibration Benchmark (OpenEnv v2.0.0)
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Generates side-by-side comparison metrics between:
1. Blind Baseline (Heuristic, no investigation)
2. Smart Baseline (Deterministic investigation)
3. (Optional) Trained Agent (Selective investigation)
Usage: python3 inference.py --episodes 10
"""
import os
import json
import argparse
import requests
import numpy as np
from typing import List, Dict
API_BASE = os.getenv("API_BASE_URL", "http://localhost:8000/api").rstrip("/")
TASKS = ["email_triage", "devops_incident", "financial_request"]
class Evaluator:
def __init__(self, mode: str):
self.mode = mode
self.results = {t: [] for t in TASKS}
self.investigate_stats = {t: [] for t in TASKS}
self.high_ambiguity_failures = {t: 0 for t in TASKS}
self.total_high_ambiguity = {t: 0 for t in TASKS}
def _reset(self, task: str, seed: int) -> dict:
r = requests.post(f"{API_BASE}/reset", json={"task": task, "seed": seed}, timeout=10)
return r.json()
def _step(self, action_type: str) -> dict:
r = requests.post(f"{API_BASE}/step", json={"type": action_type}, timeout=10)
return r.json()
def select_action(self, obs: dict, investigated: bool) -> str:
"""Implements the agent policy."""
avail = obs.get("available_actions", [])
ambiguity = obs.get("state", {}).get("ambiguity", 0.1)
# Mode Logic
if self.mode == "blind":
# Never investigate, just pick the first non-investigate action
return [a for a in avail if a != "investigate"][0]
elif self.mode == "smart":
# Always investigate at step 0 if available
if "investigate" in avail and not investigated:
return "investigate"
# Otherwise use heuristic
choices = [a for a in avail if a != "investigate"]
return choices[0] if choices else avail[0]
return avail[0]
def run_eval(self, episodes_per_task=10):
print(f"\nπ Running evaluation: MODE = {self.mode.upper()}")
for task in TASKS:
print(f" Evaluating {task}...", end="", flush=True)
for seed in range(episodes_per_task):
obs = self._reset(task, seed)
investigated = False
done = False
total_reward = 0
# Track ambiguity for metrics
ambiguity = obs.get("state", {}).get("ambiguity", 0.0)
is_high_ambiguity = ambiguity > 0.70
if is_high_ambiguity:
self.total_high_ambiguity[task] += 1
while not done:
action = self.select_action(obs, investigated)
if action == "investigate":
investigated = True
res = self._step(action)
done = res.get("done", False)
obs = res.get("observation", {})
score = res.get("info", {}).get("episode_score", 0.0)
self.results[task].append(score)
self.investigate_stats[task].append(1 if investigated else 0)
if is_high_ambiguity and score < 0.20:
self.high_ambiguity_failures[task] += 1
print(" Done.")
def get_summary(self):
summary = {}
for t in TASKS:
avg_rew = np.mean(self.results[t])
inv_rate = np.mean(self.investigate_stats[t])
fail_rate = self.high_ambiguity_failures[t] / self.total_high_ambiguity[t] if self.total_high_ambiguity[t] > 0 else 0
summary[t] = {
"avg_reward": avg_rew,
"investigate_rate": inv_rate,
"failure_rate_ambiguous": fail_rate
}
return summary
def print_final_report(blind: dict, smart: dict):
print("\n" + "="*80)
print("π RESEARCH-GRADE EVALUATION REPORT")
print("="*80)
print(f"{'Task':<20} | {'Mode':<8} | {'Reward':<7} | {'Inv%':<6} | {'AmbFail%':<10}")
print("-" * 80)
for t in TASKS:
b = blind[t]
s = smart[t]
delta = ((s['avg_reward'] - b['avg_reward']) / b['avg_reward']) * 100 if b['avg_reward'] > 0 else 0
print(f"{t[:20]:<20} | {'BLIND':<8} | {b['avg_reward']:.4f} | {b['investigate_rate']*100:>5.0f}% | {b['failure_rate_ambiguous']*100:>9.0f}%")
print(f"{'':<20} | {'SMART':<8} | {s['avg_reward']:.4f} | {s['investigate_rate']*100:>5.0f}% | {s['failure_rate_ambiguous']*100:>9.0f}%")
print(f"{'':<20} | {'IMPROVE':<8} | {delta:>+6.1f}% | {'--':>6} | {'--':>10}")
print("-" * 80)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--episodes", type=int, default=10)
args = parser.parse_args()
# Step 1: Evaluate Blind Baseline
blind_eval = Evaluator("blind")
blind_eval.run_eval(args.episodes)
blind_summary = blind_eval.get_summary()
# Step 2: Evaluate Smart Baseline (Upper Bound)
smart_eval = Evaluator("smart")
smart_eval.run_eval(args.episodes)
smart_summary = smart_eval.get_summary()
# Final Report
print_final_report(blind_summary, smart_summary)
print("\nπ‘ INSIGHT: The Smart baseline establishes the ceiling for 'Total Curiosity'.")
print(" The winning GRPO model learns to bridge this gap selectively.")
|