# Rhythm@28
# deploy: final verified championship submission
# ef737d3
"""
inference.py — Research-Grade Evaluation Script
Autonomy Calibration Benchmark (OpenEnv v2.0.0)
─────────────────────────────────────────────────────────────────────────────
Generates side-by-side comparison metrics between:
1. Blind Baseline (Heuristic, no investigation)
2. Smart Baseline (Deterministic investigation)
3. (Optional) Trained Agent (Selective investigation)
Usage: python3 inference.py --episodes 10
"""
import os
import json
import argparse
import requests
import numpy as np
from typing import List, Dict
# Root URL of the benchmark HTTP API. Override it with the API_BASE_URL
# environment variable; a trailing slash is stripped so endpoint paths such
# as "/reset" can be appended directly.
API_BASE = os.environ.get("API_BASE_URL", "http://localhost:8000/api").rstrip("/")

# Task identifiers that are evaluated, in the order they are reported.
TASKS = [
    "email_triage",
    "devops_incident",
    "financial_request",
]
class Evaluator:
    """Run one baseline policy against the benchmark API and collect metrics.

    For every task in ``TASKS`` the evaluator plays a number of seeded
    episodes through the HTTP endpoints under ``API_BASE`` and records, per
    task:

    * ``results`` -- the final ``episode_score`` of each episode,
    * ``investigate_stats`` -- 1/0 flags: was ``"investigate"`` ever chosen,
    * ``total_high_ambiguity`` -- episodes whose initial ambiguity exceeded
      ``HIGH_AMBIGUITY_THRESHOLD``,
    * ``high_ambiguity_failures`` -- those of the above that scored below
      ``FAILURE_SCORE_THRESHOLD``.
    """

    # An episode counts as high-ambiguity when the ambiguity reported by the
    # initial observation is strictly above this value.
    HIGH_AMBIGUITY_THRESHOLD = 0.70
    # A high-ambiguity episode scoring strictly below this value is a failure.
    FAILURE_SCORE_THRESHOLD = 0.20

    def __init__(self, mode: str):
        """Create an evaluator for ``mode`` (``"blind"`` or ``"smart"``).

        Any other mode falls back to always taking the first offered action
        (see ``select_action``).
        """
        self.mode = mode
        self.results = {t: [] for t in TASKS}
        self.investigate_stats = {t: [] for t in TASKS}
        self.high_ambiguity_failures = {t: 0 for t in TASKS}
        self.total_high_ambiguity = {t: 0 for t in TASKS}

    def _reset(self, task: str, seed: int) -> dict:
        """Start a new episode of ``task`` with ``seed``; return the JSON body.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        r = requests.post(f"{API_BASE}/reset", json={"task": task, "seed": seed}, timeout=10)
        # Fail loudly here: an error payload would otherwise be treated as an
        # observation and only blow up later with an unrelated exception.
        r.raise_for_status()
        return r.json()

    def _step(self, action_type: str) -> dict:
        """Send one action to the environment; return the JSON body.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        r = requests.post(f"{API_BASE}/step", json={"type": action_type}, timeout=10)
        r.raise_for_status()
        return r.json()

    def select_action(self, obs: dict, investigated: bool) -> str:
        """Pick the next action for the current observation.

        Policy by mode:

        * ``"blind"`` -- first offered action that is not ``"investigate"``.
        * ``"smart"`` -- ``"investigate"`` once (while it is offered and has
          not been used yet), afterwards the same choice as ``"blind"``.
        * anything else -- the first offered action.

        When only ``"investigate"`` is offered, both baselines fall back to
        the first offered action instead of crashing.

        Args:
            obs: Observation dict; ``obs["available_actions"]`` lists the
                action names that may be taken now.
            investigated: Whether ``"investigate"`` was already taken in
                this episode.

        Returns:
            The chosen action name.

        Raises:
            IndexError: if the observation offers no actions at all.
        """
        avail = obs.get("available_actions", [])
        if not avail:
            # IndexError (not ValueError) so code that already handled the
            # bare indexing failure keeps matching, now with a real message.
            raise IndexError("observation offers no available_actions to choose from")

        decisive = [a for a in avail if a != "investigate"]
        fallback = decisive[0] if decisive else avail[0]

        if self.mode == "blind":
            return fallback
        if self.mode == "smart":
            if "investigate" in avail and not investigated:
                return "investigate"
            return fallback
        return avail[0]

    def run_eval(self, episodes_per_task=10):
        """Play ``episodes_per_task`` episodes of every task and record metrics.

        Episodes are seeded ``0 .. episodes_per_task - 1``. Progress is
        printed to stdout. Results accumulate on the instance; read them
        with ``get_summary``.
        """
        print(f"\n🚀 Running evaluation: MODE = {self.mode.upper()}")
        for task in TASKS:
            print(f" Evaluating {task}...", end="", flush=True)
            for seed in range(episodes_per_task):
                obs = self._reset(task, seed)
                investigated = False
                done = False

                # Ambiguity is classified once, from the initial observation.
                ambiguity = obs.get("state", {}).get("ambiguity", 0.0)
                is_high_ambiguity = ambiguity > self.HIGH_AMBIGUITY_THRESHOLD
                if is_high_ambiguity:
                    self.total_high_ambiguity[task] += 1

                while not done:
                    action = self.select_action(obs, investigated)
                    if action == "investigate":
                        investigated = True
                    res = self._step(action)
                    done = res.get("done", False)
                    obs = res.get("observation", {})

                # The episode score is read from the final step's info dict.
                score = res.get("info", {}).get("episode_score", 0.0)
                self.results[task].append(score)
                self.investigate_stats[task].append(1 if investigated else 0)
                if is_high_ambiguity and score < self.FAILURE_SCORE_THRESHOLD:
                    self.high_ambiguity_failures[task] += 1
            print(" Done.")

    def get_summary(self):
        """Aggregate the recorded episodes into per-task metrics.

        Returns:
            ``{task: {"avg_reward", "investigate_rate",
            "failure_rate_ambiguous"}}`` with plain ``float`` values. A task
            with no recorded episodes (or no high-ambiguity episodes, for
            the failure rate) reports ``0.0`` rather than NaN.
        """
        summary = {}
        # Iterate the tracked tasks themselves; they were keyed from TASKS
        # at construction time, so the order is the same.
        for t, scores in self.results.items():
            inv_flags = self.investigate_stats[t]
            ambiguous_total = self.total_high_ambiguity[t]
            # np.mean([]) yields NaN plus a RuntimeWarning, so guard empties.
            avg_rew = float(np.mean(scores)) if scores else 0.0
            inv_rate = float(np.mean(inv_flags)) if inv_flags else 0.0
            if ambiguous_total > 0:
                fail_rate = self.high_ambiguity_failures[t] / ambiguous_total
            else:
                fail_rate = 0.0
            summary[t] = {
                "avg_reward": avg_rew,
                "investigate_rate": inv_rate,
                "failure_rate_ambiguous": fail_rate,
            }
        return summary
def print_final_report(blind: dict, smart: dict):
    """Print a per-task comparison table of the blind and smart baselines.

    For every task in ``TASKS`` three rows are printed: the blind metrics,
    the smart metrics, and the relative reward improvement of smart over
    blind in percent.

    Args:
        blind: Summary of the blind baseline, mapping each task to a dict
            with ``avg_reward``, ``investigate_rate`` and
            ``failure_rate_ambiguous``.
        smart: Summary of the smart baseline, same layout as ``blind``.
    """
    print("\n" + "="*80)
    print("🏆 RESEARCH-GRADE EVALUATION REPORT")
    print("="*80)
    print(f"{'Task':<20} | {'Mode':<8} | {'Reward':<7} | {'Inv%':<6} | {'AmbFail%':<10}")
    print("-" * 80)
    for t in TASKS:
        b = blind[t]
        s = smart[t]
        base = b['avg_reward']
        # Divide by |base| so a negative baseline still yields a correctly
        # signed percentage. The two-sided comparison (rather than `!= 0`)
        # is also false for NaN, so zero and NaN baselines both report 0.
        if base > 0 or base < 0:
            delta = ((s['avg_reward'] - base) / abs(base)) * 100
        else:
            delta = 0
        # Rewards are padded to 7 characters to match the 'Reward' header
        # and the IMPROVE row, keeping the column separators aligned.
        print(f"{t[:20]:<20} | {'BLIND':<8} | {b['avg_reward']:<7.4f} | {b['investigate_rate']*100:>5.0f}% | {b['failure_rate_ambiguous']*100:>9.0f}%")
        print(f"{'':<20} | {'SMART':<8} | {s['avg_reward']:<7.4f} | {s['investigate_rate']*100:>5.0f}% | {s['failure_rate_ambiguous']*100:>9.0f}%")
        print(f"{'':<20} | {'IMPROVE':<8} | {delta:>+6.1f}% | {'--':>6} | {'--':>10}")
        print("-" * 80)
if __name__ == "__main__":
    # Command line: only the number of episodes per task is configurable.
    cli = argparse.ArgumentParser()
    cli.add_argument("--episodes", type=int, default=10)
    episode_count = cli.parse_args().episodes

    # Evaluate the blind baseline first, then the smart baseline (the upper
    # bound), each with a fresh evaluator so their metrics do not mix.
    summaries = {}
    for baseline in ("blind", "smart"):
        evaluator = Evaluator(baseline)
        evaluator.run_eval(episode_count)
        summaries[baseline] = evaluator.get_summary()

    # Side-by-side report followed by the closing remark.
    print_final_report(summaries["blind"], summaries["smart"])
    print("\n💡 INSIGHT: The Smart baseline establishes the ceiling for 'Total Curiosity'.")
    print(" The winning GRPO model learns to bridge this gap selectively.")