Spaces:

JOY0021
/

autonomy-calibration-benchmark

Paused

File size: 5,804 Bytes

ef737d3

"""
inference.py — Research-Grade Evaluation Script
Autonomy Calibration Benchmark (OpenEnv v2.0.0)
─────────────────────────────────────────────────────────────────────────────
Generates side-by-side comparison metrics between:
1. Blind Baseline (Heuristic, no investigation)
2. Smart Baseline (Deterministic investigation)
3. (Optional) Trained Agent (Selective investigation)

Usage: python3 inference.py --episodes 10
"""

import os
import json
import argparse
import requests
import numpy as np
from typing import List, Dict

# Root URL of the benchmark HTTP API. Overridable through the API_BASE_URL
# environment variable; trailing slashes are stripped so endpoint paths can be
# appended as f"{API_BASE}/<endpoint>".
API_BASE = os.environ.get("API_BASE_URL", "http://localhost:8000/api").rstrip("/")

# Task identifiers sent to the /reset endpoint, in evaluation/report order.
TASKS = [
    "email_triage",
    "devops_incident",
    "financial_request",
]

class Evaluator:
    """Run a fixed baseline policy against the benchmark API and collect metrics.

    Episodes are driven over HTTP: ``POST {API_BASE}/reset`` starts an episode
    and ``POST {API_BASE}/step`` advances it. Per task, the evaluator records
    the episode score, whether the policy investigated, and how many
    high-ambiguity episodes ended with a low score.

    Supported modes:
      * ``"blind"`` - never investigates; takes the first non-investigate action.
      * ``"smart"`` - investigates once when possible, then takes the first
        non-investigate action.
      * anything else - takes the first available action.
    """

    # An episode is "high ambiguity" when its initial ambiguity exceeds this.
    HIGH_AMBIGUITY_THRESHOLD = 0.70
    # A high-ambiguity episode scoring below this counts as a failure.
    FAILURE_SCORE_THRESHOLD = 0.20

    def __init__(self, mode: str):
        """Create an evaluator for ``mode`` with empty per-task metric stores."""
        self.mode = mode
        self.results = {t: [] for t in TASKS}
        self.investigate_stats = {t: [] for t in TASKS}
        self.high_ambiguity_failures = {t: 0 for t in TASKS}
        self.total_high_ambiguity = {t: 0 for t in TASKS}

    def _reset(self, task: str, seed: int) -> dict:
        """Start a new episode for ``task``/``seed`` and return the decoded JSON reply.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        r = requests.post(f"{API_BASE}/reset", json={"task": task, "seed": seed}, timeout=10)
        # Fail loudly on HTTP errors instead of treating an error payload as an observation.
        r.raise_for_status()
        return r.json()

    def _step(self, action_type: str) -> dict:
        """Send one action to the environment and return the decoded JSON reply.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        r = requests.post(f"{API_BASE}/step", json={"type": action_type}, timeout=10)
        r.raise_for_status()
        return r.json()

    def select_action(self, obs: dict, investigated: bool) -> str:
        """Choose the next action for the current observation.

        Args:
            obs: Observation dict; its ``"available_actions"`` list is consulted.
            investigated: Whether ``"investigate"`` was already taken this episode.

        Returns:
            The name of the selected action.

        Raises:
            ValueError: if no action is available, or if blind mode is offered
                nothing but ``"investigate"``.
        """
        avail = obs.get("available_actions", [])
        if not avail:
            raise ValueError("Observation lists no available actions; cannot select one.")

        non_investigate = [a for a in avail if a != "investigate"]

        if self.mode == "blind":
            # The blind baseline must never investigate, so there is no valid fallback.
            if not non_investigate:
                raise ValueError(
                    "Blind mode requires a non-investigate action, "
                    f"but only {avail!r} is available."
                )
            return non_investigate[0]

        if self.mode == "smart":
            # Investigate once, as early as the environment allows it.
            if "investigate" in avail and not investigated:
                return "investigate"
            # Otherwise take the first non-investigate action, or whatever is left.
            return non_investigate[0] if non_investigate else avail[0]

        return avail[0]

    def run_eval(self, episodes_per_task=10, max_steps=1000):
        """Run ``episodes_per_task`` episodes of every task and record the metrics.

        Seeds ``0 .. episodes_per_task - 1`` are used for each task.

        Args:
            episodes_per_task: Number of episodes (seeds) to run per task.
            max_steps: Upper bound on steps per episode; guards against an
                environment that never reports ``done``.

        Raises:
            RuntimeError: if an episode does not finish within ``max_steps`` steps.
            ValueError: propagated from ``select_action`` when no action can be chosen.
            requests.HTTPError: propagated from the HTTP helpers.
        """
        print(f"\n🚀 Running evaluation: MODE = {self.mode.upper()}")
        for task in TASKS:
            print(f"  Evaluating {task}...", end="", flush=True)
            for seed in range(episodes_per_task):
                obs = self._reset(task, seed)
                investigated = False

                # Ambiguity is sampled once, from the initial observation.
                ambiguity = obs.get("state", {}).get("ambiguity", 0.0)
                is_high_ambiguity = ambiguity > self.HIGH_AMBIGUITY_THRESHOLD
                if is_high_ambiguity:
                    self.total_high_ambiguity[task] += 1

                for _ in range(max_steps):
                    action = self.select_action(obs, investigated)
                    if action == "investigate":
                        investigated = True

                    res = self._step(action)
                    obs = res.get("observation", {})
                    if res.get("done", False):
                        break
                else:
                    raise RuntimeError(
                        f"Episode did not finish within {max_steps} steps "
                        f"(task={task!r}, seed={seed}, mode={self.mode!r})."
                    )

                # The final step's info carries the episode score.
                score = res.get("info", {}).get("episode_score", 0.0)
                self.results[task].append(score)
                self.investigate_stats[task].append(1 if investigated else 0)

                if is_high_ambiguity and score < self.FAILURE_SCORE_THRESHOLD:
                    self.high_ambiguity_failures[task] += 1

            print(" Done.")

    def get_summary(self):
        """Aggregate the recorded metrics per task.

        Returns:
            A dict mapping each task to ``avg_reward`` (mean episode score),
            ``investigate_rate`` (fraction of episodes that investigated) and
            ``failure_rate_ambiguous`` (failed / total high-ambiguity episodes).
            Every rate is ``0.0`` when there is no data to average.
        """
        summary = {}
        for task, scores in self.results.items():
            inv_flags = self.investigate_stats[task]
            n_ambiguous = self.total_high_ambiguity[task]
            # np.mean([]) yields nan plus a RuntimeWarning, so guard the empty case.
            avg_rew = float(np.mean(scores)) if scores else 0.0
            inv_rate = float(np.mean(inv_flags)) if inv_flags else 0.0
            fail_rate = self.high_ambiguity_failures[task] / n_ambiguous if n_ambiguous > 0 else 0.0
            summary[task] = {
                "avg_reward": avg_rew,
                "investigate_rate": inv_rate,
                "failure_rate_ambiguous": fail_rate,
            }
        return summary

def print_final_report(blind: dict, smart: dict):
    """Print a side-by-side table comparing the blind and smart baselines.

    Args:
        blind: Per-task summary of the blind baseline. Each value is a dict with
            the keys ``avg_reward``, ``investigate_rate`` and
            ``failure_rate_ambiguous``.
        smart: Per-task summary of the smart baseline, same shape; it must
            contain every task present in ``blind``.

    For each task in ``blind`` three rows are printed: BLIND, SMART and IMPROVE
    (relative reward change of smart over blind, in percent). When the blind
    reward is not positive the relative change is undefined and shown as ``n/a``.
    """
    rule = "-" * 80
    print("\n" + "="*80)
    print("🏆 RESEARCH-GRADE EVALUATION REPORT")
    print("="*80)
    print(f"{'Task':<20} | {'Mode':<8} | {'Reward':<7} | {'Inv%':<6} | {'AmbFail%':<10}")
    print(rule)

    # Iterate over the tasks actually present in the summaries being reported.
    for t in blind:
        b = blind[t]
        s = smart[t]

        if b['avg_reward'] > 0:
            delta = ((s['avg_reward'] - b['avg_reward']) / b['avg_reward']) * 100
            improve = f"{delta:>+6.1f}%"
        else:
            # A relative improvement over a zero baseline is undefined.
            improve = f"{'n/a':>7}"

        # Reward cells are padded to 7 characters so they line up with the header.
        print(f"{t[:20]:<20} | {'BLIND':<8} | {b['avg_reward']:<7.4f} | {b['investigate_rate']*100:>5.0f}% | {b['failure_rate_ambiguous']*100:>9.0f}%")
        print(f"{'':<20} | {'SMART':<8} | {s['avg_reward']:<7.4f} | {s['investigate_rate']*100:>5.0f}% | {s['failure_rate_ambiguous']*100:>9.0f}%")
        print(f"{'':<20} | {'IMPROVE':<8} | {improve} | {'--':>6} | {'--':>10}")
        print(rule)

if __name__ == "__main__":
    # Command line: --episodes N sets the number of episodes run per task.
    cli = argparse.ArgumentParser()
    cli.add_argument("--episodes", type=int, default=10)
    episode_count = cli.parse_args().episodes

    # Evaluate the blind baseline first, then the smart baseline (upper bound).
    summaries = {}
    for mode_name in ("blind", "smart"):
        evaluator = Evaluator(mode_name)
        evaluator.run_eval(episode_count)
        summaries[mode_name] = evaluator.get_summary()

    # Side-by-side comparison table.
    print_final_report(summaries["blind"], summaries["smart"])

    print("\n💡 INSIGHT: The Smart baseline establishes the ceiling for 'Total Curiosity'.")
    print("   The winning GRPO model learns to bridge this gap selectively.")