File size: 5,804 Bytes
ef737d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
inference.py — Research-Grade Evaluation Script
Autonomy Calibration Benchmark (OpenEnv v2.0.0)
─────────────────────────────────────────────────────────────────────────────
Generates side-by-side comparison metrics between:
1. Blind Baseline (Heuristic, no investigation)
2. Smart Baseline (Deterministic investigation)
3. (Optional) Trained Agent (Selective investigation)

Usage: python3 inference.py --episodes 10
"""

import os
import json
import argparse
import requests
import numpy as np
from typing import List, Dict

# Base URL of the benchmark HTTP API. Overridable through the API_BASE_URL
# environment variable; trailing "/" characters are stripped so endpoint
# paths can be appended as f"{API_BASE}/reset" without doubling the slash.
API_BASE = os.getenv("API_BASE_URL", "http://localhost:8000/api").rstrip("/")
# Task identifiers evaluated by every mode; each is sent as the "task" field
# of the /reset request and used as a key in the per-task metric dicts.
TASKS    = ["email_triage", "devops_incident", "financial_request"]

class Evaluator:
    """Run one baseline policy against the benchmark API and collect metrics.

    The policy is chosen by ``mode``:

    * ``"blind"`` - never investigates; takes the first non-investigate action.
    * ``"smart"`` - investigates once per episode when offered, then takes the
      first non-investigate action.
    * anything else - takes the first available action.

    Per-task results are accumulated on the instance by :meth:`run_eval` and
    aggregated by :meth:`get_summary`.
    """

    # An episode counts as "high ambiguity" when the ambiguity reported by the
    # reset observation is strictly above this value.
    HIGH_AMBIGUITY_THRESHOLD = 0.70
    # A high-ambiguity episode counts as a failure when its episode score is
    # strictly below this value.
    FAILURE_SCORE_THRESHOLD = 0.20

    def __init__(self, mode: str):
        """Create an evaluator for ``mode`` with empty per-task metric stores."""
        self.mode = mode
        # Episode scores, one list per task.
        self.results = {t: [] for t in TASKS}
        # 1/0 flag per episode: did the policy investigate at least once?
        self.investigate_stats = {t: [] for t in TASKS}
        # Counters used for the failure rate on high-ambiguity episodes.
        self.high_ambiguity_failures = {t: 0 for t in TASKS}
        self.total_high_ambiguity = {t: 0 for t in TASKS}

    def _reset(self, task: str, seed: int) -> dict:
        """POST to ``/reset`` for ``task``/``seed`` and return the decoded JSON.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        r = requests.post(f"{API_BASE}/reset", json={"task": task, "seed": seed}, timeout=10)
        # Fail loudly instead of treating an error payload as an observation.
        r.raise_for_status()
        return r.json()

    def _step(self, action_type: str) -> dict:
        """POST ``action_type`` to ``/step`` and return the decoded JSON.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        r = requests.post(f"{API_BASE}/step", json={"type": action_type}, timeout=10)
        r.raise_for_status()
        return r.json()

    def select_action(self, obs: dict, investigated: bool) -> str:
        """Pick the next action for the configured mode.

        Args:
            obs: Observation dict; only its ``"available_actions"`` list is read.
            investigated: Whether ``"investigate"`` was already taken this episode.

        Returns:
            The chosen action name.

        Raises:
            IndexError: if the observation offers no actions at all, or if the
                blind policy is offered nothing but ``"investigate"``. The
                exception type matches what the previous implementation raised
                in these situations; only the message is more descriptive.
        """
        avail = obs.get("available_actions", [])
        if not avail:
            raise IndexError("observation offers no available actions")

        decisive = [a for a in avail if a != "investigate"]

        if self.mode == "blind":
            # The blind baseline must never investigate, so there is no
            # sensible fallback when nothing else is offered.
            if not decisive:
                raise IndexError(
                    "blind mode requires a non-investigate action among the available actions"
                )
            return decisive[0]

        if self.mode == "smart":
            # Investigate exactly once, as early as it is offered.
            if "investigate" in avail and not investigated:
                return "investigate"
            # Otherwise use the same first-choice heuristic as the blind mode.
            return decisive[0] if decisive else avail[0]

        return avail[0]

    def run_eval(self, episodes_per_task: int = 10) -> None:
        """Play ``episodes_per_task`` episodes of every task and record metrics.

        Seeds ``0 .. episodes_per_task - 1`` are used for each task. Progress is
        printed to stdout; results are appended to the instance's metric stores.
        """
        print(f"\n🚀 Running evaluation: MODE = {self.mode.upper()}")
        for task in TASKS:
            print(f"  Evaluating {task}...", end="", flush=True)
            for seed in range(episodes_per_task):
                obs = self._reset(task, seed)
                investigated = False
                done = False

                # Ambiguity is classified from the initial observation only.
                ambiguity = obs.get("state", {}).get("ambiguity", 0.0)
                is_high_ambiguity = ambiguity > self.HIGH_AMBIGUITY_THRESHOLD
                if is_high_ambiguity:
                    self.total_high_ambiguity[task] += 1

                # done starts False, so at least one step is always taken and
                # `res` is bound when the loop exits.
                while not done:
                    action = self.select_action(obs, investigated)
                    if action == "investigate":
                        investigated = True

                    res = self._step(action)
                    done = res.get("done", False)
                    obs = res.get("observation", {})

                # The final step's info carries the score for the whole episode.
                score = res.get("info", {}).get("episode_score", 0.0)
                self.results[task].append(score)
                self.investigate_stats[task].append(1 if investigated else 0)

                if is_high_ambiguity and score < self.FAILURE_SCORE_THRESHOLD:
                    self.high_ambiguity_failures[task] += 1

            print(" Done.")

    def get_summary(self) -> dict:
        """Aggregate the recorded metrics per task.

        Returns:
            ``{task: {"avg_reward", "investigate_rate",
            "failure_rate_ambiguous"}}``. A metric with no underlying episodes
            is reported as ``0.0`` rather than NaN.
        """
        summary = {}
        # self.results is keyed by the tasks this evaluator was created with,
        # in the same order.
        for t in self.results:
            scores = self.results[t]
            flags = self.investigate_stats[t]
            ambiguous = self.total_high_ambiguity[t]
            # np.mean([]) yields NaN and a RuntimeWarning, so guard empties.
            avg_rew = float(np.mean(scores)) if scores else 0.0
            inv_rate = float(np.mean(flags)) if flags else 0.0
            fail_rate = self.high_ambiguity_failures[t] / ambiguous if ambiguous > 0 else 0.0
            summary[t] = {
                "avg_reward": avg_rew,
                "investigate_rate": inv_rate,
                "failure_rate_ambiguous": fail_rate,
            }
        return summary

def print_final_report(blind: dict, smart: dict):
    """Print a side-by-side table of the blind and smart baseline summaries.

    Args:
        blind: Per-task summary of the blind baseline; each task maps to a
            dict with "avg_reward", "investigate_rate" and
            "failure_rate_ambiguous".
        smart: Per-task summary of the smart baseline, same structure.

    For every task in TASKS three rows are printed: the blind metrics, the
    smart metrics, and the relative reward improvement of smart over blind.
    """
    print("\n" + "="*80)
    print("🏆 RESEARCH-GRADE EVALUATION REPORT")
    print("="*80)
    print(f"{'Task':<20} | {'Mode':<8} | {'Reward':<7} | {'Inv%':<6} | {'AmbFail%':<10}")
    print("-" * 80)

    for t in TASKS:
        b = blind[t]
        s = smart[t]
        # Relative improvement in percent; reported as 0 when the blind
        # reward is not positive, since the ratio is undefined or misleading.
        delta = ((s['avg_reward'] - b['avg_reward']) / b['avg_reward']) * 100 if b['avg_reward'] > 0 else 0

        # Reward values are padded to the 7-character header width so the
        # following columns line up with the header row.
        print(f"{t[:20]:<20} | {'BLIND':<8} | {b['avg_reward']:<7.4f} | {b['investigate_rate']*100:>5.0f}% | {b['failure_rate_ambiguous']*100:>9.0f}%")
        print(f"{'':<20} | {'SMART':<8} | {s['avg_reward']:<7.4f} | {s['investigate_rate']*100:>5.0f}% | {s['failure_rate_ambiguous']*100:>9.0f}%")
        print(f"{'':<20} | {'IMPROVE':<8} | {delta:>+6.1f}% | {'--':>6} | {'--':>10}")
        print("-" * 80)

if __name__ == "__main__":
    # Command-line entry point: evaluate both baselines with the same number
    # of episodes per task, then print the comparison report.
    parser = argparse.ArgumentParser(
        description="Compare the blind and smart baselines on every benchmark task."
    )
    parser.add_argument(
        "--episodes",
        type=int,
        default=10,
        help="number of episodes (seeds) to run per task (default: 10)",
    )
    args = parser.parse_args()
    # Zero or negative counts would run no episodes and yield an empty report.
    if args.episodes < 1:
        parser.error("--episodes must be a positive integer")

    # Step 1: Evaluate Blind Baseline
    blind_eval = Evaluator("blind")
    blind_eval.run_eval(args.episodes)
    blind_summary = blind_eval.get_summary()

    # Step 2: Evaluate Smart Baseline (Upper Bound)
    smart_eval = Evaluator("smart")
    smart_eval.run_eval(args.episodes)
    smart_summary = smart_eval.get_summary()

    # Final Report
    print_final_report(blind_summary, smart_summary)

    print("\n💡 INSIGHT: The Smart baseline establishes the ceiling for 'Total Curiosity'.")
    print("   The winning GRPO model learns to bridge this gap selectively.")