File size: 10,678 Bytes
dfbd16e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# server/advanced_metrics.py
"""
Advanced Metrics Engine.

Computes metrics that existing benchmarks (SWE-bench, etc.) completely ignore:
- Exploration vs Exploitation ratio across episode
- Consistency score across multiple runs of same task
- Reliability index (weighted aggregate)
- Reasoning efficiency (useful actions / total actions)
- Decision entropy (how predictable/focused the agent is)
"""
import math
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field


@dataclass
class AdvancedMetricsReport:
    """Container for the advanced metrics of one episode.

    The first six fields are required and describe a single episode. The
    cross-episode fields and the breakdown collections are optional and
    default to zero / empty.
    """

    # --- Per-episode scalars -------------------------------------------------
    # Useful steps divided by total steps.
    reasoning_efficiency: float
    # Share of read+search actions among read+search+write+test actions.
    exploration_ratio: float
    # Shannon entropy of the action-type distribution.
    decision_entropy: float
    # Composite (weighted) reliability score.
    reliability_index: float
    # Strategy changes per 10 steps.
    pivot_rate: float
    # Redundant actions divided by total actions.
    wasteful_ratio: float

    # --- Cross-episode (filled in when a score history is available) ---------
    # Derived from the spread of scores across runs; less spread = higher value.
    consistency_score: float = 0.0
    # Number of runs that went into consistency_score.
    runs_analyzed: int = 0

    # --- Breakdowns ----------------------------------------------------------
    action_distribution: Dict[str, int] = field(default_factory=dict)
    useful_actions: List[str] = field(default_factory=list)
    wasteful_actions: List[str] = field(default_factory=list)
    reliability_breakdown: Dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the report as a plain dict with floats rounded to 3 places.

        Scalar metrics and the values of ``reliability_breakdown`` are
        rounded; counts, lists and ``action_distribution`` are passed through
        unchanged (the same objects, not copies).
        """
        rounded_fields = (
            "reasoning_efficiency",
            "exploration_ratio",
            "decision_entropy",
            "reliability_index",
            "pivot_rate",
            "wasteful_ratio",
            "consistency_score",
        )
        payload = {name: round(getattr(self, name), 3) for name in rounded_fields}

        payload["runs_analyzed"] = self.runs_analyzed
        payload["action_distribution"] = self.action_distribution
        payload["useful_actions"] = self.useful_actions
        payload["wasteful_actions"] = self.wasteful_actions

        rounded_breakdown = {}
        for component, value in self.reliability_breakdown.items():
            rounded_breakdown[component] = round(value, 3)
        payload["reliability_breakdown"] = rounded_breakdown

        return payload


class AdvancedMetricsEngine:
    """
    Computes advanced behavioral and reliability metrics from trajectory data.

    The engine remembers every ``final_score`` passed to :meth:`compute`; that
    internal history backs the consistency score whenever the caller does not
    supply a non-empty ``history`` of its own.

    Usage:
        engine = AdvancedMetricsEngine()
        report = engine.compute(
            trajectory_steps=[...],
            variant_meta={...},
            final_score=0.7,
            files_read=[...],
            files_written=[...],
            history=[],  # Pass previous episode scores for consistency
        )
    """

    # Action types counted as exploration vs exploitation.
    _EXPLORE_ACTIONS = ("read_file", "search_code")
    _EXPLOIT_ACTIONS = ("write_file", "run_tests")

    # variant_meta keys whose file lists make a read count as "relevant".
    _RELEVANT_FILE_KEYS = (
        "bug_files",
        "interface_files",
        "read_first_files",
        "files_to_implement",
    )

    # Weights of the reliability index components: correctness matters most.
    _RELIABILITY_WEIGHTS = {
        "correctness": 0.40,
        "efficiency": 0.20,
        "focus": 0.15,
        "verification": 0.15,
        "safety": 0.10,
    }

    # Amount subtracted from the safety component per security flag.
    _SECURITY_FLAG_PENALTY = 0.2

    def __init__(self):
        self._score_history: List[float] = []  # Tracks scores across episodes

    def compute(
        self,
        trajectory_steps: List[dict],
        variant_meta: Dict[str, Any],
        final_score: float,
        files_read: List[str],
        files_written: List[str],
        history: Optional[List[float]] = None,
    ) -> "AdvancedMetricsReport":
        """Compute all advanced metrics for one episode.

        Args:
            trajectory_steps: One dict per step. The keys read here are
                ``action_type``, ``action_path``, ``error``,
                ``test_pass_rate`` and ``security_flags``; all are optional.
            variant_meta: Task metadata. The file lists under ``bug_files``,
                ``interface_files``, ``read_first_files`` and
                ``files_to_implement`` define which reads are "relevant".
                Missing keys and ``None`` values are treated as empty.
            final_score: Score of this episode. It is always appended to the
                engine's internal history, even for an empty trajectory.
            files_read: Paths read during the episode.
            files_written: Paths written during the episode.
            history: Scores to base the consistency score on. When non-empty
                it is used as-is (``final_score`` is NOT added to it); when
                ``None`` or empty, the engine's internal history (which
                includes this episode) is used instead.

        Returns:
            An ``AdvancedMetricsReport``. For an empty trajectory a fixed
            placeholder report is returned without cross-episode data.
        """
        # Record this score in history
        self._score_history.append(final_score)

        if not trajectory_steps:
            return AdvancedMetricsReport(
                reasoning_efficiency=0.0,
                exploration_ratio=0.5,
                decision_entropy=0.0,
                reliability_index=0.0,
                pivot_rate=0.0,
                wasteful_ratio=1.0,
            )

        from collections import Counter

        action_seq = [s.get("action_type", "unknown") for s in trajectory_steps]
        total = len(action_seq)

        # ── Action distribution ───────────────────────────────────────────────
        dist = Counter(action_seq)
        action_distribution = dict(dist)

        # ── Decision entropy (Shannon entropy of action types) ────────────────
        normalized_entropy = self._normalized_entropy(dist.values(), total)

        # ── Exploration vs exploitation ratio ─────────────────────────────────
        explore = sum(dist.get(a, 0) for a in self._EXPLORE_ACTIONS)
        exploit = sum(dist.get(a, 0) for a in self._EXPLOIT_ACTIONS)
        if explore + exploit > 0:
            exploration_ratio = explore / (explore + exploit)
        else:
            exploration_ratio = 0.5  # neither kind of action was taken

        # ── Redundancy / wasteful actions ─────────────────────────────────────
        redundant_reads, error_actions, wasteful_steps = self._waste_stats(
            trajectory_steps
        )
        # Each wasteful step counts once, so the ratio stays within [0, 1]
        # even when a repeated read also produced an error.
        wasteful_ratio = wasteful_steps / total

        wasteful_actions = []
        if redundant_reads > 0:
            wasteful_actions.append(f"{redundant_reads}x redundant file reads")
        if error_actions > 0:
            wasteful_actions.append(f"{error_actions}x actions that produced errors")

        # ── Useful action detection ───────────────────────────────────────────
        useful_actions = []
        relevant = set()
        for key in self._RELEVANT_FILE_KEYS:
            # `or ()` tolerates keys that are present but set to None.
            relevant.update(variant_meta.get(key) or ())
        relevant_reads = [f for f in files_read if f in relevant]
        if relevant_reads:
            useful_actions.append(f"Read {len(relevant_reads)} key files: {relevant_reads[:3]}")

        test_rates = [
            s.get("test_pass_rate")
            for s in trajectory_steps
            if s.get("test_pass_rate") is not None
        ]
        if len(test_rates) >= 2 and test_rates[-1] > test_rates[0]:
            useful_actions.append(
                f"Test pass rate improved from {test_rates[0]:.2f} to {test_rates[-1]:.2f}"
            )

        if files_written:
            useful_actions.append(f"Wrote {len(files_written)} file(s): {files_written[:3]}")

        # ── Reasoning efficiency ──────────────────────────────────────────────
        # Relevant reads count individually; writing and testing each count
        # once no matter how often they happened.
        useful_count = len(relevant_reads) + (1 if files_written else 0) + (1 if test_rates else 0)
        reasoning_efficiency = min(1.0, useful_count / max(total, 1))

        # ── Pivot rate (strategy switches per 10 steps) ───────────────────────
        pivot_rate = self._pivot_rate(action_seq)

        # ── Reliability index ─────────────────────────────────────────────────
        reliability_breakdown = {
            "correctness": final_score,
            "efficiency": max(0.0, 1.0 - wasteful_ratio),
            "focus": 1.0 - normalized_entropy,  # Low entropy = focused behavior
            "verification": 1.0 if test_rates else 0.0,
            "safety": 1.0,  # Reduced below when security flags are present
        }

        # `or []` tolerates steps whose security_flags is explicitly None.
        sec_flags = sum(len(s.get("security_flags") or []) for s in trajectory_steps)
        if sec_flags > 0:
            reliability_breakdown["safety"] = max(
                0.0, 1.0 - sec_flags * self._SECURITY_FLAG_PENALTY
            )

        reliability_index = sum(
            reliability_breakdown[k] * weight
            for k, weight in self._RELIABILITY_WEIGHTS.items()
        )

        # ── Consistency score (cross-episode) ────────────────────────────────
        scores_to_use = list(history) if history else self._score_history
        runs_analyzed = len(scores_to_use)
        consistency_score = self._consistency_score(scores_to_use)

        return AdvancedMetricsReport(
            reasoning_efficiency=reasoning_efficiency,
            exploration_ratio=exploration_ratio,
            decision_entropy=normalized_entropy,
            reliability_index=reliability_index,
            pivot_rate=pivot_rate,
            wasteful_ratio=wasteful_ratio,
            consistency_score=consistency_score,
            runs_analyzed=runs_analyzed,
            action_distribution=action_distribution,
            useful_actions=useful_actions,
            wasteful_actions=wasteful_actions,
            reliability_breakdown=reliability_breakdown,
        )

    @staticmethod
    def _normalized_entropy(counts, total: int) -> float:
        """Return Shannon entropy of ``counts`` scaled into [0, 1].

        The entropy is divided by its maximum, log2 of the number of distinct
        action types. Fewer than two distinct types yields 0.0.
        """
        positive = [c for c in counts if c > 0]
        if total <= 0 or len(positive) < 2:
            return 0.0
        entropy = 0.0
        for count in positive:
            p = count / total
            entropy -= p * math.log2(p)
        # Clamp: rounding can push a uniform distribution a hair above 1.0,
        # which would make the derived "focus" component negative.
        return min(1.0, entropy / math.log2(len(positive)))

    @staticmethod
    def _waste_stats(trajectory_steps: List[dict]) -> tuple:
        """Count wasteful steps in a trajectory.

        Returns:
            ``(redundant_reads, error_actions, wasteful_steps)`` where
            ``redundant_reads`` counts ``read_file`` steps whose
            ``action_path`` was already read earlier, ``error_actions`` counts
            steps with a truthy ``error``, and ``wasteful_steps`` counts steps
            matching either condition exactly once.
        """
        seen_paths = set()
        redundant_reads = 0
        error_actions = 0
        wasteful_steps = 0
        for step in trajectory_steps:
            path = step.get("action_path")
            is_redundant = False
            if step.get("action_type") == "read_file" and path:
                is_redundant = path in seen_paths
                seen_paths.add(path)
            has_error = bool(step.get("error"))

            if is_redundant:
                redundant_reads += 1
            if has_error:
                error_actions += 1
            if is_redundant or has_error:
                wasteful_steps += 1
        return redundant_reads, error_actions, wasteful_steps

    @classmethod
    def _pivot_rate(cls, action_seq: List[str]) -> float:
        """Return explore<->exploit switches between adjacent steps, per 10 steps."""
        total = len(action_seq)
        if total == 0:
            return 0.0
        pivots = 0
        for prev, curr in zip(action_seq, action_seq[1:]):
            explore_to_exploit = prev in cls._EXPLORE_ACTIONS and curr in cls._EXPLOIT_ACTIONS
            exploit_to_explore = prev in cls._EXPLOIT_ACTIONS and curr in cls._EXPLORE_ACTIONS
            if explore_to_exploit or exploit_to_explore:
                pivots += 1
        return (pivots / total) * 10

    @staticmethod
    def _consistency_score(scores: List[float]) -> float:
        """Return 1 - (population std dev / mean) of ``scores``, floored at 0.

        Fewer than two scores yields 0.0. The mean is floored at 0.01 in the
        divisor to avoid dividing by zero.
        """
        runs = len(scores)
        if runs < 2:
            return 0.0
        mean = sum(scores) / runs
        variance = sum((s - mean) ** 2 for s in scores) / runs
        std_dev = math.sqrt(variance)
        return max(0.0, 1.0 - (std_dev / max(mean, 0.01)))

    def get_score_history(self) -> List[float]:
        """Return a copy of the scores recorded by :meth:`compute` so far."""
        return list(self._score_history)

    def reset_history(self) -> None:
        """Forget all recorded scores."""
        self._score_history = []