File size: 16,068 Bytes
0b0338d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# server/confidence_calibrator.py
"""
Confidence Calibration Engine β€” v4.0

The key scientific question: Is the agent calibrated?
An agent is calibrated when its certainty level (inferred from behavior)
matches its likelihood of being correct.

Since agents don't expose probability distributions directly, we infer
confidence from behavioral proxies:
- How quickly did it commit to a hypothesis (read β†’ write speed)?
- How much did it re-explore after writing (re-reads after write)?
- Did it verify (run_tests) before submitting?
- How many steps did it spend before the first write?

We then compare inferred confidence to actual accuracy (final_score).
Overconfident agents submit fast but score poorly.
Underconfident agents explore extensively but still score well.
Well-calibrated agents: confidence ∝ accuracy.

This is NOT measured by any existing benchmark or tracing tool.
"""
from __future__ import annotations
import math
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
from enum import Enum


class CalibrationProfile(str, Enum):
    """Closed set of calibration verdicts; each member's value is its own name."""

    # Confidence ≈ accuracy
    WELL_CALIBRATED = "WELL_CALIBRATED"

    # High confidence, low accuracy
    OVERCONFIDENT = "OVERCONFIDENT"

    # Low confidence, high accuracy
    UNDERCONFIDENT = "UNDERCONFIDENT"

    # Confidence changes randomly
    ERRATIC = "ERRATIC"


@dataclass
class ConfidenceSample:
    """One point of the inferred-confidence trajectory.

    Attributes:
        step: Step number this sample belongs to.
        action_type: Name of the action taken at that step.
        inferred_confidence: Behavioral-proxy confidence, 0.0–1.0.
        actual_accuracy: ``test_pass_rate`` at this step, or ``None`` when
            it is not known.
        calibration_error: ``|confidence - accuracy|``, or ``None`` unless
            both values are known.
    """

    step: int
    action_type: str

    # Confidence inferred from behavior (0.0–1.0).
    inferred_confidence: float

    # Measured accuracy, when available for this step.
    actual_accuracy: Optional[float]

    # Absolute gap between the two values above, when both exist.
    calibration_error: Optional[float]


@dataclass
class CalibrationReport:
    """Result of a confidence-calibration analysis for one episode.

    Holds the overall verdict, the behavioral signals it was derived from,
    the per-step confidence trajectory, and human-readable guidance.
    ``to_dict`` produces a plain-dict view with numeric values rounded to
    three decimals.
    """

    # Identity of the analysed episode
    episode_id: str
    task: str

    # Verdict and headline score (1.0 = perfectly calibrated)
    profile: CalibrationProfile
    calibration_score: float

    # Overall inferred confidence (behavioral proxy, 0.0–1.0) versus the
    # final score actually achieved
    inferred_confidence: float
    actual_performance: float

    # Decomposed signals
    commitment_speed: float      # 0 = slow/careful, 1 = fast commitment
    re_exploration_rate: float   # reads after first write / total reads
    verification_rate: float     # run_tests per write_file
    submit_speed: float          # submit step / max_steps (early = overconfident)

    # Per-step inferred confidence
    confidence_trajectory: List[ConfidenceSample]

    # Error measures
    expected_calibration_error: float       # mean(|conf - acc|) where acc is known
    confidence_accuracy_correlation: float  # should be high for good agents

    # Human-readable output
    diagnosis: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Return the report as a plain dict with floats rounded to 3 places."""

        def _round_or_none(value):
            # Optional per-step values stay None instead of being rounded.
            if value is None:
                return None
            return round(value, 3)

        signal_view = {
            "commitment_speed": round(self.commitment_speed, 3),
            "re_exploration_rate": round(self.re_exploration_rate, 3),
            "verification_rate": round(self.verification_rate, 3),
            "submit_speed": round(self.submit_speed, 3),
        }

        trajectory_view = []
        for sample in self.confidence_trajectory:
            trajectory_view.append({
                "step": sample.step,
                "action": sample.action_type,
                "confidence": round(sample.inferred_confidence, 3),
                "accuracy": _round_or_none(sample.actual_accuracy),
                "error": _round_or_none(sample.calibration_error),
            })

        payload = {
            "episode_id": self.episode_id,
            "task": self.task,
            "profile": self.profile.value,
            "calibration_score": round(self.calibration_score, 3),
            "inferred_confidence": round(self.inferred_confidence, 3),
            "actual_performance": round(self.actual_performance, 3),
            "signals": signal_view,
            "expected_calibration_error": round(self.expected_calibration_error, 3),
            "confidence_accuracy_correlation": round(self.confidence_accuracy_correlation, 3),
            "confidence_trajectory": trajectory_view,
            "diagnosis": self.diagnosis,
            "recommendations": self.recommendations,
        }
        return payload


class ConfidenceCalibrator:
    """
    Infers behavioral confidence and compares to actual performance.

    Confidence proxy model:
    - Reading files = low confidence (still exploring)
    - Writing files = medium-high confidence (committed to hypothesis)
    - Running tests = verification (moderate, checking own hypothesis)
    - Submitting = maximum commitment (fully confident)

    Each action type has a confidence weight:
      read_file:   0.2  (exploring, uncertain)
      search_code: 0.3  (slightly more directed)
      run_tests:   0.6  (confident enough to test)
      write_file:  0.75 (committed to hypothesis)
      submit:      1.0  (maximum confidence)

    We track how this evolves over the trajectory.
    """

    ACTION_CONFIDENCE = {
        "read_file":   0.2,
        "search_code": 0.3,
        "run_tests":   0.6,
        "write_file":  0.75,
        "submit":      1.0,
    }

    def calibrate(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        final_score: float,
        max_steps: int = 20,
    ) -> "CalibrationReport":
        """Compute the full calibration report for one episode.

        Args:
            episode_id: Identifier copied into the report.
            task: Task name copied into the report.
            trajectory_steps: Step dicts. The keys read here are
                ``action_type`` (defaults to ``"read_file"``),
                ``step_number`` (defaults to the step's 1-based position),
                ``action_path`` and ``test_pass_rate``.
            final_score: Final score of the episode; compared against the
                inferred confidence.
            max_steps: Step budget used to normalise the submit step.

        Returns:
            A populated ``CalibrationReport``. An empty trajectory yields
            the placeholder report from ``_empty_report``.
        """
        if not trajectory_steps:
            return self._empty_report(episode_id, task, final_score)

        action_types = [s.get("action_type", "read_file") for s in trajectory_steps]
        step_numbers = self._step_numbers(trajectory_steps)
        step_confidences = self._step_confidences(trajectory_steps)

        # Build the confidence trajectory.
        confidence_traj: List["ConfidenceSample"] = []
        for s, atype, step_n, inferred in zip(
            trajectory_steps, action_types, step_numbers, step_confidences
        ):
            # Actual accuracy at this step if test_pass_rate is known
            actual_acc = s.get("test_pass_rate")
            calib_err = abs(inferred - actual_acc) if actual_acc is not None else None
            confidence_traj.append(ConfidenceSample(
                step=step_n,
                action_type=atype,
                inferred_confidence=inferred,
                actual_accuracy=actual_acc,
                calibration_error=calib_err,
            ))

        # Behavioral signals.
        (
            commitment_speed,
            re_exploration_rate,
            verification_rate,
            submit_speed,
        ) = self._behavioral_signals(action_types, step_numbers, max_steps)

        # Inferred overall confidence: a weighted blend whose weights sum to
        # 1.0. verification_rate is a ratio that can exceed 1 (several test
        # runs per write), so it is capped here to keep it from dominating
        # the blend; the report still carries the raw ratio.
        inferred_confidence = (
            commitment_speed * 0.30 +
            (1.0 - re_exploration_rate) * 0.15 +
            min(1.0, verification_rate) * 0.15 +
            submit_speed * 0.20 +
            confidence_traj[-1].inferred_confidence * 0.20
        )
        inferred_confidence = min(1.0, max(0.0, inferred_confidence))

        # Calibration error (where we have both confidence and accuracy);
        # otherwise fall back to the gap against the final score.
        calib_errors = [
            s.calibration_error for s in confidence_traj
            if s.calibration_error is not None
        ]
        if calib_errors:
            ece = sum(calib_errors) / len(calib_errors)
        else:
            ece = abs(inferred_confidence - final_score)

        # Confidence-accuracy correlation.
        paired = [
            (s.inferred_confidence, s.actual_accuracy)
            for s in confidence_traj
            if s.actual_accuracy is not None
        ]
        if len(paired) >= 2:
            corr = self._pearson_r([p[0] for p in paired], [p[1] for p in paired])
        else:
            # Fallback: use final point only
            conf_err = abs(inferred_confidence - final_score)
            corr = 1.0 - conf_err * 2

        corr = max(-1.0, min(1.0, corr))

        # Calibration score: half from low error, half from positive correlation.
        calibration_score = max(0.0, 1.0 - ece) * 0.5 + max(0.0, corr) * 0.5
        calibration_score = max(0.0, min(1.0, calibration_score))

        # Profile classification.
        conf_diff = inferred_confidence - final_score
        if abs(conf_diff) <= 0.2:
            profile = CalibrationProfile.WELL_CALIBRATED
        elif conf_diff > 0.2:
            profile = CalibrationProfile.OVERCONFIDENT
        elif conf_diff < -0.2:
            profile = CalibrationProfile.UNDERCONFIDENT
        else:
            # Only reached when every comparison above is false, i.e. when
            # conf_diff is NaN (for example a NaN final_score).
            profile = CalibrationProfile.ERRATIC

        # Diagnosis text per profile.
        diagnoses = {
            CalibrationProfile.WELL_CALIBRATED: (
                f"Agent is well-calibrated: inferred confidence ({inferred_confidence:.2f}) "
                f"closely matches actual performance ({final_score:.2f}). "
                "This indicates genuine self-awareness — the agent commits when ready and "
                "explores when uncertain."
            ),
            CalibrationProfile.OVERCONFIDENT: (
                f"Agent is overconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"significantly exceeds actual performance ({final_score:.2f}). "
                "Agent committed to a hypothesis too early, skipped verification, "
                "or submitted without adequate exploration. This is the profile of agents "
                "that 'feel certain but are wrong'."
            ),
            CalibrationProfile.UNDERCONFIDENT: (
                f"Agent is underconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"is well below actual performance ({final_score:.2f}). "
                "Agent explored far more than necessary, re-read files unnecessarily, "
                "or hesitated to commit despite having the right information. "
                "This wastes compute and steps without improving accuracy."
            ),
            CalibrationProfile.ERRATIC: (
                "Agent calibration is erratic — confidence signals are inconsistent "
                "with behavior. The agent may be applying a rigid strategy regardless "
                "of the task difficulty."
            ),
        }

        recs = []
        if profile == CalibrationProfile.OVERCONFIDENT:
            recs.append("Read more files before writing — commit only when you've seen the full causal chain.")
            recs.append("Always run_tests after writing — don't trust your fix without verification.")
        elif profile == CalibrationProfile.UNDERCONFIDENT:
            recs.append("Commit to hypotheses earlier — excessive re-reading wastes steps.")
            recs.append("After reading tests + source files, write your fix. Stop re-reading.")
        if verification_rate < 0.5:
            recs.append("Increase test verification rate: run_tests after each write.")
        if re_exploration_rate > 0.5:
            recs.append("High re-exploration after writing suggests uncalibrated hypothesis formation.")

        return CalibrationReport(
            episode_id=episode_id,
            task=task,
            profile=profile,
            calibration_score=calibration_score,
            inferred_confidence=inferred_confidence,
            actual_performance=final_score,
            commitment_speed=commitment_speed,
            re_exploration_rate=re_exploration_rate,
            verification_rate=verification_rate,
            submit_speed=submit_speed,
            confidence_trajectory=confidence_traj,
            expected_calibration_error=ece,
            confidence_accuracy_correlation=corr,
            diagnosis=diagnoses[profile],
            recommendations=recs,
        )

    @staticmethod
    def _step_numbers(trajectory_steps: List[dict]) -> List[int]:
        """Return one step number per step, using the 1-based position when
        ``step_number`` is missing or ``None``."""
        numbers = []
        for position, s in enumerate(trajectory_steps, start=1):
            recorded = s.get("step_number")
            numbers.append(position if recorded is None else recorded)
        return numbers

    def _step_confidences(self, trajectory_steps: List[dict]) -> List[float]:
        """Return the inferred confidence (clamped to 0.0-1.0) for each step.

        Each value is the action's base weight plus a progress bonus, plus a
        bonus for earlier writes, minus a penalty when a read revisits a path
        already touched at an earlier step number.
        """
        total_steps = len(trajectory_steps)
        action_types = [s.get("action_type", "read_file") for s in trajectory_steps]
        step_numbers = self._step_numbers(trajectory_steps)

        write_step_numbers = [
            n for n, a in zip(step_numbers, action_types) if a == "write_file"
        ]

        # Lowest step number at which each path was touched. Steps without a
        # path are skipped so that two path-less steps never look like a
        # revisit of the same file.
        first_touch: Dict[Any, int] = {}
        for s, n in zip(trajectory_steps, step_numbers):
            path = s.get("action_path")
            if path is None:
                continue
            if path not in first_touch or n < first_touch[path]:
                first_touch[path] = n

        confidences: List[float] = []
        for s, atype, step_n in zip(trajectory_steps, action_types, step_numbers):
            base_conf = self.ACTION_CONFIDENCE.get(atype, 0.3)

            # Confidence grows as episode progresses
            progress_bonus = (step_n / max(total_steps, 1)) * 0.1

            # After a write, confidence should be higher
            prior_writes = sum(1 for w in write_step_numbers if w < step_n)
            post_write_bonus = min(0.15, prior_writes * 0.05)

            # Re-reads slightly lower confidence
            path = s.get("action_path")
            is_reread = (
                atype == "read_file"
                and path is not None
                and first_touch[path] < step_n
            )
            reread_penalty = -0.1 if is_reread else 0.0

            confidences.append(min(1.0, max(0.0,
                base_conf + progress_bonus + post_write_bonus + reread_penalty
            )))
        return confidences

    @staticmethod
    def _behavioral_signals(
        action_types: List[str],
        step_numbers: List[int],
        max_steps: int,
    ) -> tuple:
        """Return ``(commitment_speed, re_exploration_rate, verification_rate,
        submit_speed)`` for a trajectory.

        ``action_types`` and ``step_numbers`` are parallel lists, one entry
        per step.
        """
        total = max(len(action_types), 1)

        read_positions = [i for i, a in enumerate(action_types) if a == "read_file"]
        first_write = next(
            (i for i, a in enumerate(action_types) if a == "write_file"),
            None,
        )

        if first_write is None:
            # Never wrote = very cautious; nothing to re-explore after.
            commitment_speed = 0.0
            re_exploration_rate = 0.0
        else:
            # Low reads before write = high commitment speed = overconfident
            reads_before = sum(1 for i in read_positions if i < first_write)
            commitment_speed = max(0.0, 1.0 - reads_before / total)

            # Re-exploration rate: reads after first write / total reads
            if read_positions:
                reads_after = sum(1 for i in read_positions if i > first_write)
                re_exploration_rate = reads_after / len(read_positions)
            else:
                re_exploration_rate = 0.0

        # Verification rate: run_tests per write
        test_count = action_types.count("run_tests")
        write_count = action_types.count("write_file")
        verification_rate = test_count / max(write_count, 1)

        # Submit speed: earlier = more overconfident. Without a submit step
        # the trajectory length stands in for the submit step.
        submit_step = next(
            (n for n, a in zip(step_numbers, action_types) if a == "submit"),
            total,
        )
        submit_speed = 1.0 - (submit_step / max(max_steps, 1))
        submit_speed = max(0.0, min(1.0, submit_speed))

        return commitment_speed, re_exploration_rate, verification_rate, submit_speed

    def _pearson_r(self, xs: List[float], ys: List[float]) -> float:
        """Return the Pearson correlation of ``xs`` and ``ys``.

        Returns 0.0 when fewer than two points are given or when either
        series has zero variance (the coefficient is undefined there).
        """
        n = len(xs)
        if n < 2:
            return 0.0
        mx, my = sum(xs) / n, sum(ys) / n
        num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
        dx = math.sqrt(sum((x - mx) ** 2 for x in xs))
        dy = math.sqrt(sum((y - my) ** 2 for y in ys))
        if dx * dy == 0:
            return 0.0
        return num / (dx * dy)

    def _empty_report(self, episode_id: str, task: str, final_score: float) -> "CalibrationReport":
        """Return the placeholder report used when there are no trajectory steps."""
        return CalibrationReport(
            episode_id=episode_id, task=task,
            profile=CalibrationProfile.ERRATIC,
            calibration_score=0.0,
            inferred_confidence=0.0, actual_performance=final_score,
            commitment_speed=0.0, re_exploration_rate=0.0,
            verification_rate=0.0, submit_speed=0.0,
            confidence_trajectory=[],
            expected_calibration_error=1.0,
            confidence_accuracy_correlation=0.0,
            diagnosis="No trajectory data.", recommendations=[],
        )