File size: 11,487 Bytes
28f702f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
"""Unit tests for counterfeint.eval_suite — parser and writer layers.

These tests intentionally stay below the network boundary: we exercise the
pure ``_parse_episode_metrics`` extraction helper and the JSON / markdown /
PNG writers against hand-crafted episode-result dicts so the test suite
runs without a live CounterFeint server.
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from counterfeint.eval_suite import (
    EVAL_SEEDS,
    AggregatedMetrics,
    EpisodeMetrics,
    _aggregate_per_task,
    _parse_episode_metrics,
    _write_eval_json,
    _write_eval_plot,
    _write_eval_summary_md,
    summarize_real_world_holdout,
)


def _make_episode_result(

    *,

    task_id: str = "task_1",

    grader_score: float = 0.5,

    track_a: float = 0.9,

    track_b: float = 0.95,

    verdicts: dict | None = None,

    remaining_budget: int = 4,

    total_ads: int = 12,

    investigator_fallback: int = 0,

    steps: int = 30,

    end_reason: str | None = "audit_complete",

    error: str | None = None,

) -> dict:
    verdicts = verdicts if verdicts is not None else {}
    return {
        "task_id": task_id,
        "grader_score": grader_score,
        "steps": steps,
        "end_reason": end_reason,
        "rewards_by_role": {"investigator": 1.5, "fraudster": -0.5, "auditor": 0.0},
        "fallback_counts": {"investigator": investigator_fallback, "fraudster": 0},
        "final_state": {
            "audit_report": {
                "investigator_audit_score": track_a,
                "fraudster_plausibility_score": track_b,
            },
            "investigator_state": {
                "total_ads": total_ads,
                "remaining_budget": remaining_budget,
                "verdicts": verdicts,
            },
        },
        **({"error": error} if error is not None else {}),
    }


class TestEvalSeeds:
    """Sanity checks on the ``EVAL_SEEDS`` table imported from eval_suite."""

    # Expected number of seeds per task: 10 for each training-tier task
    # (task_1..3) and 5 for the held-out generalisation task
    # (task_3_unseen).  The unseen task gets fewer seeds so that a pure
    # generalisation probe does not double eval wallclock — see
    # eval_suite.EVAL_SEEDS.
    EXPECTED_SEED_COUNTS = {
        "task_1": 10,
        "task_2": 10,
        "task_3": 10,
        "task_3_unseen": 5,
    }

    def test_expected_tasks_with_expected_seed_counts(self) -> None:
        """Task ids match exactly and each task has the expected unique seeds."""
        wanted = self.EXPECTED_SEED_COUNTS
        assert set(EVAL_SEEDS) == set(wanted)
        for task_id in wanted:
            task_seeds = EVAL_SEEDS[task_id]
            n_expected = wanted[task_id]
            assert len(task_seeds) == n_expected, f"{task_id} has wrong seed count"
            # A set collapses repeats, so a shorter set means duplicates.
            assert len(set(task_seeds)) == n_expected, f"{task_id} has duplicate seeds"

    def test_seeds_disjoint_from_training_seed(self) -> None:
        """No eval seed equals 42 and none falls below 1000."""
        all_seeds = set().union(*EVAL_SEEDS.values())
        # The training baseline runs with seed=42 plus small self-play
        # seeds; keeping eval seeds at 1000 and above rules out collisions.
        assert 42 not in all_seeds
        assert not any(seed < 1000 for seed in all_seeds)

    def test_seed_ranges_disjoint_across_tasks(self) -> None:
        """No seed appears twice, so a failing seed maps to exactly one task."""
        seen: dict = {}
        for task_id in EVAL_SEEDS:
            for s in EVAL_SEEDS[task_id]:
                assert s not in seen, f"seed {s} reused across {seen[s]} and {task_id}"
                seen[s] = task_id


class TestParseEpisodeMetrics:
    """Checks on ``_parse_episode_metrics`` using hand-built result dicts."""

    def test_parses_headline_fields(self) -> None:
        """Default fixture values come back on the parsed ``EpisodeMetrics``."""
        m = _parse_episode_metrics("before", "task_1", 1001, _make_episode_result())
        assert isinstance(m, EpisodeMetrics)
        # Identity fields are the positional arguments passed to the parser.
        assert (m.tag, m.task_id, m.seed) == ("before", "task_1", 1001)
        # Scores match the fixture defaults.
        assert m.grader_score == pytest.approx(0.5)
        assert m.track_a_score == pytest.approx(0.9)
        assert m.track_b_score == pytest.approx(0.95)
        assert (m.steps, m.end_reason) == (30, "audit_complete")
        assert m.rewards_by_role["investigator"] == 1.5

    def test_counts_fraud_leaks_and_ground_truth_totals(self) -> None:
        """Three ground-truth fraud ads, two of them approved, are counted."""
        # ad id -> (verdict, ground truth)
        outcomes = {
            "ad_1": ("approve", "fraud"),
            "ad_2": ("reject", "fraud"),
            "ad_3": ("approve", "legit"),
            "ad_4": ("approve", "fraud"),
            "ad_5": ("escalate", "escalate"),
        }
        verdicts = {
            ad_id: {"verdict": verdict, "ground_truth": truth}
            for ad_id, (verdict, truth) in outcomes.items()
        }
        m = _parse_episode_metrics(
            "x", "task_1", 1, _make_episode_result(verdicts=verdicts)
        )
        assert m.n_ground_truth_fraud == 3
        assert m.n_fraud_leaks == 2  # ad_1 and ad_4

    def test_budget_used_pct_from_remaining_budget(self) -> None:
        """10 total ads with 3 remaining gives 7/10 = 0.7 consumed."""
        episode = _make_episode_result(total_ads=10, remaining_budget=3)
        m = _parse_episode_metrics("x", "task_1", 1, episode)
        assert m.budget_used_pct == pytest.approx(0.7)

    def test_budget_pct_clamps_to_unit_interval(self) -> None:
        """A remaining budget larger than total_ads still yields a value in [0, 1]."""
        # Degenerate input: remaining_budget exceeds total_ads.
        episode = _make_episode_result(total_ads=5, remaining_budget=100)
        pct = _parse_episode_metrics("x", "task_1", 1, episode).budget_used_pct
        assert pct >= 0.0 and pct <= 1.0

    def test_budget_pct_zero_when_no_ads(self) -> None:
        """With zero total ads the budget percentage is 0.0."""
        episode = _make_episode_result(total_ads=0, remaining_budget=0)
        m = _parse_episode_metrics("x", "task_1", 1, episode)
        assert m.budget_used_pct == 0.0

    def test_investigator_fallback_count_extracted(self) -> None:
        """The investigator fallback count is exposed as ``fallback_count``."""
        episode = _make_episode_result(investigator_fallback=4)
        m = _parse_episode_metrics("x", "task_1", 1, episode)
        assert m.fallback_count == 4

    def test_missing_audit_report_defaults_to_one(self) -> None:
        """An empty audit report makes both track scores come back as 1.0."""
        episode = _make_episode_result()
        final_state = episode["final_state"]
        final_state["audit_report"] = {}
        m = _parse_episode_metrics("x", "task_1", 1, episode)
        for score in (m.track_a_score, m.track_b_score):
            assert score == pytest.approx(1.0)

    def test_error_round_trips(self) -> None:
        """An ``"error"`` entry in the result is carried onto the metrics."""
        episode = _make_episode_result(error="boom")
        m = _parse_episode_metrics("x", "task_1", 1, episode)
        assert m.error == "boom"


class TestAggregation:
    """Checks on ``_aggregate_per_task`` over parsed episode metrics."""

    def test_aggregates_only_valid_episodes(self) -> None:
        """Two clean episodes are averaged; the errored one is only counted."""
        # (seed, keyword overrides for the episode fixture)
        specs = [
            (1, {"grader_score": 0.8}),
            (2, {"grader_score": 0.6}),
            (3, {"grader_score": 0.0, "error": "boom"}),
        ]
        eps = [
            _parse_episode_metrics(
                "after", "task_1", seed, _make_episode_result(**overrides)
            )
            for seed, overrides in specs
        ]
        agg = _aggregate_per_task("after", "task_1", eps)
        assert isinstance(agg, AggregatedMetrics)
        assert agg.n_episodes == 2  # the errored one is excluded
        assert agg.errors == 1
        # Mean of 0.8 and 0.6 only.
        assert agg.grader_score_mean == pytest.approx(0.7)

    def test_all_errors_returns_zeroed_aggregate(self) -> None:
        """With only an errored episode: zero episodes, one error, fallbacks kept."""
        failed = _make_episode_result(error="x", investigator_fallback=2)
        eps = [_parse_episode_metrics("x", "task_1", 1, failed)]
        agg = _aggregate_per_task("x", "task_1", eps)
        assert agg.n_episodes == 0
        assert agg.errors == 1
        assert agg.fallback_count_total == 2


class TestArtefactWriters:
    """Checks on the JSON, markdown and plot writers."""

    def _make_before_after(self, tmp_path: Path) -> tuple:
        """Build matching before/after fixtures for ``task_1``.

        Uses the first two ``task_1`` eval seeds.  "before" episodes carry
        grader_score=0.4 / track_a=0.7; "after" episodes carry
        grader_score=0.8 / track_a=0.95.

        Args:
            tmp_path: Accepted for signature compatibility; not used here.

        Returns:
            ``(before_eps, after_eps, before_agg, after_agg)``, each a dict
            keyed by ``"task_1"``.
        """
        seeds = EVAL_SEEDS["task_1"][:2]

        def build(tag: str, grader_score: float, track_a: float) -> list:
            # One parsed episode per seed, all sharing the same scores.
            return [
                _parse_episode_metrics(
                    tag,
                    "task_1",
                    seed,
                    _make_episode_result(grader_score=grader_score, track_a=track_a),
                )
                for seed in seeds
            ]

        before_eps = {"task_1": build("before", 0.4, 0.7)}
        after_eps = {"task_1": build("after", 0.8, 0.95)}
        before_agg = {
            "task_1": _aggregate_per_task("before", "task_1", before_eps["task_1"])
        }
        after_agg = {
            "task_1": _aggregate_per_task("after", "task_1", after_eps["task_1"])
        }
        return before_eps, after_eps, before_agg, after_agg

    def test_write_eval_json_roundtrips(self, tmp_path: Path) -> None:
        """The JSON file reloads with the schema id, tags and two episodes per side."""
        before_eps, after_eps, _, _ = self._make_before_after(tmp_path)
        out = tmp_path / "eval_results.json"
        _write_eval_json(before_eps, after_eps, "before", "after", out)

        raw = out.read_text(encoding="utf-8")
        loaded = json.loads(raw)
        assert loaded["schema"] == "counterfeint.eval_suite.v1"
        assert loaded["tags"] == {"before": "before", "after": "after"}
        for side in ("before", "after"):
            assert len(loaded[side]["task_1"]) == 2

    def test_write_summary_md_mentions_delta(self, tmp_path: Path) -> None:
        """The markdown names both tags, both metrics and a positive delta."""
        _, _, before_agg, after_agg = self._make_before_after(tmp_path)
        out = tmp_path / "eval_summary.md"
        _write_eval_summary_md(before_agg, after_agg, "before", "after", out)

        text = out.read_text(encoding="utf-8")
        for needle in ("before", "after", "grader_score", "track_a_score"):
            assert needle in text
        # after > before, so we expect a "+" in the delta column.
        assert any(delta in text for delta in ("+0.400", "+0.4"))

    def test_write_eval_plot_creates_png_or_stub(self, tmp_path: Path) -> None:
        """Either the requested PNG or a same-named ``.txt`` file must exist."""
        _, _, before_agg, after_agg = self._make_before_after(tmp_path)
        out = tmp_path / "eval_plot.png"
        _write_eval_plot(before_agg, after_agg, "before", "after", out)

        # Either the PNG was written (matplotlib installed) or the .txt stub was.
        candidates = (out, out.with_suffix(".txt"))
        assert any(path.exists() for path in candidates)

    def test_write_eval_json_includes_holdout_summary(self, tmp_path: Path) -> None:
        """A ``holdout_summary`` is written under ``"real_world_holdout"`` as-is."""
        before_eps, after_eps, _, _ = self._make_before_after(tmp_path)
        out = tmp_path / "eval_results.json"
        holdout = {"n_ads_total": 15, "n_case_studies": 4}
        _write_eval_json(
            before_eps, after_eps, "before", "after", out, holdout_summary=holdout
        )
        raw = out.read_text(encoding="utf-8")
        assert json.loads(raw)["real_world_holdout"] == holdout


class TestRealWorldHoldoutSummary:
    """Checks on the dict returned by ``summarize_real_world_holdout``."""

    def test_summary_reports_15_ads(self) -> None:
        """15 ads, at least 3 case studies, three named ones, and counts that add up."""
        s = summarize_real_world_holdout()
        assert s["n_ads_total"] == 15
        assert s["n_case_studies"] >= 3
        for case_study in (
            "Ghana DigitSol-style",
            "Benin Digited-style",
            "China-Russia-style hub",
        ):
            assert case_study in s["case_studies"]
        # Per-case-study counts must sum to the reported total.
        per_case_total = sum(s["ads_per_case_study"].values())
        assert per_case_total == s["n_ads_total"]