File size: 7,392 Bytes
26bf1c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""

Read-only loader for the Meta-CIB-modeled holdout dataset.



This module is the **only** sanctioned import surface for

``counterfeint/data/real_world_test_set.json``. It exists to enforce the

core constraint of the eval lane:



    The synthetic-but-CIB-grounded ads in real_world_test_set.json are a

    HOLDOUT set. They MUST NEVER be used in training rollouts.



To keep that boundary visible at the import level, the loader functions

take an explicit ``confirm_eval_only=True`` argument. Any caller passing

``False`` (or omitting it) gets a :class:`HoldoutAccessError` (a

:class:`PermissionError` subclass). Training

code paths simply never call this module, so there is no realistic way

to leak holdout data into the training distribution by accident.



Returned ads conform to the existing :class:`counterfeint.data.ad_generator.Ad`

dataclass shape, so the eval lane can drop them straight into the

existing observation builder.



Cross-references

----------------



* Per-ad fields ``case_study_source``, ``provenance_quarter``, and

  ``ring_membership`` mirror the shape used by

  :class:`counterfeint.data.network_generator.FraudRing` and align with

  the three CIB topologies named in

  :data:`counterfeint.data.network_generator.RING_CASE_STUDIES`

  (Ghana DigitSol, Benin Digited, China-Russia hub).

* The :func:`count_by_ring` summary helper feeds the README "Evaluated

  against Meta-CIB-modeled ads" section.

"""

from __future__ import annotations

import json
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional

try:
    from .ad_generator import Ad
except ImportError:  # pragma: no cover - script-level fallback
    from counterfeint.data.ad_generator import Ad  # type: ignore[no-redef]


# Absolute path of the holdout JSON, which sits in the same directory as this module.
HOLDOUT_PATH: Path = Path(__file__).resolve().with_name("real_world_test_set.json")


# ---------------------------------------------------------------------------
# Extended Ad row carrying provenance
# ---------------------------------------------------------------------------


@dataclass
class HoldoutAd:
    """Pair an :class:`Ad` with the CIB provenance metadata used by the eval lane.

    The underlying :class:`Ad` stays reachable through :attr:`ad`, so
    observation builders that already accept an ``Ad`` can keep consuming
    it unchanged. The provenance fields live only on this wrapper, which
    is intended to keep them out of the procedural generator's state.
    """

    ad: Ad
    case_study_source: str
    provenance_quarter: str
    ring_membership: Optional[str]
    shared_signals: Dict[str, str]

    def to_dict(self) -> Dict[str, Any]:
        """Flatten the wrapped ad and its provenance into one plain dict.

        ``initial_risk_signals`` and ``shared_signals`` are copied, so the
        caller can mutate the result without touching this instance.
        """
        wrapped = self.ad
        # Ad-level fields first, in the same order the loader reads them.
        flat: Dict[str, Any] = dict(
            ad_id=wrapped.ad_id,
            ad_copy=wrapped.ad_copy,
            category=wrapped.category,
            targeting_summary=wrapped.targeting_summary,
            initial_risk_signals=list(wrapped.initial_risk_signals),
            ground_truth_label=wrapped.ground_truth_label,
            fraud_type=wrapped.fraud_type,
            severity=wrapped.severity,
            difficulty=wrapped.difficulty,
        )
        # Provenance fields carried by the wrapper itself.
        flat.update(
            case_study_source=self.case_study_source,
            provenance_quarter=self.provenance_quarter,
            ring_membership=self.ring_membership,
            shared_signals=dict(self.shared_signals),
        )
        return flat


# ---------------------------------------------------------------------------
# Public loader API
# ---------------------------------------------------------------------------


class HoldoutAccessError(PermissionError):
    """Raised when the holdout dataset is requested without an eval-only confirmation.

    Subclasses :class:`PermissionError`, so callers that already catch
    ``PermissionError`` also catch this error.
    """


@lru_cache(maxsize=1)
def _read_raw(path_str: str) -> Dict[str, Any]:
    """Cached JSON read so the test suite doesn't re-parse on every call."""
    return json.loads(Path(path_str).read_text(encoding="utf-8"))


def _coerce_ad(raw: Dict[str, Any]) -> HoldoutAd:
    """Build a :class:`HoldoutAd` from one decoded JSON ad record.

    ``ad_id``, ``ad_copy``, ``category``, ``targeting_summary`` and
    ``ground_truth_label`` are required; a missing key raises
    :class:`KeyError`. The remaining fields are optional: missing or
    falsy values fall back to an empty container/string, a severity of
    ``0.0``, or a difficulty of ``"medium"``. ``ring_membership`` is
    passed through without coercion.
    """
    # Dict literals evaluate in order, so fields are read in the same
    # sequence as the ``Ad`` constructor arguments.
    ad_fields: Dict[str, Any] = {
        "ad_id": str(raw["ad_id"]),
        "ad_copy": str(raw["ad_copy"]),
        "category": str(raw["category"]),
        "targeting_summary": str(raw["targeting_summary"]),
        "initial_risk_signals": list(raw.get("initial_risk_signals") or []),
        "ground_truth_label": str(raw["ground_truth_label"]),
        "fraud_type": str(raw.get("fraud_type") or ""),
        "severity": float(raw.get("severity") or 0.0),
        "difficulty": str(raw.get("difficulty") or "medium"),
    }
    wrapped = Ad(**ad_fields)

    # Provenance is read only after the ad itself has been built.
    source_label = str(raw.get("case_study_source") or "")
    quarter = str(raw.get("provenance_quarter") or "")
    ring = raw.get("ring_membership")
    signals = dict(raw.get("shared_signals") or {})
    return HoldoutAd(
        ad=wrapped,
        case_study_source=source_label,
        provenance_quarter=quarter,
        ring_membership=ring,
        shared_signals=signals,
    )


def load_real_world_holdout(
    *,
    confirm_eval_only: bool = False,
    path: Path = HOLDOUT_PATH,
) -> List[HoldoutAd]:
    """Return every ad in the Meta-CIB-modeled holdout set.

    Parameters
    ----------
    confirm_eval_only
        Explicit opt-in that every caller must set to ``True``,
        declaring that the loaded ads are headed for the eval lane and
        not for a training rollout.
    path
        Alternative JSON location (used only by tests).

    Raises
    ------
    HoldoutAccessError
        If ``confirm_eval_only`` is anything other than the literal
        ``True``.
    """
    if confirm_eval_only is True:
        payload = _read_raw(str(path))
        holdout: List[HoldoutAd] = []
        for record in payload.get("ads", []):
            holdout.append(_coerce_ad(record))
        return holdout

    # Identity check above is deliberate: truthy stand-ins such as ``1``
    # do not count as an acknowledgement.
    raise HoldoutAccessError(
        "real_world_test_set.json is HOLDOUT data. Pass "
        "`confirm_eval_only=True` to acknowledge that the loaded ads "
        "will not be used in training rollouts (see eval_suite.py)."
    )


def load_for_ring(
    case_study_source: str,
    *,
    confirm_eval_only: bool = False,
    path: Optional[Path] = None,
) -> List[HoldoutAd]:
    """Filter the holdout to a single CIB case study.

    Useful for the demo when a judge asks "show me the China-Russia
    examples" — pass ``"China-Russia-style hub"``.

    Parameters
    ----------
    case_study_source
        Label compared for exact equality against each ad's
        ``case_study_source``.
    confirm_eval_only
        Forwarded to :func:`load_real_world_holdout`; must be ``True``.
    path
        Optional JSON path override, forwarded to
        :func:`load_real_world_holdout`. When ``None`` (the default) the
        loader's own default path is used, so existing callers are
        unaffected.

    Raises
    ------
    HoldoutAccessError
        If ``confirm_eval_only`` is not explicitly ``True``.
    """
    loader_kwargs: Dict[str, Any] = {"confirm_eval_only": confirm_eval_only}
    # Only forward ``path`` when the caller supplied one, so the
    # underlying loader keeps control of its default location.
    if path is not None:
        loader_kwargs["path"] = path
    return [
        h
        for h in load_real_world_holdout(**loader_kwargs)
        if h.case_study_source == case_study_source
    ]


# ---------------------------------------------------------------------------
# Summary helpers (no opt-in — they only report counts)
# ---------------------------------------------------------------------------


def count_by_ring(path: Path = HOLDOUT_PATH) -> Dict[str, int]:
    """Return ``{case_study_source: count}`` so the README/UI can render summaries.

    Counts are derived from the on-disk JSON without producing any
    actual ad text, so this helper is safe to call from any context
    (including training-time logging).

    Ads whose ``case_study_source`` is missing, ``null`` or empty are
    all counted under ``"unknown"``, so every key in the result is a
    non-empty string. Keys appear in first-seen order.

    Parameters
    ----------
    path
        Override the JSON path.
    """
    raw = _read_raw(str(path))
    out: Dict[str, int] = {}
    for entry in raw.get("ads", []):
        # ``dict.get(key, default)`` only falls back when the key is
        # absent; ``or`` also covers an explicit null/empty label, which
        # would otherwise become a ``None`` or ``""`` bucket.
        key = entry.get("case_study_source") or "unknown"
        out[key] = out.get(key, 0) + 1
    return out


def list_case_studies(path: Path = HOLDOUT_PATH) -> List[str]:
    """Return each distinct, non-empty case study label in first-seen order."""
    ordered: List[str] = []
    for record in _read_raw(str(path)).get("ads", []):
        label = record.get("case_study_source", "")
        # Skip blank labels and ones already collected.
        if not label or label in ordered:
            continue
        ordered.append(label)
    return ordered