"""
Read-only loader for the Meta-CIB-modeled holdout dataset.
This module is the **only** sanctioned import surface for
``counterfeint/data/real_world_test_set.json``. It exists to enforce the
core constraint of the eval lane:
The synthetic-but-CIB-grounded ads in real_world_test_set.json are a
HOLDOUT set. They MUST NEVER be used in training rollouts.
To keep that boundary visible at the import level, the loader functions
take an explicit ``confirm_eval_only=True`` argument. Any caller passing
``False`` (or omitting it) gets a :class:`HoldoutAccessError`, a
:class:`PermissionError` subclass. Training
code paths simply never call this module, so there is no realistic way
to leak holdout data into the training distribution by accident.
Returned ads conform to the existing :class:`counterfeint.data.ad_generator.Ad`
dataclass shape, so the eval lane can drop them straight into the
existing observation builder.
Cross-references
----------------
* Per-ad fields ``case_study_source``, ``provenance_quarter``, and
``ring_membership`` mirror the shape used by
:class:`counterfeint.data.network_generator.FraudRing` and align with
the three CIB topologies named in
:data:`counterfeint.data.network_generator.RING_CASE_STUDIES`
(Ghana DigitSol, Benin Digited, China-Russia hub).
* The :func:`count_by_ring` summary helper feeds the README "Evaluated
against Meta-CIB-modeled ads" section.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional
try:
from .ad_generator import Ad
except ImportError: # pragma: no cover - script-level fallback
from counterfeint.data.ad_generator import Ad # type: ignore[no-redef]
# Default location of the holdout JSON: a file named
# ``real_world_test_set.json`` in the same directory as this module
# (resolved to an absolute path at import time).
HOLDOUT_PATH: Path = Path(__file__).resolve().parent / "real_world_test_set.json"
# ---------------------------------------------------------------------------
# Extended Ad row carrying provenance
# ---------------------------------------------------------------------------
@dataclass
class HoldoutAd:
    """An :class:`Ad` paired with the CIB provenance metadata of its holdout row.

    The wrapped :class:`Ad` is available unchanged on :attr:`ad`, so code
    that already consumes ``Ad`` objects can keep doing so. The provenance
    values are stored on this wrapper rather than on the ad itself.
    """

    ad: Ad
    case_study_source: str
    provenance_quarter: str
    ring_membership: Optional[str]
    shared_signals: Dict[str, str]

    def to_dict(self) -> Dict[str, Any]:
        """Flatten the wrapped ad and the provenance fields into one new dict.

        ``initial_risk_signals`` and ``shared_signals`` are shallow-copied,
        so mutating the returned containers does not touch this instance.
        """
        wrapped = self.ad
        flat: Dict[str, Any] = {}

        # Fields read from the wrapped Ad come first.
        flat["ad_id"] = wrapped.ad_id
        flat["ad_copy"] = wrapped.ad_copy
        flat["category"] = wrapped.category
        flat["targeting_summary"] = wrapped.targeting_summary
        flat["initial_risk_signals"] = list(wrapped.initial_risk_signals)
        flat["ground_truth_label"] = wrapped.ground_truth_label
        flat["fraud_type"] = wrapped.fraud_type
        flat["severity"] = wrapped.severity
        flat["difficulty"] = wrapped.difficulty

        # Provenance fields stored on the wrapper follow.
        flat["case_study_source"] = self.case_study_source
        flat["provenance_quarter"] = self.provenance_quarter
        flat["ring_membership"] = self.ring_membership
        flat["shared_signals"] = dict(self.shared_signals)
        return flat
# ---------------------------------------------------------------------------
# Public loader API
# ---------------------------------------------------------------------------
class HoldoutAccessError(PermissionError):
    """Signals a holdout-data request made without the eval-only confirmation.

    Subclasses :class:`PermissionError`, so handlers written for that
    built-in exception also catch this one.
    """
@lru_cache(maxsize=1)
def _read_raw(path_str: str) -> Dict[str, Any]:
    """Parse the UTF-8 JSON document stored at ``path_str``.

    Only the most recently requested path is memoised, so repeated calls
    with the same path skip the re-parse. Callers receive the same cached
    object each time and should treat it as read-only.
    """
    with open(path_str, encoding="utf-8") as handle:
        return json.load(handle)
def _coerce_ad(raw: Dict[str, Any]) -> HoldoutAd:
    """Build a :class:`HoldoutAd` from one decoded JSON ad record.

    ``ad_id``, ``ad_copy``, ``category``, ``targeting_summary`` and
    ``ground_truth_label`` are mandatory; a missing one raises
    :class:`KeyError`. Every other field falls back to a default when it
    is absent or falsy. ``ring_membership`` is passed through as-is.
    """

    def optional_text(key: str, fallback: str = "") -> str:
        # Absent and falsy values (None, "", 0, ...) collapse to the fallback.
        return str(raw.get(key) or fallback)

    ad_kwargs: Dict[str, Any] = {
        "ad_id": str(raw["ad_id"]),
        "ad_copy": str(raw["ad_copy"]),
        "category": str(raw["category"]),
        "targeting_summary": str(raw["targeting_summary"]),
        # Copy so the Ad never aliases a list owned by the source record.
        "initial_risk_signals": list(raw.get("initial_risk_signals") or []),
        "ground_truth_label": str(raw["ground_truth_label"]),
        "fraud_type": optional_text("fraud_type"),
        "severity": float(raw.get("severity") or 0.0),
        "difficulty": optional_text("difficulty", "medium"),
    }
    wrapped = Ad(**ad_kwargs)

    return HoldoutAd(
        ad=wrapped,
        case_study_source=optional_text("case_study_source"),
        provenance_quarter=optional_text("provenance_quarter"),
        ring_membership=raw.get("ring_membership"),
        shared_signals=dict(raw.get("shared_signals") or {}),
    )
def load_real_world_holdout(
    *,
    confirm_eval_only: bool = False,
    path: Path = HOLDOUT_PATH,
) -> List[HoldoutAd]:
    """Return every ad in the Meta-CIB-modeled holdout set.

    Parameters
    ----------
    confirm_eval_only
        Explicit opt-in that the caller is the eval lane and not a
        training rollout. Only the literal ``True`` is accepted; other
        truthy values are rejected as well.
    path
        Location of the JSON file. Defaults to :data:`HOLDOUT_PATH`;
        overridden only by tests.

    Returns
    -------
    list of HoldoutAd
        One entry per element of the document's ``"ads"`` array, in file
        order. Empty when the document has no ``"ads"`` key.

    Raises
    ------
    HoldoutAccessError
        If ``confirm_eval_only`` is not explicitly ``True``.
    """
    if confirm_eval_only is True:
        document = _read_raw(str(path))
        return list(map(_coerce_ad, document.get("ads", [])))

    # Identity check above: anything other than the literal True is refused.
    raise HoldoutAccessError(
        "real_world_test_set.json is HOLDOUT data. Pass "
        "`confirm_eval_only=True` to acknowledge that the loaded ads "
        "will not be used in training rollouts (see eval_suite.py)."
    )
def load_for_ring(
    case_study_source: str,
    *,
    confirm_eval_only: bool = False,
    path: Path = HOLDOUT_PATH,
) -> List[HoldoutAd]:
    """Filter the holdout to a single CIB case study.

    Useful for the demo when a judge asks "show me the China-Russia
    examples" — pass ``"China-Russia-style hub"``.

    Parameters
    ----------
    case_study_source
        Label to match. The comparison is an exact, case-sensitive string
        equality against each ad's ``case_study_source``.
    confirm_eval_only
        Forwarded unchanged to :func:`load_real_world_holdout`, which
        requires it to be the literal ``True``.
    path
        Location of the JSON file. Defaults to :data:`HOLDOUT_PATH`;
        accepted here so tests can target a fixture file, as the other
        helpers in this module already allow.

    Returns
    -------
    list of HoldoutAd
        The matching ads in file order; empty when nothing matches.

    Raises
    ------
    HoldoutAccessError
        If ``confirm_eval_only`` is not explicitly ``True``.
    """
    # The access check lives in load_real_world_holdout; delegating keeps a
    # single enforcement point for the eval-only rule.
    holdout = load_real_world_holdout(
        confirm_eval_only=confirm_eval_only,
        path=path,
    )
    return [h for h in holdout if h.case_study_source == case_study_source]
# ---------------------------------------------------------------------------
# Summary helpers (no opt-in — they only report counts)
# ---------------------------------------------------------------------------
def count_by_ring(path: Path = HOLDOUT_PATH) -> Dict[str, int]:
    """Return ``{case_study_source: count}`` so the README/UI can render summaries.

    Only the ``case_study_source`` value of each entry is read, so no ad
    text appears in the result and no eval-only opt-in is required.

    Parameters
    ----------
    path
        Location of the JSON file. Defaults to :data:`HOLDOUT_PATH`.

    Returns
    -------
    dict
        Maps each case study label to the number of ads carrying it, in
        first-seen order. Entries whose label is missing, ``null`` or
        empty are counted under ``"unknown"``.
    """
    raw = _read_raw(str(path))
    out: Dict[str, int] = {}
    for entry in raw.get("ads", []):
        # `or` (rather than a .get default) also catches JSON null and "",
        # which would otherwise become a None / empty-string key and break
        # the Dict[str, int] contract.
        key = str(entry.get("case_study_source") or "unknown")
        out[key] = out.get(key, 0) + 1
    return out
def list_case_studies(path: Path = HOLDOUT_PATH) -> List[str]:
    """List each distinct case study label once, in first-appearance order.

    Entries whose ``case_study_source`` is missing or falsy are skipped.
    """
    document = _read_raw(str(path))
    labels: List[str] = []
    for record in document.get("ads", []):
        candidate = record.get("case_study_source", "")
        # Skip blanks and anything already collected.
        if not candidate or candidate in labels:
            continue
        labels.append(candidate)
    return labels
|