Spaces:
Sleeping
Sleeping
| """ | |
| Read-only loader for the Meta-CIB-modeled holdout dataset. | |
| This module is the **only** sanctioned import surface for | |
| ``counterfeint/data/real_world_test_set.json``. It exists to enforce the | |
| core constraint of the eval lane: | |
| The synthetic-but-CIB-grounded ads in real_world_test_set.json are a | |
| HOLDOUT set. They MUST NEVER be used in training rollouts. | |
| To keep that boundary visible at the import level, the loader functions | |
| take an explicit ``confirm_eval_only=True`` argument. Any caller passing | |
| ``False`` (or omitting it) gets a :class:`HoldoutAccessError` (a | |
| :class:`PermissionError` subclass). Training | |
| code paths simply never call this module, so there is no realistic way | |
| to leak holdout data into the training distribution by accident. | |
| Returned ads conform to the existing :class:`counterfeint.data.ad_generator.Ad` | |
| dataclass shape, so the eval lane can drop them straight into the | |
| existing observation builder. | |
| Cross-references | |
| ---------------- | |
| * Per-ad fields ``case_study_source``, ``provenance_quarter``, and | |
| ``ring_membership`` mirror the shape used by | |
| :class:`counterfeint.data.network_generator.FraudRing` and align with | |
| the three CIB topologies named in | |
| :data:`counterfeint.data.network_generator.RING_CASE_STUDIES` | |
| (Ghana DigitSol, Benin Digited, China-Russia hub). | |
| * The :func:`count_by_ring` summary helper feeds the README "Evaluated | |
| against Meta-CIB-modeled ads" section. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from dataclasses import dataclass | |
| from functools import lru_cache | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional | |
| try: | |
| from .ad_generator import Ad | |
| except ImportError: # pragma: no cover - script-level fallback | |
| from counterfeint.data.ad_generator import Ad # type: ignore[no-redef] | |
| # Absolute path of the holdout JSON, resolved next to this module's own file. | |
| HOLDOUT_PATH: Path = Path(__file__).resolve().parent / "real_world_test_set.json" | |
| # --------------------------------------------------------------------------- | |
| # Extended Ad row carrying provenance | |
| # --------------------------------------------------------------------------- | |
@dataclass
class HoldoutAd:
    """Pair an :class:`Ad` with the CIB provenance fields the eval lane uses.

    The wrapped :class:`Ad` is exposed via :attr:`ad` so existing observation
    builders can consume it without changes; the provenance fields live on
    this wrapper rather than on the ``Ad`` itself.

    The class is a dataclass so that it gets the keyword ``__init__`` that
    ``_coerce_ad`` calls (``HoldoutAd(ad=..., case_study_source=..., ...)``),
    plus a generated ``__repr__`` and ``__eq__``.
    """

    # The underlying ad row, in the shape the observation builder expects.
    ad: Ad
    # Label of the CIB case study this ad is modeled on ("" when not provided).
    case_study_source: str
    # Provenance quarter string as stored in the JSON ("" when not provided).
    provenance_quarter: str
    # Ring identifier as stored in the JSON, or None when the ad has none.
    ring_membership: Optional[str]
    # Signal-name -> value mapping as stored in the JSON (may be empty).
    shared_signals: Dict[str, str]

    def to_dict(self) -> Dict[str, Any]:
        """Flatten the wrapped ad and the provenance fields into one plain dict.

        ``initial_risk_signals`` and ``shared_signals`` are copied, so
        mutating the returned dict does not touch this instance.
        """
        ad = self.ad
        return {
            "ad_id": ad.ad_id,
            "ad_copy": ad.ad_copy,
            "category": ad.category,
            "targeting_summary": ad.targeting_summary,
            "initial_risk_signals": list(ad.initial_risk_signals),
            "ground_truth_label": ad.ground_truth_label,
            "fraud_type": ad.fraud_type,
            "severity": ad.severity,
            "difficulty": ad.difficulty,
            "case_study_source": self.case_study_source,
            "provenance_quarter": self.provenance_quarter,
            "ring_membership": self.ring_membership,
            "shared_signals": dict(self.shared_signals),
        }
| # --------------------------------------------------------------------------- | |
| # Public loader API | |
| # --------------------------------------------------------------------------- | |
class HoldoutAccessError(PermissionError):
    """Signal that the holdout dataset was requested without the eval-only opt-in.

    Subclasses :class:`PermissionError`, so handlers written against the
    built-in exception catch it as well.
    """
| def _read_raw(path_str: str) -> Dict[str, Any]: | |
| """Cached JSON read so the test suite doesn't re-parse on every call.""" | |
| return json.loads(Path(path_str).read_text(encoding="utf-8")) | |
def _coerce_ad(raw: Dict[str, Any]) -> HoldoutAd:
    """Build a :class:`HoldoutAd` from one raw JSON ad record.

    ``ad_id``, ``ad_copy``, ``category``, ``targeting_summary`` and
    ``ground_truth_label`` are required; a missing key raises ``KeyError``.
    The remaining fields are optional: absent or falsy values fall back to
    an empty default (``0.0`` for severity, ``"medium"`` for difficulty).
    ``ring_membership`` is passed through untouched, so it is ``None`` when
    the record does not carry it.
    """
    # Keys are evaluated in declaration order, so the first missing required
    # key is the one reported.
    ad_fields = {
        "ad_id": str(raw["ad_id"]),
        "ad_copy": str(raw["ad_copy"]),
        "category": str(raw["category"]),
        "targeting_summary": str(raw["targeting_summary"]),
        "initial_risk_signals": list(raw.get("initial_risk_signals") or []),
        "ground_truth_label": str(raw["ground_truth_label"]),
        "fraud_type": str(raw.get("fraud_type") or ""),
        "severity": float(raw.get("severity") or 0.0),
        "difficulty": str(raw.get("difficulty") or "medium"),
    }
    wrapped = Ad(**ad_fields)

    source = str(raw.get("case_study_source") or "")
    quarter = str(raw.get("provenance_quarter") or "")
    ring = raw.get("ring_membership")
    signals = dict(raw.get("shared_signals") or {})
    return HoldoutAd(
        ad=wrapped,
        case_study_source=source,
        provenance_quarter=quarter,
        ring_membership=ring,
        shared_signals=signals,
    )
def load_real_world_holdout(
    *,
    confirm_eval_only: bool = False,
    path: Path = HOLDOUT_PATH,
) -> List[HoldoutAd]:
    """Return every ad in the Meta-CIB-modeled holdout set.

    Parameters
    ----------
    confirm_eval_only
        Opt-in flag every caller has to set to the literal ``True``. It
        declares that the loaded ads go to the eval lane and not into a
        training rollout. Truthy stand-ins such as ``1`` are rejected.
    path
        Location of the holdout JSON; overridden only by tests.

    Returns
    -------
    list of HoldoutAd
        One entry per record under the top-level ``"ads"`` key, in file
        order (empty when the key is absent).

    Raises
    ------
    HoldoutAccessError
        If ``confirm_eval_only`` is not explicitly ``True``.
    """
    if confirm_eval_only is True:
        payload = _read_raw(str(path))
        records = payload.get("ads", [])
        return list(map(_coerce_ad, records))

    # Anything other than the literal True counts as a missing confirmation.
    raise HoldoutAccessError(
        "real_world_test_set.json is HOLDOUT data. Pass "
        "`confirm_eval_only=True` to acknowledge that the loaded ads "
        "will not be used in training rollouts (see eval_suite.py)."
    )
def load_for_ring(
    case_study_source: str,
    *,
    confirm_eval_only: bool = False,
    path: Path = HOLDOUT_PATH,
) -> List[HoldoutAd]:
    """Filter the holdout to a single CIB case study.

    Useful for the demo when a judge asks "show me the China-Russia
    examples" — pass ``"China-Russia-style hub"``.

    Parameters
    ----------
    case_study_source
        Case study label to keep; compared for exact equality with each
        ad's ``case_study_source``.
    confirm_eval_only
        Forwarded to :func:`load_real_world_holdout`; must be ``True``.
    path
        Location of the holdout JSON, forwarded to
        :func:`load_real_world_holdout`. Defaults to the bundled file, so
        existing callers are unaffected; lets tests point at a fixture.

    Raises
    ------
    HoldoutAccessError
        If ``confirm_eval_only`` is not explicitly ``True``.
    """
    holdout = load_real_world_holdout(
        confirm_eval_only=confirm_eval_only,
        path=path,
    )
    return [h for h in holdout if h.case_study_source == case_study_source]
| # --------------------------------------------------------------------------- | |
| # Summary helpers (no opt-in — they only report counts) | |
| # --------------------------------------------------------------------------- | |
def count_by_ring(path: Path = HOLDOUT_PATH) -> Dict[str, int]:
    """Return ``{case_study_source: count}`` so the README/UI can render summaries.

    Counts are derived from the on-disk JSON without producing any
    actual ad text, so this helper needs no eval-only confirmation.

    Records whose ``case_study_source`` is missing, ``null`` or empty are
    all tallied under ``"unknown"``, so every key of the result is a
    non-empty string. Keys appear in order of first occurrence in the file.
    """
    raw = _read_raw(str(path))
    out: Dict[str, int] = {}
    for entry in raw.get("ads", []):
        # `or` (not a .get default) so an explicit JSON null or "" also
        # lands in the "unknown" bucket instead of becoming a None/"" key.
        key = entry.get("case_study_source") or "unknown"
        out[key] = out.get(key, 0) + 1
    return out
def list_case_studies(path: Path = HOLDOUT_PATH) -> List[str]:
    """List the distinct case study labels in the holdout, in first-seen order.

    Records with a missing or empty ``case_study_source`` are skipped.
    """
    payload = _read_raw(str(path))
    labels: List[str] = []
    for record in payload.get("ads", []):
        name = record.get("case_study_source", "")
        # Skip blanks and anything already collected.
        if not name or name in labels:
            continue
        labels.append(name)
    return labels