CounterFeint / data /plausibility_references.py
QuantumTransformer's picture
Upload folder using huggingface_hub
26bf1c9 verified
Raw
History Blame Contribute Delete
9.86 kB
"""
Static reference tables for Track B plausibility audit.
These are deliberately **small, hand-curated lookups** rather than scraped
from the web — they run fully offline at judging time. The tables are
tuned against CounterFeint's R1 synthetic data (see `fraud_patterns.py`,
`advertiser_profiles.py`, `landing_pages.py`) so a realistic R1-generated
fraud ad should *not* trip them, while obviously absurd / gibberish ads
clearly will.
"""
from __future__ import annotations
import re
from typing import Dict, FrozenSet, List, Set
# -----------------------------------------------------------------------------
# Country ↔ TLD plausibility.
#
# Map ISO country codes (plus the common non-ISO alias `UK`) to the set of
# TLDs that are "plausible": the country's own ccTLD plus the gTLDs anyone
# uses. An ad claiming a US advertiser with a `.cn` landing page in a
# fake-crypto category is a classic fraudster signal mismatch.
# -----------------------------------------------------------------------------
# TLDs accepted for every country listed in VALID_COUNTRY_TLD_PAIRS.
_GLOBAL_TLDS: FrozenSet[str] = frozenset(
    ("com", "net", "org", "io", "co", "shop", "store", "xyz", "online", "site")
)
# Country code -> every TLD considered plausible for that country: the
# global set plus the country-specific suffixes listed per row.  `UK` and
# `GB` deliberately carry the same suffixes.
VALID_COUNTRY_TLD_PAIRS: Dict[str, FrozenSet[str]] = {
    country_code: _GLOBAL_TLDS.union(country_suffixes)
    for country_code, country_suffixes in (
        ("US", ("us",)),
        ("UK", ("uk", "co.uk")),
        ("GB", ("uk", "co.uk")),
        ("DE", ("de",)),
        ("FR", ("fr",)),
        ("ES", ("es",)),
        ("IT", ("it",)),
        ("NL", ("nl",)),
        ("CA", ("ca",)),
        ("AU", ("au", "com.au")),
        ("IN", ("in",)),
        ("JP", ("jp",)),
        ("CN", ("cn", "com.cn")),
        ("RU", ("ru",)),
        ("NG", ("ng",)),
        ("BR", ("br", "com.br")),
        ("MX", ("mx",)),
    )
}
# TLDs that should raise suspicion whenever they co-occur with a Western
# advertiser country in a financial / crypto / health category.
_HIGH_RISK_TLDS: FrozenSet[str] = frozenset(
    ("cn", "ru", "tk", "ml", "ga", "cf", "xyz")
)
# -----------------------------------------------------------------------------
# Category ↔ targeting compatibility.
#
# Each category has a list of *token* substrings we expect to appear in
# plausible targeting strings. E.g. weight-loss targeting kids is an
# obvious parameter mismatch. Lookups are lower-cased substring `in`
# checks so any reasonable phrasing matches.
# -----------------------------------------------------------------------------
# Category name -> lower-case substrings, at least one of which is expected
# to appear in a plausible targeting string for that category.  Every list
# is its own object (no sharing between categories with identical tokens).
CATEGORY_TARGETING_COMPATIBILITY: Dict[str, List[str]] = {
    "ecommerce": ["adults", "shoppers", "shopping", "fashion", "home", "kitchen", "beauty", "gift"],
    # "it " keeps its trailing space — presumably so it does not match
    # inside longer words that merely start with "it"; TODO confirm.
    "saas": ["adults", "professionals", "business", "developers", "technology", "it ", "b2b"],
    "local_service": ["local", "homeowners", "neighborhood", "residents", "adults"],
    # "kids " has a trailing space so we don't match "kidsafe".
    "education": ["students", "learners", "adults", "teachers", "parents", "kids "],
    "fitness": ["adults", "fitness", "athletes", "gym", "workout", "health"],
    "fake_giveaway": ["adults", "18+", "sweepstakes", "rewards", "gift"],
    "counterfeit_goods": ["shoppers", "fashion", "adults", "deals"],
    "miracle_cure": ["adults", "health", "wellness", "weight loss", "senior"],
    "advance_fee": ["adults", "finance", "investing", "entrepreneurs"],
    "fake_crypto": ["adults", "crypto", "investing", "finance"],
    "celebrity_endorsement_fraud": ["adults", "fans", "investing", "lifestyle"],
    "clone_brand": ["shoppers", "fashion", "adults", "bargain"],
    "gray_area_supplements": ["adults", "wellness", "fitness", "health"],
    "network_crypto": ["adults", "crypto", "investing", "finance"],
    "network_ecommerce": ["adults", "shoppers", "shopping"],
    "network_fintech": ["adults", "finance", "investing", "business"],
    "network_health": ["adults", "health", "wellness"],
}
# Categories that must *never* be seen targeting minors.
_ADULT_ONLY_CATEGORIES: FrozenSet[str] = frozenset(
    (
        "fake_giveaway",
        "miracle_cure",
        "advance_fee",
        "fake_crypto",
        "celebrity_endorsement_fraud",
        "gray_area_supplements",
        "network_crypto",
        "network_fintech",
    )
)
# Explicit "minor" targeting signals; any of these appearing in the targeting
# string of an adult-only category should trigger a flag.
_MINOR_TARGETING_MARKERS: FrozenSet[str] = frozenset(
    (
        "kids",
        "teens",
        "teenagers",
        "children",
        "minors",
        "under 18",
        "under-18",
        "schoolers",
    )
)
# -----------------------------------------------------------------------------
# Known fraud markers — patterns the R1 environment uses to *signal* fraud to
# the Investigator. A Fraudster copying these verbatim into its ad_copy /
# landing_page_blurb is "grader-hacking" rather than generating plausible
# adversarial surface text.
# -----------------------------------------------------------------------------
# Case-insensitive, word-bounded identifier shapes that count as verbatim
# fraud markers when they appear in generated text.
KNOWN_FRAUD_MARKER_REGEXES: List[re.Pattern[str]] = [
    re.compile(marker_pattern, re.IGNORECASE)
    for marker_pattern in (
        r"\btmpl_\d{3,}\b",
        r"\bpmt_ring_\d{3,}\b",
        r"\bring_[a-z0-9]{3,}\b",
        r"\bcfg_[a-f0-9]{3,}\b",
        r"\badv_[a-z0-9]{3,}_fraud\b",
    )
]
# -----------------------------------------------------------------------------
# Deny-list tokens — obvious "adversarial" / prompt-injection markers and
# control characters. If the Fraudster tries to emit these it has clearly
# abandoned the "pretend to be a real advertiser" pretence.
# -----------------------------------------------------------------------------
# Lower-case substrings that mark text as adversarial / prompt-injection.
# NOTE(review): the section header above also mentions control characters,
# but none are listed here — confirm whether they are checked elsewhere.
DENY_LIST_TOKENS: FrozenSet[str] = frozenset(
    (
        # Pseudo-markup tags
        "<bypass>",
        "<adversarial>",
        "<jailbreak>",
        "<system>",
        "<<</>>",
        "</instructions>",
        # Instruction-override phrases
        "ignore previous",
        "ignore all previous",
        "disregard instructions",
        "disregard previous",
    )
)
# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------
# Whole dotted host names: one or more labels, each followed by a dot, then
# an alphabetic final label of 2-6 characters.  Matching the complete host
# (rather than a single "label.suffix" pair) keeps sub-domain labels such as
# the "amazon" in "www.amazon.com" from being reported as the TLD.
_HOSTNAME_RX = re.compile(
    r"\b(?:[a-z0-9][a-z0-9\-]*\.)+[a-z]{2,6}\b",
    re.IGNORECASE,
)

# Second-level labels that, when followed by a two-letter final label, are
# reported together with it as a compound suffix ("co.uk", "com.au", ...).
_COMPOUND_SECOND_LEVEL_LABELS: FrozenSet[str] = frozenset(
    {"co", "com", "net", "org", "gov", "edu", "ac"}
)


def extract_tlds_from_text(text: str) -> Set[str]:
    """Return the lower-cased TLDs of every host name found in free text.

    The suffix of a host is its final label (``"www.google.de"`` -> ``"de"``).
    When the host has at least three labels, the final label is two letters
    long and the label before it is a common second-level registry label
    (``co``, ``com``, ``net``, ``org``, ``gov``, ``edu``, ``ac``), the two
    are reported together (``"shop.example.co.uk"`` -> ``"co.uk"``).

    Args:
        text: Arbitrary text that may contain domain names or URLs.  Empty
            or ``None`` yields an empty set.

    Returns:
        A new set of suffix strings without a leading dot.
    """
    if not text:
        return set()
    found: Set[str] = set()
    for host in _HOSTNAME_RX.findall(text):
        labels = host.lower().split(".")
        suffix = labels[-1]
        # Require >= 3 labels so a bare "co.uk" is still reported as "uk",
        # and a registrable name like "com.de" is not mistaken for a suffix.
        if (
            len(labels) >= 3
            and len(suffix) == 2
            and labels[-2] in _COMPOUND_SECOND_LEVEL_LABELS
        ):
            suffix = f"{labels[-2]}.{suffix}"
        found.add(suffix)
    return found
def is_tld_plausible_for_country(country: str, tld: str) -> bool:
    """Tell whether *tld* is an accepted suffix for *country*.

    The country is upper-cased and the TLD lower-cased with leading dots
    removed before the lookup in ``VALID_COUNTRY_TLD_PAIRS``.  The check is
    permissive: an empty TLD or a country missing from the table returns
    ``True`` because there is nothing to flag.
    """
    country_key = (country or "").upper()
    suffix = (tld or "").lower().lstrip(".")
    if suffix == "":
        return True
    accepted = VALID_COUNTRY_TLD_PAIRS.get(country_key)
    # Unknown country: no basis for flagging.
    return True if accepted is None else suffix in accepted
def is_high_risk_tld(tld: str) -> bool:
    """Tell whether *tld* belongs to the high-risk TLD table.

    The input is lower-cased and stripped of leading dots.  A compound
    suffix such as ``"com.cn"`` is judged by its final label as well, so it
    is treated like ``"cn"``; ``_HIGH_RISK_TLDS`` holds single labels only,
    so a compound suffix could otherwise never match.

    Args:
        tld: A suffix with or without a leading dot; empty / ``None`` is
            never high-risk.

    Returns:
        ``True`` if the suffix itself or its final label is in
        ``_HIGH_RISK_TLDS``.
    """
    normalized = (tld or "").lower().lstrip(".")
    if normalized in _HIGH_RISK_TLDS:
        return True
    # rpartition leaves a dot-free string unchanged in the last slot, so a
    # plain "com" is simply re-checked and stays False.
    final_label = normalized.rpartition(".")[2]
    return final_label in _HIGH_RISK_TLDS
def is_adult_only_category(category: str) -> bool:
    """Tell whether *category* (case-insensitive) is in ``_ADULT_ONLY_CATEGORIES``.

    An empty or ``None`` category is treated as the empty string.
    """
    normalized = category.lower() if category else ""
    return normalized in _ADULT_ONLY_CATEGORIES
def targeting_mentions_minors(targeting: str) -> bool:
    """Tell whether the targeting string contains any minor-audience marker.

    Matching is a case-insensitive substring test against every entry of
    ``_MINOR_TARGETING_MARKERS``; empty / ``None`` targeting returns ``False``.
    """
    if not targeting:
        return False
    haystack = targeting.lower()
    for marker in _MINOR_TARGETING_MARKERS:
        if marker in haystack:
            return True
    return False
def targeting_matches_category(category: str, targeting: str) -> bool:
    """Soft compatibility check between an ad category and its targeting.

    Returns ``True`` if the lower-cased targeting string contains at least
    one keyword configured for the category, or if the category has no
    configured expectations (unknown category -> don't flag).  Returns
    ``False`` for a known category with empty / ``None`` targeting.

    Keywords written with trailing whitespace (e.g. ``"kids "``, ``"it "``)
    mean "this word, not the start of a longer one".  Besides the literal
    substring, they also match when the word is the last thing in the
    string or is followed by punctuation (``"parents, kids"``,
    ``"kids, teens"``), which a plain substring test would miss.
    """
    expected = CATEGORY_TARGETING_COMPATIBILITY.get((category or "").lower())
    if expected is None:
        return True
    if not targeting:
        return False
    lowered = targeting.lower()
    for tok in expected:
        if tok in lowered:
            return True
        stem = tok.rstrip()
        # Only tokens that carried trailing whitespace get the extra check:
        # stem not followed by a word character (end of string, comma, ...).
        if stem and stem != tok and re.search(re.escape(stem) + r"(?!\w)", lowered):
            return True
    return False
def contains_fraud_marker(text: str) -> bool:
    """Tell whether any pattern in ``KNOWN_FRAUD_MARKER_REGEXES`` occurs in *text*.

    Empty / ``None`` text returns ``False``.
    """
    if not text:
        return False
    for pattern in KNOWN_FRAUD_MARKER_REGEXES:
        if pattern.search(text) is not None:
            return True
    return False
def contains_deny_token(text: str) -> bool:
    """Tell whether *text* contains any ``DENY_LIST_TOKENS`` entry.

    The comparison is a substring test on the lower-cased text; empty /
    ``None`` text returns ``False``.
    """
    if not text:
        return False
    haystack = text.lower()
    for token in DENY_LIST_TOKENS:
        if token in haystack:
            return True
    return False
# Names exported by `from <module> import *`; the `_`-prefixed tables are
# intentionally left out.
__all__ = [
    # Reference tables
    "CATEGORY_TARGETING_COMPATIBILITY", "DENY_LIST_TOKENS",
    "KNOWN_FRAUD_MARKER_REGEXES", "VALID_COUNTRY_TLD_PAIRS",
    # Helper predicates / extractors
    "contains_deny_token", "contains_fraud_marker",
    "extract_tlds_from_text", "is_adult_only_category",
    "is_high_risk_tld", "is_tld_plausible_for_country",
    "targeting_matches_category", "targeting_mentions_minors",
]