# Hugging Face Space: QuantumTransformer / CounterFeint (status: Sleeping)
# File: CounterFeint / data / plausibility_references.py (9.86 kB)
# Last commit by QuantumTransformer: "Upload folder using huggingface_hub"
# (26bf1c9, verified, 2 months ago)

	"""
	Static reference tables for Track B plausibility audit.

	These are deliberately small, hand-curated lookups rather than scraped
	from the web — they run fully offline at judging time. The tables are
	tuned against CounterFeint's R1 synthetic data (see `fraud_patterns.py`,
	`advertiser_profiles.py`, `landing_pages.py`) so a realistic R1-generated
	fraud ad should not trip them, while obviously absurd / gibberish ads
	clearly will.
	"""

	from __future__ import annotations

	import re
	from typing import Dict, FrozenSet, List, Set

	# -----------------------------------------------------------------------------
	# Country ↔ TLD plausibility.
	#
	# Map ISO country codes to the set of TLDs that are "plausible" (a common
	# ccTLD plus the gTLDs anyone uses). An ad claiming a US advertiser with a
	# `.cn` landing page in a fake-crypto category is a classic fraudster
	# signal mismatch.
	# -----------------------------------------------------------------------------

	# Generic TLDs accepted for every country listed in VALID_COUNTRY_TLD_PAIRS.
	_GLOBAL_TLDS: FrozenSet[str] = frozenset(
	    {
	        "com",
	        "net",
	        "org",
	        "io",
	        "co",
	        "shop",
	        "store",
	        "xyz",
	        "online",
	        "site",
	    }
	)

	# Country code -> TLD suffixes considered plausible for that country: the
	# shared generic set plus the country's own suffix(es). "UK" and "GB" are
	# both present and map to the same suffixes.
	# NOTE: the set union operator must be a bare `|`; an escaped `\|` is a
	# syntax error in Python.
	VALID_COUNTRY_TLD_PAIRS: Dict[str, FrozenSet[str]] = {
	    "US": _GLOBAL_TLDS | frozenset({"us"}),
	    "UK": _GLOBAL_TLDS | frozenset({"uk", "co.uk"}),
	    "GB": _GLOBAL_TLDS | frozenset({"uk", "co.uk"}),
	    "DE": _GLOBAL_TLDS | frozenset({"de"}),
	    "FR": _GLOBAL_TLDS | frozenset({"fr"}),
	    "ES": _GLOBAL_TLDS | frozenset({"es"}),
	    "IT": _GLOBAL_TLDS | frozenset({"it"}),
	    "NL": _GLOBAL_TLDS | frozenset({"nl"}),
	    "CA": _GLOBAL_TLDS | frozenset({"ca"}),
	    "AU": _GLOBAL_TLDS | frozenset({"au", "com.au"}),
	    "IN": _GLOBAL_TLDS | frozenset({"in"}),
	    "JP": _GLOBAL_TLDS | frozenset({"jp"}),
	    "CN": _GLOBAL_TLDS | frozenset({"cn", "com.cn"}),
	    "RU": _GLOBAL_TLDS | frozenset({"ru"}),
	    "NG": _GLOBAL_TLDS | frozenset({"ng"}),
	    "BR": _GLOBAL_TLDS | frozenset({"br", "com.br"}),
	    "MX": _GLOBAL_TLDS | frozenset({"mx"}),
	}

	# Suffixes treated as a warning sign when they appear alongside a Western
	# advertiser country in a financial / crypto / health category.
	_HIGH_RISK_TLDS: FrozenSet[str] = frozenset(
	    (
	        "cn",
	        "ru",
	        "tk",
	        "ml",
	        "ga",
	        "cf",
	        "xyz",
	    )
	)


	# -----------------------------------------------------------------------------
	# Category ↔ targeting compatibility.
	#
	# Each category has a list of token substrings we expect to appear in
	# plausible targeting strings. E.g. weight-loss targeting kids is an
	# obvious parameter mismatch. Lookups are lower-cased substring `in`
	# checks so any reasonable phrasing matches.
	# -----------------------------------------------------------------------------

	# Category name -> lower-case fragments, at least one of which is expected
	# to occur in a plausible targeting string for that category. Fragments with
	# a trailing space are deliberate: they keep the fragment from matching as
	# the prefix of a longer word.
	CATEGORY_TARGETING_COMPATIBILITY: Dict[str, List[str]] = dict(
	    ecommerce=[
	        "adults", "shoppers", "shopping", "fashion",
	        "home", "kitchen", "beauty", "gift",
	    ],
	    saas=[
	        "adults", "professionals", "business", "developers",
	        "technology", "it ", "b2b",
	    ],
	    local_service=["local", "homeowners", "neighborhood", "residents", "adults"],
	    education=[
	        "students", "learners", "adults", "teachers", "parents",
	        "kids ",  # trailing space so that "kidsafe" is not matched
	    ],
	    fitness=["adults", "fitness", "athletes", "gym", "workout", "health"],
	    fake_giveaway=["adults", "18+", "sweepstakes", "rewards", "gift"],
	    counterfeit_goods=["shoppers", "fashion", "adults", "deals"],
	    miracle_cure=["adults", "health", "wellness", "weight loss", "senior"],
	    advance_fee=["adults", "finance", "investing", "entrepreneurs"],
	    fake_crypto=["adults", "crypto", "investing", "finance"],
	    celebrity_endorsement_fraud=["adults", "fans", "investing", "lifestyle"],
	    clone_brand=["shoppers", "fashion", "adults", "bargain"],
	    gray_area_supplements=["adults", "wellness", "fitness", "health"],
	    network_crypto=["adults", "crypto", "investing", "finance"],
	    network_ecommerce=["adults", "shoppers", "shopping"],
	    network_fintech=["adults", "finance", "investing", "business"],
	    network_health=["adults", "health", "wellness"],
	)

	# Categories whose ads must never be aimed at minors.
	_ADULT_ONLY_CATEGORIES: FrozenSet[str] = frozenset(
	    (
	        "fake_giveaway",
	        "miracle_cure",
	        "advance_fee",
	        "fake_crypto",
	        "celebrity_endorsement_fraud",
	        "gray_area_supplements",
	        "network_crypto",
	        "network_fintech",
	    )
	)

	# Lower-case fragments that signal targeting of minors; a hit should raise
	# a flag when the ad's category is adult-only.
	_MINOR_TARGETING_MARKERS: FrozenSet[str] = frozenset(
	    (
	        "kids",
	        "teens",
	        "teenagers",
	        "children",
	        "minors",
	        "under 18",
	        "under-18",
	        "schoolers",
	    )
	)


	# -----------------------------------------------------------------------------
	# Known fraud markers — patterns the R1 environment uses to signal fraud to
	# the Investigator. A Fraudster copying these verbatim into its ad_copy /
	# landing_page_blurb is "grader-hacking" rather than generating plausible
	# adversarial surface text.
	# -----------------------------------------------------------------------------

	# Case-insensitive patterns for the identifier-style fraud markers; each
	# one is anchored on word boundaries at both ends.
	KNOWN_FRAUD_MARKER_REGEXES: List[re.Pattern[str]] = [
	    re.compile(marker_pattern, re.IGNORECASE)
	    for marker_pattern in (
	        r"\btmpl_\d{3,}\b",
	        r"\bpmt_ring_\d{3,}\b",
	        r"\bring_[a-z0-9]{3,}\b",
	        r"\bcfg_[a-f0-9]{3,}\b",
	        r"\badv_[a-z0-9]{3,}_fraud\b",
	    )
	]


	# -----------------------------------------------------------------------------
	# Deny-list tokens — obvious "adversarial" / prompt-injection markers and
	# control characters. If the Fraudster tries to emit these it has clearly
	# abandoned the "pretend to be a real advertiser" pretence.
	# -----------------------------------------------------------------------------

	# Lower-case fragments that are never acceptable in generated text.
	DENY_LIST_TOKENS: FrozenSet[str] = frozenset(
	    (
	        # Pseudo-markup tags
	        "<bypass>",
	        "<adversarial>",
	        "<jailbreak>",
	        "<system>",
	        "<<</>>",
	        "</instructions>",
	        # Instruction-override phrases
	        "ignore previous",
	        "ignore all previous",
	        "disregard instructions",
	        "disregard previous",
	    )
	)


	# -----------------------------------------------------------------------------
	# Helpers
	# -----------------------------------------------------------------------------


	# A complete dotted hostname: two or more labels, e.g. "www.example.co.uk".
	_HOSTNAME_RE = re.compile(
	    r"\b[a-z0-9][a-z0-9\-]*(?:\.[a-z0-9][a-z0-9\-]*)+\b",
	    re.IGNORECASE,
	)

	# Shape of a final label accepted as a TLD: 2-6 ASCII letters.
	_TLD_LABEL_RE = re.compile(r"[a-z]{2,6}")

	# Second-level labels that, directly in front of a two-letter final label,
	# are reported together with it (e.g. "co.uk", "com.au").
	_CC_SECOND_LEVEL_LABELS = frozenset({"co", "com", "net", "org", "gov", "edu", "ac"})


	def extract_tlds_from_text(text: str) -> Set[str]:
	    """Return the lower-cased TLD suffixes of hostnames found in free text.

	    Each dotted hostname is matched as a whole and its suffix is read from
	    the *final* label, so subdomains do not distort the result:
	    ``www.google.com`` yields ``"com"``, not ``"google"``.

	    A two-part suffix such as ``"co.uk"`` is returned only when the
	    hostname has at least three labels, the final label is two letters
	    long, and the label before it is one of ``_CC_SECOND_LEVEL_LABELS``.
	    Hostnames whose final label is not 2-6 letters (e.g. IP addresses) are
	    ignored.

	    Args:
	        text: Arbitrary text; may be empty or ``None``.

	    Returns:
	        A set of suffix strings without a leading dot; empty when nothing
	        matches.
	    """
	    if not text:
	        return set()
	    found: Set[str] = set()
	    for host in _HOSTNAME_RE.findall(text):
	        labels = host.lower().split(".")
	        tld = labels[-1]
	        if not _TLD_LABEL_RE.fullmatch(tld):
	            continue  # numeric or over-long final label: not a TLD
	        if (
	            len(labels) >= 3
	            and len(tld) == 2
	            and labels[-2] in _CC_SECOND_LEVEL_LABELS
	        ):
	            tld = f"{labels[-2]}.{tld}"
	        found.add(tld)
	    return found


	def is_tld_plausible_for_country(country: str, tld: str) -> bool:
	    """Return whether ``tld`` is an acceptable suffix for ``country``.

	    ``country`` is upper-cased; ``tld`` is lower-cased and stripped of
	    leading dots before the lookup. An empty TLD, or a country with no
	    entry in ``VALID_COUNTRY_TLD_PAIRS``, cannot be judged and is reported
	    as plausible.
	    """
	    country_key = (country or "").upper()
	    suffix = (tld or "").lower().lstrip(".")
	    if not suffix:
	        return True
	    permitted = VALID_COUNTRY_TLD_PAIRS.get(country_key)
	    # Unknown country: nothing to compare against, so do not flag.
	    return permitted is None or suffix in permitted


	def is_high_risk_tld(tld: str) -> bool:
	    """Return True when ``tld`` falls under a suffix in ``_HIGH_RISK_TLDS``.

	    The value is lower-cased and stripped of leading dots. Besides an exact
	    match, a multi-label suffix (such as the ``"com.cn"`` values produced by
	    ``extract_tlds_from_text``) is judged by its final label, so it is not
	    missed merely because the set lists only single-label entries.
	    ``None`` or an empty string yields False.
	    """
	    suffix = (tld or "").lower().lstrip(".")
	    if suffix in _HIGH_RISK_TLDS:
	        return True
	    # Fall back to the last dot-separated label ("com.cn" -> "cn").
	    return suffix.rpartition(".")[2] in _HIGH_RISK_TLDS


	def is_adult_only_category(category: str) -> bool:
	    """Return True when the lower-cased category is in ``_ADULT_ONLY_CATEGORIES``.

	    ``None`` or an empty string is treated as an empty category name.
	    """
	    normalized = (category or "").lower()
	    return normalized in _ADULT_ONLY_CATEGORIES


	def targeting_mentions_minors(targeting: str) -> bool:
	    """Return True when the targeting text contains any minor-audience marker.

	    Matching is a case-insensitive substring test against
	    ``_MINOR_TARGETING_MARKERS``; empty or missing text yields False.
	    """
	    if not targeting:
	        return False
	    haystack = targeting.lower()
	    for marker in _MINOR_TARGETING_MARKERS:
	        if marker in haystack:
	            return True
	    return False


	def targeting_matches_category(category: str, targeting: str) -> bool:
	    """Soft compatibility check between an ad category and its targeting.

	    Returns True if the targeting string contains at least one keyword
	    configured for the category in ``CATEGORY_TARGETING_COMPATIBILITY``, or
	    if the category has no configured expectations (unknown category ->
	    don't flag). Returns False for a known category with empty targeting.

	    Keywords are matched case-insensitively as substrings. A keyword written
	    with trailing whitespace (e.g. ``"kids "``) means "this word, not the
	    start of a longer word"; it is matched as the stripped keyword not
	    followed by a word character. That way it also matches at the very end
	    of the string or before punctuation (``"parents and kids"``,
	    ``"kids, parents"``), which a literal trailing-space test would miss.
	    """
	    expected = CATEGORY_TARGETING_COMPATIBILITY.get((category or "").lower())
	    if expected is None:
	        return True
	    if not targeting:
	        return False
	    lowered = targeting.lower()
	    for tok in expected:
	        stem = tok.rstrip()
	        if stem and stem != tok:
	            # Trailing-space keyword: word must end here, by any delimiter.
	            if re.search(re.escape(stem) + r"(?!\w)", lowered):
	                return True
	        elif tok in lowered:
	            return True
	    return False


	def contains_fraud_marker(text: str) -> bool:
	    """Return True when any pattern in ``KNOWN_FRAUD_MARKER_REGEXES`` matches.

	    Empty or missing text yields False.
	    """
	    if not text:
	        return False
	    for pattern in KNOWN_FRAUD_MARKER_REGEXES:
	        if pattern.search(text) is not None:
	            return True
	    return False


	def contains_deny_token(text: str) -> bool:
	    """Return True when the lower-cased text contains any ``DENY_LIST_TOKENS`` entry.

	    Empty or missing text yields False.
	    """
	    if not text:
	        return False
	    haystack = text.lower()
	    for token in DENY_LIST_TOKENS:
	        if token in haystack:
	            return True
	    return False


	# Public API of this module: reference tables first, then helper functions.
	__all__ = [
	    # Reference tables
	    "CATEGORY_TARGETING_COMPATIBILITY", "DENY_LIST_TOKENS",
	    "KNOWN_FRAUD_MARKER_REGEXES", "VALID_COUNTRY_TLD_PAIRS",
	    # Helper functions
	    "contains_deny_token", "contains_fraud_marker",
	    "extract_tlds_from_text",
	    "is_adult_only_category", "is_high_risk_tld",
	    "is_tld_plausible_for_country",
	    "targeting_matches_category", "targeting_mentions_minors",
	]