Spaces:

Pandago
/

graphstrike

Sleeping

App Files Files Community

graphstrike / server /generator.py

Pandago

Upload folder using huggingface_hub

87f2d84 verified about 2 months ago

raw

history blame contribute delete

11.9 kB

	"""Synthetic episode generator for the Fake Gang Detection environment."""

	from __future__ import annotations

	import json
	import math
	import random
	import uuid
	from pathlib import Path
	from typing import Any, Dict, List, Tuple

	# ---------------------------------------------------------------------------
	# Task configuration
	# ---------------------------------------------------------------------------

	# Per-difficulty generation parameters, keyed by task name. Each entry is read
	# by the generators below; ``evasion_schedule`` and the ``win_*`` thresholds
	# are copied into the generated episode for downstream consumers.
	TASK_CONFIG = {
	    # Easy: small network, no decoys, tightly clustered gang signals, no evasion.
	    "easy": {
	        "network_size": 50,
	        "gang_size": 10,
	        "decoy_count": 0,
	        "max_steps": 30,
	        "starting_visible": 5,
	        "post_hour_mean": 14.0,
	        "post_hour_std": 0.5,
	        "photo_reuse_range": (0.70, 0.95),
	        "bio_template_range": (0.60, 0.90),
	        "intra_gang_density": 0.80,
	        "evasion_schedule": [],
	        "win_recall": 0.8,
	        "win_precision": 0.7,
	    },
	    # Medium: larger network, 20 decoys, weaker signals, one evasion event.
	    "medium": {
	        "network_size": 200,
	        "gang_size": 10,
	        "decoy_count": 20,
	        "max_steps": 50,
	        "starting_visible": 10,
	        "post_hour_mean": 14.0,
	        "post_hour_std": 1.5,
	        "photo_reuse_range": (0.40, 0.70),
	        "bio_template_range": (0.30, 0.60),
	        "intra_gang_density": 0.70,
	        "evasion_schedule": [
	            {"step": 20, "event": "unfollow_intragang", "drop_rate": 0.5},
	        ],
	        "win_recall": 0.8,
	        "win_precision": 0.7,
	    },
	    # Hard: 1000 accounts, 50 decoys, weakest signals, an identical evasion
	    # event every 15 steps, and stricter win thresholds.
	    "hard": {
	        "network_size": 1000,
	        "gang_size": 10,
	        "decoy_count": 50,
	        "max_steps": 80,
	        "starting_visible": 20,
	        "post_hour_mean": 14.0,
	        "post_hour_std": 2.5,
	        "photo_reuse_range": (0.30, 0.55),
	        "bio_template_range": (0.20, 0.50),
	        "intra_gang_density": 0.60,
	        "evasion_schedule": [
	            {"step": step, "event": "unfollow_intragang", "drop_rate": 0.3, "rename_count": 3}
	            for step in (15, 30, 45, 60)
	        ],
	        "win_recall": 0.9,
	        "win_precision": 0.8,
	    },
	}


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	def _lognormal_int(rng: random.Random, mu: float, sigma: float, lo: int, hi: int) -> int:
	    """Draw a log-normally distributed integer clamped to ``[lo, hi]``.

	    A Gaussian sample with mean ``mu`` and standard deviation ``sigma`` is
	    exponentiated, truncated with ``int`` and then clipped: first to the
	    upper bound ``hi``, then to the lower bound ``lo``.
	    """
	    drawn = int(math.exp(rng.gauss(mu, sigma)))
	    # Upper clip first, lower clip second (so ``lo`` wins if lo > hi).
	    capped = hi if hi <= drawn else drawn
	    return lo if lo >= capped else capped


	def _beta(rng: random.Random, a: float, b: float) -> float:
	    """Sample from Beta(a, b) as ``X / (X + Y)`` with two gamma draws.

	    ``X`` is drawn from Gamma(a, 1) and ``Y`` from Gamma(b, 1); their ratio
	    ``X / (X + Y)`` is Beta(a, b) distributed. Both draws are always made,
	    in that order, so the number of values consumed from ``rng`` is fixed.

	    Returns:
	        A float in ``[0.0, 1.0]``. If both gamma draws are exactly zero the
	        ratio is undefined; ``0.0`` is returned instead of raising
	        ``ZeroDivisionError`` (the same fallback ``random.betavariate`` uses).
	    """
	    x = rng.gammavariate(a, 1)
	    y = rng.gammavariate(b, 1)
	    total = x + y
	    if total == 0:
	        # Degenerate draw: avoid dividing by zero.
	        return 0.0
	    return x / total


	# ---------------------------------------------------------------------------
	# Account generators
	# ---------------------------------------------------------------------------

	def _gen_real_account(rng: random.Random, acc_id: str) -> Dict[str, Any]:
	    """Create one genuine (non-fake) account record with empty edge lists.

	    Count features come from clamped log-normal draws, the similarity
	    scores from low-skewed beta draws, and each account gets its own
	    ``ip_cluster_id`` derived from ``acc_id``. Features are drawn in the
	    order they appear in the resulting dict.
	    """
	    features: Dict[str, Any] = {}
	    features["follower_count"] = _lognormal_int(rng, 6.0, 1.5, 10, 100_000)
	    features["following_count"] = _lognormal_int(rng, 5.5, 1.2, 10, 5_000)
	    features["post_count"] = _lognormal_int(rng, 4.0, 1.0, 1, 5_000)
	    features["avg_post_hour"] = rng.uniform(6, 23)
	    features["photo_reuse_score"] = round(_beta(rng, 1, 10), 4)
	    features["bio_template_score"] = round(_beta(rng, 1, 8), 4)
	    features["account_age_days"] = rng.randint(30, 1825)
	    # Beta(1, 20) has mean 1/21, so this score is usually close to zero.
	    features["comment_repeat_score"] = round(_beta(rng, 1, 20), 4)
	    features["ip_cluster_id"] = f"ip_real_{acc_id}"
	    features["shared_ip_count"] = 0
	    features["name_change_count"] = 0
	    account: Dict[str, Any] = {
	        "id": acc_id,
	        "is_fake": False,
	        "gang_id": None,
	        "features": features,
	        "true_edges": {"follows": [], "followed_by": []},
	    }
	    return account


	def _gen_decoy_account(rng: random.Random, acc_id: str) -> Dict[str, Any]:
	    """Create a real account whose similarity scores are raised.

	    Starts from ``_gen_real_account`` and overwrites three scores with
	    uniform draws from mid-range intervals; the account stays labelled
	    ``is_fake: False``.
	    """
	    decoy = _gen_real_account(rng, acc_id)
	    features = decoy["features"]
	    # (feature key, lower bound, upper bound), drawn in this order.
	    overrides = (
	        ("photo_reuse_score", 0.2, 0.4),
	        ("bio_template_score", 0.2, 0.4),
	        ("comment_repeat_score", 0.10, 0.30),
	    )
	    for key, low, high in overrides:
	        features[key] = round(rng.uniform(low, high), 4)
	    return decoy


	def _gen_celebrity_account(rng: random.Random, acc_id: str) -> Dict[str, Any]:
	    """Create a genuine account with a very large follower count.

	    Followers are drawn from 100k-5M, the similarity scores are kept near
	    zero, and the account is labelled ``is_fake: False`` with its own
	    ``ip_cluster_id``. Values are drawn in the order the features are listed.
	    """
	    followers = rng.randint(100_000, 5_000_000)
	    following = rng.randint(50, 500)
	    posts = rng.randint(500, 5000)
	    post_hour = rng.uniform(6, 23)
	    photo_reuse = round(rng.uniform(0.0, 0.05), 4)
	    bio_template = round(rng.uniform(0.0, 0.05), 4)
	    age_days = rng.randint(1000, 3000)
	    comment_repeat = round(rng.uniform(0.0, 0.03), 4)
	    features: Dict[str, Any] = {
	        "follower_count": followers,
	        "following_count": following,
	        "post_count": posts,
	        "avg_post_hour": post_hour,
	        "photo_reuse_score": photo_reuse,
	        "bio_template_score": bio_template,
	        "account_age_days": age_days,
	        "comment_repeat_score": comment_repeat,
	        "ip_cluster_id": f"ip_celeb_{acc_id}",
	        "shared_ip_count": 0,
	        "name_change_count": 0,
	    }
	    return {
	        "id": acc_id,
	        "is_fake": False,
	        "gang_id": None,
	        "features": features,
	        "true_edges": {"follows": [], "followed_by": []},
	    }


	def _gen_gang_accounts(
	    rng: random.Random,
	    ids: List[str],
	    cfg: Dict[str, Any],
	    seed: int,
	) -> List[Dict[str, Any]]:
	    """Create the coordinated fake accounts, one per entry of ``ids``.

	    All members share one ``ip_cluster_id`` (derived from ``seed``), have
	    account ages within a week of a common base age, post around
	    ``cfg["post_hour_mean"]`` and draw their photo/bio scores from the
	    ranges given in ``cfg``. Edge lists are left empty.
	    """
	    # The shared base age is the first value drawn from ``rng``.
	    shared_age = rng.randint(30, 180)
	    hour_mean = cfg["post_hour_mean"]
	    photo_lo, photo_hi = cfg["photo_reuse_range"]
	    bio_lo, bio_hi = cfg["bio_template_range"]
	    hour_std = cfg["post_hour_std"]
	    peers = len(ids) - 1
	    cluster = f"ip_gang_{seed}"
	    members: List[Dict[str, Any]] = []
	    for member_id in ids:
	        followers = rng.randint(1_000, 8_000)
	        following = rng.randint(200, 2_000)
	        posts = rng.randint(50, 500)
	        # Clamp the Gaussian posting hour into 0..23 before rounding.
	        hour = round(max(0, min(23, rng.gauss(hour_mean, hour_std))), 2)
	        photo = round(rng.uniform(photo_lo, photo_hi), 4)
	        bio = round(rng.uniform(bio_lo, bio_hi), 4)
	        age = shared_age + rng.randint(0, 7)
	        repeat = round(rng.uniform(0.60, 0.90), 4)
	        record: Dict[str, Any] = {
	            "id": member_id,
	            "is_fake": True,
	            "gang_id": "gang_A",
	            "features": {
	                "follower_count": followers,
	                "following_count": following,
	                "post_count": posts,
	                "avg_post_hour": hour,
	                "photo_reuse_score": photo,
	                "bio_template_score": bio,
	                "account_age_days": age,
	                "comment_repeat_score": repeat,
	                "ip_cluster_id": cluster,
	                "shared_ip_count": peers,
	                "name_change_count": 0,
	            },
	            "true_edges": {"follows": [], "followed_by": []},
	        }
	        members.append(record)
	    return members


	# ---------------------------------------------------------------------------
	# Edge generation
	# ---------------------------------------------------------------------------

	def _build_edges(
	    rng: random.Random,
	    accounts: List[Dict[str, Any]],
	    gang_ids: List[str],
	    density: float,
	) -> None:
	    """Add directed follow edges to ``accounts`` in-place.

	    Two passes are made, in this order:

	    1. Gang: every ordered pair of distinct gang members gets a follow edge
	       with probability ``density``.
	    2. Everyone else: each non-gang account follows a uniformly random set
	       of other accounts (gang members included). The set size is drawn
	       from 5..50, capped at the number of other accounts.

	    Each edge is recorded on both endpoints: the target id is appended to
	    the source's ``true_edges["follows"]`` and the source id to the target's
	    ``true_edges["followed_by"]``.

	    Args:
	        rng: Random source; consumed deterministically.
	        accounts: Account dicts to wire up. Only ids present here can be
	            the source or target of an edge.
	        gang_ids: Ids of the gang members; each must be present in
	            ``accounts``.
	        density: Probability of each directed intra-gang edge.
	    """
	    acc_map = {a["id"]: a for a in accounts}
	    all_ids = [a["id"] for a in accounts]
	    gang_set = set(gang_ids)
	    # Intra-gang edges (high density). ``rng.random()`` is only consumed for
	    # pairs of distinct members.
	    for g in gang_ids:
	        for h in gang_ids:
	            if g != h and rng.random() < density:
	                acc_map[g]["true_edges"]["follows"].append(h)
	                acc_map[h]["true_edges"]["followed_by"].append(g)
	    # Real/decoy accounts: sparse, uniformly random follows. The lower bound
	    # is capped as well, so a network with fewer than six accounts no longer
	    # makes ``randint`` fail on an empty range.
	    max_follows = min(50, len(all_ids) - 1)
	    min_follows = min(5, max_follows)
	    for acc_id in all_ids:
	        if acc_id in gang_set:
	            continue
	        n_follows = rng.randint(min_follows, max_follows)
	        candidates = [x for x in all_ids if x != acc_id]
	        for t in rng.sample(candidates, min(n_follows, len(candidates))):
	            acc_map[acc_id]["true_edges"]["follows"].append(t)
	            acc_map[t]["true_edges"]["followed_by"].append(acc_id)


	# ---------------------------------------------------------------------------
	# Episode generator
	# ---------------------------------------------------------------------------

	def generate_episode(task: str, seed: int) -> Dict[str, Any]:
	    """Generate one synthetic episode for ``task`` from ``seed``.

	    The network consists of a gang of fake accounts, decoys, standard real
	    accounts, up to two zero-edge isolates and up to two celebrity accounts.
	    Everything except ``episode_id`` (a fresh random UUID on every call) is
	    determined by ``seed``.

	    Args:
	        task: Key into ``TASK_CONFIG`` ("easy", "medium" or "hard").
	        seed: Seed for the private ``random.Random`` used for all draws.

	    Returns:
	        A JSON-serialisable dict with the episode metadata, the id lists of
	        each account category, the full network under
	        ``["network"]["accounts"]`` and a copy of the task's evasion schedule.

	    Raises:
	        KeyError: If ``task`` is not a key of ``TASK_CONFIG``.
	        ValueError: Raised by ``rng.sample`` if the configured
	            ``starting_visible`` count cannot be drawn from the network.
	    """
	    cfg = TASK_CONFIG[task]
	    rng = random.Random(seed)
	    network_size = cfg["network_size"]
	    gang_size = cfg["gang_size"]
	    decoy_count = cfg["decoy_count"]
	    # Generate IDs and assign roles by position in the shuffled list.
	    all_ids = [f"acc_{i:04d}" for i in range(network_size)]
	    rng.shuffle(all_ids)
	    gang_ids = all_ids[:gang_size]
	    decoy_ids = all_ids[gang_size: gang_size + decoy_count]
	    real_ids = all_ids[gang_size + decoy_count:]
	    # Partition real IDs: last 2 -> celebrities, 2 before that -> zero-edge
	    # isolates. Needs at least 4 real accounts; falls back gracefully if not.
	    if len(real_ids) >= 4:
	        celeb_ids = real_ids[-2:]
	        zero_edge_ids = real_ids[-4:-2]
	        standard_real_ids = real_ids[:-4]
	    elif len(real_ids) >= 2:
	        celeb_ids = real_ids[-2:]
	        zero_edge_ids = []
	        standard_real_ids = real_ids[:-2]
	    else:
	        celeb_ids = []
	        zero_edge_ids = []
	        standard_real_ids = real_ids
	    accounts: List[Dict[str, Any]] = []
	    accounts.extend(_gen_gang_accounts(rng, gang_ids, cfg, seed))
	    accounts.extend(_gen_decoy_account(rng, did) for did in decoy_ids)
	    accounts.extend(_gen_real_account(rng, rid) for rid in standard_real_ids)
	    # Zero-edge isolates: real accounts with no connections (edge cases for
	    # the agent). Their visible counts are zeroed here and they are kept out
	    # of edge generation below.
	    for rid in zero_edge_ids:
	        acc = _gen_real_account(rng, rid)
	        acc["features"]["follower_count"] = 0
	        acc["features"]["following_count"] = 0
	        accounts.append(acc)
	    # Celebrity accounts: legitimate accounts with very large follower counts.
	    for cid in celeb_ids:
	        accounts.append(_gen_celebrity_account(rng, cid))
	    # Wire edges among every account except the isolates. _build_edges only
	    # uses the ids of the accounts it is given as sources and targets, so
	    # leaving the isolates out keeps their ``true_edges`` empty as advertised.
	    isolated = set(zero_edge_ids)
	    connected = [acc for acc in accounts if acc["id"] not in isolated]
	    _build_edges(rng, connected, gang_ids, cfg["intra_gang_density"])
	    # Choose starting visible accounts.
	    # Guarantee at least 1 gang member is included so the cascade CAN start
	    # regardless of seed (further gang members may still be drawn by chance
	    # in the remaining slots). The agent still has to identify WHICH account
	    # is fake (requires inspecting profiles), so difficulty is preserved.
	    # Without this, ~31% of easy episodes and ~82% of hard episodes start with
	    # zero gang members visible, making score variance seed-luck rather than
	    # agent skill.
	    starting_count = cfg["starting_visible"]
	    forced_gang = rng.sample(gang_ids, 1)
	    rest_pool = [i for i in all_ids if i not in forced_gang]
	    additional = rng.sample(rest_pool, starting_count - 1)
	    starting_visible = forced_gang + additional
	    rng.shuffle(starting_visible)  # don't reveal which is fake
	    return {
	        "episode_id": str(uuid.uuid4()),
	        "task": task,
	        "seed": seed,
	        "max_steps": cfg["max_steps"],
	        "win_recall": cfg["win_recall"],
	        "win_precision": cfg["win_precision"],
	        "starting_visible": starting_visible,
	        "gang_member_ids": gang_ids,
	        "decoy_ids": decoy_ids,
	        "celeb_ids": celeb_ids,
	        "zero_edge_ids": zero_edge_ids,
	        "network": {"accounts": accounts},
	        # Copy the schedule so callers that mutate the episode cannot alter
	        # the shared module-level TASK_CONFIG.
	        "evasion_schedule": [dict(event) for event in cfg["evasion_schedule"]],
	    }


	# ---------------------------------------------------------------------------
	# Bulk generation
	# ---------------------------------------------------------------------------

	def generate_all_episodes(
	    output_dir: Path,
	    counts: Dict[str, int] | None = None,
	) -> None:
	    """Generate episodes for each task and write them to ``output_dir``.

	    One JSON file named ``{task}_{index:03d}.json`` is written per episode,
	    and a one-line summary is printed at the end.

	    Args:
	        output_dir: Target directory; created (with parents) if missing.
	        counts: Number of episodes to generate per task name. Defaults to
	            50 each for "easy", "medium" and "hard".
	    """
	    if counts is None:
	        counts = {"easy": 50, "medium": 50, "hard": 50}
	    output_dir.mkdir(parents=True, exist_ok=True)
	    total = 0
	    for task, n in counts.items():
	        for i in range(n):
	            seed = i  # deterministic: the episode index doubles as the seed
	            ep = generate_episode(task, seed)
	            fname = output_dir / f"{task}_{i:03d}.json"
	            fname.write_text(json.dumps(ep, indent=2), encoding="utf-8")
	            total += 1
	    print(f"Generated {total} episodes in {output_dir}")


	def main() -> None:
	    """Entry point: called by `uv run generate-episodes` or directly.

	    Writes the default episode set into the ``episodes`` directory that
	    sits next to this file's parent directory.
	    """
	    package_root = Path(__file__).parent.parent
	    episodes_dir = package_root / "episodes"
	    generate_all_episodes(episodes_dir)


	# Allow running the module as a script.
	if __name__ == "__main__":
	    main()