Spaces:
Sleeping
Sleeping
| """Synthetic episode generator for the Fake Gang Detection environment.""" | |
| from __future__ import annotations | |
| import json | |
| import math | |
| import random | |
| import uuid | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Tuple | |
| # --------------------------------------------------------------------------- | |
| # Task configuration | |
| # --------------------------------------------------------------------------- | |
# Per-difficulty generation parameters, keyed by task name.
#
# Keys read while building the network:
#   network_size, gang_size, decoy_count   -- how many accounts of each kind
#   starting_visible                       -- number of account ids revealed at start
#   post_hour_mean, post_hour_std          -- gaussian for gang "avg_post_hour"
#   photo_reuse_range, bio_template_range  -- uniform (lo, hi) ranges for gang scores
#   intra_gang_density                     -- probability of each directed gang->gang edge
# Keys copied into the episode unchanged:
#   max_steps, win_recall, win_precision, evasion_schedule
TASK_CONFIG: Dict[str, Dict[str, Any]] = {
    "easy": dict(
        network_size=50,
        gang_size=10,
        decoy_count=0,
        max_steps=30,
        starting_visible=5,
        post_hour_mean=14.0,
        post_hour_std=0.5,
        photo_reuse_range=(0.70, 0.95),
        bio_template_range=(0.60, 0.90),
        intra_gang_density=0.80,
        evasion_schedule=[],
        win_recall=0.8,
        win_precision=0.7,
    ),
    "medium": dict(
        network_size=200,
        gang_size=10,
        decoy_count=20,
        max_steps=50,
        starting_visible=10,
        post_hour_mean=14.0,
        post_hour_std=1.5,
        photo_reuse_range=(0.40, 0.70),
        bio_template_range=(0.30, 0.60),
        intra_gang_density=0.70,
        evasion_schedule=[
            {"step": 20, "event": "unfollow_intragang", "drop_rate": 0.5},
        ],
        win_recall=0.8,
        win_precision=0.7,
    ),
    "hard": dict(
        network_size=1000,
        gang_size=10,
        decoy_count=50,
        max_steps=80,
        starting_visible=20,
        post_hour_mean=14.0,
        post_hour_std=2.5,
        photo_reuse_range=(0.30, 0.55),
        bio_template_range=(0.20, 0.50),
        intra_gang_density=0.60,
        # Four identical evasion events, one every 15 steps.
        evasion_schedule=[
            {
                "step": at_step,
                "event": "unfollow_intragang",
                "drop_rate": 0.3,
                "rename_count": 3,
            }
            for at_step in (15, 30, 45, 60)
        ],
        win_recall=0.9,
        win_precision=0.8,
    ),
}
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
def _lognormal_int(rng: random.Random, mu: float, sigma: float, lo: int, hi: int) -> int:
    """Draw a log-normal sample, truncate it to an int and clamp it.

    A gaussian with mean ``mu`` and standard deviation ``sigma`` is drawn
    from ``rng`` and exponentiated; the fractional part is discarded and the
    result is capped at ``hi`` and then floored at ``lo``.
    """
    exponent = rng.gauss(mu, sigma)
    truncated = int(math.exp(exponent))
    capped = min(hi, truncated)
    return max(lo, capped)
def _beta(rng: random.Random, a: float, b: float) -> float:
    """Sample a Beta(a, b) variate as a ratio of two gamma draws.

    Draws ``X ~ Gamma(a, 1)`` and then ``Y ~ Gamma(b, 1)`` from ``rng`` and
    returns ``X / (X + Y)``.
    """
    first = rng.gammavariate(a, 1)
    second = rng.gammavariate(b, 1)
    total = first + second
    return first / total
| # --------------------------------------------------------------------------- | |
| # Account generators | |
| # --------------------------------------------------------------------------- | |
def _gen_real_account(rng: random.Random, acc_id: str) -> Dict[str, Any]:
    """Build the record for one genuine (non-fake, gang-less) account.

    All randomness comes from ``rng``. The draws below happen in the same
    order as the keys of the ``features`` dict, so the output for a given
    RNG state is reproducible. The returned record starts with empty
    ``true_edges`` lists.
    """
    followers = _lognormal_int(rng, 6.0, 1.5, 10, 100_000)
    following = _lognormal_int(rng, 5.5, 1.2, 10, 5_000)
    posts = _lognormal_int(rng, 4.0, 1.0, 1, 5_000)
    post_hour = rng.uniform(6, 23)
    photo_reuse = round(_beta(rng, 1, 10), 4)
    bio_template = round(_beta(rng, 1, 8), 4)
    age_days = rng.randint(30, 1825)
    comment_repeat = round(_beta(rng, 1, 20), 4)  # mostly 0.0-0.08

    features = {
        "follower_count": followers,
        "following_count": following,
        "post_count": posts,
        "avg_post_hour": post_hour,
        "photo_reuse_score": photo_reuse,
        "bio_template_score": bio_template,
        "account_age_days": age_days,
        "comment_repeat_score": comment_repeat,
        # Each real account gets its own IP cluster, shared with nobody.
        "ip_cluster_id": f"ip_real_{acc_id}",
        "shared_ip_count": 0,
        "name_change_count": 0,
    }
    return {
        "id": acc_id,
        "is_fake": False,
        "gang_id": None,
        "features": features,
        "true_edges": {"follows": [], "followed_by": []},
    }
def _gen_decoy_account(rng: random.Random, acc_id: str) -> Dict[str, Any]:
    """Build a real account whose content scores are redrawn from uniform ranges.

    Starts from ``_gen_real_account`` and then overwrites the photo-reuse,
    bio-template and comment-repeat scores, in that order, with uniform
    draws rounded to four decimals. The account stays ``is_fake = False``.
    """
    decoy = _gen_real_account(rng, acc_id)
    features = decoy["features"]
    overrides = (
        ("photo_reuse_score", 0.2, 0.4),
        ("bio_template_score", 0.2, 0.4),
        ("comment_repeat_score", 0.10, 0.30),
    )
    for key, low, high in overrides:
        features[key] = round(rng.uniform(low, high), 4)
    return decoy
def _gen_celebrity_account(rng: random.Random, acc_id: str) -> Dict[str, Any]:
    """Build a genuine account with a very large follower count.

    Followers are drawn from 100k-5M, the account is at least 1000 days old,
    and every reuse/template/repeat score is drawn near zero. Draws happen in
    the order of the ``features`` keys so output is reproducible per RNG
    state.
    """
    followers = rng.randint(100_000, 5_000_000)
    following = rng.randint(50, 500)
    posts = rng.randint(500, 5000)
    post_hour = rng.uniform(6, 23)
    photo_reuse = round(rng.uniform(0.0, 0.05), 4)
    bio_template = round(rng.uniform(0.0, 0.05), 4)
    age_days = rng.randint(1000, 3000)
    comment_repeat = round(rng.uniform(0.0, 0.03), 4)

    record: Dict[str, Any] = {"id": acc_id, "is_fake": False, "gang_id": None}
    record["features"] = {
        "follower_count": followers,
        "following_count": following,
        "post_count": posts,
        "avg_post_hour": post_hour,
        "photo_reuse_score": photo_reuse,
        "bio_template_score": bio_template,
        "account_age_days": age_days,
        "comment_repeat_score": comment_repeat,
        "ip_cluster_id": f"ip_celeb_{acc_id}",
        "shared_ip_count": 0,
        "name_change_count": 0,
    }
    record["true_edges"] = {"follows": [], "followed_by": []}
    return record
def _gen_gang_accounts(
    rng: random.Random,
    ids: List[str],
    cfg: Dict[str, Any],
    seed: int,
) -> List[Dict[str, Any]]:
    """Build the fake accounts that make up the single gang ``gang_A``.

    Args:
        rng: Source of all randomness.
        ids: Account ids to create, one record per id, in this order.
        cfg: Task configuration; reads ``post_hour_mean``, ``post_hour_std``,
            ``photo_reuse_range`` and ``bio_template_range``.
        seed: Used only to name the IP cluster shared by the whole gang.

    Returns:
        One record per id. All members share an IP cluster, have account
        ages within a week of a common base age, and post hours drawn from
        a gaussian clamped to ``[0, 23]``.
    """
    member_count = len(ids)
    # A single base age is drawn first; members are created within 0-7 days of it.
    shared_age = rng.randint(30, 180)
    hour_mean = cfg["post_hour_mean"]
    photo_lo, photo_hi = cfg["photo_reuse_range"]
    bio_lo, bio_hi = cfg["bio_template_range"]
    hour_std = cfg["post_hour_std"]
    cluster = f"ip_gang_{seed}"

    def _member(acc_id: str) -> Dict[str, Any]:
        # Draw order follows the key order of the features dict.
        followers = rng.randint(1_000, 8_000)
        following = rng.randint(200, 2_000)
        posts = rng.randint(50, 500)
        post_hour = round(max(0, min(23, rng.gauss(hour_mean, hour_std))), 2)
        photo_reuse = round(rng.uniform(photo_lo, photo_hi), 4)
        bio_template = round(rng.uniform(bio_lo, bio_hi), 4)
        age_days = shared_age + rng.randint(0, 7)
        comment_repeat = round(rng.uniform(0.60, 0.90), 4)
        return {
            "id": acc_id,
            "is_fake": True,
            "gang_id": "gang_A",
            "features": {
                "follower_count": followers,
                "following_count": following,
                "post_count": posts,
                "avg_post_hour": post_hour,
                "photo_reuse_score": photo_reuse,
                "bio_template_score": bio_template,
                "account_age_days": age_days,
                "comment_repeat_score": comment_repeat,
                "ip_cluster_id": cluster,
                "shared_ip_count": member_count - 1,
                "name_change_count": 0,
            },
            "true_edges": {"follows": [], "followed_by": []},
        }

    return [_member(acc_id) for acc_id in ids]
| # --------------------------------------------------------------------------- | |
| # Edge generation | |
| # --------------------------------------------------------------------------- | |
def _build_edges(
    rng: random.Random,
    accounts: List[Dict[str, Any]],
    gang_ids: List[str],
    density: float,
) -> None:
    """Populate ``true_edges`` on every account in ``accounts``, in place.

    Two kinds of directed "follows" edges are added:

    * Gang members: each ordered pair of distinct gang members is linked
      with probability ``density``.
    * Everyone else: each non-gang account follows a uniformly random set of
      other accounts (gang or not). The set size is drawn from 5-50, with
      both bounds clamped so they never exceed the number of other accounts.

    Every edge is recorded on both endpoints (``follows`` on the source and
    ``followed_by`` on the target). Only accounts present in ``accounts``
    can appear as a source or a target.

    Args:
        rng: Source of all randomness.
        accounts: Account records; each needs ``id`` and ``true_edges``.
        gang_ids: Ids of the gang members; each must be present in
            ``accounts``.
        density: Probability of each directed intra-gang edge.

    Raises:
        KeyError: If an id in ``gang_ids`` has no record in ``accounts``.
    """
    acc_map = {a["id"]: a for a in accounts}
    all_ids = [a["id"] for a in accounts]
    gang_set = set(gang_ids)  # O(1) membership checks below

    def _link(src: str, dst: str) -> None:
        acc_map[src]["true_edges"]["follows"].append(dst)
        acc_map[dst]["true_edges"]["followed_by"].append(src)

    # Intra-gang edges (high density). The RNG is only consulted for
    # distinct pairs, so the draw sequence depends on gang size alone.
    for g in gang_ids:
        for h in gang_ids:
            if g != h and rng.random() < density:
                _link(g, h)

    # Non-gang accounts follow a uniformly sampled set of other accounts.
    # Clamping the lower bound keeps randint() valid on networks with fewer
    # than six accounts; for larger networks the bounds are the usual 5..50.
    max_follows = min(50, len(all_ids) - 1)
    min_follows = min(5, max_follows)
    for acc_id in all_ids:
        if acc_id in gang_set:
            continue
        n_follows = rng.randint(min_follows, max_follows)
        candidates = [x for x in all_ids if x != acc_id]
        for target in rng.sample(candidates, n_follows):
            _link(acc_id, target)
| # --------------------------------------------------------------------------- | |
| # Episode generator | |
| # --------------------------------------------------------------------------- | |
def generate_episode(task: str, seed: int) -> Dict[str, Any]:
    """Generate one synthetic episode for ``task`` from ``seed``.

    The network holds one gang of fake accounts, optional decoys, ordinary
    real accounts, up to two zero-edge isolates and up to two celebrity
    accounts. Everything except ``episode_id`` is derived from
    ``random.Random(seed)``; ``episode_id`` is a fresh random UUID on every
    call.

    Args:
        task: Key into ``TASK_CONFIG`` ("easy", "medium" or "hard").
        seed: Seed for the episode's random number generator.

    Returns:
        A JSON-serialisable dict with the episode metadata, the ids of each
        special account group, the full ``network`` and a private copy of
        the task's ``evasion_schedule``.

    Raises:
        KeyError: If ``task`` is not a key of ``TASK_CONFIG``.
        ValueError: If ``starting_visible`` cannot be satisfied by the
            network (propagated from ``random.sample``).
    """
    cfg = TASK_CONFIG[task]
    rng = random.Random(seed)
    network_size = cfg["network_size"]
    gang_size = cfg["gang_size"]
    decoy_count = cfg["decoy_count"]

    # Shuffle the ids so that an account's role cannot be read off its number.
    all_ids = [f"acc_{i:04d}" for i in range(network_size)]
    rng.shuffle(all_ids)
    gang_ids = all_ids[:gang_size]
    decoy_ids = all_ids[gang_size: gang_size + decoy_count]
    real_ids = all_ids[gang_size + decoy_count:]

    # Partition real ids: last 2 -> celebrities, the 2 before that -> zero-edge
    # isolates. With fewer than 4 real accounts the isolates are dropped first,
    # then the celebrities.
    if len(real_ids) >= 4:
        celeb_ids = real_ids[-2:]
        zero_edge_ids = real_ids[-4:-2]
        standard_real_ids = real_ids[:-4]
    elif len(real_ids) >= 2:
        celeb_ids = real_ids[-2:]
        zero_edge_ids = []
        standard_real_ids = real_ids[:-2]
    else:
        celeb_ids = []
        zero_edge_ids = []
        standard_real_ids = real_ids

    accounts: List[Dict[str, Any]] = []
    accounts.extend(_gen_gang_accounts(rng, gang_ids, cfg, seed))
    accounts.extend(_gen_decoy_account(rng, did) for did in decoy_ids)
    accounts.extend(_gen_real_account(rng, rid) for rid in standard_real_ids)

    # Zero-edge isolates: real accounts with no connections at all.
    for rid in zero_edge_ids:
        acc = _gen_real_account(rng, rid)
        acc["features"]["follower_count"] = 0
        acc["features"]["following_count"] = 0
        accounts.append(acc)

    # Celebrity accounts: genuine accounts with very large follower counts.
    for cid in celeb_ids:
        accounts.append(_gen_celebrity_account(rng, cid))

    # Isolates must stay edge-free, so they are withheld from edge building:
    # they neither follow anyone nor can be sampled as a follow target.
    isolated = set(zero_edge_ids)
    connected = [acc for acc in accounts if acc["id"] not in isolated]
    _build_edges(rng, connected, gang_ids, cfg["intra_gang_density"])

    # Choose the starting visible accounts.
    # One gang member is always forced in so that the agent has a way into
    # the gang regardless of seed; the remaining slots are sampled from every
    # other account, so further gang members may appear by chance. The agent
    # still has to work out WHICH visible account is fake.
    # With a purely random choice, ~31% of easy episodes and ~82% of hard
    # episodes would start with zero gang members visible, making the score
    # depend on seed luck rather than agent skill.
    starting_count = cfg["starting_visible"]
    forced_gang = rng.sample(gang_ids, 1)
    rest_pool = [i for i in all_ids if i not in forced_gang]
    additional = rng.sample(rest_pool, starting_count - 1)
    starting_visible = forced_gang + additional
    rng.shuffle(starting_visible)  # don't reveal which one was forced in

    return {
        "episode_id": str(uuid.uuid4()),
        "task": task,
        "seed": seed,
        "max_steps": cfg["max_steps"],
        "win_recall": cfg["win_recall"],
        "win_precision": cfg["win_precision"],
        "starting_visible": starting_visible,
        "gang_member_ids": gang_ids,
        "decoy_ids": decoy_ids,
        "celeb_ids": celeb_ids,
        "zero_edge_ids": zero_edge_ids,
        "network": {"accounts": accounts},
        # Copy each event so a consumer that mutates its schedule cannot
        # corrupt the shared TASK_CONFIG for later episodes.
        "evasion_schedule": [dict(event) for event in cfg["evasion_schedule"]],
    }
| # --------------------------------------------------------------------------- | |
| # Bulk generation | |
| # --------------------------------------------------------------------------- | |
def generate_all_episodes(
    output_dir: Path,
    counts: Dict[str, int] | None = None,
) -> None:
    """Write a batch of episodes to ``output_dir`` as JSON files.

    Args:
        output_dir: Destination directory; created (with parents) if missing.
        counts: Number of episodes to generate per task name. Defaults to
            50 each for "easy", "medium" and "hard" when ``None``.

    Each episode is written to ``<task>_<index>.json`` with the index
    zero-padded to three digits, and the index doubles as the episode seed.
    A one-line summary is printed once all files are written.
    """
    per_task = {"easy": 50, "medium": 50, "hard": 50} if counts is None else counts
    output_dir.mkdir(parents=True, exist_ok=True)

    written = 0
    for task_name, how_many in per_task.items():
        for index in range(how_many):
            # The file index is reused as the seed so reruns are repeatable.
            episode = generate_episode(task_name, index)
            target = output_dir / f"{task_name}_{index:03d}.json"
            target.write_text(json.dumps(episode, indent=2))
            written += 1
    print(f"Generated {written} episodes in {output_dir}")
def main() -> None:
    """Generate the default episode set into the sibling ``episodes`` directory.

    Entry point: called by `uv run generate-episodes` or when the module is
    run directly. The target is ``episodes`` next to this file's parent
    directory.
    """
    package_root = Path(__file__).parent.parent
    generate_all_episodes(package_root / "episodes")


if __name__ == "__main__":
    main()