# Pandago's picture
# Upload folder using huggingface_hub
# a6f0611 verified
from __future__ import annotations
from enum import Enum
from typing import Dict, List, Optional
from pydantic import BaseModel
# ---------------------------------------------------------------------------
# OpenEnv base types
# Use real SDK when available; fall back to stubs for local dev without SDK.
# ---------------------------------------------------------------------------
try:
    # Prefer the real OpenEnv SDK base classes when the package is installed.
    from openenv.core.env_server import Action, Observation, State  # type: ignore
except ImportError:
    # SDK not importable: declare minimal pydantic stubs with the same names
    # so the domain models in this module can still be defined for local dev.

    class Action(BaseModel):  # type: ignore[no-redef]
        """Stub base class for agent actions; declares no fields."""

    class Observation(BaseModel):  # type: ignore[no-redef]
        """Stub base class for observations."""

        done: bool = False
        reward: Optional[float] = None

    class State(BaseModel):  # type: ignore[no-redef]
        """Stub base class for environment state."""

        episode_id: str = ""
        step_count: int = 0
# ---------------------------------------------------------------------------
# Domain types
# ---------------------------------------------------------------------------
class ActionType(str, Enum):
    """Closed set of actions an agent can request.

    Members are also ``str`` instances, so each compares equal to its
    string value. The step costs in the per-member comments are the
    author's documentation; nothing in this class enforces them.
    """

    # Core actions
    INSPECT = "inspect"  # reveal full profile + edges, costs 1 step
    INVESTIGATE_NETWORK = "investigate_network"  # expand graph 1 hop, costs 2 steps
    FLAG = "flag"  # mark account fake (free)
    UNFLAG = "unflag"  # unmark account (free)
    SUBMIT = "submit"  # end episode, trigger scoring

    # Round 2: New tool-call actions
    REVERSE_IMAGE_SEARCH = "reverse_image_search"  # reveal photo_reuse_score, costs 1 step
    ANALYZE_BIO = "analyze_bio"  # reveal bio_template_score, costs 1 step
    CHECK_IP = "check_ip"  # reveal ip_cluster_signal, costs 2 steps
    GET_POLICY = "get_policy"  # get platform policy, costs 0 steps
class AccountStatus(str, Enum):
    """Review status attached to an account.

    Members are also ``str`` instances, so each compares equal to its
    string value.
    """

    NORMAL = "normal"
    SUSPECT = "suspect"  # auto-elevated when a neighbor is flagged
    CONFIRMED_FAKE = "confirmed_fake"  # agent explicitly flagged this account
class FakeGangAction(Action):
    """One agent action: an action type plus an optional target account."""

    action_type: ActionType
    # Required for all actions except SUBMIT.
    # NOTE(review): this model itself does not enforce that requirement;
    # presumably the environment validates it when handling the action -- confirm.
    account_id: Optional[str] = None
class AccountProfile(BaseModel):
    """Feature record for a single account.

    Groups raw profile attributes, graph-derived features, the account's
    status and a risk-score breakdown. The first eight fields (through
    ``account_age_days``) are required; every later field has a default.
    """

    account_id: str
    follower_count: int
    following_count: int
    post_count: int
    avg_post_hour: float  # 0-23
    # 0-1, pre-computed: fraction of posts using stolen celebrity photos
    photo_reuse_score: float
    # 0-1, pre-computed: cosine similarity to known fake bio templates
    bio_template_score: float
    account_age_days: int
    name_change_count: int = 0  # incremented by hard-mode evasion events

    # -- Derived graph features (computed at INSPECT time from live graph state) --
    # How many of this account's follows are currently flagged.
    # High value = deep inside a cluster you're already tracking.
    flagged_neighbor_count: int = 0
    # Fraction of follows that also follow back (0-1).
    # Real fans: low; fake gangs: high (they mutually inflate each other).
    mutual_follow_rate: float = 0.0
    # Mean photo_reuse_score of inspected follows.
    # Gang members cluster: if neighbors are fake, this is high.
    avg_neighbor_photo_reuse: float = 0.0
    # IDs of accounts this account follows (revealed by INSPECT).
    # A literal [] default is fine here: pydantic copies field defaults
    # per instance, so instances do not share the list.
    visible_follows: List[str] = []

    # -- Account status --
    status: AccountStatus = AccountStatus.NORMAL

    # -- Full risk breakdown (computed via scoring.py at INSPECT time) --
    fake_risk_score: float = 0.0
    node_risk: float = 0.0
    behavior_risk: float = 0.0
    graph_risk: float = 0.0
    hub_legitimacy_score: float = 0.0

    # -- New raw features (from generator) --
    comment_repeat_score: float = 0.0  # fakes: 0.6-0.9 | decoys: 0.1-0.3 | reals: 0.0-0.08
    shared_ip_count: int = 0  # fakes: 9 (gang shares 1 IP) | reals: 0-1

    # -- Extended runtime graph features --
    inspected_neighbor_count: int = 0  # denominator for flagged_neighbor_ratio
    post_hour_cluster_score: float = 0.0  # hour alignment to flagged cluster mean
    suspicious_mutual_ratio: float = 0.0  # used in hub legitimacy computation
class FakeGangObservation(Observation):
    """Observation payload for the fake-gang environment.

    Carries the account profiles and graph edges known so far, the
    flagged / inspected / suspect ID lists, and episode progress fields.
    Every field declared here has a default.
    """

    visible_accounts: List[AccountProfile] = []
    visible_account_ids: List[str] = []  # all account IDs the agent knows exist
    flagged_ids: List[str] = []
    inspected_ids: List[str] = []
    graph_edges: Dict[str, List[str]] = {}  # account_id -> list of accounts it follows
    steps_remaining: int = 0
    evasion_triggered: bool = False
    evasion_count: int = 0
    task: str = "easy"  # NOTE(review): presumably the difficulty/task name -- confirm valid values
    message: str = ""
    suspect_ids: List[str] = []  # auto-elevated neighbors of flagged accounts
    platform: str = ""  # Round 2: Platform name (Instagram/Snapchat) - passed from state
class FakeGangState(State):
    """Episode-level state for the fake-gang environment.

    Every field declared here has a default. Field meanings beyond their
    names are not documented in this module.
    """

    task: str = "easy"  # NOTE(review): presumably the difficulty/task name -- confirm valid values
    score_so_far: float = 0.0
    evasion_count: int = 0
    network_size: int = 0
    gang_size: int = 10
    episode_seed: int = 0
    platform: str = ""  # Round 2: Platform name (Instagram/Snapchat)
# ---------------------------------------------------------------------------
# Round 2: Platform Policy Model
# ---------------------------------------------------------------------------
class PlatformPolicy(BaseModel):
    """Dynamically compiled platform policy from transparency reports."""

    # The first eight fields (through fp_penalty_weight) are required;
    # the remaining metadata fields have defaults.
    platform: str  # "Instagram" or "Snapchat"
    threshold: float  # theta* - computed Bayesian threshold for flagging
    base_rate: float  # pi - prevalence of fake accounts
    fn_cost_signal: str  # "low" | "medium" | "high" | "critical"
    fp_cost_signal: str  # "low" | "medium" | "high"
    harm_weight: float  # enforcement vs creator balance (0.5-2.0)
    primary_enforcement_signal: str  # "photo_reuse" | "bio_template" | "ip_cluster"
    fp_penalty_weight: float  # C_fp for reward function
    sources: List[str] = []  # URLs used for extraction
    confidence: float = 0.0  # LLM extraction confidence (0.0-1.0)
    compiled_at: str = ""  # ISO timestamp
    used_fallback: bool = False  # True if fallback policy was used due to extraction failure