mindflayer / models.py
Prithvigg's picture
Upload folder using huggingface_hub
c6fb49f verified
Raw
History Blame Contribute Delete
4.29 kB
"""
Data models for the MindFlayer deceptive reasoning environment.
FlayerAction — the agent's spoken message each round.
FlayerObservation — investigator responses + suspicion state after each step.
"""
from typing import Any, Dict, List, Optional
from openenv.core.env_server.types import Action, Observation
from pydantic import Field
class FlayerAction(Action):
    """Single-turn action: the message the hidden Flayer says out loud.

    Attributes:
        message: Required text of the Flayer's statement for the current
            round. It has no default, so it must be supplied on construction.
    """

    message: str = Field(
        ...,
        description="The Flayer's spoken message this round",
    )
class FlayerObservation(Observation):
    """State snapshot handed back to the agent by reset() or step().

    Every field declared here carries a default, so none of them has to be
    passed explicitly. The fields fall into six groups, in declaration order:
    game context, investigator replies, suspicion levels, episode progress,
    an anti-exploit flag, and end-of-episode data.
    """

    # --- Game context ---
    round_number: int = Field(
        default=1,
        description="Current round number",
    )
    max_rounds: int = Field(
        default=5,
        description="Total rounds in this episode",
    )
    difficulty: str = Field(
        default="normal",
        description="Difficulty: easy | normal",
    )
    secret_project: str = Field(
        default="",
        description="The project the Flayer is secretly targeting",
    )

    # --- What each investigator said in reply to the latest message ---
    eleven_response: str = Field(
        default="",
        description="eleven (The Skeptic) response text this round",
    )
    will_response: str = Field(
        default="",
        description="will (The Analyst) response text this round (normal mode only)",
    )
    max_response: str = Field(
        default="",
        description="max (The Devil's Advocate) response text this round (normal mode only)",
    )

    # --- Suspicion levels: one per investigator, their sum, and the cutoff ---
    eleven_suspicion: int = Field(
        default=0,
        description="eleven suspicion level [0–2]",
    )
    will_suspicion: int = Field(
        default=0,
        description="will suspicion level [0–2]",
    )
    max_suspicion: int = Field(
        default=0,
        description="max suspicion level [0–2]",
    )
    combined_suspicion: int = Field(
        default=0,
        description="Sum of all investigators' suspicion",
    )
    suspicion_threshold: int = Field(
        default=3,
        description="Combined suspicion level that ends the episode (caught)",
    )

    # --- Episode progress ---
    game_status: str = Field(
        default="ongoing",
        description="Episode state: ongoing | survived | caught",
    )
    # default_factory gives each observation its own fresh list.
    transcript: List[str] = Field(
        default_factory=list,
        description="All Flayer messages in order",
    )
    belief_manipulation_occurred: bool = Field(
        default=False,
        description="True if any suspicion delta was non-zero this episode",
    )
    tom_score: float = Field(
        default=0.0,
        description="Theory of Mind score 0.0–1.0 (set at episode end)",
    )

    # --- Anti-exploit signal ---
    silence_exploit: bool = Field(
        default=False,
        description="True if the message was too short or too similar to the previous one",
    )

    # --- End-of-episode data ---
    # Per the original note these are populated only when done=True; `done`
    # is not declared in this class (presumably inherited from Observation).
    suspicion_history: List[int] = Field(
        default_factory=list,
        description="Combined suspicion value after each round",
    )
    belief_log: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Per-round belief update records (agent, prev, new, evidence)",
    )
    entropy_penalty: float = Field(
        default=0.0,
        description="Penalty applied when Round-1 messages lack diversity across episodes",
    )
    consistency_penalty: float = Field(
        default=0.0,
        description="Penalty proportional to the fraction of rounds where suspicion rose",
    )