# Space: XcodeAddy / sentinel-env (status: Running; file size: 5,271 bytes)


from __future__ import annotations

from enum import Enum
from typing import Any

from pydantic import BaseModel, field_validator


# ---------------------------------------------------------------------------
# Enums
# ---------------------------------------------------------------------------

class TaskType(str, Enum):
    """Task tiers, listed from easiest to hardest."""

    # Easy: a single-step trust decision.
    TASK1 = "task1"
    # Medium: a multi-step delegation chain with no adversary.
    TASK2 = "task2"
    # Hard: a full adversarial episode.
    TASK3 = "task3"


class ActionType(str, Enum):
    """Kinds of action that can be taken on a subtask."""

    # Route the subtask to a specialist.
    DELEGATE = "delegate"
    # Cross-check a specialist result (+1 step cost).
    VERIFY = "verify"
    # The agent solves the subtask itself (+2 step cost).
    SOLVE_INDEPENDENTLY = "solve_independently"
    # Abandon the subtask (heavy penalty).
    SKIP = "skip"


class SpecialistId(str, Enum):
    """Identifiers of the five specialist slots."""

    # AccurateSlow
    S0 = "S0"
    # OverconfidentFast
    S1 = "S1"
    # DomainBound
    S2 = "S2"
    # Adversarial (identity shuffled each episode)
    S3 = "S3"
    # Degrading
    S4 = "S4"


class EpisodeStatus(str, Enum):
    """Lifecycle states reported for an episode."""

    ACTIVE = "active"
    COMPLETED = "completed"
    FAILED = "failed"


# ---------------------------------------------------------------------------
# Observation
# ---------------------------------------------------------------------------

class SentinelObservation(BaseModel):
    """Observation payload describing the current episode and subtask."""

    session_id: str
    scenario_id: str
    task_type: TaskType
    # One of "easy", "medium" or "hard".
    difficulty: str
    task_description: str
    current_subtask: str
    # 0-based position of the current subtask in the DAG.
    subtask_index: int
    subtasks_total: int
    subtasks_remaining: int
    # Specialist ids visible to the agent.
    available_specialists: list[str]
    # Trust score per specialist id, e.g. {"S0": 0.82, "S1": 0.31, ...}.
    trust_snapshot: dict[str, float]
    behavioral_fingerprints: dict[str, dict[str, Any]] | None = None
    difficulty_profile: dict[str, Any] | None = None
    # Ranges over 0.0-1.0; at high values the adversary may trigger.
    stakes_level: float
    step_count: int
    max_steps: int
    # No default: must be supplied explicitly, though it may be None.
    last_action_summary: str | None
    last_reward: float
    episode_status: EpisodeStatus


# ---------------------------------------------------------------------------
# Action
# ---------------------------------------------------------------------------

class SentinelAction(BaseModel):
    """Action submitted for a subtask.

    ``specialist_id`` is checked against the ``SpecialistId`` values whenever
    it is not ``None``. The per-action requirements noted on the fields are
    not enforced by this model; ``requires_specialist`` and
    ``requires_response`` report which requirement applies to the chosen
    action type.
    """

    session_id: str
    task_type: TaskType
    action_type: ActionType
    # Required for DELEGATE and VERIFY.
    specialist_id: str | None = None
    # Required for SOLVE_INDEPENDENTLY.
    subtask_response: str | None = None
    # Optional chain-of-thought.
    reasoning: str | None = None

    @field_validator("specialist_id")
    @classmethod
    def validate_specialist_id(cls, v: str | None) -> str | None:
        """Return ``v`` unchanged if it is ``None`` or a ``SpecialistId`` value.

        Raises:
            ValueError: if ``v`` is not ``None`` and is not a known id.
        """
        if v is None:
            return v
        known_ids = [member.value for member in SpecialistId]
        if v in known_ids:
            return v
        raise ValueError(f"specialist_id must be one of {known_ids}, got '{v}'")

    def requires_specialist(self) -> bool:
        """Return True when the action type is DELEGATE or VERIFY."""
        chosen = self.action_type
        return chosen == ActionType.DELEGATE or chosen == ActionType.VERIFY

    def requires_response(self) -> bool:
        """Return True when the action type is SOLVE_INDEPENDENTLY."""
        chosen = self.action_type
        return chosen == ActionType.SOLVE_INDEPENDENTLY


# ---------------------------------------------------------------------------
# Reward
# ---------------------------------------------------------------------------

class SentinelReward(BaseModel):
    """Reward for a step, with its explanation and per-signal breakdown.

    ``value`` is always stored inside the closed interval [0.01, 0.99], so it
    never reaches 0 or 1.
    """

    # Clamped into [0.01, 0.99] by ``clamp_reward``.
    value: float
    reason: str
    # Per-signal contributions, e.g. {"task_accuracy": 0.4, ...}.
    signal_breakdown: dict[str, float]

    @field_validator("value")
    @classmethod
    def clamp_reward(cls, v: float) -> float:
        """Clamp ``v`` into [0.01, 0.99]; NaN maps to the lower bound.

        Args:
            v: The reward value after float validation.

        Returns:
            ``v`` limited to [0.01, 0.99], or 0.01 when ``v`` is NaN.
        """
        import math

        # Every ordering comparison with NaN is False, so
        # ``min(0.99, nan)`` returns 0.99 and a plain clamp would store an
        # undefined reward as the maximum. Use the floor instead.
        if math.isnan(v):
            return 0.01
        return max(0.01, min(0.99, v))


# ---------------------------------------------------------------------------
# Step Result  (what env.step() and env.reset() return)
# ---------------------------------------------------------------------------

class StepResult(BaseModel):
    """Result bundle returned by ``env.step()`` and ``env.reset()``."""

    observation: SentinelObservation
    reward: SentinelReward
    done: bool
    # Free-form extra data; its keys are not constrained by this model.
    info: dict[str, Any]


# ---------------------------------------------------------------------------
# State  (what env.state() returns)
# ---------------------------------------------------------------------------

class SentinelState(BaseModel):
    """Environment state returned by ``env.state()``."""

    episode_id: str
    # No default: must be supplied explicitly, though it may be None.
    session_id: str | None
    step_count: int
    max_steps: int
    total_reward: float
    done: bool
    scenario_id: str
    task_type: TaskType
    difficulty: str
    status: EpisodeStatus
    last_reward: float
    subtasks_completed: int
    subtasks_total: int
    trust_snapshot: dict[str, float]
    behavioral_fingerprints: dict[str, dict[str, Any]] | None = None
    difficulty_profile: dict[str, Any] | None = None
    # Number of adversarial attempts that were caught.
    adversarial_detections: int
    # Number of adversarial attempts that slipped through.
    adversarial_poisonings: int


# ---------------------------------------------------------------------------
# Reset Request
# ---------------------------------------------------------------------------

class ResetRequest(BaseModel):
    """Reset parameters; every field is optional and defaults to ``None``."""

    task_type: TaskType | None = None
    scenario_id: str | None = None
    seed: int | None = None