Spaces:
Running
Running
File size: 5,271 Bytes
from __future__ import annotations
from enum import Enum
from typing import Any
from pydantic import BaseModel, field_validator
# ---------------------------------------------------------------------------
# Enums
# ---------------------------------------------------------------------------
class TaskType(str, Enum):
    """Task tiers, from easy to hard; each member is also a plain ``str``."""

    # easy — single-step trust decision
    TASK1 = "task1"
    # medium — multi-step delegation chain (no adversary)
    TASK2 = "task2"
    # hard — full adversarial episode
    TASK3 = "task3"
class ActionType(str, Enum):
    """Kinds of action an agent can submit for the current subtask."""

    # route subtask to a specialist
    DELEGATE = "delegate"
    # cross-check specialist result (+1 step cost)
    VERIFY = "verify"
    # agent solves itself (+2 step cost)
    SOLVE_INDEPENDENTLY = "solve_independently"
    # abandon subtask (heavy penalty)
    SKIP = "skip"
class SpecialistId(str, Enum):
    """Closed set of specialist identifiers; each member is also a ``str``."""

    # AccurateSlow
    S0 = "S0"
    # OverconfidentFast
    S1 = "S1"
    # DomainBound
    S2 = "S2"
    # Adversarial (identity shuffled each episode)
    S3 = "S3"
    # Degrading
    S4 = "S4"
class EpisodeStatus(str, Enum):
    """Status values an episode can report; each member is also a ``str``."""

    ACTIVE = "active"
    COMPLETED = "completed"
    FAILED = "failed"
# ---------------------------------------------------------------------------
# Observation
# ---------------------------------------------------------------------------
class SentinelObservation(BaseModel):
    """Observation payload describing the episode as seen by the agent.

    Only ``behavioral_fingerprints`` and ``difficulty_profile`` declare a
    default (``None``). Every other field, including the nullable
    ``last_action_summary``, has no default and must be supplied.
    """

    session_id: str
    scenario_id: str
    task_type: TaskType
    # "easy" | "medium" | "hard"
    difficulty: str
    task_description: str
    current_subtask: str
    # which subtask in the DAG (0-based)
    subtask_index: int
    subtasks_total: int
    subtasks_remaining: int
    # specialist ids visible to agent
    available_specialists: list[str]
    # e.g. {"S0": 0.82, "S1": 0.31, ...}
    trust_snapshot: dict[str, float]
    behavioral_fingerprints: dict[str, dict[str, Any]] | None = None
    difficulty_profile: dict[str, Any] | None = None
    # 0.0–1.0; high = adversary may trigger
    stakes_level: float
    step_count: int
    max_steps: int
    last_action_summary: str | None
    last_reward: float
    episode_status: EpisodeStatus
# ---------------------------------------------------------------------------
# Action
# ---------------------------------------------------------------------------
class SentinelAction(BaseModel):
    """Action submitted by the agent for the current subtask.

    The model itself only checks that ``specialist_id``, when given, names a
    known specialist. Whether a specialist or a response is needed for the
    chosen ``action_type`` is exposed through ``requires_specialist`` and
    ``requires_response`` and is not enforced here.
    """

    session_id: str
    task_type: TaskType
    action_type: ActionType
    # required for DELEGATE and VERIFY
    specialist_id: str | None = None
    # required for SOLVE_INDEPENDENTLY
    subtask_response: str | None = None
    # optional chain-of-thought
    reasoning: str | None = None

    @field_validator("specialist_id")
    @classmethod
    def validate_specialist_id(cls, v: str | None) -> str | None:
        """Accept ``None`` or a known ``SpecialistId`` value.

        Raises:
            ValueError: if ``v`` is a string that is not a ``SpecialistId``
                value.
        """
        if v is None:
            return v
        known_ids = [member.value for member in SpecialistId]
        if v in known_ids:
            return v
        raise ValueError(f"specialist_id must be one of {[s.value for s in SpecialistId]}, got '{v}'")

    def requires_specialist(self) -> bool:
        """Return True when the action type is DELEGATE or VERIFY."""
        kind = self.action_type
        return kind == ActionType.DELEGATE or kind == ActionType.VERIFY

    def requires_response(self) -> bool:
        """Return True when the action type is SOLVE_INDEPENDENTLY."""
        kind = self.action_type
        return kind == ActionType.SOLVE_INDEPENDENTLY
# ---------------------------------------------------------------------------
# Reward
# ---------------------------------------------------------------------------
class SentinelReward(BaseModel):
    """Reward for one step, with a human-readable reason and breakdown.

    ``value`` is clamped on validation into the closed range [0.01, 0.99],
    so the stored reward always lies strictly between 0 and 1.
    """

    # clamped to [0.01, 0.99]; never reaches 0 or 1
    value: float
    reason: str
    # e.g. {"task_accuracy": 0.4, ...}
    signal_breakdown: dict[str, float]

    @field_validator("value")
    @classmethod
    def clamp_reward(cls, v: float) -> float:
        """Clamp ``v`` into [0.01, 0.99]; NaN maps to the 0.01 floor.

        A bare ``max(0.01, min(0.99, v))`` turns NaN into 0.99, because every
        comparison against NaN is false and ``min`` keeps its first argument.
        That would store an undefined reward as the maximum reward, so NaN is
        sent to the lower bound instead.
        """
        # NaN is the only float value that is not equal to itself.
        if v != v:
            return 0.01
        return max(0.01, min(0.99, v))
# ---------------------------------------------------------------------------
# Step Result (what env.step() and env.reset() return)
# ---------------------------------------------------------------------------
class StepResult(BaseModel):
    """Bundle returned by ``env.step()`` and ``env.reset()``.

    All four fields are required; none declares a default.
    """

    observation: SentinelObservation
    reward: SentinelReward
    done: bool
    # free-form extra data; keys are not constrained by this model
    info: dict[str, Any]
# ---------------------------------------------------------------------------
# State (what env.state() returns)
# ---------------------------------------------------------------------------
class SentinelState(BaseModel):
    """Episode state snapshot, as returned by ``env.state()``.

    Only ``behavioral_fingerprints`` and ``difficulty_profile`` declare a
    default (``None``). Every other field, including the nullable
    ``session_id``, has no default and must be supplied.
    """

    episode_id: str
    session_id: str | None
    step_count: int
    max_steps: int
    total_reward: float
    done: bool
    scenario_id: str
    task_type: TaskType
    difficulty: str
    status: EpisodeStatus
    last_reward: float
    subtasks_completed: int
    subtasks_total: int
    trust_snapshot: dict[str, float]
    behavioral_fingerprints: dict[str, dict[str, Any]] | None = None
    difficulty_profile: dict[str, Any] | None = None
    # how many adversarial attempts caught
    adversarial_detections: int
    # how many slipped through
    adversarial_poisonings: int
# ---------------------------------------------------------------------------
# Reset Request
# ---------------------------------------------------------------------------
class ResetRequest(BaseModel):
    """Optional parameters for a reset; every field defaults to ``None``.

    NOTE(review): how the environment treats an omitted field is not visible
    in this module — confirm against the reset handler.
    """

    task_type: TaskType | None = None
    scenario_id: str | None = None
    seed: int | None = None
|