File size: 5,271 Bytes
325aa05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74b74f1
 
325aa05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74b74f1
 
325aa05
 
 
 
 
 
 
 
 
 
 
74b74f1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from __future__ import annotations

from enum import Enum
from typing import Any

from pydantic import BaseModel, field_validator


# ---------------------------------------------------------------------------
# Enums
# ---------------------------------------------------------------------------

class TaskType(str, Enum):
    """String-valued enum naming the three task tiers."""

    # easy — single-step trust decision
    TASK1 = "task1"
    # medium — multi-step delegation chain (no adversary)
    TASK2 = "task2"
    # hard — full adversarial episode
    TASK3 = "task3"


class ActionType(str, Enum):
    """String-valued enum of the moves available for a subtask."""

    # route subtask to a specialist
    DELEGATE = "delegate"
    # cross-check specialist result (+1 step cost)
    VERIFY = "verify"
    # agent solves itself (+2 step cost)
    SOLVE_INDEPENDENTLY = "solve_independently"
    # abandon subtask (heavy penalty)
    SKIP = "skip"


class SpecialistId(str, Enum):
    """String-valued enum of the specialist slot identifiers ("S0".."S4")."""

    # AccurateSlow
    S0 = "S0"
    # OverconfidentFast
    S1 = "S1"
    # DomainBound
    S2 = "S2"
    # Adversarial (identity shuffled each episode)
    S3 = "S3"
    # Degrading
    S4 = "S4"


class EpisodeStatus(str, Enum):
    """String-valued enum of the lifecycle states an episode can report."""

    ACTIVE = "active"
    COMPLETED = "completed"
    FAILED = "failed"


# ---------------------------------------------------------------------------
# Observation
# ---------------------------------------------------------------------------

class SentinelObservation(BaseModel):
    """Observation payload describing the current episode and subtask.

    Field order is significant for serialisation and must not be changed.
    """

    # --- episode identity ---------------------------------------------------
    session_id: str
    scenario_id: str
    task_type: TaskType
    # "easy" | "medium" | "hard"
    difficulty: str
    task_description: str

    # --- position within the subtask DAG ------------------------------------
    current_subtask: str
    # which subtask in the DAG (0-based)
    subtask_index: int
    subtasks_total: int
    subtasks_remaining: int

    # --- specialists and trust ----------------------------------------------
    # specialist ids visible to agent
    available_specialists: list[str]
    # {"S0": 0.82, "S1": 0.31, ...}
    trust_snapshot: dict[str, float]
    # Optional extras; both default to None when not supplied.
    behavioral_fingerprints: dict[str, dict[str, Any]] | None = None
    difficulty_profile: dict[str, Any] | None = None
    # 0.0–1.0; high = adversary may trigger
    stakes_level: float

    # --- step budget and feedback from the previous step --------------------
    step_count: int
    max_steps: int
    # No default: must be passed explicitly, but None is an accepted value.
    last_action_summary: str | None
    last_reward: float
    episode_status: EpisodeStatus


# ---------------------------------------------------------------------------
# Action
# ---------------------------------------------------------------------------

class SentinelAction(BaseModel):
    """A single agent decision submitted for the current subtask.

    The "required for ..." notes on the optional fields are not enforced by
    this model; ``requires_specialist()`` and ``requires_response()`` expose
    the rule so callers can check it themselves.
    """

    session_id: str
    task_type: TaskType
    action_type: ActionType
    # required for DELEGATE and VERIFY
    specialist_id: str | None = None
    # required for SOLVE_INDEPENDENTLY
    subtask_response: str | None = None
    # optional chain-of-thought
    reasoning: str | None = None

    @field_validator("specialist_id")
    @classmethod
    def validate_specialist_id(cls, v: str | None) -> str | None:
        """Accept None or any ``SpecialistId`` value; reject everything else.

        Raises:
            ValueError: if ``v`` is a string that is not a ``SpecialistId`` value.
        """
        if v is None:
            return v
        known_ids = [member.value for member in SpecialistId]
        if v not in known_ids:
            raise ValueError(f"specialist_id must be one of {known_ids}, got '{v}'")
        return v

    def requires_specialist(self) -> bool:
        """Return True when ``action_type`` is DELEGATE or VERIFY."""
        specialist_actions = (ActionType.DELEGATE, ActionType.VERIFY)
        return self.action_type in specialist_actions

    def requires_response(self) -> bool:
        """Return True when ``action_type`` is SOLVE_INDEPENDENTLY."""
        return self.action_type == ActionType.SOLVE_INDEPENDENTLY


# ---------------------------------------------------------------------------
# Reward
# ---------------------------------------------------------------------------

class SentinelReward(BaseModel):
    """Scalar reward for one step together with its explanation.

    ``value`` is always stored inside the closed interval [0.01, 0.99]: the
    validator clamps out-of-range inputs and maps NaN to the 0.01 floor.
    """

    # clamped into [0.01, 0.99], so it is never exactly 0 or 1
    value: float
    reason: str
    # {"task_accuracy": 0.4, ...}
    signal_breakdown: dict[str, float]

    @field_validator("value")
    @classmethod
    def clamp_reward(cls, v: float) -> float:
        """Clamp ``v`` into [0.01, 0.99]; NaN is mapped to 0.01.

        Args:
            v: the already-parsed reward value.

        Returns:
            ``v`` if it lies within [0.01, 0.99], the nearest bound if it
            lies outside, or 0.01 if it is NaN.
        """
        import math  # local import keeps the module's import block untouched

        # Every ordering comparison against NaN is False, so the plain
        # min()/max() clamp would return 0.99 for NaN and silently award the
        # maximum reward to a failed computation. Use the floor instead.
        if math.isnan(v):
            return 0.01
        return max(0.01, min(0.99, v))


# ---------------------------------------------------------------------------
# Step Result  (what env.step() and env.reset() return)
# ---------------------------------------------------------------------------

class StepResult(BaseModel):
    """Bundle of observation, reward, done flag and info for one step or reset."""

    observation: SentinelObservation
    reward: SentinelReward
    done: bool
    # free-form, string-keyed extras
    info: dict[str, Any]


# ---------------------------------------------------------------------------
# State  (what env.state() returns)
# ---------------------------------------------------------------------------

class SentinelState(BaseModel):
    """Summary of an episode's state: identity, progress, trust and adversary counters.

    Field order is significant for serialisation and must not be changed.
    """

    # --- identity -----------------------------------------------------------
    episode_id: str
    # No default: must be passed explicitly, but None is an accepted value.
    session_id: str | None

    # --- step budget and reward totals --------------------------------------
    step_count: int
    max_steps: int
    total_reward: float
    done: bool

    # --- scenario -----------------------------------------------------------
    scenario_id: str
    task_type: TaskType
    difficulty: str
    status: EpisodeStatus
    last_reward: float

    # --- subtask progress ---------------------------------------------------
    subtasks_completed: int
    subtasks_total: int

    # --- specialists and trust ----------------------------------------------
    trust_snapshot: dict[str, float]
    # Optional extras; both default to None when not supplied.
    behavioral_fingerprints: dict[str, dict[str, Any]] | None = None
    difficulty_profile: dict[str, Any] | None = None

    # --- adversary bookkeeping ----------------------------------------------
    # how many adversarial attempts caught
    adversarial_detections: int
    # how many slipped through
    adversarial_poisonings: int


# ---------------------------------------------------------------------------
# Reset Request
# ---------------------------------------------------------------------------

class ResetRequest(BaseModel):
    """Optional parameters for a reset; every field defaults to None.

    NOTE(review): presumably a None field lets the environment pick its own
    value — confirm against the reset handler.
    """

    task_type: TaskType | None = None
    scenario_id: str | None = None
    seed: int | None = None