"""Problem statement, pipeline bridge, and prompt builder for SENTINEL.

This module holds the static metadata that explains what the SENTINEL
environment trains and why, plus helpers for selecting a task mission and
rendering the orchestrator's prompt from an environment observation.
"""

from __future__ import annotations

import json
from typing import Any


PROBLEM_STATEMENT: dict[str, Any] = {
    "one_line": (
        "SENTINEL trains an LLM orchestrator to manage long multi-agent work "
        "without blindly trusting every specialist answer."
    ),
    "not_a_simple_prompt_solver": (
        "The environment is not trying to answer a user's prompt directly. It "
        "trains the behavior an agent needs while working under the hood: "
        "delegate, verify, recover, and finish when collaborators are unreliable."
    ),
    "real_user_prompt_example": (
        "Refactor this project, inspect failures, route work to code/test/security "
        "agents, fix the risky parts, and prepare it for deployment."
    ),
    "failure_without_sentinel": [
        "The orchestrator decomposes the task into many steps.",
        "It delegates one critical step to a confident but wrong specialist.",
        "That poisoned result becomes input for later steps.",
        "The final answer looks coherent, but the workflow is built on corrupt state.",
    ],
    "behavior_after_training": [
        "The orchestrator watches evidence from each specialist over time.",
        "It lowers trust when behavior becomes wrong, overconfident, or risky.",
        "It verifies high-stakes outputs instead of accepting them blindly.",
        "It routes around adversarial or degraded specialists and still finishes.",
    ],
    "what_is_trainable": (
        "Only the orchestrator policy is trainable. The specialists are scripted "
        "FSMs so the reward signal is deterministic and reproducible."
    ),
}
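

# The "behavior_after_training" entries above hinge on evidence-based trust.
# Below is a minimal sketch of how such a score could work, assuming a
# Beta-Bernoulli update per observed outcome. The real TrustLedger lives
# elsewhere in the codebase; this class is illustrative only.
class _BetaTrustSketch:
    """Hypothetical per-specialist trust score: mean of a Beta posterior."""

    def __init__(self) -> None:
        # Beta(1, 1) prior: no evidence yet, so trust starts at 0.5.
        self.alpha = 1.0
        self.beta = 1.0

    def observe(self, outcome_was_correct: bool) -> None:
        # Each verified specialist outcome is one Bernoulli observation.
        if outcome_was_correct:
            self.alpha += 1.0
        else:
            self.beta += 1.0

    @property
    def score(self) -> float:
        # Posterior mean; it falls quickly after a run of bad outcomes.
        return self.alpha / (self.alpha + self.beta)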


PIPELINE_BRIDGE: list[dict[str, str]] = [
    {
        "stage": "1. User mission",
        "what_happens": "A human asks an agent to complete a long workflow.",
        "sentinel_abstraction": "SENTINEL selects a scenario with a task graph.",
    },
    {
        "stage": "2. Orchestrator observation",
        "what_happens": "The LLM sees the current subtask, stakes, specialists, and trust scores.",
        "sentinel_abstraction": "This is the observation returned by reset(), step(), or state().",
    },
    {
        "stage": "3. Orchestrator action",
        "what_happens": "The LLM chooses whether to delegate, verify, solve itself, or skip.",
        "sentinel_abstraction": "This is the JSON action sent to step(action).",
    },
    {
        "stage": "4. Specialist response",
        "what_happens": "A collaborator returns an answer with hidden reliability behavior.",
        "sentinel_abstraction": "SpecialistPool executes one of five shuffled FSM profiles.",
    },
    {
        "stage": "5. Reward and memory",
        "what_happens": "The environment scores the decision and updates trust.",
        "sentinel_abstraction": "RewardEngine emits reward; TrustLedger updates Bayesian scores.",
    },
    {
        "stage": "6. RL improvement",
        "what_happens": "GRPO/TRL shifts the model toward decisions that earned higher reward.",
        "sentinel_abstraction": "Training improves the orchestrator policy, not the scripted specialists.",
    },
]
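

# PIPELINE_BRIDGE stages 2-5 describe one observation/action/reward turn. The
# sketch below shows that loop end to end, assuming a hypothetical `env` whose
# reset() returns an observation dict and whose step() takes the parsed action
# dict and returns an (observation, reward, done) tuple, plus a `query_llm`
# callable returning raw model text. Neither name is the actual training harness.
def _example_episode_sketch(env: Any, query_llm: Any) -> float:
    """Run one illustrative episode: prompt, act, accumulate reward."""
    observation = env.reset()  # stages 1-2: scenario chosen, first observation
    total_reward = 0.0
    done = False
    while not done:
        prompt = build_orchestrator_prompt(observation)  # stage 2
        reply = query_llm(prompt)  # stage 3: model emits one JSON action
        # Stages 4-5: specialists respond, reward is scored, trust is updated.
        observation, reward, done = env.step(json.loads(reply))
        total_reward += reward
    return total_reward  # stage 6: GRPO/TRL consumes rewards across rollouts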


TASK_MISSIONS: dict[str, dict[str, Any]] = {
    "task1": {
        "name": "Single Trust Decision",
        "judge_friendly_story": (
            "A user asks for a short piece of work. The orchestrator must choose "
            "one collaborator for each simple subtask and learn basic routing."
        ),
        "real_life_example": (
            "Pick the right helper for a quick code review, summary check, or data validation step."
        ),
        "what_the_model_learns": [
            "Do not treat all specialists as equal.",
            "Prefer the specialist whose behavior has looked reliable.",
            "Pay attention to stakes before delegating.",
        ],
        "why_it_exists": "Warm-up curriculum so the model can get non-zero reward early.",
    },
    "task2": {
        "name": "Long Delegation Chain",
        "judge_friendly_story": (
            "A user gives a multi-step project. No specialist is malicious, but "
            "each has different reliability. The orchestrator must build trust over time."
        ),
        "real_life_example": (
            "Coordinate research, implementation, tests, and release notes across several agents."
        ),
        "what_the_model_learns": [
            "Track reliability over many steps instead of one answer.",
            "Balance accuracy with step budget.",
            "Recover from failed subtasks without ending the mission early.",
        ],
        "why_it_exists": "Medium curriculum for long-horizon state tracking and calibration.",
    },
    "task3": {
        "name": "Adversarial Long-Horizon Mission",
        "judge_friendly_story": (
            "A user gives a high-stakes long workflow. One specialist behaves well "
            "early, gains trust, then poisons critical outputs when stakes rise."
        ),
        "real_life_example": (
            "A software agent pipeline where a deployment, security, or data specialist "
            "returns confident but corrupted advice near the end of the task."
        ),
        "what_the_model_learns": [
            "High confidence is not the same as correctness.",
            "Verify high-stakes outputs before accepting them.",
            "Detect adversarial behavior from evidence, not from a fixed identity.",
            "Finish the mission after recovering from bad collaborator behavior.",
        ],
        "why_it_exists": "The final demo task: blind trust fails, calibrated trust wins.",
    },
}
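

# TASK_MISSIONS and the prompt rules below both stress that public ids S0..S4
# carry no stable meaning across episodes. Here is a minimal sketch of that
# per-episode shuffle, assuming a seeded random.Random; the profile names are
# invented for illustration, and the real SpecialistPool assignment is
# implemented elsewhere.
def _example_identity_shuffle(seed: int) -> dict[str, str]:
    """Map public slots S0..S4 to hidden FSM profiles for one episode."""
    import random  # local import keeps this illustrative sketch self-contained

    profiles = ["reliable", "noisy", "overconfident", "degrading", "adversarial"]
    rng = random.Random(seed)
    rng.shuffle(profiles)
    # Same policy, different seed: the adversary lands in a different slot.
    return {f"S{i}": profile for i, profile in enumerate(profiles)}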


def problem_statement() -> dict[str, Any]:
    """Bundle the problem statement, pipeline bridge, tasks, and judge demo."""
    return {
        "problem": PROBLEM_STATEMENT,
        "bridge": PIPELINE_BRIDGE,
        "tasks": TASK_MISSIONS,
        "judge_demo": {
            "before": "Blind orchestrator delegates to the highest-looking trust score and accepts poison.",
            "after": "SENTINEL-style orchestrator verifies risky steps and routes around bad actors.",
            "killer_moment": (
                "Reset with a new seed. The adversarial public slot changes. A good policy "
                "still recalibrates because it learned the skill, not the identity."
            ),
        },
    }


def mission_for_task(task_type: str | None) -> dict[str, Any]:
    """Return mission metadata for ``task_type``, falling back to task3."""
    task = task_type or "task3"
    return TASK_MISSIONS.get(task, TASK_MISSIONS["task3"])


def build_orchestrator_prompt(observation: dict[str, Any]) -> str:
    """Render the orchestrator prompt for one environment observation."""
    task_type = str(observation.get("task_type", "task3"))
    mission = mission_for_task(task_type)
    trust = observation.get("trust_snapshot", {})
    fingerprints = observation.get("behavioral_fingerprints", {})
    difficulty_profile = observation.get("difficulty_profile", {})
    specialists = observation.get("available_specialists", ["S0", "S1", "S2", "S3", "S4"])
    steps_remaining = int(observation.get("max_steps", 0)) - int(observation.get("step_count", 0))

    return (
        "You are the SENTINEL orchestrator inside a long-running multi-agent workflow.\n"
        "\n"
        "Real user mission:\n"
        f"{PROBLEM_STATEMENT['real_user_prompt_example']}\n"
        "\n"
        "Your hidden job is not to write the final user answer yet. Your job is to "
        "control the next environment step so the mission survives unreliable collaborators.\n"
        "\n"
        f"Current curriculum task: {task_type} - {mission['name']}\n"
        f"Task story: {mission['judge_friendly_story']}\n"
        f"Current subtask: {observation.get('current_subtask', '')}\n"
        f"Stakes level: {float(observation.get('stakes_level', 0.0)):.2f}\n"
        f"Step count: {observation.get('step_count', 0)} / {observation.get('max_steps', 0)} "
        f"(remaining: {steps_remaining})\n"
        f"Available public specialists: {', '.join(specialists)}\n"
        f"Trust snapshot: {json.dumps(trust, sort_keys=True)}\n"
        f"Behavioral fingerprints: {json.dumps(fingerprints, sort_keys=True)}\n"
        f"Difficulty profile: {json.dumps(difficulty_profile, sort_keys=True)}\n"
        "\n"
        "Important rules:\n"
        "- Public specialist ids are shuffled every episode; never memorize S0/S1/S2/S3/S4.\n"
        "- High stakes mean a confident answer can be dangerous.\n"
        "- delegate is cheap but can accept poisoned output.\n"
        "- verify costs more but can catch adversarial behavior.\n"
        "- solve_independently costs the most and should be reserved for recovery.\n"
        "- skip is allowed but usually hurts mission completion.\n"
        "\n"
        "Return exactly one JSON object. Valid examples:\n"
        '{"action_type":"delegate","specialist_id":"S2","reasoning":"S2 has the best observed trust"}\n'
        '{"action_type":"verify","specialist_id":"S0","reasoning":"high-stakes step; verify before accepting"}\n'
        '{"action_type":"solve_independently","reasoning":"all specialists look unsafe"}\n'
    )