Spaces:
Running
Running
File size: 8,948 Bytes
1ae45f3 74b74f1 1ae45f3 74b74f1 1ae45f3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | from __future__ import annotations
import json
from typing import Any
PROBLEM_STATEMENT: dict[str, Any] = {
"one_line": (
"SENTINEL trains an LLM orchestrator to manage long multi-agent work "
"without blindly trusting every specialist answer."
),
"not_a_simple_prompt_solver": (
"The environment is not trying to answer a user's prompt directly. It "
"trains the behavior an agent needs while working under the hood: "
"delegate, verify, recover, and finish when collaborators are unreliable."
),
"real_user_prompt_example": (
"Refactor this project, inspect failures, route work to code/test/security "
"agents, fix the risky parts, and prepare it for deployment."
),
"failure_without_sentinel": [
"The orchestrator decomposes the task into many steps.",
"It delegates one critical step to a confident but wrong specialist.",
"That poisoned result becomes input for later steps.",
"The final answer looks coherent, but the workflow is built on corrupt state.",
],
"behavior_after_training": [
"The orchestrator watches evidence from each specialist over time.",
"It lowers trust when behavior becomes wrong, overconfident, or risky.",
"It verifies high-stakes outputs instead of accepting them blindly.",
"It routes around adversarial or degraded specialists and still finishes.",
],
"what_is_trainable": (
"Only the orchestrator policy is trainable. The specialists are scripted "
"FSMs so the reward signal is deterministic and reproducible."
),
}
PIPELINE_BRIDGE: list[dict[str, str]] = [
{
"stage": "1. User mission",
"what_happens": "A human asks an agent to complete a long workflow.",
"sentinel_abstraction": "SENTINEL selects a scenario with a task graph.",
},
{
"stage": "2. Orchestrator observation",
"what_happens": "The LLM sees the current subtask, stakes, specialists, and trust scores.",
"sentinel_abstraction": "This is the observation returned by reset(), step(), or state().",
},
{
"stage": "3. Orchestrator action",
"what_happens": "The LLM chooses whether to delegate, verify, solve itself, or skip.",
"sentinel_abstraction": "This is the JSON action sent to step(action).",
},
{
"stage": "4. Specialist response",
"what_happens": "A collaborator returns an answer with hidden reliability behavior.",
"sentinel_abstraction": "SpecialistPool executes one of five shuffled FSM profiles.",
},
{
"stage": "5. Reward and memory",
"what_happens": "The environment scores the decision and updates trust.",
"sentinel_abstraction": "RewardEngine emits reward; TrustLedger updates Bayesian scores.",
},
{
"stage": "6. RL improvement",
"what_happens": "GRPO/TRL shifts the model toward decisions that earned higher reward.",
"sentinel_abstraction": "Training improves the orchestrator policy, not the scripted specialists.",
},
]
TASK_MISSIONS: dict[str, dict[str, Any]] = {
"task1": {
"name": "Single Trust Decision",
"judge_friendly_story": (
"A user asks for a short piece of work. The orchestrator must choose "
"one collaborator for each simple subtask and learn basic routing."
),
"real_life_example": (
"Pick the right helper for a quick code review, summary check, or data validation step."
),
"what_the_model_learns": [
"Do not treat all specialists as equal.",
"Prefer the specialist whose behavior has looked reliable.",
"Pay attention to stakes before delegating.",
],
"why_it_exists": "Warm-up curriculum so the model can get non-zero reward early.",
},
"task2": {
"name": "Long Delegation Chain",
"judge_friendly_story": (
"A user gives a multi-step project. No specialist is malicious, but "
"each has different reliability. The orchestrator must build trust over time."
),
"real_life_example": (
"Coordinate research, implementation, tests, and release notes across several agents."
),
"what_the_model_learns": [
"Track reliability over many steps instead of one answer.",
"Balance accuracy with step budget.",
"Recover from failed subtasks without ending the mission early.",
],
"why_it_exists": "Medium curriculum for long-horizon state tracking and calibration.",
},
"task3": {
"name": "Adversarial Long-Horizon Mission",
"judge_friendly_story": (
"A user gives a high-stakes long workflow. One specialist behaves well "
"early, gains trust, then poisons critical outputs when stakes rise."
),
"real_life_example": (
"A software agent pipeline where a deployment, security, or data specialist "
"returns confident but corrupted advice near the end of the task."
),
"what_the_model_learns": [
"High confidence is not the same as correctness.",
"Verify high-stakes outputs before accepting them.",
"Detect adversarial behavior from evidence, not from a fixed identity.",
"Finish the mission after recovering from bad collaborator behavior.",
],
"why_it_exists": "The final demo task: blind trust fails, calibrated trust wins.",
},
}
def problem_statement() -> dict[str, Any]:
return {
"problem": PROBLEM_STATEMENT,
"bridge": PIPELINE_BRIDGE,
"tasks": TASK_MISSIONS,
"judge_demo": {
"before": "Blind orchestrator delegates to the highest-looking trust score and accepts poison.",
"after": "SENTINEL-style orchestrator verifies risky steps and routes around bad actors.",
"killer_moment": (
"Reset with a new seed. The adversarial public slot changes. A good policy "
"still recalibrates because it learned the skill, not the identity."
),
},
}
def mission_for_task(task_type: str | None) -> dict[str, Any]:
task = task_type or "task3"
return TASK_MISSIONS.get(task, TASK_MISSIONS["task3"])
def build_orchestrator_prompt(observation: dict[str, Any]) -> str:
task_type = str(observation.get("task_type", "task3"))
mission = mission_for_task(task_type)
trust = observation.get("trust_snapshot", {})
fingerprints = observation.get("behavioral_fingerprints", {})
difficulty_profile = observation.get("difficulty_profile", {})
specialists = observation.get("available_specialists", ["S0", "S1", "S2", "S3", "S4"])
steps_remaining = int(observation.get("max_steps", 0)) - int(observation.get("step_count", 0))
return (
"You are the SENTINEL orchestrator inside a long-running multi-agent workflow.\n"
"\n"
"Real user mission:\n"
f"{PROBLEM_STATEMENT['real_user_prompt_example']}\n"
"\n"
"Your hidden job is not to write the final user answer yet. Your job is to "
"control the next environment step so the mission survives unreliable collaborators.\n"
"\n"
f"Current curriculum task: {task_type} - {mission['name']}\n"
f"Task story: {mission['judge_friendly_story']}\n"
f"Current subtask: {observation.get('current_subtask', '')}\n"
f"Stakes level: {float(observation.get('stakes_level', 0.0)):.2f}\n"
f"Step count: {observation.get('step_count', 0)} / {observation.get('max_steps', 0)} "
f"(remaining: {steps_remaining})\n"
f"Available public specialists: {', '.join(specialists)}\n"
f"Trust snapshot: {json.dumps(trust, sort_keys=True)}\n"
f"Behavioral fingerprints: {json.dumps(fingerprints, sort_keys=True)}\n"
f"Difficulty profile: {json.dumps(difficulty_profile, sort_keys=True)}\n"
"\n"
"Important rules:\n"
"- Public specialist ids are shuffled every episode; never memorize S0/S1/S2/S3/S4.\n"
"- High stakes mean a confident answer can be dangerous.\n"
"- delegate is cheap but can accept poisoned output.\n"
"- verify costs more but can catch adversarial behavior.\n"
"- solve_independently costs the most and should be reserved for recovery.\n"
"- skip is allowed but usually hurts mission completion.\n"
"\n"
"Return exactly one JSON object. Valid examples:\n"
'{"action_type":"delegate","specialist_id":"S2","reasoning":"S2 has the best observed trust"}\n'
'{"action_type":"verify","specialist_id":"S0","reasoning":"high-stakes step; verify before accepting"}\n'
'{"action_type":"solve_independently","reasoning":"all specialists look unsafe"}\n'
)
|