# sentinel-env/mission_context.py
from __future__ import annotations
import json
from typing import Any
PROBLEM_STATEMENT: dict[str, Any] = {
"one_line": (
"SENTINEL trains an LLM orchestrator to manage long multi-agent work "
"without blindly trusting every specialist answer."
),
"not_a_simple_prompt_solver": (
"The environment is not trying to answer a user's prompt directly. It "
"trains the behavior an agent needs while working under the hood: "
"delegate, verify, recover, and finish when collaborators are unreliable."
),
"real_user_prompt_example": (
"Refactor this project, inspect failures, route work to code/test/security "
"agents, fix the risky parts, and prepare it for deployment."
),
"failure_without_sentinel": [
"The orchestrator decomposes the task into many steps.",
"It delegates one critical step to a confident but wrong specialist.",
"That poisoned result becomes input for later steps.",
"The final answer looks coherent, but the workflow is built on corrupt state.",
],
"behavior_after_training": [
"The orchestrator watches evidence from each specialist over time.",
"It lowers trust when behavior becomes wrong, overconfident, or risky.",
"It verifies high-stakes outputs instead of accepting them blindly.",
"It routes around adversarial or degraded specialists and still finishes.",
],
"what_is_trainable": (
"Only the orchestrator policy is trainable. The specialists are scripted "
"FSMs so the reward signal is deterministic and reproducible."
),
}
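# PROBLEM_STATEMENT says the specialists are scripted FSMs. The sketch below is
# illustrative only: ScriptedSpecialist and its per-step behavior strings are
# assumptions made to show what "deterministic, reproducible specialist" means,
# not the environment's real SpecialistPool implementation.
from dataclasses import dataclass

@dataclass
class ScriptedSpecialist:
    """Toy scripted specialist: replays a fixed behavior script step by step."""
    profile: list[str]  # e.g. ["correct", "correct", "poison"]
    _step: int = 0
    def respond(self) -> str:
        # Clamp to the final scripted behavior once the profile is exhausted.
        behavior = self.profile[min(self._step, len(self.profile) - 1)]
        self._step += 1
        return behavior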
PIPELINE_BRIDGE: list[dict[str, str]] = [
{
"stage": "1. User mission",
"what_happens": "A human asks an agent to complete a long workflow.",
"sentinel_abstraction": "SENTINEL selects a scenario with a task graph.",
},
{
"stage": "2. Orchestrator observation",
"what_happens": "The LLM sees the current subtask, stakes, specialists, and trust scores.",
"sentinel_abstraction": "This is the observation returned by reset(), step(), or state().",
},
{
"stage": "3. Orchestrator action",
"what_happens": "The LLM chooses whether to delegate, verify, solve itself, or skip.",
"sentinel_abstraction": "This is the JSON action sent to step(action).",
},
{
"stage": "4. Specialist response",
"what_happens": "A collaborator returns an answer with hidden reliability behavior.",
"sentinel_abstraction": "SpecialistPool executes one of five shuffled FSM profiles.",
},
{
"stage": "5. Reward and memory",
"what_happens": "The environment scores the decision and updates trust.",
"sentinel_abstraction": "RewardEngine emits reward; TrustLedger updates Bayesian scores.",
},
{
"stage": "6. RL improvement",
"what_happens": "GRPO/TRL shifts the model toward decisions that earned higher reward.",
"sentinel_abstraction": "Training improves the orchestrator policy, not the scripted specialists.",
},
]
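# Stage 5 above says the TrustLedger keeps Bayesian trust scores. A minimal
# sketch, assuming trust is the mean of a Beta(alpha, beta) posterior over
# "this specialist answers correctly"; the shipped TrustLedger may weight
# evidence (stakes, recency) differently.
def beta_trust_update(alpha: float, beta: float, correct: bool) -> tuple[float, float, float]:
    """Apply one conjugate update and return (alpha, beta, posterior-mean trust)."""
    if correct:
        alpha += 1.0
    else:
        beta += 1.0
    return alpha, beta, alpha / (alpha + beta)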
TASK_MISSIONS: dict[str, dict[str, Any]] = {
"task1": {
"name": "Single Trust Decision",
"judge_friendly_story": (
"A user asks for a short piece of work. The orchestrator must choose "
"one collaborator for each simple subtask and learn basic routing."
),
"real_life_example": (
"Pick the right helper for a quick code review, summary check, or data validation step."
),
"what_the_model_learns": [
"Do not treat all specialists as equal.",
"Prefer the specialist whose behavior has looked reliable.",
"Pay attention to stakes before delegating.",
],
"why_it_exists": "Warm-up curriculum so the model can get non-zero reward early.",
},
"task2": {
"name": "Long Delegation Chain",
"judge_friendly_story": (
"A user gives a multi-step project. No specialist is malicious, but "
"each has different reliability. The orchestrator must build trust over time."
),
"real_life_example": (
"Coordinate research, implementation, tests, and release notes across several agents."
),
"what_the_model_learns": [
"Track reliability over many steps instead of one answer.",
"Balance accuracy with step budget.",
"Recover from failed subtasks without ending the mission early.",
],
"why_it_exists": "Medium curriculum for long-horizon state tracking and calibration.",
},
"task3": {
"name": "Adversarial Long-Horizon Mission",
"judge_friendly_story": (
"A user gives a high-stakes long workflow. One specialist behaves well "
"early, gains trust, then poisons critical outputs when stakes rise."
),
"real_life_example": (
"A software agent pipeline where a deployment, security, or data specialist "
"returns confident but corrupted advice near the end of the task."
),
"what_the_model_learns": [
"High confidence is not the same as correctness.",
"Verify high-stakes outputs before accepting them.",
"Detect adversarial behavior from evidence, not from a fixed identity.",
"Finish the mission after recovering from bad collaborator behavior.",
],
"why_it_exists": "The final demo task: blind trust fails, calibrated trust wins.",
},
}
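# The three tasks form a curriculum (warm-up -> long-horizon -> adversarial).
# Hypothetical sampler sketch: the weighting scheme below is an assumption for
# illustration, not how the training harness actually schedules tasks.
import random

def sample_curriculum_task(progress: float, rng: random.Random | None = None) -> str:
    """Pick a task id, shifting mass from task1 toward task3 as progress -> 1.0."""
    rng = rng or random.Random()
    weights = [
        max(0.0, 1.0 - 2.0 * progress),  # task1 fades out early
        max(0.1, 1.0 - progress),  # task2 tapers slowly
        0.1 + progress,  # task3 dominates late
    ]
    return rng.choices(["task1", "task2", "task3"], weights=weights)[0]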
def problem_statement() -> dict[str, Any]:
    """Bundle the problem framing, pipeline bridge, and task missions into one dict."""
    return {
"problem": PROBLEM_STATEMENT,
"bridge": PIPELINE_BRIDGE,
"tasks": TASK_MISSIONS,
"judge_demo": {
"before": "Blind orchestrator delegates to the highest-looking trust score and accepts poison.",
"after": "SENTINEL-style orchestrator verifies risky steps and routes around bad actors.",
"killer_moment": (
"Reset with a new seed. The adversarial public slot changes. A good policy "
"still recalibrates because it learned the skill, not the identity."
),
},
}
def mission_for_task(task_type: str | None) -> dict[str, Any]:
    """Look up the mission brief for a task type, defaulting to task3."""
    task = task_type or "task3"
    return TASK_MISSIONS.get(task, TASK_MISSIONS["task3"])
def build_orchestrator_prompt(observation: dict[str, Any]) -> str:
    """Render the orchestrator prompt for one environment observation."""
    task_type = str(observation.get("task_type", "task3"))
mission = mission_for_task(task_type)
trust = observation.get("trust_snapshot", {})
fingerprints = observation.get("behavioral_fingerprints", {})
difficulty_profile = observation.get("difficulty_profile", {})
specialists = observation.get("available_specialists", ["S0", "S1", "S2", "S3", "S4"])
steps_remaining = int(observation.get("max_steps", 0)) - int(observation.get("step_count", 0))
return (
"You are the SENTINEL orchestrator inside a long-running multi-agent workflow.\n"
"\n"
"Real user mission:\n"
f"{PROBLEM_STATEMENT['real_user_prompt_example']}\n"
"\n"
"Your hidden job is not to write the final user answer yet. Your job is to "
"control the next environment step so the mission survives unreliable collaborators.\n"
"\n"
f"Current curriculum task: {task_type} - {mission['name']}\n"
f"Task story: {mission['judge_friendly_story']}\n"
f"Current subtask: {observation.get('current_subtask', '')}\n"
f"Stakes level: {float(observation.get('stakes_level', 0.0)):.2f}\n"
f"Step count: {observation.get('step_count', 0)} / {observation.get('max_steps', 0)} "
f"(remaining: {steps_remaining})\n"
f"Available public specialists: {', '.join(specialists)}\n"
f"Trust snapshot: {json.dumps(trust, sort_keys=True)}\n"
f"Behavioral fingerprints: {json.dumps(fingerprints, sort_keys=True)}\n"
f"Difficulty profile: {json.dumps(difficulty_profile, sort_keys=True)}\n"
"\n"
"Important rules:\n"
"- Public specialist ids are shuffled every episode; never memorize S0/S1/S2/S3/S4.\n"
"- High stakes mean a confident answer can be dangerous.\n"
"- delegate is cheap but can accept poisoned output.\n"
"- verify costs more but can catch adversarial behavior.\n"
"- solve_independently costs the most and should be reserved for recovery.\n"
"- skip is allowed but usually hurts mission completion.\n"
"\n"
"Return exactly one JSON object. Valid examples:\n"
'{"action_type":"delegate","specialist_id":"S2","reasoning":"S2 has the best observed trust"}\n'
'{"action_type":"verify","specialist_id":"S0","reasoning":"high-stakes step; verify before accepting"}\n'
'{"action_type":"solve_independently","reasoning":"all specialists look unsafe"}\n'
)
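# The prompt above asks the model for exactly one JSON action object. A
# caller-side validation sketch: the required-field rules (specialist_id for
# delegate/verify) are an assumption inferred from the prompt's examples, not
# a documented environment contract.
VALID_ACTION_TYPES = {"delegate", "verify", "solve_independently", "skip"}

def parse_orchestrator_action(raw: str) -> dict[str, Any]:
    """Parse and minimally validate one orchestrator action JSON object."""
    action = json.loads(raw)
    if not isinstance(action, dict):
        raise ValueError("action must be a single JSON object")
    action_type = action.get("action_type")
    if action_type not in VALID_ACTION_TYPES:
        raise ValueError(f"unknown action_type: {action_type!r}")
    if action_type in {"delegate", "verify"} and "specialist_id" not in action:
        raise ValueError(f"{action_type} requires a specialist_id")
    return action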