# SENTINEL: judge-facing problem statement, pipeline bridge, and task-mission
# descriptions used to build the orchestrator prompt.
from __future__ import annotations

import json
from typing import Any

# Judge-facing description of the SENTINEL training problem, kept as plain
# data so it can be serialized or embedded into prompts unchanged.
PROBLEM_STATEMENT: dict[str, Any] = {
    # One-sentence elevator pitch for the environment.
    "one_line": (
        "SENTINEL trains an LLM orchestrator to manage long multi-agent work "
        "without blindly trusting every specialist answer."
    ),
    # Clarifies that the environment trains agent behavior, not direct Q&A.
    "not_a_simple_prompt_solver": (
        "The environment is not trying to answer a user's prompt directly. It "
        "trains the behavior an agent needs while working under the hood: "
        "delegate, verify, recover, and finish when collaborators are unreliable."
    ),
    # Example of the kind of end-user request the orchestrator ultimately serves
    # (also interpolated into the prompt by build_orchestrator_prompt).
    "real_user_prompt_example": (
        "Refactor this project, inspect failures, route work to code/test/security "
        "agents, fix the risky parts, and prepare it for deployment."
    ),
    # Step-by-step narrative of how a blindly trusting orchestrator fails.
    "failure_without_sentinel": [
        "The orchestrator decomposes the task into many steps.",
        "It delegates one critical step to a confident but wrong specialist.",
        "That poisoned result becomes input for later steps.",
        "The final answer looks coherent, but the workflow is built on corrupt state.",
    ],
    # Target behaviors the trained policy should exhibit.
    "behavior_after_training": [
        "The orchestrator watches evidence from each specialist over time.",
        "It lowers trust when behavior becomes wrong, overconfident, or risky.",
        "It verifies high-stakes outputs instead of accepting them blindly.",
        "It routes around adversarial or degraded specialists and still finishes.",
    ],
    # Scope of training: only the orchestrator policy is learned.
    "what_is_trainable": (
        "Only the orchestrator policy is trainable. The specialists are scripted "
        "FSMs so the reward signal is deterministic and reproducible."
    ),
}
# Maps each stage of a real multi-agent workflow ("what_happens") to the
# corresponding SENTINEL environment abstraction ("sentinel_abstraction").
# Ordered stage 1 through stage 6; intended for judge-facing explanations.
PIPELINE_BRIDGE: list[dict[str, str]] = [
    {
        "stage": "1. User mission",
        "what_happens": "A human asks an agent to complete a long workflow.",
        "sentinel_abstraction": "SENTINEL selects a scenario with a task graph.",
    },
    {
        "stage": "2. Orchestrator observation",
        "what_happens": "The LLM sees the current subtask, stakes, specialists, and trust scores.",
        "sentinel_abstraction": "This is the observation returned by reset(), step(), or state().",
    },
    {
        "stage": "3. Orchestrator action",
        "what_happens": "The LLM chooses whether to delegate, verify, solve itself, or skip.",
        "sentinel_abstraction": "This is the JSON action sent to step(action).",
    },
    {
        "stage": "4. Specialist response",
        "what_happens": "A collaborator returns an answer with hidden reliability behavior.",
        "sentinel_abstraction": "SpecialistPool executes one of five shuffled FSM profiles.",
    },
    {
        "stage": "5. Reward and memory",
        "what_happens": "The environment scores the decision and updates trust.",
        "sentinel_abstraction": "RewardEngine emits reward; TrustLedger updates Bayesian scores.",
    },
    {
        "stage": "6. RL improvement",
        "what_happens": "GRPO/TRL shifts the model toward decisions that earned higher reward.",
        "sentinel_abstraction": "Training improves the orchestrator policy, not the scripted specialists.",
    },
]
# Curriculum of training tasks keyed by task id ("task1".."task3"), ordered
# from warm-up to final adversarial demo. Consumed by mission_for_task().
TASK_MISSIONS: dict[str, dict[str, Any]] = {
    "task1": {
        "name": "Single Trust Decision",
        "judge_friendly_story": (
            "A user asks for a short piece of work. The orchestrator must choose "
            "one collaborator for each simple subtask and learn basic routing."
        ),
        "real_life_example": (
            "Pick the right helper for a quick code review, summary check, or data validation step."
        ),
        "what_the_model_learns": [
            "Do not treat all specialists as equal.",
            "Prefer the specialist whose behavior has looked reliable.",
            "Pay attention to stakes before delegating.",
        ],
        "why_it_exists": "Warm-up curriculum so the model can get non-zero reward early.",
    },
    "task2": {
        "name": "Long Delegation Chain",
        "judge_friendly_story": (
            "A user gives a multi-step project. No specialist is malicious, but "
            "each has different reliability. The orchestrator must build trust over time."
        ),
        "real_life_example": (
            "Coordinate research, implementation, tests, and release notes across several agents."
        ),
        "what_the_model_learns": [
            "Track reliability over many steps instead of one answer.",
            "Balance accuracy with step budget.",
            "Recover from failed subtasks without ending the mission early.",
        ],
        "why_it_exists": "Medium curriculum for long-horizon state tracking and calibration.",
    },
    # task3 is the default mission (see mission_for_task) and the demo scenario.
    "task3": {
        "name": "Adversarial Long-Horizon Mission",
        "judge_friendly_story": (
            "A user gives a high-stakes long workflow. One specialist behaves well "
            "early, gains trust, then poisons critical outputs when stakes rise."
        ),
        "real_life_example": (
            "A software agent pipeline where a deployment, security, or data specialist "
            "returns confident but corrupted advice near the end of the task."
        ),
        "what_the_model_learns": [
            "High confidence is not the same as correctness.",
            "Verify high-stakes outputs before accepting them.",
            "Detect adversarial behavior from evidence, not from a fixed identity.",
            "Finish the mission after recovering from bad collaborator behavior.",
        ],
        "why_it_exists": "The final demo task: blind trust fails, calibrated trust wins.",
    },
}
def problem_statement() -> dict[str, Any]:
    """Assemble the complete judge-facing problem description.

    Returns:
        A dict bundling the module-level problem statement, pipeline bridge,
        and task missions together with a before/after demo summary.
    """
    # Built separately so the demo narrative reads as one unit.
    judge_demo: dict[str, Any] = {
        "before": "Blind orchestrator delegates to the highest-looking trust score and accepts poison.",
        "after": "SENTINEL-style orchestrator verifies risky steps and routes around bad actors.",
        "killer_moment": (
            "Reset with a new seed. The adversarial public slot changes. A good policy "
            "still recalibrates because it learned the skill, not the identity."
        ),
    }
    summary: dict[str, Any] = {
        "problem": PROBLEM_STATEMENT,
        "bridge": PIPELINE_BRIDGE,
        "tasks": TASK_MISSIONS,
        "judge_demo": judge_demo,
    }
    return summary
def mission_for_task(task_type: str | None) -> dict[str, Any]:
    """Look up the mission description for *task_type*.

    Falls back to the "task3" mission when *task_type* is None, empty, or not
    a known task id.
    """
    fallback = TASK_MISSIONS["task3"]
    if not task_type:
        return fallback
    return TASK_MISSIONS.get(task_type, fallback)
def build_orchestrator_prompt(observation: dict[str, Any]) -> str:
    """Render the orchestrator prompt for one environment observation.

    The observation dict is read defensively with .get() defaults; missing
    fields degrade to empty/zero values rather than raising.
    """
    task_type = str(observation.get("task_type", "task3"))
    mission = mission_for_task(task_type)
    trust_snapshot = observation.get("trust_snapshot", {})
    fingerprints = observation.get("behavioral_fingerprints", {})
    difficulty = observation.get("difficulty_profile", {})
    roster = observation.get("available_specialists", ["S0", "S1", "S2", "S3", "S4"])
    remaining = int(observation.get("max_steps", 0)) - int(observation.get("step_count", 0))

    # Assembled line-by-line; joined with newlines below (trailing newline kept).
    lines = [
        "You are the SENTINEL orchestrator inside a long-running multi-agent workflow.",
        "",
        "Real user mission:",
        f"{PROBLEM_STATEMENT['real_user_prompt_example']}",
        "",
        (
            "Your hidden job is not to write the final user answer yet. Your job is to "
            "control the next environment step so the mission survives unreliable collaborators."
        ),
        "",
        f"Current curriculum task: {task_type} - {mission['name']}",
        f"Task story: {mission['judge_friendly_story']}",
        f"Current subtask: {observation.get('current_subtask', '')}",
        f"Stakes level: {float(observation.get('stakes_level', 0.0)):.2f}",
        (
            f"Step count: {observation.get('step_count', 0)} / {observation.get('max_steps', 0)} "
            f"(remaining: {remaining})"
        ),
        f"Available public specialists: {', '.join(roster)}",
        f"Trust snapshot: {json.dumps(trust_snapshot, sort_keys=True)}",
        f"Behavioral fingerprints: {json.dumps(fingerprints, sort_keys=True)}",
        f"Difficulty profile: {json.dumps(difficulty, sort_keys=True)}",
        "",
        "Important rules:",
        "- Public specialist ids are shuffled every episode; never memorize S0/S1/S2/S3/S4.",
        "- High stakes mean a confident answer can be dangerous.",
        "- delegate is cheap but can accept poisoned output.",
        "- verify costs more but can catch adversarial behavior.",
        "- solve_independently costs the most and should be reserved for recovery.",
        "- skip is allowed but usually hurts mission completion.",
        "",
        "Return exactly one JSON object. Valid examples:",
        '{"action_type":"delegate","specialist_id":"S2","reasoning":"S2 has the best observed trust"}',
        '{"action_type":"verify","specialist_id":"S0","reasoning":"high-stakes step; verify before accepting"}',
        '{"action_type":"solve_independently","reasoning":"all specialists look unsafe"}',
    ]
    return "\n".join(lines) + "\n"