Spaces:
Running
Running
| from __future__ import annotations | |
# ---------------------------------------------------------------------------
# SENTINEL Scenario Dataset
#
# Each scenario is a list of abstract subtask nodes (10, 15, or 20,
# depending on task type).
# Subtask fields:
#   id          : unique within the scenario
#   description : what the orchestrator sees (abstract, domain-tagged)
#   domain      : ANALYZE | SYNTHESIZE | VERIFY | PLAN | EXECUTE
#   stakes      : 0.0-1.0 (>= 0.7 = adversarial specialist may trigger)
#   ground_truth: correct answer string (hidden from orchestrator)
#   depends_on  : list of subtask ids that must complete first (DAG edges)
#
# 3 scenario sets x 40 scenarios each = 120 total
#   task1 (easy)   : 40 scenarios -- 10 subtasks, no adversary
#   task2 (medium) : 40 scenarios -- 15 subtasks, adversary inactive
#   task3 (hard)   : 40 scenarios -- 20 subtasks, adversary active
# ---------------------------------------------------------------------------
| from typing import TypedDict | |
class SubTask(TypedDict):
    """One abstract workflow node; the orchestrator sees only `description`."""

    id: str                 # unique within the scenario, formatted "Tnn"
    description: str        # abstract, domain-tagged framing shown to the agent
    domain: str             # ANALYZE | SYNTHESIZE | VERIFY | PLAN | EXECUTE
    stakes: float           # 0.0-1.0; >= 0.7 may trigger the adversarial specialist
    ground_truth: str       # correct answer string (hidden from the orchestrator)
    depends_on: list[str]   # ids of subtasks that must complete first (DAG edges)
class Scenario(TypedDict):
    """A full workflow: metadata plus the ordered list of subtask nodes."""

    scenario_id: str        # unique, e.g. "SCN-TASK1-001"
    task_type: str          # "task1" | "task2" | "task3"
    title: str              # human-readable workflow title
    description: str        # orchestrator-facing summary of the workflow
    subtasks: list[SubTask] # DAG nodes, in layout order
| # --------------------------------------------------------------------------- | |
| # Subtask template builders | |
| # --------------------------------------------------------------------------- | |
| def _subtask( | |
| idx: int, | |
| domain: str, | |
| stakes: float, | |
| depends_on: list[str] | None = None, | |
| ) -> SubTask: | |
| """ | |
| Build a generic abstract subtask. | |
| Description and ground_truth are domain-keyed but not domain-specific β | |
| the agent only sees the abstract framing, not the domain context. | |
| """ | |
| sid = f"T{idx:02d}" | |
| templates = { | |
| "ANALYZE": { | |
| "description": f"Analyze the inputs from prior steps and identify the key pattern. " | |
| f"(subtask {idx}, stakes={stakes:.1f})", | |
| "ground_truth": "PATTERN_IDENTIFIED", | |
| }, | |
| "SYNTHESIZE": { | |
| "description": f"Synthesize outputs from completed subtasks into a unified result. " | |
| f"(subtask {idx}, stakes={stakes:.1f})", | |
| "ground_truth": "SYNTHESIS_COMPLETE", | |
| }, | |
| "VERIFY": { | |
| "description": f"Verify the correctness of the previous subtask result. " | |
| f"(subtask {idx}, stakes={stakes:.1f})", | |
| "ground_truth": "VERIFIED_CORRECT", | |
| }, | |
| "PLAN": { | |
| "description": f"Plan the next sequence of actions given current state. " | |
| f"(subtask {idx}, stakes={stakes:.1f})", | |
| "ground_truth": "PLAN_VALID", | |
| }, | |
| "EXECUTE": { | |
| "description": f"Execute the planned action and report the outcome. " | |
| f"(subtask {idx}, stakes={stakes:.1f})", | |
| "ground_truth": "EXECUTION_SUCCESS", | |
| }, | |
| } | |
| t = templates[domain] | |
| return SubTask( | |
| id=sid, | |
| description=t["description"], | |
| domain=domain, | |
| stakes=stakes, | |
| ground_truth=t["ground_truth"], | |
| depends_on=depends_on or [], | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # DAG layouts for each task type | |
| # Each entry: (domain, stakes, depends_on_indices) | |
| # depends_on_indices are 1-based subtask numbers | |
| # --------------------------------------------------------------------------- | |
# Task 1 -- 10 subtasks, linear chain, no high-stakes nodes.
# Each entry: (domain, stakes, depends_on 1-based subtask numbers);
# every node depends only on its predecessor, so this is a straight pipeline.
_TASK1_LAYOUT = [
    ("ANALYZE", 0.20, []),
    ("ANALYZE", 0.25, [1]),
    ("VERIFY", 0.20, [2]),
    ("SYNTHESIZE", 0.30, [3]),
    ("VERIFY", 0.25, [4]),
    ("PLAN", 0.30, [5]),
    ("ANALYZE", 0.35, [6]),
    ("SYNTHESIZE", 0.30, [7]),
    ("VERIFY", 0.35, [8]),
    ("EXECUTE", 0.40, [9]),
]
# Task 2 -- 15 subtasks, branching DAG, medium stakes, no adversary trigger
# (all stakes stay below the 0.7 adversarial threshold).
_TASK2_LAYOUT = [
    ("ANALYZE", 0.20, []),
    ("ANALYZE", 0.25, [1]),
    ("VERIFY", 0.30, [2]),
    ("SYNTHESIZE", 0.35, [3]),
    ("PLAN", 0.40, [4]),
    ("ANALYZE", 0.30, [3]),  # branch from node 3
    ("VERIFY", 0.45, [5, 6]),  # joins branches
    ("EXECUTE", 0.50, [7]),
    ("VERIFY", 0.55, [8]),
    ("SYNTHESIZE", 0.45, [9]),
    ("PLAN", 0.50, [10]),
    ("ANALYZE", 0.55, [11]),
    ("VERIFY", 0.60, [12]),
    ("SYNTHESIZE", 0.60, [13]),
    ("EXECUTE", 0.65, [14]),
]
# Task 3 -- 20 subtasks, full DAG with adversarial trigger zones (stakes >= 0.7).
# The first 13 nodes mirror the medium layout; the tail ramps stakes up to a
# 0.95 terminal node so the adversarial specialist can trigger.
_TASK3_LAYOUT = [
    ("ANALYZE", 0.20, []),
    ("ANALYZE", 0.25, [1]),
    ("VERIFY", 0.30, [2]),
    ("SYNTHESIZE", 0.35, [3]),
    ("PLAN", 0.40, [4]),
    ("ANALYZE", 0.30, [3]),  # branch A
    ("VERIFY", 0.45, [5, 6]),  # join A+B
    ("EXECUTE", 0.50, [7]),
    ("VERIFY", 0.55, [8]),
    ("SYNTHESIZE", 0.50, [9]),
    ("PLAN", 0.60, [10]),
    ("ANALYZE", 0.55, [11]),
    ("SYNTHESIZE", 0.65, [12]),
    ("VERIFY", 0.70, [13]),  # <- ADVERSARIAL ZONE START
    ("EXECUTE", 0.75, [14]),  # <- HIGH STAKES
    ("PLAN", 0.80, [15]),  # <- HIGH STAKES
    ("ANALYZE", 0.75, [16]),  # <- HIGH STAKES
    ("VERIFY", 0.85, [17]),  # <- PEAK STAKES
    ("SYNTHESIZE", 0.90, [18]),  # <- PEAK STAKES
    ("EXECUTE", 0.95, [19]),  # <- CRITICAL -- terminal
]
def _build_scenario(
    scenario_id: str,
    task_type: str,
    layout: list[tuple],
    title_suffix: str,
) -> Scenario:
    """Materialize one Scenario from a (domain, stakes, deps) layout table.

    Dependency entries in the layout are 1-based subtask numbers; they are
    converted to "Tnn" ids to match the ids that _subtask assigns.
    """
    subtasks = [
        _subtask(num, domain, stakes, [f"T{d:02d}" for d in dep_nums])
        for num, (domain, stakes, dep_nums) in enumerate(layout, start=1)
    ]
    total = len(subtasks)
    return Scenario(
        scenario_id=scenario_id,
        task_type=task_type,
        title=f"Multi-Agent Task Workflow {title_suffix}",
        description=(
            f"A {task_type} abstract multi-agent workflow where the orchestrator "
            f"must delegate {total} subtasks across 5 specialists with "
            f"hidden reliability profiles, building trust from behavioral evidence alone."
        ),
        subtasks=subtasks,
    )
| # --------------------------------------------------------------------------- | |
| # Generate 40 scenarios per task type | |
| # Scenarios vary only by scenario_id and stakes jitter (+/- 0.05) | |
| # so the trust-calibration challenge is consistent but not identical | |
| # --------------------------------------------------------------------------- | |
| import random as _random | |
| def _jitter_stakes(layout: list[tuple], seed: int, max_jitter: float = 0.05) -> list[tuple]: | |
| """Apply small random stakes perturbation so each scenario is slightly different.""" | |
| rng = _random.Random(seed) | |
| return [ | |
| (domain, round(min(0.99, max(0.01, stakes + rng.uniform(-max_jitter, max_jitter))), 2), deps) | |
| for domain, stakes, deps in layout | |
| ] | |
def _generate_scenarios(
    task_type: str,
    layout: list[tuple],
    count: int = 40,
) -> list[Scenario]:
    """Produce `count` stakes-jittered variants of `layout` for one task type.

    Per-task seed offsets keep the jitter streams stable across runs and
    disjoint between task types.
    """
    seed_base = {"task1": 101, "task2": 202, "task3": 303}[task_type]
    out: list[Scenario] = []
    for num in range(1, count + 1):
        variant = _jitter_stakes(layout, seed=(num - 1) * 100 + seed_base)
        scenario_id = f"SCN-{task_type.upper()}-{num:03d}"
        out.append(_build_scenario(scenario_id, task_type, variant, f"#{num:03d}"))
    return out
| # --------------------------------------------------------------------------- | |
| # Public dataset | |
| # --------------------------------------------------------------------------- | |
# Built once at import time: 40 scenarios per task type, 120 total.
TASK1_SCENARIOS: list[Scenario] = _generate_scenarios("task1", _TASK1_LAYOUT, count=40)
TASK2_SCENARIOS: list[Scenario] = _generate_scenarios("task2", _TASK2_LAYOUT, count=40)
TASK3_SCENARIOS: list[Scenario] = _generate_scenarios("task3", _TASK3_LAYOUT, count=40)
ALL_SCENARIOS: list[Scenario] = TASK1_SCENARIOS + TASK2_SCENARIOS + TASK3_SCENARIOS
# Lookup tables: by unique scenario_id and by task type.
SCENARIOS_BY_ID: dict[str, Scenario] = {s["scenario_id"]: s for s in ALL_SCENARIOS}
SCENARIOS_BY_TASK: dict[str, list[Scenario]] = {
    "task1": TASK1_SCENARIOS,
    "task2": TASK2_SCENARIOS,
    "task3": TASK3_SCENARIOS,
}
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
def get_scenario(scenario_id: str) -> Scenario:
    """Look up a scenario by its unique id.

    Args:
        scenario_id: id of the form "SCN-<TASK>-<nnn>".

    Raises:
        ValueError: if `scenario_id` is not in the dataset.
    """
    # EAFP: a single dict lookup instead of an `in` check plus a second index.
    try:
        return SCENARIOS_BY_ID[scenario_id]
    except KeyError:
        raise ValueError(f"Unknown scenario_id: {scenario_id}") from None
def sample_scenario(task_type: str, seed: int | None = None) -> Scenario:
    """Pick one scenario of the given task type, optionally deterministically.

    Raises:
        ValueError: if `task_type` has no scenario pool.
    """
    candidates = SCENARIOS_BY_TASK.get(task_type)
    if not candidates:
        raise ValueError(f"Unknown task_type: {task_type}")
    # A fresh seeded Random makes the choice reproducible for a given seed.
    return _random.Random(seed).choice(candidates)
def scenario_summary() -> dict:
    """Return scenario and per-layout subtask counts for the dataset."""
    subtask_counts = {
        "task1": len(_TASK1_LAYOUT),
        "task2": len(_TASK2_LAYOUT),
        "task3": len(_TASK3_LAYOUT),
    }
    summary = {
        "total": len(ALL_SCENARIOS),
        "task1": len(TASK1_SCENARIOS),
        "task2": len(TASK2_SCENARIOS),
        "task3": len(TASK3_SCENARIOS),
        "subtasks_per_task": subtask_counts,
    }
    return summary