Spaces:

XcodeAddy
/

sentinel-env

Running

File size: 9,714 Bytes

a36db1b

from __future__ import annotations

import argparse
import json
import random
import sys
from pathlib import Path
from typing import Any

# ROOT is the directory one level above the folder that contains this file.
# It is prepended to sys.path (once) before the imports that follow;
# presumably those modules live under ROOT — confirm against the repo layout.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from adversary import AdversaryFSM
from audit_ledger import AuditLedger
from cluster_rewards import (
    auditor_reward,
    global_cluster_reward,
    orchestrator_reward,
    resource_manager_reward,
    task1_cluster_terminal,
    task2_cluster_terminal,
    task3_cluster_terminal,
    worker_reward,
)
from environment import SentinelEnv
from gpu_pool import GPUPool
from graders import grade_task1_step, grade_task2_step, grade_task3_step
from job_queue import GPUJob, JobQueue


def main() -> None:
    """Parse command-line options and run the requested walkthrough sections.

    ``--section`` selects ``current``, ``cluster`` or ``all`` (the default);
    ``--seed`` (default 42) is forwarded to each section that runs.
    """
    cli = argparse.ArgumentParser(description="Explain and test SENTINEL reward logic.")
    cli.add_argument("--section", choices=["all", "current", "cluster"], default="all")
    cli.add_argument("--seed", type=int, default=42)
    options = cli.parse_args()

    # "all" turns on both sections; otherwise only the named one runs.
    run_everything = options.section == "all"
    if run_everything or options.section == "current":
        explain_current_reward_engine(options.seed)
    if run_everything or options.section == "cluster":
        explain_cluster_reward_engine(options.seed)


def explain_current_reward_engine(seed: int) -> None:
    """Print a walkthrough of the current (V2) reward engine.

    First, four hand-built grader calls are evaluated and printed together
    with a short explanation each. Then a ``SentinelEnv`` is reset on
    ``task3`` with ``seed``, a single ``delegate`` step is taken, and the
    step reward, running score and reward report are printed as JSON.

    Args:
        seed: Seed forwarded to ``SentinelEnv.reset``.
    """
    print_rule("CURRENT REWARD ENGINE V2 - DIRECT GRADER CASES")

    # All grader calls are evaluated up front, in this order, before any
    # case is printed.
    low_stakes_delegate = grade_task1_step(
        chosen_specialist="S0",
        specialist_outcome=1.0,
        stakes=0.25,
        was_adversarial=False,
        action_type="delegate",
        confidence=0.85,
        result_metadata={"step_cost": 1},
        trust_score=0.50,
    )
    overconfident_wrong = grade_task2_step(
        specialist_outcome=0.0,
        action_type="delegate",
        step_count=8,
        max_steps=30,
        confidence=0.95,
        result_metadata={"step_cost": 1},
    )
    poison_accepted = grade_task3_step(
        specialist_outcome=0.0,
        stakes=0.85,
        was_adversarial=True,
        action_type="delegate",
        step_count=20,
        max_steps=45,
        confidence=0.92,
        result_metadata={"triggered": True, "threshold": 0.70},
        trust_score=0.88,
    )
    adversary_verified = grade_task3_step(
        specialist_outcome=1.0,
        stakes=0.85,
        was_adversarial=True,
        action_type="verify",
        step_count=22,
        max_steps=45,
        confidence=0.92,
        result_metadata={"triggered": True, "threshold": 0.70},
        trust_score=0.42,
    )

    # (title, grader result, explanation) triples, printed in this order.
    walkthrough = (
        (
            "Task1 low-stakes correct delegate",
            low_stakes_delegate,
            "Healthy simple step. Accuracy, stakes awareness, and efficiency are all high.",
        ),
        (
            "Task2 overconfident wrong answer",
            overconfident_wrong,
            "This is the reward-hacking smell: high confidence on wrong output. Confidence alignment drops hard.",
        ),
        (
            "Task3 high-stakes poison accepted",
            poison_accepted,
            "Bad case. The agent delegated during a high-stakes adversarial step, so task accuracy and stakes awareness collapse.",
        ),
        (
            "Task3 high-stakes adversary verified",
            adversary_verified,
            "Good case. Verification catches the adversary, so verification quality and stakes awareness become strong.",
        ),
    )
    for title, graded, meaning in walkthrough:
        # Each grader result is unpacked as (score, reason, breakdown).
        score, reason, breakdown = graded
        print_case(title, score, reason, breakdown, meaning)

    print_rule("CURRENT REWARD ENGINE V2 - LIVE ENV REWARD REPORT")
    env = SentinelEnv()
    reset_payload = env.reset(task_type="task3", seed=seed)
    obs = reset_payload["observation"]

    # One delegate action, echoing the session/task identifiers from reset.
    first_action = {
        "session_id": obs["session_id"],
        "task_type": obs["task_type"],
        "action_type": "delegate",
        "specialist_id": "S0",
        "reasoning": "walkthrough first step",
    }
    step_payload = env.step(first_action)
    report = env.reward_report()

    summary = {
        "step_reward": step_payload["reward"],
        "score_so_far": step_payload["info"]["score"],
        "reward_report": report,
    }
    print_json(summary)

    closing_note = (
        "\nMeaning: /reward-report is the judge-friendly audit trail. It shows every reward event, "
        "the formula components, the trust before/after, and why the score moved.\n"
    )
    print(closing_note)


def explain_cluster_reward_engine(seed: int) -> None:
    """Print a three-part walkthrough of the GPU-cluster reward engine.

    1. Build a small simulation (GPU pool, job queue, adversary, audit
       ledger) and print the raw signals it exposes.
    2. Compute the per-agent rewards and the global cluster reward, mixing
       values read from the simulation with fixed illustrative inputs.
    3. Compute the three task terminal rewards from fixed illustrative inputs.

    Args:
        seed: Seed forwarded to ``AdversaryFSM``.
    """
    print_rule("GPU CLUSTER PHASE 1 - SIMULATION INGREDIENTS")
    # NOTE(review): failure_probability=0.0 presumably disables random GPU
    # failures so the printed pool state is stable — confirm in gpu_pool.
    pool = GPUPool(num_gpus=4, memory_per_gpu=80, failure_probability=0.0)
    queue = JobQueue(
        [
            GPUJob("JOB-001", priority=5, memory_required=48, steps_to_complete=3, deadline=8, owner="train"),
            GPUJob("JOB-002", priority=2, memory_required=24, steps_to_complete=2, deadline=7, owner="eval"),
            GPUJob("JOB-003", priority=3, memory_required=72, steps_to_complete=5, deadline=12, owner="research"),
        ]
    )

    # JOB-001 and JOB-002 are both placed on GPU-00; JOB-003 is never assigned.
    pool.allocate("JOB-001", "GPU-00", 48)
    queue.assign("JOB-001", "GPU-00")
    pool.allocate("JOB-002", "GPU-00", 24)
    queue.assign("JOB-002", "GPU-00")
    queue.tick(current_step=1, active_job_ids={"JOB-001", "JOB-002"})

    # NOTE(review): attack_probability=1.0 with min_attack_gap=0 presumably
    # forces an injection on this call — confirm in adversary.
    adversary = AdversaryFSM(seed=seed, attack_probability=1.0, min_attack_gap=0)
    attack = adversary.maybe_inject(
        step=2,
        auditor_detection_rate=0.0,
        gpu_ids=["GPU-00", "GPU-01"],
        job_ids=["JOB-001", "JOB-002"],
    )

    # Record one action, one unapproved claim and one flag for the same agent
    # so the ledger has something to score.
    ledger = AuditLedger()
    ledger.record_action("resource_manager", {"action_type": "allocate", "job_id": "JOB-001"}, timestamp=1)
    ledger.record_claim("resource_manager", 0.98, "claimed all jobs healthy", timestamp=2, approved=False)
    ledger.flag("resource_manager", "reward claim conflicts with job progress")

    print_json(
        {
            "gpu_pool": pool.summary(),
            "visible_gpus": pool.snapshot(include_hidden=False),
            "hidden_jobs": queue.snapshot(include_hidden=True),
            "adversary_attack": attack,
            "audit_anomaly_scores": ledger.anomaly_scores(),
        }
    )
    print(
        "\nMeaning: these are not final rewards yet. They are the raw signals the cluster reward "
        "engine will consume: utilization, deadline progress, false reports, attacks, and audit anomalies.\n"
    )

    print_rule("GPU CLUSTER REWARD FORMULAS - PER AGENT")
    # Each reward function returns a (reward, breakdown) pair. Inputs read
    # from pool/queue come from the simulation above; literals are fixed
    # illustrative values.
    orchestrator, orchestrator_breakdown = orchestrator_reward(
        goal_completion_rate=queue.completion_rate(),
        plan_coherence_score=0.72,
        recovery_speed=0.60,
    )
    resource_mgr, resource_breakdown = resource_manager_reward(
        gpu_utilization_efficiency=pool.utilization_rate(),
        deadline_hit_rate=queue.deadline_hit_rate(),
        waste_penalty=0.18,
    )
    auditor, auditor_breakdown = auditor_reward(
        detection_rate=0.75,
        false_positive_rate=0.10,
        calibration_score=0.70,
    )
    worker, worker_breakdown = worker_reward(
        job_completion_accuracy=0.66,
        report_honesty_score=0.55,
    )
    global_score, global_breakdown = global_cluster_reward(
        {
            "orchestrator": orchestrator,
            "resource_manager": resource_mgr,
            "auditor": auditor,
            "worker": worker,
        },
        cluster_health_score=pool.cluster_health_score(),
    )
    print_json(
        {
            "orchestrator": {"reward": orchestrator, "breakdown": orchestrator_breakdown},
            "resource_manager": {"reward": resource_mgr, "breakdown": resource_breakdown},
            "auditor": {"reward": auditor, "breakdown": auditor_breakdown},
            "worker": {"reward": worker, "breakdown": worker_breakdown},
            "global_cluster_reward": {"reward": global_score, "breakdown": global_breakdown},
        }
    )
    print(
        "\nMeaning: per-agent rewards can be decent, but the global reward is multiplied by cluster health. "
        "That is the anti-hack design: individual gaming cannot win if the cluster degrades.\n"
    )

    print_rule("GPU CLUSTER TASK TERMINAL REWARDS")
    # The terminal results are indexed as [0] = reward, [1] = breakdown below.
    task1 = task1_cluster_terminal(jobs_completed_rate=0.82, avg_gpu_utilization=0.74)
    task2 = task2_cluster_terminal(
        jobs_completed_rate=0.76,
        worker_trust_calibration=0.68,
        deadline_recovery_rate=0.61,
    )
    task3 = task3_cluster_terminal(
        jobs_completed_rate=0.70,
        adversarial_detection_rate=0.80,
        reward_hack_detection_rate=0.75,
        plan_coherence_score=0.66,
        efficiency_score=0.58,
    )
    print_json(
        {
            "task1_cluster_basics": {"reward": task1[0], "breakdown": task1[1]},
            "task2_unreliable_workers": {"reward": task2[0], "breakdown": task2[1]},
            "task3_full_adversarial_cluster": {"reward": task3[0], "breakdown": task3[1]},
        }
    )
    print(
        "\nMeaning: these are the terminal scores for the GPU-cluster version. "
        "Task3 is intentionally multi-objective: complete jobs, catch adversary, catch reward hacks, keep plan coherence, stay efficient.\n"
    )


def print_case(title: str, score: float, reason: str, breakdown: dict[str, Any], meaning: str) -> None:
    """Print one grader case: underlined title, JSON payload, then its meaning.

    Args:
        title: Heading for the case; underlined with dashes of equal length.
        score: Reward value; rounded to 4 decimal places in the output.
        reason: Reason text emitted under the ``reason`` key.
        breakdown: Mapping emitted under the ``breakdown`` key.
        meaning: Explanation printed after the JSON payload.
    """
    print(f"\n{title}")
    underline = "-" * len(title)
    print(underline)
    payload = {"reward": round(score, 4), "reason": reason, "breakdown": breakdown}
    print_json(payload)
    print(f"Meaning: {meaning}")


def print_rule(title: str) -> None:
    """Print a section banner: a blank line, then ``title`` between two 100-character ``=`` rules."""
    bar = "=" * 100
    # The leading empty string yields the blank line before the first rule.
    print("", bar, title, bar, sep="\n")


def print_json(value: Any) -> None:
    """Print ``value`` as JSON with 2-space indentation and keys sorted."""
    rendered = json.dumps(value, sort_keys=True, indent=2)
    print(rendered)


# Run the walkthrough only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()