# Spaces: XcodeAddy / sentinel-env (Running)
# File size: 8,058 Bytes

"""
SENTINEL — Baseline Inference Script
=====================================
Runs a deterministic heuristic agent against all 3 task types.
Emits [START] / [STEP] / [END] structured logs exactly as required.

Heuristic agent logic:
  - Always delegates to the specialist with highest trust score
  - If stakes reach the adversarial-awareness threshold
    (ADVERSARIAL_AWARENESS_STAKES) and the chosen specialist's trust
    is < 0.60 -> verify instead
  - Never skips
  - Never solves independently (too expensive)

Usage:
    python inference.py

Environment variables (optional):
    API_BASE_URL  — OpenAI-compatible endpoint (for LLM agent, not heuristic)
    MODEL_NAME    — model identifier
    HF_TOKEN      — API key
    ENV_URL       — remote env URL (default: in-process)
"""

from __future__ import annotations

import json
import os
from pathlib import Path

# ---------------------------------------------------------------------------
# Try remote env first, fall back to in-process
# ---------------------------------------------------------------------------

# A non-empty ENV_URL selects the remote (HTTP) transport; otherwise the
# environment is imported and driven inside this process.
ENV_URL = os.environ.get("ENV_URL", "").strip()
USE_REMOTE = bool(ENV_URL)

if USE_REMOTE:
    import httpx  # only needed when talking to a remote env server
else:
    from environment import SentinelEnv  # local, in-process environment

from sentinel_config import ADVERSARIAL_AWARENESS_STAKES


# ---------------------------------------------------------------------------
# Env interface (works both remote and in-process)
# ---------------------------------------------------------------------------

class EnvClient:
    """Uniform client for the environment, remote or in-process.

    The transport is fixed by the module-level ``USE_REMOTE`` flag: when it
    is true, calls are sent over HTTP to ``ENV_URL`` through an
    ``httpx.Client``; otherwise they are forwarded to a local
    ``SentinelEnv`` instance.

    The client can be used as a context manager so the HTTP connection pool
    is released on exit; calling ``close()`` explicitly works as well.

    Attributes:
        session_id: Identifier read from ``result["info"]["session_id"]``
            of the most recent ``reset()``; empty string before the first
            reset.
    """

    def __init__(self):
        if USE_REMOTE:
            self._client = httpx.Client(base_url=ENV_URL, timeout=30)
        else:
            self._env = SentinelEnv()
        self.session_id: str = ""

    def __enter__(self) -> "EnvClient":
        return self

    def __exit__(self, *exc_info) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying HTTP client, if any (no-op in-process)."""
        http_client = getattr(self, "_client", None)
        if http_client is not None:
            http_client.close()

    def reset(self, task_type: str, scenario_id: str | None = None, seed: int | None = None) -> dict:
        """Start a new episode and remember its session id.

        Args:
            task_type: Task type to run.
            scenario_id: Optional scenario identifier passed to the env.
            seed: Optional seed passed to the env.

        Returns:
            The env's reset result; it must contain
            ``["info"]["session_id"]``.

        Raises:
            httpx.HTTPStatusError: Remote mode only, on an error response.
            KeyError: If the result carries no session id.
        """
        if USE_REMOTE:
            # The JSON payload is only needed for the HTTP transport.
            payload = {"task_type": task_type, "scenario_id": scenario_id, "seed": seed}
            r = self._client.post("/reset", json=payload)
            r.raise_for_status()
            result = r.json()
        else:
            result = self._env.reset(task_type=task_type, scenario_id=scenario_id, seed=seed)
        self.session_id = result["info"]["session_id"]
        return result

    def step(self, action: dict) -> dict:
        """Send one action for the current session and return the result.

        The current ``session_id`` is attached to a shallow copy of
        ``action``, so the caller's dict is left untouched.

        Raises:
            httpx.HTTPStatusError: Remote mode only, on an error response.
        """
        payload = {**action, "session_id": self.session_id}
        if USE_REMOTE:
            # ``params`` lets httpx URL-encode the session id instead of
            # splicing it raw into the query string.
            r = self._client.post(
                "/step",
                params={"session_id": self.session_id},
                json=payload,
            )
            r.raise_for_status()
            return r.json()
        return self._env.step(payload)

    def state(self) -> dict:
        """Return the env's state for the current session.

        Raises:
            httpx.HTTPStatusError: Remote mode only, on an error response.
        """
        if USE_REMOTE:
            r = self._client.get("/state", params={"session_id": self.session_id})
            r.raise_for_status()
            return r.json()
        return self._env.state(session_id=self.session_id)


# ---------------------------------------------------------------------------
# Heuristic agent
# ---------------------------------------------------------------------------

def heuristic_action(obs: dict, session_id: str) -> dict:
    """
    Build the baseline agent's action for one observation.

    The available specialist with the highest trust score is chosen
    (specialists missing from the trust snapshot count as 0.5; on a tie the
    one listed first wins).  The action type is "verify" when stakes are at
    or above ADVERSARIAL_AWARENESS_STAKES and the chosen specialist's trust
    is under 0.60, and "delegate" in every other case.
    """
    trust_by_id = obs["trust_snapshot"]
    stakes_now = obs["stakes_level"]
    task = obs["task_type"]
    candidates = obs["available_specialists"]

    def trust_of(specialist):
        # Unknown specialists get a neutral prior.
        return trust_by_id.get(specialist, 0.5)

    chosen = max(candidates, key=trust_of)
    chosen_trust = trust_of(chosen)

    # Only pay for verification when the stakes are high and even the best
    # candidate is not trusted enough.
    needs_check = stakes_now >= ADVERSARIAL_AWARENESS_STAKES and chosen_trust < 0.60
    kind = "verify" if needs_check else "delegate"

    why = f"Trust-weighted: {chosen}={chosen_trust:.3f}, stakes={stakes_now:.2f}"
    return {
        "session_id":       session_id,
        "task_type":        task,
        "action_type":      kind,
        "specialist_id":    chosen,
        "subtask_response": None,
        "reasoning":        why,
    }


# ---------------------------------------------------------------------------
# Run one scenario
# ---------------------------------------------------------------------------

def run_episode(
    client: EnvClient,
    task_type: str,
    scenario_id: str,
    seed: int,
) -> dict:
    """
    Play one scenario to completion with the heuristic agent.

    Resets the env, then repeatedly asks ``heuristic_action`` for an action
    and steps the env until a step reports ``done``.  Prints one [START]
    line, one [STEP] line per step and one [END] line, and returns a summary
    dict (scenario/task ids, step count, score, total reward, completion
    rate, adversarial detection/poisoning counts and the final trust
    snapshot) built from the last step's ``info``.
    """
    result = client.reset(task_type=task_type, scenario_id=scenario_id, seed=seed)
    session_id = client.session_id

    print(f"[START] task={scenario_id} env=sentinel-env model=heuristic-baseline")

    reward_log: list[float] = []
    score = 0.0
    finished = False

    while not finished:
        action = heuristic_action(result["observation"], session_id)
        result = client.step(action)

        step_reward = result["reward"]["value"]
        finished = result["done"]
        reward_log.append(step_reward)
        # Keep the most recent score; the last step's value is the final one.
        score = result["info"].get("score", 0.0)

        action_label = f"{action['action_type']}:{action.get('specialist_id','SELF')}"
        step_line = (
            f"[STEP] step={len(reward_log)} "
            f"action={action_label} "
            f"reward={step_reward:.2f} "
            f"done={str(finished).lower()} "
            f"error=null"
        )
        print(step_line)

    # Summary fields come from the info of the final step.
    info = result["info"]
    n_steps = len(reward_log)
    rewards_csv = ",".join(format(r, ".2f") for r in reward_log)

    end_line = (
        f"[END] success=true "
        f"steps={n_steps} "
        f"score={score:.3f} "
        f"rewards={rewards_csv}"
    )
    print(end_line)

    summary = {
        "scenario_id":            scenario_id,
        "task_type":              task_type,
        "steps":                  n_steps,
        "score":                  round(score, 4),
        "total_reward":           round(info.get("total_reward", 0.0), 4),
        "completion_rate":        round(info.get("completion_rate", 0.0), 4),
        "adversarial_detections": info.get("adversarial_detections", 0),
        "adversarial_poisonings": info.get("adversarial_poisonings", 0),
        "final_trust":            info.get("trust_snapshot", {}),
    }
    return summary


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Run the heuristic baseline on every task type and save a summary.

    Runs 10 episodes for each of task1/task2/task3 (30 total) through a
    single ``EnvClient``.  An episode that raises is reported with a
    failure [STEP]/[END] pair and skipped.  If at least one episode
    succeeded, per-task and overall average scores plus the per-episode
    summaries are written to ``outputs/baseline_scores.json``.
    """
    # Single source of truth for the task list: it drives both the episode
    # loop and the per-task grouping, so the two cannot drift apart.
    task_types = ("task1", "task2", "task3")
    episodes_per_task = 10  # 30 episodes total: fast enough for validation

    client = EnvClient()
    all_results = []

    for task_type in task_types:
        for i in range(episodes_per_task):
            scenario_id = f"SCN-{task_type.upper()}-{i+1:03d}"
            try:
                result = run_episode(client, task_type, scenario_id, seed=i)
            except Exception as e:
                # Best-effort batch run: log the failure in the structured
                # format and continue with the next scenario.
                print(f"[STEP] step=0 action=error reward=0.00 done=true error={e}")
                print("[END] success=false steps=0 score=0.000 rewards=0.00")
            else:
                all_results.append(result)

    if not all_results:
        # Nothing succeeded, so there is nothing to summarise.
        return

    by_task: dict[str, list] = {t: [] for t in task_types}
    for r in all_results:
        by_task[r["task_type"]].append(r["score"])

    all_scores = [r["score"] for r in all_results]
    overall_avg = sum(all_scores) / len(all_scores)

    out_path = Path("outputs/baseline_scores.json")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "model":          "heuristic-baseline",
            "total_episodes": len(all_results),
            "avg_score":      round(overall_avg, 4),
            "by_task": {
                # Task types with no successful episode are left out.
                t: {"episodes": len(s), "avg_score": round(sum(s) / len(s), 4)}
                for t, s in by_task.items() if s
            },
            "episodes": all_results,
        }, f, indent=2)


# Run the baseline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()