Spaces:

XcodeAddy
/

sentinel-env

Running

App Files Files Community

sentinel-env / scripts / backend_walkthrough.py

XcodeAddy

Harden backend session and reward constants

aad7819 19 days ago

raw

history blame contribute delete

10.4 kB

	from __future__ import annotations

	import argparse
	import json
	import random
	import sys
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Callable

	# Repository root: the parent of the directory that contains this script.
	ROOT = Path(__file__).resolve().parents[1]
	# Put the root at the front of sys.path (without adding it twice) so the
	# project-local imports that follow resolve when this file runs as a script.
	if sys.path.count(str(ROOT)) == 0:
	    sys.path.insert(0, str(ROOT))

	from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
	from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
	from sentinel_config import ADVERSARIAL_AWARENESS_STAKES


	# Signature shared by every policy in this file:
	# (environment, current observation dict, seeded RNG) -> action dict.
	Policy = Callable[[SentinelEnv, dict, random.Random], dict]


	@dataclass
	class TraceRow:
	    """One line of the printed step trace, filled in by ``run_episode`` after each step."""

	    # Step counter from the step's "info" payload.
	    step_count: int
	    # Subtask index from the observation returned by the step.
	    subtask_index: int
	    # Stakes level from the observation the policy acted on.
	    stakes: float
	    # "<action_type>:<specialist_id>", with "SELF" when no specialist is set.
	    action: str
	    # Reward value and running score reported for the step.
	    reward: float
	    score: float
	    # Completed / remaining subtask counts from the graph summary.
	    completed: int
	    remaining: int
	    # Trust snapshot (specialist id -> trust) from the post-step observation.
	    trust: dict[str, float]
	    # Reason string attached to the reward.
	    reason: str
	    # Adversarial detection / poisoning counters from the graph summary.
	    detections: int
	    poisonings: int


	def blind_trust_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
	    """Baseline policy: always delegate to the specialist with the highest current trust.

	    ``env`` and ``rng`` are unused; they are accepted so the function matches
	    the shared ``Policy`` signature.
	    """
	    snapshot = obs["trust_snapshot"]

	    def current_trust(sid):
	        # Specialists missing from the snapshot fall back to 0.5.
	        return snapshot.get(sid, 0.5)

	    favourite = max(obs["available_specialists"], key=current_trust)
	    return action(obs, "delegate", favourite, f"blind-trust best={favourite}")


	def sentinel_heuristic_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
	    """Heuristic baseline: verify instead of delegating when risk is high.

	    Picks the highest-trust specialist. If the stakes level has reached
	    ``ADVERSARIAL_AWARENESS_STAKES`` and that specialist's trust is below
	    0.65, the action is "verify"; otherwise it is "delegate".
	    """
	    snapshot = obs["trust_snapshot"]
	    candidate = max(obs["available_specialists"], key=lambda sid: snapshot.get(sid, 0.5))
	    candidate_trust = snapshot.get(candidate, 0.5)

	    if obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES and candidate_trust < 0.65:
	        move = "verify"
	    else:
	        move = "delegate"

	    note = (
	        f"sentinel-heuristic best={candidate} trust={candidate_trust:.3f} "
	        f"stakes={obs['stakes_level']:.2f}"
	    )
	    return action(obs, move, candidate, note)


	def oracle_lite_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
	    """Upper-bound policy for demos and evaluator comparison.

	    Reads builder-only state from ``env._pool`` (ground-truth reliability and
	    the adversarial slot), so it is NOT a deployable policy.
	    """
	    truth = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)

	    is_task3 = obs["task_type"] == "task3"
	    if is_task3 and obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES:
	        # On high-stakes task3 steps, verify the adversarial slot directly.
	        return action(obs, "verify", env._pool.adversarial_slot, "oracle-lite verifies adversarial slot")

	    # Otherwise delegate to the most reliable specialist (default 0.5 when unknown).
	    top = max(obs["available_specialists"], key=lambda sid: truth.get(sid, 0.5))
	    return action(obs, "delegate", top, f"oracle-lite best={top}")


	# Name -> policy registry; the keys are the valid values of the --policy flag
	# and the names iterated by compare_policies.
	POLICIES: dict[str, Policy] = dict(
	    blind=blind_trust_policy,
	    heuristic=sentinel_heuristic_policy,
	    oracle=oracle_lite_policy,
	)


	def action(obs: dict, action_type: str, specialist_id: str | None, reason: str) -> dict:
	    """Build the action dict that the policies hand to ``SentinelEnv.step``.

	    Args:
	        obs: Current observation; ``session_id`` and ``task_type`` are copied from it.
	        action_type: Kind of move, e.g. "delegate", "verify" or "solve_independently".
	        specialist_id: Target specialist, or ``None`` when no specialist is involved.
	        reason: Free-text explanation stored under "reasoning".

	    Returns:
	        The action payload. "subtask_response" is "SELF_SOLVED" only for the
	        "solve_independently" action type and ``None`` otherwise.
	    """
	    return {
	        "session_id": obs["session_id"],
	        "task_type": obs["task_type"],
	        "action_type": action_type,
	        "specialist_id": specialist_id,
	        "subtask_response": "SELF_SOLVED" if action_type == "solve_independently" else None,
	        "reasoning": reason,
	    }


	def compact_reset(result: dict) -> dict:
	    """Reduce a reset result to the compact, agent-facing fields shown in the walkthrough.

	    Copies a fixed set of keys from ``result["observation"]`` and then appends
	    the top-level "done" and "reward" entries, preserving that key order.
	    """
	    observation = result["observation"]
	    observation_keys = (
	        "session_id",
	        "scenario_id",
	        "task_type",
	        "current_subtask",
	        "available_specialists",
	        "trust_snapshot",
	        "stakes_level",
	        "step_count",
	        "max_steps",
	    )
	    summary = {key: observation[key] for key in observation_keys}
	    summary["done"] = result["done"]
	    summary["reward"] = result["reward"]
	    return summary


	def run_episode(
	    policy_name: str,
	    task_type: str,
	    seed: int,
	    show_hidden: bool,
	    max_rows: int | None,
	) -> tuple[SentinelEnv, dict, list[TraceRow]]:
	    """Run one episode with the named policy and print a step-by-step walkthrough.

	    Args:
	        policy_name: Key into ``POLICIES``.
	        task_type: Task identifier passed to ``SentinelEnv.reset``.
	        seed: Seeds both the environment reset and the policy RNG.
	        show_hidden: When true, also print the builder-only specialist profile.
	        max_rows: Maximum number of trace rows to print; ``None`` prints all.
	            Rows beyond the limit are still collected and returned.

	    Returns:
	        ``(env, result, rows)``: the environment, the last reset/step result,
	        and one ``TraceRow`` per executed step.

	    Raises:
	        KeyError: If ``policy_name`` is not a key of ``POLICIES``.
	    """
	    policy = POLICIES[policy_name]
	    rng = random.Random(seed)
	    env = SentinelEnv()
	    result = env.reset(task_type=task_type, seed=seed)
	    rows: list[TraceRow] = []

	    print_header(policy_name, task_type, seed)
	    print("RESET JSON - compact agent-facing shape")
	    print(json.dumps(compact_reset(result), indent=2))
	    print()
	    print("LLM ORCHESTRATOR PROMPT - first 28 lines")
	    prompt_lines = build_orchestrator_prompt(result["observation"]).splitlines()
	    print("\n".join(prompt_lines[:28]))
	    if len(prompt_lines) > 28:
	        print("...")
	    print()
	    if show_hidden:
	        print("BUILDER-ONLY HIDDEN PROFILE - agent never sees this")
	        print(json.dumps({
	            "public_slot_to_internal_behavior": env._pool.internal_profile(),
	            "adversarial_public_slot": env._pool.adversarial_slot,
	        }, indent=2))
	        print()

	    print_trace_header()
	    # Safety cap: stop after 100 policy steps even if the env never reports done.
	    guard = 0
	    while not result["done"] and guard < 100:
	        obs = result["observation"]
	        chosen = policy(env, obs, rng)
	        result = env.step(chosen)
	        graph_summary = env._graph.summary()
	        row = TraceRow(
	            step_count=result["info"]["step_count"],
	            subtask_index=result["observation"]["subtask_index"],
	            # Stakes come from the observation the policy acted on, not the new one.
	            stakes=obs["stakes_level"],
	            action=f"{chosen['action_type']}:{chosen.get('specialist_id') or 'SELF'}",
	            reward=result["reward"]["value"],
	            score=result["info"]["score"],
	            completed=graph_summary["subtasks_completed"],
	            remaining=graph_summary["subtasks_remaining"],
	            trust=result["observation"]["trust_snapshot"],
	            reason=result["reward"]["reason"],
	            detections=graph_summary["adversarial_detections"],
	            poisonings=graph_summary["adversarial_poisonings"],
	        )
	        rows.append(row)
	        # Every row is recorded; only the first max_rows are printed.
	        if max_rows is None or len(rows) <= max_rows:
	            print_trace_row(row)
	        guard += 1

	    if max_rows is not None and len(rows) > max_rows:
	        print(f"... {len(rows) - max_rows} more rows hidden by --max-rows")

	    print()
	    print("FINAL INFO")
	    print(json.dumps(result["info"], indent=2))
	    print("FINAL REWARD")
	    print(json.dumps(result["reward"], indent=2))
	    print()
	    return env, result, rows


	def print_header(policy_name: str, task_type: str, seed: int) -> None:
	    """Print the walkthrough banner, the run parameters and the mission context.

	    The text comes from ``problem_statement()`` and ``mission_for_task(task_type)``.
	    """
	    statement = problem_statement()["problem"]
	    task_mission = mission_for_task(task_type)
	    rule = "=" * 92

	    print(rule)
	    print("SENTINEL BACKEND WALKTHROUGH")
	    print(rule)
	    print(f"policy={policy_name} task={task_type} seed={seed}")
	    print()

	    print("REAL USER PROMPT EXAMPLE")
	    print(statement["real_user_prompt_example"])
	    print()

	    print("REAL-WORLD MAPPING")
	    print(statement["not_a_simple_prompt_solver"])
	    print(f"Task mission: {task_mission['judge_friendly_story']}")
	    print("The JSON action is the next internal control move, not the final user answer.")
	    print("SENTINEL trains the transferable behavior: trust, verify, recover, finish.")
	    print()


	def print_trace_header() -> None:
	    """Print the title, column headings and separator rule of the step trace table."""
	    print("STEP TRACE")
	    # Columns are separated by a plain "|"; a backslash before it would be an
	    # invalid escape sequence and would show up literally in the output.
	    print(
	        "step | node | stake | action | reward | score | done/rem | adv det/poison | trust snapshot"
	    )
	    print("-" * 132)


	def print_trace_row(row: TraceRow) -> None:
	    """Print one trace row as a "|"-separated line, followed by its reward reason.

	    Args:
	        row: The step record to render; ``row.trust`` is printed as
	            space-separated ``id:score`` pairs with three decimals.
	    """
	    trust = " ".join(f"{sid}:{score:.3f}" for sid, score in row.trust.items())
	    # The "done/rem" column shows completed over total (completed + remaining).
	    total = row.completed + row.remaining
	    cells = [
	        f"{row.step_count:>4}",
	        f"{row.subtask_index:>4}",
	        f"{row.stakes:>5.2f}",
	        f"{row.action:<15}",
	        f"{row.reward:>6.3f}",
	        f"{row.score:>5.3f}",
	        f"{row.completed:>2}/{total:<2}",
	        f"{row.detections:>2}/{row.poisonings:<2}",
	        trust,
	    ]
	    # Plain "|" separators: a backslash-escaped pipe would print a literal backslash.
	    print(" | ".join(cells))
	    print(f" reason: {row.reason}")


	def compare_policies(task_type: str, seed: int, show_hidden: bool) -> None:
	    """Run the blind, heuristic and oracle policies on one task/seed and print a summary table.

	    Each policy gets a fresh ``SentinelEnv`` reset with the same ``task_type``
	    and ``seed``, and its own RNG seeded with ``seed``.

	    Args:
	        task_type: Task identifier passed to ``SentinelEnv.reset``.
	        seed: Seed for the environment reset and the policy RNG.
	        show_hidden: When true, print the builder-only specialist profile
	            once, after the "blind" run.
	    """
	    mission = mission_for_task(task_type)
	    print("=" * 92)
	    print("BEFORE / AFTER BACKEND COMPARISON")
	    print("=" * 92)
	    print("before=blind trust, middle=heuristic trust, target=oracle-lite upper bound")
	    print(f"mission={mission['name']} - {mission['real_life_example']}")
	    print()
	    results = []
	    for policy_name in ("blind", "heuristic", "oracle"):
	        env = SentinelEnv()
	        result = env.reset(task_type=task_type, seed=seed)
	        rng = random.Random(seed)
	        # NOTE(review): unlike run_episode there is no step cap here; the loop
	        # relies on the environment eventually reporting done.
	        while not result["done"]:
	            chosen = POLICIES[policy_name](env, result["observation"], rng)
	            result = env.step(chosen)
	        info = result["info"]
	        results.append({
	            "policy": policy_name,
	            "score": info.get("score", 0.0),
	            "completion": info.get("completion_rate", 0.0),
	            "detections": info.get("adversarial_detections", 0),
	            "poisonings": info.get("adversarial_poisonings", 0),
	            "steps": info.get("step_count", 0),
	            "status": "failed" if info.get("forced_end") else "completed",
	        })
	        if show_hidden and policy_name == "blind":
	            print("Hidden profile for this comparison seed:")
	            print(json.dumps({
	                "public_slot_to_internal_behavior": env._pool.internal_profile(),
	                "adversarial_public_slot": env._pool.adversarial_slot,
	            }, indent=2))
	            print()

	    # Plain "|" separators: a backslash-escaped pipe would print a literal backslash.
	    print("policy | score | completion | detections | poisonings | steps | status")
	    print("-" * 78)
	    for item in results:
	        print(
	            f"{item['policy']:<9} | {item['score']:.3f} | "
	            f"{item['completion']:.3f} | {item['detections']:<10} | "
	            f"{item['poisonings']:<10} | {item['steps']:<5} | {item['status']}"
	        )
	    print()


	def main() -> None:
	    """Parse the command line and run the walkthrough.

	    With ``--compare`` the policy comparison table is printed first; the
	    single-policy episode walkthrough runs in every case.
	    """
	    cli = argparse.ArgumentParser(description="Explain SENTINEL backend behavior from terminal.")
	    cli.add_argument("--task", default="task3", choices=["task1", "task2", "task3"])
	    cli.add_argument("--seed", type=int, default=42)
	    cli.add_argument("--policy", default="heuristic", choices=sorted(POLICIES))
	    cli.add_argument("--hide-hidden", action="store_true", help="Do not print builder-only hidden profile.")
	    cli.add_argument("--max-rows", type=int, default=None, help="Limit printed trace rows.")
	    cli.add_argument("--compare", action="store_true", help="Compare blind vs heuristic vs oracle-lite.")
	    options = cli.parse_args()

	    # The hidden profile is shown by default; --hide-hidden switches it off.
	    reveal_hidden = not options.hide_hidden
	    if options.compare:
	        compare_policies(options.task, options.seed, reveal_hidden)
	    run_episode(options.policy, options.task, options.seed, reveal_hidden, options.max_rows)


	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()