Spaces:

XcodeAddy
/

sentinel-env

Running

App Files Files Community

sentinel-env / models.py

XcodeAddy

Add adaptive trust curriculum wow features

74b74f1 18 days ago

raw

history blame contribute delete

5.27 kB

	from __future__ import annotations

	from enum import Enum
	from typing import Any

	from pydantic import BaseModel, field_validator


	# ---------------------------------------------------------------------------
	# Enums
	# ---------------------------------------------------------------------------

	class TaskType(str, Enum):
	    """Identifiers for the task variants.

	    Members mix in ``str``, so each one compares equal to (and can be looked
	    up by) its plain string value, e.g. ``TaskType("task1")``.
	    """

	    TASK1 = "task1"  # easy — single-step trust decision
	    TASK2 = "task2"  # medium — multi-step delegation chain (no adversary)
	    TASK3 = "task3"  # hard — full adversarial episode


	class ActionType(str, Enum):
	    """Kinds of action that can be taken on a subtask.

	    Members mix in ``str``, so each one compares equal to its string value.
	    The step costs and penalties mentioned below are notes carried over from
	    the original comments; they are not applied anywhere in this enum.
	    """

	    DELEGATE = "delegate"  # route subtask to a specialist
	    VERIFY = "verify"  # cross-check specialist result (+1 step cost)
	    SOLVE_INDEPENDENTLY = "solve_independently"  # agent solves itself (+2 step cost)
	    SKIP = "skip"  # abandon subtask (heavy penalty)


	class SpecialistId(str, Enum):
	    """Identifiers of the specialists.

	    Members mix in ``str``, so each one compares equal to its string value.
	    The trailing comments name the behaviour profile associated with each id,
	    as noted by the original author.
	    """

	    S0 = "S0"  # AccurateSlow
	    S1 = "S1"  # OverconfidentFast
	    S2 = "S2"  # DomainBound
	    S3 = "S3"  # Adversarial (identity shuffled each episode)
	    S4 = "S4"  # Degrading


	class EpisodeStatus(str, Enum):
	    """Lifecycle status of an episode.

	    Members mix in ``str``, so each one compares equal to its string value.
	    """

	    ACTIVE = "active"
	    COMPLETED = "completed"
	    FAILED = "failed"


	# ---------------------------------------------------------------------------
	# Observation
	# ---------------------------------------------------------------------------

	class SentinelObservation(BaseModel):
	    """Observation payload describing the current episode and subtask.

	    ``behavioral_fingerprints`` and ``difficulty_profile`` default to ``None``
	    and may be omitted. ``last_action_summary`` accepts ``None`` but has no
	    default, so it must always be supplied explicitly.
	    """

	    session_id: str
	    scenario_id: str
	    task_type: TaskType
	    difficulty: str  # "easy" | "medium" | "hard"
	    task_description: str
	    current_subtask: str
	    subtask_index: int  # which subtask in the DAG (0-based)
	    subtasks_total: int
	    subtasks_remaining: int
	    available_specialists: list[str]  # specialist ids visible to agent
	    trust_snapshot: dict[str, float]  # {"S0": 0.82, "S1": 0.31, ...}
	    behavioral_fingerprints: dict[str, dict[str, Any]] | None = None
	    difficulty_profile: dict[str, Any] | None = None
	    stakes_level: float  # 0.0–1.0; high = adversary may trigger
	    step_count: int
	    max_steps: int
	    last_action_summary: str | None
	    last_reward: float
	    episode_status: EpisodeStatus


	# ---------------------------------------------------------------------------
	# Action
	# ---------------------------------------------------------------------------

	class SentinelAction(BaseModel):
	    """Action payload submitted for the current subtask.

	    Only the *value* of ``specialist_id`` is validated here (it must be a
	    ``SpecialistId`` value when given). The "required for ..." notes on the
	    fields are not enforced by this model; ``requires_specialist()`` and
	    ``requires_response()`` report which optional field the chosen
	    ``action_type`` calls for, so a caller can perform that check.
	    """

	    session_id: str
	    task_type: TaskType
	    action_type: ActionType
	    specialist_id: str | None = None  # required for DELEGATE and VERIFY
	    subtask_response: str | None = None  # required for SOLVE_INDEPENDENTLY
	    reasoning: str | None = None  # optional chain-of-thought

	    @field_validator("specialist_id")
	    @classmethod
	    def validate_specialist_id(cls, v: str | None) -> str | None:
	        """Accept ``None`` or any ``SpecialistId`` value; reject anything else.

	        Raises:
	            ValueError: if ``v`` is not ``None`` and is not one of the
	                ``SpecialistId`` values.
	        """
	        if v is None:
	            return v
	        # Build the list once; it is used for both the check and the message.
	        valid_ids = [s.value for s in SpecialistId]
	        if v not in valid_ids:
	            raise ValueError(f"specialist_id must be one of {valid_ids}, got '{v}'")
	        return v

	    def requires_specialist(self) -> bool:
	        """Return True when the action type is DELEGATE or VERIFY."""
	        return self.action_type in (ActionType.DELEGATE, ActionType.VERIFY)

	    def requires_response(self) -> bool:
	        """Return True when the action type is SOLVE_INDEPENDENTLY."""
	        return self.action_type == ActionType.SOLVE_INDEPENDENTLY


	# ---------------------------------------------------------------------------
	# Reward
	# ---------------------------------------------------------------------------

	class SentinelReward(BaseModel):
	    """Reward payload: a scalar value, a reason, and a per-signal breakdown.

	    ``value`` is clamped on validation into the closed interval
	    [0.01, 0.99], so a stored reward never reaches 0.0 or 1.0.
	    """

	    value: float  # clamped to [0.01, 0.99]; never exactly 0.0 or 1.0
	    reason: str
	    signal_breakdown: dict[str, float]  # {"task_accuracy": 0.4, ...}

	    @field_validator("value")
	    @classmethod
	    def clamp_reward(cls, v: float) -> float:
	        """Clamp ``v`` into [0.01, 0.99].

	        NaN is mapped to the lower bound. With a bare
	        ``max(0.01, min(0.99, v))`` every comparison against NaN is false,
	        so ``min`` would keep 0.99 and an undefined reward would be stored
	        as the maximum one.
	        """
	        import math  # function-scope import: the module header does not import math

	        if math.isnan(v):
	            return 0.01
	        return max(0.01, min(0.99, v))


	# ---------------------------------------------------------------------------
	# Step Result (what env.step() and env.reset() return)
	# ---------------------------------------------------------------------------

	class StepResult(BaseModel):
	    """Bundle of observation, reward, done flag and free-form info.

	    Per the section comment in this file, this is what ``env.step()`` and
	    ``env.reset()`` return.
	    """

	    observation: SentinelObservation
	    reward: SentinelReward
	    done: bool
	    info: dict[str, Any]


	# ---------------------------------------------------------------------------
	# State (what env.state() returns)
	# ---------------------------------------------------------------------------

	class SentinelState(BaseModel):
	    """Snapshot of an episode's bookkeeping.

	    Per the section comment in this file, this is what ``env.state()``
	    returns. ``session_id`` accepts ``None`` but has no default, so it must be
	    supplied explicitly; ``behavioral_fingerprints`` and
	    ``difficulty_profile`` default to ``None``.
	    """

	    episode_id: str
	    session_id: str | None
	    step_count: int
	    max_steps: int
	    total_reward: float
	    done: bool
	    scenario_id: str
	    task_type: TaskType
	    difficulty: str
	    status: EpisodeStatus
	    last_reward: float
	    subtasks_completed: int
	    subtasks_total: int
	    trust_snapshot: dict[str, float]
	    behavioral_fingerprints: dict[str, dict[str, Any]] | None = None
	    difficulty_profile: dict[str, Any] | None = None
	    adversarial_detections: int  # how many adversarial attempts caught
	    adversarial_poisonings: int  # how many slipped through


	# ---------------------------------------------------------------------------
	# Reset Request
	# ---------------------------------------------------------------------------

	class ResetRequest(BaseModel):
	    """Parameters for a reset request.

	    Every field is optional and defaults to ``None``. How an omitted field is
	    interpreted is decided by the code that consumes this model, not here.
	    """

	    task_type: TaskType | None = None
	    scenario_id: str | None = None
	    seed: int | None = None