# sentinel-env / openenv.yaml
# Commit a36db1b: Add GPU trust environment and GRPO replay pipeline
spec_version: 1
name: sentinel-env
type: space
runtime: fastapi
app: app:app
port: 7860
version: "1.0.0"
tags: [openenv, multi-agent, trust-calibration, adversarial, long-horizon, gpu-cluster]
description: >
  SENTINEL is a multi-agent trust calibration RL environment. An orchestrator
  agent must delegate subtasks across 5 specialists with hidden reliability
  profiles, learning whom to trust from behavioral evidence alone, under
  adversarial pressure, across long-horizon task graphs, and without access to
  agent internals. Profiles resample every episode, so the agent learns a
  transferable skill rather than memorized identities.

  The same API can also launch the GPU-cluster mode with mode=cluster or
  task_type=cluster_task3. In that mode, the environment simulates scarce GPU
  memory, job deadlines, worker progress reports, audit claims, false
  completions, and AI reliability failures such as loops, context drift, and
  hallucinated confidence.
api:
  base_url: https://xcodeaddy-sentinel-env.hf.space
  endpoints:
    health:
      method: GET
      path: /health
      returns: health status
    metadata:
      method: GET
      path: /metadata
      returns: task metadata, specialist descriptions, scenario summary
    reset:
      method: POST
      path: /reset
      body:
        task_type:
          type: string
          required: false
          enum: [task1, task2, task3, cluster_task1, cluster_task2, cluster_task3]
        mode:
          type: string
          required: false
          enum: [abstract, cluster, gpu, gpu_cluster]
          note: set to cluster to run the GPU-cluster trust environment
        scenario_id:
          type: string
          required: false
        seed:
          type: integer
          required: false
        adaptive:
          type: boolean
          required: false
          note: enables adaptive difficulty curriculum for Theme 4 demos
      returns: StepResult with observation, reward, done, info (includes session_id)
    step:
      method: POST
      path: /step
      params:
        session_id:
          type: string
          required: true
      body:
        session_id:
          type: string
          required: true
        task_type:
          type: string
          required: false
          enum: [task1, task2, task3, cluster_task1, cluster_task2, cluster_task3]
        action_type:
          type: string
          required: true
          enum: [delegate, verify, solve_independently, skip, allocate, preempt, request_info, tick]
        specialist_id:
          type: string
          required: false
          enum: [S0, S1, S2, S3, S4]
          note: required for delegate and verify
        worker_id:
          type: string
          required: false
          enum: [S0, S1, S2, S3, S4]
          note: cluster mode worker slot for allocate/request_info
        job_id:
          type: string
          required: false
          note: cluster mode job id
        gpu_id:
          type: string
          required: false
          note: cluster mode GPU id
        subtask_response:
          type: string
          required: false
          note: required for solve_independently
        reasoning:
          type: string
          required: false
      returns: StepResult with reward, done, info
    state:
      method: GET
      path: /state
      params:
        session_id:
          type: string
          required: true
      returns: SentinelState with trust_snapshot, completion, adversarial stats
    reward_report:
      method: GET
      path: /reward-report
      params:
        session_id:
          type: string
          required: true
      returns: reward component trace with per-step process-aware signals
    difficulty:
      method: GET
      path: /difficulty
      returns: adaptive curriculum controller state
    stream:
      method: GET
      path: /stream
      params:
        session_id:
          type: string
          required: true
      returns: text/event-stream trust snapshots for live dashboards
    trust_dashboard:
      method: GET
      path: /trust-dashboard
      params:
        session_id:
          type: string
          required: false
      returns: browser dashboard with live S0-S4 trust bars
    cluster_dashboard:
      method: GET
      path: /cluster-dashboard
      params:
        session_id:
          type: string
          required: false
      returns: browser dashboard with trust, cluster health, utilization, attacks, and AI reliability
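# A minimal session walkthrough against the endpoints above. This is a sketch,
# not an official client: it assumes the Space is reachable at base_url, that
# jq is installed, and that session_id sits at info.session_id in the reset
# response (the spec only says info "includes session_id").
#
#   BASE=https://xcodeaddy-sentinel-env.hf.space
#   curl -s "$BASE/health"
#   SID=$(curl -s -X POST "$BASE/reset" -H "Content-Type: application/json" \
#     -d '{"task_type": "task3", "seed": 7}' | jq -r '.info.session_id')
#   curl -s -X POST "$BASE/step?session_id=$SID" -H "Content-Type: application/json" \
#     -d '{"session_id": "'"$SID"'", "action_type": "delegate", "specialist_id": "S0"}'
#   curl -s "$BASE/state?session_id=$SID"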
deployment:
  session_backend: single_process_memory
  workers: 1
  session_ttl_seconds: 1800
  session_max_active: 256
  note: >
    Active SentinelEnv sessions are stored in one process with TTL/LRU cleanup.
    Multi-worker deployments require sticky sessions or a shared session store.
tasks:
  task1:
    name: Single-Step Trust Decision
    difficulty: easy
    subtasks: 10
    max_steps: 15
    adversary_active: false
    reward: "0.99 correct delegation + stakes awareness | 0.02 skip penalty"
  task2:
    name: Multi-Step Delegation Chain
    difficulty: medium
    subtasks: 15
    max_steps: 30
    adversary_active: false
    reward: "per-step accuracy + efficiency + confidence alignment + domain routing | terminal completion×0.65 + calibration×0.35"
  task3:
    name: Full Adversarial Episode
    difficulty: hard
    subtasks: 20
    max_steps: 45
    adversary_active: true
    reward: "step accuracy + stakes awareness + efficiency + confidence alignment + verification quality + domain routing | terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
  cluster_task1:
    name: Cluster Basics
    difficulty: easy
    jobs: 10
    gpus: 8
    max_steps: 30
    adversary_active: false
    reward: "jobs_completed_rate×0.60 + avg_gpu_utilization×0.40"
  cluster_task2:
    name: Unreliable Workers
    difficulty: medium
    jobs: 20
    gpus: 12
    max_steps: 60
    adversary_active: false
    reward: "jobs×0.40 + worker_trust_calibration×0.30 + deadline_recovery×0.30"
  cluster_task3:
    name: Full Adversarial Cluster
    difficulty: hard
    jobs: 30
    gpus: 16
    max_steps: 120
    adversary_active: true
    reward: "global_agent_score × cluster_health × ai_reliability_modifier | terminal jobs×0.30 + detection×0.25 + reward_hack_detection×0.20 + plan×0.15 + efficiency×0.10"
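# Worked example for the task3 terminal reward above, using illustrative
# numbers (not from a real run): with completion=0.8, detection=1.0,
# calibration=0.6, and efficiency=0.5, the terminal score is
#   0.35×0.8 + 0.30×1.0 + 0.25×0.6 + 0.10×0.5 = 0.28 + 0.30 + 0.15 + 0.05 = 0.78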
reward_engine_v2:
  source: verifier/execution-style behavioral outcomes
  granularity: step plus terminal trajectory
  aggregation: fixed weighted multi-signal reward
  process_signals:
    confidence_alignment: penalizes high confidence on wrong outputs
    domain_routing: rewards in-domain specialist behavior when metadata exists
    verification_quality: rewards verification when it catches high-stakes adversarial risk
cluster_reward_engine:
  source: simulated GPU state transitions, worker reports, audit ledger, and adversary attacks
  granularity: per-step global health plus terminal cluster trajectory
  aggregation: per-agent rewards multiplied by cluster_health and ai_reliability_modifier
  process_signals:
    loop_avoidance: repeated no-progress actions reduce global reward
    context_memory_score: actions drifting from the persistent cluster goal reduce global reward
    hallucination_resistance: confidence_accuracy_gap penalizes confident wrong reports
    evaluation_freshness: scenario signature, shuffled profiles, and attack diversity resist memorization
specialists:
  S0: "AccurateSlow — 90% accurate, costs 2 steps"
  S1: "OverconfidentFast — 60% accurate, always reports high confidence"
  S2: "DomainBound — 95% in-domain, 10% out-of-domain"
  S3: "Adversarial — benign until stakes >= 0.70, then poisons result"
  S4: "Degrading — 95% first call, -15% accuracy per reuse"
thresholds:
  adversarial_trigger_stakes: 0.70
  adversarial_awareness_stakes: 0.70
  critical_poison_stakes: 0.85
  verify_extra_step_cost: 1
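# Reading the thresholds above: a subtask with stakes 0.75 crosses
# adversarial_trigger_stakes (0.75 >= 0.70), so delegating it to the
# adversarial slot risks a poisoned result; at stakes >= 0.85 the poison
# counts as critical. Choosing verify instead costs 1 extra step
# (verify_extra_step_cost), trading efficiency for detection.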
adaptive_curriculum:
  optional_reset_flag: adaptive
  default_enabled: false
  controller_window_episodes: 20
  harder_when_detection_rate_above: 0.70
  easier_when_detection_rate_below: 0.30
  knobs:
    - adversarial_threshold
    - high_stakes_ratio
    - verify_budget_penalty
    - adversary_confidence
  note: >
    Standard episodes remain deterministic by default. Passing adaptive=true to
    /reset enables a self-improving curriculum that tightens or relaxes task3
    difficulty based on recent adversarial detection performance.
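# Example of enabling the curriculum (a sketch; body fields follow the /reset
# schema in the api section):
#
#   curl -s -X POST https://xcodeaddy-sentinel-env.hf.space/reset \
#     -H "Content-Type: application/json" \
#     -d '{"task_type": "task3", "adaptive": true}'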
dataset:
  total_scenarios: 120
  split:
    task1: 40
    task2: 40
    task3: 40
  profile_shuffle: true
  note: >
    Specialist profiles (which slot is adversarial) resample every episode, so
    the agent cannot memorize identities and must instead learn behavioral
    trust calibration.
baseline:
  script: inference.py
  required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
  optional_env_vars: [ENV_URL]
  latest_local_score: 0.8162
  latest_local_episodes: 60
  comparison_artifact: outputs/baseline_comparison.png
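# Example baseline invocation (a sketch: the values below are placeholders,
# and the HF token should come from a secret store, not version control):
#
#   export API_BASE_URL=https://api.example.com/v1        # placeholder inference endpoint
#   export MODEL_NAME=your-model-name                     # placeholder
#   export HF_TOKEN=hf_xxx                                # placeholder
#   export ENV_URL=https://xcodeaddy-sentinel-env.hf.space  # optional override
#   python inference.py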
reproducibility:
  inference_temperature: 0.0
  agent: heuristic-trust-weighted
  dataset_order: fixed SCN-TASK*-001 through SCN-TASK*-020 per task