---
# Manifest identity for the sentinel-env space.
spec_version: 1
name: sentinel-env
type: space

# Serving settings. `app` is quoted because the value itself contains a colon.
runtime: fastapi
app: "app:app"
port: 7860

# Quoted so the release number is always read as a string.
version: "1.0.0"

# Tags, one per line.
tags:
  - openenv
  - multi-agent
  - trust-calibration
  - adversarial
  - long-horizon
  - gpu-cluster

# Human-readable summary of the environment. Written as a folded scalar (`>`):
# single line breaks are joined with spaces, and the blank line separates the
# overview paragraph from the GPU-cluster paragraph.
description: >
  SENTINEL is a multi-agent trust calibration RL environment. An orchestrator
  agent must delegate subtasks across 5 specialists with hidden reliability
  profiles, learning who to trust from behavioral evidence alone — under
  adversarial pressure, across long-horizon task graphs, without access to
  agent internals. Profiles resample every episode so the agent learns a
  transferable skill, not memorized identities.

  The same API can also launch the GPU-cluster mode with mode=cluster or
  task_type=cluster_task3. In that mode, the environment simulates scarce GPU
  memory, job deadlines, worker progress reports, audit claims, false
  completions, and AI reliability failures such as loops, context drift, and
  hallucinated confidence.

# HTTP surface of the environment. Every entry under `endpoints` gives the
# method, the path, any inputs, and a one-line summary of the response.
# NOTE(review): `params` presumably means query-string parameters and `body`
# the request payload — confirm against the app.
api:
  base_url: https://xcodeaddy-sentinel-env.hf.space
  endpoints:
    # No inputs declared.
    health:
      method: GET
      path: /health
      returns: health status

    # No inputs declared.
    metadata:
      method: GET
      path: /metadata
      returns: task metadata, specialist descriptions, scenario summary

    # Every body field is optional. The response's info includes the
    # session_id that the session-scoped endpoints below ask for.
    reset:
      method: POST
      path: /reset
      body:
        task_type:
          type: string
          required: false
          # Values match the keys of the top-level tasks section.
          enum: [task1, task2, task3, cluster_task1, cluster_task2, cluster_task3]
        mode:
          type: string
          required: false
          enum: [abstract, cluster, gpu, gpu_cluster]
          # NOTE(review): gpu and gpu_cluster are presumably aliases of
          # cluster — confirm.
          note: set to cluster to run the GPU-cluster trust environment
        scenario_id:
          type: string
          required: false
        seed:
          type: integer
          required: false
        adaptive:
          type: boolean
          required: false
          # See the adaptive_curriculum section of this file.
          note: enables adaptive difficulty curriculum for Theme 4 demos
      returns: StepResult with observation, reward, done, info (includes session_id)

    # NOTE(review): session_id is declared as required under both params and
    # body — confirm whether the server really needs it in both places.
    step:
      method: POST
      path: /step
      params:
        session_id:
          type: string
          required: true
      body:
        session_id:
          type: string
          required: true
        task_type:
          type: string
          required: false
          enum: [task1, task2, task3, cluster_task1, cluster_task2, cluster_task3]
        # Per the field notes below: delegate and verify need specialist_id,
        # solve_independently needs subtask_response, and allocate and
        # request_info use worker_id in cluster mode.
        action_type:
          type: string
          required: true
          enum: [delegate, verify, solve_independently, skip, allocate, preempt, request_info, tick]
        specialist_id:
          type: string
          required: false
          enum: [S0, S1, S2, S3, S4]
          note: required for delegate and verify
        # Same S0-S4 value set as specialist_id.
        worker_id:
          type: string
          required: false
          enum: [S0, S1, S2, S3, S4]
          note: cluster mode worker slot for allocate/request_info
        job_id:
          type: string
          required: false
          note: cluster mode job id
        gpu_id:
          type: string
          required: false
          note: cluster mode GPU id
        subtask_response:
          type: string
          required: false
          note: required for solve_independently
        reasoning:
          type: string
          required: false
      returns: StepResult with reward, done, info

    # Session-scoped: session_id is required.
    state:
      method: GET
      path: /state
      params:
        session_id:
          type: string
          required: true
      returns: SentinelState with trust_snapshot, completion, adversarial stats

    # Session-scoped: session_id is required.
    reward_report:
      method: GET
      path: /reward-report
      params:
        session_id:
          type: string
          required: true
      returns: Reward component trace with per-step process-aware signals

    # No inputs declared.
    difficulty:
      method: GET
      path: /difficulty
      returns: adaptive curriculum controller state

    # Session-scoped: session_id is required.
    stream:
      method: GET
      path: /stream
      params:
        session_id:
          type: string
          required: true
      returns: text/event-stream trust snapshots for live dashboards

    # session_id is optional here.
    trust_dashboard:
      method: GET
      path: /trust-dashboard
      params:
        session_id:
          type: string
          required: false
      returns: browser dashboard with live S0-S4 trust bars

    # session_id is optional here.
    cluster_dashboard:
      method: GET
      path: /cluster-dashboard
      params:
        session_id:
          type: string
          required: false
      returns: browser dashboard with trust, cluster health, utilization, attacks, and AI reliability

# Session storage and process model of the service.
deployment:
  session_backend: single_process_memory
  # One worker goes with the in-process session store; see the note below.
  workers: 1
  # 1800 s = 30 minutes.
  session_ttl_seconds: 1800
  session_max_active: 256
  note: >
    Active SentinelEnv sessions are stored in one process with TTL/LRU cleanup.
    Multi-worker deployments require sticky sessions or a shared session store.

# Task catalogue. The keys match the task_type values accepted by /reset and
# /step. task1-task3 are sized in subtasks; cluster_task1-3 in jobs and gpus.
#
# Long reward formulas use a folded scalar with strip chomping (`>-`): the
# lines are joined with single spaces and no trailing newline is added, so
# each value is still one line of text.
#
# The weights in cluster_task1 and cluster_task2, and the terminal weights in
# task2, task3 and cluster_task3, each sum to 1.00.
tasks:
  task1:
    name: Single-Step Trust Decision
    difficulty: easy
    subtasks: 10
    max_steps: 15
    adversary_active: false
    reward: "0.99 correct delegation + stakes awareness | 0.02 skip penalty"

  task2:
    name: Multi-Step Delegation Chain
    difficulty: medium
    subtasks: 15
    max_steps: 30
    adversary_active: false
    reward: >-
      per-step accuracy + efficiency + confidence alignment + domain routing
      | terminal completion×0.65 + calibration×0.35

  task3:
    name: Full Adversarial Episode
    difficulty: hard
    subtasks: 20
    max_steps: 45
    adversary_active: true
    reward: >-
      step accuracy + stakes awareness + efficiency + confidence alignment
      + verification quality + domain routing
      | terminal completion×0.35 + detection×0.30 + calibration×0.25
      + efficiency×0.10

  cluster_task1:
    name: Cluster Basics
    difficulty: easy
    jobs: 10
    gpus: 8
    max_steps: 30
    adversary_active: false
    reward: "jobs_completed_rate×0.60 + avg_gpu_utilization×0.40"

  cluster_task2:
    name: Unreliable Workers
    difficulty: medium
    jobs: 20
    gpus: 12
    max_steps: 60
    adversary_active: false
    reward: "jobs×0.40 + worker_trust_calibration×0.30 + deadline_recovery×0.30"

  cluster_task3:
    name: Full Adversarial Cluster
    difficulty: hard
    jobs: 30
    gpus: 16
    max_steps: 120
    adversary_active: true
    reward: >-
      global_agent_score × cluster_health × ai_reliability_modifier
      | terminal jobs×0.30 + detection×0.25 + reward_hack_detection×0.20
      + plan×0.15 + efficiency×0.10

# Descriptive notes on how rewards are produced and combined.
# The three process signals below appear by name in the task2/task3 reward
# formulas of the tasks section (verification quality only in task3).
reward_engine_v2:
  source: verifier/execution-style behavioral outcomes
  granularity: step plus terminal trajectory
  aggregation: fixed weighted multi-signal reward
  process_signals:
    confidence_alignment: penalizes high confidence on wrong outputs
    domain_routing: rewards in-domain specialist behavior when metadata exists
    verification_quality: rewards verification when it catches high-stakes adversarial risk

# Descriptive notes on the GPU-cluster reward.
# Compare `aggregation` with the cluster_task3 reward formula in the tasks
# section, which multiplies by the same cluster_health and
# ai_reliability_modifier factors.
cluster_reward_engine:
  source: simulated GPU state transitions, worker reports, audit ledger, and adversary attacks
  granularity: per-step global health plus terminal cluster trajectory
  aggregation: per-agent rewards multiplied by cluster_health and ai_reliability_modifier
  # The first three signals line up with the failure modes named in
  # `description`: loops, context drift, and hallucinated confidence.
  process_signals:
    loop_avoidance: repeated no-progress actions reduce global reward
    context_memory_score: actions drifting from the persistent cluster goal reduce global reward
    hallucination_resistance: confidence_accuracy_gap penalizes confident wrong reports
    evaluation_freshness: scenario signature, shuffled profiles, and attack diversity resist memorization

# One-line behavior summary per specialist slot (S0-S4, the same ids used by
# specialist_id and worker_id in the /step body).
# NOTE(review): dataset.profile_shuffle is true and its note says the
# adversarial slot resamples every episode, so these slot-to-profile pairings
# presumably describe the profile set rather than a fixed assignment — confirm.
specialists:
  S0: "AccurateSlow — 90% accurate, costs 2 steps"
  S1: "OverconfidentFast — 60% accurate, always reports high confidence"
  S2: "DomainBound — 95% in-domain, 10% out-of-domain"
  # The 0.70 here equals thresholds.adversarial_trigger_stakes.
  S3: "Adversarial — benign until stakes >= 0.70, then poisons result"
  S4: "Degrading — 95% first call, -15% accuracy per reuse"

# Numeric cut-offs referenced elsewhere in this manifest.
thresholds:
  # Same value as the "stakes >= 0.70" trigger in the S3 specialist summary.
  adversarial_trigger_stakes: 0.70
  adversarial_awareness_stakes: 0.70
  critical_poison_stakes: 0.85
  # NOTE(review): presumably counted in environment steps — confirm.
  verify_extra_step_cost: 1

# Opt-in difficulty controller; its state is served by the /difficulty
# endpoint ("adaptive curriculum controller state").
adaptive_curriculum:
  # Names the `adaptive` boolean in the /reset body.
  optional_reset_flag: adaptive
  default_enabled: false
  controller_window_episodes: 20
  # Detection-rate bounds: above 0.70 moves harder, below 0.30 moves easier.
  harder_when_detection_rate_above: 0.70
  easier_when_detection_rate_below: 0.30
  knobs:
    - adversarial_threshold
    - high_stakes_ratio
    - verify_budget_penalty
    - adversary_confidence
  note: >
    Standard episodes remain deterministic by default. Passing adaptive=true to
    /reset enables a self-improving curriculum that tightens or relaxes task3
    difficulty based on recent adversarial detection performance.

# Scenario dataset summary.
dataset:
  # 40 + 40 + 40 in `split` adds up to this total.
  total_scenarios: 120
  # Only task1-task3 are listed; there are no cluster_task* entries here.
  split:
    task1: 40
    task2: 40
    task3: 40
  profile_shuffle: true
  note: >
    Specialist profiles (which slot is adversarial) resample every episode.
    Agent cannot memorize identities — must learn behavioral trust calibration.

# Reference baseline run and the settings recorded for reproducing it.
baseline:
  script: inference.py
  required_env_vars:
    - API_BASE_URL
    - MODEL_NAME
    - HF_TOKEN
  optional_env_vars:
    - ENV_URL
  latest_local_score: 0.8162
  # NOTE(review): 20 scenarios per task (see dataset_order) over task1-task3
  # would give these 60 episodes — confirm.
  latest_local_episodes: 60
  comparison_artifact: outputs/baseline_comparison.png
  reproducibility:
    inference_temperature: 0.0
    agent: heuristic-trust-weighted
    # Quoted because the text contains `*`, the YAML alias indicator.
    dataset_order: "fixed SCN-TASK*-001 through SCN-TASK*-020 per task"