# Spaces: XcodeAddy / sentinel-env (Running)
# File size: 9,167 Bytes

# Manifest header: environment identity and how the app is served.
spec_version: 1
name: 'sentinel-env'
type: 'space'
runtime: 'fastapi'
# Quoted so the embedded colon is never read as a mapping indicator;
# presumably a module:attribute import string -- confirm with the runtime.
app: 'app:app'
port: 7860
version: '1.0.0'
# Discovery tags, one per line for cleaner diffs.
tags:
  - openenv
  - multi-agent
  - trust-calibration
  - adversarial
  - long-horizon
  - gpu-cluster

# Human-readable summary. Folded scalar (>): line breaks inside a paragraph
# become spaces, and the blank line separates the two paragraphs.
description: >
  SENTINEL is a multi-agent trust calibration RL environment. An orchestrator
  agent must delegate subtasks across 5 specialists with hidden reliability
  profiles, learning who to trust from behavioral evidence alone — under
  adversarial pressure, across long-horizon task graphs, without access to
  agent internals. Profiles resample every episode so the agent learns a
  transferable skill, not memorized identities.

  The same API can also launch the GPU-cluster mode with mode=cluster or
  task_type=cluster_task3. In that mode, the environment simulates scarce GPU
  memory, job deadlines, worker progress reports, audit claims, false
  completions, and AI reliability failures such as loops, context drift, and
  hallucinated confidence.

# HTTP API of the environment: one entry per endpoint, giving its method,
# path, inputs (query `params` and/or JSON `body`), and a prose summary of
# what it returns.
api:
  base_url: https://xcodeaddy-sentinel-env.hf.space
  endpoints:
    health:
      method: GET
      path: /health
      returns: health status

    metadata:
      method: GET
      path: /metadata
      returns: task metadata, specialist descriptions, scenario summary

    # Every body field is optional. The returned info carries the session_id
    # that the session-scoped endpoints below take as input.
    reset:
      method: POST
      path: /reset
      body:
        task_type:
          type: string
          required: false
          enum: [task1, task2, task3, cluster_task1, cluster_task2, cluster_task3]
        # NOTE(review): gpu and gpu_cluster look like aliases of cluster; the
        # note only documents cluster -- confirm against the server.
        mode:
          type: string
          required: false
          enum: [abstract, cluster, gpu, gpu_cluster]
          note: set to cluster to run the GPU-cluster trust environment
        scenario_id:
          type: string
          required: false
        seed:
          type: integer
          required: false
        adaptive:
          type: boolean
          required: false
          note: enables adaptive difficulty curriculum for Theme 4 demos
      returns: StepResult with observation, reward, done, info (includes session_id)

    # NOTE(review): session_id is declared required in BOTH params and body.
    # Confirm whether clients must send it twice or either location suffices.
    step:
      method: POST
      path: /step
      params:
        session_id:
          type: string
          required: true
      body:
        session_id:
          type: string
          required: true
        # Same enum as the /reset task_type field.
        task_type:
          type: string
          required: false
          enum: [task1, task2, task3, cluster_task1, cluster_task2, cluster_task3]
        action_type:
          type: string
          required: true
          enum: [delegate, verify, solve_independently, skip, allocate, preempt, request_info, tick]
        # The fields below are marked required: false at schema level; their
        # notes say which action_type values need them.
        specialist_id:
          type: string
          required: false
          enum: [S0, S1, S2, S3, S4]
          note: required for delegate and verify
        # Uses the same S0-S4 ids as specialist_id.
        worker_id:
          type: string
          required: false
          enum: [S0, S1, S2, S3, S4]
          note: cluster mode worker slot for allocate/request_info
        job_id:
          type: string
          required: false
          note: cluster mode job id
        gpu_id:
          type: string
          required: false
          note: cluster mode GPU id
        subtask_response:
          type: string
          required: false
          note: required for solve_independently
        reasoning:
          type: string
          required: false
      returns: StepResult with reward, done, info

    state:
      method: GET
      path: /state
      params:
        session_id:
          type: string
          required: true
      returns: SentinelState with trust_snapshot, completion, adversarial stats

    reward_report:
      method: GET
      path: /reward-report
      params:
        session_id:
          type: string
          required: true
      returns: Reward component trace with per-step process-aware signals

    # Takes no session parameter.
    difficulty:
      method: GET
      path: /difficulty
      returns: adaptive curriculum controller state

    stream:
      method: GET
      path: /stream
      params:
        session_id:
          type: string
          required: true
      returns: text/event-stream trust snapshots for live dashboards

    # Both dashboards accept session_id but do not require it.
    trust_dashboard:
      method: GET
      path: /trust-dashboard
      params:
        session_id:
          type: string
          required: false
      returns: browser dashboard with live S0-S4 trust bars

    cluster_dashboard:
      method: GET
      path: /cluster-dashboard
      params:
        session_id:
          type: string
          required: false
      returns: browser dashboard with trust, cluster health, utilization, attacks, and AI reliability

# Session hosting limits. Per the note, sessions live in one process's
# memory, which is why a single worker is configured.
deployment:
  session_backend: single_process_memory
  workers: 1
  session_ttl_seconds: 1800  # 30 minutes
  session_max_active: 256
  note: >
    Active SentinelEnv sessions are stored in one process with TTL/LRU cleanup.
    Multi-worker deployments require sticky sessions or a shared session store.

# Task presets, keyed by the task_type values accepted by /reset and /step.
# task1-task3 are sized in subtasks; cluster_task1-cluster_task3 are sized in
# jobs and gpus. Only the two hard presets set adversary_active: true.
# The reward values are free-text summaries; wherever x-weights are listed
# (every preset except task1) each weighted group sums to 1.00.
tasks:
  task1:
    name: Single-Step Trust Decision
    difficulty: easy
    subtasks: 10
    max_steps: 15
    adversary_active: false
    reward: "0.99 correct delegation + stakes awareness | 0.02 skip penalty"

  task2:
    name: Multi-Step Delegation Chain
    difficulty: medium
    subtasks: 15
    max_steps: 30
    adversary_active: false
    reward: "per-step accuracy + efficiency + confidence alignment + domain routing | terminal completion×0.65 + calibration×0.35"

  task3:
    name: Full Adversarial Episode
    difficulty: hard
    subtasks: 20
    max_steps: 45
    adversary_active: true
    reward: "step accuracy + stakes awareness + efficiency + confidence alignment + verification quality + domain routing | terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"

  cluster_task1:
    name: Cluster Basics
    difficulty: easy
    jobs: 10
    gpus: 8
    max_steps: 30
    adversary_active: false
    reward: "jobs_completed_rate×0.60 + avg_gpu_utilization×0.40"

  cluster_task2:
    name: Unreliable Workers
    difficulty: medium
    jobs: 20
    gpus: 12
    max_steps: 60
    adversary_active: false
    reward: "jobs×0.40 + worker_trust_calibration×0.30 + deadline_recovery×0.30"

  cluster_task3:
    name: Full Adversarial Cluster
    difficulty: hard
    jobs: 30
    gpus: 16
    max_steps: 120
    adversary_active: true
    reward: "global_agent_score × cluster_health × ai_reliability_modifier | terminal jobs×0.30 + detection×0.25 + reward_hack_detection×0.20 + plan×0.15 + efficiency×0.10"

# Descriptive metadata for the reward computation. The three process signals
# are the ones named in the task3 reward string (task2 names the first two).
# NOTE(review): presumably applies to task1-task3 only -- confirm.
reward_engine_v2:
  source: verifier/execution-style behavioral outcomes
  granularity: step plus terminal trajectory
  aggregation: fixed weighted multi-signal reward
  process_signals:
    confidence_alignment: penalizes high confidence on wrong outputs
    domain_routing: rewards in-domain specialist behavior when metadata exists
    verification_quality: rewards verification when it catches high-stakes adversarial risk

# Descriptive metadata for the GPU-cluster reward. The aggregation line uses
# the same cluster_health and ai_reliability_modifier factors that appear in
# the cluster_task3 reward string.
cluster_reward_engine:
  source: simulated GPU state transitions, worker reports, audit ledger, and adversary attacks
  granularity: per-step global health plus terminal cluster trajectory
  aggregation: per-agent rewards multiplied by cluster_health and ai_reliability_modifier
  process_signals:
    loop_avoidance: repeated no-progress actions reduce global reward
    context_memory_score: actions drifting from the persistent cluster goal reduce global reward
    hallucination_resistance: confidence_accuracy_gap penalizes confident wrong reports
    evaluation_freshness: scenario signature, shuffled profiles, and attack diversity resist memorization

# The five specialist behavior profiles, keyed by the S0-S4 ids used in the
# /step specialist_id and worker_id enums.
# NOTE(review): dataset.profile_shuffle says the adversarial slot resamples
# every episode, so the slot-to-profile pairing shown here is presumably a
# reference assignment rather than a fixed one -- confirm.
specialists:
  S0: "AccurateSlow — 90% accurate, costs 2 steps"
  S1: "OverconfidentFast — 60% accurate, always reports high confidence"
  S2: "DomainBound — 95% in-domain, 10% out-of-domain"
  S3: "Adversarial — benign until stakes >= 0.70, then poisons result"
  S4: "Degrading — 95% first call, -15% accuracy per reuse"

# Stakes cut-offs and step costs. adversarial_trigger_stakes matches the
# "stakes >= 0.70" trigger quoted in the S3 specialist description.
thresholds:
  adversarial_trigger_stakes: 0.70
  adversarial_awareness_stakes: 0.70
  critical_poison_stakes: 0.85
  verify_extra_step_cost: 1

# Opt-in difficulty controller. optional_reset_flag names the /reset body
# field (adaptive) that switches it on; it is off unless that flag is passed.
adaptive_curriculum:
  optional_reset_flag: adaptive
  default_enabled: false
  controller_window_episodes: 20
  # Detection-rate bounds that move difficulty up or down.
  harder_when_detection_rate_above: 0.70
  easier_when_detection_rate_below: 0.30
  # Names of the difficulty parameters the controller adjusts.
  knobs:
    - adversarial_threshold
    - high_stakes_ratio
    - verify_budget_penalty
    - adversary_confidence
  note: >
    Standard episodes remain deterministic by default. Passing adaptive=true to
    /reset enables a self-improving curriculum that tightens or relaxes task3
    difficulty based on recent adversarial detection performance.

# Scenario inventory. The split sums to total_scenarios (40 + 40 + 40 = 120);
# only task1-task3 are listed, the cluster tasks have no entry here.
dataset:
  total_scenarios: 120
  split:
    task1: 40
    task2: 40
    task3: 40
  profile_shuffle: true
  note: >
    Specialist profiles (which slot is adversarial) resample every episode.
    Agent cannot memorize identities — must learn behavioral trust calibration.

# Reference baseline run: the script, the environment variables it reads,
# and the most recent locally measured result.
baseline:
  script: 'inference.py'
  required_env_vars:
    - API_BASE_URL
    - MODEL_NAME
    - HF_TOKEN
  optional_env_vars:
    - ENV_URL
  latest_local_score: 0.8162
  # Presumably 20 scenarios for each of three tasks (see dataset_order) -- confirm.
  latest_local_episodes: 60
  comparison_artifact: 'outputs/baseline_comparison.png'
  # Settings recorded so the score above can be reproduced.
  reproducibility:
    inference_temperature: 0.0
    agent: 'heuristic-trust-weighted'
    dataset_order: 'fixed SCN-TASK*-001 through SCN-TASK*-020 per task'