File size: 8,914 Bytes
06b4790
9f5ce4f
06b4790
9f5ce4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06b4790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d50c3f9
06b4790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8be69b1
 
 
 
 
 
 
 
 
 
 
 
0a14522
 
 
 
 
 
 
 
 
 
 
 
d59268c
 
 
 
 
 
 
 
 
 
 
 
 
700603e
 
 
 
 
 
 
 
 
 
 
 
 
06b4790
 
 
 
 
 
 
 
 
 
 
77eea12
 
06b4790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8be69b1
 
0a14522
 
d59268c
 
06b4790
 
 
 
 
 
 
 
 
 
 
 
 
9f5ce4f
06b4790
 
 
 
9f5ce4f
 
 
 
 
 
 
bdd0439
 
 
9f5ce4f
 
 
 
 
 
 
 
 
 
 
 
 
 
06b4790
fca2aa4
 
 
 
 
06b4790
 
 
 
 
 
 
c5da483
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# Identity of the environment described by this file.
name: devops-incident-response
# Quoted so the version is kept as a string instead of being type-inferred.
version: "2.0.0"
# Folded scalar (`>`): the wrapped lines below are joined with single spaces
# into one paragraph. The example action names in parentheses use the same
# identifiers as the entries under `action_space.actions`.
description: >
  ARIA (Adaptive Reward & Incident Architecture) — an OpenEnv-compliant RL
  environment where AI agents learn to diagnose and remediate production
  software incidents under partial observability. Agents read logs, metrics,
  and alerts across a 12-service microservices architecture, then choose
  from 14 action types (restart_service, rollback, block_ip_range,
  create_index, failover, alert_oncall, and more). Seven curated tasks of
  escalating difficulty plus procedural seed-based generation provide a
  meaningful progression for benchmarking agent reasoning quality. Dense
  reward shaping with anti-gaming mechanisms (collateral damage penalty,
  blind remediation penalty, semantic diagnosis matching) ensures the reward
  signal is informative and resistant to exploitation. Curriculum engine
  tracks agent mastery per task and recommends adaptive training sequences.
  Multi-agent mode splits observability between an Observer (logs/alerts)
  and a Responder (metrics/dependencies), enabling communication and
  coordination research.

# Author identifier, quoted so it is always read as a string.
author: "Arijit-07"
# Keyword tags for this environment, one per line (block sequence).
tags:
  - openenv
  - devops
  - incident-response
  - real-world
  - multi-step
  - microservices
  - reward-shaping

# Task catalogue. Every entry carries an id, a display name, a folded
# scenario description, a difficulty label, a max_steps value, reward
# bounds, and two expected-score figures (random agent / strong LLM).
tasks:
  # "easy": one payment service crash-looping from a JVM heap memory leak;
  # per the description the intended remediation is restarting that service.
  - id: easy
    name: Single Service Anomaly
    description: >
      A payment service is crash-looping due to a JVM heap memory leak. Logs
      clearly show OutOfMemoryError and OOMKilled pod restarts. The agent
      must read logs/metrics, diagnose the memory leak, and restart the
      affected service without touching healthy services.
    difficulty: easy
    max_steps: 15
    reward_range: [0.0, 1.0]
    expected_score_random_agent: 0.05
    expected_score_strong_llm: 0.90

  # "medium": a bad inventory-service deployment cascades downstream, with a
  # red-herring alert on notification-service; the description calls for a
  # rollback of the root service rather than restarts of downstream services.
  - id: medium
    name: Cascading Multi-Service Failure
    description: >
      A bad deployment of inventory-service introduced connection pool
      exhaustion, cascading to order-service timeouts and api-gateway errors.
      A red-herring alert fires on notification-service (high CPU from a
      scheduled batch job). The agent must trace the cascade to the root
      service and rollback — not restart downstream victims.
    difficulty: medium
    max_steps: 20
    reward_range: [0.0, 1.0]
    expected_score_random_agent: 0.03
    expected_score_strong_llm: 0.55

  # "hard": silent price corruption with no error-rate or latency alerts;
  # per the description, full credit needs a pipeline rollback AND an
  # on-call alert.
  - id: hard
    name: Silent Data Corruption
    description: >
      A data pipeline deployment silently writes incorrect price values to the
      product catalog. No standard error-rate or latency alerts fire — all
      services show green health. The signal is buried in price-validation
      WARN logs (15% mismatch rate) and an analytics anomaly (avg order value
      9x baseline). Full credit requires both rollback of the pipeline AND
      alerting on-call for a data audit.
    difficulty: hard
    max_steps: 25
    reward_range: [0.0, 1.0]
    expected_score_random_agent: 0.01
    expected_score_strong_llm: 0.35

  # "bonus": two unrelated failures at once (log-aggregator disk full and an
  # ml-inference-service reload loop); both must be fixed for full credit.
  - id: bonus
    name: Simultaneous Dual Failure
    description: >
      Two independent failures strike at once: log-aggregator disk is 100%
      full (causing log loss across all services) and ml-inference-service
      is stuck in a model reload CPU loop. Neither failure is related to
      the other. Full credit requires fixing both root causes
      independently.
    difficulty: hard
    max_steps: 25
    reward_range: [0.0, 1.0]
    expected_score_random_agent: 0.01
    expected_score_strong_llm: 0.40

  # "security": DDoS plus credential stuffing against the login endpoint;
  # the description expects the attack CIDR to be blocked and the security
  # team alerted.
  - id: security
    name: Security Incident (DDoS)
    description: >
      A botnet is performing a DDoS and credential stuffing attack against
      the login endpoint. The API gateway and Auth service are overwhelmed.
      The agent must read access logs, diagnose the attack IP range, block
      the CIDR, and alert the security team.
    difficulty: hard
    max_steps: 20
    reward_range: [0.0, 1.0]
    expected_score_random_agent: 0.01
    expected_score_strong_llm: 0.35

  # "database": a migration added a column without an index, causing full
  # table scans; the description accepts either adding the index or rolling
  # back the migration.
  - id: database
    name: Database Performance Degradation (Missing Index)
    description: >
      A database migration ran 15 minutes ago that added a new column but
      forgot to add an index. Now queries are doing full table scans
      sequentially, leading to major DB degradation. The agent must read
      the Postgres slow query logs, evaluate sequential scan rates via
      metrics, and correctly assign a missing index or rollback the
      migration.
    difficulty: hard
    max_steps: 20
    reward_range: [0.0, 1.0]
    expected_score_random_agent: 0.01
    expected_score_strong_llm: 0.35

  # "failover": us-east-1 is degraded by a network partition; the description
  # names which services support automatic multi-region failover and which
  # do not, and states that failing over the wrong ones is penalised.
  - id: failover
    name: Multi-Region Failover
    description: >
      A primary datacenter region (us-east-1) is degraded due to a network
      partition. The agent must correctly identify which services support
      automatic multi-region failover (api-gateway, cdn-service,
      order-service, redis-cache) and which do not (payment-service,
      postgres-primary). Failing over the wrong services causes severe data
      inconsistency penalties.
    difficulty: hard
    max_steps: 25
    reward_range: [0.0, 1.0]
    expected_score_random_agent: 0.01
    expected_score_strong_llm: 0.25

  # "generated": seed-driven procedural incident; the description gives the
  # seed range as integers 0-99999. Its difficulty label is `variable`
  # rather than a fixed tier.
  - id: generated
    name: Procedural Incident
    description: >
      A seed-based procedural incident generated by ARIA's IncidentFactory.
      Deterministic and reproducible — any integer seed 0-99999 produces a
      unique, consistent incident scenario. Failure modes include OOM,
      cascade, corruption, security breaches, database degradation, and
      network partition.
    difficulty: variable
    max_steps: 20
    reward_range: [0.0, 1.0]
    expected_score_random_agent: 0.02
    expected_score_strong_llm: 0.60

# Action space: a structured type, a prose summary of the Action fields, and
# the list of action names with one-line descriptions (14 entries).
action_space:
  type: structured
  # NOTE(review): several action descriptions below mention arguments (a
  # query string, an alert ID, a CIDR range, a table and column, a target
  # region) that are not among the fields named in this summary — confirm
  # how those values are passed.
  description: >
    Discrete action types with optional service/parameter arguments. Actions
    are expressed as Pydantic Action objects with fields: action_type,
    service, root_cause, runbook, version, reason.
  # Sequence order is kept exactly as declared.
  actions:
    - name: diagnose
      description: Record the agent's root cause hypothesis
    - name: read_logs
      description: Read recent log lines for a named service
    - name: search_logs
      description: Search log lines for a service matching a query string
    - name: read_metrics
      description: Read CPU, memory, error rate, latency for a named service
    - name: read_runbook
      description: Read an operational runbook by filename
    - name: restart_service
      description: Restart a named service (clears memory, resets connections)
    - name: rollback
      description: Roll back a service to a previous version
    - name: scale_up
      description: Increase replica count for a named service
    - name: alert_oncall
      description: Page the on-call engineering team
    - name: acknowledge
      description: Acknowledge an active alert by ID
    - name: noop
      description: Take no action this step
    - name: block_ip_range
      description: Block traffic from an IP range (CIDR format)
    - name: create_index
      description: Create a database index on a specific table and column
    - name: failover
      description: Failover a service to a different target region

# Observation space: a structured type plus a prose inventory of what the
# Observation object contains.
observation_space:
  type: structured
  description: >
    Pydantic Observation object containing: current step, task
    description, list of ServiceStatus objects (name, status, cpu, memory,
    error_rate, latency_p99, replicas, version, last_deployed), list of
    Alert objects (severity, service, message, acknowledged), recent log
    lines per service (dict of service_name -> last 10 lines), available
    runbook names, last action result/error, and incident timing info.

# Reward signal: dense, with the clamped bounds given in `range` and
# restated in the description.
reward:
  type: dense
  # NOTE(review): every per-task `reward_range` in this file is [0.0, 1.0],
  # while this range is [0.001, 0.999] — confirm the difference is intended.
  range: [0.001, 0.999]
  description: >
    Partial credit for information gathering, correct diagnosis, and precise
    remediation. Penalties for collateral damage (restarting healthy
    services), excessive noops, and treating symptoms instead of root
    causes. Efficiency bonus for fast resolution. Rewards clamped to
    [0.001, 0.999] to avoid dead gradients in RL training. Anti-gaming
    mechanisms: collateral_damage_penalty, blind_remediation_penalty,
    semantic diagnosis matching (fuzzy match against ground truth root
    cause).

# Training run summary as recorded in this file: algorithm, base model,
# adapter URL, episode count, framework, and result figures.
training:
  algorithm: GRPO
  model: Llama-3.1-8B-Instruct
  adapter: https://huggingface.co/Arijit-07/aria-devops-llama8b
  episodes: 160
  framework: HuggingFace TRL + Unsloth
  # Presumably scores before (`*_pre`) and after (`*_post`) training for the
  # easy/medium/hard tasks — TODO confirm.
  results:
    easy_pre: 0.42
    easy_post: 0.87
    medium_pre: 0.18
    medium_post: 0.51
    hard_pre: 0.05
    hard_post: 0.22
    # NOTE(review): the three listed deltas (0.45, 0.33, 0.17) average to
    # about 0.317; confirm whether 0.31 is truncated, derived from unrounded
    # scores, or averaged over a different set of tasks.
    average_improvement: 0.31

# Feature flags; all three are set to true here.
aria_features:
  curriculum_engine: true
  incident_generator: true
  dual_agent_mode: true

# WebSocket interface: endpoint path, message protocol, and the listed
# command names.
websocket:
  endpoint: /ws
  protocol: json
  commands:
    - reset
    - step
    - state

# Container settings: base image, port number, and the URL paths recorded
# for each endpoint.
docker:
  base_image: python:3.11-slim
  port: 7860
  health_endpoint: /health
  reset_endpoint: /reset
  step_endpoint: /step
  state_endpoint: /state
  metrics_endpoint: /metrics
  leaderboard_endpoint: /leaderboard