File size: 3,088 Bytes
ad6248e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Package metadata for the Meta-SRE OpenEnv environment.
name: meta-sre
version: '1.0.0'  # quoted so the version is always read as a string
# Folded scalar: the lines below are joined with single spaces when parsed.
description: >
  OpenEnv environment for training LLM agents to act as Senior SREs.
  Simulates real Meta production incidents across 3 interconnected
  services with 5 difficulty levels, 10 engineering tools, and a
  self-improving difficulty controller (Theme 4: Self-Improvement).

author: Meta-SRE Hackathon Team (Bhavya + Anvit)
license: MIT

# HTTP routes served by the environment. Each route value is written as
# "<METHOD> <path>" with exactly one space; the usage example in this file
# appends the path to base_url. Do not column-align the method and path:
# padding placed inside a plain scalar becomes part of the parsed value.
endpoints:
  base_url: http://localhost:8000
  reset: POST /reset
  step: POST /step
  state: GET /state
  grade: GET /grade
  tools: GET /tools

# Shape of the observation object. Each entry under `fields` is a
# single-key mapping of field name to its declared type.
observation_space:
  type: object
  fields:
    - step: integer
    - incident_id: string
    # {service: ServiceMetrics}
    - system_metrics: object
    # List[Alert]
    - active_alerts: array
    # FileView | null
    - open_file: object
    - terminal_output: string
    # null if no edits yet
    - git_diff: string
    - dependency_graph: object
    # agent's working notes
    - sre_memory: array
    # steps before SLA breach
    - budget_remaining: integer

# An action is a tool call naming one of the 10 tools listed below. The
# usage example in this file sends it as {"tool": ..., "params": ...}.
action_space:
  type: tool_call
  tools:
    - view_file
    - edit_line
    - run_tests
    - check_dependency
    - read_logs
    - git_blame
    - rollback
    - query_metrics_history
    - ask_senior_sre
    - write_incident_report

# Reward constants. Negative values are penalties; values written with an
# explicit "+" are bonuses.
reward:
  # Penalties
  step_penalty: -0.1
  syntax_error_penalty: -0.5
  rollback_penalty: -1.0
  senior_sre_penalty: -0.2

  # Bonuses
  terminal_tests_pass: +1.0
  terminal_report_max: +0.5
  terminal_sla_bonus: +0.3
  terminal_no_regress: +0.2
  # Task 5 only
  security_patch_bonus: +0.5

  # NOTE(review): the five bonuses listed above sum to 2.5, not 3.0 -
  # confirm how this ceiling is derived.
  max_possible: 3.0

# Task catalogue: one entry per incident scenario, ordered by id.
# `sla_budget` is presumably the step budget that the observation reports
# as `budget_remaining` - TODO confirm against the server implementation.
tasks:
  - id: 1
    difficulty: easy
    sla_budget: 15
    description: Single service AttributeError - hallucinated dict method

  - id: 2
    difficulty: medium
    sla_budget: 20
    description: Silent timestamp corruption in CAPI - ROAS degradation

  - id: 3
    difficulty: medium-hard
    sla_budget: 20
    description: DB connection pool exhaustion under load

  - id: 4
    difficulty: hard
    sla_budget: 25
    description: Circular FK migration cascading to 3 services (red herrings)

  - id: 5
    difficulty: hard
    sla_budget: 20
    description: PII data exposure via DEBUG_MODE=True (security incident)

# Adaptive-difficulty settings.
self_improvement:
  enabled: true
  controller: DifficultyController
  # Folded scalar: keep every line at the same indentation. A line indented
  # further than the others is not folded; its line break and extra leading
  # spaces are kept verbatim in the parsed value.
  description: >
    After each episode the DifficultyController analyses which bug
    categories the agent failed on and weights future task selection
    toward those weaknesses.
    Bug categories: async_bugs, data_corruption, security_bugs,
    cascading_failures, red_herrings.

# Illustrative Python client, stored verbatim as a literal block scalar.
# It resets the environment with task_id 1, posts one action per loop
# iteration to /step until the response's "done" flag is true, then reads
# "normalized_score" from /grade. `your_agent` is not defined in the
# snippet; the reader must supply it.
usage_example: |
  import requests

  BASE = "http://localhost:8000"

  obs   = requests.post(f"{BASE}/reset", json={"task_id": 1}).json()
  done  = False

  while not done:
      action = your_agent.decide(obs)          # returns {"tool": ..., "params": ...}
      result = requests.post(f"{BASE}/step", json=action).json()
      obs    = result["observation"]
      done   = result["done"]

  score = requests.get(f"{BASE}/grade").json()["normalized_score"]
  print(f"Score: {score:.3f}")