Spaces:

Anvit25
/

Meta-SRE

Sleeping

App Files Files Community

Meta-SRE / openenv.yaml

Anvit25

Deploy Meta-SRE OpenEnv benchmark FastAPI server

ad6248e about 1 month ago

raw

history blame contribute delete

3.09 kB

	name: meta-sre
	version: "1.0.0"
	description: >
	OpenEnv environment for training LLM agents to act as Senior SREs.
	Simulates real Meta production incidents across 3 interconnected services
	with 5 difficulty levels, 10 engineering tools, and a self-improving
	difficulty controller (Theme 4: Self-Improvement).

	author: Meta-SRE Hackathon Team (Bhavya + Anvit)
	license: MIT

	endpoints:
	base_url: http://localhost:8000
	reset: POST /reset
	step: POST /step
	state: GET /state
	grade: GET /grade
	tools: GET /tools

	observation_space:
	type: object
	fields:
	- step: integer
	- incident_id: string
	- system_metrics: object # {service: ServiceMetrics}
	- active_alerts: array # List[Alert]
	- open_file: object # FileView \| null
	- terminal_output: string
	- git_diff: string # null if no edits yet
	- dependency_graph: object
	- sre_memory: array # agent's working notes
	- budget_remaining: integer # steps before SLA breach

	action_space:
	type: tool_call
	tools:
	- view_file
	- edit_line
	- run_tests
	- check_dependency
	- read_logs
	- git_blame
	- rollback
	- query_metrics_history
	- ask_senior_sre
	- write_incident_report

	reward:
	step_penalty: -0.1
	syntax_error_penalty: -0.5
	rollback_penalty: -1.0
	senior_sre_penalty: -0.2
	terminal_tests_pass: +1.0
	terminal_report_max: +0.5
	terminal_sla_bonus: +0.3
	terminal_no_regress: +0.2
	security_patch_bonus: +0.5 # Task 5 only
	max_possible: 3.0

	tasks:
	- id: 1
	difficulty: easy
	sla_budget: 15
	description: Single service AttributeError — hallucinated dict method

	- id: 2
	difficulty: medium
	sla_budget: 20
	description: Silent timestamp corruption in CAPI → ROAS degradation

	- id: 3
	difficulty: medium-hard
	sla_budget: 20
	description: DB connection pool exhaustion under load

	- id: 4
	difficulty: hard
	sla_budget: 25
	description: Circular FK migration cascading to 3 services (red herrings)

	- id: 5
	difficulty: hard
	sla_budget: 20
	description: PII data exposure via DEBUG_MODE=True (security incident)

	self_improvement:
	enabled: true
	controller: DifficultyController
	description: >
	After each episode the DifficultyController analyses which bug categories
	the agent failed on and weights future task selection toward those weaknesses.
	Bug categories: async_bugs, data_corruption, security_bugs,
	cascading_failures, red_herrings.

	usage_example: \|
	import requests

	BASE = "http://localhost:8000"

	obs = requests.post(f"{BASE}/reset", json={"task_id": 1}).json()
	done = False

	while not done:
	action = your_agent.decide(obs) # returns {"tool": ..., "params": ...}
	result = requests.post(f"{BASE}/step", json=action).json()
	obs = result["observation"]
	done = result["done"]

	score = requests.get(f"{BASE}/grade").json()["normalized_score"]
	print(f"Score: {score:.3f}")