Spaces:

TokenTraveler
/

oncall-env

Sleeping

App Files Files Community

oncall-env / run_simulation.py

TokenTraveler

added project files.

34bd75f about 1 month ago

raw

history blame contribute delete

7.14 kB

	"""
	Interactive simulation runner — demonstrates OnCallEnv with optimal agent strategies.
	Shows step-by-step investigation and remediation for all 6 tasks.
	"""

	import sys
	sys.path.insert(0, ".")

	from environment import OnCallEnvironment
	from models import Action
	from graders import grade_task


	def banner(text: str, width: int = 70) -> None:
	    """Print ``text`` framed above and below by a rule of ``=`` characters.

	    The output is a blank line, a rule, the text prefixed by one space,
	    and a closing rule.

	    Args:
	        text: Heading to display between the two rules.
	        width: Number of ``=`` characters in each rule. Defaults to 70,
	            the width previously hard-coded in this function.
	    """
	    rule = "=" * width  # built once, printed above and below the heading
	    print("\n" + rule)
	    print(f" {text}")
	    print(rule)


	def _print_result_preview(result: str, limit: int = 5) -> None:
	    """Print at most ``limit`` lines of ``result`` behind a ``|`` gutter."""
	    lines = result.split("\n")
	    for line in lines[:limit]:
	        print(f" | {line}")
	    hidden = len(lines) - limit
	    if hidden > 0:
	        # Say how much was cut so truncated output is not mistaken for complete.
	        print(f" | ... ({hidden} more lines)")


	def run_task(env: OnCallEnvironment, task_id: str, actions: list[str]):
	    """Run a task with a predefined action sequence and print each step.

	    Resets ``env`` to ``task_id``, prints the initial observation (goal,
	    step budget, services, alerts), then submits each command in ``actions``
	    as an ``Action``. For every step it prints the command, a preview of
	    the action result limited to five lines, an OK/ERROR status and the
	    running reward total. The loop stops early as soon as a step response
	    reports ``done``. Afterwards the final environment state is graded with
	    ``grade_task`` and a result summary is printed.

	    Args:
	        env: Environment instance to reset and step.
	        task_id: Identifier passed to ``env.reset`` and ``grade_task``.
	        actions: Command strings to submit, in order.

	    Returns:
	        ``state.score`` taken from ``env.state()`` after the last step.
	    """
	    obs = env.reset(task_id)
	    print(f"\n Task: {obs.task_id}")
	    print(f" Goal: {obs.goal}")
	    print(f" Max steps: {obs.max_steps}")
	    print(f" Services: {', '.join(obs.services)}")
	    print(" Alerts:")
	    for alert in obs.alerts:
	        print(f" [{alert.severity.upper()}] {alert.service}: {alert.message}")
	    print()

	    for i, cmd in enumerate(actions, 1):
	        resp = env.step(Action(command=cmd))
	        obs = resp.observation
	        status = "ERROR" if obs.last_action_error else "OK"
	        print(f" Step {i}: {cmd}")
	        # last_action_result may be None/empty; fall back to "" so split() works.
	        _print_result_preview(obs.last_action_result or "")
	        print(f" [{status}] Running score: {resp.reward.total}")
	        print()

	        if resp.done:
	            # The environment ended the episode; remaining commands are skipped.
	            print(f" >>> Episode finished: {resp.info.get('reason', '?')}")
	            break

	    state = env.state()
	    grader_score = grade_task(task_id, state)
	    print("\n -- Final Results --")
	    print(f" Env Score: {state.score}")
	    print(f" Grader Score: {grader_score}")
	    print(" Breakdown:")
	    for k, v in state.reward_breakdown.items():
	        print(f" {k:20s} {v:+.3f}")
	    print(f" Steps used: {state.step}")
	    print(f" Root cause ID: {state.root_cause_identified}")
	    print(f" Remediation: {state.remediation_applied}")
	    return state.score


	def main():
	    """Replay the scripted action sequence for every task and print a summary.

	    Each scenario is announced with ``banner`` and executed through
	    ``run_task`` against one shared ``OnCallEnvironment``. The returned
	    scores are then listed per task id together with their average.
	    """
	    # One entry per task, in execution order:
	    # (banner title, task id, scripted commands).
	    scenarios = [
	        # ── EASY ──────────────────────────────────────────────────────────
	        (
	            "EASY: Memory Leak in Payment Service",
	            "easy_memory_leak",
	            [
	                "check_logs payment-service",
	                "check_metrics payment-service",
	                "check_metrics api-gateway",
	                "restart_service payment-service",
	                "mark_resolved payment-service memory leak OOM out of memory causing repeated kills",
	            ],
	        ),
	        # ── MEDIUM ────────────────────────────────────────────────────────
	        (
	            "MEDIUM: Cascading Connection Pool Exhaustion",
	            "medium_cascading_failure",
	            [
	                "check_metrics api-gateway",
	                "check_logs api-gateway",
	                "check_dependencies api-gateway",
	                "check_metrics order-service",
	                "check_logs order-service",
	                "check_config order-service",
	                "update_config order-service db_pool_size 50",
	                "mark_resolved order-service connection pool exhausted db_pool_size config changed to 5 by auto-scaler",
	            ],
	        ),
	        # ── HARD (Cache) ──────────────────────────────────────────────────
	        (
	            "HARD: Subtle Cache Bug Causing Cross-Service Degradation",
	            "hard_cache_degradation",
	            [
	                "check_metrics api-gateway",
	                "check_metrics order-service",
	                "check_metrics product-service",
	                "check_metrics cache-service",
	                "check_logs cache-service",
	                "check_deploy_history cache-service",
	                "check_metrics postgres-primary",
	                "rollback_deploy cache-service",
	                "mark_resolved cache-service deployment changed key hashing algorithm causing 60% cache miss rate",
	            ],
	        ),
	        # ── MEDIUM (DNS) ──────────────────────────────────────────────────
	        (
	            "MEDIUM: DNS Misconfiguration Causing Intermittent Failures",
	            "medium_dns_misconfiguration",
	            [
	                "check_metrics order-service",
	                "check_logs order-service",
	                "check_config order-service",
	                "check_metrics inventory-service",
	                "check_metrics api-gateway",
	                "update_config order-service inventory_host inventory-service.internal",
	                "mark_resolved order-service dns hostname misconfiguration inventory_host pointed to decommissioned host",
	            ],
	        ),
	        # ── HARD (Replication) ────────────────────────────────────────────
	        (
	            "HARD: Database Replication Lag from Runaway Batch Job",
	            "hard_replication_lag",
	            [
	                "check_metrics user-service",
	                "check_logs user-service",
	                "check_metrics order-service",
	                "check_logs order-service",
	                "check_metrics postgres-primary",
	                "check_logs postgres-primary",
	                "check_config postgres-primary",
	                "check_metrics postgres-replica",
	                "update_config postgres-primary batch_job_enabled false",
	                "mark_resolved postgres-primary batch job nightly_aggregation running during peak hours causing replication lag",
	            ],
	        ),
	        # ── EXPERT (Multi-Root-Cause) ────────────────────────────────────
	        (
	            "EXPERT: Simultaneous Bad Deployment and Config Drift",
	            "expert_multi_root_cause",
	            [
	                "check_metrics api-gateway",
	                "check_logs api-gateway",
	                "check_metrics search-service",
	                "check_logs search-service",
	                "check_deploy_history search-service",
	                "check_metrics order-service",
	                "check_logs order-service",
	                "check_config order-service",
	                "check_metrics elasticsearch",
	                "rollback_deploy search-service",
	                "update_config order-service db_pool_size 50",
	                "mark_resolved search-service bad deployment v3.1.0 broke elasticsearch query AND order-service db_pool_size config reduced to 3 by capacity-planner both issues fixed",
	            ],
	        ),
	    ]

	    env = OnCallEnvironment()
	    scores = {}
	    for title, scenario_id, commands in scenarios:
	        banner(title)
	        scores[scenario_id] = run_task(env, scenario_id, commands)

	    # ── SUMMARY ───────────────────────────────────────────────────────
	    banner("SIMULATION SUMMARY")
	    running_total = 0.0
	    for scenario_id, task_score in scores.items():
	        print(f" {scenario_id:35s} {task_score:.3f}")
	        running_total += task_score
	    mean_score = running_total / len(scores)
	    print(f" {'':35s} -----")
	    print(f" {'Average':35s} {mean_score:.3f}")
	    print()


	# Run the scripted simulation only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()