Spaces:

NoNameFound
/

sentinel_env

Sleeping

App Files Files Community

sentinel_env / tests /test_grading.py

KaushikSarveswaran

Initial submission: OpenEnv-Sentinel SRE triage environment

33dd3ee about 1 month ago

raw

history blame contribute delete

3.58 kB

	"""Tests for per-task grading: perfect scores, partial credit, wrong service, destructive penalty."""

	import pytest


	class TestTask1Grading:
	    """Threshold checks on the score returned by ``task1.grade_resolution``.

	    Every test submits one resolution dict (``root_cause``,
	    ``affected_service``, ``recommendation``) together with a
	    ``step_count`` and inspects the ``"score"`` entry of the result.
	    """

	    def test_perfect_resolution(self, task1):
	        """The resolution this suite treats as perfect scores at least 0.80."""
	        resolution = {
	            "root_cause": "Missing DB_CONNECTION_STRING env var after deploy v2.3.1",
	            "affected_service": "payment-api",
	            "recommendation": "Rollback to v2.3.0 or set the DB_CONNECTION_STRING env var",
	        }
	        graded = task1.grade_resolution(resolution, step_count=3)
	        score = graded["score"]
	        assert score >= 0.80

	    def test_wrong_service(self, task1):
	        """Naming ``order-service`` as the affected service caps the score at 0.85."""
	        resolution = {
	            "root_cause": "Missing DB_CONNECTION_STRING env var after deploy v2.3.1",
	            "affected_service": "order-service",
	            "recommendation": "Rollback",
	        }
	        graded = task1.grade_resolution(resolution, step_count=3)
	        score = graded["score"]
	        # Should lose the affected_service points
	        assert score <= 0.85

	    def test_empty_resolution(self, task1):
	        """A resolution with all three fields empty scores at most 0.20."""
	        blank_fields = ("root_cause", "affected_service", "recommendation")
	        resolution = {field: "" for field in blank_fields}
	        graded = task1.grade_resolution(resolution, step_count=1)
	        score = graded["score"]
	        assert score <= 0.20


	class TestTask2Grading:
	    """Threshold checks on the score returned by ``task2.grade_resolution``.

	    Every test submits one resolution dict together with a ``step_count``
	    and inspects the ``"score"`` entry of the result.
	    """

	    def test_perfect_resolution(self, task2):
	        """The resolution this suite treats as perfect scores at least 0.70."""
	        resolution = {
	            "root_cause": "inventory-service OOM memory leak from batch processing causing checkout-service timeout",
	            "affected_service": "inventory-service",
	            "recommendation": "Increase memory limit to 1Gi and reduce batch size or stream results",
	        }
	        graded = task2.grade_resolution(resolution, step_count=4)
	        score = graded["score"]
	        assert score >= 0.70

	    def test_wrong_root_cause(self, task2):
	        """Blaming a network partition scores below 0.40.

	        The affected service submitted here is still ``inventory-service``;
	        only the root cause and recommendation differ from the perfect case.
	        """
	        resolution = {
	            "root_cause": "Network partition",
	            "affected_service": "inventory-service",
	            "recommendation": "Restart networking",
	        }
	        graded = task2.grade_resolution(resolution, step_count=3)
	        score = graded["score"]
	        assert score < 0.40


	class TestTask3Grading:
	    """Threshold checks on the score returned by ``task3.grade_resolution``.

	    Every test submits one resolution dict together with a ``step_count``
	    and inspects the ``"score"`` entry of the result.
	    """

	    def test_perfect_resolution(self, task3):
	        """The resolution this suite treats as perfect scores at least 0.70."""
	        resolution = {
	            "root_cause": "analytics-worker long-running query exhausted the connection pool, cascade to auth-service, user-profile-service, notification-service",
	            "affected_service": "postgres-primary",
	            "recommendation": "Kill the query and set statement_timeout, use read replica for analytics",
	        }
	        graded = task3.grade_resolution(resolution, step_count=5)
	        score = graded["score"]
	        assert score >= 0.70

	    def test_blames_notification_deploy(self, task3):
	        """Blaming the notification-service deploy scores at most 0.20."""
	        resolution = {
	            "root_cause": "notification-service deploy v3.1 caused the failure",
	            "affected_service": "notification-service",
	            "recommendation": "Rollback notification-service",
	        }
	        graded = task3.grade_resolution(resolution, step_count=3)
	        score = graded["score"]
	        # Should score poorly — wrong root cause and wrong affected service
	        assert score <= 0.20

	    def test_partial_credit_pool_only(self, task3):
	        """Naming only the exhausted pool lands in the 0.20-0.65 band."""
	        resolution = {
	            "root_cause": "postgres connection pool exhausted and full",
	            "affected_service": "postgres-primary",
	            "recommendation": "Increase pool size",
	        }
	        graded = task3.grade_resolution(resolution, step_count=3)
	        score = graded["score"]
	        # Pool identified but not analytics-worker
	        assert 0.20 <= score <= 0.65