# File: sentinel_env/tests/test_grading.py
# Commit 33dd3ee (KaushikSarveswaran): Initial submission: OpenEnv-Sentinel SRE triage environment
"""Tests for per-task grading: perfect scores, partial credit, wrong service, destructive penalty."""
import pytest
class TestTask1Grading:
    """Score-threshold checks for ``task1.grade_resolution``."""

    def test_perfect_resolution(self, task1):
        """The resolution this test treats as perfect must score at least 0.80."""
        resolution = dict(
            root_cause="Missing DB_CONNECTION_STRING env var after deploy v2.3.1",
            affected_service="payment-api",
            recommendation="Rollback to v2.3.0 or set the DB_CONNECTION_STRING env var",
        )
        graded = task1.grade_resolution(resolution, step_count=3)
        score = graded["score"]
        assert score >= 0.80

    def test_wrong_service(self, task1):
        """Same root cause as the perfect case, but a different affected service."""
        resolution = dict(
            root_cause="Missing DB_CONNECTION_STRING env var after deploy v2.3.1",
            affected_service="order-service",
            recommendation="Rollback",
        )
        graded = task1.grade_resolution(resolution, step_count=3)
        score = graded["score"]
        # The affected_service portion of the score is expected to be forfeited.
        assert score <= 0.85

    def test_empty_resolution(self, task1):
        """A resolution whose three fields are all empty strings scores at most 0.20."""
        field_names = ("root_cause", "affected_service", "recommendation")
        blank_resolution = dict.fromkeys(field_names, "")
        graded = task1.grade_resolution(blank_resolution, step_count=1)
        score = graded["score"]
        assert score <= 0.20
class TestTask2Grading:
    """Score-threshold checks for ``task2.grade_resolution``."""

    def test_perfect_resolution(self, task2):
        """The resolution this test treats as perfect must score at least 0.70."""
        resolution = dict(
            root_cause="inventory-service OOM memory leak from batch processing causing checkout-service timeout",
            affected_service="inventory-service",
            recommendation="Increase memory limit to 1Gi and reduce batch size or stream results",
        )
        graded = task2.grade_resolution(resolution, step_count=4)
        score = graded["score"]
        assert score >= 0.70

    def test_wrong_root_cause(self, task2):
        """Same affected service as the perfect case, but a network root cause: below 0.40."""
        resolution = dict(
            root_cause="Network partition",
            affected_service="inventory-service",
            recommendation="Restart networking",
        )
        graded = task2.grade_resolution(resolution, step_count=3)
        score = graded["score"]
        assert score < 0.40
class TestTask3Grading:
    """Score-threshold checks for ``task3.grade_resolution``."""

    def test_perfect_resolution(self, task3):
        """The resolution this test treats as perfect must score at least 0.70."""
        resolution = dict(
            root_cause="analytics-worker long-running query exhausted the connection pool, cascade to auth-service, user-profile-service, notification-service",
            affected_service="postgres-primary",
            recommendation="Kill the query and set statement_timeout, use read replica for analytics",
        )
        graded = task3.grade_resolution(resolution, step_count=5)
        score = graded["score"]
        assert score >= 0.70

    def test_blames_notification_deploy(self, task3):
        """Blaming the notification-service deploy must score at most 0.20."""
        resolution = dict(
            root_cause="notification-service deploy v3.1 caused the failure",
            affected_service="notification-service",
            recommendation="Rollback notification-service",
        )
        graded = task3.grade_resolution(resolution, step_count=3)
        score = graded["score"]
        # Expected to score poorly: both the root cause and the affected
        # service are wrong.
        assert score <= 0.20

    def test_partial_credit_pool_only(self, task3):
        """Naming only the exhausted pool must land in the 0.20-0.65 band."""
        resolution = dict(
            root_cause="postgres connection pool exhausted and full",
            affected_service="postgres-primary",
            recommendation="Increase pool size",
        )
        graded = task3.grade_resolution(resolution, step_count=3)
        score = graded["score"]
        # The pool is identified, but analytics-worker is not mentioned.
        assert 0.20 <= score <= 0.65