"""Tests for warmup-tier tasks — verifies every task executes and grades correctly.

Each test sends the correct AWS CLI command for a warmup task against MiniStack
and asserts the grader returns task_achieved=True with reward=1.0.

Run inside Docker:
    docker exec aws-rl-env python -m pytest tests/test_warmup_tasks.py -v
"""
import pytest
import yaml
from pathlib import Path
from models import SuccessCriteria, Task, TaskID, TaskDifficulty, SetupCommand
from server.services.simulator_strategy import SimulatorStrategy
from server.services.task_grader import TaskGrader
from server.services.episode_tracker import EpisodeTracker
# Location of the warmup task definitions: two directory levels above this
# file, then server/services/tasks/warmup.yaml.
TASKS_FILE = Path(__file__).resolve().parents[1].joinpath(
    "server", "services", "tasks", "warmup.yaml"
)
# Lookup table used by the tests below: warmup task_id -> the AWS CLI command
# that is expected to satisfy that task.
WARMUP_COMMANDS: dict[int, str] = {
    # Task ids 0-5
    0: "aws s3 ls",
    1: "aws ec2 describe-instances",
    2: "aws dynamodb list-tables",
    3: "aws lambda list-functions",
    4: "aws sqs list-queues",
    5: "aws sns list-topics",
    # Task ids 27-45 (ids 6-26 have no entry in this table)
    27: "aws iam list-users",
    28: "aws secretsmanager list-secrets",
    29: "aws ecs list-clusters",
    30: "aws rds describe-db-instances",
    31: "aws elasticache describe-cache-clusters",
    32: "aws athena list-named-queries",
    33: "aws glue get-databases",
    34: "aws firehose list-delivery-streams",
    35: "aws emr list-clusters",
    36: "aws apigatewayv2 get-apis",
    37: "aws route53 list-hosted-zones",
    38: "aws elbv2 describe-load-balancers",
    39: "aws ec2 describe-volumes",
    40: "aws efs describe-file-systems",
    41: "aws cognito-idp list-user-pools --max-results 10",
    42: "aws ssm describe-parameters",
    43: "aws events list-rules",
    44: "aws cloudformation list-stacks",
    45: "aws apigateway get-rest-apis",
}
@pytest.fixture(scope="module")
def backend() -> SimulatorStrategy:
    """Provide one SimulatorStrategy instance shared by the tests in this module."""
    simulator = SimulatorStrategy()
    return simulator
@pytest.fixture(scope="module")
def grader(backend: SimulatorStrategy) -> TaskGrader:
    """Provide a module-scoped TaskGrader constructed from the ``backend`` fixture."""
    task_grader = TaskGrader(backend)
    return task_grader
@pytest.fixture(scope="module")
def warmup_tasks() -> list[dict]:
    """Load the raw warmup task entries from ``TASKS_FILE`` once per module.

    Returns:
        Whatever ``yaml.safe_load`` parses from the file; the tests in this
        module iterate it as a list of task mappings.
    """
    # Pin the encoding: without it, open() decodes with the platform's locale
    # codec, so non-ASCII text in the YAML could load differently (or fail)
    # depending on where the tests run.
    with open(TASKS_FILE, encoding="utf-8") as f:
        return yaml.safe_load(f)
def _build_task(entry: dict) -> Task:
    """Build a Task model from a raw YAML entry.

    Args:
        entry: One task mapping from the warmup YAML. ``task_id`` and
            ``description`` are required; ``success_criteria`` and
            ``setup_commands`` are optional.

    Returns:
        A Task whose difficulty is always ``TaskDifficulty.WARMUP``.

    Raises:
        KeyError: If ``task_id`` or ``description`` is absent from ``entry``.
    """
    # A YAML key written with no value parses to None instead of being absent,
    # so ``.get(key, default)`` alone would hand None to ``**`` / the loop and
    # raise TypeError. Treat an explicit null the same as a missing key.
    criteria = entry.get("success_criteria") or {}
    raw_setup = entry.get("setup_commands") or []
    return Task(
        task_id=TaskID(entry["task_id"]),
        difficulty=TaskDifficulty.WARMUP,
        description=entry["description"],
        success_criteria=SuccessCriteria(**criteria),
        setup_commands=[
            # Setup commands may be given as a bare string or as a mapping of
            # SetupCommand fields.
            SetupCommand(command=cmd) if isinstance(cmd, str) else SetupCommand(**cmd)
            for cmd in raw_setup
        ],
    )
def test_all_warmup_tasks_have_commands(warmup_tasks: list[dict]) -> None:
    """Check that no task in the YAML lacks an entry in WARMUP_COMMANDS."""
    unmapped = []
    for task_entry in warmup_tasks:
        tid = task_entry["task_id"]
        if tid not in WARMUP_COMMANDS:
            unmapped.append(tid)
    assert not unmapped, f"No test command mapped for task_ids: {unmapped}"
@pytest.mark.parametrize(
    "task_id",
    sorted(WARMUP_COMMANDS),
    ids=[f"task_{tid}" for tid in sorted(WARMUP_COMMANDS)],
)
def test_warmup_task_grading(
    task_id: int,
    warmup_tasks: list[dict],
    backend: SimulatorStrategy,
    grader: TaskGrader,
) -> None:
    """Run the mapped command for one warmup task and expect full credit.

    The command must execute successfully, and the grader must report the
    task as achieved with a reward of exactly 1.0.
    """
    # Locate the first YAML entry carrying this task_id.
    entry = None
    for candidate in warmup_tasks:
        if candidate["task_id"] == task_id:
            entry = candidate
            break
    assert entry is not None, f"task_id {task_id} not found in warmup.yaml"

    task = _build_task(entry)
    cmd = WARMUP_COMMANDS[task_id]

    # Run the command through the simulator backend.
    ok, out, err = backend.execute_command(cmd)
    assert ok, f"Command failed: {cmd}\nstderr: {err}"

    # Record the step in a fresh episode and have the grader score it.
    episode = EpisodeTracker()
    recorded = episode.record_step(cmd, ok, out, err)
    result = grader.grade(task, episode, recorded)

    assert result.task_achieved, (
        f"Task {task_id} not achieved.\n"
        f" Command: {cmd}\n"
        f" Reason: {result.reason}\n"
        f" Reward: {result.reward}"
    )
    assert result.reward == 1.0, f"Expected reward=1.0, got {result.reward}"
@pytest.mark.parametrize(
    "task_id",
    sorted(WARMUP_COMMANDS),
    ids=[f"task_{tid}_wrong_cmd" for tid in sorted(WARMUP_COMMANDS)],
)
def test_warmup_task_rejects_wrong_command(
    task_id: int,
    warmup_tasks: list[dict],
    backend: SimulatorStrategy,
    grader: TaskGrader,
) -> None:
    """Grade an unrelated command against each warmup task and expect no credit."""
    # Locate the first YAML entry carrying this task_id.
    entry = None
    for candidate in warmup_tasks:
        if candidate["task_id"] == task_id:
            entry = candidate
            break
    assert entry is not None, f"task_id {task_id} not found in warmup.yaml"

    task = _build_task(entry)

    # An STS call is not the mapped command for any task in WARMUP_COMMANDS.
    wrong_cmd = "aws sts get-caller-identity"
    ok, out, err = backend.execute_command(wrong_cmd)

    episode = EpisodeTracker()
    recorded = episode.record_step(wrong_cmd, ok, out, err)
    result = grader.grade(task, episode, recorded)

    assert not result.task_achieved, (
        f"Task {task_id} should NOT be achieved with wrong command '{wrong_cmd}'"
    )
    assert result.reward < 1.0
|