aws_rl_env / tests_tasks /test_warmup_tasks.py
Sizzing's picture
Upload folder using huggingface_hub
c745a99 verified
"""Tests for warmup-tier tasks — verifies every task executes and grades correctly.
Each test sends the correct AWS CLI command for a warmup task against MiniStack
and asserts the grader returns task_achieved=True with reward=1.0.
Run inside Docker:
docker exec aws-rl-env python -m pytest tests/test_warmup_tasks.py -v
"""
import pytest
import yaml
from pathlib import Path
from models import SuccessCriteria, Task, TaskID, TaskDifficulty, SetupCommand
from server.services.simulator_strategy import SimulatorStrategy
from server.services.task_grader import TaskGrader
from server.services.episode_tracker import EpisodeTracker
TASKS_FILE = (
Path(__file__).resolve().parent.parent
/ "server"
/ "services"
/ "tasks"
/ "warmup.yaml"
)
# Mapping of task_id -> correct AWS CLI command
WARMUP_COMMANDS: dict[int, str] = {
0: "aws s3 ls",
1: "aws ec2 describe-instances",
2: "aws dynamodb list-tables",
3: "aws lambda list-functions",
4: "aws sqs list-queues",
5: "aws sns list-topics",
27: "aws iam list-users",
28: "aws secretsmanager list-secrets",
29: "aws ecs list-clusters",
30: "aws rds describe-db-instances",
31: "aws elasticache describe-cache-clusters",
32: "aws athena list-named-queries",
33: "aws glue get-databases",
34: "aws firehose list-delivery-streams",
35: "aws emr list-clusters",
36: "aws apigatewayv2 get-apis",
37: "aws route53 list-hosted-zones",
38: "aws elbv2 describe-load-balancers",
39: "aws ec2 describe-volumes",
40: "aws efs describe-file-systems",
41: "aws cognito-idp list-user-pools --max-results 10",
42: "aws ssm describe-parameters",
43: "aws events list-rules",
44: "aws cloudformation list-stacks",
45: "aws apigateway get-rest-apis",
}
@pytest.fixture(scope="module")
def backend() -> SimulatorStrategy:
return SimulatorStrategy()
@pytest.fixture(scope="module")
def grader(backend: SimulatorStrategy) -> TaskGrader:
return TaskGrader(backend)
@pytest.fixture(scope="module")
def warmup_tasks() -> list[dict]:
with open(TASKS_FILE) as f:
return yaml.safe_load(f)
def _build_task(entry: dict) -> Task:
"""Build a Task model from a raw YAML entry."""
return Task(
task_id=TaskID(entry["task_id"]),
difficulty=TaskDifficulty.WARMUP,
description=entry["description"],
success_criteria=SuccessCriteria(**entry.get("success_criteria", {})),
setup_commands=[
SetupCommand(command=cmd) if isinstance(cmd, str) else SetupCommand(**cmd)
for cmd in entry.get("setup_commands", [])
],
)
def test_all_warmup_tasks_have_commands(warmup_tasks: list[dict]) -> None:
"""Every warmup task in the YAML must have a corresponding test command."""
missing = [
t["task_id"] for t in warmup_tasks if t["task_id"] not in WARMUP_COMMANDS
]
assert not missing, f"No test command mapped for task_ids: {missing}"
@pytest.mark.parametrize(
"task_id",
sorted(WARMUP_COMMANDS.keys()),
ids=[f"task_{tid}" for tid in sorted(WARMUP_COMMANDS.keys())],
)
def test_warmup_task_grading(
task_id: int,
warmup_tasks: list[dict],
backend: SimulatorStrategy,
grader: TaskGrader,
) -> None:
"""Send the correct command for a warmup task and verify it grades as achieved."""
entry = next((t for t in warmup_tasks if t["task_id"] == task_id), None)
assert entry is not None, f"task_id {task_id} not found in warmup.yaml"
task = _build_task(entry)
cmd = WARMUP_COMMANDS[task_id]
# Execute against MiniStack
success, stdout, stderr = backend.execute_command(cmd)
assert success, f"Command failed: {cmd}\nstderr: {stderr}"
# Grade the step
tracker = EpisodeTracker()
step = tracker.record_step(cmd, success, stdout, stderr)
result = grader.grade(task, tracker, step)
assert result.task_achieved, (
f"Task {task_id} not achieved.\n"
f" Command: {cmd}\n"
f" Reason: {result.reason}\n"
f" Reward: {result.reward}"
)
assert result.reward == 1.0, f"Expected reward=1.0, got {result.reward}"
@pytest.mark.parametrize(
"task_id",
sorted(WARMUP_COMMANDS.keys()),
ids=[f"task_{tid}_wrong_cmd" for tid in sorted(WARMUP_COMMANDS.keys())],
)
def test_warmup_task_rejects_wrong_command(
task_id: int,
warmup_tasks: list[dict],
backend: SimulatorStrategy,
grader: TaskGrader,
) -> None:
"""A wrong command should not achieve a warmup task."""
entry = next((t for t in warmup_tasks if t["task_id"] == task_id), None)
assert entry is not None, f"task_id {task_id} not found in warmup.yaml"
task = _build_task(entry)
# Use a deliberately wrong command (different service)
wrong_cmd = "aws sts get-caller-identity"
success, stdout, stderr = backend.execute_command(wrong_cmd)
tracker = EpisodeTracker()
step = tracker.record_step(wrong_cmd, success, stdout, stderr)
result = grader.grade(task, tracker, step)
assert not result.task_achieved, (
f"Task {task_id} should NOT be achieved with wrong command '{wrong_cmd}'"
)
assert result.reward < 1.0