File size: 5,212 Bytes
c745a99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""Tests for warmup-tier tasks — verifies every task executes and grades correctly.

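Each positive test sends the correct AWS CLI command for a warmup task against
MiniStack and asserts the grader returns task_achieved=True with reward=1.0.
Each negative test sends an unrelated command and asserts the task is not
marked as achieved. A coverage test checks that every task in warmup.yaml has
a command mapped in this file.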

Run inside Docker:
    docker exec aws-rl-env python -m pytest tests/test_warmup_tasks.py -v
"""

import pytest
import yaml
from pathlib import Path

from models import SuccessCriteria, Task, TaskID, TaskDifficulty, SetupCommand
from server.services.simulator_strategy import SimulatorStrategy
from server.services.task_grader import TaskGrader
from server.services.episode_tracker import EpisodeTracker

# Location of the warmup task definitions, resolved relative to this file:
# <two directories up>/server/services/tasks/warmup.yaml
TASKS_FILE = Path(__file__).resolve().parent.parent.joinpath(
    "server", "services", "tasks", "warmup.yaml"
)

# Mapping of task_id -> correct AWS CLI command.
#
# Keys are the integer task IDs used to look up entries loaded from
# TASKS_FILE; values are the command each test executes for that task.
# The IDs are not contiguous (0-5, then 27-45). Every command here is a
# list/describe/get style call. test_all_warmup_tasks_have_commands fails if
# a task_id present in the YAML is missing from this table.
WARMUP_COMMANDS: dict[int, str] = {
    0: "aws s3 ls",
    1: "aws ec2 describe-instances",
    2: "aws dynamodb list-tables",
    3: "aws lambda list-functions",
    4: "aws sqs list-queues",
    5: "aws sns list-topics",
    27: "aws iam list-users",
    28: "aws secretsmanager list-secrets",
    29: "aws ecs list-clusters",
    30: "aws rds describe-db-instances",
    31: "aws elasticache describe-cache-clusters",
    32: "aws athena list-named-queries",
    33: "aws glue get-databases",
    34: "aws firehose list-delivery-streams",
    35: "aws emr list-clusters",
    36: "aws apigatewayv2 get-apis",
    37: "aws route53 list-hosted-zones",
    38: "aws elbv2 describe-load-balancers",
    39: "aws ec2 describe-volumes",
    40: "aws efs describe-file-systems",
    # The only entry that passes an option; presumably because
    # list-user-pools requires --max-results -- confirm against the AWS CLI
    # reference before removing it.
    41: "aws cognito-idp list-user-pools --max-results 10",
    42: "aws ssm describe-parameters",
    43: "aws events list-rules",
    44: "aws cloudformation list-stacks",
    45: "aws apigateway get-rest-apis",
}


@pytest.fixture(scope="module")
def backend() -> SimulatorStrategy:
    """Provide a single SimulatorStrategy shared by all tests in this module."""
    strategy = SimulatorStrategy()
    return strategy


@pytest.fixture(scope="module")
def grader(backend: SimulatorStrategy) -> TaskGrader:
    """Provide a module-scoped TaskGrader built on the shared ``backend``."""
    task_grader = TaskGrader(backend)
    return task_grader


@pytest.fixture(scope="module")
def warmup_tasks() -> list[dict]:
    """Load the raw warmup task entries from ``TASKS_FILE``.

    Returns:
        The parsed top-level YAML list, one dict per task. The file is read
        once per test module because the fixture is module-scoped.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which differs between environments.
    with open(TASKS_FILE, encoding="utf-8") as f:
        tasks = yaml.safe_load(f)

    # An empty or non-list document would otherwise surface later as an
    # opaque TypeError inside a test; fail here with a clear message.
    assert isinstance(tasks, list), (
        f"Expected a top-level list of tasks in {TASKS_FILE}, "
        f"got {type(tasks).__name__}"
    )
    return tasks


def _build_task(entry: dict) -> Task:
    """Convert one raw YAML task entry into a ``Task`` model.

    ``task_id`` and ``description`` are required keys. ``success_criteria``
    and ``setup_commands`` are optional and default to empty. A setup command
    may be a plain string (used as the ``command`` field) or a mapping of
    ``SetupCommand`` keyword arguments. The difficulty is always WARMUP.
    """
    task_id = TaskID(entry["task_id"])
    description = entry["description"]
    criteria = SuccessCriteria(**entry.get("success_criteria", {}))

    # Accept both the shorthand string form and the full mapping form.
    setup_commands = [
        SetupCommand(command=raw) if isinstance(raw, str) else SetupCommand(**raw)
        for raw in entry.get("setup_commands", [])
    ]

    return Task(
        task_id=task_id,
        difficulty=TaskDifficulty.WARMUP,
        description=description,
        success_criteria=criteria,
        setup_commands=setup_commands,
    )


def test_all_warmup_tasks_have_commands(warmup_tasks: list[dict]) -> None:
    """Fail if any task_id in the warmup YAML lacks an entry in WARMUP_COMMANDS."""
    yaml_ids = [task["task_id"] for task in warmup_tasks]
    missing = [tid for tid in yaml_ids if tid not in WARMUP_COMMANDS]
    assert not missing, f"No test command mapped for task_ids: {missing}"


@pytest.mark.parametrize(
    "task_id",
    sorted(WARMUP_COMMANDS),
    ids=[f"task_{tid}" for tid in sorted(WARMUP_COMMANDS)],
)
def test_warmup_task_grading(
    task_id: int,
    warmup_tasks: list[dict],
    backend: SimulatorStrategy,
    grader: TaskGrader,
) -> None:
    """Run the mapped command for ``task_id`` and check it is graded as achieved.

    The command is executed through ``backend``, recorded as the only step of
    a fresh EpisodeTracker, and graded; the result must report the task as
    achieved with a reward of exactly 1.0.
    """
    # Find the first YAML entry for this task_id (None when absent).
    entry = None
    for candidate in warmup_tasks:
        if candidate["task_id"] == task_id:
            entry = candidate
            break
    assert entry is not None, f"task_id {task_id} not found in warmup.yaml"

    task = _build_task(entry)
    command = WARMUP_COMMANDS[task_id]

    # Execute against MiniStack; the command itself must succeed.
    ok, out, err = backend.execute_command(command)
    assert ok, f"Command failed: {command}\nstderr: {err}"

    # Grade the single recorded step in a brand-new episode.
    episode = EpisodeTracker()
    recorded = episode.record_step(command, ok, out, err)
    outcome = grader.grade(task, episode, recorded)

    assert outcome.task_achieved, (
        f"Task {task_id} not achieved.\n"
        f"  Command: {command}\n"
        f"  Reason: {outcome.reason}\n"
        f"  Reward: {outcome.reward}"
    )
    assert outcome.reward == 1.0, f"Expected reward=1.0, got {outcome.reward}"


@pytest.mark.parametrize(
    "task_id",
    sorted(WARMUP_COMMANDS),
    ids=[f"task_{tid}_wrong_cmd" for tid in sorted(WARMUP_COMMANDS)],
)
def test_warmup_task_rejects_wrong_command(
    task_id: int,
    warmup_tasks: list[dict],
    backend: SimulatorStrategy,
    grader: TaskGrader,
) -> None:
    """Check that an unrelated command never satisfies a warmup task.

    The same fixed command is used for every task. Whether it succeeds is not
    asserted; only the grading outcome is checked.
    """
    # Find the first YAML entry for this task_id (None when absent).
    entry = None
    for candidate in warmup_tasks:
        if candidate["task_id"] == task_id:
            entry = candidate
            break
    assert entry is not None, f"task_id {task_id} not found in warmup.yaml"

    task = _build_task(entry)

    # Deliberately wrong: a service that no command in WARMUP_COMMANDS uses.
    wrong_cmd = "aws sts get-caller-identity"

    ok, out, err = backend.execute_command(wrong_cmd)
    episode = EpisodeTracker()
    recorded = episode.record_step(wrong_cmd, ok, out, err)
    outcome = grader.grade(task, episode, recorded)

    assert not outcome.task_achieved, (
        f"Task {task_id} should NOT be achieved with wrong command '{wrong_cmd}'"
    )
    assert outcome.reward < 1.0