File size: 5,697 Bytes
78ea1a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""Tests for MLOpsEnvironment — core episode flow, state management, and step logic."""

import pytest
from mlops_environment import MLOpsEnvironment, TASK_MAX_STEPS
from models import MLOpsAction


class TestReset:
    """Verify that reset() hands back a fresh, well-formed starting observation."""

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_reset_returns_valid_observation(self, task_id):
        """A new episode reports its task, zero steps, its step budget, and nothing read."""
        environment = MLOpsEnvironment(task_id=task_id)
        initial = environment.reset(seed=42)

        assert initial.task_id == task_id
        assert initial.step_count == 0
        assert initial.max_steps == TASK_MAX_STEPS[task_id]
        assert initial.done is False
        # Six artifacts are expected to be offered at the start of every task.
        assert len(initial.available_artifacts) == 6
        assert initial.artifacts_read == []

    def test_reset_with_same_seed_is_deterministic(self):
        """Two independent environments reset with one seed agree on run id and bug type."""
        first_env, second_env = [MLOpsEnvironment(task_id="easy") for _ in range(2)]
        first_obs = first_env.reset(seed=123)
        second_obs = second_env.reset(seed=123)

        assert first_obs.run_id == second_obs.run_id
        assert first_env.bug_type == second_env.bug_type

    def test_reset_with_different_seeds_varies(self):
        """Resetting one environment with another seed yields a different run id."""
        environment = MLOpsEnvironment(task_id="easy")
        earlier_run_id = environment.reset(seed=1).run_id
        later_obs = environment.reset(seed=999)

        assert later_obs.run_id != earlier_run_id

    def test_reset_clears_previous_episode(self):
        """State accumulated by a step is wiped by the next reset()."""
        environment = MLOpsEnvironment(task_id="easy")
        environment.reset(seed=42)

        # Take one step so there is something for reset() to clear.
        read_action = MLOpsAction(action_type="read_config")
        environment.step(read_action)
        assert len(environment._artifacts_read) == 1

        environment.reset(seed=42)
        assert len(environment._artifacts_read) == 0
        assert environment._step_count == 0


class TestStepActions:
    """Each action type should return expected structure and reward.

    Rewards are floats, so they are compared with ``pytest.approx`` rather
    than exact ``==``; this keeps the tests stable if the environment derives
    a reward arithmetically (or by clamping) instead of returning a literal.
    """

    @pytest.fixture
    def env(self):
        """Provide an "easy" environment that has already been reset with seed 42."""
        env = MLOpsEnvironment(task_id="easy")
        env.reset(seed=42)
        return env

    def test_read_config(self, env):
        """First read_config earns 0.02, keeps the episode open, and returns content."""
        obs, reward, done, _ = env.step(MLOpsAction(action_type="read_config"))
        assert reward == pytest.approx(0.02)
        assert done is False
        assert "config.yaml" in obs.artifacts_read
        assert "content" in obs.last_action_result

    def test_read_logs(self, env):
        """First read_logs earns 0.02 and records train.log as read."""
        obs, reward, _, _ = env.step(MLOpsAction(action_type="read_logs"))
        assert reward == pytest.approx(0.02)
        assert "train.log" in obs.artifacts_read

    def test_read_logs_with_filter(self, env):
        """A filtered log read earns 0.02 and returns epoch lines or a no-match notice."""
        obs, reward, _, _ = env.step(
            MLOpsAction(action_type="read_logs", log_filter="epoch:1-3")
        )
        assert reward == pytest.approx(0.02)
        content = obs.last_action_result.get("content", "")
        # Either the filter matched some epoch lines or the env reported no match.
        assert "EPOCH" in content or "No log lines" in content

    def test_check_dataset_stats(self, env):
        """First check_dataset_stats earns 0.02 and records dataset_stats.json as read."""
        obs, reward, _, _ = env.step(MLOpsAction(action_type="check_dataset_stats"))
        assert reward == pytest.approx(0.02)
        assert "dataset_stats.json" in obs.artifacts_read

    def test_inspect_preprocessing(self, env):
        """First inspect_preprocessing earns 0.02 and records preprocessing.py as read."""
        obs, reward, _, _ = env.step(MLOpsAction(action_type="inspect_preprocessing"))
        assert reward == pytest.approx(0.02)
        assert "preprocessing.py" in obs.artifacts_read

    def test_read_eval_results(self, env):
        """First read_eval_results earns 0.02 and records eval_results.json as read."""
        obs, reward, _, _ = env.step(MLOpsAction(action_type="read_eval_results"))
        assert reward == pytest.approx(0.02)
        assert "eval_results.json" in obs.artifacts_read

    def test_run_sanity_check(self, env):
        """A loss_trajectory sanity check earns 0.01 and reports an "ok" status."""
        obs, reward, _, _ = env.step(
            MLOpsAction(action_type="run_sanity_check", sanity_check_type="loss_trajectory")
        )
        assert reward == pytest.approx(0.01)
        assert obs.last_action_result["status"] == "ok"
        assert "sanity_check" in obs.last_action_result

    def test_query_artifact(self, env):
        """Querying a field of an already-read artifact reports an "ok" status."""
        # The config is read first so the query targets an artifact already seen.
        env.step(MLOpsAction(action_type="read_config"))
        obs, _, _, _ = env.step(
            MLOpsAction(action_type="query_artifact", artifact_name="config.yaml", field_path="model.architecture")
        )
        assert obs.last_action_result["status"] == "ok"

    def test_duplicate_read_penalty(self, env):
        """Reading the same artifact a second time is penalised with -0.02."""
        env.step(MLOpsAction(action_type="read_config"))
        _, reward, _, _ = env.step(MLOpsAction(action_type="read_config"))
        assert reward == pytest.approx(-0.02)

    def test_step_count_increments(self, env):
        """Two steps leave the internal step counter at 2."""
        env.step(MLOpsAction(action_type="read_config"))
        env.step(MLOpsAction(action_type="read_logs"))
        assert env._step_count == 2

    def test_done_after_submit(self, env):
        """Submitting a diagnosis ends the episode."""
        _, _, done, _ = env.step(MLOpsAction(action_type="submit_diagnosis"))
        assert done is True

    def test_step_after_done_returns_done(self, env):
        """Stepping a finished episode stays done, yields 0.01, and reports a score."""
        env.step(MLOpsAction(action_type="submit_diagnosis"))
        _, reward, done, info = env.step(MLOpsAction(action_type="read_config"))
        assert done is True
        assert reward == pytest.approx(0.01)  # clamped minimum
        assert "score" in info


class TestEpisodeBoundaries:
    """Check that episodes end on submission and once the step budget is used up."""

    def test_timeout_at_max_steps(self):
        """Repeating an action for the full step budget ends the episode with a score."""
        environment = MLOpsEnvironment(task_id="easy")
        environment.reset(seed=42)

        step_budget = TASK_MAX_STEPS["easy"]
        for _ in range(step_budget):
            # A new action object is built for every step, as a caller would do.
            step_result = environment.step(MLOpsAction(action_type="read_config"))
            obs, reward, done, info = step_result
            if done:
                break

        assert done is True
        assert "score" in info

    def test_submit_ends_episode(self):
        """A diagnosis submitted after one read ends the episode with score and breakdown."""
        environment = MLOpsEnvironment(task_id="medium")
        environment.reset(seed=42)
        environment.step(MLOpsAction(action_type="read_logs"))

        submission = MLOpsAction(action_type="submit_diagnosis")
        obs, reward, done, info = environment.step(submission)

        assert done is True
        assert "score" in info
        assert "breakdown" in info