File size: 16,953 Bytes
ebf4715
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
from __future__ import annotations

import copy
import hashlib
from typing import Any

import yaml

from .data import TASK_REGISTRY, TaskSpec
from .models import ConfigAction, ConfigObservation, ConfigReward, EnvState, TaskType


class ConfigDebuggerEnv:
    def __init__(self) -> None:
        self.task_spec: TaskSpec | None = None
        self.task_id: TaskType | None = None
        self.current_config_text: str = ""
        self.previous_score: float = 0.0
        self.step_count: int = 0
        self.done: bool = False
        self.max_steps: int = 15
        self.last_reward: ConfigReward | None = None
        self._state_visit_count: dict[str, int] = {}

    def reset(self, task_id: TaskType | str) -> ConfigObservation:
        """Begin a new episode for ``task_id`` and return the first observation.

        Args:
            task_id: A ``TaskType`` member or a value that stringifies to a
                key of ``TASK_REGISTRY``.

        Returns:
            The observation built from the task's broken config.

        Raises:
            ValueError: If the normalized id is not a key of ``TASK_REGISTRY``.
        """
        if isinstance(task_id, TaskType):
            registry_key = task_id.value
        else:
            registry_key = str(task_id)

        if registry_key not in TASK_REGISTRY:
            known_ids = ", ".join(TASK_REGISTRY.keys())
            raise ValueError(f"Unknown task_id '{task_id}'. Valid task ids: {known_ids}")

        # Bind the task first: grading below reads self.task_spec.
        chosen = TASK_REGISTRY[registry_key]
        self.task_spec = chosen
        self.task_id = TaskType(registry_key)

        # Rewind all per-episode state to the task's starting point.
        self.current_config_text = chosen.broken
        self.step_count = 0
        self.done = False
        self.max_steps = chosen.max_steps
        self._state_visit_count = {}

        # The broken config's own score is the baseline for the first delta.
        self.previous_score = self._grade(self.current_config_text)["overall"]
        self.last_reward = None

        # Count the starting state as visited so returning to it is a repeat.
        self._track_state_visit(self.current_config_text)
        return self._build_observation()

    def step(self, action: ConfigAction) -> tuple[ConfigObservation, ConfigReward, bool, dict[str, Any]]:
        """Apply one action to the current config and score the outcome.

        Args:
            action: The edit/add/delete request to apply.

        Returns:
            ``(observation, reward, done, info)``. On a finished episode the
            reward is zero, carries the ``episode_already_done`` penalty and
            ``info`` is ``{"reason": "episode_already_done"}``; otherwise
            ``info`` holds the task id and the schema/logic/syntax results.

        Raises:
            RuntimeError: If ``reset`` has not been called yet.
        """
        if self.task_spec is None or self.task_id is None:
            raise RuntimeError("Environment is not initialized. Call reset() first.")

        if self.done:
            # Finished episodes do not advance: no step is counted and the
            # config is left untouched.
            final_obs = self._build_observation()
            noop_reward = ConfigReward(
                value=0.0,
                previous_score=self.previous_score,
                current_score=self.previous_score,
                delta=0.0,
                penalties=["episode_already_done"],
            )
            self.last_reward = noop_reward
            return final_obs, noop_reward, True, {"reason": "episode_already_done"}

        self.step_count += 1
        notes: list[str] = []

        # A failing action is not fatal: the config stays as it was and the
        # failure is recorded as a penalty instead.
        try:
            updated_text, action_notes = self._apply_action(self.current_config_text, action)
        except Exception as exc:
            notes.append(f"invalid_action:{exc}")
        else:
            notes.extend(action_notes)
            self.current_config_text = updated_text

        grading = self._grade(self.current_config_text)
        score_now = grading["overall"]
        delta = round(score_now - self.previous_score, 4)

        repeat_cost = self._track_state_visit(self.current_config_text)
        if repeat_cost > 0:
            notes.append(f"loop_penalty:{repeat_cost:.2f}")

        reward = ConfigReward(
            value=self._compute_reward(score_now, delta, notes, repeat_cost),
            previous_score=round(self.previous_score, 4),
            current_score=round(score_now, 4),
            delta=delta,
            penalties=notes,
        )

        # Commit the new baseline, then decide whether the episode is over:
        # either the score reached 0.98 or the step budget is spent.
        self.previous_score = score_now
        solved = score_now >= 0.98
        budget_spent = self.step_count >= self.max_steps
        self.done = solved or budget_spent
        self.last_reward = reward

        info = {
            "task_id": self.task_id.value,
            "schema_score": grading["schema"],
            "logic_score": grading["logic"],
            "syntax_valid": grading["syntax_valid"],
        }

        # Reuse the grading computed above rather than grading a second time.
        observation = self._build_observation(grading)
        return observation, reward, self.done, info

    def state(self) -> EnvState:
        """Return a snapshot of the environment's current episode state.

        The ``observation`` field is ``None`` until a task has been loaded.
        """
        if self.task_spec is None:
            current_observation = None
        else:
            current_observation = self._build_observation()

        return EnvState(
            task_id=self.task_id,
            done=self.done,
            step_count=self.step_count,
            max_steps=self.max_steps,
            observation=current_observation,
            last_reward=self.last_reward,
        )

    def _build_observation(self, grading: dict[str, Any] | None = None) -> ConfigObservation:
        """Package the current config and its grading into an observation.

        Args:
            grading: A result previously returned by ``_grade`` for the
                current config. When omitted, the config is graded here.

        Raises:
            RuntimeError: If no task has been loaded via ``reset``.
        """
        not_ready = self.task_spec is None or self.task_id is None
        if not_ready:
            raise RuntimeError("Environment is not initialized. Call reset() first.")

        report = self._grade(self.current_config_text) if grading is None else grading

        return ConfigObservation(
            task_id=self.task_id,
            task_description=self.task_spec.description,
            current_config=self.current_config_text,
            step_count=self.step_count,
            max_steps=self.max_steps,
            syntax_valid=report["syntax_valid"],
            validation_errors=report["errors"],
            schema_score=report["schema"],
            logic_score=report["logic"],
            overall_score=report["overall"],
        )

    def _compute_reward(self, current_score: float, delta: float, penalties: list[str], loop_penalty: float) -> float:
        reward = current_score
        if delta > 0:
            reward += min(0.15, delta)
        elif delta < 0:
            reward += delta * 0.4

        penalty_total = loop_penalty
        if any(p.startswith("invalid_action") for p in penalties):
            penalty_total += 0.10
        if any(p.startswith("destructive_delete") for p in penalties):
            penalty_total += 0.08

        reward -= penalty_total
        if current_score >= 0.98:
            reward += 0.05

        return round(max(0.0, min(1.0, reward)), 4)

    def _track_state_visit(self, config_text: str) -> float:
        state_hash = hashlib.sha1(config_text.encode("utf-8")).hexdigest()
        count = self._state_visit_count.get(state_hash, 0) + 1
        self._state_visit_count[state_hash] = count
        # Penalize repeated states to discourage loops.
        if count <= 1:
            return 0.0
        return min(0.03 * (count - 1), 0.12)

    def _apply_action(self, config_text: str, action: ConfigAction) -> tuple[str, list[str]]:
        """Apply ``action`` to the YAML in ``config_text`` and re-serialize it.

        Args:
            config_text: The current config as YAML text.
            action: Request carrying ``operation`` ("edit", "add" or
                "delete"), a dotted ``path`` and, for edit/add, a ``value``.

        Returns:
            ``(new_yaml_text, penalties)``. ``penalties`` may contain
            ``destructive_delete:top_level_critical_key`` (a whole critical
            top-level section was targeted) and ``delete_noop`` (the path to
            delete did not exist).

        Raises:
            ValueError: If the config root is not a mapping, the path is
                malformed or cannot be applied, or the operation is not one
                of the three supported ones.
            Exception: Whatever ``yaml.safe_load`` raises for unparsable text.
        """
        penalties: list[str] = []

        root = yaml.safe_load(config_text)
        if root is None:
            # An empty document is treated as an empty mapping.
            root = {}
        if not isinstance(root, dict):
            raise ValueError("current config is not a dictionary-like YAML document")

        # ``root`` was parsed just above and is referenced nowhere else, so it
        # is mutated in place; no defensive copy is needed.
        tokens = self._parse_path(action.path)
        operation = action.operation

        if operation == "delete" and tokens and isinstance(tokens[0], str):
            if tokens[0] in {"services", "spec", "training", "hardware"} and len(tokens) == 1:
                penalties.append("destructive_delete:top_level_critical_key")

        if operation in {"edit", "add"}:
            self._set_path(root, tokens, action.value)
        elif operation == "delete":
            if not self._delete_path(root, tokens):
                penalties.append("delete_noop")
        else:
            # Previously any unrecognized operation fell through to the delete
            # branch and could silently remove data; reject it explicitly.
            raise ValueError(f"unsupported operation: {operation!r}")

        return yaml.safe_dump(root, sort_keys=False), penalties

    def _parse_path(self, path: str) -> list[str | int]:
        tokens: list[str | int] = []
        for chunk in path.split("."):
            chunk = chunk.strip()
            if chunk == "":
                raise ValueError("path contains empty token")
            if chunk.isdigit():
                tokens.append(int(chunk))
            else:
                tokens.append(chunk)
        return tokens

    def _set_path(self, root: dict[str, Any], tokens: list[str | int], value: Any) -> None:
        if not tokens:
            raise ValueError("cannot set empty path")

        cursor: Any = root
        for i, token in enumerate(tokens[:-1]):
            nxt = tokens[i + 1]
            if isinstance(token, int):
                if not isinstance(cursor, list):
                    raise ValueError("list index used on non-list node")
                while token >= len(cursor):
                    cursor.append({} if isinstance(nxt, str) else [])
                if cursor[token] is None:
                    cursor[token] = {} if isinstance(nxt, str) else []
                cursor = cursor[token]
            else:
                if not isinstance(cursor, dict):
                    raise ValueError("dict key used on non-dict node")
                if token not in cursor or cursor[token] is None:
                    cursor[token] = {} if isinstance(nxt, str) else []
                cursor = cursor[token]

        final = tokens[-1]
        if isinstance(final, int):
            if not isinstance(cursor, list):
                raise ValueError("final list index used on non-list node")
            while final >= len(cursor):
                cursor.append(None)
            cursor[final] = value
        else:
            if not isinstance(cursor, dict):
                raise ValueError("final dict key used on non-dict node")
            cursor[final] = value

    def _delete_path(self, root: dict[str, Any], tokens: list[str | int]) -> bool:
        if not tokens:
            return False

        cursor: Any = root
        for token in tokens[:-1]:
            if isinstance(token, int):
                if not isinstance(cursor, list) or token >= len(cursor):
                    return False
                cursor = cursor[token]
            else:
                if not isinstance(cursor, dict) or token not in cursor:
                    return False
                cursor = cursor[token]

        final = tokens[-1]
        if isinstance(final, int):
            if not isinstance(cursor, list) or final >= len(cursor):
                return False
            cursor.pop(final)
            return True

        if not isinstance(cursor, dict) or final not in cursor:
            return False
        del cursor[final]
        return True

    def _grade(self, config_text: str) -> dict[str, Any]:
        """Score ``config_text`` against the loaded task.

        Returns:
            A dict with keys ``syntax_valid``, ``schema``, ``logic``,
            ``overall`` and ``errors``. ``overall`` is 60% schema plus 40%
            logic, rounded to 4 decimals; ``errors`` holds at most 20
            messages. Text that fails to parse, or whose root is not a
            mapping, scores zero everywhere with a single error message.
        """
        assert self.task_spec is not None

        def zeroed(syntax_ok: bool, message: str) -> dict[str, Any]:
            # Shared shape for the two early-exit failures.
            return {
                "syntax_valid": syntax_ok,
                "schema": 0.0,
                "logic": 0.0,
                "overall": 0.0,
                "errors": [message],
            }

        try:
            document = yaml.safe_load(config_text)
        except Exception as exc:
            return zeroed(False, f"YAML syntax error: {exc}")

        # An empty document is graded as an empty mapping.
        if document is None:
            document = {}
        if not isinstance(document, dict):
            return zeroed(True, "Root document must be a mapping/dict")

        schema_score, schema_errors = self._grade_schema(document)
        logic_score, logic_errors = self._grade_logic(document)
        all_errors = [*schema_errors, *logic_errors]

        weighted = (0.60 * schema_score) + (0.40 * logic_score)
        return {
            "syntax_valid": True,
            "schema": schema_score,
            "logic": logic_score,
            "overall": round(weighted, 4),
            "errors": all_errors[:20],
        }

    def _grade_schema(self, parsed: dict[str, Any]) -> tuple[float, list[str]]:
        assert self.task_spec is not None

        total_weight = 0.0
        matched_weight = 0.0
        errors: list[str] = []

        for path, weight in self.task_spec.required_paths.items():
            total_weight += weight
            expected = self._read_path(self.task_spec.target, self._parse_path(path))
            got, exists = self._safe_read(parsed, self._parse_path(path))
            if not exists:
                errors.append(f"Missing required path: {path}")
                continue
            if got == expected:
                matched_weight += weight
            else:
                errors.append(f"Mismatch at {path}: expected={expected!r}, got={got!r}")

        score = 0.0 if total_weight == 0 else round(matched_weight / total_weight, 4)
        return score, errors

    def _grade_logic(self, parsed: dict[str, Any]) -> tuple[float, list[str]]:
        assert self.task_spec is not None

        checks: list[tuple[str, bool]] = []
        t = self.task_spec.task_id

        if t == "easy_docker":
            web_ports = self._safe_get(parsed, ["services", "web", "ports"], default=[])
            db_ports = self._safe_get(parsed, ["services", "db", "ports"], default=[])
            env_node = self._safe_get(parsed, ["services", "web", "environment"], default={})
            checks.append(("web ports must be list", isinstance(web_ports, list)))
            checks.append(("all web ports must contain ':'", all(isinstance(p, str) and ":" in p for p in web_ports)))
            checks.append(("db port must include host and container", "5432:5432" in db_ports if isinstance(db_ports, list) else False))
            checks.append(("environment must be dict", isinstance(env_node, dict)))

        elif t == "medium_k8s":
            replicas = self._safe_get(parsed, ["spec", "replicas"], default=None)
            limits_mem = self._safe_get(
                parsed,
                ["spec", "template", "spec", "containers", 0, "resources", "limits", "memory"],
                default="",
            )
            req_mem = self._safe_get(
                parsed,
                ["spec", "template", "spec", "containers", 0, "resources", "requests", "memory"],
                default="",
            )
            req_cpu = self._safe_get(
                parsed,
                ["spec", "template", "spec", "containers", 0, "resources", "requests", "cpu"],
                default="",
            )
            checks.append(("replicas should be int", isinstance(replicas, int)))
            checks.append(("limits memory must include unit", isinstance(limits_mem, str) and limits_mem.endswith(("Mi", "Gi"))))
            checks.append(("requests memory must include unit", isinstance(req_mem, str) and req_mem.endswith(("Mi", "Gi"))))
            checks.append(("cpu request should be millicore string", isinstance(req_cpu, str) and req_cpu.endswith("m")))

        elif t == "hard_ml_config":
            warmup = self._safe_get(parsed, ["training", "warmup_steps"], default=0)
            max_steps = self._safe_get(parsed, ["training", "max_steps"], default=0)
            use_cuda = self._safe_get(parsed, ["hardware", "use_cuda"], default=False)
            gpu_count = self._safe_get(parsed, ["hardware", "gpu_count"], default=0)
            batch_size = self._safe_get(parsed, ["training", "batch_size"], default=0)
            train_batch = self._safe_get(parsed, ["data", "train_batch_size"], default=0)
            log_interval = self._safe_get(parsed, ["logging", "log_interval"], default=999999)
            checks.append(("warmup_steps < max_steps", isinstance(warmup, int) and isinstance(max_steps, int) and warmup < max_steps))
            checks.append(("gpu_count >=1 when use_cuda", (not use_cuda) or (isinstance(gpu_count, int) and gpu_count >= 1)))
            checks.append(("train_batch_size equals 2 * batch_size", isinstance(batch_size, int) and isinstance(train_batch, int) and train_batch == 2 * batch_size))
            checks.append(("log_interval <= 100", isinstance(log_interval, int) and log_interval <= 100))

        total = len(checks)
        passed = sum(1 for _, ok in checks if ok)
        errors = [msg for msg, ok in checks if not ok]
        score = 0.0 if total == 0 else round(passed / total, 4)
        return score, errors

    def _read_path(self, source: Any, tokens: list[str | int]) -> Any:
        cursor = source
        for token in tokens:
            if isinstance(token, int):
                cursor = cursor[token]
            else:
                cursor = cursor[token]
        return cursor

    def _safe_read(self, source: Any, tokens: list[str | int]) -> tuple[Any, bool]:
        cursor = source
        for token in tokens:
            try:
                if isinstance(token, int):
                    if not isinstance(cursor, list):
                        return None, False
                    cursor = cursor[token]
                else:
                    if not isinstance(cursor, dict) or token not in cursor:
                        return None, False
                    cursor = cursor[token]
            except Exception:
                return None, False
        return cursor, True

    def _safe_get(self, source: Any, tokens: list[str | int], default: Any) -> Any:
        value, exists = self._safe_read(source, tokens)
        return value if exists else default