Lishika commited on
Commit
ae94737
·
1 Parent(s): 30bf68a

finally added all

Browse files
.gitignore CHANGED
@@ -7,7 +7,7 @@ __pycache__/
7
  # Keep source folder env/ tracked; only ignore venv directories.
8
  .venv/
9
  venv/
10
- ENV/
11
  env.bak/
12
  venv.bak/
13
 
 
7
  # Keep source folder env/ tracked; only ignore venv directories.
8
  .venv/
9
  venv/
10
+
11
  env.bak/
12
  venv.bak/
13
 
env/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from env.rewards import RewardCalculator
2
+ from env.environment import CICDDebuggerEnvironment, MAX_STEPS, REQUIRED_TOOLS
3
+
4
+ __all__ = ["RewardCalculator", "CICDDebuggerEnvironment", "REQUIRED_TOOLS", "MAX_STEPS"]
env/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (366 Bytes). View file
 
env/__pycache__/anti_hacking.cpython-313.pyc ADDED
Binary file (9.81 kB). View file
 
env/__pycache__/environment.cpython-313.pyc ADDED
Binary file (41.4 kB). View file
 
env/__pycache__/hidden_tests.cpython-313.pyc ADDED
Binary file (3.53 kB). View file
 
env/__pycache__/models.cpython-313.pyc ADDED
Binary file (4.39 kB). View file
 
env/__pycache__/rewards.cpython-313.pyc ADDED
Binary file (8.82 kB). View file
 
env/anti_hacking.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Any
5
+
6
+ import yaml
7
+
8
+
9
+ class AntiHackingDetector:
10
+ """Detects shortcut behaviors that can game CI-fix rewards."""
11
+
12
+ STAGE_SKIP_PATTERNS = (
13
+ r"\bif\s*:\s*false\b",
14
+ r"\bwhen\s*:\s*never\b",
15
+ r"\bon\s*:\s*\[\s*\]\b",
16
+ r"\bon\s*:\s*{}",
17
+ r"\ballow_failure\s*:\s*true\b",
18
+ )
19
+
20
+ FAKE_SUCCESS_PATTERNS = (
21
+ r"echo\s+[\"']?tests\s+passed[\"']?",
22
+ r"echo\s+[\"']?success[\"']?",
23
+ r"\bexit\s+0\b",
24
+ r"\btrue\b\s*#?\s*force",
25
+ )
26
+
27
+ HARDCODED_OUTPUT_PATTERNS = (
28
+ r"\bstatus\s*:\s*success\b",
29
+ r"\bresult\s*:\s*passed\b",
30
+ r"\btests?_passed\s*=\s*true\b",
31
+ r"\bforce_success\b",
32
+ )
33
+
34
+ BROKEN_COMMAND_PATTERNS = (
35
+ r"\bnpm\s+tset\b",
36
+ r"\bpyhton\b",
37
+ r"\bpip\s+isntall\b",
38
+ r"\bgo\s+tset\b",
39
+ )
40
+
41
+ def penalty_stage_skipping(self, config_text: str) -> float:
42
+ hits = self._count_hits(config_text, self.STAGE_SKIP_PATTERNS)
43
+ if hits == 0:
44
+ return 0.0
45
+ return -min(0.20 * hits, 0.70)
46
+
47
+ def penalty_fake_success(self, config_text: str) -> float:
48
+ hits = self._count_hits(config_text, self.FAKE_SUCCESS_PATTERNS)
49
+ if hits == 0:
50
+ return 0.0
51
+
52
+ normalized = (config_text or "").lower()
53
+ has_real_test_cmd = any(token in normalized for token in ("npm test", "pytest", "go test", "mvn test", "yarn test", "pnpm test"))
54
+ base = 0.15 if has_real_test_cmd else 0.25
55
+ return -min(base * hits, 0.70)
56
+
57
+ def penalty_hardcoded_outputs(self, config_text: str) -> float:
58
+ hits = self._count_hits(config_text, self.HARDCODED_OUTPUT_PATTERNS)
59
+ if hits == 0:
60
+ return 0.0
61
+ return -min(0.18 * hits, 0.60)
62
+
63
+ def penalty_invalid_config(self, config_text: str) -> float:
64
+ if not (config_text or "").strip():
65
+ return -0.30
66
+ if not self._is_yaml_valid(config_text):
67
+ return -0.35
68
+ return 0.0
69
+
70
+ def penalty_breaking_pipeline(self, previous_config: str, new_config: str) -> float:
71
+ if not previous_config or not new_config:
72
+ return 0.0
73
+
74
+ penalty = 0.0
75
+
76
+ previous_valid = self._is_yaml_valid(previous_config)
77
+ new_valid = self._is_yaml_valid(new_config)
78
+ if previous_valid and not new_valid:
79
+ penalty -= 0.40
80
+
81
+ previous_stages = self._extract_stage_names(previous_config)
82
+ new_stages = self._extract_stage_names(new_config)
83
+ missing_stages = previous_stages - new_stages
84
+ if missing_stages:
85
+ penalty -= min(0.15 * len(missing_stages), 0.45)
86
+
87
+ previous_broken = self._count_hits(previous_config, self.BROKEN_COMMAND_PATTERNS)
88
+ new_broken = self._count_hits(new_config, self.BROKEN_COMMAND_PATTERNS)
89
+ if new_broken > previous_broken:
90
+ penalty -= min(0.10 * (new_broken - previous_broken), 0.30)
91
+
92
+ return max(-1.0, penalty)
93
+
94
+ def penalty_excessive_edits(
95
+ self,
96
+ edit_count: int | dict[str, Any] | None = None,
97
+ changed_files_count: int = 0,
98
+ changed_lines_count: int = 0,
99
+ ) -> float:
100
+ if isinstance(edit_count, dict):
101
+ changed_files_count = int(edit_count.get("changed_files_count", changed_files_count) or 0)
102
+ changed_lines_count = int(edit_count.get("changed_lines_count", changed_lines_count) or 0)
103
+ elif isinstance(edit_count, int):
104
+ changed_lines_count = max(changed_lines_count, int(edit_count))
105
+
106
+ penalty = 0.0
107
+
108
+ if changed_files_count > 5:
109
+ penalty -= 0.15
110
+ if changed_files_count > 10:
111
+ penalty -= 0.25
112
+
113
+ if changed_lines_count > 120:
114
+ penalty -= 0.15
115
+ if changed_lines_count > 300:
116
+ penalty -= 0.25
117
+
118
+ return max(-0.80, penalty)
119
+
120
+ def penalty_timeout_abuse(self, step_count: int) -> float:
121
+ if step_count > 30:
122
+ return -0.80
123
+ if step_count > 20:
124
+ return -0.50
125
+ return 0.0
126
+
127
+ def penalty_bruteforce_attempts(self, consecutive_edit_actions: int, failed_validations: int) -> float:
128
+ penalty = 0.0
129
+ if consecutive_edit_actions >= 6:
130
+ penalty -= 0.25
131
+ if consecutive_edit_actions >= 10:
132
+ penalty -= 0.35
133
+
134
+ if failed_validations >= 3:
135
+ penalty -= 0.20
136
+ if failed_validations >= 6:
137
+ penalty -= 0.35
138
+
139
+ return max(-0.80, penalty)
140
+
141
+ def total_penalty(
142
+ self,
143
+ current_config: str = "",
144
+ previous_config: str = "",
145
+ edit_count: int | dict[str, Any] | None = None,
146
+ changed_files_count: int = 0,
147
+ changed_lines_count: int = 0,
148
+ step_count: int = 0,
149
+ consecutive_edit_actions: int = 0,
150
+ failed_validations: int = 0,
151
+ ) -> float:
152
+ total = 0.0
153
+ total += self.penalty_invalid_config(current_config)
154
+ total += self.penalty_stage_skipping(current_config)
155
+ total += self.penalty_fake_success(current_config)
156
+ total += self.penalty_hardcoded_outputs(current_config)
157
+ total += self.penalty_breaking_pipeline(previous_config, current_config)
158
+ total += self.penalty_excessive_edits(
159
+ edit_count=edit_count,
160
+ changed_files_count=changed_files_count,
161
+ changed_lines_count=changed_lines_count,
162
+ )
163
+ total += self.penalty_timeout_abuse(step_count)
164
+ total += self.penalty_bruteforce_attempts(consecutive_edit_actions, failed_validations)
165
+
166
+ return round(total, 4)
167
+
168
+ def _count_hits(self, text: str, patterns: tuple[str, ...]) -> int:
169
+ text = text or ""
170
+ return sum(1 for pattern in patterns if re.search(pattern, text, flags=re.IGNORECASE))
171
+
172
+ def _is_yaml_valid(self, config_text: str) -> bool:
173
+ if not (config_text or "").strip():
174
+ return False
175
+ try:
176
+ yaml.safe_load(config_text)
177
+ return True
178
+ except yaml.YAMLError:
179
+ return False
180
+
181
+ def _extract_stage_names(self, config_text: str) -> set[str]:
182
+ try:
183
+ parsed = yaml.safe_load(config_text)
184
+ except yaml.YAMLError:
185
+ return set()
186
+
187
+ if parsed is None:
188
+ return set()
189
+
190
+ stages: set[str] = set()
191
+ self._walk_for_stages(parsed, stages)
192
+ return stages
193
+
194
+ def _walk_for_stages(self, node: Any, stages: set[str]) -> None:
195
+ if isinstance(node, dict):
196
+ for key, value in node.items():
197
+ key_name = str(key).lower()
198
+ if key_name in {"stages", "jobs", "job"}:
199
+ if isinstance(value, dict):
200
+ for stage_name in value.keys():
201
+ stages.add(str(stage_name))
202
+ elif isinstance(value, list):
203
+ for stage_name in value:
204
+ stages.add(str(stage_name))
205
+ self._walk_for_stages(value, stages)
206
+ elif isinstance(node, list):
207
+ for item in node:
208
+ self._walk_for_stages(item, stages)
env/environment.py ADDED
@@ -0,0 +1,769 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from difflib import SequenceMatcher
5
+ import random
6
+ import re
7
+ from typing import Any
8
+
9
+ import yaml
10
+
11
+ from env.models import Action, EnvStateSnapshot, Observation, Reward
12
+ from env.rewards import RewardCalculator
13
+ from env.tasks import get_task_by_id, get_tasks_by_difficulty
14
+ from env.tasks.task_types import CICDTask
15
+
16
+
17
+ REQUIRED_TOOLS = [
18
+ "read_file",
19
+ "read_logs",
20
+ "analyze_error",
21
+ "edit_config",
22
+ "run_pipeline_stage",
23
+ "run_tests",
24
+ "validate_fix",
25
+ "submit_solution",
26
+ ]
27
+
28
+ MAX_STEPS = 30
29
+
30
+
31
+ @dataclass
32
+ class EnvironmentState:
33
+ task: CICDTask
34
+ current_config: str
35
+ previous_config: str
36
+ step_count: int = 0
37
+ done: bool = False
38
+ progress_flags: dict[str, bool] = field(default_factory=dict)
39
+ file_modification_count: int = 0
40
+ total_changed_lines: int = 0
41
+ hidden_test_pass_rate: float = 0.0
42
+ action_history: list[str] = field(default_factory=list)
43
+ stage_results: dict[str, bool] = field(default_factory=dict)
44
+ failed_validations: int = 0
45
+ consecutive_edit_actions: int = 0
46
+ current_logs: str = ""
47
+ last_error: str = ""
48
+ last_action_error: str | None = None
49
+ last_info: dict[str, Any] = field(default_factory=dict)
50
+
51
+
52
+ class CICDDebuggerEnvironment:
53
+ """RL-style CI/CD debugging environment with strict tool-based actions."""
54
+
55
+ def __init__(
56
+ self,
57
+ max_steps: int = MAX_STEPS,
58
+ seed: int | None = None,
59
+ llm_judge: Any | None = None,
60
+ ) -> None:
61
+ self.max_steps = max(1, int(max_steps))
62
+ self.random = random.Random(seed)
63
+ self.reward_calculator = RewardCalculator(llm_judge=llm_judge)
64
+ self._state: EnvironmentState | None = None
65
+
66
+ async def reset(self, task_id: str | None = None, difficulty: str | None = None) -> dict[str, Any]:
67
+ task = self._select_task(task_id=task_id, difficulty=difficulty)
68
+
69
+ self._state = EnvironmentState(
70
+ task=task,
71
+ current_config=task.broken_config,
72
+ previous_config=task.broken_config,
73
+ progress_flags={tool: False for tool in REQUIRED_TOOLS},
74
+ current_logs=task.logs,
75
+ last_error=task.error_message,
76
+ )
77
+
78
+ return Observation.model_validate(self._build_observation()).model_dump()
79
+
80
+ async def step(self, action: Any) -> tuple[dict[str, Any], float, bool, dict[str, Any]]:
81
+ if self._state is None:
82
+ raise RuntimeError("Environment not initialized. Call reset() first.")
83
+
84
+ if self._state.done:
85
+ reward_model = Reward(value=0.0, components={"total": 0.0})
86
+ return Observation.model_validate(self._build_observation()).model_dump(), float(reward_model.value), True, {
87
+ "tool": "none",
88
+ "message": "episode already completed",
89
+ "error": None,
90
+ "reward_model": reward_model.model_dump(),
91
+ }
92
+
93
+ parsed_action = Action.from_input(action)
94
+ tool, payload = parsed_action.tool, dict(parsed_action.payload)
95
+ self._state.step_count += 1
96
+ self._state.previous_config = self._state.current_config
97
+ self._state.action_history.append(tool)
98
+ self._state.last_action_error = None
99
+
100
+ info: dict[str, Any] = {
101
+ "tool": tool,
102
+ "message": "",
103
+ "error": None,
104
+ }
105
+ changed_lines = 0
106
+
107
+ result: dict[str, Any] = {
108
+ "previous_config": self._state.previous_config,
109
+ "current_config": self._state.current_config,
110
+ "fixed_config": self._state.current_config,
111
+ "expected_config": self._state.task.expected_config,
112
+ "error": self._state.last_error,
113
+ "logs_analyzed": False,
114
+ "error_diagnosed": False,
115
+ "fix_proposed": False,
116
+ "pipeline_run": False,
117
+ "tests_passed": False,
118
+ "command_succeeded": False,
119
+ "changed_files_count": 0,
120
+ "changed_lines_count": 0,
121
+ "edit_count": {
122
+ "changed_files_count": self._state.file_modification_count,
123
+ "changed_lines_count": self._state.total_changed_lines,
124
+ },
125
+ "deterministic_score": None,
126
+ "hidden_test_pass_rate": None,
127
+ "judge_scores": None,
128
+ "hacking_attempt": False,
129
+ }
130
+
131
+ if tool not in REQUIRED_TOOLS:
132
+ info["message"] = "unsupported action tool"
133
+ info["error"] = f"tool '{tool}' is not allowed"
134
+ self._state.last_action_error = str(info["error"])
135
+ elif tool == "read_file":
136
+ self._state.progress_flags[tool] = True
137
+ result["command_succeeded"] = True
138
+ info["message"] = "returned current workflow config"
139
+ self._state.current_logs = self._state.current_config
140
+ self._state.consecutive_edit_actions = 0
141
+ elif tool == "read_logs":
142
+ self._state.progress_flags[tool] = True
143
+ result["logs_analyzed"] = True
144
+ result["command_succeeded"] = True
145
+ info["message"] = "returned pipeline failure logs"
146
+ self._state.current_logs = self._state.task.logs
147
+ self._state.consecutive_edit_actions = 0
148
+ elif tool == "analyze_error":
149
+ self._state.progress_flags[tool] = True
150
+ result["error_diagnosed"] = True
151
+ result["command_succeeded"] = True
152
+ root_cause = self._detect_root_cause(self._state.current_config, self._state.task)
153
+ info["message"] = f"root cause: {root_cause}"
154
+ self._state.current_logs = f"analysis result: {root_cause}"
155
+ self._state.consecutive_edit_actions = 0
156
+ elif tool == "edit_config":
157
+ self._state.progress_flags[tool] = True
158
+ updated_config, summary = self._apply_edit(self._state.current_config, payload, self._state.task)
159
+ changed_lines = self._count_changed_lines(self._state.current_config, updated_config)
160
+
161
+ if changed_lines > 0:
162
+ self._state.current_config = updated_config
163
+ self._state.file_modification_count += 1
164
+ self._state.total_changed_lines += changed_lines
165
+ result["fix_proposed"] = True
166
+ result["command_succeeded"] = True
167
+ info["message"] = summary
168
+ self._state.current_logs = f"edit applied: {summary}"
169
+ else:
170
+ result["command_succeeded"] = False
171
+ info["message"] = "no config changes applied"
172
+ info["error"] = "edit_config did not modify workflow"
173
+ self._state.last_action_error = str(info["error"])
174
+ self._state.current_logs = "edit action produced no changes"
175
+
176
+ self._state.consecutive_edit_actions += 1
177
+ elif tool == "run_pipeline_stage":
178
+ self._state.progress_flags[tool] = True
179
+ stage = self._extract_stage(payload, fallback=self._state.task.failure_stage)
180
+ success, stage_logs = self._simulate_stage(self._state.current_config, stage, self._state.task)
181
+ self._state.stage_results[stage] = success
182
+ result["pipeline_run"] = True
183
+ result["command_succeeded"] = success
184
+ info["message"] = f"stage '{stage}' {'passed' if success else 'failed'}"
185
+ if not success:
186
+ info["error"] = stage_logs
187
+ self._state.last_action_error = stage_logs
188
+ self._state.last_error = stage_logs
189
+ self._state.current_logs = stage_logs
190
+ self._state.consecutive_edit_actions = 0
191
+ elif tool == "run_tests":
192
+ self._state.progress_flags[tool] = True
193
+ tests_passed, test_logs = self._run_tests(self._state.current_config, self._state.task)
194
+ result["pipeline_run"] = True
195
+ result["tests_passed"] = tests_passed
196
+ result["command_succeeded"] = tests_passed
197
+ info["message"] = "tests passed" if tests_passed else "tests failed"
198
+ if not tests_passed:
199
+ info["error"] = test_logs
200
+ self._state.last_action_error = test_logs
201
+ self._state.last_error = test_logs
202
+ self._state.current_logs = test_logs
203
+ self._state.consecutive_edit_actions = 0
204
+ elif tool == "validate_fix":
205
+ self._state.progress_flags[tool] = True
206
+ validation = self._validate_current_fix(self._state)
207
+ result.update(validation)
208
+ result["pipeline_run"] = True
209
+ is_valid = bool(validation.get("is_valid"))
210
+ result["command_succeeded"] = is_valid
211
+
212
+ if not is_valid:
213
+ self._state.failed_validations += 1
214
+ info["error"] = str(validation.get("summary", "validation failed"))
215
+ self._state.last_action_error = str(info["error"])
216
+
217
+ info["message"] = "validation passed" if is_valid else "validation failed"
218
+ self._state.hidden_test_pass_rate = float(validation.get("hidden_test_pass_rate") or 0.0)
219
+ self._state.current_logs = str(validation.get("summary", "validation complete"))
220
+ self._state.consecutive_edit_actions = 0
221
+ elif tool == "submit_solution":
222
+ validation = self._validate_current_fix(self._state)
223
+ result.update(validation)
224
+ result["pipeline_run"] = True
225
+ self._state.progress_flags[tool] = True
226
+ accepted = bool(validation.get("is_valid"))
227
+ result["command_succeeded"] = accepted
228
+
229
+ if accepted:
230
+ self._state.done = True
231
+ info["message"] = "solution accepted"
232
+ self._state.current_logs = "submission accepted"
233
+ else:
234
+ self._state.failed_validations += 1
235
+ info["message"] = "solution rejected"
236
+ info["error"] = "submission failed quality checks"
237
+ self._state.last_action_error = str(info["error"])
238
+ self._state.current_logs = str(validation.get("summary", "submission rejected"))
239
+
240
+ self._state.hidden_test_pass_rate = float(validation.get("hidden_test_pass_rate") or 0.0)
241
+ self._state.consecutive_edit_actions = 0
242
+
243
+ result["hacking_attempt"] = self._detect_hacking_attempt(tool, payload, self._state.current_config)
244
+ result["current_config"] = self._state.current_config
245
+ result["fixed_config"] = self._state.current_config
246
+ result["changed_files_count"] = 1 if changed_lines > 0 else 0
247
+ result["changed_lines_count"] = changed_lines
248
+ result["edit_count"] = {
249
+ "changed_files_count": self._state.file_modification_count,
250
+ "changed_lines_count": self._state.total_changed_lines,
251
+ }
252
+
253
+ if info["error"]:
254
+ self._state.last_error = str(info["error"])
255
+ result["error"] = self._state.last_error
256
+
257
+ if self._state.step_count >= self.max_steps and not self._state.done:
258
+ self._state.done = True
259
+ if not info["error"]:
260
+ info["error"] = "max_steps_reached"
261
+ info["message"] = "max steps reached"
262
+
263
+ reward = self.reward_calculator.calculate_step_reward(
264
+ state={
265
+ "step_count": self._state.step_count,
266
+ "previous_config": self._state.previous_config,
267
+ "expected_config": self._state.task.expected_config,
268
+ "original_config": self._state.task.broken_config,
269
+ "error": self._state.last_error,
270
+ "changed_files_count": self._state.file_modification_count,
271
+ "changed_lines_count": self._state.total_changed_lines,
272
+ "consecutive_edit_actions": self._state.consecutive_edit_actions,
273
+ "failed_validations": self._state.failed_validations,
274
+ },
275
+ action=tool,
276
+ result=result,
277
+ original_config=self._state.task.broken_config,
278
+ fixed_config=self._state.current_config,
279
+ error_message=self._state.last_error,
280
+ expected_config=self._state.task.expected_config,
281
+ metadata=self._state.task.metadata,
282
+ )
283
+
284
+ reward_model = Reward(value=float(reward), components={"total": float(reward)})
285
+ info["reward_model"] = reward_model.model_dump()
286
+
287
+ self._state.last_info = info
288
+ observation = Observation.model_validate(self._build_observation()).model_dump()
289
+ done = bool(self._state.done)
290
+
291
+ return observation, float(reward_model.value), done, info
292
+
293
+ async def close(self) -> None:
294
+ return None
295
+
296
+ def get_state(self) -> dict[str, Any]:
297
+ if self._state is None:
298
+ return EnvStateSnapshot(initialized=False).model_dump()
299
+
300
+ snapshot = {
301
+ "initialized": True,
302
+ "task_id": self._state.task.task_id,
303
+ "difficulty": self._state.task.difficulty,
304
+ "actual_bug": self._state.task.actual_bug,
305
+ "correct_solution": self._state.task.expected_config,
306
+ "failure_stage": self._state.task.failure_stage,
307
+ "step_count": self._state.step_count,
308
+ "done": self._state.done,
309
+ "progress_flags": dict(self._state.progress_flags),
310
+ "file_modification_count": self._state.file_modification_count,
311
+ "total_changed_lines": self._state.total_changed_lines,
312
+ "hidden_test_pass_rate": self._state.hidden_test_pass_rate,
313
+ "stage_results": dict(self._state.stage_results),
314
+ "failed_validations": self._state.failed_validations,
315
+ "last_action_error": self._state.last_action_error,
316
+ "last_error": self._state.last_error,
317
+ }
318
+ return EnvStateSnapshot.model_validate(snapshot).model_dump()
319
+
320
+ def state(self) -> dict[str, Any]:
321
+ return self.get_state()
322
+
323
+ def _build_observation(self) -> dict[str, Any]:
324
+ if self._state is None:
325
+ raise RuntimeError("Environment not initialized")
326
+
327
+ observation = {
328
+ "task_id": self._state.task.task_id,
329
+ "difficulty": self._state.task.difficulty,
330
+ "failure_stage": self._state.task.failure_stage,
331
+ "actual_bug": self._state.task.actual_bug,
332
+ "config": self._state.current_config,
333
+ "logs": self._state.current_logs,
334
+ "error_message": self._state.last_error,
335
+ "available_tools": list(REQUIRED_TOOLS),
336
+ "progress_flags": dict(self._state.progress_flags),
337
+ "file_modification_count": self._state.file_modification_count,
338
+ "hidden_test_pass_rate": self._state.hidden_test_pass_rate,
339
+ "step_count": self._state.step_count,
340
+ "last_action_error": self._state.last_action_error,
341
+ }
342
+ return Observation.model_validate(observation).model_dump()
343
+
344
+ def _select_task(self, task_id: str | None, difficulty: str | None) -> CICDTask:
345
+ if task_id:
346
+ task = get_task_by_id(task_id)
347
+ if task is None:
348
+ raise ValueError(f"Unknown task_id: {task_id}")
349
+ return task
350
+
351
+ filtered = get_tasks_by_difficulty(difficulty)
352
+ if not filtered:
353
+ raise ValueError(f"No tasks available for difficulty: {difficulty}")
354
+
355
+ return self.random.choice(filtered)
356
+
357
+ def _parse_action(self, action: Any) -> tuple[str, dict[str, Any]]:
358
+ parsed = Action.from_input(action)
359
+ return parsed.tool, dict(parsed.payload)
360
+
361
+ def _extract_stage(self, payload: dict[str, Any], fallback: str) -> str:
362
+ direct_stage = str(payload.get("stage") or "").strip().lower()
363
+ if direct_stage in {"build", "test", "deploy"}:
364
+ return direct_stage
365
+
366
+ raw = str(payload.get("raw") or "").lower()
367
+ for stage in ("build", "test", "deploy"):
368
+ if stage in raw:
369
+ return stage
370
+
371
+ return fallback
372
+
373
+ def _detect_root_cause(self, config_text: str, task: CICDTask) -> str:
374
+ normalized = self._normalize(config_text)
375
+ broken_token = self._normalize(str(task.metadata.get("broken_token", "")))
376
+
377
+ if broken_token and broken_token in normalized:
378
+ return task.actual_bug
379
+
380
+ if not self._is_yaml_valid(config_text):
381
+ return "workflow YAML is invalid"
382
+
383
+ fixed_token = self._normalize(str(task.metadata.get("fixed_token", "")))
384
+ if fixed_token and fixed_token not in normalized:
385
+ return f"missing expected fix token: {task.metadata.get('fixed_token')}"
386
+
387
+ return "configuration still deviates from expected pipeline behavior"
388
+
389
+ def _apply_edit(self, current_config: str, payload: dict[str, Any], task: CICDTask) -> tuple[str, str]:
390
+ candidate = current_config
391
+ edits: list[str] = []
392
+
393
+ new_config = payload.get("new_config")
394
+ if isinstance(new_config, str) and new_config.strip():
395
+ return new_config.strip(), "applied payload new_config"
396
+
397
+ raw = str(payload.get("raw") or "")
398
+ raw_lower = raw.lower()
399
+
400
+ replace_match = re.search(
401
+ r"replace\s+['\"]?(.+?)['\"]?\s+with\s+['\"]?(.+?)['\"]?\s*$",
402
+ raw,
403
+ flags=re.IGNORECASE,
404
+ )
405
+ if replace_match:
406
+ old = replace_match.group(1).strip()
407
+ new = replace_match.group(2).strip()
408
+ if old and old in candidate:
409
+ candidate = candidate.replace(old, new)
410
+ edits.append(f"replaced '{old}' with '{new}'")
411
+
412
+ if "checkout" in raw_lower and "actions/checkout@v4" not in candidate:
413
+ updated = self._ensure_checkout(candidate)
414
+ if updated != candidate:
415
+ candidate = updated
416
+ edits.append("inserted actions/checkout@v4 step")
417
+
418
+ if "permissions" in raw_lower or "actions: write" in raw_lower:
419
+ updated = self._ensure_actions_write(candidate)
420
+ if updated != candidate:
421
+ candidate = updated
422
+ edits.append("added actions: write permission")
423
+
424
+ if not edits and any(token in raw_lower for token in ("yaml", "indent", "syntax")):
425
+ updated = self._repair_yaml(candidate, task.expected_config)
426
+ if updated != candidate:
427
+ candidate = updated
428
+ edits.append("repaired YAML structure")
429
+
430
+ broken_token = str(task.metadata.get("broken_token", ""))
431
+ fixed_token = str(task.metadata.get("fixed_token", ""))
432
+ if not edits and broken_token and fixed_token and broken_token in candidate:
433
+ occurrence_count = candidate.count(broken_token)
434
+
435
+ if occurrence_count > 1:
436
+ candidate = task.expected_config
437
+ edits.append("applied canonical fix for ambiguous token")
438
+ elif fixed_token.strip().endswith(":"):
439
+ expected_block = self._extract_expected_block(task.expected_config, fixed_token)
440
+ if expected_block and expected_block not in candidate:
441
+ candidate = candidate.replace(broken_token, f"{broken_token}\n{expected_block}", 1)
442
+ edits.append("inserted expected YAML block")
443
+ else:
444
+ candidate = candidate.replace(broken_token, fixed_token, 1)
445
+ edits.append("applied metadata token replacement")
446
+ else:
447
+ expected_line = self._find_line_containing(task.expected_config, fixed_token)
448
+ replacement = expected_line.strip() if expected_line else fixed_token
449
+ candidate = candidate.replace(broken_token, replacement, 1)
450
+ edits.append("applied metadata token replacement")
451
+
452
+ if not edits and fixed_token and fixed_token not in candidate and not broken_token:
453
+ updated = self._append_missing_token(candidate, fixed_token)
454
+ if updated != candidate:
455
+ candidate = updated
456
+ edits.append("appended expected token")
457
+
458
+ if not edits and any(token in raw_lower for token in ("expected config", "apply expected", "canonical fix")):
459
+ candidate = task.expected_config
460
+ edits.append("replaced with expected task config")
461
+
462
+ summary = "; ".join(edits) if edits else "no-op edit"
463
+ return candidate, summary
464
+
465
+ def _ensure_checkout(self, config_text: str) -> str:
466
+ if "actions/checkout@v4" in config_text:
467
+ return config_text
468
+
469
+ marker = "steps:\n"
470
+ insert = " - uses: actions/checkout@v4\n"
471
+ if marker in config_text:
472
+ return config_text.replace(marker, marker + insert, 1)
473
+
474
+ return config_text
475
+
476
+ def _ensure_actions_write(self, config_text: str) -> str:
477
+ if "actions: write" in config_text:
478
+ return config_text
479
+
480
+ if "permissions:" in config_text:
481
+ lines = config_text.splitlines()
482
+ out: list[str] = []
483
+ inserted = False
484
+ for line in lines:
485
+ out.append(line)
486
+ if line.strip().startswith("permissions:") and not inserted:
487
+ continue
488
+ if line.strip().startswith("contents:") and not inserted:
489
+ indent = line[: len(line) - len(line.lstrip(" "))]
490
+ out.append(f"{indent}actions: write")
491
+ inserted = True
492
+ if inserted:
493
+ return "\n".join(out)
494
+
495
+ return "permissions:\n actions: write\n" + config_text
496
+
497
+ def _append_missing_token(self, config_text: str, token: str) -> str:
498
+ if not token or token in config_text:
499
+ return config_text
500
+
501
+ lower_token = token.lower()
502
+ if "actions/checkout@v4" in lower_token:
503
+ return self._ensure_checkout(config_text)
504
+ if "actions: write" in lower_token:
505
+ return self._ensure_actions_write(config_text)
506
+
507
+ return config_text + "\n" + token
508
+
509
+ def _repair_yaml(self, current_config: str, expected_config: str) -> str:
510
+ if self._is_yaml_valid(current_config):
511
+ return current_config
512
+
513
+ if expected_config and self._is_yaml_valid(expected_config):
514
+ return expected_config
515
+
516
+ return current_config
517
+
518
+ def _find_line_containing(self, config_text: str, token: str) -> str | None:
519
+ target = (token or "").strip()
520
+ if not target:
521
+ return None
522
+
523
+ for line in (config_text or "").splitlines():
524
+ if target in line:
525
+ return line
526
+
527
+ return None
528
+
529
+ def _extract_expected_block(self, config_text: str, token: str) -> str:
530
+ lines = (config_text or "").splitlines()
531
+ target = (token or "").strip()
532
+ if not target:
533
+ return ""
534
+
535
+ for idx, line in enumerate(lines):
536
+ if target not in line:
537
+ continue
538
+
539
+ base_indent = len(line) - len(line.lstrip(" "))
540
+ block = [line]
541
+ for next_line in lines[idx + 1 :]:
542
+ if not next_line.strip():
543
+ break
544
+ next_indent = len(next_line) - len(next_line.lstrip(" "))
545
+ if next_indent <= base_indent:
546
+ break
547
+ block.append(next_line)
548
+ return "\n".join(block)
549
+
550
+ return ""
551
+
552
+ def _simulate_stage(self, config_text: str, stage: str, task: CICDTask) -> tuple[bool, str]:
553
+ if not self._is_yaml_valid(config_text):
554
+ return False, "invalid workflow YAML"
555
+
556
+ expected_has_stage = self._stage_exists(task.expected_config, stage)
557
+ current_has_stage = self._stage_exists(config_text, stage)
558
+
559
+ if expected_has_stage and not current_has_stage:
560
+ return False, f"required stage '{stage}' is missing"
561
+
562
+ if not expected_has_stage and not current_has_stage:
563
+ return True, f"{stage} stage not required for this task"
564
+
565
+ normalized = self._normalize(config_text)
566
+ broken_token = self._normalize(str(task.metadata.get("broken_token", "")))
567
+ fixed_token = self._normalize(str(task.metadata.get("fixed_token", "")))
568
+
569
+ if self._contains_hacking_pattern(config_text):
570
+ return False, "unsafe shortcut pattern detected"
571
+
572
+ if stage == task.failure_stage and broken_token and broken_token in normalized:
573
+ return False, task.logs
574
+
575
+ if stage == task.failure_stage and fixed_token and fixed_token not in normalized:
576
+ return False, task.logs
577
+
578
+ commands = self._extract_commands(config_text)
579
+
580
+ if stage == "build":
581
+ build_tokens = ("npm ci", "npm install", "pip install", "go build", "mvn", "yarn install", "pnpm install")
582
+ if not any(any(token in cmd for token in build_tokens) for cmd in commands):
583
+ return False, "build stage has no install/build command"
584
+
585
+ if stage == "test":
586
+ test_tokens = ("npm test", "pytest", "go test", "mvn test", "yarn test", "pnpm test")
587
+ if not any(any(token in cmd for token in test_tokens) for cmd in commands):
588
+ return False, "test stage has no test command"
589
+
590
+ if stage == "deploy":
591
+ deploy_tokens = ("deploy", "publish", "upload-artifact", "release")
592
+ if not any(any(token in cmd for token in deploy_tokens) for cmd in commands):
593
+ return False, "deploy stage has no deployment command"
594
+
595
+ return True, f"{stage} stage passed"
596
+
597
+ def _run_tests(self, config_text: str, task: CICDTask) -> tuple[bool, str]:
598
+ if self._stage_exists(task.expected_config, "build"):
599
+ build_ok, build_logs = self._simulate_stage(config_text, "build", task)
600
+ if not build_ok:
601
+ return False, build_logs
602
+
603
+ if self._stage_exists(task.expected_config, "test"):
604
+ test_ok, test_logs = self._simulate_stage(config_text, "test", task)
605
+ if not test_ok:
606
+ return False, test_logs
607
+
608
+ similarity = SequenceMatcher(None, self._normalize(config_text), self._normalize(task.expected_config)).ratio()
609
+ if similarity < 0.45:
610
+ return False, "tests failed: fix diverges significantly from expected pipeline"
611
+
612
+ return True, "tests passed"
613
+
614
+ def _validate_current_fix(self, state: EnvironmentState) -> dict[str, Any]:
615
+ current = state.current_config
616
+ task = state.task
617
+
618
+ deterministic_score = self.reward_calculator.deterministic_grader.grade(
619
+ current,
620
+ task.expected_config,
621
+ metadata=task.metadata,
622
+ )
623
+ hidden_test_pass_rate = self.reward_calculator.hidden_test_runner.evaluate_fix(
624
+ fixed_config=current,
625
+ expected_config=task.expected_config,
626
+ metadata=task.metadata,
627
+ )
628
+
629
+ judge_scores = None
630
+ if self.reward_calculator.llm_judge is not None:
631
+ try:
632
+ judge_scores = self.reward_calculator.llm_judge.evaluate_fix(
633
+ task.broken_config,
634
+ current,
635
+ state.last_error,
636
+ )
637
+ except Exception:
638
+ judge_scores = None
639
+
640
+ tests_passed, test_logs = self._run_tests(current, task)
641
+
642
+ stage_ok, stage_logs = self._simulate_stage(current, task.failure_stage, task)
643
+
644
+ broken_token = self._normalize(str(task.metadata.get("broken_token", "")))
645
+ fixed_token = self._normalize(str(task.metadata.get("fixed_token", "")))
646
+ normalized_current = self._normalize(current)
647
+
648
+ token_constraints_met = True
649
+ if broken_token and broken_token in normalized_current:
650
+ token_constraints_met = False
651
+ if fixed_token and fixed_token not in normalized_current:
652
+ token_constraints_met = False
653
+
654
+ judge_average = 1.0
655
+ if isinstance(judge_scores, dict):
656
+ judge_average = (
657
+ float(judge_scores.get("correctness", 0.0))
658
+ + float(judge_scores.get("minimalism", 0.0))
659
+ + float(judge_scores.get("quality", 0.0))
660
+ ) / 3.0
661
+
662
+ is_valid = (
663
+ tests_passed
664
+ and stage_ok
665
+ and token_constraints_met
666
+ and deterministic_score >= 0.72
667
+ and hidden_test_pass_rate >= 0.65
668
+ and judge_average >= 0.5
669
+ )
670
+
671
+ summary = (
672
+ f"deterministic={deterministic_score:.2f}; "
673
+ f"hidden={hidden_test_pass_rate:.2f}; "
674
+ f"tests={'pass' if tests_passed else 'fail'}; "
675
+ f"failure_stage={'pass' if stage_ok else 'fail'}"
676
+ )
677
+ if not tests_passed:
678
+ summary = test_logs
679
+ elif not stage_ok:
680
+ summary = stage_logs
681
+ elif not token_constraints_met:
682
+ summary = "validation failed: required bug-fix token constraints not satisfied"
683
+
684
+ return {
685
+ "deterministic_score": deterministic_score,
686
+ "hidden_test_pass_rate": hidden_test_pass_rate,
687
+ "judge_scores": judge_scores,
688
+ "tests_passed": tests_passed,
689
+ "is_valid": is_valid,
690
+ "summary": summary,
691
+ }
692
+
693
+ def _detect_hacking_attempt(self, tool: str, payload: dict[str, Any], config_text: str) -> bool:
694
+ payload_text = str(payload.get("raw") or "")
695
+ brute_force_detected = bool(
696
+ self._state and (self._state.consecutive_edit_actions >= 6 or self._state.failed_validations >= 3)
697
+ )
698
+
699
+ return (
700
+ brute_force_detected
701
+ or self._contains_hacking_pattern(payload_text)
702
+ or self._contains_hacking_pattern(config_text)
703
+ )
704
+
705
+ def _contains_hacking_pattern(self, text: str) -> bool:
706
+ if not text:
707
+ return False
708
+
709
+ patterns = (
710
+ *self.reward_calculator.anti_hacking_detector.STAGE_SKIP_PATTERNS,
711
+ *self.reward_calculator.anti_hacking_detector.FAKE_SUCCESS_PATTERNS,
712
+ *self.reward_calculator.anti_hacking_detector.HARDCODED_OUTPUT_PATTERNS,
713
+ )
714
+ return any(re.search(pattern, text, flags=re.IGNORECASE) for pattern in patterns)
715
+
716
+ def _extract_commands(self, config_text: str) -> list[str]:
717
+ commands: list[str] = []
718
+ for raw_line in config_text.splitlines():
719
+ line = raw_line.strip().lower()
720
+ if "- run:" in line:
721
+ commands.append(line.replace("- run:", "").strip())
722
+ elif line.startswith("- run "):
723
+ commands.append(line.replace("- run ", "", 1).strip())
724
+ return commands
725
+
726
+ def _is_yaml_valid(self, config_text: str) -> bool:
727
+ try:
728
+ parsed = yaml.safe_load(config_text)
729
+ except yaml.YAMLError:
730
+ return False
731
+ return isinstance(parsed, dict)
732
+
733
+ def _stage_exists(self, config_text: str, stage: str) -> bool:
734
+ try:
735
+ parsed = yaml.safe_load(config_text)
736
+ except yaml.YAMLError:
737
+ return False
738
+
739
+ if not isinstance(parsed, dict):
740
+ return False
741
+
742
+ jobs = parsed.get("jobs")
743
+ if isinstance(jobs, dict) and stage in jobs:
744
+ return True
745
+
746
+ stages = parsed.get("stages")
747
+ if isinstance(stages, dict) and stage in stages:
748
+ return True
749
+ if isinstance(stages, list) and stage in stages:
750
+ return True
751
+
752
+ return False
753
+
754
+ def _count_changed_lines(self, previous: str, current: str) -> int:
755
+ prev_lines = previous.splitlines()
756
+ curr_lines = current.splitlines()
757
+ changed = 0
758
+
759
+ max_len = max(len(prev_lines), len(curr_lines))
760
+ for idx in range(max_len):
761
+ left = prev_lines[idx] if idx < len(prev_lines) else ""
762
+ right = curr_lines[idx] if idx < len(curr_lines) else ""
763
+ if left != right:
764
+ changed += 1
765
+
766
+ return changed
767
+
768
+ def _normalize(self, value: str) -> str:
769
+ return re.sub(r"\s+", " ", value.strip().lower())
env/graders/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from env.graders.deterministic import DeterministicGrader
2
+ from env.graders.llm_judge import LLMJudge
3
+
4
+ __all__ = ["DeterministicGrader", "LLMJudge"]
env/graders/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (327 Bytes). View file
 
env/graders/__pycache__/deterministic.cpython-313.pyc ADDED
Binary file (9.91 kB). View file
 
env/graders/__pycache__/llm_judge.cpython-313.pyc ADDED
Binary file (5.8 kB). View file
 
env/graders/deterministic.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from difflib import SequenceMatcher
5
+ from typing import Any
6
+
7
+ import yaml
8
+
9
+
10
+ class DeterministicGrader:
11
+ """Deterministic correctness scoring for CI/CD config fixes."""
12
+
13
+ COMMAND_KEYS = {
14
+ "script",
15
+ "scripts",
16
+ "run",
17
+ "command",
18
+ "commands",
19
+ "steps",
20
+ "before_script",
21
+ "after_script",
22
+ }
23
+
24
+ BROKEN_COMMAND_PATTERNS = (
25
+ r"\bnpm\s+tset\b",
26
+ r"\bpyhton\b",
27
+ r"\bpip\s+isntall\b",
28
+ r"\bgo\s+tset\b",
29
+ )
30
+
31
+ def grade(self, current_config: str, expected_config: str, metadata: dict[str, Any] | None = None) -> float:
32
+ metadata = metadata or {}
33
+ current_config = current_config or ""
34
+ expected_config = expected_config or ""
35
+
36
+ syntax_score = self._syntax_score(current_config)
37
+ functional_score = self._functional_score(current_config, expected_config, metadata)
38
+ similarity_score = self._similarity_score(current_config, expected_config)
39
+
40
+ total = (0.20 * syntax_score) + (0.60 * functional_score) + (0.20 * similarity_score)
41
+
42
+ if syntax_score == 0.0:
43
+ total = min(total, 0.30)
44
+
45
+ return round(self._clamp_01(total), 4)
46
+
47
+ def _syntax_score(self, config_text: str) -> float:
48
+ if not (config_text or "").strip():
49
+ return 0.0
50
+
51
+ try:
52
+ yaml.safe_load(config_text)
53
+ return 1.0
54
+ except yaml.YAMLError:
55
+ return 0.0
56
+
57
+ def _functional_score(self, current_config: str, expected_config: str, metadata: dict[str, Any]) -> float:
58
+ expected_commands = self._extract_commands(expected_config)
59
+ current_commands = self._extract_commands(current_config)
60
+
61
+ if expected_commands:
62
+ matched = 0
63
+ for expected in expected_commands:
64
+ if any(self._commands_match(expected, current) for current in current_commands):
65
+ matched += 1
66
+ command_score = matched / len(expected_commands)
67
+ else:
68
+ command_score = self._similarity_score(current_config, expected_config)
69
+
70
+ issue_score = self._issue_resolution_score(current_config, metadata)
71
+ broken_penalty = 0.35 if self._has_known_broken_command(current_config) else 0.0
72
+
73
+ combined = (0.80 * command_score) + (0.20 * issue_score) - broken_penalty
74
+ return self._clamp_01(combined)
75
+
76
+ def _issue_resolution_score(self, current_config: str, metadata: dict[str, Any]) -> float:
77
+ broken_token = self._normalize_text(str(metadata.get("broken_token", "")))
78
+ fixed_token = self._normalize_text(str(metadata.get("fixed_token", "")))
79
+ current_normalized = self._normalize_text(current_config)
80
+
81
+ if not broken_token and not fixed_token:
82
+ return 1.0
83
+
84
+ if broken_token and broken_token in current_normalized:
85
+ return 0.0
86
+
87
+ if fixed_token and fixed_token not in current_normalized:
88
+ return 0.0
89
+
90
+ return 1.0
91
+
92
+ def _extract_commands(self, config_text: str) -> list[str]:
93
+ commands: list[str] = []
94
+
95
+ try:
96
+ parsed = yaml.safe_load(config_text)
97
+ except yaml.YAMLError:
98
+ parsed = None
99
+
100
+ if parsed is not None:
101
+ self._walk_yaml(parsed, commands)
102
+
103
+ if not commands:
104
+ commands.extend(self._extract_commands_from_text(config_text))
105
+
106
+ deduped: list[str] = []
107
+ seen: set[str] = set()
108
+ for command in commands:
109
+ normalized = self._normalize_text(command)
110
+ if normalized and normalized not in seen:
111
+ seen.add(normalized)
112
+ deduped.append(normalized)
113
+
114
+ return deduped
115
+
116
+ def _walk_yaml(self, node: Any, commands: list[str]) -> None:
117
+ if isinstance(node, dict):
118
+ for key, value in node.items():
119
+ key_name = str(key).lower()
120
+ if key_name in self.COMMAND_KEYS:
121
+ commands.extend(self._extract_string_values(value))
122
+ self._walk_yaml(value, commands)
123
+ elif isinstance(node, list):
124
+ for item in node:
125
+ self._walk_yaml(item, commands)
126
+
127
+ def _extract_string_values(self, value: Any) -> list[str]:
128
+ if isinstance(value, str):
129
+ return [value]
130
+ if isinstance(value, list):
131
+ return [item for item in value if isinstance(item, str)]
132
+ if isinstance(value, dict):
133
+ output: list[str] = []
134
+ for nested in value.values():
135
+ output.extend(self._extract_string_values(nested))
136
+ return output
137
+ return []
138
+
139
+ def _extract_commands_from_text(self, config_text: str) -> list[str]:
140
+ commands: list[str] = []
141
+
142
+ for raw_line in (config_text or "").splitlines():
143
+ line = raw_line.strip()
144
+ if not line or line.startswith("#"):
145
+ continue
146
+
147
+ if ":" in line and not line.startswith("-") and line.endswith(":"):
148
+ continue
149
+
150
+ line = line.lstrip("-").strip()
151
+ if any(token in line.lower() for token in ("npm", "pytest", "python", "yarn", "pnpm", "go test", "mvn test")):
152
+ commands.append(line)
153
+
154
+ return commands
155
+
156
+ def _has_known_broken_command(self, config_text: str) -> bool:
157
+ return any(re.search(pattern, config_text or "", flags=re.IGNORECASE) for pattern in self.BROKEN_COMMAND_PATTERNS)
158
+
159
+ def _commands_match(self, expected: str, current: str) -> bool:
160
+ expected_normalized = self._normalize_text(expected)
161
+ current_normalized = self._normalize_text(current)
162
+
163
+ if expected_normalized == current_normalized:
164
+ return True
165
+
166
+ if expected_normalized in current_normalized:
167
+ return True
168
+
169
+ if current_normalized in expected_normalized and len(current_normalized) > 6:
170
+ return True
171
+
172
+ return False
173
+
174
+ def _similarity_score(self, current_config: str, expected_config: str) -> float:
175
+ left = self._normalize_text(current_config)
176
+ right = self._normalize_text(expected_config)
177
+
178
+ if not left and not right:
179
+ return 1.0
180
+ if not left or not right:
181
+ return 0.0
182
+
183
+ return self._clamp_01(SequenceMatcher(None, left, right).ratio())
184
+
185
+ def _normalize_text(self, value: str) -> str:
186
+ return re.sub(r"\s+", " ", (value or "")).strip().lower()
187
+
188
+ def _clamp_01(self, value: float) -> float:
189
+ return max(0.0, min(1.0, float(value)))
env/graders/llm_judge.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from typing import Any
6
+
7
+
8
+ class LLMJudge:
9
+ """Scores qualitative fix quality while remaining robust to bad model output."""
10
+
11
+ def __init__(self, model: Any):
12
+ self.model = model
13
+
14
+ def build_prompt(self, original_config: str, fixed_config: str, error_message: str) -> str:
15
+ return (
16
+ "You are a CI/CD fix quality judge.\n"
17
+ "Return strict JSON with keys correctness, minimalism, quality in [0,1].\n"
18
+ "No prose.\n\n"
19
+ f"Original config:\n{original_config}\n\n"
20
+ f"Fixed config:\n{fixed_config}\n\n"
21
+ f"Error message:\n{error_message}\n"
22
+ )
23
+
24
+ def evaluate_fix(self, original_config: str, fixed_config: str, error_message: str) -> dict[str, float]:
25
+ default = {
26
+ "correctness": 0.0,
27
+ "minimalism": 0.0,
28
+ "quality": 0.0,
29
+ }
30
+
31
+ if self.model is None:
32
+ return default
33
+
34
+ prompt = self.build_prompt(original_config or "", fixed_config or "", error_message or "")
35
+
36
+ try:
37
+ raw_output = self.model(prompt, max_length=300)
38
+ text = self._extract_text(raw_output)
39
+ except Exception:
40
+ return default
41
+
42
+ if not text.strip():
43
+ return default
44
+
45
+ parsed = self._parse_json_with_fallback(text)
46
+ if parsed is None:
47
+ parsed = self._parse_regex_scores(text)
48
+
49
+ return {
50
+ "correctness": self._clamp(parsed.get("correctness", 0.0) if parsed else 0.0),
51
+ "minimalism": self._clamp(parsed.get("minimalism", 0.0) if parsed else 0.0),
52
+ "quality": self._clamp(parsed.get("quality", 0.0) if parsed else 0.0),
53
+ }
54
+
55
+ def _extract_text(self, raw_output: Any) -> str:
56
+ if isinstance(raw_output, str):
57
+ return raw_output
58
+
59
+ if isinstance(raw_output, list) and raw_output:
60
+ first = raw_output[0]
61
+ if isinstance(first, dict):
62
+ for key in ("generated_text", "text", "content"):
63
+ if key in first and first[key] is not None:
64
+ return str(first[key])
65
+ return str(first)
66
+
67
+ if isinstance(raw_output, dict):
68
+ for key in ("generated_text", "text", "content"):
69
+ if key in raw_output and raw_output[key] is not None:
70
+ return str(raw_output[key])
71
+
72
+ return str(raw_output)
73
+
74
+ def _parse_json_with_fallback(self, text: str) -> dict[str, float] | None:
75
+ decoder = json.JSONDecoder()
76
+ for idx, char in enumerate(text):
77
+ if char != "{":
78
+ continue
79
+ try:
80
+ obj, _ = decoder.raw_decode(text[idx:])
81
+ except json.JSONDecodeError:
82
+ continue
83
+ if isinstance(obj, dict):
84
+ return self._normalize_partial_scores(obj)
85
+ return None
86
+
87
+ def _parse_regex_scores(self, text: str) -> dict[str, float]:
88
+ return {
89
+ "correctness": self._extract_score(text, "correctness"),
90
+ "minimalism": self._extract_score(text, "minimalism"),
91
+ "quality": self._extract_score(text, "quality"),
92
+ }
93
+
94
+ def _extract_score(self, text: str, key: str) -> float:
95
+ match = re.search(rf"{key}\s*[:=\-]\s*([0-9]*\.?[0-9]+)", text, flags=re.IGNORECASE)
96
+ if not match:
97
+ return 0.0
98
+ return self._clamp(match.group(1))
99
+
100
+ def _normalize_partial_scores(self, obj: dict[str, Any]) -> dict[str, float]:
101
+ return {
102
+ "correctness": self._clamp(obj.get("correctness", 0.0)),
103
+ "minimalism": self._clamp(obj.get("minimalism", 0.0)),
104
+ "quality": self._clamp(obj.get("quality", 0.0)),
105
+ }
106
+
107
+ def _clamp(self, value: Any) -> float:
108
+ try:
109
+ parsed = float(value)
110
+ except (TypeError, ValueError):
111
+ parsed = 0.0
112
+ return max(0.0, min(1.0, parsed))
env/hidden_tests.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from env.graders.deterministic import DeterministicGrader
6
+
7
+
8
+ class HiddenTestRunner:
9
+ """Evaluates whether a fix generalizes across deterministic CI variants."""
10
+
11
+ def __init__(self, grader: DeterministicGrader | None = None, pass_threshold: float = 0.65):
12
+ self.grader = grader or DeterministicGrader()
13
+ self.pass_threshold = pass_threshold
14
+
15
+ def generate_variants(self, config_text: str) -> list[str]:
16
+ base = config_text or ""
17
+ variants: list[str] = []
18
+
19
+ for replacements in self._variant_replacement_sets():
20
+ variant = self._apply_replacements(base, replacements)
21
+ if variant not in variants:
22
+ variants.append(variant)
23
+
24
+ return variants
25
+
26
+ def evaluate_fix(
27
+ self,
28
+ fixed_config: str,
29
+ task: dict[str, Any] | None = None,
30
+ expected_config: str | None = None,
31
+ metadata: dict[str, Any] | None = None,
32
+ ) -> float:
33
+ fixed_config = fixed_config or ""
34
+ task = task or {}
35
+ metadata = metadata or {}
36
+ expected = expected_config or str(task.get("expected_config", ""))
37
+
38
+ if not fixed_config.strip() or not expected.strip():
39
+ return 0.0
40
+
41
+ total = 0
42
+ passed = 0
43
+
44
+ for replacements in self._variant_replacement_sets():
45
+ fixed_variant = self._apply_replacements(fixed_config, replacements)
46
+ expected_variant = self._apply_replacements(expected, replacements)
47
+ score = self.grader.grade(fixed_variant, expected_variant, metadata)
48
+ total += 1
49
+ if score >= self.pass_threshold:
50
+ passed += 1
51
+
52
+ if total == 0:
53
+ return 0.0
54
+
55
+ return round(passed / total, 4)
56
+
57
+ def _variant_replacement_sets(self) -> list[tuple[tuple[str, str], ...]]:
58
+ return [
59
+ tuple(),
60
+ (("ubuntu-latest", "windows-latest"),),
61
+ (("windows-latest", "ubuntu-latest"),),
62
+ (("node-version: 16", "node-version: 18"),),
63
+ (("node-version: \"16\"", "node-version: \"18\""),),
64
+ (("python-version: \"3.10\"", "python-version: \"3.12\""),),
65
+ (("NODE_ENV=production", "NODE_ENV=development"),),
66
+ ]
67
+
68
+ def _apply_replacements(self, text: str, replacements: tuple[tuple[str, str], ...]) -> str:
69
+ output = text
70
+ for old, new in replacements:
71
+ output = output.replace(old, new)
72
+ return output
env/models.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
+ class Observation(BaseModel):
9
+ task_id: str
10
+ difficulty: str
11
+ failure_stage: str
12
+ actual_bug: str
13
+ config: str
14
+ logs: str
15
+ error_message: str
16
+ available_tools: list[str]
17
+ progress_flags: dict[str, bool]
18
+ file_modification_count: int
19
+ hidden_test_pass_rate: float
20
+ step_count: int
21
+ last_action_error: str | None = None
22
+
23
+
24
+ class Action(BaseModel):
25
+ tool: str = ""
26
+ payload: dict[str, Any] = Field(default_factory=dict)
27
+
28
+ @classmethod
29
+ def from_input(cls, raw_action: Any) -> "Action":
30
+ if isinstance(raw_action, cls):
31
+ return raw_action
32
+
33
+ if isinstance(raw_action, str):
34
+ raw = raw_action.strip()
35
+ if not raw:
36
+ return cls(tool="", payload={})
37
+
38
+ if ":" in raw:
39
+ tool_part, payload_part = raw.split(":", 1)
40
+ return cls(tool=tool_part.strip().lower(), payload={"raw": payload_part.strip()})
41
+
42
+ parts = raw.split(maxsplit=1)
43
+ tool = parts[0].strip().lower() if parts else ""
44
+ payload = {"raw": parts[1].strip()} if len(parts) > 1 else {}
45
+ return cls(tool=tool, payload=payload)
46
+
47
+ if isinstance(raw_action, dict):
48
+ tool = str(raw_action.get("tool") or raw_action.get("action_type") or "").strip().lower()
49
+ incoming_payload = raw_action.get("payload")
50
+
51
+ if isinstance(incoming_payload, dict):
52
+ payload: dict[str, Any] = dict(incoming_payload)
53
+ elif incoming_payload is not None:
54
+ payload = {"raw": str(incoming_payload)}
55
+ elif "input" in raw_action:
56
+ payload = {"raw": str(raw_action.get("input") or "").strip()}
57
+ else:
58
+ payload = {}
59
+
60
+ return cls(tool=tool, payload=payload)
61
+
62
+ return cls(tool="", payload={})
63
+
64
+
65
+ class Reward(BaseModel):
66
+ value: float = Field(ge=0.0, le=1.0)
67
+ components: dict[str, float] = Field(default_factory=dict)
68
+
69
+
70
+ class EnvStateSnapshot(BaseModel):
71
+ initialized: bool
72
+ task_id: str | None = None
73
+ difficulty: str | None = None
74
+ actual_bug: str | None = None
75
+ correct_solution: str | None = None
76
+ failure_stage: str | None = None
77
+ step_count: int = 0
78
+ done: bool = False
79
+ progress_flags: dict[str, bool] = Field(default_factory=dict)
80
+ file_modification_count: int = 0
81
+ total_changed_lines: int = 0
82
+ hidden_test_pass_rate: float = 0.0
83
+ stage_results: dict[str, bool] = Field(default_factory=dict)
84
+ failed_validations: int = 0
85
+ last_action_error: str | None = None
86
+ last_error: str | None = None
env/rewards.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from env.anti_hacking import AntiHackingDetector
6
+ from env.graders.deterministic import DeterministicGrader
7
+ from env.hidden_tests import HiddenTestRunner
8
+
9
+
10
+ class RewardCalculator:
11
+ """Composes progress, execution, quality, and anti-hacking penalties."""
12
+
13
+ ACTION_PROGRESS_REWARDS = {
14
+ "read_file": 0.02,
15
+ "read_logs": 0.03,
16
+ "analyze_error": 0.05,
17
+ "edit_config": 0.06,
18
+ "run_pipeline_stage": 0.07,
19
+ "run_tests": 0.08,
20
+ "validate_fix": 0.10,
21
+ "submit_solution": 0.12,
22
+ }
23
+
24
+ QUALITY_WEIGHTS = {
25
+ "deterministic": 0.40,
26
+ "hidden": 0.25,
27
+ "llm": 0.20,
28
+ }
29
+
30
+ def __init__(
31
+ self,
32
+ llm_judge: Any | None = None,
33
+ anti_hacking_detector: AntiHackingDetector | None = None,
34
+ deterministic_grader: DeterministicGrader | None = None,
35
+ hidden_test_runner: HiddenTestRunner | None = None,
36
+ ):
37
+ self.llm_judge = llm_judge
38
+ self.anti_hacking_detector = anti_hacking_detector or AntiHackingDetector()
39
+ self.deterministic_grader = deterministic_grader or DeterministicGrader()
40
+ self.hidden_test_runner = hidden_test_runner or HiddenTestRunner(grader=self.deterministic_grader)
41
+
42
+ def calculate_step_reward(
43
+ self,
44
+ state: dict[str, Any] | None,
45
+ action: str,
46
+ result: dict[str, Any] | None,
47
+ original_config: str | None = None,
48
+ fixed_config: str | None = None,
49
+ error_message: str | None = None,
50
+ expected_config: str | None = None,
51
+ metadata: dict[str, Any] | None = None,
52
+ ) -> float:
53
+ state = state or {}
54
+ result = result or {}
55
+ metadata = metadata or {}
56
+
57
+ current_config = fixed_config or result.get("fixed_config") or result.get("current_config") or ""
58
+ expected_config = expected_config or result.get("expected_config") or state.get("expected_config") or ""
59
+ original_config = original_config or result.get("original_config") or state.get("original_config") or ""
60
+ error_message = error_message or result.get("error") or state.get("error") or ""
61
+
62
+ reward = 0.0
63
+ reward += self._progress_reward(action, result)
64
+ reward += self._execution_reward(result)
65
+ reward += self._quality_reward(
66
+ action=action,
67
+ current_config=current_config,
68
+ expected_config=expected_config,
69
+ original_config=original_config,
70
+ error_message=error_message,
71
+ result=result,
72
+ metadata=metadata,
73
+ )
74
+ reward += self._penalty_reward(state=state, result=result, current_config=current_config)
75
+
76
+ return round(self._clamp_01(reward), 4)
77
+
78
+ def _progress_reward(self, action: str, result: dict[str, Any]) -> float:
79
+ reward = self.ACTION_PROGRESS_REWARDS.get(action, 0.0)
80
+
81
+ if result.get("logs_analyzed"):
82
+ reward += 0.04
83
+ if result.get("error_diagnosed"):
84
+ reward += 0.08
85
+ if result.get("fix_proposed"):
86
+ reward += 0.05
87
+
88
+ return reward
89
+
90
+ def _execution_reward(self, result: dict[str, Any]) -> float:
91
+ reward = 0.0
92
+
93
+ if result.get("pipeline_run"):
94
+ reward += 0.10
95
+ if result.get("tests_passed"):
96
+ reward += 0.20
97
+ if result.get("command_succeeded"):
98
+ reward += 0.06
99
+
100
+ return reward
101
+
102
+ def _quality_reward(
103
+ self,
104
+ action: str,
105
+ current_config: str,
106
+ expected_config: str,
107
+ original_config: str,
108
+ error_message: str,
109
+ result: dict[str, Any],
110
+ metadata: dict[str, Any],
111
+ ) -> float:
112
+ if not current_config or not expected_config:
113
+ return 0.0
114
+
115
+ deterministic_score = result.get("deterministic_score")
116
+ if deterministic_score is None:
117
+ deterministic_score = self.deterministic_grader.grade(current_config, expected_config, metadata)
118
+
119
+ hidden_pass_rate = result.get("hidden_test_pass_rate")
120
+ if hidden_pass_rate is None and action in {"validate_fix", "submit_solution"}:
121
+ hidden_pass_rate = self.hidden_test_runner.evaluate_fix(
122
+ fixed_config=current_config,
123
+ expected_config=expected_config,
124
+ metadata=metadata,
125
+ )
126
+
127
+ llm_average = 0.0
128
+ judge_scores = result.get("judge_scores")
129
+ if not judge_scores and self.llm_judge and original_config and current_config:
130
+ try:
131
+ judge_scores = self.llm_judge.evaluate_fix(original_config, current_config, error_message)
132
+ except Exception:
133
+ judge_scores = None
134
+
135
+ if isinstance(judge_scores, dict):
136
+ correctness = self._clamp_01(judge_scores.get("correctness", 0.0))
137
+ minimalism = self._clamp_01(judge_scores.get("minimalism", 0.0))
138
+ quality = self._clamp_01(judge_scores.get("quality", 0.0))
139
+ llm_average = (correctness + minimalism + quality) / 3.0
140
+
141
+ quality_reward = 0.0
142
+ quality_reward += self.QUALITY_WEIGHTS["deterministic"] * self._clamp_01(deterministic_score)
143
+ quality_reward += self.QUALITY_WEIGHTS["hidden"] * self._clamp_01(hidden_pass_rate or 0.0)
144
+ quality_reward += self.QUALITY_WEIGHTS["llm"] * self._clamp_01(llm_average)
145
+
146
+ return quality_reward
147
+
148
+ def _penalty_reward(self, state: dict[str, Any], result: dict[str, Any], current_config: str) -> float:
149
+ changed_files_count = int(result.get("changed_files_count", state.get("changed_files_count", 0)) or 0)
150
+ changed_lines_count = int(result.get("changed_lines_count", state.get("changed_lines_count", 0)) or 0)
151
+ edit_count = result.get("edit_count", state.get("edit_count", 0))
152
+ step_count = int(state.get("step_count", 0) or 0)
153
+ previous_config = result.get("previous_config") or state.get("previous_config") or ""
154
+ consecutive_edit_actions = int(
155
+ result.get("consecutive_edit_actions", state.get("consecutive_edit_actions", 0)) or 0
156
+ )
157
+ failed_validations = int(result.get("failed_validations", state.get("failed_validations", 0)) or 0)
158
+
159
+ penalty = self.anti_hacking_detector.total_penalty(
160
+ current_config=current_config,
161
+ previous_config=previous_config,
162
+ edit_count=edit_count,
163
+ changed_files_count=changed_files_count,
164
+ changed_lines_count=changed_lines_count,
165
+ step_count=step_count,
166
+ consecutive_edit_actions=consecutive_edit_actions,
167
+ failed_validations=failed_validations,
168
+ )
169
+
170
+ if result.get("hacking_attempt"):
171
+ penalty -= 0.30
172
+
173
+ return penalty
174
+
175
+ def _clamp_01(self, value: Any) -> float:
176
+ try:
177
+ parsed = float(value)
178
+ except (TypeError, ValueError):
179
+ parsed = 0.0
180
+ return max(0.0, min(1.0, parsed))
env/tasks/__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from env.tasks.easy import EASY_TASKS
4
+ from env.tasks.hard import HARD_TASKS
5
+ from env.tasks.medium import MEDIUM_TASKS
6
+ from env.tasks.task_types import CICDTask
7
+
8
+
9
+ def get_all_tasks() -> list[CICDTask]:
10
+ return [*EASY_TASKS, *MEDIUM_TASKS, *HARD_TASKS]
11
+
12
+
13
+ def get_tasks_by_difficulty(difficulty: str | None) -> list[CICDTask]:
14
+ if not difficulty:
15
+ return get_all_tasks()
16
+
17
+ normalized = difficulty.strip().lower()
18
+ return [task for task in get_all_tasks() if task.difficulty.lower() == normalized]
19
+
20
+
21
+ def get_task_by_id(task_id: str | None) -> CICDTask | None:
22
+ if not task_id:
23
+ return None
24
+
25
+ for task in get_all_tasks():
26
+ if task.task_id == task_id:
27
+ return task
28
+ return None
29
+
30
+
31
+ __all__ = [
32
+ "CICDTask",
33
+ "get_all_tasks",
34
+ "get_task_by_id",
35
+ "get_tasks_by_difficulty",
36
+ ]
env/tasks/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.48 kB). View file
 
env/tasks/__pycache__/easy.cpython-313.pyc ADDED
Binary file (2.79 kB). View file
 
env/tasks/__pycache__/hard.cpython-313.pyc ADDED
Binary file (4.5 kB). View file
 
env/tasks/__pycache__/medium.cpython-313.pyc ADDED
Binary file (3.89 kB). View file
 
env/tasks/__pycache__/task_types.cpython-313.pyc ADDED
Binary file (1.05 kB). View file
 
env/tasks/easy.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from env.tasks.task_types import CICDTask
4
+
5
+
6
+ EASY_TASKS: list[CICDTask] = [
7
+ CICDTask(
8
+ task_id="easy-command-typo",
9
+ title="Fix test command typo",
10
+ description="A typo in the test command breaks the CI test stage.",
11
+ difficulty="easy",
12
+ failure_stage="test",
13
+ broken_config="""
14
+ name: CI
15
+ on: [push]
16
+ jobs:
17
+ build:
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ - run: npm ci
22
+ test:
23
+ runs-on: ubuntu-latest
24
+ needs: build
25
+ steps:
26
+ - uses: actions/checkout@v4
27
+ - run: npm tset
28
+ """.strip(),
29
+ expected_config="""
30
+ name: CI
31
+ on: [push]
32
+ jobs:
33
+ build:
34
+ runs-on: ubuntu-latest
35
+ steps:
36
+ - uses: actions/checkout@v4
37
+ - run: npm ci
38
+ test:
39
+ runs-on: ubuntu-latest
40
+ needs: build
41
+ steps:
42
+ - uses: actions/checkout@v4
43
+ - run: npm test
44
+ """.strip(),
45
+ logs="test stage failed: npm ERR! missing script: tset",
46
+ error_message="command not found: npm tset",
47
+ actual_bug="test step runs npm tset instead of npm test",
48
+ metadata={"broken_token": "npm tset", "fixed_token": "npm test"},
49
+ ),
50
+ CICDTask(
51
+ task_id="easy-missing-checkout",
52
+ title="Add missing checkout",
53
+ description="Build stage fails because repository checkout is missing.",
54
+ difficulty="easy",
55
+ failure_stage="build",
56
+ broken_config="""
57
+ name: CI
58
+ on: [push]
59
+ jobs:
60
+ build:
61
+ runs-on: ubuntu-latest
62
+ steps:
63
+ - run: npm ci
64
+ - run: npm run build
65
+ """.strip(),
66
+ expected_config="""
67
+ name: CI
68
+ on: [push]
69
+ jobs:
70
+ build:
71
+ runs-on: ubuntu-latest
72
+ steps:
73
+ - uses: actions/checkout@v4
74
+ - run: npm ci
75
+ - run: npm run build
76
+ """.strip(),
77
+ logs="build stage failed: package-lock.json not found in workspace",
78
+ error_message="missing checkout step before build commands",
79
+ actual_bug="repository checkout step was removed",
80
+ metadata={"broken_token": "", "fixed_token": "uses: actions/checkout@v4"},
81
+ ),
82
+ CICDTask(
83
+ task_id="easy-yaml-indentation",
84
+ title="Fix YAML indentation",
85
+ description="Pipeline config has malformed YAML indentation.",
86
+ difficulty="easy",
87
+ failure_stage="build",
88
+ broken_config="""
89
+ name: CI
90
+ on: [push]
91
+ jobs:
92
+ test:
93
+ runs-on: ubuntu-latest
94
+ steps:
95
+ - uses: actions/checkout@v4
96
+ - run: pytest
97
+ """.strip(),
98
+ expected_config="""
99
+ name: CI
100
+ on: [push]
101
+ jobs:
102
+ test:
103
+ runs-on: ubuntu-latest
104
+ steps:
105
+ - uses: actions/checkout@v4
106
+ - run: pytest
107
+ """.strip(),
108
+ logs="yaml parser error: while parsing a block mapping",
109
+ error_message="invalid YAML structure in workflow file",
110
+ actual_bug="test command list item is mis-indented",
111
+ metadata={},
112
+ ),
113
+ ]
env/tasks/hard.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from env.tasks.task_types import CICDTask
4
+
5
+
6
+ HARD_TASKS: list[CICDTask] = [
7
+ CICDTask(
8
+ task_id="hard-matrix-logic",
9
+ title="Fix matrix include-exclude logic",
10
+ description="Matrix includes unsupported versions and causes deterministic CI breakage.",
11
+ difficulty="hard",
12
+ failure_stage="test",
13
+ broken_config="""
14
+ name: CI
15
+ on: [push]
16
+ jobs:
17
+ test:
18
+ runs-on: ${{ matrix.os }}
19
+ strategy:
20
+ matrix:
21
+ os: [ubuntu-latest, windows-latest]
22
+ python-version: ["3.10", "3.11", "3.13"]
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: ${{ matrix.python-version }}
28
+ - run: pip install -r requirements.txt
29
+ - run: pytest -q
30
+ """.strip(),
31
+ expected_config="""
32
+ name: CI
33
+ on: [push]
34
+ jobs:
35
+ test:
36
+ runs-on: ${{ matrix.os }}
37
+ strategy:
38
+ matrix:
39
+ os: [ubuntu-latest, windows-latest]
40
+ python-version: ["3.10", "3.11", "3.13"]
41
+ exclude:
42
+ - os: windows-latest
43
+ python-version: "3.13"
44
+ steps:
45
+ - uses: actions/checkout@v4
46
+ - uses: actions/setup-python@v5
47
+ with:
48
+ python-version: ${{ matrix.python-version }}
49
+ - run: pip install -r requirements.txt
50
+ - run: pytest -q
51
+ """.strip(),
52
+ logs="test stage failed: wheel build unavailable for windows-latest + python 3.13",
53
+ error_message="matrix includes unsupported runtime combination",
54
+ actual_bug="matrix logic is missing an exclude for unstable runtime pair",
55
+ metadata={"broken_token": "python-version: [\"3.10\", \"3.11\", \"3.13\"]", "fixed_token": "exclude:"},
56
+ ),
57
+ CICDTask(
58
+ task_id="hard-conditional-deploy",
59
+ title="Repair deploy conditional",
60
+ description="Deploy job runs regardless of failed tests due to always() condition.",
61
+ difficulty="hard",
62
+ failure_stage="deploy",
63
+ broken_config="""
64
+ name: CI
65
+ on: [push]
66
+ jobs:
67
+ build:
68
+ runs-on: ubuntu-latest
69
+ steps:
70
+ - uses: actions/checkout@v4
71
+ - run: npm ci
72
+ - run: npm run build
73
+ test:
74
+ runs-on: ubuntu-latest
75
+ needs: build
76
+ steps:
77
+ - uses: actions/checkout@v4
78
+ - run: npm test
79
+ deploy:
80
+ runs-on: ubuntu-latest
81
+ needs: test
82
+ if: always()
83
+ steps:
84
+ - run: echo deploying
85
+ """.strip(),
86
+ expected_config="""
87
+ name: CI
88
+ on: [push]
89
+ jobs:
90
+ build:
91
+ runs-on: ubuntu-latest
92
+ steps:
93
+ - uses: actions/checkout@v4
94
+ - run: npm ci
95
+ - run: npm run build
96
+ test:
97
+ runs-on: ubuntu-latest
98
+ needs: build
99
+ steps:
100
+ - uses: actions/checkout@v4
101
+ - run: npm test
102
+ deploy:
103
+ runs-on: ubuntu-latest
104
+ needs: test
105
+ if: success() && github.ref == 'refs/heads/main'
106
+ steps:
107
+ - run: echo deploying
108
+ """.strip(),
109
+ logs="deploy stage triggered despite failing tests on non-main branch",
110
+ error_message="unsafe deploy condition bypasses quality gates",
111
+ actual_bug="deploy condition uses always() instead of guarded success check",
112
+ metadata={"broken_token": "if: always()", "fixed_token": "if: success() && github.ref == 'refs/heads/main'"},
113
+ ),
114
+ CICDTask(
115
+ task_id="hard-needs-order",
116
+ title="Fix job dependency ordering",
117
+ description="Deploy depends only on build and can run before tests complete.",
118
+ difficulty="hard",
119
+ failure_stage="deploy",
120
+ broken_config="""
121
+ name: CI
122
+ on: [push]
123
+ jobs:
124
+ build:
125
+ runs-on: ubuntu-latest
126
+ steps:
127
+ - uses: actions/checkout@v4
128
+ - run: npm ci
129
+ test:
130
+ runs-on: ubuntu-latest
131
+ needs: build
132
+ steps:
133
+ - uses: actions/checkout@v4
134
+ - run: npm test
135
+ deploy:
136
+ runs-on: ubuntu-latest
137
+ needs: build
138
+ steps:
139
+ - run: echo deploying package
140
+ """.strip(),
141
+ expected_config="""
142
+ name: CI
143
+ on: [push]
144
+ jobs:
145
+ build:
146
+ runs-on: ubuntu-latest
147
+ steps:
148
+ - uses: actions/checkout@v4
149
+ - run: npm ci
150
+ test:
151
+ runs-on: ubuntu-latest
152
+ needs: build
153
+ steps:
154
+ - uses: actions/checkout@v4
155
+ - run: npm test
156
+ deploy:
157
+ runs-on: ubuntu-latest
158
+ needs: [build, test]
159
+ steps:
160
+ - run: echo deploying package
161
+ """.strip(),
162
+ logs="deploy stage started before tests finished, causing regression release",
163
+ error_message="deploy dependency graph skips mandatory test gate",
164
+ actual_bug="deploy job does not depend on test job",
165
+ metadata={"broken_token": "needs: build", "fixed_token": "needs: [build, test]"},
166
+ ),
167
+ ]
env/tasks/medium.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from env.tasks.task_types import CICDTask
4
+
5
+
6
+ MEDIUM_TASKS: list[CICDTask] = [
7
+ CICDTask(
8
+ task_id="medium-python-version",
9
+ title="Align Python version",
10
+ description="Tests require Python 3.11 but workflow pins an older version.",
11
+ difficulty="medium",
12
+ failure_stage="build",
13
+ broken_config="""
14
+ name: CI
15
+ on: [push]
16
+ jobs:
17
+ test:
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ - uses: actions/setup-python@v5
22
+ with:
23
+ python-version: "3.8"
24
+ - run: pip install -r requirements.txt
25
+ - run: pytest -q
26
+ """.strip(),
27
+ expected_config="""
28
+ name: CI
29
+ on: [push]
30
+ jobs:
31
+ test:
32
+ runs-on: ubuntu-latest
33
+ steps:
34
+ - uses: actions/checkout@v4
35
+ - uses: actions/setup-python@v5
36
+ with:
37
+ python-version: "3.11"
38
+ - run: pip install -r requirements.txt
39
+ - run: pytest -q
40
+ """.strip(),
41
+ logs="build failed: package requires python>=3.11",
42
+ error_message="python interpreter version mismatch",
43
+ actual_bug="workflow pins python-version 3.8 while project requires 3.11",
44
+ metadata={"broken_token": 'python-version: "3.8"', "fixed_token": 'python-version: "3.11"'},
45
+ ),
46
+ CICDTask(
47
+ task_id="medium-cache-key",
48
+ title="Fix cache invalidation key",
49
+ description="Dependency cache key ignores lockfile hash and restores stale dependencies.",
50
+ difficulty="medium",
51
+ failure_stage="test",
52
+ broken_config="""
53
+ name: CI
54
+ on: [push]
55
+ jobs:
56
+ test:
57
+ runs-on: ubuntu-latest
58
+ steps:
59
+ - uses: actions/checkout@v4
60
+ - uses: actions/setup-node@v4
61
+ with:
62
+ node-version: 20
63
+ - uses: actions/cache@v4
64
+ with:
65
+ path: ~/.npm
66
+ key: node-modules-${{ runner.os }}
67
+ - run: npm ci
68
+ - run: npm test
69
+ """.strip(),
70
+ expected_config="""
71
+ name: CI
72
+ on: [push]
73
+ jobs:
74
+ test:
75
+ runs-on: ubuntu-latest
76
+ steps:
77
+ - uses: actions/checkout@v4
78
+ - uses: actions/setup-node@v4
79
+ with:
80
+ node-version: 20
81
+ - uses: actions/cache@v4
82
+ with:
83
+ path: ~/.npm
84
+ key: node-modules-${{ runner.os }}-${{ hashFiles('**/package-lock.json') }}
85
+ - run: npm ci
86
+ - run: npm test
87
+ """.strip(),
88
+ logs="test failed: stale cache restored old dependency tree",
89
+ error_message="cache key misses lockfile fingerprint",
90
+ actual_bug="cache key is too broad and never invalidates on dependency changes",
91
+ metadata={"broken_token": "key: node-modules-${{ runner.os }}", "fixed_token": "hashFiles('**/package-lock.json')"},
92
+ ),
93
+ CICDTask(
94
+ task_id="medium-artifact-permissions",
95
+ title="Repair artifact permissions",
96
+ description="Artifact upload fails due to insufficient token permissions.",
97
+ difficulty="medium",
98
+ failure_stage="deploy",
99
+ broken_config="""
100
+ name: CI
101
+ on: [push]
102
+ permissions:
103
+ contents: read
104
+ jobs:
105
+ build:
106
+ runs-on: ubuntu-latest
107
+ steps:
108
+ - uses: actions/checkout@v4
109
+ - run: npm ci
110
+ - run: npm run build
111
+ - uses: actions/upload-artifact@v4
112
+ with:
113
+ name: web-build
114
+ path: dist/
115
+ """.strip(),
116
+ expected_config="""
117
+ name: CI
118
+ on: [push]
119
+ permissions:
120
+ contents: read
121
+ actions: write
122
+ jobs:
123
+ build:
124
+ runs-on: ubuntu-latest
125
+ steps:
126
+ - uses: actions/checkout@v4
127
+ - run: npm ci
128
+ - run: npm run build
129
+ - uses: actions/upload-artifact@v4
130
+ with:
131
+ name: web-build
132
+ path: dist/
133
+ """.strip(),
134
+ logs="deploy stage failed: Resource not accessible by integration",
135
+ error_message="insufficient permissions for upload-artifact",
136
+ actual_bug="actions:write permission missing from workflow permissions",
137
+ metadata={"broken_token": "permissions:\n contents: read", "fixed_token": "actions: write"},
138
+ ),
139
+ ]
env/tasks/task_types.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+
7
+ @dataclass(frozen=True)
8
+ class CICDTask:
9
+ """Represents a single CI/CD debugging scenario."""
10
+
11
+ task_id: str
12
+ title: str
13
+ description: str
14
+ difficulty: str
15
+ failure_stage: str
16
+ broken_config: str
17
+ expected_config: str
18
+ logs: str
19
+ error_message: str
20
+ actual_bug: str
21
+ metadata: dict[str, Any] = field(default_factory=dict)