Krishna1107 committed on
Commit
a7caaff
·
1 Parent(s): 804f70e

fixing old codes

Browse files
baseline_runner.py CHANGED
@@ -1,11 +1,8 @@
1
- """Baseline runner for the /baseline endpoint.
2
 
3
- Runs episodes using a simple heuristic agent (no LLM required).
4
- The heuristic agent applies expected_fixes directly to demonstrate
5
- that the environment and grader work correctly end-to-end.
6
  """
7
 
8
- from __future__ import annotations
9
 
10
  from typing import List, Optional
11
 
 
1
+ """Heuristic baseline runner for the /baseline endpoint.
2
 
3
+ Applies expected_fixes directly to verify the environment + grader work e2e.
 
 
4
  """
5
 
 
6
 
7
  from typing import List, Optional
8
 
inference.py CHANGED
@@ -10,7 +10,6 @@ Usage:
10
  python inference.py
11
  """
12
 
13
- from __future__ import annotations
14
 
15
  import json
16
  import os
@@ -22,7 +21,6 @@ from typing import Any, Dict, List, Optional
22
  import requests
23
  from openai import OpenAI
24
 
25
- # ── Configuration ─────────────────────────────────────────────────
26
 
27
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
28
  MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-70B-Instruct")
@@ -183,7 +181,6 @@ def run_episode(client: OpenAI, task_id: Optional[str] = None, scenario_id: Opti
183
  actual_task_id = info.get("task_id", task_id or "unknown")
184
  actual_scenario_id = info.get("scenario_id", scenario_id or "unknown")
185
 
186
- # ── [START] structured log ──
187
  print(f"[START] task_id={actual_task_id} scenario_id={actual_scenario_id}")
188
 
189
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
@@ -221,7 +218,6 @@ def run_episode(client: OpenAI, task_id: Optional[str] = None, scenario_id: Opti
221
  issues_fixed = step_info.get("issues_fixed", 0)
222
  issues_total = step_info.get("issues_total", 0)
223
 
224
- # ── [STEP] structured log ──
225
  print(f"[STEP] step={total_steps} action={action['action_type']} reward={reward:.2f} done={str(done).lower()} issues_fixed={issues_fixed} issues_total={issues_total}")
226
 
227
  trajectory.append({
@@ -243,7 +239,6 @@ def run_episode(client: OpenAI, task_id: Optional[str] = None, scenario_id: Opti
243
  result = grade_resp.get("result", {})
244
  score = result.get("score", 0.0)
245
 
246
- # ── [END] structured log ──
247
  print(f"[END] task_id={actual_task_id} scenario_id={actual_scenario_id} score={score:.3f} steps={total_steps}")
248
  return result
249
 
 
10
  python inference.py
11
  """
12
 
 
13
 
14
  import json
15
  import os
 
21
  import requests
22
  from openai import OpenAI
23
 
 
24
 
25
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
26
  MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-70B-Instruct")
 
181
  actual_task_id = info.get("task_id", task_id or "unknown")
182
  actual_scenario_id = info.get("scenario_id", scenario_id or "unknown")
183
 
 
184
  print(f"[START] task_id={actual_task_id} scenario_id={actual_scenario_id}")
185
 
186
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
 
218
  issues_fixed = step_info.get("issues_fixed", 0)
219
  issues_total = step_info.get("issues_total", 0)
220
 
 
221
  print(f"[STEP] step={total_steps} action={action['action_type']} reward={reward:.2f} done={str(done).lower()} issues_fixed={issues_fixed} issues_total={issues_total}")
222
 
223
  trajectory.append({
 
239
  result = grade_resp.get("result", {})
240
  score = result.get("score", 0.0)
241
 
 
242
  print(f"[END] task_id={actual_task_id} scenario_id={actual_scenario_id} score={score:.3f} steps={total_steps}")
243
  return result
244
 
server/environment.py CHANGED
@@ -1,6 +1,4 @@
1
- """Core environment loop for Day 1-2 foundation."""
2
-
3
- from __future__ import annotations
4
 
5
  import copy
6
  import random
 
1
+ """Core environment logic."""
 
 
2
 
3
  import copy
4
  import random
server/graders/__init__.py CHANGED
@@ -1,21 +1,13 @@
1
  """Deterministic grader for trajectory scoring.
2
 
3
- Scoring breakdown (matches CONTEXT.md):
4
- - Partial fixes: 40% proportional to fix ratio
5
- - Complete solution bonus: 30% if ALL issues fixed
6
- - Efficiency: 20% max, decays with extra steps
7
- - Hint penalty: -5% per hint used
8
- - Failed action penalty: -2% per failed edit (no valid edits)
9
-
10
- Score examples (2-bug scenario):
11
Fix 1/2 → ~0.40
12
Fix 2/2 (slow) → ~0.85
13
Fix 2/2 (fast) → ~1.0
14
- 2 hints used β†’ -0.10
15
  """
16
 
17
- from __future__ import annotations
18
-
19
  from typing import Any, Dict, List
20
 
21
  from server.models import GraderResult
 
1
  """Deterministic grader for trajectory scoring.
2
 
3
+ Scoring weights:
4
+ partial fixes 40% (proportional to fix ratio)
5
+ complete bonus 30% (all issues fixed)
6
+ efficiency 30% (decays with extra steps)
7
+ hint penalty -5% each
8
+ failed edit -2% each
 
 
 
 
 
 
9
  """
10
 
 
 
11
  from typing import Any, Dict, List
12
 
13
  from server.models import GraderResult
server/graders/base.py DELETED
@@ -1,101 +0,0 @@
1
- """Base grader interface with shared scoring utilities.
2
-
3
- The concrete default grader lives in ``server.graders.__init__``.
4
- This module provides a class-based interface for task-specific overrides.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- from typing import Any, Dict, List
10
-
11
- from server.models import GraderResult
12
-
13
-
14
- class BaseGrader:
15
- """Base class for task graders.
16
-
17
- Subclass and override ``grade()`` for task-specific scoring.
18
- The default pipeline in ``server.graders.__init__.run_grader``
19
- works for all tasks without subclassing.
20
- """
21
-
22
- PARTIAL_FIX_WEIGHT: float = 0.40
23
- COMPLETE_BONUS: float = 0.30
24
- EFFICIENCY_MAX: float = 0.30
25
- EFFICIENCY_DECAY: float = 0.03
26
- HINT_PENALTY_EACH: float = 0.05
27
- FAILED_ACTION_PENALTY: float = 0.02
28
-
29
- EDIT_ACTION_TYPES = frozenset({
30
- "edit_file", "replace_line", "add_line",
31
- "delete_line", "add_block", "delete_block",
32
- })
33
-
34
- def grade(self, task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
35
- return self.compute_score(task_id, trajectory)
36
-
37
- def compute_score(self, task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
38
- if not trajectory:
39
- return GraderResult(
40
- task_id=task_id,
41
- score=0.0,
42
- breakdown={"partial_fixes": 0.0, "complete_solution": 0.0, "efficiency": 0.0, "hint_penalty": 0.0},
43
- feedback="No actions taken",
44
- steps_taken=0,
45
- hints_used=0,
46
- )
47
-
48
- final_step = trajectory[-1]
49
- steps_taken = len(trajectory)
50
- hints_used = self._count_hints(trajectory)
51
-
52
- issues_fixed = int(final_step.get("info", {}).get("issues_fixed", 0))
53
- issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
54
- fix_ratio = issues_fixed / issues_total
55
-
56
- partial_score = self.PARTIAL_FIX_WEIGHT * fix_ratio
57
- complete_bonus = self.COMPLETE_BONUS if issues_fixed == issues_total else 0.0
58
- efficiency = self._efficiency_score(steps_taken, issues_total, issues_fixed)
59
- hint_pen = self.HINT_PENALTY_EACH * hints_used
60
-
61
- score = max(0.0, min(1.0, partial_score + complete_bonus + efficiency - hint_pen))
62
-
63
- return GraderResult(
64
- task_id=task_id,
65
- score=round(score, 3),
66
- breakdown={
67
- "partial_fixes": round(partial_score, 3),
68
- "complete_solution": round(complete_bonus, 3),
69
- "efficiency": round(efficiency, 3),
70
- "hint_penalty": round(-hint_pen, 3),
71
- },
72
- feedback=self._feedback_message(score),
73
- steps_taken=steps_taken,
74
- hints_used=hints_used,
75
- )
76
-
77
- @staticmethod
78
- def _count_hints(trajectory: List[Dict[str, Any]]) -> int:
79
- return sum(
80
- 1 for step in trajectory
81
- if step.get("action", {}).get("action_type") == "request_hint"
82
- )
83
-
84
- def _efficiency_score(self, steps_taken: int, issues_total: int, issues_fixed: int = 1) -> float:
85
- if issues_fixed == 0:
86
- return 0.0
87
- if steps_taken <= issues_total:
88
- return self.EFFICIENCY_MAX
89
- return max(0.0, self.EFFICIENCY_MAX - self.EFFICIENCY_DECAY * (steps_taken - issues_total))
90
-
91
- @staticmethod
92
- def _feedback_message(score: float) -> str:
93
- if score >= 0.9:
94
- return "Excellent! All issues fixed efficiently."
95
- if score >= 0.7:
96
- return "Good job! Most issues fixed."
97
- if score >= 0.5:
98
- return "Partial success. Some issues remain."
99
- if score >= 0.3:
100
- return "Limited progress. Review the error messages carefully."
101
- return "Needs improvement. Try analyzing the error phase first."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/main.py CHANGED
@@ -1,6 +1,4 @@
1
- """FastAPI entrypoint implementing required environment endpoints."""
2
-
3
- from __future__ import annotations
4
 
5
  from pathlib import Path
6
  from typing import Optional
 
1
+ """FastAPI server for the CI/CD Debug Environment."""
 
 
2
 
3
  from pathlib import Path
4
  from typing import Optional
server/models.py CHANGED
@@ -1,6 +1,4 @@
1
- """Typed Pydantic models for API and environment state."""
2
-
3
- from __future__ import annotations
4
 
5
  from enum import Enum
6
  from typing import Any, Dict, List, Optional, Union
@@ -43,18 +41,18 @@ class ErrorPhase(str, Enum):
43
 
44
 
45
  class FileContent(BaseModel):
46
- path: str = Field(..., description="File path")
47
- content: str = Field(..., description="Current file content")
48
- file_type: FileType = Field(..., description="Type of file")
49
- line_count: int = Field(..., description="Number of lines")
50
 
51
 
52
  class ErrorInfo(BaseModel):
53
- phase: ErrorPhase = Field(..., description="Failed phase")
54
- error_message: str = Field(..., description="Error output")
55
- exit_code: Optional[int] = Field(None, description="Exit code")
56
- failed_step: Optional[str] = Field(None, description="Failed step")
57
- line_hint: Optional[int] = Field(None, description="Suggested line")
58
 
59
 
60
  class Observation(BaseModel):
@@ -89,12 +87,6 @@ class Action(BaseModel):
89
  reasoning: Optional[str] = None
90
 
91
 
92
- class StepResult(BaseModel):
93
- observation: Observation
94
- reward: float = Field(..., ge=-1.0, le=2.0)
95
- done: bool
96
- info: Dict[str, Any] = Field(default_factory=dict)
97
-
98
 
99
  class TaskInfo(BaseModel):
100
  id: str
 
1
+ """Pydantic models for the API."""
 
 
2
 
3
  from enum import Enum
4
  from typing import Any, Dict, List, Optional, Union
 
41
 
42
 
43
  class FileContent(BaseModel):
44
+ path: str
45
+ content: str
46
+ file_type: FileType
47
+ line_count: int
48
 
49
 
50
  class ErrorInfo(BaseModel):
51
+ phase: ErrorPhase
52
+ error_message: str
53
+ exit_code: Optional[int] = None
54
+ failed_step: Optional[str] = None
55
+ line_hint: Optional[int] = None
56
 
57
 
58
  class Observation(BaseModel):
 
87
  reasoning: Optional[str] = None
88
 
89
 
 
 
 
 
 
 
90
 
91
  class TaskInfo(BaseModel):
92
  id: str
server/simulators/docker_simulator.py CHANGED
@@ -1,6 +1,4 @@
1
- """Docker build/run simulator with deterministic rule-based validation."""
2
-
3
- from __future__ import annotations
4
 
5
  from typing import Dict, List, Optional, Set
6
 
@@ -70,7 +68,7 @@ class DockerSimulator:
70
  if not active_lines:
71
  return {"build_success": False, "run_success": False, "error": "Dockerfile is empty"}
72
 
73
- # --- ARG before FROM is allowed, but first non-ARG instruction must be FROM ---
74
  first_non_arg = None
75
  for line in active_lines:
76
  token = line.split()[0].upper()
@@ -86,7 +84,7 @@ class DockerSimulator:
86
  "error": "Dockerfile must start with FROM",
87
  }
88
 
89
- # --- Instruction validation ---
90
  for idx, raw in enumerate(active_lines, start=1):
91
  token = raw.split()[0].upper()
92
  # Handle --platform= prefix on FROM
@@ -110,7 +108,7 @@ class DockerSimulator:
110
  "line": idx,
111
  }
112
 
113
- # --- Invalid base image tags ---
114
  if "FROM python:3.9-slimm" in content:
115
  return {
116
  "build_success": False,
@@ -118,7 +116,7 @@ class DockerSimulator:
118
  "error": "pull access denied for python:3.9-slimm",
119
  }
120
 
121
- # --- Typo in requirements filename ---
122
  if "requirments.txt" in content:
123
  return {
124
  "build_success": False,
@@ -126,7 +124,7 @@ class DockerSimulator:
126
  "error": "COPY failed: file not found in build context: requirments.txt",
127
  }
128
 
129
- # --- COPY source validation ---
130
  for raw in active_lines:
131
  upper = raw.upper()
132
  if upper.startswith("COPY "):
@@ -149,7 +147,7 @@ class DockerSimulator:
149
  "error": f"COPY failed: file not found in build context: {src}",
150
  }
151
 
152
- # --- Platform ARG declarations ---
153
  if "--platform=$BUILDPLATFORM" in content and "ARG BUILDPLATFORM" not in content:
154
  return {
155
  "build_success": False,
@@ -163,7 +161,7 @@ class DockerSimulator:
163
  "error": "failed to parse platform: TARGETPLATFORM not declared",
164
  }
165
 
166
- # --- Multi-stage artifact path mismatch (dist vs build) ---
167
  if "COPY --from=builder /app/dist" in content:
168
  pkg = context_files.get("package.json")
169
  if pkg and "react-scripts build" in pkg.content:
@@ -173,7 +171,7 @@ class DockerSimulator:
173
  "error": "COPY failed: stat app/dist: file does not exist",
174
  }
175
 
176
- # --- EXPOSE string validation ---
177
  for raw in active_lines:
178
  upper = raw.upper()
179
  if upper.startswith("EXPOSE "):
@@ -188,11 +186,11 @@ class DockerSimulator:
188
  "error": f"EXPOSE requires numeric port or port/protocol, got: {cleaned}",
189
  }
190
 
191
- # =====================================================
192
- # Runtime checks (build succeeds, run may fail)
193
- # =====================================================
194
 
195
- # --- Missing WORKDIR causing module resolution failures ---
196
  has_workdir = "WORKDIR" in content
197
  if ("npm start" in content or 'CMD ["npm", "start"]' in content) and not has_workdir:
198
  return {
@@ -201,7 +199,7 @@ class DockerSimulator:
201
  "run_error": "Error: Cannot find module '/package.json'",
202
  }
203
 
204
- # --- ENTRYPOINT + identical CMD conflict ---
205
  if 'ENTRYPOINT ["python"' in content and 'CMD ["python"' in content:
206
  return {
207
  "build_success": True,
@@ -209,7 +207,7 @@ class DockerSimulator:
209
  "run_error": "container exits immediately; ENTRYPOINT and CMD both specify full command",
210
  }
211
 
212
- # --- Entrypoint script not executable ---
213
  if 'ENTRYPOINT ["./start.sh"]' in content and "chmod +x" not in content:
214
  return {
215
  "build_success": True,
@@ -217,8 +215,7 @@ class DockerSimulator:
217
  "run_error": "exec ./start.sh: permission denied",
218
  }
219
 
220
- # --- Missing required ENV variable (DATABASE_URL) ---
221
- # Check if the scenario error mentions DATABASE_URL (via context files or content)
222
  has_database_url_env = "ENV DATABASE_URL" in content
223
  needs_database_url = (
224
  "app.py" in content
@@ -232,7 +229,7 @@ class DockerSimulator:
232
  "run_error": "KeyError: 'DATABASE_URL' — Application requires DATABASE_URL environment variable",
233
  }
234
 
235
- # --- Non-root user binding to privileged port ---
236
  has_user_switch = False
237
  expose_port = None
238
  for raw in active_lines:
 
1
+ """Docker build/run simulator — deterministic, rule-based."""
 
 
2
 
3
  from typing import Dict, List, Optional, Set
4
 
 
68
  if not active_lines:
69
  return {"build_success": False, "run_success": False, "error": "Dockerfile is empty"}
70
 
71
+ # ARG before FROM is fine, but the first real instruction must be FROM
72
  first_non_arg = None
73
  for line in active_lines:
74
  token = line.split()[0].upper()
 
84
  "error": "Dockerfile must start with FROM",
85
  }
86
 
87
+ # validate instructions
88
  for idx, raw in enumerate(active_lines, start=1):
89
  token = raw.split()[0].upper()
90
  # Handle --platform= prefix on FROM
 
108
  "line": idx,
109
  }
110
 
111
+ # known-bad base image tags
112
  if "FROM python:3.9-slimm" in content:
113
  return {
114
  "build_success": False,
 
116
  "error": "pull access denied for python:3.9-slimm",
117
  }
118
 
119
+ # typo in requirements filename
120
  if "requirments.txt" in content:
121
  return {
122
  "build_success": False,
 
124
  "error": "COPY failed: file not found in build context: requirments.txt",
125
  }
126
 
127
+ # COPY source must exist in build context
128
  for raw in active_lines:
129
  upper = raw.upper()
130
  if upper.startswith("COPY "):
 
147
  "error": f"COPY failed: file not found in build context: {src}",
148
  }
149
 
150
+ # platform ARGs need to be declared
151
  if "--platform=$BUILDPLATFORM" in content and "ARG BUILDPLATFORM" not in content:
152
  return {
153
  "build_success": False,
 
161
  "error": "failed to parse platform: TARGETPLATFORM not declared",
162
  }
163
 
164
+ # multi-stage: output dir mismatch (dist vs build)
165
  if "COPY --from=builder /app/dist" in content:
166
  pkg = context_files.get("package.json")
167
  if pkg and "react-scripts build" in pkg.content:
 
171
  "error": "COPY failed: stat app/dist: file does not exist",
172
  }
173
 
174
+ # EXPOSE must have a numeric port
175
  for raw in active_lines:
176
  upper = raw.upper()
177
  if upper.startswith("EXPOSE "):
 
186
  "error": f"EXPOSE requires numeric port or port/protocol, got: {cleaned}",
187
  }
188
 
189
+ # ============================
190
+ # runtime checks (build OK, run might fail)
191
+ # ============================
192
 
193
+ # no WORKDIR → module resolution fails
194
  has_workdir = "WORKDIR" in content
195
  if ("npm start" in content or 'CMD ["npm", "start"]' in content) and not has_workdir:
196
  return {
 
199
  "run_error": "Error: Cannot find module '/package.json'",
200
  }
201
 
202
+ # ENTRYPOINT + CMD both specify python → conflict
203
  if 'ENTRYPOINT ["python"' in content and 'CMD ["python"' in content:
204
  return {
205
  "build_success": True,
 
207
  "run_error": "container exits immediately; ENTRYPOINT and CMD both specify full command",
208
  }
209
 
210
+ # entrypoint script needs chmod +x
211
  if 'ENTRYPOINT ["./start.sh"]' in content and "chmod +x" not in content:
212
  return {
213
  "build_success": True,
 
215
  "run_error": "exec ./start.sh: permission denied",
216
  }
217
 
218
+ # DATABASE_URL env var missing
 
219
  has_database_url_env = "ENV DATABASE_URL" in content
220
  needs_database_url = (
221
  "app.py" in content
 
229
  "run_error": "KeyError: 'DATABASE_URL' — Application requires DATABASE_URL environment variable",
230
  }
231
 
232
+ # non-root user can't bind privileged ports
233
  has_user_switch = False
234
  expose_port = None
235
  for raw in active_lines:
server/simulators/workflow_simulator.py CHANGED
@@ -1,6 +1,4 @@
1
- """Workflow simulator with YAML parse and deterministic CI rule checks."""
2
-
3
- from __future__ import annotations
4
 
5
  import re
6
  from typing import Any, Dict, List, Optional
@@ -17,7 +15,7 @@ class WorkflowSimulator:
17
 
18
  content = workflow.content
19
 
20
- # --- Single-brace expression check (${ } instead of ${{ }}) ---
21
  # Match ${ ... } that is NOT ${{ ... }}
22
  single_brace = re.findall(r'\$\{(?!\{)\s*[^}]+\}', content)
23
  if single_brace:
@@ -30,7 +28,7 @@ class WorkflowSimulator:
30
  ),
31
  }
32
 
33
- # --- YAML parse ---
34
  try:
35
  parsed = yaml.safe_load(content)
36
  except yaml.YAMLError as exc:
@@ -47,7 +45,7 @@ class WorkflowSimulator:
47
  "error": "Workflow root must be a mapping",
48
  }
49
 
50
- # --- Missing 'on' trigger ---
51
  if "on" not in parsed and True not in parsed:
52
  # yaml.safe_load converts `on:` to True key in some contexts
53
  return {
@@ -56,7 +54,7 @@ class WorkflowSimulator:
56
  "error": "Workflow must define an 'on' trigger event",
57
  }
58
 
59
- # --- Validate 'on' trigger structure ---
60
  on_value = parsed.get("on") or parsed.get(True)
61
  if isinstance(on_value, dict):
62
  for event_key, event_config in on_value.items():
@@ -73,7 +71,7 @@ class WorkflowSimulator:
73
  ),
74
  }
75
 
76
- # --- Jobs validation ---
77
  jobs = parsed.get("jobs")
78
  if not isinstance(jobs, dict) or not jobs:
79
  return {
@@ -98,7 +96,7 @@ class WorkflowSimulator:
98
  if not isinstance(job, dict):
99
  continue
100
 
101
- # --- Missing runs-on ---
102
  if "runs-on" not in job:
103
  return {
104
  "parse_success": False,
@@ -106,7 +104,7 @@ class WorkflowSimulator:
106
  "error": f"Job '{job_name}' is missing required field 'runs-on'",
107
  }
108
 
109
- # --- Validate 'needs' references ---
110
  needs = job.get("needs")
111
  if needs:
112
  needed = [needs] if isinstance(needs, str) else (needs if isinstance(needs, list) else [])
@@ -126,7 +124,7 @@ class WorkflowSimulator:
126
  "error": f"Job '{job_name}' steps must be a list",
127
  }
128
 
129
- # --- Validate each step has 'uses' or 'run' ---
130
  for step in steps:
131
  if not isinstance(step, dict):
132
  continue
@@ -140,7 +138,7 @@ class WorkflowSimulator:
140
  "error": f"Every step must define a 'uses' or 'run' key. Step '{step_name}' has neither.",
141
  }
142
 
143
- # --- Checkout before build order ---
144
  checkout_index = -1
145
  build_index = -1
146
  for idx, step in enumerate(steps):
@@ -162,7 +160,7 @@ class WorkflowSimulator:
162
  "exec_error": "Checkout must happen before Docker build steps",
163
  }
164
 
165
- # --- Cross-job artifact dependency check ---
166
  # If a job uses download-artifact but doesn't declare needs on the upload job
167
  for job_name, job in jobs.items():
168
  if not isinstance(job, dict):
@@ -186,7 +184,7 @@ class WorkflowSimulator:
186
  ),
187
  }
188
 
189
- # --- Docker login with secrets not wired via env ---
190
  if has_docker_login:
191
  # Check if the login step has env block with secrets
192
  login_has_env_secrets = has_username_secret and has_password_secret
@@ -199,7 +197,7 @@ class WorkflowSimulator:
199
  "exec_error": "Docker login secrets not wired — add env block with secrets.DOCKER_USERNAME and secrets.DOCKER_PASSWORD",
200
  }
201
 
202
- # --- Push without login ---
203
  if has_docker_push and not has_docker_login:
204
  # Check if using docker/login-action instead
205
  has_login_action = "docker/login-action" in content
@@ -210,7 +208,7 @@ class WorkflowSimulator:
210
  "exec_error": "Docker push without login — add a docker login step before pushing",
211
  }
212
 
213
- # --- GHCR login with wrong credentials ---
214
  if "docker login ghcr.io" in content:
215
  if has_password_secret and not has_github_token_secret:
216
  return {
@@ -219,7 +217,7 @@ class WorkflowSimulator:
219
  "exec_error": "GHCR requires GITHUB_TOKEN for authentication, not DOCKER_PASSWORD",
220
  }
221
 
222
- # --- Missing permissions for GHCR push ---
223
  if "ghcr.io" in content and "docker push" in content:
224
  # Check if permissions block has packages: write
225
  if "packages: write" not in content and "packages:write" not in content:
@@ -229,7 +227,7 @@ class WorkflowSimulator:
229
  "exec_error": "GITHUB_TOKEN does not have packages:write permission — add permissions block",
230
  }
231
 
232
- # --- Multi-platform without buildx ---
233
  if has_platforms and not has_buildx_setup:
234
  return {
235
  "parse_success": True,
@@ -237,7 +235,7 @@ class WorkflowSimulator:
237
  "exec_error": "Multi-platform build requires docker/setup-buildx-action",
238
  }
239
 
240
- # --- Cache export without buildx driver ---
241
  if "cache-to:" in content and "cache-from:" in content:
242
  # Check for mode=max
243
  if "cache-to: type=gha" in content and "mode=max" not in content:
@@ -247,7 +245,7 @@ class WorkflowSimulator:
247
  "exec_error": "GHA cache export needs mode=max for proper cache support",
248
  }
249
 
250
- # --- Build context / Dockerfile path mismatch ---
251
  for job_name, job in jobs.items():
252
  if not isinstance(job, dict):
253
  continue
@@ -268,7 +266,7 @@ class WorkflowSimulator:
268
  "exec_error": f"Dockerfile path '{file_path}' does not match build context '{context}'",
269
  }
270
 
271
- # --- Secret referenced in run but not mapped via env block ---
272
  for job_name, job in jobs.items():
273
  if not isinstance(job, dict):
274
  continue
@@ -296,7 +294,7 @@ class WorkflowSimulator:
296
  "exec_error": f"{var} is empty — secret not available in shell environment. Map it via env block.",
297
  }
298
 
299
- # --- Matrix: Node version incompatibility check ---
300
  for job_name, job in jobs.items():
301
  if not isinstance(job, dict):
302
  continue
 
1
+ """Workflow simulator — YAML parse + CI rule checks."""
 
 
2
 
3
  import re
4
  from typing import Any, Dict, List, Optional
 
15
 
16
  content = workflow.content
17
 
18
+ # single-brace expressions: ${ } should be ${{ }}
19
  # Match ${ ... } that is NOT ${{ ... }}
20
  single_brace = re.findall(r'\$\{(?!\{)\s*[^}]+\}', content)
21
  if single_brace:
 
28
  ),
29
  }
30
 
31
+ # parse yaml
32
  try:
33
  parsed = yaml.safe_load(content)
34
  except yaml.YAMLError as exc:
 
45
  "error": "Workflow root must be a mapping",
46
  }
47
 
48
+ # needs an 'on' trigger
49
  if "on" not in parsed and True not in parsed:
50
  # yaml.safe_load converts `on:` to True key in some contexts
51
  return {
 
54
  "error": "Workflow must define an 'on' trigger event",
55
  }
56
 
57
+ # validate trigger structure
58
  on_value = parsed.get("on") or parsed.get(True)
59
  if isinstance(on_value, dict):
60
  for event_key, event_config in on_value.items():
 
71
  ),
72
  }
73
 
74
+ # jobs block
75
  jobs = parsed.get("jobs")
76
  if not isinstance(jobs, dict) or not jobs:
77
  return {
 
96
  if not isinstance(job, dict):
97
  continue
98
 
99
+ # runs-on is required
100
  if "runs-on" not in job:
101
  return {
102
  "parse_success": False,
 
104
  "error": f"Job '{job_name}' is missing required field 'runs-on'",
105
  }
106
 
107
+ # check 'needs' refs point to real jobs
108
  needs = job.get("needs")
109
  if needs:
110
  needed = [needs] if isinstance(needs, str) else (needs if isinstance(needs, list) else [])
 
124
  "error": f"Job '{job_name}' steps must be a list",
125
  }
126
 
127
+ # every step needs 'uses' or 'run'
128
  for step in steps:
129
  if not isinstance(step, dict):
130
  continue
 
138
  "error": f"Every step must define a 'uses' or 'run' key. Step '{step_name}' has neither.",
139
  }
140
 
141
+ # checkout must come before docker build
142
  checkout_index = -1
143
  build_index = -1
144
  for idx, step in enumerate(steps):
 
160
  "exec_error": "Checkout must happen before Docker build steps",
161
  }
162
 
163
+ # cross-job artifact dependency: download needs 'needs'
164
  # If a job uses download-artifact but doesn't declare needs on the upload job
165
  for job_name, job in jobs.items():
166
  if not isinstance(job, dict):
 
184
  ),
185
  }
186
 
187
+ # docker login needs secrets wired via env
188
  if has_docker_login:
189
  # Check if the login step has env block with secrets
190
  login_has_env_secrets = has_username_secret and has_password_secret
 
197
  "exec_error": "Docker login secrets not wired — add env block with secrets.DOCKER_USERNAME and secrets.DOCKER_PASSWORD",
198
  }
199
 
200
+ # push without login
201
  if has_docker_push and not has_docker_login:
202
  # Check if using docker/login-action instead
203
  has_login_action = "docker/login-action" in content
 
208
  "exec_error": "Docker push without login — add a docker login step before pushing",
209
  }
210
 
211
+ # ghcr.io needs GITHUB_TOKEN not DOCKER_PASSWORD
212
  if "docker login ghcr.io" in content:
213
  if has_password_secret and not has_github_token_secret:
214
  return {
 
217
  "exec_error": "GHCR requires GITHUB_TOKEN for authentication, not DOCKER_PASSWORD",
218
  }
219
 
220
+ # ghcr push needs packages:write permission
221
  if "ghcr.io" in content and "docker push" in content:
222
  # Check if permissions block has packages: write
223
  if "packages: write" not in content and "packages:write" not in content:
 
227
  "exec_error": "GITHUB_TOKEN does not have packages:write permission — add permissions block",
228
  }
229
 
230
+ # multi-platform needs buildx
231
  if has_platforms and not has_buildx_setup:
232
  return {
233
  "parse_success": True,
 
235
  "exec_error": "Multi-platform build requires docker/setup-buildx-action",
236
  }
237
 
238
+ # GHA cache export needs mode=max
239
  if "cache-to:" in content and "cache-from:" in content:
240
  # Check for mode=max
241
  if "cache-to: type=gha" in content and "mode=max" not in content:
 
245
  "exec_error": "GHA cache export needs mode=max for proper cache support",
246
  }
247
 
248
+ # context vs dockerfile path mismatch
249
  for job_name, job in jobs.items():
250
  if not isinstance(job, dict):
251
  continue
 
266
  "exec_error": f"Dockerfile path '{file_path}' does not match build context '{context}'",
267
  }
268
 
269
+ # shell env var from secret but not mapped in env block
270
  for job_name, job in jobs.items():
271
  if not isinstance(job, dict):
272
  continue
 
294
  "exec_error": f"{var} is empty — secret not available in shell environment. Map it via env block.",
295
  }
296
 
297
+ # node version vs package.json engines
298
  for job_name, job in jobs.items():
299
  if not isinstance(job, dict):
300
  continue
server/tasks/base.py CHANGED
@@ -1,6 +1,4 @@
1
- """Base task class for scenario-based tasks."""
2
-
3
- from __future__ import annotations
4
 
5
  import random
6
  from typing import Dict, Optional
 
1
+ """Base task class."""
 
 
2
 
3
  import random
4
  from typing import Dict, Optional
server/tasks/task_1_build_errors.py CHANGED
@@ -5,7 +5,7 @@ typos in filenames, invalid base image tags, bad RUN syntax,
5
  quoted EXPOSE values, missing FROM instruction.
6
  """
7
 
8
- from __future__ import annotations
9
 
10
  from server.models import TaskDifficulty
11
  from server.tasks.base import BaseTask
 
5
  quoted EXPOSE values, missing FROM instruction.
6
  """
7
 
8
+
9
 
10
  from server.models import TaskDifficulty
11
  from server.tasks.base import BaseTask
server/tasks/task_2_docker_runtime.py CHANGED
@@ -5,7 +5,7 @@ runtime: missing WORKDIR, CMD/ENTRYPOINT conflicts, permission issues,
5
  and missing environment variables.
6
  """
7
 
8
- from __future__ import annotations
9
 
10
  from server.models import TaskDifficulty
11
  from server.tasks.base import BaseTask
 
5
  and missing environment variables.
6
  """
7
 
8
+
9
 
10
  from server.models import TaskDifficulty
11
  from server.tasks.base import BaseTask
server/tasks/task_3_workflow_syntax.py CHANGED
@@ -5,7 +5,7 @@ step ordering, missing runs-on, invalid triggers, duplicate job IDs,
5
  and missing 'on' trigger.
6
  """
7
 
8
- from __future__ import annotations
9
 
10
  from server.models import TaskDifficulty
11
  from server.tasks.base import BaseTask
 
5
  and missing 'on' trigger.
6
  """
7
 
8
+
9
 
10
  from server.models import TaskDifficulty
11
  from server.tasks.base import BaseTask
server/tasks/task_4_workflow_secrets_permissions.py CHANGED
@@ -9,7 +9,7 @@ in GitHub Actions workflows:
9
  - Missing write permission for packages
10
  """
11
 
12
- from __future__ import annotations
13
 
14
  from server.models import TaskDifficulty
15
  from server.tasks.base import BaseTask
 
9
  - Missing write permission for packages
10
  """
11
 
12
+
13
 
14
  from server.models import TaskDifficulty
15
  from server.tasks.base import BaseTask
server/tasks/task_5_ci_docker_integration.py CHANGED
@@ -8,7 +8,7 @@ Agent debugs combined workflow + Docker build integration failures:
8
  - Missing Docker login before push
9
  """
10
 
11
- from __future__ import annotations
12
 
13
  from server.models import TaskDifficulty
14
  from server.tasks.base import BaseTask
 
8
  - Missing Docker login before push
9
  """
10
 
11
+
12
 
13
  from server.models import TaskDifficulty
14
  from server.tasks.base import BaseTask
server/tasks/task_6_multi_stage_matrix.py CHANGED
@@ -8,7 +8,7 @@ Agent debugs complex multi-stage Docker builds and matrix CI/CD pipelines:
8
  - Matrix strategy with version-specific failures
9
  """
10
 
11
- from __future__ import annotations
12
 
13
  from server.models import TaskDifficulty
14
  from server.tasks.base import BaseTask
 
8
  - Matrix strategy with version-specific failures
9
  """
10
 
11
+
12
 
13
  from server.models import TaskDifficulty
14
  from server.tasks.base import BaseTask
server/tasks/task_registry.py CHANGED
@@ -1,6 +1,5 @@
1
  """Task registry for the environment."""
2
 
3
- from __future__ import annotations
4
 
5
  from typing import Dict, Type
6
 
 
1
  """Task registry for the environment."""
2
 
 
3
 
4
  from typing import Dict, Type
5
 
server/utils/yaml_parser.py CHANGED
@@ -1,6 +1,4 @@
1
- """Safe YAML parsing utilities for workflow validation."""
2
-
3
- from __future__ import annotations
4
 
5
  from typing import Any, Optional, Tuple
6
 
 
1
+ """YAML parsing helpers."""
 
 
2
 
3
  from typing import Any, Optional, Tuple
4
 
smoke_test.py CHANGED
@@ -9,7 +9,6 @@ Modes:
9
  - live: uses requests against a running server.
10
  """
11
 
12
- from __future__ import annotations
13
 
14
  import argparse
15
  import json
@@ -42,7 +41,10 @@ class InProcessClient(EndpointClient):
42
 
43
  def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
44
  response = self._client.get(path)
45
- data = response.json() if response.content else {}
 
 
 
46
  return response.status_code, data
47
 
48
  def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
@@ -60,7 +62,10 @@ class LiveClient(EndpointClient):
60
 
61
  def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
62
  response = self._requests.get(f"{self._base_url}{path}", timeout=20)
63
- data = response.json() if response.content else {}
 
 
 
64
  return response.status_code, data
65
 
66
  def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
@@ -76,8 +81,12 @@ def assert_true(name: str, cond: bool, details: str = "") -> TestResult:
76
  def run_smoke(client: EndpointClient) -> int:
77
  results = []
78
 
79
- status, data = client.get("/")
80
- results.append(assert_true("GET / health", status == 200 and data.get("status") == "healthy", str(data)))
 
 
 
 
81
 
82
  status, info = client.get("/info")
83
  results.append(assert_true("GET /info", status == 200 and isinstance(info.get("tasks"), list), str(info)))
 
9
  - live: uses requests against a running server.
10
  """
11
 
 
12
 
13
  import argparse
14
  import json
 
41
 
42
  def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
43
  response = self._client.get(path)
44
+ try:
45
+ data = response.json()
46
+ except Exception:
47
+ data = {}
48
  return response.status_code, data
49
 
50
  def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
 
62
 
63
  def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
64
  response = self._requests.get(f"{self._base_url}{path}", timeout=20)
65
+ try:
66
+ data = response.json()
67
+ except Exception:
68
+ data = {}
69
  return response.status_code, data
70
 
71
  def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
 
81
  def run_smoke(client: EndpointClient) -> int:
82
  results = []
83
 
84
+ # root serves the landing page now (HTML), just check it's 200
85
+ status, _ = client.get("/")
86
+ results.append(assert_true("GET / landing page", status == 200))
87
+
88
+ status, health = client.get("/health")
89
+ results.append(assert_true("GET /health", status == 200 and health.get("status") == "healthy", str(health)))
90
 
91
  status, info = client.get("/info")
92
  results.append(assert_true("GET /info", status == 200 and isinstance(info.get("tasks"), list), str(info)))
tests/test_determinism.py CHANGED
@@ -1,10 +1,4 @@
1
- """Determinism and score range tests for grader and environment.
2
-
3
- Day 7 deliverables:
4
- - Same trajectory → same score (determinism)
5
- - Score ranges match CONTEXT.md expectations
6
- - Difficulty progression verified
7
- """
8
 
9
  from server.environment import CICDDebugEnvironment
10
  from server.graders import run_grader
@@ -12,7 +6,7 @@ from server.models import Action, ActionType, FileEdit
12
  from server.tasks.task_registry import TASK_REGISTRY
13
 
14
 
15
- # ── Determinism Tests ──────────────────────────────────────────────
16
 
17
 
18
  def test_reset_deterministic_with_seed():
@@ -89,7 +83,7 @@ def test_full_episode_determinism():
89
  assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}"
90
 
91
 
92
- # ── Score Range Tests ──────────────────────────────────────────────
93
 
94
 
95
  def test_empty_trajectory_scores_zero():
@@ -174,7 +168,7 @@ def test_score_always_in_0_1_range():
174
  assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of [0, 1] range"
175
 
176
 
177
- # ── Difficulty Progression Tests ───────────────────────────────────
178
 
179
 
180
  def test_difficulty_progression():
@@ -241,7 +235,7 @@ def test_all_scenarios_have_required_fields():
241
  assert len(scenario["expected_fixes"]) >= 1, f"{task_id}/{scenario['id']}: no expected_fixes"
242
 
243
 
244
- # ── End-to-End Score Verification ──────────────────────────────────
245
 
246
 
247
  def test_end_to_end_grading_all_tasks():
 
1
+ """Determinism and score-range tests for the grader and environment."""
 
 
 
 
 
 
2
 
3
  from server.environment import CICDDebugEnvironment
4
  from server.graders import run_grader
 
6
  from server.tasks.task_registry import TASK_REGISTRY
7
 
8
 
9
+ # -- determinism --
10
 
11
 
12
  def test_reset_deterministic_with_seed():
 
83
  assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}"
84
 
85
 
86
+ # -- score ranges --
87
 
88
 
89
  def test_empty_trajectory_scores_zero():
 
168
  assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of [0, 1] range"
169
 
170
 
171
+ # -- difficulty progression --
172
 
173
 
174
  def test_difficulty_progression():
 
235
  assert len(scenario["expected_fixes"]) >= 1, f"{task_id}/{scenario['id']}: no expected_fixes"
236
 
237
 
238
+ # -- e2e grading --
239
 
240
 
241
  def test_end_to_end_grading_all_tasks():