Commit ·
a7caaff
1
Parent(s): 804f70e
fixing old codes
Browse files- baseline_runner.py +2 -5
- inference.py +0 -5
- server/environment.py +1 -3
- server/graders/__init__.py +6 -14
- server/graders/base.py +0 -101
- server/main.py +1 -3
- server/models.py +10 -18
- server/simulators/docker_simulator.py +17 -20
- server/simulators/workflow_simulator.py +20 -22
- server/tasks/base.py +1 -3
- server/tasks/task_1_build_errors.py +1 -1
- server/tasks/task_2_docker_runtime.py +1 -1
- server/tasks/task_3_workflow_syntax.py +1 -1
- server/tasks/task_4_workflow_secrets_permissions.py +1 -1
- server/tasks/task_5_ci_docker_integration.py +1 -1
- server/tasks/task_6_multi_stage_matrix.py +1 -1
- server/tasks/task_registry.py +0 -1
- server/utils/yaml_parser.py +1 -3
- smoke_test.py +14 -5
- tests/test_determinism.py +5 -11
baseline_runner.py
CHANGED
|
@@ -1,11 +1,8 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
-
|
| 4 |
-
The heuristic agent applies expected_fixes directly to demonstrate
|
| 5 |
-
that the environment and grader work correctly end-to-end.
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
from __future__ import annotations
|
| 9 |
|
| 10 |
from typing import List, Optional
|
| 11 |
|
|
|
|
| 1 |
+
"""Heuristic baseline runner for the /baseline endpoint.
|
| 2 |
|
| 3 |
+
Applies expected_fixes directly to verify the environment + grader work e2e.
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
|
|
|
| 6 |
|
| 7 |
from typing import List, Optional
|
| 8 |
|
inference.py
CHANGED
|
@@ -10,7 +10,6 @@ Usage:
|
|
| 10 |
python inference.py
|
| 11 |
"""
|
| 12 |
|
| 13 |
-
from __future__ import annotations
|
| 14 |
|
| 15 |
import json
|
| 16 |
import os
|
|
@@ -22,7 +21,6 @@ from typing import Any, Dict, List, Optional
|
|
| 22 |
import requests
|
| 23 |
from openai import OpenAI
|
| 24 |
|
| 25 |
-
# ── Configuration ─────────────────────────────────────────────────
|
| 26 |
|
| 27 |
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 28 |
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-70B-Instruct")
|
|
@@ -183,7 +181,6 @@ def run_episode(client: OpenAI, task_id: Optional[str] = None, scenario_id: Opti
|
|
| 183 |
actual_task_id = info.get("task_id", task_id or "unknown")
|
| 184 |
actual_scenario_id = info.get("scenario_id", scenario_id or "unknown")
|
| 185 |
|
| 186 |
-
# ── [START] structured log ──
|
| 187 |
print(f"[START] task_id={actual_task_id} scenario_id={actual_scenario_id}")
|
| 188 |
|
| 189 |
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
|
|
@@ -221,7 +218,6 @@ def run_episode(client: OpenAI, task_id: Optional[str] = None, scenario_id: Opti
|
|
| 221 |
issues_fixed = step_info.get("issues_fixed", 0)
|
| 222 |
issues_total = step_info.get("issues_total", 0)
|
| 223 |
|
| 224 |
-
# ── [STEP] structured log ──
|
| 225 |
print(f"[STEP] step={total_steps} action={action['action_type']} reward={reward:.2f} done={str(done).lower()} issues_fixed={issues_fixed} issues_total={issues_total}")
|
| 226 |
|
| 227 |
trajectory.append({
|
|
@@ -243,7 +239,6 @@ def run_episode(client: OpenAI, task_id: Optional[str] = None, scenario_id: Opti
|
|
| 243 |
result = grade_resp.get("result", {})
|
| 244 |
score = result.get("score", 0.0)
|
| 245 |
|
| 246 |
-
# ── [END] structured log ──
|
| 247 |
print(f"[END] task_id={actual_task_id} scenario_id={actual_scenario_id} score={score:.3f} steps={total_steps}")
|
| 248 |
return result
|
| 249 |
|
|
|
|
| 10 |
python inference.py
|
| 11 |
"""
|
| 12 |
|
|
|
|
| 13 |
|
| 14 |
import json
|
| 15 |
import os
|
|
|
|
| 21 |
import requests
|
| 22 |
from openai import OpenAI
|
| 23 |
|
|
|
|
| 24 |
|
| 25 |
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 26 |
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-70B-Instruct")
|
|
|
|
| 181 |
actual_task_id = info.get("task_id", task_id or "unknown")
|
| 182 |
actual_scenario_id = info.get("scenario_id", scenario_id or "unknown")
|
| 183 |
|
|
|
|
| 184 |
print(f"[START] task_id={actual_task_id} scenario_id={actual_scenario_id}")
|
| 185 |
|
| 186 |
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
|
|
|
|
| 218 |
issues_fixed = step_info.get("issues_fixed", 0)
|
| 219 |
issues_total = step_info.get("issues_total", 0)
|
| 220 |
|
|
|
|
| 221 |
print(f"[STEP] step={total_steps} action={action['action_type']} reward={reward:.2f} done={str(done).lower()} issues_fixed={issues_fixed} issues_total={issues_total}")
|
| 222 |
|
| 223 |
trajectory.append({
|
|
|
|
| 239 |
result = grade_resp.get("result", {})
|
| 240 |
score = result.get("score", 0.0)
|
| 241 |
|
|
|
|
| 242 |
print(f"[END] task_id={actual_task_id} scenario_id={actual_scenario_id} score={score:.3f} steps={total_steps}")
|
| 243 |
return result
|
| 244 |
|
server/environment.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
"""Core environment
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
|
| 5 |
import copy
|
| 6 |
import random
|
|
|
|
| 1 |
+
"""Core environment logic."""
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import copy
|
| 4 |
import random
|
server/graders/__init__.py
CHANGED
|
@@ -1,21 +1,13 @@
|
|
| 1 |
"""Deterministic grader for trajectory scoring.
|
| 2 |
|
| 3 |
-
Scoring
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
Score examples (2-bug scenario):
|
| 11 |
-
Fix 1/2 → ~0.40
|
| 12 |
-
Fix 2/2 (slow) → ~0.85
|
| 13 |
-
Fix 2/2 (fast) → ~1.0
|
| 14 |
-
2 hints used → -0.10
|
| 15 |
"""
|
| 16 |
|
| 17 |
-
from __future__ import annotations
|
| 18 |
-
|
| 19 |
from typing import Any, Dict, List
|
| 20 |
|
| 21 |
from server.models import GraderResult
|
|
|
|
| 1 |
"""Deterministic grader for trajectory scoring.
|
| 2 |
|
| 3 |
+
Scoring weights:
|
| 4 |
+
partial fixes 40% (proportional to fix ratio)
|
| 5 |
+
complete bonus 30% (all issues fixed)
|
| 6 |
+
efficiency 30% (decays with extra steps)
|
| 7 |
+
hint penalty -5% each
|
| 8 |
+
failed edit -2% each
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
|
|
|
|
|
|
|
| 11 |
from typing import Any, Dict, List
|
| 12 |
|
| 13 |
from server.models import GraderResult
|
server/graders/base.py
DELETED
|
@@ -1,101 +0,0 @@
|
|
| 1 |
-
"""Base grader interface with shared scoring utilities.
|
| 2 |
-
|
| 3 |
-
The concrete default grader lives in ``server.graders.__init__``.
|
| 4 |
-
This module provides a class-based interface for task-specific overrides.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
from __future__ import annotations
|
| 8 |
-
|
| 9 |
-
from typing import Any, Dict, List
|
| 10 |
-
|
| 11 |
-
from server.models import GraderResult
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class BaseGrader:
|
| 15 |
-
"""Base class for task graders.
|
| 16 |
-
|
| 17 |
-
Subclass and override ``grade()`` for task-specific scoring.
|
| 18 |
-
The default pipeline in ``server.graders.__init__.run_grader``
|
| 19 |
-
works for all tasks without subclassing.
|
| 20 |
-
"""
|
| 21 |
-
|
| 22 |
-
PARTIAL_FIX_WEIGHT: float = 0.40
|
| 23 |
-
COMPLETE_BONUS: float = 0.30
|
| 24 |
-
EFFICIENCY_MAX: float = 0.30
|
| 25 |
-
EFFICIENCY_DECAY: float = 0.03
|
| 26 |
-
HINT_PENALTY_EACH: float = 0.05
|
| 27 |
-
FAILED_ACTION_PENALTY: float = 0.02
|
| 28 |
-
|
| 29 |
-
EDIT_ACTION_TYPES = frozenset({
|
| 30 |
-
"edit_file", "replace_line", "add_line",
|
| 31 |
-
"delete_line", "add_block", "delete_block",
|
| 32 |
-
})
|
| 33 |
-
|
| 34 |
-
def grade(self, task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
| 35 |
-
return self.compute_score(task_id, trajectory)
|
| 36 |
-
|
| 37 |
-
def compute_score(self, task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
| 38 |
-
if not trajectory:
|
| 39 |
-
return GraderResult(
|
| 40 |
-
task_id=task_id,
|
| 41 |
-
score=0.0,
|
| 42 |
-
breakdown={"partial_fixes": 0.0, "complete_solution": 0.0, "efficiency": 0.0, "hint_penalty": 0.0},
|
| 43 |
-
feedback="No actions taken",
|
| 44 |
-
steps_taken=0,
|
| 45 |
-
hints_used=0,
|
| 46 |
-
)
|
| 47 |
-
|
| 48 |
-
final_step = trajectory[-1]
|
| 49 |
-
steps_taken = len(trajectory)
|
| 50 |
-
hints_used = self._count_hints(trajectory)
|
| 51 |
-
|
| 52 |
-
issues_fixed = int(final_step.get("info", {}).get("issues_fixed", 0))
|
| 53 |
-
issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
|
| 54 |
-
fix_ratio = issues_fixed / issues_total
|
| 55 |
-
|
| 56 |
-
partial_score = self.PARTIAL_FIX_WEIGHT * fix_ratio
|
| 57 |
-
complete_bonus = self.COMPLETE_BONUS if issues_fixed == issues_total else 0.0
|
| 58 |
-
efficiency = self._efficiency_score(steps_taken, issues_total, issues_fixed)
|
| 59 |
-
hint_pen = self.HINT_PENALTY_EACH * hints_used
|
| 60 |
-
|
| 61 |
-
score = max(0.0, min(1.0, partial_score + complete_bonus + efficiency - hint_pen))
|
| 62 |
-
|
| 63 |
-
return GraderResult(
|
| 64 |
-
task_id=task_id,
|
| 65 |
-
score=round(score, 3),
|
| 66 |
-
breakdown={
|
| 67 |
-
"partial_fixes": round(partial_score, 3),
|
| 68 |
-
"complete_solution": round(complete_bonus, 3),
|
| 69 |
-
"efficiency": round(efficiency, 3),
|
| 70 |
-
"hint_penalty": round(-hint_pen, 3),
|
| 71 |
-
},
|
| 72 |
-
feedback=self._feedback_message(score),
|
| 73 |
-
steps_taken=steps_taken,
|
| 74 |
-
hints_used=hints_used,
|
| 75 |
-
)
|
| 76 |
-
|
| 77 |
-
@staticmethod
|
| 78 |
-
def _count_hints(trajectory: List[Dict[str, Any]]) -> int:
|
| 79 |
-
return sum(
|
| 80 |
-
1 for step in trajectory
|
| 81 |
-
if step.get("action", {}).get("action_type") == "request_hint"
|
| 82 |
-
)
|
| 83 |
-
|
| 84 |
-
def _efficiency_score(self, steps_taken: int, issues_total: int, issues_fixed: int = 1) -> float:
|
| 85 |
-
if issues_fixed == 0:
|
| 86 |
-
return 0.0
|
| 87 |
-
if steps_taken <= issues_total:
|
| 88 |
-
return self.EFFICIENCY_MAX
|
| 89 |
-
return max(0.0, self.EFFICIENCY_MAX - self.EFFICIENCY_DECAY * (steps_taken - issues_total))
|
| 90 |
-
|
| 91 |
-
@staticmethod
|
| 92 |
-
def _feedback_message(score: float) -> str:
|
| 93 |
-
if score >= 0.9:
|
| 94 |
-
return "Excellent! All issues fixed efficiently."
|
| 95 |
-
if score >= 0.7:
|
| 96 |
-
return "Good job! Most issues fixed."
|
| 97 |
-
if score >= 0.5:
|
| 98 |
-
return "Partial success. Some issues remain."
|
| 99 |
-
if score >= 0.3:
|
| 100 |
-
return "Limited progress. Review the error messages carefully."
|
| 101 |
-
return "Needs improvement. Try analyzing the error phase first."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/main.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
"""FastAPI
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import Optional
|
|
|
|
| 1 |
+
"""FastAPI server for the CI/CD Debug Environment."""
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Optional
|
server/models.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
|
| 5 |
from enum import Enum
|
| 6 |
from typing import Any, Dict, List, Optional, Union
|
|
@@ -43,18 +41,18 @@ class ErrorPhase(str, Enum):
|
|
| 43 |
|
| 44 |
|
| 45 |
class FileContent(BaseModel):
|
| 46 |
-
path: str
|
| 47 |
-
content: str
|
| 48 |
-
file_type: FileType
|
| 49 |
-
line_count: int
|
| 50 |
|
| 51 |
|
| 52 |
class ErrorInfo(BaseModel):
|
| 53 |
-
phase: ErrorPhase
|
| 54 |
-
error_message: str
|
| 55 |
-
exit_code: Optional[int] =
|
| 56 |
-
failed_step: Optional[str] =
|
| 57 |
-
line_hint: Optional[int] =
|
| 58 |
|
| 59 |
|
| 60 |
class Observation(BaseModel):
|
|
@@ -89,12 +87,6 @@ class Action(BaseModel):
|
|
| 89 |
reasoning: Optional[str] = None
|
| 90 |
|
| 91 |
|
| 92 |
-
class StepResult(BaseModel):
|
| 93 |
-
observation: Observation
|
| 94 |
-
reward: float = Field(..., ge=-1.0, le=2.0)
|
| 95 |
-
done: bool
|
| 96 |
-
info: Dict[str, Any] = Field(default_factory=dict)
|
| 97 |
-
|
| 98 |
|
| 99 |
class TaskInfo(BaseModel):
|
| 100 |
id: str
|
|
|
|
| 1 |
+
"""Pydantic models for the API."""
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from enum import Enum
|
| 4 |
from typing import Any, Dict, List, Optional, Union
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
class FileContent(BaseModel):
|
| 44 |
+
path: str
|
| 45 |
+
content: str
|
| 46 |
+
file_type: FileType
|
| 47 |
+
line_count: int
|
| 48 |
|
| 49 |
|
| 50 |
class ErrorInfo(BaseModel):
|
| 51 |
+
phase: ErrorPhase
|
| 52 |
+
error_message: str
|
| 53 |
+
exit_code: Optional[int] = None
|
| 54 |
+
failed_step: Optional[str] = None
|
| 55 |
+
line_hint: Optional[int] = None
|
| 56 |
|
| 57 |
|
| 58 |
class Observation(BaseModel):
|
|
|
|
| 87 |
reasoning: Optional[str] = None
|
| 88 |
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
class TaskInfo(BaseModel):
|
| 92 |
id: str
|
server/simulators/docker_simulator.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
"""Docker build/run simulator
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
|
| 5 |
from typing import Dict, List, Optional, Set
|
| 6 |
|
|
@@ -70,7 +68,7 @@ class DockerSimulator:
|
|
| 70 |
if not active_lines:
|
| 71 |
return {"build_success": False, "run_success": False, "error": "Dockerfile is empty"}
|
| 72 |
|
| 73 |
-
#
|
| 74 |
first_non_arg = None
|
| 75 |
for line in active_lines:
|
| 76 |
token = line.split()[0].upper()
|
|
@@ -86,7 +84,7 @@ class DockerSimulator:
|
|
| 86 |
"error": "Dockerfile must start with FROM",
|
| 87 |
}
|
| 88 |
|
| 89 |
-
#
|
| 90 |
for idx, raw in enumerate(active_lines, start=1):
|
| 91 |
token = raw.split()[0].upper()
|
| 92 |
# Handle --platform= prefix on FROM
|
|
@@ -110,7 +108,7 @@ class DockerSimulator:
|
|
| 110 |
"line": idx,
|
| 111 |
}
|
| 112 |
|
| 113 |
-
# -
|
| 114 |
if "FROM python:3.9-slimm" in content:
|
| 115 |
return {
|
| 116 |
"build_success": False,
|
|
@@ -118,7 +116,7 @@ class DockerSimulator:
|
|
| 118 |
"error": "pull access denied for python:3.9-slimm",
|
| 119 |
}
|
| 120 |
|
| 121 |
-
#
|
| 122 |
if "requirments.txt" in content:
|
| 123 |
return {
|
| 124 |
"build_success": False,
|
|
@@ -126,7 +124,7 @@ class DockerSimulator:
|
|
| 126 |
"error": "COPY failed: file not found in build context: requirments.txt",
|
| 127 |
}
|
| 128 |
|
| 129 |
-
#
|
| 130 |
for raw in active_lines:
|
| 131 |
upper = raw.upper()
|
| 132 |
if upper.startswith("COPY "):
|
|
@@ -149,7 +147,7 @@ class DockerSimulator:
|
|
| 149 |
"error": f"COPY failed: file not found in build context: {src}",
|
| 150 |
}
|
| 151 |
|
| 152 |
-
#
|
| 153 |
if "--platform=$BUILDPLATFORM" in content and "ARG BUILDPLATFORM" not in content:
|
| 154 |
return {
|
| 155 |
"build_success": False,
|
|
@@ -163,7 +161,7 @@ class DockerSimulator:
|
|
| 163 |
"error": "failed to parse platform: TARGETPLATFORM not declared",
|
| 164 |
}
|
| 165 |
|
| 166 |
-
# -
|
| 167 |
if "COPY --from=builder /app/dist" in content:
|
| 168 |
pkg = context_files.get("package.json")
|
| 169 |
if pkg and "react-scripts build" in pkg.content:
|
|
@@ -173,7 +171,7 @@ class DockerSimulator:
|
|
| 173 |
"error": "COPY failed: stat app/dist: file does not exist",
|
| 174 |
}
|
| 175 |
|
| 176 |
-
#
|
| 177 |
for raw in active_lines:
|
| 178 |
upper = raw.upper()
|
| 179 |
if upper.startswith("EXPOSE "):
|
|
@@ -188,11 +186,11 @@ class DockerSimulator:
|
|
| 188 |
"error": f"EXPOSE requires numeric port or port/protocol, got: {cleaned}",
|
| 189 |
}
|
| 190 |
|
| 191 |
-
# ============================
|
| 192 |
-
#
|
| 193 |
-
# ============================
|
| 194 |
|
| 195 |
-
#
|
| 196 |
has_workdir = "WORKDIR" in content
|
| 197 |
if ("npm start" in content or 'CMD ["npm", "start"]' in content) and not has_workdir:
|
| 198 |
return {
|
|
@@ -201,7 +199,7 @@ class DockerSimulator:
|
|
| 201 |
"run_error": "Error: Cannot find module '/package.json'",
|
| 202 |
}
|
| 203 |
|
| 204 |
-
#
|
| 205 |
if 'ENTRYPOINT ["python"' in content and 'CMD ["python"' in content:
|
| 206 |
return {
|
| 207 |
"build_success": True,
|
|
@@ -209,7 +207,7 @@ class DockerSimulator:
|
|
| 209 |
"run_error": "container exits immediately; ENTRYPOINT and CMD both specify full command",
|
| 210 |
}
|
| 211 |
|
| 212 |
-
#
|
| 213 |
if 'ENTRYPOINT ["./start.sh"]' in content and "chmod +x" not in content:
|
| 214 |
return {
|
| 215 |
"build_success": True,
|
|
@@ -217,8 +215,7 @@ class DockerSimulator:
|
|
| 217 |
"run_error": "exec ./start.sh: permission denied",
|
| 218 |
}
|
| 219 |
|
| 220 |
-
#
|
| 221 |
-
# Check if the scenario error mentions DATABASE_URL (via context files or content)
|
| 222 |
has_database_url_env = "ENV DATABASE_URL" in content
|
| 223 |
needs_database_url = (
|
| 224 |
"app.py" in content
|
|
@@ -232,7 +229,7 @@ class DockerSimulator:
|
|
| 232 |
"run_error": "KeyError: 'DATABASE_URL' — Application requires DATABASE_URL environment variable",
|
| 233 |
}
|
| 234 |
|
| 235 |
-
# -
|
| 236 |
has_user_switch = False
|
| 237 |
expose_port = None
|
| 238 |
for raw in active_lines:
|
|
|
|
| 1 |
+
"""Docker build/run simulator — deterministic, rule-based."""
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from typing import Dict, List, Optional, Set
|
| 4 |
|
|
|
|
| 68 |
if not active_lines:
|
| 69 |
return {"build_success": False, "run_success": False, "error": "Dockerfile is empty"}
|
| 70 |
|
| 71 |
+
# ARG before FROM is fine, but the first real instruction must be FROM
|
| 72 |
first_non_arg = None
|
| 73 |
for line in active_lines:
|
| 74 |
token = line.split()[0].upper()
|
|
|
|
| 84 |
"error": "Dockerfile must start with FROM",
|
| 85 |
}
|
| 86 |
|
| 87 |
+
# validate instructions
|
| 88 |
for idx, raw in enumerate(active_lines, start=1):
|
| 89 |
token = raw.split()[0].upper()
|
| 90 |
# Handle --platform= prefix on FROM
|
|
|
|
| 108 |
"line": idx,
|
| 109 |
}
|
| 110 |
|
| 111 |
+
# known-bad base image tags
|
| 112 |
if "FROM python:3.9-slimm" in content:
|
| 113 |
return {
|
| 114 |
"build_success": False,
|
|
|
|
| 116 |
"error": "pull access denied for python:3.9-slimm",
|
| 117 |
}
|
| 118 |
|
| 119 |
+
# typo in requirements filename
|
| 120 |
if "requirments.txt" in content:
|
| 121 |
return {
|
| 122 |
"build_success": False,
|
|
|
|
| 124 |
"error": "COPY failed: file not found in build context: requirments.txt",
|
| 125 |
}
|
| 126 |
|
| 127 |
+
# COPY source must exist in build context
|
| 128 |
for raw in active_lines:
|
| 129 |
upper = raw.upper()
|
| 130 |
if upper.startswith("COPY "):
|
|
|
|
| 147 |
"error": f"COPY failed: file not found in build context: {src}",
|
| 148 |
}
|
| 149 |
|
| 150 |
+
# platform ARGs need to be declared
|
| 151 |
if "--platform=$BUILDPLATFORM" in content and "ARG BUILDPLATFORM" not in content:
|
| 152 |
return {
|
| 153 |
"build_success": False,
|
|
|
|
| 161 |
"error": "failed to parse platform: TARGETPLATFORM not declared",
|
| 162 |
}
|
| 163 |
|
| 164 |
+
# multi-stage: output dir mismatch (dist vs build)
|
| 165 |
if "COPY --from=builder /app/dist" in content:
|
| 166 |
pkg = context_files.get("package.json")
|
| 167 |
if pkg and "react-scripts build" in pkg.content:
|
|
|
|
| 171 |
"error": "COPY failed: stat app/dist: file does not exist",
|
| 172 |
}
|
| 173 |
|
| 174 |
+
# EXPOSE must have a numeric port
|
| 175 |
for raw in active_lines:
|
| 176 |
upper = raw.upper()
|
| 177 |
if upper.startswith("EXPOSE "):
|
|
|
|
| 186 |
"error": f"EXPOSE requires numeric port or port/protocol, got: {cleaned}",
|
| 187 |
}
|
| 188 |
|
| 189 |
+
# ============================
|
| 190 |
+
# runtime checks (build OK, run might fail)
|
| 191 |
+
# ============================
|
| 192 |
|
| 193 |
+
# no WORKDIR → module resolution fails
|
| 194 |
has_workdir = "WORKDIR" in content
|
| 195 |
if ("npm start" in content or 'CMD ["npm", "start"]' in content) and not has_workdir:
|
| 196 |
return {
|
|
|
|
| 199 |
"run_error": "Error: Cannot find module '/package.json'",
|
| 200 |
}
|
| 201 |
|
| 202 |
+
# ENTRYPOINT + CMD both specify python → conflict
|
| 203 |
if 'ENTRYPOINT ["python"' in content and 'CMD ["python"' in content:
|
| 204 |
return {
|
| 205 |
"build_success": True,
|
|
|
|
| 207 |
"run_error": "container exits immediately; ENTRYPOINT and CMD both specify full command",
|
| 208 |
}
|
| 209 |
|
| 210 |
+
# entrypoint script needs chmod +x
|
| 211 |
if 'ENTRYPOINT ["./start.sh"]' in content and "chmod +x" not in content:
|
| 212 |
return {
|
| 213 |
"build_success": True,
|
|
|
|
| 215 |
"run_error": "exec ./start.sh: permission denied",
|
| 216 |
}
|
| 217 |
|
| 218 |
+
# DATABASE_URL env var missing
|
|
|
|
| 219 |
has_database_url_env = "ENV DATABASE_URL" in content
|
| 220 |
needs_database_url = (
|
| 221 |
"app.py" in content
|
|
|
|
| 229 |
"run_error": "KeyError: 'DATABASE_URL' — Application requires DATABASE_URL environment variable",
|
| 230 |
}
|
| 231 |
|
| 232 |
+
# non-root user can't bind privileged ports
|
| 233 |
has_user_switch = False
|
| 234 |
expose_port = None
|
| 235 |
for raw in active_lines:
|
server/simulators/workflow_simulator.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
"""Workflow simulator
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
|
| 5 |
import re
|
| 6 |
from typing import Any, Dict, List, Optional
|
|
@@ -17,7 +15,7 @@ class WorkflowSimulator:
|
|
| 17 |
|
| 18 |
content = workflow.content
|
| 19 |
|
| 20 |
-
# -
|
| 21 |
# Match ${ ... } that is NOT ${{ ... }}
|
| 22 |
single_brace = re.findall(r'\$\{(?!\{)\s*[^}]+\}', content)
|
| 23 |
if single_brace:
|
|
@@ -30,7 +28,7 @@ class WorkflowSimulator:
|
|
| 30 |
),
|
| 31 |
}
|
| 32 |
|
| 33 |
-
#
|
| 34 |
try:
|
| 35 |
parsed = yaml.safe_load(content)
|
| 36 |
except yaml.YAMLError as exc:
|
|
@@ -47,7 +45,7 @@ class WorkflowSimulator:
|
|
| 47 |
"error": "Workflow root must be a mapping",
|
| 48 |
}
|
| 49 |
|
| 50 |
-
#
|
| 51 |
if "on" not in parsed and True not in parsed:
|
| 52 |
# yaml.safe_load converts `on:` to True key in some contexts
|
| 53 |
return {
|
|
@@ -56,7 +54,7 @@ class WorkflowSimulator:
|
|
| 56 |
"error": "Workflow must define an 'on' trigger event",
|
| 57 |
}
|
| 58 |
|
| 59 |
-
#
|
| 60 |
on_value = parsed.get("on") or parsed.get(True)
|
| 61 |
if isinstance(on_value, dict):
|
| 62 |
for event_key, event_config in on_value.items():
|
|
@@ -73,7 +71,7 @@ class WorkflowSimulator:
|
|
| 73 |
),
|
| 74 |
}
|
| 75 |
|
| 76 |
-
#
|
| 77 |
jobs = parsed.get("jobs")
|
| 78 |
if not isinstance(jobs, dict) or not jobs:
|
| 79 |
return {
|
|
@@ -98,7 +96,7 @@ class WorkflowSimulator:
|
|
| 98 |
if not isinstance(job, dict):
|
| 99 |
continue
|
| 100 |
|
| 101 |
-
#
|
| 102 |
if "runs-on" not in job:
|
| 103 |
return {
|
| 104 |
"parse_success": False,
|
|
@@ -106,7 +104,7 @@ class WorkflowSimulator:
|
|
| 106 |
"error": f"Job '{job_name}' is missing required field 'runs-on'",
|
| 107 |
}
|
| 108 |
|
| 109 |
-
#
|
| 110 |
needs = job.get("needs")
|
| 111 |
if needs:
|
| 112 |
needed = [needs] if isinstance(needs, str) else (needs if isinstance(needs, list) else [])
|
|
@@ -126,7 +124,7 @@ class WorkflowSimulator:
|
|
| 126 |
"error": f"Job '{job_name}' steps must be a list",
|
| 127 |
}
|
| 128 |
|
| 129 |
-
#
|
| 130 |
for step in steps:
|
| 131 |
if not isinstance(step, dict):
|
| 132 |
continue
|
|
@@ -140,7 +138,7 @@ class WorkflowSimulator:
|
|
| 140 |
"error": f"Every step must define a 'uses' or 'run' key. Step '{step_name}' has neither.",
|
| 141 |
}
|
| 142 |
|
| 143 |
-
#
|
| 144 |
checkout_index = -1
|
| 145 |
build_index = -1
|
| 146 |
for idx, step in enumerate(steps):
|
|
@@ -162,7 +160,7 @@ class WorkflowSimulator:
|
|
| 162 |
"exec_error": "Checkout must happen before Docker build steps",
|
| 163 |
}
|
| 164 |
|
| 165 |
-
# -
|
| 166 |
# If a job uses download-artifact but doesn't declare needs on the upload job
|
| 167 |
for job_name, job in jobs.items():
|
| 168 |
if not isinstance(job, dict):
|
|
@@ -186,7 +184,7 @@ class WorkflowSimulator:
|
|
| 186 |
),
|
| 187 |
}
|
| 188 |
|
| 189 |
-
#
|
| 190 |
if has_docker_login:
|
| 191 |
# Check if the login step has env block with secrets
|
| 192 |
login_has_env_secrets = has_username_secret and has_password_secret
|
|
@@ -199,7 +197,7 @@ class WorkflowSimulator:
|
|
| 199 |
"exec_error": "Docker login secrets not wired — add env block with secrets.DOCKER_USERNAME and secrets.DOCKER_PASSWORD",
|
| 200 |
}
|
| 201 |
|
| 202 |
-
#
|
| 203 |
if has_docker_push and not has_docker_login:
|
| 204 |
# Check if using docker/login-action instead
|
| 205 |
has_login_action = "docker/login-action" in content
|
|
@@ -210,7 +208,7 @@ class WorkflowSimulator:
|
|
| 210 |
"exec_error": "Docker push without login — add a docker login step before pushing",
|
| 211 |
}
|
| 212 |
|
| 213 |
-
#
|
| 214 |
if "docker login ghcr.io" in content:
|
| 215 |
if has_password_secret and not has_github_token_secret:
|
| 216 |
return {
|
|
@@ -219,7 +217,7 @@ class WorkflowSimulator:
|
|
| 219 |
"exec_error": "GHCR requires GITHUB_TOKEN for authentication, not DOCKER_PASSWORD",
|
| 220 |
}
|
| 221 |
|
| 222 |
-
#
|
| 223 |
if "ghcr.io" in content and "docker push" in content:
|
| 224 |
# Check if permissions block has packages: write
|
| 225 |
if "packages: write" not in content and "packages:write" not in content:
|
|
@@ -229,7 +227,7 @@ class WorkflowSimulator:
|
|
| 229 |
"exec_error": "GITHUB_TOKEN does not have packages:write permission — add permissions block",
|
| 230 |
}
|
| 231 |
|
| 232 |
-
# -
|
| 233 |
if has_platforms and not has_buildx_setup:
|
| 234 |
return {
|
| 235 |
"parse_success": True,
|
|
@@ -237,7 +235,7 @@ class WorkflowSimulator:
|
|
| 237 |
"exec_error": "Multi-platform build requires docker/setup-buildx-action",
|
| 238 |
}
|
| 239 |
|
| 240 |
-
#
|
| 241 |
if "cache-to:" in content and "cache-from:" in content:
|
| 242 |
# Check for mode=max
|
| 243 |
if "cache-to: type=gha" in content and "mode=max" not in content:
|
|
@@ -247,7 +245,7 @@ class WorkflowSimulator:
|
|
| 247 |
"exec_error": "GHA cache export needs mode=max for proper cache support",
|
| 248 |
}
|
| 249 |
|
| 250 |
-
#
|
| 251 |
for job_name, job in jobs.items():
|
| 252 |
if not isinstance(job, dict):
|
| 253 |
continue
|
|
@@ -268,7 +266,7 @@ class WorkflowSimulator:
|
|
| 268 |
"exec_error": f"Dockerfile path '{file_path}' does not match build context '{context}'",
|
| 269 |
}
|
| 270 |
|
| 271 |
-
#
|
| 272 |
for job_name, job in jobs.items():
|
| 273 |
if not isinstance(job, dict):
|
| 274 |
continue
|
|
@@ -296,7 +294,7 @@ class WorkflowSimulator:
|
|
| 296 |
"exec_error": f"{var} is empty — secret not available in shell environment. Map it via env block.",
|
| 297 |
}
|
| 298 |
|
| 299 |
-
#
|
| 300 |
for job_name, job in jobs.items():
|
| 301 |
if not isinstance(job, dict):
|
| 302 |
continue
|
|
|
|
| 1 |
+
"""Workflow simulator — YAML parse + CI rule checks."""
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import re
|
| 4 |
from typing import Any, Dict, List, Optional
|
|
|
|
| 15 |
|
| 16 |
content = workflow.content
|
| 17 |
|
| 18 |
+
# single-brace expressions: ${ } should be ${{ }}
|
| 19 |
# Match ${ ... } that is NOT ${{ ... }}
|
| 20 |
single_brace = re.findall(r'\$\{(?!\{)\s*[^}]+\}', content)
|
| 21 |
if single_brace:
|
|
|
|
| 28 |
),
|
| 29 |
}
|
| 30 |
|
| 31 |
+
# parse yaml
|
| 32 |
try:
|
| 33 |
parsed = yaml.safe_load(content)
|
| 34 |
except yaml.YAMLError as exc:
|
|
|
|
| 45 |
"error": "Workflow root must be a mapping",
|
| 46 |
}
|
| 47 |
|
| 48 |
+
# needs an 'on' trigger
|
| 49 |
if "on" not in parsed and True not in parsed:
|
| 50 |
# yaml.safe_load converts `on:` to True key in some contexts
|
| 51 |
return {
|
|
|
|
| 54 |
"error": "Workflow must define an 'on' trigger event",
|
| 55 |
}
|
| 56 |
|
| 57 |
+
# validate trigger structure
|
| 58 |
on_value = parsed.get("on") or parsed.get(True)
|
| 59 |
if isinstance(on_value, dict):
|
| 60 |
for event_key, event_config in on_value.items():
|
|
|
|
| 71 |
),
|
| 72 |
}
|
| 73 |
|
| 74 |
+
# jobs block
|
| 75 |
jobs = parsed.get("jobs")
|
| 76 |
if not isinstance(jobs, dict) or not jobs:
|
| 77 |
return {
|
|
|
|
| 96 |
if not isinstance(job, dict):
|
| 97 |
continue
|
| 98 |
|
| 99 |
+
# runs-on is required
|
| 100 |
if "runs-on" not in job:
|
| 101 |
return {
|
| 102 |
"parse_success": False,
|
|
|
|
| 104 |
"error": f"Job '{job_name}' is missing required field 'runs-on'",
|
| 105 |
}
|
| 106 |
|
| 107 |
+
# check 'needs' refs point to real jobs
|
| 108 |
needs = job.get("needs")
|
| 109 |
if needs:
|
| 110 |
needed = [needs] if isinstance(needs, str) else (needs if isinstance(needs, list) else [])
|
|
|
|
| 124 |
"error": f"Job '{job_name}' steps must be a list",
|
| 125 |
}
|
| 126 |
|
| 127 |
+
# every step needs 'uses' or 'run'
|
| 128 |
for step in steps:
|
| 129 |
if not isinstance(step, dict):
|
| 130 |
continue
|
|
|
|
| 138 |
"error": f"Every step must define a 'uses' or 'run' key. Step '{step_name}' has neither.",
|
| 139 |
}
|
| 140 |
|
| 141 |
+
# checkout must come before docker build
|
| 142 |
checkout_index = -1
|
| 143 |
build_index = -1
|
| 144 |
for idx, step in enumerate(steps):
|
|
|
|
| 160 |
"exec_error": "Checkout must happen before Docker build steps",
|
| 161 |
}
|
| 162 |
|
| 163 |
+
# cross-job artifact dependency: download needs 'needs'
|
| 164 |
# If a job uses download-artifact but doesn't declare needs on the upload job
|
| 165 |
for job_name, job in jobs.items():
|
| 166 |
if not isinstance(job, dict):
|
|
|
|
| 184 |
),
|
| 185 |
}
|
| 186 |
|
| 187 |
+
# docker login needs secrets wired via env
|
| 188 |
if has_docker_login:
|
| 189 |
# Check if the login step has env block with secrets
|
| 190 |
login_has_env_secrets = has_username_secret and has_password_secret
|
|
|
|
| 197 |
"exec_error": "Docker login secrets not wired — add env block with secrets.DOCKER_USERNAME and secrets.DOCKER_PASSWORD",
|
| 198 |
}
|
| 199 |
|
| 200 |
+
# push without login
|
| 201 |
if has_docker_push and not has_docker_login:
|
| 202 |
# Check if using docker/login-action instead
|
| 203 |
has_login_action = "docker/login-action" in content
|
|
|
|
| 208 |
"exec_error": "Docker push without login — add a docker login step before pushing",
|
| 209 |
}
|
| 210 |
|
| 211 |
+
# ghcr.io needs GITHUB_TOKEN not DOCKER_PASSWORD
|
| 212 |
if "docker login ghcr.io" in content:
|
| 213 |
if has_password_secret and not has_github_token_secret:
|
| 214 |
return {
|
|
|
|
| 217 |
"exec_error": "GHCR requires GITHUB_TOKEN for authentication, not DOCKER_PASSWORD",
|
| 218 |
}
|
| 219 |
|
| 220 |
+
# ghcr push needs packages:write permission
|
| 221 |
if "ghcr.io" in content and "docker push" in content:
|
| 222 |
# Check if permissions block has packages: write
|
| 223 |
if "packages: write" not in content and "packages:write" not in content:
|
|
|
|
| 227 |
"exec_error": "GITHUB_TOKEN does not have packages:write permission — add permissions block",
|
| 228 |
}
|
| 229 |
|
| 230 |
+
# multi-platform needs buildx
|
| 231 |
if has_platforms and not has_buildx_setup:
|
| 232 |
return {
|
| 233 |
"parse_success": True,
|
|
|
|
| 235 |
"exec_error": "Multi-platform build requires docker/setup-buildx-action",
|
| 236 |
}
|
| 237 |
|
| 238 |
+
# GHA cache export needs mode=max
|
| 239 |
if "cache-to:" in content and "cache-from:" in content:
|
| 240 |
# Check for mode=max
|
| 241 |
if "cache-to: type=gha" in content and "mode=max" not in content:
|
|
|
|
| 245 |
"exec_error": "GHA cache export needs mode=max for proper cache support",
|
| 246 |
}
|
| 247 |
|
| 248 |
+
# context vs dockerfile path mismatch
|
| 249 |
for job_name, job in jobs.items():
|
| 250 |
if not isinstance(job, dict):
|
| 251 |
continue
|
|
|
|
| 266 |
"exec_error": f"Dockerfile path '{file_path}' does not match build context '{context}'",
|
| 267 |
}
|
| 268 |
|
| 269 |
+
# shell env var from secret but not mapped in env block
|
| 270 |
for job_name, job in jobs.items():
|
| 271 |
if not isinstance(job, dict):
|
| 272 |
continue
|
|
|
|
| 294 |
"exec_error": f"{var} is empty — secret not available in shell environment. Map it via env block.",
|
| 295 |
}
|
| 296 |
|
| 297 |
+
# node version vs package.json engines
|
| 298 |
for job_name, job in jobs.items():
|
| 299 |
if not isinstance(job, dict):
|
| 300 |
continue
|
server/tasks/base.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
"""Base task class
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
|
| 5 |
import random
|
| 6 |
from typing import Dict, Optional
|
|
|
|
| 1 |
+
"""Base task class."""
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import random
|
| 4 |
from typing import Dict, Optional
|
server/tasks/task_1_build_errors.py
CHANGED
|
@@ -5,7 +5,7 @@ typos in filenames, invalid base image tags, bad RUN syntax,
|
|
| 5 |
quoted EXPOSE values, missing FROM instruction.
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
|
| 9 |
|
| 10 |
from server.models import TaskDifficulty
|
| 11 |
from server.tasks.base import BaseTask
|
|
|
|
| 5 |
quoted EXPOSE values, missing FROM instruction.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
|
| 9 |
|
| 10 |
from server.models import TaskDifficulty
|
| 11 |
from server.tasks.base import BaseTask
|
server/tasks/task_2_docker_runtime.py
CHANGED
|
@@ -5,7 +5,7 @@ runtime: missing WORKDIR, CMD/ENTRYPOINT conflicts, permission issues,
|
|
| 5 |
and missing environment variables.
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
|
| 9 |
|
| 10 |
from server.models import TaskDifficulty
|
| 11 |
from server.tasks.base import BaseTask
|
|
|
|
| 5 |
and missing environment variables.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
|
| 9 |
|
| 10 |
from server.models import TaskDifficulty
|
| 11 |
from server.tasks.base import BaseTask
|
server/tasks/task_3_workflow_syntax.py
CHANGED
|
@@ -5,7 +5,7 @@ step ordering, missing runs-on, invalid triggers, duplicate job IDs,
|
|
| 5 |
and missing 'on' trigger.
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
|
| 9 |
|
| 10 |
from server.models import TaskDifficulty
|
| 11 |
from server.tasks.base import BaseTask
|
|
|
|
| 5 |
and missing 'on' trigger.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
|
| 9 |
|
| 10 |
from server.models import TaskDifficulty
|
| 11 |
from server.tasks.base import BaseTask
|
server/tasks/task_4_workflow_secrets_permissions.py
CHANGED
|
@@ -9,7 +9,7 @@ in GitHub Actions workflows:
|
|
| 9 |
- Missing write permission for packages
|
| 10 |
"""
|
| 11 |
|
| 12 |
-
|
| 13 |
|
| 14 |
from server.models import TaskDifficulty
|
| 15 |
from server.tasks.base import BaseTask
|
|
|
|
| 9 |
- Missing write permission for packages
|
| 10 |
"""
|
| 11 |
|
| 12 |
+
|
| 13 |
|
| 14 |
from server.models import TaskDifficulty
|
| 15 |
from server.tasks.base import BaseTask
|
server/tasks/task_5_ci_docker_integration.py
CHANGED
|
@@ -8,7 +8,7 @@ Agent debugs combined workflow + Docker build integration failures:
|
|
| 8 |
- Missing Docker login before push
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
|
| 12 |
|
| 13 |
from server.models import TaskDifficulty
|
| 14 |
from server.tasks.base import BaseTask
|
|
|
|
| 8 |
- Missing Docker login before push
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
|
| 12 |
|
| 13 |
from server.models import TaskDifficulty
|
| 14 |
from server.tasks.base import BaseTask
|
server/tasks/task_6_multi_stage_matrix.py
CHANGED
|
@@ -8,7 +8,7 @@ Agent debugs complex multi-stage Docker builds and matrix CI/CD pipelines:
|
|
| 8 |
- Matrix strategy with version-specific failures
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
|
| 12 |
|
| 13 |
from server.models import TaskDifficulty
|
| 14 |
from server.tasks.base import BaseTask
|
|
|
|
| 8 |
- Matrix strategy with version-specific failures
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
|
| 12 |
|
| 13 |
from server.models import TaskDifficulty
|
| 14 |
from server.tasks.base import BaseTask
|
server/tasks/task_registry.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
"""Task registry for the environment."""
|
| 2 |
|
| 3 |
-
from __future__ import annotations
|
| 4 |
|
| 5 |
from typing import Dict, Type
|
| 6 |
|
|
|
|
| 1 |
"""Task registry for the environment."""
|
| 2 |
|
|
|
|
| 3 |
|
| 4 |
from typing import Dict, Type
|
| 5 |
|
server/utils/yaml_parser.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
|
| 5 |
from typing import Any, Optional, Tuple
|
| 6 |
|
|
|
|
| 1 |
+
"""YAML parsing helpers."""
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from typing import Any, Optional, Tuple
|
| 4 |
|
smoke_test.py
CHANGED
|
@@ -9,7 +9,6 @@ Modes:
|
|
| 9 |
- live: uses requests against a running server.
|
| 10 |
"""
|
| 11 |
|
| 12 |
-
from __future__ import annotations
|
| 13 |
|
| 14 |
import argparse
|
| 15 |
import json
|
|
@@ -42,7 +41,10 @@ class InProcessClient(EndpointClient):
|
|
| 42 |
|
| 43 |
def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
|
| 44 |
response = self._client.get(path)
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
| 46 |
return response.status_code, data
|
| 47 |
|
| 48 |
def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
|
|
@@ -60,7 +62,10 @@ class LiveClient(EndpointClient):
|
|
| 60 |
|
| 61 |
def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
|
| 62 |
response = self._requests.get(f"{self._base_url}{path}", timeout=20)
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
| 64 |
return response.status_code, data
|
| 65 |
|
| 66 |
def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
|
|
@@ -76,8 +81,12 @@ def assert_true(name: str, cond: bool, details: str = "") -> TestResult:
|
|
| 76 |
def run_smoke(client: EndpointClient) -> int:
|
| 77 |
results = []
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
status, info = client.get("/info")
|
| 83 |
results.append(assert_true("GET /info", status == 200 and isinstance(info.get("tasks"), list), str(info)))
|
|
|
|
| 9 |
- live: uses requests against a running server.
|
| 10 |
"""
|
| 11 |
|
|
|
|
| 12 |
|
| 13 |
import argparse
|
| 14 |
import json
|
|
|
|
| 41 |
|
| 42 |
def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
|
| 43 |
response = self._client.get(path)
|
| 44 |
+
try:
|
| 45 |
+
data = response.json()
|
| 46 |
+
except Exception:
|
| 47 |
+
data = {}
|
| 48 |
return response.status_code, data
|
| 49 |
|
| 50 |
def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
|
|
|
|
| 62 |
|
| 63 |
def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
|
| 64 |
response = self._requests.get(f"{self._base_url}{path}", timeout=20)
|
| 65 |
+
try:
|
| 66 |
+
data = response.json()
|
| 67 |
+
except Exception:
|
| 68 |
+
data = {}
|
| 69 |
return response.status_code, data
|
| 70 |
|
| 71 |
def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
|
|
|
|
| 81 |
def run_smoke(client: EndpointClient) -> int:
|
| 82 |
results = []
|
| 83 |
|
| 84 |
+
# root serves the landing page now (HTML), just check it's 200
|
| 85 |
+
status, _ = client.get("/")
|
| 86 |
+
results.append(assert_true("GET / landing page", status == 200))
|
| 87 |
+
|
| 88 |
+
status, health = client.get("/health")
|
| 89 |
+
results.append(assert_true("GET /health", status == 200 and health.get("status") == "healthy", str(health)))
|
| 90 |
|
| 91 |
status, info = client.get("/info")
|
| 92 |
results.append(assert_true("GET /info", status == 200 and isinstance(info.get("tasks"), list), str(info)))
|
tests/test_determinism.py
CHANGED
|
@@ -1,10 +1,4 @@
|
|
| 1 |
-
"""Determinism and score
|
| 2 |
-
|
| 3 |
-
Day 7 deliverables:
|
| 4 |
-
- Same trajectory → same score (determinism)
|
| 5 |
-
- Score ranges match CONTEXT.md expectations
|
| 6 |
-
- Difficulty progression verified
|
| 7 |
-
"""
|
| 8 |
|
| 9 |
from server.environment import CICDDebugEnvironment
|
| 10 |
from server.graders import run_grader
|
|
@@ -12,7 +6,7 @@ from server.models import Action, ActionType, FileEdit
|
|
| 12 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 13 |
|
| 14 |
|
| 15 |
-
#
|
| 16 |
|
| 17 |
|
| 18 |
def test_reset_deterministic_with_seed():
|
|
@@ -89,7 +83,7 @@ def test_full_episode_determinism():
|
|
| 89 |
assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}"
|
| 90 |
|
| 91 |
|
| 92 |
-
#
|
| 93 |
|
| 94 |
|
| 95 |
def test_empty_trajectory_scores_zero():
|
|
@@ -174,7 +168,7 @@ def test_score_always_in_0_1_range():
|
|
| 174 |
assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of [0, 1] range"
|
| 175 |
|
| 176 |
|
| 177 |
-
#
|
| 178 |
|
| 179 |
|
| 180 |
def test_difficulty_progression():
|
|
@@ -241,7 +235,7 @@ def test_all_scenarios_have_required_fields():
|
|
| 241 |
assert len(scenario["expected_fixes"]) >= 1, f"{task_id}/{scenario['id']}: no expected_fixes"
|
| 242 |
|
| 243 |
|
| 244 |
-
#
|
| 245 |
|
| 246 |
|
| 247 |
def test_end_to_end_grading_all_tasks():
|
|
|
|
| 1 |
+
"""Determinism and score-range tests for the grader and environment."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from server.environment import CICDDebugEnvironment
|
| 4 |
from server.graders import run_grader
|
|
|
|
| 6 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 7 |
|
| 8 |
|
| 9 |
+
# -- determinism --
|
| 10 |
|
| 11 |
|
| 12 |
def test_reset_deterministic_with_seed():
|
|
|
|
| 83 |
assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}"
|
| 84 |
|
| 85 |
|
| 86 |
+
# -- score ranges --
|
| 87 |
|
| 88 |
|
| 89 |
def test_empty_trajectory_scores_zero():
|
|
|
|
| 168 |
assert 0.0 <= r.score <= 1.0, f"Score {r.score} out of [0, 1] range"
|
| 169 |
|
| 170 |
|
| 171 |
+
# -- difficulty progression --
|
| 172 |
|
| 173 |
|
| 174 |
def test_difficulty_progression():
|
|
|
|
| 235 |
assert len(scenario["expected_fixes"]) >= 1, f"{task_id}/{scenario['id']}: no expected_fixes"
|
| 236 |
|
| 237 |
|
| 238 |
+
# -- e2e grading --
|
| 239 |
|
| 240 |
|
| 241 |
def test_end_to_end_grading_all_tasks():
|