Commit ·
4b07aaf
1
Parent(s): deb4824
grading logic + tasks implemented
Browse files- .gitignore +4 -0
- CONTEXT.md +347 -0
- IMPLEMENTATION_PLAN.md +57 -32
- openenv.yaml +10 -4
- server/environment.py +16 -6
- server/graders/__init__.py +65 -46
- server/graders/base.py +101 -1
- server/models.py +1 -1
- server/simulators/docker_simulator.py +116 -6
- server/simulators/workflow_simulator.py +242 -12
- server/tasks/base.py +11 -3
- server/tasks/task_1_build_errors.py +184 -15
- server/tasks/task_2_docker_runtime.py +195 -15
- server/tasks/task_2_workflow_config.py +0 -52
- server/tasks/task_3_multi_stage.py +0 -44
- server/tasks/task_3_workflow_syntax.py +190 -16
- server/tasks/task_4_workflow_secrets_permissions.py +254 -17
- server/tasks/task_5_ci_docker_integration.py +280 -16
- server/tasks/task_6_multi_stage_matrix.py +366 -16
- server/utils/yaml_parser.py +43 -0
- tests/test_determinism.py +228 -7
.gitignore
CHANGED
|
@@ -37,3 +37,7 @@ dist/
|
|
| 37 |
# OS files
|
| 38 |
.DS_Store
|
| 39 |
Thumbs.db
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
# OS files
|
| 38 |
.DS_Store
|
| 39 |
Thumbs.db
|
| 40 |
+
|
| 41 |
+
*.zip
|
| 42 |
+
|
| 43 |
+
# CONTEXT.md
|
CONTEXT.md
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🧠 PROJECT CONTEXT
|
| 2 |
+
## CI/CD Debug Environment for OpenEnv Hackathon
|
| 3 |
+
|
| 4 |
+
> **For Claude Code**: Read this file first to understand the project background, decisions made, and current status.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## π HACKATHON OVERVIEW
|
| 9 |
+
|
| 10 |
+
**Event**: OpenEnv Hackathon by Scaler School of Technology
|
| 11 |
+
**Partners**: Meta, HuggingFace, PyTorch
|
| 12 |
+
**Deadline**: April 8, 2026 (Round 1 online submission)
|
| 13 |
+
**Finale**: April 25-26, 2026 in Bangalore
|
| 14 |
+
**Prize Pool**: $30,000 + direct interview opportunities
|
| 15 |
+
|
| 16 |
+
**Goal**: Build a complete, real-world OpenEnv environment that an AI agent can learn from through the standard step()/reset()/state() API.
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
## 🎯 WHAT WE'RE BUILDING
|
| 21 |
+
|
| 22 |
+
**Environment Name**: `cicd-debug-env`
|
| 23 |
+
**Concept**: AI agents debug broken GitHub Actions workflows and Dockerfiles
|
| 24 |
+
|
| 25 |
+
The agent receives:
|
| 26 |
+
1. Error messages from failed builds/workflows
|
| 27 |
+
2. Configuration files (Dockerfile, workflow YAML)
|
| 28 |
+
3. Context about available secrets
|
| 29 |
+
|
| 30 |
+
The agent must:
|
| 31 |
+
1. Analyze the error
|
| 32 |
+
2. Identify the root cause
|
| 33 |
+
3. Fix the files
|
| 34 |
+
4. Submit the solution
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## π WHY THIS IDEA WINS
|
| 39 |
+
|
| 40 |
+
| Criteria | Weight | Our Score | Why |
|
| 41 |
+
|----------|--------|-----------|-----|
|
| 42 |
+
| Real-world utility | 30% | 30/30 | Every developer debugs Docker + CI/CD daily |
|
| 43 |
+
| Task & grader quality | 25% | 25/25 | 6 tasks, deterministic + dynamic graders |
|
| 44 |
+
| Environment design | 20% | 20/20 | Clean state, typed models, dense rewards |
|
| 45 |
+
| Code quality & spec | 15% | 15/15 | Full OpenEnv compliance |
|
| 46 |
+
| Creativity & novelty | 10% | 10/10 | First CI/CD debugging env on OpenEnv |
|
| 47 |
+
|
| 48 |
+
**Key Insight**: Judges are Meta/HuggingFace engineers who debug Docker and GitHub Actions EVERY DAY.
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## π THE 6 TASKS
|
| 53 |
+
|
| 54 |
+
| # | Task ID | Name | Difficulty | Category |
|
| 55 |
+
|---|---------|------|------------|----------|
|
| 56 |
+
| 1 | `dockerfile_syntax` | Dockerfile Syntax Errors | Easy | Docker |
|
| 57 |
+
| 2 | `dockerfile_runtime` | Dockerfile Runtime Errors | Medium | Docker |
|
| 58 |
+
| 3 | `workflow_syntax_structure` | Workflow Syntax and Structure | Easy | Workflow |
|
| 59 |
+
| 4 | `workflow_secrets_permissions` | Workflow Secrets and Permissions | Medium | Workflow |
|
| 60 |
+
| 5 | `ci_docker_integration` | CI and Docker Build Integration | Medium-Hard | Combined |
|
| 61 |
+
| 6 | `multi_stage_pipeline_matrix` | Multi-Stage Pipeline and Matrix | Hard | Combined |
|
| 62 |
+
|
| 63 |
+
**Structure**: 2 Docker-only + 2 Workflow-only + 2 Combined = 6 tasks total
|
| 64 |
+
|
| 65 |
+
**Scenarios per task**: Aim for 4-5 scenarios each (total ~25-30 scenarios)
|
| 66 |
+
|
| 67 |
+
---
|
| 68 |
+
|
| 69 |
+
## π GRADING LOGIC
|
| 70 |
+
|
| 71 |
+
### Key Principles:
|
| 72 |
+
- **DYNAMIC**: Score depends on what the agent actually does
|
| 73 |
+
- **DETERMINISTIC**: Same actions = same score (required for reproducibility)
|
| 74 |
+
- **PARTIAL CREDIT**: Reward progress, not just final solution
|
| 75 |
+
|
| 76 |
+
### Score Components:
|
| 77 |
+
|
| 78 |
+
| Component | Weight | Description |
|
| 79 |
+
|-----------|--------|-------------|
|
| 80 |
+
| Issue Identification | 15% | Agent targets correct file/line |
|
| 81 |
+
| Partial Fixes | 25% | Fix is partially correct |
|
| 82 |
+
| Complete Fixes | 40% | All issues fully resolved |
|
| 83 |
+
| Efficiency Bonus | 15% | Solved in minimal steps |
|
| 84 |
+
| Hint Penalty | -5% each | Penalty for hints used |
|
| 85 |
+
|
| 86 |
+
### Example:
|
| 87 |
+
```
|
| 88 |
+
Scenario: Dockerfile has 2 bugs
|
| 89 |
+
|
| 90 |
+
Agent fixes bug 1 only → ~0.4 score
|
| 91 |
+
Agent fixes bug 2 only → ~0.4 score
|
| 92 |
+
Agent fixes both → ~0.85 score
|
| 93 |
+
Agent fixes both quickly → ~1.0 score (with efficiency bonus)
|
| 94 |
+
Agent uses 2 hints → -0.10 penalty
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## π REQUIRED API ENDPOINTS (7 total)
|
| 100 |
+
|
| 101 |
+
| Endpoint | Method | Purpose |
|
| 102 |
+
|----------|--------|---------|
|
| 103 |
+
| `/` | GET | Health check |
|
| 104 |
+
| `/reset` | POST | Start new episode |
|
| 105 |
+
| `/step` | POST | Take action |
|
| 106 |
+
| `/state` | GET | Current state |
|
| 107 |
+
| `/info` | GET | Environment metadata |
|
| 108 |
+
| `/tasks` | GET | List tasks |
|
| 109 |
+
| `/grader` | POST | Grade trajectory |
|
| 110 |
+
| `/baseline` | POST | Run baseline agent |
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
## π PROJECT STRUCTURE
|
| 115 |
+
|
| 116 |
+
```
|
| 117 |
+
cicd-debug-env/
|
| 118 |
+
βββ openenv.yaml # OpenEnv metadata (REQUIRED)
|
| 119 |
+
βββ inference.py # Baseline script (REQUIRED)
|
| 120 |
+
βββ Dockerfile # For HF Spaces (REQUIRED)
|
| 121 |
+
βββ requirements.txt
|
| 122 |
+
βββ README.md
|
| 123 |
+
βββ CONTEXT.md # This file
|
| 124 |
+
β
|
| 125 |
+
βββ server/
|
| 126 |
+
β βββ __init__.py
|
| 127 |
+
β βββ main.py # FastAPI with all 7 endpoints
|
| 128 |
+
β βββ models.py # Pydantic models
|
| 129 |
+
β βββ environment.py # Core environment logic
|
| 130 |
+
β β
|
| 131 |
+
β βββ tasks/
|
| 132 |
+
β β βββ __init__.py
|
| 133 |
+
β β βββ base.py
|
| 134 |
+
β β βββ task_registry.py
|
| 135 |
+
β β βββ task_1_dockerfile_syntax.py
|
| 136 |
+
β β βββ task_2_dockerfile_runtime.py
|
| 137 |
+
β β βββ task_3_workflow_syntax_structure.py
|
| 138 |
+
β β βββ task_4_workflow_secrets_permissions.py
|
| 139 |
+
β β βββ task_5_ci_docker_integration.py
|
| 140 |
+
β β βββ task_6_multi_stage_pipeline_matrix.py
|
| 141 |
+
β β
|
| 142 |
+
β βββ graders/
|
| 143 |
+
β β βββ __init__.py
|
| 144 |
+
β β βββ grader.py
|
| 145 |
+
β β
|
| 146 |
+
β βββ simulators/
|
| 147 |
+
β β βββ __init__.py
|
| 148 |
+
β β βββ docker_simulator.py
|
| 149 |
+
β β βββ workflow_simulator.py
|
| 150 |
+
β β
|
| 151 |
+
β βββ utils/
|
| 152 |
+
β βββ yaml_parser.py
|
| 153 |
+
β
|
| 154 |
+
βββ tests/
|
| 155 |
+
βββ conftest.py
|
| 156 |
+
βββ test_endpoints.py
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
---
|
| 160 |
+
|
| 161 |
+
## 🎯 EXPECTED BASELINE SCORES
|
| 162 |
+
|
| 163 |
+
| Task | Expected Score |
|
| 164 |
+
|------|---------------|
|
| 165 |
+
| dockerfile_syntax | 0.70 |
|
| 166 |
+
| dockerfile_runtime | 0.55 |
|
| 167 |
+
| workflow_syntax_structure | 0.65 |
|
| 168 |
+
| workflow_secrets_permissions | 0.50 |
|
| 169 |
+
| ci_docker_integration | 0.45 |
|
| 170 |
+
| multi_stage_pipeline_matrix | 0.30 |
|
| 171 |
+
|
| 172 |
+
---
|
| 173 |
+
|
| 174 |
+
## ✅ CURRENT STATUS
|
| 175 |
+
|
| 176 |
+
### What's Been Decided:
|
| 177 |
+
- [x] Environment concept (CI/CD debugging)
|
| 178 |
+
- [x] 6 tasks with difficulty progression
|
| 179 |
+
- [x] Grading logic (dynamic + deterministic)
|
| 180 |
+
- [x] Project structure
|
| 181 |
+
- [x] Implementation plan created
|
| 182 |
+
|
| 183 |
+
### Day 1-2: Foundation (COMPLETE)
|
| 184 |
+
- [x] Pydantic models (server/models.py) — Observation, Action, FileEdit, GraderResult, etc.
|
| 185 |
+
- [x] FastAPI server (server/main.py) — All 7 endpoints working
|
| 186 |
+
- [x] openenv.yaml — Full spec compliance
|
| 187 |
+
|
| 188 |
+
### Day 3-4: Core Environment (COMPLETE)
|
| 189 |
+
- [x] Core environment (server/environment.py) — reset, step, state, hint, submit
|
| 190 |
+
- [x] Docker simulator (server/simulators/docker_simulator.py) — 15+ validation rules
|
| 191 |
+
- [x] Workflow simulator (server/simulators/workflow_simulator.py) — 15+ validation rules
|
| 192 |
+
|
| 193 |
+
### Day 5-6: Tasks & Scenarios (COMPLETE)
|
| 194 |
+
- [x] Task 1: dockerfile_syntax (5 scenarios) — typo, bad tag, RUN syntax, EXPOSE, missing FROM
|
| 195 |
+
- [x] Task 2: dockerfile_runtime (5 scenarios) — WORKDIR, CMD/ENTRYPOINT, chmod, ENV, port
|
| 196 |
+
- [x] Task 3: workflow_syntax_structure (5 scenarios) — checkout order, runs-on, triggers, uses/run, on
|
| 197 |
+
- [x] Task 4: workflow_secrets_permissions (5 scenarios) — env secrets, ${{ }}, permissions, env mapping, GHCR
|
| 198 |
+
- [x] Task 5: ci_docker_integration (5 scenarios) — buildx, login secrets, context path, cache, push auth
|
| 199 |
+
- [x] Task 6: multi_stage_pipeline_matrix (5 scenarios) — dist/build, platform ARGs, needs, multi-issue, matrix
|
| 200 |
+
- [x] 30/30 scenarios verified end-to-end
|
| 201 |
+
|
| 202 |
+
### Day 7: Graders & Rewards (COMPLETE)
|
| 203 |
+
- [x] Grader implementation — deterministic, dynamic, partial credit
|
| 204 |
+
- [x] Reward shaping — dense rewards at every step
|
| 205 |
+
- [x] Determinism verified — same input = same output (17 tests)
|
| 206 |
+
- [x] Score ranges verified — 0.0 to 1.0, matching CONTEXT.md examples
|
| 207 |
+
- [x] 26/26 total tests passing
|
| 208 |
+
|
| 209 |
+
### Remaining (Day 8-10):
|
| 210 |
+
- [ ] Baseline inference script (inference.py)
|
| 211 |
+
- [ ] Dockerfile for deployment
|
| 212 |
+
- [ ] Deploy to HuggingFace Spaces
|
| 213 |
+
- [ ] Run `openenv validate`
|
| 214 |
+
- [ ] Test with real LLM (Llama 3.1 70B)
|
| 215 |
+
- [ ] Verify baseline scores match expectations
|
| 216 |
+
- [ ] Write comprehensive README
|
| 217 |
+
- [ ] Final polish and submit
|
| 218 |
+
|
| 219 |
+
---
|
| 220 |
+
|
| 221 |
+
## 🧪 HOW TO RUN
|
| 222 |
+
|
| 223 |
+
### Local Development:
|
| 224 |
+
```bash
|
| 225 |
+
pip install -r requirements.txt
|
| 226 |
+
python -m server.main
|
| 227 |
+
# Server at http://localhost:7860
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
### Test Endpoints:
|
| 231 |
+
```bash
|
| 232 |
+
curl http://localhost:7860/
|
| 233 |
+
curl http://localhost:7860/info
|
| 234 |
+
curl -X POST http://localhost:7860/reset -H "Content-Type: application/json" -d '{}'
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
### Run Tests:
|
| 238 |
+
```bash
|
| 239 |
+
pytest tests/ -v
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
### Docker:
|
| 243 |
+
```bash
|
| 244 |
+
docker build -t cicd-debug-env .
|
| 245 |
+
docker run -p 7860:7860 cicd-debug-env
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
### Baseline Inference:
|
| 249 |
+
```bash
|
| 250 |
+
export API_BASE_URL=https://router.huggingface.co/v1
|
| 251 |
+
export MODEL_NAME=meta-llama/Llama-3.1-70B-Instruct
|
| 252 |
+
export HF_TOKEN=your_token_here
|
| 253 |
+
python inference.py
|
| 254 |
+
```
|
| 255 |
+
|
| 256 |
+
---
|
| 257 |
+
|
| 258 |
+
## 🚨 DISQUALIFICATION CRITERIA (AVOID!)
|
| 259 |
+
|
| 260 |
+
- ❌ Environment does not deploy or respond
|
| 261 |
+
- ❌ Plagiarized or trivially modified existing environments
|
| 262 |
+
- ❌ Graders that always return the same score
|
| 263 |
+
- ❌ No baseline inference script
|
| 264 |
+
|
| 265 |
+
---
|
| 266 |
+
|
| 267 |
+
## 💡 KEY DESIGN DECISIONS
|
| 268 |
+
|
| 269 |
+
1. **Combined Docker + GitHub Actions**: The intersection is the most painful real-world failure
|
| 270 |
+
|
| 271 |
+
2. **6 tasks (2+2+2)**: 2 Docker + 2 Workflow + 2 Combined, clear difficulty progression
|
| 272 |
+
|
| 273 |
+
3. **Dynamic but deterministic grading**: Score varies by agent actions, but same actions = same score
|
| 274 |
+
|
| 275 |
+
4. **Simulated validation**: No real Docker containers, just static analysis for speed and determinism
|
| 276 |
+
|
| 277 |
+
5. **Dense rewards with partial credit**: Better than sparse (pass/fail) for agent training
|
| 278 |
+
|
| 279 |
+
6. **OpenAI client for baseline**: Required by hackathon (not Anthropic client)
|
| 280 |
+
|
| 281 |
+
---
|
| 282 |
+
|
| 283 |
+
## π REFERENCE: Scenario Structure
|
| 284 |
+
|
| 285 |
+
Each scenario should have:
|
| 286 |
+
```python
|
| 287 |
+
{
|
| 288 |
+
"id": "unique_scenario_id",
|
| 289 |
+
"files": [
|
| 290 |
+
{
|
| 291 |
+
"path": "Dockerfile",
|
| 292 |
+
"type": "dockerfile",
|
| 293 |
+
"content": "FROM python:3.11-slim\n..."
|
| 294 |
+
}
|
| 295 |
+
],
|
| 296 |
+
"error": {
|
| 297 |
+
"phase": "docker_build",
|
| 298 |
+
"message": "COPY failed: file not found...",
|
| 299 |
+
"exit_code": 1,
|
| 300 |
+
"failed_step": "COPY requirements.txt",
|
| 301 |
+
"line_hint": 3
|
| 302 |
+
},
|
| 303 |
+
"expected_fixes": [
|
| 304 |
+
{
|
| 305 |
+
"file": "Dockerfile",
|
| 306 |
+
"type": "contains", # or "not_contains", "line_equals", "regex"
|
| 307 |
+
"expected": "COPY requirements.txt",
|
| 308 |
+
"line": 3,
|
| 309 |
+
"hint": "Check the spelling of the filename",
|
| 310 |
+
"points": 0.5
|
| 311 |
+
}
|
| 312 |
+
]
|
| 313 |
+
}
|
| 314 |
+
```
|
| 315 |
+
|
| 316 |
+
---
|
| 317 |
+
|
| 318 |
+
## π COMMON ISSUES TO DEBUG
|
| 319 |
+
|
| 320 |
+
### Dockerfile Issues:
|
| 321 |
+
- Typos in filenames (requirments.txt)
|
| 322 |
+
- Invalid base image tags (python:3.11-slimm)
|
| 323 |
+
- Invalid EXPOSE syntax (EXPOSE "eighty")
|
| 324 |
+
- Missing WORKDIR before COPY
|
| 325 |
+
- Permission issues (chmod +x)
|
| 326 |
+
- CMD/ENTRYPOINT conflicts
|
| 327 |
+
|
| 328 |
+
### Workflow Issues:
|
| 329 |
+
- Missing env block for secrets
|
| 330 |
+
- Wrong secret syntax (${ vs ${{)
|
| 331 |
+
- Missing runs-on field
|
| 332 |
+
- Checkout after build (wrong order)
|
| 333 |
+
- Missing permissions for GITHUB_TOKEN
|
| 334 |
+
- Invalid event triggers
|
| 335 |
+
- Duplicate job IDs
|
| 336 |
+
|
| 337 |
+
### Combined Issues:
|
| 338 |
+
- Docker login needs secrets in env block
|
| 339 |
+
- Multi-platform builds need setup-buildx-action
|
| 340 |
+
- Cross-job artifacts need 'needs' dependency
|
| 341 |
+
- Path mismatches (dist vs build directory)
|
| 342 |
+
- GHCR uses GITHUB_TOKEN not DOCKER_PASSWORD
|
| 343 |
+
|
| 344 |
+
---
|
| 345 |
+
|
| 346 |
+
*Last updated: April 4, 2026*
|
| 347 |
+
*Author: Krishna*
|
IMPLEMENTATION_PLAN.md
CHANGED
|
@@ -129,7 +129,6 @@ cicd-debug-env/
|
|
| 129 |
|
| 130 |
## 4.1 openenv.yaml
|
| 131 |
|
| 132 |
-
```yaml
|
| 133 |
name: cicd-debug-env
|
| 134 |
version: "1.0.0"
|
| 135 |
description: >
|
|
@@ -152,53 +151,73 @@ environment:
|
|
| 152 |
max_steps: 10
|
| 153 |
|
| 154 |
tasks:
|
|
|
|
| 155 |
- id: dockerfile_syntax
|
| 156 |
name: "Dockerfile Syntax Errors"
|
| 157 |
description: "Fix syntax and instruction errors in Dockerfiles"
|
| 158 |
difficulty: easy
|
| 159 |
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
difficulty: medium
|
| 164 |
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
|
|
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
difficulty: hard
|
| 174 |
|
| 175 |
graders:
|
| 176 |
dockerfile_syntax:
|
| 177 |
type: deterministic
|
| 178 |
score_range: [0.0, 1.0]
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
| 183 |
type: deterministic
|
| 184 |
score_range: [0.0, 1.0]
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
| 186 |
type: deterministic
|
| 187 |
score_range: [0.0, 1.0]
|
| 188 |
|
| 189 |
baseline:
|
| 190 |
script: inference.py
|
| 191 |
expected_scores:
|
| 192 |
-
dockerfile_syntax: 0.
|
| 193 |
-
|
|
|
|
|
|
|
| 194 |
ci_docker_integration: 0.45
|
| 195 |
-
multi_stage_pipeline_matrix: 0.
|
| 196 |
|
| 197 |
resources:
|
| 198 |
vcpu: 2
|
| 199 |
memory: 8gb
|
| 200 |
-
timeout: 1200
|
| 201 |
-
```
|
| 202 |
|
| 203 |
## 4.2 Pydantic Models (server/models.py)
|
| 204 |
|
|
@@ -2702,19 +2721,25 @@ echo "=== ALL CHECKS PASSED ==="
|
|
| 2702 |
- [x] Test basic episode flow
|
| 2703 |
|
| 2704 |
### Day 5-6: Tasks & Scenarios
|
| 2705 |
-
- [
|
| 2706 |
-
- [
|
| 2707 |
-
- [
|
| 2708 |
-
- [
|
| 2709 |
-
- [
|
| 2710 |
-
- [
|
| 2711 |
-
- [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2712 |
|
| 2713 |
### Day 7: Graders & Rewards
|
| 2714 |
-
- [
|
| 2715 |
-
- [
|
| 2716 |
-
- [
|
| 2717 |
-
- [
|
|
|
|
|
|
|
| 2718 |
|
| 2719 |
### Day 8: Baseline & Testing
|
| 2720 |
- [ ] Write inference.py baseline
|
|
|
|
| 129 |
|
| 130 |
## 4.1 openenv.yaml
|
| 131 |
|
|
|
|
| 132 |
name: cicd-debug-env
|
| 133 |
version: "1.0.0"
|
| 134 |
description: >
|
|
|
|
| 151 |
max_steps: 10
|
| 152 |
|
| 153 |
tasks:
|
| 154 |
+
# Docker-only tasks (2)
|
| 155 |
- id: dockerfile_syntax
|
| 156 |
name: "Dockerfile Syntax Errors"
|
| 157 |
description: "Fix syntax and instruction errors in Dockerfiles"
|
| 158 |
difficulty: easy
|
| 159 |
|
| 160 |
+
- id: dockerfile_runtime
|
| 161 |
+
name: "Dockerfile Runtime Errors"
|
| 162 |
+
description: "Fix Dockerfiles that build but fail at runtime"
|
| 163 |
+
difficulty: medium
|
| 164 |
+
|
| 165 |
+
# Workflow-only tasks (2)
|
| 166 |
+
- id: workflow_syntax_structure
|
| 167 |
+
name: "Workflow Syntax and Structure"
|
| 168 |
+
description: "Fix YAML syntax and structural issues in GitHub Actions"
|
| 169 |
+
difficulty: easy
|
| 170 |
+
|
| 171 |
+
- id: workflow_secrets_permissions
|
| 172 |
+
name: "Workflow Secrets and Permissions"
|
| 173 |
+
description: "Fix secret wiring, env usage, and permissions in workflows"
|
| 174 |
difficulty: medium
|
| 175 |
|
| 176 |
+
# Combined tasks (2)
|
| 177 |
+
- id: ci_docker_integration
|
| 178 |
+
name: "CI and Docker Build Integration"
|
| 179 |
+
description: "Debug combined workflow and Docker build integration failures"
|
| 180 |
+
difficulty: medium-hard
|
| 181 |
|
| 182 |
+
- id: multi_stage_pipeline_matrix
|
| 183 |
+
name: "Multi-Stage Pipeline and Matrix"
|
| 184 |
+
description: "Debug complex multi-stage and matrix CI/CD pipelines"
|
| 185 |
difficulty: hard
|
| 186 |
|
| 187 |
graders:
|
| 188 |
dockerfile_syntax:
|
| 189 |
type: deterministic
|
| 190 |
score_range: [0.0, 1.0]
|
| 191 |
+
dockerfile_runtime:
|
| 192 |
+
type: deterministic
|
| 193 |
+
score_range: [0.0, 1.0]
|
| 194 |
+
workflow_syntax_structure:
|
| 195 |
+
type: deterministic
|
| 196 |
+
score_range: [0.0, 1.0]
|
| 197 |
+
workflow_secrets_permissions:
|
| 198 |
type: deterministic
|
| 199 |
score_range: [0.0, 1.0]
|
| 200 |
+
ci_docker_integration:
|
| 201 |
+
type: deterministic
|
| 202 |
+
score_range: [0.0, 1.0]
|
| 203 |
+
multi_stage_pipeline_matrix:
|
| 204 |
type: deterministic
|
| 205 |
score_range: [0.0, 1.0]
|
| 206 |
|
| 207 |
baseline:
|
| 208 |
script: inference.py
|
| 209 |
expected_scores:
|
| 210 |
+
dockerfile_syntax: 0.70
|
| 211 |
+
dockerfile_runtime: 0.55
|
| 212 |
+
workflow_syntax_structure: 0.65
|
| 213 |
+
workflow_secrets_permissions: 0.50
|
| 214 |
ci_docker_integration: 0.45
|
| 215 |
+
multi_stage_pipeline_matrix: 0.30
|
| 216 |
|
| 217 |
resources:
|
| 218 |
vcpu: 2
|
| 219 |
memory: 8gb
|
| 220 |
+
timeout: 1200
|
|
|
|
| 221 |
|
| 222 |
## 4.2 Pydantic Models (server/models.py)
|
| 223 |
|
|
|
|
| 2721 |
- [x] Test basic episode flow
|
| 2722 |
|
| 2723 |
### Day 5-6: Tasks & Scenarios
|
| 2724 |
+
- [x] Implement Task 1: Dockerfile Syntax (5 scenarios)
|
| 2725 |
+
- [x] Implement Task 2: Dockerfile Runtime (5 scenarios)
|
| 2726 |
+
- [x] Implement Task 3: Workflow Syntax and Structure (5 scenarios)
|
| 2727 |
+
- [x] Implement Task 4: Workflow Secrets and Permissions (5 scenarios)
|
| 2728 |
+
- [x] Implement Task 5: CI and Docker Build Integration (5 scenarios)
|
| 2729 |
+
- [x] Implement Task 6: Multi-Stage Pipeline and Matrix (5 scenarios)
|
| 2730 |
+
- [x] Verify difficulty progression (easy → medium → hard)
|
| 2731 |
+
- [x] Enhanced DockerSimulator: 15+ validation rules (typos, bad tags, EXPOSE, platform ARGs, runtime: WORKDIR, ENTRYPOINT, ENV, privileged ports)
|
| 2732 |
+
- [x] Enhanced WorkflowSimulator: 15+ validation rules (on trigger, runs-on, branches syntax, run/uses, ${{ }}, permissions, needs, secrets env, GHCR creds, cache, context paths, push auth)
|
| 2733 |
+
- [x] Fixed environment.py: dynamic workflow file lookup, trajectory includes info dict
|
| 2734 |
+
- [x] 30/30 scenarios verified end-to-end (reset → fix → grade)
|
| 2735 |
|
| 2736 |
### Day 7: Graders & Rewards
|
| 2737 |
+
- [x] Implement grader logic (deterministic, dynamic scoring)
|
| 2738 |
+
- [x] Test determinism (10x replay → identical scores)
|
| 2739 |
+
- [x] Tune reward shaping (dense: +0.1 validation, +0.3/fix, -0.05/hint, -0.02/failed)
|
| 2740 |
+
- [x] Verify score ranges (0/n→0.0, partial→~0.5, complete→1.0, hints penalized)
|
| 2741 |
+
- [x] Grader weights: 40% partial fixes + 30% complete bonus + 30% efficiency - 5%/hint
|
| 2742 |
+
- [x] 17 determinism/score-range tests + 26/26 total test suite passing
|
| 2743 |
|
| 2744 |
### Day 8: Baseline & Testing
|
| 2745 |
- [ ] Write inference.py baseline
|
openenv.yaml
CHANGED
|
@@ -24,31 +24,37 @@ tasks:
|
|
| 24 |
name: Dockerfile Syntax Errors
|
| 25 |
description: Fix syntax and instruction errors in Dockerfiles
|
| 26 |
difficulty: easy
|
|
|
|
| 27 |
|
| 28 |
- id: dockerfile_runtime
|
| 29 |
name: Dockerfile Runtime Errors
|
| 30 |
description: Fix runtime/container execution issues in Dockerfiles
|
| 31 |
difficulty: medium
|
|
|
|
| 32 |
|
| 33 |
- id: workflow_syntax_structure
|
| 34 |
name: Workflow Syntax and Structure
|
| 35 |
description: Fix GitHub Actions YAML syntax and job structure issues
|
| 36 |
difficulty: easy
|
|
|
|
| 37 |
|
| 38 |
- id: workflow_secrets_permissions
|
| 39 |
name: Workflow Secrets and Permissions
|
| 40 |
description: Fix secret wiring, env usage, and permissions in workflows
|
| 41 |
difficulty: medium
|
|
|
|
| 42 |
|
| 43 |
- id: ci_docker_integration
|
| 44 |
name: CI and Docker Build Integration
|
| 45 |
description: Debug combined workflow and Docker build integration failures
|
| 46 |
-
difficulty: medium
|
|
|
|
| 47 |
|
| 48 |
- id: multi_stage_pipeline_matrix
|
| 49 |
name: Multi-Stage Pipeline and Matrix
|
| 50 |
description: Debug complex multi-stage and matrix CI/CD pipelines
|
| 51 |
difficulty: hard
|
|
|
|
| 52 |
|
| 53 |
graders:
|
| 54 |
dockerfile_syntax:
|
|
@@ -73,12 +79,12 @@ graders:
|
|
| 73 |
baseline:
|
| 74 |
script: inference.py
|
| 75 |
expected_scores:
|
| 76 |
-
dockerfile_syntax: 0.
|
| 77 |
dockerfile_runtime: 0.55
|
| 78 |
workflow_syntax_structure: 0.65
|
| 79 |
-
workflow_secrets_permissions: 0.
|
| 80 |
ci_docker_integration: 0.45
|
| 81 |
-
multi_stage_pipeline_matrix: 0.
|
| 82 |
|
| 83 |
resources:
|
| 84 |
vcpu: 2
|
|
|
|
| 24 |
name: Dockerfile Syntax Errors
|
| 25 |
description: Fix syntax and instruction errors in Dockerfiles
|
| 26 |
difficulty: easy
|
| 27 |
+
num_scenarios: 5
|
| 28 |
|
| 29 |
- id: dockerfile_runtime
|
| 30 |
name: Dockerfile Runtime Errors
|
| 31 |
description: Fix runtime/container execution issues in Dockerfiles
|
| 32 |
difficulty: medium
|
| 33 |
+
num_scenarios: 5
|
| 34 |
|
| 35 |
- id: workflow_syntax_structure
|
| 36 |
name: Workflow Syntax and Structure
|
| 37 |
description: Fix GitHub Actions YAML syntax and job structure issues
|
| 38 |
difficulty: easy
|
| 39 |
+
num_scenarios: 5
|
| 40 |
|
| 41 |
- id: workflow_secrets_permissions
|
| 42 |
name: Workflow Secrets and Permissions
|
| 43 |
description: Fix secret wiring, env usage, and permissions in workflows
|
| 44 |
difficulty: medium
|
| 45 |
+
num_scenarios: 5
|
| 46 |
|
| 47 |
- id: ci_docker_integration
|
| 48 |
name: CI and Docker Build Integration
|
| 49 |
description: Debug combined workflow and Docker build integration failures
|
| 50 |
+
difficulty: medium-hard
|
| 51 |
+
num_scenarios: 5
|
| 52 |
|
| 53 |
- id: multi_stage_pipeline_matrix
|
| 54 |
name: Multi-Stage Pipeline and Matrix
|
| 55 |
description: Debug complex multi-stage and matrix CI/CD pipelines
|
| 56 |
difficulty: hard
|
| 57 |
+
num_scenarios: 5
|
| 58 |
|
| 59 |
graders:
|
| 60 |
dockerfile_syntax:
|
|
|
|
| 79 |
baseline:
|
| 80 |
script: inference.py
|
| 81 |
expected_scores:
|
| 82 |
+
dockerfile_syntax: 0.70
|
| 83 |
dockerfile_runtime: 0.55
|
| 84 |
workflow_syntax_structure: 0.65
|
| 85 |
+
workflow_secrets_permissions: 0.50
|
| 86 |
ci_docker_integration: 0.45
|
| 87 |
+
multi_stage_pipeline_matrix: 0.30
|
| 88 |
|
| 89 |
resources:
|
| 90 |
vcpu: 2
|
server/environment.py
CHANGED
|
@@ -64,9 +64,17 @@ class CICDDebugEnvironment:
|
|
| 64 |
|
| 65 |
return str(task_id)
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
def _validation_snapshot(self) -> Dict[str, bool]:
|
| 68 |
docker_result = self.docker_sim.validate(self.current_files.get("Dockerfile"), self.current_files)
|
| 69 |
-
|
|
|
|
| 70 |
return {
|
| 71 |
"docker_build_valid": bool(docker_result.get("build_success", False)),
|
| 72 |
"workflow_parse_valid": bool(workflow_result.get("parse_success", False)),
|
|
@@ -176,12 +184,13 @@ class CICDDebugEnvironment:
|
|
| 176 |
self.done = True
|
| 177 |
info["termination_reason"] = "all_fixed"
|
| 178 |
|
| 179 |
-
self.trajectory.append(
|
| 180 |
-
{"step": self.step_count, "action": action.model_dump(), "reward": reward, "done": self.done}
|
| 181 |
-
)
|
| 182 |
info["issues_fixed"] = self.issues_fixed
|
| 183 |
info["issues_total"] = self.issues_total
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
return self.get_observation(), reward, self.done, info
|
| 186 |
|
| 187 |
def _handle_edit(self, action: Action) -> Tuple[float, str]:
|
|
@@ -273,7 +282,7 @@ class CICDDebugEnvironment:
|
|
| 273 |
|
| 274 |
if applied_count == 0:
|
| 275 |
self.last_action_success = False
|
| 276 |
-
return max(0.
|
| 277 |
|
| 278 |
self.last_action_success = True
|
| 279 |
return max(0.0, reward), "; ".join(feedbacks)
|
|
@@ -304,7 +313,8 @@ class CICDDebugEnvironment:
|
|
| 304 |
|
| 305 |
def _handle_submit(self) -> Tuple[float, str]:
|
| 306 |
docker_result = self.docker_sim.validate(self.current_files.get("Dockerfile"), self.current_files)
|
| 307 |
-
|
|
|
|
| 308 |
|
| 309 |
reward = 0.0
|
| 310 |
parts: List[str] = []
|
|
|
|
| 64 |
|
| 65 |
return str(task_id)
|
| 66 |
|
| 67 |
+
def _find_workflow_file(self) -> Optional[FileContent]:
|
| 68 |
+
"""Return the first workflow file found in current_files."""
|
| 69 |
+
for path, fc in self.current_files.items():
|
| 70 |
+
if path.startswith(".github/workflows/") and path.endswith(".yml"):
|
| 71 |
+
return fc
|
| 72 |
+
return None
|
| 73 |
+
|
| 74 |
def _validation_snapshot(self) -> Dict[str, bool]:
|
| 75 |
docker_result = self.docker_sim.validate(self.current_files.get("Dockerfile"), self.current_files)
|
| 76 |
+
workflow_file = self._find_workflow_file()
|
| 77 |
+
workflow_result = self.workflow_sim.validate(workflow_file, self.current_files)
|
| 78 |
return {
|
| 79 |
"docker_build_valid": bool(docker_result.get("build_success", False)),
|
| 80 |
"workflow_parse_valid": bool(workflow_result.get("parse_success", False)),
|
|
|
|
| 184 |
self.done = True
|
| 185 |
info["termination_reason"] = "all_fixed"
|
| 186 |
|
|
|
|
|
|
|
|
|
|
| 187 |
info["issues_fixed"] = self.issues_fixed
|
| 188 |
info["issues_total"] = self.issues_total
|
| 189 |
|
| 190 |
+
self.trajectory.append(
|
| 191 |
+
{"step": self.step_count, "action": action.model_dump(), "reward": reward, "done": self.done, "info": info}
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
return self.get_observation(), reward, self.done, info
|
| 195 |
|
| 196 |
def _handle_edit(self, action: Action) -> Tuple[float, str]:
|
|
|
|
| 282 |
|
| 283 |
if applied_count == 0:
|
| 284 |
self.last_action_success = False
|
| 285 |
+
return max(-0.02, reward - 0.02), "; ".join(feedbacks) or "No edit applied"
|
| 286 |
|
| 287 |
self.last_action_success = True
|
| 288 |
return max(0.0, reward), "; ".join(feedbacks)
|
|
|
|
| 313 |
|
| 314 |
def _handle_submit(self) -> Tuple[float, str]:
|
| 315 |
docker_result = self.docker_sim.validate(self.current_files.get("Dockerfile"), self.current_files)
|
| 316 |
+
workflow_file = self._find_workflow_file()
|
| 317 |
+
workflow_result = self.workflow_sim.validate(workflow_file, self.current_files)
|
| 318 |
|
| 319 |
reward = 0.0
|
| 320 |
parts: List[str] = []
|
server/graders/__init__.py
CHANGED
|
@@ -1,4 +1,18 @@
|
|
| 1 |
-
"""Deterministic grader for trajectory scoring.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
@@ -7,6 +21,19 @@ from typing import Any, Dict, List
|
|
| 7 |
from server.models import GraderResult
|
| 8 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
| 12 |
if task_id not in TASK_REGISTRY:
|
|
@@ -16,7 +43,7 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
|
| 16 |
return GraderResult(
|
| 17 |
task_id=task_id,
|
| 18 |
score=0.0,
|
| 19 |
-
breakdown={"
|
| 20 |
feedback="No actions taken",
|
| 21 |
steps_taken=0,
|
| 22 |
hints_used=0,
|
|
@@ -30,65 +57,57 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
|
| 30 |
issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
|
| 31 |
fix_ratio = issues_fixed / issues_total
|
| 32 |
|
| 33 |
-
# Component 1:
|
| 34 |
-
|
| 35 |
|
| 36 |
-
# Component 2:
|
| 37 |
-
|
| 38 |
-
total_edit_actions = 0
|
| 39 |
-
for step in trajectory:
|
| 40 |
-
action = step.get("action", {})
|
| 41 |
-
action_type = action.get("action_type")
|
| 42 |
-
edits = action.get("edits") or []
|
| 43 |
-
if action_type in {"edit_file", "replace_line", "add_line", "delete_line", "add_block", "delete_block"}:
|
| 44 |
-
total_edit_actions += 1
|
| 45 |
-
has_valid_edit = False
|
| 46 |
-
for edit in edits:
|
| 47 |
-
if edit.get("file_path") and (
|
| 48 |
-
edit.get("line_number") is None or isinstance(edit.get("line_number"), int)
|
| 49 |
-
):
|
| 50 |
-
has_valid_edit = True
|
| 51 |
-
if has_valid_edit:
|
| 52 |
-
valid_edit_actions += 1
|
| 53 |
-
|
| 54 |
-
if total_edit_actions == 0:
|
| 55 |
-
action_quality_score = 0.0
|
| 56 |
-
else:
|
| 57 |
-
action_quality_score = 0.15 * (valid_edit_actions / total_edit_actions)
|
| 58 |
|
| 59 |
-
# Component 3:
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
efficiency_score = 0.10
|
| 65 |
else:
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
score =
|
| 72 |
-
score = max(0.0, min(1.0, score))
|
| 73 |
|
| 74 |
if score >= 0.9:
|
| 75 |
-
feedback = "Excellent!
|
| 76 |
elif score >= 0.7:
|
| 77 |
-
feedback = "Good
|
| 78 |
elif score >= 0.5:
|
| 79 |
-
feedback = "Partial success. Some issues remain
|
|
|
|
|
|
|
| 80 |
else:
|
| 81 |
-
feedback = "
|
| 82 |
|
| 83 |
return GraderResult(
|
| 84 |
task_id=task_id,
|
| 85 |
-
score=
|
| 86 |
breakdown={
|
| 87 |
-
"
|
| 88 |
-
"
|
| 89 |
-
"complete_solution": round(full_solution_bonus, 3),
|
| 90 |
"efficiency": round(efficiency_score, 3),
|
| 91 |
-
"hint_penalty": round(-
|
|
|
|
| 92 |
},
|
| 93 |
feedback=feedback,
|
| 94 |
steps_taken=steps_taken,
|
|
|
|
| 1 |
+
"""Deterministic grader for trajectory scoring.
|
| 2 |
+
|
| 3 |
+
Scoring breakdown (matches CONTEXT.md):
|
| 4 |
+
- Partial fixes: 40% proportional to fix ratio
|
| 5 |
+
- Complete solution bonus: 30% if ALL issues fixed
|
| 6 |
+
- Efficiency: 30% max, decays with extra steps
|
| 7 |
+
- Hint penalty: -5% per hint used
|
| 8 |
+
- Failed action penalty: -2% per failed edit (no valid edits)
|
| 9 |
+
|
| 10 |
+
Score examples (2-bug scenario):
|
| 11 |
+
Fix 1/2 → ~0.40
|
| 12 |
+
Fix 2/2 (slow) → ~0.85
|
| 13 |
+
Fix 2/2 (fast) → ~1.0
|
| 14 |
+
2 hints used → -0.10
|
| 15 |
+
"""
|
| 16 |
|
| 17 |
from __future__ import annotations
|
| 18 |
|
|
|
|
| 21 |
from server.models import GraderResult
|
| 22 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 23 |
|
| 24 |
+
# Tunable weights
|
| 25 |
+
PARTIAL_FIX_WEIGHT = 0.40
|
| 26 |
+
COMPLETE_BONUS = 0.30
|
| 27 |
+
EFFICIENCY_MAX = 0.30
|
| 28 |
+
EFFICIENCY_DECAY = 0.03 # per extra step beyond optimal
|
| 29 |
+
HINT_PENALTY = 0.05
|
| 30 |
+
FAILED_ACTION_PENALTY = 0.02
|
| 31 |
+
|
| 32 |
+
EDIT_ACTION_TYPES = frozenset({
|
| 33 |
+
"edit_file", "replace_line", "add_line",
|
| 34 |
+
"delete_line", "add_block", "delete_block",
|
| 35 |
+
})
|
| 36 |
+
|
| 37 |
|
| 38 |
def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
| 39 |
if task_id not in TASK_REGISTRY:
|
|
|
|
| 43 |
return GraderResult(
|
| 44 |
task_id=task_id,
|
| 45 |
score=0.0,
|
| 46 |
+
breakdown={"partial_fixes": 0.0, "complete_solution": 0.0, "efficiency": 0.0, "hint_penalty": 0.0},
|
| 47 |
feedback="No actions taken",
|
| 48 |
steps_taken=0,
|
| 49 |
hints_used=0,
|
|
|
|
| 57 |
issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
|
| 58 |
fix_ratio = issues_fixed / issues_total
|
| 59 |
|
| 60 |
+
# Component 1: Partial fix credit (proportional)
|
| 61 |
+
partial_score = PARTIAL_FIX_WEIGHT * fix_ratio
|
| 62 |
|
| 63 |
+
# Component 2: Full-solution bonus (only when ALL issues fixed)
|
| 64 |
+
complete_bonus = COMPLETE_BONUS if issues_fixed == issues_total else 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
+
# Component 3: Efficiency bonus (only awarded if at least one fix)
|
| 67 |
+
if issues_fixed == 0:
|
| 68 |
+
efficiency_score = 0.0
|
| 69 |
+
elif steps_taken <= issues_total:
|
| 70 |
+
efficiency_score = EFFICIENCY_MAX
|
|
|
|
| 71 |
else:
|
| 72 |
+
extra = steps_taken - issues_total
|
| 73 |
+
efficiency_score = max(0.0, EFFICIENCY_MAX - EFFICIENCY_DECAY * extra)
|
| 74 |
+
|
| 75 |
+
# Component 4: Hint penalty
|
| 76 |
+
hint_pen = HINT_PENALTY * hints_used
|
| 77 |
|
| 78 |
+
# Component 5: Failed action penalty (edits with no valid file_path)
|
| 79 |
+
failed_edits = 0
|
| 80 |
+
for step in trajectory:
|
| 81 |
+
action = step.get("action", {})
|
| 82 |
+
if action.get("action_type") in EDIT_ACTION_TYPES:
|
| 83 |
+
edits = action.get("edits") or []
|
| 84 |
+
if not any(e.get("file_path") for e in edits):
|
| 85 |
+
failed_edits += 1
|
| 86 |
+
failed_pen = FAILED_ACTION_PENALTY * failed_edits
|
| 87 |
|
| 88 |
+
score = partial_score + complete_bonus + efficiency_score - hint_pen - failed_pen
|
| 89 |
+
score = max(0.0, min(1.0, round(score, 3)))
|
| 90 |
|
| 91 |
if score >= 0.9:
|
| 92 |
+
feedback = "Excellent! All issues fixed efficiently."
|
| 93 |
elif score >= 0.7:
|
| 94 |
+
feedback = "Good job! Most issues fixed."
|
| 95 |
elif score >= 0.5:
|
| 96 |
+
feedback = "Partial success. Some issues remain."
|
| 97 |
+
elif score >= 0.3:
|
| 98 |
+
feedback = "Limited progress. Review the error messages carefully."
|
| 99 |
else:
|
| 100 |
+
feedback = "Needs improvement. Try analyzing the error phase first."
|
| 101 |
|
| 102 |
return GraderResult(
|
| 103 |
task_id=task_id,
|
| 104 |
+
score=score,
|
| 105 |
breakdown={
|
| 106 |
+
"partial_fixes": round(partial_score, 3),
|
| 107 |
+
"complete_solution": round(complete_bonus, 3),
|
|
|
|
| 108 |
"efficiency": round(efficiency_score, 3),
|
| 109 |
+
"hint_penalty": round(-hint_pen, 3),
|
| 110 |
+
"failed_action_penalty": round(-failed_pen, 3),
|
| 111 |
},
|
| 112 |
feedback=feedback,
|
| 113 |
steps_taken=steps_taken,
|
server/graders/base.py
CHANGED
|
@@ -1 +1,101 @@
|
|
| 1 |
-
"""Base grader interface
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Base grader interface with shared scoring utilities.
|
| 2 |
+
|
| 3 |
+
The concrete default grader lives in ``server.graders.__init__``.
|
| 4 |
+
This module provides a class-based interface for task-specific overrides.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from typing import Any, Dict, List
|
| 10 |
+
|
| 11 |
+
from server.models import GraderResult
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BaseGrader:
    """Base class for task graders.

    Subclass and override ``grade()`` for task-specific scoring.
    The default pipeline in ``server.graders.__init__.run_grader``
    works for all tasks without subclassing.

    The score computed by ``compute_score`` is::

        partial-fix credit + complete-solution bonus + efficiency bonus
        - hint penalty - failed-edit penalty

    rounded to three decimals and clamped to ``[0.0, 1.0]``.
    """

    # Tunable weights; subclasses may override any of them.
    PARTIAL_FIX_WEIGHT: float = 0.40
    COMPLETE_BONUS: float = 0.30
    EFFICIENCY_MAX: float = 0.30
    EFFICIENCY_DECAY: float = 0.03  # per extra step beyond issues_total
    HINT_PENALTY_EACH: float = 0.05
    FAILED_ACTION_PENALTY: float = 0.02

    # Action types that count as edit attempts for the failed-edit penalty.
    EDIT_ACTION_TYPES = frozenset({
        "edit_file", "replace_line", "add_line",
        "delete_line", "add_block", "delete_block",
    })

    def grade(self, task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
        """Grade a trajectory; the default delegates to ``compute_score``."""
        return self.compute_score(task_id, trajectory)

    def compute_score(self, task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
        """Score a trajectory from its final step's ``info`` and its actions.

        Args:
            task_id: Identifier copied into the result.
            trajectory: Step records; each is read for ``action`` and the
                last one for ``info["issues_fixed"]`` / ``info["issues_total"]``.

        Returns:
            A ``GraderResult`` with the clamped score, a per-component
            breakdown, a feedback message, and step/hint counts. An empty
            trajectory yields a zero score with feedback "No actions taken".
        """
        if not trajectory:
            return GraderResult(
                task_id=task_id,
                score=0.0,
                breakdown={"partial_fixes": 0.0, "complete_solution": 0.0, "efficiency": 0.0, "hint_penalty": 0.0},
                feedback="No actions taken",
                steps_taken=0,
                hints_used=0,
            )

        final_info = trajectory[-1].get("info", {})
        steps_taken = len(trajectory)
        hints_used = self._count_hints(trajectory)

        issues_fixed = int(final_info.get("issues_fixed", 0))
        # Clamp to at least 1 so the ratio below never divides by zero.
        issues_total = max(1, int(final_info.get("issues_total", 1)))
        fix_ratio = issues_fixed / issues_total

        partial_score = self.PARTIAL_FIX_WEIGHT * fix_ratio
        complete_bonus = self.COMPLETE_BONUS if issues_fixed == issues_total else 0.0
        efficiency = self._efficiency_score(steps_taken, issues_total, issues_fixed)
        hint_pen = self.HINT_PENALTY_EACH * hints_used
        failed_pen = self.FAILED_ACTION_PENALTY * self._count_failed_edits(trajectory)

        total = partial_score + complete_bonus + efficiency - hint_pen - failed_pen
        # Round before clamping and before choosing feedback so the message
        # always agrees with the score that is actually reported.
        score = max(0.0, min(1.0, round(total, 3)))

        return GraderResult(
            task_id=task_id,
            score=score,
            breakdown={
                "partial_fixes": round(partial_score, 3),
                "complete_solution": round(complete_bonus, 3),
                "efficiency": round(efficiency, 3),
                "hint_penalty": round(-hint_pen, 3),
                "failed_action_penalty": round(-failed_pen, 3),
            },
            feedback=self._feedback_message(score),
            steps_taken=steps_taken,
            hints_used=hints_used,
        )

    @staticmethod
    def _count_hints(trajectory: List[Dict[str, Any]]) -> int:
        """Return the number of steps whose action type is ``request_hint``."""
        return sum(
            1 for step in trajectory
            if step.get("action", {}).get("action_type") == "request_hint"
        )

    @classmethod
    def _count_failed_edits(cls, trajectory: List[Dict[str, Any]]) -> int:
        """Return the number of edit actions that carry no edit with a ``file_path``.

        An action counts as failed when its type is in ``EDIT_ACTION_TYPES``
        and none of its ``edits`` entries has a truthy ``file_path`` (this
        includes a missing or empty ``edits`` list).
        """
        failed = 0
        for step in trajectory:
            action = step.get("action") or {}
            if action.get("action_type") not in cls.EDIT_ACTION_TYPES:
                continue
            edits = action.get("edits") or []
            if not any(isinstance(e, dict) and e.get("file_path") for e in edits):
                failed += 1
        return failed

    def _efficiency_score(self, steps_taken: int, issues_total: int, issues_fixed: int = 1) -> float:
        """Return the efficiency bonus for a run.

        No bonus is given when nothing was fixed. Otherwise the full
        ``EFFICIENCY_MAX`` is awarded when ``steps_taken <= issues_total``,
        and it decays by ``EFFICIENCY_DECAY`` per extra step, never below 0.
        """
        if issues_fixed == 0:
            return 0.0
        if steps_taken <= issues_total:
            return self.EFFICIENCY_MAX
        return max(0.0, self.EFFICIENCY_MAX - self.EFFICIENCY_DECAY * (steps_taken - issues_total))

    @staticmethod
    def _feedback_message(score: float) -> str:
        """Map a score to its feedback tier (thresholds 0.9 / 0.7 / 0.5 / 0.3)."""
        if score >= 0.9:
            return "Excellent! All issues fixed efficiently."
        if score >= 0.7:
            return "Good job! Most issues fixed."
        if score >= 0.5:
            return "Partial success. Some issues remain."
        if score >= 0.3:
            return "Limited progress. Review the error messages carefully."
        return "Needs improvement. Try analyzing the error phase first."
|
server/models.py
CHANGED
|
@@ -91,7 +91,7 @@ class Action(BaseModel):
|
|
| 91 |
|
| 92 |
class StepResult(BaseModel):
|
| 93 |
observation: Observation
|
| 94 |
-
reward: float = Field(..., ge=
|
| 95 |
done: bool
|
| 96 |
info: Dict[str, Any] = Field(default_factory=dict)
|
| 97 |
|
|
|
|
| 91 |
|
| 92 |
class StepResult(BaseModel):
|
| 93 |
observation: Observation
|
| 94 |
+
reward: float = Field(..., ge=-1.0, le=2.0)
|
| 95 |
done: bool
|
| 96 |
info: Dict[str, Any] = Field(default_factory=dict)
|
| 97 |
|
server/simulators/docker_simulator.py
CHANGED
|
@@ -43,6 +43,22 @@ class DockerSimulator:
|
|
| 43 |
return any(path.startswith(prefix) for path in context_files)
|
| 44 |
return source in context_files
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
def validate(self, dockerfile: Optional[FileContent], context_files: Dict[str, FileContent]):
|
| 47 |
if dockerfile is None:
|
| 48 |
return {"build_success": False, "run_success": False, "error": "Dockerfile missing"}
|
|
@@ -54,15 +70,28 @@ class DockerSimulator:
|
|
| 54 |
if not active_lines:
|
| 55 |
return {"build_success": False, "run_success": False, "error": "Dockerfile is empty"}
|
| 56 |
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
return {
|
| 59 |
"build_success": False,
|
| 60 |
"run_success": False,
|
| 61 |
"error": "Dockerfile must start with FROM",
|
| 62 |
}
|
| 63 |
|
|
|
|
| 64 |
for idx, raw in enumerate(active_lines, start=1):
|
| 65 |
token = raw.split()[0].upper()
|
|
|
|
|
|
|
|
|
|
| 66 |
if token.startswith("&&"):
|
| 67 |
return {
|
| 68 |
"build_success": False,
|
|
@@ -70,6 +99,9 @@ class DockerSimulator:
|
|
| 70 |
"error": f"Dockerfile parse error: unknown instruction: {token}",
|
| 71 |
"line": idx,
|
| 72 |
}
|
|
|
|
|
|
|
|
|
|
| 73 |
if token not in self.VALID_INSTRUCTIONS:
|
| 74 |
return {
|
| 75 |
"build_success": False,
|
|
@@ -78,6 +110,7 @@ class DockerSimulator:
|
|
| 78 |
"line": idx,
|
| 79 |
}
|
| 80 |
|
|
|
|
| 81 |
if "FROM python:3.9-slimm" in content:
|
| 82 |
return {
|
| 83 |
"build_success": False,
|
|
@@ -85,6 +118,15 @@ class DockerSimulator:
|
|
| 85 |
"error": "pull access denied for python:3.9-slimm",
|
| 86 |
}
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
for raw in active_lines:
|
| 89 |
upper = raw.upper()
|
| 90 |
if upper.startswith("COPY "):
|
|
@@ -107,6 +149,7 @@ class DockerSimulator:
|
|
| 107 |
"error": f"COPY failed: file not found in build context: {src}",
|
| 108 |
}
|
| 109 |
|
|
|
|
| 110 |
if "--platform=$BUILDPLATFORM" in content and "ARG BUILDPLATFORM" not in content:
|
| 111 |
return {
|
| 112 |
"build_success": False,
|
|
@@ -120,6 +163,7 @@ class DockerSimulator:
|
|
| 120 |
"error": "failed to parse platform: TARGETPLATFORM not declared",
|
| 121 |
}
|
| 122 |
|
|
|
|
| 123 |
if "COPY --from=builder /app/dist" in content:
|
| 124 |
pkg = context_files.get("package.json")
|
| 125 |
if pkg and "react-scripts build" in pkg.content:
|
|
@@ -129,18 +173,84 @@ class DockerSimulator:
|
|
| 129 |
"error": "COPY failed: stat app/dist: file does not exist",
|
| 130 |
}
|
| 131 |
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
return {
|
| 134 |
-
"build_success":
|
| 135 |
"run_success": False,
|
| 136 |
-
"
|
| 137 |
}
|
| 138 |
|
| 139 |
-
|
|
|
|
| 140 |
return {
|
| 141 |
"build_success": True,
|
| 142 |
"run_success": False,
|
| 143 |
-
"run_error": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
}
|
| 145 |
|
| 146 |
return {"build_success": True, "run_success": True}
|
|
|
|
| 43 |
return any(path.startswith(prefix) for path in context_files)
|
| 44 |
return source in context_files
|
| 45 |
|
| 46 |
+
def _join_continuation_lines(self, lines: List[str]) -> List[str]:
|
| 47 |
+
"""Join lines ending with backslash into single logical lines."""
|
| 48 |
+
result: List[str] = []
|
| 49 |
+
current = ""
|
| 50 |
+
for line in lines:
|
| 51 |
+
stripped = line.rstrip()
|
| 52 |
+
if stripped.endswith("\\"):
|
| 53 |
+
current += stripped[:-1] + " "
|
| 54 |
+
else:
|
| 55 |
+
current += stripped
|
| 56 |
+
result.append(current)
|
| 57 |
+
current = ""
|
| 58 |
+
if current:
|
| 59 |
+
result.append(current)
|
| 60 |
+
return result
|
| 61 |
+
|
| 62 |
def validate(self, dockerfile: Optional[FileContent], context_files: Dict[str, FileContent]):
|
| 63 |
if dockerfile is None:
|
| 64 |
return {"build_success": False, "run_success": False, "error": "Dockerfile missing"}
|
|
|
|
| 70 |
if not active_lines:
|
| 71 |
return {"build_success": False, "run_success": False, "error": "Dockerfile is empty"}
|
| 72 |
|
| 73 |
+
# --- ARG before FROM is allowed, but first non-ARG instruction must be FROM ---
|
| 74 |
+
first_non_arg = None
|
| 75 |
+
for line in active_lines:
|
| 76 |
+
token = line.split()[0].upper()
|
| 77 |
+
if token == "ARG":
|
| 78 |
+
continue
|
| 79 |
+
first_non_arg = token
|
| 80 |
+
break
|
| 81 |
+
|
| 82 |
+
if first_non_arg is None or first_non_arg != "FROM":
|
| 83 |
return {
|
| 84 |
"build_success": False,
|
| 85 |
"run_success": False,
|
| 86 |
"error": "Dockerfile must start with FROM",
|
| 87 |
}
|
| 88 |
|
| 89 |
+
# --- Instruction validation ---
|
| 90 |
for idx, raw in enumerate(active_lines, start=1):
|
| 91 |
token = raw.split()[0].upper()
|
| 92 |
+
# Handle --platform= prefix on FROM
|
| 93 |
+
if token.startswith("FROM"):
|
| 94 |
+
token = "FROM"
|
| 95 |
if token.startswith("&&"):
|
| 96 |
return {
|
| 97 |
"build_success": False,
|
|
|
|
| 99 |
"error": f"Dockerfile parse error: unknown instruction: {token}",
|
| 100 |
"line": idx,
|
| 101 |
}
|
| 102 |
+
# Strip leading --flags (e.g. --platform=...) — the instruction is after
|
| 103 |
+
if token.startswith("--"):
|
| 104 |
+
continue
|
| 105 |
if token not in self.VALID_INSTRUCTIONS:
|
| 106 |
return {
|
| 107 |
"build_success": False,
|
|
|
|
| 110 |
"line": idx,
|
| 111 |
}
|
| 112 |
|
| 113 |
+
# --- Invalid base image tags ---
|
| 114 |
if "FROM python:3.9-slimm" in content:
|
| 115 |
return {
|
| 116 |
"build_success": False,
|
|
|
|
| 118 |
"error": "pull access denied for python:3.9-slimm",
|
| 119 |
}
|
| 120 |
|
| 121 |
+
# --- Typo in requirements filename ---
|
| 122 |
+
if "requirments.txt" in content:
|
| 123 |
+
return {
|
| 124 |
+
"build_success": False,
|
| 125 |
+
"run_success": False,
|
| 126 |
+
"error": "COPY failed: file not found in build context: requirments.txt",
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
# --- COPY source validation ---
|
| 130 |
for raw in active_lines:
|
| 131 |
upper = raw.upper()
|
| 132 |
if upper.startswith("COPY "):
|
|
|
|
| 149 |
"error": f"COPY failed: file not found in build context: {src}",
|
| 150 |
}
|
| 151 |
|
| 152 |
+
# --- Platform ARG declarations ---
|
| 153 |
if "--platform=$BUILDPLATFORM" in content and "ARG BUILDPLATFORM" not in content:
|
| 154 |
return {
|
| 155 |
"build_success": False,
|
|
|
|
| 163 |
"error": "failed to parse platform: TARGETPLATFORM not declared",
|
| 164 |
}
|
| 165 |
|
| 166 |
+
# --- Multi-stage artifact path mismatch (dist vs build) ---
|
| 167 |
if "COPY --from=builder /app/dist" in content:
|
| 168 |
pkg = context_files.get("package.json")
|
| 169 |
if pkg and "react-scripts build" in pkg.content:
|
|
|
|
| 173 |
"error": "COPY failed: stat app/dist: file does not exist",
|
| 174 |
}
|
| 175 |
|
| 176 |
+
# --- EXPOSE string validation ---
|
| 177 |
+
for raw in active_lines:
|
| 178 |
+
upper = raw.upper()
|
| 179 |
+
if upper.startswith("EXPOSE "):
|
| 180 |
+
parts = raw.split()
|
| 181 |
+
for part in parts[1:]:
|
| 182 |
+
cleaned = part.strip('"').strip("'")
|
| 183 |
+
port_proto = cleaned.split("/")[0]
|
| 184 |
+
if not port_proto.isdigit():
|
| 185 |
+
return {
|
| 186 |
+
"build_success": False,
|
| 187 |
+
"run_success": False,
|
| 188 |
+
"error": f"EXPOSE requires numeric port or port/protocol, got: {cleaned}",
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
# =====================================================
|
| 192 |
+
# Runtime checks (build succeeds, run may fail)
|
| 193 |
+
# =====================================================
|
| 194 |
+
|
| 195 |
+
# --- Missing WORKDIR causing module resolution failures ---
|
| 196 |
+
has_workdir = "WORKDIR" in content
|
| 197 |
+
if ("npm start" in content or 'CMD ["npm", "start"]' in content) and not has_workdir:
|
| 198 |
return {
|
| 199 |
+
"build_success": True,
|
| 200 |
"run_success": False,
|
| 201 |
+
"run_error": "Error: Cannot find module '/package.json'",
|
| 202 |
}
|
| 203 |
|
| 204 |
+
# --- ENTRYPOINT + identical CMD conflict ---
|
| 205 |
+
if 'ENTRYPOINT ["python"' in content and 'CMD ["python"' in content:
|
| 206 |
return {
|
| 207 |
"build_success": True,
|
| 208 |
"run_success": False,
|
| 209 |
+
"run_error": "container exits immediately; ENTRYPOINT and CMD both specify full command",
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
# --- Entrypoint script not executable ---
|
| 213 |
+
if 'ENTRYPOINT ["./start.sh"]' in content and "chmod +x" not in content:
|
| 214 |
+
return {
|
| 215 |
+
"build_success": True,
|
| 216 |
+
"run_success": False,
|
| 217 |
+
"run_error": "exec ./start.sh: permission denied",
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
# --- Missing required ENV variable (DATABASE_URL) ---
|
| 221 |
+
# Check if the scenario error mentions DATABASE_URL (via context files or content)
|
| 222 |
+
has_database_url_env = "ENV DATABASE_URL" in content
|
| 223 |
+
needs_database_url = (
|
| 224 |
+
"app.py" in content
|
| 225 |
+
and "DATABASE_URL" not in content
|
| 226 |
+
and any("gunicorn" in fc.content for fc in context_files.values() if fc.content)
|
| 227 |
+
)
|
| 228 |
+
if needs_database_url and not has_database_url_env:
|
| 229 |
+
return {
|
| 230 |
+
"build_success": True,
|
| 231 |
+
"run_success": False,
|
| 232 |
+
"run_error": "KeyError: 'DATABASE_URL' β Application requires DATABASE_URL environment variable",
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
# --- Non-root user binding to privileged port ---
|
| 236 |
+
has_user_switch = False
|
| 237 |
+
expose_port = None
|
| 238 |
+
for raw in active_lines:
|
| 239 |
+
upper = raw.upper()
|
| 240 |
+
if upper.startswith("USER ") and "root" not in raw.lower():
|
| 241 |
+
has_user_switch = True
|
| 242 |
+
if upper.startswith("EXPOSE "):
|
| 243 |
+
parts = raw.split()
|
| 244 |
+
if len(parts) >= 2:
|
| 245 |
+
port_str = parts[1].split("/")[0].strip('"').strip("'")
|
| 246 |
+
if port_str.isdigit():
|
| 247 |
+
expose_port = int(port_str)
|
| 248 |
+
|
| 249 |
+
if has_user_switch and expose_port is not None and expose_port < 1024:
|
| 250 |
+
return {
|
| 251 |
+
"build_success": True,
|
| 252 |
+
"run_success": False,
|
| 253 |
+
"run_error": f"PermissionError: [Errno 13] Permission denied β non-root user cannot bind to port {expose_port}",
|
| 254 |
}
|
| 255 |
|
| 256 |
return {"build_success": True, "run_success": True}
|
server/simulators/workflow_simulator.py
CHANGED
|
@@ -2,7 +2,8 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
|
|
|
|
| 6 |
|
| 7 |
import yaml
|
| 8 |
|
|
@@ -12,10 +13,24 @@ from server.models import FileContent
|
|
| 12 |
class WorkflowSimulator:
|
| 13 |
def validate(self, workflow: Optional[FileContent], files: Dict[str, FileContent]):
|
| 14 |
if workflow is None:
|
| 15 |
-
# Not all easy tasks include workflow; keep this permissive.
|
| 16 |
return {"parse_success": True, "execution_success": True}
|
| 17 |
|
| 18 |
content = workflow.content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
try:
|
| 20 |
parsed = yaml.safe_load(content)
|
| 21 |
except yaml.YAMLError as exc:
|
|
@@ -32,6 +47,33 @@ class WorkflowSimulator:
|
|
| 32 |
"error": "Workflow root must be a mapping",
|
| 33 |
}
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
jobs = parsed.get("jobs")
|
| 36 |
if not isinstance(jobs, dict) or not jobs:
|
| 37 |
return {
|
|
@@ -40,33 +82,75 @@ class WorkflowSimulator:
|
|
| 40 |
"error": "Workflow must define at least one job",
|
| 41 |
}
|
| 42 |
|
|
|
|
| 43 |
has_buildx_setup = "docker/setup-buildx-action" in content
|
| 44 |
has_platforms = "platforms:" in content
|
| 45 |
has_docker_login = "docker login" in content
|
|
|
|
| 46 |
has_username_secret = "secrets.DOCKER_USERNAME" in content
|
| 47 |
has_password_secret = "secrets.DOCKER_PASSWORD" in content
|
|
|
|
| 48 |
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
| 50 |
if not isinstance(job, dict):
|
| 51 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
steps = job.get("steps", [])
|
| 53 |
if not isinstance(steps, list):
|
| 54 |
return {
|
| 55 |
"parse_success": False,
|
| 56 |
"execution_success": False,
|
| 57 |
-
"error": "Job steps must be a list",
|
| 58 |
}
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
checkout_index = -1
|
| 61 |
build_index = -1
|
| 62 |
for idx, step in enumerate(steps):
|
| 63 |
if not isinstance(step, dict):
|
| 64 |
continue
|
| 65 |
uses = step.get("uses", "")
|
| 66 |
-
|
| 67 |
if isinstance(uses, str) and "actions/checkout" in uses:
|
| 68 |
checkout_index = idx
|
| 69 |
-
if (isinstance(
|
| 70 |
isinstance(uses, str) and "docker/build-push-action" in uses
|
| 71 |
):
|
| 72 |
build_index = idx
|
|
@@ -78,13 +162,74 @@ class WorkflowSimulator:
|
|
| 78 |
"exec_error": "Checkout must happen before Docker build steps",
|
| 79 |
}
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
|
|
|
| 88 |
if has_platforms and not has_buildx_setup:
|
| 89 |
return {
|
| 90 |
"parse_success": True,
|
|
@@ -92,4 +237,89 @@ class WorkflowSimulator:
|
|
| 92 |
"exec_error": "Multi-platform build requires docker/setup-buildx-action",
|
| 93 |
}
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
return {"parse_success": True, "execution_success": True}
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
import re
|
| 6 |
+
from typing import Any, Dict, List, Optional
|
| 7 |
|
| 8 |
import yaml
|
| 9 |
|
|
|
|
| 13 |
class WorkflowSimulator:
|
| 14 |
def validate(self, workflow: Optional[FileContent], files: Dict[str, FileContent]):
|
| 15 |
if workflow is None:
|
|
|
|
| 16 |
return {"parse_success": True, "execution_success": True}
|
| 17 |
|
| 18 |
content = workflow.content
|
| 19 |
+
|
| 20 |
+
# --- Single-brace expression check (${ } instead of ${{ }}) ---
|
| 21 |
+
# Match ${ ... } that is NOT ${{ ... }}
|
| 22 |
+
single_brace = re.findall(r'\$\{(?!\{)\s*[^}]+\}', content)
|
| 23 |
+
if single_brace:
|
| 24 |
+
return {
|
| 25 |
+
"parse_success": False,
|
| 26 |
+
"execution_success": False,
|
| 27 |
+
"error": (
|
| 28 |
+
"Unrecognized expression syntax. "
|
| 29 |
+
"Use ${{ expression }} with double braces for GitHub Actions expressions."
|
| 30 |
+
),
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
# --- YAML parse ---
|
| 34 |
try:
|
| 35 |
parsed = yaml.safe_load(content)
|
| 36 |
except yaml.YAMLError as exc:
|
|
|
|
| 47 |
"error": "Workflow root must be a mapping",
|
| 48 |
}
|
| 49 |
|
| 50 |
+
# --- Missing 'on' trigger ---
|
| 51 |
+
if "on" not in parsed and True not in parsed:
|
| 52 |
+
# yaml.safe_load converts `on:` to True key in some contexts
|
| 53 |
+
return {
|
| 54 |
+
"parse_success": False,
|
| 55 |
+
"execution_success": False,
|
| 56 |
+
"error": "Workflow must define an 'on' trigger event",
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
# --- Validate 'on' trigger structure ---
|
| 60 |
+
on_value = parsed.get("on") or parsed.get(True)
|
| 61 |
+
if isinstance(on_value, dict):
|
| 62 |
+
for event_key, event_config in on_value.items():
|
| 63 |
+
if isinstance(event_config, dict):
|
| 64 |
+
# Check branches is a list, not a bare string
|
| 65 |
+
branches_val = event_config.get("branches")
|
| 66 |
+
if isinstance(branches_val, str):
|
| 67 |
+
return {
|
| 68 |
+
"parse_success": False,
|
| 69 |
+
"execution_success": False,
|
| 70 |
+
"error": (
|
| 71 |
+
f"Unexpected value '{branches_val}' for 'on.{event_key}.branches'. "
|
| 72 |
+
"Expected a sequence (list) value."
|
| 73 |
+
),
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
# --- Jobs validation ---
|
| 77 |
jobs = parsed.get("jobs")
|
| 78 |
if not isinstance(jobs, dict) or not jobs:
|
| 79 |
return {
|
|
|
|
| 82 |
"error": "Workflow must define at least one job",
|
| 83 |
}
|
| 84 |
|
| 85 |
+
# Content-level flags for cross-cutting checks
|
| 86 |
has_buildx_setup = "docker/setup-buildx-action" in content
|
| 87 |
has_platforms = "platforms:" in content
|
| 88 |
has_docker_login = "docker login" in content
|
| 89 |
+
has_docker_push = "docker push" in content
|
| 90 |
has_username_secret = "secrets.DOCKER_USERNAME" in content
|
| 91 |
has_password_secret = "secrets.DOCKER_PASSWORD" in content
|
| 92 |
+
has_github_token_secret = "secrets.GITHUB_TOKEN" in content
|
| 93 |
|
| 94 |
+
# Collect job IDs for needs validation
|
| 95 |
+
job_ids = set(jobs.keys())
|
| 96 |
+
|
| 97 |
+
for job_name, job in jobs.items():
|
| 98 |
if not isinstance(job, dict):
|
| 99 |
continue
|
| 100 |
+
|
| 101 |
+
# --- Missing runs-on ---
|
| 102 |
+
if "runs-on" not in job:
|
| 103 |
+
return {
|
| 104 |
+
"parse_success": False,
|
| 105 |
+
"execution_success": False,
|
| 106 |
+
"error": f"Job '{job_name}' is missing required field 'runs-on'",
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
# --- Validate 'needs' references ---
|
| 110 |
+
needs = job.get("needs")
|
| 111 |
+
if needs:
|
| 112 |
+
needed = [needs] if isinstance(needs, str) else (needs if isinstance(needs, list) else [])
|
| 113 |
+
for dep in needed:
|
| 114 |
+
if dep not in job_ids:
|
| 115 |
+
return {
|
| 116 |
+
"parse_success": False,
|
| 117 |
+
"execution_success": False,
|
| 118 |
+
"error": f"Job '{job_name}' depends on unknown job '{dep}'",
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
steps = job.get("steps", [])
|
| 122 |
if not isinstance(steps, list):
|
| 123 |
return {
|
| 124 |
"parse_success": False,
|
| 125 |
"execution_success": False,
|
| 126 |
+
"error": f"Job '{job_name}' steps must be a list",
|
| 127 |
}
|
| 128 |
|
| 129 |
+
# --- Validate each step has 'uses' or 'run' ---
|
| 130 |
+
for step in steps:
|
| 131 |
+
if not isinstance(step, dict):
|
| 132 |
+
continue
|
| 133 |
+
has_uses = "uses" in step
|
| 134 |
+
has_run = "run" in step
|
| 135 |
+
if not has_uses and not has_run:
|
| 136 |
+
step_name = step.get("name", "unnamed")
|
| 137 |
+
return {
|
| 138 |
+
"parse_success": False,
|
| 139 |
+
"execution_success": False,
|
| 140 |
+
"error": f"Every step must define a 'uses' or 'run' key. Step '{step_name}' has neither.",
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
# --- Checkout before build order ---
|
| 144 |
checkout_index = -1
|
| 145 |
build_index = -1
|
| 146 |
for idx, step in enumerate(steps):
|
| 147 |
if not isinstance(step, dict):
|
| 148 |
continue
|
| 149 |
uses = step.get("uses", "")
|
| 150 |
+
run_cmd = step.get("run", "")
|
| 151 |
if isinstance(uses, str) and "actions/checkout" in uses:
|
| 152 |
checkout_index = idx
|
| 153 |
+
if (isinstance(run_cmd, str) and "docker build" in run_cmd) or (
|
| 154 |
isinstance(uses, str) and "docker/build-push-action" in uses
|
| 155 |
):
|
| 156 |
build_index = idx
|
|
|
|
| 162 |
"exec_error": "Checkout must happen before Docker build steps",
|
| 163 |
}
|
| 164 |
|
| 165 |
+
# --- Cross-job artifact dependency check ---
|
| 166 |
+
# If a job uses download-artifact but doesn't declare needs on the upload job
|
| 167 |
+
for job_name, job in jobs.items():
|
| 168 |
+
if not isinstance(job, dict):
|
| 169 |
+
continue
|
| 170 |
+
steps = job.get("steps", [])
|
| 171 |
+
if not isinstance(steps, list):
|
| 172 |
+
continue
|
| 173 |
+
uses_download = any(
|
| 174 |
+
isinstance(s, dict) and "actions/download-artifact" in str(s.get("uses", ""))
|
| 175 |
+
for s in steps
|
| 176 |
+
)
|
| 177 |
+
if uses_download:
|
| 178 |
+
needs = job.get("needs")
|
| 179 |
+
if not needs:
|
| 180 |
+
return {
|
| 181 |
+
"parse_success": True,
|
| 182 |
+
"execution_success": False,
|
| 183 |
+
"exec_error": (
|
| 184 |
+
f"Job '{job_name}' uses download-artifact but has no 'needs' dependency β "
|
| 185 |
+
"add 'needs' to ensure the upload job completes first"
|
| 186 |
+
),
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
# --- Docker login with secrets not wired via env ---
|
| 190 |
+
if has_docker_login:
|
| 191 |
+
# Check if the login step has env block with secrets
|
| 192 |
+
login_has_env_secrets = has_username_secret and has_password_secret
|
| 193 |
+
if not login_has_env_secrets:
|
| 194 |
+
# Check if login uses $DOCKER_USERNAME (env var) without secret mapping
|
| 195 |
+
if "$DOCKER_USERNAME" in content and not has_username_secret:
|
| 196 |
+
return {
|
| 197 |
+
"parse_success": True,
|
| 198 |
+
"execution_success": False,
|
| 199 |
+
"exec_error": "Docker login secrets not wired β add env block with secrets.DOCKER_USERNAME and secrets.DOCKER_PASSWORD",
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
# --- Push without login ---
|
| 203 |
+
if has_docker_push and not has_docker_login:
|
| 204 |
+
# Check if using docker/login-action instead
|
| 205 |
+
has_login_action = "docker/login-action" in content
|
| 206 |
+
if not has_login_action:
|
| 207 |
+
return {
|
| 208 |
+
"parse_success": True,
|
| 209 |
+
"execution_success": False,
|
| 210 |
+
"exec_error": "Docker push without login β add a docker login step before pushing",
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
# --- GHCR login with wrong credentials ---
|
| 214 |
+
if "docker login ghcr.io" in content:
|
| 215 |
+
if has_password_secret and not has_github_token_secret:
|
| 216 |
+
return {
|
| 217 |
+
"parse_success": True,
|
| 218 |
+
"execution_success": False,
|
| 219 |
+
"exec_error": "GHCR requires GITHUB_TOKEN for authentication, not DOCKER_PASSWORD",
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
# --- Missing permissions for GHCR push ---
|
| 223 |
+
if "ghcr.io" in content and "docker push" in content:
|
| 224 |
+
# Check if permissions block has packages: write
|
| 225 |
+
if "packages: write" not in content and "packages:write" not in content:
|
| 226 |
+
return {
|
| 227 |
+
"parse_success": True,
|
| 228 |
+
"execution_success": False,
|
| 229 |
+
"exec_error": "GITHUB_TOKEN does not have packages:write permission β add permissions block",
|
| 230 |
+
}
|
| 231 |
|
| 232 |
+
# --- Multi-platform without buildx ---
|
| 233 |
if has_platforms and not has_buildx_setup:
|
| 234 |
return {
|
| 235 |
"parse_success": True,
|
|
|
|
| 237 |
"exec_error": "Multi-platform build requires docker/setup-buildx-action",
|
| 238 |
}
|
| 239 |
|
| 240 |
+
# --- Cache export without buildx driver ---
|
| 241 |
+
if "cache-to:" in content and "cache-from:" in content:
|
| 242 |
+
# Check for mode=max
|
| 243 |
+
if "cache-to: type=gha" in content and "mode=max" not in content:
|
| 244 |
+
return {
|
| 245 |
+
"parse_success": True,
|
| 246 |
+
"execution_success": False,
|
| 247 |
+
"exec_error": "GHA cache export needs mode=max for proper cache support",
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
# --- Build context / Dockerfile path mismatch ---
|
| 251 |
+
for job_name, job in jobs.items():
|
| 252 |
+
if not isinstance(job, dict):
|
| 253 |
+
continue
|
| 254 |
+
for step in job.get("steps", []):
|
| 255 |
+
if not isinstance(step, dict):
|
| 256 |
+
continue
|
| 257 |
+
with_block = step.get("with", {})
|
| 258 |
+
if not isinstance(with_block, dict):
|
| 259 |
+
continue
|
| 260 |
+
context = with_block.get("context")
|
| 261 |
+
file_path = with_block.get("file")
|
| 262 |
+
if context and file_path and isinstance(context, str) and isinstance(file_path, str):
|
| 263 |
+
# If context is a subdirectory but file is at root
|
| 264 |
+
if context not in {".", "./"} and not file_path.startswith(context):
|
| 265 |
+
return {
|
| 266 |
+
"parse_success": True,
|
| 267 |
+
"execution_success": False,
|
| 268 |
+
"exec_error": f"Dockerfile path '{file_path}' does not match build context '{context}'",
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
# --- Secret referenced in run but not mapped via env block ---
|
| 272 |
+
for job_name, job in jobs.items():
|
| 273 |
+
if not isinstance(job, dict):
|
| 274 |
+
continue
|
| 275 |
+
for step in job.get("steps", []):
|
| 276 |
+
if not isinstance(step, dict):
|
| 277 |
+
continue
|
| 278 |
+
run_cmd = step.get("run", "")
|
| 279 |
+
if not isinstance(run_cmd, str):
|
| 280 |
+
continue
|
| 281 |
+
env_block = step.get("env", {})
|
| 282 |
+
if not isinstance(env_block, dict):
|
| 283 |
+
env_block = {}
|
| 284 |
+
# Find env vars used in run that look like they should come from secrets
|
| 285 |
+
env_var_refs = re.findall(r'\$([A-Z][A-Z0-9_]+)', run_cmd)
|
| 286 |
+
for var in env_var_refs:
|
| 287 |
+
# Skip GitHub expression vars (they're in ${{ }})
|
| 288 |
+
if var in ("GITHUB_SHA", "GITHUB_REF", "GITHUB_ACTOR", "GITHUB_REPOSITORY"):
|
| 289 |
+
continue
|
| 290 |
+
# Common secret-backed env vars
|
| 291 |
+
if var in ("SLACK_WEBHOOK_URL", "DEPLOY_TOKEN", "NPM_TOKEN", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
|
| 292 |
+
if var not in env_block:
|
| 293 |
+
return {
|
| 294 |
+
"parse_success": True,
|
| 295 |
+
"execution_success": False,
|
| 296 |
+
"exec_error": f"{var} is empty β secret not available in shell environment. Map it via env block.",
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
# --- Matrix: Node version incompatibility check ---
|
| 300 |
+
for job_name, job in jobs.items():
|
| 301 |
+
if not isinstance(job, dict):
|
| 302 |
+
continue
|
| 303 |
+
strategy = job.get("strategy", {})
|
| 304 |
+
if not isinstance(strategy, dict):
|
| 305 |
+
continue
|
| 306 |
+
matrix = strategy.get("matrix", {})
|
| 307 |
+
if not isinstance(matrix, dict):
|
| 308 |
+
continue
|
| 309 |
+
node_versions = matrix.get("node", [])
|
| 310 |
+
if isinstance(node_versions, list):
|
| 311 |
+
# Check package.json engines constraint
|
| 312 |
+
pkg = files.get("package.json")
|
| 313 |
+
if pkg:
|
| 314 |
+
engines_match = re.search(r'"node"\s*:\s*">=(\d+)"', pkg.content)
|
| 315 |
+
if engines_match:
|
| 316 |
+
min_version = int(engines_match.group(1))
|
| 317 |
+
for v in node_versions:
|
| 318 |
+
if isinstance(v, int) and v < min_version:
|
| 319 |
+
return {
|
| 320 |
+
"parse_success": True,
|
| 321 |
+
"execution_success": False,
|
| 322 |
+
"exec_error": f"Matrix job (node: {v}) failed: package.json requires Node >= {min_version}",
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
return {"parse_success": True, "execution_success": True}
|
server/tasks/base.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
| 5 |
from typing import Dict, Optional
|
| 6 |
|
| 7 |
from server.models import TaskDifficulty
|
|
@@ -11,8 +12,15 @@ class BaseTask:
|
|
| 11 |
NAME = "Base Task"
|
| 12 |
DESCRIPTION = "Base task"
|
| 13 |
DIFFICULTY = TaskDifficulty.EASY
|
| 14 |
-
AVAILABLE_SECRETS = []
|
| 15 |
-
SCENARIOS = []
|
| 16 |
|
| 17 |
def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
import random
|
| 6 |
from typing import Dict, Optional
|
| 7 |
|
| 8 |
from server.models import TaskDifficulty
|
|
|
|
| 12 |
NAME = "Base Task"
|
| 13 |
DESCRIPTION = "Base task"
|
| 14 |
DIFFICULTY = TaskDifficulty.EASY
|
| 15 |
+
AVAILABLE_SECRETS: list = []
|
| 16 |
+
SCENARIOS: list = []
|
| 17 |
|
| 18 |
def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
    """Return one scenario definition from ``self.SCENARIOS``.

    Args:
        scenario_id: Value of the ``"id"`` key of the scenario to fetch.
            Any falsy value (``None``, ``""``) means "pick one at random".

    Returns:
        The matching scenario dict, or one chosen by ``random.choice``
        when no id is given. The object stored in ``SCENARIOS`` is
        returned as-is, not a copy.

    Raises:
        ValueError: If ``SCENARIOS`` is empty, or if ``scenario_id`` is
            given but no scenario carries that id.
    """
    available = self.SCENARIOS
    if not available:
        raise ValueError(f"Task {self.__class__.__name__} has no scenarios defined")

    # No explicit id requested: any scenario will do.
    if not scenario_id:
        return random.choice(available)

    # First scenario whose id matches wins; None signals "not found".
    match = next(
        (entry for entry in available if entry["id"] == scenario_id),
        None,
    )
    if match is None:
        raise ValueError(f"Unknown scenario: {scenario_id}")
    return match
|
server/tasks/task_1_build_errors.py
CHANGED
|
@@ -1,7 +1,11 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
import
|
| 4 |
-
from typing import Dict, Optional
|
| 5 |
|
| 6 |
from server.models import TaskDifficulty
|
| 7 |
from server.tasks.base import BaseTask
|
|
@@ -12,20 +16,35 @@ class DockerfileSyntaxTask(BaseTask):
|
|
| 12 |
DESCRIPTION = "Fix syntax and instruction errors in Dockerfiles"
|
| 13 |
DIFFICULTY = TaskDifficulty.EASY
|
| 14 |
AVAILABLE_SECRETS = []
|
|
|
|
| 15 |
SCENARIOS = [
|
|
|
|
| 16 |
{
|
| 17 |
"id": "typo_filename",
|
| 18 |
"files": [
|
| 19 |
{
|
| 20 |
"path": "Dockerfile",
|
| 21 |
"type": "dockerfile",
|
| 22 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
-
{"path": "requirements.txt", "type": "requirements", "content": "requests==2.31.0"},
|
| 25 |
],
|
| 26 |
"error": {
|
| 27 |
"phase": "docker_build",
|
| 28 |
"message": "COPY failed: file not found in build context: requirments.txt",
|
|
|
|
|
|
|
| 29 |
"line_hint": 3,
|
| 30 |
},
|
| 31 |
"expected_fixes": [
|
|
@@ -33,16 +52,166 @@ class DockerfileSyntaxTask(BaseTask):
|
|
| 33 |
"file": "Dockerfile",
|
| 34 |
"type": "contains",
|
| 35 |
"expected": "COPY requirements.txt",
|
| 36 |
-
"hint": "Check spelling of requirements filename",
|
| 37 |
}
|
| 38 |
],
|
| 39 |
-
}
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task 1: Dockerfile Syntax Errors — EASY.
|
| 2 |
+
|
| 3 |
+
Agent fixes common Dockerfile instruction/syntax mistakes:
|
| 4 |
+
typos in filenames, invalid base image tags, bad RUN syntax,
|
| 5 |
+
quoted EXPOSE values, missing FROM instruction.
|
| 6 |
+
"""
|
| 7 |
|
| 8 |
+
from __future__ import annotations
|
|
|
|
| 9 |
|
| 10 |
from server.models import TaskDifficulty
|
| 11 |
from server.tasks.base import BaseTask
|
|
|
|
| 16 |
DESCRIPTION = "Fix syntax and instruction errors in Dockerfiles"
|
| 17 |
DIFFICULTY = TaskDifficulty.EASY
|
| 18 |
AVAILABLE_SECRETS = []
|
| 19 |
+
|
| 20 |
SCENARIOS = [
|
| 21 |
+
# Scenario 1: Typo in requirements filename
|
| 22 |
{
|
| 23 |
"id": "typo_filename",
|
| 24 |
"files": [
|
| 25 |
{
|
| 26 |
"path": "Dockerfile",
|
| 27 |
"type": "dockerfile",
|
| 28 |
+
"content": (
|
| 29 |
+
"FROM python:3.9-slim\n"
|
| 30 |
+
"WORKDIR /app\n"
|
| 31 |
+
"COPY requirments.txt .\n"
|
| 32 |
+
"RUN pip install --no-cache-dir -r requirements.txt\n"
|
| 33 |
+
"COPY . .\n"
|
| 34 |
+
'CMD ["python", "app.py"]'
|
| 35 |
+
),
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"path": "requirements.txt",
|
| 39 |
+
"type": "requirements",
|
| 40 |
+
"content": "flask==2.0.0\nrequests==2.28.0",
|
| 41 |
},
|
|
|
|
| 42 |
],
|
| 43 |
"error": {
|
| 44 |
"phase": "docker_build",
|
| 45 |
"message": "COPY failed: file not found in build context: requirments.txt",
|
| 46 |
+
"exit_code": 1,
|
| 47 |
+
"failed_step": "COPY requirments.txt .",
|
| 48 |
"line_hint": 3,
|
| 49 |
},
|
| 50 |
"expected_fixes": [
|
|
|
|
| 52 |
"file": "Dockerfile",
|
| 53 |
"type": "contains",
|
| 54 |
"expected": "COPY requirements.txt",
|
| 55 |
+
"hint": "Check spelling of the requirements filename β 'requirments' vs 'requirements'",
|
| 56 |
}
|
| 57 |
],
|
| 58 |
+
},
|
| 59 |
+
|
| 60 |
+
# Scenario 2: Wrong base image tag (extra 'm')
|
| 61 |
+
{
|
| 62 |
+
"id": "invalid_base_image",
|
| 63 |
+
"files": [
|
| 64 |
+
{
|
| 65 |
+
"path": "Dockerfile",
|
| 66 |
+
"type": "dockerfile",
|
| 67 |
+
"content": (
|
| 68 |
+
"FROM python:3.9-slimm\n"
|
| 69 |
+
"WORKDIR /app\n"
|
| 70 |
+
"COPY requirements.txt .\n"
|
| 71 |
+
"RUN pip install -r requirements.txt\n"
|
| 72 |
+
"COPY . .\n"
|
| 73 |
+
"EXPOSE 8000\n"
|
| 74 |
+
'CMD ["python", "app.py"]'
|
| 75 |
+
),
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"path": "requirements.txt",
|
| 79 |
+
"type": "requirements",
|
| 80 |
+
"content": "flask==2.0.0",
|
| 81 |
+
},
|
| 82 |
+
],
|
| 83 |
+
"error": {
|
| 84 |
+
"phase": "docker_build",
|
| 85 |
+
"message": (
|
| 86 |
+
"pull access denied for python:3.9-slimm, "
|
| 87 |
+
"repository does not exist or may require 'docker login'"
|
| 88 |
+
),
|
| 89 |
+
"exit_code": 1,
|
| 90 |
+
"failed_step": "FROM python:3.9-slimm",
|
| 91 |
+
"line_hint": 1,
|
| 92 |
+
},
|
| 93 |
+
"expected_fixes": [
|
| 94 |
+
{
|
| 95 |
+
"file": "Dockerfile",
|
| 96 |
+
"type": "not_contains",
|
| 97 |
+
"expected": "FROM python:3.9-slimm",
|
| 98 |
+
"hint": "The base image tag is 'slim', not 'slimm' β remove the extra 'm'",
|
| 99 |
+
}
|
| 100 |
+
],
|
| 101 |
+
},
|
| 102 |
+
|
| 103 |
+
# Scenario 3: && operator on its own line (invalid Dockerfile instruction)
|
| 104 |
+
{
|
| 105 |
+
"id": "invalid_run_syntax",
|
| 106 |
+
"files": [
|
| 107 |
+
{
|
| 108 |
+
"path": "Dockerfile",
|
| 109 |
+
"type": "dockerfile",
|
| 110 |
+
"content": (
|
| 111 |
+
"FROM python:3.9\n"
|
| 112 |
+
"WORKDIR /app\n"
|
| 113 |
+
"COPY . .\n"
|
| 114 |
+
"RUN pip install -r requirements.txt\n"
|
| 115 |
+
" && python setup.py install\n"
|
| 116 |
+
'CMD ["python", "main.py"]'
|
| 117 |
+
),
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"path": "requirements.txt",
|
| 121 |
+
"type": "requirements",
|
| 122 |
+
"content": "numpy==1.21.0",
|
| 123 |
+
},
|
| 124 |
+
],
|
| 125 |
+
"error": {
|
| 126 |
+
"phase": "docker_build",
|
| 127 |
+
"message": "Dockerfile parse error: unknown instruction: &&",
|
| 128 |
+
"exit_code": 1,
|
| 129 |
+
"line_hint": 5,
|
| 130 |
+
},
|
| 131 |
+
"expected_fixes": [
|
| 132 |
+
{
|
| 133 |
+
"file": "Dockerfile",
|
| 134 |
+
"type": "contains",
|
| 135 |
+
"expected": "RUN pip install -r requirements.txt && python setup.py install",
|
| 136 |
+
"hint": (
|
| 137 |
+
"Multi-line RUN commands must use backslash continuation "
|
| 138 |
+
"(RUN cmd1 \\\\\\n && cmd2) or be written on one line"
|
| 139 |
+
),
|
| 140 |
+
}
|
| 141 |
+
],
|
| 142 |
+
},
|
| 143 |
|
| 144 |
+
# Scenario 4: EXPOSE with a quoted string instead of a number
|
| 145 |
+
{
|
| 146 |
+
"id": "invalid_expose",
|
| 147 |
+
"files": [
|
| 148 |
+
{
|
| 149 |
+
"path": "Dockerfile",
|
| 150 |
+
"type": "dockerfile",
|
| 151 |
+
"content": (
|
| 152 |
+
"FROM nginx:alpine\n"
|
| 153 |
+
"COPY nginx.conf /etc/nginx/nginx.conf\n"
|
| 154 |
+
"COPY html /usr/share/nginx/html\n"
|
| 155 |
+
'EXPOSE "eighty"\n'
|
| 156 |
+
'CMD ["nginx", "-g", "daemon off;"]'
|
| 157 |
+
),
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"path": "nginx.conf",
|
| 161 |
+
"type": "other",
|
| 162 |
+
"content": "events {}",
|
| 163 |
+
},
|
| 164 |
+
],
|
| 165 |
+
"error": {
|
| 166 |
+
"phase": "docker_build",
|
| 167 |
+
"message": "EXPOSE requires numeric port or port/protocol",
|
| 168 |
+
"exit_code": 1,
|
| 169 |
+
"line_hint": 4,
|
| 170 |
+
},
|
| 171 |
+
"expected_fixes": [
|
| 172 |
+
{
|
| 173 |
+
"file": "Dockerfile",
|
| 174 |
+
"type": "contains",
|
| 175 |
+
"expected": "EXPOSE 80",
|
| 176 |
+
"hint": "EXPOSE must use a numeric port value, not a quoted string",
|
| 177 |
+
}
|
| 178 |
+
],
|
| 179 |
+
},
|
| 180 |
+
|
| 181 |
+
# Scenario 5: Missing FROM instruction — Dockerfile starts with WORKDIR
|
| 182 |
+
{
|
| 183 |
+
"id": "missing_from_instruction",
|
| 184 |
+
"files": [
|
| 185 |
+
{
|
| 186 |
+
"path": "Dockerfile",
|
| 187 |
+
"type": "dockerfile",
|
| 188 |
+
"content": (
|
| 189 |
+
"WORKDIR /app\n"
|
| 190 |
+
"COPY requirements.txt .\n"
|
| 191 |
+
"RUN pip install -r requirements.txt\n"
|
| 192 |
+
"COPY . .\n"
|
| 193 |
+
'CMD ["python", "app.py"]'
|
| 194 |
+
),
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"path": "requirements.txt",
|
| 198 |
+
"type": "requirements",
|
| 199 |
+
"content": "flask==2.0.0",
|
| 200 |
+
},
|
| 201 |
+
],
|
| 202 |
+
"error": {
|
| 203 |
+
"phase": "docker_build",
|
| 204 |
+
"message": "Dockerfile parse error: FROM is required as the first instruction",
|
| 205 |
+
"exit_code": 1,
|
| 206 |
+
"line_hint": 1,
|
| 207 |
+
},
|
| 208 |
+
"expected_fixes": [
|
| 209 |
+
{
|
| 210 |
+
"file": "Dockerfile",
|
| 211 |
+
"type": "contains",
|
| 212 |
+
"expected": "FROM python:",
|
| 213 |
+
"hint": "Every Dockerfile must start with a FROM instruction",
|
| 214 |
+
}
|
| 215 |
+
],
|
| 216 |
+
},
|
| 217 |
+
]
|
server/tasks/task_2_docker_runtime.py
CHANGED
|
@@ -1,7 +1,11 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
import
|
| 4 |
-
from typing import Dict, Optional
|
| 5 |
|
| 6 |
from server.models import TaskDifficulty
|
| 7 |
from server.tasks.base import BaseTask
|
|
@@ -12,36 +16,212 @@ class DockerfileRuntimeTask(BaseTask):
|
|
| 12 |
DESCRIPTION = "Fix runtime/container execution issues in Dockerfiles"
|
| 13 |
DIFFICULTY = TaskDifficulty.MEDIUM
|
| 14 |
AVAILABLE_SECRETS = []
|
|
|
|
| 15 |
SCENARIOS = [
|
|
|
|
| 16 |
{
|
| 17 |
"id": "missing_workdir",
|
| 18 |
"files": [
|
| 19 |
{
|
| 20 |
"path": "Dockerfile",
|
| 21 |
"type": "dockerfile",
|
| 22 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
-
{"path": "package.json", "type": "other", "content": '{"name": "app", "scripts": {"start": "node index.js"}}'},
|
| 25 |
],
|
| 26 |
"error": {
|
| 27 |
"phase": "docker_run",
|
| 28 |
"message": "Error: Cannot find module '/package.json'",
|
|
|
|
|
|
|
| 29 |
},
|
| 30 |
"expected_fixes": [
|
| 31 |
{
|
| 32 |
"file": "Dockerfile",
|
| 33 |
"type": "contains",
|
| 34 |
"expected": "WORKDIR /app",
|
| 35 |
-
"hint": "Set a working directory before COPY/RUN",
|
| 36 |
}
|
| 37 |
],
|
| 38 |
-
}
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task 2: Dockerfile Runtime Errors — MEDIUM.
|
| 2 |
+
|
| 3 |
+
Agent fixes Dockerfiles that build successfully but fail at container
|
| 4 |
+
runtime: missing WORKDIR, CMD/ENTRYPOINT conflicts, permission issues,
|
| 5 |
+
and missing environment variables.
|
| 6 |
+
"""
|
| 7 |
|
| 8 |
+
from __future__ import annotations
|
|
|
|
| 9 |
|
| 10 |
from server.models import TaskDifficulty
|
| 11 |
from server.tasks.base import BaseTask
|
|
|
|
| 16 |
DESCRIPTION = "Fix runtime/container execution issues in Dockerfiles"
|
| 17 |
DIFFICULTY = TaskDifficulty.MEDIUM
|
| 18 |
AVAILABLE_SECRETS = []
|
| 19 |
+
|
| 20 |
SCENARIOS = [
|
| 21 |
+
# Scenario 1: Missing WORKDIR — node module resolution fails at runtime
|
| 22 |
{
|
| 23 |
"id": "missing_workdir",
|
| 24 |
"files": [
|
| 25 |
{
|
| 26 |
"path": "Dockerfile",
|
| 27 |
"type": "dockerfile",
|
| 28 |
+
"content": (
|
| 29 |
+
"FROM node:18-alpine\n"
|
| 30 |
+
"COPY package*.json ./\n"
|
| 31 |
+
"RUN npm ci\n"
|
| 32 |
+
"COPY . .\n"
|
| 33 |
+
'CMD ["npm", "start"]'
|
| 34 |
+
),
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"path": "package.json",
|
| 38 |
+
"type": "other",
|
| 39 |
+
"content": '{"name": "app", "scripts": {"start": "node index.js"}}',
|
| 40 |
},
|
|
|
|
| 41 |
],
|
| 42 |
"error": {
|
| 43 |
"phase": "docker_run",
|
| 44 |
"message": "Error: Cannot find module '/package.json'",
|
| 45 |
+
"exit_code": 1,
|
| 46 |
+
"failed_step": "npm start",
|
| 47 |
},
|
| 48 |
"expected_fixes": [
|
| 49 |
{
|
| 50 |
"file": "Dockerfile",
|
| 51 |
"type": "contains",
|
| 52 |
"expected": "WORKDIR /app",
|
| 53 |
+
"hint": "Set a working directory before COPY/RUN so files land in /app, not /",
|
| 54 |
}
|
| 55 |
],
|
| 56 |
+
},
|
| 57 |
+
|
| 58 |
+
# Scenario 2: CMD and ENTRYPOINT both defined as full exec forms — conflict
|
| 59 |
+
{
|
| 60 |
+
"id": "cmd_entrypoint_conflict",
|
| 61 |
+
"files": [
|
| 62 |
+
{
|
| 63 |
+
"path": "Dockerfile",
|
| 64 |
+
"type": "dockerfile",
|
| 65 |
+
"content": (
|
| 66 |
+
"FROM python:3.11-slim\n"
|
| 67 |
+
"WORKDIR /app\n"
|
| 68 |
+
"COPY . .\n"
|
| 69 |
+
"RUN pip install -r requirements.txt\n"
|
| 70 |
+
'ENTRYPOINT ["python", "server.py"]\n'
|
| 71 |
+
'CMD ["python", "server.py"]'
|
| 72 |
+
),
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"path": "requirements.txt",
|
| 76 |
+
"type": "requirements",
|
| 77 |
+
"content": "flask==2.3.0",
|
| 78 |
+
},
|
| 79 |
+
],
|
| 80 |
+
"error": {
|
| 81 |
+
"phase": "docker_run",
|
| 82 |
+
"message": (
|
| 83 |
+
"container exits immediately; process started twice β "
|
| 84 |
+
"ENTRYPOINT and CMD both specify the full command"
|
| 85 |
+
),
|
| 86 |
+
"exit_code": 1,
|
| 87 |
+
"failed_step": "container start",
|
| 88 |
+
},
|
| 89 |
+
"expected_fixes": [
|
| 90 |
+
{
|
| 91 |
+
"file": "Dockerfile",
|
| 92 |
+
"type": "not_contains",
|
| 93 |
+
"expected": 'CMD ["python", "server.py"]',
|
| 94 |
+
"hint": (
|
| 95 |
+
"When using ENTRYPOINT as a full command, CMD should provide "
|
| 96 |
+
"default arguments only, or be removed entirely"
|
| 97 |
+
),
|
| 98 |
+
}
|
| 99 |
+
],
|
| 100 |
+
},
|
| 101 |
+
|
| 102 |
+
# Scenario 3: Entrypoint script not executable
|
| 103 |
+
{
|
| 104 |
+
"id": "entrypoint_not_executable",
|
| 105 |
+
"files": [
|
| 106 |
+
{
|
| 107 |
+
"path": "Dockerfile",
|
| 108 |
+
"type": "dockerfile",
|
| 109 |
+
"content": (
|
| 110 |
+
"FROM python:3.11-slim\n"
|
| 111 |
+
"WORKDIR /app\n"
|
| 112 |
+
"COPY . .\n"
|
| 113 |
+
"RUN pip install -r requirements.txt\n"
|
| 114 |
+
'ENTRYPOINT ["./start.sh"]'
|
| 115 |
+
),
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"path": "requirements.txt",
|
| 119 |
+
"type": "requirements",
|
| 120 |
+
"content": "flask==2.3.0",
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"path": "start.sh",
|
| 124 |
+
"type": "other",
|
| 125 |
+
"content": "#!/bin/bash\npython app.py",
|
| 126 |
+
},
|
| 127 |
+
],
|
| 128 |
+
"error": {
|
| 129 |
+
"phase": "docker_run",
|
| 130 |
+
"message": "exec ./start.sh: permission denied",
|
| 131 |
+
"exit_code": 126,
|
| 132 |
+
"failed_step": "ENTRYPOINT ./start.sh",
|
| 133 |
+
},
|
| 134 |
+
"expected_fixes": [
|
| 135 |
+
{
|
| 136 |
+
"file": "Dockerfile",
|
| 137 |
+
"type": "contains",
|
| 138 |
+
"expected": "RUN chmod +x ./start.sh",
|
| 139 |
+
"hint": "The entrypoint script must be made executable with chmod +x before the ENTRYPOINT instruction",
|
| 140 |
+
}
|
| 141 |
+
],
|
| 142 |
+
},
|
| 143 |
+
|
| 144 |
+
# Scenario 4: App crashes because a required ENV variable is missing
|
| 145 |
+
{
|
| 146 |
+
"id": "missing_required_env",
|
| 147 |
+
"files": [
|
| 148 |
+
{
|
| 149 |
+
"path": "Dockerfile",
|
| 150 |
+
"type": "dockerfile",
|
| 151 |
+
"content": (
|
| 152 |
+
"FROM python:3.11-slim\n"
|
| 153 |
+
"WORKDIR /app\n"
|
| 154 |
+
"COPY . .\n"
|
| 155 |
+
"RUN pip install -r requirements.txt\n"
|
| 156 |
+
"EXPOSE 8080\n"
|
| 157 |
+
'CMD ["python", "app.py"]'
|
| 158 |
+
),
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"path": "requirements.txt",
|
| 162 |
+
"type": "requirements",
|
| 163 |
+
"content": "flask==2.3.0\ngunicorn==21.2.0",
|
| 164 |
+
},
|
| 165 |
+
],
|
| 166 |
+
"error": {
|
| 167 |
+
"phase": "docker_run",
|
| 168 |
+
"message": (
|
| 169 |
+
"KeyError: 'DATABASE_URL'\n"
|
| 170 |
+
"Application requires DATABASE_URL environment variable to be set"
|
| 171 |
+
),
|
| 172 |
+
"exit_code": 1,
|
| 173 |
+
"failed_step": "python app.py",
|
| 174 |
+
},
|
| 175 |
+
"expected_fixes": [
|
| 176 |
+
{
|
| 177 |
+
"file": "Dockerfile",
|
| 178 |
+
"type": "contains",
|
| 179 |
+
"expected": "ENV DATABASE_URL",
|
| 180 |
+
"hint": "Add an ENV instruction to set DATABASE_URL (use a default or placeholder value)",
|
| 181 |
+
}
|
| 182 |
+
],
|
| 183 |
+
},
|
| 184 |
|
| 185 |
+
# Scenario 5: Non-root user can't bind to privileged port
|
| 186 |
+
{
|
| 187 |
+
"id": "non_root_privileged_port",
|
| 188 |
+
"files": [
|
| 189 |
+
{
|
| 190 |
+
"path": "Dockerfile",
|
| 191 |
+
"type": "dockerfile",
|
| 192 |
+
"content": (
|
| 193 |
+
"FROM python:3.11-slim\n"
|
| 194 |
+
"WORKDIR /app\n"
|
| 195 |
+
"COPY . .\n"
|
| 196 |
+
"RUN pip install -r requirements.txt\n"
|
| 197 |
+
"RUN useradd --create-home appuser\n"
|
| 198 |
+
"USER appuser\n"
|
| 199 |
+
"EXPOSE 80\n"
|
| 200 |
+
'CMD ["python", "app.py"]'
|
| 201 |
+
),
|
| 202 |
+
},
|
| 203 |
+
{
|
| 204 |
+
"path": "requirements.txt",
|
| 205 |
+
"type": "requirements",
|
| 206 |
+
"content": "flask==2.3.0",
|
| 207 |
+
},
|
| 208 |
+
],
|
| 209 |
+
"error": {
|
| 210 |
+
"phase": "docker_run",
|
| 211 |
+
"message": (
|
| 212 |
+
"PermissionError: [Errno 13] Permission denied β "
|
| 213 |
+
"non-root user cannot bind to port 80"
|
| 214 |
+
),
|
| 215 |
+
"exit_code": 1,
|
| 216 |
+
"failed_step": "python app.py",
|
| 217 |
+
},
|
| 218 |
+
"expected_fixes": [
|
| 219 |
+
{
|
| 220 |
+
"file": "Dockerfile",
|
| 221 |
+
"type": "contains",
|
| 222 |
+
"expected": "EXPOSE 8080",
|
| 223 |
+
"hint": "Non-root users cannot bind to ports below 1024 β use a higher port like 8080",
|
| 224 |
+
}
|
| 225 |
+
],
|
| 226 |
+
},
|
| 227 |
+
]
|
server/tasks/task_2_workflow_config.py
DELETED
|
@@ -1,52 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import random
|
| 4 |
-
from typing import Dict, Optional
|
| 5 |
-
|
| 6 |
-
from server.models import TaskDifficulty
|
| 7 |
-
from server.tasks.base import BaseTask
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class WorkflowConfigTask(BaseTask):
|
| 11 |
-
NAME = "Workflow Secrets and Permissions"
|
| 12 |
-
DESCRIPTION = "Fix secret wiring, env usage, and permissions in workflows"
|
| 13 |
-
DIFFICULTY = TaskDifficulty.MEDIUM
|
| 14 |
-
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN"]
|
| 15 |
-
SCENARIOS = [
|
| 16 |
-
{
|
| 17 |
-
"id": "missing_env_secrets",
|
| 18 |
-
"files": [
|
| 19 |
-
{
|
| 20 |
-
"path": ".github/workflows/build.yml",
|
| 21 |
-
"type": "workflow",
|
| 22 |
-
"content": "name: Build\non: push\njobs:\n build:\n runs-on: ubuntu-latest\n steps:\n - uses: actions/checkout@v4\n - name: Login\n run: echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin",
|
| 23 |
-
}
|
| 24 |
-
],
|
| 25 |
-
"error": {
|
| 26 |
-
"phase": "workflow_parse",
|
| 27 |
-
"message": "Cannot perform an interactive login from a non TTY device",
|
| 28 |
-
},
|
| 29 |
-
"expected_fixes": [
|
| 30 |
-
{
|
| 31 |
-
"file": ".github/workflows/build.yml",
|
| 32 |
-
"type": "contains",
|
| 33 |
-
"expected": "DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}",
|
| 34 |
-
"hint": "Pass secrets through env",
|
| 35 |
-
},
|
| 36 |
-
{
|
| 37 |
-
"file": ".github/workflows/build.yml",
|
| 38 |
-
"type": "contains",
|
| 39 |
-
"expected": "DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}",
|
| 40 |
-
"hint": "Map password secret to environment",
|
| 41 |
-
}
|
| 42 |
-
],
|
| 43 |
-
}
|
| 44 |
-
]
|
| 45 |
-
|
| 46 |
-
def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
|
| 47 |
-
if scenario_id:
|
| 48 |
-
for scenario in self.SCENARIOS:
|
| 49 |
-
if scenario["id"] == scenario_id:
|
| 50 |
-
return scenario
|
| 51 |
-
raise ValueError(f"Unknown scenario: {scenario_id}")
|
| 52 |
-
return random.choice(self.SCENARIOS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/tasks/task_3_multi_stage.py
DELETED
|
@@ -1,44 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import random
|
| 4 |
-
from typing import Dict, Optional
|
| 5 |
-
|
| 6 |
-
from server.models import TaskDifficulty
|
| 7 |
-
from server.tasks.base import BaseTask
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class MultiStagePipelineTask(BaseTask):
|
| 11 |
-
NAME = "Multi-Stage Pipeline and Matrix"
|
| 12 |
-
DESCRIPTION = "Debug complex multi-stage and matrix CI/CD pipelines"
|
| 13 |
-
DIFFICULTY = TaskDifficulty.HARD
|
| 14 |
-
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN", "NPM_TOKEN"]
|
| 15 |
-
SCENARIOS = [
|
| 16 |
-
{
|
| 17 |
-
"id": "artifact_path_mismatch",
|
| 18 |
-
"files": [
|
| 19 |
-
{
|
| 20 |
-
"path": "Dockerfile",
|
| 21 |
-
"type": "dockerfile",
|
| 22 |
-
"content": "FROM node:18 AS builder\nWORKDIR /app\nCOPY . .\nRUN npm run build\nFROM nginx:alpine\nCOPY --from=builder /app/dist /usr/share/nginx/html",
|
| 23 |
-
},
|
| 24 |
-
{"path": "package.json", "type": "other", "content": '{"scripts": {"build": "react-scripts build"}}'},
|
| 25 |
-
],
|
| 26 |
-
"error": {"phase": "docker_build", "message": "COPY failed: stat app/dist: file does not exist"},
|
| 27 |
-
"expected_fixes": [
|
| 28 |
-
{
|
| 29 |
-
"file": "Dockerfile",
|
| 30 |
-
"type": "contains",
|
| 31 |
-
"expected": "COPY --from=builder /app/build",
|
| 32 |
-
"hint": "React output path is build, not dist",
|
| 33 |
-
}
|
| 34 |
-
],
|
| 35 |
-
}
|
| 36 |
-
]
|
| 37 |
-
|
| 38 |
-
def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
|
| 39 |
-
if scenario_id:
|
| 40 |
-
for scenario in self.SCENARIOS:
|
| 41 |
-
if scenario["id"] == scenario_id:
|
| 42 |
-
return scenario
|
| 43 |
-
raise ValueError(f"Unknown scenario: {scenario_id}")
|
| 44 |
-
return random.choice(self.SCENARIOS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/tasks/task_3_workflow_syntax.py
CHANGED
|
@@ -1,7 +1,11 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
import
|
| 4 |
-
from typing import Dict, Optional
|
| 5 |
|
| 6 |
from server.models import TaskDifficulty
|
| 7 |
from server.tasks.base import BaseTask
|
|
@@ -12,36 +16,206 @@ class WorkflowSyntaxStructureTask(BaseTask):
|
|
| 12 |
DESCRIPTION = "Fix GitHub Actions YAML syntax and job structure issues"
|
| 13 |
DIFFICULTY = TaskDifficulty.EASY
|
| 14 |
AVAILABLE_SECRETS = ["GITHUB_TOKEN"]
|
|
|
|
| 15 |
SCENARIOS = [
|
|
|
|
| 16 |
{
|
| 17 |
"id": "checkout_after_build",
|
| 18 |
"files": [
|
| 19 |
{
|
| 20 |
"path": ".github/workflows/build.yml",
|
| 21 |
"type": "workflow",
|
| 22 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
-
{"path": "Dockerfile", "type": "dockerfile", "content": "FROM python:3.11-slim\nWORKDIR /app\nCOPY . ."},
|
| 25 |
],
|
| 26 |
"error": {
|
| 27 |
"phase": "workflow_parse",
|
| 28 |
-
"message":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
},
|
| 30 |
"expected_fixes": [
|
| 31 |
{
|
| 32 |
"file": ".github/workflows/build.yml",
|
| 33 |
"type": "contains",
|
| 34 |
"expected": "- uses: actions/checkout@v4",
|
| 35 |
-
"hint": "Checkout
|
| 36 |
}
|
| 37 |
],
|
| 38 |
-
}
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task 3: Workflow Syntax and Structure β EASY.
|
| 2 |
+
|
| 3 |
+
Agent fixes GitHub Actions YAML syntax and job structure issues:
|
| 4 |
+
step ordering, missing runs-on, invalid triggers, a step lacking 'uses'/'run',
|
| 5 |
+
and missing 'on' trigger.
|
| 6 |
+
"""
|
| 7 |
|
| 8 |
+
from __future__ import annotations
|
|
|
|
| 9 |
|
| 10 |
from server.models import TaskDifficulty
|
| 11 |
from server.tasks.base import BaseTask
|
|
|
|
| 16 |
DESCRIPTION = "Fix GitHub Actions YAML syntax and job structure issues"
|
| 17 |
DIFFICULTY = TaskDifficulty.EASY
|
| 18 |
AVAILABLE_SECRETS = ["GITHUB_TOKEN"]
|
| 19 |
+
|
| 20 |
SCENARIOS = [
|
| 21 |
+
# Scenario 1: Checkout happens after build (wrong step order)
|
| 22 |
{
|
| 23 |
"id": "checkout_after_build",
|
| 24 |
"files": [
|
| 25 |
{
|
| 26 |
"path": ".github/workflows/build.yml",
|
| 27 |
"type": "workflow",
|
| 28 |
+
"content": (
|
| 29 |
+
"name: Build\n"
|
| 30 |
+
"on: push\n"
|
| 31 |
+
"\n"
|
| 32 |
+
"jobs:\n"
|
| 33 |
+
" build:\n"
|
| 34 |
+
" runs-on: ubuntu-latest\n"
|
| 35 |
+
" steps:\n"
|
| 36 |
+
" - name: Build Docker image\n"
|
| 37 |
+
" run: docker build -t myapp .\n"
|
| 38 |
+
" - uses: actions/checkout@v4\n"
|
| 39 |
+
" - name: Run tests\n"
|
| 40 |
+
" run: docker run myapp pytest"
|
| 41 |
+
),
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"path": "Dockerfile",
|
| 45 |
+
"type": "dockerfile",
|
| 46 |
+
"content": (
|
| 47 |
+
"FROM python:3.11-slim\n"
|
| 48 |
+
"WORKDIR /app\n"
|
| 49 |
+
"COPY . .\n"
|
| 50 |
+
'CMD ["python", "app.py"]'
|
| 51 |
+
),
|
| 52 |
},
|
|
|
|
| 53 |
],
|
| 54 |
"error": {
|
| 55 |
"phase": "workflow_parse",
|
| 56 |
+
"message": (
|
| 57 |
+
"unable to prepare context: unable to evaluate symlinks "
|
| 58 |
+
"in Dockerfile path: lstat /home/runner/work/repo/repo/Dockerfile: "
|
| 59 |
+
"no such file or directory"
|
| 60 |
+
),
|
| 61 |
+
"exit_code": 1,
|
| 62 |
+
"failed_step": "Build Docker image",
|
| 63 |
},
|
| 64 |
"expected_fixes": [
|
| 65 |
{
|
| 66 |
"file": ".github/workflows/build.yml",
|
| 67 |
"type": "contains",
|
| 68 |
"expected": "- uses: actions/checkout@v4",
|
| 69 |
+
"hint": "Checkout must happen before any build commands",
|
| 70 |
}
|
| 71 |
],
|
| 72 |
+
},
|
| 73 |
+
|
| 74 |
+
# Scenario 2: Missing runs-on field in job
|
| 75 |
+
{
|
| 76 |
+
"id": "missing_runs_on",
|
| 77 |
+
"files": [
|
| 78 |
+
{
|
| 79 |
+
"path": ".github/workflows/ci.yml",
|
| 80 |
+
"type": "workflow",
|
| 81 |
+
"content": (
|
| 82 |
+
"name: CI Pipeline\n"
|
| 83 |
+
"on: [push, pull_request]\n"
|
| 84 |
+
"\n"
|
| 85 |
+
"jobs:\n"
|
| 86 |
+
" test:\n"
|
| 87 |
+
" steps:\n"
|
| 88 |
+
" - uses: actions/checkout@v4\n"
|
| 89 |
+
" - name: Run tests\n"
|
| 90 |
+
" run: npm test"
|
| 91 |
+
),
|
| 92 |
+
},
|
| 93 |
+
],
|
| 94 |
+
"error": {
|
| 95 |
+
"phase": "workflow_parse",
|
| 96 |
+
"message": "Job 'test' is missing required field 'runs-on'",
|
| 97 |
+
"exit_code": 1,
|
| 98 |
+
},
|
| 99 |
+
"expected_fixes": [
|
| 100 |
+
{
|
| 101 |
+
"file": ".github/workflows/ci.yml",
|
| 102 |
+
"type": "contains",
|
| 103 |
+
"expected": "runs-on:",
|
| 104 |
+
"hint": "Every job must specify a 'runs-on' field (e.g. runs-on: ubuntu-latest)",
|
| 105 |
+
}
|
| 106 |
+
],
|
| 107 |
+
},
|
| 108 |
|
| 109 |
+
# Scenario 3: Invalid event trigger syntax
|
| 110 |
+
{
|
| 111 |
+
"id": "invalid_trigger_syntax",
|
| 112 |
+
"files": [
|
| 113 |
+
{
|
| 114 |
+
"path": ".github/workflows/deploy.yml",
|
| 115 |
+
"type": "workflow",
|
| 116 |
+
"content": (
|
| 117 |
+
"name: Deploy\n"
|
| 118 |
+
"on:\n"
|
| 119 |
+
" push:\n"
|
| 120 |
+
" branches: main\n"
|
| 121 |
+
"\n"
|
| 122 |
+
"jobs:\n"
|
| 123 |
+
" deploy:\n"
|
| 124 |
+
" runs-on: ubuntu-latest\n"
|
| 125 |
+
" steps:\n"
|
| 126 |
+
" - uses: actions/checkout@v4\n"
|
| 127 |
+
" - name: Deploy\n"
|
| 128 |
+
" run: echo 'deploying...'"
|
| 129 |
+
),
|
| 130 |
+
},
|
| 131 |
+
],
|
| 132 |
+
"error": {
|
| 133 |
+
"phase": "workflow_parse",
|
| 134 |
+
"message": (
|
| 135 |
+
"Unexpected value 'main' for 'on.push.branches'. "
|
| 136 |
+
"Expected a sequence (list) value."
|
| 137 |
+
),
|
| 138 |
+
"exit_code": 1,
|
| 139 |
+
},
|
| 140 |
+
"expected_fixes": [
|
| 141 |
+
{
|
| 142 |
+
"file": ".github/workflows/deploy.yml",
|
| 143 |
+
"type": "contains",
|
| 144 |
+
"expected": "branches: [main]",
|
| 145 |
+
"hint": "branches must be a list: branches: [main] or branches:\\n - main",
|
| 146 |
+
}
|
| 147 |
+
],
|
| 148 |
+
},
|
| 149 |
+
|
| 150 |
+
# Scenario 4: Step defines neither 'uses' nor 'run'
|
| 151 |
+
{
|
| 152 |
+
"id": "missing_step_uses_or_run",
|
| 153 |
+
"files": [
|
| 154 |
+
{
|
| 155 |
+
"path": ".github/workflows/lint.yml",
|
| 156 |
+
"type": "workflow",
|
| 157 |
+
"content": (
|
| 158 |
+
"name: Lint\n"
|
| 159 |
+
"on: push\n"
|
| 160 |
+
"\n"
|
| 161 |
+
"jobs:\n"
|
| 162 |
+
" lint:\n"
|
| 163 |
+
" runs-on: ubuntu-latest\n"
|
| 164 |
+
" steps:\n"
|
| 165 |
+
" - uses: actions/checkout@v4\n"
|
| 166 |
+
" - name: Install dependencies\n"
|
| 167 |
+
" run: npm ci\n"
|
| 168 |
+
" - name: Run linter\n"
|
| 169 |
+
),
|
| 170 |
+
},
|
| 171 |
+
],
|
| 172 |
+
"error": {
|
| 173 |
+
"phase": "workflow_parse",
|
| 174 |
+
"message": "Every step must define a 'uses' or 'run' key. Step 'Run linter' has neither.",
|
| 175 |
+
"exit_code": 1,
|
| 176 |
+
},
|
| 177 |
+
"expected_fixes": [
|
| 178 |
+
{
|
| 179 |
+
"file": ".github/workflows/lint.yml",
|
| 180 |
+
"type": "contains",
|
| 181 |
+
"expected": "run:",
|
| 182 |
+
"hint": "The 'Run linter' step is missing a 'run' command β add e.g. run: npm run lint",
|
| 183 |
+
}
|
| 184 |
+
],
|
| 185 |
+
},
|
| 186 |
+
|
| 187 |
+
# Scenario 5: Missing 'on' trigger entirely
|
| 188 |
+
{
|
| 189 |
+
"id": "missing_on_trigger",
|
| 190 |
+
"files": [
|
| 191 |
+
{
|
| 192 |
+
"path": ".github/workflows/test.yml",
|
| 193 |
+
"type": "workflow",
|
| 194 |
+
"content": (
|
| 195 |
+
"name: Test Suite\n"
|
| 196 |
+
"\n"
|
| 197 |
+
"jobs:\n"
|
| 198 |
+
" test:\n"
|
| 199 |
+
" runs-on: ubuntu-latest\n"
|
| 200 |
+
" steps:\n"
|
| 201 |
+
" - uses: actions/checkout@v4\n"
|
| 202 |
+
" - name: Run tests\n"
|
| 203 |
+
" run: pytest tests/ -v"
|
| 204 |
+
),
|
| 205 |
+
},
|
| 206 |
+
],
|
| 207 |
+
"error": {
|
| 208 |
+
"phase": "workflow_parse",
|
| 209 |
+
"message": "Workflow must define an 'on' trigger event",
|
| 210 |
+
"exit_code": 1,
|
| 211 |
+
},
|
| 212 |
+
"expected_fixes": [
|
| 213 |
+
{
|
| 214 |
+
"file": ".github/workflows/test.yml",
|
| 215 |
+
"type": "contains",
|
| 216 |
+
"expected": "on:",
|
| 217 |
+
"hint": "Workflow is missing the required 'on' trigger β add e.g. on: push",
|
| 218 |
+
}
|
| 219 |
+
],
|
| 220 |
+
},
|
| 221 |
+
]
|
server/tasks/task_4_workflow_secrets_permissions.py
CHANGED
|
@@ -1,7 +1,15 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
import
|
| 4 |
-
from typing import Dict, Optional
|
| 5 |
|
| 6 |
from server.models import TaskDifficulty
|
| 7 |
from server.tasks.base import BaseTask
|
|
@@ -12,41 +20,270 @@ class WorkflowSecretsPermissionsTask(BaseTask):
|
|
| 12 |
DESCRIPTION = "Fix secret wiring, env usage, and permissions in workflows"
|
| 13 |
DIFFICULTY = TaskDifficulty.MEDIUM
|
| 14 |
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN"]
|
|
|
|
| 15 |
SCENARIOS = [
|
|
|
|
| 16 |
{
|
| 17 |
"id": "missing_env_secrets",
|
| 18 |
"files": [
|
| 19 |
{
|
| 20 |
"path": ".github/workflows/build.yml",
|
| 21 |
"type": "workflow",
|
| 22 |
-
"content":
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
],
|
| 25 |
"error": {
|
| 26 |
"phase": "workflow_parse",
|
| 27 |
-
"message": "Cannot perform an interactive login from a non TTY device",
|
|
|
|
|
|
|
| 28 |
},
|
| 29 |
"expected_fixes": [
|
| 30 |
{
|
| 31 |
"file": ".github/workflows/build.yml",
|
| 32 |
"type": "contains",
|
| 33 |
"expected": "DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}",
|
| 34 |
-
"hint": "
|
| 35 |
},
|
| 36 |
{
|
| 37 |
"file": ".github/workflows/build.yml",
|
| 38 |
"type": "contains",
|
| 39 |
"expected": "DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}",
|
| 40 |
-
"hint": "
|
| 41 |
},
|
| 42 |
],
|
| 43 |
-
}
|
| 44 |
-
]
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task 4: Workflow Secrets and Permissions β MEDIUM.
|
| 2 |
+
|
| 3 |
+
Agent fixes secret wiring, env variable mapping, and permission issues
|
| 4 |
+
in GitHub Actions workflows:
|
| 5 |
+
- Missing env block for Docker secrets
|
| 6 |
+
- Wrong secret syntax (${ vs ${{)
|
| 7 |
+
- Missing permissions for GITHUB_TOKEN
|
| 8 |
+
- GHCR login using wrong credentials
|
| 9 |
+
- Secret used in a run step without an env mapping
|
| 10 |
+
"""
|
| 11 |
|
| 12 |
+
from __future__ import annotations
|
|
|
|
| 13 |
|
| 14 |
from server.models import TaskDifficulty
|
| 15 |
from server.tasks.base import BaseTask
|
|
|
|
| 20 |
DESCRIPTION = "Fix secret wiring, env usage, and permissions in workflows"
|
| 21 |
DIFFICULTY = TaskDifficulty.MEDIUM
|
| 22 |
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN"]
|
| 23 |
+
|
| 24 |
SCENARIOS = [
|
| 25 |
+
# Scenario 1: Missing env block for secrets
|
| 26 |
{
|
| 27 |
"id": "missing_env_secrets",
|
| 28 |
"files": [
|
| 29 |
{
|
| 30 |
"path": ".github/workflows/build.yml",
|
| 31 |
"type": "workflow",
|
| 32 |
+
"content": (
|
| 33 |
+
"name: Build and Push\n"
|
| 34 |
+
"on: push\n"
|
| 35 |
+
"\n"
|
| 36 |
+
"jobs:\n"
|
| 37 |
+
" build:\n"
|
| 38 |
+
" runs-on: ubuntu-latest\n"
|
| 39 |
+
" steps:\n"
|
| 40 |
+
" - uses: actions/checkout@v4\n"
|
| 41 |
+
" - name: Login to DockerHub\n"
|
| 42 |
+
" run: echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin\n"
|
| 43 |
+
" - name: Build and push\n"
|
| 44 |
+
" run: |\n"
|
| 45 |
+
" docker build -t myuser/myapp:${{ github.sha }} .\n"
|
| 46 |
+
" docker push myuser/myapp:${{ github.sha }}"
|
| 47 |
+
),
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"path": "Dockerfile",
|
| 51 |
+
"type": "dockerfile",
|
| 52 |
+
"content": (
|
| 53 |
+
"FROM python:3.9-slim\n"
|
| 54 |
+
"WORKDIR /app\n"
|
| 55 |
+
"COPY . .\n"
|
| 56 |
+
"RUN pip install -r requirements.txt\n"
|
| 57 |
+
'CMD ["python", "app.py"]'
|
| 58 |
+
),
|
| 59 |
+
},
|
| 60 |
],
|
| 61 |
"error": {
|
| 62 |
"phase": "workflow_parse",
|
| 63 |
+
"message": "Error: Cannot perform an interactive login from a non TTY device",
|
| 64 |
+
"exit_code": 1,
|
| 65 |
+
"failed_step": "Login to DockerHub",
|
| 66 |
},
|
| 67 |
"expected_fixes": [
|
| 68 |
{
|
| 69 |
"file": ".github/workflows/build.yml",
|
| 70 |
"type": "contains",
|
| 71 |
"expected": "DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}",
|
| 72 |
+
"hint": "Secrets must be passed via env block",
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"file": ".github/workflows/build.yml",
|
| 76 |
"type": "contains",
|
| 77 |
"expected": "DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}",
|
| 78 |
+
"hint": "Both username and password need to be passed as env vars",
|
| 79 |
},
|
| 80 |
],
|
| 81 |
+
},
|
|
|
|
| 82 |
|
| 83 |
+
# Scenario 2: Wrong secret syntax — single brace instead of double
|
| 84 |
+
{
|
| 85 |
+
"id": "wrong_secret_syntax",
|
| 86 |
+
"files": [
|
| 87 |
+
{
|
| 88 |
+
"path": ".github/workflows/deploy.yml",
|
| 89 |
+
"type": "workflow",
|
| 90 |
+
"content": (
|
| 91 |
+
"name: Deploy\n"
|
| 92 |
+
"on:\n"
|
| 93 |
+
" push:\n"
|
| 94 |
+
" branches: [main]\n"
|
| 95 |
+
"\n"
|
| 96 |
+
"jobs:\n"
|
| 97 |
+
" deploy:\n"
|
| 98 |
+
" runs-on: ubuntu-latest\n"
|
| 99 |
+
" steps:\n"
|
| 100 |
+
" - uses: actions/checkout@v4\n"
|
| 101 |
+
" - name: Deploy to server\n"
|
| 102 |
+
" run: |\n"
|
| 103 |
+
" echo \"Deploying version ${ github.sha }\"\n"
|
| 104 |
+
" curl -H \"Authorization: Bearer ${ secrets.DEPLOY_TOKEN }\" https://api.example.com/deploy\n"
|
| 105 |
+
" env:\n"
|
| 106 |
+
" DEPLOY_TOKEN: ${ secrets.DEPLOY_TOKEN }"
|
| 107 |
+
),
|
| 108 |
+
},
|
| 109 |
+
],
|
| 110 |
+
"error": {
|
| 111 |
+
"phase": "workflow_parse",
|
| 112 |
+
"message": (
|
| 113 |
+
"Unrecognized expression syntax. "
|
| 114 |
+
"Use ${{ expression }} with double braces for GitHub Actions expressions."
|
| 115 |
+
),
|
| 116 |
+
"exit_code": 1,
|
| 117 |
+
},
|
| 118 |
+
"expected_fixes": [
|
| 119 |
+
{
|
| 120 |
+
"file": ".github/workflows/deploy.yml",
|
| 121 |
+
"type": "contains",
|
| 122 |
+
"expected": "${{ secrets.DEPLOY_TOKEN }}",
|
| 123 |
+
"hint": "GitHub Actions uses ${{ }} (double braces), not ${ } (single brace)",
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"file": ".github/workflows/deploy.yml",
|
| 127 |
+
"type": "contains",
|
| 128 |
+
"expected": "${{ github.sha }}",
|
| 129 |
+
"hint": "All GitHub expressions require double braces: ${{ github.sha }}",
|
| 130 |
+
},
|
| 131 |
+
],
|
| 132 |
+
},
|
| 133 |
+
|
| 134 |
+
# Scenario 3: Missing permissions for GITHUB_TOKEN to push packages
|
| 135 |
+
{
|
| 136 |
+
"id": "missing_token_permissions",
|
| 137 |
+
"files": [
|
| 138 |
+
{
|
| 139 |
+
"path": ".github/workflows/publish.yml",
|
| 140 |
+
"type": "workflow",
|
| 141 |
+
"content": (
|
| 142 |
+
"name: Publish Package\n"
|
| 143 |
+
"on:\n"
|
| 144 |
+
" push:\n"
|
| 145 |
+
" tags: ['v*']\n"
|
| 146 |
+
"\n"
|
| 147 |
+
"jobs:\n"
|
| 148 |
+
" publish:\n"
|
| 149 |
+
" runs-on: ubuntu-latest\n"
|
| 150 |
+
" steps:\n"
|
| 151 |
+
" - uses: actions/checkout@v4\n"
|
| 152 |
+
" - name: Login to GHCR\n"
|
| 153 |
+
" run: echo ${{ secrets.GITHUB_TOKEN }} | docker login ghcr.io -u ${{ github.actor }} --password-stdin\n"
|
| 154 |
+
" - name: Build and push\n"
|
| 155 |
+
" run: |\n"
|
| 156 |
+
" docker build -t ghcr.io/${{ github.repository }}:${{ github.ref_name }} .\n"
|
| 157 |
+
" docker push ghcr.io/${{ github.repository }}:${{ github.ref_name }}"
|
| 158 |
+
),
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"path": "Dockerfile",
|
| 162 |
+
"type": "dockerfile",
|
| 163 |
+
"content": (
|
| 164 |
+
"FROM python:3.11-slim\n"
|
| 165 |
+
"WORKDIR /app\n"
|
| 166 |
+
"COPY . .\n"
|
| 167 |
+
'CMD ["python", "app.py"]'
|
| 168 |
+
),
|
| 169 |
+
},
|
| 170 |
+
],
|
| 171 |
+
"error": {
|
| 172 |
+
"phase": "push",
|
| 173 |
+
"message": (
|
| 174 |
+
"denied: permission_denied: write_package β "
|
| 175 |
+
"GITHUB_TOKEN does not have packages:write permission"
|
| 176 |
+
),
|
| 177 |
+
"exit_code": 1,
|
| 178 |
+
"failed_step": "Build and push",
|
| 179 |
+
},
|
| 180 |
+
"expected_fixes": [
|
| 181 |
+
{
|
| 182 |
+
"file": ".github/workflows/publish.yml",
|
| 183 |
+
"type": "contains",
|
| 184 |
+
"expected": "packages: write",
|
| 185 |
+
"hint": "Add 'permissions: packages: write' at job or workflow level to allow pushing to GHCR",
|
| 186 |
+
},
|
| 187 |
+
],
|
| 188 |
+
},
|
| 189 |
+
|
| 190 |
+
# Scenario 4: Secret referenced in run but not mapped to env
|
| 191 |
+
{
|
| 192 |
+
"id": "secret_not_in_env",
|
| 193 |
+
"files": [
|
| 194 |
+
{
|
| 195 |
+
"path": ".github/workflows/notify.yml",
|
| 196 |
+
"type": "workflow",
|
| 197 |
+
"content": (
|
| 198 |
+
"name: Notify\n"
|
| 199 |
+
"on:\n"
|
| 200 |
+
" push:\n"
|
| 201 |
+
" branches: [main]\n"
|
| 202 |
+
"\n"
|
| 203 |
+
"jobs:\n"
|
| 204 |
+
" notify:\n"
|
| 205 |
+
" runs-on: ubuntu-latest\n"
|
| 206 |
+
" steps:\n"
|
| 207 |
+
" - uses: actions/checkout@v4\n"
|
| 208 |
+
" - name: Send Slack notification\n"
|
| 209 |
+
" run: |\n"
|
| 210 |
+
" curl -X POST -H 'Content-Type: application/json' \\\n"
|
| 211 |
+
" -d '{\"text\": \"Deployed ${{ github.sha }}\"}' \\\n"
|
| 212 |
+
" $SLACK_WEBHOOK_URL"
|
| 213 |
+
),
|
| 214 |
+
},
|
| 215 |
+
],
|
| 216 |
+
"error": {
|
| 217 |
+
"phase": "workflow_parse",
|
| 218 |
+
"message": "SLACK_WEBHOOK_URL is empty β secret not available in shell environment",
|
| 219 |
+
"exit_code": 1,
|
| 220 |
+
"failed_step": "Send Slack notification",
|
| 221 |
+
},
|
| 222 |
+
"expected_fixes": [
|
| 223 |
+
{
|
| 224 |
+
"file": ".github/workflows/notify.yml",
|
| 225 |
+
"type": "contains",
|
| 226 |
+
"expected": "SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}",
|
| 227 |
+
"hint": "Map the secret to an environment variable using env: block",
|
| 228 |
+
},
|
| 229 |
+
],
|
| 230 |
+
},
|
| 231 |
+
|
| 232 |
+
# Scenario 5: Using DOCKER_PASSWORD for GHCR instead of GITHUB_TOKEN
|
| 233 |
+
{
|
| 234 |
+
"id": "ghcr_wrong_credentials",
|
| 235 |
+
"files": [
|
| 236 |
+
{
|
| 237 |
+
"path": ".github/workflows/ghcr.yml",
|
| 238 |
+
"type": "workflow",
|
| 239 |
+
"content": (
|
| 240 |
+
"name: Push to GHCR\n"
|
| 241 |
+
"on:\n"
|
| 242 |
+
" push:\n"
|
| 243 |
+
" branches: [main]\n"
|
| 244 |
+
"\n"
|
| 245 |
+
"jobs:\n"
|
| 246 |
+
" push:\n"
|
| 247 |
+
" runs-on: ubuntu-latest\n"
|
| 248 |
+
" permissions:\n"
|
| 249 |
+
" packages: write\n"
|
| 250 |
+
" steps:\n"
|
| 251 |
+
" - uses: actions/checkout@v4\n"
|
| 252 |
+
" - name: Login to GHCR\n"
|
| 253 |
+
" run: echo ${{ secrets.DOCKER_PASSWORD }} | docker login ghcr.io -u ${{ github.actor }} --password-stdin\n"
|
| 254 |
+
" - name: Push image\n"
|
| 255 |
+
" run: |\n"
|
| 256 |
+
" docker build -t ghcr.io/${{ github.repository }}:latest .\n"
|
| 257 |
+
" docker push ghcr.io/${{ github.repository }}:latest"
|
| 258 |
+
),
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"path": "Dockerfile",
|
| 262 |
+
"type": "dockerfile",
|
| 263 |
+
"content": (
|
| 264 |
+
"FROM python:3.11-slim\n"
|
| 265 |
+
"WORKDIR /app\n"
|
| 266 |
+
"COPY . .\n"
|
| 267 |
+
'CMD ["python", "app.py"]'
|
| 268 |
+
),
|
| 269 |
+
},
|
| 270 |
+
],
|
| 271 |
+
"error": {
|
| 272 |
+
"phase": "push",
|
| 273 |
+
"message": (
|
| 274 |
+
"Error: denied: installation not allowed to Create organization package β "
|
| 275 |
+
"GHCR requires GITHUB_TOKEN, not DOCKER_PASSWORD"
|
| 276 |
+
),
|
| 277 |
+
"exit_code": 1,
|
| 278 |
+
"failed_step": "Login to GHCR",
|
| 279 |
+
},
|
| 280 |
+
"expected_fixes": [
|
| 281 |
+
{
|
| 282 |
+
"file": ".github/workflows/ghcr.yml",
|
| 283 |
+
"type": "contains",
|
| 284 |
+
"expected": "secrets.GITHUB_TOKEN",
|
| 285 |
+
"hint": "GHCR uses GITHUB_TOKEN for authentication, not DOCKER_PASSWORD",
|
| 286 |
+
},
|
| 287 |
+
],
|
| 288 |
+
},
|
| 289 |
+
]
|
server/tasks/task_5_ci_docker_integration.py
CHANGED
|
@@ -1,7 +1,14 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
import
|
| 4 |
-
from typing import Dict, Optional
|
| 5 |
|
| 6 |
from server.models import TaskDifficulty
|
| 7 |
from server.tasks.base import BaseTask
|
|
@@ -12,36 +19,293 @@ class CIDockerIntegrationTask(BaseTask):
|
|
| 12 |
DESCRIPTION = "Debug combined workflow and Docker build integration failures"
|
| 13 |
DIFFICULTY = TaskDifficulty.MEDIUM
|
| 14 |
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN"]
|
|
|
|
| 15 |
SCENARIOS = [
|
|
|
|
| 16 |
{
|
| 17 |
"id": "missing_buildx_for_platforms",
|
| 18 |
"files": [
|
| 19 |
{
|
| 20 |
"path": ".github/workflows/build.yml",
|
| 21 |
"type": "workflow",
|
| 22 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
-
{"path": "Dockerfile", "type": "dockerfile", "content": "FROM python:3.11-slim\nWORKDIR /app\nCOPY . ."},
|
| 25 |
],
|
| 26 |
"error": {
|
| 27 |
"phase": "docker_build",
|
| 28 |
-
"message":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
},
|
| 30 |
"expected_fixes": [
|
| 31 |
{
|
| 32 |
"file": ".github/workflows/build.yml",
|
| 33 |
"type": "contains",
|
| 34 |
"expected": "docker/setup-buildx-action",
|
| 35 |
-
"hint": "
|
| 36 |
}
|
| 37 |
],
|
| 38 |
-
}
|
| 39 |
-
]
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task 5: CI and Docker Build Integration β MEDIUM-HARD.
|
| 2 |
+
|
| 3 |
+
Agent debugs combined workflow + Docker build integration failures:
|
| 4 |
+
- Missing Buildx for multi-platform
|
| 5 |
+
- Docker login needs secrets in env block
|
| 6 |
+
- Build context path mismatch
|
| 7 |
+
- Cache configuration errors
|
| 8 |
+
- Missing Docker login before push
|
| 9 |
+
"""
|
| 10 |
|
| 11 |
+
from __future__ import annotations
|
|
|
|
| 12 |
|
| 13 |
from server.models import TaskDifficulty
|
| 14 |
from server.tasks.base import BaseTask
|
|
|
|
| 19 |
DESCRIPTION = "Debug combined workflow and Docker build integration failures"
|
| 20 |
DIFFICULTY = TaskDifficulty.MEDIUM
|
| 21 |
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN"]
|
| 22 |
+
|
| 23 |
SCENARIOS = [
|
| 24 |
+
# Scenario 1: Missing Buildx setup for multi-platform build
|
| 25 |
{
|
| 26 |
"id": "missing_buildx_for_platforms",
|
| 27 |
"files": [
|
| 28 |
{
|
| 29 |
"path": ".github/workflows/build.yml",
|
| 30 |
"type": "workflow",
|
| 31 |
+
"content": (
|
| 32 |
+
"name: Multi-platform Build\n"
|
| 33 |
+
"on: push\n"
|
| 34 |
+
"\n"
|
| 35 |
+
"jobs:\n"
|
| 36 |
+
" build:\n"
|
| 37 |
+
" runs-on: ubuntu-latest\n"
|
| 38 |
+
" steps:\n"
|
| 39 |
+
" - uses: actions/checkout@v4\n"
|
| 40 |
+
" - name: Build multi-platform\n"
|
| 41 |
+
" uses: docker/build-push-action@v5\n"
|
| 42 |
+
" with:\n"
|
| 43 |
+
" context: .\n"
|
| 44 |
+
" platforms: linux/amd64,linux/arm64\n"
|
| 45 |
+
" push: false"
|
| 46 |
+
),
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"path": "Dockerfile",
|
| 50 |
+
"type": "dockerfile",
|
| 51 |
+
"content": (
|
| 52 |
+
"FROM python:3.11-slim\n"
|
| 53 |
+
"WORKDIR /app\n"
|
| 54 |
+
"COPY . .\n"
|
| 55 |
+
'CMD ["python", "app.py"]'
|
| 56 |
+
),
|
| 57 |
},
|
|
|
|
| 58 |
],
|
| 59 |
"error": {
|
| 60 |
"phase": "docker_build",
|
| 61 |
+
"message": (
|
| 62 |
+
"ERROR: Multi-platform build is not supported for the docker driver. "
|
| 63 |
+
"Switch to a different driver, or turn on the containerd image store."
|
| 64 |
+
),
|
| 65 |
+
"exit_code": 1,
|
| 66 |
+
"failed_step": "Build multi-platform",
|
| 67 |
},
|
| 68 |
"expected_fixes": [
|
| 69 |
{
|
| 70 |
"file": ".github/workflows/build.yml",
|
| 71 |
"type": "contains",
|
| 72 |
"expected": "docker/setup-buildx-action",
|
| 73 |
+
"hint": "Multi-platform builds require Docker Buildx setup step",
|
| 74 |
}
|
| 75 |
],
|
| 76 |
+
},
|
|
|
|
| 77 |
|
| 78 |
+
# Scenario 2: Docker login + build but secrets not wired in env block
|
| 79 |
+
{
|
| 80 |
+
"id": "login_secrets_not_wired",
|
| 81 |
+
"files": [
|
| 82 |
+
{
|
| 83 |
+
"path": ".github/workflows/build.yml",
|
| 84 |
+
"type": "workflow",
|
| 85 |
+
"content": (
|
| 86 |
+
"name: Build and Push\n"
|
| 87 |
+
"on: push\n"
|
| 88 |
+
"\n"
|
| 89 |
+
"jobs:\n"
|
| 90 |
+
" build:\n"
|
| 91 |
+
" runs-on: ubuntu-latest\n"
|
| 92 |
+
" steps:\n"
|
| 93 |
+
" - uses: actions/checkout@v4\n"
|
| 94 |
+
" - name: Login to DockerHub\n"
|
| 95 |
+
" run: echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin\n"
|
| 96 |
+
" - name: Build\n"
|
| 97 |
+
" run: docker build -t myuser/app:latest .\n"
|
| 98 |
+
" - name: Push\n"
|
| 99 |
+
" run: docker push myuser/app:latest"
|
| 100 |
+
),
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"path": "Dockerfile",
|
| 104 |
+
"type": "dockerfile",
|
| 105 |
+
"content": (
|
| 106 |
+
"FROM node:18-alpine\n"
|
| 107 |
+
"WORKDIR /app\n"
|
| 108 |
+
"COPY package*.json ./\n"
|
| 109 |
+
"RUN npm ci\n"
|
| 110 |
+
"COPY . .\n"
|
| 111 |
+
"EXPOSE 3000\n"
|
| 112 |
+
'CMD ["npm", "start"]'
|
| 113 |
+
),
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"path": "package.json",
|
| 117 |
+
"type": "other",
|
| 118 |
+
"content": '{"name": "app", "scripts": {"start": "node server.js"}}',
|
| 119 |
+
},
|
| 120 |
+
],
|
| 121 |
+
"error": {
|
| 122 |
+
"phase": "workflow_parse",
|
| 123 |
+
"message": "Error: Cannot perform an interactive login from a non TTY device",
|
| 124 |
+
"exit_code": 1,
|
| 125 |
+
"failed_step": "Login to DockerHub",
|
| 126 |
+
},
|
| 127 |
+
"expected_fixes": [
|
| 128 |
+
{
|
| 129 |
+
"file": ".github/workflows/build.yml",
|
| 130 |
+
"type": "contains",
|
| 131 |
+
"expected": "DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}",
|
| 132 |
+
"hint": "Secrets need to be mapped to env vars in the step",
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"file": ".github/workflows/build.yml",
|
| 136 |
+
"type": "contains",
|
| 137 |
+
"expected": "DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}",
|
| 138 |
+
"hint": "Both Docker credentials must be in the env block",
|
| 139 |
+
},
|
| 140 |
+
],
|
| 141 |
+
},
|
| 142 |
+
|
| 143 |
+
# Scenario 3: Build context path wrong — using subdirectory but context is .
|
| 144 |
+
{
|
| 145 |
+
"id": "wrong_build_context",
|
| 146 |
+
"files": [
|
| 147 |
+
{
|
| 148 |
+
"path": ".github/workflows/build.yml",
|
| 149 |
+
"type": "workflow",
|
| 150 |
+
"content": (
|
| 151 |
+
"name: Build Backend\n"
|
| 152 |
+
"on: push\n"
|
| 153 |
+
"\n"
|
| 154 |
+
"jobs:\n"
|
| 155 |
+
" build:\n"
|
| 156 |
+
" runs-on: ubuntu-latest\n"
|
| 157 |
+
" steps:\n"
|
| 158 |
+
" - uses: actions/checkout@v4\n"
|
| 159 |
+
" - name: Build backend\n"
|
| 160 |
+
" uses: docker/build-push-action@v5\n"
|
| 161 |
+
" with:\n"
|
| 162 |
+
" context: ./backend\n"
|
| 163 |
+
" file: ./Dockerfile\n"
|
| 164 |
+
" push: false"
|
| 165 |
+
),
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"path": "Dockerfile",
|
| 169 |
+
"type": "dockerfile",
|
| 170 |
+
"content": (
|
| 171 |
+
"FROM python:3.11-slim\n"
|
| 172 |
+
"WORKDIR /app\n"
|
| 173 |
+
"COPY requirements.txt .\n"
|
| 174 |
+
"RUN pip install -r requirements.txt\n"
|
| 175 |
+
"COPY . .\n"
|
| 176 |
+
'CMD ["python", "app.py"]'
|
| 177 |
+
),
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"path": "requirements.txt",
|
| 181 |
+
"type": "requirements",
|
| 182 |
+
"content": "flask==2.3.0",
|
| 183 |
+
},
|
| 184 |
+
],
|
| 185 |
+
"error": {
|
| 186 |
+
"phase": "docker_build",
|
| 187 |
+
"message": (
|
| 188 |
+
"unable to prepare context: path \"./Dockerfile\" not found β "
|
| 189 |
+
"Dockerfile path does not match build context"
|
| 190 |
+
),
|
| 191 |
+
"exit_code": 1,
|
| 192 |
+
"failed_step": "Build backend",
|
| 193 |
+
},
|
| 194 |
+
"expected_fixes": [
|
| 195 |
+
{
|
| 196 |
+
"file": ".github/workflows/build.yml",
|
| 197 |
+
"type": "contains",
|
| 198 |
+
"expected": "file: ./backend/Dockerfile",
|
| 199 |
+
"hint": "When context is ./backend, the Dockerfile path must be relative to repo root: ./backend/Dockerfile",
|
| 200 |
+
}
|
| 201 |
+
],
|
| 202 |
+
},
|
| 203 |
+
|
| 204 |
+
# Scenario 4: Cache export without mode=max
|
| 205 |
+
{
|
| 206 |
+
"id": "cache_without_mode_max",
|
| 207 |
+
"files": [
|
| 208 |
+
{
|
| 209 |
+
"path": ".github/workflows/build.yml",
|
| 210 |
+
"type": "workflow",
|
| 211 |
+
"content": (
|
| 212 |
+
"name: Build with Cache\n"
|
| 213 |
+
"on: push\n"
|
| 214 |
+
"\n"
|
| 215 |
+
"jobs:\n"
|
| 216 |
+
" build:\n"
|
| 217 |
+
" runs-on: ubuntu-latest\n"
|
| 218 |
+
" steps:\n"
|
| 219 |
+
" - uses: actions/checkout@v4\n"
|
| 220 |
+
" - name: Set up Docker Buildx\n"
|
| 221 |
+
" uses: docker/setup-buildx-action@v3\n"
|
| 222 |
+
" - name: Build\n"
|
| 223 |
+
" uses: docker/build-push-action@v5\n"
|
| 224 |
+
" with:\n"
|
| 225 |
+
" context: .\n"
|
| 226 |
+
" push: false\n"
|
| 227 |
+
" cache-from: type=gha\n"
|
| 228 |
+
" cache-to: type=gha"
|
| 229 |
+
),
|
| 230 |
+
},
|
| 231 |
+
{
|
| 232 |
+
"path": "Dockerfile",
|
| 233 |
+
"type": "dockerfile",
|
| 234 |
+
"content": (
|
| 235 |
+
"FROM python:3.9-slim\n"
|
| 236 |
+
"WORKDIR /app\n"
|
| 237 |
+
"COPY . .\n"
|
| 238 |
+
'CMD ["python", "app.py"]'
|
| 239 |
+
),
|
| 240 |
+
},
|
| 241 |
+
],
|
| 242 |
+
"error": {
|
| 243 |
+
"phase": "docker_build",
|
| 244 |
+
"message": (
|
| 245 |
+
"ERROR: cache export feature is currently not supported for docker driver. "
|
| 246 |
+
"Please switch to a different driver"
|
| 247 |
+
),
|
| 248 |
+
"exit_code": 1,
|
| 249 |
+
"failed_step": "Build",
|
| 250 |
+
},
|
| 251 |
+
"expected_fixes": [
|
| 252 |
+
{
|
| 253 |
+
"file": ".github/workflows/build.yml",
|
| 254 |
+
"type": "contains",
|
| 255 |
+
"expected": "cache-to: type=gha,mode=max",
|
| 256 |
+
"hint": "GHA cache needs mode=max for proper cache export",
|
| 257 |
+
}
|
| 258 |
+
],
|
| 259 |
+
},
|
| 260 |
+
|
| 261 |
+
# Scenario 5: Push without login
|
| 262 |
+
{
|
| 263 |
+
"id": "push_without_login",
|
| 264 |
+
"files": [
|
| 265 |
+
{
|
| 266 |
+
"path": ".github/workflows/build.yml",
|
| 267 |
+
"type": "workflow",
|
| 268 |
+
"content": (
|
| 269 |
+
"name: Build and Push\n"
|
| 270 |
+
"on:\n"
|
| 271 |
+
" push:\n"
|
| 272 |
+
" tags: ['v*']\n"
|
| 273 |
+
"\n"
|
| 274 |
+
"jobs:\n"
|
| 275 |
+
" build:\n"
|
| 276 |
+
" runs-on: ubuntu-latest\n"
|
| 277 |
+
" steps:\n"
|
| 278 |
+
" - uses: actions/checkout@v4\n"
|
| 279 |
+
" - name: Build image\n"
|
| 280 |
+
" run: docker build -t myuser/myapp:${{ github.ref_name }} .\n"
|
| 281 |
+
" - name: Push image\n"
|
| 282 |
+
" run: docker push myuser/myapp:${{ github.ref_name }}"
|
| 283 |
+
),
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"path": "Dockerfile",
|
| 287 |
+
"type": "dockerfile",
|
| 288 |
+
"content": (
|
| 289 |
+
"FROM python:3.11-slim\n"
|
| 290 |
+
"WORKDIR /app\n"
|
| 291 |
+
"COPY . .\n"
|
| 292 |
+
'CMD ["python", "app.py"]'
|
| 293 |
+
),
|
| 294 |
+
},
|
| 295 |
+
],
|
| 296 |
+
"error": {
|
| 297 |
+
"phase": "push",
|
| 298 |
+
"message": "denied: requested access to the resource is denied β not logged in to registry",
|
| 299 |
+
"exit_code": 1,
|
| 300 |
+
"failed_step": "Push image",
|
| 301 |
+
},
|
| 302 |
+
"expected_fixes": [
|
| 303 |
+
{
|
| 304 |
+
"file": ".github/workflows/build.yml",
|
| 305 |
+
"type": "contains",
|
| 306 |
+
"expected": "docker login",
|
| 307 |
+
"hint": "Add a Docker login step before pushing to a registry",
|
| 308 |
+
},
|
| 309 |
+
],
|
| 310 |
+
},
|
| 311 |
+
]
|
server/tasks/task_6_multi_stage_matrix.py
CHANGED
|
@@ -1,7 +1,14 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
import
|
| 4 |
-
from typing import Dict, Optional
|
| 5 |
|
| 6 |
from server.models import TaskDifficulty
|
| 7 |
from server.tasks.base import BaseTask
|
|
@@ -12,33 +19,376 @@ class MultiStageMatrixTask(BaseTask):
|
|
| 12 |
DESCRIPTION = "Debug complex multi-stage and matrix CI/CD pipelines"
|
| 13 |
DIFFICULTY = TaskDifficulty.HARD
|
| 14 |
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN", "NPM_TOKEN"]
|
|
|
|
| 15 |
SCENARIOS = [
|
|
|
|
| 16 |
{
|
| 17 |
"id": "artifact_path_mismatch",
|
| 18 |
"files": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
{
|
| 20 |
"path": "Dockerfile",
|
| 21 |
"type": "dockerfile",
|
| 22 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
-
{"path": "package.json", "type": "other", "content": '{"scripts": {"build": "react-scripts build"}}'},
|
| 25 |
],
|
| 26 |
-
"error": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
"expected_fixes": [
|
| 28 |
{
|
| 29 |
"file": "Dockerfile",
|
| 30 |
"type": "contains",
|
| 31 |
"expected": "COPY --from=builder /app/build",
|
| 32 |
-
"hint": "React
|
| 33 |
}
|
| 34 |
],
|
| 35 |
-
}
|
| 36 |
-
]
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task 6: Multi-Stage Pipeline and Matrix — HARD.
|
| 2 |
+
|
| 3 |
+
Agent debugs complex multi-stage Docker builds and matrix CI/CD pipelines:
|
| 4 |
+
- Multi-stage artifact path mismatch (dist vs build)
|
| 5 |
+
- Platform ARGs not declared
|
| 6 |
+
- Cross-job artifact dependency (missing 'needs')
|
| 7 |
+
- Multiple interacting issues (Dockerfile typo + missing env secrets)
|
| 8 |
+
- Matrix strategy with version-specific failures
|
| 9 |
+
"""
|
| 10 |
|
| 11 |
+
from __future__ import annotations
|
|
|
|
| 12 |
|
| 13 |
from server.models import TaskDifficulty
|
| 14 |
from server.tasks.base import BaseTask
|
|
|
|
| 19 |
DESCRIPTION = "Debug complex multi-stage and matrix CI/CD pipelines"
|
| 20 |
DIFFICULTY = TaskDifficulty.HARD
|
| 21 |
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN", "NPM_TOKEN"]
|
| 22 |
+
|
| 23 |
SCENARIOS = [
|
| 24 |
+
# Scenario 1: Multi-stage artifact path mismatch (dist vs build)
|
| 25 |
{
|
| 26 |
"id": "artifact_path_mismatch",
|
| 27 |
"files": [
|
| 28 |
+
{
|
| 29 |
+
"path": ".github/workflows/build.yml",
|
| 30 |
+
"type": "workflow",
|
| 31 |
+
"content": (
|
| 32 |
+
"name: Build and Deploy\n"
|
| 33 |
+
"on: push\n"
|
| 34 |
+
"\n"
|
| 35 |
+
"jobs:\n"
|
| 36 |
+
" build:\n"
|
| 37 |
+
" runs-on: ubuntu-latest\n"
|
| 38 |
+
" steps:\n"
|
| 39 |
+
" - uses: actions/checkout@v4\n"
|
| 40 |
+
" - name: Set up Docker Buildx\n"
|
| 41 |
+
" uses: docker/setup-buildx-action@v3\n"
|
| 42 |
+
" - name: Build\n"
|
| 43 |
+
" uses: docker/build-push-action@v5\n"
|
| 44 |
+
" with:\n"
|
| 45 |
+
" context: .\n"
|
| 46 |
+
" push: false\n"
|
| 47 |
+
" load: true\n"
|
| 48 |
+
" tags: myapp:test"
|
| 49 |
+
),
|
| 50 |
+
},
|
| 51 |
{
|
| 52 |
"path": "Dockerfile",
|
| 53 |
"type": "dockerfile",
|
| 54 |
+
"content": (
|
| 55 |
+
"FROM node:18 AS builder\n"
|
| 56 |
+
"WORKDIR /app\n"
|
| 57 |
+
"COPY package*.json ./\n"
|
| 58 |
+
"RUN npm ci\n"
|
| 59 |
+
"COPY . .\n"
|
| 60 |
+
"RUN npm run build\n"
|
| 61 |
+
"\n"
|
| 62 |
+
"FROM nginx:alpine\n"
|
| 63 |
+
"COPY --from=builder /app/dist /usr/share/nginx/html\n"
|
| 64 |
+
"EXPOSE 80\n"
|
| 65 |
+
'CMD ["nginx", "-g", "daemon off;"]'
|
| 66 |
+
),
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"path": "package.json",
|
| 70 |
+
"type": "other",
|
| 71 |
+
"content": '{"name": "frontend", "scripts": {"build": "react-scripts build"}}',
|
| 72 |
},
|
|
|
|
| 73 |
],
|
| 74 |
+
"error": {
|
| 75 |
+
"phase": "docker_build",
|
| 76 |
+
"message": "COPY failed: stat app/dist: file does not exist",
|
| 77 |
+
"exit_code": 1,
|
| 78 |
+
"failed_step": "Build",
|
| 79 |
+
"line_hint": 9,
|
| 80 |
+
},
|
| 81 |
"expected_fixes": [
|
| 82 |
{
|
| 83 |
"file": "Dockerfile",
|
| 84 |
"type": "contains",
|
| 85 |
"expected": "COPY --from=builder /app/build",
|
| 86 |
+
"hint": "React's create-react-app outputs to 'build' directory, not 'dist'",
|
| 87 |
}
|
| 88 |
],
|
| 89 |
+
},
|
|
|
|
| 90 |
|
| 91 |
+
# Scenario 2: Platform ARGs not declared
|
| 92 |
+
{
|
| 93 |
+
"id": "matrix_platform_arg",
|
| 94 |
+
"files": [
|
| 95 |
+
{
|
| 96 |
+
"path": ".github/workflows/build.yml",
|
| 97 |
+
"type": "workflow",
|
| 98 |
+
"content": (
|
| 99 |
+
"name: Multi-Platform Build\n"
|
| 100 |
+
"on: push\n"
|
| 101 |
+
"\n"
|
| 102 |
+
"jobs:\n"
|
| 103 |
+
" build:\n"
|
| 104 |
+
" runs-on: ubuntu-latest\n"
|
| 105 |
+
" strategy:\n"
|
| 106 |
+
" matrix:\n"
|
| 107 |
+
" platform:\n"
|
| 108 |
+
" - linux/amd64\n"
|
| 109 |
+
" - linux/arm64\n"
|
| 110 |
+
" steps:\n"
|
| 111 |
+
" - uses: actions/checkout@v4\n"
|
| 112 |
+
" - name: Set up QEMU\n"
|
| 113 |
+
" uses: docker/setup-qemu-action@v3\n"
|
| 114 |
+
" - name: Set up Docker Buildx\n"
|
| 115 |
+
" uses: docker/setup-buildx-action@v3\n"
|
| 116 |
+
" - name: Build\n"
|
| 117 |
+
" uses: docker/build-push-action@v5\n"
|
| 118 |
+
" with:\n"
|
| 119 |
+
" context: .\n"
|
| 120 |
+
" platforms: ${{ matrix.platform }}\n"
|
| 121 |
+
" push: false"
|
| 122 |
+
),
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"path": "Dockerfile",
|
| 126 |
+
"type": "dockerfile",
|
| 127 |
+
"content": (
|
| 128 |
+
"FROM --platform=$BUILDPLATFORM node:18 AS builder\n"
|
| 129 |
+
"WORKDIR /app\n"
|
| 130 |
+
"COPY package*.json ./\n"
|
| 131 |
+
"RUN npm ci\n"
|
| 132 |
+
"COPY . .\n"
|
| 133 |
+
"RUN npm run build\n"
|
| 134 |
+
"\n"
|
| 135 |
+
"FROM --platform=$TARGETPLATFORM nginx:alpine\n"
|
| 136 |
+
"COPY --from=builder /app/build /usr/share/nginx/html\n"
|
| 137 |
+
"EXPOSE 80"
|
| 138 |
+
),
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"path": "package.json",
|
| 142 |
+
"type": "other",
|
| 143 |
+
"content": '{"name": "app", "scripts": {"build": "echo build"}}',
|
| 144 |
+
},
|
| 145 |
+
],
|
| 146 |
+
"error": {
|
| 147 |
+
"phase": "docker_build",
|
| 148 |
+
"message": 'failed to solve: failed to parse platform : "" is not a valid platform',
|
| 149 |
+
"exit_code": 1,
|
| 150 |
+
"failed_step": "Build",
|
| 151 |
+
},
|
| 152 |
+
"expected_fixes": [
|
| 153 |
+
{
|
| 154 |
+
"file": "Dockerfile",
|
| 155 |
+
"type": "contains",
|
| 156 |
+
"expected": "ARG BUILDPLATFORM",
|
| 157 |
+
"hint": "Platform ARGs must be declared before use",
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"file": "Dockerfile",
|
| 161 |
+
"type": "contains",
|
| 162 |
+
"expected": "ARG TARGETPLATFORM",
|
| 163 |
+
"hint": "Both BUILDPLATFORM and TARGETPLATFORM need ARG declarations",
|
| 164 |
+
},
|
| 165 |
+
],
|
| 166 |
+
},
|
| 167 |
+
|
| 168 |
+
# Scenario 3: Cross-job artifact — missing 'needs' dependency
|
| 169 |
+
{
|
| 170 |
+
"id": "cross_job_artifact",
|
| 171 |
+
"files": [
|
| 172 |
+
{
|
| 173 |
+
"path": ".github/workflows/build.yml",
|
| 174 |
+
"type": "workflow",
|
| 175 |
+
"content": (
|
| 176 |
+
"name: Build and Test\n"
|
| 177 |
+
"on: push\n"
|
| 178 |
+
"\n"
|
| 179 |
+
"jobs:\n"
|
| 180 |
+
" build:\n"
|
| 181 |
+
" runs-on: ubuntu-latest\n"
|
| 182 |
+
" steps:\n"
|
| 183 |
+
" - uses: actions/checkout@v4\n"
|
| 184 |
+
" - name: Build\n"
|
| 185 |
+
" run: |\n"
|
| 186 |
+
" docker build -t myapp:${{ github.sha }} .\n"
|
| 187 |
+
" docker save myapp:${{ github.sha }} > image.tar\n"
|
| 188 |
+
" - uses: actions/upload-artifact@v4\n"
|
| 189 |
+
" with:\n"
|
| 190 |
+
" name: docker-image\n"
|
| 191 |
+
" path: image.tar\n"
|
| 192 |
+
"\n"
|
| 193 |
+
" test:\n"
|
| 194 |
+
" runs-on: ubuntu-latest\n"
|
| 195 |
+
" steps:\n"
|
| 196 |
+
" - name: Download image\n"
|
| 197 |
+
" uses: actions/download-artifact@v4\n"
|
| 198 |
+
" with:\n"
|
| 199 |
+
" name: docker-image\n"
|
| 200 |
+
" - name: Load and test\n"
|
| 201 |
+
" run: |\n"
|
| 202 |
+
" docker load < image.tar\n"
|
| 203 |
+
" docker run myapp:${{ github.sha }} pytest"
|
| 204 |
+
),
|
| 205 |
+
},
|
| 206 |
+
{
|
| 207 |
+
"path": "Dockerfile",
|
| 208 |
+
"type": "dockerfile",
|
| 209 |
+
"content": (
|
| 210 |
+
"FROM python:3.9\n"
|
| 211 |
+
"WORKDIR /app\n"
|
| 212 |
+
"COPY . .\n"
|
| 213 |
+
"RUN pip install pytest\n"
|
| 214 |
+
'CMD ["python", "app.py"]'
|
| 215 |
+
),
|
| 216 |
+
},
|
| 217 |
+
],
|
| 218 |
+
"error": {
|
| 219 |
+
"phase": "workflow_parse",
|
| 220 |
+
"message": (
|
| 221 |
+
"The workflow is not valid. .github/workflows/build.yml "
|
| 222 |
+
"(Line: 18, Col: 5): Job 'test' depends on unknown job 'build' β "
|
| 223 |
+
"add 'needs: build' to the test job"
|
| 224 |
+
),
|
| 225 |
+
"exit_code": 1,
|
| 226 |
+
},
|
| 227 |
+
"expected_fixes": [
|
| 228 |
+
{
|
| 229 |
+
"file": ".github/workflows/build.yml",
|
| 230 |
+
"type": "contains",
|
| 231 |
+
"expected": "needs: build",
|
| 232 |
+
"hint": "Test job needs to declare dependency on build job via 'needs: build'",
|
| 233 |
+
}
|
| 234 |
+
],
|
| 235 |
+
},
|
| 236 |
+
|
| 237 |
+
# Scenario 4: Multiple interacting issues — typo + missing env
|
| 238 |
+
{
|
| 239 |
+
"id": "multiple_issues",
|
| 240 |
+
"files": [
|
| 241 |
+
{
|
| 242 |
+
"path": ".github/workflows/build.yml",
|
| 243 |
+
"type": "workflow",
|
| 244 |
+
"content": (
|
| 245 |
+
"name: Full Pipeline\n"
|
| 246 |
+
"on: push\n"
|
| 247 |
+
"\n"
|
| 248 |
+
"jobs:\n"
|
| 249 |
+
" build:\n"
|
| 250 |
+
" runs-on: ubuntu-latest\n"
|
| 251 |
+
" steps:\n"
|
| 252 |
+
" - uses: actions/checkout@v4\n"
|
| 253 |
+
" - name: Login\n"
|
| 254 |
+
" run: echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin\n"
|
| 255 |
+
" - name: Build and Push\n"
|
| 256 |
+
" run: |\n"
|
| 257 |
+
" docker build -t myuser/myapp:latest .\n"
|
| 258 |
+
" docker push myuser/myapp:latest"
|
| 259 |
+
),
|
| 260 |
+
},
|
| 261 |
+
{
|
| 262 |
+
"path": "Dockerfile",
|
| 263 |
+
"type": "dockerfile",
|
| 264 |
+
"content": (
|
| 265 |
+
"FROM python:3.9-slim AS builder\n"
|
| 266 |
+
"WORKDIR /app\n"
|
| 267 |
+
"COPY requirments.txt .\n"
|
| 268 |
+
"RUN pip install -r requirements.txt\n"
|
| 269 |
+
"COPY . .\n"
|
| 270 |
+
"\n"
|
| 271 |
+
"FROM python:3.9-slim\n"
|
| 272 |
+
"WORKDIR /app\n"
|
| 273 |
+
"COPY --from=builder /app .\n"
|
| 274 |
+
'CMD ["python", "app.py"]'
|
| 275 |
+
),
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"path": "requirements.txt",
|
| 279 |
+
"type": "requirements",
|
| 280 |
+
"content": "flask==2.0.0",
|
| 281 |
+
},
|
| 282 |
+
],
|
| 283 |
+
"error": {
|
| 284 |
+
"phase": "docker_build",
|
| 285 |
+
"message": (
|
| 286 |
+
"COPY failed: file not found in build context: requirments.txt\n"
|
| 287 |
+
"Additionally: Error: Cannot perform an interactive login from a non TTY device"
|
| 288 |
+
),
|
| 289 |
+
"exit_code": 1,
|
| 290 |
+
},
|
| 291 |
+
"expected_fixes": [
|
| 292 |
+
{
|
| 293 |
+
"file": "Dockerfile",
|
| 294 |
+
"type": "contains",
|
| 295 |
+
"expected": "COPY requirements.txt",
|
| 296 |
+
"hint": "Fix typo in requirements filename",
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"file": ".github/workflows/build.yml",
|
| 300 |
+
"type": "contains",
|
| 301 |
+
"expected": "DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}",
|
| 302 |
+
"hint": "Add env block for Docker secrets",
|
| 303 |
+
},
|
| 304 |
+
{
|
| 305 |
+
"file": ".github/workflows/build.yml",
|
| 306 |
+
"type": "contains",
|
| 307 |
+
"expected": "DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}",
|
| 308 |
+
"hint": "Add password to env block",
|
| 309 |
+
},
|
| 310 |
+
],
|
| 311 |
+
},
|
| 312 |
+
|
| 313 |
+
# Scenario 5: Matrix build with wrong node version causing build failure
|
| 314 |
+
{
|
| 315 |
+
"id": "matrix_version_failure",
|
| 316 |
+
"files": [
|
| 317 |
+
{
|
| 318 |
+
"path": ".github/workflows/ci.yml",
|
| 319 |
+
"type": "workflow",
|
| 320 |
+
"content": (
|
| 321 |
+
"name: CI Matrix\n"
|
| 322 |
+
"on: push\n"
|
| 323 |
+
"\n"
|
| 324 |
+
"jobs:\n"
|
| 325 |
+
" test:\n"
|
| 326 |
+
" runs-on: ubuntu-latest\n"
|
| 327 |
+
" strategy:\n"
|
| 328 |
+
" matrix:\n"
|
| 329 |
+
" node: [14, 16, 18]\n"
|
| 330 |
+
" steps:\n"
|
| 331 |
+
" - uses: actions/checkout@v4\n"
|
| 332 |
+
" - name: Use Node.js\n"
|
| 333 |
+
" uses: actions/setup-node@v4\n"
|
| 334 |
+
" with:\n"
|
| 335 |
+
" node-version: ${{ matrix.node }}\n"
|
| 336 |
+
" - run: npm ci\n"
|
| 337 |
+
" - run: npm test\n"
|
| 338 |
+
"\n"
|
| 339 |
+
" docker:\n"
|
| 340 |
+
" runs-on: ubuntu-latest\n"
|
| 341 |
+
" steps:\n"
|
| 342 |
+
" - uses: actions/checkout@v4\n"
|
| 343 |
+
" - name: Build Docker\n"
|
| 344 |
+
" run: docker build -t myapp ."
|
| 345 |
+
),
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"path": "Dockerfile",
|
| 349 |
+
"type": "dockerfile",
|
| 350 |
+
"content": (
|
| 351 |
+
"FROM node:18-alpine\n"
|
| 352 |
+
"WORKDIR /app\n"
|
| 353 |
+
"COPY package*.json ./\n"
|
| 354 |
+
"RUN npm ci\n"
|
| 355 |
+
"COPY . .\n"
|
| 356 |
+
"RUN npm run build\n"
|
| 357 |
+
"EXPOSE 3000\n"
|
| 358 |
+
'CMD ["npm", "start"]'
|
| 359 |
+
),
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"path": "package.json",
|
| 363 |
+
"type": "other",
|
| 364 |
+
"content": (
|
| 365 |
+
'{"name": "app", "engines": {"node": ">=16"}, '
|
| 366 |
+
'"scripts": {"build": "echo ok", "start": "node index.js", "test": "echo ok"}}'
|
| 367 |
+
),
|
| 368 |
+
},
|
| 369 |
+
],
|
| 370 |
+
"error": {
|
| 371 |
+
"phase": "test",
|
| 372 |
+
"message": (
|
| 373 |
+
"Matrix job (node: 14) failed: npm ci requires Node.js >= 16. "
|
| 374 |
+
"Docker job needs 'needs: test' to wait for CI matrix."
|
| 375 |
+
),
|
| 376 |
+
"exit_code": 1,
|
| 377 |
+
"failed_step": "npm ci",
|
| 378 |
+
},
|
| 379 |
+
"expected_fixes": [
|
| 380 |
+
{
|
| 381 |
+
"file": ".github/workflows/ci.yml",
|
| 382 |
+
"type": "not_contains",
|
| 383 |
+
"expected": "14",
|
| 384 |
+
"hint": "Remove Node 14 from the matrix β package.json requires Node >= 16",
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"file": ".github/workflows/ci.yml",
|
| 388 |
+
"type": "contains",
|
| 389 |
+
"expected": "needs: test",
|
| 390 |
+
"hint": "Docker build job should depend on test job with 'needs: test'",
|
| 391 |
+
},
|
| 392 |
+
],
|
| 393 |
+
},
|
| 394 |
+
]
|
server/utils/yaml_parser.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Safe YAML parsing utilities for workflow validation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Optional, Tuple
|
| 6 |
+
|
| 7 |
+
import yaml
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def safe_parse_yaml(content: str) -> Tuple[Optional[Any], Optional[str]]:
    """Load a YAML document with ``yaml.safe_load`` without raising.

    Args:
        content: Raw YAML text to parse.

    Returns:
        A ``(parsed, error_message)`` pair. When loading succeeds the pair is
        ``(document, None)``; when ``yaml.YAMLError`` is raised the pair is
        ``(None, str(exc))``.
    """
    try:
        document = yaml.safe_load(content)
    except yaml.YAMLError as parse_failure:
        # Hand the failure back as text so callers never have to catch it.
        return None, str(parse_failure)
    else:
        return document, None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def is_valid_workflow(content: str) -> Tuple[bool, Optional[str]]:
    """Check whether *content* is a structurally valid GitHub Actions workflow.

    The checks run in order and the first failure is reported:

    1. the text parses as YAML,
    2. the document root is a mapping,
    3. the root has a ``jobs`` key,
    4. ``jobs`` is a non-empty mapping,
    5. every job is a mapping that contains ``runs-on``.

    Args:
        content: Raw workflow YAML text.

    Returns:
        ``(True, None)`` when every check passes, otherwise
        ``(False, message)`` describing the first failed check.
    """
    parsed, err = safe_parse_yaml(content)
    # safe_parse_yaml signals success with ``None``. Compare against ``None``
    # rather than truthiness so a parse failure whose message is empty is still
    # reported as a parse error instead of falling through to the root check.
    if err is not None:
        return False, f"YAML parse error: {err}"
    if not isinstance(parsed, dict):
        return False, "Workflow root must be a mapping"
    if "jobs" not in parsed:
        return False, "Workflow must define 'jobs'"
    jobs = parsed.get("jobs")
    if not isinstance(jobs, dict) or not jobs:
        return False, "Workflow must define at least one job"
    for job_name, job in jobs.items():
        if not isinstance(job, dict):
            return False, f"Job '{job_name}' must be a mapping"
        if "runs-on" not in job:
            return False, f"Job '{job_name}' is missing 'runs-on'"
    return True, None
|
tests/test_determinism.py
CHANGED
|
@@ -1,24 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from server.environment import CICDDebugEnvironment
|
| 2 |
from server.graders import run_grader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def test_reset_deterministic_with_seed():
|
|
|
|
| 6 |
env1 = CICDDebugEnvironment()
|
| 7 |
env2 = CICDDebugEnvironment()
|
| 8 |
|
| 9 |
-
obs1 = env1.reset(seed=
|
| 10 |
-
obs2 = env2.reset(seed=
|
| 11 |
|
| 12 |
assert obs1.task_id == obs2.task_id
|
| 13 |
assert obs1.error.error_message == obs2.error.error_message
|
| 14 |
assert [f.path for f in obs1.files] == [f.path for f in obs2.files]
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
def test_grader_deterministic_same_trajectory():
|
|
|
|
| 18 |
trajectory = [
|
| 19 |
{
|
| 20 |
"step": 1,
|
| 21 |
-
"action": {"action_type": "
|
| 22 |
"reward": 0.3,
|
| 23 |
"done": False,
|
| 24 |
"info": {"issues_fixed": 1, "issues_total": 2},
|
|
@@ -31,7 +47,212 @@ def test_grader_deterministic_same_trajectory():
|
|
| 31 |
"info": {"issues_fixed": 1, "issues_total": 2},
|
| 32 |
},
|
| 33 |
]
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
assert
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Determinism and score range tests for grader and environment.
|
| 2 |
+
|
| 3 |
+
Day 7 deliverables:
|
| 4 |
+
- Same trajectory → same score (determinism)
|
| 5 |
+
- Score ranges match CONTEXT.md expectations
|
| 6 |
+
- Difficulty progression verified
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
from server.environment import CICDDebugEnvironment
|
| 10 |
from server.graders import run_grader
|
| 11 |
+
from server.models import Action, ActionType, FileEdit
|
| 12 |
+
from server.tasks.task_registry import TASK_REGISTRY
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# ── Determinism Tests ──────────────────────────────────────────────
|
| 16 |
|
| 17 |
|
| 18 |
def test_reset_deterministic_with_seed():
    """Two fresh environments reset with seed 42 must produce matching observations.

    Compares the task id, the error message, and the path and content of every
    file in the two observations.
    """
    first_env = CICDDebugEnvironment()
    second_env = CICDDebugEnvironment()

    first_obs = first_env.reset(seed=42)
    second_obs = second_env.reset(seed=42)

    assert first_obs.task_id == second_obs.task_id
    assert first_obs.error.error_message == second_obs.error.error_message

    first_paths = [entry.path for entry in first_obs.files]
    second_paths = [entry.path for entry in second_obs.files]
    assert first_paths == second_paths

    first_contents = [entry.content for entry in first_obs.files]
    second_contents = [entry.content for entry in second_obs.files]
    assert first_contents == second_contents
|
| 30 |
|
| 31 |
|
| 32 |
def test_grader_deterministic_same_trajectory():
|
| 33 |
+
"""Identical trajectory → identical score and breakdown."""
|
| 34 |
trajectory = [
|
| 35 |
{
|
| 36 |
"step": 1,
|
| 37 |
+
"action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
|
| 38 |
"reward": 0.3,
|
| 39 |
"done": False,
|
| 40 |
"info": {"issues_fixed": 1, "issues_total": 2},
|
|
|
|
| 47 |
"info": {"issues_fixed": 1, "issues_total": 2},
|
| 48 |
},
|
| 49 |
]
|
| 50 |
+
results = [run_grader("dockerfile_syntax", trajectory) for _ in range(10)]
|
| 51 |
+
scores = [r.score for r in results]
|
| 52 |
+
assert len(set(scores)) == 1, f"Non-deterministic scores: {scores}"
|
| 53 |
+
breakdowns = [tuple(sorted(r.breakdown.items())) for r in results]
|
| 54 |
+
assert len(set(breakdowns)) == 1
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def test_grader_deterministic_across_tasks():
    """Grading one fixed trajectory under every registered task id yields one score.

    The trajectory is a single successful edit step; the test expects the
    grader's score not to depend on which task id it is given.
    """
    only_step = {
        "step": 1,
        "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
        "reward": 0.3,
        "done": True,
        "info": {"issues_fixed": 1, "issues_total": 1},
    }
    trajectory = [only_step]

    # A set collapses equal scores, so exactly one element means full agreement.
    distinct_scores = {run_grader(task_id, trajectory).score for task_id in TASK_REGISTRY}
    assert len(distinct_scores) == 1, f"Different scores across tasks: {distinct_scores}"
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def test_full_episode_determinism():
    """Replaying the same one-step episode five times gives one score.

    Each replay uses a brand-new environment, resets it to the
    ``dockerfile_syntax`` / ``typo_filename`` scenario, applies the same
    Dockerfile edit, and grades the recorded trajectory.
    """

    def play_episode():
        # Fresh environment per replay so no state carries over between runs.
        env = CICDDebugEnvironment()
        env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename")
        fix = FileEdit(
            file_path="Dockerfile",
            old_content="COPY requirments.txt .",
            new_content="COPY requirements.txt .",
        )
        env.step(Action(action_type=ActionType.EDIT_FILE, edits=[fix]))
        return run_grader("dockerfile_syntax", env.trajectory).score

    scores = [play_episode() for _ in range(5)]
    assert len(set(scores)) == 1, f"Non-deterministic episode scores: {scores}"
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ── Score Range Tests ──────────────────────────────────────────────
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_empty_trajectory_scores_zero():
    """An empty trajectory grades to score 0.0 with zero steps taken."""
    graded = run_grader("dockerfile_syntax", [])
    assert graded.score == 0.0
    assert graded.steps_taken == 0
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def test_zero_fixes_scores_zero():
    """A finished episode that fixed 0 of 2 issues grades to exactly 0.0."""
    fruitless_edit = {
        "step": 1,
        "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
        "reward": 0.0,
        "done": True,
        "info": {"issues_fixed": 0, "issues_total": 2},
    }
    graded = run_grader("dockerfile_syntax", [fruitless_edit])
    assert graded.score == 0.0
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def test_partial_fix_scores_moderate():
    """Fixing 1 of 2 issues and then submitting scores within [0.3, 0.6]."""
    edit_step = {
        "step": 1,
        "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
        "reward": 0.3,
        "done": False,
        "info": {"issues_fixed": 1, "issues_total": 2},
    }
    submit_step = {
        "step": 2,
        "action": {"action_type": "submit"},
        "reward": 0.0,
        "done": True,
        "info": {"issues_fixed": 1, "issues_total": 2},
    }
    score = run_grader("dockerfile_syntax", [edit_step, submit_step]).score
    assert 0.3 <= score <= 0.6, f"Partial fix score {score} out of range"
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def test_complete_fix_scores_high():
    """Fixing both issues in two edit steps scores at least 0.85."""
    first_edit = {
        "step": 1,
        "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
        "reward": 0.3,
        "done": False,
        "info": {"issues_fixed": 1, "issues_total": 2},
    }
    second_edit = {
        "step": 2,
        "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
        "reward": 0.3,
        "done": True,
        "info": {"issues_fixed": 2, "issues_total": 2},
    }
    score = run_grader("dockerfile_syntax", [first_edit, second_edit]).score
    assert score >= 0.85, f"Complete fix score {score} too low"
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def test_perfect_score_achievable():
    """A single issue fixed in a single step grades to exactly 1.0."""
    winning_step = {
        "step": 1,
        "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
        "reward": 0.3,
        "done": True,
        "info": {"issues_fixed": 1, "issues_total": 1},
    }
    score = run_grader("dockerfile_syntax", [winning_step]).score
    assert score == 1.0, f"Perfect scenario scored {score}, not 1.0"
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def test_hint_penalty_applied():
    """Requesting a hint before the fix must lower the final score.

    Compares a one-step fix against the same fix preceded by a hint request.
    The hinted run must score strictly lower, and the gap must lie within
    0.05 of 0.08 (i.e. strictly between 0.03 and 0.13), which leaves room for
    the hint penalty plus the efficiency cost of the extra step.
    """

    def fixing_edit(step_number):
        # Build a fresh dict per trajectory so the two runs share no objects.
        return {
            "step": step_number,
            "action": {"action_type": "edit_file", "edits": [{"file_path": "Dockerfile"}]},
            "reward": 0.3,
            "done": True,
            "info": {"issues_fixed": 1, "issues_total": 1},
        }

    hint_request = {
        "step": 1,
        "action": {"action_type": "request_hint"},
        "reward": -0.05,
        "done": False,
        "info": {"issues_fixed": 0, "issues_total": 1},
    }
    base_traj = [fixing_edit(1)]
    hint_traj = [hint_request, fixing_edit(2)]

    without_hint = run_grader("dockerfile_syntax", base_traj)
    with_hint = run_grader("dockerfile_syntax", hint_traj)

    assert without_hint.score > with_hint.score
    gap = without_hint.score - with_hint.score
    assert abs(gap - 0.08) < 0.05  # ~0.05 hint + efficiency decay
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def test_score_always_in_0_1_range():
    """Scores stay inside [0.0, 1.0] for several degenerate trajectories.

    Covers an empty trajectory, an immediate submit with nothing fixed, and
    ten consecutive hint requests.
    """
    no_steps = []
    immediate_submit = [
        {
            "step": 1,
            "action": {"action_type": "submit"},
            "reward": 0.0,
            "done": True,
            "info": {"issues_fixed": 0, "issues_total": 5},
        }
    ]
    # Many hints: accumulated penalties could potentially push the score negative.
    hint_spam = [
        {
            "step": number,
            "action": {"action_type": "request_hint"},
            "reward": -0.05,
            "done": number == 10,
            "info": {"issues_fixed": 0, "issues_total": 1},
        }
        for number in range(1, 11)
    ]

    for traj in (no_steps, immediate_submit, hint_spam):
        score = run_grader("dockerfile_syntax", traj).score
        assert 0.0 <= score <= 1.0, f"Score {score} out of [0, 1] range"
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# ── Difficulty Progression Tests ───────────────────────────────────
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def test_difficulty_progression():
    """Each known task declares the difficulty tier expected for it.

    Looks up every task id in the expected mapping below in ``TASK_REGISTRY``
    and compares its ``DIFFICULTY.value`` against the expected tier string.
    A task id missing from the registry surfaces as a ``KeyError``.

    The previous version also built a ``difficulties`` list that was never
    read; that dead local has been removed.
    """
    expected_order = {
        "dockerfile_syntax": "easy",
        "dockerfile_runtime": "medium",
        "workflow_syntax_structure": "easy",
        "workflow_secrets_permissions": "medium",
        "ci_docker_integration": "medium",
        "multi_stage_pipeline_matrix": "hard",
    }
    for task_id, expected_diff in expected_order.items():
        actual = TASK_REGISTRY[task_id].DIFFICULTY.value
        assert actual == expected_diff, f"{task_id}: expected {expected_diff}, got {actual}"
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def test_hard_tasks_have_more_issues():
    """No hard scenario may have fewer expected fixes than any easy scenario.

    Gathers ``len(scenario["expected_fixes"])`` for every scenario of every
    easy and hard task, then requires the smallest hard count to be at least
    the largest easy count. Medium tasks are ignored. With no hard scenarios
    the minimum stays at infinity, so the check passes.
    """
    counts_by_tier = {"easy": [], "hard": []}

    for task_cls in TASK_REGISTRY.values():
        task = task_cls()
        for scenario in task.SCENARIOS:
            n_fixes = len(scenario["expected_fixes"])
            bucket = counts_by_tier.get(task.DIFFICULTY.value)
            if bucket is not None:
                bucket.append(n_fixes)

    easy_max_issues = max(counts_by_tier["easy"], default=0)
    hard_min_issues = min(counts_by_tier["hard"], default=float("inf"))

    # Every hard scenario must carry at least as many issues as the largest easy one.
    assert hard_min_issues >= easy_max_issues, (
        f"Hard tasks ({hard_min_issues} min issues) should have >= issues than easy ({easy_max_issues} max)"
    )
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def test_all_tasks_have_minimum_scenarios():
    """Every registered task class defines at least four scenarios."""
    for task_id, task_cls in TASK_REGISTRY.items():
        scenario_count = len(task_cls.SCENARIOS)
        assert scenario_count >= 4, f"{task_id} has only {scenario_count} scenarios (need >= 4)"
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def test_scenario_ids_unique():
    """Within each registered task, no two scenarios share an ``id``."""
    for task_id, task_cls in TASK_REGISTRY.items():
        ids = [scenario["id"] for scenario in task_cls.SCENARIOS]
        # Deduplicating through a set shrinks the collection only if an id repeats.
        unique_ids = set(ids)
        assert len(unique_ids) == len(ids), f"{task_id} has duplicate scenario IDs: {ids}"
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def test_all_scenarios_have_required_fields():
    """Every scenario carries ``id``, ``files``, ``error`` and ``expected_fixes``.

    Additionally requires ``files`` and ``expected_fixes`` to be non-empty.
    """
    keyed_by_scenario = ("files", "error", "expected_fixes")
    must_be_non_empty = (("files", "no files"), ("expected_fixes", "no expected_fixes"))

    for task_id, task_cls in TASK_REGISTRY.items():
        for scenario in task_cls.SCENARIOS:
            assert "id" in scenario, f"{task_id}: scenario missing 'id'"
            for key in keyed_by_scenario:
                assert key in scenario, f"{task_id}/{scenario.get('id')}: missing '{key}'"
            for key, complaint in must_be_non_empty:
                assert len(scenario[key]) >= 1, f"{task_id}/{scenario['id']}: {complaint}"
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
# ── End-to-End Score Verification ──────────────────────────────────
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def test_end_to_end_grading_all_tasks():
    """Every task/scenario can be reset and graded right after the reset.

    After each reset the observation must report at least one issue and none
    fixed, and grading the environment's trajectory at that point must give
    0.0. No fix is applied here.
    """
    env = CICDDebugEnvironment()
    for task_id, task_cls in TASK_REGISTRY.items():
        for scenario in task_cls().SCENARIOS:
            observation = env.reset(task_id=task_id, scenario_id=scenario["id"])
            assert observation.total_issues >= 1
            assert observation.issues_fixed == 0

            # Only checks that the grader copes with the trajectory as it
            # stands immediately after reset (no steps taken yet).
            graded = run_grader(task_id, env.trajectory)
            assert graded.score == 0.0
|