Commit ·
85b7ac8
0
Parent(s):
init
Browse files- .gitignore +39 -0
- Dockerfile +15 -0
- IMPLEMENTATION_PLAN.md +2789 -0
- README.md +26 -0
- baseline_runner.py +40 -0
- inference.py +9 -0
- openenv.yaml +86 -0
- requirements.txt +0 -0
- sample_inf_script.py +255 -0
- sample_val_script.txt +185 -0
- server/__init__.py +1 -0
- server/environment.py +384 -0
- server/graders/__init__.py +96 -0
- server/graders/base.py +1 -0
- server/main.py +156 -0
- server/models.py +172 -0
- server/simulators/__init__.py +1 -0
- server/simulators/docker_simulator.py +146 -0
- server/simulators/workflow_simulator.py +95 -0
- server/tasks/__init__.py +1 -0
- server/tasks/base.py +18 -0
- server/tasks/task_1_build_errors.py +48 -0
- server/tasks/task_2_docker_runtime.py +47 -0
- server/tasks/task_2_workflow_config.py +52 -0
- server/tasks/task_3_multi_stage.py +44 -0
- server/tasks/task_3_workflow_syntax.py +47 -0
- server/tasks/task_4_workflow_secrets_permissions.py +52 -0
- server/tasks/task_5_ci_docker_integration.py +47 -0
- server/tasks/task_6_multi_stage_matrix.py +44 -0
- server/tasks/task_registry.py +28 -0
- server/utils/__init__.py +1 -0
- smoke_test.py +233 -0
- tests/test_determinism.py +37 -0
- tests/test_endpoints.py +27 -0
- tests/test_environment_flow.py +37 -0
- tests/test_simulators.py +50 -0
.gitignore
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python bytecode and caches
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# Virtual environments
|
| 7 |
+
.venv/
|
| 8 |
+
venv/
|
| 9 |
+
env/
|
| 10 |
+
|
| 11 |
+
# Test and tooling caches
|
| 12 |
+
.pytest_cache/
|
| 13 |
+
.mypy_cache/
|
| 14 |
+
.ruff_cache/
|
| 15 |
+
.coverage
|
| 16 |
+
.coverage.*
|
| 17 |
+
htmlcov/
|
| 18 |
+
|
| 19 |
+
# Build and packaging
|
| 20 |
+
build/
|
| 21 |
+
dist/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.eggs/
|
| 24 |
+
|
| 25 |
+
# Logs
|
| 26 |
+
*.log
|
| 27 |
+
|
| 28 |
+
# Environment and secrets
|
| 29 |
+
.env
|
| 30 |
+
.env.*
|
| 31 |
+
!.env.example
|
| 32 |
+
|
| 33 |
+
# IDE and editor
|
| 34 |
+
.vscode/
|
| 35 |
+
.idea/
|
| 36 |
+
|
| 37 |
+
# OS files
|
| 38 |
+
.DS_Store
|
| 39 |
+
Thumbs.db
|
Dockerfile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
COPY server/ ./server/
|
| 9 |
+
COPY openenv.yaml .
|
| 10 |
+
COPY inference.py .
|
| 11 |
+
COPY baseline_runner.py .
|
| 12 |
+
|
| 13 |
+
EXPOSE 7860
|
| 14 |
+
|
| 15 |
+
CMD ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
IMPLEMENTATION_PLAN.md
ADDED
|
@@ -0,0 +1,2789 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🏗️ CI/CD Infrastructure Debugging Environment
|
| 2 |
+
## Complete Implementation Plan
|
| 3 |
+
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# 📋 TABLE OF CONTENTS
|
| 7 |
+
|
| 8 |
+
1. [Executive Summary](#1-executive-summary)
|
| 9 |
+
2. [Scoring Strategy](#2-scoring-strategy)
|
| 10 |
+
3. [Project Structure](#3-project-structure)
|
| 11 |
+
4. [OpenEnv Spec Compliance](#4-openenv-spec-compliance)
|
| 12 |
+
5. [Environment Design](#5-environment-design)
|
| 13 |
+
6. [Task Design (6 Tasks)](#6-task-design)
|
| 14 |
+
7. [Grader Implementation](#7-grader-implementation)
|
| 15 |
+
8. [Reward Function Design](#8-reward-function-design)
|
| 16 |
+
9. [Baseline Inference Script](#9-baseline-inference-script)
|
| 17 |
+
10. [Dockerfile & Deployment](#10-dockerfile--deployment)
|
| 18 |
+
11. [Testing Plan](#11-testing-plan)
|
| 19 |
+
12. [Timeline & Milestones](#12-timeline--milestones)
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
# 1. EXECUTIVE SUMMARY
|
| 24 |
+
|
| 25 |
+
## Environment Name
|
| 26 |
+
**`cicd-debug-env`** — CI/CD Infrastructure Debugging Environment
|
| 27 |
+
|
| 28 |
+
## Concept
|
| 29 |
+
An OpenEnv-compliant environment where AI agents debug broken GitHub Actions workflows that build and deploy Docker containers. The agent receives error logs, workflow files, and Dockerfiles, then must identify and fix the issues.
|
| 30 |
+
|
| 31 |
+
## Why This Wins
|
| 32 |
+
|
| 33 |
+
| Criteria | Weight | Our Score | Why |
|
| 34 |
+
|----------|--------|-----------|-----|
|
| 35 |
+
| Real-world utility | 30% | 28-30 | Every developer uses Docker + CI/CD daily |
|
| 36 |
+
| Task & grader quality | 25% | 23-25 | Deterministic + dynamic scoring, 6-task progression |
|
| 37 |
+
| Environment design | 20% | 18-20 | Clean state, rich observations, dense rewards |
|
| 38 |
+
| Code quality & spec | 15% | 15 | Full OpenEnv compliance, clean code |
|
| 39 |
+
| Creativity & novelty | 10% | 10 | First CI/CD debugging env on OpenEnv |
|
| 40 |
+
| **TOTAL** | 100% | **94-100** | |
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
# 2. SCORING STRATEGY
|
| 45 |
+
|
| 46 |
+
## Phase 1: Automated Validation (Pass/Fail Gate)
|
| 47 |
+
We MUST pass all of these or we're disqualified:
|
| 48 |
+
|
| 49 |
+
| Check | How We Pass |
|
| 50 |
+
|-------|-------------|
|
| 51 |
+
| HF Space deploys | FastAPI server with health checks, proper port binding |
|
| 52 |
+
| OpenEnv spec compliance | `openenv.yaml` + typed Pydantic models + all 7 endpoints |
|
| 53 |
+
| Dockerfile builds | Multi-stage build, pinned versions, no external deps |
|
| 54 |
+
| Baseline reproduces | `inference.py` using OpenAI client, runs in <20min |
|
| 55 |
+
| 3+ tasks with graders | 6 tasks with deterministic 0.0-1.0 graders |
|
| 56 |
+
|
| 57 |
+
## Phase 2: Agentic Evaluation (Nemotron 3 Super)
|
| 58 |
+
Optimize for Nemotron's strengths:
|
| 59 |
+
- **Structured output**: YAML/Dockerfile are structured formats ✓
|
| 60 |
+
- **Multi-step reasoning**: Debug → Identify → Fix → Verify ✓
|
| 61 |
+
- **Tool calling patterns**: Action space maps to tool calls ✓
|
| 62 |
+
- **Long context**: Can include full workflow + Dockerfile + error logs ✓
|
| 63 |
+
|
| 64 |
+
## Phase 3: Human Review (Meta/HF Engineers)
|
| 65 |
+
Appeal to judges:
|
| 66 |
+
- **Real-world utility**: They debug CI/CD daily
|
| 67 |
+
- **Meta-relevance**: The hackathon requires Docker, and we're debugging Docker
|
| 68 |
+
- **Clever mechanics**: Progressive hints, partial credit, multi-file fixes
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
# 3. PROJECT STRUCTURE
|
| 73 |
+
|
| 74 |
+
```
|
| 75 |
+
cicd-debug-env/
|
| 76 |
+
├── openenv.yaml # OpenEnv metadata (REQUIRED)
|
| 77 |
+
├── inference.py # Baseline inference script (REQUIRED)
|
| 78 |
+
├── Dockerfile # Container definition (REQUIRED)
|
| 79 |
+
├── requirements.txt # Python dependencies
|
| 80 |
+
├── README.md # Documentation
|
| 81 |
+
│
|
| 82 |
+
├── server/
|
| 83 |
+
│ ├── __init__.py
|
| 84 |
+
│ ├── main.py # FastAPI application with all endpoints
|
| 85 |
+
│ ├── models.py # Pydantic models (Observation, Action, etc.)
|
| 86 |
+
│ ├── environment.py # Core environment logic
|
| 87 |
+
│ ├── tasks/
|
| 88 |
+
│ │ ├── __init__.py
|
| 89 |
+
│ │ ├── base.py # Base task class
|
| 90 |
+
│ │ ├── task_registry.py # Task registration
|
| 91 |
+
│ │ ├── task_1_build_errors.py # Easy: Dockerfile syntax
|
| 92 |
+
│ │ ├── task_2_docker_runtime.py # Medium: Docker runtime
|
| 93 |
+
│ │ ├── task_3_workflow_syntax.py # Easy: Workflow syntax/structure
|
| 94 |
+
│ │ ├── task_4_workflow_secrets_permissions.py # Medium: Secrets/permissions
|
| 95 |
+
│ │ ├── task_5_ci_docker_integration.py # Medium-Hard: Combined CI+Docker
|
| 96 |
+
│ │ └── task_6_multi_stage_matrix.py # Hard: Multi-stage + matrix
|
| 97 |
+
│ ├── graders/
|
| 98 |
+
│ │ ├── __init__.py
|
| 99 |
+
│ │ ├── base.py # Base grader class
|
| 100 |
+
│ │ ├── dockerfile_grader.py # Dockerfile validation
|
| 101 |
+
│ │ ├── workflow_grader.py # GitHub Actions validation
|
| 102 |
+
│ │ └── integration_grader.py # Full pipeline validation
|
| 103 |
+
│ ├── simulators/
|
| 104 |
+
│ │ ├── __init__.py
|
| 105 |
+
│ │ ├── docker_simulator.py # Simulates docker build
|
| 106 |
+
│ │ └── workflow_simulator.py # Simulates GHA execution
|
| 107 |
+
│ └── utils/
|
| 108 |
+
│ ├── __init__.py
|
| 109 |
+
│ ├── yaml_parser.py # Safe YAML parsing
|
| 110 |
+
│ └── error_generator.py # Generates realistic errors
|
| 111 |
+
│
|
| 112 |
+
├── data/
|
| 113 |
+
│ ├── scenarios/ # Pre-built debugging scenarios
|
| 114 |
+
│ │ ├── easy/
|
| 115 |
+
│ │ ├── medium/
|
| 116 |
+
│ │ └── hard/
|
| 117 |
+
│ └── templates/ # Base templates for generation
|
| 118 |
+
│
|
| 119 |
+
└── tests/
|
| 120 |
+
├── test_endpoints.py # API endpoint tests
|
| 121 |
+
├── test_graders.py # Grader correctness tests
|
| 122 |
+
├── test_tasks.py # Task validation tests
|
| 123 |
+
└── test_determinism.py # Reproducibility tests
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
---
|
| 127 |
+
|
| 128 |
+
# 4. OPENENV SPEC COMPLIANCE
|
| 129 |
+
|
| 130 |
+
## 4.1 openenv.yaml
|
| 131 |
+
|
| 132 |
+
```yaml
|
| 133 |
+
name: cicd-debug-env
|
| 134 |
+
version: "1.0.0"
|
| 135 |
+
description: >
|
| 136 |
+
Debug broken GitHub Actions workflows and Dockerfiles.
|
| 137 |
+
AI agents identify and fix CI/CD infrastructure issues.
|
| 138 |
+
|
| 139 |
+
author: Krishna
|
| 140 |
+
license: MIT
|
| 141 |
+
tags:
|
| 142 |
+
- devops
|
| 143 |
+
- docker
|
| 144 |
+
- github-actions
|
| 145 |
+
- debugging
|
| 146 |
+
- infrastructure
|
| 147 |
+
|
| 148 |
+
environment:
|
| 149 |
+
type: text
|
| 150 |
+
observation_space: structured
|
| 151 |
+
action_space: structured
|
| 152 |
+
max_steps: 10
|
| 153 |
+
|
| 154 |
+
tasks:
|
| 155 |
+
- id: dockerfile_syntax
|
| 156 |
+
name: "Dockerfile Syntax Errors"
|
| 157 |
+
description: "Fix syntax and instruction errors in Dockerfiles"
|
| 158 |
+
difficulty: easy
|
| 159 |
+
|
| 160 |
+
- id: workflow_secrets_permissions
|
| 161 |
+
name: "Workflow Secrets and Permissions"
|
| 162 |
+
description: "Fix secret wiring, env usage, and permissions in workflows"
|
| 163 |
+
difficulty: medium
|
| 164 |
+
|
| 165 |
+
- id: ci_docker_integration
|
| 166 |
+
name: "CI and Docker Build Integration"
|
| 167 |
+
description: "Debug combined workflow and Docker build integration failures"
|
| 168 |
+
difficulty: medium
|
| 169 |
+
|
| 170 |
+
- id: multi_stage_pipeline_matrix
|
| 171 |
+
name: "Multi-Stage Pipeline and Matrix"
|
| 172 |
+
description: "Debug complex multi-stage and matrix CI/CD pipelines"
|
| 173 |
+
difficulty: hard
|
| 174 |
+
|
| 175 |
+
graders:
|
| 176 |
+
dockerfile_syntax:
|
| 177 |
+
type: deterministic
|
| 178 |
+
score_range: [0.0, 1.0]
|
| 179 |
+
workflow_secrets_permissions:
|
| 180 |
+
type: deterministic
|
| 181 |
+
score_range: [0.0, 1.0]
|
| 182 |
+
ci_docker_integration:
|
| 183 |
+
type: deterministic
|
| 184 |
+
score_range: [0.0, 1.0]
|
| 185 |
+
multi_stage_pipeline_matrix:
|
| 186 |
+
type: deterministic
|
| 187 |
+
score_range: [0.0, 1.0]
|
| 188 |
+
|
| 189 |
+
baseline:
|
| 190 |
+
script: inference.py
|
| 191 |
+
expected_scores:
|
| 192 |
+
dockerfile_syntax: 0.7
|
| 193 |
+
workflow_secrets_permissions: 0.5
|
| 194 |
+
ci_docker_integration: 0.45
|
| 195 |
+
multi_stage_pipeline_matrix: 0.3
|
| 196 |
+
|
| 197 |
+
resources:
|
| 198 |
+
vcpu: 2
|
| 199 |
+
memory: 8gb
|
| 200 |
+
timeout: 1200 # 20 minutes max
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
## 4.2 Pydantic Models (server/models.py)
|
| 204 |
+
|
| 205 |
+
```python
|
| 206 |
+
"""
|
| 207 |
+
Typed Pydantic models for OpenEnv compliance.
|
| 208 |
+
All models must be serializable and well-documented.
|
| 209 |
+
"""
|
| 210 |
+
|
| 211 |
+
from typing import List, Dict, Optional, Literal, Any
|
| 212 |
+
from pydantic import BaseModel, Field
|
| 213 |
+
from enum import Enum
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
# ============== ENUMS ==============
|
| 217 |
+
|
| 218 |
+
class TaskDifficulty(str, Enum):
|
| 219 |
+
EASY = "easy"
|
| 220 |
+
MEDIUM = "medium"
|
| 221 |
+
HARD = "hard"
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
class ActionType(str, Enum):
|
| 225 |
+
EDIT_FILE = "edit_file"
|
| 226 |
+
ADD_LINE = "add_line"
|
| 227 |
+
DELETE_LINE = "delete_line"
|
| 228 |
+
REPLACE_LINE = "replace_line"
|
| 229 |
+
ADD_BLOCK = "add_block"
|
| 230 |
+
DELETE_BLOCK = "delete_block"
|
| 231 |
+
SUBMIT = "submit"
|
| 232 |
+
REQUEST_HINT = "request_hint"
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
class FileType(str, Enum):
|
| 236 |
+
DOCKERFILE = "dockerfile"
|
| 237 |
+
WORKFLOW = "workflow"
|
| 238 |
+
DOCKER_COMPOSE = "docker_compose"
|
| 239 |
+
REQUIREMENTS = "requirements"
|
| 240 |
+
OTHER = "other"
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
class ErrorPhase(str, Enum):
|
| 244 |
+
WORKFLOW_PARSE = "workflow_parse"
|
| 245 |
+
DOCKER_BUILD = "docker_build"
|
| 246 |
+
DOCKER_RUN = "docker_run"
|
| 247 |
+
TEST = "test"
|
| 248 |
+
PUSH = "push"
|
| 249 |
+
DEPLOY = "deploy"
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
# ============== OBSERVATION ==============
|
| 253 |
+
|
| 254 |
+
class FileContent(BaseModel):
|
| 255 |
+
"""Represents a file in the debugging scenario."""
|
| 256 |
+
path: str = Field(..., description="File path (e.g., 'Dockerfile', '.github/workflows/build.yml')")
|
| 257 |
+
content: str = Field(..., description="Current file content")
|
| 258 |
+
file_type: FileType = Field(..., description="Type of file")
|
| 259 |
+
line_count: int = Field(..., description="Number of lines in file")
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
class ErrorInfo(BaseModel):
|
| 263 |
+
"""Information about the CI/CD error."""
|
| 264 |
+
phase: ErrorPhase = Field(..., description="Phase where error occurred")
|
| 265 |
+
error_message: str = Field(..., description="The error message/log output")
|
| 266 |
+
exit_code: Optional[int] = Field(None, description="Exit code if applicable")
|
| 267 |
+
failed_step: Optional[str] = Field(None, description="Name of failed step/stage")
|
| 268 |
+
line_hint: Optional[int] = Field(None, description="Line number hint if available")
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
class Observation(BaseModel):
|
| 272 |
+
"""
|
| 273 |
+
Complete observation of the debugging environment state.
|
| 274 |
+
Provided to the agent at each step.
|
| 275 |
+
"""
|
| 276 |
+
# Task context
|
| 277 |
+
task_id: str = Field(..., description="Current task identifier")
|
| 278 |
+
task_description: str = Field(..., description="What needs to be fixed")
|
| 279 |
+
difficulty: TaskDifficulty = Field(..., description="Task difficulty level")
|
| 280 |
+
|
| 281 |
+
# Files to debug
|
| 282 |
+
files: List[FileContent] = Field(..., description="All files in the scenario")
|
| 283 |
+
|
| 284 |
+
# Error information
|
| 285 |
+
error: ErrorInfo = Field(..., description="Error that needs to be fixed")
|
| 286 |
+
|
| 287 |
+
# Build context (what's available in the CI environment)
|
| 288 |
+
available_secrets: List[str] = Field(default_factory=list, description="Available secret names")
|
| 289 |
+
runner_os: str = Field(default="ubuntu-latest", description="CI runner OS")
|
| 290 |
+
|
| 291 |
+
# Episode state
|
| 292 |
+
step_number: int = Field(..., description="Current step (1-indexed)")
|
| 293 |
+
max_steps: int = Field(..., description="Maximum allowed steps")
|
| 294 |
+
hints_used: int = Field(default=0, description="Number of hints requested")
|
| 295 |
+
hints_available: int = Field(default=3, description="Remaining hints")
|
| 296 |
+
|
| 297 |
+
# Previous action feedback
|
| 298 |
+
last_action_success: Optional[bool] = Field(None, description="Whether last action succeeded")
|
| 299 |
+
last_action_feedback: Optional[str] = Field(None, description="Feedback from last action")
|
| 300 |
+
|
| 301 |
+
# For partial credit tracking
|
| 302 |
+
issues_found: int = Field(default=0, description="Number of issues identified")
|
| 303 |
+
issues_fixed: int = Field(default=0, description="Number of issues fixed")
|
| 304 |
+
total_issues: int = Field(..., description="Total issues in this scenario")
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
# ============== ACTION ==============
|
| 308 |
+
|
| 309 |
+
class FileEdit(BaseModel):
    """A single edit to apply to a file.

    Which fields are consulted depends on the action type that carries the
    edit: line-oriented actions read ``line_number``, find/replace reads
    ``old_content``, and ``new_content`` supplies the text to write.
    """
    # Key used to look the file up among the scenario's files.
    file_path: str = Field(..., description="Path to the file to edit")
    # 1-indexed target line; optional because find/replace edits do not use it.
    line_number: Optional[int] = Field(None, description="Line number (1-indexed) for line operations")
    # Text to search for when the edit is a find/replace.
    old_content: Optional[str] = Field(None, description="Content to find/replace")
    # Replacement or inserted text; may contain embedded newlines.
    new_content: Optional[str] = Field(None, description="New content to insert/replace with")
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
class Action(BaseModel):
    """
    Action taken by the agent to fix the CI/CD issue.

    ``action_type`` selects the operation. Edit-style actions carry one or
    more ``FileEdit`` entries in ``edits``; hint and submit actions need no
    edits. ``reasoning`` is free text recorded with the action.
    """
    action_type: ActionType = Field(..., description="Type of action to perform")
    edits: Optional[List[FileEdit]] = Field(None, description="File edits for edit actions")
    reasoning: Optional[str] = Field(None, description="Agent's reasoning (for logging)")

    # Pydantic v2 style configuration: the nested ``class Config`` form is
    # deprecated in v2, and this file already relies on the v2 API
    # (``model_json_schema`` / ``model_dump``). A plain dict is accepted for
    # ``model_config`` so no extra import is required.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "action_type": "replace_line",
                    "edits": [{
                        "file_path": "Dockerfile",
                        "line_number": 5,
                        "old_content": "RUN pip install -r requirments.txt",
                        "new_content": "RUN pip install -r requirements.txt"
                    }],
                    "reasoning": "Fixed typo in requirements.txt filename"
                },
                {
                    "action_type": "add_block",
                    "edits": [{
                        "file_path": ".github/workflows/build.yml",
                        "line_number": 15,
                        "new_content": "      env:\n        DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}"
                    }],
                    "reasoning": "Added missing env block for secrets"
                },
                {
                    "action_type": "submit",
                    "reasoning": "All issues fixed, submitting solution"
                }
            ]
        }
    }
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
# ============== STEP RESULT ==============
|
| 356 |
+
|
| 357 |
+
class StepResult(BaseModel):
    """Result of taking an action in the environment.

    ``reward`` is deliberately unconstrained: a single step can yield a
    small negative reward (hint requests) or several partial-fix rewards at
    once, so a hard [0, 1] bound would reject values the environment
    legitimately produces. This matches ``StepResponse.reward``.
    """
    observation: Observation = Field(..., description="New observation after action")
    reward: float = Field(..., description="Reward for this step")
    done: bool = Field(..., description="Whether episode is complete")
    info: Dict[str, Any] = Field(default_factory=dict, description="Additional info")
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
# ============== TASK INFO ==============
|
| 366 |
+
|
| 367 |
+
class TaskInfo(BaseModel):
    """Information about a single task.

    Summarises one registered task for API clients: its registry key plus
    the display name, description, difficulty and scenario count.
    """
    # Registry key clients pass as ``task_id`` when resetting.
    id: str = Field(..., description="Task identifier")
    name: str = Field(..., description="Human-readable task name")
    description: str = Field(..., description="Task description")
    difficulty: TaskDifficulty = Field(..., description="Difficulty level")
    # Count of scenarios the task defines.
    num_scenarios: int = Field(..., description="Number of scenarios for this task")
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
class EnvironmentInfo(BaseModel):
    """Information about the environment.

    Static metadata describing the environment as a whole: identity,
    available tasks, step budget, and JSON schemas for actions and
    observations.
    """
    name: str = Field(default="cicd-debug-env")
    version: str = Field(default="1.0.0")
    description: str = Field(default="Debug CI/CD infrastructure issues")
    tasks: List[TaskInfo] = Field(..., description="Available tasks")
    # Step budget per episode advertised to clients.
    max_steps: int = Field(default=10, description="Maximum steps per episode")
    # JSON-schema dictionaries describing the request/response payloads.
    action_space: Dict[str, Any] = Field(..., description="Action space schema")
    observation_space: Dict[str, Any] = Field(..., description="Observation space schema")
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
# ============== GRADER RESULT ==============
|
| 388 |
+
|
| 389 |
+
class GraderResult(BaseModel):
    """Result from running the grader.

    Holds the final score for one graded episode together with an optional
    per-component breakdown and episode statistics.
    """
    task_id: str = Field(..., description="Task that was graded")
    # Validated to lie in [0.0, 1.0].
    score: float = Field(..., ge=0.0, le=1.0, description="Final score")
    max_score: float = Field(default=1.0, description="Maximum possible score")
    # Maps a component name to its contribution to the score.
    breakdown: Dict[str, float] = Field(default_factory=dict, description="Score breakdown")
    feedback: str = Field(default="", description="Human-readable feedback")
    steps_taken: int = Field(..., description="Number of steps taken")
    hints_used: int = Field(default=0, description="Number of hints used")
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
# ============== API REQUEST/RESPONSE MODELS ==============
|
| 401 |
+
|
| 402 |
+
class ResetRequest(BaseModel):
    """Request to reset the environment.

    Every field is optional, so an empty body is a valid reset request.
    """
    # When omitted, a task is chosen at random.
    task_id: Optional[str] = Field(None, description="Specific task to load (random if not specified)")
    # Selects one scenario inside the chosen task.
    scenario_id: Optional[str] = Field(None, description="Specific scenario within task")
    # Seeds the random number generator so random choices are repeatable.
    seed: Optional[int] = Field(None, description="Random seed for reproducibility")
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
class ResetResponse(BaseModel):
    """Response from reset endpoint.

    Carries the initial observation of the new episode plus free-form
    metadata about what was loaded.
    """
    observation: Observation
    # Extra metadata about the loaded episode; empty by default.
    info: Dict[str, Any] = Field(default_factory=dict)
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
class StepRequest(BaseModel):
    """Request to take a step.

    Wraps the single ``Action`` the agent wants to apply to the current
    episode.
    """
    action: Action
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
class StepResponse(BaseModel):
    """Response from step endpoint.

    Mirrors the (observation, reward, done, info) tuple returned by the
    environment's ``step`` call.
    """
    observation: Observation
    # Unconstrained float: no range validation is applied to step rewards.
    reward: float
    done: bool
    info: Dict[str, Any] = Field(default_factory=dict)
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
class StateResponse(BaseModel):
    """Response from state endpoint.

    A read-only snapshot of the running episode: the current observation and
    the cumulative episode statistics.
    """
    observation: Observation
    # Sum of the rewards returned so far in this episode.
    episode_reward: float = Field(..., description="Cumulative reward this episode")
    steps_taken: int
    done: bool
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
class GraderRequest(BaseModel):
    """Request to run grader.

    Identifies the task and supplies the recorded episode to be scored.
    """
    task_id: str
    # NOTE(review): the description mentions tuples, but the declared type is
    # a list of dictionaries - confirm the expected record keys with the
    # grader implementation.
    trajectory: List[Dict[str, Any]] = Field(..., description="List of (observation, action, reward) tuples")
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
class GraderResponse(BaseModel):
    """Response from grader endpoint.

    Thin envelope around a single ``GraderResult``.
    """
    result: GraderResult
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
class BaselineRequest(BaseModel):
    """Request to run baseline.

    Both fields are optional, so an empty body requests one episode with no
    task filter.
    """
    task_id: Optional[str] = Field(None, description="Specific task (all if not specified)")
    # No lower bound is validated here.
    num_episodes: int = Field(default=1, description="Number of episodes to run")
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
class BaselineResponse(BaseModel):
    """Response from baseline endpoint.

    Lists the per-episode grader results alongside one aggregate score.
    """
    results: List[GraderResult]
    # Single summary number computed over ``results``.
    aggregate_score: float
|
| 457 |
+
```
|
| 458 |
+
|
| 459 |
+
## 4.3 FastAPI Endpoints (server/main.py)
|
| 460 |
+
|
| 461 |
+
```python
|
| 462 |
+
"""
|
| 463 |
+
FastAPI server implementing all required OpenEnv endpoints.
|
| 464 |
+
"""
|
| 465 |
+
|
| 466 |
+
from fastapi import FastAPI, HTTPException
|
| 467 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 468 |
+
import uvicorn
|
| 469 |
+
from typing import Optional
|
| 470 |
+
|
| 471 |
+
from models import (
|
| 472 |
+
ResetRequest, ResetResponse,
|
| 473 |
+
StepRequest, StepResponse,
|
| 474 |
+
StateResponse,
|
| 475 |
+
EnvironmentInfo, TaskInfo,
|
| 476 |
+
GraderRequest, GraderResponse,
|
| 477 |
+
BaselineRequest, BaselineResponse,
|
| 478 |
+
Observation, Action, GraderResult
|
| 479 |
+
)
|
| 480 |
+
from environment import CICDDebugEnvironment
|
| 481 |
+
from tasks.task_registry import TASK_REGISTRY
|
| 482 |
+
from graders import run_grader
|
| 483 |
+
|
| 484 |
+
# ASGI application object; every endpoint below is registered on it.
app = FastAPI(
    title="CI/CD Debug Environment",
    description="OpenEnv-compliant environment for debugging Docker + GitHub Actions",
    version="1.0.0"
)

# Fully permissive CORS so browser-based clients on any origin can call the API.
# NOTE(review): wildcard origins combined with allow_credentials=True is a
# very broad policy - confirm it is intended before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single module-level environment shared by all requests; it stays None
# until the first successful /reset. The original note flags that a
# production deployment would use a per-request instance instead.
env: Optional[CICDDebugEnvironment] = None
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
@app.get("/")
async def root():
    """Liveness probe: report that the service is up and which environment it serves."""
    payload = {}
    payload["status"] = "healthy"
    payload["environment"] = "cicd-debug-env"
    return payload
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
@app.post("/reset", response_model=ResetResponse)
async def reset(request: Optional[ResetRequest] = None):
    """
    Reset the environment to a new episode.

    POST /reset

    Optionally specify task_id and scenario_id for reproducibility.
    Returns initial observation.

    Raises:
        HTTPException: 400 when the environment rejects the request with a
            ``ValueError`` (for example an unknown ``task_id``).
    """
    global env

    request = request or ResetRequest()

    # Build and reset a fresh environment first, and only publish it once the
    # reset has succeeded. This way a bad request cannot replace a working
    # episode with a half-initialised environment.
    new_env = CICDDebugEnvironment()
    try:
        observation = new_env.reset(
            task_id=request.task_id,
            scenario_id=request.scenario_id,
            seed=request.seed
        )
    except ValueError as exc:
        # Invalid client input is a 400, not an internal server error.
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    env = new_env

    return ResetResponse(
        observation=observation,
        info={
            "task_id": env.current_task_id,
            "scenario_id": env.current_scenario_id,
            "difficulty": env.current_difficulty
        }
    )
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
@app.post("/step", response_model=StepResponse)
async def step(request: StepRequest):
    """
    Apply one agent action to the active episode.

    POST /step

    Responds with the resulting observation, the step reward, the done flag
    and the info dictionary. Answers 400 if no episode has been started.
    """
    # Guard clause: nothing to step until /reset has created an environment.
    if env is None:
        raise HTTPException(status_code=400, detail="Environment not initialized. Call /reset first.")

    outcome = env.step(request.action)
    next_observation, step_reward, is_done, extra = outcome

    response = StepResponse(
        observation=next_observation,
        reward=step_reward,
        done=is_done,
        info=extra,
    )
    return response
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
@app.get("/state", response_model=StateResponse)
async def get_state():
    """
    Report the state of the active episode without advancing it.

    GET /state

    Responds with the current observation plus cumulative reward, step count
    and done flag. Answers 400 if no episode has been started.
    """
    # Guard clause: there is no state to report before the first /reset.
    if env is None:
        raise HTTPException(status_code=400, detail="Environment not initialized. Call /reset first.")

    snapshot = {
        "observation": env.get_observation(),
        "episode_reward": env.episode_reward,
        "steps_taken": env.step_count,
        "done": env.done,
    }
    return StateResponse(**snapshot)
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
@app.get("/info", response_model=EnvironmentInfo)
async def get_info():
    """
    Get environment metadata.

    GET /info

    Returns environment info, available tasks, and action/observation schemas.
    """
    tasks = [
        TaskInfo(
            id=task_id,
            name=task_cls.NAME,
            description=task_cls.DESCRIPTION,
            difficulty=task_cls.DIFFICULTY,
            num_scenarios=len(task_cls.SCENARIOS)
        )
        for task_id, task_cls in TASK_REGISTRY.items()
    ]

    return EnvironmentInfo(
        name="cicd-debug-env",
        version="1.0.0",
        description="Debug CI/CD infrastructure issues (Docker + GitHub Actions)",
        tasks=tasks,
        # Read the step budget from the environment class so the advertised
        # limit cannot drift from the one actually enforced.
        max_steps=CICDDebugEnvironment.MAX_STEPS,
        action_space=Action.model_json_schema(),
        observation_space=Observation.model_json_schema()
    )
|
| 614 |
+
|
| 615 |
+
|
| 616 |
+
@app.get("/tasks")
async def get_tasks():
    """
    List every registered task.

    GET /tasks

    Each entry carries the task id, name, description and the difficulty as
    its plain enum value.
    """
    summaries = []
    for task_id, task_cls in TASK_REGISTRY.items():
        entry = {
            "id": task_id,
            "name": task_cls.NAME,
            "description": task_cls.DESCRIPTION,
            "difficulty": task_cls.DIFFICULTY.value,
        }
        summaries.append(entry)

    return {"tasks": summaries}
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
@app.post("/grader", response_model=GraderResponse)
async def grade(request: GraderRequest):
    """
    Score a recorded trajectory.

    POST /grader

    Passes the task id and trajectory to the grader and wraps its result in
    the response envelope.
    """
    grader_kwargs = {
        "task_id": request.task_id,
        "trajectory": request.trajectory,
    }
    return GraderResponse(result=run_grader(**grader_kwargs))
|
| 653 |
+
|
| 654 |
+
|
| 655 |
+
@app.post("/baseline", response_model=BaselineResponse)
async def run_baseline(request: BaselineRequest = None):
    """
    Execute the baseline agent and report its scores.

    POST /baseline

    Runs the requested number of baseline episodes and returns the individual
    results together with their mean score (0.0 when there are no results).
    """
    # The baseline runner is imported lazily, at call time.
    from baseline_runner import run_baseline_episodes

    if not request:
        request = BaselineRequest()

    results = run_baseline_episodes(
        task_id=request.task_id,
        num_episodes=request.num_episodes,
    )

    if results:
        total = sum(item.score for item in results)
        aggregate = total / len(results)
    else:
        aggregate = 0.0

    return BaselineResponse(results=results, aggregate_score=aggregate)
|
| 680 |
+
|
| 681 |
+
|
| 682 |
+
# Script entry point: serve the app with uvicorn when run directly.
if __name__ == "__main__":
    # Listens on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 684 |
+
```
|
| 685 |
+
|
| 686 |
+
---
|
| 687 |
+
|
| 688 |
+
# 5. ENVIRONMENT DESIGN
|
| 689 |
+
|
| 690 |
+
## 5.1 Core Environment Logic (server/environment.py)
|
| 691 |
+
|
| 692 |
+
```python
|
| 693 |
+
"""
|
| 694 |
+
Core environment logic for CI/CD debugging.
|
| 695 |
+
"""
|
| 696 |
+
|
| 697 |
+
from typing import Optional, Tuple, Dict, Any, List
|
| 698 |
+
import random
|
| 699 |
+
import copy
|
| 700 |
+
|
| 701 |
+
from models import (
|
| 702 |
+
Observation, Action, ActionType, FileContent, ErrorInfo,
|
| 703 |
+
TaskDifficulty, ErrorPhase, FileType
|
| 704 |
+
)
|
| 705 |
+
from tasks.task_registry import TASK_REGISTRY, get_task
|
| 706 |
+
from simulators.docker_simulator import DockerSimulator
|
| 707 |
+
from simulators.workflow_simulator import WorkflowSimulator
|
| 708 |
+
|
| 709 |
+
|
| 710 |
+
class CICDDebugEnvironment:
    """
    OpenEnv-compliant environment for debugging CI/CD infrastructure.

    Episode Flow:
    1. reset() loads a scenario with broken config files
    2. Agent observes files + error message
    3. Agent takes actions to fix issues
    4. Environment simulates build/run to verify fixes
    5. Episode ends when all issues fixed or max_steps reached
    """

    MAX_STEPS = 10
    MAX_HINTS = 3

    def __init__(self):
        """Create the simulators and initialise an empty (not yet reset) episode."""
        self.docker_sim = DockerSimulator()
        self.workflow_sim = WorkflowSimulator()

        # Episode state
        self.current_task_id: Optional[str] = None
        self.current_scenario_id: Optional[str] = None
        self.current_difficulty: Optional[TaskDifficulty] = None
        self.current_task = None

        # File states
        self.original_files: Dict[str, FileContent] = {}
        self.current_files: Dict[str, FileContent] = {}
        self.expected_fixes: List[Dict] = []

        # Error state
        self.current_error: Optional[ErrorInfo] = None
        self.issues_total: int = 0
        self.issues_fixed: int = 0

        # Episode tracking
        self.step_count: int = 0
        self.episode_reward: float = 0.0
        self.done: bool = False
        self.hints_used: int = 0

        # Action history
        self.trajectory: List[Dict] = []
        self.last_action_success: Optional[bool] = None
        self.last_action_feedback: Optional[str] = None

    def reset(
        self,
        task_id: Optional[str] = None,
        scenario_id: Optional[str] = None,
        seed: Optional[int] = None
    ) -> Observation:
        """Reset environment to a new episode.

        Args:
            task_id: Registry key of the task to load; a random registered
                task is chosen when omitted.
            scenario_id: Scenario to load, forwarded to the task's
                ``load_scenario``.
            seed: When given, seeds the global ``random`` module before any
                random choice is made.

        Returns:
            The initial observation of the new episode.

        Raises:
            ValueError: If ``task_id`` is not a registered task.
        """
        if seed is not None:
            random.seed(seed)

        # Select task
        if task_id is None:
            task_id = random.choice(list(TASK_REGISTRY.keys()))

        if task_id not in TASK_REGISTRY:
            raise ValueError(f"Unknown task: {task_id}")

        self.current_task_id = task_id
        self.current_task = get_task(task_id)
        self.current_difficulty = self.current_task.DIFFICULTY

        # Load scenario
        scenario = self.current_task.load_scenario(scenario_id)
        self.current_scenario_id = scenario["id"]

        # Initialize files; keep a pristine copy next to the working copy.
        self.original_files = {
            f["path"]: FileContent(
                path=f["path"],
                content=f["content"],
                file_type=FileType(f["type"]),
                line_count=f["content"].count("\n") + 1
            )
            for f in scenario["files"]
        }
        self.current_files = copy.deepcopy(self.original_files)

        # Initialize error
        error_spec = scenario["error"]
        self.current_error = ErrorInfo(
            phase=ErrorPhase(error_spec["phase"]),
            error_message=error_spec["message"],
            exit_code=error_spec.get("exit_code"),
            failed_step=error_spec.get("failed_step"),
            line_hint=error_spec.get("line_hint")
        )

        # Initialize fixes tracking
        self.expected_fixes = scenario["expected_fixes"]
        self.issues_total = len(self.expected_fixes)
        self.issues_fixed = 0

        # Reset episode state
        self.step_count = 0
        self.episode_reward = 0.0
        self.done = False
        self.hints_used = 0
        self.trajectory = []
        self.last_action_success = None
        self.last_action_feedback = None

        return self.get_observation()

    def _require_episode(self) -> None:
        """Raise a clear error when the environment is used before reset()."""
        if self.current_task is None:
            raise RuntimeError("Environment not initialized. Call reset() first.")

    def step(self, action: Action) -> Tuple[Observation, float, bool, Dict[str, Any]]:
        """Take an action and return (observation, reward, done, info).

        Once the episode is done, further calls return a zero reward and an
        ``error`` entry in ``info`` without changing any state.

        Raises:
            RuntimeError: If called before ``reset()``.
        """
        # Fail fast, before any counters are mutated.
        self._require_episode()

        if self.done:
            return self.get_observation(), 0.0, True, {"error": "Episode already done"}

        self.step_count += 1
        reward = 0.0
        info: Dict[str, Any] = {}

        # Process action
        if action.action_type == ActionType.REQUEST_HINT:
            reward, feedback = self._handle_hint_request()
        elif action.action_type == ActionType.SUBMIT:
            reward, feedback = self._handle_submit()
        else:
            reward, feedback = self._handle_edit(action)

        self.last_action_feedback = feedback
        self.episode_reward += reward

        # Check termination conditions (the step budget takes precedence).
        if self.step_count >= self.MAX_STEPS:
            self.done = True
            info["termination_reason"] = "max_steps"
        elif action.action_type == ActionType.SUBMIT:
            self.done = True
            info["termination_reason"] = "submitted"
        elif self.issues_fixed == self.issues_total:
            # All issues fixed, auto-complete
            self.done = True
            info["termination_reason"] = "all_fixed"

        # Record trajectory
        self.trajectory.append({
            "step": self.step_count,
            "action": action.model_dump(),
            "reward": reward,
            "done": self.done
        })

        info["issues_fixed"] = self.issues_fixed
        info["issues_total"] = self.issues_total

        return self.get_observation(), reward, self.done, info

    def _apply_edit(self, action_type, edit, content: str) -> Tuple[Optional[List[str]], str]:
        """Apply one edit to ``content`` and describe the outcome.

        Returns:
            ``(lines, feedback)``, where ``lines`` is the updated list of
            lines, or ``None`` when the edit was rejected and the file must
            be left untouched.
        """
        lines = content.split("\n")
        path = edit.file_path

        if action_type == ActionType.REPLACE_LINE:
            if edit.line_number and 1 <= edit.line_number <= len(lines):
                lines[edit.line_number - 1] = edit.new_content or ""
                return lines, f"Replaced line {edit.line_number} in {path}"
            return None, f"Invalid line number: {edit.line_number}"

        if action_type == ActionType.ADD_LINE:
            # Without a line number the new line is appended at the end.
            insert_at = edit.line_number - 1 if edit.line_number else len(lines)
            lines.insert(insert_at, edit.new_content or "")
            return lines, f"Added line at {insert_at + 1} in {path}"

        if action_type == ActionType.DELETE_LINE:
            if edit.line_number and 1 <= edit.line_number <= len(lines):
                del lines[edit.line_number - 1]
                return lines, f"Deleted line {edit.line_number} in {path}"
            return None, f"Invalid line number: {edit.line_number}"

        if action_type == ActionType.EDIT_FILE:
            # Find and replace (first occurrence only)
            if edit.old_content and edit.old_content in content:
                replaced = content.replace(edit.old_content, edit.new_content or "", 1)
                return replaced.split("\n"), f"Replaced content in {path}"
            return None, f"Content not found in {path}"

        # Anything else used to fall through as a silent no-op; say so instead.
        label = getattr(action_type, "value", action_type)
        return None, f"Unsupported action type: {label}"

    def _handle_edit(self, action: Action) -> Tuple[float, str]:
        """Handle file edit actions.

        Applies every edit in order. Edits that target a missing file, carry
        an invalid line number, or use an unsupported action type are skipped
        with explanatory feedback.

        Returns:
            ``(reward, feedback)``: the reward earned by newly satisfied
            fixes, and all per-edit messages joined with ``"; "``.
        """
        if not action.edits:
            self.last_action_success = False
            return 0.0, "No edits provided"

        reward = 0.0
        feedbacks = []

        for edit in action.edits:
            # Check file exists
            if edit.file_path not in self.current_files:
                feedbacks.append(f"File not found: {edit.file_path}")
                continue

            file_content = self.current_files[edit.file_path]

            try:
                lines, message = self._apply_edit(
                    action.action_type, edit, file_content.content
                )
                feedbacks.append(message)
                if lines is None:
                    continue

                # Update file
                self.current_files[edit.file_path] = FileContent(
                    path=file_content.path,
                    content="\n".join(lines),
                    file_type=file_content.file_type,
                    line_count=len(lines)
                )

                # Check if this fixed an issue
                reward += self._check_fix_progress()

            except Exception as e:
                # Best effort: one bad edit must not abort the remaining ones.
                feedbacks.append(f"Error applying edit: {str(e)}")

        self.last_action_success = reward > 0
        return reward, "; ".join(feedbacks)

    def _is_fix_applied(self, fix: Dict[str, Any]) -> Optional[bool]:
        """Evaluate one expected fix against the current files.

        Returns:
            ``None`` when the fix targets a file that is not part of the
            scenario, otherwise whether the fix condition currently holds.
            Unknown fix types are reported as not applied.
        """
        target = self.current_files.get(fix["file"])
        if target is None:
            return None

        content = target.content
        kind = fix["type"]

        if kind == "contains":
            return fix["expected"] in content
        if kind == "not_contains":
            return fix["expected"] not in content
        if kind == "line_equals":
            lines = content.split("\n")
            if fix["line"] > len(lines):
                return False
            return lines[fix["line"] - 1].strip() == fix["expected"].strip()
        return False

    def _check_fix_progress(self) -> float:
        """Check if current state fixes any issues.

        Returns:
            ``0.3`` per newly satisfied fix, or ``0.0`` when no new fix was
            detected. ``issues_fixed`` only ever increases, so undoing and
            redoing a fix cannot earn its reward twice.
        """
        fixes_applied = sum(
            1 for fix in self.expected_fixes if self._is_fix_applied(fix)
        )

        new_fixed = fixes_applied - self.issues_fixed
        if new_fixed > 0:
            self.issues_fixed = fixes_applied
            # Partial reward for each fix
            return 0.3 * new_fixed

        return 0.0

    def _handle_submit(self) -> Tuple[float, str]:
        """Handle submission - run full validation.

        Runs both simulators on the current files and awards a weighted
        reward per passing check (build 0.3, run 0.2, parse 0.2,
        execution 0.3).

        Returns:
            ``(reward, feedback)`` with one PASS/FAIL message per check.
        """
        # Run Docker simulation
        docker_result = self.docker_sim.validate(
            dockerfile=self.current_files.get("Dockerfile"),
            context_files=self.current_files
        )

        # Run workflow simulation
        workflow_result = self.workflow_sim.validate(
            workflow=self.current_files.get(".github/workflows/build.yml"),
            files=self.current_files
        )

        # (label, result dict, success key, error key, weight), in report order.
        checks = (
            ("Docker build", docker_result, "build_success", "error", 0.3),
            ("Docker run", docker_result, "run_success", "run_error", 0.2),
            ("Workflow parse", workflow_result, "parse_success", "error", 0.2),
            ("Workflow execution", workflow_result, "execution_success", "exec_error", 0.3),
        )

        reward = 0.0
        feedback_parts = []

        for label, result, success_key, error_key, weight in checks:
            if result[success_key]:
                reward += weight
                feedback_parts.append(f"{label}: PASS")
            else:
                # .get() so a missing error message cannot mask the failure
                # report with a KeyError.
                feedback_parts.append(f"{label}: FAIL - {result.get(error_key, 'unknown')}")

        self.last_action_success = reward >= 0.8
        return reward, "; ".join(feedback_parts)

    def _handle_hint_request(self) -> Tuple[float, str]:
        """Handle hint request.

        Returns the hint for the first expected fix that is not yet applied,
        at a cost of ``-0.05``. A hint is only consumed when one is actually
        delivered; if nothing is left to hint at, the reward is ``0.0`` and
        the hint budget is untouched.
        """
        if self.hints_used >= self.MAX_HINTS:
            self.last_action_success = False
            return 0.0, "No hints remaining"

        # Get next unfixed issue (fixes for files outside the scenario are skipped)
        for fix in self.expected_fixes:
            if self._is_fix_applied(fix) is not False:
                continue

            self.hints_used += 1
            hint = fix.get("hint", f"Check {fix['file']} around line {fix.get('line', '?')}")
            self.last_action_success = True
            # Small negative reward for using hint
            return -0.05, f"Hint ({self.hints_used}/{self.MAX_HINTS}): {hint}"

        self.last_action_success = True
        return 0.0, "All known issues appear to be fixed"

    def get_observation(self) -> Observation:
        """Get current observation.

        Raises:
            RuntimeError: If called before ``reset()``.
        """
        self._require_episode()

        return Observation(
            task_id=self.current_task_id,
            task_description=self.current_task.DESCRIPTION,
            difficulty=self.current_difficulty,
            files=list(self.current_files.values()),
            error=self.current_error,
            available_secrets=self.current_task.AVAILABLE_SECRETS,
            runner_os="ubuntu-latest",
            step_number=self.step_count,
            max_steps=self.MAX_STEPS,
            hints_used=self.hints_used,
            hints_available=self.MAX_HINTS - self.hints_used,
            last_action_success=self.last_action_success,
            last_action_feedback=self.last_action_feedback,
            issues_found=self.issues_fixed,  # Simplified: found = fixed
            issues_fixed=self.issues_fixed,
            total_issues=self.issues_total
        )
|
| 1075 |
+
```
|
| 1076 |
+
|
| 1077 |
+
---
|
| 1078 |
+
|
| 1079 |
+
# 6. TASK DESIGN (6 Tasks)
|
| 1080 |
+
|
| 1081 |
+
## 6.1 Task Registry (server/tasks/task_registry.py)
|
| 1082 |
+
|
| 1083 |
+
```python
|
| 1084 |
+
"""Task registration and loading."""
|
| 1085 |
+
|
| 1086 |
+
from typing import Dict, Type
|
| 1087 |
+
from .base import BaseTask
|
| 1088 |
+
from .task_1_build_errors import DockerfileSyntaxTask
|
| 1089 |
+
from .task_2_docker_runtime import DockerfileRuntimeTask
|
| 1090 |
+
from .task_3_workflow_syntax import WorkflowSyntaxStructureTask
|
| 1091 |
+
from .task_4_workflow_secrets_permissions import WorkflowSecretsPermissionsTask
|
| 1092 |
+
from .task_5_ci_docker_integration import CIDockerIntegrationTask
|
| 1093 |
+
from .task_6_multi_stage_matrix import MultiStageMatrixTask
|
| 1094 |
+
|
| 1095 |
+
# Maps each public task id to the task class that implements it. The keys
# are the identifiers clients pass as ``task_id``; the values are classes,
# not instances.
TASK_REGISTRY: Dict[str, Type[BaseTask]] = {
    "dockerfile_syntax": DockerfileSyntaxTask,
    "dockerfile_runtime": DockerfileRuntimeTask,
    "workflow_syntax_structure": WorkflowSyntaxStructureTask,
    "workflow_secrets_permissions": WorkflowSecretsPermissionsTask,
    "ci_docker_integration": CIDockerIntegrationTask,
    "multi_stage_pipeline_matrix": MultiStageMatrixTask,
}
|
| 1103 |
+
|
| 1104 |
+
def get_task(task_id: str) -> BaseTask:
    """Instantiate and return the task registered under ``task_id``.

    Raises:
        ValueError: If no task is registered under ``task_id``.
    """
    task_cls = TASK_REGISTRY.get(task_id)
    if task_cls is None:
        raise ValueError(f"Unknown task: {task_id}")
    return task_cls()
|
| 1109 |
+
```
|
| 1110 |
+
|
| 1111 |
+
## 6.2 Task 1: Dockerfile Syntax Errors (EASY)
|
| 1112 |
+
|
| 1113 |
+
```python
|
| 1114 |
+
"""
|
| 1115 |
+
Task 1: Dockerfile Syntax Errors
|
| 1116 |
+
Difficulty: EASY
|
| 1117 |
+
Focus: Pure Dockerfile issues - no GitHub Actions involved
|
| 1118 |
+
|
| 1119 |
+
Agent must fix common Dockerfile mistakes:
|
| 1120 |
+
- Typos in instruction names
|
| 1121 |
+
- Wrong file paths
|
| 1122 |
+
- Missing instructions
|
| 1123 |
+
- Invalid syntax
|
| 1124 |
+
"""
|
| 1125 |
+
|
| 1126 |
+
from typing import Dict, List, Optional
|
| 1127 |
+
import random
|
| 1128 |
+
from models import TaskDifficulty
|
| 1129 |
+
from .base import BaseTask
|
| 1130 |
+
|
| 1131 |
+
|
| 1132 |
+
class DockerfileSyntaxTask(BaseTask):
|
| 1133 |
+
|
| 1134 |
+
NAME = "Dockerfile Syntax Errors"
|
| 1135 |
+
DESCRIPTION = "Fix syntax and instruction errors in Dockerfiles"
|
| 1136 |
+
DIFFICULTY = TaskDifficulty.EASY
|
| 1137 |
+
AVAILABLE_SECRETS = [] # No secrets needed for this task
|
| 1138 |
+
|
| 1139 |
+
SCENARIOS = [
|
| 1140 |
+
# Scenario 1: Typo in filename
|
| 1141 |
+
{
|
| 1142 |
+
"id": "typo_filename",
|
| 1143 |
+
"files": [
|
| 1144 |
+
{
|
| 1145 |
+
"path": "Dockerfile",
|
| 1146 |
+
"type": "dockerfile",
|
| 1147 |
+
"content": """FROM python:3.9-slim
|
| 1148 |
+
WORKDIR /app
|
| 1149 |
+
COPY requirments.txt .
|
| 1150 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 1151 |
+
COPY . .
|
| 1152 |
+
CMD ["python", "app.py"]"""
|
| 1153 |
+
},
|
| 1154 |
+
{
|
| 1155 |
+
"path": "requirements.txt",
|
| 1156 |
+
"type": "requirements",
|
| 1157 |
+
"content": "flask==2.0.0\nrequests==2.28.0"
|
| 1158 |
+
}
|
| 1159 |
+
],
|
| 1160 |
+
"error": {
|
| 1161 |
+
"phase": "docker_build",
|
| 1162 |
+
"message": "COPY failed: file not found in build context: requirments.txt",
|
| 1163 |
+
"exit_code": 1,
|
| 1164 |
+
"failed_step": "COPY requirments.txt .",
|
| 1165 |
+
"line_hint": 3
|
| 1166 |
+
},
|
| 1167 |
+
"expected_fixes": [
|
| 1168 |
+
{
|
| 1169 |
+
"file": "Dockerfile",
|
| 1170 |
+
"type": "contains",
|
| 1171 |
+
"expected": "COPY requirements.txt",
|
| 1172 |
+
"line": 3,
|
| 1173 |
+
"hint": "Check the spelling of the requirements file"
|
| 1174 |
+
}
|
| 1175 |
+
]
|
| 1176 |
+
},
|
| 1177 |
+
|
| 1178 |
+
# Scenario 2: Wrong base image tag
|
| 1179 |
+
{
    "id": "invalid_base_image",
    "files": [
        {
            "path": "Dockerfile",
            "type": "dockerfile",
            "content": """FROM python:3.9-slimm
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
EXPOSE 8000
CMD ["python", "app.py"]"""
        },
        {
            "path": "requirements.txt",
            "type": "requirements",
            "content": "flask==2.0.0"
        }
    ],
    "error": {
        "phase": "docker_build",
        "message": "pull access denied for python:3.9-slimm, repository does not exist or may require 'docker login'",
        "exit_code": 1,
        "failed_step": "FROM python:3.9-slimm",
        "line_hint": 1
    },
    "expected_fixes": [
        {
            "file": "Dockerfile",
            # Must be an exact line match: the broken tag "3.9-slimm" has the
            # correct tag "3.9-slim" as a prefix, so a substring ("contains")
            # check would already pass on the unfixed Dockerfile.
            "type": "line_equals",
            "line": 1,
            "expected": "FROM python:3.9-slim",
            "hint": "Check the base image tag - 'slimm' vs 'slim'"
        }
    ]
},
|
| 1216 |
+
|
| 1217 |
+
# Scenario 3: Missing WORKDIR before COPY
|
| 1218 |
+
{
|
| 1219 |
+
"id": "missing_workdir",
|
| 1220 |
+
"files": [
|
| 1221 |
+
{
|
| 1222 |
+
"path": "Dockerfile",
|
| 1223 |
+
"type": "dockerfile",
|
| 1224 |
+
"content": """FROM node:18-alpine
|
| 1225 |
+
COPY package*.json ./
|
| 1226 |
+
RUN npm ci
|
| 1227 |
+
COPY . .
|
| 1228 |
+
RUN npm run build
|
| 1229 |
+
EXPOSE 3000
|
| 1230 |
+
CMD ["npm", "start"]"""
|
| 1231 |
+
},
|
| 1232 |
+
{
|
| 1233 |
+
"path": "package.json",
|
| 1234 |
+
"type": "other",
|
| 1235 |
+
"content": '{"name": "app", "version": "1.0.0"}'
|
| 1236 |
+
}
|
| 1237 |
+
],
|
| 1238 |
+
"error": {
|
| 1239 |
+
"phase": "docker_run",
|
| 1240 |
+
"message": "Error: Cannot find module '/package.json'",
|
| 1241 |
+
"exit_code": 1,
|
| 1242 |
+
"failed_step": "npm start"
|
| 1243 |
+
},
|
| 1244 |
+
"expected_fixes": [
|
| 1245 |
+
{
|
| 1246 |
+
"file": "Dockerfile",
|
| 1247 |
+
"type": "contains",
|
| 1248 |
+
"expected": "WORKDIR /app",
|
| 1249 |
+
"hint": "Add WORKDIR before COPY to set proper working directory"
|
| 1250 |
+
}
|
| 1251 |
+
]
|
| 1252 |
+
},
|
| 1253 |
+
|
| 1254 |
+
# Scenario 4: Invalid RUN syntax
|
| 1255 |
+
{
|
| 1256 |
+
"id": "invalid_run_syntax",
|
| 1257 |
+
"files": [
|
| 1258 |
+
{
|
| 1259 |
+
"path": "Dockerfile",
|
| 1260 |
+
"type": "dockerfile",
|
| 1261 |
+
"content": """FROM python:3.9
|
| 1262 |
+
WORKDIR /app
|
| 1263 |
+
COPY . .
|
| 1264 |
+
RUN pip install -r requirements.txt
|
| 1265 |
+
&& python setup.py install
|
| 1266 |
+
CMD ["python", "main.py"]"""
|
| 1267 |
+
},
|
| 1268 |
+
{
|
| 1269 |
+
"path": "requirements.txt",
|
| 1270 |
+
"type": "requirements",
|
| 1271 |
+
"content": "numpy==1.21.0"
|
| 1272 |
+
}
|
| 1273 |
+
],
|
| 1274 |
+
"error": {
|
| 1275 |
+
"phase": "docker_build",
|
| 1276 |
+
"message": "Dockerfile parse error: unknown instruction: &&",
|
| 1277 |
+
"exit_code": 1,
|
| 1278 |
+
"line_hint": 5
|
| 1279 |
+
},
|
| 1280 |
+
"expected_fixes": [
|
| 1281 |
+
{
|
| 1282 |
+
"file": "Dockerfile",
|
| 1283 |
+
"type": "contains",
|
| 1284 |
+
"expected": "RUN pip install -r requirements.txt && python setup.py install",
|
| 1285 |
+
"hint": "Multi-line RUN commands need backslash continuation or be on same line"
|
| 1286 |
+
}
|
| 1287 |
+
]
|
| 1288 |
+
},
|
| 1289 |
+
|
| 1290 |
+
# Scenario 5: EXPOSE with invalid port
|
| 1291 |
+
{
|
| 1292 |
+
"id": "invalid_expose",
|
| 1293 |
+
"files": [
|
| 1294 |
+
{
|
| 1295 |
+
"path": "Dockerfile",
|
| 1296 |
+
"type": "dockerfile",
|
| 1297 |
+
"content": """FROM nginx:alpine
|
| 1298 |
+
COPY nginx.conf /etc/nginx/nginx.conf
|
| 1299 |
+
COPY html /usr/share/nginx/html
|
| 1300 |
+
EXPOSE "eighty"
|
| 1301 |
+
CMD ["nginx", "-g", "daemon off;"]"""
|
| 1302 |
+
},
|
| 1303 |
+
{
|
| 1304 |
+
"path": "nginx.conf",
|
| 1305 |
+
"type": "other",
|
| 1306 |
+
"content": "events {}"
|
| 1307 |
+
}
|
| 1308 |
+
],
|
| 1309 |
+
"error": {
|
| 1310 |
+
"phase": "docker_build",
|
| 1311 |
+
"message": "EXPOSE requires numeric port or port/protocol",
|
| 1312 |
+
"exit_code": 1,
|
| 1313 |
+
"line_hint": 4
|
| 1314 |
+
},
|
| 1315 |
+
"expected_fixes": [
|
| 1316 |
+
{
|
| 1317 |
+
"file": "Dockerfile",
|
| 1318 |
+
"type": "contains",
|
| 1319 |
+
"expected": "EXPOSE 80",
|
| 1320 |
+
"line": 4,
|
| 1321 |
+
"hint": "EXPOSE must use numeric port values"
|
| 1322 |
+
}
|
| 1323 |
+
]
|
| 1324 |
+
}
|
| 1325 |
+
]
|
| 1326 |
+
|
| 1327 |
+
def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
    """Load a specific scenario, or a random one when no ID is given.

    Args:
        scenario_id: ID of the scenario to load. When omitted (or empty),
            one scenario is picked with ``random.choice``.

    Returns:
        A deep copy of the matching scenario dict. Copying keeps the
        class-level ``SCENARIOS`` table pristine even if the caller edits
        the returned files in place while an episode runs.

    Raises:
        ValueError: If ``scenario_id`` is given but matches no scenario.
    """
    import copy  # local import keeps this block self-contained

    if not scenario_id:
        return copy.deepcopy(random.choice(self.SCENARIOS))

    for scenario in self.SCENARIOS:
        if scenario["id"] == scenario_id:
            return copy.deepcopy(scenario)
    raise ValueError(f"Unknown scenario: {scenario_id}")
|
| 1335 |
+
```
|
| 1336 |
+
|
| 1337 |
+
## 6.3 Task 2: Workflow Configuration Errors (MEDIUM)
|
| 1338 |
+
|
| 1339 |
+
```python
|
| 1340 |
+
"""
|
| 1341 |
+
Task 2: Workflow Configuration Errors
|
| 1342 |
+
Difficulty: MEDIUM
|
| 1343 |
+
Focus: GitHub Actions + Docker interaction issues
|
| 1344 |
+
|
| 1345 |
+
Agent must fix:
|
| 1346 |
+
- Missing secret references
|
| 1347 |
+
- Wrong env variable syntax
|
| 1348 |
+
- Incorrect step ordering
|
| 1349 |
+
- Missing permissions
|
| 1350 |
+
"""
|
| 1351 |
+
|
| 1352 |
+
from typing import Dict, Optional
|
| 1353 |
+
import random
|
| 1354 |
+
from models import TaskDifficulty
|
| 1355 |
+
from .base import BaseTask
|
| 1356 |
+
|
| 1357 |
+
|
| 1358 |
+
class WorkflowConfigTask(BaseTask):
|
| 1359 |
+
|
| 1360 |
+
NAME = "Workflow Configuration Errors"
|
| 1361 |
+
DESCRIPTION = "Fix GitHub Actions workflow configuration issues involving Docker"
|
| 1362 |
+
DIFFICULTY = TaskDifficulty.MEDIUM
|
| 1363 |
+
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN"]
|
| 1364 |
+
|
| 1365 |
+
SCENARIOS = [
|
| 1366 |
+
# Scenario 1: Missing env block for secrets
|
| 1367 |
+
{
|
| 1368 |
+
"id": "missing_env_secrets",
|
| 1369 |
+
"files": [
|
| 1370 |
+
{
|
| 1371 |
+
"path": ".github/workflows/build.yml",
|
| 1372 |
+
"type": "workflow",
|
| 1373 |
+
"content": """name: Build and Push
|
| 1374 |
+
on: push
|
| 1375 |
+
|
| 1376 |
+
jobs:
|
| 1377 |
+
build:
|
| 1378 |
+
runs-on: ubuntu-latest
|
| 1379 |
+
steps:
|
| 1380 |
+
- uses: actions/checkout@v4
|
| 1381 |
+
|
| 1382 |
+
- name: Login to DockerHub
|
| 1383 |
+
run: echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin
|
| 1384 |
+
|
| 1385 |
+
- name: Build and push
|
| 1386 |
+
run: |
|
| 1387 |
+
docker build -t myuser/myapp:${{ github.sha }} .
|
| 1388 |
+
docker push myuser/myapp:${{ github.sha }}"""
|
| 1389 |
+
},
|
| 1390 |
+
{
|
| 1391 |
+
"path": "Dockerfile",
|
| 1392 |
+
"type": "dockerfile",
|
| 1393 |
+
"content": """FROM python:3.9-slim
|
| 1394 |
+
WORKDIR /app
|
| 1395 |
+
COPY . .
|
| 1396 |
+
RUN pip install -r requirements.txt
|
| 1397 |
+
CMD ["python", "app.py"]"""
|
| 1398 |
+
}
|
| 1399 |
+
],
|
| 1400 |
+
"error": {
|
| 1401 |
+
"phase": "workflow_parse",
|
| 1402 |
+
"message": "Error: Cannot perform an interactive login from a non TTY device",
|
| 1403 |
+
"exit_code": 1,
|
| 1404 |
+
"failed_step": "Login to DockerHub"
|
| 1405 |
+
},
|
| 1406 |
+
"expected_fixes": [
|
| 1407 |
+
{
|
| 1408 |
+
"file": ".github/workflows/build.yml",
|
| 1409 |
+
"type": "contains",
|
| 1410 |
+
"expected": "DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}",
|
| 1411 |
+
"hint": "Secrets must be passed via env block"
|
| 1412 |
+
},
|
| 1413 |
+
{
|
| 1414 |
+
"file": ".github/workflows/build.yml",
|
| 1415 |
+
"type": "contains",
|
| 1416 |
+
"expected": "DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}",
|
| 1417 |
+
"hint": "Both username and password need to be passed as env vars"
|
| 1418 |
+
}
|
| 1419 |
+
]
|
| 1420 |
+
},
|
| 1421 |
+
|
| 1422 |
+
# Scenario 2: Wrong checkout order
|
| 1423 |
+
{
|
| 1424 |
+
"id": "checkout_after_build",
|
| 1425 |
+
"files": [
|
| 1426 |
+
{
|
| 1427 |
+
"path": ".github/workflows/build.yml",
|
| 1428 |
+
"type": "workflow",
|
| 1429 |
+
"content": """name: Build
|
| 1430 |
+
on: push
|
| 1431 |
+
|
| 1432 |
+
jobs:
|
| 1433 |
+
build:
|
| 1434 |
+
runs-on: ubuntu-latest
|
| 1435 |
+
steps:
|
| 1436 |
+
- name: Build Docker image
|
| 1437 |
+
run: docker build -t myapp .
|
| 1438 |
+
|
| 1439 |
+
- uses: actions/checkout@v4
|
| 1440 |
+
|
| 1441 |
+
- name: Run tests
|
| 1442 |
+
run: docker run myapp pytest"""
|
| 1443 |
+
},
|
| 1444 |
+
{
|
| 1445 |
+
"path": "Dockerfile",
|
| 1446 |
+
"type": "dockerfile",
|
| 1447 |
+
"content": """FROM python:3.9
|
| 1448 |
+
WORKDIR /app
|
| 1449 |
+
COPY . .
|
| 1450 |
+
CMD ["python", "app.py"]"""
|
| 1451 |
+
}
|
| 1452 |
+
],
|
| 1453 |
+
"error": {
|
| 1454 |
+
"phase": "docker_build",
|
| 1455 |
+
"message": "unable to prepare context: unable to evaluate symlinks in Dockerfile path: lstat /home/runner/work/repo/repo/Dockerfile: no such file or directory",
|
| 1456 |
+
"exit_code": 1,
|
| 1457 |
+
"failed_step": "Build Docker image"
|
| 1458 |
+
},
|
| 1459 |
+
"expected_fixes": [
|
| 1460 |
+
{
|
| 1461 |
+
"file": ".github/workflows/build.yml",
|
| 1462 |
+
"type": "line_equals",
|
| 1463 |
+
"line": 8,
|
| 1464 |
+
"expected": " - uses: actions/checkout@v4",
|
| 1465 |
+
"hint": "Checkout must happen before any build commands"
|
| 1466 |
+
}
|
| 1467 |
+
]
|
| 1468 |
+
},
|
| 1469 |
+
|
| 1470 |
+
# Scenario 3: Missing Docker Buildx setup for multi-platform
|
| 1471 |
+
{
|
| 1472 |
+
"id": "missing_buildx",
|
| 1473 |
+
"files": [
|
| 1474 |
+
{
|
| 1475 |
+
"path": ".github/workflows/build.yml",
|
| 1476 |
+
"type": "workflow",
|
| 1477 |
+
"content": """name: Multi-platform Build
|
| 1478 |
+
on: push
|
| 1479 |
+
|
| 1480 |
+
jobs:
|
| 1481 |
+
build:
|
| 1482 |
+
runs-on: ubuntu-latest
|
| 1483 |
+
steps:
|
| 1484 |
+
- uses: actions/checkout@v4
|
| 1485 |
+
|
| 1486 |
+
- name: Build multi-platform
|
| 1487 |
+
uses: docker/build-push-action@v5
|
| 1488 |
+
with:
|
| 1489 |
+
context: .
|
| 1490 |
+
platforms: linux/amd64,linux/arm64
|
| 1491 |
+
push: false"""
|
| 1492 |
+
},
|
| 1493 |
+
{
|
| 1494 |
+
"path": "Dockerfile",
|
| 1495 |
+
"type": "dockerfile",
|
| 1496 |
+
"content": """FROM python:3.9-slim
|
| 1497 |
+
WORKDIR /app
|
| 1498 |
+
COPY . .
|
| 1499 |
+
CMD ["python", "app.py"]"""
|
| 1500 |
+
}
|
| 1501 |
+
],
|
| 1502 |
+
"error": {
|
| 1503 |
+
"phase": "docker_build",
|
| 1504 |
+
"message": "ERROR: Multi-platform build is not supported for the docker driver. Switch to a different driver, or turn on the containerd image store, and try again.",
|
| 1505 |
+
"exit_code": 1,
|
| 1506 |
+
"failed_step": "Build multi-platform"
|
| 1507 |
+
},
|
| 1508 |
+
"expected_fixes": [
|
| 1509 |
+
{
|
| 1510 |
+
"file": ".github/workflows/build.yml",
|
| 1511 |
+
"type": "contains",
|
| 1512 |
+
"expected": "docker/setup-buildx-action",
|
| 1513 |
+
"hint": "Multi-platform builds require Docker Buildx setup"
|
| 1514 |
+
}
|
| 1515 |
+
]
|
| 1516 |
+
},
|
| 1517 |
+
|
| 1518 |
+
# Scenario 4: Incorrect caching configuration
|
| 1519 |
+
{
|
| 1520 |
+
"id": "wrong_cache_config",
|
| 1521 |
+
"files": [
|
| 1522 |
+
{
|
| 1523 |
+
"path": ".github/workflows/build.yml",
|
| 1524 |
+
"type": "workflow",
|
| 1525 |
+
"content": """name: Build with Cache
|
| 1526 |
+
on: push
|
| 1527 |
+
|
| 1528 |
+
jobs:
|
| 1529 |
+
build:
|
| 1530 |
+
runs-on: ubuntu-latest
|
| 1531 |
+
steps:
|
| 1532 |
+
- uses: actions/checkout@v4
|
| 1533 |
+
|
| 1534 |
+
- name: Set up Docker Buildx
|
| 1535 |
+
uses: docker/setup-buildx-action@v3
|
| 1536 |
+
|
| 1537 |
+
- name: Build
|
| 1538 |
+
uses: docker/build-push-action@v5
|
| 1539 |
+
with:
|
| 1540 |
+
context: .
|
| 1541 |
+
push: false
|
| 1542 |
+
cache-from: type=gha
|
| 1543 |
+
cache-to: type=gha"""
|
| 1544 |
+
},
|
| 1545 |
+
{
|
| 1546 |
+
"path": "Dockerfile",
|
| 1547 |
+
"type": "dockerfile",
|
| 1548 |
+
"content": """FROM python:3.9-slim
|
| 1549 |
+
WORKDIR /app
|
| 1550 |
+
COPY . .
|
| 1551 |
+
CMD ["python", "app.py"]"""
|
| 1552 |
+
}
|
| 1553 |
+
],
|
| 1554 |
+
"error": {
|
| 1555 |
+
"phase": "docker_build",
|
| 1556 |
+
"message": "ERROR: cache export feature is currently not supported for docker driver. Please switch to a different driver",
|
| 1557 |
+
"exit_code": 1,
|
| 1558 |
+
"failed_step": "Build"
|
| 1559 |
+
},
|
| 1560 |
+
"expected_fixes": [
|
| 1561 |
+
{
|
| 1562 |
+
"file": ".github/workflows/build.yml",
|
| 1563 |
+
"type": "contains",
|
| 1564 |
+
"expected": "cache-to: type=gha,mode=max",
|
| 1565 |
+
"hint": "GHA cache needs mode=max for proper export"
|
| 1566 |
+
}
|
| 1567 |
+
]
|
| 1568 |
+
}
|
| 1569 |
+
]
|
| 1570 |
+
|
| 1571 |
+
def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
    """Return one workflow-configuration scenario, by ID or at random.

    Args:
        scenario_id: ID of the scenario to load. When omitted (or empty),
            one scenario is picked with ``random.choice``.

    Returns:
        A deep copy of the selected scenario dict, so in-place edits made
        by the caller never leak back into the class-level ``SCENARIOS``.

    Raises:
        ValueError: If ``scenario_id`` is given but matches no scenario.
    """
    import copy  # local import keeps this block self-contained

    if scenario_id:
        selected = next(
            (s for s in self.SCENARIOS if s["id"] == scenario_id),
            None,
        )
        if selected is None:
            raise ValueError(f"Unknown scenario: {scenario_id}")
    else:
        selected = random.choice(self.SCENARIOS)
    return copy.deepcopy(selected)
|
| 1578 |
+
```
|
| 1579 |
+
|
| 1580 |
+
## 6.4 Task 3: Multi-Stage Pipeline Failures (HARD)
|
| 1581 |
+
|
| 1582 |
+
```python
|
| 1583 |
+
"""
|
| 1584 |
+
Task 3: Multi-Stage Pipeline Failures
|
| 1585 |
+
Difficulty: HARD
|
| 1586 |
+
Focus: Complex interactions between multi-stage Docker builds and CI/CD
|
| 1587 |
+
|
| 1588 |
+
Agent must debug:
|
| 1589 |
+
- Multi-stage build artifact issues
|
| 1590 |
+
- Cross-job dependencies
|
| 1591 |
+
- Matrix build failures
|
| 1592 |
+
- Platform-specific issues
|
| 1593 |
+
"""
|
| 1594 |
+
|
| 1595 |
+
from typing import Dict, Optional
|
| 1596 |
+
import random
|
| 1597 |
+
from models import TaskDifficulty
|
| 1598 |
+
from .base import BaseTask
|
| 1599 |
+
|
| 1600 |
+
|
| 1601 |
+
class MultiStagePipelineTask(BaseTask):
|
| 1602 |
+
|
| 1603 |
+
NAME = "Multi-Stage Pipeline Failures"
|
| 1604 |
+
DESCRIPTION = "Debug complex multi-stage Docker builds with CI/CD integration"
|
| 1605 |
+
DIFFICULTY = TaskDifficulty.HARD
|
| 1606 |
+
AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN", "NPM_TOKEN"]
|
| 1607 |
+
|
| 1608 |
+
SCENARIOS = [
|
| 1609 |
+
# Scenario 1: Multi-stage artifact path mismatch
|
| 1610 |
+
{
|
| 1611 |
+
"id": "artifact_path_mismatch",
|
| 1612 |
+
"files": [
|
| 1613 |
+
{
|
| 1614 |
+
"path": ".github/workflows/build.yml",
|
| 1615 |
+
"type": "workflow",
|
| 1616 |
+
"content": """name: Build and Deploy
|
| 1617 |
+
on: push
|
| 1618 |
+
|
| 1619 |
+
jobs:
|
| 1620 |
+
build:
|
| 1621 |
+
runs-on: ubuntu-latest
|
| 1622 |
+
steps:
|
| 1623 |
+
- uses: actions/checkout@v4
|
| 1624 |
+
|
| 1625 |
+
- name: Set up Docker Buildx
|
| 1626 |
+
uses: docker/setup-buildx-action@v3
|
| 1627 |
+
|
| 1628 |
+
- name: Build
|
| 1629 |
+
uses: docker/build-push-action@v5
|
| 1630 |
+
with:
|
| 1631 |
+
context: .
|
| 1632 |
+
push: false
|
| 1633 |
+
load: true
|
| 1634 |
+
tags: myapp:test
|
| 1635 |
+
|
| 1636 |
+
- name: Test
|
| 1637 |
+
run: |
|
| 1638 |
+
docker run myapp:test ls -la /usr/share/nginx/html
|
| 1639 |
+
docker run myapp:test curl -f http://localhost:80/ || exit 1"""
|
| 1640 |
+
},
|
| 1641 |
+
{
|
| 1642 |
+
"path": "Dockerfile",
|
| 1643 |
+
"type": "dockerfile",
|
| 1644 |
+
"content": """FROM node:18 AS builder
|
| 1645 |
+
WORKDIR /app
|
| 1646 |
+
COPY package*.json ./
|
| 1647 |
+
RUN npm ci
|
| 1648 |
+
COPY . .
|
| 1649 |
+
RUN npm run build
|
| 1650 |
+
|
| 1651 |
+
FROM nginx:alpine
|
| 1652 |
+
# Bug: React builds to 'build', not 'dist'
|
| 1653 |
+
COPY --from=builder /app/dist /usr/share/nginx/html
|
| 1654 |
+
EXPOSE 80
|
| 1655 |
+
CMD ["nginx", "-g", "daemon off;"]"""
|
| 1656 |
+
},
|
| 1657 |
+
{
|
| 1658 |
+
"path": "package.json",
|
| 1659 |
+
"type": "other",
|
| 1660 |
+
"content": """{
|
| 1661 |
+
"name": "frontend",
|
| 1662 |
+
"scripts": {
|
| 1663 |
+
"build": "react-scripts build"
|
| 1664 |
+
}
|
| 1665 |
+
}"""
|
| 1666 |
+
}
|
| 1667 |
+
],
|
| 1668 |
+
"error": {
|
| 1669 |
+
"phase": "docker_build",
|
| 1670 |
+
"message": "COPY failed: stat app/dist: file does not exist",
|
| 1671 |
+
"exit_code": 1,
|
| 1672 |
+
"failed_step": "Build",
|
| 1673 |
+
"line_hint": 10
|
| 1674 |
+
},
|
| 1675 |
+
"expected_fixes": [
|
| 1676 |
+
{
|
| 1677 |
+
"file": "Dockerfile",
|
| 1678 |
+
"type": "contains",
|
| 1679 |
+
"expected": "COPY --from=builder /app/build",
|
| 1680 |
+
"line": 10,
|
| 1681 |
+
"hint": "React's create-react-app outputs to 'build' directory, not 'dist'"
|
| 1682 |
+
}
|
| 1683 |
+
]
|
| 1684 |
+
},
|
| 1685 |
+
|
| 1686 |
+
# Scenario 2: Matrix + Platform ARG issue
|
| 1687 |
+
{
|
| 1688 |
+
"id": "matrix_platform_arg",
|
| 1689 |
+
"files": [
|
| 1690 |
+
{
|
| 1691 |
+
"path": ".github/workflows/build.yml",
|
| 1692 |
+
"type": "workflow",
|
| 1693 |
+
"content": """name: Multi-Platform Build
|
| 1694 |
+
on: push
|
| 1695 |
+
|
| 1696 |
+
jobs:
|
| 1697 |
+
build:
|
| 1698 |
+
runs-on: ubuntu-latest
|
| 1699 |
+
strategy:
|
| 1700 |
+
matrix:
|
| 1701 |
+
platform:
|
| 1702 |
+
- linux/amd64
|
| 1703 |
+
- linux/arm64
|
| 1704 |
+
steps:
|
| 1705 |
+
- uses: actions/checkout@v4
|
| 1706 |
+
|
| 1707 |
+
- name: Set up QEMU
|
| 1708 |
+
uses: docker/setup-qemu-action@v3
|
| 1709 |
+
|
| 1710 |
+
- name: Set up Docker Buildx
|
| 1711 |
+
uses: docker/setup-buildx-action@v3
|
| 1712 |
+
|
| 1713 |
+
- name: Build
|
| 1714 |
+
uses: docker/build-push-action@v5
|
| 1715 |
+
with:
|
| 1716 |
+
context: .
|
| 1717 |
+
platforms: ${{ matrix.platform }}
|
| 1718 |
+
push: false"""
|
| 1719 |
+
},
|
| 1720 |
+
{
|
| 1721 |
+
"path": "Dockerfile",
|
| 1722 |
+
"type": "dockerfile",
|
| 1723 |
+
"content": """FROM --platform=$BUILDPLATFORM node:18 AS builder
|
| 1724 |
+
WORKDIR /app
|
| 1725 |
+
COPY package*.json ./
|
| 1726 |
+
RUN npm ci
|
| 1727 |
+
COPY . .
|
| 1728 |
+
RUN npm run build
|
| 1729 |
+
|
| 1730 |
+
FROM --platform=$TARGETPLATFORM nginx:alpine
|
| 1731 |
+
COPY --from=builder /app/build /usr/share/nginx/html
|
| 1732 |
+
EXPOSE 80"""
|
| 1733 |
+
},
|
| 1734 |
+
{
|
| 1735 |
+
"path": "package.json",
|
| 1736 |
+
"type": "other",
|
| 1737 |
+
"content": '{"name": "app", "scripts": {"build": "echo build"}}'
|
| 1738 |
+
}
|
| 1739 |
+
],
|
| 1740 |
+
"error": {
|
| 1741 |
+
"phase": "docker_build",
|
| 1742 |
+
"message": "failed to solve: failed to parse platform : \"\" is not a valid platform",
|
| 1743 |
+
"exit_code": 1,
|
| 1744 |
+
"failed_step": "Build"
|
| 1745 |
+
},
|
| 1746 |
+
"expected_fixes": [
|
| 1747 |
+
{
|
| 1748 |
+
"file": "Dockerfile",
|
| 1749 |
+
"type": "contains",
|
| 1750 |
+
"expected": "ARG BUILDPLATFORM",
|
| 1751 |
+
"hint": "Platform ARGs must be declared before use"
|
| 1752 |
+
},
|
| 1753 |
+
{
|
| 1754 |
+
"file": "Dockerfile",
|
| 1755 |
+
"type": "contains",
|
| 1756 |
+
"expected": "ARG TARGETPLATFORM",
|
| 1757 |
+
"hint": "Both BUILDPLATFORM and TARGETPLATFORM need ARG declarations"
|
| 1758 |
+
}
|
| 1759 |
+
]
|
| 1760 |
+
},
|
| 1761 |
+
|
| 1762 |
+
# Scenario 3: Cross-job artifact dependency failure
|
| 1763 |
+
{
|
| 1764 |
+
"id": "cross_job_artifact",
|
| 1765 |
+
"files": [
|
| 1766 |
+
{
|
| 1767 |
+
"path": ".github/workflows/build.yml",
|
| 1768 |
+
"type": "workflow",
|
| 1769 |
+
"content": """name: Build and Test
|
| 1770 |
+
on: push
|
| 1771 |
+
|
| 1772 |
+
jobs:
|
| 1773 |
+
build:
|
| 1774 |
+
runs-on: ubuntu-latest
|
| 1775 |
+
steps:
|
| 1776 |
+
- uses: actions/checkout@v4
|
| 1777 |
+
|
| 1778 |
+
- name: Build
|
| 1779 |
+
run: |
|
| 1780 |
+
docker build -t myapp:${{ github.sha }} .
|
| 1781 |
+
docker save myapp:${{ github.sha }} > image.tar
|
| 1782 |
+
|
| 1783 |
+
- uses: actions/upload-artifact@v4
|
| 1784 |
+
with:
|
| 1785 |
+
name: docker-image
|
| 1786 |
+
path: image.tar
|
| 1787 |
+
|
| 1788 |
+
test:
|
| 1789 |
+
runs-on: ubuntu-latest
|
| 1790 |
+
steps:
|
| 1791 |
+
- name: Download image
|
| 1792 |
+
uses: actions/download-artifact@v4
|
| 1793 |
+
with:
|
| 1794 |
+
name: docker-image
|
| 1795 |
+
|
| 1796 |
+
- name: Load and test
|
| 1797 |
+
run: |
|
| 1798 |
+
docker load < image.tar
|
| 1799 |
+
docker run myapp:${{ github.sha }} pytest"""
|
| 1800 |
+
},
|
| 1801 |
+
{
|
| 1802 |
+
"path": "Dockerfile",
|
| 1803 |
+
"type": "dockerfile",
|
| 1804 |
+
"content": """FROM python:3.9
|
| 1805 |
+
WORKDIR /app
|
| 1806 |
+
COPY . .
|
| 1807 |
+
RUN pip install pytest
|
| 1808 |
+
CMD ["python", "app.py"]"""
|
| 1809 |
+
}
|
| 1810 |
+
],
|
| 1811 |
+
"error": {
|
| 1812 |
+
"phase": "workflow_parse",
|
| 1813 |
+
"message": "The workflow is not valid. .github/workflows/build.yml (Line: 22, Col: 5): Job 'test' depends on unknown job 'build'",
|
| 1814 |
+
"exit_code": 1
|
| 1815 |
+
},
|
| 1816 |
+
"expected_fixes": [
|
| 1817 |
+
{
|
| 1818 |
+
"file": ".github/workflows/build.yml",
|
| 1819 |
+
"type": "contains",
|
| 1820 |
+
"expected": "needs: build",
|
| 1821 |
+
"hint": "Test job needs to declare dependency on build job"
|
| 1822 |
+
}
|
| 1823 |
+
]
|
| 1824 |
+
},
|
| 1825 |
+
|
| 1826 |
+
# Scenario 4: Multiple interacting issues
|
| 1827 |
+
{
|
| 1828 |
+
"id": "multiple_issues",
|
| 1829 |
+
"files": [
|
| 1830 |
+
{
|
| 1831 |
+
"path": ".github/workflows/build.yml",
|
| 1832 |
+
"type": "workflow",
|
| 1833 |
+
"content": """name: Full Pipeline
|
| 1834 |
+
on: push
|
| 1835 |
+
|
| 1836 |
+
jobs:
|
| 1837 |
+
build:
|
| 1838 |
+
runs-on: ubuntu-latest
|
| 1839 |
+
steps:
|
| 1840 |
+
- uses: actions/checkout@v4
|
| 1841 |
+
|
| 1842 |
+
- name: Login
|
| 1843 |
+
run: echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin
|
| 1844 |
+
|
| 1845 |
+
- name: Build and Push
|
| 1846 |
+
run: |
|
| 1847 |
+
docker build -t myuser/myapp:latest .
|
| 1848 |
+
docker push myuser/myapp:latest"""
|
| 1849 |
+
},
|
| 1850 |
+
{
|
| 1851 |
+
"path": "Dockerfile",
|
| 1852 |
+
"type": "dockerfile",
|
| 1853 |
+
"content": """FROM python:3.9-slim AS builder
|
| 1854 |
+
WORKDIR /app
|
| 1855 |
+
COPY requirments.txt .
|
| 1856 |
+
RUN pip install -r requirements.txt
|
| 1857 |
+
COPY . .
|
| 1858 |
+
|
| 1859 |
+
FROM python:3.9-slim
|
| 1860 |
+
WORKDIR /app
|
| 1861 |
+
COPY --from=builder /app .
|
| 1862 |
+
CMD ["python", "app.py"]"""
|
| 1863 |
+
},
|
| 1864 |
+
{
|
| 1865 |
+
"path": "requirements.txt",
|
| 1866 |
+
"type": "requirements",
|
| 1867 |
+
"content": "flask==2.0.0"
|
| 1868 |
+
}
|
| 1869 |
+
],
|
| 1870 |
+
"error": {
|
| 1871 |
+
"phase": "docker_build",
|
| 1872 |
+
"message": "COPY failed: file not found in build context: requirments.txt\nAdditionally: Error: Cannot perform an interactive login from a non TTY device",
|
| 1873 |
+
"exit_code": 1
|
| 1874 |
+
},
|
| 1875 |
+
"expected_fixes": [
|
| 1876 |
+
{
|
| 1877 |
+
"file": "Dockerfile",
|
| 1878 |
+
"type": "contains",
|
| 1879 |
+
"expected": "COPY requirements.txt",
|
| 1880 |
+
"hint": "Fix typo in requirements filename"
|
| 1881 |
+
},
|
| 1882 |
+
{
|
| 1883 |
+
"file": ".github/workflows/build.yml",
|
| 1884 |
+
"type": "contains",
|
| 1885 |
+
"expected": "DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}",
|
| 1886 |
+
"hint": "Add env block for secrets"
|
| 1887 |
+
},
|
| 1888 |
+
{
|
| 1889 |
+
"file": ".github/workflows/build.yml",
|
| 1890 |
+
"type": "contains",
|
| 1891 |
+
"expected": "DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}",
|
| 1892 |
+
"hint": "Add password to env block"
|
| 1893 |
+
}
|
| 1894 |
+
]
|
| 1895 |
+
}
|
| 1896 |
+
]
|
| 1897 |
+
|
| 1898 |
+
def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
    """Return one multi-stage pipeline scenario, by ID or at random.

    Args:
        scenario_id: ID of the scenario to load. When omitted (or empty),
            one scenario is picked with ``random.choice``.

    Returns:
        A deep copy of the selected scenario dict; the class-level
        ``SCENARIOS`` entries are never handed out directly, so callers
        may mutate the result freely.

    Raises:
        ValueError: If ``scenario_id`` is given but matches no scenario.
    """
    import copy  # local import keeps this block self-contained

    if scenario_id:
        by_id = {s["id"]: s for s in self.SCENARIOS}
        if scenario_id not in by_id:
            raise ValueError(f"Unknown scenario: {scenario_id}")
        chosen = by_id[scenario_id]
    else:
        chosen = random.choice(self.SCENARIOS)
    return copy.deepcopy(chosen)
|
| 1905 |
+
```
|
| 1906 |
+
|
| 1907 |
+
---
|
| 1908 |
+
|
| 1909 |
+
# 7. GRADER IMPLEMENTATION
|
| 1910 |
+
|
| 1911 |
+
## 7.1 Grader Logic (server/graders/__init__.py)
|
| 1912 |
+
|
| 1913 |
+
```python
|
| 1914 |
+
"""
|
| 1915 |
+
Deterministic graders for CI/CD debugging tasks.
|
| 1916 |
+
|
| 1917 |
+
Grading Philosophy:
|
| 1918 |
+
- 100% deterministic (same input = same output)
|
| 1919 |
+
- Dynamic scoring based on what the agent actually fixes
|
| 1920 |
+
- Granular partial credit (completion, action quality, efficiency)
|
| 1921 |
+
- Score breakdown for transparency
|
| 1922 |
+
- Penalties for hints used
|
| 1923 |
+
"""
|
| 1924 |
+
|
| 1925 |
+
from typing import List, Dict, Any
|
| 1926 |
+
from models import GraderResult, TaskDifficulty
|
| 1927 |
+
from tasks.task_registry import TASK_REGISTRY
|
| 1928 |
+
|
| 1929 |
+
|
| 1930 |
+
def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
    """Grade a trajectory for a given task.

    Scoring breakdown (final score is clamped to [0, 1]):
    - Completion: ``0.6 * issues_fixed / issues_total``, read from the
      ``info`` dict of the last trajectory step.
    - Full solution bonus: +0.2 when every issue is fixed.
    - Efficiency: +0.2 when ``len(trajectory) <= issues_total``, minus 0.02
      per extra step, never below 0. Hint requests count as steps.
    - Hint penalty: -0.05 per step whose action type is ``"request_hint"``.

    Action quality is not scored by this function.

    Args:
        task_id: Key of ``TASK_REGISTRY`` identifying the task being graded.
        trajectory: Ordered step records. Each may carry an ``"action"``
            dict (with ``"action_type"``) and an ``"info"`` dict (with
            ``"issues_fixed"`` / ``"issues_total"``).

    Returns:
        A ``GraderResult`` with the rounded score, per-component breakdown,
        feedback text, step count and hint count.

    Raises:
        ValueError: If ``task_id`` is not registered.
    """
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task: {task_id}")

    if not trajectory:
        return GraderResult(
            task_id=task_id,
            score=0.0,
            breakdown={"error": "Empty trajectory"},
            feedback="No actions taken",
            steps_taken=0,
            hints_used=0
        )

    steps_taken = len(trajectory)

    # ``or {}`` tolerates steps that record ``"action": None``.
    hints_used = sum(
        1 for step in trajectory
        if (step.get("action") or {}).get("action_type") == "request_hint"
    )

    # Progress counters come from the final step only.
    info = trajectory[-1].get("info") or {}
    issues_total = info.get("issues_total", 1)
    # Clamp so a malformed counter can neither go negative nor exceed the total.
    issues_fixed = max(0, min(info.get("issues_fixed", 0), issues_total))

    score = 0.0
    breakdown = {}

    # Per-issue credit (0.6 total for fixing all)
    fix_ratio = issues_fixed / issues_total if issues_total > 0 else 0
    fix_score = 0.6 * fix_ratio
    breakdown["issues_fixed"] = fix_score
    score += fix_score

    # Full solution bonus (0.2). A task reporting zero issues has nothing to
    # solve, so "0 of 0 fixed" must not count as a complete solution.
    solved_everything = issues_total > 0 and issues_fixed == issues_total
    breakdown["complete_solution"] = 0.2 if solved_everything else 0.0
    score += breakdown["complete_solution"]

    # Efficiency bonus (0.2 max). Optimal is one step per issue; each extra
    # step costs 0.02, floored at 0.
    # NOTE(review): this bonus is granted even when no issue was fixed, so a
    # one-step trajectory with no progress still scores 0.2 - confirm intent.
    extra_steps = max(0, steps_taken - issues_total)
    efficiency_score = max(0, 0.2 - (extra_steps * 0.02))
    breakdown["efficiency"] = efficiency_score
    score += efficiency_score

    # Hint penalty
    hint_penalty = hints_used * 0.05
    breakdown["hint_penalty"] = -hint_penalty
    score -= hint_penalty

    # Clamp to [0, 1]
    score = max(0.0, min(1.0, score))

    # Generate feedback
    if score >= 0.9:
        feedback = "Excellent! All issues fixed efficiently."
    elif score >= 0.7:
        feedback = "Good job! Most issues fixed."
    elif score >= 0.5:
        feedback = "Partial success. Some issues remain."
    elif score >= 0.3:
        feedback = "Limited progress. Review the error messages carefully."
    else:
        feedback = "Needs improvement. Try analyzing the error phase first."

    return GraderResult(
        task_id=task_id,
        score=round(score, 3),
        breakdown={k: round(v, 3) for k, v in breakdown.items()},
        feedback=feedback,
        steps_taken=steps_taken,
        hints_used=hints_used
    )
|
| 2028 |
+
```
|
| 2029 |
+
|
| 2030 |
+
---
|
| 2031 |
+
|
| 2032 |
+
# 8. REWARD FUNCTION DESIGN
|
| 2033 |
+
|
| 2034 |
+
## Dense Reward Strategy
|
| 2035 |
+
|
| 2036 |
+
```python
|
| 2037 |
+
"""
|
| 2038 |
+
Reward Function Design
|
| 2039 |
+
|
| 2040 |
+
Properties:
|
| 2041 |
+
1. Dense (signal at every step, not just end)
|
| 2042 |
+
2. Shaped (guides toward solution)
|
| 2043 |
+
3. Bounded [0, 1] per step
|
| 2044 |
+
4. Cumulative episode reward can exceed 1.0
|
| 2045 |
+
|
| 2046 |
+
Reward Components:
|
| 2047 |
+
- Syntax validation: +0.1 when file becomes syntactically valid
|
| 2048 |
+
- Issue identification: +0.1 when agent actions target correct file/line
|
| 2049 |
+
- Partial fix: +0.2 when fix is partially correct
|
| 2050 |
+
- Full fix: +0.3 when issue is fully resolved
|
| 2051 |
+
- Submit bonus: +0.0 to +0.5 based on final validation
|
| 2052 |
+
- Hint penalty: -0.05 per hint
|
| 2053 |
+
|
| 2054 |
+
This creates a curriculum:
|
| 2055 |
+
- Agent learns to identify issues first (+0.1)
|
| 2056 |
+
- Then learns to fix them (+0.2 to +0.3)
|
| 2057 |
+
- Finally learns to validate (+0.0 to +0.5)
|
| 2058 |
+
"""
|
| 2059 |
+
|
| 2060 |
+
def calculate_step_reward(
    prev_state: EnvironmentState,
    action: Action,
    new_state: EnvironmentState
) -> float:
    """Return the shaped reward earned by a single environment step.

    The result is the plain sum of the bonuses and penalties below; it is
    not clamped. The submit bonus is not added here: the original comment
    states it is handled separately in ``_handle_submit``.

    Args:
        prev_state: Environment state before the action was applied.
        action: The action taken on this step.
        new_state: Environment state after the action was applied.

    Returns:
        The summed step reward (may be negative when only penalties apply).
    """
    total = 0.0

    # 1. Syntax validation: +0.1 per file that went from invalid to valid.
    for path in new_state.files:
        was_valid = prev_state.file_valid.get(path, False)
        is_valid = new_state.file_valid.get(path, False)
        if is_valid and not was_valid:
            total += 0.1

    # 2. Issue targeting: +0.1 per edit aimed at an expected fix location.
    for edit in action.edits or ():
        if is_correct_target(edit, new_state.expected_fixes):
            total += 0.1

    # 3. Fix progress: +0.3 for each issue newly fixed by this step.
    fixed_delta = new_state.issues_fixed - prev_state.issues_fixed
    if fixed_delta > 0:
        total += 0.3 * fixed_delta

    # 4. Submit reward: intentionally nothing to add here (see docstring).

    # 5. Hint penalty.
    if action.action_type == ActionType.REQUEST_HINT:
        total -= 0.05

    # 6. Small penalty when the action itself failed.
    if not new_state.last_action_success:
        total -= 0.02

    return total
|
| 2101 |
+
```
|
| 2102 |
+
|
| 2103 |
+
---
|
| 2104 |
+
|
| 2105 |
+
# 9. BASELINE INFERENCE SCRIPT
|
| 2106 |
+
|
| 2107 |
+
## inference.py (Root Directory)
|
| 2108 |
+
|
| 2109 |
+
```python
|
| 2110 |
+
"""
|
| 2111 |
+
Baseline Inference Script for CI/CD Debug Environment
|
| 2112 |
+
======================================================
|
| 2113 |
+
|
| 2114 |
+
MANDATORY REQUIREMENTS:
|
| 2115 |
+
- Uses OpenAI Client for all LLM calls
|
| 2116 |
+
- Reads API_BASE_URL, MODEL_NAME, HF_TOKEN from environment
|
| 2117 |
+
- Named 'inference.py' in root directory
|
| 2118 |
+
- Runtime < 20 minutes
|
| 2119 |
+
- Works on vcpu=2, memory=8gb
|
| 2120 |
+
|
| 2121 |
+
This baseline demonstrates a simple but effective approach:
|
| 2122 |
+
1. Parse the error message to identify error type
|
| 2123 |
+
2. Locate the problematic file and line
|
| 2124 |
+
3. Apply appropriate fix based on error pattern
|
| 2125 |
+
4. Submit and verify
|
| 2126 |
+
"""
|
| 2127 |
+
|
| 2128 |
+
import os
|
| 2129 |
+
import re
|
| 2130 |
+
import json
|
| 2131 |
+
import time
|
| 2132 |
+
from typing import List, Dict, Any, Optional
|
| 2133 |
+
|
| 2134 |
+
import requests
|
| 2135 |
+
from openai import OpenAI
|
| 2136 |
+
|
| 2137 |
+
# ============== CONFIGURATION ==============
|
| 2138 |
+
|
| 2139 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 2140 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 2141 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-70B-Instruct")
|
| 2142 |
+
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860")
|
| 2143 |
+
|
| 2144 |
+
MAX_STEPS = 10
|
| 2145 |
+
TEMPERATURE = 0.2
|
| 2146 |
+
MAX_TOKENS = 500
|
| 2147 |
+
|
| 2148 |
+
# ============== SYSTEM PROMPT ==============
|
| 2149 |
+
|
| 2150 |
+
SYSTEM_PROMPT = """You are an expert DevOps engineer debugging CI/CD infrastructure.
|
| 2151 |
+
|
| 2152 |
+
You are given:
|
| 2153 |
+
1. Error message from a failed Docker build or GitHub Actions workflow
|
| 2154 |
+
2. The relevant configuration files (Dockerfile, workflow YAML)
|
| 2155 |
+
3. Available actions to fix the issues
|
| 2156 |
+
|
| 2157 |
+
Your task is to identify and fix the issues. Common problems include:
|
| 2158 |
+
- Typos in filenames (requirments.txt vs requirements.txt)
|
| 2159 |
+
- Missing environment variable references for secrets
|
| 2160 |
+
- Wrong file paths in COPY commands
|
| 2161 |
+
- Missing steps (checkout before build, buildx for multi-platform)
|
| 2162 |
+
- Invalid syntax in YAML or Dockerfile
|
| 2163 |
+
|
| 2164 |
+
Respond with a JSON object containing your action:
|
| 2165 |
+
{
|
| 2166 |
+
"action_type": "replace_line" | "add_line" | "edit_file" | "submit" | "request_hint",
|
| 2167 |
+
"edits": [
|
| 2168 |
+
{
|
| 2169 |
+
"file_path": "path/to/file",
|
| 2170 |
+
"line_number": 5,
|
| 2171 |
+
"old_content": "old text",
|
| 2172 |
+
"new_content": "new text"
|
| 2173 |
+
}
|
| 2174 |
+
],
|
| 2175 |
+
"reasoning": "Brief explanation of the fix"
|
| 2176 |
+
}
|
| 2177 |
+
|
| 2178 |
+
When you believe all issues are fixed, use action_type: "submit".
|
| 2179 |
+
Be precise and fix one issue at a time."""
|
| 2180 |
+
|
| 2181 |
+
# ============== HELPER FUNCTIONS ==============
|
| 2182 |
+
|
| 2183 |
+
def build_user_prompt(observation: Dict) -> str:
    """Render an environment observation as the user prompt for the model.

    Args:
        observation: Observation dict from the environment. Missing keys fall
            back to placeholder text. A null ``files`` or ``error`` value is
            treated the same as a missing one.

    Returns:
        A markdown prompt with the task state, the error details, every file
        with 1-based line numbers, and the last action feedback.

    Raises:
        KeyError: If a file entry lacks ``content`` or ``path``.
    """
    sections = []
    for entry in observation.get("files") or []:
        # Number the lines so the model can refer to them in its edits.
        numbered = "\n".join(
            f"{number:3}: {line}"
            for number, line in enumerate(entry["content"].split("\n"), start=1)
        )
        sections.append(f"\n### {entry['path']}\n```\n{numbered}\n```\n")
    files_str = "".join(sections)

    # ``.get("error", {})`` still returns None when the key is present but
    # null, which would make the ``error.get`` lookups below raise.
    error = observation.get("error") or {}

    prompt = f"""## Current State
Task: {observation.get('task_description', 'Fix CI/CD issues')}
Difficulty: {observation.get('difficulty', 'unknown')}
Step: {observation.get('step_number', 0)}/{observation.get('max_steps', 10)}
Issues Fixed: {observation.get('issues_fixed', 0)}/{observation.get('total_issues', '?')}

## Error Information
Phase: {error.get('phase', 'unknown')}
Message: {error.get('error_message', 'No error message')}
Failed Step: {error.get('failed_step', 'unknown')}
Line Hint: {error.get('line_hint', 'none')}

## Files
{files_str}

## Last Action Feedback
{observation.get('last_action_feedback', 'None')}

Analyze the error and provide your fix as JSON."""

    return prompt
|
| 2217 |
+
|
| 2218 |
+
|
| 2219 |
+
def parse_model_response(response_text: str) -> Dict:
    """Extract the action object from the model's reply.

    The reply is scanned left to right and the first complete JSON object is
    returned. Scanning with a real JSON decoder (rather than a brace regex)
    keeps nested structures intact, so an action carrying an ``edits`` array
    of objects is returned whole instead of as its innermost object.

    Args:
        response_text: Raw text returned by the model; may wrap the JSON in
            prose or a fenced code block.

    Returns:
        The decoded action dict, or a ``request_hint`` fallback action when
        no JSON object can be decoded.
    """
    decoder = json.JSONDecoder()

    start = response_text.find("{")
    while start != -1:
        try:
            # Decoding begins at a "{", so a successful result is a dict.
            candidate, _ = decoder.raw_decode(response_text, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next one.
            start = response_text.find("{", start + 1)
        else:
            return candidate

    # Default action
    return {
        "action_type": "request_hint",
        "reasoning": "Could not parse response"
    }
|
| 2242 |
+
|
| 2243 |
+
|
| 2244 |
+
def call_environment(endpoint: str, method: str = "GET", data: Dict = None) -> Dict:
    """Send one HTTP request to the environment server and return its JSON body.

    ``"GET"`` issues a GET request; any other ``method`` value is sent as a
    POST whose JSON body is ``data`` (an empty object when ``data`` is falsy).
    Both calls use a 30 second timeout, and ``raise_for_status`` is called
    on the response before its body is decoded.
    """
    target = f"{ENV_URL}{endpoint}"

    if method != "GET":
        reply = requests.post(target, json=data or {}, timeout=30)
    else:
        reply = requests.get(target, timeout=30)

    reply.raise_for_status()
    return reply.json()
|
| 2256 |
+
|
| 2257 |
+
|
| 2258 |
+
# ============== MAIN INFERENCE LOOP ==============
|
| 2259 |
+
|
| 2260 |
+
def run_episode(task_id: Optional[str] = None) -> Dict:
    """Run a single episode and return the grader's result.

    Resets the environment for ``task_id``, then for up to ``MAX_STEPS`` steps
    asks the model for an action, applies it through ``/step`` and records
    the outcome. The recorded trajectory is finally posted to ``/grader``.

    Args:
        task_id: Task to run; sent as-is (possibly ``None``) in the reset
            payload.

    Returns:
        The ``result`` object from the ``/grader`` response.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # Reset environment
    reset_response = call_environment("/reset", "POST", {"task_id": task_id})
    observation = reset_response["observation"]

    print(f"Starting episode: {observation['task_id']}")
    print(f"Task: {observation['task_description']}")
    print(f"Difficulty: {observation['difficulty']}")

    trajectory = []

    for step in range(1, MAX_STEPS + 1):
        # Build prompt
        user_prompt = build_user_prompt(observation)

        # Call LLM. Any failure is deliberately best-effort: log it and fall
        # back to requesting a hint so the episode can continue.
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=TEMPERATURE,
                max_tokens=MAX_TOKENS
            )
            response_text = completion.choices[0].message.content or ""
        except Exception as e:
            print(f"LLM error: {e}")
            response_text = '{"action_type": "request_hint"}'

        # Parse action
        action = parse_model_response(response_text)
        # The model may emit "reasoning": null or a non-string value, which
        # cannot be sliced; coerce to text before truncating for the log.
        reasoning = str(action.get("reasoning") or "")
        print(f"Step {step}: {action.get('action_type')} - {reasoning[:50]}")

        # Take step
        step_response = call_environment("/step", "POST", {"action": action})

        observation = step_response["observation"]
        reward = step_response["reward"]
        done = step_response["done"]
        info = step_response["info"]

        trajectory.append({
            "step": step,
            "action": action,
            "reward": reward,
            "done": done,
            "info": info
        })

        print(f"  Reward: {reward:.3f} | Done: {done} | Fixed: {info.get('issues_fixed', 0)}/{info.get('issues_total', '?')}")

        if done:
            break

    # Get final grading
    grader_response = call_environment("/grader", "POST", {
        "task_id": observation["task_id"],
        "trajectory": trajectory
    })

    result = grader_response["result"]
    print(f"\nFinal Score: {result['score']:.3f}")
    print(f"Feedback: {result['feedback']}")

    return result
|
| 2334 |
+
|
| 2335 |
+
|
| 2336 |
+
def main():
    """Run the baseline once per task reported by ``/info`` and print a summary.

    Returns:
        The list of per-task grader results, in the order the tasks were run.
    """
    rule = "=" * 60

    print(rule)
    print("CI/CD Debug Environment - Baseline Inference")
    print(rule)
    print(f"API: {API_BASE_URL}")
    print(f"Model: {MODEL_NAME}")
    print(f"Environment: {ENV_URL}")
    print()

    # Get available tasks
    tasks = call_environment("/info")["tasks"]

    results = []
    for task in tasks:
        print(f"\n{rule}")
        print(f"Task: {task['name']} ({task['difficulty']})")
        print(rule)

        results.append(run_episode(task["id"]))

        time.sleep(1)  # Rate limiting

    # Summary
    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    total_score = 0
    for task, result in zip(tasks, results):
        print(f"{task['name']}: {result['score']:.3f}")
        total_score += result["score"]

    if results:
        avg_score = total_score / len(results)
    else:
        avg_score = 0
    print(f"\nAverage Score: {avg_score:.3f}")

    return results
|
| 2377 |
+
|
| 2378 |
+
|
| 2379 |
+
if __name__ == "__main__":
|
| 2380 |
+
main()
|
| 2381 |
+
```
|
| 2382 |
+
|
| 2383 |
+
---
|
| 2384 |
+
|
| 2385 |
+
# 10. DOCKERFILE & DEPLOYMENT
|
| 2386 |
+
|
| 2387 |
+
## 10.1 Dockerfile
|
| 2388 |
+
|
| 2389 |
+
```dockerfile
|
| 2390 |
+
# Multi-stage build for smaller image
FROM python:3.11-slim AS builder

WORKDIR /app

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    && rm -rf /var/lib/apt/lists/*

# Copy and install requirements
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt

# Production stage
FROM python:3.11-slim

WORKDIR /app

# Create non-root user for security (created first so the copied packages
# can be owned by it)
RUN useradd --create-home appuser

# Copy installed packages from builder into appuser's own user site.
# Leaving them in /root/.local would break the server: /root is not readable
# by a non-root user, and Python only searches the current user's ~/.local.
COPY --from=builder --chown=appuser:appuser /root/.local /home/appuser/.local
ENV PATH=/home/appuser/.local/bin:$PATH

# Copy application code
COPY server/ ./server/
COPY data/ ./data/
COPY openenv.yaml .
COPY inference.py .

USER appuser

# Expose port (HuggingFace Spaces uses 7860)
EXPOSE 7860

# Health check (raise_for_status makes a non-2xx answer count as unhealthy)
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python -c "import requests; requests.get('http://localhost:7860/', timeout=5).raise_for_status()" || exit 1

# Run the server
CMD ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
| 2432 |
+
```
|
| 2433 |
+
|
| 2434 |
+
## 10.2 requirements.txt
|
| 2435 |
+
|
| 2436 |
+
```
|
| 2437 |
+
# Core
|
| 2438 |
+
fastapi==0.109.0
|
| 2439 |
+
uvicorn[standard]==0.27.0
|
| 2440 |
+
pydantic==2.5.3
|
| 2441 |
+
|
| 2442 |
+
# HTTP client
|
| 2443 |
+
requests==2.31.0
|
| 2444 |
+
httpx==0.26.0
|
| 2445 |
+
|
| 2446 |
+
# OpenAI client (for baseline)
|
| 2447 |
+
openai==1.12.0
|
| 2448 |
+
|
| 2449 |
+
# YAML parsing (for workflow validation)
|
| 2450 |
+
pyyaml==6.0.1
|
| 2451 |
+
ruamel.yaml==0.18.5
|
| 2452 |
+
|
| 2453 |
+
# Testing
|
| 2454 |
+
pytest==7.4.4
|
| 2455 |
+
pytest-asyncio==0.23.3
|
| 2456 |
+
|
| 2457 |
+
# Utilities
|
| 2458 |
+
python-dotenv==1.0.0
|
| 2459 |
+
```
|
| 2460 |
+
|
| 2461 |
+
## 10.3 HuggingFace Spaces Deployment
|
| 2462 |
+
|
| 2463 |
+
```yaml
|
| 2464 |
+
# README.md for HF Space
|
| 2465 |
+
---
|
| 2466 |
+
title: CI/CD Debug Environment
|
| 2467 |
+
emoji: 🔧
|
| 2468 |
+
colorFrom: blue
|
| 2469 |
+
colorTo: green
|
| 2470 |
+
sdk: docker
|
| 2471 |
+
app_port: 7860
|
| 2472 |
+
pinned: false
|
| 2473 |
+
license: mit
|
| 2474 |
+
---
|
| 2475 |
+
|
| 2476 |
+
# CI/CD Debug Environment
|
| 2477 |
+
|
| 2478 |
+
An OpenEnv-compliant environment for training AI agents to debug Docker and GitHub Actions issues.
|
| 2479 |
+
|
| 2480 |
+
## Quick Start
|
| 2481 |
+
|
| 2482 |
+
```bash
|
| 2483 |
+
# Reset environment
|
| 2484 |
+
curl -X POST https://your-space.hf.space/reset
|
| 2485 |
+
|
| 2486 |
+
# Take action
|
| 2487 |
+
curl -X POST https://your-space.hf.space/step \
|
| 2488 |
+
-H "Content-Type: application/json" \
|
| 2489 |
+
-d '{"action": {"action_type": "submit"}}'
|
| 2490 |
+
```
|
| 2491 |
+
|
| 2492 |
+
## Tasks
|
| 2493 |
+
|
| 2494 |
+
1. **Dockerfile Syntax** (Easy) - Fix common Dockerfile errors
|
| 2495 |
+
2. **Workflow Config** (Medium) - Fix GitHub Actions + Docker issues
|
| 2496 |
+
3. **Multi-Stage Pipeline** (Hard) - Debug complex CI/CD pipelines
|
| 2497 |
+
```
|
| 2498 |
+
|
| 2499 |
+
---
|
| 2500 |
+
|
| 2501 |
+
# 11. TESTING PLAN
|
| 2502 |
+
|
| 2503 |
+
## 11.1 Test Categories
|
| 2504 |
+
|
| 2505 |
+
```python
|
| 2506 |
+
# tests/test_endpoints.py
|
| 2507 |
+
"""Test all required OpenEnv endpoints."""
|
| 2508 |
+
|
| 2509 |
+
import pytest
|
| 2510 |
+
from fastapi.testclient import TestClient
|
| 2511 |
+
from server.main import app
|
| 2512 |
+
|
| 2513 |
+
client = TestClient(app)
|
| 2514 |
+
|
| 2515 |
+
|
| 2516 |
+
class TestEndpoints:
    """Verify all 8 endpoints work correctly.

    Covers GET /, POST /reset, POST /step, GET /state, GET /info, GET /tasks,
    POST /grader and POST /baseline. All tests share the module-level
    ``client``, so environment state carries over between them.
    """

    def test_root_health(self):
        """GET / returns healthy status."""
        response = client.get("/")
        assert response.status_code == 200
        assert response.json()["status"] == "healthy"

    def test_reset_returns_observation(self):
        """POST /reset returns valid observation."""
        response = client.post("/reset", json={})
        assert response.status_code == 200
        data = response.json()
        assert "observation" in data
        assert "task_id" in data["observation"]
        assert "files" in data["observation"]
        assert "error" in data["observation"]

    def test_step_requires_reset(self):
        """POST /step without a reset in this test does not crash the server."""
        # NOTE: ``client`` is shared, so an earlier test may already have reset
        # the environment. Whether the step is rejected or accepted is
        # implementation dependent; only a server error (5xx) is a failure.
        response = client.post("/step", json={
            "action": {"action_type": "submit"}
        })
        assert response.status_code < 500

    def test_step_returns_result(self):
        """POST /step returns observation, reward, done."""
        client.post("/reset", json={})
        response = client.post("/step", json={
            "action": {"action_type": "request_hint"}
        })
        assert response.status_code == 200
        data = response.json()
        assert "observation" in data
        assert "reward" in data
        assert "done" in data

    def test_state_returns_current(self):
        """GET /state returns current observation."""
        client.post("/reset", json={})
        response = client.get("/state")
        assert response.status_code == 200
        assert "observation" in response.json()

    def test_info_returns_metadata(self):
        """GET /info returns environment metadata."""
        response = client.get("/info")
        assert response.status_code == 200
        data = response.json()
        assert "tasks" in data
        assert len(data["tasks"]) >= 3

    def test_tasks_returns_list(self):
        """GET /tasks returns task list."""
        response = client.get("/tasks")
        assert response.status_code == 200
        assert "tasks" in response.json()

    def test_grader_returns_score(self):
        """POST /grader returns valid score."""
        response = client.post("/grader", json={
            "task_id": "dockerfile_syntax",
            "trajectory": []
        })
        assert response.status_code == 200
        result = response.json()["result"]
        assert 0.0 <= result["score"] <= 1.0

    def test_baseline_runs(self):
        """POST /baseline executes baseline script."""
        response = client.post("/baseline", json={
            "task_id": "dockerfile_syntax",
            "num_episodes": 1
        })
        assert response.status_code == 200
|
| 2594 |
+
|
| 2595 |
+
|
| 2596 |
+
# tests/test_graders.py
|
| 2597 |
+
"""Test grader determinism and correctness."""
|
| 2598 |
+
|
| 2599 |
+
class TestGraderDeterminism:
    """Check that grading is repeatable and that scores stay in range."""

    def test_same_trajectory_same_score(self):
        """Grading one trajectory twice gives the same score and breakdown."""
        final_step = {
            "step": 1,
            "action": {"action_type": "submit"},
            "reward": 0.5,
            "done": True,
            "info": {"issues_fixed": 1, "issues_total": 2},
        }
        trajectory = [final_step]

        first = run_grader("dockerfile_syntax", trajectory)
        second = run_grader("dockerfile_syntax", trajectory)

        assert first.score == second.score
        assert first.breakdown == second.breakdown

    def test_score_in_valid_range(self):
        """Scores for 100 generated trajectories all lie within [0.0, 1.0]."""
        for _ in range(100):
            sample = generate_random_trajectory()
            graded = run_grader("dockerfile_syntax", sample)
            assert 0.0 <= graded.score <= 1.0
|
| 2620 |
+
|
| 2621 |
+
|
| 2622 |
+
# tests/test_tasks.py
|
| 2623 |
+
"""Test task scenarios."""
|
| 2624 |
+
|
| 2625 |
+
class TestTaskScenarios:
    """Check the scenario data attached to every registered task class."""

    def test_each_task_has_3_plus_scenarios(self):
        """Each task class defines at least three scenarios."""
        for task_id, task_cls in TASK_REGISTRY.items():
            scenario_count = len(task_cls.SCENARIOS)
            assert scenario_count >= 3, f"{task_id} has < 3 scenarios"

    def test_scenarios_have_required_fields(self):
        """Each scenario carries every required top-level field."""
        required = ["id", "files", "error", "expected_fixes"]
        for task_id, task_cls in TASK_REGISTRY.items():
            for scenario in task_cls.SCENARIOS:
                for field in required:
                    assert field in scenario, f"{task_id} scenario missing {field}"

    def test_expected_fixes_are_verifiable(self):
        """Each expected fix names a file and uses a known check type."""
        known_types = ["contains", "not_contains", "line_equals"]
        for task_id, task_cls in TASK_REGISTRY.items():
            # Instantiation is kept from the original test; the instance
            # itself is not used afterwards.
            task = task_cls()
            for scenario in task_cls.SCENARIOS:
                for fix in scenario["expected_fixes"]:
                    assert "file" in fix
                    assert "type" in fix
                    assert fix["type"] in known_types
|
| 2650 |
+
```
|
| 2651 |
+
|
| 2652 |
+
## 11.2 Validation Script (Local)
|
| 2653 |
+
|
| 2654 |
+
```bash
|
| 2655 |
+
#!/bin/bash
# validate-local.sh - Run all checks locally

set -e

CONTAINER_NAME="test-env"

# Remove the test container whenever the script exits. Under `set -e` a
# failed check aborts the script before the cleanup step, and a leftover
# container would make the next run fail on the container name conflict.
cleanup() {
    docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true
}
trap cleanup EXIT

echo "=== 1. Running unit tests ==="
pytest tests/ -v

echo "=== 2. Building Docker image ==="
docker build -t cicd-debug-env:test .

echo "=== 3. Running container ==="
docker run -d --name "$CONTAINER_NAME" -p 7860:7860 cicd-debug-env:test
sleep 5

echo "=== 4. Testing endpoints ==="
curl -f http://localhost:7860/ || exit 1
curl -f -X POST http://localhost:7860/reset || exit 1
curl -f http://localhost:7860/info || exit 1
curl -f http://localhost:7860/tasks || exit 1

echo "=== 5. Running openenv validate ==="
openenv validate

echo "=== 6. Cleanup ==="
cleanup

echo "=== ALL CHECKS PASSED ==="
|
| 2684 |
+
```
|
| 2685 |
+
|
| 2686 |
+
---
|
| 2687 |
+
|
| 2688 |
+
# 12. TIMELINE & MILESTONES
|
| 2689 |
+
|
| 2690 |
+
## Development Schedule (Assuming 7-10 days)
|
| 2691 |
+
|
| 2692 |
+
### Day 1-2: Foundation
|
| 2693 |
+
- [x] Set up project structure
|
| 2694 |
+
- [x] Implement Pydantic models
|
| 2695 |
+
- [x] Create base FastAPI server with all endpoints
|
| 2696 |
+
- [x] Write openenv.yaml
|
| 2697 |
+
|
| 2698 |
+
### Day 3-4: Core Environment
|
| 2699 |
+
- [x] Implement environment.py (reset, step, state)
|
| 2700 |
+
- [x] Create Docker simulator (validate Dockerfile syntax)
|
| 2701 |
+
- [x] Create Workflow simulator (validate YAML)
|
| 2702 |
+
- [x] Test basic episode flow
|
| 2703 |
+
|
| 2704 |
+
### Day 5-6: Tasks & Scenarios
|
| 2705 |
+
- [ ] Implement Task 1: Dockerfile Syntax (5+ scenarios)
|
| 2706 |
+
- [ ] Implement Task 2: Dockerfile Runtime (5+ scenarios)
|
| 2707 |
+
- [ ] Implement Task 3: Workflow Syntax and Structure (5+ scenarios)
|
| 2708 |
+
- [ ] Implement Task 4: Workflow Secrets and Permissions (5+ scenarios)
|
| 2709 |
+
- [ ] Implement Task 5: CI and Docker Build Integration (4+ scenarios)
|
| 2710 |
+
- [ ] Implement Task 6: Multi-Stage Pipeline and Matrix (4+ scenarios)
|
| 2711 |
+
- [ ] Verify difficulty progression
|
| 2712 |
+
|
| 2713 |
+
### Day 7: Graders & Rewards
|
| 2714 |
+
- [ ] Implement grader logic
|
| 2715 |
+
- [ ] Test determinism
|
| 2716 |
+
- [ ] Tune reward shaping
|
| 2717 |
+
- [ ] Verify score ranges
|
| 2718 |
+
|
| 2719 |
+
### Day 8: Baseline & Testing
|
| 2720 |
+
- [ ] Write inference.py baseline
|
| 2721 |
+
- [ ] Run baseline on all tasks
|
| 2722 |
+
- [ ] Verify expected scores
|
| 2723 |
+
- [ ] Full test suite
|
| 2724 |
+
|
| 2725 |
+
### Day 9: Docker & Deployment
|
| 2726 |
+
- [ ] Finalize Dockerfile
|
| 2727 |
+
- [ ] Test local Docker build/run
|
| 2728 |
+
- [ ] Deploy to HuggingFace Spaces
|
| 2729 |
+
- [ ] Run validation script
|
| 2730 |
+
|
| 2731 |
+
### Day 10: Polish & Submit
|
| 2732 |
+
- [ ] Write comprehensive README
|
| 2733 |
+
- [ ] Final testing
|
| 2734 |
+
- [ ] Submit before deadline
|
| 2735 |
+
|
| 2736 |
+
---
|
| 2737 |
+
|
| 2738 |
+
# APPENDIX: Quick Reference
|
| 2739 |
+
|
| 2740 |
+
## Required Files Checklist
|
| 2741 |
+
|
| 2742 |
+
```
|
| 2743 |
+
✓ openenv.yaml - Environment metadata
|
| 2744 |
+
✓ inference.py - Baseline script (root dir)
|
| 2745 |
+
✓ Dockerfile - Container definition
|
| 2746 |
+
✓ requirements.txt - Python dependencies
|
| 2747 |
+
✓ README.md - Documentation
|
| 2748 |
+
✓ server/main.py - FastAPI app
|
| 2749 |
+
✓ server/models.py - Pydantic models
|
| 2750 |
+
✓ server/environment.py - Core logic
|
| 2751 |
+
✓ server/tasks/*.py - 6 task definitions
|
| 2752 |
+
✓ server/graders/*.py - Grading logic
|
| 2753 |
+
```
|
| 2754 |
+
|
| 2755 |
+
## Required Endpoints
|
| 2756 |
+
|
| 2757 |
+
```
|
| 2758 |
+
GET / - Health check
|
| 2759 |
+
POST /reset - Start new episode
|
| 2760 |
+
POST /step - Take action
|
| 2761 |
+
GET /state - Current observation
|
| 2762 |
+
GET /info - Environment metadata
|
| 2763 |
+
GET /tasks - List tasks
|
| 2764 |
+
POST /grader - Grade trajectory
|
| 2765 |
+
POST /baseline - Run baseline
|
| 2766 |
+
```
|
| 2767 |
+
|
| 2768 |
+
## Environment Variables
|
| 2769 |
+
|
| 2770 |
+
```bash
|
| 2771 |
+
API_BASE_URL=https://router.huggingface.co/v1
|
| 2772 |
+
MODEL_NAME=meta-llama/Llama-3.1-70B-Instruct
|
| 2773 |
+
HF_TOKEN=your_token_here
|
| 2774 |
+
```
|
| 2775 |
+
|
| 2776 |
+
## Score Targets
|
| 2777 |
+
|
| 2778 |
+
| Task | Expected Baseline Score |
|
| 2779 |
+
|------|------------------------|
|
| 2780 |
+
| dockerfile_syntax | 0.7 |
|
| 2781 |
+
| dockerfile_runtime | 0.55 |
|
| 2782 |
+
| workflow_syntax_structure | 0.65 |
|
| 2783 |
+
| workflow_secrets_permissions | 0.5 |
|
| 2784 |
+
| ci_docker_integration | 0.45 |
|
| 2785 |
+
| multi_stage_pipeline_matrix | 0.3 |
|
| 2786 |
+
|
| 2787 |
+
---
|
| 2788 |
+
|
| 2789 |
+
*End of Implementation Plan*
|
README.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CI/CD Debug Environment
|
| 2 |
+
|
| 3 |
+
OpenEnv-style environment for debugging Docker and GitHub Actions failures.
|
| 4 |
+
|
| 5 |
+
## Day 1-2 Status
|
| 6 |
+
|
| 7 |
+
- Project scaffold created
|
| 8 |
+
- Typed Pydantic models implemented
|
| 9 |
+
- FastAPI app with core endpoints implemented
|
| 10 |
+
- Initial 6-task registry and environment loop wired
|
| 11 |
+
- Deterministic dynamic grader scaffold implemented with score breakdown
|
| 12 |
+
|
| 13 |
+
## Run locally
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
pip install -r requirements.txt
|
| 17 |
+
python -m uvicorn server.main:app --reload --port 7860
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
## Quick checks
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
curl http://localhost:7860/
|
| 24 |
+
curl -X POST http://localhost:7860/reset
|
| 25 |
+
curl http://localhost:7860/info
|
| 26 |
+
```
|
baseline_runner.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Optional, List
|
| 4 |
+
|
| 5 |
+
from server.graders import run_grader
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def run_baseline_episodes(task_id: Optional[str], num_episodes: int):
    """Grade a fixed placeholder trajectory for each requested task.

    This is a stand-in baseline for the initial setup: the Day 1-2 goal is
    wiring and endpoint functionality, not model quality.

    Args:
        task_id: Single task to run; when falsy, all six built-in task ids
            are run instead.
        num_episodes: Episodes per task; values below 1 are treated as 1.

    Returns:
        A list of ``run_grader`` results, one per task and episode.
    """
    if task_id:
        task_ids: List[str] = [task_id]
    else:
        task_ids = [
            "dockerfile_syntax",
            "dockerfile_runtime",
            "workflow_syntax_structure",
            "workflow_secrets_permissions",
            "ci_docker_integration",
            "multi_stage_pipeline_matrix",
        ]

    episodes = max(1, num_episodes)

    results = []
    for tid in task_ids:
        for _ in range(episodes):
            # A fresh single-step trajectory is built for every graded
            # episode: an immediate submit with nothing fixed.
            submit_step = {
                "step": 1,
                "action": {"action_type": "submit"},
                "reward": 0.0,
                "done": True,
                "info": {"issues_fixed": 0, "issues_total": 1},
            }
            results.append(run_grader(tid, [submit_step]))
    return results
|
inference.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Baseline inference placeholder for initial setup."""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def main():
    """Print a notice that the real baseline has not been implemented yet."""
    notice = "Baseline inference placeholder. Implement full baseline in Day 8."
    print(notice)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
if __name__ == "__main__":
|
| 9 |
+
main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: cicd-debug-env
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
Debug broken GitHub Actions workflows and Dockerfiles.
|
| 5 |
+
AI agents identify and fix CI/CD infrastructure issues.
|
| 6 |
+
|
| 7 |
+
author: Krishna
|
| 8 |
+
license: MIT
|
| 9 |
+
tags:
|
| 10 |
+
- devops
|
| 11 |
+
- docker
|
| 12 |
+
- github-actions
|
| 13 |
+
- debugging
|
| 14 |
+
- infrastructure
|
| 15 |
+
|
| 16 |
+
environment:
|
| 17 |
+
type: text
|
| 18 |
+
observation_space: structured
|
| 19 |
+
action_space: structured
|
| 20 |
+
max_steps: 10
|
| 21 |
+
|
| 22 |
+
tasks:
|
| 23 |
+
- id: dockerfile_syntax
|
| 24 |
+
name: Dockerfile Syntax Errors
|
| 25 |
+
description: Fix syntax and instruction errors in Dockerfiles
|
| 26 |
+
difficulty: easy
|
| 27 |
+
|
| 28 |
+
- id: dockerfile_runtime
|
| 29 |
+
name: Dockerfile Runtime Errors
|
| 30 |
+
description: Fix runtime/container execution issues in Dockerfiles
|
| 31 |
+
difficulty: medium
|
| 32 |
+
|
| 33 |
+
- id: workflow_syntax_structure
|
| 34 |
+
name: Workflow Syntax and Structure
|
| 35 |
+
description: Fix GitHub Actions YAML syntax and job structure issues
|
| 36 |
+
difficulty: easy
|
| 37 |
+
|
| 38 |
+
- id: workflow_secrets_permissions
|
| 39 |
+
name: Workflow Secrets and Permissions
|
| 40 |
+
description: Fix secret wiring, env usage, and permissions in workflows
|
| 41 |
+
difficulty: medium
|
| 42 |
+
|
| 43 |
+
- id: ci_docker_integration
|
| 44 |
+
name: CI and Docker Build Integration
|
| 45 |
+
description: Debug combined workflow and Docker build integration failures
|
| 46 |
+
difficulty: medium
|
| 47 |
+
|
| 48 |
+
- id: multi_stage_pipeline_matrix
|
| 49 |
+
name: Multi-Stage Pipeline and Matrix
|
| 50 |
+
description: Debug complex multi-stage and matrix CI/CD pipelines
|
| 51 |
+
difficulty: hard
|
| 52 |
+
|
| 53 |
+
graders:
|
| 54 |
+
dockerfile_syntax:
|
| 55 |
+
type: deterministic
|
| 56 |
+
score_range: [0.0, 1.0]
|
| 57 |
+
dockerfile_runtime:
|
| 58 |
+
type: deterministic
|
| 59 |
+
score_range: [0.0, 1.0]
|
| 60 |
+
workflow_syntax_structure:
|
| 61 |
+
type: deterministic
|
| 62 |
+
score_range: [0.0, 1.0]
|
| 63 |
+
workflow_secrets_permissions:
|
| 64 |
+
type: deterministic
|
| 65 |
+
score_range: [0.0, 1.0]
|
| 66 |
+
ci_docker_integration:
|
| 67 |
+
type: deterministic
|
| 68 |
+
score_range: [0.0, 1.0]
|
| 69 |
+
multi_stage_pipeline_matrix:
|
| 70 |
+
type: deterministic
|
| 71 |
+
score_range: [0.0, 1.0]
|
| 72 |
+
|
| 73 |
+
baseline:
|
| 74 |
+
script: inference.py
|
| 75 |
+
expected_scores:
|
| 76 |
+
dockerfile_syntax: 0.7
|
| 77 |
+
dockerfile_runtime: 0.55
|
| 78 |
+
workflow_syntax_structure: 0.65
|
| 79 |
+
workflow_secrets_permissions: 0.5
|
| 80 |
+
ci_docker_integration: 0.45
|
| 81 |
+
multi_stage_pipeline_matrix: 0.3
|
| 82 |
+
|
| 83 |
+
resources:
|
| 84 |
+
vcpu: 2
|
| 85 |
+
memory: 8gb
|
| 86 |
+
timeout: 1200
|
requirements.txt
ADDED
|
Binary file (4.24 kB). View file
|
|
|
sample_inf_script.py
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference Script Example
|
| 3 |
+
===================================
|
| 4 |
+
MANDATORY
|
| 5 |
+
- Before submitting, ensure the following variables are defined in your environment configuration:
|
| 6 |
+
API_BASE_URL The API endpoint for the LLM.
|
| 7 |
+
MODEL_NAME The model identifier to use for inference.
|
| 8 |
+
HF_TOKEN Your Hugging Face / API key.
|
| 9 |
+
|
| 10 |
+
- The inference script must be named `inference.py` and placed in the root directory of the project
|
| 11 |
+
- Participants must use OpenAI Client for all LLM calls using above variables
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import re
|
| 16 |
+
import base64
|
| 17 |
+
import textwrap
|
| 18 |
+
from io import BytesIO
|
| 19 |
+
from typing import List, Optional, Dict
|
| 20 |
+
|
| 21 |
+
from openai import OpenAI
|
| 22 |
+
import numpy as np
|
| 23 |
+
from PIL import Image
|
| 24 |
+
|
| 25 |
+
from browsergym_env import BrowserGymAction, BrowserGymEnv
|
| 26 |
+
|
| 27 |
+
# Connection settings come from the environment. The base URL falls back to
# the Hugging Face router when API_BASE_URL is unset or empty (`or`, not `//`:
# floor division between None/str and str raises TypeError at import time).
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
# HF_TOKEN takes precedence; API_KEY is accepted as an alternative name.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME")

# Episode and generation limits.
MAX_STEPS = 8
MAX_DOM_CHARS = 3500
TEMPERATURE = 0.2
MAX_TOKENS = 200
# Action used whenever the model reply is missing or cannot be parsed.
FALLBACK_ACTION = "noop()"

DEBUG = True
# Strips a leading "action:" / "next action -" label from a reply line.
ACTION_PREFIX_RE = re.compile(
    r"^(action|next action)\s*[:\-]\s*",
    re.IGNORECASE,
)
# Matches a call-like action string such as click('12'); DOTALL lets the
# argument list span several lines.
ACTION_PATTERN = re.compile(r"[A-Za-z_]+\s*\(.*\)", re.DOTALL)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
SYSTEM_PROMPT = textwrap.dedent(
|
| 45 |
+
"""
|
| 46 |
+
You control a web browser through BrowserGym.
|
| 47 |
+
Reply with exactly one action string.
|
| 48 |
+
The action must be a valid BrowserGym command such as:
|
| 49 |
+
- noop()
|
| 50 |
+
- click('<BID>')
|
| 51 |
+
- type('selector', 'text to enter')
|
| 52 |
+
- fill('selector', 'text to enter')
|
| 53 |
+
- send_keys('Enter')
|
| 54 |
+
- scroll('down')
|
| 55 |
+
Use single quotes around string arguments.
|
| 56 |
+
When clicking, use the BrowserGym element IDs (BIDs) listed in the user message.
|
| 57 |
+
If you are unsure, respond with noop().
|
| 58 |
+
Do not include explanations or additional text.
|
| 59 |
+
"""
|
| 60 |
+
).strip()
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def build_history_lines(history: List[str]) -> str:
    """Render the most recent history entries for the prompt.

    Returns the last four entries joined by newlines, or the literal
    string "None" when there is no history yet.
    """
    recent = history[-4:] if history else []
    return "\n".join(recent) if recent else "None"
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def extract_screenshot_uri(observation) -> Optional[str]:
    """Encode the observation's screenshot as a PNG data URI.

    Returns None when the observation carries no screenshot; otherwise the
    pixel data is converted to uint8, saved as PNG in memory and returned as
    a ``data:image/png;base64,...`` string.
    """
    pixels = observation.screenshot
    if pixels is None:
        return None

    png_bytes = BytesIO()
    Image.fromarray(np.array(pixels, dtype=np.uint8)).save(png_bytes, format="PNG")
    encoded = base64.b64encode(png_bytes.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded}"
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def extract_clickable_elements(observation) -> List[Dict[str, str]]:
    """Collect BrowserGym element IDs that can be clicked.

    Reads ``observation.metadata["browsergym_obs"]["extra_element_properties"]``
    (any missing or empty level is treated as an empty mapping) and keeps the
    entries flagged ``clickable``.

    Returns:
        A list of ``{"bid": ..., "bbox": ...}`` dicts sorted by ``bid`` as a
        string. ``bbox`` is the coordinates joined with ", ", or "?" when no
        bounding box is present.
    """
    metadata = getattr(observation, "metadata", {}) or {}
    obs_dict = metadata.get("browsergym_obs", {}) or {}
    extra_props = obs_dict.get("extra_element_properties", {}) or {}

    clickables: List[Dict[str, str]] = []
    for bid, props in extra_props.items():
        if not props.get("clickable"):
            continue

        bbox = props.get("bbox") or []
        # str.join only accepts strings; coordinates may be numbers, so each
        # one is converted explicitly instead of raising TypeError.
        bbox_str = ", ".join(str(coord) for coord in bbox) if bbox else "?"
        clickables.append({"bid": str(bid), "bbox": bbox_str})

    # Keep a stable ordering for readability
    clickables.sort(key=lambda item: item["bid"])
    return clickables
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def build_user_prompt(step: int, observation, history: List[str]) -> str:
    """Build the per-step user prompt describing the current browser state.

    Args:
        step: 1-based step counter shown to the model.
        observation: Object exposing ``goal``, ``url`` and
            ``last_action_error``; it is also passed to
            ``extract_clickable_elements``.
        history: Previous step summaries; only the most recent ones are shown
            (see ``build_history_lines``).

    Returns:
        The prompt text with one field per line.
    """
    goal = observation.goal or "(not provided)"
    url = observation.url or "(unknown)"
    error_note = "Yes" if observation.last_action_error else "No"

    clickables = extract_clickable_elements(observation)
    if clickables:
        actions_hint = "\n".join(
            f"  - {item['bid']} (bbox: {item['bbox']})" for item in clickables
        )
    else:
        actions_hint = "  (none detected)"

    # The prompt is assembled from explicit lines rather than running
    # textwrap.dedent over an f-string: dedent sees the text *after*
    # interpolation, so multi-line history or hint values change the common
    # margin and leave the template lines inconsistently indented.
    prompt_lines = [
        f"Step: {step}",
        f"Goal: {goal}",
        f"Current URL: {url}",
        "Previous steps:",
        build_history_lines(history),
        f"Last action error: {error_note}",
        f"Available clickable element IDs: {actions_hint}",
        "Reply with exactly one BrowserGym action string.",
    ]
    return "\n".join(prompt_lines).strip()
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def parse_model_action(response_text: str) -> str:
    """Extract a single action string from the model's reply.

    Each non-empty line is tried in order (after removing a leading
    "action:"-style label); if none contains a call-like action, the whole
    response is searched once more. Whitespace runs inside the match are
    collapsed to single spaces. FALLBACK_ACTION is returned for an empty
    reply or when nothing matches.
    """
    if not response_text:
        return FALLBACK_ACTION

    # Candidates in priority order: cleaned lines first, raw response last.
    candidates = [
        ACTION_PREFIX_RE.sub("", stripped)
        for stripped in (raw_line.strip() for raw_line in response_text.splitlines())
        if stripped
    ]
    candidates.append(response_text)

    for candidate in candidates:
        found = ACTION_PATTERN.search(candidate)
        if found is not None:
            return re.sub(r"\s+", " ", found.group(0).strip())

    return FALLBACK_ACTION
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def main() -> None:
    """Run one BrowserGym episode with an LLM choosing every action.

    Creates the OpenAI client and a Docker-backed environment, then for up to
    MAX_STEPS steps sends the current observation (text prompt plus an
    optional screenshot) to the model, parses one action from the reply and
    applies it. Progress is printed to stdout; the environment is closed even
    when an error occurs.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    env = BrowserGymEnv.from_docker_image(
        image="browsergym-env:latest",
        env_vars={
            "BROWSERGYM_BENCHMARK": "miniwob",
            "BROWSERGYM_TASK_NAME": "click-test",
        },
    )

    history: List[str] = []

    try:
        result = env.reset()
        observation = result.observation
        print(f"Episode goal: {observation.goal}")

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                print("Environment signalled done. Stopping early.")
                break

            # User turn: text prompt, plus the screenshot when one exists.
            content_parts = [
                {"type": "text", "text": build_user_prompt(step, observation, history)}
            ]
            screenshot_uri = extract_screenshot_uri(observation)
            if screenshot_uri:
                content_parts.append(
                    {"type": "image_url", "image_url": {"url": screenshot_uri}}
                )

            request_messages = [
                {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
                {"role": "user", "content": content_parts},
            ]

            try:
                completion = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=request_messages,
                    temperature=TEMPERATURE,
                    max_tokens=MAX_TOKENS,
                    stream=False,
                )
                response_text = completion.choices[0].message.content or ""
            # Any failure of the model call degrades to the fallback action
            # so the episode can continue.
            # pylint: disable=broad-except
            except Exception as exc:  # noqa: BLE001
                print(f"Model request failed ({exc}). Using fallback action.")
                response_text = FALLBACK_ACTION

            action_str = parse_model_action(response_text)
            print(f"Step {step}: model suggested -> {action_str}")

            result = env.step(BrowserGymAction(action_str=action_str))
            observation = result.observation

            reward = result.reward or 0.0
            error_flag = " ERROR" if observation.last_action_error else ""
            history.append(
                f"Step {step}: {action_str} -> reward {reward:+.2f}{error_flag}"
            )
            print(
                f"  Reward: {reward:+.2f} | Done: {result.done} | "
                f"Last action error: {observation.last_action_error}"
            )

            if result.done:
                print("Episode complete.")
                break

        else:
            # for/else: reached only when the loop was never broken out of.
            print(f"Reached max steps ({MAX_STEPS}).")

    finally:
        env.close()
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
if __name__ == "__main__":
|
| 255 |
+
main()
|
sample_val_script.txt
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh — OpenEnv Submission Validator
|
| 4 |
+
#
|
| 5 |
+
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
|
| 6 |
+
#
|
| 7 |
+
# Prerequisites:
|
| 8 |
+
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
+
# - openenv-core: pip install openenv-core
|
| 10 |
+
# - curl (usually pre-installed)
|
| 11 |
+
#
|
| 12 |
+
# Run:
|
| 13 |
+
# curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
|
| 14 |
+
#
|
| 15 |
+
# Or download and run locally:
|
| 16 |
+
# chmod +x validate-submission.sh
|
| 17 |
+
# ./validate-submission.sh <ping_url> [repo_dir]
|
| 18 |
+
#
|
| 19 |
+
# Arguments:
|
| 20 |
+
# ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
|
| 21 |
+
# repo_dir Path to your repo (default: current directory)
|
| 22 |
+
#
|
| 23 |
+
# Examples:
|
| 24 |
+
# ./validate-submission.sh https://my-team.hf.space
|
| 25 |
+
# ./validate-submission.sh https://my-team.hf.space ./my-repo
|
| 26 |
+
#
|
| 27 |
+
|
| 28 |
+
set -uo pipefail
|
| 29 |
+
|
| 30 |
+
DOCKER_BUILD_TIMEOUT=600
|
| 31 |
+
if [ -t 1 ]; then
|
| 32 |
+
RED='\033[0;31m'
|
| 33 |
+
GREEN='\033[0;32m'
|
| 34 |
+
YELLOW='\033[1;33m'
|
| 35 |
+
BOLD='\033[1m'
|
| 36 |
+
NC='\033[0m'
|
| 37 |
+
else
|
| 38 |
+
RED='' GREEN='' YELLOW='' BOLD='' NC=''
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
# Run a command with a time limit in seconds.
# Usage: run_with_timeout <secs> <command> [args...]
# Prefers `timeout`, then `gtimeout`; when neither exists the command is run
# in the background and killed by a watcher subshell after <secs>.
run_with_timeout() {
  local secs="$1"
  shift

  local tool
  for tool in timeout gtimeout; do
    if command -v "$tool" &>/dev/null; then
      "$tool" "$secs" "$@"
      return
    fi
  done

  # Manual fallback: background the command and arm a watcher that kills it.
  "$@" &
  local pid=$!
  ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
  local watcher=$!
  wait "$pid" 2>/dev/null
  local rc=$?
  # The command finished (or was killed): disarm and reap the watcher.
  kill "$watcher" 2>/dev/null
  wait "$watcher" 2>/dev/null
  return $rc
}
|
| 59 |
+
|
| 60 |
+
# Create a temp file named <prefix>-XXXXXX under $TMPDIR (or /tmp) and print
# its path; falls back to a plain `mktemp` if the templated call fails.
portable_mktemp() {
  local prefix="${1:-validate}"
  local template="${TMPDIR:-/tmp}/${prefix}-XXXXXX"
  mktemp "$template" 2>/dev/null || mktemp
}
|
| 64 |
+
|
| 65 |
+
# Temp files registered here are removed when the script exits.
CLEANUP_FILES=()

cleanup() {
  # The ${arr[@]+...} form expands to nothing when the array is empty, which
  # keeps this safe under `set -u`.
  rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"
}

trap cleanup EXIT
|
| 68 |
+
|
| 69 |
+
PING_URL="${1:-}"
|
| 70 |
+
REPO_DIR="${2:-.}"
|
| 71 |
+
|
| 72 |
+
if [ -z "$PING_URL" ]; then
|
| 73 |
+
printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
|
| 74 |
+
printf "\n"
|
| 75 |
+
printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
|
| 76 |
+
printf " repo_dir Path to your repo (default: current directory)\n"
|
| 77 |
+
exit 1
|
| 78 |
+
fi
|
| 79 |
+
|
| 80 |
+
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
|
| 81 |
+
printf "Error: directory '%s' not found\n" "${2:-.}"
|
| 82 |
+
exit 1
|
| 83 |
+
fi
|
| 84 |
+
PING_URL="${PING_URL%/}"
|
| 85 |
+
export PING_URL
|
| 86 |
+
PASS=0
|
| 87 |
+
|
| 88 |
+
# Print a UTC-timestamped line; %b lets callers embed colour escapes.
log() {
  printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"
}

# Report a passed check and count it.
pass() {
  log "${GREEN}PASSED${NC} -- $1"
  PASS=$((PASS + 1))
}

# Report a failed check (does not exit by itself).
fail() {
  log "${RED}FAILED${NC} -- $1"
}

# Print an indented remediation hint.
hint() {
  printf "  ${YELLOW}Hint:${NC} %b\n" "$1"
}

# Abort the run, naming the step at which validation stopped.
stop_at() {
  printf "\n"
  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
  exit 1
}
|
| 97 |
+
|
| 98 |
+
printf "\n"
|
| 99 |
+
printf "${BOLD}========================================${NC}\n"
|
| 100 |
+
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
|
| 101 |
+
printf "${BOLD}========================================${NC}\n"
|
| 102 |
+
log "Repo: $REPO_DIR"
|
| 103 |
+
log "Ping URL: $PING_URL"
|
| 104 |
+
printf "\n"
|
| 105 |
+
|
| 106 |
+
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")

# curl's -w '%{http_code}' already prints "000" when no HTTP response was
# received, so appending another "000" on failure would produce "000000" and
# the "not reachable" branch below would never match. On any curl failure the
# code is therefore overwritten, not appended to. stderr is discarded instead
# of being redirected into the same file that -o writes the body to.
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  # The hint text is passed as a printf *argument*, so a single % is printed
  # literally; "%%" would show up verbatim in the suggested command.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
|
| 127 |
+
|
| 128 |
+
log "${BOLD}Step 2/3: Running docker build${NC} ..."

if ! command -v docker &>/dev/null; then
  fail "docker command not found"
  hint "Install Docker: https://docs.docker.com/get-docker/"
  stop_at "Step 2"
fi

# Prefer a Dockerfile in the repo root, then one under server/.
if [ -f "$REPO_DIR/Dockerfile" ]; then
  DOCKER_CONTEXT="$REPO_DIR"
else
  if [ -f "$REPO_DIR/server/Dockerfile" ]; then
    DOCKER_CONTEXT="$REPO_DIR/server"
  else
    fail "No Dockerfile found in repo root or server/ directory"
    stop_at "Step 2"
  fi
fi

log "  Found Dockerfile in $DOCKER_CONTEXT"

# Capture combined output so the tail can be shown when the build fails.
if BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1); then
  BUILD_OK=true
  pass "Docker build succeeded"
else
  BUILD_OK=false
  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
  printf "%s\n" "$BUILD_OUTPUT" | tail -20
  stop_at "Step 2"
fi
|
| 157 |
+
|
| 158 |
+
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."

if ! command -v openenv &>/dev/null; then
  fail "openenv command not found"
  hint "Install it: pip install openenv-core"
  stop_at "Step 3"
fi

# Run the validator from inside the repo; keep its output for reporting.
if VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1); then
  VALIDATE_OK=true
  pass "openenv validate passed"
  if [ -n "$VALIDATE_OUTPUT" ]; then
    log "  $VALIDATE_OUTPUT"
  fi
else
  VALIDATE_OK=false
  fail "openenv validate failed"
  printf "%s\n" "$VALIDATE_OUTPUT"
  stop_at "Step 3"
fi
|
| 177 |
+
|
| 178 |
+
printf "\n"
|
| 179 |
+
printf "${BOLD}========================================${NC}\n"
|
| 180 |
+
printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
|
| 181 |
+
printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
|
| 182 |
+
printf "${BOLD}========================================${NC}\n"
|
| 183 |
+
printf "\n"
|
| 184 |
+
|
| 185 |
+
exit 0
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""CI/CD debug environment server package."""
|
server/environment.py
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core environment loop for Day 1-2 foundation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import copy
|
| 6 |
+
import random
|
| 7 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
from server.models import (
|
| 10 |
+
Action,
|
| 11 |
+
ActionType,
|
| 12 |
+
ErrorInfo,
|
| 13 |
+
ErrorPhase,
|
| 14 |
+
FileContent,
|
| 15 |
+
FileType,
|
| 16 |
+
Observation,
|
| 17 |
+
TaskDifficulty,
|
| 18 |
+
)
|
| 19 |
+
from server.simulators.docker_simulator import DockerSimulator
|
| 20 |
+
from server.simulators.workflow_simulator import WorkflowSimulator
|
| 21 |
+
from server.tasks.task_registry import TASK_REGISTRY, get_task
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class CICDDebugEnvironment:
|
| 25 |
+
MAX_STEPS = 10
|
| 26 |
+
MAX_HINTS = 3
|
| 27 |
+
|
| 28 |
+
def _resolve_task_id(self, task_id: Optional[object]) -> Optional[str]:
    """Resolve task identifiers from either canonical string IDs or numeric indices.

    Supports:
    - canonical ids (e.g. "dockerfile_syntax"), matched exactly and then
      case-insensitively
    - numeric values (1-based preferred, 0-based tolerated for 0)
    - numeric strings ("1", "0")

    Returns:
        None when ``task_id`` is None; the matching TASK_REGISTRY key when one
        is found; otherwise the input as a (stripped) string, which the
        caller is expected to reject as unknown.
    """
    if task_id is None:
        return None

    keys = list(TASK_REGISTRY.keys())

    def key_for_index(number: int) -> Optional[str]:
        # 1-based positions win; 0 is the only value left for the 0-based
        # fallback because every other in-range index is already 1-based.
        if 1 <= number <= len(keys):
            return keys[number - 1]
        if number == 0 and keys:
            return keys[0]
        return None

    if isinstance(task_id, int):
        resolved = key_for_index(task_id)
        return str(task_id) if resolved is None else resolved

    if isinstance(task_id, str):
        raw = task_id.strip()
        if raw in TASK_REGISTRY:
            return raw
        # isdecimal(), unlike isdigit(), only accepts characters that int()
        # can parse, so inputs such as superscript digits fall through to the
        # name lookup instead of raising inside int().
        if raw.isdecimal():
            resolved = key_for_index(int(raw))
            return raw if resolved is None else resolved
        lowered = raw.lower()
        for key in keys:
            if key.lower() == lowered:
                return key
        return raw

    return str(task_id)
|
| 66 |
+
|
| 67 |
+
def _validation_snapshot(self) -> Dict[str, bool]:
|
| 68 |
+
docker_result = self.docker_sim.validate(self.current_files.get("Dockerfile"), self.current_files)
|
| 69 |
+
workflow_result = self.workflow_sim.validate(self.current_files.get(".github/workflows/build.yml"), self.current_files)
|
| 70 |
+
return {
|
| 71 |
+
"docker_build_valid": bool(docker_result.get("build_success", False)),
|
| 72 |
+
"workflow_parse_valid": bool(workflow_result.get("parse_success", False)),
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
def __init__(self):
    """Create the simulators and set all episode state to empty defaults.

    No task is loaded here; ``reset`` populates the task, files, error and
    counters for an actual episode.
    """
    # Simulators whose validate() results are used for validity checks.
    self.docker_sim = DockerSimulator()
    self.workflow_sim = WorkflowSimulator()

    # Identity of the active task and scenario (set by reset()).
    self.current_task_id: Optional[str] = None
    self.current_scenario_id: Optional[str] = None
    self.current_difficulty: Optional[TaskDifficulty] = None
    self.current_task = None

    # Scenario files keyed by path: the pristine copy and the working copy
    # that edits are applied to, plus the checks that define a "fixed" issue.
    self.original_files: Dict[str, FileContent] = {}
    self.current_files: Dict[str, FileContent] = {}
    self.expected_fixes: List[Dict[str, Any]] = []

    # Error presented to the agent and progress against expected_fixes.
    self.current_error: Optional[ErrorInfo] = None
    self.issues_total = 0
    self.issues_fixed = 0

    # Per-episode counters and termination flag.
    self.step_count = 0
    self.episode_reward = 0.0
    self.done = False
    self.hints_used = 0

    # Step log and the outcome of the most recent action.
    self.trajectory: List[Dict[str, Any]] = []
    self.last_action_success: Optional[bool] = None
    self.last_action_feedback: Optional[str] = None
|
| 100 |
+
|
| 101 |
+
def reset(self, task_id: Optional[str] = None, scenario_id: Optional[str] = None, seed: Optional[int] = None):
    """Start a new episode and return its first observation.

    Args:
        task_id: Task to load; accepts anything ``_resolve_task_id``
            understands. A random registered task is chosen when None.
        scenario_id: Passed to the task's ``load_scenario``.
        seed: When given, seeds the global ``random`` module first.

    Raises:
        ValueError: If the resolved task id is not in TASK_REGISTRY.
    """
    if seed is not None:
        random.seed(seed)

    task_id = self._resolve_task_id(task_id)
    if task_id is None:
        task_id = random.choice(list(TASK_REGISTRY.keys()))
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task: {task_id}")

    self.current_task_id = task_id
    self.current_task = get_task(task_id)
    task = self.current_task
    self.current_difficulty = task.DIFFICULTY

    scenario = task.load_scenario(scenario_id)
    self.current_scenario_id = scenario["id"]

    # Build the pristine file set; the working copy is a deep copy so edits
    # never touch the originals.
    pristine: Dict[str, FileContent] = {}
    for spec in scenario["files"]:
        path = spec["path"]
        pristine[path] = FileContent(
            path=spec["path"],
            content=spec["content"],
            file_type=FileType(spec["type"]),
            line_count=spec["content"].count("\n") + 1,
        )
    self.original_files = pristine
    self.current_files = copy.deepcopy(self.original_files)

    error_spec = scenario["error"]
    self.current_error = ErrorInfo(
        phase=ErrorPhase(error_spec["phase"]),
        error_message=error_spec["message"],
        exit_code=error_spec.get("exit_code"),
        failed_step=error_spec.get("failed_step"),
        line_hint=error_spec.get("line_hint"),
    )

    self.expected_fixes = scenario["expected_fixes"]
    self.issues_total = len(self.expected_fixes)
    self.issues_fixed = 0

    # Clear every per-episode counter and the last-action outcome.
    self.step_count = 0
    self.episode_reward = 0.0
    self.done = False
    self.hints_used = 0
    self.trajectory = []
    self.last_action_success = None
    self.last_action_feedback = None

    return self.get_observation()
|
| 151 |
+
|
| 152 |
+
def step(self, action: Action) -> Tuple[Observation, float, bool, Dict[str, Any]]:
    """Apply one action and return ``(observation, reward, done, info)``.

    Hint requests and submissions go to their dedicated handlers; every other
    action type is treated as an edit. The episode ends when the step limit
    is reached, on submit, or once all expected issues are fixed (checked in
    that order). ``info`` carries ``termination_reason`` when the episode
    ends, plus the issue counters. Calling this on a finished episode returns
    zero reward and an ``error`` entry without changing state.
    """
    if self.done:
        return self.get_observation(), 0.0, True, {"error": "Episode already done"}

    self.step_count += 1

    kind = action.action_type
    submitted = kind == ActionType.SUBMIT
    if kind == ActionType.REQUEST_HINT:
        outcome = self._handle_hint_request()
    elif submitted:
        outcome = self._handle_submit()
    else:
        outcome = self._handle_edit(action)
    reward, feedback = outcome

    self.last_action_feedback = feedback
    self.episode_reward += reward

    # First matching condition wins, so the step limit takes precedence
    # over a submit on the final step.
    if self.step_count >= self.MAX_STEPS:
        reason = "max_steps"
    elif submitted:
        reason = "submitted"
    elif self.issues_fixed == self.issues_total:
        reason = "all_fixed"
    else:
        reason = None

    info: Dict[str, Any] = {}
    if reason is not None:
        self.done = True
        info["termination_reason"] = reason

    self.trajectory.append(
        {
            "step": self.step_count,
            "action": action.model_dump(),
            "reward": reward,
            "done": self.done,
        }
    )
    info["issues_fixed"] = self.issues_fixed
    info["issues_total"] = self.issues_total

    return self.get_observation(), reward, self.done, info
|
| 186 |
+
|
| 187 |
+
def _handle_edit(self, action: Action) -> Tuple[float, str]:
|
| 188 |
+
if not action.edits:
|
| 189 |
+
self.last_action_success = False
|
| 190 |
+
return 0.0, "No edits provided"
|
| 191 |
+
|
| 192 |
+
before_validation = self._validation_snapshot()
|
| 193 |
+
|
| 194 |
+
reward = 0.0
|
| 195 |
+
feedbacks: List[str] = []
|
| 196 |
+
applied_count = 0
|
| 197 |
+
for edit in action.edits:
|
| 198 |
+
if edit.file_path not in self.current_files:
|
| 199 |
+
feedbacks.append(f"File not found: {edit.file_path}")
|
| 200 |
+
continue
|
| 201 |
+
|
| 202 |
+
file_content = self.current_files[edit.file_path]
|
| 203 |
+
lines = file_content.content.split("\n")
|
| 204 |
+
|
| 205 |
+
if action.action_type == ActionType.REPLACE_LINE:
|
| 206 |
+
if edit.line_number and 1 <= edit.line_number <= len(lines):
|
| 207 |
+
lines[edit.line_number - 1] = edit.new_content or ""
|
| 208 |
+
feedbacks.append(f"Replaced line {edit.line_number} in {edit.file_path}")
|
| 209 |
+
applied_count += 1
|
| 210 |
+
else:
|
| 211 |
+
feedbacks.append(f"Invalid line number: {edit.line_number}")
|
| 212 |
+
continue
|
| 213 |
+
elif action.action_type == ActionType.ADD_LINE:
|
| 214 |
+
insert_at = edit.line_number - 1 if edit.line_number else len(lines)
|
| 215 |
+
insert_at = max(0, min(insert_at, len(lines)))
|
| 216 |
+
lines.insert(insert_at, edit.new_content or "")
|
| 217 |
+
feedbacks.append(f"Added line at {insert_at + 1} in {edit.file_path}")
|
| 218 |
+
applied_count += 1
|
| 219 |
+
elif action.action_type == ActionType.DELETE_LINE:
|
| 220 |
+
if edit.line_number and 1 <= edit.line_number <= len(lines):
|
| 221 |
+
del lines[edit.line_number - 1]
|
| 222 |
+
feedbacks.append(f"Deleted line {edit.line_number} in {edit.file_path}")
|
| 223 |
+
applied_count += 1
|
| 224 |
+
else:
|
| 225 |
+
feedbacks.append(f"Invalid line number: {edit.line_number}")
|
| 226 |
+
continue
|
| 227 |
+
elif action.action_type == ActionType.EDIT_FILE:
|
| 228 |
+
if edit.old_content and edit.old_content in file_content.content:
|
| 229 |
+
new_content = file_content.content.replace(edit.old_content, edit.new_content or "", 1)
|
| 230 |
+
lines = new_content.split("\n")
|
| 231 |
+
feedbacks.append(f"Edited content in {edit.file_path}")
|
| 232 |
+
applied_count += 1
|
| 233 |
+
else:
|
| 234 |
+
feedbacks.append(f"Content not found in {edit.file_path}")
|
| 235 |
+
continue
|
| 236 |
+
elif action.action_type == ActionType.ADD_BLOCK:
|
| 237 |
+
insert_at = edit.line_number - 1 if edit.line_number else len(lines)
|
| 238 |
+
insert_at = max(0, min(insert_at, len(lines)))
|
| 239 |
+
block_lines = (edit.new_content or "").split("\n")
|
| 240 |
+
lines[insert_at:insert_at] = block_lines
|
| 241 |
+
feedbacks.append(f"Added block at {insert_at + 1} in {edit.file_path}")
|
| 242 |
+
applied_count += 1
|
| 243 |
+
elif action.action_type == ActionType.DELETE_BLOCK:
|
| 244 |
+
if edit.old_content and edit.old_content in file_content.content:
|
| 245 |
+
new_content = file_content.content.replace(edit.old_content, "", 1)
|
| 246 |
+
lines = new_content.split("\n")
|
| 247 |
+
feedbacks.append(f"Deleted block in {edit.file_path}")
|
| 248 |
+
applied_count += 1
|
| 249 |
+
else:
|
| 250 |
+
feedbacks.append(f"Block content not found in {edit.file_path}")
|
| 251 |
+
continue
|
| 252 |
+
else:
|
| 253 |
+
feedbacks.append(f"Unsupported action type for now: {action.action_type.value}")
|
| 254 |
+
continue
|
| 255 |
+
|
| 256 |
+
new_content = "\n".join(lines)
|
| 257 |
+
self.current_files[edit.file_path] = FileContent(
|
| 258 |
+
path=file_content.path,
|
| 259 |
+
content=new_content,
|
| 260 |
+
file_type=file_content.file_type,
|
| 261 |
+
line_count=len(lines),
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
reward += self._check_fix_progress()
|
| 265 |
+
|
| 266 |
+
after_validation = self._validation_snapshot()
|
| 267 |
+
if not before_validation["docker_build_valid"] and after_validation["docker_build_valid"]:
|
| 268 |
+
reward += 0.1
|
| 269 |
+
feedbacks.append("Docker build validity improved")
|
| 270 |
+
if not before_validation["workflow_parse_valid"] and after_validation["workflow_parse_valid"]:
|
| 271 |
+
reward += 0.1
|
| 272 |
+
feedbacks.append("Workflow parse validity improved")
|
| 273 |
+
|
| 274 |
+
if applied_count == 0:
|
| 275 |
+
self.last_action_success = False
|
| 276 |
+
return max(0.0, reward), "; ".join(feedbacks) or "No edit applied"
|
| 277 |
+
|
| 278 |
+
self.last_action_success = True
|
| 279 |
+
return max(0.0, reward), "; ".join(feedbacks)
|
| 280 |
+
|
| 281 |
+
def _check_fix_progress(self) -> float:
    """Return the incremental reward for expected fixes that newly hold.

    Each entry of ``self.expected_fixes`` whose file is present in
    ``self.current_files`` is checked against that file's current content.
    If more entries hold than the ``self.issues_fixed`` high-water mark, the
    mark is raised and 0.3 is returned per newly satisfied entry. Otherwise
    the mark is left untouched and 0.0 is returned.
    """
    satisfied = 0
    for spec in self.expected_fixes:
        target = spec["file"]
        if target not in self.current_files:
            # Expectations about files absent from the episode never count.
            continue

        text = self.current_files[target].content
        kind = spec["type"]
        if kind == "contains":
            met = spec["expected"] in text
        elif kind == "not_contains":
            met = spec["expected"] not in text
        elif kind == "line_equals":
            rows = text.split("\n")
            wanted = int(spec.get("line", 0))  # 1-based; a missing "line" never matches
            met = (
                1 <= wanted <= len(rows)
                and rows[wanted - 1].strip() == str(spec["expected"]).strip()
            )
        else:
            met = False

        if met:
            satisfied += 1

    gained = satisfied - self.issues_fixed
    if gained <= 0:
        # Regressions and no-ops neither pay out nor lower the recorded mark.
        return 0.0

    self.issues_fixed = satisfied
    return 0.3 * gained
|
| 304 |
+
|
| 305 |
+
def _handle_submit(self) -> Tuple[float, str]:
    """Score the submitted files with both simulators.

    The Docker simulator is given the ``Dockerfile`` entry and the workflow
    simulator the ``.github/workflows/build.yml`` entry of
    ``self.current_files`` (either may be ``None``). Four checks contribute
    0.3 / 0.2 / 0.2 / 0.3 to the reward; ``self.last_action_success`` is set
    to whether the total reaches 0.8.

    Returns:
        ``(reward, feedback)`` where feedback is a ``"; "``-joined PASS/FAIL
        line per check.
    """
    docker_result = self.docker_sim.validate(self.current_files.get("Dockerfile"), self.current_files)
    workflow_result = self.workflow_sim.validate(self.current_files.get(".github/workflows/build.yml"), self.current_files)

    # (label, simulator result, success flag, weight, key holding the failure text)
    checks = (
        ("Docker build", docker_result, "build_success", 0.3, "error"),
        ("Docker run", docker_result, "run_success", 0.2, "run_error"),
        ("Workflow parse", workflow_result, "parse_success", 0.2, "error"),
        ("Workflow execution", workflow_result, "execution_success", 0.3, "exec_error"),
    )

    total = 0.0
    report = []
    for label, outcome, flag, weight, error_key in checks:
        if outcome[flag]:
            total += weight
            report.append(f"{label}: PASS")
        else:
            report.append(f"{label}: FAIL - {outcome.get(error_key, 'unknown')}")

    self.last_action_success = total >= 0.8
    return total, "; ".join(report)
|
| 338 |
+
|
| 339 |
+
def _handle_hint_request(self) -> Tuple[float, str]:
    """Spend one hint and return the hint of the first unfixed expected fix.

    Fixes are examined in ``self.expected_fixes`` order; entries whose file
    is not in ``self.current_files`` are skipped. ``contains``,
    ``not_contains`` and ``line_equals`` fixes are evaluated with the same
    rules as ``_check_fix_progress`` so that a fix already credited there is
    never hinted again. A fix of any other type is treated as unfixed.

    Returns:
        ``(0.0, "No hints remaining")`` when the hint budget is exhausted
        (``last_action_success`` becomes ``False``); ``(-0.05, hint text)``
        for the first unfixed fix; ``(0.0, ...)`` when every known fix
        already holds. The hint counter is incremented in the last two cases.
    """
    if self.hints_used >= self.MAX_HINTS:
        self.last_action_success = False
        return 0.0, "No hints remaining"

    self.hints_used += 1
    for fix in self.expected_fixes:
        file_path = fix["file"]
        if file_path not in self.current_files:
            continue
        current_content = self.current_files[file_path].content

        fix_type = fix["type"]
        is_fixed = False
        if fix_type == "contains":
            is_fixed = fix["expected"] in current_content
        elif fix_type == "not_contains":
            is_fixed = fix["expected"] not in current_content
        elif fix_type == "line_equals":
            # Previously unhandled: a satisfied line_equals fix was always
            # reported as unfixed and its hint was repeated indefinitely.
            lines = current_content.split("\n")
            line_num = int(fix.get("line", 0))
            is_fixed = (
                1 <= line_num <= len(lines)
                and lines[line_num - 1].strip() == str(fix["expected"]).strip()
            )

        if not is_fixed:
            hint = fix.get("hint", f"Check {file_path}")
            self.last_action_success = True
            return -0.05, f"Hint ({self.hints_used}/{self.MAX_HINTS}): {hint}"

    self.last_action_success = True
    return 0.0, "All known issues appear to be fixed"
|
| 362 |
+
|
| 363 |
+
def get_observation(self) -> Observation:
    """Assemble the ``Observation`` describing the current episode state.

    Falls back to neutral placeholders when no task is loaded: task id
    ``"unknown"``, empty description, ``EASY`` difficulty, no secrets, and a
    synthetic "No error" entry in the Docker-build phase.
    """
    task = self.current_task
    reported_error = self.current_error or ErrorInfo(
        phase=ErrorPhase.DOCKER_BUILD,
        error_message="No error",
    )
    fields = {
        "task_id": self.current_task_id or "unknown",
        "task_description": task.DESCRIPTION if task else "",
        "difficulty": self.current_difficulty or TaskDifficulty.EASY,
        "files": list(self.current_files.values()),
        "error": reported_error,
        "available_secrets": task.AVAILABLE_SECRETS if task else [],
        "runner_os": "ubuntu-latest",
        "step_number": self.step_count,
        "max_steps": self.MAX_STEPS,
        "hints_used": self.hints_used,
        "hints_available": self.MAX_HINTS - self.hints_used,
        "last_action_success": self.last_action_success,
        "last_action_feedback": self.last_action_feedback,
        # NOTE(review): issues_found mirrors issues_fixed; no separate
        # "found" counter is visible in this method - confirm that is intended.
        "issues_found": self.issues_fixed,
        "issues_fixed": self.issues_fixed,
        # Reported total is never below 1.
        "total_issues": max(1, self.issues_total),
    }
    return Observation(**fields)
|
server/graders/__init__.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic grader for trajectory scoring."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List
|
| 6 |
+
|
| 7 |
+
from server.models import GraderResult
|
| 8 |
+
from server.tasks.task_registry import TASK_REGISTRY
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
    """Score a recorded trajectory for *task_id* on a 0.0-1.0 scale.

    The score is the clamped sum of:

    * completion      - 0.55 x (issues fixed / issues total), read from the
      ``info`` dict of the final step;
    * action quality  - 0.15 x the share of edit actions that carry at least
      one edit with a ``file_path`` and an integer-or-absent ``line_number``;
    * full solution   - 0.20 when every issue is fixed;
    * efficiency      - 0.10, minus 0.01 per step beyond the issue count;
    * hint penalty    - minus 0.05 per ``request_hint`` action.

    The trajectory is caller-supplied (it arrives through the ``/grader``
    endpoint), so missing or ``None`` ``action`` / ``info`` / counter values
    are treated as absent instead of raising.

    Raises:
        ValueError: if *task_id* is not in ``TASK_REGISTRY``.
    """
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task: {task_id}")

    if not trajectory:
        # Same breakdown keys as the scored path so consumers can rely on them.
        return GraderResult(
            task_id=task_id,
            score=0.0,
            breakdown={
                "completion": 0.0,
                "action_quality": 0.0,
                "complete_solution": 0.0,
                "efficiency": 0.0,
                "hint_penalty": 0.0,
            },
            feedback="No actions taken",
            steps_taken=0,
            hints_used=0,
        )

    edit_action_types = {"edit_file", "replace_line", "add_line", "delete_line", "add_block", "delete_block"}

    steps_taken = len(trajectory)
    actions = [step.get("action") or {} for step in trajectory]
    hints_used = sum(1 for action in actions if action.get("action_type") == "request_hint")

    final_info = trajectory[-1].get("info") or {}
    issues_total = max(1, int(final_info.get("issues_total") or 1))
    # Clamp so an inconsistent report (fixed > total) cannot push the ratio
    # above 1 while simultaneously missing the full-solution bonus.
    issues_fixed = max(0, min(int(final_info.get("issues_fixed") or 0), issues_total))
    fix_ratio = issues_fixed / issues_total

    # Component 1: issue completion (dominant, dynamic by actual fix progress)
    completion_score = 0.55 * fix_ratio

    # Component 2: action quality via well-formed edits
    valid_edit_actions = 0
    total_edit_actions = 0
    for action in actions:
        if action.get("action_type") not in edit_action_types:
            continue
        total_edit_actions += 1
        edits = action.get("edits") or []
        if any(
            isinstance(edit, dict)
            and edit.get("file_path")
            and (edit.get("line_number") is None or isinstance(edit.get("line_number"), int))
            for edit in edits
        ):
            valid_edit_actions += 1

    if total_edit_actions == 0:
        action_quality_score = 0.0
    else:
        action_quality_score = 0.15 * (valid_edit_actions / total_edit_actions)

    # Component 3: full-solution bonus if all issues are fixed
    full_solution_bonus = 0.2 if issues_fixed == issues_total else 0.0

    # Component 4: efficiency bonus (fewer extra steps beyond issue count)
    if steps_taken <= issues_total:
        efficiency_score = 0.10
    else:
        efficiency_score = max(0.0, 0.10 - 0.01 * (steps_taken - issues_total))

    # Penalty: hint usage
    hint_penalty = 0.05 * hints_used

    score = completion_score + action_quality_score + full_solution_bonus + efficiency_score - hint_penalty
    score = max(0.0, min(1.0, score))

    if score >= 0.9:
        feedback = "Excellent! Complete solution with strong efficiency."
    elif score >= 0.7:
        feedback = "Good progress with meaningful fixes."
    elif score >= 0.5:
        feedback = "Partial success. Some issues remain unresolved."
    else:
        feedback = "Limited progress. Focus on fixing core reported failures first."

    return GraderResult(
        task_id=task_id,
        score=round(score, 3),
        breakdown={
            "completion": round(completion_score, 3),
            "action_quality": round(action_quality_score, 3),
            "complete_solution": round(full_solution_bonus, 3),
            "efficiency": round(efficiency_score, 3),
            # Avoid emitting -0.0 when no hints were used.
            "hint_penalty": -round(hint_penalty, 3) if hints_used else 0.0,
        },
        feedback=feedback,
        steps_taken=steps_taken,
        hints_used=hints_used,
    )
|
server/graders/base.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Base grader interface (placeholder for future special graders)."""
|
server/main.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI entrypoint implementing required environment endpoints."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
import uvicorn
|
| 8 |
+
from fastapi import FastAPI, HTTPException
|
| 9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 10 |
+
|
| 11 |
+
from server.environment import CICDDebugEnvironment
|
| 12 |
+
from server.graders import run_grader
|
| 13 |
+
from server.models import (
|
| 14 |
+
Action,
|
| 15 |
+
BaselineRequest,
|
| 16 |
+
BaselineResponse,
|
| 17 |
+
EnvironmentInfo,
|
| 18 |
+
GraderRequest,
|
| 19 |
+
GraderResponse,
|
| 20 |
+
Observation,
|
| 21 |
+
ResetRequest,
|
| 22 |
+
ResetResponse,
|
| 23 |
+
StateResponse,
|
| 24 |
+
StepRequest,
|
| 25 |
+
StepResponse,
|
| 26 |
+
TaskInfo,
|
| 27 |
+
)
|
| 28 |
+
from server.tasks.task_registry import TASK_REGISTRY
|
| 29 |
+
|
| 30 |
+
# ASGI application; the route handlers in this module register themselves on it.
app = FastAPI(
    title="CI/CD Debug Environment",
    description="OpenEnv-style environment for Docker + GitHub Actions debugging",
    version="1.0.0",
)

# Fully permissive CORS: any origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive - confirm this is intended outside local/demo deployments.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single process-wide environment shared by all requests; stays None until
# /reset has been called.
env: Optional[CICDDebugEnvironment] = None
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@app.get("/")
async def root():
    """Health probe: report that the service is up and which environment it hosts."""
    payload = {"status": "healthy", "environment": "cicd-debug-env"}
    return payload
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@app.post("/reset", response_model=ResetResponse)
async def reset(request: Optional[ResetRequest] = None):
    """Start a new episode and return its initial observation.

    A request without a body is treated as an empty ``ResetRequest``. The new
    environment replaces the module-level ``env`` only after its ``reset``
    succeeded, so a rejected request (HTTP 400) leaves the episode that was
    running before untouched instead of swapping in an un-reset environment.

    Raises:
        HTTPException: 400 when the environment rejects the parameters with
            ``ValueError``.
    """
    global env

    request = request or ResetRequest()
    new_env = CICDDebugEnvironment()
    try:
        observation = new_env.reset(
            task_id=request.task_id,
            scenario_id=request.scenario_id,
            seed=request.seed,
        )
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # Publish the environment only once it is in a usable state.
    env = new_env

    return ResetResponse(
        observation=observation,
        info={
            "task_id": new_env.current_task_id,
            "scenario_id": new_env.current_scenario_id,
            "difficulty": new_env.current_difficulty,
        },
    )
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@app.post("/step", response_model=StepResponse)
async def step(request: StepRequest):
    """Apply one action to the active environment and return the transition.

    Raises:
        HTTPException: 400 when no environment exists yet (``/reset`` was
            never called).
    """
    active_env = env
    if active_env is None:
        raise HTTPException(status_code=400, detail="Environment not initialized. Call /reset first.")

    observation, reward, done, info = active_env.step(request.action)
    return StepResponse(
        observation=observation,
        reward=reward,
        done=done,
        info=info,
    )
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
@app.get("/state", response_model=StateResponse)
async def get_state():
    """Return the current observation together with episode-level counters.

    Raises:
        HTTPException: 400 when no environment exists yet (``/reset`` was
            never called).
    """
    active_env = env
    if active_env is None:
        raise HTTPException(status_code=400, detail="Environment not initialized. Call /reset first.")

    snapshot = active_env.get_observation()
    return StateResponse(
        observation=snapshot,
        episode_reward=active_env.episode_reward,
        steps_taken=active_env.step_count,
        done=active_env.done,
    )
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
@app.get("/info", response_model=EnvironmentInfo)
async def get_info():
    """Describe the environment: registered tasks plus action/observation schemas."""
    catalog = []
    for task_id, task_cls in TASK_REGISTRY.items():
        catalog.append(
            TaskInfo(
                id=task_id,
                name=task_cls.NAME,
                description=task_cls.DESCRIPTION,
                difficulty=task_cls.DIFFICULTY,
                num_scenarios=len(task_cls.SCENARIOS),
            )
        )

    # NOTE(review): max_steps is a literal here; the environment reports its
    # own MAX_STEPS in observations - confirm the two stay in sync.
    return EnvironmentInfo(
        tasks=catalog,
        max_steps=10,
        action_space=Action.model_json_schema(),
        observation_space=Observation.model_json_schema(),
    )
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
@app.get("/tasks")
async def get_tasks():
    """List every registered task as a plain dict (difficulty as its string value)."""
    listing = []
    for task_id, task_cls in TASK_REGISTRY.items():
        entry = {
            "id": task_id,
            "name": task_cls.NAME,
            "description": task_cls.DESCRIPTION,
            "difficulty": task_cls.DIFFICULTY.value,
        }
        listing.append(entry)
    return {"tasks": listing}
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
@app.post("/grader", response_model=GraderResponse)
async def grade(request: GraderRequest):
    """Grade a submitted trajectory for the given task.

    Raises:
        HTTPException: 400 when the grader rejects the request with
            ``ValueError`` (it does so for an unknown task id). This matches
            how ``/reset`` reports invalid parameters, instead of surfacing
            the error as an HTTP 500.
    """
    try:
        result = run_grader(task_id=request.task_id, trajectory=request.trajectory)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return GraderResponse(result=result)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@app.post("/baseline", response_model=BaselineResponse)
async def run_baseline(request: Optional[BaselineRequest] = None):
    """Run the baseline agent and return per-episode results plus their mean score.

    A request without a body is treated as a default ``BaselineRequest``.
    The mean is reported as 0.0 when no results come back.
    """
    params = request or BaselineRequest()

    # Imported on demand, so baseline_runner is only loaded when this endpoint is hit.
    from baseline_runner import run_baseline_episodes

    results = run_baseline_episodes(task_id=params.task_id, num_episodes=params.num_episodes)
    if results:
        aggregate = sum(r.score for r in results) / len(results)
    else:
        aggregate = 0.0
    return BaselineResponse(results=results, aggregate_score=aggregate)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
# Script entry point: serve the app on all interfaces, port 7860.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )
|
server/models.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed Pydantic models for API and environment state."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from enum import Enum
|
| 6 |
+
from typing import Any, Dict, List, Optional, Union
|
| 7 |
+
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Difficulty tier of a task; members are also plain strings.
class TaskDifficulty(str, Enum):
    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Kinds of action an agent may send: six edit operations plus the two
# non-edit actions (submit, request_hint).
class ActionType(str, Enum):
    EDIT_FILE = "edit_file"
    ADD_LINE = "add_line"
    DELETE_LINE = "delete_line"
    REPLACE_LINE = "replace_line"
    ADD_BLOCK = "add_block"
    DELETE_BLOCK = "delete_block"
    SUBMIT = "submit"
    REQUEST_HINT = "request_hint"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Category tag attached to each file in an episode.
class FileType(str, Enum):
    DOCKERFILE = "dockerfile"
    WORKFLOW = "workflow"
    DOCKER_COMPOSE = "docker_compose"
    REQUIREMENTS = "requirements"
    OTHER = "other"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Pipeline phase in which a reported error occurred.
class ErrorPhase(str, Enum):
    WORKFLOW_PARSE = "workflow_parse"
    DOCKER_BUILD = "docker_build"
    DOCKER_RUN = "docker_run"
    TEST = "test"
    PUSH = "push"
    DEPLOY = "deploy"
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# One file of the episode: its path, full text, category and line count.
# (Comments are used instead of a docstring so the generated JSON schema is unchanged.)
class FileContent(BaseModel):
    path: str = Field(..., description="File path")
    content: str = Field(..., description="Current file content")
    file_type: FileType = Field(..., description="Type of file")
    line_count: int = Field(..., description="Number of lines")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# Description of the failure shown to the agent; only the phase and the
# message are required, the remaining fields default to None.
class ErrorInfo(BaseModel):
    phase: ErrorPhase = Field(..., description="Failed phase")
    error_message: str = Field(..., description="Error output")
    exit_code: Optional[int] = Field(None, description="Exit code")
    failed_step: Optional[str] = Field(None, description="Failed step")
    line_hint: Optional[int] = Field(None, description="Suggested line")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# Everything the agent sees at a step: task metadata, the current files and
# error, step/hint budgets, feedback on the last action and issue counters.
class Observation(BaseModel):
    # Task identity
    task_id: str
    task_description: str
    difficulty: TaskDifficulty

    # Current working state
    files: List[FileContent]
    error: ErrorInfo
    available_secrets: List[str] = Field(default_factory=list)
    runner_os: str = "ubuntu-latest"

    # Budgets
    step_number: int
    max_steps: int
    hints_used: int = 0
    hints_available: int = 3

    # Outcome of the previous action (None when not set)
    last_action_success: Optional[bool] = None
    last_action_feedback: Optional[str] = None

    # Progress counters
    issues_found: int = 0
    issues_fixed: int = 0
    total_issues: int
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# A single edit against one file; which optional fields are needed depends
# on the action type that carries the edit.
class FileEdit(BaseModel):
    file_path: str
    line_number: Optional[int] = None
    old_content: Optional[str] = None
    new_content: Optional[str] = None
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# An agent action: its type, the optional list of edits, and optional
# free-text reasoning.
class Action(BaseModel):
    action_type: ActionType
    edits: Optional[List[FileEdit]] = None
    reasoning: Optional[str] = None
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# Result of one environment step; reward is validated to lie in [0.0, 1.0].
class StepResult(BaseModel):
    observation: Observation
    reward: float = Field(..., ge=0.0, le=1.0)
    done: bool
    info: Dict[str, Any] = Field(default_factory=dict)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Public summary of one registered task.
class TaskInfo(BaseModel):
    id: str
    name: str
    description: str
    difficulty: TaskDifficulty
    num_scenarios: int
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# Static description of the environment: identity, task catalogue, step
# limit, and the JSON schemas of the action and observation spaces.
class EnvironmentInfo(BaseModel):
    name: str = "cicd-debug-env"
    version: str = "1.0.0"
    description: str = "Debug CI/CD infrastructure issues"
    tasks: List[TaskInfo]
    max_steps: int = 10
    action_space: Dict[str, Any]
    observation_space: Dict[str, Any]
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# Outcome of grading one trajectory; score is validated to lie in [0.0, 1.0]
# and breakdown maps component names to their contribution.
class GraderResult(BaseModel):
    task_id: str
    score: float = Field(..., ge=0.0, le=1.0)
    max_score: float = 1.0
    breakdown: Dict[str, float] = Field(default_factory=dict)
    feedback: str = ""
    steps_taken: int
    hints_used: int = 0
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Body of POST /reset; every field is optional.
class ResetRequest(BaseModel):
    task_id: Optional[Union[str, int]] = None
    scenario_id: Optional[str] = None
    seed: Optional[int] = None
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# Response of POST /reset: the initial observation plus free-form episode info.
class ResetResponse(BaseModel):
    observation: Observation
    info: Dict[str, Any] = Field(default_factory=dict)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
# Body of POST /step: the single action to apply.
class StepRequest(BaseModel):
    action: Action
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# Response of POST /step. Unlike StepResult, reward carries no range constraint.
class StepResponse(BaseModel):
    observation: Observation
    reward: float
    done: bool
    info: Dict[str, Any] = Field(default_factory=dict)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
# Response of GET /state: current observation plus episode-level counters.
class StateResponse(BaseModel):
    observation: Observation
    episode_reward: float
    steps_taken: int
    done: bool
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# Body of POST /grader: the task id and the recorded steps as raw dicts.
class GraderRequest(BaseModel):
    task_id: str
    trajectory: List[Dict[str, Any]]
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# Response of POST /grader: wraps the grading result.
class GraderResponse(BaseModel):
    result: GraderResult
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
# Body of POST /baseline: optional task id and the number of episodes (default 1).
class BaselineRequest(BaseModel):
    task_id: Optional[str] = None
    num_episodes: int = 1
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# Response of POST /baseline: per-episode grading results and their aggregate score.
class BaselineResponse(BaseModel):
    results: List[GraderResult]
    aggregate_score: float
|
server/simulators/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Simulator package."""
|
server/simulators/docker_simulator.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Docker build/run simulator with deterministic rule-based validation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Dict, List, Optional, Set
|
| 6 |
+
|
| 7 |
+
from server.models import FileContent
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DockerSimulator:
    """Deterministic, rule-based stand-in for ``docker build`` / ``docker run``.

    Nothing is executed: ``validate`` inspects the Dockerfile text and the
    in-memory build context and reports the first rule violation it finds.
    """

    # Keywords accepted as the first token of a Dockerfile instruction.
    VALID_INSTRUCTIONS: Set[str] = {
        "FROM",
        "RUN",
        "CMD",
        "LABEL",
        "MAINTAINER",
        "EXPOSE",
        "ENV",
        "ADD",
        "COPY",
        "ENTRYPOINT",
        "VOLUME",
        "USER",
        "WORKDIR",
        "ARG",
        "ONBUILD",
        "STOPSIGNAL",
        "HEALTHCHECK",
        "SHELL",
    }

    def _split_lines(self, content: str) -> List[str]:
        """Split *content* on newlines and drop trailing whitespace per line."""
        return [line.rstrip() for line in content.split("\n")]

    def _non_empty_non_comment_lines(self, lines: List[str]) -> List[str]:
        """Return the stripped lines that are neither blank nor ``#`` comments."""
        return [line.strip() for line in lines if line.strip() and not line.strip().startswith("#")]

    def _instruction_lines(self, lines: List[str]) -> List[tuple]:
        """Return ``(file_line_number, stripped_text)`` for each instruction start.

        Blank and comment lines are ignored. A line that follows one ending
        in a backslash continues the previous instruction and is therefore
        not reported as an instruction of its own.
        """
        starts = []
        continued = False
        for number, raw in enumerate(lines, start=1):
            text = raw.strip()
            if not text or text.startswith("#"):
                continue
            if not continued:
                starts.append((number, text))
            continued = text.endswith("\\")
        return starts

    def _source_exists(self, source: str, context_files: Dict[str, FileContent]) -> bool:
        """Tell whether a COPY source is satisfied by the build context.

        ``.`` / ``./`` always match; a source containing ``*`` is matched as a
        shell-style pattern against every context path; anything else must be
        an exact context path.
        """
        if source in {".", "./"}:
            return True
        if "*" in source:
            # Local import keeps this fix self-contained. Real pattern matching
            # replaces the old "strip the star and compare prefixes" check,
            # which could never match patterns such as "*.json".
            from fnmatch import fnmatchcase

            return any(fnmatchcase(path, source) for path in context_files)
        return source in context_files

    def validate(self, dockerfile: Optional[FileContent], context_files: Dict[str, FileContent]):
        """Simulate building and running *dockerfile* against *context_files*.

        Returns:
            A dict that always has ``build_success`` and ``run_success``.
            Build failures add ``error`` (and ``line``, the 1-based file line,
            for unknown instructions); run failures add ``run_error``.
        """
        if dockerfile is None:
            return {"build_success": False, "run_success": False, "error": "Dockerfile missing"}

        content = dockerfile.content
        instructions = self._instruction_lines(self._split_lines(content))

        if not instructions:
            return {"build_success": False, "run_success": False, "error": "Dockerfile is empty"}

        # ARG is the only instruction Docker allows ahead of the first FROM.
        first_stage = next(
            (text for _, text in instructions if text.split()[0].upper() != "ARG"),
            None,
        )
        if first_stage is None or not first_stage.upper().startswith("FROM "):
            return {
                "build_success": False,
                "run_success": False,
                "error": "Dockerfile must start with FROM",
            }

        for line_number, text in instructions:
            token = text.split()[0].upper()
            # Also catches a stray "&& ..." line whose predecessor lacks the
            # trailing backslash: "&&" is not a valid instruction.
            if token not in self.VALID_INSTRUCTIONS:
                return {
                    "build_success": False,
                    "run_success": False,
                    "error": f"Dockerfile parse error: unknown instruction: {token}",
                    "line": line_number,
                }

        if "FROM python:3.9-slimm" in content:
            return {
                "build_success": False,
                "run_success": False,
                "error": "pull access denied for python:3.9-slimm",
            }

        for _, text in instructions:
            if not text.upper().startswith("COPY "):
                continue
            parts = text.split()
            if len(parts) < 3:
                return {
                    "build_success": False,
                    "run_success": False,
                    "error": "COPY requires source and destination",
                }
            src = parts[1]
            if src.startswith("--"):
                # Flagged COPY (e.g. --from=<stage>, --chown=...): with --from
                # the source lives in another stage or image, not in the build
                # context, so there is nothing to look up here.
                continue
            if not self._source_exists(src, context_files):
                return {
                    "build_success": False,
                    "run_success": False,
                    "error": f"COPY failed: file not found in build context: {src}",
                }

        if "--platform=$BUILDPLATFORM" in content and "ARG BUILDPLATFORM" not in content:
            return {
                "build_success": False,
                "run_success": False,
                "error": "failed to parse platform: BUILDPLATFORM not declared",
            }
        if "--platform=$TARGETPLATFORM" in content and "ARG TARGETPLATFORM" not in content:
            return {
                "build_success": False,
                "run_success": False,
                "error": "failed to parse platform: TARGETPLATFORM not declared",
            }

        if "COPY --from=builder /app/dist" in content:
            pkg = context_files.get("package.json")
            # Scenario rule: a react-scripts build is treated as not producing /app/dist.
            if pkg and "react-scripts build" in pkg.content:
                return {
                    "build_success": False,
                    "run_success": False,
                    "error": "COPY failed: stat app/dist: file does not exist",
                }

        if "requirments.txt" in content:
            return {
                "build_success": False,
                "run_success": False,
                "error": "COPY failed: file not found in build context: requirments.txt",
            }

        if ("npm start" in content or 'CMD ["npm", "start"]' in content) and "WORKDIR /app" not in content:
            return {
                "build_success": True,
                "run_success": False,
                "run_error": "Error: Cannot find module '/package.json'",
            }

        return {"build_success": True, "run_success": True}
|
server/simulators/workflow_simulator.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Workflow simulator with YAML parse and deterministic CI rule checks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Dict, Optional
|
| 6 |
+
|
| 7 |
+
import yaml
|
| 8 |
+
|
| 9 |
+
from server.models import FileContent
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class WorkflowSimulator:
    """Rule-based stand-in for parsing and running a GitHub Actions workflow."""

    def validate(self, workflow: Optional[FileContent], files: Dict[str, FileContent]):
        """Check *workflow* for YAML validity and a few deterministic CI rules.

        Returns:
            A dict with ``parse_success`` and ``execution_success``; parse
            failures add ``error`` and execution failures add ``exec_error``.
            A missing workflow is reported as fully successful.
        """
        if workflow is None:
            # Not all easy tasks include workflow; keep this permissive.
            return {"parse_success": True, "execution_success": True}

        content = workflow.content
        try:
            parsed = yaml.safe_load(content)
        except yaml.YAMLError as exc:
            return {
                "parse_success": False,
                "execution_success": False,
                "error": f"YAML parse error: {exc}",
            }

        if not isinstance(parsed, dict):
            return {
                "parse_success": False,
                "execution_success": False,
                "error": "Workflow root must be a mapping",
            }

        jobs = parsed.get("jobs")
        if not isinstance(jobs, dict) or not jobs:
            return {
                "parse_success": False,
                "execution_success": False,
                "error": "Workflow must define at least one job",
            }

        # Text-level markers evaluated once for the whole document.
        has_buildx_setup = "docker/setup-buildx-action" in content
        has_platforms = "platforms:" in content
        has_docker_login = "docker login" in content
        has_username_secret = "secrets.DOCKER_USERNAME" in content
        has_password_secret = "secrets.DOCKER_PASSWORD" in content

        for job in jobs.values():
            if not isinstance(job, dict):
                continue
            steps = job.get("steps", [])
            if not isinstance(steps, list):
                return {
                    "parse_success": False,
                    "execution_success": False,
                    "error": "Job steps must be a list",
                }

            # Track the FIRST checkout and the FIRST Docker build. Comparing
            # the last occurrences (as before) failed workflows that check out
            # again after building, and passed workflows whose first build
            # runs before any checkout.
            checkout_index = -1
            build_index = -1
            for idx, step in enumerate(steps):
                if not isinstance(step, dict):
                    continue
                uses = step.get("uses", "")
                run = step.get("run", "")
                if checkout_index == -1 and isinstance(uses, str) and "actions/checkout" in uses:
                    checkout_index = idx
                is_build = (isinstance(run, str) and "docker build" in run) or (
                    isinstance(uses, str) and "docker/build-push-action" in uses
                )
                if build_index == -1 and is_build:
                    build_index = idx

            if build_index != -1 and (checkout_index == -1 or checkout_index > build_index):
                return {
                    "parse_success": True,
                    "execution_success": False,
                    "exec_error": "Checkout must happen before Docker build steps",
                }

            if has_docker_login and (not has_username_secret or not has_password_secret):
                return {
                    "parse_success": True,
                    "execution_success": False,
                    "exec_error": "Missing secrets environment variables",
                }

            if has_platforms and not has_buildx_setup:
                return {
                    "parse_success": True,
                    "execution_success": False,
                    "exec_error": "Multi-platform build requires docker/setup-buildx-action",
                }

        return {"parse_success": True, "execution_success": True}
|
server/tasks/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Task package."""
|
server/tasks/base.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Base task class for scenario-based tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Dict, Optional
|
| 6 |
+
|
| 7 |
+
from server.models import TaskDifficulty
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BaseTask:
    """Common interface for scenario-based tasks.

    Concrete tasks override the class-level metadata below and implement
    ``load_scenario``.
    """

    # Human-readable task title.
    NAME = "Base Task"
    # Short description of what the task asks for.
    DESCRIPTION = "Base task"
    # Difficulty tier, a ``TaskDifficulty`` member.
    DIFFICULTY = TaskDifficulty.EASY
    # Names of the secrets the task exposes; none for the base class.
    AVAILABLE_SECRETS = []
    # Scenario dictionaries offered by the task; none for the base class.
    SCENARIOS = []

    def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
        """Return a scenario dictionary; the base class provides no implementation.

        Args:
            scenario_id: Optional identifier of the scenario to load.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
|
server/tasks/task_1_build_errors.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
|
| 6 |
+
from server.models import TaskDifficulty
|
| 7 |
+
from server.tasks.base import BaseTask
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DockerfileSyntaxTask(BaseTask):
    """Easy task: repair syntax and instruction mistakes in a Dockerfile."""

    NAME = "Dockerfile Syntax Errors"
    DESCRIPTION = "Fix syntax and instruction errors in Dockerfiles"
    DIFFICULTY = TaskDifficulty.EASY
    AVAILABLE_SECRETS = []
    SCENARIOS = [
        {
            "id": "typo_filename",
            "files": [
                {
                    "path": "Dockerfile",
                    "type": "dockerfile",
                    # The COPY on line 3 deliberately misspells the requirements file.
                    "content": (
                        "FROM python:3.9-slim\n"
                        "WORKDIR /app\n"
                        "COPY requirments.txt .\n"
                        "RUN pip install -r requirements.txt"
                    ),
                },
                {
                    "path": "requirements.txt",
                    "type": "requirements",
                    "content": "requests==2.31.0",
                },
            ],
            "error": {
                "phase": "docker_build",
                "message": "COPY failed: file not found in build context: requirments.txt",
                "line_hint": 3,
            },
            "expected_fixes": [
                {
                    "file": "Dockerfile",
                    "type": "contains",
                    "expected": "COPY requirements.txt",
                    "hint": "Check spelling of requirements filename",
                }
            ],
        }
    ]

    def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
        """Return the requested scenario, or a random one when no id is given.

        Args:
            scenario_id: Value of the scenario's ``"id"`` key; a falsy value
                selects a scenario with ``random.choice``.

        Raises:
            ValueError: If no scenario carries ``scenario_id``.
        """
        if not scenario_id:
            return random.choice(self.SCENARIOS)

        found = next((item for item in self.SCENARIOS if item["id"] == scenario_id), None)
        if found is None:
            raise ValueError(f"Unknown scenario: {scenario_id}")
        return found
|
server/tasks/task_2_docker_runtime.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
|
| 6 |
+
from server.models import TaskDifficulty
|
| 7 |
+
from server.tasks.base import BaseTask
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DockerfileRuntimeTask(BaseTask):
    """Medium task: repair Dockerfiles that build but fail when the container runs."""

    NAME = "Dockerfile Runtime Errors"
    DESCRIPTION = "Fix runtime/container execution issues in Dockerfiles"
    DIFFICULTY = TaskDifficulty.MEDIUM
    AVAILABLE_SECRETS = []
    SCENARIOS = [
        {
            "id": "missing_workdir",
            "files": [
                {
                    "path": "Dockerfile",
                    "type": "dockerfile",
                    # No WORKDIR instruction appears before the COPY/RUN steps.
                    "content": (
                        "FROM node:18-alpine\n"
                        "COPY package*.json ./\n"
                        "RUN npm ci\n"
                        "COPY . .\n"
                        'CMD ["npm", "start"]'
                    ),
                },
                {
                    "path": "package.json",
                    "type": "other",
                    "content": '{"name": "app", "scripts": {"start": "node index.js"}}',
                },
            ],
            "error": {
                "phase": "docker_run",
                "message": "Error: Cannot find module '/package.json'",
            },
            "expected_fixes": [
                {
                    "file": "Dockerfile",
                    "type": "contains",
                    "expected": "WORKDIR /app",
                    "hint": "Set a working directory before COPY/RUN",
                }
            ],
        }
    ]

    def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
        """Return the requested scenario, or a random one when no id is given.

        Args:
            scenario_id: Value of the scenario's ``"id"`` key; a falsy value
                selects a scenario with ``random.choice``.

        Raises:
            ValueError: If no scenario carries ``scenario_id``.
        """
        if not scenario_id:
            return random.choice(self.SCENARIOS)

        found = next((item for item in self.SCENARIOS if item["id"] == scenario_id), None)
        if found is None:
            raise ValueError(f"Unknown scenario: {scenario_id}")
        return found
|
server/tasks/task_2_workflow_config.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
|
| 6 |
+
from server.models import TaskDifficulty
|
| 7 |
+
from server.tasks.base import BaseTask
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class WorkflowConfigTask(BaseTask):
    """Medium task: wire Docker credentials into a workflow login step.

    NOTE(review): the name, description and scenario id are the same as in
    ``WorkflowSecretsPermissionsTask`` -- confirm whether both are needed.
    """

    NAME = "Workflow Secrets and Permissions"
    DESCRIPTION = "Fix secret wiring, env usage, and permissions in workflows"
    DIFFICULTY = TaskDifficulty.MEDIUM
    AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN"]
    SCENARIOS = [
        {
            "id": "missing_env_secrets",
            "files": [
                {
                    "path": ".github/workflows/build.yml",
                    "type": "workflow",
                    # Written one YAML line per literal so the nesting is explicit:
                    # ``runs-on`` and ``steps`` must sit under the ``build`` job.
                    # The login step reads $DOCKER_* without an ``env`` mapping.
                    "content": (
                        "name: Build\n"
                        "on: push\n"
                        "jobs:\n"
                        "  build:\n"
                        "    runs-on: ubuntu-latest\n"
                        "    steps:\n"
                        "      - uses: actions/checkout@v4\n"
                        "      - name: Login\n"
                        "        run: echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin"
                    ),
                }
            ],
            "error": {
                "phase": "workflow_parse",
                "message": "Cannot perform an interactive login from a non TTY device",
            },
            "expected_fixes": [
                {
                    "file": ".github/workflows/build.yml",
                    "type": "contains",
                    "expected": "DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}",
                    "hint": "Pass secrets through env",
                },
                {
                    "file": ".github/workflows/build.yml",
                    "type": "contains",
                    "expected": "DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}",
                    "hint": "Map password secret to environment",
                },
            ],
        }
    ]

    def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
        """Return the requested scenario, or a random one when no id is given.

        Args:
            scenario_id: Value of the scenario's ``"id"`` key; a falsy value
                selects a scenario with ``random.choice``.

        Raises:
            ValueError: If no scenario carries ``scenario_id``.
        """
        if not scenario_id:
            return random.choice(self.SCENARIOS)

        found = next((item for item in self.SCENARIOS if item["id"] == scenario_id), None)
        if found is None:
            raise ValueError(f"Unknown scenario: {scenario_id}")
        return found
|
server/tasks/task_3_multi_stage.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
|
| 6 |
+
from server.models import TaskDifficulty
|
| 7 |
+
from server.tasks.base import BaseTask
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MultiStagePipelineTask(BaseTask):
    """Hard task: debug a multi-stage Docker build that copies the wrong artifact path."""

    NAME = "Multi-Stage Pipeline and Matrix"
    DESCRIPTION = "Debug complex multi-stage and matrix CI/CD pipelines"
    DIFFICULTY = TaskDifficulty.HARD
    AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN", "NPM_TOKEN"]
    SCENARIOS = [
        {
            "id": "artifact_path_mismatch",
            "files": [
                {
                    "path": "Dockerfile",
                    "type": "dockerfile",
                    # The final stage copies /app/dist from the builder stage.
                    "content": (
                        "FROM node:18 AS builder\n"
                        "WORKDIR /app\n"
                        "COPY . .\n"
                        "RUN npm run build\n"
                        "FROM nginx:alpine\n"
                        "COPY --from=builder /app/dist /usr/share/nginx/html"
                    ),
                },
                {
                    "path": "package.json",
                    "type": "other",
                    "content": '{"scripts": {"build": "react-scripts build"}}',
                },
            ],
            "error": {
                "phase": "docker_build",
                "message": "COPY failed: stat app/dist: file does not exist",
            },
            "expected_fixes": [
                {
                    "file": "Dockerfile",
                    "type": "contains",
                    "expected": "COPY --from=builder /app/build",
                    "hint": "React output path is build, not dist",
                }
            ],
        }
    ]

    def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
        """Return the requested scenario, or a random one when no id is given.

        Args:
            scenario_id: Value of the scenario's ``"id"`` key; a falsy value
                selects a scenario with ``random.choice``.

        Raises:
            ValueError: If no scenario carries ``scenario_id``.
        """
        if not scenario_id:
            return random.choice(self.SCENARIOS)

        found = next((item for item in self.SCENARIOS if item["id"] == scenario_id), None)
        if found is None:
            raise ValueError(f"Unknown scenario: {scenario_id}")
        return found
|
server/tasks/task_3_workflow_syntax.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
|
| 6 |
+
from server.models import TaskDifficulty
|
| 7 |
+
from server.tasks.base import BaseTask
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class WorkflowSyntaxStructureTask(BaseTask):
    """Easy task: repair step ordering and structure in a GitHub Actions workflow."""

    NAME = "Workflow Syntax and Structure"
    DESCRIPTION = "Fix GitHub Actions YAML syntax and job structure issues"
    DIFFICULTY = TaskDifficulty.EASY
    AVAILABLE_SECRETS = ["GITHUB_TOKEN"]
    SCENARIOS = [
        {
            "id": "checkout_after_build",
            "files": [
                {
                    "path": ".github/workflows/build.yml",
                    "type": "workflow",
                    # Written one YAML line per literal so the nesting is explicit:
                    # ``runs-on`` and ``steps`` must sit under the ``build`` job.
                    # The injected fault is the step order (build before checkout).
                    "content": (
                        "name: Build\n"
                        "on: push\n"
                        "jobs:\n"
                        "  build:\n"
                        "    runs-on: ubuntu-latest\n"
                        "    steps:\n"
                        "      - name: Build Docker image\n"
                        "        run: docker build -t myapp .\n"
                        "      - uses: actions/checkout@v4"
                    ),
                },
                {
                    "path": "Dockerfile",
                    "type": "dockerfile",
                    "content": "FROM python:3.11-slim\nWORKDIR /app\nCOPY . .",
                },
            ],
            "error": {
                "phase": "workflow_parse",
                "message": "Build step runs before source checkout",
            },
            "expected_fixes": [
                {
                    "file": ".github/workflows/build.yml",
                    "type": "contains",
                    # NOTE(review): this substring is already present in the broken
                    # workflow above, so a plain substring check cannot tell the
                    # fixed file from the broken one -- confirm how the grader
                    # evaluates "contains" and whether an ordering check is needed.
                    "expected": "- uses: actions/checkout@v4",
                    "hint": "Checkout should happen before build commands",
                }
            ],
        }
    ]

    def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
        """Return the requested scenario, or a random one when no id is given.

        Args:
            scenario_id: Value of the scenario's ``"id"`` key; a falsy value
                selects a scenario with ``random.choice``.

        Raises:
            ValueError: If no scenario carries ``scenario_id``.
        """
        if not scenario_id:
            return random.choice(self.SCENARIOS)

        found = next((item for item in self.SCENARIOS if item["id"] == scenario_id), None)
        if found is None:
            raise ValueError(f"Unknown scenario: {scenario_id}")
        return found
|
server/tasks/task_4_workflow_secrets_permissions.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
|
| 6 |
+
from server.models import TaskDifficulty
|
| 7 |
+
from server.tasks.base import BaseTask
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class WorkflowSecretsPermissionsTask(BaseTask):
    """Medium task: wire Docker credentials into a workflow login step."""

    NAME = "Workflow Secrets and Permissions"
    DESCRIPTION = "Fix secret wiring, env usage, and permissions in workflows"
    DIFFICULTY = TaskDifficulty.MEDIUM
    AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN"]
    SCENARIOS = [
        {
            "id": "missing_env_secrets",
            "files": [
                {
                    "path": ".github/workflows/build.yml",
                    "type": "workflow",
                    # Written one YAML line per literal so the nesting is explicit:
                    # ``runs-on`` and ``steps`` must sit under the ``build`` job.
                    # The login step reads $DOCKER_* without an ``env`` mapping.
                    "content": (
                        "name: Build and Push\n"
                        "on: push\n"
                        "jobs:\n"
                        "  build:\n"
                        "    runs-on: ubuntu-latest\n"
                        "    steps:\n"
                        "      - uses: actions/checkout@v4\n"
                        "      - name: Login\n"
                        "        run: echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin"
                    ),
                }
            ],
            "error": {
                "phase": "workflow_parse",
                "message": "Cannot perform an interactive login from a non TTY device",
            },
            "expected_fixes": [
                {
                    "file": ".github/workflows/build.yml",
                    "type": "contains",
                    "expected": "DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}",
                    "hint": "Pass secrets through env",
                },
                {
                    "file": ".github/workflows/build.yml",
                    "type": "contains",
                    "expected": "DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}",
                    "hint": "Map password secret to environment",
                },
            ],
        }
    ]

    def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
        """Return the requested scenario, or a random one when no id is given.

        Args:
            scenario_id: Value of the scenario's ``"id"`` key; a falsy value
                selects a scenario with ``random.choice``.

        Raises:
            ValueError: If no scenario carries ``scenario_id``.
        """
        if not scenario_id:
            return random.choice(self.SCENARIOS)

        found = next((item for item in self.SCENARIOS if item["id"] == scenario_id), None)
        if found is None:
            raise ValueError(f"Unknown scenario: {scenario_id}")
        return found
|
server/tasks/task_5_ci_docker_integration.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
|
| 6 |
+
from server.models import TaskDifficulty
|
| 7 |
+
from server.tasks.base import BaseTask
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class CIDockerIntegrationTask(BaseTask):
    """Medium task: debug failures where a workflow and a Docker build interact."""

    NAME = "CI and Docker Build Integration"
    DESCRIPTION = "Debug combined workflow and Docker build integration failures"
    DIFFICULTY = TaskDifficulty.MEDIUM
    AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN"]
    SCENARIOS = [
        {
            "id": "missing_buildx_for_platforms",
            "files": [
                {
                    "path": ".github/workflows/build.yml",
                    "type": "workflow",
                    # Written one YAML line per literal so the nesting is explicit:
                    # the job keys sit under ``build`` and the ``with`` inputs under
                    # the build step. The workflow requests two platforms but has
                    # no docker/setup-buildx-action step.
                    "content": (
                        "name: Build\n"
                        "on: push\n"
                        "jobs:\n"
                        "  build:\n"
                        "    runs-on: ubuntu-latest\n"
                        "    steps:\n"
                        "      - uses: actions/checkout@v4\n"
                        "      - name: Build\n"
                        "        uses: docker/build-push-action@v5\n"
                        "        with:\n"
                        "          context: .\n"
                        "          platforms: linux/amd64,linux/arm64\n"
                        "          push: false"
                    ),
                },
                {
                    "path": "Dockerfile",
                    "type": "dockerfile",
                    "content": "FROM python:3.11-slim\nWORKDIR /app\nCOPY . .",
                },
            ],
            "error": {
                "phase": "docker_build",
                "message": "Multi-platform build is not supported for default docker driver",
            },
            "expected_fixes": [
                {
                    "file": ".github/workflows/build.yml",
                    "type": "contains",
                    "expected": "docker/setup-buildx-action",
                    "hint": "Set up Buildx before multi-platform build",
                }
            ],
        }
    ]

    def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
        """Return the requested scenario, or a random one when no id is given.

        Args:
            scenario_id: Value of the scenario's ``"id"`` key; a falsy value
                selects a scenario with ``random.choice``.

        Raises:
            ValueError: If no scenario carries ``scenario_id``.
        """
        if not scenario_id:
            return random.choice(self.SCENARIOS)

        found = next((item for item in self.SCENARIOS if item["id"] == scenario_id), None)
        if found is None:
            raise ValueError(f"Unknown scenario: {scenario_id}")
        return found
|
server/tasks/task_6_multi_stage_matrix.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
|
| 6 |
+
from server.models import TaskDifficulty
|
| 7 |
+
from server.tasks.base import BaseTask
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MultiStageMatrixTask(BaseTask):
    """Hard task: debug a multi-stage Docker build that copies the wrong artifact path."""

    NAME = "Multi-Stage Pipeline and Matrix"
    DESCRIPTION = "Debug complex multi-stage and matrix CI/CD pipelines"
    DIFFICULTY = TaskDifficulty.HARD
    AVAILABLE_SECRETS = ["DOCKER_USERNAME", "DOCKER_PASSWORD", "GITHUB_TOKEN", "NPM_TOKEN"]
    SCENARIOS = [
        {
            "id": "artifact_path_mismatch",
            "files": [
                {
                    "path": "Dockerfile",
                    "type": "dockerfile",
                    # The final stage copies /app/dist from the builder stage.
                    "content": (
                        "FROM node:18 AS builder\n"
                        "WORKDIR /app\n"
                        "COPY . .\n"
                        "RUN npm run build\n"
                        "FROM nginx:alpine\n"
                        "COPY --from=builder /app/dist /usr/share/nginx/html"
                    ),
                },
                {
                    "path": "package.json",
                    "type": "other",
                    "content": '{"scripts": {"build": "react-scripts build"}}',
                },
            ],
            "error": {
                "phase": "docker_build",
                "message": "COPY failed: stat app/dist: file does not exist",
            },
            "expected_fixes": [
                {
                    "file": "Dockerfile",
                    "type": "contains",
                    "expected": "COPY --from=builder /app/build",
                    "hint": "React output path is build, not dist",
                }
            ],
        }
    ]

    def load_scenario(self, scenario_id: Optional[str] = None) -> Dict:
        """Return the requested scenario, or a random one when no id is given.

        Args:
            scenario_id: Value of the scenario's ``"id"`` key; a falsy value
                selects a scenario with ``random.choice``.

        Raises:
            ValueError: If no scenario carries ``scenario_id``.
        """
        if not scenario_id:
            return random.choice(self.SCENARIOS)

        found = next((item for item in self.SCENARIOS if item["id"] == scenario_id), None)
        if found is None:
            raise ValueError(f"Unknown scenario: {scenario_id}")
        return found
|
server/tasks/task_registry.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task registry for the environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Dict, Type
|
| 6 |
+
|
| 7 |
+
from server.tasks.base import BaseTask
|
| 8 |
+
from server.tasks.task_1_build_errors import DockerfileSyntaxTask
|
| 9 |
+
from server.tasks.task_2_docker_runtime import DockerfileRuntimeTask
|
| 10 |
+
from server.tasks.task_3_workflow_syntax import WorkflowSyntaxStructureTask
|
| 11 |
+
from server.tasks.task_4_workflow_secrets_permissions import WorkflowSecretsPermissionsTask
|
| 12 |
+
from server.tasks.task_5_ci_docker_integration import CIDockerIntegrationTask
|
| 13 |
+
from server.tasks.task_6_multi_stage_matrix import MultiStageMatrixTask
|
| 14 |
+
|
| 15 |
+
# Maps each task identifier to its task class. ``get_task`` looks an identifier
# up here and instantiates the class it finds.
TASK_REGISTRY: Dict[str, Type[BaseTask]] = {
    "dockerfile_syntax": DockerfileSyntaxTask,
    "dockerfile_runtime": DockerfileRuntimeTask,
    "workflow_syntax_structure": WorkflowSyntaxStructureTask,
    "workflow_secrets_permissions": WorkflowSecretsPermissionsTask,
    "ci_docker_integration": CIDockerIntegrationTask,
    "multi_stage_pipeline_matrix": MultiStageMatrixTask,
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def get_task(task_id: str) -> BaseTask:
    """Create a fresh instance of the task registered under ``task_id``.

    Args:
        task_id: A key of ``TASK_REGISTRY``.

    Returns:
        A new instance of the registered task class.

    Raises:
        ValueError: If ``task_id`` is not registered.
    """
    task_cls = TASK_REGISTRY.get(task_id)
    if task_cls is None:
        raise ValueError(f"Unknown task: {task_id}")
    return task_cls()
|
server/utils/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Utility package placeholder."""
|
smoke_test.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Comprehensive smoke test for the CI/CD Debug FastAPI server.
|
| 2 |
+
|
| 3 |
+
Usage:
|
| 4 |
+
.\\.venv\\Scripts\\python.exe smoke_test.py
|
| 5 |
+
.\\.venv\\Scripts\\python.exe smoke_test.py --mode live --base-url http://127.0.0.1:7860
|
| 6 |
+
|
| 7 |
+
Modes:
|
| 8 |
+
- inprocess (default): uses FastAPI TestClient, no running server needed.
|
| 9 |
+
- live: uses requests against a running server.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import json
|
| 16 |
+
import sys
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
from typing import Any, Dict, Optional, Tuple
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class TestResult:
    """Outcome of one smoke-test check."""

    # Label printed next to PASS/FAIL in the report.
    name: str
    # True when the check passed.
    ok: bool
    # Diagnostic text; the report prints it only for failed checks.
    details: str = ""
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class EndpointClient:
    """Minimal HTTP client interface used by the smoke test.

    Both methods return a ``(status_code, decoded_body)`` pair; this base
    class only declares them.
    """

    def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
        """Issue a GET for ``path``; not implemented in the base class."""
        raise NotImplementedError

    def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
        """Issue a POST for ``path`` with an optional JSON ``body``; not implemented here."""
        raise NotImplementedError
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class InProcessClient(EndpointClient):
    """Endpoint client that calls the FastAPI app in-process through ``TestClient``."""

    def __init__(self):
        # Imported here rather than at module level; presumably so that "live"
        # mode does not need FastAPI or the server package -- confirm.
        from fastapi.testclient import TestClient
        from server.main import app

        self._client = TestClient(app)

    @staticmethod
    def _payload(response) -> Dict[str, Any]:
        """Decode a response body into a dictionary without raising.

        An empty body yields ``{}``. A body that is not valid JSON yields
        ``{"raw": <text>}`` and JSON that is not an object yields
        ``{"data": <value>}``, so callers can always use ``.get`` and a failed
        check still has something to report.
        """
        if not response.content:
            return {}
        try:
            decoded = response.json()
        except ValueError:
            # JSON decode errors are ValueError subclasses.
            return {"raw": response.text}
        return decoded if isinstance(decoded, dict) else {"data": decoded}

    def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
        """GET ``path`` and return ``(status_code, decoded_body)``."""
        response = self._client.get(path)
        return response.status_code, self._payload(response)

    def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
        """POST ``body`` (``{}`` when omitted) as JSON to ``path``.

        Returns:
            ``(status_code, decoded_body)``.
        """
        response = self._client.post(path, json=body or {})
        return response.status_code, self._payload(response)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class LiveClient(EndpointClient):
    """Endpoint client that talks to a running server with ``requests``."""

    def __init__(self, base_url: str):
        # Imported here rather than at module level; presumably so that
        # "inprocess" mode does not need ``requests`` -- confirm.
        import requests

        self._requests = requests
        # Drop trailing slashes so that base URL + path never doubles the "/".
        self._base_url = base_url.rstrip("/")

    @staticmethod
    def _payload(response) -> Dict[str, Any]:
        """Decode a response body into a dictionary without raising.

        An empty body yields ``{}``. A body that is not valid JSON yields
        ``{"raw": <text>}`` and JSON that is not an object yields
        ``{"data": <value>}``, so callers can always use ``.get`` and a failed
        check still has something to report.
        """
        if not response.content:
            return {}
        try:
            decoded = response.json()
        except ValueError:
            # JSON decode errors are ValueError subclasses.
            return {"raw": response.text}
        return decoded if isinstance(decoded, dict) else {"data": decoded}

    def get(self, path: str) -> Tuple[int, Dict[str, Any]]:
        """GET ``path`` (20 s timeout) and return ``(status_code, decoded_body)``."""
        response = self._requests.get(f"{self._base_url}{path}", timeout=20)
        return response.status_code, self._payload(response)

    def post(self, path: str, body: Optional[Dict[str, Any]] = None) -> Tuple[int, Dict[str, Any]]:
        """POST ``body`` (``{}`` when omitted) as JSON to ``path`` with a 20 s timeout.

        Returns:
            ``(status_code, decoded_body)``.
        """
        response = self._requests.post(f"{self._base_url}{path}", json=body or {}, timeout=20)
        return response.status_code, self._payload(response)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def assert_true(name: str, cond: bool, details: str = "") -> TestResult:
    """Record a check outcome as a ``TestResult`` instead of raising.

    Args:
        name: Label of the check.
        cond: Condition to evaluate; coerced to ``bool``.
        details: Diagnostic text, kept only when the check fails.
    """
    passed = bool(cond)
    kept_details = "" if passed else details
    return TestResult(name=name, ok=passed, details=kept_details)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def run_smoke(client: EndpointClient) -> int:
    """Run every smoke check against ``client`` and print a report.

    The calls are issued in a fixed order: the ``/step``, ``/state`` and
    submit checks follow the ``/reset`` that selects the ``typo_filename``
    scenario.

    Args:
        client: Client used to reach the server endpoints.

    Returns:
        0 when every check passed, 1 otherwise.
    """
    results = []

    # Health and metadata endpoints.
    status, data = client.get("/")
    results.append(assert_true("GET / health", status == 200 and data.get("status") == "healthy", str(data)))

    status, info = client.get("/info")
    results.append(assert_true("GET /info", status == 200 and isinstance(info.get("tasks"), list), str(info)))

    status, tasks_payload = client.get("/tasks")
    tasks = tasks_payload.get("tasks", []) if isinstance(tasks_payload, dict) else []
    results.append(assert_true("GET /tasks", status == 200 and len(tasks) >= 6, str(tasks_payload)))

    # Reset without a task id.
    status, reset_data = client.post("/reset", {"seed": 123})
    obs = reset_data.get("observation", {})
    results.append(
        assert_true(
            "POST /reset random",
            status == 200 and isinstance(obs.get("task_id"), str) and isinstance(obs.get("files"), list),
            str(reset_data),
        )
    )

    # The same index given as an int and as a string must select the same task.
    status_int, reset_int = client.post("/reset", {"task_id": 1, "seed": 1})
    status_str, reset_str = client.post("/reset", {"task_id": "1", "seed": 1})
    int_task = reset_int.get("observation", {}).get("task_id")
    str_task = reset_str.get("observation", {}).get("task_id")
    results.append(
        assert_true(
            "POST /reset accepts int/string index",
            status_int == 200 and status_str == 200 and int_task == str_task,
            f"int={status_int}:{int_task}, str={status_str}:{str_task}",
        )
    )

    # Two resets with the same seed must yield the same task and error message.
    status_a, reset_a = client.post("/reset", {"seed": 999})
    status_b, reset_b = client.post("/reset", {"seed": 999})
    a_obs = reset_a.get("observation", {})
    b_obs = reset_b.get("observation", {})
    results.append(
        assert_true(
            "Deterministic reset with seed",
            status_a == 200
            and status_b == 200
            and a_obs.get("task_id") == b_obs.get("task_id")
            and a_obs.get("error", {}).get("error_message") == b_obs.get("error", {}).get("error_message"),
            f"A={a_obs.get('task_id')} B={b_obs.get('task_id')}",
        )
    )

    # Select a known scenario; the step checks below act on it.
    status, _ = client.post("/reset", {"task_id": "dockerfile_syntax", "scenario_id": "typo_filename", "seed": 7})
    results.append(assert_true("POST /reset specific scenario", status == 200))

    status, step_hint = client.post(
        "/step",
        {"action": {"action_type": "request_hint", "reasoning": "Need help"}},
    )
    results.append(
        assert_true(
            "POST /step request_hint",
            status == 200 and "observation" in step_hint and "reward" in step_hint,
            str(step_hint),
        )
    )

    # Replace Dockerfile line 3 with the correctly spelled COPY instruction.
    status, step_fix = client.post(
        "/step",
        {
            "action": {
                "action_type": "replace_line",
                "edits": [{"file_path": "Dockerfile", "line_number": 3, "new_content": "COPY requirements.txt ."}],
                "reasoning": "Fix typo",
            }
        },
    )
    fix_info = step_fix.get("info", {})
    results.append(
        assert_true(
            "POST /step replace_line",
            status == 200 and fix_info.get("issues_fixed", 0) >= 1,
            str(step_fix),
        )
    )

    status, state = client.get("/state")
    results.append(assert_true("GET /state", status == 200 and "observation" in state, str(state)))

    status, submit = client.post("/step", {"action": {"action_type": "submit", "reasoning": "Done"}})
    results.append(assert_true("POST /step submit", status == 200 and submit.get("done") is True, str(submit)))

    # Hand-written two-step trajectory sent to the grader endpoint.
    trajectory = [
        {
            "step": 1,
            "action": {"action_type": "replace_line", "edits": [{"file_path": "Dockerfile", "line_number": 3}]},
            "reward": 0.3,
            "done": False,
            "info": {"issues_fixed": 1, "issues_total": 1},
        },
        {
            "step": 2,
            "action": {"action_type": "submit"},
            "reward": 0.7,
            "done": True,
            "info": {"issues_fixed": 1, "issues_total": 1},
        },
    ]
    status, grader = client.post("/grader", {"task_id": "dockerfile_syntax", "trajectory": trajectory})
    score = grader.get("result", {}).get("score")
    results.append(
        assert_true(
            "POST /grader",
            # The score must be numeric and lie within [0, 1].
            status == 200 and isinstance(score, (int, float)) and 0.0 <= float(score) <= 1.0,
            str(grader),
        )
    )

    status, baseline = client.post("/baseline", {"task_id": "dockerfile_syntax", "num_episodes": 1})
    results.append(
        assert_true(
            "POST /baseline",
            status == 200 and isinstance(baseline.get("results"), list),
            str(baseline),
        )
    )

    passed = sum(1 for r in results if r.ok)
    total = len(results)

    print("\n=== Smoke Test Results ===")
    for r in results:
        marker = "PASS" if r.ok else "FAIL"
        print(f"[{marker}] {r.name}")
        if not r.ok and r.details:
            detail = r.details
            # Keep the report readable: cap diagnostics at 300 characters.
            if len(detail) > 300:
                detail = detail[:300] + "..."
            print(f" {detail}")

    print(f"\nSummary: {passed}/{total} passed")
    return 0 if passed == total else 1
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def main() -> int:
    """Parse the command line, build the matching client, and run the smoke test.

    Returns:
        The status code produced by ``run_smoke``.
    """
    arg_parser = argparse.ArgumentParser(description="Smoke test CI/CD Debug FastAPI server")
    arg_parser.add_argument("--mode", choices=["inprocess", "live"], default="inprocess")
    arg_parser.add_argument("--base-url", default="http://127.0.0.1:7860")
    options = arg_parser.parse_args()

    # ``--base-url`` is only used in live mode.
    endpoint = InProcessClient() if options.mode == "inprocess" else LiveClient(options.base_url)
    return run_smoke(endpoint)


if __name__ == "__main__":
    raise SystemExit(main())
|
tests/test_determinism.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from server.environment import CICDDebugEnvironment
|
| 2 |
+
from server.graders import run_grader
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def test_reset_deterministic_with_seed():
    """Two independent environments reset with the same seed agree on the episode."""
    first_env, second_env = CICDDebugEnvironment(), CICDDebugEnvironment()

    first = first_env.reset(seed=123)
    second = second_env.reset(seed=123)

    assert first.task_id == second.task_id
    assert first.error.error_message == second.error.error_message
    first_paths = [entry.path for entry in first.files]
    second_paths = [entry.path for entry in second.files]
    assert first_paths == second_paths
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_grader_deterministic_same_trajectory():
    """Grading the same trajectory twice must give equal score and breakdown."""
    edit_step = {
        "step": 1,
        "action": {
            "action_type": "replace_line",
            "edits": [{"file_path": "Dockerfile", "line_number": 3}],
        },
        "reward": 0.3,
        "done": False,
        "info": {"issues_fixed": 1, "issues_total": 2},
    }
    submit_step = {
        "step": 2,
        "action": {"action_type": "submit"},
        "reward": 0.4,
        "done": True,
        "info": {"issues_fixed": 1, "issues_total": 2},
    }
    trajectory = [edit_step, submit_step]

    # The very same trajectory object is handed to the grader both times.
    first, second = [run_grader("dockerfile_syntax", trajectory) for _ in range(2)]

    assert first.score == second.score
    assert first.breakdown == second.breakdown
|
tests/test_endpoints.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi.testclient import TestClient
|
| 2 |
+
|
| 3 |
+
from server.main import app
|
| 4 |
+
|
| 5 |
+
# Module-level test client wrapping the imported app; shared by every test below.
client = TestClient(app)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_root_health():
    """GET / must answer 200 with a JSON body whose "status" is "healthy"."""
    health = client.get("/")
    assert health.status_code == 200

    payload = health.json()
    assert payload["status"] == "healthy"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_reset_and_state():
    """POST /reset with an empty JSON body, then GET /state; both must return 200."""
    reset_response = client.post("/reset", json={})
    assert reset_response.status_code == 200

    state_response = client.get("/state")
    assert state_response.status_code == 200
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_info_and_tasks():
    """GET /info and GET /tasks must each return 200 and list at least six tasks."""
    # Same request order as before: /info is fully checked before /tasks is hit.
    for endpoint in ("/info", "/tasks"):
        response = client.get(endpoint)
        assert response.status_code == 200
        assert len(response.json().get("tasks", [])) >= 6
|
tests/test_environment_flow.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from server.environment import CICDDebugEnvironment
|
| 2 |
+
from server.models import Action, ActionType, FileEdit
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def test_episode_flow_fix_and_autocomplete():
    """A single corrective line edit should earn reward and finish the episode.

    Resets to the "typo_filename" scenario of the "dockerfile_syntax" task,
    replaces line 3 of the Dockerfile, and expects a positive reward, at least
    one fixed issue, and ``done`` set without an explicit submit action.
    """
    env = CICDDebugEnvironment()
    initial = env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename", seed=7)
    assert initial.task_id == "dockerfile_syntax"
    assert initial.total_issues >= 1

    line_fix = FileEdit(
        file_path="Dockerfile",
        line_number=3,
        new_content="COPY requirements.txt .",
    )
    fix_action = Action(
        action_type=ActionType.REPLACE_LINE,
        edits=[line_fix],
        reasoning="Fix typo in requirements filename",
    )
    after_fix, reward, done, info = env.step(fix_action)

    assert reward > 0
    assert info["issues_fixed"] >= 1
    assert done is True
    assert after_fix.issues_fixed >= 1
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_submit_runs_combined_simulation():
    """Submitting right after reset must end the episode and report issue totals."""
    env = CICDDebugEnvironment()
    env.reset(
        task_id="workflow_secrets_permissions",
        scenario_id="missing_env_secrets",
        seed=42,
    )

    submit_action = Action(action_type=ActionType.SUBMIT, reasoning="validate")
    final_obs, reward, done, info = env.step(submit_action)

    assert done is True
    assert "issues_total" in info
    assert reward >= 0.0
    assert final_obs.task_id == "workflow_secrets_permissions"
|
tests/test_simulators.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from server.models import FileContent, FileType
|
| 2 |
+
from server.simulators.docker_simulator import DockerSimulator
|
| 3 |
+
from server.simulators.workflow_simulator import WorkflowSimulator
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def _fc(path: str, content: str, file_type: FileType = FileType.OTHER) -> FileContent:
    """Build a ``FileContent`` fixture for the given path and text.

    ``line_count`` is set to the number of newline characters in ``content``
    plus one.
    """
    newline_total = content.count("\n")
    return FileContent(
        path=path,
        content=content,
        file_type=file_type,
        line_count=newline_total + 1,
    )
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_docker_simulator_catches_missing_copy_source():
    """A COPY of a file absent from the context must fail the build.

    The reported error is expected to mention the missing file name.
    """
    broken_dockerfile = _fc("Dockerfile", "FROM python:3.11-slim\nCOPY missing.txt .", FileType.DOCKERFILE)
    # The context holds only the Dockerfile itself, so "missing.txt" is absent.
    build_context = {"Dockerfile": broken_dockerfile}

    outcome = DockerSimulator().validate(broken_dockerfile, build_context)

    assert outcome["build_success"] is False
    assert "missing.txt" in outcome.get("error", "")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_docker_simulator_detects_runtime_workdir_issue():
    """A Dockerfile with no WORKDIR instruction should build but fail at run."""
    dockerfile_text = "FROM node:18-alpine\nCOPY package*.json ./\nRUN npm ci\nCOPY . .\nCMD [\"npm\", \"start\"]"
    node_dockerfile = _fc("Dockerfile", dockerfile_text, FileType.DOCKERFILE)
    package_manifest = _fc("package.json", '{"name":"app"}')
    build_context = {
        "Dockerfile": node_dockerfile,
        "package.json": package_manifest,
    }

    outcome = DockerSimulator().validate(node_dockerfile, build_context)

    assert outcome["build_success"] is True
    assert outcome["run_success"] is False
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_workflow_simulator_catches_missing_secrets():
    """A workflow using $DOCKER_* variables it never defines must parse but not execute."""
    workflow_path = ".github/workflows/build.yml"
    workflow_text = "name: Build\non: push\njobs:\n build:\n runs-on: ubuntu-latest\n steps:\n - uses: actions/checkout@v4\n - run: echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin"
    workflow_file = _fc(workflow_path, workflow_text, FileType.WORKFLOW)

    outcome = WorkflowSimulator().validate(workflow_file, {workflow_path: workflow_file})

    assert outcome["parse_success"] is True
    assert outcome["execution_success"] is False
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def test_workflow_simulator_catches_yaml_errors():
    """A workflow with an unterminated flow sequence must be reported as unparsable."""
    workflow_path = ".github/workflows/build.yml"
    malformed = _fc(workflow_path, "jobs:\n build: [", FileType.WORKFLOW)

    outcome = WorkflowSimulator().validate(malformed, {workflow_path: malformed})

    assert outcome["parse_success"] is False
|