Commit ·
30bf68a
0
Parent(s):
clean final submission
Browse files- .dockerignore +11 -0
- .env.example +23 -0
- .gitattributes +35 -0
- .gitignore +56 -0
- Dockerfile +13 -0
- README.md +382 -0
- artifacts/baseline_scores.json +60 -0
- artifacts/metrics.json +7 -0
- artifacts/reward_curve.csv +8 -0
- artifacts/success_rate.csv +2 -0
- baseline_inference.py +200 -0
- inference.py +257 -0
- inference/__init__.py +4 -0
- inference/metrics.py +56 -0
- inference/model_wrapper.py +115 -0
- inference/prompts.py +125 -0
- inference/visualize.py +42 -0
- openenv.yaml +54 -0
- pyproject.toml +21 -0
- requirements.txt +9 -0
- server/__init__.py +0 -0
- server/app.py +184 -0
- tests/__init__.py +0 -0
- tests/test_day2_engine.py +199 -0
- tests/test_env.py +113 -0
- tests/test_inference.py +54 -0
- tests/test_judge.py +51 -0
- tests/test_server_api.py +49 -0
- uv.lock +0 -0
- validate-submission.sh +187 -0
.dockerignore
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git/
|
| 2 |
+
.venv/
|
| 3 |
+
.pytest_cache/
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*.log
|
| 7 |
+
.env
|
| 8 |
+
artifacts/
|
| 9 |
+
tests/
|
| 10 |
+
uv.lock
|
| 11 |
+
migrated_from_cicd-debugger-env-2/
|
.env.example
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Default profile: Hugging Face Router (OpenAI-compatible API)
|
| 2 |
+
API_BASE_URL=https://router.huggingface.co/v1
|
| 3 |
+
MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 4 |
+
HF_TOKEN=<your_openai_or_router_api_key>
|
| 5 |
+
|
| 6 |
+
# Optional alias. If both are set, OPENAI_API_KEY is used first by inference.py.
|
| 7 |
+
OPENAI_API_KEY=
|
| 8 |
+
|
| 9 |
+
# OpenAI direct profile (uncomment for OpenAI access token usage):
|
| 10 |
+
# API_BASE_URL=https://api.openai.com/v1
|
| 11 |
+
# MODEL_NAME=gpt-4o-mini
|
| 12 |
+
# HF_TOKEN=<your_openai_access_token>
|
| 13 |
+
# OPENAI_API_KEY=<optional_same_token_as_hf_token>
|
| 14 |
+
|
| 15 |
+
# Optional runtime knobs
|
| 16 |
+
LOCAL_IMAGE_NAME=
|
| 17 |
+
MY_ENV_V4_TASK=easy-command-typo
|
| 18 |
+
MY_ENV_V4_BENCHMARK=cicd_debugger_env
|
| 19 |
+
MAX_STEPS=8
|
| 20 |
+
TEMPERATURE=0.2
|
| 21 |
+
MAX_TOKENS=120
|
| 22 |
+
SUCCESS_SCORE_THRESHOLD=0.1
|
| 23 |
+
OFFLINE_INFERENCE=0
|
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python bytecode and cache
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# Virtual environments
|
| 7 |
+
# Keep source folder env/ tracked; only ignore venv directories.
|
| 8 |
+
.venv/
|
| 9 |
+
venv/
|
| 10 |
+
ENV/
|
| 11 |
+
env.bak/
|
| 12 |
+
venv.bak/
|
| 13 |
+
|
| 14 |
+
# Build and packaging artifacts
|
| 15 |
+
build/
|
| 16 |
+
dist/
|
| 17 |
+
.eggs/
|
| 18 |
+
*.egg-info/
|
| 19 |
+
*.egg
|
| 20 |
+
pip-wheel-metadata/
|
| 21 |
+
|
| 22 |
+
# Testing, typing, linting caches
|
| 23 |
+
.pytest_cache/
|
| 24 |
+
.mypy_cache/
|
| 25 |
+
.ruff_cache/
|
| 26 |
+
.pyre/
|
| 27 |
+
.pytype/
|
| 28 |
+
.hypothesis/
|
| 29 |
+
.tox/
|
| 30 |
+
.nox/
|
| 31 |
+
.coverage
|
| 32 |
+
.coverage.*
|
| 33 |
+
htmlcov/
|
| 34 |
+
|
| 35 |
+
# Jupyter
|
| 36 |
+
.ipynb_checkpoints/
|
| 37 |
+
|
| 38 |
+
# Logs and temp files
|
| 39 |
+
*.log
|
| 40 |
+
*.out
|
| 41 |
+
*.err
|
| 42 |
+
tmp/
|
| 43 |
+
temp/
|
| 44 |
+
|
| 45 |
+
# Local environment and secrets
|
| 46 |
+
.env
|
| 47 |
+
.env.*
|
| 48 |
+
!.env.example
|
| 49 |
+
|
| 50 |
+
# IDE/editor and OS files
|
| 51 |
+
.vscode/
|
| 52 |
+
.idea/
|
| 53 |
+
*.swp
|
| 54 |
+
*.swo
|
| 55 |
+
.DS_Store
|
| 56 |
+
Thumbs.db
|
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt ./
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
COPY . .
|
| 9 |
+
|
| 10 |
+
ENV PORT=7860
|
| 11 |
+
EXPOSE 7860
|
| 12 |
+
|
| 13 |
+
CMD ["python", "-m", "server.app"]
|
README.md
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: CICD_DEBUGGER
|
| 3 |
+
colorFrom: blue
|
| 4 |
+
colorTo: green
|
| 5 |
+
sdk: docker
|
| 6 |
+
app_port: 7860
|
| 7 |
+
pinned: false
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# CI/CD Pipeline Debugger Environment (OpenEnv)
|
| 13 |
+
|
| 14 |
+
## 1. Project Goal
|
| 15 |
+
|
| 16 |
+
This repository implements an AI training and evaluation environment where an agent learns to debug broken CI/CD pipelines automatically.
|
| 17 |
+
|
| 18 |
+
The environment targets real-world DevOps failure patterns, including:
|
| 19 |
+
|
| 20 |
+
- YAML syntax and structure issues
|
| 21 |
+
- Incorrect build/test commands (for example, npm tset -> npm test)
|
| 22 |
+
- Dependency and setup failures
|
| 23 |
+
- Multi-stage pipeline execution errors
|
| 24 |
+
|
| 25 |
+
This is designed as an RL-style interaction loop:
|
| 26 |
+
|
| 27 |
+
Observe -> Think -> Act -> Get Reward -> Repeat
|
| 28 |
+
|
| 29 |
+
## 2. Why This Matters
|
| 30 |
+
|
| 31 |
+
CI/CD failures are common, repetitive, and often multi-step to resolve. This project turns that workflow into a structured learning environment where agents:
|
| 32 |
+
|
| 33 |
+
- Read failure context
|
| 34 |
+
- Reason about root causes
|
| 35 |
+
- Propose and apply fixes
|
| 36 |
+
- Get shaped rewards for robust behavior
|
| 37 |
+
|
| 38 |
+
## 3. System Architecture
|
| 39 |
+
|
| 40 |
+
High-level flow:
|
| 41 |
+
|
| 42 |
+
Agent (LLM) -> Action -> Environment.step() -> Reward/Evaluation -> Next step
|
| 43 |
+
|
| 44 |
+
Core integration path:
|
| 45 |
+
|
| 46 |
+
Model -> Action -> Environment.step() -> RewardCalculator
|
| 47 |
+
|
| 48 |
+
RewardCalculator integrates:
|
| 49 |
+
|
| 50 |
+
- DeterministicGrader
|
| 51 |
+
- LLMJudge
|
| 52 |
+
- HiddenTestRunner
|
| 53 |
+
- AntiHackingDetector
|
| 54 |
+
|
| 55 |
+
### 3.1 OpenEnv Interface (Typed)
|
| 56 |
+
|
| 57 |
+
Typed Pydantic models are defined in `env/models.py`:
|
| 58 |
+
|
| 59 |
+
- `Observation`: strict schema for environment observations
|
| 60 |
+
- `Action`: normalized tool + payload action schema
|
| 61 |
+
- `Reward`: bounded reward model with components
|
| 62 |
+
|
| 63 |
+
Environment contract:
|
| 64 |
+
|
| 65 |
+
- `reset()` returns the initial `Observation` payload
|
| 66 |
+
- `step(action)` returns `(observation, reward, done, info)`
|
| 67 |
+
- `state()` returns current environment state snapshot
|
| 68 |
+
|
| 69 |
+
Server/API contract models are exposed in `server/app.py` and use the same typed observation/action/reward structures.
|
| 70 |
+
|
| 71 |
+
### 3.2 Action and Observation Spaces
|
| 72 |
+
|
| 73 |
+
Observation fields include:
|
| 74 |
+
|
| 75 |
+
- `task_id`, `difficulty`, `failure_stage`, `actual_bug`
|
| 76 |
+
- `config`, `logs`, `error_message`
|
| 77 |
+
- `available_tools`, `progress_flags`
|
| 78 |
+
- `file_modification_count`, `hidden_test_pass_rate`, `step_count`, `last_action_error`
|
| 79 |
+
|
| 80 |
+
Action schema:
|
| 81 |
+
|
| 82 |
+
- `tool`: one of `read_file`, `read_logs`, `analyze_error`, `edit_config`, `run_pipeline_stage`, `run_tests`, `validate_fix`, `submit_solution`
|
| 83 |
+
- `payload`: optional dict (for example `{ "raw": "replace npm tset with npm test" }`)
|
| 84 |
+
|
| 85 |
+
Reward schema:
|
| 86 |
+
|
| 87 |
+
- `value`: bounded float in `[0.0, 1.0]`
|
| 88 |
+
- `components`: reward breakdown dictionary
|
| 89 |
+
|
| 90 |
+
## 4. Core Modules
|
| 91 |
+
|
| 92 |
+
### 4.1 Quality Judge
|
| 93 |
+
|
| 94 |
+
- File: env/graders/llm_judge.py
|
| 95 |
+
- Purpose: quality-aware scoring of fixes
|
| 96 |
+
- Output keys: correctness, minimalism, quality (all in [0,1])
|
| 97 |
+
- Guarantees:
|
| 98 |
+
- strict JSON parsing attempt
|
| 99 |
+
- robust fallback parsing for messy output
|
| 100 |
+
- no-crash behavior (safe zero scores on failure)
|
| 101 |
+
|
| 102 |
+
### 4.2 Deterministic Grader
|
| 103 |
+
|
| 104 |
+
- File: env/graders/deterministic.py
|
| 105 |
+
- Purpose: reproducible correctness scoring (0-1)
|
| 106 |
+
- Checks:
|
| 107 |
+
- YAML validity
|
| 108 |
+
- command and fix correctness
|
| 109 |
+
- similarity and issue resolution
|
| 110 |
+
- Rules:
|
| 111 |
+
- deterministic only
|
| 112 |
+
- same input, same score
|
| 113 |
+
|
| 114 |
+
### 4.3 Anti-Hacking Detector
|
| 115 |
+
|
| 116 |
+
- File: env/anti_hacking.py
|
| 117 |
+
- Purpose: detect reward-hacking and shortcut behavior
|
| 118 |
+
- Penalty detectors:
|
| 119 |
+
- stage skipping (if: false, when: never)
|
| 120 |
+
- fake success (echo tests passed, unsafe exit 0 patterns)
|
| 121 |
+
- pipeline breakage between versions
|
| 122 |
+
- excessive edits
|
| 123 |
+
- timeout abuse via too many steps
|
| 124 |
+
|
| 125 |
+
### 4.4 Hidden Tests
|
| 126 |
+
|
| 127 |
+
- File: env/hidden_tests.py
|
| 128 |
+
- Purpose: test fix robustness, not just exact-match overfitting
|
| 129 |
+
- Method:
|
| 130 |
+
- deterministic variant generation (OS, versions, env shifts)
|
| 131 |
+
- evaluate pass rate across variants
|
| 132 |
+
|
| 133 |
+
### 4.5 Reward Shaping
|
| 134 |
+
|
| 135 |
+
- File: env/rewards.py
|
| 136 |
+
- Purpose: step-level learning signal
|
| 137 |
+
- Components:
|
| 138 |
+
- progress rewards (logs, analysis, fix proposal)
|
| 139 |
+
- execution rewards (pipeline run, tests pass)
|
| 140 |
+
- quality rewards (deterministic + hidden tests + LLM judge)
|
| 141 |
+
- anti-hacking penalties
|
| 142 |
+
|
| 143 |
+
## 5. Inference and Evaluation
|
| 144 |
+
|
| 145 |
+
### 5.1 Prompt and Model Layers
|
| 146 |
+
|
| 147 |
+
- inference/prompts.py: stable prompt templates and fallback action heuristics
|
| 148 |
+
- inference/model_wrapper.py: OpenAI client action generation, candidate generation, and safe fallback
|
| 149 |
+
|
| 150 |
+
Canonical action tools used by environment and inference:
|
| 151 |
+
|
| 152 |
+
- read_file
|
| 153 |
+
- read_logs
|
| 154 |
+
- analyze_error
|
| 155 |
+
- edit_config
|
| 156 |
+
- run_pipeline_stage
|
| 157 |
+
- run_tests
|
| 158 |
+
- validate_fix
|
| 159 |
+
- submit_solution
|
| 160 |
+
|
| 161 |
+
### 5.2 Metrics and Artifacts
|
| 162 |
+
|
| 163 |
+
- inference/metrics.py: reward, success-rate, and failure reason tracking
|
| 164 |
+
- inference/visualize.py: reward curve and metrics artifact export
|
| 165 |
+
|
| 166 |
+
### 5.3 Submission-Critical Runtime
|
| 167 |
+
|
| 168 |
+
- File: inference.py (root)
|
| 169 |
+
- Responsibilities:
|
| 170 |
+
- initialize model and environment
|
| 171 |
+
- run step loop
|
| 172 |
+
- calculate rewards
|
| 173 |
+
- emit strict stdout contract
|
| 174 |
+
- always emit END line
|
| 175 |
+
|
| 176 |
+
Required output format:
|
| 177 |
+
|
| 178 |
+
- [START] task=... env=... model=...
|
| 179 |
+
- [STEP] step=<n> action=... reward=0.00 done=<true|false> error=<msg|null>
|
| 180 |
+
- [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
|
| 181 |
+
|
| 182 |
+
Rules enforced:
|
| 183 |
+
|
| 184 |
+
- single-line logs only
|
| 185 |
+
- reward values with 2 decimals
|
| 186 |
+
- lowercase booleans
|
| 187 |
+
- no extra runtime log noise
|
| 188 |
+
|
| 189 |
+
## 6. Task Coverage
|
| 190 |
+
|
| 191 |
+
The project includes 9 CI-fix tasks spanning:
|
| 192 |
+
|
| 193 |
+
- easy: syntax and typo fixes
|
| 194 |
+
- medium: dependency/env/cache/permissions issues
|
| 195 |
+
- hard: matrix logic, conditional flow, orchestration-level failures
|
| 196 |
+
|
| 197 |
+
Representative baseline tasks (one per difficulty):
|
| 198 |
+
|
| 199 |
+
- easy: `easy-command-typo` (fix invalid `npm tset` command)
|
| 200 |
+
- medium: `medium-python-version` (align workflow Python version)
|
| 201 |
+
- hard: `hard-needs-order` (repair deploy job dependency ordering)
|
| 202 |
+
|
| 203 |
+
## 7. Setup
|
| 204 |
+
|
| 205 |
+
```bash
|
| 206 |
+
python3 -m venv .venv
|
| 207 |
+
source .venv/bin/activate
|
| 208 |
+
pip install -r requirements.txt
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
Environment variables:
|
| 212 |
+
|
| 213 |
+
```bash
|
| 214 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 215 |
+
export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
|
| 216 |
+
export HF_TOKEN="<your_openai_compatible_api_key>"
|
| 217 |
+
# Optional alias; if set, this takes precedence over HF_TOKEN in inference.py
|
| 218 |
+
export OPENAI_API_KEY="<same_token_optional>"
|
| 219 |
+
# Optional, only if your inference spins environments from local images.
|
| 220 |
+
export LOCAL_IMAGE_NAME="<local_env_image_name>"
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
If you want to use an OpenAI access token directly:
|
| 224 |
+
|
| 225 |
+
```bash
|
| 226 |
+
export API_BASE_URL="https://api.openai.com/v1"
|
| 227 |
+
export MODEL_NAME="gpt-4o-mini"
|
| 228 |
+
export HF_TOKEN="<your_openai_access_token>"
|
| 229 |
+
# Optional alias:
|
| 230 |
+
export OPENAI_API_KEY="<same_token_optional>"
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
## 8. Run Inference
|
| 234 |
+
|
| 235 |
+
Offline/local mode:
|
| 236 |
+
|
| 237 |
+
```bash
|
| 238 |
+
python inference.py --offline --force-local-env --max-steps 8 --policy-mode imp --trajectories 4
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
Model-backed mode:
|
| 242 |
+
|
| 243 |
+
```bash
|
| 244 |
+
python inference.py --max-steps 8 --policy-mode imp --trajectories 4
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
Run baseline across easy/medium/hard tasks:
|
| 248 |
+
|
| 249 |
+
OpenAI client mode:
|
| 250 |
+
|
| 251 |
+
```bash
|
| 252 |
+
OPENAI_API_KEY="<your_openai_compatible_api_key>" python baseline_inference.py --max-steps 5 --policy-mode imp --trajectories 3 --force-local-env
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
Offline reproducible mode:
|
| 256 |
+
|
| 257 |
+
```bash
|
| 258 |
+
python baseline_inference.py --max-steps 5 --policy-mode imp --trajectories 3 --offline --force-local-env
|
| 259 |
+
```
|
| 260 |
+
|
| 261 |
+
Policy modes:
|
| 262 |
+
|
| 263 |
+
- sft: deterministic heuristic policy
|
| 264 |
+
- direct: single model action per step
|
| 265 |
+
- imp: multi-candidate generation and ranking
|
| 266 |
+
|
| 267 |
+
## 9. Baseline Scores
|
| 268 |
+
|
| 269 |
+
Reproducible baseline artifact:
|
| 270 |
+
|
| 271 |
+
- `artifacts/baseline_scores.json`
|
| 272 |
+
|
| 273 |
+
Latest baseline run (`max_steps=5`, `policy_mode=imp`, `trajectories=3`):
|
| 274 |
+
|
| 275 |
+
| Task ID | Difficulty | Score | Success |
|
| 276 |
+
|---|---|---:|---:|
|
| 277 |
+
| easy-command-typo | easy | 0.541 | false |
|
| 278 |
+
| medium-python-version | medium | 0.679 | false |
|
| 279 |
+
| hard-needs-order | hard | 0.513 | false |
|
| 280 |
+
|
| 281 |
+
Aggregate:
|
| 282 |
+
|
| 283 |
+
- average score: `0.578`
|
| 284 |
+
- success rate: `0.000`
|
| 285 |
+
|
| 286 |
+
When `OPENAI_API_KEY` is provided, the same script runs with the OpenAI API client path in `inference.py`.
|
| 287 |
+
|
| 288 |
+
## 10. Tests
|
| 289 |
+
|
| 290 |
+
Run all tests:
|
| 291 |
+
|
| 292 |
+
```bash
|
| 293 |
+
python -m unittest discover -s tests -v
|
| 294 |
+
```
|
| 295 |
+
|
| 296 |
+
Coverage includes:
|
| 297 |
+
|
| 298 |
+
- LLM judge
|
| 299 |
+
- deterministic grader
|
| 300 |
+
- anti-hacking detectors
|
| 301 |
+
- hidden tests
|
| 302 |
+
- reward system
|
| 303 |
+
- end-to-end inference output format
|
| 304 |
+
|
| 305 |
+
## 11. Validation and Submission
|
| 306 |
+
|
| 307 |
+
OpenEnv validation:
|
| 308 |
+
|
| 309 |
+
```bash
|
| 310 |
+
python -m openenv.cli.__main__ validate
|
| 311 |
+
```
|
| 312 |
+
|
| 313 |
+
Pre-submission script:
|
| 314 |
+
|
| 315 |
+
```bash
|
| 316 |
+
./validate-submission.sh <your_hf_space_url>
|
| 317 |
+
```
|
| 318 |
+
|
| 319 |
+
Required environment variables:
|
| 320 |
+
|
| 321 |
+
```bash
|
| 322 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 323 |
+
export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
|
| 324 |
+
export OPENAI_API_KEY="<your_openai_compatible_api_key>"
|
| 325 |
+
# Optional fallback:
|
| 326 |
+
export HF_TOKEN="<your_token>"
|
| 327 |
+
```
|
| 328 |
+
|
| 329 |
+
Docker run (Space/API mode):
|
| 330 |
+
|
| 331 |
+
```bash
|
| 332 |
+
docker build -t cicd-debugger-env .
|
| 333 |
+
docker run --rm -p 7860:7860 cicd-debugger-env
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
Server endpoints used by validators:
|
| 337 |
+
|
| 338 |
+
- `POST /reset`
|
| 339 |
+
- `POST /step`
|
| 340 |
+
- `GET /state`
|
| 341 |
+
- `GET /health`
|
| 342 |
+
|
| 343 |
+
## 12. Deploy to Hugging Face Space (OpenAI Token)
|
| 344 |
+
|
| 345 |
+
This repository is already configured for Docker Spaces (`sdk: docker` in this README front matter).
|
| 346 |
+
|
| 347 |
+
1. Create a new Hugging Face Space with SDK set to `Docker`.
|
| 348 |
+
2. Push this repository to the Space git remote.
|
| 349 |
+
3. In Space Settings -> Variables and secrets, add these Secrets:
|
| 350 |
+
|
| 351 |
+
```text
|
| 352 |
+
OPENAI_API_KEY=<your_openai_access_token>
|
| 353 |
+
API_BASE_URL=https://api.openai.com/v1
|
| 354 |
+
MODEL_NAME=gpt-4o-mini
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
4. Optional Secrets:
|
| 358 |
+
|
| 359 |
+
```text
|
| 360 |
+
HF_TOKEN=<optional_fallback_token>
|
| 361 |
+
OFFLINE_INFERENCE=0
|
| 362 |
+
MAX_STEPS=8
|
| 363 |
+
TEMPERATURE=0.2
|
| 364 |
+
MAX_TOKENS=120
|
| 365 |
+
```
|
| 366 |
+
|
| 367 |
+
5. Keep the app port as `7860` (already configured).
|
| 368 |
+
6. Wait for build completion, then verify:
|
| 369 |
+
|
| 370 |
+
```bash
|
| 371 |
+
curl -sS https://<your-space-name>.hf.space/health
|
| 372 |
+
curl -sS -X POST https://<your-space-name>.hf.space/reset -H 'Content-Type: application/json' -d '{}'
|
| 373 |
+
```
|
| 374 |
+
|
| 375 |
+
Notes:
|
| 376 |
+
|
| 377 |
+
- `.env.example` is for local development reference only. Hugging Face Spaces use Secrets/Variables from Space Settings.
|
| 378 |
+
- Runtime code reads `OPENAI_API_KEY` first and falls back to `HF_TOKEN` when `OPENAI_API_KEY` is not provided.
|
| 379 |
+
|
| 380 |
+
## 13. One-line Presentation Summary
|
| 381 |
+
|
| 382 |
+
We built an OpenEnv-compliant reinforcement learning environment where AI agents learn to debug real CI/CD pipelines using multi-step reasoning, hybrid grading, anti-hacking safeguards, and robust reward shaping.
|
artifacts/baseline_scores.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"mode": "offline",
|
| 3 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
| 4 |
+
"api_base_url": "https://router.huggingface.co/v1",
|
| 5 |
+
"max_steps": 5,
|
| 6 |
+
"policy_mode": "imp",
|
| 7 |
+
"trajectories": 3,
|
| 8 |
+
"average_score": 0.578,
|
| 9 |
+
"success_rate": 0.0,
|
| 10 |
+
"results": [
|
| 11 |
+
{
|
| 12 |
+
"task_id": "easy-command-typo",
|
| 13 |
+
"difficulty": "easy",
|
| 14 |
+
"success": false,
|
| 15 |
+
"steps": 5,
|
| 16 |
+
"score": 0.541,
|
| 17 |
+
"rewards": [
|
| 18 |
+
0.3,
|
| 19 |
+
0.36,
|
| 20 |
+
0.57,
|
| 21 |
+
0.63,
|
| 22 |
+
0.84
|
| 23 |
+
],
|
| 24 |
+
"start_line": "[START] task=easy-command-typo env=cicd_debugger_env model=Qwen/Qwen2.5-72B-Instruct",
|
| 25 |
+
"end_line": "[END] success=false steps=5 score=0.541 rewards=0.30,0.36,0.57,0.63,0.84"
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"task_id": "medium-python-version",
|
| 29 |
+
"difficulty": "medium",
|
| 30 |
+
"success": false,
|
| 31 |
+
"steps": 5,
|
| 32 |
+
"score": 0.679,
|
| 33 |
+
"rewards": [
|
| 34 |
+
0.48,
|
| 35 |
+
0.54,
|
| 36 |
+
0.58,
|
| 37 |
+
0.79,
|
| 38 |
+
1.0
|
| 39 |
+
],
|
| 40 |
+
"start_line": "[START] task=medium-python-version env=cicd_debugger_env model=Qwen/Qwen2.5-72B-Instruct",
|
| 41 |
+
"end_line": "[END] success=false steps=5 score=0.679 rewards=0.48,0.54,0.58,0.79,1.00"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"task_id": "hard-needs-order",
|
| 45 |
+
"difficulty": "hard",
|
| 46 |
+
"success": false,
|
| 47 |
+
"steps": 5,
|
| 48 |
+
"score": 0.513,
|
| 49 |
+
"rewards": [
|
| 50 |
+
0.48,
|
| 51 |
+
0.54,
|
| 52 |
+
0.52,
|
| 53 |
+
0.57,
|
| 54 |
+
0.46
|
| 55 |
+
],
|
| 56 |
+
"start_line": "[START] task=hard-needs-order env=cicd_debugger_env model=Qwen/Qwen2.5-72B-Instruct",
|
| 57 |
+
"end_line": "[END] success=false steps=5 score=0.513 rewards=0.48,0.54,0.52,0.57,0.46"
|
| 58 |
+
}
|
| 59 |
+
]
|
| 60 |
+
}
|
artifacts/metrics.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"average_reward": 0.6719,
|
| 3 |
+
"failure_reasons": {},
|
| 4 |
+
"steps": 7,
|
| 5 |
+
"success_rate": 0.1429,
|
| 6 |
+
"total_reward": 4.7032
|
| 7 |
+
}
|
artifacts/reward_curve.csv
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
step,reward
|
| 2 |
+
1,0.3016
|
| 3 |
+
2,0.3616
|
| 4 |
+
3,0.5700
|
| 5 |
+
4,0.6300
|
| 6 |
+
5,0.8400
|
| 7 |
+
6,1.0000
|
| 8 |
+
7,1.0000
|
artifacts/success_rate.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
episode,success,success_rate
|
| 2 |
+
1,1,1.0000
|
baseline_inference.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import re
|
| 8 |
+
import subprocess
|
| 9 |
+
import sys
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
BASELINE_TASKS: list[tuple[str, str]] = [
|
| 14 |
+
("easy-command-typo", "easy"),
|
| 15 |
+
("medium-python-version", "medium"),
|
| 16 |
+
("hard-needs-order", "hard"),
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
END_PATTERN = re.compile(
|
| 20 |
+
r"^\[END\] success=(true|false) steps=(\d+) score=(\d+\.\d{3}) rewards=(.*)$"
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def parse_args() -> argparse.Namespace:
|
| 25 |
+
parser = argparse.ArgumentParser(description="Run baseline inference on easy/medium/hard tasks")
|
| 26 |
+
parser.add_argument("--tasks", default=",".join(task for task, _ in BASELINE_TASKS))
|
| 27 |
+
parser.add_argument("--max-steps", type=int, default=int(os.getenv("MAX_STEPS", "8")))
|
| 28 |
+
parser.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
|
| 29 |
+
parser.add_argument("--trajectories", type=int, default=3)
|
| 30 |
+
parser.add_argument("--benchmark", default=os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env"))
|
| 31 |
+
parser.add_argument("--offline", action="store_true", default=False)
|
| 32 |
+
parser.add_argument("--force-local-env", action="store_true", default=True)
|
| 33 |
+
parser.add_argument("--output", default="artifacts/baseline_scores.json")
|
| 34 |
+
return parser.parse_args()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def should_run_offline(args: argparse.Namespace) -> bool:
|
| 38 |
+
if args.offline:
|
| 39 |
+
return True
|
| 40 |
+
|
| 41 |
+
key = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN")
|
| 42 |
+
if not key:
|
| 43 |
+
return True
|
| 44 |
+
|
| 45 |
+
return os.getenv("OFFLINE_INFERENCE", "0") == "1"
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def parse_end_line(lines: list[str]) -> dict[str, Any]:
|
| 49 |
+
for raw_line in reversed(lines):
|
| 50 |
+
line = raw_line.strip()
|
| 51 |
+
if not line.startswith("[END] "):
|
| 52 |
+
continue
|
| 53 |
+
|
| 54 |
+
matched = END_PATTERN.match(line)
|
| 55 |
+
if not matched:
|
| 56 |
+
raise RuntimeError(f"Malformed END line: {line}")
|
| 57 |
+
|
| 58 |
+
success = matched.group(1) == "true"
|
| 59 |
+
steps = int(matched.group(2))
|
| 60 |
+
score = float(matched.group(3))
|
| 61 |
+
rewards_str = matched.group(4).strip()
|
| 62 |
+
|
| 63 |
+
rewards: list[float] = []
|
| 64 |
+
if rewards_str:
|
| 65 |
+
rewards = [float(value) for value in rewards_str.split(",") if value]
|
| 66 |
+
|
| 67 |
+
return {
|
| 68 |
+
"success": success,
|
| 69 |
+
"steps": steps,
|
| 70 |
+
"score": score,
|
| 71 |
+
"rewards": rewards,
|
| 72 |
+
"end_line": line,
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
raise RuntimeError("No END line found in inference output")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def run_single_task(
|
| 79 |
+
task_id: str,
|
| 80 |
+
difficulty: str,
|
| 81 |
+
args: argparse.Namespace,
|
| 82 |
+
project_root: Path,
|
| 83 |
+
offline_mode: bool,
|
| 84 |
+
) -> dict[str, Any]:
|
| 85 |
+
command = [
|
| 86 |
+
sys.executable,
|
| 87 |
+
"inference.py",
|
| 88 |
+
"--task",
|
| 89 |
+
task_id,
|
| 90 |
+
"--benchmark",
|
| 91 |
+
str(args.benchmark),
|
| 92 |
+
"--max-steps",
|
| 93 |
+
str(max(1, int(args.max_steps))),
|
| 94 |
+
"--policy-mode",
|
| 95 |
+
str(args.policy_mode),
|
| 96 |
+
"--trajectories",
|
| 97 |
+
str(max(1, int(args.trajectories))),
|
| 98 |
+
]
|
| 99 |
+
|
| 100 |
+
if offline_mode:
|
| 101 |
+
command.append("--offline")
|
| 102 |
+
if args.force_local_env:
|
| 103 |
+
command.append("--force-local-env")
|
| 104 |
+
|
| 105 |
+
env = os.environ.copy()
|
| 106 |
+
if offline_mode:
|
| 107 |
+
env["OFFLINE_INFERENCE"] = "1"
|
| 108 |
+
|
| 109 |
+
completed = subprocess.run(
|
| 110 |
+
command,
|
| 111 |
+
cwd=project_root,
|
| 112 |
+
capture_output=True,
|
| 113 |
+
text=True,
|
| 114 |
+
env=env,
|
| 115 |
+
check=True,
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
lines = [line for line in completed.stdout.splitlines() if line.strip()]
|
| 119 |
+
summary = parse_end_line(lines)
|
| 120 |
+
|
| 121 |
+
return {
|
| 122 |
+
"task_id": task_id,
|
| 123 |
+
"difficulty": difficulty,
|
| 124 |
+
"success": summary["success"],
|
| 125 |
+
"steps": summary["steps"],
|
| 126 |
+
"score": summary["score"],
|
| 127 |
+
"rewards": summary["rewards"],
|
| 128 |
+
"start_line": next((line for line in lines if line.startswith("[START] ")), ""),
|
| 129 |
+
"end_line": summary["end_line"],
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def main() -> int:
    """Run the baseline sweep: one inference.py subprocess per requested task,
    then aggregate scores into a JSON report at --output. Returns a process
    exit code (0 on success)."""
    args = parse_args()
    project_root = Path(__file__).resolve().parent

    # Map task id -> difficulty for the known baseline tasks; any other task
    # requested on the command line is labelled "custom".
    known_difficulties = {task: difficulty for task, difficulty in BASELINE_TASKS}
    requested_tasks = [task.strip() for task in str(args.tasks).split(",") if task.strip()]

    if not requested_tasks:
        print("No tasks provided for baseline run", file=sys.stderr)
        return 1

    offline_mode = should_run_offline(args)

    print(
        f"[BASELINE] mode={'offline' if offline_mode else 'openai'} tasks={len(requested_tasks)} "
        f"max_steps={max(1, int(args.max_steps))} policy={args.policy_mode}",
        flush=True,
    )

    results: list[dict[str, Any]] = []
    for task_id in requested_tasks:
        difficulty = known_difficulties.get(task_id, "custom")
        try:
            result = run_single_task(task_id, difficulty, args, project_root, offline_mode)
            results.append(result)
            print(
                f"[BASELINE] task={task_id} difficulty={difficulty} success={str(result['success']).lower()} "
                f"score={result['score']:.3f} steps={result['steps']}",
                flush=True,
            )
        except subprocess.CalledProcessError as exc:
            # The child inference process exited non-zero: surface its output
            # on stderr and propagate its return code so the run fails loudly.
            print(f"[BASELINE] task={task_id} failed with return code {exc.returncode}", file=sys.stderr)
            if exc.stdout:
                print(exc.stdout, file=sys.stderr)
            if exc.stderr:
                print(exc.stderr, file=sys.stderr)
            return exc.returncode or 1
        except Exception as exc:
            print(f"[BASELINE] task={task_id} failed: {exc}", file=sys.stderr)
            return 1

    # requested_tasks is non-empty and any failure returned early, so results
    # is guaranteed non-empty here (no ZeroDivisionError).
    average_score = sum(item["score"] for item in results) / len(results)
    success_rate = sum(1 for item in results if item["success"]) / len(results)

    payload = {
        "mode": "offline" if offline_mode else "openai",
        "model_name": os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct"),
        "api_base_url": os.getenv("API_BASE_URL", "https://router.huggingface.co/v1"),
        "max_steps": max(1, int(args.max_steps)),
        "policy_mode": str(args.policy_mode),
        "trajectories": max(1, int(args.trajectories)),
        "average_score": round(float(average_score), 3),
        "success_rate": round(float(success_rate), 3),
        "results": results,
    }

    # Resolve the report path relative to the repo root so artifacts/ paths
    # work regardless of the caller's cwd.
    output_path = project_root / str(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")

    print(f"[BASELINE] average_score={payload['average_score']:.3f} success_rate={payload['success_rate']:.3f}", flush=True)
    print(f"[BASELINE] wrote {output_path}", flush=True)

    return 0
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())
|
inference.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import asyncio
|
| 5 |
+
import os
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from openai import OpenAI
|
| 9 |
+
|
| 10 |
+
from env.environment import CICDDebuggerEnvironment, REQUIRED_TOOLS
|
| 11 |
+
from inference.metrics import EpisodeMetrics
|
| 12 |
+
from inference.model_wrapper import ModelWrapper, score_action_candidate
|
| 13 |
+
from inference.prompts import heuristic_action
|
| 14 |
+
from inference.visualize import save_metrics_json, save_reward_curve, save_success_rate_history
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# OpenAI-compatible endpoint configuration; the HF router is the default
# profile (see .env.example).
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# OPENAI_API_KEY takes precedence over HF_TOKEN when both are set.
API_KEY = OPENAI_API_KEY or HF_TOKEN
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
DEFAULT_TASK_ID = os.getenv("MY_ENV_V4_TASK", "easy-command-typo")
DEFAULT_BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env")

# Episode and generation knobs, all overridable via environment variables.
MAX_STEPS_DEFAULT = int(os.getenv("MAX_STEPS", "8"))
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "120"))
# When "1", skip the API entirely and use the heuristic policy.
OFFLINE_INFERENCE = os.getenv("OFFLINE_INFERENCE", "0") == "1"
# Minimum clamped average reward for an episode to count as a success.
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.1"))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def log_start(task: str, env_name: str, model: str) -> None:
    """Emit the machine-readable [START] banner for an episode."""
    fields = (
        f"task={_single_line(task)}",
        f"env={_single_line(env_name)}",
        f"model={_single_line(model)}",
    )
    print("[START] " + " ".join(fields), flush=True)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
    """Emit one machine-readable [STEP] line; error renders as 'null' when absent."""
    rendered_error = _single_line(error) if error else "null"
    print(
        f"[STEP] step={step} action={_single_line(action)} "
        f"reward={reward:.2f} done={str(done).lower()} error={rendered_error}",
        flush=True,
    )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Emit the final [END] summary line with a comma-joined reward list."""
    formatted_rewards = ",".join(f"{reward:.2f}" for reward in rewards)
    summary = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={formatted_rewards}"
    )
    print(summary, flush=True)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _single_line(value: Any) -> str:
|
| 50 |
+
return " ".join(str(value).replace("\n", " ").replace("\r", " ").split())
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _is_hacking_action(action_text: str) -> bool:
|
| 54 |
+
value = (action_text or "").lower()
|
| 55 |
+
patterns = (
|
| 56 |
+
"if: false",
|
| 57 |
+
"when: never",
|
| 58 |
+
"echo \"tests passed\"",
|
| 59 |
+
"echo 'tests passed'",
|
| 60 |
+
"exit 0",
|
| 61 |
+
"force success",
|
| 62 |
+
"status: success",
|
| 63 |
+
)
|
| 64 |
+
return any(token in value for token in patterns)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _extract_error(info: dict[str, Any] | None) -> str | None:
|
| 68 |
+
if not info:
|
| 69 |
+
return None
|
| 70 |
+
error = info.get("error")
|
| 71 |
+
return str(error) if error else None
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _extract_observation_fields(observation: dict[str, Any]) -> tuple[str, str, list[str]]:
|
| 75 |
+
config_text = str(observation.get("config") or "")
|
| 76 |
+
error_message = str(observation.get("error_message") or "")
|
| 77 |
+
tools = [str(item) for item in (observation.get("available_tools") or REQUIRED_TOOLS)]
|
| 78 |
+
return config_text, error_message, tools
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _tool_from_action(action_text: str) -> str:
|
| 82 |
+
return str(action_text or "").split(":", 1)[0].strip().lower()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _is_action_allowed(action_text: str, available_tools: list[str]) -> bool:
    """True when the action's tool prefix matches an available tool (case-insensitive)."""
    permitted = {tool.lower() for tool in available_tools}
    return _tool_from_action(action_text) in permitted
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _normalize_action(action_text: str, available_tools: list[str], fallback: str) -> str:
    """Canonicalize a raw action string against the allowed tool set.

    Empty input, and any action whose (alias-resolved) tool is not available,
    collapses to the fallback action.
    """
    candidate = str(action_text or "").strip()
    if not candidate:
        return fallback

    # Common shorthand tool names the model tends to emit.
    alias_map = {
        "run_stage": "run_pipeline_stage",
        "validate": "validate_fix",
        "submit": "submit_solution",
        "submit_fix": "submit_solution",
    }
    original_tool = _tool_from_action(candidate)
    canonical_tool = alias_map.get(original_tool, original_tool)
    if canonical_tool != original_tool:
        # Rebuild the action with the canonical tool, keeping any payload.
        _head, _sep, remainder = candidate.partition(":")
        remainder = remainder.strip()
        candidate = f"{canonical_tool}: {remainder}" if remainder else canonical_tool

    if _is_action_allowed(candidate, available_tools):
        return candidate
    return fallback
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _select_action(
    model_wrapper: ModelWrapper,
    step: int,
    config_text: str,
    error_message: str,
    history: list[str],
    available_actions: list[str],
    policy_mode: str,
    trajectories: int,
) -> str:
    """Pick the next action according to the policy mode.

    Modes:
      "sft"    -> pure heuristic policy, no model call.
      "direct" -> one model completion, normalized against available tools.
      "imp"    -> (default) best-of-N: sample candidates and keep the one
                  with the highest heuristic score.
    Every result is normalized; unknown or disallowed actions fall back to
    the heuristic suggestion.
    """
    mode = (policy_mode or "imp").lower()
    # The heuristic suggestion doubles as the fallback for every mode.
    fallback = heuristic_action(config_text, error_message, available_actions, history)

    if mode == "sft":
        return _normalize_action(fallback, available_actions, fallback)

    if mode == "direct":
        action = model_wrapper.generate_action(
            step=step,
            config_text=config_text,
            error_message=error_message,
            history=history,
            available_actions=available_actions,
        )
        return _normalize_action(action, available_actions, fallback)

    # "imp" mode: sample N candidate actions, then re-rank them.
    candidates = model_wrapper.generate_candidates(
        step=step,
        config_text=config_text,
        error_message=error_message,
        history=history,
        count=max(1, int(trajectories)),
        available_actions=available_actions,
    )

    if not candidates:
        return _normalize_action(fallback, available_actions, fallback)

    # Rank candidates against the raw observation; reward-hacking actions
    # are penalized inside score_action_candidate via _is_hacking_action.
    observation_text = f"{config_text}\n{error_message}"
    best = max(candidates, key=lambda item: score_action_candidate(observation_text, item, _is_hacking_action))
    return _normalize_action(best, available_actions, fallback)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def parse_args() -> argparse.Namespace:
    """CLI arguments for one inference episode; env vars provide the defaults."""
    parser = argparse.ArgumentParser(description="Run CI/CD debugger inference loop")
    parser.add_argument("--max-steps", type=int, default=MAX_STEPS_DEFAULT)
    parser.add_argument("--task", default=DEFAULT_TASK_ID)
    parser.add_argument("--benchmark", default=DEFAULT_BENCHMARK)
    parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default=None)
    # Defaults to the OFFLINE_INFERENCE env flag so parent processes can force
    # offline mode without changing the command line.
    parser.add_argument("--offline", action="store_true", default=OFFLINE_INFERENCE)
    parser.add_argument("--force-local-env", action="store_true", default=False)
    parser.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
    parser.add_argument("--trajectories", type=int, default=3)
    return parser.parse_args()
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
async def run_episode(args: argparse.Namespace) -> int:
    """Run one full episode against CICDDebuggerEnvironment.

    Always returns 0; the outcome is communicated via the [START]/[STEP]/[END]
    log lines and the artifacts files written in the finally block.
    """
    history: list[str] = []
    steps_taken = 0
    success = False
    episode_completed_cleanly = False
    metrics = EpisodeMetrics()

    env = CICDDebuggerEnvironment(max_steps=max(1, int(args.max_steps)))

    # Fall back to the offline heuristic policy when no API key is configured
    # or the OpenAI client cannot be constructed.
    offline_mode = bool(args.offline or not API_KEY)
    client: OpenAI | None = None
    if not offline_mode:
        try:
            client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
        except Exception:
            client = None
            offline_mode = True

    model_wrapper = ModelWrapper(
        client=client,
        model_name=MODEL_NAME,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        offline=offline_mode,
    )

    log_start(task=str(args.task), env_name=str(args.benchmark), model=MODEL_NAME)

    try:
        observation = await env.reset(task_id=str(args.task), difficulty=args.difficulty)

        # Steps are 1-indexed in logs and history.
        for step in range(1, max(1, int(args.max_steps)) + 1):
            config_text, error_message, available_tools = _extract_observation_fields(observation)

            action_text = _select_action(
                model_wrapper=model_wrapper,
                step=step,
                config_text=config_text,
                error_message=error_message,
                history=history,
                available_actions=available_tools,
                policy_mode=str(args.policy_mode),
                trajectories=max(1, int(args.trajectories)),
            )

            observation, reward, done, info = await env.step(action_text)
            step_error = _extract_error(info)

            metrics.add_step(action=action_text, reward=float(reward), error=step_error, done=bool(done))
            steps_taken = step

            log_step(step=step, action=action_text, reward=float(reward), done=bool(done), error=step_error)
            history.append(f"step={step} action={_single_line(action_text)} reward={float(reward):.2f}")

            if done:
                # A "clean" finish: terminal step had no error and the final
                # action was not a reward-hacking shortcut.
                episode_completed_cleanly = step_error is None and not _is_hacking_action(action_text)
                break

    except Exception as exc:
        success = False
        # Record at least one step so metrics/artifacts are never empty.
        if not metrics.rewards:
            metrics.add_step(action="system_error", reward=0.0, error=str(exc), done=True)
    finally:
        # Score = clamped average step reward; success additionally requires a
        # clean terminal step and the configured threshold.
        score = max(0.0, min(1.0, float(metrics.average_reward)))
        success = episode_completed_cleanly and score >= SUCCESS_SCORE_THRESHOLD

        # Artifact writing is best-effort; never mask the episode result.
        try:
            save_reward_curve(metrics.rewards)
            save_metrics_json(metrics.summary())
            save_success_rate_history([success])
        except Exception:
            pass

        # Environment teardown is likewise best-effort.
        try:
            await env.close()
        except Exception:
            pass

        log_end(success=success, steps=steps_taken, score=score, rewards=metrics.rewards)

    return 0
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def main() -> int:
    """Synchronous entry point: parse CLI arguments and drive one episode."""
    cli_args = parse_args()
    return asyncio.run(run_episode(cli_args))
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())
|
inference/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Package facade: re-export the commonly used inference helpers.
from inference.metrics import EpisodeMetrics
from inference.model_wrapper import ModelWrapper

__all__ = ["EpisodeMetrics", "ModelWrapper"]
|
inference/metrics.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass
|
| 7 |
+
class EpisodeMetrics:
|
| 8 |
+
rewards: list[float] = field(default_factory=list)
|
| 9 |
+
actions: list[str] = field(default_factory=list)
|
| 10 |
+
errors: list[str | None] = field(default_factory=list)
|
| 11 |
+
dones: list[bool] = field(default_factory=list)
|
| 12 |
+
|
| 13 |
+
def add_step(self, action: str, reward: float, error: str | None, done: bool) -> None:
|
| 14 |
+
self.actions.append(action)
|
| 15 |
+
self.rewards.append(float(reward))
|
| 16 |
+
self.errors.append(error)
|
| 17 |
+
self.dones.append(bool(done))
|
| 18 |
+
|
| 19 |
+
@property
|
| 20 |
+
def steps(self) -> int:
|
| 21 |
+
return len(self.rewards)
|
| 22 |
+
|
| 23 |
+
@property
|
| 24 |
+
def total_reward(self) -> float:
|
| 25 |
+
return round(sum(self.rewards), 4)
|
| 26 |
+
|
| 27 |
+
@property
|
| 28 |
+
def average_reward(self) -> float:
|
| 29 |
+
if not self.rewards:
|
| 30 |
+
return 0.0
|
| 31 |
+
return round(self.total_reward / len(self.rewards), 4)
|
| 32 |
+
|
| 33 |
+
@property
|
| 34 |
+
def success_rate(self) -> float:
|
| 35 |
+
if not self.dones:
|
| 36 |
+
return 0.0
|
| 37 |
+
successes = sum(1 for flag in self.dones if flag)
|
| 38 |
+
return round(successes / len(self.dones), 4)
|
| 39 |
+
|
| 40 |
+
@property
|
| 41 |
+
def failure_reasons(self) -> dict[str, int]:
|
| 42 |
+
counts: dict[str, int] = {}
|
| 43 |
+
for err in self.errors:
|
| 44 |
+
if not err:
|
| 45 |
+
continue
|
| 46 |
+
counts[err] = counts.get(err, 0) + 1
|
| 47 |
+
return counts
|
| 48 |
+
|
| 49 |
+
def summary(self) -> dict[str, float | int | dict[str, int]]:
|
| 50 |
+
return {
|
| 51 |
+
"steps": self.steps,
|
| 52 |
+
"total_reward": self.total_reward,
|
| 53 |
+
"average_reward": self.average_reward,
|
| 54 |
+
"success_rate": self.success_rate,
|
| 55 |
+
"failure_reasons": self.failure_reasons,
|
| 56 |
+
}
|
inference/model_wrapper.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Any, Iterable
|
| 5 |
+
|
| 6 |
+
from openai import OpenAI
|
| 7 |
+
|
| 8 |
+
from inference.prompts import REQUIRED_ACTIONS, SYSTEM_PROMPT, build_user_prompt, heuristic_action, sanitize_action_text
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class ModelWrapper:
    """Thin policy wrapper around an (optional) OpenAI-compatible client.

    When ``offline`` is True or no client is available, every call degrades
    to the deterministic heuristic policy from inference.prompts.
    """

    client: OpenAI | None
    model_name: str
    temperature: float
    max_tokens: int
    offline: bool

    def generate_action(
        self,
        step: int,
        config_text: str,
        error_message: str,
        history: list[str],
        available_actions: Iterable[str] | None = None,
    ) -> str:
        """Return one single-line action; falls back to the heuristic on any failure."""
        safe_default = heuristic_action(config_text, error_message, available_actions, history)
        if self.offline or self.client is None:
            return safe_default

        prompt = build_user_prompt(
            step=step,
            config_text=config_text,
            error_message=error_message,
            history=history,
            available_actions=available_actions,
        )

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=max(float(self.temperature), 0.0),
                max_tokens=max(16, int(self.max_tokens)),
                stream=False,
            )
            raw_reply = str(response.choices[0].message.content or "")
            return sanitize_action_text(raw_reply, fallback=safe_default)
        except Exception:
            # Any API/parse failure degrades to the heuristic action.
            return safe_default

    def generate_candidates(
        self,
        step: int,
        config_text: str,
        error_message: str,
        history: list[str],
        count: int,
        available_actions: Iterable[str] | None = None,
    ) -> list[str]:
        """Return a deduplicated candidate pool, seeded with the heuristic action."""
        pool: list[str] = [heuristic_action(config_text, error_message, available_actions, history)]

        for sample_idx in range(max(1, count)):
            # The candidate marker in the history nudges variation per sample.
            sampled = self.generate_action(
                step=step,
                config_text=config_text,
                error_message=error_message,
                history=history + [f"candidate={sample_idx}"],
                available_actions=available_actions,
            )
            if sampled:
                pool.append(sampled)

        unique: list[str] = []
        observed: set[str] = set()
        for entry in pool:
            trimmed = entry.strip()
            if trimmed and trimmed not in observed:
                observed.add(trimmed)
                unique.append(trimmed)
        return unique
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def score_action_candidate(observation_text: str, action_text: str, hacking_predicate: Any) -> float:
    """Heuristically score a candidate action against the observation text.

    Known tool prefixes earn a bonus, observation-specific fix patterns add
    more, reward-hacking actions (per hacking_predicate) are strongly
    penalized, and a small capped length bonus breaks ties.
    """
    obs_lower = (observation_text or "").lower()
    act_lower = (action_text or "").lower()

    total = 0.0

    # Recognized tool prefix: +0.5; anything else: -0.5.
    tool_name = act_lower.split(":", 1)[0].strip()
    total += 0.5 if tool_name in REQUIRED_ACTIONS else -0.5

    # Observation/action pattern bonuses.
    if "npm tset" in obs_lower and "edit_config" in act_lower and "npm test" in act_lower:
        total += 2.0
    if "yaml" in obs_lower and "edit_config" in act_lower:
        total += 1.2
    if "missing" in obs_lower and "analyze_error" in act_lower:
        total += 0.8
    if "validate_fix" in act_lower or "submit_solution" in act_lower:
        total += 0.5

    # Strong penalty for reward-hacking attempts.
    if hacking_predicate(action_text):
        total -= 3.0

    # Small length bonus, capped at 0.2.
    total += min(len(action_text) / 240.0, 0.2)
    return total
|
inference/prompts.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import textwrap
|
| 5 |
+
from typing import Iterable
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# System prompt for the policy model: forces a single-line plain-text action.
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are a CI/CD pipeline debugger assistant.
    Return exactly one single-line action describing the next debugging move.
    Do not output markdown. Do not include explanations.
    """
).strip()

# System prompt for the LLM judge: JSON-only rubric scores in [0, 1].
JUDGE_SYSTEM_PROMPT = textwrap.dedent(
    """
    You are a strict CI/CD judge.
    Return JSON only with keys correctness, minimalism, quality and values in [0,1].
    """
).strip()

# Canonical tool vocabulary; used as the default action space and by
# _extract_seen_tools to detect tools already present in the history.
REQUIRED_ACTIONS = (
    "read_file",
    "read_logs",
    "analyze_error",
    "edit_config",
    "run_pipeline_stage",
    "run_tests",
    "validate_fix",
    "submit_solution",
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def build_user_prompt(
|
| 36 |
+
step: int,
|
| 37 |
+
config_text: str,
|
| 38 |
+
error_message: str,
|
| 39 |
+
history: list[str],
|
| 40 |
+
available_actions: Iterable[str] | None = None,
|
| 41 |
+
) -> str:
|
| 42 |
+
history_text = "\n".join(history[-5:]) if history else "None"
|
| 43 |
+
actions_text = ", ".join(available_actions) if available_actions else ", ".join(REQUIRED_ACTIONS)
|
| 44 |
+
|
| 45 |
+
return textwrap.dedent(
|
| 46 |
+
f"""
|
| 47 |
+
Step: {step}
|
| 48 |
+
|
| 49 |
+
Current config:
|
| 50 |
+
{config_text}
|
| 51 |
+
|
| 52 |
+
Current error:
|
| 53 |
+
{error_message}
|
| 54 |
+
|
| 55 |
+
Recent history:
|
| 56 |
+
{history_text}
|
| 57 |
+
|
| 58 |
+
Available action categories:
|
| 59 |
+
{actions_text}
|
| 60 |
+
|
| 61 |
+
Output one actionable single-line fix/debug action.
|
| 62 |
+
"""
|
| 63 |
+
).strip()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def sanitize_action_text(raw_text: str, fallback: str = "read logs and analyze failing command") -> str:
    """Collapse model output onto a single trimmed line; empty input -> fallback.

    str.split() treats \n and \r as whitespace, so a single split/join pass
    flattens the text and collapses runs of spaces.
    """
    collapsed = " ".join((raw_text or "").split())
    return collapsed or fallback
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def heuristic_action(
    config_text: str,
    error_message: str,
    available_actions: Iterable[str] | None = None,
    history: list[str] | None = None,
) -> str:
    """Deterministic fallback policy: walk a fixed priority ladder.

    Order matters: read logs, analyze, apply a targeted config fix for known
    failure signatures, re-run the pipeline/tests, validate, and finally
    submit. Tools already mentioned in the history are skipped so the policy
    keeps making forward progress across steps.
    """
    lower_cfg = (config_text or "").lower()
    lower_err = (error_message or "").lower()
    seen = _extract_seen_tools(history or [])
    # Tools the environment currently allows; defaults to the full vocabulary.
    allowed = {item.strip() for item in (available_actions or REQUIRED_ACTIONS)}

    def has_tool(name: str) -> bool:
        return name in allowed

    # 1) Always start by reading logs once.
    if has_tool("read_logs") and "read_logs" not in seen:
        return "read_logs: inspect failing stage logs"

    # 2) Then analyze the error once.
    if has_tool("analyze_error") and "analyze_error" not in seen:
        return "analyze_error: identify root cause from logs and config"

    # 3) Targeted config fixes for known failure signatures.
    if has_tool("edit_config") and "npm tset" in lower_cfg:
        return "edit_config: replace npm tset with npm test"

    if has_tool("edit_config") and ("yaml" in lower_err or "mapping values are not allowed" in lower_err):
        return "edit_config: fix YAML indentation and syntax"

    if has_tool("edit_config") and ("module not found" in lower_err or "dependency" in lower_err):
        return "edit_config: repair dependency install and test commands"

    # 4) Re-run the pipeline stage / tests to confirm the fix.
    if has_tool("run_pipeline_stage") and "run_pipeline_stage" not in seen:
        return "run_pipeline_stage: run test stage"

    if has_tool("run_tests") and "run_tests" not in seen:
        return "run_tests: execute full pipeline tests"

    # 5) Validate, then submit.
    if has_tool("validate_fix") and "validate_fix" not in seen:
        return "validate_fix: check deterministic, hidden, and quality scores"

    if has_tool("submit_solution"):
        return "submit_solution: submit current configuration"

    # Last resort when nothing above applies (e.g. an empty tool list).
    return "read_logs: inspect failing stage logs and identify root cause"
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _extract_seen_tools(history: list[str]) -> set[str]:
    """Return the canonical tool names mentioned (word-bounded) in any history line."""
    mentioned: set[str] = set()
    for entry in history:
        for tool in REQUIRED_ACTIONS:
            if tool in mentioned:
                continue
            if re.search(rf"\b{re.escape(tool)}\b", entry):
                mentioned.add(tool)
    return mentioned
|
inference/visualize.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def save_reward_curve(rewards: list[float], output_path: str = "artifacts/reward_curve.csv") -> str:
    """Write the per-step reward curve as a CSV (step,reward) and return its path."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    rows = ["step,reward"]
    rows.extend(f"{step},{float(value):.4f}" for step, value in enumerate(rewards, start=1))
    target.write_text("\n".join(rows) + "\n", encoding="utf-8")

    return str(target)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def save_success_rate_history(success_flags: list[bool], output_path: str = "artifacts/success_rate.csv") -> str:
    """Write per-episode success flags plus the running success rate as CSV."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    rows = ["episode,success,success_rate"]
    wins = 0
    for episode, flag in enumerate(success_flags, start=1):
        if flag:
            wins += 1
        rows.append(f"{episode},{int(flag)},{wins / episode:.4f}")
    target.write_text("\n".join(rows) + "\n", encoding="utf-8")

    return str(target)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def save_metrics_json(metrics: dict, output_path: str = "artifacts/metrics.json") -> str:
    """Serialize the metrics dict as pretty, key-sorted JSON and return the path."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    target.write_text(json.dumps(metrics, indent=2, sort_keys=True), encoding="utf-8")

    return str(target)
|
openenv.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenEnv manifest for the CI/CD debugger environment.
version: "0.2"
name: "cicd-debugger-env"
description: "RL environment for CI/CD debugging with deterministic, hidden, and quality-aware scoring"
metadata:
  domain: "devops"
  real_world_task: "ci-cd pipeline debugging"
  deployment: "huggingface-space-docker"

environment:
  # module:class path loaded by the OpenEnv runner.
  entry_point: "env.environment:CICDDebuggerEnvironment"

interface:
  observation_type: "json"
  action_type: "text"
  max_steps: 30

# Tool vocabulary exposed to the agent (same list as REQUIRED_ACTIONS in
# inference/prompts.py).
action_space:
  tools:
    - read_file
    - read_logs
    - analyze_error
    - edit_config
    - run_pipeline_stage
    - run_tests
    - validate_fix
    - submit_solution

# Nine graded tasks across easy/medium/hard tiers; all use the deterministic grader.
tasks:
  - id: "easy-command-typo"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "easy-missing-checkout"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "easy-yaml-indentation"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "medium-python-version"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "medium-cache-key"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "medium-artifact-permissions"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "hard-matrix-logic"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "hard-conditional-deploy"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "hard-needs-order"
    grader: "env.graders.deterministic:DeterministicGrader"
|
pyproject.toml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "cicd-debugger-env"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "OpenEnv CI/CD pipeline debugging environment with hybrid grading and reward shaping"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"pyyaml",
|
| 9 |
+
"fastapi",
|
| 10 |
+
"uvicorn",
|
| 11 |
+
"openenv-core",
|
| 12 |
+
"openai",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
[project.scripts]
|
| 16 |
+
server = "server.app:main"
|
| 17 |
+
|
| 18 |
+
[build-system]
|
| 19 |
+
requires = ["setuptools>=68", "wheel"]
|
| 20 |
+
build-backend = "setuptools.build_meta"
|
| 21 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pyyaml
|
| 2 |
+
fastapi
|
| 3 |
+
uvicorn[standard]
|
| 4 |
+
openenv-core
|
| 5 |
+
openai
|
| 6 |
+
pydantic>=2.0.0
|
| 7 |
+
transformers>=4.30.0
|
| 8 |
+
torch>=2.0.0
|
| 9 |
+
pytest>=7.0.0
|
server/__init__.py
ADDED
|
File without changes
|
server/app.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
import os
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from fastapi import FastAPI
|
| 8 |
+
from fastapi import HTTPException
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
import uvicorn
|
| 11 |
+
|
| 12 |
+
from env.environment import CICDDebuggerEnvironment, MAX_STEPS
|
| 13 |
+
from env.models import Action, Observation, Reward
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
app = FastAPI(title="CI/CD Debugger OpenEnv Server")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ResetRequest(BaseModel):
|
| 20 |
+
task_id: str | None = None
|
| 21 |
+
difficulty: str | None = None
|
| 22 |
+
max_steps: int = Field(default=MAX_STEPS, ge=1, le=100)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class StepRequest(BaseModel):
|
| 26 |
+
action: Action | str | dict[str, Any]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class StepResponse(BaseModel):
|
| 30 |
+
task_id: str
|
| 31 |
+
step_count: int
|
| 32 |
+
reward: float
|
| 33 |
+
reward_model: Reward
|
| 34 |
+
done: bool
|
| 35 |
+
observation: Observation
|
| 36 |
+
last_action: str | None = None
|
| 37 |
+
info: dict[str, Any] = Field(default_factory=dict)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class StateResponse(BaseModel):
|
| 41 |
+
initialized: bool
|
| 42 |
+
task_id: str | None = None
|
| 43 |
+
step_count: int = 0
|
| 44 |
+
done: bool = False
|
| 45 |
+
last_action: str | None = None
|
| 46 |
+
observation: Observation | None = None
|
| 47 |
+
internal_state: dict[str, Any] = Field(default_factory=dict)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@dataclass
|
| 51 |
+
class RuntimeSession:
|
| 52 |
+
env: CICDDebuggerEnvironment
|
| 53 |
+
task_id: str
|
| 54 |
+
step_count: int = 0
|
| 55 |
+
done: bool = False
|
| 56 |
+
last_action: str | None = None
|
| 57 |
+
last_reward: float = 0.0
|
| 58 |
+
last_observation: dict[str, Any] | None = None
|
| 59 |
+
last_info: dict[str, Any] | None = None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
runtime_session: RuntimeSession | None = None
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _as_observation_model(observation: dict[str, Any] | Observation) -> Observation:
|
| 66 |
+
if isinstance(observation, Observation):
|
| 67 |
+
return observation
|
| 68 |
+
return Observation.model_validate(observation)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def _build_step_response(session: RuntimeSession) -> StepResponse:
|
| 72 |
+
observation = session.last_observation or {}
|
| 73 |
+
info_payload = session.last_info or {}
|
| 74 |
+
reward_payload = info_payload.get("reward_model")
|
| 75 |
+
if isinstance(reward_payload, dict):
|
| 76 |
+
reward_model = Reward.model_validate(reward_payload)
|
| 77 |
+
else:
|
| 78 |
+
reward_model = Reward(value=float(session.last_reward), components={"total": float(session.last_reward)})
|
| 79 |
+
|
| 80 |
+
return StepResponse(
|
| 81 |
+
task_id=session.task_id,
|
| 82 |
+
step_count=int(observation.get("step_count") or session.step_count),
|
| 83 |
+
reward=float(session.last_reward),
|
| 84 |
+
reward_model=reward_model,
|
| 85 |
+
done=bool(session.done),
|
| 86 |
+
observation=_as_observation_model(observation),
|
| 87 |
+
last_action=session.last_action,
|
| 88 |
+
info=info_payload,
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@app.get("/")
|
| 93 |
+
def root() -> dict[str, Any]:
|
| 94 |
+
return {
|
| 95 |
+
"message": "CI/CD Debugger Environment is running 🚀",
|
| 96 |
+
"endpoints": ["/health", "/reset", "/step", "/state"],
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@app.get("/health")
|
| 101 |
+
def health() -> dict[str, str]:
|
| 102 |
+
return {"status": "ok"}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@app.post("/reset", response_model=StepResponse)
|
| 106 |
+
async def reset(payload: ResetRequest | None = None) -> StepResponse:
|
| 107 |
+
global runtime_session
|
| 108 |
+
|
| 109 |
+
request = payload or ResetRequest()
|
| 110 |
+
env = CICDDebuggerEnvironment(max_steps=int(request.max_steps))
|
| 111 |
+
observation = await env.reset(task_id=request.task_id, difficulty=request.difficulty)
|
| 112 |
+
|
| 113 |
+
runtime_session = RuntimeSession(
|
| 114 |
+
env=env,
|
| 115 |
+
task_id=str(observation.get("task_id", request.task_id or "cicd-debugger-task")),
|
| 116 |
+
step_count=0,
|
| 117 |
+
done=False,
|
| 118 |
+
last_action=None,
|
| 119 |
+
last_reward=0.0,
|
| 120 |
+
last_observation=observation,
|
| 121 |
+
last_info={
|
| 122 |
+
"message": "environment reset",
|
| 123 |
+
"tool": "reset",
|
| 124 |
+
"error": None,
|
| 125 |
+
"reward_model": Reward(value=0.0, components={"total": 0.0}).model_dump(),
|
| 126 |
+
},
|
| 127 |
+
)
|
| 128 |
+
return _build_step_response(runtime_session)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
@app.post("/step", response_model=StepResponse)
|
| 132 |
+
async def step(payload: StepRequest) -> StepResponse:
|
| 133 |
+
global runtime_session
|
| 134 |
+
|
| 135 |
+
if runtime_session is None:
|
| 136 |
+
raise HTTPException(status_code=400, detail="Environment not initialized. Call /reset first.")
|
| 137 |
+
|
| 138 |
+
if runtime_session.done:
|
| 139 |
+
return _build_step_response(runtime_session)
|
| 140 |
+
|
| 141 |
+
observation, reward, done, info = await runtime_session.env.step(payload.action)
|
| 142 |
+
|
| 143 |
+
runtime_session.step_count = int(observation.get("step_count", runtime_session.step_count + 1))
|
| 144 |
+
runtime_session.done = bool(done)
|
| 145 |
+
runtime_session.last_action = payload.action if isinstance(payload.action, str) else str(payload.action)
|
| 146 |
+
runtime_session.last_reward = float(reward)
|
| 147 |
+
runtime_session.last_observation = observation
|
| 148 |
+
runtime_session.last_info = dict(info or {})
|
| 149 |
+
|
| 150 |
+
return _build_step_response(runtime_session)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
@app.get("/state", response_model=StateResponse)
|
| 154 |
+
async def state() -> StateResponse:
|
| 155 |
+
if runtime_session is None:
|
| 156 |
+
return StateResponse(initialized=False)
|
| 157 |
+
|
| 158 |
+
observation = None
|
| 159 |
+
if runtime_session.last_observation is not None:
|
| 160 |
+
observation = _as_observation_model(runtime_session.last_observation)
|
| 161 |
+
|
| 162 |
+
return StateResponse(
|
| 163 |
+
initialized=True,
|
| 164 |
+
task_id=runtime_session.task_id,
|
| 165 |
+
step_count=runtime_session.step_count,
|
| 166 |
+
done=runtime_session.done,
|
| 167 |
+
last_action=runtime_session.last_action,
|
| 168 |
+
observation=observation,
|
| 169 |
+
internal_state=runtime_session.env.state(),
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
@app.post("/state", response_model=StateResponse)
|
| 174 |
+
async def state_post() -> StateResponse:
|
| 175 |
+
return await state()
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def main() -> None:
|
| 179 |
+
port = int(os.getenv("PORT", "7860"))
|
| 180 |
+
uvicorn.run(app, host="0.0.0.0", port=port)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
if __name__ == "__main__":
|
| 184 |
+
main()
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_day2_engine.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from env.anti_hacking import AntiHackingDetector
|
| 4 |
+
from env.graders.deterministic import DeterministicGrader
|
| 5 |
+
from env.hidden_tests import HiddenTestRunner
|
| 6 |
+
from env.rewards import RewardCalculator
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
EXPECTED_CONFIG = """
|
| 10 |
+
name: CI
|
| 11 |
+
on: [push]
|
| 12 |
+
jobs:
|
| 13 |
+
test:
|
| 14 |
+
runs-on: ubuntu-latest
|
| 15 |
+
steps:
|
| 16 |
+
- uses: actions/checkout@v4
|
| 17 |
+
- run: npm ci
|
| 18 |
+
- run: npm test
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
WRONG_CONFIG = """
|
| 22 |
+
name: CI
|
| 23 |
+
on: [push]
|
| 24 |
+
jobs:
|
| 25 |
+
test:
|
| 26 |
+
runs-on: ubuntu-latest
|
| 27 |
+
steps:
|
| 28 |
+
- uses: actions/checkout@v4
|
| 29 |
+
- run: npm ci
|
| 30 |
+
- run: npm tset
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
BROKEN_YAML = """
|
| 34 |
+
name CI
|
| 35 |
+
jobs:
|
| 36 |
+
test:
|
| 37 |
+
steps
|
| 38 |
+
- run npm test
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class FakeJudge:
|
| 43 |
+
def evaluate_fix(self, original, fixed, error):
|
| 44 |
+
return {
|
| 45 |
+
"correctness": 0.9,
|
| 46 |
+
"minimalism": 0.8,
|
| 47 |
+
"quality": 0.9,
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class Day2EngineTests(unittest.TestCase):
|
| 52 |
+
def setUp(self):
|
| 53 |
+
self.grader = DeterministicGrader()
|
| 54 |
+
self.detector = AntiHackingDetector()
|
| 55 |
+
self.hidden_runner = HiddenTestRunner(grader=self.grader)
|
| 56 |
+
self.reward_calculator = RewardCalculator(
|
| 57 |
+
llm_judge=FakeJudge(),
|
| 58 |
+
anti_hacking_detector=self.detector,
|
| 59 |
+
deterministic_grader=self.grader,
|
| 60 |
+
hidden_test_runner=self.hidden_runner,
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
def test_deterministic_grader_high_for_correct_fix(self):
|
| 64 |
+
score = self.grader.grade(EXPECTED_CONFIG, EXPECTED_CONFIG)
|
| 65 |
+
self.assertGreaterEqual(score, 0.85)
|
| 66 |
+
self.assertLessEqual(score, 1.0)
|
| 67 |
+
|
| 68 |
+
def test_deterministic_grader_low_for_wrong_fix(self):
|
| 69 |
+
score = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
|
| 70 |
+
self.assertLess(score, 0.6)
|
| 71 |
+
|
| 72 |
+
def test_deterministic_grader_penalizes_broken_yaml(self):
|
| 73 |
+
score = self.grader.grade(BROKEN_YAML, EXPECTED_CONFIG)
|
| 74 |
+
self.assertLess(score, 0.4)
|
| 75 |
+
|
| 76 |
+
def test_deterministic_grader_is_reproducible(self):
|
| 77 |
+
first = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
|
| 78 |
+
second = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
|
| 79 |
+
self.assertEqual(first, second)
|
| 80 |
+
|
| 81 |
+
def test_anti_hacking_detects_stage_skipping(self):
|
| 82 |
+
config = EXPECTED_CONFIG + "\n if: false\n"
|
| 83 |
+
penalty = self.detector.penalty_stage_skipping(config)
|
| 84 |
+
self.assertLess(penalty, 0.0)
|
| 85 |
+
|
| 86 |
+
def test_anti_hacking_detects_fake_success(self):
|
| 87 |
+
config = EXPECTED_CONFIG.replace("npm test", 'echo "tests passed" && exit 0')
|
| 88 |
+
penalty = self.detector.penalty_fake_success(config)
|
| 89 |
+
self.assertLess(penalty, 0.0)
|
| 90 |
+
|
| 91 |
+
def test_anti_hacking_detects_breaking_pipeline(self):
|
| 92 |
+
previous_config = """
|
| 93 |
+
stages:
|
| 94 |
+
- build
|
| 95 |
+
- test
|
| 96 |
+
jobs:
|
| 97 |
+
build:
|
| 98 |
+
run: npm ci
|
| 99 |
+
test:
|
| 100 |
+
run: npm test
|
| 101 |
+
"""
|
| 102 |
+
new_config = """
|
| 103 |
+
stages:
|
| 104 |
+
- build
|
| 105 |
+
jobs:
|
| 106 |
+
build:
|
| 107 |
+
run: npm ci
|
| 108 |
+
"""
|
| 109 |
+
penalty = self.detector.penalty_breaking_pipeline(previous_config, new_config)
|
| 110 |
+
self.assertLess(penalty, 0.0)
|
| 111 |
+
|
| 112 |
+
def test_anti_hacking_detects_excessive_edits(self):
|
| 113 |
+
penalty = self.detector.penalty_excessive_edits(changed_files_count=12, changed_lines_count=400)
|
| 114 |
+
self.assertLess(penalty, 0.0)
|
| 115 |
+
|
| 116 |
+
def test_anti_hacking_detects_timeout_abuse(self):
|
| 117 |
+
penalty = self.detector.penalty_timeout_abuse(step_count=25)
|
| 118 |
+
self.assertLess(penalty, 0.0)
|
| 119 |
+
|
| 120 |
+
def test_hidden_tests_returns_high_pass_rate_for_good_fix(self):
|
| 121 |
+
pass_rate = self.hidden_runner.evaluate_fix(
|
| 122 |
+
fixed_config=EXPECTED_CONFIG,
|
| 123 |
+
expected_config=EXPECTED_CONFIG,
|
| 124 |
+
)
|
| 125 |
+
self.assertGreaterEqual(pass_rate, 0.8)
|
| 126 |
+
|
| 127 |
+
def test_hidden_tests_returns_lower_pass_rate_for_bad_fix(self):
|
| 128 |
+
pass_rate = self.hidden_runner.evaluate_fix(
|
| 129 |
+
fixed_config=WRONG_CONFIG,
|
| 130 |
+
expected_config=EXPECTED_CONFIG,
|
| 131 |
+
)
|
| 132 |
+
self.assertLess(pass_rate, 0.8)
|
| 133 |
+
|
| 134 |
+
def test_reward_calculator_positive_for_good_behavior(self):
|
| 135 |
+
state = {
|
| 136 |
+
"step_count": 3,
|
| 137 |
+
"previous_config": WRONG_CONFIG,
|
| 138 |
+
"expected_config": EXPECTED_CONFIG,
|
| 139 |
+
"original_config": WRONG_CONFIG,
|
| 140 |
+
"error": "command not found",
|
| 141 |
+
}
|
| 142 |
+
result = {
|
| 143 |
+
"logs_analyzed": True,
|
| 144 |
+
"error_diagnosed": True,
|
| 145 |
+
"fix_proposed": True,
|
| 146 |
+
"pipeline_run": True,
|
| 147 |
+
"tests_passed": True,
|
| 148 |
+
"command_succeeded": True,
|
| 149 |
+
"current_config": EXPECTED_CONFIG,
|
| 150 |
+
"expected_config": EXPECTED_CONFIG,
|
| 151 |
+
"changed_files_count": 1,
|
| 152 |
+
"changed_lines_count": 1,
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
reward = self.reward_calculator.calculate_step_reward(
|
| 156 |
+
state=state,
|
| 157 |
+
action="validate_fix",
|
| 158 |
+
result=result,
|
| 159 |
+
original_config=WRONG_CONFIG,
|
| 160 |
+
fixed_config=EXPECTED_CONFIG,
|
| 161 |
+
error_message="command not found",
|
| 162 |
+
expected_config=EXPECTED_CONFIG,
|
| 163 |
+
metadata={"broken_token": "npm tset", "fixed_token": "npm test"},
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
self.assertGreater(reward, 0.5)
|
| 167 |
+
|
| 168 |
+
def test_reward_calculator_penalizes_hacking(self):
|
| 169 |
+
hacked_config = EXPECTED_CONFIG.replace("npm test", 'echo "tests passed" && exit 0') + "\n if: false\n"
|
| 170 |
+
state = {
|
| 171 |
+
"step_count": 25,
|
| 172 |
+
"previous_config": EXPECTED_CONFIG,
|
| 173 |
+
"changed_files_count": 15,
|
| 174 |
+
"changed_lines_count": 500,
|
| 175 |
+
}
|
| 176 |
+
result = {
|
| 177 |
+
"current_config": hacked_config,
|
| 178 |
+
"expected_config": EXPECTED_CONFIG,
|
| 179 |
+
"hacking_attempt": True,
|
| 180 |
+
"changed_files_count": 15,
|
| 181 |
+
"changed_lines_count": 500,
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
reward = self.reward_calculator.calculate_step_reward(
|
| 185 |
+
state=state,
|
| 186 |
+
action="edit_config",
|
| 187 |
+
result=result,
|
| 188 |
+
original_config=EXPECTED_CONFIG,
|
| 189 |
+
fixed_config=hacked_config,
|
| 190 |
+
error_message="",
|
| 191 |
+
expected_config=EXPECTED_CONFIG,
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
self.assertGreaterEqual(reward, 0.0)
|
| 195 |
+
self.assertLessEqual(reward, 0.3)
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
if __name__ == "__main__":
|
| 199 |
+
unittest.main()
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import unittest
|
| 3 |
+
|
| 4 |
+
from env.environment import CICDDebuggerEnvironment, REQUIRED_TOOLS
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class EnvironmentContractTests(unittest.TestCase):
|
| 8 |
+
def test_reset_returns_structured_observation(self):
|
| 9 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=7)
|
| 10 |
+
observation = asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 11 |
+
|
| 12 |
+
self.assertIn("config", observation)
|
| 13 |
+
self.assertIn("logs", observation)
|
| 14 |
+
self.assertIn("error_message", observation)
|
| 15 |
+
self.assertIn("progress_flags", observation)
|
| 16 |
+
self.assertEqual(observation["task_id"], "easy-command-typo")
|
| 17 |
+
self.assertEqual(observation["available_tools"], REQUIRED_TOOLS)
|
| 18 |
+
self.assertEqual(observation["step_count"], 0)
|
| 19 |
+
|
| 20 |
+
def test_step_returns_obs_reward_done_info(self):
|
| 21 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=3)
|
| 22 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 23 |
+
|
| 24 |
+
observation, reward, done, info = asyncio.run(env.step("read_logs: inspect failing stage logs"))
|
| 25 |
+
|
| 26 |
+
self.assertIsInstance(observation, dict)
|
| 27 |
+
self.assertIsInstance(reward, float)
|
| 28 |
+
self.assertIsInstance(done, bool)
|
| 29 |
+
self.assertIsInstance(info, dict)
|
| 30 |
+
self.assertIn("tool", info)
|
| 31 |
+
|
| 32 |
+
def test_action_space_rejects_extra_tools(self):
|
| 33 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=5)
|
| 34 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 35 |
+
|
| 36 |
+
observation, reward, done, info = asyncio.run(env.step("propose_fix: force deploy"))
|
| 37 |
+
|
| 38 |
+
self.assertIn("error", info)
|
| 39 |
+
self.assertIsNotNone(info["error"])
|
| 40 |
+
self.assertFalse(done)
|
| 41 |
+
self.assertGreaterEqual(reward, 0.0)
|
| 42 |
+
self.assertIn("config", observation)
|
| 43 |
+
|
| 44 |
+
def test_action_space_rejects_alias_tools(self):
|
| 45 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=15)
|
| 46 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 47 |
+
|
| 48 |
+
_, _, done, info = asyncio.run(env.step("read: workflow file"))
|
| 49 |
+
|
| 50 |
+
self.assertIn("error", info)
|
| 51 |
+
self.assertIsNotNone(info["error"])
|
| 52 |
+
self.assertFalse(done)
|
| 53 |
+
|
| 54 |
+
def test_submit_solution_path(self):
|
| 55 |
+
env = CICDDebuggerEnvironment(max_steps=12, seed=9)
|
| 56 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 57 |
+
|
| 58 |
+
asyncio.run(env.step("read_logs: inspect logs"))
|
| 59 |
+
asyncio.run(env.step("analyze_error: identify root cause"))
|
| 60 |
+
asyncio.run(env.step("edit_config: replace npm tset with npm test"))
|
| 61 |
+
asyncio.run(env.step("run_pipeline_stage: run test stage"))
|
| 62 |
+
asyncio.run(env.step("run_tests: execute tests"))
|
| 63 |
+
asyncio.run(env.step("validate_fix: validate score"))
|
| 64 |
+
observation, reward, done, info = asyncio.run(env.step("submit_solution: submit current fix"))
|
| 65 |
+
|
| 66 |
+
self.assertTrue(done)
|
| 67 |
+
self.assertGreaterEqual(reward, 0.0)
|
| 68 |
+
self.assertIsNone(info.get("error"))
|
| 69 |
+
self.assertEqual(observation["progress_flags"].get("submit_solution"), True)
|
| 70 |
+
|
| 71 |
+
def test_internal_state_tracks_required_fields(self):
|
| 72 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=11)
|
| 73 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 74 |
+
asyncio.run(env.step("read_logs: inspect logs"))
|
| 75 |
+
|
| 76 |
+
state = env.get_state()
|
| 77 |
+
self.assertTrue(state.get("initialized"))
|
| 78 |
+
self.assertIn("actual_bug", state)
|
| 79 |
+
self.assertIn("correct_solution", state)
|
| 80 |
+
self.assertIn("progress_flags", state)
|
| 81 |
+
self.assertIn("file_modification_count", state)
|
| 82 |
+
self.assertIn("hidden_test_pass_rate", state)
|
| 83 |
+
|
| 84 |
+
def test_yaml_task_is_fixable_via_edit_flow(self):
|
| 85 |
+
env = CICDDebuggerEnvironment(max_steps=12, seed=17)
|
| 86 |
+
asyncio.run(env.reset(task_id="easy-yaml-indentation"))
|
| 87 |
+
|
| 88 |
+
asyncio.run(env.step("read_logs: inspect logs"))
|
| 89 |
+
asyncio.run(env.step("analyze_error: identify root cause"))
|
| 90 |
+
observation, _, _, _ = asyncio.run(env.step("edit_config: fix YAML indentation and syntax"))
|
| 91 |
+
|
| 92 |
+
self.assertIn("- run: pytest", observation["config"])
|
| 93 |
+
self.assertNotIn(" - run: pytest", observation["config"])
|
| 94 |
+
|
| 95 |
+
asyncio.run(env.step("run_tests: execute tests"))
|
| 96 |
+
asyncio.run(env.step("validate_fix: validate score"))
|
| 97 |
+
_, _, done, info = asyncio.run(env.step("submit_solution: submit current fix"))
|
| 98 |
+
|
| 99 |
+
self.assertTrue(done)
|
| 100 |
+
self.assertIsNone(info.get("error"))
|
| 101 |
+
|
| 102 |
+
def test_hard_needs_order_edit_updates_deploy_dependency(self):
|
| 103 |
+
env = CICDDebuggerEnvironment(max_steps=12, seed=19)
|
| 104 |
+
asyncio.run(env.reset(task_id="hard-needs-order"))
|
| 105 |
+
|
| 106 |
+
observation, _, _, _ = asyncio.run(env.step("edit_config: fix deploy dependency ordering"))
|
| 107 |
+
|
| 108 |
+
self.assertIn("needs: [build, test]", observation["config"])
|
| 109 |
+
self.assertEqual(observation["config"].count("needs: build"), 1)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
if __name__ == "__main__":
|
| 113 |
+
unittest.main()
|
tests/test_inference.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import unittest
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class InferenceOutputFormatTests(unittest.TestCase):
|
| 10 |
+
def test_inference_prints_required_markers(self):
|
| 11 |
+
project_root = Path(__file__).resolve().parents[1]
|
| 12 |
+
env = os.environ.copy()
|
| 13 |
+
env["OFFLINE_INFERENCE"] = "1"
|
| 14 |
+
|
| 15 |
+
completed = subprocess.run(
|
| 16 |
+
[sys.executable, "inference.py", "--max-steps", "3", "--offline", "--force-local-env"],
|
| 17 |
+
cwd=project_root,
|
| 18 |
+
capture_output=True,
|
| 19 |
+
text=True,
|
| 20 |
+
env=env,
|
| 21 |
+
check=True,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
lines = [line.strip() for line in completed.stdout.splitlines() if line.strip()]
|
| 25 |
+
self.assertGreaterEqual(len(lines), 3)
|
| 26 |
+
self.assertTrue(lines[0].startswith("[START] "))
|
| 27 |
+
self.assertTrue(lines[-1].startswith("[END] "))
|
| 28 |
+
|
| 29 |
+
start_pattern = re.compile(r"^\[START\] task=\S+ env=\S+ model=.+$")
|
| 30 |
+
step_pattern = re.compile(
|
| 31 |
+
r"^\[STEP\] step=\d+ action=.* reward=-?\d+\.\d{2} done=(true|false) error=(null|.+)$"
|
| 32 |
+
)
|
| 33 |
+
end_pattern = re.compile(
|
| 34 |
+
r"^\[END\] success=(true|false) steps=\d+ score=\d+\.\d{3} rewards=(-?\d+\.\d{2}(,-?\d+\.\d{2})*)?$"
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
self.assertRegex(lines[0], start_pattern)
|
| 38 |
+
|
| 39 |
+
step_lines = [line for line in lines if line.startswith("[STEP] ")]
|
| 40 |
+
self.assertTrue(step_lines)
|
| 41 |
+
for line in step_lines:
|
| 42 |
+
self.assertRegex(line, step_pattern)
|
| 43 |
+
|
| 44 |
+
self.assertRegex(lines[-1], end_pattern)
|
| 45 |
+
|
| 46 |
+
for line in lines:
|
| 47 |
+
self.assertTrue(
|
| 48 |
+
line.startswith("[START] ") or line.startswith("[STEP] ") or line.startswith("[END] "),
|
| 49 |
+
f"Unexpected output line: {line}",
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
|
| 54 |
+
unittest.main()
|
tests/test_judge.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from env.graders.llm_judge import LLMJudge
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class FakeModel:
|
| 7 |
+
def __init__(self, payload, raise_error: bool = False):
|
| 8 |
+
self.payload = payload
|
| 9 |
+
self.raise_error = raise_error
|
| 10 |
+
|
| 11 |
+
def __call__(self, prompt, **kwargs):
|
| 12 |
+
if self.raise_error:
|
| 13 |
+
raise RuntimeError("model failure")
|
| 14 |
+
return [{"generated_text": self.payload}]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LLMJudgeTests(unittest.TestCase):
|
| 18 |
+
def test_good_json_scores_are_parsed(self):
|
| 19 |
+
judge = LLMJudge(FakeModel('{"correctness": 1.0, "minimalism": 0.8, "quality": 0.9}'))
|
| 20 |
+
result = judge.evaluate_fix("npm tset", "npm test", "command not found")
|
| 21 |
+
|
| 22 |
+
self.assertGreaterEqual(result["correctness"], 0.9)
|
| 23 |
+
self.assertGreaterEqual(result["minimalism"], 0.7)
|
| 24 |
+
self.assertGreaterEqual(result["quality"], 0.8)
|
| 25 |
+
|
| 26 |
+
def test_regex_fallback_for_noisy_output(self):
|
| 27 |
+
noisy = "Correctness: 0.7\nMinimalism: 0.6\nQuality: 0.75"
|
| 28 |
+
judge = LLMJudge(FakeModel(noisy))
|
| 29 |
+
result = judge.evaluate_fix("a", "b", "err")
|
| 30 |
+
|
| 31 |
+
self.assertAlmostEqual(result["correctness"], 0.7)
|
| 32 |
+
self.assertAlmostEqual(result["minimalism"], 0.6)
|
| 33 |
+
self.assertAlmostEqual(result["quality"], 0.75)
|
| 34 |
+
|
| 35 |
+
def test_partial_fields_default_to_zero(self):
|
| 36 |
+
judge = LLMJudge(FakeModel('{"correctness": 0.8}'))
|
| 37 |
+
result = judge.evaluate_fix("a", "b", "err")
|
| 38 |
+
|
| 39 |
+
self.assertAlmostEqual(result["correctness"], 0.8)
|
| 40 |
+
self.assertAlmostEqual(result["minimalism"], 0.0)
|
| 41 |
+
self.assertAlmostEqual(result["quality"], 0.0)
|
| 42 |
+
|
| 43 |
+
def test_model_failure_returns_zeroes(self):
|
| 44 |
+
judge = LLMJudge(FakeModel("", raise_error=True))
|
| 45 |
+
result = judge.evaluate_fix("a", "b", "err")
|
| 46 |
+
|
| 47 |
+
self.assertEqual(result, {"correctness": 0.0, "minimalism": 0.0, "quality": 0.0})
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
if __name__ == "__main__":
|
| 51 |
+
unittest.main()
|
tests/test_server_api.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from fastapi.testclient import TestClient
|
| 4 |
+
|
| 5 |
+
from server.app import app
|
| 6 |
+
import server.app as server_app
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ServerApiTests(unittest.TestCase):
|
| 10 |
+
def setUp(self):
|
| 11 |
+
server_app.runtime_session = None
|
| 12 |
+
self.client = TestClient(app)
|
| 13 |
+
|
| 14 |
+
def test_health(self):
|
| 15 |
+
response = self.client.get("/health")
|
| 16 |
+
self.assertEqual(response.status_code, 200)
|
| 17 |
+
self.assertEqual(response.json().get("status"), "ok")
|
| 18 |
+
|
| 19 |
+
def test_reset_state_step_flow(self):
|
| 20 |
+
reset_response = self.client.post("/reset", json={})
|
| 21 |
+
self.assertEqual(reset_response.status_code, 200)
|
| 22 |
+
reset_payload = reset_response.json()
|
| 23 |
+
self.assertIn("observation", reset_payload)
|
| 24 |
+
self.assertIn("step_count", reset_payload)
|
| 25 |
+
self.assertEqual(reset_payload["step_count"], 0)
|
| 26 |
+
|
| 27 |
+
state_response = self.client.get("/state")
|
| 28 |
+
self.assertEqual(state_response.status_code, 200)
|
| 29 |
+
state_payload = state_response.json()
|
| 30 |
+
self.assertTrue(state_payload.get("initialized"))
|
| 31 |
+
|
| 32 |
+
step_response = self.client.post(
|
| 33 |
+
"/step",
|
| 34 |
+
json={"action": "edit_config: replace npm tset with npm test"},
|
| 35 |
+
)
|
| 36 |
+
self.assertEqual(step_response.status_code, 200)
|
| 37 |
+
step_payload = step_response.json()
|
| 38 |
+
self.assertIn("reward", step_payload)
|
| 39 |
+
self.assertIn("done", step_payload)
|
| 40 |
+
|
| 41 |
+
def test_step_requires_reset(self):
|
| 42 |
+
server_app.runtime_session = None
|
| 43 |
+
client = TestClient(app)
|
| 44 |
+
response = client.post("/step", json={"action": "read_logs: inspect logs"})
|
| 45 |
+
self.assertEqual(response.status_code, 400)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
if __name__ == "__main__":
|
| 49 |
+
unittest.main()
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
validate-submission.sh
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh - OpenEnv Submission Validator
|
| 4 |
+
#
|
| 5 |
+
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
|
| 6 |
+
#
|
| 7 |
+
# Prerequisites:
|
| 8 |
+
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
+
# - openenv-core: pip install openenv-core
|
| 10 |
+
# - curl (usually pre-installed)
|
| 11 |
+
#
|
| 12 |
+
# Run:
|
| 13 |
+
#   curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
|
| 14 |
+
#
|
| 15 |
+
# Or download and run locally:
|
| 16 |
+
# chmod +x validate-submission.sh
|
| 17 |
+
# ./validate-submission.sh <ping_url> [repo_dir]
|
| 18 |
+
#
|
| 19 |
+
# Arguments:
|
| 20 |
+
# ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
|
| 21 |
+
# repo_dir Path to your repo (default: current directory)
|
| 22 |
+
#
|
| 23 |
+
# Examples:
|
| 24 |
+
# ./validate-submission.sh https://my-team.hf.space
|
| 25 |
+
# ./validate-submission.sh https://my-team.hf.space ./my-repo
|
| 26 |
+
#
|
| 27 |
+
|
| 28 |
+
set -uo pipefail

# Hard cap (seconds) on how long `docker build` may run in Step 2.
DOCKER_BUILD_TIMEOUT=600

# Enable ANSI colors only when stdout is attached to a terminal.
if [ -t 1 ]; then
  RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
  BOLD='\033[1m'; NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BOLD=''
  NC=''
fi
| 40 |
+
|
| 41 |
+
# run_with_timeout SECS CMD [ARGS...] — run CMD, aborting it after SECS.
# Prefers coreutils timeout/gtimeout; otherwise emulates with a watcher job.
# Returns CMD's exit status (or the kill status if the deadline fired).
run_with_timeout() {
  local limit="$1"
  shift
  if command -v timeout &>/dev/null; then
    timeout "$limit" "$@"
    return
  fi
  if command -v gtimeout &>/dev/null; then
    gtimeout "$limit" "$@"
    return
  fi
  # Fallback: run in the background and kill it once the deadline passes.
  "$@" &
  local job=$!
  ( sleep "$limit" && kill "$job" 2>/dev/null ) &
  local sentinel=$!
  wait "$job" 2>/dev/null
  local status=$?
  kill "$sentinel" 2>/dev/null
  wait "$sentinel" 2>/dev/null
  return $status
}
|
| 59 |
+
|
| 60 |
+
# portable_mktemp [PREFIX] — create a temp file under $TMPDIR (default /tmp)
# with a recognizable prefix; fall back to a bare mktemp if the templated
# form is unsupported (e.g. some BSD variants).
portable_mktemp() {
  local tag="${1:-validate}"
  mktemp "${TMPDIR:-/tmp}/${tag}-XXXXXX" 2>/dev/null || mktemp
}
|
| 64 |
+
|
| 65 |
+
# Temp files registered in this array are removed when the script exits.
CLEANUP_FILES=()
# The ${arr[@]+...} guard avoids an "unbound variable" error under `set -u`
# when the array is empty (required for bash < 4.4 array semantics).
cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
trap cleanup EXIT
|
| 68 |
+
|
| 69 |
+
# --- Argument handling --------------------------------------------------

# Print CLI usage to stdout (shown when no ping URL is supplied).
usage() {
  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
  printf "\n"
  printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
  printf " repo_dir Path to your repo (default: current directory)\n"
}

PING_URL="${1:-}"
REPO_DIR="${2:-.}"

if [ -z "$PING_URL" ]; then
  usage
  exit 1
fi

# Canonicalize the repo path; bail out if the directory does not exist.
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
  printf "Error: directory '%s' not found\n" "${2:-.}"
  exit 1
fi

PING_URL="${PING_URL%/}"   # tolerate a trailing slash in the URL
export PING_URL
PASS=0                     # number of checks passed so far
|
| 87 |
+
|
| 88 |
+
# Output helpers. All honor the color variables set at the top of the script.

# log MSG... — timestamped (UTC) line; %b expands escapes embedded in MSG.
log() {
  printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"
}

# pass MSG — report a successful check and bump the pass counter.
pass() {
  log "${GREEN}PASSED${NC} -- $1"
  PASS=$((PASS + 1))
}

# fail MSG — report a failed check (does not exit; see stop_at).
fail() {
  log "${RED}FAILED${NC} -- $1"
}

# hint MSG — indented advice line shown after a failure.
hint() {
  printf " ${YELLOW}Hint:${NC} %b\n" "$1"
}

# stop_at STEP — abort the whole validation run after a hard failure.
stop_at() {
  printf "\n"
  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
  exit 1
}
|
| 97 |
+
|
| 98 |
+
printf "\n"
printf "${BOLD}========================================${NC}\n"
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
printf "${BOLD}========================================${NC}\n"
log "Repo: $REPO_DIR"
log "Ping URL: $PING_URL"
printf "\n"

log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")
# Capture only the HTTP status via -w; the response body goes to the temp
# file and curl's own diagnostics are discarded (previously stderr was
# redirected into the same file as -o, clobbering the body).
# On a transport failure (DNS, refused, timeout) force "000": curl's -w
# already prints "000" in that case, so the old `|| printf "000"` appended
# a second copy, yielding "000000" and sending us down the wrong branch.
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  # Single %: hint passes this string as a printf *argument* (%b), so no
  # %%-escaping is needed — the old text showed users a broken curl flag.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
|
| 127 |
+
|
| 128 |
+
log "${BOLD}Step 2/3: Running docker build${NC} ..."

# Docker itself must be installed before we can build anything.
if ! command -v docker &>/dev/null; then
  fail "docker command not found"
  hint "Install Docker: https://docs.docker.com/get-docker/"
  stop_at "Step 2"
fi

# Accept a Dockerfile either at the repo root or under server/.
DOCKER_CONTEXT=""
for candidate in "$REPO_DIR" "$REPO_DIR/server"; do
  if [ -f "$candidate/Dockerfile" ]; then
    DOCKER_CONTEXT="$candidate"
    break
  fi
done
if [ -z "$DOCKER_CONTEXT" ]; then
  fail "No Dockerfile found in repo root or server/ directory"
  stop_at "Step 2"
fi

log " Found Dockerfile in $DOCKER_CONTEXT"

# Build with a hard timeout; keep combined output for failure diagnostics.
BUILD_OK=false
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true

if [ "$BUILD_OK" = true ]; then
  pass "Docker build succeeded"
else
  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
  printf "%s\n" "$BUILD_OUTPUT" | tail -20
  stop_at "Step 2"
fi
|
| 157 |
+
|
| 158 |
+
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."

# Prefer the `openenv` CLI if it is on PATH; otherwise fall back to invoking
# the CLI module with the repo's venv python (or system python3).
VALIDATE_OK=false
if command -v openenv &>/dev/null; then
  VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
else
  PY_VALIDATE="python3"
  if [ -x "$REPO_DIR/.venv/bin/python" ]; then
    PY_VALIDATE="$REPO_DIR/.venv/bin/python"
  fi
  # NOTE(review): assumes openenv-core exposes `openenv.cli.__main__` as a
  # runnable module — confirm against the installed openenv-core version.
  VALIDATE_OUTPUT=$(cd "$REPO_DIR" && "$PY_VALIDATE" -m openenv.cli.__main__ validate 2>&1) && VALIDATE_OK=true
fi

if [ "$VALIDATE_OK" = true ]; then
  pass "openenv validate passed"
  # Script runs without `set -e`, so a false test here does not abort.
  [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
else
  fail "openenv validate failed"
  printf "%s\n" "$VALIDATE_OUTPUT"
  stop_at "Step 3"
fi
|
| 179 |
+
|
| 180 |
+
# Final summary banner — reached only when all three steps passed.
printf "\n"
printf "%b\n" "${BOLD}========================================${NC}"
printf "%b\n" "${GREEN}${BOLD} All 3/3 checks passed!${NC}"
printf "%b\n" "${GREEN}${BOLD} Your submission is ready to submit.${NC}"
printf "%b\n" "${BOLD}========================================${NC}"
printf "\n"

exit 0
|