Lishika committed on
Commit
30bf68a
·
0 Parent(s):

clean final submission

Browse files
.dockerignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git/
2
+ .venv/
3
+ .pytest_cache/
4
+ __pycache__/
5
+ *.py[cod]
6
+ *.log
7
+ .env
8
+ artifacts/
9
+ tests/
10
+ uv.lock
11
+ migrated_from_cicd-debugger-env-2/
.env.example ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Default profile: Hugging Face Router (OpenAI-compatible API)
2
+ API_BASE_URL=https://router.huggingface.co/v1
3
+ MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
4
+ HF_TOKEN=<your_openai_or_router_api_key>
5
+
6
+ # Optional alias. If both are set, OPENAI_API_KEY is used first by inference.py.
7
+ OPENAI_API_KEY=
8
+
9
+ # OpenAI direct profile (uncomment for OpenAI access token usage):
10
+ # API_BASE_URL=https://api.openai.com/v1
11
+ # MODEL_NAME=gpt-4o-mini
12
+ # HF_TOKEN=<your_openai_access_token>
13
+ # OPENAI_API_KEY=<optional_same_token_as_hf_token>
14
+
15
+ # Optional runtime knobs
16
+ LOCAL_IMAGE_NAME=
17
+ MY_ENV_V4_TASK=easy-command-typo
18
+ MY_ENV_V4_BENCHMARK=cicd_debugger_env
19
+ MAX_STEPS=8
20
+ TEMPERATURE=0.2
21
+ MAX_TOKENS=120
22
+ SUCCESS_SCORE_THRESHOLD=0.1
23
+ OFFLINE_INFERENCE=0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python bytecode and cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual environments
7
+ # Keep source folder env/ tracked; only ignore venv directories.
8
+ .venv/
9
+ venv/
10
+ ENV/
11
+ env.bak/
12
+ venv.bak/
13
+
14
+ # Build and packaging artifacts
15
+ build/
16
+ dist/
17
+ .eggs/
18
+ *.egg-info/
19
+ *.egg
20
+ pip-wheel-metadata/
21
+
22
+ # Testing, typing, linting caches
23
+ .pytest_cache/
24
+ .mypy_cache/
25
+ .ruff_cache/
26
+ .pyre/
27
+ .pytype/
28
+ .hypothesis/
29
+ .tox/
30
+ .nox/
31
+ .coverage
32
+ .coverage.*
33
+ htmlcov/
34
+
35
+ # Jupyter
36
+ .ipynb_checkpoints/
37
+
38
+ # Logs and temp files
39
+ *.log
40
+ *.out
41
+ *.err
42
+ tmp/
43
+ temp/
44
+
45
+ # Local environment and secrets
46
+ .env
47
+ .env.*
48
+ !.env.example
49
+
50
+ # IDE/editor and OS files
51
+ .vscode/
52
+ .idea/
53
+ *.swp
54
+ *.swo
55
+ .DS_Store
56
+ Thumbs.db
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt ./
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY . .
9
+
10
+ ENV PORT=7860
11
+ EXPOSE 7860
12
+
13
+ CMD ["python", "-m", "server.app"]
README.md ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: CICD_DEBUGGER
3
+ colorFrom: blue
4
+ colorTo: green
5
+ sdk: docker
6
+ app_port: 7860
7
+ pinned: false
8
+ tags:
9
+ - openenv
10
+ ---
11
+
12
+ # CI/CD Pipeline Debugger Environment (OpenEnv)
13
+
14
+ ## 1. Project Goal
15
+
16
+ This repository implements an AI training and evaluation environment where an agent learns to debug broken CI/CD pipelines automatically.
17
+
18
+ The environment targets real-world DevOps failure patterns, including:
19
+
20
+ - YAML syntax and structure issues
21
+ - Incorrect build/test commands (for example, npm tset -> npm test)
22
+ - Dependency and setup failures
23
+ - Multi-stage pipeline execution errors
24
+
25
+ This is designed as an RL-style interaction loop:
26
+
27
+ Observe -> Think -> Act -> Get Reward -> Repeat
28
+
29
+ ## 2. Why This Matters
30
+
31
+ CI/CD failures are common, repetitive, and often multi-step to resolve. This project turns that workflow into a structured learning environment where agents:
32
+
33
+ - Read failure context
34
+ - Reason about root causes
35
+ - Propose and apply fixes
36
+ - Get shaped rewards for robust behavior
37
+
38
+ ## 3. System Architecture
39
+
40
+ High-level flow:
41
+
42
+ Agent (LLM) -> Action -> Environment.step() -> Reward/Evaluation -> Next step
43
+
44
+ Core integration path:
45
+
46
+ Model -> Action -> Environment.step() -> RewardCalculator
47
+
48
+ RewardCalculator integrates:
49
+
50
+ - DeterministicGrader
51
+ - LLMJudge
52
+ - HiddenTestRunner
53
+ - AntiHackingDetector
54
+
55
+ ### 3.1 OpenEnv Interface (Typed)
56
+
57
+ Typed Pydantic models are defined in `env/models.py`:
58
+
59
+ - `Observation`: strict schema for environment observations
60
+ - `Action`: normalized tool + payload action schema
61
+ - `Reward`: bounded reward model with components
62
+
63
+ Environment contract:
64
+
65
+ - `reset()` returns the initial `Observation` payload
66
+ - `step(action)` returns `(observation, reward, done, info)`
67
+ - `state()` returns current environment state snapshot
68
+
69
+ Server/API contract models are exposed in `server/app.py` and use the same typed observation/action/reward structures.
70
+
71
+ ### 3.2 Action and Observation Spaces
72
+
73
+ Observation fields include:
74
+
75
+ - `task_id`, `difficulty`, `failure_stage`, `actual_bug`
76
+ - `config`, `logs`, `error_message`
77
+ - `available_tools`, `progress_flags`
78
+ - `file_modification_count`, `hidden_test_pass_rate`, `step_count`, `last_action_error`
79
+
80
+ Action schema:
81
+
82
+ - `tool`: one of `read_file`, `read_logs`, `analyze_error`, `edit_config`, `run_pipeline_stage`, `run_tests`, `validate_fix`, `submit_solution`
83
+ - `payload`: optional dict (for example `{ "raw": "replace npm tset with npm test" }`)
84
+
85
+ Reward schema:
86
+
87
+ - `value`: bounded float in `[0.0, 1.0]`
88
+ - `components`: reward breakdown dictionary
89
+
90
+ ## 4. Core Modules
91
+
92
+ ### 4.1 Quality Judge
93
+
94
+ - File: env/graders/llm_judge.py
95
+ - Purpose: quality-aware scoring of fixes
96
+ - Output keys: correctness, minimalism, quality (all in [0,1])
97
+ - Guarantees:
98
+ - strict JSON parsing attempt
99
+ - robust fallback parsing for messy output
100
+ - no-crash behavior (safe zero scores on failure)
101
+
102
+ ### 4.2 Deterministic Grader
103
+
104
+ - File: env/graders/deterministic.py
105
+ - Purpose: reproducible correctness scoring (0-1)
106
+ - Checks:
107
+ - YAML validity
108
+ - command and fix correctness
109
+ - similarity and issue resolution
110
+ - Rules:
111
+ - deterministic only
112
+ - same input, same score
113
+
114
+ ### 4.3 Anti-Hacking Detector
115
+
116
+ - File: env/anti_hacking.py
117
+ - Purpose: detect reward-hacking and shortcut behavior
118
+ - Penalty detectors:
119
+ - stage skipping (if: false, when: never)
120
+ - fake success (echo tests passed, unsafe exit 0 patterns)
121
+ - pipeline breakage between versions
122
+ - excessive edits
123
+ - timeout abuse via too many steps
124
+
125
+ ### 4.4 Hidden Tests
126
+
127
+ - File: env/hidden_tests.py
128
+ - Purpose: test fix robustness, not just exact-match overfitting
129
+ - Method:
130
+ - deterministic variant generation (OS, versions, env shifts)
131
+ - evaluate pass rate across variants
132
+
133
+ ### 4.5 Reward Shaping
134
+
135
+ - File: env/rewards.py
136
+ - Purpose: step-level learning signal
137
+ - Components:
138
+ - progress rewards (logs, analysis, fix proposal)
139
+ - execution rewards (pipeline run, tests pass)
140
+ - quality rewards (deterministic + hidden tests + LLM judge)
141
+ - anti-hacking penalties
142
+
143
+ ## 5. Inference and Evaluation
144
+
145
+ ### 5.1 Prompt and Model Layers
146
+
147
+ - inference/prompts.py: stable prompt templates and fallback action heuristics
148
+ - inference/model_wrapper.py: OpenAI client action generation, candidate generation, and safe fallback
149
+
150
+ Canonical action tools used by environment and inference:
151
+
152
+ - read_file
153
+ - read_logs
154
+ - analyze_error
155
+ - edit_config
156
+ - run_pipeline_stage
157
+ - run_tests
158
+ - validate_fix
159
+ - submit_solution
160
+
161
+ ### 5.2 Metrics and Artifacts
162
+
163
+ - inference/metrics.py: reward, success-rate, and failure reason tracking
164
+ - inference/visualize.py: reward curve and metrics artifact export
165
+
166
+ ### 5.3 Submission-Critical Runtime
167
+
168
+ - File: inference.py (root)
169
+ - Responsibilities:
170
+ - initialize model and environment
171
+ - run step loop
172
+ - calculate rewards
173
+ - emit strict stdout contract
174
+ - always emit END line
175
+
176
+ Required output format:
177
+
178
+ - [START] task=... env=... model=...
179
+ - [STEP] step=<n> action=... reward=0.00 done=<true|false> error=<msg|null>
180
+ - [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
181
+
182
+ Rules enforced:
183
+
184
+ - single-line logs only
185
+ - reward values with 2 decimals
186
+ - lowercase booleans
187
+ - no extra runtime log noise
188
+
189
+ ## 6. Task Coverage
190
+
191
+ The project includes 9 CI-fix tasks spanning:
192
+
193
+ - easy: syntax and typo fixes
194
+ - medium: dependency/env/cache/permissions issues
195
+ - hard: matrix logic, conditional flow, orchestration-level failures
196
+
197
+ Representative baseline tasks (one per difficulty):
198
+
199
+ - easy: `easy-command-typo` (fix invalid `npm tset` command)
200
+ - medium: `medium-python-version` (align workflow Python version)
201
+ - hard: `hard-needs-order` (repair deploy job dependency ordering)
202
+
203
+ ## 7. Setup
204
+
205
+ ```bash
206
+ python3 -m venv .venv
207
+ source .venv/bin/activate
208
+ pip install -r requirements.txt
209
+ ```
210
+
211
+ Environment variables:
212
+
213
+ ```bash
214
+ export API_BASE_URL="https://router.huggingface.co/v1"
215
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
216
+ export HF_TOKEN="<your_openai_compatible_api_key>"
217
+ # Optional alias; if set, this takes precedence over HF_TOKEN in inference.py
218
+ export OPENAI_API_KEY="<same_token_optional>"
219
+ # Optional, only if your inference spins environments from local images.
220
+ export LOCAL_IMAGE_NAME="<local_env_image_name>"
221
+ ```
222
+
223
+ If you want to use an OpenAI access token directly:
224
+
225
+ ```bash
226
+ export API_BASE_URL="https://api.openai.com/v1"
227
+ export MODEL_NAME="gpt-4o-mini"
228
+ export HF_TOKEN="<your_openai_access_token>"
229
+ # Optional alias:
230
+ export OPENAI_API_KEY="<same_token_optional>"
231
+ ```
232
+
233
+ ## 8. Run Inference
234
+
235
+ Offline/local mode:
236
+
237
+ ```bash
238
+ python inference.py --offline --force-local-env --max-steps 8 --policy-mode imp --trajectories 4
239
+ ```
240
+
241
+ Model-backed mode:
242
+
243
+ ```bash
244
+ python inference.py --max-steps 8 --policy-mode imp --trajectories 4
245
+ ```
246
+
247
+ Run baseline across easy/medium/hard tasks:
248
+
249
+ OpenAI client mode:
250
+
251
+ ```bash
252
+ OPENAI_API_KEY="<your_openai_compatible_api_key>" python baseline_inference.py --max-steps 5 --policy-mode imp --trajectories 3 --force-local-env
253
+ ```
254
+
255
+ Offline reproducible mode:
256
+
257
+ ```bash
258
+ python baseline_inference.py --max-steps 5 --policy-mode imp --trajectories 3 --offline --force-local-env
259
+ ```
260
+
261
+ Policy modes:
262
+
263
+ - sft: deterministic heuristic policy
264
+ - direct: single model action per step
265
+ - imp: multi-candidate generation and ranking
266
+
267
+ ## 9. Baseline Scores
268
+
269
+ Reproducible baseline artifact:
270
+
271
+ - `artifacts/baseline_scores.json`
272
+
273
+ Latest baseline run (`max_steps=5`, `policy_mode=imp`, `trajectories=3`):
274
+
275
+ | Task ID | Difficulty | Score | Success |
276
+ |---|---|---:|---:|
277
+ | easy-command-typo | easy | 0.541 | false |
278
+ | medium-python-version | medium | 0.679 | false |
279
+ | hard-needs-order | hard | 0.513 | false |
280
+
281
+ Aggregate:
282
+
283
+ - average score: `0.578`
284
+ - success rate: `0.000`
285
+
286
+ When `OPENAI_API_KEY` is provided, the same script runs with the OpenAI API client path in `inference.py`.
287
+
288
+ ## 10. Tests
289
+
290
+ Run all tests:
291
+
292
+ ```bash
293
+ python -m unittest discover -s tests -v
294
+ ```
295
+
296
+ Coverage includes:
297
+
298
+ - LLM judge
299
+ - deterministic grader
300
+ - anti-hacking detectors
301
+ - hidden tests
302
+ - reward system
303
+ - end-to-end inference output format
304
+
305
+ ## 11. Validation and Submission
306
+
307
+ OpenEnv validation:
308
+
309
+ ```bash
310
+ python -m openenv.cli.__main__ validate
311
+ ```
312
+
313
+ Pre-submission script:
314
+
315
+ ```bash
316
+ ./validate-submission.sh <your_hf_space_url>
317
+ ```
318
+
319
+ Required environment variables:
320
+
321
+ ```bash
322
+ export API_BASE_URL="https://router.huggingface.co/v1"
323
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
324
+ export OPENAI_API_KEY="<your_openai_compatible_api_key>"
325
+ # Optional fallback:
326
+ export HF_TOKEN="<your_token>"
327
+ ```
328
+
329
+ Docker run (Space/API mode):
330
+
331
+ ```bash
332
+ docker build -t cicd-debugger-env .
333
+ docker run --rm -p 7860:7860 cicd-debugger-env
334
+ ```
335
+
336
+ Server endpoints used by validators:
337
+
338
+ - `POST /reset`
339
+ - `POST /step`
340
+ - `GET /state`
341
+ - `GET /health`
342
+
343
+ ## 12. Deploy to Hugging Face Space (OpenAI Token)
344
+
345
+ This repository is already configured for Docker Spaces (`sdk: docker` in this README front matter).
346
+
347
+ 1. Create a new Hugging Face Space with SDK set to `Docker`.
348
+ 2. Push this repository to the Space git remote.
349
+ 3. In Space Settings -> Variables and secrets, add these Secrets:
350
+
351
+ ```text
352
+ OPENAI_API_KEY=<your_openai_access_token>
353
+ API_BASE_URL=https://api.openai.com/v1
354
+ MODEL_NAME=gpt-4o-mini
355
+ ```
356
+
357
+ 4. Optional Secrets:
358
+
359
+ ```text
360
+ HF_TOKEN=<optional_fallback_token>
361
+ OFFLINE_INFERENCE=0
362
+ MAX_STEPS=8
363
+ TEMPERATURE=0.2
364
+ MAX_TOKENS=120
365
+ ```
366
+
367
+ 5. Keep the app port as `7860` (already configured).
368
+ 6. Wait for build completion, then verify:
369
+
370
+ ```bash
371
+ curl -sS https://<your-space-name>.hf.space/health
372
+ curl -sS -X POST https://<your-space-name>.hf.space/reset -H 'Content-Type: application/json' -d '{}'
373
+ ```
374
+
375
+ Notes:
376
+
377
+ - `.env.example` is for local development reference only. Hugging Face Spaces use Secrets/Variables from Space Settings.
378
+ - Runtime code reads `OPENAI_API_KEY` first and falls back to `HF_TOKEN` when `OPENAI_API_KEY` is not provided.
379
+
380
+ ## 13. One-line Presentation Summary
381
+
382
+ We built an OpenEnv-compliant reinforcement learning environment where AI agents learn to debug real CI/CD pipelines using multi-step reasoning, hybrid grading, anti-hacking safeguards, and robust reward shaping.
artifacts/baseline_scores.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "offline",
3
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
4
+ "api_base_url": "https://router.huggingface.co/v1",
5
+ "max_steps": 5,
6
+ "policy_mode": "imp",
7
+ "trajectories": 3,
8
+ "average_score": 0.578,
9
+ "success_rate": 0.0,
10
+ "results": [
11
+ {
12
+ "task_id": "easy-command-typo",
13
+ "difficulty": "easy",
14
+ "success": false,
15
+ "steps": 5,
16
+ "score": 0.541,
17
+ "rewards": [
18
+ 0.3,
19
+ 0.36,
20
+ 0.57,
21
+ 0.63,
22
+ 0.84
23
+ ],
24
+ "start_line": "[START] task=easy-command-typo env=cicd_debugger_env model=Qwen/Qwen2.5-72B-Instruct",
25
+ "end_line": "[END] success=false steps=5 score=0.541 rewards=0.30,0.36,0.57,0.63,0.84"
26
+ },
27
+ {
28
+ "task_id": "medium-python-version",
29
+ "difficulty": "medium",
30
+ "success": false,
31
+ "steps": 5,
32
+ "score": 0.679,
33
+ "rewards": [
34
+ 0.48,
35
+ 0.54,
36
+ 0.58,
37
+ 0.79,
38
+ 1.0
39
+ ],
40
+ "start_line": "[START] task=medium-python-version env=cicd_debugger_env model=Qwen/Qwen2.5-72B-Instruct",
41
+ "end_line": "[END] success=false steps=5 score=0.679 rewards=0.48,0.54,0.58,0.79,1.00"
42
+ },
43
+ {
44
+ "task_id": "hard-needs-order",
45
+ "difficulty": "hard",
46
+ "success": false,
47
+ "steps": 5,
48
+ "score": 0.513,
49
+ "rewards": [
50
+ 0.48,
51
+ 0.54,
52
+ 0.52,
53
+ 0.57,
54
+ 0.46
55
+ ],
56
+ "start_line": "[START] task=hard-needs-order env=cicd_debugger_env model=Qwen/Qwen2.5-72B-Instruct",
57
+ "end_line": "[END] success=false steps=5 score=0.513 rewards=0.48,0.54,0.52,0.57,0.46"
58
+ }
59
+ ]
60
+ }
artifacts/metrics.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_reward": 0.6719,
3
+ "failure_reasons": {},
4
+ "steps": 7,
5
+ "success_rate": 0.1429,
6
+ "total_reward": 4.7032
7
+ }
artifacts/reward_curve.csv ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ step,reward
2
+ 1,0.3016
3
+ 2,0.3616
4
+ 3,0.5700
5
+ 4,0.6300
6
+ 5,0.8400
7
+ 6,1.0000
8
+ 7,1.0000
artifacts/success_rate.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ episode,success,success_rate
2
+ 1,1,1.0000
baseline_inference.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import re
8
+ import subprocess
9
+ import sys
10
+ from typing import Any
11
+
12
+
13
BASELINE_TASKS: list[tuple[str, str]] = [
    ("easy-command-typo", "easy"),
    ("medium-python-version", "medium"),
    ("hard-needs-order", "hard"),
]

# Strict contract for the final [END] line emitted by inference.py:
# lowercase booleans, integer step count, 3-decimal score, comma-joined rewards.
END_PATTERN = re.compile(
    r"^\[END\] success=(true|false) steps=(\d+) score=(\d+\.\d{3}) rewards=(.*)$"
)


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse baseline-runner CLI options.

    Args:
        argv: Optional explicit argument list (defaults to ``sys.argv[1:]``);
            accepting it keeps the parser testable without touching sys.argv.

    Returns:
        Parsed namespace with tasks, step budget, policy mode, and output path.
    """
    parser = argparse.ArgumentParser(description="Run baseline inference on easy/medium/hard tasks")
    parser.add_argument("--tasks", default=",".join(task for task, _ in BASELINE_TASKS))
    parser.add_argument("--max-steps", type=int, default=int(os.getenv("MAX_STEPS", "8")))
    parser.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
    parser.add_argument("--trajectories", type=int, default=3)
    parser.add_argument("--benchmark", default=os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env"))
    parser.add_argument("--offline", action="store_true", default=False)
    # BUG FIX: the original used action="store_true" with default=True, which
    # made this flag a no-op (always True, impossible to disable from the CLI).
    # BooleanOptionalAction keeps --force-local-env working unchanged and adds
    # --no-force-local-env to turn it off.
    parser.add_argument(
        "--force-local-env",
        action=argparse.BooleanOptionalAction,
        default=True,
    )
    parser.add_argument("--output", default="artifacts/baseline_scores.json")
    return parser.parse_args(argv)


def should_run_offline(args: argparse.Namespace) -> bool:
    """Decide whether the baseline must run without a model API.

    Offline when explicitly requested, when no API credential is available,
    or when OFFLINE_INFERENCE=1 is set in the environment.
    """
    if args.offline:
        return True

    # OPENAI_API_KEY takes precedence over HF_TOKEN, matching inference.py.
    key = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN")
    if not key:
        return True

    return os.getenv("OFFLINE_INFERENCE", "0") == "1"


def parse_end_line(lines: list[str]) -> dict[str, Any]:
    """Extract the run summary from the last [END] line of inference output.

    Args:
        lines: Non-blank stdout lines captured from an inference.py subprocess.

    Returns:
        Dict with success flag, step count, score, reward list, and raw line.

    Raises:
        RuntimeError: If no [END] line exists or the last one is malformed.
    """
    for raw_line in reversed(lines):
        line = raw_line.strip()
        if not line.startswith("[END] "):
            continue

        matched = END_PATTERN.match(line)
        if not matched:
            raise RuntimeError(f"Malformed END line: {line}")

        rewards_field = matched.group(4).strip()
        rewards: list[float] = []
        if rewards_field:
            rewards = [float(value) for value in rewards_field.split(",") if value]

        return {
            "success": matched.group(1) == "true",
            "steps": int(matched.group(2)),
            "score": float(matched.group(3)),
            "rewards": rewards,
            "end_line": line,
        }

    raise RuntimeError("No END line found in inference output")
76
+
77
+
78
def run_single_task(
    task_id: str,
    difficulty: str,
    args: argparse.Namespace,
    project_root: Path,
    offline_mode: bool,
) -> dict[str, Any]:
    """Run inference.py once for one task and return its parsed summary row.

    Raises subprocess.CalledProcessError when the child exits non-zero
    (check=True), and RuntimeError when its [END] line is missing/malformed.
    """
    # Build the child-process command line from the shared CLI settings.
    cmd = [
        sys.executable,
        "inference.py",
        "--task", task_id,
        "--benchmark", str(args.benchmark),
        "--max-steps", str(max(1, int(args.max_steps))),
        "--policy-mode", str(args.policy_mode),
        "--trajectories", str(max(1, int(args.trajectories))),
    ]
    if offline_mode:
        cmd.append("--offline")
    if args.force_local_env:
        cmd.append("--force-local-env")

    child_env = os.environ.copy()
    if offline_mode:
        # Force the child offline even if its own credential detection differs.
        child_env["OFFLINE_INFERENCE"] = "1"

    completed = subprocess.run(
        cmd,
        cwd=project_root,
        capture_output=True,
        text=True,
        env=child_env,
        check=True,  # surface failures as CalledProcessError for the caller
    )

    stdout_lines = [entry for entry in completed.stdout.splitlines() if entry.strip()]
    summary = parse_end_line(stdout_lines)
    start_line = next((entry for entry in stdout_lines if entry.startswith("[START] ")), "")

    return {
        "task_id": task_id,
        "difficulty": difficulty,
        "success": summary["success"],
        "steps": summary["steps"],
        "score": summary["score"],
        "rewards": summary["rewards"],
        "start_line": start_line,
        "end_line": summary["end_line"],
    }
131
+
132
+
133
def main() -> int:
    """Run the baseline task suite and write artifacts/baseline_scores.json.

    Returns a process exit code: 0 on success, non-zero when no tasks are
    given or any per-task inference run fails.
    """
    args = parse_args()
    project_root = Path(__file__).resolve().parent

    difficulty_by_task = dict(BASELINE_TASKS)
    selected = [name.strip() for name in str(args.tasks).split(",") if name.strip()]

    if not selected:
        print("No tasks provided for baseline run", file=sys.stderr)
        return 1

    offline_mode = should_run_offline(args)

    print(
        f"[BASELINE] mode={'offline' if offline_mode else 'openai'} tasks={len(selected)} "
        f"max_steps={max(1, int(args.max_steps))} policy={args.policy_mode}",
        flush=True,
    )

    collected: list[dict[str, Any]] = []
    for task_name in selected:
        task_difficulty = difficulty_by_task.get(task_name, "custom")
        try:
            row = run_single_task(task_name, task_difficulty, args, project_root, offline_mode)
        except subprocess.CalledProcessError as exc:
            # Child run failed: forward its captured output for debugging.
            print(f"[BASELINE] task={task_name} failed with return code {exc.returncode}", file=sys.stderr)
            if exc.stdout:
                print(exc.stdout, file=sys.stderr)
            if exc.stderr:
                print(exc.stderr, file=sys.stderr)
            return exc.returncode or 1
        except Exception as exc:
            print(f"[BASELINE] task={task_name} failed: {exc}", file=sys.stderr)
            return 1

        collected.append(row)
        print(
            f"[BASELINE] task={task_name} difficulty={task_difficulty} success={str(row['success']).lower()} "
            f"score={row['score']:.3f} steps={row['steps']}",
            flush=True,
        )

    # `collected` is non-empty here: `selected` was non-empty and any failure
    # returned early above, so the divisions below are safe.
    average_score = sum(item["score"] for item in collected) / len(collected)
    success_rate = sum(1 for item in collected if item["success"]) / len(collected)

    payload = {
        "mode": "offline" if offline_mode else "openai",
        "model_name": os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct"),
        "api_base_url": os.getenv("API_BASE_URL", "https://router.huggingface.co/v1"),
        "max_steps": max(1, int(args.max_steps)),
        "policy_mode": str(args.policy_mode),
        "trajectories": max(1, int(args.trajectories)),
        "average_score": round(float(average_score), 3),
        "success_rate": round(float(success_rate), 3),
        "results": collected,
    }

    output_path = project_root / str(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")

    print(f"[BASELINE] average_score={payload['average_score']:.3f} success_rate={payload['success_rate']:.3f}", flush=True)
    print(f"[BASELINE] wrote {output_path}", flush=True)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
inference.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import asyncio
5
+ import os
6
+ from typing import Any
7
+
8
+ from openai import OpenAI
9
+
10
+ from env.environment import CICDDebuggerEnvironment, REQUIRED_TOOLS
11
+ from inference.metrics import EpisodeMetrics
12
+ from inference.model_wrapper import ModelWrapper, score_action_candidate
13
+ from inference.prompts import heuristic_action
14
+ from inference.visualize import save_metrics_json, save_reward_curve, save_success_rate_history
15
+
16
+
17
# --- Model endpoint configuration (overridable via environment) ---
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")

# Credential resolution: an explicit OPENAI_API_KEY wins over HF_TOKEN.
HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
API_KEY = OPENAI_API_KEY or HF_TOKEN

# --- Environment/task selection ---
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
DEFAULT_TASK_ID = os.getenv("MY_ENV_V4_TASK", "easy-command-typo")
DEFAULT_BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env")

# --- Runtime knobs (see .env.example for documented defaults) ---
MAX_STEPS_DEFAULT = int(os.getenv("MAX_STEPS", "8"))
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "120"))
OFFLINE_INFERENCE = os.getenv("OFFLINE_INFERENCE", "0") == "1"
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.1"))
31
+
32
+
33
+ def log_start(task: str, env_name: str, model: str) -> None:
34
+ print(f"[START] task={_single_line(task)} env={_single_line(env_name)} model={_single_line(model)}", flush=True)
35
+
36
+
37
+ def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
38
+ done_val = str(done).lower()
39
+ error_val = _single_line(error) if error else "null"
40
+ action_val = _single_line(action)
41
+ print(f"[STEP] step={step} action={action_val} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
42
+
43
+
44
+ def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
45
+ rewards_str = ",".join(f"{value:.2f}" for value in rewards)
46
+ print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
47
+
48
+
49
+ def _single_line(value: Any) -> str:
50
+ return " ".join(str(value).replace("\n", " ").replace("\r", " ").split())
51
+
52
+
53
+ def _is_hacking_action(action_text: str) -> bool:
54
+ value = (action_text or "").lower()
55
+ patterns = (
56
+ "if: false",
57
+ "when: never",
58
+ "echo \"tests passed\"",
59
+ "echo 'tests passed'",
60
+ "exit 0",
61
+ "force success",
62
+ "status: success",
63
+ )
64
+ return any(token in value for token in patterns)
65
+
66
+
67
+ def _extract_error(info: dict[str, Any] | None) -> str | None:
68
+ if not info:
69
+ return None
70
+ error = info.get("error")
71
+ return str(error) if error else None
72
+
73
+
74
+ def _extract_observation_fields(observation: dict[str, Any]) -> tuple[str, str, list[str]]:
75
+ config_text = str(observation.get("config") or "")
76
+ error_message = str(observation.get("error_message") or "")
77
+ tools = [str(item) for item in (observation.get("available_tools") or REQUIRED_TOOLS)]
78
+ return config_text, error_message, tools
79
+
80
+
81
+ def _tool_from_action(action_text: str) -> str:
82
+ return str(action_text or "").split(":", 1)[0].strip().lower()
83
+
84
+
85
+ def _is_action_allowed(action_text: str, available_tools: list[str]) -> bool:
86
+ return _tool_from_action(action_text) in {tool.lower() for tool in available_tools}
87
+
88
+
89
+ def _normalize_action(action_text: str, available_tools: list[str], fallback: str) -> str:
90
+ action = str(action_text or "").strip()
91
+ if not action:
92
+ return fallback
93
+
94
+ aliases = {
95
+ "run_stage": "run_pipeline_stage",
96
+ "validate": "validate_fix",
97
+ "submit": "submit_solution",
98
+ "submit_fix": "submit_solution",
99
+ }
100
+ tool = _tool_from_action(action)
101
+ normalized_tool = aliases.get(tool, tool)
102
+ if normalized_tool != tool:
103
+ suffix = action.split(":", 1)[1].strip() if ":" in action else ""
104
+ action = f"{normalized_tool}: {suffix}" if suffix else normalized_tool
105
+
106
+ if _is_action_allowed(action, available_tools):
107
+ return action
108
+
109
+ return fallback
110
+
111
+
112
def _select_action(
    model_wrapper: ModelWrapper,
    step: int,
    config_text: str,
    error_message: str,
    history: list[str],
    available_actions: list[str],
    policy_mode: str,
    trajectories: int,
) -> str:
    """Pick the next action under the requested policy mode.

    Modes: 'sft' uses the deterministic heuristic only; 'direct' asks the
    model for a single action; anything else ('imp') samples several model
    candidates and keeps the best-scoring one. Every path is normalized and
    gated against the available tools, with the heuristic as the safety net.
    """
    mode = (policy_mode or "imp").lower()
    # The heuristic proposal doubles as the fallback for every mode.
    fallback = heuristic_action(config_text, error_message, available_actions, history)

    if mode == "sft":
        return _normalize_action(fallback, available_actions, fallback)

    if mode == "direct":
        proposal = model_wrapper.generate_action(
            step=step,
            config_text=config_text,
            error_message=error_message,
            history=history,
            available_actions=available_actions,
        )
        return _normalize_action(proposal, available_actions, fallback)

    # 'imp': multi-candidate generation and ranking.
    candidates = model_wrapper.generate_candidates(
        step=step,
        config_text=config_text,
        error_message=error_message,
        history=history,
        count=max(1, int(trajectories)),
        available_actions=available_actions,
    )

    if not candidates:
        return _normalize_action(fallback, available_actions, fallback)

    context = f"{config_text}\n{error_message}"
    best = max(candidates, key=lambda cand: score_action_candidate(context, cand, _is_hacking_action))
    return _normalize_action(best, available_actions, fallback)
153
+
154
+
155
def parse_args() -> argparse.Namespace:
    """CLI flags for running a single inference episode."""
    cli = argparse.ArgumentParser(description="Run CI/CD debugger inference loop")
    # Defaults come from module-level env-derived constants.
    cli.add_argument("--max-steps", type=int, default=MAX_STEPS_DEFAULT)
    cli.add_argument("--task", default=DEFAULT_TASK_ID)
    cli.add_argument("--benchmark", default=DEFAULT_BENCHMARK)
    cli.add_argument("--difficulty", choices=["easy", "medium", "hard"], default=None)
    cli.add_argument("--offline", action="store_true", default=OFFLINE_INFERENCE)
    cli.add_argument("--force-local-env", action="store_true", default=False)
    cli.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
    cli.add_argument("--trajectories", type=int, default=3)
    return cli.parse_args()
166
+
167
+
168
async def run_episode(args: argparse.Namespace) -> int:
    """Run one full episode against the CI/CD debugger environment.

    Always returns 0; the outcome is reported via logs and artifact files
    (reward curve, metrics JSON, success-rate history).
    """
    transcript: list[str] = []
    steps_taken = 0
    success = False
    clean_finish = False
    metrics = EpisodeMetrics()

    step_budget = max(1, int(args.max_steps))
    env = CICDDebuggerEnvironment(max_steps=step_budget)

    # Fall back to offline mode when no API key is configured or the client
    # cannot be constructed.
    offline_mode = bool(args.offline or not API_KEY)
    client: OpenAI | None = None
    if not offline_mode:
        try:
            client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
        except Exception:
            client = None
            offline_mode = True

    model_wrapper = ModelWrapper(
        client=client,
        model_name=MODEL_NAME,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        offline=offline_mode,
    )

    log_start(task=str(args.task), env_name=str(args.benchmark), model=MODEL_NAME)

    try:
        observation = await env.reset(task_id=str(args.task), difficulty=args.difficulty)

        for step_index in range(1, step_budget + 1):
            config_text, error_message, available_tools = _extract_observation_fields(observation)

            action_text = _select_action(
                model_wrapper=model_wrapper,
                step=step_index,
                config_text=config_text,
                error_message=error_message,
                history=transcript,
                available_actions=available_tools,
                policy_mode=str(args.policy_mode),
                trajectories=max(1, int(args.trajectories)),
            )

            observation, reward, done, info = await env.step(action_text)
            step_error = _extract_error(info)

            metrics.add_step(action=action_text, reward=float(reward), error=step_error, done=bool(done))
            steps_taken = step_index

            log_step(step=step_index, action=action_text, reward=float(reward), done=bool(done), error=step_error)
            transcript.append(f"step={step_index} action={_single_line(action_text)} reward={float(reward):.2f}")

            if done:
                # A clean finish requires terminating without a step error and
                # without the final action being a reward-hacking attempt.
                clean_finish = step_error is None and not _is_hacking_action(action_text)
                break

    except Exception as exc:
        success = False
        if not metrics.rewards:
            # Record at least one step so the artifact files are never empty.
            metrics.add_step(action="system_error", reward=0.0, error=str(exc), done=True)
    finally:
        score = max(0.0, min(1.0, float(metrics.average_reward)))
        success = clean_finish and score >= SUCCESS_SCORE_THRESHOLD

        # Artifact writing and environment teardown are best-effort.
        try:
            save_reward_curve(metrics.rewards)
            save_metrics_json(metrics.summary())
            save_success_rate_history([success])
        except Exception:
            pass

        try:
            await env.close()
        except Exception:
            pass

        log_end(success=success, steps=steps_taken, score=score, rewards=metrics.rewards)

    return 0
249
+
250
+
251
def main() -> int:
    """Parse CLI flags and execute one episode on a fresh event loop."""
    return asyncio.run(run_episode(parse_args()))


if __name__ == "__main__":
    raise SystemExit(main())
inference/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from inference.metrics import EpisodeMetrics
2
+ from inference.model_wrapper import ModelWrapper
3
+
4
+ __all__ = ["EpisodeMetrics", "ModelWrapper"]
inference/metrics.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+
6
+ @dataclass
7
+ class EpisodeMetrics:
8
+ rewards: list[float] = field(default_factory=list)
9
+ actions: list[str] = field(default_factory=list)
10
+ errors: list[str | None] = field(default_factory=list)
11
+ dones: list[bool] = field(default_factory=list)
12
+
13
+ def add_step(self, action: str, reward: float, error: str | None, done: bool) -> None:
14
+ self.actions.append(action)
15
+ self.rewards.append(float(reward))
16
+ self.errors.append(error)
17
+ self.dones.append(bool(done))
18
+
19
+ @property
20
+ def steps(self) -> int:
21
+ return len(self.rewards)
22
+
23
+ @property
24
+ def total_reward(self) -> float:
25
+ return round(sum(self.rewards), 4)
26
+
27
+ @property
28
+ def average_reward(self) -> float:
29
+ if not self.rewards:
30
+ return 0.0
31
+ return round(self.total_reward / len(self.rewards), 4)
32
+
33
+ @property
34
+ def success_rate(self) -> float:
35
+ if not self.dones:
36
+ return 0.0
37
+ successes = sum(1 for flag in self.dones if flag)
38
+ return round(successes / len(self.dones), 4)
39
+
40
+ @property
41
+ def failure_reasons(self) -> dict[str, int]:
42
+ counts: dict[str, int] = {}
43
+ for err in self.errors:
44
+ if not err:
45
+ continue
46
+ counts[err] = counts.get(err, 0) + 1
47
+ return counts
48
+
49
+ def summary(self) -> dict[str, float | int | dict[str, int]]:
50
+ return {
51
+ "steps": self.steps,
52
+ "total_reward": self.total_reward,
53
+ "average_reward": self.average_reward,
54
+ "success_rate": self.success_rate,
55
+ "failure_reasons": self.failure_reasons,
56
+ }
inference/model_wrapper.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Iterable
5
+
6
+ from openai import OpenAI
7
+
8
+ from inference.prompts import REQUIRED_ACTIONS, SYSTEM_PROMPT, build_user_prompt, heuristic_action, sanitize_action_text
9
+
10
+
11
@dataclass
class ModelWrapper:
    """Policy head over an OpenAI-compatible chat completion API.

    Degrades gracefully: when offline mode is set, the client is missing, or
    the API call fails, the deterministic heuristic_action policy is used.
    """

    client: OpenAI | None
    model_name: str
    temperature: float
    max_tokens: int
    offline: bool

    def generate_action(
        self,
        step: int,
        config_text: str,
        error_message: str,
        history: list[str],
        available_actions: Iterable[str] | None = None,
    ) -> str:
        """Produce one sanitized single-line action, or the heuristic fallback."""
        safe_default = heuristic_action(config_text, error_message, available_actions, history)
        if self.offline or self.client is None:
            return safe_default

        prompt = build_user_prompt(
            step=step,
            config_text=config_text,
            error_message=error_message,
            history=history,
            available_actions=available_actions,
        )

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                # Clamp knobs coming from environment variables to sane values.
                temperature=max(float(self.temperature), 0.0),
                max_tokens=max(16, int(self.max_tokens)),
                stream=False,
            )
            raw = str(response.choices[0].message.content or "")
            return sanitize_action_text(raw, fallback=safe_default)
        except Exception:
            # Any transport or parsing failure falls back to the heuristic.
            return safe_default

    def generate_candidates(
        self,
        step: int,
        config_text: str,
        error_message: str,
        history: list[str],
        count: int,
        available_actions: Iterable[str] | None = None,
    ) -> list[str]:
        """Sample up to `count` model actions plus the heuristic, deduplicated in order."""
        pool = [heuristic_action(config_text, error_message, available_actions, history)]

        for idx in range(max(1, count)):
            # Vary the history tail so repeated samples are not byte-identical.
            sampled = self.generate_action(
                step=step,
                config_text=config_text,
                error_message=error_message,
                history=history + [f"candidate={idx}"],
                available_actions=available_actions,
            )
            if sampled:
                pool.append(sampled)

        # Order-preserving dedupe of the stripped, non-empty candidates.
        ordered = dict.fromkeys(text.strip() for text in pool if text.strip())
        return list(ordered)
88
+
89
+
90
def score_action_candidate(observation_text: str, action_text: str, hacking_predicate: Any) -> float:
    """Heuristic ranking score for a candidate action given the observation.

    Known tool prefixes and observation-matched moves add bonuses; unknown
    prefixes and anything flagged by `hacking_predicate` subtract. A small
    length bonus (<= 0.2) breaks ties toward more descriptive actions.
    """
    obs = (observation_text or "").lower()
    act = (action_text or "").lower()

    # Base score: known tool prefix is rewarded, unknown is penalized.
    tool_name = act.split(":", 1)[0].strip()
    total = 0.5 if tool_name in REQUIRED_ACTIONS else -0.5

    # Observation-conditioned bonuses, applied in a fixed order.
    signals = (
        ("npm tset" in obs and "edit_config" in act and "npm test" in act, 2.0),
        ("yaml" in obs and "edit_config" in act, 1.2),
        ("missing" in obs and "analyze_error" in act, 0.8),
        ("validate_fix" in act or "submit_solution" in act, 0.5),
    )
    for matched, bonus in signals:
        if matched:
            total += bonus

    # Strongly discourage reward hacking.
    if hacking_predicate(action_text):
        total -= 3.0

    # Tiny tie-breaker favoring longer, more descriptive actions.
    total += min(len(action_text) / 240.0, 0.2)
    return total
inference/prompts.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import textwrap
5
+ from typing import Iterable
6
+
7
+
8
+ SYSTEM_PROMPT = textwrap.dedent(
9
+ """
10
+ You are a CI/CD pipeline debugger assistant.
11
+ Return exactly one single-line action describing the next debugging move.
12
+ Do not output markdown. Do not include explanations.
13
+ """
14
+ ).strip()
15
+
16
+ JUDGE_SYSTEM_PROMPT = textwrap.dedent(
17
+ """
18
+ You are a strict CI/CD judge.
19
+ Return JSON only with keys correctness, minimalism, quality and values in [0,1].
20
+ """
21
+ ).strip()
22
+
23
+ REQUIRED_ACTIONS = (
24
+ "read_file",
25
+ "read_logs",
26
+ "analyze_error",
27
+ "edit_config",
28
+ "run_pipeline_stage",
29
+ "run_tests",
30
+ "validate_fix",
31
+ "submit_solution",
32
+ )
33
+
34
+
35
+ def build_user_prompt(
36
+ step: int,
37
+ config_text: str,
38
+ error_message: str,
39
+ history: list[str],
40
+ available_actions: Iterable[str] | None = None,
41
+ ) -> str:
42
+ history_text = "\n".join(history[-5:]) if history else "None"
43
+ actions_text = ", ".join(available_actions) if available_actions else ", ".join(REQUIRED_ACTIONS)
44
+
45
+ return textwrap.dedent(
46
+ f"""
47
+ Step: {step}
48
+
49
+ Current config:
50
+ {config_text}
51
+
52
+ Current error:
53
+ {error_message}
54
+
55
+ Recent history:
56
+ {history_text}
57
+
58
+ Available action categories:
59
+ {actions_text}
60
+
61
+ Output one actionable single-line fix/debug action.
62
+ """
63
+ ).strip()
64
+
65
+
66
def sanitize_action_text(raw_text: str, fallback: str = "read logs and analyze failing command") -> str:
    """Collapse model output to a single clean line; use `fallback` when empty."""
    candidate = (raw_text or "").strip()
    if not candidate:
        return fallback
    # Flatten newlines/carriage returns, then squeeze runs of whitespace.
    single_line = " ".join(candidate.replace("\n", " ").replace("\r", " ").split())
    return single_line or fallback
73
+
74
+
75
def heuristic_action(
    config_text: str,
    error_message: str,
    available_actions: Iterable[str] | None = None,
    history: list[str] | None = None,
) -> str:
    """Deterministic fallback policy: first matching rule wins.

    Rules are ordered as a debugging workflow (inspect -> diagnose -> fix ->
    run -> validate -> submit); each fires only when its tool is available
    and, for the workflow steps, has not been used yet in `history`.
    """
    cfg = (config_text or "").lower()
    err = (error_message or "").lower()
    used = _extract_seen_tools(history or [])
    toolbox = {entry.strip() for entry in (available_actions or REQUIRED_ACTIONS)}

    # (required tool, guard, action string) — evaluated top to bottom.
    rules = (
        ("read_logs", lambda: "read_logs" not in used, "read_logs: inspect failing stage logs"),
        ("analyze_error", lambda: "analyze_error" not in used, "analyze_error: identify root cause from logs and config"),
        ("edit_config", lambda: "npm tset" in cfg, "edit_config: replace npm tset with npm test"),
        ("edit_config", lambda: "yaml" in err or "mapping values are not allowed" in err, "edit_config: fix YAML indentation and syntax"),
        ("edit_config", lambda: "module not found" in err or "dependency" in err, "edit_config: repair dependency install and test commands"),
        ("run_pipeline_stage", lambda: "run_pipeline_stage" not in used, "run_pipeline_stage: run test stage"),
        ("run_tests", lambda: "run_tests" not in used, "run_tests: execute full pipeline tests"),
        ("validate_fix", lambda: "validate_fix" not in used, "validate_fix: check deterministic, hidden, and quality scores"),
        ("submit_solution", lambda: True, "submit_solution: submit current configuration"),
    )
    for tool, guard, action in rules:
        if tool in toolbox and guard():
            return action

    return "read_logs: inspect failing stage logs and identify root cause"
117
+
118
+
119
+ def _extract_seen_tools(history: list[str]) -> set[str]:
120
+ seen: set[str] = set()
121
+ for item in history:
122
+ for tool in REQUIRED_ACTIONS:
123
+ if re.search(rf"\b{re.escape(tool)}\b", item):
124
+ seen.add(tool)
125
+ return seen
inference/visualize.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+
7
+ def save_reward_curve(rewards: list[float], output_path: str = "artifacts/reward_curve.csv") -> str:
8
+ path = Path(output_path)
9
+ path.parent.mkdir(parents=True, exist_ok=True)
10
+
11
+ with path.open("w", encoding="utf-8") as handle:
12
+ handle.write("step,reward\n")
13
+ for idx, reward in enumerate(rewards, start=1):
14
+ handle.write(f"{idx},{float(reward):.4f}\n")
15
+
16
+ return str(path)
17
+
18
+
19
+ def save_success_rate_history(success_flags: list[bool], output_path: str = "artifacts/success_rate.csv") -> str:
20
+ path = Path(output_path)
21
+ path.parent.mkdir(parents=True, exist_ok=True)
22
+
23
+ running = 0
24
+ with path.open("w", encoding="utf-8") as handle:
25
+ handle.write("episode,success,success_rate\n")
26
+ for idx, flag in enumerate(success_flags, start=1):
27
+ if flag:
28
+ running += 1
29
+ rate = running / idx
30
+ handle.write(f"{idx},{int(flag)},{rate:.4f}\n")
31
+
32
+ return str(path)
33
+
34
+
35
+ def save_metrics_json(metrics: dict, output_path: str = "artifacts/metrics.json") -> str:
36
+ path = Path(output_path)
37
+ path.parent.mkdir(parents=True, exist_ok=True)
38
+
39
+ with path.open("w", encoding="utf-8") as handle:
40
+ json.dump(metrics, handle, indent=2, sort_keys=True)
41
+
42
+ return str(path)
openenv.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: "0.2"
2
+ name: "cicd-debugger-env"
3
+ description: "RL environment for CI/CD debugging with deterministic, hidden, and quality-aware scoring"
4
+ metadata:
5
+ domain: "devops"
6
+ real_world_task: "ci-cd pipeline debugging"
7
+ deployment: "huggingface-space-docker"
8
+
9
+ environment:
10
+ entry_point: "env.environment:CICDDebuggerEnvironment"
11
+
12
+ interface:
13
+ observation_type: "json"
14
+ action_type: "text"
15
+ max_steps: 30
16
+
17
+ action_space:
18
+ tools:
19
+ - read_file
20
+ - read_logs
21
+ - analyze_error
22
+ - edit_config
23
+ - run_pipeline_stage
24
+ - run_tests
25
+ - validate_fix
26
+ - submit_solution
27
+
28
+ tasks:
29
+ - id: "easy-command-typo"
30
+ grader: "env.graders.deterministic:DeterministicGrader"
31
+
32
+ - id: "easy-missing-checkout"
33
+ grader: "env.graders.deterministic:DeterministicGrader"
34
+
35
+ - id: "easy-yaml-indentation"
36
+ grader: "env.graders.deterministic:DeterministicGrader"
37
+
38
+ - id: "medium-python-version"
39
+ grader: "env.graders.deterministic:DeterministicGrader"
40
+
41
+ - id: "medium-cache-key"
42
+ grader: "env.graders.deterministic:DeterministicGrader"
43
+
44
+ - id: "medium-artifact-permissions"
45
+ grader: "env.graders.deterministic:DeterministicGrader"
46
+
47
+ - id: "hard-matrix-logic"
48
+ grader: "env.graders.deterministic:DeterministicGrader"
49
+
50
+ - id: "hard-conditional-deploy"
51
+ grader: "env.graders.deterministic:DeterministicGrader"
52
+
53
+ - id: "hard-needs-order"
54
+ grader: "env.graders.deterministic:DeterministicGrader"
pyproject.toml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "cicd-debugger-env"
3
+ version = "0.1.0"
4
+ description = "OpenEnv CI/CD pipeline debugging environment with hybrid grading and reward shaping"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "pyyaml",
9
+ "fastapi",
10
+ "uvicorn",
11
+ "openenv-core",
12
+ "openai",
13
+ ]
14
+
15
+ [project.scripts]
16
+ server = "server.app:main"
17
+
18
+ [build-system]
19
+ requires = ["setuptools>=68", "wheel"]
20
+ build-backend = "setuptools.build_meta"
21
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ pyyaml
2
+ fastapi
3
+ uvicorn[standard]
4
+ openenv-core
5
+ openai
6
+ pydantic>=2.0.0
7
+ transformers>=4.30.0
8
+ torch>=2.0.0
9
+ pytest>=7.0.0
server/__init__.py ADDED
File without changes
server/app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ import os
5
+ from typing import Any
6
+
7
+ from fastapi import FastAPI
8
+ from fastapi import HTTPException
9
+ from pydantic import BaseModel, Field
10
+ import uvicorn
11
+
12
+ from env.environment import CICDDebuggerEnvironment, MAX_STEPS
13
+ from env.models import Action, Observation, Reward
14
+
15
+
16
+ app = FastAPI(title="CI/CD Debugger OpenEnv Server")
17
+
18
+
19
class ResetRequest(BaseModel):
    # Request body for POST /reset.
    # task_id / difficulty are forwarded verbatim to
    # CICDDebuggerEnvironment.reset(); None lets the environment choose.
    task_id: str | None = None
    difficulty: str | None = None
    # Episode step budget; validation clamps it to [1, 100].
    max_steps: int = Field(default=MAX_STEPS, ge=1, le=100)
23
+
24
+
25
class StepRequest(BaseModel):
    # Request body for POST /step. The action may arrive as a structured
    # Action model, a raw string, or a free-form dict; env.step() accepts all
    # three shapes.
    action: Action | str | dict[str, Any]
27
+
28
+
29
class StepResponse(BaseModel):
    # Response payload shared by /reset and /step.
    task_id: str
    # Step counter as reported by the environment observation (see
    # _build_step_response for the fallback to the session counter).
    step_count: int
    # Scalar reward for the last transition; reward_model carries the
    # structured breakdown (or a flat {"total": reward} fallback).
    reward: float
    reward_model: Reward
    done: bool
    observation: Observation
    # Last action as a plain string, None right after reset.
    last_action: str | None = None
    # Raw info dict returned by env.step() (message, tool, error, ...).
    info: dict[str, Any] = Field(default_factory=dict)
38
+
39
+
40
class StateResponse(BaseModel):
    # Response payload for GET/POST /state.
    # initialized is False (with all defaults) when /reset has not been called.
    initialized: bool
    task_id: str | None = None
    step_count: int = 0
    done: bool = False
    last_action: str | None = None
    # Last observation, validated into the Observation model; None pre-reset.
    observation: Observation | None = None
    # Raw env.state() dump, exposed for debugging clients.
    internal_state: dict[str, Any] = Field(default_factory=dict)
48
+
49
+
50
@dataclass
class RuntimeSession:
    # Server-side cache of the single active episode: the environment plus
    # the latest transition, so /step and /state can re-serve results without
    # re-stepping the environment.
    env: CICDDebuggerEnvironment
    task_id: str
    step_count: int = 0
    done: bool = False
    last_action: str | None = None
    last_reward: float = 0.0
    # Raw observation/info dicts from the most recent reset() or step().
    last_observation: dict[str, Any] | None = None
    last_info: dict[str, Any] | None = None
60
+
61
+
62
+ runtime_session: RuntimeSession | None = None
63
+
64
+
65
def _as_observation_model(observation: dict[str, Any] | Observation) -> Observation:
    """Pass through an Observation, or validate a raw dict into one."""
    return observation if isinstance(observation, Observation) else Observation.model_validate(observation)
69
+
70
+
71
def _build_step_response(session: RuntimeSession) -> StepResponse:
    """Assemble a StepResponse from the cached session state.

    When the environment did not supply a structured "reward_model" dict in
    info, a flat Reward is synthesized from the scalar last_reward.
    """
    observation = session.last_observation or {}
    info_payload = session.last_info or {}
    reward_payload = info_payload.get("reward_model")
    if isinstance(reward_payload, dict):
        reward_model = Reward.model_validate(reward_payload)
    else:
        reward_model = Reward(value=float(session.last_reward), components={"total": float(session.last_reward)})

    return StepResponse(
        task_id=session.task_id,
        # Prefer the observation's own step counter; fall back to the
        # session's bookkeeping when absent or zero.
        step_count=int(observation.get("step_count") or session.step_count),
        reward=float(session.last_reward),
        reward_model=reward_model,
        done=bool(session.done),
        observation=_as_observation_model(observation),
        last_action=session.last_action,
        info=info_payload,
    )
90
+
91
+
92
@app.get("/")
def root() -> dict[str, Any]:
    """Landing payload listing the service's HTTP endpoints."""
    endpoints = ["/health", "/reset", "/step", "/state"]
    return {
        "message": "CI/CD Debugger Environment is running 🚀",
        "endpoints": endpoints,
    }
98
+
99
+
100
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe used by the hosting runtime."""
    return dict(status="ok")
103
+
104
+
105
@app.post("/reset", response_model=StepResponse)
async def reset(payload: ResetRequest | None = None) -> StepResponse:
    """Create a fresh environment/session and return the initial observation.

    Any previous session is discarded — the server is single-session.
    """
    global runtime_session

    # An empty request body is allowed: use all-default reset parameters.
    request = payload or ResetRequest()
    env = CICDDebuggerEnvironment(max_steps=int(request.max_steps))
    observation = await env.reset(task_id=request.task_id, difficulty=request.difficulty)

    runtime_session = RuntimeSession(
        env=env,
        # Prefer the task id reported by the environment, then the requested
        # id, then a generic placeholder.
        task_id=str(observation.get("task_id", request.task_id or "cicd-debugger-task")),
        step_count=0,
        done=False,
        last_action=None,
        last_reward=0.0,
        last_observation=observation,
        last_info={
            "message": "environment reset",
            "tool": "reset",
            "error": None,
            "reward_model": Reward(value=0.0, components={"total": 0.0}).model_dump(),
        },
    )
    return _build_step_response(runtime_session)
129
+
130
+
131
@app.post("/step", response_model=StepResponse)
async def step(payload: StepRequest) -> StepResponse:
    """Advance the active session by one action and return the step result.

    Raises:
        HTTPException: 400 when /reset has not been called yet.
    """
    global runtime_session

    if runtime_session is None:
        raise HTTPException(status_code=400, detail="Environment not initialized. Call /reset first.")

    # Finished episodes are idempotent: re-serve the last result instead of
    # stepping a terminated environment.
    if runtime_session.done:
        return _build_step_response(runtime_session)

    observation, reward, done, info = await runtime_session.env.step(payload.action)

    # Mirror the environment's view of the transition into the session cache.
    runtime_session.step_count = int(observation.get("step_count", runtime_session.step_count + 1))
    runtime_session.done = bool(done)
    runtime_session.last_action = payload.action if isinstance(payload.action, str) else str(payload.action)
    runtime_session.last_reward = float(reward)
    runtime_session.last_observation = observation
    runtime_session.last_info = dict(info or {})

    return _build_step_response(runtime_session)
151
+
152
+
153
@app.get("/state", response_model=StateResponse)
async def state() -> StateResponse:
    """Report the current session state without advancing the environment."""
    if runtime_session is None:
        # Pre-reset: everything is defaults and initialized is False.
        return StateResponse(initialized=False)

    observation = None
    if runtime_session.last_observation is not None:
        observation = _as_observation_model(runtime_session.last_observation)

    return StateResponse(
        initialized=True,
        task_id=runtime_session.task_id,
        step_count=runtime_session.step_count,
        done=runtime_session.done,
        last_action=runtime_session.last_action,
        observation=observation,
        # Raw environment internals, exposed for debugging clients.
        internal_state=runtime_session.env.state(),
    )
171
+
172
+
173
@app.post("/state", response_model=StateResponse)
async def state_post() -> StateResponse:
    # Some clients POST to /state; delegate to the GET handler.
    return await state()
176
+
177
+
178
def main() -> None:
    """Serve the FastAPI app; the PORT env var overrides the default 7860."""
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))


if __name__ == "__main__":
    main()
tests/__init__.py ADDED
File without changes
tests/test_day2_engine.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ from env.anti_hacking import AntiHackingDetector
4
+ from env.graders.deterministic import DeterministicGrader
5
+ from env.hidden_tests import HiddenTestRunner
6
+ from env.rewards import RewardCalculator
7
+
8
+
9
+ EXPECTED_CONFIG = """
10
+ name: CI
11
+ on: [push]
12
+ jobs:
13
+ test:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - run: npm ci
18
+ - run: npm test
19
+ """
20
+
21
+ WRONG_CONFIG = """
22
+ name: CI
23
+ on: [push]
24
+ jobs:
25
+ test:
26
+ runs-on: ubuntu-latest
27
+ steps:
28
+ - uses: actions/checkout@v4
29
+ - run: npm ci
30
+ - run: npm tset
31
+ """
32
+
33
+ BROKEN_YAML = """
34
+ name CI
35
+ jobs:
36
+ test:
37
+ steps
38
+ - run npm test
39
+ """
40
+
41
+
42
class FakeJudge:
    """Stand-in LLM judge returning fixed, favorable scores.

    Keeps RewardCalculator tests deterministic and offline.
    """

    def evaluate_fix(self, original, fixed, error):
        # Constant verdict regardless of inputs.
        return {"correctness": 0.9, "minimalism": 0.8, "quality": 0.9}
+
50
+
51
+ class Day2EngineTests(unittest.TestCase):
52
+ def setUp(self):
53
+ self.grader = DeterministicGrader()
54
+ self.detector = AntiHackingDetector()
55
+ self.hidden_runner = HiddenTestRunner(grader=self.grader)
56
+ self.reward_calculator = RewardCalculator(
57
+ llm_judge=FakeJudge(),
58
+ anti_hacking_detector=self.detector,
59
+ deterministic_grader=self.grader,
60
+ hidden_test_runner=self.hidden_runner,
61
+ )
62
+
63
+ def test_deterministic_grader_high_for_correct_fix(self):
64
+ score = self.grader.grade(EXPECTED_CONFIG, EXPECTED_CONFIG)
65
+ self.assertGreaterEqual(score, 0.85)
66
+ self.assertLessEqual(score, 1.0)
67
+
68
+ def test_deterministic_grader_low_for_wrong_fix(self):
69
+ score = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
70
+ self.assertLess(score, 0.6)
71
+
72
+ def test_deterministic_grader_penalizes_broken_yaml(self):
73
+ score = self.grader.grade(BROKEN_YAML, EXPECTED_CONFIG)
74
+ self.assertLess(score, 0.4)
75
+
76
+ def test_deterministic_grader_is_reproducible(self):
77
+ first = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
78
+ second = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
79
+ self.assertEqual(first, second)
80
+
81
+ def test_anti_hacking_detects_stage_skipping(self):
82
+ config = EXPECTED_CONFIG + "\n if: false\n"
83
+ penalty = self.detector.penalty_stage_skipping(config)
84
+ self.assertLess(penalty, 0.0)
85
+
86
+ def test_anti_hacking_detects_fake_success(self):
87
+ config = EXPECTED_CONFIG.replace("npm test", 'echo "tests passed" && exit 0')
88
+ penalty = self.detector.penalty_fake_success(config)
89
+ self.assertLess(penalty, 0.0)
90
+
91
+ def test_anti_hacking_detects_breaking_pipeline(self):
92
+ previous_config = """
93
+ stages:
94
+ - build
95
+ - test
96
+ jobs:
97
+ build:
98
+ run: npm ci
99
+ test:
100
+ run: npm test
101
+ """
102
+ new_config = """
103
+ stages:
104
+ - build
105
+ jobs:
106
+ build:
107
+ run: npm ci
108
+ """
109
+ penalty = self.detector.penalty_breaking_pipeline(previous_config, new_config)
110
+ self.assertLess(penalty, 0.0)
111
+
112
+ def test_anti_hacking_detects_excessive_edits(self):
113
+ penalty = self.detector.penalty_excessive_edits(changed_files_count=12, changed_lines_count=400)
114
+ self.assertLess(penalty, 0.0)
115
+
116
+ def test_anti_hacking_detects_timeout_abuse(self):
117
+ penalty = self.detector.penalty_timeout_abuse(step_count=25)
118
+ self.assertLess(penalty, 0.0)
119
+
120
+ def test_hidden_tests_returns_high_pass_rate_for_good_fix(self):
121
+ pass_rate = self.hidden_runner.evaluate_fix(
122
+ fixed_config=EXPECTED_CONFIG,
123
+ expected_config=EXPECTED_CONFIG,
124
+ )
125
+ self.assertGreaterEqual(pass_rate, 0.8)
126
+
127
+ def test_hidden_tests_returns_lower_pass_rate_for_bad_fix(self):
128
+ pass_rate = self.hidden_runner.evaluate_fix(
129
+ fixed_config=WRONG_CONFIG,
130
+ expected_config=EXPECTED_CONFIG,
131
+ )
132
+ self.assertLess(pass_rate, 0.8)
133
+
134
+ def test_reward_calculator_positive_for_good_behavior(self):
135
+ state = {
136
+ "step_count": 3,
137
+ "previous_config": WRONG_CONFIG,
138
+ "expected_config": EXPECTED_CONFIG,
139
+ "original_config": WRONG_CONFIG,
140
+ "error": "command not found",
141
+ }
142
+ result = {
143
+ "logs_analyzed": True,
144
+ "error_diagnosed": True,
145
+ "fix_proposed": True,
146
+ "pipeline_run": True,
147
+ "tests_passed": True,
148
+ "command_succeeded": True,
149
+ "current_config": EXPECTED_CONFIG,
150
+ "expected_config": EXPECTED_CONFIG,
151
+ "changed_files_count": 1,
152
+ "changed_lines_count": 1,
153
+ }
154
+
155
+ reward = self.reward_calculator.calculate_step_reward(
156
+ state=state,
157
+ action="validate_fix",
158
+ result=result,
159
+ original_config=WRONG_CONFIG,
160
+ fixed_config=EXPECTED_CONFIG,
161
+ error_message="command not found",
162
+ expected_config=EXPECTED_CONFIG,
163
+ metadata={"broken_token": "npm tset", "fixed_token": "npm test"},
164
+ )
165
+
166
+ self.assertGreater(reward, 0.5)
167
+
168
+ def test_reward_calculator_penalizes_hacking(self):
169
+ hacked_config = EXPECTED_CONFIG.replace("npm test", 'echo "tests passed" && exit 0') + "\n if: false\n"
170
+ state = {
171
+ "step_count": 25,
172
+ "previous_config": EXPECTED_CONFIG,
173
+ "changed_files_count": 15,
174
+ "changed_lines_count": 500,
175
+ }
176
+ result = {
177
+ "current_config": hacked_config,
178
+ "expected_config": EXPECTED_CONFIG,
179
+ "hacking_attempt": True,
180
+ "changed_files_count": 15,
181
+ "changed_lines_count": 500,
182
+ }
183
+
184
+ reward = self.reward_calculator.calculate_step_reward(
185
+ state=state,
186
+ action="edit_config",
187
+ result=result,
188
+ original_config=EXPECTED_CONFIG,
189
+ fixed_config=hacked_config,
190
+ error_message="",
191
+ expected_config=EXPECTED_CONFIG,
192
+ )
193
+
194
+ self.assertGreaterEqual(reward, 0.0)
195
+ self.assertLessEqual(reward, 0.3)
196
+
197
+
198
+ if __name__ == "__main__":
199
+ unittest.main()
tests/test_env.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import unittest
3
+
4
+ from env.environment import CICDDebuggerEnvironment, REQUIRED_TOOLS
5
+
6
+
7
class EnvironmentContractTests(unittest.TestCase):
    """Contract tests for the CICDDebuggerEnvironment reset/step/state API."""

    @staticmethod
    def _ready_env(seed, max_steps=10, task_id="easy-command-typo"):
        # Build an environment and perform the initial reset; return the
        # environment together with the first observation.
        env = CICDDebuggerEnvironment(max_steps=max_steps, seed=seed)
        first_observation = asyncio.run(env.reset(task_id=task_id))
        return env, first_observation

    @staticmethod
    def _do(env, action):
        # Execute one action synchronously and return (obs, reward, done, info).
        return asyncio.run(env.step(action))

    def test_reset_returns_structured_observation(self):
        _, observation = self._ready_env(seed=7)

        for key in ("config", "logs", "error_message", "progress_flags"):
            self.assertIn(key, observation)
        self.assertEqual(observation["task_id"], "easy-command-typo")
        self.assertEqual(observation["available_tools"], REQUIRED_TOOLS)
        self.assertEqual(observation["step_count"], 0)

    def test_step_returns_obs_reward_done_info(self):
        env, _ = self._ready_env(seed=3)

        observation, reward, done, info = self._do(env, "read_logs: inspect failing stage logs")

        self.assertIsInstance(observation, dict)
        self.assertIsInstance(reward, float)
        self.assertIsInstance(done, bool)
        self.assertIsInstance(info, dict)
        self.assertIn("tool", info)

    def test_action_space_rejects_extra_tools(self):
        env, _ = self._ready_env(seed=5)

        observation, reward, done, info = self._do(env, "propose_fix: force deploy")

        self.assertIn("error", info)
        self.assertIsNotNone(info["error"])
        self.assertFalse(done)
        self.assertGreaterEqual(reward, 0.0)
        self.assertIn("config", observation)

    def test_action_space_rejects_alias_tools(self):
        env, _ = self._ready_env(seed=15)

        _, _, done, info = self._do(env, "read: workflow file")

        self.assertIn("error", info)
        self.assertIsNotNone(info["error"])
        self.assertFalse(done)

    def test_submit_solution_path(self):
        env, _ = self._ready_env(seed=9, max_steps=12)

        # Scripted happy path: observe, diagnose, fix, run, validate.
        for action in (
            "read_logs: inspect logs",
            "analyze_error: identify root cause",
            "edit_config: replace npm tset with npm test",
            "run_pipeline_stage: run test stage",
            "run_tests: execute tests",
            "validate_fix: validate score",
        ):
            self._do(env, action)
        observation, reward, done, info = self._do(env, "submit_solution: submit current fix")

        self.assertTrue(done)
        self.assertGreaterEqual(reward, 0.0)
        self.assertIsNone(info.get("error"))
        self.assertEqual(observation["progress_flags"].get("submit_solution"), True)

    def test_internal_state_tracks_required_fields(self):
        env, _ = self._ready_env(seed=11)
        self._do(env, "read_logs: inspect logs")

        state = env.get_state()
        self.assertTrue(state.get("initialized"))
        for key in (
            "actual_bug",
            "correct_solution",
            "progress_flags",
            "file_modification_count",
            "hidden_test_pass_rate",
        ):
            self.assertIn(key, state)

    def test_yaml_task_is_fixable_via_edit_flow(self):
        env, _ = self._ready_env(seed=17, max_steps=12, task_id="easy-yaml-indentation")

        self._do(env, "read_logs: inspect logs")
        self._do(env, "analyze_error: identify root cause")
        observation, _, _, _ = self._do(env, "edit_config: fix YAML indentation and syntax")

        self.assertIn("- run: pytest", observation["config"])
        self.assertNotIn(" - run: pytest", observation["config"])

        self._do(env, "run_tests: execute tests")
        self._do(env, "validate_fix: validate score")
        _, _, done, info = self._do(env, "submit_solution: submit current fix")

        self.assertTrue(done)
        self.assertIsNone(info.get("error"))

    def test_hard_needs_order_edit_updates_deploy_dependency(self):
        env, _ = self._ready_env(seed=19, max_steps=12, task_id="hard-needs-order")

        observation, _, _, _ = self._do(env, "edit_config: fix deploy dependency ordering")

        self.assertIn("needs: [build, test]", observation["config"])
        self.assertEqual(observation["config"].count("needs: build"), 1)
110
+
111
+
112
# Allow running this test module directly (without a test runner).
if __name__ == "__main__":
    unittest.main()
tests/test_inference.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import subprocess
4
+ import sys
5
+ from pathlib import Path
6
+ import unittest
7
+
8
+
9
class InferenceOutputFormatTests(unittest.TestCase):
    """Verifies that inference.py emits the [START]/[STEP]/[END] transcript format."""

    # Expected shape of each transcript line.
    _START_RE = re.compile(r"^\[START\] task=\S+ env=\S+ model=.+$")
    _STEP_RE = re.compile(
        r"^\[STEP\] step=\d+ action=.* reward=-?\d+\.\d{2} done=(true|false) error=(null|.+)$"
    )
    _END_RE = re.compile(
        r"^\[END\] success=(true|false) steps=\d+ score=\d+\.\d{3} rewards=(-?\d+\.\d{2}(,-?\d+\.\d{2})*)?$"
    )

    def test_inference_prints_required_markers(self):
        repo_root = Path(__file__).resolve().parents[1]
        child_env = os.environ.copy()
        child_env["OFFLINE_INFERENCE"] = "1"  # keep the run hermetic

        proc = subprocess.run(
            [sys.executable, "inference.py", "--max-steps", "3", "--offline", "--force-local-env"],
            cwd=repo_root,
            capture_output=True,
            text=True,
            env=child_env,
            check=True,
        )

        transcript = [
            stripped
            for stripped in (raw.strip() for raw in proc.stdout.splitlines())
            if stripped
        ]
        self.assertGreaterEqual(len(transcript), 3)
        self.assertTrue(transcript[0].startswith("[START] "))
        self.assertTrue(transcript[-1].startswith("[END] "))

        self.assertRegex(transcript[0], self._START_RE)

        step_lines = [entry for entry in transcript if entry.startswith("[STEP] ")]
        self.assertTrue(step_lines)
        for entry in step_lines:
            self.assertRegex(entry, self._STEP_RE)

        self.assertRegex(transcript[-1], self._END_RE)

        # No stray output: every non-empty line must be one of the three markers.
        for entry in transcript:
            self.assertTrue(
                entry.startswith(("[START] ", "[STEP] ", "[END] ")),
                f"Unexpected output line: {entry}",
            )
51
+
52
+
53
# Allow running this test module directly (without a test runner).
if __name__ == "__main__":
    unittest.main()
tests/test_judge.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ from env.graders.llm_judge import LLMJudge
4
+
5
+
6
class FakeModel:
    """Stand-in for a text-generation model callable.

    Mimics the HF pipeline output shape (a list with one
    ``{"generated_text": ...}`` dict), or raises RuntimeError when
    configured to simulate a model failure.
    """

    def __init__(self, payload, raise_error: bool = False):
        self.payload = payload  # canned text returned as the generated output
        self.raise_error = raise_error  # when True, every call raises

    def __call__(self, prompt, **kwargs):
        if not self.raise_error:
            return [{"generated_text": self.payload}]
        raise RuntimeError("model failure")
15
+
16
+
17
class LLMJudgeTests(unittest.TestCase):
    """Exercises LLMJudge parsing paths: clean JSON, noisy text, partial output, model failure."""

    def test_good_json_scores_are_parsed(self):
        judge = LLMJudge(FakeModel('{"correctness": 1.0, "minimalism": 0.8, "quality": 0.9}'))

        scores = judge.evaluate_fix("npm tset", "npm test", "command not found")

        for field, floor in (("correctness", 0.9), ("minimalism", 0.7), ("quality", 0.8)):
            self.assertGreaterEqual(scores[field], floor)

    def test_regex_fallback_for_noisy_output(self):
        noisy_reply = "Correctness: 0.7\nMinimalism: 0.6\nQuality: 0.75"
        judge = LLMJudge(FakeModel(noisy_reply))

        scores = judge.evaluate_fix("a", "b", "err")

        for field, value in (("correctness", 0.7), ("minimalism", 0.6), ("quality", 0.75)):
            self.assertAlmostEqual(scores[field], value)

    def test_partial_fields_default_to_zero(self):
        judge = LLMJudge(FakeModel('{"correctness": 0.8}'))

        scores = judge.evaluate_fix("a", "b", "err")

        self.assertAlmostEqual(scores["correctness"], 0.8)
        self.assertAlmostEqual(scores["minimalism"], 0.0)
        self.assertAlmostEqual(scores["quality"], 0.0)

    def test_model_failure_returns_zeroes(self):
        judge = LLMJudge(FakeModel("", raise_error=True))

        scores = judge.evaluate_fix("a", "b", "err")

        self.assertEqual(scores, {"correctness": 0.0, "minimalism": 0.0, "quality": 0.0})
48
+
49
+
50
# Allow running this test module directly (without a test runner).
if __name__ == "__main__":
    unittest.main()
tests/test_server_api.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ from fastapi.testclient import TestClient
4
+
5
+ from server.app import app
6
+ import server.app as server_app
7
+
8
+
9
class ServerApiTests(unittest.TestCase):
    """HTTP-level tests for the FastAPI wrapper around the environment."""

    def setUp(self):
        # Drop any session left over from a previous test before building a client.
        server_app.runtime_session = None
        self.client = TestClient(app)

    def test_health(self):
        reply = self.client.get("/health")

        self.assertEqual(reply.status_code, 200)
        self.assertEqual(reply.json().get("status"), "ok")

    def test_reset_state_step_flow(self):
        # 1) /reset creates a fresh session at step 0.
        reset_reply = self.client.post("/reset", json={})
        self.assertEqual(reset_reply.status_code, 200)
        reset_body = reset_reply.json()
        self.assertIn("observation", reset_body)
        self.assertIn("step_count", reset_body)
        self.assertEqual(reset_body["step_count"], 0)

        # 2) /state reflects the initialized session.
        state_reply = self.client.get("/state")
        self.assertEqual(state_reply.status_code, 200)
        self.assertTrue(state_reply.json().get("initialized"))

        # 3) /step executes an action and reports reward/done.
        step_reply = self.client.post(
            "/step",
            json={"action": "edit_config: replace npm tset with npm test"},
        )
        self.assertEqual(step_reply.status_code, 200)
        step_body = step_reply.json()
        self.assertIn("reward", step_body)
        self.assertIn("done", step_body)

    def test_step_requires_reset(self):
        # Stepping without a prior /reset must be rejected.
        server_app.runtime_session = None
        fresh_client = TestClient(app)

        reply = fresh_client.post("/step", json={"action": "read_logs: inspect logs"})

        self.assertEqual(reply.status_code, 400)
46
+
47
+
48
# Allow running this test module directly (without a test runner).
if __name__ == "__main__":
    unittest.main()
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
validate-submission.sh ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # validate-submission.sh - OpenEnv Submission Validator
4
+ #
5
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
6
+ #
7
+ # Prerequisites:
8
+ # - Docker: https://docs.docker.com/get-docker/
9
+ # - openenv-core: pip install openenv-core
10
+ # - curl (usually pre-installed)
11
+ #
12
+ # Run:
13
+ # curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
14
+ #
15
+ # Or download and run locally:
16
+ # chmod +x validate-submission.sh
17
+ # ./validate-submission.sh <ping_url> [repo_dir]
18
+ #
19
+ # Arguments:
20
+ # ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
21
+ # repo_dir Path to your repo (default: current directory)
22
+ #
23
+ # Examples:
24
+ # ./validate-submission.sh https://my-team.hf.space
25
+ # ./validate-submission.sh https://my-team.hf.space ./my-repo
26
+ #
27
+
28
# Fail on unset variables and propagate failures through pipelines.
set -uo pipefail

# Hard cap (seconds) on the docker build in Step 2.
DOCKER_BUILD_TIMEOUT=600
# Use ANSI colors only when stdout is a terminal.
if [ -t 1 ]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[1;33m'
    BOLD='\033[1m'
    NC='\033[0m'
else
    RED='' GREEN='' YELLOW='' BOLD='' NC=''
fi
40
+
41
+ run_with_timeout() {
42
+ local secs="$1"; shift
43
+ if command -v timeout &>/dev/null; then
44
+ timeout "$secs" "$@"
45
+ elif command -v gtimeout &>/dev/null; then
46
+ gtimeout "$secs" "$@"
47
+ else
48
+ "$@" &
49
+ local pid=$!
50
+ ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
51
+ local watcher=$!
52
+ wait "$pid" 2>/dev/null
53
+ local rc=$?
54
+ kill "$watcher" 2>/dev/null
55
+ wait "$watcher" 2>/dev/null
56
+ return $rc
57
+ fi
58
+ }
59
+
60
# portable_mktemp [PREFIX]
# Create a temp file named "<PREFIX>-XXXXXX" under $TMPDIR (default /tmp),
# falling back to a bare `mktemp` for implementations that reject templates.
# Prints the path of the created file.
portable_mktemp() {
    local prefix="${1:-validate}"
    mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
}
64
+
65
+ CLEANUP_FILES=()
66
+ cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
67
+ trap cleanup EXIT
68
+
69
+ PING_URL="${1:-}"
70
+ REPO_DIR="${2:-.}"
71
+
72
+ if [ -z "$PING_URL" ]; then
73
+ printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
74
+ printf "\n"
75
+ printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
76
+ printf " repo_dir Path to your repo (default: current directory)\n"
77
+ exit 1
78
+ fi
79
+
80
+ if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
81
+ printf "Error: directory '%s' not found\n" "${2:-.}"
82
+ exit 1
83
+ fi
84
+ PING_URL="${PING_URL%/}"
85
+ export PING_URL
86
+ PASS=0
87
+
88
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
89
+ pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
90
+ fail() { log "${RED}FAILED${NC} -- $1"; }
91
+ hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
92
+ stop_at() {
93
+ printf "\n"
94
+ printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
95
+ exit 1
96
+ }
97
+
98
+ printf "\n"
99
+ printf "${BOLD}========================================${NC}\n"
100
+ printf "${BOLD} OpenEnv Submission Validator${NC}\n"
101
+ printf "${BOLD}========================================${NC}\n"
102
+ log "Repo: $REPO_DIR"
103
+ log "Ping URL: $PING_URL"
104
+ printf "\n"
105
+
106
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

# Capture the response body in a temp file and the HTTP status via -w.
# curl's stderr goes to /dev/null: the original redirected it to the same
# file as -o, which clobbered the captured body.
CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
    -H "Content-Type: application/json" -d '{}' \
    "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"
# Normalize: treat an empty capture as an unreachable host. (Previously,
# `|| printf "000"` appended to the "000" curl already emitted via -w on
# connection failure, producing "000000" and a misleading error branch.)
[ -n "$HTTP_CODE" ] || HTTP_CODE="000"

if [ "$HTTP_CODE" = "200" ]; then
    pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
    fail "HF Space not reachable (connection failed or timed out)"
    hint "Check your network connection and that the Space is running."
    hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
    stop_at "Step 1"
else
    fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
    hint "Make sure your Space is running and the URL is correct."
    hint "Try opening $PING_URL in your browser first."
    stop_at "Step 1"
fi
127
+
128
log "${BOLD}Step 2/3: Running docker build${NC} ..."

# Docker must be installed before anything can be built.
if ! command -v docker &>/dev/null; then
    fail "docker command not found"
    hint "Install Docker: https://docs.docker.com/get-docker/"
    stop_at "Step 2"
fi

# Accept a Dockerfile either at the repo root or under server/.
if [ -f "$REPO_DIR/Dockerfile" ]; then
    DOCKER_CONTEXT="$REPO_DIR"
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
    DOCKER_CONTEXT="$REPO_DIR/server"
else
    fail "No Dockerfile found in repo root or server/ directory"
    stop_at "Step 2"
fi

log " Found Dockerfile in $DOCKER_CONTEXT"

# Build with a hard time limit; capture combined output for error reporting.
BUILD_OK=false
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true

if [ "$BUILD_OK" = true ]; then
    pass "Docker build succeeded"
else
    fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
    # Show only the tail of the build log to keep the report readable.
    printf "%s\n" "$BUILD_OUTPUT" | tail -20
    stop_at "Step 2"
fi
157
+
158
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."

# Prefer the `openenv` CLI on PATH; otherwise invoke the CLI module with the
# repo's virtualenv python (or system python3 as a last resort).
VALIDATE_OK=false
if command -v openenv &>/dev/null; then
    VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
else
    PY_VALIDATE="python3"
    if [ -x "$REPO_DIR/.venv/bin/python" ]; then
        PY_VALIDATE="$REPO_DIR/.venv/bin/python"
    fi
    VALIDATE_OUTPUT=$(cd "$REPO_DIR" && "$PY_VALIDATE" -m openenv.cli.__main__ validate 2>&1) && VALIDATE_OK=true
fi

if [ "$VALIDATE_OK" = true ]; then
    pass "openenv validate passed"
    [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
else
    fail "openenv validate failed"
    printf "%s\n" "$VALIDATE_OUTPUT"
    stop_at "Step 3"
fi
179
+
180
+ printf "\n"
181
+ printf "${BOLD}========================================${NC}\n"
182
+ printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
183
+ printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
184
+ printf "${BOLD}========================================${NC}\n"
185
+ printf "\n"
186
+
187
+ exit 0