Commit ·
30bf68a
0
Parent(s):
clean final submission
Browse files- .dockerignore +11 -0
- .env.example +23 -0
- .gitattributes +35 -0
- .gitignore +56 -0
- Dockerfile +13 -0
- README.md +382 -0
- artifacts/baseline_scores.json +60 -0
- artifacts/metrics.json +7 -0
- artifacts/reward_curve.csv +8 -0
- artifacts/success_rate.csv +2 -0
- baseline_inference.py +200 -0
- inference.py +257 -0
- inference/__init__.py +4 -0
- inference/metrics.py +56 -0
- inference/model_wrapper.py +115 -0
- inference/prompts.py +125 -0
- inference/visualize.py +42 -0
- openenv.yaml +54 -0
- pyproject.toml +21 -0
- requirements.txt +9 -0
- server/__init__.py +0 -0
- server/app.py +184 -0
- tests/__init__.py +0 -0
- tests/test_day2_engine.py +199 -0
- tests/test_env.py +113 -0
- tests/test_inference.py +54 -0
- tests/test_judge.py +51 -0
- tests/test_server_api.py +49 -0
- uv.lock +0 -0
- validate-submission.sh +187 -0
.dockerignore
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git/
|
| 2 |
+
.venv/
|
| 3 |
+
.pytest_cache/
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*.log
|
| 7 |
+
.env
|
| 8 |
+
artifacts/
|
| 9 |
+
tests/
|
| 10 |
+
uv.lock
|
| 11 |
+
migrated_from_cicd-debugger-env-2/
|
.env.example
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Default profile: Hugging Face Router (OpenAI-compatible API)
|
| 2 |
+
API_BASE_URL=https://router.huggingface.co/v1
|
| 3 |
+
MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 4 |
+
HF_TOKEN=<your_openai_or_router_api_key>
|
| 5 |
+
|
| 6 |
+
# Optional alias. If both are set, OPENAI_API_KEY is used first by inference.py.
|
| 7 |
+
OPENAI_API_KEY=
|
| 8 |
+
|
| 9 |
+
# OpenAI direct profile (uncomment for OpenAI access token usage):
|
| 10 |
+
# API_BASE_URL=https://api.openai.com/v1
|
| 11 |
+
# MODEL_NAME=gpt-4o-mini
|
| 12 |
+
# HF_TOKEN=<your_openai_access_token>
|
| 13 |
+
# OPENAI_API_KEY=<optional_same_token_as_hf_token>
|
| 14 |
+
|
| 15 |
+
# Optional runtime knobs
|
| 16 |
+
LOCAL_IMAGE_NAME=
|
| 17 |
+
MY_ENV_V4_TASK=easy-command-typo
|
| 18 |
+
MY_ENV_V4_BENCHMARK=cicd_debugger_env
|
| 19 |
+
MAX_STEPS=8
|
| 20 |
+
TEMPERATURE=0.2
|
| 21 |
+
MAX_TOKENS=120
|
| 22 |
+
SUCCESS_SCORE_THRESHOLD=0.1
|
| 23 |
+
OFFLINE_INFERENCE=0
|
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python bytecode and cache
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# Virtual environments
|
| 7 |
+
# Keep source folder env/ tracked; only ignore venv directories.
|
| 8 |
+
.venv/
|
| 9 |
+
venv/
|
| 10 |
+
ENV/
|
| 11 |
+
env.bak/
|
| 12 |
+
venv.bak/
|
| 13 |
+
|
| 14 |
+
# Build and packaging artifacts
|
| 15 |
+
build/
|
| 16 |
+
dist/
|
| 17 |
+
.eggs/
|
| 18 |
+
*.egg-info/
|
| 19 |
+
*.egg
|
| 20 |
+
pip-wheel-metadata/
|
| 21 |
+
|
| 22 |
+
# Testing, typing, linting caches
|
| 23 |
+
.pytest_cache/
|
| 24 |
+
.mypy_cache/
|
| 25 |
+
.ruff_cache/
|
| 26 |
+
.pyre/
|
| 27 |
+
.pytype/
|
| 28 |
+
.hypothesis/
|
| 29 |
+
.tox/
|
| 30 |
+
.nox/
|
| 31 |
+
.coverage
|
| 32 |
+
.coverage.*
|
| 33 |
+
htmlcov/
|
| 34 |
+
|
| 35 |
+
# Jupyter
|
| 36 |
+
.ipynb_checkpoints/
|
| 37 |
+
|
| 38 |
+
# Logs and temp files
|
| 39 |
+
*.log
|
| 40 |
+
*.out
|
| 41 |
+
*.err
|
| 42 |
+
tmp/
|
| 43 |
+
temp/
|
| 44 |
+
|
| 45 |
+
# Local environment and secrets
|
| 46 |
+
.env
|
| 47 |
+
.env.*
|
| 48 |
+
!.env.example
|
| 49 |
+
|
| 50 |
+
# IDE/editor and OS files
|
| 51 |
+
.vscode/
|
| 52 |
+
.idea/
|
| 53 |
+
*.swp
|
| 54 |
+
*.swo
|
| 55 |
+
.DS_Store
|
| 56 |
+
Thumbs.db
|
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt ./
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
COPY . .
|
| 9 |
+
|
| 10 |
+
ENV PORT=7860
|
| 11 |
+
EXPOSE 7860
|
| 12 |
+
|
| 13 |
+
CMD ["python", "-m", "server.app"]
|
README.md
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: CICD_DEBUGGER
|
| 3 |
+
colorFrom: blue
|
| 4 |
+
colorTo: green
|
| 5 |
+
sdk: docker
|
| 6 |
+
app_port: 7860
|
| 7 |
+
pinned: false
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# CI/CD Pipeline Debugger Environment (OpenEnv)
|
| 13 |
+
|
| 14 |
+
## 1. Project Goal
|
| 15 |
+
|
| 16 |
+
This repository implements an AI training and evaluation environment where an agent learns to debug broken CI/CD pipelines automatically.
|
| 17 |
+
|
| 18 |
+
The environment targets real-world DevOps failure patterns, including:
|
| 19 |
+
|
| 20 |
+
- YAML syntax and structure issues
|
| 21 |
+
- Incorrect build/test commands (for example, npm tset -> npm test)
|
| 22 |
+
- Dependency and setup failures
|
| 23 |
+
- Multi-stage pipeline execution errors
|
| 24 |
+
|
| 25 |
+
This is designed as an RL-style interaction loop:
|
| 26 |
+
|
| 27 |
+
Observe -> Think -> Act -> Get Reward -> Repeat
|
| 28 |
+
|
| 29 |
+
## 2. Why This Matters
|
| 30 |
+
|
| 31 |
+
CI/CD failures are common, repetitive, and often multi-step to resolve. This project turns that workflow into a structured learning environment where agents:
|
| 32 |
+
|
| 33 |
+
- Read failure context
|
| 34 |
+
- Reason about root causes
|
| 35 |
+
- Propose and apply fixes
|
| 36 |
+
- Get shaped rewards for robust behavior
|
| 37 |
+
|
| 38 |
+
## 3. System Architecture
|
| 39 |
+
|
| 40 |
+
High-level flow:
|
| 41 |
+
|
| 42 |
+
Agent (LLM) -> Action -> Environment.step() -> Reward/Evaluation -> Next step
|
| 43 |
+
|
| 44 |
+
Core integration path:
|
| 45 |
+
|
| 46 |
+
Model -> Action -> Environment.step() -> RewardCalculator
|
| 47 |
+
|
| 48 |
+
RewardCalculator integrates:
|
| 49 |
+
|
| 50 |
+
- DeterministicGrader
|
| 51 |
+
- LLMJudge
|
| 52 |
+
- HiddenTestRunner
|
| 53 |
+
- AntiHackingDetector
|
| 54 |
+
|
| 55 |
+
### 3.1 OpenEnv Interface (Typed)
|
| 56 |
+
|
| 57 |
+
Typed Pydantic models are defined in `env/models.py`:
|
| 58 |
+
|
| 59 |
+
- `Observation`: strict schema for environment observations
|
| 60 |
+
- `Action`: normalized tool + payload action schema
|
| 61 |
+
- `Reward`: bounded reward model with components
|
| 62 |
+
|
| 63 |
+
Environment contract:
|
| 64 |
+
|
| 65 |
+
- `reset()` returns the initial `Observation` payload
|
| 66 |
+
- `step(action)` returns `(observation, reward, done, info)`
|
| 67 |
+
- `state()` returns current environment state snapshot
|
| 68 |
+
|
| 69 |
+
Server/API contract models are exposed in `server/app.py` and use the same typed observation/action/reward structures.
|
| 70 |
+
|
| 71 |
+
### 3.2 Action and Observation Spaces
|
| 72 |
+
|
| 73 |
+
Observation fields include:
|
| 74 |
+
|
| 75 |
+
- `task_id`, `difficulty`, `failure_stage`, `actual_bug`
|
| 76 |
+
- `config`, `logs`, `error_message`
|
| 77 |
+
- `available_tools`, `progress_flags`
|
| 78 |
+
- `file_modification_count`, `hidden_test_pass_rate`, `step_count`, `last_action_error`
|
| 79 |
+
|
| 80 |
+
Action schema:
|
| 81 |
+
|
| 82 |
+
- `tool`: one of `read_file`, `read_logs`, `analyze_error`, `edit_config`, `run_pipeline_stage`, `run_tests`, `validate_fix`, `submit_solution`
|
| 83 |
+
- `payload`: optional dict (for example `{ "raw": "replace npm tset with npm test" }`)
|
| 84 |
+
|
| 85 |
+
Reward schema:
|
| 86 |
+
|
| 87 |
+
- `value`: bounded float in `[0.0, 1.0]`
|
| 88 |
+
- `components`: reward breakdown dictionary
|
| 89 |
+
|
| 90 |
+
## 4. Core Modules
|
| 91 |
+
|
| 92 |
+
### 4.1 Quality Judge
|
| 93 |
+
|
| 94 |
+
- File: env/graders/llm_judge.py
|
| 95 |
+
- Purpose: quality-aware scoring of fixes
|
| 96 |
+
- Output keys: correctness, minimalism, quality (all in [0,1])
|
| 97 |
+
- Guarantees:
|
| 98 |
+
- strict JSON parsing attempt
|
| 99 |
+
- robust fallback parsing for messy output
|
| 100 |
+
- no-crash behavior (safe zero scores on failure)
|
| 101 |
+
|
| 102 |
+
### 4.2 Deterministic Grader
|
| 103 |
+
|
| 104 |
+
- File: env/graders/deterministic.py
|
| 105 |
+
- Purpose: reproducible correctness scoring (0-1)
|
| 106 |
+
- Checks:
|
| 107 |
+
- YAML validity
|
| 108 |
+
- command and fix correctness
|
| 109 |
+
- similarity and issue resolution
|
| 110 |
+
- Rules:
|
| 111 |
+
- deterministic only
|
| 112 |
+
- same input, same score
|
| 113 |
+
|
| 114 |
+
### 4.3 Anti-Hacking Detector
|
| 115 |
+
|
| 116 |
+
- File: env/anti_hacking.py
|
| 117 |
+
- Purpose: detect reward-hacking and shortcut behavior
|
| 118 |
+
- Penalty detectors:
|
| 119 |
+
- stage skipping (if: false, when: never)
|
| 120 |
+
- fake success (echo tests passed, unsafe exit 0 patterns)
|
| 121 |
+
- pipeline breakage between versions
|
| 122 |
+
- excessive edits
|
| 123 |
+
- timeout abuse via too many steps
|
| 124 |
+
|
| 125 |
+
### 4.4 Hidden Tests
|
| 126 |
+
|
| 127 |
+
- File: env/hidden_tests.py
|
| 128 |
+
- Purpose: test fix robustness, not just exact-match overfitting
|
| 129 |
+
- Method:
|
| 130 |
+
- deterministic variant generation (OS, versions, env shifts)
|
| 131 |
+
- evaluate pass rate across variants
|
| 132 |
+
|
| 133 |
+
### 4.5 Reward Shaping
|
| 134 |
+
|
| 135 |
+
- File: env/rewards.py
|
| 136 |
+
- Purpose: step-level learning signal
|
| 137 |
+
- Components:
|
| 138 |
+
- progress rewards (logs, analysis, fix proposal)
|
| 139 |
+
- execution rewards (pipeline run, tests pass)
|
| 140 |
+
- quality rewards (deterministic + hidden tests + LLM judge)
|
| 141 |
+
- anti-hacking penalties
|
| 142 |
+
|
| 143 |
+
## 5. Inference and Evaluation
|
| 144 |
+
|
| 145 |
+
### 5.1 Prompt and Model Layers
|
| 146 |
+
|
| 147 |
+
- inference/prompts.py: stable prompt templates and fallback action heuristics
|
| 148 |
+
- inference/model_wrapper.py: OpenAI client action generation, candidate generation, and safe fallback
|
| 149 |
+
|
| 150 |
+
Canonical action tools used by environment and inference:
|
| 151 |
+
|
| 152 |
+
- read_file
|
| 153 |
+
- read_logs
|
| 154 |
+
- analyze_error
|
| 155 |
+
- edit_config
|
| 156 |
+
- run_pipeline_stage
|
| 157 |
+
- run_tests
|
| 158 |
+
- validate_fix
|
| 159 |
+
- submit_solution
|
| 160 |
+
|
| 161 |
+
### 5.2 Metrics and Artifacts
|
| 162 |
+
|
| 163 |
+
- inference/metrics.py: reward, success-rate, and failure reason tracking
|
| 164 |
+
- inference/visualize.py: reward curve and metrics artifact export
|
| 165 |
+
|
| 166 |
+
### 5.3 Submission-Critical Runtime
|
| 167 |
+
|
| 168 |
+
- File: inference.py (root)
|
| 169 |
+
- Responsibilities:
|
| 170 |
+
- initialize model and environment
|
| 171 |
+
- run step loop
|
| 172 |
+
- calculate rewards
|
| 173 |
+
- emit strict stdout contract
|
| 174 |
+
- always emit END line
|
| 175 |
+
|
| 176 |
+
Required output format:
|
| 177 |
+
|
| 178 |
+
- [START] task=... env=... model=...
|
| 179 |
+
- [STEP] step=<n> action=... reward=0.00 done=<true|false> error=<msg|null>
|
| 180 |
+
- [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
|
| 181 |
+
|
| 182 |
+
Rules enforced:
|
| 183 |
+
|
| 184 |
+
- single-line logs only
|
| 185 |
+
- reward values with 2 decimals
|
| 186 |
+
- lowercase booleans
|
| 187 |
+
- no extra runtime log noise
|
| 188 |
+
|
| 189 |
+
## 6. Task Coverage
|
| 190 |
+
|
| 191 |
+
The project includes 9 CI-fix tasks spanning:
|
| 192 |
+
|
| 193 |
+
- easy: syntax and typo fixes
|
| 194 |
+
- medium: dependency/env/cache/permissions issues
|
| 195 |
+
- hard: matrix logic, conditional flow, orchestration-level failures
|
| 196 |
+
|
| 197 |
+
Representative baseline tasks (one per difficulty):
|
| 198 |
+
|
| 199 |
+
- easy: `easy-command-typo` (fix invalid `npm tset` command)
|
| 200 |
+
- medium: `medium-python-version` (align workflow Python version)
|
| 201 |
+
- hard: `hard-needs-order` (repair deploy job dependency ordering)
|
| 202 |
+
|
| 203 |
+
## 7. Setup
|
| 204 |
+
|
| 205 |
+
```bash
|
| 206 |
+
python3 -m venv .venv
|
| 207 |
+
source .venv/bin/activate
|
| 208 |
+
pip install -r requirements.txt
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
Environment variables:
|
| 212 |
+
|
| 213 |
+
```bash
|
| 214 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 215 |
+
export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
|
| 216 |
+
export HF_TOKEN="<your_openai_compatible_api_key>"
|
| 217 |
+
# Optional alias; if set, this takes precedence over HF_TOKEN in inference.py
|
| 218 |
+
export OPENAI_API_KEY="<same_token_optional>"
|
| 219 |
+
# Optional, only if your inference spins environments from local images.
|
| 220 |
+
export LOCAL_IMAGE_NAME="<local_env_image_name>"
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
If you want to use an OpenAI access token directly:
|
| 224 |
+
|
| 225 |
+
```bash
|
| 226 |
+
export API_BASE_URL="https://api.openai.com/v1"
|
| 227 |
+
export MODEL_NAME="gpt-4o-mini"
|
| 228 |
+
export HF_TOKEN="<your_openai_access_token>"
|
| 229 |
+
# Optional alias:
|
| 230 |
+
export OPENAI_API_KEY="<same_token_optional>"
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
## 8. Run Inference
|
| 234 |
+
|
| 235 |
+
Offline/local mode:
|
| 236 |
+
|
| 237 |
+
```bash
|
| 238 |
+
python inference.py --offline --force-local-env --max-steps 8 --policy-mode imp --trajectories 4
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
Model-backed mode:
|
| 242 |
+
|
| 243 |
+
```bash
|
| 244 |
+
python inference.py --max-steps 8 --policy-mode imp --trajectories 4
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
Run baseline across easy/medium/hard tasks:
|
| 248 |
+
|
| 249 |
+
OpenAI client mode:
|
| 250 |
+
|
| 251 |
+
```bash
|
| 252 |
+
OPENAI_API_KEY="<your_openai_compatible_api_key>" python baseline_inference.py --max-steps 5 --policy-mode imp --trajectories 3 --force-local-env
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
Offline reproducible mode:
|
| 256 |
+
|
| 257 |
+
```bash
|
| 258 |
+
python baseline_inference.py --max-steps 5 --policy-mode imp --trajectories 3 --offline --force-local-env
|
| 259 |
+
```
|
| 260 |
+
|
| 261 |
+
Policy modes:
|
| 262 |
+
|
| 263 |
+
- sft: deterministic heuristic policy
|
| 264 |
+
- direct: single model action per step
|
| 265 |
+
- imp: multi-candidate generation and ranking
|
| 266 |
+
|
| 267 |
+
## 9. Baseline Scores
|
| 268 |
+
|
| 269 |
+
Reproducible baseline artifact:
|
| 270 |
+
|
| 271 |
+
- `artifacts/baseline_scores.json`
|
| 272 |
+
|
| 273 |
+
Latest baseline run (`max_steps=5`, `policy_mode=imp`, `trajectories=3`):
|
| 274 |
+
|
| 275 |
+
| Task ID | Difficulty | Score | Success |
|
| 276 |
+
|---|---|---:|---:|
|
| 277 |
+
| easy-command-typo | easy | 0.541 | false |
|
| 278 |
+
| medium-python-version | medium | 0.679 | false |
|
| 279 |
+
| hard-needs-order | hard | 0.513 | false |
|
| 280 |
+
|
| 281 |
+
Aggregate:
|
| 282 |
+
|
| 283 |
+
- average score: `0.578`
|
| 284 |
+
- success rate: `0.000`
|
| 285 |
+
|
| 286 |
+
When `OPENAI_API_KEY` is provided, the same script runs with the OpenAI API client path in `inference.py`.
|
| 287 |
+
|
| 288 |
+
## 10. Tests
|
| 289 |
+
|
| 290 |
+
Run all tests:
|
| 291 |
+
|
| 292 |
+
```bash
|
| 293 |
+
python -m unittest discover -s tests -v
|
| 294 |
+
```
|
| 295 |
+
|
| 296 |
+
Coverage includes:
|
| 297 |
+
|
| 298 |
+
- LLM judge
|
| 299 |
+
- deterministic grader
|
| 300 |
+
- anti-hacking detectors
|
| 301 |
+
- hidden tests
|
| 302 |
+
- reward system
|
| 303 |
+
- end-to-end inference output format
|
| 304 |
+
|
| 305 |
+
## 11. Validation and Submission
|
| 306 |
+
|
| 307 |
+
OpenEnv validation:
|
| 308 |
+
|
| 309 |
+
```bash
|
| 310 |
+
python -m openenv.cli.__main__ validate
|
| 311 |
+
```
|
| 312 |
+
|
| 313 |
+
Pre-submission script:
|
| 314 |
+
|
| 315 |
+
```bash
|
| 316 |
+
./validate-submission.sh <your_hf_space_url>
|
| 317 |
+
```
|
| 318 |
+
|
| 319 |
+
Required environment variables:
|
| 320 |
+
|
| 321 |
+
```bash
|
| 322 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 323 |
+
export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
|
| 324 |
+
export OPENAI_API_KEY="<your_openai_compatible_api_key>"
|
| 325 |
+
# Optional fallback:
|
| 326 |
+
export HF_TOKEN="<your_token>"
|
| 327 |
+
```
|
| 328 |
+
|
| 329 |
+
Docker run (Space/API mode):
|
| 330 |
+
|
| 331 |
+
```bash
|
| 332 |
+
docker build -t cicd-debugger-env .
|
| 333 |
+
docker run --rm -p 7860:7860 cicd-debugger-env
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
Server endpoints used by validators:
|
| 337 |
+
|
| 338 |
+
- `POST /reset`
|
| 339 |
+
- `POST /step`
|
| 340 |
+
- `GET /state`
|
| 341 |
+
- `GET /health`
|
| 342 |
+
|
| 343 |
+
## 12. Deploy to Hugging Face Space (OpenAI Token)
|
| 344 |
+
|
| 345 |
+
This repository is already configured for Docker Spaces (`sdk: docker` in this README front matter).
|
| 346 |
+
|
| 347 |
+
1. Create a new Hugging Face Space with SDK set to `Docker`.
|
| 348 |
+
2. Push this repository to the Space git remote.
|
| 349 |
+
3. In Space Settings -> Variables and secrets, add these Secrets:
|
| 350 |
+
|
| 351 |
+
```text
|
| 352 |
+
OPENAI_API_KEY=<your_openai_access_token>
|
| 353 |
+
API_BASE_URL=https://api.openai.com/v1
|
| 354 |
+
MODEL_NAME=gpt-4o-mini
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
4. Optional Secrets:
|
| 358 |
+
|
| 359 |
+
```text
|
| 360 |
+
HF_TOKEN=<optional_fallback_token>
|
| 361 |
+
OFFLINE_INFERENCE=0
|
| 362 |
+
MAX_STEPS=8
|
| 363 |
+
TEMPERATURE=0.2
|
| 364 |
+
MAX_TOKENS=120
|
| 365 |
+
```
|
| 366 |
+
|
| 367 |
+
5. Keep the app port as `7860` (already configured).
|
| 368 |
+
6. Wait for build completion, then verify:
|
| 369 |
+
|
| 370 |
+
```bash
|
| 371 |
+
curl -sS https://<your-space-name>.hf.space/health
|
| 372 |
+
curl -sS -X POST https://<your-space-name>.hf.space/reset -H 'Content-Type: application/json' -d '{}'
|
| 373 |
+
```
|
| 374 |
+
|
| 375 |
+
Notes:
|
| 376 |
+
|
| 377 |
+
- `.env.example` is for local development reference only. Hugging Face Spaces use Secrets/Variables from Space Settings.
|
| 378 |
+
- Runtime code reads `OPENAI_API_KEY` first and falls back to `HF_TOKEN` when `OPENAI_API_KEY` is not provided.
|
| 379 |
+
|
| 380 |
+
## 13. One-line Presentation Summary
|
| 381 |
+
|
| 382 |
+
We built an OpenEnv-compliant reinforcement learning environment where AI agents learn to debug real CI/CD pipelines using multi-step reasoning, hybrid grading, anti-hacking safeguards, and robust reward shaping.
|
artifacts/baseline_scores.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"mode": "offline",
|
| 3 |
+
"model_name": "Qwen/Qwen2.5-72B-Instruct",
|
| 4 |
+
"api_base_url": "https://router.huggingface.co/v1",
|
| 5 |
+
"max_steps": 5,
|
| 6 |
+
"policy_mode": "imp",
|
| 7 |
+
"trajectories": 3,
|
| 8 |
+
"average_score": 0.578,
|
| 9 |
+
"success_rate": 0.0,
|
| 10 |
+
"results": [
|
| 11 |
+
{
|
| 12 |
+
"task_id": "easy-command-typo",
|
| 13 |
+
"difficulty": "easy",
|
| 14 |
+
"success": false,
|
| 15 |
+
"steps": 5,
|
| 16 |
+
"score": 0.541,
|
| 17 |
+
"rewards": [
|
| 18 |
+
0.3,
|
| 19 |
+
0.36,
|
| 20 |
+
0.57,
|
| 21 |
+
0.63,
|
| 22 |
+
0.84
|
| 23 |
+
],
|
| 24 |
+
"start_line": "[START] task=easy-command-typo env=cicd_debugger_env model=Qwen/Qwen2.5-72B-Instruct",
|
| 25 |
+
"end_line": "[END] success=false steps=5 score=0.541 rewards=0.30,0.36,0.57,0.63,0.84"
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"task_id": "medium-python-version",
|
| 29 |
+
"difficulty": "medium",
|
| 30 |
+
"success": false,
|
| 31 |
+
"steps": 5,
|
| 32 |
+
"score": 0.679,
|
| 33 |
+
"rewards": [
|
| 34 |
+
0.48,
|
| 35 |
+
0.54,
|
| 36 |
+
0.58,
|
| 37 |
+
0.79,
|
| 38 |
+
1.0
|
| 39 |
+
],
|
| 40 |
+
"start_line": "[START] task=medium-python-version env=cicd_debugger_env model=Qwen/Qwen2.5-72B-Instruct",
|
| 41 |
+
"end_line": "[END] success=false steps=5 score=0.679 rewards=0.48,0.54,0.58,0.79,1.00"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"task_id": "hard-needs-order",
|
| 45 |
+
"difficulty": "hard",
|
| 46 |
+
"success": false,
|
| 47 |
+
"steps": 5,
|
| 48 |
+
"score": 0.513,
|
| 49 |
+
"rewards": [
|
| 50 |
+
0.48,
|
| 51 |
+
0.54,
|
| 52 |
+
0.52,
|
| 53 |
+
0.57,
|
| 54 |
+
0.46
|
| 55 |
+
],
|
| 56 |
+
"start_line": "[START] task=hard-needs-order env=cicd_debugger_env model=Qwen/Qwen2.5-72B-Instruct",
|
| 57 |
+
"end_line": "[END] success=false steps=5 score=0.513 rewards=0.48,0.54,0.52,0.57,0.46"
|
| 58 |
+
}
|
| 59 |
+
]
|
| 60 |
+
}
|
artifacts/metrics.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"average_reward": 0.6719,
|
| 3 |
+
"failure_reasons": {},
|
| 4 |
+
"steps": 7,
|
| 5 |
+
"success_rate": 0.1429,
|
| 6 |
+
"total_reward": 4.7032
|
| 7 |
+
}
|
artifacts/reward_curve.csv
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
step,reward
|
| 2 |
+
1,0.3016
|
| 3 |
+
2,0.3616
|
| 4 |
+
3,0.5700
|
| 5 |
+
4,0.6300
|
| 6 |
+
5,0.8400
|
| 7 |
+
6,1.0000
|
| 8 |
+
7,1.0000
|
artifacts/success_rate.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
episode,success,success_rate
|
| 2 |
+
1,1,1.0000
|
baseline_inference.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import re
|
| 8 |
+
import subprocess
|
| 9 |
+
import sys
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
BASELINE_TASKS: list[tuple[str, str]] = [
|
| 14 |
+
("easy-command-typo", "easy"),
|
| 15 |
+
("medium-python-version", "medium"),
|
| 16 |
+
("hard-needs-order", "hard"),
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
END_PATTERN = re.compile(
|
| 20 |
+
r"^\[END\] success=(true|false) steps=(\d+) score=(\d+\.\d{3}) rewards=(.*)$"
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def parse_args() -> argparse.Namespace:
|
| 25 |
+
parser = argparse.ArgumentParser(description="Run baseline inference on easy/medium/hard tasks")
|
| 26 |
+
parser.add_argument("--tasks", default=",".join(task for task, _ in BASELINE_TASKS))
|
| 27 |
+
parser.add_argument("--max-steps", type=int, default=int(os.getenv("MAX_STEPS", "8")))
|
| 28 |
+
parser.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
|
| 29 |
+
parser.add_argument("--trajectories", type=int, default=3)
|
| 30 |
+
parser.add_argument("--benchmark", default=os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env"))
|
| 31 |
+
parser.add_argument("--offline", action="store_true", default=False)
|
| 32 |
+
parser.add_argument("--force-local-env", action="store_true", default=True)
|
| 33 |
+
parser.add_argument("--output", default="artifacts/baseline_scores.json")
|
| 34 |
+
return parser.parse_args()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def should_run_offline(args: argparse.Namespace) -> bool:
|
| 38 |
+
if args.offline:
|
| 39 |
+
return True
|
| 40 |
+
|
| 41 |
+
key = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN")
|
| 42 |
+
if not key:
|
| 43 |
+
return True
|
| 44 |
+
|
| 45 |
+
return os.getenv("OFFLINE_INFERENCE", "0") == "1"
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def parse_end_line(lines: list[str]) -> dict[str, Any]:
|
| 49 |
+
for raw_line in reversed(lines):
|
| 50 |
+
line = raw_line.strip()
|
| 51 |
+
if not line.startswith("[END] "):
|
| 52 |
+
continue
|
| 53 |
+
|
| 54 |
+
matched = END_PATTERN.match(line)
|
| 55 |
+
if not matched:
|
| 56 |
+
raise RuntimeError(f"Malformed END line: {line}")
|
| 57 |
+
|
| 58 |
+
success = matched.group(1) == "true"
|
| 59 |
+
steps = int(matched.group(2))
|
| 60 |
+
score = float(matched.group(3))
|
| 61 |
+
rewards_str = matched.group(4).strip()
|
| 62 |
+
|
| 63 |
+
rewards: list[float] = []
|
| 64 |
+
if rewards_str:
|
| 65 |
+
rewards = [float(value) for value in rewards_str.split(",") if value]
|
| 66 |
+
|
| 67 |
+
return {
|
| 68 |
+
"success": success,
|
| 69 |
+
"steps": steps,
|
| 70 |
+
"score": score,
|
| 71 |
+
"rewards": rewards,
|
| 72 |
+
"end_line": line,
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
raise RuntimeError("No END line found in inference output")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def run_single_task(
|
| 79 |
+
task_id: str,
|
| 80 |
+
difficulty: str,
|
| 81 |
+
args: argparse.Namespace,
|
| 82 |
+
project_root: Path,
|
| 83 |
+
offline_mode: bool,
|
| 84 |
+
) -> dict[str, Any]:
|
| 85 |
+
command = [
|
| 86 |
+
sys.executable,
|
| 87 |
+
"inference.py",
|
| 88 |
+
"--task",
|
| 89 |
+
task_id,
|
| 90 |
+
"--benchmark",
|
| 91 |
+
str(args.benchmark),
|
| 92 |
+
"--max-steps",
|
| 93 |
+
str(max(1, int(args.max_steps))),
|
| 94 |
+
"--policy-mode",
|
| 95 |
+
str(args.policy_mode),
|
| 96 |
+
"--trajectories",
|
| 97 |
+
str(max(1, int(args.trajectories))),
|
| 98 |
+
]
|
| 99 |
+
|
| 100 |
+
if offline_mode:
|
| 101 |
+
command.append("--offline")
|
| 102 |
+
if args.force_local_env:
|
| 103 |
+
command.append("--force-local-env")
|
| 104 |
+
|
| 105 |
+
env = os.environ.copy()
|
| 106 |
+
if offline_mode:
|
| 107 |
+
env["OFFLINE_INFERENCE"] = "1"
|
| 108 |
+
|
| 109 |
+
completed = subprocess.run(
|
| 110 |
+
command,
|
| 111 |
+
cwd=project_root,
|
| 112 |
+
capture_output=True,
|
| 113 |
+
text=True,
|
| 114 |
+
env=env,
|
| 115 |
+
check=True,
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
lines = [line for line in completed.stdout.splitlines() if line.strip()]
|
| 119 |
+
summary = parse_end_line(lines)
|
| 120 |
+
|
| 121 |
+
return {
|
| 122 |
+
"task_id": task_id,
|
| 123 |
+
"difficulty": difficulty,
|
| 124 |
+
"success": summary["success"],
|
| 125 |
+
"steps": summary["steps"],
|
| 126 |
+
"score": summary["score"],
|
| 127 |
+
"rewards": summary["rewards"],
|
| 128 |
+
"start_line": next((line for line in lines if line.startswith("[START] ")), ""),
|
| 129 |
+
"end_line": summary["end_line"],
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def main() -> int:
    """Run the baseline sweep: one inference.py subprocess per requested task,
    then aggregate scores into a JSON report at --output. Returns a process
    exit code (0 on success)."""
    args = parse_args()
    project_root = Path(__file__).resolve().parent

    # Map task id -> difficulty for the known baseline tasks; any other task
    # requested on the command line is labelled "custom".
    known_difficulties = {task: difficulty for task, difficulty in BASELINE_TASKS}
    requested_tasks = [task.strip() for task in str(args.tasks).split(",") if task.strip()]

    if not requested_tasks:
        print("No tasks provided for baseline run", file=sys.stderr)
        return 1

    offline_mode = should_run_offline(args)

    print(
        f"[BASELINE] mode={'offline' if offline_mode else 'openai'} tasks={len(requested_tasks)} "
        f"max_steps={max(1, int(args.max_steps))} policy={args.policy_mode}",
        flush=True,
    )

    results: list[dict[str, Any]] = []
    for task_id in requested_tasks:
        difficulty = known_difficulties.get(task_id, "custom")
        try:
            result = run_single_task(task_id, difficulty, args, project_root, offline_mode)
            results.append(result)
            print(
                f"[BASELINE] task={task_id} difficulty={difficulty} success={str(result['success']).lower()} "
                f"score={result['score']:.3f} steps={result['steps']}",
                flush=True,
            )
        except subprocess.CalledProcessError as exc:
            # The child inference process exited non-zero: surface its output
            # on stderr and propagate its return code so the run fails loudly.
            print(f"[BASELINE] task={task_id} failed with return code {exc.returncode}", file=sys.stderr)
            if exc.stdout:
                print(exc.stdout, file=sys.stderr)
            if exc.stderr:
                print(exc.stderr, file=sys.stderr)
            return exc.returncode or 1
        except Exception as exc:
            print(f"[BASELINE] task={task_id} failed: {exc}", file=sys.stderr)
            return 1

    # requested_tasks is non-empty and any failure returned early, so results
    # is guaranteed non-empty here (no ZeroDivisionError).
    average_score = sum(item["score"] for item in results) / len(results)
    success_rate = sum(1 for item in results if item["success"]) / len(results)

    payload = {
        "mode": "offline" if offline_mode else "openai",
        "model_name": os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct"),
        "api_base_url": os.getenv("API_BASE_URL", "https://router.huggingface.co/v1"),
        "max_steps": max(1, int(args.max_steps)),
        "policy_mode": str(args.policy_mode),
        "trajectories": max(1, int(args.trajectories)),
        "average_score": round(float(average_score), 3),
        "success_rate": round(float(success_rate), 3),
        "results": results,
    }

    # Resolve the report path relative to the repo root so artifacts/ paths
    # work regardless of the caller's cwd.
    output_path = project_root / str(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")

    print(f"[BASELINE] average_score={payload['average_score']:.3f} success_rate={payload['success_rate']:.3f}", flush=True)
    print(f"[BASELINE] wrote {output_path}", flush=True)

    return 0
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())
|
inference.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import asyncio
|
| 5 |
+
import os
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from openai import OpenAI
|
| 9 |
+
|
| 10 |
+
from env.environment import CICDDebuggerEnvironment, REQUIRED_TOOLS
|
| 11 |
+
from inference.metrics import EpisodeMetrics
|
| 12 |
+
from inference.model_wrapper import ModelWrapper, score_action_candidate
|
| 13 |
+
from inference.prompts import heuristic_action
|
| 14 |
+
from inference.visualize import save_metrics_json, save_reward_curve, save_success_rate_history
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# OpenAI-compatible endpoint configuration; the HF router is the default
# profile (see .env.example).
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# OPENAI_API_KEY takes precedence over HF_TOKEN when both are set.
API_KEY = OPENAI_API_KEY or HF_TOKEN
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
DEFAULT_TASK_ID = os.getenv("MY_ENV_V4_TASK", "easy-command-typo")
DEFAULT_BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env")

# Episode and generation knobs, all overridable via environment variables.
MAX_STEPS_DEFAULT = int(os.getenv("MAX_STEPS", "8"))
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "120"))
# When "1", skip the API entirely and use the heuristic policy.
OFFLINE_INFERENCE = os.getenv("OFFLINE_INFERENCE", "0") == "1"
# Minimum clamped average reward for an episode to count as a success.
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.1"))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def log_start(task: str, env_name: str, model: str) -> None:
    """Emit the machine-readable [START] banner for an episode."""
    fields = (
        f"task={_single_line(task)}",
        f"env={_single_line(env_name)}",
        f"model={_single_line(model)}",
    )
    print("[START] " + " ".join(fields), flush=True)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
    """Emit one machine-readable [STEP] line; error renders as 'null' when absent."""
    rendered_error = _single_line(error) if error else "null"
    print(
        f"[STEP] step={step} action={_single_line(action)} "
        f"reward={reward:.2f} done={str(done).lower()} error={rendered_error}",
        flush=True,
    )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Emit the final [END] summary line with a comma-joined reward list."""
    formatted_rewards = ",".join(f"{reward:.2f}" for reward in rewards)
    summary = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={formatted_rewards}"
    )
    print(summary, flush=True)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _single_line(value: Any) -> str:
|
| 50 |
+
return " ".join(str(value).replace("\n", " ").replace("\r", " ").split())
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _is_hacking_action(action_text: str) -> bool:
|
| 54 |
+
value = (action_text or "").lower()
|
| 55 |
+
patterns = (
|
| 56 |
+
"if: false",
|
| 57 |
+
"when: never",
|
| 58 |
+
"echo \"tests passed\"",
|
| 59 |
+
"echo 'tests passed'",
|
| 60 |
+
"exit 0",
|
| 61 |
+
"force success",
|
| 62 |
+
"status: success",
|
| 63 |
+
)
|
| 64 |
+
return any(token in value for token in patterns)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _extract_error(info: dict[str, Any] | None) -> str | None:
|
| 68 |
+
if not info:
|
| 69 |
+
return None
|
| 70 |
+
error = info.get("error")
|
| 71 |
+
return str(error) if error else None
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _extract_observation_fields(observation: dict[str, Any]) -> tuple[str, str, list[str]]:
|
| 75 |
+
config_text = str(observation.get("config") or "")
|
| 76 |
+
error_message = str(observation.get("error_message") or "")
|
| 77 |
+
tools = [str(item) for item in (observation.get("available_tools") or REQUIRED_TOOLS)]
|
| 78 |
+
return config_text, error_message, tools
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _tool_from_action(action_text: str) -> str:
|
| 82 |
+
return str(action_text or "").split(":", 1)[0].strip().lower()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _is_action_allowed(action_text: str, available_tools: list[str]) -> bool:
    """True when the action's tool prefix matches an available tool (case-insensitive)."""
    permitted = {tool.lower() for tool in available_tools}
    return _tool_from_action(action_text) in permitted
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _normalize_action(action_text: str, available_tools: list[str], fallback: str) -> str:
    """Canonicalize a raw action string against the allowed tool set.

    Empty input, and any action whose (alias-resolved) tool is not available,
    collapses to the fallback action.
    """
    candidate = str(action_text or "").strip()
    if not candidate:
        return fallback

    # Common shorthand tool names the model tends to emit.
    alias_map = {
        "run_stage": "run_pipeline_stage",
        "validate": "validate_fix",
        "submit": "submit_solution",
        "submit_fix": "submit_solution",
    }
    original_tool = _tool_from_action(candidate)
    canonical_tool = alias_map.get(original_tool, original_tool)
    if canonical_tool != original_tool:
        # Rebuild the action with the canonical tool, keeping any payload.
        _head, _sep, remainder = candidate.partition(":")
        remainder = remainder.strip()
        candidate = f"{canonical_tool}: {remainder}" if remainder else canonical_tool

    if _is_action_allowed(candidate, available_tools):
        return candidate
    return fallback
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _select_action(
    model_wrapper: ModelWrapper,
    step: int,
    config_text: str,
    error_message: str,
    history: list[str],
    available_actions: list[str],
    policy_mode: str,
    trajectories: int,
) -> str:
    """Pick the next action according to the policy mode.

    Modes:
      "sft"    -> pure heuristic policy, no model call.
      "direct" -> one model completion, normalized against available tools.
      "imp"    -> (default) best-of-N: sample candidates and keep the one
                  with the highest heuristic score.
    Every result is normalized; unknown or disallowed actions fall back to
    the heuristic suggestion.
    """
    mode = (policy_mode or "imp").lower()
    # The heuristic suggestion doubles as the fallback for every mode.
    fallback = heuristic_action(config_text, error_message, available_actions, history)

    if mode == "sft":
        return _normalize_action(fallback, available_actions, fallback)

    if mode == "direct":
        action = model_wrapper.generate_action(
            step=step,
            config_text=config_text,
            error_message=error_message,
            history=history,
            available_actions=available_actions,
        )
        return _normalize_action(action, available_actions, fallback)

    # "imp" mode: sample N candidate actions, then re-rank them.
    candidates = model_wrapper.generate_candidates(
        step=step,
        config_text=config_text,
        error_message=error_message,
        history=history,
        count=max(1, int(trajectories)),
        available_actions=available_actions,
    )

    if not candidates:
        return _normalize_action(fallback, available_actions, fallback)

    # Rank candidates against the raw observation; reward-hacking actions
    # are penalized inside score_action_candidate via _is_hacking_action.
    observation_text = f"{config_text}\n{error_message}"
    best = max(candidates, key=lambda item: score_action_candidate(observation_text, item, _is_hacking_action))
    return _normalize_action(best, available_actions, fallback)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def parse_args() -> argparse.Namespace:
    """CLI arguments for one inference episode; env vars provide the defaults."""
    parser = argparse.ArgumentParser(description="Run CI/CD debugger inference loop")
    parser.add_argument("--max-steps", type=int, default=MAX_STEPS_DEFAULT)
    parser.add_argument("--task", default=DEFAULT_TASK_ID)
    parser.add_argument("--benchmark", default=DEFAULT_BENCHMARK)
    parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default=None)
    # Defaults to the OFFLINE_INFERENCE env flag so parent processes can force
    # offline mode without changing the command line.
    parser.add_argument("--offline", action="store_true", default=OFFLINE_INFERENCE)
    parser.add_argument("--force-local-env", action="store_true", default=False)
    parser.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
    parser.add_argument("--trajectories", type=int, default=3)
    return parser.parse_args()
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
async def run_episode(args: argparse.Namespace) -> int:
    """Run one full episode against CICDDebuggerEnvironment.

    Always returns 0; the outcome is communicated via the [START]/[STEP]/[END]
    log lines and the artifacts files written in the finally block.
    """
    history: list[str] = []
    steps_taken = 0
    success = False
    episode_completed_cleanly = False
    metrics = EpisodeMetrics()

    env = CICDDebuggerEnvironment(max_steps=max(1, int(args.max_steps)))

    # Fall back to the offline heuristic policy when no API key is configured
    # or the OpenAI client cannot be constructed.
    offline_mode = bool(args.offline or not API_KEY)
    client: OpenAI | None = None
    if not offline_mode:
        try:
            client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
        except Exception:
            client = None
            offline_mode = True

    model_wrapper = ModelWrapper(
        client=client,
        model_name=MODEL_NAME,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        offline=offline_mode,
    )

    log_start(task=str(args.task), env_name=str(args.benchmark), model=MODEL_NAME)

    try:
        observation = await env.reset(task_id=str(args.task), difficulty=args.difficulty)

        # Steps are 1-indexed in logs and history.
        for step in range(1, max(1, int(args.max_steps)) + 1):
            config_text, error_message, available_tools = _extract_observation_fields(observation)

            action_text = _select_action(
                model_wrapper=model_wrapper,
                step=step,
                config_text=config_text,
                error_message=error_message,
                history=history,
                available_actions=available_tools,
                policy_mode=str(args.policy_mode),
                trajectories=max(1, int(args.trajectories)),
            )

            observation, reward, done, info = await env.step(action_text)
            step_error = _extract_error(info)

            metrics.add_step(action=action_text, reward=float(reward), error=step_error, done=bool(done))
            steps_taken = step

            log_step(step=step, action=action_text, reward=float(reward), done=bool(done), error=step_error)
            history.append(f"step={step} action={_single_line(action_text)} reward={float(reward):.2f}")

            if done:
                # A "clean" finish: terminal step had no error and the final
                # action was not a reward-hacking shortcut.
                episode_completed_cleanly = step_error is None and not _is_hacking_action(action_text)
                break

    except Exception as exc:
        success = False
        # Record at least one step so metrics/artifacts are never empty.
        if not metrics.rewards:
            metrics.add_step(action="system_error", reward=0.0, error=str(exc), done=True)
    finally:
        # Score = clamped average step reward; success additionally requires a
        # clean terminal step and the configured threshold.
        score = max(0.0, min(1.0, float(metrics.average_reward)))
        success = episode_completed_cleanly and score >= SUCCESS_SCORE_THRESHOLD

        # Artifact writing is best-effort; never mask the episode result.
        try:
            save_reward_curve(metrics.rewards)
            save_metrics_json(metrics.summary())
            save_success_rate_history([success])
        except Exception:
            pass

        # Environment teardown is likewise best-effort.
        try:
            await env.close()
        except Exception:
            pass

        log_end(success=success, steps=steps_taken, score=score, rewards=metrics.rewards)

    return 0
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def main() -> int:
    """Synchronous entry point: parse CLI arguments and drive one episode."""
    cli_args = parse_args()
    return asyncio.run(run_episode(cli_args))
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())
|
inference/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Package facade: re-export the commonly used inference helpers.
from inference.metrics import EpisodeMetrics
from inference.model_wrapper import ModelWrapper

__all__ = ["EpisodeMetrics", "ModelWrapper"]
|
inference/metrics.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass
|
| 7 |
+
class EpisodeMetrics:
|
| 8 |
+
rewards: list[float] = field(default_factory=list)
|
| 9 |
+
actions: list[str] = field(default_factory=list)
|
| 10 |
+
errors: list[str | None] = field(default_factory=list)
|
| 11 |
+
dones: list[bool] = field(default_factory=list)
|
| 12 |
+
|
| 13 |
+
def add_step(self, action: str, reward: float, error: str | None, done: bool) -> None:
|
| 14 |
+
self.actions.append(action)
|
| 15 |
+
self.rewards.append(float(reward))
|
| 16 |
+
self.errors.append(error)
|
| 17 |
+
self.dones.append(bool(done))
|
| 18 |
+
|
| 19 |
+
@property
|
| 20 |
+
def steps(self) -> int:
|
| 21 |
+
return len(self.rewards)
|
| 22 |
+
|
| 23 |
+
@property
|
| 24 |
+
def total_reward(self) -> float:
|
| 25 |
+
return round(sum(self.rewards), 4)
|
| 26 |
+
|
| 27 |
+
@property
|
| 28 |
+
def average_reward(self) -> float:
|
| 29 |
+
if not self.rewards:
|
| 30 |
+
return 0.0
|
| 31 |
+
return round(self.total_reward / len(self.rewards), 4)
|
| 32 |
+
|
| 33 |
+
@property
|
| 34 |
+
def success_rate(self) -> float:
|
| 35 |
+
if not self.dones:
|
| 36 |
+
return 0.0
|
| 37 |
+
successes = sum(1 for flag in self.dones if flag)
|
| 38 |
+
return round(successes / len(self.dones), 4)
|
| 39 |
+
|
| 40 |
+
@property
|
| 41 |
+
def failure_reasons(self) -> dict[str, int]:
|
| 42 |
+
counts: dict[str, int] = {}
|
| 43 |
+
for err in self.errors:
|
| 44 |
+
if not err:
|
| 45 |
+
continue
|
| 46 |
+
counts[err] = counts.get(err, 0) + 1
|
| 47 |
+
return counts
|
| 48 |
+
|
| 49 |
+
def summary(self) -> dict[str, float | int | dict[str, int]]:
|
| 50 |
+
return {
|
| 51 |
+
"steps": self.steps,
|
| 52 |
+
"total_reward": self.total_reward,
|
| 53 |
+
"average_reward": self.average_reward,
|
| 54 |
+
"success_rate": self.success_rate,
|
| 55 |
+
"failure_reasons": self.failure_reasons,
|
| 56 |
+
}
|
inference/model_wrapper.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Any, Iterable
|
| 5 |
+
|
| 6 |
+
from openai import OpenAI
|
| 7 |
+
|
| 8 |
+
from inference.prompts import REQUIRED_ACTIONS, SYSTEM_PROMPT, build_user_prompt, heuristic_action, sanitize_action_text
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class ModelWrapper:
    """Thin policy wrapper around an (optional) OpenAI-compatible client.

    When ``offline`` is True or no client is available, every call degrades
    to the deterministic heuristic policy from inference.prompts.
    """

    client: OpenAI | None
    model_name: str
    temperature: float
    max_tokens: int
    offline: bool

    def generate_action(
        self,
        step: int,
        config_text: str,
        error_message: str,
        history: list[str],
        available_actions: Iterable[str] | None = None,
    ) -> str:
        """Return one single-line action; falls back to the heuristic on any failure."""
        safe_default = heuristic_action(config_text, error_message, available_actions, history)
        if self.offline or self.client is None:
            return safe_default

        prompt = build_user_prompt(
            step=step,
            config_text=config_text,
            error_message=error_message,
            history=history,
            available_actions=available_actions,
        )

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=max(float(self.temperature), 0.0),
                max_tokens=max(16, int(self.max_tokens)),
                stream=False,
            )
            raw_reply = str(response.choices[0].message.content or "")
            return sanitize_action_text(raw_reply, fallback=safe_default)
        except Exception:
            # Any API/parse failure degrades to the heuristic action.
            return safe_default

    def generate_candidates(
        self,
        step: int,
        config_text: str,
        error_message: str,
        history: list[str],
        count: int,
        available_actions: Iterable[str] | None = None,
    ) -> list[str]:
        """Return a deduplicated candidate pool, seeded with the heuristic action."""
        pool: list[str] = [heuristic_action(config_text, error_message, available_actions, history)]

        for sample_idx in range(max(1, count)):
            # The candidate marker in the history nudges variation per sample.
            sampled = self.generate_action(
                step=step,
                config_text=config_text,
                error_message=error_message,
                history=history + [f"candidate={sample_idx}"],
                available_actions=available_actions,
            )
            if sampled:
                pool.append(sampled)

        unique: list[str] = []
        observed: set[str] = set()
        for entry in pool:
            trimmed = entry.strip()
            if trimmed and trimmed not in observed:
                observed.add(trimmed)
                unique.append(trimmed)
        return unique
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def score_action_candidate(observation_text: str, action_text: str, hacking_predicate: Any) -> float:
    """Heuristically score a candidate action against the observation text.

    Known tool prefixes earn a bonus, observation-specific fix patterns add
    more, reward-hacking actions (per hacking_predicate) are strongly
    penalized, and a small capped length bonus breaks ties.
    """
    obs_lower = (observation_text or "").lower()
    act_lower = (action_text or "").lower()

    total = 0.0

    # Recognized tool prefix: +0.5; anything else: -0.5.
    tool_name = act_lower.split(":", 1)[0].strip()
    total += 0.5 if tool_name in REQUIRED_ACTIONS else -0.5

    # Observation/action pattern bonuses.
    if "npm tset" in obs_lower and "edit_config" in act_lower and "npm test" in act_lower:
        total += 2.0
    if "yaml" in obs_lower and "edit_config" in act_lower:
        total += 1.2
    if "missing" in obs_lower and "analyze_error" in act_lower:
        total += 0.8
    if "validate_fix" in act_lower or "submit_solution" in act_lower:
        total += 0.5

    # Strong penalty for reward-hacking attempts.
    if hacking_predicate(action_text):
        total -= 3.0

    # Small length bonus, capped at 0.2.
    total += min(len(action_text) / 240.0, 0.2)
    return total
|
inference/prompts.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import textwrap
|
| 5 |
+
from typing import Iterable
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# System prompt for the policy model: forces a single-line plain-text action.
SYSTEM_PROMPT = textwrap.dedent(
    """
    You are a CI/CD pipeline debugger assistant.
    Return exactly one single-line action describing the next debugging move.
    Do not output markdown. Do not include explanations.
    """
).strip()

# System prompt for the LLM judge: JSON-only rubric scores in [0, 1].
JUDGE_SYSTEM_PROMPT = textwrap.dedent(
    """
    You are a strict CI/CD judge.
    Return JSON only with keys correctness, minimalism, quality and values in [0,1].
    """
).strip()

# Canonical tool vocabulary; used as the default action space and by
# _extract_seen_tools to detect tools already present in the history.
REQUIRED_ACTIONS = (
    "read_file",
    "read_logs",
    "analyze_error",
    "edit_config",
    "run_pipeline_stage",
    "run_tests",
    "validate_fix",
    "submit_solution",
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def build_user_prompt(
|
| 36 |
+
step: int,
|
| 37 |
+
config_text: str,
|
| 38 |
+
error_message: str,
|
| 39 |
+
history: list[str],
|
| 40 |
+
available_actions: Iterable[str] | None = None,
|
| 41 |
+
) -> str:
|
| 42 |
+
history_text = "\n".join(history[-5:]) if history else "None"
|
| 43 |
+
actions_text = ", ".join(available_actions) if available_actions else ", ".join(REQUIRED_ACTIONS)
|
| 44 |
+
|
| 45 |
+
return textwrap.dedent(
|
| 46 |
+
f"""
|
| 47 |
+
Step: {step}
|
| 48 |
+
|
| 49 |
+
Current config:
|
| 50 |
+
{config_text}
|
| 51 |
+
|
| 52 |
+
Current error:
|
| 53 |
+
{error_message}
|
| 54 |
+
|
| 55 |
+
Recent history:
|
| 56 |
+
{history_text}
|
| 57 |
+
|
| 58 |
+
Available action categories:
|
| 59 |
+
{actions_text}
|
| 60 |
+
|
| 61 |
+
Output one actionable single-line fix/debug action.
|
| 62 |
+
"""
|
| 63 |
+
).strip()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def sanitize_action_text(raw_text: str, fallback: str = "read logs and analyze failing command") -> str:
    """Collapse model output onto a single trimmed line; empty input -> fallback.

    str.split() treats \n and \r as whitespace, so a single split/join pass
    flattens the text and collapses runs of spaces.
    """
    collapsed = " ".join((raw_text or "").split())
    return collapsed or fallback
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def heuristic_action(
    config_text: str,
    error_message: str,
    available_actions: Iterable[str] | None = None,
    history: list[str] | None = None,
) -> str:
    """Deterministic fallback policy: walk a fixed priority ladder.

    Order matters: read logs, analyze, apply a targeted config fix for known
    failure signatures, re-run the pipeline/tests, validate, and finally
    submit. Tools already mentioned in the history are skipped so the policy
    keeps making forward progress across steps.
    """
    lower_cfg = (config_text or "").lower()
    lower_err = (error_message or "").lower()
    seen = _extract_seen_tools(history or [])
    # Tools the environment currently allows; defaults to the full vocabulary.
    allowed = {item.strip() for item in (available_actions or REQUIRED_ACTIONS)}

    def has_tool(name: str) -> bool:
        return name in allowed

    # 1) Always start by reading logs once.
    if has_tool("read_logs") and "read_logs" not in seen:
        return "read_logs: inspect failing stage logs"

    # 2) Then analyze the error once.
    if has_tool("analyze_error") and "analyze_error" not in seen:
        return "analyze_error: identify root cause from logs and config"

    # 3) Targeted config fixes for known failure signatures.
    if has_tool("edit_config") and "npm tset" in lower_cfg:
        return "edit_config: replace npm tset with npm test"

    if has_tool("edit_config") and ("yaml" in lower_err or "mapping values are not allowed" in lower_err):
        return "edit_config: fix YAML indentation and syntax"

    if has_tool("edit_config") and ("module not found" in lower_err or "dependency" in lower_err):
        return "edit_config: repair dependency install and test commands"

    # 4) Re-run the pipeline stage / tests to confirm the fix.
    if has_tool("run_pipeline_stage") and "run_pipeline_stage" not in seen:
        return "run_pipeline_stage: run test stage"

    if has_tool("run_tests") and "run_tests" not in seen:
        return "run_tests: execute full pipeline tests"

    # 5) Validate, then submit.
    if has_tool("validate_fix") and "validate_fix" not in seen:
        return "validate_fix: check deterministic, hidden, and quality scores"

    if has_tool("submit_solution"):
        return "submit_solution: submit current configuration"

    # Last resort when nothing above applies (e.g. an empty tool list).
    return "read_logs: inspect failing stage logs and identify root cause"
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _extract_seen_tools(history: list[str]) -> set[str]:
    """Return the canonical tool names mentioned (word-bounded) in any history line."""
    mentioned: set[str] = set()
    for entry in history:
        for tool in REQUIRED_ACTIONS:
            if tool in mentioned:
                continue
            if re.search(rf"\b{re.escape(tool)}\b", entry):
                mentioned.add(tool)
    return mentioned
|
inference/visualize.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def save_reward_curve(rewards: list[float], output_path: str = "artifacts/reward_curve.csv") -> str:
    """Write the per-step reward curve as a CSV (step,reward) and return its path."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    rows = ["step,reward"]
    rows.extend(f"{step},{float(value):.4f}" for step, value in enumerate(rewards, start=1))
    target.write_text("\n".join(rows) + "\n", encoding="utf-8")

    return str(target)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def save_success_rate_history(success_flags: list[bool], output_path: str = "artifacts/success_rate.csv") -> str:
    """Write per-episode success flags plus the running success rate as CSV."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    rows = ["episode,success,success_rate"]
    wins = 0
    for episode, flag in enumerate(success_flags, start=1):
        if flag:
            wins += 1
        rows.append(f"{episode},{int(flag)},{wins / episode:.4f}")
    target.write_text("\n".join(rows) + "\n", encoding="utf-8")

    return str(target)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def save_metrics_json(metrics: dict, output_path: str = "artifacts/metrics.json") -> str:
    """Serialize the metrics dict as pretty, key-sorted JSON and return the path."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    target.write_text(json.dumps(metrics, indent=2, sort_keys=True), encoding="utf-8")

    return str(target)
|
openenv.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenEnv manifest for the CI/CD debugger environment.
version: "0.2"
name: "cicd-debugger-env"
description: "RL environment for CI/CD debugging with deterministic, hidden, and quality-aware scoring"
metadata:
  domain: "devops"
  real_world_task: "ci-cd pipeline debugging"
  deployment: "huggingface-space-docker"

environment:
  # module:class path loaded by the OpenEnv runner.
  entry_point: "env.environment:CICDDebuggerEnvironment"

interface:
  observation_type: "json"
  action_type: "text"
  max_steps: 30

# Tool vocabulary exposed to the agent (same list as REQUIRED_ACTIONS in
# inference/prompts.py).
action_space:
  tools:
    - read_file
    - read_logs
    - analyze_error
    - edit_config
    - run_pipeline_stage
    - run_tests
    - validate_fix
    - submit_solution

# Nine graded tasks across easy/medium/hard tiers; all use the deterministic grader.
tasks:
  - id: "easy-command-typo"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "easy-missing-checkout"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "easy-yaml-indentation"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "medium-python-version"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "medium-cache-key"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "medium-artifact-permissions"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "hard-matrix-logic"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "hard-conditional-deploy"
    grader: "env.graders.deterministic:DeterministicGrader"

  - id: "hard-needs-order"
    grader: "env.graders.deterministic:DeterministicGrader"
|
pyproject.toml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "cicd-debugger-env"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "OpenEnv CI/CD pipeline debugging environment with hybrid grading and reward shaping"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"pyyaml",
|
| 9 |
+
"fastapi",
|
| 10 |
+
"uvicorn",
|
| 11 |
+
"openenv-core",
|
| 12 |
+
"openai",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
[project.scripts]
|
| 16 |
+
server = "server.app:main"
|
| 17 |
+
|
| 18 |
+
[build-system]
|
| 19 |
+
requires = ["setuptools>=68", "wheel"]
|
| 20 |
+
build-backend = "setuptools.build_meta"
|
| 21 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pyyaml
|
| 2 |
+
fastapi
|
| 3 |
+
uvicorn[standard]
|
| 4 |
+
openenv-core
|
| 5 |
+
openai
|
| 6 |
+
pydantic>=2.0.0
|
| 7 |
+
transformers>=4.30.0
|
| 8 |
+
torch>=2.0.0
|
| 9 |
+
pytest>=7.0.0
|
server/__init__.py
ADDED
|
File without changes
|
server/app.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
import os
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from fastapi import FastAPI
|
| 8 |
+
from fastapi import HTTPException
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
import uvicorn
|
| 11 |
+
|
| 12 |
+
from env.environment import CICDDebuggerEnvironment, MAX_STEPS
|
| 13 |
+
from env.models import Action, Observation, Reward
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
app = FastAPI(title="CI/CD Debugger OpenEnv Server")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ResetRequest(BaseModel):
|
| 20 |
+
task_id: str | None = None
|
| 21 |
+
difficulty: str | None = None
|
| 22 |
+
max_steps: int = Field(default=MAX_STEPS, ge=1, le=100)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class StepRequest(BaseModel):
|
| 26 |
+
action: Action | str | dict[str, Any]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class StepResponse(BaseModel):
|
| 30 |
+
task_id: str
|
| 31 |
+
step_count: int
|
| 32 |
+
reward: float
|
| 33 |
+
reward_model: Reward
|
| 34 |
+
done: bool
|
| 35 |
+
observation: Observation
|
| 36 |
+
last_action: str | None = None
|
| 37 |
+
info: dict[str, Any] = Field(default_factory=dict)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class StateResponse(BaseModel):
|
| 41 |
+
initialized: bool
|
| 42 |
+
task_id: str | None = None
|
| 43 |
+
step_count: int = 0
|
| 44 |
+
done: bool = False
|
| 45 |
+
last_action: str | None = None
|
| 46 |
+
observation: Observation | None = None
|
| 47 |
+
internal_state: dict[str, Any] = Field(default_factory=dict)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@dataclass
|
| 51 |
+
class RuntimeSession:
|
| 52 |
+
env: CICDDebuggerEnvironment
|
| 53 |
+
task_id: str
|
| 54 |
+
step_count: int = 0
|
| 55 |
+
done: bool = False
|
| 56 |
+
last_action: str | None = None
|
| 57 |
+
last_reward: float = 0.0
|
| 58 |
+
last_observation: dict[str, Any] | None = None
|
| 59 |
+
last_info: dict[str, Any] | None = None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
runtime_session: RuntimeSession | None = None
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _as_observation_model(observation: dict[str, Any] | Observation) -> Observation:
|
| 66 |
+
if isinstance(observation, Observation):
|
| 67 |
+
return observation
|
| 68 |
+
return Observation.model_validate(observation)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def _build_step_response(session: RuntimeSession) -> StepResponse:
|
| 72 |
+
observation = session.last_observation or {}
|
| 73 |
+
info_payload = session.last_info or {}
|
| 74 |
+
reward_payload = info_payload.get("reward_model")
|
| 75 |
+
if isinstance(reward_payload, dict):
|
| 76 |
+
reward_model = Reward.model_validate(reward_payload)
|
| 77 |
+
else:
|
| 78 |
+
reward_model = Reward(value=float(session.last_reward), components={"total": float(session.last_reward)})
|
| 79 |
+
|
| 80 |
+
return StepResponse(
|
| 81 |
+
task_id=session.task_id,
|
| 82 |
+
step_count=int(observation.get("step_count") or session.step_count),
|
| 83 |
+
reward=float(session.last_reward),
|
| 84 |
+
reward_model=reward_model,
|
| 85 |
+
done=bool(session.done),
|
| 86 |
+
observation=_as_observation_model(observation),
|
| 87 |
+
last_action=session.last_action,
|
| 88 |
+
info=info_payload,
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@app.get("/")
|
| 93 |
+
def root() -> dict[str, Any]:
|
| 94 |
+
return {
|
| 95 |
+
"message": "CI/CD Debugger Environment is running 🚀",
|
| 96 |
+
"endpoints": ["/health", "/reset", "/step", "/state"],
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@app.get("/health")
|
| 101 |
+
def health() -> dict[str, str]:
|
| 102 |
+
return {"status": "ok"}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@app.post("/reset", response_model=StepResponse)
|
| 106 |
+
async def reset(payload: ResetRequest | None = None) -> StepResponse:
|
| 107 |
+
global runtime_session
|
| 108 |
+
|
| 109 |
+
request = payload or ResetRequest()
|
| 110 |
+
env = CICDDebuggerEnvironment(max_steps=int(request.max_steps))
|
| 111 |
+
observation = await env.reset(task_id=request.task_id, difficulty=request.difficulty)
|
| 112 |
+
|
| 113 |
+
runtime_session = RuntimeSession(
|
| 114 |
+
env=env,
|
| 115 |
+
task_id=str(observation.get("task_id", request.task_id or "cicd-debugger-task")),
|
| 116 |
+
step_count=0,
|
| 117 |
+
done=False,
|
| 118 |
+
last_action=None,
|
| 119 |
+
last_reward=0.0,
|
| 120 |
+
last_observation=observation,
|
| 121 |
+
last_info={
|
| 122 |
+
"message": "environment reset",
|
| 123 |
+
"tool": "reset",
|
| 124 |
+
"error": None,
|
| 125 |
+
"reward_model": Reward(value=0.0, components={"total": 0.0}).model_dump(),
|
| 126 |
+
},
|
| 127 |
+
)
|
| 128 |
+
return _build_step_response(runtime_session)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
@app.post("/step", response_model=StepResponse)
|
| 132 |
+
async def step(payload: StepRequest) -> StepResponse:
|
| 133 |
+
global runtime_session
|
| 134 |
+
|
| 135 |
+
if runtime_session is None:
|
| 136 |
+
raise HTTPException(status_code=400, detail="Environment not initialized. Call /reset first.")
|
| 137 |
+
|
| 138 |
+
if runtime_session.done:
|
| 139 |
+
return _build_step_response(runtime_session)
|
| 140 |
+
|
| 141 |
+
observation, reward, done, info = await runtime_session.env.step(payload.action)
|
| 142 |
+
|
| 143 |
+
runtime_session.step_count = int(observation.get("step_count", runtime_session.step_count + 1))
|
| 144 |
+
runtime_session.done = bool(done)
|
| 145 |
+
runtime_session.last_action = payload.action if isinstance(payload.action, str) else str(payload.action)
|
| 146 |
+
runtime_session.last_reward = float(reward)
|
| 147 |
+
runtime_session.last_observation = observation
|
| 148 |
+
runtime_session.last_info = dict(info or {})
|
| 149 |
+
|
| 150 |
+
return _build_step_response(runtime_session)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
@app.get("/state", response_model=StateResponse)
|
| 154 |
+
async def state() -> StateResponse:
|
| 155 |
+
if runtime_session is None:
|
| 156 |
+
return StateResponse(initialized=False)
|
| 157 |
+
|
| 158 |
+
observation = None
|
| 159 |
+
if runtime_session.last_observation is not None:
|
| 160 |
+
observation = _as_observation_model(runtime_session.last_observation)
|
| 161 |
+
|
| 162 |
+
return StateResponse(
|
| 163 |
+
initialized=True,
|
| 164 |
+
task_id=runtime_session.task_id,
|
| 165 |
+
step_count=runtime_session.step_count,
|
| 166 |
+
done=runtime_session.done,
|
| 167 |
+
last_action=runtime_session.last_action,
|
| 168 |
+
observation=observation,
|
| 169 |
+
internal_state=runtime_session.env.state(),
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
@app.post("/state", response_model=StateResponse)
|
| 174 |
+
async def state_post() -> StateResponse:
|
| 175 |
+
return await state()
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def main() -> None:
|
| 179 |
+
port = int(os.getenv("PORT", "7860"))
|
| 180 |
+
uvicorn.run(app, host="0.0.0.0", port=port)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
if __name__ == "__main__":
|
| 184 |
+
main()
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_day2_engine.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from env.anti_hacking import AntiHackingDetector
|
| 4 |
+
from env.graders.deterministic import DeterministicGrader
|
| 5 |
+
from env.hidden_tests import HiddenTestRunner
|
| 6 |
+
from env.rewards import RewardCalculator
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
EXPECTED_CONFIG = """
|
| 10 |
+
name: CI
|
| 11 |
+
on: [push]
|
| 12 |
+
jobs:
|
| 13 |
+
test:
|
| 14 |
+
runs-on: ubuntu-latest
|
| 15 |
+
steps:
|
| 16 |
+
- uses: actions/checkout@v4
|
| 17 |
+
- run: npm ci
|
| 18 |
+
- run: npm test
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
WRONG_CONFIG = """
|
| 22 |
+
name: CI
|
| 23 |
+
on: [push]
|
| 24 |
+
jobs:
|
| 25 |
+
test:
|
| 26 |
+
runs-on: ubuntu-latest
|
| 27 |
+
steps:
|
| 28 |
+
- uses: actions/checkout@v4
|
| 29 |
+
- run: npm ci
|
| 30 |
+
- run: npm tset
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
BROKEN_YAML = """
|
| 34 |
+
name CI
|
| 35 |
+
jobs:
|
| 36 |
+
test:
|
| 37 |
+
steps
|
| 38 |
+
- run npm test
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class FakeJudge:
|
| 43 |
+
def evaluate_fix(self, original, fixed, error):
|
| 44 |
+
return {
|
| 45 |
+
"correctness": 0.9,
|
| 46 |
+
"minimalism": 0.8,
|
| 47 |
+
"quality": 0.9,
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class Day2EngineTests(unittest.TestCase):
|
| 52 |
+
def setUp(self):
|
| 53 |
+
self.grader = DeterministicGrader()
|
| 54 |
+
self.detector = AntiHackingDetector()
|
| 55 |
+
self.hidden_runner = HiddenTestRunner(grader=self.grader)
|
| 56 |
+
self.reward_calculator = RewardCalculator(
|
| 57 |
+
llm_judge=FakeJudge(),
|
| 58 |
+
anti_hacking_detector=self.detector,
|
| 59 |
+
deterministic_grader=self.grader,
|
| 60 |
+
hidden_test_runner=self.hidden_runner,
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
def test_deterministic_grader_high_for_correct_fix(self):
|
| 64 |
+
score = self.grader.grade(EXPECTED_CONFIG, EXPECTED_CONFIG)
|
| 65 |
+
self.assertGreaterEqual(score, 0.85)
|
| 66 |
+
self.assertLessEqual(score, 1.0)
|
| 67 |
+
|
| 68 |
+
def test_deterministic_grader_low_for_wrong_fix(self):
|
| 69 |
+
score = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
|
| 70 |
+
self.assertLess(score, 0.6)
|
| 71 |
+
|
| 72 |
+
def test_deterministic_grader_penalizes_broken_yaml(self):
|
| 73 |
+
score = self.grader.grade(BROKEN_YAML, EXPECTED_CONFIG)
|
| 74 |
+
self.assertLess(score, 0.4)
|
| 75 |
+
|
| 76 |
+
def test_deterministic_grader_is_reproducible(self):
|
| 77 |
+
first = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
|
| 78 |
+
second = self.grader.grade(WRONG_CONFIG, EXPECTED_CONFIG)
|
| 79 |
+
self.assertEqual(first, second)
|
| 80 |
+
|
| 81 |
+
def test_anti_hacking_detects_stage_skipping(self):
|
| 82 |
+
config = EXPECTED_CONFIG + "\n if: false\n"
|
| 83 |
+
penalty = self.detector.penalty_stage_skipping(config)
|
| 84 |
+
self.assertLess(penalty, 0.0)
|
| 85 |
+
|
| 86 |
+
def test_anti_hacking_detects_fake_success(self):
|
| 87 |
+
config = EXPECTED_CONFIG.replace("npm test", 'echo "tests passed" && exit 0')
|
| 88 |
+
penalty = self.detector.penalty_fake_success(config)
|
| 89 |
+
self.assertLess(penalty, 0.0)
|
| 90 |
+
|
| 91 |
+
def test_anti_hacking_detects_breaking_pipeline(self):
|
| 92 |
+
previous_config = """
|
| 93 |
+
stages:
|
| 94 |
+
- build
|
| 95 |
+
- test
|
| 96 |
+
jobs:
|
| 97 |
+
build:
|
| 98 |
+
run: npm ci
|
| 99 |
+
test:
|
| 100 |
+
run: npm test
|
| 101 |
+
"""
|
| 102 |
+
new_config = """
|
| 103 |
+
stages:
|
| 104 |
+
- build
|
| 105 |
+
jobs:
|
| 106 |
+
build:
|
| 107 |
+
run: npm ci
|
| 108 |
+
"""
|
| 109 |
+
penalty = self.detector.penalty_breaking_pipeline(previous_config, new_config)
|
| 110 |
+
self.assertLess(penalty, 0.0)
|
| 111 |
+
|
| 112 |
+
def test_anti_hacking_detects_excessive_edits(self):
|
| 113 |
+
penalty = self.detector.penalty_excessive_edits(changed_files_count=12, changed_lines_count=400)
|
| 114 |
+
self.assertLess(penalty, 0.0)
|
| 115 |
+
|
| 116 |
+
def test_anti_hacking_detects_timeout_abuse(self):
|
| 117 |
+
penalty = self.detector.penalty_timeout_abuse(step_count=25)
|
| 118 |
+
self.assertLess(penalty, 0.0)
|
| 119 |
+
|
| 120 |
+
def test_hidden_tests_returns_high_pass_rate_for_good_fix(self):
|
| 121 |
+
pass_rate = self.hidden_runner.evaluate_fix(
|
| 122 |
+
fixed_config=EXPECTED_CONFIG,
|
| 123 |
+
expected_config=EXPECTED_CONFIG,
|
| 124 |
+
)
|
| 125 |
+
self.assertGreaterEqual(pass_rate, 0.8)
|
| 126 |
+
|
| 127 |
+
def test_hidden_tests_returns_lower_pass_rate_for_bad_fix(self):
|
| 128 |
+
pass_rate = self.hidden_runner.evaluate_fix(
|
| 129 |
+
fixed_config=WRONG_CONFIG,
|
| 130 |
+
expected_config=EXPECTED_CONFIG,
|
| 131 |
+
)
|
| 132 |
+
self.assertLess(pass_rate, 0.8)
|
| 133 |
+
|
| 134 |
+
def test_reward_calculator_positive_for_good_behavior(self):
|
| 135 |
+
state = {
|
| 136 |
+
"step_count": 3,
|
| 137 |
+
"previous_config": WRONG_CONFIG,
|
| 138 |
+
"expected_config": EXPECTED_CONFIG,
|
| 139 |
+
"original_config": WRONG_CONFIG,
|
| 140 |
+
"error": "command not found",
|
| 141 |
+
}
|
| 142 |
+
result = {
|
| 143 |
+
"logs_analyzed": True,
|
| 144 |
+
"error_diagnosed": True,
|
| 145 |
+
"fix_proposed": True,
|
| 146 |
+
"pipeline_run": True,
|
| 147 |
+
"tests_passed": True,
|
| 148 |
+
"command_succeeded": True,
|
| 149 |
+
"current_config": EXPECTED_CONFIG,
|
| 150 |
+
"expected_config": EXPECTED_CONFIG,
|
| 151 |
+
"changed_files_count": 1,
|
| 152 |
+
"changed_lines_count": 1,
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
reward = self.reward_calculator.calculate_step_reward(
|
| 156 |
+
state=state,
|
| 157 |
+
action="validate_fix",
|
| 158 |
+
result=result,
|
| 159 |
+
original_config=WRONG_CONFIG,
|
| 160 |
+
fixed_config=EXPECTED_CONFIG,
|
| 161 |
+
error_message="command not found",
|
| 162 |
+
expected_config=EXPECTED_CONFIG,
|
| 163 |
+
metadata={"broken_token": "npm tset", "fixed_token": "npm test"},
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
self.assertGreater(reward, 0.5)
|
| 167 |
+
|
| 168 |
+
def test_reward_calculator_penalizes_hacking(self):
|
| 169 |
+
hacked_config = EXPECTED_CONFIG.replace("npm test", 'echo "tests passed" && exit 0') + "\n if: false\n"
|
| 170 |
+
state = {
|
| 171 |
+
"step_count": 25,
|
| 172 |
+
"previous_config": EXPECTED_CONFIG,
|
| 173 |
+
"changed_files_count": 15,
|
| 174 |
+
"changed_lines_count": 500,
|
| 175 |
+
}
|
| 176 |
+
result = {
|
| 177 |
+
"current_config": hacked_config,
|
| 178 |
+
"expected_config": EXPECTED_CONFIG,
|
| 179 |
+
"hacking_attempt": True,
|
| 180 |
+
"changed_files_count": 15,
|
| 181 |
+
"changed_lines_count": 500,
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
reward = self.reward_calculator.calculate_step_reward(
|
| 185 |
+
state=state,
|
| 186 |
+
action="edit_config",
|
| 187 |
+
result=result,
|
| 188 |
+
original_config=EXPECTED_CONFIG,
|
| 189 |
+
fixed_config=hacked_config,
|
| 190 |
+
error_message="",
|
| 191 |
+
expected_config=EXPECTED_CONFIG,
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
self.assertGreaterEqual(reward, 0.0)
|
| 195 |
+
self.assertLessEqual(reward, 0.3)
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
if __name__ == "__main__":
|
| 199 |
+
unittest.main()
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import unittest
|
| 3 |
+
|
| 4 |
+
from env.environment import CICDDebuggerEnvironment, REQUIRED_TOOLS
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class EnvironmentContractTests(unittest.TestCase):
|
| 8 |
+
def test_reset_returns_structured_observation(self):
|
| 9 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=7)
|
| 10 |
+
observation = asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 11 |
+
|
| 12 |
+
self.assertIn("config", observation)
|
| 13 |
+
self.assertIn("logs", observation)
|
| 14 |
+
self.assertIn("error_message", observation)
|
| 15 |
+
self.assertIn("progress_flags", observation)
|
| 16 |
+
self.assertEqual(observation["task_id"], "easy-command-typo")
|
| 17 |
+
self.assertEqual(observation["available_tools"], REQUIRED_TOOLS)
|
| 18 |
+
self.assertEqual(observation["step_count"], 0)
|
| 19 |
+
|
| 20 |
+
def test_step_returns_obs_reward_done_info(self):
|
| 21 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=3)
|
| 22 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 23 |
+
|
| 24 |
+
observation, reward, done, info = asyncio.run(env.step("read_logs: inspect failing stage logs"))
|
| 25 |
+
|
| 26 |
+
self.assertIsInstance(observation, dict)
|
| 27 |
+
self.assertIsInstance(reward, float)
|
| 28 |
+
self.assertIsInstance(done, bool)
|
| 29 |
+
self.assertIsInstance(info, dict)
|
| 30 |
+
self.assertIn("tool", info)
|
| 31 |
+
|
| 32 |
+
def test_action_space_rejects_extra_tools(self):
|
| 33 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=5)
|
| 34 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 35 |
+
|
| 36 |
+
observation, reward, done, info = asyncio.run(env.step("propose_fix: force deploy"))
|
| 37 |
+
|
| 38 |
+
self.assertIn("error", info)
|
| 39 |
+
self.assertIsNotNone(info["error"])
|
| 40 |
+
self.assertFalse(done)
|
| 41 |
+
self.assertGreaterEqual(reward, 0.0)
|
| 42 |
+
self.assertIn("config", observation)
|
| 43 |
+
|
| 44 |
+
def test_action_space_rejects_alias_tools(self):
|
| 45 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=15)
|
| 46 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 47 |
+
|
| 48 |
+
_, _, done, info = asyncio.run(env.step("read: workflow file"))
|
| 49 |
+
|
| 50 |
+
self.assertIn("error", info)
|
| 51 |
+
self.assertIsNotNone(info["error"])
|
| 52 |
+
self.assertFalse(done)
|
| 53 |
+
|
| 54 |
+
def test_submit_solution_path(self):
|
| 55 |
+
env = CICDDebuggerEnvironment(max_steps=12, seed=9)
|
| 56 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 57 |
+
|
| 58 |
+
asyncio.run(env.step("read_logs: inspect logs"))
|
| 59 |
+
asyncio.run(env.step("analyze_error: identify root cause"))
|
| 60 |
+
asyncio.run(env.step("edit_config: replace npm tset with npm test"))
|
| 61 |
+
asyncio.run(env.step("run_pipeline_stage: run test stage"))
|
| 62 |
+
asyncio.run(env.step("run_tests: execute tests"))
|
| 63 |
+
asyncio.run(env.step("validate_fix: validate score"))
|
| 64 |
+
observation, reward, done, info = asyncio.run(env.step("submit_solution: submit current fix"))
|
| 65 |
+
|
| 66 |
+
self.assertTrue(done)
|
| 67 |
+
self.assertGreaterEqual(reward, 0.0)
|
| 68 |
+
self.assertIsNone(info.get("error"))
|
| 69 |
+
self.assertEqual(observation["progress_flags"].get("submit_solution"), True)
|
| 70 |
+
|
| 71 |
+
def test_internal_state_tracks_required_fields(self):
|
| 72 |
+
env = CICDDebuggerEnvironment(max_steps=10, seed=11)
|
| 73 |
+
asyncio.run(env.reset(task_id="easy-command-typo"))
|
| 74 |
+
asyncio.run(env.step("read_logs: inspect logs"))
|
| 75 |
+
|
| 76 |
+
state = env.get_state()
|
| 77 |
+
self.assertTrue(state.get("initialized"))
|
| 78 |
+
self.assertIn("actual_bug", state)
|
| 79 |
+
self.assertIn("correct_solution", state)
|
| 80 |
+
self.assertIn("progress_flags", state)
|
| 81 |
+
self.assertIn("file_modification_count", state)
|
| 82 |
+
self.assertIn("hidden_test_pass_rate", state)
|
| 83 |
+
|
| 84 |
+
def test_yaml_task_is_fixable_via_edit_flow(self):
|
| 85 |
+
env = CICDDebuggerEnvironment(max_steps=12, seed=17)
|
| 86 |
+
asyncio.run(env.reset(task_id="easy-yaml-indentation"))
|
| 87 |
+
|
| 88 |
+
asyncio.run(env.step("read_logs: inspect logs"))
|
| 89 |
+
asyncio.run(env.step("analyze_error: identify root cause"))
|
| 90 |
+
observation, _, _, _ = asyncio.run(env.step("edit_config: fix YAML indentation and syntax"))
|
| 91 |
+
|
| 92 |
+
self.assertIn("- run: pytest", observation["config"])
|
| 93 |
+
self.assertNotIn(" - run: pytest", observation["config"])
|
| 94 |
+
|
| 95 |
+
asyncio.run(env.step("run_tests: execute tests"))
|
| 96 |
+
asyncio.run(env.step("validate_fix: validate score"))
|
| 97 |
+
_, _, done, info = asyncio.run(env.step("submit_solution: submit current fix"))
|
| 98 |
+
|
| 99 |
+
self.assertTrue(done)
|
| 100 |
+
self.assertIsNone(info.get("error"))
|
| 101 |
+
|
| 102 |
+
def test_hard_needs_order_edit_updates_deploy_dependency(self):
|
| 103 |
+
env = CICDDebuggerEnvironment(max_steps=12, seed=19)
|
| 104 |
+
asyncio.run(env.reset(task_id="hard-needs-order"))
|
| 105 |
+
|
| 106 |
+
observation, _, _, _ = asyncio.run(env.step("edit_config: fix deploy dependency ordering"))
|
| 107 |
+
|
| 108 |
+
self.assertIn("needs: [build, test]", observation["config"])
|
| 109 |
+
self.assertEqual(observation["config"].count("needs: build"), 1)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
if __name__ == "__main__":
|
| 113 |
+
unittest.main()
|
tests/test_inference.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import unittest
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class InferenceOutputFormatTests(unittest.TestCase):
|
| 10 |
+
def test_inference_prints_required_markers(self):
|
| 11 |
+
project_root = Path(__file__).resolve().parents[1]
|
| 12 |
+
env = os.environ.copy()
|
| 13 |
+
env["OFFLINE_INFERENCE"] = "1"
|
| 14 |
+
|
| 15 |
+
completed = subprocess.run(
|
| 16 |
+
[sys.executable, "inference.py", "--max-steps", "3", "--offline", "--force-local-env"],
|
| 17 |
+
cwd=project_root,
|
| 18 |
+
capture_output=True,
|
| 19 |
+
text=True,
|
| 20 |
+
env=env,
|
| 21 |
+
check=True,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
lines = [line.strip() for line in completed.stdout.splitlines() if line.strip()]
|
| 25 |
+
self.assertGreaterEqual(len(lines), 3)
|
| 26 |
+
self.assertTrue(lines[0].startswith("[START] "))
|
| 27 |
+
self.assertTrue(lines[-1].startswith("[END] "))
|
| 28 |
+
|
| 29 |
+
start_pattern = re.compile(r"^\[START\] task=\S+ env=\S+ model=.+$")
|
| 30 |
+
step_pattern = re.compile(
|
| 31 |
+
r"^\[STEP\] step=\d+ action=.* reward=-?\d+\.\d{2} done=(true|false) error=(null|.+)$"
|
| 32 |
+
)
|
| 33 |
+
end_pattern = re.compile(
|
| 34 |
+
r"^\[END\] success=(true|false) steps=\d+ score=\d+\.\d{3} rewards=(-?\d+\.\d{2}(,-?\d+\.\d{2})*)?$"
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
self.assertRegex(lines[0], start_pattern)
|
| 38 |
+
|
| 39 |
+
step_lines = [line for line in lines if line.startswith("[STEP] ")]
|
| 40 |
+
self.assertTrue(step_lines)
|
| 41 |
+
for line in step_lines:
|
| 42 |
+
self.assertRegex(line, step_pattern)
|
| 43 |
+
|
| 44 |
+
self.assertRegex(lines[-1], end_pattern)
|
| 45 |
+
|
| 46 |
+
for line in lines:
|
| 47 |
+
self.assertTrue(
|
| 48 |
+
line.startswith("[START] ") or line.startswith("[STEP] ") or line.startswith("[END] "),
|
| 49 |
+
f"Unexpected output line: {line}",
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
|
| 54 |
+
unittest.main()
|
tests/test_judge.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from env.graders.llm_judge import LLMJudge
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class FakeModel:
|
| 7 |
+
def __init__(self, payload, raise_error: bool = False):
|
| 8 |
+
self.payload = payload
|
| 9 |
+
self.raise_error = raise_error
|
| 10 |
+
|
| 11 |
+
def __call__(self, prompt, **kwargs):
|
| 12 |
+
if self.raise_error:
|
| 13 |
+
raise RuntimeError("model failure")
|
| 14 |
+
return [{"generated_text": self.payload}]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LLMJudgeTests(unittest.TestCase):
|
| 18 |
+
def test_good_json_scores_are_parsed(self):
|
| 19 |
+
judge = LLMJudge(FakeModel('{"correctness": 1.0, "minimalism": 0.8, "quality": 0.9}'))
|
| 20 |
+
result = judge.evaluate_fix("npm tset", "npm test", "command not found")
|
| 21 |
+
|
| 22 |
+
self.assertGreaterEqual(result["correctness"], 0.9)
|
| 23 |
+
self.assertGreaterEqual(result["minimalism"], 0.7)
|
| 24 |
+
self.assertGreaterEqual(result["quality"], 0.8)
|
| 25 |
+
|
| 26 |
+
def test_regex_fallback_for_noisy_output(self):
|
| 27 |
+
noisy = "Correctness: 0.7\nMinimalism: 0.6\nQuality: 0.75"
|
| 28 |
+
judge = LLMJudge(FakeModel(noisy))
|
| 29 |
+
result = judge.evaluate_fix("a", "b", "err")
|
| 30 |
+
|
| 31 |
+
self.assertAlmostEqual(result["correctness"], 0.7)
|
| 32 |
+
self.assertAlmostEqual(result["minimalism"], 0.6)
|
| 33 |
+
self.assertAlmostEqual(result["quality"], 0.75)
|
| 34 |
+
|
| 35 |
+
def test_partial_fields_default_to_zero(self):
|
| 36 |
+
judge = LLMJudge(FakeModel('{"correctness": 0.8}'))
|
| 37 |
+
result = judge.evaluate_fix("a", "b", "err")
|
| 38 |
+
|
| 39 |
+
self.assertAlmostEqual(result["correctness"], 0.8)
|
| 40 |
+
self.assertAlmostEqual(result["minimalism"], 0.0)
|
| 41 |
+
self.assertAlmostEqual(result["quality"], 0.0)
|
| 42 |
+
|
| 43 |
+
def test_model_failure_returns_zeroes(self):
|
| 44 |
+
judge = LLMJudge(FakeModel("", raise_error=True))
|
| 45 |
+
result = judge.evaluate_fix("a", "b", "err")
|
| 46 |
+
|
| 47 |
+
self.assertEqual(result, {"correctness": 0.0, "minimalism": 0.0, "quality": 0.0})
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
if __name__ == "__main__":
|
| 51 |
+
unittest.main()
|
tests/test_server_api.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from fastapi.testclient import TestClient
|
| 4 |
+
|
| 5 |
+
from server.app import app
|
| 6 |
+
import server.app as server_app
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ServerApiTests(unittest.TestCase):
|
| 10 |
+
def setUp(self):
|
| 11 |
+
server_app.runtime_session = None
|
| 12 |
+
self.client = TestClient(app)
|
| 13 |
+
|
| 14 |
+
def test_health(self):
|
| 15 |
+
response = self.client.get("/health")
|
| 16 |
+
self.assertEqual(response.status_code, 200)
|
| 17 |
+
self.assertEqual(response.json().get("status"), "ok")
|
| 18 |
+
|
| 19 |
+
def test_reset_state_step_flow(self):
|
| 20 |
+
reset_response = self.client.post("/reset", json={})
|
| 21 |
+
self.assertEqual(reset_response.status_code, 200)
|
| 22 |
+
reset_payload = reset_response.json()
|
| 23 |
+
self.assertIn("observation", reset_payload)
|
| 24 |
+
self.assertIn("step_count", reset_payload)
|
| 25 |
+
self.assertEqual(reset_payload["step_count"], 0)
|
| 26 |
+
|
| 27 |
+
state_response = self.client.get("/state")
|
| 28 |
+
self.assertEqual(state_response.status_code, 200)
|
| 29 |
+
state_payload = state_response.json()
|
| 30 |
+
self.assertTrue(state_payload.get("initialized"))
|
| 31 |
+
|
| 32 |
+
step_response = self.client.post(
|
| 33 |
+
"/step",
|
| 34 |
+
json={"action": "edit_config: replace npm tset with npm test"},
|
| 35 |
+
)
|
| 36 |
+
self.assertEqual(step_response.status_code, 200)
|
| 37 |
+
step_payload = step_response.json()
|
| 38 |
+
self.assertIn("reward", step_payload)
|
| 39 |
+
self.assertIn("done", step_payload)
|
| 40 |
+
|
| 41 |
+
def test_step_requires_reset(self):
|
| 42 |
+
server_app.runtime_session = None
|
| 43 |
+
client = TestClient(app)
|
| 44 |
+
response = client.post("/step", json={"action": "read_logs: inspect logs"})
|
| 45 |
+
self.assertEqual(response.status_code, 400)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
if __name__ == "__main__":
|
| 49 |
+
unittest.main()
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
validate-submission.sh
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh - OpenEnv Submission Validator
|
| 4 |
+
#
|
| 5 |
+
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
|
| 6 |
+
#
|
| 7 |
+
# Prerequisites:
|
| 8 |
+
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
+
# - openenv-core: pip install openenv-core
|
| 10 |
+
# - curl (usually pre-installed)
|
| 11 |
+
#
|
| 12 |
+
# Run:
|
| 13 |
+
#   curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
|
| 14 |
+
#
|
| 15 |
+
# Or download and run locally:
|
| 16 |
+
# chmod +x validate-submission.sh
|
| 17 |
+
# ./validate-submission.sh <ping_url> [repo_dir]
|
| 18 |
+
#
|
| 19 |
+
# Arguments:
|
| 20 |
+
# ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
|
| 21 |
+
# repo_dir Path to your repo (default: current directory)
|
| 22 |
+
#
|
| 23 |
+
# Examples:
|
| 24 |
+
# ./validate-submission.sh https://my-team.hf.space
|
| 25 |
+
# ./validate-submission.sh https://my-team.hf.space ./my-repo
|
| 26 |
+
#
|
| 27 |
+
|
| 28 |
+
set -uo pipefail

# Hard cap (seconds) on how long `docker build` may run in Step 2.
DOCKER_BUILD_TIMEOUT=600

# Enable ANSI colors only when stdout is attached to a terminal.
if [ -t 1 ]; then
  RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
  BOLD='\033[1m'; NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BOLD=''
  NC=''
fi
| 40 |
+
|
| 41 |
+
# run_with_timeout SECS CMD [ARGS...] — run CMD, aborting it after SECS.
# Prefers coreutils timeout/gtimeout; otherwise emulates with a watcher job.
# Returns CMD's exit status (or the kill status if the deadline fired).
run_with_timeout() {
  local limit="$1"
  shift
  if command -v timeout &>/dev/null; then
    timeout "$limit" "$@"
    return
  fi
  if command -v gtimeout &>/dev/null; then
    gtimeout "$limit" "$@"
    return
  fi
  # Fallback: run in the background and kill it once the deadline passes.
  "$@" &
  local job=$!
  ( sleep "$limit" && kill "$job" 2>/dev/null ) &
  local sentinel=$!
  wait "$job" 2>/dev/null
  local status=$?
  kill "$sentinel" 2>/dev/null
  wait "$sentinel" 2>/dev/null
  return $status
}
|
| 59 |
+
|
| 60 |
+
# portable_mktemp [PREFIX] — create a temp file under $TMPDIR (default /tmp)
# with a recognizable prefix; fall back to a bare mktemp if the templated
# form is unsupported (e.g. some BSD variants).
portable_mktemp() {
  local tag="${1:-validate}"
  mktemp "${TMPDIR:-/tmp}/${tag}-XXXXXX" 2>/dev/null || mktemp
}
|
| 64 |
+
|
| 65 |
+
# Temp files registered in this array are removed when the script exits.
CLEANUP_FILES=()
# The ${arr[@]+...} guard avoids an "unbound variable" error under `set -u`
# when the array is empty (required for bash < 4.4 array semantics).
cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
trap cleanup EXIT
|
| 68 |
+
|
| 69 |
+
# --- Argument handling --------------------------------------------------

# Print CLI usage to stdout (shown when no ping URL is supplied).
usage() {
  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
  printf "\n"
  printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
  printf " repo_dir Path to your repo (default: current directory)\n"
}

PING_URL="${1:-}"
REPO_DIR="${2:-.}"

if [ -z "$PING_URL" ]; then
  usage
  exit 1
fi

# Canonicalize the repo path; bail out if the directory does not exist.
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
  printf "Error: directory '%s' not found\n" "${2:-.}"
  exit 1
fi

PING_URL="${PING_URL%/}"   # tolerate a trailing slash in the URL
export PING_URL
PASS=0                     # number of checks passed so far
|
| 87 |
+
|
| 88 |
+
# Output helpers. All honor the color variables set at the top of the script.

# log MSG... — timestamped (UTC) line; %b expands escapes embedded in MSG.
log() {
  printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"
}

# pass MSG — report a successful check and bump the pass counter.
pass() {
  log "${GREEN}PASSED${NC} -- $1"
  PASS=$((PASS + 1))
}

# fail MSG — report a failed check (does not exit; see stop_at).
fail() {
  log "${RED}FAILED${NC} -- $1"
}

# hint MSG — indented advice line shown after a failure.
hint() {
  printf " ${YELLOW}Hint:${NC} %b\n" "$1"
}

# stop_at STEP — abort the whole validation run after a hard failure.
stop_at() {
  printf "\n"
  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
  exit 1
}
|
| 97 |
+
|
| 98 |
+
printf "\n"
printf "${BOLD}========================================${NC}\n"
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
printf "${BOLD}========================================${NC}\n"
log "Repo: $REPO_DIR"
log "Ping URL: $PING_URL"
printf "\n"

log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")
# Capture only the HTTP status via -w; the response body goes to the temp
# file and curl's own diagnostics are discarded (previously stderr was
# redirected into the same file as -o, clobbering the body).
# On a transport failure (DNS, refused, timeout) force "000": curl's -w
# already prints "000" in that case, so the old `|| printf "000"` appended
# a second copy, yielding "000000" and sending us down the wrong branch.
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  # Single %: hint passes this string as a printf *argument* (%b), so no
  # %%-escaping is needed — the old text showed users a broken curl flag.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
|
| 127 |
+
|
| 128 |
+
log "${BOLD}Step 2/3: Running docker build${NC} ..."

# Docker itself must be installed before we can build anything.
if ! command -v docker &>/dev/null; then
  fail "docker command not found"
  hint "Install Docker: https://docs.docker.com/get-docker/"
  stop_at "Step 2"
fi

# Accept a Dockerfile either at the repo root or under server/.
DOCKER_CONTEXT=""
for candidate in "$REPO_DIR" "$REPO_DIR/server"; do
  if [ -f "$candidate/Dockerfile" ]; then
    DOCKER_CONTEXT="$candidate"
    break
  fi
done
if [ -z "$DOCKER_CONTEXT" ]; then
  fail "No Dockerfile found in repo root or server/ directory"
  stop_at "Step 2"
fi

log " Found Dockerfile in $DOCKER_CONTEXT"

# Build with a hard timeout; keep combined output for failure diagnostics.
BUILD_OK=false
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true

if [ "$BUILD_OK" = true ]; then
  pass "Docker build succeeded"
else
  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
  printf "%s\n" "$BUILD_OUTPUT" | tail -20
  stop_at "Step 2"
fi
|
| 157 |
+
|
| 158 |
+
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."

# Prefer the `openenv` CLI if it is on PATH; otherwise fall back to invoking
# the CLI module with the repo's venv python (or system python3).
VALIDATE_OK=false
if command -v openenv &>/dev/null; then
  VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
else
  PY_VALIDATE="python3"
  if [ -x "$REPO_DIR/.venv/bin/python" ]; then
    PY_VALIDATE="$REPO_DIR/.venv/bin/python"
  fi
  # NOTE(review): assumes openenv-core exposes `openenv.cli.__main__` as a
  # runnable module — confirm against the installed openenv-core version.
  VALIDATE_OUTPUT=$(cd "$REPO_DIR" && "$PY_VALIDATE" -m openenv.cli.__main__ validate 2>&1) && VALIDATE_OK=true
fi

if [ "$VALIDATE_OK" = true ]; then
  pass "openenv validate passed"
  # Script runs without `set -e`, so a false test here does not abort.
  [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
else
  fail "openenv validate failed"
  printf "%s\n" "$VALIDATE_OUTPUT"
  stop_at "Step 3"
fi
|
| 179 |
+
|
| 180 |
+
# Final summary banner — reached only when all three steps passed.
printf "\n"
printf "%b\n" "${BOLD}========================================${NC}"
printf "%b\n" "${GREEN}${BOLD} All 3/3 checks passed!${NC}"
printf "%b\n" "${GREEN}${BOLD} Your submission is ready to submit.${NC}"
printf "%b\n" "${BOLD}========================================${NC}"
printf "\n"

exit 0
|