Commit ·
5cf6185
0
Parent(s):
API Contract Debugger OpenEnv Environment
Browse files- .DS_Store +0 -0
- Dockerfile +26 -0
- README.md +194 -0
- RL_ARCHITECTURE.md +637 -0
- inference.py +234 -0
- openenv.yaml +53 -0
- pyproject.toml +18 -0
- requirements.txt +4 -0
- server/.DS_Store +0 -0
- server/__pycache__/app.cpython-314.pyc +0 -0
- server/__pycache__/environment.cpython-314.pyc +0 -0
- server/__pycache__/fixtures.cpython-314.pyc +0 -0
- server/__pycache__/graders.cpython-314.pyc +0 -0
- server/__pycache__/models.cpython-314.pyc +0 -0
- server/app.py +168 -0
- server/environment.py +291 -0
- server/fixtures.py +241 -0
- server/graders.py +193 -0
- server/models.py +181 -0
- tests/__pycache__/test_env.cpython-314-pytest-9.0.2.pyc +0 -0
- tests/test_env.py +565 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
Dockerfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# HuggingFace Spaces requires a non-root user with uid 1000
|
| 4 |
+
RUN useradd -m -u 1000 user
|
| 5 |
+
|
| 6 |
+
WORKDIR /app
|
| 7 |
+
|
| 8 |
+
# Install dependencies as root first
|
| 9 |
+
COPY --chown=user requirements.txt .
|
| 10 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
+
|
| 12 |
+
# Copy source code
|
| 13 |
+
COPY --chown=user . .
|
| 14 |
+
|
| 15 |
+
# Switch to non-root user
|
| 16 |
+
USER user
|
| 17 |
+
|
| 18 |
+
ENV HOME=/home/user \
|
| 19 |
+
PATH=/home/user/.local/bin:$PATH \
|
| 20 |
+
PORT=7860 \
|
| 21 |
+
PYTHONUNBUFFERED=1 \
|
| 22 |
+
PYTHONPATH=/app
|
| 23 |
+
|
| 24 |
+
EXPOSE 7860
|
| 25 |
+
|
| 26 |
+
CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: API Contract Debugger
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
- rl-environment
|
| 11 |
+
- api-debugging
|
| 12 |
+
- contract-testing
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# API Contract Debugger — OpenEnv Environment
|
| 16 |
+
|
| 17 |
+
An OpenEnv environment where AI agents debug broken OpenAPI-style contract
|
| 18 |
+
specifications by proposing targeted field-level corrections.
|
| 19 |
+
|
| 20 |
+
## What Is This?
|
| 21 |
+
|
| 22 |
+
Every backend engineer debugs API contract violations constantly — mismatched
|
| 23 |
+
types, missing required fields, wrong HTTP status codes, forbidden extra fields
|
| 24 |
+
leaking into responses. This environment turns that real-world task into a
|
| 25 |
+
structured RL benchmark.
|
| 26 |
+
|
| 27 |
+
The agent receives a broken API spec and a list of violations. Each step, it
|
| 28 |
+
proposes one fix. It gets rewarded for each violation resolved and penalised
|
| 29 |
+
for introducing new ones.
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## Action Space
|
| 34 |
+
|
| 35 |
+
```json
|
| 36 |
+
{
|
| 37 |
+
"kind": "add_field | remove_field | change_type | change_status | no_op",
|
| 38 |
+
"endpoint_index": 0,
|
| 39 |
+
"location": "request_body | response_body | status_code",
|
| 40 |
+
"field_name": "field_name_or_null",
|
| 41 |
+
"new_value": "<type string | field spec dict | int status code | null>"
|
| 42 |
+
}
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
| `kind` | `new_value` type | Description |
|
| 46 |
+
|-----------------|-----------------|-------------|
|
| 47 |
+
| `add_field` | `{"type": "...", "required": true, "description": "..."}` | Add a missing field |
|
| 48 |
+
| `remove_field` | `null` | Remove a forbidden field |
|
| 49 |
+
| `change_type` | `"integer"` / `"string"` / `"boolean"` / `"number"` | Fix a field's type |
|
| 50 |
+
| `change_status` | `204` / `200` / `201` etc. | Fix the HTTP status code |
|
| 51 |
+
| `no_op` | `null` | Do nothing (small implicit cost) |
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
|
| 55 |
+
## Observation Space
|
| 56 |
+
|
| 57 |
+
| Field | Type | Description |
|
| 58 |
+
|-------|------|-------------|
|
| 59 |
+
| `task_name` | str | Active task: `easy`, `medium`, `hard` |
|
| 60 |
+
| `task_description` | str | Plain-English description of violations |
|
| 61 |
+
| `endpoints` | list | Current (partially fixed) endpoint specs |
|
| 62 |
+
| `violations` | list | Remaining violations with type + description |
|
| 63 |
+
| `violations_fixed_this_step` | int | How many the last action resolved |
|
| 64 |
+
| `violations_introduced_this_step` | int | How many the last action introduced |
|
| 65 |
+
| `total_violations_at_start` | int | Violation count at episode start |
|
| 66 |
+
| `step_count` | int | Steps taken so far |
|
| 67 |
+
| `max_steps` | int | Episode step budget |
|
| 68 |
+
| `last_action_error` | str\|null | Validation error if action was malformed |
|
| 69 |
+
| `reward` | float | Per-step reward |
|
| 70 |
+
| `done` | bool | Whether the episode has terminated |
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
## Tasks
|
| 75 |
+
|
| 76 |
+
### Easy (1 endpoint, 1 violation, max 5 steps)
|
| 77 |
+
A user registration endpoint is missing `created_at` (string) in its response.
|
| 78 |
+
Expected score for a capable agent: **1.0**
|
| 79 |
+
|
| 80 |
+
### Medium (3 endpoints, 3 violations, max 10 steps)
|
| 81 |
+
An e-commerce API has:
|
| 82 |
+
1. `GET /products/{id}` — `product_id` returned as `string` instead of `integer`
|
| 83 |
+
2. `POST /orders` — `quantity` accepted as `string` instead of `integer`
|
| 84 |
+
3. `DELETE /orders/{id}` — returns status `200` instead of `204`
|
| 85 |
+
|
| 86 |
+
Expected score for a capable agent: **1.0**
|
| 87 |
+
|
| 88 |
+
### Hard (4 endpoints, 6 violations, max 15 steps)
|
| 89 |
+
An auth + profile API has:
|
| 90 |
+
1. `POST /auth/login` — missing `refresh_token` in response
|
| 91 |
+
2. `POST /auth/login` — `expires_in` is `string` instead of `integer`
|
| 92 |
+
3. `GET /users/{id}/profile` — missing `created_at` in response
|
| 93 |
+
4. `GET /users/{id}/profile` — exposes forbidden `password_hash` field (must be removed)
|
| 94 |
+
5. `PATCH /users/{id}/profile` — returns status `500` instead of `200`
|
| 95 |
+
6. `PATCH /users/{id}/profile` — missing `updated_at` in response
|
| 96 |
+
|
| 97 |
+
Expected score for a capable agent: **0.7–1.0** (frontier models)
|
| 98 |
+
|
| 99 |
+
---
|
| 100 |
+
|
| 101 |
+
## Reward Function
|
| 102 |
+
|
| 103 |
+
| Event | Reward |
|
| 104 |
+
|-------|--------|
|
| 105 |
+
| Fix a violation | `+0.2 × severity` |
|
| 106 |
+
| Introduce a violation | `−0.15 × severity` |
|
| 107 |
+
| Malformed action | `−0.05` |
|
| 108 |
+
| Solve all violations | `+0.5` bonus |
|
| 109 |
+
|
| 110 |
+
Severity weights: `missing_field=1.0`, `wrong_type=0.9`, `wrong_status=0.8`, `extra_field=0.7`
|
| 111 |
+
|
| 112 |
+
Final episode score is computed by `grade_episode()` → float in `[0.0, 1.0]`.
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
|
| 116 |
+
## API Endpoints
|
| 117 |
+
|
| 118 |
+
| Method | Path | Description |
|
| 119 |
+
|--------|------|-------------|
|
| 120 |
+
| `POST` | `/reset` | Reset environment. Body: `{"task_name": "easy\|medium\|hard"}` |
|
| 121 |
+
| `POST` | `/step` | Apply one action. Body: `{"action": {...}}` |
|
| 122 |
+
| `GET` | `/state` | Full internal state |
|
| 123 |
+
| `GET` | `/score` | Final episode score |
|
| 124 |
+
| `GET` | `/tasks` | List all available tasks |
|
| 125 |
+
| `GET` | `/health`| Health check |
|
| 126 |
+
| `GET` | `/schema`| JSON schemas for action + observation |
|
| 127 |
+
|
| 128 |
+
---
|
| 129 |
+
|
| 130 |
+
## Setup & Usage
|
| 131 |
+
|
| 132 |
+
### Run locally
|
| 133 |
+
|
| 134 |
+
```bash
|
| 135 |
+
git clone <your-repo-url>
|
| 136 |
+
cd api_contract_debugger_env
|
| 137 |
+
pip install -r requirements.txt
|
| 138 |
+
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
### Run with Docker
|
| 142 |
+
|
| 143 |
+
```bash
|
| 144 |
+
docker build -t api-contract-debugger .
|
| 145 |
+
docker run -p 7860:7860 api-contract-debugger
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
### Run the baseline agent
|
| 149 |
+
|
| 150 |
+
```bash
|
| 151 |
+
export HF_TOKEN=your_token
|
| 152 |
+
export ENV_BASE_URL=http://localhost:7860
|
| 153 |
+
python inference.py
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
### Run tests
|
| 157 |
+
|
| 158 |
+
```bash
|
| 159 |
+
pip install pytest httpx
|
| 160 |
+
pytest tests/ -v
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
## Baseline Scores
|
| 166 |
+
|
| 167 |
+
| Task | Model | Score | Steps Used |
|
| 168 |
+
|------|-------|-------|-----------|
|
| 169 |
+
| easy | Qwen2.5-72B-Instruct | 1.000 | 1 |
|
| 170 |
+
| medium | Qwen2.5-72B-Instruct | 1.000 | 3 |
|
| 171 |
+
| hard | Qwen2.5-72B-Instruct | ~0.85 | 12 |
|
| 172 |
+
|
| 173 |
+
---
|
| 174 |
+
|
| 175 |
+
## Project Structure
|
| 176 |
+
|
| 177 |
+
```
|
| 178 |
+
api_contract_debugger_env/
|
| 179 |
+
├── server/
|
| 180 |
+
│ ├── __init__.py
|
| 181 |
+
│ ├── app.py # FastAPI app, route registration
|
| 182 |
+
│ ├── environment.py # OpenEnv Environment subclass
|
| 183 |
+
│ ├── models.py # Pydantic Action / Observation / State
|
| 184 |
+
│ ├── graders.py # Violation detection + reward shaping
|
| 185 |
+
│ └── fixtures.py # Task definitions (broken + golden specs)
|
| 186 |
+
├── tests/
|
| 187 |
+
│ └── test_env.py # 56 tests covering all components
|
| 188 |
+
├── inference.py # Baseline agent
|
| 189 |
+
├── openenv.yaml # OpenEnv metadata
|
| 190 |
+
├── pyproject.toml # Package config + server entry point
|
| 191 |
+
├── requirements.txt
|
| 192 |
+
├── uv.lock
|
| 193 |
+
└── Dockerfile
|
| 194 |
+
```
|
RL_ARCHITECTURE.md
ADDED
|
@@ -0,0 +1,637 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Reinforcement Learning Architecture: API Contract Debugger
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
The API Contract Debugger is a **reinforcement learning environment** built on the OpenEnv framework. It challenges AI agents to fix broken OpenAPI-style contract specifications by proposing targeted field-level corrections.
|
| 6 |
+
|
| 7 |
+
This document explains how the codebase implements the core RL concepts:
|
| 8 |
+
- **Agent** — The external AI system interacting with the environment
|
| 9 |
+
- **Environment** — The `APIContractDebuggerEnv` class that simulates the debugging task
|
| 10 |
+
- **State** — What the agent observes and the internal environment state
|
| 11 |
+
- **Action** — The fixes the agent can propose
|
| 12 |
+
- **Reward/Result** — The feedback signal and scoring mechanism
|
| 13 |
+
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
## 1. Agent (External AI System)
|
| 17 |
+
|
| 18 |
+
### What is the Agent?
|
| 19 |
+
|
| 20 |
+
The **agent** is an **external decision-maker** (e.g., an LLM, an RL policy, or a human) that:
|
| 21 |
+
- Receives observations from the environment
|
| 22 |
+
- Proposes actions (fixes to the API spec)
|
| 23 |
+
- Receives reward feedback and the next state
|
| 24 |
+
- Aims to maximize cumulative reward by fixing all violations
|
| 25 |
+
|
| 26 |
+
### Agent Interaction Pattern
|
| 27 |
+
|
| 28 |
+
```
|
| 29 |
+
Agent Environment
|
| 30 |
+
| |
|
| 31 |
+
|---- POST /reset (task_name) -----> |
|
| 32 |
+
| |
|
| 33 |
+
| <------ Initial Observation --------|
|
| 34 |
+
| (endpoints, violations, reward=0) |
|
| 35 |
+
| |
|
| 36 |
+
|---- POST /step (action) ----------> |
|
| 37 |
+
| |
|
| 38 |
+
| <---- Updated Observation --------- |
|
| 39 |
+
| (new endpoints, new violations, |
|
| 40 |
+
| reward, done, fixed/introduced) |
|
| 41 |
+
| |
|
| 42 |
+
| [repeat until done=True] |
|
| 43 |
+
| |
|
| 44 |
+
|---- GET /score, GET /state ------> |
|
| 45 |
+
| |
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Agent-Facing Interface in Codebase
|
| 49 |
+
|
| 50 |
+
- **File**: `server/app.py`
|
| 51 |
+
- **Routes**:
|
| 52 |
+
- `POST /reset` — Initialize new episode
|
| 53 |
+
- `POST /step` — Apply one action
|
| 54 |
+
- `GET /state` — Query full environment state (for debugging)
|
| 55 |
+
- `GET /score` — Get final episode score
|
| 56 |
+
- `GET /tasks` — List available tasks
|
| 57 |
+
|
| 58 |
+
The agent communicates via HTTP REST API. All observations are JSON and fully serializable.
|
| 59 |
+
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
## 2. Environment (`APIContractDebuggerEnv`)
|
| 63 |
+
|
| 64 |
+
### Class Definition
|
| 65 |
+
|
| 66 |
+
**File**: `server/environment.py`
|
| 67 |
+
|
| 68 |
+
```python
|
| 69 |
+
class APIContractDebuggerEnv(Environment[DebugAction, DebugObservation, DebugState]):
|
| 70 |
+
"""
|
| 71 |
+
Environment where an agent debugs broken API contract specifications.
|
| 72 |
+
|
| 73 |
+
Inherits from OpenEnv's Environment base class.
|
| 74 |
+
Implements reset(), step(), and state property.
|
| 75 |
+
"""
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
### Environment Responsibilities
|
| 79 |
+
|
| 80 |
+
1. **Initialize tasks** — Load broken + golden endpoint specs from fixtures
|
| 81 |
+
2. **Detect violations** — Compare current spec against golden spec
|
| 82 |
+
3. **Apply actions** — Mutate the current spec based on agent's fix proposal
|
| 83 |
+
4. **Compute rewards** — Dense per-step reward based on violations fixed/introduced
|
| 84 |
+
5. **Track state** — Maintain episode counter, step count, violations
|
| 85 |
+
6. **Terminate episodes** — Check for success (all fixed) or max steps reached
|
| 86 |
+
|
| 87 |
+
### Key Methods
|
| 88 |
+
|
| 89 |
+
#### `reset(seed, episode_id, task_name, **kwargs) → DebugObservation`
|
| 90 |
+
|
| 91 |
+
Initializes a fresh episode:
|
| 92 |
+
- Loads task config from fixtures
|
| 93 |
+
- Deep-copies broken endpoints to avoid cross-episode state leakage
|
| 94 |
+
- Detects initial violations
|
| 95 |
+
- Returns initial observation with reward=0
|
| 96 |
+
|
| 97 |
+
```python
|
| 98 |
+
def reset(self, seed=None, episode_id=None, task_name=None, **kwargs):
|
| 99 |
+
"""
|
| 100 |
+
Reset the environment and return the initial observation.
|
| 101 |
+
"""
|
| 102 |
+
# Load task config and deep-copy endpoints
|
| 103 |
+
self._current_endpoints = copy.deepcopy(self._task_cfg["broken_endpoints"])
|
| 104 |
+
self._golden_endpoints = copy.deepcopy(self._task_cfg["golden_endpoints"])
|
| 105 |
+
|
| 106 |
+
# Detect violations (agent's starting problem)
|
| 107 |
+
self._violations = detect_violations(self._current_endpoints, self._golden_endpoints)
|
| 108 |
+
|
| 109 |
+
return self._make_observation(reward=0.0, done=False, ...)
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
#### `step(action, timeout_s, **kwargs) → DebugObservation`
|
| 113 |
+
|
| 114 |
+
Processes one agent action and returns the updated state:
|
| 115 |
+
|
| 116 |
+
```python
|
| 117 |
+
def step(self, action: DebugAction, **kwargs) -> DebugObservation:
|
| 118 |
+
"""
|
| 119 |
+
Apply one fix action → return updated observation + reward.
|
| 120 |
+
"""
|
| 121 |
+
# 1. Apply the action (mutate current_endpoints)
|
| 122 |
+
action_error = self._apply_action(action)
|
| 123 |
+
|
| 124 |
+
# 2. Recompute violations
|
| 125 |
+
self._violations = detect_violations(self._current_endpoints, self._golden_endpoints)
|
| 126 |
+
|
| 127 |
+
# 3. Compute dense reward
|
| 128 |
+
reward = step_reward(prev_violations, self._violations, action_error)
|
| 129 |
+
|
| 130 |
+
# 4. Check termination
|
| 131 |
+
all_fixed = len(self._violations) == 0
|
| 132 |
+
out_of_steps = self._step_count >= max_steps
|
| 133 |
+
self._done = all_fixed or out_of_steps
|
| 134 |
+
|
| 135 |
+
# 5. Bonus reward if solved
|
| 136 |
+
if all_fixed:
|
| 137 |
+
reward += 0.5
|
| 138 |
+
|
| 139 |
+
return self._make_observation(reward, done, fixed_this_step, ...)
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
#### `_apply_action(action) → Optional[str]`
|
| 143 |
+
|
| 144 |
+
Attempts to mutate `self._current_endpoints` according to the action:
|
| 145 |
+
|
| 146 |
+
- **Validates** the endpoint index, field name, and location
|
| 147 |
+
- **Executes** the fix:
|
| 148 |
+
- `ADD_FIELD` — Insert new field into request/response body
|
| 149 |
+
- `REMOVE_FIELD` — Delete field from body
|
| 150 |
+
- `CHANGE_TYPE` — Update field's type
|
| 151 |
+
- `CHANGE_STATUS` — Update endpoint's HTTP status code
|
| 152 |
+
- `NO_OP` — Explicit pass (implicit penalty via no reward)
|
| 153 |
+
- **Returns** error string if invalid, `None` on success
|
| 154 |
+
|
| 155 |
+
#### `state` Property
|
| 156 |
+
|
| 157 |
+
Returns the complete internal state (not exposed to agent by default, but available via `/state`):
|
| 158 |
+
|
| 159 |
+
```python
|
| 160 |
+
@property
|
| 161 |
+
def state(self) -> DebugState:
|
| 162 |
+
"""Return full internal environment state."""
|
| 163 |
+
return DebugState(
|
| 164 |
+
episode_id=self._episode_id,
|
| 165 |
+
step_count=self._step_count,
|
| 166 |
+
task_name=self._task_name,
|
| 167 |
+
original_endpoints=self._original_endpoints, # Snapshot of broken spec
|
| 168 |
+
current_endpoints=self._current_endpoints, # Current state after fixes
|
| 169 |
+
golden_endpoints=self._golden_endpoints, # Target spec
|
| 170 |
+
violations=self._violations, # Current violations
|
| 171 |
+
total_violations_at_start=len(self._initial_violations),
|
| 172 |
+
max_steps=self._task_cfg["max_steps"],
|
| 173 |
+
)
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
### Supported Tasks
|
| 177 |
+
|
| 178 |
+
**File**: `server/fixtures.py`
|
| 179 |
+
|
| 180 |
+
Three difficulty levels:
|
| 181 |
+
|
| 182 |
+
| Task | Difficulty | Endpoints | Violations | Max Steps | Description |
|
| 183 |
+
|------|-----------|-----------|-----------|-----------|-------------|
|
| 184 |
+
| **easy** | Beginner | 1 | 1 missing field | 5 | Simple: add one field to response |
|
| 185 |
+
| **medium** | Intermediate | 3 | 3 (type errors + wrong status) | 10 | Type mismatches and HTTP status fixes |
|
| 186 |
+
| **hard** | Advanced | 4 | 6 (missing, extra, type, status) | 15 | Complex: multiple violation types |
|
| 187 |
+
|
| 188 |
+
Each task has:
|
| 189 |
+
- `broken_endpoints` — Starting state (what agent sees)
|
| 190 |
+
- `golden_endpoints` — Ground truth (what violations are measured against)
|
| 191 |
+
- `description` — Human-readable task objective
|
| 192 |
+
- `max_steps` — Episode cut-off
|
| 193 |
+
|
| 194 |
+
---
|
| 195 |
+
|
| 196 |
+
## 3. State
|
| 197 |
+
|
| 198 |
+
### Observation (`DebugObservation`)
|
| 199 |
+
|
| 200 |
+
**What the agent sees after each action.**
|
| 201 |
+
|
| 202 |
+
File: `server/models.py`
|
| 203 |
+
|
| 204 |
+
```python
|
| 205 |
+
class DebugObservation(Observation):
|
| 206 |
+
"""
|
| 207 |
+
What the agent observes after reset() or step().
|
| 208 |
+
"""
|
| 209 |
+
# Task info
|
| 210 |
+
task_name: str # "easy" | "medium" | "hard"
|
| 211 |
+
task_description: str # Human description
|
| 212 |
+
|
| 213 |
+
# Current spec
|
| 214 |
+
endpoints: List[Dict[str, Any]] # Current endpoints (partially fixed)
|
| 215 |
+
violations: List[Dict[str, Any]] # Detected violations still present
|
| 216 |
+
|
| 217 |
+
# Reward signals
|
| 218 |
+
reward: float # Dense per-step reward
|
| 219 |
+
done: bool # Episode termination flag
|
| 220 |
+
violations_fixed_this_step: int # Count of fixed violations
|
| 221 |
+
violations_introduced_this_step: int # Count of new violations
|
| 222 |
+
total_violations_at_start: int # Reference baseline
|
| 223 |
+
|
| 224 |
+
# Tracking
|
| 225 |
+
step_count: int # Steps taken so far
|
| 226 |
+
max_steps: int # Episode limit
|
| 227 |
+
last_action_error: Optional[str] # Validation error message
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
#### Example Observation
|
| 231 |
+
|
| 232 |
+
```json
|
| 233 |
+
{
|
| 234 |
+
"task_name": "easy",
|
| 235 |
+
"task_description": "Add missing 'created_at' field to response...",
|
| 236 |
+
"endpoints": [
|
| 237 |
+
{
|
| 238 |
+
"method": "POST",
|
| 239 |
+
"path": "/users/register",
|
| 240 |
+
"status_code": 201,
|
| 241 |
+
"request_body": {
|
| 242 |
+
"username": {"type": "string", "required": true},
|
| 243 |
+
"email": {"type": "string", "required": true},
|
| 244 |
+
"password": {"type": "string", "required": true}
|
| 245 |
+
},
|
| 246 |
+
"response_body": {
|
| 247 |
+
"user_id": {"type": "integer", "required": true},
|
| 248 |
+
"username": {"type": "string", "required": true}
|
| 249 |
+
// missing: created_at
|
| 250 |
+
}
|
| 251 |
+
}
|
| 252 |
+
],
|
| 253 |
+
"violations": [
|
| 254 |
+
{
|
| 255 |
+
"endpoint_index": 0,
|
| 256 |
+
"location": "response_body",
|
| 257 |
+
"field_name": "created_at",
|
| 258 |
+
"violation_type": "missing_field",
|
| 259 |
+
"description": "POST /users/register response_body: required field 'created_at' (string) is missing",
|
| 260 |
+
"severity": 1.0
|
| 261 |
+
}
|
| 262 |
+
],
|
| 263 |
+
"violations_fixed_this_step": 0,
|
| 264 |
+
"violations_introduced_this_step": 0,
|
| 265 |
+
"total_violations_at_start": 1,
|
| 266 |
+
"step_count": 0,
|
| 267 |
+
"max_steps": 5,
|
| 268 |
+
"reward": 0.0,
|
| 269 |
+
"done": false,
|
| 270 |
+
"last_action_error": null
|
| 271 |
+
}
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
### Full Internal State (`DebugState`)
|
| 275 |
+
|
| 276 |
+
**Available via `GET /state` endpoint (for debugging/analysis, not given to agent by default).**
|
| 277 |
+
|
| 278 |
+
```python
|
| 279 |
+
class DebugState(State):
|
| 280 |
+
"""
|
| 281 |
+
Full internal state (not exposed to agent by default).
|
| 282 |
+
"""
|
| 283 |
+
task_name: str
|
| 284 |
+
original_endpoints: List[Dict[str, Any]] # Snapshot of broken spec
|
| 285 |
+
current_endpoints: List[Dict[str, Any]] # Mutated by agent's actions
|
| 286 |
+
golden_endpoints: List[Dict[str, Any]] # Ground truth
|
| 287 |
+
violations: List[Dict[str, Any]] # Computed violations
|
| 288 |
+
total_violations_at_start: int
|
| 289 |
+
max_steps: int
|
| 290 |
+
```
|
| 291 |
+
|
| 292 |
+
---
|
| 293 |
+
|
| 294 |
+
## 4. Action (`DebugAction`)
|
| 295 |
+
|
| 296 |
+
**What the agent can propose.**
|
| 297 |
+
|
| 298 |
+
File: `server/models.py`
|
| 299 |
+
|
| 300 |
+
```python
|
| 301 |
+
class DebugAction(Action):
|
| 302 |
+
"""
|
| 303 |
+
A single fix proposed by the agent.
|
| 304 |
+
The agent targets one endpoint + one field and proposes exactly one change.
|
| 305 |
+
"""
|
| 306 |
+
|
| 307 |
+
kind: ActionKind # Type of fix
|
| 308 |
+
endpoint_index: int # Which endpoint to fix (0-indexed)
|
| 309 |
+
location: str # "request_body" | "response_body" | "status_code"
|
| 310 |
+
field_name: Optional[str] # Field to modify (null for status_code)
|
| 311 |
+
new_value: Optional[Any] # The corrected value
|
| 312 |
+
```
|
| 313 |
+
|
| 314 |
+
### Action Types (`ActionKind`)
|
| 315 |
+
|
| 316 |
+
| Kind | Target | Effect | new_value |
|
| 317 |
+
|------|--------|--------|-----------|
|
| 318 |
+
| `ADD_FIELD` | Field | Insert missing field into body | `{"type": str, "required"?: bool, "description"?: str}` |
|
| 319 |
+
| `REMOVE_FIELD` | Field | Delete forbidden field from body | `null` |
|
| 320 |
+
| `CHANGE_TYPE` | Field | Fix field's JSON Schema type | Type string (e.g., `"integer"`) |
|
| 321 |
+
| `CHANGE_STATUS` | Endpoint | Fix HTTP status code | Integer (e.g., `201`) |
|
| 322 |
+
| `NO_OP` | None | Explicit pass/wait | `null` |
|
| 323 |
+
|
| 324 |
+
#### Example Actions
|
| 325 |
+
|
| 326 |
+
```python
|
| 327 |
+
# Fix 1: Add missing 'created_at' field
|
| 328 |
+
{
|
| 329 |
+
"kind": "add_field",
|
| 330 |
+
"endpoint_index": 0,
|
| 331 |
+
"location": "response_body",
|
| 332 |
+
"field_name": "created_at",
|
| 333 |
+
"new_value": {
|
| 334 |
+
"type": "string",
|
| 335 |
+
"description": "ISO-8601 timestamp"
|
| 336 |
+
}
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
# Fix 2: Change field type from string to integer
|
| 340 |
+
{
|
| 341 |
+
"kind": "change_type",
|
| 342 |
+
"endpoint_index": 1,
|
| 343 |
+
"location": "request_body",
|
| 344 |
+
"field_name": "user_id",
|
| 345 |
+
"new_value": "integer"
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
# Fix 3: Correct HTTP status code
|
| 349 |
+
{
|
| 350 |
+
"kind": "change_status",
|
| 351 |
+
"endpoint_index": 0,
|
| 352 |
+
"location": "status_code",
|
| 353 |
+
"field_name": null,
|
| 354 |
+
"new_value": 201
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
# Fix 4: Remove extra field
|
| 358 |
+
{
|
| 359 |
+
"kind": "remove_field",
|
| 360 |
+
"endpoint_index": 2,
|
| 361 |
+
"location": "response_body",
|
| 362 |
+
"field_name": "deprecated_field",
|
| 363 |
+
"new_value": null
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
# Fix 5: Explicit pass
|
| 367 |
+
{
|
| 368 |
+
"kind": "no_op",
|
| 369 |
+
"endpoint_index": 0,
|
| 370 |
+
"location": "request_body",
|
| 371 |
+
"field_name": null,
|
| 372 |
+
"new_value": null
|
| 373 |
+
}
|
| 374 |
+
```
|
| 375 |
+
|
| 376 |
+
### Action Validation
|
| 377 |
+
|
| 378 |
+
The environment validates actions in `_apply_action()`:
|
| 379 |
+
|
| 380 |
+
- **Endpoint index bounds** — Must be `0 ≤ index < len(endpoints)`
|
| 381 |
+
- **Location validity** — Must be `"request_body"`, `"response_body"`, or `"status_code"`
|
| 382 |
+
- **Field existence** — REMOVE_FIELD and CHANGE_TYPE require field to exist
|
| 383 |
+
- **Type format** — Fields must have `{"type": "..."}` structure
|
| 384 |
+
- **Status code format** — Must be an integer
|
| 385 |
+
|
| 386 |
+
If validation fails, `_apply_action()` returns an error string and the step receives a `-0.05` reward penalty.
|
| 387 |
+
|
| 388 |
+
---
|
| 389 |
+
|
| 390 |
+
## 5. Reward & Result
|
| 391 |
+
|
| 392 |
+
### Dense Per-Step Reward
|
| 393 |
+
|
| 394 |
+
**File**: `server/graders.py` → `step_reward()` function
|
| 395 |
+
|
| 396 |
+
The agent receives feedback after each step:
|
| 397 |
+
|
| 398 |
+
```python
|
| 399 |
+
def step_reward(
|
| 400 |
+
prev_violations: List[Dict[str, Any]],
|
| 401 |
+
new_violations: List[Dict[str, Any]],
|
| 402 |
+
initial_violations: List[Dict[str, Any]],
|
| 403 |
+
action_error: bool,
|
| 404 |
+
) -> float:
|
| 405 |
+
"""
|
| 406 |
+
Dense per-step reward:
|
| 407 |
+
+0.2 × severity per violation resolved
|
| 408 |
+
-0.15 × severity per new violation introduced
|
| 409 |
+
-0.05 for malformed action
|
| 410 |
+
+0.5 bonus if all violations fixed (episode success)
|
| 411 |
+
"""
|
| 412 |
+
if action_error:
|
| 413 |
+
return -0.05
|
| 414 |
+
|
| 415 |
+
reward = 0.0
|
| 416 |
+
for v in violations_fixed_this_step:
|
| 417 |
+
reward += 0.2 * v["severity"]
|
| 418 |
+
for v in violations_introduced_this_step:
|
| 419 |
+
reward -= 0.15 * v["severity"]
|
| 420 |
+
|
| 421 |
+
return reward
|
| 422 |
+
```
|
| 423 |
+
|
| 424 |
+
### Violation Severity Weights
|
| 425 |
+
|
| 426 |
+
Weighted by problem importance:
|
| 427 |
+
|
| 428 |
+
| Violation Type | Severity | Reason |
|
| 429 |
+
|----------------|----------|--------|
|
| 430 |
+
| `missing_field` | 1.0 | Breaks contract — top priority |
|
| 431 |
+
| `wrong_type` | 0.9 | Type mismatch — critical |
|
| 432 |
+
| `wrong_status` | 0.8 | HTTP code error — significant |
|
| 433 |
+
| `extra_field` | 0.7 | Forbidden field — less critical |
|
| 434 |
+
|
| 435 |
+
### Episode Scoring (`grade_episode()`)
|
| 436 |
+
|
| 437 |
+
**Computed at episode end.** Returns final score in `[0.0, 1.0]`.
|
| 438 |
+
|
| 439 |
+
```python
|
| 440 |
+
def grade_episode(
|
| 441 |
+
current_endpoints: List[Dict[str, Any]],
|
| 442 |
+
golden_endpoints: List[Dict[str, Any]],
|
| 443 |
+
initial_violations: List[Dict[str, Any]],
|
| 444 |
+
) -> float:
|
| 445 |
+
"""
|
| 446 |
+
Final episode score:
|
| 447 |
+
|
| 448 |
+
score = (weighted_violations_fixed - weighted_violations_introduced)
|
| 449 |
+
/ total_initial_weight
|
| 450 |
+
|
| 451 |
+
Clamped to [0.0, 1.0]
|
| 452 |
+
|
| 453 |
+
1.0 = all violations fixed, no new ones introduced
|
| 454 |
+
0.5 = 50% of violations fixed
|
| 455 |
+
0.0 = no improvement or made things worse
|
| 456 |
+
"""
|
| 457 |
+
```
|
| 458 |
+
|
| 459 |
+
#### Example Scoring Scenario
|
| 460 |
+
|
| 461 |
+
**Task: easy (1 violation)**
|
| 462 |
+
- Initial violation: `missing_field "created_at" (severity=1.0)`
|
| 463 |
+
- After 1 step: Agent adds `created_at` correctly
|
| 464 |
+
- After 2 steps: Agent incorrectly changes type of `username` to `integer` (introduces 1 violation)
|
| 465 |
+
- Final state: the original violation is fixed, but 1 newly introduced violation remains
|
| 466 |
+
|
| 467 |
+
```
|
| 468 |
+
score = (1.0 - 1.0) / 1.0 = 0.0
|
| 469 |
+
```
|
| 470 |
+
|
| 471 |
+
The score is 0.0: the penalty for the introduced violation cancels out the credit for the fix, so there is no net improvement.
|
| 472 |
+
|
| 473 |
+
---
|
| 474 |
+
|
| 475 |
+
## 6. Complete RL Loop Example
|
| 476 |
+
|
| 477 |
+
### Scenario: Easy Task
|
| 478 |
+
|
| 479 |
+
**Initial state:**
|
| 480 |
+
```
|
| 481 |
+
Broken spec: POST /users/register response missing "created_at"
|
| 482 |
+
Golden spec: response has user_id, username, created_at
|
| 483 |
+
```
|
| 484 |
+
|
| 485 |
+
### Episode Transcript
|
| 486 |
+
|
| 487 |
+
```
|
| 488 |
+
RESET request (task_name="easy")
|
| 489 |
+
↓
|
| 490 |
+
Observation #0:
|
| 491 |
+
endpoints: [broken registration endpoint]
|
| 492 |
+
violations: [missing_field "created_at"]
|
| 493 |
+
reward: 0.0
|
| 494 |
+
done: false
|
| 495 |
+
step_count: 0
|
| 496 |
+
|
| 497 |
+
STEP 1: Agent proposes ADD_FIELD action
|
| 498 |
+
action.kind = "add_field"
|
| 499 |
+
action.endpoint_index = 0
|
| 500 |
+
action.location = "response_body"
|
| 501 |
+
action.field_name = "created_at"
|
| 502 |
+
action.new_value = {"type": "string", "description": "ISO-8601 timestamp"}
|
| 503 |
+
↓
|
| 504 |
+
Environment:
|
| 505 |
+
- Validates action ✓
|
| 506 |
+
- Adds field to response_body
|
| 507 |
+
- Recomputes violations → [] (0 violations!)
|
| 508 |
+
- Computes reward: +0.2 × 1.0 (fixed 1 violation of severity 1.0) = +0.2
|
| 509 |
+
+ 0.5 (bonus for all_fixed=true) = +0.7 total
|
| 510 |
+
- Sets done=true (all violations fixed)
|
| 511 |
+
↓
|
| 512 |
+
Observation #1:
|
| 513 |
+
endpoints: [fixed registration endpoint]
|
| 514 |
+
violations: []
|
| 515 |
+
violations_fixed_this_step: 1
|
| 516 |
+
violations_introduced_this_step: 0
|
| 517 |
+
reward: 0.7
|
| 518 |
+
done: true
|
| 519 |
+
step_count: 1
|
| 520 |
+
|
| 521 |
+
SCORE request
|
| 522 |
+
↓
|
| 523 |
+
score = (1.0 fixed - 0 introduced) / 1.0 initial = 1.0 ✓
|
| 524 |
+
|
| 525 |
+
Agent succeeds with perfect score!
|
| 526 |
+
```
|
| 527 |
+
|
| 528 |
+
---
|
| 529 |
+
|
| 530 |
+
## 7. File Structure Summary
|
| 531 |
+
|
| 532 |
+
```
|
| 533 |
+
server/
|
| 534 |
+
├── app.py # FastAPI routes, HTTP interface
|
| 535 |
+
├── environment.py # APIContractDebuggerEnv (core RL logic)
|
| 536 |
+
├── models.py # Pydantic models: DebugAction, DebugObservation, DebugState
|
| 537 |
+
├── fixtures.py # Task definitions (easy, medium, hard)
|
| 538 |
+
├── graders.py # Violation detection + reward/scoring
|
| 539 |
+
└── __pycache__/
|
| 540 |
+
|
| 541 |
+
tests/ # Unit tests for environment, graders, fixtures
|
| 542 |
+
|
| 543 |
+
RL_ARCHITECTURE.md # This file
|
| 544 |
+
```
|
| 545 |
+
|
| 546 |
+
---
|
| 547 |
+
|
| 548 |
+
## 8. Key Design Principles
|
| 549 |
+
|
| 550 |
+
1. **Stateful Environment** — One episode per task at a time (OpenEnv singleton pattern)
|
| 551 |
+
|
| 552 |
+
2. **Dense Rewards** — Agent gets per-step feedback (not just final score) to guide learning
|
| 553 |
+
|
| 554 |
+
3. **Severity-Weighted** — Different violation types have different weights (missing fields = highest priority)
|
| 555 |
+
|
| 556 |
+
4. **Action Validation** — Invalid actions receive penalty and return error messages
|
| 557 |
+
|
| 558 |
+
5. **Deep-Copied State** — Endpoints are deep-copied to prevent cross-episode contamination
|
| 559 |
+
|
| 560 |
+
6. **Observable Violations** — Agent sees exact list of violations (not hidden)
|
| 561 |
+
|
| 562 |
+
7. **Termination Conditions**:
|
| 563 |
+
- Success: All violations fixed
|
| 564 |
+
- Failure: Max steps exceeded
|
| 565 |
+
|
| 566 |
+
8. **JSON/REST Interface** — Agent communicates via HTTP (language-agnostic)
|
| 567 |
+
|
| 568 |
+
---
|
| 569 |
+
|
| 570 |
+
## 9. Typical Agent Workflow
|
| 571 |
+
|
| 572 |
+
```python
|
| 573 |
+
import requests
|
| 574 |
+
|
| 575 |
+
BASE_URL = "http://localhost:7860"
|
| 576 |
+
|
| 577 |
+
# 1. Reset to start new episode
|
| 578 |
+
reset_resp = requests.post(f"{BASE_URL}/reset", json={
|
| 579 |
+
"task_name": "easy",
|
| 580 |
+
"seed": 42
|
| 581 |
+
})
|
| 582 |
+
obs = reset_resp.json()
|
| 583 |
+
print(f"Violations to fix: {len(obs['violations'])}")
|
| 584 |
+
|
| 585 |
+
# 2. Repeat: observe → decide → act
|
| 586 |
+
for step in range(obs['max_steps']):
|
| 587 |
+
if obs['done']:
|
| 588 |
+
break
|
| 589 |
+
|
| 590 |
+
# Agent decision logic (depends on obs['violations'])
|
| 591 |
+
action = {
|
| 592 |
+
"kind": "add_field",
|
| 593 |
+
"endpoint_index": 0,
|
| 594 |
+
"location": "response_body",
|
| 595 |
+
"field_name": "created_at",
|
| 596 |
+
"new_value": {"type": "string"}
|
| 597 |
+
}
|
| 598 |
+
|
| 599 |
+
# 3. Apply action
|
| 600 |
+
step_resp = requests.post(f"{BASE_URL}/step", json={"action": action})
|
| 601 |
+
obs = step_resp.json()
|
| 602 |
+
|
| 603 |
+
print(f"Step {step+1}: reward={obs['reward']}, violations={len(obs['violations'])}")
|
| 604 |
+
|
| 605 |
+
# 4. Check final score
|
| 606 |
+
score_resp = requests.get(f"{BASE_URL}/score")
|
| 607 |
+
print(f"Final score: {score_resp.json()['score']}")
|
| 608 |
+
```
|
| 609 |
+
|
| 610 |
+
---
|
| 611 |
+
|
| 612 |
+
## 10. Future Extensions
|
| 613 |
+
|
| 614 |
+
Potential enhancements to the RL framework:
|
| 615 |
+
|
| 616 |
+
1. **Multi-Agent** — Support concurrent episodes via session IDs
|
| 617 |
+
2. **Curriculum Learning** — Dynamically adapt difficulty based on agent performance
|
| 618 |
+
3. **Partial Observability** — Hide some violations initially to increase challenge
|
| 619 |
+
4. **Action Constraints** — Limit action space per step (e.g., "fix at most 1 field")
|
| 620 |
+
5. **Custom Reward Shaping** — Configurable severity weights + bonus structures
|
| 621 |
+
6. **State Representation** — Multiple formats (JSON, graph, embedding-friendly)
|
| 622 |
+
|
| 623 |
+
---
|
| 624 |
+
|
| 625 |
+
## Summary Table
|
| 626 |
+
|
| 627 |
+
| Concept | Implementation | File | Purpose |
|
| 628 |
+
|---------|---|---|---|
|
| 629 |
+
| **Agent** | External AI/LLM | HTTP client | Proposes fixes |
|
| 630 |
+
| **Environment** | `APIContractDebuggerEnv` | `environment.py` | Simulates faults + validates fixes |
|
| 631 |
+
| **State** | `DebugObservation` + `DebugState` | `models.py` | Agent observes + internal tracking |
|
| 632 |
+
| **Action** | `DebugAction` | `models.py` | Fix proposals |
|
| 633 |
+
| **Reward** | `step_reward()` | `graders.py` | Dense per-step feedback |
|
| 634 |
+
| **Result** | Episode score `[0.0, 1.0]` | `graders.py` | Final performance metric |
|
| 635 |
+
| **Tasks** | Fixtures (easy/medium/hard) | `fixtures.py` | Problem instances |
|
| 636 |
+
| **HTTP API** | FastAPI routes | `app.py` | Communication interface |
|
| 637 |
+
|
inference.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline Inference Script — API Contract Debugger
|
| 3 |
+
===================================================
|
| 4 |
+
Runs an LLM (via an OpenAI-compatible API) against all three tasks and emits the required
|
| 5 |
+
[START] / [STEP] / [END] log format.
|
| 6 |
+
|
| 7 |
+
Environment variables:
|
| 8 |
+
API_BASE_URL LLM endpoint (default: https://router.huggingface.co/v1)
|
| 9 |
+
MODEL_NAME Model ID (default: Qwen/Qwen2.5-72B-Instruct)
|
| 10 |
+
HF_TOKEN API key
|
| 11 |
+
ENV_BASE_URL Running env (default: http://localhost:7860)
|
| 12 |
+
TASK_NAME One task or "all" (default: all)
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import textwrap
|
| 20 |
+
from typing import Any, Dict, List, Optional
|
| 21 |
+
|
| 22 |
+
import requests
|
| 23 |
+
from openai import OpenAI
|
| 24 |
+
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
# Configuration
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
|
| 29 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 30 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 31 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "hf_placeholder")
|
| 32 |
+
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860").rstrip("/")
|
| 33 |
+
TASK_NAME = os.getenv("TASK_NAME", "all")
|
| 34 |
+
|
| 35 |
+
TEMPERATURE = 0.0
|
| 36 |
+
MAX_TOKENS = 512
|
| 37 |
+
BENCHMARK = "api_contract_debugger"
|
| 38 |
+
|
| 39 |
+
TASKS = ["easy", "medium", "hard"]
|
| 40 |
+
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
# Logging helpers (required stdout format)
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
|
| 45 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Emit the ``[START]`` line that opens an episode in the stdout log format."""
    header = f"[START] task={task} env={env} model={model}"
    print(header, flush=True)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one ``[STEP]`` line describing the action taken and its outcome.

    A missing or empty ``error`` is printed as the literal ``null``; the
    reward is printed with two decimals and ``done`` in lower case.
    """
    fields = (
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    )
    print("[STEP] " + " ".join(fields), flush=True)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the ``[END]`` line that closes an episode.

    The score is printed with three decimals and the per-step rewards as a
    comma-separated list with two decimals each (empty when no step ran).
    """
    formatted_rewards = [f"{value:.2f}" for value in rewards]
    summary = " ".join(
        (
            f"success={str(success).lower()}",
            f"steps={steps}",
            f"score={score:.3f}",
            "rewards=" + ",".join(formatted_rewards),
        )
    )
    print(f"[END] {summary}", flush=True)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
# Environment HTTP client
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
def env_reset(task_name: str) -> Dict[str, Any]:
    """POST ``/reset`` for ``task_name`` and return the decoded JSON observation.

    Raises the ``requests`` HTTP error if the server answers with a failure status.
    """
    url = f"{ENV_BASE_URL}/reset"
    response = requests.post(url, json={"task_name": task_name}, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def env_step(action_payload: Dict[str, Any]) -> Dict[str, Any]:
    """POST one action to ``/step`` and return the decoded JSON observation.

    Raises the ``requests`` HTTP error if the server answers with a failure status.
    """
    url = f"{ENV_BASE_URL}/step"
    response = requests.post(url, json={"action": action_payload}, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def env_score() -> float:
    """GET ``/score`` and return the ``score`` field of the response as a float.

    Raises the ``requests`` HTTP error if the server answers with a failure status.
    """
    url = f"{ENV_BASE_URL}/score"
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    payload = response.json()
    return float(payload["score"])
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ---------------------------------------------------------------------------
|
| 90 |
+
# LLM agent
|
| 91 |
+
# ---------------------------------------------------------------------------
|
| 92 |
+
|
| 93 |
+
SYSTEM_PROMPT = textwrap.dedent("""
|
| 94 |
+
You are an expert API contract debugger. You will be shown a broken API spec
|
| 95 |
+
and a list of violations. Your job is to propose ONE fix per turn.
|
| 96 |
+
|
| 97 |
+
You must respond with ONLY a valid JSON object matching this schema:
|
| 98 |
+
{
|
| 99 |
+
"kind": "add_field" | "remove_field" | "change_type" | "change_status" | "no_op",
|
| 100 |
+
"endpoint_index": <integer, 0-based>,
|
| 101 |
+
"location": "request_body" | "response_body" | "status_code",
|
| 102 |
+
"field_name": <string or null>,
|
| 103 |
+
"new_value": <string | integer | object | null>
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
Rules:
|
| 107 |
+
- add_field: new_value must be {"type": "<type>", "required": true/false, "description": "..."}
|
| 108 |
+
- change_type: new_value must be a type string e.g. "integer", "string", "boolean", "number"
|
| 109 |
+
- change_status: new_value must be an integer HTTP status code; location must be "status_code"
|
| 110 |
+
- remove_field: new_value must be null
|
| 111 |
+
- no_op: use when no fix is needed; new_value must be null
|
| 112 |
+
|
| 113 |
+
Do NOT include any explanation — output ONLY the JSON object.
|
| 114 |
+
""").strip()
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def build_user_prompt(obs: Dict[str, Any], step: int, history: List[str]) -> str:
    """Render the per-step user message shown to the LLM.

    Args:
        obs: Latest observation dict from the environment. The keys read are
            ``task_name``, ``task_description``, ``endpoints`` and
            ``violations``; missing keys fall back to empty values.
        step: 1-based step number, shown in the header line.
        history: Human-readable summaries of earlier steps; only the last six
            are included to keep the prompt short.

    Returns:
        The prompt text, with every section header starting at column 0.
    """
    violations = obs.get("violations", [])
    endpoints = obs.get("endpoints", [])
    history_block = "\n".join(history[-6:]) if history else "None"

    viol_text = json.dumps(violations, indent=2) if violations else "None — all fixed!"
    ep_text = json.dumps(endpoints, indent=2)

    # The prompt is assembled from explicit lines instead of dedenting an
    # indented f-string: the interpolated JSON contains lines that start at
    # column 0, which makes textwrap.dedent() find no common margin and leave
    # the whole template indented.
    sections = [
        f"Step {step} | Task: {obs.get('task_name')} | Violations remaining: {len(violations)}",
        "",
        "TASK DESCRIPTION:",
        f"{obs.get('task_description', '')}",
        "",
        "CURRENT ENDPOINTS:",
        ep_text,
        "",
        "REMAINING VIOLATIONS:",
        viol_text,
        "",
        "PREVIOUS ACTIONS:",
        history_block,
        "",
        "Propose ONE fix as a JSON object.",
    ]
    return "\n".join(sections).strip()
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def get_action(client: OpenAI, obs: Dict[str, Any], step: int, history: List[str]) -> Dict[str, Any]:
    """Call the LLM and parse a DebugAction payload.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        obs: Latest observation dict from the environment.
        step: 1-based step number, forwarded to the prompt builder.
        history: Summaries of earlier steps, forwarded to the prompt builder.

    Returns:
        The action dict proposed by the model. If the call fails or the reply
        does not contain a parseable JSON object, a ``no_op`` action is
        returned instead so the episode can continue (best effort).
    """
    prompt = build_user_prompt(obs, step, history)
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Isolate the JSON object between the first "{" and the last "}".
        # This drops markdown fences and any stray prose around the object,
        # and guarantees that whatever parses is a JSON object (a dict), never
        # a bare list or string that /step would reject.
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object found in model output")
        return json.loads(text[start:end + 1])
    except Exception as exc:
        # Deliberate catch-all at the LLM boundary: any client, network or
        # parsing failure degrades to a no_op instead of aborting the run.
        print(f"[DEBUG] LLM call failed: {exc}", flush=True)
        return {"kind": "no_op", "endpoint_index": 0, "location": "response_body",
                "field_name": None, "new_value": None}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ---------------------------------------------------------------------------
|
| 171 |
+
# Single episode runner
|
| 172 |
+
# ---------------------------------------------------------------------------
|
| 173 |
+
|
| 174 |
+
def run_episode(client: OpenAI, task_name: str) -> None:
    """Run one full episode of ``task_name`` and log it in the required format.

    Emits one [START] line, one [STEP] line per environment step and exactly
    one [END] line. The [END] line is printed from ``finally``, so it appears
    even when an HTTP call raises; the exception itself still propagates.
    """
    log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

    # Defaults reported by log_end() if the episode aborts early.
    rewards: List[float] = []
    steps_taken = 0
    success = False
    score = 0.0

    try:
        obs = env_reset(task_name)
        history: List[str] = []
        # Step budget comes from the reset observation; 15 is the fallback.
        max_steps = obs.get("max_steps", 15)

        for step in range(1, max_steps + 1):
            if obs.get("done"):
                break

            action_payload = get_action(client, obs, step, history)
            # Compact JSON keeps the action on a single log line.
            action_str = json.dumps(action_payload, separators=(",", ":"))

            obs = env_step(action_payload)

            # A missing or null reward is treated as 0.0.
            reward = float(obs.get("reward") or 0.0)
            done = bool(obs.get("done", False))
            error = obs.get("last_action_error")

            rewards.append(reward)
            steps_taken = step

            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            # Summary line fed back to the model as context on later steps.
            history.append(
                f"Step {step}: {action_str} → reward={reward:+.2f} "
                f"fixed={obs.get('violations_fixed_this_step', 0)} "
                f"remaining={len(obs.get('violations', []))}"
            )

            if done:
                break

        # Final score is fetched once the loop ends; 0.8 is the success cutoff.
        score = env_score()
        success = score >= 0.8

    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# ---------------------------------------------------------------------------
|
| 222 |
+
# Main
|
| 223 |
+
# ---------------------------------------------------------------------------
|
| 224 |
+
|
| 225 |
+
def main() -> None:
    """Create the LLM client and run an episode for each selected task.

    ``TASK_NAME == "all"`` selects every entry of ``TASKS``; any other value
    is run as a single task name.
    """
    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    if TASK_NAME == "all":
        selected_tasks = TASKS
    else:
        selected_tasks = [TASK_NAME]

    for task_name in selected_tasks:
        run_episode(llm_client, task_name)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
if __name__ == "__main__":
|
| 234 |
+
main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: api-contract-debugger
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
An OpenEnv environment where AI agents debug broken OpenAPI-style contract
|
| 5 |
+
specifications. The agent receives a broken API spec and must identify and
|
| 6 |
+
fix contract violations (missing fields, wrong types, wrong status codes,
|
| 7 |
+
forbidden extra fields) by proposing targeted field-level corrections.
|
| 8 |
+
|
| 9 |
+
tags:
|
| 10 |
+
- api
|
| 11 |
+
- debugging
|
| 12 |
+
- contract-testing
|
| 13 |
+
- real-world
|
| 14 |
+
- nlp
|
| 15 |
+
|
| 16 |
+
tasks:
|
| 17 |
+
- name: easy
|
| 18 |
+
description: "Single endpoint with one missing required response field."
|
| 19 |
+
difficulty: easy
|
| 20 |
+
max_steps: 5
|
| 21 |
+
|
| 22 |
+
- name: medium
|
| 23 |
+
description: "Three endpoints with type mismatches and a wrong HTTP status code."
|
| 24 |
+
difficulty: medium
|
| 25 |
+
max_steps: 10
|
| 26 |
+
|
| 27 |
+
- name: hard
|
| 28 |
+
description: >
|
| 29 |
+
Four endpoints with 6 violations: missing fields, wrong types,
|
| 30 |
+
wrong status code, and a forbidden extra field that must be removed.
|
| 31 |
+
difficulty: hard
|
| 32 |
+
max_steps: 15
|
| 33 |
+
|
| 34 |
+
action_space:
|
| 35 |
+
type: structured
|
| 36 |
+
description: >
|
| 37 |
+
DebugAction — proposes one fix per step: add_field, remove_field,
|
| 38 |
+
change_type, change_status, or no_op.
|
| 39 |
+
|
| 40 |
+
observation_space:
|
| 41 |
+
type: structured
|
| 42 |
+
description: >
|
| 43 |
+
DebugObservation — returns the current (partially fixed) endpoint specs,
|
| 44 |
+
the list of remaining violations, per-step fix counts, and reward signal.
|
| 45 |
+
|
| 46 |
+
reward:
|
| 47 |
+
type: dense
|
| 48 |
+
range: [-1.0, 1.5]
|
| 49 |
+
description: >
|
| 50 |
+
+0.2×severity per violation fixed, -0.15×severity per violation introduced,
|
| 51 |
+
-0.05 for malformed action, +0.5 bonus when all violations are resolved.
|
| 52 |
+
|
| 53 |
+
hf_space: "" # fill in your HuggingFace Space URL before submitting
|
pyproject.toml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "api-contract-debugger-env"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "OpenEnv environment for debugging broken API contract specifications"
|
| 5 |
+
requires-python = ">=3.10"
|
| 6 |
+
dependencies = [
|
| 7 |
+
"openenv-core>=0.2.0",
|
| 8 |
+
"fastapi>=0.110.0",
|
| 9 |
+
"uvicorn[standard]>=0.29.0",
|
| 10 |
+
"pydantic>=2.0.0",
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
[project.scripts]
|
| 14 |
+
server = "server.app:main"
|
| 15 |
+
|
| 16 |
+
[build-system]
|
| 17 |
+
requires = ["setuptools>=68"]
|
| 18 |
+
build-backend = "setuptools.build_meta"
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core>=0.2.0
|
| 2 |
+
fastapi>=0.110.0
|
| 3 |
+
uvicorn[standard]>=0.29.0
|
| 4 |
+
pydantic>=2.0.0
|
server/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
server/__pycache__/app.cpython-314.pyc
ADDED
|
Binary file (7.84 kB). View file
|
|
|
server/__pycache__/environment.cpython-314.pyc
ADDED
|
Binary file (12.8 kB). View file
|
|
|
server/__pycache__/fixtures.cpython-314.pyc
ADDED
|
Binary file (6.15 kB). View file
|
|
|
server/__pycache__/graders.cpython-314.pyc
ADDED
|
Binary file (7.67 kB). View file
|
|
|
server/__pycache__/models.cpython-314.pyc
ADDED
|
Binary file (6.31 kB). View file
|
|
|
server/app.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application entry point for the API Contract Debugger OpenEnv environment.
|
| 3 |
+
|
| 4 |
+
Route registration order:
|
| 5 |
+
1. Custom stateful /reset, /step, /state routes registered FIRST.
|
| 6 |
+
2. OpenEnv PRODUCTION-mode routes (/health, /schema, /metadata, /ws) attached LAST.
|
| 7 |
+
PRODUCTION mode does NOT register /reset /step /state, so our routes win.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
from typing import Any, Dict, Optional
|
| 14 |
+
|
| 15 |
+
from fastapi import FastAPI, HTTPException
|
| 16 |
+
from pydantic import BaseModel, Field
|
| 17 |
+
|
| 18 |
+
from openenv.core.env_server.http_server import HTTPEnvServer
|
| 19 |
+
from openenv.core.env_server.types import ServerMode
|
| 20 |
+
|
| 21 |
+
from .environment import APIContractDebuggerEnv
|
| 22 |
+
from .models import DebugAction, DebugObservation, DebugState
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# Singleton environment instances — one per task
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
|
| 28 |
+
# One environment instance per task, created once at import time and reused
# for the lifetime of the process.
_envs: Dict[str, APIContractDebuggerEnv] = {
    "easy":   APIContractDebuggerEnv(task_name="easy"),
    "medium": APIContractDebuggerEnv(task_name="medium"),
    "hard":   APIContractDebuggerEnv(task_name="hard"),
}

# Key into _envs selecting the environment that currently serves requests;
# the /reset route rebinds it when a task_name is supplied.
_active_task: str = "easy"
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _get_env() -> APIContractDebuggerEnv:
    """Return the environment instance registered for the active task."""
    active_env = _envs[_active_task]
    return active_env
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
# Request bodies for our custom routes
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
|
| 45 |
+
# Request body of POST /reset. Every field is optional, so an empty body is valid.
class ResetBody(BaseModel):
    # When None, the currently active task is kept; otherwise it must be a
    # known task name or the route answers 422.
    task_name: Optional[str] = Field(
        default=None,
        description="Task to run: 'easy', 'medium', or 'hard'.",
    )
    # Both values are passed through unchanged to the environment's reset().
    seed: Optional[int] = Field(default=None)
    episode_id: Optional[str] = Field(default=None)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Request body of POST /step.
class StepBody(BaseModel):
    # Required raw dict; the /step route validates it into a DebugAction and
    # answers 422 if validation fails.
    action: Dict[str, Any] = Field(
        ...,
        description="Serialised DebugAction payload.",
    )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
# App factory
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
|
| 65 |
+
def create_app() -> FastAPI:
    # Builds the FastAPI application: the custom /reset, /step, /state, /score
    # and /tasks routes are registered first, then the OpenEnv server attaches
    # its PRODUCTION-mode routes to the same app.
    # NOTE: the route docstrings below are left as-is on purpose; FastAPI
    # publishes them as the endpoint descriptions.
    app = FastAPI(
        title="API Contract Debugger",
        description=(
            "An OpenEnv environment where AI agents debug broken OpenAPI-style "
            "contract specifications by proposing targeted field-level fixes."
        ),
        version="1.0.0",
    )

    # ------------------------------------------------------------------
    # 1. Our stateful routes — registered FIRST
    # ------------------------------------------------------------------

    @app.post("/reset", tags=["Environment"])
    async def reset(req: ResetBody = ResetBody()) -> Dict[str, Any]:
        """Reset the environment. Optionally switch task via task_name."""
        # The active task is module-level state shared by all routes.
        global _active_task
        if req.task_name is not None:
            if req.task_name not in _envs:
                raise HTTPException(
                    status_code=422,
                    detail=f"Unknown task '{req.task_name}'. Choose: {list(_envs.keys())}",
                )
            _active_task = req.task_name

        obs: DebugObservation = _get_env().reset(
            seed=req.seed,
            episode_id=req.episode_id,
        )
        return obs.model_dump()

    @app.post("/step", tags=["Environment"])
    async def step(req: StepBody) -> Dict[str, Any]:
        """Apply one fix action and return the updated observation."""
        # Validate the raw dict into a DebugAction; any failure becomes a 422.
        try:
            action = DebugAction.model_validate(req.action)
        except Exception as exc:
            raise HTTPException(status_code=422, detail=f"Invalid action: {exc}")

        obs: DebugObservation = _get_env().step(action)
        return obs.model_dump()

    @app.get("/state", tags=["Environment"])
    async def state() -> Dict[str, Any]:
        """Return the full internal environment state."""
        s: DebugState = _get_env().state
        return s.model_dump()

    @app.get("/score", tags=["Environment"])
    async def score() -> Dict[str, Any]:
        """Return the final episode score [0.0, 1.0]."""
        return {
            "task": _active_task,
            "score": _get_env().score(),
        }

    @app.get("/tasks", tags=["Environment"])
    async def list_tasks() -> Dict[str, Any]:
        """List available tasks with descriptions."""
        from .fixtures import TASKS
        return {
            "tasks": [
                {
                    "name": t["name"],
                    "description": t["description"],
                    "max_steps": t["max_steps"],
                    "num_endpoints": len(t["broken_endpoints"]),
                }
                for t in TASKS.values()
            ]
        }

    # ------------------------------------------------------------------
    # 2. OpenEnv framework routes — registered LAST (PRODUCTION mode)
    #    Adds /health, /schema, /metadata, /ws ONLY.
    #    Does NOT override our /reset, /step, /state.
    # ------------------------------------------------------------------

    # _get_env is passed as a callable, not called here.
    _server = HTTPEnvServer(
        env=_get_env,
        action_cls=DebugAction,
        observation_cls=DebugObservation,
    )
    _server.register_routes(app, mode=ServerMode.PRODUCTION)

    return app
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
app = create_app()
|
| 155 |
+
|
| 156 |
+
def main() -> None:
    """Serve the app with uvicorn on all interfaces.

    The port is read from the ``PORT`` environment variable (default 7860).
    """
    import uvicorn

    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run("server.app:app", host="0.0.0.0", port=listen_port, reload=False)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
if __name__ == "__main__":
|
| 168 |
+
main()
|
server/environment.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
API Contract Debugger — OpenEnv Environment
|
| 3 |
+
|
| 4 |
+
An AI agent receives a broken OpenAPI-style spec and must fix all contract
|
| 5 |
+
violations by proposing targeted field-level corrections step-by-step.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import copy
|
| 11 |
+
import uuid
|
| 12 |
+
from typing import Any, Dict, List, Optional
|
| 13 |
+
|
| 14 |
+
from openenv.core.env_server.interfaces import Environment
|
| 15 |
+
|
| 16 |
+
from .fixtures import TASKS
|
| 17 |
+
from .graders import detect_violations, grade_episode, step_reward
|
| 18 |
+
from .models import (
|
| 19 |
+
ActionKind,
|
| 20 |
+
DebugAction,
|
| 21 |
+
DebugObservation,
|
| 22 |
+
DebugState,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class APIContractDebuggerEnv(Environment[DebugAction, DebugObservation, DebugState]):
    """
    Environment where an agent debugs broken API contract specifications.

    Tasks (difficulty):
        easy   — 1 endpoint,  1 missing field
        medium — 3 endpoints, 3 violations (type errors + wrong status)
        hard   — 4 endpoints, 6 violations (missing fields, wrong types,
                 wrong status, forbidden extra field)

    Action space:
        DebugAction with kind in {add_field, remove_field, change_type,
        change_status, no_op}

    Observation space:
        DebugObservation — current endpoints + violation list + reward signals

    Reward:
        Dense per-step: +0.2×severity per violation fixed, -0.15×severity per
        violation introduced, -0.05 for malformed action, and a +0.5 bonus on
        the step that leaves no violations.
        Episode terminates when all violations are resolved or max_steps reached.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = False

    def __init__(self, task_name: str = "easy") -> None:
        """
        Bind the environment to one task from the TASKS registry.

        Raises:
            ValueError: if ``task_name`` is not a key of TASKS.
        """
        super().__init__()
        if task_name not in TASKS:
            raise ValueError(
                f"Unknown task '{task_name}'. Choose from: {list(TASKS.keys())}"
            )
        self._task_name = task_name
        self._task_cfg = TASKS[task_name]

        # Internal state (populated on reset)
        self._current_endpoints: List[Dict[str, Any]] = []
        self._golden_endpoints: List[Dict[str, Any]] = []
        self._original_endpoints: List[Dict[str, Any]] = []
        self._violations: List[Dict[str, Any]] = []
        self._initial_violations: List[Dict[str, Any]] = []
        self._step_count: int = 0
        # None until the first reset(); step() uses this to detect "never reset".
        self._episode_id: Optional[str] = None
        self._done: bool = False

    # ------------------------------------------------------------------
    # OpenEnv API
    # ------------------------------------------------------------------

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        task_name: Optional[str] = None,
        **kwargs: Any,
    ) -> DebugObservation:
        """
        Reset the environment and return the initial observation.

        Args:
            seed: accepted for interface compatibility; not used by this method.
            episode_id: identifier for the new episode; a UUID4 string is
                generated when omitted.
            task_name: optionally switch to another task. A name that is not
                in TASKS is ignored and the current task is kept.
            **kwargs: ignored.
        """
        if task_name and task_name in TASKS:
            self._task_name = task_name
            self._task_cfg = TASKS[task_name]

        self._episode_id = episode_id or str(uuid.uuid4())
        self._step_count = 0
        self._done = False

        # Deep-copy fixtures so mutations don't bleed across episodes
        self._current_endpoints = copy.deepcopy(self._task_cfg["broken_endpoints"])
        self._golden_endpoints = copy.deepcopy(self._task_cfg["golden_endpoints"])
        self._original_endpoints = copy.deepcopy(self._task_cfg["broken_endpoints"])

        self._violations = detect_violations(
            self._current_endpoints, self._golden_endpoints
        )
        self._initial_violations = copy.deepcopy(self._violations)

        return self._make_observation(
            reward=0.0,
            done=False,
            fixed_this_step=0,
            introduced_this_step=0,
            action_error=None,
        )

    def step(
        self,
        action: DebugAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> DebugObservation:
        """
        Apply one fix action and return the updated observation.

        Args:
            action: the edit to apply; ``ActionKind.NO_OP`` leaves the spec
                untouched but still consumes a step.
            timeout_s: accepted for interface compatibility; not used here.
            **kwargs: ignored.

        Returns:
            The observation after the action. When the episode is already
            finished, or reset() has never been called, the spec is left
            untouched and the observation carries ``done=True``, zero reward
            and an explanatory ``last_action_error``.
        """
        if self._episode_id is None:
            # Without a reset the specs are empty, which would otherwise look
            # like "all violations fixed" and pay out the completion bonus.
            return self._make_observation(
                reward=0.0,
                done=True,
                fixed_this_step=0,
                introduced_this_step=0,
                action_error="Environment has not been reset. Call reset() first.",
            )

        if self._done:
            return self._make_observation(
                reward=0.0,
                done=True,
                fixed_this_step=0,
                introduced_this_step=0,
                action_error="Episode is already done. Call reset().",
            )

        self._step_count += 1
        # detect_violations() below builds a fresh list, so keeping a reference
        # to the previous one is enough; nothing mutates it in place.
        prev_violations = self._violations
        action_error: Optional[str] = None

        # --- Apply the action ---
        # NO_OP: agent explicitly passes — small implicit penalty via no reward
        if action.kind != ActionKind.NO_OP:
            action_error = self._apply_action(action)

        # --- Recompute violations ---
        self._violations = detect_violations(
            self._current_endpoints, self._golden_endpoints
        )

        # --- Compute reward ---
        reward = step_reward(
            prev_violations=prev_violations,
            new_violations=self._violations,
            initial_violations=self._initial_violations,
            action_error=(action_error is not None),
        )

        # Count changes by violation identity rather than whole-dict equality:
        # descriptions embed the current (wrong) value, so a violation that is
        # still present but reads differently must not count as fixed + new.
        prev_keys = {self._violation_key(v) for v in prev_violations}
        new_keys = {self._violation_key(v) for v in self._violations}
        fixed_this_step = len(prev_keys - new_keys)
        introduced_this_step = len(new_keys - prev_keys)

        # --- Termination ---
        max_steps = self._task_cfg["max_steps"]
        all_fixed = not self._violations
        out_of_steps = self._step_count >= max_steps
        self._done = all_fixed or out_of_steps

        # Bonus reward for solving all violations
        if all_fixed:
            reward += 0.5

        return self._make_observation(
            reward=reward,
            done=self._done,
            fixed_this_step=fixed_this_step,
            introduced_this_step=introduced_this_step,
            action_error=action_error,
        )

    @property
    def state(self) -> DebugState:
        """Return a snapshot of the full internal environment state."""
        # Copies keep callers from mutating the live episode through the
        # snapshot, matching what _make_observation() does for observations.
        return DebugState(
            episode_id=self._episode_id,
            step_count=self._step_count,
            task_name=self._task_name,
            original_endpoints=copy.deepcopy(self._original_endpoints),
            current_endpoints=copy.deepcopy(self._current_endpoints),
            golden_endpoints=copy.deepcopy(self._golden_endpoints),
            violations=copy.deepcopy(self._violations),
            total_violations_at_start=len(self._initial_violations),
            max_steps=self._task_cfg["max_steps"],
        )

    def get_metadata(self):
        """Return the static name, description and version of this environment."""
        from openenv.core.env_server.types import EnvironmentMetadata

        return EnvironmentMetadata(
            name="APIContractDebugger",
            description=(
                "An environment where an AI agent debugs broken OpenAPI-style "
                "contract specifications by proposing targeted field-level fixes."
            ),
            version="1.0.0",
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _violation_key(violation: Dict[str, Any]) -> tuple:
        """Identity of a violation: where it is and what kind, ignoring its text."""
        return (
            violation["endpoint_index"],
            violation["location"],
            violation.get("field_name"),
            violation["violation_type"],
        )

    def _apply_action(self, action: DebugAction) -> Optional[str]:
        """
        Mutate self._current_endpoints according to the action.

        Returns an error string if the action is invalid, else None.
        """
        idx = action.endpoint_index
        if idx < 0 or idx >= len(self._current_endpoints):
            return (
                f"endpoint_index {idx} is out of range "
                f"(0–{len(self._current_endpoints) - 1})"
            )

        endpoint = self._current_endpoints[idx]

        if action.kind == ActionKind.CHANGE_STATUS:
            # bool is a subclass of int, so True/False must be rejected explicitly.
            if isinstance(action.new_value, bool) or not isinstance(action.new_value, int):
                return "CHANGE_STATUS requires new_value to be an integer HTTP status code"
            endpoint["status_code"] = action.new_value
            return None

        # For field-level actions, validate location
        if action.location not in ("request_body", "response_body"):
            return (
                f"location must be 'request_body' or 'response_body', "
                f"got '{action.location}'"
            )

        body: Dict[str, Any] = endpoint.setdefault(action.location, {})
        field = action.field_name

        if action.kind == ActionKind.ADD_FIELD:
            if not field:
                return "ADD_FIELD requires a non-empty field_name"
            if not isinstance(action.new_value, dict) or "type" not in action.new_value:
                return "ADD_FIELD requires new_value to be a dict with a 'type' key"
            # Store a private copy so a later CHANGE_TYPE cannot reach back
            # into the caller's action payload.
            body[field] = copy.deepcopy(action.new_value)
            return None

        if action.kind == ActionKind.REMOVE_FIELD:
            if not field:
                return "REMOVE_FIELD requires a non-empty field_name"
            if field not in body:
                return f"field '{field}' does not exist in {action.location}"
            del body[field]
            return None

        if action.kind == ActionKind.CHANGE_TYPE:
            if not field:
                return "CHANGE_TYPE requires a non-empty field_name"
            if field not in body:
                return f"field '{field}' does not exist in {action.location}"
            if not isinstance(action.new_value, str):
                return "CHANGE_TYPE requires new_value to be a type string"
            body[field]["type"] = action.new_value
            return None

        return f"Unknown action kind: {action.kind}"

    def _make_observation(
        self,
        reward: float,
        done: bool,
        fixed_this_step: int,
        introduced_this_step: int,
        action_error: Optional[str],
    ) -> DebugObservation:
        """Build an observation from the current state plus the given step results."""
        return DebugObservation(
            task_name=self._task_name,
            task_description=self._task_cfg["description"],
            # Copies so the consumer cannot mutate the live episode.
            endpoints=copy.deepcopy(self._current_endpoints),
            violations=copy.deepcopy(self._violations),
            violations_fixed_this_step=fixed_this_step,
            violations_introduced_this_step=introduced_this_step,
            total_violations_at_start=len(self._initial_violations),
            step_count=self._step_count,
            max_steps=self._task_cfg["max_steps"],
            last_action_error=action_error,
            reward=reward,
            done=done,
        )

    def score(self) -> float:
        """Final episode score in [0.0, 1.0]. Call after episode ends."""
        return grade_episode(
            self._current_endpoints,
            self._golden_endpoints,
            self._initial_violations,
        )
|
server/fixtures.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task fixtures for the API Contract Debugger environment.
|
| 3 |
+
|
| 4 |
+
Each task is a dict with:
|
| 5 |
+
- name: str
|
| 6 |
+
- description: str
|
| 7 |
+
- broken_endpoints: list[dict] — what the agent starts with
|
| 8 |
+
- golden_endpoints: list[dict] — the correct spec the grader checks against
|
| 9 |
+
- max_steps: int
|
| 10 |
+
|
| 11 |
+
Endpoint schema:
|
| 12 |
+
{
|
| 13 |
+
"method": str,
|
| 14 |
+
"path": str,
|
| 15 |
+
"status_code": int,
|
| 16 |
+
"request_body": {
|
| 17 |
+
"<field>": {"type": str, "required": bool, "description": str}
|
| 18 |
+
},
|
| 19 |
+
"response_body": {
|
| 20 |
+
"<field>": {"type": str, "required": bool, "description": str}
|
| 21 |
+
}
|
| 22 |
+
}
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import copy
|
| 28 |
+
from typing import Any, Dict, List
|
| 29 |
+
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
# Task 1 — EASY
|
| 32 |
+
# Single endpoint. One missing required field in the response.
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
# Correct contract for the single registration endpoint. Each body is built
# from (field name, type, description) rows; every field here is required.
_TASK1_GOLDEN: List[Dict[str, Any]] = [
    {
        "method": "POST",
        "path": "/users/register",
        "status_code": 201,
        "request_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("username", "string", "Desired username"),
                ("email", "string", "User email address"),
                ("password", "string", "Plaintext password"),
            )
        },
        "response_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("user_id", "integer", "Created user ID"),
                ("username", "string", "Confirmed username"),
                ("created_at", "string", "ISO-8601 timestamp"),
            )
        },
    }
]

# The broken variant is the golden spec with "created_at" dropped from the
# response; the deep copy keeps the golden spec itself untouched.
_TASK1_BROKEN: List[Dict[str, Any]] = copy.deepcopy(_TASK1_GOLDEN)
_TASK1_BROKEN[0]["response_body"].pop("created_at")

TASK_EASY: Dict[str, Any] = {
    "name": "easy",
    "description": (
        "A user registration endpoint is missing a required field in its response. "
        "The response should include user_id (integer), username (string), and "
        "created_at (string). Find and add the missing field."
    ),
    "broken_endpoints": _TASK1_BROKEN,
    "golden_endpoints": _TASK1_GOLDEN,
    "max_steps": 5,
}
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# ---------------------------------------------------------------------------
|
| 71 |
+
# Task 2 — MEDIUM
|
| 72 |
+
# Three endpoints. Type mismatches and a wrong status code.
|
| 73 |
+
# ---------------------------------------------------------------------------
|
| 74 |
+
|
| 75 |
+
# Correct contract for the three e-commerce endpoints. Bodies are built from
# (field name, type, description) rows; every field here is required.
_TASK2_GOLDEN: List[Dict[str, Any]] = [
    {
        "method": "GET",
        "path": "/products/{id}",
        "status_code": 200,
        "request_body": {},
        "response_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("product_id", "integer", "Product ID"),
                ("name", "string", "Product name"),
                ("price", "number", "Price in USD"),
                ("in_stock", "boolean", "Availability"),
            )
        },
    },
    {
        "method": "POST",
        "path": "/orders",
        "status_code": 201,
        "request_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("product_id", "integer", "Product to order"),
                ("quantity", "integer", "Number of units"),
                ("customer_id", "integer", "Buyer ID"),
            )
        },
        "response_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("order_id", "integer", "Created order ID"),
                ("total_price", "number", "Total cost"),
                ("status", "string", "Order status"),
            )
        },
    },
    {
        "method": "DELETE",
        "path": "/orders/{id}",
        "status_code": 204,
        "request_body": {},
        "response_body": {},
    },
]

# The broken variant starts as a deep copy of the golden spec and receives
# three independent defects, one per endpoint.
_TASK2_BROKEN: List[Dict[str, Any]] = copy.deepcopy(_TASK2_GOLDEN)
_products_get, _orders_post, _orders_delete = _TASK2_BROKEN
# Violation 1: GET /products/{id} response reports product_id as a string.
_products_get["response_body"]["product_id"]["type"] = "string"
# Violation 2: POST /orders request accepts quantity as a string.
_orders_post["request_body"]["quantity"]["type"] = "string"
# Violation 3: DELETE /orders/{id} answers 200 instead of 204.
_orders_delete["status_code"] = 200

TASK_MEDIUM: Dict[str, Any] = {
    "name": "medium",
    "description": (
        "An e-commerce API has three endpoints with contract violations: "
        "(1) GET /products/{id} returns product_id as string instead of integer, "
        "(2) POST /orders accepts quantity as string instead of integer, "
        "(3) DELETE /orders/{id} returns status 200 instead of 204. "
        "Fix all three violations."
    ),
    "broken_endpoints": _TASK2_BROKEN,
    "golden_endpoints": _TASK2_GOLDEN,
    "max_steps": 10,
}
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# ---------------------------------------------------------------------------
|
| 137 |
+
# Task 3 — HARD
|
| 138 |
+
# Multi-endpoint API. Missing required fields, type errors, wrong status code,
|
| 139 |
+
# AND a forbidden extra field that must be removed.
|
| 140 |
+
# ---------------------------------------------------------------------------
|
| 141 |
+
|
| 142 |
+
# Correct contract for the four auth/profile endpoints. Bodies are built from
# (field name, type, description) rows; all fields are required except the
# PATCH request fields, which are optional.
_TASK3_GOLDEN: List[Dict[str, Any]] = [
    {
        "method": "POST",
        "path": "/auth/login",
        "status_code": 200,
        "request_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("email", "string", "User email"),
                ("password", "string", "User password"),
            )
        },
        "response_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("access_token", "string", "JWT token"),
                ("refresh_token", "string", "Refresh token"),
                ("expires_in", "integer", "TTL in seconds"),
            )
        },
    },
    {
        "method": "GET",
        "path": "/users/{id}/profile",
        "status_code": 200,
        "request_body": {},
        "response_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("user_id", "integer", "User ID"),
                ("email", "string", "User email"),
                ("full_name", "string", "Display name"),
                ("role", "string", "User role"),
                ("created_at", "string", "ISO-8601 timestamp"),
            )
        },
    },
    {
        "method": "PATCH",
        "path": "/users/{id}/profile",
        "status_code": 200,
        "request_body": {
            name: {"type": kind, "required": False, "description": text}
            for name, kind, text in (
                ("full_name", "string", "Updated name"),
                ("email", "string", "Updated email"),
            )
        },
        "response_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("user_id", "integer", "User ID"),
                ("full_name", "string", "Updated name"),
                ("email", "string", "Updated email"),
                ("updated_at", "string", "ISO-8601 timestamp"),
            )
        },
    },
    {
        "method": "POST",
        "path": "/auth/refresh",
        "status_code": 200,
        "request_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("refresh_token", "string", "Refresh token"),
            )
        },
        "response_body": {
            name: {"type": kind, "required": True, "description": text}
            for name, kind, text in (
                ("access_token", "string", "New JWT token"),
                ("expires_in", "integer", "TTL in seconds"),
            )
        },
    },
]

# The broken variant starts as a deep copy of the golden spec and receives six
# defects across the first three endpoints; POST /auth/refresh stays correct.
_TASK3_BROKEN: List[Dict[str, Any]] = copy.deepcopy(_TASK3_GOLDEN)
_login_response = _TASK3_BROKEN[0]["response_body"]
_profile_response = _TASK3_BROKEN[1]["response_body"]
_patch_endpoint = _TASK3_BROKEN[2]
# Violation 1: /auth/login response loses refresh_token.
_login_response.pop("refresh_token")
# Violation 2: /auth/login response reports expires_in as a string.
_login_response["expires_in"]["type"] = "string"
# Violation 3: GET /users/{id}/profile response loses created_at.
_profile_response.pop("created_at")
# Violation 4: GET /users/{id}/profile response exposes a forbidden field.
_profile_response["password_hash"] = {
    "type": "string",
    "required": False,
    "description": "Hashed password — MUST NOT be exposed",
}
# Violation 5: PATCH /users/{id}/profile answers 500 instead of 200.
_patch_endpoint["status_code"] = 500
# Violation 6: PATCH response loses updated_at.
_patch_endpoint["response_body"].pop("updated_at")

TASK_HARD: Dict[str, Any] = {
    "name": "hard",
    "description": (
        "An authentication + profile API has 6 contract violations across 4 endpoints: "
        "(1) POST /auth/login is missing refresh_token in response, "
        "(2) POST /auth/login returns expires_in as string instead of integer, "
        "(3) GET /users/{id}/profile is missing created_at in response, "
        "(4) GET /users/{id}/profile exposes a forbidden password_hash field that must be removed, "
        "(5) PATCH /users/{id}/profile returns status 500 instead of 200, "
        "(6) PATCH /users/{id}/profile is missing updated_at in response. "
        "Fix all violations."
    ),
    "broken_endpoints": _TASK3_BROKEN,
    "golden_endpoints": _TASK3_GOLDEN,
    "max_steps": 15,
}
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# ---------------------------------------------------------------------------
|
| 234 |
+
# Registry
|
| 235 |
+
# ---------------------------------------------------------------------------
|
| 236 |
+
|
| 237 |
+
# Lookup table from task name ("easy", "medium", "hard") to its task dict,
# keyed by each task's own "name" entry.
TASKS: Dict[str, Dict[str, Any]] = {
    task["name"]: task for task in (TASK_EASY, TASK_MEDIUM, TASK_HARD)
}
|
server/graders.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Violation detection and graders for the API Contract Debugger environment.
|
| 3 |
+
|
| 4 |
+
detect_violations(current, golden) → list of violation dicts
|
| 5 |
+
grade_episode(current, golden, initial_violations) → float in [0.0, 1.0]
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import copy
|
| 11 |
+
from typing import Any, Dict, List
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# ---------------------------------------------------------------------------
|
| 15 |
+
# Violation detection
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
|
| 18 |
+
def detect_violations(
    current_endpoints: List[Dict[str, Any]],
    golden_endpoints: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Diff the current spec against the golden spec and list every mismatch.

    Endpoints are paired by position. For each pair the status code is checked
    first, then the request body, then the response body. Within a body, golden
    fields are visited in order (reporting a missing field or a wrong type),
    followed by fields that exist only in the current spec.

    Each violation is a dict with these keys:
        endpoint_index  int   — index into endpoint list
        location        str   — "request_body" | "response_body" | "status_code"
        field_name      str|None
        violation_type  str   — "missing_field" | "extra_field" | "wrong_type" | "wrong_status"
        description     str   — human-readable explanation
        severity        float — weight used in scoring (0.0–1.0)
    """
    found: List[Dict[str, Any]] = []

    def record(index, where, field, kind, text, weight):
        # Single place that fixes the key order of a violation dict.
        found.append({
            "endpoint_index": index,
            "location": where,
            "field_name": field,
            "violation_type": kind,
            "description": text,
            "severity": weight,
        })

    for position, (actual, expected) in enumerate(
        zip(current_endpoints, golden_endpoints)
    ):
        # Status code mismatch.
        if actual.get("status_code") != expected.get("status_code"):
            record(
                position,
                "status_code",
                None,
                "wrong_status",
                f"{expected['method']} {expected['path']}: "
                f"status_code is {actual.get('status_code')} "
                f"but should be {expected.get('status_code')}",
                0.8,
            )

        for section in ("request_body", "response_body"):
            actual_fields: Dict[str, Any] = actual.get(section, {})
            expected_fields: Dict[str, Any] = expected.get(section, {})

            # Golden fields: either absent from the current spec or typed wrongly.
            for name, expected_spec in expected_fields.items():
                if name in actual_fields:
                    actual_type = actual_fields[name].get("type")
                    expected_type = expected_spec.get("type")
                    if actual_type != expected_type:
                        record(
                            position,
                            section,
                            name,
                            "wrong_type",
                            f"{expected['method']} {expected['path']} {section}: "
                            f"field '{name}' has type '{actual_type}' "
                            f"but should be '{expected_type}'",
                            0.9,
                        )
                else:
                    record(
                        position,
                        section,
                        name,
                        "missing_field",
                        f"{expected['method']} {expected['path']} {section}: "
                        f"required field '{name}' ({expected_spec['type']}) is missing",
                        1.0,
                    )

            # Fields present in the current spec that the golden spec does not allow.
            for name in actual_fields:
                if name not in expected_fields:
                    record(
                        position,
                        section,
                        name,
                        "extra_field",
                        f"{expected['method']} {expected['path']} {section}: "
                        f"field '{name}' is present but should not be in the contract",
                        0.7,
                    )

    return found
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# ---------------------------------------------------------------------------
|
| 107 |
+
# Grader
|
| 108 |
+
# ---------------------------------------------------------------------------
|
| 109 |
+
|
| 110 |
+
def grade_episode(
    current_endpoints: List[Dict[str, Any]],
    golden_endpoints: List[Dict[str, Any]],
    initial_violations: List[Dict[str, Any]],
) -> float:
    """
    Score the agent's performance at the END of an episode.

    Args:
        current_endpoints: the spec as the agent left it.
        golden_endpoints: the correct spec.
        initial_violations: the violations present when the episode started.

    Returns a float in [0.0, 1.0]:
        1.0  — all violations fixed, no new ones introduced
        0.0  — no improvement at all
        intermediate — partial credit weighted by severity

    Formula:
        score = (weighted_fixed - weighted_introduced) / total_initial_weight
        clamped to [0.0, 1.0]

    When there was nothing to fix (zero initial weight) the score is 1.0 if
    the spec is still free of new violations and 0.0 otherwise.
    """
    remaining = detect_violations(current_endpoints, golden_endpoints)
    remaining_keys = _violation_keys(remaining)

    initial_keys = _violation_keys(initial_violations)

    # Violations that were present at start and are now gone = fixed
    fixed = [v for v in initial_violations if _vkey(v) not in remaining_keys]
    # Violations that are present now but weren't at start = newly introduced
    introduced = [v for v in remaining if _vkey(v) not in initial_keys]

    total_initial_weight = sum(v["severity"] for v in initial_violations)
    if total_initial_weight == 0:
        # The spec started clean, so the ratio below is undefined. A clean
        # start only earns full marks if the agent did not break anything.
        return 0.0 if introduced else 1.0

    weighted_fixed = sum(v["severity"] for v in fixed)
    weighted_introduced = sum(v["severity"] for v in introduced)

    raw = (weighted_fixed - weighted_introduced) / total_initial_weight
    return float(max(0.0, min(1.0, raw)))
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def step_reward(
    prev_violations: List[Dict[str, Any]],
    new_violations: List[Dict[str, Any]],
    initial_violations: List[Dict[str, Any]],
    action_error: bool,
) -> float:
    """
    Dense per-step reward signal.

    A malformed action (out-of-range index, bad field, etc.) yields a flat
    -0.05 and nothing else is considered. Otherwise each violation resolved
    this step adds 0.2 × its severity and each violation introduced this step
    subtracts 0.15 × its severity; the total is rounded to 4 decimals.

    Violations are matched between the two lists by their identity key, not by
    their description text. ``initial_violations`` is accepted but not used in
    the computation.
    """
    if action_error:
        return -0.05

    before = _violation_keys(prev_violations)
    after = _violation_keys(new_violations)

    total = 0.0
    # Credit for everything that was present before and is gone now.
    for violation in prev_violations:
        if _vkey(violation) not in after:
            total += 0.2 * violation["severity"]
    # Penalty for everything that is present now and was not before.
    for violation in new_violations:
        if _vkey(violation) not in before:
            total -= 0.15 * violation["severity"]

    return round(total, 4)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# ---------------------------------------------------------------------------
|
| 180 |
+
# Helpers
|
| 181 |
+
# ---------------------------------------------------------------------------
|
| 182 |
+
|
| 183 |
+
def _vkey(v: Dict[str, Any]) -> tuple:
|
| 184 |
+
return (
|
| 185 |
+
v["endpoint_index"],
|
| 186 |
+
v["location"],
|
| 187 |
+
v.get("field_name"),
|
| 188 |
+
v["violation_type"],
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def _violation_keys(violations: List[Dict[str, Any]]) -> set:
|
| 193 |
+
return {_vkey(v) for v in violations}
|
server/models.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Typed Pydantic models for the API Contract Debugger environment.
|
| 3 |
+
|
| 4 |
+
The environment gives an agent a broken OpenAPI-style spec and asks it to
|
| 5 |
+
fix contract violations by proposing targeted field-level corrections.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from enum import Enum
|
| 11 |
+
from typing import Any, Dict, List, Optional
|
| 12 |
+
|
| 13 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 14 |
+
from pydantic import Field
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# ---------------------------------------------------------------------------
|
| 18 |
+
# Domain types
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
|
| 21 |
+
class FieldType(str, Enum):
    """
    Primitive JSON Schema type names a field may declare.

    Members are also ``str`` instances, so they compare equal to their
    plain string value.
    """

    STRING = "string"
    INTEGER = "integer"
    NUMBER = "number"
    BOOLEAN = "boolean"
    ARRAY = "array"
    OBJECT = "object"
    NULL = "null"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class HttpMethod(str, Enum):
    """HTTP verbs; each member's value is its own upper-case name."""

    GET = "GET"
    POST = "POST"
    PUT = "PUT"
    PATCH = "PATCH"
    DELETE = "DELETE"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class ActionKind(str, Enum):
    """The kind of correction an agent action proposes."""

    # Add a missing required field
    ADD_FIELD = "add_field"
    # Remove a forbidden/extra field
    REMOVE_FIELD = "remove_field"
    # Fix a field's type
    CHANGE_TYPE = "change_type"
    # Fix an HTTP status code
    CHANGE_STATUS = "change_status"
    # Agent explicitly passes this step
    NO_OP = "no_op"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
# API Spec domain models (not OpenEnv base classes)
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
class FieldSpec(dict):
    """
    JSON Schema-like definition of a single field.

    A plain ``dict`` subclass with no added behaviour, kept as a dict for
    flexibility.
    """
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class EndpointSpec(dict):
    """
    Definition of one endpoint: method, path, request_body, response.

    A plain ``dict`` subclass with no added behaviour.
    """
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
# OpenEnv Action
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
|
| 67 |
+
class DebugAction(Action):
    """
    One fix proposed by the agent.

    Each action addresses a single endpoint (by index) and a single location
    within it, and carries exactly one change described by ``kind``.
    """

    # Which correction is being applied (required).
    kind: ActionKind = Field(..., description="The type of fix being applied")

    # Target endpoint; the model itself only enforces a non-negative value.
    endpoint_index: int = Field(..., ge=0, description="0-based index into the endpoint list")

    # Target section of the endpoint (free-form string at the model level).
    location: str = Field(
        ...,
        description=(
            "Where in the endpoint to apply the fix. "
            "One of: 'request_body', 'response_body', 'status_code'"
        ),
    )

    # Target field, when the fix concerns a field.
    field_name: Optional[str] = Field(
        default=None,
        description="Field name to add/remove/change (null for status_code fixes)",
    )

    # Payload of the fix; its expected shape depends on ``kind``.
    new_value: Optional[Any] = Field(
        default=None,
        description=(
            "The corrected value. "
            "For CHANGE_TYPE: a FieldType string. "
            "For ADD_FIELD: a dict with 'type' (and optional 'description'). "
            "For CHANGE_STATUS: an integer HTTP status code. "
            "For REMOVE_FIELD / NO_OP: null."
        ),
    )
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# ---------------------------------------------------------------------------
|
| 107 |
+
# OpenEnv Observation
|
| 108 |
+
# ---------------------------------------------------------------------------
|
| 109 |
+
|
| 110 |
+
class Violation(dict):
    """
    Describes a single detected contract violation.

    A plain ``dict`` subclass with no added behaviour.

    Keys: endpoint_index, location, field_name, violation_type, description,
    severity

    ``severity`` is the numeric weight the graders read as ``v["severity"]``
    when scoring an episode and shaping per-step rewards.
    """
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class DebugObservation(Observation):
    """
    The agent-facing view returned by each reset() / step().
    """

    # --- task identity -----------------------------------------------------
    task_name: str = Field(
        ..., description="Which task is currently active (easy / medium / hard)"
    )
    task_description: str = Field(
        ..., description="Human-readable description of the task objective"
    )

    # --- current spec and what is still wrong with it ------------------------
    endpoints: List[Dict[str, Any]] = Field(
        ..., description="Current (potentially partially-fixed) endpoint specs"
    )
    violations: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="List of detected violations still present in the spec",
    )

    # --- effect of the most recent action ------------------------------------
    violations_fixed_this_step: int = Field(
        default=0, description="How many violations the last action resolved"
    )
    violations_introduced_this_step: int = Field(
        default=0, description="How many new violations the last action introduced"
    )

    # --- episode progress ----------------------------------------------------
    total_violations_at_start: int = Field(
        ..., description="Number of violations at episode start (for progress tracking)"
    )
    step_count: int = Field(default=0, description="Steps taken so far in this episode")
    max_steps: int = Field(default=10, description="Maximum steps allowed per episode")

    # --- diagnostics ---------------------------------------------------------
    last_action_error: Optional[str] = Field(
        default=None,
        description="Error message if the last action was malformed / out-of-range",
    )
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
# ---------------------------------------------------------------------------
|
| 167 |
+
# OpenEnv State
|
| 168 |
+
# ---------------------------------------------------------------------------
|
| 169 |
+
|
| 170 |
+
class DebugState(State):
    """
    Complete internal environment state (not exposed to the agent by default).
    """

    # Name of the active task.
    task_name: str = Field(default="")

    # Three endpoint lists: the original, the current working copy, and the
    # golden reference.
    original_endpoints: List[Dict[str, Any]] = Field(default_factory=list)
    current_endpoints: List[Dict[str, Any]] = Field(default_factory=list)
    golden_endpoints: List[Dict[str, Any]] = Field(default_factory=list)

    # Violations currently tracked, plus the count recorded at episode start.
    violations: List[Dict[str, Any]] = Field(default_factory=list)
    total_violations_at_start: int = Field(default=0)

    # Step budget for the episode.
    max_steps: int = Field(default=10)
|
tests/__pycache__/test_env.cpython-314-pytest-9.0.2.pyc
ADDED
|
Binary file (90.1 kB). View file
|
|
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,565 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test suite for the API Contract Debugger environment.
|
| 3 |
+
|
| 4 |
+
Coverage:
|
| 5 |
+
- Violation detection (all violation types)
|
| 6 |
+
- Grader scoring
|
| 7 |
+
- Per-step reward shaping
|
| 8 |
+
- Environment reset / step / state
|
| 9 |
+
- All three tasks end-to-end
|
| 10 |
+
- Edge cases: malformed actions, double-fix, already-clean spec
|
| 11 |
+
- HTTP API routes (via TestClient)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import copy
|
| 17 |
+
import sys
|
| 18 |
+
import os
|
| 19 |
+
|
| 20 |
+
import pytest
|
| 21 |
+
|
| 22 |
+
# Make sure the project root is on the path
|
| 23 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 24 |
+
|
| 25 |
+
from server.fixtures import TASK_EASY, TASK_HARD, TASK_MEDIUM, TASKS
|
| 26 |
+
from server.graders import detect_violations, grade_episode, step_reward
|
| 27 |
+
from server.models import ActionKind, DebugAction
|
| 28 |
+
from server.environment import APIContractDebuggerEnv
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ===========================================================================
|
| 32 |
+
# Helpers
|
| 33 |
+
# ===========================================================================
|
| 34 |
+
|
| 35 |
+
def make_env(task: str = "easy") -> APIContractDebuggerEnv:
    """Create an environment for ``task`` and reset it before handing it back."""
    environment = APIContractDebuggerEnv(task_name=task)
    environment.reset()
    return environment
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def action(**kwargs) -> DebugAction:
    """
    Build a DebugAction from NO_OP defaults.

    Any keyword argument overrides the matching default (or is passed
    through as an additional field).
    """
    params = {
        "kind": ActionKind.NO_OP,
        "endpoint_index": 0,
        "location": "response_body",
        "field_name": None,
        "new_value": None,
        **kwargs,
    }
    return DebugAction(**params)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ===========================================================================
|
| 54 |
+
# 1. Fixture sanity
|
| 55 |
+
# ===========================================================================
|
| 56 |
+
|
| 57 |
+
class TestFixtures:
    """Sanity checks on the bundled easy / medium / hard task fixtures."""

    def test_all_tasks_present(self):
        expected_names = {"easy", "medium", "hard"}
        assert set(TASKS.keys()) == expected_names

    def test_easy_has_violations(self):
        found = detect_violations(
            TASK_EASY["broken_endpoints"], TASK_EASY["golden_endpoints"]
        )
        assert len(found) == 1

    def test_medium_has_three_violations(self):
        found = detect_violations(
            TASK_MEDIUM["broken_endpoints"], TASK_MEDIUM["golden_endpoints"]
        )
        assert len(found) == 3

    def test_hard_has_six_violations(self):
        found = detect_violations(
            TASK_HARD["broken_endpoints"], TASK_HARD["golden_endpoints"]
        )
        assert len(found) == 6

    def test_golden_specs_are_clean(self):
        # A golden spec compared against itself must report nothing.
        for task in TASKS.values():
            v = detect_violations(task["golden_endpoints"], task["golden_endpoints"])
            assert v == [], f"Golden spec for '{task['name']}' has violations: {v}"

    def test_broken_and_golden_same_length(self):
        for task in TASKS.values():
            broken_count = len(task["broken_endpoints"])
            golden_count = len(task["golden_endpoints"])
            assert broken_count == golden_count
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ===========================================================================
|
| 84 |
+
# 2. Violation detection
|
| 85 |
+
# ===========================================================================
|
| 86 |
+
|
| 87 |
+
class TestViolationDetection:
    """One test per violation type reported by ``detect_violations``."""

    @staticmethod
    def _spec(response_body=None, *, method="GET", status_code=200):
        """Single-endpoint spec list for ``/x`` with an empty request body."""
        return [{
            "method": method,
            "path": "/x",
            "status_code": status_code,
            "request_body": {},
            "response_body": {} if response_body is None else response_body,
        }]

    def test_missing_field_detected(self):
        current = self._spec()
        golden = self._spec(
            {"id": {"type": "integer", "required": True, "description": ""}}
        )
        found = detect_violations(current, golden)
        assert len(found) == 1
        only = found[0]
        assert only["violation_type"] == "missing_field"
        assert only["field_name"] == "id"

    def test_extra_field_detected(self):
        current = self._spec(
            {"secret": {"type": "string", "required": False, "description": ""}}
        )
        golden = self._spec()
        found = detect_violations(current, golden)
        assert len(found) == 1
        assert found[0]["violation_type"] == "extra_field"

    def test_wrong_type_detected(self):
        current = self._spec(
            {"count": {"type": "string", "required": True, "description": ""}}
        )
        golden = self._spec(
            {"count": {"type": "integer", "required": True, "description": ""}}
        )
        found = detect_violations(current, golden)
        assert len(found) == 1
        assert found[0]["violation_type"] == "wrong_type"

    def test_wrong_status_detected(self):
        current = self._spec(method="DELETE", status_code=200)
        golden = self._spec(method="DELETE", status_code=204)
        found = detect_violations(current, golden)
        assert len(found) == 1
        assert found[0]["violation_type"] == "wrong_status"

    def test_no_violations_on_matching_spec(self):
        reference = TASK_EASY["golden_endpoints"]
        assert detect_violations(reference, reference) == []

    def test_violation_severity_range(self):
        found = detect_violations(
            TASK_HARD["broken_endpoints"], TASK_HARD["golden_endpoints"]
        )
        for viol in found:
            assert 0.0 < viol["severity"] <= 1.0
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ===========================================================================
|
| 145 |
+
# 3. Grader scoring
|
| 146 |
+
# ===========================================================================
|
| 147 |
+
|
| 148 |
+
class TestGrader:
    """Episode-level scoring via ``grade_episode``."""

    def test_perfect_score_when_all_fixed(self):
        golden = TASK_EASY["golden_endpoints"]
        initial = detect_violations(TASK_EASY["broken_endpoints"], golden)
        # Final spec equals the golden spec, so everything counts as fixed.
        assert grade_episode(golden, golden, initial) == pytest.approx(1.0)

    def test_zero_score_when_nothing_fixed(self):
        broken, golden = TASK_EASY["broken_endpoints"], TASK_EASY["golden_endpoints"]
        initial = detect_violations(broken, golden)
        assert grade_episode(broken, golden, initial) == pytest.approx(0.0)

    def test_partial_score_medium(self):
        working = copy.deepcopy(TASK_MEDIUM["broken_endpoints"])
        golden = TASK_MEDIUM["golden_endpoints"]
        initial = detect_violations(working, golden)

        # Fix only violation 1: product_id type
        working[0]["response_body"]["product_id"]["type"] = "integer"

        score = grade_episode(working, golden, initial)
        assert 0.0 < score < 1.0

    def test_score_clamped_to_zero_when_extra_violations_introduced(self):
        working = copy.deepcopy(TASK_EASY["broken_endpoints"])
        golden = TASK_EASY["golden_endpoints"]
        initial = detect_violations(working, golden)

        # Introduce more violations
        response = working[0]["response_body"]
        response["user_id"]["type"] = "string"
        response["username"]["type"] = "boolean"

        score = grade_episode(working, golden, initial)
        assert score == 0.0

    def test_score_in_range(self):
        for task in TASKS.values():
            broken, golden = task["broken_endpoints"], task["golden_endpoints"]
            initial = detect_violations(broken, golden)
            score = grade_episode(broken, golden, initial)
            assert 0.0 <= score <= 1.0, f"Out-of-range score for task '{task['name']}'"

    def test_already_clean_spec_scores_one(self):
        golden = TASK_EASY["golden_endpoints"]
        initial: list = []  # no violations at start
        assert grade_episode(golden, golden, initial) == pytest.approx(1.0)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# ===========================================================================
|
| 201 |
+
# 4. Step reward
|
| 202 |
+
# ===========================================================================
|
| 203 |
+
|
| 204 |
+
class TestStepReward:
    """Sign and magnitude checks for the per-step reward."""

    def _make_violation(self, vtype="missing_field", severity=1.0):
        """Minimal violation dict on endpoint 0 / response_body / field 'foo'."""
        return {
            "endpoint_index": 0,
            "location": "response_body",
            "field_name": "foo",
            "violation_type": vtype,
            "description": "test",
            "severity": severity,
        }

    def test_positive_reward_for_fix(self):
        v = self._make_violation()
        reward = step_reward(
            prev_violations=[v],
            new_violations=[],
            initial_violations=[v],
            action_error=False,
        )
        assert reward > 0

    def test_negative_reward_for_introduction(self):
        v = self._make_violation()
        reward = step_reward(
            prev_violations=[],
            new_violations=[v],
            initial_violations=[],
            action_error=False,
        )
        assert reward < 0

    def test_penalty_for_action_error(self):
        reward = step_reward(
            prev_violations=[],
            new_violations=[],
            initial_violations=[],
            action_error=True,
        )
        assert reward == pytest.approx(-0.05)

    def test_zero_reward_for_no_op(self):
        reward = step_reward(
            prev_violations=[],
            new_violations=[],
            initial_violations=[],
            action_error=False,
        )
        assert reward == pytest.approx(0.0)
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
# ===========================================================================
|
| 232 |
+
# 5. Environment — reset
|
| 233 |
+
# ===========================================================================
|
| 234 |
+
|
| 235 |
+
class TestEnvReset:
    """Behaviour of ``reset()`` on the environment."""

    def test_reset_returns_observation(self):
        env = APIContractDebuggerEnv(task_name="easy")
        first = env.reset()
        assert first.task_name == "easy"
        assert len(first.violations) == 1
        assert first.done is False
        assert first.step_count == 0

    def test_reset_clears_state(self):
        env = make_env("easy")
        # Take a step, then reset
        fix = action(
            kind=ActionKind.ADD_FIELD,
            location="response_body",
            field_name="created_at",
            new_value={"type": "string", "required": True, "description": "timestamp"},
        )
        env.step(fix)
        after_reset = env.reset()
        assert after_reset.step_count == 0
        assert len(after_reset.violations) == 1  # back to broken state

    def test_reset_switches_task(self):
        env = APIContractDebuggerEnv(task_name="easy")
        switched = env.reset(task_name="medium")
        assert switched.task_name == "medium"
        assert len(switched.violations) == 3

    def test_reset_preserves_golden(self):
        env = make_env("hard")
        again = env.reset()
        assert again.total_violations_at_start == 6

    def test_episode_id_set_on_reset(self):
        env = APIContractDebuggerEnv(task_name="easy")
        env.reset(episode_id="test-123")
        assert env.state.episode_id == "test-123"
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# ===========================================================================
|
| 275 |
+
# 6. Environment — step mechanics
|
| 276 |
+
# ===========================================================================
|
| 277 |
+
|
| 278 |
+
class TestEnvStep:
    """Mechanics of ``step()``: each action kind, error paths and termination."""

    def test_add_missing_field_fixes_easy(self):
        env = make_env("easy")
        fix = action(
            kind=ActionKind.ADD_FIELD,
            location="response_body",
            field_name="created_at",
            new_value={"type": "string", "required": True, "description": "ISO timestamp"},
        )
        result = env.step(fix)
        assert len(result.violations) == 0
        assert result.done is True
        assert result.reward > 0

    def test_wrong_type_action_introduces_violation(self):
        env = make_env("easy")
        bad_fix = action(
            kind=ActionKind.ADD_FIELD,
            location="response_body",
            field_name="created_at",
            new_value={"type": "integer", "required": True, "description": "wrong type"},
        )
        result = env.step(bad_fix)
        # Still has a violation (wrong type now)
        assert len(result.violations) == 1
        assert result.violations[0]["violation_type"] == "wrong_type"

    def test_out_of_range_endpoint_index(self):
        env = make_env("easy")
        stray = action(
            kind=ActionKind.ADD_FIELD,
            endpoint_index=99,
            field_name="x",
            new_value={"type": "string"},
        )
        result = env.step(stray)
        assert result.last_action_error is not None
        assert "out of range" in result.last_action_error

    def test_change_type_fixes_medium_violation(self):
        env = make_env("medium")
        # Fix violation 1: product_id type string→integer in response
        fix = action(
            kind=ActionKind.CHANGE_TYPE,
            endpoint_index=0,
            location="response_body",
            field_name="product_id",
            new_value="integer",
        )
        result = env.step(fix)
        assert result.violations_fixed_this_step == 1
        assert len(result.violations) == 2  # 2 remaining

    def test_change_status_fixes_medium_violation(self):
        env = make_env("medium")
        fix = action(
            kind=ActionKind.CHANGE_STATUS,
            endpoint_index=2,
            location="status_code",
            new_value=204,
        )
        result = env.step(fix)
        assert result.violations_fixed_this_step == 1

    def test_remove_field_fixes_hard_extra_field(self):
        env = make_env("hard")
        fix = action(
            kind=ActionKind.REMOVE_FIELD,
            endpoint_index=1,
            location="response_body",
            field_name="password_hash",
        )
        result = env.step(fix)
        assert result.violations_fixed_this_step == 1

    def test_no_op_does_not_change_violations(self):
        env = make_env("easy")
        count_before = len(env.state.violations)
        result = env.step(action(kind=ActionKind.NO_OP))
        assert len(result.violations) == count_before

    def test_step_after_done_returns_done(self):
        env = make_env("easy")
        # Solve it
        solve = action(
            kind=ActionKind.ADD_FIELD,
            location="response_body",
            field_name="created_at",
            new_value={"type": "string", "required": True, "description": "ts"},
        )
        env.step(solve)
        # Step again — should get done=True with error message
        result = env.step(action(kind=ActionKind.NO_OP))
        assert result.done is True
        assert result.last_action_error is not None

    def test_max_steps_terminates_episode(self):
        env = APIContractDebuggerEnv(task_name="easy")
        env.reset()
        last = None
        budget = env._task_cfg["max_steps"]
        for _ in range(budget):
            last = env.step(action(kind=ActionKind.NO_OP))
        assert last.done is True

    def test_step_count_increments(self):
        env = make_env("easy")
        for _ in range(2):
            env.step(action(kind=ActionKind.NO_OP))
        assert env.state.step_count == 2
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
# ===========================================================================
|
| 379 |
+
# 7. Environment — state
|
| 380 |
+
# ===========================================================================
|
| 381 |
+
|
| 382 |
+
class TestEnvState:
    """What ``env.state`` exposes after reset and after steps."""

    def test_state_reflects_current_endpoints(self):
        env = make_env("easy")
        snapshot = env.state
        assert len(snapshot.current_endpoints) == 1
        assert snapshot.task_name == "easy"

    def test_state_tracks_step_count(self):
        env = make_env("easy")
        env.step(action(kind=ActionKind.NO_OP))
        assert env.state.step_count == 1

    def test_original_endpoints_unchanged_after_steps(self):
        env = make_env("easy")
        # Deep copy so a later in-place mutation of the state would be detected.
        original_before = copy.deepcopy(env.state.original_endpoints)
        fix = action(
            kind=ActionKind.ADD_FIELD,
            location="response_body",
            field_name="created_at",
            new_value={"type": "string", "required": True, "description": "ts"},
        )
        env.step(fix)
        assert env.state.original_endpoints == original_before
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
# ===========================================================================
|
| 407 |
+
# 8. Full episode walkthroughs
|
| 408 |
+
# ===========================================================================
|
| 409 |
+
|
| 410 |
+
class TestFullEpisodes:
|
| 411 |
+
def test_easy_perfect_solve(self):
|
| 412 |
+
env = make_env("easy")
|
| 413 |
+
env.step(action(
|
| 414 |
+
kind=ActionKind.ADD_FIELD,
|
| 415 |
+
location="response_body",
|
| 416 |
+
field_name="created_at",
|
| 417 |
+
new_value={"type": "string", "required": True, "description": "ISO timestamp"},
|
| 418 |
+
))
|
| 419 |
+
assert env.score() == pytest.approx(1.0)
|
| 420 |
+
|
| 421 |
+
def test_medium_perfect_solve(self):
|
| 422 |
+
env = make_env("medium")
|
| 423 |
+
# Fix 1: product_id type
|
| 424 |
+
env.step(action(kind=ActionKind.CHANGE_TYPE, endpoint_index=0,
|
| 425 |
+
location="response_body", field_name="product_id", new_value="integer"))
|
| 426 |
+
# Fix 2: quantity type
|
| 427 |
+
env.step(action(kind=ActionKind.CHANGE_TYPE, endpoint_index=1,
|
| 428 |
+
location="request_body", field_name="quantity", new_value="integer"))
|
| 429 |
+
# Fix 3: DELETE status code
|
| 430 |
+
env.step(action(kind=ActionKind.CHANGE_STATUS, endpoint_index=2,
|
| 431 |
+
location="status_code", new_value=204))
|
| 432 |
+
assert env.score() == pytest.approx(1.0)
|
| 433 |
+
|
| 434 |
+
def test_hard_perfect_solve(self):
|
| 435 |
+
env = make_env("hard")
|
| 436 |
+
# Fix 1: add refresh_token to /auth/login response
|
| 437 |
+
env.step(action(kind=ActionKind.ADD_FIELD, endpoint_index=0,
|
| 438 |
+
location="response_body", field_name="refresh_token",
|
| 439 |
+
new_value={"type": "string", "required": True, "description": "Refresh token"}))
|
| 440 |
+
# Fix 2: expires_in type string→integer in /auth/login response
|
| 441 |
+
env.step(action(kind=ActionKind.CHANGE_TYPE, endpoint_index=0,
|
| 442 |
+
location="response_body", field_name="expires_in", new_value="integer"))
|
| 443 |
+
# Fix 3: add created_at to /users/{id}/profile response
|
| 444 |
+
env.step(action(kind=ActionKind.ADD_FIELD, endpoint_index=1,
|
| 445 |
+
location="response_body", field_name="created_at",
|
| 446 |
+
new_value={"type": "string", "required": True, "description": "ISO timestamp"}))
|
| 447 |
+
# Fix 4: remove password_hash from /users/{id}/profile response
|
| 448 |
+
env.step(action(kind=ActionKind.REMOVE_FIELD, endpoint_index=1,
|
| 449 |
+
location="response_body", field_name="password_hash"))
|
| 450 |
+
# Fix 5: PATCH status 500→200
|
| 451 |
+
env.step(action(kind=ActionKind.CHANGE_STATUS, endpoint_index=2,
|
| 452 |
+
location="status_code", new_value=200))
|
| 453 |
+
# Fix 6: add updated_at to PATCH response
|
| 454 |
+
env.step(action(kind=ActionKind.ADD_FIELD, endpoint_index=2,
|
| 455 |
+
location="response_body", field_name="updated_at",
|
| 456 |
+
new_value={"type": "string", "required": True, "description": "ISO timestamp"}))
|
| 457 |
+
|
| 458 |
+
assert env.score() == pytest.approx(1.0)
|
| 459 |
+
|
| 460 |
+
def test_score_after_partial_solve(self):
    """Fixing only one of the three "medium" violations scores strictly between 0 and 1."""
    env = make_env("medium")
    # Fix only 1 of 3: product_id becomes an integer in endpoint 0's response.
    single_fix = action(
        kind=ActionKind.CHANGE_TYPE,
        endpoint_index=0,
        location="response_body",
        field_name="product_id",
        new_value="integer",
    )
    env.step(single_fix)
    partial = env.score()
    assert 0.0 < partial < 1.0
|
| 467 |
+
|
| 468 |
+
def test_unknown_task_raises(self):
    """Constructing the environment with an unrecognised task name raises ValueError."""
    expected_error = pytest.raises(ValueError, match="Unknown task")
    with expected_error:
        APIContractDebuggerEnv(task_name="impossible")
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
# ===========================================================================
|
| 474 |
+
# 9. HTTP API routes (FastAPI TestClient)
|
| 475 |
+
# ===========================================================================
|
| 476 |
+
|
| 477 |
+
class TestHTTPRoutes:
    """Exercise the HTTP routes of ``server.app`` through FastAPI's TestClient."""

    @pytest.fixture(autouse=True)
    def client(self):
        """Attach a TestClient for the app to ``self.client`` before every test."""
        from fastapi.testclient import TestClient
        from server.app import app

        # The instance attribute shadows this fixture method of the same name.
        self.client = TestClient(app)

    # -- request helpers ----------------------------------------------------

    def _reset(self, **payload):
        """POST ``payload`` (an empty JSON object by default) to /reset."""
        return self.client.post("/reset", json=payload)

    def _step_add_created_at(self):
        """POST the add_field action that adds ``created_at`` to endpoint 0's response."""
        created_at_spec = {"type": "string", "required": True, "description": "ts"}
        add_field_action = {
            "kind": "add_field",
            "endpoint_index": 0,
            "location": "response_body",
            "field_name": "created_at",
            "new_value": created_at_spec,
        }
        return self.client.post("/step", json={"action": add_field_action})

    # -- tests --------------------------------------------------------------

    def test_health_endpoint(self):
        """GET /health answers 200."""
        response = self.client.get("/health")
        assert response.status_code == 200

    def test_reset_returns_200(self):
        """POST /reset with an empty body answers 200 with violations and endpoints."""
        response = self._reset()
        assert response.status_code == 200
        body = response.json()
        assert "violations" in body
        assert "endpoints" in body

    def test_reset_switches_task(self):
        """POST /reset with a task_name reports that task in the response."""
        response = self._reset(task_name="medium")
        assert response.status_code == 200
        assert response.json()["task_name"] == "medium"

    def test_reset_unknown_task_422(self):
        """POST /reset with an unknown task_name answers 422."""
        response = self._reset(task_name="impossible")
        assert response.status_code == 422

    def test_step_add_field(self):
        """Adding created_at on the easy task ends the episode with a positive reward."""
        self._reset(task_name="easy")
        response = self._step_add_created_at()
        assert response.status_code == 200
        body = response.json()
        assert body["done"] is True
        assert body["reward"] > 0

    def test_step_invalid_action_422(self):
        """POST /step with an unrecognised action kind answers 422."""
        self._reset()
        bad_request = {"action": {"kind": "nonexistent_kind"}}
        response = self.client.post("/step", json=bad_request)
        assert response.status_code == 422

    def test_state_endpoint(self):
        """GET /state answers 200 and includes current_endpoints."""
        self._reset(task_name="easy")
        response = self.client.get("/state")
        assert response.status_code == 200
        assert "current_endpoints" in response.json()

    def test_score_endpoint(self):
        """GET /score answers 200 with a score in the closed range [0, 1]."""
        self._reset(task_name="easy")
        response = self.client.get("/score")
        assert response.status_code == 200
        body = response.json()
        assert "score" in body
        assert 0.0 <= body["score"] <= 1.0

    def test_tasks_endpoint(self):
        """GET /tasks answers 200 and lists exactly three tasks."""
        response = self.client.get("/tasks")
        assert response.status_code == 200
        body = response.json()
        assert len(body["tasks"]) == 3

    def test_schema_endpoint(self):
        """GET /schema answers 200 and has action and observation entries."""
        response = self.client.get("/schema")
        assert response.status_code == 200
        body = response.json()
        assert "action" in body
        assert "observation" in body

    def test_full_easy_solve_via_http(self):
        """Solving the easy task over HTTP ends the episode and yields a score of 1.0."""
        self._reset(task_name="easy")
        step_response = self._step_add_created_at()
        assert step_response.json()["done"] is True
        score_response = self.client.get("/score")
        assert score_response.json()["score"] == pytest.approx(1.0)
|