Spaces:
Runtime error
Runtime error
Thakur, Mahipal commited on
Commit Β·
62f5d41
0
Parent(s):
Initial modification
Browse files- README.md +184 -0
- __init__.py +16 -0
- __pycache__/__init__.cpython-314.pyc +0 -0
- __pycache__/client.cpython-314.pyc +0 -0
- __pycache__/models.cpython-314.pyc +0 -0
- client.py +77 -0
- models.py +97 -0
- openenv.yaml +99 -0
- openenv_CodeReviewAgent.egg-info/PKG-INFO +11 -0
- openenv_CodeReviewAgent.egg-info/SOURCES.txt +16 -0
- openenv_CodeReviewAgent.egg-info/dependency_links.txt +1 -0
- openenv_CodeReviewAgent.egg-info/entry_points.txt +2 -0
- openenv_CodeReviewAgent.egg-info/requires.txt +7 -0
- openenv_CodeReviewAgent.egg-info/top_level.txt +1 -0
- pyproject.toml +40 -0
- server/CodeReviewAgent_environment.py +327 -0
- server/Dockerfile +80 -0
- server/__init__.py +11 -0
- server/__pycache__/CodeReviewAgent_environment.cpython-314.pyc +0 -0
- server/__pycache__/__init__.cpython-314.pyc +0 -0
- server/__pycache__/grader.cpython-314.pyc +0 -0
- server/__pycache__/tasks.cpython-314.pyc +0 -0
- server/app.py +198 -0
- server/grader.py +152 -0
- server/requirements.txt +6 -0
- server/tasks.py +719 -0
- uv.lock +0 -0
README.md
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: CodeReviewAgent Environment
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- code-review
|
| 13 |
+
- rl-training
|
| 14 |
+
- grpo
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
# CodeReviewAgent — OpenEnv Environment
|
| 18 |
+
|
| 19 |
+
> **OpenEnv Hackathon 2026 · Theme #3.1 — World Modeling (Professional Tasks)**
|
| 20 |
+
|
| 21 |
+
An RL training environment where an LLM learns to perform structured **pull-request code reviews** on real Python source files. The agent must identify bugs, security vulnerabilities, performance bottlenecks, and design issues — and submit a structured review with line-level comments.
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## Problem Motivation
|
| 26 |
+
|
| 27 |
+
LLMs can already *do* code review, but they do it inconsistently: they miss critical security bugs, produce noisy false positives, and fail to categorise issues by severity.
|
| 28 |
+
This environment provides a **reward signal** that directly measures review quality, enabling GRPO-style RL to close that gap in a measurable, repeatable way.
|
| 29 |
+
|
| 30 |
+
---
|
| 31 |
+
|
| 32 |
+
## Environment Design
|
| 33 |
+
|
| 34 |
+
### Tasks (5 total)
|
| 35 |
+
|
| 36 |
+
| ID | Difficulty | File | Issues | Domain |
|
| 37 |
+
|----|-----------|------|--------|--------|
|
| 38 |
+
| 0 | Easy | `utils.py` | 3 | Logic bugs, off-by-one, dead code |
|
| 39 |
+
| 1 | Medium | `auth.py` | 5 | SQL injection, MD5, eval(), hardcoded creds |
|
| 40 |
+
| 2 | Hard | `data_pipeline.py` | 7 | N+1, SSL bypass, thread leak, OOM cache |
|
| 41 |
+
| 3 | Medium | `async_worker.py` | 5 | Race condition, missing await, resource leak |
|
| 42 |
+
| 4 | Hard | `api_server.py` | 6 | Command injection, path traversal, pickle RCE |
|
| 43 |
+
|
| 44 |
+
Tasks cycle automatically on each `reset()` call.
|
| 45 |
+
|
| 46 |
+
### Observation
|
| 47 |
+
|
| 48 |
+
```python
|
| 49 |
+
{
|
| 50 |
+
"code_snippet": str, # Python source to review
|
| 51 |
+
"task_description": str, # What to look for
|
| 52 |
+
"file_name": str,
|
| 53 |
+
"task_id": int, # 0β4
|
| 54 |
+
"task_difficulty": str, # easy / medium / hard
|
| 55 |
+
"review_history": list, # actions taken so far this episode
|
| 56 |
+
"step_count": int,
|
| 57 |
+
"max_steps": int,
|
| 58 |
+
"issues_found_count": int,
|
| 59 |
+
"total_issues": int,
|
| 60 |
+
"done": bool,
|
| 61 |
+
"reward": float,
|
| 62 |
+
}
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### Actions
|
| 66 |
+
|
| 67 |
+
| action_type | Required fields | Effect |
|
| 68 |
+
|-------------|----------------|--------|
|
| 69 |
+
| `add_comment` | `line_number`, `comment`, `severity`, `category` | Annotate a line; partial reward if it matches a ground-truth issue |
|
| 70 |
+
| `request_changes` | `comment` | Signal PR needs work |
|
| 71 |
+
| `approve` | — | Approve PR (penalised if issues remain) |
|
| 72 |
+
| `submit_review` | — | Finalise review; terminal reward |
|
| 73 |
+
|
| 74 |
+
### Reward Function
|
| 75 |
+
|
| 76 |
+
```
|
| 77 |
+
Per-step (ADD_COMMENT):
|
| 78 |
+
+ weight/total_weight × 0.60 per newly found issue (max 0.60 cumulative)
|
| 79 |
+
− 0.02 per false-positive (substantive comment, no match)
|
| 80 |
+
|
| 81 |
+
Terminal (SUBMIT_REVIEW):
|
| 82 |
+
+ coverage × 0.20 weighted issue coverage bonus (max 0.20)
|
| 83 |
+
+ 0.10 / −0.10 correct / incorrect final decision
|
| 84 |
+
+ efficiency × 0.10 step-efficiency bonus when coverage ≥ 60%
|
| 85 |
+
|
| 86 |
+
Maximum achievable: ~1.0
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
Grading uses **keyword + line-range matching** (±3 lines tolerance) against hand-labelled ground-truth issues — no LLM judge needed, fully deterministic.
|
| 90 |
+
|
| 91 |
+
---
|
| 92 |
+
|
| 93 |
+
## Training
|
| 94 |
+
|
| 95 |
+
### GRPO (single-turn format)
|
| 96 |
+
|
| 97 |
+
For efficient LLM training the environment is also exposed in a **single-turn format**: the model receives the full code and must output a **JSON array** of all issues in one response. The same keyword-matching reward function scores the output.
|
| 98 |
+
|
| 99 |
+
```python
|
| 100 |
+
# Input prompt
|
| 101 |
+
{"role": "system", "content": "You are an expert code reviewer. Output a JSON array of issues..."}
|
| 102 |
+
{"role": "user", "content": "File: auth.py\n```python\n...\n```\nProvide your review:"}
|
| 103 |
+
|
| 104 |
+
# Expected output
|
| 105 |
+
[{"line": 5, "category": "security", "severity": "critical",
|
| 106 |
+
"comment": "Hardcoded DB_PASSWORD should be loaded from environment variable"},
|
| 107 |
+
...]
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
### Files
|
| 111 |
+
|
| 112 |
+
| File | Purpose |
|
| 113 |
+
|------|---------|
|
| 114 |
+
| `train_grpo.py` | Standalone GRPO training script (TRL, full-precision or LoRA) |
|
| 115 |
+
| `train_grpo_colab.ipynb` | Colab notebook — T4 GPU, Unsloth 4-bit, plots included |
|
| 116 |
+
| `baseline.py` | GPT-4o-mini baseline for comparison |
|
| 117 |
+
|
| 118 |
+
### Quick Start
|
| 119 |
+
|
| 120 |
+
```bash
|
| 121 |
+
# Run baseline
|
| 122 |
+
export OPENAI_API_KEY=sk-...
|
| 123 |
+
python baseline.py
|
| 124 |
+
|
| 125 |
+
# Run reward smoke test (no GPU needed)
|
| 126 |
+
python train_grpo.py --test
|
| 127 |
+
|
| 128 |
+
# Train (requires GPU + trl>=0.12)
|
| 129 |
+
pip install trl datasets accelerate unsloth
|
| 130 |
+
python train_grpo.py
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
### Colab Training
|
| 134 |
+
|
| 135 |
+
Open `train_grpo_colab.ipynb` in Google Colab (T4 runtime).
|
| 136 |
+
All install, training, evaluation, and plotting cells are included.
|
| 137 |
+
|
| 138 |
+
---
|
| 139 |
+
|
| 140 |
+
## Results
|
| 141 |
+
|
| 142 |
+
*(Fill in after training run)*
|
| 143 |
+
|
| 144 |
+
| Model | Avg Reward | Task-0 | Task-1 | Task-2 | Task-3 | Task-4 |
|
| 145 |
+
|-------|-----------|--------|--------|--------|--------|--------|
|
| 146 |
+
| GPT-4o-mini (baseline) | — | — | — | — | — | — |
|
| 147 |
+
| Qwen2.5-1.5B (untrained) | — | — | — | — | — | — |
|
| 148 |
+
| Qwen2.5-1.5B (GRPO 3 epochs) | — | — | — | — | — | — |
|
| 149 |
+
|
| 150 |
+
Training curves: `training_curves.png` · Per-task rewards: `per_task_reward.png`
|
| 151 |
+
|
| 152 |
+
---
|
| 153 |
+
|
| 154 |
+
## Project Structure
|
| 155 |
+
|
| 156 |
+
```
|
| 157 |
+
CodeReviewAgent/
|
| 158 |
+
├── openenv.yaml # OpenEnv manifest
|
| 159 |
+
├── pyproject.toml
|
| 160 |
+
├── models.py # Action + Observation types
|
| 161 |
+
├── client.py # OpenEnv client
|
| 162 |
+
└── server/
|
| 163 |
+
    ├── app.py # FastAPI server
|
| 164 |
+
    ├── CodeReviewAgent_environment.py
|
| 165 |
+
    ├── grader.py # Deterministic reward grader
|
| 166 |
+
    ├── tasks.py # 5 ground-truth tasks
|
| 167 |
+
    └── Dockerfile
|
| 168 |
+
train_grpo.py # GRPO training script
|
| 169 |
+
train_grpo_colab.ipynb # Colab notebook
|
| 170 |
+
baseline.py # GPT-4o-mini baseline
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
---
|
| 174 |
+
|
| 175 |
+
## API
|
| 176 |
+
|
| 177 |
+
The environment server exposes standard OpenEnv HTTP + WebSocket endpoints:
|
| 178 |
+
|
| 179 |
+
- `POST /reset` — start a new episode
|
| 180 |
+
- `POST /step` — execute an action
|
| 181 |
+
- `GET /state` — current episode state
|
| 182 |
+
- `WS /ws` — persistent low-latency session
|
| 183 |
+
- `GET /web` — interactive web UI
|
| 184 |
+
- `GET /docs` — Swagger / OpenAPI docs
|
__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Codereviewagent Environment."""
|
| 8 |
+
|
| 9 |
+
from .client import CodereviewagentEnv
|
| 10 |
+
from .models import CodereviewagentAction, CodereviewagentObservation
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
"CodereviewagentAction",
|
| 14 |
+
"CodereviewagentObservation",
|
| 15 |
+
"CodereviewagentEnv",
|
| 16 |
+
]
|
__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (382 Bytes). View file
|
|
|
__pycache__/client.cpython-314.pyc
ADDED
|
Binary file (4.37 kB). View file
|
|
|
__pycache__/models.cpython-314.pyc
ADDED
|
Binary file (6.52 kB). View file
|
|
|
client.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CodeReviewAgent Environment Client."""
|
| 2 |
+
|
| 3 |
+
from typing import Dict
|
| 4 |
+
|
| 5 |
+
from openenv.core import EnvClient
|
| 6 |
+
from openenv.core.client_types import StepResult
|
| 7 |
+
from openenv.core.env_server.types import State
|
| 8 |
+
|
| 9 |
+
from .models import CodereviewagentAction, CodereviewagentObservation
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class CodereviewagentEnv(
    EnvClient[CodereviewagentAction, CodereviewagentObservation, State]
):
    """
    Client for the CodeReviewAgent environment.

    Maintains a persistent WebSocket connection to the server.

    Example:
        >>> with CodereviewagentEnv(base_url="http://localhost:8000") as env:
        ...     result = env.reset()
        ...     print(result.observation.task_description)
        ...
        ...     action = CodereviewagentAction(
        ...         action_type="add_comment",
        ...         line_number=4,
        ...         comment="Off-by-one: range(len+1) causes IndexError",
        ...         severity="error",
        ...         category="bug",
        ...     )
        ...     result = env.step(action)
        ...     print(result.reward)
    """

    def _step_payload(self, action: CodereviewagentAction) -> Dict:
        """Serialise *action* into the dict sent to the server.

        ``action_type`` is always included; the optional fields
        (``line_number``, ``comment``, ``severity``, ``category``) are only
        added when they are not ``None``. Enum-valued fields are sent as
        their ``.value`` strings.
        """
        payload: Dict = {"action_type": action.action_type.value}
        if action.line_number is not None:
            payload["line_number"] = action.line_number
        if action.comment is not None:
            payload["comment"] = action.comment
        if action.severity is not None:
            payload["severity"] = action.severity.value
        if action.category is not None:
            payload["category"] = action.category.value
        return payload

    def _parse_result(
        self, payload: Dict
    ) -> StepResult[CodereviewagentObservation]:
        """Build a ``StepResult`` from a server response dict.

        Observation fields are read from ``payload["observation"]``;
        ``reward`` and ``done`` are read from the top level of *payload*.
        Missing observation keys fall back to the defaults given below.
        """
        # Tolerate both a missing and an explicit-null "observation" entry.
        obs_data = payload.get("observation") or {}
        reward = payload.get("reward")
        done = payload.get("done", False)
        observation = CodereviewagentObservation(
            code_snippet=obs_data.get("code_snippet", ""),
            task_description=obs_data.get("task_description", ""),
            file_name=obs_data.get("file_name", ""),
            task_id=obs_data.get("task_id", 0),
            task_difficulty=obs_data.get("task_difficulty", "easy"),
            review_history=obs_data.get("review_history", []),
            step_count=obs_data.get("step_count", 0),
            max_steps=obs_data.get("max_steps", 20),
            issues_found_count=obs_data.get("issues_found_count", 0),
            total_issues=obs_data.get("total_issues", 0),
            done=done,
            # The observation declares ``reward: float``; passing None would
            # fail validation, so an absent/null reward is mirrored as 0.0.
            reward=0.0 if reward is None else reward,
            metadata=obs_data.get("metadata", {}),
        )
        # StepResult keeps the raw value so callers can still tell
        # "no reward reported" (None) apart from a reward of 0.0.
        return StepResult(
            observation=observation,
            reward=reward,
            done=done,
        )

    def _parse_state(self, payload: Dict) -> State:
        """Build a ``State`` from the ``episode_id`` / ``step_count`` keys of *payload*."""
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
|
models.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data models for the CodeReviewAgent Environment.
|
| 3 |
+
|
| 4 |
+
An agent reviews Python source files, identifies bugs, security issues,
|
| 5 |
+
and design problems, then submits a structured review.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from enum import Enum
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
from openenv.core.env_server.types import Action, Observation
|
| 12 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ActionType(str, Enum):
    """Review actions an agent can take in a single step.

    Members are ``str`` subclasses, so they compare equal to their
    wire-format string values.
    """

    ADD_COMMENT = "add_comment"
    REQUEST_CHANGES = "request_changes"
    APPROVE = "approve"
    SUBMIT_REVIEW = "submit_review"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class Severity(str, Enum):
    """Severity label attached to a review comment.

    Members are ``str`` subclasses, so they compare equal to their
    wire-format string values.
    """

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class IssueCategory(str, Enum):
    """Category label attached to a review comment.

    Members are ``str`` subclasses, so they compare equal to their
    wire-format string values.
    """

    BUG = "bug"
    SECURITY = "security"
    PERFORMANCE = "performance"
    STYLE = "style"
    DESIGN = "design"
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class RewardType(BaseModel):
    """
    Structured reward returned by step().

    total       : final clamped score in [-1.0, 1.0]
    components  : named sub-scores before clamping (may sum outside [-1, 1])
    passed      : True when the action was a clear positive signal
    explanation : human-readable breakdown for logging / debugging
    step        : environment step this reward was issued at
    terminal    : True only on the SUBMIT_REVIEW step
    """

    # Instances are immutable: attribute assignment after construction raises.
    model_config = ConfigDict(frozen=True)

    # Required; validation rejects values outside [-1.0, 1.0].
    total: float = Field(..., ge=-1.0, le=1.0)
    components: dict[str, float] = Field(default_factory=dict)
    passed: bool = Field(False)
    explanation: str = Field("")
    step: int = Field(0)
    terminal: bool = Field(False)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class CodereviewagentAction(Action):
    """
    One review action submitted by the agent.

    - ADD_COMMENT    : annotate a specific line with a review comment
    - REQUEST_CHANGES: mark the PR as needing changes
    - APPROVE        : approve the PR (only when no significant issues remain)
    - SUBMIT_REVIEW  : finalize and submit the review (ends the episode)

    Only ``action_type`` is required; every other field defaults to ``None``.
    """

    action_type: ActionType = Field(..., description="Type of review action")
    line_number: int | None = Field(None, description="Source line being commented on")
    comment: str | None = Field(None, description="Review comment text")
    severity: Severity | None = Field(None, description="Issue severity level")
    category: IssueCategory | None = Field(None, description="Issue category")
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class CodereviewagentObservation(Observation):
    """
    Contains the code to review, task instructions, and the running
    review history so the agent can track what it has already flagged.
    The `reward` field mirrors the most recent step reward for convenience;
    the authoritative reward is the RewardType returned by step().
    """

    code_snippet: str = Field(default="", description="Python source code to review")
    task_description: str = Field(default="", description="Review instructions and goals")
    file_name: str = Field(default="", description="Name of the file being reviewed")
    task_id: int = Field(default=0, description="Current task index")
    # NOTE(review): openenv.yaml lists only easy / medium / hard for this
    # field and client.py falls back to "easy" — confirm that "ultra-easy"
    # is an intended label before relying on this default.
    task_difficulty: str = Field(default="ultra-easy", description="Task difficulty label")
    review_history: list[dict[str, Any]] = Field(
        default_factory=list,
        description="Ordered list of actions taken so far this episode",
    )
    step_count: int = Field(default=0, description="Steps taken in current episode")
    # NOTE(review): client.py falls back to 20 and openenv.yaml budgets are
    # 15-30 steps per task — confirm that 6 is the intended default here.
    max_steps: int = Field(default=6, description="Step budget for this task")
    issues_found_count: int = Field(default=0, description="Number of issues identified so far")
    total_issues: int = Field(default=0, description="Total issues in this task")
    done: bool = Field(default=False, description="Whether the episode has ended")
    # Declared as a plain float, so None is not an accepted value.
    reward: float = Field(default=0.0, description="Most recent step reward (mirror of RewardType.total)")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Extra episode metadata")
|
openenv.yaml
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: CodeReviewAgent
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
| 7 |
+
|
| 8 |
+
description: >
|
| 9 |
+
Code review environment where an agent reviews Python source files,
|
| 10 |
+
identifies bugs, security vulnerabilities, performance bottlenecks,
|
| 11 |
+
and design issues, then submits a structured review with comments
|
| 12 |
+
and a final decision (request_changes or approve).
|
| 13 |
+
|
| 14 |
+
tasks:
|
| 15 |
+
- id: 0
|
| 16 |
+
name: Basic Bug Detection
|
| 17 |
+
difficulty: easy
|
| 18 |
+
description: Identify logical bugs in a simple Python utility module
|
| 19 |
+
max_steps: 15
|
| 20 |
+
issues: 3
|
| 21 |
+
|
| 22 |
+
- id: 1
|
| 23 |
+
name: Security Vulnerability Review
|
| 24 |
+
difficulty: medium
|
| 25 |
+
description: Find security vulnerabilities in an authentication module
|
| 26 |
+
max_steps: 20
|
| 27 |
+
issues: 5
|
| 28 |
+
|
| 29 |
+
- id: 2
|
| 30 |
+
name: Full Architecture and Performance Review
|
| 31 |
+
difficulty: hard
|
| 32 |
+
description: >
|
| 33 |
+
Comprehensive review of a data pipeline for bugs, security,
|
| 34 |
+
performance, and design issues
|
| 35 |
+
max_steps: 30
|
| 36 |
+
issues: 7
|
| 37 |
+
|
| 38 |
+
- id: 3
|
| 39 |
+
name: Async Worker Review
|
| 40 |
+
difficulty: medium
|
| 41 |
+
description: Find concurrency bugs and resource leaks in an async worker
|
| 42 |
+
max_steps: 20
|
| 43 |
+
issues: 5
|
| 44 |
+
|
| 45 |
+
- id: 4
|
| 46 |
+
name: Flask API Security Review
|
| 47 |
+
difficulty: hard
|
| 48 |
+
description: >
|
| 49 |
+
Comprehensive security review of a Flask REST API for injection flaws,
|
| 50 |
+
path traversal, insecure deserialization, and missing access controls
|
| 51 |
+
max_steps: 30
|
| 52 |
+
issues: 6
|
| 53 |
+
|
| 54 |
+
observation:
|
| 55 |
+
type: object
|
| 56 |
+
fields:
|
| 57 |
+
code_snippet: {type: string, description: "Python source to review"}
|
| 58 |
+
task_description: {type: string, description: "Review instructions"}
|
| 59 |
+
file_name: {type: string}
|
| 60 |
+
task_id: {type: integer, range: [0, 4]}
|
| 61 |
+
task_difficulty: {type: string, values: [easy, medium, hard]}
|
| 62 |
+
review_history: {type: array, description: "Actions taken so far"}
|
| 63 |
+
step_count: {type: integer}
|
| 64 |
+
max_steps: {type: integer}
|
| 65 |
+
issues_found_count: {type: integer}
|
| 66 |
+
total_issues: {type: integer}
|
| 67 |
+
done: {type: boolean}
|
| 68 |
+
reward: {type: number}
|
| 69 |
+
|
| 70 |
+
action:
|
| 71 |
+
type: object
|
| 72 |
+
fields:
|
| 73 |
+
action_type:
|
| 74 |
+
type: enum
|
| 75 |
+
values: [add_comment, request_changes, approve, submit_review]
|
| 76 |
+
line_number: {type: integer, required: false}
|
| 77 |
+
comment: {type: string, required: false}
|
| 78 |
+
severity:
|
| 79 |
+
type: enum
|
| 80 |
+
values: [info, warning, error, critical]
|
| 81 |
+
required: false
|
| 82 |
+
category:
|
| 83 |
+
type: enum
|
| 84 |
+
values: [bug, security, performance, style, design]
|
| 85 |
+
required: false
|
| 86 |
+
|
| 87 |
+
reward_design:
|
| 88 |
+
range: [-1.0, 1.0]
|
| 89 |
+
per_step:
|
| 90 |
+
issue_found: "up to 0.60 total (weight/total_weight × 0.60 per issue)"
|
| 91 |
+
false_positive: -0.02
|
| 92 |
+
correct_request_changes: +0.05
|
| 93 |
+
bad_approval: -0.15
|
| 94 |
+
terminal:
|
| 95 |
+
coverage_bonus: "coverage × 0.20 (max +0.20)"
|
| 96 |
+
decision_correct: +0.10
|
| 97 |
+
decision_incorrect: -0.10
|
| 98 |
+
efficiency_bonus: "up to +0.10 when coverage ≥ 60%"
|
| 99 |
+
max_achievable: ~1.0
|
openenv_CodeReviewAgent.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-CodeReviewAgent
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Codereviewagent environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.2
|
| 7 |
+
Requires-Dist: openai>=1.0.0
|
| 8 |
+
Requires-Dist: python-dotenv>=1.2.2
|
| 9 |
+
Provides-Extra: dev
|
| 10 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 11 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_CodeReviewAgent.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
./__init__.py
|
| 4 |
+
./client.py
|
| 5 |
+
./models.py
|
| 6 |
+
openenv_CodeReviewAgent.egg-info/PKG-INFO
|
| 7 |
+
openenv_CodeReviewAgent.egg-info/SOURCES.txt
|
| 8 |
+
openenv_CodeReviewAgent.egg-info/dependency_links.txt
|
| 9 |
+
openenv_CodeReviewAgent.egg-info/entry_points.txt
|
| 10 |
+
openenv_CodeReviewAgent.egg-info/requires.txt
|
| 11 |
+
openenv_CodeReviewAgent.egg-info/top_level.txt
|
| 12 |
+
server/CodeReviewAgent_environment.py
|
| 13 |
+
server/__init__.py
|
| 14 |
+
server/app.py
|
| 15 |
+
server/grader.py
|
| 16 |
+
server/tasks.py
|
openenv_CodeReviewAgent.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_CodeReviewAgent.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = CodeReviewAgent.server.app:main
|
openenv_CodeReviewAgent.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
openai>=1.0.0
|
| 3 |
+
python-dotenv>=1.2.2
|
| 4 |
+
|
| 5 |
+
[dev]
|
| 6 |
+
pytest>=8.0.0
|
| 7 |
+
pytest-cov>=4.0.0
|
openenv_CodeReviewAgent.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
CodeReviewAgent
|
pyproject.toml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-CodeReviewAgent"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Codereviewagent environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# install from github
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.2",
|
| 21 |
+
# Environment-specific dependencies
|
| 22 |
+
"openai>=1.0.0",
|
| 23 |
+
"python-dotenv>=1.2.2",
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
[project.optional-dependencies]
|
| 27 |
+
dev = [
|
| 28 |
+
"pytest>=8.0.0",
|
| 29 |
+
"pytest-cov>=4.0.0",
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
[project.scripts]
|
| 33 |
+
# Server entry point - enables running via: uv run --project . server
|
| 34 |
+
# or: python -m CodeReviewAgent.server.app
|
| 35 |
+
server = "CodeReviewAgent.server.app:main"
|
| 36 |
+
|
| 37 |
+
[tool.setuptools]
|
| 38 |
+
include-package-data = true
|
| 39 |
+
packages = ["CodeReviewAgent", "CodeReviewAgent.server"]
|
| 40 |
+
package-dir = { "CodeReviewAgent" = ".", "CodeReviewAgent.server" = "server" }
|
server/CodeReviewAgent_environment.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeReviewAgent Environment — async-native implementation.
|
| 3 |
+
|
| 4 |
+
Episode lifecycle:
|
| 5 |
+
1. reset() → ObservationType (starts a new episode)
|
| 6 |
+
2. step(a) → (Obs, RewardType, done, info) (execute one action)
|
| 7 |
+
3. state() → dict (full internal snapshot)
|
| 8 |
+
|
| 9 |
+
Tasks cycle automatically: 0 (ultra-easy) → 1 (easy) → … → 5 (hard flask) → 0 …
|
| 10 |
+
|
| 11 |
+
Thread / task safety: each Environment instance owns its own state.
|
| 12 |
+
For concurrent GRPO rollouts spin up one instance per worker.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import asyncio
|
| 18 |
+
from typing import Any
|
| 19 |
+
from uuid import uuid4
|
| 20 |
+
|
| 21 |
+
from openenv.core.env_server.interfaces import Environment
|
| 22 |
+
from openenv.core.env_server.types import State
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
from ..models import (
|
| 26 |
+
ActionType,
|
| 27 |
+
CodereviewagentAction,
|
| 28 |
+
CodereviewagentObservation,
|
| 29 |
+
RewardType,
|
| 30 |
+
)
|
| 31 |
+
from .grader import CodeReviewGrader
|
| 32 |
+
from .tasks import TASKS
|
| 33 |
+
except ImportError:
|
| 34 |
+
from models import ( # type: ignore[no-redef]
|
| 35 |
+
ActionType,
|
| 36 |
+
CodereviewagentAction,
|
| 37 |
+
CodereviewagentObservation,
|
| 38 |
+
RewardType,
|
| 39 |
+
)
|
| 40 |
+
from server.grader import CodeReviewGrader # type: ignore[no-redef]
|
| 41 |
+
from server.tasks import TASKS # type: ignore[no-redef]
|
| 42 |
+
|
| 43 |
+
# Sentinel reward returned on non-terminal steps that produce no signal.
# Its ``step`` is fixed at 0; it is not tied to any particular environment step.
_ZERO_REWARD = RewardType(
    total=0.0,
    components={},
    passed=False,
    explanation="No signal this step.",
    step=0,
    terminal=False,
)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class CodereviewagentEnvironment(Environment):
    """
    OpenEnv-compliant code-review environment.

    The primary interface is async: async_reset / async_step / async_state.
    Sync reset() and step() wrappers are provided for openenv's create_app;
    they run the async versions with asyncio.run(), moving to a worker
    thread when the caller already has an event loop running.  ``state`` is
    a plain property exposing the episode id and step count.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    # ── Construction ──────────────────────────────────────────────────────

    def __init__(self) -> None:
        """Start on task 0 with a fresh grader and an empty episode record."""
        self._episode_id: str = str(uuid4())
        self._step_count: int = 0
        self._reset_count: int = 0
        task = TASKS[0]
        self._grader: CodeReviewGrader = CodeReviewGrader(task)
        self._ep: dict[str, Any] = self._fresh_episode(task)

    @staticmethod
    def _fresh_episode(task: dict[str, Any]) -> dict[str, Any]:
        """Return the mutable per-episode record for *task*, with nothing reviewed yet."""
        return {
            "task": task,
            "review_comments": [],
            "issues_found": [],
            "review_decision": None,
            "review_submitted": False,
            "cumulative_reward": 0.0,
        }

    # ── Async-native interface (primary) ──────────────────────────────────

    async def async_reset(self) -> CodereviewagentObservation:
        """Begin a new episode on the next task and return its first observation.

        Tasks are selected round-robin: the n-th reset uses
        TASKS[n % len(TASKS)].
        """
        task_id = self._reset_count % len(TASKS)
        self._reset_count += 1
        self._episode_id = str(uuid4())
        self._step_count = 0
        task = TASKS[task_id]
        self._grader = CodeReviewGrader(task)
        self._ep = self._fresh_episode(task)
        return self._make_obs(reward=0.0, done=False)

    async def async_step(
        self, action: CodereviewagentAction
    ) -> tuple[CodereviewagentObservation, RewardType, bool, dict[str, Any]]:
        """Apply one action and return (observation, reward, done, info).

        The action is dispatched on ``action.action_type``; an unrecognised
        type earns a -0.05 penalty.  If the step budget (task["max_steps"])
        is reached without the episode having ended, a further -0.05 is
        merged into the reward and the episode is terminated.
        """
        self._step_count += 1
        task = self._ep["task"]
        done = False
        reward_obj: RewardType

        if action.action_type == ActionType.ADD_COMMENT:
            reward_obj = self._handle_add_comment(action)

        elif action.action_type == ActionType.REQUEST_CHANGES:
            reward_obj = self._handle_request_changes(action)

        elif action.action_type == ActionType.APPROVE:
            reward_obj = self._handle_approve()

        elif action.action_type == ActionType.SUBMIT_REVIEW:
            reward_obj, done = self._handle_submit_review()

        else:
            reward_obj = RewardType(
                total=-0.05,
                components={"illegal_action": -0.05},
                passed=False,
                explanation=f"Unknown action type: {action.action_type}",
                step=self._step_count,
                terminal=False,
            )

        # Step-budget exhaustion
        if not done and self._step_count >= task["max_steps"]:
            # Merge the budget penalty into whatever this step already earned,
            # keeping the total within the [-1, 1] reward range.
            penalised = max(-1.0, reward_obj.total - 0.05)
            components = {**reward_obj.components, "step_budget_penalty": -0.05}
            reward_obj = RewardType(
                total=round(penalised, 4),
                components=components,
                passed=False,
                explanation=reward_obj.explanation + " [Step limit reached.]",
                step=self._step_count,
                terminal=True,
            )
            done = True

        self._ep["cumulative_reward"] = round(
            self._ep["cumulative_reward"] + reward_obj.total, 4
        )
        obs = self._make_obs(reward=reward_obj.total, done=done)
        info = {
            "episode_id": self._episode_id,
            "cumulative_reward": self._ep["cumulative_reward"],
            "issues_found": list(self._ep["issues_found"]),
            "review_decision": self._ep.get("review_decision"),
        }
        return obs, reward_obj, done, info

    async def async_state(self) -> dict[str, Any]:
        """Return a plain-dict snapshot of the current episode."""
        task = self._ep["task"]
        return {
            "episode_id": self._episode_id,
            "step_count": self._step_count,
            "task_id": task["id"],
            "task_difficulty": task["difficulty"],
            "task_name": task["name"],
            "issues_found": list(self._ep["issues_found"]),
            "total_issues": len(task["issues"]),
            "review_decision": self._ep.get("review_decision"),
            "review_submitted": self._ep.get("review_submitted", False),
            "cumulative_reward": self._ep.get("cumulative_reward", 0.0),
            "max_steps": task["max_steps"],
        }

    # ── Sync wrappers (openenv / create_app compatibility) ────────────────

    @staticmethod
    def _run_coroutine_sync(coro: Any) -> Any:
        """Run *coro* to completion from synchronous code and return its result.

        asyncio.run() cannot be used on a thread whose event loop is already
        running (e.g. under pytest-asyncio), so in that case the coroutine is
        handed to asyncio.run() on a single short-lived worker thread and
        this call blocks until it finishes.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running on this thread: asyncio.run() is safe here.
            return asyncio.run(coro)

        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()

    def reset(self) -> CodereviewagentObservation:  # type: ignore[override]
        """Sync counterpart of async_reset()."""
        return self._run_coroutine_sync(self.async_reset())

    def step(self, action: CodereviewagentAction) -> CodereviewagentObservation:  # type: ignore[override]
        """
        Sync step for openenv compatibility.
        Returns only the Observation (reward is embedded in obs.reward).
        Use async_step() for the full (obs, reward, done, info) tuple.
        """
        obs, _, _, _ = self._run_coroutine_sync(self.async_step(action))
        return obs

    @property
    def state(self) -> State:  # type: ignore[override]
        """Minimal openenv State: the episode id and the current step count."""
        return State(episode_id=self._episode_id, step_count=self._step_count)

    # ── Action handlers ───────────────────────────────────────────────────

    def _handle_add_comment(self, action: CodereviewagentAction) -> RewardType:
        """Record a review comment and reward it according to the grader."""
        entry = {
            "type": "comment",
            "line": action.line_number,
            "text": action.comment,
            "severity": action.severity.value if action.severity else None,
            "category": action.category.value if action.category else None,
        }
        self._ep["review_comments"].append(entry)

        score, new_finds, breakdown = self._grader.score_comment(
            line_number=action.line_number,
            comment=action.comment,
            already_found=self._ep["issues_found"],
        )
        self._ep["issues_found"].extend(new_finds)

        clamped = round(max(-1.0, min(1.0, score)), 4)
        if new_finds:
            explanation = f"Identified issue(s): {new_finds}"
        elif score < 0:
            explanation = "False-positive comment — matched no known issue."
        else:
            explanation = "Comment recorded; no new issue matched."

        return RewardType(
            total=clamped,
            components=breakdown,
            passed=bool(new_finds),
            explanation=explanation,
            step=self._step_count,
            terminal=False,
        )

    def _handle_request_changes(self, action: CodereviewagentAction) -> RewardType:
        """Record a REQUEST_CHANGES decision; +0.05 if issues were found, else -0.05."""
        self._ep["review_decision"] = "request_changes"
        self._ep["review_comments"].append(
            {"type": "request_changes", "text": action.comment}
        )
        if self._ep["issues_found"]:
            return RewardType(
                total=0.05,
                components={"decision_bonus": 0.05},
                passed=True,
                explanation="REQUEST_CHANGES after finding issues — correct.",
                step=self._step_count,
                terminal=False,
            )
        return RewardType(
            total=-0.05,
            components={"premature_decision_penalty": -0.05},
            passed=False,
            explanation="REQUEST_CHANGES with no issues found yet.",
            step=self._step_count,
            terminal=False,
        )

    def _handle_approve(self) -> RewardType:
        """Record an APPROVE decision; penalised when under half the issues were found."""
        self._ep["review_decision"] = "approve"
        total_issues = len(self._ep["task"]["issues"])
        found = len(set(self._ep["issues_found"]))
        if total_issues > 0 and found < total_issues * 0.5:
            return RewardType(
                total=-0.15,
                components={"bad_approval_penalty": -0.15},
                passed=False,
                explanation=f"APPROVE with only {found}/{total_issues} issues found.",
                step=self._step_count,
                terminal=False,
            )
        return RewardType(
            total=0.02,
            components={"approval_credit": 0.02},
            passed=True,
            explanation="APPROVE recorded.",
            step=self._step_count,
            terminal=False,
        )

    def _handle_submit_review(self) -> tuple[RewardType, bool]:
        """Finalise the review and return (terminal reward, done).

        A second submission in the same episode is penalised and reported
        with done=False.
        NOTE(review): reporting done=False for an already-submitted episode
        looks surprising — confirm this is intended.
        """
        if self._ep.get("review_submitted"):
            return (
                RewardType(
                    total=-0.05,
                    components={"duplicate_submit_penalty": -0.05},
                    passed=False,
                    explanation="Review already submitted.",
                    step=self._step_count,
                    terminal=False,
                ),
                False,
            )
        self._ep["review_submitted"] = True
        task = self._ep["task"]
        reward_obj = self._grader.final_score(
            issues_found=list(set(self._ep["issues_found"])),
            review_decision=self._ep.get("review_decision"),
            step_count=self._step_count,
            max_steps=task["max_steps"],
            current_step=self._step_count,
        )
        return reward_obj, True

    # ── Observation builder ───────────────────────────────────────────────

    def _make_obs(self, reward: float, done: bool) -> CodereviewagentObservation:
        """Build the observation for the current episode state.

        *reward* is clamped to [-1, 1] and rounded to 4 decimals before it is
        embedded in the observation.
        """
        task = self._ep["task"]
        return CodereviewagentObservation(
            code_snippet=task["code"],
            task_description=task["description"],
            file_name=task["file_name"],
            task_id=task["id"],
            task_difficulty=task["difficulty"],
            review_history=list(self._ep.get("review_comments", [])),
            step_count=self._step_count,
            max_steps=task["max_steps"],
            issues_found_count=len(set(self._ep.get("issues_found", []))),
            total_issues=len(task["issues"]),
            done=done,
            reward=round(max(-1.0, min(1.0, reward)), 4),
            metadata={
                "cumulative_reward": self._ep.get("cumulative_reward", 0.0),
                "review_decision": self._ep.get("review_decision"),
                "episode_id": self._episode_id,
            },
        )
|
server/Dockerfile
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Multi-stage build using openenv-base
|
| 8 |
+
# This Dockerfile is flexible and works for both:
|
| 9 |
+
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
+
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
+
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
+
|
| 13 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 14 |
+
FROM ${BASE_IMAGE} AS builder
|
| 15 |
+
|
| 16 |
+
WORKDIR /app
|
| 17 |
+
|
| 18 |
+
# Ensure git is available (required for installing dependencies from VCS)
|
| 19 |
+
RUN apt-get update && \
|
| 20 |
+
apt-get install -y --no-install-recommends git && \
|
| 21 |
+
rm -rf /var/lib/apt/lists/*
|
| 22 |
+
|
| 23 |
+
# Build argument to control whether we're building standalone or in-repo
|
| 24 |
+
ARG BUILD_MODE=in-repo
|
| 25 |
+
ARG ENV_NAME=CodeReviewAgent
|
| 26 |
+
|
| 27 |
+
# Copy environment code (always at root of build context)
|
| 28 |
+
COPY . /app/env
|
| 29 |
+
|
| 30 |
+
# For in-repo builds, openenv is already vendored in the build context
|
| 31 |
+
# For standalone builds, openenv will be installed via pyproject.toml
|
| 32 |
+
WORKDIR /app/env
|
| 33 |
+
|
| 34 |
+
# Ensure uv is available (for local builds where base image lacks it)
|
| 35 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 36 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 37 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 38 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
# Install dependencies using uv sync
|
| 42 |
+
# If uv.lock exists, use it; otherwise resolve on the fly
|
| 43 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 44 |
+
if [ -f uv.lock ]; then \
|
| 45 |
+
uv sync --frozen --no-install-project --no-editable; \
|
| 46 |
+
else \
|
| 47 |
+
uv sync --no-install-project --no-editable; \
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 51 |
+
if [ -f uv.lock ]; then \
|
| 52 |
+
uv sync --frozen --no-editable; \
|
| 53 |
+
else \
|
| 54 |
+
uv sync --no-editable; \
|
| 55 |
+
fi
|
| 56 |
+
|
| 57 |
+
# Final runtime stage
|
| 58 |
+
FROM ${BASE_IMAGE}
|
| 59 |
+
|
| 60 |
+
WORKDIR /app
|
| 61 |
+
|
| 62 |
+
# Copy the virtual environment from builder
|
| 63 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 64 |
+
|
| 65 |
+
# Copy the environment code
|
| 66 |
+
COPY --from=builder /app/env /app/env
|
| 67 |
+
|
| 68 |
+
# Set PATH to use the virtual environment
|
| 69 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 70 |
+
|
| 71 |
+
# Set PYTHONPATH so imports work correctly
|
| 72 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 73 |
+
|
| 74 |
+
# Health check
|
| 75 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 76 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 77 |
+
|
| 78 |
+
# Run the FastAPI server
|
| 79 |
+
# The module path is constructed to work with the /app/env structure
|
| 80 |
+
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Codereviewagent environment server components."""
|
| 8 |
+
|
| 9 |
+
from .CodeReviewAgent_environment import CodereviewagentEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["CodereviewagentEnvironment"]
|
server/__pycache__/CodeReviewAgent_environment.cpython-314.pyc
ADDED
|
Binary file (16.7 kB). View file
|
|
|
server/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (333 Bytes). View file
|
|
|
server/__pycache__/grader.cpython-314.pyc
ADDED
|
Binary file (8.04 kB). View file
|
|
|
server/__pycache__/tasks.cpython-314.pyc
ADDED
|
Binary file (18.7 kB). View file
|
|
|
server/app.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Async FastAPI server for the CodeReviewAgent environment.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
POST /reset — start a new episode (HTTP session)
|
| 6 |
+
POST /step — execute one action
|
| 7 |
+
GET /state — current episode snapshot
|
| 8 |
+
GET /health — liveness probe
|
| 9 |
+
GET /schema — action / observation schema
|
| 10 |
+
WS /ws — WebSocket session (own env per connection)
|
| 11 |
+
|
| 12 |
+
HTTP endpoints share a single env instance (sequential use).
|
| 13 |
+
WebSocket endpoints each spin up an isolated env instance, enabling
|
| 14 |
+
concurrent GRPO rollouts.
|
| 15 |
+
|
| 16 |
+
The availability of OpenEnv's create_app is probed at import time, but /web
always serves a minimal HTML page linking to /docs, /health and /schema.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
from contextlib import asynccontextmanager
|
| 24 |
+
from typing import Any
|
| 25 |
+
|
| 26 |
+
from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
|
| 27 |
+
from fastapi.responses import HTMLResponse
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
from openenv.core.env_server.http_server import create_app as _create_openenv_app
|
| 31 |
+
_OPENENV_AVAILABLE = True
|
| 32 |
+
except Exception: # pragma: no cover
|
| 33 |
+
_OPENENV_AVAILABLE = False
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
from ..models import CodereviewagentAction, CodereviewagentObservation, RewardType
|
| 37 |
+
from .CodeReviewAgent_environment import CodereviewagentEnvironment
|
| 38 |
+
except ModuleNotFoundError:
|
| 39 |
+
from models import CodereviewagentAction, CodereviewagentObservation, RewardType # type: ignore
|
| 40 |
+
from server.CodeReviewAgent_environment import CodereviewagentEnvironment # type: ignore
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ── Shared HTTP session env ───────────────────────────────────────────────────
|
| 44 |
+
|
| 45 |
+
_http_env: CodereviewagentEnvironment | None = None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@asynccontextmanager
async def lifespan(application: FastAPI):
    """Create the shared HTTP environment on startup and drop it on shutdown.

    The module-level ``_http_env`` is what the /reset, /step and /state
    handlers operate on.  The reference is cleared in a ``finally`` so it is
    released even if an exception propagates through the context.
    """
    global _http_env
    _http_env = CodereviewagentEnvironment()
    try:
        yield
    finally:
        _http_env = None
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# ── Response shapes ───────────────────────────────────────────────────────────
|
| 57 |
+
|
| 58 |
+
class StepResponse:
    """Bundle of the four values produced by one environment step.

    Holds the observation, the reward object, the done flag and the info
    dict, and knows how to flatten them into a JSON-friendly dict.
    """

    def __init__(
        self,
        obs: CodereviewagentObservation,
        reward: RewardType,
        done: bool,
        info: dict[str, Any],
    ) -> None:
        self.obs, self.reward = obs, reward
        self.done, self.info = done, info

    def to_dict(self) -> dict[str, Any]:
        """Return the response payload; obs and reward are dumped via model_dump()."""
        payload: dict[str, Any] = {}
        payload["observation"] = self.obs.model_dump()
        payload["reward"] = self.reward.model_dump()
        payload["done"] = self.done
        payload["info"] = self.info
        return payload
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# ── App factory ───────────────────────────────────────────────────────────────
|
| 81 |
+
|
| 82 |
+
def _build_app() -> FastAPI:
    """Create the FastAPI application and register all of its routes.

    HTTP routes (/reset, /step, /state) operate on the module-level
    ``_http_env`` created by ``lifespan``; the /ws route creates its own
    environment instance per connection.

    Returns:
        The configured FastAPI application.
    """
    application = FastAPI(
        title="CodeReviewAgent",
        description="OpenEnv code-review environment — async FastAPI server.",
        version="2.0.0",
        lifespan=lifespan,
    )

    def _require_http_env() -> CodereviewagentEnvironment:
        """Return the shared HTTP env, or answer 503 if it is not set up.

        An explicit check is used instead of ``assert`` because assertions
        are stripped when Python runs with -O.
        """
        if _http_env is None:
            raise HTTPException(
                status_code=503, detail="Environment is not initialised."
            )
        return _http_env

    # ── HTTP endpoints ────────────────────────────────────────────────────

    @application.post("/reset", summary="Start a new episode")
    async def reset_endpoint() -> dict[str, Any]:
        obs = await _require_http_env().async_reset()
        return {"observation": obs.model_dump(), "reward": None, "done": False, "info": {}}

    @application.post("/step", summary="Execute one action")
    async def step_endpoint(action: CodereviewagentAction) -> dict[str, Any]:
        obs, reward, done, info = await _require_http_env().async_step(action)
        return StepResponse(obs, reward, done, info).to_dict()

    @application.get("/state", summary="Current episode state snapshot")
    async def state_endpoint() -> dict[str, Any]:
        return await _require_http_env().async_state()

    @application.get("/health", summary="Liveness probe")
    async def health() -> dict[str, str]:
        return {"status": "ok"}

    @application.get("/schema", summary="Action and observation JSON schemas")
    async def schema() -> dict[str, Any]:
        return {
            "action": CodereviewagentAction.model_json_schema(),
            "observation": CodereviewagentObservation.model_json_schema(),
            "reward": RewardType.model_json_schema(),
        }

    # ── WebSocket endpoint (one env per connection) ───────────────────────

    @application.websocket("/ws")
    async def ws_endpoint(websocket: WebSocket) -> None:
        """Serve reset / step / state commands over one WebSocket connection.

        Each message is a JSON object with a "command" key.  Bad input is
        answered with an {"type": "error"} frame and the connection stays
        open.
        """
        await websocket.accept()
        env = CodereviewagentEnvironment()
        try:
            while True:
                raw = await websocket.receive_text()
                try:
                    msg = json.loads(raw)
                except json.JSONDecodeError as exc:
                    # Malformed JSON must not tear down the whole session.
                    await websocket.send_json(
                        {"type": "error", "detail": f"Invalid JSON: {exc}"}
                    )
                    continue
                if not isinstance(msg, dict):
                    await websocket.send_json(
                        {"type": "error", "detail": "Message must be a JSON object."}
                    )
                    continue
                cmd = msg.get("command")

                if cmd == "reset":
                    obs = await env.async_reset()
                    await websocket.send_json(
                        {"type": "reset", "observation": obs.model_dump()}
                    )

                elif cmd == "step":
                    try:
                        action = CodereviewagentAction(**msg["action"])
                    except Exception as exc:
                        # Covers a missing "action" key as well as validation
                        # failures; the client gets the reason back.
                        await websocket.send_json({"type": "error", "detail": str(exc)})
                        continue
                    obs, reward, done, info = await env.async_step(action)
                    await websocket.send_json(
                        {
                            "type": "step",
                            "observation": obs.model_dump(),
                            "reward": reward.model_dump(),
                            "done": done,
                            "info": info,
                        }
                    )

                elif cmd == "state":
                    state = await env.async_state()
                    await websocket.send_json({"type": "state", "state": state})

                else:
                    await websocket.send_json(
                        {"type": "error", "detail": f"Unknown command: {cmd}"}
                    )

        except WebSocketDisconnect:
            # Client went away; the per-connection env is simply discarded.
            pass

    # ── Web UI ────────────────────────────────────────────────────────────

    @application.get("/web", response_class=HTMLResponse, include_in_schema=False)
    async def web_ui() -> str:
        return """
<!doctype html><html><head><title>CodeReviewAgent</title></head>
<body>
<h2>CodeReviewAgent Environment</h2>
<p>API docs: <a href="/docs">/docs</a></p>
<p>Health: <a href="/health">/health</a></p>
<p>Schema: <a href="/schema">/schema</a></p>
</body></html>
"""

    return application
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
app = _build_app()
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Serve the module-level ``app`` with uvicorn on *host*:*port*."""
    # Imported lazily so importing this module does not require uvicorn.
    from uvicorn import run as serve

    serve(app, host=host, port=port)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
if __name__ == "__main__":
|
| 194 |
+
import argparse
|
| 195 |
+
parser = argparse.ArgumentParser()
|
| 196 |
+
parser.add_argument("--port", type=int, default=8000)
|
| 197 |
+
args = parser.parse_args()
|
| 198 |
+
main(port=args.port)
|
server/grader.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Deterministic grader for CodeReviewAgent tasks.
|
| 3 |
+
|
| 4 |
+
Scoring design
|
| 5 |
+
--------------
|
| 6 |
+
During the episode (ADD_COMMENT actions):
|
| 7 |
+
+weight/total_weight * 0.60 per newly found issue (max 0.60 cumulative)
|
| 8 |
+
-0.02 per false-positive (substantive comment, no match)
|
| 9 |
+
|
| 10 |
+
Final (SUBMIT_REVIEW):
|
| 11 |
+
+coverage * 0.20 weighted coverage bonus (max 0.20)
|
| 12 |
+
+/-0.10 correct / incorrect final decision
|
| 13 |
+
+efficiency * 0.10 step-efficiency bonus when coverage >= 60%
|
| 14 |
+
|
| 15 |
+
Maximum achievable total: ~1.0 Minimum: -1.0
|
| 16 |
+
|
| 17 |
+
Anti-exploit rule (enforced since v2):
|
| 18 |
+
A comment MUST satisfy BOTH:
|
| 19 |
+
1. keyword_hit — at least one issue keyword appears in the comment text
|
| 20 |
+
2. line_hit — comment line_number is within ±LINE_TOLERANCE of the issue
|
| 21 |
+
`category` match is NOT sufficient on its own. This closes the keyword-spam
|
| 22 |
+
exploit where a model dumps all known keywords on a single line.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from typing import Any
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
from ..models import RewardType
|
| 29 |
+
except ImportError:
|
| 30 |
+
from models import RewardType # type: ignore[no-redef]
|
| 31 |
+
|
| 32 |
+
LINE_TOLERANCE: int = 3 # lines either side of an issue's declared range
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class CodeReviewGrader:
    """Deterministic scorer for one code-review task.

    ``task`` is a dict whose "issues" entry is a list of issue dicts; this
    class reads each issue's "id", "weight", "keywords" and "line_range",
    and the task's optional "correct_decision".
    """

    def __init__(self, task: dict[str, Any]) -> None:
        self.task = task
        # Sum of all issue weights; per-issue credit is a share of this.
        self.total_weight: float = sum(iss["weight"] for iss in task["issues"])

    # ── Per-comment scoring ───────────────────────────────────────────────

    def score_comment(
        self,
        line_number: int | None,
        comment: str | None,
        already_found: list[str],
    ) -> tuple[float, list[str], dict[str, float]]:
        """
        Score an ADD_COMMENT action.

        Returns:
            (reward_delta, newly_found_issue_ids, component_breakdown)

        Match condition (BOTH required — no shortcut):
            keyword_hit AND line_hit

        An empty or missing comment scores (0.0, [], {}).  A comment longer
        than 15 characters (after stripping) that finds nothing new costs
        -0.02.
        """
        if not comment:
            return 0.0, [], {}

        comment_lower = comment.lower()
        seen = set(already_found)
        newly_found: list[str] = []
        issue_credit: float = 0.0
        false_positive_penalty: float = 0.0

        for issue in self.task["issues"]:
            if issue["id"] in seen:
                continue

            keyword_hit = any(kw.lower() in comment_lower for kw in issue["keywords"])
            line_hit = self._line_in_range(line_number, issue["line_range"])

            # BOTH conditions required — no category shortcut
            if keyword_hit and line_hit:
                newly_found.append(issue["id"])
                # Same zero-weight guard final_score applies to coverage:
                # with no weight to share out, the find earns no credit
                # instead of raising ZeroDivisionError.
                if self.total_weight > 0:
                    issue_credit += (issue["weight"] / self.total_weight) * 0.60

        # Penalise substantive comments that matched nothing new
        if not newly_found and len(comment.strip()) > 15:
            false_positive_penalty = -0.02

        total = round(issue_credit + false_positive_penalty, 4)
        breakdown = {
            "issue_credit": round(issue_credit, 4),
            "false_positive_penalty": round(false_positive_penalty, 4),
        }
        return total, newly_found, breakdown

    # ── Terminal scoring ──────────────────────────────────────────────────

    def final_score(
        self,
        issues_found: list[str],
        review_decision: str | None,
        step_count: int,
        max_steps: int,
        current_step: int = 0,
    ) -> RewardType:
        """
        Compute the terminal reward on SUBMIT_REVIEW.
        Returns a fully typed RewardType with component breakdown.

        Components:
            coverage_bonus   — weighted coverage * 0.20
            decision_score   — +0.10 if review_decision equals the task's
                               "correct_decision" (default
                               "request_changes"), otherwise -0.10
            efficiency_bonus — 0.10 * (1 - step_count / max_steps), only
                               when coverage >= 0.60
        """
        unique_found = set(issues_found)
        found_weight = sum(
            iss["weight"]
            for iss in self.task["issues"]
            if iss["id"] in unique_found
        )
        coverage = found_weight / self.total_weight if self.total_weight > 0 else 0.0

        correct_decision = self.task.get("correct_decision", "request_changes")
        decision_is_correct = review_decision == correct_decision
        decision_score = 0.10 if decision_is_correct else -0.10

        # A non-positive step budget leaves no room for an efficiency bonus
        # (and would otherwise divide by zero).
        if max_steps > 0:
            efficiency = max(0.0, 1.0 - step_count / max_steps)
        else:
            efficiency = 0.0
        efficiency_bonus = round(0.10 * efficiency, 4) if coverage >= 0.60 else 0.0
        coverage_bonus = round(coverage * 0.20, 4)

        raw_total = coverage_bonus + decision_score + efficiency_bonus
        clamped = round(max(-1.0, min(1.0, raw_total)), 4)

        components = {
            "coverage_bonus": coverage_bonus,
            "decision_score": round(decision_score, 4),
            "efficiency_bonus": efficiency_bonus,
        }
        explanation = (
            f"Found {len(unique_found)}/{len(self.task['issues'])} issues "
            f"(weighted coverage {coverage:.0%}). "
            f"Decision '{review_decision}' was "
            f"{'correct' if decision_is_correct else 'incorrect'}. "
            f"Used {step_count}/{max_steps} steps."
        )
        return RewardType(
            total=clamped,
            components=components,
            passed=decision_is_correct and coverage >= 0.60,
            explanation=explanation,
            step=current_step,
            terminal=True,
        )

    # ── Helper ────────────────────────────────────────────────────────────

    @staticmethod
    def _line_in_range(
        line_number: int | None,
        line_range: tuple[int, int],
    ) -> bool:
        """Return True if *line_number* is within LINE_TOLERANCE lines of *line_range*.

        ``line_range`` is an inclusive (start, end) pair; a missing line
        number never matches.
        """
        if line_number is None:
            return False
        start, end = line_range
        return (start - LINE_TOLERANCE) <= line_number <= (end + LINE_TOLERANCE)
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv[core]>=0.2.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
server/tasks.py
ADDED
|
@@ -0,0 +1,719 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task definitions for the CodeReviewAgent environment.
|
| 3 |
+
|
| 4 |
+
Six tasks across four difficulty tiers. Each task defines:
|
| 5 |
+
- code: Python source to review
|
| 6 |
+
- issues: list of ground-truth issues with grading metadata
|
| 7 |
+
- correct_decision: expected final review decision
|
| 8 |
+
|
| 9 |
+
Difficulty ladder:
|
| 10 |
+
0 ultra-easy — hints embedded in comments; bootstraps GRPO positive trajectories
|
| 11 |
+
1 easy — 3 clean logic bugs, no hints
|
| 12 |
+
2 medium — 5 security issues in an auth module
|
| 13 |
+
3 hard — 7 mixed issues in a data pipeline
|
| 14 |
+
4 medium — 5 async concurrency bugs
|
| 15 |
+
5 hard — 6 Flask API security issues
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from typing import Any
|
| 19 |
+
|
| 20 |
+
TASKS: list[dict[str, Any]] = [
|
| 21 |
+
# ── Task 0: Ultra-easy (bootstrap) ───────────────────────────────────────
|
| 22 |
+
# DESIGN INTENT: both issues have their category name spelled out in a code
|
| 23 |
+
# comment directly above them. A frozen weak model that simply reads the
|
| 24 |
+
# comments and echoes them back should reliably score > 0. This task exists
|
| 25 |
+
# solely to guarantee that GRPO has at least a few positive trajectories from
|
| 26 |
+
# training step 1.
|
| 27 |
+
{
|
| 28 |
+
"id": 0,
|
| 29 |
+
"name": "Bootstrap: Obvious Issues",
|
| 30 |
+
"difficulty": "ultra-easy",
|
| 31 |
+
"file_name": "bootstrap.py",
|
| 32 |
+
"description": (
|
| 33 |
+
"Review this short Python module. "
|
| 34 |
+
"The comments above each function hint at the kind of issue present. "
|
| 35 |
+
"Add a comment for each bug you find (line number, severity, category), "
|
| 36 |
+
"call request_changes, then submit."
|
| 37 |
+
),
|
| 38 |
+
"max_steps": 6,
|
| 39 |
+
"code": """\
|
| 40 |
+
# BUG: this loop has an off-by-one error β it iterates one index too far
|
| 41 |
+
def sum_items(data):
|
| 42 |
+
total = 0
|
| 43 |
+
for i in range(len(data) + 1): # line 4: causes IndexError on last iteration
|
| 44 |
+
total += data[i]
|
| 45 |
+
return total
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# SECURITY: hardcoded credential β move to environment variable
|
| 49 |
+
def connect_db():
|
| 50 |
+
db_password = "s3cr3t_prod_pw" # line 11: hardcoded credential in source
|
| 51 |
+
return f"postgresql://admin:{db_password}@localhost/mydb"
|
| 52 |
+
""",
|
| 53 |
+
"issues": [
|
| 54 |
+
{
|
| 55 |
+
"id": "bootstrap_off_by_one",
|
| 56 |
+
"description": "Off-by-one: range(len+1) causes IndexError on the last iteration",
|
| 57 |
+
"line_range": (4, 4),
|
| 58 |
+
"keywords": [
|
| 59 |
+
"off-by-one", "off by one", "bug", "index", "indexerror",
|
| 60 |
+
"range", "+ 1", "len + 1", "out of bounds",
|
| 61 |
+
],
|
| 62 |
+
"category": "bug",
|
| 63 |
+
"severity": "error",
|
| 64 |
+
"weight": 1.0,
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"id": "bootstrap_hardcoded_cred",
|
| 68 |
+
"description": "Hardcoded password in source should be an environment variable",
|
| 69 |
+
"line_range": (11, 11),
|
| 70 |
+
"keywords": [
|
| 71 |
+
"hardcoded", "hard-coded", "security", "credential", "password",
|
| 72 |
+
"secret", "env", "environment variable", "os.environ",
|
| 73 |
+
],
|
| 74 |
+
"category": "security",
|
| 75 |
+
"severity": "critical",
|
| 76 |
+
"weight": 1.0,
|
| 77 |
+
},
|
| 78 |
+
],
|
| 79 |
+
"correct_decision": "request_changes",
|
| 80 |
+
},
|
| 81 |
+
|
| 82 |
+
# ── Task 1: Easy ─────────────────────────────────────────────────────────
|
| 83 |
+
{
|
| 84 |
+
"id": 1,
|
| 85 |
+
"name": "Basic Bug Detection",
|
| 86 |
+
"difficulty": "easy",
|
| 87 |
+
"file_name": "utils.py",
|
| 88 |
+
"description": (
|
| 89 |
+
"Review this Python utility module. "
|
| 90 |
+
"Identify any bugs, logical errors, or code quality issues. "
|
| 91 |
+
"Add a comment for each issue you find (include line number, severity, "
|
| 92 |
+
"and category), then submit your review."
|
| 93 |
+
),
|
| 94 |
+
"max_steps": 15,
|
| 95 |
+
"code": """\
|
| 96 |
+
def calculate_average(numbers):
|
| 97 |
+
\"\"\"Calculate the average of a list of numbers.\"\"\"
|
| 98 |
+
total = 0
|
| 99 |
+
for i in range(len(numbers) + 1): # line 4
|
| 100 |
+
total += numbers[i]
|
| 101 |
+
average = total / len(numbers)
|
| 102 |
+
unused_result = sorted(numbers) # line 7
|
| 103 |
+
return average
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def find_max(items):
|
| 107 |
+
\"\"\"Return the maximum value in a list.\"\"\"
|
| 108 |
+
if len(items) == 0:
|
| 109 |
+
return None
|
| 110 |
+
max_val = items[0]
|
| 111 |
+
for item in items:
|
| 112 |
+
if item > max_val:
|
| 113 |
+
max_val == item # line 17: should be =, not ==
|
| 114 |
+
return max_val
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def is_palindrome(s):
|
| 118 |
+
\"\"\"Check if a string is a palindrome.\"\"\"
|
| 119 |
+
return s == s[::-1]
|
| 120 |
+
""",
|
| 121 |
+
"issues": [
|
| 122 |
+
{
|
| 123 |
+
"id": "off_by_one",
|
| 124 |
+
"description": "Off-by-one: range(len+1) causes IndexError on the last iteration",
|
| 125 |
+
"line_range": (4, 5),
|
| 126 |
+
"keywords": [
|
| 127 |
+
"off-by-one", "off by one", "range", "index", "indexerror",
|
| 128 |
+
"out of bounds", "len + 1", "+ 1", "index out",
|
| 129 |
+
],
|
| 130 |
+
"category": "bug",
|
| 131 |
+
"severity": "error",
|
| 132 |
+
"weight": 1.0,
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"id": "unused_variable",
|
| 136 |
+
"description": "unused_result is assigned but never used",
|
| 137 |
+
"line_range": (7, 7),
|
| 138 |
+
"keywords": [
|
| 139 |
+
"unused", "unused_result", "never used", "dead code",
|
| 140 |
+
"not used", "unnecessary",
|
| 141 |
+
],
|
| 142 |
+
"category": "style",
|
| 143 |
+
"severity": "info",
|
| 144 |
+
"weight": 0.5,
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"id": "assignment_not_update",
|
| 148 |
+
"description": "max_val == item uses == (comparison) instead of = (assignment); max is never updated",
|
| 149 |
+
"line_range": (17, 17),
|
| 150 |
+
"keywords": [
|
| 151 |
+
"==", "assignment", "comparison", "max_val", "never update",
|
| 152 |
+
"not updating", "wrong operator", "should be =", "max never",
|
| 153 |
+
],
|
| 154 |
+
"category": "bug",
|
| 155 |
+
"severity": "error",
|
| 156 |
+
"weight": 1.0,
|
| 157 |
+
},
|
| 158 |
+
],
|
| 159 |
+
"correct_decision": "request_changes",
|
| 160 |
+
},
|
| 161 |
+
|
| 162 |
+
# ── Task 2: Medium ───────────────────────────────────────────────────────
|
| 163 |
+
{
|
| 164 |
+
"id": 2,
|
| 165 |
+
"name": "Security Vulnerability Review",
|
| 166 |
+
"difficulty": "medium",
|
| 167 |
+
"file_name": "auth.py",
|
| 168 |
+
"description": (
|
| 169 |
+
"Review this authentication module for security vulnerabilities. "
|
| 170 |
+
"Pay careful attention to credential handling, input sanitization, "
|
| 171 |
+
"and cryptographic choices. Annotate every issue with its severity "
|
| 172 |
+
"and category, then submit your review."
|
| 173 |
+
),
|
| 174 |
+
"max_steps": 20,
|
| 175 |
+
"code": """\
|
| 176 |
+
import sqlite3
|
| 177 |
+
import hashlib
|
| 178 |
+
import os
|
| 179 |
+
|
| 180 |
+
DB_PASSWORD = "super_secret_123" # line 5
|
| 181 |
+
ADMIN_TOKEN = "tok_admin_abc123" # line 6
|
| 182 |
+
|
| 183 |
+
def authenticate_user(username, password):
|
| 184 |
+
\"\"\"Authenticate a user against the database.\"\"\"
|
| 185 |
+
conn = sqlite3.connect('app.db')
|
| 186 |
+
cursor = conn.cursor()
|
| 187 |
+
# line 12: f-string interpolation β SQL injection
|
| 188 |
+
query = f"SELECT * FROM users WHERE username = '{username}' AND password = '{password}'"
|
| 189 |
+
cursor.execute(query)
|
| 190 |
+
user = cursor.fetchone()
|
| 191 |
+
conn.close()
|
| 192 |
+
return user is not None
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def hash_password(password):
|
| 196 |
+
\"\"\"Hash a password for storage.\"\"\"
|
| 197 |
+
return hashlib.md5(password.encode()).hexdigest() # line 21
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def execute_admin_command(command):
|
| 201 |
+
\"\"\"Execute an admin maintenance command.\"\"\"
|
| 202 |
+
result = eval(command) # line 25
|
| 203 |
+
return result
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def get_user_data(user_id):
|
| 207 |
+
\"\"\"Fetch user profile from internal service.\"\"\"
|
| 208 |
+
import requests
|
| 209 |
+
url = f"https://internal-api/users/{user_id}"
|
| 210 |
+
response = requests.get(url, verify=False) # line 32
|
| 211 |
+
return response.json()
|
| 212 |
+
""",
|
| 213 |
+
"issues": [
|
| 214 |
+
{
|
| 215 |
+
"id": "hardcoded_credentials",
|
| 216 |
+
"description": "Credentials hard-coded in source (lines 5-6)",
|
| 217 |
+
"line_range": (5, 6),
|
| 218 |
+
"keywords": [
|
| 219 |
+
"hardcoded", "hard-coded", "hard coded", "hardcode",
|
| 220 |
+
"db_password", "admin_token", "plaintext credential",
|
| 221 |
+
"environment variable", "env var", "os.environ",
|
| 222 |
+
],
|
| 223 |
+
"category": "security",
|
| 224 |
+
"severity": "critical",
|
| 225 |
+
"weight": 1.0,
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
"id": "sql_injection",
|
| 229 |
+
"description": "SQL injection via unsanitised f-string interpolation",
|
| 230 |
+
"line_range": (12, 14),
|
| 231 |
+
"keywords": [
|
| 232 |
+
"sql injection", "sql", "injection", "f-string", "parameterized",
|
| 233 |
+
"sanitize", "escape", "prepared statement", "placeholder",
|
| 234 |
+
],
|
| 235 |
+
"category": "security",
|
| 236 |
+
"severity": "critical",
|
| 237 |
+
"weight": 1.0,
|
| 238 |
+
},
|
| 239 |
+
{
|
| 240 |
+
"id": "weak_hashing",
|
| 241 |
+
"description": "MD5 is cryptographically broken for password storage",
|
| 242 |
+
"line_range": (21, 21),
|
| 243 |
+
"keywords": [
|
| 244 |
+
"md5", "weak", "bcrypt", "argon2", "pbkdf2", "scrypt",
|
| 245 |
+
"cryptographic", "password hashing", "hash", "broken",
|
| 246 |
+
],
|
| 247 |
+
"category": "security",
|
| 248 |
+
"severity": "error",
|
| 249 |
+
"weight": 0.75,
|
| 250 |
+
},
|
| 251 |
+
{
|
| 252 |
+
"id": "arbitrary_code_execution",
|
| 253 |
+
"description": "eval() on untrusted input allows arbitrary code execution",
|
| 254 |
+
"line_range": (25, 25),
|
| 255 |
+
"keywords": [
|
| 256 |
+
"eval", "arbitrary code", "code execution", "rce",
|
| 257 |
+
"remote code", "dangerous", "unsafe",
|
| 258 |
+
],
|
| 259 |
+
"category": "security",
|
| 260 |
+
"severity": "critical",
|
| 261 |
+
"weight": 1.0,
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"id": "ssl_verification_disabled",
|
| 265 |
+
"description": "verify=False disables TLS cert validation, enabling MITM attacks",
|
| 266 |
+
"line_range": (32, 32),
|
| 267 |
+
"keywords": [
|
| 268 |
+
"ssl", "verify", "certificate", "mitm",
|
| 269 |
+
"man-in-the-middle", "tls", "verify=false", "cert",
|
| 270 |
+
],
|
| 271 |
+
"category": "security",
|
| 272 |
+
"severity": "error",
|
| 273 |
+
"weight": 0.75,
|
| 274 |
+
},
|
| 275 |
+
],
|
| 276 |
+
"correct_decision": "request_changes",
|
| 277 |
+
},
|
| 278 |
+
|
| 279 |
+
# ── Task 3: Hard ─────────────────────────────────────────────────────────
|
| 280 |
+
{
|
| 281 |
+
"id": 3,
|
| 282 |
+
"name": "Full Architecture and Performance Review",
|
| 283 |
+
"difficulty": "hard",
|
| 284 |
+
"file_name": "data_pipeline.py",
|
| 285 |
+
"description": (
|
| 286 |
+
"Perform a comprehensive review of this data pipeline. "
|
| 287 |
+
"Identify bugs, security vulnerabilities, performance bottlenecks, "
|
| 288 |
+
"and architectural design issues. Each comment should clearly explain "
|
| 289 |
+
"the problem and suggest a fix. Submit your review when done."
|
| 290 |
+
),
|
| 291 |
+
"max_steps": 30,
|
| 292 |
+
"code": """\
|
| 293 |
+
import requests
|
| 294 |
+
import json
|
| 295 |
+
import time
|
| 296 |
+
from threading import Thread
|
| 297 |
+
|
| 298 |
+
API_KEY = "sk-prod-abc123def456" # line 6
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
class DataPipeline:
|
| 302 |
+
def __init__(self, endpoint):
|
| 303 |
+
self.endpoint = endpoint
|
| 304 |
+
self.results = []
|
| 305 |
+
self.cache = {} # line 13: unbounded
|
| 306 |
+
|
| 307 |
+
def fetch_batch(self, item_ids):
|
| 308 |
+
\"\"\"Fetch items from the API.\"\"\"
|
| 309 |
+
items = []
|
| 310 |
+
for item_id in item_ids: # line 17: N+1 pattern
|
| 311 |
+
response = requests.get(
|
| 312 |
+
f"{self.endpoint}/items/{item_id}",
|
| 313 |
+
headers={"Authorization": f"Bearer {API_KEY}"},
|
| 314 |
+
verify=False, # line 22
|
| 315 |
+
)
|
| 316 |
+
items.append(response.json())
|
| 317 |
+
return items
|
| 318 |
+
|
| 319 |
+
def process_items(self, items):
|
| 320 |
+
\"\"\"Transform items for storage.\"\"\"
|
| 321 |
+
results = []
|
| 322 |
+
for i in range(len(items)): # line 28: use enumerate
|
| 323 |
+
item = items[i]
|
| 324 |
+
transformed = {
|
| 325 |
+
"id": item["id"], # line 31: KeyError not handled
|
| 326 |
+
"value": item["value"] * 2,
|
| 327 |
+
"label": item.get("label", "unknown"),
|
| 328 |
+
}
|
| 329 |
+
results.append(transformed)
|
| 330 |
+
self.cache[item["id"]] = transformed # line 36
|
| 331 |
+
return results
|
| 332 |
+
|
| 333 |
+
def run_async(self, func, *args):
|
| 334 |
+
\"\"\"Run function in a background thread.\"\"\"
|
| 335 |
+
t = Thread(target=func, args=args)
|
| 336 |
+
t.start()
|
| 337 |
+
# line 43: thread not tracked or joined β resource leak
|
| 338 |
+
|
| 339 |
+
def save_results(self, results, output_path):
|
| 340 |
+
\"\"\"Persist results to disk.\"\"\"
|
| 341 |
+
with open(output_path, "w") as f:
|
| 342 |
+
json.dump(results, f)
|
| 343 |
+
|
| 344 |
+
def retry_failed(self, failed_ids, max_retries=10): # line 50
|
| 345 |
+
\"\"\"Re-fetch items that previously failed.\"\"\"
|
| 346 |
+
for item_id in failed_ids:
|
| 347 |
+
for attempt in range(max_retries):
|
| 348 |
+
try:
|
| 349 |
+
result = requests.get(
|
| 350 |
+
f"{self.endpoint}/items/{item_id}"
|
| 351 |
+
)
|
| 352 |
+
if result.status_code == 200:
|
| 353 |
+
self.results.append(result.json())
|
| 354 |
+
break
|
| 355 |
+
except Exception:
|
| 356 |
+
time.sleep(1) # line 60: no exponential backoff
|
| 357 |
+
""",
|
| 358 |
+
"issues": [
|
| 359 |
+
{
|
| 360 |
+
"id": "hardcoded_api_key",
|
| 361 |
+
"description": "API key hard-coded in source instead of an environment variable",
|
| 362 |
+
"line_range": (6, 6),
|
| 363 |
+
"keywords": [
|
| 364 |
+
"hardcoded", "hard-coded", "hardcode", "api key", "api_key",
|
| 365 |
+
"environment variable", "env var", "os.environ", "sk-prod",
|
| 366 |
+
],
|
| 367 |
+
"category": "security",
|
| 368 |
+
"severity": "critical",
|
| 369 |
+
"weight": 1.0,
|
| 370 |
+
},
|
| 371 |
+
{
|
| 372 |
+
"id": "n_plus_one_requests",
|
| 373 |
+
"description": "One HTTP request per item (N+1 pattern); should use a bulk/batch endpoint",
|
| 374 |
+
"line_range": (17, 24),
|
| 375 |
+
"keywords": [
|
| 376 |
+
"n+1", "n plus 1", "batch", "bulk", "loop",
|
| 377 |
+
"individual request", "serial", "one request per",
|
| 378 |
+
],
|
| 379 |
+
"category": "performance",
|
| 380 |
+
"severity": "error",
|
| 381 |
+
"weight": 1.0,
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"id": "ssl_disabled",
|
| 385 |
+
"description": "SSL certificate verification disabled (verify=False)",
|
| 386 |
+
"line_range": (22, 22),
|
| 387 |
+
"keywords": [
|
| 388 |
+
"ssl", "verify", "certificate", "tls",
|
| 389 |
+
"mitm", "verify=false", "cert",
|
| 390 |
+
],
|
| 391 |
+
"category": "security",
|
| 392 |
+
"severity": "error",
|
| 393 |
+
"weight": 0.75,
|
| 394 |
+
},
|
| 395 |
+
{
|
| 396 |
+
"id": "missing_key_error_handling",
|
| 397 |
+
"description": "Direct dict access item['id'] / item['value'] raises KeyError on unexpected payloads",
|
| 398 |
+
"line_range": (31, 32),
|
| 399 |
+
"keywords": [
|
| 400 |
+
"keyerror", "key error", "error handling", "missing key",
|
| 401 |
+
"exception", "try", ".get(", "dict access",
|
| 402 |
+
],
|
| 403 |
+
"category": "bug",
|
| 404 |
+
"severity": "warning",
|
| 405 |
+
"weight": 0.75,
|
| 406 |
+
},
|
| 407 |
+
{
|
| 408 |
+
"id": "unbounded_cache",
|
| 409 |
+
"description": "self.cache grows without bound; will cause OOM on large inputs",
|
| 410 |
+
"line_range": (13, 13),
|
| 411 |
+
"keywords": [
|
| 412 |
+
"unbounded", "memory leak", "cache size", "limit",
|
| 413 |
+
"lru", "eviction", "grow", "oom", "memory",
|
| 414 |
+
],
|
| 415 |
+
"category": "design",
|
| 416 |
+
"severity": "warning",
|
| 417 |
+
"weight": 0.75,
|
| 418 |
+
},
|
| 419 |
+
{
|
| 420 |
+
"id": "thread_not_joined",
|
| 421 |
+
"description": "Thread is started but never stored or joined β silent resource/exception leak",
|
| 422 |
+
"line_range": (40, 43),
|
| 423 |
+
"keywords": [
|
| 424 |
+
"thread", "join", "track", "resource leak",
|
| 425 |
+
"daemon", "not joined", "not tracked",
|
| 426 |
+
],
|
| 427 |
+
"category": "bug",
|
| 428 |
+
"severity": "error",
|
| 429 |
+
"weight": 1.0,
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"id": "no_exponential_backoff",
|
| 433 |
+
"description": "Retry loop sleeps 1 s flat; needs exponential backoff to avoid hammering the API",
|
| 434 |
+
"line_range": (50, 60),
|
| 435 |
+
"keywords": [
|
| 436 |
+
"backoff", "exponential", "retry", "sleep", "rate limit",
|
| 437 |
+
"jitter", "aggressive",
|
| 438 |
+
],
|
| 439 |
+
"category": "design",
|
| 440 |
+
"severity": "warning",
|
| 441 |
+
"weight": 0.5,
|
| 442 |
+
},
|
| 443 |
+
],
|
| 444 |
+
"correct_decision": "request_changes",
|
| 445 |
+
},
|
| 446 |
+
|
| 447 |
+
# ── Task 4: Medium — Async Concurrency ───────────────────────────────
|
| 448 |
+
{
|
| 449 |
+
"id": 4,
|
| 450 |
+
"name": "Async Worker Review",
|
| 451 |
+
"difficulty": "medium",
|
| 452 |
+
"file_name": "async_worker.py",
|
| 453 |
+
"description": (
|
| 454 |
+
"Review this async worker module for concurrency bugs, "
|
| 455 |
+
"resource leaks, and exception-handling problems. "
|
| 456 |
+
"Comment on every issue with its line number, severity, "
|
| 457 |
+
"and category, then submit your review."
|
| 458 |
+
),
|
| 459 |
+
"max_steps": 20,
|
| 460 |
+
"code": """\
|
| 461 |
+
import asyncio
|
| 462 |
+
import aiohttp
|
| 463 |
+
|
| 464 |
+
_counter = 0 # line 3: shared mutable state, not thread/task-safe
|
| 465 |
+
|
| 466 |
+
async def fetch_url(url: str) -> dict:
|
| 467 |
+
\"\"\"Fetch a URL and return JSON.\"\"\"
|
| 468 |
+
session = aiohttp.ClientSession() # line 7: session never closed β resource leak
|
| 469 |
+
async with session.get(url) as resp:
|
| 470 |
+
return await resp.json()
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
async def increment_and_fetch(url: str) -> dict:
|
| 474 |
+
\"\"\"Increment shared counter then fetch.\"\"\"
|
| 475 |
+
global _counter
|
| 476 |
+
_counter += 1 # line 15: race condition β not atomic in concurrent tasks
|
| 477 |
+
data = fetch_url(url) # line 16: missing await β returns coroutine, not result
|
| 478 |
+
return data
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
async def run_all(urls: list) -> list:
|
| 482 |
+
\"\"\"Run all fetches concurrently.\"\"\"
|
| 483 |
+
tasks = [increment_and_fetch(u) for u in urls]
|
| 484 |
+
results = []
|
| 485 |
+
for coro in tasks:
|
| 486 |
+
try:
|
| 487 |
+
result = await coro
|
| 488 |
+
results.append(result)
|
| 489 |
+
except Exception:
|
| 490 |
+
pass # line 27: swallows all exceptions silently
|
| 491 |
+
return results
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
async def retry_fetch(url: str, retries: int = 3) -> dict:
|
| 495 |
+
\"\"\"Fetch with retry logic.\"\"\"
|
| 496 |
+
for attempt in range(retries):
|
| 497 |
+
try:
|
| 498 |
+
return await fetch_url(url)
|
| 499 |
+
except Exception as e:
|
| 500 |
+
if attempt == retries - 1:
|
| 501 |
+
raise
|
| 502 |
+
await asyncio.sleep(1) # line 38: flat sleep, no exponential backoff
|
| 503 |
+
""",
|
| 504 |
+
"issues": [
|
| 505 |
+
{
|
| 506 |
+
"id": "shared_mutable_state",
|
| 507 |
+
"description": "Module-level _counter mutated by concurrent tasks without a lock",
|
| 508 |
+
"line_range": (3, 3),
|
| 509 |
+
"keywords": [
|
| 510 |
+
"shared", "race condition", "thread-safe", "task-safe",
|
| 511 |
+
"atomic", "lock", "asyncio.lock", "concurrent", "global",
|
| 512 |
+
"mutable", "not safe",
|
| 513 |
+
],
|
| 514 |
+
"category": "bug",
|
| 515 |
+
"severity": "error",
|
| 516 |
+
"weight": 1.0,
|
| 517 |
+
},
|
| 518 |
+
{
|
| 519 |
+
"id": "unclosed_session",
|
| 520 |
+
"description": "aiohttp.ClientSession created inside function is never closed β resource leak",
|
| 521 |
+
"line_range": (7, 9),
|
| 522 |
+
"keywords": [
|
| 523 |
+
"session", "not closed", "resource leak", "close", "context manager",
|
| 524 |
+
"async with", "clientsession", "leak", "aiohttp",
|
| 525 |
+
],
|
| 526 |
+
"category": "bug",
|
| 527 |
+
"severity": "error",
|
| 528 |
+
"weight": 1.0,
|
| 529 |
+
},
|
| 530 |
+
{
|
| 531 |
+
"id": "missing_await",
|
| 532 |
+
"description": "fetch_url(url) called without await β returns unawaited coroutine",
|
| 533 |
+
"line_range": (16, 16),
|
| 534 |
+
"keywords": [
|
| 535 |
+
"await", "missing await", "coroutine", "not awaited", "unawaited",
|
| 536 |
+
"returns coroutine",
|
| 537 |
+
],
|
| 538 |
+
"category": "bug",
|
| 539 |
+
"severity": "critical",
|
| 540 |
+
"weight": 1.0,
|
| 541 |
+
},
|
| 542 |
+
{
|
| 543 |
+
"id": "silent_exception",
|
| 544 |
+
"description": "bare except: pass swallows all exceptions, hiding errors",
|
| 545 |
+
"line_range": (27, 27),
|
| 546 |
+
"keywords": [
|
| 547 |
+
"swallow", "silent", "bare except", "exception", "pass",
|
| 548 |
+
"ignore", "hidden", "suppress", "logging",
|
| 549 |
+
],
|
| 550 |
+
"category": "design",
|
| 551 |
+
"severity": "warning",
|
| 552 |
+
"weight": 0.75,
|
| 553 |
+
},
|
| 554 |
+
{
|
| 555 |
+
"id": "no_backoff",
|
| 556 |
+
"description": "Retry sleep is flat 1 s; should use exponential backoff with jitter",
|
| 557 |
+
"line_range": (38, 38),
|
| 558 |
+
"keywords": [
|
| 559 |
+
"backoff", "exponential", "jitter", "retry", "sleep",
|
| 560 |
+
"flat", "rate limit",
|
| 561 |
+
],
|
| 562 |
+
"category": "design",
|
| 563 |
+
"severity": "warning",
|
| 564 |
+
"weight": 0.5,
|
| 565 |
+
},
|
| 566 |
+
],
|
| 567 |
+
"correct_decision": "request_changes",
|
| 568 |
+
},
|
| 569 |
+
|
| 570 |
+
# ── Task 5: Hard — Flask API Vulnerabilities ──────────────────────────
|
| 571 |
+
{
|
| 572 |
+
"id": 5,
|
| 573 |
+
"name": "Flask API Security Review",
|
| 574 |
+
"difficulty": "hard",
|
| 575 |
+
"file_name": "api_server.py",
|
| 576 |
+
"description": (
|
| 577 |
+
"Perform a thorough security review of this Flask REST API. "
|
| 578 |
+
"Look for injection flaws, path traversal, insecure deserialization, "
|
| 579 |
+
"sensitive data exposure, and missing access controls. "
|
| 580 |
+
"Comment on every issue, then submit your review."
|
| 581 |
+
),
|
| 582 |
+
"max_steps": 30,
|
| 583 |
+
"code": """\
|
| 584 |
+
import os
|
| 585 |
+
import pickle
|
| 586 |
+
import subprocess
|
| 587 |
+
import logging
|
| 588 |
+
from flask import Flask, request, jsonify, send_file
|
| 589 |
+
|
| 590 |
+
app = Flask(__name__)
|
| 591 |
+
SECRET_KEY = "flask-secret-hardcoded" # line 8
|
| 592 |
+
logging.basicConfig(level=logging.DEBUG)
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
@app.route("/run", methods=["POST"])
|
| 596 |
+
def run_command():
|
| 597 |
+
\"\"\"Run a system command and return output.\"\"\"
|
| 598 |
+
cmd = request.json.get("command", "")
|
| 599 |
+
# line 15: unsanitised shell command β OS command injection
|
| 600 |
+
result = subprocess.check_output(cmd, shell=True, text=True)
|
| 601 |
+
return jsonify({"output": result})
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
@app.route("/files", methods=["GET"])
|
| 605 |
+
def get_file():
|
| 606 |
+
\"\"\"Serve a file from the data directory.\"\"\"
|
| 607 |
+
filename = request.args.get("name", "")
|
| 608 |
+
# line 23: no path normalisation β path traversal
|
| 609 |
+
path = os.path.join("/app/data", filename)
|
| 610 |
+
return send_file(path)
|
| 611 |
+
|
| 612 |
+
|
| 613 |
+
@app.route("/load", methods=["POST"])
|
| 614 |
+
def load_object():
|
| 615 |
+
\"\"\"Deserialise a user-supplied payload.\"\"\"
|
| 616 |
+
data = request.get_data()
|
| 617 |
+
# line 30: pickle.loads on untrusted data β arbitrary code execution
|
| 618 |
+
obj = pickle.loads(data)
|
| 619 |
+
return jsonify({"type": str(type(obj))})
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
@app.route("/login", methods=["POST"])
|
| 623 |
+
def login():
|
| 624 |
+
\"\"\"Authenticate and return a token.\"\"\"
|
| 625 |
+
username = request.json.get("username")
|
| 626 |
+
password = request.json.get("password")
|
| 627 |
+
# line 38: credentials logged at DEBUG level
|
| 628 |
+
logging.debug(f"Login attempt: username={username} password={password}")
|
| 629 |
+
if username == "admin" and password == SECRET_KEY:
|
| 630 |
+
return jsonify({"token": SECRET_KEY}) # line 41: secret returned in response
|
| 631 |
+
return jsonify({"error": "unauthorized"}), 401
|
| 632 |
+
|
| 633 |
+
|
| 634 |
+
@app.route("/admin", methods=["GET"])
|
| 635 |
+
def admin_panel():
|
| 636 |
+
\"\"\"Return admin data — no auth check.\"\"\"
|
| 637 |
+
# line 47: no authentication or authorisation check
|
| 638 |
+
return jsonify({"users": ["alice", "bob", "admin"], "config": {"debug": True}})
|
| 639 |
+
""",
|
| 640 |
+
"issues": [
|
| 641 |
+
{
|
| 642 |
+
"id": "hardcoded_secret",
|
| 643 |
+
"description": "Flask SECRET_KEY hard-coded in source; should come from env var",
|
| 644 |
+
"line_range": (8, 8),
|
| 645 |
+
"keywords": [
|
| 646 |
+
"hardcoded", "hard-coded", "secret_key", "environment variable",
|
| 647 |
+
"env var", "os.environ", "secret", "hardcode",
|
| 648 |
+
],
|
| 649 |
+
"category": "security",
|
| 650 |
+
"severity": "critical",
|
| 651 |
+
"weight": 0.75,
|
| 652 |
+
},
|
| 653 |
+
{
|
| 654 |
+
"id": "command_injection",
|
| 655 |
+
"description": "subprocess.check_output with shell=True and unsanitised user input → OS command injection",
|
| 656 |
+
"line_range": (15, 16),
|
| 657 |
+
"keywords": [
|
| 658 |
+
"command injection", "shell injection", "shell=true", "subprocess",
|
| 659 |
+
"os injection", "arbitrary command", "unsanitised", "sanitize",
|
| 660 |
+
"injection",
|
| 661 |
+
],
|
| 662 |
+
"category": "security",
|
| 663 |
+
"severity": "critical",
|
| 664 |
+
"weight": 1.0,
|
| 665 |
+
},
|
| 666 |
+
{
|
| 667 |
+
"id": "path_traversal",
|
| 668 |
+
"description": "No path normalisation allows ../../../etc/passwd-style traversal",
|
| 669 |
+
"line_range": (23, 24),
|
| 670 |
+
"keywords": [
|
| 671 |
+
"path traversal", "directory traversal", "path normaliz",
|
| 672 |
+
"os.path.abspath", "realpath", "../", "dot dot",
|
| 673 |
+
"escape", "filename", "traversal",
|
| 674 |
+
],
|
| 675 |
+
"category": "security",
|
| 676 |
+
"severity": "critical",
|
| 677 |
+
"weight": 1.0,
|
| 678 |
+
},
|
| 679 |
+
{
|
| 680 |
+
"id": "insecure_deserialization",
|
| 681 |
+
"description": "pickle.loads on untrusted user data allows arbitrary code execution",
|
| 682 |
+
"line_range": (30, 31),
|
| 683 |
+
"keywords": [
|
| 684 |
+
"pickle", "deserialization", "deserialisation", "arbitrary code",
|
| 685 |
+
"untrusted", "rce", "remote code", "insecure deserialization",
|
| 686 |
+
],
|
| 687 |
+
"category": "security",
|
| 688 |
+
"severity": "critical",
|
| 689 |
+
"weight": 1.0,
|
| 690 |
+
},
|
| 691 |
+
{
|
| 692 |
+
"id": "credentials_in_logs",
|
| 693 |
+
"description": "Plaintext username and password written to DEBUG log",
|
| 694 |
+
"line_range": (38, 38),
|
| 695 |
+
"keywords": [
|
| 696 |
+
"log", "logging", "credential", "password", "sensitive",
|
| 697 |
+
"plaintext", "debug", "leak", "exposure",
|
| 698 |
+
],
|
| 699 |
+
"category": "security",
|
| 700 |
+
"severity": "error",
|
| 701 |
+
"weight": 0.75,
|
| 702 |
+
},
|
| 703 |
+
{
|
| 704 |
+
"id": "missing_auth_check",
|
| 705 |
+
"description": "Admin endpoint has no authentication or authorisation guard",
|
| 706 |
+
"line_range": (47, 47),
|
| 707 |
+
"keywords": [
|
| 708 |
+
"auth", "authentication", "authorization", "authorisation",
|
| 709 |
+
"access control", "no check", "unprotected", "unauthenticated",
|
| 710 |
+
"missing auth",
|
| 711 |
+
],
|
| 712 |
+
"category": "security",
|
| 713 |
+
"severity": "critical",
|
| 714 |
+
"weight": 1.0,
|
| 715 |
+
},
|
| 716 |
+
],
|
| 717 |
+
"correct_decision": "request_changes",
|
| 718 |
+
},
|
| 719 |
+
]
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|