Commit ·
7a23e48
1
Parent(s): 5e92b80
hf new add
Browse files- .github/workflows/ci.yml +31 -0
- README.md +61 -0
- environment/env.py +12 -0
- environment/graders.py +4 -0
- environment/tasks.py +23 -2
- inference.py +51 -3
- openenv.yaml +4 -0
- scripts/run_benchmark.py +106 -0
- scripts/validate-submission.sh +63 -0
- submit.py +11 -3
- tests/test_env.py +116 -0
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
pull_request:
|
| 6 |
+
|
| 7 |
+
jobs:
|
| 8 |
+
test-and-validate:
|
| 9 |
+
runs-on: ubuntu-latest
|
| 10 |
+
steps:
|
| 11 |
+
- name: Checkout
|
| 12 |
+
uses: actions/checkout@v4
|
| 13 |
+
|
| 14 |
+
- name: Setup Python
|
| 15 |
+
uses: actions/setup-python@v5
|
| 16 |
+
with:
|
| 17 |
+
python-version: '3.12'
|
| 18 |
+
|
| 19 |
+
- name: Install dependencies
|
| 20 |
+
run: |
|
| 21 |
+
python -m pip install --upgrade pip
|
| 22 |
+
pip install -r requirements.txt
|
| 23 |
+
|
| 24 |
+
- name: Run unit tests
|
| 25 |
+
run: pytest -q
|
| 26 |
+
|
| 27 |
+
- name: Validate OpenEnv
|
| 28 |
+
run: openenv validate
|
| 29 |
+
|
| 30 |
+
- name: Run submit preflight (CI mode)
|
| 31 |
+
run: python submit.py --skip-baseline --skip-docker --no-coverage
|
README.md
CHANGED
|
@@ -181,3 +181,64 @@ Each inference run writes JSON like:
|
|
| 181 |
- The baseline now enforces phased review behavior and falls back to deterministic actions when the model is temporarily unavailable.
|
| 182 |
- For reproducible runs, keep `TEMPERATURE=0.0`.
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
- The baseline now enforces phased review behavior and falls back to deterministic actions when the model is temporarily unavailable.
|
| 182 |
- For reproducible runs, keep `TEMPERATURE=0.0`.
|
| 183 |
|
| 184 |
+
## Fast Start (3 Commands)
|
| 185 |
+
|
| 186 |
+
```bash
|
| 187 |
+
source .venv/bin/activate
|
| 188 |
+
pytest -q
|
| 189 |
+
python submit.py --skip-docker --max-steps 10
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
## Judge Map (Criterion -> Evidence)
|
| 193 |
+
|
| 194 |
+
| Criterion | Evidence | File |
|
| 195 |
+
|---|---|---|
|
| 196 |
+
| OpenEnv lifecycle compliance | reset/step/state implemented and served over HTTP | `environment/env.py`, `server/app.py` |
|
| 197 |
+
| Typed models | Pydantic action/state/observation models | `environment/models.py` |
|
| 198 |
+
| Task difficulty progression | easy/medium/hard tasks plus approve-only calibration tasks (no issues expected) | `environment/tasks.py` |
|
| 199 |
+
| Grading quality | detection/suggestion/decision + partial credit + FP penalty + efficiency bonus | `environment/graders.py` |
|
| 200 |
+
| Baseline reproducibility | deterministic seed support in reset + inference output metadata | `environment/env.py`, `inference.py` |
|
| 201 |
+
| Submission validation | Python preflight + bash validator script | `submit.py`, `scripts/validate-submission.sh` |
|
| 202 |
+
|
| 203 |
+
## Grader Rubric (Summary)
|
| 204 |
+
|
| 205 |
+
| Component | Weight / Effect | Notes |
|
| 206 |
+
|---|---|---|
|
| 207 |
+
| Detection score | 0.4 | Partial credit for near-line matches |
|
| 208 |
+
| Suggestion score | 0.3 | Line-proximity matching for fixes |
|
| 209 |
+
| Decision score | 0.3 | Approve for no-issue tasks, request_changes otherwise |
|
| 210 |
+
| False positive penalty | up to -0.4 | Strong penalty for issue spam |
|
| 211 |
+
| Efficiency bonus | up to +0.1 | Bonus for completing in fewer steps |
|
| 212 |
+
| Final score clamp | [0,1] | Safety clamp in grader |
|
| 213 |
+
|
| 214 |
+
## Benchmark Snapshot (3-Task Local Run)
|
| 215 |
+
|
| 216 |
+
| Task | Task Score | Total Reward | Model |
|
| 217 |
+
|---|---:|---:|---|
|
| 218 |
+
| bug_detection_easy_1 | 1.000 | 1.410 | meta/llama-3.3-70b-instruct |
|
| 219 |
+
| memory_leak_medium_1 | 0.875 | 1.285 | meta/llama-3.3-70b-instruct |
|
| 220 |
+
| security_hard_1 | 1.000 | 1.410 | meta/llama-3.3-70b-instruct |
|
| 221 |
+
|
| 222 |
+
Note: `task_score` is normalized to [0,1]. `total_reward` is cumulative step reward and can exceed 1.0 by design.
|
| 223 |
+
|
| 224 |
+
## One-Command Benchmark Table
|
| 225 |
+
|
| 226 |
+
Generate per-task JSON outputs plus a markdown table for judge submission:
|
| 227 |
+
|
| 228 |
+
```bash
|
| 229 |
+
source .venv/bin/activate
|
| 230 |
+
python scripts/run_benchmark.py --max-steps 10
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
Artifacts:
|
| 234 |
+
|
| 235 |
+
- `outputs/benchmark_<task_id>.json`
|
| 236 |
+
- `outputs/benchmark_table.md`
|
| 237 |
+
|
| 238 |
+
## Failure Analysis Template
|
| 239 |
+
|
| 240 |
+
- Missed issue type:
|
| 241 |
+
- Why it was missed (model behavior or prompt failure):
|
| 242 |
+
- Grader diagnostics (precision/recall/F1/FP):
|
| 243 |
+
- Fix applied (prompt/rubric/task change):
|
| 244 |
+
|
environment/env.py
CHANGED
|
@@ -71,9 +71,20 @@ class CodeReviewEnv:
|
|
| 71 |
self._state.last_error = str(e)
|
| 72 |
return self._get_observation(), -0.1, False, {"error": str(e), "last_action_valid": False}
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
self._state.current_step += 1
|
| 75 |
self._process_action(review_action)
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
self._action_history.append({
|
| 78 |
"step": self._state.current_step,
|
| 79 |
"action_type": review_action.action_type.value,
|
|
@@ -103,6 +114,7 @@ class CodeReviewEnv:
|
|
| 103 |
self._state.final_decision or "changes_requested",
|
| 104 |
grader,
|
| 105 |
self._state.last_action_valid,
|
|
|
|
| 106 |
steps_taken=self._state.current_step,
|
| 107 |
max_steps=self.max_steps,
|
| 108 |
)
|
|
|
|
| 71 |
self._state.last_error = str(e)
|
| 72 |
return self._get_observation(), -0.1, False, {"error": str(e), "last_action_valid": False}
|
| 73 |
|
| 74 |
+
existing_comment_keys = {
|
| 75 |
+
(c.line_number, c.content.strip().lower()) for c in self._state.comments_made
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
self._state.current_step += 1
|
| 79 |
self._process_action(review_action)
|
| 80 |
|
| 81 |
+
duplicate_comment_count = 0
|
| 82 |
+
if review_action.action_type.value == "add_comment":
|
| 83 |
+
for c in review_action.comments:
|
| 84 |
+
key = (c.line_number, c.content.strip().lower())
|
| 85 |
+
if key in existing_comment_keys:
|
| 86 |
+
duplicate_comment_count += 1
|
| 87 |
+
|
| 88 |
self._action_history.append({
|
| 89 |
"step": self._state.current_step,
|
| 90 |
"action_type": review_action.action_type.value,
|
|
|
|
| 114 |
self._state.final_decision or "changes_requested",
|
| 115 |
grader,
|
| 116 |
self._state.last_action_valid,
|
| 117 |
+
duplicate_comment_count=duplicate_comment_count,
|
| 118 |
steps_taken=self._state.current_step,
|
| 119 |
max_steps=self.max_steps,
|
| 120 |
)
|
environment/graders.py
CHANGED
|
@@ -204,6 +204,7 @@ class RewardCalculator:
|
|
| 204 |
final_decision: str,
|
| 205 |
grader: TaskGrader,
|
| 206 |
last_action_valid: bool,
|
|
|
|
| 207 |
steps_taken: int = 0,
|
| 208 |
max_steps: int = 50) -> float:
|
| 209 |
|
|
@@ -223,6 +224,9 @@ class RewardCalculator:
|
|
| 223 |
if not last_action_valid:
|
| 224 |
reward -= 0.15
|
| 225 |
|
|
|
|
|
|
|
|
|
|
| 226 |
if not current_action.comments and not current_action.suggestions:
|
| 227 |
if current_action.action_type.value in ["approve", "request_changes"]:
|
| 228 |
pass
|
|
|
|
| 204 |
final_decision: str,
|
| 205 |
grader: TaskGrader,
|
| 206 |
last_action_valid: bool,
|
| 207 |
+
duplicate_comment_count: int = 0,
|
| 208 |
steps_taken: int = 0,
|
| 209 |
max_steps: int = 50) -> float:
|
| 210 |
|
|
|
|
| 224 |
if not last_action_valid:
|
| 225 |
reward -= 0.15
|
| 226 |
|
| 227 |
+
if duplicate_comment_count > 0:
|
| 228 |
+
reward -= min(0.25, 0.08 * duplicate_comment_count)
|
| 229 |
+
|
| 230 |
if not current_action.comments and not current_action.suggestions:
|
| 231 |
if current_action.action_type.value in ["approve", "request_changes"]:
|
| 232 |
pass
|
environment/tasks.py
CHANGED
|
@@ -276,6 +276,27 @@ class Counter:
|
|
| 276 |
"language": "python",
|
| 277 |
"line_count": 9,
|
| 278 |
"expected_issues": []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
}
|
| 280 |
]
|
| 281 |
|
|
@@ -286,8 +307,8 @@ class Counter:
|
|
| 286 |
for task in all_tasks:
|
| 287 |
if task["task_id"] == canonical_task_id:
|
| 288 |
return task
|
| 289 |
-
|
| 290 |
-
|
| 291 |
|
| 292 |
@classmethod
|
| 293 |
def get_all_tasks(cls) -> List[Dict[str, Any]]:
|
|
|
|
| 276 |
"language": "python",
|
| 277 |
"line_count": 9,
|
| 278 |
"expected_issues": []
|
| 279 |
+
},
|
| 280 |
+
{
|
| 281 |
+
"task_id": "adversarial_hard_4",
|
| 282 |
+
"task_name": "Adversarial: Safe SQL Builder",
|
| 283 |
+
"difficulty": "hard",
|
| 284 |
+
"description": "No issues expected: query text looks suspicious but uses parameterized execution safely",
|
| 285 |
+
"code_diff": """def find_user(database, user_id):
|
| 286 |
+
query = \"SELECT * FROM users WHERE id = ?\"
|
| 287 |
+
params = [int(user_id)]
|
| 288 |
+
return database.execute(query, params)""",
|
| 289 |
+
"surrounding_code": """def find_user(database, user_id):
|
| 290 |
+
query = \"SELECT * FROM users WHERE id = ?\"
|
| 291 |
+
params = [int(user_id)]
|
| 292 |
+
return database.execute(query, params)
|
| 293 |
+
|
| 294 |
+
def find_all_users(database):
|
| 295 |
+
return database.execute(\"SELECT * FROM users\")""",
|
| 296 |
+
"file_path": "user_repository.py",
|
| 297 |
+
"language": "python",
|
| 298 |
+
"line_count": 4,
|
| 299 |
+
"expected_issues": []
|
| 300 |
}
|
| 301 |
]
|
| 302 |
|
|
|
|
| 307 |
for task in all_tasks:
|
| 308 |
if task["task_id"] == canonical_task_id:
|
| 309 |
return task
|
| 310 |
+
available = ", ".join(t["task_id"] for t in all_tasks)
|
| 311 |
+
raise KeyError(f"Unknown task_id '{task_id}'. Available task IDs: {available}")
|
| 312 |
|
| 313 |
@classmethod
|
| 314 |
def get_all_tasks(cls) -> List[Dict[str, Any]]:
|
inference.py
CHANGED
|
@@ -52,6 +52,38 @@ FALLBACK_ACTION = json.dumps({
|
|
| 52 |
})
|
| 53 |
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def add_line_numbers(code: str) -> str:
|
| 56 |
lines = code.split("\n")
|
| 57 |
return "\n".join(f"{i+1}: {line}" for i, line in enumerate(lines))
|
|
@@ -401,17 +433,20 @@ Respond with JSON only.
|
|
| 401 |
return {"action_type": "request_changes", "comments": [], "suggestions": []}
|
| 402 |
|
| 403 |
|
| 404 |
-
def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
|
| 405 |
agent.reset()
|
| 406 |
-
obs = env.reset(task_id=task_id)
|
| 407 |
done = False
|
| 408 |
step = 0
|
| 409 |
total_reward = 0.0
|
|
|
|
| 410 |
|
| 411 |
print(f"\nTask : {task_id}")
|
| 412 |
print(f"Desc : {obs.get('task_description', 'N/A')}")
|
| 413 |
print("-" * 60)
|
| 414 |
|
|
|
|
|
|
|
| 415 |
while not done and step < max_steps:
|
| 416 |
action_str = agent.get_action(obs)
|
| 417 |
action = agent.parse_action(action_str)
|
|
@@ -419,6 +454,7 @@ def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
|
|
| 419 |
|
| 420 |
obs, reward, done, info = env.step(action)
|
| 421 |
total_reward += reward
|
|
|
|
| 422 |
step += 1
|
| 423 |
|
| 424 |
print(f"\nStep {step}/{max_steps}:")
|
|
@@ -434,8 +470,18 @@ def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
|
|
| 434 |
if info.get('last_action_valid') is False:
|
| 435 |
print(f" Warning : {info.get('error', 'Invalid action')}")
|
| 436 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
final_score = env.get_task_score()
|
| 438 |
diagnostics = env.summary()
|
|
|
|
|
|
|
| 439 |
|
| 440 |
return {
|
| 441 |
"task_id": task_id,
|
|
@@ -450,6 +496,7 @@ def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
|
|
| 450 |
"efficiency_bonus": diagnostics.get("efficiency_bonus", 0),
|
| 451 |
"model": MODEL_NAME,
|
| 452 |
"api_base_url": API_BASE_URL,
|
|
|
|
| 453 |
}
|
| 454 |
|
| 455 |
|
|
@@ -508,6 +555,7 @@ def main():
|
|
| 508 |
parser.add_argument("--output", type=str, default="baseline_results.json")
|
| 509 |
parser.add_argument("--batch", action="store_true", help="Run all tasks in batch mode")
|
| 510 |
parser.add_argument("--difficulty", type=str, default=None, help="Filter batch by difficulty: easy, medium, hard")
|
|
|
|
| 511 |
args = parser.parse_args()
|
| 512 |
|
| 513 |
print("=" * 60)
|
|
@@ -526,7 +574,7 @@ def main():
|
|
| 526 |
task_ids = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
|
| 527 |
run_batch(env, agent, task_ids, args.max_steps, args.output)
|
| 528 |
else:
|
| 529 |
-
result = run_episode(env, agent, args.task_id, args.max_steps)
|
| 530 |
|
| 531 |
print("\n" + "=" * 60)
|
| 532 |
print("Final Results:")
|
|
|
|
| 52 |
})
|
| 53 |
|
| 54 |
|
| 55 |
+
def log_start(task: str, env_name: str, model: str, max_steps: int, seed: int | None) -> None:
|
| 56 |
+
payload = {
|
| 57 |
+
"task": task,
|
| 58 |
+
"env": env_name,
|
| 59 |
+
"model": model,
|
| 60 |
+
"max_steps": max_steps,
|
| 61 |
+
"seed": seed,
|
| 62 |
+
}
|
| 63 |
+
print(f"[START] {json.dumps(payload, ensure_ascii=True, separators=(',', ':'))}", flush=True)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
|
| 67 |
+
payload = {
|
| 68 |
+
"step": step,
|
| 69 |
+
"action": action,
|
| 70 |
+
"reward": round(float(reward), 4),
|
| 71 |
+
"done": bool(done),
|
| 72 |
+
"error": error,
|
| 73 |
+
}
|
| 74 |
+
print(f"[STEP] {json.dumps(payload, ensure_ascii=True, separators=(',', ':'))}", flush=True)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 78 |
+
payload = {
|
| 79 |
+
"success": bool(success),
|
| 80 |
+
"steps": int(steps),
|
| 81 |
+
"score": round(float(score), 4),
|
| 82 |
+
"rewards": [round(float(r), 4) for r in rewards],
|
| 83 |
+
}
|
| 84 |
+
print(f"[END] {json.dumps(payload, ensure_ascii=True, separators=(',', ':'))}", flush=True)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
def add_line_numbers(code: str) -> str:
|
| 88 |
lines = code.split("\n")
|
| 89 |
return "\n".join(f"{i+1}: {line}" for i, line in enumerate(lines))
|
|
|
|
| 433 |
return {"action_type": "request_changes", "comments": [], "suggestions": []}
|
| 434 |
|
| 435 |
|
| 436 |
+
def run_episode(env, agent, task_id: str, max_steps: int, seed: int | None = None) -> Dict[str, Any]:
|
| 437 |
agent.reset()
|
| 438 |
+
obs = env.reset(task_id=task_id, seed=seed)
|
| 439 |
done = False
|
| 440 |
step = 0
|
| 441 |
total_reward = 0.0
|
| 442 |
+
rewards: List[float] = []
|
| 443 |
|
| 444 |
print(f"\nTask : {task_id}")
|
| 445 |
print(f"Desc : {obs.get('task_description', 'N/A')}")
|
| 446 |
print("-" * 60)
|
| 447 |
|
| 448 |
+
log_start(task=task_id, env_name="code-review-agent-env", model=MODEL_NAME, max_steps=max_steps, seed=seed)
|
| 449 |
+
|
| 450 |
while not done and step < max_steps:
|
| 451 |
action_str = agent.get_action(obs)
|
| 452 |
action = agent.parse_action(action_str)
|
|
|
|
| 454 |
|
| 455 |
obs, reward, done, info = env.step(action)
|
| 456 |
total_reward += reward
|
| 457 |
+
rewards.append(float(reward))
|
| 458 |
step += 1
|
| 459 |
|
| 460 |
print(f"\nStep {step}/{max_steps}:")
|
|
|
|
| 470 |
if info.get('last_action_valid') is False:
|
| 471 |
print(f" Warning : {info.get('error', 'Invalid action')}")
|
| 472 |
|
| 473 |
+
log_step(
|
| 474 |
+
step=step,
|
| 475 |
+
action=action.get("action_type", "unknown"),
|
| 476 |
+
reward=float(reward),
|
| 477 |
+
done=bool(done),
|
| 478 |
+
error=info.get("error"),
|
| 479 |
+
)
|
| 480 |
+
|
| 481 |
final_score = env.get_task_score()
|
| 482 |
diagnostics = env.summary()
|
| 483 |
+
success = final_score >= 0.7
|
| 484 |
+
log_end(success=success, steps=step, score=final_score, rewards=rewards)
|
| 485 |
|
| 486 |
return {
|
| 487 |
"task_id": task_id,
|
|
|
|
| 496 |
"efficiency_bonus": diagnostics.get("efficiency_bonus", 0),
|
| 497 |
"model": MODEL_NAME,
|
| 498 |
"api_base_url": API_BASE_URL,
|
| 499 |
+
"seed": seed,
|
| 500 |
}
|
| 501 |
|
| 502 |
|
|
|
|
| 555 |
parser.add_argument("--output", type=str, default="baseline_results.json")
|
| 556 |
parser.add_argument("--batch", action="store_true", help="Run all tasks in batch mode")
|
| 557 |
parser.add_argument("--difficulty", type=str, default=None, help="Filter batch by difficulty: easy, medium, hard")
|
| 558 |
+
parser.add_argument("--seed", type=int, default=42, help="Random seed for deterministic environment reset")
|
| 559 |
args = parser.parse_args()
|
| 560 |
|
| 561 |
print("=" * 60)
|
|
|
|
| 574 |
task_ids = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
|
| 575 |
run_batch(env, agent, task_ids, args.max_steps, args.output)
|
| 576 |
else:
|
| 577 |
+
result = run_episode(env, agent, args.task_id, args.max_steps, seed=args.seed)
|
| 578 |
|
| 579 |
print("\n" + "=" * 60)
|
| 580 |
print("Final Results:")
|
openenv.yaml
CHANGED
|
@@ -58,6 +58,10 @@ tasks:
|
|
| 58 |
name: "Hard: Approve Thread-Safe Counter"
|
| 59 |
difficulty: hard
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
observation_space:
|
| 62 |
type: dict
|
| 63 |
description: |
|
|
|
|
| 58 |
name: "Hard: Approve Thread-Safe Counter"
|
| 59 |
difficulty: hard
|
| 60 |
|
| 61 |
+
- id: adversarial_hard_4
|
| 62 |
+
name: "Hard: Adversarial Safe SQL Builder"
|
| 63 |
+
difficulty: hard
|
| 64 |
+
|
| 65 |
observation_space:
|
| 66 |
type: dict
|
| 67 |
description: |
|
scripts/run_benchmark.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any, Dict, List
|
| 10 |
+
|
| 11 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
| 12 |
+
INFERENCE_PATH = REPO_ROOT / "inference.py"
|
| 13 |
+
|
| 14 |
+
DEFAULT_TASKS = [
|
| 15 |
+
"bug_detection_easy_1",
|
| 16 |
+
"memory_leak_medium_1",
|
| 17 |
+
"security_hard_1",
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def run_task(task_id: str, max_steps: int, output_dir: Path) -> Dict[str, Any]:
    """Run ``inference.py`` for one task in a subprocess and return its JSON result.

    Args:
        task_id: Task identifier forwarded as ``--task-id``.
        max_steps: Step budget forwarded as ``--max-steps``.
        output_dir: Directory that receives ``benchmark_<task_id>.json``.

    Returns:
        The parsed contents of the JSON file written by the subprocess.

    Raises:
        RuntimeError: If the subprocess exits non-zero, or exits zero without
            writing the expected output file.
    """
    # The child runs with cwd=REPO_ROOT, so a relative output_dir would point
    # at a different place for the child than for this process. Resolve it
    # once so both sides agree on the file location.
    output_path = (output_dir / f"benchmark_{task_id}.json").resolve()

    # Remove any result left by an earlier run; otherwise a child that exits 0
    # without writing would cause stale data to be reported as fresh.
    output_path.unlink(missing_ok=True)

    cmd = [
        sys.executable,
        str(INFERENCE_PATH),
        "--task-id",
        task_id,
        "--max-steps",
        str(max_steps),
        "--output",
        str(output_path),
    ]

    completed = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))
    if completed.returncode != 0:
        raise RuntimeError(
            f"Task {task_id} failed with exit code {completed.returncode}\n"
            f"stdout:\n{completed.stdout}\n\n"
            f"stderr:\n{completed.stderr}"
        )

    if not output_path.is_file():
        raise RuntimeError(
            f"Task {task_id} exited successfully but did not write {output_path}\n"
            f"stdout:\n{completed.stdout}\n\n"
            f"stderr:\n{completed.stderr}"
        )

    with output_path.open("r", encoding="utf-8") as fh:
        return json.load(fh)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def to_markdown(results: List[Dict[str, Any]]) -> str:
    """Render benchmark results as a markdown table followed by averages.

    Args:
        results: One dict per task. The keys read are ``task_id``,
            ``task_score``, ``total_reward``, ``steps`` and ``model``.

    Returns:
        The markdown document, terminated by a single newline. With no
        results the table has only its header and both averages are 0.000.

    Raises:
        ValueError: If a score or reward value cannot be converted to float.
    """

    def _metric(row: Dict[str, Any], key: str) -> float:
        # Coerce once so the table cells and the averages agree; a missing or
        # null metric counts as 0.0 instead of crashing the ':.3f' format.
        value = row.get(key)
        return 0.0 if value is None else float(value)

    scores = [_metric(row, "task_score") for row in results]
    rewards = [_metric(row, "total_reward") for row in results]

    lines = [
        "# Benchmark Results",
        "",
        "| Task | Task Score | Total Reward | Steps | Model |",
        "|---|---:|---:|---:|---|",
    ]

    for row, score, reward in zip(results, scores, rewards):
        lines.append(
            f"| {row.get('task_id')} | {score:.3f} | "
            f"{reward:.3f} | {row.get('steps', 0)} | {row.get('model', 'unknown')} |"
        )

    # max(1, ...) keeps the empty-results case from dividing by zero.
    count = max(1, len(results))
    avg_score = sum(scores) / count
    avg_reward = sum(rewards) / count

    lines.extend(
        [
            "",
            f"Average task score: **{avg_score:.3f}**",
            f"Average total reward: **{avg_reward:.3f}**",
            "",
            "Note: task_score is normalized to [0, 1]; total_reward is cumulative and may exceed 1.0.",
        ]
    )

    return "\n".join(lines) + "\n"
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def main() -> int:
    """Run each requested task and write a markdown summary table.

    Command-line options:
        --tasks: Task IDs to evaluate (defaults to ``DEFAULT_TASKS``).
        --max-steps: Step budget passed to every task run.
        --output-dir: Directory for the per-task JSON files.
        --table: Path of the markdown table. When omitted it is placed in
            ``--output-dir`` as ``benchmark_table.md``, so overriding the
            output directory keeps all artifacts together.

    Returns:
        0 on success. A failing task raises from ``run_task`` and aborts the
        run before the table is written.
    """
    parser = argparse.ArgumentParser(description="Run benchmark tasks and generate a markdown table")
    parser.add_argument(
        "--tasks",
        nargs="+",
        default=DEFAULT_TASKS,
        help="Task IDs to evaluate (default: 3 core tasks)",
    )
    parser.add_argument("--max-steps", type=int, default=10)
    parser.add_argument("--output-dir", type=Path, default=REPO_ROOT / "outputs")
    parser.add_argument(
        "--table",
        type=Path,
        default=None,
        help="Markdown table path (default: <output-dir>/benchmark_table.md)",
    )
    args = parser.parse_args()

    # Derive the table location from --output-dir unless given explicitly;
    # with both defaults this is still REPO_ROOT/outputs/benchmark_table.md.
    table_path = args.table if args.table is not None else args.output_dir / "benchmark_table.md"

    args.output_dir.mkdir(parents=True, exist_ok=True)
    table_path.parent.mkdir(parents=True, exist_ok=True)

    results: List[Dict[str, Any]] = []
    for task_id in args.tasks:
        print(f"Running task: {task_id}")
        results.append(run_task(task_id, args.max_steps, args.output_dir))

    table_path.write_text(to_markdown(results), encoding="utf-8")

    print(f"Wrote benchmark table to {table_path}")
    return 0
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
if __name__ == "__main__":
|
| 106 |
+
raise SystemExit(main())
|
scripts/validate-submission.sh
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# validate-submission.sh — OpenEnv submission validator
# Usage: ./scripts/validate-submission.sh <ping_url> [repo_dir]
#
# Checks, in order:
#   1. GET <ping_url>/        returns HTTP 200
#   2. GET <ping_url>/reset   returns HTTP 200 and a body containing "observation"
#   3. `openenv validate` succeeds in repo_dir
#   4. `docker build` succeeds in repo_dir
#
# Environment:
#   DOCKER_BUILD_TIMEOUT  seconds allowed for docker build (default: 600)
#   CURL_TIMEOUT          seconds allowed per HTTP probe (default: 30)
#
# Exit codes: 0 all checks passed, 1 a check failed, 2 usage error.

set -euo pipefail

PING_URL="${1:-}"
REPO_DIR="${2:-.}"
DOCKER_BUILD_TIMEOUT="${DOCKER_BUILD_TIMEOUT:-600}"
CURL_TIMEOUT="${CURL_TIMEOUT:-30}"

if [[ -z "$PING_URL" ]]; then
  echo "Usage: $0 <ping_url> [repo_dir]"
  echo "Example: $0 https://my-space.hf.space ."
  exit 2
fi

# Drop one trailing slash so "$PING_URL/reset" never becomes ".../​/reset".
PING_URL="${PING_URL%/}"

if [[ ! -d "$REPO_DIR" ]]; then
  echo "ERROR: repo_dir not found: $REPO_DIR"
  exit 2
fi

cd "$REPO_DIR"

# Per-run scratch directory: avoids clobbering (or being clobbered by) another
# run that would otherwise share fixed file names in /tmp.
WORK_DIR="$(mktemp -d)"
trap 'rm -rf "$WORK_DIR"' EXIT

# http_status URL OUTFILE
# Prints the HTTP status code of a GET request and stores the body in OUTFILE.
# A transport failure (DNS, refused connection, timeout) prints curl's "000"
# instead of aborting the script under `set -e`, so the caller can report FAIL.
http_status() {
  curl -sS --max-time "$CURL_TIMEOUT" -o "$2" -w '%{http_code}' "$1" || true
}

echo "==> 1/4 Ping Space root"
ROOT_CODE="$(http_status "$PING_URL/" "$WORK_DIR/root.json")"
if [[ "$ROOT_CODE" != "200" ]]; then
  echo "FAIL: root ping returned $ROOT_CODE"
  exit 1
fi

echo "==> 2/4 Ping reset endpoint"
RESET_CODE="$(http_status "$PING_URL/reset" "$WORK_DIR/reset.json")"
if [[ "$RESET_CODE" != "200" ]]; then
  echo "FAIL: reset ping returned $RESET_CODE"
  exit 1
fi
if ! grep -q '"observation"' "$WORK_DIR/reset.json"; then
  echo "FAIL: /reset did not return observation"
  exit 1
fi

echo "==> 3/4 openenv validate"
openenv validate

echo "==> 4/4 docker build"
if ! command -v docker >/dev/null 2>&1; then
  echo "FAIL: docker command not found"
  exit 1
fi
if command -v timeout >/dev/null 2>&1; then
  if ! timeout "$DOCKER_BUILD_TIMEOUT" docker build -t code-review-env-validate .; then
    echo "FAIL: docker build failed or timed out"
    exit 1
  fi
else
  echo "WARN: timeout command not found, running docker build without timeout"
  if ! docker build -t code-review-env-validate .; then
    echo "FAIL: docker build failed"
    exit 1
  fi
fi

echo "PASS: submission validator checks completed"
|
submit.py
CHANGED
|
@@ -249,6 +249,11 @@ def main() -> int:
|
|
| 249 |
default="code-review-env",
|
| 250 |
help="Docker image name for validation build",
|
| 251 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
parser.add_argument(
|
| 253 |
"--report-path",
|
| 254 |
default="submission_report.json",
|
|
@@ -268,8 +273,11 @@ def main() -> int:
|
|
| 268 |
ok, detail = run_tests(with_coverage=not args.no_coverage)
|
| 269 |
checks["tests"] = detail
|
| 270 |
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
baseline_scores: Dict[str, float] = {}
|
| 275 |
if args.skip_baseline:
|
|
@@ -283,7 +291,7 @@ def main() -> int:
|
|
| 283 |
required_checks_ok = (
|
| 284 |
checks["validation"]["ok"]
|
| 285 |
and checks["tests"]["ok"]
|
| 286 |
-
and checks["docker"]
|
| 287 |
)
|
| 288 |
|
| 289 |
if required_checks_ok:
|
|
|
|
| 249 |
default="code-review-env",
|
| 250 |
help="Docker image name for validation build",
|
| 251 |
)
|
| 252 |
+
parser.add_argument(
|
| 253 |
+
"--skip-docker",
|
| 254 |
+
action="store_true",
|
| 255 |
+
help="Skip docker build checks (useful in CI environments without docker daemon)",
|
| 256 |
+
)
|
| 257 |
parser.add_argument(
|
| 258 |
"--report-path",
|
| 259 |
default="submission_report.json",
|
|
|
|
| 273 |
ok, detail = run_tests(with_coverage=not args.no_coverage)
|
| 274 |
checks["tests"] = detail
|
| 275 |
|
| 276 |
+
if args.skip_docker:
|
| 277 |
+
checks["docker"] = {"ok": False, "skipped": True, "reason": "Skipped by --skip-docker"}
|
| 278 |
+
else:
|
| 279 |
+
ok, detail = check_docker(args.image_name)
|
| 280 |
+
checks["docker"] = detail
|
| 281 |
|
| 282 |
baseline_scores: Dict[str, float] = {}
|
| 283 |
if args.skip_baseline:
|
|
|
|
| 291 |
required_checks_ok = (
|
| 292 |
checks["validation"]["ok"]
|
| 293 |
and checks["tests"]["ok"]
|
| 294 |
+
and (checks["docker"].get("ok") or checks["docker"].get("skipped"))
|
| 295 |
)
|
| 296 |
|
| 297 |
if required_checks_ok:
|
tests/test_env.py
CHANGED
|
@@ -248,6 +248,122 @@ class TestCodeReviewEnv(unittest.TestCase):
|
|
| 248 |
self.assertGreater(info["diagnostics"]["false_positive_count"], 0)
|
| 249 |
self.assertLess(info["task_score"], 1.0)
|
| 250 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
if __name__ == "__main__":
|
| 253 |
unittest.main()
|
|
|
|
| 248 |
self.assertGreater(info["diagnostics"]["false_positive_count"], 0)
|
| 249 |
self.assertLess(info["task_score"], 1.0)
|
| 250 |
|
| 251 |
+
def test_reset_step_state_contract(self):
    """Exercise reset -> step -> state once and check the shape of each result."""
    obs = self.env.reset(task_id="bug_detection_easy_1", seed=123)
    self.assertIn("code_diff", obs)
    self.assertIn("file_context", obs)
    self.assertIn("task_description", obs)
    self.assertIn("valid_actions", obs)

    action = ReviewAction(
        action_type=ReviewActionType.ADD_COMMENT,
        comments=[
            Comment(
                line_number=3,
                content="division_by_zero risk",
                is_issue=True,
                severity="high",
            )
        ],
        suggestions=[],
    )

    step_obs, reward, done, info = self.env.step(action.model_dump())
    # Inspect the observation that step() actually returned. The previous
    # check wrapped it in a fresh dict and asserted on that dict's own key,
    # which could never fail.
    self.assertIsInstance(step_obs, dict)
    self.assertIn("valid_actions", step_obs)
    self.assertIsInstance(reward, float)
    self.assertIsInstance(done, bool)
    self.assertIn("diagnostics", info)
    self.assertIn("task_score", info)

    state = self.env.state()
    self.assertIn("code_context", state)
    self.assertIn("task_metadata", state)
    self.assertIn("comments_made", state)
    self.assertEqual(state["current_step"], 1)
|
| 283 |
+
|
| 284 |
+
def test_seed_reproducibility(self):
|
| 285 |
+
env_a = CodeReviewEnv()
|
| 286 |
+
env_b = CodeReviewEnv()
|
| 287 |
+
|
| 288 |
+
obs_a = env_a.reset(task_id="bug_detection_easy_1", seed=7)
|
| 289 |
+
obs_b = env_b.reset(task_id="bug_detection_easy_1", seed=7)
|
| 290 |
+
|
| 291 |
+
self.assertEqual(obs_a["code_diff"], obs_b["code_diff"])
|
| 292 |
+
self.assertEqual(obs_a["task_description"], obs_b["task_description"])
|
| 293 |
+
self.assertEqual(obs_a["line_count"], obs_b["line_count"])
|
| 294 |
+
|
| 295 |
+
action = ReviewAction(
|
| 296 |
+
action_type=ReviewActionType.ADD_COMMENT,
|
| 297 |
+
comments=[
|
| 298 |
+
Comment(
|
| 299 |
+
line_number=3,
|
| 300 |
+
content="division_by_zero issue",
|
| 301 |
+
is_issue=True,
|
| 302 |
+
severity="high",
|
| 303 |
+
)
|
| 304 |
+
],
|
| 305 |
+
suggestions=[],
|
| 306 |
+
).model_dump()
|
| 307 |
+
|
| 308 |
+
_, _, _, info_a = env_a.step(action)
|
| 309 |
+
_, _, _, info_b = env_b.step(action)
|
| 310 |
+
self.assertEqual(info_a["task_score"], info_b["task_score"])
|
| 311 |
+
|
| 312 |
+
def test_action_masking_changes_after_comment(self):
|
| 313 |
+
obs = self.env.reset(task_id="bug_detection_easy_1")
|
| 314 |
+
self.assertIn("add_comment", obs["valid_actions"])
|
| 315 |
+
self.assertNotIn("suggest_fix", obs["valid_actions"])
|
| 316 |
+
self.assertNotIn("mark_as_resolved", obs["valid_actions"])
|
| 317 |
+
|
| 318 |
+
action = ReviewAction(
|
| 319 |
+
action_type=ReviewActionType.ADD_COMMENT,
|
| 320 |
+
comments=[Comment(line_number=3, content="issue", is_issue=True, severity="high")],
|
| 321 |
+
suggestions=[],
|
| 322 |
+
)
|
| 323 |
+
next_obs, _, _, _ = self.env.step(action.model_dump())
|
| 324 |
+
self.assertIn("suggest_fix", next_obs["valid_actions"])
|
| 325 |
+
self.assertIn("mark_as_resolved", next_obs["valid_actions"])
|
| 326 |
+
|
| 327 |
+
def test_unknown_task_id_raises_key_error(self):
|
| 328 |
+
with self.assertRaises(KeyError):
|
| 329 |
+
self.env.reset(task_id="does_not_exist")
|
| 330 |
+
|
| 331 |
+
def test_repeated_comment_gets_penalized(self):
|
| 332 |
+
self.env.reset(task_id="bug_detection_easy_1")
|
| 333 |
+
|
| 334 |
+
action = ReviewAction(
|
| 335 |
+
action_type=ReviewActionType.ADD_COMMENT,
|
| 336 |
+
comments=[
|
| 337 |
+
Comment(
|
| 338 |
+
line_number=3,
|
| 339 |
+
content="division_by_zero risk",
|
| 340 |
+
is_issue=True,
|
| 341 |
+
severity="high",
|
| 342 |
+
)
|
| 343 |
+
],
|
| 344 |
+
suggestions=[],
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
_, first_reward, _, _ = self.env.step(action.model_dump())
|
| 348 |
+
_, second_reward, _, _ = self.env.step(action.model_dump())
|
| 349 |
+
|
| 350 |
+
self.assertLess(second_reward, first_reward)
|
| 351 |
+
|
| 352 |
+
def test_adversarial_safe_sql_task_should_approve(self):
|
| 353 |
+
self.env.reset(task_id="adversarial_hard_4")
|
| 354 |
+
|
| 355 |
+
action = ReviewAction(
|
| 356 |
+
action_type=ReviewActionType.APPROVE,
|
| 357 |
+
comments=[],
|
| 358 |
+
suggestions=[],
|
| 359 |
+
final_decision="approved",
|
| 360 |
+
)
|
| 361 |
+
|
| 362 |
+
obs, _, done, info = self.env.step(action.model_dump())
|
| 363 |
+
self.assertTrue(done)
|
| 364 |
+
self.assertEqual(obs["final_decision_made"], "approved")
|
| 365 |
+
self.assertEqual(info["task_score"], 1.0)
|
| 366 |
+
|
| 367 |
|
| 368 |
if __name__ == "__main__":
|
| 369 |
unittest.main()
|