ashishbaberwal committed on
Commit
7a23e48
·
1 Parent(s): 5e92b80

hf new add

Browse files
.github/workflows/ci.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CI workflow: runs unit tests, OpenEnv validation, and the submission
# preflight on every push and pull request.
name: CI

on:
  push:
  pull_request:

jobs:
  test-and-validate:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run unit tests
        run: pytest -q

      - name: Validate OpenEnv
        run: openenv validate

      # Docker and baseline model runs are unavailable on hosted runners,
      # so the preflight skips them here.
      - name: Run submit preflight (CI mode)
        run: python submit.py --skip-baseline --skip-docker --no-coverage
README.md CHANGED
@@ -181,3 +181,64 @@ Each inference run writes JSON like:
181
  - The baseline now enforces phased review behavior and falls back to deterministic actions when the model is temporarily unavailable.
182
  - For reproducible runs, keep `TEMPERATURE=0.0`.
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  - The baseline now enforces phased review behavior and falls back to deterministic actions when the model is temporarily unavailable.
182
  - For reproducible runs, keep `TEMPERATURE=0.0`.
183
 
184
+ ## Fast Start (3 Commands)
185
+
186
+ ```bash
187
+ source .venv/bin/activate
188
+ pytest -q
189
+ python submit.py --skip-docker --max-steps 10
190
+ ```
191
+
192
+ ## Judge Map (Criterion -> Evidence)
193
+
194
+ | Criterion | Evidence | File |
195
+ |---|---|---|
196
+ | OpenEnv lifecycle compliance | reset/step/state implemented and served over HTTP | `environment/env.py`, `server/app.py` |
197
+ | Typed models | Pydantic action/state/observation models | `environment/models.py` |
198
+ | Task difficulty progression | easy/medium/hard tasks + calibration approve tasks | `environment/tasks.py` |
199
+ | Grading quality | detection/suggestion/decision + partial credit + FP penalty + efficiency bonus | `environment/graders.py` |
200
+ | Baseline reproducibility | deterministic seed support in reset + inference output metadata | `environment/env.py`, `inference.py` |
201
+ | Submission validation | Python preflight + bash validator script | `submit.py`, `scripts/validate-submission.sh` |
202
+
203
+ ## Grader Rubric (Summary)
204
+
205
+ | Component | Weight / Effect | Notes |
206
+ |---|---|---|
207
+ | Detection score | 0.4 | Partial credit for near-line matches |
208
+ | Suggestion score | 0.3 | Line-proximity matching for fixes |
209
+ | Decision score | 0.3 | Approve for no-issue tasks, request_changes otherwise |
210
+ | False positive penalty | up to -0.4 | Strong penalty for issue spam |
211
+ | Efficiency bonus | up to +0.1 | Bonus for completing in fewer steps |
212
+ | Final score clamp | [0,1] | Safety clamp in grader |
213
+
214
+ ## Benchmark Snapshot (3-Task Local Run)
215
+
216
+ | Task | Task Score | Total Reward | Model |
217
+ |---|---:|---:|---|
218
+ | bug_detection_easy_1 | 1.000 | 1.410 | meta/llama-3.3-70b-instruct |
219
+ | memory_leak_medium_1 | 0.875 | 1.285 | meta/llama-3.3-70b-instruct |
220
+ | security_hard_1 | 1.000 | 1.410 | meta/llama-3.3-70b-instruct |
221
+
222
+ Note: `task_score` is normalized to [0,1]. `total_reward` is cumulative step reward and can exceed 1.0 by design.
223
+
224
+ ## One-Command Benchmark Table
225
+
226
+ Generate per-task JSON outputs plus a markdown table for judge submission:
227
+
228
+ ```bash
229
+ source .venv/bin/activate
230
+ python scripts/run_benchmark.py --max-steps 10
231
+ ```
232
+
233
+ Artifacts:
234
+
235
+ - `outputs/benchmark_<task_id>.json`
236
+ - `outputs/benchmark_table.md`
237
+
238
+ ## Failure Analysis Template
239
+
240
+ - Missed issue type:
241
+ - Why it was missed (model behavior or prompt failure):
242
+ - Grader diagnostics (precision/recall/F1/FP):
243
+ - Fix applied (prompt/rubric/task change):
244
+
environment/env.py CHANGED
@@ -71,9 +71,20 @@ class CodeReviewEnv:
71
  self._state.last_error = str(e)
72
  return self._get_observation(), -0.1, False, {"error": str(e), "last_action_valid": False}
73
 
 
 
 
 
74
  self._state.current_step += 1
75
  self._process_action(review_action)
76
 
 
 
 
 
 
 
 
77
  self._action_history.append({
78
  "step": self._state.current_step,
79
  "action_type": review_action.action_type.value,
@@ -103,6 +114,7 @@ class CodeReviewEnv:
103
  self._state.final_decision or "changes_requested",
104
  grader,
105
  self._state.last_action_valid,
 
106
  steps_taken=self._state.current_step,
107
  max_steps=self.max_steps,
108
  )
 
71
  self._state.last_error = str(e)
72
  return self._get_observation(), -0.1, False, {"error": str(e), "last_action_valid": False}
73
 
74
+ existing_comment_keys = {
75
+ (c.line_number, c.content.strip().lower()) for c in self._state.comments_made
76
+ }
77
+
78
  self._state.current_step += 1
79
  self._process_action(review_action)
80
 
81
+ duplicate_comment_count = 0
82
+ if review_action.action_type.value == "add_comment":
83
+ for c in review_action.comments:
84
+ key = (c.line_number, c.content.strip().lower())
85
+ if key in existing_comment_keys:
86
+ duplicate_comment_count += 1
87
+
88
  self._action_history.append({
89
  "step": self._state.current_step,
90
  "action_type": review_action.action_type.value,
 
114
  self._state.final_decision or "changes_requested",
115
  grader,
116
  self._state.last_action_valid,
117
+ duplicate_comment_count=duplicate_comment_count,
118
  steps_taken=self._state.current_step,
119
  max_steps=self.max_steps,
120
  )
environment/graders.py CHANGED
@@ -204,6 +204,7 @@ class RewardCalculator:
204
  final_decision: str,
205
  grader: TaskGrader,
206
  last_action_valid: bool,
 
207
  steps_taken: int = 0,
208
  max_steps: int = 50) -> float:
209
 
@@ -223,6 +224,9 @@ class RewardCalculator:
223
  if not last_action_valid:
224
  reward -= 0.15
225
 
 
 
 
226
  if not current_action.comments and not current_action.suggestions:
227
  if current_action.action_type.value in ["approve", "request_changes"]:
228
  pass
 
204
  final_decision: str,
205
  grader: TaskGrader,
206
  last_action_valid: bool,
207
+ duplicate_comment_count: int = 0,
208
  steps_taken: int = 0,
209
  max_steps: int = 50) -> float:
210
 
 
224
  if not last_action_valid:
225
  reward -= 0.15
226
 
227
+ if duplicate_comment_count > 0:
228
+ reward -= min(0.25, 0.08 * duplicate_comment_count)
229
+
230
  if not current_action.comments and not current_action.suggestions:
231
  if current_action.action_type.value in ["approve", "request_changes"]:
232
  pass
environment/tasks.py CHANGED
@@ -276,6 +276,27 @@ class Counter:
276
  "language": "python",
277
  "line_count": 9,
278
  "expected_issues": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  }
280
  ]
281
 
@@ -286,8 +307,8 @@ class Counter:
286
  for task in all_tasks:
287
  if task["task_id"] == canonical_task_id:
288
  return task
289
- print(f"WARNING: task_id '{task_id}' not found, falling back to bug_detection_easy_1")
290
- return cls.EASY_TASKS[0]
291
 
292
  @classmethod
293
  def get_all_tasks(cls) -> List[Dict[str, Any]]:
 
276
  "language": "python",
277
  "line_count": 9,
278
  "expected_issues": []
279
+ },
280
+ {
281
+ "task_id": "adversarial_hard_4",
282
+ "task_name": "Adversarial: Safe SQL Builder",
283
+ "difficulty": "hard",
284
+ "description": "No issues expected: query text looks suspicious but uses parameterized execution safely",
285
+ "code_diff": """def find_user(database, user_id):
286
+ query = \"SELECT * FROM users WHERE id = ?\"
287
+ params = [int(user_id)]
288
+ return database.execute(query, params)""",
289
+ "surrounding_code": """def find_user(database, user_id):
290
+ query = \"SELECT * FROM users WHERE id = ?\"
291
+ params = [int(user_id)]
292
+ return database.execute(query, params)
293
+
294
+ def find_all_users(database):
295
+ return database.execute(\"SELECT * FROM users\")""",
296
+ "file_path": "user_repository.py",
297
+ "language": "python",
298
+ "line_count": 4,
299
+ "expected_issues": []
300
  }
301
  ]
302
 
 
307
  for task in all_tasks:
308
  if task["task_id"] == canonical_task_id:
309
  return task
310
+ available = ", ".join(t["task_id"] for t in all_tasks)
311
+ raise KeyError(f"Unknown task_id '{task_id}'. Available task IDs: {available}")
312
 
313
  @classmethod
314
  def get_all_tasks(cls) -> List[Dict[str, Any]]:
inference.py CHANGED
@@ -52,6 +52,38 @@ FALLBACK_ACTION = json.dumps({
52
  })
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def add_line_numbers(code: str) -> str:
56
  lines = code.split("\n")
57
  return "\n".join(f"{i+1}: {line}" for i, line in enumerate(lines))
@@ -401,17 +433,20 @@ Respond with JSON only.
401
  return {"action_type": "request_changes", "comments": [], "suggestions": []}
402
 
403
 
404
- def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
405
  agent.reset()
406
- obs = env.reset(task_id=task_id)
407
  done = False
408
  step = 0
409
  total_reward = 0.0
 
410
 
411
  print(f"\nTask : {task_id}")
412
  print(f"Desc : {obs.get('task_description', 'N/A')}")
413
  print("-" * 60)
414
 
 
 
415
  while not done and step < max_steps:
416
  action_str = agent.get_action(obs)
417
  action = agent.parse_action(action_str)
@@ -419,6 +454,7 @@ def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
419
 
420
  obs, reward, done, info = env.step(action)
421
  total_reward += reward
 
422
  step += 1
423
 
424
  print(f"\nStep {step}/{max_steps}:")
@@ -434,8 +470,18 @@ def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
434
  if info.get('last_action_valid') is False:
435
  print(f" Warning : {info.get('error', 'Invalid action')}")
436
 
 
 
 
 
 
 
 
 
437
  final_score = env.get_task_score()
438
  diagnostics = env.summary()
 
 
439
 
440
  return {
441
  "task_id": task_id,
@@ -450,6 +496,7 @@ def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
450
  "efficiency_bonus": diagnostics.get("efficiency_bonus", 0),
451
  "model": MODEL_NAME,
452
  "api_base_url": API_BASE_URL,
 
453
  }
454
 
455
 
@@ -508,6 +555,7 @@ def main():
508
  parser.add_argument("--output", type=str, default="baseline_results.json")
509
  parser.add_argument("--batch", action="store_true", help="Run all tasks in batch mode")
510
  parser.add_argument("--difficulty", type=str, default=None, help="Filter batch by difficulty: easy, medium, hard")
 
511
  args = parser.parse_args()
512
 
513
  print("=" * 60)
@@ -526,7 +574,7 @@ def main():
526
  task_ids = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
527
  run_batch(env, agent, task_ids, args.max_steps, args.output)
528
  else:
529
- result = run_episode(env, agent, args.task_id, args.max_steps)
530
 
531
  print("\n" + "=" * 60)
532
  print("Final Results:")
 
52
  })
53
 
54
 
55
+ def log_start(task: str, env_name: str, model: str, max_steps: int, seed: int | None) -> None:
56
+ payload = {
57
+ "task": task,
58
+ "env": env_name,
59
+ "model": model,
60
+ "max_steps": max_steps,
61
+ "seed": seed,
62
+ }
63
+ print(f"[START] {json.dumps(payload, ensure_ascii=True, separators=(',', ':'))}", flush=True)
64
+
65
+
66
+ def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
67
+ payload = {
68
+ "step": step,
69
+ "action": action,
70
+ "reward": round(float(reward), 4),
71
+ "done": bool(done),
72
+ "error": error,
73
+ }
74
+ print(f"[STEP] {json.dumps(payload, ensure_ascii=True, separators=(',', ':'))}", flush=True)
75
+
76
+
77
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit a one-line ``[END]`` record summarizing the finished episode.

    Includes the per-step reward trace (each rounded to 4 decimals) so a
    full reward curve can be reconstructed from the log alone.
    """
    record = {
        "success": bool(success),
        "steps": int(steps),
        "score": round(float(score), 4),
        "rewards": [round(float(value), 4) for value in rewards],
    }
    # Compact ASCII JSON + flush, matching the [START]/[STEP] record format.
    print("[END] " + json.dumps(record, ensure_ascii=True, separators=(",", ":")), flush=True)
85
+
86
+
87
  def add_line_numbers(code: str) -> str:
88
  lines = code.split("\n")
89
  return "\n".join(f"{i+1}: {line}" for i, line in enumerate(lines))
 
433
  return {"action_type": "request_changes", "comments": [], "suggestions": []}
434
 
435
 
436
+ def run_episode(env, agent, task_id: str, max_steps: int, seed: int | None = None) -> Dict[str, Any]:
437
  agent.reset()
438
+ obs = env.reset(task_id=task_id, seed=seed)
439
  done = False
440
  step = 0
441
  total_reward = 0.0
442
+ rewards: List[float] = []
443
 
444
  print(f"\nTask : {task_id}")
445
  print(f"Desc : {obs.get('task_description', 'N/A')}")
446
  print("-" * 60)
447
 
448
+ log_start(task=task_id, env_name="code-review-agent-env", model=MODEL_NAME, max_steps=max_steps, seed=seed)
449
+
450
  while not done and step < max_steps:
451
  action_str = agent.get_action(obs)
452
  action = agent.parse_action(action_str)
 
454
 
455
  obs, reward, done, info = env.step(action)
456
  total_reward += reward
457
+ rewards.append(float(reward))
458
  step += 1
459
 
460
  print(f"\nStep {step}/{max_steps}:")
 
470
  if info.get('last_action_valid') is False:
471
  print(f" Warning : {info.get('error', 'Invalid action')}")
472
 
473
+ log_step(
474
+ step=step,
475
+ action=action.get("action_type", "unknown"),
476
+ reward=float(reward),
477
+ done=bool(done),
478
+ error=info.get("error"),
479
+ )
480
+
481
  final_score = env.get_task_score()
482
  diagnostics = env.summary()
483
+ success = final_score >= 0.7
484
+ log_end(success=success, steps=step, score=final_score, rewards=rewards)
485
 
486
  return {
487
  "task_id": task_id,
 
496
  "efficiency_bonus": diagnostics.get("efficiency_bonus", 0),
497
  "model": MODEL_NAME,
498
  "api_base_url": API_BASE_URL,
499
+ "seed": seed,
500
  }
501
 
502
 
 
555
  parser.add_argument("--output", type=str, default="baseline_results.json")
556
  parser.add_argument("--batch", action="store_true", help="Run all tasks in batch mode")
557
  parser.add_argument("--difficulty", type=str, default=None, help="Filter batch by difficulty: easy, medium, hard")
558
+ parser.add_argument("--seed", type=int, default=42, help="Random seed for deterministic environment reset")
559
  args = parser.parse_args()
560
 
561
  print("=" * 60)
 
574
  task_ids = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
575
  run_batch(env, agent, task_ids, args.max_steps, args.output)
576
  else:
577
+ result = run_episode(env, agent, args.task_id, args.max_steps, seed=args.seed)
578
 
579
  print("\n" + "=" * 60)
580
  print("Final Results:")
openenv.yaml CHANGED
@@ -58,6 +58,10 @@ tasks:
58
  name: "Hard: Approve Thread-Safe Counter"
59
  difficulty: hard
60
 
 
 
 
 
61
  observation_space:
62
  type: dict
63
  description: |
 
58
  name: "Hard: Approve Thread-Safe Counter"
59
  difficulty: hard
60
 
61
+ - id: adversarial_hard_4
62
+ name: "Hard: Adversarial Safe SQL Builder"
63
+ difficulty: hard
64
+
65
  observation_space:
66
  type: dict
67
  description: |
scripts/run_benchmark.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import subprocess
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List
10
+
11
# Resolve paths relative to the repository root (scripts/ is one level down)
# so the benchmark works regardless of the caller's working directory.
REPO_ROOT = Path(__file__).resolve().parents[1]
INFERENCE_PATH = REPO_ROOT / "inference.py"

# Default benchmark set: one representative task per difficulty tier.
DEFAULT_TASKS = [
    "bug_detection_easy_1",
    "memory_leak_medium_1",
    "security_hard_1",
]
19
+
20
+
21
def run_task(task_id: str, max_steps: int, output_dir: Path) -> Dict[str, Any]:
    """Run one benchmark task via ``inference.py`` and return its JSON result.

    Invokes the inference script as a subprocess so each task gets a fresh
    interpreter, then reads back the per-task output file it wrote.

    Raises:
        RuntimeError: if the subprocess exits non-zero (stdout/stderr included).
    """
    result_file = output_dir / f"benchmark_{task_id}.json"
    command = [
        sys.executable,
        str(INFERENCE_PATH),
        "--task-id", task_id,
        "--max-steps", str(max_steps),
        "--output", str(result_file),
    ]

    proc = subprocess.run(command, capture_output=True, text=True, cwd=str(REPO_ROOT))
    if proc.returncode != 0:
        # Surface captured output so the failing task is debuggable from logs.
        raise RuntimeError(
            f"Task {task_id} failed with exit code {proc.returncode}\n"
            f"stdout:\n{proc.stdout}\n\n"
            f"stderr:\n{proc.stderr}"
        )

    with result_file.open("r", encoding="utf-8") as handle:
        return json.load(handle)
44
+
45
+
46
def to_markdown(results: List[Dict[str, Any]]) -> str:
    """Render benchmark results as a markdown report with per-run averages.

    Args:
        results: per-task result dicts (task_id, task_score, total_reward,
            steps, model); missing keys fall back to 0 / "unknown".

    Returns:
        Markdown text (table + averages), terminated with a newline.
    """
    out: List[str] = [
        "# Benchmark Results",
        "",
        "| Task | Task Score | Total Reward | Steps | Model |",
        "|---|---:|---:|---:|---|",
    ]

    for entry in results:
        task = entry.get("task_id")
        score = entry.get("task_score", 0)
        reward = entry.get("total_reward", 0)
        steps = entry.get("steps", 0)
        model = entry.get("model", "unknown")
        out.append(f"| {task} | {score:.3f} | {reward:.3f} | {steps} | {model} |")

    # max(1, n) guards the division when the result set is empty.
    count = max(1, len(results))
    mean_score = sum(float(r.get("task_score", 0.0)) for r in results) / count
    mean_reward = sum(float(r.get("total_reward", 0.0)) for r in results) / count

    out.append("")
    out.append(f"Average task score: **{mean_score:.3f}**")
    out.append(f"Average total reward: **{mean_reward:.3f}**")
    out.append("")
    out.append("Note: task_score is normalized to [0, 1]; total_reward is cumulative and may exceed 1.0.")

    return "\n".join(out) + "\n"
74
+
75
+
76
def main() -> int:
    """CLI entry point: run each benchmark task, then write a markdown table.

    Returns 0 on success; any failing task raises out of run_task and aborts.
    """
    parser = argparse.ArgumentParser(description="Run benchmark tasks and generate a markdown table")
    parser.add_argument(
        "--tasks",
        nargs="+",
        default=DEFAULT_TASKS,
        help="Task IDs to evaluate (default: 3 core tasks)",
    )
    parser.add_argument("--max-steps", type=int, default=10)
    parser.add_argument("--output-dir", type=Path, default=REPO_ROOT / "outputs")
    parser.add_argument("--table", type=Path, default=REPO_ROOT / "outputs/benchmark_table.md")
    args = parser.parse_args()

    # Create both artifact locations up front so task runs can't fail on I/O.
    args.output_dir.mkdir(parents=True, exist_ok=True)
    args.table.parent.mkdir(parents=True, exist_ok=True)

    collected: List[Dict[str, Any]] = []
    for tid in args.tasks:
        print(f"Running task: {tid}")
        collected.append(run_task(tid, args.max_steps, args.output_dir))

    args.table.write_text(to_markdown(collected), encoding="utf-8")

    print(f"Wrote benchmark table to {args.table}")
    return 0
103
+
104
+
105
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
scripts/validate-submission.sh ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# validate-submission.sh — OpenEnv submission validator
# Usage: ./scripts/validate-submission.sh <ping_url> [repo_dir]
#
# Runs four sequential checks against a deployed Space and the local repo:
#   1. HTTP ping of the Space root
#   2. HTTP ping of the /reset endpoint (body must contain "observation")
#   3. `openenv validate` on the repo
#   4. `docker build` of the repo image (bounded by DOCKER_BUILD_TIMEOUT)
# Exit codes: 2 for usage/setup errors, 1 for a failed check, 0 on success.

set -euo pipefail

PING_URL="${1:-}"
REPO_DIR="${2:-.}"
# Docker build timeout in seconds; override via the environment.
DOCKER_BUILD_TIMEOUT="${DOCKER_BUILD_TIMEOUT:-600}"

if [[ -z "$PING_URL" ]]; then
  echo "Usage: $0 <ping_url> [repo_dir]"
  echo "Example: $0 https://my-space.hf.space ."
  exit 2
fi

if [[ ! -d "$REPO_DIR" ]]; then
  echo "ERROR: repo_dir not found: $REPO_DIR"
  exit 2
fi

cd "$REPO_DIR"

echo "==> 1/4 Ping Space root"
# Response body lands in /tmp; only the HTTP status code is asserted here.
ROOT_CODE=$(curl -s -o /tmp/openenv_root.json -w '%{http_code}' "$PING_URL/")
if [[ "$ROOT_CODE" != "200" ]]; then
  echo "FAIL: root ping returned $ROOT_CODE"
  exit 1
fi

echo "==> 2/4 Ping reset endpoint"
# NOTE(review): this issues a GET to /reset — confirm the server accepts GET,
# as some OpenEnv deployments expose reset as POST only.
RESET_CODE=$(curl -s -o /tmp/openenv_reset.json -w '%{http_code}' "$PING_URL/reset")
if [[ "$RESET_CODE" != "200" ]]; then
  echo "FAIL: reset ping returned $RESET_CODE"
  exit 1
fi
if ! grep -q '"observation"' /tmp/openenv_reset.json; then
  echo "FAIL: /reset did not return observation"
  exit 1
fi

echo "==> 3/4 openenv validate"
# set -e aborts the script immediately if validation fails.
openenv validate

echo "==> 4/4 docker build"
if ! command -v docker >/dev/null 2>&1; then
  echo "FAIL: docker command not found"
  exit 1
fi
if command -v timeout >/dev/null 2>&1; then
  if ! timeout "$DOCKER_BUILD_TIMEOUT" docker build -t code-review-env-validate .; then
    echo "FAIL: docker build failed or timed out"
    exit 1
  fi
else
  # Fall back to an unbounded build where coreutils `timeout` is unavailable.
  echo "WARN: timeout command not found, running docker build without timeout"
  if ! docker build -t code-review-env-validate .; then
    echo "FAIL: docker build failed"
    exit 1
  fi
fi

echo "PASS: submission validator checks completed"
submit.py CHANGED
@@ -249,6 +249,11 @@ def main() -> int:
249
  default="code-review-env",
250
  help="Docker image name for validation build",
251
  )
 
 
 
 
 
252
  parser.add_argument(
253
  "--report-path",
254
  default="submission_report.json",
@@ -268,8 +273,11 @@ def main() -> int:
268
  ok, detail = run_tests(with_coverage=not args.no_coverage)
269
  checks["tests"] = detail
270
 
271
- ok, detail = check_docker(args.image_name)
272
- checks["docker"] = detail
 
 
 
273
 
274
  baseline_scores: Dict[str, float] = {}
275
  if args.skip_baseline:
@@ -283,7 +291,7 @@ def main() -> int:
283
  required_checks_ok = (
284
  checks["validation"]["ok"]
285
  and checks["tests"]["ok"]
286
- and checks["docker"]["ok"]
287
  )
288
 
289
  if required_checks_ok:
 
249
  default="code-review-env",
250
  help="Docker image name for validation build",
251
  )
252
+ parser.add_argument(
253
+ "--skip-docker",
254
+ action="store_true",
255
+ help="Skip docker build checks (useful in CI environments without docker daemon)",
256
+ )
257
  parser.add_argument(
258
  "--report-path",
259
  default="submission_report.json",
 
273
  ok, detail = run_tests(with_coverage=not args.no_coverage)
274
  checks["tests"] = detail
275
 
276
+ if args.skip_docker:
277
+ checks["docker"] = {"ok": False, "skipped": True, "reason": "Skipped by --skip-docker"}
278
+ else:
279
+ ok, detail = check_docker(args.image_name)
280
+ checks["docker"] = detail
281
 
282
  baseline_scores: Dict[str, float] = {}
283
  if args.skip_baseline:
 
291
  required_checks_ok = (
292
  checks["validation"]["ok"]
293
  and checks["tests"]["ok"]
294
+ and (checks["docker"].get("ok") or checks["docker"].get("skipped"))
295
  )
296
 
297
  if required_checks_ok:
tests/test_env.py CHANGED
@@ -248,6 +248,122 @@ class TestCodeReviewEnv(unittest.TestCase):
248
  self.assertGreater(info["diagnostics"]["false_positive_count"], 0)
249
  self.assertLess(info["task_score"], 1.0)
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
  if __name__ == "__main__":
253
  unittest.main()
 
248
  self.assertGreater(info["diagnostics"]["false_positive_count"], 0)
249
  self.assertLess(info["task_score"], 1.0)
250
 
251
    def test_reset_step_state_contract(self):
        """Verify the reset/step/state lifecycle contract on one easy task.

        reset() must expose the observation keys agents rely on; step() must
        return (obs, float reward, bool done, info with diagnostics and
        task_score); state() must surface code context, task metadata, the
        comment log, and the incremented step counter after one step.
        """
        obs = self.env.reset(task_id="bug_detection_easy_1", seed=123)
        self.assertIn("code_diff", obs)
        self.assertIn("file_context", obs)
        self.assertIn("task_description", obs)
        self.assertIn("valid_actions", obs)

        # Single high-severity issue comment targeting line 3 of the diff.
        action = ReviewAction(
            action_type=ReviewActionType.ADD_COMMENT,
            comments=[
                Comment(
                    line_number=3,
                    content="division_by_zero risk",
                    is_issue=True,
                    severity="high",
                )
            ],
            suggestions=[],
        )

        step_obs, reward, done, info = self.env.step(action.model_dump())
        self.assertIn("observation", {"observation": step_obs})
        self.assertIsInstance(reward, float)
        self.assertIsInstance(done, bool)
        self.assertIn("diagnostics", info)
        self.assertIn("task_score", info)

        state = self.env.state()
        self.assertIn("code_context", state)
        self.assertIn("task_metadata", state)
        self.assertIn("comments_made", state)
        self.assertEqual(state["current_step"], 1)
283
+
284
    def test_seed_reproducibility(self):
        """Two envs reset with the same seed must behave identically.

        Compares both the initial observation and the post-step task_score,
        so determinism is checked across the full reset+step path.
        """
        env_a = CodeReviewEnv()
        env_b = CodeReviewEnv()

        obs_a = env_a.reset(task_id="bug_detection_easy_1", seed=7)
        obs_b = env_b.reset(task_id="bug_detection_easy_1", seed=7)

        self.assertEqual(obs_a["code_diff"], obs_b["code_diff"])
        self.assertEqual(obs_a["task_description"], obs_b["task_description"])
        self.assertEqual(obs_a["line_count"], obs_b["line_count"])

        # Apply the identical action to both envs and compare scoring.
        action = ReviewAction(
            action_type=ReviewActionType.ADD_COMMENT,
            comments=[
                Comment(
                    line_number=3,
                    content="division_by_zero issue",
                    is_issue=True,
                    severity="high",
                )
            ],
            suggestions=[],
        ).model_dump()

        _, _, _, info_a = env_a.step(action)
        _, _, _, info_b = env_b.step(action)
        self.assertEqual(info_a["task_score"], info_b["task_score"])
311
+
312
    def test_action_masking_changes_after_comment(self):
        """valid_actions must unlock suggest_fix/mark_as_resolved only after a comment."""
        obs = self.env.reset(task_id="bug_detection_easy_1")
        # Before any comment exists, follow-up actions are masked out.
        self.assertIn("add_comment", obs["valid_actions"])
        self.assertNotIn("suggest_fix", obs["valid_actions"])
        self.assertNotIn("mark_as_resolved", obs["valid_actions"])

        action = ReviewAction(
            action_type=ReviewActionType.ADD_COMMENT,
            comments=[Comment(line_number=3, content="issue", is_issue=True, severity="high")],
            suggestions=[],
        )
        next_obs, _, _, _ = self.env.step(action.model_dump())
        # After one comment, the dependent actions become available.
        self.assertIn("suggest_fix", next_obs["valid_actions"])
        self.assertIn("mark_as_resolved", next_obs["valid_actions"])
325
+ self.assertIn("mark_as_resolved", next_obs["valid_actions"])
326
+
327
    def test_unknown_task_id_raises_key_error(self):
        """reset() with an unrecognized task_id must raise KeyError (no silent fallback)."""
        with self.assertRaises(KeyError):
            self.env.reset(task_id="does_not_exist")
330
+
331
    def test_repeated_comment_gets_penalized(self):
        """Submitting the identical comment twice must earn a lower reward the second time."""
        self.env.reset(task_id="bug_detection_easy_1")

        action = ReviewAction(
            action_type=ReviewActionType.ADD_COMMENT,
            comments=[
                Comment(
                    line_number=3,
                    content="division_by_zero risk",
                    is_issue=True,
                    severity="high",
                )
            ],
            suggestions=[],
        )

        # Same (line, content) key on the second step triggers the duplicate penalty.
        _, first_reward, _, _ = self.env.step(action.model_dump())
        _, second_reward, _, _ = self.env.step(action.model_dump())

        self.assertLess(second_reward, first_reward)
351
+
352
    def test_adversarial_safe_sql_task_should_approve(self):
        """Approving the adversarial no-issue SQL task must end the episode with a perfect score."""
        self.env.reset(task_id="adversarial_hard_4")

        # The task has no expected issues, so an immediate approve is correct.
        action = ReviewAction(
            action_type=ReviewActionType.APPROVE,
            comments=[],
            suggestions=[],
            final_decision="approved",
        )

        obs, _, done, info = self.env.step(action.model_dump())
        self.assertTrue(done)
        self.assertEqual(obs["final_decision_made"], "approved")
        self.assertEqual(info["task_score"], 1.0)
366
+
367
 
368
  if __name__ == "__main__":
369
  unittest.main()