Spaces:

Spirit-26
/

code-review-environment

Running

App Files Files Community

ashishbaberwal committed 1 day ago

Commit

7a23e48

1 Parent(s): 5e92b80

hf new add

Browse files

Files changed (11) hide show

.github/workflows/ci.yml +31 -0
README.md +61 -0
environment/env.py +12 -0
environment/graders.py +4 -0
environment/tasks.py +23 -2
inference.py +51 -3
openenv.yaml +4 -0
scripts/run_benchmark.py +106 -0
scripts/validate-submission.sh +63 -0
submit.py +11 -3
tests/test_env.py +116 -0

.github/workflows/ci.yml ADDED Viewed

	@@ -0,0 +1,31 @@

+# Continuous integration: run the unit tests, validate the OpenEnv manifest and
+# execute the submission preflight on every push and pull request.
+name: CI
+on:
+  push:
+  pull_request:
+# The job only reads the repository, so limit the default token to read access.
+permissions:
+  contents: read
+jobs:
+  test-and-validate:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+          # Reuse pip's download cache between runs (keyed on requirements.txt).
+          cache: pip
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Run unit tests
+        run: pytest -q
+      - name: Validate OpenEnv
+        run: openenv validate
+      # Baseline and docker checks are skipped here via the submit.py flags.
+      - name: Run submit preflight (CI mode)
+        run: python submit.py --skip-baseline --skip-docker --no-coverage

README.md CHANGED Viewed

@@ -181,3 +181,64 @@ Each inference run writes JSON like:
 - The baseline now enforces phased review behavior and falls back to deterministic actions when the model is temporarily unavailable.
 - For reproducible runs, keep `TEMPERATURE=0.0`.

 - The baseline now enforces phased review behavior and falls back to deterministic actions when the model is temporarily unavailable.
 - For reproducible runs, keep `TEMPERATURE=0.0`.
+## Fast Start (3 Commands)
+```bash
+source .venv/bin/activate
+pytest -q
+python submit.py --skip-docker --max-steps 10
+```
+## Judge Map (Criterion -> Evidence)
+| Criterion | Evidence | File |
+|---|---|---|
+| OpenEnv lifecycle compliance | reset/step/state implemented and served over HTTP | `environment/env.py`, `server/app.py` |
+| Typed models | Pydantic action/state/observation models | `environment/models.py` |
+| Task difficulty progression | easy/medium/hard tasks + calibration approve tasks | `environment/tasks.py` |
+| Grading quality | detection/suggestion/decision + partial credit + FP penalty + efficiency bonus | `environment/graders.py` |
+| Baseline reproducibility | deterministic seed support in reset + inference output metadata | `environment/env.py`, `inference.py` |
+| Submission validation | Python preflight + bash validator script | `submit.py`, `scripts/validate-submission.sh` |
+## Grader Rubric (Summary)
+| Component | Weight / Effect | Notes |
+|---|---|---|
+| Detection score | 0.4 | Partial credit for near-line matches |
+| Suggestion score | 0.3 | Line-proximity matching for fixes |
+| Decision score | 0.3 | Approve for no-issue tasks, request_changes otherwise |
+| False positive penalty | up to -0.4 | Strong penalty for issue spam |
+| Efficiency bonus | up to +0.1 | Bonus for completing in fewer steps |
+| Final score clamp | [0,1] | Safety clamp in grader |
+## Benchmark Snapshot (3-Task Local Run)
+| Task | Task Score | Total Reward | Model |
+|---|---:|---:|---|
+| bug_detection_easy_1 | 1.000 | 1.410 | meta/llama-3.3-70b-instruct |
+| memory_leak_medium_1 | 0.875 | 1.285 | meta/llama-3.3-70b-instruct |
+| security_hard_1 | 1.000 | 1.410 | meta/llama-3.3-70b-instruct |
+Note: `task_score` is normalized to [0,1]. `total_reward` is cumulative step reward and can exceed 1.0 by design.
+## One-Command Benchmark Table
+Generate per-task JSON outputs plus a markdown table for judge submission:
+```bash
+source .venv/bin/activate
+python scripts/run_benchmark.py --max-steps 10
+```
+Artifacts:
+- `outputs/benchmark_<task_id>.json`
+- `outputs/benchmark_table.md`
+## Failure Analysis Template
+- Missed issue type:
+- Why it was missed (model behavior or prompt failure):
+- Grader diagnostics (precision/recall/F1/FP):
+- Fix applied (prompt/rubric/task change):

environment/env.py CHANGED Viewed

@@ -71,9 +71,20 @@ class CodeReviewEnv:
             self._state.last_error = str(e)
             return self._get_observation(), -0.1, False, {"error": str(e), "last_action_valid": False}
         self._state.current_step += 1
         self._process_action(review_action)
         self._action_history.append({
             "step": self._state.current_step,
             "action_type": review_action.action_type.value,
@@ -103,6 +114,7 @@ class CodeReviewEnv:
             self._state.final_decision or "changes_requested",
             grader,
             self._state.last_action_valid,
             steps_taken=self._state.current_step,
             max_steps=self.max_steps,
         )

             self._state.last_error = str(e)
             return self._get_observation(), -0.1, False, {"error": str(e), "last_action_valid": False}
+        existing_comment_keys = {
+            (c.line_number, c.content.strip().lower()) for c in self._state.comments_made
+        }
         self._state.current_step += 1
         self._process_action(review_action)
+        duplicate_comment_count = 0
+        if review_action.action_type.value == "add_comment":
+            for c in review_action.comments:
+                key = (c.line_number, c.content.strip().lower())
+                if key in existing_comment_keys:
+                    duplicate_comment_count += 1
         self._action_history.append({
             "step": self._state.current_step,
             "action_type": review_action.action_type.value,
             self._state.final_decision or "changes_requested",
             grader,
             self._state.last_action_valid,
+            duplicate_comment_count=duplicate_comment_count,
             steps_taken=self._state.current_step,
             max_steps=self.max_steps,
         )

environment/graders.py CHANGED Viewed

@@ -204,6 +204,7 @@ class RewardCalculator:
                          final_decision: str,
                          grader: TaskGrader,
                          last_action_valid: bool,
                          steps_taken: int = 0,
                          max_steps: int = 50) -> float:
@@ -223,6 +224,9 @@ class RewardCalculator:
         if not last_action_valid:
             reward -= 0.15
         if not current_action.comments and not current_action.suggestions:
             if current_action.action_type.value in ["approve", "request_changes"]:
                 pass

                          final_decision: str,
                          grader: TaskGrader,
                          last_action_valid: bool,
+                         duplicate_comment_count: int = 0,
                          steps_taken: int = 0,
                          max_steps: int = 50) -> float:
         if not last_action_valid:
             reward -= 0.15
+        if duplicate_comment_count > 0:
+            reward -= min(0.25, 0.08 * duplicate_comment_count)
         if not current_action.comments and not current_action.suggestions:
             if current_action.action_type.value in ["approve", "request_changes"]:
                 pass

environment/tasks.py CHANGED Viewed

@@ -276,6 +276,27 @@ class Counter:
             "language": "python",
             "line_count": 9,
             "expected_issues": []
         }
     ]
@@ -286,8 +307,8 @@ class Counter:
         for task in all_tasks:
             if task["task_id"] == canonical_task_id:
                 return task
-        print(f"WARNING: task_id '{task_id}' not found, falling back to bug_detection_easy_1")
-        return cls.EASY_TASKS[0]
     @classmethod
     def get_all_tasks(cls) -> List[Dict[str, Any]]:

             "language": "python",
             "line_count": 9,
             "expected_issues": []
+        },
+        {
+            "task_id": "adversarial_hard_4",
+            "task_name": "Adversarial: Safe SQL Builder",
+            "difficulty": "hard",
+            "description": "No issues expected: query text looks suspicious but uses parameterized execution safely",
+            "code_diff": """def find_user(database, user_id):
+    query = \"SELECT * FROM users WHERE id = ?\"
+    params = [int(user_id)]
+    return database.execute(query, params)""",
+            "surrounding_code": """def find_user(database, user_id):
+    query = \"SELECT * FROM users WHERE id = ?\"
+    params = [int(user_id)]
+    return database.execute(query, params)
+def find_all_users(database):
+    return database.execute(\"SELECT * FROM users\")""",
+            "file_path": "user_repository.py",
+            "language": "python",
+            "line_count": 4,
+            "expected_issues": []
         }
     ]
         for task in all_tasks:
             if task["task_id"] == canonical_task_id:
                 return task
+        available = ", ".join(t["task_id"] for t in all_tasks)
+        raise KeyError(f"Unknown task_id '{task_id}'. Available task IDs: {available}")
     @classmethod
     def get_all_tasks(cls) -> List[Dict[str, Any]]:

inference.py CHANGED Viewed

@@ -52,6 +52,38 @@ FALLBACK_ACTION = json.dumps({
 })
 def add_line_numbers(code: str) -> str:
     lines = code.split("\n")
     return "\n".join(f"{i+1}: {line}" for i, line in enumerate(lines))
@@ -401,17 +433,20 @@ Respond with JSON only.
             return {"action_type": "request_changes", "comments": [], "suggestions": []}
-def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
     agent.reset()
-    obs = env.reset(task_id=task_id)
     done = False
     step = 0
     total_reward = 0.0
     print(f"\nTask    : {task_id}")
     print(f"Desc    : {obs.get('task_description', 'N/A')}")
     print("-" * 60)
     while not done and step < max_steps:
         action_str = agent.get_action(obs)
         action = agent.parse_action(action_str)
@@ -419,6 +454,7 @@ def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
         obs, reward, done, info = env.step(action)
         total_reward += reward
         step += 1
         print(f"\nStep {step}/{max_steps}:")
@@ -434,8 +470,18 @@ def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
         if info.get('last_action_valid') is False:
             print(f"  Warning     : {info.get('error', 'Invalid action')}")
     final_score = env.get_task_score()
     diagnostics = env.summary()
     return {
         "task_id": task_id,
@@ -450,6 +496,7 @@ def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
         "efficiency_bonus": diagnostics.get("efficiency_bonus", 0),
         "model": MODEL_NAME,
         "api_base_url": API_BASE_URL,
     }
@@ -508,6 +555,7 @@ def main():
     parser.add_argument("--output", type=str, default="baseline_results.json")
     parser.add_argument("--batch", action="store_true", help="Run all tasks in batch mode")
     parser.add_argument("--difficulty", type=str, default=None, help="Filter batch by difficulty: easy, medium, hard")
     args = parser.parse_args()
     print("=" * 60)
@@ -526,7 +574,7 @@ def main():
             task_ids = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
         run_batch(env, agent, task_ids, args.max_steps, args.output)
     else:
-        result = run_episode(env, agent, args.task_id, args.max_steps)
         print("\n" + "=" * 60)
         print("Final Results:")

 })
+def log_start(task: str, env_name: str, model: str, max_steps: int, seed: int | None) -> None:
+    payload = {
+        "task": task,
+        "env": env_name,
+        "model": model,
+        "max_steps": max_steps,
+        "seed": seed,
+    }
+    print(f"[START] {json.dumps(payload, ensure_ascii=True, separators=(',', ':'))}", flush=True)
+def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
+    payload = {
+        "step": step,
+        "action": action,
+        "reward": round(float(reward), 4),
+        "done": bool(done),
+        "error": error,
+    }
+    print(f"[STEP] {json.dumps(payload, ensure_ascii=True, separators=(',', ':'))}", flush=True)
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    payload = {
+        "success": bool(success),
+        "steps": int(steps),
+        "score": round(float(score), 4),
+        "rewards": [round(float(r), 4) for r in rewards],
+    }
+    print(f"[END] {json.dumps(payload, ensure_ascii=True, separators=(',', ':'))}", flush=True)
 def add_line_numbers(code: str) -> str:
     lines = code.split("\n")
     return "\n".join(f"{i+1}: {line}" for i, line in enumerate(lines))
             return {"action_type": "request_changes", "comments": [], "suggestions": []}
+def run_episode(env, agent, task_id: str, max_steps: int, seed: int | None = None) -> Dict[str, Any]:
     agent.reset()
+    obs = env.reset(task_id=task_id, seed=seed)
     done = False
     step = 0
     total_reward = 0.0
+    rewards: List[float] = []
     print(f"\nTask    : {task_id}")
     print(f"Desc    : {obs.get('task_description', 'N/A')}")
     print("-" * 60)
+    log_start(task=task_id, env_name="code-review-agent-env", model=MODEL_NAME, max_steps=max_steps, seed=seed)
     while not done and step < max_steps:
         action_str = agent.get_action(obs)
         action = agent.parse_action(action_str)
         obs, reward, done, info = env.step(action)
         total_reward += reward
+        rewards.append(float(reward))
         step += 1
         print(f"\nStep {step}/{max_steps}:")
         if info.get('last_action_valid') is False:
             print(f"  Warning     : {info.get('error', 'Invalid action')}")
+        log_step(
+            step=step,
+            action=action.get("action_type", "unknown"),
+            reward=float(reward),
+            done=bool(done),
+            error=info.get("error"),
+        )
     final_score = env.get_task_score()
     diagnostics = env.summary()
+    success = final_score >= 0.7
+    log_end(success=success, steps=step, score=final_score, rewards=rewards)
     return {
         "task_id": task_id,
         "efficiency_bonus": diagnostics.get("efficiency_bonus", 0),
         "model": MODEL_NAME,
         "api_base_url": API_BASE_URL,
+        "seed": seed,
     }
     parser.add_argument("--output", type=str, default="baseline_results.json")
     parser.add_argument("--batch", action="store_true", help="Run all tasks in batch mode")
     parser.add_argument("--difficulty", type=str, default=None, help="Filter batch by difficulty: easy, medium, hard")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed for deterministic environment reset")
     args = parser.parse_args()
     print("=" * 60)
             task_ids = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
         run_batch(env, agent, task_ids, args.max_steps, args.output)
     else:
+        result = run_episode(env, agent, args.task_id, args.max_steps, seed=args.seed)
         print("\n" + "=" * 60)
         print("Final Results:")

openenv.yaml CHANGED Viewed

@@ -58,6 +58,10 @@ tasks:
     name: "Hard: Approve Thread-Safe Counter"
     difficulty: hard
 observation_space:
   type: dict
   description: |

     name: "Hard: Approve Thread-Safe Counter"
     difficulty: hard
+  - id: adversarial_hard_4
+    name: "Hard: Adversarial Safe SQL Builder"
+    difficulty: hard
 observation_space:
   type: dict
   description: |

scripts/run_benchmark.py ADDED Viewed

	@@ -0,0 +1,106 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import json
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any, Dict, List
+REPO_ROOT = Path(__file__).resolve().parents[1]
+INFERENCE_PATH = REPO_ROOT / "inference.py"
+DEFAULT_TASKS = [
+    "bug_detection_easy_1",
+    "memory_leak_medium_1",
+    "security_hard_1",
+]
+def run_task(task_id: str, max_steps: int, output_dir: Path) -> Dict[str, Any]:
+    output_path = output_dir / f"benchmark_{task_id}.json"
+    cmd = [
+        sys.executable,
+        str(INFERENCE_PATH),
+        "--task-id",
+        task_id,
+        "--max-steps",
+        str(max_steps),
+        "--output",
+        str(output_path),
+    ]
+    completed = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))
+    if completed.returncode != 0:
+        raise RuntimeError(
+            f"Task {task_id} failed with exit code {completed.returncode}\n"
+            f"stdout:\n{completed.stdout}\n\n"
+            f"stderr:\n{completed.stderr}"
+        )
+    with output_path.open("r", encoding="utf-8") as fh:
+        return json.load(fh)
+def to_markdown(results: List[Dict[str, Any]]) -> str:
+    lines = [
+        "# Benchmark Results",
+        "",
+        "| Task | Task Score | Total Reward | Steps | Model |",
+        "|---|---:|---:|---:|---|",
+    ]
+    for row in results:
+        lines.append(
+            f"| {row.get('task_id')} | {row.get('task_score', 0):.3f} | "
+            f"{row.get('total_reward', 0):.3f} | {row.get('steps', 0)} | {row.get('model', 'unknown')} |"
+        )
+    avg_score = sum(float(r.get("task_score", 0.0)) for r in results) / max(1, len(results))
+    avg_reward = sum(float(r.get("total_reward", 0.0)) for r in results) / max(1, len(results))
+    lines.extend(
+        [
+            "",
+            f"Average task score: **{avg_score:.3f}**",
+            f"Average total reward: **{avg_reward:.3f}**",
+            "",
+            "Note: task_score is normalized to [0, 1]; total_reward is cumulative and may exceed 1.0.",
+        ]
+    )
+    return "\n".join(lines) + "\n"
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Run benchmark tasks and generate a markdown table")
+    parser.add_argument(
+        "--tasks",
+        nargs="+",
+        default=DEFAULT_TASKS,
+        help="Task IDs to evaluate (default: 3 core tasks)",
+    )
+    parser.add_argument("--max-steps", type=int, default=10)
+    parser.add_argument("--output-dir", type=Path, default=REPO_ROOT / "outputs")
+    parser.add_argument("--table", type=Path, default=REPO_ROOT / "outputs/benchmark_table.md")
+    args = parser.parse_args()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+    args.table.parent.mkdir(parents=True, exist_ok=True)
+    results: List[Dict[str, Any]] = []
+    for task_id in args.tasks:
+        print(f"Running task: {task_id}")
+        result = run_task(task_id, args.max_steps, args.output_dir)
+        results.append(result)
+    table = to_markdown(results)
+    args.table.write_text(table, encoding="utf-8")
+    print(f"Wrote benchmark table to {args.table}")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

scripts/validate-submission.sh ADDED Viewed

	@@ -0,0 +1,63 @@

+#!/usr/bin/env bash
+# validate-submission.sh — OpenEnv submission validator
+# Usage: ./scripts/validate-submission.sh <ping_url> [repo_dir]
+set -euo pipefail
+PING_URL="${1:-}"
+REPO_DIR="${2:-.}"
+DOCKER_BUILD_TIMEOUT="${DOCKER_BUILD_TIMEOUT:-600}"
+if [[ -z "$PING_URL" ]]; then
+  echo "Usage: $0 <ping_url> [repo_dir]"
+  echo "Example: $0 https://my-space.hf.space ."
+  exit 2
+fi
+if [[ ! -d "$REPO_DIR" ]]; then
+  echo "ERROR: repo_dir not found: $REPO_DIR"
+  exit 2
+fi
+cd "$REPO_DIR"
+echo "==> 1/4 Ping Space root"
+ROOT_CODE=$(curl -s -o /tmp/openenv_root.json -w '%{http_code}' "$PING_URL/")
+if [[ "$ROOT_CODE" != "200" ]]; then
+  echo "FAIL: root ping returned $ROOT_CODE"
+  exit 1
+fi
+echo "==> 2/4 Ping reset endpoint"
+RESET_CODE=$(curl -s -o /tmp/openenv_reset.json -w '%{http_code}' "$PING_URL/reset")
+if [[ "$RESET_CODE" != "200" ]]; then
+  echo "FAIL: reset ping returned $RESET_CODE"
+  exit 1
+fi
+if ! grep -q '"observation"' /tmp/openenv_reset.json; then
+  echo "FAIL: /reset did not return observation"
+  exit 1
+fi
+echo "==> 3/4 openenv validate"
+openenv validate
+echo "==> 4/4 docker build"
+if ! command -v docker >/dev/null 2>&1; then
+  echo "FAIL: docker command not found"
+  exit 1
+fi
+if command -v timeout >/dev/null 2>&1; then
+  if ! timeout "$DOCKER_BUILD_TIMEOUT" docker build -t code-review-env-validate .; then
+    echo "FAIL: docker build failed or timed out"
+    exit 1
+  fi
+else
+  echo "WARN: timeout command not found, running docker build without timeout"
+  if ! docker build -t code-review-env-validate .; then
+    echo "FAIL: docker build failed"
+    exit 1
+  fi
+fi
+echo "PASS: submission validator checks completed"

submit.py CHANGED Viewed

@@ -249,6 +249,11 @@ def main() -> int:
         default="code-review-env",
         help="Docker image name for validation build",
     )
     parser.add_argument(
         "--report-path",
         default="submission_report.json",
@@ -268,8 +273,11 @@ def main() -> int:
     ok, detail = run_tests(with_coverage=not args.no_coverage)
     checks["tests"] = detail
-    ok, detail = check_docker(args.image_name)
-    checks["docker"] = detail
     baseline_scores: Dict[str, float] = {}
     if args.skip_baseline:
@@ -283,7 +291,7 @@ def main() -> int:
     required_checks_ok = (
         checks["validation"]["ok"]
         and checks["tests"]["ok"]
-        and checks["docker"]["ok"]
     )
     if required_checks_ok:

         default="code-review-env",
         help="Docker image name for validation build",
     )
+    parser.add_argument(
+        "--skip-docker",
+        action="store_true",
+        help="Skip docker build checks (useful in CI environments without docker daemon)",
+    )
     parser.add_argument(
         "--report-path",
         default="submission_report.json",
     ok, detail = run_tests(with_coverage=not args.no_coverage)
     checks["tests"] = detail
+    if args.skip_docker:
+        checks["docker"] = {"ok": False, "skipped": True, "reason": "Skipped by --skip-docker"}
+    else:
+        ok, detail = check_docker(args.image_name)
+        checks["docker"] = detail
     baseline_scores: Dict[str, float] = {}
     if args.skip_baseline:
     required_checks_ok = (
         checks["validation"]["ok"]
         and checks["tests"]["ok"]
+        and (checks["docker"].get("ok") or checks["docker"].get("skipped"))
     )
     if required_checks_ok:

tests/test_env.py CHANGED Viewed

@@ -248,6 +248,122 @@ class TestCodeReviewEnv(unittest.TestCase):
         self.assertGreater(info["diagnostics"]["false_positive_count"], 0)
         self.assertLess(info["task_score"], 1.0)
 if __name__ == "__main__":
     unittest.main()

         self.assertGreater(info["diagnostics"]["false_positive_count"], 0)
         self.assertLess(info["task_score"], 1.0)
+    def test_reset_step_state_contract(self):
+        obs = self.env.reset(task_id="bug_detection_easy_1", seed=123)
+        self.assertIn("code_diff", obs)
+        self.assertIn("file_context", obs)
+        self.assertIn("task_description", obs)
+        self.assertIn("valid_actions", obs)
+        action = ReviewAction(
+            action_type=ReviewActionType.ADD_COMMENT,
+            comments=[
+                Comment(
+                    line_number=3,
+                    content="division_by_zero risk",
+                    is_issue=True,
+                    severity="high",
+                )
+            ],
+            suggestions=[],
+        )
+        step_obs, reward, done, info = self.env.step(action.model_dump())
+        self.assertIn("observation", {"observation": step_obs})
+        self.assertIsInstance(reward, float)
+        self.assertIsInstance(done, bool)
+        self.assertIn("diagnostics", info)
+        self.assertIn("task_score", info)
+        state = self.env.state()
+        self.assertIn("code_context", state)
+        self.assertIn("task_metadata", state)
+        self.assertIn("comments_made", state)
+        self.assertEqual(state["current_step"], 1)
+    def test_seed_reproducibility(self):
+        env_a = CodeReviewEnv()
+        env_b = CodeReviewEnv()
+        obs_a = env_a.reset(task_id="bug_detection_easy_1", seed=7)
+        obs_b = env_b.reset(task_id="bug_detection_easy_1", seed=7)
+        self.assertEqual(obs_a["code_diff"], obs_b["code_diff"])
+        self.assertEqual(obs_a["task_description"], obs_b["task_description"])
+        self.assertEqual(obs_a["line_count"], obs_b["line_count"])
+        action = ReviewAction(
+            action_type=ReviewActionType.ADD_COMMENT,
+            comments=[
+                Comment(
+                    line_number=3,
+                    content="division_by_zero issue",
+                    is_issue=True,
+                    severity="high",
+                )
+            ],
+            suggestions=[],
+        ).model_dump()
+        _, _, _, info_a = env_a.step(action)
+        _, _, _, info_b = env_b.step(action)
+        self.assertEqual(info_a["task_score"], info_b["task_score"])
+    def test_action_masking_changes_after_comment(self):
+        obs = self.env.reset(task_id="bug_detection_easy_1")
+        self.assertIn("add_comment", obs["valid_actions"])
+        self.assertNotIn("suggest_fix", obs["valid_actions"])
+        self.assertNotIn("mark_as_resolved", obs["valid_actions"])
+        action = ReviewAction(
+            action_type=ReviewActionType.ADD_COMMENT,
+            comments=[Comment(line_number=3, content="issue", is_issue=True, severity="high")],
+            suggestions=[],
+        )
+        next_obs, _, _, _ = self.env.step(action.model_dump())
+        self.assertIn("suggest_fix", next_obs["valid_actions"])
+        self.assertIn("mark_as_resolved", next_obs["valid_actions"])
+    def test_unknown_task_id_raises_key_error(self):
+        with self.assertRaises(KeyError):
+            self.env.reset(task_id="does_not_exist")
+    def test_repeated_comment_gets_penalized(self):
+        self.env.reset(task_id="bug_detection_easy_1")
+        action = ReviewAction(
+            action_type=ReviewActionType.ADD_COMMENT,
+            comments=[
+                Comment(
+                    line_number=3,
+                    content="division_by_zero risk",
+                    is_issue=True,
+                    severity="high",
+                )
+            ],
+            suggestions=[],
+        )
+        _, first_reward, _, _ = self.env.step(action.model_dump())
+        _, second_reward, _, _ = self.env.step(action.model_dump())
+        self.assertLess(second_reward, first_reward)
+    def test_adversarial_safe_sql_task_should_approve(self):
+        self.env.reset(task_id="adversarial_hard_4")
+        action = ReviewAction(
+            action_type=ReviewActionType.APPROVE,
+            comments=[],
+            suggestions=[],
+            final_decision="approved",
+        )
+        obs, _, done, info = self.env.step(action.model_dump())
+        self.assertTrue(done)
+        self.assertEqual(obs["final_decision_made"], "approved")
+        self.assertEqual(info["task_score"], 1.0)
 if __name__ == "__main__":
     unittest.main()