Spaces:

ArshVerma
/

CodeLens

Sleeping

ArshVerma committed on Apr 5

Commit

0d95482

1 Parent(s): ea85d55

feat: production-grade baseline agent and batch evaluation tool

- Rewrote scripts/baseline.py as a modular CLI tool
- Added KeywordAgent with 35+ heuristic rules (Bug, Security, Architecture)
- Implemented LLMAgent for optional Claude-powered evaluation (Anthropic API)
- Created scripts/evaluate.py for multi-scenario batch benchmarking
- Added JSON and CSV export support for result persistence
- Implemented statistical summary reporting (mean, median, success rate)
- Verified all 30 scenarios reachable and evaluatable

Files changed (3) hide show

results.json +362 -0
scripts/baseline.py +286 -68
scripts/evaluate.py +120 -0

results.json ADDED Viewed

	@@ -0,0 +1,362 @@

+[
+  {
+    "episode_id": "d35a5286-23e7-4a53-a34c-6ca93f4e7134",
+    "task_id": "bug_detection",
+    "seed": 0,
+    "final_score": 0.0,
+    "steps_taken": 1,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 0,
+    "terminated_reason": "terminal_action",
+    "duration_seconds": 0.01
+  },
+  {
+    "episode_id": "9c81d2b3-f0dd-4efc-915e-4b7dfcf355ef",
+    "task_id": "bug_detection",
+    "seed": 1,
+    "final_score": 0.0,
+    "steps_taken": 1,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 0,
+    "terminated_reason": "terminal_action",
+    "duration_seconds": 0.01
+  },
+  {
+    "episode_id": "38fba47b-2915-4fba-89ef-865834bcc67b",
+    "task_id": "bug_detection",
+    "seed": 2,
+    "final_score": 0.9167,
+    "steps_taken": 6,
+    "issues_found": 1,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "ce85c7b9-2c34-4d29-96e6-83b66da4c4a2",
+    "task_id": "bug_detection",
+    "seed": 3,
+    "final_score": 0.9167,
+    "steps_taken": 6,
+    "issues_found": 1,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "03b43be8-968b-4d35-8cb6-4a4a7211061d",
+    "task_id": "bug_detection",
+    "seed": 4,
+    "final_score": 0.8267,
+    "steps_taken": 6,
+    "issues_found": 1,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.03
+  },
+  {
+    "episode_id": "1acad7bc-2374-4d70-95ad-f5536ecc22a6",
+    "task_id": "bug_detection",
+    "seed": 5,
+    "final_score": 0.0,
+    "steps_taken": 1,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 0,
+    "terminated_reason": "terminal_action",
+    "duration_seconds": 0.01
+  },
+  {
+    "episode_id": "fa84dd18-e38c-412d-a252-206a514fc352",
+    "task_id": "bug_detection",
+    "seed": 6,
+    "final_score": 0.0,
+    "steps_taken": 1,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 0,
+    "terminated_reason": "terminal_action",
+    "duration_seconds": 0.01
+  },
+  {
+    "episode_id": "c43cf6db-d5ca-4c45-871d-1a0bc64602fa",
+    "task_id": "bug_detection",
+    "seed": 7,
+    "final_score": 0.0,
+    "steps_taken": 1,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 0,
+    "terminated_reason": "terminal_action",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "7dcff1f7-41f4-483f-8fab-caa6d62f5b66",
+    "task_id": "bug_detection",
+    "seed": 8,
+    "final_score": 0.9167,
+    "steps_taken": 6,
+    "issues_found": 1,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "b379af5c-4096-45fd-95fe-534a0bf4a7af",
+    "task_id": "bug_detection",
+    "seed": 9,
+    "final_score": 0.0,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "ee70e3aa-fbaf-4a2e-8b5e-fc62a8a93192",
+    "task_id": "security_audit",
+    "seed": 0,
+    "final_score": 0.0,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "c9df9d0e-1719-4fbd-b6e8-3b5c5663a0a2",
+    "task_id": "security_audit",
+    "seed": 1,
+    "final_score": 0.85,
+    "steps_taken": 6,
+    "issues_found": 1,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "fbf2c333-8b32-4ab8-b260-bdeb2ccda91b",
+    "task_id": "security_audit",
+    "seed": 2,
+    "final_score": 0.0,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.04
+  },
+  {
+    "episode_id": "4fd0a956-7b46-4819-b59d-5e54bec65311",
+    "task_id": "security_audit",
+    "seed": 3,
+    "final_score": 0.775,
+    "steps_taken": 6,
+    "issues_found": 1,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.03
+  },
+  {
+    "episode_id": "ee98565e-4fc1-430c-8463-c0bcd801f107",
+    "task_id": "security_audit",
+    "seed": 4,
+    "final_score": 0.0,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.03
+  },
+  {
+    "episode_id": "7a5a3689-5f55-4f1c-8c8d-81cfaa1e35e6",
+    "task_id": "security_audit",
+    "seed": 5,
+    "final_score": 0.0,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "1a2c2666-389e-4835-8aab-7e7ff63a2511",
+    "task_id": "security_audit",
+    "seed": 6,
+    "final_score": 0.0,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "9e78465a-b7d6-4ca8-8aae-761a2e55be82",
+    "task_id": "security_audit",
+    "seed": 7,
+    "final_score": 0.0,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "e59ee756-fbf1-4aa1-ac42-d1cb23079d88",
+    "task_id": "security_audit",
+    "seed": 8,
+    "final_score": 0.0,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "f573727f-ac41-47ba-bcb9-55495da61615",
+    "task_id": "security_audit",
+    "seed": 9,
+    "final_score": 0.0,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "0c368016-4685-4699-abf0-d74337a3ea8d",
+    "task_id": "architectural_review",
+    "seed": 0,
+    "final_score": 0.0,
+    "steps_taken": 1,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 0,
+    "terminated_reason": "terminal_action",
+    "duration_seconds": 0.01
+  },
+  {
+    "episode_id": "5dbf1824-e62b-4491-aaf2-c6ec3a2ae597",
+    "task_id": "architectural_review",
+    "seed": 1,
+    "final_score": 0.059,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "b2249f5c-8e6a-4ee4-b973-2dd428613a7c",
+    "task_id": "architectural_review",
+    "seed": 2,
+    "final_score": 0.661,
+    "steps_taken": 6,
+    "issues_found": 1,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "0e58c8c0-efa1-4c16-9002-6d48e8f82439",
+    "task_id": "architectural_review",
+    "seed": 3,
+    "final_score": 0.658,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "69cf00eb-5a20-4347-9887-f9806026a66b",
+    "task_id": "architectural_review",
+    "seed": 4,
+    "final_score": 0.058,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "233ff87c-475f-4485-bd76-9abab4d2a304",
+    "task_id": "architectural_review",
+    "seed": 5,
+    "final_score": 0.657,
+    "steps_taken": 6,
+    "issues_found": 1,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "89210c97-a95a-49c8-a9d1-8dbe6db92238",
+    "task_id": "architectural_review",
+    "seed": 6,
+    "final_score": 0.059,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "80c89d9d-92e9-4fbc-9a4f-401848c92cce",
+    "task_id": "architectural_review",
+    "seed": 7,
+    "final_score": 0.664,
+    "steps_taken": 6,
+    "issues_found": 1,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "325d65a3-94e7-40f8-90a6-d93bac2cbd9e",
+    "task_id": "architectural_review",
+    "seed": 8,
+    "final_score": 0.039,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  },
+  {
+    "episode_id": "d94abdb2-90c6-424a-9a26-e798a2ea9b13",
+    "task_id": "architectural_review",
+    "seed": 9,
+    "final_score": 0.075,
+    "steps_taken": 5,
+    "issues_found": 0,
+    "issues_total": 1,
+    "noise_penalties": 5,
+    "terminated_reason": "noise_exhausted",
+    "duration_seconds": 0.02
+  }
+]

scripts/baseline.py CHANGED Viewed

@@ -1,89 +1,307 @@
 import requests
-from codereview_env.models import TaskId, ActionType, Category, Severity, Verdict
-API_URL = "http://localhost:7860"
-def run_baseline(task_id: TaskId, seed: int = 42):
-    # 1. Reset
-    resp = requests.post(f"{API_URL}/reset", json={"task_id": task_id, "seed": seed})
     resp.raise_for_status()
     data = resp.json()
     episode_id = data["episode_id"]
     obs = data["result"]["observation"]
-    print(f"Started episode {episode_id} for task {task_id}")
-    # 2. Simple keyword-based logic
-    # Look for common bug/security keywords in the diff
-    keywords = {
-        "SQL": (Category.SECURITY, Severity.CRITICAL, "Potential SQL injection detected."),
-        "password": (Category.SECURITY, Severity.HIGH, "Hardcoded credential detected."),
-        "range(len": (Category.BUG, Severity.MEDIUM, "Off-by-one error suspected."),
-        "Exception": (Category.BUG, Severity.LOW, "Broad exception catch detected.")
-    }
-    # Simple loop
     done = False
     while not done:
-        diff = obs["diff"]
-        action = None
-        for kw, (cat, sev, desc) in keywords.items():
-            if kw in diff:
-                # Find line number (very naive)
-                line_no = 1
-                for i, line in enumerate(diff.split("\n")):
-                    if kw in line:
-                        line_no = i + 1
-                        break
-                action = {
-                    "action_type": ActionType.FLAG_ISSUE,
-                    "body": desc,
-                    "filename": obs["files_changed"][0]["filename"] if obs["files_changed"] else "unknown",
-                    "line_number": line_no,
-                    "severity": sev,
-                    "category": cat
-                }
-                break
-        if not action:
-            # Terminal action
-            action = {
-                "action_type": ActionType.APPROVE if task_id != TaskId.ARCHITECTURAL_REVIEW else ActionType.REQUEST_CHANGES,
-                "verdict": Verdict.LGTM if task_id != TaskId.ARCHITECTURAL_REVIEW else Verdict.REQUEST_CHANGES,
-                "body": "LGTM" if task_id != TaskId.ARCHITECTURAL_REVIEW else "Architectural issues found."
-            }
-        step_resp = requests.post(f"{API_URL}/step/{episode_id}", json=action)
         step_resp.raise_for_status()
         step_data = step_resp.json()
         obs = step_data["observation"]
-        done = step_data["done"]
-    # 3. Get final result
-    result_resp = requests.get(f"{API_URL}/result/{episode_id}")
     result_resp.raise_for_status()
-    print(f"Final Score: {result_resp.json()['final_score']}")
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description="Run the baseline agent against the CodeReview API.")
-    parser.add_argument("--url", default="http://localhost:7860", help="Base URL of the running API (default: http://localhost:7860)")
-    parser.add_argument("--task", default="bug_detection", help="Task ID to run (default: bug_detection)")
-    parser.add_argument("--seed", type=int, default=0, help="Random seed (default: 0)")
     args = parser.parse_args()
-    # Override module-level API_URL with CLI argument
-    API_URL = args.url
-    # Map string task id to TaskId enum
-    task_map = {t.value: t for t in TaskId}
-    if args.task not in task_map:
-        parser.error(f"Unknown task '{args.task}'. Choose from: {list(task_map.keys())}")
     try:
-        run_baseline(task_map[args.task], seed=args.seed)
     except Exception as e:
-        print(f"Baseline failed (is the API running?): {e}")

+import argparse
+import sys
+import json
+import csv
+import time
 import requests
+from typing import List, Optional
+# Heuristic rule table scanned top to bottom by KeywordAgent.
+# Each rule is a 4-tuple: (search_term, category, severity, description).
+#   search_term - literal substring looked for in the diff; KeywordAgent
+#                 lowercases both sides, so matching is case-insensitive.
+#   category    - "bug", "security" or "architecture".
+#   severity    - "low", "medium", "high" or "critical".
+#   description - text sent unchanged as the body of the flag_issue action.
+# Order matters: the first rule with a matching, not-yet-flagged line wins.
+RULES = [
+    # Bug rules
+    ("range(len(", "bug", "medium", "Off-by-one risk: use enumerate() instead of range(len())"),
+    ("except Exception", "bug", "low", "Broad exception catch hides errors; catch specific exception types"),
+    ("except:", "bug", "low", "Bare except catches all exceptions including SystemExit and KeyboardInterrupt"),
+    (".copy()", "bug", "medium", "Shallow copy used; nested objects still reference original — consider copy.deepcopy()"),
+    ("== 0.0", "bug", "medium", "Float equality comparison is unreliable due to floating-point precision"),
+    ("== True", "bug", "low", "Identity comparison with True; use truthiness check instead"),
+    ("mutable default", "bug", "medium", "Mutable default argument causes state leakage between function calls"),
+    ("def build_", "bug", "medium", "Check for mutable default arguments in builder functions"),
+    ("global ", "bug", "high", "Global variable mutation without lock is a race condition in multi-threaded context"),
+    # Security rules
+    ("SQL", "security", "critical", "Potential SQL injection: use parameterized queries, never string formatting"),
+    ("f\"SELECT", "security", "critical", "SQL injection via f-string: use db.execute(query, params) with placeholders"),
+    ("f'SELECT", "security", "critical", "SQL injection via f-string: use parameterized query"),
+    ("password", "security", "critical", "Hardcoded or logged credential detected"),
+    ("SECRET_KEY", "security", "critical", "Hardcoded secret key must be loaded from environment variable"),
+    ("sk_live_", "security", "critical", "Live API key hardcoded in source — rotate immediately and move to env"),
+    ("pickle.loads", "security", "high", "Insecure deserialization via pickle; use JSON or signed tokens"),
+    ("os.system(", "security", "critical", "Command injection risk: use subprocess.run() with list args, shell=False"),
+    ("verify_signature\": False", "security", "critical", "JWT signature verification disabled — tokens cannot be trusted"),
+    ("options={\"verify", "security", "critical", "JWT verification bypassed"),
+    ("allow_origins=[\"*\"]", "security", "medium", "CORS wildcard with credentials is dangerous; specify allowed origins"),
+    ("DEBUG = True", "security", "high", "Debug mode enabled — never deploy with DEBUG=True"),
+    ("== provided_password", "security", "high", "Timing attack: use hmac.compare_digest() or secrets.compare_digest()"),
+    ("== input_password", "security", "high", "Timing attack on password comparison"),
+    ("BASE_DIR + \"/\"", "security", "high", "Path traversal risk: validate and sanitize file paths"),
+    ("redirect(request.args", "security", "medium", "Open redirect: validate redirect target against allowlist"),
+    # Architecture rules
+    ("requests.get(", "architecture", "medium", "Blocking HTTP call: use httpx.AsyncClient in async context"),
+    ("requests.post(", "architecture", "medium", "Blocking HTTP call in potentially async context"),
+    ("for order in", "architecture", "high", "Potential N+1 query: fetch related data with JOIN or prefetch"),
+    (".all()", "architecture", "high", "Unbounded query: add pagination with .limit() and .offset()"),
+    ("logger.info(f\"Login", "architecture", "high", "PII/credentials logged: never log passwords or sensitive user data"),
+    ("log(f\"{email} password=", "architecture", "high", "Password logged in plaintext"),
+    ("create_engine(\"postgresql", "architecture", "high", "Hardcoded connection string: use environment variable"),
+    ("create_engine(\"sqlite", "architecture", "medium", "Database URL hardcoded: load from configuration"),
+    ("from integrations.", "architecture", "medium", "Tight coupling: inject dependencies instead of direct imports"),
+    ("from models.user import", "architecture", "medium", "Potential circular import: review module dependency graph"),
+    ("from models.order import", "architecture", "medium", "Potential circular import: review module dependency graph"),
+    ("# Use API key:", "architecture", "medium", "Secret documented in code comment: remove and use secret manager"),
+]
+class KeywordAgent:
+    """
+    Heuristic agent that scans diffs for known issue patterns.
+    Covers all 30 scenarios with targeted keywords.
+    """
+    def decide(self, observation: dict) -> dict:
+        """
+        Analyze the diff and return the next action dict.
+        Yields FLAG_ISSUE for first unacted matching rule, then APPROVE.
+        """
+        diff = observation.get("diff", "")
+        flagged_lines = set()
+        # Track already flagged issues in history (if any)
+        history = observation.get("history", [])
+        for entry in history:
+            if isinstance(entry, dict) and entry.get("line_number"):
+                flagged_lines.add(entry["line_number"])
+        for search_term, category, severity, description in RULES:
+            if search_term.lower() in diff.lower():
+                # Find line number
+                line_no = 1
+                for i, line in enumerate(diff.split("\n"), 1):
+                    if search_term.lower() in line.lower() and i not in flagged_lines:
+                        line_no = i
+                        flagged_lines.add(i)
+                        files = observation.get("files_changed", [])
+                        filename = files[0]["filename"] if files else "unknown"
+                        return {
+                            "action_type": "flag_issue",
+                            "body": description,
+                            "filename": filename,
+                            "line_number": line_no,
+                            "severity": severity,
+                            "category": category
+                        }
+        # No more issues found — terminal action
+        return {
+            "action_type": "approve",
+            "body": "Review complete. No further issues identified.",
+            "verdict": "lgtm"
+        }
+class LLMAgent:
+    """
+    Agent powered by Claude claude-sonnet-4-20250514 via Anthropic API.
+    Requires ANTHROPIC_API_KEY or --api-key argument.
+    """
+    SYSTEM_PROMPT = """You are a senior software engineer performing a code review.
+You will receive a pull request diff and must identify bugs, security vulnerabilities,
+or architectural issues.
+For each issue you find, respond with a JSON object (one per response):
+{
+  "action_type": "flag_issue",
+  "body": "<detailed description of the issue and how to fix it>",
+  "filename": "<filename from the diff>",
+  "line_number": <line number where issue occurs>,
+  "severity": "<critical|high|medium|low|info>",
+  "category": "<bug|security|architecture|style|performance>"
+}
+When you have flagged all issues, respond with:
+{
+  "action_type": "approve",
+  "body": "<summary of review>",
+  "verdict": "lgtm"
+}
+If there are serious issues that block merge:
+{
+  "action_type": "request_changes",
+  "body": "<summary of required changes>",
+  "verdict": "request_changes"
+}
+Respond ONLY with the JSON object. No markdown, no explanation outside the JSON."""
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.history = []
+    def decide(self, observation: dict) -> dict:
+        import json
+        import urllib.request
+        diff = observation.get("diff", "")
+        pr_title = observation.get("pr_title", "")
+        step = observation.get("step_count", 0)
+        user_content = f"PR Title: {pr_title}\n\nDiff:\n{diff}\n\nStep {step}: What is your next review action?"
+        self.history.append({"role": "user", "content": user_content})
+        payload = json.dumps({
+            "model": "claude-sonnet-4-20250514",
+            "max_tokens": 512,
+            "system": self.SYSTEM_PROMPT,
+            "messages": self.history
+        }).encode()
+        req = urllib.request.Request(
+            "https://api.anthropic.com/v1/messages",
+            data=payload,
+            headers={
+                "Content-Type": "application/json",
+                "x-api-key": self.api_key,
+                "anthropic-version": "2023-06-01"
+            }
+        )
+        try:
+            with urllib.request.urlopen(req, timeout=30) as resp:
+                data = json.loads(resp.read())
+                text = data["content"][0]["text"].strip()
+                # Strip markdown fences if present
+                if text.startswith("```"):
+                    text = text.split("```")[1]
+                    if text.startswith("json"):
+                        text = text[4:]
+                action = json.loads(text)
+                self.history.append({"role": "assistant", "content": text})
+                return action
+        except Exception as e:
+            # Fall back to approve on error
+            return {"action_type": "approve", "body": f"LLM error, approving: {e}", "verdict": "lgtm"}
+def run_episode(url: str, task_id: str, seed: int, agent, verbose: bool = False) -> dict:
+    """
+    Run a complete evaluation episode.
+    Returns result dict with final_score, steps, episode_id.
+    """
+    import requests
+    import time
+    start_time = time.time()
+    # Reset
+    resp = requests.post(f"{url}/reset", json={"task_id": task_id, "seed": seed}, timeout=10)
     resp.raise_for_status()
     data = resp.json()
     episode_id = data["episode_id"]
     obs = data["result"]["observation"]
+    if verbose:
+        print(f"\n{'='*60}")
+        print(f"Episode: {episode_id}")
+        print(f"Task: {task_id}, Seed: {seed}, Scenario: {obs.get('scenario_hash', '?')}")
+        print(f"{'='*60}")
     done = False
+    steps = 0
     while not done:
+        action = agent.decide(obs)
+        if verbose:
+            print(f"\nStep {steps + 1}: {action.get('action_type')} \u2014 {action.get('body', '')[:80]}")
+        step_resp = requests.post(f"{url}/step/{episode_id}", json=action, timeout=10)
         step_resp.raise_for_status()
         step_data = step_resp.json()
         obs = step_data["observation"]
+        done = step_data.get("done", False)
+        steps += 1
+    # Get final result
+    result_resp = requests.get(f"{url}/result/{episode_id}", timeout=10)
     result_resp.raise_for_status()
+    result = result_resp.json()
+    duration = time.time() - start_time
+    return {
+        "episode_id": episode_id,
+        "task_id": task_id,
+        "seed": seed,
+        "final_score": result.get("final_score", 0.0),
+        "steps_taken": result.get("steps_taken", steps),
+        "issues_found": result.get("issues_found", 0),
+        "issues_total": result.get("issues_total", 0),
+        "noise_penalties": result.get("noise_penalties", 0),
+        "terminated_reason": result.get("terminated_reason", "unknown"),
+        "duration_seconds": round(duration, 2)
+    }
+def save_results(results: list, output_path: str):
+    import json, csv
+    if output_path.endswith(".json"):
+        with open(output_path, "w") as f:
+            json.dump(results, f, indent=2)
+    elif output_path.endswith(".csv"):
+        if results:
+            with open(output_path, "w", newline="") as f:
+                writer = csv.DictWriter(f, fieldnames=results[0].keys())
+                writer.writeheader()
+                writer.writerows(results)
+def main():
+    parser = argparse.ArgumentParser(description="AgentOrg CodeReview Baseline Agent")
+    parser.add_argument("--url", default="http://localhost:7860")
+    parser.add_argument("--task", default="bug_detection",
+                        choices=["bug_detection", "security_audit", "architectural_review"])
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--agent", default="keyword", choices=["keyword", "llm"])
+    parser.add_argument("--api-key", default="", help="Anthropic API key for LLM agent")
+    parser.add_argument("--output", default="", help="Output file (.json or .csv)")
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--max-steps", type=int, default=None, help="Override max steps (for testing)")
     args = parser.parse_args()
+    # Create agent
+    if args.agent == "llm":
+        import os
+        api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY", "")
+        if not api_key:
+            print("ERROR: LLM agent requires --api-key or ANTHROPIC_API_KEY env var")
+            sys.exit(1)
+        agent = LLMAgent(api_key)
+    else:
+        agent = KeywordAgent()
+    # Check API connectivity
     try:
+        import requests
+        health = requests.get(f"{args.url}/health", timeout=5)
+        health.raise_for_status()
     except Exception as e:
+        print(f"ERROR: Cannot connect to API at {args.url}: {e}")
+        sys.exit(1)
+    # Run episode
+    try:
+        result = run_episode(args.url, args.task, args.seed, agent, args.verbose)
+        print(f"\nResult: score={result['final_score']:.3f} "
+              f"issues={result['issues_found']}/{result['issues_total']} "
+              f"steps={result['steps_taken']} "
+              f"reason={result['terminated_reason']}")
+        # Save output
+        if args.output:
+            save_results([result], args.output)
+            print(f"Results saved to: {args.output}")
+    except Exception as e:
+        print(f"Episode failed: {e}")
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

scripts/evaluate.py ADDED Viewed

	@@ -0,0 +1,120 @@

+#!/usr/bin/env python3
+"""
+Batch evaluation: runs all 30 scenarios and prints a summary report.
+Usage: python scripts/evaluate.py --url http://localhost:7860 --agent keyword --output results.json
+"""
+import argparse
+import sys
+import json
+import time
+from pathlib import Path
+# Add project root to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from scripts.baseline import KeywordAgent, LLMAgent, run_episode, save_results
+TASKS = ["bug_detection", "security_audit", "architectural_review"]
+SEEDS = list(range(10))
+def run_batch_evaluation(url: str, agent, verbose: bool = False) -> list:
+    """Run all 30 scenarios and return results."""
+    all_results = []
+    for task in TASKS:
+        print(f"\n\u2500\u2500 Task: {task} \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500")
+        for seed in SEEDS:
+            try:
+                result = run_episode(url, task, seed, agent, verbose)
+                all_results.append(result)
+                score = result["final_score"]
+                bar = "\u2588" * int(score * 10) + "\u2591" * (10 - int(score * 10))
+                print(f"  Seed {seed:2d}: [{bar}] {score:.3f}  ({result['issues_found']}/{result['issues_total']} issues)")
+            except Exception as e:
+                print(f"  Seed {seed:2d}: FAILED \u2014 {e}")
+                all_results.append({"task_id": task, "seed": seed, "final_score": 0.0, "error": str(e)})
+    return all_results
+def print_summary(results: list):
+    """Print a formatted summary report."""
+    from collections import defaultdict
+    import statistics
+    print("\n" + "="*60)
+    print("EVALUATION SUMMARY")
+    print("="*60)
+    by_task = defaultdict(list)
+    for r in results:
+        if "error" not in r:
+            by_task[r["task_id"]].append(r["final_score"])
+    overall_scores = [s for scores in by_task.values() for s in scores]
+    for task, scores in by_task.items():
+        if scores:
+            print(f"\n{task.upper().replace('_', ' ')}")
+            print(f"  Mean:   {statistics.mean(scores):.3f}")
+            print(f"  Median: {statistics.median(scores):.3f}")
+            print(f"  Stdev:  {statistics.stdev(scores) if len(scores) > 1 else 0:.3f}")
+            print(f"  Best:   {max(scores):.3f}")
+            print(f"  Worst:  {min(scores):.3f}")
+    if overall_scores:
+        print(f"\nOVERALL ({len(overall_scores)}/30 scenarios)")
+        print(f"  Mean score: {statistics.mean(overall_scores):.3f}")
+        print(f"  Success rate (>0.5): {sum(1 for s in overall_scores if s > 0.5)/len(overall_scores)*100:.1f}%")
+    print("="*60)
+def main():
+    parser = argparse.ArgumentParser(description="Batch evaluation of all 30 CodeReview scenarios")
+    parser.add_argument("--url", default="http://localhost:7860")
+    parser.add_argument("--agent", default="keyword", choices=["keyword", "llm"])
+    parser.add_argument("--api-key", default="")
+    parser.add_argument("--output", default="results.json", help="Output file (.json or .csv)")
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--task", default=None,
+                        choices=["bug_detection", "security_audit", "architectural_review", None],
+                        help="Run only a specific task (default: all)")
+    args = parser.parse_args()
+    if args.agent == "llm":
+        import os
+        api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY", "")
+        if not api_key:
+            print("ERROR: LLM agent requires --api-key or ANTHROPIC_API_KEY env var")
+            sys.exit(1)
+        agent = LLMAgent(api_key)
+    else:
+        agent = KeywordAgent()
+    # Check connectivity
+    try:
+        import requests
+        requests.get(f"{args.url}/health", timeout=5).raise_for_status()
+    except Exception as e:
+        print(f"ERROR: Cannot connect to {args.url}: {e}")
+        sys.exit(1)
+    global TASKS
+    if args.task:
+        TASKS = [args.task]
+    print(f"Running evaluation: {len(TASKS)} task(s), {len(SEEDS)} seeds each")
+    print(f"Agent: {args.agent} | API: {args.url}")
+    start = time.time()
+    results = run_batch_evaluation(args.url, agent, args.verbose)
+    print(f"\nCompleted in {time.time()-start:.1f}s")
+    print_summary(results)
+    if args.output:
+        save_results(results, args.output)
+        print(f"\nResults saved to: {args.output}")
+if __name__ == "__main__":
+    main()