code-review / baseline_agent.py
Avnishjain's picture
Upload 26 files
ec566e9 verified
#!/usr/bin/env python3
"""
baseline_agent.py – Baseline inference script for CodeReview OpenEnv.
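Runs an OpenAI chat model (gpt-4o by default; override with the BASELINE_MODEL
environment variable) against all three tasks using the OpenAI client.
Reads credentials from the OPENAI_API_KEY environment variable.
Connects to the env either locally (direct Python import) or via HTTP; the
default HTTP base URL can be set with the ENV_BASE_URL environment variable.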
Usage
-----
# Direct mode (no server needed):
python baseline_agent.py
# Against a running server:
python baseline_agent.py --mode http --base-url http://localhost:7860
# Single task:
python baseline_agent.py --task task_2_medium
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import textwrap
import time
from typing import Any, Dict, List, Optional
import requests
from openai import OpenAI
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Model name, API key and environment server URL come from the process
# environment; the second argument is the fallback used when the variable
# is unset.
MODEL = os.getenv("BASELINE_MODEL", "gpt-4o")
API_KEY = os.getenv("OPENAI_API_KEY", "")
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
# Task identifiers accepted by --task; all of them run when --task is "all".
TASKS = ["task_1_easy", "task_2_medium", "task_3_hard"]
# ---------------------------------------------------------------------------
# Prompt construction
# ---------------------------------------------------------------------------
# System prompt sent as the "system" message of every chat-completion request
# made by run_direct() and run_http(). It spells out the JSON action object the
# model must return ("comments", "summary", "submit") and the reviewing rules.
# The surrounding newlines of the literal are removed by .strip().
# NOTE(review): the schema lines below carry no leading indentation as seen
# here, so dedent() has nothing to remove -- confirm against the original file.
SYSTEM_PROMPT = textwrap.dedent("""
You are an expert Python code reviewer.
You will be given a code snippet along with review instructions.
Your job is to produce a JSON action object that identifies issues in the code.
The JSON object you return must match this schema exactly:
{
"comments": [
{
"line": <int or null>,
"category": <"bug"|"security"|"performance"|"style"|"documentation">,
"severity": <"low"|"medium"|"high"|"critical">,
"message": "<clear description of the issue>",
"suggestion": "<optional fix>"
}
],
"summary": "<overall assessment – required for hard tasks, optional otherwise>",
"submit": true
}
Rules:
- Only flag genuine issues. Do not fabricate problems.
- Be precise about line numbers (1-indexed from the code).
- Match the categories listed in the instructions.
- Always set "submit": true when you believe your review is complete.
- Return ONLY the JSON object. No markdown, no explanations.
""").strip()
def build_user_message(observation: dict) -> str:
    """Render an environment observation as the user message for the LLM.

    Args:
        observation: Observation mapping. Must contain ``"snippet"`` (a mapping
            with ``"source"`` and ``"file_name"``) and ``"instructions"``. May
            contain ``"previous_comments"``, a list of comment mappings with
            ``"category"`` and ``"message"`` and an optional ``"line"``.

    Returns:
        The instructions, then the snippet as a fenced listing with 1-indexed
        line numbers, then (only when previous comments exist) one summary
        line per comment. Leading and trailing whitespace is stripped.

    Raises:
        KeyError: If a required key is missing from the observation or from a
            previous comment.
    """
    snippet = observation["snippet"]
    instructions = observation["instructions"]
    # An explicit null is treated the same as a missing key.
    previous = observation.get("previous_comments") or []

    numbered_source = "\n".join(
        f"{number:3d} {line}"
        for number, line in enumerate(snippet["source"].splitlines(), start=1)
    )

    # Collect the pieces and join once instead of growing a string with +=.
    parts = [
        f"\n{instructions}\n"
        f"### File: {snippet['file_name']}\n"
        "```python\n"
        f"{numbered_source}\n"
        "```\n"
    ]

    if previous:
        parts.append(f"\n### Your previous comments ({len(previous)} so far):\n")
        for comment in previous:
            # The action schema allows "line": null, so fall back to "?" for
            # both a missing key and an explicit None (previously "LNone").
            line_no = comment.get("line")
            if line_no is None:
                line_no = "?"
            # Categories may arrive as enum members rather than plain strings
            # (presumably in direct mode -- verify against the env models);
            # show the underlying value so the text matches the prompt schema.
            category = comment["category"]
            category = getattr(category, "value", category)
            parts.append(
                f" - L{line_no} [{category}] {comment['message'][:80]}\n"
            )

    return "".join(parts).strip()
# ---------------------------------------------------------------------------
# Direct mode (import env directly)
# ---------------------------------------------------------------------------
def run_direct(task_id: str, client: OpenAI) -> dict:
    """Run the agent against the environment by direct Python import.

    Each step renders the current observation, asks the model for a JSON
    action, converts it into the environment's ``Action`` model and calls
    ``env.step``. The loop ends when the environment reports ``done`` or after
    ``env.spec.max_steps`` steps.

    Args:
        task_id: Identifier of the task to run.
        client: OpenAI client used for the chat-completion requests.

    Returns:
        A dict with the keys ``task_id``, ``steps``, ``total_reward``,
        ``final_score``, ``passed`` and ``threshold``.
    """
    # The env package is imported here rather than at module level, so it is
    # only needed in direct mode. Make this file's directory the first
    # sys.path entry without adding a duplicate on every call.
    here = os.path.dirname(os.path.abspath(__file__))
    if sys.path[:1] != [here]:
        sys.path.insert(0, here)
    from env.environment import CodeReviewEnv
    from env.models import Action, ReviewComment, ReviewCategory, Severity

    env = CodeReviewEnv(task_id=task_id)
    obs = env.reset()
    total_reward = 0.0
    final_score = 0.0
    steps_taken = 0

    for step_num in range(env.spec.max_steps):
        user_msg = build_user_message(obs.model_dump())

        # Best-effort boundary: any failure talking to the model or parsing
        # its reply is reported and replaced by an empty, submitting action so
        # the episode can still be graded.
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.2,
                response_format={"type": "json_object"},
            )
            raw = response.choices[0].message.content or "{}"
            action_dict = json.loads(raw)
            if not isinstance(action_dict, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(action_dict).__name__}"
                )
        except Exception as e:
            # Report the 1-based step so it lines up with the progress output.
            print(f" [!] LLM error on step {step_num + 1}: {e}")
            action_dict = {"comments": [], "submit": True}

        # Build Action. A null or non-list "comments" value is treated as empty.
        raw_comments = action_dict.get("comments") or []
        if not isinstance(raw_comments, list):
            print(f" [!] Ignoring non-list 'comments': {type(raw_comments).__name__}")
            raw_comments = []

        comments = []
        for index, c in enumerate(raw_comments):
            try:
                comments.append(ReviewComment(
                    line=c.get("line"),
                    category=ReviewCategory(c.get("category", "bug")),
                    severity=Severity(c.get("severity", "medium")),
                    message=c.get("message", ""),
                    suggestion=c.get("suggestion"),
                ))
            except Exception as e:
                # Still skip malformed comments, but say so instead of
                # dropping them silently.
                print(f" [!] Skipping malformed comment #{index + 1}: {e}")

        action = Action(
            comments=comments,
            summary=action_dict.get("summary"),
            submit=action_dict.get("submit", True),
        )

        result = env.step(action)
        total_reward += result.reward.value
        steps_taken += 1
        # The score reported is the one from the most recent step.
        final_score = result.info.get("grader", {}).get("score", 0.0)
        print(f" Step {step_num+1}: reward={result.reward.value:+.3f} | "
              f"comments={result.info['total_comments']} | "
              f"score={final_score:.3f}")

        obs = result.observation
        if result.done:
            break

    passed = final_score >= env.spec.passing_threshold
    return {
        "task_id": task_id,
        "steps": steps_taken,
        "total_reward": round(total_reward, 4),
        "final_score": round(final_score, 4),
        "passed": passed,
        "threshold": env.spec.passing_threshold,
    }
# ---------------------------------------------------------------------------
# HTTP mode (against a running server)
# ---------------------------------------------------------------------------
def run_http(task_id: str, client: OpenAI, base_url: str) -> dict:
    """Run the agent against a live HTTP server.

    Resets a session via ``POST /reset``, reads the task's ``max_steps`` and
    ``passing_threshold`` from ``GET /tasks``, then repeatedly asks the model
    for a JSON action and forwards it unchanged to ``POST /step`` until the
    server reports ``done`` or ``max_steps`` is reached.

    Args:
        task_id: Identifier of the task to run.
        client: OpenAI client used for the chat-completion requests.
        base_url: Root URL of the environment server; a trailing slash is
            tolerated.

    Returns:
        A dict with the keys ``task_id``, ``steps``, ``total_reward``,
        ``final_score``, ``passed`` and ``threshold``.

    Raises:
        requests.HTTPError: If any endpoint answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # Avoid "//reset"-style URLs when the caller passes a trailing slash.
    base_url = base_url.rstrip("/")
    # requests waits forever by default; bound every call so a stalled server
    # cannot hang the whole baseline run.
    timeout_s = 60
    session_id = f"baseline-{task_id}-{int(time.time())}"
    headers = {"Content-Type": "application/json"}

    # Reset
    r = requests.post(
        f"{base_url}/reset",
        json={"task_id": task_id, "session_id": session_id},
        headers=headers,
        timeout=timeout_s,
    )
    r.raise_for_status()
    obs = r.json()["observation"]

    # Get task spec for threshold. Check the status like the other calls so an
    # error page is not parsed as the task table.
    tasks_r = requests.get(f"{base_url}/tasks", timeout=timeout_s)
    tasks_r.raise_for_status()
    spec = tasks_r.json()[task_id]
    max_steps = spec["max_steps"]
    threshold = spec["passing_threshold"]

    total_reward = 0.0
    final_score = 0.0
    steps_taken = 0

    for step_num in range(max_steps):
        user_msg = build_user_message(obs)

        # Best-effort boundary: on any model or parsing failure, fall back to
        # an empty, submitting action so the episode can still be graded.
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.2,
                response_format={"type": "json_object"},
            )
            action_dict = json.loads(response.choices[0].message.content or "{}")
            if not isinstance(action_dict, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(action_dict).__name__}"
                )
        except Exception as e:
            print(f" [!] LLM error: {e}")
            action_dict = {"comments": [], "submit": True}

        step_r = requests.post(
            f"{base_url}/step",
            json={"session_id": session_id, "action": action_dict},
            headers=headers,
            timeout=timeout_s,
        )
        step_r.raise_for_status()
        result = step_r.json()

        total_reward += result["reward"]["value"]
        steps_taken += 1
        # The score reported is the one from the most recent step.
        final_score = result["info"].get("grader", {}).get("score", 0.0)
        print(f" Step {step_num+1}: reward={result['reward']['value']:+.3f} | "
              f"comments={result['info']['total_comments']} | "
              f"score={final_score:.3f}")

        obs = result["observation"]
        if result["done"]:
            break

    return {
        "task_id": task_id,
        "steps": steps_taken,
        "total_reward": round(total_reward, 4),
        "final_score": round(final_score, 4),
        "passed": final_score >= threshold,
        "threshold": threshold,
    }
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse the command line, run the selected tasks and report the results.

    Exits with status 1 when no API key is configured. Otherwise runs each
    selected task in direct or HTTP mode, prints a per-task line and a summary
    table, and writes the results to ``baseline_results.json`` in the current
    working directory.
    """
    parser = argparse.ArgumentParser(description="Baseline agent for CodeReview OpenEnv")
    parser.add_argument("--mode", choices=["direct", "http"], default="direct")
    parser.add_argument("--base-url", default=ENV_BASE_URL)
    parser.add_argument("--task", choices=TASKS + ["all"], default="all")
    args = parser.parse_args()

    if not API_KEY:
        print("ERROR: OPENAI_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY)
    tasks_to_run = TASKS if args.task == "all" else [args.task]

    print(f"\n{'='*60}")
    print(f" CodeReview OpenEnv – Baseline Agent ({MODEL})")
    print(f" Mode: {args.mode}")
    print(f"{'='*60}\n")

    results: List[dict] = []
    for task_id in tasks_to_run:
        # The status glyphs below were mis-encoded (UTF-8 bytes decoded with a
        # legacy code page); they are restored to the intended characters.
        print(f"▶ Running {task_id} ...")
        t0 = time.time()
        if args.mode == "direct":
            r = run_direct(task_id, client)
        else:
            r = run_http(task_id, client, args.base_url)
        elapsed = round(time.time() - t0, 1)
        r["elapsed_s"] = elapsed
        results.append(r)
        status = "✅ PASSED" if r["passed"] else "❌ FAILED"
        print(f" → {status} | score={r['final_score']:.3f} | reward={r['total_reward']:+.3f} | {elapsed}s\n")

    # Summary table
    print(f"\n{'='*60}")
    print(" BASELINE RESULTS")
    print(f"{'='*60}")
    print(f" {'Task':<22} {'Score':>7} {'Threshold':>10} {'Reward':>8} {'Pass':>6}")
    print(f" {'-'*55}")
    for r in results:
        print(f" {r['task_id']:<22} {r['final_score']:>7.3f} {r['threshold']:>10.2f} "
              f"{r['total_reward']:>+8.3f} {'✅' if r['passed'] else '❌':>6}")

    # tasks_to_run is never empty (argparse restricts --task), so the
    # divisions below cannot be by zero.
    avg_score = sum(r["final_score"] for r in results) / len(results)
    pass_rate = sum(1 for r in results if r["passed"]) / len(results)
    print(f" {'-'*55}")
    print(f" {'AVERAGE':<22} {avg_score:>7.3f} {'':>10} {'':>8} {pass_rate*100:>5.0f}%")
    print(f"{'='*60}\n")

    # Save results. The encoding is explicit so the file does not depend on
    # the platform's default.
    out_path = "baseline_results.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print(f" Results saved to {out_path}")


if __name__ == "__main__":
    main()