#!/usr/bin/env python3
"""
baseline_agent.py – Baseline inference script for CodeReview OpenEnv.

Runs gpt-4o (override via the BASELINE_MODEL environment variable) against all
three tasks using the OpenAI client.
Reads credentials from OPENAI_API_KEY environment variable.
Connects to the env either locally (direct Python import) or via HTTP.

Usage
-----
    # Direct mode (no server needed):
    python baseline_agent.py

    # Against a running server:
    python baseline_agent.py --mode http --base-url http://localhost:7860

    # Single task:
    python baseline_agent.py --task task_2_medium
"""

from __future__ import annotations

import argparse
import json
import os
import sys
import textwrap
import time
from typing import List

import requests
from openai import OpenAI

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

MODEL = os.environ.get("BASELINE_MODEL", "gpt-4o")
API_KEY = os.environ.get("OPENAI_API_KEY", "")
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:7860")
TASKS = ["task_1_easy", "task_2_medium", "task_3_hard"]

# ---------------------------------------------------------------------------
# Prompt construction
# ---------------------------------------------------------------------------

SYSTEM_PROMPT = textwrap.dedent("""
You are an expert Python code reviewer.
You will be given a code snippet along with review instructions.
Your job is to produce a JSON action object that identifies issues in the code.

The JSON object you return must match this schema exactly:
{
  "comments": [
    {
      "line": <int or null>,
      "category": <"bug"|"security"|"performance"|"style"|"documentation">,
      "severity": <"low"|"medium"|"high"|"critical">,
      "message": "<clear description of the issue>",
      "suggestion": "<optional fix>"
    }
  ],
  "summary": "<overall assessment – required for hard tasks, optional otherwise>",
  "submit": true
}

Rules:
- Only flag genuine issues. Do not fabricate problems.
- Be precise about line numbers (1-indexed from the code).
- Match the categories listed in the instructions.
- Always set "submit": true when you believe your review is complete.
- Return ONLY the JSON object. No markdown, no explanations.
""").strip()


def build_user_message(observation: dict) -> str:
    snippet = observation["snippet"]
    instructions = observation["instructions"]
    previous = observation.get("previous_comments", [])

    numbered_source = "\n".join(
        f"{i+1:3d}  {line}"
        for i, line in enumerate(snippet["source"].splitlines())
    )

    msg = f"""
{instructions}

### File: {snippet['file_name']}
```python
{numbered_source}
```
"""
    if previous:
        msg += f"\n### Your previous comments ({len(previous)} so far):\n"
        for c in previous:
            msg += f"  - L{c.get('line','?')} [{c['category']}] {c['message'][:80]}\n"

    return msg.strip()
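
# Example of the rendered user message (instructions and snippet contents are
# hypothetical; note the 1-indexed line numbers the system prompt refers to):
#
#   Review this code for bugs.
#
#   ### File: utils.py
#   ```python
#     1  def div(a, b):
#     2      return a / b
#   ```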


# ---------------------------------------------------------------------------
# Direct mode (import env directly)
# ---------------------------------------------------------------------------

def run_direct(task_id: str, client: OpenAI) -> dict:
    """Run the agent against the environment by direct Python import."""
    # Imported lazily so HTTP mode does not require the env package on the path
    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
    from env.environment import CodeReviewEnv
    from env.models import Action, ReviewComment, ReviewCategory, Severity

    env = CodeReviewEnv(task_id=task_id)
    obs = env.reset()

    total_reward = 0.0
    final_score = 0.0
    steps_taken = 0

    for step_num in range(env.spec.max_steps):
        user_msg = build_user_message(obs.model_dump())

        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.2,
                response_format={"type": "json_object"},
            )
            raw = response.choices[0].message.content or "{}"
            action_dict = json.loads(raw)
        except Exception as e:
            print(f"  [!] LLM error on step {step_num}: {e}")
            action_dict = {"comments": [], "submit": True}

        # Build Action
        comments = []
        for c in action_dict.get("comments", []):
            try:
                comments.append(ReviewComment(
                    line=c.get("line"),
                    category=ReviewCategory(c.get("category", "bug")),
                    severity=Severity(c.get("severity", "medium")),
                    message=c.get("message", ""),
                    suggestion=c.get("suggestion"),
                ))
            except Exception:
                pass  # skip malformed comments

        action = Action(
            comments=comments,
            summary=action_dict.get("summary"),
            submit=action_dict.get("submit", True),
        )

        result = env.step(action)
        total_reward += result.reward.value
        steps_taken += 1
        final_score = result.info.get("grader", {}).get("score", 0.0)

        print(f"  Step {step_num+1}: reward={result.reward.value:+.3f} | "
              f"comments={result.info['total_comments']} | "
              f"score={final_score:.3f}")

        obs = result.observation
        if result.done:
            break

    passed = final_score >= env.spec.passing_threshold
    return {
        "task_id": task_id,
        "steps": steps_taken,
        "total_reward": round(total_reward, 4),
        "final_score": round(final_score, 4),
        "passed": passed,
        "threshold": env.spec.passing_threshold,
    }


# ---------------------------------------------------------------------------
# HTTP mode (against a running server)
# ---------------------------------------------------------------------------

def run_http(task_id: str, client: OpenAI, base_url: str) -> dict:
    """Run the agent against a live HTTP server."""
    session_id = f"baseline-{task_id}-{int(time.time())}"
    headers = {"Content-Type": "application/json"}

    # Reset
    r = requests.post(f"{base_url}/reset",
                      json={"task_id": task_id, "session_id": session_id},
                      headers=headers, timeout=60)
    r.raise_for_status()
    obs = r.json()["observation"]

    # Fetch the task spec for the step budget and pass threshold; /tasks is
    # assumed to return a mapping of task_id -> spec dict
    tasks_r = requests.get(f"{base_url}/tasks", timeout=60)
    tasks_r.raise_for_status()
    spec = tasks_r.json()[task_id]
    max_steps = spec["max_steps"]
    threshold = spec["passing_threshold"]
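
    # Example wire format for one step (shapes inferred from the usage below;
    # all field values are illustrative):
    #   POST /step  {"session_id": "...", "action": {"comments": [...], "submit": true}}
    #   -> {"observation": {...}, "reward": {"value": 0.25}, "done": false,
    #       "info": {"total_comments": 2, "grader": {"score": 0.5}}}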

    total_reward = 0.0
    final_score = 0.0
    steps_taken = 0

    for step_num in range(max_steps):
        user_msg = build_user_message(obs)

        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.2,
                response_format={"type": "json_object"},
            )
            action_dict = json.loads(response.choices[0].message.content or "{}")
        except Exception as e:
            print(f"  [!] LLM error: {e}")
            action_dict = {"comments": [], "submit": True}

        step_r = requests.post(
            f"{base_url}/step",
            json={"session_id": session_id, "action": action_dict},
            headers=headers,
            timeout=120,  # grading the final submit step may be slow
        )
        step_r.raise_for_status()
        result = step_r.json()

        total_reward += result["reward"]["value"]
        steps_taken += 1
        final_score = result["info"].get("grader", {}).get("score", 0.0)

        print(f"  Step {step_num+1}: reward={result['reward']['value']:+.3f} | "
              f"comments={result['info']['total_comments']} | "
              f"score={final_score:.3f}")

        obs = result["observation"]
        if result["done"]:
            break

    return {
        "task_id": task_id,
        "steps": steps_taken,
        "total_reward": round(total_reward, 4),
        "final_score": round(final_score, 4),
        "passed": final_score >= threshold,
        "threshold": threshold,
    }
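
# Manual smoke test of the same endpoints (assumes a server on localhost:7860;
# the session id is arbitrary):
#   curl -X POST http://localhost:7860/reset \
#        -H 'Content-Type: application/json' \
#        -d '{"task_id": "task_1_easy", "session_id": "smoke-1"}'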


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(description="Baseline agent for CodeReview OpenEnv")
    parser.add_argument("--mode", choices=["direct", "http"], default="direct")
    parser.add_argument("--base-url", default=ENV_BASE_URL)
    parser.add_argument("--task", choices=TASKS + ["all"], default="all")
    args = parser.parse_args()

    if not API_KEY:
        print("ERROR: OPENAI_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(api_key=API_KEY)
    tasks_to_run = TASKS if args.task == "all" else [args.task]

    print(f"\n{'='*60}")
    print(f"  CodeReview OpenEnv – Baseline Agent ({MODEL})")
    print(f"  Mode: {args.mode}")
    print(f"{'='*60}\n")

    results: List[dict] = []
    for task_id in tasks_to_run:
        print(f"β–Ά Running {task_id} ...")
        t0 = time.time()
        if args.mode == "direct":
            r = run_direct(task_id, client)
        else:
            r = run_http(task_id, client, args.base_url)
        elapsed = round(time.time() - t0, 1)
        r["elapsed_s"] = elapsed
        results.append(r)
        status = "βœ… PASSED" if r["passed"] else "❌ FAILED"
        print(f"  β†’ {status} | score={r['final_score']:.3f} | reward={r['total_reward']:+.3f} | {elapsed}s\n")

    # Summary table
    print(f"\n{'='*60}")
    print(f"  BASELINE RESULTS")
    print(f"{'='*60}")
    print(f"  {'Task':<22} {'Score':>7} {'Threshold':>10} {'Reward':>8} {'Pass':>6}")
    print(f"  {'-'*55}")
    for r in results:
        print(f"  {r['task_id']:<22} {r['final_score']:>7.3f} {r['threshold']:>10.2f} "
              f"{r['total_reward']:>+8.3f} {'βœ…' if r['passed'] else '❌':>6}")
    avg_score = sum(r["final_score"] for r in results) / len(results)
    pass_rate = sum(1 for r in results if r["passed"]) / len(results)
    print(f"  {'-'*55}")
    print(f"  {'AVERAGE':<22} {avg_score:>7.3f} {'':>10} {'':>8} {pass_rate*100:>5.0f}%")
    print(f"{'='*60}\n")

    # Save results
    out_path = "baseline_results.json"
    with open(out_path, "w") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print(f"  Results saved to {out_path}")


if __name__ == "__main__":
    main()