Spaces:
Configuration error
Configuration error
| #!/usr/bin/env python3 | |
| """ | |
| baseline_agent.py — Baseline inference script for CodeReview OpenEnv. | |
| Runs gpt-4o against all three tasks using the OpenAI client. | |
| Reads credentials from OPENAI_API_KEY environment variable. | |
| Connects to the env either locally (direct Python import) or via HTTP. | |
| Usage | |
| ----- | |
| # Direct mode (no server needed): | |
| python baseline_agent.py | |
| # Against a running server: | |
| python baseline_agent.py --mode http --base-url http://localhost:7860 | |
| # Single task: | |
| python baseline_agent.py --task task_2_medium | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| import textwrap | |
| import time | |
| from typing import Any, Dict, List, Optional | |
| import requests | |
| from openai import OpenAI | |
| # --------------------------------------------------------------------------- | |
| # Configuration | |
| # --------------------------------------------------------------------------- | |
# Chat model used for the baseline; override with the BASELINE_MODEL env var.
MODEL = os.environ.get("BASELINE_MODEL", "gpt-4o")

# OpenAI credential; defaults to an empty string when the variable is unset.
API_KEY = os.environ.get("OPENAI_API_KEY", "")

# Default server address for HTTP mode; override with the ENV_BASE_URL env var.
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:7860")

# Task identifiers the baseline knows how to run, in execution order.
TASKS = ["task_1_easy", "task_2_medium", "task_3_hard"]
| # --------------------------------------------------------------------------- | |
| # Prompt construction | |
| # --------------------------------------------------------------------------- | |
# System message sent with every completion request. It pins the JSON shape
# of the action the model must return; dedent/strip remove the literal's
# leading indentation and surrounding newlines.
SYSTEM_PROMPT = textwrap.dedent("""
    You are an expert Python code reviewer.
    You will be given a code snippet along with review instructions.
    Your job is to produce a JSON action object that identifies issues in the code.
    The JSON object you return must match this schema exactly:
    {
      "comments": [
        {
          "line": <int or null>,
          "category": <"bug"|"security"|"performance"|"style"|"documentation">,
          "severity": <"low"|"medium"|"high"|"critical">,
          "message": "<clear description of the issue>",
          "suggestion": "<optional fix>"
        }
      ],
      "summary": "<overall assessment — required for hard tasks, optional otherwise>",
      "submit": true
    }
    Rules:
    - Only flag genuine issues. Do not fabricate problems.
    - Be precise about line numbers (1-indexed from the code).
    - Match the categories listed in the instructions.
    - Always set "submit": true when you believe your review is complete.
    - Return ONLY the JSON object. No markdown, no explanations.
""").strip()
def build_user_message(observation: dict) -> str:
    """Render an environment observation as the user prompt for the model.

    Args:
        observation: Mapping with an ``"instructions"`` string, a
            ``"snippet"`` mapping holding ``"file_name"`` and ``"source"``,
            and optionally ``"previous_comments"`` (a list of comment
            mappings, or ``None``).

    Returns:
        The instructions, the snippet as a line-numbered fenced code block,
        and, when earlier comments exist, a short recap of them.

    Raises:
        KeyError: If ``"snippet"``, ``"instructions"``, ``"file_name"`` or
            ``"source"`` is missing.
    """
    snippet = observation["snippet"]
    instructions = observation["instructions"]
    # Treat an explicit None the same as a missing key.
    previous = observation.get("previous_comments") or []

    # Prefix each source line with its 1-based number so the model can cite it.
    numbered_source = "\n".join(
        f"{number:3d} {line}"
        for number, line in enumerate(snippet["source"].splitlines(), start=1)
    )

    parts = [
        instructions,
        f"### File: {snippet['file_name']}",
        "```python",
        numbered_source,
        "```",
    ]

    if previous:
        parts.append("")
        parts.append(f"### Your previous comments ({len(previous)} so far):")
        for comment in previous:
            # The action schema allows "line": null, so map None to "?" too,
            # not only a missing key.
            line_no = comment.get("line")
            label = "?" if line_no is None else line_no
            category = comment.get("category", "?")
            # Cap each recap entry at 80 characters to keep the prompt short.
            message = (comment.get("message") or "")[:80]
            parts.append(f" - L{label} [{category}] {message}")

    return "\n".join(parts).strip()
| # --------------------------------------------------------------------------- | |
| # Direct mode (import env directly) | |
| # --------------------------------------------------------------------------- | |
def run_direct(task_id: str, client: OpenAI) -> dict:
    """Run the agent against the environment by direct Python import.

    Args:
        task_id: Identifier of the task, passed to ``CodeReviewEnv``.
        client: OpenAI client used to request one review action per step.

    Returns:
        A dict with ``task_id``, ``steps``, ``total_reward``, ``final_score``,
        ``passed`` and ``threshold``.
    """
    # Import here to avoid circular dependency when running in HTTP mode.
    # Use an absolute directory and only insert it when it is not already
    # first, so repeated calls do not keep growing sys.path.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    if not sys.path or sys.path[0] != script_dir:
        sys.path.insert(0, script_dir)
    from env.environment import CodeReviewEnv
    from env.models import Action, ReviewComment, ReviewCategory, Severity

    env = CodeReviewEnv(task_id=task_id)
    obs = env.reset()
    total_reward = 0.0
    final_score = 0.0
    steps_taken = 0

    for step_num in range(env.spec.max_steps):
        user_msg = build_user_message(obs.model_dump())
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.2,
                response_format={"type": "json_object"},
            )
            raw = response.choices[0].message.content or "{}"
            action_dict = json.loads(raw)
            if not isinstance(action_dict, dict):
                raise ValueError("model reply is not a JSON object")
        except Exception as e:
            # Best effort: on any LLM/parse failure submit an empty review
            # rather than aborting the whole run.
            print(f" [!] LLM error on step {step_num + 1}: {e}")
            action_dict = {"comments": [], "submit": True}

        # Build Action. A null or non-list "comments" value is treated as
        # "no comments" instead of crashing the loop.
        raw_comments = action_dict.get("comments")
        if not isinstance(raw_comments, list):
            raw_comments = []
        comments = []
        for c in raw_comments:
            try:
                comments.append(ReviewComment(
                    line=c.get("line"),
                    category=ReviewCategory(c.get("category", "bug")),
                    severity=Severity(c.get("severity", "medium")),
                    message=c.get("message", ""),
                    suggestion=c.get("suggestion"),
                ))
            except Exception as e:
                # Skip malformed comments, but say so: a silently dropped
                # comment makes a low score impossible to diagnose.
                print(f" [!] Skipping malformed comment on step {step_num + 1}: {e}")

        action = Action(
            comments=comments,
            summary=action_dict.get("summary"),
            submit=action_dict.get("submit", True),
        )
        result = env.step(action)
        total_reward += result.reward.value
        steps_taken += 1
        # Keep the most recent grader score; 0.0 when the step reports none.
        final_score = result.info.get("grader", {}).get("score", 0.0)
        print(f" Step {step_num+1}: reward={result.reward.value:+.3f} | "
              f"comments={result.info['total_comments']} | "
              f"score={final_score:.3f}")
        obs = result.observation
        if result.done:
            break

    passed = final_score >= env.spec.passing_threshold
    return {
        "task_id": task_id,
        "steps": steps_taken,
        "total_reward": round(total_reward, 4),
        "final_score": round(final_score, 4),
        "passed": passed,
        "threshold": env.spec.passing_threshold,
    }
| # --------------------------------------------------------------------------- | |
| # HTTP mode (against a running server) | |
| # --------------------------------------------------------------------------- | |
def run_http(task_id: str, client: OpenAI, base_url: str) -> dict:
    """Run the agent against a live HTTP server.

    Args:
        task_id: Identifier of the task; sent to ``/reset`` and used to look
            up the task's entry in the ``/tasks`` response.
        client: OpenAI client used to request one review action per step.
        base_url: Root URL of the server; a trailing slash is tolerated.

    Returns:
        A dict with ``task_id``, ``steps``, ``total_reward``, ``final_score``,
        ``passed`` and ``threshold``.

    Raises:
        requests.HTTPError: If ``/reset``, ``/tasks`` or ``/step`` answers
            with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    # Avoid "//reset" style URLs when the caller passes "http://host/".
    base_url = base_url.rstrip("/")
    # Seconds to wait on each request; without it a stalled server would
    # hang the run indefinitely.
    request_timeout = 60
    session_id = f"baseline-{task_id}-{int(time.time())}"
    headers = {"Content-Type": "application/json"}

    # Reset
    r = requests.post(
        f"{base_url}/reset",
        json={"task_id": task_id, "session_id": session_id},
        headers=headers,
        timeout=request_timeout,
    )
    r.raise_for_status()
    obs = r.json()["observation"]

    # Get task spec for threshold
    tasks_r = requests.get(f"{base_url}/tasks", timeout=request_timeout)
    # Fail with the HTTP error itself, not a confusing KeyError/JSON error.
    tasks_r.raise_for_status()
    spec = tasks_r.json()[task_id]
    max_steps = spec["max_steps"]
    threshold = spec["passing_threshold"]

    total_reward = 0.0
    final_score = 0.0
    steps_taken = 0

    for step_num in range(max_steps):
        user_msg = build_user_message(obs)
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_msg},
                ],
                temperature=0.2,
                response_format={"type": "json_object"},
            )
            action_dict = json.loads(response.choices[0].message.content or "{}")
            if not isinstance(action_dict, dict):
                raise ValueError("model reply is not a JSON object")
        except Exception as e:
            # Best effort: on any LLM/parse failure submit an empty review
            # rather than aborting the whole run.
            print(f" [!] LLM error: {e}")
            action_dict = {"comments": [], "submit": True}

        step_r = requests.post(
            f"{base_url}/step",
            json={"session_id": session_id, "action": action_dict},
            headers=headers,
            timeout=request_timeout,
        )
        step_r.raise_for_status()
        result = step_r.json()
        total_reward += result["reward"]["value"]
        steps_taken += 1
        # Keep the most recent grader score; 0.0 when the step reports none.
        final_score = result["info"].get("grader", {}).get("score", 0.0)
        print(f" Step {step_num+1}: reward={result['reward']['value']:+.3f} | "
              f"comments={result['info']['total_comments']} | "
              f"score={final_score:.3f}")
        obs = result["observation"]
        if result["done"]:
            break

    return {
        "task_id": task_id,
        "steps": steps_taken,
        "total_reward": round(total_reward, 4),
        "final_score": round(final_score, 4),
        "passed": final_score >= threshold,
        "threshold": threshold,
    }
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
def main():
    """Parse CLI options, run the selected tasks, print a summary, save results.

    Exits with status 1 when ``API_KEY`` is empty. Per-task results are
    written to ``baseline_results.json`` in the current working directory.
    """
    parser = argparse.ArgumentParser(description="Baseline agent for CodeReview OpenEnv")
    parser.add_argument("--mode", choices=["direct", "http"], default="direct")
    parser.add_argument("--base-url", default=ENV_BASE_URL)
    parser.add_argument("--task", choices=TASKS + ["all"], default="all")
    args = parser.parse_args()

    if not API_KEY:
        # Diagnostics belong on stderr so stdout stays clean for results.
        print("ERROR: OPENAI_API_KEY environment variable not set.", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(api_key=API_KEY)
    tasks_to_run = TASKS if args.task == "all" else [args.task]
    rule = "=" * 60

    print(f"\n{rule}")
    print(f" CodeReview OpenEnv — Baseline Agent ({MODEL})")
    print(f" Mode: {args.mode}")
    print(f"{rule}\n")

    results: List[dict] = []
    for task_id in tasks_to_run:
        print(f"▶ Running {task_id} ...")
        t0 = time.time()
        if args.mode == "direct":
            r = run_direct(task_id, client)
        else:
            r = run_http(task_id, client, args.base_url)
        elapsed = round(time.time() - t0, 1)
        r["elapsed_s"] = elapsed
        results.append(r)
        status = "✅ PASSED" if r["passed"] else "❌ FAILED"
        print(f" → {status} | score={r['final_score']:.3f} | reward={r['total_reward']:+.3f} | {elapsed}s\n")

    # Summary table
    print(f"\n{rule}")
    print(" BASELINE RESULTS")
    print(rule)
    print(f" {'Task':<22} {'Score':>7} {'Threshold':>10} {'Reward':>8} {'Pass':>6}")
    print(f" {'-'*55}")
    for r in results:
        print(f" {r['task_id']:<22} {r['final_score']:>7.3f} {r['threshold']:>10.2f} "
              f"{r['total_reward']:>+8.3f} {'✅' if r['passed'] else '❌':>6}")
    # tasks_to_run always holds at least one task, so results is non-empty.
    avg_score = sum(r["final_score"] for r in results) / len(results)
    pass_rate = sum(1 for r in results if r["passed"]) / len(results)
    print(f" {'-'*55}")
    print(f" {'AVERAGE':<22} {avg_score:>7.3f} {'':>10} {'':>8} {pass_rate*100:>5.0f}%")
    print(f"{rule}\n")

    # Save results
    out_path = "baseline_results.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print(f" Results saved to {out_path}")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()