#!/usr/bin/env python3
"""
Baseline inference script.
Runs an LLM agent on all 3 tasks using OpenAI API.
Usage: python baseline/run_baseline.py [--output json]
Requires: OPENAI_API_KEY environment variable.
"""
import asyncio
import sys
import json
import os
from pathlib import Path
# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from code_debug_env.client import CodeDebugEnv
from code_debug_env.models import Action
try:
from openai import AsyncOpenAI
except ImportError:
print("Please install openai: pip install openai", file=sys.stderr)
sys.exit(1)
# Base URL of the running CodeDebugEnv HTTP server.
BASE_URL = os.getenv("OPENENV_URL", "http://127.0.0.1:8000")
# OpenAI-compatible API endpoint; override to target a proxy or local server.
API_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
# Model name passed to the chat completions API.
MODEL_NAME = os.getenv("OPENENV_MODEL", "gpt-4o-mini")
# Lazily created AsyncOpenAI client singleton; see get_openai_client().
_client = None
def get_openai_client():
    """Return the process-wide AsyncOpenAI client, creating it on first use.

    Returns None when OPENAI_API_KEY is not set, leaving the singleton
    unassigned so a later call can retry after the key appears.
    """
    global _client
    if _client is not None:
        return _client
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        return None
    _client = AsyncOpenAI(api_key=key, base_url=API_BASE_URL)
    return _client
async def openai_agent(observation) -> Action:
    """Ask the LLM to suggest a code fix for the current observation.

    Builds a debugging prompt from the task description, the buggy code and
    the latest test results, then parses the model's JSON reply into an
    Action. On any failure (missing API key, API error, malformed JSON,
    missing "patch" key) it degrades to a no-op patch echoing the original
    buggy code so the evaluation loop never crashes.
    """
    prompt = f"""You are an expert Python debugger. Your task is to fix the buggy code below.
Task Description: {observation.task_description}
Buggy Code:
```python
{observation.buggy_code}
```
Test Results so far:
{[[t.name, t.passed, t.error] for t in observation.test_results]}
Passed {observation.passed} out of {observation.total} tests.
Provide ONLY a valid JSON object matching this schema:
{{
"patch": "The FULL python function as a string, with the bugs fixed",
"task_id": "{observation.task_id}",
"think": "Your chain-of-thought reasoning before patching (important!)"
}}
"""
    client = get_openai_client()
    if not client:
        return Action(
            patch=observation.buggy_code,
            task_id=observation.task_id,
            think="Skipping LLM call: OPENAI_API_KEY not set.",
        )
    # Build kwargs conditionally: the SDK/API rejects an explicit
    # response_format=None, so the key must be omitted entirely for models
    # that don't support JSON mode (the original passed None and would fail).
    request_kwargs = {
        "model": MODEL_NAME,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
    }
    if "gpt-4" in MODEL_NAME or "gpt-oss" in MODEL_NAME:
        request_kwargs["response_format"] = {"type": "json_object"}
    try:
        response = await client.chat.completions.create(**request_kwargs)
        content = response.choices[0].message.content
        data = json.loads(content)
        return Action(
            patch=data["patch"],
            task_id=observation.task_id,
            think=data.get("think", "Applied fix based on test errors."),
        )
    except Exception as e:
        print(f"LLM Error: {e}", file=sys.stderr)
        # fallback to returning original code to avoid crashing the loop
        return Action(
            patch=observation.buggy_code,
            task_id=observation.task_id,
            think="Failed to generate patch.",
        )
async def evaluate_task(env, task_id: str) -> dict:
    """Run the agent on one task for up to 10 steps.

    Returns a summary dict with the task id, the best score observed over
    the episode (rounded to 4 places) and the number of steps taken.
    """
    reset_result = await env.reset(task_id=task_id)
    obs = reset_result.observation
    best = 0.0
    steps_taken = 0
    for steps_taken in range(1, 11):
        step_result = await env.step(await openai_agent(obs))
        obs = step_result.observation
        if obs.score > best:
            best = obs.score
        if obs.done:
            break
    return {"task_id": task_id, "best_score": round(best, 4), "steps": steps_taken}
async def main(output_format: str = "table"):
    """Evaluate all three tasks and report results as JSON or a table.

    JSON goes to stdout (machine-readable); the table and the API-key
    warning go to stderr so they never pollute piped JSON output.
    """
    if not os.getenv("OPENAI_API_KEY"):
        print("Warning: OPENAI_API_KEY not set. LLM calls will fail.", file=sys.stderr)
    async with CodeDebugEnv(base_url=BASE_URL) as env:
        results = [
            await evaluate_task(env, tid)
            for tid in ("task_easy", "task_medium", "task_hard")
        ]
    if output_format == "json":
        print(json.dumps({"baseline_results": results, "agent": "openai_api"}))
        return
    print("\n=== Baseline Results ===", file=sys.stderr)
    for r in results:
        print(f" {r['task_id']:15s} score={r['best_score']:.3f} steps={r['steps']}", file=sys.stderr)
    avg = sum(r["best_score"] for r in results) / len(results)
    print(f"\n avg score: {avg:.3f}", file=sys.stderr)
if __name__ == "__main__":
    # Emit JSON when "json" appears anywhere in the argv, else a table.
    asyncio.run(main("json" if "json" in sys.argv else "table"))
|