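"""Run baseline inference across the bundled easy/medium/hard tasks.

Invokes inference.py once per requested task, parses the [END] summary line
each run prints, and writes aggregate scores to a JSON artifact.
"""
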
from __future__ import annotations

import argparse
import json
import os
from pathlib import Path
import re
import subprocess
import sys
from typing import Any


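# (task_id, difficulty) pairs for the bundled baseline tasks.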
BASELINE_TASKS: list[tuple[str, str]] = [
    ("easy-command-typo", "easy"),
    ("medium-python-version", "medium"),
    ("hard-needs-order", "hard"),
]

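# Matches the summary line printed by inference.py, e.g.
# "[END] success=true steps=5 score=0.875 rewards=0.500,0.375".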
END_PATTERN = re.compile(
    r"^\[END\] success=(true|false) steps=(\d+) score=(\d+\.\d{3}) rewards=(.*)$"
)


def parse_args() -> argparse.Namespace:
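    """Build the CLI; several defaults can be overridden via environment variables."""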
    parser = argparse.ArgumentParser(description="Run baseline inference on easy/medium/hard tasks")
    parser.add_argument("--tasks", default=",".join(task for task, _ in BASELINE_TASKS))
    parser.add_argument("--max-steps", type=int, default=int(os.getenv("MAX_STEPS", "8")))
    parser.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
    parser.add_argument("--trajectories", type=int, default=3)
    parser.add_argument("--benchmark", default=os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env"))
    parser.add_argument("--offline", action="store_true", default=False)
    parser.add_argument("--force-local-env", action="store_true", default=True)
    parser.add_argument("--output", default="artifacts/baseline_scores.json")
    return parser.parse_args()


def should_run_offline(args: argparse.Namespace) -> bool:
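    """Return True when inference must run without a model API.

    Offline mode is forced by --offline, by missing credentials (neither
    OPENAI_API_KEY nor HF_TOKEN is set), or by OFFLINE_INFERENCE=1.
    """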
    if args.offline:
        return True

    key = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN")
    if not key:
        return True

    return os.getenv("OFFLINE_INFERENCE", "0") == "1"


def parse_end_line(lines: list[str]) -> dict[str, Any]:
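    """Find the last [END] summary in the output lines and parse its fields."""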
    for raw_line in reversed(lines):
        line = raw_line.strip()
        if not line.startswith("[END] "):
            continue

        matched = END_PATTERN.match(line)
        if not matched:
            raise RuntimeError(f"Malformed END line: {line}")

        success = matched.group(1) == "true"
        steps = int(matched.group(2))
        score = float(matched.group(3))
        rewards_str = matched.group(4).strip()

        rewards: list[float] = []
        if rewards_str:
            rewards = [float(value) for value in rewards_str.split(",") if value]

        return {
            "success": success,
            "steps": steps,
            "score": score,
            "rewards": rewards,
            "end_line": line,
        }

    raise RuntimeError("No END line found in inference output")


def run_single_task(
    task_id: str,
    difficulty: str,
    args: argparse.Namespace,
    project_root: Path,
    offline_mode: bool,
) -> dict[str, Any]:
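    """Invoke inference.py for one task in a subprocess and return its parsed summary."""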
    command = [
        sys.executable,
        "inference.py",
        "--task",
        task_id,
        "--benchmark",
        str(args.benchmark),
        "--max-steps",
        str(max(1, int(args.max_steps))),
        "--policy-mode",
        str(args.policy_mode),
        "--trajectories",
        str(max(1, int(args.trajectories))),
    ]

    if offline_mode:
        command.append("--offline")
    if args.force_local_env:
        command.append("--force-local-env")

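    # Propagate offline mode to the child process through the environment too.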
    env = os.environ.copy()
    if offline_mode:
        env["OFFLINE_INFERENCE"] = "1"

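    # check=True raises CalledProcessError on non-zero exit; main() reports it.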
    completed = subprocess.run(
        command,
        cwd=project_root,
        capture_output=True,
        text=True,
        env=env,
        check=True,
    )

    lines = [line for line in completed.stdout.splitlines() if line.strip()]
    summary = parse_end_line(lines)

    return {
        "task_id": task_id,
        "difficulty": difficulty,
        "success": summary["success"],
        "steps": summary["steps"],
        "score": summary["score"],
        "rewards": summary["rewards"],
        "start_line": next((line for line in lines if line.startswith("[START] ")), ""),
        "end_line": summary["end_line"],
    }


def main() -> int:
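    """Run all requested tasks, report per-task results, and write the JSON payload."""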
    args = parse_args()
    project_root = Path(__file__).resolve().parent

    known_difficulties = dict(BASELINE_TASKS)
    requested_tasks = [task.strip() for task in str(args.tasks).split(",") if task.strip()]

    if not requested_tasks:
        print("No tasks provided for baseline run", file=sys.stderr)
        return 1

    offline_mode = should_run_offline(args)

    print(
        f"[BASELINE] mode={'offline' if offline_mode else 'openai'} tasks={len(requested_tasks)} "
        f"max_steps={max(1, int(args.max_steps))} policy={args.policy_mode}",
        flush=True,
    )

    results: list[dict[str, Any]] = []
    for task_id in requested_tasks:
        difficulty = known_difficulties.get(task_id, "custom")
        try:
            result = run_single_task(task_id, difficulty, args, project_root, offline_mode)
            results.append(result)
            print(
                f"[BASELINE] task={task_id} difficulty={difficulty} success={str(result['success']).lower()} "
                f"score={result['score']:.3f} steps={result['steps']}",
                flush=True,
            )
        except subprocess.CalledProcessError as exc:
            print(f"[BASELINE] task={task_id} failed with return code {exc.returncode}", file=sys.stderr)
            if exc.stdout:
                print(exc.stdout, file=sys.stderr)
            if exc.stderr:
                print(exc.stderr, file=sys.stderr)
            return exc.returncode or 1
        except Exception as exc:
            print(f"[BASELINE] task={task_id} failed: {exc}", file=sys.stderr)
            return 1

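    # Any failure above returns early, so results is guaranteed non-empty here.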
    average_score = sum(item["score"] for item in results) / len(results)
    success_rate = sum(1 for item in results if item["success"]) / len(results)

    payload = {
        "mode": "offline" if offline_mode else "openai",
        "model_name": os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct"),
        "api_base_url": os.getenv("API_BASE_URL", "https://router.huggingface.co/v1"),
        "max_steps": max(1, int(args.max_steps)),
        "policy_mode": str(args.policy_mode),
        "trajectories": max(1, int(args.trajectories)),
        "average_score": round(float(average_score), 3),
        "success_rate": round(float(success_rate), 3),
        "results": results,
    }

    output_path = project_root / str(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")

    print(f"[BASELINE] average_score={payload['average_score']:.3f} success_rate={payload['success_rate']:.3f}", flush=True)
    print(f"[BASELINE] wrote {output_path}", flush=True)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())