| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| from pathlib import Path |
| import re |
| import subprocess |
| import sys |
| from typing import Any |
|
|
|
|
# Default (task_id, difficulty) pairs for a baseline run. The task ids feed the
# default of the --tasks option; the pairs map a task id to its difficulty.
BASELINE_TASKS: list[tuple[str, str]] = [
    ("easy-command-typo", "easy"),
    ("medium-python-version", "medium"),
    ("hard-needs-order", "hard"),
]
|
|
# Shape of the final "[END]" summary line looked for in the inference output.
# Capture groups: 1 = success flag ("true"/"false"), 2 = step count,
# 3 = score with exactly three decimals, 4 = raw rewards text (may be empty).
END_PATTERN = re.compile(
    r"^\[END\] success=(true|false) steps=(\d+) score=(\d+\.\d{3}) rewards=(.*)$"
)
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for a baseline run.

    Options are read from ``sys.argv``. The defaults of ``--max-steps`` and
    ``--benchmark`` are taken from the ``MAX_STEPS`` and
    ``MY_ENV_V4_BENCHMARK`` environment variables when those are set.

    Returns:
        The populated namespace with the attributes ``tasks``, ``max_steps``,
        ``policy_mode``, ``trajectories``, ``benchmark``, ``offline``,
        ``force_local_env`` and ``output``.
    """
    parser = argparse.ArgumentParser(description="Run baseline inference on easy/medium/hard tasks")
    parser.add_argument("--tasks", default=",".join(task for task, _ in BASELINE_TASKS))
    # The default is passed as a string on purpose: argparse runs string
    # defaults through ``type`` only when the option is absent, so a malformed
    # MAX_STEPS becomes a regular usage error (and is ignored entirely when
    # --max-steps is given) instead of a traceback while building the parser.
    parser.add_argument("--max-steps", type=int, default=os.getenv("MAX_STEPS", "8"))
    parser.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
    parser.add_argument("--trajectories", type=int, default=3)
    parser.add_argument("--benchmark", default=os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env"))
    parser.add_argument("--offline", action="store_true", default=False)
    # ``store_true`` with ``default=True`` can never yield False, so the
    # original flag was a no-op. It is kept for backward compatibility and
    # paired with an explicit negative switch that writes to the same dest.
    parser.add_argument("--force-local-env", dest="force_local_env", action="store_true", default=True)
    parser.add_argument(
        "--no-force-local-env",
        dest="force_local_env",
        action="store_false",
        default=True,
        help="do not pass --force-local-env to inference.py",
    )
    parser.add_argument("--output", default="artifacts/baseline_scores.json")
    return parser.parse_args()
|
|
|
|
def should_run_offline(args: argparse.Namespace) -> bool:
    """Decide whether the baseline has to run in offline mode.

    Offline mode is selected when ``args.offline`` is set, when neither
    ``OPENAI_API_KEY`` nor ``HF_TOKEN`` holds a non-empty value, or when the
    ``OFFLINE_INFERENCE`` environment variable is exactly ``"1"``.

    Args:
        args: Parsed options; only the ``offline`` attribute is read.

    Returns:
        ``True`` for offline mode, ``False`` otherwise.
    """
    has_credentials = bool(os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN"))
    if args.offline or not has_credentials:
        return True
    # Credentials are present: only an explicit environment opt-in forces offline.
    return os.getenv("OFFLINE_INFERENCE", "0") == "1"
|
|
|
|
def parse_end_line(lines: list[str]) -> dict[str, Any]:
    """Extract the run summary from the last ``[END]`` line of the output.

    The lines are scanned from the end; the first stripped line that starts
    with ``"[END] "`` is the only candidate and must match ``END_PATTERN``.

    Args:
        lines: Output lines of an inference run.

    Returns:
        A dict with the keys ``success`` (bool), ``steps`` (int), ``score``
        (float), ``rewards`` (list of floats, empty when none are listed) and
        ``end_line`` (the stripped line that was parsed).

    Raises:
        RuntimeError: If no ``[END]`` line exists or the last one is malformed.
    """
    stripped_from_end = (raw.strip() for raw in reversed(lines))
    end_line = next(
        (text for text in stripped_from_end if text.startswith("[END] ")),
        None,
    )
    if end_line is None:
        raise RuntimeError("No END line found in inference output")

    parsed = END_PATTERN.match(end_line)
    if parsed is None:
        raise RuntimeError(f"Malformed END line: {end_line}")

    success_text, steps_text, score_text, rewards_text = parsed.groups()
    # Empty pieces (blank rewards field, doubled or trailing commas) are skipped.
    reward_values = [float(piece) for piece in rewards_text.strip().split(",") if piece]

    return {
        "success": success_text == "true",
        "steps": int(steps_text),
        "score": float(score_text),
        "rewards": reward_values,
        "end_line": end_line,
    }
|
|
|
|
def run_single_task(
    task_id: str,
    difficulty: str,
    args: argparse.Namespace,
    project_root: Path,
    offline_mode: bool,
) -> dict[str, Any]:
    """Run ``inference.py`` for one task in a child process and summarise it.

    The child is started with the current interpreter and ``project_root`` as
    its working directory, and its output is captured as text. In offline mode
    the ``--offline`` flag is passed and ``OFFLINE_INFERENCE=1`` is set in the
    child's environment.

    Args:
        task_id: Task identifier forwarded via ``--task``.
        difficulty: Label copied unchanged into the result.
        args: Parsed options; ``benchmark``, ``max_steps``, ``policy_mode``,
            ``trajectories`` and ``force_local_env`` are read.
        project_root: Directory the child process runs in.
        offline_mode: Whether the child should run offline.

    Returns:
        A dict with ``task_id``, ``difficulty``, the fields parsed from the
        ``[END]`` line (``success``, ``steps``, ``score``, ``rewards``,
        ``end_line``) and ``start_line`` (first ``[START] `` line, or ``""``).

    Raises:
        subprocess.CalledProcessError: If the child exits with a non-zero code.
        RuntimeError: If the child's output has no well-formed ``[END]`` line.
    """
    # Step and trajectory counts are clamped to at least 1 before forwarding.
    step_limit = max(1, int(args.max_steps))
    trajectory_count = max(1, int(args.trajectories))
    optional_flags = [
        flag
        for flag, enabled in (
            ("--offline", offline_mode),
            ("--force-local-env", args.force_local_env),
        )
        if enabled
    ]
    command = [
        sys.executable,
        "inference.py",
        "--task",
        task_id,
        "--benchmark",
        str(args.benchmark),
        "--max-steps",
        str(step_limit),
        "--policy-mode",
        str(args.policy_mode),
        "--trajectories",
        str(trajectory_count),
        *optional_flags,
    ]

    child_env = dict(os.environ)
    if offline_mode:
        child_env["OFFLINE_INFERENCE"] = "1"

    completed = subprocess.run(
        command,
        cwd=project_root,
        capture_output=True,
        text=True,
        env=child_env,
        check=True,
    )

    # Blank lines carry no information and are dropped before parsing.
    output_lines = [text for text in completed.stdout.splitlines() if text.strip()]
    summary = parse_end_line(output_lines)
    start_line = next(
        filter(lambda text: text.startswith("[START] "), output_lines),
        "",
    )

    return {
        "task_id": task_id,
        "difficulty": difficulty,
        "success": summary["success"],
        "steps": summary["steps"],
        "score": summary["score"],
        "rewards": summary["rewards"],
        "start_line": start_line,
        "end_line": summary["end_line"],
    }
|
|
|
|
def main() -> int:
    """Run every requested task, report the scores and write a JSON summary.

    Tasks run one after another; the first failing task aborts the run and no
    summary file is written. On success the aggregated payload is written to
    ``args.output`` resolved against the directory containing this file.

    Returns:
        ``0`` on success; ``1`` when no task was requested or a task failed
        with a generic error; the child's return code (or ``1`` if that is
        ``0``) when the inference subprocess exited unsuccessfully.
    """
    args = parse_args()
    project_root = Path(__file__).resolve().parent

    known_difficulties = dict(BASELINE_TASKS)
    requested_tasks = [
        name
        for name in (chunk.strip() for chunk in str(args.tasks).split(","))
        if name
    ]
    if not requested_tasks:
        print("No tasks provided for baseline run", file=sys.stderr)
        return 1

    offline_mode = should_run_offline(args)
    mode_label = "offline" if offline_mode else "openai"
    step_limit = max(1, int(args.max_steps))

    print(
        f"[BASELINE] mode={mode_label} tasks={len(requested_tasks)} "
        f"max_steps={step_limit} policy={args.policy_mode}",
        flush=True,
    )

    results: list[dict[str, Any]] = []
    for task_id in requested_tasks:
        # Tasks outside the baseline table are reported as "custom".
        difficulty = known_difficulties.get(task_id, "custom")
        try:
            outcome = run_single_task(task_id, difficulty, args, project_root, offline_mode)
            results.append(outcome)
            print(
                f"[BASELINE] task={task_id} difficulty={difficulty} success={str(outcome['success']).lower()} "
                f"score={outcome['score']:.3f} steps={outcome['steps']}",
                flush=True,
            )
        except subprocess.CalledProcessError as exc:
            print(f"[BASELINE] task={task_id} failed with return code {exc.returncode}", file=sys.stderr)
            # Relay whatever the child produced so the failure can be diagnosed.
            for captured in (exc.stdout, exc.stderr):
                if captured:
                    print(captured, file=sys.stderr)
            return exc.returncode or 1
        except Exception as exc:
            print(f"[BASELINE] task={task_id} failed: {exc}", file=sys.stderr)
            return 1

    # ``results`` holds one entry per requested task here, so it is non-empty.
    task_count = len(results)
    average_score = sum(entry["score"] for entry in results) / task_count
    success_rate = sum(1 for entry in results if entry["success"]) / task_count

    payload = {
        "mode": mode_label,
        "model_name": os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct"),
        "api_base_url": os.getenv("API_BASE_URL", "https://router.huggingface.co/v1"),
        "max_steps": step_limit,
        "policy_mode": str(args.policy_mode),
        "trajectories": max(1, int(args.trajectories)),
        "average_score": round(float(average_score), 3),
        "success_rate": round(float(success_rate), 3),
        "results": results,
    }

    output_path = project_root / str(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2) + "\n"
    output_path.write_text(serialized, encoding="utf-8")

    print(f"[BASELINE] average_score={payload['average_score']:.3f} success_rate={payload['success_rate']:.3f}", flush=True)
    print(f"[BASELINE] wrote {output_path}", flush=True)
    return 0
|
|
|
|
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|