cicd-debugger-env / baseline_inference.py
Lishika's picture
clean final submission
30bf68a
from __future__ import annotations
import argparse
import json
import os
from pathlib import Path
import re
import subprocess
import sys
from typing import Any
BASELINE_TASKS: list[tuple[str, str]] = [
("easy-command-typo", "easy"),
("medium-python-version", "medium"),
("hard-needs-order", "hard"),
]
END_PATTERN = re.compile(
r"^\[END\] success=(true|false) steps=(\d+) score=(\d+\.\d{3}) rewards=(.*)$"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Run baseline inference on easy/medium/hard tasks")
parser.add_argument("--tasks", default=",".join(task for task, _ in BASELINE_TASKS))
parser.add_argument("--max-steps", type=int, default=int(os.getenv("MAX_STEPS", "8")))
parser.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
parser.add_argument("--trajectories", type=int, default=3)
parser.add_argument("--benchmark", default=os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env"))
parser.add_argument("--offline", action="store_true", default=False)
parser.add_argument("--force-local-env", action="store_true", default=True)
parser.add_argument("--output", default="artifacts/baseline_scores.json")
return parser.parse_args()
def should_run_offline(args: argparse.Namespace) -> bool:
if args.offline:
return True
key = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN")
if not key:
return True
return os.getenv("OFFLINE_INFERENCE", "0") == "1"
def parse_end_line(lines: list[str]) -> dict[str, Any]:
for raw_line in reversed(lines):
line = raw_line.strip()
if not line.startswith("[END] "):
continue
matched = END_PATTERN.match(line)
if not matched:
raise RuntimeError(f"Malformed END line: {line}")
success = matched.group(1) == "true"
steps = int(matched.group(2))
score = float(matched.group(3))
rewards_str = matched.group(4).strip()
rewards: list[float] = []
if rewards_str:
rewards = [float(value) for value in rewards_str.split(",") if value]
return {
"success": success,
"steps": steps,
"score": score,
"rewards": rewards,
"end_line": line,
}
raise RuntimeError("No END line found in inference output")
def run_single_task(
task_id: str,
difficulty: str,
args: argparse.Namespace,
project_root: Path,
offline_mode: bool,
) -> dict[str, Any]:
command = [
sys.executable,
"inference.py",
"--task",
task_id,
"--benchmark",
str(args.benchmark),
"--max-steps",
str(max(1, int(args.max_steps))),
"--policy-mode",
str(args.policy_mode),
"--trajectories",
str(max(1, int(args.trajectories))),
]
if offline_mode:
command.append("--offline")
if args.force_local_env:
command.append("--force-local-env")
env = os.environ.copy()
if offline_mode:
env["OFFLINE_INFERENCE"] = "1"
completed = subprocess.run(
command,
cwd=project_root,
capture_output=True,
text=True,
env=env,
check=True,
)
lines = [line for line in completed.stdout.splitlines() if line.strip()]
summary = parse_end_line(lines)
return {
"task_id": task_id,
"difficulty": difficulty,
"success": summary["success"],
"steps": summary["steps"],
"score": summary["score"],
"rewards": summary["rewards"],
"start_line": next((line for line in lines if line.startswith("[START] ")), ""),
"end_line": summary["end_line"],
}
def main() -> int:
args = parse_args()
project_root = Path(__file__).resolve().parent
known_difficulties = {task: difficulty for task, difficulty in BASELINE_TASKS}
requested_tasks = [task.strip() for task in str(args.tasks).split(",") if task.strip()]
if not requested_tasks:
print("No tasks provided for baseline run", file=sys.stderr)
return 1
offline_mode = should_run_offline(args)
print(
f"[BASELINE] mode={'offline' if offline_mode else 'openai'} tasks={len(requested_tasks)} "
f"max_steps={max(1, int(args.max_steps))} policy={args.policy_mode}",
flush=True,
)
results: list[dict[str, Any]] = []
for task_id in requested_tasks:
difficulty = known_difficulties.get(task_id, "custom")
try:
result = run_single_task(task_id, difficulty, args, project_root, offline_mode)
results.append(result)
print(
f"[BASELINE] task={task_id} difficulty={difficulty} success={str(result['success']).lower()} "
f"score={result['score']:.3f} steps={result['steps']}",
flush=True,
)
except subprocess.CalledProcessError as exc:
print(f"[BASELINE] task={task_id} failed with return code {exc.returncode}", file=sys.stderr)
if exc.stdout:
print(exc.stdout, file=sys.stderr)
if exc.stderr:
print(exc.stderr, file=sys.stderr)
return exc.returncode or 1
except Exception as exc:
print(f"[BASELINE] task={task_id} failed: {exc}", file=sys.stderr)
return 1
average_score = sum(item["score"] for item in results) / len(results)
success_rate = sum(1 for item in results if item["success"]) / len(results)
payload = {
"mode": "offline" if offline_mode else "openai",
"model_name": os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct"),
"api_base_url": os.getenv("API_BASE_URL", "https://router.huggingface.co/v1"),
"max_steps": max(1, int(args.max_steps)),
"policy_mode": str(args.policy_mode),
"trajectories": max(1, int(args.trajectories)),
"average_score": round(float(average_score), 3),
"success_rate": round(float(success_rate), 3),
"results": results,
}
output_path = project_root / str(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
print(f"[BASELINE] average_score={payload['average_score']:.3f} success_rate={payload['success_rate']:.3f}", flush=True)
print(f"[BASELINE] wrote {output_path}", flush=True)
return 0
if __name__ == "__main__":
raise SystemExit(main())