cicd-debugger-env-final / baseline_inference.py

clean final submission

30bf68a 9 days ago

6.56 kB

	from __future__ import annotations

	import argparse
	import json
	import os
	from pathlib import Path
	import re
	import subprocess
	import sys
	from typing import Any


	# Default task ids for a baseline run, each paired with its difficulty label.
	# The order here is the order used for the default value of --tasks.
	BASELINE_TASKS: list[tuple[str, str]] = [
	    ("easy-command-typo", "easy"),
	    ("medium-python-version", "medium"),
	    ("hard-needs-order", "hard"),
	]

	# Shape of the final summary line parsed from the inference output, e.g.
	#   [END] success=true steps=3 score=0.750 rewards=0.10,0.20
	# Groups: 1 = success flag, 2 = step count, 3 = score (exactly three decimals),
	# 4 = raw comma-separated rewards text (may be empty).
	# The alternation must be a bare "|": an escaped "\|" matches a literal pipe
	# character, which would make the pattern reject every real END line.
	END_PATTERN = re.compile(
	    r"^\[END\] success=(true|false) steps=(\d+) score=(\d+\.\d{3}) rewards=(.*)$"
	)


	def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
	    """Build the command-line parser for the baseline runner and parse arguments.

	    Args:
	        argv: Argument strings to parse. ``None`` (the default) lets argparse
	            read ``sys.argv[1:]``, which is what a plain ``parse_args()`` call
	            has always done.

	    Returns:
	        Namespace with ``tasks``, ``max_steps``, ``policy_mode``,
	        ``trajectories``, ``benchmark``, ``offline``, ``force_local_env`` and
	        ``output`` attributes.
	    """
	    parser = argparse.ArgumentParser(description="Run baseline inference on easy/medium/hard tasks")
	    parser.add_argument("--tasks", default=",".join(task for task, _ in BASELINE_TASKS))
	    parser.add_argument("--max-steps", type=int, default=int(os.getenv("MAX_STEPS", "8")))
	    parser.add_argument("--policy-mode", choices=["sft", "imp", "direct"], default="imp")
	    parser.add_argument("--trajectories", type=int, default=3)
	    parser.add_argument("--benchmark", default=os.getenv("MY_ENV_V4_BENCHMARK", "cicd_debugger_env"))
	    parser.add_argument("--offline", action="store_true", default=False)
	    # The flag defaults to True, so on its own "--force-local-env" could never
	    # be turned off. Keep it (existing invocations still work) and pair it with
	    # an explicit negative switch writing to the same destination.
	    parser.add_argument("--force-local-env", dest="force_local_env", action="store_true", default=True)
	    parser.add_argument("--no-force-local-env", dest="force_local_env", action="store_false")
	    parser.add_argument("--output", default="artifacts/baseline_scores.json")
	    return parser.parse_args(argv)


	def should_run_offline(args: argparse.Namespace) -> bool:
	    """Decide whether the baseline should run in offline mode.

	    Offline mode is selected when any of the following holds, checked in order:

	    1. ``args.offline`` is truthy (``--offline`` was passed);
	    2. neither ``OPENAI_API_KEY`` nor ``HF_TOKEN`` is set to a non-empty value;
	    3. the ``OFFLINE_INFERENCE`` environment variable equals ``"1"``.

	    Args:
	        args: Parsed command-line namespace; only ``offline`` is read.

	    Returns:
	        ``True`` for offline mode, ``False`` otherwise.
	    """
	    if args.offline:
	        return True

	    # An empty string counts as "not set" for either credential variable.
	    key = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN")
	    if not key:
	        return True

	    return os.getenv("OFFLINE_INFERENCE", "0") == "1"


	def parse_end_line(lines: list[str]) -> dict[str, Any]:
	    """Extract the run summary from the last ``[END] `` line in *lines*.

	    Lines are scanned from the end; the first one (after stripping whitespace)
	    that starts with ``"[END] "`` is parsed with ``END_PATTERN``.

	    Args:
	        lines: Output lines to search.

	    Returns:
	        Dict with ``success`` (bool), ``steps`` (int), ``score`` (float),
	        ``rewards`` (list of floats, possibly empty) and ``end_line`` (the
	        stripped line that was parsed).

	    Raises:
	        RuntimeError: If no ``[END] `` line exists, if the last one does not
	            match ``END_PATTERN``, or if one of its reward entries is not a
	            number.
	    """
	    for raw_line in reversed(lines):
	        line = raw_line.strip()
	        if not line.startswith("[END] "):
	            continue

	        matched = END_PATTERN.match(line)
	        if not matched:
	            raise RuntimeError(f"Malformed END line: {line}")

	        success = matched.group(1) == "true"
	        steps = int(matched.group(2))
	        score = float(matched.group(3))
	        rewards_str = matched.group(4).strip()

	        # Skip blank entries (e.g. a trailing comma or ", ,") and report a
	        # non-numeric entry the same way as any other malformed END line
	        # instead of leaking a bare ValueError.
	        try:
	            rewards: list[float] = [
	                float(value) for value in rewards_str.split(",") if value.strip()
	            ]
	        except ValueError as exc:
	            raise RuntimeError(f"Malformed END line: {line}") from exc

	        return {
	            "success": success,
	            "steps": steps,
	            "score": score,
	            "rewards": rewards,
	            "end_line": line,
	        }

	    raise RuntimeError("No END line found in inference output")


	def run_single_task(
	    task_id: str,
	    difficulty: str,
	    args: argparse.Namespace,
	    project_root: Path,
	    offline_mode: bool,
	) -> dict[str, Any]:
	    """Run ``inference.py`` for one task in a subprocess and summarise its output.

	    Args:
	        task_id: Value passed to ``--task``.
	        difficulty: Label copied into the result unchanged.
	        args: Parsed CLI namespace; ``benchmark``, ``max_steps``,
	            ``policy_mode``, ``trajectories`` and ``force_local_env`` are read.
	        project_root: Working directory for the subprocess; ``inference.py`` is
	            resolved relative to it.
	        offline_mode: When true, ``--offline`` is appended and
	            ``OFFLINE_INFERENCE=1`` is set in the child environment.

	    Returns:
	        Dict with ``task_id``, ``difficulty``, ``success``, ``steps``,
	        ``score``, ``rewards``, ``start_line`` (first stdout line starting with
	        ``"[START] "``, or ``""`` if none) and ``end_line``.

	    Raises:
	        subprocess.CalledProcessError: If the subprocess exits non-zero.
	        RuntimeError: If stdout has no parsable ``[END] `` line.
	    """
	    command = [
	        sys.executable,
	        "inference.py",
	        "--task",
	        task_id,
	        "--benchmark",
	        str(args.benchmark),
	        "--max-steps",
	        # Clamp to at least one step / one trajectory.
	        str(max(1, int(args.max_steps))),
	        "--policy-mode",
	        str(args.policy_mode),
	        "--trajectories",
	        str(max(1, int(args.trajectories))),
	    ]

	    if offline_mode:
	        command.append("--offline")
	    if args.force_local_env:
	        command.append("--force-local-env")

	    # Copy the environment so the parent's own variables are left untouched.
	    env = os.environ.copy()
	    if offline_mode:
	        env["OFFLINE_INFERENCE"] = "1"

	    completed = subprocess.run(
	        command,
	        cwd=project_root,
	        capture_output=True,
	        text=True,
	        env=env,
	        check=True,
	    )

	    # Only non-blank stdout lines are considered.
	    lines = [line for line in completed.stdout.splitlines() if line.strip()]
	    summary = parse_end_line(lines)

	    return {
	        "task_id": task_id,
	        "difficulty": difficulty,
	        "success": summary["success"],
	        "steps": summary["steps"],
	        "score": summary["score"],
	        "rewards": summary["rewards"],
	        "start_line": next((line for line in lines if line.startswith("[START] ")), ""),
	        "end_line": summary["end_line"],
	    }


	def main() -> int:
	    """Run every requested baseline task and write the aggregate score report.

	    Tasks come from the comma-separated ``--tasks`` value; ids not listed in
	    ``BASELINE_TASKS`` get the difficulty label ``"custom"``. Tasks run one
	    after another and the run stops at the first failure, in which case no
	    report file is written.

	    Returns:
	        ``0`` when all tasks ran and the report was written. ``1`` when no
	        tasks were requested or a task failed with a non-subprocess error. The
	        child's return code (or ``1`` if that is falsy) when the inference
	        subprocess exited non-zero.
	    """
	    args = parse_args()
	    project_root = Path(__file__).resolve().parent

	    known_difficulties = {task: difficulty for task, difficulty in BASELINE_TASKS}
	    requested_tasks = [task.strip() for task in str(args.tasks).split(",") if task.strip()]

	    if not requested_tasks:
	        print("No tasks provided for baseline run", file=sys.stderr)
	        return 1

	    offline_mode = should_run_offline(args)

	    print(
	        f"[BASELINE] mode={'offline' if offline_mode else 'openai'} tasks={len(requested_tasks)} "
	        f"max_steps={max(1, int(args.max_steps))} policy={args.policy_mode}",
	        flush=True,
	    )

	    results: list[dict[str, Any]] = []
	    for task_id in requested_tasks:
	        difficulty = known_difficulties.get(task_id, "custom")
	        try:
	            result = run_single_task(task_id, difficulty, args, project_root, offline_mode)
	            results.append(result)
	            print(
	                f"[BASELINE] task={task_id} difficulty={difficulty} success={str(result['success']).lower()} "
	                f"score={result['score']:.3f} steps={result['steps']}",
	                flush=True,
	            )
	        except subprocess.CalledProcessError as exc:
	            # Surface the child's captured output so the failure can be diagnosed.
	            print(f"[BASELINE] task={task_id} failed with return code {exc.returncode}", file=sys.stderr)
	            if exc.stdout:
	                print(exc.stdout, file=sys.stderr)
	            if exc.stderr:
	                print(exc.stderr, file=sys.stderr)
	            return exc.returncode or 1
	        except Exception as exc:
	            print(f"[BASELINE] task={task_id} failed: {exc}", file=sys.stderr)
	            return 1

	    # results is non-empty here: requested_tasks is non-empty and any failing
	    # task returns above, so the divisions below cannot divide by zero.
	    average_score = sum(item["score"] for item in results) / len(results)
	    success_rate = sum(1 for item in results if item["success"]) / len(results)

	    payload = {
	        "mode": "offline" if offline_mode else "openai",
	        "model_name": os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct"),
	        "api_base_url": os.getenv("API_BASE_URL", "https://router.huggingface.co/v1"),
	        "max_steps": max(1, int(args.max_steps)),
	        "policy_mode": str(args.policy_mode),
	        "trajectories": max(1, int(args.trajectories)),
	        "average_score": round(float(average_score), 3),
	        "success_rate": round(float(success_rate), 3),
	        "results": results,
	    }

	    # The output path is resolved against the directory containing this script.
	    output_path = project_root / str(args.output)
	    output_path.parent.mkdir(parents=True, exist_ok=True)
	    output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")

	    print(f"[BASELINE] average_score={payload['average_score']:.3f} success_rate={payload['success_rate']:.3f}", flush=True)
	    print(f"[BASELINE] wrote {output_path}", flush=True)

	    return 0


	# Script entry point: main()'s return value becomes the process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())