| from __future__ import annotations |
|
|
| import argparse |
| import importlib.util |
| import json |
| import os |
| import shutil |
| import subprocess |
| import sys |
| from datetime import datetime, timezone |
| from typing import Any, Dict, List, Tuple |
|
|
| from dotenv import load_dotenv |
|
|
|
|
# Load variables from a local .env file into the process environment so the
# inference readiness checks below can see API_BASE_URL / MODEL_NAME / API_KEY.
load_dotenv()




# Task IDs exercised by the baseline inference runs — presumably one per
# difficulty tier, judging by the _easy/_medium/_hard suffixes.
CORE_TASKS = [
    "bug_detection_easy_1",
    "memory_leak_medium_1",
    "security_hard_1",
]
|
|
|
|
| def _run_cmd(command: List[str]) -> Tuple[int, str, str]: |
| """Run a command and return (returncode, stdout, stderr).""" |
| result = subprocess.run( |
| command, |
| capture_output=True, |
| text=True, |
| encoding="utf-8", |
| errors="replace", |
| ) |
| return result.returncode, result.stdout, result.stderr |
|
|
|
|
| def _require_command(binary: str) -> bool: |
| """Return True if a command exists on PATH.""" |
| return shutil.which(binary) is not None |
|
|
|
|
def run_validation() -> Tuple[bool, Dict[str, Any]]:
    """Run ``openenv validate`` and return (passed, detail dict).

    The detail dict always carries an "ok" flag plus captured stdout/stderr;
    when the CLI is missing it carries a "reason" instead of real output.
    """
    print("Running OpenEnv validation...")
    if not _require_command("openenv"):
        detail = {
            "ok": False,
            "reason": "openenv command not found on PATH",
            "stdout": "",
            "stderr": "",
        }
        return False, detail

    returncode, stdout, stderr = _run_cmd(["openenv", "validate"])
    passed = returncode == 0
    print("Validation passed" if passed else "Validation failed")
    return passed, {"ok": passed, "stdout": stdout, "stderr": stderr}
|
|
|
|
def run_tests(with_coverage: bool) -> Tuple[bool, Dict[str, Any]]:
    """Run the pytest suite, adding coverage flags when pytest-cov exists.

    Returns (passed, detail dict); the detail dict records whether coverage
    was actually enabled and, if not, why it was skipped.
    """
    print("Running unit tests...")
    pytest_cmd = ["pytest", "tests/", "-v"]
    cov_active = False
    cov_note = ""
    if with_coverage:
        # Only request coverage if the plugin is importable; otherwise run
        # the tests anyway and record why coverage was skipped.
        if importlib.util.find_spec("pytest_cov") is not None:
            pytest_cmd += ["--cov=environment", "--cov-report=html"]
            cov_active = True
        else:
            cov_note = "pytest-cov not installed; ran tests without coverage"

    returncode, stdout, stderr = _run_cmd(pytest_cmd)
    passed = returncode == 0
    print("Tests passed" if passed else "Tests failed")
    return passed, {
        "ok": passed,
        "stdout": stdout,
        "stderr": stderr,
        "coverage_enabled": cov_active,
        "coverage_reason": cov_note,
    }
|
|
|
|
def check_docker(image_name: str) -> Tuple[bool, Dict[str, Any]]:
    """Check docker availability and build the image tagged *image_name*.

    Fails early (with a "reason") when the CLI is absent or the daemon is
    unreachable; otherwise returns the build result and captured output.
    """
    print("Checking Docker build...")
    if not _require_command("docker"):
        return False, {
            "ok": False,
            "reason": "docker command not found on PATH",
            "stdout": "",
            "stderr": "",
        }

    # `docker info` fails when the daemon isn't running, even if the CLI is
    # installed — distinguish that from an actual build failure.
    daemon_code, daemon_out, daemon_err = _run_cmd(["docker", "info"])
    if daemon_code != 0:
        return False, {
            "ok": False,
            "reason": "docker daemon not reachable. Start Docker Desktop and retry.",
            "stdout": daemon_out,
            "stderr": daemon_err,
        }

    build_code, build_out, build_err = _run_cmd(["docker", "build", "-t", image_name, "."])
    built = build_code == 0
    print("Docker build successful" if built else "Docker build failed")
    return built, {"ok": built, "stdout": build_out, "stderr": build_err}
|
|
|
|
| def _inference_env_ready() -> Tuple[bool, str]: |
| if not (os.getenv("API_BASE_URL") or "").strip(): |
| return False, "API_BASE_URL is not set" |
| if not (os.getenv("MODEL_NAME") or "").strip(): |
| return False, "MODEL_NAME is not set" |
| token = (os.getenv("API_KEY") or os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or "").strip() |
| if not token: |
| return False, "API_KEY is not set (supported aliases: HF_TOKEN, OPENAI_API_KEY)" |
| return True, "" |
|
|
|
|
def run_baseline(tasks: List[str], max_steps: int) -> Tuple[bool, Dict[str, Any], Dict[str, float]]:
    """Run inference.py once per task and collect each task_score.

    Returns (all_ok, details, scores): details holds per-task run records
    under "runs", scores maps task_id -> task_score for runs whose output
    JSON could be parsed.
    """
    print("Running baseline inference for core tasks...")

    ready, reason = _inference_env_ready()
    if not ready:
        print(f"Skipping baseline inference: {reason}")
        return False, {"ok": False, "reason": reason}, {}

    scores: Dict[str, float] = {}
    summary: Dict[str, Any] = {"ok": True, "runs": []}

    for task_id in tasks:
        result_path = f"baseline_{task_id}.json"
        returncode, stdout, stderr = _run_cmd(
            [
                sys.executable,
                "inference.py",
                "--task-id",
                task_id,
                "--max-steps",
                str(max_steps),
                "--output",
                result_path,
            ]
        )
        entry: Dict[str, Any] = {
            "task_id": task_id,
            "ok": returncode == 0,
            "stdout": stdout,
            "stderr": stderr,
            "output_file": result_path,
        }

        if returncode != 0:
            summary["ok"] = False
            summary["runs"].append(entry)
            continue

        # Detect model-API failures that the inference script papers over
        # with a fallback action (its exit code is still 0 in that case).
        logs = f"{stdout}\n{stderr}".lower()
        if "error getting action from llm" in logs or "insufficient balance" in logs:
            summary["ok"] = False
            entry["ok"] = False
            entry["reason"] = "Model API call failed; fallback action used"

        try:
            with open(result_path, "r", encoding="utf-8") as handle:
                payload = json.load(handle)
            score = float(payload.get("task_score", 0.0))
        except (OSError, json.JSONDecodeError, ValueError) as exc:
            summary["ok"] = False
            entry["ok"] = False
            entry["parse_error"] = str(exc)
        else:
            scores[task_id] = score
            entry["task_score"] = score

        summary["runs"].append(entry)

    if summary["ok"]:
        print("Baseline inference passed for all selected tasks")
    else:
        print("Baseline inference had failures")

    return bool(summary["ok"]), summary, scores
|
|
|
|
def generate_report(
    checks: Dict[str, Dict[str, Any]],
    baseline_scores: Dict[str, float],
    report_path: str,
) -> None:
    """Assemble the submission report and write it as JSON to *report_path*.

    Expects *checks* to contain "validation", "tests", "docker" and
    "baseline" entries, each carrying an "ok" flag.
    """
    report = {
        "project": "code-review-agent-env",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
        "tasks": CORE_TASKS,
        "difficulties": ["easy", "medium", "hard"],
        "openenv_compliant": checks["validation"]["ok"],
        "docker_supported": checks["docker"]["ok"],
        "tests_passed": checks["tests"]["ok"],
        "baseline_passed": checks["baseline"]["ok"],
        "baseline_scores": baseline_scores,
        "checks": checks,
    }

    with open(report_path, "w", encoding="utf-8") as fh:
        json.dump(report, fh, indent=2)

    print(f"Submission report generated: {report_path}")
|
|
|
|
def main() -> int:
    """Run all pre-submission checks and return a process exit code.

    Exit code 0 means validation, tests, and the Docker build all passed;
    baseline inference is informational and does not gate submission.
    """
    parser = argparse.ArgumentParser(description="Pre-submission checklist for OpenEnv hackathon")
    parser.add_argument(
        "--skip-baseline",
        action="store_true",
        help="Skip inference baseline runs",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=50,
        help="Max steps for each baseline inference run",
    )
    parser.add_argument(
        "--no-coverage",
        action="store_true",
        help="Run tests without coverage output",
    )
    parser.add_argument(
        "--image-name",
        default="code-review-env",
        help="Docker image name for validation build",
    )
    parser.add_argument(
        "--report-path",
        default="submission_report.json",
        help="Where to write the JSON report",
    )
    args = parser.parse_args()

    print("=" * 50)
    print("Pre-submission Checklist")
    print("=" * 50)

    checks: Dict[str, Dict[str, Any]] = {}

    # Each check helper returns (ok, detail), but the detail dicts already
    # carry an "ok" flag and are the single source of truth below, so the
    # boolean is deliberately discarded here.
    _, checks["validation"] = run_validation()
    _, checks["tests"] = run_tests(with_coverage=not args.no_coverage)
    _, checks["docker"] = check_docker(args.image_name)

    baseline_scores: Dict[str, float] = {}
    if args.skip_baseline:
        checks["baseline"] = {"ok": False, "skipped": True, "reason": "Skipped by --skip-baseline"}
    else:
        _, checks["baseline"], baseline_scores = run_baseline(CORE_TASKS, max_steps=args.max_steps)

    generate_report(checks, baseline_scores, args.report_path)

    # Baseline is intentionally excluded: only validation, tests, and the
    # Docker build are required for submission.
    required_checks_ok = (
        checks["validation"]["ok"]
        and checks["tests"]["ok"]
        and checks["docker"]["ok"]
    )

    if required_checks_ok:
        print("\nRequired checks passed. Ready for submission.")
        return 0

    print("\nSome required checks failed. Please fix before submitting.")
    return 1
|
|
|
|
if __name__ == "__main__":
    # SystemExit propagates main()'s return value as the process exit code.
    raise SystemExit(main())
|
|