""" Evaluator for Frontier-CS algorithmic problems. This evaluator integrates with SkyDiscover to evaluate generated C++ solutions against Frontier-CS benchmark problems using the local judge server. """ import traceback from pathlib import Path import logging import sys import os import random logger = logging.getLogger(__name__) # Support multiple judge servers for load balancing DEFAULT_JUDGE_URL = "http://localhost:8081" JUDGE_URLS = os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL).split(",") JUDGE_URLS = [url.strip() for url in JUDGE_URLS if url.strip()] def get_judge_url() -> str: """Get a judge URL using random selection for load balancing.""" return random.choice(JUDGE_URLS) # Add Frontier-CS to path frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src" if str(frontier_cs_path) not in sys.path: sys.path.insert(0, str(frontier_cs_path)) try: from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator from frontier_cs.runner.base import EvaluationStatus except ImportError as e: logger.error(f"Failed to import Frontier-CS: {e}") logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS") raise def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict: """ Evaluate a C++ solution for a Frontier-CS algorithmic problem. Args: program_path: Path to the C++ solution file problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.) If None, will be read from FRONTIER_CS_PROBLEM env var or config Returns: dict with evaluation results: - combined_score: The score from the judge (higher is better) - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise - status: Evaluation status string - message: Any error or status messages - problem_id: The problem ID - program_path: Path to the evaluated program - score_unbounded: Unbounded score if available - metadata: Additional evaluation metadata """ # Get problem_id from parameter, environment, or kwargs if problem_id is None: import os problem_id = os.environ.get('FRONTIER_CS_PROBLEM') if problem_id is None: problem_id = kwargs.get('frontier_cs_problem', '0') logger.info(f"Evaluating program {program_path} for Frontier-CS problem {problem_id}") try: # Initialize evaluator with judge server (load balanced if multiple configured) judge_url = get_judge_url() logger.info(f"Using judge server: {judge_url}") evaluator = FrontierCSEvaluator( backend="docker", judge_url=judge_url, register_cleanup=False, ) # Read the solution code solution_path = Path(program_path) if not solution_path.exists(): error_msg = f"Solution file not found: {program_path}" logger.error(error_msg) return { "combined_score": 0.0, "runs_successfully": 0.0, "status": "error", "message": error_msg, "problem_id": problem_id, "program_path": program_path, } # Extract code and remove any EVOLVE-BLOCK markers code = solution_path.read_text().replace( "// EVOLVE-BLOCK-START", "" ).replace( "// EVOLVE-BLOCK-END", "" ).strip() logger.info(f"Code extracted from {program_path}") # Evaluate the solution result = evaluator.evaluate( track="algorithmic", problem_id=problem_id, code=code, backend="docker", ) logger.info(f"Evaluation completed with status: {result.status}") # Process result if result.status == EvaluationStatus.SUCCESS: print(result) score = result.score # Use unbounded score for optimization (allows >100 if beating reference) score_unbounded = result.metadata.get('scoreUnbounded', score) if result.metadata else score print(f"score={score}, score_unbounded={score_unbounded}") # Extract only essential metadata (exclude large test case outputs) essential_metadata = {} if result.metadata: essential_metadata = { "status": result.metadata.get("status"), "passed": result.metadata.get("passed"), "result": result.metadata.get("result"), "score": result.metadata.get("score"), "scoreUnbounded": result.metadata.get("scoreUnbounded"), } return { "combined_score": float(score), # Ensure it's a float "score_unbounded": score_unbounded, "runs_successfully": 1.0, "status": "success", "message": result.message or "Evaluation successful", "problem_id": problem_id, "program_path": program_path, "duration_seconds": result.duration_seconds, "metadata": essential_metadata, } elif result.status == EvaluationStatus.TIMEOUT: logger.warning(f"Evaluation timed out: {result.message}") return { "combined_score": 0.0, "runs_successfully": 0.0, "status": "timeout", "message": result.message or "Evaluation timed out", "problem_id": problem_id, "program_path": program_path, } else: # ERROR status logger.error(f"Evaluation error: {result.message}") return { "combined_score": 0.0, "runs_successfully": 0.0, "status": "error", "message": result.message or "Evaluation failed", "problem_id": problem_id, "program_path": program_path, "logs": result.logs, } except Exception as e: logger.error(f"Evaluation failed completely: {str(e)}") logger.error(traceback.format_exc()) return { "combined_score": 0.0, "runs_successfully": 0.0, "status": "error", "message": str(e), "problem_id": problem_id, "program_path": program_path, "error": str(e), }