File size: 6,661 Bytes
"""
Evaluator for Frontier-CS algorithmic problems.

This evaluator integrates with SkyDiscover to evaluate generated C++ solutions
against Frontier-CS benchmark problems using the local judge server.
"""
import traceback
from pathlib import Path
import logging
import sys
import os
import random
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Judge servers available for load balancing. The JUDGE_URLS environment
# variable holds a comma-separated list; when it is unset, the single local
# default below is used. Entries are stripped and empty ones are dropped.
DEFAULT_JUDGE_URL = "http://localhost:8081"
_raw_judge_urls = os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL)
JUDGE_URLS = list(
    filter(None, (entry.strip() for entry in _raw_judge_urls.split(",")))
)
def get_judge_url() -> str:
    """Return one judge server URL, chosen at random for load balancing.

    Returns:
        A randomly selected entry of ``JUDGE_URLS``, or ``DEFAULT_JUDGE_URL``
        when that list is empty.
    """
    # JUDGE_URLS is built by dropping blank entries, so an environment value
    # such as "" or " , " leaves it empty; random.choice would then raise
    # IndexError. Fall back to the default server instead.
    if not JUDGE_URLS:
        return DEFAULT_JUDGE_URL
    return random.choice(JUDGE_URLS)
# Make the Frontier-CS sources importable: they are looked up in the
# "Frontier-CS/src" directory that sits next to this file.
frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
_frontier_cs_entry = str(frontier_cs_path)
if _frontier_cs_entry not in sys.path:
    # Inserted at the front of sys.path so it is searched first.
    sys.path.insert(0, _frontier_cs_entry)

# Import the Frontier-CS evaluator; on failure, log a hint and re-raise so the
# module cannot be used without it.
try:
    from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator
    from frontier_cs.runner.base import EvaluationStatus
except ImportError as import_error:
    logger.error(f"Failed to import Frontier-CS: {import_error}")
    logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
    raise
# Metadata keys copied into the success result; everything else in the judge
# metadata (e.g. large per-test-case outputs) is intentionally left out.
_ESSENTIAL_METADATA_KEYS = ("status", "passed", "result", "score", "scoreUnbounded")


def _failure_result(status, message, problem_id, program_path, **extra):
    """Build the result dict shared by every non-success outcome."""
    failure = {
        "combined_score": 0.0,
        "runs_successfully": 0.0,
        "status": status,
        "message": message,
        "problem_id": problem_id,
        "program_path": program_path,
    }
    failure.update(extra)
    return failure


def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict:
    """
    Evaluate a C++ solution for a Frontier-CS algorithmic problem.

    Args:
        program_path: Path to the C++ solution file.
        problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.).
            If None, it is read from the FRONTIER_CS_PROBLEM environment
            variable, then from the ``frontier_cs_problem`` keyword argument,
            and finally defaults to "0".
        **kwargs: Only ``frontier_cs_problem`` is consulted (see above).

    Returns:
        dict with evaluation results. Always present:
            - combined_score: The judge's score as a float (0.0 on failure)
            - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise
            - status: "success", "timeout" or "error"
            - message: Any error or status message
            - problem_id: The problem ID that was used
            - program_path: Path to the evaluated program
        Only on success:
            - score_unbounded: ``scoreUnbounded`` from the judge metadata,
              falling back to the plain score when missing
            - duration_seconds: Duration reported by the evaluator
            - metadata: Reduced copy of the judge metadata
        Only when the judge reports an error:
            - logs: Logs reported by the evaluator
        Only when an unexpected exception was caught:
            - error: String form of the exception

    This function does not raise for evaluation problems: any exception
    raised while reading the file or talking to the judge is logged and
    reported as an "error" result.
    """
    # Resolve the problem ID: explicit argument > environment > kwargs > "0".
    if problem_id is None:
        problem_id = os.environ.get("FRONTIER_CS_PROBLEM")
    if problem_id is None:
        problem_id = kwargs.get("frontier_cs_problem", "0")

    logger.info(f"Evaluating program {program_path} for Frontier-CS problem {problem_id}")

    try:
        # Check the solution file first so that a missing file does not cost a
        # judge selection and an evaluator/backend construction.
        solution_path = Path(program_path)
        if not solution_path.exists():
            error_msg = f"Solution file not found: {program_path}"
            logger.error(error_msg)
            return _failure_result("error", error_msg, problem_id, program_path)

        # Read the code and drop the EVOLVE-BLOCK marker comments. The encoding
        # is explicit so the result does not depend on the platform locale.
        code = (
            solution_path.read_text(encoding="utf-8")
            .replace("// EVOLVE-BLOCK-START", "")
            .replace("// EVOLVE-BLOCK-END", "")
            .strip()
        )
        logger.info(f"Code extracted from {program_path}")

        # Initialize evaluator with judge server (load balanced if multiple configured)
        judge_url = get_judge_url()
        logger.info(f"Using judge server: {judge_url}")
        evaluator = FrontierCSEvaluator(
            backend="docker",
            judge_url=judge_url,
            register_cleanup=False,
        )

        result = evaluator.evaluate(
            track="algorithmic",
            problem_id=problem_id,
            code=code,
            backend="docker",
        )
        logger.info(f"Evaluation completed with status: {result.status}")

        if result.status == EvaluationStatus.SUCCESS:
            # The full result may be large, so it goes to the debug log rather
            # than stdout.
            logger.debug(f"Evaluation result: {result}")

            score = result.score
            metadata = result.metadata or {}

            # score_unbounded is reported alongside combined_score; according
            # to the original note it may exceed 100 when the solution beats
            # the reference. A missing or None value falls back to the score.
            score_unbounded = metadata.get("scoreUnbounded")
            if score_unbounded is None:
                score_unbounded = score
            logger.info(f"score={score}, score_unbounded={score_unbounded}")

            # Keep only the essential metadata (exclude large test case outputs).
            essential_metadata = (
                {key: metadata.get(key) for key in _ESSENTIAL_METADATA_KEYS}
                if metadata
                else {}
            )

            return {
                "combined_score": float(score),  # Ensure it's a float
                "score_unbounded": score_unbounded,
                "runs_successfully": 1.0,
                "status": "success",
                "message": result.message or "Evaluation successful",
                "problem_id": problem_id,
                "program_path": program_path,
                "duration_seconds": result.duration_seconds,
                "metadata": essential_metadata,
            }

        if result.status == EvaluationStatus.TIMEOUT:
            logger.warning(f"Evaluation timed out: {result.message}")
            return _failure_result(
                "timeout",
                result.message or "Evaluation timed out",
                problem_id,
                program_path,
            )

        # Any other status is treated as an error reported by the judge.
        logger.error(f"Evaluation error: {result.message}")
        return _failure_result(
            "error",
            result.message or "Evaluation failed",
            problem_id,
            program_path,
            logs=result.logs,
        )

    except Exception as e:
        # Top-level boundary: never let an evaluation failure propagate to the
        # caller; log it with its traceback and report it as an error result.
        logger.exception(f"Evaluation failed completely: {str(e)}")
        return _failure_result(
            "error",
            str(e),
            problem_id,
            program_path,
            error=str(e),
        )
|