# Source: sky2/benchmarks/frontier-cs-eval/evaluator.py
# Uploader: JustinTX
# Commit message: "Add files using upload-large-folder tool"
# Commit: b0e88cf (verified)
"""
Evaluator for Frontier-CS algorithmic problems.
This evaluator integrates with SkyDiscover to evaluate generated C++ solutions
against Frontier-CS benchmark problems using the local judge server.
"""
import traceback
from pathlib import Path
import logging
import sys
import os
import random
logger = logging.getLogger(__name__)

# Support multiple judge servers for load balancing.
# JUDGE_URLS is a comma-separated list of judge server base URLs; surrounding
# whitespace and empty entries are ignored.
DEFAULT_JUDGE_URL = "http://localhost:8081"
JUDGE_URLS = [
    url.strip()
    for url in os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL).split(",")
    if url.strip()
]
if not JUDGE_URLS:
    # The variable was set but held no usable entry (e.g. "" or " , ").
    # Fall back to the default so URL selection never sees an empty list.
    logger.warning(
        "JUDGE_URLS contains no usable URLs; falling back to %s",
        DEFAULT_JUDGE_URL,
    )
    JUDGE_URLS = [DEFAULT_JUDGE_URL]
def get_judge_url() -> str:
    """Return one of the configured judge server URLs, chosen at random.

    A fresh random pick from ``JUDGE_URLS`` is made on every call, which
    spreads evaluation requests across the configured judge servers.
    """
    candidates = JUDGE_URLS
    return random.choice(candidates)
# Make the Frontier-CS sources importable: they are expected in the
# "Frontier-CS/src" directory that sits next to this file.
frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
if sys.path.count(str(frontier_cs_path)) == 0:
    # Prepend so this checkout is searched before other sys.path entries.
    sys.path.insert(0, str(frontier_cs_path))

# Import the Frontier-CS entry points. On failure, log what went wrong and how
# to fix it, then re-raise so the module does not load without them.
try:
    from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator
    from frontier_cs.runner.base import EvaluationStatus
except ImportError as e:
    logger.error(f"Failed to import Frontier-CS: {e}")
    logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
    raise
def _failure_result(status: str, message: str, problem_id, program_path, **extra) -> dict:
    """Build the zero-score result dict shared by every non-success outcome."""
    failure = {
        "combined_score": 0.0,
        "runs_successfully": 0.0,
        "status": status,
        "message": message,
        "problem_id": problem_id,
        "program_path": program_path,
    }
    failure.update(extra)
    return failure


def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict:
    """
    Evaluate a C++ solution for a Frontier-CS algorithmic problem.

    Args:
        program_path: Path to the C++ solution file.
        problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.).
            If None, it is read from the FRONTIER_CS_PROBLEM environment
            variable, then from the ``frontier_cs_problem`` keyword
            argument, and finally defaults to "0".
        **kwargs: Only ``frontier_cs_problem`` is consulted (see above).

    Returns:
        dict with evaluation results. Always present:
            - combined_score: The score from the judge (0.0 on any failure)
            - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise
            - status: "success", "timeout" or "error"
            - message: Any error or status messages
            - problem_id: The problem ID
            - program_path: Path to the evaluated program
        On success only:
            - score_unbounded: ``scoreUnbounded`` from the result metadata,
              or the plain score when it is missing or None
            - duration_seconds: Duration reported by the evaluator
            - metadata: Trimmed evaluation metadata
        On failure, depending on the cause:
            - logs: Evaluator logs (evaluator reported an error status)
            - error: Exception text (an exception was raised)

    This function does not raise for evaluation problems: any exception
    raised inside the evaluation is logged and reported as an "error" result.
    """
    # Resolve the problem ID: explicit argument > environment > kwargs > "0".
    if problem_id is None:
        problem_id = os.environ.get('FRONTIER_CS_PROBLEM')
    if problem_id is None:
        problem_id = kwargs.get('frontier_cs_problem', '0')

    logger.info(f"Evaluating program {program_path} for Frontier-CS problem {problem_id}")

    try:
        # Validate the solution file before touching the judge, so a bad path
        # is reported as such instead of as an unrelated judge/setup failure.
        solution_path = Path(program_path)
        if not solution_path.exists():
            error_msg = f"Solution file not found: {program_path}"
            logger.error(error_msg)
            return _failure_result("error", error_msg, problem_id, program_path)

        # Extract code and remove any EVOLVE-BLOCK markers. The encoding is
        # explicit so decoding does not depend on the platform default.
        code = (
            solution_path.read_text(encoding="utf-8")
            .replace("// EVOLVE-BLOCK-START", "")
            .replace("// EVOLVE-BLOCK-END", "")
            .strip()
        )
        logger.info(f"Code extracted from {program_path}")

        # Initialize evaluator with judge server (load balanced if multiple configured)
        judge_url = get_judge_url()
        logger.info(f"Using judge server: {judge_url}")
        evaluator = FrontierCSEvaluator(
            backend="docker",
            judge_url=judge_url,
            register_cleanup=False,
        )

        # Evaluate the solution
        result = evaluator.evaluate(
            track="algorithmic",
            problem_id=problem_id,
            code=code,
            backend="docker",
        )
        logger.info(f"Evaluation completed with status: {result.status}")

        # Process result
        if result.status == EvaluationStatus.SUCCESS:
            # The raw result may be large, so it is only logged at debug level.
            logger.debug("Raw evaluation result: %s", result)
            score = result.score
            metadata = result.metadata or {}

            # Use unbounded score for optimization (allows >100 if beating
            # reference). Fall back to the plain score when the key is
            # missing or explicitly None.
            score_unbounded = metadata.get('scoreUnbounded')
            if score_unbounded is None:
                score_unbounded = score
            logger.info(f"score={score}, score_unbounded={score_unbounded}")

            # Extract only essential metadata (exclude large test case outputs)
            essential_metadata = {}
            if metadata:
                essential_metadata = {
                    key: metadata.get(key)
                    for key in ("status", "passed", "result", "score", "scoreUnbounded")
                }

            return {
                "combined_score": float(score),  # Ensure it's a float
                "score_unbounded": score_unbounded,
                "runs_successfully": 1.0,
                "status": "success",
                "message": result.message or "Evaluation successful",
                "problem_id": problem_id,
                "program_path": program_path,
                "duration_seconds": result.duration_seconds,
                "metadata": essential_metadata,
            }

        if result.status == EvaluationStatus.TIMEOUT:
            logger.warning(f"Evaluation timed out: {result.message}")
            return _failure_result(
                "timeout",
                result.message or "Evaluation timed out",
                problem_id,
                program_path,
            )

        # Any other status is treated as an evaluator-reported error.
        logger.error(f"Evaluation error: {result.message}")
        return _failure_result(
            "error",
            result.message or "Evaluation failed",
            problem_id,
            program_path,
            logs=result.logs,
        )

    except Exception as e:
        # Top-level boundary: report the failure as a result, never raise.
        logger.error(f"Evaluation failed completely: {str(e)}")
        logger.error(traceback.format_exc())
        return _failure_result("error", str(e), problem_id, program_path, error=str(e))