Add files using upload-large-folder tool

b0e88cf verified about 1 month ago

6.66 kB

	"""
	Evaluator for Frontier-CS algorithmic problems.

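	This evaluator integrates with SkyDiscover to evaluate generated C++ solutions
	against Frontier-CS benchmark problems using one or more judge servers
	(comma-separated JUDGE_URLS environment variable; defaults to a local judge
	at http://localhost:8081).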
	"""

	import traceback
	from pathlib import Path
	import logging
	import sys
	import os
	import random

	logger = logging.getLogger(__name__)

	# Support multiple judge servers for load balancing.
	# JUDGE_URLS is a comma-separated list of base URLs; surrounding whitespace
	# is stripped and blank entries are dropped.
	DEFAULT_JUDGE_URL = "http://localhost:8081"
	JUDGE_URLS = [
	    url.strip()
	    for url in os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL).split(",")
	    if url.strip()
	]
	if not JUDGE_URLS:
	    # The variable was set but held no usable entry (e.g. "" or ","). An empty
	    # list would make random.choice() raise IndexError on every evaluation,
	    # so fall back to the default server instead.
	    logger.warning(
	        "JUDGE_URLS is set but contains no URLs; falling back to %s",
	        DEFAULT_JUDGE_URL,
	    )
	    JUDGE_URLS = [DEFAULT_JUDGE_URL]

	def get_judge_url() -> str:
	    """Return one judge server URL picked at random from JUDGE_URLS.

	    Choosing randomly on every call spreads evaluations across all of the
	    configured judge servers.
	    """
	    chosen_url = random.choice(JUDGE_URLS)
	    return chosen_url

	# Make the Frontier-CS sources importable: <directory of this file>/Frontier-CS/src
	frontier_cs_path = Path(__file__).resolve().parent.joinpath("Frontier-CS", "src")
	if str(frontier_cs_path) not in sys.path:
	    # Prepend so this directory is searched before the other sys.path entries.
	    sys.path.insert(0, str(frontier_cs_path))

	try:
	    from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator
	    from frontier_cs.runner.base import EvaluationStatus
	except ImportError as import_error:
	    # Log what failed and how to fix it, then re-raise: the module cannot
	    # work without Frontier-CS.
	    for error_line in (
	        f"Failed to import Frontier-CS: {import_error}",
	        "Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS",
	    ):
	        logger.error(error_line)
	    raise

	def _failure_result(status, message, problem_id, program_path, **extra):
	    """Build the zero-score result dict shared by every non-success outcome."""
	    failure = {
	        "combined_score": 0.0,
	        "runs_successfully": 0.0,
	        "status": status,
	        "message": message,
	        "problem_id": problem_id,
	        "program_path": program_path,
	    }
	    failure.update(extra)
	    return failure


	def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict:
	    """
	    Evaluate a C++ solution for a Frontier-CS algorithmic problem.

	    Failures are reported through the returned dict rather than raised: any
	    exception inside the evaluation is logged and converted into an
	    ``"error"`` result.

	    Args:
	        program_path: Path to the C++ solution file.
	        problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.).
	            If None, it is read from the FRONTIER_CS_PROBLEM environment
	            variable, then from the ``frontier_cs_problem`` keyword argument,
	            and finally defaults to "0".
	        **kwargs: Only ``frontier_cs_problem`` is consulted (see above).

	    Returns:
	        dict with evaluation results. Always present:
	        - combined_score: The score from the judge (0.0 on any failure)
	        - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise
	        - status: "success", "timeout" or "error"
	        - message: Any error or status messages
	        - problem_id: The problem ID
	        - program_path: Path to the evaluated program
	        Only on success:
	        - score_unbounded: ``scoreUnbounded`` from the judge metadata, or the
	          bounded score when that value is missing
	        - duration_seconds: Duration reported by the evaluator
	        - metadata: Small subset of the judge metadata
	        Only on a judge-reported error:
	        - logs: Logs attached to the evaluation result
	        Only when an exception was caught:
	        - error: String form of the exception
	    """
	    # Resolve problem_id: explicit argument > environment > kwargs > "0".
	    if problem_id is None:
	        problem_id = os.environ.get("FRONTIER_CS_PROBLEM")
	    if problem_id is None:
	        problem_id = kwargs.get("frontier_cs_problem", "0")

	    logger.info(
	        "Evaluating program %s for Frontier-CS problem %s", program_path, problem_id
	    )

	    try:
	        # Check the solution file first so a bad path fails fast, before a
	        # judge server is selected or an evaluator is built. is_file() also
	        # rejects directories, which could not be read as source code anyway.
	        solution_path = Path(program_path)
	        if not solution_path.is_file():
	            error_msg = f"Solution file not found: {program_path}"
	            logger.error(error_msg)
	            return _failure_result("error", error_msg, problem_id, program_path)

	        # Extract code and remove any EVOLVE-BLOCK markers. Read explicitly
	        # as UTF-8 so the result does not depend on the platform locale.
	        code = (
	            solution_path.read_text(encoding="utf-8")
	            .replace("// EVOLVE-BLOCK-START", "")
	            .replace("// EVOLVE-BLOCK-END", "")
	            .strip()
	        )
	        logger.info("Code extracted from %s", program_path)

	        # Initialize evaluator with judge server (load balanced if multiple configured)
	        judge_url = get_judge_url()
	        logger.info("Using judge server: %s", judge_url)
	        evaluator = FrontierCSEvaluator(
	            backend="docker",
	            judge_url=judge_url,
	            register_cleanup=False,
	        )

	        # Evaluate the solution
	        result = evaluator.evaluate(
	            track="algorithmic",
	            problem_id=problem_id,
	            code=code,
	            backend="docker",
	        )
	        logger.info("Evaluation completed with status: %s", result.status)

	        if result.status == EvaluationStatus.TIMEOUT:
	            logger.warning("Evaluation timed out: %s", result.message)
	            return _failure_result(
	                "timeout",
	                result.message or "Evaluation timed out",
	                problem_id,
	                program_path,
	            )

	        if result.status != EvaluationStatus.SUCCESS:
	            # Every remaining status is treated as an error.
	            logger.error("Evaluation error: %s", result.message)
	            return _failure_result(
	                "error",
	                result.message or "Evaluation failed",
	                problem_id,
	                program_path,
	                logs=result.logs,
	            )

	        # SUCCESS
	        # The full result may be large, so it is only logged at debug level.
	        logger.debug("Raw evaluation result: %s", result)
	        score = result.score
	        metadata = result.metadata or {}

	        # Use unbounded score for optimization (allows >100 if beating reference).
	        # Fall back to the bounded score when the key is absent or None.
	        score_unbounded = metadata.get("scoreUnbounded")
	        if score_unbounded is None:
	            score_unbounded = score
	        logger.info("score=%s, score_unbounded=%s", score, score_unbounded)

	        # Extract only essential metadata (exclude large test case outputs)
	        essential_metadata = {}
	        if metadata:
	            essential_metadata = {
	                key: metadata.get(key)
	                for key in ("status", "passed", "result", "score", "scoreUnbounded")
	            }

	        return {
	            "combined_score": float(score),  # Ensure it's a float
	            "score_unbounded": score_unbounded,
	            "runs_successfully": 1.0,
	            "status": "success",
	            "message": result.message or "Evaluation successful",
	            "problem_id": problem_id,
	            "program_path": program_path,
	            "duration_seconds": result.duration_seconds,
	            "metadata": essential_metadata,
	        }

	    except Exception as e:
	        # Top-level boundary: callers expect a result dict, never an exception.
	        # logger.exception records the message together with the traceback.
	        logger.exception("Evaluation failed completely: %s", e)
	        return _failure_result(
	            "error", str(e), problem_id, program_path, error=str(e)
	        )