File size: 6,661 Bytes
b0e88cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Evaluator for Frontier-CS algorithmic problems.

This evaluator integrates with SkyDiscover to evaluate generated C++ solutions
against Frontier-CS benchmark problems using the local judge server.
"""

import traceback
from pathlib import Path
import logging
import sys
import os
import random

logger = logging.getLogger(__name__)

# Support multiple judge servers for load balancing.
# JUDGE_URLS is a comma-separated list of judge base URLs; when it is unset
# the single DEFAULT_JUDGE_URL is used.
DEFAULT_JUDGE_URL = "http://localhost:8081"


def parse_judge_urls(raw: str) -> list:
    """Split a comma-separated judge URL string into a clean, non-empty list.

    Args:
        raw: Comma-separated URLs, e.g. "http://a:8081, http://b:8081".

    Returns:
        The URLs with surrounding whitespace stripped and empty entries
        dropped. If nothing usable remains (empty string, only commas or
        whitespace), a one-element list holding DEFAULT_JUDGE_URL is returned
        so that picking a judge can never fail on an empty list.
    """
    urls = [url.strip() for url in raw.split(",") if url.strip()]
    if not urls:
        logger.warning(
            "JUDGE_URLS contained no usable URL; falling back to %s",
            DEFAULT_JUDGE_URL,
        )
        return [DEFAULT_JUDGE_URL]
    return urls


JUDGE_URLS = parse_judge_urls(os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL))

def get_judge_url() -> str:
    """Return one entry of JUDGE_URLS, picked at random.

    Selecting randomly on every call spreads evaluations across all
    configured judge servers.
    """
    chosen_url = random.choice(JUDGE_URLS)
    return chosen_url

# Make the Frontier-CS sources importable: they are expected in a
# "Frontier-CS/src" directory located next to this file.
frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
if not (str(frontier_cs_path) in sys.path):
    # Prepend so this checkout is found before any other installed copy.
    sys.path.insert(0, str(frontier_cs_path))

# Import the Frontier-CS pieces used below; log a hint and re-raise if they
# are not available, since nothing in this module works without them.
try:
    from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator
    from frontier_cs.runner.base import EvaluationStatus
except ImportError as import_error:
    logger.error(f"Failed to import Frontier-CS: {import_error}")
    logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
    raise

def _failure_result(status, message, problem_id, program_path, **extra) -> dict:
    """Build the zero-score result dict shared by every failure path."""
    failure = {
        "combined_score": 0.0,
        "runs_successfully": 0.0,
        "status": status,
        "message": message,
        "problem_id": problem_id,
        "program_path": program_path,
    }
    failure.update(extra)
    return failure


def _success_result(result, problem_id, program_path) -> dict:
    """Build the result dict for an evaluation that finished with SUCCESS."""
    metadata = result.metadata or {}
    score = float(result.score)

    # Use unbounded score for optimization (allows >100 if beating reference).
    # Fall back to the bounded score when the judge did not report one, also
    # when the key is present but holds None.
    score_unbounded = metadata.get("scoreUnbounded")
    if score_unbounded is None:
        score_unbounded = score
    logger.info("score=%s, score_unbounded=%s", score, score_unbounded)

    # Extract only essential metadata (exclude large test case outputs).
    essential_keys = ("status", "passed", "result", "score", "scoreUnbounded")
    essential_metadata = (
        {key: metadata.get(key) for key in essential_keys} if metadata else {}
    )

    return {
        "combined_score": score,
        "score_unbounded": score_unbounded,
        "runs_successfully": 1.0,
        "status": "success",
        "message": result.message or "Evaluation successful",
        "problem_id": problem_id,
        "program_path": program_path,
        "duration_seconds": result.duration_seconds,
        "metadata": essential_metadata,
    }


def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict:
    """
    Evaluate a C++ solution for a Frontier-CS algorithmic problem.

    The solution file is read first (EVOLVE-BLOCK marker comments are removed),
    and only then is a judge server picked and the code submitted. Any
    exception raised along the way is caught and reported as an "error"
    result, so this function does not raise for evaluation failures.

    Args:
        program_path: Path to the C++ solution file
        problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.)
                    If None, it is read from the FRONTIER_CS_PROBLEM env var,
                    then from the ``frontier_cs_problem`` keyword argument,
                    and finally defaults to "0".
        **kwargs: Only ``frontier_cs_problem`` is consulted (see above).

    Returns:
        dict with evaluation results:
            - combined_score: The score from the judge (higher is better),
              0.0 on any failure
            - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise
            - status: "success", "timeout" or "error"
            - message: Any error or status messages
            - problem_id: The problem ID
            - program_path: Path to the evaluated program
            - score_unbounded: Unbounded score if reported, else the score
              (success only)
            - duration_seconds: Duration reported by the evaluator
              (success only)
            - metadata: Reduced evaluation metadata (success only)
            - logs: Evaluator logs (judge-reported error only)
            - error: Exception text (unexpected exception only)
    """
    # Get problem_id from parameter, environment, or kwargs
    if problem_id is None:
        problem_id = os.environ.get("FRONTIER_CS_PROBLEM")
        if problem_id is None:
            problem_id = kwargs.get("frontier_cs_problem", "0")

    logger.info(
        "Evaluating program %s for Frontier-CS problem %s", program_path, problem_id
    )

    try:
        # Validate and read the solution before touching the judge, so a bad
        # path never costs an evaluator/backend initialisation.
        solution_path = Path(program_path)
        if not solution_path.exists():
            error_msg = f"Solution file not found: {program_path}"
            logger.error(error_msg)
            return _failure_result("error", error_msg, problem_id, program_path)

        # Extract code and remove any EVOLVE-BLOCK markers. The encoding is
        # pinned so the result does not depend on the host locale.
        code = (
            solution_path.read_text(encoding="utf-8")
            .replace("// EVOLVE-BLOCK-START", "")
            .replace("// EVOLVE-BLOCK-END", "")
            .strip()
        )
        logger.info("Code extracted from %s", program_path)

        # Initialize evaluator with judge server (load balanced if multiple configured)
        judge_url = get_judge_url()
        logger.info("Using judge server: %s", judge_url)
        evaluator = FrontierCSEvaluator(
            backend="docker",
            judge_url=judge_url,
            register_cleanup=False,
        )

        # Evaluate the solution
        result = evaluator.evaluate(
            track="algorithmic",
            problem_id=problem_id,
            code=code,
            backend="docker",
        )
        logger.info("Evaluation completed with status: %s", result.status)
        # Full result goes to the debug log only; it can be large.
        logger.debug("Raw evaluation result: %s", result)

        if result.status == EvaluationStatus.SUCCESS:
            return _success_result(result, problem_id, program_path)

        if result.status == EvaluationStatus.TIMEOUT:
            logger.warning("Evaluation timed out: %s", result.message)
            return _failure_result(
                "timeout",
                result.message or "Evaluation timed out",
                problem_id,
                program_path,
            )

        # Every other status is reported as an error, with the evaluator logs.
        logger.error("Evaluation error: %s", result.message)
        return _failure_result(
            "error",
            result.message or "Evaluation failed",
            problem_id,
            program_path,
            logs=result.logs,
        )

    except Exception as e:
        # Top-level boundary: report the failure as a result instead of
        # raising. logger.exception records the traceback as well.
        logger.exception("Evaluation failed completely: %s", e)
        return _failure_result(
            "error", str(e), problem_id, program_path, error=str(e)
        )