"""
Bridging evaluator for Frontier-CS algorithmic problems.

Translates between ShinkaEvolve's evaluation interface and Frontier-CS's
go-judge based evaluation system. Works for all 172 algorithmic problems
via the `problem_id` parameter.

Usage as evaluator_module:
    evaluator_module: "tasks.frontier_cs_entry.evaluate_algorithmic"
    evaluator_function: "main"
    evaluator_kwargs: {"problem_id": "0"}

Direct usage:
    python -m tasks.frontier_cs_entry.evaluate_algorithmic \\
        --program-path solution.cpp --results-dir /tmp/results --problem-id 0
"""

from __future__ import annotations

import json
import logging
import os
import sys
from pathlib import Path
from typing import Any, Dict, List

logger = logging.getLogger(__name__)

# Default paths
DEFAULT_JUDGE_URL = "http://localhost:8081"
DEFAULT_FRONTIER_CS_DIR = "tasks/Frontier-CS"


def _ensure_frontier_cs_importable(frontier_cs_dir: str) -> None:
    """
    Make Frontier-CS runner classes importable without triggering the full
    package __init__.py (which pulls in google.generativeai and other heavy deps).

    We register lightweight stub modules so that only the runner subpackage
    is actually loaded.
    """
    import types

    src_dir = str(Path(frontier_cs_dir).resolve() / "src")
    if src_dir not in sys.path:
        sys.path.insert(0, src_dir)

    # If already set up, skip
    if "frontier_cs" in sys.modules and hasattr(sys.modules["frontier_cs"], "__path__"):
        return

    fc_src = Path(src_dir) / "frontier_cs"

    # Register empty frontier_cs package (bypass __init__.py)
    fc = types.ModuleType("frontier_cs")
    fc.__path__ = [str(fc_src)]
    sys.modules["frontier_cs"] = fc

    # Stub frontier_cs.gen to avoid importing LLM interface
    fc_gen = types.ModuleType("frontier_cs.gen")
    fc_gen.__path__ = [str(fc_src / "gen")]
    sys.modules["frontier_cs.gen"] = fc_gen

    # Provide the single constant that base.py needs
    fc_gen_sf = types.ModuleType("frontier_cs.gen.solution_format")
    fc_gen_sf.FAILED_EXTENSION = "FAILED"
    sys.modules["frontier_cs.gen.solution_format"] = fc_gen_sf


def _load_problem_statement(frontier_cs_dir: str, problem_id: str) -> str:
    """Load problem statement text. Returns empty string if not found."""
    statement_path = (
        Path(frontier_cs_dir) / "algorithmic" / "problems" / str(problem_id) / "statement.txt"
    )
    if statement_path.exists():
        try:
            return statement_path.read_text(encoding="utf-8")
        except Exception:
            pass
    return ""


def _format_case_feedback(cases: List[Dict[str, Any]], time_limit_ns: int = 2_000_000_000) -> str:
    """Format per-case results into readable text feedback."""
    if not cases:
        return "No per-case data available."

    lines = []
    weak_cases = []

    for i, case in enumerate(cases):
        ratio = case.get("scoreRatio", 0.0)
        time_ns = case.get("time", 0)
        time_ms = time_ns / 1_000_000 if time_ns else 0
        status = case.get("status", "Unknown")
        memory_kb = case.get("memory", 0) / 1024 if case.get("memory") else 0

        # Determine status indicator
        if ratio >= 1.0:
            indicator = "OK"
        elif ratio > 0:
            indicator = "PARTIAL"
        else:
            indicator = "FAIL"

        # Flag non-perfect cases that used more than 80% of the time limit
        near_timeout = ""
        if time_ns > time_limit_ns * 0.8 and ratio < 1.0:
            near_timeout = " [near timeout]"

        lines.append(
            f"  Case {i + 1}: ratio={ratio:.4f} time={time_ms:.0f}ms "
            f"mem={memory_kb:.0f}KB {indicator}{near_timeout}"
        )

        if ratio < 1.0:
            weak_cases.append((i + 1, ratio, status))

    result = "\n".join(lines)

    if weak_cases:
        weak_summary = ", ".join(
            f"case {idx} ({r:.2f})" for idx, r, _ in sorted(weak_cases, key=lambda x: x[1])
        )
        result += f"\nWeakest: {weak_summary}"

    return result


def _build_text_feedback(
    problem_id: str,
    result_metadata: Dict[str, Any],
    score_bounded: float,
    score_unbounded: float,
    statement_summary: str = "",
    error_msg: str = "",
) -> str:
    """Build comprehensive text feedback for the LLM."""
    parts = []

    # Error information (compilation failure, runtime error, etc.)
    if error_msg:
        parts.append(f"Error: {error_msg}")

    # Per-case analysis
    cases = result_metadata.get("cases", [])
    if cases:
        n_cases = len(cases)
        n_passed = sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0)
        parts.append(f"Problem {problem_id} | {n_cases} test cases | {n_passed}/{n_cases} perfect")
        parts.append(_format_case_feedback(cases))

    # Score summary
    parts.append(f"Score: {score_bounded:.2f}/100 (unbounded: {score_unbounded:.2f})")

    # Problem statement (truncated for context)
    if statement_summary:
        # Keep first 2000 chars of statement to avoid bloating the prompt
        truncated = statement_summary[:2000]
        if len(statement_summary) > 2000:
            truncated += "\n[... truncated]"
        parts.append(f"\n--- Problem Statement ---\n{truncated}")

    return "\n".join(parts)


def main(
    program_path: str,
    results_dir: str,
    problem_id: str = "",
    judge_url: str = "",
    frontier_cs_dir: str = "",
) -> Dict[str, Any]:
    """
    Evaluate a C++ solution for a Frontier-CS algorithmic problem.

    Parameters can also be set via environment variables (env takes precedence
    over defaults, explicit args take precedence over env):
        FRONTIER_CS_PROBLEM_ID, FRONTIER_CS_JUDGE_URL, FRONTIER_CS_DIR

    This is the bridging evaluator that translates between ShinkaEvolve's
    evaluation interface and Frontier-CS's go-judge system.

    Args:
        program_path: Path to the C++ solution file.
        results_dir: Directory to write metrics.json and correct.json.
        problem_id: Frontier-CS problem ID (e.g., "0", "1", "42").
        judge_url: URL of the go-judge server.
        frontier_cs_dir: Path to the Frontier-CS repository root.

    Returns:
        Dict with combined_score, public, private, text_feedback, correct.
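
        Example return value (illustrative only; actual contents depend on the
        judge run):
            {
                "combined_score": 87.5,   # same value as score_unbounded
                "correct": True,
                "public": {"score_bounded": 87.5, "passed": False, "n_cases": 20, ...},
                "private": {...},         # full go-judge metadata
                "text_feedback": "Problem 0 | 20 test cases | 17/20 perfect ...",
            }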
    """
    # Resolve from env vars when args are empty (local scheduler path)
    problem_id = problem_id or os.environ.get("FRONTIER_CS_PROBLEM_ID", "0")
    judge_url = judge_url or os.environ.get("FRONTIER_CS_JUDGE_URL", DEFAULT_JUDGE_URL)
    frontier_cs_dir = frontier_cs_dir or os.environ.get("FRONTIER_CS_DIR", DEFAULT_FRONTIER_CS_DIR)

    results_dir_path = Path(results_dir)
    results_dir_path.mkdir(parents=True, exist_ok=True)

    # Resolve frontier_cs_dir relative to project root if needed
    if not Path(frontier_cs_dir).is_absolute():
        # Try relative to CWD first, then relative to the project root (two levels above this file)
        if not Path(frontier_cs_dir).exists():
            project_root = Path(__file__).resolve().parents[2]
            frontier_cs_dir = str(project_root / frontier_cs_dir)

    # Load problem statement for feedback context
    statement = _load_problem_statement(frontier_cs_dir, problem_id)

    # Read the C++ code
    code_path = Path(program_path)
    if not code_path.exists():
        return _save_error_result(
            results_dir_path,
            f"Solution file not found: {program_path}",
            problem_id,
            statement,
        )

    code = code_path.read_text(encoding="utf-8")
    if not code.strip():
        return _save_error_result(
            results_dir_path,
            "Empty solution file",
            problem_id,
            statement,
        )

    # Import and call Frontier-CS evaluator
    _ensure_frontier_cs_importable(frontier_cs_dir)
    try:
        from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner
        from frontier_cs.runner.base import EvaluationStatus
    except ImportError as e:
        return _save_error_result(
            results_dir_path,
            f"Failed to import frontier_cs: {e}. "
            f"Ensure Frontier-CS is installed (pip install -e {frontier_cs_dir})",
            problem_id,
            statement,
        )

    # Run evaluation via go-judge
    try:
        runner = AlgorithmicLocalRunner(judge_url=judge_url)
        result = runner.evaluate(str(problem_id), code)
    except Exception as e:
        return _save_error_result(
            results_dir_path,
            f"go-judge evaluation failed: {e}",
            problem_id,
            statement,
        )

    # Translate EvaluationResult to ShinkaEvolve format
    if result.status == EvaluationStatus.SUCCESS:
        metadata = result.metadata or {}
        score_bounded = result.score or 0.0
        score_unbounded = result.score_unbounded if result.score_unbounded is not None else score_bounded
        passed = metadata.get("passed", False)
        cases = metadata.get("cases", [])

        # Build public metrics (visible to LLM)
        public_metrics = {
            "score_bounded": score_bounded,
            "score_unbounded": score_unbounded,
            "passed": passed,
            "n_cases": len(cases),
            "n_perfect": sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0),
        }

        # Add per-case ratios and times (first 20 cases only, to avoid bloating the metrics)
        for i, case in enumerate(cases[:20]):
            public_metrics[f"case_{i}_ratio"] = round(case.get("scoreRatio", 0.0), 4)
            time_ns = case.get("time", 0)
            if time_ns:
                public_metrics[f"case_{i}_time_ms"] = round(time_ns / 1_000_000, 1)

        text_feedback = _build_text_feedback(
            problem_id=problem_id,
            result_metadata=metadata,
            score_bounded=score_bounded,
            score_unbounded=score_unbounded,
            statement_summary=statement,
        )

        metrics = {
            "combined_score": score_unbounded,
            # Any code that compiles and runs counts as correct.
            # "passed" (all test cases perfect) is too strict for optimization problems.
            "correct": True,
            "public": public_metrics,
            "private": metadata,
            "text_feedback": text_feedback,
        }

    elif result.status == EvaluationStatus.TIMEOUT:
        metrics = _build_error_metrics(
            problem_id=problem_id,
            error_msg=f"Evaluation timed out: {result.message}",
            statement=statement,
        )

    else:
        # ERROR or SKIPPED
        error_msg = result.message or f"Evaluation failed with status: {result.status.value}"
        # Include logs for compilation errors
        if result.logs:
            error_msg += f"\n--- Logs ---\n{result.logs[:1000]}"
        metrics = _build_error_metrics(
            problem_id=problem_id,
            error_msg=error_msg,
            statement=statement,
        )

    # Save results
    _save_results(results_dir_path, metrics)

    logger.info(
        f"Frontier-CS Problem {problem_id}: "
        f"score={metrics.get('combined_score', 0):.2f}, "
        f"correct={metrics.get('correct', False)}"
    )

    return metrics


def _build_error_metrics(
    problem_id: str, error_msg: str, statement: str = ""
) -> Dict[str, Any]:
    """Build metrics dict for error cases."""
    return {
        "combined_score": 0.0,
        "correct": False,
        "public": {"error": error_msg[:500]},
        "private": {},
        "text_feedback": _build_text_feedback(
            problem_id=problem_id,
            result_metadata={},
            score_bounded=0.0,
            score_unbounded=0.0,
            statement_summary=statement,
            error_msg=error_msg,
        ),
    }


def _save_error_result(
    results_dir: Path, error_msg: str, problem_id: str, statement: str = ""
) -> Dict[str, Any]:
    """Save error result and return metrics dict."""
    metrics = _build_error_metrics(problem_id, error_msg, statement)
    _save_results(results_dir, metrics)
    return metrics


def _save_results(results_dir: Path, metrics: Dict[str, Any]) -> None:
    """Write metrics.json and correct.json in ShinkaEvolve format."""
    metrics_path = results_dir / "metrics.json"
    correct_path = results_dir / "correct.json"

    # metrics.json
    serializable_metrics = {
        "combined_score": metrics.get("combined_score", 0.0),
        "public": metrics.get("public", {}),
        "private": {},  # Don't serialize full go-judge metadata (can be huge)
        "text_feedback": metrics.get("text_feedback", ""),
    }
    with open(metrics_path, "w") as f:
        json.dump(serializable_metrics, f, indent=2, default=str)

    # correct.json
    correct_data = {
        "correct": metrics.get("correct", False),
        "error": None if metrics.get("correct") else metrics.get("public", {}).get("error"),
    }
    with open(correct_path, "w") as f:
        json.dump(correct_data, f, indent=2)


# --- CLI entry point ---
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Frontier-CS algorithmic evaluator bridge")
    parser.add_argument("--program_path", required=True, help="Path to C++ solution")
    parser.add_argument("--results_dir", required=True, help="Output directory for metrics")
    parser.add_argument("--problem-id", default="", help="Frontier-CS problem ID (falls back to FRONTIER_CS_PROBLEM_ID env var, then '0')")
    parser.add_argument("--judge-url", default=DEFAULT_JUDGE_URL, help="go-judge URL")
    parser.add_argument("--frontier-cs-dir", default=DEFAULT_FRONTIER_CS_DIR)

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    result = main(
        program_path=args.program_path,
        results_dir=args.results_dir,
        problem_id=args.problem_id,
        judge_url=args.judge_url,
        frontier_cs_dir=args.frontier_cs_dir,
    )
    print(f"Score: {result.get('combined_score', 0):.2f}")