| """ |
| Bridging evaluator for Frontier-CS algorithmic problems. |
| |
| Translates between ShinkaEvolve's evaluation interface and Frontier-CS's |
| go-judge based evaluation system. Works for all 172 algorithmic problems |
| via the `problem_id` parameter. |
| |
| Usage as evaluator_module: |
| evaluator_module: "tasks.frontier_cs_entry.evaluate_algorithmic" |
| evaluator_function: "main" |
| evaluator_kwargs: {"problem_id": "0"} |
| |
| Direct usage: |
| python -m tasks.frontier_cs_entry.evaluate_algorithmic \\ |
| --program-path solution.cpp --results-dir /tmp/results --problem-id 0 |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import logging |
| import os |
| import sys |
| from pathlib import Path |
| from typing import Any, Dict, List, Optional |
|
|
| logger = logging.getLogger(__name__) |
|
|
| |
| DEFAULT_JUDGE_URL = "http://localhost:8081" |
| DEFAULT_FRONTIER_CS_DIR = "tasks/Frontier-CS" |


def _ensure_frontier_cs_importable(frontier_cs_dir: str) -> None:
    """
    Make Frontier-CS runner classes importable without triggering the full
    package __init__.py (which pulls in google.generativeai and other heavy deps).

    We register lightweight stub modules so that only the runner subpackage
    is actually loaded.
    """
    import types

    src_dir = str(Path(frontier_cs_dir).resolve() / "src")
    if src_dir not in sys.path:
        sys.path.insert(0, src_dir)

    # Nothing to do if the real package (or a previous stub) is already registered.
    if "frontier_cs" in sys.modules and hasattr(sys.modules["frontier_cs"], "__path__"):
        return

    fc_src = Path(src_dir) / "frontier_cs"

    # Stub top-level package pointing at the real source tree.
    fc = types.ModuleType("frontier_cs")
    fc.__path__ = [str(fc_src)]
    sys.modules["frontier_cs"] = fc

    # Stub the gen subpackage; its submodules remain importable via __path__.
    fc_gen = types.ModuleType("frontier_cs.gen")
    fc_gen.__path__ = [str(fc_src / "gen")]
    sys.modules["frontier_cs.gen"] = fc_gen

    # Minimal stand-in for solution_format exposing just FAILED_EXTENSION.
    fc_gen_sf = types.ModuleType("frontier_cs.gen.solution_format")
    fc_gen_sf.FAILED_EXTENSION = "FAILED"
    sys.modules["frontier_cs.gen.solution_format"] = fc_gen_sf


def _load_problem_statement(frontier_cs_dir: str, problem_id: str) -> str:
    """Load problem statement text. Returns empty string if not found."""
    statement_path = (
        Path(frontier_cs_dir) / "algorithmic" / "problems" / str(problem_id) / "statement.txt"
    )
    if statement_path.exists():
        try:
            return statement_path.read_text(encoding="utf-8")
        except Exception:
            pass
    return ""


def _format_case_feedback(cases: List[Dict[str, Any]], time_limit_ns: int = 2_000_000_000) -> str:
    """Format per-case results into readable text feedback."""
    if not cases:
        return "No per-case data available."

    lines = []
    weak_cases = []

    for i, case in enumerate(cases):
        ratio = case.get("scoreRatio", 0.0)
        time_ns = case.get("time", 0)
        time_ms = time_ns / 1_000_000 if time_ns else 0
        status = case.get("status", "Unknown")
        memory_kb = case.get("memory", 0) / 1024 if case.get("memory") else 0

        # Coarse pass/partial/fail indicator for this case.
        if ratio >= 1.0:
            indicator = "OK"
        elif ratio > 0:
            indicator = "PARTIAL"
        else:
            indicator = "FAIL"

        # Flag imperfect cases that are close to the time limit.
        near_timeout = ""
        if time_ns > time_limit_ns * 0.8 and ratio < 1.0:
            near_timeout = " [near timeout]"

        lines.append(
            f" Case {i + 1}: ratio={ratio:.4f} time={time_ms:.0f}ms "
            f"mem={memory_kb:.0f}KB {indicator}{near_timeout}"
        )

        if ratio < 1.0:
            weak_cases.append((i + 1, ratio, status))

    result = "\n".join(lines)

    if weak_cases:
        weak_summary = ", ".join(
            f"case {idx} ({r:.2f})" for idx, r, _ in sorted(weak_cases, key=lambda x: x[1])
        )
        result += f"\nWeakest: {weak_summary}"

    return result
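# The per-case dicts (taken from the runner's metadata, presumably go-judge
# output) are assumed to carry "scoreRatio" (0..1), "time" (nanoseconds),
# "memory" (bytes, given the /1024 conversion above), and "status".
# An illustrative output line:
#    Case 3: ratio=0.7500 time=1820ms mem=5240KB PARTIAL [near timeout]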


def _build_text_feedback(
    problem_id: str,
    result_metadata: Dict[str, Any],
    score_bounded: float,
    score_unbounded: float,
    statement_summary: str = "",
    error_msg: str = "",
) -> str:
    """Build comprehensive text feedback for the LLM."""
    parts = []

    if error_msg:
        parts.append(f"Error: {error_msg}")

    # Per-case summary, if the judge returned case-level data.
    cases = result_metadata.get("cases", [])
    if cases:
        n_cases = len(cases)
        n_passed = sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0)
        parts.append(f"Problem {problem_id} | {n_cases} test cases | {n_passed}/{n_cases} perfect")
        parts.append(_format_case_feedback(cases))

    parts.append(f"Score: {score_bounded:.2f}/100 (unbounded: {score_unbounded:.2f})")

    # Append a truncated problem statement so the LLM keeps the task context.
    if statement_summary:
        truncated = statement_summary[:2000]
        if len(statement_summary) > 2000:
            truncated += "\n[... truncated]"
        parts.append(f"\n--- Problem Statement ---\n{truncated}")

    return "\n".join(parts)


def main(
    program_path: str,
    results_dir: str,
    problem_id: str = "",
    judge_url: str = "",
    frontier_cs_dir: str = "",
) -> Dict[str, Any]:
    """
    Evaluate a C++ solution for a Frontier-CS algorithmic problem.

    Parameters can also be set via environment variables; explicit arguments
    take precedence over the environment, which takes precedence over the
    built-in defaults:
        FRONTIER_CS_PROBLEM_ID, FRONTIER_CS_JUDGE_URL, FRONTIER_CS_DIR

    This is the bridging evaluator that translates between ShinkaEvolve's
    evaluation interface and Frontier-CS's go-judge system.

    Args:
        program_path: Path to the C++ solution file.
        results_dir: Directory to write metrics.json and correct.json.
        problem_id: Frontier-CS problem ID (e.g., "0", "1", "42").
        judge_url: URL of the go-judge server.
        frontier_cs_dir: Path to the Frontier-CS repository root.

    Returns:
        Dict with combined_score, public, private, text_feedback, correct.
    """
    problem_id = problem_id or os.environ.get("FRONTIER_CS_PROBLEM_ID", "0")
    judge_url = judge_url or os.environ.get("FRONTIER_CS_JUDGE_URL", DEFAULT_JUDGE_URL)
    frontier_cs_dir = frontier_cs_dir or os.environ.get("FRONTIER_CS_DIR", DEFAULT_FRONTIER_CS_DIR)

    results_dir_path = Path(results_dir)
    results_dir_path.mkdir(parents=True, exist_ok=True)

    # Resolve a relative Frontier-CS path against the project root if it does
    # not exist relative to the current working directory.
    if not Path(frontier_cs_dir).is_absolute():
        if not Path(frontier_cs_dir).exists():
            project_root = Path(__file__).resolve().parents[2]
            frontier_cs_dir = str(project_root / frontier_cs_dir)

    # Load the statement early so it can be included even in error feedback.
    statement = _load_problem_statement(frontier_cs_dir, problem_id)

    # Read the candidate solution; fail fast on missing or empty files.
    code_path = Path(program_path)
    if not code_path.exists():
        return _save_error_result(
            results_dir_path,
            f"Solution file not found: {program_path}",
            problem_id,
            statement,
        )

    code = code_path.read_text(encoding="utf-8")
    if not code.strip():
        return _save_error_result(
            results_dir_path,
            "Empty solution file",
            problem_id,
            statement,
        )

    # Make the Frontier-CS runner importable (stub modules, no heavy package __init__).
    _ensure_frontier_cs_importable(frontier_cs_dir)
    try:
        from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner
        from frontier_cs.runner.base import EvaluationStatus
    except ImportError as e:
        return _save_error_result(
            results_dir_path,
            f"Failed to import frontier_cs: {e}. "
            f"Ensure Frontier-CS is installed (pip install -e {frontier_cs_dir})",
            problem_id,
            statement,
        )

    # Run the solution against the go-judge server.
    try:
        runner = AlgorithmicLocalRunner(judge_url=judge_url)
        result = runner.evaluate(str(problem_id), code)
    except Exception as e:
        return _save_error_result(
            results_dir_path,
            f"go-judge evaluation failed: {e}",
            problem_id,
            statement,
        )

    # Translate the runner result into ShinkaEvolve metrics.
    if result.status == EvaluationStatus.SUCCESS:
        metadata = result.metadata or {}
        score_bounded = result.score or 0.0
        score_unbounded = result.score_unbounded if result.score_unbounded is not None else score_bounded
        passed = metadata.get("passed", False)
        cases = metadata.get("cases", [])

        # Aggregate summary of the run.
        public_metrics = {
            "score_bounded": score_bounded,
            "score_unbounded": score_unbounded,
            "passed": passed,
            "n_cases": len(cases),
            "n_perfect": sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0),
        }

        # Per-case detail for up to the first 20 cases.
        for i, case in enumerate(cases[:20]):
            public_metrics[f"case_{i}_ratio"] = round(case.get("scoreRatio", 0.0), 4)
            time_ns = case.get("time", 0)
            if time_ns:
                public_metrics[f"case_{i}_time_ms"] = round(time_ns / 1_000_000, 1)

        text_feedback = _build_text_feedback(
            problem_id=problem_id,
            result_metadata=metadata,
            score_bounded=score_bounded,
            score_unbounded=score_unbounded,
            statement_summary=statement,
        )

        metrics = {
            "combined_score": score_unbounded,
            # The unbounded score drives optimization; the bounded 0-100 score
            # is reported in public_metrics["score_bounded"].
            "correct": True,
            "public": public_metrics,
            "private": metadata,
            "text_feedback": text_feedback,
        }

    elif result.status == EvaluationStatus.TIMEOUT:
        metrics = _build_error_metrics(
            problem_id=problem_id,
            error_msg=f"Evaluation timed out: {result.message}",
            statement=statement,
        )

    else:
        # Any other failure: surface the message plus a truncated log excerpt.
        error_msg = result.message or f"Evaluation failed with status: {result.status.value}"
        if result.logs:
            error_msg += f"\n--- Logs ---\n{result.logs[:1000]}"
        metrics = _build_error_metrics(
            problem_id=problem_id,
            error_msg=error_msg,
            statement=statement,
        )

    # Persist results for the ShinkaEvolve harness and log a one-line summary.
    _save_results(results_dir_path, metrics)

    logger.info(
        f"Frontier-CS Problem {problem_id}: "
        f"score={metrics.get('combined_score', 0):.2f}, "
        f"correct={metrics.get('correct', False)}"
    )

    return metrics
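# Programmatic use (illustrative), mirroring the evaluator_kwargs in the module docstring:
#   result = main(program_path="solution.cpp", results_dir="/tmp/results", problem_id="0")
#   print(result["combined_score"], result["correct"])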


def _build_error_metrics(
    problem_id: str, error_msg: str, statement: str = ""
) -> Dict[str, Any]:
    """Build metrics dict for error cases."""
    return {
        "combined_score": 0.0,
        "correct": False,
        "public": {"error": error_msg[:500]},
        "private": {},
        "text_feedback": _build_text_feedback(
            problem_id=problem_id,
            result_metadata={},
            score_bounded=0.0,
            score_unbounded=0.0,
            statement_summary=statement,
            error_msg=error_msg,
        ),
    }


def _save_error_result(
    results_dir: Path, error_msg: str, problem_id: str, statement: str = ""
) -> Dict[str, Any]:
    """Save error result and return metrics dict."""
    metrics = _build_error_metrics(problem_id, error_msg, statement)
    _save_results(results_dir, metrics)
    return metrics


def _save_results(results_dir: Path, metrics: Dict[str, Any]) -> None:
    """Write metrics.json and correct.json in ShinkaEvolve format."""
    metrics_path = results_dir / "metrics.json"
    correct_path = results_dir / "correct.json"

    # Private metadata is returned to the caller in-process but not written to disk.
    serializable_metrics = {
        "combined_score": metrics.get("combined_score", 0.0),
        "public": metrics.get("public", {}),
        "private": {},
        "text_feedback": metrics.get("text_feedback", ""),
    }
    with open(metrics_path, "w") as f:
        json.dump(serializable_metrics, f, indent=2, default=str)

    # correct.json carries the verdict and, on failure, the error message.
    correct_data = {
        "correct": metrics.get("correct", False),
        "error": None if metrics.get("correct") else metrics.get("public", {}).get("error"),
    }
    with open(correct_path, "w") as f:
        json.dump(correct_data, f, indent=2)
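# Files written above, with illustrative values:
#   metrics.json: {"combined_score": 92.0, "public": {...}, "private": {}, "text_feedback": "..."}
#   correct.json: {"correct": true, "error": null}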


# CLI entry point; mirrors the "Direct usage" example in the module docstring.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Frontier-CS algorithmic evaluator bridge")
    parser.add_argument("--program-path", required=True, help="Path to C++ solution")
    parser.add_argument("--results-dir", required=True, help="Output directory for metrics")
    parser.add_argument("--problem-id", default="", help="Frontier-CS problem ID (falls back to FRONTIER_CS_PROBLEM_ID env var, then '0')")
    parser.add_argument("--judge-url", default="", help="go-judge URL (falls back to FRONTIER_CS_JUDGE_URL env var, then the default)")
    parser.add_argument("--frontier-cs-dir", default="", help="Frontier-CS repository root (falls back to FRONTIER_CS_DIR env var, then the default)")

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    result = main(
        program_path=args.program_path,
        results_dir=args.results_dir,
        problem_id=args.problem_id,
        judge_url=args.judge_url,
        frontier_cs_dir=args.frontier_cs_dir,
    )
    print(f"Score: {result.get('combined_score', 0):.2f}")
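# Example invocation driven by environment variables (illustrative):
#   FRONTIER_CS_PROBLEM_ID=42 python -m tasks.frontier_cs_entry.evaluate_algorithmic \
#       --program-path solution.cpp --results-dir /tmp/results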
|