""" Bridging evaluator for Frontier-CS algorithmic problems. Translates between ShinkaEvolve's evaluation interface and Frontier-CS's go-judge based evaluation system. Works for all 172 algorithmic problems via the `problem_id` parameter. Usage as evaluator_module: evaluator_module: "tasks.frontier_cs_entry.evaluate_algorithmic" evaluator_function: "main" evaluator_kwargs: {"problem_id": "0"} Direct usage: python -m tasks.frontier_cs_entry.evaluate_algorithmic \\ --program-path solution.cpp --results-dir /tmp/results --problem-id 0 """ from __future__ import annotations import json import logging import os import sys from pathlib import Path from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) # Default paths DEFAULT_JUDGE_URL = "http://localhost:8081" DEFAULT_FRONTIER_CS_DIR = "tasks/Frontier-CS" def _ensure_frontier_cs_importable(frontier_cs_dir: str) -> None: """ Make Frontier-CS runner classes importable without triggering the full package __init__.py (which pulls in google.generativeai and other heavy deps). We register lightweight stub modules so that only the runner subpackage is actually loaded. """ import types src_dir = str(Path(frontier_cs_dir).resolve() / "src") if src_dir not in sys.path: sys.path.insert(0, src_dir) # If already set up, skip if "frontier_cs" in sys.modules and hasattr(sys.modules["frontier_cs"], "__path__"): return fc_src = Path(src_dir) / "frontier_cs" # Register empty frontier_cs package (bypass __init__.py) fc = types.ModuleType("frontier_cs") fc.__path__ = [str(fc_src)] sys.modules["frontier_cs"] = fc # Stub frontier_cs.gen to avoid importing LLM interface fc_gen = types.ModuleType("frontier_cs.gen") fc_gen.__path__ = [str(fc_src / "gen")] sys.modules["frontier_cs.gen"] = fc_gen # Provide the single constant that base.py needs fc_gen_sf = types.ModuleType("frontier_cs.gen.solution_format") fc_gen_sf.FAILED_EXTENSION = "FAILED" sys.modules["frontier_cs.gen.solution_format"] = fc_gen_sf def _load_problem_statement(frontier_cs_dir: str, problem_id: str) -> str: """Load problem statement text. Returns empty string if not found.""" statement_path = ( Path(frontier_cs_dir) / "algorithmic" / "problems" / str(problem_id) / "statement.txt" ) if statement_path.exists(): try: return statement_path.read_text(encoding="utf-8") except Exception: pass return "" def _format_case_feedback(cases: List[Dict[str, Any]], time_limit_ns: int = 2_000_000_000) -> str: """Format per-case results into readable text feedback.""" if not cases: return "No per-case data available." 
    lines = []
    weak_cases = []
    for i, case in enumerate(cases):
        ratio = case.get("scoreRatio", 0.0)
        time_ns = case.get("time", 0)
        time_ms = time_ns / 1_000_000 if time_ns else 0
        status = case.get("status", "Unknown")
        memory_kb = case.get("memory", 0) / 1024 if case.get("memory") else 0

        # Determine status indicator
        if ratio >= 1.0:
            indicator = "OK"
        elif ratio > 0:
            indicator = "PARTIAL"
        else:
            indicator = "FAIL"

        # Detect near-timeout (>80% of time limit)
        near_timeout = ""
        if time_ns > time_limit_ns * 0.8 and ratio < 1.0:
            near_timeout = " [near timeout]"

        lines.append(
            f"  Case {i + 1}: ratio={ratio:.4f} time={time_ms:.0f}ms "
            f"mem={memory_kb:.0f}KB {indicator}{near_timeout}"
        )

        if ratio < 1.0:
            weak_cases.append((i + 1, ratio, status))

    result = "\n".join(lines)

    if weak_cases:
        weak_summary = ", ".join(
            f"case {idx} ({r:.2f})" for idx, r, _ in sorted(weak_cases, key=lambda x: x[1])
        )
        result += f"\nWeakest: {weak_summary}"

    return result


def _build_text_feedback(
    problem_id: str,
    result_metadata: Dict[str, Any],
    score_bounded: float,
    score_unbounded: float,
    statement_summary: str = "",
    error_msg: str = "",
) -> str:
    """Build comprehensive text feedback for the LLM."""
    parts = []

    # Error information (compilation failure, runtime error, etc.)
    if error_msg:
        parts.append(f"Error: {error_msg}")

    # Per-case analysis
    cases = result_metadata.get("cases", [])
    if cases:
        n_cases = len(cases)
        n_passed = sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0)
        parts.append(f"Problem {problem_id} | {n_cases} test cases | {n_passed}/{n_cases} perfect")
        parts.append(_format_case_feedback(cases))

    # Score summary
    parts.append(f"Score: {score_bounded:.2f}/100 (unbounded: {score_unbounded:.2f})")

    # Problem statement (truncated for context)
    if statement_summary:
        # Keep first 2000 chars of statement to avoid bloating the prompt
        truncated = statement_summary[:2000]
        if len(statement_summary) > 2000:
            truncated += "\n[... truncated]"
        parts.append(f"\n--- Problem Statement ---\n{truncated}")

    return "\n".join(parts)


def main(
    program_path: str,
    results_dir: str,
    problem_id: str = "",
    judge_url: str = "",
    frontier_cs_dir: str = "",
) -> Dict[str, Any]:
    """
    Evaluate a C++ solution for a Frontier-CS algorithmic problem.

    Parameters can also be set via environment variables (env takes precedence
    over defaults, explicit args take precedence over env):
        FRONTIER_CS_PROBLEM_ID, FRONTIER_CS_JUDGE_URL, FRONTIER_CS_DIR

    This is the bridging evaluator that translates between ShinkaEvolve's
    evaluation interface and Frontier-CS's go-judge system.

    Args:
        program_path: Path to the C++ solution file.
        results_dir: Directory to write metrics.json and correct.json.
        problem_id: Frontier-CS problem ID (e.g., "0", "1", "42").
        judge_url: URL of the go-judge server.
        frontier_cs_dir: Path to the Frontier-CS repository root.

    Returns:
        Dict with combined_score, public, private, text_feedback, correct.
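    Example return value (illustrative numbers only; the exact contents of
    ``public`` and ``private`` depend on the go-judge response for the problem):

        {
            "combined_score": 87.5,               # mirrors score_unbounded
            "correct": True,
            "public": {"score_bounded": 87.5, "n_cases": 10, "n_perfect": 7, ...},
            "private": {...},                     # raw go-judge case metadata
            "text_feedback": "Problem 0 | 10 test cases | 7/10 perfect ...",
        }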
""" # Resolve from env vars when args are empty (local scheduler path) problem_id = problem_id or os.environ.get("FRONTIER_CS_PROBLEM_ID", "0") judge_url = judge_url or os.environ.get("FRONTIER_CS_JUDGE_URL", DEFAULT_JUDGE_URL) frontier_cs_dir = frontier_cs_dir or os.environ.get("FRONTIER_CS_DIR", DEFAULT_FRONTIER_CS_DIR) results_dir_path = Path(results_dir) results_dir_path.mkdir(parents=True, exist_ok=True) # Resolve frontier_cs_dir relative to project root if needed if not Path(frontier_cs_dir).is_absolute(): # Try relative to CWD, then relative to this file's location if not Path(frontier_cs_dir).exists(): project_root = Path(__file__).resolve().parents[2] frontier_cs_dir = str(project_root / frontier_cs_dir) # Load problem statement for feedback context statement = _load_problem_statement(frontier_cs_dir, problem_id) # Read the C++ code code_path = Path(program_path) if not code_path.exists(): return _save_error_result( results_dir_path, f"Solution file not found: {program_path}", problem_id, statement, ) code = code_path.read_text(encoding="utf-8") if not code.strip(): return _save_error_result( results_dir_path, "Empty solution file", problem_id, statement, ) # Import and call Frontier-CS evaluator _ensure_frontier_cs_importable(frontier_cs_dir) try: from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner from frontier_cs.runner.base import EvaluationStatus except ImportError as e: return _save_error_result( results_dir_path, f"Failed to import frontier_cs: {e}. " f"Ensure Frontier-CS is installed (pip install -e {frontier_cs_dir})", problem_id, statement, ) # Run evaluation via go-judge try: runner = AlgorithmicLocalRunner(judge_url=judge_url) result = runner.evaluate(str(problem_id), code) except Exception as e: return _save_error_result( results_dir_path, f"go-judge evaluation failed: {e}", problem_id, statement, ) # Translate EvaluationResult to ShinkaEvolve format if result.status == EvaluationStatus.SUCCESS: metadata = result.metadata or {} score_bounded = result.score or 0.0 score_unbounded = result.score_unbounded if result.score_unbounded is not None else score_bounded passed = metadata.get("passed", False) cases = metadata.get("cases", []) # Build public metrics (visible to LLM) public_metrics = { "score_bounded": score_bounded, "score_unbounded": score_unbounded, "passed": passed, "n_cases": len(cases), "n_perfect": sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0), } # Add per-case ratios (up to 20 cases to avoid bloat) for i, case in enumerate(cases[:20]): public_metrics[f"case_{i}_ratio"] = round(case.get("scoreRatio", 0.0), 4) time_ns = case.get("time", 0) if time_ns: public_metrics[f"case_{i}_time_ms"] = round(time_ns / 1_000_000, 1) text_feedback = _build_text_feedback( problem_id=problem_id, result_metadata=metadata, score_bounded=score_bounded, score_unbounded=score_unbounded, statement_summary=statement, ) metrics = { "combined_score": score_unbounded, # Any code that compiles and runs counts as correct. # "passed" (all test cases perfect) is too strict for optimization problems. 
"correct": True, "public": public_metrics, "private": metadata, "text_feedback": text_feedback, } elif result.status == EvaluationStatus.TIMEOUT: metrics = _build_error_metrics( problem_id=problem_id, error_msg=f"Evaluation timed out: {result.message}", statement=statement, ) else: # ERROR or SKIPPED error_msg = result.message or f"Evaluation failed with status: {result.status.value}" # Include logs for compilation errors if result.logs: error_msg += f"\n--- Logs ---\n{result.logs[:1000]}" metrics = _build_error_metrics( problem_id=problem_id, error_msg=error_msg, statement=statement, ) # Save results _save_results(results_dir_path, metrics) logger.info( f"Frontier-CS Problem {problem_id}: " f"score={metrics.get('combined_score', 0):.2f}, " f"correct={metrics.get('correct', False)}" ) return metrics def _build_error_metrics( problem_id: str, error_msg: str, statement: str = "" ) -> Dict[str, Any]: """Build metrics dict for error cases.""" return { "combined_score": 0.0, "correct": False, "public": {"error": error_msg[:500]}, "private": {}, "text_feedback": _build_text_feedback( problem_id=problem_id, result_metadata={}, score_bounded=0.0, score_unbounded=0.0, statement_summary=statement, error_msg=error_msg, ), } def _save_error_result( results_dir: Path, error_msg: str, problem_id: str, statement: str = "" ) -> Dict[str, Any]: """Save error result and return metrics dict.""" metrics = _build_error_metrics(problem_id, error_msg, statement) _save_results(results_dir, metrics) return metrics def _save_results(results_dir: Path, metrics: Dict[str, Any]) -> None: """Write metrics.json and correct.json in ShinkaEvolve format.""" metrics_path = results_dir / "metrics.json" correct_path = results_dir / "correct.json" # metrics.json serializable_metrics = { "combined_score": metrics.get("combined_score", 0.0), "public": metrics.get("public", {}), "private": {}, # Don't serialize full go-judge metadata (can be huge) "text_feedback": metrics.get("text_feedback", ""), } with open(metrics_path, "w") as f: json.dump(serializable_metrics, f, indent=2, default=str) # correct.json correct_data = { "correct": metrics.get("correct", False), "error": None if metrics.get("correct") else metrics.get("public", {}).get("error"), } with open(correct_path, "w") as f: json.dump(correct_data, f, indent=2) # --- CLI entry point --- if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Frontier-CS algorithmic evaluator bridge") parser.add_argument("--program_path", required=True, help="Path to C++ solution") parser.add_argument("--results_dir", required=True, help="Output directory for metrics") parser.add_argument("--problem-id", default="", help="Frontier-CS problem ID (falls back to FRONTIER_CS_PROBLEM_ID env var, then '0')") parser.add_argument("--judge-url", default=DEFAULT_JUDGE_URL, help="go-judge URL") parser.add_argument("--frontier-cs-dir", default=DEFAULT_FRONTIER_CS_DIR) args = parser.parse_args() logging.basicConfig(level=logging.INFO) result = main( program_path=args.program_path, results_dir=args.results_dir, problem_id=args.problem_id, judge_url=args.judge_url, frontier_cs_dir=args.frontier_cs_dir, ) print(f"Score: {result.get('combined_score', 0):.2f}")