"""
Bridging evaluator for Frontier-CS algorithmic problems.

Translates between ShinkaEvolve's evaluation interface and Frontier-CS's
go-judge based evaluation system. Works for all 172 algorithmic problems
via the `problem_id` parameter.

Usage as evaluator_module:
    evaluator_module: "tasks.frontier_cs_entry.evaluate_algorithmic"
    evaluator_function: "main"
    evaluator_kwargs: {"problem_id": "0"}

Direct usage:
    python -m tasks.frontier_cs_entry.evaluate_algorithmic \\
        --program-path solution.cpp --results-dir /tmp/results --problem-id 0
"""

from __future__ import annotations

import json
import logging
import os
import sys
from pathlib import Path
from typing import Any, Dict, List

logger = logging.getLogger(__name__)

# Default paths
DEFAULT_JUDGE_URL = "http://localhost:8081"
DEFAULT_FRONTIER_CS_DIR = "tasks/Frontier-CS"


def _ensure_frontier_cs_importable(frontier_cs_dir: str) -> None:
    """
    Make Frontier-CS runner classes importable without triggering the full
    package __init__.py (which pulls in google.generativeai and other heavy deps).

    We register lightweight stub modules so that only the runner subpackage
    is actually loaded.
    """
    import types

    src_dir = str(Path(frontier_cs_dir).resolve() / "src")
    if src_dir not in sys.path:
        sys.path.insert(0, src_dir)

    # If already set up, skip
    if "frontier_cs" in sys.modules and hasattr(sys.modules["frontier_cs"], "__path__"):
        return

    fc_src = Path(src_dir) / "frontier_cs"

    # Register empty frontier_cs package (bypass __init__.py)
    fc = types.ModuleType("frontier_cs")
    fc.__path__ = [str(fc_src)]
    sys.modules["frontier_cs"] = fc

    # Stub frontier_cs.gen to avoid importing LLM interface
    fc_gen = types.ModuleType("frontier_cs.gen")
    fc_gen.__path__ = [str(fc_src / "gen")]
    sys.modules["frontier_cs.gen"] = fc_gen

    # Provide the single constant that base.py needs
    fc_gen_sf = types.ModuleType("frontier_cs.gen.solution_format")
    fc_gen_sf.FAILED_EXTENSION = "FAILED"
    sys.modules["frontier_cs.gen.solution_format"] = fc_gen_sf


def _load_problem_statement(frontier_cs_dir: str, problem_id: str) -> str:
    """Load problem statement text. Returns empty string if not found."""
    statement_path = (
        Path(frontier_cs_dir) / "algorithmic" / "problems" / str(problem_id) / "statement.txt"
    )
    if statement_path.exists():
        try:
            return statement_path.read_text(encoding="utf-8")
        except Exception:
            pass
    return ""


def _format_case_feedback(cases: List[Dict[str, Any]], time_limit_ns: int = 2_000_000_000) -> str:
    """Format per-case results into readable text feedback."""
    if not cases:
        return "No per-case data available."

    lines = []
    weak_cases = []

    for i, case in enumerate(cases):
        ratio = case.get("scoreRatio", 0.0)
        time_ns = case.get("time", 0)
        time_ms = time_ns / 1_000_000 if time_ns else 0
        status = case.get("status", "Unknown")
        memory_kb = case.get("memory", 0) / 1024 if case.get("memory") else 0

        # Determine status indicator
        if ratio >= 1.0:
            indicator = "OK"
        elif ratio > 0:
            indicator = "PARTIAL"
        else:
            indicator = "FAIL"

        # Flag non-perfect cases that used more than 80% of the time limit
        near_timeout = ""
        if time_ns > time_limit_ns * 0.8 and ratio < 1.0:
            near_timeout = " [near timeout]"

        lines.append(
            f"  Case {i + 1}: ratio={ratio:.4f} time={time_ms:.0f}ms "
            f"mem={memory_kb:.0f}KB {indicator}{near_timeout}"
        )

        if ratio < 1.0:
            weak_cases.append((i + 1, ratio, status))

    result = "\n".join(lines)

    if weak_cases:
        weak_summary = ", ".join(
            f"case {idx} ({r:.2f})" for idx, r, _ in sorted(weak_cases, key=lambda x: x[1])
        )
        result += f"\nWeakest: {weak_summary}"

    return result


def _build_text_feedback(
    problem_id: str,
    result_metadata: Dict[str, Any],
    score_bounded: float,
    score_unbounded: float,
    statement_summary: str = "",
    error_msg: str = "",
) -> str:
    """Build comprehensive text feedback for the LLM."""
    parts = []

    # Error information (compilation failure, runtime error, etc.)
    if error_msg:
        parts.append(f"Error: {error_msg}")

    # Per-case analysis
    cases = result_metadata.get("cases", [])
    if cases:
        n_cases = len(cases)
        n_passed = sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0)
        parts.append(f"Problem {problem_id} | {n_cases} test cases | {n_passed}/{n_cases} perfect")
        parts.append(_format_case_feedback(cases))

    # Score summary
    parts.append(f"Score: {score_bounded:.2f}/100 (unbounded: {score_unbounded:.2f})")

    # Problem statement (truncated for context)
    if statement_summary:
        # Keep first 2000 chars of statement to avoid bloating the prompt
        truncated = statement_summary[:2000]
        if len(statement_summary) > 2000:
            truncated += "\n[... truncated]"
        parts.append(f"\n--- Problem Statement ---\n{truncated}")

    return "\n".join(parts)


def main(
    program_path: str,
    results_dir: str,
    problem_id: str = "",
    judge_url: str = "",
    frontier_cs_dir: str = "",
) -> Dict[str, Any]:
    """
    Evaluate a C++ solution for a Frontier-CS algorithmic problem.

    Parameters can also be set via environment variables (env takes precedence
    over defaults, explicit args take precedence over env):
        FRONTIER_CS_PROBLEM_ID, FRONTIER_CS_JUDGE_URL, FRONTIER_CS_DIR

    This is the bridging evaluator that translates between ShinkaEvolve's
    evaluation interface and Frontier-CS's go-judge system.

    Args:
        program_path: Path to the C++ solution file.
        results_dir: Directory to write metrics.json and correct.json.
        problem_id: Frontier-CS problem ID (e.g., "0", "1", "42").
        judge_url: URL of the go-judge server.
        frontier_cs_dir: Path to the Frontier-CS repository root.

    Returns:
        Dict with combined_score, public, private, text_feedback, correct.
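
        Example return value (illustrative only; actual contents depend on the
        judge run):
            {
                "combined_score": 87.5,   # same value as score_unbounded
                "correct": True,
                "public": {"score_bounded": 87.5, "passed": False, "n_cases": 20, ...},
                "private": {...},         # full go-judge metadata
                "text_feedback": "Problem 0 | 20 test cases | 17/20 perfect ...",
            }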
    """
    # Resolve from env vars when args are empty (local scheduler path)
    problem_id = problem_id or os.environ.get("FRONTIER_CS_PROBLEM_ID", "0")
    judge_url = judge_url or os.environ.get("FRONTIER_CS_JUDGE_URL", DEFAULT_JUDGE_URL)
    frontier_cs_dir = frontier_cs_dir or os.environ.get("FRONTIER_CS_DIR", DEFAULT_FRONTIER_CS_DIR)

    results_dir_path = Path(results_dir)
    results_dir_path.mkdir(parents=True, exist_ok=True)

    # Resolve frontier_cs_dir relative to project root if needed
    if not Path(frontier_cs_dir).is_absolute():
        # Try relative to CWD first, then relative to the project root (two levels above this file)
        if not Path(frontier_cs_dir).exists():
            project_root = Path(__file__).resolve().parents[2]
            frontier_cs_dir = str(project_root / frontier_cs_dir)

    # Load problem statement for feedback context
    statement = _load_problem_statement(frontier_cs_dir, problem_id)

    # Read the C++ code
    code_path = Path(program_path)
    if not code_path.exists():
        return _save_error_result(
            results_dir_path,
            f"Solution file not found: {program_path}",
            problem_id,
            statement,
        )

    code = code_path.read_text(encoding="utf-8")
    if not code.strip():
        return _save_error_result(
            results_dir_path,
            "Empty solution file",
            problem_id,
            statement,
        )

    # Import and call Frontier-CS evaluator
    _ensure_frontier_cs_importable(frontier_cs_dir)
    try:
        from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner
        from frontier_cs.runner.base import EvaluationStatus
    except ImportError as e:
        return _save_error_result(
            results_dir_path,
            f"Failed to import frontier_cs: {e}. "
            f"Ensure Frontier-CS is installed (pip install -e {frontier_cs_dir})",
            problem_id,
            statement,
        )

    # Run evaluation via go-judge
    try:
        runner = AlgorithmicLocalRunner(judge_url=judge_url)
        result = runner.evaluate(str(problem_id), code)
    except Exception as e:
        return _save_error_result(
            results_dir_path,
            f"go-judge evaluation failed: {e}",
            problem_id,
            statement,
        )

    # Translate EvaluationResult to ShinkaEvolve format
    if result.status == EvaluationStatus.SUCCESS:
        metadata = result.metadata or {}
        score_bounded = result.score or 0.0
        score_unbounded = result.score_unbounded if result.score_unbounded is not None else score_bounded
        passed = metadata.get("passed", False)
        cases = metadata.get("cases", [])

        # Build public metrics (visible to LLM)
        public_metrics = {
            "score_bounded": score_bounded,
            "score_unbounded": score_unbounded,
            "passed": passed,
            "n_cases": len(cases),
            "n_perfect": sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0),
        }

        # Add per-case ratios and times (first 20 cases only, to avoid bloating the metrics)
        for i, case in enumerate(cases[:20]):
            public_metrics[f"case_{i}_ratio"] = round(case.get("scoreRatio", 0.0), 4)
            time_ns = case.get("time", 0)
            if time_ns:
                public_metrics[f"case_{i}_time_ms"] = round(time_ns / 1_000_000, 1)

        text_feedback = _build_text_feedback(
            problem_id=problem_id,
            result_metadata=metadata,
            score_bounded=score_bounded,
            score_unbounded=score_unbounded,
            statement_summary=statement,
        )

        metrics = {
            "combined_score": score_unbounded,
            # Any code that compiles and runs counts as correct.
            # "passed" (all test cases perfect) is too strict for optimization problems.
            "correct": True,
            "public": public_metrics,
            "private": metadata,
            "text_feedback": text_feedback,
        }

    elif result.status == EvaluationStatus.TIMEOUT:
        metrics = _build_error_metrics(
            problem_id=problem_id,
            error_msg=f"Evaluation timed out: {result.message}",
            statement=statement,
        )

    else:
        # ERROR or SKIPPED
        error_msg = result.message or f"Evaluation failed with status: {result.status.value}"
        # Include logs for compilation errors
        if result.logs:
            error_msg += f"\n--- Logs ---\n{result.logs[:1000]}"
        metrics = _build_error_metrics(
            problem_id=problem_id,
            error_msg=error_msg,
            statement=statement,
        )

    # Save results
    _save_results(results_dir_path, metrics)

    logger.info(
        f"Frontier-CS Problem {problem_id}: "
        f"score={metrics.get('combined_score', 0):.2f}, "
        f"correct={metrics.get('correct', False)}"
    )

    return metrics


def _build_error_metrics(
    problem_id: str, error_msg: str, statement: str = ""
) -> Dict[str, Any]:
    """Build metrics dict for error cases."""
    return {
        "combined_score": 0.0,
        "correct": False,
        "public": {"error": error_msg[:500]},
        "private": {},
        "text_feedback": _build_text_feedback(
            problem_id=problem_id,
            result_metadata={},
            score_bounded=0.0,
            score_unbounded=0.0,
            statement_summary=statement,
            error_msg=error_msg,
        ),
    }


def _save_error_result(
    results_dir: Path, error_msg: str, problem_id: str, statement: str = ""
) -> Dict[str, Any]:
    """Save error result and return metrics dict."""
    metrics = _build_error_metrics(problem_id, error_msg, statement)
    _save_results(results_dir, metrics)
    return metrics


def _save_results(results_dir: Path, metrics: Dict[str, Any]) -> None:
    """Write metrics.json and correct.json in ShinkaEvolve format."""
    metrics_path = results_dir / "metrics.json"
    correct_path = results_dir / "correct.json"

    # metrics.json
    serializable_metrics = {
        "combined_score": metrics.get("combined_score", 0.0),
        "public": metrics.get("public", {}),
        "private": {},  # Don't serialize full go-judge metadata (can be huge)
        "text_feedback": metrics.get("text_feedback", ""),
    }
    with open(metrics_path, "w") as f:
        json.dump(serializable_metrics, f, indent=2, default=str)

    # correct.json
    correct_data = {
        "correct": metrics.get("correct", False),
        "error": None if metrics.get("correct") else metrics.get("public", {}).get("error"),
    }
    with open(correct_path, "w") as f:
        json.dump(correct_data, f, indent=2)


# --- CLI entry point ---
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Frontier-CS algorithmic evaluator bridge")
    parser.add_argument("--program_path", required=True, help="Path to C++ solution")
    parser.add_argument("--results_dir", required=True, help="Output directory for metrics")
    parser.add_argument("--problem-id", default="", help="Frontier-CS problem ID (falls back to FRONTIER_CS_PROBLEM_ID env var, then '0')")
    parser.add_argument("--judge-url", default=DEFAULT_JUDGE_URL, help="go-judge URL")
    parser.add_argument("--frontier-cs-dir", default=DEFAULT_FRONTIER_CS_DIR)

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    result = main(
        program_path=args.program_path,
        results_dir=args.results_dir,
        problem_id=args.problem_id,
        judge_url=args.judge_url,
        frontier_cs_dir=args.frontier_cs_dir,
    )
    print(f"Score: {result.get('combined_score', 0):.2f}")