"""
Bridging evaluator for Frontier-CS algorithmic problems.
Translates between ShinkaEvolve's evaluation interface and Frontier-CS's
go-judge-based evaluation system. Works for all 172 algorithmic problems
via the `problem_id` parameter.
Usage as evaluator_module:
evaluator_module: "tasks.frontier_cs_entry.evaluate_algorithmic"
evaluator_function: "main"
evaluator_kwargs: {"problem_id": "0"}
Direct usage:
python -m tasks.frontier_cs_entry.evaluate_algorithmic \\
--program-path solution.cpp --results-dir /tmp/results --problem-id 0
"""
from __future__ import annotations
import json
import logging
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# Default paths
DEFAULT_JUDGE_URL = "http://localhost:8081"
DEFAULT_FRONTIER_CS_DIR = "tasks/Frontier-CS"
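# Both can be overridden per run: explicit kwargs/CLI flags win, then the
# FRONTIER_CS_JUDGE_URL / FRONTIER_CS_DIR environment variables, then these defaults.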
def _ensure_frontier_cs_importable(frontier_cs_dir: str) -> None:
"""
Make Frontier-CS runner classes importable without triggering the full
package __init__.py (which pulls in google.generativeai and other heavy deps).
We register lightweight stub modules so that only the runner subpackage
is actually loaded.
"""
import types
src_dir = str(Path(frontier_cs_dir).resolve() / "src")
if src_dir not in sys.path:
sys.path.insert(0, src_dir)
# If already set up, skip
if "frontier_cs" in sys.modules and hasattr(sys.modules["frontier_cs"], "__path__"):
return
fc_src = Path(src_dir) / "frontier_cs"
# Register empty frontier_cs package (bypass __init__.py)
fc = types.ModuleType("frontier_cs")
fc.__path__ = [str(fc_src)]
sys.modules["frontier_cs"] = fc
# Stub frontier_cs.gen to avoid importing LLM interface
fc_gen = types.ModuleType("frontier_cs.gen")
fc_gen.__path__ = [str(fc_src / "gen")]
sys.modules["frontier_cs.gen"] = fc_gen
# Provide the single constant that base.py needs
fc_gen_sf = types.ModuleType("frontier_cs.gen.solution_format")
fc_gen_sf.FAILED_EXTENSION = "FAILED"
sys.modules["frontier_cs.gen.solution_format"] = fc_gen_sf
def _load_problem_statement(frontier_cs_dir: str, problem_id: str) -> str:
"""Load problem statement text. Returns empty string if not found."""
statement_path = (
Path(frontier_cs_dir) / "algorithmic" / "problems" / str(problem_id) / "statement.txt"
)
if statement_path.exists():
try:
return statement_path.read_text(encoding="utf-8")
except Exception:
pass
return ""
def _format_case_feedback(cases: List[Dict[str, Any]], time_limit_ns: int = 2_000_000_000) -> str:
"""Format per-case results into readable text feedback."""
if not cases:
return "No per-case data available."
lines = []
weak_cases = []
for i, case in enumerate(cases):
ratio = case.get("scoreRatio", 0.0)
time_ns = case.get("time", 0)
time_ms = time_ns / 1_000_000 if time_ns else 0
status = case.get("status", "Unknown")
memory_kb = case.get("memory", 0) / 1024 if case.get("memory") else 0
# Determine status indicator
if ratio >= 1.0:
indicator = "OK"
elif ratio > 0:
indicator = "PARTIAL"
else:
indicator = "FAIL"
# Detect near-timeout (>80% of time limit)
near_timeout = ""
if time_ns > time_limit_ns * 0.8 and ratio < 1.0:
near_timeout = " [near timeout]"
lines.append(
f" Case {i + 1}: ratio={ratio:.4f} time={time_ms:.0f}ms "
f"mem={memory_kb:.0f}KB {indicator}{near_timeout}"
)
if ratio < 1.0:
weak_cases.append((i + 1, ratio, status))
result = "\n".join(lines)
if weak_cases:
weak_summary = ", ".join(
f"case {idx} ({r:.2f})" for idx, r, _ in sorted(weak_cases, key=lambda x: x[1])
)
result += f"\nWeakest: {weak_summary}"
return result
def _build_text_feedback(
problem_id: str,
result_metadata: Dict[str, Any],
score_bounded: float,
score_unbounded: float,
statement_summary: str = "",
error_msg: str = "",
) -> str:
"""Build comprehensive text feedback for the LLM."""
parts = []
# Error information (compilation failure, runtime error, etc.)
if error_msg:
parts.append(f"Error: {error_msg}")
# Per-case analysis
cases = result_metadata.get("cases", [])
if cases:
n_cases = len(cases)
n_passed = sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0)
parts.append(f"Problem {problem_id} | {n_cases} test cases | {n_passed}/{n_cases} perfect")
parts.append(_format_case_feedback(cases))
# Score summary
parts.append(f"Score: {score_bounded:.2f}/100 (unbounded: {score_unbounded:.2f})")
# Problem statement (truncated for context)
if statement_summary:
# Keep first 2000 chars of statement to avoid bloating the prompt
truncated = statement_summary[:2000]
if len(statement_summary) > 2000:
truncated += "\n[... truncated]"
parts.append(f"\n--- Problem Statement ---\n{truncated}")
return "\n".join(parts)
def main(
program_path: str,
results_dir: str,
problem_id: str = "",
judge_url: str = "",
frontier_cs_dir: str = "",
) -> Dict[str, Any]:
"""
Evaluate a C++ solution for a Frontier-CS algorithmic problem.
Parameters can also be set via environment variables (env takes precedence
over defaults, explicit args take precedence over env):
FRONTIER_CS_PROBLEM_ID, FRONTIER_CS_JUDGE_URL, FRONTIER_CS_DIR
This is the bridging evaluator that translates between ShinkaEvolve's
evaluation interface and Frontier-CS's go-judge system.
Args:
program_path: Path to the C++ solution file.
results_dir: Directory to write metrics.json and correct.json.
problem_id: Frontier-CS problem ID (e.g., "0", "1", "42").
judge_url: URL of the go-judge server.
frontier_cs_dir: Path to the Frontier-CS repository root.
Returns:
Dict with combined_score, public, private, text_feedback, correct.
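
        Illustrative return shape (values below are made up, not from a real run):
            {
                "combined_score": 87.5,   # the unbounded score
                "correct": True,          # compiled and ran; not necessarily all cases perfect
                "public": {"score_bounded": 87.5, "n_cases": 20, "n_perfect": 17, ...},
                "private": {...},         # full go-judge metadata
                "text_feedback": "Problem 0 | 20 test cases | 17/20 perfect ...",
            }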
"""
# Resolve from env vars when args are empty (local scheduler path)
problem_id = problem_id or os.environ.get("FRONTIER_CS_PROBLEM_ID", "0")
judge_url = judge_url or os.environ.get("FRONTIER_CS_JUDGE_URL", DEFAULT_JUDGE_URL)
frontier_cs_dir = frontier_cs_dir or os.environ.get("FRONTIER_CS_DIR", DEFAULT_FRONTIER_CS_DIR)
results_dir_path = Path(results_dir)
results_dir_path.mkdir(parents=True, exist_ok=True)
# Resolve frontier_cs_dir relative to project root if needed
if not Path(frontier_cs_dir).is_absolute():
        # Try relative to CWD, then relative to the project root (two levels above this file)
if not Path(frontier_cs_dir).exists():
project_root = Path(__file__).resolve().parents[2]
frontier_cs_dir = str(project_root / frontier_cs_dir)
# Load problem statement for feedback context
statement = _load_problem_statement(frontier_cs_dir, problem_id)
# Read the C++ code
code_path = Path(program_path)
if not code_path.exists():
return _save_error_result(
results_dir_path,
f"Solution file not found: {program_path}",
problem_id,
statement,
)
code = code_path.read_text(encoding="utf-8")
if not code.strip():
return _save_error_result(
results_dir_path,
"Empty solution file",
problem_id,
statement,
)
# Import and call Frontier-CS evaluator
_ensure_frontier_cs_importable(frontier_cs_dir)
try:
from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner
from frontier_cs.runner.base import EvaluationStatus
except ImportError as e:
return _save_error_result(
results_dir_path,
f"Failed to import frontier_cs: {e}. "
f"Ensure Frontier-CS is installed (pip install -e {frontier_cs_dir})",
problem_id,
statement,
)
# Run evaluation via go-judge
try:
runner = AlgorithmicLocalRunner(judge_url=judge_url)
result = runner.evaluate(str(problem_id), code)
except Exception as e:
return _save_error_result(
results_dir_path,
f"go-judge evaluation failed: {e}",
problem_id,
statement,
)
# Translate EvaluationResult to ShinkaEvolve format
if result.status == EvaluationStatus.SUCCESS:
metadata = result.metadata or {}
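        # `result.score` is the judge's bounded score; `score_unbounded` may exceed
        # the bound on optimization-style problems and falls back to the bounded
        # score when the runner does not report it separately.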
score_bounded = result.score or 0.0
score_unbounded = result.score_unbounded if result.score_unbounded is not None else score_bounded
passed = metadata.get("passed", False)
cases = metadata.get("cases", [])
# Build public metrics (visible to LLM)
public_metrics = {
"score_bounded": score_bounded,
"score_unbounded": score_unbounded,
"passed": passed,
"n_cases": len(cases),
"n_perfect": sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0),
}
# Add per-case ratios (up to 20 cases to avoid bloat)
for i, case in enumerate(cases[:20]):
public_metrics[f"case_{i}_ratio"] = round(case.get("scoreRatio", 0.0), 4)
time_ns = case.get("time", 0)
if time_ns:
public_metrics[f"case_{i}_time_ms"] = round(time_ns / 1_000_000, 1)
text_feedback = _build_text_feedback(
problem_id=problem_id,
result_metadata=metadata,
score_bounded=score_bounded,
score_unbounded=score_unbounded,
statement_summary=statement,
)
metrics = {
"combined_score": score_unbounded,
# Any code that compiles and runs counts as correct.
# "passed" (all test cases perfect) is too strict for optimization problems.
"correct": True,
"public": public_metrics,
"private": metadata,
"text_feedback": text_feedback,
}
elif result.status == EvaluationStatus.TIMEOUT:
metrics = _build_error_metrics(
problem_id=problem_id,
error_msg=f"Evaluation timed out: {result.message}",
statement=statement,
)
else:
# ERROR or SKIPPED
error_msg = result.message or f"Evaluation failed with status: {result.status.value}"
# Include logs for compilation errors
if result.logs:
error_msg += f"\n--- Logs ---\n{result.logs[:1000]}"
metrics = _build_error_metrics(
problem_id=problem_id,
error_msg=error_msg,
statement=statement,
)
# Save results
_save_results(results_dir_path, metrics)
logger.info(
f"Frontier-CS Problem {problem_id}: "
f"score={metrics.get('combined_score', 0):.2f}, "
f"correct={metrics.get('correct', False)}"
)
return metrics
def _build_error_metrics(
problem_id: str, error_msg: str, statement: str = ""
) -> Dict[str, Any]:
"""Build metrics dict for error cases."""
return {
"combined_score": 0.0,
"correct": False,
"public": {"error": error_msg[:500]},
"private": {},
"text_feedback": _build_text_feedback(
problem_id=problem_id,
result_metadata={},
score_bounded=0.0,
score_unbounded=0.0,
statement_summary=statement,
error_msg=error_msg,
),
}
def _save_error_result(
results_dir: Path, error_msg: str, problem_id: str, statement: str = ""
) -> Dict[str, Any]:
"""Save error result and return metrics dict."""
metrics = _build_error_metrics(problem_id, error_msg, statement)
_save_results(results_dir, metrics)
return metrics
def _save_results(results_dir: Path, metrics: Dict[str, Any]) -> None:
"""Write metrics.json and correct.json in ShinkaEvolve format."""
metrics_path = results_dir / "metrics.json"
correct_path = results_dir / "correct.json"
# metrics.json
serializable_metrics = {
"combined_score": metrics.get("combined_score", 0.0),
"public": metrics.get("public", {}),
"private": {}, # Don't serialize full go-judge metadata (can be huge)
"text_feedback": metrics.get("text_feedback", ""),
}
with open(metrics_path, "w") as f:
json.dump(serializable_metrics, f, indent=2, default=str)
# correct.json
correct_data = {
"correct": metrics.get("correct", False),
"error": None if metrics.get("correct") else metrics.get("public", {}).get("error"),
}
with open(correct_path, "w") as f:
json.dump(correct_data, f, indent=2)
# --- CLI entry point ---
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Frontier-CS algorithmic evaluator bridge")
parser.add_argument("--program_path", required=True, help="Path to C++ solution")
parser.add_argument("--results_dir", required=True, help="Output directory for metrics")
parser.add_argument("--problem-id", default="", help="Frontier-CS problem ID (falls back to FRONTIER_CS_PROBLEM_ID env var, then '0')")
parser.add_argument("--judge-url", default=DEFAULT_JUDGE_URL, help="go-judge URL")
parser.add_argument("--frontier-cs-dir", default=DEFAULT_FRONTIER_CS_DIR)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
result = main(
program_path=args.program_path,
results_dir=args.results_dir,
problem_id=args.problem_id,
judge_url=args.judge_url,
frontier_cs_dir=args.frontier_cs_dir,
)
print(f"Score: {result.get('combined_score', 0):.2f}")