"""
Bridging evaluator for Frontier-CS algorithmic problems.
Translates between ShinkaEvolve's evaluation interface and Frontier-CS's
go-judge-based evaluation system. Works for all 172 algorithmic problems
via the `problem_id` parameter.
Usage as evaluator_module:
evaluator_module: "tasks.frontier_cs_entry.evaluate_algorithmic"
evaluator_function: "main"
evaluator_kwargs: {"problem_id": "0"}
Direct usage:
python -m tasks.frontier_cs_entry.evaluate_algorithmic \\
--program-path solution.cpp --results-dir /tmp/results --problem-id 0
"""
from __future__ import annotations
import json
import logging
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# Default paths
DEFAULT_JUDGE_URL = "http://localhost:8081"
DEFAULT_FRONTIER_CS_DIR = "tasks/Frontier-CS"
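# Both can be overridden per run: explicit kwargs/CLI flags win, then the
# FRONTIER_CS_JUDGE_URL / FRONTIER_CS_DIR environment variables, then these defaults.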
def _ensure_frontier_cs_importable(frontier_cs_dir: str) -> None:
"""
Make Frontier-CS runner classes importable without triggering the full
package __init__.py (which pulls in google.generativeai and other heavy deps).
We register lightweight stub modules so that only the runner subpackage
is actually loaded.
"""
import types
src_dir = str(Path(frontier_cs_dir).resolve() / "src")
if src_dir not in sys.path:
sys.path.insert(0, src_dir)
# If already set up, skip
if "frontier_cs" in sys.modules and hasattr(sys.modules["frontier_cs"], "__path__"):
return
fc_src = Path(src_dir) / "frontier_cs"
# Register empty frontier_cs package (bypass __init__.py)
fc = types.ModuleType("frontier_cs")
fc.__path__ = [str(fc_src)]
sys.modules["frontier_cs"] = fc
# Stub frontier_cs.gen to avoid importing LLM interface
fc_gen = types.ModuleType("frontier_cs.gen")
fc_gen.__path__ = [str(fc_src / "gen")]
sys.modules["frontier_cs.gen"] = fc_gen
# Provide the single constant that base.py needs
fc_gen_sf = types.ModuleType("frontier_cs.gen.solution_format")
fc_gen_sf.FAILED_EXTENSION = "FAILED"
sys.modules["frontier_cs.gen.solution_format"] = fc_gen_sf
def _load_problem_statement(frontier_cs_dir: str, problem_id: str) -> str:
"""Load problem statement text. Returns empty string if not found."""
statement_path = (
Path(frontier_cs_dir) / "algorithmic" / "problems" / str(problem_id) / "statement.txt"
)
if statement_path.exists():
try:
return statement_path.read_text(encoding="utf-8")
except Exception:
pass
return ""
def _format_case_feedback(cases: List[Dict[str, Any]], time_limit_ns: int = 2_000_000_000) -> str:
"""Format per-case results into readable text feedback."""
if not cases:
return "No per-case data available."
lines = []
weak_cases = []
for i, case in enumerate(cases):
ratio = case.get("scoreRatio", 0.0)
time_ns = case.get("time", 0)
time_ms = time_ns / 1_000_000 if time_ns else 0
status = case.get("status", "Unknown")
memory_kb = case.get("memory", 0) / 1024 if case.get("memory") else 0
# Determine status indicator
if ratio >= 1.0:
indicator = "OK"
elif ratio > 0:
indicator = "PARTIAL"
else:
indicator = "FAIL"
# Detect near-timeout (>80% of time limit)
near_timeout = ""
if time_ns > time_limit_ns * 0.8 and ratio < 1.0:
near_timeout = " [near timeout]"
lines.append(
f" Case {i + 1}: ratio={ratio:.4f} time={time_ms:.0f}ms "
f"mem={memory_kb:.0f}KB {indicator}{near_timeout}"
)
if ratio < 1.0:
weak_cases.append((i + 1, ratio, status))
result = "\n".join(lines)
if weak_cases:
weak_summary = ", ".join(
f"case {idx} ({r:.2f})" for idx, r, _ in sorted(weak_cases, key=lambda x: x[1])
)
result += f"\nWeakest: {weak_summary}"
return result
def _build_text_feedback(
problem_id: str,
result_metadata: Dict[str, Any],
score_bounded: float,
score_unbounded: float,
statement_summary: str = "",
error_msg: str = "",
) -> str:
"""Build comprehensive text feedback for the LLM."""
parts = []
# Error information (compilation failure, runtime error, etc.)
if error_msg:
parts.append(f"Error: {error_msg}")
# Per-case analysis
cases = result_metadata.get("cases", [])
if cases:
n_cases = len(cases)
n_passed = sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0)
parts.append(f"Problem {problem_id} | {n_cases} test cases | {n_passed}/{n_cases} perfect")
parts.append(_format_case_feedback(cases))
# Score summary
parts.append(f"Score: {score_bounded:.2f}/100 (unbounded: {score_unbounded:.2f})")
# Problem statement (truncated for context)
if statement_summary:
# Keep first 2000 chars of statement to avoid bloating the prompt
truncated = statement_summary[:2000]
if len(statement_summary) > 2000:
truncated += "\n[... truncated]"
parts.append(f"\n--- Problem Statement ---\n{truncated}")
return "\n".join(parts)
def main(
program_path: str,
results_dir: str,
problem_id: str = "",
judge_url: str = "",
frontier_cs_dir: str = "",
) -> Dict[str, Any]:
"""
Evaluate a C++ solution for a Frontier-CS algorithmic problem.
Parameters can also be set via environment variables (env takes precedence
over defaults, explicit args take precedence over env):
FRONTIER_CS_PROBLEM_ID, FRONTIER_CS_JUDGE_URL, FRONTIER_CS_DIR
This is the bridging evaluator that translates between ShinkaEvolve's
evaluation interface and Frontier-CS's go-judge system.
Args:
program_path: Path to the C++ solution file.
results_dir: Directory to write metrics.json and correct.json.
problem_id: Frontier-CS problem ID (e.g., "0", "1", "42").
judge_url: URL of the go-judge server.
frontier_cs_dir: Path to the Frontier-CS repository root.
Returns:
Dict with combined_score, public, private, text_feedback, correct.
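
        Illustrative return shape (values below are made up, not from a real run):
            {
                "combined_score": 87.5,   # the unbounded score
                "correct": True,          # compiled and ran; not necessarily all cases perfect
                "public": {"score_bounded": 87.5, "n_cases": 20, "n_perfect": 17, ...},
                "private": {...},         # full go-judge metadata
                "text_feedback": "Problem 0 | 20 test cases | 17/20 perfect ...",
            }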
"""
# Resolve from env vars when args are empty (local scheduler path)
problem_id = problem_id or os.environ.get("FRONTIER_CS_PROBLEM_ID", "0")
judge_url = judge_url or os.environ.get("FRONTIER_CS_JUDGE_URL", DEFAULT_JUDGE_URL)
frontier_cs_dir = frontier_cs_dir or os.environ.get("FRONTIER_CS_DIR", DEFAULT_FRONTIER_CS_DIR)
results_dir_path = Path(results_dir)
results_dir_path.mkdir(parents=True, exist_ok=True)
# Resolve frontier_cs_dir relative to project root if needed
if not Path(frontier_cs_dir).is_absolute():
        # Try relative to CWD, then relative to the project root (two levels above this file)
if not Path(frontier_cs_dir).exists():
project_root = Path(__file__).resolve().parents[2]
frontier_cs_dir = str(project_root / frontier_cs_dir)
# Load problem statement for feedback context
statement = _load_problem_statement(frontier_cs_dir, problem_id)
# Read the C++ code
code_path = Path(program_path)
if not code_path.exists():
return _save_error_result(
results_dir_path,
f"Solution file not found: {program_path}",
problem_id,
statement,
)
code = code_path.read_text(encoding="utf-8")
if not code.strip():
return _save_error_result(
results_dir_path,
"Empty solution file",
problem_id,
statement,
)
# Import and call Frontier-CS evaluator
_ensure_frontier_cs_importable(frontier_cs_dir)
try:
from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner
from frontier_cs.runner.base import EvaluationStatus
except ImportError as e:
return _save_error_result(
results_dir_path,
f"Failed to import frontier_cs: {e}. "
f"Ensure Frontier-CS is installed (pip install -e {frontier_cs_dir})",
problem_id,
statement,
)
# Run evaluation via go-judge
try:
runner = AlgorithmicLocalRunner(judge_url=judge_url)
result = runner.evaluate(str(problem_id), code)
except Exception as e:
return _save_error_result(
results_dir_path,
f"go-judge evaluation failed: {e}",
problem_id,
statement,
)
# Translate EvaluationResult to ShinkaEvolve format
if result.status == EvaluationStatus.SUCCESS:
metadata = result.metadata or {}
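        # `result.score` is the judge's bounded score; `score_unbounded` may exceed
        # the bound on optimization-style problems and falls back to the bounded
        # score when the runner does not report it separately.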
score_bounded = result.score or 0.0
score_unbounded = result.score_unbounded if result.score_unbounded is not None else score_bounded
passed = metadata.get("passed", False)
cases = metadata.get("cases", [])
# Build public metrics (visible to LLM)
public_metrics = {
"score_bounded": score_bounded,
"score_unbounded": score_unbounded,
"passed": passed,
"n_cases": len(cases),
"n_perfect": sum(1 for c in cases if c.get("scoreRatio", 0) >= 1.0),
}
# Add per-case ratios (up to 20 cases to avoid bloat)
for i, case in enumerate(cases[:20]):
public_metrics[f"case_{i}_ratio"] = round(case.get("scoreRatio", 0.0), 4)
time_ns = case.get("time", 0)
if time_ns:
public_metrics[f"case_{i}_time_ms"] = round(time_ns / 1_000_000, 1)
text_feedback = _build_text_feedback(
problem_id=problem_id,
result_metadata=metadata,
score_bounded=score_bounded,
score_unbounded=score_unbounded,
statement_summary=statement,
)
metrics = {
"combined_score": score_unbounded,
# Any code that compiles and runs counts as correct.
# "passed" (all test cases perfect) is too strict for optimization problems.
"correct": True,
"public": public_metrics,
"private": metadata,
"text_feedback": text_feedback,
}
elif result.status == EvaluationStatus.TIMEOUT:
metrics = _build_error_metrics(
problem_id=problem_id,
error_msg=f"Evaluation timed out: {result.message}",
statement=statement,
)
else:
# ERROR or SKIPPED
error_msg = result.message or f"Evaluation failed with status: {result.status.value}"
# Include logs for compilation errors
if result.logs:
error_msg += f"\n--- Logs ---\n{result.logs[:1000]}"
metrics = _build_error_metrics(
problem_id=problem_id,
error_msg=error_msg,
statement=statement,
)
# Save results
_save_results(results_dir_path, metrics)
logger.info(
f"Frontier-CS Problem {problem_id}: "
f"score={metrics.get('combined_score', 0):.2f}, "
f"correct={metrics.get('correct', False)}"
)
return metrics
def _build_error_metrics(
problem_id: str, error_msg: str, statement: str = ""
) -> Dict[str, Any]:
"""Build metrics dict for error cases."""
return {
"combined_score": 0.0,
"correct": False,
"public": {"error": error_msg[:500]},
"private": {},
"text_feedback": _build_text_feedback(
problem_id=problem_id,
result_metadata={},
score_bounded=0.0,
score_unbounded=0.0,
statement_summary=statement,
error_msg=error_msg,
),
}
def _save_error_result(
results_dir: Path, error_msg: str, problem_id: str, statement: str = ""
) -> Dict[str, Any]:
"""Save error result and return metrics dict."""
metrics = _build_error_metrics(problem_id, error_msg, statement)
_save_results(results_dir, metrics)
return metrics
def _save_results(results_dir: Path, metrics: Dict[str, Any]) -> None:
"""Write metrics.json and correct.json in ShinkaEvolve format."""
metrics_path = results_dir / "metrics.json"
correct_path = results_dir / "correct.json"
# metrics.json
serializable_metrics = {
"combined_score": metrics.get("combined_score", 0.0),
"public": metrics.get("public", {}),
"private": {}, # Don't serialize full go-judge metadata (can be huge)
"text_feedback": metrics.get("text_feedback", ""),
}
with open(metrics_path, "w") as f:
json.dump(serializable_metrics, f, indent=2, default=str)
# correct.json
correct_data = {
"correct": metrics.get("correct", False),
"error": None if metrics.get("correct") else metrics.get("public", {}).get("error"),
}
with open(correct_path, "w") as f:
json.dump(correct_data, f, indent=2)
# --- CLI entry point ---
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Frontier-CS algorithmic evaluator bridge")
parser.add_argument("--program_path", required=True, help="Path to C++ solution")
parser.add_argument("--results_dir", required=True, help="Output directory for metrics")
parser.add_argument("--problem-id", default="", help="Frontier-CS problem ID (falls back to FRONTIER_CS_PROBLEM_ID env var, then '0')")
parser.add_argument("--judge-url", default=DEFAULT_JUDGE_URL, help="go-judge URL")
parser.add_argument("--frontier-cs-dir", default=DEFAULT_FRONTIER_CS_DIR)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
result = main(
program_path=args.program_path,
results_dir=args.results_dir,
problem_id=args.problem_id,
judge_url=args.judge_url,
frontier_cs_dir=args.frontier_cs_dir,
)
print(f"Score: {result.get('combined_score', 0):.2f}")