shinka-backup / tasks /frontier_cs_entry /run_experiment.py
JustinTX's picture
Add files using upload-large-folder tool
2facf1f verified
#!/usr/bin/env python3
"""
Fully automated experiment runner for Frontier-CS algorithmic problems.
Given a problem_id, this script:
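1. Loads the problem statement from algorithmic/problems/{id}/statement.txt
   (relative to the Frontier-CS directory)
2. Picks a seed solution from algorithmic/solutions/{id}/ (default: gpt5.cpp),
   falling back to algorithmic/problems/{id}/examples/ when that directory is missing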
3. Wraps the seed with EVOLVE-BLOCK markers
4. Builds EvolutionConfig + DatabaseConfig
5. Starts the evolution loop
Usage:
python tasks/frontier_cs_entry/run_experiment.py --experiment-name test --problem-id 0
python tasks/frontier_cs_entry/run_experiment.py --experiment-name test --problem-id 42 --seed-model gpt5_1
"""
from __future__ import annotations
import argparse
import logging
import os
import sys
import time
from datetime import datetime
from pathlib import Path
import requests
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from shinka.core import EvolutionConfig, EvolutionRunner
from shinka.database import DatabaseConfig
from shinka.launch import LocalJobConfig
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default value of --frontier-cs-dir. A relative value is resolved against the
# project root (two levels above this file's directory) inside main().
DEFAULT_FRONTIER_CS_DIR = "tasks/Frontier-CS"

# Preamble of the task system message; main() appends the raw problem
# statement directly after the trailing "--- Problem Statement ---" line.
TASK_SYSTEM_PREFIX = (
    " ".join(
        (
            "You are an expert competitive programmer.",
            "Your goal is to write C++ code that maximizes the score on the given problem.",
            "The scoring is continuous (0-100) based on solution quality, not just correctness.",
            "Optimize for both correctness and performance.",
            "Focus on algorithmic improvements, not micro-optimizations.",
        )
    )
    + "\n\n--- Problem Statement ---\n"
)
def parse_args(argv=None):
    """Parse the command-line options of the experiment runner.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, which is what the script
            entry point relies on. Passing an explicit list lets the parser be
            driven programmatically.

    Returns:
        The populated ``argparse.Namespace``. ``--experiment-name`` and
        ``--problem-id`` are required; every other option has a default.
    """
    parser = argparse.ArgumentParser(
        description="Run Frontier-CS algorithmic evolution experiment",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Problem selection
    parser.add_argument("--experiment-name", type=str, required=True)
    parser.add_argument("--problem-id", type=str, required=True,
                        help="Frontier-CS problem ID (e.g., 0, 1, 42)")
    parser.add_argument("--seed-model", type=str, default="gpt5",
                        help="Model prefix for seed solution (e.g., gpt5, gemini3pro)")
    parser.add_argument("--frontier-cs-dir", type=str, default=DEFAULT_FRONTIER_CS_DIR,
                        help="Path to Frontier-CS repo")

    # Evolution parameters
    parser.add_argument("--num-generations", type=int, default=200)
    parser.add_argument("--max-parallel-jobs", type=int, default=5)
    parser.add_argument("--meta-interval", type=int, default=10)

    # Persistent agents
    parser.add_argument("--persistent-agents", action="store_true", default=False)
    parser.add_argument("--persistent-context-refresh-interval", type=int, default=10)
    parser.add_argument("--persistent-context-max-recent-attempts", type=int, default=12)
    parser.add_argument("--persistent-context-max-recent-insights", type=int, default=8)
    parser.add_argument("--persistent-invalid-burst-threshold", type=int, default=3)
    parser.add_argument("--persistent-invalid-burst-window", type=int, default=5)

    # Text feedback (on by default; --no-text-feedback switches it off)
    parser.add_argument(
        "--use-text-feedback", dest="use_text_feedback",
        action="store_true", default=True,
        help="Include evaluator text_feedback in mutation prompts",
    )
    parser.add_argument(
        "--no-text-feedback", dest="use_text_feedback",
        action="store_false",
    )

    # Database
    parser.add_argument("--num-islands", type=int, default=2)
    parser.add_argument("--archive-size", type=int, default=40)

    # LLM
    parser.add_argument(
        "--llm-models", nargs="+", type=str,
        default=["native-gemini-3-flash-preview"],
    )
    parser.add_argument(
        "--llm-selection", type=str, default="ucb1",
        choices=["ucb1", "thompson", "epsilon_greedy", "random"],
    )
    parser.add_argument(
        "--llm-temperatures", nargs="+", type=float,
        default=[0.0, 0.5, 1.0],
    )
    parser.add_argument("--llm-max-tokens", type=int, default=65536)

    # Trajectory logging
    parser.add_argument("--trajectory-log", action="store_true", default=False)
    parser.add_argument("--trajectory-log-dir", type=str, default="llm_trajectories")

    # Edit backend
    parser.add_argument(
        "--edit-backend", type=str, default="single_shot_patch",
        choices=["single_shot_patch", "openhands_agent"],
        help="Code generation mode: single LLM call or multi-round agent",
    )

    # Patch strategy
    parser.add_argument(
        "--patch-types", nargs="+", type=str,
        default=["diff", "full", "cross"],
    )
    parser.add_argument(
        "--patch-probs", nargs="+", type=float,
        default=[0.6, 0.3, 0.1],
    )

    # Eval service
    parser.add_argument("--use-eval-service", action="store_true", default=False)
    parser.add_argument("--eval-service-url", type=str, default="http://localhost:8765")
    parser.add_argument(
        "--eval-trigger-mode", type=str, default=None,
        choices=["always", "periodic", "plateau", "mixed"],
    )
    parser.add_argument("--eval-trigger-interval", type=int, default=None)

    # WandB
    parser.add_argument("--use-wandb", action="store_true", default=False)
    parser.add_argument("--wandb-project", type=str, default="frontier-cs")
    parser.add_argument("--wandb-entity", type=str, default="tengxiao")
    parser.add_argument("--wandb-run-name", type=str, default=None)
    parser.add_argument("--wandb-tags", nargs="*", type=str, default=None)

    # Output
    parser.add_argument("--results-dir", type=str, default=None)
    parser.add_argument(
        "--run-dir", type=str, default=None,
        help="Shared root directory for a batch run. Results go to <run-dir>/p<problem_id>/. "
             "Overrides --results-dir when set.",
    )
    parser.add_argument("--judge-url", type=str, default="http://localhost:8081")

    # Verbosity is on by default. A store_true flag with default=True can never
    # be turned off, so it is paired with an explicit negative flag, mirroring
    # the --use-text-feedback / --no-text-feedback pair above.
    parser.add_argument(
        "--verbose", dest="verbose",
        action="store_true", default=True,
    )
    parser.add_argument(
        "--no-verbose", dest="verbose",
        action="store_false",
    )
    return parser.parse_args(argv)
# ------------------------------------------------------------------
# Helpers
# ------------------------------------------------------------------
def load_problem_statement(frontier_cs_dir: str, problem_id: str) -> str:
    """Return the text of a problem's ``statement.txt``.

    The file is looked up at
    ``<frontier_cs_dir>/algorithmic/problems/<problem_id>/statement.txt`` and
    decoded as UTF-8.

    Raises:
        FileNotFoundError: If the statement file does not exist.
    """
    statement_file = Path(frontier_cs_dir).joinpath(
        "algorithmic", "problems", str(problem_id), "statement.txt"
    )
    if statement_file.exists():
        return statement_file.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Problem statement not found: {statement_file}")
def find_seed_solution(frontier_cs_dir: str, problem_id: str, model_prefix: str) -> Path:
    """Locate the ``.cpp`` seed solution to start evolution from.

    Directory lookup order (first existing one wins):
        1. ``<frontier_cs_dir>/algorithmic/solutions/<problem_id>``
        2. ``<frontier_cs_dir>/algorithmic/problems/<problem_id>/examples``

    File lookup order inside that directory:
        1. ``<model_prefix>.cpp`` (exact name)
        2. the alphabetically first ``<model_prefix>*.cpp``
        3. the alphabetically first ``*.cpp`` (a warning is logged)

    Args:
        frontier_cs_dir: Root of the Frontier-CS checkout.
        problem_id: Problem identifier used as the directory name.
        model_prefix: File-name prefix of the preferred seed solution.

    Returns:
        Path of the selected seed solution.

    Raises:
        FileNotFoundError: If neither directory exists, or the chosen
            directory contains no ``.cpp`` file.
    """
    algorithmic_root = Path(frontier_cs_dir) / "algorithmic"
    candidate_dirs = (
        algorithmic_root / "solutions" / str(problem_id),
        algorithmic_root / "problems" / str(problem_id) / "examples",
    )
    solutions_dir = next((d for d in candidate_dirs if d.exists()), None)
    if solutions_dir is None:
        # Report every location that was tried, not just the last fallback,
        # so the user can see which layout was expected.
        checked = ", ".join(str(d) for d in candidate_dirs)
        raise FileNotFoundError(
            f"No solutions directory found for problem {problem_id}. "
            f"Checked: {checked}"
        )

    # Exact match
    exact = solutions_dir / f"{model_prefix}.cpp"
    if exact.exists():
        return exact

    # Prefix match (sorted so the pick is deterministic)
    matches = sorted(solutions_dir.glob(f"{model_prefix}*.cpp"))
    if matches:
        return matches[0]

    # Fallback: any .cpp file
    all_cpp = sorted(solutions_dir.glob("*.cpp"))
    if all_cpp:
        logger.warning(
            "No solution matching '%s'. Using %s", model_prefix, all_cpp[0].name
        )
        return all_cpp[0]

    raise FileNotFoundError(f"No .cpp solutions found in {solutions_dir}")
def prepare_seed_code(solution_path: Path) -> str:
    """Return the seed file's contents framed by EVOLVE-BLOCK marker comments.

    The file is read as UTF-8. The start marker goes on its own line before
    the code, the end marker on its own line after it, followed by a final
    newline.
    """
    seed_source = solution_path.read_text(encoding="utf-8")
    framed_lines = ("// EVOLVE-BLOCK-START", seed_source, "// EVOLVE-BLOCK-END", "")
    return "\n".join(framed_lines)
def check_eval_service(url: str):
    """Probe the eval service's status endpoint.

    Sends ``GET <url>/api/v1/status`` with a 2-second timeout.

    Args:
        url: Base URL of the eval service.

    Returns:
        A ``(ok, info)`` tuple. On an HTTP 200 response ``ok`` is ``True`` and
        ``info`` is the decoded JSON body. Otherwise ``ok`` is ``False`` and
        ``info`` is a string describing the failure: the unexpected HTTP
        status code, or the text of the request/decoding error.
    """
    try:
        response = requests.get(f"{url}/api/v1/status", timeout=2.0)
        if response.status_code != 200:
            # Surface the actual status instead of a generic message so the
            # caller's diagnostic output is actionable.
            return False, f"Unexpected HTTP status {response.status_code}"
        return True, response.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException covers connection, timeout and URL errors;
        # ValueError covers a 200 response whose body is not valid JSON.
        return False, str(exc)
def resolve_defaults(args):
    """Fill in the auto-generated defaults on the parsed arguments.

    * ``results_dir``: when ``run_dir`` is set (batch mode) it is always
      ``<run_dir>/p<problem_id>``, overriding any explicit value. Otherwise an
      unset ``results_dir`` becomes a timestamped directory under
      ``results/frontier_cs_algorithmic/``.
    * ``wandb_run_name``: generated only when WandB is enabled and no name was
      supplied.

    Args:
        args: Namespace produced by the argument parser; modified in place.

    Returns:
        The same ``args`` object, for call chaining.
    """
    # Take the timestamp once so the results directory and the WandB run name
    # carry the same suffix and can be matched up afterwards.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    if args.run_dir is not None:
        # Batch mode: shared root directory, per-problem subdirectory
        args.results_dir = f"{args.run_dir}/p{args.problem_id}"
    elif args.results_dir is None:
        args.results_dir = (
            f"results/frontier_cs_algorithmic/"
            f"p{args.problem_id}_{args.experiment_name}_{timestamp}"
        )

    if args.use_wandb and args.wandb_run_name is None:
        args.wandb_run_name = (
            f"fcs_p{args.problem_id}_{args.experiment_name}_{timestamp}"
        )
    return args
# ------------------------------------------------------------------
# Main
# ------------------------------------------------------------------
def main():
    """Run one Frontier-CS evolution experiment from the command line.

    Steps, in order: parse and resolve the arguments, export the
    ``FRONTIER_CS_*`` environment variables, load the problem statement and
    the seed solution, write the wrapped seed to ``<results_dir>/initial.cpp``,
    print a run summary, optionally verify the eval service is reachable
    (exiting with status 1 if it is not), build the job / database / evolution
    configs, and start the ``EvolutionRunner``.
    """
    args = resolve_defaults(parse_args())

    # A relative Frontier-CS path is anchored two levels above this file's
    # directory; an absolute one is used verbatim.
    frontier_cs_dir = args.frontier_cs_dir
    if not Path(frontier_cs_dir).is_absolute():
        frontier_cs_dir = str(Path(__file__).resolve().parents[2] / frontier_cs_dir)

    # Exported so the evaluator subprocess can read them (local scheduler path).
    os.environ.update(
        {
            "FRONTIER_CS_PROBLEM_ID": args.problem_id,
            "FRONTIER_CS_JUDGE_URL": args.judge_url,
            "FRONTIER_CS_DIR": frontier_cs_dir,
        }
    )

    # System prompt = fixed preamble + raw problem statement.
    task_sys_msg = TASK_SYSTEM_PREFIX + load_problem_statement(
        frontier_cs_dir, args.problem_id
    )
    seed_path = find_seed_solution(frontier_cs_dir, args.problem_id, args.seed_model)
    seed_code = prepare_seed_code(seed_path)

    # Persist the wrapped seed where the runner will pick it up.
    results_dir = Path(args.results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)
    init_program_path = results_dir / "initial.cpp"
    init_program_path.write_text(seed_code, encoding="utf-8")

    # Run summary banner.
    rule = "=" * 80
    print(rule)
    print("ShinkaEvolve: Frontier-CS Algorithmic")
    print(rule)
    summary_rows = (
        ("Started", datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        ("Experiment", args.experiment_name),
        ("Problem", args.problem_id),
        ("Seed", seed_path.name),
        ("Generations", args.num_generations),
        ("Parallel", args.max_parallel_jobs),
        ("Models", ', '.join(args.llm_models)),
        ("Results Dir", results_dir),
        ("Judge", args.judge_url),
        ("Persistent", 'enabled' if args.persistent_agents else 'disabled'),
    )
    for label, value in summary_rows:
        print(f"{label}: {value}")
    print(rule)

    # Eval-service settings are forwarded only when the service is enabled;
    # in that case the run aborts early if the service does not answer.
    if args.use_eval_service:
        service_ok, service_info = check_eval_service(args.eval_service_url)
        if not service_ok:
            print(f"Eval service not available at {args.eval_service_url}: {service_info}")
            sys.exit(1)
        print(f"Eval service ready: {args.eval_service_url}")
        service_url = args.eval_service_url
        trigger_mode = args.eval_trigger_mode
        trigger_interval = args.eval_trigger_interval
    else:
        service_url = trigger_mode = trigger_interval = None

    # Build configs (aligned with tasks/circle_packing/run_experiment.py)
    job_config = LocalJobConfig(
        eval_program_path="tasks/frontier_cs_entry/evaluate_algorithmic.py",
        extra_cmd_args={
            "problem-id": args.problem_id,
            "judge-url": args.judge_url,
        },
    )

    db_config = DatabaseConfig(
        num_islands=args.num_islands,
        archive_size=args.archive_size,
        elite_selection_ratio=0.3,
        num_archive_inspirations=4,
        num_top_k_inspirations=2,
        migration_interval=10,
        migration_rate=0.1,
        island_elitism=True,
        parent_selection_strategy="weighted",
        parent_selection_lambda=10.0,
    )

    # Meta and novelty calls both use the first configured model.
    first_model = args.llm_models[0]

    # Options whose EvolutionConfig keyword equals the argparse attribute name.
    passthrough_names = (
        "wandb_project",
        "wandb_entity",
        "wandb_run_name",
        "wandb_tags",
        "trajectory_log",
        "trajectory_log_dir",
        "persistent_context_refresh_interval",
        "persistent_context_max_recent_attempts",
        "persistent_context_max_recent_insights",
        "persistent_invalid_burst_threshold",
        "persistent_invalid_burst_window",
    )
    passthrough = {name: getattr(args, name) for name in passthrough_names}

    evo_config = EvolutionConfig(
        task_sys_msg=task_sys_msg,
        patch_types=args.patch_types,
        patch_type_probs=args.patch_probs,
        num_generations=args.num_generations,
        max_parallel_jobs=args.max_parallel_jobs,
        max_patch_resamples=3,
        max_patch_attempts=3,
        edit_backend=args.edit_backend,
        job_type="local",
        language="cpp",
        # LLM (same defaults as circle_packing)
        llm_models=args.llm_models,
        llm_kwargs={
            "temperatures": args.llm_temperatures,
            "max_tokens": args.llm_max_tokens,
            "reasoning_efforts": ["high"],
        },
        llm_dynamic_selection=args.llm_selection,
        llm_dynamic_selection_kwargs={"exploration_coef": 1.0},
        # Meta (uses first model, deterministic)
        meta_rec_interval=args.meta_interval,
        meta_llm_models=[first_model],
        meta_llm_kwargs={"temperatures": [0.0], "max_tokens": 32768},
        # Novelty (uses first model, deterministic)
        novelty_llm_models=[first_model],
        novelty_llm_kwargs={"temperatures": [0.0], "max_tokens": 32768},
        embedding_model="text-embedding-3-small",
        code_embed_sim_threshold=0.995,
        # Paths
        init_program_path=str(init_program_path),
        results_dir=str(results_dir),
        use_text_feedback=args.use_text_feedback,
        # Evaluator
        evaluator_module="tasks.frontier_cs_entry.evaluate_algorithmic",
        evaluator_function="main",
        evaluator_kwargs={
            "problem_id": args.problem_id,
            "judge_url": args.judge_url,
            "frontier_cs_dir": frontier_cs_dir,
        },
        # Eval service
        eval_service_url=service_url,
        use_eval_service=args.use_eval_service,
        eval_service_trigger_mode=trigger_mode,
        eval_service_trigger_interval=trigger_interval,
        # WandB switch and persistent-agent switch (names differ from the CLI)
        enable_wandb=args.use_wandb,
        persistent_agents_enabled=args.persistent_agents,
        # WandB details, trajectory logging and persistent-agent tuning
        **passthrough,
    )

    runner = EvolutionRunner(
        evo_config=evo_config,
        job_config=job_config,
        db_config=db_config,
        verbose=args.verbose,
    )
    runner.run()
# Start the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()