#!/usr/bin/env python3 """ Fully automated experiment runner for Frontier-CS algorithmic problems. Given a problem_id, this script: 1. Loads the problem statement from problems/{id}/statement.txt 2. Picks a seed solution from solutions/{id}/ (default: gpt5.cpp) 3. Wraps the seed with EVOLVE-BLOCK markers 4. Builds EvolutionConfig + DatabaseConfig 5. Starts the evolution loop Usage: python tasks/frontier_cs_entry/run_experiment.py --experiment-name test --problem-id 0 python tasks/frontier_cs_entry/run_experiment.py --experiment-name test --problem-id 42 --seed-model gpt5_1 """ from __future__ import annotations import argparse import logging import os import sys import time from datetime import datetime from pathlib import Path import requests sys.path.insert(0, str(Path(__file__).resolve().parents[2])) from shinka.core import EvolutionConfig, EvolutionRunner from shinka.database import DatabaseConfig from shinka.launch import LocalJobConfig logger = logging.getLogger(__name__) DEFAULT_FRONTIER_CS_DIR = "tasks/Frontier-CS" TASK_SYSTEM_PREFIX = ( "You are an expert competitive programmer. " "Your goal is to write C++ code that maximizes the score on the given problem. " "The scoring is continuous (0-100) based on solution quality, not just correctness. " "Optimize for both correctness and performance. " "Focus on algorithmic improvements, not micro-optimizations.\n\n" "--- Problem Statement ---\n" ) def parse_args(): parser = argparse.ArgumentParser( description="Run Frontier-CS algorithmic evolution experiment", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) # Problem selection parser.add_argument("--experiment-name", type=str, required=True) parser.add_argument("--problem-id", type=str, required=True, help="Frontier-CS problem ID (e.g., 0, 1, 42)") parser.add_argument("--seed-model", type=str, default="gpt5", help="Model prefix for seed solution (e.g., gpt5, gemini3pro)") parser.add_argument("--frontier-cs-dir", type=str, default=DEFAULT_FRONTIER_CS_DIR, help="Path to Frontier-CS repo") # Evolution parameters parser.add_argument("--num-generations", type=int, default=200) parser.add_argument("--max-parallel-jobs", type=int, default=5) parser.add_argument("--meta-interval", type=int, default=10) # Persistent agents parser.add_argument("--persistent-agents", action="store_true", default=False) parser.add_argument("--persistent-context-refresh-interval", type=int, default=10) parser.add_argument("--persistent-context-max-recent-attempts", type=int, default=12) parser.add_argument("--persistent-context-max-recent-insights", type=int, default=8) parser.add_argument("--persistent-invalid-burst-threshold", type=int, default=3) parser.add_argument("--persistent-invalid-burst-window", type=int, default=5) # Text feedback parser.add_argument( "--use-text-feedback", dest="use_text_feedback", action="store_true", default=True, help="Include evaluator text_feedback in mutation prompts", ) parser.add_argument( "--no-text-feedback", dest="use_text_feedback", action="store_false", ) # Database parser.add_argument("--num-islands", type=int, default=2) parser.add_argument("--archive-size", type=int, default=40) # LLM parser.add_argument( "--llm-models", nargs="+", type=str, default=["native-gemini-3-flash-preview"], ) parser.add_argument( "--llm-selection", type=str, default="ucb1", choices=["ucb1", "thompson", "epsilon_greedy", "random"], ) parser.add_argument( "--llm-temperatures", nargs="+", type=float, default=[0.0, 0.5, 1.0], ) parser.add_argument("--llm-max-tokens", type=int, default=65536) # Trajectory logging parser.add_argument("--trajectory-log", action="store_true", default=False) parser.add_argument("--trajectory-log-dir", type=str, default="llm_trajectories") # Edit backend parser.add_argument( "--edit-backend", type=str, default="single_shot_patch", choices=["single_shot_patch", "openhands_agent"], help="Code generation mode: single LLM call or multi-round agent", ) # Patch strategy parser.add_argument( "--patch-types", nargs="+", type=str, default=["diff", "full", "cross"], ) parser.add_argument( "--patch-probs", nargs="+", type=float, default=[0.6, 0.3, 0.1], ) # Eval service parser.add_argument("--use-eval-service", action="store_true", default=False) parser.add_argument("--eval-service-url", type=str, default="http://localhost:8765") parser.add_argument( "--eval-trigger-mode", type=str, default=None, choices=["always", "periodic", "plateau", "mixed"], ) parser.add_argument("--eval-trigger-interval", type=int, default=None) # WandB parser.add_argument("--use-wandb", action="store_true", default=False) parser.add_argument("--wandb-project", type=str, default="frontier-cs") parser.add_argument("--wandb-entity", type=str, default="tengxiao") parser.add_argument("--wandb-run-name", type=str, default=None) parser.add_argument("--wandb-tags", nargs="*", type=str, default=None) # Output parser.add_argument("--results-dir", type=str, default=None) parser.add_argument( "--run-dir", type=str, default=None, help="Shared root directory for a batch run. Results go to /p/. " "Overrides --results-dir when set.", ) parser.add_argument("--judge-url", type=str, default="http://localhost:8081") parser.add_argument("--verbose", action="store_true", default=True) return parser.parse_args() # ------------------------------------------------------------------ # Helpers # ------------------------------------------------------------------ def load_problem_statement(frontier_cs_dir: str, problem_id: str) -> str: """Load problem statement from Frontier-CS problem directory.""" path = Path(frontier_cs_dir) / "algorithmic" / "problems" / str(problem_id) / "statement.txt" if not path.exists(): raise FileNotFoundError(f"Problem statement not found: {path}") return path.read_text(encoding="utf-8") def find_seed_solution(frontier_cs_dir: str, problem_id: str, model_prefix: str) -> Path: """Find a seed solution file from the solutions directory.""" solutions_dir = Path(frontier_cs_dir) / "algorithmic" / "solutions" / str(problem_id) if not solutions_dir.exists(): solutions_dir = ( Path(frontier_cs_dir) / "algorithmic" / "problems" / str(problem_id) / "examples" ) if not solutions_dir.exists(): raise FileNotFoundError( f"No solutions directory found for problem {problem_id}. " f"Checked: {solutions_dir}" ) # Exact match exact = solutions_dir / f"{model_prefix}.cpp" if exact.exists(): return exact # Prefix match matches = sorted(solutions_dir.glob(f"{model_prefix}*.cpp")) if matches: return matches[0] # Fallback: any .cpp file all_cpp = sorted(solutions_dir.glob("*.cpp")) if all_cpp: logger.warning(f"No solution matching '{model_prefix}'. Using {all_cpp[0].name}") return all_cpp[0] raise FileNotFoundError(f"No .cpp solutions found in {solutions_dir}") def prepare_seed_code(solution_path: Path) -> str: """Read seed solution and wrap with EVOLVE-BLOCK markers.""" code = solution_path.read_text(encoding="utf-8") return f"// EVOLVE-BLOCK-START\n{code}\n// EVOLVE-BLOCK-END\n" def check_eval_service(url: str): try: response = requests.get(f"{url}/api/v1/status", timeout=2.0) if response.status_code == 200: return True, response.json() except Exception as exc: return False, str(exc) return False, "Unknown error" def resolve_defaults(args): """Fill in auto-generated defaults.""" if args.run_dir is not None: # Batch mode: shared root directory, per-problem subdirectory args.results_dir = f"{args.run_dir}/p{args.problem_id}" elif args.results_dir is None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") args.results_dir = ( f"results/frontier_cs_algorithmic/" f"p{args.problem_id}_{args.experiment_name}_{timestamp}" ) if args.use_wandb and args.wandb_run_name is None: args.wandb_run_name = ( f"fcs_p{args.problem_id}_{args.experiment_name}_" f"{datetime.now().strftime('%Y%m%d_%H%M%S')}" ) return args # ------------------------------------------------------------------ # Main # ------------------------------------------------------------------ def main(): args = resolve_defaults(parse_args()) # Resolve frontier_cs_dir frontier_cs_dir = args.frontier_cs_dir if not Path(frontier_cs_dir).is_absolute(): project_root = Path(__file__).resolve().parents[2] frontier_cs_dir = str(project_root / frontier_cs_dir) # Set env vars so the evaluator subprocess can read them (local scheduler path) os.environ["FRONTIER_CS_PROBLEM_ID"] = args.problem_id os.environ["FRONTIER_CS_JUDGE_URL"] = args.judge_url os.environ["FRONTIER_CS_DIR"] = frontier_cs_dir # Load problem and seed statement = load_problem_statement(frontier_cs_dir, args.problem_id) task_sys_msg = TASK_SYSTEM_PREFIX + statement seed_path = find_seed_solution(frontier_cs_dir, args.problem_id, args.seed_model) seed_code = prepare_seed_code(seed_path) results_dir = Path(args.results_dir) results_dir.mkdir(parents=True, exist_ok=True) # Write seed code for the runner init_program_path = results_dir / "initial.cpp" init_program_path.write_text(seed_code, encoding="utf-8") # Print summary print("=" * 80) print("ShinkaEvolve: Frontier-CS Algorithmic") print("=" * 80) print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print(f"Experiment: {args.experiment_name}") print(f"Problem: {args.problem_id}") print(f"Seed: {seed_path.name}") print(f"Generations: {args.num_generations}") print(f"Parallel: {args.max_parallel_jobs}") print(f"Models: {', '.join(args.llm_models)}") print(f"Results Dir: {results_dir}") print(f"Judge: {args.judge_url}") print(f"Persistent: {'enabled' if args.persistent_agents else 'disabled'}") print("=" * 80) if args.use_eval_service: ok, info = check_eval_service(args.eval_service_url) if not ok: print(f"Eval service not available at {args.eval_service_url}: {info}") sys.exit(1) print(f"Eval service ready: {args.eval_service_url}") # Build configs (aligned with tasks/circle_packing/run_experiment.py) job_config = LocalJobConfig( eval_program_path="tasks/frontier_cs_entry/evaluate_algorithmic.py", extra_cmd_args={ "problem-id": args.problem_id, "judge-url": args.judge_url, }, ) db_config = DatabaseConfig( num_islands=args.num_islands, archive_size=args.archive_size, elite_selection_ratio=0.3, num_archive_inspirations=4, num_top_k_inspirations=2, migration_interval=10, migration_rate=0.1, island_elitism=True, parent_selection_strategy="weighted", parent_selection_lambda=10.0, ) evo_config = EvolutionConfig( task_sys_msg=task_sys_msg, patch_types=args.patch_types, patch_type_probs=args.patch_probs, num_generations=args.num_generations, max_parallel_jobs=args.max_parallel_jobs, max_patch_resamples=3, max_patch_attempts=3, edit_backend=args.edit_backend, job_type="local", language="cpp", # LLM (same defaults as circle_packing) llm_models=args.llm_models, llm_kwargs=dict( temperatures=args.llm_temperatures, max_tokens=args.llm_max_tokens, reasoning_efforts=["high"], ), llm_dynamic_selection=args.llm_selection, llm_dynamic_selection_kwargs=dict(exploration_coef=1.0), # Meta (uses first model, deterministic) meta_rec_interval=args.meta_interval, meta_llm_models=[args.llm_models[0]], meta_llm_kwargs=dict(temperatures=[0.0], max_tokens=32768), # Novelty (uses first model, deterministic) novelty_llm_models=[args.llm_models[0]], novelty_llm_kwargs=dict(temperatures=[0.0], max_tokens=32768), embedding_model="text-embedding-3-small", code_embed_sim_threshold=0.995, # Paths init_program_path=str(init_program_path), results_dir=str(results_dir), use_text_feedback=args.use_text_feedback, # Evaluator evaluator_module="tasks.frontier_cs_entry.evaluate_algorithmic", evaluator_function="main", evaluator_kwargs={ "problem_id": args.problem_id, "judge_url": args.judge_url, "frontier_cs_dir": frontier_cs_dir, }, # Eval service eval_service_url=args.eval_service_url if args.use_eval_service else None, use_eval_service=args.use_eval_service, eval_service_trigger_mode=( args.eval_trigger_mode if args.use_eval_service else None ), eval_service_trigger_interval=( args.eval_trigger_interval if args.use_eval_service else None ), # WandB enable_wandb=args.use_wandb, wandb_project=args.wandb_project, wandb_entity=args.wandb_entity, wandb_run_name=args.wandb_run_name, wandb_tags=args.wandb_tags, # Trajectory & persistent agents trajectory_log=args.trajectory_log, trajectory_log_dir=args.trajectory_log_dir, persistent_agents_enabled=args.persistent_agents, persistent_context_refresh_interval=args.persistent_context_refresh_interval, persistent_context_max_recent_attempts=args.persistent_context_max_recent_attempts, persistent_context_max_recent_insights=args.persistent_context_max_recent_insights, persistent_invalid_burst_threshold=args.persistent_invalid_burst_threshold, persistent_invalid_burst_window=args.persistent_invalid_burst_window, ) runner = EvolutionRunner( evo_config=evo_config, job_config=job_config, db_config=db_config, verbose=args.verbose, ) runner.run() if __name__ == "__main__": main()