| |
| """ |
| Universal experiment runner for ShinkaEvolve with Eval Service integration. |
| |
| Usage: python run_experiment.py --experiment-name NAME [options] |
| |
| Based on run_full_experiment.py pattern. |
| """ |
|
|
| import argparse |
| import sys |
| import time |
| from pathlib import Path |
| from datetime import datetime |
|
|
| |
| sys.path.insert(0, str(Path(__file__).parent.parent.parent)) |
|
|
| from shinka.core import EvolutionRunner, EvolutionConfig |
| from shinka.launch import LocalJobConfig |
| from shinka.database import DatabaseConfig |
|
|
|
|
| def parse_args(): |
| parser = argparse.ArgumentParser( |
| description="Run ShinkaEvolve experiment with Eval Service", |
| formatter_class=argparse.ArgumentDefaultsHelpFormatter |
| ) |
| |
| |
| parser.add_argument("--experiment-name", type=str, required=True, |
| help="Experiment name (e.g., 'quick_test', 'full_50gen')") |
| parser.add_argument("--num-generations", type=int, default=50, |
| help="Number of generations to evolve") |
| parser.add_argument("--max-parallel-jobs", type=int, default=4, |
| help="Maximum parallel evaluation jobs") |
| |
| |
| parser.add_argument("--task", type=str, default="circle_packing", |
| help="Task domain") |
| parser.add_argument("--initial-code", type=str, default=None, |
| help="Path to initial code") |
| parser.add_argument("--evaluator", type=str, default=None, |
| help="Path to evaluator") |
| |
| |
| parser.add_argument("--use-eval-service", action="store_true", default=True, |
| help="Use eval service for evaluation") |
| parser.add_argument("--eval-service-url", type=str, default="http://localhost:8765", |
| help="Eval service URL") |
| parser.add_argument("--evaluator-module", type=str, default=None, |
| help="Evaluator module") |
| parser.add_argument("--evaluator-function", type=str, default="main", |
| help="Evaluator function name") |
| parser.add_argument("--eval-trigger-mode", type=str, default=None, |
| choices=["always", "periodic", "plateau", "mixed"], |
| help="Override eval service trigger mode for this experiment") |
| parser.add_argument("--eval-trigger-interval", type=int, default=None, |
| help="Override eval service trigger interval for this experiment") |
| |
| |
| parser.add_argument("--num-islands", type=int, default=2, |
| help="Number of islands for island model") |
| parser.add_argument("--archive-size", type=int, default=40, |
| help="Archive size for elites") |
| |
| |
| parser.add_argument("--meta-interval", type=int, default=10, |
| help="Meta-summarizer interval (generations)") |
| parser.add_argument("--meta-max-recommendations", type=int, default=5, |
| help="Maximum meta recommendations") |
| parser.add_argument("--persistent-agents", action="store_true", default=False, |
| help="Enable optional long-horizon shared-memory agents for Shinka and eval service") |
| parser.add_argument("--persistent-context-refresh-interval", type=int, default=10, |
| help="How often persistent contexts are compacted and rebuilt") |
| parser.add_argument("--persistent-context-max-recent-attempts", type=int, default=12, |
| help="Max recent search attempts injected into persistent context") |
| parser.add_argument("--persistent-context-max-recent-insights", type=int, default=8, |
| help="Max recent eval insights injected into persistent context") |
| parser.add_argument("--persistent-invalid-burst-threshold", type=int, default=3, |
| help="Invalid-output threshold that forces a persistent-context refresh") |
| parser.add_argument("--persistent-invalid-burst-window", type=int, default=5, |
| help="Lookback window for invalid-output burst detection") |
| parser.add_argument("--use-text-feedback", dest="use_text_feedback", action="store_true", default=True, |
| help="Include evaluator text_feedback (including auxiliary metric descriptions) in mutation prompts") |
| parser.add_argument("--no-text-feedback", dest="use_text_feedback", action="store_false", |
| help="Disable text_feedback injection into mutation prompts") |
| |
| |
| parser.add_argument("--llm-models", type=str, nargs="+", |
| default=["native-gemini-2.5-flash", "native-gemini-2.5-pro"], |
| help="LLM models to use") |
| parser.add_argument("--llm-selection", type=str, default="ucb1", |
| choices=["ucb1", "thompson", "epsilon_greedy", "random"], |
| help="LLM dynamic selection strategy") |
| parser.add_argument("--llm-temperatures", type=float, nargs="+", |
| default=[0.5, 0.7, 1.0], |
| help="LLM temperatures") |
| parser.add_argument("--llm-max-tokens", type=int, default=65536, |
| help="LLM max tokens") |
| parser.add_argument("--trajectory-log", action="store_true", default=False, |
| help="Enable per-LLM-call trajectory logging for Shinka mutation loop") |
| parser.add_argument("--trajectory-log-dir", type=str, default="llm_trajectories", |
| help="Directory (relative to gen dir or absolute) for trajectory JSON files") |
| |
| |
| parser.add_argument("--patch-types", type=str, nargs="+", |
| default=["diff", "full", "cross"], |
| help="Patch types") |
| parser.add_argument("--patch-probs", type=float, nargs="+", |
| default=[0.6, 0.3, 0.1], |
| help="Patch type probabilities") |
| |
| |
| parser.add_argument("--use-wandb", action="store_true", default=False, |
| help="Enable WandB logging") |
| parser.add_argument("--wandb-project", type=str, default="ev2", |
| help="WandB project name") |
| parser.add_argument("--wandb-entity", type=str, default="tengxiao", |
| help="WandB entity") |
| parser.add_argument("--wandb-run-name", type=str, default=None, |
| help="WandB run name") |
| parser.add_argument("--wandb-tags", type=str, nargs="*", default=None, |
| help="WandB tags") |
| |
| |
| parser.add_argument("--results-dir", type=str, default=None, |
| help="Results directory") |
| parser.add_argument("--verbose", action="store_true", default=True, |
| help="Verbose output") |
| |
| return parser.parse_args() |
|
|
|
|
| def setup_defaults(args): |
| """Setup default values based on task.""" |
| |
| if args.initial_code is None: |
| args.initial_code = f"examples/{args.task}/initial.py" |
| if args.evaluator is None: |
| args.evaluator = f"examples/{args.task}/evaluate_ori.py" |
| if args.evaluator_module is None: |
| args.evaluator_module = f"examples.{args.task}.evaluate_ori" |
| |
| |
| if args.results_dir is None: |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| args.results_dir = f"examples/{args.task}/results/results_{args.experiment_name}_{timestamp}" |
| |
| |
| if args.use_wandb and args.wandb_run_name is None: |
| args.wandb_run_name = f"{args.experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" |
| |
| return args |
|
|
|
|
| def check_eval_service(url): |
| """Check if eval service is running.""" |
| import requests |
| try: |
| response = requests.get(f"{url}/api/v1/status", timeout=2.0) |
| if response.status_code == 200: |
| return True, response.json() |
| except Exception as e: |
| return False, str(e) |
| return False, "Unknown error" |
|
|
|
|
| def get_task_description(task): |
| """Get task-specific description.""" |
| if task == "circle_packing": |
| return """You are an expert mathematician specializing in circle packing problems and computational geometry. |
| |
| Your task is to maximize the sum of radii when packing 26 circles in a unit square [0,1] x [0,1]. |
| The best known result is 2.635 (sum of radii). |
| |
| Key strategies to consider: |
| 1. Efficient spatial distribution - avoid clustering |
| 2. Utilize corners and edges effectively |
| 3. Balance between many small circles vs fewer large circles |
| 4. Consider geometric patterns: grid, hexagonal, concentric rings |
| 5. Optimize placement to minimize wasted space |
| |
| You will receive: |
| - Current code implementation |
| - Performance metrics (sum of radii) |
| - Circle center coordinates as text |
| |
| Make improvements based on the numerical data and geometric reasoning. |
| Ensure all circles are disjoint and lie inside the unit square.""" |
| else: |
| return "Solve the given task optimally." |
|
|
|
|
| def main(): |
| args = parse_args() |
| args = setup_defaults(args) |
| |
| results_dir = Path(args.results_dir) |
| results_dir.mkdir(parents=True, exist_ok=True) |
| |
| |
| print("=" * 80) |
| print("π ShinkaEvolve Experiment Runner") |
| print("=" * 80) |
| print(f"π
Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") |
| print(f"π¬ Experiment: {args.experiment_name}") |
| print(f"π― Task: {args.task}") |
| print(f"π Generations: {args.num_generations}") |
| print(f"β‘ Parallel Jobs: {args.max_parallel_jobs}") |
| print(f"π Results: {results_dir}") |
| print("=" * 80) |
| print() |
| |
| |
| if args.use_eval_service: |
| print(f"π Checking eval service at {args.eval_service_url}...") |
| is_running, info = check_eval_service(args.eval_service_url) |
| if is_running: |
| print(f"β
Eval service is running") |
| if isinstance(info, dict): |
| print(f" Status: {info.get('status', 'unknown')}") |
| else: |
| print(f"β Eval service not running: {info}") |
| print() |
| print("Please start eval service first:") |
| print(f" bash scripts/dev/start_eval_server.sh") |
| print() |
| sys.exit(1) |
| print() |
| |
| |
| job_config = LocalJobConfig( |
| eval_program_path=args.evaluator |
| ) |
| |
| db_config = DatabaseConfig( |
| num_islands=args.num_islands, |
| archive_size=args.archive_size, |
| elite_selection_ratio=0.3, |
| num_archive_inspirations=4, |
| num_top_k_inspirations=2, |
| migration_interval=10, |
| migration_rate=0.1, |
| island_elitism=True, |
| parent_selection_strategy="weighted", |
| parent_selection_lambda=10.0, |
| ) |
| |
| evo_config = EvolutionConfig( |
| task_sys_msg=get_task_description(args.task), |
| patch_types=args.patch_types, |
| patch_type_probs=args.patch_probs, |
| num_generations=args.num_generations, |
| max_parallel_jobs=args.max_parallel_jobs, |
| max_patch_resamples=3, |
| max_patch_attempts=3, |
| job_type="local", |
| language="python", |
| |
| |
| llm_models=args.llm_models, |
| llm_kwargs=dict( |
| temperatures=args.llm_temperatures, |
| max_tokens=args.llm_max_tokens, |
| ), |
| llm_dynamic_selection=args.llm_selection, |
| llm_dynamic_selection_kwargs=dict(exploration_coef=1.0), |
| |
| |
| meta_rec_interval=args.meta_interval, |
| meta_llm_models=["native-gemini-2.5-flash"], |
| meta_llm_kwargs=dict(temperatures=[0.7], max_tokens=16384), |
| meta_max_recommendations=args.meta_max_recommendations, |
| |
| |
| embedding_model="text-embedding-3-small", |
| code_embed_sim_threshold=0.995, |
| novelty_llm_models=["native-gemini-2.5-flash"], |
| novelty_llm_kwargs=dict(temperatures=[0.7], max_tokens=16384), |
| |
| |
| init_program_path=args.initial_code, |
| results_dir=str(results_dir), |
| use_text_feedback=args.use_text_feedback, |
| |
| |
| eval_service_url=args.eval_service_url if args.use_eval_service else None, |
| use_eval_service=args.use_eval_service, |
| evaluator_module=args.evaluator_module if args.use_eval_service else None, |
| evaluator_function=args.evaluator_function, |
| eval_service_trigger_mode=args.eval_trigger_mode if args.use_eval_service else None, |
| eval_service_trigger_interval=args.eval_trigger_interval if args.use_eval_service else None, |
| |
| |
| enable_wandb=args.use_wandb, |
| wandb_project=args.wandb_project if args.use_wandb else None, |
| wandb_entity=args.wandb_entity if args.use_wandb else None, |
| wandb_run_name=args.wandb_run_name if args.use_wandb else None, |
| wandb_tags=args.wandb_tags if args.use_wandb else None, |
| trajectory_log=args.trajectory_log, |
| trajectory_log_dir=args.trajectory_log_dir, |
| persistent_agents_enabled=args.persistent_agents, |
| persistent_context_refresh_interval=args.persistent_context_refresh_interval, |
| persistent_context_max_recent_attempts=args.persistent_context_max_recent_attempts, |
| persistent_context_max_recent_insights=args.persistent_context_max_recent_insights, |
| persistent_invalid_burst_threshold=args.persistent_invalid_burst_threshold, |
| persistent_invalid_burst_window=args.persistent_invalid_burst_window, |
| ) |
| |
| |
| print("π Configuration Summary:") |
| print(f" β’ Generations: {evo_config.num_generations}") |
| print(f" β’ Parallel Jobs: {evo_config.max_parallel_jobs}") |
| print(f" β’ Islands: {db_config.num_islands}") |
| print(f" β’ Archive Size: {db_config.archive_size}") |
| print(f" β’ Models: {', '.join(evo_config.llm_models)}") |
| print(f" β’ LLM Selection: {evo_config.llm_dynamic_selection}") |
| print(f" β’ Meta Interval: {evo_config.meta_rec_interval}") |
| print(f" β’ Evaluator: {args.evaluator}") |
| if args.use_eval_service: |
| print(f" β’ Eval Service: {evo_config.eval_service_url} β
") |
| if args.eval_trigger_mode is not None: |
| print(f" β’ Eval Trigger Mode: {args.eval_trigger_mode}") |
| if args.eval_trigger_interval is not None: |
| print(f" β’ Eval Trigger Interval: {args.eval_trigger_interval}") |
| if args.use_wandb: |
| print(f" β’ WandB: {args.wandb_project}/{args.wandb_run_name}") |
| if args.trajectory_log: |
| print(f" β’ Trajectory Log: enabled ({args.trajectory_log_dir})") |
| print(f" β’ Text Feedback in Prompt: {'enabled' if args.use_text_feedback else 'disabled'}") |
| print(f" β’ Persistent Agents: {'enabled' if args.persistent_agents else 'disabled'}") |
| print() |
| |
| |
| try: |
| input("Press Enter to start (Ctrl+C to cancel)...") |
| except KeyboardInterrupt: |
| print("\nβ Cancelled") |
| sys.exit(0) |
| |
| print() |
| print("=" * 80) |
| print("π Starting Evolution") |
| print("=" * 80) |
| print() |
| |
| start_time = time.time() |
| |
| |
| try: |
| runner = EvolutionRunner( |
| evo_config=evo_config, |
| job_config=job_config, |
| db_config=db_config |
| ) |
| |
| runner.run() |
| |
| elapsed = time.time() - start_time |
| |
| print() |
| print("=" * 80) |
| print("β
Experiment Completed!") |
| print("=" * 80) |
| print(f"β±οΈ Total time: {elapsed/3600:.2f} hours") |
| print(f"π Results: {results_dir}") |
| print() |
| |
| print("π Check:") |
| print(f" β’ Best program: {results_dir}/best/") |
| print(f" β’ Database: {results_dir}/evolution_db.sqlite") |
| if args.use_eval_service: |
| print(f" β’ Eval agent memory: {results_dir}/eval_agent_memory/") |
| print(f" β’ Metrics docs: {results_dir}/eval_agent_memory/EVAL_AGENTS.md") |
| if args.use_wandb: |
| wandb_url = f"https://wandb.ai/{args.wandb_entity or 'YOUR_ENTITY'}/{args.wandb_project}" |
| print(f" β’ WandB: {wandb_url}") |
| print() |
| |
| except KeyboardInterrupt: |
| print() |
| print("=" * 80) |
| print("β οΈ Experiment Interrupted") |
| print("=" * 80) |
| print(f"π Partial results: {results_dir}") |
| sys.exit(130) |
| |
| except Exception as e: |
| print() |
| print("=" * 80) |
| print("β Experiment Failed") |
| print("=" * 80) |
| print(f"Error: {e}") |
| print() |
| import traceback |
| traceback.print_exc() |
| sys.exit(1) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|