#!/usr/bin/env python3 """ Universal experiment runner for ShinkaEvolve with Eval Service integration. Usage: python run_experiment.py --experiment-name NAME [options] Based on run_full_experiment.py pattern. """ import argparse import sys import time from pathlib import Path from datetime import datetime # Add project root to path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from shinka.core import EvolutionRunner, EvolutionConfig from shinka.launch import LocalJobConfig from shinka.database import DatabaseConfig def parse_args(): parser = argparse.ArgumentParser( description="Run ShinkaEvolve experiment with Eval Service", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) # Experiment configuration parser.add_argument("--experiment-name", type=str, required=True, help="Experiment name (e.g., 'quick_test', 'full_50gen')") parser.add_argument("--num-generations", type=int, default=50, help="Number of generations to evolve") parser.add_argument("--max-parallel-jobs", type=int, default=4, help="Maximum parallel evaluation jobs") # Task configuration parser.add_argument("--task", type=str, default="circle_packing", help="Task domain") parser.add_argument("--initial-code", type=str, default=None, help="Path to initial code") parser.add_argument("--evaluator", type=str, default=None, help="Path to evaluator") # Eval Service configuration parser.add_argument("--use-eval-service", action="store_true", default=True, help="Use eval service for evaluation") parser.add_argument("--eval-service-url", type=str, default="http://localhost:8765", help="Eval service URL") parser.add_argument("--evaluator-module", type=str, default=None, help="Evaluator module") parser.add_argument("--evaluator-function", type=str, default="main", help="Evaluator function name") parser.add_argument("--eval-trigger-mode", type=str, default=None, choices=["always", "periodic", "plateau", "mixed"], help="Override eval service trigger mode for this experiment") parser.add_argument("--eval-trigger-interval", type=int, default=None, help="Override eval service trigger interval for this experiment") # Database configuration parser.add_argument("--num-islands", type=int, default=2, help="Number of islands for island model") parser.add_argument("--archive-size", type=int, default=40, help="Archive size for elites") # Meta configuration parser.add_argument("--meta-interval", type=int, default=10, help="Meta-summarizer interval (generations)") parser.add_argument("--meta-max-recommendations", type=int, default=5, help="Maximum meta recommendations") parser.add_argument("--persistent-agents", action="store_true", default=False, help="Enable optional long-horizon shared-memory agents for Shinka and eval service") parser.add_argument("--persistent-context-refresh-interval", type=int, default=10, help="How often persistent contexts are compacted and rebuilt") parser.add_argument("--persistent-context-max-recent-attempts", type=int, default=12, help="Max recent search attempts injected into persistent context") parser.add_argument("--persistent-context-max-recent-insights", type=int, default=8, help="Max recent eval insights injected into persistent context") parser.add_argument("--persistent-invalid-burst-threshold", type=int, default=3, help="Invalid-output threshold that forces a persistent-context refresh") parser.add_argument("--persistent-invalid-burst-window", type=int, default=5, help="Lookback window for invalid-output burst detection") parser.add_argument("--use-text-feedback", dest="use_text_feedback", 
action="store_true", default=True, help="Include evaluator text_feedback (including auxiliary metric descriptions) in mutation prompts") parser.add_argument("--no-text-feedback", dest="use_text_feedback", action="store_false", help="Disable text_feedback injection into mutation prompts") # LLM configuration parser.add_argument("--llm-models", type=str, nargs="+", default=["native-gemini-2.5-flash", "native-gemini-2.5-pro"], help="LLM models to use") parser.add_argument("--llm-selection", type=str, default="ucb1", choices=["ucb1", "thompson", "epsilon_greedy", "random"], help="LLM dynamic selection strategy") parser.add_argument("--llm-temperatures", type=float, nargs="+", default=[0.5, 0.7, 1.0], help="LLM temperatures") parser.add_argument("--llm-max-tokens", type=int, default=65536, help="LLM max tokens") parser.add_argument("--trajectory-log", action="store_true", default=False, help="Enable per-LLM-call trajectory logging for Shinka mutation loop") parser.add_argument("--trajectory-log-dir", type=str, default="llm_trajectories", help="Directory (relative to gen dir or absolute) for trajectory JSON files") # Patch configuration parser.add_argument("--patch-types", type=str, nargs="+", default=["diff", "full", "cross"], help="Patch types") parser.add_argument("--patch-probs", type=float, nargs="+", default=[0.6, 0.3, 0.1], help="Patch type probabilities") # WandB configuration parser.add_argument("--use-wandb", action="store_true", default=False, help="Enable WandB logging") parser.add_argument("--wandb-project", type=str, default="ev2", help="WandB project name") parser.add_argument("--wandb-entity", type=str, default="tengxiao", help="WandB entity") parser.add_argument("--wandb-run-name", type=str, default=None, help="WandB run name") parser.add_argument("--wandb-tags", type=str, nargs="*", default=None, help="WandB tags") # Output configuration parser.add_argument("--results-dir", type=str, default=None, help="Results directory") parser.add_argument("--verbose", action="store_true", default=True, help="Verbose output") return parser.parse_args() def setup_defaults(args): """Setup default values based on task.""" # Set task-specific defaults if args.initial_code is None: args.initial_code = f"examples/{args.task}/initial.py" if args.evaluator is None: args.evaluator = f"examples/{args.task}/evaluate_ori.py" if args.evaluator_module is None: args.evaluator_module = f"examples.{args.task}.evaluate_ori" # Setup results directory if args.results_dir is None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") args.results_dir = f"examples/{args.task}/results/results_{args.experiment_name}_{timestamp}" # Setup WandB run name if args.use_wandb and args.wandb_run_name is None: args.wandb_run_name = f"{args.experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" return args def check_eval_service(url): """Check if eval service is running.""" import requests try: response = requests.get(f"{url}/api/v1/status", timeout=2.0) if response.status_code == 200: return True, response.json() except Exception as e: return False, str(e) return False, "Unknown error" def get_task_description(task): """Get task-specific description.""" if task == "circle_packing": return """You are an expert mathematician specializing in circle packing problems and computational geometry. Your task is to maximize the sum of radii when packing 26 circles in a unit square [0,1] x [0,1]. The best known result is 2.635 (sum of radii). Key strategies to consider: 1. Efficient spatial distribution - avoid clustering 2. 
def get_task_description(task):
    """Get the task-specific system message."""
    if task == "circle_packing":
        return """You are an expert mathematician specializing in circle packing problems and computational geometry.

Your task is to maximize the sum of radii when packing 26 circles in a unit square [0,1] x [0,1]. The best known result is 2.635 (sum of radii).

Key strategies to consider:
1. Efficient spatial distribution - avoid clustering
2. Utilize corners and edges effectively
3. Balance between many small circles vs. fewer large circles
4. Consider geometric patterns: grid, hexagonal, concentric rings
5. Optimize placement to minimize wasted space

You will receive:
- Current code implementation
- Performance metrics (sum of radii)
- Circle center coordinates as text

Make improvements based on the numerical data and geometric reasoning. Ensure all circles are disjoint and lie inside the unit square."""
    else:
        return "Solve the given task optimally."


def main():
    args = parse_args()
    args = setup_defaults(args)

    results_dir = Path(args.results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    # Print header
    print("=" * 80)
    print("šŸš€ ShinkaEvolve Experiment Runner")
    print("=" * 80)
    print(f"šŸ“… Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"šŸ”¬ Experiment: {args.experiment_name}")
    print(f"šŸŽÆ Task: {args.task}")
    print(f"šŸ“Š Generations: {args.num_generations}")
    print(f"⚔ Parallel Jobs: {args.max_parallel_jobs}")
    print(f"šŸ“ Results: {results_dir}")
    print("=" * 80)
    print()

    # Check the eval service
    if args.use_eval_service:
        print(f"šŸ” Checking eval service at {args.eval_service_url}...")
        is_running, info = check_eval_service(args.eval_service_url)
        if is_running:
            print("āœ… Eval service is running")
            if isinstance(info, dict):
                print(f"   Status: {info.get('status', 'unknown')}")
        else:
            print(f"āŒ Eval service not running: {info}")
            print()
            print("Please start the eval service first:")
            print("  bash scripts/dev/start_eval_server.sh")
            print()
            sys.exit(1)
        print()

    # Set up configurations
    job_config = LocalJobConfig(
        eval_program_path=args.evaluator,
    )

    db_config = DatabaseConfig(
        num_islands=args.num_islands,
        archive_size=args.archive_size,
        elite_selection_ratio=0.3,
        num_archive_inspirations=4,
        num_top_k_inspirations=2,
        migration_interval=10,
        migration_rate=0.1,
        island_elitism=True,
        parent_selection_strategy="weighted",
        parent_selection_lambda=10.0,
    )

    evo_config = EvolutionConfig(
        task_sys_msg=get_task_description(args.task),
        patch_types=args.patch_types,
        patch_type_probs=args.patch_probs,
        num_generations=args.num_generations,
        max_parallel_jobs=args.max_parallel_jobs,
        max_patch_resamples=3,
        max_patch_attempts=3,
        job_type="local",
        language="python",
        # LLM configuration
        llm_models=args.llm_models,
        llm_kwargs=dict(
            temperatures=args.llm_temperatures,
            max_tokens=args.llm_max_tokens,
        ),
        llm_dynamic_selection=args.llm_selection,
        llm_dynamic_selection_kwargs=dict(exploration_coef=1.0),
        # Meta configuration
        meta_rec_interval=args.meta_interval,
        meta_llm_models=["native-gemini-2.5-flash"],
        meta_llm_kwargs=dict(temperatures=[0.7], max_tokens=16384),
        meta_max_recommendations=args.meta_max_recommendations,
        # Embedding for novelty
        embedding_model="text-embedding-3-small",
        code_embed_sim_threshold=0.995,
        novelty_llm_models=["native-gemini-2.5-flash"],
        novelty_llm_kwargs=dict(temperatures=[0.7], max_tokens=16384),
        # Paths
        init_program_path=args.initial_code,
        results_dir=str(results_dir),
        use_text_feedback=args.use_text_feedback,
        # Eval Service
        eval_service_url=args.eval_service_url if args.use_eval_service else None,
        use_eval_service=args.use_eval_service,
        evaluator_module=args.evaluator_module if args.use_eval_service else None,
        evaluator_function=args.evaluator_function,
        eval_service_trigger_mode=args.eval_trigger_mode if args.use_eval_service else None,
        eval_service_trigger_interval=args.eval_trigger_interval if args.use_eval_service else None,
        # WandB
        enable_wandb=args.use_wandb,
        wandb_project=args.wandb_project if args.use_wandb else None,
        wandb_entity=args.wandb_entity if args.use_wandb else None,
        wandb_run_name=args.wandb_run_name if args.use_wandb else None,
        wandb_tags=args.wandb_tags if args.use_wandb else None,
        trajectory_log=args.trajectory_log,
        trajectory_log_dir=args.trajectory_log_dir,
        persistent_agents_enabled=args.persistent_agents,
        persistent_context_refresh_interval=args.persistent_context_refresh_interval,
        persistent_context_max_recent_attempts=args.persistent_context_max_recent_attempts,
        persistent_context_max_recent_insights=args.persistent_context_max_recent_insights,
        persistent_invalid_burst_threshold=args.persistent_invalid_burst_threshold,
        persistent_invalid_burst_window=args.persistent_invalid_burst_window,
    )
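    # The eval-service and WandB fields above are passed as None whenever the
    # corresponding feature is disabled; this wiring assumes EvolutionConfig
    # treats None as "feature off" rather than as a literal value.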
    # Print configuration summary
    print("šŸ“‹ Configuration Summary:")
    print(f"  • Generations: {evo_config.num_generations}")
    print(f"  • Parallel Jobs: {evo_config.max_parallel_jobs}")
    print(f"  • Islands: {db_config.num_islands}")
    print(f"  • Archive Size: {db_config.archive_size}")
    print(f"  • Models: {', '.join(evo_config.llm_models)}")
    print(f"  • LLM Selection: {evo_config.llm_dynamic_selection}")
    print(f"  • Meta Interval: {evo_config.meta_rec_interval}")
    print(f"  • Evaluator: {args.evaluator}")
    if args.use_eval_service:
        print(f"  • Eval Service: {evo_config.eval_service_url} āœ…")
        if args.eval_trigger_mode is not None:
            print(f"  • Eval Trigger Mode: {args.eval_trigger_mode}")
        if args.eval_trigger_interval is not None:
            print(f"  • Eval Trigger Interval: {args.eval_trigger_interval}")
    if args.use_wandb:
        print(f"  • WandB: {args.wandb_project}/{args.wandb_run_name}")
    if args.trajectory_log:
        print(f"  • Trajectory Log: enabled ({args.trajectory_log_dir})")
    print(f"  • Text Feedback in Prompt: {'enabled' if args.use_text_feedback else 'disabled'}")
    print(f"  • Persistent Agents: {'enabled' if args.persistent_agents else 'disabled'}")
    print()

    # Confirmation
    try:
        input("Press Enter to start (Ctrl+C to cancel)...")
    except KeyboardInterrupt:
        print("\nāŒ Cancelled")
        sys.exit(0)

    print()
    print("=" * 80)
    print("šŸƒ Starting Evolution")
    print("=" * 80)
    print()

    start_time = time.time()

    # Run evolution
    try:
        runner = EvolutionRunner(
            evo_config=evo_config,
            job_config=job_config,
            db_config=db_config,
        )
        runner.run()

        elapsed = time.time() - start_time
        print()
        print("=" * 80)
        print("āœ… Experiment Completed!")
        print("=" * 80)
        print(f"ā±ļø Total time: {elapsed / 3600:.2f} hours")
        print(f"šŸ“ Results: {results_dir}")
        print()
        print("šŸ“Š Check:")
        print(f"  • Best program: {results_dir}/best/")
        print(f"  • Database: {results_dir}/evolution_db.sqlite")
        if args.use_eval_service:
            print(f"  • Eval agent memory: {results_dir}/eval_agent_memory/")
            print(f"  • Metrics docs: {results_dir}/eval_agent_memory/EVAL_AGENTS.md")
        if args.use_wandb:
            wandb_url = f"https://wandb.ai/{args.wandb_entity or 'YOUR_ENTITY'}/{args.wandb_project}"
            print(f"  • WandB: {wandb_url}")
        print()
    except KeyboardInterrupt:
        print()
        print("=" * 80)
        print("āš ļø Experiment Interrupted")
        print("=" * 80)
        print(f"šŸ“ Partial results: {results_dir}")
        sys.exit(130)
    except Exception as e:
        print()
        print("=" * 80)
        print("āŒ Experiment Failed")
        print("=" * 80)
        print(f"Error: {e}")
        print()
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
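# Quick smoke test of the service probe without launching an experiment
# (assumes the server from scripts/dev/start_eval_server.sh is up and that
# this file is importable as run_experiment):
#
#   python -c "from run_experiment import check_eval_service as c; print(c('http://localhost:8765'))"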